Diffstat (limited to 'sys')
-rw-r--r-- sys/amd64/acpica/acpi_wakeup.c | 18
-rw-r--r-- sys/amd64/amd64/apic_vector.S | 9
-rw-r--r-- sys/amd64/amd64/cpu_switch.S | 4
-rw-r--r-- sys/amd64/amd64/efirt_machdep.c | 33
-rw-r--r-- sys/amd64/amd64/exec_machdep.c | 2
-rw-r--r-- sys/amd64/amd64/initcpu.c | 4
-rw-r--r-- sys/amd64/amd64/machdep.c | 34
-rw-r--r-- sys/amd64/amd64/mem.c | 4
-rw-r--r-- sys/amd64/amd64/minidump_machdep.c | 10
-rw-r--r-- sys/amd64/amd64/pmap.c | 773
-rw-r--r-- sys/amd64/amd64/support.S | 20
-rw-r--r-- sys/amd64/amd64/trap.c | 21
-rw-r--r-- sys/amd64/conf/MINIMALUP | 4
-rw-r--r-- sys/amd64/include/efi.h | 4
-rw-r--r-- sys/amd64/include/param.h | 7
-rw-r--r-- sys/amd64/include/pmap.h | 34
-rw-r--r-- sys/amd64/include/smp.h | 3
-rw-r--r-- sys/amd64/include/vmparam.h | 55
-rw-r--r-- sys/amd64/linux/linux_proto.h | 7
-rw-r--r-- sys/amd64/linux/linux_sysent.c | 4
-rw-r--r-- sys/amd64/linux/linux_systrace_args.c | 40
-rw-r--r-- sys/amd64/linux/syscalls.master | 11
-rw-r--r-- sys/amd64/linux32/linux32_proto.h | 9
-rw-r--r-- sys/amd64/linux32/linux32_sysent.c | 6
-rw-r--r-- sys/amd64/linux32/linux32_systrace_args.c | 54
-rw-r--r-- sys/amd64/linux32/syscalls.master | 15
-rw-r--r-- sys/amd64/pt/pt.c | 978
-rw-r--r-- sys/amd64/pt/pt.h | 49
-rw-r--r-- sys/amd64/vmm/intel/vmx_support.S | 6
-rw-r--r-- sys/arm/allwinner/aw_gpio.c | 8
-rw-r--r-- sys/arm/allwinner/aw_mmc.c | 33
-rw-r--r-- sys/arm/allwinner/aw_rtc.c | 29
-rw-r--r-- sys/arm/arm/pmap-v6.c | 32
-rw-r--r-- sys/arm/broadcom/bcm2835/bcm2835_gpio.c | 6
-rw-r--r-- sys/arm/mv/mvebu_gpio.c | 1
-rw-r--r-- sys/arm/nvidia/as3722_gpio.c | 2
-rw-r--r-- sys/arm/nvidia/tegra_gpio.c | 1
-rw-r--r-- sys/arm64/apple/apple_pinctrl.c | 20
-rw-r--r-- sys/arm64/arm64/pmap.c | 98
-rw-r--r-- sys/arm64/broadcom/genet/if_genet.c | 4
-rw-r--r-- sys/arm64/linux/linux_proto.h | 7
-rw-r--r-- sys/arm64/linux/linux_sysent.c | 4
-rw-r--r-- sys/arm64/linux/linux_systrace_args.c | 40
-rw-r--r-- sys/arm64/linux/syscalls.master | 11
-rw-r--r-- sys/arm64/nvidia/tegra210/max77620_gpio.c | 2
-rw-r--r-- sys/arm64/rockchip/rk_gpio.c | 12
-rw-r--r-- sys/bsm/audit_kevents.h | 1
-rw-r--r-- sys/cam/ata/ata_da.c | 5
-rw-r--r-- sys/cam/cam_periph.c | 42
-rw-r--r-- sys/cam/cam_xpt.c | 14
-rw-r--r-- sys/cam/cam_xpt.h | 20
-rw-r--r-- sys/cam/mmc/mmc_da.c | 57
-rw-r--r-- sys/cam/mmc/mmc_xpt.c | 1
-rw-r--r-- sys/cam/scsi/scsi_all.c | 12
-rw-r--r-- sys/cam/scsi/scsi_cd.c | 8
-rw-r--r-- sys/cam/scsi/scsi_ch.c | 6
-rw-r--r-- sys/cam/scsi/scsi_da.c | 19
-rw-r--r-- sys/cam/scsi/scsi_enc_ses.c | 5
-rw-r--r-- sys/cam/scsi/scsi_sa.c | 7
-rw-r--r-- sys/cam/scsi/scsi_xpt.c | 56
-rw-r--r-- sys/cddl/boot/zfs/zfsimpl.h | 2
-rw-r--r-- sys/compat/freebsd32/freebsd32_syscall.h | 4
-rw-r--r-- sys/compat/freebsd32/freebsd32_syscalls.c | 2
-rw-r--r-- sys/compat/freebsd32/freebsd32_sysent.c | 4
-rw-r--r-- sys/compat/freebsd32/freebsd32_systrace_args.c | 60
-rw-r--r-- sys/compat/linux/linux_dummy.c | 4
-rw-r--r-- sys/compat/linux/linux_file.c | 121
-rw-r--r-- sys/compat/linux/linux_file.h | 32
-rw-r--r-- sys/compat/linuxkpi/common/include/acpi/acpi.h | 76
-rw-r--r-- sys/compat/linuxkpi/common/include/acpi/acpi_bus.h | 6
-rw-r--r-- sys/compat/linuxkpi/common/include/linux/device.h | 2
-rw-r--r-- sys/compat/linuxkpi/common/include/linux/pci.h | 9
-rw-r--r-- sys/compat/linuxkpi/common/include/linux/slab.h | 2
-rw-r--r-- sys/compat/linuxkpi/common/src/linux_acpi.c | 19
-rw-r--r-- sys/compat/linuxkpi/common/src/linux_page.c | 5
-rw-r--r-- sys/conf/files | 83
-rw-r--r-- sys/conf/files.amd64 | 8
-rw-r--r-- sys/conf/kern.pre.mk | 4
-rw-r--r-- sys/conf/options | 3
-rw-r--r-- sys/contrib/dev/iwlwifi/iwl-debug.h | 4
-rw-r--r-- sys/contrib/dev/rtw89/acpi.c | 26
-rw-r--r-- sys/dev/bnxt/bnxt_en/bnxt_auxbus_compat.h | 1
-rw-r--r-- sys/dev/drm2/drm_fb_helper.c | 2
-rw-r--r-- sys/dev/efidev/efirt.c | 42
-rw-r--r-- sys/dev/gpio/acpi_gpiobus.c | 3
-rw-r--r-- sys/dev/gpio/gpiobus.c | 103
-rw-r--r-- sys/dev/gpio/gpiobus_internal.h (renamed from sys/dev/sound/midi/sequencer.h) | 74
-rw-r--r-- sys/dev/gpio/gpiobusvar.h | 17
-rw-r--r-- sys/dev/gpio/gpiopps.c | 2
-rw-r--r-- sys/dev/gpio/ofw_gpiobus.c | 1
-rw-r--r-- sys/dev/gpio/pl061.c | 12
-rw-r--r-- sys/dev/gpio/pl061.h | 1
-rw-r--r-- sys/dev/gpio/pl061_acpi.c | 15
-rw-r--r-- sys/dev/gpio/pl061_fdt.c | 15
-rw-r--r-- sys/dev/gpio/qoriq_gpio.c | 11
-rw-r--r-- sys/dev/hwt/hwt.c | 242
-rw-r--r-- sys/dev/hwt/hwt_backend.c | 289
-rw-r--r-- sys/dev/hwt/hwt_backend.h | 87
-rw-r--r-- sys/dev/hwt/hwt_config.c | 108
-rw-r--r-- sys/dev/hwt/hwt_config.h | 36
-rw-r--r-- sys/dev/hwt/hwt_context.c | 201
-rw-r--r-- sys/dev/hwt/hwt_context.h | 86
-rw-r--r-- sys/dev/hwt/hwt_contexthash.c | 134
-rw-r--r-- sys/dev/hwt/hwt_contexthash.h | 42
-rw-r--r-- sys/dev/hwt/hwt_cpu.c | 115
-rw-r--r-- sys/dev/hwt/hwt_cpu.h | 45
-rw-r--r-- sys/dev/hwt/hwt_hook.c | 323
-rw-r--r-- sys/dev/hwt/hwt_hook.h | 56
-rw-r--r-- sys/dev/hwt/hwt_intr.h | 33
-rw-r--r-- sys/dev/hwt/hwt_ioctl.c | 445
-rw-r--r-- sys/dev/hwt/hwt_ioctl.h | 35
-rw-r--r-- sys/dev/hwt/hwt_owner.c | 157
-rw-r--r-- sys/dev/hwt/hwt_owner.h | 45
-rw-r--r-- sys/dev/hwt/hwt_ownerhash.c | 141
-rw-r--r-- sys/dev/hwt/hwt_ownerhash.h | 42
-rw-r--r-- sys/dev/hwt/hwt_record.c | 302
-rw-r--r-- sys/dev/hwt/hwt_record.h | 47
-rw-r--r-- sys/dev/hwt/hwt_thread.c | 162
-rw-r--r-- sys/dev/hwt/hwt_thread.h | 64
-rw-r--r-- sys/dev/hwt/hwt_vm.c | 503
-rw-r--r-- sys/dev/hwt/hwt_vm.h | 47
-rw-r--r-- sys/dev/hyperv/vmbus/vmbus_chan.c | 6
-rw-r--r-- sys/dev/hyperv/vmbus/vmbus_reg.h | 10
-rw-r--r-- sys/dev/ice/ice_features.h | 2
-rw-r--r-- sys/dev/ice/ice_iflib.h | 16
-rw-r--r-- sys/dev/ice/ice_iov.c | 1856
-rw-r--r-- sys/dev/ice/ice_iov.h | 125
-rw-r--r-- sys/dev/ice/ice_lib.c | 28
-rw-r--r-- sys/dev/ice/ice_lib.h | 4
-rw-r--r-- sys/dev/ice/ice_vf_mbx.c | 471
-rw-r--r-- sys/dev/ice/ice_vf_mbx.h | 67
-rw-r--r-- sys/dev/ice/if_ice_iflib.c | 132
-rw-r--r-- sys/dev/ichiic/ig4_pci.c | 12
-rw-r--r-- sys/dev/iicbus/gpio/tca64xx.c | 3
-rw-r--r-- sys/dev/md/md.c | 31
-rw-r--r-- sys/dev/mem/memutil.c | 19
-rw-r--r-- sys/dev/mgb/if_mgb.c | 2
-rw-r--r-- sys/dev/mlx5/mlx5_accel/ipsec.h | 8
-rw-r--r-- sys/dev/mlx5/mlx5_accel/mlx5_ipsec_rxtx.c | 16
-rw-r--r-- sys/dev/mlx5/mlx5_en/mlx5_en_hw_tls_rx.c | 100
-rw-r--r-- sys/dev/mlx5/mlx5_en/mlx5_en_rx.c | 2
-rw-r--r-- sys/dev/nvme/nvme_ctrlr.c | 295
-rw-r--r-- sys/dev/nvme/nvme_private.h | 4
-rw-r--r-- sys/dev/nvmf/host/nvmf.c | 119
-rw-r--r-- sys/dev/nvmf/host/nvmf_var.h | 6
-rw-r--r-- sys/dev/nvmf/nvmf.h | 11
-rw-r--r-- sys/dev/ofw/ofw_bus_subr.c | 101
-rw-r--r-- sys/dev/qlnx/qlnxe/qlnx_os.c | 11
-rw-r--r-- sys/dev/random/fortuna.c | 7
-rw-r--r-- sys/dev/random/random_harvestq.c | 358
-rw-r--r-- sys/dev/random/random_harvestq.h | 13
-rw-r--r-- sys/dev/random/randomdev.c | 8
-rw-r--r-- sys/dev/regulator/regulator_fixed.c | 8
-rw-r--r-- sys/dev/sound/midi/midi.c | 670
-rw-r--r-- sys/dev/sound/midi/midi.h | 7
-rw-r--r-- sys/dev/sound/midi/mpu401.c | 44
-rw-r--r-- sys/dev/sound/midi/mpu_if.m | 11
-rw-r--r-- sys/dev/sound/midi/sequencer.c | 2107
-rw-r--r-- sys/dev/sound/midi/synth_if.m | 312
-rw-r--r-- sys/dev/sound/pcm/mixer.c | 4
-rw-r--r-- sys/dev/sound/pcm/sndstat.c | 7
-rw-r--r-- sys/dev/sound/pcm/sound.h | 8
-rw-r--r-- sys/dev/ufshci/ufshci_private.h | 4
-rw-r--r-- sys/dev/ufshci/ufshci_req_sdb.c | 45
-rw-r--r-- sys/dev/usb/controller/xhci_pci.c | 7
-rw-r--r-- sys/dev/vmm/vmm_dev.c | 1
-rw-r--r-- sys/dev/vt/hw/vga/vt_vga.c | 2
-rw-r--r-- sys/dev/vt/vt_core.c | 4
-rw-r--r-- sys/fs/fdescfs/fdesc_vnops.c | 9
-rw-r--r-- sys/fs/fuse/fuse_file.h | 2
-rw-r--r-- sys/fs/fuse/fuse_kernel.h | 18
-rw-r--r-- sys/fs/fuse/fuse_vnops.c | 203
-rw-r--r-- sys/fs/msdosfs/msdosfs_conv.c | 11
-rw-r--r-- sys/fs/msdosfs/msdosfs_lookup.c | 1
-rw-r--r-- sys/fs/msdosfs/msdosfs_vfsops.c | 3
-rw-r--r-- sys/fs/msdosfs/msdosfs_vnops.c | 35
-rw-r--r-- sys/fs/msdosfs/msdosfsmount.h | 1
-rw-r--r-- sys/fs/nfs/nfs_commonsubs.c | 99
-rw-r--r-- sys/fs/nfs/nfs_var.h | 8
-rw-r--r-- sys/fs/nfs/nfsproto.h | 8
-rw-r--r-- sys/fs/nfsclient/nfs_clrpcops.c | 10
-rw-r--r-- sys/fs/nfsclient/nfs_clstate.c | 2
-rw-r--r-- sys/fs/nfsclient/nfs_clvnops.c | 23
-rw-r--r-- sys/fs/nfsserver/nfs_nfsdport.c | 93
-rw-r--r-- sys/fs/nfsserver/nfs_nfsdserv.c | 67
-rw-r--r-- sys/fs/nullfs/null_subr.c | 4
-rw-r--r-- sys/fs/nullfs/null_vnops.c | 29
-rw-r--r-- sys/fs/p9fs/p9fs_vnops.c | 8
-rw-r--r-- sys/fs/smbfs/smbfs_vnops.c | 3
-rw-r--r-- sys/fs/tmpfs/tmpfs_vnops.c | 4
-rw-r--r-- sys/fs/udf/ecma167-udf.h | 4
-rw-r--r-- sys/fs/udf/udf_vfsops.c | 7
-rw-r--r-- sys/fs/udf/udf_vnops.c | 48
-rw-r--r-- sys/i386/conf/GENERIC | 2
-rw-r--r-- sys/i386/conf/GENERIC-NODEBUG | 2
-rw-r--r-- sys/i386/conf/LINT | 1
-rw-r--r-- sys/i386/conf/MINIMAL | 2
-rw-r--r-- sys/i386/conf/PAE | 2
-rw-r--r-- sys/i386/i386/pmap.c | 12
-rw-r--r-- sys/i386/linux/linux_proto.h | 9
-rw-r--r-- sys/i386/linux/linux_sysent.c | 6
-rw-r--r-- sys/i386/linux/linux_systrace_args.c | 54
-rw-r--r-- sys/i386/linux/syscalls.master | 15
-rw-r--r-- sys/kern/init_sysent.c | 4
-rw-r--r-- sys/kern/kern_descrip.c | 151
-rw-r--r-- sys/kern/kern_exec.c | 19
-rw-r--r-- sys/kern/kern_linker.c | 5
-rw-r--r-- sys/kern/kern_pmc.c | 4
-rw-r--r-- sys/kern/kern_resource.c | 21
-rw-r--r-- sys/kern/kern_sendfile.c | 4
-rw-r--r-- sys/kern/kern_sig.c | 18
-rw-r--r-- sys/kern/kern_syscalls.c | 5
-rw-r--r-- sys/kern/kern_thr.c | 12
-rw-r--r-- sys/kern/kern_thread.c | 9
-rw-r--r-- sys/kern/sched_4bsd.c | 22
-rw-r--r-- sys/kern/sched_ule.c | 19
-rw-r--r-- sys/kern/subr_asan.c | 3
-rw-r--r-- sys/kern/subr_capability.c | 4
-rw-r--r-- sys/kern/subr_pctrie.c | 36
-rw-r--r-- sys/kern/subr_trap.c | 5
-rw-r--r-- sys/kern/sys_generic.c | 104
-rw-r--r-- sys/kern/sys_pipe.c | 2
-rw-r--r-- sys/kern/syscalls.c | 2
-rw-r--r-- sys/kern/syscalls.master | 17
-rw-r--r-- sys/kern/systrace_args.c | 60
-rw-r--r-- sys/kern/sysv_msg.c | 2
-rw-r--r-- sys/kern/sysv_sem.c | 2
-rw-r--r-- sys/kern/sysv_shm.c | 2
-rw-r--r-- sys/kern/uipc_syscalls.c | 13
-rw-r--r-- sys/kern/uipc_usrreq.c | 3
-rw-r--r-- sys/kern/vfs_aio.c | 46
-rw-r--r-- sys/kern/vfs_cache.c | 72
-rw-r--r-- sys/kern/vfs_default.c | 18
-rw-r--r-- sys/kern/vfs_inotify.c | 1011
-rw-r--r-- sys/kern/vfs_lookup.c | 177
-rw-r--r-- sys/kern/vfs_mount.c | 2
-rw-r--r-- sys/kern/vfs_subr.c | 100
-rw-r--r-- sys/kern/vfs_syscalls.c | 37
-rw-r--r-- sys/kern/vfs_vnops.c | 30
-rw-r--r-- sys/kern/vnode_if.src | 24
-rw-r--r-- sys/modules/Makefile | 7
-rw-r--r-- sys/modules/efirt/Makefile | 2
-rw-r--r-- sys/modules/hwt/Makefile | 21
-rw-r--r-- sys/modules/ice/Makefile | 1
-rw-r--r-- sys/modules/iwlwifi/Makefile | 8
-rw-r--r-- sys/modules/pt/Makefile | 8
-rw-r--r-- sys/modules/qlnx/qlnxe/Makefile | 1
-rw-r--r-- sys/modules/rtw89/Makefile | 1
-rw-r--r-- sys/modules/sound/sound/Makefile | 6
-rw-r--r-- sys/net/ethernet.h | 23
-rw-r--r-- sys/net/if_bridge.c | 305
-rw-r--r-- sys/net/if_bridgevar.h | 29
-rw-r--r-- sys/net/if_ethersubr.c | 9
-rw-r--r-- sys/net/if_gif.h | 3
-rw-r--r-- sys/net/if_lagg.c | 1
-rw-r--r-- sys/net/if_vlan.c | 1
-rw-r--r-- sys/net/if_vlan_var.h | 7
-rw-r--r-- sys/net/pfvar.h | 67
-rw-r--r-- sys/net80211/ieee80211_hostap.c | 7
-rw-r--r-- sys/net80211/ieee80211_ht.c | 13
-rw-r--r-- sys/net80211/ieee80211_node.c | 30
-rw-r--r-- sys/net80211/ieee80211_node.h | 6
-rw-r--r-- sys/net80211/ieee80211_output.c | 44
-rw-r--r-- sys/net80211/ieee80211_vht.c | 4
-rw-r--r-- sys/net80211/ieee80211_vht.h | 3
-rw-r--r-- sys/netinet/icmp_var.h | 9
-rw-r--r-- sys/netinet/in_pcb.c | 17
-rw-r--r-- sys/netinet/in_pcb.h | 1
-rw-r--r-- sys/netinet/ip_icmp.c | 3
-rw-r--r-- sys/netinet/tcp_hpts.c | 73
-rw-r--r-- sys/netinet/tcp_hpts.h | 17
-rw-r--r-- sys/netinet/tcp_input.c | 48
-rw-r--r-- sys/netinet/tcp_log_buf.c | 2
-rw-r--r-- sys/netinet/tcp_log_buf.h | 8
-rw-r--r-- sys/netinet/tcp_stacks/bbr.c | 22
-rw-r--r-- sys/netinet/tcp_stacks/rack.c | 63
-rw-r--r-- sys/netinet/tcp_stacks/rack_bbr_common.c | 2
-rw-r--r-- sys/netinet/tcp_subr.c | 65
-rw-r--r-- sys/netinet/tcp_usrreq.c | 3
-rw-r--r-- sys/netinet6/in6_gif.c | 18
-rw-r--r-- sys/netinet6/mld6.c | 29
-rw-r--r-- sys/netinet6/raw_ip6.c | 3
-rw-r--r-- sys/netipsec/ipsec.c | 6
-rw-r--r-- sys/netipsec/ipsec_offload.c | 25
-rw-r--r-- sys/netipsec/ipsec_offload.h | 16
-rw-r--r-- sys/netipsec/key.c | 2
-rw-r--r-- sys/netlink/netlink_message_parser.h | 3
-rw-r--r-- sys/netpfil/ipfilter/netinet/fil.c | 29
-rw-r--r-- sys/netpfil/ipfilter/netinet/ip_fil_freebsd.c | 5
-rw-r--r-- sys/netpfil/ipfilter/netinet/ip_ftp_pxy.c | 8
-rw-r--r-- sys/netpfil/ipfilter/netinet/ip_htable.c | 6
-rw-r--r-- sys/netpfil/ipfilter/netinet/ip_ipsec_pxy.c | 4
-rw-r--r-- sys/netpfil/ipfilter/netinet/ip_irc_pxy.c | 7
-rw-r--r-- sys/netpfil/ipfilter/netinet/ip_lookup.c | 4
-rw-r--r-- sys/netpfil/ipfilter/netinet/ip_nat.c | 43
-rw-r--r-- sys/netpfil/ipfilter/netinet/ip_nat6.c | 52
-rw-r--r-- sys/netpfil/ipfilter/netinet/ip_netbios_pxy.c | 5
-rw-r--r-- sys/netpfil/ipfilter/netinet/ip_pptp_pxy.c | 5
-rw-r--r-- sys/netpfil/ipfilter/netinet/ip_proxy.c | 4
-rw-r--r-- sys/netpfil/ipfilter/netinet/ip_raudio_pxy.c | 8
-rw-r--r-- sys/netpfil/ipfilter/netinet/ip_rcmd_pxy.c | 8
-rw-r--r-- sys/netpfil/ipfilter/netinet/ip_rpcb_pxy.c | 8
-rw-r--r-- sys/netpfil/ipfilter/netinet/ip_state.c | 22
-rw-r--r-- sys/netpfil/ipfilter/netinet/ip_tftp_pxy.c | 4
-rw-r--r-- sys/netpfil/ipfilter/netinet/ipf_rb.h | 2
-rw-r--r-- sys/netpfil/ipfw/ip_fw2.c | 2
-rw-r--r-- sys/netpfil/pf/if_pflog.c | 6
-rw-r--r-- sys/netpfil/pf/if_pfsync.c | 22
-rw-r--r-- sys/netpfil/pf/pf.c | 229
-rw-r--r-- sys/netpfil/pf/pf.h | 3
-rw-r--r-- sys/netpfil/pf/pf_if.c | 4
-rw-r--r-- sys/netpfil/pf/pf_ioctl.c | 457
-rw-r--r-- sys/netpfil/pf/pf_lb.c | 235
-rw-r--r-- sys/netpfil/pf/pf_nl.c | 68
-rw-r--r-- sys/netpfil/pf/pf_ruleset.c | 31
-rw-r--r-- sys/netpfil/pf/pf_table.c | 92
-rw-r--r-- sys/powerpc/aim/mmu_oea.c | 3
-rw-r--r-- sys/powerpc/aim/mmu_oea64.c | 3
-rw-r--r-- sys/powerpc/aim/mmu_radix.c | 4
-rw-r--r-- sys/powerpc/include/pcb.h | 10
-rw-r--r-- sys/powerpc/include/ucontext.h | 2
-rw-r--r-- sys/powerpc/mpc85xx/mpc85xx_gpio.c | 4
-rw-r--r-- sys/powerpc/powerpc/exec_machdep.c | 39
-rw-r--r-- sys/powerpc/powerpc/fpu.c | 30
-rw-r--r-- sys/riscv/allwinner/files.allwinner | 2
-rw-r--r-- sys/riscv/conf/std.allwinner | 2
-rw-r--r-- sys/riscv/riscv/pmap.c | 2
-rw-r--r-- sys/rpc/clnt_rc.c | 7
-rw-r--r-- sys/rpc/rpcsec_gss/rpcsec_gss.c | 14
-rw-r--r-- sys/rpc/rpcsec_tls/rpctls_impl.c | 8
-rw-r--r-- sys/sys/caprights.h | 2
-rw-r--r-- sys/sys/capsicum.h | 8
-rw-r--r-- sys/sys/efi.h | 18
-rw-r--r-- sys/sys/elf_common.h | 280
-rw-r--r-- sys/sys/exterr_cat.h | 2
-rw-r--r-- sys/sys/exterrvar.h | 1
-rw-r--r-- sys/sys/fcntl.h | 17
-rw-r--r-- sys/sys/file.h | 1
-rw-r--r-- sys/sys/filedesc.h | 2
-rw-r--r-- sys/sys/hwt.h | 129
-rw-r--r-- sys/sys/hwt_record.h | 70
-rw-r--r-- sys/sys/inotify.h | 158
-rw-r--r-- sys/sys/mount.h | 1
-rw-r--r-- sys/sys/namei.h | 12
-rw-r--r-- sys/sys/param.h | 2
-rw-r--r-- sys/sys/proc.h | 2
-rw-r--r-- sys/sys/random.h | 3
-rw-r--r-- sys/sys/resourcevar.h | 4
-rw-r--r-- sys/sys/socket.h | 6
-rw-r--r-- sys/sys/specialfd.h | 5
-rw-r--r-- sys/sys/syscall.h | 4
-rw-r--r-- sys/sys/syscall.mk | 4
-rw-r--r-- sys/sys/syscallsubr.h | 1
-rw-r--r-- sys/sys/sysent.h | 9
-rw-r--r-- sys/sys/sysproto.h | 14
-rw-r--r-- sys/sys/unistd.h | 2
-rw-r--r-- sys/sys/user.h | 5
-rw-r--r-- sys/sys/vnode.h | 24
-rw-r--r-- sys/tools/vnode_if.awk | 1
-rw-r--r-- sys/ufs/ffs/ffs_vfsops.c | 3
-rw-r--r-- sys/ufs/ufs/ufs_lookup.c | 1
-rw-r--r-- sys/ufs/ufs/ufs_vnops.c | 21
-rw-r--r-- sys/ufs/ufs/ufsmount.h | 2
-rw-r--r-- sys/vm/swap_pager.c | 23
-rw-r--r-- sys/vm/vm_domainset.c | 16
-rw-r--r-- sys/vm/vm_fault.c | 3
-rw-r--r-- sys/vm/vm_kern.c | 9
-rw-r--r-- sys/vm/vm_mmap.c | 16
-rw-r--r-- sys/vm/vm_pagequeue.h | 6
-rw-r--r-- sys/x86/linux/linux_dummy_x86.c | 2
370 files changed, 14679 insertions, 6052 deletions
diff --git a/sys/amd64/acpica/acpi_wakeup.c b/sys/amd64/acpica/acpi_wakeup.c
index 51d6d5e36840..99565fbb69ca 100644
--- a/sys/amd64/acpica/acpi_wakeup.c
+++ b/sys/amd64/acpica/acpi_wakeup.c
@@ -54,10 +54,8 @@
#include <x86/apicreg.h>
#include <x86/apicvar.h>
-#ifdef SMP
#include <machine/smp.h>
#include <machine/vmparam.h>
-#endif
#include <contrib/dev/acpica/include/acpi.h>
@@ -73,19 +71,13 @@ extern int acpi_resume_beep;
extern int acpi_reset_video;
extern int acpi_susp_bounce;
-#ifdef SMP
extern struct susppcb **susppcbs;
static cpuset_t suspcpus;
-#else
-static struct susppcb **susppcbs;
-#endif
static void acpi_stop_beep(void *);
-#ifdef SMP
static int acpi_wakeup_ap(struct acpi_softc *, int);
static void acpi_wakeup_cpus(struct acpi_softc *);
-#endif
#define ACPI_WAKEPT_PAGES 7
@@ -103,7 +95,6 @@ acpi_stop_beep(void *arg)
timer_spkr_release();
}
-#ifdef SMP
static int
acpi_wakeup_ap(struct acpi_softc *sc, int cpu)
{
@@ -177,7 +168,6 @@ acpi_wakeup_cpus(struct acpi_softc *sc)
outb(CMOS_DATA, mpbiosreason);
}
}
-#endif
int
acpi_sleep_machdep(struct acpi_softc *sc, int state)
@@ -190,10 +180,8 @@ acpi_sleep_machdep(struct acpi_softc *sc, int state)
if (sc->acpi_wakeaddr == 0ul)
return (-1); /* couldn't alloc wake memory */
-#ifdef SMP
suspcpus = all_cpus;
CPU_CLR(PCPU_GET(cpuid), &suspcpus);
-#endif
if (acpi_resume_beep != 0)
timer_spkr_acquire();
@@ -208,12 +196,10 @@ acpi_sleep_machdep(struct acpi_softc *sc, int state)
pcb = &susppcbs[0]->sp_pcb;
if (savectx(pcb)) {
fpususpend(susppcbs[0]->sp_fpususpend);
-#ifdef SMP
if (!CPU_EMPTY(&suspcpus) && suspend_cpus(suspcpus) == 0) {
device_printf(sc->acpi_dev, "Failed to suspend APs\n");
return (0); /* couldn't sleep */
}
-#endif
hw_ibrs_ibpb_active = 0;
hw_ssb_active = 0;
cpu_stdext_feature3 = 0;
@@ -278,16 +264,12 @@ acpi_wakeup_machdep(struct acpi_softc *sc, int state, int sleep_result,
PCPU_SET(switchtime, 0);
PCPU_SET(switchticks, ticks);
lapic_xapic_mode();
-#ifdef SMP
if (!CPU_EMPTY(&suspcpus))
acpi_wakeup_cpus(sc);
-#endif
}
-#ifdef SMP
if (!CPU_EMPTY(&suspcpus))
resume_cpus(suspcpus);
-#endif
/*
* Re-read cpu_stdext_feature3, which was zeroed-out
diff --git a/sys/amd64/amd64/apic_vector.S b/sys/amd64/amd64/apic_vector.S
index 6e51ebff298a..e98bae9eb6c5 100644
--- a/sys/amd64/amd64/apic_vector.S
+++ b/sys/amd64/amd64/apic_vector.S
@@ -49,12 +49,6 @@
#include <machine/specialreg.h>
#include <x86/apicreg.h>
-#ifdef SMP
-#define LK lock ;
-#else
-#define LK
-#endif
-
.text
SUPERALIGN_TEXT
/* End Of Interrupt to APIC */
@@ -163,7 +157,6 @@ IDTVEC(spuriousint)
jmp doreti
#endif
-#ifdef SMP
/*
* Global address space TLB shootdown.
*/
@@ -270,5 +263,3 @@ IDTVEC(justreturn)
INTR_HANDLER justreturn1
call as_lapic_eoi
jmp doreti
-
-#endif /* SMP */
diff --git a/sys/amd64/amd64/cpu_switch.S b/sys/amd64/amd64/cpu_switch.S
index a053f6c70af1..d7e954f573b0 100644
--- a/sys/amd64/amd64/cpu_switch.S
+++ b/sys/amd64/amd64/cpu_switch.S
@@ -136,7 +136,7 @@ ctx_switch_fpusave_done:
movq %r15,TD_LOCK(%r13) /* Release the old thread */
sw1:
leaq TD_MD_PCB(%r12),%r8
-#if defined(SCHED_ULE) && defined(SMP)
+#if defined(SCHED_ULE)
movq $blocked_lock, %rdx
movq TD_LOCK(%r12),%rcx
cmpq %rcx, %rdx
@@ -492,7 +492,7 @@ ENTRY(resumectx)
END(resumectx)
/* Wait for the new thread to become unblocked */
-#if defined(SCHED_ULE) && defined(SMP)
+#if defined(SCHED_ULE)
sw1wait:
1:
pause
diff --git a/sys/amd64/amd64/efirt_machdep.c b/sys/amd64/amd64/efirt_machdep.c
index 81a28ebe97ee..fe5d60c978dd 100644
--- a/sys/amd64/amd64/efirt_machdep.c
+++ b/sys/amd64/amd64/efirt_machdep.c
@@ -56,6 +56,13 @@
#include <vm/vm_pager.h>
#include <vm/vm_radix.h>
+/* The EFI regions we're allowed to map. */
+#define EFI_ALLOWED_TYPES_MASK ( \
+ 1u << EFI_MD_TYPE_BS_CODE | 1u << EFI_MD_TYPE_BS_DATA | \
+ 1u << EFI_MD_TYPE_RT_CODE | 1u << EFI_MD_TYPE_RT_DATA | \
+ 1u << EFI_MD_TYPE_FIRMWARE \
+)
+
static pml5_entry_t *efi_pml5;
static pml4_entry_t *efi_pml4;
static vm_object_t obj_1t1_pt;
@@ -181,6 +188,7 @@ efi_create_1t1_map(struct efi_md *map, int ndesc, int descsz)
vm_offset_t va;
uint64_t idx;
int bits, i, mode;
+ bool map_pz = true;
obj_1t1_pt = vm_pager_allocate(OBJT_PHYS, NULL, ptoa(1 +
NPML4EPG + NPML4EPG * NPDPEPG + NPML4EPG * NPDPEPG * NPDEPG),
@@ -198,9 +206,16 @@ efi_create_1t1_map(struct efi_md *map, int ndesc, int descsz)
pmap_pinit_pml4(efi_pmltop_page);
}
+ if ((efi_map_regs & ~EFI_ALLOWED_TYPES_MASK) != 0) {
+ printf("Ignoring the following runtime EFI regions: %#x\n",
+ efi_map_regs & ~EFI_ALLOWED_TYPES_MASK);
+ efi_map_regs &= EFI_ALLOWED_TYPES_MASK;
+ }
+
for (i = 0, p = map; i < ndesc; i++, p = efi_next_descriptor(p,
descsz)) {
- if ((p->md_attr & EFI_MD_ATTR_RT) == 0)
+ if ((p->md_attr & EFI_MD_ATTR_RT) == 0 &&
+ !EFI_MAP_BOOTTYPE_ALLOWED(p->md_type))
continue;
if (p->md_virt != 0 && p->md_virt != p->md_phys) {
if (bootverbose)
@@ -256,6 +271,22 @@ efi_create_1t1_map(struct efi_md *map, int ndesc, int descsz)
}
}
VM_OBJECT_WUNLOCK(obj_1t1_pt);
+ if (p->md_phys == 0)
+ map_pz = false;
+ }
+
+ /*
+ * Some BIOSes tend to access phys 0 during efirt calls,
+ * so map it if we haven't yet.
+ */
+ if (map_pz) {
+ VM_OBJECT_WLOCK(obj_1t1_pt);
+ pte = efi_1t1_pte(0);
+ /* Assume Write-Back */
+ bits = pmap_cache_bits(kernel_pmap, VM_MEMATTR_WRITE_BACK,
+ false) | X86_PG_RW | X86_PG_V;
+ pte_store(pte, bits);
+ VM_OBJECT_WUNLOCK(obj_1t1_pt);
}
return (true);
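
(Note: EFI_MAP_BOOTTYPE_ALLOWED(), consumed in the hunks above, is defined in the EFI headers this commit also touches, sys/amd64/include/efi.h and sys/sys/efi.h in the diffstat, which are not shown in this excerpt. A minimal sketch of the test, assuming efi_map_regs is a bitmap with one bit per EFI memory-descriptor type:)

/*
 * Sketch only; the authoritative definition lives in the efi headers
 * changed by this commit.  efi_map_regs holds one bit per EFI memory
 * descriptor type and is filled from the machdep.efirt.regs tunable,
 * so a boot-services region is "allowed" when its type's bit is set.
 */
extern uint32_t efi_map_regs;
#define EFI_MAP_BOOTTYPE_ALLOWED(type) (((efi_map_regs >> (type)) & 1) != 0)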
diff --git a/sys/amd64/amd64/exec_machdep.c b/sys/amd64/amd64/exec_machdep.c
index da68289e2c83..6752b716deb5 100644
--- a/sys/amd64/amd64/exec_machdep.c
+++ b/sys/amd64/amd64/exec_machdep.c
@@ -59,9 +59,7 @@
#include <sys/reg.h>
#include <sys/rwlock.h>
#include <sys/signalvar.h>
-#ifdef SMP
#include <sys/smp.h>
-#endif
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
diff --git a/sys/amd64/amd64/initcpu.c b/sys/amd64/amd64/initcpu.c
index 05e482f7783b..7f317674907e 100644
--- a/sys/amd64/amd64/initcpu.c
+++ b/sys/amd64/amd64/initcpu.c
@@ -325,6 +325,10 @@ initializecpu(void)
wrmsr(MSR_EFER, msr);
pg_nx = PG_NX;
}
+ if ((amd_feature2 & AMDID2_TCE) != 0) {
+ msr = rdmsr(MSR_EFER) | EFER_TCE;
+ wrmsr(MSR_EFER, msr);
+ }
hw_ibrs_recalculate(false);
hw_ssb_recalculate(false);
amd64_syscall_ret_flush_l1d_recalc();
diff --git a/sys/amd64/amd64/machdep.c b/sys/amd64/amd64/machdep.c
index 032a134bbd4b..37c7056f649c 100644
--- a/sys/amd64/amd64/machdep.c
+++ b/sys/amd64/amd64/machdep.c
@@ -38,7 +38,6 @@
* SUCH DAMAGE.
*/
-#include <sys/cdefs.h>
#include "opt_atpic.h"
#include "opt_cpu.h"
#include "opt_ddb.h"
@@ -82,9 +81,7 @@
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/signalvar.h>
-#ifdef SMP
#include <sys/smp.h>
-#endif
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
@@ -132,9 +129,7 @@
#include <machine/tss.h>
#include <x86/ucode.h>
#include <x86/ifunc.h>
-#ifdef SMP
#include <machine/smp.h>
-#endif
#ifdef FDT
#include <x86/fdt.h>
#endif
@@ -149,6 +144,10 @@
#include <isa/rtc.h>
#include <x86/init.h>
+#ifndef SMP
+#error amd64 requires options SMP
+#endif
+
/* Sanity check for __curthread() */
CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);
@@ -188,6 +187,12 @@ struct init_ops init_ops = {
*/
vm_paddr_t efi_systbl_phys;
+/*
+ * Bitmap of extra EFI memory region types that should be preserved and mapped
+ * during runtime services calls.
+ */
+uint32_t efi_map_regs;
+
/* Intel ICH registers */
#define ICH_PMBASE 0x400
#define ICH_SMI_EN ICH_PMBASE + 0x30
@@ -645,7 +650,7 @@ add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap,
* NB: physmap_idx points to the next free slot.
*/
insert_idx = physmap_idx;
- for (i = 0; i <= physmap_idx; i += 2) {
+ for (i = 0; i < physmap_idx; i += 2) {
if (base < physmap[i + 1]) {
if (base + length <= physmap[i]) {
insert_idx = i;
@@ -659,7 +664,7 @@ add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap,
}
/* See if we can prepend to the next entry. */
- if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) {
+ if (insert_idx < physmap_idx && base + length == physmap[insert_idx]) {
physmap[insert_idx] = base;
return (1);
}
@@ -670,8 +675,6 @@ add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap,
return (1);
}
- physmap_idx += 2;
- *physmap_idxp = physmap_idx;
if (physmap_idx == PHYS_AVAIL_ENTRIES) {
printf(
"Too many segments in the physical address map, giving up\n");
@@ -682,11 +685,14 @@ add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap,
* Move the last 'N' entries down to make room for the new
* entry if needed.
*/
- for (i = (physmap_idx - 2); i > insert_idx; i -= 2) {
+ for (i = physmap_idx; i > insert_idx; i -= 2) {
physmap[i] = physmap[i - 2];
physmap[i + 1] = physmap[i - 1];
}
+ physmap_idx += 2;
+ *physmap_idxp = physmap_idx;
+
/* Insert the new entry. */
physmap[insert_idx] = base;
physmap[insert_idx + 1] = base + length;
@@ -757,6 +763,7 @@ add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap,
printf("%23s %12s %12s %8s %4s\n",
"Type", "Physical", "Virtual", "#Pages", "Attr");
+ TUNABLE_INT_FETCH("machdep.efirt.regs", &efi_map_regs);
for (i = 0, p = map; i < ndesc; i++,
p = efi_next_descriptor(p, efihdr->descriptor_size)) {
if (boothowto & RB_VERBOSE) {
@@ -794,10 +801,13 @@ add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap,
}
switch (p->md_type) {
- case EFI_MD_TYPE_CODE:
- case EFI_MD_TYPE_DATA:
case EFI_MD_TYPE_BS_CODE:
case EFI_MD_TYPE_BS_DATA:
+ if (EFI_MAP_BOOTTYPE_ALLOWED(p->md_type))
+ continue;
+ /* FALLTHROUGH */
+ case EFI_MD_TYPE_CODE:
+ case EFI_MD_TYPE_DATA:
case EFI_MD_TYPE_FREE:
/*
* We're allowed to use any entry with these types.
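
(Aside: the add_physmap_entry() hunks above fix two related bugs: the scan loop ran one pair past the last valid entry, "i <= physmap_idx" instead of "i < physmap_idx", and physmap_idx was incremented before the shift loop, so the shift operated on the already-grown size. A simplified, self-contained sketch of the corrected insert-into-sorted-pairs flow, with the merge and prepend special cases of the real function omitted:)

#include <stdint.h>

/*
 * Simplified sketch of the corrected flow: physmap[] holds sorted,
 * non-overlapping [base, end) pairs and *idxp is the next free slot.
 * The scan stops at *idxp ('<', not '<='), and *idxp is advanced only
 * after the existing pairs have been shifted up to open the slot.
 */
static int
physmap_insert(uint64_t *physmap, int *idxp, int max_entries,
    uint64_t base, uint64_t end)
{
	int i, insert_idx;

	insert_idx = *idxp;
	for (i = 0; i < *idxp; i += 2) {
		if (base < physmap[i + 1]) {
			insert_idx = i;
			break;
		}
	}
	if (*idxp == max_entries)
		return (0);		/* table full */
	/* Shift later pairs up by one pair (two slots). */
	for (i = *idxp; i > insert_idx; i -= 2) {
		physmap[i] = physmap[i - 2];
		physmap[i + 1] = physmap[i - 1];
	}
	*idxp += 2;
	physmap[insert_idx] = base;
	physmap[insert_idx + 1] = end;
	return (1);
}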
diff --git a/sys/amd64/amd64/mem.c b/sys/amd64/amd64/mem.c
index 413b7c74890e..851f2df0e6e1 100644
--- a/sys/amd64/amd64/mem.c
+++ b/sys/amd64/amd64/mem.c
@@ -105,8 +105,8 @@ memrw(struct cdev *dev, struct uio *uio, int flags)
* PAGE_SIZE, the uiomove() call does not
* access past the end of the direct map.
*/
- if (v >= DMAP_MIN_ADDRESS &&
- v < DMAP_MIN_ADDRESS + dmaplimit) {
+ if (v >= kva_layout.dmap_low &&
+ v < kva_layout.dmap_high) {
error = uiomove((void *)v, c, uio);
break;
}
diff --git a/sys/amd64/amd64/minidump_machdep.c b/sys/amd64/amd64/minidump_machdep.c
index 6d0917e16099..43bf81a991bf 100644
--- a/sys/amd64/amd64/minidump_machdep.c
+++ b/sys/amd64/amd64/minidump_machdep.c
@@ -186,7 +186,7 @@ cpu_minidumpsys(struct dumperinfo *di, const struct minidumpstate *state)
* tables, so care must be taken to read each entry only once.
*/
pmapsize = 0;
- for (va = VM_MIN_KERNEL_ADDRESS; va < kva_end; ) {
+ for (va = kva_layout.km_low; va < kva_end; ) {
/*
* We always write a page, even if it is zero. Each
* page written corresponds to 1GB of space
@@ -279,9 +279,9 @@ cpu_minidumpsys(struct dumperinfo *di, const struct minidumpstate *state)
mdhdr.msgbufsize = mbp->msg_size;
mdhdr.bitmapsize = round_page(BITSET_SIZE(vm_page_dump_pages));
mdhdr.pmapsize = pmapsize;
- mdhdr.kernbase = VM_MIN_KERNEL_ADDRESS;
- mdhdr.dmapbase = DMAP_MIN_ADDRESS;
- mdhdr.dmapend = DMAP_MAX_ADDRESS;
+ mdhdr.kernbase = kva_layout.km_low;
+ mdhdr.dmapbase = kva_layout.dmap_low;
+ mdhdr.dmapend = kva_layout.dmap_high;
mdhdr.dumpavailsize = round_page(sizeof(dump_avail));
dump_init_header(di, &kdh, KERNELDUMPMAGIC, KERNELDUMP_AMD64_VERSION,
@@ -323,7 +323,7 @@ cpu_minidumpsys(struct dumperinfo *di, const struct minidumpstate *state)
/* Dump kernel page directory pages */
bzero(fakepd, sizeof(fakepd));
- for (va = VM_MIN_KERNEL_ADDRESS; va < kva_end; va += NBPDP) {
+ for (va = kva_layout.km_low; va < kva_end; va += NBPDP) {
ii = pmap_pml4e_index(va);
pml4 = (uint64_t *)PHYS_TO_DMAP(KPML4phys) + ii;
pdp = (uint64_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
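
(The kva_layout fields tested in the mem.c and minidump_machdep.c hunks above are initialized by the pmap.c diff that follows; the structure itself is declared in sys/amd64/include/pmap.h, which appears in the diffstat but not in this excerpt. Reconstructed from the initializers below, its assumed shape is:)

/*
 * Assumed shape, reconstructed from the kva_layout/kva_layout_la57
 * initializers in the pmap.c hunk below; the authoritative definition
 * is in the (unshown) sys/amd64/include/pmap.h part of this commit.
 */
struct kva_layout_s {
	vm_offset_t	kva_min;	/* lowest kernel VA */
	vm_offset_t	kva_max;
	vm_offset_t	dmap_low;	/* replaces DMAP_MIN_ADDRESS */
	vm_offset_t	dmap_high;	/* replaces DMAP_MAX_ADDRESS */
	vm_offset_t	lm_low;		/* replaces LARGEMAP_MIN_ADDRESS */
	vm_offset_t	lm_high;
	vm_offset_t	km_low;		/* replaces VM_MIN_KERNEL_ADDRESS */
	vm_offset_t	km_high;	/* replaces VM_MAX_KERNEL_ADDRESS */
	vm_offset_t	rec_pt;		/* recursive page table slot */
	vm_offset_t	kasan_shadow_low;
	vm_offset_t	kasan_shadow_high;
	vm_offset_t	kmsan_shadow_low;
	vm_offset_t	kmsan_shadow_high;
	vm_offset_t	kmsan_origin_low;
	vm_offset_t	kmsan_origin_high;
};
extern struct kva_layout_s kva_layout;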
diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c
index 2ab8c3b17e22..d1d80afccdc7 100644
--- a/sys/amd64/amd64/pmap.c
+++ b/sys/amd64/amd64/pmap.c
@@ -162,9 +162,7 @@
#include <machine/msan.h>
#include <machine/pcb.h>
#include <machine/specialreg.h>
-#ifdef SMP
#include <machine/smp.h>
-#endif
#include <machine/sysarch.h>
#include <machine/tss.h>
@@ -415,7 +413,7 @@ SYSCTL_INT(_machdep, OID_AUTO, nkpt, CTLFLAG_RD, &nkpt, 0,
static int ndmpdp;
vm_paddr_t dmaplimit;
-vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS;
+vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS_LA48;
pt_entry_t pg_nx;
static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
@@ -475,11 +473,56 @@ _Static_assert(DMPML4I + NDMPML4E <= KMSANSHADPML4I, "direct map overflow");
static pml4_entry_t *kernel_pml4;
static u_int64_t DMPDphys; /* phys addr of direct mapped level 2 */
static u_int64_t DMPDPphys; /* phys addr of direct mapped level 3 */
+static u_int64_t DMPML4phys; /* ... level 4, for la57 */
static int ndmpdpphys; /* number of DMPDPphys pages */
vm_paddr_t kernphys; /* phys addr of start of bootstrap data */
vm_paddr_t KERNend; /* and the end */
+struct kva_layout_s kva_layout = {
+ .kva_min = KV4ADDR(PML4PML4I, 0, 0, 0),
+ .kva_max = KV4ADDR(NPML4EPG - 1, NPDPEPG - 1,
+ NPDEPG - 1, NPTEPG - 1),
+ .dmap_low = KV4ADDR(DMPML4I, 0, 0, 0),
+ .dmap_high = KV4ADDR(DMPML4I + NDMPML4E, 0, 0, 0),
+ .lm_low = KV4ADDR(LMSPML4I, 0, 0, 0),
+ .lm_high = KV4ADDR(LMEPML4I + 1, 0, 0, 0),
+ .km_low = KV4ADDR(KPML4BASE, 0, 0, 0),
+ .km_high = KV4ADDR(KPML4BASE + NKPML4E - 1, NPDPEPG - 1,
+ NPDEPG - 1, NPTEPG - 1),
+ .rec_pt = KV4ADDR(PML4PML4I, 0, 0, 0),
+ .kasan_shadow_low = KV4ADDR(KASANPML4I, 0, 0, 0),
+ .kasan_shadow_high = KV4ADDR(KASANPML4I + NKASANPML4E, 0, 0, 0),
+ .kmsan_shadow_low = KV4ADDR(KMSANSHADPML4I, 0, 0, 0),
+ .kmsan_shadow_high = KV4ADDR(KMSANSHADPML4I + NKMSANSHADPML4E,
+ 0, 0, 0),
+ .kmsan_origin_low = KV4ADDR(KMSANORIGPML4I, 0, 0, 0),
+ .kmsan_origin_high = KV4ADDR(KMSANORIGPML4I + NKMSANORIGPML4E,
+ 0, 0, 0),
+};
+
+struct kva_layout_s kva_layout_la57 = {
+ .kva_min = KV5ADDR(NPML5EPG / 2, 0, 0, 0, 0), /* == rec_pt */
+ .kva_max = KV5ADDR(NPML5EPG - 1, NPML4EPG - 1, NPDPEPG - 1,
+ NPDEPG - 1, NPTEPG - 1),
+ .dmap_low = KV5ADDR(DMPML5I, 0, 0, 0, 0),
+ .dmap_high = KV5ADDR(DMPML5I + NDMPML5E, 0, 0, 0, 0),
+ .lm_low = KV5ADDR(LMSPML5I, 0, 0, 0, 0),
+ .lm_high = KV5ADDR(LMEPML5I + 1, 0, 0, 0, 0),
+ .km_low = KV4ADDR(KPML4BASE, 0, 0, 0),
+ .km_high = KV4ADDR(KPML4BASE + NKPML4E - 1, NPDPEPG - 1,
+ NPDEPG - 1, NPTEPG - 1),
+ .rec_pt = KV5ADDR(PML5PML5I, 0, 0, 0, 0),
+ .kasan_shadow_low = KV4ADDR(KASANPML4I, 0, 0, 0),
+ .kasan_shadow_high = KV4ADDR(KASANPML4I + NKASANPML4E, 0, 0, 0),
+ .kmsan_shadow_low = KV4ADDR(KMSANSHADPML4I, 0, 0, 0),
+ .kmsan_shadow_high = KV4ADDR(KMSANSHADPML4I + NKMSANSHADPML4E,
+ 0, 0, 0),
+ .kmsan_origin_low = KV4ADDR(KMSANORIGPML4I, 0, 0, 0),
+ .kmsan_origin_high = KV4ADDR(KMSANORIGPML4I + NKMSANORIGPML4E,
+ 0, 0, 0),
+};
+
/*
* pmap_mapdev support pre initialization (i.e. console)
*/
@@ -549,8 +592,8 @@ static int pmap_flags = PMAP_PDE_SUPERPAGE; /* flags for x86 pmaps */
static vmem_t *large_vmem;
static u_int lm_ents;
-#define PMAP_ADDRESS_IN_LARGEMAP(va) ((va) >= LARGEMAP_MIN_ADDRESS && \
- (va) < LARGEMAP_MIN_ADDRESS + NBPML4 * (u_long)lm_ents)
+#define PMAP_ADDRESS_IN_LARGEMAP(va) ((va) >= kva_layout.lm_low && \
+ (va) < kva_layout.lm_high)
int pmap_pcid_enabled = 1;
SYSCTL_INT(_vm_pmap, OID_AUTO, pcid_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
@@ -1301,8 +1344,10 @@ static int pmap_change_props_locked(vm_offset_t va, vm_size_t size,
static bool pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
static bool pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde,
vm_offset_t va, struct rwlock **lockp);
+static bool pmap_demote_pde_mpte(pmap_t pmap, pd_entry_t *pde,
+ vm_offset_t va, struct rwlock **lockp, vm_page_t mpte);
static bool pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe,
- vm_offset_t va);
+ vm_offset_t va, vm_page_t m);
static int pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m,
vm_prot_t prot, struct rwlock **lockp);
static int pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde,
@@ -1334,7 +1379,7 @@ static pdp_entry_t *pmap_pti_pdpe(vm_offset_t va);
static pd_entry_t *pmap_pti_pde(vm_offset_t va);
static void pmap_pti_wire_pte(void *pte);
static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
- struct spglist *free, struct rwlock **lockp);
+ bool demote_kpde, struct spglist *free, struct rwlock **lockp);
static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva,
pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp);
static vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va);
@@ -1720,7 +1765,7 @@ create_pagetables(vm_paddr_t *firstaddr)
{
pd_entry_t *pd_p;
pdp_entry_t *pdp_p;
- pml4_entry_t *p4_p;
+ pml4_entry_t *p4_p, *p4d_p;
pml5_entry_t *p5_p;
uint64_t DMPDkernphys;
vm_paddr_t pax;
@@ -1730,7 +1775,7 @@ create_pagetables(vm_paddr_t *firstaddr)
vm_offset_t kasankernbase;
int kasankpdpi, kasankpdi, nkasanpte;
#endif
- int i, j, ndm1g, nkpdpe, nkdmpde;
+ int i, j, ndm1g, nkpdpe, nkdmpde, ndmpml4phys;
TSENTER();
/* Allocate page table pages for the direct map */
@@ -1738,15 +1783,30 @@ create_pagetables(vm_paddr_t *firstaddr)
if (ndmpdp < 4) /* Minimum 4GB of dirmap */
ndmpdp = 4;
ndmpdpphys = howmany(ndmpdp, NPDPEPG);
- if (ndmpdpphys > NDMPML4E) {
- /*
- * Each NDMPML4E allows 512 GB, so limit to that,
- * and then readjust ndmpdp and ndmpdpphys.
- */
- printf("NDMPML4E limits system to %d GB\n", NDMPML4E * 512);
- Maxmem = atop(NDMPML4E * NBPML4);
- ndmpdpphys = NDMPML4E;
- ndmpdp = NDMPML4E * NPDEPG;
+ if (la57) {
+ ndmpml4phys = howmany(ndmpdpphys, NPML4EPG);
+ if (ndmpml4phys > NDMPML5E) {
+ printf("NDMPML5E limits system to %ld GB\n",
+ (u_long)NDMPML5E * NBPML5 / 1024 / 1024 / 1024);
+ Maxmem = atop(NDMPML5E * NBPML5);
+ ndmpml4phys = NDMPML5E;
+ ndmpdpphys = ndmpml4phys * NPML4EPG;
+ ndmpdp = ndmpdpphys * NPDEPG;
+ }
+ DMPML4phys = allocpages(firstaddr, ndmpml4phys);
+ } else {
+ if (ndmpdpphys > NDMPML4E) {
+ /*
+ * Each NDMPML4E allows 512 GB, so limit to
+ * that, and then readjust ndmpdp and
+ * ndmpdpphys.
+ */
+ printf("NDMPML4E limits system to %d GB\n",
+ NDMPML4E * 512);
+ Maxmem = atop(NDMPML4E * NBPML4);
+ ndmpdpphys = NDMPML4E;
+ ndmpdp = NDMPML4E * NPDEPG;
+ }
}
DMPDPphys = allocpages(firstaddr, ndmpdpphys);
ndm1g = 0;
@@ -1771,7 +1831,13 @@ create_pagetables(vm_paddr_t *firstaddr)
dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT;
/* Allocate pages. */
+ if (la57) {
+ KPML5phys = allocpages(firstaddr, 1);
+ p5_p = (pml5_entry_t *)KPML5phys;
+ }
KPML4phys = allocpages(firstaddr, 1);
+ p4_p = (pml4_entry_t *)KPML4phys;
+
KPDPphys = allocpages(firstaddr, NKPML4E);
#ifdef KASAN
KASANPDPphys = allocpages(firstaddr, NKASANPML4E);
@@ -1891,6 +1957,16 @@ create_pagetables(vm_paddr_t *firstaddr)
}
/*
+ * Connect the Direct Map slots up to the PML4.
+ * pml5 entries for DMAP are handled below in global pml5 loop.
+ */
+ p4d_p = la57 ? (pml4_entry_t *)DMPML4phys : &p4_p[DMPML4I];
+ for (i = 0; i < ndmpdpphys; i++) {
+ p4d_p[i] = (DMPDPphys + ptoa(i)) | X86_PG_RW | X86_PG_V |
+ pg_nx;
+ }
+
+ /*
* Instead of using a 1G page for the memory containing the kernel,
* use 2M pages with read-only and no-execute permissions. (If using 1G
* pages, this will partially overwrite the PDPEs above.)
@@ -1909,11 +1985,6 @@ create_pagetables(vm_paddr_t *firstaddr)
}
}
- /* And recursively map PML4 to itself in order to get PTmap */
- p4_p = (pml4_entry_t *)KPML4phys;
- p4_p[PML4PML4I] = KPML4phys;
- p4_p[PML4PML4I] |= X86_PG_RW | X86_PG_V | pg_nx;
-
#ifdef KASAN
/* Connect the KASAN shadow map slots up to the PML4. */
for (i = 0; i < NKASANPML4E; i++) {
@@ -1936,25 +2007,15 @@ create_pagetables(vm_paddr_t *firstaddr)
}
#endif
- /* Connect the Direct Map slots up to the PML4. */
- for (i = 0; i < ndmpdpphys; i++) {
- p4_p[DMPML4I + i] = DMPDPphys + ptoa(i);
- p4_p[DMPML4I + i] |= X86_PG_RW | X86_PG_V | pg_nx;
- }
-
/* Connect the KVA slots up to the PML4 */
for (i = 0; i < NKPML4E; i++) {
p4_p[KPML4BASE + i] = KPDPphys + ptoa(i);
p4_p[KPML4BASE + i] |= X86_PG_RW | X86_PG_V;
}
- kernel_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys);
-
if (la57) {
/* XXXKIB bootstrap KPML5phys page is lost */
- KPML5phys = allocpages(firstaddr, 1);
- for (i = 0, p5_p = (pml5_entry_t *)KPML5phys; i < NPML5EPG;
- i++) {
+ for (i = 0; i < NPML5EPG; i++) {
if (i == PML5PML5I) {
/*
* Recursively map PML5 to itself in
@@ -1962,6 +2023,10 @@ create_pagetables(vm_paddr_t *firstaddr)
*/
p5_p[i] = KPML5phys | X86_PG_RW | X86_PG_A |
X86_PG_M | X86_PG_V | pg_nx;
+ } else if (i >= DMPML5I && i < DMPML5I + ndmpml4phys) {
+ /* Connect DMAP pml4 pages to PML5. */
+ p5_p[i] = (DMPML4phys + ptoa(i - DMPML5I)) |
+ X86_PG_RW | X86_PG_V | pg_nx;
} else if (i == pmap_pml5e_index(UPT_MAX_ADDRESS)) {
p5_p[i] = KPML4phys | X86_PG_RW | X86_PG_A |
X86_PG_M | X86_PG_V;
@@ -1969,6 +2034,10 @@ create_pagetables(vm_paddr_t *firstaddr)
p5_p[i] = 0;
}
}
+ } else {
+ /* Recursively map PML4 to itself in order to get PTmap */
+ p4_p[PML4PML4I] = KPML4phys;
+ p4_p[PML4PML4I] |= X86_PG_RW | X86_PG_V | pg_nx;
}
TSEXIT();
}
@@ -2022,7 +2091,7 @@ pmap_bootstrap(vm_paddr_t *firstaddr)
*/
virtual_avail = (vm_offset_t)KERNSTART + round_2mpage(KERNend -
(vm_paddr_t)kernphys);
- virtual_end = VM_MAX_KERNEL_ADDRESS;
+ virtual_end = kva_layout.km_high;
/*
* Enable PG_G global pages, then switch to the kernel page
@@ -2044,9 +2113,13 @@ pmap_bootstrap(vm_paddr_t *firstaddr)
* Initialize the kernel pmap (which is statically allocated).
* Count bootstrap data as being resident in case any of this data is
* later unmapped (using pmap_remove()) and freed.
+ *
+ * DMAP_TO_PHYS()/PHYS_TO_DMAP() are functional only after
+ * kva_layout is fixed.
*/
PMAP_LOCK_INIT(kernel_pmap);
if (la57) {
+ kva_layout = kva_layout_la57;
vtoptem = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT +
NPML4EPGSHIFT + NPML5EPGSHIFT)) - 1) << 3;
PTmap = (vm_offset_t)P5Tmap;
@@ -2057,6 +2130,7 @@ pmap_bootstrap(vm_paddr_t *firstaddr)
kernel_pmap->pm_cr3 = KPML5phys;
pmap_pt_page_count_adj(kernel_pmap, 1); /* top-level page */
} else {
+ kernel_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys);
kernel_pmap->pm_pmltop = kernel_pml4;
kernel_pmap->pm_cr3 = KPML4phys;
}
@@ -2418,6 +2492,8 @@ pmap_init(void)
{
struct pmap_preinit_mapping *ppim;
vm_page_t m, mpte;
+ pml4_entry_t *pml4e;
+ unsigned long lm_max;
int error, i, ret, skz63;
/* L1TF, reserve page @0 unconditionally */
@@ -2543,10 +2619,15 @@ pmap_init(void)
lm_ents = 8;
TUNABLE_INT_FETCH("vm.pmap.large_map_pml4_entries", &lm_ents);
- if (lm_ents > LMEPML4I - LMSPML4I + 1)
- lm_ents = LMEPML4I - LMSPML4I + 1;
+ lm_max = (kva_layout.lm_high - kva_layout.lm_low) / NBPML4;
+ if (lm_ents > lm_max) {
+ printf(
+ "pmap: shrinking large map from requested %d slots to %ld slots\n",
+ lm_ents, lm_max);
+ lm_ents = lm_max;
+ }
#ifdef KMSAN
- if (lm_ents > KMSANORIGPML4I - LMSPML4I) {
+ if (!la57 && lm_ents > KMSANORIGPML4I - LMSPML4I) {
printf(
"pmap: shrinking large map for KMSAN (%d slots to %ld slots)\n",
lm_ents, KMSANORIGPML4I - LMSPML4I);
@@ -2557,18 +2638,27 @@ pmap_init(void)
printf("pmap: large map %u PML4 slots (%lu GB)\n",
lm_ents, (u_long)lm_ents * (NBPML4 / 1024 / 1024 / 1024));
if (lm_ents != 0) {
- large_vmem = vmem_create("large", LARGEMAP_MIN_ADDRESS,
+ large_vmem = vmem_create("large", kva_layout.lm_low,
(vmem_size_t)lm_ents * NBPML4, PAGE_SIZE, 0, M_WAITOK);
if (large_vmem == NULL) {
printf("pmap: cannot create large map\n");
lm_ents = 0;
}
+ if (la57) {
+ for (i = 0; i < howmany((vm_offset_t)NBPML4 *
+ lm_ents, NBPML5); i++) {
+ m = pmap_large_map_getptp_unlocked();
+ kernel_pmap->pm_pmltop[LMSPML5I + i] = X86_PG_V |
+ X86_PG_RW | X86_PG_A | X86_PG_M |
+ pg_nx | VM_PAGE_TO_PHYS(m);
+ }
+ }
for (i = 0; i < lm_ents; i++) {
m = pmap_large_map_getptp_unlocked();
- /* XXXKIB la57 */
- kernel_pml4[LMSPML4I + i] = X86_PG_V |
- X86_PG_RW | X86_PG_A | X86_PG_M | pg_nx |
- VM_PAGE_TO_PHYS(m);
+ pml4e = pmap_pml4e(kernel_pmap, kva_layout.lm_low +
+ (u_long)i * NBPML4);
+ *pml4e = X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M |
+ pg_nx | VM_PAGE_TO_PHYS(m);
}
}
}
@@ -2973,7 +3063,6 @@ pmap_update_pde_invalidate(pmap_t pmap, vm_offset_t va, pd_entry_t newpde)
* XXX TODO
*/
-#ifdef SMP
/*
* Interrupt the cpus that are executing in the guest context.
* This will force the vcpu to exit and the cached EPT mappings
@@ -3431,168 +3520,6 @@ pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
}
sched_unpin();
}
-#else /* !SMP */
-/*
- * Normal, non-SMP, invalidation functions.
- */
-void
-pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
-{
- struct invpcid_descr d;
- struct pmap_pcid *pcidp;
- uint64_t kcr3, ucr3;
- uint32_t pcid;
-
- if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) {
- pmap->pm_eptgen++;
- return;
- }
- KASSERT(pmap->pm_type == PT_X86,
- ("pmap_invalidate_range: unknown type %d", pmap->pm_type));
-
- if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) {
- invlpg(va);
- if (pmap == PCPU_GET(curpmap) && pmap_pcid_enabled &&
- pmap->pm_ucr3 != PMAP_NO_CR3) {
- critical_enter();
- pcid = pmap_get_pcid(pmap);
- if (invpcid_works) {
- d.pcid = pcid | PMAP_PCID_USER_PT;
- d.pad = 0;
- d.addr = va;
- invpcid(&d, INVPCID_ADDR);
- } else {
- kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE;
- ucr3 = pmap->pm_ucr3 | pcid |
- PMAP_PCID_USER_PT | CR3_PCID_SAVE;
- pmap_pti_pcid_invlpg(ucr3, kcr3, va);
- }
- critical_exit();
- }
- } else if (pmap_pcid_enabled) {
- pcidp = zpcpu_get(pmap->pm_pcidp);
- pcidp->pm_gen = 0;
- }
-}
-
-void
-pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
-{
- struct invpcid_descr d;
- struct pmap_pcid *pcidp;
- vm_offset_t addr;
- uint64_t kcr3, ucr3;
- uint32_t pcid;
-
- if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) {
- pmap->pm_eptgen++;
- return;
- }
- KASSERT(pmap->pm_type == PT_X86,
- ("pmap_invalidate_range: unknown type %d", pmap->pm_type));
-
- if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) {
- for (addr = sva; addr < eva; addr += PAGE_SIZE)
- invlpg(addr);
- if (pmap == PCPU_GET(curpmap) && pmap_pcid_enabled &&
- pmap->pm_ucr3 != PMAP_NO_CR3) {
- critical_enter();
- pcid = pmap_get_pcid(pmap);
- if (invpcid_works) {
- d.pcid = pcid | PMAP_PCID_USER_PT;
- d.pad = 0;
- d.addr = sva;
- for (; d.addr < eva; d.addr += PAGE_SIZE)
- invpcid(&d, INVPCID_ADDR);
- } else {
- kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE;
- ucr3 = pmap->pm_ucr3 | pcid |
- PMAP_PCID_USER_PT | CR3_PCID_SAVE;
- pmap_pti_pcid_invlrng(ucr3, kcr3, sva, eva);
- }
- critical_exit();
- }
- } else if (pmap_pcid_enabled) {
- pcidp = zpcpu_get(pmap->pm_pcidp);
- pcidp->pm_gen = 0;
- }
-}
-
-void
-pmap_invalidate_all(pmap_t pmap)
-{
- struct invpcid_descr d;
- struct pmap_pcid *pcidp;
- uint64_t kcr3, ucr3;
- uint32_t pcid;
-
- if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) {
- pmap->pm_eptgen++;
- return;
- }
- KASSERT(pmap->pm_type == PT_X86,
- ("pmap_invalidate_all: unknown type %d", pmap->pm_type));
-
- if (pmap == kernel_pmap) {
- if (pmap_pcid_enabled && invpcid_works) {
- bzero(&d, sizeof(d));
- invpcid(&d, INVPCID_CTXGLOB);
- } else {
- invltlb_glob();
- }
- } else if (pmap == PCPU_GET(curpmap)) {
- if (pmap_pcid_enabled) {
- critical_enter();
- pcid = pmap_get_pcid(pmap);
- if (invpcid_works) {
- d.pcid = pcid;
- d.pad = 0;
- d.addr = 0;
- invpcid(&d, INVPCID_CTX);
- if (pmap->pm_ucr3 != PMAP_NO_CR3) {
- d.pcid |= PMAP_PCID_USER_PT;
- invpcid(&d, INVPCID_CTX);
- }
- } else {
- kcr3 = pmap->pm_cr3 | pcid;
- if (pmap->pm_ucr3 != PMAP_NO_CR3) {
- ucr3 = pmap->pm_ucr3 | pcid |
- PMAP_PCID_USER_PT;
- pmap_pti_pcid_invalidate(ucr3, kcr3);
- } else
- load_cr3(kcr3);
- }
- critical_exit();
- } else {
- invltlb();
- }
- } else if (pmap_pcid_enabled) {
- pcidp = zpcpu_get(pmap->pm_pcidp);
- pcidp->pm_gen = 0;
- }
-}
-
-void
-pmap_invalidate_cache(void)
-{
-
- wbinvd();
-}
-
-static void
-pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
-{
- struct pmap_pcid *pcidp;
-
- pmap_update_pde_store(pmap, pde, newpde);
- if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap))
- pmap_update_pde_invalidate(pmap, va, newpde);
- else {
- pcidp = zpcpu_get(pmap->pm_pcidp);
- pcidp->pm_gen = 0;
- }
-}
-#endif /* !SMP */
static void
pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, pd_entry_t pde)
@@ -3897,7 +3824,7 @@ pmap_kextract(vm_offset_t va)
pd_entry_t pde;
vm_paddr_t pa;
- if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
+ if (va >= kva_layout.dmap_low && va < kva_layout.dmap_high) {
pa = DMAP_TO_PHYS(va);
} else if (PMAP_ADDRESS_IN_LARGEMAP(va)) {
pa = pmap_large_map_kextract(va);
@@ -4038,7 +3965,7 @@ pmap_qremove(vm_offset_t sva, int count)
* enough to one of those pmap_enter() calls for it to
* be caught up in a promotion.
*/
- KASSERT(va >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", va));
+ KASSERT(va >= kva_layout.km_low, ("usermode va %lx", va));
KASSERT((*vtopde(va) & X86_PG_PS) == 0,
("pmap_qremove on promoted va %#lx", va));
@@ -4326,21 +4253,13 @@ void
pmap_pinit_pml5(vm_page_t pml5pg)
{
pml5_entry_t *pm_pml5;
+ int i;
pm_pml5 = (pml5_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml5pg));
-
- /*
- * Add pml5 entry at top of KVA pointing to existing pml4 table,
- * entering all existing kernel mappings into level 5 table.
- */
- pm_pml5[pmap_pml5e_index(UPT_MAX_ADDRESS)] = KPML4phys | X86_PG_V |
- X86_PG_RW | X86_PG_A | X86_PG_M;
-
- /*
- * Install self-referential address mapping entry.
- */
- pm_pml5[PML5PML5I] = VM_PAGE_TO_PHYS(pml5pg) |
- X86_PG_RW | X86_PG_V | X86_PG_M | X86_PG_A;
+ for (i = 0; i < NPML5EPG / 2; i++)
+ pm_pml5[i] = 0;
+ for (; i < NPML5EPG; i++)
+ pm_pml5[i] = kernel_pmap->pm_pmltop[i];
}
static void
@@ -4897,8 +4816,8 @@ pmap_release(pmap_t pmap)
m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pmltop));
if (pmap_is_la57(pmap)) {
- pmap->pm_pmltop[pmap_pml5e_index(UPT_MAX_ADDRESS)] = 0;
- pmap->pm_pmltop[PML5PML5I] = 0;
+ for (i = NPML5EPG / 2; i < NPML5EPG; i++)
+ pmap->pm_pmltop[i] = 0;
} else {
for (i = 0; i < NKPML4E; i++) /* KVA */
pmap->pm_pmltop[KPML4BASE + i] = 0;
@@ -4940,7 +4859,7 @@ pmap_release(pmap_t pmap)
static int
kvm_size(SYSCTL_HANDLER_ARGS)
{
- unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS;
+ unsigned long ksize = kva_layout.km_high - kva_layout.km_low;
return sysctl_handle_long(oidp, &ksize, 0, req);
}
@@ -4951,7 +4870,7 @@ SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE,
static int
kvm_free(SYSCTL_HANDLER_ARGS)
{
- unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
+ unsigned long kfree = kva_layout.km_high - kernel_vm_end;
return sysctl_handle_long(oidp, &kfree, 0, req);
}
@@ -5029,7 +4948,7 @@ pmap_page_array_startup(long pages)
vm_page_array_size = pages;
- start = VM_MIN_KERNEL_ADDRESS;
+ start = kva_layout.km_low;
end = start + pages * sizeof(struct vm_page);
for (va = start; va < end; va += NBPDR) {
pfn = first_page + (va - start) / sizeof(struct vm_page);
@@ -5999,7 +5918,7 @@ pmap_demote_pde_abort(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
SLIST_INIT(&free);
sva = trunc_2mpage(va);
- pmap_remove_pde(pmap, pde, sva, &free, lockp);
+ pmap_remove_pde(pmap, pde, sva, true, &free, lockp);
if ((oldpde & pmap_global_bit(pmap)) == 0)
pmap_invalidate_pde_page(pmap, sva, oldpde);
vm_page_free_pages_toq(&free, true);
@@ -6011,11 +5930,17 @@ static bool
pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
struct rwlock **lockp)
{
+ return (pmap_demote_pde_mpte(pmap, pde, va, lockp, NULL));
+}
+
+static bool
+pmap_demote_pde_mpte(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
+ struct rwlock **lockp, vm_page_t mpte)
+{
pd_entry_t newpde, oldpde;
pt_entry_t *firstpte, newpte;
pt_entry_t PG_A, PG_G, PG_M, PG_PKU_MASK, PG_RW, PG_V;
vm_paddr_t mptepa;
- vm_page_t mpte;
int PG_PTE_CACHE;
bool in_kernel;
@@ -6028,61 +5953,65 @@ pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
PG_PKU_MASK = pmap_pku_mask_bit(pmap);
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
- in_kernel = va >= VM_MAXUSER_ADDRESS;
oldpde = *pde;
KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V),
("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V"));
-
- /*
- * Invalidate the 2MB page mapping and return "failure" if the
- * mapping was never accessed.
- */
- if ((oldpde & PG_A) == 0) {
- KASSERT((oldpde & PG_W) == 0,
- ("pmap_demote_pde: a wired mapping is missing PG_A"));
- pmap_demote_pde_abort(pmap, va, pde, oldpde, lockp);
- return (false);
- }
-
- mpte = pmap_remove_pt_page(pmap, va);
+ KASSERT((oldpde & PG_MANAGED) == 0 || lockp != NULL,
+ ("pmap_demote_pde: lockp for a managed mapping is NULL"));
+ in_kernel = va >= VM_MAXUSER_ADDRESS;
if (mpte == NULL) {
- KASSERT((oldpde & PG_W) == 0,
- ("pmap_demote_pde: page table page for a wired mapping"
- " is missing"));
-
/*
- * If the page table page is missing and the mapping
- * is for a kernel address, the mapping must belong to
- * the direct map. Page table pages are preallocated
- * for every other part of the kernel address space,
- * so the direct map region is the only part of the
- * kernel address space that must be handled here.
+ * Invalidate the 2MB page mapping and return "failure" if the
+ * mapping was never accessed.
*/
- KASSERT(!in_kernel || (va >= DMAP_MIN_ADDRESS &&
- va < DMAP_MAX_ADDRESS),
- ("pmap_demote_pde: No saved mpte for va %#lx", va));
-
- /*
- * If the 2MB page mapping belongs to the direct map
- * region of the kernel's address space, then the page
- * allocation request specifies the highest possible
- * priority (VM_ALLOC_INTERRUPT). Otherwise, the
- * priority is normal.
- */
- mpte = pmap_alloc_pt_page(pmap, pmap_pde_pindex(va),
- (in_kernel ? VM_ALLOC_INTERRUPT : 0) | VM_ALLOC_WIRED);
-
- /*
- * If the allocation of the new page table page fails,
- * invalidate the 2MB page mapping and return "failure".
- */
- if (mpte == NULL) {
+ if ((oldpde & PG_A) == 0) {
+ KASSERT((oldpde & PG_W) == 0,
+ ("pmap_demote_pde: a wired mapping is missing PG_A"));
pmap_demote_pde_abort(pmap, va, pde, oldpde, lockp);
return (false);
}
- if (!in_kernel)
- mpte->ref_count = NPTEPG;
+ mpte = pmap_remove_pt_page(pmap, va);
+ if (mpte == NULL) {
+ KASSERT((oldpde & PG_W) == 0,
+ ("pmap_demote_pde: page table page for a wired mapping is missing"));
+
+ /*
+ * If the page table page is missing and the mapping
+ * is for a kernel address, the mapping must belong to
+ * the direct map. Page table pages are preallocated
+ * for every other part of the kernel address space,
+ * so the direct map region is the only part of the
+ * kernel address space that must be handled here.
+ */
+ KASSERT(!in_kernel || (va >= kva_layout.dmap_low &&
+ va < kva_layout.dmap_high),
+ ("pmap_demote_pde: No saved mpte for va %#lx", va));
+
+ /*
+ * If the 2MB page mapping belongs to the direct map
+ * region of the kernel's address space, then the page
+ * allocation request specifies the highest possible
+ * priority (VM_ALLOC_INTERRUPT). Otherwise, the
+ * priority is normal.
+ */
+ mpte = pmap_alloc_pt_page(pmap, pmap_pde_pindex(va),
+ (in_kernel ? VM_ALLOC_INTERRUPT : 0) |
+ VM_ALLOC_WIRED);
+
+ /*
+ * If the allocation of the new page table page fails,
+ * invalidate the 2MB page mapping and return "failure".
+ */
+ if (mpte == NULL) {
+ pmap_demote_pde_abort(pmap, va, pde, oldpde,
+ lockp);
+ return (false);
+ }
+
+ if (!in_kernel)
+ mpte->ref_count = NPTEPG;
+ }
}
mptepa = VM_PAGE_TO_PHYS(mpte);
firstpte = (pt_entry_t *)PHYS_TO_DMAP(mptepa);
@@ -6162,8 +6091,7 @@ pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap));
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
mpte = pmap_remove_pt_page(pmap, va);
- if (mpte == NULL)
- panic("pmap_remove_kernel_pde: Missing pt page.");
+ KASSERT(mpte != NULL, ("pmap_remove_kernel_pde: missing pt page"));
mptepa = VM_PAGE_TO_PHYS(mpte);
newpde = mptepa | X86_PG_M | X86_PG_A | X86_PG_RW | X86_PG_V;
@@ -6193,7 +6121,7 @@ pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
* pmap_remove_pde: do the things to unmap a superpage in a process
*/
static int
-pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
+pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, bool demote_kpde,
struct spglist *free, struct rwlock **lockp)
{
struct md_page *pvh;
@@ -6233,9 +6161,7 @@ pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
pmap_delayed_invl_page(m);
}
}
- if (pmap == kernel_pmap) {
- pmap_remove_kernel_pde(pmap, pdq, sva);
- } else {
+ if (pmap != kernel_pmap) {
mpte = pmap_remove_pt_page(pmap, sva);
if (mpte != NULL) {
KASSERT(vm_page_any_valid(mpte),
@@ -6246,6 +6172,14 @@ pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
mpte->ref_count = 0;
pmap_add_delayed_free_list(mpte, free, false);
}
+ } else if (demote_kpde) {
+ pmap_remove_kernel_pde(pmap, pdq, sva);
+ } else {
+ mpte = vm_radix_lookup(&pmap->pm_root, pmap_pde_pindex(sva));
+ if (vm_page_any_valid(mpte)) {
+ mpte->valid = 0;
+ pmap_zero_page(mpte);
+ }
}
return (pmap_unuse_pt(pmap, sva, *pmap_pdpe(pmap, sva), free));
}
@@ -6476,7 +6410,8 @@ pmap_remove1(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, bool map_delete)
*/
if ((ptpaddr & PG_G) == 0)
anyvalid = 1;
- pmap_remove_pde(pmap, pde, sva, &free, &lock);
+ pmap_remove_pde(pmap, pde, sva, true, &free,
+ &lock);
continue;
} else if (!pmap_demote_pde_locked(pmap, pde, sva,
&lock)) {
@@ -7166,7 +7101,7 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
PG_RW = pmap_rw_bit(pmap);
va = trunc_page(va);
- KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig"));
+ KASSERT(va <= kva_layout.km_high, ("pmap_enter: toobig"));
KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS,
("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)",
va));
@@ -7495,6 +7430,9 @@ pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags,
PG_RW = pmap_rw_bit(pmap);
KASSERT((newpde & (pmap_modified_bit(pmap) | PG_RW)) != PG_RW,
("pmap_enter_pde: newpde is missing PG_M"));
+ KASSERT((flags & (PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM)) !=
+ PMAP_ENTER_NORECLAIM,
+ ("pmap_enter_pde: flags is missing PMAP_ENTER_NOREPLACE"));
PG_V = pmap_valid_bit(pmap);
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
@@ -7552,13 +7490,35 @@ pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags,
/*
* The reference to the PD page that was acquired by
* pmap_alloc_pde() ensures that it won't be freed.
- * However, if the PDE resulted from a promotion, then
+ * However, if the PDE resulted from a promotion, and
+ * the mapping is not from kernel_pmap, then
* a reserved PT page could be freed.
*/
- (void)pmap_remove_pde(pmap, pde, va, &free, lockp);
+ (void)pmap_remove_pde(pmap, pde, va, false, &free,
+ lockp);
if ((oldpde & PG_G) == 0)
pmap_invalidate_pde_page(pmap, va, oldpde);
} else {
+ if (va >= VM_MAXUSER_ADDRESS) {
+ /*
+ * Try to save the ptp in the trie
+ * before any changes to mappings are
+ * made. Abort on failure.
+ */
+ mt = PHYS_TO_VM_PAGE(oldpde & PG_FRAME);
+ if (pmap_insert_pt_page(pmap, mt, false,
+ false)) {
+ CTR1(KTR_PMAP,
+ "pmap_enter_pde: cannot ins kern ptp va %#lx",
+ va);
+ return (KERN_RESOURCE_SHORTAGE);
+ }
+ /*
+ * Both pmap_remove_pde() and
+ * pmap_remove_ptes() will zero-fill
+ * the kernel page table page.
+ */
+ }
pmap_delayed_invl_start();
if (pmap_remove_ptes(pmap, va, va + NBPDR, pde, &free,
lockp))
@@ -7572,14 +7532,6 @@ pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags,
} else {
KASSERT(SLIST_EMPTY(&free),
("pmap_enter_pde: freed kernel page table page"));
-
- /*
- * Both pmap_remove_pde() and pmap_remove_ptes() will
- * leave the kernel page table page zero filled.
- */
- mt = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
- if (pmap_insert_pt_page(pmap, mt, false, false))
- panic("pmap_enter_pde: trie insert failed");
}
}
@@ -7609,6 +7561,14 @@ pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags,
if (!pmap_pv_insert_pde(pmap, va, newpde, flags, lockp)) {
if (pdpg != NULL)
pmap_abort_ptp(pmap, va, pdpg);
+ else {
+ KASSERT(va >= VM_MAXUSER_ADDRESS &&
+ (*pde & (PG_PS | PG_V)) == PG_V,
+ ("pmap_enter_pde: invalid kernel PDE"));
+ mt = pmap_remove_pt_page(pmap, va);
+ KASSERT(mt != NULL,
+ ("pmap_enter_pde: missing kernel PTP"));
+ }
if (uwptpg != NULL) {
mt = pmap_remove_pt_page(pmap, va);
KASSERT(mt == uwptpg,
@@ -9518,7 +9478,7 @@ pmap_unmapdev(void *p, vm_size_t size)
va = (vm_offset_t)p;
/* If we gave a direct map region in pmap_mapdev, do nothing */
- if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS)
+ if (va >= kva_layout.dmap_low && va < kva_layout.dmap_high)
return;
offset = va & PAGE_MASK;
size = round_page(offset + size);
@@ -9547,7 +9507,7 @@ pmap_unmapdev(void *p, vm_size_t size)
* Tries to demote a 1GB page mapping.
*/
static bool
-pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, vm_offset_t va)
+pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, vm_offset_t va, vm_page_t m)
{
pdp_entry_t newpdpe, oldpdpe;
pd_entry_t *firstpde, newpde, *pde;
@@ -9564,12 +9524,19 @@ pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, vm_offset_t va)
oldpdpe = *pdpe;
KASSERT((oldpdpe & (PG_PS | PG_V)) == (PG_PS | PG_V),
("pmap_demote_pdpe: oldpdpe is missing PG_PS and/or PG_V"));
- pdpg = pmap_alloc_pt_page(pmap, va >> PDPSHIFT,
- VM_ALLOC_WIRED | VM_ALLOC_INTERRUPT);
- if (pdpg == NULL) {
- CTR2(KTR_PMAP, "pmap_demote_pdpe: failure for va %#lx"
- " in pmap %p", va, pmap);
- return (false);
+ if (m == NULL) {
+ pdpg = pmap_alloc_pt_page(pmap, va >> PDPSHIFT,
+ VM_ALLOC_WIRED);
+ if (pdpg == NULL) {
+ CTR2(KTR_PMAP,
+ "pmap_demote_pdpe: failure for va %#lx in pmap %p",
+ va, pmap);
+ return (false);
+ }
+ } else {
+ pdpg = m;
+ pdpg->pindex = va >> PDPSHIFT;
+ pmap_pt_page_count_adj(pmap, 1);
}
pdpgpa = VM_PAGE_TO_PHYS(pdpg);
firstpde = (pd_entry_t *)PHYS_TO_DMAP(pdpgpa);
@@ -9610,6 +9577,8 @@ pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, vm_offset_t va)
void
pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
{
+ if (m->md.pat_mode == ma)
+ return;
m->md.pat_mode = ma;
@@ -9629,6 +9598,9 @@ pmap_page_set_memattr_noflush(vm_page_t m, vm_memattr_t ma)
{
int error;
+ if (m->md.pat_mode == ma)
+ return;
+
m->md.pat_mode = ma;
if ((m->flags & PG_FICTITIOUS) != 0)
@@ -9685,7 +9657,7 @@ pmap_change_prot(vm_offset_t va, vm_size_t size, vm_prot_t prot)
int error;
/* Only supported within the kernel map. */
- if (va < VM_MIN_KERNEL_ADDRESS)
+ if (va < kva_layout.km_low)
return (EINVAL);
PMAP_LOCK(kernel_pmap);
@@ -9716,7 +9688,7 @@ pmap_change_props_locked(vm_offset_t va, vm_size_t size, vm_prot_t prot,
* Only supported on kernel virtual addresses, including the direct
* map but excluding the recursive map.
*/
- if (base < DMAP_MIN_ADDRESS)
+ if (base < kva_layout.dmap_low)
return (EINVAL);
/*
@@ -9739,7 +9711,7 @@ pmap_change_props_locked(vm_offset_t va, vm_size_t size, vm_prot_t prot,
pte_bits |= X86_PG_RW;
}
if ((prot & VM_PROT_EXECUTE) == 0 ||
- va < VM_MIN_KERNEL_ADDRESS) {
+ va < kva_layout.km_low) {
pde_bits |= pg_nx;
pte_bits |= pg_nx;
}
@@ -9779,7 +9751,7 @@ pmap_change_props_locked(vm_offset_t va, vm_size_t size, vm_prot_t prot,
tmpva += NBPDP;
continue;
}
- if (!pmap_demote_pdpe(kernel_pmap, pdpe, tmpva))
+ if (!pmap_demote_pdpe(kernel_pmap, pdpe, tmpva, NULL))
return (ENOMEM);
}
pde = pmap_pdpe_to_pde(pdpe, tmpva);
@@ -9835,7 +9807,7 @@ pmap_change_props_locked(vm_offset_t va, vm_size_t size, vm_prot_t prot,
pmap_pte_props(pdpe, pde_bits, pde_mask);
changed = true;
}
- if (tmpva >= VM_MIN_KERNEL_ADDRESS &&
+ if (tmpva >= kva_layout.km_low &&
(*pdpe & PG_PS_FRAME) < dmaplimit) {
if (pa_start == pa_end) {
/* Start physical address run. */
@@ -9865,7 +9837,7 @@ pmap_change_props_locked(vm_offset_t va, vm_size_t size, vm_prot_t prot,
pmap_pte_props(pde, pde_bits, pde_mask);
changed = true;
}
- if (tmpva >= VM_MIN_KERNEL_ADDRESS &&
+ if (tmpva >= kva_layout.km_low &&
(*pde & PG_PS_FRAME) < dmaplimit) {
if (pa_start == pa_end) {
/* Start physical address run. */
@@ -9893,7 +9865,7 @@ pmap_change_props_locked(vm_offset_t va, vm_size_t size, vm_prot_t prot,
pmap_pte_props(pte, pte_bits, pte_mask);
changed = true;
}
- if (tmpva >= VM_MIN_KERNEL_ADDRESS &&
+ if (tmpva >= kva_layout.km_low &&
(*pte & PG_FRAME) < dmaplimit) {
if (pa_start == pa_end) {
/* Start physical address run. */
@@ -9937,11 +9909,13 @@ pmap_change_props_locked(vm_offset_t va, vm_size_t size, vm_prot_t prot,
}
/*
- * Demotes any mapping within the direct map region that covers more than the
- * specified range of physical addresses. This range's size must be a power
- * of two and its starting address must be a multiple of its size. Since the
- * demotion does not change any attributes of the mapping, a TLB invalidation
- * is not mandatory. The caller may, however, request a TLB invalidation.
+ * Demotes any mapping within the direct map region that covers more
+ * than the specified range of physical addresses.  This range's size
+ * must be a power of two and its starting address must be a multiple
+ * of its size, which means that any PDP page of the mapping is fully
+ * covered by the range when len > NBPDP.  Since the demotion does not
+ * change any attributes of the mapping, a TLB invalidation is not
+ * mandatory.  The caller may, however, request a TLB invalidation.
*/
void
pmap_demote_DMAP(vm_paddr_t base, vm_size_t len, bool invalidate)
@@ -9949,38 +9923,67 @@ pmap_demote_DMAP(vm_paddr_t base, vm_size_t len, bool invalidate)
pdp_entry_t *pdpe;
pd_entry_t *pde;
vm_offset_t va;
- bool changed;
+ vm_page_t m, mpte;
+ bool changed, rv __diagused;
if (len == 0)
return;
KASSERT(powerof2(len), ("pmap_demote_DMAP: len is not a power of 2"));
KASSERT((base & (len - 1)) == 0,
("pmap_demote_DMAP: base is not a multiple of len"));
+ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "pmap_demote_DMAP");
+
if (len < NBPDP && base < dmaplimit) {
va = PHYS_TO_DMAP(base);
changed = false;
+
+ /*
+		 * Assume that it is fine to sleep here; the only existing
+		 * caller of pmap_demote_DMAP() is x86_mr_split_dmap().
+ */
+ m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_WAITOK);
+ if (len < NBPDR) {
+ mpte = vm_page_alloc_noobj(VM_ALLOC_WIRED |
+ VM_ALLOC_WAITOK);
+ } else
+ mpte = NULL;
+
PMAP_LOCK(kernel_pmap);
pdpe = pmap_pdpe(kernel_pmap, va);
if ((*pdpe & X86_PG_V) == 0)
panic("pmap_demote_DMAP: invalid PDPE");
if ((*pdpe & PG_PS) != 0) {
- if (!pmap_demote_pdpe(kernel_pmap, pdpe, va))
- panic("pmap_demote_DMAP: PDPE failed");
+ rv = pmap_demote_pdpe(kernel_pmap, pdpe, va, m);
+ KASSERT(rv, ("pmap_demote_DMAP: PDPE failed"));
changed = true;
+ m = NULL;
}
if (len < NBPDR) {
pde = pmap_pdpe_to_pde(pdpe, va);
if ((*pde & X86_PG_V) == 0)
panic("pmap_demote_DMAP: invalid PDE");
if ((*pde & PG_PS) != 0) {
- if (!pmap_demote_pde(kernel_pmap, pde, va))
- panic("pmap_demote_DMAP: PDE failed");
+ mpte->pindex = pmap_pde_pindex(va);
+ pmap_pt_page_count_adj(kernel_pmap, 1);
+ rv = pmap_demote_pde_mpte(kernel_pmap, pde, va,
+ NULL, mpte);
+ KASSERT(rv, ("pmap_demote_DMAP: PDE failed"));
changed = true;
+ mpte = NULL;
}
}
if (changed && invalidate)
pmap_invalidate_page(kernel_pmap, va);
PMAP_UNLOCK(kernel_pmap);
+ if (m != NULL) {
+ vm_page_unwire_noq(m);
+ vm_page_free(m);
+ }
+ if (mpte != NULL) {
+ vm_page_unwire_noq(mpte);
+ vm_page_free(mpte);
+ }
}
}
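/*
 * Editorial sketch (not part of this diff): the pages wired above let
 * the demotions run under PMAP_LOCK() without sleeping, while callers
 * keep the original contract.  Hypothetical use, mirroring the
 * x86_mr_split_dmap() caller named in the comment above:
 *
 *	KASSERT(powerof2(len) && (base & (len - 1)) == 0,
 *	    ("misaligned range"));
 *	pmap_demote_DMAP(base, len, true);
 */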
@@ -10210,17 +10213,9 @@ pmap_activate_sw(struct thread *td)
return;
}
cpuid = PCPU_GET(cpuid);
-#ifdef SMP
CPU_SET_ATOMIC(cpuid, &pmap->pm_active);
-#else
- CPU_SET(cpuid, &pmap->pm_active);
-#endif
pmap_activate_sw_mode(td, pmap, cpuid);
-#ifdef SMP
CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active);
-#else
- CPU_CLR(cpuid, &oldpmap->pm_active);
-#endif
}
void
@@ -10261,11 +10256,7 @@ pmap_activate_boot(pmap_t pmap)
MPASS(pmap != kernel_pmap);
cpuid = PCPU_GET(cpuid);
-#ifdef SMP
CPU_SET_ATOMIC(cpuid, &pmap->pm_active);
-#else
- CPU_SET(cpuid, &pmap->pm_active);
-#endif
PCPU_SET(curpmap, pmap);
if (pti) {
kcr3 = pmap->pm_cr3;
@@ -10629,19 +10620,28 @@ pmap_large_map_getptp(void)
static pdp_entry_t *
pmap_large_map_pdpe(vm_offset_t va)
{
+ pml4_entry_t *pml4;
vm_pindex_t pml4_idx;
vm_paddr_t mphys;
- pml4_idx = pmap_pml4e_index(va);
- KASSERT(LMSPML4I <= pml4_idx && pml4_idx < LMSPML4I + lm_ents,
- ("pmap_large_map_pdpe: va %#jx out of range idx %#jx LMSPML4I "
- "%#jx lm_ents %d",
- (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents));
- KASSERT((kernel_pml4[pml4_idx] & X86_PG_V) != 0,
- ("pmap_large_map_pdpe: invalid pml4 for va %#jx idx %#jx "
- "LMSPML4I %#jx lm_ents %d",
- (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents));
- mphys = kernel_pml4[pml4_idx] & PG_FRAME;
+ KASSERT(va >= kva_layout.lm_low && va < kva_layout.lm_low +
+ (vm_offset_t)NBPML4 * lm_ents, ("va %#lx not in large map", va));
+ if (la57) {
+ pml4 = pmap_pml4e(kernel_pmap, va);
+ mphys = *pml4 & PG_FRAME;
+ } else {
+ pml4_idx = pmap_pml4e_index(va);
+
+ KASSERT(LMSPML4I <= pml4_idx && pml4_idx < LMSPML4I + lm_ents,
+ ("pmap_large_map_pdpe: va %#jx out of range idx %#jx "
+ "LMSPML4I %#jx lm_ents %d",
+ (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents));
+ KASSERT((kernel_pml4[pml4_idx] & X86_PG_V) != 0,
+ ("pmap_large_map_pdpe: invalid pml4 for va %#jx idx %#jx "
+ "LMSPML4I %#jx lm_ents %d",
+ (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents));
+ mphys = kernel_pml4[pml4_idx] & PG_FRAME;
+ }
return ((pdp_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pdpe_index(va));
}
@@ -10834,8 +10834,8 @@ pmap_large_unmap(void *svaa, vm_size_t len)
struct spglist spgf;
sva = (vm_offset_t)svaa;
- if (len == 0 || sva + len < sva || (sva >= DMAP_MIN_ADDRESS &&
- sva + len <= DMAP_MIN_ADDRESS + dmaplimit))
+ if (len == 0 || sva + len < sva || (sva >= kva_layout.dmap_low &&
+ sva + len < kva_layout.dmap_high))
return;
SLIST_INIT(&spgf);
@@ -11081,11 +11081,10 @@ pmap_large_map_wb(void *svap, vm_size_t len)
sva = (vm_offset_t)svap;
eva = sva + len;
pmap_large_map_wb_fence();
- if (sva >= DMAP_MIN_ADDRESS && eva <= DMAP_MIN_ADDRESS + dmaplimit) {
+ if (sva >= kva_layout.dmap_low && eva < kva_layout.dmap_high) {
pmap_large_map_flush_range(sva, len);
} else {
- KASSERT(sva >= LARGEMAP_MIN_ADDRESS &&
- eva <= LARGEMAP_MIN_ADDRESS + lm_ents * NBPML4,
+ KASSERT(sva >= kva_layout.lm_low && eva < kva_layout.lm_high,
("pmap_large_map_wb: not largemap %#lx %#lx", sva, len));
pmap_large_map_wb_large(sva, eva);
}
@@ -11126,8 +11125,8 @@ pmap_pti_init(void)
VM_OBJECT_WLOCK(pti_obj);
pml4_pg = pmap_pti_alloc_page();
pti_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4_pg));
- for (va = VM_MIN_KERNEL_ADDRESS; va <= VM_MAX_KERNEL_ADDRESS &&
- va >= VM_MIN_KERNEL_ADDRESS && va > NBPML4; va += NBPML4) {
+ for (va = kva_layout.km_low; va <= kva_layout.km_high &&
+ va >= kva_layout.km_low && va > NBPML4; va += NBPML4) {
pdpe = pmap_pti_pdpe(va);
pmap_pti_wire_pte(pdpe);
}
@@ -11901,9 +11900,7 @@ sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range,
mode, range->pdpes, range->pdes, range->ptes);
/* Reset to sentinel value. */
- range->sva = la57 ? KV5ADDR(NPML5EPG - 1, NPML4EPG - 1, NPDPEPG - 1,
- NPDEPG - 1, NPTEPG - 1) : KV4ADDR(NPML4EPG - 1, NPDPEPG - 1,
- NPDEPG - 1, NPTEPG - 1);
+ range->sva = kva_layout.kva_max;
}
/*
@@ -11944,12 +11941,18 @@ sysctl_kmaps_reinit(struct pmap_kernel_map_range *range, vm_offset_t va,
*/
static void
sysctl_kmaps_check(struct sbuf *sb, struct pmap_kernel_map_range *range,
- vm_offset_t va, pml4_entry_t pml4e, pdp_entry_t pdpe, pd_entry_t pde,
- pt_entry_t pte)
+ vm_offset_t va, pml5_entry_t pml5e, pml4_entry_t pml4e, pdp_entry_t pdpe,
+ pd_entry_t pde, pt_entry_t pte)
{
pt_entry_t attrs;
- attrs = pml4e & (X86_PG_RW | X86_PG_U | pg_nx);
+ if (la57) {
+ attrs = pml5e & (X86_PG_RW | X86_PG_U | pg_nx);
+ attrs |= pml4e & pg_nx;
+ attrs &= pg_nx | (pml4e & (X86_PG_RW | X86_PG_U));
+ } else {
+ attrs = pml4e & (X86_PG_RW | X86_PG_U | pg_nx);
+ }
attrs |= pdpe & pg_nx;
attrs &= pg_nx | (pdpe & (X86_PG_RW | X86_PG_U));
@@ -11982,13 +11985,15 @@ sysctl_kmaps(SYSCTL_HANDLER_ARGS)
{
struct pmap_kernel_map_range range;
struct sbuf sbuf, *sb;
+ pml5_entry_t pml5e;
pml4_entry_t pml4e;
pdp_entry_t *pdp, pdpe;
pd_entry_t *pd, pde;
pt_entry_t *pt, pte;
vm_offset_t sva;
vm_paddr_t pa;
- int error, i, j, k, l;
+ int error, j, k, l;
+ bool first;
error = sysctl_wire_old_buffer(req, 0);
if (error != 0)
@@ -11997,9 +12002,8 @@ sysctl_kmaps(SYSCTL_HANDLER_ARGS)
sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req);
/* Sentinel value. */
- range.sva = la57 ? KV5ADDR(NPML5EPG - 1, NPML4EPG - 1, NPDPEPG - 1,
- NPDEPG - 1, NPTEPG - 1) : KV4ADDR(NPML4EPG - 1, NPDPEPG - 1,
- NPDEPG - 1, NPTEPG - 1);
+ range.sva = kva_layout.kva_max;
+ pml5e = 0; /* no UB for la48 */
/*
* Iterate over the kernel page tables without holding the kernel pmap
@@ -12008,41 +12012,50 @@ sysctl_kmaps(SYSCTL_HANDLER_ARGS)
* Within the large map, ensure that PDP and PD page addresses are
* valid before descending.
*/
- for (sva = 0, i = pmap_pml4e_index(sva); i < NPML4EPG; i++) {
- switch (i) {
- case PML4PML4I:
+ for (first = true, sva = 0; sva != 0 || first; first = false) {
+ if (sva == kva_layout.rec_pt)
sbuf_printf(sb, "\nRecursive map:\n");
- break;
- case DMPML4I:
+ else if (sva == kva_layout.dmap_low)
sbuf_printf(sb, "\nDirect map:\n");
- break;
#ifdef KASAN
- case KASANPML4I:
+ else if (sva == kva_layout.kasan_shadow_low)
sbuf_printf(sb, "\nKASAN shadow map:\n");
- break;
#endif
#ifdef KMSAN
- case KMSANSHADPML4I:
+ else if (sva == kva_layout.kmsan_shadow_low)
sbuf_printf(sb, "\nKMSAN shadow map:\n");
- break;
- case KMSANORIGPML4I:
+ else if (sva == kva_layout.kmsan_origin_low)
sbuf_printf(sb, "\nKMSAN origin map:\n");
- break;
#endif
- case KPML4BASE:
+ else if (sva == kva_layout.km_low)
sbuf_printf(sb, "\nKernel map:\n");
- break;
- case LMSPML4I:
+ else if (sva == kva_layout.lm_low)
sbuf_printf(sb, "\nLarge map:\n");
- break;
- }
/* Convert to canonical form. */
- if (sva == 1ul << 47)
- sva |= -1ul << 48;
+ if (la57) {
+ if (sva == 1ul << 56) {
+ sva |= -1ul << 57;
+ continue;
+ }
+ } else {
+ if (sva == 1ul << 47) {
+ sva |= -1ul << 48;
+ continue;
+ }
+ }
restart:
- pml4e = kernel_pml4[i];
+ if (la57) {
+ pml5e = *pmap_pml5e(kernel_pmap, sva);
+ if ((pml5e & X86_PG_V) == 0) {
+ sva = rounddown2(sva, NBPML5);
+ sysctl_kmaps_dump(sb, &range, sva);
+ sva += NBPML5;
+ continue;
+ }
+ }
+ pml4e = *pmap_pml4e(kernel_pmap, sva);
if ((pml4e & X86_PG_V) == 0) {
sva = rounddown2(sva, NBPML4);
sysctl_kmaps_dump(sb, &range, sva);
@@ -12063,8 +12076,8 @@ restart:
pa = pdpe & PG_FRAME;
if ((pdpe & PG_PS) != 0) {
sva = rounddown2(sva, NBPDP);
- sysctl_kmaps_check(sb, &range, sva, pml4e, pdpe,
- 0, 0);
+ sysctl_kmaps_check(sb, &range, sva, pml5e,
+ pml4e, pdpe, 0, 0);
range.pdpes++;
sva += NBPDP;
continue;
@@ -12076,6 +12089,7 @@ restart:
* freed. Validate the next-level address
* before descending.
*/
+ sva += NBPDP;
goto restart;
}
pd = (pd_entry_t *)PHYS_TO_DMAP(pa);
@@ -12092,7 +12106,7 @@ restart:
if ((pde & PG_PS) != 0) {
sva = rounddown2(sva, NBPDR);
sysctl_kmaps_check(sb, &range, sva,
- pml4e, pdpe, pde, 0);
+ pml5e, pml4e, pdpe, pde, 0);
range.pdes++;
sva += NBPDR;
continue;
@@ -12104,6 +12118,7 @@ restart:
* may be freed. Validate the
* next-level address before descending.
*/
+ sva += NBPDR;
goto restart;
}
pt = (pt_entry_t *)PHYS_TO_DMAP(pa);
@@ -12117,7 +12132,7 @@ restart:
continue;
}
sysctl_kmaps_check(sb, &range, sva,
- pml4e, pdpe, pde, pte);
+ pml5e, pml4e, pdpe, pde, pte);
range.ptes++;
}
}
diff --git a/sys/amd64/amd64/support.S b/sys/amd64/amd64/support.S
index c95696bbe7ef..870cd255abb7 100644
--- a/sys/amd64/amd64/support.S
+++ b/sys/amd64/amd64/support.S
@@ -934,10 +934,7 @@ ENTRY(casueword32_nosmap)
ja fusufault
movl %esi,%eax /* old */
-#ifdef SMP
- lock
-#endif
- cmpxchgl %ecx,(%rdi) /* new = %ecx */
+ lock cmpxchgl %ecx,(%rdi) /* new = %ecx */
setne %cl
/*
@@ -971,10 +968,7 @@ ENTRY(casueword32_smap)
movl %esi,%eax /* old */
stac
-#ifdef SMP
- lock
-#endif
- cmpxchgl %ecx,(%rdi) /* new = %ecx */
+ lock cmpxchgl %ecx,(%rdi) /* new = %ecx */
clac
setne %cl
@@ -1014,10 +1008,7 @@ ENTRY(casueword_nosmap)
ja fusufault
movq %rsi,%rax /* old */
-#ifdef SMP
- lock
-#endif
- cmpxchgq %rcx,(%rdi) /* new = %rcx */
+ lock cmpxchgq %rcx,(%rdi) /* new = %rcx */
setne %cl
/*
@@ -1045,10 +1036,7 @@ ENTRY(casueword_smap)
movq %rsi,%rax /* old */
stac
-#ifdef SMP
- lock
-#endif
- cmpxchgq %rcx,(%rdi) /* new = %rcx */
+ lock cmpxchgq %rcx,(%rdi) /* new = %rcx */
clac
setne %cl
diff --git a/sys/amd64/amd64/trap.c b/sys/amd64/amd64/trap.c
index 09ac0a67dbef..f3469ed5e2bc 100644
--- a/sys/amd64/amd64/trap.c
+++ b/sys/amd64/amd64/trap.c
@@ -37,7 +37,6 @@
* SUCH DAMAGE.
*/
-#include <sys/cdefs.h>
/*
* AMD64 Trap and System call handling
*/
@@ -87,9 +86,7 @@ PMC_SOFT_DEFINE( , , page_fault, write);
#include <x86/mca.h>
#include <machine/md_var.h>
#include <machine/pcb.h>
-#ifdef SMP
#include <machine/smp.h>
-#endif
#include <machine/stack.h>
#include <machine/trap.h>
#include <machine/tss.h>
@@ -769,7 +766,7 @@ trap_pfault(struct trapframe *frame, bool usermode, int *signo, int *ucode)
return (-1);
}
}
- if (eva >= VM_MIN_KERNEL_ADDRESS) {
+ if (eva >= kva_layout.km_low) {
/*
* Don't allow user-mode faults in kernel address space.
*/
@@ -900,11 +897,9 @@ trap_diag(struct trapframe *frame, vm_offset_t eva)
printf("\n\nFatal trap %d: %s while in %s mode\n", type,
type < nitems(trap_msg) ? trap_msg[type] : UNKNOWN,
TRAPF_USERMODE(frame) ? "user" : "kernel");
-#ifdef SMP
- /* two separate prints in case of a trap on an unmapped page */
- printf("cpuid = %d; ", PCPU_GET(cpuid));
- printf("apic id = %02x\n", PCPU_GET(apic_id));
-#endif
+ /* Print these separately in case pcpu accesses trap. */
+ printf("cpuid = %d; apic id = %02x\n", PCPU_GET(cpuid),
+ PCPU_GET(apic_id));
if (type == T_PAGEFLT) {
printf("fault virtual address = 0x%lx\n", eva);
printf("fault code = %s %s %s%s%s, %s\n",
@@ -1025,11 +1020,9 @@ dblfault_handler(struct trapframe *frame)
frame->tf_cs, frame->tf_ss, frame->tf_ds, frame->tf_es,
frame->tf_fs, frame->tf_gs,
rdmsr(MSR_FSBASE), rdmsr(MSR_GSBASE), rdmsr(MSR_KGSBASE));
-#ifdef SMP
- /* two separate prints in case of a trap on an unmapped page */
- printf("cpuid = %d; ", PCPU_GET(cpuid));
- printf("apic id = %02x\n", PCPU_GET(apic_id));
-#endif
+ /* Print these separately in case pcpu accesses trap. */
+ printf("cpuid = %d; apic id = %02x\n", PCPU_GET(cpuid),
+ PCPU_GET(apic_id));
panic("double fault");
}
diff --git a/sys/amd64/conf/MINIMALUP b/sys/amd64/conf/MINIMALUP
deleted file mode 100644
index 0dbddbe5b341..000000000000
--- a/sys/amd64/conf/MINIMALUP
+++ /dev/null
@@ -1,4 +0,0 @@
-include MINIMAL
-ident MINIMALUP
-nooptions SMP
-nooptions NUMA
diff --git a/sys/amd64/include/efi.h b/sys/amd64/include/efi.h
index b47c4aa27ac7..439f2f0b317d 100644
--- a/sys/amd64/include/efi.h
+++ b/sys/amd64/include/efi.h
@@ -53,6 +53,10 @@
#define EFI_TIME_OWNED() mtx_assert(&atrtc_time_lock, MA_OWNED)
#define EFI_RT_HANDLE_FAULTS_DEFAULT 1
+
+#define EFI_MAP_BOOTTYPE_ALLOWED(type) (((efi_map_regs >> (type)) & 1) != 0)
+
+extern uint32_t efi_map_regs;
#endif
struct efirt_callinfo {
diff --git a/sys/amd64/include/param.h b/sys/amd64/include/param.h
index 8db314fa034d..5a9c3162e14c 100644
--- a/sys/amd64/include/param.h
+++ b/sys/amd64/include/param.h
@@ -146,11 +146,10 @@
#define amd64_btop(x) ((unsigned long)(x) >> PAGE_SHIFT)
#define amd64_ptob(x) ((unsigned long)(x) << PAGE_SHIFT)
-#define INKERNEL(va) (((va) >= DMAP_MIN_ADDRESS && (va) < DMAP_MAX_ADDRESS) \
- || ((va) >= VM_MIN_KERNEL_ADDRESS && (va) < VM_MAX_KERNEL_ADDRESS))
+#define INKERNEL(va) \
+ (((va) >= kva_layout.dmap_low && (va) < kva_layout.dmap_high) || \
+ ((va) >= kva_layout.km_low && (va) < kva_layout.km_high))
-#ifdef SMP
#define SC_TABLESIZE 1024 /* Must be power of 2. */
-#endif
#endif /* !_AMD64_INCLUDE_PARAM_H_ */
diff --git a/sys/amd64/include/pmap.h b/sys/amd64/include/pmap.h
index 7d3e91bcd9b9..e2f97442c10f 100644
--- a/sys/amd64/include/pmap.h
+++ b/sys/amd64/include/pmap.h
@@ -169,11 +169,12 @@
* the recursive page table map.
*/
#define NDMPML4E 8
+#define NDMPML5E 32
/*
- * These values control the layout of virtual memory. The starting address
- * of the direct map, which is controlled by DMPML4I, must be a multiple of
- * its size. (See the PHYS_TO_DMAP() and DMAP_TO_PHYS() macros.)
+ * These values control the layout of virtual memory. The starting
+ * address of the direct map is controlled by DMPML4I on LA48 and
+ * DMPML5I on LA57.
*
* Note: KPML4I is the index of the (single) level 4 page that maps
* the KVA that holds KERNBASE, while KPML4BASE is the index of the
@@ -191,6 +192,7 @@
#define KPML4BASE (NPML4EPG-NKPML4E) /* KVM at highest addresses */
#define DMPML4I rounddown(KPML4BASE-NDMPML4E, NDMPML4E) /* Below KVM */
+#define DMPML5I (NPML5EPG / 2 + 1)
#define KPML4I (NPML4EPG-1)
#define KPDPI (NPDPEPG-2) /* kernbase at -2GB */
@@ -200,9 +202,14 @@
#define KMSANSHADPML4I (KPML4BASE - NKMSANSHADPML4E)
#define KMSANORIGPML4I (DMPML4I - NKMSANORIGPML4E)
-/* Large map: index of the first and max last pml4 entry */
+/*
+ * Large map: index of the first and max last pml4/la48 and pml5/la57
+ * entry.
+ */
#define LMSPML4I (PML4PML4I + 1)
#define LMEPML4I (KASANPML4I - 1)
+#define LMSPML5I (DMPML5I + NDMPML5E)
+#define LMEPML5I (LMSPML5I + 32 - 1) /* 32 slots for large map */
/*
* XXX doesn't really belong here I guess...
@@ -548,6 +555,25 @@ pmap_pml5e_index(vm_offset_t va)
return ((va >> PML5SHIFT) & ((1ul << NPML5EPGSHIFT) - 1));
}
+struct kva_layout_s {
+ vm_offset_t kva_min;
+ vm_offset_t kva_max;
+ vm_offset_t dmap_low; /* DMAP_MIN_ADDRESS */
+ vm_offset_t dmap_high; /* DMAP_MAX_ADDRESS */
+ vm_offset_t lm_low; /* LARGEMAP_MIN_ADDRESS */
+ vm_offset_t lm_high; /* LARGEMAP_MAX_ADDRESS */
+ vm_offset_t km_low; /* VM_MIN_KERNEL_ADDRESS */
+ vm_offset_t km_high; /* VM_MAX_KERNEL_ADDRESS */
+ vm_offset_t rec_pt;
+ vm_offset_t kasan_shadow_low; /* KASAN_MIN_ADDRESS */
+ vm_offset_t kasan_shadow_high; /* KASAN_MAX_ADDRESS */
+ vm_offset_t kmsan_shadow_low; /* KMSAN_SHAD_MIN_ADDRESS */
+ vm_offset_t kmsan_shadow_high; /* KMSAN_SHAD_MAX_ADDRESS */
+ vm_offset_t kmsan_origin_low; /* KMSAN_ORIG_MIN_ADDRESS */
+ vm_offset_t kmsan_origin_high; /* KMSAN_ORIG_MAX_ADDRESS */
+};
+extern struct kva_layout_s kva_layout;
+
#endif /* !LOCORE */
#endif /* !_MACHINE_PMAP_H_ */
diff --git a/sys/amd64/include/smp.h b/sys/amd64/include/smp.h
index 26eb227211da..bff92570ff82 100644
--- a/sys/amd64/include/smp.h
+++ b/sys/amd64/include/smp.h
@@ -13,8 +13,6 @@
#ifdef _KERNEL
-#ifdef SMP
-
#ifndef LOCORE
#include <x86/x86_smp.h>
@@ -39,7 +37,6 @@ void invlop_handler(void);
int start_all_aps(void);
#endif /* !LOCORE */
-#endif /* SMP */
#endif /* _KERNEL */
#endif /* _MACHINE_SMP_H_ */
diff --git a/sys/amd64/include/vmparam.h b/sys/amd64/include/vmparam.h
index 0cd9bb4fa7a4..d2ac3c6648b2 100644
--- a/sys/amd64/include/vmparam.h
+++ b/sys/amd64/include/vmparam.h
@@ -163,6 +163,7 @@
* Virtual addresses of things. Derived from the page directory and
* page table indexes from pmap.h for precision.
*
+ * LA48:
* 0x0000000000000000 - 0x00007fffffffffff user map
* 0x0000800000000000 - 0xffff7fffffffffff does not exist (hole)
* 0xffff800000000000 - 0xffff804020100fff recursive page table (512GB slot)
@@ -175,32 +176,38 @@
* 0xfffffc0000000000 - 0xfffffdffffffffff 2TB KMSAN shadow map, optional
* 0xfffffe0000000000 - 0xffffffffffffffff 2TB kernel map
*
+ * LA57:
+ * 0x0000000000000000 - 0x00ffffffffffffff user map
+ * 0x0100000000000000 - 0xf0ffffffffffffff does not exist (hole)
+ * 0xff00000000000000 - 0xff00ffffffffffff recursive page table (2048TB slot)
+ * 0xff01000000000000 - 0xff20ffffffffffff direct map (32 x 2048TB slots)
+ * 0xff21000000000000 - 0xff40ffffffffffff large map
+ * 0xff41000000000000 - 0xffff7fffffffffff unused
+ * 0xffff800000000000 - 0xfffff5ffffffffff unused (start of kernel pml4 entry)
+ * 0xfffff60000000000 - 0xfffff7ffffffffff 2TB KMSAN origin map, optional
+ * 0xfffff78000000000 - 0xfffff7bfffffffff 512GB KASAN shadow map, optional
+ * 0xfffff80000000000 - 0xfffffbffffffffff 4TB unused
+ * 0xfffffc0000000000 - 0xfffffdffffffffff 2TB KMSAN shadow map, optional
+ * 0xfffffe0000000000 - 0xffffffffffffffff 2TB kernel map
+ *
* Within the kernel map:
*
* 0xfffffe0000000000 vm_page_array
* 0xffffffff80000000 KERNBASE
*/
-#define VM_MIN_KERNEL_ADDRESS KV4ADDR(KPML4BASE, 0, 0, 0)
-#define VM_MAX_KERNEL_ADDRESS KV4ADDR(KPML4BASE + NKPML4E - 1, \
- NPDPEPG-1, NPDEPG-1, NPTEPG-1)
-
-#define DMAP_MIN_ADDRESS KV4ADDR(DMPML4I, 0, 0, 0)
-#define DMAP_MAX_ADDRESS KV4ADDR(DMPML4I + NDMPML4E, 0, 0, 0)
-
-#define KASAN_MIN_ADDRESS KV4ADDR(KASANPML4I, 0, 0, 0)
-#define KASAN_MAX_ADDRESS KV4ADDR(KASANPML4I + NKASANPML4E, 0, 0, 0)
+#define VM_MIN_KERNEL_ADDRESS_LA48 KV4ADDR(KPML4BASE, 0, 0, 0)
+#define VM_MIN_KERNEL_ADDRESS kva_layout.km_low
+#define VM_MAX_KERNEL_ADDRESS kva_layout.km_high
-#define KMSAN_SHAD_MIN_ADDRESS KV4ADDR(KMSANSHADPML4I, 0, 0, 0)
-#define KMSAN_SHAD_MAX_ADDRESS KV4ADDR(KMSANSHADPML4I + NKMSANSHADPML4E, \
- 0, 0, 0)
+#define KASAN_MIN_ADDRESS (kva_layout.kasan_shadow_low)
+#define KASAN_MAX_ADDRESS (kva_layout.kasan_shadow_high)
-#define KMSAN_ORIG_MIN_ADDRESS KV4ADDR(KMSANORIGPML4I, 0, 0, 0)
-#define KMSAN_ORIG_MAX_ADDRESS KV4ADDR(KMSANORIGPML4I + NKMSANORIGPML4E, \
- 0, 0, 0)
+#define KMSAN_SHAD_MIN_ADDRESS (kva_layout.kmsan_shadow_low)
+#define KMSAN_SHAD_MAX_ADDRESS (kva_layout.kmsan_shadow_high)
-#define LARGEMAP_MIN_ADDRESS KV4ADDR(LMSPML4I, 0, 0, 0)
-#define LARGEMAP_MAX_ADDRESS KV4ADDR(LMEPML4I + 1, 0, 0, 0)
+#define KMSAN_ORIG_MIN_ADDRESS (kva_layout.kmsan_origin_low)
+#define KMSAN_ORIG_MAX_ADDRESS (kva_layout.kmsan_origin_high)
/*
* Formally kernel mapping starts at KERNBASE, but kernel linker
@@ -239,21 +246,21 @@
* vt fb startup needs to be reworked.
*/
#define PHYS_IN_DMAP(pa) (dmaplimit == 0 || (pa) < dmaplimit)
-#define VIRT_IN_DMAP(va) ((va) >= DMAP_MIN_ADDRESS && \
- (va) < (DMAP_MIN_ADDRESS + dmaplimit))
+#define VIRT_IN_DMAP(va) \
+ ((va) >= kva_layout.dmap_low && (va) < kva_layout.dmap_low + dmaplimit)
#define PMAP_HAS_DMAP 1
-#define PHYS_TO_DMAP(x) ({ \
+#define PHYS_TO_DMAP(x) __extension__ ({ \
KASSERT(PHYS_IN_DMAP(x), \
("physical address %#jx not covered by the DMAP", \
(uintmax_t)x)); \
- (x) | DMAP_MIN_ADDRESS; })
+ (x) + kva_layout.dmap_low; })
-#define DMAP_TO_PHYS(x) ({ \
+#define DMAP_TO_PHYS(x) __extension__ ({ \
KASSERT(VIRT_IN_DMAP(x), \
("virtual address %#jx not covered by the DMAP", \
(uintmax_t)x)); \
- (x) & ~DMAP_MIN_ADDRESS; })
+ (x) - kva_layout.dmap_low; })
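/*
 * Editorial note (not part of this diff): the old forms could use
 * bitwise OR and AND-NOT because DMAP_MIN_ADDRESS was a compile-time
 * constant aligned to the DMAP size, making the operations equivalent
 * to addition and subtraction:
 *
 *	pa | DMAP_MIN_ADDRESS == pa + DMAP_MIN_ADDRESS	(pa < DMAP size)
 *
 * With the base now chosen at boot (kva_layout.dmap_low differs
 * between LA48 and LA57), only the arithmetic forms are correct in
 * general.
 */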
/*
* amd64 maps the page array into KVA so that it can be more easily
@@ -274,7 +281,7 @@
*/
#ifndef VM_KMEM_SIZE_MAX
#define VM_KMEM_SIZE_MAX ((VM_MAX_KERNEL_ADDRESS - \
- VM_MIN_KERNEL_ADDRESS + 1) * 3 / 5)
+ kva_layout.km_low + 1) * 3 / 5)
#endif
/* initial pagein size of beginning of executable file */
diff --git a/sys/amd64/linux/linux_proto.h b/sys/amd64/linux/linux_proto.h
index 15e1dfc1a444..f1d9c96a78d7 100644
--- a/sys/amd64/linux/linux_proto.h
+++ b/sys/amd64/linux/linux_proto.h
@@ -914,10 +914,13 @@ struct linux_inotify_init_args {
syscallarg_t dummy;
};
struct linux_inotify_add_watch_args {
- syscallarg_t dummy;
+ char fd_l_[PADL_(l_int)]; l_int fd; char fd_r_[PADR_(l_int)];
+ char pathname_l_[PADL_(const char *)]; const char * pathname; char pathname_r_[PADR_(const char *)];
+ char mask_l_[PADL_(uint32_t)]; uint32_t mask; char mask_r_[PADR_(uint32_t)];
};
struct linux_inotify_rm_watch_args {
- syscallarg_t dummy;
+ char fd_l_[PADL_(l_int)]; l_int fd; char fd_r_[PADR_(l_int)];
+ char wd_l_[PADL_(uint32_t)]; uint32_t wd; char wd_r_[PADR_(uint32_t)];
};
struct linux_migrate_pages_args {
syscallarg_t dummy;
diff --git a/sys/amd64/linux/linux_sysent.c b/sys/amd64/linux/linux_sysent.c
index 8413d2723551..62b50cf68a32 100644
--- a/sys/amd64/linux/linux_sysent.c
+++ b/sys/amd64/linux/linux_sysent.c
@@ -268,8 +268,8 @@ struct sysent linux_sysent[] = {
{ .sy_narg = AS(linux_ioprio_set_args), .sy_call = (sy_call_t *)linux_ioprio_set, .sy_auevent = AUE_SETPRIORITY, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 251 = linux_ioprio_set */
{ .sy_narg = AS(linux_ioprio_get_args), .sy_call = (sy_call_t *)linux_ioprio_get, .sy_auevent = AUE_GETPRIORITY, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 252 = linux_ioprio_get */
{ .sy_narg = 0, .sy_call = (sy_call_t *)linux_inotify_init, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 253 = linux_inotify_init */
- { .sy_narg = 0, .sy_call = (sy_call_t *)linux_inotify_add_watch, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 254 = linux_inotify_add_watch */
- { .sy_narg = 0, .sy_call = (sy_call_t *)linux_inotify_rm_watch, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 255 = linux_inotify_rm_watch */
+ { .sy_narg = AS(linux_inotify_add_watch_args), .sy_call = (sy_call_t *)linux_inotify_add_watch, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 254 = linux_inotify_add_watch */
+ { .sy_narg = AS(linux_inotify_rm_watch_args), .sy_call = (sy_call_t *)linux_inotify_rm_watch, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 255 = linux_inotify_rm_watch */
{ .sy_narg = 0, .sy_call = (sy_call_t *)linux_migrate_pages, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 256 = linux_migrate_pages */
{ .sy_narg = AS(linux_openat_args), .sy_call = (sy_call_t *)linux_openat, .sy_auevent = AUE_OPEN_RWTC, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 257 = linux_openat */
{ .sy_narg = AS(linux_mkdirat_args), .sy_call = (sy_call_t *)linux_mkdirat, .sy_auevent = AUE_MKDIRAT, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 258 = linux_mkdirat */
diff --git a/sys/amd64/linux/linux_systrace_args.c b/sys/amd64/linux/linux_systrace_args.c
index 20322f7a8660..1dc4de019080 100644
--- a/sys/amd64/linux/linux_systrace_args.c
+++ b/sys/amd64/linux/linux_systrace_args.c
@@ -1918,12 +1918,19 @@ systrace_args(int sysnum, void *params, uint64_t *uarg, int *n_args)
}
/* linux_inotify_add_watch */
case 254: {
- *n_args = 0;
+ struct linux_inotify_add_watch_args *p = params;
+ iarg[a++] = p->fd; /* l_int */
+ uarg[a++] = (intptr_t)p->pathname; /* const char * */
+ uarg[a++] = p->mask; /* uint32_t */
+ *n_args = 3;
break;
}
/* linux_inotify_rm_watch */
case 255: {
- *n_args = 0;
+ struct linux_inotify_rm_watch_args *p = params;
+ iarg[a++] = p->fd; /* l_int */
+ uarg[a++] = p->wd; /* uint32_t */
+ *n_args = 2;
break;
}
/* linux_migrate_pages */
@@ -5860,9 +5867,32 @@ systrace_entry_setargdesc(int sysnum, int ndx, char *desc, size_t descsz)
break;
/* linux_inotify_add_watch */
case 254:
+ switch (ndx) {
+ case 0:
+ p = "l_int";
+ break;
+ case 1:
+ p = "userland const char *";
+ break;
+ case 2:
+ p = "uint32_t";
+ break;
+ default:
+ break;
+ };
break;
/* linux_inotify_rm_watch */
case 255:
+ switch (ndx) {
+ case 0:
+ p = "l_int";
+ break;
+ case 1:
+ p = "uint32_t";
+ break;
+ default:
+ break;
+ };
break;
/* linux_migrate_pages */
case 256:
@@ -8353,8 +8383,14 @@ systrace_return_setargdesc(int sysnum, int ndx, char *desc, size_t descsz)
case 253:
/* linux_inotify_add_watch */
case 254:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
/* linux_inotify_rm_watch */
case 255:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
/* linux_migrate_pages */
case 256:
/* linux_openat */
diff --git a/sys/amd64/linux/syscalls.master b/sys/amd64/linux/syscalls.master
index fd08c9b0279d..5e1394751ef6 100644
--- a/sys/amd64/linux/syscalls.master
+++ b/sys/amd64/linux/syscalls.master
@@ -1476,10 +1476,17 @@
int linux_inotify_init(void);
}
254 AUE_NULL STD {
- int linux_inotify_add_watch(void);
+ int linux_inotify_add_watch(
+ l_int fd,
+ const char *pathname,
+ uint32_t mask
+ );
}
255 AUE_NULL STD {
- int linux_inotify_rm_watch(void);
+ int linux_inotify_rm_watch(
+ l_int fd,
+ uint32_t wd
+ );
}
256 AUE_NULL STD {
int linux_migrate_pages(void);
diff --git a/sys/amd64/linux32/linux32_proto.h b/sys/amd64/linux32/linux32_proto.h
index ab0edd99df42..57a303271f1c 100644
--- a/sys/amd64/linux32/linux32_proto.h
+++ b/sys/amd64/linux32/linux32_proto.h
@@ -983,10 +983,13 @@ struct linux_inotify_init_args {
syscallarg_t dummy;
};
struct linux_inotify_add_watch_args {
- syscallarg_t dummy;
+ char fd_l_[PADL_(l_int)]; l_int fd; char fd_r_[PADR_(l_int)];
+ char pathname_l_[PADL_(const char *)]; const char * pathname; char pathname_r_[PADR_(const char *)];
+ char mask_l_[PADL_(uint32_t)]; uint32_t mask; char mask_r_[PADR_(uint32_t)];
};
struct linux_inotify_rm_watch_args {
- syscallarg_t dummy;
+ char fd_l_[PADL_(l_int)]; l_int fd; char fd_r_[PADR_(l_int)];
+ char wd_l_[PADL_(uint32_t)]; uint32_t wd; char wd_r_[PADR_(uint32_t)];
};
struct linux_migrate_pages_args {
syscallarg_t dummy;
@@ -1184,7 +1187,7 @@ struct linux_pipe2_args {
char flags_l_[PADL_(l_int)]; l_int flags; char flags_r_[PADR_(l_int)];
};
struct linux_inotify_init1_args {
- syscallarg_t dummy;
+ char flags_l_[PADL_(l_int)]; l_int flags; char flags_r_[PADR_(l_int)];
};
struct linux_preadv_args {
char fd_l_[PADL_(l_ulong)]; l_ulong fd; char fd_r_[PADR_(l_ulong)];
diff --git a/sys/amd64/linux32/linux32_sysent.c b/sys/amd64/linux32/linux32_sysent.c
index add9844254ce..1bc8841badf3 100644
--- a/sys/amd64/linux32/linux32_sysent.c
+++ b/sys/amd64/linux32/linux32_sysent.c
@@ -307,8 +307,8 @@ struct sysent linux32_sysent[] = {
{ .sy_narg = AS(linux_ioprio_set_args), .sy_call = (sy_call_t *)linux_ioprio_set, .sy_auevent = AUE_SETPRIORITY, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 289 = linux_ioprio_set */
{ .sy_narg = AS(linux_ioprio_get_args), .sy_call = (sy_call_t *)linux_ioprio_get, .sy_auevent = AUE_GETPRIORITY, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 290 = linux_ioprio_get */
{ .sy_narg = 0, .sy_call = (sy_call_t *)linux_inotify_init, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 291 = linux_inotify_init */
- { .sy_narg = 0, .sy_call = (sy_call_t *)linux_inotify_add_watch, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 292 = linux_inotify_add_watch */
- { .sy_narg = 0, .sy_call = (sy_call_t *)linux_inotify_rm_watch, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 293 = linux_inotify_rm_watch */
+ { .sy_narg = AS(linux_inotify_add_watch_args), .sy_call = (sy_call_t *)linux_inotify_add_watch, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 292 = linux_inotify_add_watch */
+ { .sy_narg = AS(linux_inotify_rm_watch_args), .sy_call = (sy_call_t *)linux_inotify_rm_watch, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 293 = linux_inotify_rm_watch */
{ .sy_narg = 0, .sy_call = (sy_call_t *)linux_migrate_pages, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 294 = linux_migrate_pages */
{ .sy_narg = AS(linux_openat_args), .sy_call = (sy_call_t *)linux_openat, .sy_auevent = AUE_OPEN_RWTC, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 295 = linux_openat */
{ .sy_narg = AS(linux_mkdirat_args), .sy_call = (sy_call_t *)linux_mkdirat, .sy_auevent = AUE_MKDIRAT, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 296 = linux_mkdirat */
@@ -347,7 +347,7 @@ struct sysent linux32_sysent[] = {
{ .sy_narg = AS(linux_epoll_create1_args), .sy_call = (sy_call_t *)linux_epoll_create1, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 329 = linux_epoll_create1 */
{ .sy_narg = AS(linux_dup3_args), .sy_call = (sy_call_t *)linux_dup3, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 330 = linux_dup3 */
{ .sy_narg = AS(linux_pipe2_args), .sy_call = (sy_call_t *)linux_pipe2, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 331 = linux_pipe2 */
- { .sy_narg = 0, .sy_call = (sy_call_t *)linux_inotify_init1, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 332 = linux_inotify_init1 */
+ { .sy_narg = AS(linux_inotify_init1_args), .sy_call = (sy_call_t *)linux_inotify_init1, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 332 = linux_inotify_init1 */
{ .sy_narg = AS(linux_preadv_args), .sy_call = (sy_call_t *)linux_preadv, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 333 = linux_preadv */
{ .sy_narg = AS(linux_pwritev_args), .sy_call = (sy_call_t *)linux_pwritev, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 334 = linux_pwritev */
{ .sy_narg = AS(linux_rt_tgsigqueueinfo_args), .sy_call = (sy_call_t *)linux_rt_tgsigqueueinfo, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 335 = linux_rt_tgsigqueueinfo */
diff --git a/sys/amd64/linux32/linux32_systrace_args.c b/sys/amd64/linux32/linux32_systrace_args.c
index 7793124e6935..cbd1641c2a34 100644
--- a/sys/amd64/linux32/linux32_systrace_args.c
+++ b/sys/amd64/linux32/linux32_systrace_args.c
@@ -2036,12 +2036,19 @@ systrace_args(int sysnum, void *params, uint64_t *uarg, int *n_args)
}
/* linux_inotify_add_watch */
case 292: {
- *n_args = 0;
+ struct linux_inotify_add_watch_args *p = params;
+ iarg[a++] = p->fd; /* l_int */
+ uarg[a++] = (intptr_t)p->pathname; /* const char * */
+ uarg[a++] = p->mask; /* uint32_t */
+ *n_args = 3;
break;
}
/* linux_inotify_rm_watch */
case 293: {
- *n_args = 0;
+ struct linux_inotify_rm_watch_args *p = params;
+ iarg[a++] = p->fd; /* l_int */
+ uarg[a++] = p->wd; /* uint32_t */
+ *n_args = 2;
break;
}
/* linux_migrate_pages */
@@ -2379,7 +2386,9 @@ systrace_args(int sysnum, void *params, uint64_t *uarg, int *n_args)
}
/* linux_inotify_init1 */
case 332: {
- *n_args = 0;
+ struct linux_inotify_init1_args *p = params;
+ iarg[a++] = p->flags; /* l_int */
+ *n_args = 1;
break;
}
/* linux_preadv */
@@ -6536,9 +6545,32 @@ systrace_entry_setargdesc(int sysnum, int ndx, char *desc, size_t descsz)
break;
/* linux_inotify_add_watch */
case 292:
+ switch (ndx) {
+ case 0:
+ p = "l_int";
+ break;
+ case 1:
+ p = "userland const char *";
+ break;
+ case 2:
+ p = "uint32_t";
+ break;
+ default:
+ break;
+ };
break;
/* linux_inotify_rm_watch */
case 293:
+ switch (ndx) {
+ case 0:
+ p = "l_int";
+ break;
+ case 1:
+ p = "uint32_t";
+ break;
+ default:
+ break;
+ };
break;
/* linux_migrate_pages */
case 294:
@@ -7116,6 +7148,13 @@ systrace_entry_setargdesc(int sysnum, int ndx, char *desc, size_t descsz)
break;
/* linux_inotify_init1 */
case 332:
+ switch (ndx) {
+ case 0:
+ p = "l_int";
+ break;
+ default:
+ break;
+ };
break;
/* linux_preadv */
case 333:
@@ -9809,8 +9848,14 @@ systrace_return_setargdesc(int sysnum, int ndx, char *desc, size_t descsz)
case 291:
/* linux_inotify_add_watch */
case 292:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
/* linux_inotify_rm_watch */
case 293:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
/* linux_migrate_pages */
case 294:
/* linux_openat */
@@ -9982,6 +10027,9 @@ systrace_return_setargdesc(int sysnum, int ndx, char *desc, size_t descsz)
break;
/* linux_inotify_init1 */
case 332:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
/* linux_preadv */
case 333:
if (ndx == 0 || ndx == 1)
diff --git a/sys/amd64/linux32/syscalls.master b/sys/amd64/linux32/syscalls.master
index 92d5f09c423f..7bd522a598e8 100644
--- a/sys/amd64/linux32/syscalls.master
+++ b/sys/amd64/linux32/syscalls.master
@@ -1589,10 +1589,17 @@
int linux_inotify_init(void);
}
292 AUE_NULL STD {
- int linux_inotify_add_watch(void);
+ int linux_inotify_add_watch(
+ l_int fd,
+ const char *pathname,
+ uint32_t mask
+ );
}
293 AUE_NULL STD {
- int linux_inotify_rm_watch(void);
+ int linux_inotify_rm_watch(
+ l_int fd,
+ uint32_t wd
+ );
}
; Linux 2.6.16:
294 AUE_NULL STD {
@@ -1860,7 +1867,9 @@
);
}
332 AUE_NULL STD {
- int linux_inotify_init1(void);
+ int linux_inotify_init1(
+ l_int flags
+ );
}
; Linux 2.6.30:
333 AUE_NULL STD {
diff --git a/sys/amd64/pt/pt.c b/sys/amd64/pt/pt.c
new file mode 100644
index 000000000000..c7b75767680a
--- /dev/null
+++ b/sys/amd64/pt/pt.c
@@ -0,0 +1,978 @@
+/*
+ * Copyright (c) 2025 Bojan Novković <bnovkov@freebsd.org>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+/*
+ * hwt(4) Intel Processor Trace (PT) backend
+ *
+ * Driver Design Overview
+ *
+ * - Since PT is configured on a per-core basis, the driver uses
+ *   'smp_rendezvous' to start and stop tracing on each target core.
+ * - PT-specific resources are stored in a 'struct pt_ctx' context structure for
+ * each traced CPU core or thread. Upon initialization, a ToPA configuration
+ * is generated for each 'pt_ctx' structure using the HWT tracing buffers.
+ * The HWT tracing buffer is split into 4K ToPA entries. Currently, each
+ * 4K ToPA entry is configured to trigger an interrupt after it is filled.
+ * - The PT driver uses the XSAVE/XRSTOR PT extensions to load and save all
+ * relevant PT registers. Every time a traced thread is switched
+ * out or in, its state will be saved to or loaded from its corresponding
+ * 'pt_ctx' context.
+ * - When tracing starts, the PT hardware will start writing data into the
+ * tracing buffer. When a TOPA_INT entry is filled, it will trigger an
+ * interrupt before continuing. The interrupt handler will then fetch the
+ * last valid tracing buffer offset and enqueue a HWT_RECORD_BUFFER record.
+ * The driver is currently configured to use the NMI interrupt line.
+ * - The userspace PT backend waits for incoming HWT_RECORD_BUFFER records
+ * and uses the offsets to decode data from the tracing buffer.
+ *
+ * Future improvements and limitations
+ *
+ * - We currently configure the PT hardware to trigger an interrupt whenever
+ * a 4K ToPA entry is filled. While this is fine when tracing smaller
+ * functions or infrequent code paths, this will generate too much interrupt
+ * traffic when tracing hotter functions. A proper solution for this issue
+ * should estimate the amount of data generated by the current configuration
+ * and use it to determine interrupt frequency.
+ *
+ * - Support for more tracing options and PT features.
+ *
+ */
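/*
 * Editorial sketch (not part of this diff): the buffer offset carried
 * by a HWT_RECORD_BUFFER record is linear and monotonic; wrap-arounds
 * of the circular buffer are folded in (see pt_fill_buffer_record()
 * below), so a decoder can recover both the amount of new data and
 * its position in the mapping from the struct pt_buffer fields:
 *
 *	total = wrap_count * size + curpage * PAGE_SIZE + offset;
 *	new_bytes = total - total_at_previous_record;
 *	pos_in_mapping = total % size;
 */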
+
+#include <sys/systm.h>
+#include <sys/hwt.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/mutex.h>
+#include <sys/sdt.h>
+#include <sys/smp.h>
+#include <sys/taskqueue.h>
+
+#include <vm/vm.h>
+#include <vm/vm_page.h>
+
+#include <machine/atomic.h>
+#include <machine/cpufunc.h>
+#include <machine/fpu.h>
+#include <machine/smp.h>
+#include <machine/specialreg.h>
+
+#include <x86/apicvar.h>
+#include <x86/x86_var.h>
+
+#include <dev/hwt/hwt_context.h>
+#include <dev/hwt/hwt_vm.h>
+#include <dev/hwt/hwt_backend.h>
+#include <dev/hwt/hwt_config.h>
+#include <dev/hwt/hwt_cpu.h>
+#include <dev/hwt/hwt_record.h>
+#include <dev/hwt/hwt_thread.h>
+
+#include <amd64/pt/pt.h>
+
+#ifdef PT_DEBUG
+#define dprintf(fmt, ...) printf(fmt, ##__VA_ARGS__)
+#else
+#define dprintf(fmt, ...)
+#endif
+#define PT_SUPPORTED_FLAGS \
+ (RTIT_CTL_MTCEN | RTIT_CTL_CR3FILTER | RTIT_CTL_DIS_TNT | \
+ RTIT_CTL_USER | RTIT_CTL_OS | RTIT_CTL_BRANCHEN)
+#define PT_XSAVE_MASK (XFEATURE_ENABLED_X87 | XFEATURE_ENABLED_SSE)
+#define PT_XSTATE_BV (PT_XSAVE_MASK | XFEATURE_ENABLED_PT)
+#define PT_MAX_IP_RANGES 2
+
+#define PT_TOPA_MASK_PTRS 0x7f
+#define PT_TOPA_PAGE_MASK 0xffffff80
+#define PT_TOPA_PAGE_SHIFT 7
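/*
 * Editorial note (not part of this diff): the masks above carve up
 * MSR_IA32_RTIT_OUTPUT_MASK_PTRS as this driver uses it with ToPA
 * output:
 *
 *	bits  6:0  - always read as 0x7f (PT_TOPA_MASK_PTRS),
 *	bits 31:7  - current ToPA entry index; with 4K entries this is
 *		     also the current output page (PT_TOPA_PAGE_*),
 *	bits 63:32 - byte offset within the current output page.
 *
 * pt_update_buffer() below decodes the current page and offset
 * exactly this way.
 */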
+
+#define CPUID_PT_LEAF 0x14
+
+MALLOC_DEFINE(M_PT, "pt", "Intel Processor Trace");
+
+SDT_PROVIDER_DEFINE(pt);
+SDT_PROBE_DEFINE(pt, , , topa__intr);
+
+TASKQUEUE_FAST_DEFINE_THREAD(pt);
+
+static void pt_send_buffer_record(void *arg, int pending __unused);
+static int pt_topa_intr(struct trapframe *tf);
+
+/*
+ * Intel Processor Trace XSAVE-managed state.
+ */
+struct pt_ext_area {
+ uint64_t rtit_ctl;
+ uint64_t rtit_output_base;
+ uint64_t rtit_output_mask_ptrs;
+ uint64_t rtit_status;
+ uint64_t rtit_cr3_match;
+ uint64_t rtit_addr0_a;
+ uint64_t rtit_addr0_b;
+ uint64_t rtit_addr1_a;
+ uint64_t rtit_addr1_b;
+};
+
+struct pt_buffer {
+ uint64_t *topa_hw; /* ToPA table entries. */
+ size_t size;
+ struct mtx lock; /* Lock for fields below. */
+ vm_offset_t offset;
+ uint64_t wrap_count;
+ int curpage;
+};
+
+struct pt_ctx {
+ int id;
+ struct pt_buffer buf; /* ToPA buffer metadata */
+ struct task task; /* ToPA buffer notification task */
+ struct hwt_context *hwt_ctx;
+ uint8_t *save_area; /* PT XSAVE area */
+};
+
+/* PT tracing contexts used for CPU mode. */
+static struct pt_ctx *pt_pcpu_ctx;
+
+enum pt_cpu_state {
+ PT_DISABLED = 0,
+ PT_STOPPED,
+ PT_ACTIVE
+};
+
+static struct pt_cpu {
+ struct pt_ctx *ctx; /* active PT tracing context */
+ enum pt_cpu_state state; /* used as part of trace stop protocol */
+} *pt_pcpu;
+
+/*
+ * PT-related CPUID bits.
+ */
+static struct pt_cpu_info {
+ uint32_t l0_eax;
+ uint32_t l0_ebx;
+ uint32_t l0_ecx;
+ uint32_t l1_eax;
+ uint32_t l1_ebx;
+ size_t xsave_area_size;
+ size_t xstate_hdr_offset;
+ size_t pt_xsave_offset;
+} pt_info __read_mostly;
+
+static bool initialized = false;
+static int cpu_mode_ctr = 0;
+
+static __inline enum pt_cpu_state
+pt_cpu_get_state(int cpu_id)
+{
+ return (atomic_load_int(&pt_pcpu[cpu_id].state));
+}
+
+static __inline void
+pt_cpu_set_state(int cpu_id, enum pt_cpu_state state)
+{
+ atomic_store_int(&pt_pcpu[cpu_id].state, state);
+}
+
+static __inline struct xstate_hdr *
+pt_ctx_get_xstate_hdr(struct pt_ctx *ctx)
+{
+ return ((struct xstate_hdr *)(ctx->save_area +
+ pt_info.xstate_hdr_offset));
+}
+
+static __inline struct pt_ext_area *
+pt_ctx_get_ext_area(struct pt_ctx *ctx)
+{
+ return ((struct pt_ext_area *)(ctx->save_area +
+ pt_info.pt_xsave_offset));
+}
+
+/*
+ * Updates current trace buffer offset from the
+ * ToPA MSRs. Records if the trace buffer wrapped.
+ */
+static __inline void
+pt_update_buffer(struct pt_buffer *buf)
+{
+ uint64_t reg;
+ int curpage;
+
+ /* Update buffer offset. */
+ reg = rdmsr(MSR_IA32_RTIT_OUTPUT_MASK_PTRS);
+ curpage = (reg & PT_TOPA_PAGE_MASK) >> PT_TOPA_PAGE_SHIFT;
+ mtx_lock_spin(&buf->lock);
+ /* Check if the output wrapped. */
+ if (buf->curpage > curpage)
+ buf->wrap_count++;
+ buf->curpage = curpage;
+ buf->offset = reg >> 32;
+ mtx_unlock_spin(&buf->lock);
+
+ dprintf("%s: wrap_cnt: %lu, curpage: %d, offset: %zu\n", __func__,
+ buf->wrap_count, buf->curpage, buf->offset);
+}
+
+static __inline void
+pt_fill_buffer_record(int id, struct pt_buffer *buf,
+ struct hwt_record_entry *rec)
+{
+ rec->record_type = HWT_RECORD_BUFFER;
+ rec->buf_id = id;
+ rec->curpage = buf->curpage;
+ rec->offset = buf->offset + (buf->wrap_count * buf->size);
+}
+
+/*
+ * Enables or disables tracing on curcpu
+ * using the XSAVE/XRSTOR PT extensions.
+ */
+static void
+pt_cpu_toggle_local(uint8_t *save_area, bool enable)
+{
+ u_long xcr0, cr0;
+ u_long xss;
+
+ cr0 = rcr0();
+ if (cr0 & CR0_TS)
+ clts();
+ xcr0 = rxcr(XCR0);
+ if ((xcr0 & PT_XSAVE_MASK) != PT_XSAVE_MASK)
+ load_xcr(XCR0, xcr0 | PT_XSAVE_MASK);
+ xss = rdmsr(MSR_IA32_XSS);
+ wrmsr(MSR_IA32_XSS, xss | XFEATURE_ENABLED_PT);
+
+ if (!enable) {
+ KASSERT((rdmsr(MSR_IA32_RTIT_CTL) & RTIT_CTL_TRACEEN) != 0,
+ ("%s: PT is disabled", __func__));
+ xsaves(save_area, XFEATURE_ENABLED_PT);
+ } else {
+ KASSERT((rdmsr(MSR_IA32_RTIT_CTL) & RTIT_CTL_TRACEEN) == 0,
+ ("%s: PT is enabled", __func__));
+ xrstors(save_area, XFEATURE_ENABLED_PT);
+ }
+ wrmsr(MSR_IA32_XSS, xss);
+ if ((xcr0 & PT_XSAVE_MASK) != PT_XSAVE_MASK)
+ load_xcr(XCR0, xcr0);
+ if (cr0 & CR0_TS)
+ load_cr0(cr0);
+}
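/*
 * Editorial note (not part of this diff): passing only
 * XFEATURE_ENABLED_PT as the requested-feature bitmap makes
 * XSAVES/XRSTORS move just the PT MSR state between the CPU and
 * 'save_area', so the same primitive both stops tracing (state saved
 * out, TraceEn cleared on the CPU) and resumes it (state loaded back),
 * as the KASSERTs above rely on.
 */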
+
+/*
+ * Starts PT tracing on 'curcpu'.
+ */
+static void
+pt_cpu_start(void *dummy)
+{
+ struct pt_cpu *cpu;
+
+ cpu = &pt_pcpu[curcpu];
+ MPASS(cpu->ctx != NULL);
+
+ dprintf("%s: curcpu %d\n", __func__, curcpu);
+ load_cr4(rcr4() | CR4_XSAVE);
+ wrmsr(MSR_IA32_RTIT_STATUS, 0);
+ pt_cpu_set_state(curcpu, PT_ACTIVE);
+ pt_cpu_toggle_local(cpu->ctx->save_area, true);
+}
+
+/*
+ * Stops PT tracing on 'curcpu'.
+ * Updates trace buffer offset to ensure
+ * any data generated between the last interrupt
+ * and the trace stop gets picked up by userspace.
+ */
+static void
+pt_cpu_stop(void *dummy)
+{
+ struct pt_cpu *cpu;
+ struct pt_ctx *ctx;
+
+ /* Shutdown may occur before PT gets properly configured. */
+ if (pt_cpu_get_state(curcpu) == PT_DISABLED)
+ return;
+
+ cpu = &pt_pcpu[curcpu];
+ ctx = cpu->ctx;
+ MPASS(ctx != NULL);
+ dprintf("%s: curcpu %d\n", __func__, curcpu);
+
+ pt_cpu_set_state(curcpu, PT_STOPPED);
+ pt_cpu_toggle_local(cpu->ctx->save_area, false);
+ pt_update_buffer(&ctx->buf);
+}
+
+/*
+ * Prepares the Table of Physical Addresses (ToPA) metadata for 'pt_ctx'.
+ * The HWT trace buffer is split into 4K ToPA table entries and used
+ * as a circular buffer, meaning that the last ToPA entry points to
+ * the first ToPA entry. Each entry is configured to raise an
+ * interrupt after being filled.
+ */
+static int
+pt_topa_prepare(struct pt_ctx *ctx, struct hwt_vm *vm)
+{
+ struct pt_buffer *buf;
+ size_t topa_size;
+ int i;
+
+ topa_size = TOPA_SIZE_4K;
+ buf = &ctx->buf;
+
+ KASSERT(buf->topa_hw == NULL,
+ ("%s: ToPA info already exists", __func__));
+ buf->topa_hw = mallocarray(vm->npages + 1, sizeof(uint64_t), M_PT,
+ M_ZERO | M_WAITOK);
+ dprintf("%s: ToPA virt addr %p\n", __func__, buf->topa_hw);
+ buf->size = vm->npages * PAGE_SIZE;
+ for (i = 0; i < vm->npages; i++) {
+ buf->topa_hw[i] = VM_PAGE_TO_PHYS(vm->pages[i]) | topa_size;
+ /*
+ * XXX: TOPA_INT should ideally be set according to
+ * expected amount of incoming trace data. Too few TOPA_INT
+ * entries will not trigger interrupts often enough when tracing
+ * smaller functions.
+ */
+ buf->topa_hw[i] |= TOPA_INT;
+ }
+ buf->topa_hw[vm->npages] = (uint64_t)vtophys(buf->topa_hw) | TOPA_END;
+
+ return (0);
+}
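/*
 * Editorial sketch (not part of this diff): for a 3-page HWT buffer
 * the loop above produces a single-level table
 *
 *	topa_hw[0] = PHYS(page 0) | TOPA_SIZE_4K | TOPA_INT
 *	topa_hw[1] = PHYS(page 1) | TOPA_SIZE_4K | TOPA_INT
 *	topa_hw[2] = PHYS(page 2) | TOPA_SIZE_4K | TOPA_INT
 *	topa_hw[3] = vtophys(topa_hw) | TOPA_END
 *
 * whose END entry points back at the table itself, which is what
 * makes the output circular.
 */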
+
+/*
+ * Configures IP filtering for trace generation.
+ * A maximum of 2 ranges can be specified due to
+ * limitations imposed by the XSAVE/XRSTOR PT extensions.
+ */
+static int
+pt_configure_ranges(struct pt_ctx *ctx, struct pt_cpu_config *cfg)
+{
+ struct pt_ext_area *pt_ext;
+ int nranges_supp, n, error = 0;
+
+ pt_ext = pt_ctx_get_ext_area(ctx);
+ if (pt_info.l0_ebx & CPUPT_IPF) {
+ nranges_supp = (pt_info.l1_eax & CPUPT_NADDR_M) >>
+ CPUPT_NADDR_S;
+
+ if (nranges_supp > PT_IP_FILTER_MAX_RANGES)
+ nranges_supp = PT_IP_FILTER_MAX_RANGES;
+ n = cfg->nranges;
+ if (n > nranges_supp) {
+ printf("%s: %d IP filtering ranges requested, CPU "
+ "supports %d, truncating\n",
+ __func__, n, nranges_supp);
+ n = nranges_supp;
+ }
+
+		switch (n) {
+		case 2:
+			pt_ext->rtit_ctl |= (1UL << RTIT_CTL_ADDR_CFG_S(1));
+			pt_ext->rtit_addr1_a = cfg->ip_ranges[1].start;
+			pt_ext->rtit_addr1_b = cfg->ip_ranges[1].end;
+			/* FALLTHROUGH */
+		case 1:
+			pt_ext->rtit_ctl |= (1UL << RTIT_CTL_ADDR_CFG_S(0));
+			pt_ext->rtit_addr0_a = cfg->ip_ranges[0].start;
+			pt_ext->rtit_addr0_b = cfg->ip_ranges[0].end;
+			break;
+		default:
+			error = EINVAL;
+			break;
+		}
+	} else
+		error = ENXIO;
+
+ return (error);
+}
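/*
 * Editorial sketch (not part of this diff): a configuration tracing
 * two hypothetical text ranges would fill the fields consumed above;
 * how the structure reaches the driver is up to the hwt(4) config
 * path:
 *
 *	struct pt_cpu_config cfg = { 0 };
 *
 *	cfg.rtit_ctl = RTIT_CTL_USER | RTIT_CTL_BRANCHEN;
 *	cfg.nranges = 2;
 *	cfg.ip_ranges[0].start = main_text_start;
 *	cfg.ip_ranges[0].end = main_text_end;
 *	cfg.ip_ranges[1].start = lib_text_start;
 *	cfg.ip_ranges[1].end = lib_text_end;
 */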
+
+static int
+pt_init_ctx(struct pt_ctx *pt_ctx, struct hwt_vm *vm, int ctx_id)
+{
+
+ dprintf("%s: ctx id %d\n", __func__, ctx_id);
+
+ KASSERT(pt_ctx->buf.topa_hw == NULL,
+ ("%s: active ToPA buffer in context %p\n", __func__, pt_ctx));
+
+ memset(pt_ctx, 0, sizeof(struct pt_ctx));
+ mtx_init(&pt_ctx->buf.lock, "pttopa", NULL, MTX_SPIN);
+ pt_ctx->save_area = malloc_aligned(pt_info.xsave_area_size, 64,
+ M_PT, M_NOWAIT | M_ZERO);
+ if (pt_ctx->save_area == NULL)
+ return (ENOMEM);
+ dprintf("%s: preparing ToPA buffer\n", __func__);
+ if (pt_topa_prepare(pt_ctx, vm) != 0) {
+ dprintf("%s: failed to prepare ToPA buffer\n", __func__);
+ free(pt_ctx->save_area, M_PT);
+ return (ENOMEM);
+ }
+
+ pt_ctx->id = ctx_id;
+ TASK_INIT(&pt_ctx->task, 0, pt_send_buffer_record, pt_ctx);
+
+ return (0);
+}
+
+static void
+pt_deinit_ctx(struct pt_ctx *pt_ctx)
+{
+
+ if (pt_ctx->buf.topa_hw != NULL)
+ free(pt_ctx->buf.topa_hw, M_PT);
+ if (pt_ctx->save_area != NULL)
+ free(pt_ctx->save_area, M_PT);
+	memset(pt_ctx, 0, sizeof(*pt_ctx));
+}
+
+/*
+ * HWT backend configuration method.
+ *
+ * Checks and translates the user-defined configuration to a
+ * set of PT tracing features. Uses the feature set to initialize
+ * the tracing context for the target CPU or thread.
+ */
+static int
+pt_backend_configure(struct hwt_context *ctx, int cpu_id, int thread_id)
+{
+ struct hwt_cpu *hwt_cpu;
+ struct hwt_thread *thr;
+ struct pt_ctx *pt_ctx;
+ struct pt_cpu_config *cfg;
+ struct pt_ext_area *pt_ext;
+ struct xstate_hdr *hdr;
+ int error;
+
+ dprintf("%s\n", __func__);
+
+ cfg = (struct pt_cpu_config *)ctx->config;
+ pt_ctx = NULL;
+
+ /* Clear any flags we don't support yet. */
+ cfg->rtit_ctl &= PT_SUPPORTED_FLAGS;
+ if (cfg->rtit_ctl & RTIT_CTL_MTCEN) {
+ if ((pt_info.l0_ebx & CPUPT_MTC) == 0) {
+ printf("%s: CPU does not support generating MTC "
+ "packets\n", __func__);
+ return (ENXIO);
+ }
+ }
+
+ if (cfg->rtit_ctl & RTIT_CTL_CR3FILTER) {
+ if ((pt_info.l0_ebx & CPUPT_CR3) == 0) {
+ printf("%s: CPU does not support CR3 filtering\n",
+ __func__);
+ return (ENXIO);
+ }
+ }
+
+ if (cfg->rtit_ctl & RTIT_CTL_DIS_TNT) {
+ if ((pt_info.l0_ebx & CPUPT_DIS_TNT) == 0) {
+ printf("%s: CPU does not support TNT\n", __func__);
+ return (ENXIO);
+ }
+ }
+ /* TODO: support for more config bits. */
+
+ if (ctx->mode == HWT_MODE_CPU) {
+ TAILQ_FOREACH(hwt_cpu, &ctx->cpus, next) {
+ if (hwt_cpu->cpu_id != cpu_id)
+ continue;
+ pt_ctx = &pt_pcpu_ctx[cpu_id];
+ break;
+ }
+ } else {
+ TAILQ_FOREACH(thr, &ctx->threads, next) {
+ if (thr->thread_id != thread_id)
+ continue;
+ KASSERT(thr->private != NULL,
+ ("%s: hwt thread private"
+ " not set, thr %p",
+ __func__, thr));
+ pt_ctx = (struct pt_ctx *)thr->private;
+ break;
+ }
+ }
+ if (pt_ctx == NULL)
+ return (ENOENT);
+
+ dprintf("%s: preparing MSRs\n", __func__);
+ pt_ext = pt_ctx_get_ext_area(pt_ctx);
+ hdr = pt_ctx_get_xstate_hdr(pt_ctx);
+
+ pt_ext->rtit_ctl |= cfg->rtit_ctl;
+ if (cfg->nranges != 0) {
+ dprintf("%s: preparing IPF ranges\n", __func__);
+ if ((error = pt_configure_ranges(pt_ctx, cfg)) != 0)
+ return (error);
+ }
+ pt_ctx->hwt_ctx = ctx;
+ pt_ext->rtit_ctl |= RTIT_CTL_TOPA;
+ pt_ext->rtit_output_base = (uint64_t)vtophys(pt_ctx->buf.topa_hw);
+ pt_ext->rtit_output_mask_ptrs = PT_TOPA_MASK_PTRS;
+ hdr->xstate_bv = XFEATURE_ENABLED_PT;
+ hdr->xstate_xcomp_bv = XFEATURE_ENABLED_PT |
+ XSTATE_XCOMP_BV_COMPACT;
+ pt_ext->rtit_ctl |= RTIT_CTL_TRACEEN;
+ pt_pcpu[cpu_id].ctx = pt_ctx;
+ pt_cpu_set_state(cpu_id, PT_STOPPED);
+
+ return (0);
+}
+
+/*
+ * hwt backend trace start operation. CPU affine.
+ */
+static void
+pt_backend_enable(struct hwt_context *ctx, int cpu_id)
+{
+ if (ctx->mode == HWT_MODE_CPU)
+ return;
+
+ KASSERT(curcpu == cpu_id,
+ ("%s: attempting to start PT on another cpu", __func__));
+ pt_cpu_start(NULL);
+ CPU_SET(cpu_id, &ctx->cpu_map);
+}
+
+/*
+ * hwt backend trace stop operation. CPU affine.
+ */
+static void
+pt_backend_disable(struct hwt_context *ctx, int cpu_id)
+{
+ struct pt_cpu *cpu;
+
+ if (ctx->mode == HWT_MODE_CPU)
+ return;
+
+ KASSERT(curcpu == cpu_id,
+ ("%s: attempting to disable PT on another cpu", __func__));
+ pt_cpu_stop(NULL);
+ CPU_CLR(cpu_id, &ctx->cpu_map);
+ cpu = &pt_pcpu[cpu_id];
+ cpu->ctx = NULL;
+}
+
+/*
+ * hwt backend trace start operation for remote CPUs.
+ */
+static int
+pt_backend_enable_smp(struct hwt_context *ctx)
+{
+
+ dprintf("%s\n", __func__);
+ if (ctx->mode == HWT_MODE_CPU &&
+ atomic_swap_32(&cpu_mode_ctr, 1) != 0)
+ return (-1);
+
+ KASSERT(ctx->mode == HWT_MODE_CPU,
+ ("%s: should only be used for CPU mode", __func__));
+ smp_rendezvous_cpus(ctx->cpu_map, NULL, pt_cpu_start, NULL, NULL);
+
+ return (0);
+}
+
+/*
+ * hwt backend trace stop operation for remote CPUs.
+ */
+static int
+pt_backend_disable_smp(struct hwt_context *ctx)
+{
+
+ dprintf("%s\n", __func__);
+ if (ctx->mode == HWT_MODE_CPU &&
+ atomic_swap_32(&cpu_mode_ctr, 0) == 0)
+ return (-1);
+
+ if (CPU_EMPTY(&ctx->cpu_map)) {
+ dprintf("%s: empty cpu map\n", __func__);
+ return (-1);
+ }
+ smp_rendezvous_cpus(ctx->cpu_map, NULL, pt_cpu_stop, NULL, NULL);
+
+ return (0);
+}
+
+/*
+ * HWT backend initialization method.
+ *
+ * Initializes the tracing contexts used for HWT_MODE_CPU.  The ToPA
+ * interrupt handler itself is installed once, at module load time.
+ */
+static int
+pt_backend_init(struct hwt_context *ctx)
+{
+ struct hwt_cpu *hwt_cpu;
+ int error;
+
+ dprintf("%s\n", __func__);
+ if (ctx->mode == HWT_MODE_CPU) {
+ TAILQ_FOREACH(hwt_cpu, &ctx->cpus, next) {
+ error = pt_init_ctx(&pt_pcpu_ctx[hwt_cpu->cpu_id],
+ hwt_cpu->vm, hwt_cpu->cpu_id);
+ if (error)
+ return (error);
+ }
+ }
+
+ return (0);
+}
+
+/*
+ * HWT backend teardown method.
+ *
+ * Stops tracing on all active CPUs and releases all previously
+ * allocated ToPA metadata.
+ */
+static int
+pt_backend_deinit(struct hwt_context *ctx)
+{
+ struct pt_ctx *pt_ctx;
+ struct hwt_thread *thr;
+ int cpu_id;
+
+ dprintf("%s\n", __func__);
+
+ pt_backend_disable_smp(ctx);
+ if (ctx->mode == HWT_MODE_THREAD) {
+ TAILQ_FOREACH(thr, &ctx->threads, next) {
+ KASSERT(thr->private != NULL,
+ ("%s: thr->private not set", __func__));
+ pt_ctx = (struct pt_ctx *)thr->private;
+ pt_deinit_ctx(pt_ctx);
+ }
+ } else {
+ CPU_FOREACH(cpu_id) {
+ if (!CPU_ISSET(cpu_id, &ctx->cpu_map))
+ continue;
+ if (pt_pcpu[cpu_id].ctx != NULL) {
+ KASSERT(pt_pcpu[cpu_id].ctx ==
+ &pt_pcpu_ctx[cpu_id],
+				    ("%s: CPU mode tracing with non-cpu "
+				    "mode PT context active",
+ __func__));
+ pt_pcpu[cpu_id].ctx = NULL;
+ }
+ pt_ctx = &pt_pcpu_ctx[cpu_id];
+ pt_deinit_ctx(pt_ctx);
+ memset(&pt_pcpu[cpu_id], 0, sizeof(struct pt_cpu));
+ }
+ }
+
+ return (0);
+}
+
+/*
+ * Fetches current offset into the tracing buffer.
+ */
+static int
+pt_backend_read(struct hwt_vm *vm, int *curpage, vm_offset_t *curpage_offset,
+ uint64_t *data)
+{
+ struct pt_buffer *buf;
+
+ if (vm->ctx->mode == HWT_MODE_THREAD)
+ buf = &((struct pt_ctx *)vm->thr->private)->buf;
+ else
+ buf = &pt_pcpu[vm->cpu->cpu_id].ctx->buf;
+ mtx_lock_spin(&buf->lock);
+ *curpage = buf->curpage;
+ *curpage_offset = buf->offset + (buf->wrap_count * vm->ctx->bufsize);
+ mtx_unlock_spin(&buf->lock);
+
+ return (0);
+}
+
+/*
+ * HWT thread creation hook.
+ * Allocates and associates a 'struct pt_ctx' for a given hwt thread.
+ */
+static int
+pt_backend_alloc_thread(struct hwt_thread *thr)
+{
+ struct pt_ctx *pt_ctx;
+ int error;
+
+	/* Omit M_WAITOK; this may be invoked in a non-sleepable context. */
+ pt_ctx = malloc(sizeof(*pt_ctx), M_PT, M_NOWAIT | M_ZERO);
+ if (pt_ctx == NULL)
+ return (ENOMEM);
+
+ error = pt_init_ctx(pt_ctx, thr->vm, thr->thread_id);
+ if (error)
+ return (error);
+
+ thr->private = pt_ctx;
+ return (0);
+}
+
+/*
+ * HWT thread teardown hook.
+ */
+static void
+pt_backend_free_thread(struct hwt_thread *thr)
+{
+ struct pt_ctx *ctx;
+
+ ctx = (struct pt_ctx *)thr->private;
+
+ pt_deinit_ctx(ctx);
+ free(ctx, M_PT);
+}
+
+static void
+pt_backend_dump(int cpu_id)
+{
+}
+
+static struct hwt_backend_ops pt_ops = {
+ .hwt_backend_init = pt_backend_init,
+ .hwt_backend_deinit = pt_backend_deinit,
+
+ .hwt_backend_configure = pt_backend_configure,
+
+ .hwt_backend_enable = pt_backend_enable,
+ .hwt_backend_disable = pt_backend_disable,
+
+#ifdef SMP
+ .hwt_backend_enable_smp = pt_backend_enable_smp,
+ .hwt_backend_disable_smp = pt_backend_disable_smp,
+#endif
+
+ .hwt_backend_read = pt_backend_read,
+ .hwt_backend_dump = pt_backend_dump,
+
+ .hwt_backend_thread_alloc = pt_backend_alloc_thread,
+ .hwt_backend_thread_free = pt_backend_free_thread,
+};
+
+static struct hwt_backend backend = {
+ .ops = &pt_ops,
+ .name = "pt",
+ .kva_req = 1,
+};
+
+/*
+ * Reads the latest valid trace buffer offset and enqueues
+ * a HWT_RECORD_BUFFER record.
+ * Used as a taskqueue routine from the ToPA interrupt handler.
+ */
+static void
+pt_send_buffer_record(void *arg, int pending __unused)
+{
+ struct hwt_record_entry record;
+ struct pt_ctx *ctx = (struct pt_ctx *)arg;
+
+ /* Prepare buffer record. */
+ mtx_lock_spin(&ctx->buf.lock);
+ pt_fill_buffer_record(ctx->id, &ctx->buf, &record);
+ mtx_unlock_spin(&ctx->buf.lock);
+ hwt_record_ctx(ctx->hwt_ctx, &record, M_ZERO | M_NOWAIT);
+}
+
+static void
+pt_topa_status_clear(void)
+{
+ uint64_t reg;
+
+ reg = rdmsr(MSR_IA_GLOBAL_STATUS_RESET);
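+ /* Writing the flag back clears TraceToPAPMI in IA32_PERF_GLOBAL_STATUS. */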
+ reg |= GLOBAL_STATUS_FLAG_TRACETOPAPMI;
+ wrmsr(MSR_IA_GLOBAL_STATUS_RESET, reg);
+}
+
+/*
+ * ToPA PMI handler.
+ *
+ * Invoked every time a ToPA entry marked with TOPA_INT is filled.
+ * Uses a taskqueue to enqueue a buffer record for userspace.
+ * Re-enables the local APIC performance counter interrupt (PCINT) line
+ * as long as tracing is active.
+ */
+static int
+pt_topa_intr(struct trapframe *tf)
+{
+ struct pt_buffer *buf;
+ struct pt_ctx *ctx;
+ uint64_t reg;
+
+ SDT_PROBE0(pt, , , topa__intr);
+
+ if (pt_cpu_get_state(curcpu) != PT_ACTIVE)
+ return (0);
+ reg = rdmsr(MSR_IA_GLOBAL_STATUS);
+ if ((reg & GLOBAL_STATUS_FLAG_TRACETOPAPMI) == 0) {
+ /* ACK spurious or leftover interrupt. */
+ pt_topa_status_clear();
+ return (1);
+ }
+
+ ctx = pt_pcpu[curcpu].ctx;
+ buf = &ctx->buf;
+ KASSERT(buf->topa_hw != NULL,
+ ("%s: ToPA PMI interrupt with invalid buffer", __func__));
+
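+ /* Pause tracing on this CPU while the buffer state is updated. */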
+ pt_cpu_toggle_local(ctx->save_area, false);
+ pt_update_buffer(buf);
+ pt_topa_status_clear();
+ taskqueue_enqueue_flags(taskqueue_pt, &ctx->task,
+ TASKQUEUE_FAIL_IF_PENDING);
+
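+ /* Resume tracing and re-arm the interrupt only if still active. */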
+ if (pt_cpu_get_state(curcpu) == PT_ACTIVE) {
+ pt_cpu_toggle_local(ctx->save_area, true);
+ lapic_reenable_pcint();
+ }
+ return (1);
+}
+
+/*
+ * Module initialization.
+ *
+ * Saves all PT-related cpuid info, registers itself as a HWT backend,
+ * and allocates metadata required to keep track of tracing operations
+ * on each CPU.
+ */
+static int
+pt_init(void)
+{
+ u_int cp[4];
+ int error;
+
+ dprintf("pt: Enumerating part 1\n");
+ cpuid_count(CPUID_PT_LEAF, 0, cp);
+ dprintf("pt: Maximum valid sub-leaf Index: %x\n", cp[0]);
+ dprintf("pt: ebx %x\n", cp[1]);
+ dprintf("pt: ecx %x\n", cp[2]);
+
+ pt_info.l0_eax = cp[0];
+ pt_info.l0_ebx = cp[1];
+ pt_info.l0_ecx = cp[2];
+
+ dprintf("pt: Enumerating part 2\n");
+ cpuid_count(CPUID_PT_LEAF, 1, cp);
+ dprintf("pt: eax %x\n", cp[0]);
+ dprintf("pt: ebx %x\n", cp[1]);
+
+ pt_info.l1_eax = cp[0];
+ pt_info.l1_ebx = cp[1];
+
+ error = hwt_backend_register(&backend);
+ if (error != 0) {
+ printf("pt: unable to register hwt backend, error %d\n", error);
+ return (error);
+ }
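+ /* Per-CPU state and preallocated CPU-mode contexts, indexed by CPU id. */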
+ pt_pcpu = mallocarray(mp_ncpus, sizeof(struct pt_cpu), M_PT,
+ M_ZERO | M_WAITOK);
+ pt_pcpu_ctx = mallocarray(mp_ncpus, sizeof(struct pt_ctx), M_PT,
+ M_ZERO | M_WAITOK);
+
+ nmi_register_handler(pt_topa_intr);
+ if (!lapic_enable_pcint()) {
+ nmi_remove_handler(pt_topa_intr);
+ hwt_backend_unregister(&backend);
+ free(pt_pcpu, M_PT);
+ free(pt_pcpu_ctx, M_PT);
+ pt_pcpu = NULL;
+ pt_pcpu_ctx = NULL;
+ printf("pt: failed to setup interrupt line\n");
+ return (ENXIO);
+ }
+ initialized = true;
+
+ return (0);
+}
+
+/*
+ * Checks whether the CPU supports Intel PT and
+ * initializes XSAVE area info.
+ *
+ * The driver relies on XSAVE/XRSTOR PT extensions,
+ * Table of Physical Addresses (ToPA) support, and
+ * support for multiple ToPA entries.
+ */
+static bool
+pt_supported(void)
+{
+ u_int cp[4];
+
+ if ((cpu_stdext_feature & CPUID_STDEXT_PROCTRACE) == 0) {
+ printf("pt: CPU does not support Intel Processor Trace\n");
+ return (false);
+ }
+ if ((cpu_feature2 & CPUID2_XSAVE) == 0) {
+ printf("pt: XSAVE is not supported\n");
+ return (false);
+ }
+ if (!xsave_extfeature_supported(XFEATURE_ENABLED_PT, true)) {
+ printf("pt: CPU does not support managing PT state using XSAVE\n");
+ return (false);
+ }
+ if (!xsave_extension_supported(CPUID_EXTSTATE_XSAVEC)) {
+ printf("pt: XSAVE compaction is not supported\n");
+ return (false);
+ }
+ if (!xsave_extension_supported(CPUID_EXTSTATE_XSAVES)) {
+ printf("pt: CPU does not support XSAVES/XRSTORS\n");
+ return (false);
+ }
+
+ /* Require ToPA support. */
+ cpuid_count(CPUID_PT_LEAF, 0, cp);
+ if ((cp[2] & CPUPT_TOPA) == 0) {
+ printf("pt: ToPA is not supported\n");
+ return (false);
+ }
+ if ((cp[2] & CPUPT_TOPA_MULTI) == 0) {
+ printf("pt: multiple ToPA outputs are not supported\n");
+ return (false);
+ }
+
+ pt_info.xstate_hdr_offset = xsave_area_hdr_offset();
+ pt_info.xsave_area_size = xsave_area_size(PT_XSTATE_BV, true, true);
+ pt_info.pt_xsave_offset = xsave_area_offset(PT_XSTATE_BV,
+ XFEATURE_ENABLED_PT, true, true);
+
+ return (true);
+}
+
+static void
+pt_deinit(void)
+{
+ if (!initialized)
+ return;
+ nmi_remove_handler(pt_topa_intr);
+ lapic_disable_pcint();
+ hwt_backend_unregister(&backend);
+ free(pt_pcpu, M_PT);
+ free(pt_pcpu_ctx, M_PT);
+ pt_pcpu = NULL;
+ pt_pcpu_ctx = NULL;
+ initialized = false;
+}
+
+static int
+pt_modevent(module_t mod, int type, void *data)
+{
+ switch (type) {
+ case MOD_LOAD:
+ if (!pt_supported() || pt_init() != 0) {
+ return (ENXIO);
+ }
+ break;
+ case MOD_UNLOAD:
+ pt_deinit();
+ break;
+ default:
+ break;
+ }
+
+ return (0);
+}
+
+static moduledata_t pt_mod = { "intel_pt", pt_modevent, NULL };
+
+DECLARE_MODULE(intel_pt, pt_mod, SI_SUB_DRIVERS, SI_ORDER_FIRST);
+MODULE_DEPEND(intel_pt, hwt, 1, 1, 1);
+MODULE_VERSION(intel_pt, 1);
diff --git a/sys/amd64/pt/pt.h b/sys/amd64/pt/pt.h
new file mode 100644
index 000000000000..2423afdf22e9
--- /dev/null
+++ b/sys/amd64/pt/pt.h
@@ -0,0 +1,49 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2023 Bojan Novković <bnovkov@freebsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _AMD64_PT_PT_H_
+#define _AMD64_PT_PT_H_
+
+#include <sys/types.h>
+
+#include <x86/include/specialreg.h>
+
+#define PT_IP_FILTER_MAX_RANGES (2) /* Intel SDM Vol. 3C, 33-29 */
+
+struct pt_cpu_config {
+ uint64_t rtit_ctl;
+ register_t cr3_filter;
+ int nranges;
+ struct ipf_range {
+ vm_offset_t start;
+ vm_offset_t end;
+ } ip_ranges[PT_IP_FILTER_MAX_RANGES];
+ uint32_t mtc_freq;
+ uint32_t cyc_thresh;
+ uint32_t psb_freq;
+};
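+
+/*
+ * Example (hypothetical values only): a configuration enabling branch
+ * tracing limited to a single IP range might be filled in as:
+ *
+ * struct pt_cpu_config cfg = {
+ * .rtit_ctl = RTIT_CTL_BRANCHEN,
+ * .nranges = 1,
+ * .ip_ranges = { { .start = 0x400000, .end = 0x500000 } },
+ * };
+ */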
+#endif /* !_AMD64_PT_PT_H_ */
diff --git a/sys/amd64/vmm/intel/vmx_support.S b/sys/amd64/vmm/intel/vmx_support.S
index f393f160b101..130130b64541 100644
--- a/sys/amd64/vmm/intel/vmx_support.S
+++ b/sys/amd64/vmm/intel/vmx_support.S
@@ -32,12 +32,6 @@
#include "vmx_assym.h"
-#ifdef SMP
-#define LK lock ;
-#else
-#define LK
-#endif
-
/* Be friendly to DTrace FBT's prologue/epilogue pattern matching */
#define VENTER push %rbp ; mov %rsp,%rbp
#define VLEAVE pop %rbp
diff --git a/sys/arm/allwinner/aw_gpio.c b/sys/arm/allwinner/aw_gpio.c
index 18b47bab12d9..2061e38a155f 100644
--- a/sys/arm/allwinner/aw_gpio.c
+++ b/sys/arm/allwinner/aw_gpio.c
@@ -1154,10 +1154,6 @@ aw_gpio_attach(device_t dev)
aw_gpio_register_isrcs(sc);
intr_pic_register(dev, OF_xref_from_node(ofw_bus_get_node(dev)));
- sc->sc_busdev = gpiobus_attach_bus(dev);
- if (sc->sc_busdev == NULL)
- goto fail;
-
/*
* Register as a pinctrl device
*/
@@ -1166,6 +1162,10 @@ aw_gpio_attach(device_t dev)
fdt_pinctrl_register(dev, "allwinner,pins");
fdt_pinctrl_configure_tree(dev);
+ sc->sc_busdev = gpiobus_attach_bus(dev);
+ if (sc->sc_busdev == NULL)
+ goto fail;
+
config_intrhook_oneshot(aw_gpio_enable_bank_supply, sc);
return (0);
diff --git a/sys/arm/allwinner/aw_mmc.c b/sys/arm/allwinner/aw_mmc.c
index 6bebf5e5fb5e..a8add957dc74 100644
--- a/sys/arm/allwinner/aw_mmc.c
+++ b/sys/arm/allwinner/aw_mmc.c
@@ -84,21 +84,26 @@
struct aw_mmc_conf {
uint32_t dma_xferlen;
+ uint32_t dma_desc_shift;
bool mask_data0;
bool can_calibrate;
bool new_timing;
+ bool zero_is_skip;
};
static const struct aw_mmc_conf a10_mmc_conf = {
.dma_xferlen = 0x2000,
+ .dma_desc_shift = 0,
};
static const struct aw_mmc_conf a13_mmc_conf = {
.dma_xferlen = 0x10000,
+ .dma_desc_shift = 0,
};
static const struct aw_mmc_conf a64_mmc_conf = {
.dma_xferlen = 0x10000,
+ .dma_desc_shift = 0,
.mask_data0 = true,
.can_calibrate = true,
.new_timing = true,
@@ -106,13 +111,24 @@ static const struct aw_mmc_conf a64_mmc_conf = {
static const struct aw_mmc_conf a64_emmc_conf = {
.dma_xferlen = 0x2000,
+ .dma_desc_shift = 0,
.can_calibrate = true,
};
+static const struct aw_mmc_conf d1_mmc_conf = {
+ .dma_xferlen = 0x1000,
+ .dma_desc_shift = 2,
+ .mask_data0 = true,
+ .can_calibrate = true,
+ .new_timing = true,
+ .zero_is_skip = true,
+};
+
static struct ofw_compat_data compat_data[] = {
{"allwinner,sun4i-a10-mmc", (uintptr_t)&a10_mmc_conf},
{"allwinner,sun5i-a13-mmc", (uintptr_t)&a13_mmc_conf},
{"allwinner,sun7i-a20-mmc", (uintptr_t)&a13_mmc_conf},
+ {"allwinner,sun20i-d1-mmc", (uintptr_t)&d1_mmc_conf},
{"allwinner,sun50i-a64-mmc", (uintptr_t)&a64_mmc_conf},
{"allwinner,sun50i-a64-emmc", (uintptr_t)&a64_emmc_conf},
{NULL, 0}
@@ -607,16 +623,18 @@ aw_dma_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int err)
dma_desc = sc->aw_dma_desc;
for (i = 0; i < nsegs; i++) {
- if (segs[i].ds_len == sc->aw_mmc_conf->dma_xferlen)
+ if ((segs[i].ds_len == sc->aw_mmc_conf->dma_xferlen) &&
+ !sc->aw_mmc_conf->zero_is_skip)
dma_desc[i].buf_size = 0; /* Size of 0 indicates max len */
else
dma_desc[i].buf_size = segs[i].ds_len;
- dma_desc[i].buf_addr = segs[i].ds_addr;
+ dma_desc[i].buf_addr = segs[i].ds_addr >>
+ sc->aw_mmc_conf->dma_desc_shift;
dma_desc[i].config = AW_MMC_DMA_CONFIG_CH |
- AW_MMC_DMA_CONFIG_OWN | AW_MMC_DMA_CONFIG_DIC;
-
- dma_desc[i].next = sc->aw_dma_desc_phys +
- ((i + 1) * sizeof(struct aw_mmc_dma_desc));
+ AW_MMC_DMA_CONFIG_OWN | AW_MMC_DMA_CONFIG_DIC;
+ dma_desc[i].next = (sc->aw_dma_desc_phys +
+ (i + 1) * sizeof(struct aw_mmc_dma_desc)) >>
+ sc->aw_mmc_conf->dma_desc_shift;
}
dma_desc[0].config |= AW_MMC_DMA_CONFIG_FD;
@@ -678,7 +696,8 @@ aw_mmc_prepare_dma(struct aw_mmc_softc *sc)
AW_MMC_WRITE_4(sc, AW_MMC_IDIE, val);
/* Set DMA descriptor list address */
- AW_MMC_WRITE_4(sc, AW_MMC_DLBA, sc->aw_dma_desc_phys);
+ AW_MMC_WRITE_4(sc, AW_MMC_DLBA, sc->aw_dma_desc_phys >>
+ sc->aw_mmc_conf->dma_desc_shift);
/* FIFO trigger level */
AW_MMC_WRITE_4(sc, AW_MMC_FWLR, AW_MMC_DMA_FTRGLEVEL);
diff --git a/sys/arm/allwinner/aw_rtc.c b/sys/arm/allwinner/aw_rtc.c
index 9938601f17ce..4af57ab879e8 100644
--- a/sys/arm/allwinner/aw_rtc.c
+++ b/sys/arm/allwinner/aw_rtc.c
@@ -134,6 +134,7 @@ static struct ofw_compat_data compat_data[] = {
{ "allwinner,sun7i-a20-rtc", (uintptr_t) &a20_conf },
{ "allwinner,sun6i-a31-rtc", (uintptr_t) &a31_conf },
{ "allwinner,sun8i-h3-rtc", (uintptr_t) &h3_conf },
+ { "allwinner,sun20i-d1-rtc", (uintptr_t) &h3_conf },
{ "allwinner,sun50i-h5-rtc", (uintptr_t) &h3_conf },
{ "allwinner,sun50i-h6-rtc", (uintptr_t) &h3_conf },
{ NULL, 0 }
@@ -147,11 +148,13 @@ struct aw_rtc_softc {
static struct clk_fixed_def aw_rtc_osc32k = {
.clkdef.id = 0,
+ .clkdef.name = "osc32k",
.freq = 32768,
};
static struct clk_fixed_def aw_rtc_iosc = {
.clkdef.id = 2,
+ .clkdef.name = "iosc",
};
static void aw_rtc_install_clocks(struct aw_rtc_softc *sc, device_t dev);
@@ -250,23 +253,33 @@ aw_rtc_install_clocks(struct aw_rtc_softc *sc, device_t dev) {
int nclocks;
node = ofw_bus_get_node(dev);
- nclocks = ofw_bus_string_list_to_array(node, "clock-output-names", &clknames);
- /* No clocks to export */
- if (nclocks <= 0)
- return;
- if (nclocks != 3) {
- device_printf(dev, "Having only %d clocks instead of 3, aborting\n", nclocks);
+ /* Nothing to do. */
+ if (!OF_hasprop(node, "clocks"))
return;
+
+ /*
+ * If the device tree gives us specific output names for the clocks,
+ * use them.
+ */
+ nclocks = ofw_bus_string_list_to_array(node, "clock-output-names", &clknames);
+ if (nclocks > 0) {
+ if (nclocks != 3) {
+ device_printf(dev,
+ "Found %d clocks names instead of 3, aborting\n",
+ nclocks);
+ return;
+ }
+
+ aw_rtc_osc32k.clkdef.name = clknames[0];
+ aw_rtc_iosc.clkdef.name = clknames[2];
}
clkdom = clkdom_create(dev);
- aw_rtc_osc32k.clkdef.name = clknames[0];
if (clknode_fixed_register(clkdom, &aw_rtc_osc32k) != 0)
device_printf(dev, "Cannot register osc32k clock\n");
- aw_rtc_iosc.clkdef.name = clknames[2];
aw_rtc_iosc.freq = sc->conf->iosc_freq;
if (clknode_fixed_register(clkdom, &aw_rtc_iosc) != 0)
device_printf(dev, "Cannot register iosc clock\n");
diff --git a/sys/arm/arm/pmap-v6.c b/sys/arm/arm/pmap-v6.c
index 92eb0589f80b..78883296c5b7 100644
--- a/sys/arm/arm/pmap-v6.c
+++ b/sys/arm/arm/pmap-v6.c
@@ -5767,7 +5767,7 @@ pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
CTR5(KTR_PMAP, "%s: page %p - 0x%08X oma: %d, ma: %d", __func__, m,
VM_PAGE_TO_PHYS(m), oma, ma);
- if ((m->flags & PG_FICTITIOUS) != 0)
+ if (ma == oma || (m->flags & PG_FICTITIOUS) != 0)
return;
#if 0
/*
@@ -5784,22 +5784,20 @@ pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
* If page is not mapped by sf buffer, map the page
* transient and do invalidation.
*/
- if (ma != oma) {
- pa = VM_PAGE_TO_PHYS(m);
- sched_pin();
- pc = get_pcpu();
- cmap2_pte2p = pc->pc_cmap2_pte2p;
- mtx_lock(&pc->pc_cmap_lock);
- if (pte2_load(cmap2_pte2p) != 0)
- panic("%s: CMAP2 busy", __func__);
- pte2_store(cmap2_pte2p, PTE2_KERN_NG(pa, PTE2_AP_KRW,
- vm_memattr_to_pte2(ma)));
- dcache_wbinv_poc((vm_offset_t)pc->pc_cmap2_addr, pa, PAGE_SIZE);
- pte2_clear(cmap2_pte2p);
- tlb_flush((vm_offset_t)pc->pc_cmap2_addr);
- sched_unpin();
- mtx_unlock(&pc->pc_cmap_lock);
- }
+ pa = VM_PAGE_TO_PHYS(m);
+ sched_pin();
+ pc = get_pcpu();
+ cmap2_pte2p = pc->pc_cmap2_pte2p;
+ mtx_lock(&pc->pc_cmap_lock);
+ if (pte2_load(cmap2_pte2p) != 0)
+ panic("%s: CMAP2 busy", __func__);
+ pte2_store(cmap2_pte2p, PTE2_KERN_NG(pa, PTE2_AP_KRW,
+ vm_memattr_to_pte2(ma)));
+ dcache_wbinv_poc((vm_offset_t)pc->pc_cmap2_addr, pa, PAGE_SIZE);
+ pte2_clear(cmap2_pte2p);
+ tlb_flush((vm_offset_t)pc->pc_cmap2_addr);
+ sched_unpin();
+ mtx_unlock(&pc->pc_cmap_lock);
}
/*
diff --git a/sys/arm/broadcom/bcm2835/bcm2835_gpio.c b/sys/arm/broadcom/bcm2835/bcm2835_gpio.c
index e4fc57b79ba5..48d1d2af5abc 100644
--- a/sys/arm/broadcom/bcm2835/bcm2835_gpio.c
+++ b/sys/arm/broadcom/bcm2835/bcm2835_gpio.c
@@ -837,12 +837,12 @@ bcm_gpio_attach(device_t dev)
}
sc->sc_gpio_npins = i;
bcm_gpio_sysctl_init(sc);
- sc->sc_busdev = gpiobus_attach_bus(dev);
- if (sc->sc_busdev == NULL)
- goto fail;
fdt_pinctrl_register(dev, "brcm,pins");
fdt_pinctrl_configure_tree(dev);
+ sc->sc_busdev = gpiobus_attach_bus(dev);
+ if (sc->sc_busdev == NULL)
+ goto fail;
return (0);
diff --git a/sys/arm/mv/mvebu_gpio.c b/sys/arm/mv/mvebu_gpio.c
index 681cf20f7f9f..7acdfff539dc 100644
--- a/sys/arm/mv/mvebu_gpio.c
+++ b/sys/arm/mv/mvebu_gpio.c
@@ -810,7 +810,6 @@ mvebu_gpio_attach(device_t dev)
return (ENXIO);
}
- bus_attach_children(dev);
return (0);
}
diff --git a/sys/arm/nvidia/as3722_gpio.c b/sys/arm/nvidia/as3722_gpio.c
index 073d057884c9..f7b3d4d43bab 100644
--- a/sys/arm/nvidia/as3722_gpio.c
+++ b/sys/arm/nvidia/as3722_gpio.c
@@ -544,7 +544,7 @@ as3722_gpio_attach(struct as3722_softc *sc, phandle_t node)
sc->gpio_pins = malloc(sizeof(struct as3722_gpio_pin *) *
sc->gpio_npins, M_AS3722_GPIO, M_WAITOK | M_ZERO);
- sc->gpio_busdev = gpiobus_attach_bus(sc->dev);
+ sc->gpio_busdev = gpiobus_add_bus(sc->dev);
if (sc->gpio_busdev == NULL)
return (ENXIO);
for (i = 0; i < sc->gpio_npins; i++) {
diff --git a/sys/arm/nvidia/tegra_gpio.c b/sys/arm/nvidia/tegra_gpio.c
index 16e1ef94d6a9..e37fd69a121e 100644
--- a/sys/arm/nvidia/tegra_gpio.c
+++ b/sys/arm/nvidia/tegra_gpio.c
@@ -824,7 +824,6 @@ tegra_gpio_attach(device_t dev)
return (ENXIO);
}
- bus_attach_children(dev);
return (0);
}
diff --git a/sys/arm64/apple/apple_pinctrl.c b/sys/arm64/apple/apple_pinctrl.c
index ec2dd5907024..ebaaccea1d99 100644
--- a/sys/arm64/apple/apple_pinctrl.c
+++ b/sys/arm64/apple/apple_pinctrl.c
@@ -161,22 +161,22 @@ apple_pinctrl_attach(device_t dev)
goto error;
}
+ fdt_pinctrl_register(dev, "pinmux");
+ fdt_pinctrl_configure_tree(dev);
+
+ if (OF_hasprop(node, "interrupt-controller")) {
+ sc->sc_irqs = mallocarray(sc->sc_ngpios,
+ sizeof(*sc->sc_irqs), M_DEVBUF, M_ZERO | M_WAITOK);
+ intr_pic_register(dev,
+ OF_xref_from_node(ofw_bus_get_node(dev)));
+ }
+
sc->sc_busdev = gpiobus_attach_bus(dev);
if (sc->sc_busdev == NULL) {
device_printf(dev, "failed to attach gpiobus\n");
goto error;
}
- fdt_pinctrl_register(dev, "pinmux");
- fdt_pinctrl_configure_tree(dev);
-
- if (!OF_hasprop(node, "interrupt-controller"))
- return (0);
-
- sc->sc_irqs = mallocarray(sc->sc_ngpios,
- sizeof(*sc->sc_irqs), M_DEVBUF, M_ZERO | M_WAITOK);
- intr_pic_register(dev, OF_xref_from_node(ofw_bus_get_node(dev)));
-
return (0);
error:
mtx_destroy(&sc->sc_mtx);
diff --git a/sys/arm64/arm64/pmap.c b/sys/arm64/arm64/pmap.c
index d2e56a270f54..459cc8ebe505 100644
--- a/sys/arm64/arm64/pmap.c
+++ b/sys/arm64/arm64/pmap.c
@@ -497,7 +497,8 @@ static bool pmap_pv_insert_l3c(pmap_t pmap, vm_offset_t va, vm_page_t m,
struct rwlock **lockp);
static void pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va);
static int pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva,
- pd_entry_t l1e, struct spglist *free, struct rwlock **lockp);
+ pd_entry_t l1e, bool demote_kl2e, struct spglist *free,
+ struct rwlock **lockp);
static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t sva,
pd_entry_t l2e, struct spglist *free, struct rwlock **lockp);
static bool pmap_remove_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va,
@@ -3847,8 +3848,7 @@ pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va)
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
ml3 = pmap_remove_pt_page(pmap, va);
- if (ml3 == NULL)
- panic("pmap_remove_kernel_l2: Missing pt page");
+ KASSERT(ml3 != NULL, ("pmap_remove_kernel_l2: missing pt page"));
ml3pa = VM_PAGE_TO_PHYS(ml3);
newl2 = PHYS_TO_PTE(ml3pa) | L2_TABLE;
@@ -3873,8 +3873,8 @@ pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va)
* pmap_remove_l2: Do the things to unmap a level 2 superpage.
*/
static int
-pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva,
- pd_entry_t l1e, struct spglist *free, struct rwlock **lockp)
+pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, pd_entry_t l1e,
+ bool demote_kl2e, struct spglist *free, struct rwlock **lockp)
{
struct md_page *pvh;
pt_entry_t old_l2;
@@ -3910,9 +3910,7 @@ pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva,
vm_page_aflag_clear(mt, PGA_WRITEABLE);
}
}
- if (pmap == kernel_pmap) {
- pmap_remove_kernel_l2(pmap, l2, sva);
- } else {
+ if (pmap != kernel_pmap) {
ml3 = pmap_remove_pt_page(pmap, sva);
if (ml3 != NULL) {
KASSERT(vm_page_any_valid(ml3),
@@ -3923,6 +3921,14 @@ pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva,
ml3->ref_count = 0;
pmap_add_delayed_free_list(ml3, free, false);
}
+ } else if (demote_kl2e) {
+ pmap_remove_kernel_l2(pmap, l2, sva);
+ } else {
+ ml3 = vm_radix_lookup(&pmap->pm_root, pmap_l2_pindex(sva));
+ if (vm_page_any_valid(ml3)) {
+ ml3->valid = 0;
+ pmap_zero_page(ml3);
+ }
}
return (pmap_unuse_pt(pmap, sva, l1e, free));
}
@@ -4232,7 +4238,7 @@ pmap_remove1(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, bool map_delete)
if ((l3_paddr & ATTR_DESCR_MASK) == L2_BLOCK) {
if (sva + L2_SIZE == va_next && eva >= va_next) {
pmap_remove_l2(pmap, l2, sva, pmap_load(l1),
- &free, &lock);
+ true, &free, &lock);
continue;
} else if (pmap_demote_l2_locked(pmap, l2, sva,
&lock) == NULL)
@@ -5703,6 +5709,9 @@ pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags,
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
KASSERT(ADDR_IS_CANONICAL(va),
("%s: Address not in canonical form: %lx", __func__, va));
+ KASSERT((flags & (PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM)) !=
+ PMAP_ENTER_NORECLAIM,
+ ("pmap_enter_l2: flags is missing PMAP_ENTER_NOREPLACE"));
if ((l2 = pmap_alloc_l2(pmap, va, &l2pg, (flags &
PMAP_ENTER_NOSLEEP) != 0 ? NULL : lockp)) == NULL) {
@@ -5747,33 +5756,51 @@ pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags,
}
}
SLIST_INIT(&free);
- if ((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK)
+ if ((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK) {
(void)pmap_remove_l2(pmap, l2, va,
- pmap_load(pmap_l1(pmap, va)), &free, lockp);
- else
+ pmap_load(pmap_l1(pmap, va)), false, &free, lockp);
+ } else {
+ if (ADDR_IS_KERNEL(va)) {
+ /*
+ * Try to save the ptp in the trie
+ * before any changes to mappings are
+ * made. Abort on failure.
+ */
+ mt = PTE_TO_VM_PAGE(old_l2);
+ if (pmap_insert_pt_page(pmap, mt, false,
+ false)) {
+ CTR1(KTR_PMAP,
+ "pmap_enter_l2: cannot ins kern ptp va %#lx",
+ va);
+ return (KERN_RESOURCE_SHORTAGE);
+ }
+ /*
+ * Both pmap_remove_l2() and
+ * pmap_remove_l3_range() will zero fill
+ * the L3 kernel page table page.
+ */
+ }
pmap_remove_l3_range(pmap, old_l2, va, va + L2_SIZE,
&free, lockp);
+ if (ADDR_IS_KERNEL(va)) {
+ /*
+ * The TLB could have an intermediate
+ * entry for the L3 kernel page table
+ * page, so request an invalidation at
+ * all levels after clearing the
+ * L2_TABLE entry.
+ */
+ pmap_clear(l2);
+ pmap_s1_invalidate_page(pmap, va, false);
+ }
+ }
+ KASSERT(pmap_load(l2) == 0,
+ ("pmap_enter_l2: non-zero L2 entry %p", l2));
if (!ADDR_IS_KERNEL(va)) {
vm_page_free_pages_toq(&free, true);
- KASSERT(pmap_load(l2) == 0,
- ("pmap_enter_l2: non-zero L2 entry %p", l2));
} else {
KASSERT(SLIST_EMPTY(&free),
("pmap_enter_l2: freed kernel page table page"));
-
- /*
- * Both pmap_remove_l2() and pmap_remove_l3_range()
- * will leave the kernel page table page zero filled.
- * Nonetheless, the TLB could have an intermediate
- * entry for the kernel page table page, so request
- * an invalidation at all levels after clearing
- * the L2_TABLE entry.
- */
- mt = PTE_TO_VM_PAGE(pmap_load(l2));
- if (pmap_insert_pt_page(pmap, mt, false, false))
- panic("pmap_enter_l2: trie insert failed");
- pmap_clear(l2);
- pmap_s1_invalidate_page(pmap, va, false);
}
}
@@ -5804,6 +5831,15 @@ pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags,
if (!pmap_pv_insert_l2(pmap, va, new_l2, flags, lockp)) {
if (l2pg != NULL)
pmap_abort_ptp(pmap, va, l2pg);
+ else {
+ KASSERT(ADDR_IS_KERNEL(va) &&
+ (pmap_load(l2) & ATTR_DESCR_MASK) ==
+ L2_TABLE,
+ ("pmap_enter_l2: invalid kernel L2E"));
+ mt = pmap_remove_pt_page(pmap, va);
+ KASSERT(mt != NULL,
+ ("pmap_enter_l2: missing kernel PTP"));
+ }
if (uwptpg != NULL) {
mt = pmap_remove_pt_page(pmap, va);
KASSERT(mt == uwptpg,
@@ -8045,6 +8081,8 @@ pmap_unmapbios(void *p, vm_size_t size)
void
pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
{
+ if (m->md.pv_memattr == ma)
+ return;
m->md.pv_memattr = ma;
@@ -8424,8 +8462,8 @@ pmap_demote_l2_abort(pmap_t pmap, vm_offset_t va, pt_entry_t *l2,
struct spglist free;
SLIST_INIT(&free);
- (void)pmap_remove_l2(pmap, l2, va, pmap_load(pmap_l1(pmap, va)), &free,
- lockp);
+ (void)pmap_remove_l2(pmap, l2, va, pmap_load(pmap_l1(pmap, va)), true,
+ &free, lockp);
vm_page_free_pages_toq(&free, true);
}
diff --git a/sys/arm64/broadcom/genet/if_genet.c b/sys/arm64/broadcom/genet/if_genet.c
index 0602f076b257..182b5582fb7c 100644
--- a/sys/arm64/broadcom/genet/if_genet.c
+++ b/sys/arm64/broadcom/genet/if_genet.c
@@ -349,7 +349,7 @@ gen_attach(device_t dev)
}
/* If address was not found, create one based on the hostid and name. */
- if (eaddr_found == 0)
+ if (!eaddr_found)
ether_gen_addr(sc->ifp, &eaddr);
/* Attach ethernet interface */
ether_ifattach(sc->ifp, eaddr.octet);
@@ -653,7 +653,7 @@ gen_bus_dma_teardown(struct gen_softc *sc)
error);
}
- if (sc->tx_buf_tag != NULL) {
+ if (sc->rx_buf_tag != NULL) {
for (i = 0; i < RX_DESC_COUNT; i++) {
error = bus_dmamap_destroy(sc->rx_buf_tag,
sc->rx_ring_ent[i].map);
diff --git a/sys/arm64/linux/linux_proto.h b/sys/arm64/linux/linux_proto.h
index ae3d8569df58..82f57f77ffae 100644
--- a/sys/arm64/linux/linux_proto.h
+++ b/sys/arm64/linux/linux_proto.h
@@ -141,10 +141,13 @@ struct linux_inotify_init1_args {
char flags_l_[PADL_(l_int)]; l_int flags; char flags_r_[PADR_(l_int)];
};
struct linux_inotify_add_watch_args {
- syscallarg_t dummy;
+ char fd_l_[PADL_(l_int)]; l_int fd; char fd_r_[PADR_(l_int)];
+ char pathname_l_[PADL_(const char *)]; const char * pathname; char pathname_r_[PADR_(const char *)];
+ char mask_l_[PADL_(uint32_t)]; uint32_t mask; char mask_r_[PADR_(uint32_t)];
};
struct linux_inotify_rm_watch_args {
- syscallarg_t dummy;
+ char fd_l_[PADL_(l_int)]; l_int fd; char fd_r_[PADR_(l_int)];
+ char wd_l_[PADL_(uint32_t)]; uint32_t wd; char wd_r_[PADR_(uint32_t)];
};
struct linux_ioctl_args {
char fd_l_[PADL_(l_uint)]; l_uint fd; char fd_r_[PADR_(l_uint)];
diff --git a/sys/arm64/linux/linux_sysent.c b/sys/arm64/linux/linux_sysent.c
index 722ada465730..e54a76cfd55e 100644
--- a/sys/arm64/linux/linux_sysent.c
+++ b/sys/arm64/linux/linux_sysent.c
@@ -41,8 +41,8 @@ struct sysent linux_sysent[] = {
{ .sy_narg = AS(linux_dup3_args), .sy_call = (sy_call_t *)linux_dup3, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 24 = linux_dup3 */
{ .sy_narg = AS(linux_fcntl_args), .sy_call = (sy_call_t *)linux_fcntl, .sy_auevent = AUE_FCNTL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 25 = linux_fcntl */
{ .sy_narg = AS(linux_inotify_init1_args), .sy_call = (sy_call_t *)linux_inotify_init1, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 26 = linux_inotify_init1 */
- { .sy_narg = 0, .sy_call = (sy_call_t *)linux_inotify_add_watch, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 27 = linux_inotify_add_watch */
- { .sy_narg = 0, .sy_call = (sy_call_t *)linux_inotify_rm_watch, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 28 = linux_inotify_rm_watch */
+ { .sy_narg = AS(linux_inotify_add_watch_args), .sy_call = (sy_call_t *)linux_inotify_add_watch, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 27 = linux_inotify_add_watch */
+ { .sy_narg = AS(linux_inotify_rm_watch_args), .sy_call = (sy_call_t *)linux_inotify_rm_watch, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 28 = linux_inotify_rm_watch */
{ .sy_narg = AS(linux_ioctl_args), .sy_call = (sy_call_t *)linux_ioctl, .sy_auevent = AUE_IOCTL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 29 = linux_ioctl */
{ .sy_narg = AS(linux_ioprio_set_args), .sy_call = (sy_call_t *)linux_ioprio_set, .sy_auevent = AUE_SETPRIORITY, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 30 = linux_ioprio_set */
{ .sy_narg = AS(linux_ioprio_get_args), .sy_call = (sy_call_t *)linux_ioprio_get, .sy_auevent = AUE_GETPRIORITY, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 31 = linux_ioprio_get */
diff --git a/sys/arm64/linux/linux_systrace_args.c b/sys/arm64/linux/linux_systrace_args.c
index 54e4dd82355d..1b946a9406a5 100644
--- a/sys/arm64/linux/linux_systrace_args.c
+++ b/sys/arm64/linux/linux_systrace_args.c
@@ -210,12 +210,19 @@ systrace_args(int sysnum, void *params, uint64_t *uarg, int *n_args)
}
/* linux_inotify_add_watch */
case 27: {
- *n_args = 0;
+ struct linux_inotify_add_watch_args *p = params;
+ iarg[a++] = p->fd; /* l_int */
+ uarg[a++] = (intptr_t)p->pathname; /* const char * */
+ uarg[a++] = p->mask; /* uint32_t */
+ *n_args = 3;
break;
}
/* linux_inotify_rm_watch */
case 28: {
- *n_args = 0;
+ struct linux_inotify_rm_watch_args *p = params;
+ iarg[a++] = p->fd; /* l_int */
+ uarg[a++] = p->wd; /* uint32_t */
+ *n_args = 2;
break;
}
/* linux_ioctl */
@@ -2780,9 +2787,32 @@ systrace_entry_setargdesc(int sysnum, int ndx, char *desc, size_t descsz)
break;
/* linux_inotify_add_watch */
case 27:
+ switch (ndx) {
+ case 0:
+ p = "l_int";
+ break;
+ case 1:
+ p = "userland const char *";
+ break;
+ case 2:
+ p = "uint32_t";
+ break;
+ default:
+ break;
+ };
break;
/* linux_inotify_rm_watch */
case 28:
+ switch (ndx) {
+ case 0:
+ p = "l_int";
+ break;
+ case 1:
+ p = "uint32_t";
+ break;
+ default:
+ break;
+ };
break;
/* linux_ioctl */
case 29:
@@ -6455,8 +6485,14 @@ systrace_return_setargdesc(int sysnum, int ndx, char *desc, size_t descsz)
break;
/* linux_inotify_add_watch */
case 27:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
/* linux_inotify_rm_watch */
case 28:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
/* linux_ioctl */
case 29:
if (ndx == 0 || ndx == 1)
diff --git a/sys/arm64/linux/syscalls.master b/sys/arm64/linux/syscalls.master
index 79c04c398e00..2babdcaf03bf 100644
--- a/sys/arm64/linux/syscalls.master
+++ b/sys/arm64/linux/syscalls.master
@@ -170,10 +170,17 @@
);
}
27 AUE_NULL STD {
- int linux_inotify_add_watch(void);
+ int linux_inotify_add_watch(
+ l_int fd,
+ const char *pathname,
+ uint32_t mask
+ );
}
28 AUE_NULL STD {
- int linux_inotify_rm_watch(void);
+ int linux_inotify_rm_watch(
+ l_int fd,
+ uint32_t wd
+ );
}
29 AUE_IOCTL STD {
int linux_ioctl(
diff --git a/sys/arm64/nvidia/tegra210/max77620_gpio.c b/sys/arm64/nvidia/tegra210/max77620_gpio.c
index 8dcf98099dac..5d91e23324c7 100644
--- a/sys/arm64/nvidia/tegra210/max77620_gpio.c
+++ b/sys/arm64/nvidia/tegra210/max77620_gpio.c
@@ -672,7 +672,7 @@ max77620_gpio_attach(struct max77620_softc *sc, phandle_t node)
sx_init(&sc->gpio_lock, "MAX77620 GPIO lock");
- sc->gpio_busdev = gpiobus_attach_bus(sc->dev);
+ sc->gpio_busdev = gpiobus_add_bus(sc->dev);
if (sc->gpio_busdev == NULL)
return (ENXIO);
diff --git a/sys/arm64/rockchip/rk_gpio.c b/sys/arm64/rockchip/rk_gpio.c
index a86392f16624..847bc7394dd0 100644
--- a/sys/arm64/rockchip/rk_gpio.c
+++ b/sys/arm64/rockchip/rk_gpio.c
@@ -362,12 +362,6 @@ rk_gpio_attach(device_t dev)
return (ENXIO);
}
- sc->sc_busdev = gpiobus_attach_bus(dev);
- if (sc->sc_busdev == NULL) {
- rk_gpio_detach(dev);
- return (ENXIO);
- }
-
/* Set the cached value to unknown */
for (i = 0; i < RK_GPIO_MAX_PINS; i++)
sc->pin_cached[i].is_gpio = 2;
@@ -377,6 +371,12 @@ rk_gpio_attach(device_t dev)
sc->swporta_ddr = rk_gpio_read_4(sc, RK_GPIO_SWPORTA_DDR);
RK_GPIO_UNLOCK(sc);
+ sc->sc_busdev = gpiobus_attach_bus(dev);
+ if (sc->sc_busdev == NULL) {
+ rk_gpio_detach(dev);
+ return (ENXIO);
+ }
+
return (0);
}
diff --git a/sys/bsm/audit_kevents.h b/sys/bsm/audit_kevents.h
index 0f110d5f9ddd..9381396f247c 100644
--- a/sys/bsm/audit_kevents.h
+++ b/sys/bsm/audit_kevents.h
@@ -663,6 +663,7 @@
#define AUE_FSPACECTL 43269 /* FreeBSD-specific. */
#define AUE_TIMERFD 43270 /* FreeBSD/Linux. */
#define AUE_SETCRED 43271 /* FreeBSD-specific. */
+#define AUE_INOTIFY 43272 /* FreeBSD/Linux. */
/*
* Darwin BSM uses a number of AUE_O_* definitions, which are aliased to the
diff --git a/sys/cam/ata/ata_da.c b/sys/cam/ata/ata_da.c
index ae7cf14c8f8e..1facab47473c 100644
--- a/sys/cam/ata/ata_da.c
+++ b/sys/cam/ata/ata_da.c
@@ -1359,10 +1359,7 @@ adaasync(void *callback_arg, uint32_t code,
case AC_GETDEV_CHANGED:
{
softc = (struct ada_softc *)periph->softc;
- memset(&cgd, 0, sizeof(cgd));
- xpt_setup_ccb(&cgd.ccb_h, periph->path, CAM_PRIORITY_NORMAL);
- cgd.ccb_h.func_code = XPT_GDEV_TYPE;
- xpt_action((union ccb *)&cgd);
+ xpt_gdev_type(&cgd, periph->path);
/*
* Update our information based on the new Identify data.
diff --git a/sys/cam/cam_periph.c b/sys/cam/cam_periph.c
index 833df6cfb99b..730656684e2a 100644
--- a/sys/cam/cam_periph.c
+++ b/sys/cam/cam_periph.c
@@ -767,27 +767,28 @@ camperiphfree(struct cam_periph *periph)
CAM_DEBUG(periph->path, CAM_DEBUG_INFO, ("Periph destroyed\n"));
if (periph->flags & CAM_PERIPH_NEW_DEV_FOUND) {
- union ccb ccb;
- void *arg;
-
- memset(&ccb, 0, sizeof(ccb));
switch (periph->deferred_ac) {
- case AC_FOUND_DEVICE:
- ccb.ccb_h.func_code = XPT_GDEV_TYPE;
- xpt_setup_ccb(&ccb.ccb_h, periph->path, CAM_PRIORITY_NORMAL);
- xpt_action(&ccb);
- arg = &ccb;
+ case AC_FOUND_DEVICE: {
+ struct ccb_getdev cgd;
+
+ xpt_gdev_type(&cgd, periph->path);
+ periph->deferred_callback(NULL, periph->deferred_ac,
+ periph->path, &cgd);
break;
- case AC_PATH_REGISTERED:
- xpt_path_inq(&ccb.cpi, periph->path);
- arg = &ccb;
+ }
+ case AC_PATH_REGISTERED: {
+ struct ccb_pathinq cpi;
+
+ xpt_path_inq(&cpi, periph->path);
+ periph->deferred_callback(NULL, periph->deferred_ac,
+ periph->path, &cpi);
break;
+ }
default:
- arg = NULL;
+ periph->deferred_callback(NULL, periph->deferred_ac,
+ periph->path, NULL);
break;
}
- periph->deferred_callback(NULL, periph->deferred_ac,
- periph->path, arg);
}
xpt_free_path(periph->path);
free(periph, M_CAMPERIPH);
@@ -1682,10 +1683,7 @@ camperiphscsisenseerror(union ccb *ccb, union ccb **orig,
/*
* Grab the inquiry data for this device.
*/
- memset(&cgd, 0, sizeof(cgd));
- xpt_setup_ccb(&cgd.ccb_h, ccb->ccb_h.path, CAM_PRIORITY_NORMAL);
- cgd.ccb_h.func_code = XPT_GDEV_TYPE;
- xpt_action((union ccb *)&cgd);
+ xpt_gdev_type(&cgd, ccb->ccb_h.path);
err_action = scsi_error_action(&ccb->csio, &cgd.inq_data,
sense_flags);
@@ -2133,11 +2131,7 @@ cam_periph_devctl_notify(union ccb *ccb)
sbuf_cat(&sb, "serial=\"");
if ((cgd = (struct ccb_getdev *)xpt_alloc_ccb_nowait()) != NULL) {
- xpt_setup_ccb(&cgd->ccb_h, ccb->ccb_h.path,
- CAM_PRIORITY_NORMAL);
- cgd->ccb_h.func_code = XPT_GDEV_TYPE;
- xpt_action((union ccb *)cgd);
-
+ xpt_gdev_type(cgd, ccb->ccb_h.path);
if (cgd->ccb_h.status == CAM_REQ_CMP)
sbuf_bcat(&sb, cgd->serial_num, cgd->serial_num_len);
xpt_free_ccb((union ccb *)cgd);
diff --git a/sys/cam/cam_xpt.c b/sys/cam/cam_xpt.c
index 38bc82c69aad..cae29226d13c 100644
--- a/sys/cam/cam_xpt.c
+++ b/sys/cam/cam_xpt.c
@@ -2471,15 +2471,12 @@ xptsetasyncfunc(struct cam_ed *device, void *arg)
if ((device->flags & CAM_DEV_UNCONFIGURED) != 0)
return (1);
- memset(&cgd, 0, sizeof(cgd));
xpt_compile_path(&path,
NULL,
device->target->bus->path_id,
device->target->target_id,
device->lun_id);
- xpt_setup_ccb(&cgd.ccb_h, &path, CAM_PRIORITY_NORMAL);
- cgd.ccb_h.func_code = XPT_GDEV_TYPE;
- xpt_action((union ccb *)&cgd);
+ xpt_gdev_type(&cgd, &path);
csa->callback(csa->callback_arg,
AC_FOUND_DEVICE,
&path, &cgd);
@@ -2518,6 +2515,15 @@ xpt_action(union ccb *start_ccb)
("xpt_action: func %#x %s\n", start_ccb->ccb_h.func_code,
xpt_action_name(start_ccb->ccb_h.func_code)));
+ /*
+ * Either it isn't queued, or it has a real priority. There are still
+ * too many places that reuse CCBs with a real priority for immediate
+ * queries to assert the other side of this.
+ */
+ KASSERT((start_ccb->ccb_h.func_code & XPT_FC_QUEUED) == 0 ||
+ start_ccb->ccb_h.pinfo.priority != CAM_PRIORITY_NONE,
+ ("%s: queued ccb and CAM_PRIORITY_NONE illegal.", __func__));
+
start_ccb->ccb_h.status = CAM_REQ_INPROG;
(*(start_ccb->ccb_h.path->bus->xport->ops->action))(start_ccb);
}
diff --git a/sys/cam/cam_xpt.h b/sys/cam/cam_xpt.h
index 06ef52580120..efa6c823245a 100644
--- a/sys/cam/cam_xpt.h
+++ b/sys/cam/cam_xpt.h
@@ -145,19 +145,31 @@ uint32_t xpt_poll_setup(union ccb *start_ccb);
void xpt_sim_poll(struct cam_sim *sim);
/*
- * Perform a path inquiry at the request priority. The bzero may be
- * unnecessary.
+ * Perform a path inquiry. bzero may be redundant for allocated CCBs, but for
+ * the on-stack CCBs it's required.
*/
static inline void
xpt_path_inq(struct ccb_pathinq *cpi, struct cam_path *path)
{
-
bzero(cpi, sizeof(*cpi));
- xpt_setup_ccb(&cpi->ccb_h, path, CAM_PRIORITY_NORMAL);
+ xpt_setup_ccb(&cpi->ccb_h, path, CAM_PRIORITY_NONE);
cpi->ccb_h.func_code = XPT_PATH_INQ;
xpt_action((union ccb *)cpi);
}
+/*
+ * Perform a get-device-type query. bzero may be redundant for allocated
+ * CCBs, but for the on-stack CCBs it's required.
+ */
+static inline void
+xpt_gdev_type(struct ccb_getdev *cgd, struct cam_path *path)
+{
+ bzero(cgd, sizeof(*cgd));
+ xpt_setup_ccb(&cgd->ccb_h, path, CAM_PRIORITY_NONE);
+ cgd->ccb_h.func_code = XPT_GDEV_TYPE;
+ xpt_action((union ccb *)cgd);
+}
+
#endif /* _KERNEL */
#endif /* _CAM_CAM_XPT_H */
diff --git a/sys/cam/mmc/mmc_da.c b/sys/cam/mmc/mmc_da.c
index 1c455e1951d7..322141a72707 100644
--- a/sys/cam/mmc/mmc_da.c
+++ b/sys/cam/mmc/mmc_da.c
@@ -692,10 +692,7 @@ sddaasync(void *callback_arg, uint32_t code,
case AC_GETDEV_CHANGED:
{
CAM_DEBUG(path, CAM_DEBUG_TRACE, ("=> AC_GETDEV_CHANGED\n"));
- memset(&cgd, 0, sizeof(cgd));
- xpt_setup_ccb(&cgd.ccb_h, periph->path, CAM_PRIORITY_NORMAL);
- cgd.ccb_h.func_code = XPT_GDEV_TYPE;
- xpt_action((union ccb *)&cgd);
+ xpt_gdev_type(&cgd, periph->path);
cam_periph_async(periph, code, path, arg);
break;
}
@@ -789,7 +786,8 @@ sddaregister(struct cam_periph *periph, void *arg)
static int
mmc_exec_app_cmd(struct cam_periph *periph, union ccb *ccb,
- struct mmc_command *cmd) {
+ struct mmc_command *cmd)
+{
int err;
/* Send APP_CMD first */
@@ -843,7 +841,8 @@ mmc_exec_app_cmd(struct cam_periph *periph, union ccb *ccb,
}
static int
-mmc_app_get_scr(struct cam_periph *periph, union ccb *ccb, uint32_t *rawscr) {
+mmc_app_get_scr(struct cam_periph *periph, union ccb *ccb, uint32_t *rawscr)
+{
int err;
struct mmc_command cmd;
struct mmc_data d;
@@ -869,7 +868,8 @@ mmc_app_get_scr(struct cam_periph *periph, union ccb *ccb, uint32_t *rawscr) {
static int
mmc_send_ext_csd(struct cam_periph *periph, union ccb *ccb,
- uint8_t *rawextcsd, size_t buf_len) {
+ uint8_t *rawextcsd, size_t buf_len)
+{
int err;
struct mmc_data d;
@@ -966,14 +966,16 @@ mmc_switch(struct cam_periph *periph, union ccb *ccb,
}
static uint32_t
-mmc_get_spec_vers(struct cam_periph *periph) {
+mmc_get_spec_vers(struct cam_periph *periph)
+{
struct sdda_softc *softc = (struct sdda_softc *)periph->softc;
return (softc->csd.spec_vers);
}
static uint64_t
-mmc_get_media_size(struct cam_periph *periph) {
+mmc_get_media_size(struct cam_periph *periph)
+{
struct sdda_softc *softc = (struct sdda_softc *)periph->softc;
return (softc->mediasize);
@@ -992,7 +994,8 @@ mmc_get_cmd6_timeout(struct cam_periph *periph)
static int
mmc_sd_switch(struct cam_periph *periph, union ccb *ccb,
uint8_t mode, uint8_t grp, uint8_t value,
- uint8_t *res) {
+ uint8_t *res)
+{
struct mmc_data mmc_d;
uint32_t arg;
int err;
@@ -1069,7 +1072,8 @@ mmc_set_timing(struct cam_periph *periph,
}
static void
-sdda_start_init_task(void *context, int pending) {
+sdda_start_init_task(void *context, int pending)
+{
union ccb *new_ccb;
struct cam_periph *periph;
@@ -1077,7 +1081,7 @@ sdda_start_init_task(void *context, int pending) {
CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("sdda_start_init_task\n"));
new_ccb = xpt_alloc_ccb();
xpt_setup_ccb(&new_ccb->ccb_h, periph->path,
- CAM_PRIORITY_NONE);
+ CAM_PRIORITY_NORMAL);
cam_periph_lock(periph);
cam_periph_hold(periph, PRIBIO|PCATCH);
@@ -1088,7 +1092,8 @@ sdda_start_init_task(void *context, int pending) {
}
static void
-sdda_set_bus_width(struct cam_periph *periph, union ccb *ccb, int width) {
+sdda_set_bus_width(struct cam_periph *periph, union ccb *ccb, int width)
+{
struct sdda_softc *softc = (struct sdda_softc *)periph->softc;
struct mmc_params *mmcp = &periph->path->device->mmc_ident_data;
int err;
@@ -1198,27 +1203,6 @@ sdda_get_host_caps(struct cam_periph *periph, union ccb *ccb)
return (cts->host_caps);
}
-static uint32_t
-sdda_get_max_data(struct cam_periph *periph, union ccb *ccb)
-{
- struct ccb_trans_settings_mmc *cts;
-
- cts = &ccb->cts.proto_specific.mmc;
- memset(cts, 0, sizeof(struct ccb_trans_settings_mmc));
-
- ccb->ccb_h.func_code = XPT_GET_TRAN_SETTINGS;
- ccb->ccb_h.flags = CAM_DIR_NONE;
- ccb->ccb_h.retry_count = 0;
- ccb->ccb_h.timeout = 100;
- ccb->ccb_h.cbfcnp = NULL;
- xpt_action(ccb);
-
- if (ccb->ccb_h.status != CAM_REQ_CMP)
- panic("Cannot get host max data");
- KASSERT(cts->host_max_data != 0, ("host_max_data == 0?!"));
- return (cts->host_max_data);
-}
-
static void
sdda_start_init(void *context, union ccb *start_ccb)
{
@@ -1544,10 +1528,7 @@ sdda_add_part(struct cam_periph *periph, u_int type, const char *name,
bioq_init(&part->bio_queue);
- bzero(&cpi, sizeof(cpi));
- xpt_setup_ccb(&cpi.ccb_h, periph->path, CAM_PRIORITY_NONE);
- cpi.ccb_h.func_code = XPT_PATH_INQ;
- xpt_action((union ccb *)&cpi);
+ xpt_path_inq(&cpi, periph->path);
/*
* Register this media as a disk
diff --git a/sys/cam/mmc/mmc_xpt.c b/sys/cam/mmc/mmc_xpt.c
index 4fce03004994..f5f66f5214a8 100644
--- a/sys/cam/mmc/mmc_xpt.c
+++ b/sys/cam/mmc/mmc_xpt.c
@@ -610,7 +610,6 @@ mmcprobe_start(struct cam_periph *periph, union ccb *start_ccb)
CAM_DEBUG(start_ccb->ccb_h.path, CAM_DEBUG_PROBE, ("Start with PROBE_RESET\n"));
/* FALLTHROUGH */
case PROBE_IDENTIFY:
- xpt_path_inq(&start_ccb->cpi, periph->path);
CAM_DEBUG(start_ccb->ccb_h.path, CAM_DEBUG_PROBE, ("Start with PROBE_IDENTIFY\n"));
init_standard_ccb(start_ccb, XPT_MMC_GET_TRAN_SETTINGS);
break;
diff --git a/sys/cam/scsi/scsi_all.c b/sys/cam/scsi/scsi_all.c
index 13a376ebb6e3..b518f84454ad 100644
--- a/sys/cam/scsi/scsi_all.c
+++ b/sys/cam/scsi/scsi_all.c
@@ -3708,11 +3708,7 @@ scsi_command_string(struct cam_device *device, struct ccb_scsiio *csio,
/*
* Get the device information.
*/
- xpt_setup_ccb(&cgd->ccb_h,
- csio->ccb_h.path,
- CAM_PRIORITY_NORMAL);
- cgd->ccb_h.func_code = XPT_GDEV_TYPE;
- xpt_action((union ccb *)cgd);
+ xpt_gdev_type(cgd, csio->ccb_h.path);
/*
* If the device is unconfigured, just pretend that it is a hard
@@ -5144,11 +5140,7 @@ scsi_sense_sbuf(struct cam_device *device, struct ccb_scsiio *csio,
/*
* Get the device information.
*/
- xpt_setup_ccb(&cgd->ccb_h,
- csio->ccb_h.path,
- CAM_PRIORITY_NORMAL);
- cgd->ccb_h.func_code = XPT_GDEV_TYPE;
- xpt_action((union ccb *)cgd);
+ xpt_gdev_type(cgd, csio->ccb_h.path);
/*
* If the device is unconfigured, just pretend that it is a hard
diff --git a/sys/cam/scsi/scsi_cd.c b/sys/cam/scsi/scsi_cd.c
index 00a417f65052..e622a96ec77e 100644
--- a/sys/cam/scsi/scsi_cd.c
+++ b/sys/cam/scsi/scsi_cd.c
@@ -1240,13 +1240,7 @@ cddone(struct cam_periph *periph, union ccb *done_ccb)
/*getcount_only*/0);
status = done_ccb->ccb_h.status;
-
- bzero(&cgd, sizeof(cgd));
- xpt_setup_ccb(&cgd.ccb_h,
- done_ccb->ccb_h.path,
- CAM_PRIORITY_NORMAL);
- cgd.ccb_h.func_code = XPT_GDEV_TYPE;
- xpt_action((union ccb *)&cgd);
+ xpt_gdev_type(&cgd, done_ccb->ccb_h.path);
if (scsi_extract_sense_ccb(done_ccb,
&error_code, &sense_key, &asc, &ascq))
diff --git a/sys/cam/scsi/scsi_ch.c b/sys/cam/scsi/scsi_ch.c
index 89a817c1b488..3da22ba61392 100644
--- a/sys/cam/scsi/scsi_ch.c
+++ b/sys/cam/scsi/scsi_ch.c
@@ -1705,11 +1705,7 @@ chscsiversion(struct cam_periph *periph)
/*
* Get the device information.
*/
- xpt_setup_ccb(&cgd->ccb_h,
- periph->path,
- CAM_PRIORITY_NORMAL);
- cgd->ccb_h.func_code = XPT_GDEV_TYPE;
- xpt_action((union ccb *)cgd);
+ xpt_gdev_type(cgd, periph->path);
if (cgd->ccb_h.status != CAM_REQ_CMP) {
xpt_free_ccb((union ccb *)cgd);
diff --git a/sys/cam/scsi/scsi_da.c b/sys/cam/scsi/scsi_da.c
index 0a2389cd9b5d..d02750aaacaf 100644
--- a/sys/cam/scsi/scsi_da.c
+++ b/sys/cam/scsi/scsi_da.c
@@ -5035,11 +5035,7 @@ dadone_proberc(struct cam_periph *periph, union ccb *done_ccb)
/*timeout*/0,
/*getcount_only*/0);
- memset(&cgd, 0, sizeof(cgd));
- xpt_setup_ccb(&cgd.ccb_h, done_ccb->ccb_h.path,
- CAM_PRIORITY_NORMAL);
- cgd.ccb_h.func_code = XPT_GDEV_TYPE;
- xpt_action((union ccb *)&cgd);
+ xpt_gdev_type(&cgd, done_ccb->ccb_h.path);
if (scsi_extract_sense_ccb(done_ccb,
&error_code, &sense_key, &asc, &ascq))
@@ -5077,6 +5073,18 @@ dadone_proberc(struct cam_periph *periph, union ccb *done_ccb)
* behind a SATL translation that's fallen into a
* terminally fatal state.
*
+ * 4/2 happens on some HGST drives that are quite
+ * ill. We've already sent the start unit command (for
+ * which we ignore a 44/0 asc/ascq, which I'm hesitant
+ * to change since it's so basic and there are other error
+ * conditions for START UNIT that we should ignore). So to
+ * require initialization at this point when it should
+ * be fine implies to me, at least, that we should
+ * invalidate. Since we do read capacity in geom tasting
+ * a lot, and since this timeout is long, this leads to
+ * up to a 10 minute delay in booting.
+ *
+ * 4/2: LOGICAL UNIT NOT READY, INITIALIZING COMMAND REQUIRED
* 25/0: LOGICAL UNIT NOT SUPPORTED
* 44/0: INTERNAL TARGET FAILURE
* 44/1: PERSISTENT RESERVATION INFORMATION LOST
@@ -5084,6 +5092,7 @@ dadone_proberc(struct cam_periph *periph, union ccb *done_ccb)
*/
if ((have_sense)
&& (asc != 0x25) && (asc != 0x44)
+ && !(asc == 0x04 && ascq == 0x02)
&& (error_code == SSD_CURRENT_ERROR
|| error_code == SSD_DESC_CURRENT_ERROR)) {
const char *sense_key_desc;
diff --git a/sys/cam/scsi/scsi_enc_ses.c b/sys/cam/scsi/scsi_enc_ses.c
index c429e820a1fd..435874a9874a 100644
--- a/sys/cam/scsi/scsi_enc_ses.c
+++ b/sys/cam/scsi/scsi_enc_ses.c
@@ -979,10 +979,7 @@ ses_paths_iter(enc_softc_t *enc, enc_element_t *elm,
!= CAM_REQ_CMP)
return;
- memset(&cgd, 0, sizeof(cgd));
- xpt_setup_ccb(&cgd.ccb_h, path, CAM_PRIORITY_NORMAL);
- cgd.ccb_h.func_code = XPT_GDEV_TYPE;
- xpt_action((union ccb *)&cgd);
+ xpt_gdev_type(&cgd, path);
if (cam_ccb_success((union ccb *)&cgd))
callback(enc, elm, path, callback_arg);
diff --git a/sys/cam/scsi/scsi_sa.c b/sys/cam/scsi/scsi_sa.c
index cfd48c98f30e..88147393192f 100644
--- a/sys/cam/scsi/scsi_sa.c
+++ b/sys/cam/scsi/scsi_sa.c
@@ -4731,12 +4731,7 @@ saextget(struct cdev *dev, struct cam_periph *periph, struct sbuf *sb,
SASBADDVARSTR(sb, indent, periph->periph_name, %s, periph_name,
strlen(periph->periph_name) + 1);
SASBADDUINT(sb, indent, periph->unit_number, %u, unit_number);
- memset(&cgd, 0, sizeof(cgd));
- xpt_setup_ccb(&cgd.ccb_h,
- periph->path,
- CAM_PRIORITY_NORMAL);
- cgd.ccb_h.func_code = XPT_GDEV_TYPE;
- xpt_action((union ccb *)&cgd);
+ xpt_gdev_type(&cgd, periph->path);
if ((cgd.ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) {
g->status = MT_EXT_GET_ERROR;
snprintf(g->error_str, sizeof(g->error_str),
diff --git a/sys/cam/scsi/scsi_xpt.c b/sys/cam/scsi/scsi_xpt.c
index 439dd2050a95..bef35243af98 100644
--- a/sys/cam/scsi/scsi_xpt.c
+++ b/sys/cam/scsi/scsi_xpt.c
@@ -1915,6 +1915,15 @@ typedef struct {
int lunindex[0];
} scsi_scan_bus_info;
+static void
+free_scan_info(scsi_scan_bus_info *scan_info)
+{
+ KASSERT(scan_info->cpi != NULL,
+ ("scan_info (%p) missing its ccb_pathinq CCB\n", scan_info));
+ xpt_free_ccb((union ccb *)scan_info->cpi);
+ free(scan_info, M_CAMXPT);
+}
+
/*
* To start a scan, request_ccb is an XPT_SCAN_BUS ccb.
* As the scan progresses, scsi_scan_bus is used as the
@@ -1945,10 +1954,7 @@ scsi_scan_bus(struct cam_periph *periph, union ccb *request_ccb)
xpt_done(request_ccb);
return;
}
- xpt_setup_ccb(&work_ccb->ccb_h, request_ccb->ccb_h.path,
- request_ccb->ccb_h.pinfo.priority);
- work_ccb->ccb_h.func_code = XPT_PATH_INQ;
- xpt_action(work_ccb);
+ xpt_path_inq(&work_ccb->cpi, request_ccb->ccb_h.path);
if (work_ccb->ccb_h.status != CAM_REQ_CMP) {
request_ccb->ccb_h.status = work_ccb->ccb_h.status;
xpt_free_ccb(work_ccb);
@@ -2037,16 +2043,14 @@ scsi_scan_bus(struct cam_periph *periph, union ccb *request_ccb)
printf(
"scsi_scan_bus: xpt_create_path failed with status %#x, bus scan halted\n",
status);
- free(scan_info, M_CAMXPT);
+ free_scan_info(scan_info);
request_ccb->ccb_h.status = status;
- xpt_free_ccb(work_ccb);
xpt_done(request_ccb);
break;
}
work_ccb = xpt_alloc_ccb_nowait();
if (work_ccb == NULL) {
- xpt_free_ccb((union ccb *)scan_info->cpi);
- free(scan_info, M_CAMXPT);
+ free_scan_info(scan_info);
xpt_free_path(path);
request_ccb->ccb_h.status = CAM_RESRC_UNAVAIL;
xpt_done(request_ccb);
@@ -2179,16 +2183,16 @@ scsi_scan_bus(struct cam_periph *periph, union ccb *request_ccb)
* Check to see if we scan any further luns.
*/
if (next_target) {
- int done;
+ bool done;
/*
* Free the current request path- we're done with it.
*/
xpt_free_path(oldpath);
hop_again:
- done = 0;
+ done = false;
if (scan_info->request_ccb->ccb_h.func_code == XPT_SCAN_TGT) {
- done = 1;
+ done = true;
} else if (scan_info->cpi->hba_misc & PIM_SEQSCAN) {
scan_info->counter++;
if (scan_info->counter ==
@@ -2197,23 +2201,22 @@ scsi_scan_bus(struct cam_periph *periph, union ccb *request_ccb)
}
if (scan_info->counter >=
scan_info->cpi->max_target+1) {
- done = 1;
+ done = true;
}
} else {
scan_info->counter--;
if (scan_info->counter == 0) {
- done = 1;
+ done = true;
}
}
if (done) {
mtx_unlock(mtx);
xpt_free_ccb(request_ccb);
- xpt_free_ccb((union ccb *)scan_info->cpi);
request_ccb = scan_info->request_ccb;
CAM_DEBUG(request_ccb->ccb_h.path,
CAM_DEBUG_TRACE,
("SCAN done for %p\n", scan_info));
- free(scan_info, M_CAMXPT);
+ free_scan_info(scan_info);
request_ccb->ccb_h.status = CAM_REQ_CMP;
xpt_done(request_ccb);
break;
@@ -2233,9 +2236,8 @@ scsi_scan_bus(struct cam_periph *periph, union ccb *request_ccb)
"scsi_scan_bus: xpt_create_path failed with status %#x, bus scan halted\n",
status);
xpt_free_ccb(request_ccb);
- xpt_free_ccb((union ccb *)scan_info->cpi);
request_ccb = scan_info->request_ccb;
- free(scan_info, M_CAMXPT);
+ free_scan_info(scan_info);
request_ccb->ccb_h.status = status;
xpt_done(request_ccb);
break;
@@ -2294,10 +2296,7 @@ scsi_scan_lun(struct cam_periph *periph, struct cam_path *path,
CAM_DEBUG(path, CAM_DEBUG_TRACE, ("scsi_scan_lun\n"));
- memset(&cpi, 0, sizeof(cpi));
- xpt_setup_ccb(&cpi.ccb_h, path, CAM_PRIORITY_NONE);
- cpi.ccb_h.func_code = XPT_PATH_INQ;
- xpt_action((union ccb *)&cpi);
+ xpt_path_inq(&cpi, path);
if (cpi.ccb_h.status != CAM_REQ_CMP) {
if (request_ccb != NULL) {
@@ -2421,10 +2420,7 @@ scsi_devise_transport(struct cam_path *path)
struct scsi_inquiry_data *inq_buf;
/* Get transport information from the SIM */
- memset(&cpi, 0, sizeof(cpi));
- xpt_setup_ccb(&cpi.ccb_h, path, CAM_PRIORITY_NONE);
- cpi.ccb_h.func_code = XPT_PATH_INQ;
- xpt_action((union ccb *)&cpi);
+ xpt_path_inq(&cpi, path);
inq_buf = NULL;
if ((path->device->flags & CAM_DEV_INQUIRY_DATA_VALID) != 0)
@@ -2732,10 +2728,7 @@ scsi_set_transfer_settings(struct ccb_trans_settings *cts, struct cam_path *path
inq_data = &device->inq_data;
scsi = &cts->proto_specific.scsi;
- memset(&cpi, 0, sizeof(cpi));
- xpt_setup_ccb(&cpi.ccb_h, path, CAM_PRIORITY_NONE);
- cpi.ccb_h.func_code = XPT_PATH_INQ;
- xpt_action((union ccb *)&cpi);
+ xpt_path_inq(&cpi, path);
/* SCSI specific sanity checking */
if ((cpi.hba_inquiry & PI_TAG_ABLE) == 0
@@ -3046,10 +3039,7 @@ _scsi_announce_periph(struct cam_periph *periph, u_int *speed, u_int *freq, stru
return;
/* Ask the SIM for its base transfer speed */
- memset(&cpi, 0, sizeof(cpi));
- xpt_setup_ccb(&cpi.ccb_h, path, CAM_PRIORITY_NORMAL);
- cpi.ccb_h.func_code = XPT_PATH_INQ;
- xpt_action((union ccb *)&cpi);
+ xpt_path_inq(&cpi, path);
/* Report connection speed */
*speed = cpi.base_transfer_speed;
diff --git a/sys/cddl/boot/zfs/zfsimpl.h b/sys/cddl/boot/zfs/zfsimpl.h
index 0ce38384abbf..83d964360343 100644
--- a/sys/cddl/boot/zfs/zfsimpl.h
+++ b/sys/cddl/boot/zfs/zfsimpl.h
@@ -2019,6 +2019,7 @@ typedef struct vdev {
vdev_list_t v_children; /* children of this vdev */
const char *v_name; /* vdev name */
uint64_t v_guid; /* vdev guid */
+ uint64_t v_txg; /* most recent transaction */
uint64_t v_id; /* index in parent */
uint64_t v_psize; /* physical device capacity */
int v_ashift; /* offset to block shift */
@@ -2048,7 +2049,6 @@ typedef struct spa {
STAILQ_ENTRY(spa) spa_link; /* link in global pool list */
char *spa_name; /* pool name */
uint64_t spa_guid; /* pool guid */
- uint64_t spa_txg; /* most recent transaction */
struct uberblock *spa_uberblock; /* best uberblock so far */
vdev_t *spa_root_vdev; /* toplevel vdev container */
objset_phys_t *spa_mos; /* MOS for this pool */
diff --git a/sys/compat/freebsd32/freebsd32_syscall.h b/sys/compat/freebsd32/freebsd32_syscall.h
index eaa086188b5f..8d2748098c00 100644
--- a/sys/compat/freebsd32/freebsd32_syscall.h
+++ b/sys/compat/freebsd32/freebsd32_syscall.h
@@ -511,4 +511,6 @@
#define FREEBSD32_SYS_fchroot 590
#define FREEBSD32_SYS_freebsd32_setcred 591
#define FREEBSD32_SYS_exterrctl 592
-#define FREEBSD32_SYS_MAXSYSCALL 593
+#define FREEBSD32_SYS_inotify_add_watch_at 593
+#define FREEBSD32_SYS_inotify_rm_watch 594
+#define FREEBSD32_SYS_MAXSYSCALL 595
diff --git a/sys/compat/freebsd32/freebsd32_syscalls.c b/sys/compat/freebsd32/freebsd32_syscalls.c
index 989f32a5c6f0..bda373268cc5 100644
--- a/sys/compat/freebsd32/freebsd32_syscalls.c
+++ b/sys/compat/freebsd32/freebsd32_syscalls.c
@@ -598,4 +598,6 @@ const char *freebsd32_syscallnames[] = {
"fchroot", /* 590 = fchroot */
"freebsd32_setcred", /* 591 = freebsd32_setcred */
"exterrctl", /* 592 = exterrctl */
+ "inotify_add_watch_at", /* 593 = inotify_add_watch_at */
+ "inotify_rm_watch", /* 594 = inotify_rm_watch */
};
diff --git a/sys/compat/freebsd32/freebsd32_sysent.c b/sys/compat/freebsd32/freebsd32_sysent.c
index 476fe2ac3f80..ef0aff8bf852 100644
--- a/sys/compat/freebsd32/freebsd32_sysent.c
+++ b/sys/compat/freebsd32/freebsd32_sysent.c
@@ -659,5 +659,7 @@ struct sysent freebsd32_sysent[] = {
{ .sy_narg = AS(getrlimitusage_args), .sy_call = (sy_call_t *)sys_getrlimitusage, .sy_auevent = AUE_NULL, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 589 = getrlimitusage */
{ .sy_narg = AS(fchroot_args), .sy_call = (sy_call_t *)sys_fchroot, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 590 = fchroot */
{ .sy_narg = AS(freebsd32_setcred_args), .sy_call = (sy_call_t *)freebsd32_setcred, .sy_auevent = AUE_SETCRED, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 591 = freebsd32_setcred */
- { .sy_narg = AS(exterrctl_args), .sy_call = (sy_call_t *)sys_exterrctl, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 592 = exterrctl */
+ { .sy_narg = AS(exterrctl_args), .sy_call = (sy_call_t *)sys_exterrctl, .sy_auevent = AUE_NULL, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 592 = exterrctl */
+ { .sy_narg = AS(inotify_add_watch_at_args), .sy_call = (sy_call_t *)sys_inotify_add_watch_at, .sy_auevent = AUE_INOTIFY, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 593 = inotify_add_watch_at */
+ { .sy_narg = AS(inotify_rm_watch_args), .sy_call = (sy_call_t *)sys_inotify_rm_watch, .sy_auevent = AUE_INOTIFY, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 594 = inotify_rm_watch */
};
diff --git a/sys/compat/freebsd32/freebsd32_systrace_args.c b/sys/compat/freebsd32/freebsd32_systrace_args.c
index cf08938cd5de..37564a737a62 100644
--- a/sys/compat/freebsd32/freebsd32_systrace_args.c
+++ b/sys/compat/freebsd32/freebsd32_systrace_args.c
@@ -3395,6 +3395,24 @@ systrace_args(int sysnum, void *params, uint64_t *uarg, int *n_args)
*n_args = 3;
break;
}
+ /* inotify_add_watch_at */
+ case 593: {
+ struct inotify_add_watch_at_args *p = params;
+ iarg[a++] = p->fd; /* int */
+ iarg[a++] = p->dfd; /* int */
+ uarg[a++] = (intptr_t)p->path; /* const char * */
+ uarg[a++] = p->mask; /* uint32_t */
+ *n_args = 4;
+ break;
+ }
+ /* inotify_rm_watch */
+ case 594: {
+ struct inotify_rm_watch_args *p = params;
+ iarg[a++] = p->fd; /* int */
+ iarg[a++] = p->wd; /* int */
+ *n_args = 2;
+ break;
+ }
default:
*n_args = 0;
break;
@@ -9172,6 +9190,38 @@ systrace_entry_setargdesc(int sysnum, int ndx, char *desc, size_t descsz)
break;
};
break;
+ /* inotify_add_watch_at */
+ case 593:
+ switch (ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "userland const char *";
+ break;
+ case 3:
+ p = "uint32_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* inotify_rm_watch */
+ case 594:
+ switch (ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
default:
break;
};
@@ -11070,6 +11120,16 @@ systrace_return_setargdesc(int sysnum, int ndx, char *desc, size_t descsz)
if (ndx == 0 || ndx == 1)
p = "int";
break;
+ /* inotify_add_watch_at */
+ case 593:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* inotify_rm_watch */
+ case 594:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
default:
break;
};
diff --git a/sys/compat/linux/linux_dummy.c b/sys/compat/linux/linux_dummy.c
index 35d6debe0da9..19cd55849f65 100644
--- a/sys/compat/linux/linux_dummy.c
+++ b/sys/compat/linux/linux_dummy.c
@@ -74,9 +74,6 @@ DUMMY(kexec_load);
DUMMY(add_key);
DUMMY(request_key);
DUMMY(keyctl);
-/* Linux 2.6.13: */
-DUMMY(inotify_add_watch);
-DUMMY(inotify_rm_watch);
/* Linux 2.6.16: */
DUMMY(migrate_pages);
DUMMY(unshare);
@@ -87,7 +84,6 @@ DUMMY(vmsplice);
DUMMY(move_pages);
/* Linux 2.6.27: */
DUMMY(signalfd4);
-DUMMY(inotify_init1);
/* Linux 2.6.31: */
DUMMY(perf_event_open);
/* Linux 2.6.36: */
diff --git a/sys/compat/linux/linux_file.c b/sys/compat/linux/linux_file.c
index 246bc26d85d4..86834a7ecea8 100644
--- a/sys/compat/linux/linux_file.c
+++ b/sys/compat/linux/linux_file.c
@@ -32,11 +32,13 @@
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
+#include <sys/inotify.h>
#include <sys/lock.h>
#include <sys/mman.h>
#include <sys/selinfo.h>
#include <sys/pipe.h>
#include <sys/proc.h>
+#include <sys/specialfd.h>
#include <sys/stat.h>
#include <sys/sx.h>
#include <sys/syscallsubr.h>
@@ -1877,3 +1879,122 @@ linux_writev(struct thread *td, struct linux_writev_args *args)
freeuio(auio);
return (linux_enobufs2eagain(td, args->fd, error));
}
+
+static int
+linux_inotify_init_flags(int l_flags)
+{
+ int bsd_flags;
+
+ if ((l_flags & ~(LINUX_IN_CLOEXEC | LINUX_IN_NONBLOCK)) != 0)
+ linux_msg(NULL, "inotify_init1 unsupported flags 0x%x",
+ l_flags);
+
+ bsd_flags = 0;
+ if ((l_flags & LINUX_IN_CLOEXEC) != 0)
+ bsd_flags |= O_CLOEXEC;
+ if ((l_flags & LINUX_IN_NONBLOCK) != 0)
+ bsd_flags |= O_NONBLOCK;
+ return (bsd_flags);
+}
+
+static int
+inotify_init_common(struct thread *td, int flags)
+{
+ struct specialfd_inotify si;
+
+ si.flags = linux_inotify_init_flags(flags);
+ return (kern_specialfd(td, SPECIALFD_INOTIFY, &si));
+}
+
+#if defined(__i386__) || defined(__amd64__)
+int
+linux_inotify_init(struct thread *td, struct linux_inotify_init_args *args)
+{
+ return (inotify_init_common(td, 0));
+}
+#endif
+
+int
+linux_inotify_init1(struct thread *td, struct linux_inotify_init1_args *args)
+{
+ return (inotify_init_common(td, args->flags));
+}
+
+/*
+ * The native implementation uses the same values for inotify events as
+ * libinotify, which gives us binary compatibility with Linux. This simplifies
+ * the shim implementation a lot, as otherwise we would have to handle read(2)
+ * calls on inotify descriptors and translate events to Linux's ABI.
+ */
+_Static_assert(LINUX_IN_ACCESS == IN_ACCESS,
+ "IN_ACCESS mismatch");
+_Static_assert(LINUX_IN_MODIFY == IN_MODIFY,
+ "IN_MODIFY mismatch");
+_Static_assert(LINUX_IN_ATTRIB == IN_ATTRIB,
+ "IN_ATTRIB mismatch");
+_Static_assert(LINUX_IN_CLOSE_WRITE == IN_CLOSE_WRITE,
+ "IN_CLOSE_WRITE mismatch");
+_Static_assert(LINUX_IN_CLOSE_NOWRITE == IN_CLOSE_NOWRITE,
+ "IN_CLOSE_NOWRITE mismatch");
+_Static_assert(LINUX_IN_OPEN == IN_OPEN,
+ "IN_OPEN mismatch");
+_Static_assert(LINUX_IN_MOVED_FROM == IN_MOVED_FROM,
+ "IN_MOVED_FROM mismatch");
+_Static_assert(LINUX_IN_MOVED_TO == IN_MOVED_TO,
+ "IN_MOVED_TO mismatch");
+_Static_assert(LINUX_IN_CREATE == IN_CREATE,
+ "IN_CREATE mismatch");
+_Static_assert(LINUX_IN_DELETE == IN_DELETE,
+ "IN_DELETE mismatch");
+_Static_assert(LINUX_IN_DELETE_SELF == IN_DELETE_SELF,
+ "IN_DELETE_SELF mismatch");
+_Static_assert(LINUX_IN_MOVE_SELF == IN_MOVE_SELF,
+ "IN_MOVE_SELF mismatch");
+
+_Static_assert(LINUX_IN_UNMOUNT == IN_UNMOUNT,
+ "IN_UNMOUNT mismatch");
+_Static_assert(LINUX_IN_Q_OVERFLOW == IN_Q_OVERFLOW,
+ "IN_Q_OVERFLOW mismatch");
+_Static_assert(LINUX_IN_IGNORED == IN_IGNORED,
+ "IN_IGNORED mismatch");
+
+_Static_assert(LINUX_IN_ISDIR == IN_ISDIR,
+ "IN_ISDIR mismatch");
+_Static_assert(LINUX_IN_ONLYDIR == IN_ONLYDIR,
+ "IN_ONLYDIR mismatch");
+_Static_assert(LINUX_IN_DONT_FOLLOW == IN_DONT_FOLLOW,
+ "IN_DONT_FOLLOW mismatch");
+_Static_assert(LINUX_IN_MASK_CREATE == IN_MASK_CREATE,
+ "IN_MASK_CREATE mismatch");
+_Static_assert(LINUX_IN_MASK_ADD == IN_MASK_ADD,
+ "IN_MASK_ADD mismatch");
+_Static_assert(LINUX_IN_ONESHOT == IN_ONESHOT,
+ "IN_ONESHOT mismatch");
+_Static_assert(LINUX_IN_EXCL_UNLINK == IN_EXCL_UNLINK,
+ "IN_EXCL_UNLINK mismatch");
+
+static int
+linux_inotify_watch_flags(int l_flags)
+{
+ if ((l_flags & ~(LINUX_IN_ALL_EVENTS | LINUX_IN_ALL_FLAGS)) != 0) {
+ linux_msg(NULL, "inotify_add_watch unsupported flags 0x%x",
+ l_flags);
+ }
+
+ return (l_flags);
+}
+
+int
+linux_inotify_add_watch(struct thread *td,
+ struct linux_inotify_add_watch_args *args)
+{
+ return (kern_inotify_add_watch(args->fd, AT_FDCWD, args->pathname,
+ linux_inotify_watch_flags(args->mask), td));
+}
+
+int
+linux_inotify_rm_watch(struct thread *td,
+ struct linux_inotify_rm_watch_args *args)
+{
+ return (kern_inotify_rm_watch(args->fd, args->wd, td));
+}
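
Since the native event values match Linux's (per the _Static_asserts above),
an unmodified Linux binary can run the usual inotify sequence against these
shims. A minimal Linux-side sketch, assuming a standard Linux userland; the
watched path and event masks are illustrative and error checks are omitted:

	#include <sys/inotify.h>
	#include <stdio.h>
	#include <unistd.h>

	int
	main(void)
	{
		char buf[4096];
		ssize_t n;
		int fd, wd;

		/* Routed through inotify_init_common() -> kern_specialfd(). */
		fd = inotify_init1(IN_CLOEXEC);
		/* Routed through kern_inotify_add_watch() with AT_FDCWD. */
		wd = inotify_add_watch(fd, "/tmp", IN_CREATE | IN_DELETE);

		/*
		 * Events arrive in the native format, which is binary-
		 * compatible with Linux's struct inotify_event; for brevity
		 * this prints only the first event of each batch.
		 */
		while ((n = read(fd, buf, sizeof(buf))) > 0) {
			struct inotify_event *ev =
			    (struct inotify_event *)buf;
			printf("wd=%d mask=%#x name=%s\n", ev->wd, ev->mask,
			    ev->len > 0 ? ev->name : "");
		}

		inotify_rm_watch(fd, wd);	/* kern_inotify_rm_watch() */
		close(fd);
		return (0);
	}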
diff --git a/sys/compat/linux/linux_file.h b/sys/compat/linux/linux_file.h
index 2e56942b0f40..7448dc597230 100644
--- a/sys/compat/linux/linux_file.h
+++ b/sys/compat/linux/linux_file.h
@@ -189,6 +189,38 @@
#define LINUX_HUGETLB_FLAG_ENCODE_2GB (31 << LINUX_HUGETLB_FLAG_ENCODE_SHIFT)
#define LINUX_HUGETLB_FLAG_ENCODE_16GB (34U << LINUX_HUGETLB_FLAG_ENCODE_SHIFT)
+/* inotify flags */
+#define LINUX_IN_ACCESS 0x00000001
+#define LINUX_IN_MODIFY 0x00000002
+#define LINUX_IN_ATTRIB 0x00000004
+#define LINUX_IN_CLOSE_WRITE 0x00000008
+#define LINUX_IN_CLOSE_NOWRITE 0x00000010
+#define LINUX_IN_OPEN 0x00000020
+#define LINUX_IN_MOVED_FROM 0x00000040
+#define LINUX_IN_MOVED_TO 0x00000080
+#define LINUX_IN_CREATE 0x00000100
+#define LINUX_IN_DELETE 0x00000200
+#define LINUX_IN_DELETE_SELF 0x00000400
+#define LINUX_IN_MOVE_SELF 0x00000800
+
+#define LINUX_IN_UNMOUNT 0x00002000
+#define LINUX_IN_Q_OVERFLOW 0x00004000
+#define LINUX_IN_IGNORED 0x00008000
+
+#define LINUX_IN_ONLYDIR 0x01000000
+#define LINUX_IN_DONT_FOLLOW 0x02000000
+#define LINUX_IN_EXCL_UNLINK 0x04000000
+#define LINUX_IN_MASK_CREATE 0x10000000
+#define LINUX_IN_MASK_ADD 0x20000000
+#define LINUX_IN_ISDIR 0x40000000
+#define LINUX_IN_ONESHOT 0x80000000
+
+#define LINUX_IN_ALL_EVENTS 0x00000fff
+#define LINUX_IN_ALL_FLAGS 0xf700e000
+
+#define LINUX_IN_NONBLOCK 0x00000800
+#define LINUX_IN_CLOEXEC 0x00080000
+
#if defined(_KERNEL)
struct l_file_handle {
l_uint handle_bytes;
diff --git a/sys/compat/linuxkpi/common/include/acpi/acpi.h b/sys/compat/linuxkpi/common/include/acpi/acpi.h
index e0218bdde12e..1e398d05ba20 100644
--- a/sys/compat/linuxkpi/common/include/acpi/acpi.h
+++ b/sys/compat/linuxkpi/common/include/acpi/acpi.h
@@ -3,6 +3,10 @@
*
* Copyright (c) 2017 Mark Johnston <markj@FreeBSD.org>
* Copyright (c) 2020 Vladimir Kondratyev <wulf@FreeBSD.org>
+ * Copyright (c) 2025 The FreeBSD Foundation
+ *
+ * Portions of this software were developed by Björn Zeeb
+ * under sponsorship from the FreeBSD Foundation.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
@@ -31,6 +35,13 @@
#define _LINUXKPI_ACPI_ACPI_H_
/*
+ * LINUXKPI_WANT_LINUX_ACPI is a temporary workaround to allow drm-kmod
+ * to update all needed branches without breaking builds.
+ * Once that happened and checks are implemented based on __FreeBSD_verison
+ * we will remove these conditions again.
+ */
+
+/*
* FreeBSD import of ACPICA has a typedef for BOOLEAN which conflicts with
* amdgpu driver. Workaround it on preprocessor level.
*/
@@ -46,8 +57,8 @@ typedef int64_t INT64;
#include <contrib/dev/acpica/include/acpi.h>
#undef BOOLEAN
+typedef ACPI_IO_ADDRESS acpi_io_address;
typedef ACPI_HANDLE acpi_handle;
-typedef ACPI_OBJECT acpi_object;
typedef ACPI_OBJECT_HANDLER acpi_object_handler;
typedef ACPI_OBJECT_TYPE acpi_object_type;
typedef ACPI_STATUS acpi_status;
@@ -55,12 +66,62 @@ typedef ACPI_STRING acpi_string;
typedef ACPI_SIZE acpi_size;
typedef ACPI_WALK_CALLBACK acpi_walk_callback;
+union linuxkpi_acpi_object {
+ acpi_object_type type;
+ struct {
+ acpi_object_type type;
+ UINT64 value;
+ } integer;
+ struct {
+ acpi_object_type type;
+ UINT32 length;
+ char *pointer;
+ } string;
+ struct {
+ acpi_object_type type;
+ UINT32 length;
+ UINT8 *pointer;
+ } buffer;
+ struct {
+ acpi_object_type type;
+ UINT32 count;
+ union linuxkpi_acpi_object *elements;
+ } package;
+ struct {
+ acpi_object_type type;
+ acpi_object_type actual_type;
+ acpi_handle handle;
+ } reference;
+ struct {
+ acpi_object_type type;
+ UINT32 proc_id;
+ acpi_io_address pblk_address;
+ UINT32 pblk_length;
+ } processor;
+ struct {
+ acpi_object_type type;
+ UINT32 system_level;
+ UINT32 resource_order;
+ } power_resource;
+};
+
+#ifdef LINUXKPI_WANT_LINUX_ACPI
+struct linuxkpi_acpi_buffer {
+ acpi_size length; /* Length in bytes of the buffer */
+ void *pointer; /* pointer to buffer */
+};
+
+typedef struct linuxkpi_acpi_buffer lkpi_acpi_buffer_t;
+#else
+typedef ACPI_BUFFER lkpi_acpi_buffer_t;
+#endif
+
static inline ACPI_STATUS
acpi_evaluate_object(ACPI_HANDLE Object, ACPI_STRING Pathname,
- ACPI_OBJECT_LIST *ParameterObjects, ACPI_BUFFER *ReturnObjectBuffer)
+ ACPI_OBJECT_LIST *ParameterObjects, lkpi_acpi_buffer_t *ReturnObjectBuffer)
{
return (AcpiEvaluateObject(
- Object, Pathname, ParameterObjects, ReturnObjectBuffer));
+ Object, Pathname, ParameterObjects, (ACPI_BUFFER *)ReturnObjectBuffer));
}
static inline const char *
@@ -83,9 +144,9 @@ acpi_get_data(ACPI_HANDLE ObjHandle, ACPI_OBJECT_HANDLER Handler, void **Data)
}
static inline ACPI_STATUS
-acpi_get_name(ACPI_HANDLE Object, UINT32 NameType, ACPI_BUFFER *RetPathPtr)
+acpi_get_name(ACPI_HANDLE Object, UINT32 NameType, lkpi_acpi_buffer_t *RetPathPtr)
{
- return (AcpiGetName(Object, NameType, RetPathPtr));
+ return (AcpiGetName(Object, NameType, (ACPI_BUFFER *)RetPathPtr));
}
static inline ACPI_STATUS
@@ -101,4 +162,9 @@ acpi_put_table(ACPI_TABLE_HEADER *Table)
AcpiPutTable(Table);
}
+#ifdef LINUXKPI_WANT_LINUX_ACPI
+#define acpi_object linuxkpi_acpi_object
+#define acpi_buffer linuxkpi_acpi_buffer
+#endif
+
#endif /* _LINUXKPI_ACPI_ACPI_H_ */
diff --git a/sys/compat/linuxkpi/common/include/acpi/acpi_bus.h b/sys/compat/linuxkpi/common/include/acpi/acpi_bus.h
index 65bcbe7f1bdd..47195e7d66a6 100644
--- a/sys/compat/linuxkpi/common/include/acpi/acpi_bus.h
+++ b/sys/compat/linuxkpi/common/include/acpi/acpi_bus.h
@@ -58,4 +58,10 @@ bool lkpi_acpi_dev_present(const char *hid, const char *uid,
struct acpi_device *lkpi_acpi_dev_get_first_match_dev(const char *hid,
const char *uid, int64_t hrv);
+union linuxkpi_acpi_object;
+
+union linuxkpi_acpi_object *
+acpi_evaluate_dsm(ACPI_HANDLE ObjHandle, const guid_t *guid,
+ UINT64 rev, UINT64 func, union linuxkpi_acpi_object *arg);
+
#endif /* _LINUXKPI_ACPI_ACPI_BUS_H_ */
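
A hedged sketch of a LinuxKPI consumer calling the new acpi_evaluate_dsm()
shim; the GUID, revision, and function index are hypothetical (compare the
real use in rtw89/acpi.c further down). The returned buffer is allocated by
ACPICA and must be freed by the caller:

	/* Hypothetical GUID; only the call pattern matters here. */
	static const guid_t my_guid = GUID_INIT(0x11223344, 0x5566, 0x7788,
	    0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff, 0x00);

	static int
	my_eval_dsm(ACPI_HANDLE handle)
	{
		union linuxkpi_acpi_object *obj;

		obj = acpi_evaluate_dsm(handle, &my_guid, 1 /* rev */,
		    0 /* func */, NULL);
		if (obj == NULL)
			return (-ENOENT);
		if (obj->type == ACPI_TYPE_INTEGER)
			printf("_DSM returned %ju\n",
			    (uintmax_t)obj->integer.value);
		ACPI_FREE(obj);
		return (0);
	}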
diff --git a/sys/compat/linuxkpi/common/include/linux/device.h b/sys/compat/linuxkpi/common/include/linux/device.h
index a5f6874a07f6..2556b0c45e49 100644
--- a/sys/compat/linuxkpi/common/include/linux/device.h
+++ b/sys/compat/linuxkpi/common/include/linux/device.h
@@ -90,6 +90,8 @@ struct dev_pm_ops {
struct device_driver {
const char *name;
const struct dev_pm_ops *pm;
+
+ void (*shutdown) (struct device *);
};
struct device_type {
diff --git a/sys/compat/linuxkpi/common/include/linux/pci.h b/sys/compat/linuxkpi/common/include/linux/pci.h
index 174015ba7a58..af19829f1cbb 100644
--- a/sys/compat/linuxkpi/common/include/linux/pci.h
+++ b/sys/compat/linuxkpi/common/include/linux/pci.h
@@ -72,6 +72,10 @@ struct pci_device_id {
uintptr_t driver_data;
};
+#define MODULE_DEVICE_TABLE_BUS_pci(_bus, _table) \
+MODULE_PNP_INFO("U32:vendor;U32:device;V32:subvendor;V32:subdevice", \
+ _bus, lkpi_ ## _table, _table, nitems(_table) - 1)
+
/* Linux has an empty element at the end of the ID table -> nitems() - 1. */
#define MODULE_DEVICE_TABLE(_bus, _table) \
\
@@ -85,11 +89,10 @@ static driver_t _ ## _bus ## _ ## _table ## _driver = { \
0 \
}; \
\
-DRIVER_MODULE(lkpi_ ## _table, pci, _ ## _bus ## _ ## _table ## _driver,\
+DRIVER_MODULE(lkpi_ ## _table, _bus, _ ## _bus ## _ ## _table ## _driver,\
0, 0); \
\
-MODULE_PNP_INFO("U32:vendor;U32:device;V32:subvendor;V32:subdevice", \
- _bus, lkpi_ ## _table, _table, nitems(_table) - 1)
+MODULE_DEVICE_TABLE_BUS_ ## _bus(_bus, _table)
#define PCI_ANY_ID -1U
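
With the change above, MODULE_DEVICE_TABLE() dispatches its PNP info through
a per-bus macro via token pasting, so a bus without PNP support can define
the macro empty (as the auxiliary bus does in bnxt_auxbus_compat.h further
down). For a hypothetical table named my_ids:

	MODULE_DEVICE_TABLE(pci, my_ids)
	/* ...now ends in MODULE_DEVICE_TABLE_BUS_pci(pci, my_ids), i.e.: */
	MODULE_PNP_INFO("U32:vendor;U32:device;V32:subvendor;V32:subdevice",
	    pci, lkpi_my_ids, my_ids, nitems(my_ids) - 1)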
diff --git a/sys/compat/linuxkpi/common/include/linux/slab.h b/sys/compat/linuxkpi/common/include/linux/slab.h
index f3a840d9bf4b..efa5c8cb67b3 100644
--- a/sys/compat/linuxkpi/common/include/linux/slab.h
+++ b/sys/compat/linuxkpi/common/include/linux/slab.h
@@ -45,7 +45,7 @@
MALLOC_DECLARE(M_KMALLOC);
-#define kvzalloc(size, flags) kmalloc(size, (flags) | __GFP_ZERO)
+#define kvzalloc(size, flags) kvmalloc(size, (flags) | __GFP_ZERO)
#define kvcalloc(n, size, flags) kvmalloc_array(n, size, (flags) | __GFP_ZERO)
#define kzalloc(size, flags) kmalloc(size, (flags) | __GFP_ZERO)
#define kzalloc_node(size, flags, node) kmalloc_node(size, (flags) | __GFP_ZERO, node)
diff --git a/sys/compat/linuxkpi/common/src/linux_acpi.c b/sys/compat/linuxkpi/common/src/linux_acpi.c
index 6a9afb3ddff0..d18c69d9210d 100644
--- a/sys/compat/linuxkpi/common/src/linux_acpi.c
+++ b/sys/compat/linuxkpi/common/src/linux_acpi.c
@@ -39,6 +39,7 @@
#include <linux/notifier.h>
#include <linux/suspend.h>
+#include <linux/uuid.h>
#include <acpi/acpi_bus.h>
#include <acpi/video.h>
@@ -99,6 +100,17 @@ acpi_evaluate_dsm_typed(ACPI_HANDLE handle, const char *uuid, int rev,
argv4, &buf, type)) ? (ACPI_OBJECT *)buf.Pointer : NULL);
}
+union linuxkpi_acpi_object *
+acpi_evaluate_dsm(ACPI_HANDLE ObjHandle, const guid_t *guid,
+ UINT64 rev, UINT64 func, union linuxkpi_acpi_object *pkg)
+{
+ ACPI_BUFFER buf;
+
+ return (ACPI_SUCCESS(acpi_EvaluateDSM(ObjHandle, (const uint8_t *)guid,
+ rev, func, (ACPI_OBJECT *)pkg, &buf)) ?
+ (union linuxkpi_acpi_object *)buf.Pointer : NULL);
+}
+
static void
linux_handle_power_suspend_event(void *arg __unused)
{
@@ -323,6 +335,13 @@ acpi_evaluate_dsm_typed(ACPI_HANDLE handle, const char *uuid, int rev,
return (NULL);
}
+union linuxkpi_acpi_object *
+acpi_evaluate_dsm(ACPI_HANDLE ObjHandle, const guid_t *guid,
+ UINT64 rev, UINT64 func, union linuxkpi_acpi_object *pkg)
+{
+ return (NULL);
+}
+
int
register_acpi_notifier(struct notifier_block *nb)
{
diff --git a/sys/compat/linuxkpi/common/src/linux_page.c b/sys/compat/linuxkpi/common/src/linux_page.c
index ebb92eacbf9a..628af17df853 100644
--- a/sys/compat/linuxkpi/common/src/linux_page.c
+++ b/sys/compat/linuxkpi/common/src/linux_page.c
@@ -106,6 +106,7 @@ linux_alloc_pages(gfp_t flags, unsigned int order)
if ((flags & M_ZERO) != 0)
req |= VM_ALLOC_ZERO;
+
if (order == 0 && (flags & GFP_DMA32) == 0) {
page = vm_page_alloc_noobj(req);
if (page == NULL)
@@ -113,6 +114,10 @@ linux_alloc_pages(gfp_t flags, unsigned int order)
} else {
vm_paddr_t pmax = (flags & GFP_DMA32) ?
BUS_SPACE_MAXADDR_32BIT : BUS_SPACE_MAXADDR;
+
+ if ((flags & __GFP_NORETRY) != 0)
+ req |= VM_ALLOC_NORECLAIM;
+
retry:
page = vm_page_alloc_noobj_contig(req, npages, 0, pmax,
PAGE_SIZE, 0, VM_MEMATTR_DEFAULT);
diff --git a/sys/conf/files b/sys/conf/files
index 75ee10be5896..dd0d390962f2 100644
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -598,42 +598,24 @@ contrib/dev/acpica/components/utilities/utxface.c optional acpi
contrib/dev/acpica/components/utilities/utxferror.c optional acpi
contrib/dev/acpica/components/utilities/utxfinit.c optional acpi
contrib/dev/acpica/os_specific/service_layers/osgendbg.c optional acpi acpi_debug
-netpfil/ipfilter/netinet/fil.c optional ipfilter inet \
- compile-with "${NORMAL_C} ${NO_WSELF_ASSIGN} -Wno-unused -I$S/netpfil/ipfilter"
-netpfil/ipfilter/netinet/ip_auth.c optional ipfilter inet \
- compile-with "${NORMAL_C} -Wno-unused -I$S/netpfil/ipfilter"
-netpfil/ipfilter/netinet/ip_fil_freebsd.c optional ipfilter inet \
- compile-with "${NORMAL_C} -Wno-unused -I$S/netpfil/ipfilter"
-netpfil/ipfilter/netinet/ip_frag.c optional ipfilter inet \
- compile-with "${NORMAL_C} -Wno-unused -I$S/netpfil/ipfilter"
-netpfil/ipfilter/netinet/ip_log.c optional ipfilter inet \
- compile-with "${NORMAL_C} -I$S/netpfil/ipfilter"
-netpfil/ipfilter/netinet/ip_nat.c optional ipfilter inet \
- compile-with "${NORMAL_C} -Wno-unused -I$S/netpfil/ipfilter"
-netpfil/ipfilter/netinet/ip_proxy.c optional ipfilter inet \
- compile-with "${NORMAL_C} ${NO_WSELF_ASSIGN} -Wno-unused -I$S/netpfil/ipfilter"
-netpfil/ipfilter/netinet/ip_state.c optional ipfilter inet \
- compile-with "${NORMAL_C} -Wno-unused -I$S/netpfil/ipfilter"
-netpfil/ipfilter/netinet/ip_lookup.c optional ipfilter inet \
- compile-with "${NORMAL_C} ${NO_WSELF_ASSIGN} -Wno-unused -Wno-error -I$S/netpfil/ipfilter"
-netpfil/ipfilter/netinet/ip_pool.c optional ipfilter inet \
- compile-with "${NORMAL_C} -Wno-unused -I$S/netpfil/ipfilter"
-netpfil/ipfilter/netinet/ip_htable.c optional ipfilter inet \
- compile-with "${NORMAL_C} -Wno-unused -I$S/netpfil/ipfilter ${NO_WTAUTOLOGICAL_POINTER_COMPARE}"
-netpfil/ipfilter/netinet/ip_sync.c optional ipfilter inet \
- compile-with "${NORMAL_C} -Wno-unused -I$S/netpfil/ipfilter"
-netpfil/ipfilter/netinet/mlfk_ipl.c optional ipfilter inet \
- compile-with "${NORMAL_C} -I$S/netpfil/ipfilter"
-netpfil/ipfilter/netinet/ip_nat6.c optional ipfilter inet \
- compile-with "${NORMAL_C} -Wno-unused -I$S/netpfil/ipfilter"
-netpfil/ipfilter/netinet/ip_rules.c optional ipfilter inet \
- compile-with "${NORMAL_C} -I$S/netpfil/ipfilter"
-netpfil/ipfilter/netinet/ip_scan.c optional ipfilter inet \
- compile-with "${NORMAL_C} -Wno-unused -I$S/netpfil/ipfilter"
-netpfil/ipfilter/netinet/ip_dstlist.c optional ipfilter inet \
- compile-with "${NORMAL_C} -Wno-unused -I$S/netpfil/ipfilter"
-netpfil/ipfilter/netinet/radix_ipf.c optional ipfilter inet \
- compile-with "${NORMAL_C} -I$S/netpfil/ipfilter"
+netpfil/ipfilter/netinet/fil.c optional ipfilter inet compile-with "${IPFILTER_C}"
+netpfil/ipfilter/netinet/ip_auth.c optional ipfilter inet compile-with "${IPFILTER_C}"
+netpfil/ipfilter/netinet/ip_fil_freebsd.c optional ipfilter inet compile-with "${IPFILTER_C}"
+netpfil/ipfilter/netinet/ip_frag.c optional ipfilter inet compile-with "${IPFILTER_C}"
+netpfil/ipfilter/netinet/ip_log.c optional ipfilter inet compile-with "${IPFILTER_C}"
+netpfil/ipfilter/netinet/ip_nat.c optional ipfilter inet compile-with "${IPFILTER_C}"
+netpfil/ipfilter/netinet/ip_proxy.c optional ipfilter inet compile-with "${IPFILTER_C}"
+netpfil/ipfilter/netinet/ip_state.c optional ipfilter inet compile-with "${IPFILTER_C}"
+netpfil/ipfilter/netinet/ip_lookup.c optional ipfilter inet compile-with "${IPFILTER_C}"
+netpfil/ipfilter/netinet/ip_pool.c optional ipfilter inet compile-with "${IPFILTER_C}"
+netpfil/ipfilter/netinet/ip_htable.c optional ipfilter inet compile-with "${IPFILTER_C}"
+netpfil/ipfilter/netinet/ip_sync.c optional ipfilter inet compile-with "${IPFILTER_C}"
+netpfil/ipfilter/netinet/mlfk_ipl.c optional ipfilter inet compile-with "${IPFILTER_C}"
+netpfil/ipfilter/netinet/ip_nat6.c optional ipfilter inet compile-with "${IPFILTER_C}"
+netpfil/ipfilter/netinet/ip_rules.c optional ipfilter inet compile-with "${IPFILTER_C}"
+netpfil/ipfilter/netinet/ip_scan.c optional ipfilter inet compile-with "${IPFILTER_C}"
+netpfil/ipfilter/netinet/ip_dstlist.c optional ipfilter inet compile-with "${IPFILTER_C}"
+netpfil/ipfilter/netinet/radix_ipf.c optional ipfilter inet compile-with "${IPFILTER_C}"
contrib/libfdt/fdt.c optional fdt
contrib/libfdt/fdt_ro.c optional fdt
contrib/libfdt/fdt_rw.c optional fdt
@@ -1776,6 +1758,19 @@ dev/hwpmc/hwpmc_soft.c optional hwpmc
dev/hwreset/hwreset.c optional hwreset
dev/hwreset/hwreset_array.c optional hwreset
dev/hwreset/hwreset_if.m optional hwreset
+dev/hwt/hwt.c optional hwt
+dev/hwt/hwt_backend.c optional hwt
+dev/hwt/hwt_config.c optional hwt
+dev/hwt/hwt_context.c optional hwt
+dev/hwt/hwt_contexthash.c optional hwt
+dev/hwt/hwt_cpu.c optional hwt
+dev/hwt/hwt_hook.c optional hwt
+dev/hwt/hwt_ioctl.c optional hwt
+dev/hwt/hwt_owner.c optional hwt
+dev/hwt/hwt_ownerhash.c optional hwt
+dev/hwt/hwt_record.c optional hwt
+dev/hwt/hwt_thread.c optional hwt
+dev/hwt/hwt_vm.c optional hwt
dev/ichiic/ig4_acpi.c optional ig4 acpi iicbus
dev/ichiic/ig4_iic.c optional ig4 iicbus
dev/ichiic/ig4_pci.c optional ig4 pci iicbus
@@ -3160,8 +3155,6 @@ dev/sound/midi/midi.c optional sound
dev/sound/midi/mpu401.c optional sound
dev/sound/midi/mpu_if.m optional sound
dev/sound/midi/mpufoi_if.m optional sound
-dev/sound/midi/sequencer.c optional sound
-dev/sound/midi/synth_if.m optional sound
dev/spibus/acpi_spibus.c optional acpi spibus
dev/spibus/ofw_spibus.c optional fdt spibus
dev/spibus/spibus.c optional spibus \
@@ -3234,6 +3227,19 @@ dev/uart/uart_if.m optional uart
dev/uart/uart_subr.c optional uart
dev/uart/uart_tty.c optional uart
#
+# Universal Flash Storage Host Controller Interface drivers
+#
+dev/ufshci/ufshci.c optional ufshci
+dev/ufshci/ufshci_ctrlr.c optional ufshci
+dev/ufshci/ufshci_ctrlr_cmd.c optional ufshci
+dev/ufshci/ufshci_dev.c optional ufshci
+dev/ufshci/ufshci_pci.c optional ufshci
+dev/ufshci/ufshci_req_queue.c optional ufshci
+dev/ufshci/ufshci_req_sdb.c optional ufshci
+dev/ufshci/ufshci_sim.c optional ufshci
+dev/ufshci/ufshci_sysctl.c optional ufshci
+dev/ufshci/ufshci_uic_cmd.c optional ufshci
+#
# USB controller drivers
#
dev/usb/controller/musb_otg.c optional musb
@@ -3979,6 +3985,7 @@ kern/vfs_export.c standard
kern/vfs_extattr.c standard
kern/vfs_hash.c standard
kern/vfs_init.c standard
+kern/vfs_inotify.c standard
kern/vfs_lookup.c standard
kern/vfs_mount.c standard
kern/vfs_mountroot.c standard
diff --git a/sys/conf/files.amd64 b/sys/conf/files.amd64
index 0584fc29d039..80548320c3fc 100644
--- a/sys/conf/files.amd64
+++ b/sys/conf/files.amd64
@@ -84,8 +84,8 @@ amd64/amd64/xen-locore.S optional xenhvm \
amd64/amd64/machdep.c standard
amd64/amd64/mem.c optional mem
amd64/amd64/minidump_machdep.c standard
-amd64/amd64/mp_machdep.c optional smp
-amd64/amd64/mpboot.S optional smp
+amd64/amd64/mp_machdep.c standard
+amd64/amd64/mpboot.S standard
amd64/amd64/pmap.c standard
amd64/amd64/ptrace_machdep.c standard
amd64/amd64/support.S standard
@@ -191,6 +191,10 @@ dev/ice/irdma_di_if.m optional ice pci \
compile-with "${NORMAL_M} -I$S/dev/ice"
dev/ice/ice_ddp_common.c optional ice pci \
compile-with "${NORMAL_C} -I$S/dev/ice"
+dev/ice/ice_iov.c optional ice pci pci_iov \
+ compile-with "${NORMAL_C} -I$S/dev/ice"
+dev/ice/ice_vf_mbx.c optional ice pci pci_iov \
+ compile-with "${NORMAL_C} -I$S/dev/ice"
ice_ddp.c optional ice_ddp \
compile-with "${AWK} -f $S/tools/fw_stub.awk ice_ddp.fw:ice_ddp:0x01032900 -mice_ddp -c${.TARGET}" \
no-ctfconvert no-implicit-rule before-depend local \
diff --git a/sys/conf/kern.pre.mk b/sys/conf/kern.pre.mk
index e6e42b33a9b7..78178065e15b 100644
--- a/sys/conf/kern.pre.mk
+++ b/sys/conf/kern.pre.mk
@@ -290,6 +290,10 @@ BNXT_CFLAGS= -I$S/dev/bnxt/bnxt_en ${OFEDCFLAGS}
BNXT_C_NOIMP= ${CC} -c -o ${.TARGET} ${BNXT_CFLAGS} ${WERROR}
BNXT_C= ${BNXT_C_NOIMP} ${.IMPSRC}
+# IP Filter
+IPFILTER_CFLAGS= -I$S/netpfil/ipfilter
+IPFILTER_C= ${NORMAL_C} ${IPFILTER_CFLAGS}
+
GEN_CFILES= $S/$M/$M/genassym.c ${MFILES:T:S/.m$/.c/}
SYSTEM_CFILES= config.c env.c hints.c vnode_if.c
SYSTEM_DEP= Makefile ${SYSTEM_OBJS}
diff --git a/sys/conf/options b/sys/conf/options
index 03e8964e965d..a637b0b74a77 100644
--- a/sys/conf/options
+++ b/sys/conf/options
@@ -885,6 +885,9 @@ DCONS_FORCE_GDB opt_dcons.h
HWPMC_DEBUG opt_global.h
HWPMC_HOOKS
+# Hardware Trace (HWT) framework options
+HWT_HOOKS
+
# 802.11 support layer
IEEE80211_DEBUG opt_wlan.h
IEEE80211_DEBUG_REFCNT opt_wlan.h
diff --git a/sys/contrib/dev/iwlwifi/iwl-debug.h b/sys/contrib/dev/iwlwifi/iwl-debug.h
index 43288a5a8d74..7b3b402766b4 100644
--- a/sys/contrib/dev/iwlwifi/iwl-debug.h
+++ b/sys/contrib/dev/iwlwifi/iwl-debug.h
@@ -47,7 +47,7 @@ enum iwl_dl {
IWL_DL_DROP = 0x00000010,
IWL_DL_EEPROM = 0x00000020,
IWL_DL_FW = 0x00000040,
- /* = 0x00000080, */
+ IWL_DL_DEV_RADIO = 0x00000080,
IWL_DL_HC = 0x00000100,
IWL_DL_HT = 0x00000200,
IWL_DL_INFO = 0x00000400,
@@ -195,6 +195,8 @@ void __iwl_dbg(struct device *, u32, bool, const char *, const char *fmt, ...);
IWL_DPRINTF(_subsys, IWL_DL_WEP, _fmt, ##__VA_ARGS__)
#define IWL_DEBUG_WOWLAN(_subsys, _fmt, ...) \
IWL_DPRINTF(_subsys, IWL_DL_WOWLAN, _fmt, ##__VA_ARGS__)
+#define IWL_DEBUG_DEV_RADIO(_dev, _fmt, ...) \
+ IWL_DPRINTF_DEV((_dev), IWL_DL_DEV_RADIO, _fmt, ##__VA_ARGS__)
#define IWL_DEBUG_PCI_RW(_subsys, _fmt, ...) \
IWL_DPRINTF(_subsys, IWL_DL_PCI_RW, _fmt, ##__VA_ARGS__)
diff --git a/sys/contrib/dev/rtw89/acpi.c b/sys/contrib/dev/rtw89/acpi.c
index 02d4526c1538..f5dedb12c129 100644
--- a/sys/contrib/dev/rtw89/acpi.c
+++ b/sys/contrib/dev/rtw89/acpi.c
@@ -8,7 +8,6 @@
#include "acpi.h"
#include "debug.h"
-#if defined(__linux__)
static const guid_t rtw89_guid = GUID_INIT(0xD2A8C3E8, 0x4B69, 0x4F00,
0x82, 0xBD, 0xFE, 0x86,
0x07, 0x80, 0x3A, 0xA7);
@@ -149,14 +148,6 @@ int rtw89_acpi_evaluate_dsm(struct rtw89_dev *rtwdev,
ACPI_FREE(obj);
return ret;
}
-#elif defined(__FreeBSD__)
-int rtw89_acpi_evaluate_dsm(struct rtw89_dev *rtwdev,
- enum rtw89_acpi_dsm_func func,
- struct rtw89_acpi_dsm_result *res)
-{
- return -ENOENT;
-}
-#endif
int rtw89_acpi_evaluate_rtag(struct rtw89_dev *rtwdev,
struct rtw89_acpi_rtag_result *res)
@@ -180,28 +171,15 @@ int rtw89_acpi_evaluate_rtag(struct rtw89_dev *rtwdev,
if (ACPI_FAILURE(status))
return -EIO;
-#if defined(__linux__)
obj = buf.pointer;
if (obj->type != ACPI_TYPE_BUFFER) {
-#elif defined(__FreeBSD__)
- obj = buf.Pointer;
- if (obj->Type != ACPI_TYPE_BUFFER) {
-#endif
rtw89_debug(rtwdev, RTW89_DBG_ACPI,
-#if defined(__linux__)
"acpi: expect buffer but type: %d\n", obj->type);
-#elif defined(__FreeBSD__)
- "acpi: expect buffer but type: %d\n", obj->Type);
-#endif
ret = -EINVAL;
goto out;
}
-#if defined(__linux__)
buf_len = obj->buffer.length;
-#elif defined(__FreeBSD__)
- buf_len = obj->Buffer.Length;
-#endif
if (buf_len != sizeof(*res)) {
rtw89_debug(rtwdev, RTW89_DBG_ACPI, "%s: invalid buffer length: %u\n",
__func__, buf_len);
@@ -209,11 +187,7 @@ int rtw89_acpi_evaluate_rtag(struct rtw89_dev *rtwdev,
goto out;
}
-#if defined(__linux__)
*res = *(struct rtw89_acpi_rtag_result *)obj->buffer.pointer;
-#elif defined(__FreeBSD__)
- *res = *(struct rtw89_acpi_rtag_result *)obj->Buffer.Pointer;
-#endif
rtw89_hex_dump(rtwdev, RTW89_DBG_ACPI, "antenna_gain: ", res, sizeof(*res));
diff --git a/sys/dev/bnxt/bnxt_en/bnxt_auxbus_compat.h b/sys/dev/bnxt/bnxt_en/bnxt_auxbus_compat.h
index 1d844a67c928..c4c9e789cf3e 100644
--- a/sys/dev/bnxt/bnxt_en/bnxt_auxbus_compat.h
+++ b/sys/dev/bnxt/bnxt_en/bnxt_auxbus_compat.h
@@ -39,6 +39,7 @@ struct auxiliary_device_id {
char name[AUXILIARY_NAME_SIZE];
uint64_t driver_data;
};
+#define MODULE_DEVICE_TABLE_BUS_auxiliary(_bus, _table)
struct auxiliary_device {
struct device dev;
diff --git a/sys/dev/drm2/drm_fb_helper.c b/sys/dev/drm2/drm_fb_helper.c
index f67cc9f60d02..1f4abd255690 100644
--- a/sys/dev/drm2/drm_fb_helper.c
+++ b/sys/dev/drm2/drm_fb_helper.c
@@ -51,7 +51,7 @@ struct vt_kms_softc {
struct task fb_mode_task;
};
-/* Call restore out of vt(9) locks. */
+/* Call restore out of vt(4) locks. */
static void
vt_restore_fbdev_mode(void *arg, int pending)
{
diff --git a/sys/dev/efidev/efirt.c b/sys/dev/efidev/efirt.c
index b0fa33daeca7..b55c1c191077 100644
--- a/sys/dev/efidev/efirt.c
+++ b/sys/dev/efidev/efirt.c
@@ -107,7 +107,8 @@ static int efi_status2err[25] = {
enum efi_table_type {
TYPE_ESRT = 0,
- TYPE_PROP
+ TYPE_PROP,
+ TYPE_MEMORY_ATTR
};
static int efi_enter(void);
@@ -445,6 +446,42 @@ get_table_length(enum efi_table_type type, size_t *table_len, void **taddr)
free(buf, M_TEMP);
return (0);
}
+ case TYPE_MEMORY_ATTR:
+ {
+ efi_guid_t guid = EFI_MEMORY_ATTRIBUTES_TABLE;
+ struct efi_memory_attribute_table *tbl_addr, *mem_addr;
+ int error;
+ void *buf;
+ size_t len = sizeof(struct efi_memory_attribute_table);
+
+ error = efi_get_table(&guid, (void **)&tbl_addr);
+ if (error)
+ return (error);
+
+ buf = malloc(len, M_TEMP, M_WAITOK);
+ error = physcopyout((vm_paddr_t)tbl_addr, buf, len);
+ if (error) {
+ free(buf, M_TEMP);
+ return (error);
+ }
+
+ mem_addr = (struct efi_memory_attribute_table *)buf;
+ if (mem_addr->version != 2) {
+ free(buf, M_TEMP);
+ return (EINVAL);
+ }
+ len += mem_addr->descriptor_size * mem_addr->num_ents;
+ if (len > EFI_TABLE_ALLOC_MAX) {
+ free(buf, M_TEMP);
+ return (ENOMEM);
+ }
+
+ *table_len = len;
+ if (taddr != NULL)
+ *taddr = tbl_addr;
+ free(buf, M_TEMP);
+ return (0);
+ }
}
return (ENOENT);
}
@@ -457,7 +494,8 @@ copy_table(efi_guid_t *guid, void **buf, size_t buf_len, size_t *table_len)
enum efi_table_type type;
} tables[] = {
{ EFI_TABLE_ESRT, TYPE_ESRT },
- { EFI_PROPERTIES_TABLE, TYPE_PROP }
+ { EFI_PROPERTIES_TABLE, TYPE_PROP },
+ { EFI_MEMORY_ATTRIBUTES_TABLE, TYPE_MEMORY_ATTR }
};
size_t table_idx;
void *taddr;
diff --git a/sys/dev/gpio/acpi_gpiobus.c b/sys/dev/gpio/acpi_gpiobus.c
index 2987af634866..94f4e5771266 100644
--- a/sys/dev/gpio/acpi_gpiobus.c
+++ b/sys/dev/gpio/acpi_gpiobus.c
@@ -36,6 +36,7 @@
#include <dev/gpio/gpiobusvar.h>
#include <dev/gpio/acpi_gpiobusvar.h>
+#include <dev/gpio/gpiobus_internal.h>
#include "gpiobus_if.h"
@@ -356,7 +357,7 @@ acpi_gpiobus_attach(device_t dev)
status = AcpiWalkResources(handle, "_AEI", acpi_gpiobus_enumerate_aei,
&ctx);
- if (ACPI_FAILURE(status))
+ if (ACPI_FAILURE(status) && status != AE_NOT_FOUND)
device_printf(dev, "Failed to enumerate AEI resources\n");
return (0);
diff --git a/sys/dev/gpio/gpiobus.c b/sys/dev/gpio/gpiobus.c
index 2e2618805e7b..764bcb7e6ee8 100644
--- a/sys/dev/gpio/gpiobus.c
+++ b/sys/dev/gpio/gpiobus.c
@@ -39,6 +39,7 @@
#include <sys/sbuf.h>
#include <dev/gpio/gpiobusvar.h>
+#include <dev/gpio/gpiobus_internal.h>
#include "gpiobus_if.h"
@@ -109,10 +110,9 @@ gpio_alloc_intr_resource(device_t consumer_dev, int *rid, u_int alloc_flags,
res = bus_alloc_resource(consumer_dev, SYS_RES_IRQ, rid, irq, irq, 1,
alloc_flags);
if (res == NULL) {
- intr_free_intr_map_data((struct intr_map_data *)gpio_data);
+ intr_unmap_irq(irq);
return (NULL);
}
- rman_set_virtual(res, gpio_data);
return (res);
}
#else
@@ -213,20 +213,40 @@ gpio_pin_is_active(gpio_pin_t pin, bool *active)
return (0);
}
+/*
+ * Note that this function should only be used in cases where a pre-existing
+ * gpiobus_pin structure exists. In most cases, the gpio_pin_get_by_*
+ * functions suffice.
+ */
+int
+gpio_pin_acquire(gpio_pin_t gpio)
+{
+ device_t busdev;
+
+ KASSERT(gpio != NULL, ("GPIO pin is NULL."));
+ KASSERT(gpio->dev != NULL, ("GPIO pin device is NULL."));
+
+ busdev = GPIO_GET_BUS(gpio->dev);
+ if (busdev == NULL)
+ return (ENXIO);
+
+ return (gpiobus_acquire_pin(busdev, gpio->pin));
+}
+
void
gpio_pin_release(gpio_pin_t gpio)
{
device_t busdev;
- if (gpio == NULL)
- return;
-
+ KASSERT(gpio != NULL, ("GPIO pin is NULL."));
KASSERT(gpio->dev != NULL, ("GPIO pin device is NULL."));
busdev = GPIO_GET_BUS(gpio->dev);
- if (busdev != NULL)
- gpiobus_release_pin(busdev, gpio->pin);
+ KASSERT(busdev != NULL, ("gpiobus dev is NULL."));
+ gpiobus_release_pin(busdev, gpio->pin);
free(gpio, M_DEVBUF);
}
@@ -293,7 +313,7 @@ gpiobus_print_pins(struct gpiobus_ivar *devi, struct sbuf *sb)
}
device_t
-gpiobus_attach_bus(device_t dev)
+gpiobus_add_bus(device_t dev)
{
device_t busdev;
@@ -307,8 +327,24 @@ gpiobus_attach_bus(device_t dev)
#ifdef FDT
ofw_gpiobus_register_provider(dev);
#endif
- bus_attach_children(dev);
+ return (busdev);
+}
+
+/*
+ * Attach a gpiobus child. Note that the controller is expected to be fully
+ * initialized at this point.
+ */
+device_t
+gpiobus_attach_bus(device_t dev)
+{
+ device_t busdev;
+
+ busdev = gpiobus_add_bus(dev);
+ if (busdev == NULL)
+ return (NULL);
+ bus_attach_children(dev);
return (busdev);
}
@@ -385,14 +421,13 @@ gpiobus_acquire_pin(device_t bus, uint32_t pin)
sc = device_get_softc(bus);
/* Consistency check. */
if (pin >= sc->sc_npins) {
- device_printf(bus,
- "invalid pin %d, max: %d\n", pin, sc->sc_npins - 1);
- return (-1);
+ panic("%s: invalid pin %d, max: %d",
+ device_get_nameunit(bus), pin, sc->sc_npins - 1);
}
/* Mark pin as mapped and give warning if it's already mapped. */
if (sc->sc_pins[pin].mapped) {
device_printf(bus, "warning: pin %d is already mapped\n", pin);
- return (-1);
+ return (EBUSY);
}
sc->sc_pins[pin].mapped = 1;
@@ -400,7 +435,7 @@ gpiobus_acquire_pin(device_t bus, uint32_t pin)
}
/* Release mapped pin */
-int
+void
gpiobus_release_pin(device_t bus, uint32_t pin)
{
struct gpiobus_softc *sc;
@@ -408,19 +443,15 @@ gpiobus_release_pin(device_t bus, uint32_t pin)
sc = device_get_softc(bus);
/* Consistency check. */
if (pin >= sc->sc_npins) {
- device_printf(bus,
- "invalid pin %d, max=%d\n",
- pin, sc->sc_npins - 1);
- return (-1);
+ panic("%s: invalid pin %d, max: %d",
+ device_get_nameunit(bus), pin, sc->sc_npins - 1);
}
- if (!sc->sc_pins[pin].mapped) {
- device_printf(bus, "pin %d is not mapped\n", pin);
- return (-1);
- }
- sc->sc_pins[pin].mapped = 0;
+ if (!sc->sc_pins[pin].mapped)
+ panic("%s: pin %d is not mapped", device_get_nameunit(bus),
+ pin);
- return (0);
+ sc->sc_pins[pin].mapped = 0;
}
static int
@@ -435,8 +466,7 @@ gpiobus_acquire_child_pins(device_t dev, device_t child)
device_printf(child, "cannot acquire pin %d\n",
devi->pins[i]);
while (--i >= 0) {
- (void)gpiobus_release_pin(dev,
- devi->pins[i]);
+ gpiobus_release_pin(dev, devi->pins[i]);
}
gpiobus_free_ivars(devi);
return (EBUSY);
@@ -835,6 +865,25 @@ gpiobus_alloc_resource(device_t bus, device_t child, int type, int *rid,
end, count, flags));
}
+static int
+gpiobus_release_resource(device_t dev, device_t child, struct resource *r)
+{
+ int err;
+#ifdef INTRNG
+ u_int irq;
+
+ irq = rman_get_start(r);
+ MPASS(irq == rman_get_end(r));
+#endif
+ err = bus_generic_rman_release_resource(dev, child, r);
+ if (err != 0)
+ return (err);
+#ifdef INTRNG
+ intr_unmap_irq(irq);
+#endif
+ return (0);
+}
+
static struct resource_list *
gpiobus_get_resource_list(device_t bus __unused, device_t child)
{
@@ -1029,7 +1078,7 @@ static device_method_t gpiobus_methods[] = {
DEVMETHOD(bus_get_resource, bus_generic_rl_get_resource),
DEVMETHOD(bus_set_resource, bus_generic_rl_set_resource),
DEVMETHOD(bus_alloc_resource, gpiobus_alloc_resource),
- DEVMETHOD(bus_release_resource, bus_generic_rman_release_resource),
+ DEVMETHOD(bus_release_resource, gpiobus_release_resource),
DEVMETHOD(bus_activate_resource, bus_generic_rman_activate_resource),
DEVMETHOD(bus_deactivate_resource, bus_generic_rman_deactivate_resource),
DEVMETHOD(bus_get_resource_list, gpiobus_get_resource_list),
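
A hedged sketch of the new gpio_pin_acquire() from a consumer's point of
view; the softc layout and function name are hypothetical:

	struct my_softc {
		struct gpiobus_pin pin;	/* pre-existing structure; the
					   driver fills in .dev and .pin */
	};

	static int
	my_acquire(struct my_softc *sc)
	{
		int error;

		/* Mark the pin as mapped on its gpiobus. */
		error = gpio_pin_acquire(&sc->pin);
		if (error != 0)
			return (error);	/* ENXIO (no bus) or EBUSY (mapped) */
		return (0);
	}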
diff --git a/sys/dev/sound/midi/sequencer.h b/sys/dev/gpio/gpiobus_internal.h
index 22ea0ae6c1b6..de3f57663132 100644
--- a/sys/dev/sound/midi/sequencer.h
+++ b/sys/dev/gpio/gpiobus_internal.h
@@ -1,8 +1,7 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause
*
- * Copyright (c) 2003 Mathew Kanner
- * Copyright (c) 1999 Seigo Tanimura
+ * Copyright (c) 2009 Oleksandr Tymoshenko <gonzo@freebsd.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -25,65 +24,24 @@
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
+ *
*/
-/*
- * Include file for the midi sequence driver.
- */
-
-#ifndef _SEQUENCER_H_
-#define _SEQUENCER_H_
-
-#define NSEQ_MAX 16
+#ifndef __GPIOBUS_INTERNAL_H__
+#define __GPIOBUS_INTERNAL_H__
/*
- * many variables should be reduced to a range. Here define a macro
+ * Functions shared between gpiobus and other bus classes that derive from it;
+ * these should not be called directly by other drivers.
*/
-
-#define RANGE(var, low, high) (var) = \
-((var)<(low)?(low) : (var)>(high)?(high) : (var))
-
-#ifdef _KERNEL
-
-void seq_timer(void *arg);
-
-SYSCTL_DECL(_hw_midi_seq);
-
-extern int seq_debug;
-
-#define SEQ_DEBUG(y, x) \
- do { \
- if (seq_debug >= y) { \
- (x); \
- } \
- } while (0)
-
-SYSCTL_DECL(_hw_midi);
-
-#endif /* _KERNEL */
-
-#define SYNTHPROP_MIDI 1
-#define SYNTHPROP_SYNTH 2
-#define SYNTHPROP_RX 4
-#define SYNTHPROP_TX 8
-
-struct _midi_cmdtab {
- int cmd;
- char *name;
-};
-typedef struct _midi_cmdtab midi_cmdtab;
-extern midi_cmdtab cmdtab_seqevent[];
-extern midi_cmdtab cmdtab_seqioctl[];
-extern midi_cmdtab cmdtab_timer[];
-extern midi_cmdtab cmdtab_seqcv[];
-extern midi_cmdtab cmdtab_seqccmn[];
-
-char *midi_cmdname(int cmd, midi_cmdtab * tab);
-
-enum {
- MORE,
- TIMERARMED,
- QUEUEFULL
-};
-
+int gpiobus_attach(device_t);
+int gpiobus_detach(device_t);
+int gpiobus_init_softc(device_t);
+int gpiobus_alloc_ivars(struct gpiobus_ivar *);
+void gpiobus_free_ivars(struct gpiobus_ivar *);
+int gpiobus_read_ivar(device_t, device_t, int, uintptr_t *);
+int gpiobus_acquire_pin(device_t, uint32_t);
+void gpiobus_release_pin(device_t, uint32_t);
+
+extern driver_t gpiobus_driver;
#endif
diff --git a/sys/dev/gpio/gpiobusvar.h b/sys/dev/gpio/gpiobusvar.h
index 74783e112f89..7f504236a774 100644
--- a/sys/dev/gpio/gpiobusvar.h
+++ b/sys/dev/gpio/gpiobusvar.h
@@ -156,6 +156,8 @@ int gpio_pin_get_by_bus_pinnum(device_t _bus, uint32_t _pinnum, gpio_pin_t *_gp)
/* Acquire a pin by child and index (used by direct children of gpiobus). */
int gpio_pin_get_by_child_index(device_t _child, uint32_t _idx, gpio_pin_t *_gp);
+/* Acquire a pin from an existing gpio_pin_t. */
+int gpio_pin_acquire(gpio_pin_t gpio);
/* Release a pin acquired via any gpio_pin_get_xxx() function. */
void gpio_pin_release(gpio_pin_t gpio);
@@ -167,22 +169,9 @@ int gpio_pin_setflags(gpio_pin_t pin, uint32_t flags);
struct resource *gpio_alloc_intr_resource(device_t consumer_dev, int *rid,
u_int alloc_flags, gpio_pin_t pin, uint32_t intr_mode);
-/*
- * Functions shared between gpiobus and other bus classes that derive from it;
- * these should not be called directly by other drivers.
- */
int gpio_check_flags(uint32_t, uint32_t);
+device_t gpiobus_add_bus(device_t);
device_t gpiobus_attach_bus(device_t);
int gpiobus_detach_bus(device_t);
-int gpiobus_attach(device_t);
-int gpiobus_detach(device_t);
-int gpiobus_init_softc(device_t);
-int gpiobus_alloc_ivars(struct gpiobus_ivar *);
-void gpiobus_free_ivars(struct gpiobus_ivar *);
-int gpiobus_read_ivar(device_t, device_t, int, uintptr_t *);
-int gpiobus_acquire_pin(device_t, uint32_t);
-int gpiobus_release_pin(device_t, uint32_t);
-
-extern driver_t gpiobus_driver;
#endif /* __GPIOBUS_H__ */
diff --git a/sys/dev/gpio/gpiopps.c b/sys/dev/gpio/gpiopps.c
index bb8afa5e062c..82620a50a798 100644
--- a/sys/dev/gpio/gpiopps.c
+++ b/sys/dev/gpio/gpiopps.c
@@ -160,7 +160,7 @@ gpiopps_detach(device_t dev)
if (sc->ires != NULL)
bus_release_resource(dev, SYS_RES_IRQ, sc->irid, sc->ires);
if (sc->gpin != NULL)
- gpiobus_release_pin(GPIO_GET_BUS(sc->gpin->dev), sc->gpin->pin);
+ gpio_pin_release(sc->gpin);
return (0);
}
diff --git a/sys/dev/gpio/ofw_gpiobus.c b/sys/dev/gpio/ofw_gpiobus.c
index 32dc5b55e698..fc5fb03d6824 100644
--- a/sys/dev/gpio/ofw_gpiobus.c
+++ b/sys/dev/gpio/ofw_gpiobus.c
@@ -36,6 +36,7 @@
#include <sys/module.h>
#include <dev/gpio/gpiobusvar.h>
+#include <dev/gpio/gpiobus_internal.h>
#include <dev/ofw/ofw_bus.h>
#include "gpiobus_if.h"
diff --git a/sys/dev/gpio/pl061.c b/sys/dev/gpio/pl061.c
index cc39790322b6..87d4310a6396 100644
--- a/sys/dev/gpio/pl061.c
+++ b/sys/dev/gpio/pl061.c
@@ -487,14 +487,21 @@ pl061_attach(device_t dev)
}
}
+ mtx_init(&sc->sc_mtx, device_get_nameunit(dev), "pl061", MTX_SPIN);
+
+ if (sc->sc_xref != 0 && !intr_pic_register(dev, sc->sc_xref)) {
+ device_printf(dev, "couldn't register PIC\n");
+ PL061_LOCK_DESTROY(sc);
+ goto free_isrc;
+ }
+
sc->sc_busdev = gpiobus_attach_bus(dev);
if (sc->sc_busdev == NULL) {
device_printf(dev, "couldn't attach gpio bus\n");
+ PL061_LOCK_DESTROY(sc);
goto free_isrc;
}
- mtx_init(&sc->sc_mtx, device_get_nameunit(dev), "pl061", MTX_SPIN);
-
return (0);
free_isrc:
@@ -503,6 +510,7 @@ free_isrc:
* for (irq = 0; irq < PL061_NUM_GPIO; irq++)
* intr_isrc_deregister(PIC_INTR_ISRC(sc, irq));
*/
+ bus_teardown_intr(dev, sc->sc_irq_res, sc->sc_irq_hdlr);
bus_release_resource(dev, SYS_RES_IRQ, sc->sc_irq_rid,
sc->sc_irq_res);
free_pic:
diff --git a/sys/dev/gpio/pl061.h b/sys/dev/gpio/pl061.h
index 809a1168493d..d9fe23e502b1 100644
--- a/sys/dev/gpio/pl061.h
+++ b/sys/dev/gpio/pl061.h
@@ -46,6 +46,7 @@ struct pl061_softc {
struct resource *sc_mem_res;
struct resource *sc_irq_res;
void *sc_irq_hdlr;
+ intptr_t sc_xref;
int sc_mem_rid;
int sc_irq_rid;
struct pl061_pin_irqsrc sc_isrcs[PL061_NUM_GPIO];
diff --git a/sys/dev/gpio/pl061_acpi.c b/sys/dev/gpio/pl061_acpi.c
index f5885025083e..8e9921261e4e 100644
--- a/sys/dev/gpio/pl061_acpi.c
+++ b/sys/dev/gpio/pl061_acpi.c
@@ -67,19 +67,12 @@ pl061_acpi_probe(device_t dev)
static int
pl061_acpi_attach(device_t dev)
{
- int error;
+ struct pl061_softc *sc;
- error = pl061_attach(dev);
- if (error != 0)
- return (error);
+ sc = device_get_softc(dev);
+ sc->sc_xref = ACPI_GPIO_XREF;
- if (!intr_pic_register(dev, ACPI_GPIO_XREF)) {
- device_printf(dev, "couldn't register PIC\n");
- pl061_detach(dev);
- error = ENXIO;
- }
-
- return (error);
+ return (pl061_attach(dev));
}
static device_method_t pl061_acpi_methods[] = {
diff --git a/sys/dev/gpio/pl061_fdt.c b/sys/dev/gpio/pl061_fdt.c
index aa22298b43c6..681b3ccdfdeb 100644
--- a/sys/dev/gpio/pl061_fdt.c
+++ b/sys/dev/gpio/pl061_fdt.c
@@ -61,19 +61,12 @@ pl061_fdt_probe(device_t dev)
static int
pl061_fdt_attach(device_t dev)
{
- int error;
+ struct pl061_softc *sc;
- error = pl061_attach(dev);
- if (error != 0)
- return (error);
+ sc = device_get_softc(dev);
+ sc->sc_xref = OF_xref_from_node(ofw_bus_get_node(dev));
- if (!intr_pic_register(dev, OF_xref_from_node(ofw_bus_get_node(dev)))) {
- device_printf(dev, "couldn't register PIC\n");
- pl061_detach(dev);
- error = ENXIO;
- }
-
- return (error);
+ return (pl061_attach(dev));
}
static device_method_t pl061_fdt_methods[] = {
diff --git a/sys/dev/gpio/qoriq_gpio.c b/sys/dev/gpio/qoriq_gpio.c
index 25dfccede29f..8b44cd256c79 100644
--- a/sys/dev/gpio/qoriq_gpio.c
+++ b/sys/dev/gpio/qoriq_gpio.c
@@ -369,11 +369,6 @@ qoriq_gpio_attach(device_t dev)
for (i = 0; i <= MAXPIN; i++)
sc->sc_pins[i].gp_caps = DEFAULT_CAPS;
- sc->busdev = gpiobus_attach_bus(dev);
- if (sc->busdev == NULL) {
- qoriq_gpio_detach(dev);
- return (ENOMEM);
- }
/*
* Enable the GPIO Input Buffer for all GPIOs.
* This is safe on devices without a GPIBE register, because those
@@ -384,6 +379,12 @@ qoriq_gpio_attach(device_t dev)
OF_device_register_xref(OF_xref_from_node(ofw_bus_get_node(dev)), dev);
+ sc->busdev = gpiobus_attach_bus(dev);
+ if (sc->busdev == NULL) {
+ qoriq_gpio_detach(dev);
+ return (ENOMEM);
+ }
+
return (0);
}
diff --git a/sys/dev/hwt/hwt.c b/sys/dev/hwt/hwt.c
new file mode 100644
index 000000000000..c476e6031ba8
--- /dev/null
+++ b/sys/dev/hwt/hwt.c
@@ -0,0 +1,242 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2023-2025 Ruslan Bukin <br@bsdpad.com>
+ *
+ * This work was supported by Innovate UK project 105694, "Digital Security
+ * by Design (DSbD) Technology Platform Prototype".
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Hardware Tracing framework.
+ *
+ * The framework manages hardware tracing units that collect information
+ * about software execution and store it as events in a highly compressed
+ * format in DRAM. The events cover control flow changes of a program:
+ * whether branches were taken or not, exceptions taken, timing information,
+ * cycles elapsed, and more. That allows us to restore the entire program
+ * flow of a given application without performance impact.
+ *
+ * Design overview.
+ *
+ * The framework provides character devices for mmap(2) and ioctl(2) system
+ * calls to allow the user to manage CPU (hardware) tracing units.
+ *
+ * /dev/hwt:
+ * .ioctl:
+ * hwt_ioctl():
+ * a) HWT_IOC_ALLOC
+ * Allocates kernel tracing context CTX based on requested mode
+ * of operation. Verifies the information that comes with the
+ * request (pid, cpus), allocates unique ID for the context.
+ * Creates a new character device for CTX management.
+ *
+ * /dev/hwt_%d[_%d], ident[, thread_id]
+ * .mmap
+ * Maps tracing buffers of the corresponding thread to userspace.
+ * .ioctl
+ * hwt_thread_ioctl():
+ * a) HWT_IOC_START
+ * Enables the tracing unit for a given context.
+ * b) HWT_IOC_RECORD_GET
+ * Transfers (small) record entries collected during program
+ * execution for a given context to userspace, such as the
+ * memory-mapping tables of the executable and dynamic libraries,
+ * the interpreter, kernel mappings, TIDs of threads created, etc.
+ * c) HWT_IOC_SET_CONFIG
+ * Allows the user to specify a backend-specific configuration
+ * of the trace unit.
+ * d) HWT_IOC_WAKEUP
+ * Wakes up a thread that is currently sleeping.
+ * e) HWT_IOC_BUFPTR_GET
+ * Transfers the current hardware pointer within the filling
+ * buffer to userspace.
+ * f) HWT_IOC_SVC_BUF
+ * To avoid data loss, userspace may notify the kernel that it has
+ * copied out the given buffer, so the kernel is free to overwrite it.
+ *
+ * HWT context lifecycle in THREAD mode of operation:
+ * 1. User invokes HWT_IOC_ALLOC ioctl with information about the pid to
+ * trace and the size of the buffers to allocate for the trace data.
+ * Some architectures may support different tracing units, so the user also
+ * provides the backend name to use for this context, e.g. "coresight".
+ * 2. Kernel allocates the context and looks up the proc for the given pid.
+ * Then it creates the first hwt_thread in the context and allocates trace
+ * buffers for it. Immediately, the kernel initializes the tracing backend.
+ * Kernel creates a character device and returns the unique identifier of
+ * the trace context to the user.
+ * 3. To manage the new context, user opens the character device created.
+ * User invokes HWT_IOC_START ioctl; the kernel marks the context as RUNNING.
+ * At this point any HWT hook invocation by the scheduler enables/disables
+ * tracing for threads associated with the context (threads of the proc).
+ * Any new thread created in the target proc will invoke the corresponding
+ * hooks in the HWT framework, so that a new hwt_thread and its buffers are
+ * allocated and a character device for mmap(2) is created on the fly.
+ * 4. User issues HWT_IOC_RECORD_GET ioctl to fetch information about
+ * memory-mapping tables and threads created during application startup.
+ * 5. User mmaps tracing buffers of each thread to userspace (using
+ * /dev/hwt_%d_%d % (ident, thread_id) character devices).
+ * 6. User can repeat 4 if the expected thread has not yet been created
+ * during target application execution.
+ * 7. User issues HWT_IOC_BUFPTR_GET ioctl to get the current fill level of
+ * the hardware buffer of a given thread.
+ * 8. User invokes a trace decoder library to process the available data and
+ * see the results in human-readable form.
+ * 9. User repeats 7 if needed.
+ *
+ * HWT context lifecycle in CPU mode of operation:
+ * 1. User invokes HWT_IOC_ALLOC ioctl providing a set of CPUs to trace
+ * within a single CTX.
+ * 2. Kernel verifies the set of CPUs and allocates a tracing context, then
+ * creates a buffer for each CPU.
+ * Kernel creates a character device for every CPU provided in the request.
+ * Kernel initializes the tracing backend.
+ * 3. User opens character devices of interest to map the buffers to userspace.
+ * User can start tracing by invoking HWT_IOC_START on any character
+ * device within the context; the entire context will be marked as RUNNING.
+ * 4. The rest is similar to the THREAD mode.
+ *
+ */
+
+#include <sys/param.h>
+#include <sys/conf.h>
+#include <sys/eventhandler.h>
+#include <sys/kernel.h>
+#include <sys/module.h>
+
+#include <dev/hwt/hwt_context.h>
+#include <dev/hwt/hwt_contexthash.h>
+#include <dev/hwt/hwt_thread.h>
+#include <dev/hwt/hwt_owner.h>
+#include <dev/hwt/hwt_ownerhash.h>
+#include <dev/hwt/hwt_backend.h>
+#include <dev/hwt/hwt_record.h>
+#include <dev/hwt/hwt_ioctl.h>
+#include <dev/hwt/hwt_hook.h>
+
+#define HWT_DEBUG
+#undef HWT_DEBUG
+
+#ifdef HWT_DEBUG
+#define dprintf(fmt, ...) printf(fmt, ##__VA_ARGS__)
+#else
+#define dprintf(fmt, ...)
+#endif
+
+static eventhandler_tag hwt_exit_tag;
+static struct cdev *hwt_cdev;
+static struct cdevsw hwt_cdevsw = {
+ .d_version = D_VERSION,
+ .d_name = "hwt",
+ .d_mmap_single = NULL,
+ .d_ioctl = hwt_ioctl
+};
+
+static void
+hwt_process_exit(void *arg __unused, struct proc *p)
+{
+ struct hwt_owner *ho;
+
+ /* Stop HWTs associated with exiting owner, if any. */
+ ho = hwt_ownerhash_lookup(p);
+ if (ho)
+ hwt_owner_shutdown(ho);
+}
+
+static int
+hwt_load(void)
+{
+ struct make_dev_args args;
+ int error;
+
+ make_dev_args_init(&args);
+ args.mda_devsw = &hwt_cdevsw;
+ args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
+ args.mda_uid = UID_ROOT;
+ args.mda_gid = GID_WHEEL;
+ args.mda_mode = 0660;
+ args.mda_si_drv1 = NULL;
+
+ hwt_backend_load();
+ hwt_ctx_load();
+ hwt_contexthash_load();
+ hwt_ownerhash_load();
+ hwt_record_load();
+
+ error = make_dev_s(&args, &hwt_cdev, "hwt");
+ if (error != 0)
+ return (error);
+
+ hwt_exit_tag = EVENTHANDLER_REGISTER(process_exit, hwt_process_exit,
+ NULL, EVENTHANDLER_PRI_ANY);
+
+ hwt_hook_load();
+
+ return (0);
+}
+
+static int
+hwt_unload(void)
+{
+
+ hwt_hook_unload();
+ EVENTHANDLER_DEREGISTER(process_exit, hwt_exit_tag);
+ destroy_dev(hwt_cdev);
+ hwt_record_unload();
+ hwt_ownerhash_unload();
+ hwt_contexthash_unload();
+ hwt_ctx_unload();
+ hwt_backend_unload();
+
+ return (0);
+}
+
+static int
+hwt_modevent(module_t mod, int type, void *data)
+{
+ int error;
+
+ switch (type) {
+ case MOD_LOAD:
+ error = hwt_load();
+ break;
+ case MOD_UNLOAD:
+ error = hwt_unload();
+ break;
+ default:
+ error = 0;
+ break;
+ }
+
+ return (error);
+}
+
+static moduledata_t hwt_mod = {
+ "hwt",
+ hwt_modevent,
+ NULL
+};
+
+DECLARE_MODULE(hwt, hwt_mod, SI_SUB_DRIVERS, SI_ORDER_FIRST);
+MODULE_VERSION(hwt, 1);
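
The lifecycle described in the comment above maps onto a small amount of
userspace code. A heavily hedged THREAD-mode sketch: the ioctl request
structures live in sys/hwt.h, which is not part of this diff, so the
struct hwt_alloc layout and its field names below are hypothetical:

	#include <sys/ioctl.h>
	#include <fcntl.h>
	#include <stdio.h>
	/* sys/hwt.h would provide HWT_IOC_* and the request structures. */

	int ctl, thr;
	char path[64];
	struct hwt_alloc a = {			/* hypothetical layout */
		.pid = target_pid,
		.bufsize = 16 * 1024 * 1024,
		.backend = "coresight",
	};

	ctl = open("/dev/hwt", O_RDWR);
	ioctl(ctl, HWT_IOC_ALLOC, &a);		/* steps 1-2: ident returned */

	snprintf(path, sizeof(path), "/dev/hwt_%d_%d", a.ident, 0);
	thr = open(path, O_RDWR);		/* step 3 */
	ioctl(thr, HWT_IOC_START, 0);		/* context marked RUNNING */
	/* steps 4-5: HWT_IOC_RECORD_GET, then mmap(2) each thread buffer */
	/* step 7: HWT_IOC_BUFPTR_GET to see how much trace data is ready */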
diff --git a/sys/dev/hwt/hwt_backend.c b/sys/dev/hwt/hwt_backend.c
new file mode 100644
index 000000000000..1ba5db0d3d09
--- /dev/null
+++ b/sys/dev/hwt/hwt_backend.c
@@ -0,0 +1,289 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2023-2025 Ruslan Bukin <br@bsdpad.com>
+ *
+ * This work was supported by Innovate UK project 105694, "Digital Security
+ * by Design (DSbD) Technology Platform Prototype".
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/* Hardware Trace (HWT) framework. */
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/hwt.h>
+
+#include <dev/hwt/hwt_hook.h>
+#include <dev/hwt/hwt_context.h>
+#include <dev/hwt/hwt_config.h>
+#include <dev/hwt/hwt_thread.h>
+#include <dev/hwt/hwt_backend.h>
+
+#define HWT_BACKEND_DEBUG
+#undef HWT_BACKEND_DEBUG
+
+#ifdef HWT_BACKEND_DEBUG
+#define dprintf(fmt, ...) printf(fmt, ##__VA_ARGS__)
+#else
+#define dprintf(fmt, ...)
+#endif
+
+static struct mtx hwt_backend_mtx;
+
+struct hwt_backend_entry {
+ struct hwt_backend *backend;
+ LIST_ENTRY(hwt_backend_entry) next;
+};
+
+static LIST_HEAD(, hwt_backend_entry) hwt_backends;
+
+static MALLOC_DEFINE(M_HWT_BACKEND, "hwt_backend", "HWT backend");
+
+int
+hwt_backend_init(struct hwt_context *ctx)
+{
+ int error;
+
+ dprintf("%s\n", __func__);
+
+ error = ctx->hwt_backend->ops->hwt_backend_init(ctx);
+
+ return (error);
+}
+
+void
+hwt_backend_deinit(struct hwt_context *ctx)
+{
+
+ dprintf("%s\n", __func__);
+
+ ctx->hwt_backend->ops->hwt_backend_deinit(ctx);
+}
+
+int
+hwt_backend_configure(struct hwt_context *ctx, int cpu_id, int thread_id)
+{
+ int error;
+
+ dprintf("%s\n", __func__);
+
+ error = ctx->hwt_backend->ops->hwt_backend_configure(ctx, cpu_id,
+ thread_id);
+
+ return (error);
+}
+
+void
+hwt_backend_enable(struct hwt_context *ctx, int cpu_id)
+{
+
+ dprintf("%s\n", __func__);
+
+ ctx->hwt_backend->ops->hwt_backend_enable(ctx, cpu_id);
+}
+
+void
+hwt_backend_disable(struct hwt_context *ctx, int cpu_id)
+{
+
+ dprintf("%s\n", __func__);
+
+ ctx->hwt_backend->ops->hwt_backend_disable(ctx, cpu_id);
+}
+
+void
+hwt_backend_enable_smp(struct hwt_context *ctx)
+{
+
+ dprintf("%s\n", __func__);
+
+ ctx->hwt_backend->ops->hwt_backend_enable_smp(ctx);
+}
+
+void
+hwt_backend_disable_smp(struct hwt_context *ctx)
+{
+
+ dprintf("%s\n", __func__);
+
+ ctx->hwt_backend->ops->hwt_backend_disable_smp(ctx);
+}
+
+void __unused
+hwt_backend_dump(struct hwt_context *ctx, int cpu_id)
+{
+
+ dprintf("%s\n", __func__);
+
+ ctx->hwt_backend->ops->hwt_backend_dump(cpu_id);
+}
+
+int
+hwt_backend_read(struct hwt_context *ctx, struct hwt_vm *vm, int *ident,
+ vm_offset_t *offset, uint64_t *data)
+{
+ int error;
+
+ dprintf("%s\n", __func__);
+
+ error = ctx->hwt_backend->ops->hwt_backend_read(vm, ident,
+ offset, data);
+
+ return (error);
+}
+
+struct hwt_backend *
+hwt_backend_lookup(const char *name)
+{
+ struct hwt_backend_entry *entry;
+ struct hwt_backend *backend;
+
+ HWT_BACKEND_LOCK();
+ LIST_FOREACH(entry, &hwt_backends, next) {
+ backend = entry->backend;
+ if (strcmp(backend->name, name) == 0) {
+ HWT_BACKEND_UNLOCK();
+ return (backend);
+ }
+ }
+ HWT_BACKEND_UNLOCK();
+
+ return (NULL);
+}
+
+int
+hwt_backend_register(struct hwt_backend *backend)
+{
+ struct hwt_backend_entry *entry;
+
+ if (backend == NULL ||
+ backend->name == NULL ||
+ backend->ops == NULL)
+ return (EINVAL);
+
+ entry = malloc(sizeof(struct hwt_backend_entry), M_HWT_BACKEND,
+ M_WAITOK | M_ZERO);
+ entry->backend = backend;
+
+ HWT_BACKEND_LOCK();
+ LIST_INSERT_HEAD(&hwt_backends, entry, next);
+ HWT_BACKEND_UNLOCK();
+
+ return (0);
+}
+
+int
+hwt_backend_unregister(struct hwt_backend *backend)
+{
+ struct hwt_backend_entry *entry, *tmp;
+
+ if (backend == NULL)
+ return (EINVAL);
+
+ /* TODO: check if not in use */
+
+ HWT_BACKEND_LOCK();
+ LIST_FOREACH_SAFE(entry, &hwt_backends, next, tmp) {
+ if (entry->backend == backend) {
+ LIST_REMOVE(entry, next);
+ HWT_BACKEND_UNLOCK();
+ free(entry, M_HWT_BACKEND);
+ return (0);
+ }
+ }
+ HWT_BACKEND_UNLOCK();
+
+ return (ENOENT);
+}
+
+void
+hwt_backend_load(void)
+{
+
+ mtx_init(&hwt_backend_mtx, "hwt backend", NULL, MTX_DEF);
+ LIST_INIT(&hwt_backends);
+}
+
+void
+hwt_backend_unload(void)
+{
+
+ /* TODO: ensure all unregistered */
+
+ mtx_destroy(&hwt_backend_mtx);
+}
+
+void
+hwt_backend_stop(struct hwt_context *ctx)
+{
+ dprintf("%s\n", __func__);
+
+ ctx->hwt_backend->ops->hwt_backend_stop(ctx);
+}
+
+int
+hwt_backend_svc_buf(struct hwt_context *ctx, void *data, size_t data_size,
+ int data_version)
+{
+ int error;
+
+ dprintf("%s\n", __func__);
+
+ error = ctx->hwt_backend->ops->hwt_backend_svc_buf(ctx, data, data_size,
+ data_version);
+
+ return (error);
+}
+
+int
+hwt_backend_thread_alloc(struct hwt_context *ctx, struct hwt_thread *thr)
+{
+ int error;
+
+ dprintf("%s\n", __func__);
+
+ if (ctx->hwt_backend->ops->hwt_backend_thread_alloc == NULL)
+ return (0);
+ KASSERT(thr->private == NULL,
+ ("%s: thread private data is not NULL\n", __func__));
+ error = ctx->hwt_backend->ops->hwt_backend_thread_alloc(thr);
+
+ return (error);
+}
+
+void
+hwt_backend_thread_free(struct hwt_thread *thr)
+{
+ dprintf("%s\n", __func__);
+
+ if (thr->backend->ops->hwt_backend_thread_free == NULL)
+ return;
+ KASSERT(thr->private != NULL,
+ ("%s: thread private data is NULL\n", __func__));
+ thr->backend->ops->hwt_backend_thread_free(thr);
+}
diff --git a/sys/dev/hwt/hwt_backend.h b/sys/dev/hwt/hwt_backend.h
new file mode 100644
index 000000000000..3b6c9442a7a6
--- /dev/null
+++ b/sys/dev/hwt/hwt_backend.h
@@ -0,0 +1,87 @@
+/*-
+ * Copyright (c) 2023-2025 Ruslan Bukin <br@bsdpad.com>
+ *
+ * This work was supported by Innovate UK project 105694, "Digital Security
+ * by Design (DSbD) Technology Platform Prototype".
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _DEV_HWT_HWT_BACKEND_H_
+#define _DEV_HWT_HWT_BACKEND_H_
+
+struct hwt_backend_ops {
+ int (*hwt_backend_init)(struct hwt_context *);
+ int (*hwt_backend_deinit)(struct hwt_context *);
+ int (*hwt_backend_configure)(struct hwt_context *, int cpu_id,
+ int thread_id);
+ int (*hwt_backend_svc_buf)(struct hwt_context *, void *data,
+ size_t data_size, int data_version);
+ void (*hwt_backend_enable)(struct hwt_context *, int cpu_id);
+ void (*hwt_backend_disable)(struct hwt_context *, int cpu_id);
+ int (*hwt_backend_read)(struct hwt_vm *, int *ident,
+ vm_offset_t *offset, uint64_t *data);
+ void (*hwt_backend_stop)(struct hwt_context *);
+	/* For backends that are tied to local CPU registers. */
+ int (*hwt_backend_enable_smp)(struct hwt_context *);
+ int (*hwt_backend_disable_smp)(struct hwt_context *);
+ /* Allocation and initialization of backend-specific thread data. */
+ int (*hwt_backend_thread_alloc)(struct hwt_thread *);
+ void (*hwt_backend_thread_free)(struct hwt_thread *);
+ /* Debugging only. */
+ void (*hwt_backend_dump)(int cpu_id);
+};
+
+struct hwt_backend {
+ const char *name;
+ struct hwt_backend_ops *ops;
+	/* Buffers require kernel virtual addresses. */
+ bool kva_req;
+};
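+
+/*
+ * Illustrative sketch (not part of this change): a tracing driver
+ * provides an ops table and a backend descriptor, then registers it:
+ *
+ *	static struct hwt_backend_ops foo_ops = {
+ *		.hwt_backend_init = foo_init,
+ *		.hwt_backend_deinit = foo_deinit,
+ *		.hwt_backend_configure = foo_configure,
+ *		.hwt_backend_enable = foo_enable,
+ *		.hwt_backend_disable = foo_disable,
+ *		.hwt_backend_read = foo_read,
+ *	};
+ *	static struct hwt_backend foo_backend = {
+ *		.name = "foo",
+ *		.ops = &foo_ops,
+ *		.kva_req = true,
+ *	};
+ *	error = hwt_backend_register(&foo_backend);
+ *
+ * All "foo" names are hypothetical.
+ */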
+
+int hwt_backend_init(struct hwt_context *ctx);
+void hwt_backend_deinit(struct hwt_context *ctx);
+int hwt_backend_configure(struct hwt_context *ctx, int cpu_id, int thread_id);
+void hwt_backend_enable(struct hwt_context *ctx, int cpu_id);
+void hwt_backend_disable(struct hwt_context *ctx, int cpu_id);
+void hwt_backend_enable_smp(struct hwt_context *ctx);
+void hwt_backend_disable_smp(struct hwt_context *ctx);
+void hwt_backend_dump(struct hwt_context *ctx, int cpu_id);
+int hwt_backend_read(struct hwt_context *ctx, struct hwt_vm *vm, int *ident,
+ vm_offset_t *offset, uint64_t *data);
+int hwt_backend_register(struct hwt_backend *);
+int hwt_backend_unregister(struct hwt_backend *);
+void hwt_backend_stop(struct hwt_context *);
+int hwt_backend_svc_buf(struct hwt_context *ctx, void *data, size_t data_size,
+ int data_version);
+struct hwt_backend * hwt_backend_lookup(const char *name);
+int hwt_backend_thread_alloc(struct hwt_context *ctx, struct hwt_thread *);
+void hwt_backend_thread_free(struct hwt_thread *);
+
+void hwt_backend_load(void);
+void hwt_backend_unload(void);
+
+#define HWT_BACKEND_LOCK() mtx_lock(&hwt_backend_mtx)
+#define HWT_BACKEND_UNLOCK() mtx_unlock(&hwt_backend_mtx)
+
+#endif /* !_DEV_HWT_HWT_BACKEND_H_ */
+
diff --git a/sys/dev/hwt/hwt_config.c b/sys/dev/hwt/hwt_config.c
new file mode 100644
index 000000000000..30688e7fc76b
--- /dev/null
+++ b/sys/dev/hwt/hwt_config.c
@@ -0,0 +1,108 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2023-2025 Ruslan Bukin <br@bsdpad.com>
+ *
+ * This work was supported by Innovate UK project 105694, "Digital Security
+ * by Design (DSbD) Technology Platform Prototype".
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/param.h>
+#include <sys/proc.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/lock.h>
+#include <sys/hwt.h>
+
+#include <vm/vm.h>
+
+#include <dev/hwt/hwt_hook.h>
+#include <dev/hwt/hwt_context.h>
+#include <dev/hwt/hwt_contexthash.h>
+#include <dev/hwt/hwt_config.h>
+#include <dev/hwt/hwt_thread.h>
+#include <dev/hwt/hwt_record.h>
+
+#define HWT_MAXCONFIGSIZE PAGE_SIZE
+
+#define HWT_CONFIG_DEBUG
+#undef HWT_CONFIG_DEBUG
+
+#ifdef HWT_CONFIG_DEBUG
+#define dprintf(fmt, ...) printf(fmt, ##__VA_ARGS__)
+#else
+#define dprintf(fmt, ...)
+#endif
+
+static MALLOC_DEFINE(M_HWT_CONFIG, "hwt_config", "HWT config");
+
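+/*
+ * Copy in a backend-defined, opaque configuration blob (at most
+ * HWT_MAXCONFIGSIZE bytes) and attach it to the context. Backends are
+ * expected to interpret ctx->config according to ctx->config_version.
+ */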
+int
+hwt_config_set(struct thread *td, struct hwt_context *ctx,
+ struct hwt_set_config *sconf)
+{
+ size_t config_size;
+ void *old_config;
+ void *config;
+ int error;
+
+ config_size = sconf->config_size;
+ if (config_size == 0)
+ return (0);
+
+ if (config_size > HWT_MAXCONFIGSIZE)
+ return (EFBIG);
+
+ config = malloc(config_size, M_HWT_CONFIG, M_WAITOK | M_ZERO);
+
+ error = copyin(sconf->config, config, config_size);
+ if (error) {
+ free(config, M_HWT_CONFIG);
+ return (error);
+ }
+
+ HWT_CTX_LOCK(ctx);
+ old_config = ctx->config;
+ ctx->config = config;
+ ctx->config_size = sconf->config_size;
+ ctx->config_version = sconf->config_version;
+ HWT_CTX_UNLOCK(ctx);
+
+ if (old_config != NULL)
+ free(old_config, M_HWT_CONFIG);
+
+ return (error);
+}
+
+void
+hwt_config_free(struct hwt_context *ctx)
+{
+
+ if (ctx->config == NULL)
+ return;
+
+ free(ctx->config, M_HWT_CONFIG);
+
+ ctx->config = NULL;
+}
diff --git a/sys/dev/hwt/hwt_config.h b/sys/dev/hwt/hwt_config.h
new file mode 100644
index 000000000000..47485583063c
--- /dev/null
+++ b/sys/dev/hwt/hwt_config.h
@@ -0,0 +1,36 @@
+/*-
+ * Copyright (c) 2023-2025 Ruslan Bukin <br@bsdpad.com>
+ *
+ * This work was supported by Innovate UK project 105694, "Digital Security
+ * by Design (DSbD) Technology Platform Prototype".
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _DEV_HWT_HWT_CONFIG_H_
+#define _DEV_HWT_HWT_CONFIG_H_
+
+int hwt_config_set(struct thread *td, struct hwt_context *ctx,
+ struct hwt_set_config *sconf);
+void hwt_config_free(struct hwt_context *ctx);
+
+#endif /* !_DEV_HWT_HWT_CONFIG_H_ */
diff --git a/sys/dev/hwt/hwt_context.c b/sys/dev/hwt/hwt_context.c
new file mode 100644
index 000000000000..9af76cffc928
--- /dev/null
+++ b/sys/dev/hwt/hwt_context.c
@@ -0,0 +1,201 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2023-2025 Ruslan Bukin <br@bsdpad.com>
+ *
+ * This work was supported by Innovate UK project 105694, "Digital Security
+ * by Design (DSbD) Technology Platform Prototype".
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/param.h>
+#include <sys/bitstring.h>
+#include <sys/conf.h>
+#include <sys/proc.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/mman.h>
+#include <sys/mutex.h>
+#include <sys/refcount.h>
+#include <sys/rwlock.h>
+#include <sys/hwt.h>
+
+#include <dev/hwt/hwt_hook.h>
+#include <dev/hwt/hwt_context.h>
+#include <dev/hwt/hwt_config.h>
+#include <dev/hwt/hwt_thread.h>
+#include <dev/hwt/hwt_owner.h>
+#include <dev/hwt/hwt_vm.h>
+#include <dev/hwt/hwt_cpu.h>
+
+#define HWT_DEBUG
+#undef HWT_DEBUG
+
+#ifdef HWT_DEBUG
+#define dprintf(fmt, ...) printf(fmt, ##__VA_ARGS__)
+#else
+#define dprintf(fmt, ...)
+#endif
+
+static MALLOC_DEFINE(M_HWT_CTX, "hwt_ctx", "Hardware Trace");
+
+static bitstr_t *ident_set;
+static int ident_set_size;
+static struct mtx ident_set_mutex;
+
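+/*
+ * Context idents are small integers allocated from a fixed-size bitmap
+ * (sized in hwt_ctx_load()). The ident is reported back to the owner
+ * and is used to name per-context trace buffers (the "hwt_%d_%d" paths
+ * in hwt_ioctl.c and hwt_hook.c).
+ */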
+static int
+hwt_ctx_ident_alloc(int *new_ident)
+{
+
+ mtx_lock(&ident_set_mutex);
+ bit_ffc(ident_set, ident_set_size, new_ident);
+ if (*new_ident == -1) {
+ mtx_unlock(&ident_set_mutex);
+ return (ENOMEM);
+ }
+ bit_set(ident_set, *new_ident);
+ mtx_unlock(&ident_set_mutex);
+
+ return (0);
+}
+
+static void
+hwt_ctx_ident_free(int ident)
+{
+
+ mtx_lock(&ident_set_mutex);
+ bit_clear(ident_set, ident);
+ mtx_unlock(&ident_set_mutex);
+}
+
+int
+hwt_ctx_alloc(struct hwt_context **ctx0)
+{
+ struct hwt_context *ctx;
+ int error;
+
+ ctx = malloc(sizeof(struct hwt_context), M_HWT_CTX, M_WAITOK | M_ZERO);
+
+ TAILQ_INIT(&ctx->records);
+ TAILQ_INIT(&ctx->threads);
+ TAILQ_INIT(&ctx->cpus);
+ mtx_init(&ctx->mtx, "ctx", NULL, MTX_SPIN);
+ mtx_init(&ctx->rec_mtx, "ctx_rec", NULL, MTX_DEF);
+ refcount_init(&ctx->refcnt, 0);
+
+ error = hwt_ctx_ident_alloc(&ctx->ident);
+	if (error) {
+		printf("%s: could not allocate ident\n", __func__);
+		mtx_destroy(&ctx->rec_mtx);
+		mtx_destroy(&ctx->mtx);
+		free(ctx, M_HWT_CTX);
+		return (error);
+	}
+
+ *ctx0 = ctx;
+
+ return (0);
+}
+
+static void
+hwt_ctx_free_cpus(struct hwt_context *ctx)
+{
+ struct hwt_cpu *cpu;
+
+ do {
+ HWT_CTX_LOCK(ctx);
+ cpu = TAILQ_FIRST(&ctx->cpus);
+ if (cpu)
+ TAILQ_REMOVE(&ctx->cpus, cpu, next);
+ HWT_CTX_UNLOCK(ctx);
+
+ if (cpu == NULL)
+ break;
+
+ /* TODO: move vm_free() to cpu_free()? */
+ hwt_vm_free(cpu->vm);
+ hwt_cpu_free(cpu);
+ } while (1);
+}
+
+static void
+hwt_ctx_free_threads(struct hwt_context *ctx)
+{
+ struct hwt_thread *thr;
+
+ dprintf("%s: remove threads\n", __func__);
+
+ do {
+ HWT_CTX_LOCK(ctx);
+ thr = TAILQ_FIRST(&ctx->threads);
+ if (thr)
+ TAILQ_REMOVE(&ctx->threads, thr, next);
+ HWT_CTX_UNLOCK(ctx);
+
+ if (thr == NULL)
+ break;
+
+ HWT_THR_LOCK(thr);
+ /* TODO: check if thr is sleeping before waking it up. */
+ wakeup(thr);
+ HWT_THR_UNLOCK(thr);
+
+ if (refcount_release(&thr->refcnt))
+ hwt_thread_free(thr);
+ } while (1);
+}
+
+void
+hwt_ctx_free(struct hwt_context *ctx)
+{
+
+ if (ctx->mode == HWT_MODE_CPU)
+ hwt_ctx_free_cpus(ctx);
+ else
+ hwt_ctx_free_threads(ctx);
+
+	hwt_config_free(ctx);
+	hwt_ctx_ident_free(ctx->ident);
+	mtx_destroy(&ctx->rec_mtx);
+	mtx_destroy(&ctx->mtx);
+	free(ctx, M_HWT_CTX);
+}
+
+void
+hwt_ctx_put(struct hwt_context *ctx)
+{
+
+ refcount_release(&ctx->refcnt);
+}
+
+void
+hwt_ctx_load(void)
+{
+
+ ident_set_size = (1 << 8);
+ ident_set = bit_alloc(ident_set_size, M_HWT_CTX, M_WAITOK);
+ mtx_init(&ident_set_mutex, "ident set", NULL, MTX_DEF);
+}
+
+void
+hwt_ctx_unload(void)
+{
+
+ mtx_destroy(&ident_set_mutex);
+ free(ident_set, M_HWT_CTX);
+}
diff --git a/sys/dev/hwt/hwt_context.h b/sys/dev/hwt/hwt_context.h
new file mode 100644
index 000000000000..cafb197ae348
--- /dev/null
+++ b/sys/dev/hwt/hwt_context.h
@@ -0,0 +1,86 @@
+/*-
+ * Copyright (c) 2023-2025 Ruslan Bukin <br@bsdpad.com>
+ *
+ * This work was supported by Innovate UK project 105694, "Digital Security
+ * by Design (DSbD) Technology Platform Prototype".
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _DEV_HWT_HWT_CONTEXT_H_
+#define _DEV_HWT_HWT_CONTEXT_H_
+
+enum hwt_ctx_state {
+ CTX_STATE_STOPPED,
+ CTX_STATE_RUNNING,
+};
+
+struct hwt_context {
+ TAILQ_HEAD(, hwt_record_entry) records;
+
+ LIST_ENTRY(hwt_context) next_hch; /* Entry in contexthash. */
+ LIST_ENTRY(hwt_context) next_hwts; /* Entry in ho->hwts. */
+
+ int mode;
+ int ident;
+
+ int kqueue_fd;
+ struct thread *hwt_td;
+
+ /* CPU mode. */
+ cpuset_t cpu_map;
+ TAILQ_HEAD(, hwt_cpu) cpus;
+
+ /* Thread mode. */
+ struct proc *proc; /* Target proc. */
+ pid_t pid; /* Target pid. */
+ TAILQ_HEAD(, hwt_thread) threads;
+ int thread_counter;
+ int pause_on_mmap;
+
+	size_t bufsize;	/* Trace bufsize for each vm. */
+
+ void *config;
+ size_t config_size;
+ int config_version;
+
+ struct hwt_owner *hwt_owner;
+ struct hwt_backend *hwt_backend;
+
+ struct mtx mtx;
+ struct mtx rec_mtx;
+ enum hwt_ctx_state state;
+ int refcnt;
+};
+
+#define HWT_CTX_LOCK(ctx) mtx_lock_spin(&(ctx)->mtx)
+#define HWT_CTX_UNLOCK(ctx) mtx_unlock_spin(&(ctx)->mtx)
+#define HWT_CTX_ASSERT_LOCKED(ctx) mtx_assert(&(ctx)->mtx, MA_OWNED)
+
+int hwt_ctx_alloc(struct hwt_context **ctx0);
+void hwt_ctx_free(struct hwt_context *ctx);
+void hwt_ctx_put(struct hwt_context *ctx);
+
+void hwt_ctx_load(void);
+void hwt_ctx_unload(void);
+
+#endif /* !_DEV_HWT_HWT_CONTEXT_H_ */
diff --git a/sys/dev/hwt/hwt_contexthash.c b/sys/dev/hwt/hwt_contexthash.c
new file mode 100644
index 000000000000..5682b7d38e5e
--- /dev/null
+++ b/sys/dev/hwt/hwt_contexthash.c
@@ -0,0 +1,134 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2023-2025 Ruslan Bukin <br@bsdpad.com>
+ *
+ * This work was supported by Innovate UK project 105694, "Digital Security
+ * by Design (DSbD) Technology Platform Prototype".
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/refcount.h>
+#include <sys/hwt.h>
+
+#include <dev/hwt/hwt_context.h>
+#include <dev/hwt/hwt_contexthash.h>
+#include <dev/hwt/hwt_config.h>
+
+#define HWT_DEBUG
+#undef HWT_DEBUG
+
+#ifdef HWT_DEBUG
+#define dprintf(fmt, ...) printf(fmt, ##__VA_ARGS__)
+#else
+#define dprintf(fmt, ...)
+#endif
+
+#define HWT_CONTEXTHASH_SIZE 1024
+
+static MALLOC_DEFINE(M_HWT_CONTEXTHASH, "hwt_chash", "Hardware Trace");
+
+/*
+ * Hash function. Discard the lower 2 bits of the pointer since
+ * these are always zero for our uses. The hash multiplier is
+ * round((2^LONG_BIT) * ((sqrt(5)-1)/2)).
+ */
+
+#define _HWT_HM 11400714819323198486u /* hash multiplier */
+#define HWT_HASH_PTR(P, M) ((((unsigned long) (P) >> 2) * _HWT_HM) & (M))
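+/*
+ * (M) is the hash mask returned by hashinit(9), i.e. a power of two
+ * minus one, so the multiply-then-mask yields a valid bucket index.
+ */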
+
+static struct mtx hwt_contexthash_mtx;
+static u_long hwt_contexthashmask;
+static LIST_HEAD(hwt_contexthash, hwt_context) *hwt_contexthash;
+
+/*
+ * For use by hwt_switch_in/out() and hwt_record() only.
+ * This function returns with the ctx refcount acquired.
+ */
+struct hwt_context *
+hwt_contexthash_lookup(struct proc *p)
+{
+ struct hwt_contexthash *hch;
+ struct hwt_context *ctx;
+ int hindex;
+
+ hindex = HWT_HASH_PTR(p, hwt_contexthashmask);
+ hch = &hwt_contexthash[hindex];
+
+ HWT_CTXHASH_LOCK();
+ LIST_FOREACH(ctx, hch, next_hch) {
+ if (ctx->proc == p) {
+ refcount_acquire(&ctx->refcnt);
+ HWT_CTXHASH_UNLOCK();
+ return (ctx);
+ }
+ }
+ HWT_CTXHASH_UNLOCK();
+
+ return (NULL);
+}
+
+void
+hwt_contexthash_insert(struct hwt_context *ctx)
+{
+ struct hwt_contexthash *hch;
+ int hindex;
+
+ hindex = HWT_HASH_PTR(ctx->proc, hwt_contexthashmask);
+ hch = &hwt_contexthash[hindex];
+
+ HWT_CTXHASH_LOCK();
+ LIST_INSERT_HEAD(hch, ctx, next_hch);
+ HWT_CTXHASH_UNLOCK();
+}
+
+void
+hwt_contexthash_remove(struct hwt_context *ctx)
+{
+
+ HWT_CTXHASH_LOCK();
+ LIST_REMOVE(ctx, next_hch);
+ HWT_CTXHASH_UNLOCK();
+}
+
+void
+hwt_contexthash_load(void)
+{
+
+ hwt_contexthash = hashinit(HWT_CONTEXTHASH_SIZE, M_HWT_CONTEXTHASH,
+ &hwt_contexthashmask);
+ mtx_init(&hwt_contexthash_mtx, "hwt ctx hash", "hwt ctx", MTX_SPIN);
+}
+
+void
+hwt_contexthash_unload(void)
+{
+
+ mtx_destroy(&hwt_contexthash_mtx);
+ hashdestroy(hwt_contexthash, M_HWT_CONTEXTHASH, hwt_contexthashmask);
+}
diff --git a/sys/dev/hwt/hwt_contexthash.h b/sys/dev/hwt/hwt_contexthash.h
new file mode 100644
index 000000000000..c3ab7acd2a74
--- /dev/null
+++ b/sys/dev/hwt/hwt_contexthash.h
@@ -0,0 +1,42 @@
+/*-
+ * Copyright (c) 2023-2025 Ruslan Bukin <br@bsdpad.com>
+ *
+ * This work was supported by Innovate UK project 105694, "Digital Security
+ * by Design (DSbD) Technology Platform Prototype".
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _DEV_HWT_HWT_CONTEXTHASH_H_
+#define _DEV_HWT_HWT_CONTEXTHASH_H_
+
+struct hwt_context * hwt_contexthash_lookup(struct proc *p);
+void hwt_contexthash_insert(struct hwt_context *ctx);
+void hwt_contexthash_remove(struct hwt_context *ctx);
+
+void hwt_contexthash_load(void);
+void hwt_contexthash_unload(void);
+
+#define HWT_CTXHASH_LOCK() mtx_lock_spin(&hwt_contexthash_mtx)
+#define HWT_CTXHASH_UNLOCK() mtx_unlock_spin(&hwt_contexthash_mtx)
+
+#endif /* !_DEV_HWT_HWT_CONTEXTHASH_H_ */
diff --git a/sys/dev/hwt/hwt_cpu.c b/sys/dev/hwt/hwt_cpu.c
new file mode 100644
index 000000000000..7d38eb082e65
--- /dev/null
+++ b/sys/dev/hwt/hwt_cpu.c
@@ -0,0 +1,115 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2023-2025 Ruslan Bukin <br@bsdpad.com>
+ *
+ * This work was supported by Innovate UK project 105694, "Digital Security
+ * by Design (DSbD) Technology Platform Prototype".
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/param.h>
+#include <sys/proc.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/hwt.h>
+
+#include <vm/vm.h>
+
+#include <dev/hwt/hwt_hook.h>
+#include <dev/hwt/hwt_context.h>
+#include <dev/hwt/hwt_contexthash.h>
+#include <dev/hwt/hwt_config.h>
+#include <dev/hwt/hwt_thread.h>
+#include <dev/hwt/hwt_record.h>
+#include <dev/hwt/hwt_cpu.h>
+
+#define HWT_CPU_DEBUG
+#undef HWT_CPU_DEBUG
+
+#ifdef HWT_CPU_DEBUG
+#define dprintf(fmt, ...) printf(fmt, ##__VA_ARGS__)
+#else
+#define dprintf(fmt, ...)
+#endif
+
+static MALLOC_DEFINE(M_HWT_CPU, "hwt_cpu", "HWT cpu");
+
+struct hwt_cpu *
+hwt_cpu_alloc(void)
+{
+ struct hwt_cpu *cpu;
+
+ cpu = malloc(sizeof(struct hwt_cpu), M_HWT_CPU, M_WAITOK | M_ZERO);
+
+ return (cpu);
+}
+
+void
+hwt_cpu_free(struct hwt_cpu *cpu)
+{
+
+ free(cpu, M_HWT_CPU);
+}
+
+struct hwt_cpu *
+hwt_cpu_first(struct hwt_context *ctx)
+{
+ struct hwt_cpu *cpu;
+
+ HWT_CTX_ASSERT_LOCKED(ctx);
+
+ cpu = TAILQ_FIRST(&ctx->cpus);
+
+ KASSERT(cpu != NULL, ("cpu is NULL"));
+
+ return (cpu);
+}
+
+struct hwt_cpu *
+hwt_cpu_get(struct hwt_context *ctx, int cpu_id)
+{
+	struct hwt_cpu *cpu;
+
+	HWT_CTX_ASSERT_LOCKED(ctx);
+
+	TAILQ_FOREACH(cpu, &ctx->cpus, next) {
+		if (cpu->cpu_id == cpu_id)
+			return (cpu);
+	}
+
+ return (NULL);
+}
+
+void
+hwt_cpu_insert(struct hwt_context *ctx, struct hwt_cpu *cpu)
+{
+
+ HWT_CTX_ASSERT_LOCKED(ctx);
+
+ TAILQ_INSERT_TAIL(&ctx->cpus, cpu, next);
+}
diff --git a/sys/dev/hwt/hwt_cpu.h b/sys/dev/hwt/hwt_cpu.h
new file mode 100644
index 000000000000..92b89229b6e4
--- /dev/null
+++ b/sys/dev/hwt/hwt_cpu.h
@@ -0,0 +1,45 @@
+/*-
+ * Copyright (c) 2023-2025 Ruslan Bukin <br@bsdpad.com>
+ *
+ * This work was supported by Innovate UK project 105694, "Digital Security
+ * by Design (DSbD) Technology Platform Prototype".
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _DEV_HWT_HWT_CPU_H_
+#define _DEV_HWT_HWT_CPU_H_
+
+struct hwt_cpu {
+ int cpu_id;
+ struct hwt_vm *vm;
+ TAILQ_ENTRY(hwt_cpu) next;
+};
+
+struct hwt_cpu * hwt_cpu_alloc(void);
+void hwt_cpu_free(struct hwt_cpu *cpu);
+
+struct hwt_cpu * hwt_cpu_first(struct hwt_context *ctx);
+struct hwt_cpu * hwt_cpu_get(struct hwt_context *ctx, int cpu_id);
+void hwt_cpu_insert(struct hwt_context *ctx, struct hwt_cpu *cpu);
+
+#endif /* !_DEV_HWT_HWT_CPU_H_ */
diff --git a/sys/dev/hwt/hwt_hook.c b/sys/dev/hwt/hwt_hook.c
new file mode 100644
index 000000000000..258279b14f20
--- /dev/null
+++ b/sys/dev/hwt/hwt_hook.c
@@ -0,0 +1,323 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2023-2025 Ruslan Bukin <br@bsdpad.com>
+ *
+ * This work was supported by Innovate UK project 105694, "Digital Security
+ * by Design (DSbD) Technology Platform Prototype".
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/* Hardware Trace (HWT) framework. */
+
+#include <sys/param.h>
+#include <sys/proc.h>
+#include <sys/malloc.h>
+#include <sys/mman.h>
+#include <sys/refcount.h>
+#include <sys/hwt.h>
+
+#include <dev/hwt/hwt_hook.h>
+#include <dev/hwt/hwt_context.h>
+#include <dev/hwt/hwt_contexthash.h>
+#include <dev/hwt/hwt_config.h>
+#include <dev/hwt/hwt_thread.h>
+#include <dev/hwt/hwt_owner.h>
+#include <dev/hwt/hwt_backend.h>
+#include <dev/hwt/hwt_record.h>
+#include <dev/hwt/hwt_vm.h>
+
+#define HWT_DEBUG
+#undef HWT_DEBUG
+
+#ifdef HWT_DEBUG
+#define dprintf(fmt, ...) printf(fmt, ##__VA_ARGS__)
+#else
+#define dprintf(fmt, ...)
+#endif
+
+static void
+hwt_switch_in(struct thread *td)
+{
+ struct hwt_context *ctx;
+ struct hwt_thread *thr;
+ struct proc *p;
+ int cpu_id;
+
+ p = td->td_proc;
+
+ cpu_id = PCPU_GET(cpuid);
+
+ ctx = hwt_contexthash_lookup(p);
+ if (ctx == NULL)
+ return;
+
+ if (ctx->state != CTX_STATE_RUNNING) {
+ hwt_ctx_put(ctx);
+ return;
+ }
+
+ thr = hwt_thread_lookup(ctx, td);
+ if (thr == NULL) {
+ hwt_ctx_put(ctx);
+ return;
+ }
+
+ dprintf("%s: thr %p index %d tid %d on cpu_id %d\n", __func__, thr,
+ thr->thread_id, td->td_tid, cpu_id);
+
+ hwt_backend_configure(ctx, cpu_id, thr->thread_id);
+ hwt_backend_enable(ctx, cpu_id);
+
+ hwt_ctx_put(ctx);
+}
+
+static void
+hwt_switch_out(struct thread *td)
+{
+ struct hwt_context *ctx;
+ struct hwt_thread *thr;
+ struct proc *p;
+ int cpu_id;
+
+ p = td->td_proc;
+
+ cpu_id = PCPU_GET(cpuid);
+
+ ctx = hwt_contexthash_lookup(p);
+ if (ctx == NULL)
+ return;
+
+ if (ctx->state != CTX_STATE_RUNNING) {
+ hwt_ctx_put(ctx);
+ return;
+ }
+ thr = hwt_thread_lookup(ctx, td);
+ if (thr == NULL) {
+ hwt_ctx_put(ctx);
+ return;
+ }
+
+ dprintf("%s: thr %p index %d tid %d on cpu_id %d\n", __func__, thr,
+ thr->thread_id, td->td_tid, cpu_id);
+
+ hwt_backend_disable(ctx, cpu_id);
+
+ hwt_ctx_put(ctx);
+}
+
+static void
+hwt_hook_thread_exit(struct thread *td)
+{
+ struct hwt_context *ctx;
+ struct hwt_thread *thr;
+ struct proc *p;
+ int cpu_id;
+
+ p = td->td_proc;
+
+ cpu_id = PCPU_GET(cpuid);
+
+ ctx = hwt_contexthash_lookup(p);
+ if (ctx == NULL)
+ return;
+
+ thr = hwt_thread_lookup(ctx, td);
+ if (thr == NULL) {
+ hwt_ctx_put(ctx);
+ return;
+ }
+
+ thr->state = HWT_THREAD_STATE_EXITED;
+
+ dprintf("%s: thr %p index %d tid %d on cpu_id %d\n", __func__, thr,
+ thr->thread_id, td->td_tid, cpu_id);
+
+ if (ctx->state == CTX_STATE_RUNNING)
+ hwt_backend_disable(ctx, cpu_id);
+
+ hwt_ctx_put(ctx);
+}
+
+static void
+hwt_hook_mmap(struct thread *td)
+{
+ struct hwt_context *ctx;
+ struct hwt_thread *thr;
+ struct proc *p;
+ int pause;
+
+ p = td->td_proc;
+
+ ctx = hwt_contexthash_lookup(p);
+ if (ctx == NULL)
+ return;
+
+	/* The ctx could be in any state here. */
+
+ pause = ctx->pause_on_mmap ? 1 : 0;
+
+ thr = hwt_thread_lookup(ctx, td);
+ if (thr == NULL) {
+ hwt_ctx_put(ctx);
+ return;
+ }
+
+ /*
+	 * msleep(9) atomically releases the mtx lock, so take a refcount
+	 * on thr to ensure it is not destroyed while we sleep.
+	 * It cannot be destroyed before this point since we still hold
+	 * the ctx refcount.
+ */
+ refcount_acquire(&thr->refcnt);
+ hwt_ctx_put(ctx);
+
+ if (pause) {
+ HWT_THR_LOCK(thr);
+ msleep(thr, &thr->mtx, PCATCH, "hwt-mmap", 0);
+ HWT_THR_UNLOCK(thr);
+ }
+
+ if (refcount_release(&thr->refcnt))
+ hwt_thread_free(thr);
+}
+
+static int
+hwt_hook_thread_create(struct thread *td)
+{
+ struct hwt_record_entry *entry;
+ struct hwt_context *ctx;
+ struct hwt_thread *thr;
+ char path[MAXPATHLEN];
+ size_t bufsize;
+ struct proc *p;
+ int thread_id, kva_req;
+ int error;
+
+ p = td->td_proc;
+
+ /* Step 1. Get CTX and collect information needed. */
+ ctx = hwt_contexthash_lookup(p);
+ if (ctx == NULL)
+ return (ENXIO);
+ thread_id = atomic_fetchadd_int(&ctx->thread_counter, 1);
+ bufsize = ctx->bufsize;
+ kva_req = ctx->hwt_backend->kva_req;
+ sprintf(path, "hwt_%d_%d", ctx->ident, thread_id);
+ hwt_ctx_put(ctx);
+
+ /* Step 2. Allocate some memory without holding ctx ref. */
+ error = hwt_thread_alloc(&thr, path, bufsize, kva_req);
+ if (error) {
+ printf("%s: could not allocate thread, error %d\n",
+ __func__, error);
+ return (error);
+ }
+
+ entry = hwt_record_entry_alloc();
+ entry->record_type = HWT_RECORD_THREAD_CREATE;
+ entry->thread_id = thread_id;
+
+ /* Step 3. Get CTX once again. */
+ ctx = hwt_contexthash_lookup(p);
+ if (ctx == NULL) {
+ hwt_record_entry_free(entry);
+ hwt_thread_free(thr);
+		/* The stale ctx->thread_counter increment is harmless. */
+ return (ENXIO);
+ }
+ /* Allocate backend-specific thread data. */
+ error = hwt_backend_thread_alloc(ctx, thr);
+	if (error != 0) {
+		dprintf("%s: failed to allocate backend thread data\n",
+		    __func__);
+		hwt_record_entry_free(entry);
+		hwt_thread_free(thr);
+		hwt_ctx_put(ctx);
+		return (error);
+	}
+
+ thr->vm->ctx = ctx;
+ thr->ctx = ctx;
+ thr->backend = ctx->hwt_backend;
+ thr->thread_id = thread_id;
+ thr->td = td;
+
+ HWT_CTX_LOCK(ctx);
+ hwt_thread_insert(ctx, thr, entry);
+ HWT_CTX_UNLOCK(ctx);
+
+ /* Notify userspace. */
+ hwt_record_wakeup(ctx);
+
+ hwt_ctx_put(ctx);
+
+ return (0);
+}
+
+static void
+hwt_hook_handler(struct thread *td, int func, void *arg)
+{
+ struct proc *p;
+
+ p = td->td_proc;
+ if ((p->p_flag2 & P2_HWT) == 0)
+ return;
+
+ switch (func) {
+ case HWT_SWITCH_IN:
+ hwt_switch_in(td);
+ break;
+ case HWT_SWITCH_OUT:
+ hwt_switch_out(td);
+ break;
+ case HWT_THREAD_CREATE:
+ hwt_hook_thread_create(td);
+ break;
+ case HWT_THREAD_SET_NAME:
+ /* TODO. */
+ break;
+ case HWT_THREAD_EXIT:
+ hwt_hook_thread_exit(td);
+ break;
+ case HWT_EXEC:
+ case HWT_MMAP:
+ hwt_record_td(td, arg, M_WAITOK | M_ZERO);
+ hwt_hook_mmap(td);
+ break;
+ case HWT_RECORD:
+ hwt_record_td(td, arg, M_WAITOK | M_ZERO);
+ break;
+	}
+}
+
+void
+hwt_hook_load(void)
+{
+
+ hwt_hook = hwt_hook_handler;
+}
+
+void
+hwt_hook_unload(void)
+{
+
+ hwt_hook = NULL;
+}
diff --git a/sys/dev/hwt/hwt_hook.h b/sys/dev/hwt/hwt_hook.h
new file mode 100644
index 000000000000..a8eccba3ec43
--- /dev/null
+++ b/sys/dev/hwt/hwt_hook.h
@@ -0,0 +1,56 @@
+/*-
+ * Copyright (c) 2023-2025 Ruslan Bukin <br@bsdpad.com>
+ *
+ * This work was supported by Innovate UK project 105694, "Digital Security
+ * by Design (DSbD) Technology Platform Prototype".
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/hwt_record.h>
+
+#ifndef _DEV_HWT_HWT_HOOK_H_
+#define _DEV_HWT_HWT_HOOK_H_
+
+#define HWT_SWITCH_IN 0
+#define HWT_SWITCH_OUT 1
+#define HWT_THREAD_EXIT 2
+#define HWT_THREAD_CREATE 3
+#define HWT_THREAD_SET_NAME 4
+#define HWT_RECORD 5
+#define HWT_MMAP 6
+#define HWT_EXEC 7
+
+#define HWT_CALL_HOOK(td, func, arg) \
+do { \
+ if (hwt_hook != NULL) \
+ (hwt_hook)((td), (func), (arg)); \
+} while (0)
+
+#define HWT_HOOK_INSTALLED (hwt_hook != NULL)
+
+extern void (*hwt_hook)(struct thread *td, int func, void *arg);
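+
+/*
+ * Illustrative call site (sketch): subsystems invoke the hook, when
+ * installed, at events of interest, e.g. from the context switch path:
+ *
+ *	HWT_CALL_HOOK(curthread, HWT_SWITCH_IN, NULL);
+ *
+ * The handler returns early for threads whose process does not have
+ * the P2_HWT flag set.
+ */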
+
+void hwt_hook_load(void);
+void hwt_hook_unload(void);
+
+#endif /* !_DEV_HWT_HWT_HOOK_H_ */
diff --git a/sys/dev/hwt/hwt_intr.h b/sys/dev/hwt/hwt_intr.h
new file mode 100644
index 000000000000..e601969f001c
--- /dev/null
+++ b/sys/dev/hwt/hwt_intr.h
@@ -0,0 +1,33 @@
+/*-
+ * Copyright (c) 2023-2025 Bojan Novković <bnovkov@freebsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _DEV_HWT_HWT_INTR_H_
+#define _DEV_HWT_HWT_INTR_H_
+
+#include <machine/frame.h>
+
+extern int (*hwt_intr)(struct trapframe *tf);
+
+#endif /* !_DEV_HWT_HWT_INTR_H_ */
diff --git a/sys/dev/hwt/hwt_ioctl.c b/sys/dev/hwt/hwt_ioctl.c
new file mode 100644
index 000000000000..592db4931bb4
--- /dev/null
+++ b/sys/dev/hwt/hwt_ioctl.c
@@ -0,0 +1,445 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2023-2025 Ruslan Bukin <br@bsdpad.com>
+ *
+ * This work was supported by Innovate UK project 105694, "Digital Security
+ * by Design (DSbD) Technology Platform Prototype".
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/* Hardware Trace (HWT) framework. */
+
+#include <sys/param.h>
+#include <sys/proc.h>
+#include <sys/ioccom.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/mman.h>
+#include <sys/mutex.h>
+#include <sys/refcount.h>
+#include <sys/rwlock.h>
+#include <sys/smp.h>
+#include <sys/hwt.h>
+
+#include <dev/hwt/hwt_hook.h>
+#include <dev/hwt/hwt_context.h>
+#include <dev/hwt/hwt_contexthash.h>
+#include <dev/hwt/hwt_config.h>
+#include <dev/hwt/hwt_cpu.h>
+#include <dev/hwt/hwt_thread.h>
+#include <dev/hwt/hwt_owner.h>
+#include <dev/hwt/hwt_ownerhash.h>
+#include <dev/hwt/hwt_backend.h>
+#include <dev/hwt/hwt_record.h>
+#include <dev/hwt/hwt_ioctl.h>
+#include <dev/hwt/hwt_vm.h>
+
+#define HWT_IOCTL_DEBUG
+#undef HWT_IOCTL_DEBUG
+
+#ifdef HWT_IOCTL_DEBUG
+#define dprintf(fmt, ...) printf(fmt, ##__VA_ARGS__)
+#else
+#define dprintf(fmt, ...)
+#endif
+
+/* No real reason for this limit; it is just a sanity check. */
+#define HWT_MAXBUFSIZE (32UL * 1024 * 1024 * 1024) /* 32 GB */
+
+static MALLOC_DEFINE(M_HWT_IOCTL, "hwt_ioctl", "Hardware Trace");
+
+/*
+ * Check if owner process *o can trace target process *t.
+ */
+
+static int
+hwt_priv_check(struct proc *o, struct proc *t)
+{
+ struct ucred *oc, *tc;
+ int error;
+ int i;
+
+ PROC_LOCK(o);
+ oc = o->p_ucred;
+ crhold(oc);
+ PROC_UNLOCK(o);
+
+ PROC_LOCK_ASSERT(t, MA_OWNED);
+ tc = t->p_ucred;
+ crhold(tc);
+
+ error = 0;
+
+ /*
+ * The effective uid of the HWT owner should match at least one
+ * of the effective / real / saved uids of the target process.
+ */
+
+ if (oc->cr_uid != tc->cr_uid &&
+ oc->cr_uid != tc->cr_svuid &&
+ oc->cr_uid != tc->cr_ruid) {
+ error = EPERM;
+ goto done;
+ }
+
+ /*
+	 * Every one of the target's group ids must be in the owner's
+ * group list.
+ */
+ for (i = 0; i < tc->cr_ngroups; i++)
+ if (!groupmember(tc->cr_groups[i], oc)) {
+ error = EPERM;
+ goto done;
+ }
+
+	/* Check the real and saved GIDs too. */
+ if (!groupmember(tc->cr_rgid, oc) ||
+ !groupmember(tc->cr_svgid, oc)) {
+ error = EPERM;
+ goto done;
+ }
+
+done:
+ crfree(tc);
+ crfree(oc);
+
+ return (error);
+}
+
+static int
+hwt_ioctl_alloc_mode_thread(struct thread *td, struct hwt_owner *ho,
+ struct hwt_backend *backend, struct hwt_alloc *halloc)
+{
+ struct thread **threads, *td1;
+ struct hwt_record_entry *entry;
+ struct hwt_context *ctx, *ctx1;
+ struct hwt_thread *thr;
+ char path[MAXPATHLEN];
+ struct proc *p;
+ int thread_id;
+ int error;
+ int cnt;
+ int i;
+
+	/* Check if the owner has this pid configured already. */
+ ctx = hwt_owner_lookup_ctx(ho, halloc->pid);
+ if (ctx)
+ return (EEXIST);
+
+ /* Allocate a new HWT context. */
+ error = hwt_ctx_alloc(&ctx);
+ if (error)
+ return (error);
+ ctx->bufsize = halloc->bufsize;
+ ctx->pid = halloc->pid;
+ ctx->hwt_backend = backend;
+ ctx->hwt_owner = ho;
+ ctx->mode = HWT_MODE_THREAD;
+ ctx->hwt_td = td;
+ ctx->kqueue_fd = halloc->kqueue_fd;
+
+ error = copyout(&ctx->ident, halloc->ident, sizeof(int));
+ if (error) {
+ hwt_ctx_free(ctx);
+ return (error);
+ }
+
+ /* Now get the victim proc. */
+ p = pfind(halloc->pid);
+ if (p == NULL) {
+ hwt_ctx_free(ctx);
+ return (ENXIO);
+ }
+
+ /* Ensure we can trace it. */
+ error = hwt_priv_check(td->td_proc, p);
+ if (error) {
+ PROC_UNLOCK(p);
+ hwt_ctx_free(ctx);
+ return (error);
+ }
+
+ /* Ensure it is not being traced already. */
+ ctx1 = hwt_contexthash_lookup(p);
+ if (ctx1) {
+ refcount_release(&ctx1->refcnt);
+ PROC_UNLOCK(p);
+ hwt_ctx_free(ctx);
+ return (EEXIST);
+ }
+
+ /* Allocate hwt threads and buffers. */
+
+ cnt = 0;
+
+ FOREACH_THREAD_IN_PROC(p, td1) {
+ cnt += 1;
+ }
+
+ KASSERT(cnt > 0, ("no threads"));
+
+ threads = malloc(sizeof(struct thread *) * cnt, M_HWT_IOCTL,
+ M_NOWAIT | M_ZERO);
+ if (threads == NULL) {
+ PROC_UNLOCK(p);
+ hwt_ctx_free(ctx);
+ return (ENOMEM);
+ }
+
+ i = 0;
+
+ FOREACH_THREAD_IN_PROC(p, td1) {
+ threads[i++] = td1;
+ }
+
+ ctx->proc = p;
+ PROC_UNLOCK(p);
+
+ for (i = 0; i < cnt; i++) {
+ thread_id = atomic_fetchadd_int(&ctx->thread_counter, 1);
+ sprintf(path, "hwt_%d_%d", ctx->ident, thread_id);
+
+ error = hwt_thread_alloc(&thr, path, ctx->bufsize,
+ ctx->hwt_backend->kva_req);
+ if (error) {
+ free(threads, M_HWT_IOCTL);
+ hwt_ctx_free(ctx);
+ return (error);
+ }
+ /* Allocate backend-specific thread data. */
+ error = hwt_backend_thread_alloc(ctx, thr);
+		if (error != 0) {
+			dprintf("%s: failed to allocate thread backend data\n",
+			    __func__);
+			hwt_thread_free(thr);
+			free(threads, M_HWT_IOCTL);
+			hwt_ctx_free(ctx);
+			return (error);
+		}
+
+ /*
+ * Insert a THREAD_CREATE record so userspace picks up
+ * the thread's tracing buffers.
+ */
+ entry = hwt_record_entry_alloc();
+ entry->record_type = HWT_RECORD_THREAD_CREATE;
+ entry->thread_id = thread_id;
+
+ thr->vm->ctx = ctx;
+ thr->td = threads[i];
+ thr->ctx = ctx;
+ thr->backend = ctx->hwt_backend;
+ thr->thread_id = thread_id;
+
+ HWT_CTX_LOCK(ctx);
+ hwt_thread_insert(ctx, thr, entry);
+ HWT_CTX_UNLOCK(ctx);
+ }
+
+ free(threads, M_HWT_IOCTL);
+
+ error = hwt_backend_init(ctx);
+ if (error) {
+ hwt_ctx_free(ctx);
+ return (error);
+ }
+
+ /* hwt_owner_insert_ctx? */
+ mtx_lock(&ho->mtx);
+ LIST_INSERT_HEAD(&ho->hwts, ctx, next_hwts);
+ mtx_unlock(&ho->mtx);
+
+ /*
+	 * Hooks are in action after this point, but the ctx is not yet
+	 * in the RUNNING state.
+ */
+ hwt_contexthash_insert(ctx);
+
+ p = pfind(halloc->pid);
+ if (p) {
+ p->p_flag2 |= P2_HWT;
+ PROC_UNLOCK(p);
+ }
+
+ return (0);
+}
+
+static int
+hwt_ioctl_alloc_mode_cpu(struct thread *td, struct hwt_owner *ho,
+ struct hwt_backend *backend, struct hwt_alloc *halloc)
+{
+ struct hwt_context *ctx;
+ struct hwt_cpu *cpu;
+ struct hwt_vm *vm;
+ char path[MAXPATHLEN];
+ size_t cpusetsize;
+ cpuset_t cpu_map;
+ int cpu_count = 0;
+ int cpu_id;
+ int error;
+
+ CPU_ZERO(&cpu_map);
+ cpusetsize = min(halloc->cpusetsize, sizeof(cpuset_t));
+ error = copyin(halloc->cpu_map, &cpu_map, cpusetsize);
+ if (error)
+ return (error);
+
+ CPU_FOREACH_ISSET(cpu_id, &cpu_map) {
+#ifdef SMP
+ /* Ensure CPU is not halted. */
+ if (CPU_ISSET(cpu_id, &hlt_cpus_mask))
+ return (ENXIO);
+#endif
+#if 0
+		/* TODO: Check if the owner has this cpu configured already. */
+ ctx = hwt_owner_lookup_ctx_by_cpu(ho, halloc->cpu);
+ if (ctx)
+ return (EEXIST);
+#endif
+
+ cpu_count++;
+ }
+
+ if (cpu_count == 0)
+ return (ENODEV);
+
+ /* Allocate a new HWT context. */
+ error = hwt_ctx_alloc(&ctx);
+ if (error)
+ return (error);
+ ctx->bufsize = halloc->bufsize;
+ ctx->hwt_backend = backend;
+ ctx->hwt_owner = ho;
+ ctx->mode = HWT_MODE_CPU;
+ ctx->cpu_map = cpu_map;
+ ctx->hwt_td = td;
+ ctx->kqueue_fd = halloc->kqueue_fd;
+
+ error = copyout(&ctx->ident, halloc->ident, sizeof(int));
+ if (error) {
+ hwt_ctx_free(ctx);
+ return (error);
+ }
+
+ CPU_FOREACH_ISSET(cpu_id, &cpu_map) {
+ sprintf(path, "hwt_%d_%d", ctx->ident, cpu_id);
+ error = hwt_vm_alloc(ctx->bufsize, ctx->hwt_backend->kva_req,
+ path, &vm);
+ if (error) {
+ /* TODO: remove all allocated cpus. */
+ hwt_ctx_free(ctx);
+ return (error);
+ }
+
+ cpu = hwt_cpu_alloc();
+ cpu->cpu_id = cpu_id;
+ cpu->vm = vm;
+
+ vm->cpu = cpu;
+ vm->ctx = ctx;
+
+ HWT_CTX_LOCK(ctx);
+ hwt_cpu_insert(ctx, cpu);
+ HWT_CTX_UNLOCK(ctx);
+ }
+
+ error = hwt_backend_init(ctx);
+ if (error) {
+ /* TODO: remove all allocated cpus. */
+ hwt_ctx_free(ctx);
+ return (error);
+ }
+
+ /* hwt_owner_insert_ctx? */
+ mtx_lock(&ho->mtx);
+ LIST_INSERT_HEAD(&ho->hwts, ctx, next_hwts);
+ mtx_unlock(&ho->mtx);
+
+ hwt_record_kernel_objects(ctx);
+
+ return (0);
+}
+
+static int
+hwt_ioctl_alloc(struct thread *td, struct hwt_alloc *halloc)
+{
+ char backend_name[HWT_BACKEND_MAXNAMELEN];
+ struct hwt_backend *backend;
+ struct hwt_owner *ho;
+ int error;
+
+ if (halloc->bufsize > HWT_MAXBUFSIZE)
+ return (EINVAL);
+ if (halloc->bufsize % PAGE_SIZE)
+ return (EINVAL);
+ if (halloc->backend_name == NULL)
+ return (EINVAL);
+
+ error = copyinstr(halloc->backend_name, (void *)backend_name,
+ HWT_BACKEND_MAXNAMELEN, NULL);
+ if (error)
+ return (error);
+
+ backend = hwt_backend_lookup(backend_name);
+ if (backend == NULL)
+ return (ENODEV);
+
+ /* First get the owner. */
+ ho = hwt_ownerhash_lookup(td->td_proc);
+ if (ho == NULL) {
+ /* Create a new owner. */
+ ho = hwt_owner_alloc(td->td_proc);
+ if (ho == NULL)
+ return (ENOMEM);
+ hwt_ownerhash_insert(ho);
+ }
+
+ switch (halloc->mode) {
+ case HWT_MODE_THREAD:
+ error = hwt_ioctl_alloc_mode_thread(td, ho, backend, halloc);
+ break;
+ case HWT_MODE_CPU:
+ error = hwt_ioctl_alloc_mode_cpu(td, ho, backend, halloc);
+ break;
+ default:
+ error = ENXIO;
+	}
+
+ return (error);
+}
+
+int
+hwt_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags,
+ struct thread *td)
+{
+ int error;
+
+ switch (cmd) {
+ case HWT_IOC_ALLOC:
+ /* Allocate HWT context. */
+ error = hwt_ioctl_alloc(td, (struct hwt_alloc *)addr);
+ return (error);
+ default:
+ return (ENXIO);
+	}
+}
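+
+/*
+ * Userspace sketch (assumes the struct hwt_alloc layout defined in
+ * sys/hwt.h, which is not part of this file):
+ *
+ *	struct hwt_alloc al = {
+ *		.mode = HWT_MODE_THREAD,
+ *		.pid = target_pid,
+ *		.bufsize = 16 * 1024 * 1024,
+ *		.backend_name = "foo",
+ *		.ident = &ident,
+ *		.kqueue_fd = kq,
+ *	};
+ *	error = ioctl(fd, HWT_IOC_ALLOC, &al);
+ *
+ * bufsize must be a multiple of PAGE_SIZE and at most HWT_MAXBUFSIZE;
+ * the backend name "foo" is hypothetical.
+ */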
diff --git a/sys/dev/hwt/hwt_ioctl.h b/sys/dev/hwt/hwt_ioctl.h
new file mode 100644
index 000000000000..ce4270dc0d44
--- /dev/null
+++ b/sys/dev/hwt/hwt_ioctl.h
@@ -0,0 +1,35 @@
+/*-
+ * Copyright (c) 2023-2025 Ruslan Bukin <br@bsdpad.com>
+ *
+ * This work was supported by Innovate UK project 105694, "Digital Security
+ * by Design (DSbD) Technology Platform Prototype".
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _DEV_HWT_HWT_IOCTL_H
+#define _DEV_HWT_HWT_IOCTL_H
+
+int hwt_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags,
+ struct thread *td);
+
+#endif /* !_DEV_HWT_HWT_IOCTL_H */
diff --git a/sys/dev/hwt/hwt_owner.c b/sys/dev/hwt/hwt_owner.c
new file mode 100644
index 000000000000..3c82040578de
--- /dev/null
+++ b/sys/dev/hwt/hwt_owner.c
@@ -0,0 +1,157 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2023-2025 Ruslan Bukin <br@bsdpad.com>
+ *
+ * This work was supported by Innovate UK project 105694, "Digital Security
+ * by Design (DSbD) Technology Platform Prototype".
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/mman.h>
+#include <sys/mutex.h>
+#include <sys/refcount.h>
+#include <sys/rwlock.h>
+#include <sys/hwt.h>
+
+#include <dev/hwt/hwt_hook.h>
+#include <dev/hwt/hwt_context.h>
+#include <dev/hwt/hwt_contexthash.h>
+#include <dev/hwt/hwt_config.h>
+#include <dev/hwt/hwt_cpu.h>
+#include <dev/hwt/hwt_thread.h>
+#include <dev/hwt/hwt_owner.h>
+#include <dev/hwt/hwt_ownerhash.h>
+#include <dev/hwt/hwt_backend.h>
+#include <dev/hwt/hwt_vm.h>
+#include <dev/hwt/hwt_record.h>
+
+#define HWT_DEBUG
+#undef HWT_DEBUG
+
+#ifdef HWT_DEBUG
+#define dprintf(fmt, ...) printf(fmt, ##__VA_ARGS__)
+#else
+#define dprintf(fmt, ...)
+#endif
+
+static MALLOC_DEFINE(M_HWT_OWNER, "hwt_owner", "Hardware Trace");
+
+struct hwt_context *
+hwt_owner_lookup_ctx(struct hwt_owner *ho, pid_t pid)
+{
+ struct hwt_context *ctx;
+
+ mtx_lock(&ho->mtx);
+ LIST_FOREACH(ctx, &ho->hwts, next_hwts) {
+ if (ctx->pid == pid) {
+ mtx_unlock(&ho->mtx);
+ return (ctx);
+ }
+ }
+ mtx_unlock(&ho->mtx);
+
+ return (NULL);
+}
+
+#if 0
+struct hwt_context *
+hwt_owner_lookup_ctx_by_cpu(struct hwt_owner *ho, int cpu)
+{
+ struct hwt_context *ctx;
+
+ mtx_lock(&ho->mtx);
+ LIST_FOREACH(ctx, &ho->hwts, next_hwts) {
+ if (ctx->cpu == cpu) {
+ mtx_unlock(&ho->mtx);
+ return (ctx);
+ }
+ }
+ mtx_unlock(&ho->mtx);
+
+ return (NULL);
+}
+#endif
+
+struct hwt_owner *
+hwt_owner_alloc(struct proc *p)
+{
+ struct hwt_owner *ho;
+
+ ho = malloc(sizeof(struct hwt_owner), M_HWT_OWNER,
+ M_WAITOK | M_ZERO);
+ ho->p = p;
+
+ LIST_INIT(&ho->hwts);
+ mtx_init(&ho->mtx, "hwts", NULL, MTX_DEF);
+
+ return (ho);
+}
+
+void
+hwt_owner_shutdown(struct hwt_owner *ho)
+{
+ struct hwt_context *ctx;
+
+ dprintf("%s: stopping hwt owner\n", __func__);
+
+ while (1) {
+ mtx_lock(&ho->mtx);
+ ctx = LIST_FIRST(&ho->hwts);
+ if (ctx)
+ LIST_REMOVE(ctx, next_hwts);
+ mtx_unlock(&ho->mtx);
+
+ if (ctx == NULL)
+ break;
+
+ if (ctx->mode == HWT_MODE_THREAD)
+ hwt_contexthash_remove(ctx);
+
+		/*
+		 * A hook could still be using this ctx at this point.
+		 */
+
+ HWT_CTX_LOCK(ctx);
+ ctx->state = 0;
+ HWT_CTX_UNLOCK(ctx);
+
+		/* Wait until all hook invocations have completed. */
+ while (refcount_load(&ctx->refcnt) > 0)
+ continue;
+
+		/*
+		 * Note that a thread could still be sleeping in msleep(9).
+		 */
+
+ hwt_backend_deinit(ctx);
+ hwt_record_free_all(ctx);
+ hwt_ctx_free(ctx);
+ }
+
+ hwt_ownerhash_remove(ho);
+ free(ho, M_HWT_OWNER);
+}
diff --git a/sys/dev/hwt/hwt_owner.h b/sys/dev/hwt/hwt_owner.h
new file mode 100644
index 000000000000..2ac569a55050
--- /dev/null
+++ b/sys/dev/hwt/hwt_owner.h
@@ -0,0 +1,45 @@
+/*-
+ * Copyright (c) 2023-2025 Ruslan Bukin <br@bsdpad.com>
+ *
+ * This work was supported by Innovate UK project 105694, "Digital Security
+ * by Design (DSbD) Technology Platform Prototype".
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _DEV_HWT_HWT_OWNER_H_
+#define _DEV_HWT_HWT_OWNER_H_
+
+struct hwt_owner {
+ struct proc *p;
+ struct mtx mtx; /* Protects hwts. */
+ LIST_HEAD(, hwt_context) hwts; /* Owned HWTs. */
+ LIST_ENTRY(hwt_owner) next; /* Entry in hwt owner hash. */
+};
+
+struct hwt_context * hwt_owner_lookup_ctx(struct hwt_owner *ho, pid_t pid);
+struct hwt_owner * hwt_owner_alloc(struct proc *p);
+void hwt_owner_shutdown(struct hwt_owner *ho);
+struct hwt_context * hwt_owner_lookup_ctx_by_cpu(struct hwt_owner *ho, int cpu);
+
+#endif /* !_DEV_HWT_HWT_OWNER_H_ */
diff --git a/sys/dev/hwt/hwt_ownerhash.c b/sys/dev/hwt/hwt_ownerhash.c
new file mode 100644
index 000000000000..7c9e2232bac4
--- /dev/null
+++ b/sys/dev/hwt/hwt_ownerhash.c
@@ -0,0 +1,141 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2023-2025 Ruslan Bukin <br@bsdpad.com>
+ *
+ * This work was supported by Innovate UK project 105694, "Digital Security
+ * by Design (DSbD) Technology Platform Prototype".
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/mman.h>
+#include <sys/mutex.h>
+#include <sys/rwlock.h>
+#include <sys/hwt.h>
+
+#include <dev/hwt/hwt_owner.h>
+#include <dev/hwt/hwt_ownerhash.h>
+
+#define HWT_DEBUG
+#undef HWT_DEBUG
+
+#ifdef HWT_DEBUG
+#define dprintf(fmt, ...) printf(fmt, ##__VA_ARGS__)
+#else
+#define dprintf(fmt, ...)
+#endif
+
+#define HWT_OWNERHASH_SIZE 1024
+
+static MALLOC_DEFINE(M_HWT_OWNERHASH, "hwt_ohash", "Hardware Trace");
+
+/*
+ * Hash function. Discard the lower 2 bits of the pointer since
+ * these are always zero for our uses. The hash multiplier is
+ * round((2^LONG_BIT) * ((sqrt(5)-1)/2)).
+ */
+
+#define _HWT_HM 11400714819323198486u /* hash multiplier */
+#define HWT_HASH_PTR(P, M) ((((unsigned long) (P) >> 2) * _HWT_HM) & (M))
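+/*
+ * Illustrative example (not part of the driver): with the default
+ * table of HWT_OWNERHASH_SIZE (1024) buckets, hashinit() yields a
+ * mask of 1023, so HWT_HASH_PTR(p, 1023) drops the two always-zero
+ * low pointer bits, multiplies by _HWT_HM modulo 2^64, and keeps the
+ * low 10 bits of the product as the bucket index.
+ */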
+
+static struct mtx hwt_ownerhash_mtx;
+static u_long hwt_ownerhashmask;
+static LIST_HEAD(hwt_ownerhash, hwt_owner) *hwt_ownerhash;
+
+struct hwt_owner *
+hwt_ownerhash_lookup(struct proc *p)
+{
+ struct hwt_ownerhash *hoh;
+ struct hwt_owner *ho;
+ int hindex;
+
+ hindex = HWT_HASH_PTR(p, hwt_ownerhashmask);
+ hoh = &hwt_ownerhash[hindex];
+
+ HWT_OWNERHASH_LOCK();
+ LIST_FOREACH(ho, hoh, next) {
+ if (ho->p == p) {
+ HWT_OWNERHASH_UNLOCK();
+ return (ho);
+ }
+ }
+ HWT_OWNERHASH_UNLOCK();
+
+ return (NULL);
+}
+
+void
+hwt_ownerhash_insert(struct hwt_owner *ho)
+{
+ struct hwt_ownerhash *hoh;
+ int hindex;
+
+ hindex = HWT_HASH_PTR(ho->p, hwt_ownerhashmask);
+ hoh = &hwt_ownerhash[hindex];
+
+ HWT_OWNERHASH_LOCK();
+ LIST_INSERT_HEAD(hoh, ho, next);
+ HWT_OWNERHASH_UNLOCK();
+}
+
+void
+hwt_ownerhash_remove(struct hwt_owner *ho)
+{
+
+ /* Destroy hwt owner. */
+ HWT_OWNERHASH_LOCK();
+ LIST_REMOVE(ho, next);
+ HWT_OWNERHASH_UNLOCK();
+}
+
+void
+hwt_ownerhash_load(void)
+{
+
+ hwt_ownerhash = hashinit(HWT_OWNERHASH_SIZE, M_HWT_OWNERHASH,
+ &hwt_ownerhashmask);
+ mtx_init(&hwt_ownerhash_mtx, "hwt-owner-hash", "hwt-owner", MTX_DEF);
+}
+
+void
+hwt_ownerhash_unload(void)
+{
+ struct hwt_ownerhash *hoh;
+ struct hwt_owner *ho, *tmp;
+
+ HWT_OWNERHASH_LOCK();
+ for (hoh = hwt_ownerhash;
+ hoh <= &hwt_ownerhash[hwt_ownerhashmask];
+ hoh++) {
+ LIST_FOREACH_SAFE(ho, hoh, next, tmp) {
+ /* TODO: module is in use ? */
+ }
+ }
+ HWT_OWNERHASH_UNLOCK();
+
+ mtx_destroy(&hwt_ownerhash_mtx);
+ hashdestroy(hwt_ownerhash, M_HWT_OWNERHASH, hwt_ownerhashmask);
+}
diff --git a/sys/dev/hwt/hwt_ownerhash.h b/sys/dev/hwt/hwt_ownerhash.h
new file mode 100644
index 000000000000..4a7bc958d0f7
--- /dev/null
+++ b/sys/dev/hwt/hwt_ownerhash.h
@@ -0,0 +1,42 @@
+/*-
+ * Copyright (c) 2023-2025 Ruslan Bukin <br@bsdpad.com>
+ *
+ * This work was supported by Innovate UK project 105694, "Digital Security
+ * by Design (DSbD) Technology Platform Prototype".
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _DEV_HWT_HWT_OWNERHASH_H_
+#define _DEV_HWT_HWT_OWNERHASH_H_
+
+struct hwt_owner * hwt_ownerhash_lookup(struct proc *p);
+void hwt_ownerhash_insert(struct hwt_owner *ho);
+void hwt_ownerhash_remove(struct hwt_owner *ho);
+
+void hwt_ownerhash_load(void);
+void hwt_ownerhash_unload(void);
+
+#define HWT_OWNERHASH_LOCK() mtx_lock(&hwt_ownerhash_mtx)
+#define HWT_OWNERHASH_UNLOCK() mtx_unlock(&hwt_ownerhash_mtx)
+
+#endif /* !_DEV_HWT_HWT_OWNERHASH_H_ */
diff --git a/sys/dev/hwt/hwt_record.c b/sys/dev/hwt/hwt_record.c
new file mode 100644
index 000000000000..850ea6f8c5be
--- /dev/null
+++ b/sys/dev/hwt/hwt_record.c
@@ -0,0 +1,302 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2023-2025 Ruslan Bukin <br@bsdpad.com>
+ *
+ * This work was supported by Innovate UK project 105694, "Digital Security
+ * by Design (DSbD) Technology Platform Prototype".
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/param.h>
+#include <sys/proc.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/hwt.h>
+#include <sys/linker.h>
+#include <sys/pmckern.h> /* linker_hwpmc_list_objects */
+
+#include <vm/vm.h>
+#include <vm/uma.h>
+
+#include <dev/hwt/hwt_hook.h>
+#include <dev/hwt/hwt_context.h>
+#include <dev/hwt/hwt_contexthash.h>
+#include <dev/hwt/hwt_config.h>
+#include <dev/hwt/hwt_thread.h>
+#include <dev/hwt/hwt_record.h>
+
+#define HWT_RECORD_DEBUG
+#undef HWT_RECORD_DEBUG
+
+#ifdef HWT_RECORD_DEBUG
+#define dprintf(fmt, ...) printf(fmt, ##__VA_ARGS__)
+#else
+#define dprintf(fmt, ...)
+#endif
+
+static MALLOC_DEFINE(M_HWT_RECORD, "hwt_record", "Hardware Trace");
+static uma_zone_t record_zone = NULL;
+
+static struct hwt_record_entry *
+hwt_record_clone(struct hwt_record_entry *ent, int flags)
+{
+ struct hwt_record_entry *entry;
+
+ entry = uma_zalloc(record_zone, flags);
+ if (entry == NULL)
+ return (NULL);
+ memcpy(entry, ent, sizeof(struct hwt_record_entry));
+ switch (ent->record_type) {
+ case HWT_RECORD_MMAP:
+ case HWT_RECORD_EXECUTABLE:
+ case HWT_RECORD_KERNEL:
+ entry->fullpath = strdup(ent->fullpath, M_HWT_RECORD);
+ break;
+ default:
+ break;
+ }
+
+ return (entry);
+}
+
+static void
+hwt_record_to_user(struct hwt_record_entry *ent,
+ struct hwt_record_user_entry *usr)
+{
+ usr->record_type = ent->record_type;
+ switch (ent->record_type) {
+ case HWT_RECORD_MMAP:
+ case HWT_RECORD_EXECUTABLE:
+ case HWT_RECORD_KERNEL:
+ usr->addr = ent->addr;
+ usr->baseaddr = ent->baseaddr;
+ strncpy(usr->fullpath, ent->fullpath, MAXPATHLEN);
+ break;
+ case HWT_RECORD_BUFFER:
+ usr->buf_id = ent->buf_id;
+ usr->curpage = ent->curpage;
+ usr->offset = ent->offset;
+ break;
+ case HWT_RECORD_THREAD_CREATE:
+ case HWT_RECORD_THREAD_SET_NAME:
+ usr->thread_id = ent->thread_id;
+ break;
+ default:
+ break;
+ }
+}
+
+void
+hwt_record_load(void)
+{
+ record_zone = uma_zcreate("HWT records",
+ sizeof(struct hwt_record_entry), NULL, NULL, NULL, NULL, 0, 0);
+}
+
+void
+hwt_record_unload(void)
+{
+ uma_zdestroy(record_zone);
+}
+
+void
+hwt_record_ctx(struct hwt_context *ctx, struct hwt_record_entry *ent, int flags)
+{
+ struct hwt_record_entry *entry;
+
+ KASSERT(ent != NULL, ("ent is NULL"));
+ entry = hwt_record_clone(ent, flags);
+ if (entry == NULL) {
+ /* XXX: Not sure what to do here other than logging an error. */
+ return;
+ }
+
+ HWT_CTX_LOCK(ctx);
+ TAILQ_INSERT_TAIL(&ctx->records, entry, next);
+ HWT_CTX_UNLOCK(ctx);
+ hwt_record_wakeup(ctx);
+}
+
+void
+hwt_record_td(struct thread *td, struct hwt_record_entry *ent, int flags)
+{
+ struct hwt_record_entry *entry;
+ struct hwt_context *ctx;
+ struct proc *p;
+
+ p = td->td_proc;
+
+ KASSERT(ent != NULL, ("ent is NULL"));
+ entry = hwt_record_clone(ent, flags);
+ if (entry == NULL) {
+ /* XXX: Not sure what to do here other than logging an error. */
+ return;
+ }
+ ctx = hwt_contexthash_lookup(p);
+ if (ctx == NULL) {
+ hwt_record_entry_free(entry);
+ return;
+ }
+ HWT_CTX_LOCK(ctx);
+ TAILQ_INSERT_TAIL(&ctx->records, entry, next);
+ HWT_CTX_UNLOCK(ctx);
+ hwt_record_wakeup(ctx);
+
+ hwt_ctx_put(ctx);
+}
+
+struct hwt_record_entry *
+hwt_record_entry_alloc(void)
+{
+ return (uma_zalloc(record_zone, M_WAITOK | M_ZERO));
+}
+
+void
+hwt_record_entry_free(struct hwt_record_entry *entry)
+{
+
+ switch (entry->record_type) {
+ case HWT_RECORD_MMAP:
+ case HWT_RECORD_EXECUTABLE:
+ case HWT_RECORD_KERNEL:
+ free(entry->fullpath, M_HWT_RECORD);
+ break;
+ default:
+ break;
+ }
+
+ uma_zfree(record_zone, entry);
+}
+
+static int
+hwt_record_grab(struct hwt_context *ctx,
+ struct hwt_record_user_entry *user_entry, int nitems_req, int wait)
+{
+ struct hwt_record_entry *entry;
+ int i;
+
+ if (wait) {
+ mtx_lock(&ctx->rec_mtx);
+		if (TAILQ_FIRST(&ctx->records) == NULL) {
+			/*
+			 * Wait until we have new records.  PCATCH lets a
+			 * signal interrupt the sleep; we then fall through
+			 * and return whatever has been queued.
+			 */
+ msleep(ctx, &ctx->rec_mtx, PCATCH, "recsnd", 0);
+ }
+ mtx_unlock(&ctx->rec_mtx);
+ }
+
+ for (i = 0; i < nitems_req; i++) {
+ HWT_CTX_LOCK(ctx);
+ entry = TAILQ_FIRST(&ctx->records);
+ if (entry)
+ TAILQ_REMOVE_HEAD(&ctx->records, next);
+ HWT_CTX_UNLOCK(ctx);
+
+ if (entry == NULL)
+ break;
+ hwt_record_to_user(entry, &user_entry[i]);
+ hwt_record_entry_free(entry);
+ }
+
+ return (i);
+}
+
+void
+hwt_record_free_all(struct hwt_context *ctx)
+{
+ struct hwt_record_entry *entry;
+
+ while (1) {
+ HWT_CTX_LOCK(ctx);
+ entry = TAILQ_FIRST(&ctx->records);
+ if (entry)
+ TAILQ_REMOVE_HEAD(&ctx->records, next);
+ HWT_CTX_UNLOCK(ctx);
+
+ if (entry == NULL)
+ break;
+
+ hwt_record_entry_free(entry);
+ }
+}
+
+int
+hwt_record_send(struct hwt_context *ctx, struct hwt_record_get *record_get)
+{
+ struct hwt_record_user_entry *user_entry;
+ int nitems_req;
+ int error;
+ int i;
+
+ nitems_req = 0;
+
+ error = copyin(record_get->nentries, &nitems_req, sizeof(int));
+ if (error)
+ return (error);
+
+ if (nitems_req < 1 || nitems_req > 1024)
+ return (ENXIO);
+
+ user_entry = malloc(sizeof(struct hwt_record_user_entry) * nitems_req,
+ M_HWT_RECORD, M_WAITOK | M_ZERO);
+
+ i = hwt_record_grab(ctx, user_entry, nitems_req, record_get->wait);
+ if (i > 0)
+ error = copyout(user_entry, record_get->records,
+ sizeof(struct hwt_record_user_entry) * i);
+
+ if (error == 0)
+ error = copyout(&i, record_get->nentries, sizeof(int));
+
+ free(user_entry, M_HWT_RECORD);
+
+ return (error);
+}
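+
+/*
+ * A sketch of the expected userspace flow for draining records through
+ * the HWT_IOC_RECORD_GET ioctl (illustrative only; the struct layout
+ * and ioctl number come from <sys/hwt.h>, and fd is the trace cdev):
+ *
+ *	struct hwt_record_user_entry entries[64];
+ *	struct hwt_record_get rget;
+ *	int n = 64;
+ *
+ *	rget.records = entries;
+ *	rget.nentries = &n;	-- in: capacity; out: entries returned
+ *	rget.wait = 1;		-- block until a record is queued
+ *	if (ioctl(fd, HWT_IOC_RECORD_GET, &rget) == 0)
+ *		consume entries[0..n-1];
+ */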
+
+void
+hwt_record_kernel_objects(struct hwt_context *ctx)
+{
+ struct hwt_record_entry *entry;
+ struct pmckern_map_in *kobase;
+ int i;
+
+ kobase = linker_hwpmc_list_objects();
+ for (i = 0; kobase[i].pm_file != NULL; i++) {
+ entry = hwt_record_entry_alloc();
+ entry->record_type = HWT_RECORD_KERNEL;
+ entry->fullpath = strdup(kobase[i].pm_file, M_HWT_RECORD);
+ entry->addr = kobase[i].pm_address;
+
+ HWT_CTX_LOCK(ctx);
+ TAILQ_INSERT_HEAD(&ctx->records, entry, next);
+ HWT_CTX_UNLOCK(ctx);
+ }
+ free(kobase, M_LINKER);
+}
+
+void
+hwt_record_wakeup(struct hwt_context *ctx)
+{
+ wakeup(ctx);
+}
diff --git a/sys/dev/hwt/hwt_record.h b/sys/dev/hwt/hwt_record.h
new file mode 100644
index 000000000000..3f347ca67d54
--- /dev/null
+++ b/sys/dev/hwt/hwt_record.h
@@ -0,0 +1,47 @@
+/*-
+ * Copyright (c) 2023-2025 Ruslan Bukin <br@bsdpad.com>
+ *
+ * This work was supported by Innovate UK project 105694, "Digital Security
+ * by Design (DSbD) Technology Platform Prototype".
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _DEV_HWT_HWT_RECORD_H_
+#define _DEV_HWT_HWT_RECORD_H_
+
+struct hwt_record_get;
+
+void hwt_record_load(void);
+void hwt_record_unload(void);
+
+int hwt_record_send(struct hwt_context *ctx, struct hwt_record_get *record_get);
+void hwt_record_td(struct thread *td, struct hwt_record_entry *ent, int flags);
+void hwt_record_ctx(struct hwt_context *ctx, struct hwt_record_entry *ent,
+ int flags);
+struct hwt_record_entry * hwt_record_entry_alloc(void);
+void hwt_record_entry_free(struct hwt_record_entry *entry);
+void hwt_record_kernel_objects(struct hwt_context *ctx);
+void hwt_record_free_all(struct hwt_context *ctx);
+void hwt_record_wakeup(struct hwt_context *ctx);
+
+#endif /* !_DEV_HWT_HWT_RECORD_H_ */
diff --git a/sys/dev/hwt/hwt_thread.c b/sys/dev/hwt/hwt_thread.c
new file mode 100644
index 000000000000..827c068a681f
--- /dev/null
+++ b/sys/dev/hwt/hwt_thread.c
@@ -0,0 +1,162 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2023-2025 Ruslan Bukin <br@bsdpad.com>
+ *
+ * This work was supported by Innovate UK project 105694, "Digital Security
+ * by Design (DSbD) Technology Platform Prototype".
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/refcount.h>
+#include <sys/rwlock.h>
+#include <sys/hwt.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_page.h>
+#include <vm/vm_object.h>
+#include <vm/vm_pager.h>
+#include <vm/vm_pageout.h>
+#include <vm/vm_phys.h>
+
+#include <dev/hwt/hwt_hook.h>
+#include <dev/hwt/hwt_context.h>
+#include <dev/hwt/hwt_contexthash.h>
+#include <dev/hwt/hwt_config.h>
+#include <dev/hwt/hwt_thread.h>
+#include <dev/hwt/hwt_owner.h>
+#include <dev/hwt/hwt_ownerhash.h>
+#include <dev/hwt/hwt_backend.h>
+#include <dev/hwt/hwt_vm.h>
+#include <dev/hwt/hwt_record.h>
+
+#define HWT_THREAD_DEBUG
+#undef HWT_THREAD_DEBUG
+
+#ifdef HWT_THREAD_DEBUG
+#define dprintf(fmt, ...) printf(fmt, ##__VA_ARGS__)
+#else
+#define dprintf(fmt, ...)
+#endif
+
+static MALLOC_DEFINE(M_HWT_THREAD, "hwt_thread", "Hardware Trace");
+
+struct hwt_thread *
+hwt_thread_first(struct hwt_context *ctx)
+{
+ struct hwt_thread *thr;
+
+ HWT_CTX_ASSERT_LOCKED(ctx);
+
+ thr = TAILQ_FIRST(&ctx->threads);
+
+ KASSERT(thr != NULL, ("thr is NULL"));
+
+ return (thr);
+}
+
+/*
+ * For use by hwt_switch_in/out() only.
+ */
+struct hwt_thread *
+hwt_thread_lookup(struct hwt_context *ctx, struct thread *td)
+{
+ struct hwt_thread *thr;
+
+	/* The caller holds a reference on ctx at this point. */
+
+ HWT_CTX_LOCK(ctx);
+ TAILQ_FOREACH(thr, &ctx->threads, next) {
+ if (thr->td == td) {
+ HWT_CTX_UNLOCK(ctx);
+ return (thr);
+ }
+ }
+ HWT_CTX_UNLOCK(ctx);
+
+	/*
+	 * If we get here, the thread-creation hook failed to allocate
+	 * a hwt_thread for this thread.
+	 */
+
+ return (NULL);
+}
+
+int
+hwt_thread_alloc(struct hwt_thread **thr0, char *path, size_t bufsize,
+ int kva_req)
+{
+ struct hwt_thread *thr;
+ struct hwt_vm *vm;
+ int error;
+
+ error = hwt_vm_alloc(bufsize, kva_req, path, &vm);
+ if (error)
+ return (error);
+
+ thr = malloc(sizeof(struct hwt_thread), M_HWT_THREAD,
+ M_WAITOK | M_ZERO);
+ thr->vm = vm;
+
+ mtx_init(&thr->mtx, "thr", NULL, MTX_DEF);
+
+ refcount_init(&thr->refcnt, 1);
+
+ vm->thr = thr;
+
+ *thr0 = thr;
+
+ return (0);
+}
+
+void
+hwt_thread_free(struct hwt_thread *thr)
+{
+
+ hwt_vm_free(thr->vm);
+ /* Free private backend data, if any. */
+ if (thr->private != NULL)
+ hwt_backend_thread_free(thr);
+ free(thr, M_HWT_THREAD);
+}
+
+/*
+ * Inserts a new thread and a thread-creation record into the context,
+ * and notifies userspace about the newly created thread.
+ */
+void
+hwt_thread_insert(struct hwt_context *ctx, struct hwt_thread *thr,
+ struct hwt_record_entry *entry)
+{
+
+ HWT_CTX_ASSERT_LOCKED(ctx);
+ TAILQ_INSERT_TAIL(&ctx->threads, thr, next);
+ TAILQ_INSERT_TAIL(&ctx->records, entry, next);
+}
diff --git a/sys/dev/hwt/hwt_thread.h b/sys/dev/hwt/hwt_thread.h
new file mode 100644
index 000000000000..ccc29aeb3494
--- /dev/null
+++ b/sys/dev/hwt/hwt_thread.h
@@ -0,0 +1,64 @@
+/*-
+ * Copyright (c) 2023-2025 Ruslan Bukin <br@bsdpad.com>
+ *
+ * This work was supported by Innovate UK project 105694, "Digital Security
+ * by Design (DSbD) Technology Platform Prototype".
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _DEV_HWT_HWT_THREAD_H_
+#define _DEV_HWT_HWT_THREAD_H_
+
+struct hwt_record_entry;
+
+struct hwt_thread {
+ struct hwt_vm *vm;
+ struct hwt_context *ctx;
+ struct hwt_backend *backend;
+ struct thread *td;
+ TAILQ_ENTRY(hwt_thread) next;
+ int thread_id;
+ int state;
+#define HWT_THREAD_STATE_EXITED (1 << 0)
+ struct mtx mtx;
+ u_int refcnt;
+ int cpu_id; /* last cpu_id */
+ void *private; /* backend-specific private data */
+};
+
+/* Thread allocation. */
+int hwt_thread_alloc(struct hwt_thread **thr0, char *path, size_t bufsize,
+ int kva_req);
+void hwt_thread_free(struct hwt_thread *thr);
+
+/* Thread list mgt. */
+void hwt_thread_insert(struct hwt_context *ctx, struct hwt_thread *thr,
+    struct hwt_record_entry *entry);
+struct hwt_thread * hwt_thread_first(struct hwt_context *ctx);
+struct hwt_thread * hwt_thread_lookup(struct hwt_context *ctx,
+ struct thread *td);
+
+#define HWT_THR_LOCK(thr) mtx_lock(&(thr)->mtx)
+#define HWT_THR_UNLOCK(thr) mtx_unlock(&(thr)->mtx)
+#define HWT_THR_ASSERT_LOCKED(thr) mtx_assert(&(thr)->mtx, MA_OWNED)
+
+#endif /* !_DEV_HWT_HWT_THREAD_H_ */
diff --git a/sys/dev/hwt/hwt_vm.c b/sys/dev/hwt/hwt_vm.c
new file mode 100644
index 000000000000..6c55e218dcec
--- /dev/null
+++ b/sys/dev/hwt/hwt_vm.c
@@ -0,0 +1,503 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2023-2025 Ruslan Bukin <br@bsdpad.com>
+ *
+ * This work was supported by Innovate UK project 105694, "Digital Security
+ * by Design (DSbD) Technology Platform Prototype".
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/param.h>
+#include <sys/ioccom.h>
+#include <sys/conf.h>
+#include <sys/proc.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/mman.h>
+#include <sys/refcount.h>
+#include <sys/rwlock.h>
+#include <sys/hwt.h>
+#include <sys/smp.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_param.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_page.h>
+#include <vm/vm_object.h>
+#include <vm/vm_pager.h>
+#include <vm/vm_pageout.h>
+#include <vm/vm_phys.h>
+
+#include <dev/hwt/hwt_hook.h>
+#include <dev/hwt/hwt_context.h>
+#include <dev/hwt/hwt_contexthash.h>
+#include <dev/hwt/hwt_config.h>
+#include <dev/hwt/hwt_cpu.h>
+#include <dev/hwt/hwt_owner.h>
+#include <dev/hwt/hwt_ownerhash.h>
+#include <dev/hwt/hwt_thread.h>
+#include <dev/hwt/hwt_backend.h>
+#include <dev/hwt/hwt_vm.h>
+#include <dev/hwt/hwt_record.h>
+
+#define HWT_VM_DEBUG
+#undef HWT_VM_DEBUG
+
+#ifdef HWT_VM_DEBUG
+#define dprintf(fmt, ...) printf(fmt, ##__VA_ARGS__)
+#else
+#define dprintf(fmt, ...)
+#endif
+
+static MALLOC_DEFINE(M_HWT_VM, "hwt_vm", "Hardware Trace");
+
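+/*
+ * Pager methods for the per-buffer cdev object.  The trace pages are
+ * allocated and inserted into the object up front in
+ * hwt_vm_alloc_pages(), so mappings should always find resident pages
+ * and the fault handler is not expected to do any real work.
+ */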
+static int
+hwt_vm_fault(vm_object_t vm_obj, vm_ooffset_t offset,
+ int prot, vm_page_t *mres)
+{
+
+ return (0);
+}
+
+static int
+hwt_vm_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot,
+ vm_ooffset_t foff, struct ucred *cred, u_short *color)
+{
+
+ *color = 0;
+
+ return (0);
+}
+
+static void
+hwt_vm_dtor(void *handle)
+{
+
+}
+
+static struct cdev_pager_ops hwt_vm_pager_ops = {
+ .cdev_pg_fault = hwt_vm_fault,
+ .cdev_pg_ctor = hwt_vm_ctor,
+ .cdev_pg_dtor = hwt_vm_dtor
+};
+
+static int
+hwt_vm_alloc_pages(struct hwt_vm *vm, int kva_req)
+{
+ vm_paddr_t low, high, boundary;
+ vm_memattr_t memattr;
+#ifdef __aarch64__
+ uintptr_t va;
+#endif
+ int alignment;
+ vm_page_t m;
+ int pflags;
+ int tries;
+ int i;
+
+ alignment = PAGE_SIZE;
+ low = 0;
+ high = -1UL;
+ boundary = 0;
+ pflags = VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_ZERO;
+ memattr = VM_MEMATTR_DEVICE;
+
+ if (kva_req) {
+ vm->kvaddr = kva_alloc(vm->npages * PAGE_SIZE);
+ if (!vm->kvaddr)
+ return (ENOMEM);
+ }
+
+ vm->obj = cdev_pager_allocate(vm, OBJT_MGTDEVICE,
+ &hwt_vm_pager_ops, vm->npages * PAGE_SIZE, PROT_READ, 0,
+ curthread->td_ucred);
+
+ for (i = 0; i < vm->npages; i++) {
+ tries = 0;
+retry:
+ m = vm_page_alloc_noobj_contig(pflags, 1, low, high,
+ alignment, boundary, memattr);
+ if (m == NULL) {
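+			/*
+			 * Allocation failed: try to reclaim a contiguous
+			 * run of pages and retry, up to three times,
+			 * before giving up with ENOMEM.
+			 */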
+ if (tries < 3) {
+ if (!vm_page_reclaim_contig(pflags, 1, low,
+ high, alignment, boundary))
+ vm_wait(NULL);
+ tries++;
+ goto retry;
+ }
+
+ return (ENOMEM);
+ }
+
+#if 0
+ /* TODO: could not clean device memory on arm64. */
+ if ((m->flags & PG_ZERO) == 0)
+ pmap_zero_page(m);
+#endif
+
+#ifdef __aarch64__
+ va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
+ cpu_dcache_wb_range((void *)va, PAGE_SIZE);
+#endif
+
+ m->valid = VM_PAGE_BITS_ALL;
+ m->oflags &= ~VPO_UNMANAGED;
+ m->flags |= PG_FICTITIOUS;
+ vm->pages[i] = m;
+
+ VM_OBJECT_WLOCK(vm->obj);
+ vm_page_insert(m, vm->obj, i);
+ if (kva_req)
+ pmap_qenter(vm->kvaddr + i * PAGE_SIZE, &m, 1);
+ VM_OBJECT_WUNLOCK(vm->obj);
+ }
+
+ return (0);
+}
+
+static int
+hwt_vm_open(struct cdev *cdev, int oflags, int devtype, struct thread *td)
+{
+
+ dprintf("%s\n", __func__);
+
+ return (0);
+}
+
+static int
+hwt_vm_mmap_single(struct cdev *cdev, vm_ooffset_t *offset,
+ vm_size_t mapsize, struct vm_object **objp, int nprot)
+{
+ struct hwt_vm *vm;
+
+ vm = cdev->si_drv1;
+
+ if (nprot != PROT_READ || *offset != 0)
+ return (ENXIO);
+
+ vm_object_reference(vm->obj);
+ *objp = vm->obj;
+
+ return (0);
+}
+
+static void
+hwt_vm_start_cpu_mode(struct hwt_context *ctx)
+{
+ cpuset_t enable_cpus;
+ int cpu_id;
+
+ CPU_ZERO(&enable_cpus);
+
+ CPU_FOREACH_ISSET(cpu_id, &ctx->cpu_map) {
+#ifdef SMP
+ /* Ensure CPU is not halted. */
+ if (CPU_ISSET(cpu_id, &hlt_cpus_mask))
+ continue;
+#endif
+
+ hwt_backend_configure(ctx, cpu_id, cpu_id);
+
+ CPU_SET(cpu_id, &enable_cpus);
+ }
+
+ if (ctx->hwt_backend->ops->hwt_backend_enable_smp == NULL) {
+ CPU_FOREACH_ISSET(cpu_id, &enable_cpus)
+ hwt_backend_enable(ctx, cpu_id);
+ } else {
+ /* Some backends require enabling all CPUs at once. */
+ hwt_backend_enable_smp(ctx);
+ }
+}
+
+static int
+hwt_vm_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags,
+ struct thread *td)
+{
+ struct hwt_record_get *rget;
+ struct hwt_set_config *sconf;
+ struct hwt_bufptr_get *ptr_get;
+ struct hwt_svc_buf *sbuf;
+
+ struct hwt_context *ctx;
+ struct hwt_vm *vm;
+ struct hwt_owner *ho;
+
+ vm_offset_t offset;
+ int ident;
+ int error;
+ uint64_t data = 0;
+ void *data2;
+ size_t data_size;
+ int data_version;
+
+ vm = dev->si_drv1;
+ KASSERT(vm != NULL, ("si_drv1 is NULL"));
+
+ ctx = vm->ctx;
+
+ /* Ensure process is registered owner of this HWT. */
+ ho = hwt_ownerhash_lookup(td->td_proc);
+ if (ho == NULL)
+ return (ENXIO);
+
+ if (ctx->hwt_owner != ho)
+ return (EPERM);
+
+ switch (cmd) {
+ case HWT_IOC_START:
+ dprintf("%s: start tracing\n", __func__);
+
+ HWT_CTX_LOCK(ctx);
+ if (ctx->state == CTX_STATE_RUNNING) {
+			/* Already running? */
+ HWT_CTX_UNLOCK(ctx);
+ return (ENXIO);
+ }
+ ctx->state = CTX_STATE_RUNNING;
+ HWT_CTX_UNLOCK(ctx);
+
+ if (ctx->mode == HWT_MODE_CPU)
+ hwt_vm_start_cpu_mode(ctx);
+ else {
+ /*
+ * Tracing backend will be configured and enabled
+ * during hook invocation. See hwt_hook.c.
+ */
+ }
+
+ break;
+
+ case HWT_IOC_STOP:
+ if (ctx->state == CTX_STATE_STOPPED)
+ return (ENXIO);
+ hwt_backend_stop(ctx);
+ ctx->state = CTX_STATE_STOPPED;
+ break;
+
+ case HWT_IOC_RECORD_GET:
+ rget = (struct hwt_record_get *)addr;
+ error = hwt_record_send(ctx, rget);
+ if (error)
+ return (error);
+ break;
+
+ case HWT_IOC_SET_CONFIG:
+		if (ctx->state == CTX_STATE_RUNNING)
+			return (ENXIO);
+ sconf = (struct hwt_set_config *)addr;
+ error = hwt_config_set(td, ctx, sconf);
+ if (error)
+ return (error);
+ ctx->pause_on_mmap = sconf->pause_on_mmap ? 1 : 0;
+ break;
+
+ case HWT_IOC_WAKEUP:
+
+ if (ctx->mode == HWT_MODE_CPU)
+ return (ENXIO);
+
+ KASSERT(vm->thr != NULL, ("thr is NULL"));
+
+ wakeup(vm->thr);
+
+ break;
+
+ case HWT_IOC_BUFPTR_GET:
+ ptr_get = (struct hwt_bufptr_get *)addr;
+
+ error = hwt_backend_read(ctx, vm, &ident, &offset, &data);
+ if (error)
+ return (error);
+
+ if (ptr_get->ident)
+ error = copyout(&ident, ptr_get->ident, sizeof(int));
+ if (error)
+ return (error);
+
+ if (ptr_get->offset)
+ error = copyout(&offset, ptr_get->offset,
+ sizeof(vm_offset_t));
+ if (error)
+ return (error);
+
+ if (ptr_get->data)
+ error = copyout(&data, ptr_get->data, sizeof(uint64_t));
+ if (error)
+ return (error);
+
+ break;
+
+ case HWT_IOC_SVC_BUF:
+		if (ctx->state == CTX_STATE_STOPPED)
+			return (ENXIO);
+
+ sbuf = (struct hwt_svc_buf *)addr;
+ data_size = sbuf->data_size;
+ data_version = sbuf->data_version;
+
+ if (data_size == 0 || data_size > PAGE_SIZE)
+ return (EINVAL);
+
+ data2 = malloc(data_size, M_HWT_VM, M_WAITOK | M_ZERO);
+ error = copyin(sbuf->data, data2, data_size);
+ if (error) {
+ free(data2, M_HWT_VM);
+ return (error);
+ }
+
+ error = hwt_backend_svc_buf(ctx, data2, data_size, data_version);
+ if (error) {
+ free(data2, M_HWT_VM);
+ return (error);
+ }
+
+ free(data2, M_HWT_VM);
+ break;
+
+ default:
+ break;
+ }
+
+ return (0);
+}
+
+static struct cdevsw hwt_vm_cdevsw = {
+ .d_version = D_VERSION,
+ .d_name = "hwt",
+ .d_open = hwt_vm_open,
+ .d_mmap_single = hwt_vm_mmap_single,
+ .d_ioctl = hwt_vm_ioctl,
+};
+
+static int
+hwt_vm_create_cdev(struct hwt_vm *vm, char *path)
+{
+ struct make_dev_args args;
+ int error;
+
+ dprintf("%s: path %s\n", __func__, path);
+
+ make_dev_args_init(&args);
+ args.mda_devsw = &hwt_vm_cdevsw;
+ args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
+ args.mda_uid = UID_ROOT;
+ args.mda_gid = GID_WHEEL;
+ args.mda_mode = 0660;
+ args.mda_si_drv1 = vm;
+
+ error = make_dev_s(&args, &vm->cdev, "%s", path);
+ if (error != 0)
+ return (error);
+
+ return (0);
+}
+
+static int
+hwt_vm_alloc_buffers(struct hwt_vm *vm, int kva_req)
+{
+ int error;
+
+ vm->pages = malloc(sizeof(struct vm_page *) * vm->npages,
+ M_HWT_VM, M_WAITOK | M_ZERO);
+
+ error = hwt_vm_alloc_pages(vm, kva_req);
+ if (error) {
+ printf("%s: could not alloc pages\n", __func__);
+ return (error);
+ }
+
+ return (0);
+}
+
+static void
+hwt_vm_destroy_buffers(struct hwt_vm *vm)
+{
+ vm_page_t m;
+ int i;
+
+ if (vm->ctx->hwt_backend->kva_req && vm->kvaddr != 0) {
+ pmap_qremove(vm->kvaddr, vm->npages);
+ kva_free(vm->kvaddr, vm->npages * PAGE_SIZE);
+ }
+ VM_OBJECT_WLOCK(vm->obj);
+ for (i = 0; i < vm->npages; i++) {
+ m = vm->pages[i];
+ if (m == NULL)
+ break;
+
+ vm_page_busy_acquire(m, 0);
+ cdev_pager_free_page(vm->obj, m);
+ m->flags &= ~PG_FICTITIOUS;
+ vm_page_unwire_noq(m);
+ vm_page_free(m);
+ }
+ vm_pager_deallocate(vm->obj);
+ VM_OBJECT_WUNLOCK(vm->obj);
+
+ free(vm->pages, M_HWT_VM);
+}
+
+void
+hwt_vm_free(struct hwt_vm *vm)
+{
+
+ dprintf("%s\n", __func__);
+
+ if (vm->cdev)
+ destroy_dev_sched(vm->cdev);
+ hwt_vm_destroy_buffers(vm);
+ free(vm, M_HWT_VM);
+}
+
+int
+hwt_vm_alloc(size_t bufsize, int kva_req, char *path, struct hwt_vm **vm0)
+{
+ struct hwt_vm *vm;
+ int error;
+
+ vm = malloc(sizeof(struct hwt_vm), M_HWT_VM, M_WAITOK | M_ZERO);
+ vm->npages = bufsize / PAGE_SIZE;
+
+ error = hwt_vm_alloc_buffers(vm, kva_req);
+ if (error) {
+ free(vm, M_HWT_VM);
+ return (error);
+ }
+
+ error = hwt_vm_create_cdev(vm, path);
+ if (error) {
+ hwt_vm_free(vm);
+ return (error);
+ }
+
+ *vm0 = vm;
+
+ return (0);
+}
diff --git a/sys/dev/hwt/hwt_vm.h b/sys/dev/hwt/hwt_vm.h
new file mode 100644
index 000000000000..5002bd43e093
--- /dev/null
+++ b/sys/dev/hwt/hwt_vm.h
@@ -0,0 +1,47 @@
+/*-
+ * Copyright (c) 2023-2025 Ruslan Bukin <br@bsdpad.com>
+ *
+ * This work was supported by Innovate UK project 105694, "Digital Security
+ * by Design (DSbD) Technology Platform Prototype".
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _DEV_HWT_HWT_VM_H_
+#define _DEV_HWT_HWT_VM_H_
+
+struct hwt_vm {
+ vm_page_t *pages;
+ int npages;
+ vm_object_t obj;
+ vm_offset_t kvaddr;
+ struct cdev *cdev;
+
+ struct hwt_context *ctx;
+ struct hwt_cpu *cpu; /* cpu mode only. */
+ struct hwt_thread *thr; /* thr mode only. */
+};
+
+int hwt_vm_alloc(size_t bufsize, int kva_req, char *path, struct hwt_vm **vm0);
+void hwt_vm_free(struct hwt_vm *vm);
+
+#endif /* !_DEV_HWT_HWT_VM_H_ */
diff --git a/sys/dev/hyperv/vmbus/vmbus_chan.c b/sys/dev/hyperv/vmbus/vmbus_chan.c
index 189a3e66a039..7ea60a499c72 100644
--- a/sys/dev/hyperv/vmbus/vmbus_chan.c
+++ b/sys/dev/hyperv/vmbus/vmbus_chan.c
@@ -1555,7 +1555,7 @@ vmbus_event_flags_proc(struct vmbus_softc *sc, volatile u_long *event_flags,
continue;
flags = atomic_swap_long(&event_flags[f], 0);
- chid_base = f << VMBUS_EVTFLAG_SHIFT;
+ chid_base = f * VMBUS_EVTFLAG_LEN;
while ((chid_ofs = ffsl(flags)) != 0) {
struct vmbus_channel *chan;
@@ -1599,7 +1599,7 @@ vmbus_event_proc_compat(struct vmbus_softc *sc, int cpu)
eventf = VMBUS_PCPU_GET(sc, event_flags, cpu) + VMBUS_SINT_MESSAGE;
if (atomic_testandclear_long(&eventf->evt_flags[0], 0)) {
vmbus_event_flags_proc(sc, sc->vmbus_rx_evtflags,
- VMBUS_CHAN_MAX_COMPAT >> VMBUS_EVTFLAG_SHIFT);
+ VMBUS_CHAN_MAX_COMPAT / VMBUS_EVTFLAG_LEN);
}
}
@@ -1903,7 +1903,7 @@ vmbus_chan_msgproc_choffer(struct vmbus_softc *sc,
* Setup event flag.
*/
chan->ch_evtflag =
- &sc->vmbus_tx_evtflags[chan->ch_id >> VMBUS_EVTFLAG_SHIFT];
+ &sc->vmbus_tx_evtflags[chan->ch_id / VMBUS_EVTFLAG_LEN];
chan->ch_evtflag_mask = 1UL << (chan->ch_id & VMBUS_EVTFLAG_MASK);
/*
diff --git a/sys/dev/hyperv/vmbus/vmbus_reg.h b/sys/dev/hyperv/vmbus/vmbus_reg.h
index 4aa729475b5d..76cdca0ebeb2 100644
--- a/sys/dev/hyperv/vmbus/vmbus_reg.h
+++ b/sys/dev/hyperv/vmbus/vmbus_reg.h
@@ -60,16 +60,10 @@ CTASSERT(sizeof(struct vmbus_message) == VMBUS_MSG_SIZE);
* Hyper-V SynIC event flags
*/
-#ifdef __LP64__
-#define VMBUS_EVTFLAGS_MAX 32
-#define VMBUS_EVTFLAG_SHIFT 6
-#else
-#define VMBUS_EVTFLAGS_MAX 64
-#define VMBUS_EVTFLAG_SHIFT 5
-#endif
-#define VMBUS_EVTFLAG_LEN (1 << VMBUS_EVTFLAG_SHIFT)
+#define VMBUS_EVTFLAG_LEN (sizeof(u_long) * 8)
#define VMBUS_EVTFLAG_MASK (VMBUS_EVTFLAG_LEN - 1)
#define VMBUS_EVTFLAGS_SIZE 256
+#define VMBUS_EVTFLAGS_MAX (VMBUS_EVTFLAGS_SIZE / sizeof(u_long))
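+/*
+ * Worked example (illustrative): on LP64, VMBUS_EVTFLAG_LEN is
+ * 8 * 8 = 64 event bits per u_long and VMBUS_EVTFLAG_MASK is 63;
+ * VMBUS_EVTFLAGS_MAX is 256 / 8 = 32 longs.  On ILP32 the same
+ * expressions yield 32, 31, and 64, matching the shift-based
+ * constants this replaces.
+ */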
struct vmbus_evtflags {
u_long evt_flags[VMBUS_EVTFLAGS_MAX];
diff --git a/sys/dev/ice/ice_features.h b/sys/dev/ice/ice_features.h
index 821abe4806ca..5b23757b1c98 100644
--- a/sys/dev/ice/ice_features.h
+++ b/sys/dev/ice/ice_features.h
@@ -91,7 +91,9 @@ enum feat_list {
static inline void
ice_disable_unsupported_features(ice_bitmap_t __unused *bitmap)
{
+#ifndef PCI_IOV
ice_clear_bit(ICE_FEATURE_SRIOV, bitmap);
+#endif
#ifndef DEV_NETMAP
ice_clear_bit(ICE_FEATURE_NETMAP, bitmap);
#endif
diff --git a/sys/dev/ice/ice_iflib.h b/sys/dev/ice/ice_iflib.h
index 3a5dc201189a..e1d5307a9516 100644
--- a/sys/dev/ice/ice_iflib.h
+++ b/sys/dev/ice/ice_iflib.h
@@ -139,6 +139,9 @@ struct ice_irq_vector {
* @tc: traffic class queue belongs to
* @q_handle: qidx in tc; used in TXQ enable functions
*
+ * ice_iov.c requires the following parameters (when PCI_IOV is defined):
+ * @itr_idx: ITR index to use for this queue
+ *
* Other parameters may be iflib driver specific
*/
struct ice_tx_queue {
@@ -153,6 +156,9 @@ struct ice_tx_queue {
u32 me;
u16 q_handle;
u8 tc;
+#ifdef PCI_IOV
+ u8 itr_idx;
+#endif
/* descriptor writeback status */
qidx_t *tx_rsq;
@@ -175,6 +181,9 @@ struct ice_tx_queue {
* @stats: queue statistics
* @tc: traffic class queue belongs to
*
+ * ice_iov.c requires the following parameters (when PCI_IOV is defined):
+ * @itr_idx: ITR index to use for this queue
+ *
* Other parameters may be iflib driver specific
*/
struct ice_rx_queue {
@@ -187,6 +196,9 @@ struct ice_rx_queue {
struct ice_irq_vector *irqv;
u32 me;
u8 tc;
+#ifdef PCI_IOV
+ u8 itr_idx;
+#endif
struct if_irq que_irq;
};
@@ -332,6 +344,10 @@ struct ice_softc {
ice_declare_bitmap(feat_cap, ICE_FEATURE_COUNT);
ice_declare_bitmap(feat_en, ICE_FEATURE_COUNT);
+#ifdef PCI_IOV
+ struct ice_vf *vfs;
+ u16 num_vfs;
+#endif
struct ice_resmgr os_imgr;
/* For mirror interface */
struct ice_mirr_if *mirr_if;
diff --git a/sys/dev/ice/ice_iov.c b/sys/dev/ice/ice_iov.c
new file mode 100644
index 000000000000..c5a3e1060e44
--- /dev/null
+++ b/sys/dev/ice/ice_iov.c
@@ -0,0 +1,1856 @@
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* Copyright (c) 2025, Intel Corporation
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * @file ice_iov.c
+ * @brief Virtualization support functions
+ *
+ * Contains functions for enabling and managing PCIe virtual function (VF)
+ * devices, including creating new VFs and servicing VF requests over the
+ * virtchnl interface.
+ */
+
+#include "ice_iov.h"
+
+static struct ice_vf *ice_iov_get_vf(struct ice_softc *sc, int vf_num);
+static void ice_iov_ready_vf(struct ice_softc *sc, struct ice_vf *vf);
+static void ice_reset_vf(struct ice_softc *sc, struct ice_vf *vf,
+ bool trigger_vflr);
+static void ice_iov_setup_intr_mapping(struct ice_softc *sc, struct ice_vf *vf);
+
+static void ice_vc_version_msg(struct ice_softc *sc, struct ice_vf *vf,
+ u8 *msg_buf);
+static void ice_vc_get_vf_res_msg(struct ice_softc *sc, struct ice_vf *vf,
+ u8 *msg_buf);
+static void ice_vc_add_eth_addr_msg(struct ice_softc *sc, struct ice_vf *vf,
+ u8 *msg_buf);
+static void ice_vc_del_eth_addr_msg(struct ice_softc *sc, struct ice_vf *vf,
+ u8 *msg_buf);
+static bool ice_vc_isvalid_ring_len(u16 ring_len);
+static void ice_vc_cfg_vsi_qs_msg(struct ice_softc *sc, struct ice_vf *vf,
+ u8 *msg_buf);
+static void ice_vc_cfg_rss_key_msg(struct ice_softc *sc, struct ice_vf *vf,
+ u8 *msg_buf);
+static void ice_vc_set_rss_hena_msg(struct ice_softc *sc, struct ice_vf *vf,
+ u8 *msg_buf);
+static void ice_vc_enable_queues_msg(struct ice_softc *sc, struct ice_vf *vf,
+ u8 *msg_buf);
+static void ice_vc_notify_vf_link_state(struct ice_softc *sc, struct ice_vf *vf);
+static void ice_vc_disable_queues_msg(struct ice_softc *sc, struct ice_vf *vf,
+ u8 *msg_buf);
+static void ice_vc_cfg_irq_map_msg(struct ice_softc *sc, struct ice_vf *vf,
+ u8 *msg_buf);
+static void ice_vc_get_stats_msg(struct ice_softc *sc, struct ice_vf *vf,
+ u8 *msg_buf);
+static void ice_eth_stats_to_virtchnl_eth_stats(struct ice_eth_stats *istats,
+ struct virtchnl_eth_stats *vstats);
+static void ice_vc_cfg_rss_lut_msg(struct ice_softc *sc, struct ice_vf *vf,
+ u8 *msg_buf);
+static void ice_vc_cfg_promisc_mode_msg(struct ice_softc *sc, struct ice_vf *vf,
+ u8 *msg_buf);
+static void ice_vc_add_vlan_msg(struct ice_softc *sc, struct ice_vf *vf,
+ u8 *msg_buf);
+static void ice_vc_del_vlan_msg(struct ice_softc *sc, struct ice_vf *vf,
+ u8 *msg_buf);
+static enum virtchnl_status_code ice_iov_err_to_virt_err(int ice_err);
+static int ice_vf_validate_mac(struct ice_vf *vf, const uint8_t *addr);
+
+/**
+ * ice_iov_attach - Initialize SR-IOV PF host support
+ * @sc: device softc structure
+ *
+ * Initialize SR-IOV PF host support at the end of the driver attach process.
+ *
+ * @pre Must be called from sleepable context (calls malloc() w/ M_WAITOK)
+ *
+ * @returns 0 if successful, or
+ * - ENOMEM if there is no memory for the PF/VF schemas or iov device
+ * - ENXIO if the device isn't PCI-E or doesn't support the same SR-IOV
+ * version as the kernel
+ * - ENOENT if the device doesn't have the SR-IOV capability
+ */
+int
+ice_iov_attach(struct ice_softc *sc)
+{
+ device_t dev = sc->dev;
+ nvlist_t *pf_schema, *vf_schema;
+ int error;
+
+ pf_schema = pci_iov_schema_alloc_node();
+ vf_schema = pci_iov_schema_alloc_node();
+
+ pci_iov_schema_add_unicast_mac(vf_schema, "mac-addr", 0, NULL);
+ pci_iov_schema_add_bool(vf_schema, "mac-anti-spoof",
+ IOV_SCHEMA_HASDEFAULT, TRUE);
+ pci_iov_schema_add_bool(vf_schema, "allow-set-mac",
+ IOV_SCHEMA_HASDEFAULT, FALSE);
+ pci_iov_schema_add_bool(vf_schema, "allow-promisc",
+ IOV_SCHEMA_HASDEFAULT, FALSE);
+ pci_iov_schema_add_uint16(vf_schema, "num-queues",
+ IOV_SCHEMA_HASDEFAULT, ICE_DEFAULT_VF_QUEUES);
+ pci_iov_schema_add_uint16(vf_schema, "mirror-src-vsi",
+ IOV_SCHEMA_HASDEFAULT, ICE_INVALID_MIRROR_VSI);
+ pci_iov_schema_add_uint16(vf_schema, "max-vlan-allowed",
+ IOV_SCHEMA_HASDEFAULT, ICE_DEFAULT_VF_VLAN_LIMIT);
+ pci_iov_schema_add_uint16(vf_schema, "max-mac-filters",
+ IOV_SCHEMA_HASDEFAULT, ICE_DEFAULT_VF_FILTER_LIMIT);
+
+ error = pci_iov_attach(dev, pf_schema, vf_schema);
+ if (error != 0) {
+ device_printf(dev,
+ "pci_iov_attach failed (error=%s)\n",
+ ice_err_str(error));
+ ice_clear_bit(ICE_FEATURE_SRIOV, sc->feat_en);
+ } else
+ ice_set_bit(ICE_FEATURE_SRIOV, sc->feat_en);
+
+ return (error);
+}
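+
+/*
+ * For reference (not part of the driver): the schema keys registered
+ * above surface as per-VF parameters in iovctl.conf(5).  An
+ * illustrative configuration, assuming a PF device named "ice0":
+ *
+ *	PF {
+ *		device : "ice0";
+ *		num_vfs : 2;
+ *	}
+ *	DEFAULT {
+ *		num-queues : 4;
+ *		allow-set-mac : true;
+ *	}
+ */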
+
+/**
+ * ice_iov_detach - Teardown SR-IOV PF host support
+ * @sc: device softc structure
+ *
+ * Teardown SR-IOV PF host support at the start of the driver detach process.
+ *
+ * @returns 0 if successful or IOV support hasn't been setup, or
+ * - EBUSY if VFs still exist
+ */
+int
+ice_iov_detach(struct ice_softc *sc)
+{
+ device_t dev = sc->dev;
+ int error;
+
+ error = pci_iov_detach(dev);
+ if (error != 0) {
+ device_printf(dev,
+ "pci_iov_detach failed (error=%s)\n",
+ ice_err_str(error));
+ }
+
+ return (error);
+}
+
+/**
+ * ice_iov_init - Called by the OS before the first VF is created.
+ * @sc: device softc structure
+ * @num_vfs: number of VFs to setup resources for
+ * @params: configuration parameters for the PF
+ *
+ * @returns 0 if successful or an error code on failure
+ */
+int
+ice_iov_init(struct ice_softc *sc, uint16_t num_vfs, const nvlist_t *params __unused)
+{
+ /* Allocate array of VFs, for tracking */
+	sc->vfs = (struct ice_vf *)malloc(sizeof(struct ice_vf) * num_vfs,
+	    M_ICE, M_NOWAIT | M_ZERO);
+ if (sc->vfs == NULL)
+ return (ENOMEM);
+
+ /* Initialize each VF with basic information */
+ for (int i = 0; i < num_vfs; i++)
+ sc->vfs[i].vf_num = i;
+
+ /* Save off number of configured VFs */
+ sc->num_vfs = num_vfs;
+
+ return (0);
+}
+
+/**
+ * ice_iov_get_vf - Get pointer to VF at given index
+ * @sc: device softc structure
+ * @vf_num: Index of VF to retrieve
+ *
+ * @remark triggers an assertion failure if vf_num is not within the
+ * range of allocated VFs
+ *
+ * @returns a pointer to the VF structure at the given index
+ */
+static struct ice_vf *
+ice_iov_get_vf(struct ice_softc *sc, int vf_num)
+{
+ MPASS(vf_num < sc->num_vfs);
+
+	return (&sc->vfs[vf_num]);
+}
+
+/**
+ * ice_iov_add_vf - Called by the OS for each VF to create
+ * @sc: device softc structure
+ * @vfnum: index of VF to configure
+ * @params: configuration parameters for the VF
+ *
+ * @returns 0 if successful or an error code on failure
+ */
+int
+ice_iov_add_vf(struct ice_softc *sc, uint16_t vfnum, const nvlist_t *params)
+{
+ struct ice_tx_queue *txq;
+ struct ice_rx_queue *rxq;
+ device_t dev = sc->dev;
+ struct ice_vsi *vsi;
+ struct ice_vf *vf;
+ int vf_num_queues;
+ const void *mac;
+ size_t size;
+ int error;
+ int i;
+
+ vf = ice_iov_get_vf(sc, vfnum);
+ vf->vf_flags = VF_FLAG_ENABLED;
+
+ /* This VF needs at least one VSI */
+ vsi = ice_alloc_vsi(sc, ICE_VSI_VF);
+ if (vsi == NULL)
+ return (ENOMEM);
+ vf->vsi = vsi;
+ vsi->vf_num = vfnum;
+
+ vf_num_queues = nvlist_get_number(params, "num-queues");
+ /* Validate and clamp value if invalid */
+ if (vf_num_queues < 1 || vf_num_queues > ICE_MAX_SCATTERED_QUEUES)
+ device_printf(dev, "Invalid num-queues (%d) for VF %d\n",
+ vf_num_queues, vf->vf_num);
+ if (vf_num_queues < 1) {
+ device_printf(dev, "Setting VF %d num-queues to 1\n", vf->vf_num);
+ vf_num_queues = 1;
+ } else if (vf_num_queues > ICE_MAX_SCATTERED_QUEUES) {
+ device_printf(dev, "Setting VF %d num-queues to %d\n",
+ vf->vf_num, ICE_MAX_SCATTERED_QUEUES);
+ vf_num_queues = ICE_MAX_SCATTERED_QUEUES;
+ }
+ vsi->qmap_type = ICE_RESMGR_ALLOC_SCATTERED;
+
+ /* Reserve VF queue allocation from PF queues */
+ ice_alloc_vsi_qmap(vsi, vf_num_queues, vf_num_queues);
+ vsi->num_tx_queues = vsi->num_rx_queues = vf_num_queues;
+
+ /* Assign Tx queues from PF space */
+ error = ice_resmgr_assign_scattered(&sc->tx_qmgr, vsi->tx_qmap,
+ vsi->num_tx_queues);
+ if (error) {
+ device_printf(sc->dev, "Unable to assign VF Tx queues: %s\n",
+ ice_err_str(error));
+ goto release_vsi;
+ }
+
+ /* Assign Rx queues from PF space */
+ error = ice_resmgr_assign_scattered(&sc->rx_qmgr, vsi->rx_qmap,
+ vsi->num_rx_queues);
+ if (error) {
+ device_printf(sc->dev, "Unable to assign VF Rx queues: %s\n",
+ ice_err_str(error));
+ goto release_vsi;
+ }
+
+ vsi->max_frame_size = ICE_MAX_FRAME_SIZE;
+
+ /* Allocate queue structure memory */
+ vsi->tx_queues = (struct ice_tx_queue *)
+ malloc(sizeof(struct ice_tx_queue) * vsi->num_tx_queues, M_ICE,
+ M_NOWAIT | M_ZERO);
+ if (!vsi->tx_queues) {
+ device_printf(sc->dev, "VF-%d: Unable to allocate Tx queue memory\n",
+ vfnum);
+ error = ENOMEM;
+ goto release_vsi;
+ }
+ for (i = 0, txq = vsi->tx_queues; i < vsi->num_tx_queues; i++, txq++) {
+ txq->me = i;
+ txq->vsi = vsi;
+ }
+
+ /* Allocate queue structure memory */
+ vsi->rx_queues = (struct ice_rx_queue *)
+ malloc(sizeof(struct ice_rx_queue) * vsi->num_rx_queues, M_ICE,
+ M_NOWAIT | M_ZERO);
+ if (!vsi->rx_queues) {
+ device_printf(sc->dev, "VF-%d: Unable to allocate Rx queue memory\n",
+ vfnum);
+ error = ENOMEM;
+ goto free_txqs;
+ }
+ for (i = 0, rxq = vsi->rx_queues; i < vsi->num_rx_queues; i++, rxq++) {
+ rxq->me = i;
+ rxq->vsi = vsi;
+ }
+
+ /* Allocate space to store the IRQ vector data */
+ vf->num_irq_vectors = vf_num_queues + 1;
+ vf->tx_irqvs = (struct ice_irq_vector *)
+ malloc(sizeof(struct ice_irq_vector) * (vf->num_irq_vectors),
+ M_ICE, M_NOWAIT);
+ if (!vf->tx_irqvs) {
+ device_printf(sc->dev,
+ "Unable to allocate TX irqv memory for VF-%d's %d vectors\n",
+ vfnum, vf->num_irq_vectors);
+ error = ENOMEM;
+ goto free_rxqs;
+ }
+ vf->rx_irqvs = (struct ice_irq_vector *)
+ malloc(sizeof(struct ice_irq_vector) * (vf->num_irq_vectors),
+ M_ICE, M_NOWAIT);
+ if (!vf->rx_irqvs) {
+ device_printf(sc->dev,
+ "Unable to allocate RX irqv memory for VF-%d's %d vectors\n",
+ vfnum, vf->num_irq_vectors);
+ error = ENOMEM;
+ goto free_txirqvs;
+ }
+
+ /* Assign VF interrupts from PF space */
+ if (!(vf->vf_imap =
+ (u16 *)malloc(sizeof(u16) * vf->num_irq_vectors,
+ M_ICE, M_NOWAIT))) {
+ device_printf(dev, "Unable to allocate VF-%d imap memory\n", vfnum);
+ error = ENOMEM;
+ goto free_rxirqvs;
+ }
+ error = ice_resmgr_assign_contiguous(&sc->dev_imgr, vf->vf_imap, vf->num_irq_vectors);
+ if (error) {
+ device_printf(dev, "Unable to assign VF-%d interrupt mapping: %s\n",
+ vfnum, ice_err_str(error));
+ goto free_imap;
+ }
+
+ if (nvlist_exists_binary(params, "mac-addr")) {
+ mac = nvlist_get_binary(params, "mac-addr", &size);
+ memcpy(vf->mac, mac, ETHER_ADDR_LEN);
+
+ if (nvlist_get_bool(params, "allow-set-mac"))
+ vf->vf_flags |= VF_FLAG_SET_MAC_CAP;
+ } else
+ /*
+ * If the administrator has not specified a MAC address then
+ * we must allow the VF to choose one.
+ */
+ vf->vf_flags |= VF_FLAG_SET_MAC_CAP;
+
+ if (nvlist_get_bool(params, "mac-anti-spoof"))
+ vf->vf_flags |= VF_FLAG_MAC_ANTI_SPOOF;
+
+ if (nvlist_get_bool(params, "allow-promisc"))
+ vf->vf_flags |= VF_FLAG_PROMISC_CAP;
+
+ vsi->mirror_src_vsi = nvlist_get_number(params, "mirror-src-vsi");
+
+ vf->vlan_limit = nvlist_get_number(params, "max-vlan-allowed");
+ vf->mac_filter_limit = nvlist_get_number(params, "max-mac-filters");
+
+ vf->vf_flags |= VF_FLAG_VLAN_CAP;
+
+ /* Create and setup VSI in HW */
+ error = ice_initialize_vsi(vsi);
+ if (error) {
+ device_printf(sc->dev, "Unable to initialize VF %d VSI: %s\n",
+ vfnum, ice_err_str(error));
+ goto release_imap;
+ }
+
+ /* Add the broadcast address */
+ error = ice_add_vsi_mac_filter(vsi, broadcastaddr);
+ if (error) {
+ device_printf(sc->dev, "Unable to add broadcast filter VF %d VSI: %s\n",
+ vfnum, ice_err_str(error));
+ goto release_imap;
+ }
+
+ ice_iov_ready_vf(sc, vf);
+
+ return (0);
+
+release_imap:
+ ice_resmgr_release_map(&sc->dev_imgr, vf->vf_imap,
+ vf->num_irq_vectors);
+free_imap:
+ free(vf->vf_imap, M_ICE);
+ vf->vf_imap = NULL;
+free_rxirqvs:
+ free(vf->rx_irqvs, M_ICE);
+ vf->rx_irqvs = NULL;
+free_txirqvs:
+ free(vf->tx_irqvs, M_ICE);
+ vf->tx_irqvs = NULL;
+free_rxqs:
+ free(vsi->rx_queues, M_ICE);
+ vsi->rx_queues = NULL;
+free_txqs:
+ free(vsi->tx_queues, M_ICE);
+ vsi->tx_queues = NULL;
+release_vsi:
+ ice_release_vsi(vsi);
+ vf->vsi = NULL;
+ return (error);
+}
+
+/**
+ * ice_iov_uninit - Called by the OS when VFs are destroyed
+ * @sc: device softc structure
+ */
+void
+ice_iov_uninit(struct ice_softc *sc)
+{
+ struct ice_vf *vf;
+ struct ice_vsi *vsi;
+
+ /* Release per-VF resources */
+ for (int i = 0; i < sc->num_vfs; i++) {
+ vf = &sc->vfs[i];
+ vsi = vf->vsi;
+
+ /* Free VF interrupt reservation */
+ if (vf->vf_imap) {
+ free(vf->vf_imap, M_ICE);
+ vf->vf_imap = NULL;
+ }
+
+ /* Free queue interrupt mapping trackers */
+ if (vf->tx_irqvs) {
+ free(vf->tx_irqvs, M_ICE);
+ vf->tx_irqvs = NULL;
+ }
+ if (vf->rx_irqvs) {
+ free(vf->rx_irqvs, M_ICE);
+ vf->rx_irqvs = NULL;
+ }
+
+ if (!vsi)
+ continue;
+
+ /* Free VSI queues */
+ if (vsi->tx_queues) {
+ free(vsi->tx_queues, M_ICE);
+ vsi->tx_queues = NULL;
+ }
+ if (vsi->rx_queues) {
+ free(vsi->rx_queues, M_ICE);
+ vsi->rx_queues = NULL;
+ }
+
+ ice_release_vsi(vsi);
+ vf->vsi = NULL;
+ }
+
+ /* Release memory used for VF tracking */
+ if (sc->vfs) {
+ free(sc->vfs, M_ICE);
+ sc->vfs = NULL;
+ }
+ sc->num_vfs = 0;
+}
+
+/**
+ * ice_iov_handle_vflr - Process VFLR event
+ * @sc: device softc structure
+ *
+ * Identifies which VFs have been reset and reconfigures them.
+ */
+void
+ice_iov_handle_vflr(struct ice_softc *sc)
+{
+ struct ice_hw *hw = &sc->hw;
+ struct ice_vf *vf;
+ u32 reg, reg_idx, bit_idx;
+
+ for (int i = 0; i < sc->num_vfs; i++) {
+ vf = &sc->vfs[i];
+
+ reg_idx = (hw->func_caps.vf_base_id + vf->vf_num) / 32;
+ bit_idx = (hw->func_caps.vf_base_id + vf->vf_num) % 32;
+ reg = rd32(hw, GLGEN_VFLRSTAT(reg_idx));
+ if (reg & BIT(bit_idx))
+ ice_reset_vf(sc, vf, false);
+ }
+}
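+
+/*
+ * Illustrative example (not part of this change): VFLR status is packed
+ * 32 VFs per GLGEN_VFLRSTAT register, so a global VF number splits into
+ * a register index and a bit index. With vf_base_id = 64 and vf_num = 5,
+ * the global VF number is 69, giving reg_idx = 69 / 32 = 2 and
+ * bit_idx = 69 % 32 = 5, i.e. the VF's VFLR bit is BIT(5) in
+ * GLGEN_VFLRSTAT(2).
+ */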
+
+/**
+ * ice_iov_ready_vf - Setup VF interrupts and mark it as ready
+ * @sc: device softc structure
+ * @vf: driver's VF structure for the VF to update
+ *
+ * Clears VF reset triggering bit, sets up the PF<->VF interrupt
+ * mapping and marks the VF as active in the HW so that the VF
+ * driver can use it.
+ */
+static void
+ice_iov_ready_vf(struct ice_softc *sc, struct ice_vf *vf)
+{
+ struct ice_hw *hw = &sc->hw;
+ u32 reg;
+
+ /* Clear the triggering bit */
+ reg = rd32(hw, VPGEN_VFRTRIG(vf->vf_num));
+ reg &= ~VPGEN_VFRTRIG_VFSWR_M;
+ wr32(hw, VPGEN_VFRTRIG(vf->vf_num), reg);
+
+ /* Setup VF interrupt allocation and mapping */
+ ice_iov_setup_intr_mapping(sc, vf);
+
+ /* Indicate to the VF that reset is done */
+ wr32(hw, VFGEN_RSTAT(vf->vf_num), VIRTCHNL_VFR_VFACTIVE);
+
+ ice_flush(hw);
+}
+
+/**
+ * ice_reset_vf - Perform a hardware reset (VFR) on a VF
+ * @sc: device softc structure
+ * @vf: driver's VF structure for VF to be reset
+ * @trigger_vflr: trigger a new reset, or only handle an already-executed reset
+ *
+ * Performs a VFR for the given VF. This function busy waits until the
+ * reset completes in the HW, notifies the VF that the reset is done
+ * by setting a bit in a HW register, then returns.
+ *
+ * @remark This also sets up the PF<->VF interrupt mapping and allocations in
+ * the hardware after the hardware reset is finished, via
+ * ice_iov_setup_intr_mapping()
+ */
+static void
+ice_reset_vf(struct ice_softc *sc, struct ice_vf *vf, bool trigger_vflr)
+{
+ u16 global_vf_num, reg_idx, bit_idx;
+ struct ice_hw *hw = &sc->hw;
+ int status;
+ u32 reg;
+ int i;
+
+ global_vf_num = vf->vf_num + hw->func_caps.vf_base_id;
+
+ if (trigger_vflr) {
+ reg = rd32(hw, VPGEN_VFRTRIG(vf->vf_num));
+ reg |= VPGEN_VFRTRIG_VFSWR_M;
+ wr32(hw, VPGEN_VFRTRIG(vf->vf_num), reg);
+ }
+
+ /* clear the VFLR bit for the VF in a GLGEN_VFLRSTAT register */
+ reg_idx = (global_vf_num) / 32;
+ bit_idx = (global_vf_num) % 32;
+ wr32(hw, GLGEN_VFLRSTAT(reg_idx), BIT(bit_idx));
+ ice_flush(hw);
+
+ /* Wait until there are no pending PCI transactions */
+ wr32(hw, PF_PCI_CIAA,
+ ICE_PCIE_DEV_STATUS | (global_vf_num << PF_PCI_CIAA_VF_NUM_S));
+
+ for (i = 0; i < ICE_PCI_CIAD_WAIT_COUNT; i++) {
+ reg = rd32(hw, PF_PCI_CIAD);
+ if (!(reg & PCIEM_STA_TRANSACTION_PND))
+ break;
+
+ DELAY(ICE_PCI_CIAD_WAIT_DELAY_US);
+ }
+ if (i == ICE_PCI_CIAD_WAIT_COUNT)
+ device_printf(sc->dev,
+ "VF-%d PCI transactions stuck\n", vf->vf_num);
+
+ /* Disable TX queues, which is required during VF reset */
+ status = ice_dis_vsi_txq(hw->port_info, vf->vsi->idx, 0, 0, NULL, NULL,
+ NULL, ICE_VF_RESET, vf->vf_num, NULL);
+ if (status)
+ device_printf(sc->dev,
+ "%s: Failed to disable LAN Tx queues: err %s aq_err %s\n",
+ __func__, ice_status_str(status),
+ ice_aq_str(hw->adminq.sq_last_status));
+
+ /* Then check for the VF reset to finish in HW */
+ for (i = 0; i < ICE_VPGEN_VFRSTAT_WAIT_COUNT; i++) {
+ reg = rd32(hw, VPGEN_VFRSTAT(vf->vf_num));
+ if ((reg & VPGEN_VFRSTAT_VFRD_M))
+ break;
+
+ DELAY(ICE_VPGEN_VFRSTAT_WAIT_DELAY_US);
+ }
+ if (i == ICE_VPGEN_VFRSTAT_WAIT_COUNT)
+ device_printf(sc->dev,
+ "VF-%d Reset is stuck\n", vf->vf_num);
+
+ ice_iov_ready_vf(sc, vf);
+}
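+
+/*
+ * Timing note (illustrative, derived from the constants in ice_iov.h):
+ * the PCI pending-transaction poll above retries up to
+ * ICE_PCI_CIAD_WAIT_COUNT (100) times with a 1 us delay, roughly 100 us
+ * worst case, while the VFR status poll allows 100 iterations at 20 us
+ * each, roughly 2 ms, before the reset is reported as stuck.
+ */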
+
+/**
+ * ice_vc_get_vf_res_msg - Handle VIRTCHNL_OP_GET_VF_RESOURCES msg from VF
+ * @sc: device private structure
+ * @vf: VF tracking structure
+ * @msg_buf: raw message buffer from the VF
+ *
+ * Receives a message from the VF listing its supported capabilities, and
+ * replies to the VF with information about what resources the PF has
+ * allocated for the VF.
+ *
+ * @remark This always replies to the VF with a success status; it does not
+ * fail. It's up to the VF driver to reject or complain about the PF's response.
+ */
+static void
+ice_vc_get_vf_res_msg(struct ice_softc *sc, struct ice_vf *vf, u8 *msg_buf)
+{
+ struct ice_hw *hw = &sc->hw;
+ struct virtchnl_vf_resource *vf_res;
+ struct virtchnl_vsi_resource *vsi_res;
+ u16 vf_res_len;
+ u32 vf_caps;
+
+ /* XXX: Only support one VSI per VF, so this size doesn't need adjusting */
+ vf_res_len = sizeof(struct virtchnl_vf_resource);
+ vf_res = (struct virtchnl_vf_resource *)malloc(vf_res_len, M_ICE,
+ M_WAITOK | M_ZERO);
+
+ vf_res->num_vsis = 1;
+ vf_res->num_queue_pairs = vf->vsi->num_tx_queues;
+ vf_res->max_vectors = vf_res->num_queue_pairs + 1;
+
+ vf_res->rss_key_size = ICE_GET_SET_RSS_KEY_EXTEND_KEY_SIZE;
+ vf_res->rss_lut_size = ICE_VSIQF_HLUT_ARRAY_SIZE;
+ vf_res->max_mtu = 0;
+
+ vf_res->vf_cap_flags = VF_BASE_MODE_OFFLOADS;
+ if (msg_buf != NULL) {
+ vf_caps = *((u32 *)(msg_buf));
+
+ if (vf_caps & VIRTCHNL_VF_CAP_ADV_LINK_SPEED)
+ vf_res->vf_cap_flags |= VIRTCHNL_VF_CAP_ADV_LINK_SPEED;
+
+ if (vf_caps & VIRTCHNL_VF_OFFLOAD_WB_ON_ITR)
+ vf_res->vf_cap_flags |= VIRTCHNL_VF_OFFLOAD_WB_ON_ITR;
+ }
+
+ vsi_res = &vf_res->vsi_res[0];
+ vsi_res->vsi_id = vf->vsi->idx;
+ vsi_res->num_queue_pairs = vf->vsi->num_tx_queues;
+ vsi_res->vsi_type = VIRTCHNL_VSI_SRIOV;
+ vsi_res->qset_handle = 0;
+ if (!ETHER_IS_ZERO(vf->mac))
+ memcpy(vsi_res->default_mac_addr, vf->mac, ETHER_ADDR_LEN);
+
+ ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_GET_VF_RESOURCES,
+ VIRTCHNL_STATUS_SUCCESS, (u8 *)vf_res, vf_res_len, NULL);
+
+ free(vf_res, M_ICE);
+}
+
+/**
+ * ice_vc_version_msg - Handle VIRTCHNL_OP_VERSION msg from VF
+ * @sc: device private structure
+ * @vf: VF tracking structure
+ * @msg_buf: raw message buffer from the VF
+ *
+ * Receives a version message from the VF, and responds to the VF with
+ * the version number that the PF will use.
+ *
+ * @remark This always replies to the VF with a success status; it does not
+ * fail.
+ */
+static void
+ice_vc_version_msg(struct ice_softc *sc, struct ice_vf *vf, u8 *msg_buf)
+{
+ struct virtchnl_version_info *recv_vf_version;
+ struct ice_hw *hw = &sc->hw;
+ device_t dev = sc->dev;
+
+ recv_vf_version = (struct virtchnl_version_info *)msg_buf;
+
+ /* VFs running the 1.0 API expect to get 1.0 back */
+ if (VF_IS_V10(recv_vf_version)) {
+ vf->version.major = 1;
+ vf->version.minor = VIRTCHNL_VERSION_MINOR_NO_VF_CAPS;
+ } else {
+ vf->version.major = VIRTCHNL_VERSION_MAJOR;
+ vf->version.minor = VIRTCHNL_VERSION_MINOR;
+
+ if ((recv_vf_version->major != VIRTCHNL_VERSION_MAJOR) ||
+ (recv_vf_version->minor != VIRTCHNL_VERSION_MINOR))
+ device_printf(dev,
+ "%s: VF-%d requested version (%d.%d) differs from PF version (%d.%d)\n",
+ __func__, vf->vf_num,
+ recv_vf_version->major, recv_vf_version->minor,
+ VIRTCHNL_VERSION_MAJOR, VIRTCHNL_VERSION_MINOR);
+ }
+
+ ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_VERSION,
+ VIRTCHNL_STATUS_SUCCESS, (u8 *)&vf->version, sizeof(vf->version),
+ NULL);
+}
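+
+/*
+ * Illustrative example (not part of this change): a VF advertising 1.0
+ * is answered with 1.0, since 1.0 VFs predate capability negotiation.
+ * Any other VF is answered with the PF's own version; e.g. if the PF
+ * speaks virtchnl 1.1, a VF advertising 1.2 receives 1.1 and is
+ * expected to fall back to the common subset.
+ */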
+
+/**
+ * ice_vf_validate_mac - Validate MAC address before adding it
+ * @vf: VF tracking structure
+ * @addr: MAC address to validate
+ *
+ * Validate a MAC address before adding it to a VF during the handling
+ * of a VIRTCHNL_OP_ADD_ETH_ADDR operation. Notably, this also checks if
+ * the VF is allowed to set its own arbitrary MAC addresses.
+ *
+ * Returns 0 if MAC address is valid for the given vf
+ */
+static int
+ice_vf_validate_mac(struct ice_vf *vf, const uint8_t *addr)
+{
+
+ if (ETHER_IS_ZERO(addr) || ETHER_IS_BROADCAST(addr))
+ return (EINVAL);
+
+ /*
+ * If the VF is not allowed to change its MAC address, don't let it
+ * set a MAC filter for an address that is not a multicast address and
+ * is not its assigned MAC.
+ */
+ if (!(vf->vf_flags & VF_FLAG_SET_MAC_CAP) &&
+ !(ETHER_IS_MULTICAST(addr) || !bcmp(addr, vf->mac, ETHER_ADDR_LEN)))
+ return (EPERM);
+
+ return (0);
+}
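+
+/*
+ * Illustrative examples (not part of this change) for a VF without
+ * VF_FLAG_SET_MAC_CAP whose assigned MAC is 02:00:00:00:00:01:
+ *
+ * 00:00:00:00:00:00 -> EINVAL (zero address)
+ * ff:ff:ff:ff:ff:ff -> EINVAL (broadcast)
+ * 02:00:00:00:00:01 -> 0 (its assigned MAC)
+ * 01:00:5e:00:00:01 -> 0 (multicast is always allowed)
+ * 02:00:00:00:00:02 -> EPERM (other unicast addresses are not)
+ */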
+
+/**
+ * ice_vc_add_eth_addr_msg - Handle VIRTCHNL_OP_ADD_ETH_ADDR msg from VF
+ * @sc: device private structure
+ * @vf: VF tracking structure
+ * @msg_buf: raw message buffer from the VF
+ *
+ * Receives a list of MAC addresses from the VF and adds those addresses
+ * to the VSI's filter list.
+ */
+static void
+ice_vc_add_eth_addr_msg(struct ice_softc *sc, struct ice_vf *vf, u8 *msg_buf)
+{
+ enum virtchnl_status_code v_status = VIRTCHNL_STATUS_SUCCESS;
+ struct virtchnl_ether_addr_list *addr_list;
+ struct ice_hw *hw = &sc->hw;
+ u16 added_addr_cnt = 0;
+ int error = 0;
+
+ addr_list = (struct virtchnl_ether_addr_list *)msg_buf;
+
+ if (addr_list->num_elements >
+ (vf->mac_filter_limit - vf->mac_filter_cnt)) {
+ v_status = VIRTCHNL_STATUS_ERR_NO_MEMORY;
+ goto done;
+ }
+
+ for (int i = 0; i < addr_list->num_elements; i++) {
+ u8 *addr = addr_list->list[i].addr;
+
+ /* The type flag is currently ignored; every MAC address is
+ * treated as the LEGACY type
+ */
+
+ error = ice_vf_validate_mac(vf, addr);
+ if (error == EPERM) {
+ device_printf(sc->dev,
+ "%s: VF-%d: Not permitted to add MAC addr for VSI %d\n",
+ __func__, vf->vf_num, vf->vsi->idx);
+ v_status = VIRTCHNL_STATUS_ERR_PARAM;
+ continue;
+ } else if (error) {
+ device_printf(sc->dev,
+ "%s: VF-%d: Did not add invalid MAC addr for VSI %d\n",
+ __func__, vf->vf_num, vf->vsi->idx);
+ v_status = VIRTCHNL_STATUS_ERR_PARAM;
+ continue;
+ }
+
+ error = ice_add_vsi_mac_filter(vf->vsi, addr);
+ if (error) {
+ device_printf(sc->dev,
+ "%s: VF-%d: Error adding MAC addr for VSI %d\n",
+ __func__, vf->vf_num, vf->vsi->idx);
+ v_status = VIRTCHNL_STATUS_ERR_PARAM;
+ continue;
+ }
+ /* Don't count VF's MAC against its MAC filter limit */
+ if (memcmp(addr, vf->mac, ETHER_ADDR_LEN))
+ added_addr_cnt++;
+ }
+
+ vf->mac_filter_cnt += added_addr_cnt;
+
+done:
+ ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_ADD_ETH_ADDR,
+ v_status, NULL, 0, NULL);
+}
+
+/**
+ * ice_vc_del_eth_addr_msg - Handle VIRTCHNL_OP_DEL_ETH_ADDR msg from VF
+ * @sc: device private structure
+ * @vf: VF tracking structure
+ * @msg_buf: raw message buffer from the VF
+ *
+ * Receives a list of MAC addresses from the VF and removes those addresses
+ * from the VSI's filter list.
+ */
+static void
+ice_vc_del_eth_addr_msg(struct ice_softc *sc, struct ice_vf *vf, u8 *msg_buf)
+{
+ enum virtchnl_status_code v_status = VIRTCHNL_STATUS_SUCCESS;
+ struct virtchnl_ether_addr_list *addr_list;
+ struct ice_hw *hw = &sc->hw;
+ u16 deleted_addr_cnt = 0;
+ int error = 0;
+
+ addr_list = (struct virtchnl_ether_addr_list *)msg_buf;
+
+ for (int i = 0; i < addr_list->num_elements; i++) {
+ error = ice_remove_vsi_mac_filter(vf->vsi, addr_list->list[i].addr);
+ if (error) {
+ device_printf(sc->dev,
+ "%s: VF-%d: Error removing MAC addr for VSI %d\n",
+ __func__, vf->vf_num, vf->vsi->idx);
+ v_status = VIRTCHNL_STATUS_ERR_PARAM;
+ continue;
+ }
+ /* Don't count VF's MAC against its MAC filter limit */
+ if (memcmp(addr_list->list[i].addr, vf->mac, ETHER_ADDR_LEN))
+ deleted_addr_cnt++;
+ }
+
+ if (deleted_addr_cnt >= vf->mac_filter_cnt)
+ vf->mac_filter_cnt = 0;
+ else
+ vf->mac_filter_cnt -= deleted_addr_cnt;
+
+ ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_DEL_ETH_ADDR,
+ v_status, NULL, 0, NULL);
+}
+
+/**
+ * ice_vc_add_vlan_msg - Handle VIRTCHNL_OP_ADD_VLAN msg from VF
+ * @sc: PF's softc structure
+ * @vf: VF tracking structure
+ * @msg_buf: message buffer from VF
+ *
+ * Adds the VLANs in msg_buf to the VF's VLAN filter list.
+ */
+static void
+ice_vc_add_vlan_msg(struct ice_softc *sc, struct ice_vf *vf, u8 *msg_buf)
+{
+ struct ice_hw *hw = &sc->hw;
+ struct virtchnl_vlan_filter_list *vlan_list;
+ int status = 0;
+ enum virtchnl_status_code v_status = VIRTCHNL_STATUS_SUCCESS;
+ struct ice_vsi *vsi = vf->vsi;
+
+ vlan_list = (struct virtchnl_vlan_filter_list *)msg_buf;
+
+ if (vlan_list->vsi_id != vsi->idx) {
+ device_printf(sc->dev,
+ "VF-%d: Message has invalid VSI ID (expected %d, got %d)\n",
+ vf->vf_num, vsi->idx, vlan_list->vsi_id);
+ v_status = VIRTCHNL_STATUS_ERR_PARAM;
+ goto done;
+ }
+
+ if (vlan_list->num_elements > (vf->vlan_limit - vf->vlan_cnt)) {
+ v_status = VIRTCHNL_STATUS_ERR_NO_MEMORY;
+ goto done;
+ }
+
+ status = ice_add_vlan_hw_filters(vsi, vlan_list->vlan_id,
+ vlan_list->num_elements);
+ if (status) {
+ device_printf(sc->dev,
+ "VF-%d: Failure adding VLANs to VSI %d, err %s aq_err %s\n",
+ vf->vf_num, vsi->idx, ice_status_str(status),
+ ice_aq_str(sc->hw.adminq.sq_last_status));
+ v_status = ice_iov_err_to_virt_err(status);
+ goto done;
+ }
+
+ vf->vlan_cnt += vlan_list->num_elements;
+
+done:
+ ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_ADD_VLAN,
+ v_status, NULL, 0, NULL);
+}
+
+/**
+ * ice_vc_del_vlan_msg - Handle VIRTCHNL_OP_DEL_VLAN msg from VF
+ * @sc: PF's softc structure
+ * @vf: VF tracking structure
+ * @msg_buf: message buffer from VF
+ *
+ * Removes the VLANs in msg_buf from the VF's VLAN filter list.
+ */
+static void
+ice_vc_del_vlan_msg(struct ice_softc *sc, struct ice_vf *vf, u8 *msg_buf)
+{
+ struct ice_hw *hw = &sc->hw;
+ struct virtchnl_vlan_filter_list *vlan_list;
+ int status = 0;
+ enum virtchnl_status_code v_status = VIRTCHNL_STATUS_SUCCESS;
+ struct ice_vsi *vsi = vf->vsi;
+
+ vlan_list = (struct virtchnl_vlan_filter_list *)msg_buf;
+
+ if (vlan_list->vsi_id != vsi->idx) {
+ device_printf(sc->dev,
+ "VF-%d: Message has invalid VSI ID (expected %d, got %d)\n",
+ vf->vf_num, vsi->idx, vlan_list->vsi_id);
+ v_status = VIRTCHNL_STATUS_ERR_PARAM;
+ goto done;
+ }
+
+ status = ice_remove_vlan_hw_filters(vsi, vlan_list->vlan_id,
+ vlan_list->num_elements);
+ if (status) {
+ device_printf(sc->dev,
+ "VF-%d: Failure deleting VLANs from VSI %d, err %s aq_err %s\n",
+ vf->vf_num, vsi->idx, ice_status_str(status),
+ ice_aq_str(sc->hw.adminq.sq_last_status));
+ v_status = ice_iov_err_to_virt_err(status);
+ goto done;
+ }
+
+ if (vlan_list->num_elements >= vf->vlan_cnt)
+ vf->vlan_cnt = 0;
+ else
+ vf->vlan_cnt -= vlan_list->num_elements;
+
+done:
+ ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_DEL_VLAN,
+ v_status, NULL, 0, NULL);
+}
+
+/**
+ * ice_vc_isvalid_ring_len - Check to see if a descriptor ring length is valid
+ * @ring_len: length of ring
+ *
+ * Check whether a ring size value is valid.
+ *
+ * @returns true if given ring size is valid
+ */
+static bool
+ice_vc_isvalid_ring_len(u16 ring_len)
+{
+ return (ring_len >= ICE_MIN_DESC_COUNT &&
+ ring_len <= ICE_MAX_DESC_COUNT &&
+ !(ring_len % ICE_DESC_COUNT_INCR));
+}
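+
+/*
+ * Illustrative example (not part of this change), assuming the driver's
+ * usual limits of ICE_MIN_DESC_COUNT = 64, ICE_MAX_DESC_COUNT = 8160 and
+ * ICE_DESC_COUNT_INCR = 32:
+ *
+ * ice_vc_isvalid_ring_len(1024) -> true (in range, multiple of 32)
+ * ice_vc_isvalid_ring_len(1000) -> false (not a multiple of 32)
+ * ice_vc_isvalid_ring_len(32) -> false (below the minimum)
+ * ice_vc_isvalid_ring_len(8192) -> false (above the maximum)
+ */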
+
+/**
+ * ice_vc_cfg_vsi_qs_msg - Handle VIRTCHNL_OP_CONFIG_VSI_QUEUES msg from VF
+ * @sc: PF's softc structure
+ * @vf: VF tracking structure
+ * @msg_buf: message buffer from VF
+ */
+static void
+ice_vc_cfg_vsi_qs_msg(struct ice_softc *sc, struct ice_vf *vf, u8 *msg_buf)
+{
+ device_t dev = sc->dev;
+ struct ice_hw *hw = &sc->hw;
+ struct virtchnl_vsi_queue_config_info *vqci;
+ struct virtchnl_queue_pair_info *vqpi;
+ enum virtchnl_status_code status = VIRTCHNL_STATUS_SUCCESS;
+ struct ice_vsi *vsi = vf->vsi;
+ struct ice_tx_queue *txq;
+ struct ice_rx_queue *rxq;
+ int i, error = 0;
+
+ vqci = (struct virtchnl_vsi_queue_config_info *)msg_buf;
+
+ if (vqci->num_queue_pairs > vf->vsi->num_tx_queues ||
+ vqci->num_queue_pairs > vf->vsi->num_rx_queues) {
+ status = VIRTCHNL_STATUS_ERR_PARAM;
+ goto done;
+ }
+
+ ice_vsi_disable_tx(vf->vsi);
+ ice_control_all_rx_queues(vf->vsi, false);
+
+ /*
+ * Clear the Tx and Rx queue configuration, in case the VF
+ * requests a different number of queues than before.
+ */
+ for (i = 0; i < vsi->num_tx_queues; i++) {
+ txq = &vsi->tx_queues[i];
+
+ txq->desc_count = 0;
+ txq->tx_paddr = 0;
+ txq->tc = 0;
+ }
+
+ for (i = 0; i < vsi->num_rx_queues; i++) {
+ rxq = &vsi->rx_queues[i];
+
+ rxq->desc_count = 0;
+ rxq->rx_paddr = 0;
+ }
+
+ vqpi = vqci->qpair;
+ for (i = 0; i < vqci->num_queue_pairs; i++, vqpi++) {
+ /* Initial parameter validation */
+ if (vqpi->txq.vsi_id != vf->vsi->idx ||
+ vqpi->rxq.vsi_id != vf->vsi->idx ||
+ vqpi->txq.queue_id != vqpi->rxq.queue_id ||
+ vqpi->txq.headwb_enabled ||
+ vqpi->rxq.splithdr_enabled ||
+ vqpi->rxq.crc_disable ||
+ !(ice_vc_isvalid_ring_len(vqpi->txq.ring_len)) ||
+ !(ice_vc_isvalid_ring_len(vqpi->rxq.ring_len))) {
+ status = VIRTCHNL_STATUS_ERR_PARAM;
+ goto done;
+ }
+
+ /* Copy parameters into VF's queue/VSI structs */
+ txq = &vsi->tx_queues[vqpi->txq.queue_id];
+
+ txq->desc_count = vqpi->txq.ring_len;
+ txq->tx_paddr = vqpi->txq.dma_ring_addr;
+ txq->q_handle = vqpi->txq.queue_id;
+ txq->tc = 0;
+
+ rxq = &vsi->rx_queues[vqpi->rxq.queue_id];
+
+ rxq->desc_count = vqpi->rxq.ring_len;
+ rxq->rx_paddr = vqpi->rxq.dma_ring_addr;
+ vsi->mbuf_sz = vqpi->rxq.databuffer_size;
+ }
+
+ /* Configure TX queues in HW */
+ error = ice_cfg_vsi_for_tx(vsi);
+ if (error) {
+ device_printf(dev,
+ "VF-%d: Unable to configure VSI for Tx: %s\n",
+ vf->vf_num, ice_err_str(error));
+ status = VIRTCHNL_STATUS_ERR_ADMIN_QUEUE_ERROR;
+ goto done;
+ }
+
+ /* Configure RX queues in HW */
+ error = ice_cfg_vsi_for_rx(vsi);
+ if (error) {
+ device_printf(dev,
+ "VF-%d: Unable to configure VSI for Rx: %s\n",
+ vf->vf_num, ice_err_str(error));
+ status = VIRTCHNL_STATUS_ERR_ADMIN_QUEUE_ERROR;
+ ice_vsi_disable_tx(vsi);
+ goto done;
+ }
+
+done:
+ ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_CONFIG_VSI_QUEUES,
+ status, NULL, 0, NULL);
+}
+
+/**
+ * ice_vc_cfg_rss_key_msg - Handle VIRTCHNL_OP_CONFIG_RSS_KEY msg from VF
+ * @sc: PF's softc structure
+ * @vf: VF tracking structure
+ * @msg_buf: message buffer from VF
+ *
+ * Sets the RSS key for the given VF, using the contents of msg_buf.
+ */
+static void
+ice_vc_cfg_rss_key_msg(struct ice_softc *sc, struct ice_vf *vf, u8 *msg_buf)
+{
+ struct ice_aqc_get_set_rss_keys keydata =
+ { .standard_rss_key = {0}, .extended_hash_key = {0} };
+ struct ice_hw *hw = &sc->hw;
+ struct virtchnl_rss_key *vrk;
+ int status = 0;
+ enum virtchnl_status_code v_status = VIRTCHNL_STATUS_SUCCESS;
+ struct ice_vsi *vsi = vf->vsi;
+
+ vrk = (struct virtchnl_rss_key *)msg_buf;
+
+ if (vrk->vsi_id != vsi->idx) {
+ device_printf(sc->dev,
+ "VF-%d: Message has invalid VSI ID (expected %d, got %d)\n",
+ vf->vf_num, vsi->idx, vrk->vsi_id);
+ v_status = VIRTCHNL_STATUS_ERR_PARAM;
+ goto done;
+ }
+
+ if ((vrk->key_len >
+ (ICE_AQC_GET_SET_RSS_KEY_DATA_RSS_KEY_SIZE +
+ ICE_AQC_GET_SET_RSS_KEY_DATA_HASH_KEY_SIZE)) ||
+ vrk->key_len == 0) {
+ v_status = VIRTCHNL_STATUS_ERR_PARAM;
+ goto done;
+ }
+
+ memcpy(&keydata, vrk->key, vrk->key_len);
+
+ status = ice_aq_set_rss_key(hw, vsi->idx, &keydata);
+ if (status) {
+ device_printf(sc->dev,
+ "ice_aq_set_rss_key status %s, error %s\n",
+ ice_status_str(status), ice_aq_str(hw->adminq.sq_last_status));
+ v_status = ice_iov_err_to_virt_err(status);
+ goto done;
+ }
+
+done:
+ ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_CONFIG_RSS_KEY,
+ v_status, NULL, 0, NULL);
+}
+
+/**
+ * ice_vc_cfg_rss_lut_msg - Handle VIRTCHNL_OP_CONFIG_RSS_LUT msg from VF
+ * @sc: PF's softc structure
+ * @vf: VF tracking structure
+ * @msg_buf: message buffer from VF
+ *
+ * Adds the LUT from the VF in msg_buf to the PF via an admin queue call.
+ */
+static void
+ice_vc_cfg_rss_lut_msg(struct ice_softc *sc, struct ice_vf *vf, u8 *msg_buf)
+{
+ struct ice_hw *hw = &sc->hw;
+ struct virtchnl_rss_lut *vrl;
+ int status = 0;
+ enum virtchnl_status_code v_status = VIRTCHNL_STATUS_SUCCESS;
+ struct ice_aq_get_set_rss_lut_params lut_params = {};
+ struct ice_vsi *vsi = vf->vsi;
+
+ vrl = (struct virtchnl_rss_lut *)msg_buf;
+
+ if (vrl->vsi_id != vsi->idx) {
+ device_printf(sc->dev,
+ "VF-%d: Message has invalid VSI ID (expected %d, got %d)\n",
+ vf->vf_num, vsi->idx, vrl->vsi_id);
+ v_status = VIRTCHNL_STATUS_ERR_PARAM;
+ goto done;
+ }
+
+ if (vrl->lut_entries > ICE_VSIQF_HLUT_ARRAY_SIZE) {
+ v_status = VIRTCHNL_STATUS_ERR_PARAM;
+ goto done;
+ }
+
+ lut_params.vsi_handle = vsi->idx;
+ lut_params.lut_size = vsi->rss_table_size;
+ lut_params.lut_type = vsi->rss_lut_type;
+ lut_params.lut = vrl->lut;
+ lut_params.global_lut_id = 0;
+
+ status = ice_aq_set_rss_lut(hw, &lut_params);
+ if (status) {
+ device_printf(sc->dev,
+ "VF-%d: Cannot set RSS lut, err %s aq_err %s\n",
+ vf->vf_num, ice_status_str(status),
+ ice_aq_str(hw->adminq.sq_last_status));
+ v_status = ice_iov_err_to_virt_err(status);
+ }
+
+done:
+ ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_CONFIG_RSS_LUT,
+ v_status, NULL, 0, NULL);
+}
+
+/**
+ * ice_vc_set_rss_hena_msg - Handle VIRTCHNL_OP_SET_RSS_HENA msg from VF
+ * @sc: PF's softc structure
+ * @vf: VF tracking structure
+ * @msg_buf: message buffer from VF
+ *
+ * Adds the VF's hena (hash enable) bits as flow types to the PF's RSS flow
+ * type list.
+ */
+static void
+ice_vc_set_rss_hena_msg(struct ice_softc *sc, struct ice_vf *vf, u8 *msg_buf)
+{
+ struct ice_hw *hw = &sc->hw;
+ struct virtchnl_rss_hena *vrh;
+ int status = 0;
+ enum virtchnl_status_code v_status = VIRTCHNL_STATUS_SUCCESS;
+ struct ice_vsi *vsi = vf->vsi;
+
+ MPASS(vsi != NULL);
+
+ vrh = (struct virtchnl_rss_hena *)msg_buf;
+
+ /*
+ * Remove the existing configuration to make sure only the requested
+ * config is applied, and to allow VFs to disable RSS completely.
+ */
+ status = ice_rem_vsi_rss_cfg(hw, vsi->idx);
+ if (vrh->hena) {
+ /*
+ * Failure to remove the old config is not fatal when a new one
+ * is requested. Warn about it, but try to apply the new config
+ * anyway.
+ */
+ if (status)
+ device_printf(sc->dev,
+ "ice_rem_vsi_rss_cfg status %s, error %s\n",
+ ice_status_str(status),
+ ice_aq_str(hw->adminq.sq_last_status));
+ status = ice_add_avf_rss_cfg(hw, vsi->idx, vrh->hena);
+ if (status)
+ device_printf(sc->dev,
+ "ice_add_avf_rss_cfg status %s, error %s\n",
+ ice_status_str(status),
+ ice_aq_str(hw->adminq.sq_last_status));
+ }
+ v_status = ice_iov_err_to_virt_err(status);
+ ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_SET_RSS_HENA,
+ v_status, NULL, 0, NULL);
+}
+
+/**
+ * ice_vc_enable_queues_msg - Handle VIRTCHNL_OP_ENABLE_QUEUES msg from VF
+ * @sc: PF's softc structure
+ * @vf: VF tracking structure
+ * @msg_buf: message buffer from VF
+ *
+ * Enables VF queues selected in msg_buf for Tx/Rx traffic.
+ *
+ * @remark Only actually operates on Rx queues; Tx queues are enabled in
+ * the CONFIG_VSI_QUEUES message handler.
+ */
+static void
+ice_vc_enable_queues_msg(struct ice_softc *sc, struct ice_vf *vf, u8 *msg_buf)
+{
+ struct ice_hw *hw = &sc->hw;
+ struct virtchnl_queue_select *vqs;
+ enum virtchnl_status_code v_status = VIRTCHNL_STATUS_SUCCESS;
+ struct ice_vsi *vsi = vf->vsi;
+ int bit, error = 0;
+
+ vqs = (struct virtchnl_queue_select *)msg_buf;
+
+ if (vqs->vsi_id != vsi->idx) {
+ device_printf(sc->dev,
+ "%s: VF-%d: Message has invalid VSI ID (expected %d, got %d)\n",
+ __func__, vf->vf_num, vsi->idx, vqs->vsi_id);
+ v_status = VIRTCHNL_STATUS_ERR_PARAM;
+ goto done;
+ }
+
+ if (!vqs->rx_queues && !vqs->tx_queues) {
+ device_printf(sc->dev,
+ "%s: VF-%d: message queue masks are empty\n",
+ __func__, vf->vf_num);
+ v_status = VIRTCHNL_STATUS_ERR_PARAM;
+ goto done;
+ }
+
+ /* Validate rx_queue mask */
+ bit = fls(vqs->rx_queues);
+ if (bit > vsi->num_rx_queues) {
+ device_printf(sc->dev,
+ "%s: VF-%d: message's rx_queues map (0x%08x) has invalid bit set (%d)\n",
+ __func__, vf->vf_num, vqs->rx_queues, bit);
+ v_status = VIRTCHNL_STATUS_ERR_PARAM;
+ goto done;
+ }
+
+ /* Tx ring enable is handled in an earlier message. */
+ for_each_set_bit(bit, &vqs->rx_queues, 32) {
+ error = ice_control_rx_queue(vsi, bit, true);
+ if (error) {
+ device_printf(sc->dev,
+ "Unable to enable Rx ring %d for receive: %s\n",
+ bit, ice_err_str(error));
+ v_status = VIRTCHNL_STATUS_ERR_PARAM;
+ goto done;
+ }
+ }
+
+done:
+ ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_ENABLE_QUEUES,
+ v_status, NULL, 0, NULL);
+}
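+
+/*
+ * Illustrative example (not part of this change) of the fls()-based
+ * validation above: for rx_queues = 0x0000000d (bits 0, 2 and 3 set),
+ * fls() returns 4, the position of the highest set bit. The mask is
+ * accepted only if the VSI has at least 4 Rx queues, so every set bit
+ * then names a valid ring for ice_control_rx_queue().
+ */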
+
+/**
+ * ice_vc_disable_queues_msg - Handle VIRTCHNL_OP_DISABLE_QUEUES msg
+ * @sc: PF's softc structure
+ * @vf: VF tracking structure
+ * @msg_buf: message buffer from VF
+ *
+ * Disables all VF queues for the VF's VSI.
+ *
+ * @remark Unlike the ENABLE_QUEUES handler, this operates on both
+ * Tx and Rx queues
+ */
+static void
+ice_vc_disable_queues_msg(struct ice_softc *sc, struct ice_vf *vf,
+ u8 *msg_buf __unused)
+{
+ struct ice_hw *hw = &sc->hw;
+ enum virtchnl_status_code v_status = VIRTCHNL_STATUS_SUCCESS;
+ struct ice_vsi *vsi = vf->vsi;
+ int error = 0;
+
+ error = ice_control_all_rx_queues(vsi, false);
+ if (error) {
+ device_printf(sc->dev,
+ "Unable to disable Rx rings for transmit: %s\n",
+ ice_err_str(error));
+ v_status = VIRTCHNL_STATUS_ERR_PARAM;
+ goto done;
+ }
+
+ error = ice_vsi_disable_tx(vsi);
+ if (error) {
+ /* Already prints an error message */
+ v_status = VIRTCHNL_STATUS_ERR_PARAM;
+ }
+
+done:
+ ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_DISABLE_QUEUES,
+ v_status, NULL, 0, NULL);
+}
+
+/**
+ * ice_vc_cfg_irq_map_msg - Handle VIRTCHNL_OP_CFG_IRQ_MAP msg from VF
+ * @sc: PF's softc structure
+ * @vf: VF tracking structure
+ * @msg_buf: message buffer from VF
+ *
+ * Configures the interrupt vectors described in the message in msg_buf. The
+ * VF needs to send this message during init, so that queues can be allowed
+ * to generate interrupts.
+ */
+static void
+ice_vc_cfg_irq_map_msg(struct ice_softc *sc, struct ice_vf *vf, u8 *msg_buf)
+{
+#define ICE_VIRTCHNL_QUEUE_MAP_SIZE 16
+ struct ice_hw *hw = &sc->hw;
+ struct virtchnl_irq_map_info *vimi;
+ struct virtchnl_vector_map *vvm;
+ enum virtchnl_status_code v_status = VIRTCHNL_STATUS_SUCCESS;
+ struct ice_vsi *vsi = vf->vsi;
+ u16 vector;
+
+ vimi = (struct virtchnl_irq_map_info *)msg_buf;
+
+ if (vimi->num_vectors > vf->num_irq_vectors) {
+ device_printf(sc->dev,
+ "%s: VF-%d: message has more vectors (%d) than configured for VF (%d)\n",
+ __func__, vf->vf_num, vimi->num_vectors, vf->num_irq_vectors);
+ v_status = VIRTCHNL_STATUS_ERR_PARAM;
+ goto done;
+ }
+
+ vvm = vimi->vecmap;
+ /* Save off information from message */
+ for (int i = 0; i < vimi->num_vectors; i++, vvm++) {
+ struct ice_tx_queue *txq;
+ struct ice_rx_queue *rxq;
+ int bit;
+
+ if (vvm->vsi_id != vf->vsi->idx) {
+ device_printf(sc->dev,
+ "%s: VF-%d: message's VSI ID (%d) does not match VF's (%d) for vector %d\n",
+ __func__, vf->vf_num, vvm->vsi_id, vf->vsi->idx, i);
+ v_status = VIRTCHNL_STATUS_ERR_PARAM;
+ goto done;
+ }
+
+ /* vvm->vector_id is relative to VF space */
+ vector = vvm->vector_id;
+
+ if (vector >= vf->num_irq_vectors) {
+ device_printf(sc->dev,
+ "%s: VF-%d: message's vector ID (%d) is greater than VF's max ID (%d)\n",
+ __func__, vf->vf_num, vector, vf->num_irq_vectors - 1);
+ v_status = VIRTCHNL_STATUS_ERR_PARAM;
+ goto done;
+ }
+
+ /* The Misc/Admin Queue vector doesn't need mapping */
+ if (vector == 0)
+ continue;
+
+ /* coverity[address_of] */
+ for_each_set_bit(bit, &vvm->txq_map, ICE_VIRTCHNL_QUEUE_MAP_SIZE) {
+ if (bit >= vsi->num_tx_queues) {
+ device_printf(sc->dev,
+ "%s: VF-%d: txq map has invalid bit set\n",
+ __func__, vf->vf_num);
+ v_status = VIRTCHNL_STATUS_ERR_PARAM;
+ goto done;
+ }
+
+ vf->tx_irqvs[vector].me = vector;
+
+ txq = &vsi->tx_queues[bit];
+ txq->irqv = &vf->tx_irqvs[vector];
+ txq->itr_idx = vvm->txitr_idx;
+ }
+ /* coverity[address_of] */
+ for_each_set_bit(bit, &vvm->rxq_map, ICE_VIRTCHNL_QUEUE_MAP_SIZE) {
+ if (bit >= vsi->num_rx_queues) {
+ device_printf(sc->dev,
+ "%s: VF-%d: rxq map has invalid bit set\n",
+ __func__, vf->vf_num);
+ v_status = VIRTCHNL_STATUS_ERR_PARAM;
+ goto done;
+ }
+ vf->rx_irqvs[vector].me = vector;
+
+ rxq = &vsi->rx_queues[bit];
+ rxq->irqv = &vf->rx_irqvs[vector];
+ rxq->itr_idx = vvm->rxitr_idx;
+ }
+ }
+
+ /* Write to T/RQCTL registers to actually map vectors to queues */
+ for (int i = 0; i < vf->vsi->num_rx_queues; i++)
+ if (vsi->rx_queues[i].irqv != NULL)
+ ice_configure_rxq_interrupt(hw, vsi->rx_qmap[i],
+ vsi->rx_queues[i].irqv->me, vsi->rx_queues[i].itr_idx);
+
+ for (int i = 0; i < vf->vsi->num_tx_queues; i++)
+ if (vsi->tx_queues[i].irqv != NULL)
+ ice_configure_txq_interrupt(hw, vsi->tx_qmap[i],
+ vsi->tx_queues[i].irqv->me, vsi->tx_queues[i].itr_idx);
+
+ ice_flush(hw);
+
+done:
+ ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_CONFIG_IRQ_MAP,
+ v_status, NULL, 0, NULL);
+}
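+
+/*
+ * Illustrative example (not part of this change): a VF with 4 queue
+ * pairs typically reports 5 vectors here. Vector 0 carries only the
+ * mailbox/AdminQ interrupt and is skipped above; a vecmap entry with
+ * vector_id = 2, txq_map = 0x2 and rxq_map = 0x2 ties Tx and Rx queue 1
+ * to the VF's MSI-X vector 2.
+ */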
+
+/**
+ * ice_eth_stats_to_virtchnl_eth_stats - Convert stats for virtchnl
+ * @istats: VSI stats from HW to convert
+ * @vstats: stats struct to copy to
+ *
+ * This function copies all known stats in struct virtchnl_eth_stats from the
+ * input struct ice_eth_stats to an output struct virtchnl_eth_stats.
+ *
+ * @remark These two structure types currently have the same definition up to
+ * the size of struct virtchnl_eth_stats (on FreeBSD), but that could change
+ * in the future.
+ */
+static void
+ice_eth_stats_to_virtchnl_eth_stats(struct ice_eth_stats *istats,
+ struct virtchnl_eth_stats *vstats)
+{
+ vstats->rx_bytes = istats->rx_bytes;
+ vstats->rx_unicast = istats->rx_unicast;
+ vstats->rx_multicast = istats->rx_multicast;
+ vstats->rx_broadcast = istats->rx_broadcast;
+ vstats->rx_discards = istats->rx_discards;
+ vstats->rx_unknown_protocol = istats->rx_unknown_protocol;
+ vstats->tx_bytes = istats->tx_bytes;
+ vstats->tx_unicast = istats->tx_unicast;
+ vstats->tx_multicast = istats->tx_multicast;
+ vstats->tx_broadcast = istats->tx_broadcast;
+ vstats->tx_discards = istats->tx_discards;
+ vstats->tx_errors = istats->tx_errors;
+}
+
+/**
+ * ice_vc_get_stats_msg - Handle VIRTCHNL_OP_GET_STATS msg
+ * @sc: device private structure
+ * @vf: VF tracking structure
+ * @msg_buf: raw message buffer from the VF
+ *
+ * Updates the VF's VSI stats and sends those stats back to the VF.
+ */
+static void
+ice_vc_get_stats_msg(struct ice_softc *sc, struct ice_vf *vf, u8 *msg_buf)
+{
+ struct virtchnl_queue_select *vqs;
+ struct virtchnl_eth_stats stats;
+ struct ice_vsi *vsi = vf->vsi;
+ struct ice_hw *hw = &sc->hw;
+
+ vqs = (struct virtchnl_queue_select *)msg_buf;
+
+ if (vqs->vsi_id != vsi->idx) {
+ device_printf(sc->dev,
+ "%s: VF-%d: message has invalid VSI ID %d (VF has VSI ID %d)\n",
+ __func__, vf->vf_num, vqs->vsi_id, vsi->idx);
+ ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_GET_STATS,
+ VIRTCHNL_STATUS_ERR_PARAM, NULL, 0, NULL);
+ return;
+ }
+
+ ice_update_vsi_hw_stats(vf->vsi);
+ ice_eth_stats_to_virtchnl_eth_stats(&vsi->hw_stats.cur, &stats);
+
+ ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_GET_STATS,
+ VIRTCHNL_STATUS_SUCCESS, (u8 *)&stats,
+ sizeof(struct virtchnl_eth_stats), NULL);
+}
+
+/**
+ * ice_vc_cfg_promisc_mode_msg - Handle VIRTCHNL_OP_CONFIG_PROMISCUOUS_MODE
+ * @sc: PF's softc structure
+ * @vf: VF tracking structure
+ * @msg_buf: message buffer from VF
+ *
+ * Configures the promiscuous modes for the given VSI in msg_buf.
+ */
+static void
+ice_vc_cfg_promisc_mode_msg(struct ice_softc *sc, struct ice_vf *vf, u8 *msg_buf)
+{
+ struct ice_hw *hw = &sc->hw;
+ struct virtchnl_promisc_info *vpi;
+ enum virtchnl_status_code v_status = VIRTCHNL_STATUS_SUCCESS;
+ int status = 0;
+ struct ice_vsi *vsi = vf->vsi;
+ ice_declare_bitmap(old_promisc_mask, ICE_PROMISC_MAX);
+ ice_declare_bitmap(req_promisc_mask, ICE_PROMISC_MAX);
+ ice_declare_bitmap(clear_promisc_mask, ICE_PROMISC_MAX);
+ ice_declare_bitmap(set_promisc_mask, ICE_PROMISC_MAX);
+ ice_declare_bitmap(old_req_xor_mask, ICE_PROMISC_MAX);
+ u16 vid;
+
+ vpi = (struct virtchnl_promisc_info *)msg_buf;
+
+ /* Check to see if VF has permission to configure promiscuous mode */
+ if (!(vf->vf_flags & VF_FLAG_PROMISC_CAP)) {
+ device_printf(sc->dev,
+ "VF-%d: attempted to configure promiscuous mode\n",
+ vf->vf_num);
+ /* Don't reply to VF with an error */
+ goto done;
+ }
+
+ if (vpi->vsi_id != vsi->idx) {
+ device_printf(sc->dev,
+ "VF-%d: Message has invalid VSI ID (expected %d, got %d)\n",
+ vf->vf_num, vsi->idx, vpi->vsi_id);
+ v_status = VIRTCHNL_STATUS_ERR_PARAM;
+ goto done;
+ }
+
+ if (vpi->flags & ~ICE_VIRTCHNL_VALID_PROMISC_FLAGS) {
+ device_printf(sc->dev,
+ "VF-%d: Message has invalid promiscuous flags set (valid 0x%02x, got 0x%02x)\n",
+ vf->vf_num, ICE_VIRTCHNL_VALID_PROMISC_FLAGS,
+ vpi->flags);
+ v_status = VIRTCHNL_STATUS_ERR_PARAM;
+ goto done;
+ }
+
+ ice_zero_bitmap(req_promisc_mask, ICE_PROMISC_MAX);
+ /* Convert virtchnl flags to ice AQ promiscuous mode flags */
+ if (vpi->flags & FLAG_VF_UNICAST_PROMISC) {
+ ice_set_bit(ICE_PROMISC_UCAST_TX, req_promisc_mask);
+ ice_set_bit(ICE_PROMISC_UCAST_RX, req_promisc_mask);
+ }
+ if (vpi->flags & FLAG_VF_MULTICAST_PROMISC) {
+ ice_set_bit(ICE_PROMISC_MCAST_TX, req_promisc_mask);
+ ice_set_bit(ICE_PROMISC_MCAST_RX, req_promisc_mask);
+ }
+
+ status = ice_get_vsi_promisc(hw, vsi->idx, old_promisc_mask, &vid);
+ if (status) {
+ device_printf(sc->dev,
+ "VF-%d: Failed to get promiscuous mode mask for VSI %d, err %s aq_err %s\n",
+ vf->vf_num, vsi->idx,
+ ice_status_str(status),
+ ice_aq_str(hw->adminq.sq_last_status));
+ v_status = ice_iov_err_to_virt_err(status);
+ goto done;
+ }
+
+ /* Figure out what got added and what got removed */
+ ice_zero_bitmap(old_req_xor_mask, ICE_PROMISC_MAX);
+ ice_xor_bitmap(old_req_xor_mask, old_promisc_mask, req_promisc_mask, ICE_PROMISC_MAX);
+ ice_and_bitmap(clear_promisc_mask, old_req_xor_mask, old_promisc_mask, ICE_PROMISC_MAX);
+ ice_and_bitmap(set_promisc_mask, old_req_xor_mask, req_promisc_mask, ICE_PROMISC_MAX);
+
+ if (ice_is_any_bit_set(clear_promisc_mask, ICE_PROMISC_MAX)) {
+ status = ice_clear_vsi_promisc(hw, vsi->idx,
+ clear_promisc_mask, 0);
+ if (status) {
+ device_printf(sc->dev,
+ "VF-%d: Failed to clear promiscuous mode for VSI %d, err %s aq_err %s\n",
+ vf->vf_num, vsi->idx,
+ ice_status_str(status),
+ ice_aq_str(hw->adminq.sq_last_status));
+ v_status = ice_iov_err_to_virt_err(status);
+ goto done;
+ }
+ }
+
+ if (ice_is_any_bit_set(set_promisc_mask, ICE_PROMISC_MAX)) {
+ status = ice_set_vsi_promisc(hw, vsi->idx, set_promisc_mask, 0);
+ if (status) {
+ device_printf(sc->dev,
+ "VF-%d: Failed to set promiscuous mode for VSI %d, err %s aq_err %s\n",
+ vf->vf_num, vsi->idx,
+ ice_status_str(status),
+ ice_aq_str(hw->adminq.sq_last_status));
+ v_status = ice_iov_err_to_virt_err(status);
+ goto done;
+ }
+ }
+
+done:
+ ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_CONFIG_PROMISCUOUS_MODE,
+ v_status, NULL, 0, NULL);
+}
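+
+/*
+ * Illustrative example (not part of this change) of the bitmap
+ * arithmetic above, with the masks written symbolically. If the VSI
+ * currently has unicast promiscuous enabled (old = UCAST) and the VF
+ * requests multicast only (req = MCAST):
+ *
+ * xor = old ^ req = UCAST | MCAST (everything that changed)
+ * clear = xor & old = UCAST (set before, no longer requested)
+ * set = xor & req = MCAST (newly requested)
+ *
+ * so only the delta is programmed instead of clearing and re-applying
+ * the whole promiscuous configuration on every message.
+ */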
+
+/**
+ * ice_vc_notify_all_vfs_link_state - Notify all VFs of PF link state
+ * @sc: device private structure
+ *
+ * Sends a message to all VFs about the status of the PF's link
+ * state. For more details, @see ice_vc_notify_vf_link_state.
+ */
+void
+ice_vc_notify_all_vfs_link_state(struct ice_softc *sc)
+{
+ for (int i = 0; i < sc->num_vfs; i++)
+ ice_vc_notify_vf_link_state(sc, &sc->vfs[i]);
+}
+
+/**
+ * ice_vc_notify_vf_link_state - Notify VF of PF link state
+ * @sc: device private structure
+ * @vf: VF tracking structure
+ *
+ * Sends an event message to the specified VF with information about
+ * the current link state from the PF's port. This includes whether
+ * link is up or down, and the link speed in Mbps.
+ */
+static void
+ice_vc_notify_vf_link_state(struct ice_softc *sc, struct ice_vf *vf)
+{
+ struct virtchnl_pf_event event = {};
+ struct ice_hw *hw = &sc->hw;
+
+ event.event = VIRTCHNL_EVENT_LINK_CHANGE;
+ event.severity = PF_EVENT_SEVERITY_INFO;
+ event.event_data.link_event_adv.link_status = sc->link_up;
+ event.event_data.link_event_adv.link_speed =
+ (u32)ice_conv_link_speed_to_virtchnl(true,
+ hw->port_info->phy.link_info.link_speed);
+
+ ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_EVENT,
+ VIRTCHNL_STATUS_SUCCESS, (u8 *)&event, sizeof(event), NULL);
+}
+
+/**
+ * ice_vc_handle_vf_msg - Handle a message from a VF
+ * @sc: device private structure
+ * @event: event received from the HW MBX queue
+ *
+ * Called whenever an event is received from a VF on the HW mailbox queue.
+ * Responsible for handling these messages as well as responding to the
+ * VF afterwards, depending on the received message type.
+ */
+void
+ice_vc_handle_vf_msg(struct ice_softc *sc, struct ice_rq_event_info *event)
+{
+ struct ice_hw *hw = &sc->hw;
+ device_t dev = sc->dev;
+ struct ice_vf *vf;
+ int err = 0;
+
+ u32 v_opcode = event->desc.cookie_high;
+ u16 v_id = event->desc.retval;
+ u8 *msg = event->msg_buf;
+ u16 msglen = event->msg_len;
+
+ if (v_id >= sc->num_vfs) {
+ device_printf(dev, "%s: Received msg from invalid VF-%d: opcode %d, len %d\n",
+ __func__, v_id, v_opcode, msglen);
+ return;
+ }
+
+ vf = &sc->vfs[v_id];
+
+ /* Perform basic checks on the msg */
+ err = virtchnl_vc_validate_vf_msg(&vf->version, v_opcode, msg, msglen);
+ if (err) {
+ device_printf(dev, "%s: Received invalid msg from VF-%d: opcode %d, len %d, error %d\n",
+ __func__, vf->vf_num, v_opcode, msglen, err);
+ ice_aq_send_msg_to_vf(hw, v_id, v_opcode, VIRTCHNL_STATUS_ERR_PARAM, NULL, 0, NULL);
+ return;
+ }
+
+ switch (v_opcode) {
+ case VIRTCHNL_OP_VERSION:
+ ice_vc_version_msg(sc, vf, msg);
+ break;
+ case VIRTCHNL_OP_RESET_VF:
+ ice_reset_vf(sc, vf, true);
+ break;
+ case VIRTCHNL_OP_GET_VF_RESOURCES:
+ ice_vc_get_vf_res_msg(sc, vf, msg);
+ break;
+ case VIRTCHNL_OP_ADD_ETH_ADDR:
+ ice_vc_add_eth_addr_msg(sc, vf, msg);
+ break;
+ case VIRTCHNL_OP_DEL_ETH_ADDR:
+ ice_vc_del_eth_addr_msg(sc, vf, msg);
+ break;
+ case VIRTCHNL_OP_ADD_VLAN:
+ ice_vc_add_vlan_msg(sc, vf, msg);
+ break;
+ case VIRTCHNL_OP_DEL_VLAN:
+ ice_vc_del_vlan_msg(sc, vf, msg);
+ break;
+ case VIRTCHNL_OP_CONFIG_VSI_QUEUES:
+ ice_vc_cfg_vsi_qs_msg(sc, vf, msg);
+ break;
+ case VIRTCHNL_OP_CONFIG_RSS_KEY:
+ ice_vc_cfg_rss_key_msg(sc, vf, msg);
+ break;
+ case VIRTCHNL_OP_CONFIG_RSS_LUT:
+ ice_vc_cfg_rss_lut_msg(sc, vf, msg);
+ break;
+ case VIRTCHNL_OP_SET_RSS_HENA:
+ ice_vc_set_rss_hena_msg(sc, vf, msg);
+ break;
+ case VIRTCHNL_OP_ENABLE_QUEUES:
+ ice_vc_enable_queues_msg(sc, vf, msg);
+ ice_vc_notify_vf_link_state(sc, vf);
+ break;
+ case VIRTCHNL_OP_DISABLE_QUEUES:
+ ice_vc_disable_queues_msg(sc, vf, msg);
+ break;
+ case VIRTCHNL_OP_CONFIG_IRQ_MAP:
+ ice_vc_cfg_irq_map_msg(sc, vf, msg);
+ break;
+ case VIRTCHNL_OP_GET_STATS:
+ ice_vc_get_stats_msg(sc, vf, msg);
+ break;
+ case VIRTCHNL_OP_CONFIG_PROMISCUOUS_MODE:
+ ice_vc_cfg_promisc_mode_msg(sc, vf, msg);
+ break;
+ default:
+ device_printf(dev, "%s: Received unknown msg from VF-%d: opcode %d, len %d\n",
+ __func__, vf->vf_num, v_opcode, msglen);
+ ice_aq_send_msg_to_vf(hw, v_id, v_opcode,
+ VIRTCHNL_STATUS_ERR_NOT_SUPPORTED, NULL, 0, NULL);
+ break;
+ }
+}
+
+/**
+ * ice_iov_setup_intr_mapping - Setup interrupt config for a VF
+ * @sc: device softc structure
+ * @vf: driver's VF structure for VF to be configured
+ *
+ * Before a VF can be used, and after a VF reset, the PF must configure
+ * the VF's interrupt allocation registers. This includes allocating
+ * interrupts from the PF's interrupt pool to the VF using the
+ * VPINT_ALLOC(_PCI) registers, and setting up a mapping from PF vectors
+ * to VF vectors in GLINT_VECT2FUNC.
+ *
+ * As well, this sets up queue allocation registers and maps the mailbox
+ * interrupt for the VF.
+ */
+static void
+ice_iov_setup_intr_mapping(struct ice_softc *sc, struct ice_vf *vf)
+{
+ struct ice_hw *hw = &sc->hw;
+ struct ice_vsi *vsi = vf->vsi;
+ u16 v;
+
+ /* Calculate indices for register ops below */
+ u16 vf_first_irq_idx = vf->vf_imap[0];
+ u16 vf_last_irq_idx = (vf_first_irq_idx + vf->num_irq_vectors) - 1;
+ u16 abs_vf_first_irq_idx = hw->func_caps.common_cap.msix_vector_first_id +
+ vf_first_irq_idx;
+ u16 abs_vf_last_irq_idx = (abs_vf_first_irq_idx + vf->num_irq_vectors) - 1;
+ u16 abs_vf_num = vf->vf_num + hw->func_caps.vf_base_id;
+
+ /* Map out VF interrupt allocation in global device space. Both
+ * VPINT_ALLOC and VPINT_ALLOC_PCI use the same values.
+ */
+ wr32(hw, VPINT_ALLOC(vf->vf_num),
+ (((abs_vf_first_irq_idx << VPINT_ALLOC_FIRST_S) & VPINT_ALLOC_FIRST_M) |
+ ((abs_vf_last_irq_idx << VPINT_ALLOC_LAST_S) & VPINT_ALLOC_LAST_M) |
+ VPINT_ALLOC_VALID_M));
+ wr32(hw, VPINT_ALLOC_PCI(vf->vf_num),
+ (((abs_vf_first_irq_idx << VPINT_ALLOC_PCI_FIRST_S) & VPINT_ALLOC_PCI_FIRST_M) |
+ ((abs_vf_last_irq_idx << VPINT_ALLOC_PCI_LAST_S) & VPINT_ALLOC_PCI_LAST_M) |
+ VPINT_ALLOC_PCI_VALID_M));
+
+ /* Create inverse mapping of vectors to PF/VF combinations */
+ for (v = vf_first_irq_idx; v <= vf_last_irq_idx; v++) {
+ wr32(hw, GLINT_VECT2FUNC(v),
+ (((abs_vf_num << GLINT_VECT2FUNC_VF_NUM_S) & GLINT_VECT2FUNC_VF_NUM_M) |
+ ((hw->pf_id << GLINT_VECT2FUNC_PF_NUM_S) & GLINT_VECT2FUNC_PF_NUM_M)));
+ }
+
+ /* Map mailbox interrupt to MSI-X index 0. Disable ITR for it, too. */
+ wr32(hw, VPINT_MBX_CTL(abs_vf_num),
+ ((0 << VPINT_MBX_CTL_MSIX_INDX_S) & VPINT_MBX_CTL_MSIX_INDX_M) |
+ ((0x3 << VPINT_MBX_CTL_ITR_INDX_S) & VPINT_MBX_CTL_ITR_INDX_M) |
+ VPINT_MBX_CTL_CAUSE_ENA_M);
+
+ /* Mark the TX queue mapping registers as valid */
+ wr32(hw, VPLAN_TXQ_MAPENA(vf->vf_num), VPLAN_TXQ_MAPENA_TX_ENA_M);
+
+ /* Indicate to HW that VF has scattered queue allocation */
+ wr32(hw, VPLAN_TX_QBASE(vf->vf_num), VPLAN_TX_QBASE_VFQTABLE_ENA_M);
+ for (int i = 0; i < vsi->num_tx_queues; i++) {
+ wr32(hw, VPLAN_TX_QTABLE(i, vf->vf_num),
+ (vsi->tx_qmap[i] << VPLAN_TX_QTABLE_QINDEX_S) & VPLAN_TX_QTABLE_QINDEX_M);
+ }
+
+ /* Mark the RX queue mapping registers as valid */
+ wr32(hw, VPLAN_RXQ_MAPENA(vf->vf_num), VPLAN_RXQ_MAPENA_RX_ENA_M);
+ wr32(hw, VPLAN_RX_QBASE(vf->vf_num), VPLAN_RX_QBASE_VFQTABLE_ENA_M);
+ for (int i = 0; i < vsi->num_rx_queues; i++) {
+ wr32(hw, VPLAN_RX_QTABLE(i, vf->vf_num),
+ (vsi->rx_qmap[i] << VPLAN_RX_QTABLE_QINDEX_S) & VPLAN_RX_QTABLE_QINDEX_M);
+ }
+}
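+
+/*
+ * Illustrative example (not part of this change) of the index math
+ * above: if the VF owns PF-relative vectors 12..14 (vf_imap[0] = 12,
+ * num_irq_vectors = 3) and msix_vector_first_id = 1, the absolute range
+ * 13..15 is written to VPINT_ALLOC/VPINT_ALLOC_PCI, while
+ * GLINT_VECT2FUNC(12..14) is pointed back at this PF/VF pair.
+ */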
+
+/**
+ * ice_iov_err_to_virt_err - translate ice errors into virtchnl errors
+ * @ice_err: status returned from ice function
+ */
+static enum virtchnl_status_code
+ice_iov_err_to_virt_err(int ice_err)
+{
+ switch (ice_err) {
+ case 0:
+ return VIRTCHNL_STATUS_SUCCESS;
+ case ICE_ERR_BAD_PTR:
+ case ICE_ERR_INVAL_SIZE:
+ case ICE_ERR_DEVICE_NOT_SUPPORTED:
+ case ICE_ERR_PARAM:
+ case ICE_ERR_CFG:
+ return VIRTCHNL_STATUS_ERR_PARAM;
+ case ICE_ERR_NO_MEMORY:
+ return VIRTCHNL_STATUS_ERR_NO_MEMORY;
+ case ICE_ERR_NOT_READY:
+ case ICE_ERR_RESET_FAILED:
+ case ICE_ERR_FW_API_VER:
+ case ICE_ERR_AQ_ERROR:
+ case ICE_ERR_AQ_TIMEOUT:
+ case ICE_ERR_AQ_FULL:
+ case ICE_ERR_AQ_NO_WORK:
+ case ICE_ERR_AQ_EMPTY:
+ return VIRTCHNL_STATUS_ERR_ADMIN_QUEUE_ERROR;
+ default:
+ return VIRTCHNL_STATUS_ERR_NOT_SUPPORTED;
+ }
+}
diff --git a/sys/dev/ice/ice_iov.h b/sys/dev/ice/ice_iov.h
new file mode 100644
index 000000000000..c4fb3e932e3f
--- /dev/null
+++ b/sys/dev/ice/ice_iov.h
@@ -0,0 +1,125 @@
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* Copyright (c) 2025, Intel Corporation
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * @file ice_iov.h
+ * @brief header for IOV functionality
+ *
+ * This header includes definitions used to implement device Virtual Functions
+ * for the ice driver.
+ */
+
+#ifndef _ICE_IOV_H_
+#define _ICE_IOV_H_
+
+#include <sys/types.h>
+#include <sys/bus.h>
+#include <sys/nv.h>
+#include <sys/iov_schema.h>
+#include <sys/param.h>
+#include <dev/pci/pcivar.h>
+#include <dev/pci/pcireg.h>
+
+#include <dev/pci/pci_iov.h>
+
+#include "ice_iflib.h"
+#include "ice_vf_mbx.h"
+
+/**
+ * @enum ice_vf_flags
+ * @brief VF state flags
+ *
+ * Used to indicate the status of a PF's VF, as well as indicating what each VF
+ * is capable of. Intended to be modified only using atomic operations, so
+ * they can be read and modified in places that aren't locked.
+ *
+ * Used in struct ice_vf's vf_flags field.
+ */
+enum ice_vf_flags {
+ VF_FLAG_ENABLED = BIT(0),
+ VF_FLAG_SET_MAC_CAP = BIT(1),
+ VF_FLAG_VLAN_CAP = BIT(2),
+ VF_FLAG_PROMISC_CAP = BIT(3),
+ VF_FLAG_MAC_ANTI_SPOOF = BIT(4),
+};
+
+/**
+ * @struct ice_vf
+ * @brief PF's VF software context
+ *
+ * Represents the state and options for a VF spawned from a PF.
+ */
+struct ice_vf {
+ struct ice_vsi *vsi;
+ u32 vf_flags;
+
+ u8 mac[ETHER_ADDR_LEN];
+ u16 vf_num;
+ struct virtchnl_version_info version;
+
+ u16 mac_filter_limit;
+ u16 mac_filter_cnt;
+ u16 vlan_limit;
+ u16 vlan_cnt;
+
+ u16 num_irq_vectors;
+ u16 *vf_imap;
+ struct ice_irq_vector *tx_irqvs;
+ struct ice_irq_vector *rx_irqvs;
+};
+
+#define ICE_PCIE_DEV_STATUS 0xAA
+
+#define ICE_PCI_CIAD_WAIT_COUNT 100
+#define ICE_PCI_CIAD_WAIT_DELAY_US 1
+#define ICE_VPGEN_VFRSTAT_WAIT_COUNT 100
+#define ICE_VPGEN_VFRSTAT_WAIT_DELAY_US 20
+
+#define ICE_VIRTCHNL_VALID_PROMISC_FLAGS (FLAG_VF_UNICAST_PROMISC | \
+ FLAG_VF_MULTICAST_PROMISC)
+
+#define ICE_DEFAULT_VF_VLAN_LIMIT 64
+#define ICE_DEFAULT_VF_FILTER_LIMIT 16
+
+int ice_iov_attach(struct ice_softc *sc);
+int ice_iov_detach(struct ice_softc *sc);
+
+int ice_iov_init(struct ice_softc *sc, uint16_t num_vfs, const nvlist_t *params);
+int ice_iov_add_vf(struct ice_softc *sc, uint16_t vfnum, const nvlist_t *params);
+void ice_iov_uninit(struct ice_softc *sc);
+
+void ice_iov_handle_vflr(struct ice_softc *sc);
+
+void ice_vc_handle_vf_msg(struct ice_softc *sc, struct ice_rq_event_info *event);
+void ice_vc_notify_all_vfs_link_state(struct ice_softc *sc);
+
+#endif /* _ICE_IOV_H_ */
+
diff --git a/sys/dev/ice/ice_lib.c b/sys/dev/ice/ice_lib.c
index d44ae5f37750..442111e5ffaf 100644
--- a/sys/dev/ice/ice_lib.c
+++ b/sys/dev/ice/ice_lib.c
@@ -42,6 +42,9 @@
#include "ice_lib.h"
#include "ice_iflib.h"
+#ifdef PCI_IOV
+#include "ice_iov.h"
+#endif
#include <dev/pci/pcivar.h>
#include <dev/pci/pcireg.h>
#include <machine/resource.h>
@@ -741,6 +744,12 @@ ice_initialize_vsi(struct ice_vsi *vsi)
case ICE_VSI_VMDQ2:
ctx.flags = ICE_AQ_VSI_TYPE_VMDQ2;
break;
+#ifdef PCI_IOV
+ case ICE_VSI_VF:
+ ctx.flags = ICE_AQ_VSI_TYPE_VF;
+ ctx.vf_num = vsi->vf_num;
+ break;
+#endif
default:
return (ENODEV);
}
@@ -1607,6 +1616,12 @@ ice_setup_tx_ctx(struct ice_tx_queue *txq, struct ice_tlan_ctx *tlan_ctx, u16 pf
case ICE_VSI_VMDQ2:
tlan_ctx->vmvf_type = ICE_TLAN_CTX_VMVF_TYPE_VMQ;
break;
+#ifdef PCI_IOV
+ case ICE_VSI_VF:
+ tlan_ctx->vmvf_type = ICE_TLAN_CTX_VMVF_TYPE_VF;
+ tlan_ctx->vmvf_num = hw->func_caps.vf_base_id + vsi->vf_num;
+ break;
+#endif
default:
return (ENODEV);
}
@@ -1660,6 +1675,10 @@ ice_cfg_vsi_for_tx(struct ice_vsi *vsi)
struct ice_tlan_ctx tlan_ctx = { 0 };
struct ice_tx_queue *txq = &vsi->tx_queues[i];
+ /* Stop at the first unconfigured queue */
+ if (txq->desc_count == 0)
+ break;
+
pf_q = vsi->tx_qmap[txq->me];
qg->txqs[0].txq_id = htole16(pf_q);
@@ -1788,6 +1807,10 @@ ice_cfg_vsi_for_rx(struct ice_vsi *vsi)
for (i = 0; i < vsi->num_rx_queues; i++) {
MPASS(vsi->mbuf_sz > 0);
+ /* Stop at the first unconfigured queue */
+ if (vsi->rx_queues[i].desc_count == 0)
+ break;
+
err = ice_setup_rx_ctx(&vsi->rx_queues[i]);
if (err)
return err;
@@ -2257,6 +2280,11 @@ ice_process_ctrlq_event(struct ice_softc *sc, const char *qname,
case ice_aqc_opc_get_link_status:
ice_process_link_event(sc, event);
break;
+#ifdef PCI_IOV
+ case ice_mbx_opc_send_msg_to_pf:
+ ice_vc_handle_vf_msg(sc, event);
+ break;
+#endif
case ice_aqc_opc_fw_logs_event:
ice_handle_fw_log_event(sc, &event->desc, event->msg_buf);
break;
diff --git a/sys/dev/ice/ice_lib.h b/sys/dev/ice/ice_lib.h
index b6b23ec82161..308b2bda2790 100644
--- a/sys/dev/ice/ice_lib.h
+++ b/sys/dev/ice/ice_lib.h
@@ -611,6 +611,10 @@ struct ice_vsi {
u16 mirror_src_vsi;
u16 rule_mir_ingress;
u16 rule_mir_egress;
+
+#ifdef PCI_IOV
+ u8 vf_num; /* Index of owning VF, if applicable */
+#endif
};
/**
diff --git a/sys/dev/ice/ice_vf_mbx.c b/sys/dev/ice/ice_vf_mbx.c
new file mode 100644
index 000000000000..387a1c6739a6
--- /dev/null
+++ b/sys/dev/ice/ice_vf_mbx.c
@@ -0,0 +1,471 @@
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* Copyright (c) 2025, Intel Corporation
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "ice_common.h"
+#include "ice_hw_autogen.h"
+#include "ice_vf_mbx.h"
+
+/**
+ * ice_aq_send_msg_to_vf
+ * @hw: pointer to the hardware structure
+ * @vfid: VF ID to send msg
+ * @v_opcode: opcodes for VF-PF communication
+ * @v_retval: return error code
+ * @msg: pointer to the msg buffer
+ * @msglen: msg length
+ * @cd: pointer to command details
+ *
+ * Send a message to the VF driver (0x0802) using the mailbox
+ * queue. The message is sent asynchronously via the
+ * ice_sq_send_cmd() function.
+ */
+int
+ice_aq_send_msg_to_vf(struct ice_hw *hw, u16 vfid, u32 v_opcode, u32 v_retval,
+ u8 *msg, u16 msglen, struct ice_sq_cd *cd)
+{
+ struct ice_aqc_pf_vf_msg *cmd;
+ struct ice_aq_desc desc;
+
+ ice_fill_dflt_direct_cmd_desc(&desc, ice_mbx_opc_send_msg_to_vf);
+
+ cmd = &desc.params.virt;
+ cmd->id = CPU_TO_LE32(vfid);
+
+ desc.cookie_high = CPU_TO_LE32(v_opcode);
+ desc.cookie_low = CPU_TO_LE32(v_retval);
+
+ if (msglen)
+ desc.flags |= CPU_TO_LE16(ICE_AQ_FLAG_RD);
+
+ return ice_sq_send_cmd(hw, &hw->mailboxq, &desc, msg, msglen, cd);
+}
+
+/**
+ * ice_aq_send_msg_to_pf
+ * @hw: pointer to the hardware structure
+ * @v_opcode: opcodes for VF-PF communication
+ * @v_retval: return error code
+ * @msg: pointer to the msg buffer
+ * @msglen: msg length
+ * @cd: pointer to command details
+ *
+ * Send message to PF driver using mailbox queue. By default, this
+ * message is sent asynchronously, i.e. ice_sq_send_cmd()
+ * does not wait for completion before returning.
+ */
+int
+ice_aq_send_msg_to_pf(struct ice_hw *hw, enum virtchnl_ops v_opcode,
+ int v_retval, u8 *msg, u16 msglen,
+ struct ice_sq_cd *cd)
+{
+ struct ice_aq_desc desc;
+
+ ice_fill_dflt_direct_cmd_desc(&desc, ice_mbx_opc_send_msg_to_pf);
+ desc.cookie_high = CPU_TO_LE32(v_opcode);
+ desc.cookie_low = CPU_TO_LE32(v_retval);
+
+ if (msglen)
+ desc.flags |= CPU_TO_LE16(ICE_AQ_FLAG_RD);
+
+ return ice_sq_send_cmd(hw, &hw->mailboxq, &desc, msg, msglen, cd);
+}
+
+static const u32 ice_legacy_aq_to_vc_speed[] = {
+ VIRTCHNL_LINK_SPEED_100MB, /* BIT(0) */
+ VIRTCHNL_LINK_SPEED_100MB,
+ VIRTCHNL_LINK_SPEED_1GB,
+ VIRTCHNL_LINK_SPEED_1GB,
+ VIRTCHNL_LINK_SPEED_1GB,
+ VIRTCHNL_LINK_SPEED_10GB,
+ VIRTCHNL_LINK_SPEED_20GB,
+ VIRTCHNL_LINK_SPEED_25GB,
+ VIRTCHNL_LINK_SPEED_40GB,
+ VIRTCHNL_LINK_SPEED_40GB,
+ VIRTCHNL_LINK_SPEED_40GB,
+};
+
+/**
+ * ice_conv_link_speed_to_virtchnl
+ * @adv_link_support: determines the format of the returned link speed
+ * @link_speed: variable containing the link_speed to be converted
+ *
+ * Convert link speed supported by HW to link speed supported by virtchnl.
+ * If adv_link_support is true, then return link speed in Mbps. Else return
+ * link speed as a VIRTCHNL_LINK_SPEED_* cast to a u32. Note that the caller
+ * needs to cast back to an enum virtchnl_link_speed in the case where
+ * adv_link_support is false, but when adv_link_support is true the caller can
+ * expect the speed in Mbps.
+ */
+u32 ice_conv_link_speed_to_virtchnl(bool adv_link_support, u16 link_speed)
+{
+ /* convert a BIT() value into an array index */
+ u16 index = (u16)(ice_fls(link_speed) - 1);
+
+ if (adv_link_support)
+ return ice_get_link_speed(index);
+ else if (index < ARRAY_SIZE(ice_legacy_aq_to_vc_speed))
+ /* Virtchnl speeds are not defined for every speed supported by
+ * the hardware. To maintain compatibility with older AVF
+ * drivers, newer speed values are resolved to the closest
+ * known virtchnl speed when reporting.
+ */
+ return ice_legacy_aq_to_vc_speed[index];
+
+ return VIRTCHNL_LINK_SPEED_UNKNOWN;
+}
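+
+/*
+ * Illustrative example (not part of this change): the AQ reports link
+ * speed as a single BIT() value, so a 10G link yields link_speed =
+ * BIT(5) and ice_fls(0x20) - 1 = 5. With adv_link_support the index
+ * resolves to 10000 (Mbps); without it, index 5 selects
+ * VIRTCHNL_LINK_SPEED_10GB from the legacy table above.
+ */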
+
+/* The mailbox overflow detection algorithm helps to check if there
+ * is a possibility of a malicious VF transmitting too many MBX messages to the
+ * PF.
+ * 1. The mailbox snapshot structure, ice_mbx_snapshot, is initialized during
+ * driver initialization in ice_init_hw() using ice_mbx_init_snapshot().
+ * The struct ice_mbx_snapshot helps to track and traverse a static window of
+ * messages within the mailbox queue while looking for a malicious VF.
+ *
+ * 2. When the caller starts processing its mailbox queue in response to an
+ * interrupt, the structure ice_mbx_snapshot is expected to be cleared before
+ * the algorithm can be run for the first time for that interrupt. This
+ * requires calling ice_mbx_reset_snapshot() as well as calling
+ * ice_mbx_reset_vf_info() for each VF tracking structure.
+ *
+ * 3. For every message read by the caller from the MBX Queue, the caller must
+ * call the detection algorithm's entry function ice_mbx_vf_state_handler().
+ * Before every call to ice_mbx_vf_state_handler(), the struct ice_mbx_data
+ * must be filled in, since it is passed to the algorithm.
+ *
+ * 4. Every time a message is read from the MBX queue, a tracking structure
+ * for the VF must be passed to the state handler. The boolean output
+ * report_malvf from ice_mbx_vf_state_handler() serves as an indicator to the
+ * caller whether it must report this VF as malicious or not.
+ *
+ * 5. When a VF is identified to be malicious, the caller can send a message
+ * to the system administrator.
+ *
+ * 6. The PF is responsible for maintaining the struct ice_mbx_vf_info
+ * structure for each VF. The PF should clear the VF tracking structure if the
+ * VF is reset. When a VF is shut down and brought back up, we will then
+ * assume that the new VF is not malicious and may report it again if we
+ * detect it again.
+ *
+ * 7. The function ice_mbx_reset_snapshot() is called to reset the information
+ * in ice_mbx_snapshot for every new mailbox interrupt handled.
+ */
+#define ICE_RQ_DATA_MASK(rq_data) ((rq_data) & PF_MBX_ARQH_ARQH_M)
+/* Using the highest value for an unsigned 16-bit value 0xFFFF to indicate that
+ * the max messages check must be ignored in the algorithm
+ */
+#define ICE_IGNORE_MAX_MSG_CNT 0xFFFF
+
+/**
+ * ice_mbx_reset_snapshot - Initialize mailbox snapshot structure
+ * @snap: pointer to the mailbox snapshot
+ */
+static void ice_mbx_reset_snapshot(struct ice_mbx_snapshot *snap)
+{
+ struct ice_mbx_vf_info *vf_info;
+
+	/* Clear mbx_buf in the mailbox snapshot structure and set the
+	 * mailbox snapshot state to a new capture.
+	 */
+ ice_memset(&snap->mbx_buf, 0, sizeof(snap->mbx_buf), ICE_NONDMA_MEM);
+ snap->mbx_buf.state = ICE_MAL_VF_DETECT_STATE_NEW_SNAPSHOT;
+
+ /* Reset message counts for all VFs to zero */
+ LIST_FOR_EACH_ENTRY(vf_info, &snap->mbx_vf, ice_mbx_vf_info, list_entry)
+ vf_info->msg_count = 0;
+}
+
+/**
+ * ice_mbx_traverse - Pass through mailbox snapshot
+ * @hw: pointer to the HW struct
+ * @new_state: new algorithm state
+ *
+ * Traversing the mailbox static snapshot without checking
+ * for malicious VFs.
+ */
+static void
+ice_mbx_traverse(struct ice_hw *hw,
+ enum ice_mbx_snapshot_state *new_state)
+{
+ struct ice_mbx_snap_buffer_data *snap_buf;
+ u32 num_iterations;
+
+ snap_buf = &hw->mbx_snapshot.mbx_buf;
+
+	/* As the mailbox buffer is circular, apply a mask
+	 * to the incremented iteration count.
+	 */
+ num_iterations = ICE_RQ_DATA_MASK(++snap_buf->num_iterations);
+
+	/* Exit snapshot traversal when either of the below conditions is met:
+	 * Condition-1: the incremented iteration count equals the mailbox
+	 * head, which indicates that we have reached the end of the static
+	 * snapshot.
+	 * Condition-2: if the maximum number of messages serviced per
+	 * interrupt is the highest possible value, the check is skipped;
+	 * otherwise, exit once the number of messages processed is greater
+	 * than or equal to the maximum number of mailbox entries serviced
+	 * in the current work item.
+	 */
+ if (num_iterations == snap_buf->head ||
+ (snap_buf->max_num_msgs_mbx < ICE_IGNORE_MAX_MSG_CNT &&
+ ++snap_buf->num_msg_proc >= snap_buf->max_num_msgs_mbx))
+ *new_state = ICE_MAL_VF_DETECT_STATE_NEW_SNAPSHOT;
+}
+
+/**
+ * ice_mbx_detect_malvf - Detect malicious VF in snapshot
+ * @hw: pointer to the HW struct
+ * @vf_info: mailbox tracking structure for a VF
+ * @new_state: new algorithm state
+ * @is_malvf: boolean output to indicate if VF is malicious
+ *
+ * This function tracks the number of asynchronous messages
+ * sent per VF and marks the VF as malicious if it exceeds
+ * the permissible number of messages to send.
+ */
+static int
+ice_mbx_detect_malvf(struct ice_hw *hw, struct ice_mbx_vf_info *vf_info,
+ enum ice_mbx_snapshot_state *new_state,
+ bool *is_malvf)
+{
+ /* increment the message count for this VF */
+ vf_info->msg_count++;
+
+ if (vf_info->msg_count >= ICE_ASYNC_VF_MSG_THRESHOLD)
+ *is_malvf = true;
+
+ /* continue to iterate through the mailbox snapshot */
+ ice_mbx_traverse(hw, new_state);
+
+ return 0;
+}
+
+/**
+ * ice_e830_mbx_vf_dec_trig - Decrements the VF mailbox queue counter
+ * @hw: pointer to the HW struct
+ * @event: pointer to the control queue receive event
+ *
+ * This function triggers the hardware to decrement the counter
+ * MBX_VF_IN_FLIGHT_MSGS_AT_PF_CNT when the driver replenishes
+ * the buffers at the PF mailbox queue.
+ */
+void ice_e830_mbx_vf_dec_trig(struct ice_hw *hw,
+ struct ice_rq_event_info *event)
+{
+ u16 vfid = LE16_TO_CPU(event->desc.retval);
+
+ wr32(hw, E830_MBX_VF_DEC_TRIG(vfid), 1);
+}
+
+/**
+ * ice_mbx_vf_clear_cnt_e830 - Clear the VF mailbox queue count
+ * @hw: pointer to the HW struct
+ * @vf_id: VF ID in the PF space
+ *
+ * This function clears the counter MBX_VF_IN_FLIGHT_MSGS_AT_PF_CNT, and should
+ * be called when a VF is created and on VF reset.
+ */
+void ice_mbx_vf_clear_cnt_e830(struct ice_hw *hw, u16 vf_id)
+{
+ u32 reg = rd32(hw, E830_MBX_VF_IN_FLIGHT_MSGS_AT_PF_CNT(vf_id));
+
+ wr32(hw, E830_MBX_VF_DEC_TRIG(vf_id), reg);
+}
+
+/**
+ * ice_mbx_vf_state_handler - Handle states of the overflow algorithm
+ * @hw: pointer to the HW struct
+ * @mbx_data: pointer to structure containing mailbox data
+ * @vf_info: mailbox tracking structure for the VF in question
+ * @report_malvf: boolean output to indicate whether VF should be reported
+ *
+ * The function serves as an entry point for the malicious VF
+ * detection algorithm by handling the different states and state
+ * transitions of the algorithm:
+ * New snapshot: This state is entered when creating a new static
+ * snapshot. The data from any previous mailbox snapshot is
+ * cleared and a new capture of the mailbox head and tail is
+ * logged. This will be the new static snapshot to detect
+ * asynchronous messages sent by VFs. On capturing the snapshot
+ * and depending on whether the number of pending messages in that
+ * snapshot exceeds the watermark value, the state machine enters
+ * traverse or detect states.
+ * Traverse: If the pending message count is below the watermark,
+ * iterate through the snapshot without taking any action on the VF.
+ * Detect: If the pending message count exceeds the watermark, traverse
+ * the static snapshot and look for a malicious VF.
+ */
+int
+ice_mbx_vf_state_handler(struct ice_hw *hw, struct ice_mbx_data *mbx_data,
+ struct ice_mbx_vf_info *vf_info, bool *report_malvf)
+{
+ struct ice_mbx_snapshot *snap = &hw->mbx_snapshot;
+ struct ice_mbx_snap_buffer_data *snap_buf;
+ struct ice_ctl_q_info *cq = &hw->mailboxq;
+ enum ice_mbx_snapshot_state new_state;
+ int status = 0;
+ bool is_malvf = false;
+
+ if (!report_malvf || !mbx_data || !vf_info)
+ return ICE_ERR_BAD_PTR;
+
+ *report_malvf = false;
+
+	/* When entering the mailbox state machine assume that the VF
+	 * is not malicious until detected.
+	 */
+	/* Check that the maximum number of messages allowed to be processed
+	 * while servicing the current interrupt is greater than the defined
+	 * AVF message threshold.
+	 */
+ if (mbx_data->max_num_msgs_mbx <= ICE_ASYNC_VF_MSG_THRESHOLD)
+ return ICE_ERR_INVAL_SIZE;
+
+	/* The watermark value should not be less than the threshold limit
+	 * set for the number of asynchronous messages a VF can send to the
+	 * mailbox, nor should it be greater than the maximum number of
+	 * messages in the mailbox serviced in the current interrupt.
+	 */
+ if (mbx_data->async_watermark_val < ICE_ASYNC_VF_MSG_THRESHOLD ||
+ mbx_data->async_watermark_val > mbx_data->max_num_msgs_mbx)
+ return ICE_ERR_PARAM;
+
+ new_state = ICE_MAL_VF_DETECT_STATE_INVALID;
+ snap_buf = &snap->mbx_buf;
+
+ switch (snap_buf->state) {
+ case ICE_MAL_VF_DETECT_STATE_NEW_SNAPSHOT:
+ /* Clear any previously held data in mailbox snapshot structure. */
+ ice_mbx_reset_snapshot(snap);
+
+ /* Collect the pending ARQ count, number of messages processed and
+ * the maximum number of messages allowed to be processed from the
+ * Mailbox for current interrupt.
+ */
+ snap_buf->num_pending_arq = mbx_data->num_pending_arq;
+ snap_buf->num_msg_proc = mbx_data->num_msg_proc;
+ snap_buf->max_num_msgs_mbx = mbx_data->max_num_msgs_mbx;
+
+ /* Capture a new static snapshot of the mailbox by logging the
+ * head and tail of snapshot and set num_iterations to the tail
+ * value to mark the start of the iteration through the snapshot.
+ */
+ snap_buf->head = ICE_RQ_DATA_MASK(cq->rq.next_to_clean +
+ mbx_data->num_pending_arq);
+ snap_buf->tail = ICE_RQ_DATA_MASK(cq->rq.next_to_clean - 1);
+ snap_buf->num_iterations = snap_buf->tail;
+
+ /* Pending ARQ messages returned by ice_clean_rq_elem
+ * is the difference between the head and tail of the
+ * mailbox queue. Comparing this value against the watermark
+ * helps to check if we potentially have malicious VFs.
+ */
+ if (snap_buf->num_pending_arq >=
+ mbx_data->async_watermark_val) {
+ new_state = ICE_MAL_VF_DETECT_STATE_DETECT;
+ status = ice_mbx_detect_malvf(hw, vf_info, &new_state, &is_malvf);
+ } else {
+ new_state = ICE_MAL_VF_DETECT_STATE_TRAVERSE;
+ ice_mbx_traverse(hw, &new_state);
+ }
+ break;
+
+ case ICE_MAL_VF_DETECT_STATE_TRAVERSE:
+ new_state = ICE_MAL_VF_DETECT_STATE_TRAVERSE;
+ ice_mbx_traverse(hw, &new_state);
+ break;
+
+ case ICE_MAL_VF_DETECT_STATE_DETECT:
+ new_state = ICE_MAL_VF_DETECT_STATE_DETECT;
+ status = ice_mbx_detect_malvf(hw, vf_info, &new_state, &is_malvf);
+ break;
+
+ default:
+ new_state = ICE_MAL_VF_DETECT_STATE_INVALID;
+ status = ICE_ERR_CFG;
+ }
+
+ snap_buf->state = new_state;
+
+	/* Only report a VF as malicious the first time we detect it */
+ if (is_malvf && !vf_info->malicious) {
+ vf_info->malicious = 1;
+ *report_malvf = true;
+ }
+
+ return status;
+}
+
+/**
+ * ice_mbx_clear_malvf - Clear VF mailbox info
+ * @vf_info: the mailbox tracking structure for a VF
+ *
+ * In case of a VF reset, this function shall be called to clear the VF's
+ * current mailbox tracking state.
+ */
+void ice_mbx_clear_malvf(struct ice_mbx_vf_info *vf_info)
+{
+ vf_info->malicious = 0;
+ vf_info->msg_count = 0;
+}
+
+/**
+ * ice_mbx_init_vf_info - Initialize a new VF mailbox tracking info
+ * @hw: pointer to the hardware structure
+ * @vf_info: the mailbox tracking info structure for a VF
+ *
+ * Initialize a VF mailbox tracking info structure and insert it into the
+ * snapshot list.
+ *
+ * If you remove the VF, you must also delete the associated VF info structure
+ * from the linked list.
+ */
+void ice_mbx_init_vf_info(struct ice_hw *hw, struct ice_mbx_vf_info *vf_info)
+{
+ struct ice_mbx_snapshot *snap = &hw->mbx_snapshot;
+
+ ice_mbx_clear_malvf(vf_info);
+ LIST_ADD(&vf_info->list_entry, &snap->mbx_vf);
+}
+
+/**
+ * ice_mbx_init_snapshot - Initialize mailbox snapshot data
+ * @hw: pointer to the hardware structure
+ *
+ * Clear the mailbox snapshot structure and initialize the VF mailbox list.
+ */
+void ice_mbx_init_snapshot(struct ice_hw *hw)
+{
+ struct ice_mbx_snapshot *snap = &hw->mbx_snapshot;
+
+ INIT_LIST_HEAD(&snap->mbx_vf);
+ ice_mbx_reset_snapshot(snap);
+}
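
A minimal sketch of the caller side of the overflow-detection algorithm described at the top of this file may help; the per-interrupt budget and watermark below are illustrative assumptions, and only the ice_mbx_* types and functions come from this commit.

/* Hypothetical PF-side handler: one state-handler call is made per
 * message read from the mailbox queue (step 3 of the algorithm).
 */
static void
example_service_mbx_msg(struct ice_hw *hw, struct ice_mbx_vf_info *vf_info,
    u16 num_pending_arq)
{
	struct ice_mbx_data mbx_data = {
		.num_msg_proc = 0,		/* messages handled so far */
		.num_pending_arq = num_pending_arq,
		.max_num_msgs_mbx = 1024,	/* per-interrupt budget */
		.async_watermark_val = 64,	/* >= ICE_ASYNC_VF_MSG_THRESHOLD */
	};
	bool report_malvf = false;

	if (ice_mbx_vf_state_handler(hw, &mbx_data, vf_info,
	    &report_malvf) == 0 && report_malvf) {
		/* Step 5 of the algorithm: tell the administrator. */
		ice_debug(hw, ICE_DBG_TRACE, "malicious VF detected\n");
	}
}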
diff --git a/sys/dev/ice/ice_vf_mbx.h b/sys/dev/ice/ice_vf_mbx.h
new file mode 100644
index 000000000000..3b185ac89c11
--- /dev/null
+++ b/sys/dev/ice/ice_vf_mbx.h
@@ -0,0 +1,67 @@
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* Copyright (c) 2025, Intel Corporation
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _ICE_VF_MBX_H_
+#define _ICE_VF_MBX_H_
+
+#include "ice_type.h"
+#include "ice_controlq.h"
+
+/* Defining the mailbox message threshold as 63 asynchronous
+ * pending messages. Normal VF functionality does not require
+ * sending more than 63 asynchronous pending messages.
+ *
+ * This threshold value should also be used to initialize the
+ * MBX_VF_IN_FLIGHT_MSGS_AT_PF_CNT register.
+ */
+#define ICE_ASYNC_VF_MSG_THRESHOLD 63
+
+int
+ice_aq_send_msg_to_pf(struct ice_hw *hw, enum virtchnl_ops v_opcode,
+ int v_retval, u8 *msg, u16 msglen,
+ struct ice_sq_cd *cd);
+int
+ice_aq_send_msg_to_vf(struct ice_hw *hw, u16 vfid, u32 v_opcode, u32 v_retval,
+ u8 *msg, u16 msglen, struct ice_sq_cd *cd);
+
+u32 ice_conv_link_speed_to_virtchnl(bool adv_link_support, u16 link_speed);
+
+void ice_e830_mbx_vf_dec_trig(struct ice_hw *hw,
+ struct ice_rq_event_info *event);
+void ice_mbx_vf_clear_cnt_e830(struct ice_hw *hw, u16 vf_id);
+int
+ice_mbx_vf_state_handler(struct ice_hw *hw, struct ice_mbx_data *mbx_data,
+ struct ice_mbx_vf_info *vf_info, bool *report_malvf);
+void ice_mbx_clear_malvf(struct ice_mbx_vf_info *vf_info);
+void ice_mbx_init_vf_info(struct ice_hw *hw, struct ice_mbx_vf_info *vf_info);
+void ice_mbx_init_snapshot(struct ice_hw *hw);
+#endif /* _ICE_VF_MBX_H_ */
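
As a usage note, a sketch of the expected lifecycle of the tracking structures declared above, assuming a hypothetical per-VF container; only the ice_mbx_* calls come from this header.

struct example_vf {
	struct ice_mbx_vf_info mbx_info;
};

static void
example_vf_created(struct ice_hw *hw, struct example_vf *vf)
{
	/* ice_mbx_init_snapshot(hw) has already run from ice_init_hw();
	 * link this VF's tracking structure into the snapshot list.
	 */
	ice_mbx_init_vf_info(hw, &vf->mbx_info);
}

static void
example_vf_reset(struct example_vf *vf)
{
	/* Forget any malicious-VF state so the fresh VF starts clean. */
	ice_mbx_clear_malvf(&vf->mbx_info);
}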
diff --git a/sys/dev/ice/if_ice_iflib.c b/sys/dev/ice/if_ice_iflib.c
index e60ee0f1c5c3..1469d2916465 100644
--- a/sys/dev/ice/if_ice_iflib.c
+++ b/sys/dev/ice/if_ice_iflib.c
@@ -42,6 +42,9 @@
#include "ice_drv_info.h"
#include "ice_switch.h"
#include "ice_sched.h"
+#ifdef PCI_IOV
+#include "ice_iov.h"
+#endif
#include <sys/module.h>
#include <sys/sockio.h>
@@ -85,6 +88,12 @@ static int ice_if_suspend(if_ctx_t ctx);
static int ice_if_resume(if_ctx_t ctx);
static bool ice_if_needs_restart(if_ctx_t ctx, enum iflib_restart_event event);
static void ice_init_link(struct ice_softc *sc);
+#ifdef PCI_IOV
+static int ice_if_iov_init(if_ctx_t ctx, uint16_t num_vfs, const nvlist_t *params);
+static void ice_if_iov_uninit(if_ctx_t ctx);
+static int ice_if_iov_vf_add(if_ctx_t ctx, uint16_t vfnum, const nvlist_t *params);
+static void ice_if_vflr_handle(if_ctx_t ctx);
+#endif
static int ice_setup_mirror_vsi(struct ice_mirr_if *mif);
static int ice_wire_mirror_intrs(struct ice_mirr_if *mif);
static void ice_free_irqvs_subif(struct ice_mirr_if *mif);
@@ -158,6 +167,11 @@ static device_method_t ice_methods[] = {
DEVMETHOD(device_shutdown, iflib_device_shutdown),
DEVMETHOD(device_suspend, iflib_device_suspend),
DEVMETHOD(device_resume, iflib_device_resume),
+#ifdef PCI_IOV
+ DEVMETHOD(pci_iov_init, iflib_device_iov_init),
+ DEVMETHOD(pci_iov_uninit, iflib_device_iov_uninit),
+ DEVMETHOD(pci_iov_add_vf, iflib_device_iov_add_vf),
+#endif
DEVMETHOD_END
};
@@ -198,6 +212,12 @@ static device_method_t ice_iflib_methods[] = {
DEVMETHOD(ifdi_suspend, ice_if_suspend),
DEVMETHOD(ifdi_resume, ice_if_resume),
DEVMETHOD(ifdi_needs_restart, ice_if_needs_restart),
+#ifdef PCI_IOV
+ DEVMETHOD(ifdi_iov_vf_add, ice_if_iov_vf_add),
+ DEVMETHOD(ifdi_iov_init, ice_if_iov_init),
+ DEVMETHOD(ifdi_iov_uninit, ice_if_iov_uninit),
+ DEVMETHOD(ifdi_vflr_handle, ice_if_vflr_handle),
+#endif
DEVMETHOD_END
};
@@ -733,6 +753,9 @@ ice_update_link_status(struct ice_softc *sc, bool update_media)
iflib_link_state_change(sc->ctx, LINK_STATE_DOWN, 0);
ice_rdma_link_change(sc, LINK_STATE_DOWN, 0);
}
+#ifdef PCI_IOV
+ ice_vc_notify_all_vfs_link_state(sc);
+#endif
update_media = true;
}
@@ -831,6 +854,14 @@ ice_if_attach_post(if_ctx_t ctx)
ice_add_device_sysctls(sc);
+#ifdef PCI_IOV
+ if (ice_is_bit_set(sc->feat_cap, ICE_FEATURE_SRIOV)) {
+ err = ice_iov_attach(sc);
+ if (err == ENOMEM)
+ return (err);
+ }
+#endif /* PCI_IOV */
+
/* Get DCBX/LLDP state and start DCBX agent */
ice_init_dcb_setup(sc);
@@ -953,6 +984,11 @@ ice_if_detach(if_ctx_t ctx)
ice_destroy_mirror_interface(sc);
ice_rdma_pf_detach(sc);
+#ifdef PCI_IOV
+ if (ice_is_bit_set(sc->feat_cap, ICE_FEATURE_SRIOV))
+ ice_iov_detach(sc);
+#endif /* PCI_IOV */
+
/* Free allocated media types */
ifmedia_removeall(sc->media);
@@ -1676,6 +1712,11 @@ ice_if_msix_intr_assign(if_ctx_t ctx, int msix)
/* For future interrupt assignments */
sc->last_rid = rid + sc->irdma_vectors;
+#ifdef PCI_IOV
+ /* Create soft IRQ for handling VF resets */
+ iflib_softirq_alloc_generic(ctx, NULL, IFLIB_INTR_IOV, sc, 0, "iov");
+#endif
+
return (0);
fail:
for (; i >= 0; i--, vector--)
@@ -2277,7 +2318,12 @@ ice_transition_recovery_mode(struct ice_softc *sc)
ice_rdma_pf_detach(sc);
ice_clear_bit(ICE_FEATURE_RDMA, sc->feat_cap);
+#ifdef PCI_IOV
+ if (ice_test_and_clear_bit(ICE_FEATURE_SRIOV, sc->feat_en))
+ ice_iov_detach(sc);
+#else
ice_clear_bit(ICE_FEATURE_SRIOV, sc->feat_en);
+#endif /* PCI_IOV */
ice_clear_bit(ICE_FEATURE_SRIOV, sc->feat_cap);
ice_vsi_del_txqs_ctx(vsi);
@@ -2325,7 +2371,12 @@ ice_transition_safe_mode(struct ice_softc *sc)
ice_rdma_pf_detach(sc);
ice_clear_bit(ICE_FEATURE_RDMA, sc->feat_cap);
+#ifdef PCI_IOV
+ if (ice_test_and_clear_bit(ICE_FEATURE_SRIOV, sc->feat_en))
+ ice_iov_detach(sc);
+#else
ice_clear_bit(ICE_FEATURE_SRIOV, sc->feat_en);
+#endif /* PCI_IOV */
ice_clear_bit(ICE_FEATURE_SRIOV, sc->feat_cap);
ice_clear_bit(ICE_FEATURE_RSS, sc->feat_cap);
@@ -2410,6 +2461,15 @@ ice_if_update_admin_status(if_ctx_t ctx)
/* Check and update link status */
ice_update_link_status(sc, false);
+#ifdef PCI_IOV
+ /*
+	 * Schedule the VFs' reset handler after global resets
+	 * and other events have been processed.
+ */
+ if (ice_testandclear_state(&sc->state, ICE_STATE_VFLR_PENDING))
+ iflib_iov_intr_deferred(ctx);
+#endif
+
/*
* If there are still messages to process, we need to reschedule
* ourselves. Otherwise, we can just re-enable the interrupt. We'll be
@@ -3349,6 +3409,78 @@ ice_init_link(struct ice_softc *sc)
}
+#ifdef PCI_IOV
+/**
+ * ice_if_iov_init - iov init handler for iflib
+ * @ctx: iflib context pointer
+ * @num_vfs: number of VFs to create
+ * @params: configuration parameters for the PF
+ *
+ * Configure the driver for SR-IOV mode. Used to set up things like memory
+ * before any VFs are created.
+ *
+ * @remark This is a wrapper for ice_iov_init
+ */
+static int
+ice_if_iov_init(if_ctx_t ctx, uint16_t num_vfs, const nvlist_t *params)
+{
+ struct ice_softc *sc = (struct ice_softc *)iflib_get_softc(ctx);
+
+ return ice_iov_init(sc, num_vfs, params);
+}
+
+/**
+ * ice_if_iov_uninit - iov uninit handler for iflib
+ * @ctx: iflib context pointer
+ *
+ * Destroys VFs and frees their memory and resources.
+ *
+ * @remark This is a wrapper for ice_iov_uninit
+ */
+static void
+ice_if_iov_uninit(if_ctx_t ctx)
+{
+ struct ice_softc *sc = (struct ice_softc *)iflib_get_softc(ctx);
+
+ ice_iov_uninit(sc);
+}
+
+/**
+ * ice_if_iov_vf_add - iov add vf handler for iflib
+ * @ctx: iflib context pointer
+ * @vfnum: index of VF to configure
+ * @params: configuration parameters for the VF
+ *
+ * Sets up the VF given by the vfnum index. This is called by the OS
+ * for each VF created by the PF driver after it is spawned.
+ *
+ * @remark This is a wrapper for ice_iov_vf_add
+ */
+static int
+ice_if_iov_vf_add(if_ctx_t ctx, uint16_t vfnum, const nvlist_t *params)
+{
+ struct ice_softc *sc = (struct ice_softc *)iflib_get_softc(ctx);
+
+ return ice_iov_add_vf(sc, vfnum, params);
+}
+
+/**
+ * ice_if_vflr_handle - iov VFLR handler
+ * @ctx: iflib context pointer
+ *
+ * Performs the necessary teardown or setup required for a VF after
+ * a VFLR is initiated.
+ *
+ * @remark This is a wrapper for ice_iov_handle_vflr
+ */
+static void
+ice_if_vflr_handle(if_ctx_t ctx)
+{
+ struct ice_softc *sc = (struct ice_softc *)iflib_get_softc(ctx);
+
+	ice_iov_handle_vflr(sc);
+}
+#endif /* PCI_IOV */
+
extern struct if_txrx ice_subif_txrx;
/**
diff --git a/sys/dev/ichiic/ig4_pci.c b/sys/dev/ichiic/ig4_pci.c
index 0195466150eb..3a49e220e335 100644
--- a/sys/dev/ichiic/ig4_pci.c
+++ b/sys/dev/ichiic/ig4_pci.c
@@ -186,6 +186,12 @@ static int ig4iic_pci_detach(device_t dev);
#define PCI_CHIP_METEORLAKE_M_I2C_3 0x7e518086
#define PCI_CHIP_METEORLAKE_M_I2C_4 0x7e7a8086
#define PCI_CHIP_METEORLAKE_M_I2C_5 0x7e7b8086
+#define PCI_CHIP_ARROWLAKE_U_I2C_0 0x77788086
+#define PCI_CHIP_ARROWLAKE_U_I2C_1 0x77798086
+#define PCI_CHIP_ARROWLAKE_U_I2C_2 0x777a8086
+#define PCI_CHIP_ARROWLAKE_U_I2C_3 0x777b8086
+#define PCI_CHIP_ARROWLAKE_U_I2C_4 0x77508086
+#define PCI_CHIP_ARROWLAKE_U_I2C_5 0x77518086
struct ig4iic_pci_device {
uint32_t devid;
@@ -316,6 +322,12 @@ static struct ig4iic_pci_device ig4iic_pci_devices[] = {
{ PCI_CHIP_METEORLAKE_M_I2C_3, "Intel Meteor Lake-M I2C Controller-3", IG4_TIGERLAKE},
{ PCI_CHIP_METEORLAKE_M_I2C_4, "Intel Meteor Lake-M I2C Controller-4", IG4_TIGERLAKE},
{ PCI_CHIP_METEORLAKE_M_I2C_5, "Intel Meteor Lake-M I2C Controller-5", IG4_TIGERLAKE},
+ { PCI_CHIP_ARROWLAKE_U_I2C_0, "Intel Arrow Lake-H/U I2C Controller-0", IG4_TIGERLAKE},
+ { PCI_CHIP_ARROWLAKE_U_I2C_1, "Intel Arrow Lake-H/U I2C Controller-1", IG4_TIGERLAKE},
+ { PCI_CHIP_ARROWLAKE_U_I2C_2, "Intel Arrow Lake-H/U I2C Controller-2", IG4_TIGERLAKE},
+ { PCI_CHIP_ARROWLAKE_U_I2C_3, "Intel Arrow Lake-H/U I2C Controller-3", IG4_TIGERLAKE},
+ { PCI_CHIP_ARROWLAKE_U_I2C_4, "Intel Arrow Lake-H/U I2C Controller-4", IG4_TIGERLAKE},
+ { PCI_CHIP_ARROWLAKE_U_I2C_5, "Intel Arrow Lake-H/U I2C Controller-5", IG4_TIGERLAKE},
};
static int
diff --git a/sys/dev/iicbus/gpio/tca64xx.c b/sys/dev/iicbus/gpio/tca64xx.c
index 3b3bca9936f1..cd011ae9be75 100644
--- a/sys/dev/iicbus/gpio/tca64xx.c
+++ b/sys/dev/iicbus/gpio/tca64xx.c
@@ -261,14 +261,13 @@ tca64xx_attach(device_t dev)
sc->addr = iicbus_get_addr(dev);
mtx_init(&sc->mtx, "tca64xx gpio", "gpio", MTX_DEF);
+ OF_device_register_xref(OF_xref_from_node(ofw_bus_get_node(dev)), dev);
sc->busdev = gpiobus_attach_bus(dev);
if (sc->busdev == NULL) {
device_printf(dev, "Could not create busdev child\n");
return (ENXIO);
}
- OF_device_register_xref(OF_xref_from_node(ofw_bus_get_node(dev)), dev);
-
#ifdef DEBUG
switch (sc->chip) {
case TCA6416_TYPE:
diff --git a/sys/dev/md/md.c b/sys/dev/md/md.c
index b842d4f2fd8e..29dc0c880e3a 100644
--- a/sys/dev/md/md.c
+++ b/sys/dev/md/md.c
@@ -11,9 +11,9 @@
*/
/*-
- * The following functions are based on the vn(4) driver: mdstart_swap(),
- * mdstart_vnode(), mdcreate_swap(), mdcreate_vnode() and mddestroy(),
- * and as such under the following copyright:
+ * The following functions are based on the historical vn(4) driver:
+ * mdstart_swap(), mdstart_vnode(), mdcreate_swap(), mdcreate_vnode()
+ * and mddestroy(), and as such under the following copyright:
*
* Copyright (c) 1988 University of Utah.
* Copyright (c) 1990, 1993
@@ -1559,19 +1559,26 @@ mddestroy(struct md_s *sc, struct thread *td)
mtx_destroy(&sc->queue_mtx);
switch (sc->type) {
case MD_VNODE:
- vn_lock(sc->s_vnode.vnode, LK_EXCLUSIVE | LK_RETRY);
- sc->s_vnode.vnode->v_vflag &= ~VV_MD;
- VOP_UNLOCK(sc->s_vnode.vnode);
- (void)vn_close(sc->s_vnode.vnode, sc->flags & MD_READONLY ?
- FREAD : (FREAD|FWRITE), sc->cred, td);
- kva_free(sc->s_vnode.kva, maxphys + PAGE_SIZE);
+ if (sc->s_vnode.vnode != NULL) {
+ vn_lock(sc->s_vnode.vnode, LK_EXCLUSIVE | LK_RETRY);
+ sc->s_vnode.vnode->v_vflag &= ~VV_MD;
+ VOP_UNLOCK(sc->s_vnode.vnode);
+ (void)vn_close(sc->s_vnode.vnode,
+ sc->flags & MD_READONLY ? FREAD : (FREAD|FWRITE),
+ sc->cred, td);
+ }
+ if (sc->s_vnode.kva != 0)
+ kva_free(sc->s_vnode.kva, maxphys + PAGE_SIZE);
break;
case MD_SWAP:
- vm_object_deallocate(sc->s_swap.object);
+ if (sc->s_swap.object != NULL)
+ vm_object_deallocate(sc->s_swap.object);
break;
case MD_MALLOC:
- destroy_indir(sc, sc->s_malloc.indir);
- uma_zdestroy(sc->s_malloc.uma);
+ if (sc->s_malloc.indir != NULL)
+ destroy_indir(sc, sc->s_malloc.indir);
+ if (sc->s_malloc.uma != NULL)
+ uma_zdestroy(sc->s_malloc.uma);
break;
case MD_PRELOAD:
case MD_NULL:
diff --git a/sys/dev/mem/memutil.c b/sys/dev/mem/memutil.c
index cf9714d6ec8f..20ce337df0ab 100644
--- a/sys/dev/mem/memutil.c
+++ b/sys/dev/mem/memutil.c
@@ -26,15 +26,14 @@
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#include <sys/param.h>
+#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/memrange.h>
-#include <sys/rwlock.h>
-#include <sys/systm.h>
+#include <sys/sx.h>
-static struct rwlock mr_lock;
+static struct sx mr_lock;
/*
* Implementation-neutral, kernel-callable functions for manipulating
@@ -46,7 +45,7 @@ mem_range_init(void)
if (mem_range_softc.mr_op == NULL)
return;
- rw_init(&mr_lock, "memrange");
+ sx_init(&mr_lock, "memrange");
mem_range_softc.mr_op->init(&mem_range_softc);
}
@@ -56,7 +55,7 @@ mem_range_destroy(void)
if (mem_range_softc.mr_op == NULL)
return;
- rw_destroy(&mr_lock);
+ sx_destroy(&mr_lock);
}
int
@@ -67,12 +66,12 @@ mem_range_attr_get(struct mem_range_desc *mrd, int *arg)
if (mem_range_softc.mr_op == NULL)
return (EOPNOTSUPP);
nd = *arg;
- rw_rlock(&mr_lock);
+ sx_slock(&mr_lock);
if (nd == 0)
*arg = mem_range_softc.mr_ndesc;
else
bcopy(mem_range_softc.mr_desc, mrd, nd * sizeof(*mrd));
- rw_runlock(&mr_lock);
+ sx_sunlock(&mr_lock);
return (0);
}
@@ -83,8 +82,8 @@ mem_range_attr_set(struct mem_range_desc *mrd, int *arg)
if (mem_range_softc.mr_op == NULL)
return (EOPNOTSUPP);
- rw_wlock(&mr_lock);
+ sx_xlock(&mr_lock);
ret = mem_range_softc.mr_op->set(&mem_range_softc, mrd, arg);
- rw_wunlock(&mr_lock);
+ sx_xunlock(&mr_lock);
return (ret);
}
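
The switch from rwlock(9) to sx(9) above matters because an sx lock may be held across a sleep; here is a minimal sketch of the resulting pattern, with illustrative names, on the assumption that the mem_range set operation can sleep.

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/sx.h>

static struct sx example_lock;

static void
example_init(void)
{
	sx_init(&example_lock, "example");
}

static void
example_update(void)
{
	sx_xlock(&example_lock);
	/* Unlike with an rwlock, sleeping here (allocating memory,
	 * waiting on other CPUs) is permitted while the lock is held.
	 */
	sx_xunlock(&example_lock);
}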
diff --git a/sys/dev/mgb/if_mgb.c b/sys/dev/mgb/if_mgb.c
index 1240d0f84415..409f34167df0 100644
--- a/sys/dev/mgb/if_mgb.c
+++ b/sys/dev/mgb/if_mgb.c
@@ -1435,7 +1435,7 @@ mgb_hw_teardown(struct mgb_softc *sc)
/* Stop MAC */
CSR_CLEAR_REG(sc, MGB_MAC_RX, MGB_MAC_ENBL);
- CSR_WRITE_REG(sc, MGB_MAC_TX, MGB_MAC_ENBL);
+ CSR_CLEAR_REG(sc, MGB_MAC_TX, MGB_MAC_ENBL);
if ((err = mgb_wait_for_bits(sc, MGB_MAC_RX, MGB_MAC_DSBL, 0)))
return (err);
if ((err = mgb_wait_for_bits(sc, MGB_MAC_TX, MGB_MAC_DSBL, 0)))
diff --git a/sys/dev/mlx5/mlx5_accel/ipsec.h b/sys/dev/mlx5/mlx5_accel/ipsec.h
index 361b9f72d873..c3f3a2372482 100644
--- a/sys/dev/mlx5/mlx5_accel/ipsec.h
+++ b/sys/dev/mlx5/mlx5_accel/ipsec.h
@@ -260,8 +260,8 @@ int mlx5e_accel_ipsec_fs_rx_tables_create(struct mlx5e_priv *priv);
void mlx5e_accel_ipsec_fs_rx_catchall_rules_destroy(struct mlx5e_priv *priv);
int mlx5e_accel_ipsec_fs_rx_catchall_rules(struct mlx5e_priv *priv);
int mlx5_accel_ipsec_rx_tag_add(if_t ifp, struct mlx5e_rq_mbuf *mr);
-void mlx5e_accel_ipsec_handle_rx_cqe(struct mbuf *mb, struct mlx5_cqe64 *cqe,
- struct mlx5e_rq_mbuf *mr);
+void mlx5e_accel_ipsec_handle_rx_cqe(if_t ifp, struct mbuf *mb,
+ struct mlx5_cqe64 *cqe, struct mlx5e_rq_mbuf *mr);
static inline int mlx5e_accel_ipsec_flow(struct mlx5_cqe64 *cqe)
{
@@ -269,12 +269,12 @@ static inline int mlx5e_accel_ipsec_flow(struct mlx5_cqe64 *cqe)
}
static inline void
-mlx5e_accel_ipsec_handle_rx(struct mbuf *mb, struct mlx5_cqe64 *cqe,
+mlx5e_accel_ipsec_handle_rx(if_t ifp, struct mbuf *mb, struct mlx5_cqe64 *cqe,
struct mlx5e_rq_mbuf *mr)
{
u32 ipsec_meta_data = be32_to_cpu(cqe->ft_metadata);
if (MLX5_IPSEC_METADATA_MARKER(ipsec_meta_data))
- mlx5e_accel_ipsec_handle_rx_cqe(mb, cqe, mr);
+ mlx5e_accel_ipsec_handle_rx_cqe(ifp, mb, cqe, mr);
}
#endif /* __MLX5_ACCEL_IPSEC_H__ */
diff --git a/sys/dev/mlx5/mlx5_accel/mlx5_ipsec_rxtx.c b/sys/dev/mlx5/mlx5_accel/mlx5_ipsec_rxtx.c
index 0883cfb2d510..5dccb8bc2b87 100644
--- a/sys/dev/mlx5/mlx5_accel/mlx5_ipsec_rxtx.c
+++ b/sys/dev/mlx5/mlx5_accel/mlx5_ipsec_rxtx.c
@@ -24,11 +24,14 @@
*
*/
+#include "opt_ipsec.h"
+
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netipsec/keydb.h>
#include <netipsec/ipsec_offload.h>
+#include <netipsec/xform.h>
#include <dev/mlx5/qp.h>
#include <dev/mlx5/mlx5_en/en.h>
#include <dev/mlx5/mlx5_accel/ipsec.h>
@@ -48,7 +51,8 @@ mlx5_accel_ipsec_rx_tag_add(if_t ifp, struct mlx5e_rq_mbuf *mr)
return (0);
mtag = (struct ipsec_accel_in_tag *)m_tag_get(
- PACKET_TAG_IPSEC_ACCEL_IN, sizeof(*mtag), M_NOWAIT);
+ PACKET_TAG_IPSEC_ACCEL_IN, sizeof(struct ipsec_accel_in_tag) -
+ __offsetof(struct ipsec_accel_in_tag, xh), M_NOWAIT);
if (mtag == NULL)
return (-ENOMEM);
mr->ipsec_mtag = mtag;
@@ -56,8 +60,8 @@ mlx5_accel_ipsec_rx_tag_add(if_t ifp, struct mlx5e_rq_mbuf *mr)
}
void
-mlx5e_accel_ipsec_handle_rx_cqe(struct mbuf *mb, struct mlx5_cqe64 *cqe,
- struct mlx5e_rq_mbuf *mr)
+mlx5e_accel_ipsec_handle_rx_cqe(if_t ifp, struct mbuf *mb,
+ struct mlx5_cqe64 *cqe, struct mlx5e_rq_mbuf *mr)
{
struct ipsec_accel_in_tag *mtag;
u32 drv_spi;
@@ -65,10 +69,12 @@ mlx5e_accel_ipsec_handle_rx_cqe(struct mbuf *mb, struct mlx5_cqe64 *cqe,
drv_spi = MLX5_IPSEC_METADATA_HANDLE(be32_to_cpu(cqe->ft_metadata));
mtag = mr->ipsec_mtag;
WARN_ON(mtag == NULL);
- mr->ipsec_mtag = NULL;
if (mtag != NULL) {
mtag->drv_spi = drv_spi;
- m_tag_prepend(mb, &mtag->tag);
+ if (ipsec_accel_fill_xh(ifp, drv_spi, &mtag->xh)) {
+ m_tag_prepend(mb, &mtag->tag);
+ mr->ipsec_mtag = NULL;
+ }
}
}
diff --git a/sys/dev/mlx5/mlx5_en/mlx5_en_hw_tls_rx.c b/sys/dev/mlx5/mlx5_en/mlx5_en_hw_tls_rx.c
index 8b8f2e570245..89d2010656c5 100644
--- a/sys/dev/mlx5/mlx5_en/mlx5_en_hw_tls_rx.c
+++ b/sys/dev/mlx5/mlx5_en/mlx5_en_hw_tls_rx.c
@@ -42,13 +42,30 @@
static if_snd_tag_free_t mlx5e_tls_rx_snd_tag_free;
static if_snd_tag_modify_t mlx5e_tls_rx_snd_tag_modify;
+static if_snd_tag_status_str_t mlx5e_tls_rx_snd_tag_status_str;
static const struct if_snd_tag_sw mlx5e_tls_rx_snd_tag_sw = {
.snd_tag_modify = mlx5e_tls_rx_snd_tag_modify,
.snd_tag_free = mlx5e_tls_rx_snd_tag_free,
+ .snd_tag_status_str = mlx5e_tls_rx_snd_tag_status_str,
.type = IF_SND_TAG_TYPE_TLS_RX
};
+static const char *mlx5e_tls_rx_progress_params_auth_state_str[] = {
+ [MLX5E_TLS_RX_PROGRESS_PARAMS_AUTH_STATE_NO_OFFLOAD] = "no_offload",
+ [MLX5E_TLS_RX_PROGRESS_PARAMS_AUTH_STATE_OFFLOAD] = "offload",
+ [MLX5E_TLS_RX_PROGRESS_PARAMS_AUTH_STATE_AUTHENTICATION] =
+ "authentication",
+};
+
+static const char *mlx5e_tls_rx_progress_params_record_tracker_state_str[] = {
+ [MLX5E_TLS_RX_PROGRESS_PARAMS_RECORD_TRACKER_STATE_START] = "start",
+ [MLX5E_TLS_RX_PROGRESS_PARAMS_RECORD_TRACKER_STATE_TRACKING] =
+ "tracking",
+ [MLX5E_TLS_RX_PROGRESS_PARAMS_RECORD_TRACKER_STATE_SEARCHING] =
+ "searching",
+};
+
MALLOC_DEFINE(M_MLX5E_TLS_RX, "MLX5E_TLS_RX", "MLX5 ethernet HW TLS RX");
/* software TLS RX context */
@@ -250,7 +267,8 @@ mlx5e_tls_rx_send_progress_parameters_sync(struct mlx5e_iq *iq,
mtx_unlock(&iq->lock);
while (1) {
- if (wait_for_completion_timeout(&ptag->progress_complete, hz) != 0)
+ if (wait_for_completion_timeout(&ptag->progress_complete,
+ msecs_to_jiffies(1000)) != 0)
break;
priv = container_of(iq, struct mlx5e_channel, iq)->priv;
if (priv->mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR ||
@@ -331,7 +349,8 @@ done:
* Zero is returned upon success, else some error happened.
*/
static int
-mlx5e_tls_rx_receive_progress_parameters(struct mlx5e_iq *iq, struct mlx5e_tls_rx_tag *ptag)
+mlx5e_tls_rx_receive_progress_parameters(struct mlx5e_iq *iq,
+ struct mlx5e_tls_rx_tag *ptag, mlx5e_iq_callback_t *cb)
{
struct mlx5e_get_tls_progress_params_wqe *wqe;
const u32 ds_cnt = DIV_ROUND_UP(sizeof(*wqe), MLX5_SEND_WQE_DS);
@@ -367,7 +386,7 @@ mlx5e_tls_rx_receive_progress_parameters(struct mlx5e_iq *iq, struct mlx5e_tls_r
memcpy(iq->doorbell.d32, &wqe->ctrl, sizeof(iq->doorbell.d32));
iq->data[pi].num_wqebbs = DIV_ROUND_UP(ds_cnt, MLX5_SEND_WQEBB_NUM_DS);
- iq->data[pi].callback = &mlx5e_tls_rx_receive_progress_parameters_cb;
+ iq->data[pi].callback = cb;
iq->data[pi].arg = ptag;
m_snd_tag_ref(&ptag->tag);
@@ -640,7 +659,8 @@ mlx5e_tls_rx_set_params(void *ctx, struct inpcb *inp, const struct tls_session_p
return (EINVAL);
MLX5_SET64(sw_tls_rx_cntx, ctx, param.initial_record_number, tls_sn_he);
- MLX5_SET(sw_tls_rx_cntx, ctx, param.resync_tcp_sn, tcp_sn_he);
+ MLX5_SET(sw_tls_rx_cntx, ctx, param.resync_tcp_sn, 0);
+ MLX5_SET(sw_tls_rx_cntx, ctx, progress.next_record_tcp_sn, tcp_sn_he);
return (0);
}
@@ -819,6 +839,7 @@ mlx5e_tls_rx_snd_tag_alloc(if_t ifp,
}
ptag->flow_rule = flow_rule;
+ init_completion(&ptag->progress_complete);
return (0);
@@ -968,7 +989,8 @@ mlx5e_tls_rx_snd_tag_modify(struct m_snd_tag *pmt, union if_snd_tag_modify_param
params->tls_rx.tls_rec_length,
params->tls_rx.tls_seq_number) &&
ptag->tcp_resync_pending == 0) {
- err = mlx5e_tls_rx_receive_progress_parameters(iq, ptag);
+ err = mlx5e_tls_rx_receive_progress_parameters(iq, ptag,
+ &mlx5e_tls_rx_receive_progress_parameters_cb);
if (err != 0) {
MLX5E_TLS_RX_STAT_INC(ptag, rx_resync_err, 1);
} else {
@@ -1001,6 +1023,74 @@ mlx5e_tls_rx_snd_tag_free(struct m_snd_tag *pmt)
queue_work(priv->tls_rx.wq, &ptag->work);
}
+static void
+mlx5e_tls_rx_str_status_cb(void *arg)
+{
+ struct mlx5e_tls_rx_tag *ptag;
+
+ ptag = (struct mlx5e_tls_rx_tag *)arg;
+ complete_all(&ptag->progress_complete);
+ m_snd_tag_rele(&ptag->tag);
+}
+
+static int
+mlx5e_tls_rx_snd_tag_status_str(struct m_snd_tag *pmt, char *buf, size_t *sz)
+{
+ int err, out_size;
+ struct mlx5e_iq *iq;
+ void *buffer;
+ uint32_t tracker_state_val;
+ uint32_t auth_state_val;
+ struct mlx5e_priv *priv;
+ struct mlx5e_tls_rx_tag *ptag =
+ container_of(pmt, struct mlx5e_tls_rx_tag, tag);
+
+ if (buf == NULL)
+ return (0);
+
+ MLX5E_TLS_RX_TAG_LOCK(ptag);
+ priv = container_of(ptag->tls_rx, struct mlx5e_priv, tls_rx);
+ iq = mlx5e_tls_rx_get_iq(priv, ptag->flowid, ptag->flowtype);
+ reinit_completion(&ptag->progress_complete);
+ err = mlx5e_tls_rx_receive_progress_parameters(iq, ptag,
+ &mlx5e_tls_rx_str_status_cb);
+ MLX5E_TLS_RX_TAG_UNLOCK(ptag);
+ if (err != 0)
+ return (err);
+
+ for (;;) {
+ if (wait_for_completion_timeout(&ptag->progress_complete,
+ msecs_to_jiffies(1000)) != 0)
+ break;
+ if (priv->mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR ||
+ pci_channel_offline(priv->mdev->pdev) != 0)
+ return (ENXIO);
+ }
+ buffer = mlx5e_tls_rx_get_progress_buffer(ptag);
+ tracker_state_val = MLX5_GET(tls_progress_params, buffer,
+ record_tracker_state);
+ auth_state_val = MLX5_GET(tls_progress_params, buffer, auth_state);
+
+ /* Validate tracker state value is in range */
+ if (tracker_state_val >
+ MLX5E_TLS_RX_PROGRESS_PARAMS_RECORD_TRACKER_STATE_SEARCHING)
+ return (EINVAL);
+
+ /* Validate auth state value is in range */
+ if (auth_state_val >
+ MLX5E_TLS_RX_PROGRESS_PARAMS_AUTH_STATE_AUTHENTICATION)
+ return (EINVAL);
+
+ out_size = snprintf(buf, *sz, "tracker_state: %s, auth_state: %s",
+ mlx5e_tls_rx_progress_params_record_tracker_state_str[
+ tracker_state_val],
+ mlx5e_tls_rx_progress_params_auth_state_str[auth_state_val]);
+
+ if (out_size <= *sz)
+ *sz = out_size;
+ return (0);
+}
+
#else
int
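
The new status callback relies on the linuxkpi completion pattern: re-arm with reinit_completion(), submit the query, then wait in one-second slices while checking for device failure. A reduced sketch with illustrative names follows; the predicate callback is an assumption.

#include <sys/param.h>
#include <linux/completion.h>
#include <linux/jiffies.h>

/* Returns 0 once *c completes, or ENXIO if the device dies first. */
static int
example_wait_or_fail(struct completion *c, bool (*dead)(void *), void *arg)
{
	for (;;) {
		if (wait_for_completion_timeout(c,
		    msecs_to_jiffies(1000)) != 0)
			return (0);
		if (dead(arg))
			return (ENXIO);
	}
}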
diff --git a/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c b/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c
index 6b53db6fea23..eb569488631a 100644
--- a/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c
+++ b/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c
@@ -467,7 +467,7 @@ mlx5e_build_rx_mbuf(struct mlx5_cqe64 *cqe, struct mlx5e_rq *rq,
break;
}
- mlx5e_accel_ipsec_handle_rx(mb, cqe, mr);
+ mlx5e_accel_ipsec_handle_rx(ifp, mb, cqe, mr);
}
static inline void
diff --git a/sys/dev/nvme/nvme_ctrlr.c b/sys/dev/nvme/nvme_ctrlr.c
index 73a7cee4aad0..fd7f00ced14b 100644
--- a/sys/dev/nvme/nvme_ctrlr.c
+++ b/sys/dev/nvme/nvme_ctrlr.c
@@ -48,7 +48,7 @@
#define B4_CHK_RDY_DELAY_MS 2300 /* work around controller bug */
static void nvme_ctrlr_construct_and_submit_aer(struct nvme_controller *ctrlr,
- struct nvme_async_event_request *aer);
+ struct nvme_async_event_request *aer);
static void
nvme_ctrlr_barrier(struct nvme_controller *ctrlr, int flags)
@@ -680,96 +680,6 @@ nvme_ctrlr_log_critical_warnings(struct nvme_controller *ctrlr,
}
static void
-nvme_ctrlr_async_event_log_page_cb(void *arg, const struct nvme_completion *cpl)
-{
- struct nvme_async_event_request *aer = arg;
- struct nvme_health_information_page *health_info;
- struct nvme_ns_list *nsl;
- struct nvme_error_information_entry *err;
- int i;
-
- /*
- * If the log page fetch for some reason completed with an error,
- * don't pass log page data to the consumers. In practice, this case
- * should never happen.
- */
- if (nvme_completion_is_error(cpl))
- nvme_notify_async_consumers(aer->ctrlr, &aer->cpl,
- aer->log_page_id, NULL, 0);
- else {
- /* Convert data to host endian */
- switch (aer->log_page_id) {
- case NVME_LOG_ERROR:
- err = (struct nvme_error_information_entry *)aer->log_page_buffer;
- for (i = 0; i < (aer->ctrlr->cdata.elpe + 1); i++)
- nvme_error_information_entry_swapbytes(err++);
- break;
- case NVME_LOG_HEALTH_INFORMATION:
- nvme_health_information_page_swapbytes(
- (struct nvme_health_information_page *)aer->log_page_buffer);
- break;
- case NVME_LOG_CHANGED_NAMESPACE:
- nvme_ns_list_swapbytes(
- (struct nvme_ns_list *)aer->log_page_buffer);
- break;
- case NVME_LOG_COMMAND_EFFECT:
- nvme_command_effects_page_swapbytes(
- (struct nvme_command_effects_page *)aer->log_page_buffer);
- break;
- case NVME_LOG_RES_NOTIFICATION:
- nvme_res_notification_page_swapbytes(
- (struct nvme_res_notification_page *)aer->log_page_buffer);
- break;
- case NVME_LOG_SANITIZE_STATUS:
- nvme_sanitize_status_page_swapbytes(
- (struct nvme_sanitize_status_page *)aer->log_page_buffer);
- break;
- default:
- break;
- }
-
- if (aer->log_page_id == NVME_LOG_HEALTH_INFORMATION) {
- health_info = (struct nvme_health_information_page *)
- aer->log_page_buffer;
- nvme_ctrlr_log_critical_warnings(aer->ctrlr,
- health_info->critical_warning);
- /*
- * Critical warnings reported through the
- * SMART/health log page are persistent, so
- * clear the associated bits in the async event
- * config so that we do not receive repeated
- * notifications for the same event.
- */
- aer->ctrlr->async_event_config &=
- ~health_info->critical_warning;
- nvme_ctrlr_cmd_set_async_event_config(aer->ctrlr,
- aer->ctrlr->async_event_config, NULL, NULL);
- } else if (aer->log_page_id == NVME_LOG_CHANGED_NAMESPACE &&
- !nvme_use_nvd) {
- nsl = (struct nvme_ns_list *)aer->log_page_buffer;
- for (i = 0; i < nitems(nsl->ns) && nsl->ns[i] != 0; i++) {
- if (nsl->ns[i] > NVME_MAX_NAMESPACES)
- break;
- nvme_notify_ns(aer->ctrlr, nsl->ns[i]);
- }
- }
-
- /*
- * Pass the cpl data from the original async event completion,
- * not the log page fetch.
- */
- nvme_notify_async_consumers(aer->ctrlr, &aer->cpl,
- aer->log_page_id, aer->log_page_buffer, aer->log_page_size);
- }
-
- /*
- * Repost another asynchronous event request to replace the one
- * that just completed.
- */
- nvme_ctrlr_construct_and_submit_aer(aer->ctrlr, aer);
-}
-
-static void
nvme_ctrlr_async_event_cb(void *arg, const struct nvme_completion *cpl)
{
struct nvme_async_event_request *aer = arg;
@@ -784,33 +694,18 @@ nvme_ctrlr_async_event_cb(void *arg, const struct nvme_completion *cpl)
return;
}
- /* Associated log page is in bits 23:16 of completion entry dw0. */
+ /*
+	 * Save the completion status; the associated log page ID is in bits
+	 * 23:16 of completion entry dw0. Print a message and queue it for
+	 * further processing.
+ */
+ memcpy(&aer->cpl, cpl, sizeof(*cpl));
aer->log_page_id = NVMEV(NVME_ASYNC_EVENT_LOG_PAGE_ID, cpl->cdw0);
-
nvme_printf(aer->ctrlr, "async event occurred (type 0x%x, info 0x%02x,"
" page 0x%02x)\n", NVMEV(NVME_ASYNC_EVENT_TYPE, cpl->cdw0),
NVMEV(NVME_ASYNC_EVENT_INFO, cpl->cdw0),
aer->log_page_id);
-
- if (is_log_page_id_valid(aer->log_page_id)) {
- aer->log_page_size = nvme_ctrlr_get_log_page_size(aer->ctrlr,
- aer->log_page_id);
- memcpy(&aer->cpl, cpl, sizeof(*cpl));
- nvme_ctrlr_cmd_get_log_page(aer->ctrlr, aer->log_page_id,
- NVME_GLOBAL_NAMESPACE_TAG, aer->log_page_buffer,
- aer->log_page_size, nvme_ctrlr_async_event_log_page_cb,
- aer);
- /* Wait to notify consumers until after log page is fetched. */
- } else {
- nvme_notify_async_consumers(aer->ctrlr, cpl, aer->log_page_id,
- NULL, 0);
-
- /*
- * Repost another asynchronous event request to replace the one
- * that just completed.
- */
- nvme_ctrlr_construct_and_submit_aer(aer->ctrlr, aer);
- }
+ taskqueue_enqueue(aer->ctrlr->taskqueue, &aer->task);
}
static void
@@ -819,15 +714,21 @@ nvme_ctrlr_construct_and_submit_aer(struct nvme_controller *ctrlr,
{
struct nvme_request *req;
- aer->ctrlr = ctrlr;
/*
- * XXX-MJ this should be M_WAITOK but we might be in a non-sleepable
- * callback context. AER completions should be handled on a dedicated
- * thread.
+ * We're racing the reset thread, so let that process submit this again.
+ * XXX does this really solve that race? And is that race even possible
+	 * since we only reset when we've not heard from the card in a long
+ * time. Why would we get an AER in the middle of that just before we
+ * kick off the reset?
*/
- req = nvme_allocate_request_null(M_NOWAIT, nvme_ctrlr_async_event_cb,
+ if (ctrlr->is_resetting)
+ return;
+
+ aer->ctrlr = ctrlr;
+ req = nvme_allocate_request_null(M_WAITOK, nvme_ctrlr_async_event_cb,
aer);
aer->req = req;
+ aer->log_page_id = 0; /* Not a valid page */
/*
* Disable timeout here, since asynchronous event requests should by
@@ -1203,6 +1104,140 @@ nvme_ctrlr_reset_task(void *arg, int pending)
atomic_cmpset_32(&ctrlr->is_resetting, 1, 0);
}
+static void
+nvme_ctrlr_aer_done(void *arg, const struct nvme_completion *cpl)
+{
+ struct nvme_async_event_request *aer = arg;
+
+ mtx_lock(&aer->mtx);
+ if (nvme_completion_is_error(cpl))
+ aer->log_page_size = (uint32_t)-1;
+ else
+ aer->log_page_size = nvme_ctrlr_get_log_page_size(
+ aer->ctrlr, aer->log_page_id);
+ wakeup(aer);
+ mtx_unlock(&aer->mtx);
+}
+
+static void
+nvme_ctrlr_aer_task(void *arg, int pending)
+{
+ struct nvme_async_event_request *aer = arg;
+ struct nvme_controller *ctrlr = aer->ctrlr;
+ uint32_t len;
+
+ /*
+ * We're resetting, so just punt.
+ */
+ if (ctrlr->is_resetting)
+ return;
+
+ if (!is_log_page_id_valid(aer->log_page_id)) {
+		/*
+		 * No associated log page to fetch: notify consumers and
+		 * repost another asynchronous event request to replace the
+		 * one that just completed.
+		 */
+ nvme_notify_async_consumers(ctrlr, &aer->cpl, aer->log_page_id,
+ NULL, 0);
+ nvme_ctrlr_construct_and_submit_aer(ctrlr, aer);
+ goto out;
+ }
+
+ aer->log_page_size = 0;
+ len = nvme_ctrlr_get_log_page_size(aer->ctrlr, aer->log_page_id);
+ nvme_ctrlr_cmd_get_log_page(aer->ctrlr, aer->log_page_id,
+ NVME_GLOBAL_NAMESPACE_TAG, aer->log_page_buffer, len,
+ nvme_ctrlr_aer_done, aer);
+ mtx_lock(&aer->mtx);
+ while (aer->log_page_size == 0)
+ mtx_sleep(aer, &aer->mtx, PRIBIO, "nvme_pt", 0);
+ mtx_unlock(&aer->mtx);
+
+ if (aer->log_page_size != (uint32_t)-1) {
+ /*
+ * If the log page fetch for some reason completed with an
+ * error, don't pass log page data to the consumers. In
+ * practice, this case should never happen.
+ */
+ nvme_notify_async_consumers(aer->ctrlr, &aer->cpl,
+ aer->log_page_id, NULL, 0);
+ goto out;
+ }
+
+ /* Convert data to host endian */
+ switch (aer->log_page_id) {
+ case NVME_LOG_ERROR: {
+ struct nvme_error_information_entry *err =
+ (struct nvme_error_information_entry *)aer->log_page_buffer;
+ for (int i = 0; i < (aer->ctrlr->cdata.elpe + 1); i++)
+ nvme_error_information_entry_swapbytes(err++);
+ break;
+ }
+ case NVME_LOG_HEALTH_INFORMATION:
+ nvme_health_information_page_swapbytes(
+ (struct nvme_health_information_page *)aer->log_page_buffer);
+ break;
+ case NVME_LOG_CHANGED_NAMESPACE:
+ nvme_ns_list_swapbytes(
+ (struct nvme_ns_list *)aer->log_page_buffer);
+ break;
+ case NVME_LOG_COMMAND_EFFECT:
+ nvme_command_effects_page_swapbytes(
+ (struct nvme_command_effects_page *)aer->log_page_buffer);
+ break;
+ case NVME_LOG_RES_NOTIFICATION:
+ nvme_res_notification_page_swapbytes(
+ (struct nvme_res_notification_page *)aer->log_page_buffer);
+ break;
+ case NVME_LOG_SANITIZE_STATUS:
+ nvme_sanitize_status_page_swapbytes(
+ (struct nvme_sanitize_status_page *)aer->log_page_buffer);
+ break;
+ default:
+ break;
+ }
+
+ if (aer->log_page_id == NVME_LOG_HEALTH_INFORMATION) {
+ struct nvme_health_information_page *health_info =
+ (struct nvme_health_information_page *)aer->log_page_buffer;
+
+ /*
+ * Critical warnings reported through the SMART/health log page
+ * are persistent, so clear the associated bits in the async
+ * event config so that we do not receive repeated notifications
+ * for the same event.
+ */
+ nvme_ctrlr_log_critical_warnings(aer->ctrlr,
+ health_info->critical_warning);
+ aer->ctrlr->async_event_config &=
+ ~health_info->critical_warning;
+ nvme_ctrlr_cmd_set_async_event_config(aer->ctrlr,
+ aer->ctrlr->async_event_config, NULL, NULL);
+ } else if (aer->log_page_id == NVME_LOG_CHANGED_NAMESPACE) {
+ struct nvme_ns_list *nsl =
+ (struct nvme_ns_list *)aer->log_page_buffer;
+ for (int i = 0; i < nitems(nsl->ns) && nsl->ns[i] != 0; i++) {
+ if (nsl->ns[i] > NVME_MAX_NAMESPACES)
+ break;
+ nvme_notify_ns(aer->ctrlr, nsl->ns[i]);
+ }
+ }
+
+ /*
+ * Pass the cpl data from the original async event completion, not the
+ * log page fetch.
+ */
+ nvme_notify_async_consumers(aer->ctrlr, &aer->cpl,
+ aer->log_page_id, aer->log_page_buffer, aer->log_page_size);
+
+ /*
+ * Repost another asynchronous event request to replace the one
+ * that just completed.
+ */
+out:
+ nvme_ctrlr_construct_and_submit_aer(ctrlr, aer);
+}
+
/*
* Poll all the queues enabled on the device for completion.
*/
@@ -1574,13 +1609,8 @@ nvme_ctrlr_construct(struct nvme_controller *ctrlr, device_t dev)
/*
* Create 2 threads for the taskqueue. The reset thread will block when
* it detects that the controller has failed until all I/O has been
- * failed up the stack. The fail_req task needs to be able to run in
- * this case to finish the request failure for some cases.
- *
- * We could partially solve this race by draining the failed requeust
- * queue before proceding to free the sim, though nothing would stop
- * new I/O from coming in after we do that drain, but before we reach
- * cam_sim_free, so this big hammer is used instead.
+ * failed up the stack. The second thread is used for AER events, which
+ * can block, but only briefly for memory and log page fetching.
*/
ctrlr->taskqueue = taskqueue_create("nvme_taskq", M_WAITOK,
taskqueue_thread_enqueue, &ctrlr->taskqueue);
@@ -1590,7 +1620,12 @@ nvme_ctrlr_construct(struct nvme_controller *ctrlr, device_t dev)
ctrlr->is_initialized = false;
ctrlr->notification_sent = 0;
TASK_INIT(&ctrlr->reset_task, 0, nvme_ctrlr_reset_task, ctrlr);
- STAILQ_INIT(&ctrlr->fail_req);
+ for (int i = 0; i < NVME_MAX_ASYNC_EVENTS; i++) {
+ struct nvme_async_event_request *aer = &ctrlr->aer[i];
+
+ TASK_INIT(&aer->task, 0, nvme_ctrlr_aer_task, aer);
+ mtx_init(&aer->mtx, "AER mutex", NULL, MTX_DEF);
+ }
ctrlr->is_failed = false;
make_dev_args_init(&md_args);
@@ -1678,8 +1713,14 @@ nvme_ctrlr_destruct(struct nvme_controller *ctrlr, device_t dev)
}
noadminq:
- if (ctrlr->taskqueue)
+ if (ctrlr->taskqueue) {
taskqueue_free(ctrlr->taskqueue);
+ for (int i = 0; i < NVME_MAX_ASYNC_EVENTS; i++) {
+ struct nvme_async_event_request *aer = &ctrlr->aer[i];
+
+ mtx_destroy(&aer->mtx);
+ }
+ }
if (ctrlr->tag)
bus_teardown_intr(ctrlr->dev, ctrlr->res, ctrlr->tag);
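
The refactored AER path above replaces the old completion-callback chain with a task that is allowed to block: nvme_ctrlr_aer_task() issues the log-page read and sleeps on the aer until nvme_ctrlr_aer_done() stores a result and calls wakeup(). A generic sketch of that handshake, with illustrative names:

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/mutex.h>

struct example_waiter {
	struct mtx	mtx;
	uint32_t	result;		/* 0 while the request is in flight */
};

/* Runs from the completion (callback) context. */
static void
example_done(struct example_waiter *w, uint32_t result)
{
	mtx_lock(&w->mtx);
	w->result = result;	/* nonzero; (uint32_t)-1 signals an error */
	wakeup(w);
	mtx_unlock(&w->mtx);
}

/* Runs from the taskqueue thread, which may sleep. */
static void
example_wait(struct example_waiter *w)
{
	mtx_lock(&w->mtx);
	while (w->result == 0)
		mtx_sleep(w, &w->mtx, PRIBIO, "exwait", 0);
	mtx_unlock(&w->mtx);
}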
diff --git a/sys/dev/nvme/nvme_private.h b/sys/dev/nvme/nvme_private.h
index 949e69ec9290..36f00fedc48e 100644
--- a/sys/dev/nvme/nvme_private.h
+++ b/sys/dev/nvme/nvme_private.h
@@ -123,6 +123,8 @@ struct nvme_request {
struct nvme_async_event_request {
struct nvme_controller *ctrlr;
struct nvme_request *req;
+ struct task task;
+ struct mtx mtx;
struct nvme_completion cpl;
uint32_t log_page_id;
uint32_t log_page_size;
@@ -307,8 +309,6 @@ struct nvme_controller {
bool isr_warned;
bool is_initialized;
- STAILQ_HEAD(, nvme_request) fail_req;
-
/* Host Memory Buffer */
int hmb_nchunks;
size_t hmb_chunk;
diff --git a/sys/dev/nvmf/host/nvmf.c b/sys/dev/nvmf/host/nvmf.c
index dbdd4568bdf1..1ac0d142443b 100644
--- a/sys/dev/nvmf/host/nvmf.c
+++ b/sys/dev/nvmf/host/nvmf.c
@@ -27,6 +27,7 @@
#include <dev/nvmf/host/nvmf_var.h>
static struct cdevsw nvmf_cdevsw;
+static struct taskqueue *nvmf_tq;
bool nvmf_fail_disconnect = false;
SYSCTL_BOOL(_kern_nvmf, OID_AUTO, fail_on_disconnection, CTLFLAG_RWTUN,
@@ -34,7 +35,10 @@ SYSCTL_BOOL(_kern_nvmf, OID_AUTO, fail_on_disconnection, CTLFLAG_RWTUN,
MALLOC_DEFINE(M_NVMF, "nvmf", "NVMe over Fabrics host");
+static void nvmf_controller_loss_task(void *arg, int pending);
static void nvmf_disconnect_task(void *arg, int pending);
+static void nvmf_request_reconnect(struct nvmf_softc *sc);
+static void nvmf_request_reconnect_task(void *arg, int pending);
static void nvmf_shutdown_pre_sync(void *arg, int howto);
static void nvmf_shutdown_post_sync(void *arg, int howto);
@@ -294,6 +298,9 @@ nvmf_establish_connection(struct nvmf_softc *sc, nvlist_t *nvl)
admin = nvlist_get_nvlist(nvl, "admin");
io = nvlist_get_nvlist_array(nvl, "io", &num_io_queues);
kato = dnvlist_get_number(nvl, "kato", 0);
+ sc->reconnect_delay = dnvlist_get_number(nvl, "reconnect_delay", 0);
+ sc->controller_loss_timeout = dnvlist_get_number(nvl,
+ "controller_loss_timeout", 0);
/* Setup the admin queue. */
sc->admin = nvmf_init_qp(sc, trtype, admin, "admin queue", 0);
@@ -504,6 +511,10 @@ nvmf_attach(device_t dev)
callout_init(&sc->ka_tx_timer, 1);
sx_init(&sc->connection_lock, "nvmf connection");
TASK_INIT(&sc->disconnect_task, 0, nvmf_disconnect_task, sc);
+ TIMEOUT_TASK_INIT(nvmf_tq, &sc->controller_loss_task, 0,
+ nvmf_controller_loss_task, sc);
+ TIMEOUT_TASK_INIT(nvmf_tq, &sc->request_reconnect_task, 0,
+ nvmf_request_reconnect_task, sc);
oid = SYSCTL_ADD_NODE(device_get_sysctl_ctx(dev),
SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "ioq",
@@ -603,7 +614,9 @@ out:
nvmf_destroy_aer(sc);
- taskqueue_drain(taskqueue_thread, &sc->disconnect_task);
+ taskqueue_drain_timeout(nvmf_tq, &sc->request_reconnect_task);
+ taskqueue_drain_timeout(nvmf_tq, &sc->controller_loss_task);
+ taskqueue_drain(nvmf_tq, &sc->disconnect_task);
sx_destroy(&sc->connection_lock);
nvlist_destroy(sc->rparams);
free(sc->cdata, M_NVMF);
@@ -613,7 +626,7 @@ out:
void
nvmf_disconnect(struct nvmf_softc *sc)
{
- taskqueue_enqueue(taskqueue_thread, &sc->disconnect_task);
+ taskqueue_enqueue(nvmf_tq, &sc->disconnect_task);
}
static void
@@ -676,6 +689,74 @@ nvmf_disconnect_task(void *arg, int pending __unused)
nvmf_destroy_qp(sc->admin);
sc->admin = NULL;
+ if (sc->reconnect_delay != 0)
+ nvmf_request_reconnect(sc);
+ if (sc->controller_loss_timeout != 0)
+ taskqueue_enqueue_timeout(nvmf_tq,
+ &sc->controller_loss_task, sc->controller_loss_timeout *
+ hz);
+
+ sx_xunlock(&sc->connection_lock);
+}
+
+static void
+nvmf_controller_loss_task(void *arg, int pending)
+{
+ struct nvmf_softc *sc = arg;
+ device_t dev;
+ int error;
+
+ bus_topo_lock();
+ sx_xlock(&sc->connection_lock);
+ if (sc->admin != NULL || sc->detaching) {
+ /* Reconnected or already detaching. */
+ sx_xunlock(&sc->connection_lock);
+ bus_topo_unlock();
+ return;
+ }
+
+ sc->controller_timedout = true;
+ sx_xunlock(&sc->connection_lock);
+
+ /*
+ * XXX: Doing this from here is a bit ugly. We don't have an
+ * extra reference on `dev` but bus_topo_lock should block any
+ * concurrent device_delete_child invocations.
+ */
+ dev = sc->dev;
+ error = device_delete_child(root_bus, dev);
+ if (error != 0)
+ device_printf(dev,
+ "failed to detach after controller loss: %d\n", error);
+ bus_topo_unlock();
+}
+
+static void
+nvmf_request_reconnect(struct nvmf_softc *sc)
+{
+ char buf[64];
+
+ sx_assert(&sc->connection_lock, SX_LOCKED);
+
+ snprintf(buf, sizeof(buf), "name=\"%s\"", device_get_nameunit(sc->dev));
+ devctl_notify("nvme", "controller", "RECONNECT", buf);
+ taskqueue_enqueue_timeout(nvmf_tq, &sc->request_reconnect_task,
+ sc->reconnect_delay * hz);
+}
+
+static void
+nvmf_request_reconnect_task(void *arg, int pending)
+{
+ struct nvmf_softc *sc = arg;
+
+ sx_xlock(&sc->connection_lock);
+ if (sc->admin != NULL || sc->detaching || sc->controller_timedout) {
+		/* Reconnected, already detaching, or controller timed out. */
+ sx_xunlock(&sc->connection_lock);
+ return;
+ }
+
+ nvmf_request_reconnect(sc);
sx_xunlock(&sc->connection_lock);
}
@@ -699,7 +780,7 @@ nvmf_reconnect_host(struct nvmf_softc *sc, struct nvmf_ioc_nv *nv)
}
sx_xlock(&sc->connection_lock);
- if (sc->admin != NULL || sc->detaching) {
+ if (sc->admin != NULL || sc->detaching || sc->controller_timedout) {
error = EBUSY;
goto out;
}
@@ -745,6 +826,9 @@ nvmf_reconnect_host(struct nvmf_softc *sc, struct nvmf_ioc_nv *nv)
nvmf_reconnect_sim(sc);
nvmf_rescan_all_ns(sc);
+
+ taskqueue_cancel_timeout(nvmf_tq, &sc->request_reconnect_task, NULL);
+ taskqueue_cancel_timeout(nvmf_tq, &sc->controller_loss_task, NULL);
out:
sx_xunlock(&sc->connection_lock);
nvlist_destroy(nvl);
@@ -852,7 +936,21 @@ nvmf_detach(device_t dev)
}
free(sc->io, M_NVMF);
- taskqueue_drain(taskqueue_thread, &sc->disconnect_task);
+ taskqueue_drain(nvmf_tq, &sc->disconnect_task);
+ if (taskqueue_cancel_timeout(nvmf_tq, &sc->request_reconnect_task,
+ NULL) != 0)
+ taskqueue_drain_timeout(nvmf_tq, &sc->request_reconnect_task);
+
+ /*
+ * Don't cancel/drain the controller loss task if that task
+ * has fired and is triggering the detach.
+ */
+ if (!sc->controller_timedout) {
+ if (taskqueue_cancel_timeout(nvmf_tq, &sc->controller_loss_task,
+ NULL) != 0)
+ taskqueue_drain_timeout(nvmf_tq,
+ &sc->controller_loss_task);
+ }
if (sc->admin != NULL)
nvmf_destroy_qp(sc->admin);
@@ -1154,14 +1252,25 @@ static struct cdevsw nvmf_cdevsw = {
static int
nvmf_modevent(module_t mod, int what, void *arg)
{
+ int error;
+
switch (what) {
case MOD_LOAD:
- return (nvmf_ctl_load());
+ error = nvmf_ctl_load();
+ if (error != 0)
+ return (error);
+
+ nvmf_tq = taskqueue_create("nvmf", M_WAITOK | M_ZERO,
+ taskqueue_thread_enqueue, &nvmf_tq);
+ taskqueue_start_threads(&nvmf_tq, 1, PWAIT, "nvmf taskq");
+ return (0);
case MOD_QUIESCE:
return (0);
case MOD_UNLOAD:
nvmf_ctl_unload();
destroy_dev_drain(&nvmf_cdevsw);
+ if (nvmf_tq != NULL)
+ taskqueue_free(nvmf_tq);
return (0);
default:
return (EOPNOTSUPP);
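
The reconnect and controller-loss machinery above is built on the timeout_task variant of taskqueue(9); here is a minimal sketch of that pattern with illustrative names (the real code arms sc->request_reconnect_task and sc->controller_loss_task the same way):

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/taskqueue.h>

static struct taskqueue *example_tq;
static struct timeout_task example_task;

static void
example_task_fn(void *arg, int pending)
{
	/* Runs on the taskqueue thread once the delay expires. */
}

static void
example_arm(int delay_seconds)
{
	TIMEOUT_TASK_INIT(example_tq, &example_task, 0, example_task_fn,
	    NULL);
	/* Schedule delay_seconds in the future; cancel or drain with
	 * taskqueue_cancel_timeout()/taskqueue_drain_timeout().
	 */
	taskqueue_enqueue_timeout(example_tq, &example_task,
	    delay_seconds * hz);
}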
diff --git a/sys/dev/nvmf/host/nvmf_var.h b/sys/dev/nvmf/host/nvmf_var.h
index e45a31f413a4..606245b3969c 100644
--- a/sys/dev/nvmf/host/nvmf_var.h
+++ b/sys/dev/nvmf/host/nvmf_var.h
@@ -75,9 +75,15 @@ struct nvmf_softc {
struct callout ka_rx_timer;
sbintime_t ka_rx_sbt;
+ struct timeout_task request_reconnect_task;
+ struct timeout_task controller_loss_task;
+ uint32_t reconnect_delay;
+ uint32_t controller_loss_timeout;
+
struct sx connection_lock;
struct task disconnect_task;
bool detaching;
+ bool controller_timedout;
u_int num_aer;
struct nvmf_aer *aer;
diff --git a/sys/dev/nvmf/nvmf.h b/sys/dev/nvmf/nvmf.h
index d4e7b1511e9d..9b2b4c1dea40 100644
--- a/sys/dev/nvmf/nvmf.h
+++ b/sys/dev/nvmf/nvmf.h
@@ -27,6 +27,13 @@
#define NVMF_NN (1024)
/*
+ * Default timeouts for Fabrics hosts. These match values used by
+ * Linux.
+ */
+#define NVMF_DEFAULT_RECONNECT_DELAY 10
+#define NVMF_DEFAULT_CONTROLLER_LOSS 600
+
+/*
* (data, size) is the userspace buffer for a packed nvlist.
*
* For requests that copyout an nvlist, len is the amount of data
@@ -68,6 +75,8 @@ struct nvmf_ioc_nv {
*
* number trtype
* number kato (optional)
+ * number reconnect_delay (optional)
+ * number controller_loss_timeout (optional)
* qpair handoff nvlist admin
* qpair handoff nvlist array io
* binary cdata struct nvme_controller_data
@@ -81,6 +90,8 @@ struct nvmf_ioc_nv {
* string hostnqn
* number num_io_queues
* number kato (optional)
+ * number reconnect_delay (optional)
+ * number controller_loss_timeout (optional)
* number io_qsize
* bool sq_flow_control
*
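
A sketch of supplying the two new optional timeouts when building a connection nvlist; only the key names and the NVMF_DEFAULT_* values come from this header, the rest is ordinary nv(9) usage.

#include <sys/nv.h>

static nvlist_t *
example_connect_nvlist(void)
{
	nvlist_t *nvl;

	nvl = nvlist_create(0);
	/* Seconds between reconnection attempts after a disconnect. */
	nvlist_add_number(nvl, "reconnect_delay",
	    NVMF_DEFAULT_RECONNECT_DELAY);
	/* Seconds before giving up and detaching the controller. */
	nvlist_add_number(nvl, "controller_loss_timeout",
	    NVMF_DEFAULT_CONTROLLER_LOSS);
	return (nvl);
}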
diff --git a/sys/dev/ofw/ofw_bus_subr.c b/sys/dev/ofw/ofw_bus_subr.c
index 4d0479dfb957..b99d784929bc 100644
--- a/sys/dev/ofw/ofw_bus_subr.c
+++ b/sys/dev/ofw/ofw_bus_subr.c
@@ -634,11 +634,89 @@ ofw_bus_find_iparent(phandle_t node)
return (iparent);
}
+static phandle_t
+ofw_bus_search_iparent(phandle_t node)
+{
+ phandle_t iparent;
+
+ do {
+ if (OF_getencprop(node, "interrupt-parent", &iparent,
+ sizeof(iparent)) > 0) {
+ node = OF_node_from_xref(iparent);
+ } else {
+ node = OF_parent(node);
+ }
+ if (node == 0)
+ return (0);
+ } while (!OF_hasprop(node, "#interrupt-cells"));
+
+ return (OF_xref_from_node(node));
+}
+
+static int
+ofw_bus_traverse_imap(phandle_t inode, phandle_t node, uint32_t *intr,
+ int intrsz, pcell_t *res, int ressz, phandle_t *iparentp)
+{
+ struct ofw_bus_iinfo ii;
+ void *reg;
+ uint32_t *intrp;
+ phandle_t iparent;
+ int rv = 0;
+
+	/* The node is itself an interrupt controller; there is nothing to map */
+ if (OF_hasprop(node, "interrupt-controller"))
+ return (0);
+
+ intrp = malloc(intrsz, M_OFWPROP, M_WAITOK);
+ memcpy(intrp, intr, intrsz);
+
+ while (true) {
+ /* There is no interrupt-map to follow */
+ if (!OF_hasprop(inode, "interrupt-map")) {
+ free(intrp, M_OFWPROP);
+ return (0);
+ }
+
+ memset(&ii, 0, sizeof(ii));
+ ofw_bus_setup_iinfo(inode, &ii, sizeof(cell_t));
+
+ reg = NULL;
+ if (ii.opi_addrc > 0)
+ reg = malloc(ii.opi_addrc, M_OFWPROP, M_WAITOK);
+
+ rv = ofw_bus_lookup_imap(node, &ii, reg, ii.opi_addrc, intrp,
+ intrsz, res, ressz, &iparent);
+
+ free(reg, M_OFWPROP);
+ free(ii.opi_imap, M_OFWPROP);
+ free(ii.opi_imapmsk, M_OFWPROP);
+ free(intrp, M_OFWPROP);
+
+ if (rv == 0)
+ return (0);
+
+ node = inode;
+ inode = OF_node_from_xref(iparent);
+
+ /* Stop when we have an interrupt controller */
+ if (OF_hasprop(inode, "interrupt-controller")) {
+ *iparentp = iparent;
+ return (rv);
+ }
+
+ intrsz = rv * sizeof(pcell_t);
+ intrp = malloc(intrsz, M_OFWPROP, M_WAITOK);
+ memcpy(intrp, res, intrsz);
+ }
+}
+
int
ofw_bus_intr_to_rl(device_t dev, phandle_t node,
struct resource_list *rl, int *rlen)
{
- phandle_t iparent;
+ phandle_t iparent, iparent_node;
+ uint32_t result[16];
+ uint32_t intrpcells, *intrp;
uint32_t icells, *intr;
int err, i, irqnum, nintr, rid;
bool extended;
@@ -646,15 +724,16 @@ ofw_bus_intr_to_rl(device_t dev, phandle_t node,
nintr = OF_getencprop_alloc_multi(node, "interrupts", sizeof(*intr),
(void **)&intr);
if (nintr > 0) {
- iparent = ofw_bus_find_iparent(node);
+ iparent = ofw_bus_search_iparent(node);
if (iparent == 0) {
device_printf(dev, "No interrupt-parent found, "
"assuming direct parent\n");
iparent = OF_parent(node);
iparent = OF_xref_from_node(iparent);
}
- if (OF_searchencprop(OF_node_from_xref(iparent),
- "#interrupt-cells", &icells, sizeof(icells)) == -1) {
+ iparent_node = OF_node_from_xref(iparent);
+ if (OF_searchencprop(iparent_node, "#interrupt-cells", &icells,
+ sizeof(icells)) == -1) {
device_printf(dev, "Missing #interrupt-cells "
"property, assuming <1>\n");
icells = 1;
@@ -677,7 +756,8 @@ ofw_bus_intr_to_rl(device_t dev, phandle_t node,
for (i = 0; i < nintr; i += icells) {
if (extended) {
iparent = intr[i++];
- if (OF_searchencprop(OF_node_from_xref(iparent),
+ iparent_node = OF_node_from_xref(iparent);
+ if (OF_searchencprop(iparent_node,
"#interrupt-cells", &icells, sizeof(icells)) == -1) {
device_printf(dev, "Missing #interrupt-cells "
"property\n");
@@ -691,7 +771,16 @@ ofw_bus_intr_to_rl(device_t dev, phandle_t node,
break;
}
}
- irqnum = ofw_bus_map_intr(dev, iparent, icells, &intr[i]);
+
+ intrp = &intr[i];
+ intrpcells = ofw_bus_traverse_imap(iparent_node, node, intrp,
+ icells * sizeof(intr[0]), result, sizeof(result), &iparent);
+ if (intrpcells > 0)
+ intrp = result;
+ else
+ intrpcells = icells;
+
+ irqnum = ofw_bus_map_intr(dev, iparent, intrpcells, intrp);
resource_list_add(rl, SYS_RES_IRQ, rid++, irqnum, irqnum, 1);
}
if (rlen != NULL)
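With the interrupt-map traversal in place, a bus driver keeps calling the same entry point; only the translation behind it changed. A hypothetical attach fragment, sketch only, assuming the caller owns the resource list:

    #include <sys/param.h>
    #include <sys/bus.h>
    #include <sys/rman.h>
    #include <dev/ofw/openfirm.h>
    #include <dev/ofw/ofw_bus_subr.h>

    /*
     * Collect the IRQs described by "interrupts"/"interrupts-extended",
     * following any interrupt-map entries between this node and its
     * interrupt controller.
     */
    static int
    example_add_irqs(device_t dev, phandle_t node, struct resource_list *rl)
    {
            int nirq;

            resource_list_init(rl);
            return (ofw_bus_intr_to_rl(dev, node, rl, &nirq));
    }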
diff --git a/sys/dev/qlnx/qlnxe/qlnx_os.c b/sys/dev/qlnx/qlnxe/qlnx_os.c
index 05ec69a70dfe..4ad190374f87 100644
--- a/sys/dev/qlnx/qlnxe/qlnx_os.c
+++ b/sys/dev/qlnx/qlnxe/qlnx_os.c
@@ -30,6 +30,8 @@
* Author : David C Somayajulu, Cavium, Inc., San Jose, CA 95131.
*/
+#include "opt_inet.h"
+
#include <sys/cdefs.h>
#include "qlnx_os.h"
#include "bcm_osal.h"
@@ -2306,8 +2308,6 @@ qlnx_init_ifnet(device_t dev, qlnx_host_t *ha)
else if (device_id == QLOGIC_PCI_DEVICE_ID_1644)
if_setbaudrate(ifp, IF_Gbps(100));
- if_setcapabilities(ifp, IFCAP_LINKSTATE);
-
if_setinitfn(ifp, qlnx_init);
if_setsoftc(ifp, ha);
if_setflags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST);
@@ -2341,7 +2341,6 @@ qlnx_init_ifnet(device_t dev, qlnx_host_t *ha)
if_setcapabilities(ifp, IFCAP_HWCSUM);
if_setcapabilitiesbit(ifp, IFCAP_JUMBO_MTU, 0);
-
if_setcapabilitiesbit(ifp, IFCAP_VLAN_MTU, 0);
if_setcapabilitiesbit(ifp, IFCAP_VLAN_HWTAGGING, 0);
if_setcapabilitiesbit(ifp, IFCAP_VLAN_HWFILTER, 0);
@@ -2350,6 +2349,8 @@ qlnx_init_ifnet(device_t dev, qlnx_host_t *ha)
if_setcapabilitiesbit(ifp, IFCAP_TSO4, 0);
if_setcapabilitiesbit(ifp, IFCAP_TSO6, 0);
if_setcapabilitiesbit(ifp, IFCAP_LRO, 0);
+ if_setcapabilitiesbit(ifp, IFCAP_LINKSTATE, 0);
+ if_setcapabilitiesbit(ifp, IFCAP_HWSTATS, 0);
if_sethwtsomax(ifp, QLNX_MAX_TSO_FRAME_SIZE -
(ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN));
@@ -2778,7 +2779,7 @@ qlnx_ioctl(if_t ifp, u_long cmd, caddr_t data)
if (!p_ptt) {
QL_DPRINT1(ha, "ecore_ptt_acquire failed\n");
- ret = -1;
+ ret = ERESTART;
break;
}
@@ -2789,7 +2790,7 @@ qlnx_ioctl(if_t ifp, u_long cmd, caddr_t data)
ecore_ptt_release(p_hwfn, p_ptt);
if (ret) {
- ret = -1;
+ ret = ENODEV;
break;
}
diff --git a/sys/dev/random/fortuna.c b/sys/dev/random/fortuna.c
index c4282c723a44..8363de99a60a 100644
--- a/sys/dev/random/fortuna.c
+++ b/sys/dev/random/fortuna.c
@@ -341,6 +341,13 @@ random_fortuna_process_event(struct harvest_event *event)
u_int pl;
RANDOM_RESEED_LOCK();
+ /*
+ * Run SP 800-90B health tests on the source if so configured.
+ */
+ if (!random_harvest_healthtest(event)) {
+ RANDOM_RESEED_UNLOCK();
+ return;
+ }
/*-
* FS&K - P_i = P_i|<harvested stuff>
* Accumulate the event into the appropriate pool
diff --git a/sys/dev/random/random_harvestq.c b/sys/dev/random/random_harvestq.c
index 379b64ac15f1..c7762967c4fb 100644
--- a/sys/dev/random/random_harvestq.c
+++ b/sys/dev/random/random_harvestq.c
@@ -88,6 +88,8 @@ static void random_sources_feed(void);
static __read_mostly bool epoch_inited;
static __read_mostly epoch_t rs_epoch;
+static const char *random_source_descr[ENTROPYSOURCE];
+
/*
* How many events to queue up. We create this many items in
* an 'empty' queue, then transfer them to the 'harvest' queue with
@@ -131,36 +133,25 @@ static struct harvest_context {
/* The context of the kernel thread processing harvested entropy */
struct proc *hc_kthread_proc;
/*
- * Lockless ring buffer holding entropy events
- * If ring.in == ring.out,
- * the buffer is empty.
- * If ring.in != ring.out,
- * the buffer contains harvested entropy.
- * If (ring.in + 1) == ring.out (mod RANDOM_RING_MAX),
- * the buffer is full.
- *
- * NOTE: ring.in points to the last added element,
- * and ring.out points to the last consumed element.
- *
- * The ring.in variable needs locking as there are multiple
- * sources to the ring. Only the sources may change ring.in,
- * but the consumer may examine it.
- *
- * The ring.out variable does not need locking as there is
- * only one consumer. Only the consumer may change ring.out,
- * but the sources may examine it.
+ * A pair of buffers for queued events. New events are added to the
+ * active queue while the kthread processes the other one in parallel.
*/
- struct entropy_ring {
+ struct entropy_buffer {
struct harvest_event ring[RANDOM_RING_MAX];
- volatile u_int in;
- volatile u_int out;
- } hc_entropy_ring;
+ u_int pos;
+ } hc_entropy_buf[2];
+ u_int hc_active_buf;
struct fast_entropy_accumulator {
volatile u_int pos;
uint32_t buf[RANDOM_ACCUM_MAX];
} hc_entropy_fast_accumulator;
} harvest_context;
+#define RANDOM_HARVEST_INIT_LOCK() mtx_init(&harvest_context.hc_mtx, \
+ "entropy harvest mutex", NULL, MTX_SPIN)
+#define RANDOM_HARVEST_LOCK() mtx_lock_spin(&harvest_context.hc_mtx)
+#define RANDOM_HARVEST_UNLOCK() mtx_unlock_spin(&harvest_context.hc_mtx)
+
static struct kproc_desc random_proc_kp = {
"rand_harvestq",
random_kthread,
@@ -178,43 +169,48 @@ random_harvestq_fast_process_event(struct harvest_event *event)
static void
random_kthread(void)
{
- u_int maxloop, ring_out, i;
+ struct harvest_context *hc;
- /*
- * Locking is not needed as this is the only place we modify ring.out, and
- * we only examine ring.in without changing it. Both of these are volatile,
- * and this is a unique thread.
- */
+ hc = &harvest_context;
for (random_kthread_control = 1; random_kthread_control;) {
- /* Deal with events, if any. Restrict the number we do in one go. */
- maxloop = RANDOM_RING_MAX;
- while (harvest_context.hc_entropy_ring.out != harvest_context.hc_entropy_ring.in) {
- ring_out = (harvest_context.hc_entropy_ring.out + 1)%RANDOM_RING_MAX;
- random_harvestq_fast_process_event(harvest_context.hc_entropy_ring.ring + ring_out);
- harvest_context.hc_entropy_ring.out = ring_out;
- if (!--maxloop)
- break;
- }
+ struct entropy_buffer *buf;
+ u_int entries;
+
+ /* Deal with queued events. */
+ RANDOM_HARVEST_LOCK();
+ buf = &hc->hc_entropy_buf[hc->hc_active_buf];
+ entries = buf->pos;
+ buf->pos = 0;
+ hc->hc_active_buf = (hc->hc_active_buf + 1) %
+ nitems(hc->hc_entropy_buf);
+ RANDOM_HARVEST_UNLOCK();
+ for (u_int i = 0; i < entries; i++)
+ random_harvestq_fast_process_event(&buf->ring[i]);
+
+ /* Poll sources of noise. */
random_sources_feed();
+
/* XXX: FIX!! Increase the high-performance data rate? Need some measurements first. */
- for (i = 0; i < RANDOM_ACCUM_MAX; i++) {
- if (harvest_context.hc_entropy_fast_accumulator.buf[i]) {
- random_harvest_direct(harvest_context.hc_entropy_fast_accumulator.buf + i, sizeof(harvest_context.hc_entropy_fast_accumulator.buf[0]), RANDOM_UMA);
- harvest_context.hc_entropy_fast_accumulator.buf[i] = 0;
+ for (u_int i = 0; i < RANDOM_ACCUM_MAX; i++) {
+ if (hc->hc_entropy_fast_accumulator.buf[i]) {
+ random_harvest_direct(&hc->hc_entropy_fast_accumulator.buf[i],
+ sizeof(hc->hc_entropy_fast_accumulator.buf[0]), RANDOM_UMA);
+ hc->hc_entropy_fast_accumulator.buf[i] = 0;
}
}
/* XXX: FIX!! This is a *great* place to pass hardware/live entropy to random(9) */
- tsleep_sbt(&harvest_context.hc_kthread_proc, 0, "-",
+ tsleep_sbt(&hc->hc_kthread_proc, 0, "-",
SBT_1S/RANDOM_KTHREAD_HZ, 0, C_PREL(1));
}
random_kthread_control = -1;
- wakeup(&harvest_context.hc_kthread_proc);
+ wakeup(&hc->hc_kthread_proc);
kproc_exit(0);
/* NOTREACHED */
}
-/* This happens well after SI_SUB_RANDOM */
SYSINIT(random_device_h_proc, SI_SUB_KICK_SCHEDULER, SI_ORDER_ANY, kproc_start,
&random_proc_kp);
+_Static_assert(SI_SUB_KICK_SCHEDULER > SI_SUB_RANDOM,
+ "random kthread starting before subsystem initialization");
static void
rs_epoch_init(void *dummy __unused)
@@ -305,7 +301,230 @@ random_sources_feed(void)
explicit_bzero(entropy, sizeof(entropy));
}
-/* ARGSUSED */
+/*
+ * State used for conducting NIST SP 800-90B health tests on entropy sources.
+ */
+static struct health_test_softc {
+ uint32_t ht_rct_value[HARVESTSIZE + 1];
+ u_int ht_rct_count; /* number of samples with the same value */
+ u_int ht_rct_limit; /* constant after init */
+
+ uint32_t ht_apt_value[HARVESTSIZE + 1];
+ u_int ht_apt_count; /* number of samples with the same value */
+ u_int ht_apt_seq; /* sequence number of the last sample */
+ u_int ht_apt_cutoff; /* constant after init */
+
+ uint64_t ht_total_samples;
+ bool ondemand; /* Set to true to restart the state machine */
+ enum {
+ INIT = 0, /* initial state */
+ DISABLED, /* health checking is disabled */
+ STARTUP, /* doing startup tests, samples are discarded */
+ STEADY, /* steady-state operation */
+ FAILED, /* health check failed, discard samples */
+ } ht_state;
+} healthtest[ENTROPYSOURCE];
+
+#define RANDOM_SELFTEST_STARTUP_SAMPLES 1024 /* 4.3, requirement 4 */
+#define RANDOM_SELFTEST_APT_WINDOW 512 /* 4.4.2 */
+
+static void
+copy_event(uint32_t dst[static HARVESTSIZE + 1],
+ const struct harvest_event *event)
+{
+ memset(dst, 0, sizeof(uint32_t) * (HARVESTSIZE + 1));
+ memcpy(dst, event->he_entropy, event->he_size);
+ dst[HARVESTSIZE] = event->he_somecounter;
+}
+
+static void
+random_healthtest_rct_init(struct health_test_softc *ht,
+ const struct harvest_event *event)
+{
+ ht->ht_rct_count = 1;
+ copy_event(ht->ht_rct_value, event);
+}
+
+/*
+ * Apply the repetition count test to a sample.
+ *
+ * Return false if the test failed, i.e., we observed >= C consecutive samples
+ * with the same value, and true otherwise.
+ */
+static bool
+random_healthtest_rct_next(struct health_test_softc *ht,
+ const struct harvest_event *event)
+{
+ uint32_t val[HARVESTSIZE + 1];
+
+ copy_event(val, event);
+ if (memcmp(val, ht->ht_rct_value, sizeof(ht->ht_rct_value)) != 0) {
+ ht->ht_rct_count = 1;
+ memcpy(ht->ht_rct_value, val, sizeof(ht->ht_rct_value));
+ return (true);
+ } else {
+ ht->ht_rct_count++;
+ return (ht->ht_rct_count < ht->ht_rct_limit);
+ }
+}
+
+static void
+random_healthtest_apt_init(struct health_test_softc *ht,
+ const struct harvest_event *event)
+{
+ ht->ht_apt_count = 1;
+ ht->ht_apt_seq = 1;
+ copy_event(ht->ht_apt_value, event);
+}
+
+static bool
+random_healthtest_apt_next(struct health_test_softc *ht,
+ const struct harvest_event *event)
+{
+ uint32_t val[HARVESTSIZE + 1];
+
+ if (ht->ht_apt_seq == 0) {
+ random_healthtest_apt_init(ht, event);
+ return (true);
+ }
+
+ copy_event(val, event);
+ if (memcmp(val, ht->ht_apt_value, sizeof(ht->ht_apt_value)) == 0) {
+ ht->ht_apt_count++;
+ if (ht->ht_apt_count >= ht->ht_apt_cutoff)
+ return (false);
+ }
+
+ ht->ht_apt_seq++;
+ if (ht->ht_apt_seq == RANDOM_SELFTEST_APT_WINDOW)
+ ht->ht_apt_seq = 0;
+
+ return (true);
+}
+
+/*
+ * Run the health tests for the given event. This is assumed to be called from
+ * a serialized context.
+ */
+bool
+random_harvest_healthtest(const struct harvest_event *event)
+{
+ struct health_test_softc *ht;
+
+ ht = &healthtest[event->he_source];
+
+ /*
+ * Was on-demand testing requested? If so, restart the state
+ * machine, re-running the startup tests.
+ */
+ if (atomic_load_bool(&ht->ondemand)) {
+ atomic_store_bool(&ht->ondemand, false);
+ ht->ht_state = INIT;
+ }
+
+ switch (ht->ht_state) {
+ case INIT:
+ /* Store the first sample and initialize test state. */
+ random_healthtest_rct_init(ht, event);
+ random_healthtest_apt_init(ht, event);
+ ht->ht_total_samples = 0;
+ ht->ht_state = STARTUP;
+ return (false);
+ case DISABLED:
+ /* No health testing for this source. */
+ return (true);
+ case STEADY:
+ case STARTUP:
+ ht->ht_total_samples++;
+ if (random_healthtest_rct_next(ht, event) &&
+ random_healthtest_apt_next(ht, event)) {
+ if (ht->ht_state == STARTUP &&
+ ht->ht_total_samples >=
+ RANDOM_SELFTEST_STARTUP_SAMPLES) {
+ printf(
+ "random: health test passed for source %s\n",
+ random_source_descr[event->he_source]);
+ ht->ht_state = STEADY;
+ }
+ return (ht->ht_state == STEADY);
+ }
+ ht->ht_state = FAILED;
+ printf(
+ "random: health test failed for source %s, discarding samples\n",
+ random_source_descr[event->he_source]);
+ /* FALLTHROUGH */
+ case FAILED:
+ return (false);
+ }
+}
+
+static bool nist_healthtest_enabled = false;
+SYSCTL_BOOL(_kern_random, OID_AUTO, nist_healthtest_enabled,
+ CTLFLAG_RDTUN, &nist_healthtest_enabled, 0,
+ "Enable NIST SP 800-90B health tests for noise sources");
+
+static void
+random_healthtest_init(enum random_entropy_source source)
+{
+ struct health_test_softc *ht;
+
+ ht = &healthtest[source];
+ KASSERT(ht->ht_state == INIT,
+ ("%s: health test state is %d for source %d",
+ __func__, ht->ht_state, source));
+
+ /*
+ * If health-testing is enabled, validate all sources except CACHED and
+ * VMGENID: they are deterministic sources used only a small, fixed
+ * number of times, so statistical testing is not applicable.
+ */
+ if (!nist_healthtest_enabled ||
+ source == RANDOM_CACHED || source == RANDOM_PURE_VMGENID) {
+ ht->ht_state = DISABLED;
+ return;
+ }
+
+ /*
+ * Set cutoff values for the two tests, assuming that each sample has
+ * min-entropy of 1 bit and allowing for an error rate of 1 in 2^{34}.
+ * With a sample rate of RANDOM_KTHREAD_HZ, we expect to see a false
+ * positive once in ~54.5 years.
+ *
+ * The RCT limit comes from the formula in section 4.4.1.
+ *
+ * The APT cutoff is calculated using the formula in section 4.4.2
+ * footnote 10 with the window size changed from 512 to 511, since the
+ * test as written counts the number of samples equal to the first
+ * sample in the window, and thus tests W-1 samples.
+ */
+ ht->ht_rct_limit = 35;
+ ht->ht_apt_cutoff = 330;
+}
+
+static int
+random_healthtest_ondemand(SYSCTL_HANDLER_ARGS)
+{
+ u_int mask, source;
+ int error;
+
+ mask = 0;
+ error = sysctl_handle_int(oidp, &mask, 0, req);
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+
+ while (mask != 0) {
+ source = ffs(mask) - 1;
+ if (source < nitems(healthtest))
+ atomic_store_bool(&healthtest[source].ondemand, true);
+ mask &= ~(1u << source);
+ }
+ return (0);
+}
+SYSCTL_PROC(_kern_random, OID_AUTO, nist_healthtest_ondemand,
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0,
+ random_healthtest_ondemand, "I",
+ "Re-run NIST SP 800-90B startup health tests for a noise source");
+
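The handler accepts a bitmask of source indices, so userland can retest several sources at once. A small userland sketch using sysctlbyname(3); the sysctl name matches the SYSCTL_PROC above, while the chosen source index is purely illustrative:

    #include <sys/types.h>
    #include <sys/sysctl.h>
    #include <stdio.h>

    int
    main(void)
    {
            /* Each set bit requests a retest of that source index. */
            unsigned int mask = 1u << 0;

            if (sysctlbyname("kern.random.nist_healthtest_ondemand",
                NULL, NULL, &mask, sizeof(mask)) != 0) {
                    perror("sysctlbyname");
                    return (1);
            }
            return (0);
    }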
static int
random_check_uint_harvestmask(SYSCTL_HANDLER_ARGS)
{
@@ -336,7 +555,6 @@ SYSCTL_PROC(_kern_random_harvest, OID_AUTO, mask,
random_check_uint_harvestmask, "IU",
"Entropy harvesting mask");
-/* ARGSUSED */
static int
random_print_harvestmask(SYSCTL_HANDLER_ARGS)
{
@@ -370,7 +588,8 @@ static const char *random_source_descr[ENTROPYSOURCE] = {
[RANDOM_SWI] = "SWI",
[RANDOM_FS_ATIME] = "FS_ATIME",
[RANDOM_UMA] = "UMA",
- [RANDOM_CALLOUT] = "CALLOUT", /* ENVIRONMENTAL_END */
+ [RANDOM_CALLOUT] = "CALLOUT",
+ [RANDOM_RANDOMDEV] = "RANDOMDEV", /* ENVIRONMENTAL_END */
[RANDOM_PURE_OCTEON] = "PURE_OCTEON", /* PURE_START */
[RANDOM_PURE_SAFE] = "PURE_SAFE",
[RANDOM_PURE_GLXSB] = "PURE_GLXSB",
@@ -390,7 +609,6 @@ static const char *random_source_descr[ENTROPYSOURCE] = {
/* "ENTROPYSOURCE" */
};
-/* ARGSUSED */
static int
random_print_harvestmask_symbolic(SYSCTL_HANDLER_ARGS)
{
@@ -423,7 +641,6 @@ SYSCTL_PROC(_kern_random_harvest, OID_AUTO, mask_symbolic,
random_print_harvestmask_symbolic, "A",
"Entropy harvesting mask (symbolic)");
-/* ARGSUSED */
static void
random_harvestq_init(void *unused __unused)
{
@@ -433,7 +650,10 @@ random_harvestq_init(void *unused __unused)
hc_source_mask = almost_everything_mask;
RANDOM_HARVEST_INIT_LOCK();
- harvest_context.hc_entropy_ring.in = harvest_context.hc_entropy_ring.out = 0;
+ harvest_context.hc_active_buf = 0;
+
+ for (int i = 0; i < ENTROPYSOURCE; i++)
+ random_healthtest_init(i);
}
SYSINIT(random_device_h_init, SI_SUB_RANDOM, SI_ORDER_THIRD, random_harvestq_init, NULL);
@@ -453,7 +673,7 @@ random_early_prime(char *entropy, size_t len)
return (0);
for (i = 0; i < len; i += sizeof(event.he_entropy)) {
- event.he_somecounter = (uint32_t)get_cyclecount();
+ event.he_somecounter = random_get_cyclecount();
event.he_size = sizeof(event.he_entropy);
event.he_source = RANDOM_CACHED;
event.he_destination =
@@ -493,7 +713,6 @@ random_prime_loader_file(const char *type)
* known to the kernel, and inserting it directly into the hashing
* module, currently Fortuna.
*/
-/* ARGSUSED */
static void
random_harvestq_prime(void *unused __unused)
{
@@ -522,7 +741,6 @@ random_harvestq_prime(void *unused __unused)
}
SYSINIT(random_device_prime, SI_SUB_RANDOM, SI_ORDER_MIDDLE, random_harvestq_prime, NULL);
-/* ARGSUSED */
static void
random_harvestq_deinit(void *unused __unused)
{
@@ -540,9 +758,9 @@ SYSUNINIT(random_device_h_init, SI_SUB_RANDOM, SI_ORDER_THIRD, random_harvestq_d
* This is supposed to be fast; do not do anything slow in here!
* It is also illegal (and morally reprehensible) to insert any
* high-rate data here. "High-rate" is defined as a data source
- * that will usually cause lots of failures of the "Lockless read"
- * check a few lines below. This includes the "always-on" sources
- * like the Intel "rdrand" or the VIA Nehamiah "xstore" sources.
+ * that is likely to fill up the buffer in much less than 100ms.
+ * This includes the "always-on" sources like the Intel "rdrand"
+ * or the VIA Nehemiah "xstore" sources.
*/
/* XXXRW: get_cyclecount() is cheap on most modern hardware, where cycle
* counters are built in, but on older hardware it will do a real time clock
@@ -551,28 +769,29 @@ SYSUNINIT(random_device_h_init, SI_SUB_RANDOM, SI_ORDER_THIRD, random_harvestq_d
void
random_harvest_queue_(const void *entropy, u_int size, enum random_entropy_source origin)
{
+ struct harvest_context *hc;
+ struct entropy_buffer *buf;
struct harvest_event *event;
- u_int ring_in;
- KASSERT(origin >= RANDOM_START && origin < ENTROPYSOURCE, ("%s: origin %d invalid\n", __func__, origin));
+ KASSERT(origin >= RANDOM_START && origin < ENTROPYSOURCE,
+ ("%s: origin %d invalid", __func__, origin));
+
+ hc = &harvest_context;
RANDOM_HARVEST_LOCK();
- ring_in = (harvest_context.hc_entropy_ring.in + 1)%RANDOM_RING_MAX;
- if (ring_in != harvest_context.hc_entropy_ring.out) {
- /* The ring is not full */
- event = harvest_context.hc_entropy_ring.ring + ring_in;
- event->he_somecounter = (uint32_t)get_cyclecount();
+ buf = &hc->hc_entropy_buf[hc->hc_active_buf];
+ if (buf->pos < RANDOM_RING_MAX) {
+ event = &buf->ring[buf->pos++];
+ event->he_somecounter = random_get_cyclecount();
event->he_source = origin;
- event->he_destination = harvest_context.hc_destination[origin]++;
+ event->he_destination = hc->hc_destination[origin]++;
if (size <= sizeof(event->he_entropy)) {
event->he_size = size;
memcpy(event->he_entropy, entropy, size);
- }
- else {
+ } else {
/* Big event, so squash it */
event->he_size = sizeof(event->he_entropy[0]);
event->he_entropy[0] = jenkins_hash(entropy, size, (uint32_t)(uintptr_t)event);
}
- harvest_context.hc_entropy_ring.in = ring_in;
}
RANDOM_HARVEST_UNLOCK();
}
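The queue/consume split above is a textbook double buffer: producers append to the active buffer under a spin mutex, and the single consumer swaps buffers so it can process entries without holding the lock. A generic sketch of the same discipline, detached from the entropy types (names illustrative; assumes exactly one consumer):

    #include <sys/param.h>
    #include <sys/lock.h>
    #include <sys/mutex.h>

    #define DB_NITEMS       128

    static struct {
            int buf[2][DB_NITEMS];
            u_int pos[2];
            u_int active;
            struct mtx mtx;
    } db;

    static void
    db_init(void)
    {
            mtx_init(&db.mtx, "db example", NULL, MTX_SPIN);
    }

    /* Producer: append under the lock; drop the item when full. */
    static void
    db_put(int v)
    {
            mtx_lock_spin(&db.mtx);
            if (db.pos[db.active] < DB_NITEMS)
                    db.buf[db.active][db.pos[db.active]++] = v;
            mtx_unlock_spin(&db.mtx);
    }

    /* Consumer: swap buffers, then walk the retired one lock-free. */
    static void
    db_process(void (*fn)(int))
    {
            u_int idx, n;

            mtx_lock_spin(&db.mtx);
            idx = db.active;
            n = db.pos[idx];
            db.pos[idx] = 0;
            db.active ^= 1;
            mtx_unlock_spin(&db.mtx);

            for (u_int i = 0; i < n; i++)
                    fn(db.buf[idx][i]);
    }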
@@ -589,7 +808,8 @@ random_harvest_fast_(const void *entropy, u_int size)
u_int pos;
pos = harvest_context.hc_entropy_fast_accumulator.pos;
- harvest_context.hc_entropy_fast_accumulator.buf[pos] ^= jenkins_hash(entropy, size, (uint32_t)get_cyclecount());
+ harvest_context.hc_entropy_fast_accumulator.buf[pos] ^=
+ jenkins_hash(entropy, size, random_get_cyclecount());
harvest_context.hc_entropy_fast_accumulator.pos = (pos + 1)%RANDOM_ACCUM_MAX;
}
@@ -606,7 +826,7 @@ random_harvest_direct_(const void *entropy, u_int size, enum random_entropy_sour
KASSERT(origin >= RANDOM_START && origin < ENTROPYSOURCE, ("%s: origin %d invalid\n", __func__, origin));
size = MIN(size, sizeof(event.he_entropy));
- event.he_somecounter = (uint32_t)get_cyclecount();
+ event.he_somecounter = random_get_cyclecount();
event.he_size = size;
event.he_source = origin;
event.he_destination = harvest_context.hc_destination[origin]++;
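For reference, the two cutoffs assigned in random_healthtest_init() follow from NIST SP 800-90B under the stated assumptions (min-entropy H = 1 bit per sample, error rate alpha = 2^{-34}). The repetition count limit is the closed-form expression from section 4.4.1:

    C_{RCT} = 1 + \left\lceil \frac{-\log_2 \alpha}{H} \right\rceil
            = 1 + \lceil 34 / 1 \rceil = 35

The adaptive proportion cutoff comes from the inverse binomial CDF of section 4.4.2, footnote 10, with the effective window W = 511 noted in the comment:

    C_{APT} = 1 + \mathrm{CRITBINOM}\left(W,\; 2^{-H},\; 1 - \alpha\right) = 330

The 330 restates the constant set in the patch; the binomial inversion itself is not reproduced here.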
diff --git a/sys/dev/random/random_harvestq.h b/sys/dev/random/random_harvestq.h
index 69a9dfabd44a..1d462500df85 100644
--- a/sys/dev/random/random_harvestq.h
+++ b/sys/dev/random/random_harvestq.h
@@ -27,6 +27,9 @@
#ifndef SYS_DEV_RANDOM_RANDOM_HARVESTQ_H_INCLUDED
#define SYS_DEV_RANDOM_RANDOM_HARVESTQ_H_INCLUDED
+#include <sys/types.h>
+#include <machine/cpu.h>
+
#define HARVESTSIZE 2 /* Max length in words of each harvested entropy unit */
/* These are used to queue harvested packets of entropy. The entropy
@@ -40,8 +43,12 @@ struct harvest_event {
uint8_t he_source; /* origin of the entropy */
};
-#define RANDOM_HARVEST_INIT_LOCK(x) mtx_init(&harvest_context.hc_mtx, "entropy harvest mutex", NULL, MTX_SPIN)
-#define RANDOM_HARVEST_LOCK(x) mtx_lock_spin(&harvest_context.hc_mtx)
-#define RANDOM_HARVEST_UNLOCK(x) mtx_unlock_spin(&harvest_context.hc_mtx)
+static inline uint32_t
+random_get_cyclecount(void)
+{
+ return ((uint32_t)get_cyclecount());
+}
+
+bool random_harvest_healthtest(const struct harvest_event *event);
#endif /* SYS_DEV_RANDOM_RANDOM_HARVESTQ_H_INCLUDED */
diff --git a/sys/dev/random/randomdev.c b/sys/dev/random/randomdev.c
index 6d637ab5a53e..ced4dd8067d9 100644
--- a/sys/dev/random/randomdev.c
+++ b/sys/dev/random/randomdev.c
@@ -303,16 +303,16 @@ randomdev_accumulate(uint8_t *buf, u_int count)
/* Extra timing here is helpful to scrape scheduler jitter entropy */
randomdev_hash_init(&hash);
- timestamp = (uint32_t)get_cyclecount();
+ timestamp = random_get_cyclecount();
randomdev_hash_iterate(&hash, &timestamp, sizeof(timestamp));
randomdev_hash_iterate(&hash, buf, count);
- timestamp = (uint32_t)get_cyclecount();
+ timestamp = random_get_cyclecount();
randomdev_hash_iterate(&hash, &timestamp, sizeof(timestamp));
randomdev_hash_finish(&hash, entropy_data);
for (i = 0; i < RANDOM_KEYSIZE_WORDS; i += sizeof(event.he_entropy)/sizeof(event.he_entropy[0])) {
- event.he_somecounter = (uint32_t)get_cyclecount();
+ event.he_somecounter = random_get_cyclecount();
event.he_size = sizeof(event.he_entropy);
- event.he_source = RANDOM_CACHED;
+ event.he_source = RANDOM_RANDOMDEV;
event.he_destination = destination++; /* Harmless cheating */
memcpy(event.he_entropy, entropy_data + i, sizeof(event.he_entropy));
p_random_alg_context->ra_event_processor(&event);
diff --git a/sys/dev/regulator/regulator_fixed.c b/sys/dev/regulator/regulator_fixed.c
index 0a76da7140a0..55cdb5e4aeae 100644
--- a/sys/dev/regulator/regulator_fixed.c
+++ b/sys/dev/regulator/regulator_fixed.c
@@ -100,12 +100,8 @@ static struct gpio_entry *
regnode_get_gpio_entry(struct gpiobus_pin *gpio_pin)
{
struct gpio_entry *entry, *tmp;
- device_t busdev;
int rv;
- busdev = GPIO_GET_BUS(gpio_pin->dev);
- if (busdev == NULL)
- return (NULL);
entry = malloc(sizeof(struct gpio_entry), M_FIXEDREGULATOR,
M_WAITOK | M_ZERO);
@@ -122,8 +118,8 @@ regnode_get_gpio_entry(struct gpiobus_pin *gpio_pin)
}
/* Reserve pin. */
- /* XXX Can we call gpiobus_acquire_pin() with gpio_list_mtx held? */
- rv = gpiobus_acquire_pin(busdev, gpio_pin->pin);
+ /* XXX Can we call gpio_pin_acquire() with gpio_list_mtx held? */
+ rv = gpio_pin_acquire(gpio_pin);
if (rv != 0) {
mtx_unlock(&gpio_list_mtx);
free(entry, M_FIXEDREGULATOR);
diff --git a/sys/dev/sound/midi/midi.c b/sys/dev/sound/midi/midi.c
index fbfb69de2913..6753f864ba9c 100644
--- a/sys/dev/sound/midi/midi.c
+++ b/sys/dev/sound/midi/midi.c
@@ -30,12 +30,6 @@
* POSSIBILITY OF SUCH DAMAGE.
*/
- /*
- * Parts of this file started out as NetBSD: midi.c 1.31
- * They are mostly gone. Still the most obvious will be the state
- * machine midi_in
- */
-
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/queue.h>
@@ -66,7 +60,6 @@
#include "mpu_if.h"
#include <dev/sound/midi/midiq.h>
-#include "synth_if.h"
MALLOC_DEFINE(M_MIDI, "midi buffers", "Midi data allocation area");
#ifndef KOBJMETHOD_END
@@ -79,17 +72,6 @@ enum midi_states {
MIDI_IN_START, MIDI_IN_SYSEX, MIDI_IN_DATA
};
-/*
- * The MPU interface current has init() uninit() inqsize() outqsize()
- * callback() : fiddle with the tx|rx status.
- */
-
-#include "mpu_if.h"
-
-/*
- * /dev/rmidi Structure definitions
- */
-
#define MIDI_NAMELEN 16
struct snd_midi {
KOBJ_FIELDS;
@@ -115,95 +97,13 @@ struct snd_midi {
* complete command packets. */
struct proc *async;
struct cdev *dev;
- struct synth_midi *synth;
- int synth_flags;
TAILQ_ENTRY(snd_midi) link;
};
-struct synth_midi {
- KOBJ_FIELDS;
- struct snd_midi *m;
-};
-
-static synth_open_t midisynth_open;
-static synth_close_t midisynth_close;
-static synth_writeraw_t midisynth_writeraw;
-static synth_killnote_t midisynth_killnote;
-static synth_startnote_t midisynth_startnote;
-static synth_setinstr_t midisynth_setinstr;
-static synth_alloc_t midisynth_alloc;
-static synth_controller_t midisynth_controller;
-static synth_bender_t midisynth_bender;
-
-static kobj_method_t midisynth_methods[] = {
- KOBJMETHOD(synth_open, midisynth_open),
- KOBJMETHOD(synth_close, midisynth_close),
- KOBJMETHOD(synth_writeraw, midisynth_writeraw),
- KOBJMETHOD(synth_setinstr, midisynth_setinstr),
- KOBJMETHOD(synth_startnote, midisynth_startnote),
- KOBJMETHOD(synth_killnote, midisynth_killnote),
- KOBJMETHOD(synth_alloc, midisynth_alloc),
- KOBJMETHOD(synth_controller, midisynth_controller),
- KOBJMETHOD(synth_bender, midisynth_bender),
- KOBJMETHOD_END
-};
-
-DEFINE_CLASS(midisynth, midisynth_methods, 0);
-
-/*
- * Module Exports & Interface
- *
- * struct midi_chan *midi_init(MPU_CLASS cls, int unit, int chan,
- * void *cookie)
- * int midi_uninit(struct snd_midi *)
- *
- * 0 == no error
- * EBUSY or other error
- *
- * int midi_in(struct snd_midi *, char *buf, int count)
- * int midi_out(struct snd_midi *, char *buf, int count)
- *
- * midi_{in,out} return actual size transfered
- *
- */
-
-/*
- * midi_devs tailq, holder of all rmidi instances protected by midistat_lock
- */
-
TAILQ_HEAD(, snd_midi) midi_devs;
-/*
- * /dev/midistat variables and declarations, protected by midistat_lock
- */
-
struct sx mstat_lock;
-static int midistat_isopen = 0;
-static struct sbuf midistat_sbuf;
-static struct cdev *midistat_dev;
-
-/*
- * /dev/midistat dev_t declarations
- */
-
-static d_open_t midistat_open;
-static d_close_t midistat_close;
-static d_read_t midistat_read;
-
-static struct cdevsw midistat_cdevsw = {
- .d_version = D_VERSION,
- .d_open = midistat_open,
- .d_close = midistat_close,
- .d_read = midistat_read,
- .d_name = "midistat",
-};
-
-/*
- * /dev/rmidi dev_t declarations, struct variable access is protected by
- * locks contained within the structure.
- */
-
static d_open_t midi_open;
static d_close_t midi_close;
static d_ioctl_t midi_ioctl;
@@ -222,41 +122,18 @@ static struct cdevsw midi_cdevsw = {
.d_name = "rmidi",
};
-/*
- * Prototypes of library functions
- */
-
static int midi_destroy(struct snd_midi *, int);
-static int midistat_prepare(struct sbuf * s);
static int midi_load(void);
static int midi_unload(void);
-/*
- * Misc declr.
- */
SYSCTL_NODE(_hw, OID_AUTO, midi, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
"Midi driver");
-static SYSCTL_NODE(_hw_midi, OID_AUTO, stat, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
- "Status device");
int midi_debug;
/* XXX: should this be moved into debug.midi? */
SYSCTL_INT(_hw_midi, OID_AUTO, debug, CTLFLAG_RW, &midi_debug, 0, "");
-int midi_dumpraw;
-SYSCTL_INT(_hw_midi, OID_AUTO, dumpraw, CTLFLAG_RW, &midi_dumpraw, 0, "");
-
-int midi_instroff;
-SYSCTL_INT(_hw_midi, OID_AUTO, instroff, CTLFLAG_RW, &midi_instroff, 0, "");
-
-int midistat_verbose;
-SYSCTL_INT(_hw_midi_stat, OID_AUTO, verbose, CTLFLAG_RW,
- &midistat_verbose, 0, "");
-
#define MIDI_DEBUG(l,a) if(midi_debug>=l) a
-/*
- * CODE START
- */
void
midistat_lock(void)
@@ -285,9 +162,6 @@ midistat_lockassert(void)
* what unit number is used.
*
* It is an error to call midi_init with an already used unit/channel combo.
- *
- * Returns NULL on error
- *
*/
struct snd_midi *
midi_init(kobj_class_t cls, int unit, int channel, void *cookie)
@@ -326,9 +200,6 @@ midi_init(kobj_class_t cls, int unit, int channel, void *cookie)
MIDI_DEBUG(1, printf("midiinit #2: unit %d/%d.\n", unit, channel));
m = malloc(sizeof(*m), M_MIDI, M_WAITOK | M_ZERO);
- m->synth = malloc(sizeof(*m->synth), M_MIDI, M_WAITOK | M_ZERO);
- kobj_init((kobj_t)m->synth, &midisynth_class);
- m->synth->m = m;
kobj_init((kobj_t)m, cls);
inqsize = MPU_INQSIZE(m, cookie);
outqsize = MPU_OUTQSIZE(m, cookie);
@@ -393,7 +264,6 @@ err2:
if (MIDIQ_BUF(m->outq))
free(MIDIQ_BUF(m->outq), M_MIDI);
err1:
- free(m->synth, M_MIDI);
free(m, M_MIDI);
err0:
midistat_unlock();
@@ -405,9 +275,7 @@ err0:
 * midi_uninit does not call MIDI_UNINIT, since this is the implementor's
 * entry point. midi_uninit, in fact, does not send any methods. A call to
 * midi_uninit is a de facto promise that you won't manipulate ch anymore
- *
*/
-
int
midi_uninit(struct snd_midi *m)
{
@@ -440,13 +308,6 @@ exit:
return err;
}
-/*
- * midi_in: process all data until the queue is full, then discards the rest.
- * Since midi_in is a state machine, data discards can cause it to get out of
- * whack. Process as much as possible. It calls, wakeup, selnotify and
- * psignal at most once.
- */
-
#ifdef notdef
static int midi_lengths[] = {2, 2, 2, 2, 1, 1, 2, 0};
@@ -460,6 +321,12 @@ static int midi_lengths[] = {2, 2, 2, 2, 1, 1, 2, 0};
#define MIDI_SYSEX_START 0xF0
#define MIDI_SYSEX_END 0xF7
+/*
+ * midi_in: process all data until the queue is full, then discard the rest.
+ * Since midi_in is a state machine, data discards can cause it to get out of
+ * whack. Process as much as possible. It calls wakeup, selnotify, and
+ * psignal at most once.
+ */
int
midi_in(struct snd_midi *m, uint8_t *buf, int size)
{
@@ -627,9 +494,6 @@ midi_out(struct snd_midi *m, uint8_t *buf, int size)
return used;
}
-/*
- * /dev/rmidi#.# device access functions
- */
int
midi_open(struct cdev *i_dev, int flags, int mode, struct thread *td)
{
@@ -934,434 +798,6 @@ midi_poll(struct cdev *i_dev, int events, struct thread *td)
}
/*
- * /dev/midistat device functions
- *
- */
-static int
-midistat_open(struct cdev *i_dev, int flags, int mode, struct thread *td)
-{
- int error;
-
- MIDI_DEBUG(1, printf("midistat_open\n"));
-
- midistat_lock();
- if (midistat_isopen) {
- midistat_unlock();
- return EBUSY;
- }
- midistat_isopen = 1;
- sbuf_new(&midistat_sbuf, NULL, 4096, SBUF_AUTOEXTEND);
- error = (midistat_prepare(&midistat_sbuf) > 0) ? 0 : ENOMEM;
- if (error)
- midistat_isopen = 0;
- midistat_unlock();
- return error;
-}
-
-static int
-midistat_close(struct cdev *i_dev, int flags, int mode, struct thread *td)
-{
- MIDI_DEBUG(1, printf("midistat_close\n"));
- midistat_lock();
- if (!midistat_isopen) {
- midistat_unlock();
- return EBADF;
- }
- sbuf_delete(&midistat_sbuf);
- midistat_isopen = 0;
- midistat_unlock();
- return 0;
-}
-
-static int
-midistat_read(struct cdev *i_dev, struct uio *uio, int flag)
-{
- long l;
- int err;
-
- MIDI_DEBUG(4, printf("midistat_read\n"));
- midistat_lock();
- if (!midistat_isopen) {
- midistat_unlock();
- return EBADF;
- }
- if (uio->uio_offset < 0 || uio->uio_offset > sbuf_len(&midistat_sbuf)) {
- midistat_unlock();
- return EINVAL;
- }
- err = 0;
- l = lmin(uio->uio_resid, sbuf_len(&midistat_sbuf) - uio->uio_offset);
- if (l > 0) {
- err = uiomove(sbuf_data(&midistat_sbuf) + uio->uio_offset, l,
- uio);
- }
- midistat_unlock();
- return err;
-}
-
-/*
- * Module library functions
- */
-
-static int
-midistat_prepare(struct sbuf *s)
-{
- struct snd_midi *m;
-
- midistat_lockassert();
-
- sbuf_printf(s, "FreeBSD Midi Driver (midi2)\n");
- if (TAILQ_EMPTY(&midi_devs)) {
- sbuf_printf(s, "No devices installed.\n");
- sbuf_finish(s);
- return sbuf_len(s);
- }
- sbuf_printf(s, "Installed devices:\n");
-
- TAILQ_FOREACH(m, &midi_devs, link) {
- mtx_lock(&m->lock);
- sbuf_printf(s, "%s [%d/%d:%s]", m->name, m->unit, m->channel,
- MPU_PROVIDER(m, m->cookie));
- sbuf_printf(s, "%s", MPU_DESCR(m, m->cookie, midistat_verbose));
- sbuf_printf(s, "\n");
- mtx_unlock(&m->lock);
- }
-
- sbuf_finish(s);
- return sbuf_len(s);
-}
-
-#ifdef notdef
-/*
- * Convert IOCTL command to string for debugging
- */
-
-static char *
-midi_cmdname(int cmd)
-{
- static struct {
- int cmd;
- char *name;
- } *tab, cmdtab_midiioctl[] = {
-#define A(x) {x, ## x}
- /*
- * Once we have some real IOCTLs define, the following will
- * be relavant.
- *
- * A(SNDCTL_MIDI_PRETIME), A(SNDCTL_MIDI_MPUMODE),
- * A(SNDCTL_MIDI_MPUCMD), A(SNDCTL_SYNTH_INFO),
- * A(SNDCTL_MIDI_INFO), A(SNDCTL_SYNTH_MEMAVL),
- * A(SNDCTL_FM_LOAD_INSTR), A(SNDCTL_FM_4OP_ENABLE),
- * A(MIOSPASSTHRU), A(MIOGPASSTHRU), A(AIONWRITE),
- * A(AIOGSIZE), A(AIOSSIZE), A(AIOGFMT), A(AIOSFMT),
- * A(AIOGMIX), A(AIOSMIX), A(AIOSTOP), A(AIOSYNC),
- * A(AIOGCAP),
- */
-#undef A
- {
- -1, "unknown"
- },
- };
-
- for (tab = cmdtab_midiioctl; tab->cmd != cmd && tab->cmd != -1; tab++);
- return tab->name;
-}
-
-#endif /* notdef */
-
-/*
- * midisynth
- */
-
-int
-midisynth_open(void *n, void *arg, int flags)
-{
- struct snd_midi *m = ((struct synth_midi *)n)->m;
- int retval;
-
- MIDI_DEBUG(1, printf("midisynth_open %s %s\n",
- flags & FREAD ? "M_RX" : "", flags & FWRITE ? "M_TX" : ""));
-
- if (m == NULL)
- return ENXIO;
-
- mtx_lock(&m->lock);
- mtx_lock(&m->qlock);
-
- retval = 0;
-
- if (flags & FREAD) {
- if (MIDIQ_SIZE(m->inq) == 0)
- retval = ENXIO;
- else if (m->flags & M_RX)
- retval = EBUSY;
- if (retval)
- goto err;
- }
- if (flags & FWRITE) {
- if (MIDIQ_SIZE(m->outq) == 0)
- retval = ENXIO;
- else if (m->flags & M_TX)
- retval = EBUSY;
- if (retval)
- goto err;
- }
- m->busy++;
-
- /*
- * TODO: Consider m->async = 0;
- */
-
- if (flags & FREAD) {
- m->flags |= M_RX | M_RXEN;
- /*
- * Only clear the inq, the outq might still have data to drain
- * from a previous session
- */
- MIDIQ_CLEAR(m->inq);
- m->rchan = 0;
- }
-
- if (flags & FWRITE) {
- m->flags |= M_TX;
- m->wchan = 0;
- }
- m->synth_flags = flags & (FREAD | FWRITE);
-
- MPU_CALLBACK(m, m->cookie, m->flags);
-
-err: mtx_unlock(&m->qlock);
- mtx_unlock(&m->lock);
- MIDI_DEBUG(2, printf("midisynth_open: return %d.\n", retval));
- return retval;
-}
-
-int
-midisynth_close(void *n)
-{
- struct snd_midi *m = ((struct synth_midi *)n)->m;
- int retval;
- int oldflags;
-
- MIDI_DEBUG(1, printf("midisynth_close %s %s\n",
- m->synth_flags & FREAD ? "M_RX" : "",
- m->synth_flags & FWRITE ? "M_TX" : ""));
-
- if (m == NULL)
- return ENXIO;
-
- mtx_lock(&m->lock);
- mtx_lock(&m->qlock);
-
- if ((m->synth_flags & FREAD && !(m->flags & M_RX)) ||
- (m->synth_flags & FWRITE && !(m->flags & M_TX))) {
- retval = ENXIO;
- goto err;
- }
- m->busy--;
-
- oldflags = m->flags;
-
- if (m->synth_flags & FREAD)
- m->flags &= ~(M_RX | M_RXEN);
- if (m->synth_flags & FWRITE)
- m->flags &= ~M_TX;
-
- if ((m->flags & (M_TXEN | M_RXEN)) != (oldflags & (M_RXEN | M_TXEN)))
- MPU_CALLBACK(m, m->cookie, m->flags);
-
- MIDI_DEBUG(1, printf("midi_close: closed, busy = %d.\n", m->busy));
-
- mtx_unlock(&m->qlock);
- mtx_unlock(&m->lock);
- retval = 0;
-err: return retval;
-}
-
-/*
- * Always blocking.
- */
-
-int
-midisynth_writeraw(void *n, uint8_t *buf, size_t len)
-{
- struct snd_midi *m = ((struct synth_midi *)n)->m;
- int retval;
- int used;
- int i;
-
- MIDI_DEBUG(4, printf("midisynth_writeraw\n"));
-
- retval = 0;
-
- if (m == NULL)
- return ENXIO;
-
- mtx_lock(&m->lock);
- mtx_lock(&m->qlock);
-
- if (!(m->flags & M_TX))
- goto err1;
-
- if (midi_dumpraw)
- printf("midi dump: ");
-
- while (len > 0) {
- while (MIDIQ_AVAIL(m->outq) == 0) {
- if (!(m->flags & M_TXEN)) {
- m->flags |= M_TXEN;
- MPU_CALLBACK(m, m->cookie, m->flags);
- }
- mtx_unlock(&m->lock);
- m->wchan = 1;
- MIDI_DEBUG(3, printf("midisynth_writeraw msleep\n"));
- retval = msleep(&m->wchan, &m->qlock,
- PCATCH | PDROP, "midi TX", 0);
- /*
- * We slept, maybe things have changed since last
- * dying check
- */
- if (retval == EINTR)
- goto err0;
-
- if (retval)
- goto err0;
- mtx_lock(&m->lock);
- mtx_lock(&m->qlock);
- m->wchan = 0;
- if (!m->busy)
- goto err1;
- }
-
- /*
- * We are certain than data can be placed on the queue
- */
-
- used = MIN(MIDIQ_AVAIL(m->outq), len);
- used = MIN(used, MIDI_WSIZE);
- MIDI_DEBUG(5,
- printf("midi_synth: resid %zu len %jd avail %jd\n",
- len, (intmax_t)MIDIQ_LEN(m->outq),
- (intmax_t)MIDIQ_AVAIL(m->outq)));
-
- if (midi_dumpraw)
- for (i = 0; i < used; i++)
- printf("%x ", buf[i]);
-
- MIDIQ_ENQ(m->outq, buf, used);
- len -= used;
-
- /*
- * Inform the bottom half that data can be written
- */
- if (!(m->flags & M_TXEN)) {
- m->flags |= M_TXEN;
- MPU_CALLBACK(m, m->cookie, m->flags);
- }
- }
- /*
- * If we Made it here then transfer is good
- */
- if (midi_dumpraw)
- printf("\n");
-
- retval = 0;
-err1: mtx_unlock(&m->qlock);
- mtx_unlock(&m->lock);
-err0: return retval;
-}
-
-static int
-midisynth_killnote(void *n, uint8_t chn, uint8_t note, uint8_t vel)
-{
- u_char c[3];
-
- if (note > 127 || chn > 15)
- return (EINVAL);
-
- if (vel > 127)
- vel = 127;
-
- if (vel == 64) {
- c[0] = 0x90 | (chn & 0x0f); /* Note on. */
- c[1] = (u_char)note;
- c[2] = 0;
- } else {
- c[0] = 0x80 | (chn & 0x0f); /* Note off. */
- c[1] = (u_char)note;
- c[2] = (u_char)vel;
- }
-
- return midisynth_writeraw(n, c, 3);
-}
-
-static int
-midisynth_setinstr(void *n, uint8_t chn, uint16_t instr)
-{
- u_char c[2];
-
- if (instr > 127 || chn > 15)
- return EINVAL;
-
- c[0] = 0xc0 | (chn & 0x0f); /* Progamme change. */
- c[1] = instr + midi_instroff;
-
- return midisynth_writeraw(n, c, 2);
-}
-
-static int
-midisynth_startnote(void *n, uint8_t chn, uint8_t note, uint8_t vel)
-{
- u_char c[3];
-
- if (note > 127 || chn > 15)
- return EINVAL;
-
- if (vel > 127)
- vel = 127;
-
- c[0] = 0x90 | (chn & 0x0f); /* Note on. */
- c[1] = (u_char)note;
- c[2] = (u_char)vel;
-
- return midisynth_writeraw(n, c, 3);
-}
-static int
-midisynth_alloc(void *n, uint8_t chan, uint8_t note)
-{
- return chan;
-}
-
-static int
-midisynth_controller(void *n, uint8_t chn, uint8_t ctrlnum, uint16_t val)
-{
- u_char c[3];
-
- if (ctrlnum > 127 || chn > 15)
- return EINVAL;
-
- c[0] = 0xb0 | (chn & 0x0f); /* Control Message. */
- c[1] = ctrlnum;
- c[2] = val;
- return midisynth_writeraw(n, c, 3);
-}
-
-static int
-midisynth_bender(void *n, uint8_t chn, uint16_t val)
-{
- u_char c[3];
-
- if (val > 16383 || chn > 15)
- return EINVAL;
-
- c[0] = 0xe0 | (chn & 0x0f); /* Pitch bend. */
- c[1] = (u_char)val & 0x7f;
- c[2] = (u_char)(val >> 7) & 0x7f;
-
- return midisynth_writeraw(n, c, 3);
-}
-
-/*
* Single point of midi destructions.
*/
static int
@@ -1381,24 +817,16 @@ midi_destroy(struct snd_midi *m, int midiuninit)
free(MIDIQ_BUF(m->outq), M_MIDI);
mtx_destroy(&m->qlock);
mtx_destroy(&m->lock);
- free(m->synth, M_MIDI);
free(m, M_MIDI);
return 0;
}
-/*
- * Load and unload functions, creates the /dev/midistat device
- */
-
static int
midi_load(void)
{
sx_init(&mstat_lock, "midistat lock");
TAILQ_INIT(&midi_devs);
- midistat_dev = make_dev(&midistat_cdevsw, MIDI_DEV_MIDICTL, UID_ROOT,
- GID_WHEEL, 0666, "midistat");
-
return 0;
}
@@ -1411,9 +839,6 @@ midi_unload(void)
MIDI_DEBUG(1, printf("midi_unload()\n"));
retval = EBUSY;
midistat_lock();
- if (midistat_isopen)
- goto exit0;
-
TAILQ_FOREACH_SAFE(m, &midi_devs, link, tmp) {
mtx_lock(&m->lock);
if (m->busy)
@@ -1421,28 +846,21 @@ midi_unload(void)
else
retval = midi_destroy(m, 1);
if (retval)
- goto exit1;
+ goto exit;
}
midistat_unlock();
- destroy_dev(midistat_dev);
- /*
- * Made it here then unload is complete
- */
sx_destroy(&mstat_lock);
return 0;
-exit1:
+exit:
mtx_unlock(&m->lock);
-exit0:
midistat_unlock();
if (retval)
MIDI_DEBUG(2, printf("midi_unload: failed\n"));
return retval;
}
-extern int seq_modevent(module_t mod, int type, void *data);
-
static int
midi_modevent(module_t mod, int type, void *data)
{
@@ -1453,14 +871,10 @@ midi_modevent(module_t mod, int type, void *data)
switch (type) {
case MOD_LOAD:
retval = midi_load();
- if (retval == 0)
- retval = seq_modevent(mod, type, data);
break;
case MOD_UNLOAD:
retval = midi_unload();
- if (retval == 0)
- retval = seq_modevent(mod, type, data);
break;
default:
@@ -1470,73 +884,5 @@ midi_modevent(module_t mod, int type, void *data)
return retval;
}
-kobj_t
-midimapper_addseq(void *arg1, int *unit, void **cookie)
-{
- unit = NULL;
-
- return (kobj_t)arg1;
-}
-
-int
-midimapper_open_locked(void *arg1, void **cookie)
-{
- int retval = 0;
- struct snd_midi *m;
-
- midistat_lockassert();
- TAILQ_FOREACH(m, &midi_devs, link) {
- retval++;
- }
-
- return retval;
-}
-
-int
-midimapper_open(void *arg1, void **cookie)
-{
- int retval;
-
- midistat_lock();
- retval = midimapper_open_locked(arg1, cookie);
- midistat_unlock();
-
- return retval;
-}
-
-int
-midimapper_close(void *arg1, void *cookie)
-{
- return 0;
-}
-
-kobj_t
-midimapper_fetch_synth_locked(void *arg, void *cookie, int unit)
-{
- struct snd_midi *m;
- int retval = 0;
-
- midistat_lockassert();
- TAILQ_FOREACH(m, &midi_devs, link) {
- if (unit == retval)
- return (kobj_t)m->synth;
- retval++;
- }
-
- return NULL;
-}
-
-kobj_t
-midimapper_fetch_synth(void *arg, void *cookie, int unit)
-{
- kobj_t synth;
-
- midistat_lock();
- synth = midimapper_fetch_synth_locked(arg, cookie, unit);
- midistat_unlock();
-
- return synth;
-}
-
DEV_MODULE(midi, midi_modevent, NULL);
MODULE_VERSION(midi, 1);
diff --git a/sys/dev/sound/midi/midi.h b/sys/dev/sound/midi/midi.h
index 2254fab690e9..286e84264ef3 100644
--- a/sys/dev/sound/midi/midi.h
+++ b/sys/dev/sound/midi/midi.h
@@ -51,11 +51,4 @@ int midi_uninit(struct snd_midi *_m);
int midi_out(struct snd_midi *_m, uint8_t *_buf, int _size);
int midi_in(struct snd_midi *_m, uint8_t *_buf, int _size);
-kobj_t midimapper_addseq(void *arg1, int *unit, void **cookie);
-int midimapper_open_locked(void *arg1, void **cookie);
-int midimapper_open(void *arg1, void **cookie);
-int midimapper_close(void *arg1, void *cookie);
-kobj_t midimapper_fetch_synth_locked(void *arg, void *cookie, int unit);
-kobj_t midimapper_fetch_synth(void *arg, void *cookie, int unit);
-
#endif
diff --git a/sys/dev/sound/midi/mpu401.c b/sys/dev/sound/midi/mpu401.c
index 2be285bc0040..224ebb1b01f4 100644
--- a/sys/dev/sound/midi/mpu401.c
+++ b/sys/dev/sound/midi/mpu401.c
@@ -88,8 +88,6 @@ static int mpu401_minqsize(struct snd_midi *, void *);
static int mpu401_moutqsize(struct snd_midi *, void *);
static void mpu401_mcallback(struct snd_midi *, void *, int);
static void mpu401_mcallbackp(struct snd_midi *, void *, int);
-static const char *mpu401_mdescr(struct snd_midi *, void *, int);
-static const char *mpu401_mprovider(struct snd_midi *, void *);
static kobj_method_t mpu401_methods[] = {
KOBJMETHOD(mpu_init, mpu401_minit),
@@ -98,8 +96,6 @@ static kobj_method_t mpu401_methods[] = {
KOBJMETHOD(mpu_outqsize, mpu401_moutqsize),
KOBJMETHOD(mpu_callback, mpu401_mcallback),
KOBJMETHOD(mpu_callbackp, mpu401_mcallbackp),
- KOBJMETHOD(mpu_descr, mpu401_mdescr),
- KOBJMETHOD(mpu_provider, mpu401_mprovider),
KOBJMETHOD_END
};
@@ -122,24 +118,12 @@ mpu401_intr(struct mpu401 *m)
int i;
int s;
-/*
- printf("mpu401_intr\n");
-*/
#define RXRDY(m) ( (STATUS(m) & MPU_INPUTBUSY) == 0)
#define TXRDY(m) ( (STATUS(m) & MPU_OUTPUTBUSY) == 0)
-#if 0
-#define D(x,l) printf("mpu401_intr %d %x %s %s\n",l, x, x&MPU_INPUTBUSY?"RX":"", x&MPU_OUTPUTBUSY?"TX":"")
-#else
-#define D(x,l)
-#endif
i = 0;
s = STATUS(m);
- D(s, 1);
while ((s & MPU_INPUTBUSY) == 0 && i < MPU_INTR_BUF) {
b[i] = READ(m);
-/*
- printf("mpu401_intr in i %d d %d\n", i, b[i]);
-*/
i++;
s = STATUS(m);
}
@@ -148,15 +132,9 @@ mpu401_intr(struct mpu401 *m)
i = 0;
while (!(s & MPU_OUTPUTBUSY) && i < MPU_INTR_BUF) {
if (midi_out(m->mid, b, 1)) {
-/*
- printf("mpu401_intr out i %d d %d\n", i, b[0]);
-*/
WRITE(m, *b);
} else {
-/*
- printf("mpu401_intr write: no output\n");
-*/
return 0;
}
i++;
@@ -262,13 +240,7 @@ static void
mpu401_mcallback(struct snd_midi *sm, void *arg, int flags)
{
struct mpu401 *m = arg;
-#if 0
- printf("mpu401_callback %s %s %s %s\n",
- flags & M_RX ? "M_RX" : "",
- flags & M_TX ? "M_TX" : "",
- flags & M_RXEN ? "M_RXEN" : "",
- flags & M_TXEN ? "M_TXEN" : "");
-#endif
+
if (flags & M_TXEN && m->si) {
callout_reset(&m->timer, 1, mpu401_timeout, m);
}
@@ -278,19 +250,5 @@ mpu401_mcallback(struct snd_midi *sm, void *arg, int flags)
static void
mpu401_mcallbackp(struct snd_midi *sm, void *arg, int flags)
{
-/* printf("mpu401_callbackp\n"); */
mpu401_mcallback(sm, arg, flags);
}
-
-static const char *
-mpu401_mdescr(struct snd_midi *sm, void *arg, int verbosity)
-{
-
- return "descr mpu401";
-}
-
-static const char *
-mpu401_mprovider(struct snd_midi *m, void *arg)
-{
- return "provider mpu401";
-}
diff --git a/sys/dev/sound/midi/mpu_if.m b/sys/dev/sound/midi/mpu_if.m
index b7cb586c5dd0..835d887f703a 100644
--- a/sys/dev/sound/midi/mpu_if.m
+++ b/sys/dev/sound/midi/mpu_if.m
@@ -56,17 +56,6 @@ METHOD void callback {
int _flags;
};
-METHOD const char * provider {
- struct snd_midi *_kobj;
- void *_cookie;
-};
-
-METHOD const char * descr {
- struct snd_midi *_kobj;
- void *_cookie;
- int _verbosity;
-};
-
METHOD int uninit {
struct snd_midi *_kobj;
void *_cookie;
diff --git a/sys/dev/sound/midi/sequencer.c b/sys/dev/sound/midi/sequencer.c
deleted file mode 100644
index 03b71688175c..000000000000
--- a/sys/dev/sound/midi/sequencer.c
+++ /dev/null
@@ -1,2107 +0,0 @@
-/*-
- * SPDX-License-Identifier: BSD-2-Clause
- *
- * Copyright (c) 2003 Mathew Kanner
- * Copyright (c) 1993 Hannu Savolainen
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-/*
- * The sequencer personality manager.
- */
-
-#include <sys/param.h>
-#include <sys/systm.h>
-#include <sys/ioccom.h>
-
-#include <sys/filio.h>
-#include <sys/lock.h>
-#include <sys/sockio.h>
-#include <sys/fcntl.h>
-#include <sys/proc.h>
-#include <sys/sysctl.h>
-
-#include <sys/kernel.h> /* for DATA_SET */
-
-#include <sys/module.h>
-#include <sys/conf.h>
-#include <sys/file.h>
-#include <sys/uio.h>
-#include <sys/syslog.h>
-#include <sys/errno.h>
-#include <sys/malloc.h>
-#include <sys/bus.h>
-#include <machine/resource.h>
-#include <machine/bus.h>
-#include <machine/clock.h> /* for DELAY */
-#include <sys/soundcard.h>
-#include <sys/rman.h>
-#include <sys/mman.h>
-#include <sys/poll.h>
-#include <sys/mutex.h>
-#include <sys/condvar.h>
-#include <sys/kthread.h>
-#include <sys/unistd.h>
-#include <sys/selinfo.h>
-#include <sys/sx.h>
-
-#ifdef HAVE_KERNEL_OPTION_HEADERS
-#include "opt_snd.h"
-#endif
-
-#include <dev/sound/midi/midi.h>
-#include <dev/sound/midi/midiq.h>
-#include "synth_if.h"
-
-#include <dev/sound/midi/sequencer.h>
-
-#define TMR_TIMERBASE 13
-
-#define SND_DEV_SEQ 1 /* Sequencer output /dev/sequencer (FM
- * synthesizer and MIDI output) */
-#define SND_DEV_MUSIC 8 /* /dev/music, level 2 interface */
-
-/* Length of a sequencer event. */
-#define EV_SZ 8
-#define IEV_SZ 8
-
-/* Lookup modes */
-#define LOOKUP_EXIST (0)
-#define LOOKUP_OPEN (1)
-#define LOOKUP_CLOSE (2)
-
-#define MIDIDEV(y) (dev2unit(y) & 0x0f)
-
-/* These are the entries to the sequencer driver. */
-static d_open_t mseq_open;
-static d_close_t mseq_close;
-static d_ioctl_t mseq_ioctl;
-static d_read_t mseq_read;
-static d_write_t mseq_write;
-static d_poll_t mseq_poll;
-
-static struct cdevsw seq_cdevsw = {
- .d_version = D_VERSION,
- .d_open = mseq_open,
- .d_close = mseq_close,
- .d_read = mseq_read,
- .d_write = mseq_write,
- .d_ioctl = mseq_ioctl,
- .d_poll = mseq_poll,
- .d_name = "sequencer",
-};
-
-struct seq_softc {
- KOBJ_FIELDS;
-
- struct mtx seq_lock, q_lock;
- struct cv empty_cv, reset_cv, in_cv, out_cv, state_cv, th_cv;
-
- MIDIQ_HEAD(, u_char) in_q, out_q;
-
- u_long flags;
- /* Flags (protected by flag_mtx of mididev_info) */
- int fflags; /* Access mode */
- int music;
-
- int out_water; /* Sequence output threshould */
- snd_sync_parm sync_parm; /* AIOSYNC parameter set */
- struct thread *sync_thread; /* AIOSYNCing thread */
- struct selinfo in_sel, out_sel;
- int midi_number;
- struct cdev *seqdev, *musicdev;
- int unit;
- int maxunits;
- kobj_t *midis;
- int *midi_flags;
- kobj_t mapper;
- void *mapper_cookie;
- struct timeval timerstop, timersub;
- int timerbase, tempo;
- int timerrun;
- int done;
- int playing;
- int recording;
- int busy;
- int pre_event_timeout;
- int waiting;
-};
-
-/*
- * Module specific stuff, including how many sequecers
- * we currently own.
- */
-
-SYSCTL_NODE(_hw_midi, OID_AUTO, seq, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
- "Midi sequencer");
-
-int seq_debug;
-/* XXX: should this be moved into debug.midi? */
-SYSCTL_INT(_hw_midi_seq, OID_AUTO, debug, CTLFLAG_RW, &seq_debug, 0, "");
-
-midi_cmdtab cmdtab_seqevent[] = {
- {SEQ_NOTEOFF, "SEQ_NOTEOFF"},
- {SEQ_NOTEON, "SEQ_NOTEON"},
- {SEQ_WAIT, "SEQ_WAIT"},
- {SEQ_PGMCHANGE, "SEQ_PGMCHANGE"},
- {SEQ_SYNCTIMER, "SEQ_SYNCTIMER"},
- {SEQ_MIDIPUTC, "SEQ_MIDIPUTC"},
- {SEQ_DRUMON, "SEQ_DRUMON"},
- {SEQ_DRUMOFF, "SEQ_DRUMOFF"},
- {SEQ_ECHO, "SEQ_ECHO"},
- {SEQ_AFTERTOUCH, "SEQ_AFTERTOUCH"},
- {SEQ_CONTROLLER, "SEQ_CONTROLLER"},
- {SEQ_BALANCE, "SEQ_BALANCE"},
- {SEQ_VOLMODE, "SEQ_VOLMODE"},
- {SEQ_FULLSIZE, "SEQ_FULLSIZE"},
- {SEQ_PRIVATE, "SEQ_PRIVATE"},
- {SEQ_EXTENDED, "SEQ_EXTENDED"},
- {EV_SEQ_LOCAL, "EV_SEQ_LOCAL"},
- {EV_TIMING, "EV_TIMING"},
- {EV_CHN_COMMON, "EV_CHN_COMMON"},
- {EV_CHN_VOICE, "EV_CHN_VOICE"},
- {EV_SYSEX, "EV_SYSEX"},
- {-1, NULL},
-};
-
-midi_cmdtab cmdtab_seqioctl[] = {
- {SNDCTL_SEQ_RESET, "SNDCTL_SEQ_RESET"},
- {SNDCTL_SEQ_SYNC, "SNDCTL_SEQ_SYNC"},
- {SNDCTL_SYNTH_INFO, "SNDCTL_SYNTH_INFO"},
- {SNDCTL_SEQ_CTRLRATE, "SNDCTL_SEQ_CTRLRATE"},
- {SNDCTL_SEQ_GETOUTCOUNT, "SNDCTL_SEQ_GETOUTCOUNT"},
- {SNDCTL_SEQ_GETINCOUNT, "SNDCTL_SEQ_GETINCOUNT"},
- {SNDCTL_SEQ_PERCMODE, "SNDCTL_SEQ_PERCMODE"},
- {SNDCTL_FM_LOAD_INSTR, "SNDCTL_FM_LOAD_INSTR"},
- {SNDCTL_SEQ_TESTMIDI, "SNDCTL_SEQ_TESTMIDI"},
- {SNDCTL_SEQ_RESETSAMPLES, "SNDCTL_SEQ_RESETSAMPLES"},
- {SNDCTL_SEQ_NRSYNTHS, "SNDCTL_SEQ_NRSYNTHS"},
- {SNDCTL_SEQ_NRMIDIS, "SNDCTL_SEQ_NRMIDIS"},
- {SNDCTL_SEQ_GETTIME, "SNDCTL_SEQ_GETTIME"},
- {SNDCTL_MIDI_INFO, "SNDCTL_MIDI_INFO"},
- {SNDCTL_SEQ_THRESHOLD, "SNDCTL_SEQ_THRESHOLD"},
- {SNDCTL_SYNTH_MEMAVL, "SNDCTL_SYNTH_MEMAVL"},
- {SNDCTL_FM_4OP_ENABLE, "SNDCTL_FM_4OP_ENABLE"},
- {SNDCTL_PMGR_ACCESS, "SNDCTL_PMGR_ACCESS"},
- {SNDCTL_SEQ_PANIC, "SNDCTL_SEQ_PANIC"},
- {SNDCTL_SEQ_OUTOFBAND, "SNDCTL_SEQ_OUTOFBAND"},
- {SNDCTL_TMR_TIMEBASE, "SNDCTL_TMR_TIMEBASE"},
- {SNDCTL_TMR_START, "SNDCTL_TMR_START"},
- {SNDCTL_TMR_STOP, "SNDCTL_TMR_STOP"},
- {SNDCTL_TMR_CONTINUE, "SNDCTL_TMR_CONTINUE"},
- {SNDCTL_TMR_TEMPO, "SNDCTL_TMR_TEMPO"},
- {SNDCTL_TMR_SOURCE, "SNDCTL_TMR_SOURCE"},
- {SNDCTL_TMR_METRONOME, "SNDCTL_TMR_METRONOME"},
- {SNDCTL_TMR_SELECT, "SNDCTL_TMR_SELECT"},
- {SNDCTL_MIDI_PRETIME, "SNDCTL_MIDI_PRETIME"},
- {AIONWRITE, "AIONWRITE"},
- {AIOGSIZE, "AIOGSIZE"},
- {AIOSSIZE, "AIOSSIZE"},
- {AIOGFMT, "AIOGFMT"},
- {AIOSFMT, "AIOSFMT"},
- {AIOGMIX, "AIOGMIX"},
- {AIOSMIX, "AIOSMIX"},
- {AIOSTOP, "AIOSTOP"},
- {AIOSYNC, "AIOSYNC"},
- {AIOGCAP, "AIOGCAP"},
- {-1, NULL},
-};
-
-midi_cmdtab cmdtab_timer[] = {
- {TMR_WAIT_REL, "TMR_WAIT_REL"},
- {TMR_WAIT_ABS, "TMR_WAIT_ABS"},
- {TMR_STOP, "TMR_STOP"},
- {TMR_START, "TMR_START"},
- {TMR_CONTINUE, "TMR_CONTINUE"},
- {TMR_TEMPO, "TMR_TEMPO"},
- {TMR_ECHO, "TMR_ECHO"},
- {TMR_CLOCK, "TMR_CLOCK"},
- {TMR_SPP, "TMR_SPP"},
- {TMR_TIMESIG, "TMR_TIMESIG"},
- {-1, NULL},
-};
-
-midi_cmdtab cmdtab_seqcv[] = {
- {MIDI_NOTEOFF, "MIDI_NOTEOFF"},
- {MIDI_NOTEON, "MIDI_NOTEON"},
- {MIDI_KEY_PRESSURE, "MIDI_KEY_PRESSURE"},
- {-1, NULL},
-};
-
-midi_cmdtab cmdtab_seqccmn[] = {
- {MIDI_CTL_CHANGE, "MIDI_CTL_CHANGE"},
- {MIDI_PGM_CHANGE, "MIDI_PGM_CHANGE"},
- {MIDI_CHN_PRESSURE, "MIDI_CHN_PRESSURE"},
- {MIDI_PITCH_BEND, "MIDI_PITCH_BEND"},
- {MIDI_SYSTEM_PREFIX, "MIDI_SYSTEM_PREFIX"},
- {-1, NULL},
-};
-
-#ifndef KOBJMETHOD_END
-#define KOBJMETHOD_END { NULL, NULL }
-#endif
-
-/*
- * static const char *mpu401_mprovider(kobj_t obj, struct mpu401 *m);
- */
-
-static kobj_method_t seq_methods[] = {
- /* KOBJMETHOD(mpu_provider,mpu401_mprovider), */
- KOBJMETHOD_END
-};
-
-DEFINE_CLASS(sequencer, seq_methods, 0);
-
-/* The followings are the local function. */
-static int seq_convertold(u_char *event, u_char *out);
-
-/*
- * static void seq_midiinput(struct seq_softc * scp, void *md);
- */
-static void seq_reset(struct seq_softc *scp);
-static int seq_sync(struct seq_softc *scp);
-
-static int seq_processevent(struct seq_softc *scp, u_char *event);
-
-static int seq_timing(struct seq_softc *scp, u_char *event);
-static int seq_local(struct seq_softc *scp, u_char *event);
-
-static int seq_chnvoice(struct seq_softc *scp, kobj_t md, u_char *event);
-static int seq_chncommon(struct seq_softc *scp, kobj_t md, u_char *event);
-static int seq_sysex(struct seq_softc *scp, kobj_t md, u_char *event);
-
-static int seq_fetch_mid(struct seq_softc *scp, int unit, kobj_t *md);
-void seq_copytoinput(struct seq_softc *scp, u_char *event, int len);
-int seq_modevent(module_t mod, int type, void *data);
-struct seq_softc *seqs[10];
-static struct mtx seqinfo_mtx;
-static u_long nseq = 0;
-
-static void timer_start(struct seq_softc *t);
-static void timer_stop(struct seq_softc *t);
-static void timer_setvals(struct seq_softc *t, int tempo, int timerbase);
-static void timer_wait(struct seq_softc *t, int ticks, int wait_abs);
-static int timer_now(struct seq_softc *t);
-
-static void
-timer_start(struct seq_softc *t)
-{
- t->timerrun = 1;
- getmicrotime(&t->timersub);
-}
-
-static void
-timer_continue(struct seq_softc *t)
-{
- struct timeval now;
-
- if (t->timerrun == 1)
- return;
- t->timerrun = 1;
- getmicrotime(&now);
- timevalsub(&now, &t->timerstop);
- timevaladd(&t->timersub, &now);
-}
-
-static void
-timer_stop(struct seq_softc *t)
-{
- t->timerrun = 0;
- getmicrotime(&t->timerstop);
-}
-
-static void
-timer_setvals(struct seq_softc *t, int tempo, int timerbase)
-{
- t->tempo = tempo;
- t->timerbase = timerbase;
-}
-
-static void
-timer_wait(struct seq_softc *t, int ticks, int wait_abs)
-{
- struct timeval now, when;
- int ret;
- unsigned long long i;
-
- while (t->timerrun == 0) {
- SEQ_DEBUG(2, printf("Timer wait when timer isn't running\n"));
- /*
- * The old sequencer used timeouts that only increased
- * the timer when the timer was running.
-		 * Hence the sequencer would stall if the
-		 * timer was disabled.
- */
- cv_wait(&t->reset_cv, &t->seq_lock);
- if (t->playing == 0)
- return;
- }
-
- i = ticks * 60ull * 1000000ull / (t->tempo * t->timerbase);
-
- when.tv_sec = i / 1000000;
- when.tv_usec = i % 1000000;
-
-#if 0
- printf("timer_wait tempo %d timerbase %d ticks %d abs %d u_sec %llu\n",
- t->tempo, t->timerbase, ticks, wait_abs, i);
-#endif
-
- if (wait_abs != 0) {
- getmicrotime(&now);
- timevalsub(&now, &t->timersub);
- timevalsub(&when, &now);
- }
- if (when.tv_sec < 0 || when.tv_usec < 0) {
- SEQ_DEBUG(3,
-		    printf("seq_timer error negative time %ld.%06lds\n",
- (long)when.tv_sec, (long)when.tv_usec));
- return;
- }
- i = when.tv_sec * 1000000ull;
- i += when.tv_usec;
- i *= hz;
- i /= 1000000ull;
-#if 0
- printf("seq_timer usec %llu ticks %llu\n",
- when.tv_sec * 1000000ull + when.tv_usec, i);
-#endif
- t->waiting = 1;
- ret = cv_timedwait(&t->reset_cv, &t->seq_lock, i + 1);
- t->waiting = 0;
-
- if (ret != EWOULDBLOCK)
- SEQ_DEBUG(3, printf("seq_timer didn't timeout\n"));
-
-}
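
The arithmetic above is the heart of the sequencer clock: with tempo in beats per
minute and timerbase in ticks per beat, one tick lasts 60 * 10^6 / (tempo * timerbase)
microseconds. Note that timer_now() below converts the other way but leaves its tempo
divide commented out, so the two directions agree only at the default tempo of 60. A
minimal standalone sketch of the forward conversion (the helper name is ours, not the
driver's):

    #include <stdio.h>

    /* Microseconds spanned by `ticks` sequencer ticks at the given
     * tempo (beats per minute) and timerbase (ticks per beat). */
    static unsigned long long
    ticks_to_us(unsigned long long ticks, int tempo, int timerbase)
    {
        return (ticks * 60ULL * 1000000ULL /
            ((unsigned long long)tempo * timerbase));
    }

    int
    main(void)
    {
        /* mseq_open() defaults: tempo 60, timerbase 100. */
        printf("%llu\n", ticks_to_us(1, 60, 100));      /* 10000 us */
        printf("%llu\n", ticks_to_us(100, 60, 100));    /* 1000000 us */
        return (0);
    }
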
-
-static int
-timer_now(struct seq_softc *t)
-{
- struct timeval now;
- unsigned long long i;
- int ret;
-
- if (t->timerrun == 0)
- now = t->timerstop;
- else
- getmicrotime(&now);
-
- timevalsub(&now, &t->timersub);
-
- i = now.tv_sec * 1000000ull;
- i += now.tv_usec;
- i *= t->timerbase;
-/* i /= t->tempo; */
- i /= 1000000ull;
-
- ret = i;
- /*
- * printf("timer_now: %llu %d\n", i, ret);
- */
-
- return ret;
-}
-
-static void
-seq_eventthread(void *arg)
-{
- struct seq_softc *scp = arg;
- u_char event[EV_SZ];
-
- mtx_lock(&scp->seq_lock);
- SEQ_DEBUG(2, printf("seq_eventthread started\n"));
- while (scp->done == 0) {
-restart:
- while (scp->playing == 0) {
- cv_wait(&scp->state_cv, &scp->seq_lock);
- if (scp->done)
- goto done;
- }
-
- while (MIDIQ_EMPTY(scp->out_q)) {
- cv_broadcast(&scp->empty_cv);
- cv_wait(&scp->out_cv, &scp->seq_lock);
- if (scp->playing == 0)
- goto restart;
- if (scp->done)
- goto done;
- }
-
- MIDIQ_DEQ(scp->out_q, event, EV_SZ);
-
- if (MIDIQ_AVAIL(scp->out_q) < scp->out_water) {
- cv_broadcast(&scp->out_cv);
- selwakeup(&scp->out_sel);
- }
- seq_processevent(scp, event);
- }
-
-done:
- cv_broadcast(&scp->th_cv);
- mtx_unlock(&scp->seq_lock);
- SEQ_DEBUG(2, printf("seq_eventthread finished\n"));
- kproc_exit(0);
-}
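
Structurally, seq_eventthread() is a bounded-buffer consumer: it sleeps while there is
nothing to play, dequeues one event under the lock, and wakes writers and pollers once
free space in out_q rises past the out_water watermark. A minimal userspace analog
using pthreads (every name below is ours, purely illustrative):

    #include <pthread.h>
    #include <stdio.h>

    #define QSIZE 8
    #define WATER (QSIZE / 2)               /* cf. scp->out_water */

    static int q[QSIZE], head, tail, count, done;
    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t nonempty = PTHREAD_COND_INITIALIZER;
    static pthread_cond_t roomy = PTHREAD_COND_INITIALIZER;

    static void *
    consumer(void *arg)
    {
        pthread_mutex_lock(&lock);
        for (;;) {
            while (count == 0 && !done)
                pthread_cond_wait(&nonempty, &lock);
            if (count == 0)                 /* done and drained */
                break;
            int ev = q[head];
            head = (head + 1) % QSIZE;
            count--;
            if (QSIZE - count >= WATER)     /* free space over watermark */
                pthread_cond_broadcast(&roomy);
            pthread_mutex_unlock(&lock);
            printf("event %d\n", ev);       /* seq_processevent() stand-in */
            pthread_mutex_lock(&lock);
        }
        pthread_mutex_unlock(&lock);
        return (NULL);
    }

    int
    main(void)
    {
        pthread_t th;

        pthread_create(&th, NULL, consumer, NULL);
        pthread_mutex_lock(&lock);
        for (int i = 0; i < 20; i++) {
            while (count == QSIZE)          /* writer side of the dance */
                pthread_cond_wait(&roomy, &lock);
            q[tail] = i;
            tail = (tail + 1) % QSIZE;
            count++;
            pthread_cond_signal(&nonempty);
        }
        done = 1;
        pthread_cond_broadcast(&nonempty);
        pthread_mutex_unlock(&lock);
        pthread_join(th, NULL);
        return (0);
    }
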
-
-/*
- * seq_processevent: This may be called by the event thread or the ioctl
- * handler, for queued and out-of-band events respectively.
- */
-static int
-seq_processevent(struct seq_softc *scp, u_char *event)
-{
- int ret;
- kobj_t m;
-
- ret = 0;
-
- if (event[0] == EV_SEQ_LOCAL)
- ret = seq_local(scp, event);
- else if (event[0] == EV_TIMING)
- ret = seq_timing(scp, event);
- else if (event[0] != EV_CHN_VOICE &&
- event[0] != EV_CHN_COMMON &&
- event[0] != EV_SYSEX &&
- event[0] != SEQ_MIDIPUTC) {
- ret = 1;
-		SEQ_DEBUG(2, printf("seq_processevent unknown event %d\n",
-		    event[0]));
- } else if (seq_fetch_mid(scp, event[1], &m) != 0) {
- ret = 1;
- SEQ_DEBUG(2, printf("seq_processevent midi unit not found %d\n",
- event[1]));
- } else
- switch (event[0]) {
- case EV_CHN_VOICE:
- ret = seq_chnvoice(scp, m, event);
- break;
- case EV_CHN_COMMON:
- ret = seq_chncommon(scp, m, event);
- break;
- case EV_SYSEX:
- ret = seq_sysex(scp, m, event);
- break;
- case SEQ_MIDIPUTC:
- mtx_unlock(&scp->seq_lock);
- ret = SYNTH_WRITERAW(m, &event[2], 1);
- mtx_lock(&scp->seq_lock);
- break;
- }
- return ret;
-}
-
-static int
-seq_addunit(void)
-{
- struct seq_softc *scp;
- int ret;
- u_char *buf;
-
- gone_in(15, "Warning! MIDI sequencer to be removed soon: no longer "
- "needed or used\n");
-
- /* Allocate the softc. */
- ret = ENOMEM;
- scp = malloc(sizeof(*scp), M_DEVBUF, M_NOWAIT | M_ZERO);
- if (scp == NULL) {
- SEQ_DEBUG(1, printf("seq_addunit: softc allocation failed.\n"));
- goto err;
- }
- kobj_init((kobj_t)scp, &sequencer_class);
-
- buf = malloc(sizeof(*buf) * EV_SZ * 1024, M_TEMP, M_NOWAIT | M_ZERO);
- if (buf == NULL)
- goto err;
- MIDIQ_INIT(scp->in_q, buf, EV_SZ * 1024);
- buf = malloc(sizeof(*buf) * EV_SZ * 1024, M_TEMP, M_NOWAIT | M_ZERO);
- if (buf == NULL)
- goto err;
- MIDIQ_INIT(scp->out_q, buf, EV_SZ * 1024);
- ret = EINVAL;
-
- scp->midis = malloc(sizeof(kobj_t) * 32, M_TEMP, M_NOWAIT | M_ZERO);
- scp->midi_flags = malloc(sizeof(*scp->midi_flags) * 32, M_TEMP,
- M_NOWAIT | M_ZERO);
-
- if (scp->midis == NULL || scp->midi_flags == NULL)
- goto err;
-
- scp->flags = 0;
-
- mtx_init(&scp->seq_lock, "seqflq", NULL, 0);
- cv_init(&scp->state_cv, "seqstate");
- cv_init(&scp->empty_cv, "seqempty");
- cv_init(&scp->reset_cv, "seqtimer");
- cv_init(&scp->out_cv, "seqqout");
- cv_init(&scp->in_cv, "seqqin");
- cv_init(&scp->th_cv, "seqstart");
-
- /*
- * Init the damn timer
- */
-
- scp->mapper = midimapper_addseq(scp, &scp->unit, &scp->mapper_cookie);
- if (scp->mapper == NULL)
- goto err;
-
- scp->seqdev = make_dev(&seq_cdevsw, SND_DEV_SEQ, UID_ROOT, GID_WHEEL,
- 0666, "sequencer%d", scp->unit);
-
- scp->musicdev = make_dev(&seq_cdevsw, SND_DEV_MUSIC, UID_ROOT,
- GID_WHEEL, 0666, "music%d", scp->unit);
-
- if (scp->seqdev == NULL || scp->musicdev == NULL)
- goto err;
- /*
- * TODO: Add to list of sequencers this module provides
- */
-
-	ret = kproc_create(seq_eventthread, scp, NULL, RFHIGHPID, 0,
-	    "sequencer %02d", scp->unit);
-
- if (ret)
- goto err;
-
- scp->seqdev->si_drv1 = scp->musicdev->si_drv1 = scp;
-
- SEQ_DEBUG(2, printf("sequencer %d created scp %p\n", scp->unit, scp));
-
- ret = 0;
-
- mtx_lock(&seqinfo_mtx);
- seqs[nseq++] = scp;
- mtx_unlock(&seqinfo_mtx);
-
- goto ok;
-
-err:
- if (scp != NULL) {
- if (scp->seqdev != NULL)
- destroy_dev(scp->seqdev);
- if (scp->musicdev != NULL)
- destroy_dev(scp->musicdev);
- /*
- * TODO: Destroy mutex and cv
- */
- if (scp->midis != NULL)
- free(scp->midis, M_TEMP);
- if (scp->midi_flags != NULL)
- free(scp->midi_flags, M_TEMP);
- if (scp->out_q.b)
- free(scp->out_q.b, M_TEMP);
- if (scp->in_q.b)
- free(scp->in_q.b, M_TEMP);
- free(scp, M_DEVBUF);
- }
-ok:
- return ret;
-}
-
-static int
-seq_delunit(int unit)
-{
- struct seq_softc *scp = seqs[unit];
- int i;
-
- //SEQ_DEBUG(4, printf("seq_delunit: %d\n", unit));
- SEQ_DEBUG(1, printf("seq_delunit: 1 \n"));
- mtx_lock(&scp->seq_lock);
-
- scp->playing = 0;
- scp->done = 1;
- cv_broadcast(&scp->out_cv);
- cv_broadcast(&scp->state_cv);
- cv_broadcast(&scp->reset_cv);
- SEQ_DEBUG(1, printf("seq_delunit: 2 \n"));
- cv_wait(&scp->th_cv, &scp->seq_lock);
- SEQ_DEBUG(1, printf("seq_delunit: 3.0 \n"));
- mtx_unlock(&scp->seq_lock);
- SEQ_DEBUG(1, printf("seq_delunit: 3.1 \n"));
-
- cv_destroy(&scp->state_cv);
- SEQ_DEBUG(1, printf("seq_delunit: 4 \n"));
- cv_destroy(&scp->empty_cv);
- SEQ_DEBUG(1, printf("seq_delunit: 5 \n"));
- cv_destroy(&scp->reset_cv);
- SEQ_DEBUG(1, printf("seq_delunit: 6 \n"));
- cv_destroy(&scp->out_cv);
- SEQ_DEBUG(1, printf("seq_delunit: 7 \n"));
- cv_destroy(&scp->in_cv);
- SEQ_DEBUG(1, printf("seq_delunit: 8 \n"));
- cv_destroy(&scp->th_cv);
-
- SEQ_DEBUG(1, printf("seq_delunit: 10 \n"));
- if (scp->seqdev)
- destroy_dev(scp->seqdev);
- SEQ_DEBUG(1, printf("seq_delunit: 11 \n"));
- if (scp->musicdev)
- destroy_dev(scp->musicdev);
- SEQ_DEBUG(1, printf("seq_delunit: 12 \n"));
- scp->seqdev = scp->musicdev = NULL;
- if (scp->midis != NULL)
- free(scp->midis, M_TEMP);
- SEQ_DEBUG(1, printf("seq_delunit: 13 \n"));
- if (scp->midi_flags != NULL)
- free(scp->midi_flags, M_TEMP);
- SEQ_DEBUG(1, printf("seq_delunit: 14 \n"));
- free(scp->out_q.b, M_TEMP);
- SEQ_DEBUG(1, printf("seq_delunit: 15 \n"));
- free(scp->in_q.b, M_TEMP);
-
- SEQ_DEBUG(1, printf("seq_delunit: 16 \n"));
-
- mtx_destroy(&scp->seq_lock);
- SEQ_DEBUG(1, printf("seq_delunit: 17 \n"));
- free(scp, M_DEVBUF);
-
- mtx_lock(&seqinfo_mtx);
- for (i = unit; i < (nseq - 1); i++)
- seqs[i] = seqs[i + 1];
- nseq--;
- mtx_unlock(&seqinfo_mtx);
-
- return 0;
-}
-
-int
-seq_modevent(module_t mod, int type, void *data)
-{
- int retval, r;
-
- retval = 0;
-
- switch (type) {
- case MOD_LOAD:
- mtx_init(&seqinfo_mtx, "seqmod", NULL, 0);
- retval = seq_addunit();
- break;
-
- case MOD_UNLOAD:
- while (nseq) {
- r = seq_delunit(nseq - 1);
- if (r) {
- retval = r;
- break;
- }
- }
- if (nseq == 0) {
- retval = 0;
- mtx_destroy(&seqinfo_mtx);
- }
- break;
-
- default:
- break;
- }
-
- return retval;
-}
-
-static int
-seq_fetch_mid(struct seq_softc *scp, int unit, kobj_t *md)
-{
-
- if (unit >= scp->midi_number || unit < 0)
- return EINVAL;
-
- *md = scp->midis[unit];
-
- return 0;
-}
-
-int
-mseq_open(struct cdev *i_dev, int flags, int mode, struct thread *td)
-{
- struct seq_softc *scp = i_dev->si_drv1;
- int i;
-
- gone_in(15, "Warning! MIDI sequencer to be removed soon: no longer "
- "needed or used\n");
-
- if (scp == NULL)
- return ENXIO;
-
- SEQ_DEBUG(3, printf("seq_open: scp %p unit %d, flags 0x%x.\n",
- scp, scp->unit, flags));
-
- /*
- * Mark this device busy.
- */
-
- midistat_lock();
- mtx_lock(&scp->seq_lock);
- if (scp->busy) {
- mtx_unlock(&scp->seq_lock);
- midistat_unlock();
- SEQ_DEBUG(2, printf("seq_open: unit %d is busy.\n", scp->unit));
- return EBUSY;
- }
- scp->fflags = flags;
- /*
- if ((scp->fflags & O_NONBLOCK) != 0)
- scp->flags |= SEQ_F_NBIO;
- */
- scp->music = MIDIDEV(i_dev) == SND_DEV_MUSIC;
-
- /*
- * Enumerate the available midi devices
- */
- scp->midi_number = 0;
- scp->maxunits = midimapper_open_locked(scp->mapper, &scp->mapper_cookie);
-
- if (scp->maxunits == 0)
- SEQ_DEBUG(2, printf("seq_open: no midi devices\n"));
-
- for (i = 0; i < scp->maxunits; i++) {
- scp->midis[scp->midi_number] =
- midimapper_fetch_synth_locked(scp->mapper,
- scp->mapper_cookie, i);
- if (scp->midis[scp->midi_number]) {
- if (SYNTH_OPEN(scp->midis[scp->midi_number], scp,
- scp->fflags) != 0)
- scp->midis[scp->midi_number] = NULL;
- else {
- scp->midi_flags[scp->midi_number] =
- SYNTH_QUERY(scp->midis[scp->midi_number]);
- scp->midi_number++;
- }
- }
- }
- midistat_unlock();
-
- timer_setvals(scp, 60, 100);
-
- timer_start(scp);
- timer_stop(scp);
- /*
- * actually, if we're in rdonly mode, we should start the timer
- */
- /*
- * TODO: Handle recording now
- */
-
- scp->out_water = MIDIQ_SIZE(scp->out_q) / 2;
-
- scp->busy = 1;
- mtx_unlock(&scp->seq_lock);
-
- SEQ_DEBUG(2, printf("seq_open: opened, mode %s.\n",
- scp->music ? "music" : "sequencer"));
- SEQ_DEBUG(2,
- printf("Sequencer %d %p opened maxunits %d midi_number %d:\n",
- scp->unit, scp, scp->maxunits, scp->midi_number));
- for (i = 0; i < scp->midi_number; i++)
- SEQ_DEBUG(3, printf(" midi %d %p\n", i, scp->midis[i]));
-
- return 0;
-}
-
-/*
- * mseq_close
- */
-int
-mseq_close(struct cdev *i_dev, int flags, int mode, struct thread *td)
-{
- int i;
- struct seq_softc *scp = i_dev->si_drv1;
- int ret;
-
- if (scp == NULL)
- return ENXIO;
-
- SEQ_DEBUG(2, printf("seq_close: unit %d.\n", scp->unit));
-
- mtx_lock(&scp->seq_lock);
-
- ret = ENXIO;
- if (scp->busy == 0)
- goto err;
-
- seq_reset(scp);
- seq_sync(scp);
-
- for (i = 0; i < scp->midi_number; i++)
- if (scp->midis[i])
- SYNTH_CLOSE(scp->midis[i]);
-
- midimapper_close(scp->mapper, scp->mapper_cookie);
-
- timer_stop(scp);
-
- scp->busy = 0;
- ret = 0;
-
-err:
- SEQ_DEBUG(3, printf("seq_close: closed ret = %d.\n", ret));
- mtx_unlock(&scp->seq_lock);
- return ret;
-}
-
-int
-mseq_read(struct cdev *i_dev, struct uio *uio, int ioflag)
-{
- int retval, used;
- struct seq_softc *scp = i_dev->si_drv1;
-
-#define SEQ_RSIZE 32
- u_char buf[SEQ_RSIZE];
-
- if (scp == NULL)
- return ENXIO;
-
- SEQ_DEBUG(7, printf("mseq_read: unit %d, resid %zd.\n",
- scp->unit, uio->uio_resid));
-
- mtx_lock(&scp->seq_lock);
- if ((scp->fflags & FREAD) == 0) {
- SEQ_DEBUG(2, printf("mseq_read: unit %d is not for reading.\n",
- scp->unit));
- retval = EIO;
- goto err1;
- }
- /*
- * Begin recording.
- */
- /*
- * if ((scp->flags & SEQ_F_READING) == 0)
- */
- /*
-	 * TODO: start recording if not already recording
- */
-
- /*
- * I think the semantics are to return as soon
- * as possible.
-	 * On second thought, it doesn't seem like midimountain
-	 * expects that at all.
-	 * TODO: Look this up in some sort of spec.
- */
-
- while (uio->uio_resid > 0) {
- while (MIDIQ_EMPTY(scp->in_q)) {
- retval = EWOULDBLOCK;
- /*
- * I wish I knew which one to care about
- */
-
- if (scp->fflags & O_NONBLOCK)
- goto err1;
- if (ioflag & O_NONBLOCK)
- goto err1;
-
- retval = cv_wait_sig(&scp->in_cv, &scp->seq_lock);
- if (retval != 0)
- goto err1;
- }
-
- used = MIN(MIDIQ_LEN(scp->in_q), uio->uio_resid);
- used = MIN(used, SEQ_RSIZE);
-
- SEQ_DEBUG(8, printf("midiread: uiomove cc=%d\n", used));
- MIDIQ_DEQ(scp->in_q, buf, used);
- mtx_unlock(&scp->seq_lock);
- retval = uiomove(buf, used, uio);
- mtx_lock(&scp->seq_lock);
- if (retval)
- goto err1;
- }
-
- retval = 0;
-err1:
- mtx_unlock(&scp->seq_lock);
- SEQ_DEBUG(6, printf("mseq_read: ret %d, resid %zd.\n",
- retval, uio->uio_resid));
-
- return retval;
-}
-
-int
-mseq_write(struct cdev *i_dev, struct uio *uio, int ioflag)
-{
- u_char event[EV_SZ], newevent[EV_SZ], ev_code;
- struct seq_softc *scp = i_dev->si_drv1;
- int retval;
- int used;
-
-	if (scp == NULL)
-		return ENXIO;
-
-	SEQ_DEBUG(7, printf("seq_write: unit %d, resid %zd.\n",
-	    scp->unit, uio->uio_resid));
-
- mtx_lock(&scp->seq_lock);
-
- if ((scp->fflags & FWRITE) == 0) {
- SEQ_DEBUG(2, printf("seq_write: unit %d is not for writing.\n",
- scp->unit));
- retval = EIO;
- goto err0;
- }
- while (uio->uio_resid > 0) {
- while (MIDIQ_AVAIL(scp->out_q) == 0) {
- retval = EWOULDBLOCK;
- if (scp->fflags & O_NONBLOCK)
- goto err0;
- if (ioflag & O_NONBLOCK)
- goto err0;
- SEQ_DEBUG(8, printf("seq_write cvwait\n"));
-
- scp->playing = 1;
- cv_broadcast(&scp->out_cv);
- cv_broadcast(&scp->state_cv);
-
- retval = cv_wait_sig(&scp->out_cv, &scp->seq_lock);
- /*
- * We slept, maybe things have changed since last
- * dying check
- */
- if (retval != 0)
- goto err0;
-#if 0
- /*
- * Useless test
- */
- if (scp != i_dev->si_drv1)
- retval = ENXIO;
-#endif
- }
-
- used = MIN(uio->uio_resid, 4);
-
- SEQ_DEBUG(8, printf("seqout: resid %zd len %jd avail %jd\n",
- uio->uio_resid, (intmax_t)MIDIQ_LEN(scp->out_q),
- (intmax_t)MIDIQ_AVAIL(scp->out_q)));
-
- if (used != 4) {
- retval = ENXIO;
- goto err0;
- }
- mtx_unlock(&scp->seq_lock);
- retval = uiomove(event, used, uio);
- mtx_lock(&scp->seq_lock);
- if (retval)
- goto err0;
-
- ev_code = event[0];
- SEQ_DEBUG(8, printf("seq_write: unit %d, event %s.\n",
- scp->unit, midi_cmdname(ev_code, cmdtab_seqevent)));
-
- /* Have a look at the event code. */
- if (ev_code == SEQ_FULLSIZE) {
- /*
- * TODO: restore code for SEQ_FULLSIZE
- */
-#if 0
- /*
- * A long event, these are the patches/samples for a
- * synthesizer.
- */
- midiunit = *(u_short *)&event[2];
- mtx_lock(&sd->seq_lock);
- ret = lookup_mididev(scp, midiunit, LOOKUP_OPEN, &md);
- mtx_unlock(&sd->seq_lock);
- if (ret != 0)
- return (ret);
-
- SEQ_DEBUG(printf("seq_write: loading a patch to the unit %d.\n", midiunit));
-
- ret = md->synth.loadpatch(md, *(short *)&event[0], buf,
- p + 4, count, 0);
- return (ret);
-#else
- /*
- * For now, just flush the darn buffer
- */
- SEQ_DEBUG(2,
-			    printf("seq_write: SEQ_FULLSIZE flushing buffer.\n"));
- while (uio->uio_resid > 0) {
- mtx_unlock(&scp->seq_lock);
- retval = uiomove(event, MIN(EV_SZ, uio->uio_resid), uio);
- mtx_lock(&scp->seq_lock);
- if (retval)
- goto err0;
- }
- retval = 0;
- goto err0;
-#endif
- }
- retval = EINVAL;
- if (ev_code >= 128) {
- int error;
-
- /*
-			 * Some sort of extended event. The size is eight
-			 * bytes; scoop up the extra four bytes.
- */
- if (scp->music && ev_code == SEQ_EXTENDED) {
- SEQ_DEBUG(2, printf("seq_write: invalid level two event %x.\n", ev_code));
- goto err0;
- }
- mtx_unlock(&scp->seq_lock);
- if (uio->uio_resid < 4)
- error = EINVAL;
- else
- error = uiomove((caddr_t)&event[4], 4, uio);
- mtx_lock(&scp->seq_lock);
- if (error) {
- SEQ_DEBUG(2,
- printf("seq_write: user memory mangled?\n"));
- goto err0;
- }
- } else {
- /*
- * Size four event.
- */
- if (scp->music) {
- SEQ_DEBUG(2, printf("seq_write: four byte event in music mode.\n"));
- goto err0;
- }
- }
- if (ev_code == SEQ_MIDIPUTC) {
- /*
- * TODO: event[2] is unit number to receive char.
- * Range check it.
- */
- }
- if (scp->music) {
-#ifdef not_ever_ever
- if (event[0] == EV_TIMING &&
- (event[1] == TMR_START || event[1] == TMR_STOP)) {
- /*
-				 * For now, try to make midimountain work by
- * forcing these events to be processed
- * immediately.
- */
- seq_processevent(scp, event);
- } else
- MIDIQ_ENQ(scp->out_q, event, EV_SZ);
-#else
- MIDIQ_ENQ(scp->out_q, event, EV_SZ);
-#endif
- } else {
- if (seq_convertold(event, newevent) > 0)
- MIDIQ_ENQ(scp->out_q, newevent, EV_SZ);
-#if 0
- else
- goto err0;
-#endif
- }
- }
-
- scp->playing = 1;
- cv_broadcast(&scp->state_cv);
- cv_broadcast(&scp->out_cv);
-
- retval = 0;
-
-err0:
- SEQ_DEBUG(6,
- printf("seq_write done: leftover buffer length %zd retval %d\n",
- uio->uio_resid, retval));
- mtx_unlock(&scp->seq_lock);
- return retval;
-}
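
For context, a userspace client drives the write path above with fixed-size records:
four bytes at a time on /dev/sequencerN, eight bytes on /dev/musicN (extended events
pull in a second four-byte chunk). A hedged sketch of queueing a timer-start event on
the music device, assuming the OSS macros from <sys/soundcard.h>:

    #include <sys/soundcard.h>
    #include <fcntl.h>
    #include <unistd.h>

    int
    main(void)
    {
        unsigned char ev[8] = { 0 };
        int fd;

        fd = open("/dev/music0", O_WRONLY);
        if (fd < 0)
            return (1);
        ev[0] = EV_TIMING;
        ev[1] = TMR_START;
        (void)write(fd, ev, sizeof(ev));    /* enqueued on out_q */
        close(fd);
        return (0);
    }
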
-
-int
-mseq_ioctl(struct cdev *i_dev, u_long cmd, caddr_t arg, int mode,
- struct thread *td)
-{
- int midiunit, ret, tmp;
- struct seq_softc *scp = i_dev->si_drv1;
- struct synth_info *synthinfo;
- struct midi_info *midiinfo;
- u_char event[EV_SZ];
- u_char newevent[EV_SZ];
-
- kobj_t md;
-
- /*
- * struct snd_size *sndsize;
- */
-
- if (scp == NULL)
- return ENXIO;
-
- SEQ_DEBUG(6, printf("seq_ioctl: unit %d, cmd %s.\n",
- scp->unit, midi_cmdname(cmd, cmdtab_seqioctl)));
-
- ret = 0;
-
- switch (cmd) {
- case SNDCTL_SEQ_GETTIME:
- /*
- * ioctl needed by libtse
- */
- mtx_lock(&scp->seq_lock);
- *(int *)arg = timer_now(scp);
- mtx_unlock(&scp->seq_lock);
- SEQ_DEBUG(6, printf("seq_ioctl: gettime %d.\n", *(int *)arg));
- ret = 0;
- break;
- case SNDCTL_TMR_METRONOME:
- /* fallthrough */
- case SNDCTL_TMR_SOURCE:
- /*
- * Not implemented
- */
- ret = 0;
- break;
- case SNDCTL_TMR_TEMPO:
- event[1] = TMR_TEMPO;
- event[4] = *(int *)arg & 0xFF;
- event[5] = (*(int *)arg >> 8) & 0xFF;
- event[6] = (*(int *)arg >> 16) & 0xFF;
- event[7] = (*(int *)arg >> 24) & 0xFF;
- goto timerevent;
- case SNDCTL_TMR_TIMEBASE:
- event[1] = TMR_TIMERBASE;
- event[4] = *(int *)arg & 0xFF;
- event[5] = (*(int *)arg >> 8) & 0xFF;
- event[6] = (*(int *)arg >> 16) & 0xFF;
- event[7] = (*(int *)arg >> 24) & 0xFF;
- goto timerevent;
- case SNDCTL_TMR_START:
- event[1] = TMR_START;
- goto timerevent;
- case SNDCTL_TMR_STOP:
- event[1] = TMR_STOP;
- goto timerevent;
- case SNDCTL_TMR_CONTINUE:
- event[1] = TMR_CONTINUE;
-timerevent:
- event[0] = EV_TIMING;
- mtx_lock(&scp->seq_lock);
- if (!scp->music) {
- ret = EINVAL;
- mtx_unlock(&scp->seq_lock);
- break;
- }
- seq_processevent(scp, event);
- mtx_unlock(&scp->seq_lock);
- break;
- case SNDCTL_TMR_SELECT:
- SEQ_DEBUG(2,
- printf("seq_ioctl: SNDCTL_TMR_SELECT not supported\n"));
- ret = EINVAL;
- break;
- case SNDCTL_SEQ_SYNC:
- if (mode == O_RDONLY) {
- ret = 0;
- break;
- }
- mtx_lock(&scp->seq_lock);
- ret = seq_sync(scp);
- mtx_unlock(&scp->seq_lock);
- break;
- case SNDCTL_SEQ_PANIC:
- /* fallthrough */
- case SNDCTL_SEQ_RESET:
- /*
- * SNDCTL_SEQ_PANIC == SNDCTL_SEQ_RESET
- */
- mtx_lock(&scp->seq_lock);
- seq_reset(scp);
- mtx_unlock(&scp->seq_lock);
- ret = 0;
- break;
- case SNDCTL_SEQ_TESTMIDI:
- mtx_lock(&scp->seq_lock);
- /*
- * TODO: SNDCTL_SEQ_TESTMIDI now means "can I write to the
- * device?".
- */
- mtx_unlock(&scp->seq_lock);
- break;
-#if 0
- case SNDCTL_SEQ_GETINCOUNT:
- if (mode == O_WRONLY)
- *(int *)arg = 0;
- else {
- mtx_lock(&scp->seq_lock);
- *(int *)arg = scp->in_q.rl;
- mtx_unlock(&scp->seq_lock);
- SEQ_DEBUG(printf("seq_ioctl: incount %d.\n",
- *(int *)arg));
- }
- ret = 0;
- break;
- case SNDCTL_SEQ_GETOUTCOUNT:
- if (mode == O_RDONLY)
- *(int *)arg = 0;
- else {
- mtx_lock(&scp->seq_lock);
- *(int *)arg = scp->out_q.fl;
- mtx_unlock(&scp->seq_lock);
- SEQ_DEBUG(printf("seq_ioctl: outcount %d.\n",
- *(int *)arg));
- }
- ret = 0;
- break;
-#endif
- case SNDCTL_SEQ_CTRLRATE:
- if (*(int *)arg != 0) {
- ret = EINVAL;
- break;
- }
- mtx_lock(&scp->seq_lock);
- *(int *)arg = scp->timerbase;
- mtx_unlock(&scp->seq_lock);
- SEQ_DEBUG(3, printf("seq_ioctl: ctrlrate %d.\n", *(int *)arg));
- ret = 0;
- break;
- /*
- * TODO: ioctl SNDCTL_SEQ_RESETSAMPLES
- */
-#if 0
- case SNDCTL_SEQ_RESETSAMPLES:
- mtx_lock(&scp->seq_lock);
- ret = lookup_mididev(scp, *(int *)arg, LOOKUP_OPEN, &md);
- mtx_unlock(&scp->seq_lock);
- if (ret != 0)
- break;
- ret = midi_ioctl(MIDIMKDEV(major(i_dev), *(int *)arg,
- SND_DEV_MIDIN), cmd, arg, mode, td);
- break;
-#endif
- case SNDCTL_SEQ_NRSYNTHS:
- mtx_lock(&scp->seq_lock);
- *(int *)arg = scp->midi_number;
- mtx_unlock(&scp->seq_lock);
- SEQ_DEBUG(3, printf("seq_ioctl: synths %d.\n", *(int *)arg));
- ret = 0;
- break;
- case SNDCTL_SEQ_NRMIDIS:
- mtx_lock(&scp->seq_lock);
- if (scp->music)
- *(int *)arg = 0;
- else {
- /*
- * TODO: count the numbder of devices that can WRITERAW
- */
- *(int *)arg = scp->midi_number;
- }
- mtx_unlock(&scp->seq_lock);
- SEQ_DEBUG(3, printf("seq_ioctl: midis %d.\n", *(int *)arg));
- ret = 0;
- break;
- /*
- * TODO: ioctl SNDCTL_SYNTH_MEMAVL
- */
-#if 0
- case SNDCTL_SYNTH_MEMAVL:
- mtx_lock(&scp->seq_lock);
- ret = lookup_mididev(scp, *(int *)arg, LOOKUP_OPEN, &md);
- mtx_unlock(&scp->seq_lock);
- if (ret != 0)
- break;
- ret = midi_ioctl(MIDIMKDEV(major(i_dev), *(int *)arg,
- SND_DEV_MIDIN), cmd, arg, mode, td);
- break;
-#endif
- case SNDCTL_SEQ_OUTOFBAND:
- for (ret = 0; ret < EV_SZ; ret++)
-			event[ret] = (u_char)arg[ret];
-
- mtx_lock(&scp->seq_lock);
- if (scp->music)
- ret = seq_processevent(scp, event);
- else {
- if (seq_convertold(event, newevent) > 0)
- ret = seq_processevent(scp, newevent);
- else
- ret = EINVAL;
- }
- mtx_unlock(&scp->seq_lock);
- break;
- case SNDCTL_SYNTH_INFO:
- synthinfo = (struct synth_info *)arg;
- midiunit = synthinfo->device;
- mtx_lock(&scp->seq_lock);
- if (seq_fetch_mid(scp, midiunit, &md) == 0) {
- bzero(synthinfo, sizeof(*synthinfo));
- synthinfo->name[0] = 'f';
- synthinfo->name[1] = 'a';
- synthinfo->name[2] = 'k';
- synthinfo->name[3] = 'e';
- synthinfo->name[4] = 's';
- synthinfo->name[5] = 'y';
- synthinfo->name[6] = 'n';
- synthinfo->name[7] = 't';
- synthinfo->name[8] = 'h';
- synthinfo->device = midiunit;
- synthinfo->synth_type = SYNTH_TYPE_MIDI;
- synthinfo->capabilities = scp->midi_flags[midiunit];
- ret = 0;
- } else
- ret = EINVAL;
- mtx_unlock(&scp->seq_lock);
- break;
- case SNDCTL_MIDI_INFO:
- midiinfo = (struct midi_info *)arg;
- midiunit = midiinfo->device;
- mtx_lock(&scp->seq_lock);
- if (seq_fetch_mid(scp, midiunit, &md) == 0) {
- bzero(midiinfo, sizeof(*midiinfo));
- midiinfo->name[0] = 'f';
- midiinfo->name[1] = 'a';
- midiinfo->name[2] = 'k';
- midiinfo->name[3] = 'e';
- midiinfo->name[4] = 'm';
- midiinfo->name[5] = 'i';
- midiinfo->name[6] = 'd';
- midiinfo->name[7] = 'i';
- midiinfo->device = midiunit;
- midiinfo->capabilities = scp->midi_flags[midiunit];
- /*
- * TODO: What devtype?
- */
- midiinfo->dev_type = 0x01;
- ret = 0;
- } else
- ret = EINVAL;
- mtx_unlock(&scp->seq_lock);
- break;
- case SNDCTL_SEQ_THRESHOLD:
- mtx_lock(&scp->seq_lock);
- RANGE(*(int *)arg, 1, MIDIQ_SIZE(scp->out_q) - 1);
- scp->out_water = *(int *)arg;
- mtx_unlock(&scp->seq_lock);
- SEQ_DEBUG(3, printf("seq_ioctl: water %d.\n", *(int *)arg));
- ret = 0;
- break;
- case SNDCTL_MIDI_PRETIME:
- tmp = *(int *)arg;
- if (tmp < 0)
- tmp = 0;
- mtx_lock(&scp->seq_lock);
- scp->pre_event_timeout = (hz * tmp) / 10;
- *(int *)arg = scp->pre_event_timeout;
- mtx_unlock(&scp->seq_lock);
- SEQ_DEBUG(3, printf("seq_ioctl: pretime %d.\n", *(int *)arg));
- ret = 0;
- break;
- case SNDCTL_FM_4OP_ENABLE:
- case SNDCTL_PMGR_IFACE:
- case SNDCTL_PMGR_ACCESS:
- /*
- * Patch manager and fm are ded, ded, ded.
- */
- /* fallthrough */
- default:
- /*
- * TODO: Consider ioctl default case.
- * Old code used to
- * if ((scp->fflags & O_ACCMODE) == FREAD) {
- * ret = EIO;
- * break;
- * }
- * Then pass on the ioctl to device 0
- */
- SEQ_DEBUG(2,
-		    printf("seq_ioctl: unsupported IOCTL %lu.\n", cmd));
- ret = EINVAL;
- break;
- }
-
- return ret;
-}
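
The TMR_TEMPO/TMR_TIMEBASE cases above pack the ioctl's int argument little-endian into
event bytes 4..7, and seq_timing() reassembles it the same way. A self-contained round
trip of that packing:

    #include <stdio.h>

    int
    main(void)
    {
        unsigned char ev[8] = { 0 };
        int tempo = 120, out;

        ev[4] = tempo & 0xff;               /* as in SNDCTL_TMR_TEMPO */
        ev[5] = (tempo >> 8) & 0xff;
        ev[6] = (tempo >> 16) & 0xff;
        ev[7] = (tempo >> 24) & 0xff;
        out = ev[4] + (ev[5] << 8) + (ev[6] << 16) + (ev[7] << 24);
        printf("%d\n", out);                /* prints 120, as seq_timing() sees it */
        return (0);
    }
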
-
-int
-mseq_poll(struct cdev *i_dev, int events, struct thread *td)
-{
- int ret, lim;
- struct seq_softc *scp = i_dev->si_drv1;
-
-	SEQ_DEBUG(3, printf("seq_poll: unit %d.\n", scp->unit));
-
- mtx_lock(&scp->seq_lock);
-
- ret = 0;
-
- /* Look up the appropriate queue and select it. */
- if ((events & (POLLOUT | POLLWRNORM)) != 0) {
- /* Start playing. */
- scp->playing = 1;
- cv_broadcast(&scp->state_cv);
- cv_broadcast(&scp->out_cv);
-
- lim = scp->out_water;
-
- if (MIDIQ_AVAIL(scp->out_q) < lim)
-			/* Not enough space; record select. */
- selrecord(td, &scp->out_sel);
- else
- /* We can write now. */
- ret |= events & (POLLOUT | POLLWRNORM);
- }
- if ((events & (POLLIN | POLLRDNORM)) != 0) {
- /* TODO: Start recording. */
-
- /* Find out the boundary. */
- lim = 1;
- if (MIDIQ_LEN(scp->in_q) < lim)
- /* No data ready, record select. */
- selrecord(td, &scp->in_sel);
- else
- /* We can read now. */
- ret |= events & (POLLIN | POLLRDNORM);
- }
- mtx_unlock(&scp->seq_lock);
-
- return (ret);
-}
-
-#if 0
-static void
-sein_qtr(void *p, void /* mididev_info */ *md)
-{
- struct seq_softc *scp;
-
- scp = (struct seq_softc *)p;
-
- mtx_lock(&scp->seq_lock);
-
- /* Restart playing if we have the data to output. */
- if (scp->queueout_pending)
- seq_callback(scp, SEQ_CB_START | SEQ_CB_WR);
- /* Check the midi device if we are reading. */
- if ((scp->flags & SEQ_F_READING) != 0)
- seq_midiinput(scp, md);
-
- mtx_unlock(&scp->seq_lock);
-}
-
-#endif
-/*
- * seq_convertold
- * Was the old playevent. Use this to convert an old
- * style /dev/sequencer event to a /dev/music event.
- */
-static int
-seq_convertold(u_char *event, u_char *out)
-{
- int used;
- u_char dev, chn, note, vel;
-
- out[0] = out[1] = out[2] = out[3] = out[4] = out[5] = out[6] =
- out[7] = 0;
-
- dev = 0;
- chn = event[1];
- note = event[2];
- vel = event[3];
-
- used = 0;
-
-restart:
- /*
- * TODO: Debug statement
- */
- switch (event[0]) {
- case EV_TIMING:
- case EV_CHN_VOICE:
- case EV_CHN_COMMON:
- case EV_SYSEX:
- case EV_SEQ_LOCAL:
- out[0] = event[0];
- out[1] = event[1];
- out[2] = event[2];
- out[3] = event[3];
- out[4] = event[4];
- out[5] = event[5];
- out[6] = event[6];
- out[7] = event[7];
- used += 8;
- break;
- case SEQ_NOTEOFF:
- out[0] = EV_CHN_VOICE;
- out[1] = dev;
- out[2] = MIDI_NOTEOFF;
- out[3] = chn;
- out[4] = note;
- out[5] = 255;
- used += 4;
- break;
-
- case SEQ_NOTEON:
- out[0] = EV_CHN_VOICE;
- out[1] = dev;
- out[2] = MIDI_NOTEON;
- out[3] = chn;
- out[4] = note;
- out[5] = vel;
- used += 4;
- break;
-
- /*
-	 * wait delay = event[2] + (event[3] << 8) + (event[4] << 16)
- */
-
- case SEQ_PGMCHANGE:
- out[0] = EV_CHN_COMMON;
- out[1] = dev;
- out[2] = MIDI_PGM_CHANGE;
- out[3] = chn;
- out[4] = note;
- out[5] = vel;
- used += 4;
- break;
-/*
- out[0] = EV_TIMING;
- out[1] = dev;
- out[2] = MIDI_PGM_CHANGE;
- out[3] = chn;
- out[4] = note;
- out[5] = vel;
- SEQ_DEBUG(4,printf("seq_playevent: synctimer\n"));
- break;
-*/
-
- case SEQ_MIDIPUTC:
- SEQ_DEBUG(4,
- printf("seq_playevent: put data 0x%02x, unit %d.\n",
- event[1], event[2]));
- /*
- * Pass through to the midi device.
- * device = event[2]
- * data = event[1]
- */
- out[0] = SEQ_MIDIPUTC;
- out[1] = dev;
- out[2] = chn;
- used += 4;
- break;
-#ifdef notyet
- case SEQ_ECHO:
- /*
- * This isn't handled here yet because I don't know if I can
-		 * just use four-byte events. There might be consequences
-		 * in the _read routine.
- */
- if (seq_copytoinput(scp, event, 4) == EAGAIN) {
- ret = QUEUEFULL;
- break;
- }
- ret = MORE;
- break;
-#endif
- case SEQ_EXTENDED:
- switch (event[1]) {
- case SEQ_NOTEOFF:
- case SEQ_NOTEON:
- case SEQ_PGMCHANGE:
- event++;
- used = 4;
- goto restart;
- break;
- case SEQ_AFTERTOUCH:
- /*
- * SYNTH_AFTERTOUCH(md, event[3], event[4])
- */
- case SEQ_BALANCE:
- /*
- * SYNTH_PANNING(md, event[3], (char)event[4])
- */
- case SEQ_CONTROLLER:
- /*
- * SYNTH_CONTROLLER(md, event[3], event[4], *(short *)&event[5])
- */
- case SEQ_VOLMODE:
- /*
- * SYNTH_VOLUMEMETHOD(md, event[3])
- */
- default:
- SEQ_DEBUG(2,
-			    printf("seq_convertold: SEQ_EXTENDED type %d "
-			    "not handled\n", event[1]));
- break;
- }
- break;
- case SEQ_WAIT:
- out[0] = EV_TIMING;
- out[1] = TMR_WAIT_REL;
- out[4] = event[2];
- out[5] = event[3];
- out[6] = event[4];
-
-		SEQ_DEBUG(5, printf("SEQ_WAIT %d\n",
-		    event[2] + (event[3] << 8) + (event[4] << 16)));
-
- used += 4;
- break;
-
- case SEQ_ECHO:
- case SEQ_SYNCTIMER:
- case SEQ_PRIVATE:
- default:
- SEQ_DEBUG(2,
- printf("seq_convertold: event type %d not handled %d %d %d\n",
- event[0], event[1], event[2], event[3]));
- break;
- }
- return used;
-}
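
As a concrete instance of the conversion above: an old-style 4-byte SEQ_NOTEON becomes
an 8-byte EV_CHN_VOICE record with the device defaulting to 0. A small sketch, assuming
the OSS macros from <sys/soundcard.h>:

    #include <stdio.h>
    #include <sys/soundcard.h>

    int
    main(void)
    {
        /* Old style: SEQ_NOTEON, channel 0, middle C, velocity 100. */
        unsigned char old[4] = { SEQ_NOTEON, 0, 60, 100 };
        /* What seq_convertold() would emit for it: */
        unsigned char new8[8] = { EV_CHN_VOICE, 0 /* dev */, MIDI_NOTEON,
            old[1] /* chn */, old[2] /* note */, old[3] /* vel */, 0, 0 };
        int i;

        for (i = 0; i < 8; i++)
            printf("%02x ", new8[i]);
        printf("\n");
        return (0);
    }
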
-
-/*
- * Writing to the sequencer buffer never blocks; input
- * which cannot be queued is dropped.
- */
-void
-seq_copytoinput(struct seq_softc *scp, u_char *event, int len)
-{
-
- mtx_assert(&scp->seq_lock, MA_OWNED);
-
- if (MIDIQ_AVAIL(scp->in_q) < len) {
- /*
- * ENOROOM? EINPUTDROPPED? ETOUGHLUCK?
- */
- SEQ_DEBUG(2, printf("seq_copytoinput: queue full\n"));
- } else {
- MIDIQ_ENQ(scp->in_q, event, len);
- selwakeup(&scp->in_sel);
- cv_broadcast(&scp->in_cv);
- }
-
-}
-
-static int
-seq_chnvoice(struct seq_softc *scp, kobj_t md, u_char *event)
-{
- int ret, voice;
- u_char cmd, chn, note, parm;
-
- ret = 0;
- cmd = event[2];
- chn = event[3];
- note = event[4];
- parm = event[5];
-
- mtx_assert(&scp->seq_lock, MA_OWNED);
-
- SEQ_DEBUG(5, printf("seq_chnvoice: unit %d, dev %d, cmd %s,"
- " chn %d, note %d, parm %d.\n", scp->unit, event[1],
- midi_cmdname(cmd, cmdtab_seqcv), chn, note, parm));
-
- voice = SYNTH_ALLOC(md, chn, note);
-
- mtx_unlock(&scp->seq_lock);
-
- switch (cmd) {
- case MIDI_NOTEON:
- if (note < 128 || note == 255) {
-#if 0
- if (scp->music && chn == 9) {
- /*
- * This channel is a percussion. The note
- * number is the patch number.
- */
- /*
- mtx_unlock(&scp->seq_lock);
- if (SYNTH_SETINSTR(md, voice, 128 + note)
- == EAGAIN) {
- mtx_lock(&scp->seq_lock);
- return (QUEUEFULL);
- }
- mtx_lock(&scp->seq_lock);
- */
- note = 60; /* Middle C. */
- }
-#endif
- if (scp->music) {
- /*
- mtx_unlock(&scp->seq_lock);
- if (SYNTH_SETUPVOICE(md, voice, chn)
- == EAGAIN) {
- mtx_lock(&scp->seq_lock);
- return (QUEUEFULL);
- }
- mtx_lock(&scp->seq_lock);
- */
- }
- SYNTH_STARTNOTE(md, voice, note, parm);
- }
- break;
- case MIDI_NOTEOFF:
- SYNTH_KILLNOTE(md, voice, note, parm);
- break;
- case MIDI_KEY_PRESSURE:
- SYNTH_AFTERTOUCH(md, voice, parm);
- break;
- default:
- ret = 1;
- SEQ_DEBUG(2, printf("seq_chnvoice event type %d not handled\n",
- event[1]));
- break;
- }
-
- mtx_lock(&scp->seq_lock);
- return ret;
-}
-
-static int
-seq_chncommon(struct seq_softc *scp, kobj_t md, u_char *event)
-{
- int ret;
- u_short w14;
- u_char cmd, chn, p1;
-
- ret = 0;
- cmd = event[2];
- chn = event[3];
- p1 = event[4];
- w14 = *(u_short *)&event[6];
-
- SEQ_DEBUG(5, printf("seq_chncommon: unit %d, dev %d, cmd %s, chn %d,"
- " p1 %d, w14 %d.\n", scp->unit, event[1],
- midi_cmdname(cmd, cmdtab_seqccmn), chn, p1, w14));
- mtx_unlock(&scp->seq_lock);
- switch (cmd) {
- case MIDI_PGM_CHANGE:
- SEQ_DEBUG(4, printf("seq_chncommon pgmchn chn %d pg %d\n",
- chn, p1));
- SYNTH_SETINSTR(md, chn, p1);
- break;
- case MIDI_CTL_CHANGE:
- SEQ_DEBUG(4, printf("seq_chncommon ctlch chn %d pg %d %d\n",
- chn, p1, w14));
- SYNTH_CONTROLLER(md, chn, p1, w14);
- break;
- case MIDI_PITCH_BEND:
- if (scp->music) {
- /*
- * TODO: MIDI_PITCH_BEND
- */
-#if 0
- mtx_lock(&md->synth.vc_mtx);
- md->synth.chn_info[chn].bender_value = w14;
- if (md->midiunit >= 0) {
- /*
- * Handle all of the notes playing on this
- * channel.
- */
- key = ((int)chn << 8);
- for (i = 0; i < md->synth.alloc.max_voice; i++)
- if ((md->synth.alloc.map[i] & 0xff00) == key) {
- mtx_unlock(&md->synth.vc_mtx);
- mtx_unlock(&scp->seq_lock);
- if (md->synth.bender(md, i, w14) == EAGAIN) {
- mtx_lock(&scp->seq_lock);
- return (QUEUEFULL);
- }
- mtx_lock(&scp->seq_lock);
- }
- } else {
- mtx_unlock(&md->synth.vc_mtx);
- mtx_unlock(&scp->seq_lock);
- if (md->synth.bender(md, chn, w14) == EAGAIN) {
- mtx_lock(&scp->seq_lock);
- return (QUEUEFULL);
- }
- mtx_lock(&scp->seq_lock);
- }
-#endif
- } else
- SYNTH_BENDER(md, chn, w14);
- break;
- default:
- ret = 1;
- SEQ_DEBUG(2,
- printf("seq_chncommon event type %d not handled.\n",
- event[1]));
- break;
- }
- mtx_lock(&scp->seq_lock);
- return ret;
-}
-
-static int
-seq_timing(struct seq_softc *scp, u_char *event)
-{
- int param;
- int ret;
-
- ret = 0;
- param = event[4] + (event[5] << 8) +
- (event[6] << 16) + (event[7] << 24);
-
- SEQ_DEBUG(5, printf("seq_timing: unit %d, cmd %d, param %d.\n",
- scp->unit, event[1], param));
- switch (event[1]) {
- case TMR_WAIT_REL:
- timer_wait(scp, param, 0);
- break;
- case TMR_WAIT_ABS:
- timer_wait(scp, param, 1);
- break;
- case TMR_START:
- timer_start(scp);
- cv_broadcast(&scp->reset_cv);
- break;
- case TMR_STOP:
- timer_stop(scp);
- /*
- * The following cv_broadcast isn't needed since we only
- * wait for 0->1 transitions. It probably won't hurt
- */
- cv_broadcast(&scp->reset_cv);
- break;
- case TMR_CONTINUE:
- timer_continue(scp);
- cv_broadcast(&scp->reset_cv);
- break;
- case TMR_TEMPO:
- if (param < 8)
- param = 8;
- if (param > 360)
- param = 360;
- SEQ_DEBUG(4, printf("Timer set tempo %d\n", param));
- timer_setvals(scp, param, scp->timerbase);
- break;
- case TMR_TIMERBASE:
- if (param < 1)
- param = 1;
- if (param > 1000)
- param = 1000;
- SEQ_DEBUG(4, printf("Timer set timerbase %d\n", param));
- timer_setvals(scp, scp->tempo, param);
- break;
- case TMR_ECHO:
- /*
- * TODO: Consider making 4-byte events for /dev/sequencer
- * PRO: Maybe needed by legacy apps
-	 * CON: soundcard.h has been warning for many years
-	 * to expect 8-byte events.
- */
-#if 0
- if (scp->music)
- seq_copytoinput(scp, event, 8);
- else {
- param = (param << 8 | SEQ_ECHO);
- seq_copytoinput(scp, (u_char *)&param, 4);
- }
-#else
- seq_copytoinput(scp, event, 8);
-#endif
- break;
- default:
- SEQ_DEBUG(2, printf("seq_timing event type %d not handled.\n",
- event[1]));
- ret = 1;
- break;
- }
- return ret;
-}
-
-static int
-seq_local(struct seq_softc *scp, u_char *event)
-{
- int ret;
-
- ret = 0;
- mtx_assert(&scp->seq_lock, MA_OWNED);
-
- SEQ_DEBUG(5, printf("seq_local: unit %d, cmd %d\n", scp->unit,
- event[1]));
- switch (event[1]) {
- default:
- SEQ_DEBUG(1, printf("seq_local event type %d not handled\n",
- event[1]));
- ret = 1;
- break;
- }
- return ret;
-}
-
-static int
-seq_sysex(struct seq_softc *scp, kobj_t md, u_char *event)
-{
- int i, l;
-
- mtx_assert(&scp->seq_lock, MA_OWNED);
- SEQ_DEBUG(5, printf("seq_sysex: unit %d device %d\n", scp->unit,
- event[1]));
- l = 0;
- for (i = 0; i < 6 && event[i + 2] != 0xff; i++)
- l = i + 1;
- if (l > 0) {
- mtx_unlock(&scp->seq_lock);
- if (SYNTH_SENDSYSEX(md, &event[2], l) == EAGAIN) {
- mtx_lock(&scp->seq_lock);
- return 1;
- }
- mtx_lock(&scp->seq_lock);
- }
- return 0;
-}
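
The length scan above expects sysex payloads packed six bytes per event at offsets
2..7, with unused slots padded by 0xff. A self-contained illustration of one such event
(macro from <sys/soundcard.h>; the payload bytes are just an example):

    #include <stdio.h>
    #include <sys/soundcard.h>

    int
    main(void)
    {
        /* One EV_SYSEX event for unit 0 carrying a 4-byte fragment;
         * the remaining payload slots are 0xff padding. */
        unsigned char ev[8] = { EV_SYSEX, 0, 0xf0, 0x7e, 0x7f, 0xf7,
            0xff, 0xff };
        int i, l = 0;

        for (i = 0; i < 6 && ev[i + 2] != 0xff; i++)
            l = i + 1;
        printf("payload length %d\n", l);   /* prints 4 */
        return (0);
    }
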
-
-/*
- * Reset no longer closes the raw devices, nor does it call seq_sync().
- * Callers are the ioctl handler and seq_close().
- */
-static void
-seq_reset(struct seq_softc *scp)
-{
- int chn, i;
- kobj_t m;
-
- mtx_assert(&scp->seq_lock, MA_OWNED);
-
- SEQ_DEBUG(5, printf("seq_reset: unit %d.\n", scp->unit));
-
- /*
- * Stop reading and writing.
- */
-
- /* scp->recording = 0; */
- scp->playing = 0;
- cv_broadcast(&scp->state_cv);
- cv_broadcast(&scp->out_cv);
- cv_broadcast(&scp->reset_cv);
-
- /*
- * For now, don't reset the timers.
- */
- MIDIQ_CLEAR(scp->in_q);
- MIDIQ_CLEAR(scp->out_q);
-
- for (i = 0; i < scp->midi_number; i++) {
- m = scp->midis[i];
- mtx_unlock(&scp->seq_lock);
- SYNTH_RESET(m);
- for (chn = 0; chn < 16; chn++) {
-			SYNTH_CONTROLLER(m, chn, 123, 0);	/* All notes off */
-			SYNTH_CONTROLLER(m, chn, 121, 0);	/* Reset all controllers */
-			SYNTH_BENDER(m, chn, 1 << 13);	/* Center the pitch bender */
- }
- mtx_lock(&scp->seq_lock);
- }
-}
-
-/*
- * seq_sync
- * *really* flushes the output queue:
- * flush the event queue, then flush the synthesizers.
- * Callers are IOCTL and close
- */
-
-#define SEQ_SYNC_TIMEOUT 8
-static int
-seq_sync(struct seq_softc *scp)
-{
-	int i, rl, sync[32], done;	/* midis[] holds at most 32 units */
-
- mtx_assert(&scp->seq_lock, MA_OWNED);
-
- SEQ_DEBUG(4, printf("seq_sync: unit %d.\n", scp->unit));
-
- /*
- * Wait until output queue is empty. Check every so often to see if
- * the queue is moving along. If it isn't just abort.
- */
- while (!MIDIQ_EMPTY(scp->out_q)) {
- if (!scp->playing) {
- scp->playing = 1;
- cv_broadcast(&scp->state_cv);
- cv_broadcast(&scp->out_cv);
- }
- rl = MIDIQ_LEN(scp->out_q);
-
- i = cv_timedwait_sig(&scp->out_cv,
- &scp->seq_lock, SEQ_SYNC_TIMEOUT * hz);
-
- if (i == EINTR || i == ERESTART) {
- if (i == EINTR) {
- /*
- * XXX: I don't know why we stop playing
- */
- scp->playing = 0;
- cv_broadcast(&scp->out_cv);
- }
- return i;
- }
- if (i == EWOULDBLOCK && rl == MIDIQ_LEN(scp->out_q) &&
- scp->waiting == 0) {
- /*
-			 * The queue seems to be stuck. Give up and clear
-			 * the queues.
- */
- MIDIQ_CLEAR(scp->out_q);
- scp->playing = 0;
- cv_broadcast(&scp->state_cv);
- cv_broadcast(&scp->out_cv);
- cv_broadcast(&scp->reset_cv);
-
- /*
- * TODO: Consider if the raw devices need to be flushed
- */
-
- SEQ_DEBUG(1, printf("seq_sync queue stuck, aborting\n"));
-
- return i;
- }
- }
-
- scp->playing = 0;
- /*
- * Since syncing a midi device might block, unlock scp->seq_lock.
- */
-
- mtx_unlock(&scp->seq_lock);
- for (i = 0; i < scp->midi_number; i++)
- sync[i] = 1;
-
- do {
- done = 1;
- for (i = 0; i < scp->midi_number; i++)
- if (sync[i]) {
- if (SYNTH_INSYNC(scp->midis[i]) == 0)
- sync[i] = 0;
- else
- done = 0;
- }
- if (!done)
- DELAY(5000);
-
- } while (!done);
-
- mtx_lock(&scp->seq_lock);
- return 0;
-}
-
-char *
-midi_cmdname(int cmd, midi_cmdtab *tab)
-{
- while (tab->name != NULL) {
- if (cmd == tab->cmd)
- return (tab->name);
- tab++;
- }
-
- return ("unknown");
-}
diff --git a/sys/dev/sound/midi/synth_if.m b/sys/dev/sound/midi/synth_if.m
deleted file mode 100644
index a763b3422bc6..000000000000
--- a/sys/dev/sound/midi/synth_if.m
+++ /dev/null
@@ -1,312 +0,0 @@
-#-
-# Copyright (c) 2003 Mathew Kanner
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions
-# are met:
-# 1. Redistributions of source code must retain the above copyright
-# notice, this list of conditions and the following disclaimer.
-# 2. Redistributions in binary form must reproduce the above copyright
-# notice, this list of conditions and the following disclaimer in the
-# documentation and/or other materials provided with the distribution.
-#
-# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
-# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
-# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-# SUCH DAMAGE.
-#
-#
-
-INTERFACE synth;
-
-#include <sys/systm.h>
-
-CODE {
-
-synth_killnote_t nokillnote;
-synth_startnote_t nostartnote;
-synth_setinstr_t nosetinstr;
-synth_hwcontrol_t nohwcontrol;
-synth_aftertouch_t noaftertouch;
-synth_panning_t nopanning;
-synth_controller_t nocontroller;
-synth_volumemethod_t novolumemethod;
-synth_bender_t nobender;
-synth_setupvoice_t nosetupvoice;
-synth_sendsysex_t nosendsysex;
-synth_allocvoice_t noallocvoice;
-synth_writeraw_t nowriteraw;
-synth_reset_t noreset;
-synth_shortname_t noshortname;
-synth_open_t noopen;
-synth_close_t noclose;
-synth_query_t noquery;
-synth_insync_t noinsync;
-synth_alloc_t noalloc;
-
- int
- nokillnote(void *_kobj, uint8_t _chn, uint8_t _note, uint8_t _vel)
- {
- printf("nokillnote\n");
- return 0;
- }
-
- int
- noopen(void *_kobj, void *_arg, int mode)
- {
- printf("noopen\n");
- return 0;
- }
-
- int
-	noquery(void *_kobj)
- {
- printf("noquery\n");
- return 0;
- }
-
- int
- nostartnote(void *_kb, uint8_t _voice, uint8_t _note, uint8_t _parm)
- {
- printf("nostartnote\n");
- return 0;
- }
-
- int
- nosetinstr(void *_kb, uint8_t _chn, uint16_t _patchno)
- {
- printf("nosetinstr\n");
- return 0;
- }
-
- int
- nohwcontrol(void *_kb, uint8_t *_event)
- {
- printf("nohwcontrol\n");
- return 0;
- }
-
- int
- noaftertouch ( void /* X */ * _kobj, uint8_t _x1, uint8_t _x2)
- {
- printf("noaftertouch\n");
- return 0;
- }
-
- int
- nopanning ( void /* X */ * _kobj, uint8_t _x1, uint8_t _x2)
- {
- printf("nopanning\n");
- return 0;
- }
-
- int
- nocontroller ( void /* X */ * _kobj, uint8_t _x1, uint8_t _x2, uint16_t _x3)
- {
- printf("nocontroller\n");
- return 0;
- }
-
- int
- novolumemethod (
- void /* X */ * _kobj,
- uint8_t _x1)
- {
- printf("novolumemethod\n");
- return 0;
- }
-
- int
- nobender ( void /* X */ * _kobj, uint8_t _voice, uint16_t _bend)
- {
- printf("nobender\n");
- return 0;
- }
-
- int
- nosetupvoice ( void /* X */ * _kobj, uint8_t _voice, uint8_t _chn)
- {
-
- printf("nosetupvoice\n");
- return 0;
- }
-
- int
- nosendsysex ( void /* X */ * _kobj, void * _buf, size_t _len)
- {
- printf("nosendsysex\n");
- return 0;
- }
-
- int
- noallocvoice ( void /* X */ * _kobj, uint8_t _chn, uint8_t _note, void *_x)
- {
- printf("noallocvoice\n");
- return 0;
- }
-
- int
- nowriteraw ( void /* X */ * _kobjt, uint8_t * _buf, size_t _len)
- {
- printf("nowriteraw\n");
- return 1;
- }
-
- int
- noreset ( void /* X */ * _kobjt)
- {
-
- printf("noreset\n");
- return 0;
- }
-
- char *
- noshortname (void /* X */ * _kobjt)
- {
- printf("noshortname\n");
- return "noshortname";
- }
-
- int
- noclose ( void /* X */ * _kobjt)
- {
-
- printf("noclose\n");
- return 0;
- }
-
- int
- noinsync (void /* X */ * _kobjt)
- {
-
- printf("noinsync\n");
- return 0;
- }
-
- int
-	noalloc ( void /* x */ * _kobjt, uint8_t _chn, uint8_t _note)
- {
- printf("noalloc\n");
- return 0;
- }
-}
-
-METHOD int killnote {
- void /* X */ *_kobj;
- uint8_t _chan;
- uint8_t _note;
- uint8_t _vel;
-} DEFAULT nokillnote;
-
-METHOD int startnote {
- void /* X */ *_kobj;
- uint8_t _voice;
- uint8_t _note;
- uint8_t _parm;
-} DEFAULT nostartnote;
-
-METHOD int setinstr {
- void /* X */ *_kobj;
- uint8_t _chn;
- uint16_t _patchno;
-} DEFAULT nosetinstr;
-
-METHOD int hwcontrol {
- void /* X */ *_kobj;
- uint8_t *_event;
-} DEFAULT nohwcontrol;
-
-METHOD int aftertouch {
- void /* X */ *_kobj;
- uint8_t _x1;
- uint8_t _x2;
-} DEFAULT noaftertouch;
-
-METHOD int panning {
- void /* X */ *_kobj;
- uint8_t _x1;
- uint8_t _x2;
-} DEFAULT nopanning;
-
-METHOD int controller {
- void /* X */ *_kobj;
- uint8_t _x1;
- uint8_t _x2;
- uint16_t _x3;
-} DEFAULT nocontroller;
-
-METHOD int volumemethod {
- void /* X */ *_kobj;
- uint8_t _x1;
-} DEFAULT novolumemethod;
-
-METHOD int bender {
- void /* X */ *_kobj;
- uint8_t _voice;
- uint16_t _bend;
-} DEFAULT nobender;
-
-METHOD int setupvoice {
- void /* X */ *_kobj;
- uint8_t _voice;
- uint8_t _chn;
-} DEFAULT nosetupvoice;
-
-METHOD int sendsysex {
- void /* X */ *_kobj;
- void *_buf;
- size_t _len;
-} DEFAULT nosendsysex;
-
-METHOD int allocvoice {
- void /* X */ *_kobj;
- uint8_t _chn;
- uint8_t _note;
- void *_x;
-} DEFAULT noallocvoice;
-
-METHOD int writeraw {
- void /* X */ *_kobjt;
- uint8_t *_buf;
- size_t _len;
-} DEFAULT nowriteraw;
-
-METHOD int reset {
- void /* X */ *_kobjt;
-} DEFAULT noreset;
-
-METHOD char * shortname {
- void /* X */ *_kobjt;
-} DEFAULT noshortname;
-
-METHOD int open {
- void /* X */ *_kobjt;
-	void *_synth;
- int _mode;
-} DEFAULT noopen;
-
-METHOD int close {
- void /* X */ *_kobjt;
-} DEFAULT noclose;
-
-METHOD int query {
- void /* X */ *_kobjt;
-} DEFAULT noquery;
-
-METHOD int insync {
- void /* X */ *_kobjt;
-} DEFAULT noinsync;
-
-METHOD int alloc {
-	void /* X */ *_kobjt;
- uint8_t _chn;
- uint8_t _note;
-} DEFAULT noalloc;
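
For reference, synth_if.m was input to the kobj(9) method-table generator: each METHOD
became a SYNTH_*() dispatch macro, with the corresponding no-op from the CODE block as
the DEFAULT fallback. A hedged sketch of how a driver class would have plugged in (the
mydrv names are ours, purely illustrative):

    /* Hypothetical consumer of the removed synth interface. */
    #include <sys/param.h>
    #include <sys/kobj.h>
    #include "synth_if.h"       /* was generated from synth_if.m */

    static int
    mydrv_startnote(void *obj, uint8_t voice, uint8_t note, uint8_t parm)
    {
        /* Program a hardware voice here. */
        return (0);
    }

    static kobj_method_t mydrv_methods[] = {
        /* Anything not listed falls back to its no-op DEFAULT. */
        KOBJMETHOD(synth_startnote, mydrv_startnote),
        KOBJMETHOD_END
    };

    DEFINE_CLASS(mydrv, mydrv_methods, 0);

    /* Callers then dispatch through the generated wrapper:
     *      SYNTH_STARTNOTE(obj, voice, note, parm);
     * which resolves to mydrv_startnote() for objects of this class. */
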
diff --git a/sys/dev/sound/pcm/mixer.c b/sys/dev/sound/pcm/mixer.c
index 092af3298f0e..f281dff36248 100644
--- a/sys/dev/sound/pcm/mixer.c
+++ b/sys/dev/sound/pcm/mixer.c
@@ -750,8 +750,8 @@ mixer_init(device_t dev, kobj_class_t cls, void *devinfo)
mixer_setrecsrc(m, 0); /* Set default input. */
- pdev = make_dev(&mixer_cdevsw, SND_DEV_CTL, UID_ROOT, GID_WHEEL, 0666,
- "mixer%d", unit);
+ pdev = make_dev(&mixer_cdevsw, 0, UID_ROOT, GID_WHEEL, 0666, "mixer%d",
+ unit);
pdev->si_drv1 = m;
snddev->mixer_dev = pdev;
diff --git a/sys/dev/sound/pcm/sndstat.c b/sys/dev/sound/pcm/sndstat.c
index 509a35c5a038..51d0fb3bb686 100644
--- a/sys/dev/sound/pcm/sndstat.c
+++ b/sys/dev/sound/pcm/sndstat.c
@@ -52,7 +52,6 @@
#define SS_TYPE_PCM 1
#define SS_TYPE_MIDI 2
-#define SS_TYPE_SEQUENCER 3
static d_open_t sndstat_open;
static void sndstat_close(void *);
@@ -1165,8 +1164,6 @@ sndstat_register(device_t dev, char *str)
type = SS_TYPE_PCM;
else if (!strcmp(devtype, "midi"))
type = SS_TYPE_MIDI;
- else if (!strcmp(devtype, "sequencer"))
- type = SS_TYPE_SEQUENCER;
else
return (EINVAL);
@@ -1441,8 +1438,8 @@ static void
sndstat_sysinit(void *p)
{
sx_init(&sndstat_lock, "sndstat lock");
- sndstat_dev = make_dev(&sndstat_cdevsw, SND_DEV_STATUS,
- UID_ROOT, GID_WHEEL, 0644, "sndstat");
+ sndstat_dev = make_dev(&sndstat_cdevsw, 0, UID_ROOT, GID_WHEEL, 0644,
+ "sndstat");
}
SYSINIT(sndstat_sysinit, SI_SUB_DRIVERS, SI_ORDER_FIRST, sndstat_sysinit, NULL);
diff --git a/sys/dev/sound/pcm/sound.h b/sys/dev/sound/pcm/sound.h
index 315452e294d1..6bd435d0ea25 100644
--- a/sys/dev/sound/pcm/sound.h
+++ b/sys/dev/sound/pcm/sound.h
@@ -148,14 +148,6 @@ struct snd_mixer;
#define RANGE(var, low, high) (var) = \
(((var)<(low))? (low) : ((var)>(high))? (high) : (var))
-enum {
- SND_DEV_CTL = 0, /* Control port /dev/mixer */
- SND_DEV_SEQ, /* Sequencer /dev/sequencer */
- SND_DEV_MIDIN, /* Raw midi access */
- SND_DEV_DSP, /* Digitized voice /dev/dsp */
- SND_DEV_STATUS, /* /dev/sndstat */
-};
-
#define DSP_DEFAULT_SPEED 8000
extern int snd_unit;
diff --git a/sys/dev/ufshci/ufshci_private.h b/sys/dev/ufshci/ufshci_private.h
index cac743884ee6..ac58d44102a0 100644
--- a/sys/dev/ufshci/ufshci_private.h
+++ b/sys/dev/ufshci/ufshci_private.h
@@ -149,6 +149,8 @@ struct ufshci_hw_queue {
bus_dmamap_t queuemem_map;
bus_addr_t req_queue_addr;
+ bus_addr_t *ucd_bus_addr;
+
uint32_t num_entries;
uint32_t num_trackers;
@@ -198,8 +200,6 @@ struct ufshci_req_queue {
bus_dma_tag_t dma_tag_payload;
bus_dmamap_t ucdmem_map;
-
- bus_addr_t ucd_addr;
};
struct ufshci_device {
diff --git a/sys/dev/ufshci/ufshci_req_sdb.c b/sys/dev/ufshci/ufshci_req_sdb.c
index 4670281d367a..b1f303afaef5 100644
--- a/sys/dev/ufshci/ufshci_req_sdb.c
+++ b/sys/dev/ufshci/ufshci_req_sdb.c
@@ -48,6 +48,29 @@ ufshci_req_sdb_cmd_desc_destroy(struct ufshci_req_queue *req_queue)
}
}
+static void
+ufshci_ucd_map(void *arg, bus_dma_segment_t *seg, int nseg, int error)
+{
+ struct ufshci_hw_queue *hwq = arg;
+ int i;
+
+ if (error != 0) {
+ printf("ufshci: Failed to map UCD, error = %d\n", error);
+ return;
+ }
+
+ if (hwq->num_trackers != nseg) {
+ printf(
+ "ufshci: Failed to map UCD, num_trackers = %d, nseg = %d\n",
+ hwq->num_trackers, nseg);
+ return;
+ }
+
+ for (i = 0; i < nseg; i++) {
+ hwq->ucd_bus_addr[i] = seg[i].ds_addr;
+ }
+}
+
static int
ufshci_req_sdb_cmd_desc_construct(struct ufshci_req_queue *req_queue,
uint32_t num_entries, struct ufshci_controller *ctrlr)
@@ -55,7 +78,6 @@ ufshci_req_sdb_cmd_desc_construct(struct ufshci_req_queue *req_queue,
struct ufshci_hw_queue *hwq = &req_queue->hwq[UFSHCI_SDB_Q];
struct ufshci_tracker *tr;
size_t ucd_allocsz, payload_allocsz;
- uint64_t ucdmem_phys;
uint8_t *ucdmem;
int i, error;
@@ -71,10 +93,11 @@ ufshci_req_sdb_cmd_desc_construct(struct ufshci_req_queue *req_queue,
* Allocate physical memory for UTP Command Descriptor (UCD)
* Note: UFSHCI UCD format is restricted to 128-byte alignment.
*/
- error = bus_dma_tag_create(bus_get_dma_tag(ctrlr->dev), 128,
- ctrlr->page_size, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL,
- ucd_allocsz, howmany(ucd_allocsz, ctrlr->page_size),
- ctrlr->page_size, 0, NULL, NULL, &req_queue->dma_tag_ucd);
+ error = bus_dma_tag_create(bus_get_dma_tag(ctrlr->dev), 128, 0,
+ BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, ucd_allocsz,
+ howmany(ucd_allocsz, sizeof(struct ufshci_utp_cmd_desc)),
+ sizeof(struct ufshci_utp_cmd_desc), 0, NULL, NULL,
+ &req_queue->dma_tag_ucd);
if (error != 0) {
ufshci_printf(ctrlr, "request cmd desc tag create failed %d\n",
error);
@@ -88,7 +111,7 @@ ufshci_req_sdb_cmd_desc_construct(struct ufshci_req_queue *req_queue,
}
if (bus_dmamap_load(req_queue->dma_tag_ucd, req_queue->ucdmem_map,
- ucdmem, ucd_allocsz, ufshci_single_map, &ucdmem_phys, 0) != 0) {
+ ucdmem, ucd_allocsz, ufshci_ucd_map, hwq, 0) != 0) {
ufshci_printf(ctrlr, "failed to load cmd desc memory\n");
bus_dmamem_free(req_queue->dma_tag_ucd, req_queue->ucd,
req_queue->ucdmem_map);
@@ -96,7 +119,6 @@ ufshci_req_sdb_cmd_desc_construct(struct ufshci_req_queue *req_queue,
}
req_queue->ucd = (struct ufshci_utp_cmd_desc *)ucdmem;
- req_queue->ucd_addr = ucdmem_phys;
/*
* Allocate physical memory for PRDT
@@ -128,10 +150,9 @@ ufshci_req_sdb_cmd_desc_construct(struct ufshci_req_queue *req_queue,
tr->slot_state = UFSHCI_SLOT_STATE_FREE;
tr->ucd = (struct ufshci_utp_cmd_desc *)ucdmem;
- tr->ucd_bus_addr = ucdmem_phys;
+ tr->ucd_bus_addr = hwq->ucd_bus_addr[i];
ucdmem += sizeof(struct ufshci_utp_cmd_desc);
- ucdmem_phys += sizeof(struct ufshci_utp_cmd_desc);
hwq->act_tr[i] = tr;
}
@@ -175,6 +196,11 @@ ufshci_req_sdb_construct(struct ufshci_controller *ctrlr,
req_queue->hwq = malloc(sizeof(struct ufshci_hw_queue), M_UFSHCI,
M_ZERO | M_NOWAIT);
hwq = &req_queue->hwq[UFSHCI_SDB_Q];
+ hwq->num_entries = req_queue->num_entries;
+ hwq->num_trackers = req_queue->num_trackers;
+ req_queue->hwq->ucd_bus_addr = malloc(sizeof(bus_addr_t) *
+ req_queue->num_trackers,
+ M_UFSHCI, M_ZERO | M_NOWAIT);
mtx_init(&hwq->qlock, "ufshci req_queue lock", NULL, MTX_DEF);
@@ -277,6 +303,7 @@ ufshci_req_sdb_destroy(struct ufshci_controller *ctrlr,
if (mtx_initialized(&hwq->qlock))
mtx_destroy(&hwq->qlock);
+ free(req_queue->hwq->ucd_bus_addr, M_UFSHCI);
free(req_queue->hwq, M_UFSHCI);
}
diff --git a/sys/dev/usb/controller/xhci_pci.c b/sys/dev/usb/controller/xhci_pci.c
index b50e33ea36ce..d5cfd228a429 100644
--- a/sys/dev/usb/controller/xhci_pci.c
+++ b/sys/dev/usb/controller/xhci_pci.c
@@ -99,6 +99,11 @@ xhci_pci_match(device_t self)
return ("AMD Starship USB 3.0 controller");
case 0x149c1022:
return ("AMD Matisse USB 3.0 controller");
+ case 0x15b61022:
+ case 0x15b71022:
+ return ("AMD Raphael/Granite Ridge USB 3.1 controller");
+ case 0x15b81022:
+ return ("AMD Raphael/Granite Ridge USB 2.0 controller");
case 0x15e01022:
case 0x15e11022:
return ("AMD Raven USB 3.1 controller");
@@ -109,6 +114,8 @@ xhci_pci_match(device_t self)
return ("AMD 300 Series USB 3.1 controller");
case 0x43d51022:
return ("AMD 400 Series USB 3.1 controller");
+ case 0x43f71022:
+ return ("AMD 600 Series USB 3.2 controller");
case 0x78121022:
case 0x78141022:
case 0x79141022:
diff --git a/sys/dev/vmm/vmm_dev.c b/sys/dev/vmm/vmm_dev.c
index 1ffa15dd157b..819debadd1ac 100644
--- a/sys/dev/vmm/vmm_dev.c
+++ b/sys/dev/vmm/vmm_dev.c
@@ -351,6 +351,7 @@ static const struct vmmdev_ioctl vmmdev_ioctls[] = {
VMMDEV_IOCTL(VM_ACTIVATE_CPU, VMMDEV_IOCTL_LOCK_ONE_VCPU),
VMMDEV_IOCTL(VM_INJECT_EXCEPTION, VMMDEV_IOCTL_LOCK_ONE_VCPU),
VMMDEV_IOCTL(VM_STATS, VMMDEV_IOCTL_LOCK_ONE_VCPU),
+ VMMDEV_IOCTL(VM_STAT_DESC, 0),
#if defined(__amd64__) && defined(COMPAT_FREEBSD12)
VMMDEV_IOCTL(VM_ALLOC_MEMSEG_12,
diff --git a/sys/dev/vt/hw/vga/vt_vga.c b/sys/dev/vt/hw/vga/vt_vga.c
index 64039575c0ad..675c0573bd7e 100644
--- a/sys/dev/vt/hw/vga/vt_vga.c
+++ b/sys/dev/vt/hw/vga/vt_vga.c
@@ -1347,7 +1347,7 @@ vga_postswitch(struct vt_device *vd)
/* Reinit VGA mode, to restore view after app which change mode. */
vga_initialize(vd, (vd->vd_flags & VDF_TEXTMODE));
- /* Ask vt(9) to update chars on visible area. */
+ /* Ask vt(4) to update chars on visible area. */
vd->vd_flags |= VDF_INVALID;
}
diff --git a/sys/dev/vt/vt_core.c b/sys/dev/vt/vt_core.c
index b0f58b38a6f1..b51ef6766de4 100644
--- a/sys/dev/vt/vt_core.c
+++ b/sys/dev/vt/vt_core.c
@@ -125,10 +125,10 @@ static const struct terminal_class vt_termclass = {
(vw)->vw_number)
static SYSCTL_NODE(_kern, OID_AUTO, vt, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
- "vt(9) parameters");
+ "vt(4) parameters");
static VT_SYSCTL_INT(enable_altgr, 1, "Enable AltGr key (Do not assume R.Alt as Alt)");
static VT_SYSCTL_INT(enable_bell, 0, "Enable bell");
-static VT_SYSCTL_INT(debug, 0, "vt(9) debug level");
+static VT_SYSCTL_INT(debug, 0, "vt(4) debug level");
static VT_SYSCTL_INT(deadtimer, 15, "Time to wait busy process in VT_PROCESS mode");
static VT_SYSCTL_INT(suspendswitch, 1, "Switch to VT0 before suspend");
diff --git a/sys/fs/fdescfs/fdesc_vnops.c b/sys/fs/fdescfs/fdesc_vnops.c
index 676ea5de12b8..58a22b8bdc50 100644
--- a/sys/fs/fdescfs/fdesc_vnops.c
+++ b/sys/fs/fdescfs/fdesc_vnops.c
@@ -547,6 +547,8 @@ fdesc_readdir(struct vop_readdir_args *ap)
fmp = VFSTOFDESC(ap->a_vp->v_mount);
if (ap->a_ncookies != NULL)
*ap->a_ncookies = 0;
+ if (ap->a_eofflag != NULL)
+ *ap->a_eofflag = 0;
off = (int)uio->uio_offset;
if (off != uio->uio_offset || off < 0 || (u_int)off % UIO_MX != 0 ||
@@ -559,7 +561,12 @@ fdesc_readdir(struct vop_readdir_args *ap)
fcnt = i - 2; /* The first two nodes are `.' and `..' */
FILEDESC_SLOCK(fdp);
- while (i < fdp->fd_nfiles + 2 && uio->uio_resid >= UIO_MX) {
+ while (uio->uio_resid >= UIO_MX) {
+ if (i >= fdp->fd_nfiles + 2) {
+ if (ap->a_eofflag != NULL)
+ *ap->a_eofflag = 1;
+ break;
+ }
bzero((caddr_t)dp, UIO_MX);
switch (i) {
case 0: /* `.' */
diff --git a/sys/fs/fuse/fuse_file.h b/sys/fs/fuse/fuse_file.h
index 2a90e66d1b23..232132473953 100644
--- a/sys/fs/fuse/fuse_file.h
+++ b/sys/fs/fuse/fuse_file.h
@@ -139,7 +139,7 @@ struct fuse_filehandle {
/*
* flags returned by FUSE_OPEN
- * Supported flags: FOPEN_DIRECT_IO, FOPEN_KEEP_CACHE
+ * Supported flags: FOPEN_DIRECT_IO, FOPEN_KEEP_CACHE, FOPEN_NOFLUSH
* Unsupported:
* FOPEN_NONSEEKABLE: Adding support would require a new per-file
* or per-vnode attribute, which would have to be checked by
diff --git a/sys/fs/fuse/fuse_kernel.h b/sys/fs/fuse/fuse_kernel.h
index c95caf898ad8..942448b47365 100644
--- a/sys/fs/fuse/fuse_kernel.h
+++ b/sys/fs/fuse/fuse_kernel.h
@@ -182,6 +182,12 @@
* - add FUSE_OPEN_KILL_SUIDGID
* - extend fuse_setxattr_in, add FUSE_SETXATTR_EXT
* - add FUSE_SETXATTR_ACL_KILL_SGID
+ *
+ * 7.34
+ * - add FUSE_SYNCFS
+ *
+ * 7.35
+ * - add FOPEN_NOFLUSH
*/
#ifndef _FUSE_FUSE_KERNEL_H
@@ -217,7 +223,7 @@
#define FUSE_KERNEL_VERSION 7
/** Minor version number of this interface */
-#define FUSE_KERNEL_MINOR_VERSION 33
+#define FUSE_KERNEL_MINOR_VERSION 35
/** The node ID of the root inode */
#define FUSE_ROOT_ID 1
@@ -288,12 +294,14 @@ struct fuse_file_lock {
* FOPEN_NONSEEKABLE: the file is not seekable
* FOPEN_CACHE_DIR: allow caching this directory
* FOPEN_STREAM: the file is stream-like (no file position at all)
+ * FOPEN_NOFLUSH: don't flush data cache on close (unless FUSE_WRITEBACK_CACHE)
*/
#define FOPEN_DIRECT_IO (1 << 0)
#define FOPEN_KEEP_CACHE (1 << 1)
#define FOPEN_NONSEEKABLE (1 << 2)
#define FOPEN_CACHE_DIR (1 << 3)
#define FOPEN_STREAM (1 << 4)
+#define FOPEN_NOFLUSH (1 << 5)
/**
* INIT request/reply flags
@@ -518,6 +526,7 @@ enum fuse_opcode {
FUSE_COPY_FILE_RANGE = 47,
FUSE_SETUPMAPPING = 48,
FUSE_REMOVEMAPPING = 49,
+ FUSE_SYNCFS = 50,
#ifdef linux
/* CUSE specific operations */
@@ -939,7 +948,8 @@ struct fuse_notify_retrieve_in {
};
/* Device ioctls: */
-#define FUSE_DEV_IOC_CLONE _IOR(229, 0, uint32_t)
+#define FUSE_DEV_IOC_MAGIC 229
+#define FUSE_DEV_IOC_CLONE _IOR(FUSE_DEV_IOC_MAGIC, 0, uint32_t)
struct fuse_lseek_in {
uint64_t fh;
@@ -992,4 +1002,8 @@ struct fuse_removemapping_one {
#define FUSE_REMOVEMAPPING_MAX_ENTRY \
(PAGE_SIZE / sizeof(struct fuse_removemapping_one))
+struct fuse_syncfs_in {
+ uint64_t padding;
+};
+
#endif /* _FUSE_FUSE_KERNEL_H */
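
The FUSE_DEV_IOC_CLONE rework above only gives the previously magic ioctl group (229) a name; the ioctl itself is unchanged. For context, a FUSE server uses it to bind an additional /dev/fuse descriptor to an existing session, typically one per worker thread. A userspace sketch, assuming a kernel that implements the clone ioctl:

	#include <sys/ioctl.h>
	#include <fcntl.h>
	#include <stdint.h>
	#include <unistd.h>

	#define FUSE_DEV_IOC_MAGIC	229
	#define FUSE_DEV_IOC_CLONE	_IOR(FUSE_DEV_IOC_MAGIC, 0, uint32_t)

	/* Return a new /dev/fuse fd attached to session_fd's session, or -1. */
	static int
	fuse_clone_session(int session_fd)
	{
		uint32_t orig = (uint32_t)session_fd;
		int newfd;

		newfd = open("/dev/fuse", O_RDWR | O_CLOEXEC);
		if (newfd < 0)
			return (-1);
		if (ioctl(newfd, FUSE_DEV_IOC_CLONE, &orig) == -1) {
			close(newfd);
			return (-1);
		}
		return (newfd);
	}
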
diff --git a/sys/fs/fuse/fuse_vnops.c b/sys/fs/fuse/fuse_vnops.c
index 107e6db299e0..ae28617537fd 100644
--- a/sys/fs/fuse/fuse_vnops.c
+++ b/sys/fs/fuse/fuse_vnops.c
@@ -89,6 +89,8 @@
#include <sys/buf.h>
#include <sys/sysctl.h>
#include <sys/vmmeter.h>
+#define EXTERR_CATEGORY EXTERR_CAT_FUSE
+#include <sys/exterrvar.h>
#include <vm/vm.h>
#include <vm/vm_extern.h>
@@ -289,6 +291,10 @@ fuse_flush(struct vnode *vp, struct ucred *cred, pid_t pid, int fflag)
if (err)
return err;
+ if ((fufh->fuse_open_flags & FOPEN_NOFLUSH) != 0 &&
+ !fsess_opt_writeback(vnode_mount(vp)))
+ return (0);
+
fdisp_init(&fdi, sizeof(*ffi));
fdisp_make_vp(&fdi, FUSE_FLUSH, vp, td, cred);
ffi = fdi.indata;
@@ -435,7 +441,8 @@ fuse_vnop_access(struct vop_access_args *ap)
if (vnode_isvroot(vp)) {
return 0;
}
- return ENXIO;
+ return (EXTERROR(ENXIO, "This FUSE session is about "
+ "to be closed"));
}
if (!(data->dataflags & FSESS_INITED)) {
if (vnode_isvroot(vp)) {
@@ -444,7 +451,8 @@ fuse_vnop_access(struct vop_access_args *ap)
return 0;
}
}
- return EBADF;
+ return (EXTERROR(EBADF, "Access denied until FUSE session "
+ "is initialized"));
}
if (vnode_islnk(vp)) {
return 0;
@@ -485,7 +493,8 @@ fuse_vnop_advlock(struct vop_advlock_args *ap)
dataflags = fuse_get_mpdata(vnode_mount(vp))->dataflags;
if (fuse_isdeadfs(vp)) {
- return ENXIO;
+ return (EXTERROR(ENXIO, "This FUSE session is about "
+ "to be closed"));
}
switch(ap->a_op) {
@@ -502,7 +511,7 @@ fuse_vnop_advlock(struct vop_advlock_args *ap)
op = FUSE_SETLK;
break;
default:
- return EINVAL;
+ return (EXTERROR(EINVAL, "Unsupported lock flags"));
}
if (!(dataflags & FSESS_POSIX_LOCKS))
@@ -530,14 +539,14 @@ fuse_vnop_advlock(struct vop_advlock_args *ap)
size = vattr.va_size;
if (size > OFF_MAX ||
(fl->l_start > 0 && size > OFF_MAX - fl->l_start)) {
- err = EOVERFLOW;
+ err = EXTERROR(EOVERFLOW, "Offset is too large");
goto out;
}
start = size + fl->l_start;
break;
default:
- return (EINVAL);
+ return (EXTERROR(EINVAL, "Unsupported offset type"));
}
err = fuse_filehandle_get_anyflags(vp, &fufh, cred, pid);
@@ -599,15 +608,14 @@ fuse_vnop_allocate(struct vop_allocate_args *ap)
int err;
if (fuse_isdeadfs(vp))
- return (ENXIO);
+ return (EXTERROR(ENXIO, "This FUSE session is about "
+ "to be closed"));
switch (vp->v_type) {
case VFIFO:
return (ESPIPE);
case VLNK:
case VREG:
- if (vfs_isrdonly(mp))
- return (EROFS);
break;
default:
return (ENODEV);
@@ -617,7 +625,8 @@ fuse_vnop_allocate(struct vop_allocate_args *ap)
return (EROFS);
if (fsess_not_impl(mp, FUSE_FALLOCATE))
- return (EINVAL);
+ return (EXTERROR(EINVAL, "This server does not implement "
+ "FUSE_FALLOCATE"));
io.uio_offset = *offset;
io.uio_resid = *len;
@@ -647,13 +656,14 @@ fuse_vnop_allocate(struct vop_allocate_args *ap)
if (err == ENOSYS) {
fsess_set_notimpl(mp, FUSE_FALLOCATE);
- err = EINVAL;
+ err = EXTERROR(EINVAL, "This server does not implement "
+ "FUSE_ALLOCATE");
} else if (err == EOPNOTSUPP) {
/*
* The file system server does not support FUSE_FALLOCATE with
* the supplied mode for this particular file.
*/
- err = EINVAL;
+ err = EXTERROR(EINVAL, "This file can't be pre-allocated");
} else if (!err) {
*offset += *len;
*len = 0;
@@ -699,7 +709,8 @@ fuse_vnop_bmap(struct vop_bmap_args *ap)
int maxrun;
if (fuse_isdeadfs(vp)) {
- return ENXIO;
+ return (EXTERROR(ENXIO, "This FUSE session is about "
+ "to be closed"));
}
mp = vnode_mount(vp);
@@ -866,19 +877,21 @@ fuse_vnop_copy_file_range(struct vop_copy_file_range_args *ap)
pid_t pid;
int err;
- err = ENOSYS;
if (mp == NULL || mp != vnode_mount(outvp))
- goto fallback;
+ return (EXTERROR(ENOSYS, "Mount points do not match"));
if (incred->cr_uid != outcred->cr_uid)
- goto fallback;
+ return (EXTERROR(ENOSYS, "FUSE_COPY_FILE_RANGE does not "
+ "support different credentials for infd and outfd"));
if (incred->cr_groups[0] != outcred->cr_groups[0])
- goto fallback;
+ return (EXTERROR(ENOSYS, "FUSE_COPY_FILE_RANGE does not "
+ "support different credentials for infd and outfd"));
/* Caller busied mp, mnt_data can be safely accessed. */
if (fsess_not_impl(mp, FUSE_COPY_FILE_RANGE))
- goto fallback;
+ return (EXTERROR(ENOSYS, "This daemon does not "
+ "implement COPY_FILE_RANGE"));
if (ap->a_fsizetd == NULL)
td = curthread;
@@ -888,7 +901,7 @@ fuse_vnop_copy_file_range(struct vop_copy_file_range_args *ap)
vn_lock_pair(invp, false, LK_SHARED, outvp, false, LK_EXCLUSIVE);
if (invp->v_data == NULL || outvp->v_data == NULL) {
- err = EBADF;
+ err = EXTERROR(EBADF, "vnode got reclaimed");
goto unlock;
}
@@ -952,7 +965,6 @@ unlock:
if (err == ENOSYS)
fsess_set_notimpl(mp, FUSE_COPY_FILE_RANGE);
-fallback:
/*
* No need to call vn_rlimit_fsizex_res before return, since the uio is
@@ -1020,7 +1032,8 @@ fuse_vnop_create(struct vop_create_args *ap)
int flags;
if (fuse_isdeadfs(dvp))
- return ENXIO;
+ return (EXTERROR(ENXIO, "This FUSE session is about "
+ "to be closed"));
/* FUSE expects sockets to be created with FUSE_MKNOD */
if (vap->va_type == VSOCK)
@@ -1036,7 +1049,7 @@ fuse_vnop_create(struct vop_create_args *ap)
bzero(&fdi, sizeof(fdi));
if (vap->va_type != VREG)
- return (EINVAL);
+ return (EXTERROR(EINVAL, "Only regular files can be created"));
if (fsess_not_impl(mp, FUSE_CREATE) || vap->va_type == VSOCK) {
/* Fallback to FUSE_MKNOD/FUSE_OPEN */
@@ -1217,8 +1230,8 @@ fuse_vnop_getattr(struct vop_getattr_args *ap)
if (!(dataflags & FSESS_INITED)) {
if (!vnode_isvroot(vp)) {
fdata_set_dead(fuse_get_mpdata(vnode_mount(vp)));
- err = ENOTCONN;
- return err;
+ return (EXTERROR(ENOTCONN, "FUSE daemon is not "
+ "initialized"));
} else {
goto fake;
}
@@ -1347,10 +1360,11 @@ fuse_vnop_link(struct vop_link_args *ap)
int err;
if (fuse_isdeadfs(vp)) {
- return ENXIO;
+ return (EXTERROR(ENXIO, "This FUSE session is about "
+ "to be closed"));
}
if (vnode_mount(tdvp) != vnode_mount(vp)) {
- return EXDEV;
+ return (EXDEV);
}
/*
@@ -1360,7 +1374,7 @@ fuse_vnop_link(struct vop_link_args *ap)
* validating that nlink does not overflow.
*/
if (vap != NULL && vap->va_nlink >= FUSE_LINK_MAX)
- return EMLINK;
+ return (EMLINK);
fli.oldnodeid = VTOI(vp);
fdisp_init(&fdi, 0);
@@ -1372,12 +1386,13 @@ fuse_vnop_link(struct vop_link_args *ap)
feo = fdi.answ;
if (fli.oldnodeid != feo->nodeid) {
+ static const char exterr[] = "Server assigned wrong inode "
+ "for a hard link.";
struct fuse_data *data = fuse_get_mpdata(vnode_mount(vp));
- fuse_warn(data, FSESS_WARN_ILLEGAL_INODE,
- "Assigned wrong inode for a hard link.");
+ fuse_warn(data, FSESS_WARN_ILLEGAL_INODE, exterr);
fuse_vnode_clear_attr_cache(vp);
fuse_vnode_clear_attr_cache(tdvp);
- err = EIO;
+ err = EXTERROR(EIO, exterr);
goto out;
}
@@ -1454,7 +1469,8 @@ fuse_vnop_lookup(struct vop_lookup_args *ap)
if (fuse_isdeadfs(dvp)) {
*vpp = NULL;
- return ENXIO;
+ return (EXTERROR(ENXIO, "This FUSE session is about "
+ "to be closed"));
}
if (!vnode_isdir(dvp))
return ENOTDIR;
@@ -1474,7 +1490,8 @@ fuse_vnop_lookup(struct vop_lookup_args *ap)
* Since the file system doesn't support ".." lookups,
* we have no way to find this entry.
*/
- return ESTALE;
+ return (EXTERROR(ESTALE, "This server does not support "
+ "'..' lookups"));
}
nid = VTOFUD(dvp)->parent_nid;
if (nid == 0)
@@ -1597,11 +1614,11 @@ fuse_vnop_lookup(struct vop_lookup_args *ap)
vref(dvp);
*vpp = dvp;
} else {
+ static const char exterr[] = "Server assigned "
+ "same inode to both parent and child.";
fuse_warn(fuse_get_mpdata(mp),
- FSESS_WARN_ILLEGAL_INODE,
- "Assigned same inode to both parent and "
- "child.");
- err = EIO;
+ FSESS_WARN_ILLEGAL_INODE, exterr);
+ err = EXTERROR(EIO, exterr);
}
} else {
@@ -1689,7 +1706,8 @@ fuse_vnop_mkdir(struct vop_mkdir_args *ap)
struct fuse_mkdir_in fmdi;
if (fuse_isdeadfs(dvp)) {
- return ENXIO;
+ return (EXTERROR(ENXIO, "This FUSE session is about "
+ "to be closed"));
}
fmdi.mode = MAKEIMODE(vap->va_type, vap->va_mode);
fmdi.umask = curthread->td_proc->p_pd->pd_cmask;
@@ -1716,7 +1734,8 @@ fuse_vnop_mknod(struct vop_mknod_args *ap)
struct vattr *vap = ap->a_vap;
if (fuse_isdeadfs(dvp))
- return ENXIO;
+ return (EXTERROR(ENXIO, "This FUSE session is about "
+ "to be closed"));
return fuse_internal_mknod(dvp, vpp, cnp, vap);
}
@@ -1740,11 +1759,13 @@ fuse_vnop_open(struct vop_open_args *ap)
pid_t pid = td->td_proc->p_pid;
if (fuse_isdeadfs(vp))
- return ENXIO;
+ return (EXTERROR(ENXIO, "This FUSE session is about "
+ "to be closed"));
if (vp->v_type == VCHR || vp->v_type == VBLK || vp->v_type == VFIFO)
- return (EOPNOTSUPP);
+ return (EXTERROR(EOPNOTSUPP, "Unsupported vnode type",
+ vp->v_type));
if ((a_mode & (FREAD | FWRITE | FEXEC)) == 0)
- return EINVAL;
+ return (EXTERROR(EINVAL, "Illegal mode", a_mode));
if (fuse_filehandle_validrw(vp, a_mode, cred, pid)) {
fuse_vnode_open(vp, 0, td);
@@ -1826,7 +1847,8 @@ fuse_vnop_pathconf(struct vop_pathconf_args *ap)
return (0);
} else if (fsess_not_impl(mp, FUSE_LSEEK)) {
/* FUSE_LSEEK is not implemented */
- return (EINVAL);
+ return (EXTERROR(EINVAL, "This server does not "
+ "implement FUSE_LSEEK"));
} else {
return (err);
}
@@ -1860,7 +1882,8 @@ fuse_vnop_read(struct vop_read_args *ap)
MPASS(vp->v_type == VREG || vp->v_type == VDIR);
if (fuse_isdeadfs(vp)) {
- return ENXIO;
+ return (EXTERROR(ENXIO, "This FUSE session is about "
+ "to be closed"));
}
if (VTOFUD(vp)->flag & FN_DIRECTIO) {
@@ -1937,10 +1960,11 @@ fuse_vnop_readdir(struct vop_readdir_args *ap)
if (ap->a_eofflag)
*ap->a_eofflag = 0;
if (fuse_isdeadfs(vp)) {
- return ENXIO;
+ return (EXTERROR(ENXIO, "This FUSE session is about "
+ "to be closed"));
}
if (uio_resid(uio) < sizeof(struct dirent))
- return EINVAL;
+ return (EXTERROR(EINVAL, "Buffer is too small"));
tresid = uio->uio_resid;
err = fuse_filehandle_get_dir(vp, &fufh, cred, pid);
@@ -2010,7 +2034,8 @@ fuse_vnop_readlink(struct vop_readlink_args *ap)
int err;
if (fuse_isdeadfs(vp)) {
- return ENXIO;
+ return (EXTERROR(ENXIO, "This FUSE session is about "
+ "to be closed"));
}
if (!vnode_islnk(vp)) {
return EINVAL;
@@ -2021,10 +2046,11 @@ fuse_vnop_readlink(struct vop_readlink_args *ap)
goto out;
}
if (strnlen(fdi.answ, fdi.iosize) + 1 < fdi.iosize) {
+ static const char exterr[] = "Server returned an embedded NUL "
+ "from FUSE_READLINK.";
struct fuse_data *data = fuse_get_mpdata(vnode_mount(vp));
- fuse_warn(data, FSESS_WARN_READLINK_EMBEDDED_NUL,
- "Returned an embedded NUL from FUSE_READLINK.");
- err = EIO;
+ fuse_warn(data, FSESS_WARN_READLINK_EMBEDDED_NUL, exterr);
+ err = EXTERROR(EIO, exterr);
goto out;
}
if (((char *)fdi.answ)[0] == '/' &&
@@ -2108,10 +2134,11 @@ fuse_vnop_remove(struct vop_remove_args *ap)
int err;
if (fuse_isdeadfs(vp)) {
- return ENXIO;
+ return (EXTERROR(ENXIO, "This FUSE session is about "
+ "to be closed"));
}
if (vnode_isdir(vp)) {
- return EPERM;
+ return (EXTERROR(EPERM, "vnode is a directory"));
}
err = fuse_internal_remove(dvp, vp, cnp, FUSE_UNLINK);
@@ -2144,12 +2171,13 @@ fuse_vnop_rename(struct vop_rename_args *ap)
int err = 0;
if (fuse_isdeadfs(fdvp)) {
- return ENXIO;
+ return (EXTERROR(ENXIO, "This FUSE session is about "
+ "to be closed"));
}
if (fvp->v_mount != tdvp->v_mount ||
(tvp && fvp->v_mount != tvp->v_mount)) {
SDT_PROBE2(fusefs, , vnops, trace, 1, "cross-device rename");
- err = EXDEV;
+ err = EXTERROR(EXDEV, "Cross-device rename");
goto out;
}
cache_purge(fvp);
@@ -2220,10 +2248,12 @@ fuse_vnop_rmdir(struct vop_rmdir_args *ap)
int err;
if (fuse_isdeadfs(vp)) {
- return ENXIO;
+ return (EXTERROR(ENXIO, "This FUSE session is about "
+ "to be closed"));
}
if (VTOFUD(vp) == VTOFUD(dvp)) {
- return EINVAL;
+ return (EXTERROR(EINVAL, "Directory to be removed "
+ "contains itself"));
}
err = fuse_internal_remove(dvp, vp, ap->a_cnp, FUSE_RMDIR);
@@ -2260,7 +2290,8 @@ fuse_vnop_setattr(struct vop_setattr_args *ap)
checkperm = dataflags & FSESS_DEFAULT_PERMISSIONS;
if (fuse_isdeadfs(vp)) {
- return ENXIO;
+ return (EXTERROR(ENXIO, "This FUSE session is about "
+ "to be closed"));
}
if (vap->va_uid != (uid_t)VNOVAL) {
@@ -2425,7 +2456,8 @@ fuse_vnop_symlink(struct vop_symlink_args *ap)
size_t len;
if (fuse_isdeadfs(dvp)) {
- return ENXIO;
+ return (EXTERROR(ENXIO, "This FUSE session is about "
+ "to be closed"));
}
/*
* Unlike the other creator type calls, here we have to create a message
@@ -2471,7 +2503,8 @@ fuse_vnop_write(struct vop_write_args *ap)
MPASS(vp->v_type == VREG || vp->v_type == VDIR);
if (fuse_isdeadfs(vp)) {
- return ENXIO;
+ return (EXTERROR(ENXIO, "This FUSE session is about "
+ "to be closed"));
}
if (VTOFUD(vp)->flag & FN_DIRECTIO) {
@@ -2624,10 +2657,12 @@ fuse_vnop_getextattr(struct vop_getextattr_args *ap)
int err;
if (fuse_isdeadfs(vp))
- return (ENXIO);
+ return (EXTERROR(ENXIO, "This FUSE session is about "
+ "to be closed"));
if (fsess_not_impl(mp, FUSE_GETXATTR))
- return EOPNOTSUPP;
+ return (EXTERROR(EOPNOTSUPP, "This server does not implement "
+ "extended attributes"));
err = fuse_extattr_check_cred(vp, ap->a_attrnamespace, cred, td, VREAD);
if (err)
@@ -2665,7 +2700,8 @@ fuse_vnop_getextattr(struct vop_getextattr_args *ap)
if (err != 0) {
if (err == ENOSYS) {
fsess_set_notimpl(mp, FUSE_GETXATTR);
- err = EOPNOTSUPP;
+ err = (EXTERROR(EOPNOTSUPP, "This server does not "
+ "implement extended attributes"));
}
goto out;
}
@@ -2711,10 +2747,12 @@ fuse_vnop_setextattr(struct vop_setextattr_args *ap)
int err;
if (fuse_isdeadfs(vp))
- return (ENXIO);
+ return (EXTERROR(ENXIO, "This FUSE session is about "
+ "to be closed"));
if (fsess_not_impl(mp, FUSE_SETXATTR))
- return EOPNOTSUPP;
+ return (EXTERROR(EOPNOTSUPP, "This server does not implement "
+ "setting extended attributes"));
if (vfs_isrdonly(mp))
return EROFS;
@@ -2726,9 +2764,11 @@ fuse_vnop_setextattr(struct vop_setextattr_args *ap)
* return EOPNOTSUPP.
*/
if (fsess_not_impl(mp, FUSE_REMOVEXATTR))
- return (EOPNOTSUPP);
+ return (EXTERROR(EOPNOTSUPP, "This server does not "
+ "implement removing extended attributess"));
else
- return (EINVAL);
+ return (EXTERROR(EINVAL, "DELETEEXTATTR should be used "
+ "to remove extattrs"));
}
err = fuse_extattr_check_cred(vp, ap->a_attrnamespace, cred, td,
@@ -2774,7 +2814,8 @@ fuse_vnop_setextattr(struct vop_setextattr_args *ap)
if (err == ENOSYS) {
fsess_set_notimpl(mp, FUSE_SETXATTR);
- err = EOPNOTSUPP;
+ err = EXTERROR(EOPNOTSUPP, "This server does not implement "
+ "setting extended attributes");
}
if (err == ERESTART) {
/* Can't restart after calling uiomove */
@@ -2885,10 +2926,12 @@ fuse_vnop_listextattr(struct vop_listextattr_args *ap)
int err;
if (fuse_isdeadfs(vp))
- return (ENXIO);
+ return (EXTERROR(ENXIO, "This FUSE session is about "
+ "to be closed"));
if (fsess_not_impl(mp, FUSE_LISTXATTR))
- return EOPNOTSUPP;
+ return (EXTERROR(EOPNOTSUPP, "This server does not implement "
+ "extended attributes"));
err = fuse_extattr_check_cred(vp, ap->a_attrnamespace, cred, td, VREAD);
if (err)
@@ -2916,7 +2959,8 @@ fuse_vnop_listextattr(struct vop_listextattr_args *ap)
if (err != 0) {
if (err == ENOSYS) {
fsess_set_notimpl(mp, FUSE_LISTXATTR);
- err = EOPNOTSUPP;
+ err = EXTERROR(EOPNOTSUPP, "This server does not "
+ "implement extended attributes");
}
goto out;
}
@@ -3016,7 +3060,8 @@ fuse_vnop_deallocate(struct vop_deallocate_args *ap)
bool closefufh = false;
if (fuse_isdeadfs(vp))
- return (ENXIO);
+ return (EXTERROR(ENXIO, "This FUSE session is about "
+ "to be closed"));
if (vfs_isrdonly(mp))
return (EROFS);
@@ -3122,10 +3167,12 @@ fuse_vnop_deleteextattr(struct vop_deleteextattr_args *ap)
int err;
if (fuse_isdeadfs(vp))
- return (ENXIO);
+ return (EXTERROR(ENXIO, "This FUSE session is about "
+ "to be closed"));
if (fsess_not_impl(mp, FUSE_REMOVEXATTR))
- return EOPNOTSUPP;
+ return (EXTERROR(EOPNOTSUPP, "This server does not implement "
+ "removing extended attributes"));
if (vfs_isrdonly(mp))
return EROFS;
@@ -3154,7 +3201,8 @@ fuse_vnop_deleteextattr(struct vop_deleteextattr_args *ap)
err = fdisp_wait_answ(&fdi);
if (err == ENOSYS) {
fsess_set_notimpl(mp, FUSE_REMOVEXATTR);
- err = EOPNOTSUPP;
+ err = EXTERROR(EOPNOTSUPP, "This server does not implement "
+ "removing extended attributes");
}
fdisp_destroy(&fdi);
@@ -3208,7 +3256,8 @@ fuse_vnop_vptofh(struct vop_vptofh_args *ap)
/* NFS requires lookups for "." and ".." */
SDT_PROBE2(fusefs, , vnops, trace, 1,
"VOP_VPTOFH without FUSE_EXPORT_SUPPORT");
- return EOPNOTSUPP;
+ return (EXTERROR(EOPNOTSUPP, "This server is "
+ "missing FUSE_EXPORT_SUPPORT"));
}
if ((mp->mnt_flag & MNT_EXPORTED) &&
fsess_is_impl(mp, FUSE_OPENDIR))
@@ -3226,7 +3275,8 @@ fuse_vnop_vptofh(struct vop_vptofh_args *ap)
*/
SDT_PROBE2(fusefs, , vnops, trace, 1,
"VOP_VPTOFH with FUSE_OPENDIR");
- return EOPNOTSUPP;
+ return (EXTERROR(EOPNOTSUPP, "This server implements "
+ "FUSE_OPENDIR so is not compatible with getfh"));
}
err = fuse_internal_getattr(vp, &va, curthread->td_ucred, curthread);
@@ -3240,6 +3290,7 @@ fuse_vnop_vptofh(struct vop_vptofh_args *ap)
if (fvdat->generation <= UINT32_MAX)
fhp->gen = fvdat->generation;
else
- return EOVERFLOW;
+ return (EXTERROR(EOVERFLOW, "inode generation "
+ "number overflow"));
return (0);
}
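
All of the fuse_vnops.c hunks follow one conversion: bare errno returns become EXTERROR(errno, description), which records a human-readable cause through the kernel's extended-error facility; the category is fixed by defining EXTERR_CATEGORY before including <sys/exterrvar.h>, as the first hunk in this file does. The idiom in isolation (fuse_isdeadfs() is the fusefs predicate seen throughout the diff; fusefs-internal headers are assumed):

	#define EXTERR_CATEGORY	EXTERR_CAT_FUSE	/* must precede the include */
	#include <sys/exterrvar.h>

	static int
	example_vnop_guard(struct vnode *vp)
	{
		if (fuse_isdeadfs(vp))
			return (EXTERROR(ENXIO, "This FUSE session is about "
			    "to be closed"));
		return (0);
	}

The description is only captured for threads that have opted in to extended error reporting, so blanketing every return path this way is cheap on the normal path.
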
diff --git a/sys/fs/msdosfs/msdosfs_conv.c b/sys/fs/msdosfs/msdosfs_conv.c
index da4848169173..208b64930e61 100644
--- a/sys/fs/msdosfs/msdosfs_conv.c
+++ b/sys/fs/msdosfs/msdosfs_conv.c
@@ -797,19 +797,24 @@ mbsadjpos(const char **instr, size_t inlen, size_t outlen, int weight, int flag,
static u_char *
dos2unixchr(u_char *outbuf, const u_char **instr, size_t *ilen, int lower, struct msdosfsmount *pmp)
{
- u_char c, *outp;
- size_t len, olen;
+ u_char c, *outp, *outp1;
+ size_t i, len, olen;
outp = outbuf;
if (pmp->pm_flags & MSDOSFSMNT_KICONV && msdosfs_iconv) {
olen = len = 4;
+ outp1 = outp;
if (lower & (LCASE_BASE | LCASE_EXT))
msdosfs_iconv->convchr_case(pmp->pm_d2u, (const char **)instr,
ilen, (char **)&outp, &olen, KICONV_LOWER);
else
msdosfs_iconv->convchr(pmp->pm_d2u, (const char **)instr,
ilen, (char **)&outp, &olen);
+ for (i = 0; i < outp - outp1; i++) {
+ if (outp1[i] == '/')
+ outp1[i] = '?';
+ }
len -= olen;
/*
@@ -826,6 +831,8 @@ dos2unixchr(u_char *outbuf, const u_char **instr, size_t *ilen, int lower, struc
c = dos2unix[c];
if (lower & (LCASE_BASE | LCASE_EXT))
c = u2l[c];
+ if (c == '/')
+ c = '?';
*outp++ = c;
outbuf[1] = '\0';
}
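
Both conversion paths in dos2unixchr() now squash '/' to '?': a charset conversion (or a crafted on-disk name) could otherwise emit the one byte that is structurally significant to the VFS, yielding a directory entry that namei() can never match. The invariant as a standalone helper (illustrative, not from the diff):

	#include <sys/types.h>

	/* Ensure a converted name component cannot contain a path separator. */
	static void
	sanitize_component(u_char *buf, size_t len)
	{
		size_t i;

		for (i = 0; i < len; i++)
			if (buf[i] == '/')
				buf[i] = '?';
	}
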
diff --git a/sys/fs/msdosfs/msdosfs_lookup.c b/sys/fs/msdosfs/msdosfs_lookup.c
index e799a5ce05f6..8ab6d35a2685 100644
--- a/sys/fs/msdosfs/msdosfs_lookup.c
+++ b/sys/fs/msdosfs/msdosfs_lookup.c
@@ -845,7 +845,6 @@ doscheckpath(struct denode *source, struct denode *target, daddr_t *wait_scn)
*wait_scn = 0;
pmp = target->de_pmp;
- lockmgr_assert(&pmp->pm_checkpath_lock, KA_XLOCKED);
KASSERT(pmp == source->de_pmp,
("doscheckpath: source and target on different filesystems"));
diff --git a/sys/fs/msdosfs/msdosfs_vfsops.c b/sys/fs/msdosfs/msdosfs_vfsops.c
index adcffe45df82..4431d36c8a8e 100644
--- a/sys/fs/msdosfs/msdosfs_vfsops.c
+++ b/sys/fs/msdosfs/msdosfs_vfsops.c
@@ -575,7 +575,6 @@ mountmsdosfs(struct vnode *odevvp, struct mount *mp)
pmp->pm_bo = bo;
lockinit(&pmp->pm_fatlock, 0, msdosfs_lock_msg, 0, 0);
- lockinit(&pmp->pm_checkpath_lock, 0, "msdoscp", 0, 0);
TASK_INIT(&pmp->pm_rw2ro_task, 0, msdosfs_remount_ro, pmp);
@@ -871,7 +870,6 @@ error_exit:
}
if (pmp != NULL) {
lockdestroy(&pmp->pm_fatlock);
- lockdestroy(&pmp->pm_checkpath_lock);
free(pmp->pm_inusemap, M_MSDOSFSFAT);
free(pmp, M_MSDOSFSMNT);
mp->mnt_data = NULL;
@@ -971,7 +969,6 @@ msdosfs_unmount(struct mount *mp, int mntflags)
dev_rel(pmp->pm_dev);
free(pmp->pm_inusemap, M_MSDOSFSFAT);
lockdestroy(&pmp->pm_fatlock);
- lockdestroy(&pmp->pm_checkpath_lock);
free(pmp, M_MSDOSFSMNT);
mp->mnt_data = NULL;
return (error);
diff --git a/sys/fs/msdosfs/msdosfs_vnops.c b/sys/fs/msdosfs/msdosfs_vnops.c
index 6417b7dac16b..33e0d94954d7 100644
--- a/sys/fs/msdosfs/msdosfs_vnops.c
+++ b/sys/fs/msdosfs/msdosfs_vnops.c
@@ -945,7 +945,7 @@ msdosfs_rename(struct vop_rename_args *ap)
struct denode *fdip, *fip, *tdip, *tip, *nip;
u_char toname[12], oldname[11];
u_long to_diroffset;
- bool checkpath_locked, doingdirectory, newparent;
+ bool doingdirectory, newparent;
int error;
u_long cn, pcl, blkoff;
daddr_t bn, wait_scn, scn;
@@ -986,8 +986,6 @@ msdosfs_rename(struct vop_rename_args *ap)
if (tvp != NULL && tvp != tdvp)
VOP_UNLOCK(tvp);
- checkpath_locked = false;
-
relock:
doingdirectory = newparent = false;
@@ -1108,12 +1106,8 @@ relock:
if (doingdirectory && newparent) {
if (error != 0) /* write access check above */
goto unlock;
- lockmgr(&pmp->pm_checkpath_lock, LK_EXCLUSIVE, NULL);
- checkpath_locked = true;
error = doscheckpath(fip, tdip, &wait_scn);
if (wait_scn != 0) {
- lockmgr(&pmp->pm_checkpath_lock, LK_RELEASE, NULL);
- checkpath_locked = false;
VOP_UNLOCK(fdvp);
VOP_UNLOCK(tdvp);
VOP_UNLOCK(fvp);
@@ -1276,8 +1270,6 @@ relock:
cache_purge(fvp);
unlock:
- if (checkpath_locked)
- lockmgr(&pmp->pm_checkpath_lock, LK_RELEASE, NULL);
vput(fdvp);
vput(fvp);
if (tvp != NULL) {
@@ -1289,7 +1281,6 @@ unlock:
vput(tdvp);
return (error);
releout:
- MPASS(!checkpath_locked);
vrele(tdvp);
if (tvp != NULL)
vrele(tvp);
@@ -1530,6 +1521,9 @@ msdosfs_readdir(struct vop_readdir_args *ap)
ap->a_vp, uio, ap->a_cred, ap->a_eofflag);
#endif
+ if (ap->a_eofflag != NULL)
+ *ap->a_eofflag = 0;
+
/*
* msdosfs_readdir() won't operate properly on regular files since
* it does i/o only with the filesystem vnode, and hence can
@@ -1623,8 +1617,11 @@ msdosfs_readdir(struct vop_readdir_args *ap)
on = (offset - bias) & pmp->pm_crbomask;
n = min(pmp->pm_bpcluster - on, uio->uio_resid);
diff = dep->de_FileSize - (offset - bias);
- if (diff <= 0)
- break;
+ if (diff <= 0) {
+ if (ap->a_eofflag != NULL)
+ *ap->a_eofflag = 1;
+ goto out;
+ }
n = min(n, diff);
error = pcbmap(dep, lbn, &bn, &cn, &blsize);
if (error)
@@ -1655,6 +1652,8 @@ msdosfs_readdir(struct vop_readdir_args *ap)
*/
if (dentp->deName[0] == SLOT_EMPTY) {
brelse(bp);
+ if (ap->a_eofflag != NULL)
+ *ap->a_eofflag = 1;
goto out;
}
/*
@@ -1752,15 +1751,6 @@ out:
uio->uio_offset = off;
- /*
- * Set the eofflag (NFS uses it)
- */
- if (ap->a_eofflag) {
- if (dep->de_FileSize - (offset - bias) <= 0)
- *ap->a_eofflag = 1;
- else
- *ap->a_eofflag = 0;
- }
return (error);
}
@@ -1951,6 +1941,9 @@ msdosfs_pathconf(struct vop_pathconf_args *ap)
case _PC_NO_TRUNC:
*ap->a_retval = 0;
return (0);
+ case _PC_HAS_HIDDENSYSTEM:
+ *ap->a_retval = 1;
+ return (0);
default:
return (vop_stdpathconf(ap));
}
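
_PC_HAS_HIDDENSYSTEM is the new pathconf(2) name this commit teaches msdosfs, smbfs, tmpfs, and the NFS client and server to answer: a nonzero result means the filesystem can represent the DOS-style UF_HIDDEN/UF_SYSTEM flags. A userspace probe might look like this (a sketch, assuming headers that expose the new constant):

	#include <errno.h>
	#include <unistd.h>

	/*
	 * Returns 1 if path's filesystem supports hidden/system flags,
	 * 0 if not, -1 on error.
	 */
	static int
	has_hiddensystem(const char *path)
	{
		long r;

		errno = 0;
		r = pathconf(path, _PC_HAS_HIDDENSYSTEM);
		if (r == -1)
			return (errno == 0 ? 0 : -1);
		return (r > 0);
	}
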
diff --git a/sys/fs/msdosfs/msdosfsmount.h b/sys/fs/msdosfs/msdosfsmount.h
index fcaac544a74d..04e6b75bea2a 100644
--- a/sys/fs/msdosfs/msdosfsmount.h
+++ b/sys/fs/msdosfs/msdosfsmount.h
@@ -118,7 +118,6 @@ struct msdosfsmount {
void *pm_u2d; /* Unicode->DOS iconv handle */
void *pm_d2u; /* DOS->Local iconv handle */
struct lock pm_fatlock; /* lockmgr protecting allocations */
- struct lock pm_checkpath_lock; /* protects doscheckpath result */
struct task pm_rw2ro_task; /* context for emergency remount ro */
};
diff --git a/sys/fs/nfs/nfs_commonsubs.c b/sys/fs/nfs/nfs_commonsubs.c
index f46b0d282861..a957315aaa12 100644
--- a/sys/fs/nfs/nfs_commonsubs.c
+++ b/sys/fs/nfs/nfs_commonsubs.c
@@ -630,6 +630,10 @@ nfscl_fillsattr(struct nfsrv_descript *nd, struct vattr *vap,
NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_OWNERGROUP);
if ((flags & NFSSATTR_FULL) && vap->va_size != VNOVAL)
NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_SIZE);
+ if ((flags & NFSSATTR_FULL) && vap->va_flags != VNOVAL) {
+ NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_HIDDEN);
+ NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_SYSTEM);
+ }
if (vap->va_atime.tv_sec != VNOVAL)
NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_TIMEACCESSSET);
if (vap->va_mtime.tv_sec != VNOVAL)
@@ -643,7 +647,8 @@ nfscl_fillsattr(struct nfsrv_descript *nd, struct vattr *vap,
NFSATTRBIT_TIMECREATE))
NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_TIMECREATE);
(void) nfsv4_fillattr(nd, vp->v_mount, vp, NULL, vap, NULL, 0,
- &attrbits, NULL, NULL, 0, 0, 0, 0, (uint64_t)0, NULL);
+ &attrbits, NULL, NULL, 0, 0, 0, 0, (uint64_t)0, NULL,
+ false, false, false);
break;
}
}
@@ -1314,6 +1319,7 @@ nfsv4_loadattr(struct nfsrv_descript *nd, vnode_t vp,
u_int32_t freenum = 0, tuint;
u_int64_t uquad = 0, thyp, thyp2;
uint16_t tui16;
+ long has_pathconf;
#ifdef QUOTA
struct dqblk dqb;
uid_t savuid;
@@ -1421,6 +1427,16 @@ nfsv4_loadattr(struct nfsrv_descript *nd, vnode_t vp,
NFSCLRBIT_ATTRBIT(&checkattrbits, NFSATTRBIT_ACL);
NFSCLRBIT_ATTRBIT(&checkattrbits, NFSATTRBIT_ACLSUPPORT);
}
+ /* Some filesystems do not support UF_HIDDEN. */
+ if (vp == NULL || VOP_PATHCONF(vp,
+ _PC_HAS_HIDDENSYSTEM, &has_pathconf) != 0)
+ has_pathconf = 0;
+ if (has_pathconf == 0) {
+ NFSCLRBIT_ATTRBIT(&checkattrbits,
+ NFSATTRBIT_HIDDEN);
+ NFSCLRBIT_ATTRBIT(&checkattrbits,
+ NFSATTRBIT_SYSTEM);
+ }
if (!NFSEQUAL_ATTRBIT(&retattrbits, &checkattrbits)
|| retnotsup)
*retcmpp = NFSERR_NOTSAME;
@@ -1521,15 +1537,13 @@ nfsv4_loadattr(struct nfsrv_descript *nd, vnode_t vp,
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
if (compare) {
if (!(*retcmpp)) {
- long has_named_attr;
-
if (vp == NULL || VOP_PATHCONF(vp,
- _PC_HAS_NAMEDATTR, &has_named_attr)
+ _PC_HAS_NAMEDATTR, &has_pathconf)
!= 0)
- has_named_attr = 0;
- if ((has_named_attr != 0 &&
+ has_pathconf = 0;
+ if ((has_pathconf != 0 &&
*tl != newnfs_true) ||
- (has_named_attr == 0 &&
+ (has_pathconf == 0 &&
*tl != newnfs_false))
*retcmpp = NFSERR_NOTSAME;
}
@@ -1792,9 +1806,17 @@ nfsv4_loadattr(struct nfsrv_descript *nd, vnode_t vp,
free(cp2, M_NFSSTRING);
break;
case NFSATTRBIT_HIDDEN:
- NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
- if (compare && !(*retcmpp))
- *retcmpp = NFSERR_ATTRNOTSUPP;
+ NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED);
+ if (compare) {
+ if (!(*retcmpp) && ((*tl == newnfs_true &&
+ (nap->na_flags & UF_HIDDEN) == 0) ||
+ (*tl == newnfs_false &&
+ (nap->na_flags & UF_HIDDEN) != 0)))
+ *retcmpp = NFSERR_NOTSAME;
+ } else if (nap != NULL) {
+ if (*tl == newnfs_true)
+ nap->na_flags |= UF_HIDDEN;
+ }
attrsum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_HOMOGENEOUS:
@@ -2166,9 +2188,17 @@ nfsv4_loadattr(struct nfsrv_descript *nd, vnode_t vp,
attrsum += NFSX_HYPER;
break;
case NFSATTRBIT_SYSTEM:
- NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
- if (compare && !(*retcmpp))
- *retcmpp = NFSERR_ATTRNOTSUPP;
+ NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED);
+ if (compare) {
+ if (!(*retcmpp) && ((*tl == newnfs_true &&
+ (nap->na_flags & UF_SYSTEM) == 0) ||
+ (*tl == newnfs_false &&
+ (nap->na_flags & UF_SYSTEM) != 0)))
+ *retcmpp = NFSERR_NOTSAME;
+ } else if (nap != NULL) {
+ if (*tl == newnfs_true)
+ nap->na_flags |= UF_SYSTEM;
+ }
attrsum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_TIMEACCESS:
@@ -2617,7 +2647,8 @@ nfsv4_fillattr(struct nfsrv_descript *nd, struct mount *mp, vnode_t vp,
NFSACL_T *saclp, struct vattr *vap, fhandle_t *fhp, int rderror,
nfsattrbit_t *attrbitp, struct ucred *cred, NFSPROC_T *p, int isdgram,
int reterr, int supports_nfsv4acls, int at_root, uint64_t mounted_on_fileno,
- struct statfs *pnfssf)
+ struct statfs *pnfssf, bool xattrsupp, bool has_hiddensystem,
+ bool has_namedattr)
{
int bitpos, retnum = 0;
u_int32_t *tl;
@@ -2631,10 +2662,7 @@ nfsv4_fillattr(struct nfsrv_descript *nd, struct mount *mp, vnode_t vp,
struct nfsfsinfo fsinf;
struct timespec temptime;
NFSACL_T *aclp, *naclp = NULL;
- size_t atsiz;
- bool xattrsupp;
short irflag;
- long has_named_attr;
#ifdef QUOTA
struct dqblk dqb;
uid_t savuid;
@@ -2718,18 +2746,6 @@ nfsv4_fillattr(struct nfsrv_descript *nd, struct mount *mp, vnode_t vp,
}
}
- /* Check to see if Extended Attributes are supported. */
- xattrsupp = false;
- if (NFSISSET_ATTRBIT(retbitp, NFSATTRBIT_XATTRSUPPORT)) {
- if (NFSVOPLOCK(vp, LK_SHARED) == 0) {
- error = VOP_GETEXTATTR(vp, EXTATTR_NAMESPACE_USER,
- "xxx", NULL, &atsiz, cred, p);
- NFSVOPUNLOCK(vp);
- if (error != EOPNOTSUPP)
- xattrsupp = true;
- }
- }
-
/*
* Put out the attribute bitmap for the ones being filled in
* and get the field for the number of attributes returned.
@@ -2751,6 +2767,10 @@ nfsv4_fillattr(struct nfsrv_descript *nd, struct mount *mp, vnode_t vp,
NFSCLRBIT_ATTRBIT(&attrbits,NFSATTRBIT_ACLSUPPORT);
NFSCLRBIT_ATTRBIT(&attrbits,NFSATTRBIT_ACL);
}
+ if (!has_hiddensystem) {
+ NFSCLRBIT_ATTRBIT(&attrbits, NFSATTRBIT_HIDDEN);
+ NFSCLRBIT_ATTRBIT(&attrbits, NFSATTRBIT_SYSTEM);
+ }
retnum += nfsrv_putattrbit(nd, &attrbits);
break;
case NFSATTRBIT_TYPE:
@@ -2791,10 +2811,7 @@ nfsv4_fillattr(struct nfsrv_descript *nd, struct mount *mp, vnode_t vp,
break;
case NFSATTRBIT_NAMEDATTR:
NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
- if (VOP_PATHCONF(vp, _PC_HAS_NAMEDATTR, &has_named_attr)
- != 0)
- has_named_attr = 0;
- if (has_named_attr != 0)
+ if (has_namedattr)
*tl = newnfs_true;
else
*tl = newnfs_false;
@@ -2899,6 +2916,14 @@ nfsv4_fillattr(struct nfsrv_descript *nd, struct mount *mp, vnode_t vp,
*tl = 0;
retnum += 2 * NFSX_UNSIGNED;
break;
+ case NFSATTRBIT_HIDDEN:
+ NFSM_BUILD(tl, uint32_t *, NFSX_UNSIGNED);
+ if ((vap->va_flags & UF_HIDDEN) != 0)
+ *tl = newnfs_true;
+ else
+ *tl = newnfs_false;
+ retnum += NFSX_UNSIGNED;
+ break;
case NFSATTRBIT_HOMOGENEOUS:
NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
if (fsinf.fs_properties & NFSV3FSINFO_HOMOGENEOUS)
@@ -3088,6 +3113,14 @@ nfsv4_fillattr(struct nfsrv_descript *nd, struct mount *mp, vnode_t vp,
txdr_hyper(vap->va_bytes, tl);
retnum += NFSX_HYPER;
break;
+ case NFSATTRBIT_SYSTEM:
+ NFSM_BUILD(tl, uint32_t *, NFSX_UNSIGNED);
+ if ((vap->va_flags & UF_SYSTEM) != 0)
+ *tl = newnfs_true;
+ else
+ *tl = newnfs_false;
+ retnum += NFSX_UNSIGNED;
+ break;
case NFSATTRBIT_TIMEACCESS:
NFSM_BUILD(tl, u_int32_t *, NFSX_V4TIME);
txdr_nfsv4time(&vap->va_atime, tl);
diff --git a/sys/fs/nfs/nfs_var.h b/sys/fs/nfs/nfs_var.h
index 3b6c1ec90c06..54f60a753c50 100644
--- a/sys/fs/nfs/nfs_var.h
+++ b/sys/fs/nfs/nfs_var.h
@@ -395,8 +395,9 @@ int nfsrv_putopbit(struct nfsrv_descript *, nfsopbit_t *);
void nfsrv_wcc(struct nfsrv_descript *, int, struct nfsvattr *, int,
struct nfsvattr *);
int nfsv4_fillattr(struct nfsrv_descript *, struct mount *, vnode_t, NFSACL_T *,
- struct vattr *, fhandle_t *, int, nfsattrbit_t *,
- struct ucred *, NFSPROC_T *, int, int, int, int, uint64_t, struct statfs *);
+ struct vattr *, fhandle_t *, int, nfsattrbit_t *, struct ucred *,
+ NFSPROC_T *, int, int, int, int, uint64_t, struct statfs *, bool, bool,
+ bool);
void nfsrv_fillattr(struct nfsrv_descript *, struct nfsvattr *);
struct mbuf *nfsrv_adj(struct mbuf *, int, int);
void nfsrv_postopattr(struct nfsrv_descript *, int, struct nfsvattr *);
@@ -735,7 +736,8 @@ int nfsvno_updfilerev(vnode_t, struct nfsvattr *, struct nfsrv_descript *,
NFSPROC_T *);
int nfsvno_fillattr(struct nfsrv_descript *, struct mount *, vnode_t,
struct nfsvattr *, fhandle_t *, int, nfsattrbit_t *,
- struct ucred *, NFSPROC_T *, int, int, int, int, uint64_t);
+ struct ucred *, NFSPROC_T *, int, int, int, int, uint64_t, bool, bool,
+ bool);
int nfsrv_sattr(struct nfsrv_descript *, vnode_t, struct nfsvattr *, nfsattrbit_t *,
NFSACL_T *, NFSPROC_T *);
int nfsv4_sattr(struct nfsrv_descript *, vnode_t, struct nfsvattr *, nfsattrbit_t *,
diff --git a/sys/fs/nfs/nfsproto.h b/sys/fs/nfs/nfsproto.h
index eff53e1a384e..cb5a80e8df73 100644
--- a/sys/fs/nfs/nfsproto.h
+++ b/sys/fs/nfs/nfsproto.h
@@ -1142,6 +1142,7 @@ struct nfsv3_sattr {
NFSATTRBM_FILESFREE | \
NFSATTRBM_FILESTOTAL | \
NFSATTRBM_FSLOCATIONS | \
+ NFSATTRBM_HIDDEN | \
NFSATTRBM_HOMOGENEOUS | \
NFSATTRBM_MAXFILESIZE | \
NFSATTRBM_MAXLINK | \
@@ -1163,6 +1164,7 @@ struct nfsv3_sattr {
NFSATTRBM_SPACEFREE | \
NFSATTRBM_SPACETOTAL | \
NFSATTRBM_SPACEUSED | \
+ NFSATTRBM_SYSTEM | \
NFSATTRBM_TIMEACCESS | \
NFSATTRBM_TIMECREATE | \
NFSATTRBM_TIMEDELTA | \
@@ -1210,11 +1212,13 @@ struct nfsv3_sattr {
*/
#define NFSATTRBIT_SETABLE0 \
(NFSATTRBM_SIZE | \
+ NFSATTRBM_HIDDEN | \
NFSATTRBM_ACL)
#define NFSATTRBIT_SETABLE1 \
(NFSATTRBM_MODE | \
NFSATTRBM_OWNER | \
NFSATTRBM_OWNERGROUP | \
+ NFSATTRBM_SYSTEM | \
NFSATTRBM_TIMECREATE | \
NFSATTRBM_TIMEACCESSSET | \
NFSATTRBM_TIMEMODIFYSET)
@@ -1254,6 +1258,7 @@ struct nfsv3_sattr {
NFSATTRBM_SIZE | \
NFSATTRBM_FSID | \
NFSATTRBM_FILEID | \
+ NFSATTRBM_HIDDEN | \
NFSATTRBM_MAXREAD)
/*
@@ -1266,6 +1271,7 @@ struct nfsv3_sattr {
NFSATTRBM_OWNERGROUP | \
NFSATTRBM_RAWDEV | \
NFSATTRBM_SPACEUSED | \
+ NFSATTRBM_SYSTEM | \
NFSATTRBM_TIMEACCESS | \
NFSATTRBM_TIMECREATE | \
NFSATTRBM_TIMEMETADATA | \
@@ -1288,6 +1294,7 @@ struct nfsv3_sattr {
NFSATTRBM_SIZE | \
NFSATTRBM_FSID | \
NFSATTRBM_FILEID | \
+ NFSATTRBM_HIDDEN | \
NFSATTRBM_MAXREAD)
/*
@@ -1298,6 +1305,7 @@ struct nfsv3_sattr {
NFSATTRBM_NUMLINKS | \
NFSATTRBM_RAWDEV | \
NFSATTRBM_SPACEUSED | \
+ NFSATTRBM_SYSTEM | \
NFSATTRBM_TIMEACCESS | \
NFSATTRBM_TIMECREATE | \
NFSATTRBM_TIMEMETADATA | \
diff --git a/sys/fs/nfsclient/nfs_clrpcops.c b/sys/fs/nfsclient/nfs_clrpcops.c
index c07da6f9275f..2f3c59b68518 100644
--- a/sys/fs/nfsclient/nfs_clrpcops.c
+++ b/sys/fs/nfsclient/nfs_clrpcops.c
@@ -4158,6 +4158,13 @@ nfsrpc_readdirplus(vnode_t vp, struct uio *uiop, nfsuint64 *cookiep,
if (!NFSISSET_ATTRBIT(&dnp->n_vattr.na_suppattr,
NFSATTRBIT_TIMECREATE))
NFSCLRBIT_ATTRBIT(&attrbits, NFSATTRBIT_TIMECREATE);
+ if (!NFSISSET_ATTRBIT(&dnp->n_vattr.na_suppattr,
+ NFSATTRBIT_HIDDEN) ||
+ !NFSISSET_ATTRBIT(&dnp->n_vattr.na_suppattr,
+ NFSATTRBIT_SYSTEM)) {
+ NFSCLRBIT_ATTRBIT(&attrbits, NFSATTRBIT_HIDDEN);
+ NFSCLRBIT_ATTRBIT(&attrbits, NFSATTRBIT_SYSTEM);
+ }
}
/*
@@ -5429,7 +5436,8 @@ nfsrpc_setaclrpc(vnode_t vp, struct ucred *cred, NFSPROC_T *p,
NFSZERO_ATTRBIT(&attrbits);
NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_ACL);
(void) nfsv4_fillattr(nd, vp->v_mount, vp, aclp, NULL, NULL, 0,
- &attrbits, NULL, NULL, 0, 0, 0, 0, (uint64_t)0, NULL);
+ &attrbits, NULL, NULL, 0, 0, 0, 0, (uint64_t)0, NULL, false, false,
+ false);
error = nfscl_request(nd, vp, p, cred);
if (error)
return (error);
diff --git a/sys/fs/nfsclient/nfs_clstate.c b/sys/fs/nfsclient/nfs_clstate.c
index 1ae5ed1a75ca..99a781640c53 100644
--- a/sys/fs/nfsclient/nfs_clstate.c
+++ b/sys/fs/nfsclient/nfs_clstate.c
@@ -3701,7 +3701,7 @@ nfscl_docb(struct nfsrv_descript *nd, NFSPROC_T *p)
if (!error)
(void) nfsv4_fillattr(nd, NULL, NULL, NULL, &va,
NULL, 0, &rattrbits, NULL, p, 0, 0, 0, 0,
- (uint64_t)0, NULL);
+ (uint64_t)0, NULL, false, false, false);
break;
case NFSV4OP_CBRECALL:
NFSCL_DEBUG(4, "cbrecall\n");
diff --git a/sys/fs/nfsclient/nfs_clvnops.c b/sys/fs/nfsclient/nfs_clvnops.c
index 0049d7edca33..fa451887e73e 100644
--- a/sys/fs/nfsclient/nfs_clvnops.c
+++ b/sys/fs/nfsclient/nfs_clvnops.c
@@ -1074,21 +1074,29 @@ nfs_setattr(struct vop_setattr_args *ap)
int error = 0;
u_quad_t tsize;
struct timespec ts;
+ struct nfsmount *nmp;
#ifndef nolint
tsize = (u_quad_t)0;
#endif
/*
- * Setting of flags and marking of atimes are not supported.
+ * Only the UF_HIDDEN and UF_SYSTEM flags can be set, and only
+ * for NFSv4 servers that support them.
*/
- if (vap->va_flags != VNOVAL)
+ nmp = VFSTONFS(vp->v_mount);
+ if (vap->va_flags != VNOVAL && (!NFSHASNFSV4(nmp) ||
+ (vap->va_flags & ~(UF_HIDDEN | UF_SYSTEM)) != 0 ||
+ ((vap->va_flags & UF_HIDDEN) != 0 &&
+ !NFSISSET_ATTRBIT(&np->n_vattr.na_suppattr, NFSATTRBIT_HIDDEN)) ||
+ ((vap->va_flags & UF_SYSTEM) != 0 &&
+ !NFSISSET_ATTRBIT(&np->n_vattr.na_suppattr, NFSATTRBIT_SYSTEM))))
return (EOPNOTSUPP);
/*
* Disallow write attempts if the filesystem is mounted read-only.
*/
- if ((vap->va_flags != VNOVAL || vap->va_uid != (uid_t)VNOVAL ||
+ if ((vap->va_flags != (u_long)VNOVAL || vap->va_uid != (uid_t)VNOVAL ||
vap->va_gid != (gid_t)VNOVAL || vap->va_atime.tv_sec != VNOVAL ||
vap->va_mtime.tv_sec != VNOVAL ||
vap->va_birthtime.tv_sec != VNOVAL ||
@@ -4754,6 +4762,15 @@ nfs_pathconf(struct vop_pathconf_args *ap)
else
*ap->a_retval = 0;
break;
+ case _PC_HAS_HIDDENSYSTEM:
+ if (NFS_ISV4(vp) && NFSISSET_ATTRBIT(&np->n_vattr.na_suppattr,
+ NFSATTRBIT_HIDDEN) &&
+ NFSISSET_ATTRBIT(&np->n_vattr.na_suppattr,
+ NFSATTRBIT_SYSTEM))
+ *ap->a_retval = 1;
+ else
+ *ap->a_retval = 0;
+ break;
default:
error = vop_stdpathconf(ap);
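
With the client changes above, chflags(2) on an NFSv4 mount no longer fails outright: UF_HIDDEN and UF_SYSTEM are forwarded as the NFSv4 hidden/system attributes whenever the server advertises them in its supported-attribute bitmap, while any other flag still returns EOPNOTSUPP. Typical use from userspace:

	#include <sys/stat.h>
	#include <unistd.h>

	/* Mark a file DOS-hidden, preserving its other flags; a sketch. */
	static int
	hide_file(const char *path)
	{
		struct stat sb;

		if (stat(path, &sb) == -1)
			return (-1);
		return (chflags(path, sb.st_flags | UF_HIDDEN));
	}
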
diff --git a/sys/fs/nfsserver/nfs_nfsdport.c b/sys/fs/nfsserver/nfs_nfsdport.c
index 3bf54d82b959..4f0d5946d6b9 100644
--- a/sys/fs/nfsserver/nfs_nfsdport.c
+++ b/sys/fs/nfsserver/nfs_nfsdport.c
@@ -449,6 +449,7 @@ nfsvno_getattr(struct vnode *vp, struct nfsvattr *nvap,
}
nvap->na_bsdflags = 0;
+ nvap->na_flags = 0;
error = VOP_GETATTR(vp, &nvap->na_vattr, nd->nd_cred);
if (lockedit != 0)
NFSVOPUNLOCK(vp);
@@ -1651,10 +1652,11 @@ nfsvno_rename(struct nameidata *fromndp, struct nameidata *tondp,
}
if (fvp == tvp) {
/*
- * If source and destination are the same, there is nothing to
- * do. Set error to -1 to indicate this.
+ * If source and destination are the same, there is
+ * nothing to do. Set error to EJUSTRETURN to indicate
+ * this.
*/
- error = -1;
+ error = EJUSTRETURN;
goto out;
}
if (nd->nd_flag & ND_NFSV4) {
@@ -1696,10 +1698,26 @@ nfsvno_rename(struct nameidata *fromndp, struct nameidata *tondp,
" dsdvp=%p\n", dsdvp[0]);
}
out:
- if (!error) {
+ mp = NULL;
+ if (error == 0) {
+ error = VOP_GETWRITEMOUNT(tondp->ni_dvp, &mp);
+ if (error == 0) {
+ if (mp == NULL) {
+ error = ENOENT;
+ } else {
+ error = lockmgr(&mp->mnt_renamelock,
+ LK_EXCLUSIVE | LK_NOWAIT, NULL);
+ if (error != 0)
+ error = ERELOOKUP;
+ }
+ }
+ }
+ if (error == 0) {
error = VOP_RENAME(fromndp->ni_dvp, fromndp->ni_vp,
&fromndp->ni_cnd, tondp->ni_dvp, tondp->ni_vp,
&tondp->ni_cnd);
+ lockmgr(&mp->mnt_renamelock, LK_RELEASE, 0);
+ vfs_rel(mp);
} else {
if (tdvp == tvp)
vrele(tdvp);
@@ -1709,8 +1727,13 @@ out:
vput(tvp);
vrele(fromndp->ni_dvp);
vrele(fvp);
- if (error == -1)
+ if (error == EJUSTRETURN) {
error = 0;
+ } else if (error == ERELOOKUP && mp != NULL) {
+ lockmgr(&mp->mnt_renamelock, LK_EXCLUSIVE, 0);
+ lockmgr(&mp->mnt_renamelock, LK_RELEASE, 0);
+ vfs_rel(mp);
+ }
}
/*
@@ -2089,7 +2112,8 @@ int
nfsvno_fillattr(struct nfsrv_descript *nd, struct mount *mp, struct vnode *vp,
struct nfsvattr *nvap, fhandle_t *fhp, int rderror, nfsattrbit_t *attrbitp,
struct ucred *cred, struct thread *p, int isdgram, int reterr,
- int supports_nfsv4acls, int at_root, uint64_t mounted_on_fileno)
+ int supports_nfsv4acls, int at_root, uint64_t mounted_on_fileno,
+ bool xattrsupp, bool has_hiddensystem, bool has_namedattr)
{
struct statfs *sf;
int error;
@@ -2108,7 +2132,7 @@ nfsvno_fillattr(struct nfsrv_descript *nd, struct mount *mp, struct vnode *vp,
}
error = nfsv4_fillattr(nd, mp, vp, NULL, &nvap->na_vattr, fhp, rderror,
attrbitp, cred, p, isdgram, reterr, supports_nfsv4acls, at_root,
- mounted_on_fileno, sf);
+ mounted_on_fileno, sf, xattrsupp, has_hiddensystem, has_namedattr);
free(sf, M_TEMP);
NFSEXITCODE2(0, nd);
return (error);
@@ -2425,7 +2449,7 @@ nfsrvd_readdirplus(struct nfsrv_descript *nd, int isdgram,
struct nfsvattr nva, at, *nvap = &nva;
struct mbuf *mb0, *mb1;
struct nfsreferral *refp;
- int nlen, r, error = 0, getret = 1, usevget = 1;
+ int nlen, r, error = 0, getret = 1, ret, usevget = 1;
int siz, cnt, fullsiz, eofflag, ncookies, entrycnt;
caddr_t bpos0, bpos1;
u_int64_t off, toff, verf __unused;
@@ -2439,6 +2463,9 @@ nfsrvd_readdirplus(struct nfsrv_descript *nd, int isdgram,
uint64_t mounted_on_fileno;
struct thread *p = curthread;
int bextpg0, bextpg1, bextpgsiz0, bextpgsiz1;
+ size_t atsiz;
+ long pathval;
+ bool has_hiddensystem, has_namedattr, xattrsupp;
if (nd->nd_repstat) {
nfsrv_postopattr(nd, getret, &at);
@@ -2913,9 +2940,32 @@ again:
*tl++ = newnfs_true;
txdr_hyper(*cookiep, tl);
dirlen += nfsm_strtom(nd, dp->d_name, nlen);
+ xattrsupp = false;
+ has_hiddensystem = false;
+ has_namedattr = false;
if (nvp != NULL) {
supports_nfsv4acls =
nfs_supportsnfsv4acls(nvp);
+ if (NFSISSET_ATTRBIT(&attrbits,
+ NFSATTRBIT_XATTRSUPPORT)) {
+ ret = VOP_GETEXTATTR(nvp,
+ EXTATTR_NAMESPACE_USER,
+ "xxx", NULL, &atsiz,
+ nd->nd_cred, p);
+ xattrsupp = ret != EOPNOTSUPP;
+ }
+ if (VOP_PATHCONF(nvp,
+ _PC_HAS_HIDDENSYSTEM, &pathval) !=
+ 0)
+ pathval = 0;
+ has_hiddensystem = pathval > 0;
+ pathval = 0;
+ if (NFSISSET_ATTRBIT(&attrbits,
+ NFSATTRBIT_NAMEDATTR) &&
+ VOP_PATHCONF(nvp, _PC_HAS_NAMEDATTR,
+ &pathval) != 0)
+ pathval = 0;
+ has_namedattr = pathval > 0;
NFSVOPUNLOCK(nvp);
} else
supports_nfsv4acls = 0;
@@ -2935,13 +2985,15 @@ again:
nvp, nvap, &nfh, r, &rderrbits,
nd->nd_cred, p, isdgram, 0,
supports_nfsv4acls, at_root,
- mounted_on_fileno);
+ mounted_on_fileno, xattrsupp,
+ has_hiddensystem, has_namedattr);
} else {
dirlen += nfsvno_fillattr(nd, new_mp,
nvp, nvap, &nfh, r, &attrbits,
nd->nd_cred, p, isdgram, 0,
supports_nfsv4acls, at_root,
- mounted_on_fileno);
+ mounted_on_fileno, xattrsupp,
+ has_hiddensystem, has_namedattr);
}
if (nvp != NULL)
vrele(nvp);
@@ -3127,6 +3179,9 @@ nfsv4_sattr(struct nfsrv_descript *nd, vnode_t vp, struct nfsvattr *nvap,
bitpos = NFSATTRBIT_MAX;
} else {
bitpos = 0;
+ if (NFSISSET_ATTRBIT(attrbitp, NFSATTRBIT_HIDDEN) ||
+ NFSISSET_ATTRBIT(attrbitp, NFSATTRBIT_SYSTEM))
+ nvap->na_flags = 0;
}
moderet = 0;
for (; bitpos < NFSATTRBIT_MAX; bitpos++) {
@@ -3163,9 +3218,11 @@ nfsv4_sattr(struct nfsrv_descript *nd, vnode_t vp, struct nfsvattr *nvap,
attrsum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_HIDDEN:
- NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
- if (!nd->nd_repstat)
- nd->nd_repstat = NFSERR_ATTRNOTSUPP;
+ NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED);
+ if (nd->nd_repstat == 0) {
+ if (*tl == newnfs_true)
+ nvap->na_flags |= UF_HIDDEN;
+ }
attrsum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_MIMETYPE:
@@ -3240,9 +3297,11 @@ nfsv4_sattr(struct nfsrv_descript *nd, vnode_t vp, struct nfsvattr *nvap,
attrsum += (NFSX_UNSIGNED + NFSM_RNDUP(j));
break;
case NFSATTRBIT_SYSTEM:
- NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
- if (!nd->nd_repstat)
- nd->nd_repstat = NFSERR_ATTRNOTSUPP;
+ NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED);
+ if (nd->nd_repstat == 0) {
+ if (*tl == newnfs_true)
+ nvap->na_flags |= UF_SYSTEM;
+ }
attrsum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_TIMEACCESSSET:
@@ -6326,7 +6385,7 @@ nfsrv_setacldsdorpc(fhandle_t *fhp, struct ucred *cred, NFSPROC_T *p,
* the same type (VREG).
*/
nfsv4_fillattr(nd, NULL, vp, aclp, NULL, NULL, 0, &attrbits, NULL,
- NULL, 0, 0, 0, 0, 0, NULL);
+ NULL, 0, 0, 0, 0, 0, NULL, false, false, false);
error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p, cred,
NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL);
if (error != 0) {
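
The nfsvno_rename() rework wraps VOP_RENAME in the per-mount mnt_renamelock, but with LK_NOWAIT: contention becomes ERELOOKUP, so the NFS request is redriven instead of sleeping while the rename vnodes are held, and the failure path then takes and drops the lock once to wait out the current holder before retrying. Distilled, with hypothetical helper names rather than the verbatim diff:

	static int
	rename_try(struct mount *mp, void *rename_args)
	{
		int error;

		error = lockmgr(&mp->mnt_renamelock,
		    LK_EXCLUSIVE | LK_NOWAIT, NULL);
		if (error != 0) {
			/*
			 * Another rename is in flight on this mount.  Back
			 * out and let the caller redrive the operation
			 * rather than sleeping with locked vnodes.
			 */
			return (ERELOOKUP);
		}
		error = do_rename(rename_args);	/* calls VOP_RENAME */
		lockmgr(&mp->mnt_renamelock, LK_RELEASE, NULL);
		return (error);
	}
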
diff --git a/sys/fs/nfsserver/nfs_nfsdserv.c b/sys/fs/nfsserver/nfs_nfsdserv.c
index 4e15d55eb312..9eebcda548c6 100644
--- a/sys/fs/nfsserver/nfs_nfsdserv.c
+++ b/sys/fs/nfsserver/nfs_nfsdserv.c
@@ -241,7 +241,7 @@ nfsrvd_getattr(struct nfsrv_descript *nd, int isdgram,
{
struct nfsvattr nva;
fhandle_t fh;
- int at_root = 0, error = 0, supports_nfsv4acls;
+ int at_root = 0, error = 0, ret, supports_nfsv4acls;
struct nfsreferral *refp;
nfsattrbit_t attrbits, tmpbits;
struct mount *mp;
@@ -250,6 +250,9 @@ nfsrvd_getattr(struct nfsrv_descript *nd, int isdgram,
uint64_t mounted_on_fileno = 0;
accmode_t accmode;
struct thread *p = curthread;
+ size_t atsiz;
+ long pathval;
+ bool has_hiddensystem, has_namedattr, xattrsupp;
if (nd->nd_repstat)
goto out;
@@ -307,6 +310,26 @@ nfsrvd_getattr(struct nfsrv_descript *nd, int isdgram,
&nva, &attrbits, p);
if (nd->nd_repstat == 0) {
supports_nfsv4acls = nfs_supportsnfsv4acls(vp);
+ xattrsupp = false;
+ if (NFSISSET_ATTRBIT(&attrbits,
+ NFSATTRBIT_XATTRSUPPORT)) {
+ ret = VOP_GETEXTATTR(vp,
+ EXTATTR_NAMESPACE_USER,
+ "xxx", NULL, &atsiz, nd->nd_cred,
+ p);
+ xattrsupp = ret != EOPNOTSUPP;
+ }
+ if (VOP_PATHCONF(vp, _PC_HAS_HIDDENSYSTEM,
+ &pathval) != 0)
+ pathval = 0;
+ has_hiddensystem = pathval > 0;
+ pathval = 0;
+ if (NFSISSET_ATTRBIT(&attrbits,
+ NFSATTRBIT_NAMEDATTR) &&
+ VOP_PATHCONF(vp, _PC_HAS_NAMEDATTR,
+ &pathval) != 0)
+ pathval = 0;
+ has_namedattr = pathval > 0;
mp = vp->v_mount;
if (nfsrv_enable_crossmntpt != 0 &&
vp->v_type == VDIR &&
@@ -340,7 +363,9 @@ nfsrvd_getattr(struct nfsrv_descript *nd, int isdgram,
(void)nfsvno_fillattr(nd, mp, vp, &nva,
&fh, 0, &attrbits, nd->nd_cred, p,
isdgram, 1, supports_nfsv4acls,
- at_root, mounted_on_fileno);
+ at_root, mounted_on_fileno,
+ xattrsupp, has_hiddensystem,
+ has_namedattr);
vfs_unbusy(mp);
}
vrele(vp);
@@ -403,8 +428,10 @@ nfsrvd_setattr(struct nfsrv_descript *nd, __unused int isdgram,
if (error)
goto nfsmout;
- /* For NFSv4, only va_uid is used from nva2. */
+ /* For NFSv4, only va_uid and va_flags are used from nva2. */
NFSSETBIT_ATTRBIT(&retbits, NFSATTRBIT_OWNER);
+ NFSSETBIT_ATTRBIT(&retbits, NFSATTRBIT_HIDDEN);
+ NFSSETBIT_ATTRBIT(&retbits, NFSATTRBIT_SYSTEM);
preat_ret = nfsvno_getattr(vp, &nva2, nd, p, 1, &retbits);
if (!nd->nd_repstat)
nd->nd_repstat = preat_ret;
@@ -463,6 +490,9 @@ nfsrvd_setattr(struct nfsrv_descript *nd, __unused int isdgram,
&nva, &attrbits, exp, p);
if (!nd->nd_repstat && (nd->nd_flag & ND_NFSV4)) {
+ u_long oldflags;
+
+ oldflags = nva2.na_flags;
/*
* For V4, try setting the attributes in sets, so that the
* reply bitmap will be correct for an error case.
@@ -532,6 +562,32 @@ nfsrvd_setattr(struct nfsrv_descript *nd, __unused int isdgram,
NFSSETBIT_ATTRBIT(&retbits, NFSATTRBIT_MODESETMASKED);
}
}
+ if (!nd->nd_repstat &&
+ (NFSISSET_ATTRBIT(&attrbits, NFSATTRBIT_HIDDEN) ||
+ NFSISSET_ATTRBIT(&attrbits, NFSATTRBIT_SYSTEM))) {
+ if (NFSISSET_ATTRBIT(&attrbits, NFSATTRBIT_HIDDEN)) {
+ if ((nva.na_flags & UF_HIDDEN) != 0)
+ oldflags |= UF_HIDDEN;
+ else
+ oldflags &= ~UF_HIDDEN;
+ }
+ if (NFSISSET_ATTRBIT(&attrbits, NFSATTRBIT_SYSTEM)) {
+ if ((nva.na_flags & UF_SYSTEM) != 0)
+ oldflags |= UF_SYSTEM;
+ else
+ oldflags &= ~UF_SYSTEM;
+ }
+ NFSVNO_ATTRINIT(&nva2);
+ NFSVNO_SETATTRVAL(&nva2, flags, oldflags);
+ nd->nd_repstat = nfsvno_setattr(vp, &nva2, nd->nd_cred, p,
+ exp);
+ if (!nd->nd_repstat) {
+ if (NFSISSET_ATTRBIT(&attrbits, NFSATTRBIT_HIDDEN))
+ NFSSETBIT_ATTRBIT(&retbits, NFSATTRBIT_HIDDEN);
+ if (NFSISSET_ATTRBIT(&attrbits, NFSATTRBIT_SYSTEM))
+ NFSSETBIT_ATTRBIT(&retbits, NFSATTRBIT_SYSTEM);
+ }
+ }
#ifdef NFS4_ACL_EXTATTR_NAME
if (!nd->nd_repstat && aclp->acl_cnt > 0 &&
@@ -4322,9 +4378,10 @@ nfsrvd_openattr(struct nfsrv_descript *nd, __unused int isdgram,
int error = 0;
NFSNAMEICNDSET(&cn, nd->nd_cred, LOOKUP, OPENNAMED | ISLASTCN |
- NOFOLLOW);
+ NOFOLLOW | LOCKLEAF);
cn.cn_nameptr = ".";
cn.cn_namelen = 1;
+ cn.cn_lkflags = LK_SHARED;
NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED);
if (*tl == newnfs_true)
cn.cn_flags |= CREATENAMED;
@@ -4343,6 +4400,8 @@ nfsrvd_openattr(struct nfsrv_descript *nd, __unused int isdgram,
if (nd->nd_repstat == ENOATTR)
nd->nd_repstat = NFSERR_NOENT;
}
+ if (nd->nd_repstat == 0)
+ NFSVOPUNLOCK(*vpp);
vput(dp);
NFSEXITCODE2(0, nd);
diff --git a/sys/fs/nullfs/null_subr.c b/sys/fs/nullfs/null_subr.c
index 0356877eaf05..7dcc83880bb9 100644
--- a/sys/fs/nullfs/null_subr.c
+++ b/sys/fs/nullfs/null_subr.c
@@ -245,6 +245,10 @@ null_nodeget(struct mount *mp, struct vnode *lowervp, struct vnode **vpp)
vp->v_object = lowervp->v_object;
vn_irflag_set(vp, VIRF_PGREAD);
}
+ if ((vn_irflag_read(lowervp) & VIRF_INOTIFY) != 0)
+ vn_irflag_set(vp, VIRF_INOTIFY);
+ if ((vn_irflag_read(lowervp) & VIRF_INOTIFY_PARENT) != 0)
+ vn_irflag_set(vp, VIRF_INOTIFY_PARENT);
if (lowervp == MOUNTTONULLMOUNT(mp)->nullm_lowerrootvp)
vp->v_vflag |= VV_ROOT;
diff --git a/sys/fs/nullfs/null_vnops.c b/sys/fs/nullfs/null_vnops.c
index 8608216e10e5..74c1a8f3acb6 100644
--- a/sys/fs/nullfs/null_vnops.c
+++ b/sys/fs/nullfs/null_vnops.c
@@ -190,6 +190,26 @@ SYSCTL_INT(_debug, OID_AUTO, nullfs_bug_bypass, CTLFLAG_RW,
&null_bug_bypass, 0, "");
/*
+ * Synchronize inotify flags with the lower vnode:
+ * - If the upper vnode has the flag set and the lower does not, then the lower
+ * vnode is unwatched and the upper vnode does not need to go through
+ * VOP_INOTIFY.
+ * - If the lower vnode is watched, then the upper vnode should go through
+ * VOP_INOTIFY, so copy the flag up.
+ */
+static void
+null_copy_inotify(struct vnode *vp, struct vnode *lvp, short flag)
+{
+ if ((vn_irflag_read(vp) & flag) != 0) {
+ if (__predict_false((vn_irflag_read(lvp) & flag) == 0))
+ vn_irflag_unset(vp, flag);
+ } else if ((vn_irflag_read(lvp) & flag) != 0) {
+ if (__predict_false((vn_irflag_read(vp) & flag) == 0))
+ vn_irflag_set(vp, flag);
+ }
+}
+
+/*
* This is the 10-Apr-92 bypass routine.
* This version has been optimized for speed, throwing away some
* safety checks. It should still always work, but it's not as
@@ -305,7 +325,10 @@ null_bypass(struct vop_generic_args *ap)
lvp = *(vps_p[i]);
/*
- * Get rid of the transient hold on lvp.
+ * Get rid of the transient hold on lvp. Copy inotify
+ * flags up in case something is watching the lower
+ * layer.
+ *
* If lowervp was unlocked during VOP
* operation, nullfs upper vnode could have
* been reclaimed, which changes its v_vnlock
@@ -314,6 +337,10 @@ null_bypass(struct vop_generic_args *ap)
* upper (reclaimed) vnode.
*/
if (lvp != NULLVP) {
+ null_copy_inotify(old_vps[i], lvp,
+ VIRF_INOTIFY);
+ null_copy_inotify(old_vps[i], lvp,
+ VIRF_INOTIFY_PARENT);
if (VOP_ISLOCKED(lvp) == LK_EXCLUSIVE &&
old_vps[i]->v_vnlock != lvp->v_vnlock) {
VOP_UNLOCK(lvp);
diff --git a/sys/fs/p9fs/p9fs_vnops.c b/sys/fs/p9fs/p9fs_vnops.c
index 56bf766ef801..227e2b93883e 100644
--- a/sys/fs/p9fs/p9fs_vnops.c
+++ b/sys/fs/p9fs/p9fs_vnops.c
@@ -1784,6 +1784,9 @@ p9fs_readdir(struct vop_readdir_args *ap)
return (EBADF);
}
+ if (ap->a_eofflag != NULL)
+ *ap->a_eofflag = 0;
+
io_buffer = uma_zalloc(p9fs_io_buffer_zone, M_WAITOK);
/* We haven't reached the end yet. read more. */
@@ -1801,8 +1804,11 @@ p9fs_readdir(struct vop_readdir_args *ap)
count = p9_client_readdir(vofid, (char *)io_buffer,
diroffset, count);
- if (count == 0)
+ if (count == 0) {
+ if (ap->a_eofflag != NULL)
+ *ap->a_eofflag = 1;
break;
+ }
if (count < 0) {
error = EIO;
diff --git a/sys/fs/smbfs/smbfs_vnops.c b/sys/fs/smbfs/smbfs_vnops.c
index c30995508c00..5d412cabadb8 100644
--- a/sys/fs/smbfs/smbfs_vnops.c
+++ b/sys/fs/smbfs/smbfs_vnops.c
@@ -810,6 +810,9 @@ smbfs_pathconf(struct vop_pathconf_args *ap)
case _PC_NO_TRUNC:
*retval = 1;
break;
+ case _PC_HAS_HIDDENSYSTEM:
+ *retval = 1;
+ break;
default:
error = vop_stdpathconf(ap);
}
diff --git a/sys/fs/tmpfs/tmpfs_vnops.c b/sys/fs/tmpfs/tmpfs_vnops.c
index c99d0732be50..9d2a587b177a 100644
--- a/sys/fs/tmpfs/tmpfs_vnops.c
+++ b/sys/fs/tmpfs/tmpfs_vnops.c
@@ -1691,6 +1691,10 @@ tmpfs_pathconf(struct vop_pathconf_args *v)
*retval = PAGE_SIZE;
break;
+ case _PC_HAS_HIDDENSYSTEM:
+ *retval = 1;
+ break;
+
default:
error = vop_stdpathconf(v);
}
diff --git a/sys/fs/udf/ecma167-udf.h b/sys/fs/udf/ecma167-udf.h
index 839bbec08254..19e114763cac 100644
--- a/sys/fs/udf/ecma167-udf.h
+++ b/sys/fs/udf/ecma167-udf.h
@@ -243,7 +243,7 @@ struct part_map_spare {
uint8_t n_st; /* Number of Sparing Tables */
uint8_t reserved1;
uint32_t st_size;
- uint32_t st_loc[1];
+ uint32_t st_loc[];
} __packed;
union udf_pmap {
@@ -266,7 +266,7 @@ struct udf_sparing_table {
uint16_t rt_l; /* Relocation Table len */
uint8_t reserved[2];
uint32_t seq_num;
- struct spare_map_entry entries[1];
+ struct spare_map_entry entries[];
} __packed;
/* Partition Descriptor [3/10.5] */
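
Converting the trailing member[1] arrays to C99 flexible array members makes sizeof() stop counting a phantom element and lets compiler bounds checks see the true extent. Allocation then sizes the header plus n elements explicitly; with an illustrative struct (not the on-disk layout):

	#include <stdint.h>
	#include <stdlib.h>

	struct entry { uint32_t org; uint32_t map; };

	struct table {
		uint32_t seq_num;
		struct entry entries[];	/* flexible array member */
	};

	/* Allocate a table with room for n entries; a sketch. */
	static struct table *
	table_alloc(size_t n)
	{
		return (malloc(sizeof(struct table) +
		    n * sizeof(struct entry)));
	}
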
diff --git a/sys/fs/udf/udf_vfsops.c b/sys/fs/udf/udf_vfsops.c
index c7438147c0a0..c5ef1f686093 100644
--- a/sys/fs/udf/udf_vfsops.c
+++ b/sys/fs/udf/udf_vfsops.c
@@ -81,6 +81,7 @@
#include <sys/fcntl.h>
#include <sys/iconv.h>
#include <sys/kernel.h>
+#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/namei.h>
@@ -729,7 +730,7 @@ udf_fhtovp(struct mount *mp, struct fid *fhp, int flags, struct vnode **vpp)
struct ifid *ifhp;
struct vnode *nvp;
struct udf_node *np;
- off_t fsize;
+ uint64_t fsize;
int error;
ifhp = (struct ifid *)fhp;
@@ -741,6 +742,10 @@ udf_fhtovp(struct mount *mp, struct fid *fhp, int flags, struct vnode **vpp)
np = VTON(nvp);
fsize = le64toh(np->fentry->inf_len);
+ if (fsize > OFF_MAX) {
+ *vpp = NULLVP;
+ return (EIO);
+ }
*vpp = nvp;
vnode_create_vobject(*vpp, fsize, curthread);
diff --git a/sys/fs/udf/udf_vnops.c b/sys/fs/udf/udf_vnops.c
index 88bf4917a851..37889241e8c3 100644
--- a/sys/fs/udf/udf_vnops.c
+++ b/sys/fs/udf/udf_vnops.c
@@ -39,6 +39,7 @@
#include <sys/conf.h>
#include <sys/buf.h>
#include <sys/iconv.h>
+#include <sys/limits.h>
#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/dirent.h>
@@ -182,11 +183,14 @@ udf_access(struct vop_access_args *a)
}
static int
-udf_open(struct vop_open_args *ap) {
+udf_open(struct vop_open_args *ap)
+{
struct udf_node *np = VTON(ap->a_vp);
- off_t fsize;
+ uint64_t fsize;
fsize = le64toh(np->fentry->inf_len);
+ if (fsize > OFF_MAX)
+ return (EIO);
vnode_create_vobject(ap->a_vp, fsize, ap->a_td);
return 0;
}
@@ -314,12 +318,13 @@ udf_getattr(struct vop_getattr_args *a)
* that directories consume at least one logical block,
* make it appear so.
*/
- if (fentry->logblks_rec != 0) {
- vap->va_size =
- le64toh(fentry->logblks_rec) * node->udfmp->bsize;
- } else {
+ vap->va_size = le64toh(fentry->logblks_rec);
+ if (vap->va_size == 0)
vap->va_size = node->udfmp->bsize;
- }
+ else if (vap->va_size > UINT64_MAX / node->udfmp->bsize)
+ vap->va_size = UINT64_MAX;
+ else
+ vap->va_size *= node->udfmp->bsize;
} else {
vap->va_size = le64toh(fentry->inf_len);
}
@@ -446,6 +451,7 @@ udf_read(struct vop_read_args *ap)
struct buf *bp;
uint8_t *data;
daddr_t lbn, rablock;
+ uint64_t len;
off_t diff, fsize;
ssize_t n;
int error = 0;
@@ -471,7 +477,12 @@ udf_read(struct vop_read_args *ap)
return (error);
}
- fsize = le64toh(node->fentry->inf_len);
+ len = le64toh(node->fentry->inf_len);
+ if (len > OFF_MAX) {
+ /* too big, just cap to the requested length */
+ len = uio->uio_resid;
+ }
+ fsize = len;
udfmp = node->udfmp;
do {
lbn = lblkno(udfmp, uio->uio_offset);
@@ -783,6 +794,7 @@ udf_readdir(struct vop_readdir_args *a)
struct udf_uiodir uiodir;
struct udf_dirstream *ds;
uint64_t *cookies = NULL;
+ uint64_t len;
int ncookies;
int error = 0;
@@ -811,8 +823,12 @@ udf_readdir(struct vop_readdir_args *a)
* Iterate through the file id descriptors. Give the parent dir
* entry special attention.
*/
- ds = udf_opendir(node, uio->uio_offset, le64toh(node->fentry->inf_len),
- node->udfmp);
+ len = le64toh(node->fentry->inf_len);
+ if (len > INT_MAX) {
+ /* too big, just cap to INT_MAX */
+ len = INT_MAX;
+ }
+ ds = udf_opendir(node, uio->uio_offset, len, node->udfmp);
while ((fid = udf_getfid(ds)) != NULL) {
/* XXX Should we return an error on a bad fid? */
@@ -904,7 +920,8 @@ udf_readlink(struct vop_readlink_args *ap)
struct udf_node *node;
void *buf;
char *cp;
- int error, len, root;
+ uint64_t len;
+ int error, root;
/*
* A symbolic link in UDF is a list of variable-length path
@@ -914,6 +931,8 @@ udf_readlink(struct vop_readlink_args *ap)
vp = ap->a_vp;
node = VTON(vp);
len = le64toh(node->fentry->inf_len);
+ if (len > MAXPATHLEN)
+ return (EIO);
buf = malloc(len, M_DEVBUF, M_WAITOK);
iov[0].iov_len = len;
iov[0].iov_base = buf;
@@ -1116,13 +1135,14 @@ udf_lookup(struct vop_cachedlookup_args *a)
struct udf_mnt *udfmp;
struct fileid_desc *fid = NULL;
struct udf_dirstream *ds;
+ uint64_t fsize;
u_long nameiop;
u_long flags;
char *nameptr;
long namelen;
ino_t id = 0;
int offset, error = 0;
- int fsize, lkflags, ltype, numdirpasses;
+ int lkflags, ltype, numdirpasses;
dvp = a->a_dvp;
node = VTON(dvp);
@@ -1133,6 +1153,10 @@ udf_lookup(struct vop_cachedlookup_args *a)
nameptr = a->a_cnp->cn_nameptr;
namelen = a->a_cnp->cn_namelen;
fsize = le64toh(node->fentry->inf_len);
+ if (fsize > INT_MAX) {
+ /* too big, just cap to INT_MAX */
+ fsize = INT_MAX;
+ }
/*
* If this is a LOOKUP and we've already partially searched through
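
Every udf hunk above applies the same defense: fentry->inf_len is an untrusted 64-bit on-disk value, so it is kept in a uint64_t and range-checked (against OFF_MAX, INT_MAX, or MAXPATHLEN, depending on the consumer) before any narrowing use. The shape of the check, as a hypothetical helper:

	#include <sys/param.h>

	/*
	 * Validate an untrusted on-disk length against the widest value
	 * the caller can handle; EIO for an impossible size.
	 */
	static int
	checked_len(uint64_t disklen, uint64_t maxlen, uint64_t *out)
	{
		if (disklen > maxlen)
			return (EIO);
		*out = disklen;
		return (0);
	}

Call sites that can make progress with a truncated view (readdir, read, lookup) clamp instead of failing, as the comments in the diff note.
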
diff --git a/sys/i386/conf/GENERIC b/sys/i386/conf/GENERIC
index e7d460af21d4..f577cd07ac7c 100644
--- a/sys/i386/conf/GENERIC
+++ b/sys/i386/conf/GENERIC
@@ -17,6 +17,8 @@
# in NOTES.
#
+#NO_UNIVERSE
+
cpu I486_CPU
cpu I586_CPU
cpu I686_CPU
diff --git a/sys/i386/conf/GENERIC-NODEBUG b/sys/i386/conf/GENERIC-NODEBUG
index ea07613a796f..a93304481b5f 100644
--- a/sys/i386/conf/GENERIC-NODEBUG
+++ b/sys/i386/conf/GENERIC-NODEBUG
@@ -25,6 +25,8 @@
# in NOTES.
#
+#NO_UNIVERSE
+
include GENERIC
include "std.nodebug"
diff --git a/sys/i386/conf/LINT b/sys/i386/conf/LINT
index 41207eb63cb9..2e947202f723 100644
--- a/sys/i386/conf/LINT
+++ b/sys/i386/conf/LINT
@@ -1,3 +1,4 @@
+#NO_UNIVERSE
include "../../conf/NOTES"
include "../../x86/conf/NOTES"
diff --git a/sys/i386/conf/MINIMAL b/sys/i386/conf/MINIMAL
index 2a06eb84bff8..8019617ca4d4 100644
--- a/sys/i386/conf/MINIMAL
+++ b/sys/i386/conf/MINIMAL
@@ -31,6 +31,8 @@
# in NOTES.
#
+#NO_UNIVERSE
+
cpu I486_CPU
cpu I586_CPU
cpu I686_CPU
diff --git a/sys/i386/conf/PAE b/sys/i386/conf/PAE
index a39d32d77106..72af9e9a9eec 100644
--- a/sys/i386/conf/PAE
+++ b/sys/i386/conf/PAE
@@ -2,6 +2,8 @@
# PAE -- Generic kernel configuration file for FreeBSD/i386 PAE
#
+#NO_UNIVERSE
+
include GENERIC
ident PAE-GENERIC
diff --git a/sys/i386/i386/pmap.c b/sys/i386/i386/pmap.c
index 465b4d0f365b..b44f5e08bbcf 100644
--- a/sys/i386/i386/pmap.c
+++ b/sys/i386/i386/pmap.c
@@ -876,14 +876,16 @@ __CONCAT(PMTYPE, init_pat)(void)
#ifdef PMAP_PAE_COMP
static void *
-pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *flags,
- int wait)
+pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *sflagsp,
+ int flags)
{
/* Inform UMA that this allocator uses kernel_map/object. */
- *flags = UMA_SLAB_KERNEL;
+ *sflagsp = UMA_SLAB_KERNEL;
+ /* contig allocations cannot be NEVERFREED */
+ flags &= ~M_NEVERFREED;
return ((void *)kmem_alloc_contig_domainset(DOMAINSET_FIXED(domain),
- bytes, wait, 0x0ULL, 0xffffffffULL, 1, 0, VM_MEMATTR_DEFAULT));
+ bytes, flags, 0x0ULL, 0xffffffffULL, 1, 0, VM_MEMATTR_DEFAULT));
}
#endif
@@ -5617,6 +5619,8 @@ __CONCAT(PMTYPE, unmapdev)(void *p, vm_size_t size)
static void
__CONCAT(PMTYPE, page_set_memattr)(vm_page_t m, vm_memattr_t ma)
{
+ if (m->md.pat_mode == ma)
+ return;
m->md.pat_mode = ma;
if ((m->flags & PG_FICTITIOUS) != 0)
diff --git a/sys/i386/linux/linux_proto.h b/sys/i386/linux/linux_proto.h
index aa2dfbb68745..49f002a633d2 100644
--- a/sys/i386/linux/linux_proto.h
+++ b/sys/i386/linux/linux_proto.h
@@ -981,10 +981,13 @@ struct linux_inotify_init_args {
syscallarg_t dummy;
};
struct linux_inotify_add_watch_args {
- syscallarg_t dummy;
+ char fd_l_[PADL_(l_int)]; l_int fd; char fd_r_[PADR_(l_int)];
+ char pathname_l_[PADL_(const char *)]; const char * pathname; char pathname_r_[PADR_(const char *)];
+ char mask_l_[PADL_(uint32_t)]; uint32_t mask; char mask_r_[PADR_(uint32_t)];
};
struct linux_inotify_rm_watch_args {
- syscallarg_t dummy;
+ char fd_l_[PADL_(l_int)]; l_int fd; char fd_r_[PADR_(l_int)];
+ char wd_l_[PADL_(uint32_t)]; uint32_t wd; char wd_r_[PADR_(uint32_t)];
};
struct linux_migrate_pages_args {
syscallarg_t dummy;
@@ -1178,7 +1181,7 @@ struct linux_pipe2_args {
char flags_l_[PADL_(l_int)]; l_int flags; char flags_r_[PADR_(l_int)];
};
struct linux_inotify_init1_args {
- syscallarg_t dummy;
+ char flags_l_[PADL_(l_int)]; l_int flags; char flags_r_[PADR_(l_int)];
};
struct linux_preadv_args {
char fd_l_[PADL_(l_ulong)]; l_ulong fd; char fd_r_[PADR_(l_ulong)];
diff --git a/sys/i386/linux/linux_sysent.c b/sys/i386/linux/linux_sysent.c
index 7be646f34144..b8893008944b 100644
--- a/sys/i386/linux/linux_sysent.c
+++ b/sys/i386/linux/linux_sysent.c
@@ -306,8 +306,8 @@ struct sysent linux_sysent[] = {
{ .sy_narg = AS(linux_ioprio_set_args), .sy_call = (sy_call_t *)linux_ioprio_set, .sy_auevent = AUE_SETPRIORITY, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 289 = linux_ioprio_set */
{ .sy_narg = AS(linux_ioprio_get_args), .sy_call = (sy_call_t *)linux_ioprio_get, .sy_auevent = AUE_GETPRIORITY, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 290 = linux_ioprio_get */
{ .sy_narg = 0, .sy_call = (sy_call_t *)linux_inotify_init, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 291 = linux_inotify_init */
- { .sy_narg = 0, .sy_call = (sy_call_t *)linux_inotify_add_watch, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 292 = linux_inotify_add_watch */
- { .sy_narg = 0, .sy_call = (sy_call_t *)linux_inotify_rm_watch, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 293 = linux_inotify_rm_watch */
+ { .sy_narg = AS(linux_inotify_add_watch_args), .sy_call = (sy_call_t *)linux_inotify_add_watch, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 292 = linux_inotify_add_watch */
+ { .sy_narg = AS(linux_inotify_rm_watch_args), .sy_call = (sy_call_t *)linux_inotify_rm_watch, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 293 = linux_inotify_rm_watch */
{ .sy_narg = 0, .sy_call = (sy_call_t *)linux_migrate_pages, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 294 = linux_migrate_pages */
{ .sy_narg = AS(linux_openat_args), .sy_call = (sy_call_t *)linux_openat, .sy_auevent = AUE_OPEN_RWTC, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 295 = linux_openat */
{ .sy_narg = AS(linux_mkdirat_args), .sy_call = (sy_call_t *)linux_mkdirat, .sy_auevent = AUE_MKDIRAT, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 296 = linux_mkdirat */
@@ -346,7 +346,7 @@ struct sysent linux_sysent[] = {
{ .sy_narg = AS(linux_epoll_create1_args), .sy_call = (sy_call_t *)linux_epoll_create1, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 329 = linux_epoll_create1 */
{ .sy_narg = AS(linux_dup3_args), .sy_call = (sy_call_t *)linux_dup3, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 330 = linux_dup3 */
{ .sy_narg = AS(linux_pipe2_args), .sy_call = (sy_call_t *)linux_pipe2, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 331 = linux_pipe2 */
- { .sy_narg = 0, .sy_call = (sy_call_t *)linux_inotify_init1, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 332 = linux_inotify_init1 */
+ { .sy_narg = AS(linux_inotify_init1_args), .sy_call = (sy_call_t *)linux_inotify_init1, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 332 = linux_inotify_init1 */
{ .sy_narg = AS(linux_preadv_args), .sy_call = (sy_call_t *)linux_preadv, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 333 = linux_preadv */
{ .sy_narg = AS(linux_pwritev_args), .sy_call = (sy_call_t *)linux_pwritev, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 334 = linux_pwritev */
{ .sy_narg = AS(linux_rt_tgsigqueueinfo_args), .sy_call = (sy_call_t *)linux_rt_tgsigqueueinfo, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 335 = linux_rt_tgsigqueueinfo */
diff --git a/sys/i386/linux/linux_systrace_args.c b/sys/i386/linux/linux_systrace_args.c
index f3e3c32a2bbf..563d1a795ae1 100644
--- a/sys/i386/linux/linux_systrace_args.c
+++ b/sys/i386/linux/linux_systrace_args.c
@@ -2071,12 +2071,19 @@ systrace_args(int sysnum, void *params, uint64_t *uarg, int *n_args)
}
/* linux_inotify_add_watch */
case 292: {
- *n_args = 0;
+ struct linux_inotify_add_watch_args *p = params;
+ iarg[a++] = p->fd; /* l_int */
+ uarg[a++] = (intptr_t)p->pathname; /* const char * */
+ uarg[a++] = p->mask; /* uint32_t */
+ *n_args = 3;
break;
}
/* linux_inotify_rm_watch */
case 293: {
- *n_args = 0;
+ struct linux_inotify_rm_watch_args *p = params;
+ iarg[a++] = p->fd; /* l_int */
+ uarg[a++] = p->wd; /* uint32_t */
+ *n_args = 2;
break;
}
/* linux_migrate_pages */
@@ -2410,7 +2417,9 @@ systrace_args(int sysnum, void *params, uint64_t *uarg, int *n_args)
}
/* linux_inotify_init1 */
case 332: {
- *n_args = 0;
+ struct linux_inotify_init1_args *p = params;
+ iarg[a++] = p->flags; /* l_int */
+ *n_args = 1;
break;
}
/* linux_preadv */
@@ -6604,9 +6613,32 @@ systrace_entry_setargdesc(int sysnum, int ndx, char *desc, size_t descsz)
break;
/* linux_inotify_add_watch */
case 292:
+ switch (ndx) {
+ case 0:
+ p = "l_int";
+ break;
+ case 1:
+ p = "userland const char *";
+ break;
+ case 2:
+ p = "uint32_t";
+ break;
+ default:
+ break;
+ };
break;
/* linux_inotify_rm_watch */
case 293:
+ switch (ndx) {
+ case 0:
+ p = "l_int";
+ break;
+ case 1:
+ p = "uint32_t";
+ break;
+ default:
+ break;
+ };
break;
/* linux_migrate_pages */
case 294:
@@ -7172,6 +7204,13 @@ systrace_entry_setargdesc(int sysnum, int ndx, char *desc, size_t descsz)
break;
/* linux_inotify_init1 */
case 332:
+ switch (ndx) {
+ case 0:
+ p = "l_int";
+ break;
+ default:
+ break;
+ };
break;
/* linux_preadv */
case 333:
@@ -9889,8 +9928,14 @@ systrace_return_setargdesc(int sysnum, int ndx, char *desc, size_t descsz)
case 291:
/* linux_inotify_add_watch */
case 292:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
/* linux_inotify_rm_watch */
case 293:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
/* linux_migrate_pages */
case 294:
/* linux_openat */
@@ -10062,6 +10107,9 @@ systrace_return_setargdesc(int sysnum, int ndx, char *desc, size_t descsz)
break;
/* linux_inotify_init1 */
case 332:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
/* linux_preadv */
case 333:
if (ndx == 0 || ndx == 1)
diff --git a/sys/i386/linux/syscalls.master b/sys/i386/linux/syscalls.master
index 958336be0f08..2113ea51ac5d 100644
--- a/sys/i386/linux/syscalls.master
+++ b/sys/i386/linux/syscalls.master
@@ -1605,10 +1605,17 @@
int linux_inotify_init(void);
}
292 AUE_NULL STD {
- int linux_inotify_add_watch(void);
+ int linux_inotify_add_watch(
+ l_int fd,
+ const char *pathname,
+ uint32_t mask
+ );
}
293 AUE_NULL STD {
- int linux_inotify_rm_watch(void);
+ int linux_inotify_rm_watch(
+ l_int fd,
+ uint32_t wd
+ );
}
; Linux 2.6.16:
294 AUE_NULL STD {
@@ -1872,7 +1879,9 @@
);
}
332 AUE_NULL STD {
- int linux_inotify_init1(void);
+ int linux_inotify_init1(
+ l_int flags
+ );
}
; Linux 2.6.30:
333 AUE_NULL STD {
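[The three generated files above (linux_proto.h, linux_sysent.c, linux_systrace_args.c) are all derived from syscalls.master; the PADL_/PADR_ members in the generated args structs only pad each argument to a register-sized slot. A hedged sketch of the handler shape that consumes such a struct; the helper name is hypothetical, the real handler body is not part of this excerpt:]

int
linux_inotify_add_watch(struct thread *td,
    struct linux_inotify_add_watch_args *args)
{
    /* Hypothetical common routine; the patch's actual body is not shown. */
    return (kern_do_inotify_add_watch(td, args->fd, args->pathname,
        args->mask));
}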
diff --git a/sys/kern/init_sysent.c b/sys/kern/init_sysent.c
index a48a513aa3b5..91792430d24c 100644
--- a/sys/kern/init_sysent.c
+++ b/sys/kern/init_sysent.c
@@ -658,5 +658,7 @@ struct sysent sysent[] = {
{ .sy_narg = AS(getrlimitusage_args), .sy_call = (sy_call_t *)sys_getrlimitusage, .sy_auevent = AUE_NULL, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 589 = getrlimitusage */
{ .sy_narg = AS(fchroot_args), .sy_call = (sy_call_t *)sys_fchroot, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 590 = fchroot */
{ .sy_narg = AS(setcred_args), .sy_call = (sy_call_t *)sys_setcred, .sy_auevent = AUE_SETCRED, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 591 = setcred */
- { .sy_narg = AS(exterrctl_args), .sy_call = (sy_call_t *)sys_exterrctl, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 592 = exterrctl */
+ { .sy_narg = AS(exterrctl_args), .sy_call = (sy_call_t *)sys_exterrctl, .sy_auevent = AUE_NULL, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 592 = exterrctl */
+ { .sy_narg = AS(inotify_add_watch_at_args), .sy_call = (sy_call_t *)sys_inotify_add_watch_at, .sy_auevent = AUE_INOTIFY, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 593 = inotify_add_watch_at */
+ { .sy_narg = AS(inotify_rm_watch_args), .sy_call = (sy_call_t *)sys_inotify_rm_watch, .sy_auevent = AUE_INOTIFY, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 594 = inotify_rm_watch */
};
diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c
index ac4b6ac3f457..a27ab33b34da 100644
--- a/sys/kern/kern_descrip.c
+++ b/sys/kern/kern_descrip.c
@@ -38,9 +38,11 @@
#include "opt_ddb.h"
#include "opt_ktrace.h"
+#define EXTERR_CATEGORY EXTERR_CAT_FILEDESC
#include <sys/systm.h>
#include <sys/capsicum.h>
#include <sys/conf.h>
+#include <sys/exterrvar.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
@@ -478,6 +480,92 @@ kern_fcntl_freebsd(struct thread *td, int fd, int cmd, intptr_t arg)
return (error);
}
+struct flags_trans_elem {
+ u_int f;
+ u_int t;
+};
+
+static u_int
+flags_trans(const struct flags_trans_elem *ftes, int nitems, u_int from_flags)
+{
+ u_int res;
+ int i;
+
+ res = 0;
+ for (i = 0; i < nitems; i++) {
+ if ((from_flags & ftes[i].f) != 0)
+ res |= ftes[i].t;
+ }
+ return (res);
+}
+
+static uint8_t
+fd_to_fde_flags(int fd_flags)
+{
+ static const struct flags_trans_elem fd_to_fde_flags_s[] = {
+ { .f = FD_CLOEXEC, .t = UF_EXCLOSE },
+ { .f = FD_CLOFORK, .t = UF_FOCLOSE },
+ { .f = FD_RESOLVE_BENEATH, .t = UF_RESOLVE_BENEATH },
+ };
+
+ return (flags_trans(fd_to_fde_flags_s, nitems(fd_to_fde_flags_s),
+ fd_flags));
+}
+
+static int
+fde_to_fd_flags(uint8_t fde_flags)
+{
+ static const struct flags_trans_elem fde_to_fd_flags_s[] = {
+ { .f = UF_EXCLOSE, .t = FD_CLOEXEC },
+ { .f = UF_FOCLOSE, .t = FD_CLOFORK },
+ { .f = UF_RESOLVE_BENEATH, .t = FD_RESOLVE_BENEATH },
+ };
+
+ return (flags_trans(fde_to_fd_flags_s, nitems(fde_to_fd_flags_s),
+ fde_flags));
+}
+
+static uint8_t
+fddup_to_fde_flags(int fddup_flags)
+{
+ static const struct flags_trans_elem fddup_to_fde_flags_s[] = {
+ { .f = FDDUP_FLAG_CLOEXEC, .t = UF_EXCLOSE },
+ { .f = FDDUP_FLAG_CLOFORK, .t = UF_FOCLOSE },
+ };
+
+ return (flags_trans(fddup_to_fde_flags_s, nitems(fddup_to_fde_flags_s),
+ fddup_flags));
+}
+
+static uint8_t
+close_range_to_fde_flags(int close_range_flags)
+{
+ static const struct flags_trans_elem close_range_to_fde_flags_s[] = {
+ { .f = CLOSE_RANGE_CLOEXEC, .t = UF_EXCLOSE },
+ { .f = CLOSE_RANGE_CLOFORK, .t = UF_FOCLOSE },
+ };
+
+ return (flags_trans(close_range_to_fde_flags_s,
+ nitems(close_range_to_fde_flags_s), close_range_flags));
+}
+
+static uint8_t
+open_to_fde_flags(int open_flags, bool sticky_orb)
+{
+ static const struct flags_trans_elem open_to_fde_flags_s[] = {
+ { .f = O_CLOEXEC, .t = UF_EXCLOSE },
+ { .f = O_CLOFORK, .t = UF_FOCLOSE },
+ { .f = O_RESOLVE_BENEATH, .t = UF_RESOLVE_BENEATH },
+ };
+#if defined(__clang__) && __clang_major__ >= 19
+ _Static_assert(open_to_fde_flags_s[nitems(open_to_fde_flags_s) - 1].f ==
+ O_RESOLVE_BENEATH, "O_RESOLVE_BENEATH must be last, for sticky_orb");
+#endif
+
+ return (flags_trans(open_to_fde_flags_s, nitems(open_to_fde_flags_s) -
+ (sticky_orb ? 0 : 1), open_flags));
+}
+
int
kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg)
{
@@ -492,6 +580,7 @@ kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg)
int error, flg, kif_sz, seals, tmp, got_set, got_cleared;
uint64_t bsize;
off_t foffset;
+ int flags;
error = 0;
flg = F_POSIX;
@@ -511,6 +600,11 @@ kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg)
error = kern_dup(td, FDDUP_FCNTL, FDDUP_FLAG_CLOEXEC, fd, tmp);
break;
+ case F_DUPFD_CLOFORK:
+ tmp = arg;
+ error = kern_dup(td, FDDUP_FCNTL, FDDUP_FLAG_CLOFORK, fd, tmp);
+ break;
+
case F_DUP2FD:
tmp = arg;
error = kern_dup(td, FDDUP_FIXED, 0, fd, tmp);
@@ -526,10 +620,7 @@ kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg)
FILEDESC_SLOCK(fdp);
fde = fdeget_noref(fdp, fd);
if (fde != NULL) {
- td->td_retval[0] =
- ((fde->fde_flags & UF_EXCLOSE) ? FD_CLOEXEC : 0) |
- ((fde->fde_flags & UF_RESOLVE_BENEATH) ?
- FD_RESOLVE_BENEATH : 0);
+ td->td_retval[0] = fde_to_fd_flags(fde->fde_flags);
error = 0;
}
FILEDESC_SUNLOCK(fdp);
@@ -543,10 +634,8 @@ kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg)
/*
* UF_RESOLVE_BENEATH is sticky and cannot be cleared.
*/
- fde->fde_flags = (fde->fde_flags & ~UF_EXCLOSE) |
- ((arg & FD_CLOEXEC) != 0 ? UF_EXCLOSE : 0) |
- ((arg & FD_RESOLVE_BENEATH) != 0 ?
- UF_RESOLVE_BENEATH : 0);
+ fde->fde_flags = (fde->fde_flags &
+ ~(UF_EXCLOSE | UF_FOCLOSE)) | fd_to_fde_flags(arg);
error = 0;
}
FILEDESC_XUNLOCK(fdp);
@@ -916,7 +1005,17 @@ revert_f_setfl:
break;
default:
- error = EINVAL;
+ if ((cmd & ((1u << F_DUP3FD_SHIFT) - 1)) != F_DUP3FD)
+ return (EXTERROR(EINVAL, "invalid fcntl cmd"));
+ /* Handle F_DUP3FD */
+ flags = (cmd >> F_DUP3FD_SHIFT);
+ if ((flags & ~(FD_CLOEXEC | FD_CLOFORK)) != 0)
+ return (EXTERROR(EINVAL, "invalid flags for F_DUP3FD"));
+ tmp = arg;
+ error = kern_dup(td, FDDUP_FIXED,
+ ((flags & FD_CLOEXEC) != 0 ? FDDUP_FLAG_CLOEXEC : 0) |
+ ((flags & FD_CLOFORK) != 0 ? FDDUP_FLAG_CLOFORK : 0),
+ fd, tmp);
break;
}
return (error);
@@ -946,7 +1045,7 @@ kern_dup(struct thread *td, u_int mode, int flags, int old, int new)
fdp = p->p_fd;
oioctls = NULL;
- MPASS((flags & ~(FDDUP_FLAG_CLOEXEC)) == 0);
+ MPASS((flags & ~(FDDUP_FLAG_CLOEXEC | FDDUP_FLAG_CLOFORK)) == 0);
MPASS(mode < FDDUP_LASTMODE);
AUDIT_ARG_FD(old);
@@ -971,8 +1070,7 @@ kern_dup(struct thread *td, u_int mode, int flags, int old, int new)
goto unlock;
if (mode == FDDUP_FIXED && old == new) {
td->td_retval[0] = new;
- if (flags & FDDUP_FLAG_CLOEXEC)
- fdp->fd_ofiles[new].fde_flags |= UF_EXCLOSE;
+ fdp->fd_ofiles[new].fde_flags |= fddup_to_fde_flags(flags);
error = 0;
goto unlock;
}
@@ -1047,10 +1145,8 @@ kern_dup(struct thread *td, u_int mode, int flags, int old, int new)
fde_copy(oldfde, newfde);
filecaps_copy_finish(&oldfde->fde_caps, &newfde->fde_caps,
nioctls);
- if ((flags & FDDUP_FLAG_CLOEXEC) != 0)
- newfde->fde_flags = oldfde->fde_flags | UF_EXCLOSE;
- else
- newfde->fde_flags = oldfde->fde_flags & ~UF_EXCLOSE;
+ newfde->fde_flags = (oldfde->fde_flags & ~(UF_EXCLOSE | UF_FOCLOSE)) |
+ fddup_to_fde_flags(flags);
#ifdef CAPABILITIES
seqc_write_end(&newfde->fde_seqc);
#endif
@@ -1416,13 +1512,14 @@ kern_close(struct thread *td, int fd)
}
static int
-close_range_cloexec(struct thread *td, u_int lowfd, u_int highfd)
+close_range_flags(struct thread *td, u_int lowfd, u_int highfd, int flags)
{
struct filedesc *fdp;
struct fdescenttbl *fdt;
struct filedescent *fde;
- int fd;
+ int fd, fde_flags;
+ fde_flags = close_range_to_fde_flags(flags);
fdp = td->td_proc->p_fd;
FILEDESC_XLOCK(fdp);
fdt = atomic_load_ptr(&fdp->fd_files);
@@ -1434,7 +1531,7 @@ close_range_cloexec(struct thread *td, u_int lowfd, u_int highfd)
for (; fd <= highfd; fd++) {
fde = &fdt->fdt_ofiles[fd];
if (fde->fde_file != NULL)
- fde->fde_flags |= UF_EXCLOSE;
+ fde->fde_flags |= fde_flags;
}
out_locked:
FILEDESC_XUNLOCK(fdp);
@@ -1492,8 +1589,8 @@ kern_close_range(struct thread *td, int flags, u_int lowfd, u_int highfd)
return (EINVAL);
}
- if ((flags & CLOSE_RANGE_CLOEXEC) != 0)
- return (close_range_cloexec(td, lowfd, highfd));
+ if ((flags & (CLOSE_RANGE_CLOEXEC | CLOSE_RANGE_CLOFORK)) != 0)
+ return (close_range_flags(td, lowfd, highfd, flags));
return (close_range_impl(td, lowfd, highfd));
}
@@ -1513,7 +1610,7 @@ sys_close_range(struct thread *td, struct close_range_args *uap)
AUDIT_ARG_CMD(uap->highfd);
AUDIT_ARG_FFLAGS(uap->flags);
- if ((uap->flags & ~(CLOSE_RANGE_CLOEXEC)) != 0)
+ if ((uap->flags & ~(CLOSE_RANGE_CLOEXEC | CLOSE_RANGE_CLOFORK)) != 0)
return (EINVAL);
return (kern_close_range(td, uap->flags, uap->lowfd, uap->highfd));
}
@@ -2171,8 +2268,7 @@ _finstall(struct filedesc *fdp, struct file *fp, int fd, int flags,
seqc_write_begin(&fde->fde_seqc);
#endif
fde->fde_file = fp;
- fde->fde_flags = ((flags & O_CLOEXEC) != 0 ? UF_EXCLOSE : 0) |
- ((flags & O_RESOLVE_BENEATH) != 0 ? UF_RESOLVE_BENEATH : 0);
+ fde->fde_flags = open_to_fde_flags(flags, true);
if (fcaps != NULL)
filecaps_move(fcaps, &fde->fde_caps);
else
@@ -2432,6 +2528,7 @@ fdcopy(struct filedesc *fdp)
newfdp->fd_freefile = fdp->fd_freefile;
FILEDESC_FOREACH_FDE(fdp, i, ofde) {
if ((ofde->fde_file->f_ops->fo_flags & DFLAG_PASSABLE) == 0 ||
+ (ofde->fde_flags & UF_FOCLOSE) != 0 ||
!fhold(ofde->fde_file)) {
if (newfdp->fd_freefile == fdp->fd_freefile)
newfdp->fd_freefile = i;
@@ -2729,6 +2826,12 @@ fdcloseexec(struct thread *td)
fdfree(fdp, i);
(void) closefp(fdp, i, fp, td, false, false);
FILEDESC_UNLOCK_ASSERT(fdp);
+ } else if (fde->fde_flags & UF_FOCLOSE) {
+ /*
+ * https://austingroupbugs.net/view.php?id=1851
+ * FD_CLOFORK should not be preserved across exec
+ */
+ fde->fde_flags &= ~UF_FOCLOSE;
}
}
}
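[The table-driven flags_trans() added above replaces the chains of conditional ORs previously scattered through kern_descrip.c; supporting a new bit such as FD_CLOFORK -> UF_FOCLOSE becomes a one-line table entry. A standalone rendition of the same technique, with made-up flag values:]

#include <stdio.h>

struct flags_trans_elem { unsigned f, t; };

/* Map each set bit in the source flag space to its destination bit. */
static unsigned
flags_trans(const struct flags_trans_elem *t, int n, unsigned from)
{
    unsigned res = 0;

    for (int i = 0; i < n; i++)
        if ((from & t[i].f) != 0)
            res |= t[i].t;
    return (res);
}

int
main(void)
{
    enum { FD_CLOEXEC_ = 1, FD_CLOFORK_ = 2 };    /* source space */
    enum { UF_EXCLOSE_ = 4, UF_FOCLOSE_ = 8 };    /* destination space */
    static const struct flags_trans_elem tab[] = {
        { FD_CLOEXEC_, UF_EXCLOSE_ },
        { FD_CLOFORK_, UF_FOCLOSE_ },
    };

    /* prints 0xc: both bits translated */
    printf("%#x\n", flags_trans(tab, 2, FD_CLOEXEC_ | FD_CLOFORK_));
    return (0);
}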
diff --git a/sys/kern/kern_exec.c b/sys/kern/kern_exec.c
index cf067527237e..03268365891e 100644
--- a/sys/kern/kern_exec.c
+++ b/sys/kern/kern_exec.c
@@ -29,6 +29,7 @@
#include <sys/cdefs.h>
#include "opt_capsicum.h"
#include "opt_hwpmc_hooks.h"
+#include "opt_hwt_hooks.h"
#include "opt_ktrace.h"
#include "opt_vm.h"
@@ -90,6 +91,10 @@
#include <sys/pmckern.h>
#endif
+#ifdef HWT_HOOKS
+#include <dev/hwt/hwt_hook.h>
+#endif
+
#include <security/audit/audit.h>
#include <security/mac/mac_framework.h>
@@ -936,6 +941,20 @@ interpret:
}
#endif
+#ifdef HWT_HOOKS
+ if ((td->td_proc->p_flag2 & P2_HWT) != 0) {
+ struct hwt_record_entry ent;
+
+ VOP_UNLOCK(imgp->vp);
+ ent.fullpath = imgp->execpath;
+ ent.addr = imgp->et_dyn_addr;
+ ent.baseaddr = imgp->reloc_base;
+ ent.record_type = HWT_RECORD_EXECUTABLE;
+ HWT_CALL_HOOK(td, HWT_EXEC, &ent);
+ vn_lock(imgp->vp, LK_SHARED | LK_RETRY);
+ }
+#endif
+
/* Set values passed into the program in registers. */
(*p->p_sysent->sv_setregs)(td, imgp, stack_base);
diff --git a/sys/kern/kern_linker.c b/sys/kern/kern_linker.c
index f388ac8a583a..d566bc01bc5e 100644
--- a/sys/kern/kern_linker.c
+++ b/sys/kern/kern_linker.c
@@ -30,6 +30,7 @@
#include "opt_ddb.h"
#include "opt_kld.h"
#include "opt_hwpmc_hooks.h"
+#include "opt_hwt_hooks.h"
#include <sys/param.h>
#include <sys/systm.h>
@@ -64,7 +65,7 @@
#include "linker_if.h"
-#ifdef HWPMC_HOOKS
+#if defined(HWPMC_HOOKS) || defined(HWT_HOOKS)
#include <sys/pmckern.h>
#endif
@@ -2184,7 +2185,7 @@ linker_basename(const char *path)
return (filename);
}
-#ifdef HWPMC_HOOKS
+#if defined(HWPMC_HOOKS) || defined(HWT_HOOKS)
/*
* Inform hwpmc about the set of kernel modules currently loaded.
*/
diff --git a/sys/kern/kern_pmc.c b/sys/kern/kern_pmc.c
index a3b572976fbf..15afe1a46d07 100644
--- a/sys/kern/kern_pmc.c
+++ b/sys/kern/kern_pmc.c
@@ -72,6 +72,10 @@ int __read_mostly (*pmc_hook)(struct thread *td, int function, void *arg) = NULL
/* Interrupt handler */
int __read_mostly (*pmc_intr)(struct trapframe *tf) = NULL;
+/* HWT hooks */
+void __read_mostly (*hwt_hook)(struct thread *td, int func, void *arg) = NULL;
+int __read_mostly (*hwt_intr)(struct trapframe *tf) = NULL;
+
DPCPU_DEFINE(uint8_t, pmc_sampled);
/*
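[kern_pmc.c now exports hwt_hook/hwt_intr alongside the existing PMC pointers; every HWT_CALL_HOOK site in the scheduler and thread hunks below compiles away unless the kernel is built with HWT_HOOKS and a tracer module registers itself. A minimal userspace model of that optional-hook pattern; all names here are illustrative, not the kernel's:]

#include <stddef.h>
#include <stdio.h>

static void (*trace_hook)(int event, void *arg) = NULL;

/* Call sites pay only a NULL check when no tracer is loaded. */
#define CALL_HOOK(ev, arg) do {            \
    if (trace_hook != NULL)                \
        trace_hook((ev), (arg));           \
} while (0)

static void
my_tracer(int event, void *arg)
{
    (void)arg;
    printf("event %d\n", event);
}

int
main(void)
{
    CALL_HOOK(1, NULL);        /* no-op: hook not registered */
    trace_hook = my_tracer;    /* module load registers the hook */
    CALL_HOOK(2, NULL);        /* now dispatched */
    return (0);
}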
diff --git a/sys/kern/kern_resource.c b/sys/kern/kern_resource.c
index c8b01afeab4f..dcd38c6e6fbe 100644
--- a/sys/kern/kern_resource.c
+++ b/sys/kern/kern_resource.c
@@ -1637,6 +1637,12 @@ uifree(struct uidinfo *uip)
if (uip->ui_pipecnt != 0)
printf("freeing uidinfo: uid = %d, pipecnt = %ld\n",
uip->ui_uid, uip->ui_pipecnt);
+ if (uip->ui_inotifycnt != 0)
+ printf("freeing uidinfo: uid = %d, inotifycnt = %ld\n",
+ uip->ui_uid, uip->ui_inotifycnt);
+ if (uip->ui_inotifywatchcnt != 0)
+ printf("freeing uidinfo: uid = %d, inotifywatchcnt = %ld\n",
+ uip->ui_uid, uip->ui_inotifywatchcnt);
free(uip, M_UIDINFO);
}
@@ -1742,6 +1748,21 @@ chgpipecnt(struct uidinfo *uip, int diff, rlim_t max)
return (chglimit(uip, &uip->ui_pipecnt, diff, max, "pipecnt"));
}
+int
+chginotifycnt(struct uidinfo *uip, int diff, rlim_t max)
+{
+
+ return (chglimit(uip, &uip->ui_inotifycnt, diff, max, "inotifycnt"));
+}
+
+int
+chginotifywatchcnt(struct uidinfo *uip, int diff, rlim_t max)
+{
+
+ return (chglimit(uip, &uip->ui_inotifywatchcnt, diff, max,
+ "inotifywatchcnt"));
+}
+
static int
sysctl_kern_proc_rlimit_usage(SYSCTL_HANDLER_ARGS)
{
diff --git a/sys/kern/kern_sendfile.c b/sys/kern/kern_sendfile.c
index 17b53208157a..35b258e68701 100644
--- a/sys/kern/kern_sendfile.c
+++ b/sys/kern/kern_sendfile.c
@@ -27,12 +27,12 @@
* SUCH DAMAGE.
*/
-#include <sys/cdefs.h>
#include "opt_kern_tls.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/capsicum.h>
+#include <sys/inotify.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/ktls.h>
@@ -1246,6 +1246,8 @@ out:
*/
if (error == 0) {
td->td_retval[0] = 0;
+ if (sbytes > 0 && vp != NULL)
+ INOTIFY(vp, IN_ACCESS);
}
if (sent != NULL) {
(*sent) = sbytes;
diff --git a/sys/kern/kern_sig.c b/sys/kern/kern_sig.c
index 4565abc4b540..5d51aa675cb7 100644
--- a/sys/kern/kern_sig.c
+++ b/sys/kern/kern_sig.c
@@ -1050,8 +1050,7 @@ osigaction(struct thread *td, struct osigaction_args *uap)
int
osigreturn(struct thread *td, struct osigreturn_args *uap)
{
-
- return (nosys(td, (struct nosys_args *)uap));
+ return (kern_nosys(td, 0));
}
#endif
#endif /* COMPAT_43 */
@@ -4139,7 +4138,7 @@ coredump(struct thread *td)
struct flock lf;
struct vattr vattr;
size_t fullpathsize;
- int error, error1, locked;
+ int error, error1, jid, locked, ppid, sig;
char *name; /* name of corefile */
void *rl_cookie;
off_t limit;
@@ -4168,6 +4167,10 @@ coredump(struct thread *td)
PROC_UNLOCK(p);
return (EFBIG);
}
+
+ ppid = p->p_oppid;
+ sig = p->p_sig;
+ jid = p->p_ucred->cr_prison->pr_id;
PROC_UNLOCK(p);
error = corefile_open(p->p_comm, cred->cr_uid, p->p_pid, td,
@@ -4253,6 +4256,9 @@ coredump(struct thread *td)
}
devctl_safe_quote_sb(sb, name);
sbuf_putc(sb, '"');
+
+ sbuf_printf(sb, " jid=%d pid=%d ppid=%d signo=%d",
+ jid, p->p_pid, ppid, sig);
if (sbuf_finish(sb) == 0)
devctl_notify("kernel", "signal", "coredump", sbuf_data(sb));
out2:
@@ -4281,6 +4287,12 @@ struct nosys_args {
int
nosys(struct thread *td, struct nosys_args *args)
{
+ return (kern_nosys(td, args->dummy));
+}
+
+int
+kern_nosys(struct thread *td, int dummy)
+{
struct proc *p;
p = td->td_proc;
diff --git a/sys/kern/kern_syscalls.c b/sys/kern/kern_syscalls.c
index 24406763a93a..a93d711e7597 100644
--- a/sys/kern/kern_syscalls.c
+++ b/sys/kern/kern_syscalls.c
@@ -35,6 +35,7 @@
#include <sys/resourcevar.h>
#include <sys/sx.h>
#include <sys/syscall.h>
+#include <sys/syscallsubr.h>
#include <sys/sysent.h>
#include <sys/sysproto.h>
#include <sys/systm.h>
@@ -50,14 +51,14 @@ int
lkmnosys(struct thread *td, struct nosys_args *args)
{
- return (nosys(td, args));
+ return (kern_nosys(td, 0));
}
int
lkmressys(struct thread *td, struct nosys_args *args)
{
- return (nosys(td, args));
+ return (kern_nosys(td, 0));
}
struct sysent nosys_sysent = {
diff --git a/sys/kern/kern_thr.c b/sys/kern/kern_thr.c
index 8ad885b42ebe..0e8c2b9f362e 100644
--- a/sys/kern/kern_thr.c
+++ b/sys/kern/kern_thr.c
@@ -29,7 +29,7 @@
#include "opt_ktrace.h"
#include "opt_posix.h"
#include "opt_hwpmc_hooks.h"
-
+#include "opt_hwt_hooks.h"
#include <sys/systm.h>
#include <sys/kernel.h>
#ifdef KTRACE
@@ -60,6 +60,9 @@
#ifdef HWPMC_HOOKS
#include <sys/pmckern.h>
#endif
+#ifdef HWT_HOOKS
+#include <dev/hwt/hwt_hook.h>
+#endif
#include <machine/frame.h>
@@ -280,6 +283,10 @@ thread_create(struct thread *td, struct rtprio *rtp,
PMC_CALL_HOOK_UNLOCKED(newtd, PMC_FN_THR_CREATE_LOG, NULL);
#endif
+#ifdef HWT_HOOKS
+ HWT_CALL_HOOK(newtd, HWT_THREAD_CREATE, NULL);
+#endif
+
tidhash_add(newtd);
/* ignore timesharing class */
@@ -613,6 +620,9 @@ sys_thr_set_name(struct thread *td, struct thr_set_name_args *uap)
if (PMC_PROC_IS_USING_PMCS(p) || PMC_SYSTEM_SAMPLING_ACTIVE())
PMC_CALL_HOOK_UNLOCKED(ttd, PMC_FN_THR_CREATE_LOG, NULL);
#endif
+#ifdef HWT_HOOKS
+ HWT_CALL_HOOK(ttd, HWT_THREAD_SET_NAME, NULL);
+#endif
#ifdef KTR
sched_clear_tdname(ttd);
#endif
diff --git a/sys/kern/kern_thread.c b/sys/kern/kern_thread.c
index 2dff461e932a..f853af193016 100644
--- a/sys/kern/kern_thread.c
+++ b/sys/kern/kern_thread.c
@@ -30,6 +30,7 @@
#include "opt_witness.h"
#include "opt_hwpmc_hooks.h"
+#include "opt_hwt_hooks.h"
#include <sys/systm.h>
#include <sys/asan.h>
@@ -60,6 +61,9 @@
#ifdef HWPMC_HOOKS
#include <sys/pmckern.h>
#endif
+#ifdef HWT_HOOKS
+#include <dev/hwt/hwt_hook.h>
+#endif
#include <sys/priv.h>
#include <security/audit/audit.h>
@@ -1002,6 +1006,11 @@ thread_exit(void)
} else if (PMC_SYSTEM_SAMPLING_ACTIVE())
PMC_CALL_HOOK_UNLOCKED(td, PMC_FN_THR_EXIT_LOG, NULL);
#endif
+
+#ifdef HWT_HOOKS
+ HWT_CALL_HOOK(td, HWT_THREAD_EXIT, NULL);
+#endif
+
PROC_UNLOCK(p);
PROC_STATLOCK(p);
thread_lock(td);
diff --git a/sys/kern/sched_4bsd.c b/sys/kern/sched_4bsd.c
index 753494983416..504f9a2338ef 100644
--- a/sys/kern/sched_4bsd.c
+++ b/sys/kern/sched_4bsd.c
@@ -36,6 +36,7 @@
#include <sys/cdefs.h>
#include "opt_hwpmc_hooks.h"
+#include "opt_hwt_hooks.h"
#include "opt_sched.h"
#include <sys/param.h>
@@ -63,6 +64,10 @@
#include <sys/pmckern.h>
#endif
+#ifdef HWT_HOOKS
+#include <dev/hwt/hwt_hook.h>
+#endif
+
#ifdef KDTRACE_HOOKS
#include <sys/dtrace_bsd.h>
int __read_mostly dtrace_vtime_active;
@@ -1075,6 +1080,11 @@ sched_switch(struct thread *td, int flags)
PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT);
#endif
+#ifdef HWT_HOOKS
+ HWT_CALL_HOOK(td, HWT_SWITCH_OUT, NULL);
+ HWT_CALL_HOOK(newtd, HWT_SWITCH_IN, NULL);
+#endif
+
SDT_PROBE2(sched, , , off__cpu, newtd, newtd->td_proc);
/* I feel sleepy */
@@ -1696,10 +1706,20 @@ sched_idletd(void *dummy)
static void
sched_throw_tail(struct thread *td)
{
+ struct thread *newtd;
mtx_assert(&sched_lock, MA_OWNED);
KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count"));
- cpu_throw(td, choosethread()); /* doesn't return */
+
+ newtd = choosethread();
+
+#ifdef HWT_HOOKS
+ if (td)
+ HWT_CALL_HOOK(td, HWT_SWITCH_OUT, NULL);
+ HWT_CALL_HOOK(newtd, HWT_SWITCH_IN, NULL);
+#endif
+
+ cpu_throw(td, newtd); /* doesn't return */
}
/*
diff --git a/sys/kern/sched_ule.c b/sys/kern/sched_ule.c
index 508ec0ab97ec..409439ca34da 100644
--- a/sys/kern/sched_ule.c
+++ b/sys/kern/sched_ule.c
@@ -39,6 +39,7 @@
#include <sys/cdefs.h>
#include "opt_hwpmc_hooks.h"
+#include "opt_hwt_hooks.h"
#include "opt_sched.h"
#include <sys/param.h>
@@ -69,6 +70,10 @@
#include <sys/pmckern.h>
#endif
+#ifdef HWT_HOOKS
+#include <dev/hwt/hwt_hook.h>
+#endif
+
#ifdef KDTRACE_HOOKS
#include <sys/dtrace_bsd.h>
int __read_mostly dtrace_vtime_active;
@@ -2432,6 +2437,12 @@ sched_switch(struct thread *td, int flags)
if (dtrace_vtime_active)
(*dtrace_vtime_switch_func)(newtd);
#endif
+
+#ifdef HWT_HOOKS
+ HWT_CALL_HOOK(td, HWT_SWITCH_OUT, NULL);
+ HWT_CALL_HOOK(newtd, HWT_SWITCH_IN, NULL);
+#endif
+
td->td_oncpu = NOCPU;
cpu_switch(td, newtd, mtx);
cpuid = td->td_oncpu = PCPU_GET(cpuid);
@@ -3252,6 +3263,10 @@ sched_ap_entry(void)
newtd = sched_throw_grab(tdq);
+#ifdef HWT_HOOKS
+ HWT_CALL_HOOK(newtd, HWT_SWITCH_IN, NULL);
+#endif
+
/* doesn't return */
cpu_throw(NULL, newtd);
}
@@ -3278,6 +3293,10 @@ sched_throw(struct thread *td)
newtd = sched_throw_grab(tdq);
+#ifdef HWT_HOOKS
+ HWT_CALL_HOOK(newtd, HWT_SWITCH_IN, NULL);
+#endif
+
/* doesn't return */
cpu_switch(td, newtd, TDQ_LOCKPTR(tdq));
}
diff --git a/sys/kern/subr_asan.c b/sys/kern/subr_asan.c
index 0edb631d1475..464efda1e91a 100644
--- a/sys/kern/subr_asan.c
+++ b/sys/kern/subr_asan.c
@@ -263,8 +263,7 @@ kasan_mark(const void *addr, size_t size, size_t redzsize, uint8_t code)
if (__predict_false(!kasan_enabled))
return;
- if ((vm_offset_t)addr >= DMAP_MIN_ADDRESS &&
- (vm_offset_t)addr < DMAP_MAX_ADDRESS)
+ if (kasan_md_unsupported((vm_offset_t)addr))
return;
KASSERT((vm_offset_t)addr >= VM_MIN_KERNEL_ADDRESS &&
diff --git a/sys/kern/subr_capability.c b/sys/kern/subr_capability.c
index 7cc6fb593697..5ad5b0af1681 100644
--- a/sys/kern/subr_capability.c
+++ b/sys/kern/subr_capability.c
@@ -74,6 +74,10 @@ const cap_rights_t cap_getsockopt_rights =
CAP_RIGHTS_INITIALIZER(CAP_GETSOCKOPT);
const cap_rights_t cap_getsockname_rights =
CAP_RIGHTS_INITIALIZER(CAP_GETSOCKNAME);
+const cap_rights_t cap_inotify_add_rights =
+ CAP_RIGHTS_INITIALIZER(CAP_INOTIFY_ADD);
+const cap_rights_t cap_inotify_rm_rights =
+ CAP_RIGHTS_INITIALIZER(CAP_INOTIFY_RM);
const cap_rights_t cap_ioctl_rights = CAP_RIGHTS_INITIALIZER(CAP_IOCTL);
const cap_rights_t cap_listen_rights = CAP_RIGHTS_INITIALIZER(CAP_LISTEN);
const cap_rights_t cap_linkat_source_rights =
diff --git a/sys/kern/subr_pctrie.c b/sys/kern/subr_pctrie.c
index 3a3548bad52b..bb86c779b936 100644
--- a/sys/kern/subr_pctrie.c
+++ b/sys/kern/subr_pctrie.c
@@ -691,21 +691,23 @@ _pctrie_lookup_ge(struct pctrie *ptree, struct pctrie_node *node,
*/
if (node == PCTRIE_NULL || *pctrie_toval(node) < index) {
/* Climb the path to find a node with a descendant > index. */
- for (node = parent; node != NULL; node = pctrie_parent(node)) {
- slot = pctrie_slot(node, index) + 1;
- if ((node->pn_popmap >> slot) != 0)
+ node = NULL;
+ while (parent != NULL) {
+ slot = pctrie_slot(parent, index) + 1;
+ if ((parent->pn_popmap >> slot) != 0)
break;
+ node = parent;
+ parent = pctrie_parent(node);
}
- if (node == NULL) {
+ if (parent == NULL) {
if (parent_out != NULL)
- *parent_out = NULL;
+ *parent_out = node;
return (NULL);
}
/* Step to the least child with a descendant > index. */
- slot += ffs(node->pn_popmap >> slot) - 1;
- parent = node;
- node = pctrie_node_load(&node->pn_child[slot], NULL,
+ slot += ffs(parent->pn_popmap >> slot) - 1;
+ node = pctrie_node_load(&parent->pn_child[slot], NULL,
PCTRIE_LOCKED);
}
/* Descend to the least leaf of the subtrie. */
@@ -785,21 +787,23 @@ _pctrie_lookup_le(struct pctrie *ptree, struct pctrie_node *node,
*/
if (node == PCTRIE_NULL || *pctrie_toval(node) > index) {
/* Climb the path to find a node with a descendant < index. */
- for (node = parent; node != NULL; node = pctrie_parent(node)) {
- slot = pctrie_slot(node, index);
- if ((node->pn_popmap & ((1 << slot) - 1)) != 0)
+ node = NULL;
+ while (parent != NULL) {
+ slot = pctrie_slot(parent, index);
+ if ((parent->pn_popmap & ((1 << slot) - 1)) != 0)
break;
+ node = parent;
+ parent = pctrie_parent(node);
}
- if (node == NULL) {
+ if (parent == NULL) {
if (parent_out != NULL)
- *parent_out = NULL;
+ *parent_out = node;
return (NULL);
}
/* Step to the greatest child with a descendant < index. */
- slot = ilog2(node->pn_popmap & ((1 << slot) - 1));
- parent = node;
- node = pctrie_node_load(&node->pn_child[slot], NULL,
+ slot = ilog2(parent->pn_popmap & ((1 << slot) - 1));
+ node = pctrie_node_load(&parent->pn_child[slot], NULL,
PCTRIE_LOCKED);
}
/* Descend to the greatest leaf of the subtrie. */
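[The reworked loops above track the parent explicitly while climbing, then use the node's popmap to find the nearest populated slot. The bit manipulation is the heart of it; this standalone fragment shows how ffs() on the shifted popmap locates the least populated slot greater than the current one:]

#include <stdio.h>
#include <strings.h>

int
main(void)
{
    unsigned popmap = 0x94;    /* children at slots 2, 4, and 7 */
    int slot = 2 + 1;          /* looking for a child after slot 2 */

    if ((popmap >> slot) != 0)
        slot += ffs(popmap >> slot) - 1;
    printf("next populated slot: %d\n", slot);    /* prints 4 */
    return (0);
}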
diff --git a/sys/kern/subr_trap.c b/sys/kern/subr_trap.c
index 18388ae5f232..bac7d0080c71 100644
--- a/sys/kern/subr_trap.c
+++ b/sys/kern/subr_trap.c
@@ -338,8 +338,9 @@ ast_handler(struct thread *td, struct trapframe *framep, bool dtor)
td->td_ast = 0;
}
- CTR3(KTR_SYSC, "ast: thread %p (pid %d, %s)", td, td->td_proc->p_pid,
- td->td_proc->p_comm);
+ CTR3(KTR_SYSC, "ast: thread %p (pid %d, %s)", td,
+ td->td_proc == NULL ? -1 : td->td_proc->p_pid,
+ td->td_proc == NULL ? "" : td->td_proc->p_comm);
KASSERT(framep == NULL || TRAPF_USERMODE(framep),
("ast in kernel mode"));
diff --git a/sys/kern/sys_generic.c b/sys/kern/sys_generic.c
index d31ff3b939cc..b472aaea89e6 100644
--- a/sys/kern/sys_generic.c
+++ b/sys/kern/sys_generic.c
@@ -37,16 +37,17 @@
#include "opt_capsicum.h"
#include "opt_ktrace.h"
-#define EXTERR_CATEGORY EXTERR_CAT_FILEDESC
+#define EXTERR_CATEGORY EXTERR_CAT_GENIO
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/capsicum.h>
+#include <sys/exterrvar.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/fcntl.h>
#include <sys/file.h>
-#include <sys/exterrvar.h>
+#include <sys/inotify.h>
#include <sys/lock.h>
#include <sys/proc.h>
#include <sys/signalvar.h>
@@ -195,7 +196,7 @@ sys_read(struct thread *td, struct read_args *uap)
int error;
if (uap->nbyte > IOSIZE_MAX)
- return (EINVAL);
+ return (EXTERROR(EINVAL, "length > iosize_max"));
aiov.iov_base = uap->buf;
aiov.iov_len = uap->nbyte;
auio.uio_iov = &aiov;
@@ -233,7 +234,7 @@ kern_pread(struct thread *td, int fd, void *buf, size_t nbyte, off_t offset)
int error;
if (nbyte > IOSIZE_MAX)
- return (EINVAL);
+ return (EXTERROR(EINVAL, "length > iosize_max"));
aiov.iov_base = buf;
aiov.iov_len = nbyte;
auio.uio_iov = &aiov;
@@ -329,7 +330,7 @@ kern_preadv(struct thread *td, int fd, struct uio *auio, off_t offset)
error = ESPIPE;
else if (offset < 0 &&
(fp->f_vnode == NULL || fp->f_vnode->v_type != VCHR))
- error = EINVAL;
+ error = EXTERROR(EINVAL, "neg offset");
else
error = dofileread(td, fd, fp, auio, offset, FOF_OFFSET);
fdrop(fp, td);
@@ -396,7 +397,7 @@ sys_write(struct thread *td, struct write_args *uap)
int error;
if (uap->nbyte > IOSIZE_MAX)
- return (EINVAL);
+ return (EXTERROR(EINVAL, "length > iosize_max"));
aiov.iov_base = (void *)(uintptr_t)uap->buf;
aiov.iov_len = uap->nbyte;
auio.uio_iov = &aiov;
@@ -435,7 +436,7 @@ kern_pwrite(struct thread *td, int fd, const void *buf, size_t nbyte,
int error;
if (nbyte > IOSIZE_MAX)
- return (EINVAL);
+ return (EXTERROR(EINVAL, "length > iosize_max"));
aiov.iov_base = (void *)(uintptr_t)buf;
aiov.iov_len = nbyte;
auio.uio_iov = &aiov;
@@ -531,7 +532,7 @@ kern_pwritev(struct thread *td, int fd, struct uio *auio, off_t offset)
error = ESPIPE;
else if (offset < 0 &&
(fp->f_vnode == NULL || fp->f_vnode->v_type != VCHR))
- error = EINVAL;
+ error = EXTERROR(EINVAL, "neg offset");
else
error = dofilewrite(td, fd, fp, auio, offset, FOF_OFFSET);
fdrop(fp, td);
@@ -602,14 +603,14 @@ kern_ftruncate(struct thread *td, int fd, off_t length)
AUDIT_ARG_FD(fd);
if (length < 0)
- return (EINVAL);
+ return (EXTERROR(EINVAL, "negative length"));
error = fget(td, fd, &cap_ftruncate_rights, &fp);
if (error)
return (error);
AUDIT_ARG_FILE(td->td_proc, fp);
if (!(fp->f_flag & FWRITE)) {
fdrop(fp, td);
- return (EINVAL);
+ return (EXTERROR(EINVAL, "non-writable"));
}
error = fo_truncate(fp, length, td->td_ucred, td);
fdrop(fp, td);
@@ -840,8 +841,10 @@ kern_posix_fallocate(struct thread *td, int fd, off_t offset, off_t len)
int error;
AUDIT_ARG_FD(fd);
- if (offset < 0 || len <= 0)
- return (EINVAL);
+ if (offset < 0)
+ return (EXTERROR(EINVAL, "negative offset"));
+ if (len <= 0)
+ return (EXTERROR(EINVAL, "negative length"));
/* Check for wrap. */
if (offset > OFF_MAX - len)
return (EFBIG);
@@ -898,16 +901,21 @@ kern_fspacectl(struct thread *td, int fd, int cmd,
AUDIT_ARG_FFLAGS(flags);
if (rqsr == NULL)
- return (EINVAL);
+ return (EXTERROR(EINVAL, "no range"));
rmsr = *rqsr;
if (rmsrp != NULL)
*rmsrp = rmsr;
- if (cmd != SPACECTL_DEALLOC ||
- rqsr->r_offset < 0 || rqsr->r_len <= 0 ||
- rqsr->r_offset > OFF_MAX - rqsr->r_len ||
- (flags & ~SPACECTL_F_SUPPORTED) != 0)
- return (EINVAL);
+ if (cmd != SPACECTL_DEALLOC)
+ return (EXTERROR(EINVAL, "cmd", cmd));
+ if (rqsr->r_offset < 0)
+ return (EXTERROR(EINVAL, "neg offset"));
+ if (rqsr->r_len <= 0)
+ return (EXTERROR(EINVAL, "neg len"));
+ if (rqsr->r_offset > OFF_MAX - rqsr->r_len)
+ return (EXTERROR(EINVAL, "offset too large"));
+ if ((flags & ~SPACECTL_F_SUPPORTED) != 0)
+ return (EXTERROR(EINVAL, "reserved flags", flags));
error = fget_write(td, fd, &cap_pwrite_rights, &fp);
if (error != 0)
@@ -939,7 +947,6 @@ int
kern_specialfd(struct thread *td, int type, void *arg)
{
struct file *fp;
- struct specialfd_eventfd *ae;
int error, fd, fflags;
fflags = 0;
@@ -948,14 +955,24 @@ kern_specialfd(struct thread *td, int type, void *arg)
return (error);
switch (type) {
- case SPECIALFD_EVENTFD:
+ case SPECIALFD_EVENTFD: {
+ struct specialfd_eventfd *ae;
+
ae = arg;
if ((ae->flags & EFD_CLOEXEC) != 0)
fflags |= O_CLOEXEC;
error = eventfd_create_file(td, fp, ae->initval, ae->flags);
break;
+ }
+ case SPECIALFD_INOTIFY: {
+ struct specialfd_inotify *si;
+
+ si = arg;
+ error = inotify_create_file(td, fp, si->flags, &fflags);
+ break;
+ }
default:
- error = EINVAL;
+ error = EXTERROR(EINVAL, "invalid type", type);
break;
}
@@ -970,13 +987,14 @@ kern_specialfd(struct thread *td, int type, void *arg)
int
sys___specialfd(struct thread *td, struct __specialfd_args *args)
{
- struct specialfd_eventfd ae;
int error;
switch (args->type) {
- case SPECIALFD_EVENTFD:
+ case SPECIALFD_EVENTFD: {
+ struct specialfd_eventfd ae;
+
if (args->len != sizeof(struct specialfd_eventfd)) {
- error = EINVAL;
+ error = EXTERROR(EINVAL, "eventfd params ABI");
break;
}
error = copyin(args->req, &ae, sizeof(ae));
@@ -984,13 +1002,27 @@ sys___specialfd(struct thread *td, struct __specialfd_args *args)
break;
if ((ae.flags & ~(EFD_CLOEXEC | EFD_NONBLOCK |
EFD_SEMAPHORE)) != 0) {
- error = EINVAL;
+ error = EXTERROR(EINVAL, "reserved flag");
break;
}
error = kern_specialfd(td, args->type, &ae);
break;
+ }
+ case SPECIALFD_INOTIFY: {
+ struct specialfd_inotify si;
+
+ if (args->len != sizeof(si)) {
+ error = EINVAL;
+ break;
+ }
+ error = copyin(args->req, &si, sizeof(si));
+ if (error != 0)
+ break;
+ error = kern_specialfd(td, args->type, &si);
+ break;
+ }
default:
- error = EINVAL;
+ error = EXTERROR(EINVAL, "unknown type", args->type);
break;
}
return (error);
@@ -1166,7 +1198,7 @@ kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou,
int error, lf, ndu;
if (nd < 0)
- return (EINVAL);
+ return (EXTERROR(EINVAL, "negative ndescs"));
fdp = td->td_proc->p_fd;
ndu = nd;
lf = fdp->fd_nfiles;
@@ -1259,7 +1291,7 @@ kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou,
rtv = *tvp;
if (rtv.tv_sec < 0 || rtv.tv_usec < 0 ||
rtv.tv_usec >= 1000000) {
- error = EINVAL;
+ error = EXTERROR(EINVAL, "invalid timeval");
goto done;
}
if (!timevalisset(&rtv))
@@ -1491,7 +1523,7 @@ sys_poll(struct thread *td, struct poll_args *uap)
if (uap->timeout != INFTIM) {
if (uap->timeout < 0)
- return (EINVAL);
+ return (EXTERROR(EINVAL, "invalid timeout"));
ts.tv_sec = uap->timeout / 1000;
ts.tv_nsec = (uap->timeout % 1000) * 1000000;
tsp = &ts;
@@ -1516,7 +1548,7 @@ kern_poll_kfds(struct thread *td, struct pollfd *kfds, u_int nfds,
precision = 0;
if (tsp != NULL) {
if (!timespecvalid_interval(tsp))
- return (EINVAL);
+ return (EXTERROR(EINVAL, "invalid timespec"));
if (tsp->tv_sec == 0 && tsp->tv_nsec == 0)
sbt = 0;
else {
@@ -1619,7 +1651,7 @@ kern_poll(struct thread *td, struct pollfd *ufds, u_int nfds,
int error;
if (kern_poll_maxfds(nfds))
- return (EINVAL);
+ return (EXTERROR(EINVAL, "too large nfds"));
if (nfds > nitems(stackfds))
kfds = mallocarray(nfds, sizeof(*kfds), M_TEMP, M_WAITOK);
else
@@ -1796,7 +1828,7 @@ selsocket(struct socket *so, int events, struct timeval *tvp, struct thread *td)
rtv = *tvp;
if (rtv.tv_sec < 0 || rtv.tv_usec < 0 ||
rtv.tv_usec >= 1000000)
- return (EINVAL);
+ return (EXTERROR(EINVAL, "invalid timeval"));
if (!timevalisset(&rtv))
asbt = 0;
else if (rtv.tv_sec <= INT32_MAX) {
@@ -2173,7 +2205,7 @@ kern_kcmp(struct thread *td, pid_t pid1, pid_t pid2, int type,
(uintptr_t)p2->p_vmspace);
break;
default:
- error = EINVAL;
+ error = EXTERROR(EINVAL, "unknown op");
break;
}
@@ -2277,6 +2309,12 @@ sys_exterrctl(struct thread *td, struct exterrctl_args *uap)
return (EINVAL);
td->td_pflags2 &= ~TDP2_UEXTERR;
return (0);
+ case EXTERRCTL_UD:
+ /*
+ * Important: this code must always return EINVAL and never any
+ * extended error, for testing purposes.
+ */
+ /* FALLTHROUGH */
default:
return (EINVAL);
}
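[The EINVAL-to-EXTERROR conversions above follow a per-file convention: define EXTERR_CATEGORY before including <sys/exterrvar.h>, then return EXTERROR(errno, "description"[, value]) instead of a bare errno so userspace can retrieve the annotation. A condensed kernel-side sketch of the convention; check_len is a hypothetical function and this is not compilable outside a kernel build:]

#define EXTERR_CATEGORY EXTERR_CAT_GENIO    /* must precede the include */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/exterrvar.h>

static int
check_len(size_t nbyte)
{
    if (nbyte > IOSIZE_MAX)
        return (EXTERROR(EINVAL, "length > iosize_max"));
    return (0);
}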
diff --git a/sys/kern/sys_pipe.c b/sys/kern/sys_pipe.c
index 9340779918a2..ed651da96b14 100644
--- a/sys/kern/sys_pipe.c
+++ b/sys/kern/sys_pipe.c
@@ -548,7 +548,7 @@ sys_pipe2(struct thread *td, struct pipe2_args *uap)
{
int error, fildes[2];
- if (uap->flags & ~(O_CLOEXEC | O_NONBLOCK))
+ if ((uap->flags & ~(O_CLOEXEC | O_CLOFORK | O_NONBLOCK)) != 0)
return (EINVAL);
error = kern_pipe(td, fildes, uap->flags, NULL, NULL);
if (error)
diff --git a/sys/kern/syscalls.c b/sys/kern/syscalls.c
index fa36cc824078..90a4f3a7dad8 100644
--- a/sys/kern/syscalls.c
+++ b/sys/kern/syscalls.c
@@ -598,4 +598,6 @@ const char *syscallnames[] = {
"fchroot", /* 590 = fchroot */
"setcred", /* 591 = setcred */
"exterrctl", /* 592 = exterrctl */
+ "inotify_add_watch_at", /* 593 = inotify_add_watch_at */
+ "inotify_rm_watch", /* 594 = inotify_rm_watch */
};
diff --git a/sys/kern/syscalls.master b/sys/kern/syscalls.master
index 08b557a7a540..90559fab6086 100644
--- a/sys/kern/syscalls.master
+++ b/sys/kern/syscalls.master
@@ -3349,11 +3349,26 @@
size_t size
);
}
-592 AUE_NULL STD {
+592 AUE_NULL STD|CAPENABLED {
int exterrctl(
u_int op,
u_int flags,
_In_reads_bytes_(4) void *ptr
);
}
+593 AUE_INOTIFY STD|CAPENABLED {
+ int inotify_add_watch_at(
+ int fd,
+ int dfd,
+ _In_z_ const char *path,
+ uint32_t mask
+ );
+ }
+594 AUE_INOTIFY STD|CAPENABLED {
+ int inotify_rm_watch(
+ int fd,
+ int wd
+ );
+ }
+
; vim: syntax=off
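[Together with the syscalls.c and systrace_args.c updates that follow, this wires up the two new native syscalls. A hedged usage sketch, assuming libc wrappers matching the syscalls.master signatures above and Linux-compatible IN_* constants from the new <sys/inotify.h>; error handling omitted for brevity:]

#include <sys/inotify.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
    int fd, dfd, wd;
    char buf[4096];

    fd = inotify_init1(IN_NONBLOCK);
    dfd = open("/tmp", O_DIRECTORY);
    /* Watch a path relative to dfd, capsicum-friendly by design. */
    wd = inotify_add_watch_at(fd, dfd, "testdir", IN_CREATE | IN_DELETE);
    (void)read(fd, buf, sizeof(buf));    /* packed struct inotify_event */
    inotify_rm_watch(fd, wd);
    close(dfd);
    close(fd);
    return (0);
}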
diff --git a/sys/kern/systrace_args.c b/sys/kern/systrace_args.c
index 15789d3eb5fa..90b21616a558 100644
--- a/sys/kern/systrace_args.c
+++ b/sys/kern/systrace_args.c
@@ -3482,6 +3482,24 @@ systrace_args(int sysnum, void *params, uint64_t *uarg, int *n_args)
*n_args = 3;
break;
}
+ /* inotify_add_watch_at */
+ case 593: {
+ struct inotify_add_watch_at_args *p = params;
+ iarg[a++] = p->fd; /* int */
+ iarg[a++] = p->dfd; /* int */
+ uarg[a++] = (intptr_t)p->path; /* const char * */
+ uarg[a++] = p->mask; /* uint32_t */
+ *n_args = 4;
+ break;
+ }
+ /* inotify_rm_watch */
+ case 594: {
+ struct inotify_rm_watch_args *p = params;
+ iarg[a++] = p->fd; /* int */
+ iarg[a++] = p->wd; /* int */
+ *n_args = 2;
+ break;
+ }
default:
*n_args = 0;
break;
@@ -9317,6 +9335,38 @@ systrace_entry_setargdesc(int sysnum, int ndx, char *desc, size_t descsz)
break;
};
break;
+ /* inotify_add_watch_at */
+ case 593:
+ switch (ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "userland const char *";
+ break;
+ case 3:
+ p = "uint32_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* inotify_rm_watch */
+ case 594:
+ switch (ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
default:
break;
};
@@ -11305,6 +11355,16 @@ systrace_return_setargdesc(int sysnum, int ndx, char *desc, size_t descsz)
if (ndx == 0 || ndx == 1)
p = "int";
break;
+ /* inotify_add_watch_at */
+ case 593:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* inotify_rm_watch */
+ case 594:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
default:
break;
};
diff --git a/sys/kern/sysv_msg.c b/sys/kern/sysv_msg.c
index 11141d197aec..a545a0a54c25 100644
--- a/sys/kern/sysv_msg.c
+++ b/sys/kern/sysv_msg.c
@@ -1724,7 +1724,7 @@ freebsd32_msgsys(struct thread *td, struct freebsd32_msgsys_args *uap)
return (sys_msgsys(td, (struct msgsys_args *)uap));
}
#else
- return (nosys(td, NULL));
+ return (kern_nosys(td, 0));
#endif
}
diff --git a/sys/kern/sysv_sem.c b/sys/kern/sysv_sem.c
index e399517010fc..a99e1a4de14e 100644
--- a/sys/kern/sysv_sem.c
+++ b/sys/kern/sysv_sem.c
@@ -1904,7 +1904,7 @@ freebsd32_semsys(struct thread *td, struct freebsd32_semsys_args *uap)
return (sys_semsys(td, (struct semsys_args *)uap));
}
#else
- return (nosys(td, NULL));
+ return (kern_nosys(td, 0));
#endif
}
diff --git a/sys/kern/sysv_shm.c b/sys/kern/sysv_shm.c
index 60e3fe92a4b7..8d1a469127c6 100644
--- a/sys/kern/sysv_shm.c
+++ b/sys/kern/sysv_shm.c
@@ -1474,7 +1474,7 @@ freebsd32_shmsys(struct thread *td, struct freebsd32_shmsys_args *uap)
return (EINVAL);
}
#else
- return (nosys(td, NULL));
+ return (kern_nosys(td, 0));
#endif
}
diff --git a/sys/kern/uipc_syscalls.c b/sys/kern/uipc_syscalls.c
index ad8485028987..133724ac76c5 100644
--- a/sys/kern/uipc_syscalls.c
+++ b/sys/kern/uipc_syscalls.c
@@ -151,6 +151,10 @@ kern_socket(struct thread *td, int domain, int type, int protocol)
type &= ~SOCK_CLOEXEC;
oflag |= O_CLOEXEC;
}
+ if ((type & SOCK_CLOFORK) != 0) {
+ type &= ~SOCK_CLOFORK;
+ oflag |= O_CLOFORK;
+ }
if ((type & SOCK_NONBLOCK) != 0) {
type &= ~SOCK_NONBLOCK;
fflag |= FNONBLOCK;
@@ -352,7 +356,8 @@ kern_accept4(struct thread *td, int s, struct sockaddr *sa, int flags,
goto done;
#endif
error = falloc_caps(td, &nfp, &fd,
- (flags & SOCK_CLOEXEC) ? O_CLOEXEC : 0, &fcaps);
+ ((flags & SOCK_CLOEXEC) != 0 ? O_CLOEXEC : 0) |
+ ((flags & SOCK_CLOFORK) != 0 ? O_CLOFORK : 0), &fcaps);
if (error != 0)
goto done;
SOCK_LOCK(head);
@@ -435,7 +440,7 @@ int
sys_accept4(struct thread *td, struct accept4_args *uap)
{
- if (uap->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
+ if ((uap->flags & ~(SOCK_CLOEXEC | SOCK_CLOFORK | SOCK_NONBLOCK)) != 0)
return (EINVAL);
return (accept1(td, uap->s, uap->name, uap->anamelen, uap->flags));
@@ -557,6 +562,10 @@ kern_socketpair(struct thread *td, int domain, int type, int protocol,
type &= ~SOCK_CLOEXEC;
oflag |= O_CLOEXEC;
}
+ if ((type & SOCK_CLOFORK) != 0) {
+ type &= ~SOCK_CLOFORK;
+ oflag |= O_CLOFORK;
+ }
if ((type & SOCK_NONBLOCK) != 0) {
type &= ~SOCK_NONBLOCK;
fflag |= FNONBLOCK;
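[SOCK_CLOFORK is handled exactly like SOCK_CLOEXEC throughout: masked out of the socket type and converted to an open-file flag. A hedged userspace sketch, assuming <sys/socket.h> from this patch series defines SOCK_CLOFORK:]

#include <sys/socket.h>

int
make_private_listener(void)
{
    /* The returned descriptor is closed in children created by fork(). */
    return (socket(AF_INET, SOCK_STREAM | SOCK_CLOFORK, 0));
}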
diff --git a/sys/kern/uipc_usrreq.c b/sys/kern/uipc_usrreq.c
index 72bd0246db11..0056dac65c7d 100644
--- a/sys/kern/uipc_usrreq.c
+++ b/sys/kern/uipc_usrreq.c
@@ -3463,7 +3463,8 @@ unp_externalize(struct mbuf *control, struct mbuf **controlp, int flags)
UNP_LINK_UNLOCK_ASSERT();
- fdflags = (flags & MSG_CMSG_CLOEXEC) ? O_CLOEXEC : 0;
+ fdflags = ((flags & MSG_CMSG_CLOEXEC) ? O_CLOEXEC : 0) |
+ ((flags & MSG_CMSG_CLOFORK) ? O_CLOFORK : 0);
error = 0;
if (controlp != NULL) /* controlp == NULL => free control messages */
diff --git a/sys/kern/vfs_aio.c b/sys/kern/vfs_aio.c
index 97dc854c9386..02973146068d 100644
--- a/sys/kern/vfs_aio.c
+++ b/sys/kern/vfs_aio.c
@@ -301,7 +301,7 @@ static TAILQ_HEAD(,kaiocb) aio_jobs; /* (c) Async job list */
static struct unrhdr *aiod_unr;
static void aio_biocleanup(struct bio *bp);
-void aio_init_aioinfo(struct proc *p);
+static int aio_init_aioinfo(struct proc *p);
static int aio_onceonly(void);
static int aio_free_entry(struct kaiocb *job);
static void aio_process_rw(struct kaiocb *job);
@@ -309,7 +309,7 @@ static void aio_process_sync(struct kaiocb *job);
static void aio_process_mlock(struct kaiocb *job);
static void aio_schedule_fsync(void *context, int pending);
static int aio_newproc(int *);
-int aio_aqueue(struct thread *td, struct aiocb *ujob,
+static int aio_aqueue(struct thread *td, struct aiocb *ujob,
struct aioliojob *lio, int type, struct aiocb_ops *ops);
static int aio_queue_file(struct file *fp, struct kaiocb *job);
static void aio_biowakeup(struct bio *bp);
@@ -422,10 +422,11 @@ aio_onceonly(void)
* Init the per-process aioinfo structure. The aioinfo limits are set
* per-process for user limit (resource) management.
*/
-void
+static int
aio_init_aioinfo(struct proc *p)
{
struct kaioinfo *ki;
+ int error;
ki = uma_zalloc(kaio_zone, M_WAITOK);
mtx_init(&ki->kaio_mtx, "aiomtx", NULL, MTX_DEF | MTX_NEW);
@@ -451,8 +452,20 @@ aio_init_aioinfo(struct proc *p)
uma_zfree(kaio_zone, ki);
}
- while (num_aio_procs < MIN(target_aio_procs, max_aio_procs))
- aio_newproc(NULL);
+ error = 0;
+ while (num_aio_procs < MIN(target_aio_procs, max_aio_procs)) {
+ error = aio_newproc(NULL);
+ if (error != 0) {
+ /*
+ * At least one worker is enough to have AIO
+ * functional. Clear error in that case.
+ */
+ if (num_aio_procs > 0)
+ error = 0;
+ break;
+ }
+ }
+ return (error);
}
static int
@@ -1476,7 +1489,7 @@ static struct aiocb_ops aiocb_ops_osigevent = {
* Queue a new AIO request. Choosing either the threaded or direct bio VCHR
* technique is done in this code.
*/
-int
+static int
aio_aqueue(struct thread *td, struct aiocb *ujob, struct aioliojob *lj,
int type, struct aiocb_ops *ops)
{
@@ -1490,8 +1503,11 @@ aio_aqueue(struct thread *td, struct aiocb *ujob, struct aioliojob *lj,
int fd, kqfd;
u_short evflags;
- if (p->p_aioinfo == NULL)
- aio_init_aioinfo(p);
+ if (p->p_aioinfo == NULL) {
+ error = aio_init_aioinfo(p);
+ if (error != 0)
+ goto err1;
+ }
ki = p->p_aioinfo;
@@ -2213,8 +2229,11 @@ kern_lio_listio(struct thread *td, int mode, struct aiocb * const *uacb_list,
if (nent < 0 || nent > max_aio_queue_per_proc)
return (EINVAL);
- if (p->p_aioinfo == NULL)
- aio_init_aioinfo(p);
+ if (p->p_aioinfo == NULL) {
+ error = aio_init_aioinfo(p);
+ if (error != 0)
+ return (error);
+ }
ki = p->p_aioinfo;
@@ -2503,8 +2522,11 @@ kern_aio_waitcomplete(struct thread *td, struct aiocb **ujobp,
timo = tvtohz(&atv);
}
- if (p->p_aioinfo == NULL)
- aio_init_aioinfo(p);
+ if (p->p_aioinfo == NULL) {
+ error = aio_init_aioinfo(p);
+ if (error != 0)
+ return (error);
+ }
ki = p->p_aioinfo;
error = 0;
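[aio_init_aioinfo() now propagates worker-creation failure to its callers, but deliberately tolerates partial success: the error is cleared if at least one AIO daemon already exists. The same tolerance pattern in standalone form, with all names invented for illustration:]

#include <errno.h>
#include <stdio.h>

static int nworkers;

static int
start_worker(void)
{
    if (nworkers == 2)
        return (EAGAIN);    /* simulate resource exhaustion */
    nworkers++;
    return (0);
}

static int
start_workers(int target)
{
    int error = 0;

    while (nworkers < target) {
        error = start_worker();
        if (error != 0) {
            /* Tolerate partial success: one worker suffices. */
            if (nworkers > 0)
                error = 0;
            break;
        }
    }
    return (error);
}

int
main(void)
{
    /* prints "0 2": failure after two workers is forgiven */
    printf("%d %d\n", start_workers(4), nworkers);
    return (0);
}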
diff --git a/sys/kern/vfs_cache.c b/sys/kern/vfs_cache.c
index 883beaf6d1da..89c1d779f04c 100644
--- a/sys/kern/vfs_cache.c
+++ b/sys/kern/vfs_cache.c
@@ -41,6 +41,7 @@
#include <sys/counter.h>
#include <sys/filedesc.h>
#include <sys/fnv_hash.h>
+#include <sys/inotify.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
@@ -331,7 +332,8 @@ SDT_PROBE_DEFINE2(vfs, namecache, evict_negative, done, "struct vnode *",
"char *");
SDT_PROBE_DEFINE1(vfs, namecache, symlink, alloc__fail, "size_t");
-SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool");
+SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata *", "int",
+ "enum cache_fpl_status");
SDT_PROBE_DECLARE(vfs, namei, lookup, entry);
SDT_PROBE_DECLARE(vfs, namei, lookup, return);
@@ -2629,6 +2631,14 @@ cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
atomic_store_ptr(&dvp->v_cache_dd, ncp);
} else if (vp != NULL) {
/*
+ * Take the slow path in INOTIFY(). This flag will be lazily
+ * cleared by cache_vop_inotify() once all directories referring
+ * to vp are unwatched.
+ */
+ if (__predict_false((vn_irflag_read(dvp) & VIRF_INOTIFY) != 0))
+ vn_irflag_set_cond(vp, VIRF_INOTIFY_PARENT);
+
+ /*
* For this case, the cache entry maps both the
* directory name in it and the name ".." for the
* directory's parent.
@@ -4008,6 +4018,56 @@ out:
return (error);
}
+void
+cache_vop_inotify(struct vnode *vp, int event, uint32_t cookie)
+{
+ struct mtx *vlp;
+ struct namecache *ncp;
+ int isdir;
+ bool logged, self;
+
+ isdir = vp->v_type == VDIR ? IN_ISDIR : 0;
+ self = (vn_irflag_read(vp) & VIRF_INOTIFY) != 0 &&
+ (vp->v_type != VDIR || (event & ~_IN_DIR_EVENTS) != 0);
+
+ if (self) {
+ int selfevent;
+
+ if (event == _IN_ATTRIB_LINKCOUNT)
+ selfevent = IN_ATTRIB;
+ else
+ selfevent = event;
+ inotify_log(vp, NULL, 0, selfevent | isdir, cookie);
+ }
+ if ((event & IN_ALL_EVENTS) == 0)
+ return;
+
+ logged = false;
+ vlp = VP2VNODELOCK(vp);
+ mtx_lock(vlp);
+ TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) {
+ if ((ncp->nc_flag & NCF_ISDOTDOT) != 0)
+ continue;
+ if ((vn_irflag_read(ncp->nc_dvp) & VIRF_INOTIFY) != 0) {
+ /*
+ * XXX-MJ if the vnode has two links in the same
+ * dir, we'll log the same event twice.
+ */
+ inotify_log(ncp->nc_dvp, ncp->nc_name, ncp->nc_nlen,
+ event | isdir, cookie);
+ logged = true;
+ }
+ }
+ if (!logged && (vn_irflag_read(vp) & VIRF_INOTIFY_PARENT) != 0) {
+ /*
+ * We didn't find a watched directory that contains this vnode,
+ * so stop calling VOP_INOTIFY for operations on the vnode.
+ */
+ vn_irflag_unset(vp, VIRF_INOTIFY_PARENT);
+ }
+ mtx_unlock(vlp);
+}
+
#ifdef DDB
static void
db_print_vpath(struct vnode *vp)
@@ -6361,15 +6421,11 @@ out:
cache_fpl_smr_assert_not_entered(&fpl);
cache_fpl_assert_status(&fpl);
*status = fpl.status;
- if (SDT_PROBES_ENABLED()) {
- SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
- if (fpl.status == CACHE_FPL_STATUS_HANDLED)
- SDT_PROBE4(vfs, namei, lookup, return, error, ndp->ni_vp, true,
- ndp);
- }
-
+ SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
if (__predict_true(fpl.status == CACHE_FPL_STATUS_HANDLED)) {
MPASS(error != CACHE_FPL_FAILED);
+ SDT_PROBE4(vfs, namei, lookup, return, error, ndp->ni_vp, true,
+ ndp);
if (error != 0) {
cache_fpl_cleanup_cnp(fpl.cnp);
MPASS(fpl.dvp == NULL);
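[cache_vop_inotify() pairs with the cache_enter_time() hunk above: VIRF_INOTIFY_PARENT is a conservatively set hint, cleared only once a scan of the namecache proves no watched directory still references the vnode. An abstract, standalone rendition of that lazy-hint scheme, with invented names:]

#include <stdbool.h>
#include <stdio.h>

#define HINT_PARENT_WATCHED 0x1

struct object {
    unsigned flags;
};

static bool
scan_parents_watched(struct object *o)
{
    (void)o;
    return (false);    /* stand-in for walking all parent links */
}

static void
object_event(struct object *o)
{
    if ((o->flags & HINT_PARENT_WATCHED) == 0)
        return;        /* fast path: nothing to notify */
    if (!scan_parents_watched(o)) {
        /* Hint is stale: clear it so later events skip the scan. */
        o->flags &= ~HINT_PARENT_WATCHED;
    }
}

int
main(void)
{
    struct object o = { .flags = HINT_PARENT_WATCHED };

    object_event(&o);
    printf("flags after event: %#x\n", o.flags);    /* prints 0 */
    return (0);
}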
diff --git a/sys/kern/vfs_default.c b/sys/kern/vfs_default.c
index be49c0887609..fd6202a1424c 100644
--- a/sys/kern/vfs_default.c
+++ b/sys/kern/vfs_default.c
@@ -39,6 +39,7 @@
#include <sys/conf.h>
#include <sys/event.h>
#include <sys/filio.h>
+#include <sys/inotify.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/lock.h>
@@ -119,6 +120,8 @@ struct vop_vector default_vnodeops = {
.vop_getwritemount = vop_stdgetwritemount,
.vop_inactive = VOP_NULL,
.vop_need_inactive = vop_stdneed_inactive,
+ .vop_inotify = vop_stdinotify,
+ .vop_inotify_add_watch = vop_stdinotify_add_watch,
.vop_ioctl = vop_stdioctl,
.vop_kqfilter = vop_stdkqfilter,
.vop_islocked = vop_stdislocked,
@@ -453,6 +456,7 @@ vop_stdpathconf(struct vop_pathconf_args *ap)
case _PC_MAC_PRESENT:
case _PC_NAMEDATTR_ENABLED:
case _PC_HAS_NAMEDATTR:
+ case _PC_HAS_HIDDENSYSTEM:
*ap->a_retval = 0;
return (0);
default:
@@ -1306,6 +1310,20 @@ vop_stdneed_inactive(struct vop_need_inactive_args *ap)
}
int
+vop_stdinotify(struct vop_inotify_args *ap)
+{
+ vn_inotify(ap->a_vp, ap->a_dvp, ap->a_cnp, ap->a_event, ap->a_cookie);
+ return (0);
+}
+
+int
+vop_stdinotify_add_watch(struct vop_inotify_add_watch_args *ap)
+{
+ return (vn_inotify_add_watch(ap->a_vp, ap->a_sc, ap->a_mask,
+ ap->a_wdp, ap->a_td));
+}
+
+int
vop_stdioctl(struct vop_ioctl_args *ap)
{
struct vnode *vp;
diff --git a/sys/kern/vfs_inotify.c b/sys/kern/vfs_inotify.c
new file mode 100644
index 000000000000..d3cd0d1f9832
--- /dev/null
+++ b/sys/kern/vfs_inotify.c
@@ -0,0 +1,1011 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2025 Klara, Inc.
+ */
+
+#include "opt_ktrace.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/caprights.h>
+#include <sys/counter.h>
+#include <sys/dirent.h>
+#define EXTERR_CATEGORY EXTERR_CAT_INOTIFY
+#include <sys/exterrvar.h>
+#include <sys/fcntl.h>
+#include <sys/file.h>
+#include <sys/filio.h>
+#include <sys/inotify.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/ktrace.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/namei.h>
+#include <sys/poll.h>
+#include <sys/proc.h>
+#include <sys/queue.h>
+#include <sys/resourcevar.h>
+#include <sys/selinfo.h>
+#include <sys/stat.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysctl.h>
+#include <sys/sysent.h>
+#include <sys/syslimits.h>
+#include <sys/sysproto.h>
+#include <sys/tree.h>
+#include <sys/user.h>
+#include <sys/vnode.h>
+
+uint32_t inotify_rename_cookie;
+
+static SYSCTL_NODE(_vfs, OID_AUTO, inotify, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
+ "inotify configuration");
+
+static int inotify_max_queued_events = 16384;
+SYSCTL_INT(_vfs_inotify, OID_AUTO, max_queued_events, CTLFLAG_RWTUN,
+ &inotify_max_queued_events, 0,
+ "Maximum number of events to queue on an inotify descriptor");
+
+static int inotify_max_user_instances = 256;
+SYSCTL_INT(_vfs_inotify, OID_AUTO, max_user_instances, CTLFLAG_RWTUN,
+ &inotify_max_user_instances, 0,
+ "Maximum number of inotify descriptors per user");
+
+static int inotify_max_user_watches;
+SYSCTL_INT(_vfs_inotify, OID_AUTO, max_user_watches, CTLFLAG_RWTUN,
+ &inotify_max_user_watches, 0,
+ "Maximum number of inotify watches per user");
+
+static int inotify_max_watches;
+SYSCTL_INT(_vfs_inotify, OID_AUTO, max_watches, CTLFLAG_RWTUN,
+ &inotify_max_watches, 0,
+ "Maximum number of inotify watches system-wide");
+
+static int inotify_watches;
+SYSCTL_INT(_vfs_inotify, OID_AUTO, watches, CTLFLAG_RD,
+ &inotify_watches, 0,
+ "Total number of inotify watches currently in use");
+
+static int inotify_coalesce = 1;
+SYSCTL_INT(_vfs_inotify, OID_AUTO, coalesce, CTLFLAG_RWTUN,
+ &inotify_coalesce, 0,
+ "Coalesce inotify events when possible");
+
+static COUNTER_U64_DEFINE_EARLY(inotify_event_drops);
+SYSCTL_COUNTER_U64(_vfs_inotify, OID_AUTO, event_drops, CTLFLAG_RD,
+ &inotify_event_drops,
+ "Number of inotify events dropped due to limits or allocation failures");
+
+static fo_rdwr_t inotify_read;
+static fo_ioctl_t inotify_ioctl;
+static fo_poll_t inotify_poll;
+static fo_kqfilter_t inotify_kqfilter;
+static fo_stat_t inotify_stat;
+static fo_close_t inotify_close;
+static fo_fill_kinfo_t inotify_fill_kinfo;
+
+static const struct fileops inotifyfdops = {
+ .fo_read = inotify_read,
+ .fo_write = invfo_rdwr,
+ .fo_truncate = invfo_truncate,
+ .fo_ioctl = inotify_ioctl,
+ .fo_poll = inotify_poll,
+ .fo_kqfilter = inotify_kqfilter,
+ .fo_stat = inotify_stat,
+ .fo_close = inotify_close,
+ .fo_chmod = invfo_chmod,
+ .fo_chown = invfo_chown,
+ .fo_sendfile = invfo_sendfile,
+ .fo_fill_kinfo = inotify_fill_kinfo,
+ .fo_cmp = file_kcmp_generic,
+ .fo_flags = DFLAG_PASSABLE,
+};
+
+static void filt_inotifydetach(struct knote *kn);
+static int filt_inotifyevent(struct knote *kn, long hint);
+
+static const struct filterops inotify_rfiltops = {
+ .f_isfd = 1,
+ .f_detach = filt_inotifydetach,
+ .f_event = filt_inotifyevent,
+};
+
+static MALLOC_DEFINE(M_INOTIFY, "inotify", "inotify data structures");
+
+struct inotify_record {
+ STAILQ_ENTRY(inotify_record) link;
+ struct inotify_event ev;
+};
+
+static uint64_t inotify_ino = 1;
+
+/*
+ * On LP64 systems this occupies 64 bytes, so we don't get internal
+ * fragmentation by allocating watches with malloc(9). If the size changes,
+ * consider using a UMA zone to improve memory efficiency.
+ */
+struct inotify_watch {
+ struct inotify_softc *sc; /* back-pointer */
+ int wd; /* unique ID */
+ uint32_t mask; /* event mask */
+ struct vnode *vp; /* vnode being watched, refed */
+ RB_ENTRY(inotify_watch) ilink; /* inotify linkage */
+ TAILQ_ENTRY(inotify_watch) vlink; /* vnode linkage */
+};
+
+static void
+inotify_init(void *arg __unused)
+{
+ /* Don't let a user hold too many vnodes. */
+ inotify_max_user_watches = desiredvnodes / 3;
+ /* Don't let the system hold too many vnodes. */
+ inotify_max_watches = desiredvnodes / 2;
+}
+SYSINIT(inotify, SI_SUB_VFS, SI_ORDER_ANY, inotify_init, NULL);
+
+static int
+inotify_watch_cmp(const struct inotify_watch *a,
+ const struct inotify_watch *b)
+{
+ if (a->wd < b->wd)
+ return (-1);
+ else if (a->wd > b->wd)
+ return (1);
+ else
+ return (0);
+}
+RB_HEAD(inotify_watch_tree, inotify_watch);
+RB_GENERATE_STATIC(inotify_watch_tree, inotify_watch, ilink, inotify_watch_cmp);
+
+struct inotify_softc {
+ struct mtx lock; /* serialize all softc writes */
+ STAILQ_HEAD(, inotify_record) pending; /* events waiting to be read */
+ struct inotify_record overflow; /* preallocated record */
+ int nextwatch; /* next watch ID to try */
+ int npending; /* number of pending events */
+ size_t nbpending; /* bytes available to read */
+ uint64_t ino; /* unique identifier */
+ struct inotify_watch_tree watches; /* active watches */
+ struct selinfo sel; /* select/poll/kevent info */
+ struct ucred *cred; /* credential ref */
+};
+
+static struct inotify_record *
+inotify_dequeue(struct inotify_softc *sc)
+{
+ struct inotify_record *rec;
+
+ mtx_assert(&sc->lock, MA_OWNED);
+ KASSERT(!STAILQ_EMPTY(&sc->pending),
+ ("%s: queue for %p is empty", __func__, sc));
+
+ rec = STAILQ_FIRST(&sc->pending);
+ STAILQ_REMOVE_HEAD(&sc->pending, link);
+ sc->npending--;
+ sc->nbpending -= sizeof(rec->ev) + rec->ev.len;
+ return (rec);
+}
+
+static void
+inotify_enqueue(struct inotify_softc *sc, struct inotify_record *rec, bool head)
+{
+ mtx_assert(&sc->lock, MA_OWNED);
+
+ if (head)
+ STAILQ_INSERT_HEAD(&sc->pending, rec, link);
+ else
+ STAILQ_INSERT_TAIL(&sc->pending, rec, link);
+ sc->npending++;
+ sc->nbpending += sizeof(rec->ev) + rec->ev.len;
+}
+
+static int
+inotify_read(struct file *fp, struct uio *uio, struct ucred *cred, int flags,
+ struct thread *td)
+{
+ struct inotify_softc *sc;
+ struct inotify_record *rec;
+ int error;
+ bool first;
+
+ sc = fp->f_data;
+ error = 0;
+
+ mtx_lock(&sc->lock);
+ while (STAILQ_EMPTY(&sc->pending)) {
+ if ((flags & IO_NDELAY) != 0 || (fp->f_flag & FNONBLOCK) != 0) {
+ mtx_unlock(&sc->lock);
+ return (EWOULDBLOCK);
+ }
+ error = msleep(&sc->pending, &sc->lock, PCATCH, "inotify", 0);
+ if (error != 0) {
+ mtx_unlock(&sc->lock);
+ return (error);
+ }
+ }
+ for (first = true; !STAILQ_EMPTY(&sc->pending); first = false) {
+ size_t len;
+
+ rec = inotify_dequeue(sc);
+ len = sizeof(rec->ev) + rec->ev.len;
+ if (uio->uio_resid < (ssize_t)len) {
+ inotify_enqueue(sc, rec, true);
+ if (first) {
+ error = EXTERROR(EINVAL,
+ "read buffer is too small");
+ }
+ break;
+ }
+ mtx_unlock(&sc->lock);
+ error = uiomove(&rec->ev, len, uio);
+#ifdef KTRACE
+ if (error == 0 && KTRPOINT(td, KTR_STRUCT))
+ ktrstruct("inotify", &rec->ev, len);
+#endif
+ mtx_lock(&sc->lock);
+ if (error != 0) {
+ inotify_enqueue(sc, rec, true);
+ mtx_unlock(&sc->lock);
+ return (error);
+ }
+ if (rec == &sc->overflow) {
+ /*
+ * Signal to inotify_queue_record() that the overflow
+ * record can be reused.
+ */
+ memset(rec, 0, sizeof(*rec));
+ } else {
+ free(rec, M_INOTIFY);
+ }
+ }
+ mtx_unlock(&sc->lock);
+ return (error);
+}
+
+static int
+inotify_ioctl(struct file *fp, u_long com, void *data, struct ucred *cred,
+ struct thread *td)
+{
+ struct inotify_softc *sc;
+
+ sc = fp->f_data;
+
+ switch (com) {
+ case FIONREAD:
+ *(int *)data = (int)sc->nbpending;
+ return (0);
+ case FIONBIO:
+ case FIOASYNC:
+ return (0);
+ default:
+ return (ENOTTY);
+ }
+
+ return (0);
+}
+
+static int
+inotify_poll(struct file *fp, int events, struct ucred *cred, struct thread *td)
+{
+ struct inotify_softc *sc;
+ int revents;
+
+ sc = fp->f_data;
+ revents = 0;
+
+ mtx_lock(&sc->lock);
+ if ((events & (POLLIN | POLLRDNORM)) != 0 && sc->npending > 0)
+ revents |= events & (POLLIN | POLLRDNORM);
+ else
+ selrecord(td, &sc->sel);
+ mtx_unlock(&sc->lock);
+ return (revents);
+}
+
+static void
+filt_inotifydetach(struct knote *kn)
+{
+ struct inotify_softc *sc;
+
+ sc = kn->kn_hook;
+ knlist_remove(&sc->sel.si_note, kn, 0);
+}
+
+static int
+filt_inotifyevent(struct knote *kn, long hint)
+{
+ struct inotify_softc *sc;
+
+ sc = kn->kn_hook;
+ mtx_assert(&sc->lock, MA_OWNED);
+ kn->kn_data = sc->nbpending;
+ return (kn->kn_data > 0);
+}
+
+static int
+inotify_kqfilter(struct file *fp, struct knote *kn)
+{
+ struct inotify_softc *sc;
+
+ if (kn->kn_filter != EVFILT_READ)
+ return (EINVAL);
+ sc = fp->f_data;
+ kn->kn_fop = &inotify_rfiltops;
+ kn->kn_hook = sc;
+ knlist_add(&sc->sel.si_note, kn, 0);
+ return (0);
+}
+
+static int
+inotify_stat(struct file *fp, struct stat *sb, struct ucred *cred)
+{
+ struct inotify_softc *sc;
+
+ sc = fp->f_data;
+
+ memset(sb, 0, sizeof(*sb));
+ sb->st_mode = S_IFREG | S_IRUSR;
+ sb->st_blksize = sizeof(struct inotify_event) + _IN_NAMESIZE(NAME_MAX);
+ mtx_lock(&sc->lock);
+ sb->st_size = sc->nbpending;
+ sb->st_blocks = sc->npending;
+ sb->st_uid = sc->cred->cr_ruid;
+ sb->st_gid = sc->cred->cr_rgid;
+ sb->st_ino = sc->ino;
+ mtx_unlock(&sc->lock);
+ return (0);
+}
+
+static void
+inotify_unlink_watch_locked(struct inotify_softc *sc, struct inotify_watch *watch)
+{
+ struct vnode *vp;
+
+ vp = watch->vp;
+ mtx_assert(&vp->v_pollinfo->vpi_lock, MA_OWNED);
+
+ atomic_subtract_int(&inotify_watches, 1);
+ (void)chginotifywatchcnt(sc->cred->cr_ruidinfo, -1, 0);
+
+ TAILQ_REMOVE(&vp->v_pollinfo->vpi_inotify, watch, vlink);
+ if (TAILQ_EMPTY(&vp->v_pollinfo->vpi_inotify))
+ vn_irflag_unset(vp, VIRF_INOTIFY);
+}
+
+/*
+ * Assumes that the watch has already been removed from its softc.
+ */
+static void
+inotify_remove_watch(struct inotify_watch *watch)
+{
+ struct inotify_softc *sc;
+ struct vnode *vp;
+
+ sc = watch->sc;
+
+ vp = watch->vp;
+ mtx_lock(&vp->v_pollinfo->vpi_lock);
+ inotify_unlink_watch_locked(sc, watch);
+ mtx_unlock(&vp->v_pollinfo->vpi_lock);
+
+ vrele(vp);
+ free(watch, M_INOTIFY);
+}
+
+static int
+inotify_close(struct file *fp, struct thread *td)
+{
+ struct inotify_softc *sc;
+ struct inotify_record *rec;
+ struct inotify_watch *watch;
+
+ sc = fp->f_data;
+
+ mtx_lock(&sc->lock);
+ (void)chginotifycnt(sc->cred->cr_ruidinfo, -1, 0);
+ while ((watch = RB_MIN(inotify_watch_tree, &sc->watches)) != NULL) {
+ RB_REMOVE(inotify_watch_tree, &sc->watches, watch);
+ mtx_unlock(&sc->lock);
+ inotify_remove_watch(watch);
+ mtx_lock(&sc->lock);
+ }
+ while (!STAILQ_EMPTY(&sc->pending)) {
+ rec = inotify_dequeue(sc);
+ if (rec != &sc->overflow)
+ free(rec, M_INOTIFY);
+ }
+ mtx_unlock(&sc->lock);
+ seldrain(&sc->sel);
+ knlist_destroy(&sc->sel.si_note);
+ mtx_destroy(&sc->lock);
+ crfree(sc->cred);
+ free(sc, M_INOTIFY);
+ return (0);
+}
+
+static int
+inotify_fill_kinfo(struct file *fp, struct kinfo_file *kif,
+ struct filedesc *fdp)
+{
+ struct inotify_softc *sc;
+
+ sc = fp->f_data;
+
+ kif->kf_type = KF_TYPE_INOTIFY;
+ kif->kf_un.kf_inotify.kf_inotify_npending = sc->npending;
+ kif->kf_un.kf_inotify.kf_inotify_nbpending = sc->nbpending;
+ return (0);
+}
+
+int
+inotify_create_file(struct thread *td, struct file *fp, int flags, int *fflagsp)
+{
+ struct inotify_softc *sc;
+ int fflags;
+
+ if ((flags & ~(IN_NONBLOCK | IN_CLOEXEC)) != 0)
+ return (EINVAL);
+
+ if (!chginotifycnt(td->td_ucred->cr_ruidinfo, 1,
+ inotify_max_user_instances))
+ return (EMFILE);
+
+ sc = malloc(sizeof(*sc), M_INOTIFY, M_WAITOK | M_ZERO);
+ sc->nextwatch = 1; /* Required for compatibility. */
+ STAILQ_INIT(&sc->pending);
+ RB_INIT(&sc->watches);
+ mtx_init(&sc->lock, "inotify", NULL, MTX_DEF);
+ knlist_init_mtx(&sc->sel.si_note, &sc->lock);
+ sc->cred = crhold(td->td_ucred);
+ sc->ino = atomic_fetchadd_64(&inotify_ino, 1);
+
+ fflags = FREAD;
+ if ((flags & IN_NONBLOCK) != 0)
+ fflags |= FNONBLOCK;
+ if ((flags & IN_CLOEXEC) != 0)
+ *fflagsp |= O_CLOEXEC;
+ finit(fp, fflags, DTYPE_INOTIFY, sc, &inotifyfdops);
+
+ return (0);
+}
+
+static struct inotify_record *
+inotify_alloc_record(uint32_t wd, const char *name, size_t namelen, int event,
+ uint32_t cookie, int waitok)
+{
+ struct inotify_event *evp;
+ struct inotify_record *rec;
+
+ rec = malloc(sizeof(*rec) + _IN_NAMESIZE(namelen), M_INOTIFY,
+ waitok | M_ZERO);
+ if (rec == NULL)
+ return (NULL);
+ evp = &rec->ev;
+ evp->wd = wd;
+ evp->mask = event;
+ evp->cookie = cookie;
+ evp->len = _IN_NAMESIZE(namelen);
+ if (name != NULL)
+ memcpy(evp->name, name, namelen);
+ return (rec);
+}
+
+static bool
+inotify_can_coalesce(struct inotify_softc *sc, struct inotify_event *evp)
+{
+ struct inotify_record *prev;
+
+ mtx_assert(&sc->lock, MA_OWNED);
+
+ prev = STAILQ_LAST(&sc->pending, inotify_record, link);
+ return (prev != NULL && prev->ev.mask == evp->mask &&
+ prev->ev.wd == evp->wd && prev->ev.cookie == evp->cookie &&
+ prev->ev.len == evp->len &&
+ memcmp(prev->ev.name, evp->name, evp->len) == 0);
+}
+
+static void
+inotify_overflow_event(struct inotify_event *evp)
+{
+ evp->mask = IN_Q_OVERFLOW;
+ evp->wd = -1;
+ evp->cookie = 0;
+ evp->len = 0;
+}
+
+/*
+ * Put an event record on the queue for an inotify descriptor. Return false if
+ * the record was not enqueued for some reason, true otherwise.
+ */
+static bool
+inotify_queue_record(struct inotify_softc *sc, struct inotify_record *rec)
+{
+ struct inotify_event *evp;
+
+ mtx_assert(&sc->lock, MA_OWNED);
+
+ evp = &rec->ev;
+ if (__predict_false(rec == &sc->overflow)) {
+ /*
+ * Is the overflow record already in the queue? If so, there's
+ * not much else we can do: we're here because a kernel memory
+ * shortage prevented new record allocations.
+ */
+ counter_u64_add(inotify_event_drops, 1);
+ if (evp->mask == IN_Q_OVERFLOW)
+ return (false);
+ inotify_overflow_event(evp);
+ } else {
+ /* Try to coalesce duplicate events. */
+ if (inotify_coalesce && inotify_can_coalesce(sc, evp))
+ return (false);
+
+ /*
+ * Would this one overflow the queue? If so, convert it to an
+ * overflow event and try again to coalesce.
+ */
+ if (sc->npending >= inotify_max_queued_events) {
+ counter_u64_add(inotify_event_drops, 1);
+ inotify_overflow_event(evp);
+ if (inotify_can_coalesce(sc, evp))
+ return (false);
+ }
+ }
+ inotify_enqueue(sc, rec, false);
+ selwakeup(&sc->sel);
+ KNOTE_LOCKED(&sc->sel.si_note, 0);
+ wakeup(&sc->pending);
+ return (true);
+}
+
+static int
+inotify_log_one(struct inotify_watch *watch, const char *name, size_t namelen,
+ int event, uint32_t cookie)
+{
+ struct inotify_watch key;
+ struct inotify_softc *sc;
+ struct inotify_record *rec;
+ int relecount;
+ bool allocfail;
+
+ relecount = 0;
+
+ sc = watch->sc;
+ rec = inotify_alloc_record(watch->wd, name, namelen, event, cookie,
+ M_NOWAIT);
+ if (rec == NULL) {
+ rec = &sc->overflow;
+ allocfail = true;
+ } else {
+ allocfail = false;
+ }
+
+ mtx_lock(&sc->lock);
+ if (!inotify_queue_record(sc, rec) && rec != &sc->overflow)
+ free(rec, M_INOTIFY);
+ if ((watch->mask & IN_ONESHOT) != 0 ||
+ (event & (IN_DELETE_SELF | IN_UNMOUNT)) != 0) {
+ if (!allocfail) {
+ rec = inotify_alloc_record(watch->wd, NULL, 0,
+ IN_IGNORED, 0, M_NOWAIT);
+ if (rec == NULL)
+ rec = &sc->overflow;
+ if (!inotify_queue_record(sc, rec) &&
+ rec != &sc->overflow)
+ free(rec, M_INOTIFY);
+ }
+
+ /*
+ * Remove the watch, taking care to handle races with
+ * inotify_close().
+ */
+ key.wd = watch->wd;
+ if (RB_FIND(inotify_watch_tree, &sc->watches, &key) != NULL) {
+ RB_REMOVE(inotify_watch_tree, &sc->watches, watch);
+ inotify_unlink_watch_locked(sc, watch);
+ free(watch, M_INOTIFY);
+
+			/* Defer vrele() until locks are dropped. */
+ relecount++;
+ }
+ }
+ mtx_unlock(&sc->lock);
+ return (relecount);
+}
+
+void
+inotify_log(struct vnode *vp, const char *name, size_t namelen, int event,
+ uint32_t cookie)
+{
+ struct inotify_watch *watch, *tmp;
+ int relecount;
+
+ KASSERT((event & ~(IN_ALL_EVENTS | IN_ISDIR | IN_UNMOUNT)) == 0,
+ ("inotify_log: invalid event %#x", event));
+
+ relecount = 0;
+ mtx_lock(&vp->v_pollinfo->vpi_lock);
+ TAILQ_FOREACH_SAFE(watch, &vp->v_pollinfo->vpi_inotify, vlink, tmp) {
+ KASSERT(watch->vp == vp,
+ ("inotify_log: watch %p vp != vp", watch));
+ if ((watch->mask & event) != 0 || event == IN_UNMOUNT) {
+ relecount += inotify_log_one(watch, name, namelen, event,
+ cookie);
+ }
+ }
+ mtx_unlock(&vp->v_pollinfo->vpi_lock);
+
+ for (int i = 0; i < relecount; i++)
+ vrele(vp);
+}
+
+/*
+ * An inotify event occurred on a watched vnode.
+ */
+void
+vn_inotify(struct vnode *vp, struct vnode *dvp, struct componentname *cnp,
+ int event, uint32_t cookie)
+{
+ int isdir;
+
+ VNPASS(vp->v_holdcnt > 0, vp);
+
+ isdir = vp->v_type == VDIR ? IN_ISDIR : 0;
+
+ if (dvp != NULL) {
+ VNPASS(dvp->v_holdcnt > 0, dvp);
+
+ /*
+ * Should we log an event for the vnode itself?
+ */
+ if ((vn_irflag_read(vp) & VIRF_INOTIFY) != 0) {
+ int selfevent;
+
+ switch (event) {
+ case _IN_MOVE_DELETE:
+ case IN_DELETE:
+ /*
+ * IN_DELETE_SELF is only generated when the
+ * last hard link of a file is removed.
+ */
+ selfevent = IN_DELETE_SELF;
+ if (vp->v_type != VDIR) {
+ struct vattr va;
+ int error;
+
+ error = VOP_GETATTR(vp, &va,
+ cnp->cn_cred);
+ if (error == 0 && va.va_nlink != 0)
+ selfevent = 0;
+ }
+ break;
+ case IN_MOVED_FROM:
+ cookie = 0;
+ selfevent = IN_MOVE_SELF;
+ break;
+ case _IN_ATTRIB_LINKCOUNT:
+ selfevent = IN_ATTRIB;
+ break;
+ default:
+ selfevent = event;
+ break;
+ }
+
+ if ((selfevent & ~_IN_DIR_EVENTS) != 0) {
+ inotify_log(vp, NULL, 0, selfevent | isdir,
+ cookie);
+ }
+ }
+
+ /*
+ * Something is watching the directory through which this vnode
+ * was referenced, so we may need to log the event.
+ */
+ if ((event & IN_ALL_EVENTS) != 0 &&
+ (vn_irflag_read(dvp) & VIRF_INOTIFY) != 0) {
+ inotify_log(dvp, cnp->cn_nameptr,
+ cnp->cn_namelen, event | isdir, cookie);
+ }
+ } else {
+ /*
+ * We don't know which watched directory might contain the
+ * vnode, so we have to fall back to searching the name cache.
+ */
+ cache_vop_inotify(vp, event, cookie);
+ }
+}
+
+int
+vn_inotify_add_watch(struct vnode *vp, struct inotify_softc *sc, uint32_t mask,
+ uint32_t *wdp, struct thread *td)
+{
+ struct inotify_watch *watch, *watch1;
+ uint32_t wd;
+
+ /*
+ * If this is a directory, make sure all of its entries are present in
+ * the name cache so that we're able to look them up if an event occurs.
+ * The persistent reference on the directory prevents the outgoing name
+ * cache entries from being reclaimed.
+ */
+ if (vp->v_type == VDIR) {
+ struct dirent *dp;
+ char *buf;
+ off_t off;
+ size_t buflen, len;
+ int eof, error;
+
+ buflen = 128 * sizeof(struct dirent);
+ buf = malloc(buflen, M_TEMP, M_WAITOK);
+
+ error = 0;
+ len = off = eof = 0;
+ for (;;) {
+ struct nameidata nd;
+
+ error = vn_dir_next_dirent(vp, td, buf, buflen, &dp,
+ &len, &off, &eof);
+ if (error != 0)
+ break;
+ if (len == 0)
+ /* Finished reading. */
+ break;
+ if (strcmp(dp->d_name, ".") == 0 ||
+ strcmp(dp->d_name, "..") == 0)
+ continue;
+
+ /*
+ * namei() consumes a reference on the starting
+ * directory if it's specified as a vnode.
+ */
+ vrefact(vp);
+ VOP_UNLOCK(vp);
+ NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE,
+ dp->d_name, vp);
+ error = namei(&nd);
+ vn_lock(vp, LK_SHARED | LK_RETRY);
+ if (error != 0)
+ break;
+ vn_irflag_set_cond(nd.ni_vp, VIRF_INOTIFY_PARENT);
+ vrele(nd.ni_vp);
+ }
+ free(buf, M_TEMP);
+ if (error != 0)
+ return (error);
+ }
+
+ /*
+ * The vnode referenced in kern_inotify_add_watch() might be different
+ * than this one if nullfs is in the picture.
+ */
+ vrefact(vp);
+ watch = malloc(sizeof(*watch), M_INOTIFY, M_WAITOK | M_ZERO);
+ watch->sc = sc;
+ watch->vp = vp;
+ watch->mask = mask;
+
+ /*
+ * Are we updating an existing watch? Search the vnode's list rather
+ * than that of the softc, as the former is likely to be shorter.
+ */
+ v_addpollinfo(vp);
+ mtx_lock(&vp->v_pollinfo->vpi_lock);
+ TAILQ_FOREACH(watch1, &vp->v_pollinfo->vpi_inotify, vlink) {
+ if (watch1->sc == sc)
+ break;
+ }
+ mtx_lock(&sc->lock);
+ if (watch1 != NULL) {
+ mtx_unlock(&vp->v_pollinfo->vpi_lock);
+
+ /*
+ * We found an existing watch, update it based on our flags.
+ */
+ if ((mask & IN_MASK_CREATE) != 0) {
+ mtx_unlock(&sc->lock);
+ vrele(vp);
+ free(watch, M_INOTIFY);
+ return (EEXIST);
+ }
+ if ((mask & IN_MASK_ADD) != 0)
+ watch1->mask |= mask;
+ else
+ watch1->mask = mask;
+ *wdp = watch1->wd;
+ mtx_unlock(&sc->lock);
+ vrele(vp);
+ free(watch, M_INOTIFY);
+ return (EJUSTRETURN);
+ }
+
+ /*
+ * We're creating a new watch. Add it to the softc and vnode watch
+ * lists.
+ */
+ do {
+ struct inotify_watch key;
+
+ /*
+ * Search for the next available watch descriptor. This is
+ * implemented so as to avoid reusing watch descriptors for as
+ * long as possible.
+ */
+ key.wd = wd = sc->nextwatch++;
+ watch1 = RB_FIND(inotify_watch_tree, &sc->watches, &key);
+ } while (watch1 != NULL || wd == 0);
+ watch->wd = wd;
+ RB_INSERT(inotify_watch_tree, &sc->watches, watch);
+ TAILQ_INSERT_TAIL(&vp->v_pollinfo->vpi_inotify, watch, vlink);
+ mtx_unlock(&sc->lock);
+ mtx_unlock(&vp->v_pollinfo->vpi_lock);
+ vn_irflag_set_cond(vp, VIRF_INOTIFY);
+
+ *wdp = wd;
+
+ return (0);
+}
+
+void
+vn_inotify_revoke(struct vnode *vp)
+{
+ if (vp->v_pollinfo == NULL) {
+ /* This is a nullfs vnode which shadows a watched vnode. */
+ return;
+ }
+ inotify_log(vp, NULL, 0, IN_UNMOUNT, 0);
+}
+
+static int
+fget_inotify(struct thread *td, int fd, const cap_rights_t *needrightsp,
+ struct file **fpp)
+{
+ struct file *fp;
+ int error;
+
+ error = fget(td, fd, needrightsp, &fp);
+ if (error != 0)
+ return (error);
+ if (fp->f_type != DTYPE_INOTIFY) {
+ fdrop(fp, td);
+ return (EINVAL);
+ }
+ *fpp = fp;
+ return (0);
+}
+
+int
+kern_inotify_add_watch(int fd, int dfd, const char *path, uint32_t mask,
+ struct thread *td)
+{
+ struct nameidata nd;
+ struct file *fp;
+ struct inotify_softc *sc;
+ struct vnode *vp;
+ uint32_t wd;
+ int count, error;
+
+ fp = NULL;
+ vp = NULL;
+
+ if ((mask & IN_ALL_EVENTS) == 0)
+ return (EXTERROR(EINVAL, "no events specified"));
+ if ((mask & (IN_MASK_ADD | IN_MASK_CREATE)) ==
+ (IN_MASK_ADD | IN_MASK_CREATE))
+ return (EXTERROR(EINVAL,
+ "IN_MASK_ADD and IN_MASK_CREATE are mutually exclusive"));
+ if ((mask & ~(IN_ALL_EVENTS | _IN_ALL_FLAGS | IN_UNMOUNT)) != 0)
+ return (EXTERROR(EINVAL, "unrecognized flag"));
+
+ error = fget_inotify(td, fd, &cap_inotify_add_rights, &fp);
+ if (error != 0)
+ return (error);
+ sc = fp->f_data;
+
+ NDINIT_AT(&nd, LOOKUP,
+ ((mask & IN_DONT_FOLLOW) ? NOFOLLOW : FOLLOW) | LOCKLEAF |
+ LOCKSHARED | AUDITVNODE1, UIO_USERSPACE, path, dfd);
+ error = namei(&nd);
+ if (error != 0)
+ goto out;
+ NDFREE_PNBUF(&nd);
+ vp = nd.ni_vp;
+
+ error = VOP_ACCESS(vp, VREAD, td->td_ucred, td);
+ if (error != 0)
+ goto out;
+
+ if ((mask & IN_ONLYDIR) != 0 && vp->v_type != VDIR) {
+ error = ENOTDIR;
+ goto out;
+ }
+
+ count = atomic_fetchadd_int(&inotify_watches, 1);
+ if (count > inotify_max_watches) {
+ atomic_subtract_int(&inotify_watches, 1);
+ error = ENOSPC;
+ goto out;
+ }
+ if (!chginotifywatchcnt(sc->cred->cr_ruidinfo, 1,
+ inotify_max_user_watches)) {
+ atomic_subtract_int(&inotify_watches, 1);
+ error = ENOSPC;
+ goto out;
+ }
+ error = VOP_INOTIFY_ADD_WATCH(vp, sc, mask, &wd, td);
+ if (error != 0) {
+ atomic_subtract_int(&inotify_watches, 1);
+ (void)chginotifywatchcnt(sc->cred->cr_ruidinfo, -1, 0);
+ if (error == EJUSTRETURN) {
+ /* We updated an existing watch, everything is ok. */
+ error = 0;
+ } else {
+ goto out;
+ }
+ }
+ td->td_retval[0] = wd;
+
+out:
+ if (vp != NULL)
+ vput(vp);
+ fdrop(fp, td);
+ return (error);
+}
+
+int
+sys_inotify_add_watch_at(struct thread *td,
+ struct inotify_add_watch_at_args *uap)
+{
+ return (kern_inotify_add_watch(uap->fd, uap->dfd, uap->path,
+ uap->mask, td));
+}
+
+int
+kern_inotify_rm_watch(int fd, uint32_t wd, struct thread *td)
+{
+ struct file *fp;
+ struct inotify_softc *sc;
+ struct inotify_record *rec;
+ struct inotify_watch key, *watch;
+ int error;
+
+ error = fget_inotify(td, fd, &cap_inotify_rm_rights, &fp);
+ if (error != 0)
+ return (error);
+ sc = fp->f_data;
+
+ rec = inotify_alloc_record(wd, NULL, 0, IN_IGNORED, 0, M_WAITOK);
+
+ /*
+ * For compatibility with Linux, we do not remove pending events
+ * associated with the watch. Watch descriptors are implemented so as
+ * to avoid being reused for as long as possible, so one hopes that any
+	 * pending events from the removed watch descriptor will be consumed
+ * before the watch descriptor is recycled.
+ */
+ key.wd = wd;
+ mtx_lock(&sc->lock);
+ watch = RB_FIND(inotify_watch_tree, &sc->watches, &key);
+ if (watch == NULL) {
+ free(rec, M_INOTIFY);
+ error = EINVAL;
+ } else {
+ RB_REMOVE(inotify_watch_tree, &sc->watches, watch);
+ if (!inotify_queue_record(sc, rec)) {
+ free(rec, M_INOTIFY);
+ error = 0;
+ }
+ }
+ mtx_unlock(&sc->lock);
+ if (watch != NULL)
+ inotify_remove_watch(watch);
+ fdrop(fp, td);
+ return (error);
+}
+
+int
+sys_inotify_rm_watch(struct thread *td, struct inotify_rm_watch_args *uap)
+{
+ return (kern_inotify_rm_watch(uap->fd, uap->wd, td));
+}
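
For orientation, here is a minimal userspace sketch of the interface implemented above. It assumes the Linux-compatible inotify_init1() and inotify_add_watch() libc wrappers that front the new syscalls (the wrappers are outside this diff), and it sizes the read buffer for one worst-case event, since inotify_read() above fails a read with EINVAL when even the first pending event does not fit.

#include <sys/inotify.h>

#include <err.h>
#include <limits.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	char buf[sizeof(struct inotify_event) + NAME_MAX + 1];
	ssize_t n;
	int fd, wd;

	fd = inotify_init1(IN_CLOEXEC);
	if (fd < 0)
		err(1, "inotify_init1");
	wd = inotify_add_watch(fd, "/tmp", IN_CREATE | IN_DELETE);
	if (wd < 0)
		err(1, "inotify_add_watch");
	/* Each read() returns as many whole events as fit in the buffer. */
	while ((n = read(fd, buf, sizeof(buf))) > 0) {
		for (char *p = buf; p < buf + n;) {
			struct inotify_event *ev;

			ev = (struct inotify_event *)(void *)p;
			printf("wd %d mask %#x name %s\n", ev->wd,
			    ev->mask, ev->len > 0 ? ev->name : "");
			p += sizeof(*ev) + ev->len;
		}
	}
	return (0);
}

A watch removed with inotify_rm_watch(), or torn down by IN_ONESHOT, IN_DELETE_SELF, or an unmount, yields a final IN_IGNORED event for its descriptor, matching inotify_log_one() and kern_inotify_rm_watch() above.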
diff --git a/sys/kern/vfs_lookup.c b/sys/kern/vfs_lookup.c
index 86c7bdaa02c0..fb3e6a7a2534 100644
--- a/sys/kern/vfs_lookup.c
+++ b/sys/kern/vfs_lookup.c
@@ -75,14 +75,20 @@ static void NDVALIDATE_impl(struct nameidata *, int);
#endif
/*
+ * Reset ndp to its original state.
+ */
+#define NDRESET(ndp) do { \
+ NDREINIT_DBG(ndp); \
+ ndp->ni_resflags = 0; \
+ ndp->ni_cnd.cn_flags &= ~NAMEI_INTERNAL_FLAGS; \
+} while (0)
+/*
 * Prepare namei() to restart. Reset components to their original state and
 * set the ISRESTARTED flag, which signals the underlying lookup code to
 * change the root from the ABI root to the actual root and prevents
 * further restarts.
*/
#define NDRESTART(ndp) do { \
- NDREINIT_DBG(ndp); \
- ndp->ni_resflags = 0; \
- ndp->ni_cnd.cn_flags &= ~NAMEI_INTERNAL_FLAGS; \
+ NDRESET(ndp); \
ndp->ni_cnd.cn_flags |= ISRESTARTED; \
} while (0)
@@ -162,8 +168,8 @@ static struct vop_vector crossmp_vnodeops = {
*/
struct nameicap_tracker {
- struct vnode *dp;
TAILQ_ENTRY(nameicap_tracker) nm_link;
+ struct mount *mp;
};
/* Zone for cap mode tracker elements used for dotdot capability checks. */
@@ -192,49 +198,75 @@ SYSCTL_INT(_vfs, OID_AUTO, lookup_cap_dotdot_nonlocal, CTLFLAG_RWTUN,
"enables \"..\" components in path lookup in capability mode "
"on non-local mount");
-static void
+static int
nameicap_tracker_add(struct nameidata *ndp, struct vnode *dp)
{
struct nameicap_tracker *nt;
+ struct mount *mp;
+ int error;
if ((ndp->ni_lcf & NI_LCF_CAP_DOTDOT) == 0 || dp->v_type != VDIR)
- return;
+ return (0);
+ mp = NULL;
+ error = VOP_GETWRITEMOUNT(dp, &mp);
+ if (error != 0)
+ return (error);
nt = TAILQ_LAST(&ndp->ni_cap_tracker, nameicap_tracker_head);
- if (nt != NULL && nt->dp == dp)
- return;
+ if (nt != NULL && nt->mp == mp) {
+ vfs_rel(mp);
+ return (0);
+ }
nt = malloc(sizeof(*nt), M_NAMEITRACKER, M_WAITOK);
- vhold(dp);
- nt->dp = dp;
- TAILQ_INSERT_TAIL(&ndp->ni_cap_tracker, nt, nm_link);
+ nt->mp = mp;
+ error = lockmgr(&mp->mnt_renamelock, LK_SHARED | LK_NOWAIT, 0);
+ if (error != 0) {
+ MPASS(ndp->ni_nctrack_mnt == NULL);
+ ndp->ni_nctrack_mnt = mp;
+ free(nt, M_NAMEITRACKER);
+ error = ERESTART;
+ } else {
+ TAILQ_INSERT_TAIL(&ndp->ni_cap_tracker, nt, nm_link);
+ }
+ return (error);
}
static void
-nameicap_cleanup_from(struct nameidata *ndp, struct nameicap_tracker *first)
+nameicap_cleanup(struct nameidata *ndp, int error)
{
struct nameicap_tracker *nt, *nt1;
+ struct mount *mp;
+
+ KASSERT((ndp->ni_nctrack_mnt == NULL &&
+ TAILQ_EMPTY(&ndp->ni_cap_tracker)) ||
+ (ndp->ni_lcf & NI_LCF_CAP_DOTDOT) != 0,
+ ("tracker active and not strictrelative"));
- nt = first;
- TAILQ_FOREACH_FROM_SAFE(nt, &ndp->ni_cap_tracker, nm_link, nt1) {
+ TAILQ_FOREACH_SAFE(nt, &ndp->ni_cap_tracker, nm_link, nt1) {
+ mp = nt->mp;
+ lockmgr(&mp->mnt_renamelock, LK_RELEASE, 0);
+ vfs_rel(mp);
TAILQ_REMOVE(&ndp->ni_cap_tracker, nt, nm_link);
- vdrop(nt->dp);
free(nt, M_NAMEITRACKER);
}
-}
-static void
-nameicap_cleanup(struct nameidata *ndp)
-{
- KASSERT(TAILQ_EMPTY(&ndp->ni_cap_tracker) ||
- (ndp->ni_lcf & NI_LCF_CAP_DOTDOT) != 0, ("not strictrelative"));
- nameicap_cleanup_from(ndp, NULL);
+ mp = ndp->ni_nctrack_mnt;
+ if (mp != NULL) {
+ if (error == ERESTART) {
+ lockmgr(&mp->mnt_renamelock, LK_EXCLUSIVE, 0);
+ lockmgr(&mp->mnt_renamelock, LK_RELEASE, 0);
+ }
+ vfs_rel(mp);
+ ndp->ni_nctrack_mnt = NULL;
+ }
}
/*
- * For dotdot lookups in capability mode, only allow the component
- * lookup to succeed if the resulting directory was already traversed
- * during the operation. This catches situations where already
- * traversed directory is moved to different parent, and then we walk
- * over it with dotdots.
+ * For dotdot lookups in capability mode, disallow walking over the
+ * directory ni_rbeneath_dpp that was used as the starting point of
+ * the lookup.  Since we take the mnt_renamelock of every mount we
+ * walked over during the lookup, parallel renames are disabled.
+ * This closes the race where dotdots could otherwise be used to
+ * walk over ni_rbeneath_dpp after a traversed directory is renamed.
*
 * Also allow forcing failure of dotdot lookups for non-local
* filesystems, where external agents might assist local lookups to
@@ -243,7 +275,6 @@ nameicap_cleanup(struct nameidata *ndp)
static int
nameicap_check_dotdot(struct nameidata *ndp, struct vnode *dp)
{
- struct nameicap_tracker *nt;
struct mount *mp;
if (dp == NULL || dp->v_type != VDIR || (ndp->ni_lcf &
@@ -253,22 +284,16 @@ nameicap_check_dotdot(struct nameidata *ndp, struct vnode *dp)
NI_LCF_CAP_DOTDOT_KTR)) == NI_LCF_STRICTREL_KTR))
NI_CAP_VIOLATION(ndp, ndp->ni_cnd.cn_pnbuf);
if ((ndp->ni_lcf & NI_LCF_CAP_DOTDOT) == 0)
- return (ENOTCAPABLE);
+ goto violation;
+ if (dp == ndp->ni_rbeneath_dpp)
+ goto violation;
mp = dp->v_mount;
if (lookup_cap_dotdot_nonlocal == 0 && mp != NULL &&
(mp->mnt_flag & MNT_LOCAL) == 0)
- goto capfail;
- TAILQ_FOREACH_REVERSE(nt, &ndp->ni_cap_tracker, nameicap_tracker_head,
- nm_link) {
- if (dp == nt->dp) {
- nt = TAILQ_NEXT(nt, nm_link);
- if (nt != NULL)
- nameicap_cleanup_from(ndp, nt);
- return (0);
- }
- }
+ goto violation;
+ return (0);
-capfail:
+violation:
if (__predict_false((ndp->ni_lcf & NI_LCF_STRICTREL_KTR) != 0))
NI_CAP_VIOLATION(ndp, ndp->ni_cnd.cn_pnbuf);
return (ENOTCAPABLE);
@@ -394,6 +419,8 @@ namei_setup(struct nameidata *ndp, struct vnode **dpp, struct pwd **pwdp)
NI_LCF_CAP_DOTDOT;
}
}
+ if (error == 0 && (ndp->ni_lcf & NI_LCF_STRICTREL) != 0)
+ ndp->ni_rbeneath_dpp = *dpp;
/*
* If we are auditing the kernel pathname, save the user pathname.
@@ -631,6 +658,7 @@ restart:
error = namei_getpath(ndp);
if (__predict_false(error != 0)) {
namei_cleanup_cnp(cnp);
+ nameicap_cleanup(ndp, error);
SDT_PROBE4(vfs, namei, lookup, return, error, NULL,
false, ndp);
return (error);
@@ -661,12 +689,12 @@ restart:
else if (__predict_false(pwd->pwd_adir != pwd->pwd_rdir &&
(cnp->cn_flags & ISRESTARTED) == 0)) {
namei_cleanup_cnp(cnp);
+ nameicap_cleanup(ndp, ERESTART);
NDRESTART(ndp);
goto restart;
}
return (error);
case CACHE_FPL_STATUS_PARTIAL:
- TAILQ_INIT(&ndp->ni_cap_tracker);
dp = ndp->ni_startdir;
break;
case CACHE_FPL_STATUS_DESTROYED:
@@ -674,18 +702,21 @@ restart:
error = namei_getpath(ndp);
if (__predict_false(error != 0)) {
namei_cleanup_cnp(cnp);
+ nameicap_cleanup(ndp, error);
return (error);
}
cnp->cn_nameptr = cnp->cn_pnbuf;
/* FALLTHROUGH */
case CACHE_FPL_STATUS_ABORTED:
- TAILQ_INIT(&ndp->ni_cap_tracker);
MPASS(ndp->ni_lcf == 0);
if (*cnp->cn_pnbuf == '\0') {
if ((cnp->cn_flags & EMPTYPATH) != 0) {
- return (namei_emptypath(ndp));
+ error = namei_emptypath(ndp);
+ nameicap_cleanup(ndp, error);
+ return (error);
}
namei_cleanup_cnp(cnp);
+ nameicap_cleanup(ndp, ENOENT);
SDT_PROBE4(vfs, namei, lookup, return, ENOENT, NULL,
false, ndp);
return (ENOENT);
@@ -693,6 +724,7 @@ restart:
error = namei_setup(ndp, &dp, &pwd);
if (error != 0) {
namei_cleanup_cnp(cnp);
+ nameicap_cleanup(ndp, error);
return (error);
}
break;
@@ -705,16 +737,23 @@ restart:
ndp->ni_startdir = dp;
error = vfs_lookup(ndp);
if (error != 0) {
- if (__predict_false(pwd->pwd_adir != pwd->pwd_rdir &&
- error == ENOENT &&
- (cnp->cn_flags & ISRESTARTED) == 0)) {
- nameicap_cleanup(ndp);
- pwd_drop(pwd);
- namei_cleanup_cnp(cnp);
- NDRESTART(ndp);
- goto restart;
- } else
+ uint64_t was_restarted;
+ bool abi_restart;
+
+ was_restarted = ndp->ni_cnd.cn_flags &
+ ISRESTARTED;
+ abi_restart = pwd->pwd_adir != pwd->pwd_rdir &&
+ error == ENOENT && was_restarted == 0;
+ if (error != ERESTART && !abi_restart)
goto out;
+ nameicap_cleanup(ndp, error);
+ pwd_drop(pwd);
+ namei_cleanup_cnp(cnp);
+ NDRESET(ndp);
+ if (abi_restart)
+ was_restarted = ISRESTARTED;
+ ndp->ni_cnd.cn_flags |= was_restarted;
+ goto restart;
}
/*
@@ -723,7 +762,7 @@ restart:
if ((cnp->cn_flags & ISSYMLINK) == 0) {
SDT_PROBE4(vfs, namei, lookup, return, error,
ndp->ni_vp, false, ndp);
- nameicap_cleanup(ndp);
+ nameicap_cleanup(ndp, 0);
pwd_drop(pwd);
NDVALIDATE(ndp);
return (0);
@@ -756,10 +795,10 @@ restart:
ndp->ni_vp = NULL;
vrele(ndp->ni_dvp);
out:
- MPASS(error != 0);
+ MPASS(error != 0 && error != ERESTART);
SDT_PROBE4(vfs, namei, lookup, return, error, NULL, false, ndp);
namei_cleanup_cnp(cnp);
- nameicap_cleanup(ndp);
+ nameicap_cleanup(ndp, error);
pwd_drop(pwd);
return (error);
}
@@ -1185,7 +1224,9 @@ dirloop:
}
}
- nameicap_tracker_add(ndp, dp);
+ error = nameicap_tracker_add(ndp, dp);
+ if (error != 0)
+ goto bad;
/*
* Make sure degenerate names don't get here, their handling was
@@ -1210,9 +1251,7 @@ dirloop:
* the jail or chroot, don't let them out.
* 5. If doing a capability lookup and lookup_cap_dotdot is
* enabled, return ENOTCAPABLE if the lookup would escape
- * from the initial file descriptor directory. Checks are
- * done by ensuring that namei() already traversed the
- * result of dotdot lookup.
+ * from the initial file descriptor directory.
*/
if (cnp->cn_flags & ISDOTDOT) {
if (__predict_false((ndp->ni_lcf & (NI_LCF_STRICTREL_KTR |
@@ -1238,7 +1277,7 @@ dirloop:
NI_CAP_VIOLATION(ndp, cnp->cn_pnbuf);
if ((ndp->ni_lcf & NI_LCF_STRICTREL) != 0) {
error = ENOTCAPABLE;
- goto capdotdot;
+ goto bad;
}
}
if (isroot || ((dp->v_vflag & VV_ROOT) != 0 &&
@@ -1261,11 +1300,6 @@ dirloop:
vn_lock(dp,
enforce_lkflags(dp->v_mount, cnp->cn_lkflags |
LK_RETRY));
- error = nameicap_check_dotdot(ndp, dp);
- if (error != 0) {
-capdotdot:
- goto bad;
- }
}
}
@@ -1314,7 +1348,9 @@ unionlookup:
vn_lock(dp,
enforce_lkflags(dp->v_mount, cnp->cn_lkflags |
LK_RETRY));
- nameicap_tracker_add(ndp, dp);
+ error = nameicap_tracker_add(ndp, dp);
+ if (error != 0)
+ goto bad;
goto unionlookup;
}
@@ -1415,7 +1451,7 @@ nextname:
goto dirloop;
}
if (cnp->cn_flags & ISDOTDOT) {
- error = nameicap_check_dotdot(ndp, ndp->ni_vp);
+ error = nameicap_check_dotdot(ndp, ndp->ni_dvp);
if (error != 0)
goto bad2;
}
@@ -1485,8 +1521,11 @@ success:
}
success_right_lock:
if (ndp->ni_vp != NULL) {
- if ((cnp->cn_flags & ISDOTDOT) == 0)
- nameicap_tracker_add(ndp, ndp->ni_vp);
+ if ((cnp->cn_flags & ISDOTDOT) == 0) {
+ error = nameicap_tracker_add(ndp, ndp->ni_vp);
+ if (error != 0)
+ goto bad2;
+ }
if ((cnp->cn_flags & (FAILIFEXISTS | ISSYMLINK)) == FAILIFEXISTS)
return (vfs_lookup_failifexists(ndp));
}
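
The capability-mode dotdot handling above no longer replays a list of traversed directories; instead each traversed mount's mnt_renamelock is taken shared with LK_NOWAIT, renames take it exclusive, and a failed trylock surfaces as ERESTART. Condensed to its core, the idiom looks as follows (an illustrative fragment only; the real logic is split across nameicap_tracker_add() and nameicap_cleanup() above):

	/* Lookup side: shared trylock on each traversed mount. */
	error = lockmgr(&mp->mnt_renamelock, LK_SHARED | LK_NOWAIT, 0);
	if (error != 0)
		return (ERESTART);	/* rename in flight on this mount */

	/*
	 * Restart side: block until the rename finishes, then drop the
	 * lock immediately so the retried lookup does not spin.
	 */
	lockmgr(&mp->mnt_renamelock, LK_EXCLUSIVE, 0);
	lockmgr(&mp->mnt_renamelock, LK_RELEASE, 0);
	goto restart;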
diff --git a/sys/kern/vfs_mount.c b/sys/kern/vfs_mount.c
index cb18468d28bc..8e64a7fe966b 100644
--- a/sys/kern/vfs_mount.c
+++ b/sys/kern/vfs_mount.c
@@ -156,6 +156,7 @@ mount_init(void *mem, int size, int flags)
mtx_init(&mp->mnt_mtx, "struct mount mtx", NULL, MTX_DEF);
mtx_init(&mp->mnt_listmtx, "struct mount vlist mtx", NULL, MTX_DEF);
lockinit(&mp->mnt_explock, PVFS, "explock", 0, 0);
+ lockinit(&mp->mnt_renamelock, PVFS, "rename", 0, 0);
mp->mnt_pcpu = uma_zalloc_pcpu(pcpu_zone_16, M_WAITOK | M_ZERO);
mp->mnt_ref = 0;
mp->mnt_vfs_ops = 1;
@@ -170,6 +171,7 @@ mount_fini(void *mem, int size)
mp = (struct mount *)mem;
uma_zfree_pcpu(pcpu_zone_16, mp->mnt_pcpu);
+ lockdestroy(&mp->mnt_renamelock);
lockdestroy(&mp->mnt_explock);
mtx_destroy(&mp->mnt_listmtx);
mtx_destroy(&mp->mnt_mtx);
diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c
index dc2fb59fb81c..918b256e6c59 100644
--- a/sys/kern/vfs_subr.c
+++ b/sys/kern/vfs_subr.c
@@ -38,7 +38,6 @@
* External virtual filesystem routines
*/
-#include <sys/cdefs.h>
#include "opt_ddb.h"
#include "opt_watchdog.h"
@@ -57,6 +56,7 @@
#include <sys/extattr.h>
#include <sys/file.h>
#include <sys/fcntl.h>
+#include <sys/inotify.h>
#include <sys/jail.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
@@ -5246,7 +5246,8 @@ destroy_vpollinfo_free(struct vpollinfo *vi)
static void
destroy_vpollinfo(struct vpollinfo *vi)
{
-
+ KASSERT(TAILQ_EMPTY(&vi->vpi_inotify),
+ ("%s: pollinfo %p has lingering watches", __func__, vi));
knlist_clear(&vi->vpi_selinfo.si_note, 1);
seldrain(&vi->vpi_selinfo);
destroy_vpollinfo_free(vi);
@@ -5260,12 +5261,13 @@ v_addpollinfo(struct vnode *vp)
{
struct vpollinfo *vi;
- if (vp->v_pollinfo != NULL)
+ if (atomic_load_ptr(&vp->v_pollinfo) != NULL)
return;
vi = malloc(sizeof(*vi), M_VNODEPOLL, M_WAITOK | M_ZERO);
mtx_init(&vi->vpi_lock, "vnode pollinfo", NULL, MTX_DEF);
knlist_init(&vi->vpi_selinfo.si_note, vp, vfs_knllock,
vfs_knlunlock, vfs_knl_assert_lock);
+ TAILQ_INIT(&vi->vpi_inotify);
VI_LOCK(vp);
if (vp->v_pollinfo != NULL) {
VI_UNLOCK(vp);
@@ -5851,6 +5853,8 @@ vop_rename_pre(void *ap)
struct vop_rename_args *a = ap;
#ifdef DEBUG_VFS_LOCKS
+ struct mount *tmp;
+
if (a->a_tvp)
ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME");
ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME");
@@ -5868,6 +5872,11 @@ vop_rename_pre(void *ap)
if (a->a_tvp)
ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked");
ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked");
+
+ tmp = NULL;
+ VOP_GETWRITEMOUNT(a->a_tdvp, &tmp);
+ lockmgr_assert(&tmp->mnt_renamelock, KA_XLOCKED);
+ vfs_rel(tmp);
#endif
/*
* It may be tempting to add vn_seqc_write_begin/end calls here and
@@ -6057,6 +6066,28 @@ vop_need_inactive_debugpost(void *ap, int rc)
#endif
void
+vop_allocate_post(void *ap, int rc)
+{
+ struct vop_allocate_args *a;
+
+ a = ap;
+ if (rc == 0)
+ INOTIFY(a->a_vp, IN_MODIFY);
+}
+
+void
+vop_copy_file_range_post(void *ap, int rc)
+{
+ struct vop_copy_file_range_args *a;
+
+ a = ap;
+ if (rc == 0) {
+ INOTIFY(a->a_invp, IN_ACCESS);
+ INOTIFY(a->a_outvp, IN_MODIFY);
+ }
+}
+
+void
vop_create_pre(void *ap)
{
struct vop_create_args *a;
@@ -6076,8 +6107,20 @@ vop_create_post(void *ap, int rc)
a = ap;
dvp = a->a_dvp;
vn_seqc_write_end(dvp);
- if (!rc)
+ if (!rc) {
VFS_KNOTE_LOCKED(dvp, NOTE_WRITE);
+ INOTIFY_NAME(*a->a_vpp, dvp, a->a_cnp, IN_CREATE);
+ }
+}
+
+void
+vop_deallocate_post(void *ap, int rc)
+{
+ struct vop_deallocate_args *a;
+
+ a = ap;
+ if (rc == 0)
+ INOTIFY(a->a_vp, IN_MODIFY);
}
void
@@ -6122,8 +6165,10 @@ vop_deleteextattr_post(void *ap, int rc)
a = ap;
vp = a->a_vp;
vn_seqc_write_end(vp);
- if (!rc)
+ if (!rc) {
VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
+ INOTIFY(vp, IN_ATTRIB);
+ }
}
void
@@ -6153,6 +6198,8 @@ vop_link_post(void *ap, int rc)
if (!rc) {
VFS_KNOTE_LOCKED(vp, NOTE_LINK);
VFS_KNOTE_LOCKED(tdvp, NOTE_WRITE);
+ INOTIFY_NAME(vp, tdvp, a->a_cnp, _IN_ATTRIB_LINKCOUNT);
+ INOTIFY_NAME(vp, tdvp, a->a_cnp, IN_CREATE);
}
}
@@ -6176,8 +6223,10 @@ vop_mkdir_post(void *ap, int rc)
a = ap;
dvp = a->a_dvp;
vn_seqc_write_end(dvp);
- if (!rc)
+ if (!rc) {
VFS_KNOTE_LOCKED(dvp, NOTE_WRITE | NOTE_LINK);
+ INOTIFY_NAME(*a->a_vpp, dvp, a->a_cnp, IN_CREATE);
+ }
}
#ifdef DEBUG_VFS_LOCKS
@@ -6212,8 +6261,10 @@ vop_mknod_post(void *ap, int rc)
a = ap;
dvp = a->a_dvp;
vn_seqc_write_end(dvp);
- if (!rc)
+ if (!rc) {
VFS_KNOTE_LOCKED(dvp, NOTE_WRITE);
+ INOTIFY_NAME(*a->a_vpp, dvp, a->a_cnp, IN_CREATE);
+ }
}
void
@@ -6225,8 +6276,10 @@ vop_reclaim_post(void *ap, int rc)
a = ap;
vp = a->a_vp;
ASSERT_VOP_IN_SEQC(vp);
- if (!rc)
+ if (!rc) {
VFS_KNOTE_LOCKED(vp, NOTE_REVOKE);
+ INOTIFY_REVOKE(vp);
+ }
}
void
@@ -6257,6 +6310,8 @@ vop_remove_post(void *ap, int rc)
if (!rc) {
VFS_KNOTE_LOCKED(dvp, NOTE_WRITE);
VFS_KNOTE_LOCKED(vp, NOTE_DELETE);
+ INOTIFY_NAME(vp, dvp, a->a_cnp, _IN_ATTRIB_LINKCOUNT);
+ INOTIFY_NAME(vp, dvp, a->a_cnp, IN_DELETE);
}
}
@@ -6288,6 +6343,8 @@ vop_rename_post(void *ap, int rc)
VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME);
if (a->a_tvp)
VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE);
+ INOTIFY_MOVE(a->a_fvp, a->a_fdvp, a->a_fcnp, a->a_tvp,
+ a->a_tdvp, a->a_tcnp);
}
if (a->a_tdvp != a->a_fdvp)
vdrop(a->a_fdvp);
@@ -6327,6 +6384,7 @@ vop_rmdir_post(void *ap, int rc)
vp->v_vflag |= VV_UNLINKED;
VFS_KNOTE_LOCKED(dvp, NOTE_WRITE | NOTE_LINK);
VFS_KNOTE_LOCKED(vp, NOTE_DELETE);
+ INOTIFY_NAME(vp, dvp, a->a_cnp, IN_DELETE);
}
}
@@ -6350,8 +6408,10 @@ vop_setattr_post(void *ap, int rc)
a = ap;
vp = a->a_vp;
vn_seqc_write_end(vp);
- if (!rc)
+ if (!rc) {
VFS_KNOTE_LOCKED(vp, NOTE_ATTRIB);
+ INOTIFY(vp, IN_ATTRIB);
+ }
}
void
@@ -6396,8 +6456,10 @@ vop_setextattr_post(void *ap, int rc)
a = ap;
vp = a->a_vp;
vn_seqc_write_end(vp);
- if (!rc)
+ if (!rc) {
VFS_KNOTE_LOCKED(vp, NOTE_ATTRIB);
+ INOTIFY(vp, IN_ATTRIB);
+ }
}
void
@@ -6420,8 +6482,10 @@ vop_symlink_post(void *ap, int rc)
a = ap;
dvp = a->a_dvp;
vn_seqc_write_end(dvp);
- if (!rc)
+ if (!rc) {
VFS_KNOTE_LOCKED(dvp, NOTE_WRITE);
+ INOTIFY_NAME(*a->a_vpp, dvp, a->a_cnp, IN_CREATE);
+ }
}
void
@@ -6429,8 +6493,10 @@ vop_open_post(void *ap, int rc)
{
struct vop_open_args *a = ap;
- if (!rc)
+ if (!rc) {
VFS_KNOTE_LOCKED(a->a_vp, NOTE_OPEN);
+ INOTIFY(a->a_vp, IN_OPEN);
+ }
}
void
@@ -6442,6 +6508,8 @@ vop_close_post(void *ap, int rc)
!VN_IS_DOOMED(a->a_vp))) {
VFS_KNOTE_LOCKED(a->a_vp, (a->a_fflag & FWRITE) != 0 ?
NOTE_CLOSE_WRITE : NOTE_CLOSE);
+ INOTIFY(a->a_vp, (a->a_fflag & FWRITE) != 0 ?
+ IN_CLOSE_WRITE : IN_CLOSE_NOWRITE);
}
}
@@ -6450,8 +6518,10 @@ vop_read_post(void *ap, int rc)
{
struct vop_read_args *a = ap;
- if (!rc)
+ if (!rc) {
VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ);
+ INOTIFY(a->a_vp, IN_ACCESS);
+ }
}
void
@@ -6468,8 +6538,10 @@ vop_readdir_post(void *ap, int rc)
{
struct vop_readdir_args *a = ap;
- if (!rc)
+ if (!rc) {
VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ);
+ INOTIFY(a->a_vp, IN_ACCESS);
+ }
}
static struct knlist fs_knlist;
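
The INOTIFY() and INOTIFY_NAME() hooks used in the vop_*_post() routines above are wrappers declared in sys/inotify.h, which is not part of this diff. A plausible shape for the plain INOTIFY() case, shown here only as a hypothetical sketch: a cheap vnode-flag test that avoids the VOP entirely unless something is watching.

	/* Hypothetical sketch; the real definition is in sys/inotify.h. */
	#define	INOTIFY(vp, ev)	do {					\
		if (__predict_false((vn_irflag_read(vp) &		\
		    (VIRF_INOTIFY | VIRF_INOTIFY_PARENT)) != 0))	\
			VOP_INOTIFY((vp), NULL, NULL, (ev), 0);		\
	} while (0)

Passing a NULL dvp and cnp matches the fallback path in vn_inotify(), which then resolves watched parents through the name cache via cache_vop_inotify().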
diff --git a/sys/kern/vfs_syscalls.c b/sys/kern/vfs_syscalls.c
index c236f241bf20..c71e0d9ee569 100644
--- a/sys/kern/vfs_syscalls.c
+++ b/sys/kern/vfs_syscalls.c
@@ -3766,7 +3766,7 @@ int
kern_renameat(struct thread *td, int oldfd, const char *old, int newfd,
const char *new, enum uio_seg pathseg)
{
- struct mount *mp = NULL;
+ struct mount *mp, *tmp;
struct vnode *tvp, *fvp, *tdvp;
struct nameidata fromnd, tond;
uint64_t tondflags;
@@ -3774,6 +3774,7 @@ kern_renameat(struct thread *td, int oldfd, const char *old, int newfd,
short irflag;
again:
+ tmp = mp = NULL;
bwillwrite();
#ifdef MAC
if (mac_vnode_check_rename_from_enabled()) {
@@ -3809,6 +3810,7 @@ again:
tvp = tond.ni_vp;
error = vn_start_write(fvp, &mp, V_NOWAIT);
if (error != 0) {
+again1:
NDFREE_PNBUF(&fromnd);
NDFREE_PNBUF(&tond);
if (tvp != NULL)
@@ -3819,11 +3821,25 @@ again:
vput(tdvp);
vrele(fromnd.ni_dvp);
vrele(fvp);
+ if (tmp != NULL) {
+ lockmgr(&tmp->mnt_renamelock, LK_EXCLUSIVE, NULL);
+ lockmgr(&tmp->mnt_renamelock, LK_RELEASE, NULL);
+ vfs_rel(tmp);
+ tmp = NULL;
+ }
error = vn_start_write(NULL, &mp, V_XSLEEP | V_PCATCH);
if (error != 0)
return (error);
goto again;
}
+ error = VOP_GETWRITEMOUNT(tdvp, &tmp);
+ if (error != 0 || tmp == NULL)
+ goto again1;
+ error = lockmgr(&tmp->mnt_renamelock, LK_EXCLUSIVE | LK_NOWAIT, NULL);
+ if (error != 0) {
+ vn_finished_write(mp);
+ goto again1;
+ }
irflag = vn_irflag_read(fvp);
if (((irflag & VIRF_NAMEDATTR) != 0 && tdvp != fromnd.ni_dvp) ||
(irflag & VIRF_NAMEDDIR) != 0) {
@@ -3884,6 +3900,8 @@ out:
vrele(fromnd.ni_dvp);
vrele(fvp);
}
+ lockmgr(&tmp->mnt_renamelock, LK_RELEASE, 0);
+ vfs_rel(tmp);
vn_finished_write(mp);
out1:
if (error == ERESTART)
@@ -4296,10 +4314,6 @@ kern_getdirentries(struct thread *td, int fd, char *buf, size_t count,
vp = fp->f_vnode;
foffset = foffset_lock(fp, 0);
unionread:
- if (vp->v_type != VDIR) {
- error = EINVAL;
- goto fail;
- }
if (__predict_false((vp->v_vflag & VV_UNLINKED) != 0)) {
error = ENOENT;
goto fail;
@@ -4312,6 +4326,19 @@ unionread:
auio.uio_segflg = bufseg;
auio.uio_td = td;
vn_lock(vp, LK_SHARED | LK_RETRY);
+ /*
+ * We want to return ENOTDIR for anything that is not VDIR, but
+ * not for VBAD, and we can't check for VBAD while the vnode is
+ * unlocked.
+ */
+ if (vp->v_type != VDIR) {
+ if (vp->v_type == VBAD)
+ error = EBADF;
+ else
+ error = ENOTDIR;
+ VOP_UNLOCK(vp);
+ goto fail;
+ }
AUDIT_ARG_VNODE1(vp);
loff = auio.uio_offset = foffset;
#ifdef MAC
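
Besides the rename serialization, kern_getdirentries() now reports distinct errors where it previously returned EINVAL for everything that was not a directory. A hypothetical userspace check of the revised contract (test scaffolding assumed, not part of the commit):

#include <sys/types.h>

#include <assert.h>
#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
#include <unistd.h>

static void
check_enotdir(void)
{
	char buf[4096];
	int fd;

	fd = open("/etc/passwd", O_RDONLY);	/* any non-directory */
	assert(fd >= 0);
	assert(getdirentries(fd, buf, sizeof(buf), NULL) == -1);
	assert(errno == ENOTDIR);		/* formerly EINVAL */
	(void)close(fd);
}

A revoked vnode (VBAD) instead fails with EBADF, which is why the type check now happens only after the vnode is locked.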
diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c
index b29286654f60..6451c9e07a60 100644
--- a/sys/kern/vfs_vnops.c
+++ b/sys/kern/vfs_vnops.c
@@ -41,6 +41,7 @@
*/
#include "opt_hwpmc_hooks.h"
+#include "opt_hwt_hooks.h"
#include <sys/param.h>
#include <sys/systm.h>
@@ -51,6 +52,7 @@
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filio.h>
+#include <sys/inotify.h>
#include <sys/ktr.h>
#include <sys/ktrace.h>
#include <sys/limits.h>
@@ -86,6 +88,10 @@
#include <sys/pmckern.h>
#endif
+#ifdef HWT_HOOKS
+#include <dev/hwt/hwt_hook.h>
+#endif
+
static fo_rdwr_t vn_read;
static fo_rdwr_t vn_write;
static fo_rdwr_t vn_io_fault;
@@ -303,7 +309,8 @@ restart:
NDREINIT(ndp);
goto restart;
}
- if ((vn_open_flags & VN_OPEN_NAMECACHE) != 0)
+ if ((vn_open_flags & VN_OPEN_NAMECACHE) != 0 ||
+ (vn_irflag_read(ndp->ni_dvp) & VIRF_INOTIFY) != 0)
ndp->ni_cnd.cn_flags |= MAKEENTRY;
#ifdef MAC
error = mac_vnode_check_create(cred, ndp->ni_dvp,
@@ -479,6 +486,7 @@ vn_open_vnode(struct vnode *vp, int fmode, struct ucred *cred,
if (vp->v_type != VFIFO && vp->v_type != VSOCK &&
VOP_ACCESS(vp, VREAD, cred, td) == 0)
fp->f_flag |= FKQALLOWED;
+ INOTIFY(vp, IN_OPEN);
return (0);
}
@@ -1741,6 +1749,8 @@ vn_truncate_locked(struct vnode *vp, off_t length, bool sync,
vattr.va_vaflags |= VA_SYNC;
error = VOP_SETATTR(vp, &vattr, cred);
VOP_ADD_WRITECOUNT_CHECKED(vp, -1);
+ if (error == 0)
+ INOTIFY(vp, IN_MODIFY);
}
return (error);
}
@@ -3005,6 +3015,24 @@ vn_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t size,
}
}
#endif
+
+#ifdef HWT_HOOKS
+ if (HWT_HOOK_INSTALLED && (prot & VM_PROT_EXECUTE) != 0 &&
+ error == 0) {
+ struct hwt_record_entry ent;
+ char *fullpath;
+ char *freepath;
+
+ if (vn_fullpath(vp, &fullpath, &freepath) == 0) {
+ ent.fullpath = fullpath;
+ ent.addr = (uintptr_t) *addr;
+ ent.record_type = HWT_RECORD_MMAP;
+ HWT_CALL_HOOK(td, HWT_MMAP, &ent);
+ free(freepath, M_TEMP);
+ }
+ }
+#endif
+
return (error);
}
diff --git a/sys/kern/vnode_if.src b/sys/kern/vnode_if.src
index a2b6a7c8ff9f..38138a4af921 100644
--- a/sys/kern/vnode_if.src
+++ b/sys/kern/vnode_if.src
@@ -702,6 +702,7 @@ vop_vptocnp {
%% allocate vp E E E
+%! allocate post vop_allocate_post
vop_allocate {
IN struct vnode *vp;
@@ -786,6 +787,7 @@ vop_fdatasync {
%% copy_file_range invp U U U
%% copy_file_range outvp U U U
+%! copy_file_range post vop_copy_file_range_post
vop_copy_file_range {
IN struct vnode *invp;
@@ -810,6 +812,7 @@ vop_vput_pair {
%% deallocate vp L L L
+%! deallocate post vop_deallocate_post
vop_deallocate {
IN struct vnode *vp;
@@ -821,6 +824,27 @@ vop_deallocate {
};
+%% inotify vp - - -
+
+vop_inotify {
+ IN struct vnode *vp;
+ IN struct vnode *dvp;
+ IN struct componentname *cnp;
+ IN int event;
+ IN uint32_t cookie;
+};
+
+
+%% inotify_add_watch vp L L L
+
+vop_inotify_add_watch {
+ IN struct vnode *vp;
+ IN struct inotify_softc *sc;
+ IN uint32_t mask;
+ OUT uint32_t *wdp;
+ IN struct thread *td;
+};
+
# The VOPs below are spares at the end of the table to allow new VOPs to be
# added in stable branches without breaking the KBI. New VOPs in HEAD should
# be added above these spares. When merging a new VOP to a stable branch,
diff --git a/sys/modules/Makefile b/sys/modules/Makefile
index 35cf17be109f..7cb6e2124326 100644
--- a/sys/modules/Makefile
+++ b/sys/modules/Makefile
@@ -141,6 +141,7 @@ SUBDIR= \
${_hptnr} \
${_hptrr} \
hwpmc \
+ ${_hwt} \
${_hyperv} \
i2c \
${_iavf} \
@@ -325,6 +326,7 @@ SUBDIR= \
proto \
pseudofs \
${_pst} \
+ ${_pt} \
pty \
puc \
pwm \
@@ -841,6 +843,7 @@ _iwx= iwx
_ixl= ixl
_nvdimm= nvdimm
_pms= pms
+_pt= pt
_qat= qat
.if ${MK_SOURCELESS_UCODE} != "no"
_qatfw= qatfw
@@ -859,6 +862,10 @@ _smartpqi= smartpqi
_p2sb= p2sb
.endif
+.if ${MACHINE_CPUARCH} == "aarch64" || ${MACHINE_CPUARCH} == "amd64"
+_hwt= hwt
+.endif
+
.if ${MACHINE_CPUARCH} == "aarch64" || ${MACHINE_CPUARCH} == "amd64" || \
${MACHINE_CPUARCH} == "riscv"
.if ${MK_BHYVE} != "no" || defined(ALL_MODULES)
diff --git a/sys/modules/efirt/Makefile b/sys/modules/efirt/Makefile
index 4738996fd4e6..c46484465b68 100644
--- a/sys/modules/efirt/Makefile
+++ b/sys/modules/efirt/Makefile
@@ -9,7 +9,7 @@ SRCS+= device_if.h bus_if.h clock_if.h
DPSRCS+= assym.inc
.if ${MACHINE_CPUARCH} == "amd64"
-SRCS+= opt_hwpmc_hooks.h opt_kstack_pages.h
+SRCS+= opt_acpi.h opt_hwpmc_hooks.h opt_kstack_pages.h
.endif
efirt_support.o: efirt_support.S assym.inc
diff --git a/sys/modules/hwt/Makefile b/sys/modules/hwt/Makefile
new file mode 100644
index 000000000000..6704e22422d1
--- /dev/null
+++ b/sys/modules/hwt/Makefile
@@ -0,0 +1,21 @@
+# $FreeBSD$
+
+.PATH: ${SRCTOP}/sys/dev/hwt
+
+KMOD = hwt
+SRCS = \
+ hwt.c \
+ hwt_backend.c \
+ hwt_config.c \
+ hwt_context.c \
+ hwt_contexthash.c \
+ hwt_cpu.c \
+ hwt_hook.c \
+ hwt_ioctl.c \
+ hwt_owner.c \
+ hwt_ownerhash.c \
+ hwt_record.c \
+ hwt_thread.c \
+ hwt_vm.c
+
+.include <bsd.kmod.mk>
diff --git a/sys/modules/ice/Makefile b/sys/modules/ice/Makefile
index 91f20193d878..9f9c9f602cda 100644
--- a/sys/modules/ice/Makefile
+++ b/sys/modules/ice/Makefile
@@ -13,6 +13,7 @@ SRCS += opt_inet.h opt_inet6.h opt_rss.h opt_iflib.h
SRCS += ice_lib.c ice_osdep.c ice_resmgr.c ice_strings.c
SRCS += ice_iflib_recovery_txrx.c ice_iflib_txrx.c if_ice_iflib.c
SRCS += ice_fw_logging.c ice_ddp_common.c
+SRCS.PCI_IOV += pci_iov_if.h ice_iov.c ice_vf_mbx.c
# RDMA Client interface
# TODO: Is this the right way to compile this?
diff --git a/sys/modules/iwlwifi/Makefile b/sys/modules/iwlwifi/Makefile
index 6e0fea6efc3a..9774c3da61ee 100644
--- a/sys/modules/iwlwifi/Makefile
+++ b/sys/modules/iwlwifi/Makefile
@@ -4,6 +4,7 @@ DEVIWLWIFIDIR= ${SRCTOP}/sys/contrib/dev/iwlwifi
WITH_CONFIG_PM= 0
WITH_DEBUGFS= 1
+WITH_CONFIG_ACPI= 1
KMOD= if_iwlwifi
@@ -40,6 +41,12 @@ CFLAGS+= -DCONFIG_PM
CFLAGS+= -DCONFIG_PM_SLEEP
.endif
+.if defined(WITH_CONFIG_ACPI) && ${WITH_CONFIG_ACPI} > 0
+SRCS+= fw/acpi.c
+CFLAGS+= -DCONFIG_ACPI
+CFLAGS+= -DLINUXKPI_WANT_LINUX_ACPI
+.endif
+
SRCS+= iwl-devtrace.c
# Other
@@ -56,7 +63,6 @@ CFLAGS+= -DCONFIG_IWLMVM=1
# Helpful after fresh imports.
#CFLAGS+= -ferror-limit=0
-#CFLAGS+= -DCONFIG_ACPI=1
#CFLAGS+= -DCONFIG_INET=1 # Need LKPI TSO implementation.
#CFLAGS+= -DCONFIG_IPV6=1
CFLAGS+= -DCONFIG_IWLWIFI_DEBUG=1
diff --git a/sys/modules/pt/Makefile b/sys/modules/pt/Makefile
new file mode 100644
index 000000000000..416b072face9
--- /dev/null
+++ b/sys/modules/pt/Makefile
@@ -0,0 +1,8 @@
+
+.PATH: ${SRCTOP}/sys/amd64/pt
+
+KMOD= pt
+SRCS= pt.c pt.h device_if.h bus_if.h
+SRCS+= opt_hwpmc_hooks.h opt_kstack_pages.h
+
+.include <bsd.kmod.mk>
diff --git a/sys/modules/qlnx/qlnxe/Makefile b/sys/modules/qlnx/qlnxe/Makefile
index 3d8415cf0e57..2a44ae6ddde5 100644
--- a/sys/modules/qlnx/qlnxe/Makefile
+++ b/sys/modules/qlnx/qlnxe/Makefile
@@ -58,6 +58,7 @@ SRCS+=qlnx_rdma.c
SRCS+=qlnx_ioctl.c
SRCS+=qlnx_os.c
+SRCS+=opt_inet.h
SRCS+= ${LINUXKPI_GENSRCS}
diff --git a/sys/modules/rtw89/Makefile b/sys/modules/rtw89/Makefile
index 73945591826c..09580f288c62 100644
--- a/sys/modules/rtw89/Makefile
+++ b/sys/modules/rtw89/Makefile
@@ -39,6 +39,7 @@ SRCS+= ${LINUXKPI_GENSRCS}
SRCS+= opt_wlan.h opt_inet6.h opt_inet.h opt_acpi.h
CFLAGS+= -DKBUILD_MODNAME='"rtw89"'
+CFLAGS+= -DLINUXKPI_WANT_LINUX_ACPI
CFLAGS+= -I${DEVRTW89DIR}
CFLAGS+= ${LINUXKPI_INCLUDES}
diff --git a/sys/modules/sound/sound/Makefile b/sys/modules/sound/sound/Makefile
index d2cfed2f4b6a..f3978e9bd9cc 100644
--- a/sys/modules/sound/sound/Makefile
+++ b/sys/modules/sound/sound/Makefile
@@ -13,11 +13,11 @@ SRCS+= feeder.c feeder_rate.c feeder_volume.c
SRCS+= feeder_chain.c feeder_eq.c feeder_format.c
SRCS+= feeder_matrix.c feeder_mixer.c
SRCS+= feeder_eq_gen.h feeder_rate_gen.h snd_fxdiv_gen.h
-SRCS+= mpu_if.h mpufoi_if.h synth_if.h
-SRCS+= mpu_if.c mpufoi_if.c synth_if.c
+SRCS+= mpu_if.h mpufoi_if.h
+SRCS+= mpu_if.c mpufoi_if.c
SRCS+= ac97.c buffer.c channel.c dsp.c
SRCS+= mixer.c sndstat.c sound.c vchan.c
-SRCS+= midi.c mpu401.c sequencer.c
+SRCS+= midi.c mpu401.c
feeder_eq_gen.h: ${SYSDIR}/tools/sound/feeder_eq_mkfilter.awk
${AWK} -f ${SYSDIR}/tools/sound/feeder_eq_mkfilter.awk -- ${FEEDER_EQ_PRESETS} > ${.TARGET}
diff --git a/sys/net/ethernet.h b/sys/net/ethernet.h
index 6eefedba8775..01485cf26e06 100644
--- a/sys/net/ethernet.h
+++ b/sys/net/ethernet.h
@@ -62,6 +62,8 @@ struct ether_header {
u_char ether_shost[ETHER_ADDR_LEN];
u_short ether_type;
} __packed;
+_Static_assert(sizeof(struct ether_header) == ETHER_HDR_LEN,
+ "size of struct ether_header is wrong");
/*
* Structure of a 48-bit Ethernet address.
@@ -69,6 +71,8 @@ struct ether_header {
struct ether_addr {
u_char octet[ETHER_ADDR_LEN];
} __packed;
+_Static_assert(sizeof(struct ether_addr) == ETHER_ADDR_LEN,
+ "size of struct ether_addr is wrong");
#define ETHER_IS_MULTICAST(addr) (*(addr) & 0x01) /* is address mcast/bcast? */
#define ETHER_IS_IPV6_MULTICAST(addr) \
@@ -81,6 +85,23 @@ struct ether_addr {
(addr)[3] | (addr)[4] | (addr)[5]) == 0x00)
/*
+ * 802.1q VID constants from IEEE 802.1Q-2014, table 9-2.
+ */
+
+/* Null VID: The tag contains only PCP (priority) and DEI information. */
+#define DOT1Q_VID_NULL 0x0
+/* The default PVID for a bridge port. NB: bridge(4) does not honor this. */
+#define DOT1Q_VID_DEF_PVID 0x1
+/* The default SR_PVID for SRP Stream related traffic. */
+#define DOT1Q_VID_DEF_SR_PVID 0x2
+/* A VID reserved for implementation use, not permitted on the wire. */
+#define DOT1Q_VID_RSVD_IMPL 0xfff
+/* The lowest valid VID. */
+#define DOT1Q_VID_MIN 0x1
+/* The highest valid VID. */
+#define DOT1Q_VID_MAX 0xffe
+
+/*
* This is the type of the VLAN ID inside the tag, not the tag itself.
*/
typedef uint16_t ether_vlanid_t;
@@ -95,6 +116,8 @@ struct ether_vlan_header {
uint16_t evl_tag;
uint16_t evl_proto;
} __packed;
+_Static_assert(sizeof(struct ether_vlan_header) == ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN,
+ "size of struct ether_vlan_header is wrong");
#define EVL_VLID_MASK 0x0FFF
#define EVL_PRI_MASK 0xE000
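
The new constants encode table 9-2 of IEEE 802.1Q-2014. Since DOT1Q_VID_NULL and DOT1Q_VID_RSVD_IMPL fall outside the [DOT1Q_VID_MIN, DOT1Q_VID_MAX] range, a simple range test rejects both reserved values; a hypothetical helper (not part of this header) illustrating the intended use:

	static inline bool
	dot1q_vid_valid(ether_vlanid_t vid)
	{
		/* Rejects DOT1Q_VID_NULL (0x0) and DOT1Q_VID_RSVD_IMPL (0xfff). */
		return (vid >= DOT1Q_VID_MIN && vid <= DOT1Q_VID_MAX);
	}

This is exactly the pair of VIDs that bridge_ioctl_sifvlanset() rejects in the if_bridge.c changes below.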
diff --git a/sys/net/if_bridge.c b/sys/net/if_bridge.c
index bc421a8e156d..5b3ee740d75e 100644
--- a/sys/net/if_bridge.c
+++ b/sys/net/if_bridge.c
@@ -254,6 +254,8 @@ struct bridge_iflist {
uint32_t bif_addrcnt; /* cur. # of addresses */
uint32_t bif_addrexceeded;/* # of address violations */
struct epoch_context bif_epoch_ctx;
+ ether_vlanid_t bif_untagged; /* untagged vlan id */
+ ifbvlan_set_t bif_vlan_set; /* allowed tagged vlans */
};
/*
@@ -331,13 +333,12 @@ static void bridge_inject(struct ifnet *, struct mbuf *);
static int bridge_output(struct ifnet *, struct mbuf *, struct sockaddr *,
struct rtentry *);
static int bridge_enqueue(struct bridge_softc *, struct ifnet *,
- struct mbuf *);
+ struct mbuf *, struct bridge_iflist *);
static void bridge_rtdelete(struct bridge_softc *, struct ifnet *ifp, int);
static void bridge_forward(struct bridge_softc *, struct bridge_iflist *,
struct mbuf *m);
static bool bridge_member_ifaddrs(void);
-
static void bridge_timer(void *);
static void bridge_broadcast(struct bridge_softc *, struct ifnet *,
@@ -353,6 +354,9 @@ static void bridge_rtage(struct bridge_softc *);
static void bridge_rtflush(struct bridge_softc *, int);
static int bridge_rtdaddr(struct bridge_softc *, const uint8_t *,
ether_vlanid_t);
+static bool bridge_vfilter_in(const struct bridge_iflist *, struct mbuf *);
+static bool bridge_vfilter_out(const struct bridge_iflist *,
+ const struct mbuf *);
static void bridge_rtable_init(struct bridge_softc *);
static void bridge_rtable_fini(struct bridge_softc *);
@@ -400,6 +404,9 @@ static int bridge_ioctl_sma(struct bridge_softc *, void *);
static int bridge_ioctl_sifprio(struct bridge_softc *, void *);
static int bridge_ioctl_sifcost(struct bridge_softc *, void *);
static int bridge_ioctl_sifmaxaddr(struct bridge_softc *, void *);
+static int bridge_ioctl_sifuntagged(struct bridge_softc *, void *);
+static int bridge_ioctl_sifvlanset(struct bridge_softc *, void *);
+static int bridge_ioctl_gifvlanset(struct bridge_softc *, void *);
static int bridge_ioctl_addspan(struct bridge_softc *, void *);
static int bridge_ioctl_delspan(struct bridge_softc *, void *);
static int bridge_ioctl_gbparam(struct bridge_softc *, void *);
@@ -618,6 +625,14 @@ static const struct bridge_control bridge_control_table[] = {
{ bridge_ioctl_sifmaxaddr, sizeof(struct ifbreq),
BC_F_COPYIN|BC_F_SUSER },
+ { bridge_ioctl_sifuntagged, sizeof(struct ifbreq),
+ BC_F_COPYIN|BC_F_SUSER },
+
+ { bridge_ioctl_sifvlanset, sizeof(struct ifbif_vlan_req),
+ BC_F_COPYIN|BC_F_SUSER },
+
+ { bridge_ioctl_gifvlanset, sizeof(struct ifbif_vlan_req),
+ BC_F_COPYIN|BC_F_COPYOUT },
};
static const int bridge_control_table_size = nitems(bridge_control_table);
@@ -832,6 +847,7 @@ bridge_clone_create(struct if_clone *ifc, char *name, size_t len,
ifp->if_softc = sc;
if_initname(ifp, bridge_name, ifd->unit);
ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
+ ifp->if_capabilities = ifp->if_capenable = IFCAP_VLAN_HWTAGGING;
ifp->if_ioctl = bridge_ioctl;
#ifdef ALTQ
ifp->if_start = bridge_altq_start;
@@ -954,6 +970,7 @@ bridge_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
struct ifbaconf ifbaconf;
struct ifbrparam ifbrparam;
struct ifbropreq ifbropreq;
+ struct ifbif_vlan_req ifvlanreq;
} args;
struct ifdrv *ifd = (struct ifdrv *) data;
const struct bridge_control *bc;
@@ -1495,6 +1512,7 @@ bridge_ioctl_gifflags(struct bridge_softc *sc, void *arg)
req->ifbr_addrcnt = bif->bif_addrcnt;
req->ifbr_addrmax = bif->bif_addrmax;
req->ifbr_addrexceeded = bif->bif_addrexceeded;
+ req->ifbr_untagged = bif->bif_untagged;
/* Copy STP state options as flags */
if (bp->bp_operedge)
@@ -1873,6 +1891,84 @@ bridge_ioctl_sifmaxaddr(struct bridge_softc *sc, void *arg)
}
static int
+bridge_ioctl_sifuntagged(struct bridge_softc *sc, void *arg)
+{
+ struct ifbreq *req = arg;
+ struct bridge_iflist *bif;
+
+ bif = bridge_lookup_member(sc, req->ifbr_ifsname);
+ if (bif == NULL)
+ return (ENOENT);
+
+ if (req->ifbr_untagged > DOT1Q_VID_MAX)
+ return (EINVAL);
+
+ if (req->ifbr_untagged != DOT1Q_VID_NULL)
+ bif->bif_flags |= IFBIF_VLANFILTER;
+ bif->bif_untagged = req->ifbr_untagged;
+ return (0);
+}
+
+static int
+bridge_ioctl_sifvlanset(struct bridge_softc *sc, void *arg)
+{
+ struct ifbif_vlan_req *req = arg;
+ struct bridge_iflist *bif;
+
+ bif = bridge_lookup_member(sc, req->bv_ifname);
+ if (bif == NULL)
+ return (ENOENT);
+
+ /* Reject invalid VIDs. */
+ if (BRVLAN_TEST(&req->bv_set, DOT1Q_VID_NULL) ||
+ BRVLAN_TEST(&req->bv_set, DOT1Q_VID_RSVD_IMPL))
+ return (EINVAL);
+
+ switch (req->bv_op) {
+ /* Replace the existing vlan set with the new set */
+ case BRDG_VLAN_OP_SET:
+ BIT_COPY(BRVLAN_SETSIZE, &req->bv_set, &bif->bif_vlan_set);
+ break;
+
+ /* Modify the existing vlan set to add the given vlans */
+ case BRDG_VLAN_OP_ADD:
+ BIT_OR(BRVLAN_SETSIZE, &bif->bif_vlan_set, &req->bv_set);
+ break;
+
+ /* Modify the existing vlan set to remove the given vlans */
+ case BRDG_VLAN_OP_DEL:
+ BIT_ANDNOT(BRVLAN_SETSIZE, &bif->bif_vlan_set, &req->bv_set);
+ break;
+
+ /* Invalid or unknown operation */
+ default:
+ return (EINVAL);
+ }
+
+ /*
+ * The only reason to modify the VLAN access list is to use VLAN
+ * filtering on this interface, so enable it automatically.
+ */
+ bif->bif_flags |= IFBIF_VLANFILTER;
+
+ return (0);
+}
+
+static int
+bridge_ioctl_gifvlanset(struct bridge_softc *sc, void *arg)
+{
+ struct ifbif_vlan_req *req = arg;
+ struct bridge_iflist *bif;
+
+ bif = bridge_lookup_member(sc, req->bv_ifname);
+ if (bif == NULL)
+ return (ENOENT);
+
+ BIT_COPY(BRVLAN_SETSIZE, &bif->bif_vlan_set, &req->bv_set);
+ return (0);
+}
+
+static int
bridge_ioctl_addspan(struct bridge_softc *sc, void *arg)
{
struct ifbreq *req = arg;
@@ -2150,12 +2246,25 @@ bridge_stop(struct ifnet *ifp, int disable)
*
*/
static int
-bridge_enqueue(struct bridge_softc *sc, struct ifnet *dst_ifp, struct mbuf *m)
+bridge_enqueue(struct bridge_softc *sc, struct ifnet *dst_ifp, struct mbuf *m,
+ struct bridge_iflist *bif)
{
int len, err = 0;
short mflags;
struct mbuf *m0;
+ /*
+ * Find the bridge member port this packet is being sent on, if the
+ * caller didn't already provide it.
+ */
+ if (bif == NULL)
+ bif = bridge_lookup_member_if(sc, dst_ifp);
+ if (bif == NULL) {
+ /* Perhaps the interface was removed from the bridge */
+ m_freem(m);
+ return (EINVAL);
+ }
+
/* We may be sending a fragment so traverse the mbuf */
for (; m; m = m0) {
m0 = m->m_nextpkt;
@@ -2164,6 +2273,18 @@ bridge_enqueue(struct bridge_softc *sc, struct ifnet *dst_ifp, struct mbuf *m)
mflags = m->m_flags;
/*
+ * If VLAN filtering is enabled, and the native VLAN ID of the
+ * outgoing interface matches the VLAN ID of the frame, remove
+ * the VLAN header.
+ */
+ if ((bif->bif_flags & IFBIF_VLANFILTER) &&
+ bif->bif_untagged != DOT1Q_VID_NULL &&
+ VLANTAGOF(m) == bif->bif_untagged) {
+ m->m_flags &= ~M_VLANTAG;
+ m->m_pkthdr.ether_vtag = 0;
+ }
+
+ /*
* If underlying interface can not do VLAN tag insertion itself
* then attach a packet tag that holds it.
*/
@@ -2234,7 +2355,7 @@ bridge_dummynet(struct mbuf *m, struct ifnet *ifp)
return;
}
- bridge_enqueue(sc, ifp, m);
+ bridge_enqueue(sc, ifp, m, NULL);
}
/*
@@ -2329,7 +2450,7 @@ bridge_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *sa,
}
}
- bridge_enqueue(sc, dst_if, mc);
+ bridge_enqueue(sc, dst_if, mc, bif);
}
if (used == 0)
m_freem(m);
@@ -2347,7 +2468,7 @@ sendunicast:
return (0);
}
- bridge_enqueue(sc, dst_if, m);
+ bridge_enqueue(sc, dst_if, m, NULL);
return (0);
}
@@ -2364,17 +2485,18 @@ bridge_transmit(struct ifnet *ifp, struct mbuf *m)
struct ether_header *eh;
struct ifnet *dst_if;
int error = 0;
+ ether_vlanid_t vlan;
sc = ifp->if_softc;
ETHER_BPF_MTAP(ifp, m);
eh = mtod(m, struct ether_header *);
+ vlan = VLANTAGOF(m);
if (((m->m_flags & (M_BCAST|M_MCAST)) == 0) &&
- (dst_if = bridge_rtlookup(sc, eh->ether_dhost, DOT1Q_VID_NULL)) !=
- NULL) {
- error = bridge_enqueue(sc, dst_if, m);
+ (dst_if = bridge_rtlookup(sc, eh->ether_dhost, vlan)) != NULL) {
+ error = bridge_enqueue(sc, dst_if, m, NULL);
} else
bridge_broadcast(sc, ifp, m, 0);
@@ -2435,18 +2557,18 @@ bridge_forward(struct bridge_softc *sc, struct bridge_iflist *sbif,
struct bridge_iflist *dbif;
struct ifnet *src_if, *dst_if, *ifp;
struct ether_header *eh;
- uint16_t vlan;
uint8_t *dst;
int error;
+ ether_vlanid_t vlan;
NET_EPOCH_ASSERT();
src_if = m->m_pkthdr.rcvif;
ifp = sc->sc_ifp;
+ vlan = VLANTAGOF(m);
if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len);
- vlan = VLANTAGOF(m);
if ((sbif->bif_flags & IFBIF_STP) &&
sbif->bif_stp.bp_state == BSTP_IFSTATE_DISCARDING)
@@ -2555,6 +2677,10 @@ bridge_forward(struct bridge_softc *sc, struct bridge_iflist *sbif,
if (sbif->bif_flags & dbif->bif_flags & IFBIF_PRIVATE)
goto drop;
+ /* Do VLAN filtering. */
+ if (!bridge_vfilter_out(dbif, m))
+ goto drop;
+
if ((dbif->bif_flags & IFBIF_STP) &&
dbif->bif_stp.bp_state == BSTP_IFSTATE_DISCARDING)
goto drop;
@@ -2566,7 +2692,7 @@ bridge_forward(struct bridge_softc *sc, struct bridge_iflist *sbif,
return;
}
- bridge_enqueue(sc, dst_if, m);
+ bridge_enqueue(sc, dst_if, m, dbif);
return;
drop:
@@ -2636,6 +2762,15 @@ bridge_input(struct ifnet *ifp, struct mbuf *m)
return (NULL);
}
+ /* Do VLAN filtering. */
+ if (!bridge_vfilter_in(bif, m)) {
+ if_inc_counter(sc->sc_ifp, IFCOUNTER_IERRORS, 1);
+ m_freem(m);
+ return (NULL);
+ }
+ /* bridge_vfilter_in() may add a tag */
+ vlan = VLANTAGOF(m);
+
bridge_span(sc, m);
if (m->m_flags & (M_BCAST|M_MCAST)) {
@@ -2761,6 +2896,15 @@ bridge_input(struct ifnet *ifp, struct mbuf *m)
} \
if ((iface) != bifp) \
ETHER_BPF_MTAP(iface, m); \
+ /* Pass tagged packets to if_vlan, if it's loaded */ \
+ if (VLANTAGOF(m) != 0) { \
+ if (bifp->if_vlantrunk == NULL) { \
+ m_freem(m); \
+ return (NULL); \
+ } \
+ (*vlan_input_p)(bifp, m); \
+ return (NULL); \
+ } \
return (m); \
} \
\
@@ -2817,6 +2961,30 @@ bridge_inject(struct ifnet *ifp, struct mbuf *m)
{
struct bridge_softc *sc;
+ if (ifp->if_type == IFT_L2VLAN) {
+ /*
+ * vlan(4) gives us the vlan ifnet, so we need to get the
+ * bridge softc to get a pointer to ether_input to send the
+ * packet to.
+ */
+ struct ifnet *bifp = NULL;
+
+ if (vlan_trunkdev_p == NULL) {
+ m_freem(m);
+ return;
+ }
+
+ bifp = vlan_trunkdev_p(ifp);
+ if (bifp == NULL) {
+ m_freem(m);
+ return;
+ }
+
+ sc = if_getsoftc(bifp);
+ sc->sc_if_input(ifp, m);
+ return;
+ }
+
KASSERT((if_getcapenable(ifp) & IFCAP_NETMAP) != 0,
("%s: iface %s is not running in netmap mode",
__func__, if_name(ifp)));
@@ -2867,6 +3035,10 @@ bridge_broadcast(struct bridge_softc *sc, struct ifnet *src_if,
if (sbif && (sbif->bif_flags & dbif->bif_flags & IFBIF_PRIVATE))
continue;
+ /* Do VLAN filtering. */
+ if (!bridge_vfilter_out(dbif, m))
+ continue;
+
if ((dbif->bif_flags & IFBIF_STP) &&
dbif->bif_stp.bp_state == BSTP_IFSTATE_DISCARDING)
continue;
@@ -2910,7 +3082,7 @@ bridge_broadcast(struct bridge_softc *sc, struct ifnet *src_if,
continue;
}
- bridge_enqueue(sc, dst_if, mc);
+ bridge_enqueue(sc, dst_if, mc, dbif);
}
if (used == 0)
m_freem(m);
@@ -2946,11 +3118,116 @@ bridge_span(struct bridge_softc *sc, struct mbuf *m)
continue;
}
- bridge_enqueue(sc, dst_if, mc);
+ bridge_enqueue(sc, dst_if, mc, bif);
}
}
/*
+ * Incoming VLAN filtering. Given a frame and the member interface it was
+ * received on, decide whether the port configuration allows it.
+ */
+static bool
+bridge_vfilter_in(const struct bridge_iflist *sbif, struct mbuf *m)
+{
+ ether_vlanid_t vlan;
+
+ vlan = VLANTAGOF(m);
+ /* Make sure the vlan id is reasonable. */
+ if (vlan > DOT1Q_VID_MAX)
+ return (false);
+
+ /* If VLAN filtering isn't enabled, pass everything. */
+ if ((sbif->bif_flags & IFBIF_VLANFILTER) == 0)
+ return (true);
+
+ if (vlan == DOT1Q_VID_NULL) {
+ /*
+ * The frame doesn't have a tag. If the interface does not
+ * have an untagged vlan configured, drop the frame.
+ */
+ if (sbif->bif_untagged == DOT1Q_VID_NULL)
+ return (false);
+
+ /*
+ * Otherwise, insert a new tag based on the interface's
+ * untagged vlan id.
+ */
+ m->m_pkthdr.ether_vtag = sbif->bif_untagged;
+ m->m_flags |= M_VLANTAG;
+ } else {
+ /*
+ * The frame has a tag, so check it matches the interface's
+ * vlan access list. We explicitly do not accept tagged
+ * frames for the untagged vlan id here (unless it's also
+ * in the access list).
+ */
+ if (!BRVLAN_TEST(&sbif->bif_vlan_set, vlan))
+ return (false);
+ }
+
+ /* Accept the frame. */
+ return (true);
+}
+
+/*
+ * Outgoing VLAN filtering. Given a frame, its vlan, and the member interface
+ * we intend to send it to, decide whether the port configuration allows it to
+ * be sent.
+ */
+static bool
+bridge_vfilter_out(const struct bridge_iflist *dbif, const struct mbuf *m)
+{
+ struct ether_header *eh;
+ ether_vlanid_t vlan;
+
+ NET_EPOCH_ASSERT();
+
+ /* If VLAN filtering isn't enabled, pass everything. */
+ if ((dbif->bif_flags & IFBIF_VLANFILTER) == 0)
+ return (true);
+
+ vlan = VLANTAGOF(m);
+
+ /*
+ * Always allow untagged 802.1D STP frames, even if they would
+ * otherwise be dropped. This is required for STP to work on
+ * a filtering bridge.
+ *
+ * Tagged STP (Cisco PVST+) is a non-standard extension, so
+ * handle those frames via the normal filtering path.
+ */
+ eh = mtod(m, struct ether_header *);
+ if (vlan == DOT1Q_VID_NULL &&
+ memcmp(eh->ether_dhost, bstp_etheraddr, ETHER_ADDR_LEN) == 0)
+ return (true);
+
+ /*
+ * If the frame wasn't assigned to a vlan at ingress, drop it.
+ * We can't forward these frames to filtering ports because we
+ * don't know what VLAN they're supposed to be in.
+ */
+ if (vlan == DOT1Q_VID_NULL)
+ return (false);
+
+ /*
+ * If the frame's vlan matches the interface's untagged vlan,
+ * allow it.
+ */
+ if (vlan == dbif->bif_untagged)
+ return (true);
+
+ /*
+ * If the frame's vlan is on the interface's tagged access list,
+ * allow it.
+ */
+ if (BRVLAN_TEST(&dbif->bif_vlan_set, vlan))
+ return (true);
+
+ /* The frame was not permitted, so drop it. */
+ return (false);
+}
+
+/*
* bridge_rtupdate:
*
* Add a bridge routing entry.
diff --git a/sys/net/if_bridgevar.h b/sys/net/if_bridgevar.h
index 90beb6c96d82..97b63e3d4416 100644
--- a/sys/net/if_bridgevar.h
+++ b/sys/net/if_bridgevar.h
@@ -78,6 +78,8 @@
#define _NET_IF_BRIDGEVAR_H_
#include <sys/types.h>
+#include <sys/_bitset.h>
+#include <sys/bitset.h>
#include <sys/callout.h>
#include <sys/queue.h>
#include <sys/condvar.h>
@@ -122,6 +124,9 @@
#define BRDGSPROTO 28 /* set protocol (ifbrparam) */
#define BRDGSTXHC 29 /* set tx hold count (ifbrparam) */
#define BRDGSIFAMAX 30 /* set max interface addrs (ifbreq) */
+#define BRDGSIFUNTAGGED 31 /* set if untagged vlan */
+#define BRDGSIFVLANSET 32 /* set if vlan set */
+#define BRDGGIFVLANSET 33 /* get if vlan set */
/*
* Generic bridge control request.
@@ -139,6 +144,7 @@ struct ifbreq {
uint32_t ifbr_addrcnt; /* member if addr number */
uint32_t ifbr_addrmax; /* member if addr max */
uint32_t ifbr_addrexceeded; /* member if addr violations */
+ ether_vlanid_t ifbr_untagged; /* member if untagged vlan */
uint8_t pad[32];
};
@@ -155,10 +161,11 @@ struct ifbreq {
#define IFBIF_BSTP_ADMEDGE 0x0200 /* member stp admin edge enabled */
#define IFBIF_BSTP_ADMCOST 0x0400 /* member stp admin path cost */
#define IFBIF_PRIVATE 0x0800 /* if is a private segment */
+#define IFBIF_VLANFILTER 0x1000 /* if does vlan filtering */
#define IFBIFBITS "\020\001LEARNING\002DISCOVER\003STP\004SPAN" \
"\005STICKY\014PRIVATE\006EDGE\007AUTOEDGE\010PTP" \
- "\011AUTOPTP"
+ "\011AUTOPTP\015VLANFILTER"
#define IFBIFMASK ~(IFBIF_BSTP_EDGE|IFBIF_BSTP_AUTOEDGE|IFBIF_BSTP_PTP| \
IFBIF_BSTP_AUTOPTP|IFBIF_BSTP_ADMEDGE| \
IFBIF_BSTP_ADMCOST) /* not saved */
@@ -304,6 +311,26 @@ struct ifbpstpconf {
eaddr[5] = pv >> 0; \
} while (0)
+/*
+ * Bridge VLAN access request.
+ */
+#define BRVLAN_SETSIZE 4096
+typedef __BITSET_DEFINE(ifbvlan_set, BRVLAN_SETSIZE) ifbvlan_set_t;
+
+#define BRVLAN_SET(set, bit) __BIT_SET(BRVLAN_SETSIZE, (bit), set)
+#define BRVLAN_CLR(set, bit) __BIT_CLR(BRVLAN_SETSIZE, (bit), set)
+#define BRVLAN_TEST(set, bit) __BIT_ISSET(BRVLAN_SETSIZE, (bit), set)
+
+#define BRDG_VLAN_OP_SET 1 /* replace current vlan set */
+#define BRDG_VLAN_OP_ADD 2 /* add vlans to current set */
+#define BRDG_VLAN_OP_DEL 3 /* remove vlans from current set */
+
+struct ifbif_vlan_req {
+ char bv_ifname[IFNAMSIZ];
+ uint8_t bv_op;
+ ifbvlan_set_t bv_set;
+};
+
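[Editor's sketch, hypothetical and not part of this diff: the new requests ride the usual bridge control path, i.e. wrapped in a struct ifdrv and issued via SIOCSDRVSPEC with ifd_cmd set to one of the BRDG* values above, exactly as bridge_ioctl() expects. A minimal userland example, assuming "s" is a datagram socket such as socket(AF_INET, SOCK_DGRAM, 0); error handling elided:

	#include <sys/ioctl.h>
	#include <sys/socket.h>
	#include <net/if.h>
	#include <net/ethernet.h>
	#include <net/if_bridgevar.h>
	#include <string.h>

	static void
	bridge_vlan_example(int s)
	{
		struct ifdrv ifd;
		struct ifbif_vlan_req vreq;
		struct ifbreq breq;

		/* Allow tagged VLANs 10..19 on member em0. */
		memset(&vreq, 0, sizeof(vreq));
		strlcpy(vreq.bv_ifname, "em0", sizeof(vreq.bv_ifname));
		vreq.bv_op = BRDG_VLAN_OP_SET;
		for (int vid = 10; vid < 20; vid++)
			BRVLAN_SET(&vreq.bv_set, vid);

		memset(&ifd, 0, sizeof(ifd));
		strlcpy(ifd.ifd_name, "bridge0", sizeof(ifd.ifd_name));
		ifd.ifd_cmd = BRDGSIFVLANSET;
		ifd.ifd_len = sizeof(vreq);
		ifd.ifd_data = &vreq;
		(void)ioctl(s, SIOCSDRVSPEC, &ifd);

		/* Make VLAN 5 em0's untagged vlan; this also turns on
		 * IFBIF_VLANFILTER, per bridge_ioctl_sifuntagged(). */
		memset(&breq, 0, sizeof(breq));
		strlcpy(breq.ifbr_ifsname, "em0", sizeof(breq.ifbr_ifsname));
		breq.ifbr_untagged = 5;
		ifd.ifd_cmd = BRDGSIFUNTAGGED;
		ifd.ifd_len = sizeof(breq);
		ifd.ifd_data = &breq;
		(void)ioctl(s, SIOCSDRVSPEC, &ifd);
	}
]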
#ifdef _KERNEL
#define BRIDGE_INPUT(_ifp, _m) do { \
diff --git a/sys/net/if_ethersubr.c b/sys/net/if_ethersubr.c
index cf697089708c..3ae0c01c0efc 100644
--- a/sys/net/if_ethersubr.c
+++ b/sys/net/if_ethersubr.c
@@ -92,11 +92,6 @@
#include <crypto/sha1.h>
-#ifdef CTASSERT
-CTASSERT(sizeof (struct ether_header) == ETHER_ADDR_LEN * 2 + 2);
-CTASSERT(sizeof (struct ether_addr) == ETHER_ADDR_LEN);
-#endif
-
VNET_DEFINE(pfil_head_t, link_pfil_head); /* Packet filter hooks */
/* netgraph node hooks for ng_ether(4) */
@@ -1510,9 +1505,7 @@ ether_gen_addr_byname(const char *nameunit, struct ether_addr *hwaddr)
SHA1Final(digest, &ctx);
free(buf, M_TEMP);
- addr = ((digest[0] << 16) | (digest[1] << 8) | digest[2]) &
- OUI_FREEBSD_GENERATED_MASK;
- addr = OUI_FREEBSD(addr);
+ addr = (digest[0] << 8) | digest[1] | OUI_FREEBSD_GENERATED_LOW;
for (i = 0; i < ETHER_ADDR_LEN; ++i) {
hwaddr->octet[i] = addr >> ((ETHER_ADDR_LEN - i - 1) * 8) &
0xFF;
diff --git a/sys/net/if_gif.h b/sys/net/if_gif.h
index 3c1846b8f82a..c6692d3dd6bc 100644
--- a/sys/net/if_gif.h
+++ b/sys/net/if_gif.h
@@ -120,7 +120,8 @@ int in6_gif_setopts(struct gif_softc *, u_int);
#define GIFGOPTS _IOWR('i', 150, struct ifreq)
#define GIFSOPTS _IOW('i', 151, struct ifreq)
+#define GIF_NOCLAMP 0x0001
#define GIF_IGNORE_SOURCE 0x0002
-#define GIF_OPTMASK (GIF_IGNORE_SOURCE)
+#define GIF_OPTMASK (GIF_NOCLAMP|GIF_IGNORE_SOURCE)
#endif /* _NET_IF_GIF_H_ */
diff --git a/sys/net/if_lagg.c b/sys/net/if_lagg.c
index 9867a718e148..5b52bfa80e3b 100644
--- a/sys/net/if_lagg.c
+++ b/sys/net/if_lagg.c
@@ -718,6 +718,7 @@ lagg_capabilities(struct lagg_softc *sc)
sc->sc_ifp->if_capenable = ena;
sc->sc_ifp->if_capenable2 = ena2;
sc->sc_ifp->if_hwassist = hwa;
+ (void)if_hw_tsomax_update(sc->sc_ifp, &hw_tsomax);
getmicrotime(&sc->sc_ifp->if_lastchange);
if (sc->sc_ifflags & IFF_DEBUG)
diff --git a/sys/net/if_vlan.c b/sys/net/if_vlan.c
index e9e1c82cb688..22fcb7bf7c64 100644
--- a/sys/net/if_vlan.c
+++ b/sys/net/if_vlan.c
@@ -1673,6 +1673,7 @@ vlan_config(struct ifvlan *ifv, struct ifnet *p, uint16_t vid,
*/
if (p->if_type != IFT_ETHER &&
p->if_type != IFT_L2VLAN &&
+ p->if_type != IFT_BRIDGE &&
(p->if_capenable & IFCAP_VLAN_HWTAGGING) == 0)
return (EPROTONOSUPPORT);
if ((p->if_flags & VLAN_IFFLAGS) != VLAN_IFFLAGS)
diff --git a/sys/net/if_vlan_var.h b/sys/net/if_vlan_var.h
index f0b09445d04b..695bb81f77b3 100644
--- a/sys/net/if_vlan_var.h
+++ b/sys/net/if_vlan_var.h
@@ -126,13 +126,6 @@ struct vlanreq {
#define VLAN_PCP_MAX 7
-#define DOT1Q_VID_NULL 0x0
-#define DOT1Q_VID_DEF_PVID 0x1
-#define DOT1Q_VID_DEF_SR_PVID 0x2
-#define DOT1Q_VID_RSVD_IMPL 0xfff
-#define DOT1Q_VID_MIN 1 /* minimum valid vlan id */
-#define DOT1Q_VID_MAX 4094 /* maximum valid vlan id */
-
/*
* 802.1q full tag. Proto and vid are stored in host byte order.
*/
diff --git a/sys/net/pfvar.h b/sys/net/pfvar.h
index 71cb1862aabf..452a8eb4024b 100644
--- a/sys/net/pfvar.h
+++ b/sys/net/pfvar.h
@@ -508,18 +508,6 @@ extern struct sx pf_end_lock;
(c == AF_INET6 && !(a)->addr32[0] && !(a)->addr32[1] && \
!(a)->addr32[2] && !(a)->addr32[3] )) \
-#define PF_MATCHA(n, a, m, b, f) \
- pf_match_addr(n, a, m, b, f)
-
-#define PF_ACPY(a, b, f) \
- pf_addrcpy(a, b, f)
-
-#define PF_AINC(a, f) \
- pf_addr_inc(a, f)
-
-#define PF_POOLMASK(a, b, c, d, f) \
- pf_poolmask(a, b, c, d, f)
-
#else
/* Just IPv6 */
@@ -544,18 +532,6 @@ extern struct sx pf_end_lock;
!(a)->addr32[2] && \
!(a)->addr32[3] ) \
-#define PF_MATCHA(n, a, m, b, f) \
- pf_match_addr(n, a, m, b, f)
-
-#define PF_ACPY(a, b, f) \
- pf_addrcpy(a, b, f)
-
-#define PF_AINC(a, f) \
- pf_addr_inc(a, f)
-
-#define PF_POOLMASK(a, b, c, d, f) \
- pf_poolmask(a, b, c, d, f)
-
#else
/* Just IPv4 */
@@ -570,29 +546,14 @@ extern struct sx pf_end_lock;
#define PF_AZERO(a, c) \
(!(a)->addr32[0])
-#define PF_MATCHA(n, a, m, b, f) \
- pf_match_addr(n, a, m, b, f)
-
-#define PF_ACPY(a, b, f) \
- (a)->v4.s_addr = (b)->v4.s_addr
-
-#define PF_AINC(a, f) \
- do { \
- (a)->addr32[0] = htonl(ntohl((a)->addr32[0]) + 1); \
- } while (0)
-
-#define PF_POOLMASK(a, b, c, d, f) \
- do { \
- (a)->addr32[0] = ((b)->addr32[0] & (c)->addr32[0]) | \
- (((c)->addr32[0] ^ 0xffffffff ) & (d)->addr32[0]); \
- } while (0)
-
#endif /* PF_INET_ONLY */
#endif /* PF_INET6_ONLY */
#endif /* PF_INET_INET6 */
#ifdef _KERNEL
-#ifdef INET6
+
+void unhandled_af(int) __dead2;
+
static void inline
pf_addrcpy(struct pf_addr *dst, const struct pf_addr *src, sa_family_t af)
{
@@ -602,12 +563,15 @@ pf_addrcpy(struct pf_addr *dst, const struct pf_addr *src, sa_family_t af)
memcpy(&dst->v4, &src->v4, sizeof(dst->v4));
break;
#endif /* INET */
+#ifdef INET6
case AF_INET6:
memcpy(&dst->v6, &src->v6, sizeof(dst->v6));
break;
+#endif /* INET6 */
+ default:
+ unhandled_af(af);
}
}
-#endif /* INET6 */
#endif
/*
@@ -629,7 +593,7 @@ pf_addrcpy(struct pf_addr *dst, const struct pf_addr *src, sa_family_t af)
&(aw)->v.a.mask, (x), (af))) || \
((aw)->type == PF_ADDR_ADDRMASK && \
!PF_AZERO(&(aw)->v.a.mask, (af)) && \
- !PF_MATCHA(0, &(aw)->v.a.addr, \
+ !pf_match_addr(0, &(aw)->v.a.addr, \
&(aw)->v.a.mask, (x), (af))))) != \
(neg) \
)
@@ -1406,7 +1370,6 @@ struct pf_kruleset {
struct pf_krulequeue queues[2];
struct {
struct pf_krulequeue *ptr;
- struct pf_krule **ptr_array;
u_int32_t rcount;
u_int32_t ticket;
int open;
@@ -2341,7 +2304,6 @@ VNET_DECLARE(struct pf_krule *, pf_rulemarker);
#define V_pf_rulemarker VNET(pf_rulemarker)
#endif
-void unhandled_af(int) __dead2;
int pf_start(void);
int pf_stop(void);
void pf_initialize(void);
@@ -2477,11 +2439,11 @@ int pf_test(sa_family_t, int, int, struct ifnet *, struct mbuf **, struct inpcb
int pf_normalize_ip(u_short *, struct pf_pdesc *);
#endif /* INET */
-#ifdef INET6
-int pf_normalize_ip6(int, u_short *, struct pf_pdesc *);
void pf_poolmask(struct pf_addr *, struct pf_addr*,
struct pf_addr *, struct pf_addr *, sa_family_t);
void pf_addr_inc(struct pf_addr *, sa_family_t);
+#ifdef INET6
+int pf_normalize_ip6(int, u_short *, struct pf_pdesc *);
int pf_max_frag_size(struct mbuf *);
int pf_refragment6(struct ifnet *, struct mbuf **, struct m_tag *,
struct ifnet *, bool);
@@ -2537,7 +2499,7 @@ int pfr_match_addr(struct pfr_ktable *, struct pf_addr *, sa_family_t);
void pfr_update_stats(struct pfr_ktable *, struct pf_addr *, sa_family_t,
u_int64_t, int, int, int);
int pfr_pool_get(struct pfr_ktable *, int *, struct pf_addr *, sa_family_t,
- pf_addr_filter_func_t);
+ pf_addr_filter_func_t, bool);
void pfr_dynaddr_update(struct pfr_ktable *, struct pfi_dynaddr *);
struct pfr_ktable *
pfr_attach_table(struct pf_kruleset *, char *);
@@ -2571,6 +2533,8 @@ int pfr_ina_rollback(struct pfr_table *, u_int32_t, int *, int);
int pfr_ina_commit(struct pfr_table *, u_int32_t, int *, int *, int);
int pfr_ina_define(struct pfr_table *, struct pfr_addr *, int, int *,
int *, u_int32_t, int);
+struct pfr_ktable
+ *pfr_ktable_select_active(struct pfr_ktable *);
MALLOC_DECLARE(PFI_MTYPE);
VNET_DECLARE(struct pfi_kkif *, pfi_all);
@@ -2674,11 +2638,10 @@ int pf_kanchor_copyout(const struct pf_kruleset *,
const struct pf_krule *, char *, size_t);
int pf_kanchor_nvcopyout(const struct pf_kruleset *,
const struct pf_krule *, nvlist_t *);
-void pf_kanchor_remove(struct pf_krule *);
+void pf_remove_kanchor(struct pf_krule *);
void pf_remove_if_empty_kruleset(struct pf_kruleset *);
struct pf_kruleset *pf_find_kruleset(const char *);
struct pf_kruleset *pf_get_leaf_kruleset(char *, char **);
-struct pf_kanchor *pf_create_kanchor(struct pf_kanchor *, const char *);
struct pf_kruleset *pf_find_or_create_kruleset(const char *);
void pf_rs_initialize(void);
@@ -2712,6 +2675,7 @@ int pf_ioctl_get_addrs(struct pf_nl_pooladdr *);
int pf_ioctl_get_addr(struct pf_nl_pooladdr *);
int pf_ioctl_get_rulesets(struct pfioc_ruleset *);
int pf_ioctl_get_ruleset(struct pfioc_ruleset *);
+int pf_ioctl_natlook(struct pfioc_natlook *);
void pf_krule_free(struct pf_krule *);
void pf_krule_clear_counters(struct pf_krule *);
@@ -2749,7 +2713,6 @@ u_short pf_map_addr(u_int8_t, struct pf_krule *,
u_short pf_map_addr_sn(u_int8_t, struct pf_krule *,
struct pf_addr *, struct pf_addr *,
struct pfi_kkif **nkif, struct pf_addr *,
- struct pf_ksrc_node **, struct pf_srchash **,
struct pf_kpool *, pf_sn_types_t);
int pf_get_transaddr_af(struct pf_krule *,
struct pf_pdesc *);
diff --git a/sys/net80211/ieee80211_hostap.c b/sys/net80211/ieee80211_hostap.c
index c5a478533313..9074878e17e4 100644
--- a/sys/net80211/ieee80211_hostap.c
+++ b/sys/net80211/ieee80211_hostap.c
@@ -2214,12 +2214,9 @@ hostap_recv_mgmt(struct ieee80211_node *ni, struct mbuf *m0,
/* VHT */
if (IEEE80211_IS_CHAN_VHT(ni->ni_chan) &&
- vhtcap != NULL &&
- vhtinfo != NULL) {
- /* XXX TODO; see below */
- net80211_vap_printf(vap, "%s: VHT TODO!\n", __func__);
+ vhtcap != NULL) {
ieee80211_vht_node_init(ni);
- ieee80211_vht_update_cap(ni, vhtcap, vhtinfo);
+ ieee80211_vht_update_cap(ni, vhtcap);
} else if (ni->ni_flags & IEEE80211_NODE_VHT)
ieee80211_vht_node_cleanup(ni);
diff --git a/sys/net80211/ieee80211_ht.c b/sys/net80211/ieee80211_ht.c
index 5ec80e3646b8..c28f124648a1 100644
--- a/sys/net80211/ieee80211_ht.c
+++ b/sys/net80211/ieee80211_ht.c
@@ -1952,6 +1952,11 @@ do { \
_RETURN_CHAN_BITS(0);
/*
+ * TODO: should we bail out if there's no htinfo?
+ * Or just treat it as if we can't do the HT20/HT40 check?
+ */
+
+ /*
* The original code was based on
* 802.11ac-2013, Table 8-183x-VHT Operation Information subfields.
* 802.11-2020, Table 9-274-VHT Operation Information subfields
@@ -1962,8 +1967,12 @@ do { \
*/
htinfo = (const struct ieee80211_ie_htinfo *)ni->ni_ies.htinfo_ie;
- ht40 = ((htinfo->hi_byte1 & IEEE80211_HTINFO_TXWIDTH) ==
- IEEE80211_HTINFO_TXWIDTH_2040);
+ if (htinfo != NULL)
+ ht40 = ((htinfo->hi_byte1 & IEEE80211_HTINFO_TXWIDTH) ==
+ IEEE80211_HTINFO_TXWIDTH_2040);
+ else
+ ht40 = false;
+
can_vht160 = can_vht80p80 = can_vht80 = false;
/* 20 Mhz */
diff --git a/sys/net80211/ieee80211_node.c b/sys/net80211/ieee80211_node.c
index ad17af6778a1..a201d1b278f0 100644
--- a/sys/net80211/ieee80211_node.c
+++ b/sys/net80211/ieee80211_node.c
@@ -3138,6 +3138,36 @@ ieee80211_getsignal(struct ieee80211vap *vap, int8_t *rssi, int8_t *noise)
}
/**
+ * @brief Fetch and post-increment the TX sequence number for the given TID.
+ *
+ * @param ni ieee80211_node to operate on
+ * @param tid TID, or IEEE80211_NONQOS_TID
+ * @returns the sequence number before the increment, 0 .. 4095 inclusive
+ */
+ieee80211_seq ieee80211_tx_seqno_fetch_incr(struct ieee80211_node *ni,
+ uint8_t tid)
+{
+ ieee80211_seq seq;
+
+ seq = ni->ni_txseqs[tid];
+ ni->ni_txseqs[tid] = (ni->ni_txseqs[tid] + 1) % IEEE80211_SEQ_RANGE;
+ return (seq);
+}
+
+/**
+ * @brief Return the current sequence number for the given TID
+ *
+ * @param ni ieee80211_node to operate on
+ * @param tid TID, or IEEE80211_NONQOS_TID
+ * @returns sequence number, from 0 .. 4095 inclusive
+ */
+ieee80211_seq ieee80211_tx_seqno_fetch(const struct ieee80211_node *ni,
+ uint8_t tid)
+{
+ return (ni->ni_txseqs[tid]);
+}
+
+/**
* @brief return a dot11rate / ratecode representing the current transmit rate
*
* This is the API call for legacy / 802.11n drivers and rate control APIs
diff --git a/sys/net80211/ieee80211_node.h b/sys/net80211/ieee80211_node.h
index c83eee04a8dc..ef25fa0d7fdd 100644
--- a/sys/net80211/ieee80211_node.h
+++ b/sys/net80211/ieee80211_node.h
@@ -531,6 +531,12 @@ void ieee80211_node_leave(struct ieee80211_node *);
int8_t ieee80211_getrssi(struct ieee80211vap *);
void ieee80211_getsignal(struct ieee80211vap *, int8_t *, int8_t *);
+/* TX sequence space related routines */
+ieee80211_seq ieee80211_tx_seqno_fetch_incr(struct ieee80211_node *,
+ uint8_t);
+ieee80211_seq ieee80211_tx_seqno_fetch(const struct ieee80211_node *,
+ uint8_t);
+
/*
* Node transmit rate specific manipulation.
*
diff --git a/sys/net80211/ieee80211_output.c b/sys/net80211/ieee80211_output.c
index a4151f807882..afe83ea0805c 100644
--- a/sys/net80211/ieee80211_output.c
+++ b/sys/net80211/ieee80211_output.c
@@ -4195,17 +4195,15 @@ ieee80211_tx_complete(struct ieee80211_node *ni, struct mbuf *m, int status)
* Check the frame type and TID and assign a suitable sequence number
* from the correct sequence number space.
*
+ * This implements the components of 802.11-2020 10.3.2.14.2
+ * (Transmitter Requirements) that net80211 currently supports.
+ *
* It assumes the mbuf has been encapsulated, and has the TID assigned
* if it is a QoS frame.
*
* Note this also clears any existing fragment ID in the header, so it
* must be called first before assigning fragment IDs.
*
- * For now this implements parts of 802.11-2012; it doesn't do all of
- * the needed checks for full compliance (notably QoS-Data NULL frames).
- *
- * TODO: update to 802.11-2020 10.3.2.14.2 (Transmitter Requirements)
- *
* @param ni ieee80211_node this frame will be transmitted to
* @param arg_tid A temporary check, existing callers may set
* this to a TID variable they were using, and this routine
@@ -4239,16 +4237,30 @@ ieee80211_output_seqno_assign(struct ieee80211_node *ni, int arg_tid,
"%s: called; TID mismatch; tid=%u, arg_tid=%d\n",
__func__, tid, arg_tid);
- if (IEEE80211_HAS_SEQ(type, subtype)) {
- /*
- * 802.11-2012 9.3.2.10 - QoS multicast frames
- * come out of a different seqno space.
- */
- if (IEEE80211_IS_MULTICAST(wh->i_addr1))
- seqno = ni->ni_txseqs[IEEE80211_NONQOS_TID]++;
- else
- seqno = ni->ni_txseqs[tid]++;
- } else
+
+ /* 802.11-2020 10.3.2.14.2 (Transmitter Requirements) sections */
+
+ /* SNS7 - unicast PV1 management frame */
+
+ /* SNS6 - unicast PV1 data frame */
+
+ /* SNS5 - QoS NULL frames */
+ if (IEEE80211_QOS_HAS_SEQ(wh) && IEEE80211_IS_QOS_NULL(wh))
+ seqno = ieee80211_tx_seqno_fetch_incr(ni, IEEE80211_NONQOS_TID);
+
+ /* SNS4 - QMF STA transmitting a QMF */
+
+ /* SNS3 - QoS STA; Time Priority Management frame */
+
+ /* SNS2 - unicast QoS STA, data frame, excluding SNS5 */
+ else if (IEEE80211_QOS_HAS_SEQ(wh) &&
+ !IEEE80211_IS_MULTICAST(wh->i_addr1))
+ seqno = ieee80211_tx_seqno_fetch_incr(ni, tid);
+
+ /* SNS1 - Baseline (everything else) */
+ else if (IEEE80211_HAS_SEQ(type, subtype))
+ seqno = ieee80211_tx_seqno_fetch_incr(ni, IEEE80211_NONQOS_TID);
+ else
seqno = 0;
/*
@@ -4276,7 +4288,7 @@ ieee80211_output_beacon_seqno_assign(struct ieee80211_node *ni, struct mbuf *m)
wh = mtod(m, struct ieee80211_frame *);
- seqno = ni->ni_txseqs[IEEE80211_NONQOS_TID]++;
+ seqno = ieee80211_tx_seqno_fetch_incr(ni, IEEE80211_NONQOS_TID);
*(uint16_t *)&wh->i_seq[0] =
htole16(seqno << IEEE80211_SEQ_SEQ_SHIFT);
M_SEQNO_SET(m, seqno);
diff --git a/sys/net80211/ieee80211_vht.c b/sys/net80211/ieee80211_vht.c
index e91977f1ef98..de0b691d4d2a 100644
--- a/sys/net80211/ieee80211_vht.c
+++ b/sys/net80211/ieee80211_vht.c
@@ -838,12 +838,10 @@ ieee80211_add_vhtinfo(uint8_t *frm, struct ieee80211_node *ni)
}
void
-ieee80211_vht_update_cap(struct ieee80211_node *ni, const uint8_t *vhtcap_ie,
- const uint8_t *vhtop_ie)
+ieee80211_vht_update_cap(struct ieee80211_node *ni, const uint8_t *vhtcap_ie)
{
ieee80211_parse_vhtcap(ni, vhtcap_ie);
- ieee80211_parse_vhtopmode(ni, vhtop_ie);
}
static struct ieee80211_channel *
diff --git a/sys/net80211/ieee80211_vht.h b/sys/net80211/ieee80211_vht.h
index 2964de63c343..a1529df4a85b 100644
--- a/sys/net80211/ieee80211_vht.h
+++ b/sys/net80211/ieee80211_vht.h
@@ -52,8 +52,7 @@ uint8_t * ieee80211_add_vhtinfo(uint8_t *frm, struct ieee80211_node *);
uint8_t *ieee80211_add_vhtcap_ch(uint8_t *, struct ieee80211vap *,
struct ieee80211_channel *);
-void ieee80211_vht_update_cap(struct ieee80211_node *,
- const uint8_t *, const uint8_t *);
+void ieee80211_vht_update_cap(struct ieee80211_node *, const uint8_t *);
struct ieee80211_channel *
ieee80211_vht_adjust_channel(struct ieee80211com *,
diff --git a/sys/netinet/icmp_var.h b/sys/netinet/icmp_var.h
index b1f2b0ebf911..d6b75e482e35 100644
--- a/sys/netinet/icmp_var.h
+++ b/sys/netinet/icmp_var.h
@@ -104,11 +104,10 @@ extern int badport_bandlim(int);
#define BANDLIM_ICMP_UNREACH 0
#define BANDLIM_ICMP_ECHO 1
#define BANDLIM_ICMP_TSTAMP 2
-#define BANDLIM_RST_CLOSEDPORT 3 /* No connection, and no listeners */
-#define BANDLIM_RST_OPENPORT 4 /* No connection, listener */
-#define BANDLIM_ICMP6_UNREACH 5
-#define BANDLIM_SCTP_OOTB 6
-#define BANDLIM_MAX 7
+#define BANDLIM_TCP_RST 3
+#define BANDLIM_ICMP6_UNREACH 4
+#define BANDLIM_SCTP_OOTB 5
+#define BANDLIM_MAX 6
#endif
#endif
diff --git a/sys/netinet/in_pcb.c b/sys/netinet/in_pcb.c
index bccd4b84561a..dbe48242381d 100644
--- a/sys/netinet/in_pcb.c
+++ b/sys/netinet/in_pcb.c
@@ -1745,6 +1745,23 @@ in_pcbrele(struct inpcb *inp, const inp_lookup_t lock)
}
/*
+ * Dereference and rlock inp, for which the caller must own the
+ * reference. Returns true if inp no longer usable, false otherwise.
+ */
+bool
+in_pcbrele_rlock(struct inpcb *inp)
+{
+ INP_RLOCK(inp);
+ if (in_pcbrele_rlocked(inp))
+ return (true);
+ if ((inp->inp_flags & INP_FREED) != 0) {
+ INP_RUNLOCK(inp);
+ return (true);
+ }
+ return (false);
+}
+
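[Editor's sketch: the helper pairs with in_pcbref() so a caller can drop the inpcb read lock around a sleepable call and then safely revalidate it, which is how the tcp_ktlslist_locked() hunks later in this diff use it around snd_tag_status_str(). A minimal sketch; some_sleepable_call() is a hypothetical stand-in:

	static int
	example_unlocked_call(struct inpcb *inp)
	{
		int error;

		INP_RLOCK_ASSERT(inp);
		in_pcbref(inp);			/* keep inp alive across unlock */
		INP_RUNLOCK(inp);
		error = some_sleepable_call();	/* hypothetical */
		if (in_pcbrele_rlock(inp))
			return (EDEADLK);	/* inp freed; lock already dropped */
		/* Read lock is held again and the reference is released. */
		return (error);
	}
]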
+/*
* Unconditionally schedule an inpcb to be freed by decrementing its
* reference count, which should occur only after the inpcb has been detached
* from its socket. If another thread holds a temporary reference (acquired
diff --git a/sys/netinet/in_pcb.h b/sys/netinet/in_pcb.h
index 57cf15ca37fc..9e0618e87601 100644
--- a/sys/netinet/in_pcb.h
+++ b/sys/netinet/in_pcb.h
@@ -681,6 +681,7 @@ void in_pcbref(struct inpcb *);
bool in_pcbrele(struct inpcb *, inp_lookup_t);
bool in_pcbrele_rlocked(struct inpcb *);
bool in_pcbrele_wlocked(struct inpcb *);
+bool in_pcbrele_rlock(struct inpcb *inp);
typedef bool inp_match_t(const struct inpcb *, void *);
struct inpcb_iterator {
diff --git a/sys/netinet/ip_icmp.c b/sys/netinet/ip_icmp.c
index cb4b6df57c57..71b75d18efd0 100644
--- a/sys/netinet/ip_icmp.c
+++ b/sys/netinet/ip_icmp.c
@@ -1097,8 +1097,7 @@ static const char *icmp_rate_descrs[BANDLIM_MAX] = {
[BANDLIM_ICMP_UNREACH] = "icmp unreach",
[BANDLIM_ICMP_ECHO] = "icmp ping",
[BANDLIM_ICMP_TSTAMP] = "icmp tstamp",
- [BANDLIM_RST_CLOSEDPORT] = "closed port RST",
- [BANDLIM_RST_OPENPORT] = "open port RST",
+ [BANDLIM_TCP_RST] = "tcp reset",
[BANDLIM_ICMP6_UNREACH] = "icmp6 unreach",
[BANDLIM_SCTP_OOTB] = "sctp ootb",
};
diff --git a/sys/netinet/tcp_hpts.c b/sys/netinet/tcp_hpts.c
index 91f8251589e4..b60cdf45af52 100644
--- a/sys/netinet/tcp_hpts.c
+++ b/sys/netinet/tcp_hpts.c
@@ -433,38 +433,40 @@ static void
tcp_hpts_log(struct tcp_hpts_entry *hpts, struct tcpcb *tp, struct timeval *tv,
int slots_to_run, int idx, bool from_callout)
{
- union tcp_log_stackspecific log;
- /*
- * Unused logs are
- * 64 bit - delRate, rttProp, bw_inuse
- * 16 bit - cwnd_gain
- * 8 bit - bbr_state, bbr_substate, inhpts;
- */
- memset(&log, 0, sizeof(log));
- log.u_bbr.flex1 = hpts->p_nxt_slot;
- log.u_bbr.flex2 = hpts->p_cur_slot;
- log.u_bbr.flex3 = hpts->p_prev_slot;
- log.u_bbr.flex4 = idx;
- log.u_bbr.flex5 = hpts->p_curtick;
- log.u_bbr.flex6 = hpts->p_on_queue_cnt;
- log.u_bbr.flex7 = hpts->p_cpu;
- log.u_bbr.flex8 = (uint8_t)from_callout;
- log.u_bbr.inflight = slots_to_run;
- log.u_bbr.applimited = hpts->overidden_sleep;
- log.u_bbr.delivered = hpts->saved_curtick;
- log.u_bbr.timeStamp = tcp_tv_to_usectick(tv);
- log.u_bbr.epoch = hpts->saved_curslot;
- log.u_bbr.lt_epoch = hpts->saved_prev_slot;
- log.u_bbr.pkts_out = hpts->p_delayed_by;
- log.u_bbr.lost = hpts->p_hpts_sleep_time;
- log.u_bbr.pacing_gain = hpts->p_cpu;
- log.u_bbr.pkt_epoch = hpts->p_runningslot;
- log.u_bbr.use_lt_bw = 1;
- TCP_LOG_EVENTP(tp, NULL,
- &tptosocket(tp)->so_rcv,
- &tptosocket(tp)->so_snd,
- BBR_LOG_HPTSDIAG, 0,
- 0, &log, false, tv);
+ if (hpts_does_tp_logging && tcp_bblogging_on(tp)) {
+ union tcp_log_stackspecific log;
+ /*
+ * Unused logs are
+ * 64 bit - delRate, rttProp, bw_inuse
+ * 16 bit - cwnd_gain
+ * 8 bit - bbr_state, bbr_substate, inhpts;
+ */
+ memset(&log, 0, sizeof(log));
+ log.u_bbr.flex1 = hpts->p_nxt_slot;
+ log.u_bbr.flex2 = hpts->p_cur_slot;
+ log.u_bbr.flex3 = hpts->p_prev_slot;
+ log.u_bbr.flex4 = idx;
+ log.u_bbr.flex5 = hpts->p_curtick;
+ log.u_bbr.flex6 = hpts->p_on_queue_cnt;
+ log.u_bbr.flex7 = hpts->p_cpu;
+ log.u_bbr.flex8 = (uint8_t)from_callout;
+ log.u_bbr.inflight = slots_to_run;
+ log.u_bbr.applimited = hpts->overidden_sleep;
+ log.u_bbr.delivered = hpts->saved_curtick;
+ log.u_bbr.timeStamp = tcp_tv_to_usectick(tv);
+ log.u_bbr.epoch = hpts->saved_curslot;
+ log.u_bbr.lt_epoch = hpts->saved_prev_slot;
+ log.u_bbr.pkts_out = hpts->p_delayed_by;
+ log.u_bbr.lost = hpts->p_hpts_sleep_time;
+ log.u_bbr.pacing_gain = hpts->p_cpu;
+ log.u_bbr.pkt_epoch = hpts->p_runningslot;
+ log.u_bbr.use_lt_bw = 1;
+ TCP_LOG_EVENTP(tp, NULL,
+ &tptosocket(tp)->so_rcv,
+ &tptosocket(tp)->so_snd,
+ BBR_LOG_HPTSDIAG, 0,
+ 0, &log, false, tv);
+ }
}
static void
@@ -1353,10 +1355,7 @@ again:
}
CURVNET_SET(inp->inp_vnet);
/* Lets do any logging that we might want to */
- if (hpts_does_tp_logging && tcp_bblogging_on(tp)) {
- tcp_hpts_log(hpts, tp, &tv, slots_to_run, i,
- from_callout);
- }
+ tcp_hpts_log(hpts, tp, &tv, slots_to_run, i, from_callout);
if (tp->t_fb_ptr != NULL) {
kern_prefetch(tp->t_fb_ptr, &did_prefetch);
@@ -1487,7 +1486,7 @@ no_run:
}
void
-__tcp_set_hpts(struct tcpcb *tp, int32_t line)
+tcp_set_hpts(struct tcpcb *tp)
{
struct tcp_hpts_entry *hpts;
int failed;
diff --git a/sys/netinet/tcp_hpts.h b/sys/netinet/tcp_hpts.h
index b097a2b98db9..f5856ed8e688 100644
--- a/sys/netinet/tcp_hpts.h
+++ b/sys/netinet/tcp_hpts.h
@@ -149,8 +149,7 @@ uint32_t tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line,
#define tcp_hpts_insert(inp, slot) \
tcp_hpts_insert_diag((inp), (slot), __LINE__, NULL)
-void __tcp_set_hpts(struct tcpcb *tp, int32_t line);
-#define tcp_set_hpts(a) __tcp_set_hpts(a, __LINE__)
+void tcp_set_hpts(struct tcpcb *tp);
void tcp_set_inp_to_drop(struct inpcb *inp, uint16_t reason);
@@ -165,25 +164,25 @@ extern int32_t tcp_min_hptsi_time;
* The following functions should also be available
* to userspace as well.
*/
-static __inline uint32_t
+static inline uint32_t
tcp_tv_to_hptstick(const struct timeval *sv)
{
return ((sv->tv_sec * 100000) + (sv->tv_usec / HPTS_TICKS_PER_SLOT));
}
-static __inline uint32_t
+static inline uint32_t
tcp_tv_to_usectick(const struct timeval *sv)
{
return ((uint32_t) ((sv->tv_sec * HPTS_USEC_IN_SEC) + sv->tv_usec));
}
-static __inline uint32_t
+static inline uint32_t
tcp_tv_to_mssectick(const struct timeval *sv)
{
return ((uint32_t) ((sv->tv_sec * HPTS_MSEC_IN_SEC) + (sv->tv_usec/HPTS_USEC_IN_MSEC)));
}
-static __inline uint64_t
+static inline uint64_t
tcp_tv_to_lusectick(const struct timeval *sv)
{
return ((uint64_t)((sv->tv_sec * HPTS_USEC_IN_SEC) + sv->tv_usec));
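[Editor's note: a quick worked example of these conversions, assuming HPTS_TICKS_PER_SLOT is 10 (usec per slot, matching the 100000 slots/sec in tcp_tv_to_hptstick()) and HPTS_USEC_IN_SEC is 1000000:

	struct timeval tv = { .tv_sec = 2, .tv_usec = 340 };

	uint32_t slots = tcp_tv_to_hptstick(&tv); /* 2 * 100000 + 340 / 10 = 200034 */
	uint32_t usecs = tcp_tv_to_usectick(&tv); /* 2 * 1000000 + 340 = 2000340 */
]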
@@ -199,7 +198,7 @@ get_hpts_min_sleep_time(void)
return (tcp_min_hptsi_time + HPTS_TICKS_PER_SLOT);
}
-static __inline uint32_t
+static inline uint32_t
tcp_gethptstick(struct timeval *sv)
{
struct timeval tv;
@@ -210,7 +209,7 @@ tcp_gethptstick(struct timeval *sv)
return (tcp_tv_to_hptstick(sv));
}
-static __inline uint64_t
+static inline uint64_t
tcp_get_u64_usecs(struct timeval *tv)
{
struct timeval tvd;
@@ -221,7 +220,7 @@ tcp_get_u64_usecs(struct timeval *tv)
return (tcp_tv_to_lusectick(tv));
}
-static __inline uint32_t
+static inline uint32_t
tcp_get_usecs(struct timeval *tv)
{
struct timeval tvd;
diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c
index 7c032e13f37a..de428ae1af6f 100644
--- a/sys/netinet/tcp_input.c
+++ b/sys/netinet/tcp_input.c
@@ -621,6 +621,7 @@ tcp_input_with_port(struct mbuf **mp, int *offp, int proto, uint16_t port)
#endif /* INET6 */
struct tcpopt to; /* options in this segment */
char *s = NULL; /* address and port logging */
+ bool closed_port = false; /* segment is hitting a closed port */
NET_EPOCH_ASSERT();
@@ -907,7 +908,8 @@ findpcb:
log(LOG_INFO, "%s; %s: Connection attempt "
"to closed port\n", s, __func__);
}
- rstreason = BANDLIM_RST_CLOSEDPORT;
+ rstreason = BANDLIM_TCP_RST;
+ closed_port = true;
goto dropwithreset;
}
INP_LOCK_ASSERT(inp);
@@ -998,12 +1000,14 @@ findpcb:
* down or it is in the CLOSED state. Either way we drop the
* segment and send an appropriate response.
*/
- rstreason = BANDLIM_RST_CLOSEDPORT;
+ rstreason = BANDLIM_TCP_RST;
+ closed_port = true;
goto dropwithreset;
}
if ((tp->t_port != port) && (tp->t_state > TCPS_LISTEN)) {
- rstreason = BANDLIM_RST_CLOSEDPORT;
+ rstreason = BANDLIM_TCP_RST;
+ closed_port = true;
goto dropwithreset;
}
@@ -1055,6 +1059,8 @@ findpcb:
* socket appended to the listen queue in SYN_RECEIVED state.
*/
if ((thflags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK) {
+ int result;
+
/*
* Parse the TCP options here because
* syncookies need access to the reflected
@@ -1064,8 +1070,8 @@ findpcb:
/*
* NB: syncache_expand() doesn't unlock inp.
*/
- rstreason = syncache_expand(&inc, &to, th, &so, m, port);
- if (rstreason < 0) {
+ result = syncache_expand(&inc, &to, th, &so, m, port);
+ if (result < 0) {
/*
* A failing TCP MD5 signature comparison
* must result in the segment being dropped
@@ -1073,7 +1079,7 @@ findpcb:
* to the sender.
*/
goto dropunlock;
- } else if (rstreason == 0) {
+ } else if (result == 0) {
/*
* No syncache entry, or ACK was not for our
* SYN/ACK. Do our protection against double
@@ -1092,7 +1098,7 @@ findpcb:
* of the failure cause.
*/
INP_WUNLOCK(inp);
- rstreason = BANDLIM_RST_OPENPORT;
+ rstreason = BANDLIM_TCP_RST;
lookupflag &= ~INPLOOKUP_WILDCARD;
goto findpcb;
}
@@ -1183,7 +1189,7 @@ tfo_socket_result:
s, __func__);
syncache_badack(&inc, port); /* XXX: Not needed! */
TCPSTAT_INC(tcps_badsyn);
- rstreason = BANDLIM_RST_OPENPORT;
+ rstreason = BANDLIM_TCP_RST;
goto dropwithreset;
}
/*
@@ -1259,7 +1265,7 @@ tfo_socket_result:
"Connection attempt to deprecated "
"IPv6 address rejected\n",
s, __func__);
- rstreason = BANDLIM_RST_OPENPORT;
+ rstreason = BANDLIM_TCP_RST;
goto dropwithreset;
}
}
@@ -1380,9 +1386,10 @@ dropwithreset:
* When blackholing do not respond with a RST but
* completely ignore the segment and drop it.
*/
- if (((rstreason == BANDLIM_RST_OPENPORT && V_blackhole == 3) ||
- (rstreason == BANDLIM_RST_CLOSEDPORT &&
- ((V_blackhole == 1 && (thflags & TH_SYN)) || V_blackhole > 1))) &&
+ if (rstreason == BANDLIM_TCP_RST &&
+ ((!closed_port && V_blackhole == 3) ||
+ (closed_port &&
+ ((V_blackhole == 1 && (thflags & TH_SYN)) || V_blackhole > 1))) &&
(V_blackhole_local || (
#ifdef INET6
isipv6 ? !in6_localip(&ip6->ip6_src) :
@@ -1515,7 +1522,9 @@ tcp_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
struct tcpopt to;
int tfo_syn;
u_int maxseg = 0;
+ bool no_data;
+ no_data = (tlen == 0);
thflags = tcp_get_flags(th);
tp->sackhint.last_sack_ack = 0;
sack_changed = SACK_NOCHANGE;
@@ -1754,7 +1763,7 @@ tcp_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
tp->ts_recent = to.to_tsval;
}
- if (tlen == 0) {
+ if (no_data) {
if (SEQ_GT(th->th_ack, tp->snd_una) &&
SEQ_LEQ(th->th_ack, tp->snd_max) &&
!IN_RECOVERY(tp->t_flags) &&
@@ -1963,7 +1972,7 @@ tcp_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
if ((thflags & TH_ACK) &&
(SEQ_LEQ(th->th_ack, tp->snd_una) ||
SEQ_GT(th->th_ack, tp->snd_max))) {
- rstreason = BANDLIM_RST_OPENPORT;
+ rstreason = BANDLIM_TCP_RST;
tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
goto dropwithreset;
}
@@ -1976,7 +1985,7 @@ tcp_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
* FIN, or a RST.
*/
if ((thflags & (TH_SYN|TH_ACK)) == (TH_SYN|TH_ACK)) {
- rstreason = BANDLIM_RST_OPENPORT;
+ rstreason = BANDLIM_TCP_RST;
tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
goto dropwithreset;
} else if (thflags & TH_SYN) {
@@ -2244,7 +2253,7 @@ tcp_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
* for the "LAND" DoS attack.
*/
if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) {
- rstreason = BANDLIM_RST_OPENPORT;
+ rstreason = BANDLIM_TCP_RST;
tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
goto dropwithreset;
}
@@ -2557,7 +2566,7 @@ tcp_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
maxseg = tcp_maxseg(tp);
- if (tlen == 0 &&
+ if (no_data &&
(tiwin == tp->snd_wnd ||
(tp->t_flags & TF_SACK_PERMIT))) {
/*
@@ -3113,8 +3122,7 @@ step6:
(tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
(tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
/* keep track of pure window updates */
- if (tlen == 0 &&
- tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
+ if (no_data && tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
TCPSTAT_INC(tcps_rcvwinupd);
tp->snd_wnd = tiwin;
tp->snd_wl1 = th->th_seq;
@@ -3424,7 +3432,7 @@ dropafterack:
if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) &&
(SEQ_GT(tp->snd_una, th->th_ack) ||
SEQ_GT(th->th_ack, tp->snd_max)) ) {
- rstreason = BANDLIM_RST_OPENPORT;
+ rstreason = BANDLIM_TCP_RST;
tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
goto dropwithreset;
}
diff --git a/sys/netinet/tcp_log_buf.c b/sys/netinet/tcp_log_buf.c
index 75d693bc019b..e24790ece43d 100644
--- a/sys/netinet/tcp_log_buf.c
+++ b/sys/netinet/tcp_log_buf.c
@@ -2878,7 +2878,7 @@ tcp_log_sendfile(struct socket *so, off_t offset, size_t nbytes, int flags)
/* double check log state now that we have the lock */
if (inp->inp_flags & INP_DROPPED)
goto done;
- if (tp->_t_logstate != TCP_LOG_STATE_OFF) {
+ if (tcp_bblogging_on(tp)) {
struct timeval tv;
tcp_log_eventspecific_t log;
diff --git a/sys/netinet/tcp_log_buf.h b/sys/netinet/tcp_log_buf.h
index fef32e16b2e4..3e7eef8a1cda 100644
--- a/sys/netinet/tcp_log_buf.h
+++ b/sys/netinet/tcp_log_buf.h
@@ -539,12 +539,12 @@ struct tcpcb;
NULL, NULL, 0, NULL); \
} while (0)
#endif /* TCP_LOG_FORCEVERBOSE */
+/* Assumes/requires the caller has already checked tcp_bblogging_on(tp). */
#define TCP_LOG_EVENTP(tp, th, rxbuf, txbuf, eventid, errornum, len, stackinfo, th_hostorder, tv) \
do { \
- if (tcp_bblogging_on(tp)) \
- tcp_log_event(tp, th, rxbuf, txbuf, eventid, \
- errornum, len, stackinfo, th_hostorder, \
- NULL, NULL, 0, tv); \
+ KASSERT(tcp_bblogging_on(tp), ("bblogging is off")); \
+ tcp_log_event(tp, th, rxbuf, txbuf, eventid, errornum, len, \
+ stackinfo, th_hostorder, NULL, NULL, 0, tv); \
} while (0)
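[Editor's sketch: with the check hoisted out of the macro, call sites follow the pattern the tcp_hpts_log() hunk earlier in this diff adopts — guard with tcp_bblogging_on() first, then fill the log and invoke TCP_LOG_EVENTP:

	if (tcp_bblogging_on(tp)) {
		union tcp_log_stackspecific log;
		struct timeval tv;

		memset(&log, 0, sizeof(log));
		log.u_bbr.timeStamp = tcp_get_usecs(&tv);
		TCP_LOG_EVENTP(tp, NULL, &tptosocket(tp)->so_rcv,
		    &tptosocket(tp)->so_snd, BBR_LOG_HPTSDIAG, 0,
		    0, &log, false, &tv);
	}
]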
#ifdef TCP_BLACKBOX
diff --git a/sys/netinet/tcp_stacks/bbr.c b/sys/netinet/tcp_stacks/bbr.c
index e2cfec5c9275..d2636f01714e 100644
--- a/sys/netinet/tcp_stacks/bbr.c
+++ b/sys/netinet/tcp_stacks/bbr.c
@@ -8763,7 +8763,7 @@ bbr_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so,
(SEQ_LEQ(th->th_ack, tp->iss) ||
SEQ_GT(th->th_ack, tp->snd_max))) {
tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
- ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ ctf_do_dropwithreset(m, tp, th, BANDLIM_TCP_RST, tlen);
return (1);
}
if ((thflags & (TH_ACK | TH_RST)) == (TH_ACK | TH_RST)) {
@@ -8965,7 +8965,7 @@ bbr_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so,
(SEQ_LEQ(th->th_ack, tp->snd_una) ||
SEQ_GT(th->th_ack, tp->snd_max))) {
tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
- ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ ctf_do_dropwithreset(m, tp, th, BANDLIM_TCP_RST, tlen);
return (1);
}
if (tp->t_flags & TF_FASTOPEN) {
@@ -8977,7 +8977,7 @@ bbr_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so,
*/
if ((thflags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) {
tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
- ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ ctf_do_dropwithreset(m, tp, th, BANDLIM_TCP_RST, tlen);
return (1);
} else if (thflags & TH_SYN) {
/* non-initial SYN is ignored */
@@ -9010,7 +9010,7 @@ bbr_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so,
*/
if (SEQ_LT(th->th_seq, tp->irs)) {
tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
- ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ ctf_do_dropwithreset(m, tp, th, BANDLIM_TCP_RST, tlen);
return (1);
}
if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
@@ -9288,7 +9288,7 @@ bbr_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so,
if (sbavail(&so->so_snd)) {
if (ctf_progress_timeout_check(tp, true)) {
bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__);
- ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_TCP_RST, tlen);
return (1);
}
}
@@ -9385,7 +9385,7 @@ bbr_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so,
if (sbavail(&so->so_snd)) {
if (ctf_progress_timeout_check(tp, true)) {
bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__);
- ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_TCP_RST, tlen);
return (1);
}
}
@@ -9535,7 +9535,7 @@ bbr_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so,
if (sbavail(&so->so_snd)) {
if (ctf_progress_timeout_check(tp, true)) {
bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__);
- ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_TCP_RST, tlen);
return (1);
}
}
@@ -9637,7 +9637,7 @@ bbr_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so,
if (sbavail(&so->so_snd)) {
if (ctf_progress_timeout_check(tp, true)) {
bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__);
- ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_TCP_RST, tlen);
return (1);
}
}
@@ -9739,7 +9739,7 @@ bbr_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
if (sbavail(&so->so_snd)) {
if (ctf_progress_timeout_check(tp, true)) {
bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__);
- ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_TCP_RST, tlen);
return (1);
}
}
@@ -9848,7 +9848,7 @@ bbr_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so,
if (sbavail(&so->so_snd)) {
if (ctf_progress_timeout_check(tp, true)) {
bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__);
- ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_TCP_RST, tlen);
return (1);
}
}
@@ -11510,7 +11510,7 @@ bbr_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) &&
(SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) {
tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
- ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_TCP_RST, tlen);
return (1);
}
if (tiwin > bbr->r_ctl.rc_high_rwnd)
diff --git a/sys/netinet/tcp_stacks/rack.c b/sys/netinet/tcp_stacks/rack.c
index 8e05498863b9..834e1347a152 100644
--- a/sys/netinet/tcp_stacks/rack.c
+++ b/sys/netinet/tcp_stacks/rack.c
@@ -40,7 +40,6 @@
#endif
#include <sys/lock.h>
#include <sys/malloc.h>
-#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/mbuf.h>
#include <sys/proc.h> /* for proc0 declaration */
@@ -198,7 +197,7 @@ static uint32_t rack_pcm_blast = 0;
static uint32_t rack_pcm_is_enabled = 1;
static uint8_t rack_ssthresh_rest_rto_rec = 0; /* Do we restore ssthresh when we have rec -> rto -> rec */
-static uint32_t rack_gp_gain_req = 1200; /* Amount percent wise required to gain to record a round has "gaining" */
+static uint32_t rack_gp_gain_req = 1200; /* Amount percent wise required to gain to record a round as "gaining" */
static uint32_t rack_rnd_cnt_req = 0x10005; /* Default number of rounds if we are below rack_gp_gain_req where we exit ss */
@@ -938,7 +937,7 @@ rack_init_sysctls(void)
SYSCTL_ADD_U32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_probertt),
OID_AUTO, "time_between", CTLFLAG_RW,
- & rack_time_between_probertt, 96000000,
+ &rack_time_between_probertt, 96000000,
"How many useconds between the lowest rtt falling must past before we enter probertt");
SYSCTL_ADD_U32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_probertt),
@@ -3480,9 +3479,9 @@ static void
rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm)
{
if (rsm->r_flags & RACK_APP_LIMITED) {
- if (rack->r_ctl.rc_app_limited_cnt > 0) {
- rack->r_ctl.rc_app_limited_cnt--;
- }
+ KASSERT((rack->r_ctl.rc_app_limited_cnt > 0),
+ ("app_cnt %u, rsm %p", rack->r_ctl.rc_app_limited_cnt, rsm));
+ rack->r_ctl.rc_app_limited_cnt--;
}
if (rsm->r_limit_type) {
/* currently there is only one limit type */
@@ -3554,8 +3553,7 @@ rack_get_measure_window(struct tcpcb *tp, struct tcp_rack *rack)
* earlier.
*
* So lets calculate the BDP with the "known" b/w using
- * the SRTT has our rtt and then multiply it by the
- * goal.
+ * the SRTT as our rtt and then multiply it by the goal.
*/
bw = rack_get_bw(rack);
srtt = (uint64_t)tp->t_srtt;
@@ -5793,7 +5791,7 @@ rack_cong_signal(struct tcpcb *tp, uint32_t type, uint32_t ack, int line)
tp->t_badrxtwin = 0;
break;
}
- if ((CC_ALGO(tp)->cong_signal != NULL) &&
+ if ((CC_ALGO(tp)->cong_signal != NULL) &&
(type != CC_RTO)){
tp->t_ccv.curack = ack;
CC_ALGO(tp)->cong_signal(&tp->t_ccv, type);
@@ -5904,7 +5902,7 @@ rack_calc_thresh_rack(struct tcp_rack *rack, uint32_t srtt, uint32_t cts, int li
*
* If reorder-fade is configured, then we track the last time we saw
* re-ordering occur. If we reach the point where enough time has
- * passed we no longer consider reordering has occuring.
+ * passed we no longer consider reordering as occurring.
*
* Or if reorder-fade is 0, then once we see reordering we consider
* the connection to always be subject to reordering and just set lro
@@ -7045,6 +7043,9 @@ rack_clone_rsm(struct tcp_rack *rack, struct rack_sendmap *nrsm,
/* Push bit must go to the right edge as well */
if (rsm->r_flags & RACK_HAD_PUSH)
rsm->r_flags &= ~RACK_HAD_PUSH;
+ /* Update the count if app limited */
+ if (nrsm->r_flags & RACK_APP_LIMITED)
+ rack->r_ctl.rc_app_limited_cnt++;
/* Clone over the state of the hw_tls flag */
nrsm->r_hw_tls = rsm->r_hw_tls;
/*
@@ -7096,7 +7097,7 @@ rack_merge_rsm(struct tcp_rack *rack,
l_rsm->r_flags |= RACK_TLP;
if (r_rsm->r_flags & RACK_RWND_COLLAPSED)
l_rsm->r_flags |= RACK_RWND_COLLAPSED;
- if ((r_rsm->r_flags & RACK_APP_LIMITED) &&
+ if ((r_rsm->r_flags & RACK_APP_LIMITED) &&
((l_rsm->r_flags & RACK_APP_LIMITED) == 0)) {
/*
* If both are app-limited then let the
@@ -8137,7 +8138,7 @@ rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack,
* remove the lost designation and reduce the
* bytes considered lost.
*/
- rsm->r_flags &= ~RACK_WAS_LOST;
+ rsm->r_flags &= ~RACK_WAS_LOST;
KASSERT((rack->r_ctl.rc_considered_lost >= (rsm->r_end - rsm->r_start)),
("rsm:%p rack:%p rc_considered_lost goes negative", rsm, rack));
if (rack->r_ctl.rc_considered_lost >= (rsm->r_end - rsm->r_start))
@@ -8832,7 +8833,7 @@ rack_apply_updated_usrtt(struct tcp_rack *rack, uint32_t us_rtt, uint32_t us_cts
val = rack_probertt_lower_within * rack_time_between_probertt;
val /= 100;
- if ((rack->in_probe_rtt == 0) &&
+ if ((rack->in_probe_rtt == 0) &&
(rack->rc_skip_timely == 0) &&
((us_cts - rack->r_ctl.rc_lower_rtt_us_cts) >= (rack_time_between_probertt - val))) {
rack_enter_probertt(rack, us_cts);
@@ -10369,7 +10370,7 @@ more:
* and yet before retransmitting we get an ack
* which can happen due to reordering.
*/
- rsm->r_flags &= ~RACK_WAS_LOST;
+ rsm->r_flags &= ~RACK_WAS_LOST;
KASSERT((rack->r_ctl.rc_considered_lost >= (rsm->r_end - rsm->r_start)),
("rsm:%p rack:%p rc_considered_lost goes negative", rsm, rack));
if (rack->r_ctl.rc_considered_lost >= (rsm->r_end - rsm->r_start))
@@ -11065,7 +11066,7 @@ rack_strike_dupack(struct tcp_rack *rack, tcp_seq th_ack)
* We need to skip anything already set
* to be retransmitted.
*/
- if ((rsm->r_dupack >= DUP_ACK_THRESHOLD) ||
+ if ((rsm->r_dupack >= DUP_ACK_THRESHOLD) ||
(rsm->r_flags & RACK_MUST_RXT)) {
rsm = TAILQ_NEXT(rsm, r_tnext);
continue;
@@ -12875,7 +12876,7 @@ rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so,
(SEQ_LEQ(th->th_ack, tp->iss) ||
SEQ_GT(th->th_ack, tp->snd_max))) {
tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
- ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ ctf_do_dropwithreset(m, tp, th, BANDLIM_TCP_RST, tlen);
return (1);
}
if ((thflags & (TH_ACK | TH_RST)) == (TH_ACK | TH_RST)) {
@@ -13089,7 +13090,7 @@ rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so,
(SEQ_LEQ(th->th_ack, tp->snd_una) ||
SEQ_GT(th->th_ack, tp->snd_max))) {
tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
- ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ ctf_do_dropwithreset(m, tp, th, BANDLIM_TCP_RST, tlen);
return (1);
}
if (tp->t_flags & TF_FASTOPEN) {
@@ -13102,7 +13103,7 @@ rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so,
*/
if ((thflags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) {
tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
- ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ ctf_do_dropwithreset(m, tp, th, BANDLIM_TCP_RST, tlen);
return (1);
} else if (thflags & TH_SYN) {
/* non-initial SYN is ignored */
@@ -13136,7 +13137,7 @@ rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so,
*/
if (SEQ_LT(th->th_seq, tp->irs)) {
tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
- ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ ctf_do_dropwithreset(m, tp, th, BANDLIM_TCP_RST, tlen);
return (1);
}
if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
@@ -13399,7 +13400,7 @@ rack_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so,
if (sbavail(&so->so_snd)) {
if (ctf_progress_timeout_check(tp, true)) {
rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__);
- ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_TCP_RST, tlen);
return (1);
}
}
@@ -13495,7 +13496,7 @@ rack_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so,
if (ctf_progress_timeout_check(tp, true)) {
rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr,
tp, tick, PROGRESS_DROP, __LINE__);
- ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_TCP_RST, tlen);
return (1);
}
}
@@ -13645,7 +13646,7 @@ rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so,
if (ctf_progress_timeout_check(tp, true)) {
rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr,
tp, tick, PROGRESS_DROP, __LINE__);
- ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_TCP_RST, tlen);
return (1);
}
}
@@ -13746,7 +13747,7 @@ rack_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so,
if (ctf_progress_timeout_check(tp, true)) {
rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr,
tp, tick, PROGRESS_DROP, __LINE__);
- ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_TCP_RST, tlen);
return (1);
}
}
@@ -13848,7 +13849,7 @@ rack_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
if (ctf_progress_timeout_check(tp, true)) {
rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr,
tp, tick, PROGRESS_DROP, __LINE__);
- ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_TCP_RST, tlen);
return (1);
}
}
@@ -13952,7 +13953,7 @@ rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so,
if (ctf_progress_timeout_check(tp, true)) {
rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr,
tp, tick, PROGRESS_DROP, __LINE__);
- ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_TCP_RST, tlen);
return (1);
}
}
@@ -16655,7 +16656,7 @@ rack_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) &&
(SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) {
tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
- ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ ctf_do_dropwithreset(m, tp, th, BANDLIM_TCP_RST, tlen);
#ifdef TCP_ACCOUNTING
sched_unpin();
#endif
@@ -16919,7 +16920,7 @@ do_output_now:
} else if ((nxt_pkt == 0) && (tp->t_flags & TF_ACKNOW)) {
goto do_output_now;
} else if ((no_output == 1) &&
- (nxt_pkt == 0) &&
+ (nxt_pkt == 0) &&
(tcp_in_hpts(rack->rc_tp) == 0)) {
/*
* We are not in hpts and we had a pacing timer up. Use
@@ -17546,7 +17547,7 @@ rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, str
rack->r_ctl.rc_last_us_rtt,
88, __LINE__, NULL, gain);
}
- if (((bw_est == 0) || (rate_wanted == 0) || (rack->gp_ready == 0)) &&
+ if (((bw_est == 0) || (rate_wanted == 0) || (rack->gp_ready == 0)) &&
(rack->use_fixed_rate == 0)) {
/*
* No way yet to make a b/w estimate or
@@ -17986,7 +17987,7 @@ start_set:
tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack);
rack->r_ctl.rc_gp_cumack_ts = 0;
if ((rack->r_ctl.cleared_app_ack == 1) &&
- (SEQ_GEQ(rack->r_ctl.cleared_app_ack, tp->gput_seq))) {
+ (SEQ_GEQ(tp->gput_seq, rack->r_ctl.cleared_app_ack_seq))) {
/*
* We just cleared an application limited period
* so the next seq out needs to skip the first
@@ -20043,7 +20044,7 @@ again:
rack->r_ctl.pcm_max_seg = ctf_fixed_maxseg(tp) * 10;
}
}
- if ((rack->r_ctl.pcm_max_seg != 0) && (rack->pcm_needed == 1)) {
+ if ((rack->r_ctl.pcm_max_seg != 0) && (rack->pcm_needed == 1)) {
uint32_t rw_avail, cwa;
if (tp->snd_wnd > ctf_outstanding(tp))
@@ -21031,7 +21032,7 @@ just_return_nolock:
} else
log = 1;
}
- /* Mark the last packet has app limited */
+ /* Mark the last packet as app limited */
rsm = tqhash_max(rack->r_ctl.tqh);
if (rsm && ((rsm->r_flags & RACK_APP_LIMITED) == 0)) {
if (rack->r_ctl.rc_app_limited_cnt == 0)
diff --git a/sys/netinet/tcp_stacks/rack_bbr_common.c b/sys/netinet/tcp_stacks/rack_bbr_common.c
index da26b8cb1f9b..d1c4ba58bf55 100644
--- a/sys/netinet/tcp_stacks/rack_bbr_common.c
+++ b/sys/netinet/tcp_stacks/rack_bbr_common.c
@@ -672,7 +672,7 @@ ctf_do_dropafterack(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t
(SEQ_GT(tp->snd_una, th->th_ack) ||
SEQ_GT(th->th_ack, tp->snd_max))) {
*ret_val = 1;
- ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ ctf_do_dropwithreset(m, tp, th, BANDLIM_TCP_RST, tlen);
return;
} else
*ret_val = 0;
diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c
index cd42a67294a6..db415f6bdf03 100644
--- a/sys/netinet/tcp_subr.c
+++ b/sys/netinet/tcp_subr.c
@@ -2720,9 +2720,15 @@ tcp_ktlslist_locked(SYSCTL_HANDLER_ARGS, bool export_keys)
ksr->snd_tag->sw->snd_tag_status_str !=
NULL) {
sz = SND_TAG_STATUS_MAXLEN;
- ksr->snd_tag->sw->snd_tag_status_str(
+ in_pcbref(inp);
+ INP_RUNLOCK(inp);
+ error = ksr->snd_tag->sw->
+ snd_tag_status_str(
ksr->snd_tag, NULL, &sz);
- len += sz;
+ if (in_pcbrele_rlock(inp))
+ return (EDEADLK);
+ if (error == 0)
+ len += sz;
}
}
kss = so->so_snd.sb_tls_info;
@@ -2739,9 +2745,15 @@ tcp_ktlslist_locked(SYSCTL_HANDLER_ARGS, bool export_keys)
kss->snd_tag->sw->snd_tag_status_str !=
NULL) {
sz = SND_TAG_STATUS_MAXLEN;
- kss->snd_tag->sw->snd_tag_status_str(
+ in_pcbref(inp);
+ INP_RUNLOCK(inp);
+ error = kss->snd_tag->sw->
+ snd_tag_status_str(
kss->snd_tag, NULL, &sz);
- len += sz;
+ if (in_pcbrele_rlock(inp))
+ return (EDEADLK);
+ if (error == 0)
+ len += sz;
}
}
if (p) {
@@ -2811,9 +2823,16 @@ tcp_ktlslist_locked(SYSCTL_HANDLER_ARGS, bool export_keys)
if (ksr->snd_tag != NULL &&
ksr->snd_tag->sw->snd_tag_status_str != NULL) {
sz = SND_TAG_STATUS_MAXLEN;
- ksr->snd_tag->sw->snd_tag_status_str(
+ in_pcbref(inp);
+ INP_RUNLOCK(inp);
+ error = ksr->snd_tag->sw->snd_tag_status_str(
ksr->snd_tag, buf + len, &sz);
- len += sz;
+ if (in_pcbrele_rlock(inp))
+ return (EDEADLK);
+ if (error == 0) {
+ xktls->rcv.drv_st_len = sz;
+ len += sz;
+ }
}
}
if (kss != NULL && kss->gen == xig.xig_gen) {
@@ -2828,9 +2847,16 @@ tcp_ktlslist_locked(SYSCTL_HANDLER_ARGS, bool export_keys)
if (kss->snd_tag != NULL &&
kss->snd_tag->sw->snd_tag_status_str != NULL) {
sz = SND_TAG_STATUS_MAXLEN;
- kss->snd_tag->sw->snd_tag_status_str(
+ in_pcbref(inp);
+ INP_RUNLOCK(inp);
+ error = kss->snd_tag->sw->snd_tag_status_str(
kss->snd_tag, buf + len, &sz);
- len += sz;
+ if (in_pcbrele_rlock(inp))
+ return (EDEADLK);
+ if (error == 0) {
+ xktls->snd.drv_st_len = sz;
+ len += sz;
+ }
}
}
len = roundup2(len, __alignof(*xktls));
@@ -2858,12 +2884,23 @@ tcp_ktlslist_locked(SYSCTL_HANDLER_ARGS, bool export_keys)
static int
tcp_ktlslist1(SYSCTL_HANDLER_ARGS, bool export_keys)
{
- int res;
-
- sx_xlock(&ktlslist_lock);
- res = tcp_ktlslist_locked(oidp, arg1, arg2, req, export_keys);
- sx_xunlock(&ktlslist_lock);
- return (res);
+ int repeats, error;
+
+ for (repeats = 0; repeats < 100; repeats++) {
+ if (sx_xlock_sig(&ktlslist_lock))
+ return (EINTR);
+ error = tcp_ktlslist_locked(oidp, arg1, arg2, req,
+ export_keys);
+ sx_xunlock(&ktlslist_lock);
+ if (error != EDEADLK)
+ break;
+ if (sig_intr() != 0) {
+ error = EINTR;
+ break;
+ }
+ req->oldidx = 0;
+ }
+ return (error);
}
static int
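The two hunks above share one pattern: snd_tag_status_str() may sleep, so the handler pins the inpcb with a reference, drops the read lock around the driver callback, and relocks afterwards; if in_pcbrele_rlock() reports that the pcb was freed in the meantime, tcp_ktlslist_locked() returns EDEADLK and tcp_ktlslist1() restarts the whole scan (up to 100 attempts, checking for pending signals and resetting req->oldidx between tries). A minimal sketch of the unlock-around-callback step, with placeholder names for everything not in the diff:

    /*
     * Sketch only.  obj_ref(), obj_rele_rlock() and OBJ_RUNLOCK()
     * stand in for in_pcbref(), in_pcbrele_rlock() and INP_RUNLOCK();
     * they are not a real FreeBSD API.
     */
    struct obj {
            int (*status_cb)(struct obj *, char *, size_t *); /* may sleep */
    };

    static int
    query_status(struct obj *o, char *buf, size_t *szp)
    {
            int error;

            obj_ref(o);                    /* pin across the unlock */
            OBJ_RUNLOCK(o);                /* callback may sleep */
            error = o->status_cb(o, buf, szp);
            if (obj_rele_rlock(o))         /* true: object freed meanwhile */
                    return (EDEADLK);      /* caller restarts the scan */
            return (error);
    }
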
diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c
index 3ea561e63503..687b0d538666 100644
--- a/sys/netinet/tcp_usrreq.c
+++ b/sys/netinet/tcp_usrreq.c
@@ -1520,7 +1520,8 @@ tcp6_connect(struct tcpcb *tp, struct sockaddr_in6 *sin6, struct thread *td)
INP_WLOCK_ASSERT(inp);
if (__predict_false((so->so_state &
- (SS_ISCONNECTING | SS_ISCONNECTED)) != 0))
+ (SS_ISCONNECTING | SS_ISCONNECTED | SS_ISDISCONNECTING |
+ SS_ISDISCONNECTED)) != 0))
return (EISCONN);
if (__predict_false((so->so_options & SO_REUSEPORT_LB) != 0))
return (EOPNOTSUPP);
diff --git a/sys/netinet6/in6_gif.c b/sys/netinet6/in6_gif.c
index d476829e8e3b..2bab1c57ce2a 100644
--- a/sys/netinet6/in6_gif.c
+++ b/sys/netinet6/in6_gif.c
@@ -194,6 +194,11 @@ in6_gif_setopts(struct gif_softc *sc, u_int options)
sc->gif_options = options;
in6_gif_attach(sc);
}
+
+ if ((options & GIF_NOCLAMP) !=
+ (sc->gif_options & GIF_NOCLAMP)) {
+ sc->gif_options = options;
+ }
return (0);
}
@@ -289,6 +294,7 @@ in6_gif_output(struct ifnet *ifp, struct mbuf *m, int proto, uint8_t ecn)
{
struct gif_softc *sc = ifp->if_softc;
struct ip6_hdr *ip6;
+ u_long mtu;
/* prepend new IP header */
NET_EPOCH_ASSERT();
@@ -304,11 +310,15 @@ in6_gif_output(struct ifnet *ifp, struct mbuf *m, int proto, uint8_t ecn)
ip6->ip6_nxt = proto;
ip6->ip6_hlim = V_ip6_gif_hlim;
/*
- * force fragmentation to minimum MTU, to avoid path MTU discovery.
- * it is too painful to ask for resend of inner packet, to achieve
- * path MTU discovery for encapsulated packets.
+ * Enforce fragmentation to the minimum MTU, even if the interface
+ * MTU is larger, to avoid path MTU discovery when NOCLAMP is not
+ * set (the default). IPv6 does not allow fragmentation on
+ * intermediate router nodes, so it is too painful to ask for a
+ * resend of the inner packet to achieve path MTU discovery for
+ * encapsulated packets.
*/
- return (ip6_output(m, 0, NULL, IPV6_MINMTU, 0, NULL, NULL));
+ mtu = ((sc->gif_options & GIF_NOCLAMP) == 0) ? IPV6_MINMTU : 0;
+
+ return (ip6_output(m, 0, NULL, mtu, 0, NULL, NULL));
}
static int
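Note that IPV6_MINMTU here is a flag argument to ip6_output(), not a byte count: when set, the stack fragments the encapsulated packet to the IPv6 minimum MTU (1280 bytes) regardless of the route MTU, and passing 0 restores normal MTU handling. A hedged restatement of the selection, with an invented variable name:

    /*
     * Sketch: IPV6_MINMTU is an ip6_output() flag, not a size.
     * "clamp" is an invented stand-in for
     * (sc->gif_options & GIF_NOCLAMP) == 0.
     */
    int flags, error;

    flags = clamp ? IPV6_MINMTU : 0;       /* fragment to 1280 bytes */
    error = ip6_output(m, NULL, NULL, flags, NULL, NULL, NULL);
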
diff --git a/sys/netinet6/mld6.c b/sys/netinet6/mld6.c
index 06fe9e8820c9..a825658bd9ee 100644
--- a/sys/netinet6/mld6.c
+++ b/sys/netinet6/mld6.c
@@ -234,17 +234,20 @@ static SYSCTL_NODE(_net_inet6_mld, OID_AUTO, ifinfo,
CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_mld_ifinfo,
"Per-interface MLDv2 state");
-static int mld_v1enable = 1;
-SYSCTL_INT(_net_inet6_mld, OID_AUTO, v1enable, CTLFLAG_RWTUN,
- &mld_v1enable, 0, "Enable fallback to MLDv1");
+VNET_DEFINE_STATIC(bool, mld_v1enable) = true;
+#define V_mld_v1enable VNET(mld_v1enable)
+SYSCTL_BOOL(_net_inet6_mld, OID_AUTO, v1enable, CTLFLAG_VNET | CTLFLAG_RWTUN,
+ &VNET_NAME(mld_v1enable), 0, "Enable fallback to MLDv1");
-static int mld_v2enable = 1;
-SYSCTL_INT(_net_inet6_mld, OID_AUTO, v2enable, CTLFLAG_RWTUN,
- &mld_v2enable, 0, "Enable MLDv2");
+VNET_DEFINE_STATIC(bool, mld_v2enable) = true;
+#define V_mld_v2enable VNET(mld_v2enable)
+SYSCTL_BOOL(_net_inet6_mld, OID_AUTO, v2enable, CTLFLAG_VNET | CTLFLAG_RWTUN,
+ &VNET_NAME(mld_v2enable), 0, "Enable MLDv2");
-static int mld_use_allow = 1;
-SYSCTL_INT(_net_inet6_mld, OID_AUTO, use_allow, CTLFLAG_RWTUN,
- &mld_use_allow, 0, "Use ALLOW/BLOCK for RFC 4604 SSM joins/leaves");
+VNET_DEFINE_STATIC(bool, mld_use_allow) = true;
+#define V_mld_use_allow VNET(mld_use_allow)
+SYSCTL_BOOL(_net_inet6_mld, OID_AUTO, use_allow, CTLFLAG_VNET | CTLFLAG_RWTUN,
+ &VNET_NAME(mld_use_allow), 0, "Use ALLOW/BLOCK for RFC 4604 SSM joins/leaves");
/*
* Packed Router Alert option structure declaration.
@@ -481,7 +484,7 @@ mld_domifattach(struct ifnet *ifp)
mbufq_init(&mli->mli_gq, MLD_MAX_RESPONSE_PACKETS);
if ((ifp->if_flags & IFF_MULTICAST) == 0)
mli->mli_flags |= MLIF_SILENT;
- if (mld_use_allow)
+ if (V_mld_use_allow)
mli->mli_flags |= MLIF_USEALLOW;
MLD_LOCK();
@@ -614,7 +617,7 @@ mld_v1_input_query(struct ifnet *ifp, const struct ip6_hdr *ip6,
is_general_query = 0;
- if (!mld_v1enable) {
+ if (!V_mld_v1enable) {
CTR3(KTR_MLD, "ignore v1 query %s on ifp %p(%s)",
ip6_sprintf(ip6tbuf, &mld->mld_addr),
ifp, if_name(ifp));
@@ -790,7 +793,7 @@ mld_v2_input_query(struct ifnet *ifp, const struct ip6_hdr *ip6,
NET_EPOCH_ASSERT();
- if (!mld_v2enable) {
+ if (!V_mld_v2enable) {
CTR3(KTR_MLD, "ignore v2 query src %s on ifp %p(%s)",
ip6_sprintf(ip6tbuf, &ip6->ip6_src),
ifp, if_name(ifp));
@@ -1076,7 +1079,7 @@ mld_v1_input_report(struct ifnet *ifp, const struct ip6_hdr *ip6,
NET_EPOCH_ASSERT();
- if (!mld_v1enable) {
+ if (!V_mld_v1enable) {
CTR3(KTR_MLD, "ignore v1 report %s on ifp %p(%s)",
ip6_sprintf(ip6tbuf, &mld->mld_addr),
ifp, if_name(ifp));
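The conversion above is the standard idiom for virtualizing a sysctl knob: VNET_DEFINE_STATIC() gives each vnet (a jail with its own network stack) a private copy, CTLFLAG_VNET makes the handler resolve against the caller's vnet, and a V_ accessor macro hides the indirection at every use site. A minimal sketch with a hypothetical knob name:

    /* Sketch of the per-VNET sysctl idiom; "example_knob" is invented. */
    VNET_DEFINE_STATIC(bool, example_knob) = true;
    #define V_example_knob  VNET(example_knob)
    SYSCTL_BOOL(_net_inet6_mld, OID_AUTO, example_knob,
        CTLFLAG_VNET | CTLFLAG_RWTUN, &VNET_NAME(example_knob), 0,
        "Hypothetical per-VNET knob");

    /* Readers go through the accessor, never the raw symbol: */
    if (V_example_knob)
            /* ... act on the current vnet's copy ... */;
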
diff --git a/sys/netinet6/raw_ip6.c b/sys/netinet6/raw_ip6.c
index 0379ef7c789a..c90a1213bd66 100644
--- a/sys/netinet6/raw_ip6.c
+++ b/sys/netinet6/raw_ip6.c
@@ -765,8 +765,7 @@ rip6_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
}
if (ifa != NULL &&
((struct in6_ifaddr *)ifa)->ia6_flags &
- (IN6_IFF_ANYCAST|IN6_IFF_NOTREADY|
- IN6_IFF_DETACHED|IN6_IFF_DEPRECATED)) {
+ (IN6_IFF_NOTREADY|IN6_IFF_DETACHED|IN6_IFF_DEPRECATED)) {
NET_EPOCH_EXIT(et);
return (EADDRNOTAVAIL);
}
diff --git a/sys/netipsec/ipsec.c b/sys/netipsec/ipsec.c
index 6bacc68b7441..92d0201b398a 100644
--- a/sys/netipsec/ipsec.c
+++ b/sys/netipsec/ipsec.c
@@ -636,8 +636,10 @@ ipsec4_in_reject1(const struct mbuf *m, struct ip *ip1, struct inpcb *inp)
#ifdef IPSEC_OFFLOAD
tag = ipsec_accel_input_tag_lookup(m);
- if (tag != NULL)
- return (0);
+ if (tag != NULL) {
+ tag->tag.m_tag_id = PACKET_TAG_IPSEC_IN_DONE;
+ __DECONST(struct mbuf *, m)->m_flags |= M_DECRYPTED;
+ }
#endif
if (ip1 == NULL) {
diff --git a/sys/netipsec/ipsec_offload.c b/sys/netipsec/ipsec_offload.c
index 467d5ded1d7a..8a09d5f37b4a 100644
--- a/sys/netipsec/ipsec_offload.c
+++ b/sys/netipsec/ipsec_offload.c
@@ -94,6 +94,7 @@ struct ifp_handle_sav {
size_t hdr_ext_size;
uint64_t cnt_octets;
uint64_t cnt_allocs;
+ struct xform_history xfh;
};
#define IFP_HS_HANDLED 0x00000001
@@ -159,6 +160,8 @@ static void ipsec_accel_drv_sa_lifetime_update_impl(struct secasvar *sav,
static int ipsec_accel_drv_sa_lifetime_fetch_impl(struct secasvar *sav,
if_t ifp, u_int drv_spi, uint64_t *octets, uint64_t *allocs);
static void ipsec_accel_ifdetach_event(void *arg, struct ifnet *ifp);
+static bool ipsec_accel_fill_xh_impl(if_t ifp, uint32_t drv_spi,
+ struct xform_history *xh);
static void
ipsec_accel_init(void *arg)
@@ -185,6 +188,7 @@ ipsec_accel_init(void *arg)
ipsec_accel_drv_sa_lifetime_update_impl;
ipsec_accel_drv_sa_lifetime_fetch_p =
ipsec_accel_drv_sa_lifetime_fetch_impl;
+ ipsec_accel_fill_xh_p = ipsec_accel_fill_xh_impl;
pctrie_init(&drv_spi_pctrie);
ipsec_accel_ifdetach_event_tag = EVENTHANDLER_REGISTER(
ifnet_departure_event, ipsec_accel_ifdetach_event, NULL,
@@ -209,6 +213,7 @@ ipsec_accel_fini(void *arg)
ipsec_accel_on_ifdown_p = NULL;
ipsec_accel_drv_sa_lifetime_update_p = NULL;
ipsec_accel_drv_sa_lifetime_fetch_p = NULL;
+ ipsec_accel_fill_xh_p = NULL;
ipsec_accel_sync_imp();
clean_unrhdr(drv_spi_unr); /* avoid panic, should go later */
clear_unrhdr(drv_spi_unr);
@@ -412,6 +417,10 @@ ipsec_accel_handle_sav(struct secasvar *sav, struct ifnet *ifp,
ihs->ifdata = priv;
ihs->flags = flags;
ihs->hdr_ext_size = esp_hdrsiz(sav);
+ memcpy(&ihs->xfh.dst, &sav->sah->saidx.dst, sizeof(ihs->xfh.dst));
+ ihs->xfh.spi = sav->spi;
+ ihs->xfh.proto = sav->sah->saidx.proto;
+ ihs->xfh.mode = sav->sah->saidx.mode;
mtx_lock(&ipsec_accel_sav_tmp);
CK_LIST_FOREACH(i, &sav->accel_ifps, sav_link) {
if (i->ifp == ifp) {
@@ -1162,4 +1171,20 @@ ipsec_accel_key_setaccelif_impl(struct secasvar *sav)
return (m);
}
+static bool
+ipsec_accel_fill_xh_impl(if_t ifp, uint32_t drv_spi, struct xform_history *xh)
+{
+ struct ifp_handle_sav *i;
+
+ if (drv_spi < IPSEC_ACCEL_DRV_SPI_MIN ||
+ drv_spi > IPSEC_ACCEL_DRV_SPI_MAX)
+ return (false);
+
+ i = DRVSPI_SA_PCTRIE_LOOKUP(&drv_spi_pctrie, drv_spi);
+ if (i == NULL)
+ return (false);
+ memcpy(xh, &i->xfh, sizeof(*xh));
+ return (true);
+}
+
#endif /* IPSEC_OFFLOAD */
diff --git a/sys/netipsec/ipsec_offload.h b/sys/netipsec/ipsec_offload.h
index 904fe6252396..ae60eaa8ae78 100644
--- a/sys/netipsec/ipsec_offload.h
+++ b/sys/netipsec/ipsec_offload.h
@@ -30,6 +30,7 @@
#include <sys/errno.h>
#include <net/if.h>
#include <net/if_var.h>
+#include <netipsec/xform.h>
struct secpolicy;
struct secasvar;
@@ -42,6 +43,7 @@ struct ipsec_accel_out_tag {
struct ipsec_accel_in_tag {
struct m_tag tag;
+ struct xform_history xh; /* Must be first to mimic IPSEC_IN_DONE */
uint16_t drv_spi;
};
@@ -66,6 +68,8 @@ extern void (*ipsec_accel_drv_sa_lifetime_update_p)(struct secasvar *sav,
if_t ifp, u_int drv_spi, uint64_t octets, uint64_t allocs);
extern int (*ipsec_accel_drv_sa_lifetime_fetch_p)(struct secasvar *sav,
if_t ifp, u_int drv_spi, uint64_t *octets, uint64_t *allocs);
+extern bool (*ipsec_accel_fill_xh_p)(if_t ifp, uint32_t drv_spi,
+ struct xform_history *xh);
#ifdef IPSEC_OFFLOAD
/*
@@ -158,6 +162,16 @@ ipsec_accel_key_setaccelif(struct secasvar *sav)
return (NULL);
}
+static inline bool
+ipsec_accel_fill_xh(if_t ifp, uint32_t drv_spi, struct xform_history *xh)
+{
+ bool (*p)(if_t ifp, uint32_t drv_spi, struct xform_history *xh);
+
+ p = atomic_load_ptr(&ipsec_accel_fill_xh_p);
+ if (p != NULL)
+ return (p(ifp, drv_spi, xh));
+ return (false);
+}
#else
#define ipsec_accel_sa_newkey(a)
@@ -168,6 +182,7 @@ ipsec_accel_key_setaccelif(struct secasvar *sav)
#define ipsec_accel_sync()
#define ipsec_accel_is_accel_sav(a)
#define ipsec_accel_key_setaccelif(a)
+#define ipsec_accel_fill_xh(a, b, c) (false)
#endif
void ipsec_accel_forget_sav_impl(struct secasvar *sav);
@@ -180,6 +195,7 @@ bool ipsec_accel_output(struct ifnet *ifp, struct mbuf *m,
struct inpcb *inp, struct secpolicy *sp, struct secasvar *sav, int af,
int mtu, int *hwassist);
void ipsec_accel_forget_sav(struct secasvar *sav);
+struct xform_history;
#else
#define ipsec_accel_input(a, b, c) (ENXIO)
#define ipsec_accel_output(a, b, c, d, e, f, g, h) ({ \
diff --git a/sys/netipsec/key.c b/sys/netipsec/key.c
index ae67d83c6d13..4ba1b49c24f0 100644
--- a/sys/netipsec/key.c
+++ b/sys/netipsec/key.c
@@ -114,6 +114,8 @@ void (*ipsec_accel_drv_sa_lifetime_update_p)(struct secasvar *sav, if_t ifp,
u_int drv_spi, uint64_t octets, uint64_t allocs);
int (*ipsec_accel_drv_sa_lifetime_fetch_p)(struct secasvar *sav, if_t ifp,
u_int drv_spi, uint64_t *octets, uint64_t *allocs);
+bool (*ipsec_accel_fill_xh_p)(if_t ifp, uint32_t drv_spi,
+ struct xform_history *xh);
#endif
#define FULLMASK 0xff
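ipsec_accel_fill_xh_p follows the kernel's optional-hook convention for loadable offload code: key.c owns a NULL-initialized function pointer, ipsec_offload.c installs its implementation at load time and clears it at unload, and the inline wrapper snapshots the pointer with atomic_load_ptr() so a caller racing with module unload sees either the full implementation or the NULL fallback, never a torn value. The shape of the pattern, with generic names:

    /* Sketch: optional module hook.  Names are illustrative. */
    bool (*hook_p)(int arg);               /* NULL until module load */

    static inline bool
    call_hook(int arg)
    {
            bool (*p)(int);

            p = atomic_load_ptr(&hook_p);  /* one atomic snapshot */
            if (p != NULL)
                    return (p(arg));
            return (false);                /* module not present */
    }
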
diff --git a/sys/netlink/netlink_message_parser.h b/sys/netlink/netlink_message_parser.h
index 8492ecb3021b..720317ed74f3 100644
--- a/sys/netlink/netlink_message_parser.h
+++ b/sys/netlink/netlink_message_parser.h
@@ -209,7 +209,8 @@ int nlattr_get_nested(struct nlattr *nla, struct nl_pstate *npt,
int nlattr_get_nested_ptr(struct nlattr *nla, struct nl_pstate *npt,
const void *arg, void *target);
-bool nlmsg_report_err_msg(struct nl_pstate *npt, const char *fmt, ...);
+bool nlmsg_report_err_msg(struct nl_pstate *npt, const char *fmt, ...)
+ __printflike(2, 3);
#define NLMSG_REPORT_ERR_MSG(_npt, _fmt, ...) { \
nlmsg_report_err_msg(_npt, _fmt, ## __VA_ARGS__); \
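__printflike(2, 3) is FreeBSD's wrapper around the compiler's format(printf, ...) attribute: argument 2 is the format string and the variadic list starts at argument 3, so a mismatched conversion in any nlmsg_report_err_msg() caller now produces a compile-time warning instead of silently reading the wrong bytes at run time. For example (caller names invented):

    bool report(void *ctx, const char *fmt, ...) __printflike(2, 3);

    report(ctx, "bad attribute %d", "name");  /* warns: %d vs. char * */
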
diff --git a/sys/netpfil/ipfilter/netinet/fil.c b/sys/netpfil/ipfilter/netinet/fil.c
index 2a75190a3ec7..2fcea433295f 100644
--- a/sys/netpfil/ipfilter/netinet/fil.c
+++ b/sys/netpfil/ipfilter/netinet/fil.c
@@ -437,7 +437,7 @@ static inline void
ipf_pr_ipv6hdr(fr_info_t *fin)
{
ip6_t *ip6 = (ip6_t *)fin->fin_ip;
- int p, go = 1, i, hdrcount;
+ int p, go = 1, i;
fr_ip_t *fi = &fin->fin_fi;
fin->fin_off = 0;
@@ -464,7 +464,6 @@ ipf_pr_ipv6hdr(fr_info_t *fin)
if (IN6_IS_ADDR_MULTICAST(&fi->fi_dst.in6))
fin->fin_flx |= FI_MULTICAST|FI_MBCAST;
- hdrcount = 0;
while (go && !(fin->fin_flx & FI_SHORT)) {
switch (p)
{
@@ -542,7 +541,6 @@ ipf_pr_ipv6hdr(fr_info_t *fin)
go = 0;
break;
}
- hdrcount++;
/*
* It is important to note that at this point, for the
@@ -2590,14 +2588,13 @@ ipf_scanlist(fr_info_t *fin, u_32_t pass)
/* functions called from the IPFilter "mainline" in ipf_check(). */
/* ------------------------------------------------------------------------ */
frentry_t *
-ipf_acctpkt(fr_info_t *fin, u_32_t *passp)
+ipf_acctpkt(fr_info_t *fin, u_32_t *passp __unused)
{
ipf_main_softc_t *softc = fin->fin_main_soft;
char group[FR_GROUPLEN];
frentry_t *fr, *frsave;
u_32_t pass, rulen;
- passp = passp;
fr = softc->ipf_acct[fin->fin_out][softc->ipf_active];
if (fr != NULL) {
@@ -4200,7 +4197,7 @@ ipf_getstat(ipf_main_softc_t *softc, friostat_t *fiop, int rev)
(rev / 10000) % 100,
(rev / 100) % 100);
#else
- rev = rev;
+ (void)rev; /* UNUSED */
(void) strncpy(fiop->f_version, ipfilter_version,
sizeof(fiop->f_version));
#endif
@@ -4408,13 +4405,12 @@ frrequest(ipf_main_softc_t *softc, int unit, ioctlcmd_t req, caddr_t data,
OP_ZERO /* zero statistics and counters */ }
addrem = OP_ADD;
frentry_t frd, *fp, *f, **fprev, **ftail;
- void *ptr, *uptr, *cptr;
+ void *ptr, *uptr;
u_int *p, *pp;
frgroup_t *fg;
char *group;
ptr = NULL;
- cptr = NULL;
fg = NULL;
fp = &frd;
if (makecopy != 0) {
@@ -4532,7 +4528,6 @@ frrequest(ipf_main_softc_t *softc, int unit, ioctlcmd_t req, caddr_t data,
}
ptr = NULL;
- cptr = NULL;
if (FR_ISACCOUNT(fp->fr_flags))
unit = IPL_LOGCOUNT;
@@ -7314,11 +7309,10 @@ ipf_resolvedest(ipf_main_softc_t *softc, char *base, frdest_t *fdp, int v)
/* for both IPv4 and IPv6 on the same physical NIC. */
/* ------------------------------------------------------------------------ */
void *
-ipf_resolvenic(ipf_main_softc_t *softc, char *name, int v)
+ipf_resolvenic(ipf_main_softc_t *softc __unused, char *name, int v)
{
void *nic;
- softc = softc; /* gcc -Wextra */
if (name[0] == '\0')
return (NULL);
@@ -7455,6 +7449,10 @@ ipf_token_find(ipf_main_softc_t *softc, int type, int uid, void *ptr)
{
ipftoken_t *it, *new;
+ KMALLOC(new, ipftoken_t *);
+ if (new != NULL)
+ bzero((char *)new, sizeof(*new));
+
WRITE_ENTER(&softc->ipf_tokens);
for (it = softc->ipf_token_head; it != NULL; it = it->ipt_next) {
if ((ptr == it->ipt_ctx) && (type == it->ipt_type) &&
@@ -7463,10 +7461,6 @@ ipf_token_find(ipf_main_softc_t *softc, int type, int uid, void *ptr)
}
if (it == NULL) {
- KMALLOC(new, ipftoken_t *);
- if (new != NULL)
- bzero((char *)new, sizeof(*new));
-
it = new;
new = NULL;
if (it == NULL) {
@@ -7478,6 +7472,11 @@ ipf_token_find(ipf_main_softc_t *softc, int type, int uid, void *ptr)
it->ipt_type = type;
it->ipt_ref = 1;
} else {
+ if (new != NULL) {
+ KFREE(new);
+ new = NULL;
+ }
+
if (it->ipt_complete > 0)
it = NULL;
else
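The reordering above moves the KMALLOC() of the candidate token in front of WRITE_ENTER(), so the allocation, which can sleep or fail, never happens while the ipf_tokens write lock is held; if the lookup then finds an existing token, the preallocation is released with KFREE(). The general allocate-outside-the-lock shape, with placeholder names:

    /*
     * Sketch: preallocate before locking, free if unused.
     * lookup_locked()/insert_locked() and struct entry are invented.
     */
    struct entry *new, *it;

    new = malloc(sizeof(*new), M_TEMP, M_WAITOK | M_ZERO);
    rw_wlock(&lock);
    it = lookup_locked(key);
    if (it == NULL) {
            it = new;
            new = NULL;             /* preallocation consumed */
            insert_locked(it);
    }
    rw_wunlock(&lock);
    if (new != NULL)
            free(new, M_TEMP);      /* entry already existed */
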
diff --git a/sys/netpfil/ipfilter/netinet/ip_fil_freebsd.c b/sys/netpfil/ipfilter/netinet/ip_fil_freebsd.c
index 04850549db98..6eb6cf2a7a47 100644
--- a/sys/netpfil/ipfilter/netinet/ip_fil_freebsd.c
+++ b/sys/netpfil/ipfilter/netinet/ip_fil_freebsd.c
@@ -463,13 +463,14 @@ ipf_send_ip(fr_info_t *fin, mb_t *m)
int
ipf_send_icmp_err(int type, fr_info_t *fin, int dst)
{
- int err, hlen, xtra, iclen, ohlen, avail, code;
+ int err, hlen, xtra, iclen, ohlen, avail;
struct in_addr dst4;
struct icmp *icmp;
struct mbuf *m;
i6addr_t dst6;
void *ifp;
#ifdef USE_INET6
+ int code;
ip6_t *ip6;
#endif
ip_t *ip, *ip2;
@@ -477,8 +478,8 @@ ipf_send_icmp_err(int type, fr_info_t *fin, int dst)
if ((type < 0) || (type >= ICMP_MAXTYPE))
return (-1);
- code = fin->fin_icode;
#ifdef USE_INET6
+ code = fin->fin_icode;
/* See NetBSD ip_fil_netbsd.c r1.4: */
if ((code < 0) || (code >= sizeof(icmptoicmp6unreach)/sizeof(int)))
return (-1);
diff --git a/sys/netpfil/ipfilter/netinet/ip_ftp_pxy.c b/sys/netpfil/ipfilter/netinet/ip_ftp_pxy.c
index 482e0b456ae5..8c9317c38326 100644
--- a/sys/netpfil/ipfilter/netinet/ip_ftp_pxy.c
+++ b/sys/netpfil/ipfilter/netinet/ip_ftp_pxy.c
@@ -219,7 +219,7 @@ ipf_p_ftp_soft_destroy(ipf_main_softc_t *softc, void *arg)
int
-ipf_p_ftp_new(void *arg, fr_info_t *fin, ap_session_t *aps, nat_t *nat)
+ipf_p_ftp_new(void *arg, fr_info_t *fin, ap_session_t *aps, nat_t *nat __unused)
{
ftpinfo_t *ftp;
ftpside_t *f;
@@ -228,8 +228,6 @@ ipf_p_ftp_new(void *arg, fr_info_t *fin, ap_session_t *aps, nat_t *nat)
if (ftp == NULL)
return (-1);
- nat = nat; /* LINT */
-
aps->aps_data = ftp;
aps->aps_psiz = sizeof(ftpinfo_t);
aps->aps_sport = htons(fin->fin_sport);
@@ -1715,7 +1713,9 @@ ipf_p_ftp_eprt4(ipf_ftp_softc_t *softf, fr_info_t *fin, ip_t *ip, nat_t *nat,
return (0);
if (c != delim)
return (0);
- addr |= addr;
+#if 0
+ addr |= (addr << 0);
+#endif
/*
* Get the port number
diff --git a/sys/netpfil/ipfilter/netinet/ip_htable.c b/sys/netpfil/ipfilter/netinet/ip_htable.c
index 22d427b87a71..91b375f80db1 100644
--- a/sys/netpfil/ipfilter/netinet/ip_htable.c
+++ b/sys/netpfil/ipfilter/netinet/ip_htable.c
@@ -343,6 +343,7 @@ ipf_htable_create(ipf_main_softc_t *softc, void *arg, iplookupop_t *op)
iph->iph_ref = 1;
iph->iph_list = NULL;
iph->iph_tail = &iph->iph_list;
+ iph->iph_unit = unit;
iph->iph_next = softh->ipf_htables[unit + 1];
iph->iph_pnext = &softh->ipf_htables[unit + 1];
if (softh->ipf_htables[unit + 1] != NULL)
@@ -603,7 +604,7 @@ ipf_htent_remove(ipf_main_softc_t *softc, void *arg, iphtable_t *iph,
switch (iph->iph_type & ~IPHASH_ANON)
{
case IPHASH_GROUPMAP :
- if (ipe->ipe_group != NULL)
+ if (ipe->ipe_ptr != NULL)
ipf_group_del(softc, ipe->ipe_ptr, NULL);
break;
@@ -973,7 +974,6 @@ ipf_htent_find(iphtable_t *iph, iphtent_t *ipeo)
{
iphtent_t ipe, *ent;
u_int hv;
- int bits;
bcopy((char *)ipeo, (char *)&ipe, sizeof(ipe));
ipe.ipe_addr.i6[0] &= ipe.ipe_mask.i6[0];
@@ -981,7 +981,6 @@ ipf_htent_find(iphtable_t *iph, iphtent_t *ipeo)
ipe.ipe_addr.i6[2] &= ipe.ipe_mask.i6[2];
ipe.ipe_addr.i6[3] &= ipe.ipe_mask.i6[3];
if (ipe.ipe_family == AF_INET) {
- bits = count4bits(ipe.ipe_mask.in4_addr);
ipe.ipe_addr.i6[1] = 0;
ipe.ipe_addr.i6[2] = 0;
ipe.ipe_addr.i6[3] = 0;
@@ -993,7 +992,6 @@ ipf_htent_find(iphtable_t *iph, iphtent_t *ipeo)
} else
#ifdef USE_INET6
if (ipe.ipe_family == AF_INET6) {
- bits = count6bits(ipe.ipe_mask.i6);
hv = IPE_V6_HASH_FN(ipe.ipe_addr.i6,
ipe.ipe_mask.i6, iph->iph_size);
} else
diff --git a/sys/netpfil/ipfilter/netinet/ip_ipsec_pxy.c b/sys/netpfil/ipfilter/netinet/ip_ipsec_pxy.c
index c6e4be17e22e..d5103c2944dc 100644
--- a/sys/netpfil/ipfilter/netinet/ip_ipsec_pxy.c
+++ b/sys/netpfil/ipfilter/netinet/ip_ipsec_pxy.c
@@ -341,15 +341,13 @@ ipf_p_ipsec_inout(void *arg, fr_info_t *fin, ap_session_t *aps, nat_t *nat)
* UDP/TCP port numbers).
*/
int
-ipf_p_ipsec_match(fr_info_t *fin, ap_session_t *aps, nat_t *nat)
+ipf_p_ipsec_match(fr_info_t *fin, ap_session_t *aps, nat_t *nat __unused)
{
ipsec_pxy_t *ipsec;
u_32_t cookies[4];
mb_t *m;
int off;
- nat = nat; /* LINT */
-
if ((fin->fin_dlen < sizeof(cookies)) || (fin->fin_flx & FI_FRAG))
return (-1);
diff --git a/sys/netpfil/ipfilter/netinet/ip_irc_pxy.c b/sys/netpfil/ipfilter/netinet/ip_irc_pxy.c
index 026459299efd..aa9e84be19ed 100644
--- a/sys/netpfil/ipfilter/netinet/ip_irc_pxy.c
+++ b/sys/netpfil/ipfilter/netinet/ip_irc_pxy.c
@@ -221,7 +221,7 @@ ipf_p_irc_complete(ircinfo_t *ircp, char *buf, size_t len)
int
-ipf_p_irc_new(void *arg, fr_info_t *fin, ap_session_t *aps, nat_t *nat)
+ipf_p_irc_new(void *arg, fr_info_t *fin, ap_session_t *aps, nat_t *nat __unused)
{
ircinfo_t *irc;
@@ -232,8 +232,6 @@ ipf_p_irc_new(void *arg, fr_info_t *fin, ap_session_t *aps, nat_t *nat)
if (irc == NULL)
return (-1);
- nat = nat; /* LINT */
-
aps->aps_data = irc;
aps->aps_psiz = sizeof(ircinfo_t);
@@ -422,8 +420,7 @@ ipf_p_irc_send(fr_info_t *fin, nat_t *nat)
int
-ipf_p_irc_out(void *arg, fr_info_t *fin, ap_session_t *aps, nat_t *nat)
+ipf_p_irc_out(void *arg, fr_info_t *fin, ap_session_t *aps __unused, nat_t *nat)
{
- aps = aps; /* LINT */
return (ipf_p_irc_send(fin, nat));
}
diff --git a/sys/netpfil/ipfilter/netinet/ip_lookup.c b/sys/netpfil/ipfilter/netinet/ip_lookup.c
index b46d1b875003..a52dbef00166 100644
--- a/sys/netpfil/ipfilter/netinet/ip_lookup.c
+++ b/sys/netpfil/ipfilter/netinet/ip_lookup.c
@@ -230,13 +230,11 @@ ipf_lookup_soft_destroy(ipf_main_softc_t *softc, void *arg)
/* ------------------------------------------------------------------------ */
int
ipf_lookup_ioctl(ipf_main_softc_t *softc, caddr_t data, ioctlcmd_t cmd,
- int mode, int uid, void *ctx)
+ int mode __unused, int uid, void *ctx)
{
int err;
SPL_INT(s);
- mode = mode; /* LINT */
-
SPL_NET(s);
switch (cmd)
diff --git a/sys/netpfil/ipfilter/netinet/ip_nat.c b/sys/netpfil/ipfilter/netinet/ip_nat.c
index a13c6129a287..972511f43bd5 100644
--- a/sys/netpfil/ipfilter/netinet/ip_nat.c
+++ b/sys/netpfil/ipfilter/netinet/ip_nat.c
@@ -3224,13 +3224,10 @@ ipf_nat_finalise(fr_info_t *fin, nat_t *nat)
ipf_nat_softc_t *softn = softc->ipf_nat_soft;
u_32_t sum1, sum2, sumd;
frentry_t *fr;
- u_32_t flags;
#if SOLARIS && defined(_KERNEL) && defined(ICK_M_CTL_MAGIC)
qpktinfo_t *qpi = fin->fin_qpi;
#endif
- flags = nat->nat_flags;
-
switch (nat->nat_pr[0])
{
case IPPROTO_ICMP :
@@ -3538,8 +3535,8 @@ ipf_nat_icmperrorlookup(fr_info_t *fin, int dir)
{
ipf_main_softc_t *softc = fin->fin_main_soft;
ipf_nat_softc_t *softn = softc->ipf_nat_soft;
- int flags = 0, type, minlen;
- icmphdr_t *icmp, *orgicmp;
+ int flags = 0, minlen;
+ icmphdr_t *orgicmp;
nat_stat_side_t *nside;
tcphdr_t *tcp = NULL;
u_short data[2];
@@ -3547,8 +3544,6 @@ ipf_nat_icmperrorlookup(fr_info_t *fin, int dir)
ip_t *oip;
u_int p;
- icmp = fin->fin_dp;
- type = icmp->icmp_type;
nside = &softn->ipf_nat_stats.ns_side[fin->fin_out];
/*
* Does it at least have the return (basic) IP header ?
@@ -3999,9 +3994,7 @@ ipf_nat_inlookup(fr_info_t *fin, u_int flags, u_int p,
ipf_main_softc_t *softc = fin->fin_main_soft;
ipf_nat_softc_t *softn = softc->ipf_nat_soft;
u_short sport, dport;
- grehdr_t *gre;
ipnat_t *ipn;
- u_int sflags;
nat_t *nat;
int nflags;
u_32_t dst;
@@ -4009,9 +4002,7 @@ ipf_nat_inlookup(fr_info_t *fin, u_int flags, u_int p,
u_int hv, rhv;
ifp = fin->fin_ifp;
- gre = NULL;
dst = mapdst.s_addr;
- sflags = flags & NAT_TCPUDPICMP;
switch (p)
{
@@ -4330,14 +4321,12 @@ ipf_nat_outlookup(fr_info_t *fin, u_int flags, u_int p,
ipf_main_softc_t *softc = fin->fin_main_soft;
ipf_nat_softc_t *softn = softc->ipf_nat_soft;
u_short sport, dport;
- u_int sflags;
ipnat_t *ipn;
nat_t *nat;
void *ifp;
u_int hv;
ifp = fin->fin_ifp;
- sflags = flags & IPN_TCPUDPICMP;
switch (p)
{
@@ -4756,7 +4745,6 @@ ipf_nat_checkout(fr_info_t *fin, u_32_t *passp)
struct ifnet *ifp, *sifp;
ipf_main_softc_t *softc;
ipf_nat_softc_t *softn;
- icmphdr_t *icmp = NULL;
tcphdr_t *tcp = NULL;
int rval, natfailed;
u_int nflags = 0;
@@ -4802,8 +4790,6 @@ ipf_nat_checkout(fr_info_t *fin, u_32_t *passp)
nflags = IPN_UDP;
break;
case IPPROTO_ICMP :
- icmp = fin->fin_dp;
-
/*
* This is an incoming packet, so the destination is
* the icmp_id and the source port equals 0
@@ -5463,7 +5449,10 @@ ipf_nat_in(fr_info_t *fin, nat_t *nat, int natadd, u_32_t nflags)
{
ipf_main_softc_t *softc = fin->fin_main_soft;
ipf_nat_softc_t *softn = softc->ipf_nat_soft;
- u_32_t sumd, ipsumd, sum1, sum2;
+ u_32_t sumd, sum1, sum2;
+#if !defined(_KERNEL) || SOLARIS
+ u_32_t ipsumd;
+#endif
icmphdr_t *icmp;
tcphdr_t *tcp;
ipnat_t *np;
@@ -5499,7 +5488,9 @@ ipf_nat_in(fr_info_t *fin, nat_t *nat, int natadd, u_32_t nflags)
ipf_sync_update(softc, SMC_NAT, fin, nat->nat_sync);
+#if !defined(_KERNEL) || SOLARIS
ipsumd = nat->nat_ipsumd;
+#endif
/*
* Fix up checksums, not by recalculating them, but
* simply computing adjustments.
@@ -5521,7 +5512,9 @@ ipf_nat_in(fr_info_t *fin, nat_t *nat, int natadd, u_32_t nflags)
sum1 = nat->nat_osrcaddr;
sum2 = nat->nat_nsrcaddr;
CALC_SUMD(sum1, sum2, sumd);
+#if !defined(_KERNEL) || SOLARIS
ipsumd -= sumd;
+#endif
}
fin->fin_ip->ip_dst = nat->nat_ndstip;
fin->fin_daddr = nat->nat_ndstaddr;
@@ -5538,7 +5531,9 @@ ipf_nat_in(fr_info_t *fin, nat_t *nat, int natadd, u_32_t nflags)
sum1 = nat->nat_odstaddr;
sum2 = nat->nat_ndstaddr;
CALC_SUMD(sum1, sum2, sumd);
+#if !defined(_KERNEL) || SOLARIS
ipsumd -= sumd;
+#endif
}
fin->fin_ip->ip_dst = nat->nat_osrcip;
fin->fin_daddr = nat->nat_osrcaddr;
@@ -7352,30 +7347,18 @@ ipf_nat_nextaddr(fr_info_t *fin, nat_addr_t *na, u_32_t *old, u_32_t *dst)
{
ipf_main_softc_t *softc = fin->fin_main_soft;
ipf_nat_softc_t *softn = softc->ipf_nat_soft;
- u_32_t amin, amax, new;
+ u_32_t new;
i6addr_t newip;
int error;
new = 0;
- amin = na->na_addr[0].in4.s_addr;
switch (na->na_atype)
{
case FRI_RANGE :
- amax = na->na_addr[1].in4.s_addr;
- break;
-
case FRI_NETMASKED :
case FRI_DYNAMIC :
case FRI_NORMAL :
- /*
- * Compute the maximum address by adding the inverse of the
- * netmask to the minimum address.
- */
- amax = ~na->na_addr[1].in4.s_addr;
- amax |= amin;
- break;
-
case FRI_LOOKUP :
break;
diff --git a/sys/netpfil/ipfilter/netinet/ip_nat6.c b/sys/netpfil/ipfilter/netinet/ip_nat6.c
index dbe19c40c2f2..6d5913177b90 100644
--- a/sys/netpfil/ipfilter/netinet/ip_nat6.c
+++ b/sys/netpfil/ipfilter/netinet/ip_nat6.c
@@ -1130,9 +1130,6 @@ ipf_nat6_finalise(fr_info_t *fin, nat_t *nat)
ipf_nat_softc_t *softn = softc->ipf_nat_soft;
u_32_t sum1, sum2, sumd;
frentry_t *fr;
- u_32_t flags;
-
- flags = nat->nat_flags;
switch (fin->fin_p)
{
@@ -1355,8 +1352,8 @@ ipf_nat6_icmperrorlookup(fr_info_t *fin, int dir)
{
ipf_main_softc_t *softc = fin->fin_main_soft;
ipf_nat_softc_t *softn = softc->ipf_nat_soft;
- struct icmp6_hdr *icmp6, *orgicmp;
- int flags = 0, type, minlen;
+ struct icmp6_hdr *orgicmp;
+ int flags = 0, minlen;
nat_stat_side_t *nside;
tcphdr_t *tcp = NULL;
u_short data[2];
@@ -1365,8 +1362,6 @@ ipf_nat6_icmperrorlookup(fr_info_t *fin, int dir)
u_int p;
minlen = 40;
- icmp6 = fin->fin_dp;
- type = icmp6->icmp6_type;
nside = &softn->ipf_nat_stats.ns_side6[fin->fin_out];
/*
* Does it at least have the return (basic) IP header ?
@@ -1500,9 +1495,8 @@ ipf_nat6_ip6subtract(i6addr_t *ip1, i6addr_t *ip2)
i6addr_t l1, l2, d;
u_short *s1, *s2, *ds;
u_32_t r;
- int i, neg;
+ int i;
- neg = 0;
l1 = *ip1;
l2 = *ip2;
s1 = (u_short *)&l1;
@@ -1519,7 +1513,6 @@ ipf_nat6_ip6subtract(i6addr_t *ip1, i6addr_t *ip2)
}
if (s2[0] > s1[0]) {
ds[0] = s2[0] + 0x10000 - s1[0];
- neg = 1;
} else {
ds[0] = s2[0] - s1[0];
}
@@ -1869,9 +1862,9 @@ ipf_nat6_inlookup(fr_info_t *fin, u_int flags, u_int p,
ipf_main_softc_t *softc = fin->fin_main_soft;
ipf_nat_softc_t *softn = softc->ipf_nat_soft;
u_short sport, dport;
- grehdr_t *gre;
+#ifdef IPF_V6_PROXIES
ipnat_t *ipn;
- u_int sflags;
+#endif
nat_t *nat;
int nflags;
i6addr_t dst;
@@ -1881,10 +1874,7 @@ ipf_nat6_inlookup(fr_info_t *fin, u_int flags, u_int p,
ifp = fin->fin_ifp;
sport = 0;
dport = 0;
- gre = NULL;
dst.in6 = *mapdst;
- sflags = flags & NAT_TCPUDPICMP;
-
switch (p)
{
case IPPROTO_TCP :
@@ -1962,8 +1952,8 @@ ipf_nat6_inlookup(fr_info_t *fin, u_int flags, u_int p,
if ((nat->nat_flags & IPN_TCPUDP) != 0) {
- ipn = nat->nat_ptr;
#ifdef IPF_V6_PROXIES
+ ipn = nat->nat_ptr;
if ((ipn != NULL) && (nat->nat_aps != NULL))
if (appr_match(fin, nat) != 0)
continue;
@@ -2192,14 +2182,14 @@ ipf_nat6_outlookup(fr_info_t *fin, u_int flags, u_int p,
ipf_main_softc_t *softc = fin->fin_main_soft;
ipf_nat_softc_t *softn = softc->ipf_nat_soft;
u_short sport, dport;
- u_int sflags;
+#ifdef IPF_V6_PROXIES
ipnat_t *ipn;
+#endif
nat_t *nat;
void *ifp;
u_int hv;
ifp = fin->fin_ifp;
- sflags = flags & IPN_TCPUDPICMP;
sport = 0;
dport = 0;
@@ -2280,8 +2270,8 @@ ipf_nat6_outlookup(fr_info_t *fin, u_int flags, u_int p,
break;
}
- ipn = nat->nat_ptr;
#ifdef IPF_V6_PROXIES
+ ipn = nat->nat_ptr;
if ((ipn != NULL) && (nat->nat_aps != NULL))
if (appr_match(fin, nat) != 0)
continue;
@@ -2568,7 +2558,6 @@ ipf_nat6_checkout(fr_info_t *fin, u_32_t *passp)
ipf_nat_softc_t *softn = softc->ipf_nat_soft;
struct icmp6_hdr *icmp6 = NULL;
struct ifnet *ifp, *sifp;
- tcphdr_t *tcp = NULL;
int rval, natfailed;
ipnat_t *np = NULL;
u_int nflags = 0;
@@ -2621,9 +2610,6 @@ ipf_nat6_checkout(fr_info_t *fin, u_32_t *passp)
default :
break;
}
-
- if ((nflags & IPN_TCPUDP))
- tcp = fin->fin_dp;
}
ipa = fin->fin_src6;
@@ -2965,7 +2951,9 @@ ipf_nat6_checkin(fr_info_t *fin, u_32_t *passp)
int rval, natfailed;
struct ifnet *ifp;
i6addr_t ipa, iph;
- tcphdr_t *tcp;
+#ifdef IPF_V6_PROXIES
+ tcphdr_t *tcp = NULL;
+#endif
u_short dport;
ipnat_t *np;
nat_t *nat;
@@ -2973,7 +2961,6 @@ ipf_nat6_checkin(fr_info_t *fin, u_32_t *passp)
if (softn->ipf_nat_stats.ns_rules == 0 || softn->ipf_nat_lock != 0)
return (0);
- tcp = NULL;
icmp6 = NULL;
dport = 0;
natadd = 1;
@@ -3014,7 +3001,9 @@ ipf_nat6_checkin(fr_info_t *fin, u_32_t *passp)
}
if ((nflags & IPN_TCPUDP)) {
+#ifdef IPF_V6_PROXIES
tcp = fin->fin_dp;
+#endif
dport = fin->fin_data[1];
}
}
@@ -3802,32 +3791,19 @@ ipf_nat6_nextaddr(fr_info_t *fin, nat_addr_t *na, i6addr_t *old, i6addr_t *dst)
ipf_main_softc_t *softc = fin->fin_main_soft;
ipf_nat_softc_t *softn = softc->ipf_nat_soft;
i6addr_t newip, new;
- u_32_t amin, amax;
int error;
new.i6[0] = 0;
new.i6[1] = 0;
new.i6[2] = 0;
new.i6[3] = 0;
- amin = na->na_addr[0].in4.s_addr;
switch (na->na_atype)
{
case FRI_RANGE :
- amax = na->na_addr[1].in4.s_addr;
- break;
-
case FRI_NETMASKED :
case FRI_DYNAMIC :
case FRI_NORMAL :
- /*
- * Compute the maximum address by adding the inverse of the
- * netmask to the minimum address.
- */
- amax = ~na->na_addr[1].in4.s_addr;
- amax |= amin;
- break;
-
case FRI_LOOKUP :
break;
diff --git a/sys/netpfil/ipfilter/netinet/ip_netbios_pxy.c b/sys/netpfil/ipfilter/netinet/ip_netbios_pxy.c
index 2ad642adfbcd..f9c1ab50b8a2 100644
--- a/sys/netpfil/ipfilter/netinet/ip_netbios_pxy.c
+++ b/sys/netpfil/ipfilter/netinet/ip_netbios_pxy.c
@@ -67,7 +67,7 @@ ipf_p_netbios_main_unload(void)
int
-ipf_p_netbios_out(void *arg, fr_info_t *fin, ap_session_t *aps, nat_t *nat)
+ipf_p_netbios_out(void *arg, fr_info_t *fin, ap_session_t *aps __unused, nat_t *nat __unused)
{
char dgmbuf[6];
int off, dlen;
@@ -75,9 +75,6 @@ ipf_p_netbios_out(void *arg, fr_info_t *fin, ap_session_t *aps, nat_t *nat)
ip_t *ip;
mb_t *m;
- aps = aps; /* LINT */
- nat = nat; /* LINT */
-
m = fin->fin_m;
dlen = fin->fin_dlen - sizeof(*udp);
/*
diff --git a/sys/netpfil/ipfilter/netinet/ip_pptp_pxy.c b/sys/netpfil/ipfilter/netinet/ip_pptp_pxy.c
index 0ac19b067d2d..dc4c67dc14f0 100644
--- a/sys/netpfil/ipfilter/netinet/ip_pptp_pxy.c
+++ b/sys/netpfil/ipfilter/netinet/ip_pptp_pxy.c
@@ -281,7 +281,6 @@ ipf_p_pptp_nextmessage(fr_info_t *fin, nat_t *nat, pptp_pxy_t *pptp, int rev)
tcphdr_t *tcp;
int dlen, off;
u_short len;
- char *msg;
tcp = fin->fin_dp;
dlen = fin->fin_dlen - (TCP_OFF(tcp) << 2);
@@ -310,8 +309,6 @@ ipf_p_pptp_nextmessage(fr_info_t *fin, nat_t *nat, pptp_pxy_t *pptp, int rev)
return (-1);
}
- msg = (char *)fin->fin_dp + (TCP_OFF(tcp) << 2);
-
while (dlen > 0) {
off += pptps->pptps_bytes;
if (pptps->pptps_gothdr == 0) {
@@ -337,7 +334,6 @@ ipf_p_pptp_nextmessage(fr_info_t *fin, nat_t *nat, pptp_pxy_t *pptp, int rev)
}
}
dlen -= len;
- msg += len;
off += len;
pptps->pptps_gothdr = 1;
@@ -381,7 +377,6 @@ ipf_p_pptp_nextmessage(fr_info_t *fin, nat_t *nat, pptp_pxy_t *pptp, int rev)
pptps->pptps_len = 0;
start += len;
- msg += len;
dlen -= len;
}
diff --git a/sys/netpfil/ipfilter/netinet/ip_proxy.c b/sys/netpfil/ipfilter/netinet/ip_proxy.c
index 9785fc37d3da..9fb6dbd2a9e1 100644
--- a/sys/netpfil/ipfilter/netinet/ip_proxy.c
+++ b/sys/netpfil/ipfilter/netinet/ip_proxy.c
@@ -679,14 +679,12 @@ ipf_proxy_ok(fr_info_t *fin, tcphdr_t *tcp, ipnat_t *np)
/* ------------------------------------------------------------------------ */
int
ipf_proxy_ioctl(ipf_main_softc_t *softc, caddr_t data, ioctlcmd_t cmd,
- int mode, void *ctx)
+ int mode __unused, void *ctx)
{
ap_ctl_t ctl;
caddr_t ptr;
int error;
- mode = mode; /* LINT */
-
switch (cmd)
{
case SIOCPROXY :
diff --git a/sys/netpfil/ipfilter/netinet/ip_raudio_pxy.c b/sys/netpfil/ipfilter/netinet/ip_raudio_pxy.c
index 2cfaaa58200f..94f0e3ada707 100644
--- a/sys/netpfil/ipfilter/netinet/ip_raudio_pxy.c
+++ b/sys/netpfil/ipfilter/netinet/ip_raudio_pxy.c
@@ -49,12 +49,10 @@ ipf_p_raudio_main_unload(void)
* Setup for a new proxy to handle Real Audio.
*/
int
-ipf_p_raudio_new(void *arg, fr_info_t *fin, ap_session_t *aps, nat_t *nat)
+ipf_p_raudio_new(void *arg, fr_info_t *fin, ap_session_t *aps, nat_t *nat __unused)
{
raudio_t *rap;
- nat = nat; /* LINT */
-
if (fin->fin_v != 4)
return (-1);
@@ -72,7 +70,7 @@ ipf_p_raudio_new(void *arg, fr_info_t *fin, ap_session_t *aps, nat_t *nat)
int
-ipf_p_raudio_out(void *arg, fr_info_t *fin, ap_session_t *aps, nat_t *nat)
+ipf_p_raudio_out(void *arg, fr_info_t *fin, ap_session_t *aps, nat_t *nat __unused)
{
raudio_t *rap = aps->aps_data;
unsigned char membuf[512 + 1], *s;
@@ -82,8 +80,6 @@ ipf_p_raudio_out(void *arg, fr_info_t *fin, ap_session_t *aps, nat_t *nat)
int len = 0;
mb_t *m;
- nat = nat; /* LINT */
-
/*
* If we've already processed the start messages, then nothing left
* for the proxy to do.
diff --git a/sys/netpfil/ipfilter/netinet/ip_rcmd_pxy.c b/sys/netpfil/ipfilter/netinet/ip_rcmd_pxy.c
index 778f14f442de..b85794e75499 100644
--- a/sys/netpfil/ipfilter/netinet/ip_rcmd_pxy.c
+++ b/sys/netpfil/ipfilter/netinet/ip_rcmd_pxy.c
@@ -63,18 +63,12 @@ ipf_p_rcmd_main_unload(void)
* Setup for a new RCMD proxy.
*/
int
-ipf_p_rcmd_new(void *arg, fr_info_t *fin, ap_session_t *aps, nat_t *nat)
+ipf_p_rcmd_new(void *arg, fr_info_t *fin __unused, ap_session_t *aps, nat_t *nat)
{
tcphdr_t *tcp = (tcphdr_t *)fin->fin_dp;
rcmdinfo_t *rc;
ipnat_t *ipn;
- ipnat_t *np;
- int size;
- fin = fin; /* LINT */
-
- np = nat->nat_ptr;
- size = np->in_size;
KMALLOC(rc, rcmdinfo_t *);
if (rc == NULL) {
#ifdef IP_RCMD_PROXY_DEBUG
diff --git a/sys/netpfil/ipfilter/netinet/ip_rpcb_pxy.c b/sys/netpfil/ipfilter/netinet/ip_rpcb_pxy.c
index f8f4d2d325e1..c608f84d7b3b 100644
--- a/sys/netpfil/ipfilter/netinet/ip_rpcb_pxy.c
+++ b/sys/netpfil/ipfilter/netinet/ip_rpcb_pxy.c
@@ -144,12 +144,10 @@ ipf_p_rpcb_main_unload(void)
/* Allocate resources for per-session proxy structures. */
/* -------------------------------------------------------------------- */
int
-ipf_p_rpcb_new(void *arg, fr_info_t *fin, ap_session_t *aps, nat_t *nat)
+ipf_p_rpcb_new(void *arg, fr_info_t *fin, ap_session_t *aps, nat_t *nat __unused)
{
rpcb_session_t *rs;
- nat = nat; /* LINT */
-
if (fin->fin_v != 4)
return (-1);
@@ -1023,10 +1021,8 @@ ipf_p_rpcb_lookup(rpcb_session_t *rs, u_32_t xid)
/* Free the RPCB transaction record rx from the chain of entries. */
/* -------------------------------------------------------------------- */
static void
-ipf_p_rpcb_deref(rpcb_session_t *rs, rpcb_xact_t *rx)
+ipf_p_rpcb_deref(rpcb_session_t *rs __unused, rpcb_xact_t *rx)
{
- rs = rs; /* LINT */
-
if (rx == NULL)
return;
diff --git a/sys/netpfil/ipfilter/netinet/ip_state.c b/sys/netpfil/ipfilter/netinet/ip_state.c
index 8fe11e3f1215..36fdf23cd062 100644
--- a/sys/netpfil/ipfilter/netinet/ip_state.c
+++ b/sys/netpfil/ipfilter/netinet/ip_state.c
@@ -883,7 +883,7 @@ ipf_state_putent(ipf_main_softc_t *softc, ipf_state_softc_t *softs,
{
ipstate_t *is, *isn;
ipstate_save_t ips;
- int error, out, i;
+ int error, i;
frentry_t *fr;
char *name;
@@ -929,7 +929,6 @@ ipf_state_putent(ipf_main_softc_t *softc, ipf_state_softc_t *softs,
return (ENOMEM);
}
bcopy((char *)&ips.ips_fr, (char *)fr, sizeof(*fr));
- out = fr->fr_flags & FR_OUTQUE ? 1 : 0;
isn->is_rule = fr;
ips.ips_is.is_rule = fr;
MUTEX_NUKE(&fr->fr_lock);
@@ -2207,20 +2206,6 @@ ipf_state_tcpinwindow(fr_info_t *fin, tcpdata_t *fdata, tcpdata_t *tdata,
(ackskew >= -1) && (ackskew <= 1)) {
inseq = 1;
} else if (!(flags & IS_TCPFSM)) {
- int i;
-
- i = (fin->fin_rev << 1) + fin->fin_out;
-
-#if 0
- if (is_pkts[i] == 0) {
- /*
- * Picking up a connection in the middle, the "next"
- * packet seen from a direction that is new should be
- * accepted, even if it appears out of sequence.
- */
- inseq = 1;
- } else
-#endif
if (!(fdata->td_winflags &
(TCP_WSCALE_SEEN|TCP_WSCALE_FIRST))) {
/*
@@ -2616,7 +2601,7 @@ ipf_checkicmpmatchingstate(fr_info_t *fin)
icmphdr_t *icmp;
fr_info_t ofin;
tcphdr_t *tcp;
- int type, len;
+ int len;
u_char pr;
ip_t *oip;
u_int hv;
@@ -2634,7 +2619,6 @@ ipf_checkicmpmatchingstate(fr_info_t *fin)
return (NULL);
}
ic = fin->fin_dp;
- type = ic->icmp_type;
oip = (ip_t *)((char *)ic + ICMPERR_ICMPHLEN);
/*
@@ -4362,7 +4346,6 @@ ipf_checkicmp6matchingstate(fr_info_t *fin)
ip6_t *oip6;
u_char pr;
u_int hv;
- int type;
/*
* Does it at least have the return (basic) IP header ?
@@ -4377,7 +4360,6 @@ ipf_checkicmp6matchingstate(fr_info_t *fin)
}
ic6 = fin->fin_dp;
- type = ic6->icmp6_type;
oip6 = (ip6_t *)((char *)ic6 + ICMPERR_ICMPHLEN);
if (fin->fin_plen < sizeof(*oip6)) {
diff --git a/sys/netpfil/ipfilter/netinet/ip_tftp_pxy.c b/sys/netpfil/ipfilter/netinet/ip_tftp_pxy.c
index d81de100120b..3c737b38aacc 100644
--- a/sys/netpfil/ipfilter/netinet/ip_tftp_pxy.c
+++ b/sys/netpfil/ipfilter/netinet/ip_tftp_pxy.c
@@ -151,7 +151,7 @@ ipf_p_tftp_in(void *arg, fr_info_t *fin, ap_session_t *aps, nat_t *nat)
int
-ipf_p_tftp_new(void *arg, fr_info_t *fin, ap_session_t *aps, nat_t *nat)
+ipf_p_tftp_new(void *arg, fr_info_t *fin __unused, ap_session_t *aps, nat_t *nat)
{
udphdr_t *udp;
tftpinfo_t *ti;
@@ -159,8 +159,6 @@ ipf_p_tftp_new(void *arg, fr_info_t *fin, ap_session_t *aps, nat_t *nat)
ipnat_t *np;
int size;
- fin = fin; /* LINT */
-
np = nat->nat_ptr;
size = np->in_size;
diff --git a/sys/netpfil/ipfilter/netinet/ipf_rb.h b/sys/netpfil/ipfilter/netinet/ipf_rb.h
index e047c7f44a4a..334311502aab 100644
--- a/sys/netpfil/ipfilter/netinet/ipf_rb.h
+++ b/sys/netpfil/ipfilter/netinet/ipf_rb.h
@@ -305,13 +305,11 @@ _n##_rb_walktree(struct _n##_rb_head *head, _n##_rb_walker_t func, void *arg)\
_t *prev; \
_t *next; \
_t *node = head->top._f.right; \
- _t *base; \
\
while (node != &_n##_rb_zero) \
node = node->_f.left; \
\
for (;;) { \
- base = node; \
prev = node; \
while ((node->_f.parent->_f.right == node) && \
(node != &_n##_rb_zero)) { \
diff --git a/sys/netpfil/ipfw/ip_fw2.c b/sys/netpfil/ipfw/ip_fw2.c
index 923633d76df7..c129c8c49921 100644
--- a/sys/netpfil/ipfw/ip_fw2.c
+++ b/sys/netpfil/ipfw/ip_fw2.c
@@ -196,7 +196,7 @@ SYSCTL_NODE(_net_inet_ip, OID_AUTO, fw, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
"Firewall");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, one_pass,
CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_one_pass), 0,
- "Only do a single pass through ipfw when using dummynet(4)");
+ "Only do a single pass through ipfw when using dummynet(4), ipfw_nat or other divert(4)-like interfaces");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, autoinc_step,
CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(autoinc_step), 0,
"Rule number auto-increment step");
diff --git a/sys/netpfil/pf/if_pflog.c b/sys/netpfil/pf/if_pflog.c
index 6a87ea2471cb..cb96d2fcc44c 100644
--- a/sys/netpfil/pf/if_pflog.c
+++ b/sys/netpfil/pf/if_pflog.c
@@ -284,12 +284,12 @@ pflog_packet(uint8_t action, u_int8_t reason,
* state lock, since this leads to unsafe LOR.
* These conditions are very very rare, however.
*/
- if (trigger->log & PF_LOG_SOCKET_LOOKUP && !pd->lookup.done && lookupsafe)
+ if (trigger->log & PF_LOG_USER && !pd->lookup.done && lookupsafe)
pd->lookup.done = pf_socket_lookup(pd);
- if (pd->lookup.done > 0)
+ if (trigger->log & PF_LOG_USER && pd->lookup.done > 0)
hdr.uid = pd->lookup.uid;
else
- hdr.uid = UID_MAX;
+ hdr.uid = -1;
hdr.pid = NO_PID;
hdr.rule_uid = rm->cuid;
hdr.rule_pid = rm->cpid;
diff --git a/sys/netpfil/pf/if_pfsync.c b/sys/netpfil/pf/if_pfsync.c
index fdedb9424117..4e03584b8f85 100644
--- a/sys/netpfil/pf/if_pfsync.c
+++ b/sys/netpfil/pf/if_pfsync.c
@@ -532,6 +532,7 @@ pfsync_state_import(union pfsync_state_union *sp, int flags, int msg_version)
struct pf_kpooladdr *rpool_first;
int error;
uint8_t rt = 0;
+ int n = 0;
PF_RULES_RASSERT();
@@ -557,10 +558,12 @@ pfsync_state_import(union pfsync_state_union *sp, int flags, int msg_version)
*/
if (sp->pfs_1301.rule != htonl(-1) && sp->pfs_1301.anchor == htonl(-1) &&
(flags & (PFSYNC_SI_IOCTL | PFSYNC_SI_CKSUM)) && ntohl(sp->pfs_1301.rule) <
- pf_main_ruleset.rules[PF_RULESET_FILTER].active.rcount)
- r = pf_main_ruleset.rules[
- PF_RULESET_FILTER].active.ptr_array[ntohl(sp->pfs_1301.rule)];
- else
+ pf_main_ruleset.rules[PF_RULESET_FILTER].active.rcount) {
+ TAILQ_FOREACH(r, pf_main_ruleset.rules[
+ PF_RULESET_FILTER].active.ptr, entries)
+ if (ntohl(sp->pfs_1301.rule) == n++)
+ break;
+ } else
r = &V_pf_default_rule;
/*
@@ -763,6 +766,10 @@ pfsync_state_import(union pfsync_state_union *sp, int flags, int msg_version)
__func__, msg_version);
}
+ if (! (st->act.rtableid == -1 ||
+ (st->act.rtableid >= 0 && st->act.rtableid < rt_numfibs)))
+ goto cleanup;
+
st->id = sp->pfs_1301.id;
st->creatorid = sp->pfs_1301.creatorid;
pf_state_peer_ntoh(&sp->pfs_1301.src, &st->src);
@@ -1083,7 +1090,7 @@ pfsync_in_ins(struct mbuf *m, int offset, int count, int flags, int action)
msg_version = PFSYNC_MSG_VERSION_1400;
break;
default:
- V_pfsyncstats.pfsyncs_badact++;
+ V_pfsyncstats.pfsyncs_badver++;
return (-1);
}
@@ -1110,9 +1117,8 @@ pfsync_in_ins(struct mbuf *m, int offset, int count, int flags, int action)
continue;
}
- if (pfsync_state_import(sp, flags, msg_version) == ENOMEM)
- /* Drop out, but process the rest of the actions. */
- break;
+ if (pfsync_state_import(sp, flags, msg_version) != 0)
+ V_pfsyncstats.pfsyncs_badact++;
}
return (total_len);
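With the ptr_array shadow copy of the ruleset gone, pfsync_state_import() resolves a wire-format rule number by counting entries while walking the active ruleset's TAILQ, an O(n) lookup that runs only on state import (the preceding bounds check against rcount guarantees the index is in range). The generic form of that walk:

    #include <sys/queue.h>

    /* Sketch: O(n) lookup of the n-th element of a TAILQ. */
    struct item {
            TAILQ_ENTRY(item) link;
    };
    TAILQ_HEAD(item_head, item);

    static struct item *
    nth_item(struct item_head *head, int n)
    {
            struct item *it;
            int i = 0;

            TAILQ_FOREACH(it, head, link)
                    if (i++ == n)
                            break;
            return (it);            /* NULL if n was out of range */
    }
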
diff --git a/sys/netpfil/pf/pf.c b/sys/netpfil/pf/pf.c
index b24bbe036141..009f7e4d78b1 100644
--- a/sys/netpfil/pf/pf.c
+++ b/sys/netpfil/pf/pf.c
@@ -682,7 +682,8 @@ pf_packet_rework_nat(struct pf_pdesc *pd, int off, struct pf_state_key *nk)
0);
break;
case AF_INET6:
- PF_ACPY(pd->src, &nk->addr[pd->sidx], pd->af);
+ pf_addrcpy(pd->src, &nk->addr[pd->sidx],
+ pd->af);
break;
default:
unhandled_af(pd->af);
@@ -696,7 +697,8 @@ pf_packet_rework_nat(struct pf_pdesc *pd, int off, struct pf_state_key *nk)
0);
break;
case AF_INET6:
- PF_ACPY(pd->dst, &nk->addr[pd->didx], pd->af);
+ pf_addrcpy(pd->dst, &nk->addr[pd->didx],
+ pd->af);
break;
default:
unhandled_af(pd->af);
@@ -1084,9 +1086,9 @@ pf_insert_src_node(struct pf_ksrc_node *sns[PF_SN_MAX],
(*sn)->af = af;
(*sn)->rule = r_track;
- PF_ACPY(&(*sn)->addr, src, af);
+ pf_addrcpy(&(*sn)->addr, src, af);
if (raddr != NULL)
- PF_ACPY(&(*sn)->raddr, raddr, af);
+ pf_addrcpy(&(*sn)->raddr, raddr, af);
(*sn)->rkif = rkif;
LIST_INSERT_HEAD(&(*sh)->nodes, *sn, entry);
(*sn)->creation = time_uptime;
@@ -1687,9 +1689,9 @@ pf_state_key_addr_setup(struct pf_pdesc *pd,
copy:
#endif /* INET6 */
if (saddr)
- PF_ACPY(&key->addr[pd->sidx], saddr, pd->af);
+ pf_addrcpy(&key->addr[pd->sidx], saddr, pd->af);
if (daddr)
- PF_ACPY(&key->addr[pd->didx], daddr, pd->af);
+ pf_addrcpy(&key->addr[pd->didx], daddr, pd->af);
return (0);
}
@@ -1734,13 +1736,17 @@ pf_state_key_setup(struct pf_pdesc *pd, u_int16_t sport, u_int16_t dport,
bzero(&(*nk)->addr[0], sizeof((*nk)->addr[0]));
bzero(&(*nk)->addr[1], sizeof((*nk)->addr[1]));
if (pd->dir == PF_IN) {
- PF_ACPY(&(*nk)->addr[pd->didx], &pd->nsaddr, pd->naf);
- PF_ACPY(&(*nk)->addr[pd->sidx], &pd->ndaddr, pd->naf);
+ pf_addrcpy(&(*nk)->addr[pd->didx], &pd->nsaddr,
+ pd->naf);
+ pf_addrcpy(&(*nk)->addr[pd->sidx], &pd->ndaddr,
+ pd->naf);
(*nk)->port[pd->didx] = pd->nsport;
(*nk)->port[pd->sidx] = pd->ndport;
} else {
- PF_ACPY(&(*nk)->addr[pd->sidx], &pd->nsaddr, pd->naf);
- PF_ACPY(&(*nk)->addr[pd->didx], &pd->ndaddr, pd->naf);
+ pf_addrcpy(&(*nk)->addr[pd->sidx], &pd->nsaddr,
+ pd->naf);
+ pf_addrcpy(&(*nk)->addr[pd->didx], &pd->ndaddr,
+ pd->naf);
(*nk)->port[pd->sidx] = pd->nsport;
(*nk)->port[pd->didx] = pd->ndport;
}
@@ -2053,11 +2059,11 @@ pf_udp_mapping_create(sa_family_t af, struct pf_addr *src_addr, uint16_t src_por
mapping = uma_zalloc(V_pf_udp_mapping_z, M_NOWAIT | M_ZERO);
if (mapping == NULL)
return (NULL);
- PF_ACPY(&mapping->endpoints[0].addr, src_addr, af);
+ pf_addrcpy(&mapping->endpoints[0].addr, src_addr, af);
mapping->endpoints[0].port = src_port;
mapping->endpoints[0].af = af;
mapping->endpoints[0].mapping = mapping;
- PF_ACPY(&mapping->endpoints[1].addr, nat_addr, af);
+ pf_addrcpy(&mapping->endpoints[1].addr, nat_addr, af);
mapping->endpoints[1].port = nat_port;
mapping->endpoints[1].af = af;
mapping->endpoints[1].mapping = mapping;
@@ -3295,9 +3301,9 @@ pf_change_ap(struct pf_pdesc *pd, struct pf_addr *a, u_int16_t *p,
MPASS(pd->ip_sum);
}
- PF_ACPY(&ao, a, pd->af);
+ pf_addrcpy(&ao, a, pd->af);
if (pd->af == pd->naf)
- PF_ACPY(a, an, pd->af);
+ pf_addrcpy(a, an, pd->af);
if (pd->m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6))
*pd->pcksum = ~*pd->pcksum;
@@ -3426,8 +3432,8 @@ pf_change_a6(struct pf_addr *a, u_int16_t *c, struct pf_addr *an, u_int8_t u)
{
struct pf_addr ao;
- PF_ACPY(&ao, a, AF_INET6);
- PF_ACPY(a, an, AF_INET6);
+ pf_addrcpy(&ao, a, AF_INET6);
+ pf_addrcpy(a, an, AF_INET6);
*c = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
@@ -3450,9 +3456,9 @@ pf_change_icmp(struct pf_addr *ia, u_int16_t *ip, struct pf_addr *oa,
{
struct pf_addr oia, ooa;
- PF_ACPY(&oia, ia, af);
+ pf_addrcpy(&oia, ia, af);
if (oa)
- PF_ACPY(&ooa, oa, af);
+ pf_addrcpy(&ooa, oa, af);
/* Change inner protocol port, fix inner protocol checksum. */
if (ip != NULL) {
@@ -3469,7 +3475,7 @@ pf_change_icmp(struct pf_addr *ia, u_int16_t *ip, struct pf_addr *oa,
*ic = pf_cksum_fixup(*ic, opc, *pc, 0);
}
/* Change inner ip address, fix inner ip and icmp checksums. */
- PF_ACPY(ia, na, af);
+ pf_addrcpy(ia, na, af);
switch (af) {
#ifdef INET
case AF_INET: {
@@ -3503,7 +3509,7 @@ pf_change_icmp(struct pf_addr *ia, u_int16_t *ip, struct pf_addr *oa,
}
/* Outer ip address, fix outer ip or icmpv6 checksum, if necessary. */
if (oa) {
- PF_ACPY(oa, na, af);
+ pf_addrcpy(oa, na, af);
switch (af) {
#ifdef INET
case AF_INET:
@@ -4299,8 +4305,8 @@ pf_undo_nat(struct pf_krule *nr, struct pf_pdesc *pd, uint16_t bip_sum)
{
/* undo NAT changes, if they have taken place */
if (nr != NULL) {
- PF_ACPY(pd->src, &pd->osrc, pd->af);
- PF_ACPY(pd->dst, &pd->odst, pd->af);
+ pf_addrcpy(pd->src, &pd->osrc, pd->af);
+ pf_addrcpy(pd->dst, &pd->odst, pd->af);
if (pd->sport)
*pd->sport = pd->osport;
if (pd->dport)
@@ -4573,7 +4579,7 @@ pf_match_port(u_int8_t op, u_int16_t a1, u_int16_t a2, u_int16_t p)
static int
pf_match_uid(u_int8_t op, uid_t a1, uid_t a2, uid_t u)
{
- if (u == UID_MAX && op != PF_OP_EQ && op != PF_OP_NE)
+ if (u == -1 && op != PF_OP_EQ && op != PF_OP_NE)
return (0);
return (pf_match(op, a1, a2, u));
}
@@ -4581,7 +4587,7 @@ pf_match_uid(u_int8_t op, uid_t a1, uid_t a2, uid_t u)
static int
pf_match_gid(u_int8_t op, gid_t a1, gid_t a2, gid_t g)
{
- if (g == GID_MAX && op != PF_OP_EQ && op != PF_OP_NE)
+ if (g == -1 && op != PF_OP_EQ && op != PF_OP_NE)
return (0);
return (pf_match(op, a1, a2, g));
}
@@ -4675,6 +4681,13 @@ pf_step_into_anchor(struct pf_test_ctx *ctx, struct pf_krule *r)
}
} else {
rv = pf_match_rule(ctx, &r->anchor->ruleset);
+ /*
+ * Unless errors occurred, stop iff any rule matched
+ * within quick anchors.
+ */
+ if (rv != PF_TEST_FAIL && r->quick == PF_TEST_QUICK &&
+ *ctx->am == r)
+ rv = PF_TEST_QUICK;
}
ctx->depth--;
@@ -4784,7 +4797,6 @@ pf_step_out_of_keth_anchor(struct pf_keth_anchor_stackframe *stack, int *depth,
return (quick);
}
-#ifdef INET6
void
pf_poolmask(struct pf_addr *naddr, struct pf_addr *raddr,
struct pf_addr *rmask, struct pf_addr *saddr, sa_family_t af)
@@ -4796,6 +4808,7 @@ pf_poolmask(struct pf_addr *naddr, struct pf_addr *raddr,
((rmask->addr32[0] ^ 0xffffffff ) & saddr->addr32[0]);
break;
#endif /* INET */
+#ifdef INET6
case AF_INET6:
naddr->addr32[0] = (raddr->addr32[0] & rmask->addr32[0]) |
((rmask->addr32[0] ^ 0xffffffff ) & saddr->addr32[0]);
@@ -4806,6 +4819,7 @@ pf_poolmask(struct pf_addr *naddr, struct pf_addr *raddr,
naddr->addr32[3] = (raddr->addr32[3] & rmask->addr32[3]) |
((rmask->addr32[3] ^ 0xffffffff ) & saddr->addr32[3]);
break;
+#endif /* INET6 */
}
}
@@ -4818,6 +4832,7 @@ pf_addr_inc(struct pf_addr *addr, sa_family_t af)
addr->addr32[0] = htonl(ntohl(addr->addr32[0]) + 1);
break;
#endif /* INET */
+#ifdef INET6
case AF_INET6:
if (addr->addr32[3] == 0xffffffff) {
addr->addr32[3] = 0;
@@ -4837,9 +4852,9 @@ pf_addr_inc(struct pf_addr *addr, sa_family_t af)
addr->addr32[3] =
htonl(ntohl(addr->addr32[3]) + 1);
break;
+#endif /* INET6 */
}
}
-#endif /* INET6 */
void
pf_rule_to_actions(struct pf_krule *r, struct pf_rule_actions *a)
@@ -4899,8 +4914,8 @@ pf_socket_lookup(struct pf_pdesc *pd)
struct inpcbinfo *pi;
struct inpcb *inp;
- pd->lookup.uid = UID_MAX;
- pd->lookup.gid = GID_MAX;
+ pd->lookup.uid = -1;
+ pd->lookup.gid = -1;
switch (pd->proto) {
case IPPROTO_TCP:
@@ -5738,8 +5753,8 @@ pf_test_rule(struct pf_krule **rm, struct pf_kstate **sm,
ctx.reason = *reason;
SLIST_INIT(&ctx.rules);
- PF_ACPY(&pd->nsaddr, pd->src, pd->af);
- PF_ACPY(&pd->ndaddr, pd->dst, pd->af);
+ pf_addrcpy(&pd->nsaddr, pd->src, pd->af);
+ pf_addrcpy(&pd->ndaddr, pd->dst, pd->af);
if (inp != NULL) {
INP_LOCK_ASSERT(inp);
@@ -5886,18 +5901,17 @@ pf_test_rule(struct pf_krule **rm, struct pf_kstate **sm,
M_SETFIB(pd->m, pd->act.rtableid);
if (r->rt) {
- struct pf_ksrc_node *sn = NULL;
- struct pf_srchash *snh = NULL;
/*
* Set act.rt here instead of in pf_rule_to_actions() because
* it is applied only from the last pass rule.
*/
pd->act.rt = r->rt;
- /* Don't use REASON_SET, pf_map_addr increases the reason counters */
- ctx.reason = pf_map_addr_sn(pd->af, r, pd->src, &pd->act.rt_addr,
- &pd->act.rt_kif, NULL, &sn, &snh, &(r->route), PF_SN_ROUTE);
- if (ctx.reason != 0)
+ if ((transerror = pf_map_addr_sn(pd->af, r, pd->src,
+ &pd->act.rt_addr, &pd->act.rt_kif, NULL, &(r->route),
+ PF_SN_ROUTE)) != PFRES_MATCH) {
+ REASON_SET(&ctx.reason, transerror);
goto cleanup;
+ }
}
if (pd->virtual_proto != PF_VPROTO_FRAGMENT &&
@@ -6041,9 +6055,16 @@ pf_create_state(struct pf_krule *r, struct pf_test_ctx *ctx,
/* src node for translation rule */
if (ctx->nr != NULL) {
KASSERT(ctx->nat_pool != NULL, ("%s: nat_pool is NULL", __func__));
+ /*
+ * The NAT addresses are chosen during ruleset parsing.
+ * The new afto code stores post-nat addresses in nsaddr.
+ * The old nat code (also used for new nat-to rules) creates
+ * state keys and stores addresses in them.
+ */
if ((ctx->nat_pool->opts & PF_POOL_STICKYADDR) &&
(sn_reason = pf_insert_src_node(sns, snhs, ctx->nr,
- &ctx->sk->addr[pd->sidx], pd->af, &ctx->nk->addr[1], NULL,
+ ctx->sk ? &(ctx->sk->addr[pd->sidx]) : pd->src, pd->af,
+ ctx->nk ? &(ctx->nk->addr[1]) : &(pd->nsaddr), NULL,
PF_SN_NAT)) != 0 ) {
REASON_SET(&ctx->reason, sn_reason);
goto csfailed;
@@ -6198,7 +6219,7 @@ pf_create_state(struct pf_krule *r, struct pf_test_ctx *ctx,
if (ctx->tag > 0)
s->tag = ctx->tag;
if (pd->proto == IPPROTO_TCP && (tcp_get_flags(th) & (TH_SYN|TH_ACK)) ==
- TH_SYN && r->keep_state == PF_STATE_SYNPROXY) {
+ TH_SYN && r->keep_state == PF_STATE_SYNPROXY && pd->dir == PF_IN) {
pf_set_protostate(s, PF_PEER_SRC, PF_TCPS_PROXY_SRC);
pf_undo_nat(ctx->nr, pd, bip_sum);
s->src.seqhi = arc4random();
@@ -6357,7 +6378,7 @@ pf_translate_compat(struct pf_test_ctx *ctx)
&nk->addr[pd->sidx], nk->port[pd->sidx]);
pd->sport = &th->th_sport;
pd->nsport = th->th_sport;
- PF_ACPY(&pd->nsaddr, pd->src, pd->af);
+ pf_addrcpy(&pd->nsaddr, pd->src, pd->af);
}
if (PF_ANEQ(&pd->ndaddr, &nk->addr[pd->didx], pd->af) ||
@@ -6366,7 +6387,7 @@ pf_translate_compat(struct pf_test_ctx *ctx)
&nk->addr[pd->didx], nk->port[pd->didx]);
pd->dport = &th->th_dport;
pd->ndport = th->th_dport;
- PF_ACPY(&pd->ndaddr, pd->dst, pd->af);
+ pf_addrcpy(&pd->ndaddr, pd->dst, pd->af);
}
rewrite++;
break;
@@ -6379,7 +6400,7 @@ pf_translate_compat(struct pf_test_ctx *ctx)
nk->port[pd->sidx]);
pd->sport = &pd->hdr.udp.uh_sport;
pd->nsport = pd->hdr.udp.uh_sport;
- PF_ACPY(&pd->nsaddr, pd->src, pd->af);
+ pf_addrcpy(&pd->nsaddr, pd->src, pd->af);
}
if (PF_ANEQ(&pd->ndaddr, &nk->addr[pd->didx], pd->af) ||
@@ -6390,7 +6411,7 @@ pf_translate_compat(struct pf_test_ctx *ctx)
nk->port[pd->didx]);
pd->dport = &pd->hdr.udp.uh_dport;
pd->ndport = pd->hdr.udp.uh_dport;
- PF_ACPY(&pd->ndaddr, pd->dst, pd->af);
+ pf_addrcpy(&pd->ndaddr, pd->dst, pd->af);
}
rewrite++;
break;
@@ -6403,7 +6424,7 @@ pf_translate_compat(struct pf_test_ctx *ctx)
nk->port[pd->sidx]);
pd->sport = &pd->hdr.sctp.src_port;
pd->nsport = pd->hdr.sctp.src_port;
- PF_ACPY(&pd->nsaddr, pd->src, pd->af);
+ pf_addrcpy(&pd->nsaddr, pd->src, pd->af);
}
if (PF_ANEQ(&pd->ndaddr, &nk->addr[pd->didx], pd->af) ||
nk->port[pd->didx] != pd->ndport) {
@@ -6413,7 +6434,7 @@ pf_translate_compat(struct pf_test_ctx *ctx)
nk->port[pd->didx]);
pd->dport = &pd->hdr.sctp.dest_port;
pd->ndport = pd->hdr.sctp.dest_port;
- PF_ACPY(&pd->ndaddr, pd->dst, pd->af);
+ pf_addrcpy(&pd->ndaddr, pd->dst, pd->af);
}
break;
}
@@ -6422,13 +6443,13 @@ pf_translate_compat(struct pf_test_ctx *ctx)
if (PF_ANEQ(&pd->nsaddr, &nk->addr[pd->sidx], AF_INET)) {
pf_change_a(&pd->src->v4.s_addr, pd->ip_sum,
nk->addr[pd->sidx].v4.s_addr, 0);
- PF_ACPY(&pd->nsaddr, pd->src, pd->af);
+ pf_addrcpy(&pd->nsaddr, pd->src, pd->af);
}
if (PF_ANEQ(&pd->ndaddr, &nk->addr[pd->didx], AF_INET)) {
pf_change_a(&pd->dst->v4.s_addr, pd->ip_sum,
nk->addr[pd->didx].v4.s_addr, 0);
- PF_ACPY(&pd->ndaddr, pd->dst, pd->af);
+ pf_addrcpy(&pd->ndaddr, pd->dst, pd->af);
}
if (ctx->virtual_type == htons(ICMP_ECHO) &&
@@ -6447,13 +6468,13 @@ pf_translate_compat(struct pf_test_ctx *ctx)
if (PF_ANEQ(&pd->nsaddr, &nk->addr[pd->sidx], AF_INET6)) {
pf_change_a6(pd->src, &pd->hdr.icmp6.icmp6_cksum,
&nk->addr[pd->sidx], 0);
- PF_ACPY(&pd->nsaddr, pd->src, pd->af);
+ pf_addrcpy(&pd->nsaddr, pd->src, pd->af);
}
if (PF_ANEQ(&pd->ndaddr, &nk->addr[pd->didx], AF_INET6)) {
pf_change_a6(pd->dst, &pd->hdr.icmp6.icmp6_cksum,
&nk->addr[pd->didx], 0);
- PF_ACPY(&pd->ndaddr, pd->dst, pd->af);
+ pf_addrcpy(&pd->ndaddr, pd->dst, pd->af);
}
rewrite++;
break;
@@ -6467,7 +6488,7 @@ pf_translate_compat(struct pf_test_ctx *ctx)
pf_change_a(&pd->src->v4.s_addr,
pd->ip_sum,
nk->addr[pd->sidx].v4.s_addr, 0);
- PF_ACPY(&pd->nsaddr, pd->src, pd->af);
+ pf_addrcpy(&pd->nsaddr, pd->src, pd->af);
}
if (PF_ANEQ(&pd->ndaddr,
@@ -6475,7 +6496,7 @@ pf_translate_compat(struct pf_test_ctx *ctx)
pf_change_a(&pd->dst->v4.s_addr,
pd->ip_sum,
nk->addr[pd->didx].v4.s_addr, 0);
- PF_ACPY(&pd->ndaddr, pd->dst, pd->af);
+ pf_addrcpy(&pd->ndaddr, pd->dst, pd->af);
}
break;
#endif /* INET */
@@ -6483,14 +6504,17 @@ pf_translate_compat(struct pf_test_ctx *ctx)
case AF_INET6:
if (PF_ANEQ(&pd->nsaddr,
&nk->addr[pd->sidx], AF_INET6)) {
- PF_ACPY(&pd->nsaddr, &nk->addr[pd->sidx], pd->af);
- PF_ACPY(pd->src, &nk->addr[pd->sidx], pd->af);
+ pf_addrcpy(&pd->nsaddr, &nk->addr[pd->sidx],
+ pd->af);
+ pf_addrcpy(pd->src, &nk->addr[pd->sidx], pd->af);
}
if (PF_ANEQ(&pd->ndaddr,
&nk->addr[pd->didx], AF_INET6)) {
- PF_ACPY(&pd->ndaddr, &nk->addr[pd->didx], pd->af);
- PF_ACPY(pd->dst, &nk->addr[pd->didx], pd->af);
+ pf_addrcpy(&pd->ndaddr, &nk->addr[pd->didx],
+ pd->af);
+ pf_addrcpy(pd->dst, &nk->addr[pd->didx],
+ pd->af);
}
break;
#endif /* INET6 */
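
[Note] The mechanical PF_ACPY() -> pf_addrcpy() conversion throughout these hunks replaces a macro with a function call. A self-contained sketch of the assumed semantics (unchanged from the macro): copy only the bytes the address family uses. The struct layout here is simplified, not the kernel's struct pf_addr.

#include <sys/socket.h>
#include <netinet/in.h>
#include <string.h>

/* Simplified stand-in for struct pf_addr. */
struct pf_addr_sketch {
	union {
		struct in_addr	v4;
		struct in6_addr	v6;
	};
};

static void
pf_addrcpy_sketch(struct pf_addr_sketch *dst,
    const struct pf_addr_sketch *src, sa_family_t af)
{
	switch (af) {
	case AF_INET:
		/* IPv4 copies never touch the upper 12 bytes. */
		memcpy(&dst->v4, &src->v4, sizeof(dst->v4));
		break;
	case AF_INET6:
		memcpy(&dst->v6, &src->v6, sizeof(dst->v6));
		break;
	}
}
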
@@ -7009,8 +7033,8 @@ pf_test_state(struct pf_kstate **state, struct pf_pdesc *pd, u_short *reason)
bzero(&key, sizeof(key));
key.af = pd->af;
key.proto = pd->virtual_proto;
- PF_ACPY(&key.addr[pd->sidx], pd->src, key.af);
- PF_ACPY(&key.addr[pd->didx], pd->dst, key.af);
+ pf_addrcpy(&key.addr[pd->sidx], pd->src, key.af);
+ pf_addrcpy(&key.addr[pd->didx], pd->dst, key.af);
key.port[pd->sidx] = pd->osport;
key.port[pd->didx] = pd->odport;
@@ -7201,8 +7225,8 @@ pf_test_state(struct pf_kstate **state, struct pf_pdesc *pd, u_short *reason)
}
if (afto) {
- PF_ACPY(&pd->nsaddr, &nk->addr[sidx], nk->af);
- PF_ACPY(&pd->ndaddr, &nk->addr[didx], nk->af);
+ pf_addrcpy(&pd->nsaddr, &nk->addr[sidx], nk->af);
+ pf_addrcpy(&pd->ndaddr, &nk->addr[didx], nk->af);
pd->naf = nk->af;
action = PF_AFRT;
}
@@ -7496,13 +7520,13 @@ again:
key.af = j->pd.af;
key.proto = IPPROTO_SCTP;
if (j->pd.dir == PF_IN) { /* wire side, straight */
- PF_ACPY(&key.addr[0], j->pd.src, key.af);
- PF_ACPY(&key.addr[1], j->pd.dst, key.af);
+ pf_addrcpy(&key.addr[0], j->pd.src, key.af);
+ pf_addrcpy(&key.addr[1], j->pd.dst, key.af);
key.port[0] = j->pd.hdr.sctp.src_port;
key.port[1] = j->pd.hdr.sctp.dest_port;
} else { /* stack side, reverse */
- PF_ACPY(&key.addr[1], j->pd.src, key.af);
- PF_ACPY(&key.addr[0], j->pd.dst, key.af);
+ pf_addrcpy(&key.addr[1], j->pd.src, key.af);
+ pf_addrcpy(&key.addr[0], j->pd.dst, key.af);
key.port[1] = j->pd.hdr.sctp.src_port;
key.port[0] = j->pd.hdr.sctp.dest_port;
}
@@ -7898,8 +7922,10 @@ pf_test_state_icmp(struct pf_kstate **state, struct pf_pdesc *pd,
#endif /* INET6 */
}
if (afto) {
- PF_ACPY(&pd->nsaddr, &nk->addr[sidx], nk->af);
- PF_ACPY(&pd->ndaddr, &nk->addr[didx], nk->af);
+ pf_addrcpy(&pd->nsaddr, &nk->addr[sidx],
+ nk->af);
+ pf_addrcpy(&pd->ndaddr, &nk->addr[didx],
+ nk->af);
pd->naf = nk->af;
return (PF_AFRT);
}
@@ -8031,8 +8057,8 @@ pf_test_state_icmp(struct pf_kstate **state, struct pf_pdesc *pd,
key.af = pd2.af;
key.proto = IPPROTO_TCP;
- PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
- PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
+ pf_addrcpy(&key.addr[pd2.sidx], pd2.src, key.af);
+ pf_addrcpy(&key.addr[pd2.didx], pd2.dst, key.af);
key.port[pd2.sidx] = th->th_sport;
key.port[pd2.didx] = th->th_dport;
@@ -8135,9 +8161,9 @@ pf_test_state_icmp(struct pf_kstate **state, struct pf_pdesc *pd,
&nk->addr[didx], pd->af,
nk->af))
return (PF_DROP);
- PF_ACPY(&pd->nsaddr, &nk->addr[pd2.sidx],
- nk->af);
- PF_ACPY(&pd->ndaddr,
+ pf_addrcpy(&pd->nsaddr,
+ &nk->addr[pd2.sidx], nk->af);
+ pf_addrcpy(&pd->ndaddr,
&nk->addr[pd2.didx], nk->af);
if (nk->af == AF_INET) {
pd->proto = IPPROTO_ICMP;
@@ -8226,8 +8252,8 @@ pf_test_state_icmp(struct pf_kstate **state, struct pf_pdesc *pd,
key.af = pd2.af;
key.proto = IPPROTO_UDP;
- PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
- PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
+ pf_addrcpy(&key.addr[pd2.sidx], pd2.src, key.af);
+ pf_addrcpy(&key.addr[pd2.didx], pd2.dst, key.af);
key.port[pd2.sidx] = uh->uh_sport;
key.port[pd2.didx] = uh->uh_dport;
@@ -8270,9 +8296,9 @@ pf_test_state_icmp(struct pf_kstate **state, struct pf_pdesc *pd,
&nk->addr[didx], pd->af,
nk->af))
return (PF_DROP);
- PF_ACPY(&pd->nsaddr,
+ pf_addrcpy(&pd->nsaddr,
&nk->addr[pd2.sidx], nk->af);
- PF_ACPY(&pd->ndaddr,
+ pf_addrcpy(&pd->ndaddr,
&nk->addr[pd2.didx], nk->af);
if (nk->af == AF_INET) {
pd->proto = IPPROTO_ICMP;
@@ -8358,8 +8384,8 @@ pf_test_state_icmp(struct pf_kstate **state, struct pf_pdesc *pd,
key.af = pd2.af;
key.proto = IPPROTO_SCTP;
- PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
- PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
+ pf_addrcpy(&key.addr[pd2.sidx], pd2.src, key.af);
+ pf_addrcpy(&key.addr[pd2.didx], pd2.dst, key.af);
key.port[pd2.sidx] = sh->src_port;
key.port[pd2.didx] = sh->dest_port;
@@ -8425,9 +8451,9 @@ pf_test_state_icmp(struct pf_kstate **state, struct pf_pdesc *pd,
sh->src_port = nk->port[sidx];
sh->dest_port = nk->port[didx];
m_copyback(pd2.m, pd2.off, sizeof(*sh), (c_caddr_t)sh);
- PF_ACPY(&pd->nsaddr,
+ pf_addrcpy(&pd->nsaddr,
&nk->addr[pd2.sidx], nk->af);
- PF_ACPY(&pd->ndaddr,
+ pf_addrcpy(&pd->ndaddr,
&nk->addr[pd2.didx], nk->af);
if (nk->af == AF_INET) {
pd->proto = IPPROTO_ICMP;
@@ -8568,9 +8594,9 @@ pf_test_state_icmp(struct pf_kstate **state, struct pf_pdesc *pd,
iih->icmp_id = nk->port[iidx];
m_copyback(pd2.m, pd2.off, ICMP_MINLEN,
(c_caddr_t)iih);
- PF_ACPY(&pd->nsaddr,
+ pf_addrcpy(&pd->nsaddr,
&nk->addr[pd2.sidx], nk->af);
- PF_ACPY(&pd->ndaddr,
+ pf_addrcpy(&pd->ndaddr,
&nk->addr[pd2.didx], nk->af);
/*
* IPv4 becomes IPv6 so we must copy
@@ -8696,9 +8722,9 @@ pf_test_state_icmp(struct pf_kstate **state, struct pf_pdesc *pd,
iih->icmp6_id = nk->port[iidx];
m_copyback(pd2.m, pd2.off,
sizeof(struct icmp6_hdr), (c_caddr_t)iih);
- PF_ACPY(&pd->nsaddr,
+ pf_addrcpy(&pd->nsaddr,
&nk->addr[pd2.sidx], nk->af);
- PF_ACPY(&pd->ndaddr,
+ pf_addrcpy(&pd->ndaddr,
&nk->addr[pd2.didx], nk->af);
pd->naf = nk->af;
return (PF_AFRT);
@@ -8740,8 +8766,8 @@ pf_test_state_icmp(struct pf_kstate **state, struct pf_pdesc *pd,
key.af = pd2.af;
key.proto = pd2.proto;
- PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
- PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
+ pf_addrcpy(&key.addr[pd2.sidx], pd2.src, key.af);
+ pf_addrcpy(&key.addr[pd2.didx], pd2.dst, key.af);
key.port[0] = key.port[1] = 0;
action = pf_find_state(&pd2, &key, state);
@@ -9042,6 +9068,9 @@ pf_route(struct pf_krule *r, struct ifnet *oifp,
goto bad;
}
+ if (r->rt == PF_DUPTO)
+ skip_test = true;
+
if (pd->dir == PF_IN && !skip_test) {
if (pf_test(AF_INET, PF_OUT, PFIL_FWD, ifp, &m0, inp,
&pd->act) != PF_PASS) {
@@ -9277,7 +9306,8 @@ pf_route6(struct pf_krule *r, struct ifnet *oifp,
bzero(&dst, sizeof(dst));
dst.sin6_family = AF_INET6;
dst.sin6_len = sizeof(dst);
- PF_ACPY((struct pf_addr *)&dst.sin6_addr, &pd->act.rt_addr, AF_INET6);
+ pf_addrcpy((struct pf_addr *)&dst.sin6_addr, &pd->act.rt_addr,
+ AF_INET6);
if (pd->dir == PF_IN) {
if (ip6->ip6_hlim <= IPV6_HLIMDEC) {
@@ -9343,6 +9373,9 @@ pf_route6(struct pf_krule *r, struct ifnet *oifp,
goto bad;
}
+ if (r->rt == PF_DUPTO)
+ skip_test = true;
+
if (pd->dir == PF_IN && !skip_test) {
if (pf_test(AF_INET6, PF_OUT, PFIL_FWD | PF_PFIL_NOREFRAGMENT,
ifp, &m0, inp, &pd->act) != PF_PASS) {
@@ -10031,6 +10064,8 @@ pf_setup_pdesc(sa_family_t af, int dir, struct pf_pdesc *pd, struct mbuf **m0,
pd->didx = (dir == PF_IN) ? 1 : 0;
pd->af = pd->naf = af;
+ PF_RULES_ASSERT();
+
TAILQ_INIT(&pd->sctp_multihome_jobs);
if (default_actions != NULL)
memcpy(&pd->act, default_actions, sizeof(pd->act));
@@ -10077,8 +10112,8 @@ pf_setup_pdesc(sa_family_t af, int dir, struct pf_pdesc *pd, struct mbuf **m0,
pd->src = (struct pf_addr *)&h->ip_src;
pd->dst = (struct pf_addr *)&h->ip_dst;
- PF_ACPY(&pd->osrc, pd->src, af);
- PF_ACPY(&pd->odst, pd->dst, af);
+ pf_addrcpy(&pd->osrc, pd->src, af);
+ pf_addrcpy(&pd->odst, pd->dst, af);
pd->ip_sum = &h->ip_sum;
pd->tos = h->ip_tos & ~IPTOS_ECN_MASK;
pd->ttl = h->ip_ttl;
@@ -10106,6 +10141,12 @@ pf_setup_pdesc(sa_family_t af, int dir, struct pf_pdesc *pd, struct mbuf **m0,
}
h = mtod(pd->m, struct ip6_hdr *);
+ if (pd->m->m_pkthdr.len <
+ sizeof(struct ip6_hdr) + ntohs(h->ip6_plen)) {
+ *action = PF_DROP;
+ REASON_SET(reason, PFRES_SHORT);
+ return (-1);
+ }
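
[Note] The added block rejects IPv6 packets whose fixed header claims more payload than the mbuf chain actually carries, before pf_walk_header6() starts parsing extension headers. A standalone sketch of the same comparison:

#include <sys/types.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <netinet/ip6.h>
#include <stdbool.h>
#include <stddef.h>

/*
 * ip6_plen counts payload bytes after the fixed header, so the
 * packet must hold at least sizeof(ip6_hdr) + plen bytes or it
 * is dropped as short (PFRES_SHORT).
 */
static bool
ip6_len_plausible_sketch(const struct ip6_hdr *h, size_t pkthdr_len)
{
	return (pkthdr_len >= sizeof(struct ip6_hdr) + ntohs(h->ip6_plen));
}
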
if (pf_walk_header6(pd, h, reason) != PF_PASS) {
*action = PF_DROP;
@@ -10115,8 +10156,8 @@ pf_setup_pdesc(sa_family_t af, int dir, struct pf_pdesc *pd, struct mbuf **m0,
h = mtod(pd->m, struct ip6_hdr *);
pd->src = (struct pf_addr *)&h->ip6_src;
pd->dst = (struct pf_addr *)&h->ip6_dst;
- PF_ACPY(&pd->osrc, pd->src, af);
- PF_ACPY(&pd->odst, pd->dst, af);
+ pf_addrcpy(&pd->osrc, pd->src, af);
+ pf_addrcpy(&pd->odst, pd->dst, af);
pd->ip_sum = NULL;
pd->tos = IPV6_DSCP(h);
pd->ttl = h->ip6_hlim;
@@ -10444,35 +10485,30 @@ pf_test(sa_family_t af, int dir, int pflags, struct ifnet *ifp, struct mbuf **m0
PF_RULES_RLOCK_TRACKER;
KASSERT(dir == PF_IN || dir == PF_OUT, ("%s: bad direction %d\n", __func__, dir));
M_ASSERTPKTHDR(*m0);
+ NET_EPOCH_ASSERT();
if (!V_pf_status.running)
return (PF_PASS);
- PF_RULES_RLOCK();
-
kif = (struct pfi_kkif *)ifp->if_pf_kif;
if (__predict_false(kif == NULL)) {
DPFPRINTF(PF_DEBUG_URGENT,
("%s: kif == NULL, if_xname %s\n",
__func__, ifp->if_xname));
- PF_RULES_RUNLOCK();
return (PF_DROP);
}
if (kif->pfik_flags & PFI_IFLAG_SKIP) {
- PF_RULES_RUNLOCK();
return (PF_PASS);
}
if ((*m0)->m_flags & M_SKIP_FIREWALL) {
- PF_RULES_RUNLOCK();
return (PF_PASS);
}
if (__predict_false(! M_WRITABLE(*m0))) {
*m0 = m_unshare(*m0, M_NOWAIT);
if (*m0 == NULL) {
- PF_RULES_RUNLOCK();
return (PF_DROP);
}
}
@@ -10485,12 +10521,10 @@ pf_test(sa_family_t af, int dir, int pflags, struct ifnet *ifp, struct mbuf **m0
ifp = ifnet_byindexgen(pd.pf_mtag->if_index,
pd.pf_mtag->if_idxgen);
if (ifp == NULL || ifp->if_flags & IFF_DYING) {
- PF_RULES_RUNLOCK();
m_freem(*m0);
*m0 = NULL;
return (PF_PASS);
}
- PF_RULES_RUNLOCK();
(ifp->if_output)(ifp, *m0, sintosa(&pd.pf_mtag->dst), NULL);
*m0 = NULL;
return (PF_PASS);
@@ -10505,11 +10539,12 @@ pf_test(sa_family_t af, int dir, int pflags, struct ifnet *ifp, struct mbuf **m0
/* But only once. We may see the packet multiple times (e.g.
* PFIL_IN/PFIL_OUT). */
pf_dummynet_flag_remove(pd.m, pd.pf_mtag);
- PF_RULES_RUNLOCK();
return (PF_PASS);
}
+ PF_RULES_RLOCK();
+
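
[Note] Taken together, the pf_test() hunks narrow the scope of the rules read lock: the function now asserts it runs inside a net epoch section, handles its early returns (firewall disabled, M_SKIP_FIREWALL, skipped interfaces, dummynet re-entry) lock-free, and acquires PF_RULES_RLOCK() only here, just before classification begins. A schematic of the control-flow change, using hypothetical stand-ins for the lock primitives and exit tests:

static void rules_rlock_sketch(void) { }
static void rules_runlock_sketch(void) { }
static int fast_path_exit_sketch(void) { return (0); }

/* Before: the read lock bracketed everything, so every early
 * return had to remember to drop it. */
static int
pf_test_shape_old(void)
{
	rules_rlock_sketch();
	if (fast_path_exit_sketch()) {
		rules_runlock_sketch();
		return (0);
	}
	/* ... packet classification under the lock ... */
	rules_runlock_sketch();
	return (1);
}

/* After: early exits run lock-free under the epoch guarantee and
 * the lock is taken once, right before pf_setup_pdesc(). */
static int
pf_test_shape_new(void)
{
	if (fast_path_exit_sketch())
		return (0);
	rules_rlock_sketch();
	/* ... packet classification under the lock ... */
	rules_runlock_sketch();
	return (1);
}
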
if (pf_setup_pdesc(af, dir, &pd, m0, &action, &reason,
kif, default_actions) == -1) {
if (action != PF_PASS)
diff --git a/sys/netpfil/pf/pf.h b/sys/netpfil/pf/pf.h
index 2009d2907985..cfff58064922 100644
--- a/sys/netpfil/pf/pf.h
+++ b/sys/netpfil/pf/pf.h
@@ -140,7 +140,7 @@ enum { PF_ADDR_ADDRMASK, PF_ADDR_NOROUTE, PF_ADDR_DYNIFTL,
#define PF_LOG 0x01
#define PF_LOG_ALL 0x02
-#define PF_LOG_SOCKET_LOOKUP 0x04
+#define PF_LOG_USER 0x04
#define PF_LOG_FORCE 0x08
#define PF_LOG_MATCHES 0x10
@@ -490,6 +490,7 @@ struct pf_osfp_ioctl {
#define PF_ANCHOR_NAME_SIZE 64
#define PF_ANCHOR_MAXPATH (MAXPATHLEN - PF_ANCHOR_NAME_SIZE - 1)
+#define PF_OPTIMIZER_TABLE_PFX "__automatic_"
struct pf_rule {
struct pf_rule_addr src;
diff --git a/sys/netpfil/pf/pf_if.c b/sys/netpfil/pf/pf_if.c
index 389b74d09d37..e2200c15c704 100644
--- a/sys/netpfil/pf/pf_if.c
+++ b/sys/netpfil/pf/pf_if.c
@@ -522,7 +522,7 @@ pfi_match_addr(struct pfi_dynaddr *dyn, struct pf_addr *a, sa_family_t af)
case 0:
return (0);
case 1:
- return (PF_MATCHA(0, &dyn->pfid_addr4,
+ return (pf_match_addr(0, &dyn->pfid_addr4,
&dyn->pfid_mask4, a, AF_INET));
default:
return (pfr_match_addr(dyn->pfid_kt, a, AF_INET));
@@ -535,7 +535,7 @@ pfi_match_addr(struct pfi_dynaddr *dyn, struct pf_addr *a, sa_family_t af)
case 0:
return (0);
case 1:
- return (PF_MATCHA(0, &dyn->pfid_addr6,
+ return (pf_match_addr(0, &dyn->pfid_addr6,
&dyn->pfid_mask6, a, AF_INET6));
default:
return (pfr_match_addr(dyn->pfid_kt, a, AF_INET6));
diff --git a/sys/netpfil/pf/pf_ioctl.c b/sys/netpfil/pf/pf_ioctl.c
index 05a7e1311ad8..5c69c395c5fc 100644
--- a/sys/netpfil/pf/pf_ioctl.c
+++ b/sys/netpfil/pf/pf_ioctl.c
@@ -615,7 +615,7 @@ pf_free_rule(struct pf_krule *rule)
pfi_kkif_unref(rule->kif);
if (rule->rcv_kif)
pfi_kkif_unref(rule->rcv_kif);
- pf_kanchor_remove(rule);
+ pf_remove_kanchor(rule);
pf_empty_kpool(&rule->rdr.list);
pf_empty_kpool(&rule->nat.list);
pf_empty_kpool(&rule->route.list);
@@ -1274,7 +1274,9 @@ pf_hash_rule_addr(MD5_CTX *ctx, struct pf_rule_addr *pfr)
PF_MD5_UPD(pfr, addr.iflags);
break;
case PF_ADDR_TABLE:
- PF_MD5_UPD(pfr, addr.v.tblname);
+ if (strncmp(pfr->addr.v.tblname, PF_OPTIMIZER_TABLE_PFX,
+ strlen(PF_OPTIMIZER_TABLE_PFX)))
+ PF_MD5_UPD(pfr, addr.v.tblname);
break;
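
[Note] With PF_OPTIMIZER_TABLE_PFX defined in pf.h, table names generated by the ruleset optimizer are left out of the rule MD5 hash, so reloading an optimized ruleset does not spuriously change the checksum used for pfsync matching. A sketch of the prefix test (strncmp() returns 0 on a match, hence the table is hashed only when the prefix does not match):

#include <stdbool.h>
#include <string.h>

#define OPTIMIZER_PFX_SKETCH "__automatic_"

/* True when the table name was generated by the optimizer and
 * should therefore be excluded from the ruleset checksum. */
static bool
is_optimizer_table_sketch(const char *tblname)
{
	return (strncmp(tblname, OPTIMIZER_PFX_SKETCH,
	    strlen(OPTIMIZER_PFX_SKETCH)) == 0);
}
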
case PF_ADDR_ADDRMASK:
/* XXX ignore af? */
@@ -1357,7 +1359,7 @@ static int
pf_commit_rules(u_int32_t ticket, int rs_num, char *anchor)
{
struct pf_kruleset *rs;
- struct pf_krule *rule, **old_array, *old_rule;
+ struct pf_krule *rule, *old_rule;
struct pf_krulequeue *old_rules;
struct pf_krule_global *old_tree;
int error;
@@ -1382,13 +1384,10 @@ pf_commit_rules(u_int32_t ticket, int rs_num, char *anchor)
/* Swap rules, keep the old. */
old_rules = rs->rules[rs_num].active.ptr;
old_rcount = rs->rules[rs_num].active.rcount;
- old_array = rs->rules[rs_num].active.ptr_array;
old_tree = rs->rules[rs_num].active.tree;
rs->rules[rs_num].active.ptr =
rs->rules[rs_num].inactive.ptr;
- rs->rules[rs_num].active.ptr_array =
- rs->rules[rs_num].inactive.ptr_array;
rs->rules[rs_num].active.tree =
rs->rules[rs_num].inactive.tree;
rs->rules[rs_num].active.rcount =
@@ -1418,7 +1417,6 @@ pf_commit_rules(u_int32_t ticket, int rs_num, char *anchor)
}
rs->rules[rs_num].inactive.ptr = old_rules;
- rs->rules[rs_num].inactive.ptr_array = old_array;
rs->rules[rs_num].inactive.tree = NULL; /* important for pf_ioctl_addrule */
rs->rules[rs_num].inactive.rcount = old_rcount;
@@ -1431,9 +1429,6 @@ pf_commit_rules(u_int32_t ticket, int rs_num, char *anchor)
while ((rule = TAILQ_FIRST(old_rules)) != NULL)
pf_unlink_rule_locked(old_rules, rule);
PF_UNLNKDRULES_UNLOCK();
- if (rs->rules[rs_num].inactive.ptr_array)
- free(rs->rules[rs_num].inactive.ptr_array, M_TEMP);
- rs->rules[rs_num].inactive.ptr_array = NULL;
rs->rules[rs_num].inactive.rcount = 0;
rs->rules[rs_num].inactive.open = 0;
pf_remove_if_empty_kruleset(rs);
@@ -1456,24 +1451,11 @@ pf_setup_pfsync_matching(struct pf_kruleset *rs)
if (rs_cnt == PF_RULESET_SCRUB)
continue;
- if (rs->rules[rs_cnt].inactive.ptr_array)
- free(rs->rules[rs_cnt].inactive.ptr_array, M_TEMP);
- rs->rules[rs_cnt].inactive.ptr_array = NULL;
-
if (rs->rules[rs_cnt].inactive.rcount) {
- rs->rules[rs_cnt].inactive.ptr_array =
- mallocarray(rs->rules[rs_cnt].inactive.rcount,
- sizeof(struct pf_rule **),
- M_TEMP, M_NOWAIT);
-
- if (!rs->rules[rs_cnt].inactive.ptr_array)
- return (ENOMEM);
- }
-
- TAILQ_FOREACH(rule, rs->rules[rs_cnt].inactive.ptr,
- entries) {
- pf_hash_rule_rolling(&ctx, rule);
- (rs->rules[rs_cnt].inactive.ptr_array)[rule->nr] = rule;
+ TAILQ_FOREACH(rule, rs->rules[rs_cnt].inactive.ptr,
+ entries) {
+ pf_hash_rule_rolling(&ctx, rule);
+ }
}
}
@@ -2059,6 +2041,47 @@ pf_ioctl_getrules(struct pfioc_rule *pr)
return (0);
}
+static int
+pf_rule_checkaf(struct pf_krule *r)
+{
+ switch (r->af) {
+ case 0:
+ if (r->rule_flag & PFRULE_AFTO)
+ return (EPFNOSUPPORT);
+ break;
+ case AF_INET:
+ if ((r->rule_flag & PFRULE_AFTO) && r->naf != AF_INET6)
+ return (EPFNOSUPPORT);
+ break;
+#ifdef INET6
+ case AF_INET6:
+ if ((r->rule_flag & PFRULE_AFTO) && r->naf != AF_INET)
+ return (EPFNOSUPPORT);
+ break;
+#endif /* INET6 */
+ default:
+ return (EPFNOSUPPORT);
+ }
+
+ if ((r->rule_flag & PFRULE_AFTO) == 0 && r->naf != 0)
+ return (EPFNOSUPPORT);
+
+ return (0);
+}
+
+static int
+pf_validate_range(uint8_t op, uint16_t port[2])
+{
+ uint16_t a = ntohs(port[0]);
+ uint16_t b = ntohs(port[1]);
+
+ if ((op == PF_OP_RRG && a > b) || /* 34:12, i.e. none */
+ (op == PF_OP_IRG && a >= b) || /* 34><12, i.e. none */
+ (op == PF_OP_XRG && a > b)) /* 34<>22, i.e. all */
+ return (1);
+ return (0);
+}
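
[Note] pf_validate_range() rejects port ranges whose bounds are inverted, so a rule that could never match (or, for <>, would match everything) fails at insertion time with EINVAL rather than loading silently. Ports arrive in network byte order, hence the ntohs() before comparing. A worked standalone example of the same checks; the OP_* values are placeholders, not pf.h's PF_OP_* constants:

#include <arpa/inet.h>
#include <assert.h>
#include <stdint.h>

enum { OP_RRG_SKETCH, OP_IRG_SKETCH, OP_XRG_SKETCH };	/* :, ><, <> */

static int
validate_range_sketch(uint8_t op, uint16_t port[2])
{
	uint16_t a = ntohs(port[0]);
	uint16_t b = ntohs(port[1]);

	if ((op == OP_RRG_SKETCH && a > b) ||	/* 34:12 matches nothing */
	    (op == OP_IRG_SKETCH && a >= b) ||	/* 34><12 matches nothing */
	    (op == OP_XRG_SKETCH && a > b))	/* inverted <> matches all */
		return (1);
	return (0);
}

int
main(void)
{
	uint16_t bad[2] = { htons(34), htons(12) };
	uint16_t good[2] = { htons(12), htons(34) };

	assert(validate_range_sketch(OP_RRG_SKETCH, bad) == 1);
	assert(validate_range_sketch(OP_RRG_SKETCH, good) == 0);
	return (0);
}
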
+
int
pf_ioctl_addrule(struct pf_krule *rule, uint32_t ticket,
uint32_t pool_ticket, const char *anchor, const char *anchor_call,
@@ -2078,6 +2101,13 @@ pf_ioctl_addrule(struct pf_krule *rule, uint32_t ticket,
#define ERROUT(x) ERROUT_FUNCTION(errout, x)
+ if ((error = pf_rule_checkaf(rule)))
+ ERROUT(error);
+ if (pf_validate_range(rule->src.port_op, rule->src.port))
+ ERROUT(EINVAL);
+ if (pf_validate_range(rule->dst.port_op, rule->dst.port))
+ ERROUT(EINVAL);
+
if (rule->ifname[0])
kif = pf_kkif_create(M_WAITOK);
if (rule->rcv_ifname[0])
@@ -2155,51 +2185,51 @@ pf_ioctl_addrule(struct pf_krule *rule, uint32_t ticket,
rule->rcv_kif = NULL;
if (rule->rtableid > 0 && rule->rtableid >= rt_numfibs)
- error = EBUSY;
+ ERROUT(EBUSY);
#ifdef ALTQ
/* set queue IDs */
if (rule->qname[0] != 0) {
if ((rule->qid = pf_qname2qid(rule->qname)) == 0)
- error = EBUSY;
+ ERROUT(EBUSY);
else if (rule->pqname[0] != 0) {
if ((rule->pqid =
pf_qname2qid(rule->pqname)) == 0)
- error = EBUSY;
+ ERROUT(EBUSY);
} else
rule->pqid = rule->qid;
}
#endif
if (rule->tagname[0])
if ((rule->tag = pf_tagname2tag(rule->tagname)) == 0)
- error = EBUSY;
+ ERROUT(EBUSY);
if (rule->match_tagname[0])
if ((rule->match_tag =
pf_tagname2tag(rule->match_tagname)) == 0)
- error = EBUSY;
+ ERROUT(EBUSY);
if (rule->rt && !rule->direction)
- error = EINVAL;
+ ERROUT(EINVAL);
if (!rule->log)
rule->logif = 0;
if (! pf_init_threshold(&rule->pktrate, rule->pktrate.limit,
rule->pktrate.seconds))
- error = ENOMEM;
+ ERROUT(ENOMEM);
if (pf_addr_setup(ruleset, &rule->src.addr, rule->af))
- error = ENOMEM;
+ ERROUT(ENOMEM);
if (pf_addr_setup(ruleset, &rule->dst.addr, rule->af))
- error = ENOMEM;
+ ERROUT(ENOMEM);
if (pf_kanchor_setup(rule, ruleset, anchor_call))
- error = EINVAL;
+ ERROUT(EINVAL);
if (rule->scrub_flags & PFSTATE_SETPRIO &&
(rule->set_prio[0] > PF_PRIO_MAX ||
rule->set_prio[1] > PF_PRIO_MAX))
- error = EINVAL;
+ ERROUT(EINVAL);
for (int i = 0; i < 3; i++) {
TAILQ_FOREACH(pa, &V_pf_pabuf[i], entries)
if (pa->addr.type == PF_ADDR_TABLE) {
pa->addr.p.tbl = pfr_attach_table(ruleset,
pa->addr.v.tblname);
if (pa->addr.p.tbl == NULL)
- error = ENOMEM;
+ ERROUT(ENOMEM);
}
}
@@ -2207,7 +2237,7 @@ pf_ioctl_addrule(struct pf_krule *rule, uint32_t ticket,
if (rule->overload_tblname[0]) {
if ((rule->overload_tbl = pfr_attach_table(ruleset,
rule->overload_tblname)) == NULL)
- error = EINVAL;
+ ERROUT(EINVAL);
else
rule->overload_tbl->pfrkt_flags |=
PFR_TFLAG_ACTIVE;
@@ -2230,23 +2260,19 @@ pf_ioctl_addrule(struct pf_krule *rule, uint32_t ticket,
if (((rule->action == PF_NAT) || (rule->action == PF_RDR) ||
(rule->action == PF_BINAT)) && rule->anchor == NULL &&
TAILQ_FIRST(&rule->rdr.list) == NULL) {
- error = EINVAL;
+ ERROUT(EINVAL);
}
if (rule->rt > PF_NOPFROUTE && (TAILQ_FIRST(&rule->route.list) == NULL)) {
- error = EINVAL;
+ ERROUT(EINVAL);
}
if (rule->action == PF_PASS && (rule->rdr.opts & PF_POOL_STICKYADDR ||
rule->nat.opts & PF_POOL_STICKYADDR) && !rule->keep_state) {
- error = EINVAL;
+ ERROUT(EINVAL);
}
- if (error) {
- pf_free_rule(rule);
- rule = NULL;
- ERROUT(error);
- }
+ MPASS(error == 0);
rule->nat.cur = TAILQ_FIRST(&rule->nat.list);
rule->rdr.cur = TAILQ_FIRST(&rule->rdr.list);
@@ -2350,15 +2376,17 @@ relock_DIOCKILLSTATES:
if (psk->psk_proto && psk->psk_proto != sk->proto)
continue;
- if (! PF_MATCHA(psk->psk_src.neg, &psk->psk_src.addr.v.a.addr,
+ if (! pf_match_addr(psk->psk_src.neg,
+ &psk->psk_src.addr.v.a.addr,
&psk->psk_src.addr.v.a.mask, srcaddr, sk->af))
continue;
- if (! PF_MATCHA(psk->psk_dst.neg, &psk->psk_dst.addr.v.a.addr,
+ if (! pf_match_addr(psk->psk_dst.neg,
+ &psk->psk_dst.addr.v.a.addr,
&psk->psk_dst.addr.v.a.mask, dstaddr, sk->af))
continue;
- if (! PF_MATCHA(psk->psk_rt_addr.neg,
+ if (! pf_match_addr(psk->psk_rt_addr.neg,
&psk->psk_rt_addr.addr.v.a.addr,
&psk->psk_rt_addr.addr.v.a.mask,
&s->act.rt_addr, sk->af))
@@ -2398,10 +2426,10 @@ relock_DIOCKILLSTATES:
match_key.af = s->key[idx]->af;
match_key.proto = s->key[idx]->proto;
- PF_ACPY(&match_key.addr[0],
+ pf_addrcpy(&match_key.addr[0],
&s->key[idx]->addr[1], match_key.af);
match_key.port[0] = s->key[idx]->port[1];
- PF_ACPY(&match_key.addr[1],
+ pf_addrcpy(&match_key.addr[1],
&s->key[idx]->addr[0], match_key.af);
match_key.port[1] = s->key[idx]->port[0];
}
@@ -2697,7 +2725,7 @@ pf_ioctl_get_addr(struct pf_nl_pooladdr *pp)
PF_RULES_RLOCK_TRACKER;
- pp->anchor[sizeof(pp->anchor) - 1] = 0;
+ pp->anchor[sizeof(pp->anchor) - 1] = '\0';
PF_RULES_RLOCK();
pool = pf_get_kpool(pp->anchor, pp->ticket, pp->r_action,
@@ -2730,7 +2758,7 @@ pf_ioctl_get_rulesets(struct pfioc_ruleset *pr)
PF_RULES_RLOCK_TRACKER;
- pr->path[sizeof(pr->path) - 1] = 0;
+ pr->path[sizeof(pr->path) - 1] = '\0';
PF_RULES_RLOCK();
if ((ruleset = pf_find_kruleset(pr->path)) == NULL) {
@@ -2738,7 +2766,7 @@ pf_ioctl_get_rulesets(struct pfioc_ruleset *pr)
return (ENOENT);
}
pr->nr = 0;
- if (ruleset->anchor == NULL) {
+ if (ruleset == &pf_main_ruleset) {
/* XXX kludge for pf_main_ruleset */
RB_FOREACH(anchor, pf_kanchor_global, &V_pf_anchors)
if (anchor->parent == NULL)
@@ -2769,8 +2797,8 @@ pf_ioctl_get_ruleset(struct pfioc_ruleset *pr)
return (ENOENT);
}
- pr->name[0] = 0;
- if (ruleset->anchor == NULL) {
+ pr->name[0] = '\0';
+ if (ruleset == &pf_main_ruleset) {
/* XXX kludge for pf_main_ruleset */
RB_FOREACH(anchor, pf_kanchor_global, &V_pf_anchors)
if (anchor->parent == NULL && nr++ == pr->nr) {
@@ -2794,6 +2822,78 @@ pf_ioctl_get_ruleset(struct pfioc_ruleset *pr)
return (error);
}
+int
+pf_ioctl_natlook(struct pfioc_natlook *pnl)
+{
+ struct pf_state_key *sk;
+ struct pf_kstate *state;
+ struct pf_state_key_cmp key;
+ int m = 0, direction = pnl->direction;
+ int sidx, didx;
+
+ /* NATLOOK src and dst are reversed, so reverse sidx/didx */
+ sidx = (direction == PF_IN) ? 1 : 0;
+ didx = (direction == PF_IN) ? 0 : 1;
+
+ if (!pnl->proto ||
+ PF_AZERO(&pnl->saddr, pnl->af) ||
+ PF_AZERO(&pnl->daddr, pnl->af) ||
+ ((pnl->proto == IPPROTO_TCP ||
+ pnl->proto == IPPROTO_UDP) &&
+ (!pnl->dport || !pnl->sport)))
+ return (EINVAL);
+
+ switch (pnl->direction) {
+ case PF_IN:
+ case PF_OUT:
+ case PF_INOUT:
+ break;
+ default:
+ return (EINVAL);
+ }
+
+ switch (pnl->af) {
+#ifdef INET
+ case AF_INET:
+ break;
+#endif /* INET */
+#ifdef INET6
+ case AF_INET6:
+ break;
+#endif /* INET6 */
+ default:
+ return (EAFNOSUPPORT);
+ }
+
+ bzero(&key, sizeof(key));
+ key.af = pnl->af;
+ key.proto = pnl->proto;
+ pf_addrcpy(&key.addr[sidx], &pnl->saddr, pnl->af);
+ key.port[sidx] = pnl->sport;
+ pf_addrcpy(&key.addr[didx], &pnl->daddr, pnl->af);
+ key.port[didx] = pnl->dport;
+
+ state = pf_find_state_all(&key, direction, &m);
+ if (state == NULL)
+ return (ENOENT);
+
+ if (m > 1) {
+ PF_STATE_UNLOCK(state);
+ return (E2BIG); /* more than one state */
+ }
+
+ sk = state->key[sidx];
+ pf_addrcpy(&pnl->rsaddr,
+ &sk->addr[sidx], sk->af);
+ pnl->rsport = sk->port[sidx];
+ pf_addrcpy(&pnl->rdaddr,
+ &sk->addr[didx], sk->af);
+ pnl->rdport = sk->port[didx];
+ PF_STATE_UNLOCK(state);
+
+ return (0);
+}
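
[Note] pf_ioctl_natlook() now carries the DIOCNATLOOK logic that previously lived inline in pfioctl(), adding explicit direction and address-family validation up front. A hedged userspace sketch of issuing such a lookup against /dev/pf; field names follow struct pfioc_natlook as documented in pf(4), error handling is trimmed, and the addresses are examples only:

#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <net/pfvar.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	struct pfioc_natlook pnl;
	int dev;

	if ((dev = open("/dev/pf", O_RDONLY)) == -1)
		return (1);

	memset(&pnl, 0, sizeof(pnl));
	pnl.af = AF_INET;
	pnl.proto = IPPROTO_TCP;
	pnl.direction = PF_OUT;
	inet_pton(AF_INET, "192.0.2.10", &pnl.saddr.v4);
	inet_pton(AF_INET, "198.51.100.1", &pnl.daddr.v4);
	pnl.sport = htons(50000);
	pnl.dport = htons(443);

	/* On success the r* fields hold the translated endpoints. */
	if (ioctl(dev, DIOCNATLOOK, &pnl) == 0)
		printf("translated source port: %u\n", ntohs(pnl.rsport));
	close(dev);
	return (0);
}
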
+
static int
pfioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags, struct thread *td)
{
@@ -3497,10 +3597,10 @@ DIOCADDRULENV_error:
error = pf_rule_to_krule(&pr->rule, rule);
if (error != 0) {
pf_krule_free(rule);
- break;
+ goto fail;
}
- pr->anchor[sizeof(pr->anchor) - 1] = 0;
+ pr->anchor[sizeof(pr->anchor) - 1] = '\0';
/* Frees rule on error */
error = pf_ioctl_addrule(rule, pr->ticket, pr->pool_ticket,
@@ -3512,7 +3612,7 @@ DIOCADDRULENV_error:
case DIOCGETRULES: {
struct pfioc_rule *pr = (struct pfioc_rule *)addr;
- pr->anchor[sizeof(pr->anchor) - 1] = 0;
+ pr->anchor[sizeof(pr->anchor) - 1] = '\0';
error = pf_ioctl_getrules(pr);
@@ -3651,16 +3751,16 @@ DIOCGETRULENV_error:
u_int32_t nr = 0;
int rs_num;
- pcr->anchor[sizeof(pcr->anchor) - 1] = 0;
+ pcr->anchor[sizeof(pcr->anchor) - 1] = '\0';
if (pcr->action < PF_CHANGE_ADD_HEAD ||
pcr->action > PF_CHANGE_GET_TICKET) {
error = EINVAL;
- break;
+ goto fail;
}
if (pcr->rule.return_icmp >> 8 > ICMP_MAXTYPE) {
error = EINVAL;
- break;
+ goto fail;
}
if (pcr->action != PF_CHANGE_REMOVE) {
@@ -3668,9 +3768,13 @@ DIOCGETRULENV_error:
error = pf_rule_to_krule(&pcr->rule, newrule);
if (error != 0) {
pf_krule_free(newrule);
- break;
+ goto fail;
}
+ if ((error = pf_rule_checkaf(newrule))) {
+ pf_krule_free(newrule);
+ goto fail;
+ }
if (newrule->ifname[0])
kif = pf_kkif_create(M_WAITOK);
pf_counter_u64_init(&newrule->evaluations, M_WAITOK);
@@ -3818,7 +3922,7 @@ DIOCGETRULENV_error:
pf_free_rule(newrule);
PF_RULES_WUNLOCK();
PF_CONFIG_UNLOCK();
- break;
+ goto fail;
}
newrule->nat.cur = TAILQ_FIRST(&newrule->nat.list);
@@ -3845,7 +3949,7 @@ DIOCGETRULENV_error:
PF_RULES_WUNLOCK();
PF_CONFIG_UNLOCK();
error = EINVAL;
- break;
+ goto fail;
}
}
@@ -3863,7 +3967,7 @@ DIOCGETRULENV_error:
PF_RULES_WUNLOCK();
PF_CONFIG_UNLOCK();
error = EEXIST;
- break;
+ goto fail;
}
if (oldrule == NULL)
@@ -3919,7 +4023,7 @@ DIOCCHANGERULE_error:
if (sp->timeout >= PFTM_MAX) {
error = EINVAL;
- break;
+ goto fail;
}
if (V_pfsync_state_import_ptr != NULL) {
PF_RULES_RLOCK();
@@ -3939,7 +4043,7 @@ DIOCCHANGERULE_error:
s = pf_find_state_byid(ps->state.id, ps->state.creatorid);
if (s == NULL) {
error = ENOENT;
- break;
+ goto fail;
}
pfsync_state_export((union pfsync_state_union*)&ps->state,
@@ -4018,7 +4122,7 @@ DIOCGETSTATES_retry:
error = copyout(pstore, out,
sizeof(struct pfsync_state_1301) * count);
if (error)
- break;
+ goto fail;
out = ps->ps_states + nr;
}
DIOCGETSTATES_full:
@@ -4038,7 +4142,7 @@ DIOCGETSTATES_full:
if (ps->ps_req_version > PF_STATE_VERSION) {
error = ENOTSUP;
- break;
+ goto fail;
}
if (ps->ps_len <= 0) {
@@ -4096,7 +4200,7 @@ DIOCGETSTATESV2_retry:
error = copyout(pstore, out,
sizeof(struct pf_state_export) * count);
if (error)
- break;
+ goto fail;
out = ps->ps_states + nr;
}
DIOCGETSTATESV2_full:
@@ -4131,49 +4235,8 @@ DIOCGETSTATESV2_full:
case DIOCNATLOOK: {
struct pfioc_natlook *pnl = (struct pfioc_natlook *)addr;
- struct pf_state_key *sk;
- struct pf_kstate *state;
- struct pf_state_key_cmp key;
- int m = 0, direction = pnl->direction;
- int sidx, didx;
-
- /* NATLOOK src and dst are reversed, so reverse sidx/didx */
- sidx = (direction == PF_IN) ? 1 : 0;
- didx = (direction == PF_IN) ? 0 : 1;
-
- if (!pnl->proto ||
- PF_AZERO(&pnl->saddr, pnl->af) ||
- PF_AZERO(&pnl->daddr, pnl->af) ||
- ((pnl->proto == IPPROTO_TCP ||
- pnl->proto == IPPROTO_UDP) &&
- (!pnl->dport || !pnl->sport)))
- error = EINVAL;
- else {
- bzero(&key, sizeof(key));
- key.af = pnl->af;
- key.proto = pnl->proto;
- PF_ACPY(&key.addr[sidx], &pnl->saddr, pnl->af);
- key.port[sidx] = pnl->sport;
- PF_ACPY(&key.addr[didx], &pnl->daddr, pnl->af);
- key.port[didx] = pnl->dport;
-
- state = pf_find_state_all(&key, direction, &m);
- if (state == NULL) {
- error = ENOENT;
- } else {
- if (m > 1) {
- PF_STATE_UNLOCK(state);
- error = E2BIG; /* more than one state */
- } else {
- sk = state->key[sidx];
- PF_ACPY(&pnl->rsaddr, &sk->addr[sidx], sk->af);
- pnl->rsport = sk->port[sidx];
- PF_ACPY(&pnl->rdaddr, &sk->addr[didx], sk->af);
- pnl->rdport = sk->port[didx];
- PF_STATE_UNLOCK(state);
- }
- }
- }
+
+ error = pf_ioctl_natlook(pnl);
break;
}
@@ -4243,12 +4306,12 @@ DIOCGETSTATESV2_full:
if (psp->ifname[0] == '\0') {
error = EINVAL;
- break;
+ goto fail;
}
error = pf_user_strcpy(ps.ifname, psp->ifname, IFNAMSIZ);
if (error != 0)
- break;
+ goto fail;
ifp = ifunit(ps.ifname);
if (ifp != NULL) {
psp->baudrate32 =
@@ -4309,7 +4372,7 @@ DIOCGETSTATESV2_full:
altq = malloc(sizeof(*altq), M_PFALTQ, M_WAITOK | M_ZERO);
error = pf_import_kaltq(pa, altq, IOCPARM_LEN(cmd));
if (error)
- break;
+ goto fail;
altq->local_flags = 0;
PF_RULES_WLOCK();
@@ -4317,7 +4380,7 @@ DIOCGETSTATESV2_full:
PF_RULES_WUNLOCK();
free(altq, M_PFALTQ);
error = EBUSY;
- break;
+ goto fail;
}
/*
@@ -4329,7 +4392,7 @@ DIOCGETSTATESV2_full:
PF_RULES_WUNLOCK();
error = EBUSY;
free(altq, M_PFALTQ);
- break;
+ goto fail;
}
altq->altq_disc = NULL;
TAILQ_FOREACH(a, V_pf_altq_ifs_inactive, entries) {
@@ -4349,7 +4412,7 @@ DIOCGETSTATESV2_full:
if (error) {
PF_RULES_WUNLOCK();
free(altq, M_PFALTQ);
- break;
+ goto fail;
}
if (altq->qname[0] != 0)
@@ -4387,13 +4450,13 @@ DIOCGETSTATESV2_full:
if (pa->ticket != V_ticket_altqs_active) {
PF_RULES_RUNLOCK();
error = EBUSY;
- break;
+ goto fail;
}
altq = pf_altq_get_nth_active(pa->nr);
if (altq == NULL) {
PF_RULES_RUNLOCK();
error = EBUSY;
- break;
+ goto fail;
}
pf_export_kaltq(altq, pa, IOCPARM_LEN(cmd));
PF_RULES_RUNLOCK();
@@ -4417,20 +4480,20 @@ DIOCGETSTATESV2_full:
if (pq->ticket != V_ticket_altqs_active) {
PF_RULES_RUNLOCK();
error = EBUSY;
- break;
+ goto fail;
}
nbytes = pq->nbytes;
altq = pf_altq_get_nth_active(pq->nr);
if (altq == NULL) {
PF_RULES_RUNLOCK();
error = EBUSY;
- break;
+ goto fail;
}
if ((altq->local_flags & PFALTQ_FLAG_IF_REMOVED) != 0) {
PF_RULES_RUNLOCK();
error = ENXIO;
- break;
+ goto fail;
}
PF_RULES_RUNLOCK();
if (cmd == DIOCGETQSTATSV0)
@@ -4494,35 +4557,35 @@ DIOCGETSTATESV2_full:
struct pf_kruleset *ruleset;
struct pfi_kkif *kif = NULL;
- pca->anchor[sizeof(pca->anchor) - 1] = 0;
+ pca->anchor[sizeof(pca->anchor) - 1] = '\0';
if (pca->action < PF_CHANGE_ADD_HEAD ||
pca->action > PF_CHANGE_REMOVE) {
error = EINVAL;
- break;
+ goto fail;
}
if (pca->addr.addr.type != PF_ADDR_ADDRMASK &&
pca->addr.addr.type != PF_ADDR_DYNIFTL &&
pca->addr.addr.type != PF_ADDR_TABLE) {
error = EINVAL;
- break;
+ goto fail;
}
if (pca->addr.addr.p.dyn != NULL) {
error = EINVAL;
- break;
+ goto fail;
}
if (pca->action != PF_CHANGE_REMOVE) {
#ifndef INET
if (pca->af == AF_INET) {
error = EAFNOSUPPORT;
- break;
+ goto fail;
}
#endif /* INET */
#ifndef INET6
if (pca->af == AF_INET6) {
error = EAFNOSUPPORT;
- break;
+ goto fail;
}
#endif /* INET6 */
newpa = malloc(sizeof(*newpa), M_PFRULE, M_WAITOK);
@@ -4606,7 +4669,7 @@ DIOCGETSTATESV2_full:
}
pool->cur = TAILQ_FIRST(&pool->list);
- PF_ACPY(&pool->counter, &pool->cur->addr.v.a.addr, pca->af);
+ pf_addrcpy(&pool->counter, &pool->cur->addr.v.a.addr, pca->af);
PF_RULES_WUNLOCK();
break;
@@ -4625,7 +4688,7 @@ DIOCCHANGEADDR_error:
case DIOCGETRULESETS: {
struct pfioc_ruleset *pr = (struct pfioc_ruleset *)addr;
- pr->path[sizeof(pr->path) - 1] = 0;
+ pr->path[sizeof(pr->path) - 1] = '\0';
error = pf_ioctl_get_rulesets(pr);
break;
@@ -4634,7 +4697,7 @@ DIOCCHANGEADDR_error:
case DIOCGETRULESET: {
struct pfioc_ruleset *pr = (struct pfioc_ruleset *)addr;
- pr->path[sizeof(pr->path) - 1] = 0;
+ pr->path[sizeof(pr->path) - 1] = '\0';
error = pf_ioctl_get_ruleset(pr);
break;
@@ -4645,7 +4708,7 @@ DIOCCHANGEADDR_error:
if (io->pfrio_esize != 0) {
error = ENODEV;
- break;
+ goto fail;
}
PF_RULES_WLOCK();
error = pfr_clr_tables(&io->pfrio_table, &io->pfrio_ndel,
@@ -4661,13 +4724,13 @@ DIOCCHANGEADDR_error:
if (io->pfrio_esize != sizeof(struct pfr_table)) {
error = ENODEV;
- break;
+ goto fail;
}
if (io->pfrio_size < 0 || io->pfrio_size > pf_ioctl_maxcount ||
WOULD_OVERFLOW(io->pfrio_size, sizeof(struct pfr_table))) {
error = ENOMEM;
- break;
+ goto fail;
}
totlen = io->pfrio_size * sizeof(struct pfr_table);
@@ -4676,7 +4739,7 @@ DIOCCHANGEADDR_error:
error = copyin(io->pfrio_buffer, pfrts, totlen);
if (error) {
free(pfrts, M_TEMP);
- break;
+ goto fail;
}
PF_RULES_WLOCK();
error = pfr_add_tables(pfrts, io->pfrio_size,
@@ -4693,13 +4756,13 @@ DIOCCHANGEADDR_error:
if (io->pfrio_esize != sizeof(struct pfr_table)) {
error = ENODEV;
- break;
+ goto fail;
}
if (io->pfrio_size < 0 || io->pfrio_size > pf_ioctl_maxcount ||
WOULD_OVERFLOW(io->pfrio_size, sizeof(struct pfr_table))) {
error = ENOMEM;
- break;
+ goto fail;
}
totlen = io->pfrio_size * sizeof(struct pfr_table);
@@ -4708,7 +4771,7 @@ DIOCCHANGEADDR_error:
error = copyin(io->pfrio_buffer, pfrts, totlen);
if (error) {
free(pfrts, M_TEMP);
- break;
+ goto fail;
}
PF_RULES_WLOCK();
error = pfr_del_tables(pfrts, io->pfrio_size,
@@ -4726,14 +4789,14 @@ DIOCCHANGEADDR_error:
if (io->pfrio_esize != sizeof(struct pfr_table)) {
error = ENODEV;
- break;
+ goto fail;
}
PF_RULES_RLOCK();
n = pfr_table_count(&io->pfrio_table, io->pfrio_flags);
if (n < 0) {
PF_RULES_RUNLOCK();
error = EINVAL;
- break;
+ goto fail;
}
io->pfrio_size = min(io->pfrio_size, n);
@@ -4744,7 +4807,7 @@ DIOCCHANGEADDR_error:
if (pfrts == NULL) {
error = ENOMEM;
PF_RULES_RUNLOCK();
- break;
+ goto fail;
}
error = pfr_get_tables(&io->pfrio_table, pfrts,
&io->pfrio_size, io->pfrio_flags | PFR_FLAG_USERIOCTL);
@@ -4763,7 +4826,7 @@ DIOCCHANGEADDR_error:
if (io->pfrio_esize != sizeof(struct pfr_tstats)) {
error = ENODEV;
- break;
+ goto fail;
}
PF_TABLE_STATS_LOCK();
PF_RULES_RLOCK();
@@ -4772,7 +4835,7 @@ DIOCCHANGEADDR_error:
PF_RULES_RUNLOCK();
PF_TABLE_STATS_UNLOCK();
error = EINVAL;
- break;
+ goto fail;
}
io->pfrio_size = min(io->pfrio_size, n);
@@ -4783,7 +4846,7 @@ DIOCCHANGEADDR_error:
error = ENOMEM;
PF_RULES_RUNLOCK();
PF_TABLE_STATS_UNLOCK();
- break;
+ goto fail;
}
error = pfr_get_tstats(&io->pfrio_table, pfrtstats,
&io->pfrio_size, io->pfrio_flags | PFR_FLAG_USERIOCTL);
@@ -4802,7 +4865,7 @@ DIOCCHANGEADDR_error:
if (io->pfrio_esize != sizeof(struct pfr_table)) {
error = ENODEV;
- break;
+ goto fail;
}
if (io->pfrio_size < 0 || io->pfrio_size > pf_ioctl_maxcount ||
@@ -4811,7 +4874,7 @@ DIOCCHANGEADDR_error:
* size, so we didn't fail on overly large requests.
* Keep doing so. */
io->pfrio_size = pf_ioctl_maxcount;
- break;
+ goto fail;
}
totlen = io->pfrio_size * sizeof(struct pfr_table);
@@ -4820,7 +4883,7 @@ DIOCCHANGEADDR_error:
error = copyin(io->pfrio_buffer, pfrts, totlen);
if (error) {
free(pfrts, M_TEMP);
- break;
+ goto fail;
}
PF_TABLE_STATS_LOCK();
@@ -4841,7 +4904,7 @@ DIOCCHANGEADDR_error:
if (io->pfrio_esize != sizeof(struct pfr_table)) {
error = ENODEV;
- break;
+ goto fail;
}
PF_RULES_RLOCK();
@@ -4849,7 +4912,7 @@ DIOCCHANGEADDR_error:
if (n < 0) {
PF_RULES_RUNLOCK();
error = EINVAL;
- break;
+ goto fail;
}
io->pfrio_size = min(io->pfrio_size, n);
@@ -4861,7 +4924,7 @@ DIOCCHANGEADDR_error:
error = copyin(io->pfrio_buffer, pfrts, totlen);
if (error) {
free(pfrts, M_TEMP);
- break;
+ goto fail;
}
PF_RULES_WLOCK();
error = pfr_set_tflags(pfrts, io->pfrio_size,
@@ -4877,7 +4940,7 @@ DIOCCHANGEADDR_error:
if (io->pfrio_esize != 0) {
error = ENODEV;
- break;
+ goto fail;
}
PF_RULES_WLOCK();
error = pfr_clr_addrs(&io->pfrio_table, &io->pfrio_ndel,
@@ -4893,13 +4956,13 @@ DIOCCHANGEADDR_error:
if (io->pfrio_esize != sizeof(struct pfr_addr)) {
error = ENODEV;
- break;
+ goto fail;
}
if (io->pfrio_size < 0 ||
io->pfrio_size > pf_ioctl_maxcount ||
WOULD_OVERFLOW(io->pfrio_size, sizeof(struct pfr_addr))) {
error = EINVAL;
- break;
+ goto fail;
}
totlen = io->pfrio_size * sizeof(struct pfr_addr);
pfras = mallocarray(io->pfrio_size, sizeof(struct pfr_addr),
@@ -4907,7 +4970,7 @@ DIOCCHANGEADDR_error:
error = copyin(io->pfrio_buffer, pfras, totlen);
if (error) {
free(pfras, M_TEMP);
- break;
+ goto fail;
}
PF_RULES_WLOCK();
error = pfr_add_addrs(&io->pfrio_table, pfras,
@@ -4927,13 +4990,13 @@ DIOCCHANGEADDR_error:
if (io->pfrio_esize != sizeof(struct pfr_addr)) {
error = ENODEV;
- break;
+ goto fail;
}
if (io->pfrio_size < 0 ||
io->pfrio_size > pf_ioctl_maxcount ||
WOULD_OVERFLOW(io->pfrio_size, sizeof(struct pfr_addr))) {
error = EINVAL;
- break;
+ goto fail;
}
totlen = io->pfrio_size * sizeof(struct pfr_addr);
pfras = mallocarray(io->pfrio_size, sizeof(struct pfr_addr),
@@ -4941,7 +5004,7 @@ DIOCCHANGEADDR_error:
error = copyin(io->pfrio_buffer, pfras, totlen);
if (error) {
free(pfras, M_TEMP);
- break;
+ goto fail;
}
PF_RULES_WLOCK();
error = pfr_del_addrs(&io->pfrio_table, pfras,
@@ -4961,17 +5024,17 @@ DIOCCHANGEADDR_error:
if (io->pfrio_esize != sizeof(struct pfr_addr)) {
error = ENODEV;
- break;
+ goto fail;
}
if (io->pfrio_size < 0 || io->pfrio_size2 < 0) {
error = EINVAL;
- break;
+ goto fail;
}
count = max(io->pfrio_size, io->pfrio_size2);
if (count > pf_ioctl_maxcount ||
WOULD_OVERFLOW(count, sizeof(struct pfr_addr))) {
error = EINVAL;
- break;
+ goto fail;
}
totlen = count * sizeof(struct pfr_addr);
pfras = mallocarray(count, sizeof(struct pfr_addr), M_TEMP,
@@ -4979,7 +5042,7 @@ DIOCCHANGEADDR_error:
error = copyin(io->pfrio_buffer, pfras, totlen);
if (error) {
free(pfras, M_TEMP);
- break;
+ goto fail;
}
PF_RULES_WLOCK();
error = pfr_set_addrs(&io->pfrio_table, pfras,
@@ -5000,13 +5063,13 @@ DIOCCHANGEADDR_error:
if (io->pfrio_esize != sizeof(struct pfr_addr)) {
error = ENODEV;
- break;
+ goto fail;
}
if (io->pfrio_size < 0 ||
io->pfrio_size > pf_ioctl_maxcount ||
WOULD_OVERFLOW(io->pfrio_size, sizeof(struct pfr_addr))) {
error = EINVAL;
- break;
+ goto fail;
}
totlen = io->pfrio_size * sizeof(struct pfr_addr);
pfras = mallocarray(io->pfrio_size, sizeof(struct pfr_addr),
@@ -5028,13 +5091,13 @@ DIOCCHANGEADDR_error:
if (io->pfrio_esize != sizeof(struct pfr_astats)) {
error = ENODEV;
- break;
+ goto fail;
}
if (io->pfrio_size < 0 ||
io->pfrio_size > pf_ioctl_maxcount ||
WOULD_OVERFLOW(io->pfrio_size, sizeof(struct pfr_astats))) {
error = EINVAL;
- break;
+ goto fail;
}
totlen = io->pfrio_size * sizeof(struct pfr_astats);
pfrastats = mallocarray(io->pfrio_size,
@@ -5056,13 +5119,13 @@ DIOCCHANGEADDR_error:
if (io->pfrio_esize != sizeof(struct pfr_addr)) {
error = ENODEV;
- break;
+ goto fail;
}
if (io->pfrio_size < 0 ||
io->pfrio_size > pf_ioctl_maxcount ||
WOULD_OVERFLOW(io->pfrio_size, sizeof(struct pfr_addr))) {
error = EINVAL;
- break;
+ goto fail;
}
totlen = io->pfrio_size * sizeof(struct pfr_addr);
pfras = mallocarray(io->pfrio_size, sizeof(struct pfr_addr),
@@ -5070,7 +5133,7 @@ DIOCCHANGEADDR_error:
error = copyin(io->pfrio_buffer, pfras, totlen);
if (error) {
free(pfras, M_TEMP);
- break;
+ goto fail;
}
PF_RULES_WLOCK();
error = pfr_clr_astats(&io->pfrio_table, pfras,
@@ -5090,13 +5153,13 @@ DIOCCHANGEADDR_error:
if (io->pfrio_esize != sizeof(struct pfr_addr)) {
error = ENODEV;
- break;
+ goto fail;
}
if (io->pfrio_size < 0 ||
io->pfrio_size > pf_ioctl_maxcount ||
WOULD_OVERFLOW(io->pfrio_size, sizeof(struct pfr_addr))) {
error = EINVAL;
- break;
+ goto fail;
}
totlen = io->pfrio_size * sizeof(struct pfr_addr);
pfras = mallocarray(io->pfrio_size, sizeof(struct pfr_addr),
@@ -5104,7 +5167,7 @@ DIOCCHANGEADDR_error:
error = copyin(io->pfrio_buffer, pfras, totlen);
if (error) {
free(pfras, M_TEMP);
- break;
+ goto fail;
}
PF_RULES_RLOCK();
error = pfr_tst_addrs(&io->pfrio_table, pfras,
@@ -5124,13 +5187,13 @@ DIOCCHANGEADDR_error:
if (io->pfrio_esize != sizeof(struct pfr_addr)) {
error = ENODEV;
- break;
+ goto fail;
}
if (io->pfrio_size < 0 ||
io->pfrio_size > pf_ioctl_maxcount ||
WOULD_OVERFLOW(io->pfrio_size, sizeof(struct pfr_addr))) {
error = EINVAL;
- break;
+ goto fail;
}
totlen = io->pfrio_size * sizeof(struct pfr_addr);
pfras = mallocarray(io->pfrio_size, sizeof(struct pfr_addr),
@@ -5138,7 +5201,7 @@ DIOCCHANGEADDR_error:
error = copyin(io->pfrio_buffer, pfras, totlen);
if (error) {
free(pfras, M_TEMP);
- break;
+ goto fail;
}
PF_RULES_WLOCK();
error = pfr_ina_define(&io->pfrio_table, pfras,
@@ -5173,13 +5236,13 @@ DIOCCHANGEADDR_error:
if (io->esize != sizeof(*ioe)) {
error = ENODEV;
- break;
+ goto fail;
}
if (io->size < 0 ||
io->size > pf_ioctl_maxcount ||
WOULD_OVERFLOW(io->size, sizeof(struct pfioc_trans_e))) {
error = EINVAL;
- break;
+ goto fail;
}
totlen = sizeof(struct pfioc_trans_e) * io->size;
ioes = mallocarray(io->size, sizeof(struct pfioc_trans_e),
@@ -5187,7 +5250,7 @@ DIOCCHANGEADDR_error:
error = copyin(io->array, ioes, totlen);
if (error) {
free(ioes, M_TEMP);
- break;
+ goto fail;
}
PF_RULES_WLOCK();
for (i = 0, ioe = ioes; i < io->size; i++, ioe++) {
@@ -5254,13 +5317,13 @@ DIOCCHANGEADDR_error:
if (io->esize != sizeof(*ioe)) {
error = ENODEV;
- break;
+ goto fail;
}
if (io->size < 0 ||
io->size > pf_ioctl_maxcount ||
WOULD_OVERFLOW(io->size, sizeof(struct pfioc_trans_e))) {
error = EINVAL;
- break;
+ goto fail;
}
totlen = sizeof(struct pfioc_trans_e) * io->size;
ioes = mallocarray(io->size, sizeof(struct pfioc_trans_e),
@@ -5268,7 +5331,7 @@ DIOCCHANGEADDR_error:
error = copyin(io->array, ioes, totlen);
if (error) {
free(ioes, M_TEMP);
- break;
+ goto fail;
}
PF_RULES_WLOCK();
for (i = 0, ioe = ioes; i < io->size; i++, ioe++) {
@@ -5337,14 +5400,14 @@ DIOCCHANGEADDR_error:
if (io->esize != sizeof(*ioe)) {
error = ENODEV;
- break;
+ goto fail;
}
if (io->size < 0 ||
io->size > pf_ioctl_maxcount ||
WOULD_OVERFLOW(io->size, sizeof(struct pfioc_trans_e))) {
error = EINVAL;
- break;
+ goto fail;
}
totlen = sizeof(struct pfioc_trans_e) * io->size;
@@ -5353,12 +5416,12 @@ DIOCCHANGEADDR_error:
error = copyin(io->array, ioes, totlen);
if (error) {
free(ioes, M_TEMP);
- break;
+ goto fail;
}
PF_RULES_WLOCK();
/* First makes sure everything will succeed. */
for (i = 0, ioe = ioes; i < io->size; i++, ioe++) {
- ioe->anchor[sizeof(ioe->anchor) - 1] = 0;
+ ioe->anchor[sizeof(ioe->anchor) - 1] = '\0';
switch (ioe->rs_num) {
case PF_RULESET_ETH:
ers = pf_find_keth_ruleset(ioe->anchor);
@@ -5494,7 +5557,7 @@ DIOCCHANGEADDR_error:
if (psn->psn_len == 0) {
psn->psn_len = sizeof(struct pf_src_node) * nr;
- break;
+ goto fail;
}
nr = 0;
@@ -5519,7 +5582,7 @@ DIOCCHANGEADDR_error:
sizeof(struct pf_src_node) * nr);
if (error) {
free(pstore, M_TEMP);
- break;
+ goto fail;
}
psn->psn_len = sizeof(struct pf_src_node) * nr;
free(pstore, M_TEMP);
@@ -5575,14 +5638,14 @@ DIOCCHANGEADDR_error:
if (io->pfiio_esize != sizeof(struct pfi_kif)) {
error = ENODEV;
- break;
+ goto fail;
}
if (io->pfiio_size < 0 ||
io->pfiio_size > pf_ioctl_maxcount ||
WOULD_OVERFLOW(io->pfiio_size, sizeof(struct pfi_kif))) {
error = EINVAL;
- break;
+ goto fail;
}
io->pfiio_name[sizeof(io->pfiio_name) - 1] = '\0';
@@ -6024,11 +6087,11 @@ pf_kill_srcnodes(struct pfioc_src_node_kill *psnk)
PF_HASHROW_LOCK(sh);
LIST_FOREACH_SAFE(sn, &sh->nodes, entry, tmp)
if (psnk == NULL ||
- (PF_MATCHA(psnk->psnk_src.neg,
+ (pf_match_addr(psnk->psnk_src.neg,
&psnk->psnk_src.addr.v.a.addr,
&psnk->psnk_src.addr.v.a.mask,
&sn->addr, sn->af) &&
- PF_MATCHA(psnk->psnk_dst.neg,
+ pf_match_addr(psnk->psnk_dst.neg,
&psnk->psnk_dst.addr.v.a.addr,
&psnk->psnk_dst.addr.v.a.mask,
&sn->raddr, sn->af))) {
@@ -6132,10 +6195,10 @@ relock_DIOCCLRSTATES:
match_key.af = s->key[idx]->af;
match_key.proto = s->key[idx]->proto;
- PF_ACPY(&match_key.addr[0],
+ pf_addrcpy(&match_key.addr[0],
&s->key[idx]->addr[1], match_key.af);
match_key.port[0] = s->key[idx]->port[1];
- PF_ACPY(&match_key.addr[1],
+ pf_addrcpy(&match_key.addr[1],
&s->key[idx]->addr[0], match_key.af);
match_key.port[1] = s->key[idx]->port[0];
}
diff --git a/sys/netpfil/pf/pf_lb.c b/sys/netpfil/pf/pf_lb.c
index 5e7865e4fac5..9c7863bb301e 100644
--- a/sys/netpfil/pf/pf_lb.c
+++ b/sys/netpfil/pf/pf_lb.c
@@ -80,7 +80,6 @@ static enum pf_test_status pf_step_into_translation_anchor(int, struct pf_test_c
struct pf_krule *);
static int pf_get_sport(struct pf_pdesc *, struct pf_krule *,
struct pf_addr *, uint16_t *, uint16_t, uint16_t,
- struct pf_ksrc_node **, struct pf_srchash **,
struct pf_kpool *, struct pf_udp_mapping **,
pf_sn_types_t);
static bool pf_islinklocal(const sa_family_t, const struct pf_addr *);
@@ -291,10 +290,8 @@ pf_match_translation(int rs_num, struct pf_test_ctx *ctx)
}
static int
-pf_get_sport(struct pf_pdesc *pd, struct pf_krule *r,
- struct pf_addr *naddr, uint16_t *nport, uint16_t low,
- uint16_t high, struct pf_ksrc_node **sn,
- struct pf_srchash **sh, struct pf_kpool *rpool,
+pf_get_sport(struct pf_pdesc *pd, struct pf_krule *r, struct pf_addr *naddr,
+ uint16_t *nport, uint16_t low, uint16_t high, struct pf_kpool *rpool,
struct pf_udp_mapping **udp_mapping, pf_sn_types_t sn_type)
{
struct pf_state_key_cmp key;
@@ -319,20 +316,27 @@ pf_get_sport(struct pf_pdesc *pd, struct pf_krule *r,
bzero(&udp_source, sizeof(udp_source));
udp_source.af = pd->af;
- PF_ACPY(&udp_source.addr, &pd->nsaddr, pd->af);
+ pf_addrcpy(&udp_source.addr, &pd->nsaddr, pd->af);
udp_source.port = pd->nsport;
if (udp_mapping) {
+ struct pf_ksrc_node *sn = NULL;
+ struct pf_srchash *sh = NULL;
*udp_mapping = pf_udp_mapping_find(&udp_source);
if (*udp_mapping) {
- PF_ACPY(naddr, &(*udp_mapping)->endpoints[1].addr, pd->af);
+ pf_addrcpy(naddr,
+ &(*udp_mapping)->endpoints[1].addr,
+ pd->af);
*nport = (*udp_mapping)->endpoints[1].port;
- /* Try to find a src_node as per pf_map_addr(). */
- if (*sn == NULL && rpool->opts & PF_POOL_STICKYADDR &&
+ /*
+ * Try to find a src_node as per pf_map_addr().
+ * XXX: Why? This code seems to do nothing.
+ */
+ if (rpool->opts & PF_POOL_STICKYADDR &&
(rpool->opts & PF_POOL_TYPEMASK) != PF_POOL_NONE)
- *sn = pf_find_src_node(&pd->nsaddr, r,
- pd->af, sh, sn_type, false);
- if (*sn != NULL)
- PF_SRC_NODE_UNLOCK(*sn);
+ sn = pf_find_src_node(&pd->nsaddr, r,
+ pd->af, &sh, sn_type, false);
+ if (sn != NULL)
+ PF_SRC_NODE_UNLOCK(sn);
return (0);
} else {
*udp_mapping = pf_udp_mapping_create(pd->af, &pd->nsaddr,
@@ -344,7 +348,7 @@ pf_get_sport(struct pf_pdesc *pd, struct pf_krule *r,
}
if (pf_map_addr_sn(pd->naf, r, &pd->nsaddr, naddr, NULL, &init_addr,
- sn, sh, rpool, sn_type))
+ rpool, sn_type))
goto failed;
if (pd->proto == IPPROTO_ICMP) {
@@ -369,12 +373,13 @@ pf_get_sport(struct pf_pdesc *pd, struct pf_krule *r,
key.proto = pd->proto;
do {
- PF_ACPY(&key.addr[didx], &pd->ndaddr, key.af);
- PF_ACPY(&key.addr[sidx], naddr, key.af);
+ pf_addrcpy(&key.addr[didx], &pd->ndaddr, key.af);
+ pf_addrcpy(&key.addr[sidx], naddr, key.af);
key.port[didx] = pd->ndport;
if (udp_mapping && *udp_mapping)
- PF_ACPY(&(*udp_mapping)->endpoints[1].addr, naddr, pd->af);
+ pf_addrcpy(&(*udp_mapping)->endpoints[1].addr, naddr,
+ pd->af);
/*
* port search; start random, step;
@@ -467,9 +472,8 @@ pf_get_sport(struct pf_pdesc *pd, struct pf_krule *r,
* pick a different source address since we're out
* of free port choices for the current one.
*/
- (*sn) = NULL;
if (pf_map_addr_sn(pd->naf, r, &pd->nsaddr, naddr, NULL,
- &init_addr, sn, sh, rpool, sn_type))
+ &init_addr, rpool, sn_type))
return (1);
break;
case PF_POOL_NONE:
@@ -500,7 +504,6 @@ pf_islinklocal(const sa_family_t af, const struct pf_addr *addr)
static int
pf_get_mape_sport(struct pf_pdesc *pd, struct pf_krule *r,
struct pf_addr *naddr, uint16_t *nport,
- struct pf_ksrc_node **sn, struct pf_srchash **sh,
struct pf_udp_mapping **udp_mapping, struct pf_kpool *rpool)
{
uint16_t psmask, low, highmask;
@@ -520,16 +523,14 @@ pf_get_mape_sport(struct pf_pdesc *pd, struct pf_krule *r,
for (i = cut; i <= ahigh; i++) {
low = (i << ashift) | psmask;
- if (!pf_get_sport(pd, r,
- naddr, nport, low, low | highmask, sn, sh, rpool,
- udp_mapping, PF_SN_NAT))
+ if (!pf_get_sport(pd, r, naddr, nport, low, low | highmask,
+ rpool, udp_mapping, PF_SN_NAT))
return (0);
}
for (i = cut - 1; i > 0; i--) {
low = (i << ashift) | psmask;
- if (!pf_get_sport(pd, r,
- naddr, nport, low, low | highmask, sn, sh, rpool,
- udp_mapping, PF_SN_NAT))
+ if (!pf_get_sport(pd, r, naddr, nport, low, low | highmask,
+ rpool, udp_mapping, PF_SN_NAT))
return (0);
}
return (1);
@@ -542,6 +543,7 @@ pf_map_addr(sa_family_t af, struct pf_krule *r, struct pf_addr *saddr,
{
u_short reason = PFRES_MATCH;
struct pf_addr *raddr = NULL, *rmask = NULL;
+ struct pfr_ktable *kt;
uint64_t hashidx;
int cnt;
@@ -591,39 +593,35 @@ pf_map_addr(sa_family_t af, struct pf_krule *r, struct pf_addr *saddr,
switch (rpool->opts & PF_POOL_TYPEMASK) {
case PF_POOL_NONE:
- PF_ACPY(naddr, raddr, af);
+ pf_addrcpy(naddr, raddr, af);
break;
case PF_POOL_BITMASK:
- PF_POOLMASK(naddr, raddr, rmask, saddr, af);
+ pf_poolmask(naddr, raddr, rmask, saddr, af);
break;
case PF_POOL_RANDOM:
- if (rpool->cur->addr.type == PF_ADDR_TABLE) {
- cnt = rpool->cur->addr.p.tbl->pfrkt_cnt;
- if (cnt == 0)
- rpool->tblidx = 0;
+ if (rpool->cur->addr.type == PF_ADDR_TABLE ||
+ rpool->cur->addr.type == PF_ADDR_DYNIFTL) {
+ if (rpool->cur->addr.type == PF_ADDR_TABLE)
+ kt = rpool->cur->addr.p.tbl;
else
- rpool->tblidx = (int)arc4random_uniform(cnt);
- memset(&rpool->counter, 0, sizeof(rpool->counter));
- if (pfr_pool_get(rpool->cur->addr.p.tbl,
- &rpool->tblidx, &rpool->counter, af, NULL)) {
+ kt = rpool->cur->addr.p.dyn->pfid_kt;
+ kt = pfr_ktable_select_active(kt);
+ if (kt == NULL) {
reason = PFRES_MAPFAILED;
goto done_pool_mtx; /* unsupported */
}
- PF_ACPY(naddr, &rpool->counter, af);
- } else if (rpool->cur->addr.type == PF_ADDR_DYNIFTL) {
- cnt = rpool->cur->addr.p.dyn->pfid_kt->pfrkt_cnt;
+ cnt = kt->pfrkt_cnt;
if (cnt == 0)
rpool->tblidx = 0;
else
rpool->tblidx = (int)arc4random_uniform(cnt);
memset(&rpool->counter, 0, sizeof(rpool->counter));
- if (pfr_pool_get(rpool->cur->addr.p.dyn->pfid_kt,
- &rpool->tblidx, &rpool->counter, af,
- pf_islinklocal)) {
+ if (pfr_pool_get(kt, &rpool->tblidx, &rpool->counter,
+ af, pf_islinklocal, false)) {
reason = PFRES_MAPFAILED;
goto done_pool_mtx; /* unsupported */
}
- PF_ACPY(naddr, &rpool->counter, af);
+ pf_addrcpy(naddr, &rpool->counter, af);
} else if (init_addr != NULL && PF_AZERO(init_addr, af)) {
switch (af) {
#ifdef INET
@@ -654,12 +652,12 @@ pf_map_addr(sa_family_t af, struct pf_krule *r, struct pf_addr *saddr,
break;
#endif /* INET6 */
}
- PF_POOLMASK(naddr, raddr, rmask, &rpool->counter, af);
- PF_ACPY(init_addr, naddr, af);
+ pf_poolmask(naddr, raddr, rmask, &rpool->counter, af);
+ pf_addrcpy(init_addr, naddr, af);
} else {
- PF_AINC(&rpool->counter, af);
- PF_POOLMASK(naddr, raddr, rmask, &rpool->counter, af);
+ pf_addr_inc(&rpool->counter, af);
+ pf_poolmask(naddr, raddr, rmask, &rpool->counter, af);
}
break;
case PF_POOL_SRCHASH:
@@ -668,35 +666,31 @@ pf_map_addr(sa_family_t af, struct pf_krule *r, struct pf_addr *saddr,
hashidx =
pf_hash(saddr, (struct pf_addr *)&hash, &rpool->key, af);
- if (rpool->cur->addr.type == PF_ADDR_TABLE) {
- cnt = rpool->cur->addr.p.tbl->pfrkt_cnt;
- if (cnt == 0)
- rpool->tblidx = 0;
+ if (rpool->cur->addr.type == PF_ADDR_TABLE ||
+ rpool->cur->addr.type == PF_ADDR_DYNIFTL) {
+ if (rpool->cur->addr.type == PF_ADDR_TABLE)
+ kt = rpool->cur->addr.p.tbl;
else
- rpool->tblidx = (int)(hashidx % cnt);
- memset(&rpool->counter, 0, sizeof(rpool->counter));
- if (pfr_pool_get(rpool->cur->addr.p.tbl,
- &rpool->tblidx, &rpool->counter, af, NULL)) {
+ kt = rpool->cur->addr.p.dyn->pfid_kt;
+ kt = pfr_ktable_select_active(kt);
+ if (kt == NULL) {
reason = PFRES_MAPFAILED;
goto done_pool_mtx; /* unsupported */
}
- PF_ACPY(naddr, &rpool->counter, af);
- } else if (rpool->cur->addr.type == PF_ADDR_DYNIFTL) {
- cnt = rpool->cur->addr.p.dyn->pfid_kt->pfrkt_cnt;
+ cnt = kt->pfrkt_cnt;
if (cnt == 0)
rpool->tblidx = 0;
else
rpool->tblidx = (int)(hashidx % cnt);
memset(&rpool->counter, 0, sizeof(rpool->counter));
- if (pfr_pool_get(rpool->cur->addr.p.dyn->pfid_kt,
- &rpool->tblidx, &rpool->counter, af,
- pf_islinklocal)) {
+ if (pfr_pool_get(kt, &rpool->tblidx, &rpool->counter,
+ af, pf_islinklocal, false)) {
reason = PFRES_MAPFAILED;
goto done_pool_mtx; /* unsupported */
}
- PF_ACPY(naddr, &rpool->counter, af);
+ pf_addrcpy(naddr, &rpool->counter, af);
} else {
- PF_POOLMASK(naddr, raddr, rmask,
+ pf_poolmask(naddr, raddr, rmask,
(struct pf_addr *)&hash, af);
}
break;
@@ -707,11 +701,12 @@ pf_map_addr(sa_family_t af, struct pf_krule *r, struct pf_addr *saddr,
if (rpool->cur->addr.type == PF_ADDR_TABLE) {
if (!pfr_pool_get(rpool->cur->addr.p.tbl,
- &rpool->tblidx, &rpool->counter, af, NULL))
+ &rpool->tblidx, &rpool->counter, af, NULL, true))
goto get_addr;
} else if (rpool->cur->addr.type == PF_ADDR_DYNIFTL) {
if (!pfr_pool_get(rpool->cur->addr.p.dyn->pfid_kt,
- &rpool->tblidx, &rpool->counter, af, pf_islinklocal))
+ &rpool->tblidx, &rpool->counter, af, pf_islinklocal,
+ true))
goto get_addr;
} else if (pf_match_addr(0, raddr, rmask, &rpool->counter, af))
goto get_addr;
@@ -721,9 +716,10 @@ pf_map_addr(sa_family_t af, struct pf_krule *r, struct pf_addr *saddr,
rpool->cur = TAILQ_FIRST(&rpool->list);
else
rpool->cur = TAILQ_NEXT(rpool->cur, entries);
+ rpool->tblidx = -1;
if (rpool->cur->addr.type == PF_ADDR_TABLE) {
if (pfr_pool_get(rpool->cur->addr.p.tbl,
- &rpool->tblidx, &rpool->counter, af, NULL)) {
+ &rpool->tblidx, &rpool->counter, af, NULL, true)) {
/* table contains no address of type 'af' */
if (rpool->cur != acur)
goto try_next;
@@ -731,9 +727,9 @@ pf_map_addr(sa_family_t af, struct pf_krule *r, struct pf_addr *saddr,
goto done_pool_mtx;
}
} else if (rpool->cur->addr.type == PF_ADDR_DYNIFTL) {
- rpool->tblidx = -1;
if (pfr_pool_get(rpool->cur->addr.p.dyn->pfid_kt,
- &rpool->tblidx, &rpool->counter, af, pf_islinklocal)) {
+ &rpool->tblidx, &rpool->counter, af, pf_islinklocal,
+ true)) {
/* table contains no address of type 'af' */
if (rpool->cur != acur)
goto try_next;
@@ -743,14 +739,14 @@ pf_map_addr(sa_family_t af, struct pf_krule *r, struct pf_addr *saddr,
} else {
raddr = &rpool->cur->addr.v.a.addr;
rmask = &rpool->cur->addr.v.a.mask;
- PF_ACPY(&rpool->counter, raddr, af);
+ pf_addrcpy(&rpool->counter, raddr, af);
}
get_addr:
- PF_ACPY(naddr, &rpool->counter, af);
+ pf_addrcpy(naddr, &rpool->counter, af);
if (init_addr != NULL && PF_AZERO(init_addr, af))
- PF_ACPY(init_addr, naddr, af);
- PF_AINC(&rpool->counter, af);
+ pf_addrcpy(init_addr, naddr, af);
+ pf_addr_inc(&rpool->counter, af);
break;
}
}
@@ -761,48 +757,41 @@ pf_map_addr(sa_family_t af, struct pf_krule *r, struct pf_addr *saddr,
done_pool_mtx:
mtx_unlock(&rpool->mtx);
- if (reason) {
- counter_u64_add(V_pf_status.counters[reason], 1);
- }
-
return (reason);
}
u_short
pf_map_addr_sn(sa_family_t af, struct pf_krule *r, struct pf_addr *saddr,
struct pf_addr *naddr, struct pfi_kkif **nkif, struct pf_addr *init_addr,
- struct pf_ksrc_node **sn, struct pf_srchash **sh, struct pf_kpool *rpool,
- pf_sn_types_t sn_type)
+ struct pf_kpool *rpool, pf_sn_types_t sn_type)
{
+ struct pf_ksrc_node *sn = NULL;
+ struct pf_srchash *sh = NULL;
u_short reason = 0;
- KASSERT(*sn == NULL, ("*sn not NULL"));
-
/*
* If this is a sticky-address rule, try to find an existing src_node.
- * Request the sh to be unlocked if sn was not found, as we never
- * insert a new sn when parsing the ruleset.
*/
if (rpool->opts & PF_POOL_STICKYADDR &&
(rpool->opts & PF_POOL_TYPEMASK) != PF_POOL_NONE)
- *sn = pf_find_src_node(saddr, r, af, sh, sn_type, false);
+ sn = pf_find_src_node(saddr, r, af, &sh, sn_type, false);
- if (*sn != NULL) {
- PF_SRC_NODE_LOCK_ASSERT(*sn);
+ if (sn != NULL) {
+ PF_SRC_NODE_LOCK_ASSERT(sn);
/* If the supplied address is the same as the current one we've
* been asked before, so tell the caller that there's no other
* address to be had. */
- if (PF_AEQ(naddr, &(*sn)->raddr, af)) {
+ if (PF_AEQ(naddr, &(sn->raddr), af)) {
reason = PFRES_MAPFAILED;
goto done;
}
- PF_ACPY(naddr, &(*sn)->raddr, af);
+ pf_addrcpy(naddr, &(sn->raddr), af);
if (nkif)
- *nkif = (*sn)->rkif;
+ *nkif = sn->rkif;
if (V_pf_status.debug >= PF_DEBUG_NOISY) {
- printf("pf_map_addr: src tracking maps ");
+ printf("%s: src tracking maps ", __func__);
pf_print_host(saddr, 0, af);
printf(" to ");
pf_print_host(naddr, 0, af);
@@ -817,14 +806,16 @@ pf_map_addr_sn(sa_family_t af, struct pf_krule *r, struct pf_addr *saddr,
* Source node has not been found. Find a new address and store it
* in variables given by the caller.
*/
- if (pf_map_addr(af, r, saddr, naddr, nkif, init_addr, rpool) != 0) {
- /* pf_map_addr() sets reason counters on its own */
+ if ((reason = pf_map_addr(af, r, saddr, naddr, nkif, init_addr,
+ rpool)) != 0) {
+ if (V_pf_status.debug >= PF_DEBUG_MISC)
+ printf("%s: pf_map_addr has failed\n", __func__);
goto done;
}
if (V_pf_status.debug >= PF_DEBUG_NOISY &&
(rpool->opts & PF_POOL_TYPEMASK) != PF_POOL_NONE) {
- printf("pf_map_addr: selected address ");
+ printf("%s: selected address ", __func__);
pf_print_host(naddr, 0, af);
if (nkif)
printf("@%s", (*nkif)->pfik_name);
@@ -832,12 +823,8 @@ pf_map_addr_sn(sa_family_t af, struct pf_krule *r, struct pf_addr *saddr,
}
done:
- if ((*sn) != NULL)
- PF_SRC_NODE_UNLOCK(*sn);
-
- if (reason) {
- counter_u64_add(V_pf_status.counters[reason], 1);
- }
+ if (sn != NULL)
+ PF_SRC_NODE_UNLOCK(sn);
return (reason);
}
@@ -887,8 +874,6 @@ pf_get_transaddr(struct pf_test_ctx *ctx, struct pf_krule *r,
{
struct pf_pdesc *pd = ctx->pd;
struct pf_addr *naddr;
- struct pf_ksrc_node *sn = NULL;
- struct pf_srchash *sh = NULL;
uint16_t *nportp;
uint16_t low, high;
u_short reason;
@@ -916,8 +901,8 @@ pf_get_transaddr(struct pf_test_ctx *ctx, struct pf_krule *r,
high = rpool->proxy_port[1];
}
if (rpool->mape.offset > 0) {
- if (pf_get_mape_sport(pd, r, naddr, nportp, &sn,
- &sh, &ctx->udp_mapping, rpool)) {
+ if (pf_get_mape_sport(pd, r, naddr, nportp,
+ &ctx->udp_mapping, rpool)) {
DPFPRINTF(PF_DEBUG_MISC,
("pf: MAP-E port allocation (%u/%u/%u)"
" failed\n",
@@ -927,8 +912,8 @@ pf_get_transaddr(struct pf_test_ctx *ctx, struct pf_krule *r,
reason = PFRES_MAPFAILED;
goto notrans;
}
- } else if (pf_get_sport(pd, r, naddr, nportp, low, high, &sn,
- &sh, rpool, &ctx->udp_mapping, PF_SN_NAT)) {
+ } else if (pf_get_sport(pd, r, naddr, nportp, low, high,
+ rpool, &ctx->udp_mapping, PF_SN_NAT)) {
DPFPRINTF(PF_DEBUG_MISC,
("pf: NAT proxy port allocation (%u-%u) failed\n",
rpool->proxy_port[0], rpool->proxy_port[1]));
@@ -948,7 +933,7 @@ pf_get_transaddr(struct pf_test_ctx *ctx, struct pf_krule *r,
reason = PFRES_MAPFAILED;
goto notrans;
}
- PF_POOLMASK(naddr,
+ pf_poolmask(naddr,
&rpool->cur->addr.p.dyn->pfid_addr4,
&rpool->cur->addr.p.dyn->pfid_mask4,
&pd->nsaddr, AF_INET);
@@ -961,7 +946,7 @@ pf_get_transaddr(struct pf_test_ctx *ctx, struct pf_krule *r,
reason = PFRES_MAPFAILED;
goto notrans;
}
- PF_POOLMASK(naddr,
+ pf_poolmask(naddr,
&rpool->cur->addr.p.dyn->pfid_addr6,
&rpool->cur->addr.p.dyn->pfid_mask6,
&pd->nsaddr, AF_INET6);
@@ -969,7 +954,7 @@ pf_get_transaddr(struct pf_test_ctx *ctx, struct pf_krule *r,
#endif /* INET6 */
}
} else
- PF_POOLMASK(naddr,
+ pf_poolmask(naddr,
&rpool->cur->addr.v.a.addr,
&rpool->cur->addr.v.a.mask, &pd->nsaddr,
pd->af);
@@ -983,7 +968,7 @@ pf_get_transaddr(struct pf_test_ctx *ctx, struct pf_krule *r,
reason = PFRES_MAPFAILED;
goto notrans;
}
- PF_POOLMASK(naddr,
+ pf_poolmask(naddr,
&r->src.addr.p.dyn->pfid_addr4,
&r->src.addr.p.dyn->pfid_mask4,
&pd->ndaddr, AF_INET);
@@ -995,7 +980,7 @@ pf_get_transaddr(struct pf_test_ctx *ctx, struct pf_krule *r,
reason = PFRES_MAPFAILED;
goto notrans;
}
- PF_POOLMASK(naddr,
+ pf_poolmask(naddr,
&r->src.addr.p.dyn->pfid_addr6,
&r->src.addr.p.dyn->pfid_mask6,
&pd->ndaddr, AF_INET6);
@@ -1003,7 +988,7 @@ pf_get_transaddr(struct pf_test_ctx *ctx, struct pf_krule *r,
#endif /* INET6 */
}
} else
- PF_POOLMASK(naddr, &r->src.addr.v.a.addr,
+ pf_poolmask(naddr, &r->src.addr.v.a.addr,
&r->src.addr.v.a.mask, &pd->ndaddr, pd->af);
break;
}
@@ -1014,11 +999,11 @@ pf_get_transaddr(struct pf_test_ctx *ctx, struct pf_krule *r,
uint16_t cut, low, high, nport;
reason = pf_map_addr_sn(pd->af, r, &pd->nsaddr, naddr, NULL,
- NULL, &sn, &sh, rpool, PF_SN_NAT);
+ NULL, rpool, PF_SN_NAT);
if (reason != 0)
goto notrans;
if ((rpool->opts & PF_POOL_TYPEMASK) == PF_POOL_BITMASK)
- PF_POOLMASK(naddr, naddr, &rpool->cur->addr.v.a.mask,
+ pf_poolmask(naddr, naddr, &rpool->cur->addr.v.a.mask,
&pd->ndaddr, pd->af);
/* Do not change SCTP ports. */
@@ -1027,10 +1012,13 @@ pf_get_transaddr(struct pf_test_ctx *ctx, struct pf_krule *r,
if (rpool->proxy_port[1]) {
uint32_t tmp_nport;
+ uint16_t div;
+
+ div = r->rdr.proxy_port[1] - r->rdr.proxy_port[0] + 1;
+ div = (div == 0) ? 1 : div;
- tmp_nport = ((ntohs(pd->ndport) - ntohs(r->dst.port[0])) %
- (rpool->proxy_port[1] - rpool->proxy_port[0] +
- 1)) + rpool->proxy_port[0];
+ tmp_nport = ((ntohs(pd->ndport) - ntohs(r->dst.port[0])) % div) +
+ rpool->proxy_port[0];
/* Wrap around if necessary. */
if (tmp_nport > 65535)
@@ -1056,9 +1044,9 @@ pf_get_transaddr(struct pf_test_ctx *ctx, struct pf_krule *r,
key.af = pd->af;
key.proto = pd->proto;
key.port[0] = pd->nsport;
- PF_ACPY(&key.addr[0], &pd->nsaddr, key.af);
+ pf_addrcpy(&key.addr[0], &pd->nsaddr, key.af);
key.port[1] = nport;
- PF_ACPY(&key.addr[1], naddr, key.af);
+ pf_addrcpy(&key.addr[1], naddr, key.af);
if (!pf_find_state_all_exists(&key, PF_OUT))
break;
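
The div guard above matters because a full 0-65535 redirect range makes the
16-bit divisor wrap to zero. A minimal standalone sketch of the port-mapping
arithmetic (illustrative only, not kernel code; all names here are
hypothetical):

    #include <stdint.h>

    static uint16_t
    map_rdr_port(uint16_t dport, uint16_t dstlo, uint16_t proxylo,
        uint16_t proxyhi)
    {
    	uint32_t tmp;
    	uint16_t div;

    	div = proxyhi - proxylo + 1;	/* wraps to 0 for a full range */
    	div = (div == 0) ? 1 : div;	/* the new divide-by-zero guard */
    	tmp = ((dport - dstlo) % div) + proxylo;
    	if (tmp > 65535)		/* wrap around, as above */
    		tmp -= 65535;
    	return ((uint16_t)tmp);
    }
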
@@ -1131,8 +1119,6 @@ pf_get_transaddr_af(struct pf_krule *r, struct pf_pdesc *pd)
struct pf_addr ndaddr, nsaddr, naddr;
u_int16_t nport = 0;
int prefixlen = 96;
- struct pf_srchash *sh = NULL;
- struct pf_ksrc_node *sns = NULL;
bzero(&nsaddr, sizeof(nsaddr));
bzero(&ndaddr, sizeof(ndaddr));
@@ -1151,9 +1137,8 @@ pf_get_transaddr_af(struct pf_krule *r, struct pf_pdesc *pd)
panic("pf_get_transaddr_af: no nat pool for source address");
/* get source address and port */
- if (pf_get_sport(pd, r, &nsaddr, &nport,
- r->nat.proxy_port[0], r->nat.proxy_port[1], &sns, &sh, &r->nat,
- NULL, PF_SN_NAT)) {
+ if (pf_get_sport(pd, r, &nsaddr, &nport, r->nat.proxy_port[0],
+ r->nat.proxy_port[1], &r->nat, NULL, PF_SN_NAT)) {
DPFPRINTF(PF_DEBUG_MISC,
("pf: af-to NAT proxy port allocation (%u-%u) failed",
r->nat.proxy_port[0], r->nat.proxy_port[1]));
@@ -1179,7 +1164,7 @@ pf_get_transaddr_af(struct pf_krule *r, struct pf_pdesc *pd)
/* get the destination address and port */
if (! TAILQ_EMPTY(&r->rdr.list)) {
if (pf_map_addr_sn(pd->naf, r, &nsaddr, &naddr, NULL, NULL,
- &sns, NULL, &r->rdr, PF_SN_NAT))
+ &r->rdr, PF_SN_NAT))
return (-1);
if (r->rdr.proxy_port[0])
pd->ndport = htons(r->rdr.proxy_port[0]);
@@ -1220,8 +1205,8 @@ pf_get_transaddr_af(struct pf_krule *r, struct pf_pdesc *pd)
}
}
- PF_ACPY(&pd->nsaddr, &nsaddr, pd->naf);
- PF_ACPY(&pd->ndaddr, &ndaddr, pd->naf);
+ pf_addrcpy(&pd->nsaddr, &nsaddr, pd->naf);
+ pf_addrcpy(&pd->ndaddr, &ndaddr, pd->naf);
if (V_pf_status.debug >= PF_DEBUG_MISC) {
printf("pf: af-to %s done, prefixlen %d, ",
diff --git a/sys/netpfil/pf/pf_nl.c b/sys/netpfil/pf/pf_nl.c
index 381e966eacf1..73933c022ca2 100644
--- a/sys/netpfil/pf/pf_nl.c
+++ b/sys/netpfil/pf/pf_nl.c
@@ -1256,23 +1256,13 @@ pf_handle_clear_status(struct nlmsghdr *hdr, struct nl_pstate *npt)
return (0);
}
-struct pf_nl_natlook {
- sa_family_t af;
- uint8_t direction;
- uint8_t proto;
- struct pf_addr src;
- struct pf_addr dst;
- uint16_t sport;
- uint16_t dport;
-};
-
-#define _OUT(_field) offsetof(struct pf_nl_natlook, _field)
+#define _OUT(_field) offsetof(struct pfioc_natlook, _field)
static const struct nlattr_parser nla_p_natlook[] = {
{ .type = PF_NL_AF, .off = _OUT(af), .cb = nlattr_get_uint8 },
{ .type = PF_NL_DIRECTION, .off = _OUT(direction), .cb = nlattr_get_uint8 },
{ .type = PF_NL_PROTO, .off = _OUT(proto), .cb = nlattr_get_uint8 },
- { .type = PF_NL_SRC_ADDR, .off = _OUT(src), .cb = nlattr_get_in6_addr },
- { .type = PF_NL_DST_ADDR, .off = _OUT(dst), .cb = nlattr_get_in6_addr },
+ { .type = PF_NL_SRC_ADDR, .off = _OUT(saddr), .cb = nlattr_get_in6_addr },
+ { .type = PF_NL_DST_ADDR, .off = _OUT(daddr), .cb = nlattr_get_in6_addr },
{ .type = PF_NL_SRC_PORT, .off = _OUT(sport), .cb = nlattr_get_uint16 },
{ .type = PF_NL_DST_PORT, .off = _OUT(dport), .cb = nlattr_get_uint16 },
};
@@ -1282,63 +1272,31 @@ NL_DECLARE_PARSER(natlook_parser, struct genlmsghdr, nlf_p_empty, nla_p_natlook)
static int
pf_handle_natlook(struct nlmsghdr *hdr, struct nl_pstate *npt)
{
- struct pf_nl_natlook attrs = {};
- struct pf_state_key_cmp key = {};
+ struct pfioc_natlook attrs = {};
struct nl_writer *nw = npt->nw;
- struct pf_state_key *sk;
- struct pf_kstate *state;
struct genlmsghdr *ghdr_new;
- int error, m = 0;
- int sidx, didx;
+ int error;
error = nl_parse_nlmsg(hdr, &natlook_parser, npt, &attrs);
if (error != 0)
return (error);
- if (attrs.proto == 0 ||
- PF_AZERO(&attrs.src, attrs.af) ||
- PF_AZERO(&attrs.dst, attrs.af) ||
- ((attrs.proto == IPPROTO_TCP || attrs.proto == IPPROTO_UDP) &&
- (attrs.sport == 0 || attrs.dport == 0)))
- return (EINVAL);
-
- /* NATLOOK src and dst are reversed, so reverse sidx/didx */
- sidx = (attrs.direction == PF_IN) ? 1 : 0;
- didx = (attrs.direction == PF_IN) ? 0 : 1;
-
- key.af = attrs.af;
- key.proto = attrs.proto;
- PF_ACPY(&key.addr[sidx], &attrs.src, attrs.af);
- key.port[sidx] = attrs.sport;
- PF_ACPY(&key.addr[didx], &attrs.dst, attrs.af);
- key.port[didx] = attrs.dport;
-
- state = pf_find_state_all(&key, attrs.direction, &m);
- if (state == NULL)
- return (ENOENT);
- if (m > 1) {
- PF_STATE_UNLOCK(state);
- return (E2BIG);
- }
+ error = pf_ioctl_natlook(&attrs);
+ if (error != 0)
+ return (error);
- if (!nlmsg_reply(nw, hdr, sizeof(struct genlmsghdr))) {
- PF_STATE_UNLOCK(state);
+ if (!nlmsg_reply(nw, hdr, sizeof(struct genlmsghdr)))
return (ENOMEM);
- }
ghdr_new = nlmsg_reserve_object(nw, struct genlmsghdr);
ghdr_new->cmd = PFNL_CMD_NATLOOK;
ghdr_new->version = 0;
ghdr_new->reserved = 0;
- sk = state->key[sidx];
-
- nlattr_add_in6_addr(nw, PF_NL_SRC_ADDR, &sk->addr[sidx].v6);
- nlattr_add_in6_addr(nw, PF_NL_DST_ADDR, &sk->addr[didx].v6);
- nlattr_add_u16(nw, PF_NL_SRC_PORT, sk->port[sidx]);
- nlattr_add_u16(nw, PF_NL_DST_PORT, sk->port[didx]);
-
- PF_STATE_UNLOCK(state);
+ nlattr_add_in6_addr(nw, PF_NL_SRC_ADDR, &attrs.rsaddr.v6);
+ nlattr_add_in6_addr(nw, PF_NL_DST_ADDR, &attrs.rdaddr.v6);
+ nlattr_add_u16(nw, PF_NL_SRC_PORT, attrs.rsport);
+ nlattr_add_u16(nw, PF_NL_DST_PORT, attrs.rdport);
if (!nlmsg_end(nw)) {
nlmsg_abort(nw);
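
With the private pf_nl_natlook struct gone, the handler parses netlink
attributes directly into struct pfioc_natlook and defers the lookup to
pf_ioctl_natlook(), so the netlink and ioctl paths share one implementation.
A hedged userland sketch of the equivalent ioctl (assuming the pfioc_natlook
field names from <net/pfvar.h>; error handling elided):

    #include <sys/types.h>
    #include <sys/socket.h>
    #include <sys/ioctl.h>
    #include <net/if.h>
    #include <netinet/in.h>
    #include <net/pfvar.h>
    #include <arpa/inet.h>

    /* Look up the translation for one outbound TCP flow; dev is an open
     * descriptor for /dev/pf. */
    static int
    natlook(int dev, struct pfioc_natlook *nl)
    {
    	nl->af = AF_INET;
    	nl->proto = IPPROTO_TCP;
    	nl->direction = PF_OUT;
    	inet_pton(AF_INET, "192.0.2.10", &nl->saddr.v4);
    	inet_pton(AF_INET, "198.51.100.1", &nl->daddr.v4);
    	nl->sport = htons(54321);
    	nl->dport = htons(80);
    	/* On success, rsaddr/rdaddr and rsport/rdport hold the result. */
    	return (ioctl(dev, DIOCNATLOOK, nl));
    }
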
diff --git a/sys/netpfil/pf/pf_ruleset.c b/sys/netpfil/pf/pf_ruleset.c
index 865c5ecd72d9..2e5165a9900c 100644
--- a/sys/netpfil/pf/pf_ruleset.c
+++ b/sys/netpfil/pf/pf_ruleset.c
@@ -232,7 +232,7 @@ pf_get_leaf_kruleset(char *path, char **path_remainder)
return (ruleset);
}
-struct pf_kanchor *
+static struct pf_kanchor *
pf_create_kanchor(struct pf_kanchor *parent, const char *aname)
{
struct pf_kanchor *anchor, *dup;
@@ -259,8 +259,8 @@ pf_create_kanchor(struct pf_kanchor *parent, const char *aname)
if ((dup = RB_INSERT(pf_kanchor_global, &V_pf_anchors, anchor)) !=
NULL) {
- printf("pf_find_or_create_ruleset: RB_INSERT1 "
- "'%s' '%s' collides with '%s' '%s'\n",
+ printf("%s: RB_INSERT1 "
+ "'%s' '%s' collides with '%s' '%s'\n", __func__,
anchor->path, anchor->name, dup->path, dup->name);
rs_free(anchor);
return (NULL);
@@ -270,10 +270,10 @@ pf_create_kanchor(struct pf_kanchor *parent, const char *aname)
anchor->parent = parent;
if ((dup = RB_INSERT(pf_kanchor_node, &parent->children,
anchor)) != NULL) {
- printf("pf_find_or_create_ruleset: "
+ printf("%s: "
"RB_INSERT2 '%s' '%s' collides with "
- "'%s' '%s'\n", anchor->path, anchor->name,
- dup->path, dup->name);
+ "'%s' '%s'\n", __func__, anchor->path,
+ anchor->name, dup->path, dup->name);
RB_REMOVE(pf_kanchor_global, &V_pf_anchors,
anchor);
rs_free(anchor);
@@ -339,7 +339,7 @@ pf_remove_if_empty_kruleset(struct pf_kruleset *ruleset)
int i;
while (ruleset != NULL) {
- if (ruleset == &pf_main_ruleset || ruleset->anchor == NULL ||
+ if (ruleset == &pf_main_ruleset ||
!RB_EMPTY(&ruleset->anchor->children) ||
ruleset->anchor->refcnt > 0 || ruleset->tables > 0 ||
ruleset->topen)
@@ -407,7 +407,7 @@ pf_kanchor_setup(struct pf_krule *r, const struct pf_kruleset *s,
}
ruleset = pf_find_or_create_kruleset(path);
rs_free(path);
- if (ruleset == NULL || ruleset->anchor == NULL) {
+ if (ruleset == NULL || ruleset == &pf_main_ruleset) {
DPFPRINTF("%s: ruleset\n", __func__);
return (1);
}
@@ -432,7 +432,7 @@ pf_kanchor_copyout(const struct pf_kruleset *rs, const struct pf_krule *r,
char a[MAXPATHLEN];
char *p;
int i;
- if (rs->anchor == NULL)
+ if (rs == &pf_main_ruleset)
a[0] = 0;
else
strlcpy(a, rs->anchor->path, MAXPATHLEN);
@@ -444,7 +444,7 @@ pf_kanchor_copyout(const struct pf_kruleset *rs, const struct pf_krule *r,
anchor_call_len);
}
if (strncmp(a, r->anchor->path, strlen(a))) {
- printf("pf_anchor_copyout: '%s' '%s'\n", a,
+ printf("%s: '%s' '%s'\n", __func__, a,
r->anchor->path);
return (1);
}
@@ -525,16 +525,13 @@ done:
}
void
-pf_kanchor_remove(struct pf_krule *r)
+pf_remove_kanchor(struct pf_krule *r)
{
if (r->anchor == NULL)
return;
- if (r->anchor->refcnt <= 0) {
- printf("pf_anchor_remove: broken refcount\n");
- r->anchor = NULL;
- return;
- }
- if (!--r->anchor->refcnt)
+ if (r->anchor->refcnt <= 0)
+ printf("%s: broken refcount\n", __func__);
+ else if (!--r->anchor->refcnt)
pf_remove_if_empty_kruleset(&r->anchor->ruleset);
r->anchor = NULL;
}
diff --git a/sys/netpfil/pf/pf_table.c b/sys/netpfil/pf/pf_table.c
index d5874df3df66..9c0151b7da2b 100644
--- a/sys/netpfil/pf/pf_table.c
+++ b/sys/netpfil/pf/pf_table.c
@@ -704,7 +704,7 @@ pfr_validate_addr(struct pfr_addr *ad)
return (-1);
if (ad->pfra_not && ad->pfra_not != 1)
return (-1);
- if (ad->pfra_fback)
+ if (ad->pfra_fback != PFR_FB_NONE)
return (-1);
return (0);
}
@@ -819,10 +819,10 @@ pfr_create_kentry(struct pfr_addr *ad, bool counters)
static void
pfr_destroy_kentries(struct pfr_kentryworkq *workq)
{
- struct pfr_kentry *p, *q;
+ struct pfr_kentry *p;
- for (p = SLIST_FIRST(workq); p != NULL; p = q) {
- q = SLIST_NEXT(p, pfrke_workq);
+ while ((p = SLIST_FIRST(workq)) != NULL) {
+ SLIST_REMOVE_HEAD(workq, pfrke_workq);
pfr_destroy_kentry(p);
}
}
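
pfr_destroy_kentries() (and the similar teardown loops below) now use the
head-removal drain idiom instead of caching the successor by hand. A minimal
userland sketch of the same idiom with a hypothetical element type:

    #include <sys/queue.h>
    #include <stdlib.h>

    struct entry {
    	SLIST_ENTRY(entry) link;
    };
    SLIST_HEAD(entryhead, entry);

    static void
    drain(struct entryhead *head)
    {
    	struct entry *p;

    	/* Unlink the head before freeing it, so a freed element is
    	 * never dereferenced to find its successor. */
    	while ((p = SLIST_FIRST(head)) != NULL) {
    		SLIST_REMOVE_HEAD(head, link);
    		free(p);
    	}
    }
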
@@ -1680,8 +1680,7 @@ pfr_ina_commit(struct pfr_table *trs, u_int32_t ticket, int *nadd,
}
if (!(flags & PFR_FLAG_DUMMY)) {
- for (p = SLIST_FIRST(&workq); p != NULL; p = q) {
- q = SLIST_NEXT(p, pfrkt_workq);
+ SLIST_FOREACH_SAFE(p, &workq, pfrkt_workq, q) {
pfr_commit_ktable(p, tzero);
}
rs->topen = 0;
@@ -1710,7 +1709,7 @@ pfr_commit_ktable(struct pfr_ktable *kt, time_t tzero)
} else if (kt->pfrkt_flags & PFR_TFLAG_ACTIVE) {
/* kt might contain addresses */
struct pfr_kentryworkq addrq, addq, changeq, delq, garbageq;
- struct pfr_kentry *p, *q, *next;
+ struct pfr_kentry *p, *q;
struct pfr_addr ad;
pfr_enqueue_addrs(shadow, &addrq, NULL, 0);
@@ -1720,7 +1719,8 @@ pfr_commit_ktable(struct pfr_ktable *kt, time_t tzero)
SLIST_INIT(&delq);
SLIST_INIT(&garbageq);
pfr_clean_node_mask(shadow, &addrq);
- SLIST_FOREACH_SAFE(p, &addrq, pfrke_workq, next) {
+ while ((p = SLIST_FIRST(&addrq)) != NULL) {
+ SLIST_REMOVE_HEAD(&addrq, pfrke_workq);
pfr_copyout_addr(&ad, p);
q = pfr_lookup_addr(kt, &ad, 1);
if (q != NULL) {
@@ -1864,8 +1864,7 @@ pfr_setflags_ktables(struct pfr_ktableworkq *workq)
{
struct pfr_ktable *p, *q;
- for (p = SLIST_FIRST(workq); p; p = q) {
- q = SLIST_NEXT(p, pfrkt_workq);
+ SLIST_FOREACH_SAFE(p, workq, pfrkt_workq, q) {
pfr_setflags_ktable(p, p->pfrkt_nflags);
}
}
@@ -2015,10 +2014,10 @@ pfr_create_ktable(struct pfr_table *tbl, time_t tzero, int attachruleset)
static void
pfr_destroy_ktables(struct pfr_ktableworkq *workq, int flushaddr)
{
- struct pfr_ktable *p, *q;
+ struct pfr_ktable *p;
- for (p = SLIST_FIRST(workq); p; p = q) {
- q = SLIST_NEXT(p, pfrkt_workq);
+ while ((p = SLIST_FIRST(workq)) != NULL) {
+ SLIST_REMOVE_HEAD(workq, pfrkt_workq);
pfr_destroy_ktable(p, flushaddr);
}
}
@@ -2074,17 +2073,16 @@ pfr_lookup_table(struct pfr_table *tbl)
(struct pfr_ktable *)tbl));
}
-int
-pfr_match_addr(struct pfr_ktable *kt, struct pf_addr *a, sa_family_t af)
+static struct pfr_kentry *
+pfr_kentry_byaddr(struct pfr_ktable *kt, struct pf_addr *a, sa_family_t af,
+ int exact)
{
struct pfr_kentry *ke = NULL;
- int match;
PF_RULES_RASSERT();
- if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE) && kt->pfrkt_root != NULL)
- kt = kt->pfrkt_root;
- if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE))
+ kt = pfr_ktable_select_active(kt);
+ if (kt == NULL)
return (0);
switch (af) {
@@ -2121,11 +2119,26 @@ pfr_match_addr(struct pfr_ktable *kt, struct pf_addr *a, sa_family_t af)
default:
unhandled_af(af);
}
+ if (exact && ke && KENTRY_NETWORK(ke))
+ ke = NULL;
+
+ return (ke);
+}
+
+int
+pfr_match_addr(struct pfr_ktable *kt, struct pf_addr *a, sa_family_t af)
+{
+ struct pfr_kentry *ke = NULL;
+ int match;
+
+ ke = pfr_kentry_byaddr(kt, a, af, 0);
+
match = (ke && !ke->pfrke_not);
if (match)
pfr_kstate_counter_add(&kt->pfrkt_match, 1);
else
pfr_kstate_counter_add(&kt->pfrkt_nomatch, 1);
+
return (match);
}
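
pfr_match_addr() keeps its semantics, while the extracted helper adds an
exact mode that discards network entries. A hedged in-file usage sketch
(hypothetical table contents):

    /* Assuming kt contains the network entry 192.0.2.0/24 and addr is
     * 192.0.2.7: */
    struct pfr_kentry *ke;

    ke = pfr_kentry_byaddr(kt, &addr, AF_INET, 0);	/* prefix match: hit */
    ke = pfr_kentry_byaddr(kt, &addr, AF_INET, 1);	/* exact: NULL (net) */
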
@@ -2135,9 +2148,8 @@ pfr_update_stats(struct pfr_ktable *kt, struct pf_addr *a, sa_family_t af,
{
struct pfr_kentry *ke = NULL;
- if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE) && kt->pfrkt_root != NULL)
- kt = kt->pfrkt_root;
- if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE))
+ kt = pfr_ktable_select_active(kt);
+ if (kt == NULL)
return;
switch (af) {
@@ -2281,7 +2293,7 @@ pfr_detach_table(struct pfr_ktable *kt)
int
pfr_pool_get(struct pfr_ktable *kt, int *pidx, struct pf_addr *counter,
- sa_family_t af, pf_addr_filter_func_t filter)
+ sa_family_t af, pf_addr_filter_func_t filter, bool loop_once)
{
struct pf_addr *addr, cur, mask, umask_addr;
union sockaddr_union uaddr, umask;
@@ -2306,9 +2318,8 @@ pfr_pool_get(struct pfr_ktable *kt, int *pidx, struct pf_addr *counter,
unhandled_af(af);
}
- if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE) && kt->pfrkt_root != NULL)
- kt = kt->pfrkt_root;
- if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE))
+ kt = pfr_ktable_select_active(kt);
+ if (kt == NULL)
return (-1);
idx = *pidx;
@@ -2327,7 +2338,7 @@ _next_block:
ke = pfr_kentry_byidx(kt, idx, af);
if (ke == NULL) {
/* we don't have this idx, try looping */
- if (loop || (ke = pfr_kentry_byidx(kt, 0, af)) == NULL) {
+ if ((loop || loop_once) || (ke = pfr_kentry_byidx(kt, 0, af)) == NULL) {
pfr_kstate_counter_add(&kt->pfrkt_nomatch, 1);
return (1);
}
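
With loop_once set, the scan gives up at the end of the table instead of
wrapping back to index 0 for a second pass; the pf_map_addr() call sites
above pass true for exactly this probe-and-fail behaviour. A sketch against
the updated prototype:

    int idx = 0;
    struct pf_addr a;

    if (pfr_pool_get(kt, &idx, &a, AF_INET, NULL, true) != 0) {
    	/* Table has no usable IPv4 entry; fail rather than rescan
    	 * from index 0. */
    }
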
@@ -2340,16 +2351,16 @@ _next_block:
if (use_counter && !PF_AZERO(counter, af)) {
/* is supplied address within block? */
- if (!PF_MATCHA(0, &cur, &mask, counter, af)) {
+ if (!pf_match_addr(0, &cur, &mask, counter, af)) {
/* no, go to next block in table */
idx++;
use_counter = 0;
goto _next_block;
}
- PF_ACPY(addr, counter, af);
+ pf_addrcpy(addr, counter, af);
} else {
/* use first address of block */
- PF_ACPY(addr, &cur, af);
+ pf_addrcpy(addr, &cur, af);
}
if (!KENTRY_NETWORK(ke)) {
@@ -2358,7 +2369,7 @@ _next_block:
idx++;
goto _next_block;
}
- PF_ACPY(counter, addr, af);
+ pf_addrcpy(counter, addr, af);
*pidx = idx;
pfr_kstate_counter_add(&kt->pfrkt_match, 1);
return (0);
@@ -2382,7 +2393,7 @@ _next_block:
/* lookup return the same block - perfect */
if (filter && filter(af, addr))
goto _next_entry;
- PF_ACPY(counter, addr, af);
+ pf_addrcpy(counter, addr, af);
*pidx = idx;
pfr_kstate_counter_add(&kt->pfrkt_match, 1);
return (0);
@@ -2392,9 +2403,9 @@ _next_entry:
/* we need to increase the counter past the nested block */
pfr_prepare_network(&umask, AF_INET, ke2->pfrke_net);
pfr_sockaddr_to_pf_addr(&umask, &umask_addr);
- PF_POOLMASK(addr, addr, &umask_addr, &pfr_ffaddr, af);
- PF_AINC(addr, af);
- if (!PF_MATCHA(0, &cur, &mask, addr, af)) {
+ pf_poolmask(addr, addr, &umask_addr, &pfr_ffaddr, af);
+ pf_addr_inc(addr, af);
+ if (!pf_match_addr(0, &cur, &mask, addr, af)) {
/* ok, we reached the end of our main block */
/* go to next block in table */
idx++;
@@ -2455,3 +2466,14 @@ pfr_dynaddr_update(struct pfr_ktable *kt, struct pfi_dynaddr *dyn)
unhandled_af(dyn->pfid_af);
}
}
+
+struct pfr_ktable *
+pfr_ktable_select_active(struct pfr_ktable *kt)
+{
+ if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE) && kt->pfrkt_root != NULL)
+ kt = kt->pfrkt_root;
+ if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE))
+ return (NULL);
+
+ return (kt);
+}
diff --git a/sys/powerpc/aim/mmu_oea.c b/sys/powerpc/aim/mmu_oea.c
index 7746b668265d..ae17b3289593 100644
--- a/sys/powerpc/aim/mmu_oea.c
+++ b/sys/powerpc/aim/mmu_oea.c
@@ -1469,6 +1469,9 @@ moea_page_set_memattr(vm_page_t m, vm_memattr_t ma)
pmap_t pmap;
u_int lo;
+ if (m->md.mdpg_cache_attrs == ma)
+ return;
+
if ((m->oflags & VPO_UNMANAGED) != 0) {
m->md.mdpg_cache_attrs = ma;
return;
diff --git a/sys/powerpc/aim/mmu_oea64.c b/sys/powerpc/aim/mmu_oea64.c
index 79cea408bb5f..796b1719b8ba 100644
--- a/sys/powerpc/aim/mmu_oea64.c
+++ b/sys/powerpc/aim/mmu_oea64.c
@@ -2134,6 +2134,9 @@ moea64_page_set_memattr(vm_page_t m, vm_memattr_t ma)
CTR3(KTR_PMAP, "%s: pa=%#jx, ma=%#x",
__func__, (uintmax_t)VM_PAGE_TO_PHYS(m), ma);
+ if (m->md.mdpg_cache_attrs == ma)
+ return;
+
if ((m->oflags & VPO_UNMANAGED) != 0) {
m->md.mdpg_cache_attrs = ma;
return;
diff --git a/sys/powerpc/aim/mmu_radix.c b/sys/powerpc/aim/mmu_radix.c
index 45f7bef8bcc9..a12142fc2d7b 100644
--- a/sys/powerpc/aim/mmu_radix.c
+++ b/sys/powerpc/aim/mmu_radix.c
@@ -5937,6 +5937,10 @@ mmu_radix_page_set_memattr(vm_page_t m, vm_memattr_t ma)
{
CTR3(KTR_PMAP, "%s(%p, %#x)", __func__, m, ma);
+
+ if (m->md.mdpg_cache_attrs == ma)
+ return;
+
m->md.mdpg_cache_attrs = ma;
/*
diff --git a/sys/powerpc/include/pcb.h b/sys/powerpc/include/pcb.h
index 050ada6b0f64..0230cf78aba7 100644
--- a/sys/powerpc/include/pcb.h
+++ b/sys/powerpc/include/pcb.h
@@ -66,16 +66,8 @@ struct pcb {
#define PCB_VECREGS 0x200 /* Process had Altivec registers initialized */
struct fpu {
union {
-#if _BYTE_ORDER == _BIG_ENDIAN
- double fpr;
- uint32_t vsr[4];
-#else
uint32_t vsr[4];
- struct {
- double padding;
- double fpr;
- };
-#endif
+ double fpr;
} fpr[32];
double fpscr; /* FPSCR stored as double for easier access */
} pcb_fpu; /* Floating point processor */
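
With the endian-specific layout gone, the scalar FPR and the VSR words share
storage at offset 0 on both byte orders (the matching fpu.c change below
keeps the in-memory doubleword order endian-independent). An illustrative
compile-time check, not part of the change:

    #include <stddef.h>
    #include <stdint.h>

    union fpr_slot {
    	uint32_t vsr[4];	/* one 128-bit VSR */
    	double fpr;		/* scalar FPR overlays the low doubleword */
    };

    _Static_assert(sizeof(union fpr_slot) == 16, "one VSR per slot");
    _Static_assert(offsetof(union fpr_slot, fpr) == 0, "FPR at offset 0");
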
diff --git a/sys/powerpc/include/ucontext.h b/sys/powerpc/include/ucontext.h
index d35c6c773fe0..dc87edd578bc 100644
--- a/sys/powerpc/include/ucontext.h
+++ b/sys/powerpc/include/ucontext.h
@@ -41,6 +41,7 @@ typedef struct __mcontext {
int mc_flags;
#define _MC_FP_VALID 0x01
#define _MC_AV_VALID 0x02
+#define _MC_VS_VALID 0x04
int mc_onstack; /* saved onstack flag */
int mc_len; /* sizeof(__mcontext) */
__uint64_t mc_avec[32*2]; /* vector register file */
@@ -56,6 +57,7 @@ typedef struct __mcontext32 {
int mc_flags;
#define _MC_FP_VALID 0x01
#define _MC_AV_VALID 0x02
+#define _MC_VS_VALID 0x04
int mc_onstack; /* saved onstack flag */
int mc_len; /* sizeof(__mcontext) */
uint64_t mc_avec[32*2]; /* vector register file */
diff --git a/sys/powerpc/mpc85xx/mpc85xx_gpio.c b/sys/powerpc/mpc85xx/mpc85xx_gpio.c
index 0f333feb747f..cb96d768adef 100644
--- a/sys/powerpc/mpc85xx/mpc85xx_gpio.c
+++ b/sys/powerpc/mpc85xx/mpc85xx_gpio.c
@@ -226,14 +226,14 @@ mpc85xx_gpio_attach(device_t dev)
return (ENOMEM);
}
+ OF_device_register_xref(OF_xref_from_node(ofw_bus_get_node(dev)), dev);
+
sc->busdev = gpiobus_attach_bus(dev);
if (sc->busdev == NULL) {
mpc85xx_gpio_detach(dev);
return (ENOMEM);
}
- OF_device_register_xref(OF_xref_from_node(ofw_bus_get_node(dev)), dev);
-
return (0);
}
diff --git a/sys/powerpc/powerpc/exec_machdep.c b/sys/powerpc/powerpc/exec_machdep.c
index 1893d79f29a8..8a33d0f589a7 100644
--- a/sys/powerpc/powerpc/exec_machdep.c
+++ b/sys/powerpc/powerpc/exec_machdep.c
@@ -214,10 +214,10 @@ sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
sfpsize = sizeof(sf);
#ifdef __powerpc64__
/*
- * 64-bit PPC defines a 288 byte scratch region
- * below the stack.
+	 * 64-bit PPC defines a 512-byte red zone below
+	 * the existing stack (ELF ABI v2 §2.2.2.4)
*/
- rndfsize = 288 + roundup(sizeof(sf), 48);
+ rndfsize = 512 + roundup(sizeof(sf), 48);
#else
rndfsize = roundup(sizeof(sf), 16);
#endif
@@ -349,13 +349,6 @@ sys_sigreturn(struct thread *td, struct sigreturn_args *uap)
if (error != 0)
return (error);
- /*
- * Save FPU state if needed. User may have changed it on
- * signal handler
- */
- if (uc.uc_mcontext.mc_srr1 & PSL_FP)
- save_fpu(td);
-
kern_sigprocmask(td, SIG_SETMASK, &uc.uc_sigmask, NULL, 0);
CTR3(KTR_SIG, "sigreturn: return td=%p pc=%#x sp=%#x",
@@ -432,6 +425,7 @@ grab_mcontext(struct thread *td, mcontext_t *mcp, int flags)
}
if (pcb->pcb_flags & PCB_VSX) {
+ mcp->mc_flags |= _MC_VS_VALID;
for (i = 0; i < 32; i++)
memcpy(&mcp->mc_vsxfpreg[i],
&pcb->pcb_fpu.fpr[i].vsr[2], sizeof(double));
@@ -481,6 +475,7 @@ set_mcontext(struct thread *td, mcontext_t *mcp)
struct pcb *pcb;
struct trapframe *tf;
register_t tls;
+ register_t msr;
int i;
pcb = td->td_pcb;
@@ -531,6 +526,22 @@ set_mcontext(struct thread *td, mcontext_t *mcp)
tf->srr1 &= ~(PSL_FP | PSL_VSX | PSL_VEC);
pcb->pcb_flags &= ~(PCB_FPU | PCB_VSX | PCB_VEC);
+ /*
+ * Ensure the FPU is also disabled in hardware.
+ *
+	 * Without this, it's possible for the register reload to fail if we
+	 * don't switch to an FPU-disabled context before resuming the original
+	 * thread. Specifically, if the FPU/VSX unavailable exception is never
+	 * hit, then whatever data is still in the FP/VSX registers when
+	 * sigresume is called will be used by the resumed thread, instead of
+	 * the previously saved data from the mcontext.
+ */
+ critical_enter();
+ msr = mfmsr() & ~(PSL_FP | PSL_VSX | PSL_VEC);
+ isync();
+ mtmsr(msr);
+ critical_exit();
+
if (mcp->mc_flags & _MC_FP_VALID) {
/* enable_fpu() will happen lazily on a fault */
pcb->pcb_flags |= PCB_FPREGS;
@@ -539,8 +550,12 @@ set_mcontext(struct thread *td, mcontext_t *mcp)
for (i = 0; i < 32; i++) {
memcpy(&pcb->pcb_fpu.fpr[i].fpr, &mcp->mc_fpreg[i],
sizeof(double));
- memcpy(&pcb->pcb_fpu.fpr[i].vsr[2],
- &mcp->mc_vsxfpreg[i], sizeof(double));
+ }
+ if (mcp->mc_flags & _MC_VS_VALID) {
+ for (i = 0; i < 32; i++) {
+ memcpy(&pcb->pcb_fpu.fpr[i].vsr[2],
+ &mcp->mc_vsxfpreg[i], sizeof(double));
+ }
}
}
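
grab_mcontext() now tags contexts that carry VSX state with _MC_VS_VALID,
and set_mcontext() restores mc_vsxfpreg[] only when that tag is present. A
consumer-side sketch (userland view; an assumption for illustration, not
part of the change):

    #include <ucontext.h>

    /* True only when both the FP half and the VSX half of the register
     * file in mcp are valid. */
    static int
    has_vsx_state(const mcontext_t *mcp)
    {
    	return ((mcp->mc_flags & (_MC_FP_VALID | _MC_VS_VALID)) ==
    	    (_MC_FP_VALID | _MC_VS_VALID));
    }
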
diff --git a/sys/powerpc/powerpc/fpu.c b/sys/powerpc/powerpc/fpu.c
index 0eaff2ea4932..cc8f22f7dda3 100644
--- a/sys/powerpc/powerpc/fpu.c
+++ b/sys/powerpc/powerpc/fpu.c
@@ -64,8 +64,19 @@ save_fpu_int(struct thread *td)
* Save the floating-point registers and FPSCR to the PCB
*/
if (pcb->pcb_flags & PCB_VSX) {
- #define SFP(n) __asm ("stxvw4x " #n ", 0,%0" \
+#if _BYTE_ORDER == _BIG_ENDIAN
+ #define SFP(n) __asm("stxvw4x " #n ", 0,%0" \
:: "b"(&pcb->pcb_fpu.fpr[n]));
+#else
+ /*
+	 * stxvw4x will swap words within the FP double word on LE systems,
+ * leading to corruption if VSX is used to store state and FP is
+ * subsequently used to restore state.
+ * Use stxvd2x instead.
+ */
+ #define SFP(n) __asm("stxvd2x " #n ", 0,%0" \
+ :: "b"(&pcb->pcb_fpu.fpr[n]));
+#endif
SFP(0); SFP(1); SFP(2); SFP(3);
SFP(4); SFP(5); SFP(6); SFP(7);
SFP(8); SFP(9); SFP(10); SFP(11);
@@ -76,7 +87,7 @@ save_fpu_int(struct thread *td)
SFP(28); SFP(29); SFP(30); SFP(31);
#undef SFP
} else {
- #define SFP(n) __asm ("stfd " #n ", 0(%0)" \
+ #define SFP(n) __asm("stfd " #n ", 0(%0)" \
:: "b"(&pcb->pcb_fpu.fpr[n].fpr));
SFP(0); SFP(1); SFP(2); SFP(3);
SFP(4); SFP(5); SFP(6); SFP(7);
@@ -149,8 +160,19 @@ enable_fpu(struct thread *td)
:: "b"(&pcb->pcb_fpu.fpscr));
if (pcb->pcb_flags & PCB_VSX) {
- #define LFP(n) __asm ("lxvw4x " #n ", 0,%0" \
+#if _BYTE_ORDER == _BIG_ENDIAN
+ #define LFP(n) __asm("lxvw4x " #n ", 0,%0" \
+ :: "b"(&pcb->pcb_fpu.fpr[n]));
+#else
+ /*
+ * lxvw4x will swap words within the FP double word on LE systems,
+ * leading to corruption if FP is used to store state and VSX is
+ * subsequently used to restore state.
+ * Use lxvd2x instead.
+ */
+ #define LFP(n) __asm("lxvd2x " #n ", 0,%0" \
:: "b"(&pcb->pcb_fpu.fpr[n]));
+#endif
LFP(0); LFP(1); LFP(2); LFP(3);
LFP(4); LFP(5); LFP(6); LFP(7);
LFP(8); LFP(9); LFP(10); LFP(11);
@@ -161,7 +183,7 @@ enable_fpu(struct thread *td)
LFP(28); LFP(29); LFP(30); LFP(31);
#undef LFP
} else {
- #define LFP(n) __asm ("lfd " #n ", 0(%0)" \
+ #define LFP(n) __asm("lfd " #n ", 0(%0)" \
:: "b"(&pcb->pcb_fpu.fpr[n].fpr));
LFP(0); LFP(1); LFP(2); LFP(3);
LFP(4); LFP(5); LFP(6); LFP(7);
diff --git a/sys/riscv/allwinner/files.allwinner b/sys/riscv/allwinner/files.allwinner
index 423a89c10c78..7a4ff6b9c62e 100644
--- a/sys/riscv/allwinner/files.allwinner
+++ b/sys/riscv/allwinner/files.allwinner
@@ -1,5 +1,7 @@
arm/allwinner/aw_gpio.c optional gpio aw_gpio fdt
+arm/allwinner/aw_mmc.c optional mmc aw_mmc fdt | mmccam aw_mmc fdt
+arm/allwinner/aw_rtc.c optional aw_rtc fdt
arm/allwinner/aw_syscon.c optional syscon
arm/allwinner/aw_sid.c optional aw_sid nvmem
arm/allwinner/aw_timer.c optional aw_timer fdt
diff --git a/sys/riscv/conf/std.allwinner b/sys/riscv/conf/std.allwinner
index 1bf6b027a4cb..34fe195b01ba 100644
--- a/sys/riscv/conf/std.allwinner
+++ b/sys/riscv/conf/std.allwinner
@@ -7,6 +7,8 @@ options SOC_ALLWINNER_D1
device aw_ccu # Allwinner clock controller
device aw_gpio # Allwinner GPIO controller
+device aw_mmc # Allwinner SD/MMC controller
+device aw_rtc # Allwinner Real-time Clock
device aw_sid # Allwinner Secure ID EFUSE
device aw_timer # Allwinner Timer
device aw_usbphy # Allwinner USB PHY
diff --git a/sys/riscv/riscv/pmap.c b/sys/riscv/riscv/pmap.c
index 5d15bd671285..26efaecc64d1 100644
--- a/sys/riscv/riscv/pmap.c
+++ b/sys/riscv/riscv/pmap.c
@@ -4838,6 +4838,8 @@ pmap_unmapbios(void *p, vm_size_t size)
void
pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
{
+ if (m->md.pv_memattr == ma)
+ return;
m->md.pv_memattr = ma;
diff --git a/sys/rpc/clnt_rc.c b/sys/rpc/clnt_rc.c
index 9e87af578885..44b63e38a8e6 100644
--- a/sys/rpc/clnt_rc.c
+++ b/sys/rpc/clnt_rc.c
@@ -198,6 +198,12 @@ clnt_reconnect_connect(CLIENT *cl)
newclient = clnt_vc_create(so,
(struct sockaddr *) &rc->rc_addr, rc->rc_prog, rc->rc_vers,
rc->rc_sendsz, rc->rc_recvsz, rc->rc_intr);
+ /*
+ * CLSET_FD_CLOSE must be done now, in case rpctls_connect()
+ * fails just below.
+ */
+ if (newclient != NULL)
+ CLNT_CONTROL(newclient, CLSET_FD_CLOSE, 0);
if (rc->rc_tls && newclient != NULL) {
CURVNET_SET(so->so_vnet);
stat = rpctls_connect(newclient, rc->rc_tlscertname, so,
@@ -236,7 +242,6 @@ clnt_reconnect_connect(CLIENT *cl)
goto out;
}
- CLNT_CONTROL(newclient, CLSET_FD_CLOSE, 0);
CLNT_CONTROL(newclient, CLSET_CONNECT, &one);
CLNT_CONTROL(newclient, CLSET_TIMEOUT, &rc->rc_timeout);
CLNT_CONTROL(newclient, CLSET_RETRY_TIMEOUT, &rc->rc_retry);
diff --git a/sys/rpc/rpcsec_gss/rpcsec_gss.c b/sys/rpc/rpcsec_gss/rpcsec_gss.c
index 62c71937a185..983dd251f81f 100644
--- a/sys/rpc/rpcsec_gss/rpcsec_gss.c
+++ b/sys/rpc/rpcsec_gss/rpcsec_gss.c
@@ -67,6 +67,7 @@
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/hash.h>
+#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/kobj.h>
#include <sys/lock.h>
@@ -772,6 +773,17 @@ rpc_gss_init(AUTH *auth, rpc_gss_options_ret_t *options_ret)
gd->gd_cred.gc_seq = 0;
/*
+ * XXX Threads from inside jails can get here via calls
+ * to clnt_vc_call()->AUTH_REFRESH()->rpc_gss_refresh()
+ * but the NFS mount is always done outside of the
+ * jails in vnet0. Since the thread credentials won't
+ * necessarily have cr_prison == vnet0 and this function
+ * has no access to the socket, using vnet0 seems the
+ * only option. This is broken if NFS mounts are enabled
+ * within vnet prisons.
+ */
+ KGSS_CURVNET_SET_QUIET(vnet0);
+ /*
* For KerberosV, if there is a client principal name, that implies
* that this is a host based initiator credential in the default
* keytab file. For this case, it is necessary to do a
@@ -994,12 +1006,14 @@ out:
gss_delete_sec_context(&min_stat, &gd->gd_ctx,
GSS_C_NO_BUFFER);
}
+ KGSS_CURVNET_RESTORE();
mtx_lock(&gd->gd_lock);
gd->gd_state = RPCSEC_GSS_START;
wakeup(gd);
mtx_unlock(&gd->gd_lock);
return (FALSE);
}
+ KGSS_CURVNET_RESTORE();
mtx_lock(&gd->gd_lock);
gd->gd_state = RPCSEC_GSS_ESTABLISHED;
diff --git a/sys/rpc/rpcsec_tls/rpctls_impl.c b/sys/rpc/rpcsec_tls/rpctls_impl.c
index 93fe283e65fd..51fe270b13d9 100644
--- a/sys/rpc/rpcsec_tls/rpctls_impl.c
+++ b/sys/rpc/rpcsec_tls/rpctls_impl.c
@@ -240,6 +240,14 @@ rpctls_rpc_failed(struct upsock *ups, struct socket *so)
* failed to do the handshake.
*/
mtx_unlock(&rpctls_lock);
+ /*
+ * Do a shutdown on the socket, since the daemon is
+ * probably stuck in SSL_accept() or SSL_connect() trying to
+ * read the socket. Do not soclose() the socket, since the
+ * daemon will close() the socket after SSL_accept()
+ * returns an error.
+ */
+ soshutdown(so, SHUT_RD);
}
}
diff --git a/sys/sys/caprights.h b/sys/sys/caprights.h
index 48c75afc62a0..6a5a17eda5ee 100644
--- a/sys/sys/caprights.h
+++ b/sys/sys/caprights.h
@@ -79,6 +79,8 @@ extern const cap_rights_t cap_futimes_rights;
extern const cap_rights_t cap_getpeername_rights;
extern const cap_rights_t cap_getsockopt_rights;
extern const cap_rights_t cap_getsockname_rights;
+extern const cap_rights_t cap_inotify_add_rights;
+extern const cap_rights_t cap_inotify_rm_rights;
extern const cap_rights_t cap_ioctl_rights;
extern const cap_rights_t cap_linkat_source_rights;
extern const cap_rights_t cap_linkat_target_rights;
diff --git a/sys/sys/capsicum.h b/sys/sys/capsicum.h
index d493535454e9..3847c4c73e75 100644
--- a/sys/sys/capsicum.h
+++ b/sys/sys/capsicum.h
@@ -279,11 +279,15 @@
#define CAP_KQUEUE (CAP_KQUEUE_EVENT | CAP_KQUEUE_CHANGE)
+/* Allows operations on inotify descriptors. */
+#define CAP_INOTIFY_ADD CAPRIGHT(1, 0x0000000000200000ULL)
+#define CAP_INOTIFY_RM CAPRIGHT(1, 0x0000000000400000ULL)
+
/* All used bits for index 1. */
-#define CAP_ALL1 CAPRIGHT(1, 0x00000000001FFFFFULL)
+#define CAP_ALL1 CAPRIGHT(1, 0x00000000007FFFFFULL)
/* Available bits for index 1. */
-#define CAP_UNUSED1_22 CAPRIGHT(1, 0x0000000000200000ULL)
+#define CAP_UNUSED1_22 CAPRIGHT(1, 0x0000000000800000ULL)
/* ... */
#define CAP_UNUSED1_57 CAPRIGHT(1, 0x0100000000000000ULL)
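
The mask arithmetic behind the two edits above: bits 21 and 22 move from the
unused pool into CAP_ALL1, leaving bit 23 as the next free right. An
illustrative check:

    _Static_assert((0x00000000001FFFFFULL | 0x0000000000200000ULL |
        0x0000000000400000ULL) == 0x00000000007FFFFFULL,
        "CAP_ALL1 covers the two new inotify rights");
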
diff --git a/sys/sys/efi.h b/sys/sys/efi.h
index 95a433a950db..89c8b15519de 100644
--- a/sys/sys/efi.h
+++ b/sys/sys/efi.h
@@ -42,6 +42,8 @@
{0xb122a263,0x3661,0x4f68,{0x99,0x29,0x78,0xf8,0xb0,0xd6,0x21,0x80}}
#define EFI_PROPERTIES_TABLE \
{0x880aaca3,0x4adc,0x4a04,{0x90,0x79,0xb7,0x47,0x34,0x08,0x25,0xe5}}
+#define EFI_MEMORY_ATTRIBUTES_TABLE \
+ {0xdcfa911d,0x26eb,0x469f,{0xa2,0x20,0x38,0xb7,0xdc,0x46,0x12,0x20}}
#define LINUX_EFI_MEMRESERVE_TABLE \
{0x888eb0c6,0x8ede,0x4ff5,{0xa8,0xf0,0x9a,0xee,0x5c,0xb9,0x77,0xc2}}
@@ -166,6 +168,22 @@ struct efi_prop_table {
uint64_t memory_protection_attribute;
};
+struct efi_memory_descriptor {
+ uint32_t type;
+ caddr_t phy_addr;
+ caddr_t virt_addr;
+ uint64_t pages;
+ uint64_t attrs;
+};
+
+struct efi_memory_attribute_table {
+ uint32_t version;
+ uint32_t num_ents;
+ uint32_t descriptor_size;
+ uint32_t flags;
+ struct efi_memory_descriptor tables[];
+};
+
#ifdef _KERNEL
#ifdef EFIABI_ATTR
diff --git a/sys/sys/elf_common.h b/sys/sys/elf_common.h
index 87460aae2dd4..efda38279848 100644
--- a/sys/sys/elf_common.h
+++ b/sys/sys/elf_common.h
@@ -306,7 +306,7 @@ typedef struct {
and MPRC of Peking University */
#define EM_AARCH64 183 /* AArch64 (64-bit ARM) */
#define EM_RISCV 243 /* RISC-V */
-#define EM_LOONGARCH 258 /* Loongson LoongArch */
+#define EM_LOONGARCH 258 /* Loongson LoongArch */
/* Non-standard or deprecated. */
#define EM_486 6 /* Intel i486. */
@@ -392,15 +392,15 @@ typedef struct {
*/
/* LoongArch Base ABI Modifiers */
-#define EF_LOONGARCH_ABI_SOFT_FLOAT 0x00000001
-#define EF_LOONGARCH_ABI_SINGLE_FLOAT 0x00000002
-#define EF_LOONGARCH_ABI_DOUBLE_FLOAT 0x00000003
-#define EF_LOONGARCH_ABI_MODIFIER_MASK 0x00000007
+#define EF_LOONGARCH_ABI_SOFT_FLOAT 0x00000001
+#define EF_LOONGARCH_ABI_SINGLE_FLOAT 0x00000002
+#define EF_LOONGARCH_ABI_DOUBLE_FLOAT 0x00000003
+#define EF_LOONGARCH_ABI_MODIFIER_MASK 0x00000007
/* LoongArch Object file ABI versions */
-#define EF_LOONGARCH_OBJABI_V0 0x00000000
-#define EF_LOONGARCH_OBJABI_V1 0x00000040
-#define EF_LOONGARCH_OBJABI_MASK 0x000000C0
+#define EF_LOONGARCH_OBJABI_V0 0x00000000
+#define EF_LOONGARCH_OBJABI_V1 0x00000040
+#define EF_LOONGARCH_OBJABI_MASK 0x000000C0
#define EF_SPARC_EXT_MASK 0x00ffff00
#define EF_SPARC_32PLUS 0x00000100
@@ -470,12 +470,12 @@ typedef struct {
#define SHT_HIOS 0x6fffffff /* Last of OS specific semantics */
#define SHT_LOPROC 0x70000000 /* reserved range for processor */
#define SHT_X86_64_UNWIND 0x70000001 /* unwind information */
-#define SHT_AMD64_UNWIND SHT_X86_64_UNWIND
+#define SHT_AMD64_UNWIND SHT_X86_64_UNWIND
#define SHT_ARM_EXIDX 0x70000001 /* Exception index table. */
-#define SHT_ARM_PREEMPTMAP 0x70000002 /* BPABI DLL dynamic linking
+#define SHT_ARM_PREEMPTMAP 0x70000002 /* BPABI DLL dynamic linking
pre-emption map. */
-#define SHT_ARM_ATTRIBUTES 0x70000003 /* Object file compatibility
+#define SHT_ARM_ATTRIBUTES 0x70000003 /* Object file compatibility
attributes. */
#define SHT_ARM_DEBUGOVERLAY 0x70000004 /* See DBGOVL for details. */
#define SHT_ARM_OVERLAYSECTION 0x70000005 /* See DBGOVL for details. */
@@ -791,7 +791,7 @@ typedef struct {
#define DF_1_NODELETE 0x00000008 /* Set the RTLD_NODELETE for object */
#define DF_1_LOADFLTR 0x00000010 /* Immediate loading of filtees */
#define DF_1_INITFIRST 0x00000020 /* Initialize DSO first at runtime */
-#define DF_1_NOOPEN 0x00000040 /* Do not allow loading on dlopen() */
+#define DF_1_NOOPEN 0x00000040 /* Do not allow loading on dlopen() */
#define DF_1_ORIGIN 0x00000080 /* Process $ORIGIN */
#define DF_1_INTERPOSE 0x00000400 /* Interpose all objects but main */
#define DF_1_NODEFLIB 0x00000800 /* Do not search default paths */
@@ -908,7 +908,7 @@ typedef struct {
#define STV_ELIMINATE 0x6
/* Architecture specific data - st_other */
-#define STO_AARCH64_VARIANT_PCS 0x80
+#define STO_AARCH64_VARIANT_PCS 0x80
/* Special symbol table indexes. */
#define STN_UNDEF 0 /* Undefined symbol index. */
@@ -1084,11 +1084,11 @@ typedef struct {
#define R_AARCH64_COPY 1024 /* Copy data from shared object */
#define R_AARCH64_GLOB_DAT 1025 /* Set GOT entry to data address */
#define R_AARCH64_JUMP_SLOT 1026 /* Set GOT entry to code address */
-#define R_AARCH64_RELATIVE 1027 /* Add load address of shared object */
+#define R_AARCH64_RELATIVE 1027 /* Add load address of shared object */
#define R_AARCH64_TLS_DTPREL64 1028
#define R_AARCH64_TLS_DTPMOD64 1029
-#define R_AARCH64_TLS_TPREL64 1030
-#define R_AARCH64_TLSDESC 1031 /* Identify the TLS descriptor */
+#define R_AARCH64_TLS_TPREL64 1030
+#define R_AARCH64_TLSDESC 1031 /* Identify the TLS descriptor */
#define R_AARCH64_IRELATIVE 1032
#define R_ARM_NONE 0 /* No relocation. */
@@ -1231,8 +1231,8 @@ typedef struct {
#define R_MIPS_GOT_HI16 22 /* GOT HI 16 bit */
#define R_MIPS_GOT_LO16 23 /* GOT LO 16 bit */
#define R_MIPS_SUB 24
-#define R_MIPS_CALLHI16 30 /* upper 16 bit GOT entry for function */
-#define R_MIPS_CALLLO16 31 /* lower 16 bit GOT entry for function */
+#define R_MIPS_CALLHI16 30 /* upper 16 bit GOT entry for function */
+#define R_MIPS_CALLLO16 31 /* lower 16 bit GOT entry for function */
#define R_MIPS_JALR 37
#define R_MIPS_TLS_GD 42
#define R_MIPS_COPY 126
@@ -1352,7 +1352,6 @@ typedef struct {
* RISC-V relocation types.
*/
-/* Relocation types used by the dynamic linker. */
#define R_RISCV_NONE 0
#define R_RISCV_32 1
#define R_RISCV_64 2
@@ -1365,8 +1364,7 @@ typedef struct {
#define R_RISCV_TLS_DTPREL64 9
#define R_RISCV_TLS_TPREL32 10
#define R_RISCV_TLS_TPREL64 11
-
-/* Relocation types not used by the dynamic linker. */
+#define R_RISCV_TLSDESC 12
#define R_RISCV_BRANCH 16
#define R_RISCV_JAL 17
#define R_RISCV_CALL 18
@@ -1392,10 +1390,10 @@ typedef struct {
#define R_RISCV_SUB16 38
#define R_RISCV_SUB32 39
#define R_RISCV_SUB64 40
+#define R_RISCV_GOT32_PCREL 41
#define R_RISCV_ALIGN 43
#define R_RISCV_RVC_BRANCH 44
#define R_RISCV_RVC_JUMP 45
-#define R_RISCV_RVC_LUI 46
#define R_RISCV_RELAX 51
#define R_RISCV_SUB6 52
#define R_RISCV_SET6 53
@@ -1404,6 +1402,14 @@ typedef struct {
#define R_RISCV_SET32 56
#define R_RISCV_32_PCREL 57
#define R_RISCV_IRELATIVE 58
+#define R_RISCV_PLT32 59
+#define R_RISCV_SET_ULEB128 60
+#define R_RISCV_SUB_ULEB128 61
+#define R_RISCV_TLSDESC_HI20 62
+#define R_RISCV_TLSDESC_LOAD_LO12 63
+#define R_RISCV_TLSDESC_ADD_LO12 64
+#define R_RISCV_TLSDESC_CALL 65
+#define R_RISCV_VENDOR 191
/*
* Loongson LoongArch relocation types.
@@ -1413,101 +1419,101 @@ typedef struct {
*/
/* Relocation types used by the dynamic linker */
-#define R_LARCH_NONE 0
-#define R_LARCH_32 1
-#define R_LARCH_64 2
-#define R_LARCH_RELATIVE 3
-#define R_LARCH_COPY 4
-#define R_LARCH_JUMP_SLOT 5
-#define R_LARCH_TLS_DTPMOD32 6
-#define R_LARCH_TLS_DTPMOD64 7
-#define R_LARCH_TLS_DTPREL32 8
-#define R_LARCH_TLS_DTPREL64 9
-#define R_LARCH_TLS_TPREL32 10
-#define R_LARCH_TLS_TPREL64 11
-#define R_LARCH_IRELATIVE 12
-#define R_LARCH_MARK_LA 20
-#define R_LARCH_MARK_PCREL 21
-#define R_LARCH_SOP_PUSH_PCREL 22
-#define R_LARCH_SOP_PUSH_ABSOLUTE 23
-#define R_LARCH_SOP_PUSH_DUP 24
-#define R_LARCH_SOP_PUSH_GPREL 25
-#define R_LARCH_SOP_PUSH_TLS_TPREL 26
-#define R_LARCH_SOP_PUSH_TLS_GOT 27
-#define R_LARCH_SOP_PUSH_TLS_GD 28
-#define R_LARCH_SOP_PUSH_PLT_PCREL 29
-#define R_LARCH_SOP_ASSERT 30
-#define R_LARCH_SOP_NOT 31
-#define R_LARCH_SOP_SUB 32
-#define R_LARCH_SOP_SL 33
-#define R_LARCH_SOP_SR 34
-#define R_LARCH_SOP_ADD 35
-#define R_LARCH_SOP_AND 36
-#define R_LARCH_SOP_IF_ELSE 37
-#define R_LARCH_SOP_POP_32_S_10_5 38
-#define R_LARCH_SOP_POP_32_U_10_12 39
-#define R_LARCH_SOP_POP_32_S_10_12 40
-#define R_LARCH_SOP_POP_32_S_10_16 41
-#define R_LARCH_SOP_POP_32_S_10_16_S2 42
-#define R_LARCH_SOP_POP_32_S_5_20 43
-#define R_LARCH_SOP_POP_32_S_0_5_10_16_S2 44
-#define R_LARCH_SOP_POP_32_S_0_10_10_16_S2 45
-#define R_LARCH_SOP_POP_32_U 46
-#define R_LARCH_ADD8 47
-#define R_LARCH_ADD16 48
-#define R_LARCH_ADD24 49
-#define R_LARCH_ADD32 50
-#define R_LARCH_ADD64 51
-#define R_LARCH_SUB8 52
-#define R_LARCH_SUB16 53
-#define R_LARCH_SUB24 54
-#define R_LARCH_SUB32 55
-#define R_LARCH_SUB64 56
-#define R_LARCH_GNU_VTINHERIT 57
-#define R_LARCH_GNU_VTENTRY 58
+#define R_LARCH_NONE 0
+#define R_LARCH_32 1
+#define R_LARCH_64 2
+#define R_LARCH_RELATIVE 3
+#define R_LARCH_COPY 4
+#define R_LARCH_JUMP_SLOT 5
+#define R_LARCH_TLS_DTPMOD32 6
+#define R_LARCH_TLS_DTPMOD64 7
+#define R_LARCH_TLS_DTPREL32 8
+#define R_LARCH_TLS_DTPREL64 9
+#define R_LARCH_TLS_TPREL32 10
+#define R_LARCH_TLS_TPREL64 11
+#define R_LARCH_IRELATIVE 12
+#define R_LARCH_MARK_LA 20
+#define R_LARCH_MARK_PCREL 21
+#define R_LARCH_SOP_PUSH_PCREL 22
+#define R_LARCH_SOP_PUSH_ABSOLUTE 23
+#define R_LARCH_SOP_PUSH_DUP 24
+#define R_LARCH_SOP_PUSH_GPREL 25
+#define R_LARCH_SOP_PUSH_TLS_TPREL 26
+#define R_LARCH_SOP_PUSH_TLS_GOT 27
+#define R_LARCH_SOP_PUSH_TLS_GD 28
+#define R_LARCH_SOP_PUSH_PLT_PCREL 29
+#define R_LARCH_SOP_ASSERT 30
+#define R_LARCH_SOP_NOT 31
+#define R_LARCH_SOP_SUB 32
+#define R_LARCH_SOP_SL 33
+#define R_LARCH_SOP_SR 34
+#define R_LARCH_SOP_ADD 35
+#define R_LARCH_SOP_AND 36
+#define R_LARCH_SOP_IF_ELSE 37
+#define R_LARCH_SOP_POP_32_S_10_5 38
+#define R_LARCH_SOP_POP_32_U_10_12 39
+#define R_LARCH_SOP_POP_32_S_10_12 40
+#define R_LARCH_SOP_POP_32_S_10_16 41
+#define R_LARCH_SOP_POP_32_S_10_16_S2 42
+#define R_LARCH_SOP_POP_32_S_5_20 43
+#define R_LARCH_SOP_POP_32_S_0_5_10_16_S2 44
+#define R_LARCH_SOP_POP_32_S_0_10_10_16_S2 45
+#define R_LARCH_SOP_POP_32_U 46
+#define R_LARCH_ADD8 47
+#define R_LARCH_ADD16 48
+#define R_LARCH_ADD24 49
+#define R_LARCH_ADD32 50
+#define R_LARCH_ADD64 51
+#define R_LARCH_SUB8 52
+#define R_LARCH_SUB16 53
+#define R_LARCH_SUB24 54
+#define R_LARCH_SUB32 55
+#define R_LARCH_SUB64 56
+#define R_LARCH_GNU_VTINHERIT 57
+#define R_LARCH_GNU_VTENTRY 58
/*
* Relocs whose processing do not require a stack machine.
*
* Spec addition: https://github.com/loongson/LoongArch-Documentation/pull/57
*/
-#define R_LARCH_B16 64
-#define R_LARCH_B21 65
-#define R_LARCH_B26 66
-#define R_LARCH_ABS_HI20 67
-#define R_LARCH_ABS_LO12 68
-#define R_LARCH_ABS64_LO20 69
-#define R_LARCH_ABS64_HI12 70
-#define R_LARCH_PCALA_HI20 71
-#define R_LARCH_PCALA_LO12 72
-#define R_LARCH_PCALA64_LO20 73
-#define R_LARCH_PCALA64_HI12 74
-#define R_LARCH_GOT_PC_HI20 75
-#define R_LARCH_GOT_PC_LO12 76
-#define R_LARCH_GOT64_PC_LO20 77
-#define R_LARCH_GOT64_PC_HI12 78
-#define R_LARCH_GOT_HI20 79
-#define R_LARCH_GOT_LO12 80
-#define R_LARCH_GOT64_LO20 81
-#define R_LARCH_GOT64_HI12 82
-#define R_LARCH_TLS_LE_HI20 83
-#define R_LARCH_TLS_LE_LO12 84
-#define R_LARCH_TLS_LE64_LO20 85
-#define R_LARCH_TLS_LE64_HI12 86
-#define R_LARCH_TLS_IE_PC_HI20 87
-#define R_LARCH_TLS_IE_PC_LO12 88
-#define R_LARCH_TLS_IE64_PC_LO20 89
-#define R_LARCH_TLS_IE64_PC_HI12 90
-#define R_LARCH_TLS_IE_HI20 91
-#define R_LARCH_TLS_IE_LO12 92
-#define R_LARCH_TLS_IE64_LO20 93
-#define R_LARCH_TLS_IE64_HI12 94
-#define R_LARCH_TLS_LD_PC_HI20 95
-#define R_LARCH_TLS_LD_HI20 96
-#define R_LARCH_TLS_GD_PC_HI20 97
-#define R_LARCH_TLS_GD_HI20 98
-#define R_LARCH_32_PCREL 99
-#define R_LARCH_RELAX 100
+#define R_LARCH_B16 64
+#define R_LARCH_B21 65
+#define R_LARCH_B26 66
+#define R_LARCH_ABS_HI20 67
+#define R_LARCH_ABS_LO12 68
+#define R_LARCH_ABS64_LO20 69
+#define R_LARCH_ABS64_HI12 70
+#define R_LARCH_PCALA_HI20 71
+#define R_LARCH_PCALA_LO12 72
+#define R_LARCH_PCALA64_LO20 73
+#define R_LARCH_PCALA64_HI12 74
+#define R_LARCH_GOT_PC_HI20 75
+#define R_LARCH_GOT_PC_LO12 76
+#define R_LARCH_GOT64_PC_LO20 77
+#define R_LARCH_GOT64_PC_HI12 78
+#define R_LARCH_GOT_HI20 79
+#define R_LARCH_GOT_LO12 80
+#define R_LARCH_GOT64_LO20 81
+#define R_LARCH_GOT64_HI12 82
+#define R_LARCH_TLS_LE_HI20 83
+#define R_LARCH_TLS_LE_LO12 84
+#define R_LARCH_TLS_LE64_LO20 85
+#define R_LARCH_TLS_LE64_HI12 86
+#define R_LARCH_TLS_IE_PC_HI20 87
+#define R_LARCH_TLS_IE_PC_LO12 88
+#define R_LARCH_TLS_IE64_PC_LO20 89
+#define R_LARCH_TLS_IE64_PC_HI12 90
+#define R_LARCH_TLS_IE_HI20 91
+#define R_LARCH_TLS_IE_LO12 92
+#define R_LARCH_TLS_IE64_LO20 93
+#define R_LARCH_TLS_IE64_HI12 94
+#define R_LARCH_TLS_LD_PC_HI20 95
+#define R_LARCH_TLS_LD_HI20 96
+#define R_LARCH_TLS_GD_PC_HI20 97
+#define R_LARCH_TLS_GD_HI20 98
+#define R_LARCH_32_PCREL 99
+#define R_LARCH_RELAX 100
/*
 * Relocs added in ELF for the LoongArch™ Architecture v20230519, part of the
@@ -1520,13 +1526,13 @@ typedef struct {
* in psABI v2.20 because they were proved not necessary to be exposed outside
* of the linker.
*/
-#define R_LARCH_ALIGN 102
-#define R_LARCH_PCREL20_S2 103
-#define R_LARCH_ADD6 105
-#define R_LARCH_SUB6 106
-#define R_LARCH_ADD_ULEB128 107
-#define R_LARCH_SUB_ULEB128 108
-#define R_LARCH_64_PCREL 109
+#define R_LARCH_ALIGN 102
+#define R_LARCH_PCREL20_S2 103
+#define R_LARCH_ADD6 105
+#define R_LARCH_SUB6 106
+#define R_LARCH_ADD_ULEB128 107
+#define R_LARCH_SUB_ULEB128 108
+#define R_LARCH_64_PCREL 109
/*
 * Relocs added in ELF for the LoongArch™ Architecture v20231102, part of the
@@ -1534,7 +1540,7 @@ typedef struct {
*
* Spec addition: https://github.com/loongson/la-abi-specs/pull/4
*/
-#define R_LARCH_CALL36 110
+#define R_LARCH_CALL36 110
/*
 * Relocs added in ELF for the LoongArch™ Architecture v20231219, part of the
@@ -1542,24 +1548,24 @@ typedef struct {
*
* Spec addition: https://github.com/loongson/la-abi-specs/pull/5
*/
-#define R_LARCH_TLS_DESC32 13
-#define R_LARCH_TLS_DESC64 14
-#define R_LARCH_TLS_DESC_PC_HI20 111
-#define R_LARCH_TLS_DESC_PC_LO12 112
-#define R_LARCH_TLS_DESC64_PC_LO20 113
-#define R_LARCH_TLS_DESC64_PC_HI12 114
-#define R_LARCH_TLS_DESC_HI20 115
-#define R_LARCH_TLS_DESC_LO12 116
-#define R_LARCH_TLS_DESC64_LO20 117
-#define R_LARCH_TLS_DESC64_HI12 118
-#define R_LARCH_TLS_DESC_LD 119
-#define R_LARCH_TLS_DESC_CALL 120
-#define R_LARCH_TLS_LE_HI20_R 121
-#define R_LARCH_TLS_LE_ADD_R 122
-#define R_LARCH_TLS_LE_LO12_R 123
-#define R_LARCH_TLS_LD_PCREL20_S2 124
-#define R_LARCH_TLS_GD_PCREL20_S2 125
-#define R_LARCH_TLS_DESC_PCREL20_S2 126
+#define R_LARCH_TLS_DESC32 13
+#define R_LARCH_TLS_DESC64 14
+#define R_LARCH_TLS_DESC_PC_HI20 111
+#define R_LARCH_TLS_DESC_PC_LO12 112
+#define R_LARCH_TLS_DESC64_PC_LO20 113
+#define R_LARCH_TLS_DESC64_PC_HI12 114
+#define R_LARCH_TLS_DESC_HI20 115
+#define R_LARCH_TLS_DESC_LO12 116
+#define R_LARCH_TLS_DESC64_LO20 117
+#define R_LARCH_TLS_DESC64_HI12 118
+#define R_LARCH_TLS_DESC_LD 119
+#define R_LARCH_TLS_DESC_CALL 120
+#define R_LARCH_TLS_LE_HI20_R 121
+#define R_LARCH_TLS_LE_ADD_R 122
+#define R_LARCH_TLS_LE_LO12_R 123
+#define R_LARCH_TLS_LD_PCREL20_S2 124
+#define R_LARCH_TLS_GD_PCREL20_S2 125
+#define R_LARCH_TLS_DESC_PCREL20_S2 126
#define R_SPARC_NONE 0
#define R_SPARC_8 1
diff --git a/sys/sys/exterr_cat.h b/sys/sys/exterr_cat.h
index d770c274d7b7..cab94ac511a5 100644
--- a/sys/sys/exterr_cat.h
+++ b/sys/sys/exterr_cat.h
@@ -16,6 +16,8 @@
#define EXTERR_KTRACE 3 /* To allow inclusion of this
file into kern_ktrace.c */
#define EXTERR_CAT_FUSE 4
+#define EXTERR_CAT_INOTIFY 5
+#define EXTERR_CAT_GENIO 6
#endif
diff --git a/sys/sys/exterrvar.h b/sys/sys/exterrvar.h
index 15557c614f88..7bf1d264ff5e 100644
--- a/sys/sys/exterrvar.h
+++ b/sys/sys/exterrvar.h
@@ -21,6 +21,7 @@
#define EXTERRCTL_ENABLE 1
#define EXTERRCTL_DISABLE 2
+#define EXTERRCTL_UD 3
#define EXTERRCTLF_FORCE 0x00000001
diff --git a/sys/sys/fcntl.h b/sys/sys/fcntl.h
index dd9fccf5cf38..18d3928e91c7 100644
--- a/sys/sys/fcntl.h
+++ b/sys/sys/fcntl.h
@@ -144,6 +144,10 @@ typedef __pid_t pid_t;
#define O_XATTR O_NAMEDATTR /* Solaris compatibility */
#endif
+#if __POSIX_VISIBLE >= 202405
+#define O_CLOFORK 0x08000000
+#endif
+
/*
* !!! DANGER !!!
*
@@ -280,6 +284,16 @@ typedef __pid_t pid_t;
#define F_GET_SEALS 20
#define F_ISUNIONSTACK 21 /* Kludge for libc, don't use it. */
#define F_KINFO 22 /* Return kinfo_file for this fd */
+#endif /* __BSD_VISIBLE */
+
+#if __POSIX_VISIBLE >= 202405
+#define F_DUPFD_CLOFORK 23 /* Like F_DUPFD, but FD_CLOFORK is set */
+#endif
+
+#if __BSD_VISIBLE
+#define F_DUP3FD 24 /* Used with dup3() */
+
+#define F_DUP3FD_SHIFT 16 /* Shift used for F_DUP3FD */
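
F_DUP3FD_SHIFT suggests the dup3() flags ride in the upper bits of the fcntl
argument. A heavily hedged sketch of how a libc wrapper might encode this
(the real encoding is not shown in this diff; treat every detail as an
assumption):

    #include <fcntl.h>

    /* Hypothetical wrapper: pack descriptor flags above the target fd. */
    static int
    dup3_sketch(int oldfd, int newfd, int flags)
    {
    	int fdflags = 0;

    	if (flags & O_CLOEXEC)
    		fdflags |= FD_CLOEXEC;
    	if (flags & O_CLOFORK)
    		fdflags |= FD_CLOFORK;
    	return (fcntl(oldfd, F_DUP3FD,
    	    newfd | (fdflags << F_DUP3FD_SHIFT)));
    }
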
/* Seals (F_ADD_SEALS, F_GET_SEALS). */
#define F_SEAL_SEAL 0x0001 /* Prevent adding sealings */
@@ -292,6 +306,9 @@ typedef __pid_t pid_t;
#define FD_CLOEXEC 1 /* close-on-exec flag */
#define FD_RESOLVE_BENEATH 2 /* all lookups relative to fd have
O_RESOLVE_BENEATH semantics */
+#if __POSIX_VISIBLE >= 202405
+#define FD_CLOFORK 4 /* close-on-fork flag */
+#endif
/* record locking flags (F_GETLK, F_SETLK, F_SETLKW) */
#define F_RDLCK 1 /* shared or read lock */
diff --git a/sys/sys/file.h b/sys/sys/file.h
index 284d523147b6..63313926c4f0 100644
--- a/sys/sys/file.h
+++ b/sys/sys/file.h
@@ -71,6 +71,7 @@ struct nameidata;
#define DTYPE_PROCDESC 12 /* process descriptor */
#define DTYPE_EVENTFD 13 /* eventfd */
#define DTYPE_TIMERFD 14 /* timerfd */
+#define DTYPE_INOTIFY 15 /* inotify descriptor */
#ifdef _KERNEL
diff --git a/sys/sys/filedesc.h b/sys/sys/filedesc.h
index 55969b2ff4b3..0a388c90de26 100644
--- a/sys/sys/filedesc.h
+++ b/sys/sys/filedesc.h
@@ -149,6 +149,7 @@ struct filedesc_to_leader {
*/
#define UF_EXCLOSE 0x01 /* auto-close on exec */
#define UF_RESOLVE_BENEATH 0x02 /* lookups must be beneath this dir */
+#define UF_FOCLOSE 0x04 /* auto-close on fork */
#ifdef _KERNEL
@@ -221,6 +222,7 @@ enum {
/* Flags for kern_dup(). */
#define FDDUP_FLAG_CLOEXEC 0x1 /* Atomically set UF_EXCLOSE. */
+#define FDDUP_FLAG_CLOFORK 0x2 /* Atomically set UF_FOCLOSE. */
/* For backward compatibility. */
#define falloc(td, resultfp, resultfd, flags) \
diff --git a/sys/sys/hwt.h b/sys/sys/hwt.h
new file mode 100644
index 000000000000..78b774a70f9f
--- /dev/null
+++ b/sys/sys/hwt.h
@@ -0,0 +1,129 @@
+/*-
+ * Copyright (c) 2023-2025 Ruslan Bukin <br@bsdpad.com>
+ *
+ * This work was supported by Innovate UK project 105694, "Digital Security
+ * by Design (DSbD) Technology Platform Prototype".
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/* User-visible header. */
+
+#include <sys/param.h>
+#include <sys/cpuset.h>
+#include <sys/types.h>
+#include <sys/hwt_record.h>
+
+#ifndef _SYS_HWT_H_
+#define _SYS_HWT_H_
+
+#define HWT_MAGIC 0x42
+#define HWT_IOC_ALLOC _IOW(HWT_MAGIC, 0x00, struct hwt_alloc)
+#define HWT_IOC_START _IOW(HWT_MAGIC, 0x01, struct hwt_start)
+#define HWT_IOC_STOP _IOW(HWT_MAGIC, 0x02, struct hwt_stop)
+#define HWT_IOC_RECORD_GET _IOW(HWT_MAGIC, 0x03, struct hwt_record_get)
+#define HWT_IOC_BUFPTR_GET _IOW(HWT_MAGIC, 0x04, struct hwt_bufptr_get)
+#define HWT_IOC_SET_CONFIG _IOW(HWT_MAGIC, 0x05, struct hwt_set_config)
+#define HWT_IOC_WAKEUP _IOW(HWT_MAGIC, 0x06, struct hwt_wakeup)
+#define HWT_IOC_SVC_BUF _IOW(HWT_MAGIC, 0x07, struct hwt_svc_buf)
+
+#define HWT_BACKEND_MAXNAMELEN 256
+
+#define HWT_MODE_THREAD 1
+#define HWT_MODE_CPU 2
+
+struct hwt_alloc {
+ size_t bufsize;
+ int mode;
+ pid_t pid; /* thread mode */
+ cpuset_t *cpu_map; /* cpu mode only */
+ size_t cpusetsize;
+ const char *backend_name;
+ int *ident;
+ int kqueue_fd;
+} __aligned(16);
+
+struct hwt_start {
+ int reserved;
+} __aligned(16);
+
+struct hwt_stop {
+ int reserved;
+} __aligned(16);
+
+struct hwt_wakeup {
+ int reserved;
+} __aligned(16);
+
+struct hwt_record_user_entry {
+ enum hwt_record_type record_type;
+ union {
+ /*
+ * Used for MMAP, EXECUTABLE, INTERP,
+ * and KERNEL records.
+ */
+ struct {
+ char fullpath[MAXPATHLEN];
+ uintptr_t addr;
+ uintptr_t baseaddr;
+ };
+ /* Used for BUFFER records. */
+ struct {
+ int buf_id;
+ int curpage;
+ vm_offset_t offset;
+ };
+ /* Used for THREAD_* records. */
+ int thread_id;
+ };
+} __aligned(16);
+
+struct hwt_record_get {
+ struct hwt_record_user_entry *records;
+ int *nentries;
+ int wait;
+} __aligned(16);
+
+struct hwt_bufptr_get {
+ int *ident;
+ vm_offset_t *offset;
+ uint64_t *data;
+} __aligned(16);
+
+struct hwt_set_config {
+ /* Configuration of ctx. */
+ int pause_on_mmap;
+
+ /* The following passed to backend as is. */
+ void *config;
+ size_t config_size;
+ int config_version;
+} __aligned(16);
+
+struct hwt_svc_buf {
+ /* The following passed to backend as is. */
+ void *data;
+ size_t data_size;
+ int data_version;
+} __aligned(16);
+
+#endif /* !_SYS_HWT_H_ */
diff --git a/sys/sys/hwt_record.h b/sys/sys/hwt_record.h
new file mode 100644
index 000000000000..8336723f9396
--- /dev/null
+++ b/sys/sys/hwt_record.h
@@ -0,0 +1,70 @@
+/*-
+ * Copyright (c) 2023-2025 Ruslan Bukin <br@bsdpad.com>
+ *
+ * This work was supported by Innovate UK project 105694, "Digital Security
+ * by Design (DSbD) Technology Platform Prototype".
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/* User-visible header. */
+
+#ifndef _SYS_HWT_RECORD_H_
+#define _SYS_HWT_RECORD_H_
+
+enum hwt_record_type {
+ HWT_RECORD_MMAP,
+ HWT_RECORD_MUNMAP,
+ HWT_RECORD_EXECUTABLE,
+ HWT_RECORD_KERNEL,
+ HWT_RECORD_THREAD_CREATE,
+ HWT_RECORD_THREAD_SET_NAME,
+ HWT_RECORD_BUFFER
+};
+
+#ifdef _KERNEL
+struct hwt_record_entry {
+ TAILQ_ENTRY(hwt_record_entry) next;
+ enum hwt_record_type record_type;
+ union {
+ /*
+ * Used for MMAP, EXECUTABLE, INTERP,
+ * and KERNEL records.
+ */
+ struct {
+ char *fullpath;
+ uintptr_t addr;
+ uintptr_t baseaddr;
+ };
+ /* Used for BUFFER records. */
+ struct {
+ int buf_id;
+ int curpage;
+ vm_offset_t offset;
+ };
+ /* Used for THREAD_* records. */
+ int thread_id;
+ };
+};
+#endif
+
+#endif /* !_SYS_HWT_RECORD_H_ */
diff --git a/sys/sys/inotify.h b/sys/sys/inotify.h
new file mode 100644
index 000000000000..d1f23d5898bb
--- /dev/null
+++ b/sys/sys/inotify.h
@@ -0,0 +1,158 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2025 Klara, Inc.
+ */
+
+#ifndef _INOTIFY_H_
+#define _INOTIFY_H_
+
+#include <sys/_types.h>
+
+/* Flags for inotify_init1(). */
+#define IN_NONBLOCK 0x00000004 /* O_NONBLOCK */
+#define IN_CLOEXEC 0x00100000 /* O_CLOEXEC */
+
+struct inotify_event {
+ int wd;
+ __uint32_t mask;
+ __uint32_t cookie;
+ __uint32_t len;
+ char name[0];
+};
+
+/* Events, set in the mask field. */
+#define IN_ACCESS 0x00000001
+#define IN_MODIFY 0x00000002
+#define IN_ATTRIB 0x00000004
+#define IN_CLOSE_WRITE 0x00000008
+#define IN_CLOSE_NOWRITE 0x00000010
+#define IN_CLOSE (IN_CLOSE_WRITE | IN_CLOSE_NOWRITE)
+#define IN_OPEN 0x00000020
+#define IN_MOVED_FROM 0x00000040
+#define IN_MOVED_TO 0x00000080
+#define IN_MOVE (IN_MOVED_FROM | IN_MOVED_TO)
+#define IN_CREATE 0x00000100
+#define IN_DELETE 0x00000200
+#define IN_DELETE_SELF 0x00000400
+#define IN_MOVE_SELF 0x00000800
+#define IN_ALL_EVENTS 0x00000fff
+
+/* Events reported only for entries in a watched dir, not the dir itself. */
+#define _IN_DIR_EVENTS (IN_CLOSE_WRITE | IN_DELETE | IN_MODIFY | \
+ IN_MOVED_FROM | IN_MOVED_TO)
+
+#ifdef _KERNEL
+/*
+ * An unlink that's done as part of a rename only records IN_DELETE if the
+ * unlinked vnode itself is watched, and not when the containing directory is
+ * watched.
+ */
+#define _IN_MOVE_DELETE 0x40000000
+/*
+ * Inode link count changes only trigger IN_ATTRIB events if the inode itself is
+ * watched, and not when the containing directory is watched.
+ */
+#define _IN_ATTRIB_LINKCOUNT 0x80000000
+#endif
+
+/* Flags, set in the mask field. */
+#define IN_ONLYDIR 0x01000000
+#define IN_DONT_FOLLOW 0x02000000
+#define IN_EXCL_UNLINK 0x04000000
+#define IN_MASK_CREATE 0x10000000
+#define IN_MASK_ADD 0x20000000
+#define IN_ONESHOT 0x80000000
+#define _IN_ALL_FLAGS (IN_ONLYDIR | IN_DONT_FOLLOW | \
+ IN_EXCL_UNLINK | IN_MASK_CREATE | \
+ IN_MASK_ADD | IN_ONESHOT)
+
+/* Flags returned by the kernel. */
+#define IN_UNMOUNT 0x00002000
+#define IN_Q_OVERFLOW 0x00004000
+#define IN_IGNORED 0x00008000
+#define IN_ISDIR 0x40000000
+#define _IN_ALL_RETFLAGS (IN_Q_OVERFLOW | IN_UNMOUNT | IN_IGNORED | \
+ IN_ISDIR)
+
+#define _IN_ALIGN _Alignof(struct inotify_event)
+#define _IN_NAMESIZE(namelen) \
+ ((namelen) == 0 ? 0 : __align_up((namelen) + 1, _IN_ALIGN))
+
+#ifdef _KERNEL
+struct componentname;
+struct file;
+struct inotify_softc;
+struct thread;
+struct vnode;
+
+int inotify_create_file(struct thread *, struct file *, int, int *);
+void inotify_log(struct vnode *, const char *, size_t, int, __uint32_t);
+
+int kern_inotify_rm_watch(int, uint32_t, struct thread *);
+int kern_inotify_add_watch(int, int, const char *, uint32_t,
+ struct thread *);
+
+void vn_inotify(struct vnode *, struct vnode *, struct componentname *, int,
+ uint32_t);
+int vn_inotify_add_watch(struct vnode *, struct inotify_softc *,
+ __uint32_t, __uint32_t *, struct thread *);
+void vn_inotify_revoke(struct vnode *);
+
+/* Log an inotify event. */
+#define INOTIFY(vp, ev) do { \
+ if (__predict_false((vn_irflag_read(vp) & (VIRF_INOTIFY | \
+ VIRF_INOTIFY_PARENT)) != 0)) \
+ VOP_INOTIFY((vp), NULL, NULL, (ev), 0); \
+} while (0)
+
+/* Log an inotify event using a specific name for the vnode. */
+#define INOTIFY_NAME_LOCK(vp, dvp, cnp, ev, lock) do { \
+ if (__predict_false((vn_irflag_read(vp) & VIRF_INOTIFY) != 0 || \
+ (vn_irflag_read(dvp) & VIRF_INOTIFY) != 0)) { \
+ if (lock) \
+ vn_lock((vp), LK_SHARED | LK_RETRY); \
+ VOP_INOTIFY((vp), (dvp), (cnp), (ev), 0); \
+ if (lock) \
+ VOP_UNLOCK(vp); \
+ } \
+} while (0)
+#define INOTIFY_NAME(vp, dvp, cnp, ev) \
+ INOTIFY_NAME_LOCK((vp), (dvp), (cnp), (ev), false)
+
+extern __uint32_t inotify_rename_cookie;
+
+#define INOTIFY_MOVE(vp, fdvp, fcnp, tvp, tdvp, tcnp) do { \
+ if (__predict_false((vn_irflag_read(fdvp) & VIRF_INOTIFY) != 0 || \
+ (vn_irflag_read(tdvp) & VIRF_INOTIFY) != 0 || \
+ (vn_irflag_read(vp) & VIRF_INOTIFY) != 0)) { \
+ __uint32_t cookie; \
+ \
+ cookie = atomic_fetchadd_32(&inotify_rename_cookie, 1); \
+ VOP_INOTIFY((vp), (fdvp), (fcnp), IN_MOVED_FROM, cookie); \
+ VOP_INOTIFY((vp), (tdvp), (tcnp), IN_MOVED_TO, cookie); \
+ } \
+ if ((tvp) != NULL) \
+ INOTIFY_NAME_LOCK((tvp), (tdvp), (tcnp), \
+ _IN_MOVE_DELETE, true); \
+} while (0)
+
+#define INOTIFY_REVOKE(vp) do { \
+ if (__predict_false((vn_irflag_read(vp) & VIRF_INOTIFY) != 0)) \
+ vn_inotify_revoke((vp)); \
+} while (0)
+
+#else
+#include <sys/cdefs.h>
+
+__BEGIN_DECLS
+int inotify_init(void);
+int inotify_init1(int flags);
+int inotify_add_watch(int fd, const char *pathname, __uint32_t mask);
+int inotify_add_watch_at(int fd, int dfd, const char *pathname,
+ __uint32_t mask);
+int inotify_rm_watch(int fd, int wd);
+__END_DECLS
+#endif /* !_KERNEL */
+
+#endif /* !_INOTIFY_H_ */
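For readers new to the API: events are consumed by read(2)ing the inotify descriptor. Each record is a struct inotify_event followed by ev->len bytes of NUL-padded name (zero when the event concerns the watched object itself, padded per _IN_NAMESIZE()), and the two halves of a rename carry the same cookie so they can be paired. A minimal illustrative consumer, assuming the header installs as <sys/inotify.h>:

#include <sys/inotify.h>
#include <stdio.h>
#include <unistd.h>

static void
drain_events(int ifd)
{
	union {			/* Union keeps the buffer aligned. */
		struct inotify_event ev;
		char buf[8192];
	} u;
	ssize_t n, off;

	n = read(ifd, u.buf, sizeof(u.buf));
	for (off = 0; off < n;) {
		struct inotify_event *ev =
		    (struct inotify_event *)(u.buf + off);

		if (ev->mask & IN_MOVED_FROM)
			printf("wd %d: '%s' moved away, cookie %u\n", ev->wd,
			    ev->len != 0 ? ev->name : "",
			    (unsigned)ev->cookie);
		/* ev->len already includes the NUL padding. */
		off += (ssize_t)sizeof(*ev) + ev->len;
	}
}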
diff --git a/sys/sys/mount.h b/sys/sys/mount.h
index a6f858e02395..f6480b173a5c 100644
--- a/sys/sys/mount.h
+++ b/sys/sys/mount.h
@@ -267,6 +267,7 @@ struct mount {
int mnt_lazyvnodelistsize; /* (l) # of lazy vnodes */
int mnt_upper_pending; /* (i) # of pending ops on mnt_uppers */
struct lock mnt_explock; /* vfs_export walkers lock */
+ struct lock mnt_renamelock; /* renames and O_RESOLVE_BENEATH */
TAILQ_HEAD(, mount_upper_node) mnt_uppers; /* (i) upper mounts over us */
TAILQ_HEAD(, mount_upper_node) mnt_notify; /* (i) upper mounts for notification */
STAILQ_ENTRY(mount) mnt_taskqueue_link; /* (d) our place in deferred unmount list */
diff --git a/sys/sys/namei.h b/sys/sys/namei.h
index 5c245235ace5..6008d83f729d 100644
--- a/sys/sys/namei.h
+++ b/sys/sys/namei.h
@@ -108,7 +108,12 @@ struct nameidata {
* through the VOP interface.
*/
struct componentname ni_cnd;
+
+ /* State for serving O_RESOLVE_BENEATH lookups. */
struct nameicap_tracker_head ni_cap_tracker;
+ struct vnode *ni_rbeneath_dpp;
+ struct mount *ni_nctrack_mnt;
+
/*
* Private helper data for UFS, must be at the end. See
* NDINIT_PREFILL().
@@ -235,6 +240,10 @@ int cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status,
panic("namei data not inited"); \
if (((arg)->ni_debugflags & NAMEI_DBG_HADSTARTDIR) != 0) \
panic("NDREINIT on namei data with NAMEI_DBG_HADSTARTDIR"); \
+ if ((arg)->ni_nctrack_mnt != NULL) \
+ panic("NDREINIT on namei data with leaked ni_nctrack_mnt"); \
+ if (!TAILQ_EMPTY(&(arg)->ni_cap_tracker)) \
+ panic("NDREINIT on namei data with leaked ni_cap_tracker"); \
(arg)->ni_debugflags = NAMEI_DBG_INITED; \
}
#else
@@ -259,6 +268,9 @@ do { \
_ndp->ni_resflags = 0; \
filecaps_init(&_ndp->ni_filecaps); \
_ndp->ni_rightsneeded = _rightsp; \
+ _ndp->ni_rbeneath_dpp = NULL; \
+ _ndp->ni_nctrack_mnt = NULL; \
+ TAILQ_INIT(&_ndp->ni_cap_tracker); \
} while (0)
#define NDREINIT(ndp) do { \
diff --git a/sys/sys/param.h b/sys/sys/param.h
index f1bf874cb5fd..f941f021a423 100644
--- a/sys/sys/param.h
+++ b/sys/sys/param.h
@@ -74,7 +74,7 @@
* cannot include sys/param.h and should only be updated here.
*/
#undef __FreeBSD_version
-#define __FreeBSD_version 1500049
+#define __FreeBSD_version 1500054
/*
* __FreeBSD_kernel__ indicates that this system uses the kernel of FreeBSD,
diff --git a/sys/sys/proc.h b/sys/sys/proc.h
index c7e1a1f51cb4..af9cafa99dd0 100644
--- a/sys/sys/proc.h
+++ b/sys/sys/proc.h
@@ -893,6 +893,8 @@ struct proc {
#define P2_LOGSIGEXIT_ENABLE 0x00800000 /* Enable logging on sigexit */
#define P2_LOGSIGEXIT_CTL 0x01000000 /* Override kern.logsigexit */
+#define P2_HWT 0x02000000 /* Process is using HWT. */
+
/* Flags protected by proctree_lock, kept in p_treeflags. */
#define P_TREE_ORPHANED 0x00000001 /* Reparented, on orphan list */
#define P_TREE_FIRST_ORPHAN 0x00000002 /* First element of orphan
diff --git a/sys/sys/random.h b/sys/sys/random.h
index 254ba9451d0a..5abf762cd200 100644
--- a/sys/sys/random.h
+++ b/sys/sys/random.h
@@ -85,7 +85,8 @@ enum random_entropy_source {
RANDOM_FS_ATIME,
RANDOM_UMA, /* Special!! UMA/SLAB Allocator */
RANDOM_CALLOUT,
- RANDOM_ENVIRONMENTAL_END = RANDOM_CALLOUT,
+ RANDOM_RANDOMDEV,
+ RANDOM_ENVIRONMENTAL_END = RANDOM_RANDOMDEV,
/* Fast hardware random-number sources from here on. */
RANDOM_PURE_START,
RANDOM_PURE_OCTEON = RANDOM_PURE_START,
diff --git a/sys/sys/resourcevar.h b/sys/sys/resourcevar.h
index b15dace8cfa0..61411890c85b 100644
--- a/sys/sys/resourcevar.h
+++ b/sys/sys/resourcevar.h
@@ -122,6 +122,8 @@ struct uidinfo {
long ui_kqcnt; /* (b) number of kqueues */
long ui_umtxcnt; /* (b) number of shared umtxs */
long ui_pipecnt; /* (b) consumption of pipe buffers */
+ long ui_inotifycnt; /* (b) number of inotify descriptors */
+ long ui_inotifywatchcnt; /* (b) number of inotify watches */
uid_t ui_uid; /* (a) uid */
u_int ui_ref; /* (b) reference count */
#ifdef RACCT
@@ -144,6 +146,8 @@ int chgsbsize(struct uidinfo *uip, u_int *hiwat, u_int to,
int chgptscnt(struct uidinfo *uip, int diff, rlim_t maxval);
int chgumtxcnt(struct uidinfo *uip, int diff, rlim_t maxval);
int chgpipecnt(struct uidinfo *uip, int diff, rlim_t max);
+int chginotifycnt(struct uidinfo *uip, int diff, rlim_t maxval);
+int chginotifywatchcnt(struct uidinfo *uip, int diff, rlim_t maxval);
int kern_proc_setrlimit(struct thread *td, struct proc *p, u_int which,
struct rlimit *limp);
struct plimit
diff --git a/sys/sys/socket.h b/sys/sys/socket.h
index 5e7c554c34cf..cdd4fa3b4b89 100644
--- a/sys/sys/socket.h
+++ b/sys/sys/socket.h
@@ -111,10 +111,11 @@ typedef __uintptr_t uintptr_t;
*/
#define SOCK_CLOEXEC 0x10000000
#define SOCK_NONBLOCK 0x20000000
+#define SOCK_CLOFORK 0x40000000
#ifdef _KERNEL
/*
* Flags for accept1(), kern_accept4() and solisten_dequeue, in addition
- * to SOCK_CLOEXEC and SOCK_NONBLOCK.
+ * to SOCK_CLOEXEC, SOCK_CLOFORK and SOCK_NONBLOCK.
*/
#define ACCEPT4_INHERIT 0x1
#define ACCEPT4_COMPAT 0x2
@@ -478,6 +479,9 @@ struct msghdr {
#define MSG_MORETOCOME 0x00100000 /* additional data pending */
#define MSG_TLSAPPDATA 0x00200000 /* do not soreceive() alert rec. (TLS) */
#endif
+#if __BSD_VISIBLE
+#define MSG_CMSG_CLOFORK 0x00400000 /* make received fds close-on-fork */
+#endif
/*
/*
 * Header for ancillary data objects in msg_control buffer.
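SOCK_CLOFORK occupies the same flag space as SOCK_CLOEXEC and SOCK_NONBLOCK, so it is OR'd into the type argument at descriptor creation; MSG_CMSG_CLOFORK does the analogous job for descriptors arriving via SCM_RIGHTS. An illustrative call, assuming accept4(2) honors the new flag the same way it honors SOCK_CLOEXEC, as the updated kernel comment above indicates:

#include <sys/socket.h>
#include <stddef.h>

/* Accept a connection whose fd will be closed in forked children. */
static int
accept_clofork(int lfd)
{
	return (accept4(lfd, NULL, NULL, SOCK_CLOFORK | SOCK_NONBLOCK));
}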
diff --git a/sys/sys/specialfd.h b/sys/sys/specialfd.h
index dc4d88ce689f..0b79c841d149 100644
--- a/sys/sys/specialfd.h
+++ b/sys/sys/specialfd.h
@@ -30,6 +30,7 @@
enum specialfd_type {
SPECIALFD_EVENTFD = 1,
+ SPECIALFD_INOTIFY = 2,
};
struct specialfd_eventfd {
@@ -37,4 +38,8 @@ struct specialfd_eventfd {
int flags;
};
+struct specialfd_inotify {
+ int flags;
+};
+
#endif /* !_SYS_SPECIALFD_H_ */
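Like SPECIALFD_EVENTFD before it, the new type suggests that libc's inotify_init1() is a thin wrapper around the __specialfd(2) system call. The following is an assumption-laden sketch of that mapping, not code from this diff; the __specialfd prototype shown is the libc-private one used by eventfd():

#include <sys/specialfd.h>
#include <stddef.h>

int __specialfd(int type, const void *req, size_t len);	/* assumed */

int
inotify_init1(int flags)
{
	struct specialfd_inotify si;

	si.flags = flags;
	return (__specialfd(SPECIALFD_INOTIFY, &si, sizeof(si)));
}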
diff --git a/sys/sys/syscall.h b/sys/sys/syscall.h
index 68406a2dfc29..eec923d0b82e 100644
--- a/sys/sys/syscall.h
+++ b/sys/sys/syscall.h
@@ -529,4 +529,6 @@
#define SYS_fchroot 590
#define SYS_setcred 591
#define SYS_exterrctl 592
-#define SYS_MAXSYSCALL 593
+#define SYS_inotify_add_watch_at 593
+#define SYS_inotify_rm_watch 594
+#define SYS_MAXSYSCALL 595
diff --git a/sys/sys/syscall.mk b/sys/sys/syscall.mk
index 9a90a63f35a3..547242a73277 100644
--- a/sys/sys/syscall.mk
+++ b/sys/sys/syscall.mk
@@ -434,4 +434,6 @@ MIASM = \
getrlimitusage.o \
fchroot.o \
setcred.o \
- exterrctl.o
+ exterrctl.o \
+ inotify_add_watch_at.o \
+ inotify_rm_watch.o
diff --git a/sys/sys/syscallsubr.h b/sys/sys/syscallsubr.h
index fe6dd9e14fb4..fd183ffbc7a4 100644
--- a/sys/sys/syscallsubr.h
+++ b/sys/sys/syscallsubr.h
@@ -257,6 +257,7 @@ int kern_munlock(struct thread *td, uintptr_t addr, size_t size);
int kern_munmap(struct thread *td, uintptr_t addr, size_t size);
int kern_nanosleep(struct thread *td, struct timespec *rqt,
struct timespec *rmt);
+int kern_nosys(struct thread *td, int dummy);
int kern_ntp_adjtime(struct thread *td, struct timex *ntv, int *retvalp);
int kern_ogetdirentries(struct thread *td, struct ogetdirentries_args *uap,
long *ploff);
diff --git a/sys/sys/sysent.h b/sys/sys/sysent.h
index 6314b03142e7..4ddfc8516053 100644
--- a/sys/sys/sysent.h
+++ b/sys/sys/sysent.h
@@ -79,11 +79,10 @@ struct sysent { /* system call table */
*/
#define SYF_CAPENABLED 0x00000001
-#define SY_THR_FLAGMASK 0x7
-#define SY_THR_STATIC 0x1
-#define SY_THR_DRAINING 0x2
-#define SY_THR_ABSENT 0x4
-#define SY_THR_INCR 0x8
+#define SY_THR_STATIC 0x01
+#define SY_THR_DRAINING 0x02
+#define SY_THR_ABSENT 0x04
+#define SY_THR_INCR 0x08
#ifdef KLD_MODULE
#define SY_THR_STATIC_KLD 0
diff --git a/sys/sys/sysproto.h b/sys/sys/sysproto.h
index 94da81c84d25..94b5a0a7a95e 100644
--- a/sys/sys/sysproto.h
+++ b/sys/sys/sysproto.h
@@ -1891,6 +1891,16 @@ struct exterrctl_args {
char flags_l_[PADL_(u_int)]; u_int flags; char flags_r_[PADR_(u_int)];
char ptr_l_[PADL_(void *)]; void * ptr; char ptr_r_[PADR_(void *)];
};
+struct inotify_add_watch_at_args {
+ char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)];
+ char dfd_l_[PADL_(int)]; int dfd; char dfd_r_[PADR_(int)];
+ char path_l_[PADL_(const char *)]; const char * path; char path_r_[PADR_(const char *)];
+ char mask_l_[PADL_(uint32_t)]; uint32_t mask; char mask_r_[PADR_(uint32_t)];
+};
+struct inotify_rm_watch_args {
+ char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)];
+ char wd_l_[PADL_(int)]; int wd; char wd_r_[PADR_(int)];
+};
int sys_exit(struct thread *, struct exit_args *);
int sys_fork(struct thread *, struct fork_args *);
int sys_read(struct thread *, struct read_args *);
@@ -2293,6 +2303,8 @@ int sys_getrlimitusage(struct thread *, struct getrlimitusage_args *);
int sys_fchroot(struct thread *, struct fchroot_args *);
int sys_setcred(struct thread *, struct setcred_args *);
int sys_exterrctl(struct thread *, struct exterrctl_args *);
+int sys_inotify_add_watch_at(struct thread *, struct inotify_add_watch_at_args *);
+int sys_inotify_rm_watch(struct thread *, struct inotify_rm_watch_args *);
#ifdef COMPAT_43
@@ -3275,6 +3287,8 @@ int freebsd13_swapoff(struct thread *, struct freebsd13_swapoff_args *);
#define SYS_AUE_fchroot AUE_NULL
#define SYS_AUE_setcred AUE_SETCRED
#define SYS_AUE_exterrctl AUE_NULL
+#define SYS_AUE_inotify_add_watch_at AUE_INOTIFY
+#define SYS_AUE_inotify_rm_watch AUE_INOTIFY
#undef PAD_
#undef PADL_
diff --git a/sys/sys/unistd.h b/sys/sys/unistd.h
index f5caea2e3919..c291c1dc2b95 100644
--- a/sys/sys/unistd.h
+++ b/sys/sys/unistd.h
@@ -156,6 +156,7 @@
#define _PC_DEALLOC_PRESENT 65
#define _PC_NAMEDATTR_ENABLED 66
#define _PC_HAS_NAMEDATTR 67
+#define _PC_HAS_HIDDENSYSTEM 68
#endif
/* From OpenSolaris, used by SEEK_DATA/SEEK_HOLE. */
@@ -210,6 +211,7 @@
* close_range() options.
*/
#define CLOSE_RANGE_CLOEXEC (1<<2)
+#define CLOSE_RANGE_CLOFORK (1<<3)
#endif /* __BSD_VISIBLE */
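CLOSE_RANGE_CLOFORK parallels CLOSE_RANGE_CLOEXEC: rather than closing the descriptors in [lowfd, highfd], it marks them close-on-fork. An illustrative use, flagging everything above the standard descriptors:

#include <unistd.h>

/* Mark fds 3 and up close-on-fork instead of closing them. */
static int
clofork_high_fds(void)
{
	return (close_range(3, ~0U, CLOSE_RANGE_CLOFORK));
}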
diff --git a/sys/sys/user.h b/sys/sys/user.h
index f94a91ca1238..103236b6ed1b 100644
--- a/sys/sys/user.h
+++ b/sys/sys/user.h
@@ -265,6 +265,7 @@ struct user {
#define KF_TYPE_DEV 12
#define KF_TYPE_EVENTFD 13
#define KF_TYPE_TIMERFD 14
+#define KF_TYPE_INOTIFY 15
#define KF_TYPE_UNKNOWN 255
#define KF_VTYPE_VNON 0
@@ -456,6 +457,10 @@ struct kinfo_file {
int32_t kf_kqueue_count;
int32_t kf_kqueue_state;
} kf_kqueue;
+ struct {
+ uint64_t kf_inotify_npending;
+ uint64_t kf_inotify_nbpending;
+ } kf_inotify;
} kf_un;
};
uint16_t kf_status; /* Status flags. */
diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h
index bed20f607339..2c6947103c94 100644
--- a/sys/sys/vnode.h
+++ b/sys/sys/vnode.h
@@ -86,11 +86,13 @@ enum vgetstate {
* it from v_data. If non-null, this area is freed in getnewvnode().
*/
-struct namecache;
struct cache_fpl;
+struct inotify_watch;
+struct namecache;
struct vpollinfo {
struct mtx vpi_lock; /* lock to protect below */
+ TAILQ_HEAD(, inotify_watch) vpi_inotify; /* list of inotify watchers */
struct selinfo vpi_selinfo; /* identity of poller(s) */
short vpi_events; /* what they are looking for */
short vpi_revents; /* what has happened */
@@ -248,6 +250,9 @@ _Static_assert(sizeof(struct vnode) <= 448, "vnode size crosses 448 bytes");
#define VIRF_CROSSMP 0x0010 /* Cross-mp vnode, no locking */
#define VIRF_NAMEDDIR 0x0020 /* Named attribute directory */
#define VIRF_NAMEDATTR 0x0040 /* Named attribute */
+#define VIRF_INOTIFY 0x0080 /* This vnode is being watched */
+#define VIRF_INOTIFY_PARENT 0x0100 /* A parent of this vnode may be
+ watched */
#define VI_UNUSED0 0x0001 /* unused */
#define VI_MOUNT 0x0002 /* Mount in progress */
@@ -667,6 +672,7 @@ char *cache_symlink_alloc(size_t size, int flags);
void cache_symlink_free(char *string, size_t size);
int cache_symlink_resolve(struct cache_fpl *fpl, const char *string,
size_t len);
+void cache_vop_inotify(struct vnode *vp, int event, uint32_t cookie);
void cache_vop_rename(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp,
struct vnode *tvp, struct componentname *fcnp, struct componentname *tcnp);
void cache_vop_rmdir(struct vnode *dvp, struct vnode *vp);
@@ -869,8 +875,10 @@ int vop_stdfsync(struct vop_fsync_args *);
int vop_stdgetwritemount(struct vop_getwritemount_args *);
int vop_stdgetpages(struct vop_getpages_args *);
int vop_stdinactive(struct vop_inactive_args *);
-int vop_stdioctl(struct vop_ioctl_args *);
int vop_stdneed_inactive(struct vop_need_inactive_args *);
+int vop_stdinotify(struct vop_inotify_args *);
+int vop_stdinotify_add_watch(struct vop_inotify_add_watch_args *);
+int vop_stdioctl(struct vop_ioctl_args *);
int vop_stdkqfilter(struct vop_kqfilter_args *);
int vop_stdlock(struct vop_lock1_args *);
int vop_stdunlock(struct vop_unlock_args *);
@@ -910,9 +918,12 @@ int dead_read(struct vop_read_args *ap);
int dead_write(struct vop_write_args *ap);
/* These are called from within the actual VOPS. */
+void vop_allocate_post(void *a, int rc);
+void vop_copy_file_range_post(void *ap, int rc);
void vop_close_post(void *a, int rc);
void vop_create_pre(void *a);
void vop_create_post(void *a, int rc);
+void vop_deallocate_post(void *a, int rc);
void vop_whiteout_pre(void *a);
void vop_whiteout_post(void *a, int rc);
void vop_deleteextattr_pre(void *a);
@@ -1020,9 +1031,12 @@ void vop_rename_fail(struct vop_rename_args *ap);
#define VOP_WRITE_POST(ap, ret) \
noffset = (ap)->a_uio->uio_offset; \
- if (noffset > ooffset && !VN_KNLIST_EMPTY((ap)->a_vp)) { \
- VFS_KNOTE_LOCKED((ap)->a_vp, NOTE_WRITE \
- | (noffset > osize ? NOTE_EXTEND : 0)); \
+ if (noffset > ooffset) { \
+ if (!VN_KNLIST_EMPTY((ap)->a_vp)) { \
+ VFS_KNOTE_LOCKED((ap)->a_vp, NOTE_WRITE | \
+ (noffset > osize ? NOTE_EXTEND : 0)); \
+ } \
+ INOTIFY((ap)->a_vp, IN_MODIFY); \
}
#define VOP_LOCK(vp, flags) VOP_LOCK1(vp, flags, __FILE__, __LINE__)
diff --git a/sys/tools/vnode_if.awk b/sys/tools/vnode_if.awk
index d23c2af9bd9a..e829105197cc 100644
--- a/sys/tools/vnode_if.awk
+++ b/sys/tools/vnode_if.awk
@@ -193,6 +193,7 @@ if (cfile) {
printc(common_head \
"#include <sys/param.h>\n" \
"#include <sys/event.h>\n" \
+ "#include <sys/inotify.h>\n" \
"#include <sys/kernel.h>\n" \
"#include <sys/mount.h>\n" \
"#include <sys/sdt.h>\n" \
diff --git a/sys/ufs/ffs/ffs_vfsops.c b/sys/ufs/ffs/ffs_vfsops.c
index 891e490a7031..75f5fe716c31 100644
--- a/sys/ufs/ffs/ffs_vfsops.c
+++ b/sys/ufs/ffs/ffs_vfsops.c
@@ -1012,7 +1012,6 @@ ffs_mountfs(struct vnode *odevvp, struct mount *mp, struct thread *td)
else
ump->um_check_blkno = NULL;
mtx_init(UFS_MTX(ump), "FFS", "FFS Lock", MTX_DEF);
- sx_init(&ump->um_checkpath_lock, "uchpth");
fs->fs_ronly = ronly;
fs->fs_active = NULL;
mp->mnt_data = ump;
@@ -1182,7 +1181,6 @@ out:
}
if (ump != NULL) {
mtx_destroy(UFS_MTX(ump));
- sx_destroy(&ump->um_checkpath_lock);
if (mp->mnt_gjprovider != NULL) {
free(mp->mnt_gjprovider, M_UFSMNT);
mp->mnt_gjprovider = NULL;
@@ -1306,7 +1304,6 @@ ffs_unmount(struct mount *mp, int mntflags)
vrele(ump->um_odevvp);
dev_rel(ump->um_dev);
mtx_destroy(UFS_MTX(ump));
- sx_destroy(&ump->um_checkpath_lock);
if (mp->mnt_gjprovider != NULL) {
free(mp->mnt_gjprovider, M_UFSMNT);
mp->mnt_gjprovider = NULL;
diff --git a/sys/ufs/ufs/ufs_lookup.c b/sys/ufs/ufs/ufs_lookup.c
index eaf37c58756b..3f9c95e934fc 100644
--- a/sys/ufs/ufs/ufs_lookup.c
+++ b/sys/ufs/ufs/ufs_lookup.c
@@ -1412,7 +1412,6 @@ ufs_checkpath(ino_t source_ino, ino_t parent_ino, struct inode *target,
vp = tvp = ITOV(target);
mp = vp->v_mount;
*wait_ino = 0;
- sx_assert(&VFSTOUFS(mp)->um_checkpath_lock, SA_XLOCKED);
if (target->i_number == source_ino)
return (EEXIST);
diff --git a/sys/ufs/ufs/ufs_vnops.c b/sys/ufs/ufs/ufs_vnops.c
index 9aea01e70951..53fac4b0665e 100644
--- a/sys/ufs/ufs/ufs_vnops.c
+++ b/sys/ufs/ufs/ufs_vnops.c
@@ -1273,9 +1273,9 @@ ufs_rename(
struct mount *mp;
ino_t ino;
seqc_t fdvp_s, fvp_s, tdvp_s, tvp_s;
- bool checkpath_locked, want_seqc_end;
+ bool want_seqc_end;
- checkpath_locked = want_seqc_end = false;
+ want_seqc_end = false;
endoff = 0;
mp = tdvp->v_mount;
@@ -1427,10 +1427,6 @@ relock:
}
vfs_ref(mp);
MPASS(!want_seqc_end);
- if (checkpath_locked) {
- sx_xunlock(&VFSTOUFS(mp)->um_checkpath_lock);
- checkpath_locked = false;
- }
VOP_UNLOCK(fdvp);
VOP_UNLOCK(fvp);
vref(tdvp);
@@ -1484,8 +1480,6 @@ relock:
if (error)
goto unlockout;
- sx_xlock(&VFSTOUFS(mp)->um_checkpath_lock);
- checkpath_locked = true;
error = ufs_checkpath(ino, fdp->i_number, tdp, tcnp->cn_cred,
&ino);
/*
@@ -1493,8 +1487,6 @@ relock:
* everything else and VGET before restarting.
*/
if (ino) {
- sx_xunlock(&VFSTOUFS(mp)->um_checkpath_lock);
- checkpath_locked = false;
VOP_UNLOCK(fdvp);
VOP_UNLOCK(fvp);
VOP_UNLOCK(tdvp);
@@ -1574,9 +1566,6 @@ relock:
vn_seqc_write_end(fdvp);
want_seqc_end = false;
vfs_ref(mp);
- MPASS(checkpath_locked);
- sx_xunlock(&VFSTOUFS(mp)->um_checkpath_lock);
- checkpath_locked = false;
VOP_UNLOCK(fdvp);
VOP_UNLOCK(fvp);
vref(tdvp);
@@ -1763,9 +1752,6 @@ unlockout:
vn_seqc_write_end(fdvp);
}
- if (checkpath_locked)
- sx_xunlock(&VFSTOUFS(mp)->um_checkpath_lock);
-
vput(fdvp);
vput(fvp);
@@ -2734,6 +2720,9 @@ ufs_pathconf(
case _PC_SYMLINK_MAX:
*ap->a_retval = MAXPATHLEN;
break;
+ case _PC_HAS_HIDDENSYSTEM:
+ *ap->a_retval = 1;
+ break;
default:
error = vop_stdpathconf(ap);
diff --git a/sys/ufs/ufs/ufsmount.h b/sys/ufs/ufs/ufsmount.h
index 5c7fa11dae6a..d33b01e4425e 100644
--- a/sys/ufs/ufs/ufsmount.h
+++ b/sys/ufs/ufs/ufsmount.h
@@ -97,8 +97,6 @@ struct ufsmount {
uint64_t um_maxsymlinklen; /* (c) max size of short
symlink */
struct mtx um_lock; /* (c) Protects ufsmount & fs */
- struct sx um_checkpath_lock; /* (c) Protects ufs_checkpath()
- result */
struct mount_softdeps *um_softdep; /* (c) softdep mgmt structure */
struct vnode *um_quotas[MAXQUOTAS]; /* (q) pointer to quota files */
struct ucred *um_cred[MAXQUOTAS]; /* (q) quota file access cred */
diff --git a/sys/vm/swap_pager.c b/sys/vm/swap_pager.c
index 86b75a2d7989..d6bd06226d04 100644
--- a/sys/vm/swap_pager.c
+++ b/sys/vm/swap_pager.c
@@ -384,8 +384,8 @@ swap_release_by_cred(vm_ooffset_t decr, struct ucred *cred)
#endif
}
-static int swap_pager_full = 2; /* swap space exhaustion (task killing) */
-static int swap_pager_almost_full = 1; /* swap space exhaustion (w/hysteresis)*/
+static bool swap_pager_full = true; /* swap space exhaustion (task killing) */
+static bool swap_pager_almost_full = true; /* swap space exhaustion (w/hysteresis) */
static struct mtx swbuf_mtx; /* to sync nsw_wcount_async */
static int nsw_wcount_async; /* limit async write buffers */
static int nsw_wcount_async_max;/* assigned maximum */
@@ -642,14 +642,14 @@ swp_sizecheck(void)
{
if (swap_pager_avail < nswap_lowat) {
- if (swap_pager_almost_full == 0) {
+ if (!swap_pager_almost_full) {
printf("swap_pager: out of swap space\n");
- swap_pager_almost_full = 1;
+ swap_pager_almost_full = true;
}
} else {
- swap_pager_full = 0;
+ swap_pager_full = false;
if (swap_pager_avail > nswap_hiwat)
- swap_pager_almost_full = 0;
+ swap_pager_almost_full = false;
}
}
@@ -958,11 +958,10 @@ swp_pager_getswapspace(int *io_npages)
swp_sizecheck();
swdevhd = TAILQ_NEXT(sp, sw_list);
} else {
- if (swap_pager_full != 2) {
+ if (!swap_pager_full) {
printf("swp_pager_getswapspace(%d): failed\n",
*io_npages);
- swap_pager_full = 2;
- swap_pager_almost_full = 1;
+ swap_pager_full = swap_pager_almost_full = true;
}
swdevhd = NULL;
}
@@ -2863,10 +2862,8 @@ swapoff_one(struct swdevt *sp, struct ucred *cred, u_int flags)
sp->sw_id = NULL;
TAILQ_REMOVE(&swtailq, sp, sw_list);
nswapdev--;
- if (nswapdev == 0) {
- swap_pager_full = 2;
- swap_pager_almost_full = 1;
- }
+ if (nswapdev == 0)
+ swap_pager_full = swap_pager_almost_full = true;
if (swdevhd == sp)
swdevhd = NULL;
mtx_unlock(&sw_dev_mtx);
diff --git a/sys/vm/vm_domainset.c b/sys/vm/vm_domainset.c
index 7b8bf4c77663..b44bdb96b0d4 100644
--- a/sys/vm/vm_domainset.c
+++ b/sys/vm/vm_domainset.c
@@ -131,8 +131,7 @@ static void
vm_domainset_iter_next(struct vm_domainset_iter *di, int *domain)
{
- KASSERT(di->di_n > 0,
- ("vm_domainset_iter_first: Invalid n %d", di->di_n));
+ KASSERT(di->di_n > 0, ("%s: Invalid n %d", __func__, di->di_n));
switch (di->di_policy) {
case DOMAINSET_POLICY_FIRSTTOUCH:
/*
@@ -149,11 +148,10 @@ vm_domainset_iter_next(struct vm_domainset_iter *di, int *domain)
vm_domainset_iter_prefer(di, domain);
break;
default:
- panic("vm_domainset_iter_first: Unknown policy %d",
- di->di_policy);
+ panic("%s: Unknown policy %d", __func__, di->di_policy);
}
KASSERT(*domain < vm_ndomains,
- ("vm_domainset_iter_next: Invalid domain %d", *domain));
+ ("%s: Invalid domain %d", __func__, *domain));
}
static void
@@ -189,13 +187,11 @@ vm_domainset_iter_first(struct vm_domainset_iter *di, int *domain)
di->di_n = di->di_domain->ds_cnt;
break;
default:
- panic("vm_domainset_iter_first: Unknown policy %d",
- di->di_policy);
+ panic("%s: Unknown policy %d", __func__, di->di_policy);
}
- KASSERT(di->di_n > 0,
- ("vm_domainset_iter_first: Invalid n %d", di->di_n));
+ KASSERT(di->di_n > 0, ("%s: Invalid n %d", __func__, di->di_n));
KASSERT(*domain < vm_ndomains,
- ("vm_domainset_iter_first: Invalid domain %d", *domain));
+ ("%s: Invalid domain %d", __func__, *domain));
}
void
diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c
index 21584abacfa3..3e57e8d4f1d0 100644
--- a/sys/vm/vm_fault.c
+++ b/sys/vm/vm_fault.c
@@ -1441,8 +1441,7 @@ vm_fault_busy_sleep(struct faultstate *fs)
}
vm_object_pip_wakeup(fs->object);
vm_fault_unlock_map(fs);
- if (fs->m != vm_page_lookup(fs->object, fs->pindex) ||
- !vm_page_busy_sleep(fs->m, "vmpfw", 0))
+ if (!vm_page_busy_sleep(fs->m, "vmpfw", 0))
VM_OBJECT_UNLOCK(fs->object);
VM_CNT_INC(v_intrans);
vm_object_deallocate(fs->first_object);
diff --git a/sys/vm/vm_kern.c b/sys/vm/vm_kern.c
index 875c22d27628..e7d7b6726d2c 100644
--- a/sys/vm/vm_kern.c
+++ b/sys/vm/vm_kern.c
@@ -110,11 +110,18 @@ u_int exec_map_entry_size;
u_int exec_map_entries;
SYSCTL_ULONG(_vm, OID_AUTO, min_kernel_address, CTLFLAG_RD,
- SYSCTL_NULL_ULONG_PTR, VM_MIN_KERNEL_ADDRESS, "Min kernel address");
+#if defined(__amd64__)
+ &kva_layout.km_low, 0,
+#else
+ SYSCTL_NULL_ULONG_PTR, VM_MIN_KERNEL_ADDRESS,
+#endif
+ "Min kernel address");
SYSCTL_ULONG(_vm, OID_AUTO, max_kernel_address, CTLFLAG_RD,
#if defined(__arm__)
&vm_max_kernel_address, 0,
+#elif defined(__amd64__)
+ &kva_layout.km_high, 0,
#else
SYSCTL_NULL_ULONG_PTR, VM_MAX_KERNEL_ADDRESS,
#endif
diff --git a/sys/vm/vm_mmap.c b/sys/vm/vm_mmap.c
index 46fd212df299..501ace32bd11 100644
--- a/sys/vm/vm_mmap.c
+++ b/sys/vm/vm_mmap.c
@@ -41,6 +41,7 @@
*/
#include "opt_hwpmc_hooks.h"
+#include "opt_hwt_hooks.h"
#include "opt_vm.h"
#define EXTERR_CATEGORY EXTERR_CAT_MMAP
@@ -95,6 +96,10 @@
#include <sys/pmckern.h>
#endif
+#ifdef HWT_HOOKS
+#include <dev/hwt/hwt_hook.h>
+#endif
+
int old_mlock = 0;
SYSCTL_INT(_vm, OID_AUTO, old_mlock, CTLFLAG_RWTUN, &old_mlock, 0,
"Do not apply RLIMIT_MEMLOCK on mlockall");
@@ -613,6 +618,17 @@ kern_munmap(struct thread *td, uintptr_t addr0, size_t size)
#endif
rv = vm_map_delete(map, addr, end);
+#ifdef HWT_HOOKS
+ if (HWT_HOOK_INSTALLED && rv == KERN_SUCCESS) {
+ struct hwt_record_entry ent;
+
+ ent.addr = (uintptr_t) addr;
+ ent.fullpath = NULL;
+ ent.record_type = HWT_RECORD_MUNMAP;
+ HWT_CALL_HOOK(td, HWT_RECORD, &ent);
+ }
+#endif
+
#ifdef HWPMC_HOOKS
if (rv == KERN_SUCCESS && __predict_false(pmc_handled)) {
/* downgrade the lock to prevent a LOR with the pmc-sx lock */
diff --git a/sys/vm/vm_pagequeue.h b/sys/vm/vm_pagequeue.h
index cbbd27389662..9bd3b389fb60 100644
--- a/sys/vm/vm_pagequeue.h
+++ b/sys/vm/vm_pagequeue.h
@@ -260,9 +260,9 @@ struct vm_domain {
u_int vmd_inactive_shortage; /* Per-thread shortage. */
blockcount_t vmd_inactive_running; /* Number of inactive threads. */
blockcount_t vmd_inactive_starting; /* Number of threads started. */
- volatile u_int vmd_addl_shortage; /* Shortage accumulator. */
- volatile u_int vmd_inactive_freed; /* Successful inactive frees. */
- volatile u_int vmd_inactive_us; /* Microseconds for above. */
+ u_int vmd_addl_shortage; /* (a) Shortage accumulator. */
+ u_int vmd_inactive_freed; /* (a) Successful inactive frees. */
+ u_int vmd_inactive_us; /* (a) Microseconds for above. */
u_int vmd_inactive_pps; /* Exponential decay frees/second. */
int vmd_oom_seq;
int vmd_last_active_scan;
diff --git a/sys/x86/linux/linux_dummy_x86.c b/sys/x86/linux/linux_dummy_x86.c
index ae1d23e811e7..221f5dbf5ba3 100644
--- a/sys/x86/linux/linux_dummy_x86.c
+++ b/sys/x86/linux/linux_dummy_x86.c
@@ -46,7 +46,5 @@ LIN_SDT_PROVIDER_DECLARE(LINUX_DTRACE);
DUMMY(sysfs);
DUMMY(quotactl);
-/* Linux 2.6.13: */
-DUMMY(inotify_init);
/* Linux 2.6.22: */
DUMMY(signalfd);