aboutsummaryrefslogtreecommitdiff
path: root/sys
diff options
context:
space:
mode:
Diffstat (limited to 'sys')
-rw-r--r--sys/amd64/acpica/acpi_wakeup.c18
-rw-r--r--sys/amd64/amd64/apic_vector.S9
-rw-r--r--sys/amd64/amd64/cpu_switch.S4
-rw-r--r--sys/amd64/amd64/exec_machdep.c2
-rw-r--r--sys/amd64/amd64/machdep.c9
-rw-r--r--sys/amd64/amd64/mem.c4
-rw-r--r--sys/amd64/amd64/minidump_machdep.c10
-rw-r--r--sys/amd64/amd64/pmap.c598
-rw-r--r--sys/amd64/amd64/support.S20
-rw-r--r--sys/amd64/amd64/trap.c21
-rw-r--r--sys/amd64/conf/MINIMALUP4
-rw-r--r--sys/amd64/include/param.h7
-rw-r--r--sys/amd64/include/pmap.h34
-rw-r--r--sys/amd64/include/smp.h3
-rw-r--r--sys/amd64/include/vmm.h2
-rw-r--r--sys/amd64/include/vmm_dev.h7
-rw-r--r--sys/amd64/include/vmm_instruction_emul.h25
-rw-r--r--sys/amd64/include/vmparam.h55
-rw-r--r--sys/amd64/pt/pt.c978
-rw-r--r--sys/amd64/pt/pt.h49
-rw-r--r--sys/amd64/vmm/amd/svm.c90
-rw-r--r--sys/amd64/vmm/intel/vmx.c2
-rw-r--r--sys/amd64/vmm/intel/vmx_support.S6
-rw-r--r--sys/amd64/vmm/vmm_instruction_emul.c34
-rw-r--r--sys/amd64/vmm/vmm_ioport.c40
-rw-r--r--sys/arm/allwinner/aw_mmc.c33
-rw-r--r--sys/arm/arm/pmap-v6.c32
-rw-r--r--sys/arm64/arm64/pmap.c122
-rw-r--r--sys/arm64/broadcom/genet/if_genet.c4
-rw-r--r--sys/arm64/include/vmm_dev.h5
-rw-r--r--sys/cam/cam_xpt.c9
-rw-r--r--sys/cam/cam_xpt.h2
-rw-r--r--sys/cam/mmc/mmc_da.c2
-rw-r--r--sys/cam/mmc/mmc_xpt.c1
-rw-r--r--sys/cddl/boot/zfs/zfsimpl.h2
-rw-r--r--sys/cddl/dev/sdt/sdt.c23
-rw-r--r--sys/compat/linprocfs/linprocfs.c8
-rw-r--r--sys/compat/linux/linux_file.c2
-rw-r--r--sys/compat/linuxkpi/common/include/linux/slab.h2
-rw-r--r--sys/compat/linuxkpi/common/src/linux_page.c5
-rw-r--r--sys/conf/files15
-rw-r--r--sys/conf/files.amd648
-rw-r--r--sys/conf/files.arm644
-rw-r--r--sys/conf/files.x861
-rw-r--r--sys/dev/amdsmu/amdsmu.c466
-rw-r--r--sys/dev/amdsmu/amdsmu.h95
-rw-r--r--sys/dev/amdsmu/amdsmu_reg.h84
-rw-r--r--sys/dev/cxgbe/tom/t4_cpl_io.c6
-rw-r--r--sys/dev/cxgbe/tom/t4_tls.c4
-rw-r--r--sys/dev/drm2/drm_fb_helper.c2
-rw-r--r--sys/dev/efidev/efirt.c42
-rw-r--r--sys/dev/gpio/acpi_gpiobus.c2
-rw-r--r--sys/dev/gpio/gpiobus.c24
-rw-r--r--sys/dev/ice/ice_features.h2
-rw-r--r--sys/dev/ice/ice_iflib.h16
-rw-r--r--sys/dev/ice/ice_iov.c1856
-rw-r--r--sys/dev/ice/ice_iov.h125
-rw-r--r--sys/dev/ice/ice_lib.c28
-rw-r--r--sys/dev/ice/ice_lib.h4
-rw-r--r--sys/dev/ice/ice_vf_mbx.c471
-rw-r--r--sys/dev/ice/ice_vf_mbx.h67
-rw-r--r--sys/dev/ice/if_ice_iflib.c132
-rw-r--r--sys/dev/iicbus/iichid.c74
-rw-r--r--sys/dev/md/md.c29
-rw-r--r--sys/dev/mgb/if_mgb.c2
-rw-r--r--sys/dev/mlx5/mlx5_accel/ipsec.h8
-rw-r--r--sys/dev/mlx5/mlx5_accel/mlx5_ipsec_rxtx.c16
-rw-r--r--sys/dev/mlx5/mlx5_en/mlx5_en_hw_tls_rx.c3
-rw-r--r--sys/dev/mlx5/mlx5_en/mlx5_en_rx.c2
-rw-r--r--sys/dev/nvme/nvme_ctrlr.c295
-rw-r--r--sys/dev/nvme/nvme_private.h4
-rw-r--r--sys/dev/nvmf/controller/nvmft_subr.c40
-rw-r--r--sys/dev/ofw/ofw_bus_subr.c101
-rw-r--r--sys/dev/qlnx/qlnxe/qlnx_os.c11
-rw-r--r--sys/dev/random/fortuna.c7
-rw-r--r--sys/dev/random/random_harvestq.c232
-rw-r--r--sys/dev/random/random_harvestq.h2
-rw-r--r--sys/dev/random/randomdev.c2
-rw-r--r--sys/dev/ufshci/ufshci_private.h4
-rw-r--r--sys/dev/ufshci/ufshci_req_sdb.c45
-rw-r--r--sys/dev/usb/controller/xhci_pci.c7
-rw-r--r--sys/dev/vmm/vmm_dev.c135
-rw-r--r--sys/dev/vmm/vmm_mem.c15
-rw-r--r--sys/dev/vmm/vmm_mem.h27
-rw-r--r--sys/dev/vt/hw/vga/vt_vga.c2
-rw-r--r--sys/dev/vt/vt_core.c4
-rw-r--r--sys/fs/fdescfs/fdesc_vnops.c9
-rw-r--r--sys/fs/fuse/fuse_internal.h6
-rw-r--r--sys/fs/fuse/fuse_ipc.c2
-rw-r--r--sys/fs/fuse/fuse_vnops.c2
-rw-r--r--sys/fs/msdosfs/msdosfs_conv.c11
-rw-r--r--sys/fs/msdosfs/msdosfs_vnops.c21
-rw-r--r--sys/fs/nfs/nfs_commonport.c3
-rw-r--r--sys/fs/nfs/nfs_commonsubs.c32
-rw-r--r--sys/fs/nfs/nfs_var.h8
-rw-r--r--sys/fs/nfsclient/nfs_clrpcops.c6
-rw-r--r--sys/fs/nfsclient/nfs_clstate.c2
-rw-r--r--sys/fs/nfsserver/nfs_nfsdport.c41
-rw-r--r--sys/fs/nfsserver/nfs_nfsdserv.c34
-rw-r--r--sys/fs/p9fs/p9fs_vnops.c8
-rw-r--r--sys/fs/pseudofs/pseudofs_vnops.c31
-rw-r--r--sys/fs/smbfs/smbfs_io.c20
-rw-r--r--sys/fs/smbfs/smbfs_node.h2
-rw-r--r--sys/fs/smbfs/smbfs_vnops.c6
-rw-r--r--sys/fs/udf/ecma167-udf.h4
-rw-r--r--sys/fs/udf/udf_vfsops.c7
-rw-r--r--sys/fs/udf/udf_vnops.c48
-rw-r--r--sys/geom/concat/g_concat.c1
-rw-r--r--sys/geom/geom.h2
-rw-r--r--sys/geom/geom_ccd.c10
-rw-r--r--sys/geom/geom_event.c2
-rw-r--r--sys/geom/geom_io.c2
-rw-r--r--sys/geom/geom_slice.c6
-rw-r--r--sys/geom/geom_subr.c21
-rw-r--r--sys/geom/multipath/g_multipath.c10
-rw-r--r--sys/geom/virstor/g_virstor.c30
-rw-r--r--sys/i386/conf/GENERIC2
-rw-r--r--sys/i386/conf/GENERIC-NODEBUG2
-rw-r--r--sys/i386/conf/LINT1
-rw-r--r--sys/i386/conf/MINIMAL2
-rw-r--r--sys/i386/conf/PAE2
-rw-r--r--sys/i386/i386/pmap.c12
-rw-r--r--sys/kern/coredump_vnode.c562
-rw-r--r--sys/kern/imgact_elf.c15
-rw-r--r--sys/kern/kern_cpuset.c98
-rw-r--r--sys/kern/kern_descrip.c2
-rw-r--r--sys/kern/kern_exec.c21
-rw-r--r--sys/kern/kern_jail.c17
-rw-r--r--sys/kern/kern_prot.c38
-rw-r--r--sys/kern/kern_sendfile.c11
-rw-r--r--sys/kern/kern_sig.c594
-rw-r--r--sys/kern/kern_sysctl.c2
-rw-r--r--sys/kern/kern_ucoredump.c299
-rw-r--r--sys/kern/subr_asan.c3
-rw-r--r--sys/kern/subr_compressor.c6
-rw-r--r--sys/kern/subr_trap.c5
-rw-r--r--sys/kern/sys_generic.c8
-rw-r--r--sys/kern/uipc_ktls.c2
-rw-r--r--sys/kern/uipc_shm.c50
-rw-r--r--sys/kern/uipc_sockbuf.c43
-rw-r--r--sys/kern/uipc_socket.c12
-rw-r--r--sys/kern/vfs_cache.c13
-rw-r--r--sys/kern/vfs_inotify.c5
-rw-r--r--sys/kern/vfs_subr.c11
-rw-r--r--sys/kern/vfs_syscalls.c4
-rw-r--r--sys/kern/vnode_if.src5
-rw-r--r--sys/modules/Makefile4
-rw-r--r--sys/modules/amdsmu/Makefile14
-rw-r--r--sys/modules/efirt/Makefile2
-rw-r--r--sys/modules/ice/Makefile1
-rw-r--r--sys/modules/pt/Makefile8
-rw-r--r--sys/modules/qlnx/qlnxe/Makefile1
-rw-r--r--sys/net/ethernet.h6
-rw-r--r--sys/net/if_bridge.c197
-rw-r--r--sys/net/if_bridgevar.h4
-rw-r--r--sys/net/if_ethersubr.c5
-rw-r--r--sys/net/if_gif.h3
-rw-r--r--sys/net/if_lagg.c1
-rw-r--r--sys/net/if_ovpn.c321
-rw-r--r--sys/net/if_ovpn.h1
-rw-r--r--sys/net/if_tuntap.c76
-rw-r--r--sys/net/if_vlan.c12
-rw-r--r--sys/net/pfvar.h17
-rw-r--r--sys/net80211/ieee80211_hostap.c7
-rw-r--r--sys/net80211/ieee80211_ht.c13
-rw-r--r--sys/net80211/ieee80211_vht.c4
-rw-r--r--sys/net80211/ieee80211_vht.h3
-rw-r--r--sys/netinet/icmp_var.h9
-rw-r--r--sys/netinet/in_fib_dxr.c6
-rw-r--r--sys/netinet/ip_icmp.c3
-rw-r--r--sys/netinet/sctp_timer.c1
-rw-r--r--sys/netinet/tcp_hpts.c73
-rw-r--r--sys/netinet/tcp_hpts.h17
-rw-r--r--sys/netinet/tcp_input.c48
-rw-r--r--sys/netinet/tcp_log_buf.c2
-rw-r--r--sys/netinet/tcp_log_buf.h14
-rw-r--r--sys/netinet/tcp_lro.c12
-rw-r--r--sys/netinet/tcp_sack.c2
-rw-r--r--sys/netinet/tcp_stacks/bbr.c40
-rw-r--r--sys/netinet/tcp_stacks/rack.c80
-rw-r--r--sys/netinet/tcp_stacks/rack_bbr_common.c2
-rw-r--r--sys/netinet/tcp_stacks/rack_pcm.c4
-rw-r--r--sys/netinet/tcp_stacks/sack_filter.c8
-rw-r--r--sys/netinet/tcp_stacks/sack_filter.h2
-rw-r--r--sys/netinet/tcp_subr.c2
-rw-r--r--sys/netinet/tcp_timer.c4
-rw-r--r--sys/netinet/tcp_usrreq.c10
-rw-r--r--sys/netinet/tcp_var.h26
-rw-r--r--sys/netinet/udp_usrreq.c11
-rw-r--r--sys/netinet6/in6_gif.c18
-rw-r--r--sys/netinet6/mld6.c29
-rw-r--r--sys/netinet6/raw_ip6.c3
-rw-r--r--sys/netinet6/udp6_usrreq.c11
-rw-r--r--sys/netipsec/ipsec.c6
-rw-r--r--sys/netipsec/ipsec_offload.c25
-rw-r--r--sys/netipsec/ipsec_offload.h16
-rw-r--r--sys/netipsec/key.c2
-rw-r--r--sys/netlink/netlink_message_parser.h3
-rw-r--r--sys/netpfil/ipfilter/netinet/ip_fil_freebsd.c5
-rw-r--r--sys/netpfil/ipfw/ip_fw2.c2
-rw-r--r--sys/netpfil/pf/if_pflog.c4
-rw-r--r--sys/netpfil/pf/if_pfsync.c27
-rw-r--r--sys/netpfil/pf/pf.c360
-rw-r--r--sys/netpfil/pf/pf.h3
-rw-r--r--sys/netpfil/pf/pf_ioctl.c312
-rw-r--r--sys/netpfil/pf/pf_lb.c196
-rw-r--r--sys/netpfil/pf/pf_norm.c109
-rw-r--r--sys/netpfil/pf/pf_osfp.c17
-rw-r--r--sys/netpfil/pf/pf_ruleset.c13
-rw-r--r--sys/netpfil/pf/pf_syncookies.c6
-rw-r--r--sys/netpfil/pf/pf_table.c78
-rw-r--r--sys/netsmb/smb_conn.c4
-rw-r--r--sys/powerpc/aim/mmu_oea.c3
-rw-r--r--sys/powerpc/aim/mmu_oea64.c3
-rw-r--r--sys/powerpc/aim/mmu_radix.c4
-rw-r--r--sys/powerpc/include/pcb.h10
-rw-r--r--sys/powerpc/include/ucontext.h2
-rw-r--r--sys/powerpc/powerpc/exec_machdep.c39
-rw-r--r--sys/powerpc/powerpc/fpu.c30
-rw-r--r--sys/riscv/allwinner/files.allwinner1
-rw-r--r--sys/riscv/conf/std.allwinner1
-rw-r--r--sys/riscv/include/vmm_dev.h5
-rw-r--r--sys/riscv/riscv/pmap.c2
-rw-r--r--sys/rpc/authunix_prot.c7
-rw-r--r--sys/rpc/clnt_rc.c7
-rw-r--r--sys/rpc/rpcsec_gss/rpcsec_gss.c14
-rw-r--r--sys/rpc/rpcsec_tls/rpctls_impl.c8
-rw-r--r--sys/rpc/svc_auth_unix.c3
-rw-r--r--sys/security/audit/audit.c2
-rw-r--r--sys/security/audit/audit_arg.c2
-rw-r--r--sys/sys/compressor.h1
-rw-r--r--sys/sys/domainset.h14
-rw-r--r--sys/sys/efi.h18
-rw-r--r--sys/sys/elf_common.h280
-rw-r--r--sys/sys/exec.h20
-rw-r--r--sys/sys/exterr_cat.h1
-rw-r--r--sys/sys/exterrvar.h1
-rw-r--r--sys/sys/imgact_elf.h3
-rw-r--r--sys/sys/inotify.h14
-rw-r--r--sys/sys/jail.h2
-rw-r--r--sys/sys/mbuf.h1
-rw-r--r--sys/sys/param.h2
-rw-r--r--sys/sys/random.h3
-rw-r--r--sys/sys/signalvar.h1
-rw-r--r--sys/sys/sockbuf.h4
-rw-r--r--sys/sys/syscallsubr.h3
-rw-r--r--sys/sys/sysent.h4
-rw-r--r--sys/sys/ucoredump.h99
-rw-r--r--sys/sys/unistd.h2
-rw-r--r--sys/sys/vnode.h34
-rw-r--r--sys/ufs/ufs/ufs_vnops.c30
-rw-r--r--sys/vm/swap_pager.c23
-rw-r--r--sys/vm/vm_domainset.c16
-rw-r--r--sys/vm/vm_kern.c9
-rw-r--r--sys/vm/vm_page.c2
-rw-r--r--sys/vm/vm_pagequeue.h6
256 files changed, 9215 insertions, 2884 deletions
diff --git a/sys/amd64/acpica/acpi_wakeup.c b/sys/amd64/acpica/acpi_wakeup.c
index 51d6d5e36840..99565fbb69ca 100644
--- a/sys/amd64/acpica/acpi_wakeup.c
+++ b/sys/amd64/acpica/acpi_wakeup.c
@@ -54,10 +54,8 @@
#include <x86/apicreg.h>
#include <x86/apicvar.h>
-#ifdef SMP
#include <machine/smp.h>
#include <machine/vmparam.h>
-#endif
#include <contrib/dev/acpica/include/acpi.h>
@@ -73,19 +71,13 @@ extern int acpi_resume_beep;
extern int acpi_reset_video;
extern int acpi_susp_bounce;
-#ifdef SMP
extern struct susppcb **susppcbs;
static cpuset_t suspcpus;
-#else
-static struct susppcb **susppcbs;
-#endif
static void acpi_stop_beep(void *);
-#ifdef SMP
static int acpi_wakeup_ap(struct acpi_softc *, int);
static void acpi_wakeup_cpus(struct acpi_softc *);
-#endif
#define ACPI_WAKEPT_PAGES 7
@@ -103,7 +95,6 @@ acpi_stop_beep(void *arg)
timer_spkr_release();
}
-#ifdef SMP
static int
acpi_wakeup_ap(struct acpi_softc *sc, int cpu)
{
@@ -177,7 +168,6 @@ acpi_wakeup_cpus(struct acpi_softc *sc)
outb(CMOS_DATA, mpbiosreason);
}
}
-#endif
int
acpi_sleep_machdep(struct acpi_softc *sc, int state)
@@ -190,10 +180,8 @@ acpi_sleep_machdep(struct acpi_softc *sc, int state)
if (sc->acpi_wakeaddr == 0ul)
return (-1); /* couldn't alloc wake memory */
-#ifdef SMP
suspcpus = all_cpus;
CPU_CLR(PCPU_GET(cpuid), &suspcpus);
-#endif
if (acpi_resume_beep != 0)
timer_spkr_acquire();
@@ -208,12 +196,10 @@ acpi_sleep_machdep(struct acpi_softc *sc, int state)
pcb = &susppcbs[0]->sp_pcb;
if (savectx(pcb)) {
fpususpend(susppcbs[0]->sp_fpususpend);
-#ifdef SMP
if (!CPU_EMPTY(&suspcpus) && suspend_cpus(suspcpus) == 0) {
device_printf(sc->acpi_dev, "Failed to suspend APs\n");
return (0); /* couldn't sleep */
}
-#endif
hw_ibrs_ibpb_active = 0;
hw_ssb_active = 0;
cpu_stdext_feature3 = 0;
@@ -278,16 +264,12 @@ acpi_wakeup_machdep(struct acpi_softc *sc, int state, int sleep_result,
PCPU_SET(switchtime, 0);
PCPU_SET(switchticks, ticks);
lapic_xapic_mode();
-#ifdef SMP
if (!CPU_EMPTY(&suspcpus))
acpi_wakeup_cpus(sc);
-#endif
}
-#ifdef SMP
if (!CPU_EMPTY(&suspcpus))
resume_cpus(suspcpus);
-#endif
/*
* Re-read cpu_stdext_feature3, which was zeroed-out
diff --git a/sys/amd64/amd64/apic_vector.S b/sys/amd64/amd64/apic_vector.S
index 6e51ebff298a..e98bae9eb6c5 100644
--- a/sys/amd64/amd64/apic_vector.S
+++ b/sys/amd64/amd64/apic_vector.S
@@ -49,12 +49,6 @@
#include <machine/specialreg.h>
#include <x86/apicreg.h>
-#ifdef SMP
-#define LK lock ;
-#else
-#define LK
-#endif
-
.text
SUPERALIGN_TEXT
/* End Of Interrupt to APIC */
@@ -163,7 +157,6 @@ IDTVEC(spuriousint)
jmp doreti
#endif
-#ifdef SMP
/*
* Global address space TLB shootdown.
*/
@@ -270,5 +263,3 @@ IDTVEC(justreturn)
INTR_HANDLER justreturn1
call as_lapic_eoi
jmp doreti
-
-#endif /* SMP */
diff --git a/sys/amd64/amd64/cpu_switch.S b/sys/amd64/amd64/cpu_switch.S
index a053f6c70af1..d7e954f573b0 100644
--- a/sys/amd64/amd64/cpu_switch.S
+++ b/sys/amd64/amd64/cpu_switch.S
@@ -136,7 +136,7 @@ ctx_switch_fpusave_done:
movq %r15,TD_LOCK(%r13) /* Release the old thread */
sw1:
leaq TD_MD_PCB(%r12),%r8
-#if defined(SCHED_ULE) && defined(SMP)
+#if defined(SCHED_ULE)
movq $blocked_lock, %rdx
movq TD_LOCK(%r12),%rcx
cmpq %rcx, %rdx
@@ -492,7 +492,7 @@ ENTRY(resumectx)
END(resumectx)
/* Wait for the new thread to become unblocked */
-#if defined(SCHED_ULE) && defined(SMP)
+#if defined(SCHED_ULE)
sw1wait:
1:
pause
diff --git a/sys/amd64/amd64/exec_machdep.c b/sys/amd64/amd64/exec_machdep.c
index da68289e2c83..6752b716deb5 100644
--- a/sys/amd64/amd64/exec_machdep.c
+++ b/sys/amd64/amd64/exec_machdep.c
@@ -59,9 +59,7 @@
#include <sys/reg.h>
#include <sys/rwlock.h>
#include <sys/signalvar.h>
-#ifdef SMP
#include <sys/smp.h>
-#endif
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
diff --git a/sys/amd64/amd64/machdep.c b/sys/amd64/amd64/machdep.c
index f46462b39fa3..37c7056f649c 100644
--- a/sys/amd64/amd64/machdep.c
+++ b/sys/amd64/amd64/machdep.c
@@ -38,7 +38,6 @@
* SUCH DAMAGE.
*/
-#include <sys/cdefs.h>
#include "opt_atpic.h"
#include "opt_cpu.h"
#include "opt_ddb.h"
@@ -82,9 +81,7 @@
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/signalvar.h>
-#ifdef SMP
#include <sys/smp.h>
-#endif
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
@@ -132,9 +129,7 @@
#include <machine/tss.h>
#include <x86/ucode.h>
#include <x86/ifunc.h>
-#ifdef SMP
#include <machine/smp.h>
-#endif
#ifdef FDT
#include <x86/fdt.h>
#endif
@@ -149,6 +144,10 @@
#include <isa/rtc.h>
#include <x86/init.h>
+#ifndef SMP
+#error amd64 requires options SMP
+#endif
+
/* Sanity check for __curthread() */
CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);
diff --git a/sys/amd64/amd64/mem.c b/sys/amd64/amd64/mem.c
index 413b7c74890e..851f2df0e6e1 100644
--- a/sys/amd64/amd64/mem.c
+++ b/sys/amd64/amd64/mem.c
@@ -105,8 +105,8 @@ memrw(struct cdev *dev, struct uio *uio, int flags)
* PAGE_SIZE, the uiomove() call does not
* access past the end of the direct map.
*/
- if (v >= DMAP_MIN_ADDRESS &&
- v < DMAP_MIN_ADDRESS + dmaplimit) {
+ if (v >= kva_layout.dmap_low &&
+ v < kva_layout.dmap_high) {
error = uiomove((void *)v, c, uio);
break;
}
diff --git a/sys/amd64/amd64/minidump_machdep.c b/sys/amd64/amd64/minidump_machdep.c
index 6d0917e16099..43bf81a991bf 100644
--- a/sys/amd64/amd64/minidump_machdep.c
+++ b/sys/amd64/amd64/minidump_machdep.c
@@ -186,7 +186,7 @@ cpu_minidumpsys(struct dumperinfo *di, const struct minidumpstate *state)
* tables, so care must be taken to read each entry only once.
*/
pmapsize = 0;
- for (va = VM_MIN_KERNEL_ADDRESS; va < kva_end; ) {
+ for (va = kva_layout.km_low; va < kva_end; ) {
/*
* We always write a page, even if it is zero. Each
* page written corresponds to 1GB of space
@@ -279,9 +279,9 @@ cpu_minidumpsys(struct dumperinfo *di, const struct minidumpstate *state)
mdhdr.msgbufsize = mbp->msg_size;
mdhdr.bitmapsize = round_page(BITSET_SIZE(vm_page_dump_pages));
mdhdr.pmapsize = pmapsize;
- mdhdr.kernbase = VM_MIN_KERNEL_ADDRESS;
- mdhdr.dmapbase = DMAP_MIN_ADDRESS;
- mdhdr.dmapend = DMAP_MAX_ADDRESS;
+ mdhdr.kernbase = kva_layout.km_low;
+ mdhdr.dmapbase = kva_layout.dmap_low;
+ mdhdr.dmapend = kva_layout.dmap_high;
mdhdr.dumpavailsize = round_page(sizeof(dump_avail));
dump_init_header(di, &kdh, KERNELDUMPMAGIC, KERNELDUMP_AMD64_VERSION,
@@ -323,7 +323,7 @@ cpu_minidumpsys(struct dumperinfo *di, const struct minidumpstate *state)
/* Dump kernel page directory pages */
bzero(fakepd, sizeof(fakepd));
- for (va = VM_MIN_KERNEL_ADDRESS; va < kva_end; va += NBPDP) {
+ for (va = kva_layout.km_low; va < kva_end; va += NBPDP) {
ii = pmap_pml4e_index(va);
pml4 = (uint64_t *)PHYS_TO_DMAP(KPML4phys) + ii;
pdp = (uint64_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c
index 9c985df13ddf..8df082f6c5dc 100644
--- a/sys/amd64/amd64/pmap.c
+++ b/sys/amd64/amd64/pmap.c
@@ -162,9 +162,7 @@
#include <machine/msan.h>
#include <machine/pcb.h>
#include <machine/specialreg.h>
-#ifdef SMP
#include <machine/smp.h>
-#endif
#include <machine/sysarch.h>
#include <machine/tss.h>
@@ -415,7 +413,7 @@ SYSCTL_INT(_machdep, OID_AUTO, nkpt, CTLFLAG_RD, &nkpt, 0,
static int ndmpdp;
vm_paddr_t dmaplimit;
-vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS;
+vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS_LA48;
pt_entry_t pg_nx;
static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
@@ -475,11 +473,56 @@ _Static_assert(DMPML4I + NDMPML4E <= KMSANSHADPML4I, "direct map overflow");
static pml4_entry_t *kernel_pml4;
static u_int64_t DMPDphys; /* phys addr of direct mapped level 2 */
static u_int64_t DMPDPphys; /* phys addr of direct mapped level 3 */
+static u_int64_t DMPML4phys; /* ... level 4, for la57 */
static int ndmpdpphys; /* number of DMPDPphys pages */
vm_paddr_t kernphys; /* phys addr of start of bootstrap data */
vm_paddr_t KERNend; /* and the end */
+struct kva_layout_s kva_layout = {
+ .kva_min = KV4ADDR(PML4PML4I, 0, 0, 0),
+ .kva_max = KV4ADDR(NPML4EPG - 1, NPDPEPG - 1,
+ NPDEPG - 1, NPTEPG - 1),
+ .dmap_low = KV4ADDR(DMPML4I, 0, 0, 0),
+ .dmap_high = KV4ADDR(DMPML4I + NDMPML4E, 0, 0, 0),
+ .lm_low = KV4ADDR(LMSPML4I, 0, 0, 0),
+ .lm_high = KV4ADDR(LMEPML4I + 1, 0, 0, 0),
+ .km_low = KV4ADDR(KPML4BASE, 0, 0, 0),
+ .km_high = KV4ADDR(KPML4BASE + NKPML4E - 1, NPDPEPG - 1,
+ NPDEPG - 1, NPTEPG - 1),
+ .rec_pt = KV4ADDR(PML4PML4I, 0, 0, 0),
+ .kasan_shadow_low = KV4ADDR(KASANPML4I, 0, 0, 0),
+ .kasan_shadow_high = KV4ADDR(KASANPML4I + NKASANPML4E, 0, 0, 0),
+ .kmsan_shadow_low = KV4ADDR(KMSANSHADPML4I, 0, 0, 0),
+ .kmsan_shadow_high = KV4ADDR(KMSANSHADPML4I + NKMSANSHADPML4E,
+ 0, 0, 0),
+ .kmsan_origin_low = KV4ADDR(KMSANORIGPML4I, 0, 0, 0),
+ .kmsan_origin_high = KV4ADDR(KMSANORIGPML4I + NKMSANORIGPML4E,
+ 0, 0, 0),
+};
+
+struct kva_layout_s kva_layout_la57 = {
+ .kva_min = KV5ADDR(NPML5EPG / 2, 0, 0, 0, 0), /* == rec_pt */
+ .kva_max = KV5ADDR(NPML5EPG - 1, NPML4EPG - 1, NPDPEPG - 1,
+ NPDEPG - 1, NPTEPG - 1),
+ .dmap_low = KV5ADDR(DMPML5I, 0, 0, 0, 0),
+ .dmap_high = KV5ADDR(DMPML5I + NDMPML5E, 0, 0, 0, 0),
+ .lm_low = KV5ADDR(LMSPML5I, 0, 0, 0, 0),
+ .lm_high = KV5ADDR(LMEPML5I + 1, 0, 0, 0, 0),
+ .km_low = KV4ADDR(KPML4BASE, 0, 0, 0),
+ .km_high = KV4ADDR(KPML4BASE + NKPML4E - 1, NPDPEPG - 1,
+ NPDEPG - 1, NPTEPG - 1),
+ .rec_pt = KV5ADDR(PML5PML5I, 0, 0, 0, 0),
+ .kasan_shadow_low = KV4ADDR(KASANPML4I, 0, 0, 0),
+ .kasan_shadow_high = KV4ADDR(KASANPML4I + NKASANPML4E, 0, 0, 0),
+ .kmsan_shadow_low = KV4ADDR(KMSANSHADPML4I, 0, 0, 0),
+ .kmsan_shadow_high = KV4ADDR(KMSANSHADPML4I + NKMSANSHADPML4E,
+ 0, 0, 0),
+ .kmsan_origin_low = KV4ADDR(KMSANORIGPML4I, 0, 0, 0),
+ .kmsan_origin_high = KV4ADDR(KMSANORIGPML4I + NKMSANORIGPML4E,
+ 0, 0, 0),
+};
+
/*
* pmap_mapdev support pre initialization (i.e. console)
*/
@@ -549,8 +592,8 @@ static int pmap_flags = PMAP_PDE_SUPERPAGE; /* flags for x86 pmaps */
static vmem_t *large_vmem;
static u_int lm_ents;
-#define PMAP_ADDRESS_IN_LARGEMAP(va) ((va) >= LARGEMAP_MIN_ADDRESS && \
- (va) < LARGEMAP_MIN_ADDRESS + NBPML4 * (u_long)lm_ents)
+#define PMAP_ADDRESS_IN_LARGEMAP(va) ((va) >= kva_layout.lm_low && \
+ (va) < kva_layout.lm_high)
int pmap_pcid_enabled = 1;
SYSCTL_INT(_vm_pmap, OID_AUTO, pcid_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
@@ -1336,7 +1379,7 @@ static pdp_entry_t *pmap_pti_pdpe(vm_offset_t va);
static pd_entry_t *pmap_pti_pde(vm_offset_t va);
static void pmap_pti_wire_pte(void *pte);
static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
- bool remove_pt, struct spglist *free, struct rwlock **lockp);
+ bool demote_kpde, struct spglist *free, struct rwlock **lockp);
static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva,
pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp);
static vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va);
@@ -1722,7 +1765,7 @@ create_pagetables(vm_paddr_t *firstaddr)
{
pd_entry_t *pd_p;
pdp_entry_t *pdp_p;
- pml4_entry_t *p4_p;
+ pml4_entry_t *p4_p, *p4d_p;
pml5_entry_t *p5_p;
uint64_t DMPDkernphys;
vm_paddr_t pax;
@@ -1732,7 +1775,7 @@ create_pagetables(vm_paddr_t *firstaddr)
vm_offset_t kasankernbase;
int kasankpdpi, kasankpdi, nkasanpte;
#endif
- int i, j, ndm1g, nkpdpe, nkdmpde;
+ int i, j, ndm1g, nkpdpe, nkdmpde, ndmpml4phys;
TSENTER();
/* Allocate page table pages for the direct map */
@@ -1740,15 +1783,30 @@ create_pagetables(vm_paddr_t *firstaddr)
if (ndmpdp < 4) /* Minimum 4GB of dirmap */
ndmpdp = 4;
ndmpdpphys = howmany(ndmpdp, NPDPEPG);
- if (ndmpdpphys > NDMPML4E) {
- /*
- * Each NDMPML4E allows 512 GB, so limit to that,
- * and then readjust ndmpdp and ndmpdpphys.
- */
- printf("NDMPML4E limits system to %d GB\n", NDMPML4E * 512);
- Maxmem = atop(NDMPML4E * NBPML4);
- ndmpdpphys = NDMPML4E;
- ndmpdp = NDMPML4E * NPDEPG;
+ if (la57) {
+ ndmpml4phys = howmany(ndmpdpphys, NPML4EPG);
+ if (ndmpml4phys > NDMPML5E) {
+ printf("NDMPML5E limits system to %ld GB\n",
+ (u_long)NDMPML5E * NBPML5 / 1024 / 1024 / 1024);
+ Maxmem = atop(NDMPML5E * NBPML5);
+ ndmpml4phys = NDMPML5E;
+ ndmpdpphys = ndmpml4phys * NPML4EPG;
+ ndmpdp = ndmpdpphys * NPDEPG;
+ }
+ DMPML4phys = allocpages(firstaddr, ndmpml4phys);
+ } else {
+ if (ndmpdpphys > NDMPML4E) {
+ /*
+ * Each NDMPML4E allows 512 GB, so limit to
+ * that, and then readjust ndmpdp and
+ * ndmpdpphys.
+ */
+ printf("NDMPML4E limits system to %d GB\n",
+ NDMPML4E * 512);
+ Maxmem = atop(NDMPML4E * NBPML4);
+ ndmpdpphys = NDMPML4E;
+ ndmpdp = NDMPML4E * NPDEPG;
+ }
}
DMPDPphys = allocpages(firstaddr, ndmpdpphys);
ndm1g = 0;
@@ -1773,7 +1831,13 @@ create_pagetables(vm_paddr_t *firstaddr)
dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT;
/* Allocate pages. */
+ if (la57) {
+ KPML5phys = allocpages(firstaddr, 1);
+ p5_p = (pml5_entry_t *)KPML5phys;
+ }
KPML4phys = allocpages(firstaddr, 1);
+ p4_p = (pml4_entry_t *)KPML4phys;
+
KPDPphys = allocpages(firstaddr, NKPML4E);
#ifdef KASAN
KASANPDPphys = allocpages(firstaddr, NKASANPML4E);
@@ -1893,6 +1957,16 @@ create_pagetables(vm_paddr_t *firstaddr)
}
/*
+ * Connect the Direct Map slots up to the PML4.
+ * pml5 entries for DMAP are handled below in global pml5 loop.
+ */
+ p4d_p = la57 ? (pml4_entry_t *)DMPML4phys : &p4_p[DMPML4I];
+ for (i = 0; i < ndmpdpphys; i++) {
+ p4d_p[i] = (DMPDPphys + ptoa(i)) | X86_PG_RW | X86_PG_V |
+ pg_nx;
+ }
+
+ /*
* Instead of using a 1G page for the memory containing the kernel,
* use 2M pages with read-only and no-execute permissions. (If using 1G
* pages, this will partially overwrite the PDPEs above.)
@@ -1911,11 +1985,6 @@ create_pagetables(vm_paddr_t *firstaddr)
}
}
- /* And recursively map PML4 to itself in order to get PTmap */
- p4_p = (pml4_entry_t *)KPML4phys;
- p4_p[PML4PML4I] = KPML4phys;
- p4_p[PML4PML4I] |= X86_PG_RW | X86_PG_V | pg_nx;
-
#ifdef KASAN
/* Connect the KASAN shadow map slots up to the PML4. */
for (i = 0; i < NKASANPML4E; i++) {
@@ -1938,25 +2007,15 @@ create_pagetables(vm_paddr_t *firstaddr)
}
#endif
- /* Connect the Direct Map slots up to the PML4. */
- for (i = 0; i < ndmpdpphys; i++) {
- p4_p[DMPML4I + i] = DMPDPphys + ptoa(i);
- p4_p[DMPML4I + i] |= X86_PG_RW | X86_PG_V | pg_nx;
- }
-
/* Connect the KVA slots up to the PML4 */
for (i = 0; i < NKPML4E; i++) {
p4_p[KPML4BASE + i] = KPDPphys + ptoa(i);
p4_p[KPML4BASE + i] |= X86_PG_RW | X86_PG_V;
}
- kernel_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys);
-
if (la57) {
/* XXXKIB bootstrap KPML5phys page is lost */
- KPML5phys = allocpages(firstaddr, 1);
- for (i = 0, p5_p = (pml5_entry_t *)KPML5phys; i < NPML5EPG;
- i++) {
+ for (i = 0; i < NPML5EPG; i++) {
if (i == PML5PML5I) {
/*
* Recursively map PML5 to itself in
@@ -1964,6 +2023,10 @@ create_pagetables(vm_paddr_t *firstaddr)
*/
p5_p[i] = KPML5phys | X86_PG_RW | X86_PG_A |
X86_PG_M | X86_PG_V | pg_nx;
+ } else if (i >= DMPML5I && i < DMPML5I + ndmpml4phys) {
+ /* Connect DMAP pml4 pages to PML5. */
+ p5_p[i] = (DMPML4phys + ptoa(i - DMPML5I)) |
+ X86_PG_RW | X86_PG_V | pg_nx;
} else if (i == pmap_pml5e_index(UPT_MAX_ADDRESS)) {
p5_p[i] = KPML4phys | X86_PG_RW | X86_PG_A |
X86_PG_M | X86_PG_V;
@@ -1971,6 +2034,10 @@ create_pagetables(vm_paddr_t *firstaddr)
p5_p[i] = 0;
}
}
+ } else {
+ /* Recursively map PML4 to itself in order to get PTmap */
+ p4_p[PML4PML4I] = KPML4phys;
+ p4_p[PML4PML4I] |= X86_PG_RW | X86_PG_V | pg_nx;
}
TSEXIT();
}
@@ -2024,7 +2091,7 @@ pmap_bootstrap(vm_paddr_t *firstaddr)
*/
virtual_avail = (vm_offset_t)KERNSTART + round_2mpage(KERNend -
(vm_paddr_t)kernphys);
- virtual_end = VM_MAX_KERNEL_ADDRESS;
+ virtual_end = kva_layout.km_high;
/*
* Enable PG_G global pages, then switch to the kernel page
@@ -2046,9 +2113,13 @@ pmap_bootstrap(vm_paddr_t *firstaddr)
* Initialize the kernel pmap (which is statically allocated).
* Count bootstrap data as being resident in case any of this data is
* later unmapped (using pmap_remove()) and freed.
+ *
+ * DMAP_TO_PHYS()/PHYS_TO_DMAP() are functional only after
+ * kva_layout is fixed.
*/
PMAP_LOCK_INIT(kernel_pmap);
if (la57) {
+ kva_layout = kva_layout_la57;
vtoptem = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT +
NPML4EPGSHIFT + NPML5EPGSHIFT)) - 1) << 3;
PTmap = (vm_offset_t)P5Tmap;
@@ -2059,6 +2130,7 @@ pmap_bootstrap(vm_paddr_t *firstaddr)
kernel_pmap->pm_cr3 = KPML5phys;
pmap_pt_page_count_adj(kernel_pmap, 1); /* top-level page */
} else {
+ kernel_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys);
kernel_pmap->pm_pmltop = kernel_pml4;
kernel_pmap->pm_cr3 = KPML4phys;
}
@@ -2420,6 +2492,8 @@ pmap_init(void)
{
struct pmap_preinit_mapping *ppim;
vm_page_t m, mpte;
+ pml4_entry_t *pml4e;
+ unsigned long lm_max;
int error, i, ret, skz63;
/* L1TF, reserve page @0 unconditionally */
@@ -2545,10 +2619,15 @@ pmap_init(void)
lm_ents = 8;
TUNABLE_INT_FETCH("vm.pmap.large_map_pml4_entries", &lm_ents);
- if (lm_ents > LMEPML4I - LMSPML4I + 1)
- lm_ents = LMEPML4I - LMSPML4I + 1;
+ lm_max = (kva_layout.lm_high - kva_layout.lm_low) / NBPML4;
+ if (lm_ents > lm_max) {
+ printf(
+ "pmap: shrinking large map from requested %d slots to %ld slots\n",
+ lm_ents, lm_max);
+ lm_ents = lm_max;
+ }
#ifdef KMSAN
- if (lm_ents > KMSANORIGPML4I - LMSPML4I) {
+ if (!la57 && lm_ents > KMSANORIGPML4I - LMSPML4I) {
printf(
"pmap: shrinking large map for KMSAN (%d slots to %ld slots)\n",
lm_ents, KMSANORIGPML4I - LMSPML4I);
@@ -2559,18 +2638,27 @@ pmap_init(void)
printf("pmap: large map %u PML4 slots (%lu GB)\n",
lm_ents, (u_long)lm_ents * (NBPML4 / 1024 / 1024 / 1024));
if (lm_ents != 0) {
- large_vmem = vmem_create("large", LARGEMAP_MIN_ADDRESS,
+ large_vmem = vmem_create("large", kva_layout.lm_low,
(vmem_size_t)lm_ents * NBPML4, PAGE_SIZE, 0, M_WAITOK);
if (large_vmem == NULL) {
printf("pmap: cannot create large map\n");
lm_ents = 0;
}
+ if (la57) {
+ for (i = 0; i < howmany((vm_offset_t)NBPML4 *
+ lm_ents, NBPML5); i++) {
+ m = pmap_large_map_getptp_unlocked();
+ kernel_pmap->pm_pmltop[LMSPML5I + i] = X86_PG_V |
+ X86_PG_RW | X86_PG_A | X86_PG_M |
+ pg_nx | VM_PAGE_TO_PHYS(m);
+ }
+ }
for (i = 0; i < lm_ents; i++) {
m = pmap_large_map_getptp_unlocked();
- /* XXXKIB la57 */
- kernel_pml4[LMSPML4I + i] = X86_PG_V |
- X86_PG_RW | X86_PG_A | X86_PG_M | pg_nx |
- VM_PAGE_TO_PHYS(m);
+ pml4e = pmap_pml4e(kernel_pmap, kva_layout.lm_low +
+ (u_long)i * NBPML4);
+ *pml4e = X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M |
+ pg_nx | VM_PAGE_TO_PHYS(m);
}
}
}
@@ -2975,7 +3063,6 @@ pmap_update_pde_invalidate(pmap_t pmap, vm_offset_t va, pd_entry_t newpde)
* XXX TODO
*/
-#ifdef SMP
/*
* Interrupt the cpus that are executing in the guest context.
* This will force the vcpu to exit and the cached EPT mappings
@@ -3433,168 +3520,6 @@ pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
}
sched_unpin();
}
-#else /* !SMP */
-/*
- * Normal, non-SMP, invalidation functions.
- */
-void
-pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
-{
- struct invpcid_descr d;
- struct pmap_pcid *pcidp;
- uint64_t kcr3, ucr3;
- uint32_t pcid;
-
- if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) {
- pmap->pm_eptgen++;
- return;
- }
- KASSERT(pmap->pm_type == PT_X86,
- ("pmap_invalidate_range: unknown type %d", pmap->pm_type));
-
- if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) {
- invlpg(va);
- if (pmap == PCPU_GET(curpmap) && pmap_pcid_enabled &&
- pmap->pm_ucr3 != PMAP_NO_CR3) {
- critical_enter();
- pcid = pmap_get_pcid(pmap);
- if (invpcid_works) {
- d.pcid = pcid | PMAP_PCID_USER_PT;
- d.pad = 0;
- d.addr = va;
- invpcid(&d, INVPCID_ADDR);
- } else {
- kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE;
- ucr3 = pmap->pm_ucr3 | pcid |
- PMAP_PCID_USER_PT | CR3_PCID_SAVE;
- pmap_pti_pcid_invlpg(ucr3, kcr3, va);
- }
- critical_exit();
- }
- } else if (pmap_pcid_enabled) {
- pcidp = zpcpu_get(pmap->pm_pcidp);
- pcidp->pm_gen = 0;
- }
-}
-
-void
-pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
-{
- struct invpcid_descr d;
- struct pmap_pcid *pcidp;
- vm_offset_t addr;
- uint64_t kcr3, ucr3;
- uint32_t pcid;
-
- if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) {
- pmap->pm_eptgen++;
- return;
- }
- KASSERT(pmap->pm_type == PT_X86,
- ("pmap_invalidate_range: unknown type %d", pmap->pm_type));
-
- if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) {
- for (addr = sva; addr < eva; addr += PAGE_SIZE)
- invlpg(addr);
- if (pmap == PCPU_GET(curpmap) && pmap_pcid_enabled &&
- pmap->pm_ucr3 != PMAP_NO_CR3) {
- critical_enter();
- pcid = pmap_get_pcid(pmap);
- if (invpcid_works) {
- d.pcid = pcid | PMAP_PCID_USER_PT;
- d.pad = 0;
- d.addr = sva;
- for (; d.addr < eva; d.addr += PAGE_SIZE)
- invpcid(&d, INVPCID_ADDR);
- } else {
- kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE;
- ucr3 = pmap->pm_ucr3 | pcid |
- PMAP_PCID_USER_PT | CR3_PCID_SAVE;
- pmap_pti_pcid_invlrng(ucr3, kcr3, sva, eva);
- }
- critical_exit();
- }
- } else if (pmap_pcid_enabled) {
- pcidp = zpcpu_get(pmap->pm_pcidp);
- pcidp->pm_gen = 0;
- }
-}
-
-void
-pmap_invalidate_all(pmap_t pmap)
-{
- struct invpcid_descr d;
- struct pmap_pcid *pcidp;
- uint64_t kcr3, ucr3;
- uint32_t pcid;
-
- if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) {
- pmap->pm_eptgen++;
- return;
- }
- KASSERT(pmap->pm_type == PT_X86,
- ("pmap_invalidate_all: unknown type %d", pmap->pm_type));
-
- if (pmap == kernel_pmap) {
- if (pmap_pcid_enabled && invpcid_works) {
- bzero(&d, sizeof(d));
- invpcid(&d, INVPCID_CTXGLOB);
- } else {
- invltlb_glob();
- }
- } else if (pmap == PCPU_GET(curpmap)) {
- if (pmap_pcid_enabled) {
- critical_enter();
- pcid = pmap_get_pcid(pmap);
- if (invpcid_works) {
- d.pcid = pcid;
- d.pad = 0;
- d.addr = 0;
- invpcid(&d, INVPCID_CTX);
- if (pmap->pm_ucr3 != PMAP_NO_CR3) {
- d.pcid |= PMAP_PCID_USER_PT;
- invpcid(&d, INVPCID_CTX);
- }
- } else {
- kcr3 = pmap->pm_cr3 | pcid;
- if (pmap->pm_ucr3 != PMAP_NO_CR3) {
- ucr3 = pmap->pm_ucr3 | pcid |
- PMAP_PCID_USER_PT;
- pmap_pti_pcid_invalidate(ucr3, kcr3);
- } else
- load_cr3(kcr3);
- }
- critical_exit();
- } else {
- invltlb();
- }
- } else if (pmap_pcid_enabled) {
- pcidp = zpcpu_get(pmap->pm_pcidp);
- pcidp->pm_gen = 0;
- }
-}
-
-void
-pmap_invalidate_cache(void)
-{
-
- wbinvd();
-}
-
-static void
-pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
-{
- struct pmap_pcid *pcidp;
-
- pmap_update_pde_store(pmap, pde, newpde);
- if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap))
- pmap_update_pde_invalidate(pmap, va, newpde);
- else {
- pcidp = zpcpu_get(pmap->pm_pcidp);
- pcidp->pm_gen = 0;
- }
-}
-#endif /* !SMP */
static void
pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, pd_entry_t pde)
@@ -3899,7 +3824,7 @@ pmap_kextract(vm_offset_t va)
pd_entry_t pde;
vm_paddr_t pa;
- if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
+ if (va >= kva_layout.dmap_low && va < kva_layout.dmap_high) {
pa = DMAP_TO_PHYS(va);
} else if (PMAP_ADDRESS_IN_LARGEMAP(va)) {
pa = pmap_large_map_kextract(va);
@@ -4040,7 +3965,7 @@ pmap_qremove(vm_offset_t sva, int count)
* enough to one of those pmap_enter() calls for it to
* be caught up in a promotion.
*/
- KASSERT(va >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", va));
+ KASSERT(va >= kva_layout.km_low, ("usermode va %lx", va));
KASSERT((*vtopde(va) & X86_PG_PS) == 0,
("pmap_qremove on promoted va %#lx", va));
@@ -4328,21 +4253,13 @@ void
pmap_pinit_pml5(vm_page_t pml5pg)
{
pml5_entry_t *pm_pml5;
+ int i;
pm_pml5 = (pml5_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml5pg));
-
- /*
- * Add pml5 entry at top of KVA pointing to existing pml4 table,
- * entering all existing kernel mappings into level 5 table.
- */
- pm_pml5[pmap_pml5e_index(UPT_MAX_ADDRESS)] = KPML4phys | X86_PG_V |
- X86_PG_RW | X86_PG_A | X86_PG_M;
-
- /*
- * Install self-referential address mapping entry.
- */
- pm_pml5[PML5PML5I] = VM_PAGE_TO_PHYS(pml5pg) |
- X86_PG_RW | X86_PG_V | X86_PG_M | X86_PG_A;
+ for (i = 0; i < NPML5EPG / 2; i++)
+ pm_pml5[i] = 0;
+ for (; i < NPML5EPG; i++)
+ pm_pml5[i] = kernel_pmap->pm_pmltop[i];
}
static void
@@ -4899,8 +4816,8 @@ pmap_release(pmap_t pmap)
m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pmltop));
if (pmap_is_la57(pmap)) {
- pmap->pm_pmltop[pmap_pml5e_index(UPT_MAX_ADDRESS)] = 0;
- pmap->pm_pmltop[PML5PML5I] = 0;
+ for (i = NPML5EPG / 2; i < NPML5EPG; i++)
+ pmap->pm_pmltop[i] = 0;
} else {
for (i = 0; i < NKPML4E; i++) /* KVA */
pmap->pm_pmltop[KPML4BASE + i] = 0;
@@ -4942,7 +4859,7 @@ pmap_release(pmap_t pmap)
static int
kvm_size(SYSCTL_HANDLER_ARGS)
{
- unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS;
+ unsigned long ksize = kva_layout.km_high - kva_layout.km_low;
return sysctl_handle_long(oidp, &ksize, 0, req);
}
@@ -4953,7 +4870,7 @@ SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE,
static int
kvm_free(SYSCTL_HANDLER_ARGS)
{
- unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
+ unsigned long kfree = kva_layout.km_high - kernel_vm_end;
return sysctl_handle_long(oidp, &kfree, 0, req);
}
@@ -5031,7 +4948,7 @@ pmap_page_array_startup(long pages)
vm_page_array_size = pages;
- start = VM_MIN_KERNEL_ADDRESS;
+ start = kva_layout.km_low;
end = start + pages * sizeof(struct vm_page);
for (va = start; va < end; va += NBPDR) {
pfn = first_page + (va - start) / sizeof(struct vm_page);
@@ -6045,17 +5962,18 @@ pmap_demote_pde_mpte(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
if (mpte == NULL) {
/*
* Invalidate the 2MB page mapping and return "failure" if the
- * mapping was never accessed.
+ * mapping was never accessed and not wired.
*/
if ((oldpde & PG_A) == 0) {
- KASSERT((oldpde & PG_W) == 0,
- ("pmap_demote_pde: a wired mapping is missing PG_A"));
- pmap_demote_pde_abort(pmap, va, pde, oldpde, lockp);
- return (false);
- }
-
- mpte = pmap_remove_pt_page(pmap, va);
- if (mpte == NULL) {
+ if ((oldpde & PG_W) == 0) {
+ pmap_demote_pde_abort(pmap, va, pde, oldpde,
+ lockp);
+ return (false);
+ }
+ mpte = pmap_remove_pt_page(pmap, va);
+ /* Fill the PTP with PTEs that have PG_A cleared. */
+ mpte->valid = 0;
+ } else if ((mpte = pmap_remove_pt_page(pmap, va)) == NULL) {
KASSERT((oldpde & PG_W) == 0,
("pmap_demote_pde: page table page for a wired mapping is missing"));
@@ -6067,8 +5985,8 @@ pmap_demote_pde_mpte(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
* so the direct map region is the only part of the
* kernel address space that must be handled here.
*/
- KASSERT(!in_kernel || (va >= DMAP_MIN_ADDRESS &&
- va < DMAP_MAX_ADDRESS),
+ KASSERT(!in_kernel || (va >= kva_layout.dmap_low &&
+ va < kva_layout.dmap_high),
("pmap_demote_pde: No saved mpte for va %#lx", va));
/*
@@ -6107,7 +6025,7 @@ pmap_demote_pde_mpte(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
/*
* If the PTP is not leftover from an earlier promotion or it does not
* have PG_A set in every PTE, then fill it. The new PTEs will all
- * have PG_A set.
+ * have PG_A set, unless this is a wired mapping with PG_A clear.
*/
if (!vm_page_all_valid(mpte))
pmap_fill_ptp(firstpte, newpte);
@@ -6165,8 +6083,7 @@ pmap_demote_pde_mpte(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
* pmap_remove_kernel_pde: Remove a kernel superpage mapping.
*/
static void
-pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
- bool remove_pt)
+pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
{
pd_entry_t newpde;
vm_paddr_t mptepa;
@@ -6174,12 +6091,8 @@ pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap));
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
- if (remove_pt)
- mpte = pmap_remove_pt_page(pmap, va);
- else
- mpte = vm_radix_lookup(&pmap->pm_root, pmap_pde_pindex(va));
- if (mpte == NULL)
- panic("pmap_remove_kernel_pde: Missing pt page.");
+ mpte = pmap_remove_pt_page(pmap, va);
+ KASSERT(mpte != NULL, ("pmap_remove_kernel_pde: missing pt page"));
mptepa = VM_PAGE_TO_PHYS(mpte);
newpde = mptepa | X86_PG_M | X86_PG_A | X86_PG_RW | X86_PG_V;
@@ -6209,7 +6122,7 @@ pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
* pmap_remove_pde: do the things to unmap a superpage in a process
*/
static int
-pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, bool remove_pt,
+pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, bool demote_kpde,
struct spglist *free, struct rwlock **lockp)
{
struct md_page *pvh;
@@ -6249,9 +6162,7 @@ pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, bool remove_pt,
pmap_delayed_invl_page(m);
}
}
- if (pmap == kernel_pmap) {
- pmap_remove_kernel_pde(pmap, pdq, sva, remove_pt);
- } else {
+ if (pmap != kernel_pmap) {
mpte = pmap_remove_pt_page(pmap, sva);
if (mpte != NULL) {
KASSERT(vm_page_any_valid(mpte),
@@ -6262,6 +6173,14 @@ pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, bool remove_pt,
mpte->ref_count = 0;
pmap_add_delayed_free_list(mpte, free, false);
}
+ } else if (demote_kpde) {
+ pmap_remove_kernel_pde(pmap, pdq, sva);
+ } else {
+ mpte = vm_radix_lookup(&pmap->pm_root, pmap_pde_pindex(sva));
+ if (vm_page_any_valid(mpte)) {
+ mpte->valid = 0;
+ pmap_zero_page(mpte);
+ }
}
return (pmap_unuse_pt(pmap, sva, *pmap_pdpe(pmap, sva), free));
}
@@ -7183,7 +7102,7 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
PG_RW = pmap_rw_bit(pmap);
va = trunc_page(va);
- KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig"));
+ KASSERT(va <= kva_layout.km_high, ("pmap_enter: toobig"));
KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS,
("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)",
va));
@@ -7512,6 +7431,9 @@ pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags,
PG_RW = pmap_rw_bit(pmap);
KASSERT((newpde & (pmap_modified_bit(pmap) | PG_RW)) != PG_RW,
("pmap_enter_pde: newpde is missing PG_M"));
+ KASSERT((flags & (PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM)) !=
+ PMAP_ENTER_NORECLAIM,
+ ("pmap_enter_pde: flags is missing PMAP_ENTER_NOREPLACE"));
PG_V = pmap_valid_bit(pmap);
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
@@ -7573,8 +7495,8 @@ pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags,
* the mapping is not from kernel_pmap, then
* a reserved PT page could be freed.
*/
- (void)pmap_remove_pde(pmap, pde, va,
- pmap != kernel_pmap, &free, lockp);
+ (void)pmap_remove_pde(pmap, pde, va, false, &free,
+ lockp);
if ((oldpde & PG_G) == 0)
pmap_invalidate_pde_page(pmap, va, oldpde);
} else {
@@ -7584,10 +7506,9 @@ pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags,
* before any changes to mappings are
* made. Abort on failure.
*/
- mt = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
- if (pmap_insert_pt_page(pmap, mt, false, false)) {
- if (pdpg != NULL)
- pdpg->ref_count--;
+ mt = PHYS_TO_VM_PAGE(oldpde & PG_FRAME);
+ if (pmap_insert_pt_page(pmap, mt, false,
+ false)) {
CTR1(KTR_PMAP,
"pmap_enter_pde: cannot ins kern ptp va %#lx",
va);
@@ -7641,6 +7562,14 @@ pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags,
if (!pmap_pv_insert_pde(pmap, va, newpde, flags, lockp)) {
if (pdpg != NULL)
pmap_abort_ptp(pmap, va, pdpg);
+ else {
+ KASSERT(va >= VM_MAXUSER_ADDRESS &&
+ (*pde & (PG_PS | PG_V)) == PG_V,
+ ("pmap_enter_pde: invalid kernel PDE"));
+ mt = pmap_remove_pt_page(pmap, va);
+ KASSERT(mt != NULL,
+ ("pmap_enter_pde: missing kernel PTP"));
+ }
if (uwptpg != NULL) {
mt = pmap_remove_pt_page(pmap, va);
KASSERT(mt == uwptpg,
@@ -9550,7 +9479,7 @@ pmap_unmapdev(void *p, vm_size_t size)
va = (vm_offset_t)p;
/* If we gave a direct map region in pmap_mapdev, do nothing */
- if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS)
+ if (va >= kva_layout.dmap_low && va < kva_layout.dmap_high)
return;
offset = va & PAGE_MASK;
size = round_page(offset + size);
@@ -9649,6 +9578,8 @@ pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, vm_offset_t va, vm_page_t m)
void
pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
{
+ if (m->md.pat_mode == ma)
+ return;
m->md.pat_mode = ma;
@@ -9668,6 +9599,9 @@ pmap_page_set_memattr_noflush(vm_page_t m, vm_memattr_t ma)
{
int error;
+ if (m->md.pat_mode == ma)
+ return;
+
m->md.pat_mode = ma;
if ((m->flags & PG_FICTITIOUS) != 0)
@@ -9724,7 +9658,7 @@ pmap_change_prot(vm_offset_t va, vm_size_t size, vm_prot_t prot)
int error;
/* Only supported within the kernel map. */
- if (va < VM_MIN_KERNEL_ADDRESS)
+ if (va < kva_layout.km_low)
return (EINVAL);
PMAP_LOCK(kernel_pmap);
@@ -9755,7 +9689,7 @@ pmap_change_props_locked(vm_offset_t va, vm_size_t size, vm_prot_t prot,
* Only supported on kernel virtual addresses, including the direct
* map but excluding the recursive map.
*/
- if (base < DMAP_MIN_ADDRESS)
+ if (base < kva_layout.dmap_low)
return (EINVAL);
/*
@@ -9778,7 +9712,7 @@ pmap_change_props_locked(vm_offset_t va, vm_size_t size, vm_prot_t prot,
pte_bits |= X86_PG_RW;
}
if ((prot & VM_PROT_EXECUTE) == 0 ||
- va < VM_MIN_KERNEL_ADDRESS) {
+ va < kva_layout.km_low) {
pde_bits |= pg_nx;
pte_bits |= pg_nx;
}
@@ -9874,7 +9808,7 @@ pmap_change_props_locked(vm_offset_t va, vm_size_t size, vm_prot_t prot,
pmap_pte_props(pdpe, pde_bits, pde_mask);
changed = true;
}
- if (tmpva >= VM_MIN_KERNEL_ADDRESS &&
+ if (tmpva >= kva_layout.km_low &&
(*pdpe & PG_PS_FRAME) < dmaplimit) {
if (pa_start == pa_end) {
/* Start physical address run. */
@@ -9904,7 +9838,7 @@ pmap_change_props_locked(vm_offset_t va, vm_size_t size, vm_prot_t prot,
pmap_pte_props(pde, pde_bits, pde_mask);
changed = true;
}
- if (tmpva >= VM_MIN_KERNEL_ADDRESS &&
+ if (tmpva >= kva_layout.km_low &&
(*pde & PG_PS_FRAME) < dmaplimit) {
if (pa_start == pa_end) {
/* Start physical address run. */
@@ -9932,7 +9866,7 @@ pmap_change_props_locked(vm_offset_t va, vm_size_t size, vm_prot_t prot,
pmap_pte_props(pte, pte_bits, pte_mask);
changed = true;
}
- if (tmpva >= VM_MIN_KERNEL_ADDRESS &&
+ if (tmpva >= kva_layout.km_low &&
(*pte & PG_FRAME) < dmaplimit) {
if (pa_start == pa_end) {
/* Start physical address run. */
@@ -10280,17 +10214,9 @@ pmap_activate_sw(struct thread *td)
return;
}
cpuid = PCPU_GET(cpuid);
-#ifdef SMP
CPU_SET_ATOMIC(cpuid, &pmap->pm_active);
-#else
- CPU_SET(cpuid, &pmap->pm_active);
-#endif
pmap_activate_sw_mode(td, pmap, cpuid);
-#ifdef SMP
CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active);
-#else
- CPU_CLR(cpuid, &oldpmap->pm_active);
-#endif
}
void
@@ -10331,11 +10257,7 @@ pmap_activate_boot(pmap_t pmap)
MPASS(pmap != kernel_pmap);
cpuid = PCPU_GET(cpuid);
-#ifdef SMP
CPU_SET_ATOMIC(cpuid, &pmap->pm_active);
-#else
- CPU_SET(cpuid, &pmap->pm_active);
-#endif
PCPU_SET(curpmap, pmap);
if (pti) {
kcr3 = pmap->pm_cr3;
@@ -10699,19 +10621,28 @@ pmap_large_map_getptp(void)
static pdp_entry_t *
pmap_large_map_pdpe(vm_offset_t va)
{
+ pml4_entry_t *pml4;
vm_pindex_t pml4_idx;
vm_paddr_t mphys;
- pml4_idx = pmap_pml4e_index(va);
- KASSERT(LMSPML4I <= pml4_idx && pml4_idx < LMSPML4I + lm_ents,
- ("pmap_large_map_pdpe: va %#jx out of range idx %#jx LMSPML4I "
- "%#jx lm_ents %d",
- (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents));
- KASSERT((kernel_pml4[pml4_idx] & X86_PG_V) != 0,
- ("pmap_large_map_pdpe: invalid pml4 for va %#jx idx %#jx "
- "LMSPML4I %#jx lm_ents %d",
- (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents));
- mphys = kernel_pml4[pml4_idx] & PG_FRAME;
+ KASSERT(va >= kva_layout.lm_low && va < kva_layout.lm_low +
+ (vm_offset_t)NBPML4 * lm_ents, ("va %#lx not in large map", va));
+ if (la57) {
+ pml4 = pmap_pml4e(kernel_pmap, va);
+ mphys = *pml4 & PG_FRAME;
+ } else {
+ pml4_idx = pmap_pml4e_index(va);
+
+ KASSERT(LMSPML4I <= pml4_idx && pml4_idx < LMSPML4I + lm_ents,
+ ("pmap_large_map_pdpe: va %#jx out of range idx %#jx "
+ "LMSPML4I %#jx lm_ents %d",
+ (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents));
+ KASSERT((kernel_pml4[pml4_idx] & X86_PG_V) != 0,
+ ("pmap_large_map_pdpe: invalid pml4 for va %#jx idx %#jx "
+ "LMSPML4I %#jx lm_ents %d",
+ (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents));
+ mphys = kernel_pml4[pml4_idx] & PG_FRAME;
+ }
return ((pdp_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pdpe_index(va));
}
@@ -10904,8 +10835,8 @@ pmap_large_unmap(void *svaa, vm_size_t len)
struct spglist spgf;
sva = (vm_offset_t)svaa;
- if (len == 0 || sva + len < sva || (sva >= DMAP_MIN_ADDRESS &&
- sva + len <= DMAP_MIN_ADDRESS + dmaplimit))
+ if (len == 0 || sva + len < sva || (sva >= kva_layout.dmap_low &&
+ sva + len < kva_layout.dmap_high))
return;
SLIST_INIT(&spgf);
@@ -11151,11 +11082,10 @@ pmap_large_map_wb(void *svap, vm_size_t len)
sva = (vm_offset_t)svap;
eva = sva + len;
pmap_large_map_wb_fence();
- if (sva >= DMAP_MIN_ADDRESS && eva <= DMAP_MIN_ADDRESS + dmaplimit) {
+ if (sva >= kva_layout.dmap_low && eva < kva_layout.dmap_high) {
pmap_large_map_flush_range(sva, len);
} else {
- KASSERT(sva >= LARGEMAP_MIN_ADDRESS &&
- eva <= LARGEMAP_MIN_ADDRESS + lm_ents * NBPML4,
+ KASSERT(sva >= kva_layout.lm_low && eva < kva_layout.lm_high,
("pmap_large_map_wb: not largemap %#lx %#lx", sva, len));
pmap_large_map_wb_large(sva, eva);
}
@@ -11196,8 +11126,8 @@ pmap_pti_init(void)
VM_OBJECT_WLOCK(pti_obj);
pml4_pg = pmap_pti_alloc_page();
pti_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4_pg));
- for (va = VM_MIN_KERNEL_ADDRESS; va <= VM_MAX_KERNEL_ADDRESS &&
- va >= VM_MIN_KERNEL_ADDRESS && va > NBPML4; va += NBPML4) {
+ for (va = kva_layout.km_low; va <= kva_layout.km_high &&
+ va >= kva_layout.km_low && va > NBPML4; va += NBPML4) {
pdpe = pmap_pti_pdpe(va);
pmap_pti_wire_pte(pdpe);
}
@@ -11971,9 +11901,7 @@ sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range,
mode, range->pdpes, range->pdes, range->ptes);
/* Reset to sentinel value. */
- range->sva = la57 ? KV5ADDR(NPML5EPG - 1, NPML4EPG - 1, NPDPEPG - 1,
- NPDEPG - 1, NPTEPG - 1) : KV4ADDR(NPML4EPG - 1, NPDPEPG - 1,
- NPDEPG - 1, NPTEPG - 1);
+ range->sva = kva_layout.kva_max;
}
/*
@@ -12014,12 +11942,18 @@ sysctl_kmaps_reinit(struct pmap_kernel_map_range *range, vm_offset_t va,
*/
static void
sysctl_kmaps_check(struct sbuf *sb, struct pmap_kernel_map_range *range,
- vm_offset_t va, pml4_entry_t pml4e, pdp_entry_t pdpe, pd_entry_t pde,
- pt_entry_t pte)
+ vm_offset_t va, pml5_entry_t pml5e, pml4_entry_t pml4e, pdp_entry_t pdpe,
+ pd_entry_t pde, pt_entry_t pte)
{
pt_entry_t attrs;
- attrs = pml4e & (X86_PG_RW | X86_PG_U | pg_nx);
+ if (la57) {
+ attrs = pml5e & (X86_PG_RW | X86_PG_U | pg_nx);
+ attrs |= pml4e & pg_nx;
+ attrs &= pg_nx | (pml4e & (X86_PG_RW | X86_PG_U));
+ } else {
+ attrs = pml4e & (X86_PG_RW | X86_PG_U | pg_nx);
+ }
attrs |= pdpe & pg_nx;
attrs &= pg_nx | (pdpe & (X86_PG_RW | X86_PG_U));
@@ -12052,13 +11986,15 @@ sysctl_kmaps(SYSCTL_HANDLER_ARGS)
{
struct pmap_kernel_map_range range;
struct sbuf sbuf, *sb;
+ pml5_entry_t pml5e;
pml4_entry_t pml4e;
pdp_entry_t *pdp, pdpe;
pd_entry_t *pd, pde;
pt_entry_t *pt, pte;
vm_offset_t sva;
vm_paddr_t pa;
- int error, i, j, k, l;
+ int error, j, k, l;
+ bool first;
error = sysctl_wire_old_buffer(req, 0);
if (error != 0)
@@ -12067,9 +12003,8 @@ sysctl_kmaps(SYSCTL_HANDLER_ARGS)
sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req);
/* Sentinel value. */
- range.sva = la57 ? KV5ADDR(NPML5EPG - 1, NPML4EPG - 1, NPDPEPG - 1,
- NPDEPG - 1, NPTEPG - 1) : KV4ADDR(NPML4EPG - 1, NPDPEPG - 1,
- NPDEPG - 1, NPTEPG - 1);
+ range.sva = kva_layout.kva_max;
+ pml5e = 0; /* no UB for la48 */
/*
* Iterate over the kernel page tables without holding the kernel pmap
@@ -12078,41 +12013,50 @@ sysctl_kmaps(SYSCTL_HANDLER_ARGS)
* Within the large map, ensure that PDP and PD page addresses are
* valid before descending.
*/
- for (sva = 0, i = pmap_pml4e_index(sva); i < NPML4EPG; i++) {
- switch (i) {
- case PML4PML4I:
+ for (first = true, sva = 0; sva != 0 || first; first = false) {
+ if (sva == kva_layout.rec_pt)
sbuf_printf(sb, "\nRecursive map:\n");
- break;
- case DMPML4I:
+ else if (sva == kva_layout.dmap_low)
sbuf_printf(sb, "\nDirect map:\n");
- break;
#ifdef KASAN
- case KASANPML4I:
+ else if (sva == kva_layout.kasan_shadow_low)
sbuf_printf(sb, "\nKASAN shadow map:\n");
- break;
#endif
#ifdef KMSAN
- case KMSANSHADPML4I:
+ else if (sva == kva_layout.kmsan_shadow_low)
sbuf_printf(sb, "\nKMSAN shadow map:\n");
- break;
- case KMSANORIGPML4I:
+ else if (sva == kva_layout.kmsan_origin_low)
sbuf_printf(sb, "\nKMSAN origin map:\n");
- break;
#endif
- case KPML4BASE:
+ else if (sva == kva_layout.km_low)
sbuf_printf(sb, "\nKernel map:\n");
- break;
- case LMSPML4I:
+ else if (sva == kva_layout.lm_low)
sbuf_printf(sb, "\nLarge map:\n");
- break;
- }
/* Convert to canonical form. */
- if (sva == 1ul << 47)
- sva |= -1ul << 48;
+ if (la57) {
+ if (sva == 1ul << 56) {
+ sva |= -1ul << 57;
+ continue;
+ }
+ } else {
+ if (sva == 1ul << 47) {
+ sva |= -1ul << 48;
+ continue;
+ }
+ }
restart:
- pml4e = kernel_pml4[i];
+ if (la57) {
+ pml5e = *pmap_pml5e(kernel_pmap, sva);
+ if ((pml5e & X86_PG_V) == 0) {
+ sva = rounddown2(sva, NBPML5);
+ sysctl_kmaps_dump(sb, &range, sva);
+ sva += NBPML5;
+ continue;
+ }
+ }
+ pml4e = *pmap_pml4e(kernel_pmap, sva);
if ((pml4e & X86_PG_V) == 0) {
sva = rounddown2(sva, NBPML4);
sysctl_kmaps_dump(sb, &range, sva);
@@ -12133,8 +12077,8 @@ restart:
pa = pdpe & PG_FRAME;
if ((pdpe & PG_PS) != 0) {
sva = rounddown2(sva, NBPDP);
- sysctl_kmaps_check(sb, &range, sva, pml4e, pdpe,
- 0, 0);
+ sysctl_kmaps_check(sb, &range, sva, pml5e,
+ pml4e, pdpe, 0, 0);
range.pdpes++;
sva += NBPDP;
continue;
@@ -12146,6 +12090,7 @@ restart:
* freed. Validate the next-level address
* before descending.
*/
+ sva += NBPDP;
goto restart;
}
pd = (pd_entry_t *)PHYS_TO_DMAP(pa);
@@ -12162,7 +12107,7 @@ restart:
if ((pde & PG_PS) != 0) {
sva = rounddown2(sva, NBPDR);
sysctl_kmaps_check(sb, &range, sva,
- pml4e, pdpe, pde, 0);
+ pml5e, pml4e, pdpe, pde, 0);
range.pdes++;
sva += NBPDR;
continue;
@@ -12174,6 +12119,7 @@ restart:
* may be freed. Validate the
* next-level address before descending.
*/
+ sva += NBPDR;
goto restart;
}
pt = (pt_entry_t *)PHYS_TO_DMAP(pa);
@@ -12187,7 +12133,7 @@ restart:
continue;
}
sysctl_kmaps_check(sb, &range, sva,
- pml4e, pdpe, pde, pte);
+ pml5e, pml4e, pdpe, pde, pte);
range.ptes++;
}
}
diff --git a/sys/amd64/amd64/support.S b/sys/amd64/amd64/support.S
index c95696bbe7ef..870cd255abb7 100644
--- a/sys/amd64/amd64/support.S
+++ b/sys/amd64/amd64/support.S
@@ -934,10 +934,7 @@ ENTRY(casueword32_nosmap)
ja fusufault
movl %esi,%eax /* old */
-#ifdef SMP
- lock
-#endif
- cmpxchgl %ecx,(%rdi) /* new = %ecx */
+ lock cmpxchgl %ecx,(%rdi) /* new = %ecx */
setne %cl
/*
@@ -971,10 +968,7 @@ ENTRY(casueword32_smap)
movl %esi,%eax /* old */
stac
-#ifdef SMP
- lock
-#endif
- cmpxchgl %ecx,(%rdi) /* new = %ecx */
+ lock cmpxchgl %ecx,(%rdi) /* new = %ecx */
clac
setne %cl
@@ -1014,10 +1008,7 @@ ENTRY(casueword_nosmap)
ja fusufault
movq %rsi,%rax /* old */
-#ifdef SMP
- lock
-#endif
- cmpxchgq %rcx,(%rdi) /* new = %rcx */
+ lock cmpxchgq %rcx,(%rdi) /* new = %rcx */
setne %cl
/*
@@ -1045,10 +1036,7 @@ ENTRY(casueword_smap)
movq %rsi,%rax /* old */
stac
-#ifdef SMP
- lock
-#endif
- cmpxchgq %rcx,(%rdi) /* new = %rcx */
+ lock cmpxchgq %rcx,(%rdi) /* new = %rcx */
clac
setne %cl
diff --git a/sys/amd64/amd64/trap.c b/sys/amd64/amd64/trap.c
index 09ac0a67dbef..f3469ed5e2bc 100644
--- a/sys/amd64/amd64/trap.c
+++ b/sys/amd64/amd64/trap.c
@@ -37,7 +37,6 @@
* SUCH DAMAGE.
*/
-#include <sys/cdefs.h>
/*
* AMD64 Trap and System call handling
*/
@@ -87,9 +86,7 @@ PMC_SOFT_DEFINE( , , page_fault, write);
#include <x86/mca.h>
#include <machine/md_var.h>
#include <machine/pcb.h>
-#ifdef SMP
#include <machine/smp.h>
-#endif
#include <machine/stack.h>
#include <machine/trap.h>
#include <machine/tss.h>
@@ -769,7 +766,7 @@ trap_pfault(struct trapframe *frame, bool usermode, int *signo, int *ucode)
return (-1);
}
}
- if (eva >= VM_MIN_KERNEL_ADDRESS) {
+ if (eva >= kva_layout.km_low) {
/*
* Don't allow user-mode faults in kernel address space.
*/
@@ -900,11 +897,9 @@ trap_diag(struct trapframe *frame, vm_offset_t eva)
printf("\n\nFatal trap %d: %s while in %s mode\n", type,
type < nitems(trap_msg) ? trap_msg[type] : UNKNOWN,
TRAPF_USERMODE(frame) ? "user" : "kernel");
-#ifdef SMP
- /* two separate prints in case of a trap on an unmapped page */
- printf("cpuid = %d; ", PCPU_GET(cpuid));
- printf("apic id = %02x\n", PCPU_GET(apic_id));
-#endif
+ /* Print these separately in case pcpu accesses trap. */
+ printf("cpuid = %d; apic id = %02x\n", PCPU_GET(cpuid),
+ PCPU_GET(apic_id));
if (type == T_PAGEFLT) {
printf("fault virtual address = 0x%lx\n", eva);
printf("fault code = %s %s %s%s%s, %s\n",
@@ -1025,11 +1020,9 @@ dblfault_handler(struct trapframe *frame)
frame->tf_cs, frame->tf_ss, frame->tf_ds, frame->tf_es,
frame->tf_fs, frame->tf_gs,
rdmsr(MSR_FSBASE), rdmsr(MSR_GSBASE), rdmsr(MSR_KGSBASE));
-#ifdef SMP
- /* two separate prints in case of a trap on an unmapped page */
- printf("cpuid = %d; ", PCPU_GET(cpuid));
- printf("apic id = %02x\n", PCPU_GET(apic_id));
-#endif
+ /* Print these separately in case pcpu accesses trap. */
+ printf("cpuid = %d; apic id = %02x\n", PCPU_GET(cpuid),
+ PCPU_GET(apic_id));
panic("double fault");
}
diff --git a/sys/amd64/conf/MINIMALUP b/sys/amd64/conf/MINIMALUP
deleted file mode 100644
index 0dbddbe5b341..000000000000
--- a/sys/amd64/conf/MINIMALUP
+++ /dev/null
@@ -1,4 +0,0 @@
-include MINIMAL
-ident MINIMALUP
-nooptions SMP
-nooptions NUMA
diff --git a/sys/amd64/include/param.h b/sys/amd64/include/param.h
index 8db314fa034d..5a9c3162e14c 100644
--- a/sys/amd64/include/param.h
+++ b/sys/amd64/include/param.h
@@ -146,11 +146,10 @@
#define amd64_btop(x) ((unsigned long)(x) >> PAGE_SHIFT)
#define amd64_ptob(x) ((unsigned long)(x) << PAGE_SHIFT)
-#define INKERNEL(va) (((va) >= DMAP_MIN_ADDRESS && (va) < DMAP_MAX_ADDRESS) \
- || ((va) >= VM_MIN_KERNEL_ADDRESS && (va) < VM_MAX_KERNEL_ADDRESS))
+#define INKERNEL(va) \
+ (((va) >= kva_layout.dmap_low && (va) < kva_layout.dmap_high) || \
+ ((va) >= kva_layout.km_low && (va) < kva_layout.km_high))
-#ifdef SMP
#define SC_TABLESIZE 1024 /* Must be power of 2. */
-#endif
#endif /* !_AMD64_INCLUDE_PARAM_H_ */
diff --git a/sys/amd64/include/pmap.h b/sys/amd64/include/pmap.h
index 7d3e91bcd9b9..e2f97442c10f 100644
--- a/sys/amd64/include/pmap.h
+++ b/sys/amd64/include/pmap.h
@@ -169,11 +169,12 @@
* the recursive page table map.
*/
#define NDMPML4E 8
+#define NDMPML5E 32
/*
- * These values control the layout of virtual memory. The starting address
- * of the direct map, which is controlled by DMPML4I, must be a multiple of
- * its size. (See the PHYS_TO_DMAP() and DMAP_TO_PHYS() macros.)
+ * These values control the layout of virtual memory. The starting
+ * address of the direct map is controlled by DMPML4I on LA48 and
+ * DMPML5I on LA57.
*
* Note: KPML4I is the index of the (single) level 4 page that maps
* the KVA that holds KERNBASE, while KPML4BASE is the index of the
@@ -191,6 +192,7 @@
#define KPML4BASE (NPML4EPG-NKPML4E) /* KVM at highest addresses */
#define DMPML4I rounddown(KPML4BASE-NDMPML4E, NDMPML4E) /* Below KVM */
+#define DMPML5I (NPML5EPG / 2 + 1)
#define KPML4I (NPML4EPG-1)
#define KPDPI (NPDPEPG-2) /* kernbase at -2GB */
@@ -200,9 +202,14 @@
#define KMSANSHADPML4I (KPML4BASE - NKMSANSHADPML4E)
#define KMSANORIGPML4I (DMPML4I - NKMSANORIGPML4E)
-/* Large map: index of the first and max last pml4 entry */
+/*
+ * Large map: index of the first and max last pml4/la48 and pml5/la57
+ * entry.
+ */
#define LMSPML4I (PML4PML4I + 1)
#define LMEPML4I (KASANPML4I - 1)
+#define LMSPML5I (DMPML5I + NDMPML5E)
+#define LMEPML5I (LMSPML5I + 32 - 1) /* 32 slots for large map */
/*
* XXX doesn't really belong here I guess...
@@ -548,6 +555,25 @@ pmap_pml5e_index(vm_offset_t va)
return ((va >> PML5SHIFT) & ((1ul << NPML5EPGSHIFT) - 1));
}
+struct kva_layout_s {
+ vm_offset_t kva_min;
+ vm_offset_t kva_max;
+ vm_offset_t dmap_low; /* DMAP_MIN_ADDRESS */
+ vm_offset_t dmap_high; /* DMAP_MAX_ADDRESS */
+ vm_offset_t lm_low; /* LARGEMAP_MIN_ADDRESS */
+ vm_offset_t lm_high; /* LARGEMAP_MAX_ADDRESS */
+ vm_offset_t km_low; /* VM_MIN_KERNEL_ADDRESS */
+ vm_offset_t km_high; /* VM_MAX_KERNEL_ADDRESS */
+ vm_offset_t rec_pt;
+ vm_offset_t kasan_shadow_low; /* KASAN_MIN_ADDRESS */
+ vm_offset_t kasan_shadow_high; /* KASAN_MAX_ADDRESS */
+ vm_offset_t kmsan_shadow_low; /* KMSAN_SHAD_MIN_ADDRESS */
+ vm_offset_t kmsan_shadow_high; /* KMSAN_SHAD_MAX_ADDRESS */
+ vm_offset_t kmsan_origin_low; /* KMSAN_ORIG_MIN_ADDRESS */
+ vm_offset_t kmsan_origin_high; /* KMSAN_ORIG_MAX_ADDRESS */
+};
+extern struct kva_layout_s kva_layout;
+
#endif /* !LOCORE */
#endif /* !_MACHINE_PMAP_H_ */
diff --git a/sys/amd64/include/smp.h b/sys/amd64/include/smp.h
index 26eb227211da..bff92570ff82 100644
--- a/sys/amd64/include/smp.h
+++ b/sys/amd64/include/smp.h
@@ -13,8 +13,6 @@
#ifdef _KERNEL
-#ifdef SMP
-
#ifndef LOCORE
#include <x86/x86_smp.h>
@@ -39,7 +37,6 @@ void invlop_handler(void);
int start_all_aps(void);
#endif /* !LOCORE */
-#endif /* SMP */
#endif /* _KERNEL */
#endif /* _MACHINE_SMP_H_ */
diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h
index a9c73b75213b..0b3daed4f69e 100644
--- a/sys/amd64/include/vmm.h
+++ b/sys/amd64/include/vmm.h
@@ -649,6 +649,8 @@ struct vm_inout_str {
int addrsize;
enum vm_reg_name seg_name;
struct seg_desc seg_desc;
+ int cs_d;
+ uint64_t cs_base;
};
enum task_switch_reason {
diff --git a/sys/amd64/include/vmm_dev.h b/sys/amd64/include/vmm_dev.h
index 1f86538ce5f3..441330fd57b8 100644
--- a/sys/amd64/include/vmm_dev.h
+++ b/sys/amd64/include/vmm_dev.h
@@ -29,6 +29,8 @@
#ifndef _VMM_DEV_H_
#define _VMM_DEV_H_
+#include <sys/domainset.h>
+
#include <machine/vmm.h>
#include <machine/vmm_snapshot.h>
@@ -52,7 +54,10 @@ struct vm_munmap {
struct vm_memseg {
int segid;
size_t len;
- char name[VM_MAX_SUFFIXLEN + 1];
+ char name[VM_MAX_SUFFIXLEN + 1];
+ domainset_t *ds_mask;
+ size_t ds_mask_size;
+ int ds_policy;
};
struct vm_register {
diff --git a/sys/amd64/include/vmm_instruction_emul.h b/sys/amd64/include/vmm_instruction_emul.h
index d5f0363cfb41..1fb0f97682a7 100644
--- a/sys/amd64/include/vmm_instruction_emul.h
+++ b/sys/amd64/include/vmm_instruction_emul.h
@@ -31,6 +31,31 @@
#include <sys/mman.h>
+/* struct vie_op.op_type */
+enum {
+ VIE_OP_TYPE_NONE = 0,
+ VIE_OP_TYPE_MOV,
+ VIE_OP_TYPE_MOVSX,
+ VIE_OP_TYPE_MOVZX,
+ VIE_OP_TYPE_AND,
+ VIE_OP_TYPE_OR,
+ VIE_OP_TYPE_SUB,
+ VIE_OP_TYPE_TWO_BYTE,
+ VIE_OP_TYPE_PUSH,
+ VIE_OP_TYPE_CMP,
+ VIE_OP_TYPE_POP,
+ VIE_OP_TYPE_MOVS,
+ VIE_OP_TYPE_GROUP1,
+ VIE_OP_TYPE_STOS,
+ VIE_OP_TYPE_BITTEST,
+ VIE_OP_TYPE_TWOB_GRP15,
+ VIE_OP_TYPE_ADD,
+ VIE_OP_TYPE_TEST,
+ VIE_OP_TYPE_BEXTR,
+ VIE_OP_TYPE_OUTS,
+ VIE_OP_TYPE_LAST
+};
+
/*
* Callback functions to read and write memory regions.
*/
diff --git a/sys/amd64/include/vmparam.h b/sys/amd64/include/vmparam.h
index 0cd9bb4fa7a4..d2ac3c6648b2 100644
--- a/sys/amd64/include/vmparam.h
+++ b/sys/amd64/include/vmparam.h
@@ -163,6 +163,7 @@
* Virtual addresses of things. Derived from the page directory and
* page table indexes from pmap.h for precision.
*
+ * LA48:
* 0x0000000000000000 - 0x00007fffffffffff user map
* 0x0000800000000000 - 0xffff7fffffffffff does not exist (hole)
* 0xffff800000000000 - 0xffff804020100fff recursive page table (512GB slot)
@@ -175,32 +176,38 @@
* 0xfffffc0000000000 - 0xfffffdffffffffff 2TB KMSAN shadow map, optional
* 0xfffffe0000000000 - 0xffffffffffffffff 2TB kernel map
*
+ * LA57:
+ * 0x0000000000000000 - 0x00ffffffffffffff user map
+ * 0x0100000000000000 - 0xf0ffffffffffffff does not exist (hole)
+ * 0xff00000000000000 - 0xff00ffffffffffff recursive page table (2048TB slot)
+ * 0xff01000000000000 - 0xff20ffffffffffff direct map (32 x 2048TB slots)
+ * 0xff21000000000000 - 0xff40ffffffffffff large map
+ * 0xff41000000000000 - 0xffff7fffffffffff unused
+ * 0xffff800000000000 - 0xfffff5ffffffffff unused (start of kernel pml4 entry)
+ * 0xfffff60000000000 - 0xfffff7ffffffffff 2TB KMSAN origin map, optional
+ * 0xfffff78000000000 - 0xfffff7bfffffffff 512GB KASAN shadow map, optional
+ * 0xfffff80000000000 - 0xfffffbffffffffff 4TB unused
+ * 0xfffffc0000000000 - 0xfffffdffffffffff 2TB KMSAN shadow map, optional
+ * 0xfffffe0000000000 - 0xffffffffffffffff 2TB kernel map
+ *
* Within the kernel map:
*
* 0xfffffe0000000000 vm_page_array
* 0xffffffff80000000 KERNBASE
*/
-#define VM_MIN_KERNEL_ADDRESS KV4ADDR(KPML4BASE, 0, 0, 0)
-#define VM_MAX_KERNEL_ADDRESS KV4ADDR(KPML4BASE + NKPML4E - 1, \
- NPDPEPG-1, NPDEPG-1, NPTEPG-1)
-
-#define DMAP_MIN_ADDRESS KV4ADDR(DMPML4I, 0, 0, 0)
-#define DMAP_MAX_ADDRESS KV4ADDR(DMPML4I + NDMPML4E, 0, 0, 0)
-
-#define KASAN_MIN_ADDRESS KV4ADDR(KASANPML4I, 0, 0, 0)
-#define KASAN_MAX_ADDRESS KV4ADDR(KASANPML4I + NKASANPML4E, 0, 0, 0)
+#define VM_MIN_KERNEL_ADDRESS_LA48 KV4ADDR(KPML4BASE, 0, 0, 0)
+#define VM_MIN_KERNEL_ADDRESS kva_layout.km_low
+#define VM_MAX_KERNEL_ADDRESS kva_layout.km_high
-#define KMSAN_SHAD_MIN_ADDRESS KV4ADDR(KMSANSHADPML4I, 0, 0, 0)
-#define KMSAN_SHAD_MAX_ADDRESS KV4ADDR(KMSANSHADPML4I + NKMSANSHADPML4E, \
- 0, 0, 0)
+#define KASAN_MIN_ADDRESS (kva_layout.kasan_shadow_low)
+#define KASAN_MAX_ADDRESS (kva_layout.kasan_shadow_high)
-#define KMSAN_ORIG_MIN_ADDRESS KV4ADDR(KMSANORIGPML4I, 0, 0, 0)
-#define KMSAN_ORIG_MAX_ADDRESS KV4ADDR(KMSANORIGPML4I + NKMSANORIGPML4E, \
- 0, 0, 0)
+#define KMSAN_SHAD_MIN_ADDRESS (kva_layout.kmsan_shadow_low)
+#define KMSAN_SHAD_MAX_ADDRESS (kva_layout.kmsan_shadow_high)
-#define LARGEMAP_MIN_ADDRESS KV4ADDR(LMSPML4I, 0, 0, 0)
-#define LARGEMAP_MAX_ADDRESS KV4ADDR(LMEPML4I + 1, 0, 0, 0)
+#define KMSAN_ORIG_MIN_ADDRESS (kva_layout.kmsan_origin_low)
+#define KMSAN_ORIG_MAX_ADDRESS (kva_layout.kmsan_origin_high)
/*
* Formally kernel mapping starts at KERNBASE, but kernel linker
@@ -239,21 +246,21 @@
* vt fb startup needs to be reworked.
*/
#define PHYS_IN_DMAP(pa) (dmaplimit == 0 || (pa) < dmaplimit)
-#define VIRT_IN_DMAP(va) ((va) >= DMAP_MIN_ADDRESS && \
- (va) < (DMAP_MIN_ADDRESS + dmaplimit))
+#define VIRT_IN_DMAP(va) \
+ ((va) >= kva_layout.dmap_low && (va) < kva_layout.dmap_low + dmaplimit)
#define PMAP_HAS_DMAP 1
-#define PHYS_TO_DMAP(x) ({ \
+#define PHYS_TO_DMAP(x) __extension__ ({ \
KASSERT(PHYS_IN_DMAP(x), \
("physical address %#jx not covered by the DMAP", \
(uintmax_t)x)); \
- (x) | DMAP_MIN_ADDRESS; })
+ (x) + kva_layout.dmap_low; })
-#define DMAP_TO_PHYS(x) ({ \
+#define DMAP_TO_PHYS(x) __extension__ ({ \
KASSERT(VIRT_IN_DMAP(x), \
("virtual address %#jx not covered by the DMAP", \
(uintmax_t)x)); \
- (x) & ~DMAP_MIN_ADDRESS; })
+ (x) - kva_layout.dmap_low; })
/*
* amd64 maps the page array into KVA so that it can be more easily
@@ -274,7 +281,7 @@
*/
#ifndef VM_KMEM_SIZE_MAX
#define VM_KMEM_SIZE_MAX ((VM_MAX_KERNEL_ADDRESS - \
- VM_MIN_KERNEL_ADDRESS + 1) * 3 / 5)
+ kva_layout.km_low + 1) * 3 / 5)
#endif
/* initial pagein size of beginning of executable file */
diff --git a/sys/amd64/pt/pt.c b/sys/amd64/pt/pt.c
new file mode 100644
index 000000000000..c7b75767680a
--- /dev/null
+++ b/sys/amd64/pt/pt.c
@@ -0,0 +1,978 @@
+/*
+ * Copyright (c) 2025 Bojan Novković <bnovkov@freebsd.org>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+/*
+ * hwt(4) Intel Processor Trace (PT) backend
+ *
+ * Driver Design Overview
+ *
+ * - Since PT is configured on a per-core basis, the driver uses
+ * 'smp_rendezvous' to start and disable tracing on each target core.
+ * - PT-specific resources are stored in a 'struct pt_ctx' context structure for
+ * each traced CPU core or thread. Upon initialization, a ToPA configuration
+ * is generated for each 'pt_ctx' structure using the HWT tracing buffers.
+ * The HWT tracing buffer is split into 4K ToPA entries. Currently, each
+ * 4K ToPA entry is configured to trigger an interrupt after it is filled.
+ * - The PT driver uses the XSAVE/XRSTOR PT extensions to load and save all
+ * relevant PT registers. Every time a traced thread is switched
+ * out or in, its state will be saved to or loaded from its corresponding
+ * 'pt_ctx' context.
+ * - When tracing starts, the PT hardware will start writing data into the
+ * tracing buffer. When a TOPA_INT entry is filled, it will trigger an
+ * interrupt before continuing. The interrupt handler will then fetch the
+ * last valid tracing buffer offset and enqueue a HWT_RECORD_BUFFER record.
+ * The driver is currently configured to use the NMI interrupt line.
+ * - The userspace PT backend waits for incoming HWT_RECORD_BUFFER records
+ * and uses the offsets to decode data from the tracing buffer.
+ *
+ * Future improvements and limitations
+ *
+ * - We currently configure the PT hardware to trigger an interrupt whenever
+ * a 4K ToPA entry is filled. While this is fine when tracing smaller
+ * functions or infrequent code paths, this will generate too much interrupt
+ * traffic when tracing hotter functions. A proper solution for this issue
+ * should estimate the amount of data generated by the current configuration
+ * and use it to determine interrupt frequency.
+ *
+ * - Support for more tracing options and PT features.
+ *
+ */
+
+#include <sys/systm.h>
+#include <sys/hwt.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/mutex.h>
+#include <sys/sdt.h>
+#include <sys/smp.h>
+#include <sys/taskqueue.h>
+
+#include <vm/vm.h>
+#include <vm/vm_page.h>
+
+#include <machine/atomic.h>
+#include <machine/cpufunc.h>
+#include <machine/fpu.h>
+#include <machine/smp.h>
+#include <machine/specialreg.h>
+
+#include <x86/apicvar.h>
+#include <x86/x86_var.h>
+
+#include <dev/hwt/hwt_context.h>
+#include <dev/hwt/hwt_vm.h>
+#include <dev/hwt/hwt_backend.h>
+#include <dev/hwt/hwt_config.h>
+#include <dev/hwt/hwt_cpu.h>
+#include <dev/hwt/hwt_record.h>
+#include <dev/hwt/hwt_thread.h>
+
+#include <amd64/pt/pt.h>
+
+#ifdef PT_DEBUG
+#define dprintf(fmt, ...) printf(fmt, ##__VA_ARGS__)
+#else
+#define dprintf(fmt, ...)
+#endif
+#define PT_SUPPORTED_FLAGS \
+ (RTIT_CTL_MTCEN | RTIT_CTL_CR3FILTER | RTIT_CTL_DIS_TNT | \
+ RTIT_CTL_USER | RTIT_CTL_OS | RTIT_CTL_BRANCHEN)
+#define PT_XSAVE_MASK (XFEATURE_ENABLED_X87 | XFEATURE_ENABLED_SSE)
+#define PT_XSTATE_BV (PT_XSAVE_MASK | XFEATURE_ENABLED_PT)
+#define PT_MAX_IP_RANGES 2
+
+#define PT_TOPA_MASK_PTRS 0x7f
+#define PT_TOPA_PAGE_MASK 0xffffff80
+#define PT_TOPA_PAGE_SHIFT 7
+
+#define CPUID_PT_LEAF 0x14
+
+MALLOC_DEFINE(M_PT, "pt", "Intel Processor Trace");
+
+SDT_PROVIDER_DEFINE(pt);
+SDT_PROBE_DEFINE(pt, , , topa__intr);
+
+TASKQUEUE_FAST_DEFINE_THREAD(pt);
+
+static void pt_send_buffer_record(void *arg, int pending __unused);
+static int pt_topa_intr(struct trapframe *tf);
+
+/*
+ * Intel Processor Trace XSAVE-managed state.
+ */
+struct pt_ext_area {
+ uint64_t rtit_ctl;
+ uint64_t rtit_output_base;
+ uint64_t rtit_output_mask_ptrs;
+ uint64_t rtit_status;
+ uint64_t rtit_cr3_match;
+ uint64_t rtit_addr0_a;
+ uint64_t rtit_addr0_b;
+ uint64_t rtit_addr1_a;
+ uint64_t rtit_addr1_b;
+};
+
+struct pt_buffer {
+ uint64_t *topa_hw; /* ToPA table entries. */
+ size_t size;
+ struct mtx lock; /* Lock for fields below. */
+ vm_offset_t offset;
+ uint64_t wrap_count;
+ int curpage;
+};
+
+struct pt_ctx {
+ int id;
+ struct pt_buffer buf; /* ToPA buffer metadata */
+ struct task task; /* ToPA buffer notification task */
+ struct hwt_context *hwt_ctx;
+ uint8_t *save_area; /* PT XSAVE area */
+};
+/* PT tracing contexts used for CPU mode. */
+static struct pt_ctx *pt_pcpu_ctx;
+
+enum pt_cpu_state {
+ PT_DISABLED = 0,
+ PT_STOPPED,
+ PT_ACTIVE
+};
+
+static struct pt_cpu {
+ struct pt_ctx *ctx; /* active PT tracing context */
+ enum pt_cpu_state state; /* used as part of trace stop protocol */
+} *pt_pcpu;
+
+/*
+ * PT-related CPUID bits.
+ */
+static struct pt_cpu_info {
+ uint32_t l0_eax;
+ uint32_t l0_ebx;
+ uint32_t l0_ecx;
+ uint32_t l1_eax;
+ uint32_t l1_ebx;
+ size_t xsave_area_size;
+ size_t xstate_hdr_offset;
+ size_t pt_xsave_offset;
+} pt_info __read_mostly;
+
+static bool initialized = false;
+static int cpu_mode_ctr = 0;
+
+static __inline enum pt_cpu_state
+pt_cpu_get_state(int cpu_id)
+{
+ return (atomic_load_int(&pt_pcpu[cpu_id].state));
+}
+
+static __inline void
+pt_cpu_set_state(int cpu_id, enum pt_cpu_state state)
+{
+ atomic_store_int(&pt_pcpu[cpu_id].state, state);
+}
+
+static __inline struct xstate_hdr *
+pt_ctx_get_xstate_hdr(struct pt_ctx *ctx)
+{
+ return ((struct xstate_hdr *)(ctx->save_area +
+ pt_info.xstate_hdr_offset));
+}
+
+
+static __inline struct pt_ext_area *
+pt_ctx_get_ext_area(struct pt_ctx *ctx)
+{
+ return ((struct pt_ext_area *)(ctx->save_area +
+ pt_info.pt_xsave_offset));
+}
+
+/*
+ * Updates current trace buffer offset from the
+ * ToPA MSRs. Records if the trace buffer wrapped.
+ */
+static __inline void
+pt_update_buffer(struct pt_buffer *buf)
+{
+ uint64_t reg;
+ int curpage;
+
+ /* Update buffer offset. */
+ reg = rdmsr(MSR_IA32_RTIT_OUTPUT_MASK_PTRS);
+ curpage = (reg & PT_TOPA_PAGE_MASK) >> PT_TOPA_PAGE_SHIFT;
+ mtx_lock_spin(&buf->lock);
+ /* Check if the output wrapped. */
+ if (buf->curpage > curpage)
+ buf->wrap_count++;
+ buf->curpage = curpage;
+ buf->offset = reg >> 32;
+ mtx_unlock_spin(&buf->lock);
+
+ dprintf("%s: wrap_cnt: %lu, curpage: %d, offset: %zu\n", __func__,
+ buf->wrap_count, buf->curpage, buf->offset);
+}
+
+static __inline void
+pt_fill_buffer_record(int id, struct pt_buffer *buf,
+ struct hwt_record_entry *rec)
+{
+ rec->record_type = HWT_RECORD_BUFFER;
+ rec->buf_id = id;
+ rec->curpage = buf->curpage;
+ rec->offset = buf->offset + (buf->wrap_count * buf->size);
+}
+
+/*
+ * Enables or disables tracing on curcpu
+ * using the XSAVE/XRSTOR PT extensions.
+ */
+static void
+pt_cpu_toggle_local(uint8_t *save_area, bool enable)
+{
+ u_long xcr0, cr0;
+ u_long xss;
+
+ cr0 = rcr0();
+ if (cr0 & CR0_TS)
+ clts();
+ xcr0 = rxcr(XCR0);
+ if ((xcr0 & PT_XSAVE_MASK) != PT_XSAVE_MASK)
+ load_xcr(XCR0, xcr0 | PT_XSAVE_MASK);
+ xss = rdmsr(MSR_IA32_XSS);
+ wrmsr(MSR_IA32_XSS, xss | XFEATURE_ENABLED_PT);
+
+ if (!enable) {
+ KASSERT((rdmsr(MSR_IA32_RTIT_CTL) & RTIT_CTL_TRACEEN) != 0,
+ ("%s: PT is disabled", __func__));
+ xsaves(save_area, XFEATURE_ENABLED_PT);
+ } else {
+ KASSERT((rdmsr(MSR_IA32_RTIT_CTL) & RTIT_CTL_TRACEEN) == 0,
+ ("%s: PT is enabled", __func__));
+ xrstors(save_area, XFEATURE_ENABLED_PT);
+ }
+ wrmsr(MSR_IA32_XSS, xss);
+ if ((xcr0 & PT_XSAVE_MASK) != PT_XSAVE_MASK)
+ load_xcr(XCR0, xcr0);
+ if (cr0 & CR0_TS)
+ load_cr0(cr0);
+}
+
+/*
+ * Starts PT tracing on 'curcpu'.
+ */
+static void
+pt_cpu_start(void *dummy)
+{
+ struct pt_cpu *cpu;
+
+ cpu = &pt_pcpu[curcpu];
+ MPASS(cpu->ctx != NULL);
+
+ dprintf("%s: curcpu %d\n", __func__, curcpu);
+ load_cr4(rcr4() | CR4_XSAVE);
+ wrmsr(MSR_IA32_RTIT_STATUS, 0);
+ pt_cpu_set_state(curcpu, PT_ACTIVE);
+ pt_cpu_toggle_local(cpu->ctx->save_area, true);
+}
+
+/*
+ * Stops PT tracing on 'curcpu'.
+ * Updates trace buffer offset to ensure
+ * any data generated between the last interrupt
+ * and the trace stop gets picked up by userspace.
+ */
+static void
+pt_cpu_stop(void *dummy)
+{
+ struct pt_cpu *cpu;
+ struct pt_ctx *ctx;
+
+ /* Shutdown may occur before PT gets properly configured. */
+ if (pt_cpu_get_state(curcpu) == PT_DISABLED)
+ return;
+
+ cpu = &pt_pcpu[curcpu];
+ ctx = cpu->ctx;
+ MPASS(ctx != NULL);
+ dprintf("%s: curcpu %d\n", __func__, curcpu);
+
+ pt_cpu_set_state(curcpu, PT_STOPPED);
+ pt_cpu_toggle_local(cpu->ctx->save_area, false);
+ pt_update_buffer(&ctx->buf);
+}
+
+/*
+ * Prepares the Table of Physical Addresses (ToPA) metadata for 'pt_ctx'.
+ * The HWT trace buffer is split into 4K ToPA table entries and used
+ * as a circular buffer, meaning that the last ToPA entry points to
+ * the first ToPA entry. Each entry is configured to raise an
+ * interrupt after being filled.
+ */
+static int
+pt_topa_prepare(struct pt_ctx *ctx, struct hwt_vm *vm)
+{
+ struct pt_buffer *buf;
+ size_t topa_size;
+ int i;
+
+ topa_size = TOPA_SIZE_4K;
+ buf = &ctx->buf;
+
+ KASSERT(buf->topa_hw == NULL,
+ ("%s: ToPA info already exists", __func__));
+ buf->topa_hw = mallocarray(vm->npages + 1, sizeof(uint64_t), M_PT,
+ M_ZERO | M_WAITOK);
+ dprintf("%s: ToPA virt addr %p\n", __func__, buf->topa_hw);
+ buf->size = vm->npages * PAGE_SIZE;
+ for (i = 0; i < vm->npages; i++) {
+ buf->topa_hw[i] = VM_PAGE_TO_PHYS(vm->pages[i]) | topa_size;
+ /*
+ * XXX: TOPA_INT should ideally be set according to
+ * expected amount of incoming trace data. Too few TOPA_INT
+ * entries will not trigger interrupts often enough when tracing
+ * smaller functions.
+ */
+ buf->topa_hw[i] |= TOPA_INT;
+ }
+ buf->topa_hw[vm->npages] = (uint64_t)vtophys(buf->topa_hw) | TOPA_END;
+
+ return (0);
+}
+
+/*
+ * Configures IP filtering for trace generation.
+ * A maximum of 2 ranges can be specified due to
+ * limitations imposed by the XSAVE/XRSTOR PT extensions.
+ */
+static int
+pt_configure_ranges(struct pt_ctx *ctx, struct pt_cpu_config *cfg)
+{
+ struct pt_ext_area *pt_ext;
+ int nranges_supp, n, error = 0;
+
+ pt_ext = pt_ctx_get_ext_area(ctx);
+ if (pt_info.l0_ebx & CPUPT_IPF) {
+ nranges_supp = (pt_info.l1_eax & CPUPT_NADDR_M) >>
+ CPUPT_NADDR_S;
+
+ if (nranges_supp > PT_IP_FILTER_MAX_RANGES)
+ nranges_supp = PT_IP_FILTER_MAX_RANGES;
+ n = cfg->nranges;
+ if (n > nranges_supp) {
+ printf("%s: %d IP filtering ranges requested, CPU "
+ "supports %d, truncating\n",
+ __func__, n, nranges_supp);
+ n = nranges_supp;
+ }
+
+ switch (n) {
+ case 2:
+ pt_ext->rtit_ctl |= (1UL << RTIT_CTL_ADDR_CFG_S(1));
+ pt_ext->rtit_addr1_a = cfg->ip_ranges[1].start;
+ pt_ext->rtit_addr1_b = cfg->ip_ranges[1].end;
+ case 1:
+ pt_ext->rtit_ctl |= (1UL << RTIT_CTL_ADDR_CFG_S(0));
+ pt_ext->rtit_addr0_a = cfg->ip_ranges[0].start;
+ pt_ext->rtit_addr0_b = cfg->ip_ranges[0].end;
+ break;
+ default:
+ error = (EINVAL);
+ break;
+ };
+ } else
+ error = (ENXIO);
+
+ return (error);
+}
+
+static int
+pt_init_ctx(struct pt_ctx *pt_ctx, struct hwt_vm *vm, int ctx_id)
+{
+
+ dprintf("%s: ctx id %d\n", __func__, ctx_id);
+
+ KASSERT(pt_ctx->buf.topa_hw == NULL,
+ ("%s: active ToPA buffer in context %p\n", __func__, pt_ctx));
+
+ memset(pt_ctx, 0, sizeof(struct pt_ctx));
+ mtx_init(&pt_ctx->buf.lock, "pttopa", NULL, MTX_SPIN);
+ pt_ctx->save_area = malloc_aligned(pt_info.xsave_area_size, 64,
+ M_PT, M_NOWAIT | M_ZERO);
+ if (pt_ctx->save_area == NULL)
+ return (ENOMEM);
+ dprintf("%s: preparing ToPA buffer\n", __func__);
+ if (pt_topa_prepare(pt_ctx, vm) != 0) {
+ dprintf("%s: failed to prepare ToPA buffer\n", __func__);
+ free(pt_ctx->save_area, M_PT);
+ return (ENOMEM);
+ }
+
+ pt_ctx->id = ctx_id;
+ TASK_INIT(&pt_ctx->task, 0, pt_send_buffer_record, pt_ctx);
+
+ return (0);
+}
+
+static void
+pt_deinit_ctx(struct pt_ctx *pt_ctx)
+{
+
+ if (pt_ctx->buf.topa_hw != NULL)
+ free(pt_ctx->buf.topa_hw, M_PT);
+ if (pt_ctx->save_area != NULL)
+ free(pt_ctx->save_area, M_PT);
+ memset(pt_ctx, 0, sizeof(*pt_ctx));
+ pt_ctx->buf.topa_hw = NULL;
+}
+
+/*
+ * HWT backend configuration method.
+ *
+ * Checks and translates the user-defined configuration to a
+ * set of PT tracing features. Uses the feature set to initialize
+ * the tracing context for the target CPU or thread.
+ */
+static int
+pt_backend_configure(struct hwt_context *ctx, int cpu_id, int thread_id)
+{
+ struct hwt_cpu *hwt_cpu;
+ struct hwt_thread *thr;
+ struct pt_ctx *pt_ctx;
+ struct pt_cpu_config *cfg;
+ struct pt_ext_area *pt_ext;
+ struct xstate_hdr *hdr;
+ int error;
+
+ dprintf("%s\n", __func__);
+
+ cfg = (struct pt_cpu_config *)ctx->config;
+ pt_ctx = NULL;
+
+ /* Clear any flags we don't support yet. */
+ cfg->rtit_ctl &= PT_SUPPORTED_FLAGS;
+ if (cfg->rtit_ctl & RTIT_CTL_MTCEN) {
+ if ((pt_info.l0_ebx & CPUPT_MTC) == 0) {
+ printf("%s: CPU does not support generating MTC "
+ "packets\n", __func__);
+ return (ENXIO);
+ }
+ }
+
+ if (cfg->rtit_ctl & RTIT_CTL_CR3FILTER) {
+ if ((pt_info.l0_ebx & CPUPT_CR3) == 0) {
+ printf("%s: CPU does not support CR3 filtering\n",
+ __func__);
+ return (ENXIO);
+ }
+ }
+
+ if (cfg->rtit_ctl & RTIT_CTL_DIS_TNT) {
+ if ((pt_info.l0_ebx & CPUPT_DIS_TNT) == 0) {
+ printf("%s: CPU does not support TNT\n", __func__);
+ return (ENXIO);
+ }
+ }
+ /* TODO: support for more config bits. */
+
+ if (ctx->mode == HWT_MODE_CPU) {
+ TAILQ_FOREACH(hwt_cpu, &ctx->cpus, next) {
+ if (hwt_cpu->cpu_id != cpu_id)
+ continue;
+ pt_ctx = &pt_pcpu_ctx[cpu_id];
+ break;
+ }
+ } else {
+ TAILQ_FOREACH(thr, &ctx->threads, next) {
+ if (thr->thread_id != thread_id)
+ continue;
+ KASSERT(thr->private != NULL,
+ ("%s: hwt thread private"
+ " not set, thr %p",
+ __func__, thr));
+ pt_ctx = (struct pt_ctx *)thr->private;
+ break;
+ }
+ }
+ if (pt_ctx == NULL)
+ return (ENOENT);
+
+ dprintf("%s: preparing MSRs\n", __func__);
+ pt_ext = pt_ctx_get_ext_area(pt_ctx);
+ hdr = pt_ctx_get_xstate_hdr(pt_ctx);
+
+ pt_ext->rtit_ctl |= cfg->rtit_ctl;
+ if (cfg->nranges != 0) {
+ dprintf("%s: preparing IPF ranges\n", __func__);
+ if ((error = pt_configure_ranges(pt_ctx, cfg)) != 0)
+ return (error);
+ }
+ pt_ctx->hwt_ctx = ctx;
+ pt_ext->rtit_ctl |= RTIT_CTL_TOPA;
+ pt_ext->rtit_output_base = (uint64_t)vtophys(pt_ctx->buf.topa_hw);
+ pt_ext->rtit_output_mask_ptrs = PT_TOPA_MASK_PTRS;
+ hdr->xstate_bv = XFEATURE_ENABLED_PT;
+ hdr->xstate_xcomp_bv = XFEATURE_ENABLED_PT |
+ XSTATE_XCOMP_BV_COMPACT;
+ pt_ext->rtit_ctl |= RTIT_CTL_TRACEEN;
+ pt_pcpu[cpu_id].ctx = pt_ctx;
+ pt_cpu_set_state(cpu_id, PT_STOPPED);
+
+ return (0);
+}
+
+/*
+ * hwt backend trace start operation. CPU affine.
+ */
+static void
+pt_backend_enable(struct hwt_context *ctx, int cpu_id)
+{
+ if (ctx->mode == HWT_MODE_CPU)
+ return;
+
+ KASSERT(curcpu == cpu_id,
+ ("%s: attempting to start PT on another cpu", __func__));
+ pt_cpu_start(NULL);
+ CPU_SET(cpu_id, &ctx->cpu_map);
+}
+
+/*
+ * hwt backend trace stop operation. CPU affine.
+ */
+static void
+pt_backend_disable(struct hwt_context *ctx, int cpu_id)
+{
+ struct pt_cpu *cpu;
+
+ if (ctx->mode == HWT_MODE_CPU)
+ return;
+
+ KASSERT(curcpu == cpu_id,
+ ("%s: attempting to disable PT on another cpu", __func__));
+ pt_cpu_stop(NULL);
+ CPU_CLR(cpu_id, &ctx->cpu_map);
+ cpu = &pt_pcpu[cpu_id];
+ cpu->ctx = NULL;
+}
+
+/*
+ * hwt backend trace start operation for remote CPUs.
+ */
+static int
+pt_backend_enable_smp(struct hwt_context *ctx)
+{
+
+ dprintf("%s\n", __func__);
+ if (ctx->mode == HWT_MODE_CPU &&
+ atomic_swap_32(&cpu_mode_ctr, 1) != 0)
+ return (-1);
+
+ KASSERT(ctx->mode == HWT_MODE_CPU,
+ ("%s: should only be used for CPU mode", __func__));
+ smp_rendezvous_cpus(ctx->cpu_map, NULL, pt_cpu_start, NULL, NULL);
+
+ return (0);
+}
+
+/*
+ * hwt backend trace stop operation for remote CPUs.
+ */
+static int
+pt_backend_disable_smp(struct hwt_context *ctx)
+{
+
+ dprintf("%s\n", __func__);
+ if (ctx->mode == HWT_MODE_CPU &&
+ atomic_swap_32(&cpu_mode_ctr, 0) == 0)
+ return (-1);
+
+ if (CPU_EMPTY(&ctx->cpu_map)) {
+ dprintf("%s: empty cpu map\n", __func__);
+ return (-1);
+ }
+ smp_rendezvous_cpus(ctx->cpu_map, NULL, pt_cpu_stop, NULL, NULL);
+
+ return (0);
+}
+
+/*
+ * HWT backend initialization method.
+ *
+ * Installs the ToPA interrupt handler and initializes
+ * the tracing contexts used for HWT_MODE_CPU.
+ */
+static int
+pt_backend_init(struct hwt_context *ctx)
+{
+ struct hwt_cpu *hwt_cpu;
+ int error;
+
+ dprintf("%s\n", __func__);
+ if (ctx->mode == HWT_MODE_CPU) {
+ TAILQ_FOREACH(hwt_cpu, &ctx->cpus, next) {
+ error = pt_init_ctx(&pt_pcpu_ctx[hwt_cpu->cpu_id],
+ hwt_cpu->vm, hwt_cpu->cpu_id);
+ if (error)
+ return (error);
+ }
+ }
+
+ return (0);
+}
+
+/*
+ * HWT backend teardown method.
+ *
+ * Removes the ToPA interrupt handler, stops tracing on all active CPUs,
+ * and releases all previously allocated ToPA metadata.
+ */
+static int
+pt_backend_deinit(struct hwt_context *ctx)
+{
+ struct pt_ctx *pt_ctx;
+ struct hwt_thread *thr;
+ int cpu_id;
+
+ dprintf("%s\n", __func__);
+
+ pt_backend_disable_smp(ctx);
+ if (ctx->mode == HWT_MODE_THREAD) {
+ TAILQ_FOREACH(thr, &ctx->threads, next) {
+ KASSERT(thr->private != NULL,
+ ("%s: thr->private not set", __func__));
+ pt_ctx = (struct pt_ctx *)thr->private;
+ pt_deinit_ctx(pt_ctx);
+ }
+ } else {
+ CPU_FOREACH(cpu_id) {
+ if (!CPU_ISSET(cpu_id, &ctx->cpu_map))
+ continue;
+ if (pt_pcpu[cpu_id].ctx != NULL) {
+ KASSERT(pt_pcpu[cpu_id].ctx ==
+ &pt_pcpu_ctx[cpu_id],
+ ("%s: CPU mode tracing with non-cpu mode PT"
+ "context active",
+ __func__));
+ pt_pcpu[cpu_id].ctx = NULL;
+ }
+ pt_ctx = &pt_pcpu_ctx[cpu_id];
+ pt_deinit_ctx(pt_ctx);
+ memset(&pt_pcpu[cpu_id], 0, sizeof(struct pt_cpu));
+ }
+ }
+
+ return (0);
+}
+
+/*
+ * Fetches current offset into the tracing buffer.
+ */
+static int
+pt_backend_read(struct hwt_vm *vm, int *curpage, vm_offset_t *curpage_offset,
+ uint64_t *data)
+{
+ struct pt_buffer *buf;
+
+ if (vm->ctx->mode == HWT_MODE_THREAD)
+ buf = &((struct pt_ctx *)vm->thr->private)->buf;
+ else
+ buf = &pt_pcpu[vm->cpu->cpu_id].ctx->buf;
+ mtx_lock_spin(&buf->lock);
+ *curpage = buf->curpage;
+ *curpage_offset = buf->offset + (buf->wrap_count * vm->ctx->bufsize);
+ mtx_unlock_spin(&buf->lock);
+
+ return (0);
+}
+
+/*
+ * HWT thread creation hook.
+ * Allocates and associates a 'struct pt_ctx' for a given hwt thread.
+ */
+static int
+pt_backend_alloc_thread(struct hwt_thread *thr)
+{
+ struct pt_ctx *pt_ctx;
+ int error;
+
+ /* Omit M_WAITOK since this might get invoked a non-sleepable context */
+ pt_ctx = malloc(sizeof(*pt_ctx), M_PT, M_NOWAIT | M_ZERO);
+ if (pt_ctx == NULL)
+ return (ENOMEM);
+
+ error = pt_init_ctx(pt_ctx, thr->vm, thr->thread_id);
+ if (error)
+ return (error);
+
+ thr->private = pt_ctx;
+ return (0);
+}
+/*
+ * HWT thread teardown hook.
+ */
+static void
+pt_backend_free_thread(struct hwt_thread *thr)
+{
+ struct pt_ctx *ctx;
+
+ ctx = (struct pt_ctx *)thr->private;
+
+ pt_deinit_ctx(ctx);
+ free(ctx, M_PT);
+}
+
+static void
+pt_backend_dump(int cpu_id)
+{
+}
+
+static struct hwt_backend_ops pt_ops = {
+ .hwt_backend_init = pt_backend_init,
+ .hwt_backend_deinit = pt_backend_deinit,
+
+ .hwt_backend_configure = pt_backend_configure,
+
+ .hwt_backend_enable = pt_backend_enable,
+ .hwt_backend_disable = pt_backend_disable,
+
+#ifdef SMP
+ .hwt_backend_enable_smp = pt_backend_enable_smp,
+ .hwt_backend_disable_smp = pt_backend_disable_smp,
+#endif
+
+ .hwt_backend_read = pt_backend_read,
+ .hwt_backend_dump = pt_backend_dump,
+
+ .hwt_backend_thread_alloc = pt_backend_alloc_thread,
+ .hwt_backend_thread_free = pt_backend_free_thread,
+};
+
+static struct hwt_backend backend = {
+ .ops = &pt_ops,
+ .name = "pt",
+ .kva_req = 1,
+};
+
+/*
+ * Reads the latest valid trace buffer offset and enqueues
+ * a HWT_RECORD_BUFFER record.
+ * Used as a taskqueue routine from the ToPA interrupt handler.
+ */
+static void
+pt_send_buffer_record(void *arg, int pending __unused)
+{
+ struct hwt_record_entry record;
+ struct pt_ctx *ctx = (struct pt_ctx *)arg;
+
+ /* Prepare buffer record. */
+ mtx_lock_spin(&ctx->buf.lock);
+ pt_fill_buffer_record(ctx->id, &ctx->buf, &record);
+ mtx_unlock_spin(&ctx->buf.lock);
+ hwt_record_ctx(ctx->hwt_ctx, &record, M_ZERO | M_NOWAIT);
+}
+static void
+pt_topa_status_clear(void)
+{
+ uint64_t reg;
+
+ reg = rdmsr(MSR_IA_GLOBAL_STATUS_RESET);
+ reg &= ~GLOBAL_STATUS_FLAG_TRACETOPAPMI;
+ reg |= GLOBAL_STATUS_FLAG_TRACETOPAPMI;
+ wrmsr(MSR_IA_GLOBAL_STATUS_RESET, reg);
+}
+
+/*
+ * ToPA PMI handler.
+ *
+ * Invoked every time a ToPA entry marked with TOPA_INT is filled.
+ * Uses taskqueue to enqueue a buffer record for userspace.
+ * Re-enables the PC interrupt line as long as tracing is active.
+ */
+static int
+pt_topa_intr(struct trapframe *tf)
+{
+ struct pt_buffer *buf;
+ struct pt_ctx *ctx;
+ uint64_t reg;
+
+ SDT_PROBE0(pt, , , topa__intr);
+
+ if (pt_cpu_get_state(curcpu) != PT_ACTIVE) {
+ return (0);
+ }
+ reg = rdmsr(MSR_IA_GLOBAL_STATUS);
+ if ((reg & GLOBAL_STATUS_FLAG_TRACETOPAPMI) == 0) {
+ /* ACK spurious or leftover interrupt. */
+ pt_topa_status_clear();
+ return (1);
+ }
+
+ ctx = pt_pcpu[curcpu].ctx;
+ buf = &ctx->buf;
+ KASSERT(buf->topa_hw != NULL,
+ ("%s: ToPA PMI interrupt with invalid buffer", __func__));
+
+ pt_cpu_toggle_local(ctx->save_area, false);
+ pt_update_buffer(buf);
+ pt_topa_status_clear();
+ taskqueue_enqueue_flags(taskqueue_pt, &ctx->task,
+ TASKQUEUE_FAIL_IF_PENDING);
+
+ if (pt_cpu_get_state(curcpu) == PT_ACTIVE) {
+ pt_cpu_toggle_local(ctx->save_area, true);
+ lapic_reenable_pcint();
+ }
+ return (1);
+}
+
+/*
+ * Module initialization.
+ *
+ * Saves all PT-related cpuid info, registers itself as a HWT backend,
+ * and allocates metadata required to keep track of tracing operations
+ * on each CPU.
+ */
+static int
+pt_init(void)
+{
+ u_int cp[4];
+ int error;
+
+ dprintf("pt: Enumerating part 1\n");
+ cpuid_count(CPUID_PT_LEAF, 0, cp);
+ dprintf("pt: Maximum valid sub-leaf Index: %x\n", cp[0]);
+ dprintf("pt: ebx %x\n", cp[1]);
+ dprintf("pt: ecx %x\n", cp[2]);
+
+ pt_info.l0_eax = cp[0];
+ pt_info.l0_ebx = cp[1];
+ pt_info.l0_ecx = cp[2];
+
+ dprintf("pt: Enumerating part 2\n");
+ cpuid_count(CPUID_PT_LEAF, 1, cp);
+ dprintf("pt: eax %x\n", cp[0]);
+ dprintf("pt: ebx %x\n", cp[1]);
+
+ pt_info.l1_eax = cp[0];
+ pt_info.l1_ebx = cp[1];
+
+ error = hwt_backend_register(&backend);
+ if (error != 0) {
+ printf("pt: unable to register hwt backend, error %d\n", error);
+ return (error);
+ }
+ pt_pcpu = mallocarray(mp_ncpus, sizeof(struct pt_cpu), M_PT,
+ M_ZERO | M_WAITOK);
+ pt_pcpu_ctx = mallocarray(mp_ncpus, sizeof(struct pt_ctx), M_PT,
+ M_ZERO | M_WAITOK);
+
+ nmi_register_handler(pt_topa_intr);
+ if (!lapic_enable_pcint()) {
+ nmi_remove_handler(pt_topa_intr);
+ hwt_backend_unregister(&backend);
+ free(pt_pcpu, M_PT);
+ free(pt_pcpu_ctx, M_PT);
+ pt_pcpu = NULL;
+ pt_pcpu_ctx = NULL;
+ printf("pt: failed to setup interrupt line\n");
+ return (error);
+ }
+ initialized = true;
+
+ return (0);
+}
+
+/*
+ * Checks whether the CPU support Intel PT and
+ * initializes XSAVE area info.
+ *
+ * The driver relies on XSAVE/XRSTOR PT extensions,
+ * Table of Physical Addresses (ToPA) support, and
+ * support for multiple ToPA entries.
+ */
+static bool
+pt_supported(void)
+{
+ u_int cp[4];
+
+ if ((cpu_stdext_feature & CPUID_STDEXT_PROCTRACE) == 0) {
+ printf("pt: CPU does not support Intel Processor Trace\n");
+ return (false);
+ }
+ if ((cpu_feature2 & CPUID2_XSAVE) == 0) {
+ printf("pt: XSAVE is not supported\n");
+ return (false);
+ }
+ if (!xsave_extfeature_supported(XFEATURE_ENABLED_PT, true)) {
+ printf("pt: CPU does not support managing PT state using XSAVE\n");
+ return (false);
+ }
+ if (!xsave_extension_supported(CPUID_EXTSTATE_XSAVEC)) {
+ printf("pt: XSAVE compaction is not supported\n");
+ return (false);
+ }
+ if (!xsave_extension_supported(CPUID_EXTSTATE_XSAVES)) {
+ printf("pt: CPU does not support XSAVES/XRSTORS\n");
+ return (false);
+ }
+
+ /* Require ToPA support. */
+ cpuid_count(CPUID_PT_LEAF, 0, cp);
+ if ((cp[2] & CPUPT_TOPA) == 0) {
+ printf("pt: ToPA is not supported\n");
+ return (false);
+ }
+ if ((cp[2] & CPUPT_TOPA_MULTI) == 0) {
+ printf("pt: multiple ToPA outputs are not supported\n");
+ return (false);
+ }
+
+ pt_info.xstate_hdr_offset = xsave_area_hdr_offset();
+ pt_info.xsave_area_size = xsave_area_size(PT_XSTATE_BV, true, true);
+ pt_info.pt_xsave_offset = xsave_area_offset(PT_XSTATE_BV,
+ XFEATURE_ENABLED_PT, true, true);
+
+ return (true);
+}
+
+static void
+pt_deinit(void)
+{
+ if (!initialized)
+ return;
+ nmi_remove_handler(pt_topa_intr);
+ lapic_disable_pcint();
+ hwt_backend_unregister(&backend);
+ free(pt_pcpu, M_PT);
+ free(pt_pcpu_ctx, M_PT);
+ pt_pcpu = NULL;
+ initialized = false;
+}
+
+static int
+pt_modevent(module_t mod, int type, void *data)
+{
+ switch (type) {
+ case MOD_LOAD:
+ if (!pt_supported() || pt_init() != 0) {
+ return (ENXIO);
+ }
+ break;
+ case MOD_UNLOAD:
+ pt_deinit();
+ break;
+ default:
+ break;
+ }
+
+ return (0);
+}
+
+static moduledata_t pt_mod = { "intel_pt", pt_modevent, NULL };
+
+DECLARE_MODULE(intel_pt, pt_mod, SI_SUB_DRIVERS, SI_ORDER_FIRST);
+MODULE_DEPEND(intel_pt, hwt, 1, 1, 1);
+MODULE_VERSION(intel_pt, 1);
diff --git a/sys/amd64/pt/pt.h b/sys/amd64/pt/pt.h
new file mode 100644
index 000000000000..2423afdf22e9
--- /dev/null
+++ b/sys/amd64/pt/pt.h
@@ -0,0 +1,49 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2023 Bojan Novković <bnovkov@freebsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _AMD64_PT_PT_H_
+#define _AMD64_PT_PT_H_
+
+#include <sys/types.h>
+
+#include <x86/include/specialreg.h>
+
+#define PT_IP_FILTER_MAX_RANGES (2) /* Intel SDM Vol. 3C, 33-29 */
+
+struct pt_cpu_config {
+ uint64_t rtit_ctl;
+ register_t cr3_filter;
+ int nranges;
+ struct ipf_range {
+ vm_offset_t start;
+ vm_offset_t end;
+ } ip_ranges[PT_IP_FILTER_MAX_RANGES];
+ uint32_t mtc_freq;
+ uint32_t cyc_thresh;
+ uint32_t psb_freq;
+};
+#endif /* !_AMD64_PT_PT_H_ */
diff --git a/sys/amd64/vmm/amd/svm.c b/sys/amd64/vmm/amd/svm.c
index 6c16daaa47c2..2fe6a5bc3584 100644
--- a/sys/amd64/vmm/amd/svm.c
+++ b/sys/amd64/vmm/amd/svm.c
@@ -317,6 +317,33 @@ svm_set_tsc_offset(struct svm_vcpu *vcpu, uint64_t offset)
#define MSR_AMD7TH_START 0xC0010000UL
#define MSR_AMD7TH_END 0xC0011FFFUL
+static void
+svm_get_cs_info(struct vmcb *vmcb, struct vm_guest_paging *paging, int *cs_d,
+ uint64_t *base)
+{
+ struct vmcb_segment seg;
+ int error __diagused;
+
+ error = vmcb_seg(vmcb, VM_REG_GUEST_CS, &seg);
+ KASSERT(error == 0, ("%s: vmcb_seg error %d", __func__, error));
+
+ switch (paging->cpu_mode) {
+ case CPU_MODE_REAL:
+ *base = seg.base;
+ *cs_d = 0;
+ break;
+ case CPU_MODE_PROTECTED:
+ case CPU_MODE_COMPATIBILITY:
+ *cs_d = !!(seg.attrib & VMCB_CS_ATTRIB_D);
+ *base = seg.base;
+ break;
+ default:
+ *base = 0;
+ *cs_d = 0;
+ break;
+ }
+}
+
/*
* Get the index and bit position for a MSR in permission bitmap.
* Two bits are used for each MSR: lower bit for read and higher bit for write.
@@ -735,10 +762,29 @@ svm_inout_str_seginfo(struct svm_vcpu *vcpu, int64_t info1, int in,
if (in) {
vis->seg_name = VM_REG_GUEST_ES;
- } else {
- /* The segment field has standard encoding */
+ } else if (decode_assist()) {
+ /*
+ * The effective segment number in EXITINFO1[12:10] is populated
+ * only if the processor has the DecodeAssist capability.
+ *
+ * XXX this is not specified explicitly in APMv2 but can be
+ * verified empirically.
+ */
s = (info1 >> 10) & 0x7;
+
+ /* The segment field has standard encoding */
vis->seg_name = vm_segment_name(s);
+ } else {
+ /*
+ * The segment register need to be manually decoded by fetching
+ * the instructions near ip. However, we are unable to fetch it
+ * while the interrupts are disabled. Therefore, we leave the
+ * value unset until the generic ins/outs handler runs.
+ */
+ vis->seg_name = VM_REG_LAST;
+ svm_get_cs_info(vcpu->vmcb, &vis->paging, &vis->cs_d,
+ &vis->cs_base);
+ return;
}
error = svm_getdesc(vcpu, vis->seg_name, &vis->seg_desc);
@@ -798,16 +844,6 @@ svm_handle_io(struct svm_vcpu *vcpu, struct vm_exit *vmexit)
info1 = ctrl->exitinfo1;
inout_string = info1 & BIT(2) ? 1 : 0;
- /*
- * The effective segment number in EXITINFO1[12:10] is populated
- * only if the processor has the DecodeAssist capability.
- *
- * XXX this is not specified explicitly in APMv2 but can be verified
- * empirically.
- */
- if (inout_string && !decode_assist())
- return (UNHANDLED);
-
vmexit->exitcode = VM_EXITCODE_INOUT;
vmexit->u.inout.in = (info1 & BIT(0)) ? 1 : 0;
vmexit->u.inout.string = inout_string;
@@ -825,6 +861,8 @@ svm_handle_io(struct svm_vcpu *vcpu, struct vm_exit *vmexit)
vis->index = svm_inout_str_index(regs, vmexit->u.inout.in);
vis->count = svm_inout_str_count(regs, vmexit->u.inout.rep);
vis->addrsize = svm_inout_str_addrsize(info1);
+ vis->cs_d = 0;
+ vis->cs_base = 0;
svm_inout_str_seginfo(vcpu, info1, vmexit->u.inout.in, vis);
}
@@ -866,10 +904,9 @@ static void
svm_handle_inst_emul(struct vmcb *vmcb, uint64_t gpa, struct vm_exit *vmexit)
{
struct vm_guest_paging *paging;
- struct vmcb_segment seg;
struct vmcb_ctrl *ctrl;
char *inst_bytes;
- int error __diagused, inst_len;
+ int inst_len;
ctrl = &vmcb->ctrl;
paging = &vmexit->u.inst_emul.paging;
@@ -879,29 +916,8 @@ svm_handle_inst_emul(struct vmcb *vmcb, uint64_t gpa, struct vm_exit *vmexit)
vmexit->u.inst_emul.gla = VIE_INVALID_GLA;
svm_paging_info(vmcb, paging);
- error = vmcb_seg(vmcb, VM_REG_GUEST_CS, &seg);
- KASSERT(error == 0, ("%s: vmcb_seg(CS) error %d", __func__, error));
-
- switch(paging->cpu_mode) {
- case CPU_MODE_REAL:
- vmexit->u.inst_emul.cs_base = seg.base;
- vmexit->u.inst_emul.cs_d = 0;
- break;
- case CPU_MODE_PROTECTED:
- case CPU_MODE_COMPATIBILITY:
- vmexit->u.inst_emul.cs_base = seg.base;
-
- /*
- * Section 4.8.1 of APM2, Default Operand Size or D bit.
- */
- vmexit->u.inst_emul.cs_d = (seg.attrib & VMCB_CS_ATTRIB_D) ?
- 1 : 0;
- break;
- default:
- vmexit->u.inst_emul.cs_base = 0;
- vmexit->u.inst_emul.cs_d = 0;
- break;
- }
+ svm_get_cs_info(vmcb, paging, &vmexit->u.inst_emul.cs_d,
+ &vmexit->u.inst_emul.cs_base);
/*
* Copy the instruction bytes into 'vie' if available.
diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c
index 957217ab2258..842281ab862e 100644
--- a/sys/amd64/vmm/intel/vmx.c
+++ b/sys/amd64/vmm/intel/vmx.c
@@ -2659,6 +2659,8 @@ vmx_exit_process(struct vmx *vmx, struct vmx_vcpu *vcpu, struct vm_exit *vmexit)
vis->index = inout_str_index(vcpu, in);
vis->count = inout_str_count(vcpu, vis->inout.rep);
vis->addrsize = inout_str_addrsize(inst_info);
+ vis->cs_d = 0;
+ vis->cs_base = 0;
inout_str_seginfo(vcpu, inst_info, in, vis);
}
SDT_PROBE3(vmm, vmx, exit, inout, vmx, vcpuid, vmexit);
diff --git a/sys/amd64/vmm/intel/vmx_support.S b/sys/amd64/vmm/intel/vmx_support.S
index f393f160b101..130130b64541 100644
--- a/sys/amd64/vmm/intel/vmx_support.S
+++ b/sys/amd64/vmm/intel/vmx_support.S
@@ -32,12 +32,6 @@
#include "vmx_assym.h"
-#ifdef SMP
-#define LK lock ;
-#else
-#define LK
-#endif
-
/* Be friendly to DTrace FBT's prologue/epilogue pattern matching */
#define VENTER push %rbp ; mov %rsp,%rbp
#define VLEAVE pop %rbp
diff --git a/sys/amd64/vmm/vmm_instruction_emul.c b/sys/amd64/vmm/vmm_instruction_emul.c
index c53e32889000..c54b6e6d0074 100644
--- a/sys/amd64/vmm/vmm_instruction_emul.c
+++ b/sys/amd64/vmm/vmm_instruction_emul.c
@@ -65,30 +65,6 @@
#include <x86/psl.h>
#include <x86/specialreg.h>
-/* struct vie_op.op_type */
-enum {
- VIE_OP_TYPE_NONE = 0,
- VIE_OP_TYPE_MOV,
- VIE_OP_TYPE_MOVSX,
- VIE_OP_TYPE_MOVZX,
- VIE_OP_TYPE_AND,
- VIE_OP_TYPE_OR,
- VIE_OP_TYPE_SUB,
- VIE_OP_TYPE_TWO_BYTE,
- VIE_OP_TYPE_PUSH,
- VIE_OP_TYPE_CMP,
- VIE_OP_TYPE_POP,
- VIE_OP_TYPE_MOVS,
- VIE_OP_TYPE_GROUP1,
- VIE_OP_TYPE_STOS,
- VIE_OP_TYPE_BITTEST,
- VIE_OP_TYPE_TWOB_GRP15,
- VIE_OP_TYPE_ADD,
- VIE_OP_TYPE_TEST,
- VIE_OP_TYPE_BEXTR,
- VIE_OP_TYPE_LAST
-};
-
/* struct vie_op.op_flags */
#define VIE_OP_F_IMM (1 << 0) /* 16/32-bit immediate operand */
#define VIE_OP_F_IMM8 (1 << 1) /* 8-bit immediate operand */
@@ -152,6 +128,16 @@ static const struct vie_op one_byte_opcodes[256] = {
.op_byte = 0x3B,
.op_type = VIE_OP_TYPE_CMP,
},
+ [0x6E] = {
+ .op_byte = 0x6E,
+ .op_type = VIE_OP_TYPE_OUTS,
+ .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION,
+ },
+ [0x6F] = {
+ .op_byte = 0x6F,
+ .op_type = VIE_OP_TYPE_OUTS,
+ .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION,
+ },
[0x88] = {
.op_byte = 0x88,
.op_type = VIE_OP_TYPE_MOV,
diff --git a/sys/amd64/vmm/vmm_ioport.c b/sys/amd64/vmm/vmm_ioport.c
index fc1ecab9f209..8aab28f5e68e 100644
--- a/sys/amd64/vmm/vmm_ioport.c
+++ b/sys/amd64/vmm/vmm_ioport.c
@@ -145,9 +145,49 @@ emulate_inout_port(struct vcpu *vcpu, struct vm_exit *vmexit, bool *retu)
}
static int
+decode_segment(struct vcpu *vcpu, enum vm_reg_name *segment)
+{
+ struct vm_guest_paging *paging;
+ struct vie vie;
+ struct vm_exit *vme;
+ int err;
+ int fault;
+
+ vme = vm_exitinfo(vcpu);
+ paging = &vme->u.inout_str.paging;
+
+ vie_init(&vie, NULL, 0);
+ err = vmm_fetch_instruction(vcpu, paging,
+ vme->rip + vme->u.inout_str.cs_base, VIE_INST_SIZE, &vie, &fault);
+ if (err || fault)
+ return (err);
+
+ err = vmm_decode_instruction(vcpu, VIE_INVALID_GLA, paging->cpu_mode,
+ vme->u.inout_str.cs_d, &vie);
+
+ if (err || vie.op.op_type != VIE_OP_TYPE_OUTS)
+ return (EINVAL);
+ if (vie.segment_override)
+ *segment = vie.segment_register;
+ else
+ *segment = VM_REG_GUEST_DS;
+
+ return (0);
+}
+
+static int
emulate_inout_str(struct vcpu *vcpu, struct vm_exit *vmexit, bool *retu)
{
+ int err;
+
*retu = true;
+ if (vmexit->u.inout_str.seg_name == VM_REG_LAST) {
+ err = decode_segment(vcpu, &vmexit->u.inout_str.seg_name);
+ if (err)
+ return (err);
+ return (vm_get_seg_desc(vcpu, vmexit->u.inout_str.seg_name,
+ &vmexit->u.inout_str.seg_desc));
+ }
return (0); /* Return to userspace to finish emulation */
}
diff --git a/sys/arm/allwinner/aw_mmc.c b/sys/arm/allwinner/aw_mmc.c
index 6bebf5e5fb5e..a8add957dc74 100644
--- a/sys/arm/allwinner/aw_mmc.c
+++ b/sys/arm/allwinner/aw_mmc.c
@@ -84,21 +84,26 @@
struct aw_mmc_conf {
uint32_t dma_xferlen;
+ uint32_t dma_desc_shift;
bool mask_data0;
bool can_calibrate;
bool new_timing;
+ bool zero_is_skip;
};
static const struct aw_mmc_conf a10_mmc_conf = {
.dma_xferlen = 0x2000,
+ .dma_desc_shift = 0,
};
static const struct aw_mmc_conf a13_mmc_conf = {
.dma_xferlen = 0x10000,
+ .dma_desc_shift = 0,
};
static const struct aw_mmc_conf a64_mmc_conf = {
.dma_xferlen = 0x10000,
+ .dma_desc_shift = 0,
.mask_data0 = true,
.can_calibrate = true,
.new_timing = true,
@@ -106,13 +111,24 @@ static const struct aw_mmc_conf a64_mmc_conf = {
static const struct aw_mmc_conf a64_emmc_conf = {
.dma_xferlen = 0x2000,
+ .dma_desc_shift = 0,
.can_calibrate = true,
};
+static const struct aw_mmc_conf d1_mmc_conf = {
+ .dma_xferlen = 0x1000,
+ .dma_desc_shift = 2,
+ .mask_data0 = true,
+ .can_calibrate = true,
+ .new_timing = true,
+ .zero_is_skip = true,
+};
+
static struct ofw_compat_data compat_data[] = {
{"allwinner,sun4i-a10-mmc", (uintptr_t)&a10_mmc_conf},
{"allwinner,sun5i-a13-mmc", (uintptr_t)&a13_mmc_conf},
{"allwinner,sun7i-a20-mmc", (uintptr_t)&a13_mmc_conf},
+ {"allwinner,sun20i-d1-mmc", (uintptr_t)&d1_mmc_conf},
{"allwinner,sun50i-a64-mmc", (uintptr_t)&a64_mmc_conf},
{"allwinner,sun50i-a64-emmc", (uintptr_t)&a64_emmc_conf},
{NULL, 0}
@@ -607,16 +623,18 @@ aw_dma_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int err)
dma_desc = sc->aw_dma_desc;
for (i = 0; i < nsegs; i++) {
- if (segs[i].ds_len == sc->aw_mmc_conf->dma_xferlen)
+ if ((segs[i].ds_len == sc->aw_mmc_conf->dma_xferlen) &&
+ !sc->aw_mmc_conf->zero_is_skip)
dma_desc[i].buf_size = 0; /* Size of 0 indicate max len */
else
dma_desc[i].buf_size = segs[i].ds_len;
- dma_desc[i].buf_addr = segs[i].ds_addr;
+ dma_desc[i].buf_addr = segs[i].ds_addr >>
+ sc->aw_mmc_conf->dma_desc_shift;
dma_desc[i].config = AW_MMC_DMA_CONFIG_CH |
- AW_MMC_DMA_CONFIG_OWN | AW_MMC_DMA_CONFIG_DIC;
-
- dma_desc[i].next = sc->aw_dma_desc_phys +
- ((i + 1) * sizeof(struct aw_mmc_dma_desc));
+ AW_MMC_DMA_CONFIG_OWN | AW_MMC_DMA_CONFIG_DIC;
+ dma_desc[i].next = (sc->aw_dma_desc_phys +
+ (i + 1) * sizeof(struct aw_mmc_dma_desc)) >>
+ sc->aw_mmc_conf->dma_desc_shift;
}
dma_desc[0].config |= AW_MMC_DMA_CONFIG_FD;
@@ -678,7 +696,8 @@ aw_mmc_prepare_dma(struct aw_mmc_softc *sc)
AW_MMC_WRITE_4(sc, AW_MMC_IDIE, val);
/* Set DMA descritptor list address */
- AW_MMC_WRITE_4(sc, AW_MMC_DLBA, sc->aw_dma_desc_phys);
+ AW_MMC_WRITE_4(sc, AW_MMC_DLBA, sc->aw_dma_desc_phys >>
+ sc->aw_mmc_conf->dma_desc_shift);
/* FIFO trigger level */
AW_MMC_WRITE_4(sc, AW_MMC_FWLR, AW_MMC_DMA_FTRGLEVEL);
diff --git a/sys/arm/arm/pmap-v6.c b/sys/arm/arm/pmap-v6.c
index 92eb0589f80b..78883296c5b7 100644
--- a/sys/arm/arm/pmap-v6.c
+++ b/sys/arm/arm/pmap-v6.c
@@ -5767,7 +5767,7 @@ pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
CTR5(KTR_PMAP, "%s: page %p - 0x%08X oma: %d, ma: %d", __func__, m,
VM_PAGE_TO_PHYS(m), oma, ma);
- if ((m->flags & PG_FICTITIOUS) != 0)
+ if (ma == oma || (m->flags & PG_FICTITIOUS) != 0)
return;
#if 0
/*
@@ -5784,22 +5784,20 @@ pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
* If page is not mapped by sf buffer, map the page
* transient and do invalidation.
*/
- if (ma != oma) {
- pa = VM_PAGE_TO_PHYS(m);
- sched_pin();
- pc = get_pcpu();
- cmap2_pte2p = pc->pc_cmap2_pte2p;
- mtx_lock(&pc->pc_cmap_lock);
- if (pte2_load(cmap2_pte2p) != 0)
- panic("%s: CMAP2 busy", __func__);
- pte2_store(cmap2_pte2p, PTE2_KERN_NG(pa, PTE2_AP_KRW,
- vm_memattr_to_pte2(ma)));
- dcache_wbinv_poc((vm_offset_t)pc->pc_cmap2_addr, pa, PAGE_SIZE);
- pte2_clear(cmap2_pte2p);
- tlb_flush((vm_offset_t)pc->pc_cmap2_addr);
- sched_unpin();
- mtx_unlock(&pc->pc_cmap_lock);
- }
+ pa = VM_PAGE_TO_PHYS(m);
+ sched_pin();
+ pc = get_pcpu();
+ cmap2_pte2p = pc->pc_cmap2_pte2p;
+ mtx_lock(&pc->pc_cmap_lock);
+ if (pte2_load(cmap2_pte2p) != 0)
+ panic("%s: CMAP2 busy", __func__);
+ pte2_store(cmap2_pte2p, PTE2_KERN_NG(pa, PTE2_AP_KRW,
+ vm_memattr_to_pte2(ma)));
+ dcache_wbinv_poc((vm_offset_t)pc->pc_cmap2_addr, pa, PAGE_SIZE);
+ pte2_clear(cmap2_pte2p);
+ tlb_flush((vm_offset_t)pc->pc_cmap2_addr);
+ sched_unpin();
+ mtx_unlock(&pc->pc_cmap_lock);
}
/*
diff --git a/sys/arm64/arm64/pmap.c b/sys/arm64/arm64/pmap.c
index d2e56a270f54..2152f7fcc1c6 100644
--- a/sys/arm64/arm64/pmap.c
+++ b/sys/arm64/arm64/pmap.c
@@ -497,7 +497,8 @@ static bool pmap_pv_insert_l3c(pmap_t pmap, vm_offset_t va, vm_page_t m,
struct rwlock **lockp);
static void pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va);
static int pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva,
- pd_entry_t l1e, struct spglist *free, struct rwlock **lockp);
+ pd_entry_t l1e, bool demote_kl2e, struct spglist *free,
+ struct rwlock **lockp);
static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t sva,
pd_entry_t l2e, struct spglist *free, struct rwlock **lockp);
static bool pmap_remove_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va,
@@ -3847,8 +3848,7 @@ pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va)
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
ml3 = pmap_remove_pt_page(pmap, va);
- if (ml3 == NULL)
- panic("pmap_remove_kernel_l2: Missing pt page");
+ KASSERT(ml3 != NULL, ("pmap_remove_kernel_l2: missing pt page"));
ml3pa = VM_PAGE_TO_PHYS(ml3);
newl2 = PHYS_TO_PTE(ml3pa) | L2_TABLE;
@@ -3873,8 +3873,8 @@ pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va)
* pmap_remove_l2: Do the things to unmap a level 2 superpage.
*/
static int
-pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva,
- pd_entry_t l1e, struct spglist *free, struct rwlock **lockp)
+pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, pd_entry_t l1e,
+ bool demote_kl2e, struct spglist *free, struct rwlock **lockp)
{
struct md_page *pvh;
pt_entry_t old_l2;
@@ -3910,9 +3910,7 @@ pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva,
vm_page_aflag_clear(mt, PGA_WRITEABLE);
}
}
- if (pmap == kernel_pmap) {
- pmap_remove_kernel_l2(pmap, l2, sva);
- } else {
+ if (pmap != kernel_pmap) {
ml3 = pmap_remove_pt_page(pmap, sva);
if (ml3 != NULL) {
KASSERT(vm_page_any_valid(ml3),
@@ -3923,6 +3921,14 @@ pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva,
ml3->ref_count = 0;
pmap_add_delayed_free_list(ml3, free, false);
}
+ } else if (demote_kl2e) {
+ pmap_remove_kernel_l2(pmap, l2, sva);
+ } else {
+ ml3 = vm_radix_lookup(&pmap->pm_root, pmap_l2_pindex(sva));
+ if (vm_page_any_valid(ml3)) {
+ ml3->valid = 0;
+ pmap_zero_page(ml3);
+ }
}
return (pmap_unuse_pt(pmap, sva, l1e, free));
}
@@ -4232,7 +4238,7 @@ pmap_remove1(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, bool map_delete)
if ((l3_paddr & ATTR_DESCR_MASK) == L2_BLOCK) {
if (sva + L2_SIZE == va_next && eva >= va_next) {
pmap_remove_l2(pmap, l2, sva, pmap_load(l1),
- &free, &lock);
+ true, &free, &lock);
continue;
} else if (pmap_demote_l2_locked(pmap, l2, sva,
&lock) == NULL)
@@ -5703,6 +5709,9 @@ pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags,
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
KASSERT(ADDR_IS_CANONICAL(va),
("%s: Address not in canonical form: %lx", __func__, va));
+ KASSERT((flags & (PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM)) !=
+ PMAP_ENTER_NORECLAIM,
+ ("pmap_enter_l2: flags is missing PMAP_ENTER_NOREPLACE"));
if ((l2 = pmap_alloc_l2(pmap, va, &l2pg, (flags &
PMAP_ENTER_NOSLEEP) != 0 ? NULL : lockp)) == NULL) {
@@ -5747,33 +5756,51 @@ pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags,
}
}
SLIST_INIT(&free);
- if ((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK)
+ if ((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK) {
(void)pmap_remove_l2(pmap, l2, va,
- pmap_load(pmap_l1(pmap, va)), &free, lockp);
- else
+ pmap_load(pmap_l1(pmap, va)), false, &free, lockp);
+ } else {
+ if (ADDR_IS_KERNEL(va)) {
+ /*
+ * Try to save the ptp in the trie
+ * before any changes to mappings are
+ * made. Abort on failure.
+ */
+ mt = PTE_TO_VM_PAGE(old_l2);
+ if (pmap_insert_pt_page(pmap, mt, false,
+ false)) {
+ CTR1(KTR_PMAP,
+ "pmap_enter_l2: cannot ins kern ptp va %#lx",
+ va);
+ return (KERN_RESOURCE_SHORTAGE);
+ }
+ /*
+ * Both pmap_remove_l2() and
+ * pmap_remove_l3_range() will zero fill
+ * the L3 kernel page table page.
+ */
+ }
pmap_remove_l3_range(pmap, old_l2, va, va + L2_SIZE,
&free, lockp);
+ if (ADDR_IS_KERNEL(va)) {
+ /*
+ * The TLB could have an intermediate
+ * entry for the L3 kernel page table
+ * page, so request an invalidation at
+ * all levels after clearing the
+ * L2_TABLE entry.
+ */
+ pmap_clear(l2);
+ pmap_s1_invalidate_page(pmap, va, false);
+ }
+ }
+ KASSERT(pmap_load(l2) == 0,
+ ("pmap_enter_l2: non-zero L2 entry %p", l2));
if (!ADDR_IS_KERNEL(va)) {
vm_page_free_pages_toq(&free, true);
- KASSERT(pmap_load(l2) == 0,
- ("pmap_enter_l2: non-zero L2 entry %p", l2));
} else {
KASSERT(SLIST_EMPTY(&free),
("pmap_enter_l2: freed kernel page table page"));
-
- /*
- * Both pmap_remove_l2() and pmap_remove_l3_range()
- * will leave the kernel page table page zero filled.
- * Nonetheless, the TLB could have an intermediate
- * entry for the kernel page table page, so request
- * an invalidation at all levels after clearing
- * the L2_TABLE entry.
- */
- mt = PTE_TO_VM_PAGE(pmap_load(l2));
- if (pmap_insert_pt_page(pmap, mt, false, false))
- panic("pmap_enter_l2: trie insert failed");
- pmap_clear(l2);
- pmap_s1_invalidate_page(pmap, va, false);
}
}
@@ -5804,6 +5831,15 @@ pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags,
if (!pmap_pv_insert_l2(pmap, va, new_l2, flags, lockp)) {
if (l2pg != NULL)
pmap_abort_ptp(pmap, va, l2pg);
+ else {
+ KASSERT(ADDR_IS_KERNEL(va) &&
+ (pmap_load(l2) & ATTR_DESCR_MASK) ==
+ L2_TABLE,
+ ("pmap_enter_l2: invalid kernel L2E"));
+ mt = pmap_remove_pt_page(pmap, va);
+ KASSERT(mt != NULL,
+ ("pmap_enter_l2: missing kernel PTP"));
+ }
if (uwptpg != NULL) {
mt = pmap_remove_pt_page(pmap, va);
KASSERT(mt == uwptpg,
@@ -8045,6 +8081,8 @@ pmap_unmapbios(void *p, vm_size_t size)
void
pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
{
+ if (m->md.pv_memattr == ma)
+ return;
m->md.pv_memattr = ma;
@@ -8424,8 +8462,8 @@ pmap_demote_l2_abort(pmap_t pmap, vm_offset_t va, pt_entry_t *l2,
struct spglist free;
SLIST_INIT(&free);
- (void)pmap_remove_l2(pmap, l2, va, pmap_load(pmap_l1(pmap, va)), &free,
- lockp);
+ (void)pmap_remove_l2(pmap, l2, va, pmap_load(pmap_l1(pmap, va)), true,
+ &free, lockp);
vm_page_free_pages_toq(&free, true);
}
@@ -8463,18 +8501,20 @@ pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2, vm_offset_t va,
/*
* Invalidate the 2MB page mapping and return "failure" if the
- * mapping was never accessed.
+ * mapping was never accessed and not wired.
*/
if ((oldl2 & ATTR_AF) == 0) {
- KASSERT((oldl2 & ATTR_SW_WIRED) == 0,
- ("pmap_demote_l2: a wired mapping is missing ATTR_AF"));
- pmap_demote_l2_abort(pmap, va, l2, lockp);
- CTR2(KTR_PMAP, "pmap_demote_l2: failure for va %#lx in pmap %p",
- va, pmap);
- goto fail;
- }
-
- if ((ml3 = pmap_remove_pt_page(pmap, va)) == NULL) {
+ if ((oldl2 & ATTR_SW_WIRED) == 0) {
+ pmap_demote_l2_abort(pmap, va, l2, lockp);
+ CTR2(KTR_PMAP,
+ "pmap_demote_l2: failure for va %#lx in pmap %p",
+ va, pmap);
+ goto fail;
+ }
+ ml3 = pmap_remove_pt_page(pmap, va);
+ /* Fill the PTP with L3Es that have ATTR_AF cleared. */
+ ml3->valid = 0;
+ } else if ((ml3 = pmap_remove_pt_page(pmap, va)) == NULL) {
KASSERT((oldl2 & ATTR_SW_WIRED) == 0,
("pmap_demote_l2: page table page for a wired mapping"
" is missing"));
@@ -8530,7 +8570,7 @@ pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2, vm_offset_t va,
/*
* If the PTP is not leftover from an earlier promotion or it does not
* have ATTR_AF set in every L3E, then fill it. The new L3Es will all
- * have ATTR_AF set.
+ * have ATTR_AF set, unless this is a wired mapping with ATTR_AF clear.
*
* When pmap_update_entry() clears the old L2 mapping, it (indirectly)
* performs a dsb(). That dsb() ensures that the stores for filling
diff --git a/sys/arm64/broadcom/genet/if_genet.c b/sys/arm64/broadcom/genet/if_genet.c
index 0602f076b257..182b5582fb7c 100644
--- a/sys/arm64/broadcom/genet/if_genet.c
+++ b/sys/arm64/broadcom/genet/if_genet.c
@@ -349,7 +349,7 @@ gen_attach(device_t dev)
}
/* If address was not found, create one based on the hostid and name. */
- if (eaddr_found == 0)
+ if (!eaddr_found)
ether_gen_addr(sc->ifp, &eaddr);
/* Attach ethernet interface */
ether_ifattach(sc->ifp, eaddr.octet);
@@ -653,7 +653,7 @@ gen_bus_dma_teardown(struct gen_softc *sc)
error);
}
- if (sc->tx_buf_tag != NULL) {
+ if (sc->rx_buf_tag != NULL) {
for (i = 0; i < RX_DESC_COUNT; i++) {
error = bus_dmamap_destroy(sc->rx_buf_tag,
sc->rx_ring_ent[i].map);
diff --git a/sys/arm64/include/vmm_dev.h b/sys/arm64/include/vmm_dev.h
index 938bea47c7f8..219f1116c728 100644
--- a/sys/arm64/include/vmm_dev.h
+++ b/sys/arm64/include/vmm_dev.h
@@ -27,6 +27,8 @@
#ifndef _VMM_DEV_H_
#define _VMM_DEV_H_
+#include <sys/domainset.h>
+
#include <machine/vmm.h>
struct vm_memmap {
@@ -49,6 +51,9 @@ struct vm_memseg {
int segid;
size_t len;
char name[VM_MAX_SUFFIXLEN + 1];
+ domainset_t *ds_mask;
+ size_t ds_mask_size;
+ int ds_policy;
};
struct vm_register {
diff --git a/sys/cam/cam_xpt.c b/sys/cam/cam_xpt.c
index 2ec736e7f4ac..cae29226d13c 100644
--- a/sys/cam/cam_xpt.c
+++ b/sys/cam/cam_xpt.c
@@ -2515,6 +2515,15 @@ xpt_action(union ccb *start_ccb)
("xpt_action: func %#x %s\n", start_ccb->ccb_h.func_code,
xpt_action_name(start_ccb->ccb_h.func_code)));
+ /*
+ * Either it isn't queued, or it has a real priority. There still too
+ * many places that reuse CCBs with a real priority to do immediate
+ * queries to do the other side of this assert.
+ */
+ KASSERT((start_ccb->ccb_h.func_code & XPT_FC_QUEUED) == 0 ||
+ start_ccb->ccb_h.pinfo.priority != CAM_PRIORITY_NONE,
+ ("%s: queued ccb and CAM_PRIORITY_NONE illegal.", __func__));
+
start_ccb->ccb_h.status = CAM_REQ_INPROG;
(*(start_ccb->ccb_h.path->bus->xport->ops->action))(start_ccb);
}
diff --git a/sys/cam/cam_xpt.h b/sys/cam/cam_xpt.h
index 3815f42cba40..efa6c823245a 100644
--- a/sys/cam/cam_xpt.h
+++ b/sys/cam/cam_xpt.h
@@ -151,7 +151,6 @@ void xpt_sim_poll(struct cam_sim *sim);
static inline void
xpt_path_inq(struct ccb_pathinq *cpi, struct cam_path *path)
{
-
bzero(cpi, sizeof(*cpi));
xpt_setup_ccb(&cpi->ccb_h, path, CAM_PRIORITY_NONE);
cpi->ccb_h.func_code = XPT_PATH_INQ;
@@ -165,7 +164,6 @@ xpt_path_inq(struct ccb_pathinq *cpi, struct cam_path *path)
static inline void
xpt_gdev_type(struct ccb_getdev *cgd, struct cam_path *path)
{
-
bzero(cgd, sizeof(*cgd));
xpt_setup_ccb(&cgd->ccb_h, path, CAM_PRIORITY_NONE);
cgd->ccb_h.func_code = XPT_GDEV_TYPE;
diff --git a/sys/cam/mmc/mmc_da.c b/sys/cam/mmc/mmc_da.c
index 7f8bf3516804..322141a72707 100644
--- a/sys/cam/mmc/mmc_da.c
+++ b/sys/cam/mmc/mmc_da.c
@@ -1081,7 +1081,7 @@ sdda_start_init_task(void *context, int pending)
CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("sdda_start_init_task\n"));
new_ccb = xpt_alloc_ccb();
xpt_setup_ccb(&new_ccb->ccb_h, periph->path,
- CAM_PRIORITY_NONE);
+ CAM_PRIORITY_NORMAL);
cam_periph_lock(periph);
cam_periph_hold(periph, PRIBIO|PCATCH);
diff --git a/sys/cam/mmc/mmc_xpt.c b/sys/cam/mmc/mmc_xpt.c
index 4fce03004994..f5f66f5214a8 100644
--- a/sys/cam/mmc/mmc_xpt.c
+++ b/sys/cam/mmc/mmc_xpt.c
@@ -610,7 +610,6 @@ mmcprobe_start(struct cam_periph *periph, union ccb *start_ccb)
CAM_DEBUG(start_ccb->ccb_h.path, CAM_DEBUG_PROBE, ("Start with PROBE_RESET\n"));
/* FALLTHROUGH */
case PROBE_IDENTIFY:
- xpt_path_inq(&start_ccb->cpi, periph->path);
CAM_DEBUG(start_ccb->ccb_h.path, CAM_DEBUG_PROBE, ("Start with PROBE_IDENTIFY\n"));
init_standard_ccb(start_ccb, XPT_MMC_GET_TRAN_SETTINGS);
break;
diff --git a/sys/cddl/boot/zfs/zfsimpl.h b/sys/cddl/boot/zfs/zfsimpl.h
index 0ce38384abbf..83d964360343 100644
--- a/sys/cddl/boot/zfs/zfsimpl.h
+++ b/sys/cddl/boot/zfs/zfsimpl.h
@@ -2019,6 +2019,7 @@ typedef struct vdev {
vdev_list_t v_children; /* children of this vdev */
const char *v_name; /* vdev name */
uint64_t v_guid; /* vdev guid */
+ uint64_t v_txg; /* most recent transaction */
uint64_t v_id; /* index in parent */
uint64_t v_psize; /* physical device capacity */
int v_ashift; /* offset to block shift */
@@ -2048,7 +2049,6 @@ typedef struct spa {
STAILQ_ENTRY(spa) spa_link; /* link in global pool list */
char *spa_name; /* pool name */
uint64_t spa_guid; /* pool guid */
- uint64_t spa_txg; /* most recent transaction */
struct uberblock *spa_uberblock; /* best uberblock so far */
vdev_t *spa_root_vdev; /* toplevel vdev container */
objset_phys_t *spa_mos; /* MOS for this pool */
diff --git a/sys/cddl/dev/sdt/sdt.c b/sys/cddl/dev/sdt/sdt.c
index a8da618204af..0a9059104671 100644
--- a/sys/cddl/dev/sdt/sdt.c
+++ b/sys/cddl/dev/sdt/sdt.c
@@ -72,6 +72,7 @@ static void sdt_load(void);
static int sdt_unload(void);
static void sdt_create_provider(struct sdt_provider *);
static void sdt_create_probe(struct sdt_probe *);
+static void sdt_init_probe(struct sdt_probe *, linker_file_t);
static void sdt_kld_load(void *, struct linker_file *);
static void sdt_kld_unload_try(void *, struct linker_file *, int *);
@@ -204,6 +205,14 @@ sdt_create_probe(struct sdt_probe *probe)
(void)dtrace_probe_create(prov->id, mod, func, name, aframes, probe);
}
+static void
+sdt_init_probe(struct sdt_probe *probe, linker_file_t lf)
+{
+ probe->sdtp_lf = lf;
+ TAILQ_INIT(&probe->argtype_list);
+ STAILQ_INIT(&probe->tracepoint_list);
+}
+
/*
* Probes are created through the SDT module load/unload hook, so this function
* has nothing to do. It only exists because the DTrace provider framework
@@ -361,12 +370,19 @@ static void
sdt_kld_load_providers(struct linker_file *lf)
{
struct sdt_provider **prov, **begin, **end;
+ struct sdt_probe **p_begin, **p_end;
if (linker_file_lookup_set(lf, "sdt_providers_set", &begin, &end,
NULL) == 0) {
for (prov = begin; prov < end; prov++)
sdt_create_provider(*prov);
}
+
+ if (linker_file_lookup_set(lf, "sdt_probes_set", &p_begin, &p_end,
+ NULL) == 0) {
+ for (struct sdt_probe **probe = p_begin; probe < p_end; probe++)
+ sdt_init_probe(*probe, lf);
+ }
}
static void
@@ -378,13 +394,8 @@ sdt_kld_load_probes(struct linker_file *lf)
if (linker_file_lookup_set(lf, "sdt_probes_set", &p_begin, &p_end,
NULL) == 0) {
- for (struct sdt_probe **probe = p_begin; probe < p_end;
- probe++) {
- (*probe)->sdtp_lf = lf;
+ for (struct sdt_probe **probe = p_begin; probe < p_end; probe++)
sdt_create_probe(*probe);
- TAILQ_INIT(&(*probe)->argtype_list);
- STAILQ_INIT(&(*probe)->tracepoint_list);
- }
}
if (linker_file_lookup_set(lf, "sdt_argtypes_set", &a_begin, &a_end,
diff --git a/sys/compat/linprocfs/linprocfs.c b/sys/compat/linprocfs/linprocfs.c
index cfb054235489..1c6d64d6b8bc 100644
--- a/sys/compat/linprocfs/linprocfs.c
+++ b/sys/compat/linprocfs/linprocfs.c
@@ -1911,7 +1911,7 @@ linprocfs_doproclimits(PFS_FILL_ARGS)
"kern.sigqueue.max_pending_per_proc",
&res, &size, 0, 0, 0, 0);
if (error != 0)
- goto out;
+ continue;
rl.rlim_cur = res;
rl.rlim_max = res;
break;
@@ -1919,7 +1919,7 @@ linprocfs_doproclimits(PFS_FILL_ARGS)
error = kernel_sysctlbyname(td,
"kern.ipc.msgmnb", &res, &size, 0, 0, 0, 0);
if (error != 0)
- goto out;
+ continue;
rl.rlim_cur = res;
rl.rlim_max = res;
break;
@@ -1941,9 +1941,9 @@ linprocfs_doproclimits(PFS_FILL_ARGS)
li->desc, (unsigned long long)rl.rlim_cur,
(unsigned long long)rl.rlim_max, li->unit);
}
-out:
+
lim_free(limp);
- return (error);
+ return (0);
}
/*
diff --git a/sys/compat/linux/linux_file.c b/sys/compat/linux/linux_file.c
index 86834a7ecea8..a4be5313aa96 100644
--- a/sys/compat/linux/linux_file.c
+++ b/sys/compat/linux/linux_file.c
@@ -1792,7 +1792,7 @@ linux_memfd_create(struct thread *td, struct linux_memfd_create_args *args)
if ((flags & MFD_ALLOW_SEALING) != 0)
shmflags |= SHM_ALLOW_SEALING;
return (kern_shm_open2(td, SHM_ANON, oflags, 0, shmflags, NULL,
- memfd_name));
+ memfd_name, NULL));
}
int
diff --git a/sys/compat/linuxkpi/common/include/linux/slab.h b/sys/compat/linuxkpi/common/include/linux/slab.h
index f3a840d9bf4b..efa5c8cb67b3 100644
--- a/sys/compat/linuxkpi/common/include/linux/slab.h
+++ b/sys/compat/linuxkpi/common/include/linux/slab.h
@@ -45,7 +45,7 @@
MALLOC_DECLARE(M_KMALLOC);
-#define kvzalloc(size, flags) kmalloc(size, (flags) | __GFP_ZERO)
+#define kvzalloc(size, flags) kvmalloc(size, (flags) | __GFP_ZERO)
#define kvcalloc(n, size, flags) kvmalloc_array(n, size, (flags) | __GFP_ZERO)
#define kzalloc(size, flags) kmalloc(size, (flags) | __GFP_ZERO)
#define kzalloc_node(size, flags, node) kmalloc_node(size, (flags) | __GFP_ZERO, node)
diff --git a/sys/compat/linuxkpi/common/src/linux_page.c b/sys/compat/linuxkpi/common/src/linux_page.c
index ebb92eacbf9a..628af17df853 100644
--- a/sys/compat/linuxkpi/common/src/linux_page.c
+++ b/sys/compat/linuxkpi/common/src/linux_page.c
@@ -106,6 +106,7 @@ linux_alloc_pages(gfp_t flags, unsigned int order)
if ((flags & M_ZERO) != 0)
req |= VM_ALLOC_ZERO;
+
if (order == 0 && (flags & GFP_DMA32) == 0) {
page = vm_page_alloc_noobj(req);
if (page == NULL)
@@ -113,6 +114,10 @@ linux_alloc_pages(gfp_t flags, unsigned int order)
} else {
vm_paddr_t pmax = (flags & GFP_DMA32) ?
BUS_SPACE_MAXADDR_32BIT : BUS_SPACE_MAXADDR;
+
+ if ((flags & __GFP_NORETRY) != 0)
+ req |= VM_ALLOC_NORECLAIM;
+
retry:
page = vm_page_alloc_noobj_contig(req, npages, 0, pmax,
PAGE_SIZE, 0, VM_MEMATTR_DEFAULT);
diff --git a/sys/conf/files b/sys/conf/files
index 74d251c2b608..b7c19fae0b8e 100644
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -3227,6 +3227,19 @@ dev/uart/uart_if.m optional uart
dev/uart/uart_subr.c optional uart
dev/uart/uart_tty.c optional uart
#
+# Universal Flash Storage Host Controller Interface drivers
+#
+dev/ufshci/ufshci.c optional ufshci
+dev/ufshci/ufshci_ctrlr.c optional ufshci
+dev/ufshci/ufshci_ctrlr_cmd.c optional ufshci
+dev/ufshci/ufshci_dev.c optional ufshci
+dev/ufshci/ufshci_pci.c optional ufshci
+dev/ufshci/ufshci_req_queue.c optional ufshci
+dev/ufshci/ufshci_req_sdb.c optional ufshci
+dev/ufshci/ufshci_sim.c optional ufshci
+dev/ufshci/ufshci_sysctl.c optional ufshci
+dev/ufshci/ufshci_uic_cmd.c optional ufshci
+#
# USB controller drivers
#
dev/usb/controller/musb_otg.c optional musb
@@ -3755,6 +3768,7 @@ gnu/gcov/gcov_subr.c optional gcov
kern/bus_if.m standard
kern/clock_if.m standard
+kern/coredump_vnode.c standard
kern/cpufreq_if.m standard
kern/device_if.m standard
kern/imgact_binmisc.c optional imgact_binmisc
@@ -3843,6 +3857,7 @@ kern/kern_time.c standard
kern/kern_timeout.c standard
kern/kern_tslog.c optional tslog
kern/kern_ubsan.c optional kubsan
+kern/kern_ucoredump.c standard
kern/kern_umtx.c standard
kern/kern_uuid.c standard
kern/kern_vnodedumper.c standard
diff --git a/sys/conf/files.amd64 b/sys/conf/files.amd64
index 0584fc29d039..80548320c3fc 100644
--- a/sys/conf/files.amd64
+++ b/sys/conf/files.amd64
@@ -84,8 +84,8 @@ amd64/amd64/xen-locore.S optional xenhvm \
amd64/amd64/machdep.c standard
amd64/amd64/mem.c optional mem
amd64/amd64/minidump_machdep.c standard
-amd64/amd64/mp_machdep.c optional smp
-amd64/amd64/mpboot.S optional smp
+amd64/amd64/mp_machdep.c standard
+amd64/amd64/mpboot.S standard
amd64/amd64/pmap.c standard
amd64/amd64/ptrace_machdep.c standard
amd64/amd64/support.S standard
@@ -191,6 +191,10 @@ dev/ice/irdma_di_if.m optional ice pci \
compile-with "${NORMAL_M} -I$S/dev/ice"
dev/ice/ice_ddp_common.c optional ice pci \
compile-with "${NORMAL_C} -I$S/dev/ice"
+dev/ice/ice_iov.c optional ice pci pci_iov \
+ compile-with "${NORMAL_C} -I$S/dev/ice"
+dev/ice/ice_vf_mbx.c optional ice pci pci_iov \
+ compile-with "${NORMAL_C} -I$S/dev/ice"
ice_ddp.c optional ice_ddp \
compile-with "${AWK} -f $S/tools/fw_stub.awk ice_ddp.fw:ice_ddp:0x01032900 -mice_ddp -c${.TARGET}" \
no-ctfconvert no-implicit-rule before-depend local \
diff --git a/sys/conf/files.arm64 b/sys/conf/files.arm64
index 901da27e63f2..641001efab5e 100644
--- a/sys/conf/files.arm64
+++ b/sys/conf/files.arm64
@@ -368,6 +368,10 @@ dev/ice/irdma_di_if.m optional ice pci \
compile-with "${NORMAL_M} -I$S/dev/ice"
dev/ice/ice_ddp_common.c optional ice pci \
compile-with "${NORMAL_C} -I$S/dev/ice"
+dev/ice/ice_iov.c optional ice pci pci_iov \
+ compile-with "${NORMAL_C} -I$S/dev/ice"
+dev/ice/ice_vf_mbx.c optional ice pci pci_iov \
+ compile-with "${NORMAL_C} -I$S/dev/ice"
ice_ddp.c optional ice_ddp \
compile-with "${AWK} -f $S/tools/fw_stub.awk ice_ddp.fw:ice_ddp:0x01032900 -mice_ddp -c${.TARGET}" \
no-ctfconvert no-implicit-rule before-depend local \
diff --git a/sys/conf/files.x86 b/sys/conf/files.x86
index df206b314b38..9976e9cfec5d 100644
--- a/sys/conf/files.x86
+++ b/sys/conf/files.x86
@@ -62,6 +62,7 @@ dev/acpi_support/acpi_wmi_if.m standard
dev/agp/agp_amd64.c optional agp
dev/agp/agp_i810.c optional agp
dev/agp/agp_via.c optional agp
+dev/amdsmu/amdsmu.c optional amdsmu pci
dev/amdsbwd/amdsbwd.c optional amdsbwd
dev/amdsmn/amdsmn.c optional amdsmn | amdtemp
dev/amdtemp/amdtemp.c optional amdtemp
diff --git a/sys/dev/amdsmu/amdsmu.c b/sys/dev/amdsmu/amdsmu.c
new file mode 100644
index 000000000000..416f875c6176
--- /dev/null
+++ b/sys/dev/amdsmu/amdsmu.c
@@ -0,0 +1,466 @@
+/*
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2025 The FreeBSD Foundation
+ *
+ * This software was developed by Aymeric Wibo <obiwac@freebsd.org>
+ * under sponsorship from the FreeBSD Foundation.
+ */
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/kernel.h>
+#include <sys/module.h>
+#include <sys/rman.h>
+#include <sys/sysctl.h>
+
+#include <dev/pci/pcivar.h>
+#include <dev/amdsmu/amdsmu.h>
+
+static bool
+amdsmu_match(device_t dev, const struct amdsmu_product **product_out)
+{
+ const uint16_t vendorid = pci_get_vendor(dev);
+ const uint16_t deviceid = pci_get_device(dev);
+
+ for (size_t i = 0; i < nitems(amdsmu_products); i++) {
+ const struct amdsmu_product *prod = &amdsmu_products[i];
+
+ if (vendorid == prod->amdsmu_vendorid &&
+ deviceid == prod->amdsmu_deviceid) {
+ if (product_out != NULL)
+ *product_out = prod;
+ return (true);
+ }
+ }
+ return (false);
+}
+
+static void
+amdsmu_identify(driver_t *driver, device_t parent)
+{
+ if (device_find_child(parent, "amdsmu", -1) != NULL)
+ return;
+
+ if (amdsmu_match(parent, NULL)) {
+ if (device_add_child(parent, "amdsmu", -1) == NULL)
+ device_printf(parent, "add amdsmu child failed\n");
+ }
+}
+
+static int
+amdsmu_probe(device_t dev)
+{
+ if (resource_disabled("amdsmu", 0))
+ return (ENXIO);
+ if (!amdsmu_match(device_get_parent(dev), NULL))
+ return (ENXIO);
+ device_set_descf(dev, "AMD System Management Unit");
+
+ return (BUS_PROBE_GENERIC);
+}
+
+static enum amdsmu_res
+amdsmu_wait_res(device_t dev)
+{
+ struct amdsmu_softc *sc = device_get_softc(dev);
+ enum amdsmu_res res;
+
+ /*
+ * The SMU has a response ready for us when the response register is
+ * set. Otherwise, we must wait.
+ */
+ for (size_t i = 0; i < SMU_RES_READ_MAX; i++) {
+ res = amdsmu_read4(sc, SMU_REG_RESPONSE);
+ if (res != SMU_RES_WAIT)
+ return (res);
+ pause_sbt("amdsmu", ustosbt(SMU_RES_READ_PERIOD_US), 0,
+ C_HARDCLOCK);
+ }
+ device_printf(dev, "timed out waiting for response from SMU\n");
+ return (SMU_RES_WAIT);
+}
+
+static int
+amdsmu_cmd(device_t dev, enum amdsmu_msg msg, uint32_t arg, uint32_t *ret)
+{
+ struct amdsmu_softc *sc = device_get_softc(dev);
+ enum amdsmu_res res;
+
+ /* Wait for SMU to be ready. */
+ if (amdsmu_wait_res(dev) == SMU_RES_WAIT)
+ return (ETIMEDOUT);
+
+ /* Clear previous response. */
+ amdsmu_write4(sc, SMU_REG_RESPONSE, SMU_RES_WAIT);
+
+ /* Write out command to registers. */
+ amdsmu_write4(sc, SMU_REG_MESSAGE, msg);
+ amdsmu_write4(sc, SMU_REG_ARGUMENT, arg);
+
+ /* Wait for SMU response and handle it. */
+ res = amdsmu_wait_res(dev);
+
+ switch (res) {
+ case SMU_RES_WAIT:
+ return (ETIMEDOUT);
+ case SMU_RES_OK:
+ if (ret != NULL)
+ *ret = amdsmu_read4(sc, SMU_REG_ARGUMENT);
+ return (0);
+ case SMU_RES_REJECT_BUSY:
+ device_printf(dev, "SMU is busy\n");
+ return (EBUSY);
+ case SMU_RES_REJECT_PREREQ:
+ case SMU_RES_UNKNOWN:
+ case SMU_RES_FAILED:
+ device_printf(dev, "SMU error: %02x\n", res);
+ return (EIO);
+ }
+
+ return (EINVAL);
+}
+
+static int
+amdsmu_get_vers(device_t dev)
+{
+ int err;
+ uint32_t smu_vers;
+ struct amdsmu_softc *sc = device_get_softc(dev);
+
+ err = amdsmu_cmd(dev, SMU_MSG_GETSMUVERSION, 0, &smu_vers);
+ if (err != 0) {
+ device_printf(dev, "failed to get SMU version\n");
+ return (err);
+ }
+ sc->smu_program = (smu_vers >> 24) & 0xFF;
+ sc->smu_maj = (smu_vers >> 16) & 0xFF;
+ sc->smu_min = (smu_vers >> 8) & 0xFF;
+ sc->smu_rev = smu_vers & 0xFF;
+ device_printf(dev, "SMU version: %d.%d.%d (program %d)\n",
+ sc->smu_maj, sc->smu_min, sc->smu_rev, sc->smu_program);
+
+ return (0);
+}
+
+static int
+amdsmu_get_ip_blocks(device_t dev)
+{
+ struct amdsmu_softc *sc = device_get_softc(dev);
+ const uint16_t deviceid = pci_get_device(dev);
+ int err;
+ struct amdsmu_metrics *m = &sc->metrics;
+ bool active;
+ char sysctl_descr[32];
+
+ /* Get IP block count. */
+ switch (deviceid) {
+ case PCI_DEVICEID_AMD_REMBRANDT_ROOT:
+ sc->ip_block_count = 12;
+ break;
+ case PCI_DEVICEID_AMD_PHOENIX_ROOT:
+ sc->ip_block_count = 21;
+ break;
+ /* TODO How many IP blocks does Strix Point (and the others) have? */
+ case PCI_DEVICEID_AMD_STRIX_POINT_ROOT:
+ default:
+ sc->ip_block_count = nitems(amdsmu_ip_blocks_names);
+ }
+ KASSERT(sc->ip_block_count <= nitems(amdsmu_ip_blocks_names),
+ ("too many IP blocks for array"));
+
+ /* Get and print out IP blocks. */
+ err = amdsmu_cmd(dev, SMU_MSG_GET_SUP_CONSTRAINTS, 0,
+ &sc->active_ip_blocks);
+ if (err != 0) {
+ device_printf(dev, "failed to get IP blocks\n");
+ return (err);
+ }
+ device_printf(dev, "Active IP blocks: ");
+ for (size_t i = 0; i < sc->ip_block_count; i++) {
+ active = (sc->active_ip_blocks & (1 << i)) != 0;
+ sc->ip_blocks_active[i] = active;
+ if (!active)
+ continue;
+ printf("%s%s", amdsmu_ip_blocks_names[i],
+ i + 1 < sc->ip_block_count ? " " : "\n");
+ }
+
+ /* Create a sysctl node for IP blocks. */
+ sc->ip_blocks_sysctlnode = SYSCTL_ADD_NODE(sc->sysctlctx,
+ SYSCTL_CHILDREN(sc->sysctlnode), OID_AUTO, "ip_blocks",
+ CTLFLAG_RD, NULL, "SMU metrics");
+ if (sc->ip_blocks_sysctlnode == NULL) {
+ device_printf(dev, "could not add sysctl node for IP blocks\n");
+ return (ENOMEM);
+ }
+
+ /* Create a sysctl node for each IP block. */
+ for (size_t i = 0; i < sc->ip_block_count; i++) {
+ /* Create the sysctl node itself for the IP block. */
+ snprintf(sysctl_descr, sizeof sysctl_descr,
+ "Metrics about the %s AMD IP block",
+ amdsmu_ip_blocks_names[i]);
+ sc->ip_block_sysctlnodes[i] = SYSCTL_ADD_NODE(sc->sysctlctx,
+ SYSCTL_CHILDREN(sc->ip_blocks_sysctlnode), OID_AUTO,
+ amdsmu_ip_blocks_names[i], CTLFLAG_RD, NULL, sysctl_descr);
+ if (sc->ip_block_sysctlnodes[i] == NULL) {
+ device_printf(dev,
+ "could not add sysctl node for \"%s\"\n", sysctl_descr);
+ continue;
+ }
+ /*
+ * Create sysctls for if the IP block is currently active, last
+ * active time, and total active time.
+ */
+ SYSCTL_ADD_BOOL(sc->sysctlctx,
+ SYSCTL_CHILDREN(sc->ip_block_sysctlnodes[i]), OID_AUTO,
+ "active", CTLFLAG_RD, &sc->ip_blocks_active[i], 0,
+ "IP block is currently active");
+ SYSCTL_ADD_U64(sc->sysctlctx,
+ SYSCTL_CHILDREN(sc->ip_block_sysctlnodes[i]), OID_AUTO,
+ "last_time", CTLFLAG_RD, &m->ip_block_last_active_time[i],
+ 0, "How long the IP block was active for during the last"
+ " sleep (us)");
+#ifdef IP_BLOCK_TOTAL_ACTIVE_TIME
+ SYSCTL_ADD_U64(sc->sysctlctx,
+ SYSCTL_CHILDREN(sc->ip_block_sysctlnodes[i]), OID_AUTO,
+ "total_time", CTLFLAG_RD, &m->ip_block_total_active_time[i],
+ 0, "How long the IP block was active for during sleep in"
+ " total (us)");
+#endif
+ }
+ return (0);
+}
+
+static int
+amdsmu_init_metrics(device_t dev)
+{
+ struct amdsmu_softc *sc = device_get_softc(dev);
+ int err;
+ uint32_t metrics_addr_lo, metrics_addr_hi;
+ uint64_t metrics_addr;
+
+ /* Get physical address of logging buffer. */
+ err = amdsmu_cmd(dev, SMU_MSG_LOG_GETDRAM_ADDR_LO, 0, &metrics_addr_lo);
+ if (err != 0)
+ return (err);
+ err = amdsmu_cmd(dev, SMU_MSG_LOG_GETDRAM_ADDR_HI, 0, &metrics_addr_hi);
+ if (err != 0)
+ return (err);
+ metrics_addr = ((uint64_t) metrics_addr_hi << 32) | metrics_addr_lo;
+
+ /* Map memory of logging buffer. */
+ err = bus_space_map(sc->bus_tag, metrics_addr,
+ sizeof(struct amdsmu_metrics), 0, &sc->metrics_space);
+ if (err != 0) {
+ device_printf(dev, "could not map bus space for SMU metrics\n");
+ return (err);
+ }
+
+ /* Start logging for metrics. */
+ amdsmu_cmd(dev, SMU_MSG_LOG_RESET, 0, NULL);
+ amdsmu_cmd(dev, SMU_MSG_LOG_START, 0, NULL);
+ return (0);
+}
+
+static int
+amdsmu_dump_metrics(device_t dev)
+{
+ struct amdsmu_softc *sc = device_get_softc(dev);
+ int err;
+
+ err = amdsmu_cmd(dev, SMU_MSG_LOG_DUMP_DATA, 0, NULL);
+ if (err != 0) {
+ device_printf(dev, "failed to dump metrics\n");
+ return (err);
+ }
+ bus_space_read_region_4(sc->bus_tag, sc->metrics_space, 0,
+ (uint32_t *)&sc->metrics, sizeof(sc->metrics) / sizeof(uint32_t));
+
+ return (0);
+}
+
+static void
+amdsmu_fetch_idlemask(device_t dev)
+{
+ struct amdsmu_softc *sc = device_get_softc(dev);
+
+ sc->idlemask = amdsmu_read4(sc, SMU_REG_IDLEMASK);
+}
+
+static int
+amdsmu_attach(device_t dev)
+{
+ struct amdsmu_softc *sc = device_get_softc(dev);
+ int err;
+ uint32_t physbase_addr_lo, physbase_addr_hi;
+ uint64_t physbase_addr;
+ int rid = 0;
+ struct sysctl_oid *node;
+
+ /*
+ * Find physical base address for SMU.
+ * XXX I am a little confused about the masks here. I'm just copying
+ * what Linux does in the amd-pmc driver to get the base address.
+ */
+ pci_write_config(dev, SMU_INDEX_ADDRESS, SMU_PHYSBASE_ADDR_LO, 4);
+ physbase_addr_lo = pci_read_config(dev, SMU_INDEX_DATA, 4) & 0xFFF00000;
+
+ pci_write_config(dev, SMU_INDEX_ADDRESS, SMU_PHYSBASE_ADDR_HI, 4);
+ physbase_addr_hi = pci_read_config(dev, SMU_INDEX_DATA, 4) & 0x0000FFFF;
+
+ physbase_addr = (uint64_t)physbase_addr_hi << 32 | physbase_addr_lo;
+
+ /* Map memory for SMU and its registers. */
+ sc->res = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, RF_ACTIVE);
+ if (sc->res == NULL) {
+ device_printf(dev, "could not allocate resource\n");
+ return (ENXIO);
+ }
+
+ sc->bus_tag = rman_get_bustag(sc->res);
+
+ if (bus_space_map(sc->bus_tag, physbase_addr,
+ SMU_MEM_SIZE, 0, &sc->smu_space) != 0) {
+ device_printf(dev, "could not map bus space for SMU\n");
+ err = ENXIO;
+ goto err_smu_space;
+ }
+ if (bus_space_map(sc->bus_tag, physbase_addr + SMU_REG_SPACE_OFF,
+ SMU_MEM_SIZE, 0, &sc->reg_space) != 0) {
+ device_printf(dev, "could not map bus space for SMU regs\n");
+ err = ENXIO;
+ goto err_reg_space;
+ }
+
+ /* sysctl stuff. */
+ sc->sysctlctx = device_get_sysctl_ctx(dev);
+ sc->sysctlnode = device_get_sysctl_tree(dev);
+
+ /* Get version & add sysctls. */
+ if ((err = amdsmu_get_vers(dev)) != 0)
+ goto err_dump;
+
+ SYSCTL_ADD_U8(sc->sysctlctx, SYSCTL_CHILDREN(sc->sysctlnode), OID_AUTO,
+ "program", CTLFLAG_RD, &sc->smu_program, 0, "SMU program number");
+ SYSCTL_ADD_U8(sc->sysctlctx, SYSCTL_CHILDREN(sc->sysctlnode), OID_AUTO,
+ "version_major", CTLFLAG_RD, &sc->smu_maj, 0,
+ "SMU firmware major version number");
+ SYSCTL_ADD_U8(sc->sysctlctx, SYSCTL_CHILDREN(sc->sysctlnode), OID_AUTO,
+ "version_minor", CTLFLAG_RD, &sc->smu_min, 0,
+ "SMU firmware minor version number");
+ SYSCTL_ADD_U8(sc->sysctlctx, SYSCTL_CHILDREN(sc->sysctlnode), OID_AUTO,
+ "version_revision", CTLFLAG_RD, &sc->smu_rev, 0,
+ "SMU firmware revision number");
+
+ /* Set up for getting metrics & add sysctls. */
+ if ((err = amdsmu_init_metrics(dev)) != 0)
+ goto err_dump;
+ if ((err = amdsmu_dump_metrics(dev)) != 0)
+ goto err_dump;
+
+ node = SYSCTL_ADD_NODE(sc->sysctlctx, SYSCTL_CHILDREN(sc->sysctlnode),
+ OID_AUTO, "metrics", CTLFLAG_RD, NULL, "SMU metrics");
+ if (node == NULL) {
+ device_printf(dev, "could not add sysctl node for metrics\n");
+ err = ENOMEM;
+ goto err_dump;
+ }
+
+ SYSCTL_ADD_U32(sc->sysctlctx, SYSCTL_CHILDREN(node), OID_AUTO,
+ "table_version", CTLFLAG_RD, &sc->metrics.table_version, 0,
+ "SMU metrics table version");
+ SYSCTL_ADD_U32(sc->sysctlctx, SYSCTL_CHILDREN(node), OID_AUTO,
+ "hint_count", CTLFLAG_RD, &sc->metrics.hint_count, 0,
+ "How many times the sleep hint was set");
+ SYSCTL_ADD_U32(sc->sysctlctx, SYSCTL_CHILDREN(node), OID_AUTO,
+ "s0i3_last_entry_status", CTLFLAG_RD,
+ &sc->metrics.s0i3_last_entry_status, 0,
+ "1 if last S0i3 entry was successful");
+ SYSCTL_ADD_U32(sc->sysctlctx, SYSCTL_CHILDREN(node), OID_AUTO,
+ "time_last_in_s0i2", CTLFLAG_RD, &sc->metrics.time_last_in_s0i2, 0,
+ "Time spent in S0i2 during last sleep (us)");
+ SYSCTL_ADD_U64(sc->sysctlctx, SYSCTL_CHILDREN(node), OID_AUTO,
+ "time_last_entering_s0i3", CTLFLAG_RD,
+ &sc->metrics.time_last_entering_s0i3, 0,
+ "Time spent entering S0i3 during last sleep (us)");
+ SYSCTL_ADD_U64(sc->sysctlctx, SYSCTL_CHILDREN(node), OID_AUTO,
+ "total_time_entering_s0i3", CTLFLAG_RD,
+ &sc->metrics.total_time_entering_s0i3, 0,
+ "Total time spent entering S0i3 (us)");
+ SYSCTL_ADD_U64(sc->sysctlctx, SYSCTL_CHILDREN(node), OID_AUTO,
+ "time_last_resuming", CTLFLAG_RD, &sc->metrics.time_last_resuming,
+ 0, "Time spent resuming from last sleep (us)");
+ SYSCTL_ADD_U64(sc->sysctlctx, SYSCTL_CHILDREN(node), OID_AUTO,
+ "total_time_resuming", CTLFLAG_RD, &sc->metrics.total_time_resuming,
+ 0, "Total time spent resuming from sleep (us)");
+ SYSCTL_ADD_U64(sc->sysctlctx, SYSCTL_CHILDREN(node), OID_AUTO,
+ "time_last_in_s0i3", CTLFLAG_RD, &sc->metrics.time_last_in_s0i3, 0,
+ "Time spent in S0i3 during last sleep (us)");
+ SYSCTL_ADD_U64(sc->sysctlctx, SYSCTL_CHILDREN(node), OID_AUTO,
+ "total_time_in_s0i3", CTLFLAG_RD, &sc->metrics.total_time_in_s0i3,
+ 0, "Total time spent in S0i3 (us)");
+ SYSCTL_ADD_U64(sc->sysctlctx, SYSCTL_CHILDREN(node), OID_AUTO,
+ "time_last_in_sw_drips", CTLFLAG_RD,
+ &sc->metrics.time_last_in_sw_drips, 0,
+ "Time spent in awake during last sleep (us)");
+ SYSCTL_ADD_U64(sc->sysctlctx, SYSCTL_CHILDREN(node), OID_AUTO,
+ "total_time_in_sw_drips", CTLFLAG_RD,
+ &sc->metrics.total_time_in_sw_drips, 0,
+ "Total time spent awake (us)");
+
+ /* Get IP blocks & add sysctls. */
+ err = amdsmu_get_ip_blocks(dev);
+ if (err != 0)
+ goto err_dump;
+
+ /* Get idlemask & add sysctl. */
+ amdsmu_fetch_idlemask(dev);
+ SYSCTL_ADD_U32(sc->sysctlctx, SYSCTL_CHILDREN(sc->sysctlnode), OID_AUTO,
+ "idlemask", CTLFLAG_RD, &sc->idlemask, 0, "SMU idlemask. This "
+ "value is not documented - only used to help AMD internally debug "
+ "issues");
+
+ return (0);
+err_dump:
+ bus_space_unmap(sc->bus_tag, sc->reg_space, SMU_MEM_SIZE);
+err_reg_space:
+ bus_space_unmap(sc->bus_tag, sc->smu_space, SMU_MEM_SIZE);
+err_smu_space:
+ bus_release_resource(dev, SYS_RES_MEMORY, rid, sc->res);
+ return (err);
+}
+
+static int
+amdsmu_detach(device_t dev)
+{
+ struct amdsmu_softc *sc = device_get_softc(dev);
+ int rid = 0;
+
+ bus_space_unmap(sc->bus_tag, sc->smu_space, SMU_MEM_SIZE);
+ bus_space_unmap(sc->bus_tag, sc->reg_space, SMU_MEM_SIZE);
+
+ bus_release_resource(dev, SYS_RES_MEMORY, rid, sc->res);
+ return (0);
+}
+
+static device_method_t amdsmu_methods[] = {
+ DEVMETHOD(device_identify, amdsmu_identify),
+ DEVMETHOD(device_probe, amdsmu_probe),
+ DEVMETHOD(device_attach, amdsmu_attach),
+ DEVMETHOD(device_detach, amdsmu_detach),
+ DEVMETHOD_END
+};
+
+static driver_t amdsmu_driver = {
+ "amdsmu",
+ amdsmu_methods,
+ sizeof(struct amdsmu_softc),
+};
+
+DRIVER_MODULE(amdsmu, hostb, amdsmu_driver, NULL, NULL);
+MODULE_VERSION(amdsmu, 1);
+MODULE_DEPEND(amdsmu, amdsmn, 1, 1, 1);
+MODULE_PNP_INFO("U16:vendor;U16:device", pci, amdsmu, amdsmu_products,
+ nitems(amdsmu_products));
diff --git a/sys/dev/amdsmu/amdsmu.h b/sys/dev/amdsmu/amdsmu.h
new file mode 100644
index 000000000000..025887f7fe5a
--- /dev/null
+++ b/sys/dev/amdsmu/amdsmu.h
@@ -0,0 +1,95 @@
+/*
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2025 The FreeBSD Foundation
+ *
+ * This software was developed by Aymeric Wibo <obiwac@freebsd.org>
+ * under sponsorship from the FreeBSD Foundation.
+ */
+#ifndef _AMDSMU_H_
+#define _AMDSMU_H_
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/kernel.h>
+#include <machine/bus.h>
+#include <x86/cputypes.h>
+
+#include <dev/amdsmu/amdsmu_reg.h>
+
+#define SMU_RES_READ_PERIOD_US 50
+#define SMU_RES_READ_MAX 20000
+
+static const struct amdsmu_product {
+ uint16_t amdsmu_vendorid;
+ uint16_t amdsmu_deviceid;
+} amdsmu_products[] = {
+ { CPU_VENDOR_AMD, PCI_DEVICEID_AMD_REMBRANDT_ROOT },
+ { CPU_VENDOR_AMD, PCI_DEVICEID_AMD_PHOENIX_ROOT },
+ { CPU_VENDOR_AMD, PCI_DEVICEID_AMD_STRIX_POINT_ROOT },
+};
+
+static const char *const amdsmu_ip_blocks_names[] = {
+ "DISPLAY",
+ "CPU",
+ "GFX",
+ "VDD",
+ "ACP",
+ "VCN",
+ "ISP",
+ "NBIO",
+ "DF",
+ "USB3_0",
+ "USB3_1",
+ "LAPIC",
+ "USB3_2",
+ "USB3_3",
+ "USB3_4",
+ "USB4_0",
+ "USB4_1",
+ "MPM",
+ "JPEG",
+ "IPU",
+ "UMSCH",
+ "VPE",
+};
+
+CTASSERT(nitems(amdsmu_ip_blocks_names) <= 32);
+
+struct amdsmu_softc {
+ struct sysctl_ctx_list *sysctlctx;
+ struct sysctl_oid *sysctlnode;
+
+ struct resource *res;
+ bus_space_tag_t bus_tag;
+
+ bus_space_handle_t smu_space;
+ bus_space_handle_t reg_space;
+
+ uint8_t smu_program;
+ uint8_t smu_maj, smu_min, smu_rev;
+
+ uint32_t active_ip_blocks;
+ struct sysctl_oid *ip_blocks_sysctlnode;
+ size_t ip_block_count;
+ struct sysctl_oid *ip_block_sysctlnodes[nitems(amdsmu_ip_blocks_names)];
+ bool ip_blocks_active[nitems(amdsmu_ip_blocks_names)];
+
+ bus_space_handle_t metrics_space;
+ struct amdsmu_metrics metrics;
+ uint32_t idlemask;
+};
+
+static inline uint32_t
+amdsmu_read4(const struct amdsmu_softc *sc, bus_size_t reg)
+{
+ return (bus_space_read_4(sc->bus_tag, sc->reg_space, reg));
+}
+
+static inline void
+amdsmu_write4(const struct amdsmu_softc *sc, bus_size_t reg, uint32_t val)
+{
+ bus_space_write_4(sc->bus_tag, sc->reg_space, reg, val);
+}
+
+#endif /* _AMDSMU_H_ */
diff --git a/sys/dev/amdsmu/amdsmu_reg.h b/sys/dev/amdsmu/amdsmu_reg.h
new file mode 100644
index 000000000000..e685b34e6883
--- /dev/null
+++ b/sys/dev/amdsmu/amdsmu_reg.h
@@ -0,0 +1,84 @@
+/*
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2025 The FreeBSD Foundation
+ *
+ * This software was developed by Aymeric Wibo <obiwac@freebsd.org>
+ * under sponsorship from the FreeBSD Foundation.
+ */
+#ifndef _AMDSMU_REG_H_
+#define _AMDSMU_REG_H_
+
+#include <sys/types.h>
+
+/*
+ * TODO These are in common with amdtemp; should we find a way to factor these
+ * out? Also, there are way more of these. I couldn't find a centralized place
+ * which lists them though.
+ */
+#define PCI_DEVICEID_AMD_REMBRANDT_ROOT 0x14B5
+#define PCI_DEVICEID_AMD_PHOENIX_ROOT 0x14E8
+#define PCI_DEVICEID_AMD_STRIX_POINT_ROOT 0x14A4
+
+#define SMU_INDEX_ADDRESS 0xB8
+#define SMU_INDEX_DATA 0xBC
+
+#define SMU_PHYSBASE_ADDR_LO 0x13B102E8
+#define SMU_PHYSBASE_ADDR_HI 0x13B102EC
+
+#define SMU_MEM_SIZE 0x1000
+#define SMU_REG_SPACE_OFF 0x10000
+
+#define SMU_REG_MESSAGE 0x538
+#define SMU_REG_RESPONSE 0x980
+#define SMU_REG_ARGUMENT 0x9BC
+#define SMU_REG_IDLEMASK 0xD14
+
+enum amdsmu_res {
+ SMU_RES_WAIT = 0x00,
+ SMU_RES_OK = 0x01,
+ SMU_RES_REJECT_BUSY = 0xFC,
+ SMU_RES_REJECT_PREREQ = 0xFD,
+ SMU_RES_UNKNOWN = 0xFE,
+ SMU_RES_FAILED = 0xFF,
+};
+
+enum amdsmu_msg {
+ SMU_MSG_GETSMUVERSION = 0x02,
+ SMU_MSG_LOG_GETDRAM_ADDR_HI = 0x04,
+ SMU_MSG_LOG_GETDRAM_ADDR_LO = 0x05,
+ SMU_MSG_LOG_START = 0x06,
+ SMU_MSG_LOG_RESET = 0x07,
+ SMU_MSG_LOG_DUMP_DATA = 0x08,
+ SMU_MSG_GET_SUP_CONSTRAINTS = 0x09,
+};
+
+/* XXX Copied from Linux struct smu_metrics. */
+struct amdsmu_metrics {
+ uint32_t table_version;
+ uint32_t hint_count;
+ uint32_t s0i3_last_entry_status;
+ uint32_t time_last_in_s0i2;
+ uint64_t time_last_entering_s0i3;
+ uint64_t total_time_entering_s0i3;
+ uint64_t time_last_resuming;
+ uint64_t total_time_resuming;
+ uint64_t time_last_in_s0i3;
+ uint64_t total_time_in_s0i3;
+ uint64_t time_last_in_sw_drips;
+ uint64_t total_time_in_sw_drips;
+ /*
+ * This is how long each IP block was active for (us), i.e., blocking
+ * entry to S0i3. In Linux, these are called "timecondition_notmet_*".
+ *
+ * XXX Total active time for IP blocks seems to be buggy and reporting
+ * garbage (at least on Phoenix), so it's disabled for now. The last
+ * active time for the USB4_0 IP block also seems to be buggy.
+ */
+ uint64_t ip_block_last_active_time[32];
+#ifdef IP_BLOCK_TOTAL_ACTIVE_TIME
+ uint64_t ip_block_total_active_time[32];
+#endif
+} __attribute__((packed));
+
+#endif /* _AMDSMU_REG_H_ */
diff --git a/sys/dev/cxgbe/tom/t4_cpl_io.c b/sys/dev/cxgbe/tom/t4_cpl_io.c
index 8547f21586e1..7a6b1cbdd736 100644
--- a/sys/dev/cxgbe/tom/t4_cpl_io.c
+++ b/sys/dev/cxgbe/tom/t4_cpl_io.c
@@ -703,7 +703,7 @@ t4_push_frames(struct adapter *sc, struct toepcb *toep, int drop)
for (m = sndptr; m != NULL; m = m->m_next) {
int n;
- if ((m->m_flags & M_NOTAVAIL) != 0)
+ if ((m->m_flags & M_NOTREADY) != 0)
break;
if (m->m_flags & M_EXTPG) {
#ifdef KERN_TLS
@@ -787,7 +787,7 @@ t4_push_frames(struct adapter *sc, struct toepcb *toep, int drop)
/* nothing to send */
if (plen == 0) {
- KASSERT(m == NULL || (m->m_flags & M_NOTAVAIL) != 0,
+ KASSERT(m == NULL || (m->m_flags & M_NOTREADY) != 0,
("%s: nothing to send, but m != NULL is ready",
__func__));
break;
@@ -880,7 +880,7 @@ t4_push_frames(struct adapter *sc, struct toepcb *toep, int drop)
toep->txsd_avail--;
t4_l2t_send(sc, wr, toep->l2te);
- } while (m != NULL && (m->m_flags & M_NOTAVAIL) == 0);
+ } while (m != NULL && (m->m_flags & M_NOTREADY) == 0);
/* Send a FIN if requested, but only if there's no more data to send */
if (m == NULL && toep->flags & TPF_SEND_FIN)
diff --git a/sys/dev/cxgbe/tom/t4_tls.c b/sys/dev/cxgbe/tom/t4_tls.c
index c6377980fca9..27c16b9988ae 100644
--- a/sys/dev/cxgbe/tom/t4_tls.c
+++ b/sys/dev/cxgbe/tom/t4_tls.c
@@ -563,7 +563,7 @@ t4_push_ktls(struct adapter *sc, struct toepcb *toep, int drop)
* If there is no ready data to send, wait until more
* data arrives.
*/
- if (m == NULL || (m->m_flags & M_NOTAVAIL) != 0) {
+ if (m == NULL || (m->m_flags & M_NOTREADY) != 0) {
if (sowwakeup)
sowwakeup_locked(so);
else
@@ -614,7 +614,7 @@ t4_push_ktls(struct adapter *sc, struct toepcb *toep, int drop)
/* Shove if there is no additional data pending. */
shove = ((m->m_next == NULL ||
- (m->m_next->m_flags & M_NOTAVAIL) != 0)) &&
+ (m->m_next->m_flags & M_NOTREADY) != 0)) &&
(tp->t_flags & TF_MORETOCOME) == 0;
if (sb->sb_flags & SB_AUTOSIZE &&
diff --git a/sys/dev/drm2/drm_fb_helper.c b/sys/dev/drm2/drm_fb_helper.c
index f67cc9f60d02..1f4abd255690 100644
--- a/sys/dev/drm2/drm_fb_helper.c
+++ b/sys/dev/drm2/drm_fb_helper.c
@@ -51,7 +51,7 @@ struct vt_kms_softc {
struct task fb_mode_task;
};
-/* Call restore out of vt(9) locks. */
+/* Call restore out of vt(4) locks. */
static void
vt_restore_fbdev_mode(void *arg, int pending)
{
diff --git a/sys/dev/efidev/efirt.c b/sys/dev/efidev/efirt.c
index b0fa33daeca7..b55c1c191077 100644
--- a/sys/dev/efidev/efirt.c
+++ b/sys/dev/efidev/efirt.c
@@ -107,7 +107,8 @@ static int efi_status2err[25] = {
enum efi_table_type {
TYPE_ESRT = 0,
- TYPE_PROP
+ TYPE_PROP,
+ TYPE_MEMORY_ATTR
};
static int efi_enter(void);
@@ -445,6 +446,42 @@ get_table_length(enum efi_table_type type, size_t *table_len, void **taddr)
free(buf, M_TEMP);
return (0);
}
+ case TYPE_MEMORY_ATTR:
+ {
+ efi_guid_t guid = EFI_MEMORY_ATTRIBUTES_TABLE;
+ struct efi_memory_attribute_table *tbl_addr, *mem_addr;
+ int error;
+ void *buf;
+ size_t len = sizeof(struct efi_memory_attribute_table);
+
+ error = efi_get_table(&guid, (void **)&tbl_addr);
+ if (error)
+ return (error);
+
+ buf = malloc(len, M_TEMP, M_WAITOK);
+ error = physcopyout((vm_paddr_t)tbl_addr, buf, len);
+ if (error) {
+ free(buf, M_TEMP);
+ return (error);
+ }
+
+ mem_addr = (struct efi_memory_attribute_table *)buf;
+ if (mem_addr->version != 2) {
+ free(buf, M_TEMP);
+ return (EINVAL);
+ }
+ len += mem_addr->descriptor_size * mem_addr->num_ents;
+ if (len > EFI_TABLE_ALLOC_MAX) {
+ free(buf, M_TEMP);
+ return (ENOMEM);
+ }
+
+ *table_len = len;
+ if (taddr != NULL)
+ *taddr = tbl_addr;
+ free(buf, M_TEMP);
+ return (0);
+ }
}
return (ENOENT);
}
@@ -457,7 +494,8 @@ copy_table(efi_guid_t *guid, void **buf, size_t buf_len, size_t *table_len)
enum efi_table_type type;
} tables[] = {
{ EFI_TABLE_ESRT, TYPE_ESRT },
- { EFI_PROPERTIES_TABLE, TYPE_PROP }
+ { EFI_PROPERTIES_TABLE, TYPE_PROP },
+ { EFI_MEMORY_ATTRIBUTES_TABLE, TYPE_MEMORY_ATTR }
};
size_t table_idx;
void *taddr;
diff --git a/sys/dev/gpio/acpi_gpiobus.c b/sys/dev/gpio/acpi_gpiobus.c
index f9468e0deda0..94f4e5771266 100644
--- a/sys/dev/gpio/acpi_gpiobus.c
+++ b/sys/dev/gpio/acpi_gpiobus.c
@@ -357,7 +357,7 @@ acpi_gpiobus_attach(device_t dev)
status = AcpiWalkResources(handle, "_AEI", acpi_gpiobus_enumerate_aei,
&ctx);
- if (ACPI_FAILURE(status))
+ if (ACPI_FAILURE(status) && status != AE_NOT_FOUND)
device_printf(dev, "Failed to enumerate AEI resources\n");
return (0);
diff --git a/sys/dev/gpio/gpiobus.c b/sys/dev/gpio/gpiobus.c
index ab7f13177969..764bcb7e6ee8 100644
--- a/sys/dev/gpio/gpiobus.c
+++ b/sys/dev/gpio/gpiobus.c
@@ -110,10 +110,9 @@ gpio_alloc_intr_resource(device_t consumer_dev, int *rid, u_int alloc_flags,
res = bus_alloc_resource(consumer_dev, SYS_RES_IRQ, rid, irq, irq, 1,
alloc_flags);
if (res == NULL) {
- intr_free_intr_map_data((struct intr_map_data *)gpio_data);
+ intr_unmap_irq(irq);
return (NULL);
}
- rman_set_virtual(res, gpio_data);
return (res);
}
#else
@@ -866,6 +865,25 @@ gpiobus_alloc_resource(device_t bus, device_t child, int type, int *rid,
end, count, flags));
}
+static int
+gpiobus_release_resource(device_t dev, device_t child, struct resource *r)
+{
+ int err;
+#ifdef INTRNG
+ u_int irq;
+
+ irq = rman_get_start(r);
+ MPASS(irq == rman_get_end(r));
+#endif
+ err = bus_generic_rman_release_resource(dev, child, r);
+ if (err != 0)
+ return (err);
+#ifdef INTRNG
+ intr_unmap_irq(irq);
+#endif
+ return (0);
+}
+
static struct resource_list *
gpiobus_get_resource_list(device_t bus __unused, device_t child)
{
@@ -1060,7 +1078,7 @@ static device_method_t gpiobus_methods[] = {
DEVMETHOD(bus_get_resource, bus_generic_rl_get_resource),
DEVMETHOD(bus_set_resource, bus_generic_rl_set_resource),
DEVMETHOD(bus_alloc_resource, gpiobus_alloc_resource),
- DEVMETHOD(bus_release_resource, bus_generic_rman_release_resource),
+ DEVMETHOD(bus_release_resource, gpiobus_release_resource),
DEVMETHOD(bus_activate_resource, bus_generic_rman_activate_resource),
DEVMETHOD(bus_deactivate_resource, bus_generic_rman_deactivate_resource),
DEVMETHOD(bus_get_resource_list, gpiobus_get_resource_list),
diff --git a/sys/dev/ice/ice_features.h b/sys/dev/ice/ice_features.h
index 821abe4806ca..5b23757b1c98 100644
--- a/sys/dev/ice/ice_features.h
+++ b/sys/dev/ice/ice_features.h
@@ -91,7 +91,9 @@ enum feat_list {
static inline void
ice_disable_unsupported_features(ice_bitmap_t __unused *bitmap)
{
+#ifndef PCI_IOV
ice_clear_bit(ICE_FEATURE_SRIOV, bitmap);
+#endif
#ifndef DEV_NETMAP
ice_clear_bit(ICE_FEATURE_NETMAP, bitmap);
#endif
diff --git a/sys/dev/ice/ice_iflib.h b/sys/dev/ice/ice_iflib.h
index 3a5dc201189a..e1d5307a9516 100644
--- a/sys/dev/ice/ice_iflib.h
+++ b/sys/dev/ice/ice_iflib.h
@@ -139,6 +139,9 @@ struct ice_irq_vector {
* @tc: traffic class queue belongs to
* @q_handle: qidx in tc; used in TXQ enable functions
*
+ * ice_iov.c requires the following parameters (when PCI_IOV is defined):
+ * @itr_idx: ITR index to use for this queue
+ *
* Other parameters may be iflib driver specific
*/
struct ice_tx_queue {
@@ -153,6 +156,9 @@ struct ice_tx_queue {
u32 me;
u16 q_handle;
u8 tc;
+#ifdef PCI_IOV
+ u8 itr_idx;
+#endif
/* descriptor writeback status */
qidx_t *tx_rsq;
@@ -175,6 +181,9 @@ struct ice_tx_queue {
* @stats: queue statistics
* @tc: traffic class queue belongs to
*
+ * ice_iov.c requires the following parameters (when PCI_IOV is defined):
+ * @itr_idx: ITR index to use for this queue
+ *
* Other parameters may be iflib driver specific
*/
struct ice_rx_queue {
@@ -187,6 +196,9 @@ struct ice_rx_queue {
struct ice_irq_vector *irqv;
u32 me;
u8 tc;
+#ifdef PCI_IOV
+ u8 itr_idx;
+#endif
struct if_irq que_irq;
};
@@ -332,6 +344,10 @@ struct ice_softc {
ice_declare_bitmap(feat_cap, ICE_FEATURE_COUNT);
ice_declare_bitmap(feat_en, ICE_FEATURE_COUNT);
+#ifdef PCI_IOV
+ struct ice_vf *vfs;
+ u16 num_vfs;
+#endif
struct ice_resmgr os_imgr;
/* For mirror interface */
struct ice_mirr_if *mirr_if;
diff --git a/sys/dev/ice/ice_iov.c b/sys/dev/ice/ice_iov.c
new file mode 100644
index 000000000000..c5a3e1060e44
--- /dev/null
+++ b/sys/dev/ice/ice_iov.c
@@ -0,0 +1,1856 @@
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* Copyright (c) 2025, Intel Corporation
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * @file ice_iov.c
+ * @brief Virtualization support functions
+ *
+ * Contains functions for enabling and managing PCIe virtual function devices,
+ * including enabling new VFs, and managing VFs over the virtchnl interface.
+ */
+
+#include "ice_iov.h"
+
+static struct ice_vf *ice_iov_get_vf(struct ice_softc *sc, int vf_num);
+static void ice_iov_ready_vf(struct ice_softc *sc, struct ice_vf *vf);
+static void ice_reset_vf(struct ice_softc *sc, struct ice_vf *vf,
+ bool trigger_vflr);
+static void ice_iov_setup_intr_mapping(struct ice_softc *sc, struct ice_vf *vf);
+
+static void ice_vc_version_msg(struct ice_softc *sc, struct ice_vf *vf,
+ u8 *msg_buf);
+static void ice_vc_get_vf_res_msg(struct ice_softc *sc, struct ice_vf *vf,
+ u8 *msg_buf);
+static void ice_vc_add_eth_addr_msg(struct ice_softc *sc, struct ice_vf *vf,
+ u8 *msg_buf);
+static void ice_vc_del_eth_addr_msg(struct ice_softc *sc, struct ice_vf *vf,
+ u8 *msg_buf);
+static bool ice_vc_isvalid_ring_len(u16 ring_len);
+static void ice_vc_cfg_vsi_qs_msg(struct ice_softc *sc, struct ice_vf *vf,
+ u8 *msg_buf);
+static void ice_vc_cfg_rss_key_msg(struct ice_softc *sc, struct ice_vf *vf,
+ u8 *msg_buf);
+static void ice_vc_set_rss_hena_msg(struct ice_softc *sc, struct ice_vf *vf,
+ u8 *msg_buf);
+static void ice_vc_enable_queues_msg(struct ice_softc *sc, struct ice_vf *vf,
+ u8 *msg_buf);
+static void ice_vc_notify_vf_link_state(struct ice_softc *sc, struct ice_vf *vf);
+static void ice_vc_disable_queues_msg(struct ice_softc *sc, struct ice_vf *vf,
+ u8 *msg_buf);
+static void ice_vc_cfg_irq_map_msg(struct ice_softc *sc, struct ice_vf *vf,
+ u8 *msg_buf);
+static void ice_vc_get_stats_msg(struct ice_softc *sc, struct ice_vf *vf,
+ u8 *msg_buf);
+static void ice_eth_stats_to_virtchnl_eth_stats(struct ice_eth_stats *istats,
+ struct virtchnl_eth_stats *vstats);
+static void ice_vc_cfg_rss_lut_msg(struct ice_softc *sc, struct ice_vf *vf,
+ u8 *msg_buf);
+static void ice_vc_cfg_promisc_mode_msg(struct ice_softc *sc, struct ice_vf *vf,
+ u8 *msg_buf);
+static void ice_vc_add_vlan_msg(struct ice_softc *sc, struct ice_vf *vf,
+ u8 *msg_buf);
+static void ice_vc_del_vlan_msg(struct ice_softc *sc, struct ice_vf *vf,
+ u8 *msg_buf);
+static enum virtchnl_status_code ice_iov_err_to_virt_err(int ice_err);
+static int ice_vf_validate_mac(struct ice_vf *vf, const uint8_t *addr);
+
+/**
+ * ice_iov_attach - Initialize SR-IOV PF host support
+ * @sc: device softc structure
+ *
+ * Initialize SR-IOV PF host support at the end of the driver attach process.
+ *
+ * @pre Must be called from sleepable context (calls malloc() w/ M_WAITOK)
+ *
+ * @returns 0 if successful, or
+ * - ENOMEM if there is no memory for the PF/VF schemas or iov device
+ * - ENXIO if the device isn't PCI-E or doesn't support the same SR-IOV
+ * version as the kernel
+ * - ENOENT if the device doesn't have the SR-IOV capability
+ */
+int
+ice_iov_attach(struct ice_softc *sc)
+{
+ device_t dev = sc->dev;
+ nvlist_t *pf_schema, *vf_schema;
+ int error;
+
+ pf_schema = pci_iov_schema_alloc_node();
+ vf_schema = pci_iov_schema_alloc_node();
+
+ pci_iov_schema_add_unicast_mac(vf_schema, "mac-addr", 0, NULL);
+ pci_iov_schema_add_bool(vf_schema, "mac-anti-spoof",
+ IOV_SCHEMA_HASDEFAULT, TRUE);
+ pci_iov_schema_add_bool(vf_schema, "allow-set-mac",
+ IOV_SCHEMA_HASDEFAULT, FALSE);
+ pci_iov_schema_add_bool(vf_schema, "allow-promisc",
+ IOV_SCHEMA_HASDEFAULT, FALSE);
+ pci_iov_schema_add_uint16(vf_schema, "num-queues",
+ IOV_SCHEMA_HASDEFAULT, ICE_DEFAULT_VF_QUEUES);
+ pci_iov_schema_add_uint16(vf_schema, "mirror-src-vsi",
+ IOV_SCHEMA_HASDEFAULT, ICE_INVALID_MIRROR_VSI);
+ pci_iov_schema_add_uint16(vf_schema, "max-vlan-allowed",
+ IOV_SCHEMA_HASDEFAULT, ICE_DEFAULT_VF_VLAN_LIMIT);
+ pci_iov_schema_add_uint16(vf_schema, "max-mac-filters",
+ IOV_SCHEMA_HASDEFAULT, ICE_DEFAULT_VF_FILTER_LIMIT);
+
+ error = pci_iov_attach(dev, pf_schema, vf_schema);
+ if (error != 0) {
+ device_printf(dev,
+ "pci_iov_attach failed (error=%s)\n",
+ ice_err_str(error));
+ ice_clear_bit(ICE_FEATURE_SRIOV, sc->feat_en);
+ } else
+ ice_set_bit(ICE_FEATURE_SRIOV, sc->feat_en);
+
+ return (error);
+}
+
+/**
+ * ice_iov_detach - Teardown SR-IOV PF host support
+ * @sc: device softc structure
+ *
+ * Teardown SR-IOV PF host support at the start of the driver detach process.
+ *
+ * @returns 0 if successful or IOV support hasn't been setup, or
+ * - EBUSY if VFs still exist
+ */
+int
+ice_iov_detach(struct ice_softc *sc)
+{
+ device_t dev = sc->dev;
+ int error;
+
+ error = pci_iov_detach(dev);
+ if (error != 0) {
+ device_printf(dev,
+ "pci_iov_detach failed (error=%s)\n",
+ ice_err_str(error));
+ }
+
+ return (error);
+}
+
+/**
+ * ice_iov_init - Called by the OS before the first VF is created.
+ * @sc: device softc structure
+ * @num_vfs: number of VFs to setup resources for
+ * @params: configuration parameters for the PF
+ *
+ * @returns 0 if successful or an error code on failure
+ */
+int
+ice_iov_init(struct ice_softc *sc, uint16_t num_vfs, const nvlist_t *params __unused)
+{
+ /* Allocate array of VFs, for tracking */
+ sc->vfs = (struct ice_vf *)malloc(sizeof(struct ice_vf) * num_vfs, M_ICE, M_NOWAIT |
+ M_ZERO);
+ if (sc->vfs == NULL)
+ return (ENOMEM);
+
+ /* Initialize each VF with basic information */
+ for (int i = 0; i < num_vfs; i++)
+ sc->vfs[i].vf_num = i;
+
+ /* Save off number of configured VFs */
+ sc->num_vfs = num_vfs;
+
+ return (0);
+}
+
+/**
+ * ice_iov_get_vf - Get pointer to VF at given index
+ * @sc: device softc structure
+ * @vf_num: Index of VF to retrieve
+ *
+ * @remark will throw an assertion if vf_num is not in the
+ * range of allocated VFs
+ *
+ * @returns a pointer to the VF structure at the given index
+ */
+static struct ice_vf *
+ice_iov_get_vf(struct ice_softc *sc, int vf_num)
+{
+ MPASS(vf_num < sc->num_vfs);
+
+ return &sc->vfs[vf_num];
+}
+
+/**
+ * ice_iov_add_vf - Called by the OS for each VF to create
+ * @sc: device softc structure
+ * @vfnum: index of VF to configure
+ * @params: configuration parameters for the VF
+ *
+ * @returns 0 if successful or an error code on failure
+ */
+int
+ice_iov_add_vf(struct ice_softc *sc, uint16_t vfnum, const nvlist_t *params)
+{
+ struct ice_tx_queue *txq;
+ struct ice_rx_queue *rxq;
+ device_t dev = sc->dev;
+ struct ice_vsi *vsi;
+ struct ice_vf *vf;
+ int vf_num_queues;
+ const void *mac;
+ size_t size;
+ int error;
+ int i;
+
+ vf = ice_iov_get_vf(sc, vfnum);
+ vf->vf_flags = VF_FLAG_ENABLED;
+
+ /* This VF needs at least one VSI */
+ vsi = ice_alloc_vsi(sc, ICE_VSI_VF);
+ if (vsi == NULL)
+ return (ENOMEM);
+ vf->vsi = vsi;
+ vsi->vf_num = vfnum;
+
+ vf_num_queues = nvlist_get_number(params, "num-queues");
+ /* Validate and clamp value if invalid */
+ if (vf_num_queues < 1 || vf_num_queues > ICE_MAX_SCATTERED_QUEUES)
+ device_printf(dev, "Invalid num-queues (%d) for VF %d\n",
+ vf_num_queues, vf->vf_num);
+ if (vf_num_queues < 1) {
+ device_printf(dev, "Setting VF %d num-queues to 1\n", vf->vf_num);
+ vf_num_queues = 1;
+ } else if (vf_num_queues > ICE_MAX_SCATTERED_QUEUES) {
+ device_printf(dev, "Setting VF %d num-queues to %d\n",
+ vf->vf_num, ICE_MAX_SCATTERED_QUEUES);
+ vf_num_queues = ICE_MAX_SCATTERED_QUEUES;
+ }
+ vsi->qmap_type = ICE_RESMGR_ALLOC_SCATTERED;
+
+ /* Reserve VF queue allocation from PF queues */
+ ice_alloc_vsi_qmap(vsi, vf_num_queues, vf_num_queues);
+ vsi->num_tx_queues = vsi->num_rx_queues = vf_num_queues;
+
+ /* Assign Tx queues from PF space */
+ error = ice_resmgr_assign_scattered(&sc->tx_qmgr, vsi->tx_qmap,
+ vsi->num_tx_queues);
+ if (error) {
+ device_printf(sc->dev, "Unable to assign VF Tx queues: %s\n",
+ ice_err_str(error));
+ goto release_vsi;
+ }
+
+ /* Assign Rx queues from PF space */
+ error = ice_resmgr_assign_scattered(&sc->rx_qmgr, vsi->rx_qmap,
+ vsi->num_rx_queues);
+ if (error) {
+ device_printf(sc->dev, "Unable to assign VF Rx queues: %s\n",
+ ice_err_str(error));
+ goto release_vsi;
+ }
+
+ vsi->max_frame_size = ICE_MAX_FRAME_SIZE;
+
+ /* Allocate queue structure memory */
+ vsi->tx_queues = (struct ice_tx_queue *)
+ malloc(sizeof(struct ice_tx_queue) * vsi->num_tx_queues, M_ICE,
+ M_NOWAIT | M_ZERO);
+ if (!vsi->tx_queues) {
+ device_printf(sc->dev, "VF-%d: Unable to allocate Tx queue memory\n",
+ vfnum);
+ error = ENOMEM;
+ goto release_vsi;
+ }
+ for (i = 0, txq = vsi->tx_queues; i < vsi->num_tx_queues; i++, txq++) {
+ txq->me = i;
+ txq->vsi = vsi;
+ }
+
+ /* Allocate queue structure memory */
+ vsi->rx_queues = (struct ice_rx_queue *)
+ malloc(sizeof(struct ice_rx_queue) * vsi->num_rx_queues, M_ICE,
+ M_NOWAIT | M_ZERO);
+ if (!vsi->rx_queues) {
+ device_printf(sc->dev, "VF-%d: Unable to allocate Rx queue memory\n",
+ vfnum);
+ error = ENOMEM;
+ goto free_txqs;
+ }
+ for (i = 0, rxq = vsi->rx_queues; i < vsi->num_rx_queues; i++, rxq++) {
+ rxq->me = i;
+ rxq->vsi = vsi;
+ }
+
+ /* Allocate space to store the IRQ vector data */
+ vf->num_irq_vectors = vf_num_queues + 1;
+ vf->tx_irqvs = (struct ice_irq_vector *)
+ malloc(sizeof(struct ice_irq_vector) * (vf->num_irq_vectors),
+ M_ICE, M_NOWAIT);
+ if (!vf->tx_irqvs) {
+ device_printf(sc->dev,
+ "Unable to allocate TX irqv memory for VF-%d's %d vectors\n",
+ vfnum, vf->num_irq_vectors);
+ error = ENOMEM;
+ goto free_rxqs;
+ }
+ vf->rx_irqvs = (struct ice_irq_vector *)
+ malloc(sizeof(struct ice_irq_vector) * (vf->num_irq_vectors),
+ M_ICE, M_NOWAIT);
+ if (!vf->rx_irqvs) {
+ device_printf(sc->dev,
+ "Unable to allocate RX irqv memory for VF-%d's %d vectors\n",
+ vfnum, vf->num_irq_vectors);
+ error = ENOMEM;
+ goto free_txirqvs;
+ }
+
+ /* Assign VF interrupts from PF space */
+ if (!(vf->vf_imap =
+ (u16 *)malloc(sizeof(u16) * vf->num_irq_vectors,
+ M_ICE, M_NOWAIT))) {
+ device_printf(dev, "Unable to allocate VF-%d imap memory\n", vfnum);
+ error = ENOMEM;
+ goto free_rxirqvs;
+ }
+ error = ice_resmgr_assign_contiguous(&sc->dev_imgr, vf->vf_imap, vf->num_irq_vectors);
+ if (error) {
+ device_printf(dev, "Unable to assign VF-%d interrupt mapping: %s\n",
+ vfnum, ice_err_str(error));
+ goto free_imap;
+ }
+
+ if (nvlist_exists_binary(params, "mac-addr")) {
+ mac = nvlist_get_binary(params, "mac-addr", &size);
+ memcpy(vf->mac, mac, ETHER_ADDR_LEN);
+
+ if (nvlist_get_bool(params, "allow-set-mac"))
+ vf->vf_flags |= VF_FLAG_SET_MAC_CAP;
+ } else
+ /*
+ * If the administrator has not specified a MAC address then
+ * we must allow the VF to choose one.
+ */
+ vf->vf_flags |= VF_FLAG_SET_MAC_CAP;
+
+ if (nvlist_get_bool(params, "mac-anti-spoof"))
+ vf->vf_flags |= VF_FLAG_MAC_ANTI_SPOOF;
+
+ if (nvlist_get_bool(params, "allow-promisc"))
+ vf->vf_flags |= VF_FLAG_PROMISC_CAP;
+
+ vsi->mirror_src_vsi = nvlist_get_number(params, "mirror-src-vsi");
+
+ vf->vlan_limit = nvlist_get_number(params, "max-vlan-allowed");
+ vf->mac_filter_limit = nvlist_get_number(params, "max-mac-filters");
+
+ vf->vf_flags |= VF_FLAG_VLAN_CAP;
+
+ /* Create and setup VSI in HW */
+ error = ice_initialize_vsi(vsi);
+ if (error) {
+ device_printf(sc->dev, "Unable to initialize VF %d VSI: %s\n",
+ vfnum, ice_err_str(error));
+ goto release_imap;
+ }
+
+ /* Add the broadcast address */
+ error = ice_add_vsi_mac_filter(vsi, broadcastaddr);
+ if (error) {
+ device_printf(sc->dev, "Unable to add broadcast filter VF %d VSI: %s\n",
+ vfnum, ice_err_str(error));
+ goto release_imap;
+ }
+
+ ice_iov_ready_vf(sc, vf);
+
+ return (0);
+
+release_imap:
+ ice_resmgr_release_map(&sc->dev_imgr, vf->vf_imap,
+ vf->num_irq_vectors);
+free_imap:
+ free(vf->vf_imap, M_ICE);
+ vf->vf_imap = NULL;
+free_rxirqvs:
+ free(vf->rx_irqvs, M_ICE);
+ vf->rx_irqvs = NULL;
+free_txirqvs:
+ free(vf->tx_irqvs, M_ICE);
+ vf->tx_irqvs = NULL;
+free_rxqs:
+ free(vsi->rx_queues, M_ICE);
+ vsi->rx_queues = NULL;
+free_txqs:
+ free(vsi->tx_queues, M_ICE);
+ vsi->tx_queues = NULL;
+release_vsi:
+ ice_release_vsi(vsi);
+ vf->vsi = NULL;
+ return (error);
+}
+
+/**
+ * ice_iov_uninit - Called by the OS when VFs are destroyed
+ * @sc: device softc structure
+ */
+void
+ice_iov_uninit(struct ice_softc *sc)
+{
+ struct ice_vf *vf;
+ struct ice_vsi *vsi;
+
+ /* Release per-VF resources */
+ for (int i = 0; i < sc->num_vfs; i++) {
+ vf = &sc->vfs[i];
+ vsi = vf->vsi;
+
+ /* Free VF interrupt reservation */
+ if (vf->vf_imap) {
+ free(vf->vf_imap, M_ICE);
+ vf->vf_imap = NULL;
+ }
+
+ /* Free queue interrupt mapping trackers */
+ if (vf->tx_irqvs) {
+ free(vf->tx_irqvs, M_ICE);
+ vf->tx_irqvs = NULL;
+ }
+ if (vf->rx_irqvs) {
+ free(vf->rx_irqvs, M_ICE);
+ vf->rx_irqvs = NULL;
+ }
+
+ if (!vsi)
+ continue;
+
+ /* Free VSI queues */
+ if (vsi->tx_queues) {
+ free(vsi->tx_queues, M_ICE);
+ vsi->tx_queues = NULL;
+ }
+ if (vsi->rx_queues) {
+ free(vsi->rx_queues, M_ICE);
+ vsi->rx_queues = NULL;
+ }
+
+ ice_release_vsi(vsi);
+ vf->vsi = NULL;
+ }
+
+ /* Release memory used for VF tracking */
+ if (sc->vfs) {
+ free(sc->vfs, M_ICE);
+ sc->vfs = NULL;
+ }
+ sc->num_vfs = 0;
+}
+
+/**
+ * ice_iov_handle_vflr - Process VFLR event
+ * @sc: device softc structure
+ *
+ * Identifys which VFs have been reset and re-configure
+ * them.
+ */
+void
+ice_iov_handle_vflr(struct ice_softc *sc)
+{
+ struct ice_hw *hw = &sc->hw;
+ struct ice_vf *vf;
+ u32 reg, reg_idx, bit_idx;
+
+ for (int i = 0; i < sc->num_vfs; i++) {
+ vf = &sc->vfs[i];
+
+ reg_idx = (hw->func_caps.vf_base_id + vf->vf_num) / 32;
+ bit_idx = (hw->func_caps.vf_base_id + vf->vf_num) % 32;
+ reg = rd32(hw, GLGEN_VFLRSTAT(reg_idx));
+ if (reg & BIT(bit_idx))
+ ice_reset_vf(sc, vf, false);
+ }
+}
+
+/**
+ * ice_iov_ready_vf - Setup VF interrupts and mark it as ready
+ * @sc: device softc structure
+ * @vf: driver's VF structure for the VF to update
+ *
+ * Clears VF reset triggering bit, sets up the PF<->VF interrupt
+ * mapping and marks the VF as active in the HW so that the VF
+ * driver can use it.
+ */
+static void
+ice_iov_ready_vf(struct ice_softc *sc, struct ice_vf *vf)
+{
+ struct ice_hw *hw = &sc->hw;
+ u32 reg;
+
+ /* Clear the triggering bit */
+ reg = rd32(hw, VPGEN_VFRTRIG(vf->vf_num));
+ reg &= ~VPGEN_VFRTRIG_VFSWR_M;
+ wr32(hw, VPGEN_VFRTRIG(vf->vf_num), reg);
+
+ /* Setup VF interrupt allocation and mapping */
+ ice_iov_setup_intr_mapping(sc, vf);
+
+ /* Indicate to the VF that reset is done */
+ wr32(hw, VFGEN_RSTAT(vf->vf_num), VIRTCHNL_VFR_VFACTIVE);
+
+ ice_flush(hw);
+}
+
+/**
+ * ice_reset_vf - Perform a hardware reset (VFR) on a VF
+ * @sc: device softc structure
+ * @vf: driver's VF structure for VF to be reset
+ * @trigger_vflr: trigger a reset or only handle already executed reset
+ *
+ * Performs a VFR for the given VF. This function busy waits until the
+ * reset completes in the HW, notifies the VF that the reset is done
+ * by setting a bit in a HW register, then returns.
+ *
+ * @remark This also sets up the PF<->VF interrupt mapping and allocations in
+ * the hardware after the hardware reset is finished, via
+ * ice_iov_setup_intr_mapping()
+ */
+static void
+ice_reset_vf(struct ice_softc *sc, struct ice_vf *vf, bool trigger_vflr)
+{
+ u16 global_vf_num, reg_idx, bit_idx;
+ struct ice_hw *hw = &sc->hw;
+ int status;
+ u32 reg;
+ int i;
+
+ global_vf_num = vf->vf_num + hw->func_caps.vf_base_id;
+
+ if (trigger_vflr) {
+ reg = rd32(hw, VPGEN_VFRTRIG(vf->vf_num));
+ reg |= VPGEN_VFRTRIG_VFSWR_M;
+ wr32(hw, VPGEN_VFRTRIG(vf->vf_num), reg);
+ }
+
+ /* clear the VFLR bit for the VF in a GLGEN_VFLRSTAT register */
+ reg_idx = (global_vf_num) / 32;
+ bit_idx = (global_vf_num) % 32;
+ wr32(hw, GLGEN_VFLRSTAT(reg_idx), BIT(bit_idx));
+ ice_flush(hw);
+
+ /* Wait until there are no pending PCI transactions */
+ wr32(hw, PF_PCI_CIAA,
+ ICE_PCIE_DEV_STATUS | (global_vf_num << PF_PCI_CIAA_VF_NUM_S));
+
+ for (i = 0; i < ICE_PCI_CIAD_WAIT_COUNT; i++) {
+ reg = rd32(hw, PF_PCI_CIAD);
+ if (!(reg & PCIEM_STA_TRANSACTION_PND))
+ break;
+
+ DELAY(ICE_PCI_CIAD_WAIT_DELAY_US);
+ }
+ if (i == ICE_PCI_CIAD_WAIT_COUNT)
+ device_printf(sc->dev,
+ "VF-%d PCI transactions stuck\n", vf->vf_num);
+
+ /* Disable TX queues, which is required during VF reset */
+ status = ice_dis_vsi_txq(hw->port_info, vf->vsi->idx, 0, 0, NULL, NULL,
+ NULL, ICE_VF_RESET, vf->vf_num, NULL);
+ if (status)
+ device_printf(sc->dev,
+ "%s: Failed to disable LAN Tx queues: err %s aq_err %s\n",
+ __func__, ice_status_str(status),
+ ice_aq_str(hw->adminq.sq_last_status));
+
+ /* Then check for the VF reset to finish in HW */
+ for (i = 0; i < ICE_VPGEN_VFRSTAT_WAIT_COUNT; i++) {
+ reg = rd32(hw, VPGEN_VFRSTAT(vf->vf_num));
+ if ((reg & VPGEN_VFRSTAT_VFRD_M))
+ break;
+
+ DELAY(ICE_VPGEN_VFRSTAT_WAIT_DELAY_US);
+ }
+ if (i == ICE_VPGEN_VFRSTAT_WAIT_COUNT)
+ device_printf(sc->dev,
+ "VF-%d Reset is stuck\n", vf->vf_num);
+
+ ice_iov_ready_vf(sc, vf);
+}
+
+/**
+ * ice_vc_get_vf_res_msg - Handle VIRTCHNL_OP_GET_VF_RESOURCES msg from VF
+ * @sc: device private structure
+ * @vf: VF tracking structure
+ * @msg_buf: raw message buffer from the VF
+ *
+ * Receives a message from the VF listing its supported capabilities, and
+ * replies to the VF with information about what resources the PF has
+ * allocated for the VF.
+ *
+ * @remark This always replies to the VF with a success status; it does not
+ * fail. It's up to the VF driver to reject or complain about the PF's response.
+ */
+static void
+ice_vc_get_vf_res_msg(struct ice_softc *sc, struct ice_vf *vf, u8 *msg_buf)
+{
+ struct ice_hw *hw = &sc->hw;
+ struct virtchnl_vf_resource *vf_res;
+ struct virtchnl_vsi_resource *vsi_res;
+ u16 vf_res_len;
+ u32 vf_caps;
+
+ /* XXX: Only support one VSI per VF, so this size doesn't need adjusting */
+ vf_res_len = sizeof(struct virtchnl_vf_resource);
+ vf_res = (struct virtchnl_vf_resource *)malloc(vf_res_len, M_ICE,
+ M_WAITOK | M_ZERO);
+
+ vf_res->num_vsis = 1;
+ vf_res->num_queue_pairs = vf->vsi->num_tx_queues;
+ vf_res->max_vectors = vf_res->num_queue_pairs + 1;
+
+ vf_res->rss_key_size = ICE_GET_SET_RSS_KEY_EXTEND_KEY_SIZE;
+ vf_res->rss_lut_size = ICE_VSIQF_HLUT_ARRAY_SIZE;
+ vf_res->max_mtu = 0;
+
+ vf_res->vf_cap_flags = VF_BASE_MODE_OFFLOADS;
+ if (msg_buf != NULL) {
+ vf_caps = *((u32 *)(msg_buf));
+
+ if (vf_caps & VIRTCHNL_VF_CAP_ADV_LINK_SPEED)
+ vf_res->vf_cap_flags |= VIRTCHNL_VF_CAP_ADV_LINK_SPEED;
+
+ if (vf_caps & VIRTCHNL_VF_OFFLOAD_WB_ON_ITR)
+ vf_res->vf_cap_flags |= VIRTCHNL_VF_OFFLOAD_WB_ON_ITR;
+ }
+
+ vsi_res = &vf_res->vsi_res[0];
+ vsi_res->vsi_id = vf->vsi->idx;
+ vsi_res->num_queue_pairs = vf->vsi->num_tx_queues;
+ vsi_res->vsi_type = VIRTCHNL_VSI_SRIOV;
+ vsi_res->qset_handle = 0;
+ if (!ETHER_IS_ZERO(vf->mac))
+ memcpy(vsi_res->default_mac_addr, vf->mac, ETHER_ADDR_LEN);
+
+ ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_GET_VF_RESOURCES,
+ VIRTCHNL_STATUS_SUCCESS, (u8 *)vf_res, vf_res_len, NULL);
+
+ free(vf_res, M_ICE);
+}
+
+/**
+ * ice_vc_version_msg - Handle VIRTCHNL_OP_VERSION msg from VF
+ * @sc: device private structure
+ * @vf: VF tracking structure
+ * @msg_buf: raw message buffer from the VF
+ *
+ * Receives a version message from the VF, and responds to the VF with
+ * the version number that the PF will use.
+ *
+ * @remark This always replies to the VF with a success status; it does not
+ * fail.
+ */
+static void
+ice_vc_version_msg(struct ice_softc *sc, struct ice_vf *vf, u8 *msg_buf)
+{
+ struct virtchnl_version_info *recv_vf_version;
+ struct ice_hw *hw = &sc->hw;
+ device_t dev = sc->dev;
+
+ recv_vf_version = (struct virtchnl_version_info *)msg_buf;
+
+ /* VFs running the 1.0 API expect to get 1.0 back */
+ if (VF_IS_V10(recv_vf_version)) {
+ vf->version.major = 1;
+ vf->version.minor = VIRTCHNL_VERSION_MINOR_NO_VF_CAPS;
+ } else {
+ vf->version.major = VIRTCHNL_VERSION_MAJOR;
+ vf->version.minor = VIRTCHNL_VERSION_MINOR;
+
+ if ((recv_vf_version->major != VIRTCHNL_VERSION_MAJOR) ||
+ (recv_vf_version->minor != VIRTCHNL_VERSION_MINOR))
+ device_printf(dev,
+ "%s: VF-%d requested version (%d.%d) differs from PF version (%d.%d)\n",
+ __func__, vf->vf_num,
+ recv_vf_version->major, recv_vf_version->minor,
+ VIRTCHNL_VERSION_MAJOR, VIRTCHNL_VERSION_MINOR);
+ }
+
+ ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_VERSION,
+ VIRTCHNL_STATUS_SUCCESS, (u8 *)&vf->version, sizeof(vf->version),
+ NULL);
+}
+
+/**
+ * ice_vf_validate_mac - Validate MAC address before adding it
+ * @vf: VF tracking structure
+ * @addr: MAC address to validate
+ *
+ * Validate a MAC address before adding it to a VF during the handling
+ * of a VIRTCHNL_OP_ADD_ETH_ADDR operation. Notably, this also checks if
+ * the VF is allowed to set its own arbitrary MAC addresses.
+ *
+ * Returns 0 if MAC address is valid for the given vf
+ */
+static int
+ice_vf_validate_mac(struct ice_vf *vf, const uint8_t *addr)
+{
+
+ if (ETHER_IS_ZERO(addr) || ETHER_IS_BROADCAST(addr))
+ return (EINVAL);
+
+ /*
+ * If the VF is not allowed to change its MAC address, don't let it
+ * set a MAC filter for an address that is not a multicast address and
+ * is not its assigned MAC.
+ */
+ if (!(vf->vf_flags & VF_FLAG_SET_MAC_CAP) &&
+ !(ETHER_IS_MULTICAST(addr) || !bcmp(addr, vf->mac, ETHER_ADDR_LEN)))
+ return (EPERM);
+
+ return (0);
+}
+
+/**
+ * ice_vc_add_eth_addr_msg - Handle VIRTCHNL_OP_ADD_ETH_ADDR msg from VF
+ * @sc: device private structure
+ * @vf: VF tracking structure
+ * @msg_buf: raw message buffer from the VF
+ *
+ * Receives a list of MAC addresses from the VF and adds those addresses
+ * to the VSI's filter list.
+ */
+static void
+ice_vc_add_eth_addr_msg(struct ice_softc *sc, struct ice_vf *vf, u8 *msg_buf)
+{
+ enum virtchnl_status_code v_status = VIRTCHNL_STATUS_SUCCESS;
+ struct virtchnl_ether_addr_list *addr_list;
+ struct ice_hw *hw = &sc->hw;
+ u16 added_addr_cnt = 0;
+ int error = 0;
+
+ addr_list = (struct virtchnl_ether_addr_list *)msg_buf;
+
+ if (addr_list->num_elements >
+ (vf->mac_filter_limit - vf->mac_filter_cnt)) {
+ v_status = VIRTCHNL_STATUS_ERR_NO_MEMORY;
+ goto done;
+ }
+
+ for (int i = 0; i < addr_list->num_elements; i++) {
+ u8 *addr = addr_list->list[i].addr;
+
+ /* The type flag is currently ignored; every MAC address is
+ * treated as the LEGACY type
+ */
+
+ error = ice_vf_validate_mac(vf, addr);
+ if (error == EPERM) {
+ device_printf(sc->dev,
+ "%s: VF-%d: Not permitted to add MAC addr for VSI %d\n",
+ __func__, vf->vf_num, vf->vsi->idx);
+ v_status = VIRTCHNL_STATUS_ERR_PARAM;
+ continue;
+ } else if (error) {
+ device_printf(sc->dev,
+ "%s: VF-%d: Did not add invalid MAC addr for VSI %d\n",
+ __func__, vf->vf_num, vf->vsi->idx);
+ v_status = VIRTCHNL_STATUS_ERR_PARAM;
+ continue;
+ }
+
+ error = ice_add_vsi_mac_filter(vf->vsi, addr);
+ if (error) {
+ device_printf(sc->dev,
+ "%s: VF-%d: Error adding MAC addr for VSI %d\n",
+ __func__, vf->vf_num, vf->vsi->idx);
+ v_status = VIRTCHNL_STATUS_ERR_PARAM;
+ continue;
+ }
+ /* Don't count VF's MAC against its MAC filter limit */
+ if (memcmp(addr, vf->mac, ETHER_ADDR_LEN))
+ added_addr_cnt++;
+ }
+
+ vf->mac_filter_cnt += added_addr_cnt;
+
+done:
+ ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_ADD_ETH_ADDR,
+ v_status, NULL, 0, NULL);
+}
+
+/**
+ * ice_vc_del_eth_addr_msg - Handle VIRTCHNL_OP_DEL_ETH_ADDR msg from VF
+ * @sc: device private structure
+ * @vf: VF tracking structure
+ * @msg_buf: raw message buffer from the VF
+ *
+ * Receives a list of MAC addresses from the VF and removes those addresses
+ * from the VSI's filter list.
+ */
+static void
+ice_vc_del_eth_addr_msg(struct ice_softc *sc, struct ice_vf *vf, u8 *msg_buf)
+{
+ enum virtchnl_status_code v_status = VIRTCHNL_STATUS_SUCCESS;
+ struct virtchnl_ether_addr_list *addr_list;
+ struct ice_hw *hw = &sc->hw;
+ u16 deleted_addr_cnt = 0;
+ int error = 0;
+
+ addr_list = (struct virtchnl_ether_addr_list *)msg_buf;
+
+ for (int i = 0; i < addr_list->num_elements; i++) {
+ error = ice_remove_vsi_mac_filter(vf->vsi, addr_list->list[i].addr);
+ if (error) {
+ device_printf(sc->dev,
+ "%s: VF-%d: Error removing MAC addr for VSI %d\n",
+ __func__, vf->vf_num, vf->vsi->idx);
+ v_status = VIRTCHNL_STATUS_ERR_PARAM;
+ continue;
+ }
+ /* Don't count VF's MAC against its MAC filter limit */
+ if (memcmp(addr_list->list[i].addr, vf->mac, ETHER_ADDR_LEN))
+ deleted_addr_cnt++;
+ }
+
+ if (deleted_addr_cnt >= vf->mac_filter_cnt)
+ vf->mac_filter_cnt = 0;
+ else
+ vf->mac_filter_cnt -= deleted_addr_cnt;
+
+ ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_DEL_ETH_ADDR,
+ v_status, NULL, 0, NULL);
+}
+
+/**
+ * ice_vc_add_vlan_msg - Handle VIRTCHNL_OP_ADD_VLAN msg from VF
+ * @sc: PF's softc structure
+ * @vf: VF tracking structure
+ * @msg_buf: message buffer from VF
+ *
+ * Adds the VLANs in msg_buf to the VF's VLAN filter list.
+ */
+static void
+ice_vc_add_vlan_msg(struct ice_softc *sc, struct ice_vf *vf, u8 *msg_buf)
+{
+ struct ice_hw *hw = &sc->hw;
+ struct virtchnl_vlan_filter_list *vlan_list;
+ int status = 0;
+ enum virtchnl_status_code v_status = VIRTCHNL_STATUS_SUCCESS;
+ struct ice_vsi *vsi = vf->vsi;
+
+ vlan_list = (struct virtchnl_vlan_filter_list *)msg_buf;
+
+ if (vlan_list->vsi_id != vsi->idx) {
+ device_printf(sc->dev,
+ "VF-%d: Message has invalid VSI ID (expected %d, got %d)\n",
+ vf->vf_num, vsi->idx, vlan_list->vsi_id);
+ v_status = VIRTCHNL_STATUS_ERR_PARAM;
+ goto done;
+ }
+
+ if (vlan_list->num_elements > (vf->vlan_limit - vf->vlan_cnt)) {
+ v_status = VIRTCHNL_STATUS_ERR_NO_MEMORY;
+ goto done;
+ }
+
+ status = ice_add_vlan_hw_filters(vsi, vlan_list->vlan_id,
+ vlan_list->num_elements);
+ if (status) {
+ device_printf(sc->dev,
+ "VF-%d: Failure adding VLANs to VSI %d, err %s aq_err %s\n",
+ vf->vf_num, vsi->idx, ice_status_str(status),
+ ice_aq_str(sc->hw.adminq.sq_last_status));
+ v_status = ice_iov_err_to_virt_err(status);
+ goto done;
+ }
+
+ vf->vlan_cnt += vlan_list->num_elements;
+
+done:
+ ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_ADD_VLAN,
+ v_status, NULL, 0, NULL);
+}
+
+/**
+ * ice_vc_del_vlan_msg - Handle VIRTCHNL_OP_DEL_VLAN msg from VF
+ * @sc: PF's softc structure
+ * @vf: VF tracking structure
+ * @msg_buf: message buffer from VF
+ *
+ * Removes the VLANs in msg_buf from the VF's VLAN filter list.
+ */
+static void
+ice_vc_del_vlan_msg(struct ice_softc *sc, struct ice_vf *vf, u8 *msg_buf)
+{
+ struct ice_hw *hw = &sc->hw;
+ struct virtchnl_vlan_filter_list *vlan_list;
+ int status = 0;
+ enum virtchnl_status_code v_status = VIRTCHNL_STATUS_SUCCESS;
+ struct ice_vsi *vsi = vf->vsi;
+
+ vlan_list = (struct virtchnl_vlan_filter_list *)msg_buf;
+
+ if (vlan_list->vsi_id != vsi->idx) {
+ device_printf(sc->dev,
+ "VF-%d: Message has invalid VSI ID (expected %d, got %d)\n",
+ vf->vf_num, vsi->idx, vlan_list->vsi_id);
+ v_status = VIRTCHNL_STATUS_ERR_PARAM;
+ goto done;
+ }
+
+ status = ice_remove_vlan_hw_filters(vsi, vlan_list->vlan_id,
+ vlan_list->num_elements);
+ if (status) {
+ device_printf(sc->dev,
+ "VF-%d: Failure deleting VLANs from VSI %d, err %s aq_err %s\n",
+ vf->vf_num, vsi->idx, ice_status_str(status),
+ ice_aq_str(sc->hw.adminq.sq_last_status));
+ v_status = ice_iov_err_to_virt_err(status);
+ goto done;
+ }
+
+ if (vlan_list->num_elements >= vf->vlan_cnt)
+ vf->vlan_cnt = 0;
+ else
+ vf->vlan_cnt -= vlan_list->num_elements;
+
+done:
+ ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_DEL_VLAN,
+ v_status, NULL, 0, NULL);
+}
+
+/**
+ * ice_vc_validate_ring_len - Check to see if a descriptor ring length is valid
+ * @ring_len: length of ring
+ *
+ * Check whether a ring size value is valid.
+ *
+ * @returns true if given ring size is valid
+ */
+static bool
+ice_vc_isvalid_ring_len(u16 ring_len)
+{
+ return (ring_len >= ICE_MIN_DESC_COUNT &&
+ ring_len <= ICE_MAX_DESC_COUNT &&
+ !(ring_len % ICE_DESC_COUNT_INCR));
+}
+
+/**
+ * ice_vc_cfg_vsi_qs_msg - Handle VIRTCHNL_OP_CONFIG_VSI_QUEUES msg from VF
+ * @sc: PF's softc structure
+ * @vf: VF tracking structure
+ * @msg_buf: message buffer from VF
+ */
+static void
+ice_vc_cfg_vsi_qs_msg(struct ice_softc *sc, struct ice_vf *vf, u8 *msg_buf)
+{
+ device_t dev = sc->dev;
+ struct ice_hw *hw = &sc->hw;
+ struct virtchnl_vsi_queue_config_info *vqci;
+ struct virtchnl_queue_pair_info *vqpi;
+ enum virtchnl_status_code status = VIRTCHNL_STATUS_SUCCESS;
+ struct ice_vsi *vsi = vf->vsi;
+ struct ice_tx_queue *txq;
+ struct ice_rx_queue *rxq;
+ int i, error = 0;
+
+ vqci = (struct virtchnl_vsi_queue_config_info *)msg_buf;
+
+ if (vqci->num_queue_pairs > vf->vsi->num_tx_queues &&
+ vqci->num_queue_pairs > vf->vsi->num_rx_queues) {
+ status = VIRTCHNL_STATUS_ERR_PARAM;
+ goto done;
+ }
+
+ ice_vsi_disable_tx(vf->vsi);
+ ice_control_all_rx_queues(vf->vsi, false);
+
+ /*
+ * Clear TX and RX queues config in case VF
+ * requests different number of queues.
+ */
+ for (i = 0; i < vsi->num_tx_queues; i++) {
+ txq = &vsi->tx_queues[i];
+
+ txq->desc_count = 0;
+ txq->tx_paddr = 0;
+ txq->tc = 0;
+ }
+
+ for (i = 0; i < vsi->num_rx_queues; i++) {
+ rxq = &vsi->rx_queues[i];
+
+ rxq->desc_count = 0;
+ rxq->rx_paddr = 0;
+ }
+
+ vqpi = vqci->qpair;
+ for (i = 0; i < vqci->num_queue_pairs; i++, vqpi++) {
+ /* Initial parameter validation */
+ if (vqpi->txq.vsi_id != vf->vsi->idx ||
+ vqpi->rxq.vsi_id != vf->vsi->idx ||
+ vqpi->txq.queue_id != vqpi->rxq.queue_id ||
+ vqpi->txq.headwb_enabled ||
+ vqpi->rxq.splithdr_enabled ||
+ vqpi->rxq.crc_disable ||
+ !(ice_vc_isvalid_ring_len(vqpi->txq.ring_len)) ||
+ !(ice_vc_isvalid_ring_len(vqpi->rxq.ring_len))) {
+ status = VIRTCHNL_STATUS_ERR_PARAM;
+ goto done;
+ }
+
+ /* Copy parameters into VF's queue/VSI structs */
+ txq = &vsi->tx_queues[vqpi->txq.queue_id];
+
+ txq->desc_count = vqpi->txq.ring_len;
+ txq->tx_paddr = vqpi->txq.dma_ring_addr;
+ txq->q_handle = vqpi->txq.queue_id;
+ txq->tc = 0;
+
+ rxq = &vsi->rx_queues[vqpi->rxq.queue_id];
+
+ rxq->desc_count = vqpi->rxq.ring_len;
+ rxq->rx_paddr = vqpi->rxq.dma_ring_addr;
+ vsi->mbuf_sz = vqpi->rxq.databuffer_size;
+ }
+
+ /* Configure TX queues in HW */
+ error = ice_cfg_vsi_for_tx(vsi);
+ if (error) {
+ device_printf(dev,
+ "VF-%d: Unable to configure VSI for Tx: %s\n",
+ vf->vf_num, ice_err_str(error));
+ status = VIRTCHNL_STATUS_ERR_ADMIN_QUEUE_ERROR;
+ goto done;
+ }
+
+ /* Configure RX queues in HW */
+ error = ice_cfg_vsi_for_rx(vsi);
+ if (error) {
+ device_printf(dev,
+ "VF-%d: Unable to configure VSI for Rx: %s\n",
+ vf->vf_num, ice_err_str(error));
+ status = VIRTCHNL_STATUS_ERR_ADMIN_QUEUE_ERROR;
+ ice_vsi_disable_tx(vsi);
+ goto done;
+ }
+
+done:
+ ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_CONFIG_VSI_QUEUES,
+ status, NULL, 0, NULL);
+}
+
+/**
+ * ice_vc_cfg_rss_key_msg - Handle VIRTCHNL_OP_CONFIG_RSS_KEY msg from VF
+ * @sc: PF's softc structure
+ * @vf: VF tracking structure
+ * @msg_buf: message buffer from VF
+ *
+ * Sets the RSS key for the given VF, using the contents of msg_buf.
+ */
+static void
+ice_vc_cfg_rss_key_msg(struct ice_softc *sc, struct ice_vf *vf, u8 *msg_buf)
+{
+ struct ice_aqc_get_set_rss_keys keydata =
+ { .standard_rss_key = {0}, .extended_hash_key = {0} };
+ struct ice_hw *hw = &sc->hw;
+ struct virtchnl_rss_key *vrk;
+ int status = 0;
+ enum virtchnl_status_code v_status = VIRTCHNL_STATUS_SUCCESS;
+ struct ice_vsi *vsi = vf->vsi;
+
+ vrk = (struct virtchnl_rss_key *)msg_buf;
+
+ if (vrk->vsi_id != vsi->idx) {
+ device_printf(sc->dev,
+ "VF-%d: Message has invalid VSI ID (expected %d, got %d)\n",
+ vf->vf_num, vsi->idx, vrk->vsi_id);
+ v_status = VIRTCHNL_STATUS_ERR_PARAM;
+ goto done;
+ }
+
+ if ((vrk->key_len >
+ (ICE_AQC_GET_SET_RSS_KEY_DATA_RSS_KEY_SIZE +
+ ICE_AQC_GET_SET_RSS_KEY_DATA_HASH_KEY_SIZE)) ||
+ vrk->key_len == 0) {
+ v_status = VIRTCHNL_STATUS_ERR_PARAM;
+ goto done;
+ }
+
+ memcpy(&keydata, vrk->key, vrk->key_len);
+
+ status = ice_aq_set_rss_key(hw, vsi->idx, &keydata);
+ if (status) {
+ device_printf(sc->dev,
+ "ice_aq_set_rss_key status %s, error %s\n",
+ ice_status_str(status), ice_aq_str(hw->adminq.sq_last_status));
+ v_status = ice_iov_err_to_virt_err(status);
+ goto done;
+ }
+
+done:
+ ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_CONFIG_RSS_KEY,
+ v_status, NULL, 0, NULL);
+}
+
+/**
+ * ice_vc_cfg_rss_lut_msg - Handle VIRTCHNL_OP_CONFIG_RSS_LUT msg from VF
+ * @sc: PF's softc structure
+ * @vf: VF tracking structure
+ * @msg_buf: message buffer from VF
+ *
+ * Adds the LUT from the VF in msg_buf to the PF via an admin queue call.
+ */
+static void
+ice_vc_cfg_rss_lut_msg(struct ice_softc *sc, struct ice_vf *vf, u8 *msg_buf)
+{
+ struct ice_hw *hw = &sc->hw;
+ struct virtchnl_rss_lut *vrl;
+ int status = 0;
+ enum virtchnl_status_code v_status = VIRTCHNL_STATUS_SUCCESS;
+ struct ice_aq_get_set_rss_lut_params lut_params = {};
+ struct ice_vsi *vsi = vf->vsi;
+
+ vrl = (struct virtchnl_rss_lut *)msg_buf;
+
+ if (vrl->vsi_id != vsi->idx) {
+ device_printf(sc->dev,
+ "VF-%d: Message has invalid VSI ID (expected %d, got %d)\n",
+ vf->vf_num, vsi->idx, vrl->vsi_id);
+ v_status = VIRTCHNL_STATUS_ERR_PARAM;
+ goto done;
+ }
+
+ if (vrl->lut_entries > ICE_VSIQF_HLUT_ARRAY_SIZE) {
+ v_status = VIRTCHNL_STATUS_ERR_PARAM;
+ goto done;
+ }
+
+ lut_params.vsi_handle = vsi->idx;
+ lut_params.lut_size = vsi->rss_table_size;
+ lut_params.lut_type = vsi->rss_lut_type;
+ lut_params.lut = vrl->lut;
+ lut_params.global_lut_id = 0;
+
+ status = ice_aq_set_rss_lut(hw, &lut_params);
+ if (status) {
+ device_printf(sc->dev,
+ "VF-%d: Cannot set RSS lut, err %s aq_err %s\n",
+ vf->vf_num, ice_status_str(status),
+ ice_aq_str(hw->adminq.sq_last_status));
+ v_status = ice_iov_err_to_virt_err(status);
+ }
+
+done:
+ ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_CONFIG_RSS_LUT,
+ v_status, NULL, 0, NULL);
+}
+
+/**
+ * ice_vc_set_rss_hena_msg - Handle VIRTCHNL_OP_SET_RSS_HENA msg from VF
+ * @sc: PF's softc structure
+ * @vf: VF tracking structure
+ * @msg_buf: message buffer from VF
+ *
+ * Adds the VF's hena (hash enable) bits as flow types to the PF's RSS flow
+ * type list.
+ */
+static void
+ice_vc_set_rss_hena_msg(struct ice_softc *sc, struct ice_vf *vf, u8 *msg_buf)
+{
+ struct ice_hw *hw = &sc->hw;
+ struct virtchnl_rss_hena *vrh;
+ int status = 0;
+ enum virtchnl_status_code v_status = VIRTCHNL_STATUS_SUCCESS;
+ struct ice_vsi *vsi = vf->vsi;
+
+ MPASS(vsi != NULL);
+
+ vrh = (struct virtchnl_rss_hena *)msg_buf;
+
+ /*
+ * Remove existing configuration to make sure only requested
+ * config is applied and allow VFs to disable RSS completly.
+ */
+ status = ice_rem_vsi_rss_cfg(hw, vsi->idx);
+ if (vrh->hena) {
+ /*
+ * Problem with removing config is not fatal, when new one
+ * is requested. Warn about it but try to apply new config
+ * anyway.
+ */
+ if (status)
+ device_printf(sc->dev,
+ "ice_rem_vsi_rss_cfg status %s, error %s\n",
+ ice_status_str(status),
+ ice_aq_str(hw->adminq.sq_last_status));
+ status = ice_add_avf_rss_cfg(hw, vsi->idx, vrh->hena);
+ if (status)
+ device_printf(sc->dev,
+ "ice_add_avf_rss_cfg status %s, error %s\n",
+ ice_status_str(status),
+ ice_aq_str(hw->adminq.sq_last_status));
+ }
+ v_status = ice_iov_err_to_virt_err(status);
+ ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_SET_RSS_HENA,
+ v_status, NULL, 0, NULL);
+}
+
+/**
+ * ice_vc_enable_queues_msg - Handle VIRTCHNL_OP_ENABLE_QUEUES msg from VF
+ * @sc: PF's softc structure
+ * @vf: VF tracking structure
+ * @msg_buf: message buffer from VF
+ *
+ * Enables VF queues selected in msg_buf for Tx/Rx traffic.
+ *
+ * @remark Only actually operates on Rx queues; Tx queues are enabled in
+ * CONFIG_VSI_QUEUES message handler.
+ */
+static void
+ice_vc_enable_queues_msg(struct ice_softc *sc, struct ice_vf *vf, u8 *msg_buf)
+{
+ struct ice_hw *hw = &sc->hw;
+ struct virtchnl_queue_select *vqs;
+ enum virtchnl_status_code v_status = VIRTCHNL_STATUS_SUCCESS;
+ struct ice_vsi *vsi = vf->vsi;
+ int bit, error = 0;
+
+ vqs = (struct virtchnl_queue_select *)msg_buf;
+
+ if (vqs->vsi_id != vsi->idx) {
+ device_printf(sc->dev,
+ "%s: VF-%d: Message has invalid VSI ID (expected %d, got %d)\n",
+ __func__, vf->vf_num, vsi->idx, vqs->vsi_id);
+ v_status = VIRTCHNL_STATUS_ERR_PARAM;
+ goto done;
+ }
+
+ if (!vqs->rx_queues && !vqs->tx_queues) {
+ device_printf(sc->dev,
+ "%s: VF-%d: message queue masks are empty\n",
+ __func__, vf->vf_num);
+ v_status = VIRTCHNL_STATUS_ERR_PARAM;
+ goto done;
+ }
+
+ /* Validate rx_queue mask */
+ bit = fls(vqs->rx_queues);
+ if (bit > vsi->num_rx_queues) {
+ device_printf(sc->dev,
+ "%s: VF-%d: message's rx_queues map (0x%08x) has invalid bit set (%d)\n",
+ __func__, vf->vf_num, vqs->rx_queues, bit);
+ v_status = VIRTCHNL_STATUS_ERR_PARAM;
+ goto done;
+ }
+
+ /* Tx ring enable is handled in an earlier message. */
+ for_each_set_bit(bit, &vqs->rx_queues, 32) {
+ error = ice_control_rx_queue(vsi, bit, true);
+ if (error) {
+ device_printf(sc->dev,
+ "Unable to enable Rx ring %d for receive: %s\n",
+ bit, ice_err_str(error));
+ v_status = VIRTCHNL_STATUS_ERR_PARAM;
+ goto done;
+ }
+ }
+
+done:
+ ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_ENABLE_QUEUES,
+ v_status, NULL, 0, NULL);
+}
+
+/**
+ * ice_vc_disable_queues_msg - Handle VIRTCHNL_OP_DISABLE_QUEUES msg
+ * @sc: PF's softc structure
+ * @vf: VF tracking structure
+ * @msg_buf: message buffer from VF
+ *
+ * Disables all VF queues for the VF's VSI.
+ *
+ * @remark Unlike the ENABLE_QUEUES handler, this operates on both
+ * Tx and Rx queues
+ */
+static void
+ice_vc_disable_queues_msg(struct ice_softc *sc, struct ice_vf *vf,
+ u8 *msg_buf __unused)
+{
+ struct ice_hw *hw = &sc->hw;
+ enum virtchnl_status_code v_status = VIRTCHNL_STATUS_SUCCESS;
+ struct ice_vsi *vsi = vf->vsi;
+ int error = 0;
+
+ error = ice_control_all_rx_queues(vsi, false);
+ if (error) {
+ device_printf(sc->dev,
+ "Unable to disable Rx rings for transmit: %s\n",
+ ice_err_str(error));
+ v_status = VIRTCHNL_STATUS_ERR_PARAM;
+ goto done;
+ }
+
+ error = ice_vsi_disable_tx(vsi);
+ if (error) {
+ /* Already prints an error message */
+ v_status = VIRTCHNL_STATUS_ERR_PARAM;
+ }
+
+done:
+ ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_DISABLE_QUEUES,
+ v_status, NULL, 0, NULL);
+}
+
+/**
+ * ice_vc_cfg_irq_map_msg - Handle VIRTCHNL_OP_CFG_IRQ_MAP msg from VF
+ * @sc: PF's softc structure
+ * @vf: VF tracking structure
+ * @msg_buf: message buffer from VF
+ *
+ * Configures the interrupt vectors described in the message in msg_buf. The
+ * VF needs to send this message during init, so that queues can be allowed
+ * to generate interrupts.
+ */
+static void
+ice_vc_cfg_irq_map_msg(struct ice_softc *sc, struct ice_vf *vf, u8 *msg_buf)
+{
+#define ICE_VIRTCHNL_QUEUE_MAP_SIZE 16
+ struct ice_hw *hw = &sc->hw;
+ struct virtchnl_irq_map_info *vimi;
+ struct virtchnl_vector_map *vvm;
+ enum virtchnl_status_code v_status = VIRTCHNL_STATUS_SUCCESS;
+ struct ice_vsi *vsi = vf->vsi;
+ u16 vector;
+
+ vimi = (struct virtchnl_irq_map_info *)msg_buf;
+
+ if (vimi->num_vectors > vf->num_irq_vectors) {
+ device_printf(sc->dev,
+ "%s: VF-%d: message has more vectors (%d) than configured for VF (%d)\n",
+ __func__, vf->vf_num, vimi->num_vectors, vf->num_irq_vectors);
+ v_status = VIRTCHNL_STATUS_ERR_PARAM;
+ goto done;
+ }
+
+ vvm = vimi->vecmap;
+ /* Save off information from message */
+ for (int i = 0; i < vimi->num_vectors; i++, vvm++) {
+ struct ice_tx_queue *txq;
+ struct ice_rx_queue *rxq;
+ int bit;
+
+ if (vvm->vsi_id != vf->vsi->idx) {
+ device_printf(sc->dev,
+ "%s: VF-%d: message's VSI ID (%d) does not match VF's (%d) for vector %d\n",
+ __func__, vf->vf_num, vvm->vsi_id, vf->vsi->idx, i);
+ v_status = VIRTCHNL_STATUS_ERR_PARAM;
+ goto done;
+ }
+
+ /* vvm->vector_id is relative to VF space */
+ vector = vvm->vector_id;
+
+ if (vector >= vf->num_irq_vectors) {
+ device_printf(sc->dev,
+ "%s: VF-%d: message's vector ID (%d) is greater than VF's max ID (%d)\n",
+ __func__, vf->vf_num, vector, vf->num_irq_vectors - 1);
+ v_status = VIRTCHNL_STATUS_ERR_PARAM;
+ goto done;
+ }
+
+ /* The Misc/Admin Queue vector doesn't need mapping */
+ if (vector == 0)
+ continue;
+
+ /* coverity[address_of] */
+ for_each_set_bit(bit, &vvm->txq_map, ICE_VIRTCHNL_QUEUE_MAP_SIZE) {
+ if (bit >= vsi->num_tx_queues) {
+ device_printf(sc->dev,
+ "%s: VF-%d: txq map has invalid bit set\n",
+ __func__, vf->vf_num);
+ v_status = VIRTCHNL_STATUS_ERR_PARAM;
+ goto done;
+ }
+
+ vf->tx_irqvs[vector].me = vector;
+
+ txq = &vsi->tx_queues[bit];
+ txq->irqv = &vf->tx_irqvs[vector];
+ txq->itr_idx = vvm->txitr_idx;
+ }
+ /* coverity[address_of] */
+ for_each_set_bit(bit, &vvm->rxq_map, ICE_VIRTCHNL_QUEUE_MAP_SIZE) {
+ if (bit >= vsi->num_rx_queues) {
+ device_printf(sc->dev,
+ "%s: VF-%d: rxq map has invalid bit set\n",
+ __func__, vf->vf_num);
+ v_status = VIRTCHNL_STATUS_ERR_PARAM;
+ goto done;
+ }
+ vf->rx_irqvs[vector].me = vector;
+
+ rxq = &vsi->rx_queues[bit];
+ rxq->irqv = &vf->rx_irqvs[vector];
+ rxq->itr_idx = vvm->rxitr_idx;
+ }
+ }
+
+ /* Write to T/RQCTL registers to actually map vectors to queues */
+ for (int i = 0; i < vf->vsi->num_rx_queues; i++)
+ if (vsi->rx_queues[i].irqv != NULL)
+ ice_configure_rxq_interrupt(hw, vsi->rx_qmap[i],
+ vsi->rx_queues[i].irqv->me, vsi->rx_queues[i].itr_idx);
+
+ for (int i = 0; i < vf->vsi->num_tx_queues; i++)
+ if (vsi->tx_queues[i].irqv != NULL)
+ ice_configure_txq_interrupt(hw, vsi->tx_qmap[i],
+ vsi->tx_queues[i].irqv->me, vsi->tx_queues[i].itr_idx);
+
+ ice_flush(hw);
+
+done:
+ ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_CONFIG_IRQ_MAP,
+ v_status, NULL, 0, NULL);
+}
+
+/**
+ * ice_eth_stats_to_virtchnl_eth_stats - Convert stats for virtchnl
+ * @istats: VSI stats from HW to convert
+ * @vstats: stats struct to copy to
+ *
+ * This function copies all known stats in struct virtchnl_eth_stats from the
+ * input struct ice_eth_stats to an output struct virtchnl_eth_stats.
+ *
+ * @remark These two structure types currently have the same definition up to
+ * the size of struct virtchnl_eth_stats (on FreeBSD), but that could change
+ * in the future.
+ */
+static void
+ice_eth_stats_to_virtchnl_eth_stats(struct ice_eth_stats *istats,
+ struct virtchnl_eth_stats *vstats)
+{
+ vstats->rx_bytes = istats->rx_bytes;
+ vstats->rx_unicast = istats->rx_unicast;
+ vstats->rx_multicast = istats->rx_multicast;
+ vstats->rx_broadcast = istats->rx_broadcast;
+ vstats->rx_discards = istats->rx_discards;
+ vstats->rx_unknown_protocol = istats->rx_unknown_protocol;
+ vstats->tx_bytes = istats->tx_bytes;
+ vstats->tx_unicast = istats->tx_unicast;
+ vstats->tx_multicast = istats->tx_multicast;
+ vstats->tx_broadcast = istats->tx_broadcast;
+ vstats->tx_discards = istats->tx_discards;
+ vstats->tx_errors = istats->tx_errors;
+}
+
+/**
+ * ice_vc_get_stats_msg - Handle VIRTCHNL_OP_GET_STATS msg
+ * @sc: device private structure
+ * @vf: VF tracking structure
+ * @msg_buf: raw message buffer from the VF
+ *
+ * Updates the VF's VSI stats and sends those stats back to the VF.
+ */
+static void
+ice_vc_get_stats_msg(struct ice_softc *sc, struct ice_vf *vf, u8 *msg_buf)
+{
+ struct virtchnl_queue_select *vqs;
+ struct virtchnl_eth_stats stats;
+ struct ice_vsi *vsi = vf->vsi;
+ struct ice_hw *hw = &sc->hw;
+
+ vqs = (struct virtchnl_queue_select *)msg_buf;
+
+ if (vqs->vsi_id != vsi->idx) {
+ device_printf(sc->dev,
+ "%s: VF-%d: message has invalid VSI ID %d (VF has VSI ID %d)\n",
+ __func__, vf->vf_num, vqs->vsi_id, vsi->idx);
+ ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_GET_STATS,
+ VIRTCHNL_STATUS_ERR_PARAM, NULL, 0, NULL);
+ }
+
+ ice_update_vsi_hw_stats(vf->vsi);
+ ice_eth_stats_to_virtchnl_eth_stats(&vsi->hw_stats.cur, &stats);
+
+ ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_GET_STATS,
+ VIRTCHNL_STATUS_SUCCESS, (u8 *)&stats,
+ sizeof(struct virtchnl_eth_stats), NULL);
+}
+
+/**
+ * ice_vc_cfg_promisc_mode_msg - Handle VIRTCHNL_OP_CONFIG_PROMISCUOUS_MODE
+ * @sc: PF's softc structure
+ * @vf: VF tracking structure
+ * @msg_buf: message buffer from VF
+ *
+ * Configures the promiscuous modes for the given VSI in msg_buf.
+ */
+static void
+ice_vc_cfg_promisc_mode_msg(struct ice_softc *sc, struct ice_vf *vf, u8 *msg_buf)
+{
+ struct ice_hw *hw = &sc->hw;
+ struct virtchnl_promisc_info *vpi;
+ enum virtchnl_status_code v_status = VIRTCHNL_STATUS_SUCCESS;
+ int status = 0;
+ struct ice_vsi *vsi = vf->vsi;
+ ice_declare_bitmap(old_promisc_mask, ICE_PROMISC_MAX);
+ ice_declare_bitmap(req_promisc_mask, ICE_PROMISC_MAX);
+ ice_declare_bitmap(clear_promisc_mask, ICE_PROMISC_MAX);
+ ice_declare_bitmap(set_promisc_mask, ICE_PROMISC_MAX);
+ ice_declare_bitmap(old_req_xor_mask, ICE_PROMISC_MAX);
+ u16 vid;
+
+ vpi = (struct virtchnl_promisc_info *)msg_buf;
+
+ /* Check to see if VF has permission to configure promiscuous mode */
+ if (!(vf->vf_flags & VF_FLAG_PROMISC_CAP)) {
+ device_printf(sc->dev,
+ "VF-%d: attempted to configure promiscuous mode\n",
+ vf->vf_num);
+ /* Don't reply to VF with an error */
+ goto done;
+ }
+
+ if (vpi->vsi_id != vsi->idx) {
+ device_printf(sc->dev,
+ "VF-%d: Message has invalid VSI ID (expected %d, got %d)\n",
+ vf->vf_num, vsi->idx, vpi->vsi_id);
+ v_status = VIRTCHNL_STATUS_ERR_PARAM;
+ goto done;
+ }
+
+ if (vpi->flags & ~ICE_VIRTCHNL_VALID_PROMISC_FLAGS) {
+ device_printf(sc->dev,
+ "VF-%d: Message has invalid promiscuous flags set (valid 0x%02x, got 0x%02x)\n",
+ vf->vf_num, ICE_VIRTCHNL_VALID_PROMISC_FLAGS,
+ vpi->flags);
+ v_status = VIRTCHNL_STATUS_ERR_PARAM;
+ goto done;
+
+ }
+
+ ice_zero_bitmap(req_promisc_mask, ICE_PROMISC_MAX);
+ /* Convert virtchnl flags to ice AQ promiscuous mode flags */
+ if (vpi->flags & FLAG_VF_UNICAST_PROMISC) {
+ ice_set_bit(ICE_PROMISC_UCAST_TX, req_promisc_mask);
+ ice_set_bit(ICE_PROMISC_UCAST_RX, req_promisc_mask);
+ }
+ if (vpi->flags & FLAG_VF_MULTICAST_PROMISC) {
+ ice_set_bit(ICE_PROMISC_MCAST_TX, req_promisc_mask);
+ ice_set_bit(ICE_PROMISC_MCAST_RX, req_promisc_mask);
+ }
+
+ status = ice_get_vsi_promisc(hw, vsi->idx, old_promisc_mask, &vid);
+ if (status) {
+ device_printf(sc->dev,
+ "VF-%d: Failed to get promiscuous mode mask for VSI %d, err %s aq_err %s\n",
+ vf->vf_num, vsi->idx,
+ ice_status_str(status),
+ ice_aq_str(hw->adminq.sq_last_status));
+ v_status = ice_iov_err_to_virt_err(status);
+ goto done;
+ }
+
+ /* Figure out what got added and what got removed */
+ ice_zero_bitmap(old_req_xor_mask, ICE_PROMISC_MAX);
+ ice_xor_bitmap(old_req_xor_mask, old_promisc_mask, req_promisc_mask, ICE_PROMISC_MAX);
+ ice_and_bitmap(clear_promisc_mask, old_req_xor_mask, old_promisc_mask, ICE_PROMISC_MAX);
+ ice_and_bitmap(set_promisc_mask, old_req_xor_mask, req_promisc_mask, ICE_PROMISC_MAX);
+
+ if (ice_is_any_bit_set(clear_promisc_mask, ICE_PROMISC_MAX)) {
+ status = ice_clear_vsi_promisc(hw, vsi->idx,
+ clear_promisc_mask, 0);
+ if (status) {
+ device_printf(sc->dev,
+ "VF-%d: Failed to clear promiscuous mode for VSI %d, err %s aq_err %s\n",
+ vf->vf_num, vsi->idx,
+ ice_status_str(status),
+ ice_aq_str(hw->adminq.sq_last_status));
+ v_status = ice_iov_err_to_virt_err(status);
+ goto done;
+ }
+ }
+
+ if (ice_is_any_bit_set(set_promisc_mask, ICE_PROMISC_MAX)) {
+ status = ice_set_vsi_promisc(hw, vsi->idx, set_promisc_mask, 0);
+ if (status) {
+ device_printf(sc->dev,
+ "VF-%d: Failed to set promiscuous mode for VSI %d, err %s aq_err %s\n",
+ vf->vf_num, vsi->idx,
+ ice_status_str(status),
+ ice_aq_str(hw->adminq.sq_last_status));
+ v_status = ice_iov_err_to_virt_err(status);
+ goto done;
+ }
+ }
+
+done:
+ ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_CONFIG_PROMISCUOUS_MODE,
+ v_status, NULL, 0, NULL);
+}
+
+/**
+ * ice_vc_notify_all_vfs_link_state - Notify all VFs of PF link state
+ * @sc: device private structure
+ *
+ * Sends a message to all VFs about the status of the PF's link
+ * state. For more details, @see ice_vc_notify_vf_link_state.
+ */
+void
+ice_vc_notify_all_vfs_link_state(struct ice_softc *sc)
+{
+ for (int i = 0; i < sc->num_vfs; i++)
+ ice_vc_notify_vf_link_state(sc, &sc->vfs[i]);
+}
+
+/**
+ * ice_vc_notify_vf_link_state - Notify VF of PF link state
+ * @sc: device private structure
+ * @vf: VF tracking structure
+ *
+ * Sends an event message to the specified VF with information about
+ * the current link state from the PF's port. This includes whether
+ * link is up or down, and the link speed in 100Mbps units.
+ */
+static void
+ice_vc_notify_vf_link_state(struct ice_softc *sc, struct ice_vf *vf)
+{
+ struct virtchnl_pf_event event = {};
+ struct ice_hw *hw = &sc->hw;
+
+ event.event = VIRTCHNL_EVENT_LINK_CHANGE;
+ event.severity = PF_EVENT_SEVERITY_INFO;
+ event.event_data.link_event_adv.link_status = sc->link_up;
+ event.event_data.link_event_adv.link_speed =
+ (u32)ice_conv_link_speed_to_virtchnl(true,
+ hw->port_info->phy.link_info.link_speed);
+
+ ice_aq_send_msg_to_vf(hw, vf->vf_num, VIRTCHNL_OP_EVENT,
+ VIRTCHNL_STATUS_SUCCESS, (u8 *)&event, sizeof(event), NULL);
+}
+
+/**
+ * ice_vc_handle_vf_msg - Handle a message from a VF
+ * @sc: device private structure
+ * @event: event received from the HW MBX queue
+ *
+ * Called whenever an event is received from a VF on the HW mailbox queue.
+ * Responsible for handling these messages as well as responding to the
+ * VF afterwards, depending on the received message type.
+ */
+void
+ice_vc_handle_vf_msg(struct ice_softc *sc, struct ice_rq_event_info *event)
+{
+ struct ice_hw *hw = &sc->hw;
+ device_t dev = sc->dev;
+ struct ice_vf *vf;
+ int err = 0;
+
+ u32 v_opcode = event->desc.cookie_high;
+ u16 v_id = event->desc.retval;
+ u8 *msg = event->msg_buf;
+ u16 msglen = event->msg_len;
+
+ if (v_id >= sc->num_vfs) {
+ device_printf(dev, "%s: Received msg from invalid VF-%d: opcode %d, len %d\n",
+ __func__, v_id, v_opcode, msglen);
+ return;
+ }
+
+ vf = &sc->vfs[v_id];
+
+ /* Perform basic checks on the msg */
+ err = virtchnl_vc_validate_vf_msg(&vf->version, v_opcode, msg, msglen);
+ if (err) {
+ device_printf(dev, "%s: Received invalid msg from VF-%d: opcode %d, len %d, error %d\n",
+ __func__, vf->vf_num, v_opcode, msglen, err);
+ ice_aq_send_msg_to_vf(hw, v_id, v_opcode, VIRTCHNL_STATUS_ERR_PARAM, NULL, 0, NULL);
+ return;
+ }
+
+ switch (v_opcode) {
+ case VIRTCHNL_OP_VERSION:
+ ice_vc_version_msg(sc, vf, msg);
+ break;
+ case VIRTCHNL_OP_RESET_VF:
+ ice_reset_vf(sc, vf, true);
+ break;
+ case VIRTCHNL_OP_GET_VF_RESOURCES:
+ ice_vc_get_vf_res_msg(sc, vf, msg);
+ break;
+ case VIRTCHNL_OP_ADD_ETH_ADDR:
+ ice_vc_add_eth_addr_msg(sc, vf, msg);
+ break;
+ case VIRTCHNL_OP_DEL_ETH_ADDR:
+ ice_vc_del_eth_addr_msg(sc, vf, msg);
+ break;
+ case VIRTCHNL_OP_ADD_VLAN:
+ ice_vc_add_vlan_msg(sc, vf, msg);
+ break;
+ case VIRTCHNL_OP_DEL_VLAN:
+ ice_vc_del_vlan_msg(sc, vf, msg);
+ break;
+ case VIRTCHNL_OP_CONFIG_VSI_QUEUES:
+ ice_vc_cfg_vsi_qs_msg(sc, vf, msg);
+ break;
+ case VIRTCHNL_OP_CONFIG_RSS_KEY:
+ ice_vc_cfg_rss_key_msg(sc, vf, msg);
+ break;
+ case VIRTCHNL_OP_CONFIG_RSS_LUT:
+ ice_vc_cfg_rss_lut_msg(sc, vf, msg);
+ break;
+ case VIRTCHNL_OP_SET_RSS_HENA:
+ ice_vc_set_rss_hena_msg(sc, vf, msg);
+ break;
+ case VIRTCHNL_OP_ENABLE_QUEUES:
+ ice_vc_enable_queues_msg(sc, vf, msg);
+ ice_vc_notify_vf_link_state(sc, vf);
+ break;
+ case VIRTCHNL_OP_DISABLE_QUEUES:
+ ice_vc_disable_queues_msg(sc, vf, msg);
+ break;
+ case VIRTCHNL_OP_CONFIG_IRQ_MAP:
+ ice_vc_cfg_irq_map_msg(sc, vf, msg);
+ break;
+ case VIRTCHNL_OP_GET_STATS:
+ ice_vc_get_stats_msg(sc, vf, msg);
+ break;
+ case VIRTCHNL_OP_CONFIG_PROMISCUOUS_MODE:
+ ice_vc_cfg_promisc_mode_msg(sc, vf, msg);
+ break;
+ default:
+ device_printf(dev, "%s: Received unknown msg from VF-%d: opcode %d, len %d\n",
+ __func__, vf->vf_num, v_opcode, msglen);
+ ice_aq_send_msg_to_vf(hw, v_id, v_opcode,
+ VIRTCHNL_STATUS_ERR_NOT_SUPPORTED, NULL, 0, NULL);
+ break;
+ }
+}
+
+/**
+ * ice_iov_setup_intr_mapping - Setup interrupt config for a VF
+ * @sc: device softc structure
+ * @vf: driver's VF structure for VF to be configured
+ *
+ * Before a VF can be used, and after a VF reset, the PF must configure
+ * the VF's interrupt allocation registers. This includes allocating
+ * interrupts from the PF's interrupt pool to the VF using the
+ * VPINT_ALLOC(_PCI) registers, and setting up a mapping from PF vectors
+ * to VF vectors in GLINT_VECT2FUNC.
+ *
+ * As well, this sets up queue allocation registers and maps the mailbox
+ * interrupt for the VF.
+ */
+static void
+ice_iov_setup_intr_mapping(struct ice_softc *sc, struct ice_vf *vf)
+{
+ struct ice_hw *hw = &sc->hw;
+ struct ice_vsi *vsi = vf->vsi;
+ u16 v;
+
+ /* Calculate indices for register ops below */
+ u16 vf_first_irq_idx = vf->vf_imap[0];
+ u16 vf_last_irq_idx = (vf_first_irq_idx + vf->num_irq_vectors) - 1;
+ u16 abs_vf_first_irq_idx = hw->func_caps.common_cap.msix_vector_first_id +
+ vf_first_irq_idx;
+ u16 abs_vf_last_irq_idx = (abs_vf_first_irq_idx + vf->num_irq_vectors) - 1;
+ u16 abs_vf_num = vf->vf_num + hw->func_caps.vf_base_id;
+
+ /* Map out VF interrupt allocation in global device space. Both
+ * VPINT_ALLOC and VPINT_ALLOC_PCI use the same values.
+ */
+ wr32(hw, VPINT_ALLOC(vf->vf_num),
+ (((abs_vf_first_irq_idx << VPINT_ALLOC_FIRST_S) & VPINT_ALLOC_FIRST_M) |
+ ((abs_vf_last_irq_idx << VPINT_ALLOC_LAST_S) & VPINT_ALLOC_LAST_M) |
+ VPINT_ALLOC_VALID_M));
+ wr32(hw, VPINT_ALLOC_PCI(vf->vf_num),
+ (((abs_vf_first_irq_idx << VPINT_ALLOC_PCI_FIRST_S) & VPINT_ALLOC_PCI_FIRST_M) |
+ ((abs_vf_last_irq_idx << VPINT_ALLOC_PCI_LAST_S) & VPINT_ALLOC_PCI_LAST_M) |
+ VPINT_ALLOC_PCI_VALID_M));
+
+ /* Create inverse mapping of vectors to PF/VF combinations */
+ for (v = vf_first_irq_idx; v <= vf_last_irq_idx; v++)
+ {
+ wr32(hw, GLINT_VECT2FUNC(v),
+ (((abs_vf_num << GLINT_VECT2FUNC_VF_NUM_S) & GLINT_VECT2FUNC_VF_NUM_M) |
+ ((hw->pf_id << GLINT_VECT2FUNC_PF_NUM_S) & GLINT_VECT2FUNC_PF_NUM_M)));
+ }
+
+ /* Map mailbox interrupt to MSI-X index 0. Disable ITR for it, too. */
+ wr32(hw, VPINT_MBX_CTL(abs_vf_num),
+ ((0 << VPINT_MBX_CTL_MSIX_INDX_S) & VPINT_MBX_CTL_MSIX_INDX_M) |
+ ((0x3 << VPINT_MBX_CTL_ITR_INDX_S) & VPINT_MBX_CTL_ITR_INDX_M) |
+ VPINT_MBX_CTL_CAUSE_ENA_M);
+
+ /* Mark the TX queue mapping registers as valid */
+ wr32(hw, VPLAN_TXQ_MAPENA(vf->vf_num), VPLAN_TXQ_MAPENA_TX_ENA_M);
+
+ /* Indicate to HW that VF has scattered queue allocation */
+ wr32(hw, VPLAN_TX_QBASE(vf->vf_num), VPLAN_TX_QBASE_VFQTABLE_ENA_M);
+ for (int i = 0; i < vsi->num_tx_queues; i++) {
+ wr32(hw, VPLAN_TX_QTABLE(i, vf->vf_num),
+ (vsi->tx_qmap[i] << VPLAN_TX_QTABLE_QINDEX_S) & VPLAN_TX_QTABLE_QINDEX_M);
+ }
+
+ /* Mark the RX queue mapping registers as valid */
+ wr32(hw, VPLAN_RXQ_MAPENA(vf->vf_num), VPLAN_RXQ_MAPENA_RX_ENA_M);
+ wr32(hw, VPLAN_RX_QBASE(vf->vf_num), VPLAN_RX_QBASE_VFQTABLE_ENA_M);
+ for (int i = 0; i < vsi->num_rx_queues; i++) {
+ wr32(hw, VPLAN_RX_QTABLE(i, vf->vf_num),
+ (vsi->rx_qmap[i] << VPLAN_RX_QTABLE_QINDEX_S) & VPLAN_RX_QTABLE_QINDEX_M);
+ }
+}
+
+/**
+ * ice_err_to_virt err - translate ice errors into virtchnl errors
+ * @ice_err: status returned from ice function
+ */
+static enum virtchnl_status_code
+ice_iov_err_to_virt_err(int ice_err)
+{
+ switch (ice_err) {
+ case 0:
+ return VIRTCHNL_STATUS_SUCCESS;
+ case ICE_ERR_BAD_PTR:
+ case ICE_ERR_INVAL_SIZE:
+ case ICE_ERR_DEVICE_NOT_SUPPORTED:
+ case ICE_ERR_PARAM:
+ case ICE_ERR_CFG:
+ return VIRTCHNL_STATUS_ERR_PARAM;
+ case ICE_ERR_NO_MEMORY:
+ return VIRTCHNL_STATUS_ERR_NO_MEMORY;
+ case ICE_ERR_NOT_READY:
+ case ICE_ERR_RESET_FAILED:
+ case ICE_ERR_FW_API_VER:
+ case ICE_ERR_AQ_ERROR:
+ case ICE_ERR_AQ_TIMEOUT:
+ case ICE_ERR_AQ_FULL:
+ case ICE_ERR_AQ_NO_WORK:
+ case ICE_ERR_AQ_EMPTY:
+ return VIRTCHNL_STATUS_ERR_ADMIN_QUEUE_ERROR;
+ default:
+ return VIRTCHNL_STATUS_ERR_NOT_SUPPORTED;
+ }
+}
diff --git a/sys/dev/ice/ice_iov.h b/sys/dev/ice/ice_iov.h
new file mode 100644
index 000000000000..c4fb3e932e3f
--- /dev/null
+++ b/sys/dev/ice/ice_iov.h
@@ -0,0 +1,125 @@
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* Copyright (c) 2025, Intel Corporation
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * @file ice_iov.h
+ * @brief header for IOV functionality
+ *
+ * This header includes definitions used to implement device Virtual Functions
+ * for the ice driver.
+ */
+
+#ifndef _ICE_IOV_H_
+#define _ICE_IOV_H_
+
+#include <sys/types.h>
+#include <sys/bus.h>
+#include <sys/nv.h>
+#include <sys/iov_schema.h>
+#include <sys/param.h>
+#include <dev/pci/pcivar.h>
+#include <dev/pci/pcireg.h>
+
+#include <dev/pci/pci_iov.h>
+
+#include "ice_iflib.h"
+#include "ice_vf_mbx.h"
+
+/**
+ * @enum ice_vf_flags
+ * @brief VF state flags
+ *
+ * Used to indicate the status of a PF's VF, as well as indicating what each VF
+ * is capabile of. Intended to be modified only using atomic operations, so
+ * they can be read and modified in places that aren't locked.
+ *
+ * Used in struct ice_vf's vf_flags field.
+ */
+enum ice_vf_flags {
+ VF_FLAG_ENABLED = BIT(0),
+ VF_FLAG_SET_MAC_CAP = BIT(1),
+ VF_FLAG_VLAN_CAP = BIT(2),
+ VF_FLAG_PROMISC_CAP = BIT(3),
+ VF_FLAG_MAC_ANTI_SPOOF = BIT(4),
+};
+
+/**
+ * @struct ice_vf
+ * @brief PF's VF software context
+ *
+ * Represents the state and options for a VF spawned from a PF.
+ */
+struct ice_vf {
+ struct ice_vsi *vsi;
+ u32 vf_flags;
+
+ u8 mac[ETHER_ADDR_LEN];
+ u16 vf_num;
+ struct virtchnl_version_info version;
+
+ u16 mac_filter_limit;
+ u16 mac_filter_cnt;
+ u16 vlan_limit;
+ u16 vlan_cnt;
+
+ u16 num_irq_vectors;
+ u16 *vf_imap;
+ struct ice_irq_vector *tx_irqvs;
+ struct ice_irq_vector *rx_irqvs;
+};
+
+#define ICE_PCIE_DEV_STATUS 0xAA
+
+#define ICE_PCI_CIAD_WAIT_COUNT 100
+#define ICE_PCI_CIAD_WAIT_DELAY_US 1
+#define ICE_VPGEN_VFRSTAT_WAIT_COUNT 100
+#define ICE_VPGEN_VFRSTAT_WAIT_DELAY_US 20
+
+#define ICE_VIRTCHNL_VALID_PROMISC_FLAGS (FLAG_VF_UNICAST_PROMISC | \
+ FLAG_VF_MULTICAST_PROMISC)
+
+#define ICE_DEFAULT_VF_VLAN_LIMIT 64
+#define ICE_DEFAULT_VF_FILTER_LIMIT 16
+
+int ice_iov_attach(struct ice_softc *sc);
+int ice_iov_detach(struct ice_softc *sc);
+
+int ice_iov_init(struct ice_softc *sc, uint16_t num_vfs, const nvlist_t *params);
+int ice_iov_add_vf(struct ice_softc *sc, uint16_t vfnum, const nvlist_t *params);
+void ice_iov_uninit(struct ice_softc *sc);
+
+void ice_iov_handle_vflr(struct ice_softc *sc);
+
+void ice_vc_handle_vf_msg(struct ice_softc *sc, struct ice_rq_event_info *event);
+void ice_vc_notify_all_vfs_link_state(struct ice_softc *sc);
+
+#endif /* _ICE_IOV_H_ */
+
diff --git a/sys/dev/ice/ice_lib.c b/sys/dev/ice/ice_lib.c
index d44ae5f37750..442111e5ffaf 100644
--- a/sys/dev/ice/ice_lib.c
+++ b/sys/dev/ice/ice_lib.c
@@ -42,6 +42,9 @@
#include "ice_lib.h"
#include "ice_iflib.h"
+#ifdef PCI_IOV
+#include "ice_iov.h"
+#endif
#include <dev/pci/pcivar.h>
#include <dev/pci/pcireg.h>
#include <machine/resource.h>
@@ -741,6 +744,12 @@ ice_initialize_vsi(struct ice_vsi *vsi)
case ICE_VSI_VMDQ2:
ctx.flags = ICE_AQ_VSI_TYPE_VMDQ2;
break;
+#ifdef PCI_IOV
+ case ICE_VSI_VF:
+ ctx.flags = ICE_AQ_VSI_TYPE_VF;
+ ctx.vf_num = vsi->vf_num;
+ break;
+#endif
default:
return (ENODEV);
}
@@ -1607,6 +1616,12 @@ ice_setup_tx_ctx(struct ice_tx_queue *txq, struct ice_tlan_ctx *tlan_ctx, u16 pf
case ICE_VSI_VMDQ2:
tlan_ctx->vmvf_type = ICE_TLAN_CTX_VMVF_TYPE_VMQ;
break;
+#ifdef PCI_IOV
+ case ICE_VSI_VF:
+ tlan_ctx->vmvf_type = ICE_TLAN_CTX_VMVF_TYPE_VF;
+ tlan_ctx->vmvf_num = hw->func_caps.vf_base_id + vsi->vf_num;
+ break;
+#endif
default:
return (ENODEV);
}
@@ -1660,6 +1675,10 @@ ice_cfg_vsi_for_tx(struct ice_vsi *vsi)
struct ice_tlan_ctx tlan_ctx = { 0 };
struct ice_tx_queue *txq = &vsi->tx_queues[i];
+ /* Last configured queue */
+ if (txq->desc_count == 0)
+ break;
+
pf_q = vsi->tx_qmap[txq->me];
qg->txqs[0].txq_id = htole16(pf_q);
@@ -1788,6 +1807,10 @@ ice_cfg_vsi_for_rx(struct ice_vsi *vsi)
for (i = 0; i < vsi->num_rx_queues; i++) {
MPASS(vsi->mbuf_sz > 0);
+ /* Last configured queue */
+ if (vsi->rx_queues[i].desc_count == 0)
+ break;
+
err = ice_setup_rx_ctx(&vsi->rx_queues[i]);
if (err)
return err;
@@ -2257,6 +2280,11 @@ ice_process_ctrlq_event(struct ice_softc *sc, const char *qname,
case ice_aqc_opc_get_link_status:
ice_process_link_event(sc, event);
break;
+#ifdef PCI_IOV
+ case ice_mbx_opc_send_msg_to_pf:
+ ice_vc_handle_vf_msg(sc, event);
+ break;
+#endif
case ice_aqc_opc_fw_logs_event:
ice_handle_fw_log_event(sc, &event->desc, event->msg_buf);
break;
diff --git a/sys/dev/ice/ice_lib.h b/sys/dev/ice/ice_lib.h
index b6b23ec82161..308b2bda2790 100644
--- a/sys/dev/ice/ice_lib.h
+++ b/sys/dev/ice/ice_lib.h
@@ -611,6 +611,10 @@ struct ice_vsi {
u16 mirror_src_vsi;
u16 rule_mir_ingress;
u16 rule_mir_egress;
+
+#ifdef PCI_IOV
+ u8 vf_num; /* Index of owning VF, if applicable */
+#endif
};
/**
diff --git a/sys/dev/ice/ice_vf_mbx.c b/sys/dev/ice/ice_vf_mbx.c
new file mode 100644
index 000000000000..387a1c6739a6
--- /dev/null
+++ b/sys/dev/ice/ice_vf_mbx.c
@@ -0,0 +1,471 @@
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* Copyright (c) 2025, Intel Corporation
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "ice_common.h"
+#include "ice_hw_autogen.h"
+#include "ice_vf_mbx.h"
+
+/**
+ * ice_aq_send_msg_to_vf
+ * @hw: pointer to the hardware structure
+ * @vfid: VF ID to send msg
+ * @v_opcode: opcodes for VF-PF communication
+ * @v_retval: return error code
+ * @msg: pointer to the msg buffer
+ * @msglen: msg length
+ * @cd: pointer to command details
+ *
+ * Send message to VF driver (0x0802) using mailbox
+ * queue and asynchronously sending message via
+ * ice_sq_send_cmd() function
+ */
+int
+ice_aq_send_msg_to_vf(struct ice_hw *hw, u16 vfid, u32 v_opcode, u32 v_retval,
+ u8 *msg, u16 msglen, struct ice_sq_cd *cd)
+{
+ struct ice_aqc_pf_vf_msg *cmd;
+ struct ice_aq_desc desc;
+
+ ice_fill_dflt_direct_cmd_desc(&desc, ice_mbx_opc_send_msg_to_vf);
+
+ cmd = &desc.params.virt;
+ cmd->id = CPU_TO_LE32(vfid);
+
+ desc.cookie_high = CPU_TO_LE32(v_opcode);
+ desc.cookie_low = CPU_TO_LE32(v_retval);
+
+ if (msglen)
+ desc.flags |= CPU_TO_LE16(ICE_AQ_FLAG_RD);
+
+ return ice_sq_send_cmd(hw, &hw->mailboxq, &desc, msg, msglen, cd);
+}
+
+/**
+ * ice_aq_send_msg_to_pf
+ * @hw: pointer to the hardware structure
+ * @v_opcode: opcodes for VF-PF communication
+ * @v_retval: return error code
+ * @msg: pointer to the msg buffer
+ * @msglen: msg length
+ * @cd: pointer to command details
+ *
+ * Send message to PF driver using mailbox queue. By default, this
+ * message is sent asynchronously, i.e. ice_sq_send_cmd()
+ * does not wait for completion before returning.
+ */
+int
+ice_aq_send_msg_to_pf(struct ice_hw *hw, enum virtchnl_ops v_opcode,
+ int v_retval, u8 *msg, u16 msglen,
+ struct ice_sq_cd *cd)
+{
+ struct ice_aq_desc desc;
+
+ ice_fill_dflt_direct_cmd_desc(&desc, ice_mbx_opc_send_msg_to_pf);
+ desc.cookie_high = CPU_TO_LE32(v_opcode);
+ desc.cookie_low = CPU_TO_LE32(v_retval);
+
+ if (msglen)
+ desc.flags |= CPU_TO_LE16(ICE_AQ_FLAG_RD);
+
+ return ice_sq_send_cmd(hw, &hw->mailboxq, &desc, msg, msglen, cd);
+}
+
+static const u32 ice_legacy_aq_to_vc_speed[] = {
+ VIRTCHNL_LINK_SPEED_100MB, /* BIT(0) */
+ VIRTCHNL_LINK_SPEED_100MB,
+ VIRTCHNL_LINK_SPEED_1GB,
+ VIRTCHNL_LINK_SPEED_1GB,
+ VIRTCHNL_LINK_SPEED_1GB,
+ VIRTCHNL_LINK_SPEED_10GB,
+ VIRTCHNL_LINK_SPEED_20GB,
+ VIRTCHNL_LINK_SPEED_25GB,
+ VIRTCHNL_LINK_SPEED_40GB,
+ VIRTCHNL_LINK_SPEED_40GB,
+ VIRTCHNL_LINK_SPEED_40GB,
+};
+
+/**
+ * ice_conv_link_speed_to_virtchnl
+ * @adv_link_support: determines the format of the returned link speed
+ * @link_speed: variable containing the link_speed to be converted
+ *
+ * Convert link speed supported by HW to link speed supported by virtchnl.
+ * If adv_link_support is true, then return link speed in Mbps. Else return
+ * link speed as a VIRTCHNL_LINK_SPEED_* casted to a u32. Note that the caller
+ * needs to cast back to an enum virtchnl_link_speed in the case where
+ * adv_link_support is false, but when adv_link_support is true the caller can
+ * expect the speed in Mbps.
+ */
+u32 ice_conv_link_speed_to_virtchnl(bool adv_link_support, u16 link_speed)
+{
+ /* convert a BIT() value into an array index */
+ u16 index = (u16)(ice_fls(link_speed) - 1);
+
+ if (adv_link_support)
+ return ice_get_link_speed(index);
+ else if (index < ARRAY_SIZE(ice_legacy_aq_to_vc_speed))
+ /* Virtchnl speeds are not defined for every speed supported in
+ * the hardware. To maintain compatibility with older AVF
+ * drivers, while reporting the speed the new speed values are
+ * resolved to the closest known virtchnl speeds
+ */
+ return ice_legacy_aq_to_vc_speed[index];
+
+ return VIRTCHNL_LINK_SPEED_UNKNOWN;
+}
+
+/* The mailbox overflow detection algorithm helps to check if there
+ * is a possibility of a malicious VF transmitting too many MBX messages to the
+ * PF.
+ * 1. The mailbox snapshot structure, ice_mbx_snapshot, is initialized during
+ * driver initialization in ice_init_hw() using ice_mbx_init_snapshot().
+ * The struct ice_mbx_snapshot helps to track and traverse a static window of
+ * messages within the mailbox queue while looking for a malicious VF.
+ *
+ * 2. When the caller starts processing its mailbox queue in response to an
+ * interrupt, the structure ice_mbx_snapshot is expected to be cleared before
+ * the algorithm can be run for the first time for that interrupt. This
+ * requires calling ice_mbx_reset_snapshot() as well as calling
+ * ice_mbx_reset_vf_info() for each VF tracking structure.
+ *
+ * 3. For every message read by the caller from the MBX Queue, the caller must
+ * call the detection algorithm's entry function ice_mbx_vf_state_handler().
+ * Before every call to ice_mbx_vf_state_handler() the struct ice_mbx_data is
+ * filled as it is required to be passed to the algorithm.
+ *
+ * 4. Every time a message is read from the MBX queue, a tracking structure
+ * for the VF must be passed to the state handler. The boolean output
+ * report_malvf from ice_mbx_vf_state_handler() serves as an indicator to the
+ * caller whether it must report this VF as malicious or not.
+ *
+ * 5. When a VF is identified to be malicious, the caller can send a message
+ * to the system administrator.
+ *
+ * 6. The PF is responsible for maintaining the struct ice_mbx_vf_info
+ * structure for each VF. The PF should clear the VF tracking structure if the
+ * VF is reset. When a VF is shut down and brought back up, we will then
+ * assume that the new VF is not malicious and may report it again if we
+ * detect it again.
+ *
+ * 7. The function ice_mbx_reset_snapshot() is called to reset the information
+ * in ice_mbx_snapshot for every new mailbox interrupt handled.
+ */
+#define ICE_RQ_DATA_MASK(rq_data) ((rq_data) & PF_MBX_ARQH_ARQH_M)
+/* Using the highest value for an unsigned 16-bit value 0xFFFF to indicate that
+ * the max messages check must be ignored in the algorithm
+ */
+#define ICE_IGNORE_MAX_MSG_CNT 0xFFFF
+
+/**
+ * ice_mbx_reset_snapshot - Initialize mailbox snapshot structure
+ * @snap: pointer to the mailbox snapshot
+ */
+static void ice_mbx_reset_snapshot(struct ice_mbx_snapshot *snap)
+{
+ struct ice_mbx_vf_info *vf_info;
+
+ /* Clear mbx_buf in the mailbox snaphot structure and setting the
+ * mailbox snapshot state to a new capture.
+ */
+ ice_memset(&snap->mbx_buf, 0, sizeof(snap->mbx_buf), ICE_NONDMA_MEM);
+ snap->mbx_buf.state = ICE_MAL_VF_DETECT_STATE_NEW_SNAPSHOT;
+
+ /* Reset message counts for all VFs to zero */
+ LIST_FOR_EACH_ENTRY(vf_info, &snap->mbx_vf, ice_mbx_vf_info, list_entry)
+ vf_info->msg_count = 0;
+}
+
+/**
+ * ice_mbx_traverse - Pass through mailbox snapshot
+ * @hw: pointer to the HW struct
+ * @new_state: new algorithm state
+ *
+ * Traversing the mailbox static snapshot without checking
+ * for malicious VFs.
+ */
+static void
+ice_mbx_traverse(struct ice_hw *hw,
+ enum ice_mbx_snapshot_state *new_state)
+{
+ struct ice_mbx_snap_buffer_data *snap_buf;
+ u32 num_iterations;
+
+ snap_buf = &hw->mbx_snapshot.mbx_buf;
+
+ /* As mailbox buffer is circular, applying a mask
+ * on the incremented iteration count.
+ */
+ num_iterations = ICE_RQ_DATA_MASK(++snap_buf->num_iterations);
+
+ /* Checking either of the below conditions to exit snapshot traversal:
+ * Condition-1: If the number of iterations in the mailbox is equal to
+ * the mailbox head which would indicate that we have reached the end
+ * of the static snapshot.
+ * Condition-2: If the maximum messages serviced in the mailbox for a
+ * given interrupt is the highest possible value then there is no need
+ * to check if the number of messages processed is equal to it. If not
+ * check if the number of messages processed is greater than or equal
+ * to the maximum number of mailbox entries serviced in current work item.
+ */
+ if (num_iterations == snap_buf->head ||
+ (snap_buf->max_num_msgs_mbx < ICE_IGNORE_MAX_MSG_CNT &&
+ ++snap_buf->num_msg_proc >= snap_buf->max_num_msgs_mbx))
+ *new_state = ICE_MAL_VF_DETECT_STATE_NEW_SNAPSHOT;
+}
+
+/**
+ * ice_mbx_detect_malvf - Detect malicious VF in snapshot
+ * @hw: pointer to the HW struct
+ * @vf_info: mailbox tracking structure for a VF
+ * @new_state: new algorithm state
+ * @is_malvf: boolean output to indicate if VF is malicious
+ *
+ * This function tracks the number of asynchronous messages
+ * sent per VF and marks the VF as malicious if it exceeds
+ * the permissible number of messages to send.
+ */
+static int
+ice_mbx_detect_malvf(struct ice_hw *hw, struct ice_mbx_vf_info *vf_info,
+ enum ice_mbx_snapshot_state *new_state,
+ bool *is_malvf)
+{
+ /* increment the message count for this VF */
+ vf_info->msg_count++;
+
+ if (vf_info->msg_count >= ICE_ASYNC_VF_MSG_THRESHOLD)
+ *is_malvf = true;
+
+ /* continue to iterate through the mailbox snapshot */
+ ice_mbx_traverse(hw, new_state);
+
+ return 0;
+}
+
+/**
+ * ice_e830_mbx_vf_dec_trig - Decrements the VF mailbox queue counter
+ * @hw: pointer to the HW struct
+ * @event: pointer to the control queue receive event
+ *
+ * This function triggers to decrement the counter
+ * MBX_VF_IN_FLIGHT_MSGS_AT_PF_CNT when the driver replenishes
+ * the buffers at the PF mailbox queue.
+ */
+void ice_e830_mbx_vf_dec_trig(struct ice_hw *hw,
+ struct ice_rq_event_info *event)
+{
+ u16 vfid = LE16_TO_CPU(event->desc.retval);
+
+ wr32(hw, E830_MBX_VF_DEC_TRIG(vfid), 1);
+}
+
+/**
+ * ice_mbx_vf_clear_cnt_e830 - Clear the VF mailbox queue count
+ * @hw: pointer to the HW struct
+ * @vf_id: VF ID in the PF space
+ *
+ * This function clears the counter MBX_VF_IN_FLIGHT_MSGS_AT_PF_CNT, and should
+ * be called when a VF is created and on VF reset.
+ */
+void ice_mbx_vf_clear_cnt_e830(struct ice_hw *hw, u16 vf_id)
+{
+ u32 reg = rd32(hw, E830_MBX_VF_IN_FLIGHT_MSGS_AT_PF_CNT(vf_id));
+
+ wr32(hw, E830_MBX_VF_DEC_TRIG(vf_id), reg);
+}
+
+/**
+ * ice_mbx_vf_state_handler - Handle states of the overflow algorithm
+ * @hw: pointer to the HW struct
+ * @mbx_data: pointer to structure containing mailbox data
+ * @vf_info: mailbox tracking structure for the VF in question
+ * @report_malvf: boolean output to indicate whether VF should be reported
+ *
+ * The function serves as an entry point for the malicious VF
+ * detection algorithm by handling the different states and state
+ * transitions of the algorithm:
+ * New snapshot: This state is entered when creating a new static
+ * snapshot. The data from any previous mailbox snapshot is
+ * cleared and a new capture of the mailbox head and tail is
+ * logged. This will be the new static snapshot to detect
+ * asynchronous messages sent by VFs. On capturing the snapshot
+ * and depending on whether the number of pending messages in that
+ * snapshot exceed the watermark value, the state machine enters
+ * traverse or detect states.
+ * Traverse: If pending message count is below watermark then iterate
+ * through the snapshot without any action on VF.
+ * Detect: If pending message count exceeds watermark traverse
+ * the static snapshot and look for a malicious VF.
+ */
+int
+ice_mbx_vf_state_handler(struct ice_hw *hw, struct ice_mbx_data *mbx_data,
+ struct ice_mbx_vf_info *vf_info, bool *report_malvf)
+{
+ struct ice_mbx_snapshot *snap = &hw->mbx_snapshot;
+ struct ice_mbx_snap_buffer_data *snap_buf;
+ struct ice_ctl_q_info *cq = &hw->mailboxq;
+ enum ice_mbx_snapshot_state new_state;
+ int status = 0;
+ bool is_malvf = false;
+
+ if (!report_malvf || !mbx_data || !vf_info)
+ return ICE_ERR_BAD_PTR;
+
+ *report_malvf = false;
+
+ /* When entering the mailbox state machine assume that the VF
+ * is not malicious until detected.
+ */
+ /* Checking if max messages allowed to be processed while servicing current
+ * interrupt is not less than the defined AVF message threshold.
+ */
+ if (mbx_data->max_num_msgs_mbx <= ICE_ASYNC_VF_MSG_THRESHOLD)
+ return ICE_ERR_INVAL_SIZE;
+
+ /* The watermark value should not be lesser than the threshold limit
+ * set for the number of asynchronous messages a VF can send to mailbox
+ * nor should it be greater than the maximum number of messages in the
+ * mailbox serviced in current interrupt.
+ */
+ if (mbx_data->async_watermark_val < ICE_ASYNC_VF_MSG_THRESHOLD ||
+ mbx_data->async_watermark_val > mbx_data->max_num_msgs_mbx)
+ return ICE_ERR_PARAM;
+
+ new_state = ICE_MAL_VF_DETECT_STATE_INVALID;
+ snap_buf = &snap->mbx_buf;
+
+ switch (snap_buf->state) {
+ case ICE_MAL_VF_DETECT_STATE_NEW_SNAPSHOT:
+ /* Clear any previously held data in mailbox snapshot structure. */
+ ice_mbx_reset_snapshot(snap);
+
+ /* Collect the pending ARQ count, number of messages processed and
+ * the maximum number of messages allowed to be processed from the
+ * Mailbox for current interrupt.
+ */
+ snap_buf->num_pending_arq = mbx_data->num_pending_arq;
+ snap_buf->num_msg_proc = mbx_data->num_msg_proc;
+ snap_buf->max_num_msgs_mbx = mbx_data->max_num_msgs_mbx;
+
+ /* Capture a new static snapshot of the mailbox by logging the
+ * head and tail of snapshot and set num_iterations to the tail
+ * value to mark the start of the iteration through the snapshot.
+ */
+ snap_buf->head = ICE_RQ_DATA_MASK(cq->rq.next_to_clean +
+ mbx_data->num_pending_arq);
+ snap_buf->tail = ICE_RQ_DATA_MASK(cq->rq.next_to_clean - 1);
+ snap_buf->num_iterations = snap_buf->tail;
+
+ /* Pending ARQ messages returned by ice_clean_rq_elem
+ * is the difference between the head and tail of the
+ * mailbox queue. Comparing this value against the watermark
+ * helps to check if we potentially have malicious VFs.
+ */
+ if (snap_buf->num_pending_arq >=
+ mbx_data->async_watermark_val) {
+ new_state = ICE_MAL_VF_DETECT_STATE_DETECT;
+ status = ice_mbx_detect_malvf(hw, vf_info, &new_state, &is_malvf);
+ } else {
+ new_state = ICE_MAL_VF_DETECT_STATE_TRAVERSE;
+ ice_mbx_traverse(hw, &new_state);
+ }
+ break;
+
+ case ICE_MAL_VF_DETECT_STATE_TRAVERSE:
+ new_state = ICE_MAL_VF_DETECT_STATE_TRAVERSE;
+ ice_mbx_traverse(hw, &new_state);
+ break;
+
+ case ICE_MAL_VF_DETECT_STATE_DETECT:
+ new_state = ICE_MAL_VF_DETECT_STATE_DETECT;
+ status = ice_mbx_detect_malvf(hw, vf_info, &new_state, &is_malvf);
+ break;
+
+ default:
+ new_state = ICE_MAL_VF_DETECT_STATE_INVALID;
+ status = ICE_ERR_CFG;
+ }
+
+ snap_buf->state = new_state;
+
+ /* Only report VFs as malicious the first time we detect it */
+ if (is_malvf && !vf_info->malicious) {
+ vf_info->malicious = 1;
+ *report_malvf = true;
+ }
+
+ return status;
+}
+
+/**
+ * ice_mbx_clear_malvf - Clear VF mailbox info
+ * @vf_info: the mailbox tracking structure for a VF
+ *
+ * In case of a VF reset, this function shall be called to clear the VF's
+ * current mailbox tracking state.
+ */
+void ice_mbx_clear_malvf(struct ice_mbx_vf_info *vf_info)
+{
+ vf_info->malicious = 0;
+ vf_info->msg_count = 0;
+}
+
+/**
+ * ice_mbx_init_vf_info - Initialize a new VF mailbox tracking info
+ * @hw: pointer to the hardware structure
+ * @vf_info: the mailbox tracking info structure for a VF
+ *
+ * Initialize a VF mailbox tracking info structure and insert it into the
+ * snapshot list.
+ *
+ * If you remove the VF, you must also delete the associated VF info structure
+ * from the linked list.
+ */
+void ice_mbx_init_vf_info(struct ice_hw *hw, struct ice_mbx_vf_info *vf_info)
+{
+ struct ice_mbx_snapshot *snap = &hw->mbx_snapshot;
+
+ ice_mbx_clear_malvf(vf_info);
+ LIST_ADD(&vf_info->list_entry, &snap->mbx_vf);
+}
+
+/**
+ * ice_mbx_init_snapshot - Initialize mailbox snapshot data
+ * @hw: pointer to the hardware structure
+ *
+ * Clear the mailbox snapshot structure and initialize the VF mailbox list.
+ */
+void ice_mbx_init_snapshot(struct ice_hw *hw)
+{
+ struct ice_mbx_snapshot *snap = &hw->mbx_snapshot;
+
+ INIT_LIST_HEAD(&snap->mbx_vf);
+ ice_mbx_reset_snapshot(snap);
+}
diff --git a/sys/dev/ice/ice_vf_mbx.h b/sys/dev/ice/ice_vf_mbx.h
new file mode 100644
index 000000000000..3b185ac89c11
--- /dev/null
+++ b/sys/dev/ice/ice_vf_mbx.h
@@ -0,0 +1,67 @@
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* Copyright (c) 2025, Intel Corporation
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _ICE_VF_MBX_H_
+#define _ICE_VF_MBX_H_
+
+#include "ice_type.h"
+#include "ice_controlq.h"
+
+/* Defining the mailbox message threshold as 63 asynchronous
+ * pending messages. Normal VF functionality does not require
+ * sending more than 63 asynchronous pending message.
+ */
+
+ /* Threshold value should be used to initialize
+ * MBX_VF_IN_FLIGHT_MSGS_AT_PF_CNT register.
+ */
+#define ICE_ASYNC_VF_MSG_THRESHOLD 63
+
+int
+ice_aq_send_msg_to_pf(struct ice_hw *hw, enum virtchnl_ops v_opcode,
+ int v_retval, u8 *msg, u16 msglen,
+ struct ice_sq_cd *cd);
+int
+ice_aq_send_msg_to_vf(struct ice_hw *hw, u16 vfid, u32 v_opcode, u32 v_retval,
+ u8 *msg, u16 msglen, struct ice_sq_cd *cd);
+
+u32 ice_conv_link_speed_to_virtchnl(bool adv_link_support, u16 link_speed);
+
+void ice_e830_mbx_vf_dec_trig(struct ice_hw *hw,
+ struct ice_rq_event_info *event);
+void ice_mbx_vf_clear_cnt_e830(struct ice_hw *hw, u16 vf_id);
+int
+ice_mbx_vf_state_handler(struct ice_hw *hw, struct ice_mbx_data *mbx_data,
+ struct ice_mbx_vf_info *vf_info, bool *report_malvf);
+void ice_mbx_clear_malvf(struct ice_mbx_vf_info *vf_info);
+void ice_mbx_init_vf_info(struct ice_hw *hw, struct ice_mbx_vf_info *vf_info);
+void ice_mbx_init_snapshot(struct ice_hw *hw);
+#endif /* _ICE_VF_MBX_H_ */
diff --git a/sys/dev/ice/if_ice_iflib.c b/sys/dev/ice/if_ice_iflib.c
index e60ee0f1c5c3..1469d2916465 100644
--- a/sys/dev/ice/if_ice_iflib.c
+++ b/sys/dev/ice/if_ice_iflib.c
@@ -42,6 +42,9 @@
#include "ice_drv_info.h"
#include "ice_switch.h"
#include "ice_sched.h"
+#ifdef PCI_IOV
+#include "ice_iov.h"
+#endif
#include <sys/module.h>
#include <sys/sockio.h>
@@ -85,6 +88,12 @@ static int ice_if_suspend(if_ctx_t ctx);
static int ice_if_resume(if_ctx_t ctx);
static bool ice_if_needs_restart(if_ctx_t ctx, enum iflib_restart_event event);
static void ice_init_link(struct ice_softc *sc);
+#ifdef PCI_IOV
+static int ice_if_iov_init(if_ctx_t ctx, uint16_t num_vfs, const nvlist_t *params);
+static void ice_if_iov_uninit(if_ctx_t ctx);
+static int ice_if_iov_vf_add(if_ctx_t ctx, uint16_t vfnum, const nvlist_t *params);
+static void ice_if_vflr_handle(if_ctx_t ctx);
+#endif
static int ice_setup_mirror_vsi(struct ice_mirr_if *mif);
static int ice_wire_mirror_intrs(struct ice_mirr_if *mif);
static void ice_free_irqvs_subif(struct ice_mirr_if *mif);
@@ -158,6 +167,11 @@ static device_method_t ice_methods[] = {
DEVMETHOD(device_shutdown, iflib_device_shutdown),
DEVMETHOD(device_suspend, iflib_device_suspend),
DEVMETHOD(device_resume, iflib_device_resume),
+#ifdef PCI_IOV
+ DEVMETHOD(pci_iov_init, iflib_device_iov_init),
+ DEVMETHOD(pci_iov_uninit, iflib_device_iov_uninit),
+ DEVMETHOD(pci_iov_add_vf, iflib_device_iov_add_vf),
+#endif
DEVMETHOD_END
};
@@ -198,6 +212,12 @@ static device_method_t ice_iflib_methods[] = {
DEVMETHOD(ifdi_suspend, ice_if_suspend),
DEVMETHOD(ifdi_resume, ice_if_resume),
DEVMETHOD(ifdi_needs_restart, ice_if_needs_restart),
+#ifdef PCI_IOV
+ DEVMETHOD(ifdi_iov_vf_add, ice_if_iov_vf_add),
+ DEVMETHOD(ifdi_iov_init, ice_if_iov_init),
+ DEVMETHOD(ifdi_iov_uninit, ice_if_iov_uninit),
+ DEVMETHOD(ifdi_vflr_handle, ice_if_vflr_handle),
+#endif
DEVMETHOD_END
};
@@ -733,6 +753,9 @@ ice_update_link_status(struct ice_softc *sc, bool update_media)
iflib_link_state_change(sc->ctx, LINK_STATE_DOWN, 0);
ice_rdma_link_change(sc, LINK_STATE_DOWN, 0);
}
+#ifdef PCI_IOV
+ ice_vc_notify_all_vfs_link_state(sc);
+#endif
update_media = true;
}
@@ -831,6 +854,14 @@ ice_if_attach_post(if_ctx_t ctx)
ice_add_device_sysctls(sc);
+#ifdef PCI_IOV
+ if (ice_is_bit_set(sc->feat_cap, ICE_FEATURE_SRIOV)) {
+ err = ice_iov_attach(sc);
+ if (err == ENOMEM)
+ return (err);
+ }
+#endif /* PCI_IOV */
+
/* Get DCBX/LLDP state and start DCBX agent */
ice_init_dcb_setup(sc);
@@ -953,6 +984,11 @@ ice_if_detach(if_ctx_t ctx)
ice_destroy_mirror_interface(sc);
ice_rdma_pf_detach(sc);
+#ifdef PCI_IOV
+ if (ice_is_bit_set(sc->feat_cap, ICE_FEATURE_SRIOV))
+ ice_iov_detach(sc);
+#endif /* PCI_IOV */
+
/* Free allocated media types */
ifmedia_removeall(sc->media);
@@ -1676,6 +1712,11 @@ ice_if_msix_intr_assign(if_ctx_t ctx, int msix)
/* For future interrupt assignments */
sc->last_rid = rid + sc->irdma_vectors;
+#ifdef PCI_IOV
+ /* Create soft IRQ for handling VF resets */
+ iflib_softirq_alloc_generic(ctx, NULL, IFLIB_INTR_IOV, sc, 0, "iov");
+#endif
+
return (0);
fail:
for (; i >= 0; i--, vector--)
@@ -2277,7 +2318,12 @@ ice_transition_recovery_mode(struct ice_softc *sc)
ice_rdma_pf_detach(sc);
ice_clear_bit(ICE_FEATURE_RDMA, sc->feat_cap);
+#ifdef PCI_IOV
+ if (ice_test_and_clear_bit(ICE_FEATURE_SRIOV, sc->feat_en))
+ ice_iov_detach(sc);
+#else
ice_clear_bit(ICE_FEATURE_SRIOV, sc->feat_en);
+#endif /* PCI_IOV */
ice_clear_bit(ICE_FEATURE_SRIOV, sc->feat_cap);
ice_vsi_del_txqs_ctx(vsi);
@@ -2325,7 +2371,12 @@ ice_transition_safe_mode(struct ice_softc *sc)
ice_rdma_pf_detach(sc);
ice_clear_bit(ICE_FEATURE_RDMA, sc->feat_cap);
+#ifdef PCI_IOV
+ if (ice_test_and_clear_bit(ICE_FEATURE_SRIOV, sc->feat_en))
+ ice_iov_detach(sc);
+#else
ice_clear_bit(ICE_FEATURE_SRIOV, sc->feat_en);
+#endif /* PCI_IOV */
ice_clear_bit(ICE_FEATURE_SRIOV, sc->feat_cap);
ice_clear_bit(ICE_FEATURE_RSS, sc->feat_cap);
@@ -2410,6 +2461,15 @@ ice_if_update_admin_status(if_ctx_t ctx)
/* Check and update link status */
ice_update_link_status(sc, false);
+#ifdef PCI_IOV
+ /*
+ * Schedule VFs' reset handler after global resets
+ * and other events were processed.
+ */
+ if (ice_testandclear_state(&sc->state, ICE_STATE_VFLR_PENDING))
+ iflib_iov_intr_deferred(ctx);
+#endif
+
/*
* If there are still messages to process, we need to reschedule
* ourselves. Otherwise, we can just re-enable the interrupt. We'll be
@@ -3349,6 +3409,78 @@ ice_init_link(struct ice_softc *sc)
}
+#ifdef PCI_IOV
+/**
+ * ice_if_iov_init - iov init handler for iflib
+ * @ctx: iflib context pointer
+ * @num_vfs: number of VFs to create
+ * @params: configuration parameters for the PF
+ *
+ * Configure the driver for SR-IOV mode. Used to setup things like memory
+ * before any VFs are created.
+ *
+ * @remark This is a wrapper for ice_iov_init
+ */
+static int
+ice_if_iov_init(if_ctx_t ctx, uint16_t num_vfs, const nvlist_t *params)
+{
+ struct ice_softc *sc = (struct ice_softc *)iflib_get_softc(ctx);
+
+ return ice_iov_init(sc, num_vfs, params);
+}
+
+/**
+ * ice_if_iov_uninit - iov uninit handler for iflib
+ * @ctx: iflib context pointer
+ *
+ * Destroys VFs and frees their memory and resources.
+ *
+ * @remark This is a wrapper for ice_iov_uninit
+ */
+static void
+ice_if_iov_uninit(if_ctx_t ctx)
+{
+ struct ice_softc *sc = (struct ice_softc *)iflib_get_softc(ctx);
+
+ ice_iov_uninit(sc);
+}
+
+/**
+ * ice_if_iov_vf_add - iov add vf handler for iflib
+ * @ctx: iflib context pointer
+ * @vfnum: index of VF to configure
+ * @params: configuration parameters for the VF
+ *
+ * Sets up the VF given by the vfnum index. This is called by the OS
+ * for each VF created by the PF driver after it is spawned.
+ *
+ * @remark This is a wrapper for ice_iov_vf_add
+ */
+static int
+ice_if_iov_vf_add(if_ctx_t ctx, uint16_t vfnum, const nvlist_t *params)
+{
+ struct ice_softc *sc = (struct ice_softc *)iflib_get_softc(ctx);
+
+ return ice_iov_add_vf(sc, vfnum, params);
+}
+
+/**
+ * ice_if_vflr_handle - iov VFLR handler
+ * @ctx: iflib context pointer
+ *
+ * Performs the necessar teardown or setup required for a VF after
+ * a VFLR is initiated.
+ *
+ * @remark This is a wrapper for ice_iov_handle_vflr
+ */
+static void
+ice_if_vflr_handle(if_ctx_t ctx)
+{
+ struct ice_softc *sc = (struct ice_softc *)iflib_get_softc(ctx);
+ ice_iov_handle_vflr(sc);
+}
+#endif /* PCI_IOV */
+
extern struct if_txrx ice_subif_txrx;
/**
diff --git a/sys/dev/iicbus/iichid.c b/sys/dev/iicbus/iichid.c
index 9c0324a24685..3f1d7a0cefba 100644
--- a/sys/dev/iicbus/iichid.c
+++ b/sys/dev/iicbus/iichid.c
@@ -275,62 +275,36 @@ iichid_cmd_read(struct iichid_softc* sc, void *buf, iichid_size_t maxlen,
* 6.1.3 - Retrieval of Input Reports
* DEVICE returns the length (2 Bytes) and the entire Input Report.
*/
- uint8_t actbuf[2] = { 0, 0 };
- /* Read actual input report length. */
+
+ memset(buf, 0xaa, 2); // In case nothing gets read
struct iic_msg msgs[] = {
- { sc->addr, IIC_M_RD | IIC_M_NOSTOP, sizeof(actbuf), actbuf },
+ { sc->addr, IIC_M_RD, maxlen, buf },
};
- uint16_t actlen;
int error;
error = iicbus_transfer(sc->dev, msgs, nitems(msgs));
if (error != 0)
return (error);
- actlen = actbuf[0] | actbuf[1] << 8;
-#ifdef IICHID_SAMPLING
- if ((actlen == 0 && sc->sampling_rate_slow < 0) ||
- (maxlen == 0 && sc->sampling_rate_slow >= 0)) {
-#else
+ DPRINTFN(sc, 5, "%*D\n", msgs[0].len, msgs[0].buf, " ");
+
+ uint16_t actlen = le16dec(buf);
+
if (actlen == 0) {
-#endif
- /* Read and discard reset command response. */
- msgs[0] = (struct iic_msg)
- { sc->addr, IIC_M_RD | IIC_M_NOSTART,
- le16toh(sc->desc.wMaxInputLength) - 2, sc->intr_buf };
- actlen = 0;
if (!sc->reset_acked) {
mtx_lock(&sc->mtx);
sc->reset_acked = true;
wakeup(&sc->reset_acked);
mtx_unlock(&sc->mtx);
}
-#ifdef IICHID_SAMPLING
- } else if ((actlen <= 2 || actlen == 0xFFFF) &&
- sc->sampling_rate_slow >= 0) {
- /* Read and discard 1 byte to send I2C STOP condition. */
- msgs[0] = (struct iic_msg)
- { sc->addr, IIC_M_RD | IIC_M_NOSTART, 1, actbuf };
- actlen = 0;
-#endif
- } else {
- actlen -= 2;
- if (actlen > maxlen) {
- DPRINTF(sc, "input report too big. requested=%d "
- "received=%d\n", maxlen, actlen);
- actlen = maxlen;
- }
- /* Read input report itself. */
- msgs[0] = (struct iic_msg)
- { sc->addr, IIC_M_RD | IIC_M_NOSTART, actlen, buf };
}
- error = iicbus_transfer(sc->dev, msgs, 1);
- if (error == 0 && actual_len != NULL)
+ if (actlen <= 2 || actlen > maxlen) {
+ actlen = 0;
+ }
+ if (actual_len != NULL) {
*actual_len = actlen;
-
- DPRINTFN(sc, 5,
- "%*D - %*D\n", 2, actbuf, " ", msgs[0].len, msgs[0].buf, " ");
+ }
return (error);
}
@@ -566,7 +540,7 @@ iichid_sampling_task(void *context, int pending)
error = iichid_cmd_read(sc, sc->intr_buf, sc->intr_bufsize, &actual);
if (error == 0) {
if (actual > 0) {
- sc->intr_handler(sc->intr_ctx, sc->intr_buf, actual);
+ sc->intr_handler(sc->intr_ctx, sc->intr_buf + 2, actual);
sc->missing_samples = 0;
if (sc->dup_size != actual ||
memcmp(sc->dup_buf, sc->intr_buf, actual) != 0) {
@@ -577,7 +551,7 @@ iichid_sampling_task(void *context, int pending)
++sc->dup_samples;
} else {
if (++sc->missing_samples == 1)
- sc->intr_handler(sc->intr_ctx, sc->intr_buf, 0);
+ sc->intr_handler(sc->intr_ctx, sc->intr_buf + 2, 0);
sc->dup_samples = 0;
}
} else
@@ -632,7 +606,7 @@ iichid_intr(void *context)
if (error == 0) {
if (sc->power_on && sc->open) {
if (actual != 0)
- sc->intr_handler(sc->intr_ctx, sc->intr_buf,
+ sc->intr_handler(sc->intr_ctx, sc->intr_buf + 2,
actual);
else
DPRINTF(sc, "no data received\n");
@@ -842,11 +816,12 @@ iichid_intr_setup(device_t dev, device_t child __unused, hid_intr_t intr,
sc = device_get_softc(dev);
/*
- * Do not rely on wMaxInputLength, as some devices may set it to
- * a wrong length. Find the longest input report in report descriptor.
+ * Do not rely just on wMaxInputLength, as some devices (which?)
+ * may set it to a wrong length. Also find the longest input report
+ * in report descriptor, and add two for the length field.
*/
- rdesc->rdsize =
- MAX(rdesc->isize, le16toh(sc->desc.wMaxInputLength) - 2);
+ rdesc->rdsize = 2 +
+ MAX(rdesc->isize, le16toh(sc->desc.wMaxInputLength));
/* Write and get/set_report sizes are limited by I2C-HID protocol. */
rdesc->grsize = rdesc->srsize = IICHID_SIZE_MAX;
rdesc->wrsize = IICHID_SIZE_MAX;
@@ -919,7 +894,7 @@ iichid_intr_poll(device_t dev, device_t child __unused)
sc = device_get_softc(dev);
error = iichid_cmd_read(sc, sc->intr_buf, sc->intr_bufsize, &actual);
if (error == 0 && actual != 0)
- sc->intr_handler(sc->intr_ctx, sc->intr_buf, actual);
+ sc->intr_handler(sc->intr_ctx, sc->intr_buf + 2, actual);
}
/*
@@ -946,6 +921,7 @@ iichid_read(device_t dev, device_t child __unused, void *buf,
{
struct iichid_softc *sc;
device_t parent;
+ uint8_t *tmpbuf;
int error;
if (maxlen > IICHID_SIZE_MAX)
@@ -954,8 +930,12 @@ iichid_read(device_t dev, device_t child __unused, void *buf,
parent = device_get_parent(sc->dev);
error = iicbus_request_bus(parent, sc->dev, IIC_WAIT);
if (error == 0) {
- error = iichid_cmd_read(sc, buf, maxlen, actlen);
+ tmpbuf = malloc(maxlen + 2, M_DEVBUF, M_WAITOK | M_ZERO);
+ error = iichid_cmd_read(sc, tmpbuf, maxlen + 2, actlen);
iicbus_release_bus(parent, sc->dev);
+ if (*actlen > 0)
+ memcpy(buf, tmpbuf + 2, *actlen);
+ free(tmpbuf, M_DEVBUF);
}
return (iic2errno(error));
}
diff --git a/sys/dev/md/md.c b/sys/dev/md/md.c
index 741a7c013f7d..ec1664fac701 100644
--- a/sys/dev/md/md.c
+++ b/sys/dev/md/md.c
@@ -11,9 +11,9 @@
*/
/*-
- * The following functions are based on the vn(4) driver: mdstart_swap(),
- * mdstart_vnode(), mdcreate_swap(), mdcreate_vnode() and mddestroy(),
- * and as such under the following copyright:
+ * The following functions are based on the historical vn(4) driver:
+ * mdstart_swap(), mdstart_vnode(), mdcreate_swap(), mdcreate_vnode()
+ * and mddestroy(), and as such under the following copyright:
*
* Copyright (c) 1988 University of Utah.
* Copyright (c) 1990, 1993
@@ -89,6 +89,8 @@
#include <sys/unistd.h>
#include <sys/vnode.h>
#include <sys/disk.h>
+#include <sys/param.h>
+#include <sys/bus.h>
#include <geom/geom.h>
#include <geom/geom_int.h>
@@ -2082,8 +2084,10 @@ g_md_init(struct g_class *mp __unused)
{
caddr_t mod;
u_char *ptr, *name, *type;
+ u_char scratch[40];
unsigned len;
int i;
+ vm_offset_t paddr;
/* figure out log2(NINDIR) */
for (i = NINDIR, nshift = -1; i; nshift++)
@@ -2123,6 +2127,25 @@ g_md_init(struct g_class *mp __unused)
sx_xunlock(&md_sx);
}
}
+
+ /*
+ * Load up to 32 pre-loaded disks
+ */
+ for (int i = 0; i < 32; i++) {
+ if (resource_long_value("md", i, "physaddr",
+ (long *) &paddr) != 0 ||
+ resource_int_value("md", i, "len", &len) != 0)
+ break;
+ ptr = (char *)pmap_map(NULL, paddr, paddr + len, VM_PROT_READ);
+ if (ptr != NULL && len != 0) {
+ sprintf(scratch, "preload%d 0x%016jx", i,
+ (uintmax_t)paddr);
+ sx_xlock(&md_sx);
+ md_preloaded(ptr, len, scratch);
+ sx_xunlock(&md_sx);
+ }
+ }
+
status_dev = make_dev(&mdctl_cdevsw, INT_MAX, UID_ROOT, GID_WHEEL,
0600, MDCTL_NAME);
g_topology_lock();
diff --git a/sys/dev/mgb/if_mgb.c b/sys/dev/mgb/if_mgb.c
index 1240d0f84415..409f34167df0 100644
--- a/sys/dev/mgb/if_mgb.c
+++ b/sys/dev/mgb/if_mgb.c
@@ -1435,7 +1435,7 @@ mgb_hw_teardown(struct mgb_softc *sc)
/* Stop MAC */
CSR_CLEAR_REG(sc, MGB_MAC_RX, MGB_MAC_ENBL);
- CSR_WRITE_REG(sc, MGB_MAC_TX, MGB_MAC_ENBL);
+ CSR_CLEAR_REG(sc, MGB_MAC_TX, MGB_MAC_ENBL);
if ((err = mgb_wait_for_bits(sc, MGB_MAC_RX, MGB_MAC_DSBL, 0)))
return (err);
if ((err = mgb_wait_for_bits(sc, MGB_MAC_TX, MGB_MAC_DSBL, 0)))
diff --git a/sys/dev/mlx5/mlx5_accel/ipsec.h b/sys/dev/mlx5/mlx5_accel/ipsec.h
index 361b9f72d873..c3f3a2372482 100644
--- a/sys/dev/mlx5/mlx5_accel/ipsec.h
+++ b/sys/dev/mlx5/mlx5_accel/ipsec.h
@@ -260,8 +260,8 @@ int mlx5e_accel_ipsec_fs_rx_tables_create(struct mlx5e_priv *priv);
void mlx5e_accel_ipsec_fs_rx_catchall_rules_destroy(struct mlx5e_priv *priv);
int mlx5e_accel_ipsec_fs_rx_catchall_rules(struct mlx5e_priv *priv);
int mlx5_accel_ipsec_rx_tag_add(if_t ifp, struct mlx5e_rq_mbuf *mr);
-void mlx5e_accel_ipsec_handle_rx_cqe(struct mbuf *mb, struct mlx5_cqe64 *cqe,
- struct mlx5e_rq_mbuf *mr);
+void mlx5e_accel_ipsec_handle_rx_cqe(if_t ifp, struct mbuf *mb,
+ struct mlx5_cqe64 *cqe, struct mlx5e_rq_mbuf *mr);
static inline int mlx5e_accel_ipsec_flow(struct mlx5_cqe64 *cqe)
{
@@ -269,12 +269,12 @@ static inline int mlx5e_accel_ipsec_flow(struct mlx5_cqe64 *cqe)
}
static inline void
-mlx5e_accel_ipsec_handle_rx(struct mbuf *mb, struct mlx5_cqe64 *cqe,
+mlx5e_accel_ipsec_handle_rx(if_t ifp, struct mbuf *mb, struct mlx5_cqe64 *cqe,
struct mlx5e_rq_mbuf *mr)
{
u32 ipsec_meta_data = be32_to_cpu(cqe->ft_metadata);
if (MLX5_IPSEC_METADATA_MARKER(ipsec_meta_data))
- mlx5e_accel_ipsec_handle_rx_cqe(mb, cqe, mr);
+ mlx5e_accel_ipsec_handle_rx_cqe(ifp, mb, cqe, mr);
}
#endif /* __MLX5_ACCEL_IPSEC_H__ */
diff --git a/sys/dev/mlx5/mlx5_accel/mlx5_ipsec_rxtx.c b/sys/dev/mlx5/mlx5_accel/mlx5_ipsec_rxtx.c
index 0883cfb2d510..5dccb8bc2b87 100644
--- a/sys/dev/mlx5/mlx5_accel/mlx5_ipsec_rxtx.c
+++ b/sys/dev/mlx5/mlx5_accel/mlx5_ipsec_rxtx.c
@@ -24,11 +24,14 @@
*
*/
+#include "opt_ipsec.h"
+
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netipsec/keydb.h>
#include <netipsec/ipsec_offload.h>
+#include <netipsec/xform.h>
#include <dev/mlx5/qp.h>
#include <dev/mlx5/mlx5_en/en.h>
#include <dev/mlx5/mlx5_accel/ipsec.h>
@@ -48,7 +51,8 @@ mlx5_accel_ipsec_rx_tag_add(if_t ifp, struct mlx5e_rq_mbuf *mr)
return (0);
mtag = (struct ipsec_accel_in_tag *)m_tag_get(
- PACKET_TAG_IPSEC_ACCEL_IN, sizeof(*mtag), M_NOWAIT);
+ PACKET_TAG_IPSEC_ACCEL_IN, sizeof(struct ipsec_accel_in_tag) -
+ __offsetof(struct ipsec_accel_in_tag, xh), M_NOWAIT);
if (mtag == NULL)
return (-ENOMEM);
mr->ipsec_mtag = mtag;
@@ -56,8 +60,8 @@ mlx5_accel_ipsec_rx_tag_add(if_t ifp, struct mlx5e_rq_mbuf *mr)
}
void
-mlx5e_accel_ipsec_handle_rx_cqe(struct mbuf *mb, struct mlx5_cqe64 *cqe,
- struct mlx5e_rq_mbuf *mr)
+mlx5e_accel_ipsec_handle_rx_cqe(if_t ifp, struct mbuf *mb,
+ struct mlx5_cqe64 *cqe, struct mlx5e_rq_mbuf *mr)
{
struct ipsec_accel_in_tag *mtag;
u32 drv_spi;
@@ -65,10 +69,12 @@ mlx5e_accel_ipsec_handle_rx_cqe(struct mbuf *mb, struct mlx5_cqe64 *cqe,
drv_spi = MLX5_IPSEC_METADATA_HANDLE(be32_to_cpu(cqe->ft_metadata));
mtag = mr->ipsec_mtag;
WARN_ON(mtag == NULL);
- mr->ipsec_mtag = NULL;
if (mtag != NULL) {
mtag->drv_spi = drv_spi;
- m_tag_prepend(mb, &mtag->tag);
+ if (ipsec_accel_fill_xh(ifp, drv_spi, &mtag->xh)) {
+ m_tag_prepend(mb, &mtag->tag);
+ mr->ipsec_mtag = NULL;
+ }
}
}
diff --git a/sys/dev/mlx5/mlx5_en/mlx5_en_hw_tls_rx.c b/sys/dev/mlx5/mlx5_en/mlx5_en_hw_tls_rx.c
index 4de451f1b039..89d2010656c5 100644
--- a/sys/dev/mlx5/mlx5_en/mlx5_en_hw_tls_rx.c
+++ b/sys/dev/mlx5/mlx5_en/mlx5_en_hw_tls_rx.c
@@ -659,7 +659,8 @@ mlx5e_tls_rx_set_params(void *ctx, struct inpcb *inp, const struct tls_session_p
return (EINVAL);
MLX5_SET64(sw_tls_rx_cntx, ctx, param.initial_record_number, tls_sn_he);
- MLX5_SET(sw_tls_rx_cntx, ctx, param.resync_tcp_sn, tcp_sn_he);
+ MLX5_SET(sw_tls_rx_cntx, ctx, param.resync_tcp_sn, 0);
+ MLX5_SET(sw_tls_rx_cntx, ctx, progress.next_record_tcp_sn, tcp_sn_he);
return (0);
}
diff --git a/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c b/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c
index 6b53db6fea23..eb569488631a 100644
--- a/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c
+++ b/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c
@@ -467,7 +467,7 @@ mlx5e_build_rx_mbuf(struct mlx5_cqe64 *cqe, struct mlx5e_rq *rq,
break;
}
- mlx5e_accel_ipsec_handle_rx(mb, cqe, mr);
+ mlx5e_accel_ipsec_handle_rx(ifp, mb, cqe, mr);
}
static inline void
diff --git a/sys/dev/nvme/nvme_ctrlr.c b/sys/dev/nvme/nvme_ctrlr.c
index 73a7cee4aad0..fd7f00ced14b 100644
--- a/sys/dev/nvme/nvme_ctrlr.c
+++ b/sys/dev/nvme/nvme_ctrlr.c
@@ -48,7 +48,7 @@
#define B4_CHK_RDY_DELAY_MS 2300 /* work around controller bug */
static void nvme_ctrlr_construct_and_submit_aer(struct nvme_controller *ctrlr,
- struct nvme_async_event_request *aer);
+ struct nvme_async_event_request *aer);
static void
nvme_ctrlr_barrier(struct nvme_controller *ctrlr, int flags)
@@ -680,96 +680,6 @@ nvme_ctrlr_log_critical_warnings(struct nvme_controller *ctrlr,
}
static void
-nvme_ctrlr_async_event_log_page_cb(void *arg, const struct nvme_completion *cpl)
-{
- struct nvme_async_event_request *aer = arg;
- struct nvme_health_information_page *health_info;
- struct nvme_ns_list *nsl;
- struct nvme_error_information_entry *err;
- int i;
-
- /*
- * If the log page fetch for some reason completed with an error,
- * don't pass log page data to the consumers. In practice, this case
- * should never happen.
- */
- if (nvme_completion_is_error(cpl))
- nvme_notify_async_consumers(aer->ctrlr, &aer->cpl,
- aer->log_page_id, NULL, 0);
- else {
- /* Convert data to host endian */
- switch (aer->log_page_id) {
- case NVME_LOG_ERROR:
- err = (struct nvme_error_information_entry *)aer->log_page_buffer;
- for (i = 0; i < (aer->ctrlr->cdata.elpe + 1); i++)
- nvme_error_information_entry_swapbytes(err++);
- break;
- case NVME_LOG_HEALTH_INFORMATION:
- nvme_health_information_page_swapbytes(
- (struct nvme_health_information_page *)aer->log_page_buffer);
- break;
- case NVME_LOG_CHANGED_NAMESPACE:
- nvme_ns_list_swapbytes(
- (struct nvme_ns_list *)aer->log_page_buffer);
- break;
- case NVME_LOG_COMMAND_EFFECT:
- nvme_command_effects_page_swapbytes(
- (struct nvme_command_effects_page *)aer->log_page_buffer);
- break;
- case NVME_LOG_RES_NOTIFICATION:
- nvme_res_notification_page_swapbytes(
- (struct nvme_res_notification_page *)aer->log_page_buffer);
- break;
- case NVME_LOG_SANITIZE_STATUS:
- nvme_sanitize_status_page_swapbytes(
- (struct nvme_sanitize_status_page *)aer->log_page_buffer);
- break;
- default:
- break;
- }
-
- if (aer->log_page_id == NVME_LOG_HEALTH_INFORMATION) {
- health_info = (struct nvme_health_information_page *)
- aer->log_page_buffer;
- nvme_ctrlr_log_critical_warnings(aer->ctrlr,
- health_info->critical_warning);
- /*
- * Critical warnings reported through the
- * SMART/health log page are persistent, so
- * clear the associated bits in the async event
- * config so that we do not receive repeated
- * notifications for the same event.
- */
- aer->ctrlr->async_event_config &=
- ~health_info->critical_warning;
- nvme_ctrlr_cmd_set_async_event_config(aer->ctrlr,
- aer->ctrlr->async_event_config, NULL, NULL);
- } else if (aer->log_page_id == NVME_LOG_CHANGED_NAMESPACE &&
- !nvme_use_nvd) {
- nsl = (struct nvme_ns_list *)aer->log_page_buffer;
- for (i = 0; i < nitems(nsl->ns) && nsl->ns[i] != 0; i++) {
- if (nsl->ns[i] > NVME_MAX_NAMESPACES)
- break;
- nvme_notify_ns(aer->ctrlr, nsl->ns[i]);
- }
- }
-
- /*
- * Pass the cpl data from the original async event completion,
- * not the log page fetch.
- */
- nvme_notify_async_consumers(aer->ctrlr, &aer->cpl,
- aer->log_page_id, aer->log_page_buffer, aer->log_page_size);
- }
-
- /*
- * Repost another asynchronous event request to replace the one
- * that just completed.
- */
- nvme_ctrlr_construct_and_submit_aer(aer->ctrlr, aer);
-}
-
-static void
nvme_ctrlr_async_event_cb(void *arg, const struct nvme_completion *cpl)
{
struct nvme_async_event_request *aer = arg;
@@ -784,33 +694,18 @@ nvme_ctrlr_async_event_cb(void *arg, const struct nvme_completion *cpl)
return;
}
- /* Associated log page is in bits 23:16 of completion entry dw0. */
+ /*
+ * Save the completion status and associated log page is in bits 23:16
+ * of completion entry dw0. Print a message and queue it for further
+ * processing.
+ */
+ memcpy(&aer->cpl, cpl, sizeof(*cpl));
aer->log_page_id = NVMEV(NVME_ASYNC_EVENT_LOG_PAGE_ID, cpl->cdw0);
-
nvme_printf(aer->ctrlr, "async event occurred (type 0x%x, info 0x%02x,"
" page 0x%02x)\n", NVMEV(NVME_ASYNC_EVENT_TYPE, cpl->cdw0),
NVMEV(NVME_ASYNC_EVENT_INFO, cpl->cdw0),
aer->log_page_id);
-
- if (is_log_page_id_valid(aer->log_page_id)) {
- aer->log_page_size = nvme_ctrlr_get_log_page_size(aer->ctrlr,
- aer->log_page_id);
- memcpy(&aer->cpl, cpl, sizeof(*cpl));
- nvme_ctrlr_cmd_get_log_page(aer->ctrlr, aer->log_page_id,
- NVME_GLOBAL_NAMESPACE_TAG, aer->log_page_buffer,
- aer->log_page_size, nvme_ctrlr_async_event_log_page_cb,
- aer);
- /* Wait to notify consumers until after log page is fetched. */
- } else {
- nvme_notify_async_consumers(aer->ctrlr, cpl, aer->log_page_id,
- NULL, 0);
-
- /*
- * Repost another asynchronous event request to replace the one
- * that just completed.
- */
- nvme_ctrlr_construct_and_submit_aer(aer->ctrlr, aer);
- }
+ taskqueue_enqueue(aer->ctrlr->taskqueue, &aer->task);
}
static void
@@ -819,15 +714,21 @@ nvme_ctrlr_construct_and_submit_aer(struct nvme_controller *ctrlr,
{
struct nvme_request *req;
- aer->ctrlr = ctrlr;
/*
- * XXX-MJ this should be M_WAITOK but we might be in a non-sleepable
- * callback context. AER completions should be handled on a dedicated
- * thread.
+ * We're racing the reset thread, so let that process submit this again.
+ * XXX does this really solve that race? And is that race even possible
+ * since we only reset when we've no theard from the card in a long
+ * time. Why would we get an AER in the middle of that just before we
+ * kick off the reset?
*/
- req = nvme_allocate_request_null(M_NOWAIT, nvme_ctrlr_async_event_cb,
+ if (ctrlr->is_resetting)
+ return;
+
+ aer->ctrlr = ctrlr;
+ req = nvme_allocate_request_null(M_WAITOK, nvme_ctrlr_async_event_cb,
aer);
aer->req = req;
+ aer->log_page_id = 0; /* Not a valid page */
/*
* Disable timeout here, since asynchronous event requests should by
@@ -1203,6 +1104,140 @@ nvme_ctrlr_reset_task(void *arg, int pending)
atomic_cmpset_32(&ctrlr->is_resetting, 1, 0);
}
+static void
+nvme_ctrlr_aer_done(void *arg, const struct nvme_completion *cpl)
+{
+ struct nvme_async_event_request *aer = arg;
+
+ mtx_lock(&aer->mtx);
+ if (nvme_completion_is_error(cpl))
+ aer->log_page_size = (uint32_t)-1;
+ else
+ aer->log_page_size = nvme_ctrlr_get_log_page_size(
+ aer->ctrlr, aer->log_page_id);
+ wakeup(aer);
+ mtx_unlock(&aer->mtx);
+}
+
+static void
+nvme_ctrlr_aer_task(void *arg, int pending)
+{
+ struct nvme_async_event_request *aer = arg;
+ struct nvme_controller *ctrlr = aer->ctrlr;
+ uint32_t len;
+
+ /*
+ * We're resetting, so just punt.
+ */
+ if (ctrlr->is_resetting)
+ return;
+
+ if (!is_log_page_id_valid(aer->log_page_id)) {
+ /*
+ * Repost another asynchronous event request to replace the one
+ * that just completed.
+ */
+ nvme_notify_async_consumers(ctrlr, &aer->cpl, aer->log_page_id,
+ NULL, 0);
+ nvme_ctrlr_construct_and_submit_aer(ctrlr, aer);
+ goto out;
+ }
+
+ aer->log_page_size = 0;
+ len = nvme_ctrlr_get_log_page_size(aer->ctrlr, aer->log_page_id);
+ nvme_ctrlr_cmd_get_log_page(aer->ctrlr, aer->log_page_id,
+ NVME_GLOBAL_NAMESPACE_TAG, aer->log_page_buffer, len,
+ nvme_ctrlr_aer_done, aer);
+ mtx_lock(&aer->mtx);
+ while (aer->log_page_size == 0)
+ mtx_sleep(aer, &aer->mtx, PRIBIO, "nvme_pt", 0);
+ mtx_unlock(&aer->mtx);
+
+ if (aer->log_page_size != (uint32_t)-1) {
+ /*
+ * If the log page fetch for some reason completed with an
+ * error, don't pass log page data to the consumers. In
+ * practice, this case should never happen.
+ */
+ nvme_notify_async_consumers(aer->ctrlr, &aer->cpl,
+ aer->log_page_id, NULL, 0);
+ goto out;
+ }
+
+ /* Convert data to host endian */
+ switch (aer->log_page_id) {
+ case NVME_LOG_ERROR: {
+ struct nvme_error_information_entry *err =
+ (struct nvme_error_information_entry *)aer->log_page_buffer;
+ for (int i = 0; i < (aer->ctrlr->cdata.elpe + 1); i++)
+ nvme_error_information_entry_swapbytes(err++);
+ break;
+ }
+ case NVME_LOG_HEALTH_INFORMATION:
+ nvme_health_information_page_swapbytes(
+ (struct nvme_health_information_page *)aer->log_page_buffer);
+ break;
+ case NVME_LOG_CHANGED_NAMESPACE:
+ nvme_ns_list_swapbytes(
+ (struct nvme_ns_list *)aer->log_page_buffer);
+ break;
+ case NVME_LOG_COMMAND_EFFECT:
+ nvme_command_effects_page_swapbytes(
+ (struct nvme_command_effects_page *)aer->log_page_buffer);
+ break;
+ case NVME_LOG_RES_NOTIFICATION:
+ nvme_res_notification_page_swapbytes(
+ (struct nvme_res_notification_page *)aer->log_page_buffer);
+ break;
+ case NVME_LOG_SANITIZE_STATUS:
+ nvme_sanitize_status_page_swapbytes(
+ (struct nvme_sanitize_status_page *)aer->log_page_buffer);
+ break;
+ default:
+ break;
+ }
+
+ if (aer->log_page_id == NVME_LOG_HEALTH_INFORMATION) {
+ struct nvme_health_information_page *health_info =
+ (struct nvme_health_information_page *)aer->log_page_buffer;
+
+ /*
+ * Critical warnings reported through the SMART/health log page
+ * are persistent, so clear the associated bits in the async
+ * event config so that we do not receive repeated notifications
+ * for the same event.
+ */
+ nvme_ctrlr_log_critical_warnings(aer->ctrlr,
+ health_info->critical_warning);
+ aer->ctrlr->async_event_config &=
+ ~health_info->critical_warning;
+ nvme_ctrlr_cmd_set_async_event_config(aer->ctrlr,
+ aer->ctrlr->async_event_config, NULL, NULL);
+ } else if (aer->log_page_id == NVME_LOG_CHANGED_NAMESPACE) {
+ struct nvme_ns_list *nsl =
+ (struct nvme_ns_list *)aer->log_page_buffer;
+ for (int i = 0; i < nitems(nsl->ns) && nsl->ns[i] != 0; i++) {
+ if (nsl->ns[i] > NVME_MAX_NAMESPACES)
+ break;
+ nvme_notify_ns(aer->ctrlr, nsl->ns[i]);
+ }
+ }
+
+ /*
+ * Pass the cpl data from the original async event completion, not the
+ * log page fetch.
+ */
+ nvme_notify_async_consumers(aer->ctrlr, &aer->cpl,
+ aer->log_page_id, aer->log_page_buffer, aer->log_page_size);
+
+ /*
+ * Repost another asynchronous event request to replace the one
+ * that just completed.
+ */
+out:
+ nvme_ctrlr_construct_and_submit_aer(ctrlr, aer);
+}
+
/*
* Poll all the queues enabled on the device for completion.
*/
@@ -1574,13 +1609,8 @@ nvme_ctrlr_construct(struct nvme_controller *ctrlr, device_t dev)
/*
* Create 2 threads for the taskqueue. The reset thread will block when
* it detects that the controller has failed until all I/O has been
- * failed up the stack. The fail_req task needs to be able to run in
- * this case to finish the request failure for some cases.
- *
- * We could partially solve this race by draining the failed requeust
- * queue before proceding to free the sim, though nothing would stop
- * new I/O from coming in after we do that drain, but before we reach
- * cam_sim_free, so this big hammer is used instead.
+ * failed up the stack. The second thread is used for AER events, which
+ * can block, but only briefly for memory and log page fetching.
*/
ctrlr->taskqueue = taskqueue_create("nvme_taskq", M_WAITOK,
taskqueue_thread_enqueue, &ctrlr->taskqueue);
@@ -1590,7 +1620,12 @@ nvme_ctrlr_construct(struct nvme_controller *ctrlr, device_t dev)
ctrlr->is_initialized = false;
ctrlr->notification_sent = 0;
TASK_INIT(&ctrlr->reset_task, 0, nvme_ctrlr_reset_task, ctrlr);
- STAILQ_INIT(&ctrlr->fail_req);
+ for (int i = 0; i < NVME_MAX_ASYNC_EVENTS; i++) {
+ struct nvme_async_event_request *aer = &ctrlr->aer[i];
+
+ TASK_INIT(&aer->task, 0, nvme_ctrlr_aer_task, aer);
+ mtx_init(&aer->mtx, "AER mutex", NULL, MTX_DEF);
+ }
ctrlr->is_failed = false;
make_dev_args_init(&md_args);
@@ -1678,8 +1713,14 @@ nvme_ctrlr_destruct(struct nvme_controller *ctrlr, device_t dev)
}
noadminq:
- if (ctrlr->taskqueue)
+ if (ctrlr->taskqueue) {
taskqueue_free(ctrlr->taskqueue);
+ for (int i = 0; i < NVME_MAX_ASYNC_EVENTS; i++) {
+ struct nvme_async_event_request *aer = &ctrlr->aer[i];
+
+ mtx_destroy(&aer->mtx);
+ }
+ }
if (ctrlr->tag)
bus_teardown_intr(ctrlr->dev, ctrlr->res, ctrlr->tag);
diff --git a/sys/dev/nvme/nvme_private.h b/sys/dev/nvme/nvme_private.h
index 949e69ec9290..36f00fedc48e 100644
--- a/sys/dev/nvme/nvme_private.h
+++ b/sys/dev/nvme/nvme_private.h
@@ -123,6 +123,8 @@ struct nvme_request {
struct nvme_async_event_request {
struct nvme_controller *ctrlr;
struct nvme_request *req;
+ struct task task;
+ struct mtx mtx;
struct nvme_completion cpl;
uint32_t log_page_id;
uint32_t log_page_size;
@@ -307,8 +309,6 @@ struct nvme_controller {
bool isr_warned;
bool is_initialized;
- STAILQ_HEAD(, nvme_request) fail_req;
-
/* Host Memory Buffer */
int hmb_nchunks;
size_t hmb_chunk;
diff --git a/sys/dev/nvmf/controller/nvmft_subr.c b/sys/dev/nvmf/controller/nvmft_subr.c
index bb2bc0988e81..245971813854 100644
--- a/sys/dev/nvmf/controller/nvmft_subr.c
+++ b/sys/dev/nvmf/controller/nvmft_subr.c
@@ -26,46 +26,6 @@ nvmf_nqn_valid(const char *nqn)
len = strnlen(nqn, NVME_NQN_FIELD_SIZE);
if (len == 0 || len > NVMF_NQN_MAX_LEN)
return (false);
-
-#ifdef STRICT_CHECKS
- /*
- * Stricter checks from the spec. Linux does not seem to
- * require these.
- */
-
- /*
- * NVMF_NQN_MIN_LEN does not include '.', and require at least
- * one character of a domain name.
- */
- if (len < NVMF_NQN_MIN_LEN + 2)
- return (false);
- if (memcmp("nqn.", nqn, strlen("nqn.")) != 0)
- return (false);
- nqn += strlen("nqn.");
-
- /* Next 4 digits must be a year. */
- for (u_int i = 0; i < 4; i++) {
- if (!isdigit(nqn[i]))
- return (false);
- }
- nqn += 4;
-
- /* '-' between year and month. */
- if (nqn[0] != '-')
- return (false);
- nqn++;
-
- /* 2 digit month. */
- for (u_int i = 0; i < 2; i++) {
- if (!isdigit(nqn[i]))
- return (false);
- }
- nqn += 2;
-
- /* '.' between month and reverse domain name. */
- if (nqn[0] != '.')
- return (false);
-#endif
return (true);
}
diff --git a/sys/dev/ofw/ofw_bus_subr.c b/sys/dev/ofw/ofw_bus_subr.c
index 4d0479dfb957..b99d784929bc 100644
--- a/sys/dev/ofw/ofw_bus_subr.c
+++ b/sys/dev/ofw/ofw_bus_subr.c
@@ -634,11 +634,89 @@ ofw_bus_find_iparent(phandle_t node)
return (iparent);
}
+static phandle_t
+ofw_bus_search_iparent(phandle_t node)
+{
+ phandle_t iparent;
+
+ do {
+ if (OF_getencprop(node, "interrupt-parent", &iparent,
+ sizeof(iparent)) > 0) {
+ node = OF_node_from_xref(iparent);
+ } else {
+ node = OF_parent(node);
+ }
+ if (node == 0)
+ return (0);
+ } while (!OF_hasprop(node, "#interrupt-cells"));
+
+ return (OF_xref_from_node(node));
+}
+
+static int
+ofw_bus_traverse_imap(phandle_t inode, phandle_t node, uint32_t *intr,
+ int intrsz, pcell_t *res, int ressz, phandle_t *iparentp)
+{
+ struct ofw_bus_iinfo ii;
+ void *reg;
+ uint32_t *intrp;
+ phandle_t iparent;
+ int rv = 0;
+
+ /* We already have an interrupt controller */
+ if (OF_hasprop(node, "interrupt-controller"))
+ return (0);
+
+ intrp = malloc(intrsz, M_OFWPROP, M_WAITOK);
+ memcpy(intrp, intr, intrsz);
+
+ while (true) {
+ /* There is no interrupt-map to follow */
+ if (!OF_hasprop(inode, "interrupt-map")) {
+ free(intrp, M_OFWPROP);
+ return (0);
+ }
+
+ memset(&ii, 0, sizeof(ii));
+ ofw_bus_setup_iinfo(inode, &ii, sizeof(cell_t));
+
+ reg = NULL;
+ if (ii.opi_addrc > 0)
+ reg = malloc(ii.opi_addrc, M_OFWPROP, M_WAITOK);
+
+ rv = ofw_bus_lookup_imap(node, &ii, reg, ii.opi_addrc, intrp,
+ intrsz, res, ressz, &iparent);
+
+ free(reg, M_OFWPROP);
+ free(ii.opi_imap, M_OFWPROP);
+ free(ii.opi_imapmsk, M_OFWPROP);
+ free(intrp, M_OFWPROP);
+
+ if (rv == 0)
+ return (0);
+
+ node = inode;
+ inode = OF_node_from_xref(iparent);
+
+ /* Stop when we have an interrupt controller */
+ if (OF_hasprop(inode, "interrupt-controller")) {
+ *iparentp = iparent;
+ return (rv);
+ }
+
+ intrsz = rv * sizeof(pcell_t);
+ intrp = malloc(intrsz, M_OFWPROP, M_WAITOK);
+ memcpy(intrp, res, intrsz);
+ }
+}
+
int
ofw_bus_intr_to_rl(device_t dev, phandle_t node,
struct resource_list *rl, int *rlen)
{
- phandle_t iparent;
+ phandle_t iparent, iparent_node;
+ uint32_t result[16];
+ uint32_t intrpcells, *intrp;
uint32_t icells, *intr;
int err, i, irqnum, nintr, rid;
bool extended;
@@ -646,15 +724,16 @@ ofw_bus_intr_to_rl(device_t dev, phandle_t node,
nintr = OF_getencprop_alloc_multi(node, "interrupts", sizeof(*intr),
(void **)&intr);
if (nintr > 0) {
- iparent = ofw_bus_find_iparent(node);
+ iparent = ofw_bus_search_iparent(node);
if (iparent == 0) {
device_printf(dev, "No interrupt-parent found, "
"assuming direct parent\n");
iparent = OF_parent(node);
iparent = OF_xref_from_node(iparent);
}
- if (OF_searchencprop(OF_node_from_xref(iparent),
- "#interrupt-cells", &icells, sizeof(icells)) == -1) {
+ iparent_node = OF_node_from_xref(iparent);
+ if (OF_searchencprop(iparent_node, "#interrupt-cells", &icells,
+ sizeof(icells)) == -1) {
device_printf(dev, "Missing #interrupt-cells "
"property, assuming <1>\n");
icells = 1;
@@ -677,7 +756,8 @@ ofw_bus_intr_to_rl(device_t dev, phandle_t node,
for (i = 0; i < nintr; i += icells) {
if (extended) {
iparent = intr[i++];
- if (OF_searchencprop(OF_node_from_xref(iparent),
+ iparent_node = OF_node_from_xref(iparent);
+ if (OF_searchencprop(iparent_node,
"#interrupt-cells", &icells, sizeof(icells)) == -1) {
device_printf(dev, "Missing #interrupt-cells "
"property\n");
@@ -691,7 +771,16 @@ ofw_bus_intr_to_rl(device_t dev, phandle_t node,
break;
}
}
- irqnum = ofw_bus_map_intr(dev, iparent, icells, &intr[i]);
+
+ intrp = &intr[i];
+ intrpcells = ofw_bus_traverse_imap(iparent_node, node, intrp,
+ icells * sizeof(intr[0]), result, sizeof(result), &iparent);
+ if (intrpcells > 0)
+ intrp = result;
+ else
+ intrpcells = icells;
+
+ irqnum = ofw_bus_map_intr(dev, iparent, intrpcells, intrp);
resource_list_add(rl, SYS_RES_IRQ, rid++, irqnum, irqnum, 1);
}
if (rlen != NULL)
diff --git a/sys/dev/qlnx/qlnxe/qlnx_os.c b/sys/dev/qlnx/qlnxe/qlnx_os.c
index 05ec69a70dfe..4ad190374f87 100644
--- a/sys/dev/qlnx/qlnxe/qlnx_os.c
+++ b/sys/dev/qlnx/qlnxe/qlnx_os.c
@@ -30,6 +30,8 @@
* Author : David C Somayajulu, Cavium, Inc., San Jose, CA 95131.
*/
+#include "opt_inet.h"
+
#include <sys/cdefs.h>
#include "qlnx_os.h"
#include "bcm_osal.h"
@@ -2306,8 +2308,6 @@ qlnx_init_ifnet(device_t dev, qlnx_host_t *ha)
else if (device_id == QLOGIC_PCI_DEVICE_ID_1644)
if_setbaudrate(ifp, IF_Gbps(100));
- if_setcapabilities(ifp, IFCAP_LINKSTATE);
-
if_setinitfn(ifp, qlnx_init);
if_setsoftc(ifp, ha);
if_setflags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST);
@@ -2341,7 +2341,6 @@ qlnx_init_ifnet(device_t dev, qlnx_host_t *ha)
if_setcapabilities(ifp, IFCAP_HWCSUM);
if_setcapabilitiesbit(ifp, IFCAP_JUMBO_MTU, 0);
-
if_setcapabilitiesbit(ifp, IFCAP_VLAN_MTU, 0);
if_setcapabilitiesbit(ifp, IFCAP_VLAN_HWTAGGING, 0);
if_setcapabilitiesbit(ifp, IFCAP_VLAN_HWFILTER, 0);
@@ -2350,6 +2349,8 @@ qlnx_init_ifnet(device_t dev, qlnx_host_t *ha)
if_setcapabilitiesbit(ifp, IFCAP_TSO4, 0);
if_setcapabilitiesbit(ifp, IFCAP_TSO6, 0);
if_setcapabilitiesbit(ifp, IFCAP_LRO, 0);
+ if_setcapabilitiesbit(ifp, IFCAP_LINKSTATE, 0);
+ if_setcapabilitiesbit(ifp, IFCAP_HWSTATS, 0);
if_sethwtsomax(ifp, QLNX_MAX_TSO_FRAME_SIZE -
(ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN));
@@ -2778,7 +2779,7 @@ qlnx_ioctl(if_t ifp, u_long cmd, caddr_t data)
if (!p_ptt) {
QL_DPRINT1(ha, "ecore_ptt_acquire failed\n");
- ret = -1;
+ ret = ERESTART;
break;
}
@@ -2789,7 +2790,7 @@ qlnx_ioctl(if_t ifp, u_long cmd, caddr_t data)
ecore_ptt_release(p_hwfn, p_ptt);
if (ret) {
- ret = -1;
+ ret = ENODEV;
break;
}
diff --git a/sys/dev/random/fortuna.c b/sys/dev/random/fortuna.c
index c4282c723a44..8363de99a60a 100644
--- a/sys/dev/random/fortuna.c
+++ b/sys/dev/random/fortuna.c
@@ -341,6 +341,13 @@ random_fortuna_process_event(struct harvest_event *event)
u_int pl;
RANDOM_RESEED_LOCK();
+ /*
+ * Run SP 800-90B health tests on the source if so configured.
+ */
+ if (!random_harvest_healthtest(event)) {
+ RANDOM_RESEED_UNLOCK();
+ return;
+ }
/*-
* FS&K - P_i = P_i|<harvested stuff>
* Accumulate the event into the appropriate pool
diff --git a/sys/dev/random/random_harvestq.c b/sys/dev/random/random_harvestq.c
index 395310b115fb..c7762967c4fb 100644
--- a/sys/dev/random/random_harvestq.c
+++ b/sys/dev/random/random_harvestq.c
@@ -88,6 +88,8 @@ static void random_sources_feed(void);
static __read_mostly bool epoch_inited;
static __read_mostly epoch_t rs_epoch;
+static const char *random_source_descr[ENTROPYSOURCE];
+
/*
* How many events to queue up. We create this many items in
* an 'empty' queue, then transfer them to the 'harvest' queue with
@@ -299,6 +301,230 @@ random_sources_feed(void)
explicit_bzero(entropy, sizeof(entropy));
}
+/*
+ * State used for conducting NIST SP 800-90B health tests on entropy sources.
+ */
+static struct health_test_softc {
+ uint32_t ht_rct_value[HARVESTSIZE + 1];
+ u_int ht_rct_count; /* number of samples with the same value */
+ u_int ht_rct_limit; /* constant after init */
+
+ uint32_t ht_apt_value[HARVESTSIZE + 1];
+ u_int ht_apt_count; /* number of samples with the same value */
+ u_int ht_apt_seq; /* sequence number of the last sample */
+ u_int ht_apt_cutoff; /* constant after init */
+
+ uint64_t ht_total_samples;
+ bool ondemand; /* Set to true to restart the state machine */
+ enum {
+ INIT = 0, /* initial state */
+ DISABLED, /* health checking is disabled */
+ STARTUP, /* doing startup tests, samples are discarded */
+ STEADY, /* steady-state operation */
+ FAILED, /* health check failed, discard samples */
+ } ht_state;
+} healthtest[ENTROPYSOURCE];
+
+#define RANDOM_SELFTEST_STARTUP_SAMPLES 1024 /* 4.3, requirement 4 */
+#define RANDOM_SELFTEST_APT_WINDOW 512 /* 4.4.2 */
+
+static void
+copy_event(uint32_t dst[static HARVESTSIZE + 1],
+ const struct harvest_event *event)
+{
+ memset(dst, 0, sizeof(uint32_t) * (HARVESTSIZE + 1));
+ memcpy(dst, event->he_entropy, event->he_size);
+ dst[HARVESTSIZE] = event->he_somecounter;
+}
+
+static void
+random_healthtest_rct_init(struct health_test_softc *ht,
+ const struct harvest_event *event)
+{
+ ht->ht_rct_count = 1;
+ copy_event(ht->ht_rct_value, event);
+}
+
+/*
+ * Apply the repitition count test to a sample.
+ *
+ * Return false if the test failed, i.e., we observed >= C consecutive samples
+ * with the same value, and true otherwise.
+ */
+static bool
+random_healthtest_rct_next(struct health_test_softc *ht,
+ const struct harvest_event *event)
+{
+ uint32_t val[HARVESTSIZE + 1];
+
+ copy_event(val, event);
+ if (memcmp(val, ht->ht_rct_value, sizeof(ht->ht_rct_value)) != 0) {
+ ht->ht_rct_count = 1;
+ memcpy(ht->ht_rct_value, val, sizeof(ht->ht_rct_value));
+ return (true);
+ } else {
+ ht->ht_rct_count++;
+ return (ht->ht_rct_count < ht->ht_rct_limit);
+ }
+}
+
+static void
+random_healthtest_apt_init(struct health_test_softc *ht,
+ const struct harvest_event *event)
+{
+ ht->ht_apt_count = 1;
+ ht->ht_apt_seq = 1;
+ copy_event(ht->ht_apt_value, event);
+}
+
+static bool
+random_healthtest_apt_next(struct health_test_softc *ht,
+ const struct harvest_event *event)
+{
+ uint32_t val[HARVESTSIZE + 1];
+
+ if (ht->ht_apt_seq == 0) {
+ random_healthtest_apt_init(ht, event);
+ return (true);
+ }
+
+ copy_event(val, event);
+ if (memcmp(val, ht->ht_apt_value, sizeof(ht->ht_apt_value)) == 0) {
+ ht->ht_apt_count++;
+ if (ht->ht_apt_count >= ht->ht_apt_cutoff)
+ return (false);
+ }
+
+ ht->ht_apt_seq++;
+ if (ht->ht_apt_seq == RANDOM_SELFTEST_APT_WINDOW)
+ ht->ht_apt_seq = 0;
+
+ return (true);
+}
+
+/*
+ * Run the health tests for the given event. This is assumed to be called from
+ * a serialized context.
+ */
+bool
+random_harvest_healthtest(const struct harvest_event *event)
+{
+ struct health_test_softc *ht;
+
+ ht = &healthtest[event->he_source];
+
+ /*
+ * Was on-demand testing requested? Restart the state machine if so,
+ * restarting the startup tests.
+ */
+ if (atomic_load_bool(&ht->ondemand)) {
+ atomic_store_bool(&ht->ondemand, false);
+ ht->ht_state = INIT;
+ }
+
+ switch (ht->ht_state) {
+ case __predict_false(INIT):
+ /* Store the first sample and initialize test state. */
+ random_healthtest_rct_init(ht, event);
+ random_healthtest_apt_init(ht, event);
+ ht->ht_total_samples = 0;
+ ht->ht_state = STARTUP;
+ return (false);
+ case DISABLED:
+ /* No health testing for this source. */
+ return (true);
+ case STEADY:
+ case STARTUP:
+ ht->ht_total_samples++;
+ if (random_healthtest_rct_next(ht, event) &&
+ random_healthtest_apt_next(ht, event)) {
+ if (ht->ht_state == STARTUP &&
+ ht->ht_total_samples >=
+ RANDOM_SELFTEST_STARTUP_SAMPLES) {
+ printf(
+ "random: health test passed for source %s\n",
+ random_source_descr[event->he_source]);
+ ht->ht_state = STEADY;
+ }
+ return (ht->ht_state == STEADY);
+ }
+ ht->ht_state = FAILED;
+ printf(
+ "random: health test failed for source %s, discarding samples\n",
+ random_source_descr[event->he_source]);
+ /* FALLTHROUGH */
+ case FAILED:
+ return (false);
+ }
+}
+
+static bool nist_healthtest_enabled = false;
+SYSCTL_BOOL(_kern_random, OID_AUTO, nist_healthtest_enabled,
+ CTLFLAG_RDTUN, &nist_healthtest_enabled, 0,
+ "Enable NIST SP 800-90B health tests for noise sources");
+
+static void
+random_healthtest_init(enum random_entropy_source source)
+{
+ struct health_test_softc *ht;
+
+ ht = &healthtest[source];
+ KASSERT(ht->ht_state == INIT,
+ ("%s: health test state is %d for source %d",
+ __func__, ht->ht_state, source));
+
+ /*
+ * If health-testing is enabled, validate all sources except CACHED and
+ * VMGENID: they are deterministic sources used only a small, fixed
+ * number of times, so statistical testing is not applicable.
+ */
+ if (!nist_healthtest_enabled ||
+ source == RANDOM_CACHED || source == RANDOM_PURE_VMGENID) {
+ ht->ht_state = DISABLED;
+ return;
+ }
+
+ /*
+ * Set cutoff values for the two tests, assuming that each sample has
+ * min-entropy of 1 bit and allowing for an error rate of 1 in 2^{34}.
+ * With a sample rate of RANDOM_KTHREAD_HZ, we expect to see an false
+ * positive once in ~54.5 years.
+ *
+ * The RCT limit comes from the formula in section 4.4.1.
+ *
+ * The APT cutoff is calculated using the formula in section 4.4.2
+ * footnote 10 with the window size changed from 512 to 511, since the
+ * test as written counts the number of samples equal to the first
+ * sample in the window, and thus tests W-1 samples.
+ */
+ ht->ht_rct_limit = 35;
+ ht->ht_apt_cutoff = 330;
+}
+
+static int
+random_healthtest_ondemand(SYSCTL_HANDLER_ARGS)
+{
+ u_int mask, source;
+ int error;
+
+ mask = 0;
+ error = sysctl_handle_int(oidp, &mask, 0, req);
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+
+ while (mask != 0) {
+ source = ffs(mask) - 1;
+ if (source < nitems(healthtest))
+ atomic_store_bool(&healthtest[source].ondemand, true);
+ mask &= ~(1u << source);
+ }
+ return (0);
+}
+SYSCTL_PROC(_kern_random, OID_AUTO, nist_healthtest_ondemand,
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0,
+ random_healthtest_ondemand, "I",
+ "Re-run NIST SP 800-90B startup health tests for a noise source");
+
static int
random_check_uint_harvestmask(SYSCTL_HANDLER_ARGS)
{
@@ -362,7 +588,8 @@ static const char *random_source_descr[ENTROPYSOURCE] = {
[RANDOM_SWI] = "SWI",
[RANDOM_FS_ATIME] = "FS_ATIME",
[RANDOM_UMA] = "UMA",
- [RANDOM_CALLOUT] = "CALLOUT", /* ENVIRONMENTAL_END */
+ [RANDOM_CALLOUT] = "CALLOUT",
+ [RANDOM_RANDOMDEV] = "RANDOMDEV", /* ENVIRONMENTAL_END */
[RANDOM_PURE_OCTEON] = "PURE_OCTEON", /* PURE_START */
[RANDOM_PURE_SAFE] = "PURE_SAFE",
[RANDOM_PURE_GLXSB] = "PURE_GLXSB",
@@ -424,6 +651,9 @@ random_harvestq_init(void *unused __unused)
hc_source_mask = almost_everything_mask;
RANDOM_HARVEST_INIT_LOCK();
harvest_context.hc_active_buf = 0;
+
+ for (int i = 0; i < ENTROPYSOURCE; i++)
+ random_healthtest_init(i);
}
SYSINIT(random_device_h_init, SI_SUB_RANDOM, SI_ORDER_THIRD, random_harvestq_init, NULL);
diff --git a/sys/dev/random/random_harvestq.h b/sys/dev/random/random_harvestq.h
index 7804bf52aa4f..1d462500df85 100644
--- a/sys/dev/random/random_harvestq.h
+++ b/sys/dev/random/random_harvestq.h
@@ -49,4 +49,6 @@ random_get_cyclecount(void)
return ((uint32_t)get_cyclecount());
}
+bool random_harvest_healthtest(const struct harvest_event *event);
+
#endif /* SYS_DEV_RANDOM_RANDOM_HARVESTQ_H_INCLUDED */
diff --git a/sys/dev/random/randomdev.c b/sys/dev/random/randomdev.c
index 9d1c7b1167c8..ced4dd8067d9 100644
--- a/sys/dev/random/randomdev.c
+++ b/sys/dev/random/randomdev.c
@@ -312,7 +312,7 @@ randomdev_accumulate(uint8_t *buf, u_int count)
for (i = 0; i < RANDOM_KEYSIZE_WORDS; i += sizeof(event.he_entropy)/sizeof(event.he_entropy[0])) {
event.he_somecounter = random_get_cyclecount();
event.he_size = sizeof(event.he_entropy);
- event.he_source = RANDOM_CACHED;
+ event.he_source = RANDOM_RANDOMDEV;
event.he_destination = destination++; /* Harmless cheating */
memcpy(event.he_entropy, entropy_data + i, sizeof(event.he_entropy));
p_random_alg_context->ra_event_processor(&event);
diff --git a/sys/dev/ufshci/ufshci_private.h b/sys/dev/ufshci/ufshci_private.h
index cac743884ee6..ac58d44102a0 100644
--- a/sys/dev/ufshci/ufshci_private.h
+++ b/sys/dev/ufshci/ufshci_private.h
@@ -149,6 +149,8 @@ struct ufshci_hw_queue {
bus_dmamap_t queuemem_map;
bus_addr_t req_queue_addr;
+ bus_addr_t *ucd_bus_addr;
+
uint32_t num_entries;
uint32_t num_trackers;
@@ -198,8 +200,6 @@ struct ufshci_req_queue {
bus_dma_tag_t dma_tag_payload;
bus_dmamap_t ucdmem_map;
-
- bus_addr_t ucd_addr;
};
struct ufshci_device {
diff --git a/sys/dev/ufshci/ufshci_req_sdb.c b/sys/dev/ufshci/ufshci_req_sdb.c
index 4670281d367a..b1f303afaef5 100644
--- a/sys/dev/ufshci/ufshci_req_sdb.c
+++ b/sys/dev/ufshci/ufshci_req_sdb.c
@@ -48,6 +48,29 @@ ufshci_req_sdb_cmd_desc_destroy(struct ufshci_req_queue *req_queue)
}
}
+static void
+ufshci_ucd_map(void *arg, bus_dma_segment_t *seg, int nseg, int error)
+{
+ struct ufshci_hw_queue *hwq = arg;
+ int i;
+
+ if (error != 0) {
+ printf("ufshci: Failed to map UCD, error = %d\n", error);
+ return;
+ }
+
+ if (hwq->num_trackers != nseg) {
+ printf(
+ "ufshci: Failed to map UCD, num_trackers = %d, nseg = %d\n",
+ hwq->num_trackers, nseg);
+ return;
+ }
+
+ for (i = 0; i < nseg; i++) {
+ hwq->ucd_bus_addr[i] = seg[i].ds_addr;
+ }
+}
+
static int
ufshci_req_sdb_cmd_desc_construct(struct ufshci_req_queue *req_queue,
uint32_t num_entries, struct ufshci_controller *ctrlr)
@@ -55,7 +78,6 @@ ufshci_req_sdb_cmd_desc_construct(struct ufshci_req_queue *req_queue,
struct ufshci_hw_queue *hwq = &req_queue->hwq[UFSHCI_SDB_Q];
struct ufshci_tracker *tr;
size_t ucd_allocsz, payload_allocsz;
- uint64_t ucdmem_phys;
uint8_t *ucdmem;
int i, error;
@@ -71,10 +93,11 @@ ufshci_req_sdb_cmd_desc_construct(struct ufshci_req_queue *req_queue,
* Allocate physical memory for UTP Command Descriptor (UCD)
* Note: UFSHCI UCD format is restricted to 128-byte alignment.
*/
- error = bus_dma_tag_create(bus_get_dma_tag(ctrlr->dev), 128,
- ctrlr->page_size, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL,
- ucd_allocsz, howmany(ucd_allocsz, ctrlr->page_size),
- ctrlr->page_size, 0, NULL, NULL, &req_queue->dma_tag_ucd);
+ error = bus_dma_tag_create(bus_get_dma_tag(ctrlr->dev), 128, 0,
+ BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, ucd_allocsz,
+ howmany(ucd_allocsz, sizeof(struct ufshci_utp_cmd_desc)),
+ sizeof(struct ufshci_utp_cmd_desc), 0, NULL, NULL,
+ &req_queue->dma_tag_ucd);
if (error != 0) {
ufshci_printf(ctrlr, "request cmd desc tag create failed %d\n",
error);
@@ -88,7 +111,7 @@ ufshci_req_sdb_cmd_desc_construct(struct ufshci_req_queue *req_queue,
}
if (bus_dmamap_load(req_queue->dma_tag_ucd, req_queue->ucdmem_map,
- ucdmem, ucd_allocsz, ufshci_single_map, &ucdmem_phys, 0) != 0) {
+ ucdmem, ucd_allocsz, ufshci_ucd_map, hwq, 0) != 0) {
ufshci_printf(ctrlr, "failed to load cmd desc memory\n");
bus_dmamem_free(req_queue->dma_tag_ucd, req_queue->ucd,
req_queue->ucdmem_map);
@@ -96,7 +119,6 @@ ufshci_req_sdb_cmd_desc_construct(struct ufshci_req_queue *req_queue,
}
req_queue->ucd = (struct ufshci_utp_cmd_desc *)ucdmem;
- req_queue->ucd_addr = ucdmem_phys;
/*
* Allocate physical memory for PRDT
@@ -128,10 +150,9 @@ ufshci_req_sdb_cmd_desc_construct(struct ufshci_req_queue *req_queue,
tr->slot_state = UFSHCI_SLOT_STATE_FREE;
tr->ucd = (struct ufshci_utp_cmd_desc *)ucdmem;
- tr->ucd_bus_addr = ucdmem_phys;
+ tr->ucd_bus_addr = hwq->ucd_bus_addr[i];
ucdmem += sizeof(struct ufshci_utp_cmd_desc);
- ucdmem_phys += sizeof(struct ufshci_utp_cmd_desc);
hwq->act_tr[i] = tr;
}
@@ -175,6 +196,11 @@ ufshci_req_sdb_construct(struct ufshci_controller *ctrlr,
req_queue->hwq = malloc(sizeof(struct ufshci_hw_queue), M_UFSHCI,
M_ZERO | M_NOWAIT);
hwq = &req_queue->hwq[UFSHCI_SDB_Q];
+ hwq->num_entries = req_queue->num_entries;
+ hwq->num_trackers = req_queue->num_trackers;
+ req_queue->hwq->ucd_bus_addr = malloc(sizeof(bus_addr_t) *
+ req_queue->num_trackers,
+ M_UFSHCI, M_ZERO | M_NOWAIT);
mtx_init(&hwq->qlock, "ufshci req_queue lock", NULL, MTX_DEF);
@@ -277,6 +303,7 @@ ufshci_req_sdb_destroy(struct ufshci_controller *ctrlr,
if (mtx_initialized(&hwq->qlock))
mtx_destroy(&hwq->qlock);
+ free(req_queue->hwq->ucd_bus_addr, M_UFSHCI);
free(req_queue->hwq, M_UFSHCI);
}
diff --git a/sys/dev/usb/controller/xhci_pci.c b/sys/dev/usb/controller/xhci_pci.c
index b50e33ea36ce..d5cfd228a429 100644
--- a/sys/dev/usb/controller/xhci_pci.c
+++ b/sys/dev/usb/controller/xhci_pci.c
@@ -99,6 +99,11 @@ xhci_pci_match(device_t self)
return ("AMD Starship USB 3.0 controller");
case 0x149c1022:
return ("AMD Matisse USB 3.0 controller");
+ case 0x15b61022:
+ case 0x15b71022:
+ return ("AMD Raphael/Granite Ridge USB 3.1 controller");
+ case 0x15b81022:
+ return ("AMD Raphael/Granite Ridge USB 2.0 controller");
case 0x15e01022:
case 0x15e11022:
return ("AMD Raven USB 3.1 controller");
@@ -109,6 +114,8 @@ xhci_pci_match(device_t self)
return ("AMD 300 Series USB 3.1 controller");
case 0x43d51022:
return ("AMD 400 Series USB 3.1 controller");
+ case 0x43f71022:
+ return ("AMD 600 Series USB 3.2 controller");
case 0x78121022:
case 0x78141022:
case 0x79141022:
diff --git a/sys/dev/vmm/vmm_dev.c b/sys/dev/vmm/vmm_dev.c
index 819debadd1ac..9f2b009d02ec 100644
--- a/sys/dev/vmm/vmm_dev.c
+++ b/sys/dev/vmm/vmm_dev.c
@@ -30,7 +30,8 @@
#include <dev/vmm/vmm_mem.h>
#include <dev/vmm/vmm_stat.h>
-#if defined(__amd64__) && defined(COMPAT_FREEBSD12)
+#ifdef __amd64__
+#ifdef COMPAT_FREEBSD12
struct vm_memseg_12 {
int segid;
size_t len;
@@ -42,7 +43,22 @@ _Static_assert(sizeof(struct vm_memseg_12) == 80, "COMPAT_FREEBSD12 ABI");
_IOW('v', IOCNUM_ALLOC_MEMSEG, struct vm_memseg_12)
#define VM_GET_MEMSEG_12 \
_IOWR('v', IOCNUM_GET_MEMSEG, struct vm_memseg_12)
-#endif
+#endif /* COMPAT_FREEBSD12 */
+#ifdef COMPAT_FREEBSD14
+struct vm_memseg_14 {
+ int segid;
+ size_t len;
+ char name[VM_MAX_SUFFIXLEN + 1];
+};
+_Static_assert(sizeof(struct vm_memseg_14) == (VM_MAX_SUFFIXLEN + 1 + 16),
+ "COMPAT_FREEBSD14 ABI");
+
+#define VM_ALLOC_MEMSEG_14 \
+ _IOW('v', IOCNUM_ALLOC_MEMSEG, struct vm_memseg_14)
+#define VM_GET_MEMSEG_14 \
+ _IOWR('v', IOCNUM_GET_MEMSEG, struct vm_memseg_14)
+#endif /* COMPAT_FREEBSD14 */
+#endif /* __amd64__ */
struct devmem_softc {
int segid;
@@ -257,7 +273,8 @@ get_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg, size_t len)
}
static int
-alloc_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg, size_t len)
+alloc_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg, size_t len,
+ struct domainset *domainset)
{
char *name;
int error;
@@ -278,8 +295,7 @@ alloc_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg, size_t len)
if (error)
goto done;
}
-
- error = vm_alloc_memseg(sc->vm, mseg->segid, mseg->len, sysmem);
+ error = vm_alloc_memseg(sc->vm, mseg->segid, mseg->len, sysmem, domainset);
if (error)
goto done;
@@ -295,6 +311,20 @@ done:
return (error);
}
+#if defined(__amd64__) && \
+ (defined(COMPAT_FREEBSD14) || defined(COMPAT_FREEBSD12))
+/*
+ * Translate pre-15.0 memory segment identifiers into their 15.0 counterparts.
+ */
+static void
+adjust_segid(struct vm_memseg *mseg)
+{
+ if (mseg->segid != VM_SYSMEM) {
+ mseg->segid += (VM_BOOTROM - 1);
+ }
+}
+#endif
+
static int
vm_get_register_set(struct vcpu *vcpu, unsigned int count, int *regnum,
uint64_t *regval)
@@ -353,10 +383,16 @@ static const struct vmmdev_ioctl vmmdev_ioctls[] = {
VMMDEV_IOCTL(VM_STATS, VMMDEV_IOCTL_LOCK_ONE_VCPU),
VMMDEV_IOCTL(VM_STAT_DESC, 0),
-#if defined(__amd64__) && defined(COMPAT_FREEBSD12)
+#ifdef __amd64__
+#ifdef COMPAT_FREEBSD12
VMMDEV_IOCTL(VM_ALLOC_MEMSEG_12,
VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS),
#endif
+#ifdef COMPAT_FREEBSD14
+ VMMDEV_IOCTL(VM_ALLOC_MEMSEG_14,
+ VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS),
+#endif
+#endif /* __amd64__ */
VMMDEV_IOCTL(VM_ALLOC_MEMSEG,
VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS),
VMMDEV_IOCTL(VM_MMAP_MEMSEG,
@@ -366,9 +402,14 @@ static const struct vmmdev_ioctl vmmdev_ioctls[] = {
VMMDEV_IOCTL(VM_REINIT,
VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS),
-#if defined(__amd64__) && defined(COMPAT_FREEBSD12)
+#ifdef __amd64__
+#if defined(COMPAT_FREEBSD12)
VMMDEV_IOCTL(VM_GET_MEMSEG_12, VMMDEV_IOCTL_SLOCK_MEMSEGS),
#endif
+#ifdef COMPAT_FREEBSD14
+ VMMDEV_IOCTL(VM_GET_MEMSEG_14, VMMDEV_IOCTL_SLOCK_MEMSEGS),
+#endif
+#endif /* __amd64__ */
VMMDEV_IOCTL(VM_GET_MEMSEG, VMMDEV_IOCTL_SLOCK_MEMSEGS),
VMMDEV_IOCTL(VM_MMAP_GETNEXT, VMMDEV_IOCTL_SLOCK_MEMSEGS),
@@ -388,6 +429,7 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
struct vmmdev_softc *sc;
struct vcpu *vcpu;
const struct vmmdev_ioctl *ioctl;
+ struct vm_memseg *mseg;
int error, vcpuid;
sc = vmmdev_lookup2(cdev);
@@ -499,20 +541,77 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
error = vm_munmap_memseg(sc->vm, mu->gpa, mu->len);
break;
}
-#if defined(__amd64__) && defined(COMPAT_FREEBSD12)
+#ifdef __amd64__
+#ifdef COMPAT_FREEBSD12
case VM_ALLOC_MEMSEG_12:
- error = alloc_memseg(sc, (struct vm_memseg *)data,
- sizeof(((struct vm_memseg_12 *)0)->name));
+ mseg = (struct vm_memseg *)data;
+
+ adjust_segid(mseg);
+ error = alloc_memseg(sc, mseg,
+ sizeof(((struct vm_memseg_12 *)0)->name), NULL);
break;
case VM_GET_MEMSEG_12:
- error = get_memseg(sc, (struct vm_memseg *)data,
+ mseg = (struct vm_memseg *)data;
+
+ adjust_segid(mseg);
+ error = get_memseg(sc, mseg,
sizeof(((struct vm_memseg_12 *)0)->name));
break;
-#endif
- case VM_ALLOC_MEMSEG:
- error = alloc_memseg(sc, (struct vm_memseg *)data,
- sizeof(((struct vm_memseg *)0)->name));
+#endif /* COMPAT_FREEBSD12 */
+#ifdef COMPAT_FREEBSD14
+ case VM_ALLOC_MEMSEG_14:
+ mseg = (struct vm_memseg *)data;
+
+ adjust_segid(mseg);
+ error = alloc_memseg(sc, mseg,
+ sizeof(((struct vm_memseg_14 *)0)->name), NULL);
+ break;
+ case VM_GET_MEMSEG_14:
+ mseg = (struct vm_memseg *)data;
+
+ adjust_segid(mseg);
+ error = get_memseg(sc, mseg,
+ sizeof(((struct vm_memseg_14 *)0)->name));
+ break;
+#endif /* COMPAT_FREEBSD14 */
+#endif /* __amd64__ */
+ case VM_ALLOC_MEMSEG: {
+ domainset_t *mask;
+ struct domainset *domainset, domain;
+
+ domainset = NULL;
+ mseg = (struct vm_memseg *)data;
+ if (mseg->ds_policy != DOMAINSET_POLICY_INVALID && mseg->ds_mask != NULL) {
+ if (mseg->ds_mask_size < sizeof(domainset_t) ||
+ mseg->ds_mask_size > DOMAINSET_MAXSIZE / NBBY) {
+ error = ERANGE;
+ break;
+ }
+ memset(&domain, 0, sizeof(domain));
+ mask = malloc(mseg->ds_mask_size, M_VMMDEV, M_WAITOK);
+ error = copyin(mseg->ds_mask, mask, mseg->ds_mask_size);
+ if (error) {
+ free(mask, M_VMMDEV);
+ break;
+ }
+ error = domainset_populate(&domain, mask, mseg->ds_policy,
+ mseg->ds_mask_size);
+ if (error) {
+ free(mask, M_VMMDEV);
+ break;
+ }
+ domainset = domainset_create(&domain);
+ if (domainset == NULL) {
+ error = EINVAL;
+ free(mask, M_VMMDEV);
+ break;
+ }
+ free(mask, M_VMMDEV);
+ }
+ error = alloc_memseg(sc, mseg, sizeof(mseg->name), domainset);
+
break;
+ }
case VM_GET_MEMSEG:
error = get_memseg(sc, (struct vm_memseg *)data,
sizeof(((struct vm_memseg *)0)->name));
@@ -820,7 +919,6 @@ sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS)
buflen = VM_MAX_NAMELEN + 1;
buf = malloc(buflen, M_VMMDEV, M_WAITOK | M_ZERO);
- strlcpy(buf, "beavis", buflen);
error = sysctl_handle_string(oidp, buf, buflen, req);
if (error == 0 && req->newptr != NULL)
error = vmmdev_lookup_and_destroy(buf, req->td->td_ucred);
@@ -830,7 +928,7 @@ sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS)
SYSCTL_PROC(_hw_vmm, OID_AUTO, destroy,
CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE,
NULL, 0, sysctl_vmm_destroy, "A",
- NULL);
+ "Destroy a vmm(4) instance (legacy interface)");
static struct cdevsw vmmdevsw = {
.d_name = "vmmdev",
@@ -909,7 +1007,6 @@ sysctl_vmm_create(SYSCTL_HANDLER_ARGS)
buflen = VM_MAX_NAMELEN + 1;
buf = malloc(buflen, M_VMMDEV, M_WAITOK | M_ZERO);
- strlcpy(buf, "beavis", buflen);
error = sysctl_handle_string(oidp, buf, buflen, req);
if (error == 0 && req->newptr != NULL)
error = vmmdev_create(buf, req->td->td_ucred);
@@ -919,7 +1016,7 @@ sysctl_vmm_create(SYSCTL_HANDLER_ARGS)
SYSCTL_PROC(_hw_vmm, OID_AUTO, create,
CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE,
NULL, 0, sysctl_vmm_create, "A",
- NULL);
+ "Create a vmm(4) instance (legacy interface)");
static int
vmmctl_open(struct cdev *cdev, int flags, int fmt, struct thread *td)
diff --git a/sys/dev/vmm/vmm_mem.c b/sys/dev/vmm/vmm_mem.c
index c61ae2d44b96..be59e37de33d 100644
--- a/sys/dev/vmm/vmm_mem.c
+++ b/sys/dev/vmm/vmm_mem.c
@@ -7,6 +7,7 @@
#include <sys/types.h>
#include <sys/lock.h>
+#include <sys/malloc.h>
#include <sys/sx.h>
#include <sys/systm.h>
@@ -156,10 +157,11 @@ vm_mem_allocated(struct vcpu *vcpu, vm_paddr_t gpa)
}
int
-vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem)
+vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem,
+ struct domainset *obj_domainset)
{
- struct vm_mem *mem;
struct vm_mem_seg *seg;
+ struct vm_mem *mem;
vm_object_t obj;
mem = vm_mem(vm);
@@ -179,13 +181,22 @@ vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem)
return (EINVAL);
}
+ /*
+ * When given an impossible policy, signal an
+ * error to the user.
+ */
+ if (obj_domainset != NULL && domainset_empty_vm(obj_domainset))
+ return (EINVAL);
obj = vm_object_allocate(OBJT_SWAP, len >> PAGE_SHIFT);
if (obj == NULL)
return (ENOMEM);
seg->len = len;
seg->object = obj;
+ if (obj_domainset != NULL)
+ seg->object->domain.dr_policy = obj_domainset;
seg->sysmem = sysmem;
+
return (0);
}
diff --git a/sys/dev/vmm/vmm_mem.h b/sys/dev/vmm/vmm_mem.h
index a4be4c1c57aa..856470cf2590 100644
--- a/sys/dev/vmm/vmm_mem.h
+++ b/sys/dev/vmm/vmm_mem.h
@@ -8,6 +8,27 @@
#ifndef _DEV_VMM_MEM_H_
#define _DEV_VMM_MEM_H_
+/* Maximum number of NUMA domains in a guest. */
+#define VM_MAXMEMDOM 8
+#define VM_MAXSYSMEM VM_MAXMEMDOM
+
+/*
+ * Identifiers for memory segments.
+ * Each guest NUMA domain is represented by a single system
+ * memory segment from [VM_SYSMEM, VM_MAXSYSMEM).
+ * The remaining identifiers can be used to create devmem segments.
+ */
+enum {
+ VM_SYSMEM = 0,
+ VM_BOOTROM = VM_MAXSYSMEM,
+ VM_FRAMEBUFFER,
+ VM_PCIROM,
+ VM_MEMSEG_END
+};
+
+#define VM_MAX_MEMSEGS VM_MEMSEG_END
+#define VM_MAX_MEMMAPS (VM_MAX_MEMSEGS * 2)
+
#ifdef _KERNEL
#include <sys/types.h>
@@ -31,9 +52,6 @@ struct vm_mem_map {
int flags;
};
-#define VM_MAX_MEMSEGS 4
-#define VM_MAX_MEMMAPS 8
-
struct vm_mem {
struct vm_mem_map mem_maps[VM_MAX_MEMMAPS];
struct vm_mem_seg mem_segs[VM_MAX_MEMSEGS];
@@ -55,7 +73,8 @@ void vm_assert_memseg_xlocked(struct vm *vm);
int vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t off,
size_t len, int prot, int flags);
int vm_munmap_memseg(struct vm *vm, vm_paddr_t gpa, size_t len);
-int vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem);
+int vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem,
+ struct domainset *obj_domainset);
void vm_free_memseg(struct vm *vm, int ident);
/*
diff --git a/sys/dev/vt/hw/vga/vt_vga.c b/sys/dev/vt/hw/vga/vt_vga.c
index 64039575c0ad..675c0573bd7e 100644
--- a/sys/dev/vt/hw/vga/vt_vga.c
+++ b/sys/dev/vt/hw/vga/vt_vga.c
@@ -1347,7 +1347,7 @@ vga_postswitch(struct vt_device *vd)
/* Reinit VGA mode, to restore view after app which change mode. */
vga_initialize(vd, (vd->vd_flags & VDF_TEXTMODE));
- /* Ask vt(9) to update chars on visible area. */
+ /* Ask vt(4) to update chars on visible area. */
vd->vd_flags |= VDF_INVALID;
}
diff --git a/sys/dev/vt/vt_core.c b/sys/dev/vt/vt_core.c
index b0f58b38a6f1..b51ef6766de4 100644
--- a/sys/dev/vt/vt_core.c
+++ b/sys/dev/vt/vt_core.c
@@ -125,10 +125,10 @@ static const struct terminal_class vt_termclass = {
(vw)->vw_number)
static SYSCTL_NODE(_kern, OID_AUTO, vt, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
- "vt(9) parameters");
+ "vt(4) parameters");
static VT_SYSCTL_INT(enable_altgr, 1, "Enable AltGr key (Do not assume R.Alt as Alt)");
static VT_SYSCTL_INT(enable_bell, 0, "Enable bell");
-static VT_SYSCTL_INT(debug, 0, "vt(9) debug level");
+static VT_SYSCTL_INT(debug, 0, "vt(4) debug level");
static VT_SYSCTL_INT(deadtimer, 15, "Time to wait busy process in VT_PROCESS mode");
static VT_SYSCTL_INT(suspendswitch, 1, "Switch to VT0 before suspend");
diff --git a/sys/fs/fdescfs/fdesc_vnops.c b/sys/fs/fdescfs/fdesc_vnops.c
index 676ea5de12b8..58a22b8bdc50 100644
--- a/sys/fs/fdescfs/fdesc_vnops.c
+++ b/sys/fs/fdescfs/fdesc_vnops.c
@@ -547,6 +547,8 @@ fdesc_readdir(struct vop_readdir_args *ap)
fmp = VFSTOFDESC(ap->a_vp->v_mount);
if (ap->a_ncookies != NULL)
*ap->a_ncookies = 0;
+ if (ap->a_eofflag != NULL)
+ *ap->a_eofflag = 0;
off = (int)uio->uio_offset;
if (off != uio->uio_offset || off < 0 || (u_int)off % UIO_MX != 0 ||
@@ -559,7 +561,12 @@ fdesc_readdir(struct vop_readdir_args *ap)
fcnt = i - 2; /* The first two nodes are `.' and `..' */
FILEDESC_SLOCK(fdp);
- while (i < fdp->fd_nfiles + 2 && uio->uio_resid >= UIO_MX) {
+ while (uio->uio_resid >= UIO_MX) {
+ if (i >= fdp->fd_nfiles + 2) {
+ if (ap->a_eofflag != NULL)
+ *ap->a_eofflag = 1;
+ break;
+ }
bzero((caddr_t)dp, UIO_MX);
switch (i) {
case 0: /* `.' */
diff --git a/sys/fs/fuse/fuse_internal.h b/sys/fs/fuse/fuse_internal.h
index cddf88095840..932012b5f52a 100644
--- a/sys/fs/fuse/fuse_internal.h
+++ b/sys/fs/fuse/fuse_internal.h
@@ -208,9 +208,9 @@ fuse_match_cred(struct ucred *basecred, struct ucred *usercred)
if (basecred->cr_uid == usercred->cr_uid &&
basecred->cr_uid == usercred->cr_ruid &&
basecred->cr_uid == usercred->cr_svuid &&
- basecred->cr_groups[0] == usercred->cr_groups[0] &&
- basecred->cr_groups[0] == usercred->cr_rgid &&
- basecred->cr_groups[0] == usercred->cr_svgid)
+ basecred->cr_gid == usercred->cr_gid &&
+ basecred->cr_gid == usercred->cr_rgid &&
+ basecred->cr_gid == usercred->cr_svgid)
return (0);
return (EPERM);
diff --git a/sys/fs/fuse/fuse_ipc.c b/sys/fs/fuse/fuse_ipc.c
index 0b6048644d32..a751c09159ff 100644
--- a/sys/fs/fuse/fuse_ipc.c
+++ b/sys/fs/fuse/fuse_ipc.c
@@ -868,7 +868,7 @@ fuse_setup_ihead(struct fuse_in_header *ihead, struct fuse_ticket *ftick,
ihead->pid = pid;
ihead->uid = cred->cr_uid;
- ihead->gid = cred->cr_groups[0];
+ ihead->gid = cred->cr_gid;
}
/*
diff --git a/sys/fs/fuse/fuse_vnops.c b/sys/fs/fuse/fuse_vnops.c
index ae28617537fd..32872e8f3f3a 100644
--- a/sys/fs/fuse/fuse_vnops.c
+++ b/sys/fs/fuse/fuse_vnops.c
@@ -884,7 +884,7 @@ fuse_vnop_copy_file_range(struct vop_copy_file_range_args *ap)
return (EXTERROR(ENOSYS, "FUSE_COPY_FILE_RANGE does not "
"support different credentials for infd and outfd"));
- if (incred->cr_groups[0] != outcred->cr_groups[0])
+ if (incred->cr_gid != outcred->cr_gid)
return (EXTERROR(ENOSYS, "FUSE_COPY_FILE_RANGE does not "
"support different credentials for infd and outfd"));
diff --git a/sys/fs/msdosfs/msdosfs_conv.c b/sys/fs/msdosfs/msdosfs_conv.c
index da4848169173..208b64930e61 100644
--- a/sys/fs/msdosfs/msdosfs_conv.c
+++ b/sys/fs/msdosfs/msdosfs_conv.c
@@ -797,19 +797,24 @@ mbsadjpos(const char **instr, size_t inlen, size_t outlen, int weight, int flag,
static u_char *
dos2unixchr(u_char *outbuf, const u_char **instr, size_t *ilen, int lower, struct msdosfsmount *pmp)
{
- u_char c, *outp;
- size_t len, olen;
+ u_char c, *outp, *outp1;
+ size_t i, len, olen;
outp = outbuf;
if (pmp->pm_flags & MSDOSFSMNT_KICONV && msdosfs_iconv) {
olen = len = 4;
+ outp1 = outp;
if (lower & (LCASE_BASE | LCASE_EXT))
msdosfs_iconv->convchr_case(pmp->pm_d2u, (const char **)instr,
ilen, (char **)&outp, &olen, KICONV_LOWER);
else
msdosfs_iconv->convchr(pmp->pm_d2u, (const char **)instr,
ilen, (char **)&outp, &olen);
+ for (i = 0; i < outp - outp1; i++) {
+ if (outp1[i] == '/')
+ outp1[i] = '?';
+ }
len -= olen;
/*
@@ -826,6 +831,8 @@ dos2unixchr(u_char *outbuf, const u_char **instr, size_t *ilen, int lower, struc
c = dos2unix[c];
if (lower & (LCASE_BASE | LCASE_EXT))
c = u2l[c];
+ if (c == '/')
+ c = '?';
*outp++ = c;
outbuf[1] = '\0';
}
diff --git a/sys/fs/msdosfs/msdosfs_vnops.c b/sys/fs/msdosfs/msdosfs_vnops.c
index 5db61c8951f6..33e0d94954d7 100644
--- a/sys/fs/msdosfs/msdosfs_vnops.c
+++ b/sys/fs/msdosfs/msdosfs_vnops.c
@@ -1521,6 +1521,9 @@ msdosfs_readdir(struct vop_readdir_args *ap)
ap->a_vp, uio, ap->a_cred, ap->a_eofflag);
#endif
+ if (ap->a_eofflag != NULL)
+ *ap->a_eofflag = 0;
+
/*
* msdosfs_readdir() won't operate properly on regular files since
* it does i/o only with the filesystem vnode, and hence can
@@ -1614,8 +1617,11 @@ msdosfs_readdir(struct vop_readdir_args *ap)
on = (offset - bias) & pmp->pm_crbomask;
n = min(pmp->pm_bpcluster - on, uio->uio_resid);
diff = dep->de_FileSize - (offset - bias);
- if (diff <= 0)
- break;
+ if (diff <= 0) {
+ if (ap->a_eofflag != NULL)
+ *ap->a_eofflag = 1;
+ goto out;
+ }
n = min(n, diff);
error = pcbmap(dep, lbn, &bn, &cn, &blsize);
if (error)
@@ -1646,6 +1652,8 @@ msdosfs_readdir(struct vop_readdir_args *ap)
*/
if (dentp->deName[0] == SLOT_EMPTY) {
brelse(bp);
+ if (ap->a_eofflag != NULL)
+ *ap->a_eofflag = 1;
goto out;
}
/*
@@ -1743,15 +1751,6 @@ out:
uio->uio_offset = off;
- /*
- * Set the eofflag (NFS uses it)
- */
- if (ap->a_eofflag) {
- if (dep->de_FileSize - (offset - bias) <= 0)
- *ap->a_eofflag = 1;
- else
- *ap->a_eofflag = 0;
- }
return (error);
}
diff --git a/sys/fs/nfs/nfs_commonport.c b/sys/fs/nfs/nfs_commonport.c
index 0c94f4e7dc52..222cfc03e4b3 100644
--- a/sys/fs/nfs/nfs_commonport.c
+++ b/sys/fs/nfs/nfs_commonport.c
@@ -379,7 +379,8 @@ newnfs_setroot(struct ucred *cred)
{
cred->cr_uid = 0;
- cred->cr_groups[0] = 0;
+ cred->cr_gid = 0;
+ /* XXXKE Fix this if cr_gid gets separated out. */
cred->cr_ngroups = 1;
}
diff --git a/sys/fs/nfs/nfs_commonsubs.c b/sys/fs/nfs/nfs_commonsubs.c
index 4c498e96a3c0..a957315aaa12 100644
--- a/sys/fs/nfs/nfs_commonsubs.c
+++ b/sys/fs/nfs/nfs_commonsubs.c
@@ -647,7 +647,8 @@ nfscl_fillsattr(struct nfsrv_descript *nd, struct vattr *vap,
NFSATTRBIT_TIMECREATE))
NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_TIMECREATE);
(void) nfsv4_fillattr(nd, vp->v_mount, vp, NULL, vap, NULL, 0,
- &attrbits, NULL, NULL, 0, 0, 0, 0, (uint64_t)0, NULL);
+ &attrbits, NULL, NULL, 0, 0, 0, 0, (uint64_t)0, NULL,
+ false, false, false);
break;
}
}
@@ -2646,7 +2647,8 @@ nfsv4_fillattr(struct nfsrv_descript *nd, struct mount *mp, vnode_t vp,
NFSACL_T *saclp, struct vattr *vap, fhandle_t *fhp, int rderror,
nfsattrbit_t *attrbitp, struct ucred *cred, NFSPROC_T *p, int isdgram,
int reterr, int supports_nfsv4acls, int at_root, uint64_t mounted_on_fileno,
- struct statfs *pnfssf)
+ struct statfs *pnfssf, bool xattrsupp, bool has_hiddensystem,
+ bool has_namedattr)
{
int bitpos, retnum = 0;
u_int32_t *tl;
@@ -2660,10 +2662,7 @@ nfsv4_fillattr(struct nfsrv_descript *nd, struct mount *mp, vnode_t vp,
struct nfsfsinfo fsinf;
struct timespec temptime;
NFSACL_T *aclp, *naclp = NULL;
- size_t atsiz;
- bool xattrsupp;
short irflag;
- long has_pathconf;
#ifdef QUOTA
struct dqblk dqb;
uid_t savuid;
@@ -2747,18 +2746,6 @@ nfsv4_fillattr(struct nfsrv_descript *nd, struct mount *mp, vnode_t vp,
}
}
- /* Check to see if Extended Attributes are supported. */
- xattrsupp = false;
- if (NFSISSET_ATTRBIT(retbitp, NFSATTRBIT_XATTRSUPPORT)) {
- if (NFSVOPLOCK(vp, LK_SHARED) == 0) {
- error = VOP_GETEXTATTR(vp, EXTATTR_NAMESPACE_USER,
- "xxx", NULL, &atsiz, cred, p);
- NFSVOPUNLOCK(vp);
- if (error != EOPNOTSUPP)
- xattrsupp = true;
- }
- }
-
/*
* Put out the attribute bitmap for the ones being filled in
* and get the field for the number of attributes returned.
@@ -2780,11 +2767,7 @@ nfsv4_fillattr(struct nfsrv_descript *nd, struct mount *mp, vnode_t vp,
NFSCLRBIT_ATTRBIT(&attrbits,NFSATTRBIT_ACLSUPPORT);
NFSCLRBIT_ATTRBIT(&attrbits,NFSATTRBIT_ACL);
}
- if (cred == NULL || p == NULL || vp == NULL ||
- VOP_PATHCONF(vp, _PC_HAS_HIDDENSYSTEM,
- &has_pathconf) != 0)
- has_pathconf = 0;
- if (has_pathconf == 0) {
+ if (!has_hiddensystem) {
NFSCLRBIT_ATTRBIT(&attrbits, NFSATTRBIT_HIDDEN);
NFSCLRBIT_ATTRBIT(&attrbits, NFSATTRBIT_SYSTEM);
}
@@ -2828,10 +2811,7 @@ nfsv4_fillattr(struct nfsrv_descript *nd, struct mount *mp, vnode_t vp,
break;
case NFSATTRBIT_NAMEDATTR:
NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
- if (VOP_PATHCONF(vp, _PC_HAS_NAMEDATTR,
- &has_pathconf) != 0)
- has_pathconf = 0;
- if (has_pathconf != 0)
+ if (has_namedattr)
*tl = newnfs_true;
else
*tl = newnfs_false;
diff --git a/sys/fs/nfs/nfs_var.h b/sys/fs/nfs/nfs_var.h
index 3b6c1ec90c06..54f60a753c50 100644
--- a/sys/fs/nfs/nfs_var.h
+++ b/sys/fs/nfs/nfs_var.h
@@ -395,8 +395,9 @@ int nfsrv_putopbit(struct nfsrv_descript *, nfsopbit_t *);
void nfsrv_wcc(struct nfsrv_descript *, int, struct nfsvattr *, int,
struct nfsvattr *);
int nfsv4_fillattr(struct nfsrv_descript *, struct mount *, vnode_t, NFSACL_T *,
- struct vattr *, fhandle_t *, int, nfsattrbit_t *,
- struct ucred *, NFSPROC_T *, int, int, int, int, uint64_t, struct statfs *);
+ struct vattr *, fhandle_t *, int, nfsattrbit_t *, struct ucred *,
+ NFSPROC_T *, int, int, int, int, uint64_t, struct statfs *, bool, bool,
+ bool);
void nfsrv_fillattr(struct nfsrv_descript *, struct nfsvattr *);
struct mbuf *nfsrv_adj(struct mbuf *, int, int);
void nfsrv_postopattr(struct nfsrv_descript *, int, struct nfsvattr *);
@@ -735,7 +736,8 @@ int nfsvno_updfilerev(vnode_t, struct nfsvattr *, struct nfsrv_descript *,
NFSPROC_T *);
int nfsvno_fillattr(struct nfsrv_descript *, struct mount *, vnode_t,
struct nfsvattr *, fhandle_t *, int, nfsattrbit_t *,
- struct ucred *, NFSPROC_T *, int, int, int, int, uint64_t);
+ struct ucred *, NFSPROC_T *, int, int, int, int, uint64_t, bool, bool,
+ bool);
int nfsrv_sattr(struct nfsrv_descript *, vnode_t, struct nfsvattr *, nfsattrbit_t *,
NFSACL_T *, NFSPROC_T *);
int nfsv4_sattr(struct nfsrv_descript *, vnode_t, struct nfsvattr *, nfsattrbit_t *,
diff --git a/sys/fs/nfsclient/nfs_clrpcops.c b/sys/fs/nfsclient/nfs_clrpcops.c
index e0e66baca44d..36b534be531e 100644
--- a/sys/fs/nfsclient/nfs_clrpcops.c
+++ b/sys/fs/nfsclient/nfs_clrpcops.c
@@ -5436,7 +5436,8 @@ nfsrpc_setaclrpc(vnode_t vp, struct ucred *cred, NFSPROC_T *p,
NFSZERO_ATTRBIT(&attrbits);
NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_ACL);
(void) nfsv4_fillattr(nd, vp->v_mount, vp, aclp, NULL, NULL, 0,
- &attrbits, NULL, NULL, 0, 0, 0, 0, (uint64_t)0, NULL);
+ &attrbits, NULL, NULL, 0, 0, 0, 0, (uint64_t)0, NULL, false, false,
+ false);
error = nfscl_request(nd, vp, p, cred);
if (error)
return (error);
@@ -6932,7 +6933,8 @@ nfscl_dofflayoutio(vnode_t vp, struct uio *uiop, int *iomode, int *must_commit,
if ((dp->nfsdi_flags & NFSDI_TIGHTCOUPLED) == 0) {
tcred = NFSNEWCRED(cred);
tcred->cr_uid = flp->nfsfl_ffm[mirror].user;
- tcred->cr_groups[0] = flp->nfsfl_ffm[mirror].group;
+ tcred->cr_gid = flp->nfsfl_ffm[mirror].group;
+ /* XXXKE Fix this if cr_gid gets separated out. */
tcred->cr_ngroups = 1;
} else
tcred = cred;
diff --git a/sys/fs/nfsclient/nfs_clstate.c b/sys/fs/nfsclient/nfs_clstate.c
index 1ae5ed1a75ca..99a781640c53 100644
--- a/sys/fs/nfsclient/nfs_clstate.c
+++ b/sys/fs/nfsclient/nfs_clstate.c
@@ -3701,7 +3701,7 @@ nfscl_docb(struct nfsrv_descript *nd, NFSPROC_T *p)
if (!error)
(void) nfsv4_fillattr(nd, NULL, NULL, NULL, &va,
NULL, 0, &rattrbits, NULL, p, 0, 0, 0, 0,
- (uint64_t)0, NULL);
+ (uint64_t)0, NULL, false, false, false);
break;
case NFSV4OP_CBRECALL:
NFSCL_DEBUG(4, "cbrecall\n");
diff --git a/sys/fs/nfsserver/nfs_nfsdport.c b/sys/fs/nfsserver/nfs_nfsdport.c
index 43ee0383669f..4f0d5946d6b9 100644
--- a/sys/fs/nfsserver/nfs_nfsdport.c
+++ b/sys/fs/nfsserver/nfs_nfsdport.c
@@ -2112,7 +2112,8 @@ int
nfsvno_fillattr(struct nfsrv_descript *nd, struct mount *mp, struct vnode *vp,
struct nfsvattr *nvap, fhandle_t *fhp, int rderror, nfsattrbit_t *attrbitp,
struct ucred *cred, struct thread *p, int isdgram, int reterr,
- int supports_nfsv4acls, int at_root, uint64_t mounted_on_fileno)
+ int supports_nfsv4acls, int at_root, uint64_t mounted_on_fileno,
+ bool xattrsupp, bool has_hiddensystem, bool has_namedattr)
{
struct statfs *sf;
int error;
@@ -2131,7 +2132,7 @@ nfsvno_fillattr(struct nfsrv_descript *nd, struct mount *mp, struct vnode *vp,
}
error = nfsv4_fillattr(nd, mp, vp, NULL, &nvap->na_vattr, fhp, rderror,
attrbitp, cred, p, isdgram, reterr, supports_nfsv4acls, at_root,
- mounted_on_fileno, sf);
+ mounted_on_fileno, sf, xattrsupp, has_hiddensystem, has_namedattr);
free(sf, M_TEMP);
NFSEXITCODE2(0, nd);
return (error);
@@ -2448,7 +2449,7 @@ nfsrvd_readdirplus(struct nfsrv_descript *nd, int isdgram,
struct nfsvattr nva, at, *nvap = &nva;
struct mbuf *mb0, *mb1;
struct nfsreferral *refp;
- int nlen, r, error = 0, getret = 1, usevget = 1;
+ int nlen, r, error = 0, getret = 1, ret, usevget = 1;
int siz, cnt, fullsiz, eofflag, ncookies, entrycnt;
caddr_t bpos0, bpos1;
u_int64_t off, toff, verf __unused;
@@ -2462,6 +2463,9 @@ nfsrvd_readdirplus(struct nfsrv_descript *nd, int isdgram,
uint64_t mounted_on_fileno;
struct thread *p = curthread;
int bextpg0, bextpg1, bextpgsiz0, bextpgsiz1;
+ size_t atsiz;
+ long pathval;
+ bool has_hiddensystem, has_namedattr, xattrsupp;
if (nd->nd_repstat) {
nfsrv_postopattr(nd, getret, &at);
@@ -2936,9 +2940,32 @@ again:
*tl++ = newnfs_true;
txdr_hyper(*cookiep, tl);
dirlen += nfsm_strtom(nd, dp->d_name, nlen);
+ xattrsupp = false;
+ has_hiddensystem = false;
+ has_namedattr = false;
if (nvp != NULL) {
supports_nfsv4acls =
nfs_supportsnfsv4acls(nvp);
+ if (NFSISSET_ATTRBIT(&attrbits,
+ NFSATTRBIT_XATTRSUPPORT)) {
+ ret = VOP_GETEXTATTR(nvp,
+ EXTATTR_NAMESPACE_USER,
+ "xxx", NULL, &atsiz,
+ nd->nd_cred, p);
+ xattrsupp = ret != EOPNOTSUPP;
+ }
+ if (VOP_PATHCONF(nvp,
+ _PC_HAS_HIDDENSYSTEM, &pathval) !=
+ 0)
+ pathval = 0;
+ has_hiddensystem = pathval > 0;
+ pathval = 0;
+ if (NFSISSET_ATTRBIT(&attrbits,
+ NFSATTRBIT_NAMEDATTR) &&
+ VOP_PATHCONF(nvp, _PC_HAS_NAMEDATTR,
+ &pathval) != 0)
+ pathval = 0;
+ has_namedattr = pathval > 0;
NFSVOPUNLOCK(nvp);
} else
supports_nfsv4acls = 0;
@@ -2958,13 +2985,15 @@ again:
nvp, nvap, &nfh, r, &rderrbits,
nd->nd_cred, p, isdgram, 0,
supports_nfsv4acls, at_root,
- mounted_on_fileno);
+ mounted_on_fileno, xattrsupp,
+ has_hiddensystem, has_namedattr);
} else {
dirlen += nfsvno_fillattr(nd, new_mp,
nvp, nvap, &nfh, r, &attrbits,
nd->nd_cred, p, isdgram, 0,
supports_nfsv4acls, at_root,
- mounted_on_fileno);
+ mounted_on_fileno, xattrsupp,
+ has_hiddensystem, has_namedattr);
}
if (nvp != NULL)
vrele(nvp);
@@ -6356,7 +6385,7 @@ nfsrv_setacldsdorpc(fhandle_t *fhp, struct ucred *cred, NFSPROC_T *p,
* the same type (VREG).
*/
nfsv4_fillattr(nd, NULL, vp, aclp, NULL, NULL, 0, &attrbits, NULL,
- NULL, 0, 0, 0, 0, 0, NULL);
+ NULL, 0, 0, 0, 0, 0, NULL, false, false, false);
error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p, cred,
NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL);
if (error != 0) {
diff --git a/sys/fs/nfsserver/nfs_nfsdserv.c b/sys/fs/nfsserver/nfs_nfsdserv.c
index f7564ade401b..9eebcda548c6 100644
--- a/sys/fs/nfsserver/nfs_nfsdserv.c
+++ b/sys/fs/nfsserver/nfs_nfsdserv.c
@@ -241,7 +241,7 @@ nfsrvd_getattr(struct nfsrv_descript *nd, int isdgram,
{
struct nfsvattr nva;
fhandle_t fh;
- int at_root = 0, error = 0, supports_nfsv4acls;
+ int at_root = 0, error = 0, ret, supports_nfsv4acls;
struct nfsreferral *refp;
nfsattrbit_t attrbits, tmpbits;
struct mount *mp;
@@ -250,6 +250,9 @@ nfsrvd_getattr(struct nfsrv_descript *nd, int isdgram,
uint64_t mounted_on_fileno = 0;
accmode_t accmode;
struct thread *p = curthread;
+ size_t atsiz;
+ long pathval;
+ bool has_hiddensystem, has_namedattr, xattrsupp;
if (nd->nd_repstat)
goto out;
@@ -307,6 +310,26 @@ nfsrvd_getattr(struct nfsrv_descript *nd, int isdgram,
&nva, &attrbits, p);
if (nd->nd_repstat == 0) {
supports_nfsv4acls = nfs_supportsnfsv4acls(vp);
+ xattrsupp = false;
+ if (NFSISSET_ATTRBIT(&attrbits,
+ NFSATTRBIT_XATTRSUPPORT)) {
+ ret = VOP_GETEXTATTR(vp,
+ EXTATTR_NAMESPACE_USER,
+ "xxx", NULL, &atsiz, nd->nd_cred,
+ p);
+ xattrsupp = ret != EOPNOTSUPP;
+ }
+ if (VOP_PATHCONF(vp, _PC_HAS_HIDDENSYSTEM,
+ &pathval) != 0)
+ pathval = 0;
+ has_hiddensystem = pathval > 0;
+ pathval = 0;
+ if (NFSISSET_ATTRBIT(&attrbits,
+ NFSATTRBIT_NAMEDATTR) &&
+ VOP_PATHCONF(vp, _PC_HAS_NAMEDATTR,
+ &pathval) != 0)
+ pathval = 0;
+ has_namedattr = pathval > 0;
mp = vp->v_mount;
if (nfsrv_enable_crossmntpt != 0 &&
vp->v_type == VDIR &&
@@ -340,7 +363,9 @@ nfsrvd_getattr(struct nfsrv_descript *nd, int isdgram,
(void)nfsvno_fillattr(nd, mp, vp, &nva,
&fh, 0, &attrbits, nd->nd_cred, p,
isdgram, 1, supports_nfsv4acls,
- at_root, mounted_on_fileno);
+ at_root, mounted_on_fileno,
+ xattrsupp, has_hiddensystem,
+ has_namedattr);
vfs_unbusy(mp);
}
vrele(vp);
@@ -4353,9 +4378,10 @@ nfsrvd_openattr(struct nfsrv_descript *nd, __unused int isdgram,
int error = 0;
NFSNAMEICNDSET(&cn, nd->nd_cred, LOOKUP, OPENNAMED | ISLASTCN |
- NOFOLLOW);
+ NOFOLLOW | LOCKLEAF);
cn.cn_nameptr = ".";
cn.cn_namelen = 1;
+ cn.cn_lkflags = LK_SHARED;
NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED);
if (*tl == newnfs_true)
cn.cn_flags |= CREATENAMED;
@@ -4374,6 +4400,8 @@ nfsrvd_openattr(struct nfsrv_descript *nd, __unused int isdgram,
if (nd->nd_repstat == ENOATTR)
nd->nd_repstat = NFSERR_NOENT;
}
+ if (nd->nd_repstat == 0)
+ NFSVOPUNLOCK(*vpp);
vput(dp);
NFSEXITCODE2(0, nd);
diff --git a/sys/fs/p9fs/p9fs_vnops.c b/sys/fs/p9fs/p9fs_vnops.c
index 56bf766ef801..227e2b93883e 100644
--- a/sys/fs/p9fs/p9fs_vnops.c
+++ b/sys/fs/p9fs/p9fs_vnops.c
@@ -1784,6 +1784,9 @@ p9fs_readdir(struct vop_readdir_args *ap)
return (EBADF);
}
+ if (ap->a_eofflag != NULL)
+ *ap->a_eofflag = 0;
+
io_buffer = uma_zalloc(p9fs_io_buffer_zone, M_WAITOK);
/* We haven't reached the end yet. read more. */
@@ -1801,8 +1804,11 @@ p9fs_readdir(struct vop_readdir_args *ap)
count = p9_client_readdir(vofid, (char *)io_buffer,
diroffset, count);
- if (count == 0)
+ if (count == 0) {
+ if (ap->a_eofflag != NULL)
+ *ap->a_eofflag = 1;
break;
+ }
if (count < 0) {
error = EIO;
diff --git a/sys/fs/pseudofs/pseudofs_vnops.c b/sys/fs/pseudofs/pseudofs_vnops.c
index 0bdfedffafcb..8cd092118d0e 100644
--- a/sys/fs/pseudofs/pseudofs_vnops.c
+++ b/sys/fs/pseudofs/pseudofs_vnops.c
@@ -850,7 +850,7 @@ pfs_readdir(struct vop_readdir_args *va)
struct uio *uio;
struct pfsentry *pfsent, *pfsent2;
struct pfsdirentlist lst;
- off_t offset;
+ off_t coffset, offset;
int error, i, resid;
STAILQ_INIT(&lst);
@@ -860,6 +860,9 @@ pfs_readdir(struct vop_readdir_args *va)
PFS_TRACE(("%s pid %lu", pd->pn_name, (unsigned long)pid));
pfs_assert_not_owned(pd);
+ if (va->a_eofflag != NULL)
+ *va->a_eofflag = 0;
+
if (vn->v_type != VDIR)
PFS_RETURN (ENOTDIR);
KASSERT_PN_IS_DIR(pd);
@@ -878,6 +881,10 @@ pfs_readdir(struct vop_readdir_args *va)
if (pid != NO_PID && !pfs_lookup_proc(pid, &proc))
PFS_RETURN (ENOENT);
+ /*
+ * The allproc lock is required in pfs_iterate() for procdir
+ * directories.
+ */
sx_slock(&allproc_lock);
pfs_lock(pd);
@@ -897,23 +904,15 @@ pfs_readdir(struct vop_readdir_args *va)
}
}
- /* skip unwanted entries */
- for (pn = NULL, p = NULL; offset > 0; offset -= PFS_DELEN) {
+ for (pn = NULL, p = NULL, coffset = 0; resid >= PFS_DELEN;
+ coffset += PFS_DELEN) {
if (pfs_iterate(curthread, proc, pd, &pn, &p) == -1) {
- /* nothing left... */
- if (proc != NULL) {
- _PRELE(proc);
- PROC_UNLOCK(proc);
- }
- pfs_unlock(pd);
- sx_sunlock(&allproc_lock);
- PFS_RETURN (0);
+ if (va->a_eofflag != NULL)
+ *va->a_eofflag = 1;
+ break;
}
- }
-
- /* fill in entries */
- while (pfs_iterate(curthread, proc, pd, &pn, &p) != -1 &&
- resid >= PFS_DELEN) {
+ if (coffset < offset)
+ continue;
if ((pfsent = malloc(sizeof(struct pfsentry), M_IOV,
M_NOWAIT | M_ZERO)) == NULL) {
error = ENOMEM;
diff --git a/sys/fs/smbfs/smbfs_io.c b/sys/fs/smbfs/smbfs_io.c
index 35454998fc8e..8c484381ed59 100644
--- a/sys/fs/smbfs/smbfs_io.c
+++ b/sys/fs/smbfs/smbfs_io.c
@@ -71,7 +71,7 @@ SYSCTL_INT(_vfs_smbfs, OID_AUTO, fastlookup, CTLFLAG_RW, &smbfs_fastlookup, 0, "
#define DE_SIZE (sizeof(struct dirent))
static int
-smbfs_readvdir(struct vnode *vp, struct uio *uio, struct ucred *cred)
+smbfs_readvdir(struct vnode *vp, struct uio *uio, struct ucred *cred, int *eofp)
{
struct dirent de;
struct componentname cn;
@@ -86,6 +86,8 @@ smbfs_readvdir(struct vnode *vp, struct uio *uio, struct ucred *cred)
SMBVDEBUG("dirname='%s'\n", np->n_name);
scred = smbfs_malloc_scred();
smb_makescred(scred, uio->uio_td, cred);
+ if (eofp != NULL)
+ *eofp = 0;
offset = uio->uio_offset / DE_SIZE; /* offset in the directory */
limit = uio->uio_resid / DE_SIZE;
if (uio->uio_resid < DE_SIZE || uio->uio_offset < 0) {
@@ -138,8 +140,7 @@ smbfs_readvdir(struct vnode *vp, struct uio *uio, struct ucred *cred)
if (error) {
smbfs_findclose(np->n_dirseq, scred);
np->n_dirseq = NULL;
- error = ENOENT ? 0 : error;
- goto out;
+ goto out1;
}
}
error = 0;
@@ -170,16 +171,21 @@ smbfs_readvdir(struct vnode *vp, struct uio *uio, struct ucred *cred)
if (error)
break;
}
- if (error == ENOENT)
- error = 0;
uio->uio_offset = offset * DE_SIZE;
+out1:
+ if (error == ENOENT) {
+ if (eofp != NULL)
+ *eofp = 1;
+ error = 0;
+ }
out:
smbfs_free_scred(scred);
return error;
}
int
-smbfs_readvnode(struct vnode *vp, struct uio *uiop, struct ucred *cred)
+smbfs_readvnode(struct vnode *vp, struct uio *uiop, struct ucred *cred,
+ int *eofp)
{
struct smbmount *smp = VFSTOSMBFS(vp->v_mount);
struct smbnode *np = VTOSMB(vp);
@@ -209,7 +215,7 @@ smbfs_readvnode(struct vnode *vp, struct uio *uiop, struct ucred *cred)
lks = LK_EXCLUSIVE; /* lockstatus(vp->v_vnlock); */
if (lks == LK_SHARED)
vn_lock(vp, LK_UPGRADE | LK_RETRY);
- error = smbfs_readvdir(vp, uiop, cred);
+ error = smbfs_readvdir(vp, uiop, cred, eofp);
if (lks == LK_SHARED)
vn_lock(vp, LK_DOWNGRADE | LK_RETRY);
return error;
diff --git a/sys/fs/smbfs/smbfs_node.h b/sys/fs/smbfs/smbfs_node.h
index f28f0007100a..8c8ce038b913 100644
--- a/sys/fs/smbfs/smbfs_node.h
+++ b/sys/fs/smbfs/smbfs_node.h
@@ -93,7 +93,7 @@ u_int32_t smbfs_hash(const u_char *name, int nmlen);
int smbfs_getpages(struct vop_getpages_args *);
int smbfs_putpages(struct vop_putpages_args *);
-int smbfs_readvnode(struct vnode *vp, struct uio *uiop, struct ucred *cred);
+int smbfs_readvnode(struct vnode *vp, struct uio *uiop, struct ucred *cred, int *eofp);
int smbfs_writevnode(struct vnode *vp, struct uio *uiop, struct ucred *cred, int ioflag);
void smbfs_attr_cacheenter(struct vnode *vp, struct smbfattr *fap);
int smbfs_attr_cachelookup(struct vnode *vp ,struct vattr *va);
diff --git a/sys/fs/smbfs/smbfs_vnops.c b/sys/fs/smbfs/smbfs_vnops.c
index 5d412cabadb8..63b249c93771 100644
--- a/sys/fs/smbfs/smbfs_vnops.c
+++ b/sys/fs/smbfs/smbfs_vnops.c
@@ -466,7 +466,7 @@ smbfs_read(struct vop_read_args *ap)
SMBVDEBUG("\n");
if (vp->v_type != VREG && vp->v_type != VDIR)
return EPERM;
- return smbfs_readvnode(vp, uio, ap->a_cred);
+ return smbfs_readvnode(vp, uio, ap->a_cred, NULL);
}
static int
@@ -748,7 +748,6 @@ smbfs_readdir(struct vop_readdir_args *ap)
{
struct vnode *vp = ap->a_vp;
struct uio *uio = ap->a_uio;
- int error;
if (vp->v_type != VDIR)
return (EPERM);
@@ -758,8 +757,7 @@ smbfs_readdir(struct vop_readdir_args *ap)
return (EOPNOTSUPP);
}
#endif
- error = smbfs_readvnode(vp, uio, ap->a_cred);
- return error;
+ return (smbfs_readvnode(vp, uio, ap->a_cred, ap->a_eofflag));
}
/* ARGSUSED */
diff --git a/sys/fs/udf/ecma167-udf.h b/sys/fs/udf/ecma167-udf.h
index 839bbec08254..19e114763cac 100644
--- a/sys/fs/udf/ecma167-udf.h
+++ b/sys/fs/udf/ecma167-udf.h
@@ -243,7 +243,7 @@ struct part_map_spare {
uint8_t n_st; /* Number of Sparing Tables */
uint8_t reserved1;
uint32_t st_size;
- uint32_t st_loc[1];
+ uint32_t st_loc[];
} __packed;
union udf_pmap {
@@ -266,7 +266,7 @@ struct udf_sparing_table {
uint16_t rt_l; /* Relocation Table len */
uint8_t reserved[2];
uint32_t seq_num;
- struct spare_map_entry entries[1];
+ struct spare_map_entry entries[];
} __packed;
/* Partition Descriptor [3/10.5] */
diff --git a/sys/fs/udf/udf_vfsops.c b/sys/fs/udf/udf_vfsops.c
index c7438147c0a0..c5ef1f686093 100644
--- a/sys/fs/udf/udf_vfsops.c
+++ b/sys/fs/udf/udf_vfsops.c
@@ -81,6 +81,7 @@
#include <sys/fcntl.h>
#include <sys/iconv.h>
#include <sys/kernel.h>
+#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/namei.h>
@@ -729,7 +730,7 @@ udf_fhtovp(struct mount *mp, struct fid *fhp, int flags, struct vnode **vpp)
struct ifid *ifhp;
struct vnode *nvp;
struct udf_node *np;
- off_t fsize;
+ uint64_t fsize;
int error;
ifhp = (struct ifid *)fhp;
@@ -741,6 +742,10 @@ udf_fhtovp(struct mount *mp, struct fid *fhp, int flags, struct vnode **vpp)
np = VTON(nvp);
fsize = le64toh(np->fentry->inf_len);
+ if (fsize > OFF_MAX) {
+ *vpp = NULLVP;
+ return (EIO);
+ }
*vpp = nvp;
vnode_create_vobject(*vpp, fsize, curthread);
diff --git a/sys/fs/udf/udf_vnops.c b/sys/fs/udf/udf_vnops.c
index 88bf4917a851..37889241e8c3 100644
--- a/sys/fs/udf/udf_vnops.c
+++ b/sys/fs/udf/udf_vnops.c
@@ -39,6 +39,7 @@
#include <sys/conf.h>
#include <sys/buf.h>
#include <sys/iconv.h>
+#include <sys/limits.h>
#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/dirent.h>
@@ -182,11 +183,14 @@ udf_access(struct vop_access_args *a)
}
static int
-udf_open(struct vop_open_args *ap) {
+udf_open(struct vop_open_args *ap)
+{
struct udf_node *np = VTON(ap->a_vp);
- off_t fsize;
+ uint64_t fsize;
fsize = le64toh(np->fentry->inf_len);
+ if (fsize > OFF_MAX)
+ return (EIO);
vnode_create_vobject(ap->a_vp, fsize, ap->a_td);
return 0;
}
@@ -314,12 +318,13 @@ udf_getattr(struct vop_getattr_args *a)
* that directories consume at least one logical block,
* make it appear so.
*/
- if (fentry->logblks_rec != 0) {
- vap->va_size =
- le64toh(fentry->logblks_rec) * node->udfmp->bsize;
- } else {
+ vap->va_size = le64toh(fentry->logblks_rec);
+ if (vap->va_size == 0)
vap->va_size = node->udfmp->bsize;
- }
+ else if (vap->va_size > UINT64_MAX / node->udfmp->bsize)
+ vap->va_size = UINT64_MAX;
+ else
+ vap->va_size *= node->udfmp->bsize;
} else {
vap->va_size = le64toh(fentry->inf_len);
}
@@ -446,6 +451,7 @@ udf_read(struct vop_read_args *ap)
struct buf *bp;
uint8_t *data;
daddr_t lbn, rablock;
+ uint64_t len;
off_t diff, fsize;
ssize_t n;
int error = 0;
@@ -471,7 +477,12 @@ udf_read(struct vop_read_args *ap)
return (error);
}
- fsize = le64toh(node->fentry->inf_len);
+ len = le64toh(node->fentry->inf_len);
+ if (len > OFF_MAX) {
+ /* too big, just cap to the requested length */
+ len = uio->uio_resid;
+ }
+ fsize = len;
udfmp = node->udfmp;
do {
lbn = lblkno(udfmp, uio->uio_offset);
@@ -783,6 +794,7 @@ udf_readdir(struct vop_readdir_args *a)
struct udf_uiodir uiodir;
struct udf_dirstream *ds;
uint64_t *cookies = NULL;
+ uint64_t len;
int ncookies;
int error = 0;
@@ -811,8 +823,12 @@ udf_readdir(struct vop_readdir_args *a)
* Iterate through the file id descriptors. Give the parent dir
* entry special attention.
*/
- ds = udf_opendir(node, uio->uio_offset, le64toh(node->fentry->inf_len),
- node->udfmp);
+ len = le64toh(node->fentry->inf_len);
+ if (len > INT_MAX) {
+ /* too big, just cap to INT_MAX */
+ len = INT_MAX;
+ }
+ ds = udf_opendir(node, uio->uio_offset, len, node->udfmp);
while ((fid = udf_getfid(ds)) != NULL) {
/* XXX Should we return an error on a bad fid? */
@@ -904,7 +920,8 @@ udf_readlink(struct vop_readlink_args *ap)
struct udf_node *node;
void *buf;
char *cp;
- int error, len, root;
+ uint64_t len;
+ int error, root;
/*
* A symbolic link in UDF is a list of variable-length path
@@ -914,6 +931,8 @@ udf_readlink(struct vop_readlink_args *ap)
vp = ap->a_vp;
node = VTON(vp);
len = le64toh(node->fentry->inf_len);
+ if (len > MAXPATHLEN)
+ return (EIO);
buf = malloc(len, M_DEVBUF, M_WAITOK);
iov[0].iov_len = len;
iov[0].iov_base = buf;
@@ -1116,13 +1135,14 @@ udf_lookup(struct vop_cachedlookup_args *a)
struct udf_mnt *udfmp;
struct fileid_desc *fid = NULL;
struct udf_dirstream *ds;
+ uint64_t fsize;
u_long nameiop;
u_long flags;
char *nameptr;
long namelen;
ino_t id = 0;
int offset, error = 0;
- int fsize, lkflags, ltype, numdirpasses;
+ int lkflags, ltype, numdirpasses;
dvp = a->a_dvp;
node = VTON(dvp);
@@ -1133,6 +1153,10 @@ udf_lookup(struct vop_cachedlookup_args *a)
nameptr = a->a_cnp->cn_nameptr;
namelen = a->a_cnp->cn_namelen;
fsize = le64toh(node->fentry->inf_len);
+ if (fsize > INT_MAX) {
+ /* too big, just cap to INT_MAX */
+ fsize = INT_MAX;
+ }
/*
* If this is a LOOKUP and we've already partially searched through
diff --git a/sys/geom/concat/g_concat.c b/sys/geom/concat/g_concat.c
index 2b1cb575cac8..2173a84c7acf 100644
--- a/sys/geom/concat/g_concat.c
+++ b/sys/geom/concat/g_concat.c
@@ -590,6 +590,7 @@ g_concat_add_disk(struct g_concat_softc *sc, struct g_provider *pp, u_int no)
strcmp(md.md_name, sc->sc_name) != 0 ||
md.md_id != sc->sc_id) {
G_CONCAT_DEBUG(0, "Metadata on %s changed.", pp->name);
+ error = EINVAL;
goto fail;
}
diff --git a/sys/geom/geom.h b/sys/geom/geom.h
index dcd6f793f9f7..908ce86f03a6 100644
--- a/sys/geom/geom.h
+++ b/sys/geom/geom.h
@@ -282,7 +282,7 @@ void g_detach(struct g_consumer *cp);
void g_error_provider(struct g_provider *pp, int error);
struct g_provider *g_provider_by_name(char const *arg);
int g_getattr__(const char *attr, struct g_consumer *cp, void *var, int len);
-#define g_getattr(a, c, v) g_getattr__((a), (c), (v), sizeof *(v))
+#define g_getattr(a, c, v) g_getattr__((a), (c), (v), sizeof(*(v)))
int g_handleattr(struct bio *bp, const char *attribute, const void *val,
int len);
int g_handleattr_int(struct bio *bp, const char *attribute, int val);
diff --git a/sys/geom/geom_ccd.c b/sys/geom/geom_ccd.c
index 5700399ee5d1..2140d005160e 100644
--- a/sys/geom/geom_ccd.c
+++ b/sys/geom/geom_ccd.c
@@ -730,17 +730,17 @@ g_ccd_create(struct gctl_req *req, struct g_class *mp)
int i, error;
g_topology_assert();
- unit = gctl_get_paraml(req, "unit", sizeof (*unit));
+ unit = gctl_get_paraml(req, "unit", sizeof(*unit));
if (unit == NULL) {
gctl_error(req, "unit parameter not given");
return;
}
- ileave = gctl_get_paraml(req, "ileave", sizeof (*ileave));
+ ileave = gctl_get_paraml(req, "ileave", sizeof(*ileave));
if (ileave == NULL) {
gctl_error(req, "ileave parameter not given");
return;
}
- nprovider = gctl_get_paraml(req, "nprovider", sizeof (*nprovider));
+ nprovider = gctl_get_paraml(req, "nprovider", sizeof(*nprovider));
if (nprovider == NULL) {
gctl_error(req, "nprovider parameter not given");
return;
@@ -769,7 +769,7 @@ g_ccd_create(struct gctl_req *req, struct g_class *mp)
}
gp = g_new_geomf(mp, "ccd%d", *unit);
- sc = g_malloc(sizeof *sc, M_WAITOK | M_ZERO);
+ sc = g_malloc(sizeof(*sc), M_WAITOK | M_ZERO);
gp->softc = sc;
sc->sc_ndisks = *nprovider;
@@ -872,7 +872,7 @@ g_ccd_list(struct gctl_req *req, struct g_class *mp)
struct g_geom *gp;
int i, unit, *up;
- up = gctl_get_paraml(req, "unit", sizeof (*up));
+ up = gctl_get_paraml(req, "unit", sizeof(*up));
if (up == NULL) {
gctl_error(req, "unit parameter not given");
return;
diff --git a/sys/geom/geom_event.c b/sys/geom/geom_event.c
index 0a76fd6c6f57..341233a6ef47 100644
--- a/sys/geom/geom_event.c
+++ b/sys/geom/geom_event.c
@@ -145,7 +145,7 @@ g_attr_changed(struct g_provider *pp, const char *attr, int flag)
struct g_attrchanged_args *args;
int error;
- args = g_malloc(sizeof *args, flag);
+ args = g_malloc(sizeof(*args), flag);
if (args == NULL)
return (ENOMEM);
args->pp = pp;
diff --git a/sys/geom/geom_io.c b/sys/geom/geom_io.c
index 8d6b9a926e1d..247a623bf1bf 100644
--- a/sys/geom/geom_io.c
+++ b/sys/geom/geom_io.c
@@ -278,7 +278,7 @@ g_io_init(void)
g_bioq_init(&g_bio_run_down);
g_bioq_init(&g_bio_run_up);
- biozone = uma_zcreate("g_bio", sizeof (struct bio),
+ biozone = uma_zcreate("g_bio", sizeof(struct bio),
NULL, NULL,
NULL, NULL,
0, 0);
diff --git a/sys/geom/geom_slice.c b/sys/geom/geom_slice.c
index 8cfffc478849..0491b0069be4 100644
--- a/sys/geom/geom_slice.c
+++ b/sys/geom/geom_slice.c
@@ -57,7 +57,7 @@ g_slice_alloc(unsigned nslice, unsigned scsize)
{
struct g_slicer *gsp;
- gsp = g_malloc(sizeof *gsp, M_WAITOK | M_ZERO);
+ gsp = g_malloc(sizeof(*gsp), M_WAITOK | M_ZERO);
if (scsize > 0)
gsp->softc = g_malloc(scsize, M_WAITOK | M_ZERO);
else
@@ -463,9 +463,9 @@ g_slice_conf_hot(struct g_geom *gp, u_int idx, off_t offset, off_t length, int r
}
gsl = gsp->hotspot;
if(idx >= gsp->nhotspot) {
- gsl2 = g_malloc((idx + 1) * sizeof *gsl2, M_WAITOK | M_ZERO);
+ gsl2 = g_malloc((idx + 1) * sizeof(*gsl2), M_WAITOK | M_ZERO);
if (gsp->hotspot != NULL)
- bcopy(gsp->hotspot, gsl2, gsp->nhotspot * sizeof *gsl2);
+ bcopy(gsp->hotspot, gsl2, gsp->nhotspot * sizeof(*gsl2));
gsp->hotspot = gsl2;
if (gsp->hotspot != NULL)
g_free(gsl);
diff --git a/sys/geom/geom_subr.c b/sys/geom/geom_subr.c
index 41cc115225f9..1429c84942ed 100644
--- a/sys/geom/geom_subr.c
+++ b/sys/geom/geom_subr.c
@@ -267,7 +267,7 @@ g_modevent(module_t mod, int type, void *data)
switch (type) {
case MOD_LOAD:
g_trace(G_T_TOPOLOGY, "g_modevent(%s, LOAD)", mp->name);
- hh = g_malloc(sizeof *hh, M_WAITOK | M_ZERO);
+ hh = g_malloc(sizeof(*hh), M_WAITOK | M_ZERO);
hh->mp = mp;
/*
* Once the system is not cold, MOD_LOAD calls will be
@@ -351,7 +351,7 @@ g_retaste(struct g_class *mp)
if (mp->taste == NULL)
return (EINVAL);
- hh = g_malloc(sizeof *hh, M_WAITOK | M_ZERO);
+ hh = g_malloc(sizeof(*hh), M_WAITOK | M_ZERO);
hh->mp = mp;
if (cold) {
@@ -381,8 +381,8 @@ g_new_geomf(struct g_class *mp, const char *fmt, ...)
sbuf_vprintf(sb, fmt, ap);
va_end(ap);
sbuf_finish(sb);
- gp = g_malloc(sizeof *gp, M_WAITOK | M_ZERO);
- gp->name = g_malloc(sbuf_len(sb) + 1, M_WAITOK | M_ZERO);
+ gp = g_malloc(sizeof(*gp) + sbuf_len(sb) + 1, M_WAITOK | M_ZERO);
+ gp->name = (char *)(gp + 1);
gp->class = mp;
gp->rank = 1;
LIST_INIT(&gp->consumer);
@@ -420,7 +420,6 @@ g_destroy_geom(struct g_geom *gp)
g_cancel_event(gp);
LIST_REMOVE(gp, geom);
TAILQ_REMOVE(&geoms, gp, geoms);
- g_free(gp->name);
g_free(gp);
}
@@ -528,7 +527,7 @@ g_new_consumer(struct g_geom *gp)
("g_new_consumer on geom(%s) (class %s) without orphan",
gp->name, gp->class->name));
- cp = g_malloc(sizeof *cp, M_WAITOK | M_ZERO);
+ cp = g_malloc(sizeof(*cp), M_WAITOK | M_ZERO);
cp->geom = gp;
cp->stat = devstat_new_entry(cp, -1, 0, DEVSTAT_ALL_SUPPORTED,
DEVSTAT_TYPE_DIRECT, DEVSTAT_PRIORITY_MAX);
@@ -617,7 +616,7 @@ g_new_providerf(struct g_geom *gp, const char *fmt, ...)
sbuf_vprintf(sb, fmt, ap);
va_end(ap);
sbuf_finish(sb);
- pp = g_malloc(sizeof *pp + sbuf_len(sb) + 1, M_WAITOK | M_ZERO);
+ pp = g_malloc(sizeof(*pp) + sbuf_len(sb) + 1, M_WAITOK | M_ZERO);
pp->name = (char *)(pp + 1);
strcpy(pp->name, sbuf_data(sb));
sbuf_delete(sb);
@@ -749,7 +748,7 @@ g_resize_provider(struct g_provider *pp, off_t size)
if (size == pp->mediasize)
return;
- hh = g_malloc(sizeof *hh, M_WAITOK | M_ZERO);
+ hh = g_malloc(sizeof(*hh), M_WAITOK | M_ZERO);
hh->pp = pp;
hh->size = size;
g_post_event(g_resize_provider_event, hh, M_WAITOK, NULL);
@@ -1083,21 +1082,21 @@ int
g_handleattr_int(struct bio *bp, const char *attribute, int val)
{
- return (g_handleattr(bp, attribute, &val, sizeof val));
+ return (g_handleattr(bp, attribute, &val, sizeof(val)));
}
int
g_handleattr_uint16_t(struct bio *bp, const char *attribute, uint16_t val)
{
- return (g_handleattr(bp, attribute, &val, sizeof val));
+ return (g_handleattr(bp, attribute, &val, sizeof(val)));
}
int
g_handleattr_off_t(struct bio *bp, const char *attribute, off_t val)
{
- return (g_handleattr(bp, attribute, &val, sizeof val));
+ return (g_handleattr(bp, attribute, &val, sizeof(val)));
}
int
diff --git a/sys/geom/multipath/g_multipath.c b/sys/geom/multipath/g_multipath.c
index 23088c895541..a4935df7eaa1 100644
--- a/sys/geom/multipath/g_multipath.c
+++ b/sys/geom/multipath/g_multipath.c
@@ -321,7 +321,7 @@ g_multipath_resize(struct g_consumer *cp)
if (sc->sc_uuid[0] != 0) {
pp = cp->provider;
strlcpy(md.md_magic, G_MULTIPATH_MAGIC, sizeof(md.md_magic));
- memcpy(md.md_uuid, sc->sc_uuid, sizeof (sc->sc_uuid));
+ memcpy(md.md_uuid, sc->sc_uuid, sizeof(sc->sc_uuid));
strlcpy(md.md_name, sc->sc_name, sizeof(md.md_name));
md.md_version = G_MULTIPATH_VERSION;
md.md_size = size;
@@ -552,8 +552,8 @@ g_multipath_create(struct g_class *mp, struct g_multipath_metadata *md)
gp = g_new_geomf(mp, "%s", md->md_name);
sc = g_malloc(sizeof(*sc), M_WAITOK | M_ZERO);
mtx_init(&sc->sc_mtx, "multipath", NULL, MTX_DEF);
- memcpy(sc->sc_uuid, md->md_uuid, sizeof (sc->sc_uuid));
- memcpy(sc->sc_name, md->md_name, sizeof (sc->sc_name));
+ memcpy(sc->sc_uuid, md->md_uuid, sizeof(sc->sc_uuid));
+ memcpy(sc->sc_name, md->md_name, sizeof(sc->sc_name));
sc->sc_active_active = md->md_active_active;
sc->sc_size = md->md_size;
gp->softc = sc;
@@ -906,7 +906,7 @@ g_multipath_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
char buf[16];
u_long rand = random();
- snprintf(buf, sizeof (buf), "%s-%lu", md.md_name, rand);
+ snprintf(buf, sizeof(buf), "%s-%lu", md.md_name, rand);
printf("GEOM_MULTIPATH: geom %s/%s exists already\n",
sc->sc_name, sc->sc_uuid);
printf("GEOM_MULTIPATH: %s will be (temporarily) %s\n",
@@ -1200,7 +1200,7 @@ g_multipath_ctl_configure(struct gctl_req *req, struct g_class *mp)
cp = sc->sc_active;
pp = cp->provider;
strlcpy(md.md_magic, G_MULTIPATH_MAGIC, sizeof(md.md_magic));
- memcpy(md.md_uuid, sc->sc_uuid, sizeof (sc->sc_uuid));
+ memcpy(md.md_uuid, sc->sc_uuid, sizeof(sc->sc_uuid));
strlcpy(md.md_name, name, sizeof(md.md_name));
md.md_version = G_MULTIPATH_VERSION;
md.md_size = pp->mediasize;
diff --git a/sys/geom/virstor/g_virstor.c b/sys/geom/virstor/g_virstor.c
index b8cf32875660..c7d737493f11 100644
--- a/sys/geom/virstor/g_virstor.c
+++ b/sys/geom/virstor/g_virstor.c
@@ -202,7 +202,7 @@ virstor_ctl_stop(struct gctl_req *req, struct g_class *cp)
int *force, *nargs;
int i;
- nargs = gctl_get_paraml(req, "nargs", sizeof *nargs);
+ nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
if (nargs == NULL) {
gctl_error(req, "Error fetching argument '%s'", "nargs");
return;
@@ -211,7 +211,7 @@ virstor_ctl_stop(struct gctl_req *req, struct g_class *cp)
gctl_error(req, "Invalid number of arguments");
return;
}
- force = gctl_get_paraml(req, "force", sizeof *force);
+ force = gctl_get_paraml(req, "force", sizeof(*force));
if (force == NULL) {
gctl_error(req, "Error fetching argument '%s'", "force");
return;
@@ -315,7 +315,7 @@ virstor_ctl_add(struct gctl_req *req, struct g_class *cp)
u_int nc;
u_int j;
- snprintf(aname, sizeof aname, "arg%d", i);
+ snprintf(aname, sizeof(aname), "arg%d", i);
pp = gctl_get_provider(req, aname);
if (pp == NULL) {
/* This is the most common error so be verbose about it */
@@ -487,12 +487,12 @@ fill_metadata(struct g_virstor_softc *sc, struct g_virstor_metadata *md,
{
struct g_virstor_component *c;
- bzero(md, sizeof *md);
+ bzero(md, sizeof(*md));
c = &sc->components[nc];
- strncpy(md->md_magic, G_VIRSTOR_MAGIC, sizeof md->md_magic);
+ strncpy(md->md_magic, G_VIRSTOR_MAGIC, sizeof(md->md_magic));
md->md_version = G_VIRSTOR_VERSION;
- strncpy(md->md_name, sc->geom->name, sizeof md->md_name);
+ strncpy(md->md_name, sc->geom->name, sizeof(md->md_name));
md->md_id = sc->id;
md->md_virsize = sc->virsize;
md->md_chunk_size = sc->chunk_size;
@@ -500,7 +500,7 @@ fill_metadata(struct g_virstor_softc *sc, struct g_virstor_metadata *md,
if (hardcode) {
strncpy(md->provider, c->gcons->provider->name,
- sizeof md->provider);
+ sizeof(md->provider));
}
md->no = nc;
md->provsize = c->gcons->provider->mediasize;
@@ -589,7 +589,7 @@ virstor_ctl_remove(struct gctl_req *req, struct g_class *cp)
M_GVIRSTOR, M_WAITOK | M_ZERO);
bcopy(sc->components, newcomp, found * sizeof(*sc->components));
bcopy(&sc->components[found + 1], newcomp + found,
- found * sizeof(*sc->components));
+ (sc->n_components - (found + 1)) * sizeof(*sc->components));
if ((sc->components[j].flags & VIRSTOR_PROVIDER_ALLOCATED) != 0) {
LOG_MSG(LVL_ERROR, "Allocated provider %s cannot be "
"removed from %s",
@@ -959,7 +959,7 @@ virstor_geom_destroy(struct g_virstor_softc *sc, boolean_t force,
free(sc->map, M_GVIRSTOR);
free(sc->components, M_GVIRSTOR);
- bzero(sc, sizeof *sc);
+ bzero(sc, sizeof(*sc));
free(sc, M_GVIRSTOR);
pp = LIST_FIRST(&gp->provider); /* We only offer one provider */
@@ -1213,7 +1213,7 @@ virstor_check_and_run(struct g_virstor_softc *sc)
sc->provider->name,
sc->chunk_count * (off_t)sc->chunk_size);
}
- sc->map_size = sc->chunk_count * sizeof *(sc->map);
+ sc->map_size = sc->chunk_count * sizeof(*(sc->map));
/* The following allocation is in order of 4MB - 8MB */
sc->map = malloc(sc->map_size, M_GVIRSTOR, M_WAITOK);
KASSERT(sc->map != NULL, ("%s: Memory allocation error (%zu bytes) for %s",
@@ -1267,7 +1267,7 @@ virstor_check_and_run(struct g_virstor_softc *sc)
bcopy(mapbuf, &sc->map[n], bs);
off += bs;
count += bs;
- n += bs / sizeof *(sc->map);
+ n += bs / sizeof(*(sc->map));
g_free(mapbuf);
}
g_access(sc->components[0].gcons, -1, 0, 0);
@@ -1306,8 +1306,8 @@ virstor_check_and_run(struct g_virstor_softc *sc)
sc->components[index].chunk_next);
}
- sc->me_per_sector = sc->sectorsize / sizeof *(sc->map);
- if (sc->sectorsize % sizeof *(sc->map) != 0) {
+ sc->me_per_sector = sc->sectorsize / sizeof(*(sc->map));
+ if (sc->sectorsize % sizeof(*(sc->map)) != 0) {
LOG_MSG(LVL_ERROR,
"%s: Map entries don't fit exactly in a sector (%s)",
__func__, sc->geom->name);
@@ -1653,7 +1653,7 @@ g_virstor_start(struct bio *b)
* XXX: this will prevent the fs from
* being umounted! */
struct g_virstor_bio_q *biq;
- biq = malloc(sizeof *biq, M_GVIRSTOR,
+ biq = malloc(sizeof(*biq), M_GVIRSTOR,
M_NOWAIT);
if (biq == NULL) {
bioq_dismantle(&bq);
@@ -1703,7 +1703,7 @@ g_virstor_start(struct bio *b)
* map array.
* sc_offset will end up pointing to the drive
* sector. */
- s_offset = chunk_index * sizeof *me;
+ s_offset = chunk_index * sizeof(*me);
s_offset = rounddown(s_offset, sc->sectorsize);
/* data_me points to map entry sector
diff --git a/sys/i386/conf/GENERIC b/sys/i386/conf/GENERIC
index e7d460af21d4..f577cd07ac7c 100644
--- a/sys/i386/conf/GENERIC
+++ b/sys/i386/conf/GENERIC
@@ -17,6 +17,8 @@
# in NOTES.
#
+#NO_UNIVERSE
+
cpu I486_CPU
cpu I586_CPU
cpu I686_CPU
diff --git a/sys/i386/conf/GENERIC-NODEBUG b/sys/i386/conf/GENERIC-NODEBUG
index ea07613a796f..a93304481b5f 100644
--- a/sys/i386/conf/GENERIC-NODEBUG
+++ b/sys/i386/conf/GENERIC-NODEBUG
@@ -25,6 +25,8 @@
# in NOTES.
#
+#NO_UNIVERSE
+
include GENERIC
include "std.nodebug"
diff --git a/sys/i386/conf/LINT b/sys/i386/conf/LINT
index 41207eb63cb9..2e947202f723 100644
--- a/sys/i386/conf/LINT
+++ b/sys/i386/conf/LINT
@@ -1,3 +1,4 @@
+#NO_UNIVERSE
include "../../conf/NOTES"
include "../../x86/conf/NOTES"
diff --git a/sys/i386/conf/MINIMAL b/sys/i386/conf/MINIMAL
index 2a06eb84bff8..8019617ca4d4 100644
--- a/sys/i386/conf/MINIMAL
+++ b/sys/i386/conf/MINIMAL
@@ -31,6 +31,8 @@
# in NOTES.
#
+#NO_UNIVERSE
+
cpu I486_CPU
cpu I586_CPU
cpu I686_CPU
diff --git a/sys/i386/conf/PAE b/sys/i386/conf/PAE
index a39d32d77106..72af9e9a9eec 100644
--- a/sys/i386/conf/PAE
+++ b/sys/i386/conf/PAE
@@ -2,6 +2,8 @@
# PAE -- Generic kernel configuration file for FreeBSD/i386 PAE
#
+#NO_UNIVERSE
+
include GENERIC
ident PAE-GENERIC
diff --git a/sys/i386/i386/pmap.c b/sys/i386/i386/pmap.c
index 465b4d0f365b..b44f5e08bbcf 100644
--- a/sys/i386/i386/pmap.c
+++ b/sys/i386/i386/pmap.c
@@ -876,14 +876,16 @@ __CONCAT(PMTYPE, init_pat)(void)
#ifdef PMAP_PAE_COMP
static void *
-pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *flags,
- int wait)
+pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *sflagsp,
+ int flags)
{
/* Inform UMA that this allocator uses kernel_map/object. */
- *flags = UMA_SLAB_KERNEL;
+ *sflagsp = UMA_SLAB_KERNEL;
+ /* contig allocations cannot be NEVERFREED */
+ flags &= ~M_NEVERFREED;
return ((void *)kmem_alloc_contig_domainset(DOMAINSET_FIXED(domain),
- bytes, wait, 0x0ULL, 0xffffffffULL, 1, 0, VM_MEMATTR_DEFAULT));
+ bytes, flags, 0x0ULL, 0xffffffffULL, 1, 0, VM_MEMATTR_DEFAULT));
}
#endif
@@ -5617,6 +5619,8 @@ __CONCAT(PMTYPE, unmapdev)(void *p, vm_size_t size)
static void
__CONCAT(PMTYPE, page_set_memattr)(vm_page_t m, vm_memattr_t ma)
{
+ if (m->md.pat_mode == ma)
+ return;
m->md.pat_mode = ma;
if ((m->flags & PG_FICTITIOUS) != 0)
diff --git a/sys/kern/coredump_vnode.c b/sys/kern/coredump_vnode.c
new file mode 100644
index 000000000000..8b857e9aa4a2
--- /dev/null
+++ b/sys/kern/coredump_vnode.c
@@ -0,0 +1,562 @@
+/*
+ * SPDX-License-Identifier: BSD-3-Clause AND BSD-2-Clause
+ *
+ * Copyright (c) 1982, 1986, 1989, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * - kern_sig.c
+ */
+/*
+ * Copyright (c) 1993, David Greenman
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * -kern_exec.c
+ */
+
+#include <sys/systm.h>
+#include <sys/acct.h>
+#include <sys/compressor.h>
+#include <sys/devctl.h>
+#include <sys/fcntl.h>
+#include <sys/jail.h>
+#include <sys/limits.h>
+#include <sys/namei.h>
+#include <sys/proc.h>
+#include <sys/sbuf.h>
+#include <sys/stat.h>
+#include <sys/sysctl.h>
+#include <sys/sysent.h>
+#include <sys/syslog.h>
+#include <sys/ucoredump.h>
+#include <sys/unistd.h>
+#include <sys/vnode.h>
+
+#include <security/audit/audit.h>
+
+#define GZIP_SUFFIX ".gz"
+#define ZSTD_SUFFIX ".zst"
+
+#define MAX_NUM_CORE_FILES 100000
+#ifndef NUM_CORE_FILES
+#define NUM_CORE_FILES 5
+#endif
+
+static coredumper_handle_fn coredump_vnode;
+static struct coredumper vnode_coredumper = {
+ .cd_name = "vnode_coredumper",
+ .cd_handle = coredump_vnode,
+};
+
+SYSINIT(vnode_coredumper_register, SI_SUB_EXEC, SI_ORDER_ANY,
+ coredumper_register, &vnode_coredumper);
+
+_Static_assert(NUM_CORE_FILES >= 0 && NUM_CORE_FILES <= MAX_NUM_CORE_FILES,
+ "NUM_CORE_FILES is out of range (0 to " __STRING(MAX_NUM_CORE_FILES) ")");
+static int num_cores = NUM_CORE_FILES;
+
+static int capmode_coredump;
+SYSCTL_INT(_kern, OID_AUTO, capmode_coredump, CTLFLAG_RWTUN,
+ &capmode_coredump, 0, "Allow processes in capability mode to dump core");
+
+static int set_core_nodump_flag = 0;
+SYSCTL_INT(_kern, OID_AUTO, nodump_coredump, CTLFLAG_RW, &set_core_nodump_flag,
+ 0, "Enable setting the NODUMP flag on coredump files");
+
+static int coredump_devctl = 0;
+SYSCTL_INT(_kern, OID_AUTO, coredump_devctl, CTLFLAG_RW, &coredump_devctl,
+ 0, "Generate a devctl notification when processes coredump");
+
+/*
+ * corefilename[] is protected by the allproc_lock.
+ */
+static char corefilename[MAXPATHLEN] = { "%N.core" };
+TUNABLE_STR("kern.corefile", corefilename, sizeof(corefilename));
+
+static int
+sysctl_kern_corefile(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+
+ sx_xlock(&allproc_lock);
+ error = sysctl_handle_string(oidp, corefilename, sizeof(corefilename),
+ req);
+ sx_xunlock(&allproc_lock);
+
+ return (error);
+}
+SYSCTL_PROC(_kern, OID_AUTO, corefile, CTLTYPE_STRING | CTLFLAG_RW |
+ CTLFLAG_MPSAFE, 0, 0, sysctl_kern_corefile, "A",
+ "Process corefile name format string");
+
+static int
+sysctl_debug_num_cores_check (SYSCTL_HANDLER_ARGS)
+{
+ int error;
+ int new_val;
+
+ new_val = num_cores;
+ error = sysctl_handle_int(oidp, &new_val, 0, req);
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+ if (new_val > MAX_NUM_CORE_FILES)
+ new_val = MAX_NUM_CORE_FILES;
+ if (new_val < 0)
+ new_val = 0;
+ num_cores = new_val;
+ return (0);
+}
+SYSCTL_PROC(_debug, OID_AUTO, ncores,
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, sizeof(int),
+ sysctl_debug_num_cores_check, "I",
+ "Maximum number of generated process corefiles while using index format");
+
+static void
+vnode_close_locked(struct thread *td, struct vnode *vp)
+{
+
+ VOP_UNLOCK(vp);
+ vn_close(vp, FWRITE, td->td_ucred, td);
+}
+
+int
+core_vn_write(const struct coredump_writer *cdw, const void *base, size_t len,
+ off_t offset, enum uio_seg seg, struct ucred *cred, size_t *resid,
+ struct thread *td)
+{
+ struct coredump_vnode_ctx *ctx = cdw->ctx;
+
+ return (vn_rdwr_inchunks(UIO_WRITE, ctx->vp, __DECONST(void *, base),
+ len, offset, seg, IO_UNIT | IO_DIRECT | IO_RANGELOCKED,
+ cred, ctx->fcred, resid, td));
+}
+
+int
+core_vn_extend(const struct coredump_writer *cdw, off_t newsz,
+ struct ucred *cred)
+{
+ struct coredump_vnode_ctx *ctx = cdw->ctx;
+ struct mount *mp;
+ int error;
+
+ error = vn_start_write(ctx->vp, &mp, V_WAIT);
+ if (error != 0)
+ return (error);
+ vn_lock(ctx->vp, LK_EXCLUSIVE | LK_RETRY);
+ error = vn_truncate_locked(ctx->vp, newsz, false, cred);
+ VOP_UNLOCK(ctx->vp);
+ vn_finished_write(mp);
+ return (error);
+}
+
+/*
+ * If the core format has a %I in it, then we need to check
+ * for existing corefiles before defining a name.
+ * To do this we iterate over 0..ncores to find a
+ * non-existing core file name to use. If all core files are
+ * already used we choose the oldest one.
+ */
+static int
+corefile_open_last(struct thread *td, char *name, int indexpos,
+ int indexlen, int ncores, struct vnode **vpp)
+{
+ struct vnode *oldvp, *nextvp, *vp;
+ struct vattr vattr;
+ struct nameidata nd;
+ int error, i, flags, oflags, cmode;
+ char ch;
+ struct timespec lasttime;
+
+ nextvp = oldvp = NULL;
+ cmode = S_IRUSR | S_IWUSR;
+ oflags = VN_OPEN_NOAUDIT | VN_OPEN_NAMECACHE |
+ (capmode_coredump ? VN_OPEN_NOCAPCHECK : 0);
+
+ for (i = 0; i < ncores; i++) {
+ flags = O_CREAT | FWRITE | O_NOFOLLOW;
+
+ ch = name[indexpos + indexlen];
+ (void)snprintf(name + indexpos, indexlen + 1, "%.*u", indexlen,
+ i);
+ name[indexpos + indexlen] = ch;
+
+ NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, name);
+ error = vn_open_cred(&nd, &flags, cmode, oflags, td->td_ucred,
+ NULL);
+ if (error != 0)
+ break;
+
+ vp = nd.ni_vp;
+ NDFREE_PNBUF(&nd);
+ if ((flags & O_CREAT) == O_CREAT) {
+ nextvp = vp;
+ break;
+ }
+
+ error = VOP_GETATTR(vp, &vattr, td->td_ucred);
+ if (error != 0) {
+ vnode_close_locked(td, vp);
+ break;
+ }
+
+ if (oldvp == NULL ||
+ lasttime.tv_sec > vattr.va_mtime.tv_sec ||
+ (lasttime.tv_sec == vattr.va_mtime.tv_sec &&
+ lasttime.tv_nsec >= vattr.va_mtime.tv_nsec)) {
+ if (oldvp != NULL)
+ vn_close(oldvp, FWRITE, td->td_ucred, td);
+ oldvp = vp;
+ VOP_UNLOCK(oldvp);
+ lasttime = vattr.va_mtime;
+ } else {
+ vnode_close_locked(td, vp);
+ }
+ }
+
+ if (oldvp != NULL) {
+ if (nextvp == NULL) {
+ if ((td->td_proc->p_flag & P_SUGID) != 0) {
+ error = EFAULT;
+ vn_close(oldvp, FWRITE, td->td_ucred, td);
+ } else {
+ nextvp = oldvp;
+ error = vn_lock(nextvp, LK_EXCLUSIVE);
+ if (error != 0) {
+ vn_close(nextvp, FWRITE, td->td_ucred,
+ td);
+ nextvp = NULL;
+ }
+ }
+ } else {
+ vn_close(oldvp, FWRITE, td->td_ucred, td);
+ }
+ }
+ if (error != 0) {
+ if (nextvp != NULL)
+ vnode_close_locked(td, oldvp);
+ } else {
+ *vpp = nextvp;
+ }
+
+ return (error);
+}
+
+/*
+ * corefile_open(comm, uid, pid, td, compress, vpp, namep)
+ * Expand the name described in corefilename, using name, uid, and pid
+ * and open/create core file.
+ * corefilename is a printf-like string, with three format specifiers:
+ * %N name of process ("name")
+ * %P process id (pid)
+ * %U user id (uid)
+ * For example, "%N.core" is the default; they can be disabled completely
+ * by using "/dev/null", or all core files can be stored in "/cores/%U/%N-%P".
+ * This is controlled by the sysctl variable kern.corefile (see above).
+ */
+static int
+corefile_open(const char *comm, uid_t uid, pid_t pid, struct thread *td,
+ int compress, int signum, struct vnode **vpp, char **namep)
+{
+ struct sbuf sb;
+ struct nameidata nd;
+ const char *format;
+ char *hostname, *name;
+ int cmode, error, flags, i, indexpos, indexlen, oflags, ncores;
+
+ hostname = NULL;
+ format = corefilename;
+ name = malloc(MAXPATHLEN, M_TEMP, M_WAITOK | M_ZERO);
+ indexlen = 0;
+ indexpos = -1;
+ ncores = num_cores;
+ (void)sbuf_new(&sb, name, MAXPATHLEN, SBUF_FIXEDLEN);
+ sx_slock(&allproc_lock);
+ for (i = 0; format[i] != '\0'; i++) {
+ switch (format[i]) {
+ case '%': /* Format character */
+ i++;
+ switch (format[i]) {
+ case '%':
+ sbuf_putc(&sb, '%');
+ break;
+ case 'H': /* hostname */
+ if (hostname == NULL) {
+ hostname = malloc(MAXHOSTNAMELEN,
+ M_TEMP, M_WAITOK);
+ }
+ getcredhostname(td->td_ucred, hostname,
+ MAXHOSTNAMELEN);
+ sbuf_cat(&sb, hostname);
+ break;
+ case 'I': /* autoincrementing index */
+ if (indexpos != -1) {
+ sbuf_printf(&sb, "%%I");
+ break;
+ }
+
+ indexpos = sbuf_len(&sb);
+ sbuf_printf(&sb, "%u", ncores - 1);
+ indexlen = sbuf_len(&sb) - indexpos;
+ break;
+ case 'N': /* process name */
+ sbuf_printf(&sb, "%s", comm);
+ break;
+ case 'P': /* process id */
+ sbuf_printf(&sb, "%u", pid);
+ break;
+ case 'S': /* signal number */
+ sbuf_printf(&sb, "%i", signum);
+ break;
+ case 'U': /* user id */
+ sbuf_printf(&sb, "%u", uid);
+ break;
+ default:
+ log(LOG_ERR,
+ "Unknown format character %c in "
+ "corename `%s'\n", format[i], format);
+ break;
+ }
+ break;
+ default:
+ sbuf_putc(&sb, format[i]);
+ break;
+ }
+ }
+ sx_sunlock(&allproc_lock);
+ free(hostname, M_TEMP);
+ if (compress == COMPRESS_GZIP)
+ sbuf_cat(&sb, GZIP_SUFFIX);
+ else if (compress == COMPRESS_ZSTD)
+ sbuf_cat(&sb, ZSTD_SUFFIX);
+ if (sbuf_error(&sb) != 0) {
+ log(LOG_ERR, "pid %ld (%s), uid (%lu): corename is too "
+ "long\n", (long)pid, comm, (u_long)uid);
+ sbuf_delete(&sb);
+ free(name, M_TEMP);
+ return (ENOMEM);
+ }
+ sbuf_finish(&sb);
+ sbuf_delete(&sb);
+
+ if (indexpos != -1) {
+ error = corefile_open_last(td, name, indexpos, indexlen, ncores,
+ vpp);
+ if (error != 0) {
+ log(LOG_ERR,
+ "pid %d (%s), uid (%u): Path `%s' failed "
+ "on initial open test, error = %d\n",
+ pid, comm, uid, name, error);
+ }
+ } else {
+ cmode = S_IRUSR | S_IWUSR;
+ oflags = VN_OPEN_NOAUDIT | VN_OPEN_NAMECACHE |
+ (capmode_coredump ? VN_OPEN_NOCAPCHECK : 0);
+ flags = O_CREAT | FWRITE | O_NOFOLLOW;
+ if ((td->td_proc->p_flag & P_SUGID) != 0)
+ flags |= O_EXCL;
+
+ NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, name);
+ error = vn_open_cred(&nd, &flags, cmode, oflags, td->td_ucred,
+ NULL);
+ if (error == 0) {
+ *vpp = nd.ni_vp;
+ NDFREE_PNBUF(&nd);
+ }
+ }
+
+ if (error != 0) {
+#ifdef AUDIT
+ audit_proc_coredump(td, name, error);
+#endif
+ free(name, M_TEMP);
+ return (error);
+ }
+ *namep = name;
+ return (0);
+}
+
+/*
+ * The vnode dumper is the traditional coredump handler. Our policy and limits
+ * are generally checked already, so it creates the coredump name and passes on
+ * a vnode and a size limit to the process-specific coredump routine if there is
+ * one. If there _is not_ one, it returns ENOSYS; otherwise it returns the
+ * error from the process-specific routine.
+ */
+static int
+coredump_vnode(struct thread *td, off_t limit)
+{
+ struct proc *p = td->td_proc;
+ struct ucred *cred = td->td_ucred;
+ struct vnode *vp;
+ struct coredump_vnode_ctx wctx;
+ struct coredump_writer cdw = { };
+ struct flock lf;
+ struct vattr vattr;
+ size_t fullpathsize;
+ int error, error1, jid, locked, ppid, sig;
+ char *name; /* name of corefile */
+ void *rl_cookie;
+ char *fullpath, *freepath = NULL;
+ struct sbuf *sb;
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+
+ ppid = p->p_oppid;
+ sig = p->p_sig;
+ jid = p->p_ucred->cr_prison->pr_id;
+ PROC_UNLOCK(p);
+
+ error = corefile_open(p->p_comm, cred->cr_uid, p->p_pid, td,
+ compress_user_cores, sig, &vp, &name);
+ if (error != 0)
+ return (error);
+
+ /*
+ * Don't dump to non-regular files or files with links.
+ * Do not dump into system files. Effective user must own the corefile.
+ */
+ if (vp->v_type != VREG || VOP_GETATTR(vp, &vattr, cred) != 0 ||
+ vattr.va_nlink != 1 || (vp->v_vflag & VV_SYSTEM) != 0 ||
+ vattr.va_uid != cred->cr_uid) {
+ VOP_UNLOCK(vp);
+ error = EFAULT;
+ goto out;
+ }
+
+ VOP_UNLOCK(vp);
+
+ /* Postpone other writers, including core dumps of other processes. */
+ rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
+
+ lf.l_whence = SEEK_SET;
+ lf.l_start = 0;
+ lf.l_len = 0;
+ lf.l_type = F_WRLCK;
+ locked = (VOP_ADVLOCK(vp, (caddr_t)p, F_SETLK, &lf, F_FLOCK) == 0);
+
+ VATTR_NULL(&vattr);
+ vattr.va_size = 0;
+ if (set_core_nodump_flag)
+ vattr.va_flags = UF_NODUMP;
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+ VOP_SETATTR(vp, &vattr, cred);
+ VOP_UNLOCK(vp);
+ PROC_LOCK(p);
+ p->p_acflag |= ACORE;
+ PROC_UNLOCK(p);
+
+ wctx.vp = vp;
+ wctx.fcred = NOCRED;
+
+ cdw.ctx = &wctx;
+ cdw.write_fn = core_vn_write;
+ cdw.extend_fn = core_vn_extend;
+
+ if (p->p_sysent->sv_coredump != NULL) {
+ error = p->p_sysent->sv_coredump(td, &cdw, limit, 0);
+ } else {
+ error = ENOSYS;
+ }
+
+ if (locked) {
+ lf.l_type = F_UNLCK;
+ VOP_ADVLOCK(vp, (caddr_t)p, F_UNLCK, &lf, F_FLOCK);
+ }
+ vn_rangelock_unlock(vp, rl_cookie);
+
+ /*
+ * Notify the userland helper that a process triggered a core dump.
+ * This allows the helper to run an automated debugging session.
+ */
+ if (error != 0 || coredump_devctl == 0)
+ goto out;
+ sb = sbuf_new_auto();
+ if (vn_fullpath_global(p->p_textvp, &fullpath, &freepath) != 0)
+ goto out2;
+ sbuf_cat(sb, "comm=\"");
+ devctl_safe_quote_sb(sb, fullpath);
+ free(freepath, M_TEMP);
+ sbuf_cat(sb, "\" core=\"");
+
+ /*
+ * We can't lookup core file vp directly. When we're replacing a core, and
+ * other random times, we flush the name cache, so it will fail. Instead,
+ * if the path of the core is relative, add the current dir in front if it.
+ */
+ if (name[0] != '/') {
+ fullpathsize = MAXPATHLEN;
+ freepath = malloc(fullpathsize, M_TEMP, M_WAITOK);
+ if (vn_getcwd(freepath, &fullpath, &fullpathsize) != 0) {
+ free(freepath, M_TEMP);
+ goto out2;
+ }
+ devctl_safe_quote_sb(sb, fullpath);
+ free(freepath, M_TEMP);
+ sbuf_putc(sb, '/');
+ }
+ devctl_safe_quote_sb(sb, name);
+ sbuf_putc(sb, '"');
+
+ sbuf_printf(sb, " jid=%d pid=%d ppid=%d signo=%d",
+ jid, p->p_pid, ppid, sig);
+ if (sbuf_finish(sb) == 0)
+ devctl_notify("kernel", "signal", "coredump", sbuf_data(sb));
+out2:
+ sbuf_delete(sb);
+out:
+ error1 = vn_close(vp, FWRITE, cred, td);
+ if (error == 0)
+ error = error1;
+#ifdef AUDIT
+ audit_proc_coredump(td, name, error);
+#endif
+ free(name, M_TEMP);
+ return (error);
+}
diff --git a/sys/kern/imgact_elf.c b/sys/kern/imgact_elf.c
index b7ffbe68b483..2690ad3b2679 100644
--- a/sys/kern/imgact_elf.c
+++ b/sys/kern/imgact_elf.c
@@ -64,6 +64,7 @@
#include <sys/syscall.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
+#include <sys/ucoredump.h>
#include <sys/vnode.h>
#include <sys/syslog.h>
#include <sys/eventhandler.h>
@@ -1562,9 +1563,6 @@ struct note_info {
TAILQ_HEAD(note_info_list, note_info);
-extern int compress_user_cores;
-extern int compress_user_cores_level;
-
static void cb_put_phdr(vm_map_entry_t, void *);
static void cb_size_segment(vm_map_entry_t, void *);
static void each_dumpable_segment(struct thread *, segment_callback, void *,
@@ -1595,7 +1593,7 @@ core_compressed_write(void *base, size_t len, off_t offset, void *arg)
}
int
-__elfN(coredump)(struct thread *td, struct vnode *vp, off_t limit, int flags)
+__elfN(coredump)(struct thread *td, struct coredump_writer *cdw, off_t limit, int flags)
{
struct ucred *cred = td->td_ucred;
int compm, error = 0;
@@ -1625,9 +1623,8 @@ __elfN(coredump)(struct thread *td, struct vnode *vp, off_t limit, int flags)
/* Set up core dump parameters. */
params.offset = 0;
params.active_cred = cred;
- params.file_cred = NOCRED;
params.td = td;
- params.vp = vp;
+ params.cdw = cdw;
params.comp = NULL;
#ifdef RACCT
@@ -1662,6 +1659,12 @@ __elfN(coredump)(struct thread *td, struct vnode *vp, off_t limit, int flags)
tmpbuf = malloc(CORE_BUF_SIZE, M_TEMP, M_WAITOK | M_ZERO);
}
+ if (cdw->init_fn != NULL) {
+ error = (*cdw->init_fn)(cdw, &params);
+ if (error != 0)
+ goto done;
+ }
+
/*
* Allocate memory for building the header, fill it up,
* and write it out following the notes.
diff --git a/sys/kern/kern_cpuset.c b/sys/kern/kern_cpuset.c
index 5d9e2f2f326b..d7eb82d5f259 100644
--- a/sys/kern/kern_cpuset.c
+++ b/sys/kern/kern_cpuset.c
@@ -530,7 +530,7 @@ _domainset_create(struct domainset *domain, struct domainlist *freelist)
* remove them and update the domainset accordingly. If only empty
* domains are present, we must return failure.
*/
-static bool
+bool
domainset_empty_vm(struct domainset *domain)
{
domainset_t empty;
@@ -2409,82 +2409,92 @@ sys_cpuset_setdomain(struct thread *td, struct cpuset_setdomain_args *uap)
}
int
-kern_cpuset_setdomain(struct thread *td, cpulevel_t level, cpuwhich_t which,
- id_t id, size_t domainsetsize, const domainset_t *maskp, int policy,
- const struct cpuset_copy_cb *cb)
+domainset_populate(struct domainset *domain, const domainset_t *mask, int policy,
+ size_t mask_size)
{
- struct cpuset *nset;
- struct cpuset *set;
- struct thread *ttd;
- struct proc *p;
- struct domainset domain;
- domainset_t *mask;
- int error;
- if (domainsetsize < sizeof(domainset_t) ||
- domainsetsize > DOMAINSET_MAXSIZE / NBBY)
- return (ERANGE);
if (policy <= DOMAINSET_POLICY_INVALID ||
- policy > DOMAINSET_POLICY_MAX)
+ policy > DOMAINSET_POLICY_MAX) {
return (EINVAL);
- error = cpuset_check_capabilities(td, level, which, id);
- if (error != 0)
- return (error);
- memset(&domain, 0, sizeof(domain));
- mask = malloc(domainsetsize, M_TEMP, M_WAITOK | M_ZERO);
- error = cb->cpuset_copyin(maskp, mask, domainsetsize);
- if (error)
- goto out;
+ }
+
/*
* Verify that no high bits are set.
*/
- if (domainsetsize > sizeof(domainset_t)) {
- char *end;
- char *cp;
+ if (mask_size > sizeof(domainset_t)) {
+ const char *end;
+ const char *cp;
- end = cp = (char *)&mask->__bits;
- end += domainsetsize;
+ end = cp = (const char *)&mask->__bits;
+ end += mask_size;
cp += sizeof(domainset_t);
- while (cp != end)
+ while (cp != end) {
if (*cp++ != 0) {
- error = EINVAL;
- goto out;
+ return (EINVAL);
}
+ }
}
if (DOMAINSET_EMPTY(mask)) {
- error = EDEADLK;
- goto out;
+ return (EDEADLK);
}
- DOMAINSET_COPY(mask, &domain.ds_mask);
- domain.ds_policy = policy;
+ DOMAINSET_COPY(mask, &domain->ds_mask);
+ domain->ds_policy = policy;
/*
* Sanitize the provided mask.
*/
- if (!DOMAINSET_SUBSET(&all_domains, &domain.ds_mask)) {
- error = EINVAL;
- goto out;
+ if (!DOMAINSET_SUBSET(&all_domains, &domain->ds_mask)) {
+ return (EINVAL);
}
/* Translate preferred policy into a mask and fallback. */
if (policy == DOMAINSET_POLICY_PREFER) {
/* Only support a single preferred domain. */
- if (DOMAINSET_COUNT(&domain.ds_mask) != 1) {
- error = EINVAL;
- goto out;
+ if (DOMAINSET_COUNT(&domain->ds_mask) != 1) {
+ return (EINVAL);
}
- domain.ds_prefer = DOMAINSET_FFS(&domain.ds_mask) - 1;
+ domain->ds_prefer = DOMAINSET_FFS(&domain->ds_mask) - 1;
/* This will be constrained by domainset_shadow(). */
- DOMAINSET_COPY(&all_domains, &domain.ds_mask);
+ DOMAINSET_COPY(&all_domains, &domain->ds_mask);
}
+ return (0);
+}
+
+int
+kern_cpuset_setdomain(struct thread *td, cpulevel_t level, cpuwhich_t which,
+ id_t id, size_t domainsetsize, const domainset_t *maskp, int policy,
+ const struct cpuset_copy_cb *cb)
+{
+ struct cpuset *nset;
+ struct cpuset *set;
+ struct thread *ttd;
+ struct proc *p;
+ struct domainset domain;
+ domainset_t *mask;
+ int error;
+
+ error = cpuset_check_capabilities(td, level, which, id);
+ if (error != 0)
+ return (error);
+ if (domainsetsize < sizeof(domainset_t) ||
+ domainsetsize > DOMAINSET_MAXSIZE / NBBY)
+ return (ERANGE);
+ memset(&domain, 0, sizeof(domain));
+ mask = malloc(domainsetsize, M_TEMP, M_WAITOK | M_ZERO);
+ error = cb->cpuset_copyin(maskp, mask, domainsetsize);
+ if (error)
+ goto out;
+ error = domainset_populate(&domain, mask, policy, domainsetsize);
+ if (error)
+ goto out;
+
/*
* When given an impossible policy, fall back to interleaving
* across all domains.
*/
if (domainset_empty_vm(&domain))
domainset_copy(domainset2, &domain);
-
switch (level) {
case CPU_LEVEL_ROOT:
case CPU_LEVEL_CPUSET:
diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c
index 93bdd41d1515..a27ab33b34da 100644
--- a/sys/kern/kern_descrip.c
+++ b/sys/kern/kern_descrip.c
@@ -557,8 +557,10 @@ open_to_fde_flags(int open_flags, bool sticky_orb)
{ .f = O_CLOFORK, .t = UF_FOCLOSE },
{ .f = O_RESOLVE_BENEATH, .t = UF_RESOLVE_BENEATH },
};
+#if defined(__clang__) && __clang_major__ >= 19
_Static_assert(open_to_fde_flags_s[nitems(open_to_fde_flags_s) - 1].f ==
O_RESOLVE_BENEATH, "O_RESOLVE_BENEATH must be last, for sticky_orb");
+#endif
return (flags_trans(open_to_fde_flags_s, nitems(open_to_fde_flags_s) -
(sticky_orb ? 0 : 1), open_flags));
diff --git a/sys/kern/kern_exec.c b/sys/kern/kern_exec.c
index 03268365891e..0fc2d0e7f1bc 100644
--- a/sys/kern/kern_exec.c
+++ b/sys/kern/kern_exec.c
@@ -70,6 +70,7 @@
#include <sys/sysent.h>
#include <sys/sysproto.h>
#include <sys/timers.h>
+#include <sys/ucoredump.h>
#include <sys/umtxvar.h>
#include <sys/vnode.h>
#include <sys/wait.h>
@@ -2002,10 +2003,14 @@ int
core_write(struct coredump_params *cp, const void *base, size_t len,
off_t offset, enum uio_seg seg, size_t *resid)
{
+ return ((*cp->cdw->write_fn)(cp->cdw, base, len, offset, seg,
+ cp->active_cred, resid, cp->td));
+}
- return (vn_rdwr_inchunks(UIO_WRITE, cp->vp, __DECONST(void *, base),
- len, offset, seg, IO_UNIT | IO_DIRECT | IO_RANGELOCKED,
- cp->active_cred, cp->file_cred, resid, cp->td));
+static int
+core_extend(struct coredump_params *cp, off_t newsz)
+{
+ return ((*cp->cdw->extend_fn)(cp->cdw, newsz, cp->active_cred));
}
int
@@ -2013,7 +2018,6 @@ core_output(char *base, size_t len, off_t offset, struct coredump_params *cp,
void *tmpbuf)
{
vm_map_t map;
- struct mount *mp;
size_t resid, runlen;
int error;
bool success;
@@ -2068,14 +2072,7 @@ core_output(char *base, size_t len, off_t offset, struct coredump_params *cp,
}
}
if (!success) {
- error = vn_start_write(cp->vp, &mp, V_WAIT);
- if (error != 0)
- break;
- vn_lock(cp->vp, LK_EXCLUSIVE | LK_RETRY);
- error = vn_truncate_locked(cp->vp, offset + runlen,
- false, cp->td->td_ucred);
- VOP_UNLOCK(cp->vp);
- vn_finished_write(mp);
+ error = core_extend(cp, offset + runlen);
if (error != 0)
break;
}
diff --git a/sys/kern/kern_jail.c b/sys/kern/kern_jail.c
index d4529e096929..7ef1d19f0ea8 100644
--- a/sys/kern/kern_jail.c
+++ b/sys/kern/kern_jail.c
@@ -3466,7 +3466,7 @@ prison_check_af(struct ucred *cred, int af)
pr = cred->cr_prison;
#ifdef VIMAGE
/* Prisons with their own network stack are not limited. */
- if (prison_owns_vnet(cred))
+ if (prison_owns_vnet(pr))
return (0);
#endif
@@ -3531,7 +3531,7 @@ prison_if(struct ucred *cred, const struct sockaddr *sa)
KASSERT(sa != NULL, ("%s: sa is NULL", __func__));
#ifdef VIMAGE
- if (prison_owns_vnet(cred))
+ if (prison_owns_vnet(cred->cr_prison))
return (0);
#endif
@@ -3648,7 +3648,7 @@ jailed_without_vnet(struct ucred *cred)
if (!jailed(cred))
return (false);
#ifdef VIMAGE
- if (prison_owns_vnet(cred))
+ if (prison_owns_vnet(cred->cr_prison))
return (false);
#endif
@@ -3711,20 +3711,17 @@ getjailname(struct ucred *cred, char *name, size_t len)
#ifdef VIMAGE
/*
- * Determine whether the prison represented by cred owns
- * its vnet rather than having it inherited.
- *
- * Returns true in case the prison owns the vnet, false otherwise.
+ * Determine whether the prison owns its VNET.
*/
bool
-prison_owns_vnet(struct ucred *cred)
+prison_owns_vnet(struct prison *pr)
{
/*
* vnets cannot be added/removed after jail creation,
* so no need to lock here.
*/
- return ((cred->cr_prison->pr_flags & PR_VNET) != 0);
+ return ((pr->pr_flags & PR_VNET) != 0);
}
#endif
@@ -4425,7 +4422,7 @@ sysctl_jail_vnet(SYSCTL_HANDLER_ARGS)
#ifdef VIMAGE
struct ucred *cred = req->td->td_ucred;
- havevnet = jailed(cred) && prison_owns_vnet(cred);
+ havevnet = jailed(cred) && prison_owns_vnet(cred->cr_prison);
#else
havevnet = 0;
#endif
diff --git a/sys/kern/kern_prot.c b/sys/kern/kern_prot.c
index d9aeec68e620..0f0bc056cafd 100644
--- a/sys/kern/kern_prot.c
+++ b/sys/kern/kern_prot.c
@@ -287,7 +287,7 @@ sys_getgid(struct thread *td, struct getgid_args *uap)
td->td_retval[0] = td->td_ucred->cr_rgid;
#if defined(COMPAT_43)
- td->td_retval[1] = td->td_ucred->cr_groups[0];
+ td->td_retval[1] = td->td_ucred->cr_gid;
#endif
return (0);
}
@@ -307,7 +307,7 @@ int
sys_getegid(struct thread *td, struct getegid_args *uap)
{
- td->td_retval[0] = td->td_ucred->cr_groups[0];
+ td->td_retval[0] = td->td_ucred->cr_gid;
return (0);
}
@@ -1080,7 +1080,7 @@ sys_setgid(struct thread *td, struct setgid_args *uap)
gid != oldcred->cr_svgid && /* allow setgid(saved gid) */
#endif
#ifdef POSIX_APPENDIX_B_4_2_2 /* Use BSD-compat clause from B.4.2.2 */
- gid != oldcred->cr_groups[0] && /* allow setgid(getegid()) */
+ gid != oldcred->cr_gid && /* allow setgid(getegid()) */
#endif
(error = priv_check_cred(oldcred, PRIV_CRED_SETGID)) != 0)
goto fail;
@@ -1092,7 +1092,7 @@ sys_setgid(struct thread *td, struct setgid_args *uap)
*/
if (
#ifdef POSIX_APPENDIX_B_4_2_2 /* use the clause from B.4.2.2 */
- gid == oldcred->cr_groups[0] ||
+ gid == oldcred->cr_gid ||
#endif
/* We are using privs. */
priv_check_cred(oldcred, PRIV_CRED_SETGID) == 0)
@@ -1121,7 +1121,7 @@ sys_setgid(struct thread *td, struct setgid_args *uap)
* In all cases permitted cases, we are changing the egid.
* Copy credentials so other references do not see our changes.
*/
- if (oldcred->cr_groups[0] != gid) {
+ if (oldcred->cr_gid != gid) {
change_egid(newcred, gid);
setsugid(p);
}
@@ -1167,7 +1167,7 @@ sys_setegid(struct thread *td, struct setegid_args *uap)
(error = priv_check_cred(oldcred, PRIV_CRED_SETEGID)) != 0)
goto fail;
- if (oldcred->cr_groups[0] != egid) {
+ if (oldcred->cr_gid != egid) {
change_egid(newcred, egid);
setsugid(p);
}
@@ -1393,12 +1393,12 @@ sys_setregid(struct thread *td, struct setregid_args *uap)
if (((rgid != (gid_t)-1 && rgid != oldcred->cr_rgid &&
rgid != oldcred->cr_svgid) ||
- (egid != (gid_t)-1 && egid != oldcred->cr_groups[0] &&
+ (egid != (gid_t)-1 && egid != oldcred->cr_gid &&
egid != oldcred->cr_rgid && egid != oldcred->cr_svgid)) &&
(error = priv_check_cred(oldcred, PRIV_CRED_SETREGID)) != 0)
goto fail;
- if (egid != (gid_t)-1 && oldcred->cr_groups[0] != egid) {
+ if (egid != (gid_t)-1 && oldcred->cr_gid != egid) {
change_egid(newcred, egid);
setsugid(p);
}
@@ -1406,9 +1406,9 @@ sys_setregid(struct thread *td, struct setregid_args *uap)
change_rgid(newcred, rgid);
setsugid(p);
}
- if ((rgid != (gid_t)-1 || newcred->cr_groups[0] != newcred->cr_rgid) &&
- newcred->cr_svgid != newcred->cr_groups[0]) {
- change_svgid(newcred, newcred->cr_groups[0]);
+ if ((rgid != (gid_t)-1 || newcred->cr_gid != newcred->cr_rgid) &&
+ newcred->cr_svgid != newcred->cr_gid) {
+ change_svgid(newcred, newcred->cr_gid);
setsugid(p);
}
proc_set_cred(p, newcred);
@@ -1547,17 +1547,17 @@ sys_setresgid(struct thread *td, struct setresgid_args *uap)
if (((rgid != (gid_t)-1 && rgid != oldcred->cr_rgid &&
rgid != oldcred->cr_svgid &&
- rgid != oldcred->cr_groups[0]) ||
+ rgid != oldcred->cr_gid) ||
(egid != (gid_t)-1 && egid != oldcred->cr_rgid &&
egid != oldcred->cr_svgid &&
- egid != oldcred->cr_groups[0]) ||
+ egid != oldcred->cr_gid) ||
(sgid != (gid_t)-1 && sgid != oldcred->cr_rgid &&
sgid != oldcred->cr_svgid &&
- sgid != oldcred->cr_groups[0])) &&
+ sgid != oldcred->cr_gid)) &&
(error = priv_check_cred(oldcred, PRIV_CRED_SETRESGID)) != 0)
goto fail;
- if (egid != (gid_t)-1 && oldcred->cr_groups[0] != egid) {
+ if (egid != (gid_t)-1 && oldcred->cr_gid != egid) {
change_egid(newcred, egid);
setsugid(p);
}
@@ -1626,8 +1626,8 @@ sys_getresgid(struct thread *td, struct getresgid_args *uap)
error1 = copyout(&cred->cr_rgid,
uap->rgid, sizeof(cred->cr_rgid));
if (uap->egid)
- error2 = copyout(&cred->cr_groups[0],
- uap->egid, sizeof(cred->cr_groups[0]));
+ error2 = copyout(&cred->cr_gid,
+ uap->egid, sizeof(cred->cr_gid));
if (uap->sgid)
error3 = copyout(&cred->cr_svgid,
uap->sgid, sizeof(cred->cr_svgid));
@@ -1737,7 +1737,7 @@ groupmember(gid_t gid, const struct ucred *cred)
groups_check_positive_len(cred->cr_ngroups);
- if (gid == cred->cr_groups[0])
+ if (gid == cred->cr_gid)
return (true);
return (group_is_supplementary(gid, cred));
@@ -3015,7 +3015,7 @@ void
change_egid(struct ucred *newcred, gid_t egid)
{
- newcred->cr_groups[0] = egid;
+ newcred->cr_gid = egid;
}
/*-
diff --git a/sys/kern/kern_sendfile.c b/sys/kern/kern_sendfile.c
index 35b258e68701..8438298afc0e 100644
--- a/sys/kern/kern_sendfile.c
+++ b/sys/kern/kern_sendfile.c
@@ -698,10 +698,13 @@ sendfile_wait_generic(struct socket *so, off_t need, int *space)
*/
error = 0;
SOCK_SENDBUF_LOCK(so);
- if (so->so_snd.sb_lowat < so->so_snd.sb_hiwat / 2)
- so->so_snd.sb_lowat = so->so_snd.sb_hiwat / 2;
- if (so->so_snd.sb_lowat < PAGE_SIZE && so->so_snd.sb_hiwat >= PAGE_SIZE)
- so->so_snd.sb_lowat = PAGE_SIZE;
+ if (so->so_snd.sb_flags & SB_AUTOLOWAT) {
+ if (so->so_snd.sb_lowat < so->so_snd.sb_hiwat / 2)
+ so->so_snd.sb_lowat = so->so_snd.sb_hiwat / 2;
+ if (so->so_snd.sb_lowat < PAGE_SIZE &&
+ so->so_snd.sb_hiwat >= PAGE_SIZE)
+ so->so_snd.sb_lowat = PAGE_SIZE;
+ }
retry_space:
if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
error = EPIPE;
diff --git a/sys/kern/kern_sig.c b/sys/kern/kern_sig.c
index 5d51aa675cb7..da0efac0598d 100644
--- a/sys/kern/kern_sig.c
+++ b/sys/kern/kern_sig.c
@@ -45,10 +45,10 @@
#include <sys/vnode.h>
#include <sys/acct.h>
#include <sys/capsicum.h>
-#include <sys/compressor.h>
#include <sys/condvar.h>
#include <sys/devctl.h>
#include <sys/event.h>
+#include <sys/exec.h>
#include <sys/fcntl.h>
#include <sys/imgact.h>
#include <sys/jail.h>
@@ -80,6 +80,7 @@
#include <sys/syslog.h>
#include <sys/sysproto.h>
#include <sys/timers.h>
+#include <sys/ucoredump.h>
#include <sys/unistd.h>
#include <sys/vmmeter.h>
#include <sys/wait.h>
@@ -101,7 +102,6 @@ SDT_PROBE_DEFINE2(proc, , , signal__clear,
SDT_PROBE_DEFINE3(proc, , , signal__discard,
"struct thread *", "struct proc *", "int");
-static int coredump(struct thread *);
static int killpg1(struct thread *td, int sig, int pgid, int all,
ksiginfo_t *ksi);
static int issignal(struct thread *td);
@@ -126,11 +126,6 @@ const struct filterops sig_filtops = {
.f_event = filt_signal,
};
-static int kern_logsigexit = 1;
-SYSCTL_INT(_kern, KERN_LOGSIGEXIT, logsigexit, CTLFLAG_RW,
- &kern_logsigexit, 0,
- "Log processes quitting on abnormal signals to syslog(3)");
-
static int kern_forcesigexit = 1;
SYSCTL_INT(_kern, OID_AUTO, forcesigexit, CTLFLAG_RW,
&kern_forcesigexit, 0, "Force trap signal to be handled");
@@ -193,26 +188,6 @@ SYSINIT(signal, SI_SUB_P1003_1B, SI_ORDER_FIRST+3, sigqueue_start, NULL);
(cr1)->cr_ruid == (cr2)->cr_uid || \
(cr1)->cr_uid == (cr2)->cr_uid)
-static int sugid_coredump;
-SYSCTL_INT(_kern, OID_AUTO, sugid_coredump, CTLFLAG_RWTUN,
- &sugid_coredump, 0, "Allow setuid and setgid processes to dump core");
-
-static int capmode_coredump;
-SYSCTL_INT(_kern, OID_AUTO, capmode_coredump, CTLFLAG_RWTUN,
- &capmode_coredump, 0, "Allow processes in capability mode to dump core");
-
-static int do_coredump = 1;
-SYSCTL_INT(_kern, OID_AUTO, coredump, CTLFLAG_RW,
- &do_coredump, 0, "Enable/Disable coredumps");
-
-static int set_core_nodump_flag = 0;
-SYSCTL_INT(_kern, OID_AUTO, nodump_coredump, CTLFLAG_RW, &set_core_nodump_flag,
- 0, "Enable setting the NODUMP flag on coredump files");
-
-static int coredump_devctl = 0;
-SYSCTL_INT(_kern, OID_AUTO, coredump_devctl, CTLFLAG_RW, &coredump_devctl,
- 0, "Generate a devctl notification when processes coredump");
-
/*
* Signal properties and actions.
* The array below categorizes the signals and their default actions
@@ -784,6 +759,13 @@ sigprop(int sig)
return (0);
}
+bool
+sig_do_core(int sig)
+{
+
+ return ((sigprop(sig) & SIGPROP_CORE) != 0);
+}
+
static bool
sigact_flag_test(const struct sigaction *act, int flag)
{
@@ -2665,6 +2647,8 @@ static void
ptrace_coredumpreq(struct thread *td, struct proc *p,
struct thr_coredump_req *tcq)
{
+ struct coredump_vnode_ctx wctx;
+ struct coredump_writer cdw;
void *rl_cookie;
if (p->p_sysent->sv_coredump == NULL) {
@@ -2672,8 +2656,15 @@ ptrace_coredumpreq(struct thread *td, struct proc *p,
return;
}
+ wctx.vp = tcq->tc_vp;
+ wctx.fcred = NOCRED;
+
+ cdw.ctx = &wctx;
+ cdw.write_fn = core_vn_write;
+ cdw.extend_fn = core_vn_extend;
+
rl_cookie = vn_rangelock_wlock(tcq->tc_vp, 0, OFF_MAX);
- tcq->tc_error = p->p_sysent->sv_coredump(td, tcq->tc_vp,
+ tcq->tc_error = p->p_sysent->sv_coredump(td, &cdw,
tcq->tc_limit, tcq->tc_flags);
vn_rangelock_unlock(tcq->tc_vp, rl_cookie);
}
@@ -3635,82 +3626,6 @@ killproc(struct proc *p, const char *why)
}
/*
- * Force the current process to exit with the specified signal, dumping core
- * if appropriate. We bypass the normal tests for masked and caught signals,
- * allowing unrecoverable failures to terminate the process without changing
- * signal state. Mark the accounting record with the signal termination.
- * If dumping core, save the signal number for the debugger. Calls exit and
- * does not return.
- */
-void
-sigexit(struct thread *td, int sig)
-{
- struct proc *p = td->td_proc;
- const char *coreinfo;
- int rv;
- bool logexit;
-
- PROC_LOCK_ASSERT(p, MA_OWNED);
- proc_set_p2_wexit(p);
-
- p->p_acflag |= AXSIG;
- if ((p->p_flag2 & P2_LOGSIGEXIT_CTL) == 0)
- logexit = kern_logsigexit != 0;
- else
- logexit = (p->p_flag2 & P2_LOGSIGEXIT_ENABLE) != 0;
-
- /*
- * We must be single-threading to generate a core dump. This
- * ensures that the registers in the core file are up-to-date.
- * Also, the ELF dump handler assumes that the thread list doesn't
- * change out from under it.
- *
- * XXX If another thread attempts to single-thread before us
- * (e.g. via fork()), we won't get a dump at all.
- */
- if ((sigprop(sig) & SIGPROP_CORE) &&
- thread_single(p, SINGLE_NO_EXIT) == 0) {
- p->p_sig = sig;
- /*
- * Log signals which would cause core dumps
- * (Log as LOG_INFO to appease those who don't want
- * these messages.)
- * XXX : Todo, as well as euid, write out ruid too
- * Note that coredump() drops proc lock.
- */
- rv = coredump(td);
- switch (rv) {
- case 0:
- sig |= WCOREFLAG;
- coreinfo = " (core dumped)";
- break;
- case EFAULT:
- coreinfo = " (no core dump - bad address)";
- break;
- case EINVAL:
- coreinfo = " (no core dump - invalid argument)";
- break;
- case EFBIG:
- coreinfo = " (no core dump - too large)";
- break;
- default:
- coreinfo = " (no core dump - other error)";
- break;
- }
- if (logexit)
- log(LOG_INFO,
- "pid %d (%s), jid %d, uid %d: exited on "
- "signal %d%s\n", p->p_pid, p->p_comm,
- p->p_ucred->cr_prison->pr_id,
- td->td_ucred->cr_uid,
- sig &~ WCOREFLAG, coreinfo);
- } else
- PROC_UNLOCK(p);
- exit1(td, 0, sig);
- /* NOTREACHED */
-}
-
-/*
* Send queued SIGCHLD to parent when child process's state
* is changed.
*/
@@ -3803,477 +3718,6 @@ childproc_exited(struct proc *p)
sigparent(p, reason, status);
}
-#define MAX_NUM_CORE_FILES 100000
-#ifndef NUM_CORE_FILES
-#define NUM_CORE_FILES 5
-#endif
-CTASSERT(NUM_CORE_FILES >= 0 && NUM_CORE_FILES <= MAX_NUM_CORE_FILES);
-static int num_cores = NUM_CORE_FILES;
-
-static int
-sysctl_debug_num_cores_check (SYSCTL_HANDLER_ARGS)
-{
- int error;
- int new_val;
-
- new_val = num_cores;
- error = sysctl_handle_int(oidp, &new_val, 0, req);
- if (error != 0 || req->newptr == NULL)
- return (error);
- if (new_val > MAX_NUM_CORE_FILES)
- new_val = MAX_NUM_CORE_FILES;
- if (new_val < 0)
- new_val = 0;
- num_cores = new_val;
- return (0);
-}
-SYSCTL_PROC(_debug, OID_AUTO, ncores,
- CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, sizeof(int),
- sysctl_debug_num_cores_check, "I",
- "Maximum number of generated process corefiles while using index format");
-
-#define GZIP_SUFFIX ".gz"
-#define ZSTD_SUFFIX ".zst"
-
-int compress_user_cores = 0;
-
-static int
-sysctl_compress_user_cores(SYSCTL_HANDLER_ARGS)
-{
- int error, val;
-
- val = compress_user_cores;
- error = sysctl_handle_int(oidp, &val, 0, req);
- if (error != 0 || req->newptr == NULL)
- return (error);
- if (val != 0 && !compressor_avail(val))
- return (EINVAL);
- compress_user_cores = val;
- return (error);
-}
-SYSCTL_PROC(_kern, OID_AUTO, compress_user_cores,
- CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_NEEDGIANT, 0, sizeof(int),
- sysctl_compress_user_cores, "I",
- "Enable compression of user corefiles ("
- __XSTRING(COMPRESS_GZIP) " = gzip, "
- __XSTRING(COMPRESS_ZSTD) " = zstd)");
-
-int compress_user_cores_level = 6;
-SYSCTL_INT(_kern, OID_AUTO, compress_user_cores_level, CTLFLAG_RWTUN,
- &compress_user_cores_level, 0,
- "Corefile compression level");
-
-/*
- * Protect the access to corefilename[] by allproc_lock.
- */
-#define corefilename_lock allproc_lock
-
-static char corefilename[MAXPATHLEN] = {"%N.core"};
-TUNABLE_STR("kern.corefile", corefilename, sizeof(corefilename));
-
-static int
-sysctl_kern_corefile(SYSCTL_HANDLER_ARGS)
-{
- int error;
-
- sx_xlock(&corefilename_lock);
- error = sysctl_handle_string(oidp, corefilename, sizeof(corefilename),
- req);
- sx_xunlock(&corefilename_lock);
-
- return (error);
-}
-SYSCTL_PROC(_kern, OID_AUTO, corefile, CTLTYPE_STRING | CTLFLAG_RW |
- CTLFLAG_MPSAFE, 0, 0, sysctl_kern_corefile, "A",
- "Process corefile name format string");
-
-static void
-vnode_close_locked(struct thread *td, struct vnode *vp)
-{
-
- VOP_UNLOCK(vp);
- vn_close(vp, FWRITE, td->td_ucred, td);
-}
-
-/*
- * If the core format has a %I in it, then we need to check
- * for existing corefiles before defining a name.
- * To do this we iterate over 0..ncores to find a
- * non-existing core file name to use. If all core files are
- * already used we choose the oldest one.
- */
-static int
-corefile_open_last(struct thread *td, char *name, int indexpos,
- int indexlen, int ncores, struct vnode **vpp)
-{
- struct vnode *oldvp, *nextvp, *vp;
- struct vattr vattr;
- struct nameidata nd;
- int error, i, flags, oflags, cmode;
- char ch;
- struct timespec lasttime;
-
- nextvp = oldvp = NULL;
- cmode = S_IRUSR | S_IWUSR;
- oflags = VN_OPEN_NOAUDIT | VN_OPEN_NAMECACHE |
- (capmode_coredump ? VN_OPEN_NOCAPCHECK : 0);
-
- for (i = 0; i < ncores; i++) {
- flags = O_CREAT | FWRITE | O_NOFOLLOW;
-
- ch = name[indexpos + indexlen];
- (void)snprintf(name + indexpos, indexlen + 1, "%.*u", indexlen,
- i);
- name[indexpos + indexlen] = ch;
-
- NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, name);
- error = vn_open_cred(&nd, &flags, cmode, oflags, td->td_ucred,
- NULL);
- if (error != 0)
- break;
-
- vp = nd.ni_vp;
- NDFREE_PNBUF(&nd);
- if ((flags & O_CREAT) == O_CREAT) {
- nextvp = vp;
- break;
- }
-
- error = VOP_GETATTR(vp, &vattr, td->td_ucred);
- if (error != 0) {
- vnode_close_locked(td, vp);
- break;
- }
-
- if (oldvp == NULL ||
- lasttime.tv_sec > vattr.va_mtime.tv_sec ||
- (lasttime.tv_sec == vattr.va_mtime.tv_sec &&
- lasttime.tv_nsec >= vattr.va_mtime.tv_nsec)) {
- if (oldvp != NULL)
- vn_close(oldvp, FWRITE, td->td_ucred, td);
- oldvp = vp;
- VOP_UNLOCK(oldvp);
- lasttime = vattr.va_mtime;
- } else {
- vnode_close_locked(td, vp);
- }
- }
-
- if (oldvp != NULL) {
- if (nextvp == NULL) {
- if ((td->td_proc->p_flag & P_SUGID) != 0) {
- error = EFAULT;
- vn_close(oldvp, FWRITE, td->td_ucred, td);
- } else {
- nextvp = oldvp;
- error = vn_lock(nextvp, LK_EXCLUSIVE);
- if (error != 0) {
- vn_close(nextvp, FWRITE, td->td_ucred,
- td);
- nextvp = NULL;
- }
- }
- } else {
- vn_close(oldvp, FWRITE, td->td_ucred, td);
- }
- }
- if (error != 0) {
- if (nextvp != NULL)
- vnode_close_locked(td, oldvp);
- } else {
- *vpp = nextvp;
- }
-
- return (error);
-}
-
-/*
- * corefile_open(comm, uid, pid, td, compress, vpp, namep)
- * Expand the name described in corefilename, using name, uid, and pid
- * and open/create core file.
- * corefilename is a printf-like string, with three format specifiers:
- * %N name of process ("name")
- * %P process id (pid)
- * %U user id (uid)
- * For example, "%N.core" is the default; they can be disabled completely
- * by using "/dev/null", or all core files can be stored in "/cores/%U/%N-%P".
- * This is controlled by the sysctl variable kern.corefile (see above).
- */
-static int
-corefile_open(const char *comm, uid_t uid, pid_t pid, struct thread *td,
- int compress, int signum, struct vnode **vpp, char **namep)
-{
- struct sbuf sb;
- struct nameidata nd;
- const char *format;
- char *hostname, *name;
- int cmode, error, flags, i, indexpos, indexlen, oflags, ncores;
-
- hostname = NULL;
- format = corefilename;
- name = malloc(MAXPATHLEN, M_TEMP, M_WAITOK | M_ZERO);
- indexlen = 0;
- indexpos = -1;
- ncores = num_cores;
- (void)sbuf_new(&sb, name, MAXPATHLEN, SBUF_FIXEDLEN);
- sx_slock(&corefilename_lock);
- for (i = 0; format[i] != '\0'; i++) {
- switch (format[i]) {
- case '%': /* Format character */
- i++;
- switch (format[i]) {
- case '%':
- sbuf_putc(&sb, '%');
- break;
- case 'H': /* hostname */
- if (hostname == NULL) {
- hostname = malloc(MAXHOSTNAMELEN,
- M_TEMP, M_WAITOK);
- }
- getcredhostname(td->td_ucred, hostname,
- MAXHOSTNAMELEN);
- sbuf_cat(&sb, hostname);
- break;
- case 'I': /* autoincrementing index */
- if (indexpos != -1) {
- sbuf_printf(&sb, "%%I");
- break;
- }
-
- indexpos = sbuf_len(&sb);
- sbuf_printf(&sb, "%u", ncores - 1);
- indexlen = sbuf_len(&sb) - indexpos;
- break;
- case 'N': /* process name */
- sbuf_printf(&sb, "%s", comm);
- break;
- case 'P': /* process id */
- sbuf_printf(&sb, "%u", pid);
- break;
- case 'S': /* signal number */
- sbuf_printf(&sb, "%i", signum);
- break;
- case 'U': /* user id */
- sbuf_printf(&sb, "%u", uid);
- break;
- default:
- log(LOG_ERR,
- "Unknown format character %c in "
- "corename `%s'\n", format[i], format);
- break;
- }
- break;
- default:
- sbuf_putc(&sb, format[i]);
- break;
- }
- }
- sx_sunlock(&corefilename_lock);
- free(hostname, M_TEMP);
- if (compress == COMPRESS_GZIP)
- sbuf_cat(&sb, GZIP_SUFFIX);
- else if (compress == COMPRESS_ZSTD)
- sbuf_cat(&sb, ZSTD_SUFFIX);
- if (sbuf_error(&sb) != 0) {
- log(LOG_ERR, "pid %ld (%s), uid (%lu): corename is too "
- "long\n", (long)pid, comm, (u_long)uid);
- sbuf_delete(&sb);
- free(name, M_TEMP);
- return (ENOMEM);
- }
- sbuf_finish(&sb);
- sbuf_delete(&sb);
-
- if (indexpos != -1) {
- error = corefile_open_last(td, name, indexpos, indexlen, ncores,
- vpp);
- if (error != 0) {
- log(LOG_ERR,
- "pid %d (%s), uid (%u): Path `%s' failed "
- "on initial open test, error = %d\n",
- pid, comm, uid, name, error);
- }
- } else {
- cmode = S_IRUSR | S_IWUSR;
- oflags = VN_OPEN_NOAUDIT | VN_OPEN_NAMECACHE |
- (capmode_coredump ? VN_OPEN_NOCAPCHECK : 0);
- flags = O_CREAT | FWRITE | O_NOFOLLOW;
- if ((td->td_proc->p_flag & P_SUGID) != 0)
- flags |= O_EXCL;
-
- NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, name);
- error = vn_open_cred(&nd, &flags, cmode, oflags, td->td_ucred,
- NULL);
- if (error == 0) {
- *vpp = nd.ni_vp;
- NDFREE_PNBUF(&nd);
- }
- }
-
- if (error != 0) {
-#ifdef AUDIT
- audit_proc_coredump(td, name, error);
-#endif
- free(name, M_TEMP);
- return (error);
- }
- *namep = name;
- return (0);
-}
-
-/*
- * Dump a process' core. The main routine does some
- * policy checking, and creates the name of the coredump;
- * then it passes on a vnode and a size limit to the process-specific
- * coredump routine if there is one; if there _is not_ one, it returns
- * ENOSYS; otherwise it returns the error from the process-specific routine.
- */
-
-static int
-coredump(struct thread *td)
-{
- struct proc *p = td->td_proc;
- struct ucred *cred = td->td_ucred;
- struct vnode *vp;
- struct flock lf;
- struct vattr vattr;
- size_t fullpathsize;
- int error, error1, jid, locked, ppid, sig;
- char *name; /* name of corefile */
- void *rl_cookie;
- off_t limit;
- char *fullpath, *freepath = NULL;
- struct sbuf *sb;
-
- PROC_LOCK_ASSERT(p, MA_OWNED);
- MPASS((p->p_flag & P_HADTHREADS) == 0 || p->p_singlethread == td);
-
- if (!do_coredump || (!sugid_coredump && (p->p_flag & P_SUGID) != 0) ||
- (p->p_flag2 & P2_NOTRACE) != 0) {
- PROC_UNLOCK(p);
- return (EFAULT);
- }
-
- /*
- * Note that the bulk of limit checking is done after
- * the corefile is created. The exception is if the limit
- * for corefiles is 0, in which case we don't bother
- * creating the corefile at all. This layout means that
- * a corefile is truncated instead of not being created,
- * if it is larger than the limit.
- */
- limit = (off_t)lim_cur(td, RLIMIT_CORE);
- if (limit == 0 || racct_get_available(p, RACCT_CORE) == 0) {
- PROC_UNLOCK(p);
- return (EFBIG);
- }
-
- ppid = p->p_oppid;
- sig = p->p_sig;
- jid = p->p_ucred->cr_prison->pr_id;
- PROC_UNLOCK(p);
-
- error = corefile_open(p->p_comm, cred->cr_uid, p->p_pid, td,
- compress_user_cores, p->p_sig, &vp, &name);
- if (error != 0)
- return (error);
-
- /*
- * Don't dump to non-regular files or files with links.
- * Do not dump into system files. Effective user must own the corefile.
- */
- if (vp->v_type != VREG || VOP_GETATTR(vp, &vattr, cred) != 0 ||
- vattr.va_nlink != 1 || (vp->v_vflag & VV_SYSTEM) != 0 ||
- vattr.va_uid != cred->cr_uid) {
- VOP_UNLOCK(vp);
- error = EFAULT;
- goto out;
- }
-
- VOP_UNLOCK(vp);
-
- /* Postpone other writers, including core dumps of other processes. */
- rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
-
- lf.l_whence = SEEK_SET;
- lf.l_start = 0;
- lf.l_len = 0;
- lf.l_type = F_WRLCK;
- locked = (VOP_ADVLOCK(vp, (caddr_t)p, F_SETLK, &lf, F_FLOCK) == 0);
-
- VATTR_NULL(&vattr);
- vattr.va_size = 0;
- if (set_core_nodump_flag)
- vattr.va_flags = UF_NODUMP;
- vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
- VOP_SETATTR(vp, &vattr, cred);
- VOP_UNLOCK(vp);
- PROC_LOCK(p);
- p->p_acflag |= ACORE;
- PROC_UNLOCK(p);
-
- if (p->p_sysent->sv_coredump != NULL) {
- error = p->p_sysent->sv_coredump(td, vp, limit, 0);
- } else {
- error = ENOSYS;
- }
-
- if (locked) {
- lf.l_type = F_UNLCK;
- VOP_ADVLOCK(vp, (caddr_t)p, F_UNLCK, &lf, F_FLOCK);
- }
- vn_rangelock_unlock(vp, rl_cookie);
-
- /*
- * Notify the userland helper that a process triggered a core dump.
- * This allows the helper to run an automated debugging session.
- */
- if (error != 0 || coredump_devctl == 0)
- goto out;
- sb = sbuf_new_auto();
- if (vn_fullpath_global(p->p_textvp, &fullpath, &freepath) != 0)
- goto out2;
- sbuf_cat(sb, "comm=\"");
- devctl_safe_quote_sb(sb, fullpath);
- free(freepath, M_TEMP);
- sbuf_cat(sb, "\" core=\"");
-
- /*
- * We can't lookup core file vp directly. When we're replacing a core, and
- * other random times, we flush the name cache, so it will fail. Instead,
- * if the path of the core is relative, add the current dir in front if it.
- */
- if (name[0] != '/') {
- fullpathsize = MAXPATHLEN;
- freepath = malloc(fullpathsize, M_TEMP, M_WAITOK);
- if (vn_getcwd(freepath, &fullpath, &fullpathsize) != 0) {
- free(freepath, M_TEMP);
- goto out2;
- }
- devctl_safe_quote_sb(sb, fullpath);
- free(freepath, M_TEMP);
- sbuf_putc(sb, '/');
- }
- devctl_safe_quote_sb(sb, name);
- sbuf_putc(sb, '"');
-
- sbuf_printf(sb, " jid=%d pid=%d ppid=%d signo=%d",
- jid, p->p_pid, ppid, sig);
- if (sbuf_finish(sb) == 0)
- devctl_notify("kernel", "signal", "coredump", sbuf_data(sb));
-out2:
- sbuf_delete(sb);
-out:
- error1 = vn_close(vp, FWRITE, cred, td);
- if (error == 0)
- error = error1;
-#ifdef AUDIT
- audit_proc_coredump(td, name, error);
-#endif
- free(name, M_TEMP);
- return (error);
-}
-
/*
* Nonexistent system call-- signal process (may want to handle it). Flag
* error in case process won't see signal immediately (blocked or ignored).
diff --git a/sys/kern/kern_sysctl.c b/sys/kern/kern_sysctl.c
index 46226cc31980..25da134661e9 100644
--- a/sys/kern/kern_sysctl.c
+++ b/sys/kern/kern_sysctl.c
@@ -2368,7 +2368,7 @@ sysctl_root(SYSCTL_HANDLER_ARGS)
priv = PRIV_SYSCTL_WRITEJAIL;
#ifdef VIMAGE
else if ((oid->oid_kind & CTLFLAG_VNET) &&
- prison_owns_vnet(req->td->td_ucred))
+ prison_owns_vnet(req->td->td_ucred->cr_prison))
priv = PRIV_SYSCTL_WRITEJAIL;
#endif
else
diff --git a/sys/kern/kern_ucoredump.c b/sys/kern/kern_ucoredump.c
new file mode 100644
index 000000000000..d425596b5f24
--- /dev/null
+++ b/sys/kern/kern_ucoredump.c
@@ -0,0 +1,299 @@
+/*
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Copyright (c) 1982, 1986, 1989, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/param.h>
+#include <sys/acct.h>
+#include <sys/compressor.h>
+#include <sys/jail.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/signalvar.h>
+#include <sys/racct.h>
+#include <sys/resourcevar.h>
+#include <sys/rmlock.h>
+#include <sys/sysctl.h>
+#include <sys/syslog.h>
+#include <sys/ucoredump.h>
+#include <sys/wait.h>
+
+static int coredump(struct thread *td, const char **);
+
+int compress_user_cores = 0;
+
+static SLIST_HEAD(, coredumper) coredumpers =
+ SLIST_HEAD_INITIALIZER(coredumpers);
+static struct rmlock coredump_rmlock;
+RM_SYSINIT(coredump_lock, &coredump_rmlock, "coredump_lock");
+
+static int kern_logsigexit = 1;
+SYSCTL_INT(_kern, KERN_LOGSIGEXIT, logsigexit, CTLFLAG_RW,
+ &kern_logsigexit, 0,
+ "Log processes quitting on abnormal signals to syslog(3)");
+
+static int sugid_coredump;
+SYSCTL_INT(_kern, OID_AUTO, sugid_coredump, CTLFLAG_RWTUN,
+ &sugid_coredump, 0, "Allow setuid and setgid processes to dump core");
+
+static int do_coredump = 1;
+SYSCTL_INT(_kern, OID_AUTO, coredump, CTLFLAG_RW,
+ &do_coredump, 0, "Enable/Disable coredumps");
+
+static int
+sysctl_compress_user_cores(SYSCTL_HANDLER_ARGS)
+{
+ int error, val;
+
+ val = compress_user_cores;
+ error = sysctl_handle_int(oidp, &val, 0, req);
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+ if (val != 0 && !compressor_avail(val))
+ return (EINVAL);
+ compress_user_cores = val;
+ return (error);
+}
+SYSCTL_PROC(_kern, OID_AUTO, compress_user_cores,
+ CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_NEEDGIANT, 0, sizeof(int),
+ sysctl_compress_user_cores, "I",
+ "Enable compression of user corefiles ("
+ __XSTRING(COMPRESS_GZIP) " = gzip, "
+ __XSTRING(COMPRESS_ZSTD) " = zstd)");
+
+int compress_user_cores_level = 6;
+SYSCTL_INT(_kern, OID_AUTO, compress_user_cores_level, CTLFLAG_RWTUN,
+ &compress_user_cores_level, 0,
+ "Corefile compression level");
+
+void
+coredumper_register(struct coredumper *cd)
+{
+
+ blockcount_init(&cd->cd_refcount);
+ rm_wlock(&coredump_rmlock);
+ SLIST_INSERT_HEAD(&coredumpers, cd, cd_entry);
+ rm_wunlock(&coredump_rmlock);
+}
+
+void
+coredumper_unregister(struct coredumper *cd)
+{
+
+ rm_wlock(&coredump_rmlock);
+ SLIST_REMOVE(&coredumpers, cd, coredumper, cd_entry);
+ rm_wunlock(&coredump_rmlock);
+
+ /*
+ * Wait for any in-process coredumps to finish before returning.
+ */
+ blockcount_wait(&cd->cd_refcount, NULL, "dumpwait", 0);
+}
+
+/*
+ * Force the current process to exit with the specified signal, dumping core
+ * if appropriate. We bypass the normal tests for masked and caught signals,
+ * allowing unrecoverable failures to terminate the process without changing
+ * signal state. Mark the accounting record with the signal termination.
+ * If dumping core, save the signal number for the debugger. Calls exit and
+ * does not return.
+ */
+void
+sigexit(struct thread *td, int sig)
+{
+ struct proc *p = td->td_proc;
+ int rv;
+ bool logexit;
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ proc_set_p2_wexit(p);
+
+ p->p_acflag |= AXSIG;
+ if ((p->p_flag2 & P2_LOGSIGEXIT_CTL) == 0)
+ logexit = kern_logsigexit != 0;
+ else
+ logexit = (p->p_flag2 & P2_LOGSIGEXIT_ENABLE) != 0;
+
+ /*
+ * We must be single-threading to generate a core dump. This
+ * ensures that the registers in the core file are up-to-date.
+ * Also, the ELF dump handler assumes that the thread list doesn't
+ * change out from under it.
+ *
+ * XXX If another thread attempts to single-thread before us
+ * (e.g. via fork()), we won't get a dump at all.
+ */
+ if (sig_do_core(sig) && thread_single(p, SINGLE_NO_EXIT) == 0) {
+ const char *err = NULL;
+
+ p->p_sig = sig;
+ /*
+ * Log signals which would cause core dumps
+ * (Log as LOG_INFO to appease those who don't want
+ * these messages.)
+ * XXX : Todo, as well as euid, write out ruid too
+ * Note that coredump() drops proc lock.
+ */
+ rv = coredump(td, &err);
+ if (rv == 0) {
+ MPASS(err == NULL);
+ sig |= WCOREFLAG;
+ } else if (err == NULL) {
+ switch (rv) {
+ case EFAULT:
+ err = "bad address";
+ break;
+ case EINVAL:
+ err = "invalild argument";
+ break;
+ case EFBIG:
+ err = "too large";
+ break;
+ default:
+ err = "other error";
+ break;
+ }
+ }
+ if (logexit)
+ log(LOG_INFO,
+ "pid %d (%s), jid %d, uid %d: exited on "
+ "signal %d (%s%s)\n", p->p_pid, p->p_comm,
+ p->p_ucred->cr_prison->pr_id,
+ td->td_ucred->cr_uid, sig &~ WCOREFLAG,
+ err != NULL ? "no core dump - " : "core dumped",
+ err != NULL ? err : "");
+ } else
+ PROC_UNLOCK(p);
+ exit1(td, 0, sig);
+ /* NOTREACHED */
+}
+
+
+/*
+ * Dump a process' core. The main routine does some
+ * policy checking, and creates the name of the coredump;
+ * then it passes on a vnode and a size limit to the process-specific
+ * coredump routine if there is one; if there _is not_ one, it returns
+ * ENOSYS; otherwise it returns the error from the process-specific routine.
+ */
+static int
+coredump(struct thread *td, const char **errmsg)
+{
+ struct coredumper *iter, *chosen;
+ struct proc *p = td->td_proc;
+ struct rm_priotracker tracker;
+ off_t limit;
+ int error, priority;
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ MPASS((p->p_flag & P_HADTHREADS) == 0 || p->p_singlethread == td);
+
+ if (!do_coredump || (!sugid_coredump && (p->p_flag & P_SUGID) != 0) ||
+ (p->p_flag2 & P2_NOTRACE) != 0) {
+ PROC_UNLOCK(p);
+
+ if (!do_coredump)
+ *errmsg = "denied by kern.coredump";
+ else if ((p->p_flag2 & P2_NOTRACE) != 0)
+ *errmsg = "process has trace disabled";
+ else
+ *errmsg = "sugid process denied by kern.sugid_coredump";
+ return (EFAULT);
+ }
+
+ /*
+ * Note that the bulk of limit checking is done after
+ * the corefile is created. The exception is if the limit
+ * for corefiles is 0, in which case we don't bother
+ * creating the corefile at all. This layout means that
+ * a corefile is truncated instead of not being created,
+ * if it is larger than the limit.
+ */
+ limit = (off_t)lim_cur(td, RLIMIT_CORE);
+ if (limit == 0 || racct_get_available(p, RACCT_CORE) == 0) {
+ PROC_UNLOCK(p);
+ *errmsg = "coredumpsize limit is 0";
+ return (EFBIG);
+ }
+
+ rm_rlock(&coredump_rmlock, &tracker);
+ priority = -1;
+ chosen = NULL;
+ SLIST_FOREACH(iter, &coredumpers, cd_entry) {
+ if (iter->cd_probe == NULL) {
+ /*
+ * If we haven't found anything of a higher priority
+ * yet, we'll call this a GENERIC. Ideally, we want
+ * coredumper modules to include a probe function.
+ */
+ if (priority < 0) {
+ priority = COREDUMPER_GENERIC;
+ chosen = iter;
+ }
+
+ continue;
+ }
+
+ error = (*iter->cd_probe)(td);
+ if (error < 0)
+ continue;
+
+ /*
+ * Higher priority than previous options.
+ */
+ if (error > priority) {
+ priority = error;
+ chosen = iter;
+ }
+ }
+
+ /*
+ * Acquire our refcount before we drop the lock so that
+ * coredumper_unregister() can safely assume that the refcount will only
+ * go down once it's dropped the rmlock.
+ */
+ blockcount_acquire(&chosen->cd_refcount, 1);
+ rm_runlock(&coredump_rmlock, &tracker);
+
+ /* Currently, we always have the vnode dumper built in. */
+ MPASS(chosen != NULL);
+ error = ((*chosen->cd_handle)(td, limit));
+ PROC_LOCK_ASSERT(p, MA_NOTOWNED);
+
+ blockcount_release(&chosen->cd_refcount, 1);
+
+ return (error);
+}
diff --git a/sys/kern/subr_asan.c b/sys/kern/subr_asan.c
index 0edb631d1475..464efda1e91a 100644
--- a/sys/kern/subr_asan.c
+++ b/sys/kern/subr_asan.c
@@ -263,8 +263,7 @@ kasan_mark(const void *addr, size_t size, size_t redzsize, uint8_t code)
if (__predict_false(!kasan_enabled))
return;
- if ((vm_offset_t)addr >= DMAP_MIN_ADDRESS &&
- (vm_offset_t)addr < DMAP_MAX_ADDRESS)
+ if (kasan_md_unsupported((vm_offset_t)addr))
return;
KASSERT((vm_offset_t)addr >= VM_MIN_KERNEL_ADDRESS &&
diff --git a/sys/kern/subr_compressor.c b/sys/kern/subr_compressor.c
index 280264881241..5d59622e0455 100644
--- a/sys/kern/subr_compressor.c
+++ b/sys/kern/subr_compressor.c
@@ -538,6 +538,12 @@ compressor_init(compressor_cb_t cb, int format, size_t maxiosize, int level,
return (s);
}
+int
+compressor_format(const struct compressor *stream)
+{
+ return (stream->methods->format);
+}
+
void
compressor_reset(struct compressor *stream)
{
diff --git a/sys/kern/subr_trap.c b/sys/kern/subr_trap.c
index 18388ae5f232..bac7d0080c71 100644
--- a/sys/kern/subr_trap.c
+++ b/sys/kern/subr_trap.c
@@ -338,8 +338,9 @@ ast_handler(struct thread *td, struct trapframe *framep, bool dtor)
td->td_ast = 0;
}
- CTR3(KTR_SYSC, "ast: thread %p (pid %d, %s)", td, td->td_proc->p_pid,
- td->td_proc->p_comm);
+ CTR3(KTR_SYSC, "ast: thread %p (pid %d, %s)", td,
+ td->td_proc == NULL ? -1 : td->td_proc->p_pid,
+ td->td_proc == NULL ? "" : td->td_proc->p_comm);
KASSERT(framep == NULL || TRAPF_USERMODE(framep),
("ast in kernel mode"));
diff --git a/sys/kern/sys_generic.c b/sys/kern/sys_generic.c
index 94e44d888181..5606b36f772f 100644
--- a/sys/kern/sys_generic.c
+++ b/sys/kern/sys_generic.c
@@ -2269,6 +2269,7 @@ exterr_copyout(struct thread *td)
ue.error = 0;
sz = sizeof(ue.error);
} else {
+ ktrexterr(td);
sz = sizeof(ue) - __offsetof(struct uexterror, error);
}
error = copyout(&ue.error, uloc, sz);
@@ -2309,6 +2310,12 @@ sys_exterrctl(struct thread *td, struct exterrctl_args *uap)
return (EINVAL);
td->td_pflags2 &= ~TDP2_UEXTERR;
return (0);
+ case EXTERRCTL_UD:
+ /*
+ * Important: this code must always return EINVAL and never any
+ * extended error, for testing purposes.
+ */
+ /* FALLTHROUGH */
default:
return (EINVAL);
}
@@ -2329,7 +2336,6 @@ exterr_set(int eerror, int category, const char *mmsg, uintptr_t pp1,
td->td_kexterr.p1 = pp1;
td->td_kexterr.p2 = pp2;
td->td_kexterr.src_line = line;
- ktrexterr(td);
}
return (eerror);
}
diff --git a/sys/kern/uipc_ktls.c b/sys/kern/uipc_ktls.c
index ce09042abdac..66ce1b5a081d 100644
--- a/sys/kern/uipc_ktls.c
+++ b/sys/kern/uipc_ktls.c
@@ -1207,7 +1207,7 @@ sb_mark_notready(struct sockbuf *sb)
for (; m != NULL; m = m->m_next) {
KASSERT(m->m_nextpkt == NULL, ("%s: m_nextpkt != NULL",
__func__));
- KASSERT((m->m_flags & M_NOTAVAIL) == 0, ("%s: mbuf not avail",
+ KASSERT((m->m_flags & M_NOTREADY) == 0, ("%s: mbuf not ready",
__func__));
KASSERT(sb->sb_acc >= m->m_len, ("%s: sb_acc < m->m_len",
__func__));
diff --git a/sys/kern/uipc_shm.c b/sys/kern/uipc_shm.c
index 6f83b875a6b6..85fe48ddd466 100644
--- a/sys/kern/uipc_shm.c
+++ b/sys/kern/uipc_shm.c
@@ -1134,10 +1134,10 @@ shm_doremove(struct shm_mapping *map)
int
kern_shm_open2(struct thread *td, const char *userpath, int flags, mode_t mode,
- int shmflags, struct filecaps *fcaps, const char *name __unused)
+ int shmflags, struct filecaps *fcaps, const char *name __unused,
+ struct shmfd *shmfd)
{
struct pwddesc *pdp;
- struct shmfd *shmfd;
struct file *fp;
char *path;
void *rl_cookie;
@@ -1214,23 +1214,41 @@ kern_shm_open2(struct thread *td, const char *userpath, int flags, mode_t mode,
if (error != 0)
goto outnofp;
- /* A SHM_ANON path pointer creates an anonymous object. */
+ /*
+ * A SHM_ANON path pointer creates an anonymous object. We allow other
+ * parts of the kernel to pre-populate a shmfd and then materialize an
+ * fd for it here as a means to pass data back up to userland. This
+ * doesn't really make sense for named shm objects, but it makes plenty
+ * of sense for anonymous objects.
+ */
if (userpath == SHM_ANON) {
- /* A read-only anonymous object is pointless. */
- if ((flags & O_ACCMODE) == O_RDONLY) {
- error = EINVAL;
- goto out;
- }
- shmfd = shm_alloc(td->td_ucred, cmode, largepage);
- if (shmfd == NULL) {
- error = ENOMEM;
- goto out;
+ if (shmfd != NULL) {
+ shm_hold(shmfd);
+ } else {
+ /*
+ * A read-only anonymous object is pointless, unless it
+ * was pre-populated by the kernel with the expectation
+ * that a shmfd would later be created for userland to
+ * access it through.
+ */
+ if ((flags & O_ACCMODE) == O_RDONLY) {
+ error = EINVAL;
+ goto out;
+ }
+ shmfd = shm_alloc(td->td_ucred, cmode, largepage);
+ if (shmfd == NULL) {
+ error = ENOMEM;
+ goto out;
+ }
+
+ shmfd->shm_seals = initial_seals;
+ shmfd->shm_flags = shmflags;
}
- shmfd->shm_seals = initial_seals;
- shmfd->shm_flags = shmflags;
} else {
fnv = fnv_32_str(path, FNV1_32_INIT);
sx_xlock(&shm_dict_lock);
+
+ MPASS(shmfd == NULL);
shmfd = shm_lookup(path, fnv);
if (shmfd == NULL) {
/* Object does not yet exist, create it if requested. */
@@ -2173,7 +2191,7 @@ kern_shm_open(struct thread *td, const char *path, int flags, mode_t mode,
struct filecaps *caps)
{
- return (kern_shm_open2(td, path, flags, mode, 0, caps, NULL));
+ return (kern_shm_open2(td, path, flags, mode, 0, caps, NULL, NULL));
}
/*
@@ -2191,7 +2209,7 @@ sys_shm_open2(struct thread *td, struct shm_open2_args *uap)
{
return (kern_shm_open2(td, uap->path, uap->flags, uap->mode,
- uap->shmflags, NULL, uap->name));
+ uap->shmflags, NULL, uap->name, NULL));
}
int
diff --git a/sys/kern/uipc_sockbuf.c b/sys/kern/uipc_sockbuf.c
index ec00878cd9a5..745702bd4a4f 100644
--- a/sys/kern/uipc_sockbuf.c
+++ b/sys/kern/uipc_sockbuf.c
@@ -195,14 +195,14 @@ int
sbready(struct sockbuf *sb, struct mbuf *m0, int count)
{
struct mbuf *m;
- u_int blocker;
+ bool blocker;
SOCKBUF_LOCK_ASSERT(sb);
KASSERT(sb->sb_fnrdy != NULL, ("%s: sb %p NULL fnrdy", __func__, sb));
KASSERT(count > 0, ("%s: invalid count %d", __func__, count));
m = m0;
- blocker = (sb->sb_fnrdy == m) ? M_BLOCKED : 0;
+ blocker = (sb->sb_fnrdy == m);
while (count > 0) {
KASSERT(m->m_flags & M_NOTREADY,
@@ -217,8 +217,7 @@ sbready(struct sockbuf *sb, struct mbuf *m0, int count)
m->m_epg_nrdy = 0;
} else
count--;
-
- m->m_flags &= ~(M_NOTREADY | blocker);
+ m->m_flags &= ~M_NOTREADY;
if (blocker)
sb->sb_acc += m->m_len;
m = m->m_next;
@@ -240,12 +239,8 @@ sbready(struct sockbuf *sb, struct mbuf *m0, int count)
}
/* This one was blocking all the queue. */
- for (; m && (m->m_flags & M_NOTREADY) == 0; m = m->m_next) {
- KASSERT(m->m_flags & M_BLOCKED,
- ("%s: m %p !M_BLOCKED", __func__, m));
- m->m_flags &= ~M_BLOCKED;
+ for (; m && (m->m_flags & M_NOTREADY) == 0; m = m->m_next)
sb->sb_acc += m->m_len;
- }
sb->sb_fnrdy = m;
sbready_compress(sb, m0, m);
@@ -269,8 +264,7 @@ sballoc(struct sockbuf *sb, struct mbuf *m)
sb->sb_fnrdy = m;
else
sb->sb_acc += m->m_len;
- } else
- m->m_flags |= M_BLOCKED;
+ }
if (m->m_type != MT_DATA && m->m_type != MT_OOBDATA)
sb->sb_ctl += m->m_len;
@@ -287,29 +281,29 @@ sballoc(struct sockbuf *sb, struct mbuf *m)
void
sbfree(struct sockbuf *sb, struct mbuf *m)
{
+ struct mbuf *n;
#if 0 /* XXX: not yet: soclose() call path comes here w/o lock. */
SOCKBUF_LOCK_ASSERT(sb);
#endif
-
sb->sb_ccc -= m->m_len;
- if (!(m->m_flags & M_NOTAVAIL))
- sb->sb_acc -= m->m_len;
-
if (m == sb->sb_fnrdy) {
- struct mbuf *n;
-
KASSERT(m->m_flags & M_NOTREADY,
("%s: m %p !M_NOTREADY", __func__, m));
n = m->m_next;
while (n != NULL && !(n->m_flags & M_NOTREADY)) {
- n->m_flags &= ~M_BLOCKED;
sb->sb_acc += n->m_len;
n = n->m_next;
}
sb->sb_fnrdy = n;
+ } else {
+ /* Assert that mbuf is not behind sb_fnrdy. */
+ for (n = sb->sb_fnrdy; n != NULL; n = n->m_next)
+ KASSERT(n != m, ("%s: sb %p freeing %p behind sb_fnrdy",
+ __func__, sb, m));
+ sb->sb_acc -= m->m_len;
}
if (m->m_type != MT_DATA && m->m_type != MT_OOBDATA)
@@ -779,6 +773,7 @@ sbsetopt(struct socket *so, struct sockopt *sopt)
* high-water.
*/
*lowat = (cc > *hiwat) ? *hiwat : cc;
+ *flags &= ~SB_AUTOLOWAT;
break;
}
@@ -1128,13 +1123,7 @@ sbcheck(struct sockbuf *sb, const char *file, int line)
}
fnrdy = m;
}
- if (fnrdy) {
- if (!(m->m_flags & M_NOTAVAIL)) {
- printf("sb %p: fnrdy %p, m %p is avail\n",
- sb, sb->sb_fnrdy, m);
- goto fail;
- }
- } else
+ if (fnrdy == NULL)
acc += m->m_len;
ccc += m->m_len;
mbcnt += MSIZE;
@@ -1601,8 +1590,8 @@ sbcut_internal(struct sockbuf *sb, int len)
next = m->m_nextpkt;
}
if (m->m_len > len) {
- KASSERT(!(m->m_flags & M_NOTAVAIL),
- ("%s: m %p M_NOTAVAIL", __func__, m));
+ KASSERT(!(m->m_flags & M_NOTREADY),
+ ("%s: m %p M_NOTREADY", __func__, m));
m->m_len -= len;
m->m_data += len;
sb->sb_ccc -= len;
diff --git a/sys/kern/uipc_socket.c b/sys/kern/uipc_socket.c
index 6c9eb7139cd1..fe2d8d056062 100644
--- a/sys/kern/uipc_socket.c
+++ b/sys/kern/uipc_socket.c
@@ -1211,7 +1211,8 @@ solisten_clone(struct socket *head)
so->so_rcv.sb_timeo = head->sol_sbrcv_timeo;
so->so_snd.sb_timeo = head->sol_sbsnd_timeo;
so->so_rcv.sb_flags = head->sol_sbrcv_flags & SB_AUTOSIZE;
- so->so_snd.sb_flags = head->sol_sbsnd_flags & SB_AUTOSIZE;
+ so->so_snd.sb_flags = head->sol_sbsnd_flags &
+ (SB_AUTOSIZE | SB_AUTOLOWAT);
if ((so->so_proto->pr_flags & PR_SOCKBUF) == 0) {
so->so_snd.sb_mtx = &so->so_snd_mtx;
so->so_rcv.sb_mtx = &so->so_rcv_mtx;
@@ -2988,8 +2989,8 @@ dontblock:
*/
moff = 0;
offset = 0;
- while (m != NULL && !(m->m_flags & M_NOTAVAIL) && uio->uio_resid > 0
- && error == 0) {
+ while (m != NULL && !(m->m_flags & M_NOTREADY) && uio->uio_resid > 0 &&
+ error == 0) {
/*
* If the type of mbuf has changed since the last mbuf
* examined ('type'), end the receive operation.
@@ -3341,7 +3342,7 @@ deliver:
for (m = sb->sb_mb;
m != NULL && m->m_len <= len;
m = m->m_next) {
- KASSERT(!(m->m_flags & M_NOTAVAIL),
+ KASSERT(!(m->m_flags & M_NOTREADY),
("%s: m %p not available", __func__, m));
len -= m->m_len;
uio->uio_resid -= m->m_len;
@@ -4514,6 +4515,9 @@ sokqfilter_generic(struct socket *so, struct knote *kn)
SOCK_BUF_LOCK(so, which);
knlist_add(knl, kn, 1);
sb->sb_flags |= SB_KNOTE;
+ if ((kn->kn_sfflags & NOTE_LOWAT) &&
+ (sb->sb_flags & SB_AUTOLOWAT))
+ sb->sb_flags &= ~SB_AUTOLOWAT;
SOCK_BUF_UNLOCK(so, which);
}
SOCK_UNLOCK(so);
diff --git a/sys/kern/vfs_cache.c b/sys/kern/vfs_cache.c
index 3d455b3874cc..89c1d779f04c 100644
--- a/sys/kern/vfs_cache.c
+++ b/sys/kern/vfs_cache.c
@@ -332,7 +332,8 @@ SDT_PROBE_DEFINE2(vfs, namecache, evict_negative, done, "struct vnode *",
"char *");
SDT_PROBE_DEFINE1(vfs, namecache, symlink, alloc__fail, "size_t");
-SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool");
+SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata *", "int",
+ "enum cache_fpl_status");
SDT_PROBE_DECLARE(vfs, namei, lookup, entry);
SDT_PROBE_DECLARE(vfs, namei, lookup, return);
@@ -6420,15 +6421,11 @@ out:
cache_fpl_smr_assert_not_entered(&fpl);
cache_fpl_assert_status(&fpl);
*status = fpl.status;
- if (SDT_PROBES_ENABLED()) {
- SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
- if (fpl.status == CACHE_FPL_STATUS_HANDLED)
- SDT_PROBE4(vfs, namei, lookup, return, error, ndp->ni_vp, true,
- ndp);
- }
-
+ SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
if (__predict_true(fpl.status == CACHE_FPL_STATUS_HANDLED)) {
MPASS(error != CACHE_FPL_FAILED);
+ SDT_PROBE4(vfs, namei, lookup, return, error, ndp->ni_vp, true,
+ ndp);
if (error != 0) {
cache_fpl_cleanup_cnp(fpl.cnp);
MPASS(fpl.dvp == NULL);
diff --git a/sys/kern/vfs_inotify.c b/sys/kern/vfs_inotify.c
index 2b42228465a4..d3cd0d1f9832 100644
--- a/sys/kern/vfs_inotify.c
+++ b/sys/kern/vfs_inotify.c
@@ -371,7 +371,7 @@ inotify_unlink_watch_locked(struct inotify_softc *sc, struct inotify_watch *watc
TAILQ_REMOVE(&vp->v_pollinfo->vpi_inotify, watch, vlink);
if (TAILQ_EMPTY(&vp->v_pollinfo->vpi_inotify))
- vn_irflag_unset_locked(vp, VIRF_INOTIFY);
+ vn_irflag_unset(vp, VIRF_INOTIFY);
}
/*
@@ -675,7 +675,8 @@ vn_inotify(struct vnode *vp, struct vnode *dvp, struct componentname *cnp,
struct vattr va;
int error;
- error = VOP_GETATTR(vp, &va, cnp->cn_cred);
+ error = VOP_GETATTR(vp, &va,
+ cnp->cn_cred);
if (error == 0 && va.va_nlink != 0)
selfevent = 0;
}
diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c
index 918b256e6c59..29774cf87393 100644
--- a/sys/kern/vfs_subr.c
+++ b/sys/kern/vfs_subr.c
@@ -6533,17 +6533,6 @@ vop_read_pgcache_post(void *ap, int rc)
VFS_KNOTE_UNLOCKED(a->a_vp, NOTE_READ);
}
-void
-vop_readdir_post(void *ap, int rc)
-{
- struct vop_readdir_args *a = ap;
-
- if (!rc) {
- VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ);
- INOTIFY(a->a_vp, IN_ACCESS);
- }
-}
-
static struct knlist fs_knlist;
static void
diff --git a/sys/kern/vfs_syscalls.c b/sys/kern/vfs_syscalls.c
index c71e0d9ee569..25d40a9806cb 100644
--- a/sys/kern/vfs_syscalls.c
+++ b/sys/kern/vfs_syscalls.c
@@ -2253,10 +2253,10 @@ kern_accessat(struct thread *td, int fd, const char *path,
cred = td->td_ucred;
if ((flag & AT_EACCESS) == 0 &&
((cred->cr_uid != cred->cr_ruid ||
- cred->cr_rgid != cred->cr_groups[0]))) {
+ cred->cr_rgid != cred->cr_gid))) {
usecred = crdup(cred);
usecred->cr_uid = cred->cr_ruid;
- usecred->cr_groups[0] = cred->cr_rgid;
+ usecred->cr_gid = cred->cr_rgid;
td->td_ucred = usecred;
} else
usecred = cred;
diff --git a/sys/kern/vnode_if.src b/sys/kern/vnode_if.src
index 38138a4af921..2e63215b2f97 100644
--- a/sys/kern/vnode_if.src
+++ b/sys/kern/vnode_if.src
@@ -242,8 +242,8 @@ vop_read_pgcache {
%% write vp L L L
-%! write pre VOP_WRITE_PRE
-%! write post VOP_WRITE_POST
+%! write pre vop_write_pre
+%! write post vop_write_post
vop_write {
IN struct vnode *vp;
@@ -380,6 +380,7 @@ vop_symlink {
%% readdir vp L L L
+%! readdir pre vop_readdir_pre
%! readdir post vop_readdir_post
vop_readdir {
diff --git a/sys/modules/Makefile b/sys/modules/Makefile
index 9922796f8a1d..99c9ec9dcd01 100644
--- a/sys/modules/Makefile
+++ b/sys/modules/Makefile
@@ -34,6 +34,7 @@ SUBDIR= \
alq \
${_amd_ecc_inject} \
${_amdgpio} \
+ ${_amdsmu} \
${_amdsbwd} \
${_amdsmn} \
${_amdtemp} \
@@ -326,6 +327,7 @@ SUBDIR= \
proto \
pseudofs \
${_pst} \
+ ${_pt} \
pty \
puc \
pwm \
@@ -771,6 +773,7 @@ _acpi= acpi
_aesni= aesni
.endif
_amd_ecc_inject=amd_ecc_inject
+_amdsmu= amdsmu
_amdsbwd= amdsbwd
_amdsmn= amdsmn
_amdtemp= amdtemp
@@ -842,6 +845,7 @@ _iwx= iwx
_ixl= ixl
_nvdimm= nvdimm
_pms= pms
+_pt= pt
_qat= qat
.if ${MK_SOURCELESS_UCODE} != "no"
_qatfw= qatfw
diff --git a/sys/modules/amdsmu/Makefile b/sys/modules/amdsmu/Makefile
new file mode 100644
index 000000000000..752f57173d61
--- /dev/null
+++ b/sys/modules/amdsmu/Makefile
@@ -0,0 +1,14 @@
+# SPDX-License-Identifier: BSD-2-Clause
+#
+# Copyright (c) 2025 The FreeBSD Foundation
+#
+# This software was developed by Aymeric Wibo <obiwac@freebsd.org>
+# under sponsorship from the FreeBSD Foundation.
+
+.PATH: ${SRCTOP}/sys/dev/amdsmu
+
+KMOD= amdsmu
+SRCS= amdsmu.c
+SRCS+= bus_if.h device_if.h pci_if.h
+
+.include <bsd.kmod.mk>
diff --git a/sys/modules/efirt/Makefile b/sys/modules/efirt/Makefile
index 4738996fd4e6..c46484465b68 100644
--- a/sys/modules/efirt/Makefile
+++ b/sys/modules/efirt/Makefile
@@ -9,7 +9,7 @@ SRCS+= device_if.h bus_if.h clock_if.h
DPSRCS+= assym.inc
.if ${MACHINE_CPUARCH} == "amd64"
-SRCS+= opt_hwpmc_hooks.h opt_kstack_pages.h
+SRCS+= opt_acpi.h opt_hwpmc_hooks.h opt_kstack_pages.h
.endif
efirt_support.o: efirt_support.S assym.inc
diff --git a/sys/modules/ice/Makefile b/sys/modules/ice/Makefile
index 91f20193d878..9f9c9f602cda 100644
--- a/sys/modules/ice/Makefile
+++ b/sys/modules/ice/Makefile
@@ -13,6 +13,7 @@ SRCS += opt_inet.h opt_inet6.h opt_rss.h opt_iflib.h
SRCS += ice_lib.c ice_osdep.c ice_resmgr.c ice_strings.c
SRCS += ice_iflib_recovery_txrx.c ice_iflib_txrx.c if_ice_iflib.c
SRCS += ice_fw_logging.c ice_ddp_common.c
+SRCS.PCI_IOV += pci_iov_if.h ice_iov.c ice_vf_mbx.c
# RDMA Client interface
# TODO: Is this the right way to compile this?
diff --git a/sys/modules/pt/Makefile b/sys/modules/pt/Makefile
new file mode 100644
index 000000000000..416b072face9
--- /dev/null
+++ b/sys/modules/pt/Makefile
@@ -0,0 +1,8 @@
+
+.PATH: ${SRCTOP}/sys/amd64/pt
+
+KMOD= pt
+SRCS= pt.c pt.h device_if.h bus_if.h
+SRCS+= opt_hwpmc_hooks.h opt_kstack_pages.h
+
+.include <bsd.kmod.mk>
diff --git a/sys/modules/qlnx/qlnxe/Makefile b/sys/modules/qlnx/qlnxe/Makefile
index 3d8415cf0e57..2a44ae6ddde5 100644
--- a/sys/modules/qlnx/qlnxe/Makefile
+++ b/sys/modules/qlnx/qlnxe/Makefile
@@ -58,6 +58,7 @@ SRCS+=qlnx_rdma.c
SRCS+=qlnx_ioctl.c
SRCS+=qlnx_os.c
+SRCS+=opt_inet.h
SRCS+= ${LINUXKPI_GENSRCS}
diff --git a/sys/net/ethernet.h b/sys/net/ethernet.h
index cf4f75bd0b6c..01485cf26e06 100644
--- a/sys/net/ethernet.h
+++ b/sys/net/ethernet.h
@@ -62,6 +62,8 @@ struct ether_header {
u_char ether_shost[ETHER_ADDR_LEN];
u_short ether_type;
} __packed;
+_Static_assert(sizeof(struct ether_header) == ETHER_HDR_LEN,
+ "size of struct ether_header is wrong");
/*
* Structure of a 48-bit Ethernet address.
@@ -69,6 +71,8 @@ struct ether_header {
struct ether_addr {
u_char octet[ETHER_ADDR_LEN];
} __packed;
+_Static_assert(sizeof(struct ether_addr) == ETHER_ADDR_LEN,
+ "size of struct ether_addr is wrong");
#define ETHER_IS_MULTICAST(addr) (*(addr) & 0x01) /* is address mcast/bcast? */
#define ETHER_IS_IPV6_MULTICAST(addr) \
@@ -112,6 +116,8 @@ struct ether_vlan_header {
uint16_t evl_tag;
uint16_t evl_proto;
} __packed;
+_Static_assert(sizeof(struct ether_vlan_header) == ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN,
+ "size of struct ether_vlan_header is wrong");
#define EVL_VLID_MASK 0x0FFF
#define EVL_PRI_MASK 0xE000
diff --git a/sys/net/if_bridge.c b/sys/net/if_bridge.c
index 5b3ee740d75e..0a35fb4095fb 100644
--- a/sys/net/if_bridge.c
+++ b/sys/net/if_bridge.c
@@ -76,31 +76,34 @@
* heterogeneous bridges).
*/
-#include <sys/cdefs.h>
#include "opt_inet.h"
#include "opt_inet6.h"
+#define EXTERR_CATEGORY EXTERR_CAT_BRIDGE
+
#include <sys/param.h>
+#include <sys/ctype.h> /* string functions */
#include <sys/eventhandler.h>
-#include <sys/mbuf.h>
+#include <sys/exterrvar.h>
+#include <sys/jail.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/module.h>
+#include <sys/mutex.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
#include <sys/protosw.h>
+#include <sys/random.h>
#include <sys/systm.h>
-#include <sys/jail.h>
-#include <sys/time.h>
#include <sys/socket.h> /* for net/if.h */
#include <sys/sockio.h>
-#include <sys/ctype.h> /* string functions */
-#include <sys/kernel.h>
-#include <sys/random.h>
#include <sys/syslog.h>
#include <sys/sysctl.h>
+#include <sys/time.h>
+
#include <vm/uma.h>
-#include <sys/module.h>
-#include <sys/priv.h>
-#include <sys/proc.h>
-#include <sys/lock.h>
-#include <sys/mutex.h>
#include <net/bpf.h>
#include <net/if.h>
@@ -254,8 +257,8 @@ struct bridge_iflist {
uint32_t bif_addrcnt; /* cur. # of addresses */
uint32_t bif_addrexceeded;/* # of address violations */
struct epoch_context bif_epoch_ctx;
- ether_vlanid_t bif_untagged; /* untagged vlan id */
- ifbvlan_set_t bif_vlan_set; /* allowed tagged vlans */
+ ether_vlanid_t bif_pvid; /* port vlan id */
+ ifbvlan_set_t bif_vlan_set; /* if allowed tagged vlans */
};
/*
@@ -404,7 +407,7 @@ static int bridge_ioctl_sma(struct bridge_softc *, void *);
static int bridge_ioctl_sifprio(struct bridge_softc *, void *);
static int bridge_ioctl_sifcost(struct bridge_softc *, void *);
static int bridge_ioctl_sifmaxaddr(struct bridge_softc *, void *);
-static int bridge_ioctl_sifuntagged(struct bridge_softc *, void *);
+static int bridge_ioctl_sifpvid(struct bridge_softc *, void *);
static int bridge_ioctl_sifvlanset(struct bridge_softc *, void *);
static int bridge_ioctl_gifvlanset(struct bridge_softc *, void *);
static int bridge_ioctl_addspan(struct bridge_softc *, void *);
@@ -625,7 +628,7 @@ static const struct bridge_control bridge_control_table[] = {
{ bridge_ioctl_sifmaxaddr, sizeof(struct ifbreq),
BC_F_COPYIN|BC_F_SUSER },
- { bridge_ioctl_sifuntagged, sizeof(struct ifbreq),
+ { bridge_ioctl_sifpvid, sizeof(struct ifbreq),
BC_F_COPYIN|BC_F_SUSER },
{ bridge_ioctl_sifvlanset, sizeof(struct ifbif_vlan_req),
@@ -986,31 +989,37 @@ bridge_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
case SIOCGDRVSPEC:
case SIOCSDRVSPEC:
if (ifd->ifd_cmd >= bridge_control_table_size) {
- error = EINVAL;
+ error = EXTERROR(EINVAL, "Invalid control command");
break;
}
bc = &bridge_control_table[ifd->ifd_cmd];
if (cmd == SIOCGDRVSPEC &&
(bc->bc_flags & BC_F_COPYOUT) == 0) {
- error = EINVAL;
+ error = EXTERROR(EINVAL,
+ "Inappropriate ioctl for command "
+ "(expected SIOCSDRVSPEC)");
break;
}
else if (cmd == SIOCSDRVSPEC &&
(bc->bc_flags & BC_F_COPYOUT) != 0) {
- error = EINVAL;
+ error = EXTERROR(EINVAL,
+ "Inappropriate ioctl for command "
+ "(expected SIOCGDRVSPEC)");
break;
}
if (bc->bc_flags & BC_F_SUSER) {
error = priv_check(td, PRIV_NET_BRIDGE);
- if (error)
+ if (error) {
+ EXTERROR(error, "PRIV_NET_BRIDGE required");
break;
+ }
}
if (ifd->ifd_len != bc->bc_argsize ||
ifd->ifd_len > sizeof(args)) {
- error = EINVAL;
+ error = EXTERROR(EINVAL, "Invalid argument size");
break;
}
@@ -1062,7 +1071,8 @@ bridge_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
oldmtu = sc->sc_ifp->if_mtu;
if (ifr->ifr_mtu < IF_MINMTU) {
- error = EINVAL;
+ error = EXTERROR(EINVAL,
+ "Requested MTU is lower than IF_MINMTU");
break;
}
if (CK_LIST_EMPTY(&sc->sc_iflist)) {
@@ -1088,6 +1098,8 @@ bridge_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
(*bif->bif_ifp->if_ioctl)(bif->bif_ifp,
SIOCSIFMTU, (caddr_t)ifr);
}
+ EXTERROR(error,
+ "Failed to set MTU on member interface");
} else {
sc->sc_ifp->if_mtu = ifr->ifr_mtu;
}
@@ -1125,14 +1137,14 @@ bridge_mutecaps(struct bridge_softc *sc)
mask = BRIDGE_IFCAPS_MASK;
CK_LIST_FOREACH(bif, &sc->sc_iflist, bif_next) {
- /* Every member must support it or its disabled */
+ /* Every member must support it or it's disabled */
mask &= bif->bif_savedcaps;
}
CK_LIST_FOREACH(bif, &sc->sc_iflist, bif_next) {
enabled = bif->bif_ifp->if_capenable;
enabled &= ~BRIDGE_IFCAPS_STRIP;
- /* strip off mask bits and enable them again if allowed */
+ /* Strip off mask bits and enable them again if allowed */
enabled &= ~BRIDGE_IFCAPS_MASK;
enabled |= mask;
bridge_set_ifcap(sc, bif, enabled);
@@ -1282,7 +1294,7 @@ bridge_delete_member(struct bridge_softc *sc, struct bridge_iflist *bif,
#endif
break;
}
- /* reneable any interface capabilities */
+ /* Re-enable any interface capabilities */
bridge_set_ifcap(sc, bif, bif->bif_savedcaps);
}
bstp_destroy(&bif->bif_stp); /* prepare to free */
@@ -1318,21 +1330,48 @@ bridge_ioctl_add(struct bridge_softc *sc, void *arg)
ifs = ifunit(req->ifbr_ifsname);
if (ifs == NULL)
- return (ENOENT);
+ return (EXTERROR(ENOENT, "No such interface",
+ req->ifbr_ifsname));
if (ifs->if_ioctl == NULL) /* must be supported */
- return (EINVAL);
+ return (EXTERROR(EINVAL, "Interface must support ioctl(2)"));
+
+ /*
+ * If the new interface is a vlan(4), it could be a bridge SVI.
+ * Don't allow such things to be added to bridges.
+ */
+ if (ifs->if_type == IFT_L2VLAN) {
+ struct ifnet *parent;
+ struct epoch_tracker et;
+ bool is_bridge;
+
+ /*
+ * Entering NET_EPOCH with BRIDGE_LOCK held, but this is okay
+ * since we don't sleep here.
+ */
+ NET_EPOCH_ENTER(et);
+ parent = VLAN_TRUNKDEV(ifs);
+ is_bridge = (parent != NULL && parent->if_type == IFT_BRIDGE);
+ NET_EPOCH_EXIT(et);
+
+ if (is_bridge)
+ return (EXTERROR(EINVAL,
+ "Bridge SVI cannot be added to a bridge"));
+ }
/* If it's in the span list, it can't be a member. */
CK_LIST_FOREACH(bif, &sc->sc_spanlist, bif_next)
if (ifs == bif->bif_ifp)
- return (EBUSY);
+ return (EXTERROR(EBUSY,
+ "Span interface cannot be a member"));
if (ifs->if_bridge) {
struct bridge_iflist *sbif = ifs->if_bridge;
if (sbif->bif_sc == sc)
- return (EEXIST);
+ return (EXTERROR(EEXIST,
+ "Interface is already a member of this bridge"));
- return (EBUSY);
+ return (EXTERROR(EBUSY,
+ "Interface is already a member of another bridge"));
}
switch (ifs->if_type) {
@@ -1342,7 +1381,7 @@ bridge_ioctl_add(struct bridge_softc *sc, void *arg)
/* permitted interface types */
break;
default:
- return (EINVAL);
+ return (EXTERROR(EINVAL, "Unsupported interface type"));
}
#ifdef INET6
@@ -1394,11 +1433,15 @@ bridge_ioctl_add(struct bridge_softc *sc, void *arg)
CK_STAILQ_FOREACH(ifa, &ifs->if_addrhead, ifa_link) {
#ifdef INET
if (ifa->ifa_addr->sa_family == AF_INET)
- return (EINVAL);
+ return (EXTERROR(EINVAL,
+ "Member interface may not have "
+ "an IPv4 address configured"));
#endif
#ifdef INET6
if (ifa->ifa_addr->sa_family == AF_INET6)
- return (EINVAL);
+ return (EXTERROR(EINVAL,
+ "Member interface may not have "
+ "an IPv6 address configured"));
#endif
}
}
@@ -1420,7 +1463,8 @@ bridge_ioctl_add(struct bridge_softc *sc, void *arg)
" new member %s\n", sc->sc_ifp->if_xname,
ifr.ifr_mtu,
ifs->if_xname);
- return (EINVAL);
+ return (EXTERROR(EINVAL,
+ "Failed to set MTU on new member"));
}
}
@@ -1482,7 +1526,7 @@ bridge_ioctl_del(struct bridge_softc *sc, void *arg)
bif = bridge_lookup_member(sc, req->ifbr_ifsname);
if (bif == NULL)
- return (ENOENT);
+ return (EXTERROR(ENOENT, "Interface is not a bridge member"));
bridge_delete_member(sc, bif, 0);
@@ -1498,7 +1542,7 @@ bridge_ioctl_gifflags(struct bridge_softc *sc, void *arg)
bif = bridge_lookup_member(sc, req->ifbr_ifsname);
if (bif == NULL)
- return (ENOENT);
+ return (EXTERROR(ENOENT, "Interface is not a bridge member"));
bp = &bif->bif_stp;
req->ifbr_ifsflags = bif->bif_flags;
@@ -1512,7 +1556,7 @@ bridge_ioctl_gifflags(struct bridge_softc *sc, void *arg)
req->ifbr_addrcnt = bif->bif_addrcnt;
req->ifbr_addrmax = bif->bif_addrmax;
req->ifbr_addrexceeded = bif->bif_addrexceeded;
- req->ifbr_untagged = bif->bif_untagged;
+ req->ifbr_pvid = bif->bif_pvid;
/* Copy STP state options as flags */
if (bp->bp_operedge)
@@ -1541,12 +1585,12 @@ bridge_ioctl_sifflags(struct bridge_softc *sc, void *arg)
bif = bridge_lookup_member(sc, req->ifbr_ifsname);
if (bif == NULL)
- return (ENOENT);
+ return (EXTERROR(ENOENT, "Interface is not a bridge member"));
bp = &bif->bif_stp;
if (req->ifbr_ifsflags & IFBIF_SPAN)
/* SPAN is readonly */
- return (EINVAL);
+ return (EXTERROR(EINVAL, "Span interface cannot be modified"));
NET_EPOCH_ENTER(et);
@@ -1555,7 +1599,8 @@ bridge_ioctl_sifflags(struct bridge_softc *sc, void *arg)
error = bstp_enable(&bif->bif_stp);
if (error) {
NET_EPOCH_EXIT(et);
- return (error);
+ return (EXTERROR(error,
+ "Failed to enable STP"));
}
}
} else {
@@ -1724,7 +1769,7 @@ bridge_ioctl_saddr(struct bridge_softc *sc, void *arg)
bif = bridge_lookup_member(sc, req->ifba_ifsname);
if (bif == NULL) {
NET_EPOCH_EXIT(et);
- return (ENOENT);
+ return (EXTERROR(ENOENT, "Interface is not a bridge member"));
}
/* bridge_rtupdate() may acquire the lock. */
@@ -1858,7 +1903,7 @@ bridge_ioctl_sifprio(struct bridge_softc *sc, void *arg)
bif = bridge_lookup_member(sc, req->ifbr_ifsname);
if (bif == NULL)
- return (ENOENT);
+ return (EXTERROR(ENOENT, "Interface is not a bridge member"));
return (bstp_set_port_priority(&bif->bif_stp, req->ifbr_priority));
}
@@ -1871,7 +1916,7 @@ bridge_ioctl_sifcost(struct bridge_softc *sc, void *arg)
bif = bridge_lookup_member(sc, req->ifbr_ifsname);
if (bif == NULL)
- return (ENOENT);
+ return (EXTERROR(ENOENT, "Interface is not a bridge member"));
return (bstp_set_path_cost(&bif->bif_stp, req->ifbr_path_cost));
}
@@ -1884,28 +1929,28 @@ bridge_ioctl_sifmaxaddr(struct bridge_softc *sc, void *arg)
bif = bridge_lookup_member(sc, req->ifbr_ifsname);
if (bif == NULL)
- return (ENOENT);
+ return (EXTERROR(ENOENT, "Interface is not a bridge member"));
bif->bif_addrmax = req->ifbr_addrmax;
return (0);
}
static int
-bridge_ioctl_sifuntagged(struct bridge_softc *sc, void *arg)
+bridge_ioctl_sifpvid(struct bridge_softc *sc, void *arg)
{
struct ifbreq *req = arg;
struct bridge_iflist *bif;
bif = bridge_lookup_member(sc, req->ifbr_ifsname);
if (bif == NULL)
- return (ENOENT);
+ return (EXTERROR(ENOENT, "Interface is not a bridge member"));
- if (req->ifbr_untagged > DOT1Q_VID_MAX)
- return (EINVAL);
+ if (req->ifbr_pvid > DOT1Q_VID_MAX)
+ return (EXTERROR(EINVAL, "Invalid VLAN ID"));
- if (req->ifbr_untagged != DOT1Q_VID_NULL)
+ if (req->ifbr_pvid != DOT1Q_VID_NULL)
bif->bif_flags |= IFBIF_VLANFILTER;
- bif->bif_untagged = req->ifbr_untagged;
+ bif->bif_pvid = req->ifbr_pvid;
return (0);
}
@@ -1917,12 +1962,12 @@ bridge_ioctl_sifvlanset(struct bridge_softc *sc, void *arg)
bif = bridge_lookup_member(sc, req->bv_ifname);
if (bif == NULL)
- return (ENOENT);
+ return (EXTERROR(ENOENT, "Interface is not a bridge member"));
/* Reject invalid VIDs. */
if (BRVLAN_TEST(&req->bv_set, DOT1Q_VID_NULL) ||
BRVLAN_TEST(&req->bv_set, DOT1Q_VID_RSVD_IMPL))
- return (EINVAL);
+ return (EXTERROR(EINVAL, "Invalid VLAN ID in set"));
switch (req->bv_op) {
/* Replace the existing vlan set with the new set */
@@ -1942,7 +1987,8 @@ bridge_ioctl_sifvlanset(struct bridge_softc *sc, void *arg)
/* Invalid or unknown operation */
default:
- return (EINVAL);
+ return (EXTERROR(EINVAL,
+ "Unsupported BRDGSIFVLANSET operation"));
}
/*
@@ -1962,7 +2008,7 @@ bridge_ioctl_gifvlanset(struct bridge_softc *sc, void *arg)
bif = bridge_lookup_member(sc, req->bv_ifname);
if (bif == NULL)
- return (ENOENT);
+ return (EXTERROR(ENOENT, "Interface is not a bridge member"));
BIT_COPY(BRVLAN_SETSIZE, &bif->bif_vlan_set, &req->bv_set);
return (0);
@@ -1977,14 +2023,16 @@ bridge_ioctl_addspan(struct bridge_softc *sc, void *arg)
ifs = ifunit(req->ifbr_ifsname);
if (ifs == NULL)
- return (ENOENT);
+ return (EXTERROR(ENOENT, "No such interface"));
CK_LIST_FOREACH(bif, &sc->sc_spanlist, bif_next)
if (ifs == bif->bif_ifp)
- return (EBUSY);
+ return (EXTERROR(EBUSY,
+ "Interface is already a span port"));
if (ifs->if_bridge != NULL)
- return (EBUSY);
+ return (EXTERROR(EEXIST,
+ "Interface is already a bridge member"));
switch (ifs->if_type) {
case IFT_ETHER:
@@ -1992,7 +2040,7 @@ bridge_ioctl_addspan(struct bridge_softc *sc, void *arg)
case IFT_L2VLAN:
break;
default:
- return (EINVAL);
+ return (EXTERROR(EINVAL, "Unsupported interface type"));
}
bif = malloc(sizeof(*bif), M_DEVBUF, M_NOWAIT|M_ZERO);
@@ -2016,14 +2064,14 @@ bridge_ioctl_delspan(struct bridge_softc *sc, void *arg)
ifs = ifunit(req->ifbr_ifsname);
if (ifs == NULL)
- return (ENOENT);
+ return (EXTERROR(ENOENT, "No such interface"));
CK_LIST_FOREACH(bif, &sc->sc_spanlist, bif_next)
if (ifs == bif->bif_ifp)
break;
if (bif == NULL)
- return (ENOENT);
+ return (EXTERROR(ENOENT, "Interface is not a span port"));
bridge_delete_span(sc, bif);
@@ -2278,8 +2326,8 @@ bridge_enqueue(struct bridge_softc *sc, struct ifnet *dst_ifp, struct mbuf *m,
* the VLAN header.
*/
if ((bif->bif_flags & IFBIF_VLANFILTER) &&
- bif->bif_untagged != DOT1Q_VID_NULL &&
- VLANTAGOF(m) == bif->bif_untagged) {
+ bif->bif_pvid != DOT1Q_VID_NULL &&
+ VLANTAGOF(m) == bif->bif_pvid) {
m->m_flags &= ~M_VLANTAG;
m->m_pkthdr.ether_vtag = 0;
}
@@ -3145,14 +3193,14 @@ bridge_vfilter_in(const struct bridge_iflist *sbif, struct mbuf *m)
* The frame doesn't have a tag. If the interface does not
* have an untagged vlan configured, drop the frame.
*/
- if (sbif->bif_untagged == DOT1Q_VID_NULL)
+ if (sbif->bif_pvid == DOT1Q_VID_NULL)
return (false);
/*
* Otherwise, insert a new tag based on the interface's
* untagged vlan id.
*/
- m->m_pkthdr.ether_vtag = sbif->bif_untagged;
+ m->m_pkthdr.ether_vtag = sbif->bif_pvid;
m->m_flags |= M_VLANTAG;
} else {
/*
@@ -3213,7 +3261,7 @@ bridge_vfilter_out(const struct bridge_iflist *dbif, const struct mbuf *m)
* If the frame's vlan matches the interfaces's untagged vlan,
* allow it.
*/
- if (vlan == dbif->bif_untagged)
+ if (vlan == dbif->bif_pvid)
return (true);
/*
@@ -3244,10 +3292,11 @@ bridge_rtupdate(struct bridge_softc *sc, const uint8_t *dst,
BRIDGE_LOCK_OR_NET_EPOCH_ASSERT(sc);
/* Check the source address is valid and not multicast. */
- if (ETHER_IS_MULTICAST(dst) ||
- (dst[0] == 0 && dst[1] == 0 && dst[2] == 0 &&
- dst[3] == 0 && dst[4] == 0 && dst[5] == 0) != 0)
- return (EINVAL);
+ if (ETHER_IS_MULTICAST(dst))
+ return (EXTERROR(EINVAL, "Multicast address not permitted"));
+ if (dst[0] == 0 && dst[1] == 0 && dst[2] == 0 &&
+ dst[3] == 0 && dst[4] == 0 && dst[5] == 0)
+ return (EXTERROR(EINVAL, "Zero address not permitted"));
/*
* A route for this destination might already exist. If so,
@@ -3266,13 +3315,14 @@ bridge_rtupdate(struct bridge_softc *sc, const uint8_t *dst,
if (sc->sc_brtcnt >= sc->sc_brtmax) {
sc->sc_brtexceeded++;
BRIDGE_RT_UNLOCK(sc);
- return (ENOSPC);
+ return (EXTERROR(ENOSPC, "Address table is full"));
}
/* Check per interface address limits (if enabled) */
if (bif->bif_addrmax && bif->bif_addrcnt >= bif->bif_addrmax) {
bif->bif_addrexceeded++;
BRIDGE_RT_UNLOCK(sc);
- return (ENOSPC);
+ return (EXTERROR(ENOSPC,
+ "Interface address limit exceeded"));
}
/*
@@ -3283,7 +3333,8 @@ bridge_rtupdate(struct bridge_softc *sc, const uint8_t *dst,
brt = uma_zalloc(V_bridge_rtnode_zone, M_NOWAIT | M_ZERO);
if (brt == NULL) {
BRIDGE_RT_UNLOCK(sc);
- return (ENOMEM);
+ return (EXTERROR(ENOMEM,
+ "Cannot allocate address node"));
}
brt->brt_vnet = curvnet;
@@ -3631,7 +3682,7 @@ bridge_rtnode_insert(struct bridge_softc *sc, struct bridge_rtnode *brt)
do {
dir = bridge_rtnode_addr_cmp(brt->brt_addr, lbrt->brt_addr);
if (dir == 0 && brt->brt_vlan == lbrt->brt_vlan)
- return (EEXIST);
+ return (EXTERROR(EEXIST, "Address already exists"));
if (dir > 0) {
CK_LIST_INSERT_BEFORE(lbrt, brt, brt_hash);
goto out;
diff --git a/sys/net/if_bridgevar.h b/sys/net/if_bridgevar.h
index 97b63e3d4416..c458dcc152a0 100644
--- a/sys/net/if_bridgevar.h
+++ b/sys/net/if_bridgevar.h
@@ -124,7 +124,7 @@
#define BRDGSPROTO 28 /* set protocol (ifbrparam) */
#define BRDGSTXHC 29 /* set tx hold count (ifbrparam) */
#define BRDGSIFAMAX 30 /* set max interface addrs (ifbreq) */
-#define BRDGSIFUNTAGGED 31 /* set if untagged vlan */
+#define BRDGSIFPVID 31 /* set if PVID */
#define BRDGSIFVLANSET 32 /* set if vlan set */
#define BRDGGIFVLANSET 33 /* get if vlan set */
@@ -144,7 +144,7 @@ struct ifbreq {
uint32_t ifbr_addrcnt; /* member if addr number */
uint32_t ifbr_addrmax; /* member if addr max */
uint32_t ifbr_addrexceeded; /* member if addr violations */
- ether_vlanid_t ifbr_untagged; /* member if untagged vlan */
+ ether_vlanid_t ifbr_pvid; /* member if PVID */
uint8_t pad[32];
};
diff --git a/sys/net/if_ethersubr.c b/sys/net/if_ethersubr.c
index 7be4dfac23e7..3ae0c01c0efc 100644
--- a/sys/net/if_ethersubr.c
+++ b/sys/net/if_ethersubr.c
@@ -92,11 +92,6 @@
#include <crypto/sha1.h>
-#ifdef CTASSERT
-CTASSERT(sizeof (struct ether_header) == ETHER_ADDR_LEN * 2 + 2);
-CTASSERT(sizeof (struct ether_addr) == ETHER_ADDR_LEN);
-#endif
-
VNET_DEFINE(pfil_head_t, link_pfil_head); /* Packet filter hooks */
/* netgraph node hooks for ng_ether(4) */
diff --git a/sys/net/if_gif.h b/sys/net/if_gif.h
index 3c1846b8f82a..c6692d3dd6bc 100644
--- a/sys/net/if_gif.h
+++ b/sys/net/if_gif.h
@@ -120,7 +120,8 @@ int in6_gif_setopts(struct gif_softc *, u_int);
#define GIFGOPTS _IOWR('i', 150, struct ifreq)
#define GIFSOPTS _IOW('i', 151, struct ifreq)
+#define GIF_NOCLAMP 0x0001
#define GIF_IGNORE_SOURCE 0x0002
-#define GIF_OPTMASK (GIF_IGNORE_SOURCE)
+#define GIF_OPTMASK (GIF_NOCLAMP|GIF_IGNORE_SOURCE)
#endif /* _NET_IF_GIF_H_ */
diff --git a/sys/net/if_lagg.c b/sys/net/if_lagg.c
index 9867a718e148..5b52bfa80e3b 100644
--- a/sys/net/if_lagg.c
+++ b/sys/net/if_lagg.c
@@ -718,6 +718,7 @@ lagg_capabilities(struct lagg_softc *sc)
sc->sc_ifp->if_capenable = ena;
sc->sc_ifp->if_capenable2 = ena2;
sc->sc_ifp->if_hwassist = hwa;
+ (void)if_hw_tsomax_update(sc->sc_ifp, &hw_tsomax);
getmicrotime(&sc->sc_ifp->if_lastchange);
if (sc->sc_ifflags & IFF_DEBUG)
diff --git a/sys/net/if_ovpn.c b/sys/net/if_ovpn.c
index 7bdbc565f4ca..fe3e7bbd7fff 100644
--- a/sys/net/if_ovpn.c
+++ b/sys/net/if_ovpn.c
@@ -34,11 +34,13 @@
#include <sys/epoch.h>
#include <sys/file.h>
#include <sys/filedesc.h>
+#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/module.h>
#include <sys/nv.h>
+#include <sys/osd.h>
#include <sys/priv.h>
#include <sys/protosw.h>
#include <sys/rmlock.h>
@@ -79,7 +81,6 @@
#include "if_ovpn.h"
struct ovpn_kkey_dir {
- int refcount;
uint8_t key[32];
uint8_t keylen;
uint8_t nonce[8];
@@ -132,6 +133,9 @@ struct ovpn_notification {
/* Delete notification */
enum ovpn_del_reason del_reason;
struct ovpn_peer_counters counters;
+
+ /* Float notification */
+ struct sockaddr_storage address;
};
struct ovpn_softc;
@@ -196,6 +200,10 @@ struct ovpn_softc {
struct epoch_context epoch_ctx;
};
+struct ovpn_mtag {
+ struct sockaddr_storage addr;
+};
+
static struct ovpn_kpeer *ovpn_find_peer(struct ovpn_softc *, uint32_t);
static bool ovpn_udp_input(struct mbuf *, int, struct inpcb *,
const struct sockaddr *, void *);
@@ -205,7 +213,10 @@ static int ovpn_encap(struct ovpn_softc *, uint32_t, struct mbuf *);
static int ovpn_get_af(struct mbuf *);
static void ovpn_free_kkey_dir(struct ovpn_kkey_dir *);
static bool ovpn_check_replay(struct ovpn_kkey_dir *, uint32_t);
-static int ovpn_peer_compare(struct ovpn_kpeer *, struct ovpn_kpeer *);
+static int ovpn_peer_compare(const struct ovpn_kpeer *,
+ const struct ovpn_kpeer *);
+static bool ovpn_sockaddr_compare(const struct sockaddr *,
+ const struct sockaddr *);
static RB_PROTOTYPE(ovpn_kpeers, ovpn_kpeer, tree, ovpn_peer_compare);
static RB_GENERATE(ovpn_kpeers, ovpn_kpeer, tree, ovpn_peer_compare);
@@ -278,11 +289,48 @@ SYSCTL_INT(_net_link_openvpn, OID_AUTO, netisr_queue,
"Use netisr_queue() rather than netisr_dispatch().");
static int
-ovpn_peer_compare(struct ovpn_kpeer *a, struct ovpn_kpeer *b)
+ovpn_peer_compare(const struct ovpn_kpeer *a, const struct ovpn_kpeer *b)
{
return (a->peerid - b->peerid);
}
+static bool
+ovpn_sockaddr_compare(const struct sockaddr *a,
+ const struct sockaddr *b)
+{
+ if (a->sa_family != b->sa_family)
+ return (false);
+ MPASS(a->sa_len == b->sa_len);
+
+ switch (a->sa_family) {
+ case AF_INET: {
+ const struct sockaddr_in *a4, *b4;
+
+ a4 = (const struct sockaddr_in *)a;
+ b4 = (const struct sockaddr_in *)b;
+
+ if (a4->sin_port != b4->sin_port)
+ return (false);
+
+ return (a4->sin_addr.s_addr == b4->sin_addr.s_addr);
+ }
+ case AF_INET6: {
+ const struct sockaddr_in6 *a6, *b6;
+
+ a6 = (const struct sockaddr_in6 *)a;
+ b6 = (const struct sockaddr_in6 *)b;
+
+ if (a6->sin6_port != b6->sin6_port)
+ return (false);
+
+ return (memcmp(&a6->sin6_addr, &b6->sin6_addr,
+ sizeof(a6->sin6_addr)) == 0);
+ }
+ default:
+ panic("Unknown address family %d", a->sa_family);
+ }
+}
+
static struct ovpn_kpeer *
ovpn_find_peer(struct ovpn_softc *sc, uint32_t peerid)
{
@@ -304,15 +352,15 @@ ovpn_find_only_peer(struct ovpn_softc *sc)
}
static uint16_t
-ovpn_get_port(struct sockaddr_storage *s)
+ovpn_get_port(const struct sockaddr_storage *s)
{
switch (s->ss_family) {
case AF_INET: {
- struct sockaddr_in *in = (struct sockaddr_in *)s;
+ const struct sockaddr_in *in = (const struct sockaddr_in *)s;
return (in->sin_port);
}
case AF_INET6: {
- struct sockaddr_in6 *in6 = (struct sockaddr_in6 *)s;
+ const struct sockaddr_in6 *in6 = (const struct sockaddr_in6 *)s;
return (in6->sin6_port);
}
default:
@@ -320,6 +368,25 @@ ovpn_get_port(struct sockaddr_storage *s)
}
}
+static void
+ovpn_set_port(struct sockaddr_storage *s, unsigned short port)
+{
+ switch (s->ss_family) {
+ case AF_INET: {
+ struct sockaddr_in *in = (struct sockaddr_in *)s;
+ in->sin_port = port;
+ break;
+ }
+ case AF_INET6: {
+ struct sockaddr_in6 *in6 = (struct sockaddr_in6 *)s;
+ in6->sin6_port = port;
+ break;
+ }
+ default:
+ panic("Unsupported address family %d", s->ss_family);
+ }
+}
+
static int
ovpn_nvlist_to_sockaddr(const nvlist_t *nvl, struct sockaddr_storage *sa)
{
@@ -333,14 +400,16 @@ ovpn_nvlist_to_sockaddr(const nvlist_t *nvl, struct sockaddr_storage *sa)
return (EINVAL);
af = nvlist_get_number(nvl, "af");
-
switch (af) {
#ifdef INET
case AF_INET: {
struct sockaddr_in *in = (struct sockaddr_in *)sa;
size_t len;
const void *addr = nvlist_get_binary(nvl, "address", &len);
+
+ memset(in, 0, sizeof(*in));
in->sin_family = af;
+ in->sin_len = sizeof(*in);
if (len != sizeof(in->sin_addr))
return (EINVAL);
@@ -354,7 +423,10 @@ ovpn_nvlist_to_sockaddr(const nvlist_t *nvl, struct sockaddr_storage *sa)
struct sockaddr_in6 *in6 = (struct sockaddr_in6 *)sa;
size_t len;
const void *addr = nvlist_get_binary(nvl, "address", &len);
+
+ memset(in6, 0, sizeof(*in6));
in6->sin6_family = af;
+ in6->sin6_len = sizeof(*in6);
if (len != sizeof(in6->sin6_addr))
return (EINVAL);
@@ -370,31 +442,42 @@ ovpn_nvlist_to_sockaddr(const nvlist_t *nvl, struct sockaddr_storage *sa)
return (0);
}
-static bool
-ovpn_has_peers(struct ovpn_softc *sc)
+static int
+ovpn_add_sockaddr(nvlist_t *parent, const char *name, const struct sockaddr *s)
{
- OVPN_ASSERT(sc);
-
- return (sc->peercount > 0);
-}
+ nvlist_t *nvl;
-static void
-ovpn_rele_so(struct ovpn_softc *sc)
-{
- bool has_peers;
+ nvl = nvlist_create(0);
+ if (nvl == NULL)
+ return (ENOMEM);
- OVPN_WASSERT(sc);
+ nvlist_add_number(nvl, "af", s->sa_family);
- if (sc->so == NULL)
- return;
+ switch (s->sa_family) {
+ case AF_INET: {
+ const struct sockaddr_in *s4 = (const struct sockaddr_in *)s;
- has_peers = ovpn_has_peers(sc);
+ nvlist_add_number(nvl, "port", s4->sin_port);
+ nvlist_add_binary(nvl, "address", &s4->sin_addr,
+ sizeof(s4->sin_addr));
+ break;
+ }
+ case AF_INET6: {
+ const struct sockaddr_in6 *s6 = (const struct sockaddr_in6 *)s;
- if (! has_peers) {
- MPASS(sc->peercount == 0);
- } else {
- MPASS(sc->peercount > 0);
+ nvlist_add_number(nvl, "port", s6->sin6_port);
+ nvlist_add_binary(nvl, "address", &s6->sin6_addr,
+ sizeof(s6->sin6_addr));
+ break;
+ }
+ default:
+ nvlist_destroy(nvl);
+ return (EINVAL);
}
+
+ nvlist_move_nvlist(parent, name, nvl);
+
+ return (0);
}
static void
@@ -449,6 +532,33 @@ ovpn_notify_key_rotation(struct ovpn_softc *sc, struct ovpn_kpeer *peer)
}
}
+static int
+ovpn_notify_float(struct ovpn_softc *sc, uint32_t peerid,
+ const struct sockaddr_storage *remote)
+{
+ struct ovpn_notification *n;
+
+ n = malloc(sizeof(*n), M_OVPN, M_NOWAIT | M_ZERO);
+ if (n == NULL)
+ return (ENOMEM);
+
+ n->peerid = peerid;
+ n->type = OVPN_NOTIF_FLOAT;
+ memcpy(&n->address, remote, sizeof(n->address));
+
+ if (buf_ring_enqueue(sc->notifring, n) != 0) {
+ free(n, M_OVPN);
+ return (ENOMEM);
+ } else if (sc->so != NULL) {
+ /* Wake up userspace */
+ sc->so->so_error = EAGAIN;
+ sorwakeup(sc->so);
+ sowwakeup(sc->so);
+ }
+
+ return (0);
+}
+
static void
ovpn_peer_release_ref(struct ovpn_kpeer *peer, bool locked)
{
@@ -485,8 +595,6 @@ ovpn_peer_release_ref(struct ovpn_kpeer *peer, bool locked)
ovpn_free_kkey_dir(peer->keys[i].decrypt);
}
- ovpn_rele_so(sc);
-
callout_stop(&peer->ping_send);
callout_stop(&peer->ping_rcv);
uma_zfree_pcpu(pcpu_zone_4, peer->last_active);
@@ -502,7 +610,7 @@ ovpn_new_peer(struct ifnet *ifp, const nvlist_t *nvl)
#ifdef INET6
struct epoch_tracker et;
#endif
- struct sockaddr_storage remote;
+ struct sockaddr_storage local, remote;
struct ovpn_kpeer *peer = NULL;
struct file *fp = NULL;
struct ovpn_softc *sc = ifp->if_softc;
@@ -571,20 +679,37 @@ ovpn_new_peer(struct ifnet *ifp, const nvlist_t *nvl)
callout_init_rm(&peer->ping_send, &sc->lock, CALLOUT_SHAREDLOCK);
callout_init_rm(&peer->ping_rcv, &sc->lock, 0);
- peer->local.ss_len = sizeof(peer->local);
- ret = sosockaddr(so, (struct sockaddr *)&peer->local);
- if (ret)
+ memset(&local, 0, sizeof(local));
+ local.ss_len = sizeof(local);
+ ret = sosockaddr(so, (struct sockaddr *)&local);
+ if (ret != 0)
goto error;
+ if (nvlist_exists_nvlist(nvl, "local")) {
+ struct sockaddr_storage local1;
+
+ ret = ovpn_nvlist_to_sockaddr(nvlist_get_nvlist(nvl, "local"),
+ &local1);
+ if (ret != 0)
+ goto error;
- if (ovpn_get_port(&peer->local) == 0) {
+ /*
+ * openvpn doesn't provide a port here when in multihome mode,
+ * just steal the one the socket is bound to.
+ */
+ if (ovpn_get_port(&local1) == 0)
+ ovpn_set_port(&local1, ovpn_get_port(&local));
+ memcpy(&local, &local1, sizeof(local1));
+ }
+ if (ovpn_get_port(&local) == 0) {
ret = EINVAL;
goto error;
}
- if (peer->local.ss_family != remote.ss_family) {
+ if (local.ss_family != remote.ss_family) {
ret = EINVAL;
goto error;
}
+ memcpy(&peer->local, &local, sizeof(local));
memcpy(&peer->remote, &remote, sizeof(remote));
#ifdef INET6
@@ -633,6 +758,7 @@ ovpn_new_peer(struct ifnet *ifp, const nvlist_t *nvl)
* a new one.
*/
ret = udp_set_kernel_tunneling(sc->so, NULL, NULL, NULL);
+ MPASS(ret == 0);
sorele(sc->so);
sc->so = NULL;
}
@@ -1364,12 +1490,36 @@ opvn_get_pkt(struct ovpn_softc *sc, nvlist_t **onvl)
}
nvlist_add_number(nvl, "peerid", n->peerid);
nvlist_add_number(nvl, "notification", n->type);
- if (n->type == OVPN_NOTIF_DEL_PEER) {
+ switch (n->type) {
+ case OVPN_NOTIF_DEL_PEER: {
nvlist_add_number(nvl, "del_reason", n->del_reason);
/* No error handling, because we want to send the notification
* even if we can't attach the counters. */
ovpn_notif_add_counters(nvl, n);
+ break;
+ }
+ case OVPN_NOTIF_FLOAT: {
+ int ret;
+
+ ret = ovpn_add_sockaddr(nvl, "address",
+ (struct sockaddr *)&n->address);
+
+ if (ret) {
+ /*
+ * Try to re-enqueue the notification. Maybe we'll
+ * have better luck next time. No error handling,
+ * because if we fail to re-enqueue there's nothing we can do.
+ */
+ (void)ovpn_notify_float(sc, n->peerid, &n->address);
+ nvlist_destroy(nvl);
+ free(n, M_OVPN);
+ return (ret);
+ }
+ break;
+ }
+ default:
+ break;
}
free(n, M_OVPN);
@@ -1525,6 +1675,7 @@ ovpn_finish_rx(struct ovpn_softc *sc, struct mbuf *m,
struct rm_priotracker *_ovpn_lock_trackerp)
{
uint32_t af;
+ struct m_tag *mtag;
OVPN_RASSERT(sc);
NET_EPOCH_ASSERT();
@@ -1543,6 +1694,38 @@ ovpn_finish_rx(struct ovpn_softc *sc, struct mbuf *m,
OVPN_RUNLOCK(sc);
+ /* Check if the peer changed to a new source address. */
+ mtag = m_tag_find(m, PACKET_TAG_OVPN, NULL);
+ if (mtag != NULL) {
+ struct ovpn_mtag *ot = (struct ovpn_mtag *)(mtag + 1);
+
+ OVPN_WLOCK(sc);
+
+ /*
+ * Check the address against the peer's remote again, because we may race
+ * against ourselves (i.e. we may have tagged multiple packets to indicate we
+ * floated).
+ */
+ if (ovpn_sockaddr_compare((struct sockaddr *)&ot->addr,
+ (struct sockaddr *)&peer->remote)) {
+ OVPN_WUNLOCK(sc);
+ goto skip_float;
+ }
+
+ /* And notify userspace. */
+ if (ovpn_notify_float(sc, peer->peerid, &ot->addr) == 0) {
+ /*
+ * Update the 'remote' for this peer, but only if
+ * we've actually enqueued the notification.
+ * Otherwise we can try again later.
+ */
+ memcpy(&peer->remote, &ot->addr, sizeof(peer->remote));
+ }
+
+ OVPN_WUNLOCK(sc);
+ }
+
+skip_float:
OVPN_COUNTER_ADD(sc, received_data_pkts, 1);
OVPN_COUNTER_ADD(sc, tunnel_bytes_received, m->m_pkthdr.len);
OVPN_PEER_COUNTER_ADD(peer, pkt_in, 1);
@@ -2305,6 +2488,29 @@ ovpn_udp_input(struct mbuf *m, int off, struct inpcb *inp,
return (true);
}
+ /*
+ * If we got this from a different address than we expected tag the packet.
+ * We'll deal with notifiying userspace later, after we've decrypted and
+ * verified.
+ */
+ if (! ovpn_sockaddr_compare((struct sockaddr *)&peer->remote, sa)) {
+ struct m_tag *mt;
+ struct ovpn_mtag *ot;
+
+ MPASS(sa->sa_len <= sizeof(ot->addr));
+ mt = m_tag_get(PACKET_TAG_OVPN, sizeof(*ot), M_NOWAIT);
+ /*
+ * If we fail to allocate here we'll just try again on the next
+ * packet.
+ */
+ if (mt != NULL) {
+ ot = (struct ovpn_mtag *)(mt + 1);
+ memcpy(&ot->addr, sa, sa->sa_len);
+
+ m_tag_prepend(m, mt);
+ }
+ }
+
if (key->decrypt->cipher == OVPN_CIPHER_ALG_NONE) {
/* Now remove the outer headers */
m_adj_decap(m, sizeof(struct udphdr) + ohdrlen);
@@ -2519,6 +2725,7 @@ ovpn_clone_destroy_cb(struct epoch_context *ctx)
COUNTER_ARRAY_FREE(sc->counters, OVPN_COUNTER_SIZE);
+ rm_destroy(&sc->lock);
if_free(sc->ifp);
free(sc, M_OVPN);
}
@@ -2579,23 +2786,53 @@ vnet_ovpn_init(const void *unused __unused)
VNET_SYSINIT(vnet_ovpn_init, SI_SUB_PSEUDO, SI_ORDER_ANY,
vnet_ovpn_init, NULL);
-static void
-vnet_ovpn_uninit(const void *unused __unused)
+static int
+ovpn_prison_remove(void *obj, void *data __unused)
{
- if_clone_detach(V_ovpn_cloner);
+#ifdef VIMAGE
+ struct prison *pr;
+
+ pr = obj;
+ if (prison_owns_vnet(pr)) {
+ CURVNET_SET(pr->pr_vnet);
+ if (V_ovpn_cloner != NULL) {
+ ifc_detach_cloner(V_ovpn_cloner);
+ V_ovpn_cloner = NULL;
+ }
+ CURVNET_RESTORE();
+ }
+#endif
+ return (0);
}
-VNET_SYSUNINIT(vnet_ovpn_uninit, SI_SUB_PSEUDO, SI_ORDER_ANY,
- vnet_ovpn_uninit, NULL);
static int
ovpnmodevent(module_t mod, int type, void *data)
{
+ static int ovpn_osd_jail_slot;
+
switch (type) {
- case MOD_LOAD:
- /* Done in vnet_ovpn_init() */
+ case MOD_LOAD: {
+ /*
+ * Registration is handled in vnet_ovpn_init(), but cloned
+ * interfaces must be destroyed via PR_METHOD_REMOVE since they
+ * hold a reference to the prison via the UDP socket, which
+ * prevents the prison from being destroyed.
+ */
+ osd_method_t methods[PR_MAXMETHOD] = {
+ [PR_METHOD_REMOVE] = ovpn_prison_remove,
+ };
+ ovpn_osd_jail_slot = osd_jail_register(NULL, methods);
break;
+ }
case MOD_UNLOAD:
- /* Done in vnet_ovpn_uninit() */
+ if (ovpn_osd_jail_slot != 0)
+ osd_jail_deregister(ovpn_osd_jail_slot);
+ CURVNET_SET(vnet0);
+ if (V_ovpn_cloner != NULL) {
+ ifc_detach_cloner(V_ovpn_cloner);
+ V_ovpn_cloner = NULL;
+ }
+ CURVNET_RESTORE();
break;
default:
return (EOPNOTSUPP);
diff --git a/sys/net/if_ovpn.h b/sys/net/if_ovpn.h
index 2d6b8c1e7eff..2a24c35788a9 100644
--- a/sys/net/if_ovpn.h
+++ b/sys/net/if_ovpn.h
@@ -37,6 +37,7 @@
enum ovpn_notif_type {
OVPN_NOTIF_DEL_PEER,
OVPN_NOTIF_ROTATE_KEY,
+ OVPN_NOTIF_FLOAT,
};
enum ovpn_del_reason {
diff --git a/sys/net/if_tuntap.c b/sys/net/if_tuntap.c
index 3bab04aa4d38..5e6f65c04b2f 100644
--- a/sys/net/if_tuntap.c
+++ b/sys/net/if_tuntap.c
@@ -74,6 +74,7 @@
#include <sys/malloc.h>
#include <sys/random.h>
#include <sys/ctype.h>
+#include <sys/osd.h>
#include <net/ethernet.h>
#include <net/if.h>
@@ -178,6 +179,7 @@ struct tuntap_softc {
static struct mtx tunmtx;
static eventhandler_tag arrival_tag;
static eventhandler_tag clone_tag;
+static int tuntap_osd_jail_slot;
static const char tunname[] = "tun";
static const char tapname[] = "tap";
static const char vmnetname[] = "vmnet";
@@ -497,6 +499,10 @@ vmnet_clone_match(struct if_clone *ifc, const char *name)
return (0);
}
+/*
+ * Create a clone via the ifnet cloning mechanism. Note that this is invoked
+ * indirectly by tunclone() below.
+ */
static int
tun_clone_create(struct if_clone *ifc, char *name, size_t len,
struct ifc_data *ifd, struct ifnet **ifpp)
@@ -532,15 +538,19 @@ tun_clone_create(struct if_clone *ifc, char *name, size_t len,
if (i != 0)
i = tun_create_device(drv, unit, NULL, &dev, name);
if (i == 0) {
- dev_ref(dev);
+ struct tuntap_softc *tp;
+
tuncreate(dev);
- struct tuntap_softc *tp = dev->si_drv1;
+ tp = dev->si_drv1;
*ifpp = tp->tun_ifp;
}
return (i);
}
+/*
+ * Create a clone via devfs access.
+ */
static void
tunclone(void *arg, struct ucred *cred, char *name, int namelen,
struct cdev **dev)
@@ -595,11 +605,12 @@ tunclone(void *arg, struct ucred *cred, char *name, int namelen,
}
i = tun_create_device(drv, u, cred, dev, name);
- }
- if (i == 0) {
+ } else {
+ /* Consumed by the dev_clone invoker. */
dev_ref(*dev);
- if_clone_create(name, namelen, NULL);
}
+ if (i == 0)
+ if_clone_create(name, namelen, NULL);
out:
CURVNET_RESTORE();
}
@@ -670,16 +681,6 @@ VNET_SYSINIT(vnet_tun_init, SI_SUB_PROTO_IF, SI_ORDER_ANY,
vnet_tun_init, NULL);
static void
-vnet_tun_uninit(const void *unused __unused)
-{
-
- for (u_int i = 0; i < NDRV; ++i)
- if_clone_detach(V_tuntap_driver_cloners[i]);
-}
-VNET_SYSUNINIT(vnet_tun_uninit, SI_SUB_PROTO_IF, SI_ORDER_ANY,
- vnet_tun_uninit, NULL);
-
-static void
tun_uninit(const void *unused __unused)
{
struct tuntap_driver *drv;
@@ -689,6 +690,16 @@ tun_uninit(const void *unused __unused)
EVENTHANDLER_DEREGISTER(ifnet_arrival_event, arrival_tag);
EVENTHANDLER_DEREGISTER(dev_clone, clone_tag);
+ CURVNET_SET(vnet0);
+ for (u_int i = 0; i < NDRV; i++) {
+ if_clone_detach(V_tuntap_driver_cloners[i]);
+ V_tuntap_driver_cloners[i] = NULL;
+ }
+ CURVNET_RESTORE();
+
+ if (tuntap_osd_jail_slot != 0)
+ osd_jail_deregister(tuntap_osd_jail_slot);
+
mtx_lock(&tunmtx);
while ((tp = TAILQ_FIRST(&tunhead)) != NULL) {
TAILQ_REMOVE(&tunhead, tp, tun_list);
@@ -724,6 +735,30 @@ tuntap_driver_from_ifnet(const struct ifnet *ifp)
return (NULL);
}
+/*
+ * Remove devices that were created by devfs cloning, as they hold references
+ * which prevent the prison from collapsing, in which state VNET sysuninits will
+ * not be invoked.
+ */
+static int
+tuntap_prison_remove(void *obj, void *data __unused)
+{
+#ifdef VIMAGE
+ struct prison *pr;
+
+ pr = obj;
+ if (prison_owns_vnet(pr)) {
+ CURVNET_SET(pr->pr_vnet);
+ for (u_int i = 0; i < NDRV; i++) {
+ if_clone_detach(V_tuntap_driver_cloners[i]);
+ V_tuntap_driver_cloners[i] = NULL;
+ }
+ CURVNET_RESTORE();
+ }
+#endif
+ return (0);
+}
+
static int
tuntapmodevent(module_t mod, int type, void *data)
{
@@ -738,8 +773,12 @@ tuntapmodevent(module_t mod, int type, void *data)
clone_setup(&drv->clones);
drv->unrhdr = new_unrhdr(0, IF_MAXUNIT, &tunmtx);
}
+ osd_method_t methods[PR_MAXMETHOD] = {
+ [PR_METHOD_REMOVE] = tuntap_prison_remove,
+ };
+ tuntap_osd_jail_slot = osd_jail_register(NULL, methods);
arrival_tag = EVENTHANDLER_REGISTER(ifnet_arrival_event,
- tunrename, 0, 1000);
+ tunrename, 0, 1000);
if (arrival_tag == NULL)
return (ENOMEM);
clone_tag = EVENTHANDLER_REGISTER(dev_clone, tunclone, 0, 1000);
@@ -747,7 +786,7 @@ tuntapmodevent(module_t mod, int type, void *data)
return (ENOMEM);
break;
case MOD_UNLOAD:
- /* See tun_uninit, so it's done after the vnet_sysuninit() */
+ /* See tun_uninit(). */
break;
default:
return EOPNOTSUPP;
@@ -798,6 +837,8 @@ tun_create_device(struct tuntap_driver *drv, int unit, struct ucred *cr,
args.mda_si_drv1 = tp;
error = make_dev_s(&args, dev, "%s", name);
if (error != 0) {
+ mtx_destroy(&tp->tun_mtx);
+ cv_destroy(&tp->tun_cv);
free(tp, M_TUN);
return (error);
}
@@ -914,7 +955,6 @@ tap_transmit(struct ifnet *ifp, struct mbuf *m)
return (error);
}
-/* XXX: should return an error code so it can fail. */
static void
tuncreate(struct cdev *dev)
{
diff --git a/sys/net/if_vlan.c b/sys/net/if_vlan.c
index 22fcb7bf7c64..61000018e5a4 100644
--- a/sys/net/if_vlan.c
+++ b/sys/net/if_vlan.c
@@ -2336,6 +2336,18 @@ vlan_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
error = ENOENT;
break;
}
+
+ /*
+ * If the ifp is in a bridge, do not allow setting the device
+ * to a bridge; this prevents having a bridge SVI as a bridge
+ * member (which is not permitted).
+ */
+ if (ifp->if_bridge != NULL && p->if_type == IFT_BRIDGE) {
+ if_rele(p);
+ error = EINVAL;
+ break;
+ }
+
if (vlr.vlr_proto == 0)
vlr.vlr_proto = ETHERTYPE_VLAN;
oldmtu = ifp->if_mtu;
diff --git a/sys/net/pfvar.h b/sys/net/pfvar.h
index 36fab1a03ee6..d55afe750869 100644
--- a/sys/net/pfvar.h
+++ b/sys/net/pfvar.h
@@ -331,6 +331,14 @@ MALLOC_DECLARE(M_PF_RULE_ITEM);
SDT_PROVIDER_DECLARE(pf);
SDT_PROBE_DECLARE(pf, , test, reason_set);
+SDT_PROBE_DECLARE(pf, , log, log);
+
+#define DPFPRINTF(n, fmt, x...) \
+ do { \
+ SDT_PROBE2(pf, , log, log, (n), fmt); \
+ if (V_pf_status.debug >= (n)) \
+ printf(fmt "\n", ##x); \
+ } while (0)
struct pfi_dynaddr {
TAILQ_ENTRY(pfi_dynaddr) entry;
@@ -1370,7 +1378,6 @@ struct pf_kruleset {
struct pf_krulequeue queues[2];
struct {
struct pf_krulequeue *ptr;
- struct pf_krule **ptr_array;
u_int32_t rcount;
u_int32_t ticket;
int open;
@@ -1677,6 +1684,9 @@ struct pf_pdesc {
u_int32_t fragoff; /* fragment header offset */
u_int32_t jumbolen; /* length from v6 jumbo header */
u_int32_t badopts; /* v4 options or v6 routing headers */
+#define PF_OPT_OTHER 0x0001
+#define PF_OPT_JUMBO 0x0002
+#define PF_OPT_ROUTER_ALERT 0x0004
u_int16_t *ip_sum;
u_int16_t flags; /* Let SCRUB trigger behavior in
@@ -2500,7 +2510,7 @@ int pfr_match_addr(struct pfr_ktable *, struct pf_addr *, sa_family_t);
void pfr_update_stats(struct pfr_ktable *, struct pf_addr *, sa_family_t,
u_int64_t, int, int, int);
int pfr_pool_get(struct pfr_ktable *, int *, struct pf_addr *, sa_family_t,
- pf_addr_filter_func_t);
+ pf_addr_filter_func_t, bool);
void pfr_dynaddr_update(struct pfr_ktable *, struct pfi_dynaddr *);
struct pfr_ktable *
pfr_attach_table(struct pf_kruleset *, char *);
@@ -2534,6 +2544,8 @@ int pfr_ina_rollback(struct pfr_table *, u_int32_t, int *, int);
int pfr_ina_commit(struct pfr_table *, u_int32_t, int *, int *, int);
int pfr_ina_define(struct pfr_table *, struct pfr_addr *, int, int *,
int *, u_int32_t, int);
+struct pfr_ktable
+ *pfr_ktable_select_active(struct pfr_ktable *);
MALLOC_DECLARE(PFI_MTYPE);
VNET_DECLARE(struct pfi_kkif *, pfi_all);
@@ -2712,7 +2724,6 @@ u_short pf_map_addr(u_int8_t, struct pf_krule *,
u_short pf_map_addr_sn(u_int8_t, struct pf_krule *,
struct pf_addr *, struct pf_addr *,
struct pfi_kkif **nkif, struct pf_addr *,
- struct pf_ksrc_node **, struct pf_srchash **,
struct pf_kpool *, pf_sn_types_t);
int pf_get_transaddr_af(struct pf_krule *,
struct pf_pdesc *);
diff --git a/sys/net80211/ieee80211_hostap.c b/sys/net80211/ieee80211_hostap.c
index c5a478533313..9074878e17e4 100644
--- a/sys/net80211/ieee80211_hostap.c
+++ b/sys/net80211/ieee80211_hostap.c
@@ -2214,12 +2214,9 @@ hostap_recv_mgmt(struct ieee80211_node *ni, struct mbuf *m0,
/* VHT */
if (IEEE80211_IS_CHAN_VHT(ni->ni_chan) &&
- vhtcap != NULL &&
- vhtinfo != NULL) {
- /* XXX TODO; see below */
- net80211_vap_printf(vap, "%s: VHT TODO!\n", __func__);
+ vhtcap != NULL) {
ieee80211_vht_node_init(ni);
- ieee80211_vht_update_cap(ni, vhtcap, vhtinfo);
+ ieee80211_vht_update_cap(ni, vhtcap);
} else if (ni->ni_flags & IEEE80211_NODE_VHT)
ieee80211_vht_node_cleanup(ni);
diff --git a/sys/net80211/ieee80211_ht.c b/sys/net80211/ieee80211_ht.c
index 5ec80e3646b8..c28f124648a1 100644
--- a/sys/net80211/ieee80211_ht.c
+++ b/sys/net80211/ieee80211_ht.c
@@ -1952,6 +1952,11 @@ do { \
_RETURN_CHAN_BITS(0);
/*
+ * TODO: should we bail out if there's no htinfo?
+ * Or just treat it as if we can't do the HT20/HT40 check?
+ */
+
+ /*
* The original code was based on
* 802.11ac-2013, Table 8-183x-VHT Operation Information subfields.
* 802.11-2020, Table 9-274-VHT Operation Information subfields
@@ -1962,8 +1967,12 @@ do { \
*/
htinfo = (const struct ieee80211_ie_htinfo *)ni->ni_ies.htinfo_ie;
- ht40 = ((htinfo->hi_byte1 & IEEE80211_HTINFO_TXWIDTH) ==
- IEEE80211_HTINFO_TXWIDTH_2040);
+ if (htinfo != NULL)
+ ht40 = ((htinfo->hi_byte1 & IEEE80211_HTINFO_TXWIDTH) ==
+ IEEE80211_HTINFO_TXWIDTH_2040);
+ else
+ ht40 = false;
+
can_vht160 = can_vht80p80 = can_vht80 = false;
/* 20 Mhz */
diff --git a/sys/net80211/ieee80211_vht.c b/sys/net80211/ieee80211_vht.c
index e91977f1ef98..de0b691d4d2a 100644
--- a/sys/net80211/ieee80211_vht.c
+++ b/sys/net80211/ieee80211_vht.c
@@ -838,12 +838,10 @@ ieee80211_add_vhtinfo(uint8_t *frm, struct ieee80211_node *ni)
}
void
-ieee80211_vht_update_cap(struct ieee80211_node *ni, const uint8_t *vhtcap_ie,
- const uint8_t *vhtop_ie)
+ieee80211_vht_update_cap(struct ieee80211_node *ni, const uint8_t *vhtcap_ie)
{
ieee80211_parse_vhtcap(ni, vhtcap_ie);
- ieee80211_parse_vhtopmode(ni, vhtop_ie);
}
static struct ieee80211_channel *
diff --git a/sys/net80211/ieee80211_vht.h b/sys/net80211/ieee80211_vht.h
index 2964de63c343..a1529df4a85b 100644
--- a/sys/net80211/ieee80211_vht.h
+++ b/sys/net80211/ieee80211_vht.h
@@ -52,8 +52,7 @@ uint8_t * ieee80211_add_vhtinfo(uint8_t *frm, struct ieee80211_node *);
uint8_t *ieee80211_add_vhtcap_ch(uint8_t *, struct ieee80211vap *,
struct ieee80211_channel *);
-void ieee80211_vht_update_cap(struct ieee80211_node *,
- const uint8_t *, const uint8_t *);
+void ieee80211_vht_update_cap(struct ieee80211_node *, const uint8_t *);
struct ieee80211_channel *
ieee80211_vht_adjust_channel(struct ieee80211com *,
diff --git a/sys/netinet/icmp_var.h b/sys/netinet/icmp_var.h
index b1f2b0ebf911..d6b75e482e35 100644
--- a/sys/netinet/icmp_var.h
+++ b/sys/netinet/icmp_var.h
@@ -104,11 +104,10 @@ extern int badport_bandlim(int);
#define BANDLIM_ICMP_UNREACH 0
#define BANDLIM_ICMP_ECHO 1
#define BANDLIM_ICMP_TSTAMP 2
-#define BANDLIM_RST_CLOSEDPORT 3 /* No connection, and no listeners */
-#define BANDLIM_RST_OPENPORT 4 /* No connection, listener */
-#define BANDLIM_ICMP6_UNREACH 5
-#define BANDLIM_SCTP_OOTB 6
-#define BANDLIM_MAX 7
+#define BANDLIM_TCP_RST 3
+#define BANDLIM_ICMP6_UNREACH 4
+#define BANDLIM_SCTP_OOTB 5
+#define BANDLIM_MAX 6
#endif
#endif
diff --git a/sys/netinet/in_fib_dxr.c b/sys/netinet/in_fib_dxr.c
index b889131b544b..538cd43a88a3 100644
--- a/sys/netinet/in_fib_dxr.c
+++ b/sys/netinet/in_fib_dxr.c
@@ -345,7 +345,7 @@ initheap(struct dxr_aux *da, uint32_t dst_u32, uint32_t chunk)
struct heap_entry *fhp = &da->heap[0];
struct rtentry *rt;
struct route_nhop_data rnd;
-
+
da->heap_index = 0;
da->dst.sin_addr.s_addr = htonl(dst_u32);
rt = fib4_lookup_rt(da->fibnum, da->dst.sin_addr, 0, NHR_UNLOCKED,
@@ -1143,7 +1143,7 @@ dxr_destroy(void *data)
free(da, M_DXRAUX);
}
-static void
+static void
epoch_dxr_destroy(epoch_context_t ctx)
{
struct dxr *dxr = __containerof(ctx, struct dxr, epoch_ctx);
@@ -1202,7 +1202,7 @@ dxr_dump_end(void *data, struct fib_dp *dp)
static enum flm_op_result
dxr_dump_rib_item(struct rtentry *rt, void *data)
{
-
+
return (FLM_SUCCESS);
}
diff --git a/sys/netinet/ip_icmp.c b/sys/netinet/ip_icmp.c
index cb4b6df57c57..71b75d18efd0 100644
--- a/sys/netinet/ip_icmp.c
+++ b/sys/netinet/ip_icmp.c
@@ -1097,8 +1097,7 @@ static const char *icmp_rate_descrs[BANDLIM_MAX] = {
[BANDLIM_ICMP_UNREACH] = "icmp unreach",
[BANDLIM_ICMP_ECHO] = "icmp ping",
[BANDLIM_ICMP_TSTAMP] = "icmp tstamp",
- [BANDLIM_RST_CLOSEDPORT] = "closed port RST",
- [BANDLIM_RST_OPENPORT] = "open port RST",
+ [BANDLIM_TCP_RST] = "tcp reset",
[BANDLIM_ICMP6_UNREACH] = "icmp6 unreach",
[BANDLIM_SCTP_OOTB] = "sctp ootb",
};
diff --git a/sys/netinet/sctp_timer.c b/sys/netinet/sctp_timer.c
index 66af716eea52..7d8cb965ab09 100644
--- a/sys/netinet/sctp_timer.c
+++ b/sys/netinet/sctp_timer.c
@@ -35,7 +35,6 @@
#define _IP_VHL
#include <netinet/sctp_os.h>
#include <netinet/sctp_pcb.h>
-
#include <netinet/sctp_var.h>
#include <netinet/sctp_sysctl.h>
#include <netinet/sctp_timer.h>
diff --git a/sys/netinet/tcp_hpts.c b/sys/netinet/tcp_hpts.c
index 91f8251589e4..b60cdf45af52 100644
--- a/sys/netinet/tcp_hpts.c
+++ b/sys/netinet/tcp_hpts.c
@@ -433,38 +433,40 @@ static void
tcp_hpts_log(struct tcp_hpts_entry *hpts, struct tcpcb *tp, struct timeval *tv,
int slots_to_run, int idx, bool from_callout)
{
- union tcp_log_stackspecific log;
- /*
- * Unused logs are
- * 64 bit - delRate, rttProp, bw_inuse
- * 16 bit - cwnd_gain
- * 8 bit - bbr_state, bbr_substate, inhpts;
- */
- memset(&log, 0, sizeof(log));
- log.u_bbr.flex1 = hpts->p_nxt_slot;
- log.u_bbr.flex2 = hpts->p_cur_slot;
- log.u_bbr.flex3 = hpts->p_prev_slot;
- log.u_bbr.flex4 = idx;
- log.u_bbr.flex5 = hpts->p_curtick;
- log.u_bbr.flex6 = hpts->p_on_queue_cnt;
- log.u_bbr.flex7 = hpts->p_cpu;
- log.u_bbr.flex8 = (uint8_t)from_callout;
- log.u_bbr.inflight = slots_to_run;
- log.u_bbr.applimited = hpts->overidden_sleep;
- log.u_bbr.delivered = hpts->saved_curtick;
- log.u_bbr.timeStamp = tcp_tv_to_usectick(tv);
- log.u_bbr.epoch = hpts->saved_curslot;
- log.u_bbr.lt_epoch = hpts->saved_prev_slot;
- log.u_bbr.pkts_out = hpts->p_delayed_by;
- log.u_bbr.lost = hpts->p_hpts_sleep_time;
- log.u_bbr.pacing_gain = hpts->p_cpu;
- log.u_bbr.pkt_epoch = hpts->p_runningslot;
- log.u_bbr.use_lt_bw = 1;
- TCP_LOG_EVENTP(tp, NULL,
- &tptosocket(tp)->so_rcv,
- &tptosocket(tp)->so_snd,
- BBR_LOG_HPTSDIAG, 0,
- 0, &log, false, tv);
+ if (hpts_does_tp_logging && tcp_bblogging_on(tp)) {
+ union tcp_log_stackspecific log;
+ /*
+ * Unused logs are
+ * 64 bit - delRate, rttProp, bw_inuse
+ * 16 bit - cwnd_gain
+ * 8 bit - bbr_state, bbr_substate, inhpts;
+ */
+ memset(&log, 0, sizeof(log));
+ log.u_bbr.flex1 = hpts->p_nxt_slot;
+ log.u_bbr.flex2 = hpts->p_cur_slot;
+ log.u_bbr.flex3 = hpts->p_prev_slot;
+ log.u_bbr.flex4 = idx;
+ log.u_bbr.flex5 = hpts->p_curtick;
+ log.u_bbr.flex6 = hpts->p_on_queue_cnt;
+ log.u_bbr.flex7 = hpts->p_cpu;
+ log.u_bbr.flex8 = (uint8_t)from_callout;
+ log.u_bbr.inflight = slots_to_run;
+ log.u_bbr.applimited = hpts->overidden_sleep;
+ log.u_bbr.delivered = hpts->saved_curtick;
+ log.u_bbr.timeStamp = tcp_tv_to_usectick(tv);
+ log.u_bbr.epoch = hpts->saved_curslot;
+ log.u_bbr.lt_epoch = hpts->saved_prev_slot;
+ log.u_bbr.pkts_out = hpts->p_delayed_by;
+ log.u_bbr.lost = hpts->p_hpts_sleep_time;
+ log.u_bbr.pacing_gain = hpts->p_cpu;
+ log.u_bbr.pkt_epoch = hpts->p_runningslot;
+ log.u_bbr.use_lt_bw = 1;
+ TCP_LOG_EVENTP(tp, NULL,
+ &tptosocket(tp)->so_rcv,
+ &tptosocket(tp)->so_snd,
+ BBR_LOG_HPTSDIAG, 0,
+ 0, &log, false, tv);
+ }
}
static void
@@ -1353,10 +1355,7 @@ again:
}
CURVNET_SET(inp->inp_vnet);
/* Lets do any logging that we might want to */
- if (hpts_does_tp_logging && tcp_bblogging_on(tp)) {
- tcp_hpts_log(hpts, tp, &tv, slots_to_run, i,
- from_callout);
- }
+ tcp_hpts_log(hpts, tp, &tv, slots_to_run, i, from_callout);
if (tp->t_fb_ptr != NULL) {
kern_prefetch(tp->t_fb_ptr, &did_prefetch);
@@ -1487,7 +1486,7 @@ no_run:
}
void
-__tcp_set_hpts(struct tcpcb *tp, int32_t line)
+tcp_set_hpts(struct tcpcb *tp)
{
struct tcp_hpts_entry *hpts;
int failed;
diff --git a/sys/netinet/tcp_hpts.h b/sys/netinet/tcp_hpts.h
index b097a2b98db9..f5856ed8e688 100644
--- a/sys/netinet/tcp_hpts.h
+++ b/sys/netinet/tcp_hpts.h
@@ -149,8 +149,7 @@ uint32_t tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line,
#define tcp_hpts_insert(inp, slot) \
tcp_hpts_insert_diag((inp), (slot), __LINE__, NULL)
-void __tcp_set_hpts(struct tcpcb *tp, int32_t line);
-#define tcp_set_hpts(a) __tcp_set_hpts(a, __LINE__)
+void tcp_set_hpts(struct tcpcb *tp);
void tcp_set_inp_to_drop(struct inpcb *inp, uint16_t reason);
@@ -165,25 +164,25 @@ extern int32_t tcp_min_hptsi_time;
* The following functions should also be available
* to userspace as well.
*/
-static __inline uint32_t
+static inline uint32_t
tcp_tv_to_hptstick(const struct timeval *sv)
{
return ((sv->tv_sec * 100000) + (sv->tv_usec / HPTS_TICKS_PER_SLOT));
}
-static __inline uint32_t
+static inline uint32_t
tcp_tv_to_usectick(const struct timeval *sv)
{
return ((uint32_t) ((sv->tv_sec * HPTS_USEC_IN_SEC) + sv->tv_usec));
}
-static __inline uint32_t
+static inline uint32_t
tcp_tv_to_mssectick(const struct timeval *sv)
{
return ((uint32_t) ((sv->tv_sec * HPTS_MSEC_IN_SEC) + (sv->tv_usec/HPTS_USEC_IN_MSEC)));
}
-static __inline uint64_t
+static inline uint64_t
tcp_tv_to_lusectick(const struct timeval *sv)
{
return ((uint64_t)((sv->tv_sec * HPTS_USEC_IN_SEC) + sv->tv_usec));
@@ -199,7 +198,7 @@ get_hpts_min_sleep_time(void)
return (tcp_min_hptsi_time + HPTS_TICKS_PER_SLOT);
}
-static __inline uint32_t
+static inline uint32_t
tcp_gethptstick(struct timeval *sv)
{
struct timeval tv;
@@ -210,7 +209,7 @@ tcp_gethptstick(struct timeval *sv)
return (tcp_tv_to_hptstick(sv));
}
-static __inline uint64_t
+static inline uint64_t
tcp_get_u64_usecs(struct timeval *tv)
{
struct timeval tvd;
@@ -221,7 +220,7 @@ tcp_get_u64_usecs(struct timeval *tv)
return (tcp_tv_to_lusectick(tv));
}
-static __inline uint32_t
+static inline uint32_t
tcp_get_usecs(struct timeval *tv)
{
struct timeval tvd;
diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c
index 7c032e13f37a..de428ae1af6f 100644
--- a/sys/netinet/tcp_input.c
+++ b/sys/netinet/tcp_input.c
@@ -621,6 +621,7 @@ tcp_input_with_port(struct mbuf **mp, int *offp, int proto, uint16_t port)
#endif /* INET6 */
struct tcpopt to; /* options in this segment */
char *s = NULL; /* address and port logging */
+ bool closed_port = false; /* segment is hitting a closed port */
NET_EPOCH_ASSERT();
@@ -907,7 +908,8 @@ findpcb:
log(LOG_INFO, "%s; %s: Connection attempt "
"to closed port\n", s, __func__);
}
- rstreason = BANDLIM_RST_CLOSEDPORT;
+ rstreason = BANDLIM_TCP_RST;
+ closed_port = true;
goto dropwithreset;
}
INP_LOCK_ASSERT(inp);
@@ -998,12 +1000,14 @@ findpcb:
* down or it is in the CLOSED state. Either way we drop the
* segment and send an appropriate response.
*/
- rstreason = BANDLIM_RST_CLOSEDPORT;
+ rstreason = BANDLIM_TCP_RST;
+ closed_port = true;
goto dropwithreset;
}
if ((tp->t_port != port) && (tp->t_state > TCPS_LISTEN)) {
- rstreason = BANDLIM_RST_CLOSEDPORT;
+ rstreason = BANDLIM_TCP_RST;
+ closed_port = true;
goto dropwithreset;
}
@@ -1055,6 +1059,8 @@ findpcb:
* socket appended to the listen queue in SYN_RECEIVED state.
*/
if ((thflags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK) {
+ int result;
+
/*
* Parse the TCP options here because
* syncookies need access to the reflected
@@ -1064,8 +1070,8 @@ findpcb:
/*
* NB: syncache_expand() doesn't unlock inp.
*/
- rstreason = syncache_expand(&inc, &to, th, &so, m, port);
- if (rstreason < 0) {
+ result = syncache_expand(&inc, &to, th, &so, m, port);
+ if (result < 0) {
/*
* A failing TCP MD5 signature comparison
* must result in the segment being dropped
@@ -1073,7 +1079,7 @@ findpcb:
* to the sender.
*/
goto dropunlock;
- } else if (rstreason == 0) {
+ } else if (result == 0) {
/*
* No syncache entry, or ACK was not for our
* SYN/ACK. Do our protection against double
@@ -1092,7 +1098,7 @@ findpcb:
* of the failure cause.
*/
INP_WUNLOCK(inp);
- rstreason = BANDLIM_RST_OPENPORT;
+ rstreason = BANDLIM_TCP_RST;
lookupflag &= ~INPLOOKUP_WILDCARD;
goto findpcb;
}
@@ -1183,7 +1189,7 @@ tfo_socket_result:
s, __func__);
syncache_badack(&inc, port); /* XXX: Not needed! */
TCPSTAT_INC(tcps_badsyn);
- rstreason = BANDLIM_RST_OPENPORT;
+ rstreason = BANDLIM_TCP_RST;
goto dropwithreset;
}
/*
@@ -1259,7 +1265,7 @@ tfo_socket_result:
"Connection attempt to deprecated "
"IPv6 address rejected\n",
s, __func__);
- rstreason = BANDLIM_RST_OPENPORT;
+ rstreason = BANDLIM_TCP_RST;
goto dropwithreset;
}
}
@@ -1380,9 +1386,10 @@ dropwithreset:
* When blackholing do not respond with a RST but
* completely ignore the segment and drop it.
*/
- if (((rstreason == BANDLIM_RST_OPENPORT && V_blackhole == 3) ||
- (rstreason == BANDLIM_RST_CLOSEDPORT &&
- ((V_blackhole == 1 && (thflags & TH_SYN)) || V_blackhole > 1))) &&
+ if (rstreason == BANDLIM_TCP_RST &&
+ ((!closed_port && V_blackhole == 3) ||
+ (closed_port &&
+ ((V_blackhole == 1 && (thflags & TH_SYN)) || V_blackhole > 1))) &&
(V_blackhole_local || (
#ifdef INET6
isipv6 ? !in6_localip(&ip6->ip6_src) :
@@ -1515,7 +1522,9 @@ tcp_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
struct tcpopt to;
int tfo_syn;
u_int maxseg = 0;
+ bool no_data;
+ no_data = (tlen == 0);
thflags = tcp_get_flags(th);
tp->sackhint.last_sack_ack = 0;
sack_changed = SACK_NOCHANGE;
@@ -1754,7 +1763,7 @@ tcp_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
tp->ts_recent = to.to_tsval;
}
- if (tlen == 0) {
+ if (no_data) {
if (SEQ_GT(th->th_ack, tp->snd_una) &&
SEQ_LEQ(th->th_ack, tp->snd_max) &&
!IN_RECOVERY(tp->t_flags) &&
@@ -1963,7 +1972,7 @@ tcp_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
if ((thflags & TH_ACK) &&
(SEQ_LEQ(th->th_ack, tp->snd_una) ||
SEQ_GT(th->th_ack, tp->snd_max))) {
- rstreason = BANDLIM_RST_OPENPORT;
+ rstreason = BANDLIM_TCP_RST;
tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
goto dropwithreset;
}
@@ -1976,7 +1985,7 @@ tcp_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
* FIN, or a RST.
*/
if ((thflags & (TH_SYN|TH_ACK)) == (TH_SYN|TH_ACK)) {
- rstreason = BANDLIM_RST_OPENPORT;
+ rstreason = BANDLIM_TCP_RST;
tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
goto dropwithreset;
} else if (thflags & TH_SYN) {
@@ -2244,7 +2253,7 @@ tcp_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
* for the "LAND" DoS attack.
*/
if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) {
- rstreason = BANDLIM_RST_OPENPORT;
+ rstreason = BANDLIM_TCP_RST;
tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
goto dropwithreset;
}
@@ -2557,7 +2566,7 @@ tcp_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
maxseg = tcp_maxseg(tp);
- if (tlen == 0 &&
+ if (no_data &&
(tiwin == tp->snd_wnd ||
(tp->t_flags & TF_SACK_PERMIT))) {
/*
@@ -3113,8 +3122,7 @@ step6:
(tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
(tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
/* keep track of pure window updates */
- if (tlen == 0 &&
- tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
+ if (no_data && tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
TCPSTAT_INC(tcps_rcvwinupd);
tp->snd_wnd = tiwin;
tp->snd_wl1 = th->th_seq;
@@ -3424,7 +3432,7 @@ dropafterack:
if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) &&
(SEQ_GT(tp->snd_una, th->th_ack) ||
SEQ_GT(th->th_ack, tp->snd_max)) ) {
- rstreason = BANDLIM_RST_OPENPORT;
+ rstreason = BANDLIM_TCP_RST;
tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
goto dropwithreset;
}
diff --git a/sys/netinet/tcp_log_buf.c b/sys/netinet/tcp_log_buf.c
index 75d693bc019b..e24790ece43d 100644
--- a/sys/netinet/tcp_log_buf.c
+++ b/sys/netinet/tcp_log_buf.c
@@ -2878,7 +2878,7 @@ tcp_log_sendfile(struct socket *so, off_t offset, size_t nbytes, int flags)
/* double check log state now that we have the lock */
if (inp->inp_flags & INP_DROPPED)
goto done;
- if (tp->_t_logstate != TCP_LOG_STATE_OFF) {
+ if (tcp_bblogging_on(tp)) {
struct timeval tv;
tcp_log_eventspecific_t log;
diff --git a/sys/netinet/tcp_log_buf.h b/sys/netinet/tcp_log_buf.h
index fef32e16b2e4..f8c064b6a104 100644
--- a/sys/netinet/tcp_log_buf.h
+++ b/sys/netinet/tcp_log_buf.h
@@ -377,12 +377,12 @@ extern int32_t tcp_trace_point_count;
/*
* Returns true if any sort of BB logging is enabled,
- * commonly used throughout the codebase.
+ * commonly used throughout the codebase.
*/
static inline int
tcp_bblogging_on(struct tcpcb *tp)
{
- if (tp->_t_logstate <= TCP_LOG_STATE_OFF)
+ if (tp->_t_logstate <= TCP_LOG_STATE_OFF)
return (0);
if (tp->_t_logstate == TCP_LOG_VIA_BBPOINTS)
return (0);
@@ -427,7 +427,7 @@ tcp_set_bblog_state(struct tcpcb *tp, uint8_t ls, uint8_t bbpoint)
}
}
-static inline uint32_t
+static inline uint32_t
tcp_get_bblog_state(struct tcpcb *tp)
{
return (tp->_t_logstate);
@@ -539,12 +539,12 @@ struct tcpcb;
NULL, NULL, 0, NULL); \
} while (0)
#endif /* TCP_LOG_FORCEVERBOSE */
+/* Assumes/requires the caller has already checked tcp_bblogging_on(tp). */
#define TCP_LOG_EVENTP(tp, th, rxbuf, txbuf, eventid, errornum, len, stackinfo, th_hostorder, tv) \
do { \
- if (tcp_bblogging_on(tp)) \
- tcp_log_event(tp, th, rxbuf, txbuf, eventid, \
- errornum, len, stackinfo, th_hostorder, \
- NULL, NULL, 0, tv); \
+ KASSERT(tcp_bblogging_on(tp), ("bblogging is off")); \
+ tcp_log_event(tp, th, rxbuf, txbuf, eventid, errornum, len, \
+ stackinfo, th_hostorder, NULL, NULL, 0, tv); \
} while (0)
#ifdef TCP_BLACKBOX
diff --git a/sys/netinet/tcp_lro.c b/sys/netinet/tcp_lro.c
index 10afed17bf3b..7512679bd4e9 100644
--- a/sys/netinet/tcp_lro.c
+++ b/sys/netinet/tcp_lro.c
@@ -1301,9 +1301,9 @@ tcp_lro_rx_common(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum, bool use_h
return (TCP_LRO_CANNOT);
#endif
if (((m->m_pkthdr.csum_flags & (CSUM_DATA_VALID | CSUM_PSEUDO_HDR)) !=
- ((CSUM_DATA_VALID | CSUM_PSEUDO_HDR))) ||
+ ((CSUM_DATA_VALID | CSUM_PSEUDO_HDR))) ||
(m->m_pkthdr.csum_data != 0xffff)) {
- /*
+ /*
* The checksum either did not have hardware offload
* or it was a bad checksum. We can't LRO such
* a packet.
@@ -1334,7 +1334,7 @@ tcp_lro_rx_common(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum, bool use_h
#endif
/* If no hardware or arrival stamp on the packet add timestamp */
if ((m->m_flags & (M_TSTMP_LRO | M_TSTMP)) == 0) {
- m->m_pkthdr.rcv_tstmp = bintime2ns(&lc->lro_last_queue_time);
+ m->m_pkthdr.rcv_tstmp = bintime2ns(&lc->lro_last_queue_time);
m->m_flags |= M_TSTMP_LRO;
}
@@ -1429,9 +1429,9 @@ tcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum)
int error;
if (((m->m_pkthdr.csum_flags & (CSUM_DATA_VALID | CSUM_PSEUDO_HDR)) !=
- ((CSUM_DATA_VALID | CSUM_PSEUDO_HDR))) ||
+ ((CSUM_DATA_VALID | CSUM_PSEUDO_HDR))) ||
(m->m_pkthdr.csum_data != 0xffff)) {
- /*
+ /*
* The checksum either did not have hardware offload
* or it was a bad checksum. We can't LRO such
* a packet.
@@ -1481,7 +1481,7 @@ tcp_lro_queue_mbuf(struct lro_ctrl *lc, struct mbuf *mb)
((mb->m_flags & M_TSTMP) == 0)) {
/* Add in an LRO time since no hardware */
binuptime(&lc->lro_last_queue_time);
- mb->m_pkthdr.rcv_tstmp = bintime2ns(&lc->lro_last_queue_time);
+ mb->m_pkthdr.rcv_tstmp = bintime2ns(&lc->lro_last_queue_time);
mb->m_flags |= M_TSTMP_LRO;
}
diff --git a/sys/netinet/tcp_sack.c b/sys/netinet/tcp_sack.c
index 90d789f0e224..4405098a8620 100644
--- a/sys/netinet/tcp_sack.c
+++ b/sys/netinet/tcp_sack.c
@@ -744,7 +744,7 @@ tcp_sack_doack(struct tcpcb *tp, struct tcpopt *to, tcp_seq th_ack)
while (cur != NULL) {
if (!(sblkp >= sack_blocks)) {
if (((loss_sblks >= tcprexmtthresh) ||
- (loss_thresh > (tcprexmtthresh-1)*tp->t_maxseg)))
+ (loss_thresh > (tcprexmtthresh-1)*tp->t_maxseg)))
break;
loss_thresh += loss_hiack - cur->end;
loss_hiack = cur->start;
diff --git a/sys/netinet/tcp_stacks/bbr.c b/sys/netinet/tcp_stacks/bbr.c
index e2cfec5c9275..b232d3f08fe6 100644
--- a/sys/netinet/tcp_stacks/bbr.c
+++ b/sys/netinet/tcp_stacks/bbr.c
@@ -5126,8 +5126,8 @@ bbr_timeout_rxt(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts)
tp->t_maxseg = tp->t_pmtud_saved_maxseg;
if (tp->t_maxseg < V_tcp_mssdflt) {
/*
- * The MSS is so small we should not
- * process incoming SACK's since we are
+ * The MSS is so small we should not
+ * process incoming SACK's since we are
* subject to attack in such a case.
*/
tp->t_flags2 |= TF2_PROC_SACK_PROHIBIT;
@@ -8763,7 +8763,7 @@ bbr_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so,
(SEQ_LEQ(th->th_ack, tp->iss) ||
SEQ_GT(th->th_ack, tp->snd_max))) {
tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
- ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ ctf_do_dropwithreset(m, tp, th, BANDLIM_TCP_RST, tlen);
return (1);
}
if ((thflags & (TH_ACK | TH_RST)) == (TH_ACK | TH_RST)) {
@@ -8965,7 +8965,7 @@ bbr_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so,
(SEQ_LEQ(th->th_ack, tp->snd_una) ||
SEQ_GT(th->th_ack, tp->snd_max))) {
tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
- ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ ctf_do_dropwithreset(m, tp, th, BANDLIM_TCP_RST, tlen);
return (1);
}
if (tp->t_flags & TF_FASTOPEN) {
@@ -8977,7 +8977,7 @@ bbr_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so,
*/
if ((thflags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) {
tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
- ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ ctf_do_dropwithreset(m, tp, th, BANDLIM_TCP_RST, tlen);
return (1);
} else if (thflags & TH_SYN) {
/* non-initial SYN is ignored */
@@ -9010,7 +9010,7 @@ bbr_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so,
*/
if (SEQ_LT(th->th_seq, tp->irs)) {
tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
- ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ ctf_do_dropwithreset(m, tp, th, BANDLIM_TCP_RST, tlen);
return (1);
}
if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
@@ -9288,7 +9288,7 @@ bbr_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so,
if (sbavail(&so->so_snd)) {
if (ctf_progress_timeout_check(tp, true)) {
bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__);
- ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_TCP_RST, tlen);
return (1);
}
}
@@ -9385,7 +9385,7 @@ bbr_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so,
if (sbavail(&so->so_snd)) {
if (ctf_progress_timeout_check(tp, true)) {
bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__);
- ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_TCP_RST, tlen);
return (1);
}
}
@@ -9535,7 +9535,7 @@ bbr_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so,
if (sbavail(&so->so_snd)) {
if (ctf_progress_timeout_check(tp, true)) {
bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__);
- ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_TCP_RST, tlen);
return (1);
}
}
@@ -9637,7 +9637,7 @@ bbr_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so,
if (sbavail(&so->so_snd)) {
if (ctf_progress_timeout_check(tp, true)) {
bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__);
- ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_TCP_RST, tlen);
return (1);
}
}
@@ -9739,7 +9739,7 @@ bbr_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
if (sbavail(&so->so_snd)) {
if (ctf_progress_timeout_check(tp, true)) {
bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__);
- ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_TCP_RST, tlen);
return (1);
}
}
@@ -9848,7 +9848,7 @@ bbr_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so,
if (sbavail(&so->so_snd)) {
if (ctf_progress_timeout_check(tp, true)) {
bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__);
- ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_TCP_RST, tlen);
return (1);
}
}
@@ -10141,7 +10141,7 @@ bbr_init(struct tcpcb *tp, void **ptr)
* flags.
*/
bbr_stop_all_timers(tp, bbr);
- /*
+ /*
* Validate the timers are not in usec, if they are convert.
* BBR should in theory move to USEC and get rid of a
* lot of the TICKS_2 calls.. but for now we stay
@@ -11510,7 +11510,7 @@ bbr_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) &&
(SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) {
tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
- ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_TCP_RST, tlen);
return (1);
}
if (tiwin > bbr->r_ctl.rc_high_rwnd)
@@ -11544,7 +11544,7 @@ bbr_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
bbr_check_bbr_for_state(bbr, cts, __LINE__, (bbr->r_ctl.rc_lost - lost));
if (nxt_pkt == 0) {
if ((bbr->r_wanted_output != 0) ||
- (tp->t_flags & TF_ACKNOW)) {
+ (tp->t_flags & TF_ACKNOW)) {
bbr->rc_output_starts_timer = 0;
did_out = 1;
@@ -13172,11 +13172,7 @@ send:
mb, moff, &len,
if_hw_tsomaxsegcount,
if_hw_tsomaxsegsize, msb,
- ((rsm == NULL) ? hw_tls : 0)
-#ifdef NETFLIX_COPY_ARGS
- , NULL, NULL
-#endif
- );
+ ((rsm == NULL) ? hw_tls : 0));
if (len <= maxseg) {
/*
* Must have ran out of mbufs for the copy
@@ -13806,8 +13802,8 @@ nomore:
tp->t_maxseg = old_maxseg - 40;
if (tp->t_maxseg < V_tcp_mssdflt) {
/*
- * The MSS is so small we should not
- * process incoming SACK's since we are
+ * The MSS is so small we should not
+ * process incoming SACK's since we are
* subject to attack in such a case.
*/
tp->t_flags2 |= TF2_PROC_SACK_PROHIBIT;
diff --git a/sys/netinet/tcp_stacks/rack.c b/sys/netinet/tcp_stacks/rack.c
index 8e05498863b9..940a4024bb73 100644
--- a/sys/netinet/tcp_stacks/rack.c
+++ b/sys/netinet/tcp_stacks/rack.c
@@ -40,7 +40,6 @@
#endif
#include <sys/lock.h>
#include <sys/malloc.h>
-#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/mbuf.h>
#include <sys/proc.h> /* for proc0 declaration */
@@ -198,7 +197,7 @@ static uint32_t rack_pcm_blast = 0;
static uint32_t rack_pcm_is_enabled = 1;
static uint8_t rack_ssthresh_rest_rto_rec = 0; /* Do we restore ssthresh when we have rec -> rto -> rec */
-static uint32_t rack_gp_gain_req = 1200; /* Amount percent wise required to gain to record a round has "gaining" */
+static uint32_t rack_gp_gain_req = 1200; /* Amount percent wise required to gain to record a round as "gaining" */
static uint32_t rack_rnd_cnt_req = 0x10005; /* Default number of rounds if we are below rack_gp_gain_req where we exit ss */
@@ -938,7 +937,7 @@ rack_init_sysctls(void)
SYSCTL_ADD_U32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_probertt),
OID_AUTO, "time_between", CTLFLAG_RW,
- & rack_time_between_probertt, 96000000,
+ &rack_time_between_probertt, 96000000,
"How many useconds between the lowest rtt falling must past before we enter probertt");
SYSCTL_ADD_U32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_probertt),
@@ -3480,9 +3479,9 @@ static void
rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm)
{
if (rsm->r_flags & RACK_APP_LIMITED) {
- if (rack->r_ctl.rc_app_limited_cnt > 0) {
- rack->r_ctl.rc_app_limited_cnt--;
- }
+ KASSERT((rack->r_ctl.rc_app_limited_cnt > 0),
+ ("app_cnt %u, rsm %p", rack->r_ctl.rc_app_limited_cnt, rsm));
+ rack->r_ctl.rc_app_limited_cnt--;
}
if (rsm->r_limit_type) {
/* currently there is only one limit type */
@@ -3554,8 +3553,7 @@ rack_get_measure_window(struct tcpcb *tp, struct tcp_rack *rack)
* earlier.
*
* So lets calculate the BDP with the "known" b/w using
- * the SRTT has our rtt and then multiply it by the
- * goal.
+ * the SRTT as our rtt and then multiply it by the goal.
*/
bw = rack_get_bw(rack);
srtt = (uint64_t)tp->t_srtt;
@@ -5793,7 +5791,7 @@ rack_cong_signal(struct tcpcb *tp, uint32_t type, uint32_t ack, int line)
tp->t_badrxtwin = 0;
break;
}
- if ((CC_ALGO(tp)->cong_signal != NULL) &&
+ if ((CC_ALGO(tp)->cong_signal != NULL) &&
(type != CC_RTO)){
tp->t_ccv.curack = ack;
CC_ALGO(tp)->cong_signal(&tp->t_ccv, type);
@@ -5904,7 +5902,7 @@ rack_calc_thresh_rack(struct tcp_rack *rack, uint32_t srtt, uint32_t cts, int li
*
* If reorder-fade is configured, then we track the last time we saw
* re-ordering occur. If we reach the point where enough time as
- * passed we no longer consider reordering has occuring.
+ * passed we no longer consider reordering as occurring.
*
* Or if reorder-face is 0, then once we see reordering we consider
* the connection to alway be subject to reordering and just set lro
@@ -7045,6 +7043,9 @@ rack_clone_rsm(struct tcp_rack *rack, struct rack_sendmap *nrsm,
/* Push bit must go to the right edge as well */
if (rsm->r_flags & RACK_HAD_PUSH)
rsm->r_flags &= ~RACK_HAD_PUSH;
+ /* Update the count if app limited */
+ if (nrsm->r_flags & RACK_APP_LIMITED)
+ rack->r_ctl.rc_app_limited_cnt++;
/* Clone over the state of the hw_tls flag */
nrsm->r_hw_tls = rsm->r_hw_tls;
/*
@@ -7096,7 +7097,7 @@ rack_merge_rsm(struct tcp_rack *rack,
l_rsm->r_flags |= RACK_TLP;
if (r_rsm->r_flags & RACK_RWND_COLLAPSED)
l_rsm->r_flags |= RACK_RWND_COLLAPSED;
- if ((r_rsm->r_flags & RACK_APP_LIMITED) &&
+ if ((r_rsm->r_flags & RACK_APP_LIMITED) &&
((l_rsm->r_flags & RACK_APP_LIMITED) == 0)) {
/*
* If both are app-limited then let the
@@ -7887,8 +7888,8 @@ drop_it:
tp->t_maxseg = tp->t_pmtud_saved_maxseg;
if (tp->t_maxseg < V_tcp_mssdflt) {
/*
- * The MSS is so small we should not
- * process incoming SACK's since we are
+ * The MSS is so small we should not
+ * process incoming SACK's since we are
* subject to attack in such a case.
*/
tp->t_flags2 |= TF2_PROC_SACK_PROHIBIT;
@@ -8137,7 +8138,7 @@ rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack,
* remove the lost desgination and reduce the
* bytes considered lost.
*/
- rsm->r_flags &= ~RACK_WAS_LOST;
+ rsm->r_flags &= ~RACK_WAS_LOST;
KASSERT((rack->r_ctl.rc_considered_lost >= (rsm->r_end - rsm->r_start)),
("rsm:%p rack:%p rc_considered_lost goes negative", rsm, rack));
if (rack->r_ctl.rc_considered_lost >= (rsm->r_end - rsm->r_start))
@@ -8832,7 +8833,7 @@ rack_apply_updated_usrtt(struct tcp_rack *rack, uint32_t us_rtt, uint32_t us_cts
val = rack_probertt_lower_within * rack_time_between_probertt;
val /= 100;
- if ((rack->in_probe_rtt == 0) &&
+ if ((rack->in_probe_rtt == 0) &&
(rack->rc_skip_timely == 0) &&
((us_cts - rack->r_ctl.rc_lower_rtt_us_cts) >= (rack_time_between_probertt - val))) {
rack_enter_probertt(rack, us_cts);
@@ -10369,7 +10370,7 @@ more:
* and yet before retransmitting we get an ack
* which can happen due to reordering.
*/
- rsm->r_flags &= ~RACK_WAS_LOST;
+ rsm->r_flags &= ~RACK_WAS_LOST;
KASSERT((rack->r_ctl.rc_considered_lost >= (rsm->r_end - rsm->r_start)),
("rsm:%p rack:%p rc_considered_lost goes negative", rsm, rack));
if (rack->r_ctl.rc_considered_lost >= (rsm->r_end - rsm->r_start))
@@ -11065,7 +11066,7 @@ rack_strike_dupack(struct tcp_rack *rack, tcp_seq th_ack)
* We need to skip anything already set
* to be retransmitted.
*/
- if ((rsm->r_dupack >= DUP_ACK_THRESHOLD) ||
+ if ((rsm->r_dupack >= DUP_ACK_THRESHOLD) ||
(rsm->r_flags & RACK_MUST_RXT)) {
rsm = TAILQ_NEXT(rsm, r_tnext);
continue;
@@ -12875,7 +12876,7 @@ rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so,
(SEQ_LEQ(th->th_ack, tp->iss) ||
SEQ_GT(th->th_ack, tp->snd_max))) {
tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
- ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ ctf_do_dropwithreset(m, tp, th, BANDLIM_TCP_RST, tlen);
return (1);
}
if ((thflags & (TH_ACK | TH_RST)) == (TH_ACK | TH_RST)) {
@@ -13089,7 +13090,7 @@ rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so,
(SEQ_LEQ(th->th_ack, tp->snd_una) ||
SEQ_GT(th->th_ack, tp->snd_max))) {
tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
- ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ ctf_do_dropwithreset(m, tp, th, BANDLIM_TCP_RST, tlen);
return (1);
}
if (tp->t_flags & TF_FASTOPEN) {
@@ -13102,7 +13103,7 @@ rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so,
*/
if ((thflags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) {
tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
- ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ ctf_do_dropwithreset(m, tp, th, BANDLIM_TCP_RST, tlen);
return (1);
} else if (thflags & TH_SYN) {
/* non-initial SYN is ignored */
@@ -13136,7 +13137,7 @@ rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so,
*/
if (SEQ_LT(th->th_seq, tp->irs)) {
tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
- ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ ctf_do_dropwithreset(m, tp, th, BANDLIM_TCP_RST, tlen);
return (1);
}
if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
@@ -13399,7 +13400,7 @@ rack_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so,
if (sbavail(&so->so_snd)) {
if (ctf_progress_timeout_check(tp, true)) {
rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__);
- ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_TCP_RST, tlen);
return (1);
}
}
@@ -13495,7 +13496,7 @@ rack_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so,
if (ctf_progress_timeout_check(tp, true)) {
rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr,
tp, tick, PROGRESS_DROP, __LINE__);
- ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_TCP_RST, tlen);
return (1);
}
}
@@ -13645,7 +13646,7 @@ rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so,
if (ctf_progress_timeout_check(tp, true)) {
rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr,
tp, tick, PROGRESS_DROP, __LINE__);
- ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_TCP_RST, tlen);
return (1);
}
}
@@ -13746,7 +13747,7 @@ rack_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so,
if (ctf_progress_timeout_check(tp, true)) {
rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr,
tp, tick, PROGRESS_DROP, __LINE__);
- ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_TCP_RST, tlen);
return (1);
}
}
@@ -13848,7 +13849,7 @@ rack_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
if (ctf_progress_timeout_check(tp, true)) {
rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr,
tp, tick, PROGRESS_DROP, __LINE__);
- ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_TCP_RST, tlen);
return (1);
}
}
@@ -13952,7 +13953,7 @@ rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so,
if (ctf_progress_timeout_check(tp, true)) {
rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr,
tp, tick, PROGRESS_DROP, __LINE__);
- ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_TCP_RST, tlen);
return (1);
}
}
@@ -14637,9 +14638,6 @@ rack_init(struct tcpcb *tp, void **ptr)
if (rack->r_ctl.pcm_s == NULL) {
rack->r_ctl.pcm_i.cnt_alloc = 0;
}
-#ifdef NETFLIX_STATS
- rack->r_ctl.side_chan_dis_mask = tcp_sidechannel_disable_mask;
-#endif
rack->r_ctl.rack_per_upper_bound_ss = (uint8_t)rack_per_upper_bound_ss;
rack->r_ctl.rack_per_upper_bound_ca = (uint8_t)rack_per_upper_bound_ca;
if (rack_enable_shared_cwnd)
@@ -15563,7 +15561,7 @@ rack_log_pcm(struct tcp_rack *rack, uint8_t mod, uint32_t flex1, uint32_t flex2,
if (tcp_bblogging_on(rack->rc_tp)) {
union tcp_log_stackspecific log;
struct timeval tv;
-
+
(void)tcp_get_usecs(&tv);
memset(&log, 0, sizeof(log));
log.u_bbr.timeStamp = tcp_tv_to_usectick(&tv);
@@ -16655,7 +16653,7 @@ rack_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) &&
(SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) {
tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
- ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ ctf_do_dropwithreset(m, tp, th, BANDLIM_TCP_RST, tlen);
#ifdef TCP_ACCOUNTING
sched_unpin();
#endif
@@ -16919,7 +16917,7 @@ do_output_now:
} else if ((nxt_pkt == 0) && (tp->t_flags & TF_ACKNOW)) {
goto do_output_now;
} else if ((no_output == 1) &&
- (nxt_pkt == 0) &&
+ (nxt_pkt == 0) &&
(tcp_in_hpts(rack->rc_tp) == 0)) {
/*
* We are not in hpts and we had a pacing timer up. Use
@@ -17546,7 +17544,7 @@ rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, str
rack->r_ctl.rc_last_us_rtt,
88, __LINE__, NULL, gain);
}
- if (((bw_est == 0) || (rate_wanted == 0) || (rack->gp_ready == 0)) &&
+ if (((bw_est == 0) || (rate_wanted == 0) || (rack->gp_ready == 0)) &&
(rack->use_fixed_rate == 0)) {
/*
* No way yet to make a b/w estimate or
@@ -17986,7 +17984,7 @@ start_set:
tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack);
rack->r_ctl.rc_gp_cumack_ts = 0;
if ((rack->r_ctl.cleared_app_ack == 1) &&
- (SEQ_GEQ(rack->r_ctl.cleared_app_ack, tp->gput_seq))) {
+ (SEQ_GEQ(tp->gput_seq, rack->r_ctl.cleared_app_ack_seq))) {
/*
* We just cleared an application limited period
* so the next seq out needs to skip the first
@@ -19914,7 +19912,7 @@ rack_output(struct tcpcb *tp)
goto nomore;
} else {
/* Return == 0, if there is more we can send tot_len wise fall through and send */
- if (tot_len_this_send >= pace_max_seg)
+ if (tot_len_this_send >= pace_max_seg)
return (ret);
#ifdef TCP_ACCOUNTING
/* We need to re-pin since fast_output un-pined */
@@ -20043,7 +20041,7 @@ again:
rack->r_ctl.pcm_max_seg = ctf_fixed_maxseg(tp) * 10;
}
}
- if ((rack->r_ctl.pcm_max_seg != 0) && (rack->pcm_needed == 1)) {
+ if ((rack->r_ctl.pcm_max_seg != 0) && (rack->pcm_needed == 1)) {
uint32_t rw_avail, cwa;
if (tp->snd_wnd > ctf_outstanding(tp))
@@ -21031,7 +21029,7 @@ just_return_nolock:
} else
log = 1;
}
- /* Mark the last packet has app limited */
+ /* Mark the last packet as app limited */
rsm = tqhash_max(rack->r_ctl.tqh);
if (rsm && ((rsm->r_flags & RACK_APP_LIMITED) == 0)) {
if (rack->r_ctl.rc_app_limited_cnt == 0)
@@ -21555,11 +21553,7 @@ send:
m->m_next = tcp_m_copym(
mb, moff, &len,
if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, msb,
- ((rsm == NULL) ? hw_tls : 0)
-#ifdef NETFLIX_COPY_ARGS
- , &s_mb, &s_moff
-#endif
- );
+ ((rsm == NULL) ? hw_tls : 0));
if (len <= (tp->t_maxseg - optlen)) {
/*
* Must have ran out of mbufs for the copy
diff --git a/sys/netinet/tcp_stacks/rack_bbr_common.c b/sys/netinet/tcp_stacks/rack_bbr_common.c
index da26b8cb1f9b..d1c4ba58bf55 100644
--- a/sys/netinet/tcp_stacks/rack_bbr_common.c
+++ b/sys/netinet/tcp_stacks/rack_bbr_common.c
@@ -672,7 +672,7 @@ ctf_do_dropafterack(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t
(SEQ_GT(tp->snd_una, th->th_ack) ||
SEQ_GT(th->th_ack, tp->snd_max))) {
*ret_val = 1;
- ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ ctf_do_dropwithreset(m, tp, th, BANDLIM_TCP_RST, tlen);
return;
} else
*ret_val = 0;
diff --git a/sys/netinet/tcp_stacks/rack_pcm.c b/sys/netinet/tcp_stacks/rack_pcm.c
index b0e300847c4a..101e6826536c 100644
--- a/sys/netinet/tcp_stacks/rack_pcm.c
+++ b/sys/netinet/tcp_stacks/rack_pcm.c
@@ -172,7 +172,7 @@ rack_update_pcm_ack(struct tcp_rack *rack, int was_cumack, uint32_t start, uint3
goto skip_ack_accounting;
}
/*
- * Record ACK data.
+ * Record ACK data.
*/
ack_arrival = tcp_tv_to_lusectick(&rack->r_ctl.act_rcv_time);
if (SEQ_GT(end, rack->r_ctl.pcm_i.eseq)) {
@@ -305,7 +305,7 @@ skip_ack_accounting:
0, &log, false, NULL, NULL, 0, &tv);
}
}
- /*
+ /*
* Here we need a lot to be added including:
* 1) Some form of measurement, where if we think the measurement
* is valid we iterate over the PCM data and come up with a path
diff --git a/sys/netinet/tcp_stacks/sack_filter.c b/sys/netinet/tcp_stacks/sack_filter.c
index fc9ee8454a1e..2b70548f3cc6 100644
--- a/sys/netinet/tcp_stacks/sack_filter.c
+++ b/sys/netinet/tcp_stacks/sack_filter.c
@@ -400,7 +400,7 @@ sack_filter_run(struct sack_filter *sf, struct sackblk *in, int numblks, tcp_seq
break;
}
/* Copy it out to the outbound */
- memcpy(&in[at], &blkboard[i], sizeof(struct sackblk));
+ memcpy(&in[at], &blkboard[i], sizeof(struct sackblk));
at++;
room--;
/* now lets add it to our sack-board */
@@ -588,7 +588,7 @@ sack_filter_blks(struct tcpcb *tp, struct sack_filter *sf, struct sackblk *in, i
sf->sf_ack = th_ack;
for(i=0, sf->sf_cur=0; i<numblks; i++) {
- if ((in[i].end != tp->snd_max) &&
+ if ((in[i].end != tp->snd_max) &&
((in[i].end - in[i].start) < segmax)) {
/*
* We do not accept blocks less than a MSS minus all
@@ -707,7 +707,7 @@ main(int argc, char **argv)
out = stdout;
memset(&tp, 0, sizeof(tp));
tp.t_maxseg = 1460;
-
+
while ((i = getopt(argc, argv, "dIi:o:?hS:")) != -1) {
switch (i) {
case 'S':
@@ -883,7 +883,7 @@ main(int argc, char **argv)
} else {
printf("can't open sack_setup.bin -- sorry no load\n");
}
-
+
} else if (strncmp(buffer, "help", 4) == 0) {
help:
fprintf(out, "You can input:\n");
diff --git a/sys/netinet/tcp_stacks/sack_filter.h b/sys/netinet/tcp_stacks/sack_filter.h
index b12fcf84567c..a1c0684a4359 100644
--- a/sys/netinet/tcp_stacks/sack_filter.h
+++ b/sys/netinet/tcp_stacks/sack_filter.h
@@ -42,7 +42,7 @@
* previously processed sack information.
*
* The second thing that the sack filter does is help protect against malicious
- * attackers that are trying to attack any linked lists (or other data structures)
+ * attackers that are trying to attack any linked lists (or other data structures)
* that are used in sack processing. Consider an attacker sending in sacks for
* every other byte of data outstanding. This could in theory drastically split
* up any scoreboard you are maintaining and make you search through a very large
diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c
index db415f6bdf03..26e7e53d540c 100644
--- a/sys/netinet/tcp_subr.c
+++ b/sys/netinet/tcp_subr.c
@@ -4537,7 +4537,7 @@ tcp_change_time_units(struct tcpcb *tp, int granularity)
panic("Unknown granularity:%d tp:%p",
granularity, tp);
}
-#endif
+#endif
}
void
diff --git a/sys/netinet/tcp_timer.c b/sys/netinet/tcp_timer.c
index 32ce3001929c..3b9fe7a317b0 100644
--- a/sys/netinet/tcp_timer.c
+++ b/sys/netinet/tcp_timer.c
@@ -757,8 +757,8 @@ tcp_timer_rexmt(struct tcpcb *tp)
tp->t_maxseg = tp->t_pmtud_saved_maxseg;
if (tp->t_maxseg < V_tcp_mssdflt) {
/*
- * The MSS is so small we should not
- * process incoming SACK's since we are
+ * The MSS is so small we should not
+ * process incoming SACK's since we are
* subject to attack in such a case.
*/
tp->t_flags2 |= TF2_PROC_SACK_PROHIBIT;
diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c
index 687b0d538666..98c934955121 100644
--- a/sys/netinet/tcp_usrreq.c
+++ b/sys/netinet/tcp_usrreq.c
@@ -164,7 +164,7 @@ tcp_usr_attach(struct socket *so, int proto, struct thread *td)
goto out;
so->so_rcv.sb_flags |= SB_AUTOSIZE;
- so->so_snd.sb_flags |= SB_AUTOSIZE;
+ so->so_snd.sb_flags |= (SB_AUTOLOWAT | SB_AUTOSIZE);
error = in_pcballoc(so, &V_tcbinfo);
if (error)
goto out;
@@ -1768,9 +1768,9 @@ tcp_ctloutput_set(struct inpcb *inp, struct sockopt *sopt)
/*
* Release the ref count the lookup
* acquired.
- */
+ */
refcount_release(&blk->tfb_refcnt);
- /*
+ /*
* Now there is a chance that the
* init() function mucked with some
* things before it failed, such as
@@ -1800,7 +1800,7 @@ tcp_ctloutput_set(struct inpcb *inp, struct sockopt *sopt)
* new one already.
*/
refcount_release(&tp->t_fb->tfb_refcnt);
- /*
+ /*
* Set in the new stack.
*/
tp->t_fb = blk;
@@ -1934,7 +1934,7 @@ tcp_set_cc_mod(struct inpcb *inp, struct sockopt *sopt)
CC_LIST_RUNLOCK();
return(ESRCH);
}
- /*
+ /*
* With a reference the algorithm cannot be removed
* so we hold a reference through the change process.
*/
diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h
index 059b2aff689d..b90f65e83cb1 100644
--- a/sys/netinet/tcp_var.h
+++ b/sys/netinet/tcp_var.h
@@ -182,7 +182,7 @@ struct tcp_sendfile_track {
* snd_una). When the response comes back indicating
* that there was data (return value 1), then the caller
* can build a sendmap entry based on the range and the
- * times. The next query would then be done at the
+ * times. The next query would then be done at the
* newly created sendmap_end. Repeated until sendmap_end == snd_max.
*
* Flags in sendmap_flags are defined below as well.
@@ -197,7 +197,7 @@ struct tcp_sendfile_track {
* The rack_times are a misc collection of information that
* the old stack might possibly fill in. Of course its possible
* that an old stack may not have a piece of information. If so
- * then setting that value to zero is advised. Setting any
+ * then setting that value to zero is advised. Setting any
* timestamp passed should only place a zero in it when it
* is unfilled. This may mean that a time is off by a micro-second
* but this is ok in the grand scheme of things.
@@ -205,13 +205,13 @@ struct tcp_sendfile_track {
* When switching stacks it is desireable to get as much information
* from the old stack to the new stack as possible. Though not always
* will the stack be compatible in the types of information. The
- * init() function needs to take care when it begins changing
+ * init() function needs to take care when it begins changing
* things such as inp_flags2 and the timer units to position these
* changes at a point where it is unlikely they will fail after
* making such changes. A stack optionally can have an "undo"
- * function
+ * function
*
- * To transfer information to the old stack from the new in
+ * To transfer information to the old stack from the new in
* respect to LRO and the inp_flags2, the new stack should set
* the inp_flags2 to what it supports. The old stack in its
* fini() function should call the tcp_handle_orphaned_packets()
@@ -544,13 +544,13 @@ typedef enum {
* do is:
* a) Make sure that the inp_flags2 is setup correctly
* for LRO. There are two flags that the previous
- * stack may have set INP_MBUF_ACKCMP and
+ * stack may have set INP_MBUF_ACKCMP and
* INP_SUPPORTS_MBUFQ. If the new stack does not
* support these it *should* clear the flags.
* b) Make sure that the timers are in the proper
* granularity that the stack wants. The stack
* should check the t_tmr_granularity field. Currently
- * there are two values that it may hold
+ * there are two values that it may hold
* TCP_TMR_GRANULARITY_TICKS and TCP_TMR_GRANULARITY_USEC.
* Use the functions tcp_timer_convert(tp, granularity);
* to move the timers to the correct format for your stack.
@@ -558,14 +558,14 @@ typedef enum {
* The new stack may also optionally query the tfb_chg_query
* function if the old stack has one. The new stack may ask
* for one of three entries and can also state to the old
- * stack its support for the INP_MBUF_ACKCMP and
+ * stack its support for the INP_MBUF_ACKCMP and
* INP_SUPPORTS_MBUFQ. This is important since if there are
* queued ack's without that statement the old stack will
* be forced to discard the queued acks. The requests that
* can be made for information by the new stacks are:
*
* Note also that the tfb_tcp_fb_init() when called can
- * determine if a query is needed by looking at the
+ * determine if a query is needed by looking at the
* value passed in the ptr. The ptr is designed to be
* set in with any allocated memory, but the address
* of the condtion (ptr == &tp->t_fb_ptr) will be
@@ -573,17 +573,17 @@ typedef enum {
* setup of a tcb (which means no query would be needed).
* If, however, the value is not t_fb_ptr, then the caller
* is in the middle of a stack switch and is the new stack.
- * A query would be appropriate (if the new stack support
+ * A query would be appropriate (if the new stack support
* the query mechanism).
*
* TCP_QUERY_SENDMAP - Query of outstanding data.
* TCP_QUERY_TIMERS_UP - Query about running timers.
- * TCP_SUPPORTED_LRO - Declaration in req_param of
- * the inp_flags2 supported by
+ * TCP_SUPPORTED_LRO - Declaration in req_param of
+ * the inp_flags2 supported by
* the new stack.
* TCP_QUERY_RACK_TIMES - Enquire about various timestamps
* and states the old stack may be in.
- *
+ *
* tfb_tcp_fb_fini is changed to add a flag to tell
* the old stack if the tcb is being destroyed or
* not. A one in the flag means the TCB is being
diff --git a/sys/netinet/udp_usrreq.c b/sys/netinet/udp_usrreq.c
index dafbaf6dc672..42cfb919e263 100644
--- a/sys/netinet/udp_usrreq.c
+++ b/sys/netinet/udp_usrreq.c
@@ -243,7 +243,6 @@ udp_append(struct inpcb *inp, struct ip *ip, struct mbuf *n, int off,
struct sockaddr_in6 udp_in6;
#endif
struct udpcb *up;
- bool filtered;
INP_LOCK_ASSERT(inp);
@@ -252,13 +251,19 @@ udp_append(struct inpcb *inp, struct ip *ip, struct mbuf *n, int off,
*/
up = intoudpcb(inp);
if (up->u_tun_func != NULL) {
+ bool filtered;
+
in_pcbref(inp);
INP_RUNLOCK(inp);
filtered = (*up->u_tun_func)(n, off, inp, (struct sockaddr *)&udp_in[0],
up->u_tun_ctx);
INP_RLOCK(inp);
- if (filtered)
- return (in_pcbrele_rlocked(inp));
+ if (in_pcbrele_rlocked(inp))
+ return (1);
+ if (filtered) {
+ INP_RUNLOCK(inp);
+ return (1);
+ }
}
off += sizeof(struct udphdr);
diff --git a/sys/netinet6/in6_gif.c b/sys/netinet6/in6_gif.c
index d476829e8e3b..2bab1c57ce2a 100644
--- a/sys/netinet6/in6_gif.c
+++ b/sys/netinet6/in6_gif.c
@@ -194,6 +194,11 @@ in6_gif_setopts(struct gif_softc *sc, u_int options)
sc->gif_options = options;
in6_gif_attach(sc);
}
+
+ if ((options & GIF_NOCLAMP) !=
+ (sc->gif_options & GIF_NOCLAMP)) {
+ sc->gif_options = options;
+ }
return (0);
}
@@ -289,6 +294,7 @@ in6_gif_output(struct ifnet *ifp, struct mbuf *m, int proto, uint8_t ecn)
{
struct gif_softc *sc = ifp->if_softc;
struct ip6_hdr *ip6;
+ u_long mtu;
/* prepend new IP header */
NET_EPOCH_ASSERT();
@@ -304,11 +310,15 @@ in6_gif_output(struct ifnet *ifp, struct mbuf *m, int proto, uint8_t ecn)
ip6->ip6_nxt = proto;
ip6->ip6_hlim = V_ip6_gif_hlim;
/*
- * force fragmentation to minimum MTU, to avoid path MTU discovery.
- * it is too painful to ask for resend of inner packet, to achieve
- * path MTU discovery for encapsulated packets.
+ * Enforce fragmentation to minimum MTU, even if the interface MTU
+ * is larger, to avoid path MTU discovery when NOCLAMP is not
+ * set (default). IPv6 does not allow fragmentation on intermediate
+ * router nodes, so it is too painful to ask for resend of inner
+ * packet, to achieve path MTU discovery for encapsulated packets.
*/
- return (ip6_output(m, 0, NULL, IPV6_MINMTU, 0, NULL, NULL));
+ mtu = ((sc->gif_options & GIF_NOCLAMP) == 0) ? IPV6_MINMTU : 0;
+
+ return (ip6_output(m, 0, NULL, mtu, 0, NULL, NULL));
}
static int
diff --git a/sys/netinet6/mld6.c b/sys/netinet6/mld6.c
index 06fe9e8820c9..a825658bd9ee 100644
--- a/sys/netinet6/mld6.c
+++ b/sys/netinet6/mld6.c
@@ -234,17 +234,20 @@ static SYSCTL_NODE(_net_inet6_mld, OID_AUTO, ifinfo,
CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_mld_ifinfo,
"Per-interface MLDv2 state");
-static int mld_v1enable = 1;
-SYSCTL_INT(_net_inet6_mld, OID_AUTO, v1enable, CTLFLAG_RWTUN,
- &mld_v1enable, 0, "Enable fallback to MLDv1");
+VNET_DEFINE_STATIC(bool, mld_v1enable) = true;
+#define V_mld_v1enable VNET(mld_v1enable)
+SYSCTL_BOOL(_net_inet6_mld, OID_AUTO, v1enable, CTLFLAG_VNET | CTLFLAG_RWTUN,
+ &VNET_NAME(mld_v1enable), 0, "Enable fallback to MLDv1");
-static int mld_v2enable = 1;
-SYSCTL_INT(_net_inet6_mld, OID_AUTO, v2enable, CTLFLAG_RWTUN,
- &mld_v2enable, 0, "Enable MLDv2");
+VNET_DEFINE_STATIC(bool, mld_v2enable) = true;
+#define V_mld_v2enable VNET(mld_v2enable)
+SYSCTL_BOOL(_net_inet6_mld, OID_AUTO, v2enable, CTLFLAG_VNET | CTLFLAG_RWTUN,
+ &VNET_NAME(mld_v2enable), 0, "Enable MLDv2");
-static int mld_use_allow = 1;
-SYSCTL_INT(_net_inet6_mld, OID_AUTO, use_allow, CTLFLAG_RWTUN,
- &mld_use_allow, 0, "Use ALLOW/BLOCK for RFC 4604 SSM joins/leaves");
+VNET_DEFINE_STATIC(bool, mld_use_allow) = true;
+#define V_mld_use_allow VNET(mld_use_allow)
+SYSCTL_BOOL(_net_inet6_mld, OID_AUTO, use_allow, CTLFLAG_VNET | CTLFLAG_RWTUN,
+ &VNET_NAME(mld_use_allow), 0, "Use ALLOW/BLOCK for RFC 4604 SSM joins/leaves");
/*
* Packed Router Alert option structure declaration.
@@ -481,7 +484,7 @@ mld_domifattach(struct ifnet *ifp)
mbufq_init(&mli->mli_gq, MLD_MAX_RESPONSE_PACKETS);
if ((ifp->if_flags & IFF_MULTICAST) == 0)
mli->mli_flags |= MLIF_SILENT;
- if (mld_use_allow)
+ if (V_mld_use_allow)
mli->mli_flags |= MLIF_USEALLOW;
MLD_LOCK();
@@ -614,7 +617,7 @@ mld_v1_input_query(struct ifnet *ifp, const struct ip6_hdr *ip6,
is_general_query = 0;
- if (!mld_v1enable) {
+ if (!V_mld_v1enable) {
CTR3(KTR_MLD, "ignore v1 query %s on ifp %p(%s)",
ip6_sprintf(ip6tbuf, &mld->mld_addr),
ifp, if_name(ifp));
@@ -790,7 +793,7 @@ mld_v2_input_query(struct ifnet *ifp, const struct ip6_hdr *ip6,
NET_EPOCH_ASSERT();
- if (!mld_v2enable) {
+ if (!V_mld_v2enable) {
CTR3(KTR_MLD, "ignore v2 query src %s on ifp %p(%s)",
ip6_sprintf(ip6tbuf, &ip6->ip6_src),
ifp, if_name(ifp));
@@ -1076,7 +1079,7 @@ mld_v1_input_report(struct ifnet *ifp, const struct ip6_hdr *ip6,
NET_EPOCH_ASSERT();
- if (!mld_v1enable) {
+ if (!V_mld_v1enable) {
CTR3(KTR_MLD, "ignore v1 report %s on ifp %p(%s)",
ip6_sprintf(ip6tbuf, &mld->mld_addr),
ifp, if_name(ifp));
diff --git a/sys/netinet6/raw_ip6.c b/sys/netinet6/raw_ip6.c
index 0379ef7c789a..c90a1213bd66 100644
--- a/sys/netinet6/raw_ip6.c
+++ b/sys/netinet6/raw_ip6.c
@@ -765,8 +765,7 @@ rip6_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
}
if (ifa != NULL &&
((struct in6_ifaddr *)ifa)->ia6_flags &
- (IN6_IFF_ANYCAST|IN6_IFF_NOTREADY|
- IN6_IFF_DETACHED|IN6_IFF_DEPRECATED)) {
+ (IN6_IFF_NOTREADY|IN6_IFF_DETACHED|IN6_IFF_DEPRECATED)) {
NET_EPOCH_EXIT(et);
return (EADDRNOTAVAIL);
}
diff --git a/sys/netinet6/udp6_usrreq.c b/sys/netinet6/udp6_usrreq.c
index 304effa26e01..b3ed16fda713 100644
--- a/sys/netinet6/udp6_usrreq.c
+++ b/sys/netinet6/udp6_usrreq.c
@@ -142,7 +142,6 @@ udp6_append(struct inpcb *inp, struct mbuf *n, int off,
struct socket *so;
struct mbuf *opts = NULL, *tmp_opts;
struct udpcb *up;
- bool filtered;
INP_LOCK_ASSERT(inp);
@@ -151,13 +150,19 @@ udp6_append(struct inpcb *inp, struct mbuf *n, int off,
*/
up = intoudpcb(inp);
if (up->u_tun_func != NULL) {
+ bool filtered;
+
in_pcbref(inp);
INP_RUNLOCK(inp);
filtered = (*up->u_tun_func)(n, off, inp,
(struct sockaddr *)&fromsa[0], up->u_tun_ctx);
INP_RLOCK(inp);
- if (filtered)
- return (in_pcbrele_rlocked(inp));
+ if (in_pcbrele_rlocked(inp))
+ return (1);
+ if (filtered) {
+ INP_RUNLOCK(inp);
+ return (1);
+ }
}
off += sizeof(struct udphdr);
diff --git a/sys/netipsec/ipsec.c b/sys/netipsec/ipsec.c
index 6bacc68b7441..92d0201b398a 100644
--- a/sys/netipsec/ipsec.c
+++ b/sys/netipsec/ipsec.c
@@ -636,8 +636,10 @@ ipsec4_in_reject1(const struct mbuf *m, struct ip *ip1, struct inpcb *inp)
#ifdef IPSEC_OFFLOAD
tag = ipsec_accel_input_tag_lookup(m);
- if (tag != NULL)
- return (0);
+ if (tag != NULL) {
+ tag->tag.m_tag_id = PACKET_TAG_IPSEC_IN_DONE;
+ __DECONST(struct mbuf *, m)->m_flags |= M_DECRYPTED;
+ }
#endif
if (ip1 == NULL) {
diff --git a/sys/netipsec/ipsec_offload.c b/sys/netipsec/ipsec_offload.c
index 467d5ded1d7a..8a09d5f37b4a 100644
--- a/sys/netipsec/ipsec_offload.c
+++ b/sys/netipsec/ipsec_offload.c
@@ -94,6 +94,7 @@ struct ifp_handle_sav {
size_t hdr_ext_size;
uint64_t cnt_octets;
uint64_t cnt_allocs;
+ struct xform_history xfh;
};
#define IFP_HS_HANDLED 0x00000001
@@ -159,6 +160,8 @@ static void ipsec_accel_drv_sa_lifetime_update_impl(struct secasvar *sav,
static int ipsec_accel_drv_sa_lifetime_fetch_impl(struct secasvar *sav,
if_t ifp, u_int drv_spi, uint64_t *octets, uint64_t *allocs);
static void ipsec_accel_ifdetach_event(void *arg, struct ifnet *ifp);
+static bool ipsec_accel_fill_xh_impl(if_t ifp, uint32_t drv_spi,
+ struct xform_history *xh);
static void
ipsec_accel_init(void *arg)
@@ -185,6 +188,7 @@ ipsec_accel_init(void *arg)
ipsec_accel_drv_sa_lifetime_update_impl;
ipsec_accel_drv_sa_lifetime_fetch_p =
ipsec_accel_drv_sa_lifetime_fetch_impl;
+ ipsec_accel_fill_xh_p = ipsec_accel_fill_xh_impl;
pctrie_init(&drv_spi_pctrie);
ipsec_accel_ifdetach_event_tag = EVENTHANDLER_REGISTER(
ifnet_departure_event, ipsec_accel_ifdetach_event, NULL,
@@ -209,6 +213,7 @@ ipsec_accel_fini(void *arg)
ipsec_accel_on_ifdown_p = NULL;
ipsec_accel_drv_sa_lifetime_update_p = NULL;
ipsec_accel_drv_sa_lifetime_fetch_p = NULL;
+ ipsec_accel_fill_xh_p = NULL;
ipsec_accel_sync_imp();
clean_unrhdr(drv_spi_unr); /* avoid panic, should go later */
clear_unrhdr(drv_spi_unr);
@@ -412,6 +417,10 @@ ipsec_accel_handle_sav(struct secasvar *sav, struct ifnet *ifp,
ihs->ifdata = priv;
ihs->flags = flags;
ihs->hdr_ext_size = esp_hdrsiz(sav);
+ memcpy(&ihs->xfh.dst, &sav->sah->saidx.dst, sizeof(ihs->xfh.dst));
+ ihs->xfh.spi = sav->spi;
+ ihs->xfh.proto = sav->sah->saidx.proto;
+ ihs->xfh.mode = sav->sah->saidx.mode;
mtx_lock(&ipsec_accel_sav_tmp);
CK_LIST_FOREACH(i, &sav->accel_ifps, sav_link) {
if (i->ifp == ifp) {
@@ -1162,4 +1171,20 @@ ipsec_accel_key_setaccelif_impl(struct secasvar *sav)
return (m);
}
+static bool
+ipsec_accel_fill_xh_impl(if_t ifp, uint32_t drv_spi, struct xform_history *xh)
+{
+ struct ifp_handle_sav *i;
+
+ if (drv_spi < IPSEC_ACCEL_DRV_SPI_MIN ||
+ drv_spi > IPSEC_ACCEL_DRV_SPI_MAX)
+ return (false);
+
+ i = DRVSPI_SA_PCTRIE_LOOKUP(&drv_spi_pctrie, drv_spi);
+ if (i == NULL)
+ return (false);
+ memcpy(xh, &i->xfh, sizeof(*xh));
+ return (true);
+}
+
#endif /* IPSEC_OFFLOAD */
diff --git a/sys/netipsec/ipsec_offload.h b/sys/netipsec/ipsec_offload.h
index 904fe6252396..ae60eaa8ae78 100644
--- a/sys/netipsec/ipsec_offload.h
+++ b/sys/netipsec/ipsec_offload.h
@@ -30,6 +30,7 @@
#include <sys/errno.h>
#include <net/if.h>
#include <net/if_var.h>
+#include <netipsec/xform.h>
struct secpolicy;
struct secasvar;
@@ -42,6 +43,7 @@ struct ipsec_accel_out_tag {
struct ipsec_accel_in_tag {
struct m_tag tag;
+ struct xform_history xh; /* Must be first to mimic IPSEC_IN_DONE */
uint16_t drv_spi;
};
@@ -66,6 +68,8 @@ extern void (*ipsec_accel_drv_sa_lifetime_update_p)(struct secasvar *sav,
if_t ifp, u_int drv_spi, uint64_t octets, uint64_t allocs);
extern int (*ipsec_accel_drv_sa_lifetime_fetch_p)(struct secasvar *sav,
if_t ifp, u_int drv_spi, uint64_t *octets, uint64_t *allocs);
+extern bool (*ipsec_accel_fill_xh_p)(if_t ifp, uint32_t drv_spi,
+ struct xform_history *xh);
#ifdef IPSEC_OFFLOAD
/*
@@ -158,6 +162,16 @@ ipsec_accel_key_setaccelif(struct secasvar *sav)
return (NULL);
}
+static inline bool
+ipsec_accel_fill_xh(if_t ifp, uint32_t drv_spi, struct xform_history *xh)
+{
+ bool (*p)(if_t ifp, uint32_t drv_spi, struct xform_history *xh);
+
+ p = atomic_load_ptr(&ipsec_accel_fill_xh_p);
+ if (p != NULL)
+ return (p(ifp, drv_spi, xh));
+ return (false);
+}
#else
#define ipsec_accel_sa_newkey(a)
@@ -168,6 +182,7 @@ ipsec_accel_key_setaccelif(struct secasvar *sav)
#define ipsec_accel_sync()
#define ipsec_accel_is_accel_sav(a)
#define ipsec_accel_key_setaccelif(a)
+#define ipsec_accel_fill_xh(a, b, c) (false)
#endif
void ipsec_accel_forget_sav_impl(struct secasvar *sav);
@@ -180,6 +195,7 @@ bool ipsec_accel_output(struct ifnet *ifp, struct mbuf *m,
struct inpcb *inp, struct secpolicy *sp, struct secasvar *sav, int af,
int mtu, int *hwassist);
void ipsec_accel_forget_sav(struct secasvar *sav);
+struct xform_history;
#else
#define ipsec_accel_input(a, b, c) (ENXIO)
#define ipsec_accel_output(a, b, c, d, e, f, g, h) ({ \
diff --git a/sys/netipsec/key.c b/sys/netipsec/key.c
index ae67d83c6d13..4ba1b49c24f0 100644
--- a/sys/netipsec/key.c
+++ b/sys/netipsec/key.c
@@ -114,6 +114,8 @@ void (*ipsec_accel_drv_sa_lifetime_update_p)(struct secasvar *sav, if_t ifp,
u_int drv_spi, uint64_t octets, uint64_t allocs);
int (*ipsec_accel_drv_sa_lifetime_fetch_p)(struct secasvar *sav, if_t ifp,
u_int drv_spi, uint64_t *octets, uint64_t *allocs);
+bool (*ipsec_accel_fill_xh_p)(if_t ifp, uint32_t drv_spi,
+ struct xform_history *xh);
#endif
#define FULLMASK 0xff
diff --git a/sys/netlink/netlink_message_parser.h b/sys/netlink/netlink_message_parser.h
index 8492ecb3021b..720317ed74f3 100644
--- a/sys/netlink/netlink_message_parser.h
+++ b/sys/netlink/netlink_message_parser.h
@@ -209,7 +209,8 @@ int nlattr_get_nested(struct nlattr *nla, struct nl_pstate *npt,
int nlattr_get_nested_ptr(struct nlattr *nla, struct nl_pstate *npt,
const void *arg, void *target);
-bool nlmsg_report_err_msg(struct nl_pstate *npt, const char *fmt, ...);
+bool nlmsg_report_err_msg(struct nl_pstate *npt, const char *fmt, ...)
+ __printflike(2, 3);
#define NLMSG_REPORT_ERR_MSG(_npt, _fmt, ...) { \
nlmsg_report_err_msg(_npt, _fmt, ## __VA_ARGS__); \
diff --git a/sys/netpfil/ipfilter/netinet/ip_fil_freebsd.c b/sys/netpfil/ipfilter/netinet/ip_fil_freebsd.c
index 04850549db98..6eb6cf2a7a47 100644
--- a/sys/netpfil/ipfilter/netinet/ip_fil_freebsd.c
+++ b/sys/netpfil/ipfilter/netinet/ip_fil_freebsd.c
@@ -463,13 +463,14 @@ ipf_send_ip(fr_info_t *fin, mb_t *m)
int
ipf_send_icmp_err(int type, fr_info_t *fin, int dst)
{
- int err, hlen, xtra, iclen, ohlen, avail, code;
+ int err, hlen, xtra, iclen, ohlen, avail;
struct in_addr dst4;
struct icmp *icmp;
struct mbuf *m;
i6addr_t dst6;
void *ifp;
#ifdef USE_INET6
+ int code;
ip6_t *ip6;
#endif
ip_t *ip, *ip2;
@@ -477,8 +478,8 @@ ipf_send_icmp_err(int type, fr_info_t *fin, int dst)
if ((type < 0) || (type >= ICMP_MAXTYPE))
return (-1);
- code = fin->fin_icode;
#ifdef USE_INET6
+ code = fin->fin_icode;
/* See NetBSD ip_fil_netbsd.c r1.4: */
if ((code < 0) || (code >= sizeof(icmptoicmp6unreach)/sizeof(int)))
return (-1);
diff --git a/sys/netpfil/ipfw/ip_fw2.c b/sys/netpfil/ipfw/ip_fw2.c
index 923633d76df7..c129c8c49921 100644
--- a/sys/netpfil/ipfw/ip_fw2.c
+++ b/sys/netpfil/ipfw/ip_fw2.c
@@ -196,7 +196,7 @@ SYSCTL_NODE(_net_inet_ip, OID_AUTO, fw, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
"Firewall");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, one_pass,
CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_one_pass), 0,
- "Only do a single pass through ipfw when using dummynet(4)");
+ "Only do a single pass through ipfw when using dummynet(4), ipfw_nat or other divert(4)-like interfaces");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, autoinc_step,
CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(autoinc_step), 0,
"Rule number auto-increment step");
diff --git a/sys/netpfil/pf/if_pflog.c b/sys/netpfil/pf/if_pflog.c
index 0a84f9d680ac..cb96d2fcc44c 100644
--- a/sys/netpfil/pf/if_pflog.c
+++ b/sys/netpfil/pf/if_pflog.c
@@ -284,9 +284,9 @@ pflog_packet(uint8_t action, u_int8_t reason,
* state lock, since this leads to unsafe LOR.
* These conditions are very very rare, however.
*/
- if (trigger->log & PF_LOG_SOCKET_LOOKUP && !pd->lookup.done && lookupsafe)
+ if (trigger->log & PF_LOG_USER && !pd->lookup.done && lookupsafe)
pd->lookup.done = pf_socket_lookup(pd);
- if (pd->lookup.done > 0)
+ if (trigger->log & PF_LOG_USER && pd->lookup.done > 0)
hdr.uid = pd->lookup.uid;
else
hdr.uid = -1;
diff --git a/sys/netpfil/pf/if_pfsync.c b/sys/netpfil/pf/if_pfsync.c
index 2391edaf1a5a..ee10a997c977 100644
--- a/sys/netpfil/pf/if_pfsync.c
+++ b/sys/netpfil/pf/if_pfsync.c
@@ -110,8 +110,6 @@
#include <netpfil/pf/pfsync_nv.h>
-#define DPFPRINTF(n, x) if (V_pf_status.debug >= (n)) printf x
-
struct pfsync_bucket;
struct pfsync_softc;
@@ -532,6 +530,7 @@ pfsync_state_import(union pfsync_state_union *sp, int flags, int msg_version)
struct pf_kpooladdr *rpool_first;
int error;
uint8_t rt = 0;
+ int n = 0;
PF_RULES_RASSERT();
@@ -557,10 +556,12 @@ pfsync_state_import(union pfsync_state_union *sp, int flags, int msg_version)
*/
if (sp->pfs_1301.rule != htonl(-1) && sp->pfs_1301.anchor == htonl(-1) &&
(flags & (PFSYNC_SI_IOCTL | PFSYNC_SI_CKSUM)) && ntohl(sp->pfs_1301.rule) <
- pf_main_ruleset.rules[PF_RULESET_FILTER].active.rcount)
- r = pf_main_ruleset.rules[
- PF_RULESET_FILTER].active.ptr_array[ntohl(sp->pfs_1301.rule)];
- else
+ pf_main_ruleset.rules[PF_RULESET_FILTER].active.rcount) {
+ TAILQ_FOREACH(r, pf_main_ruleset.rules[
+ PF_RULESET_FILTER].active.ptr, entries)
+ if (ntohl(sp->pfs_1301.rule) == n++)
+ break;
+ } else
r = &V_pf_default_rule;
/*
@@ -594,9 +595,9 @@ pfsync_state_import(union pfsync_state_union *sp, int flags, int msg_version)
if ((rpool_first == NULL) ||
(TAILQ_NEXT(rpool_first, entries) != NULL)) {
DPFPRINTF(PF_DEBUG_MISC,
- ("%s: can't recover routing information "
- "because of empty or bad redirection pool\n",
- __func__));
+ "%s: can't recover routing information "
+ "because of empty or bad redirection pool",
+ __func__);
return ((flags & PFSYNC_SI_IOCTL) ? EINVAL : 0);
}
rt = r->rt;
@@ -607,8 +608,8 @@ pfsync_state_import(union pfsync_state_union *sp, int flags, int msg_version)
* give up on recovering.
*/
DPFPRINTF(PF_DEBUG_MISC,
- ("%s: can't recover routing information "
- "because of different ruleset\n", __func__));
+ "%s: can't recover routing information "
+ "because of different ruleset", __func__);
return ((flags & PFSYNC_SI_IOCTL) ? EINVAL : 0);
}
break;
@@ -621,8 +622,8 @@ pfsync_state_import(union pfsync_state_union *sp, int flags, int msg_version)
rt_kif = pfi_kkif_find(sp->pfs_1400.rt_ifname);
if (rt_kif == NULL) {
DPFPRINTF(PF_DEBUG_MISC,
- ("%s: unknown route interface: %s\n",
- __func__, sp->pfs_1400.rt_ifname));
+ "%s: unknown route interface: %s",
+ __func__, sp->pfs_1400.rt_ifname);
return ((flags & PFSYNC_SI_IOCTL) ? EINVAL : 0);
}
rt = sp->pfs_1400.rt;
diff --git a/sys/netpfil/pf/pf.c b/sys/netpfil/pf/pf.c
index d5f01e5c4956..41e9ca27912d 100644
--- a/sys/netpfil/pf/pf.c
+++ b/sys/netpfil/pf/pf.c
@@ -119,8 +119,6 @@
#include <machine/in_cksum.h>
#include <security/mac/mac_framework.h>
-#define DPFPRINTF(n, x) if (V_pf_status.debug >= (n)) printf x
-
SDT_PROVIDER_DEFINE(pf);
SDT_PROBE_DEFINE2(pf, , test, reason_set, "int", "int");
SDT_PROBE_DEFINE4(pf, ip, test, done, "int", "int", "struct pf_krule *",
@@ -161,6 +159,7 @@ SDT_PROBE_DEFINE2(pf, eth, test_rule, match, "int", "struct pf_keth_rule *");
SDT_PROBE_DEFINE2(pf, eth, test_rule, final_match,
"int", "struct pf_keth_rule *");
SDT_PROBE_DEFINE2(pf, purge, state, rowcount, "int", "size_t");
+SDT_PROBE_DEFINE2(pf, , log, log, "int", "const char *");
/*
* Global variables
@@ -375,6 +374,8 @@ static u_int16_t pf_calc_mss(struct pf_addr *, sa_family_t,
int, u_int16_t);
static int pf_check_proto_cksum(struct mbuf *, int, int,
u_int8_t, sa_family_t);
+static int pf_walk_option(struct pf_pdesc *, struct ip *,
+ int, int, u_short *);
static int pf_walk_header(struct pf_pdesc *, struct ip *, u_short *);
#ifdef INET6
static int pf_walk_option6(struct pf_pdesc *, struct ip6_hdr *,
@@ -4615,8 +4616,8 @@ pf_match_rcvif(struct mbuf *m, struct pf_krule *r)
if (kif == NULL) {
DPFPRINTF(PF_DEBUG_URGENT,
- ("%s: kif == NULL, @%d via %s\n", __func__, r->nr,
- r->rcv_ifname));
+ "%s: kif == NULL, @%d via %s", __func__, r->nr,
+ r->rcv_ifname);
return (0);
}
@@ -4975,7 +4976,7 @@ pf_socket_lookup(struct pf_pdesc *pd)
}
INP_RLOCK_ASSERT(inp);
pd->lookup.uid = inp->inp_cred->cr_uid;
- pd->lookup.gid = inp->inp_cred->cr_groups[0];
+ pd->lookup.gid = inp->inp_cred->cr_gid;
INP_RUNLOCK(inp);
return (1);
@@ -5242,8 +5243,8 @@ pf_test_eth_rule(int dir, struct pfi_kkif *kif, struct mbuf **m0)
if (__predict_false(m->m_len < sizeof(struct ether_header)) &&
(m = *m0 = m_pullup(*m0, sizeof(struct ether_header))) == NULL) {
DPFPRINTF(PF_DEBUG_URGENT,
- ("%s: m_len < sizeof(struct ether_header)"
- ", pullup failed\n", __func__));
+ "%s: m_len < sizeof(struct ether_header)"
+ ", pullup failed", __func__);
return (PF_DROP);
}
e = mtod(m, struct ether_header *);
@@ -5759,7 +5760,7 @@ pf_test_rule(struct pf_krule **rm, struct pf_kstate **sm,
if (inp != NULL) {
INP_LOCK_ASSERT(inp);
pd->lookup.uid = inp->inp_cred->cr_uid;
- pd->lookup.gid = inp->inp_cred->cr_groups[0];
+ pd->lookup.gid = inp->inp_cred->cr_gid;
pd->lookup.done = 1;
}
@@ -5901,18 +5902,17 @@ pf_test_rule(struct pf_krule **rm, struct pf_kstate **sm,
M_SETFIB(pd->m, pd->act.rtableid);
if (r->rt) {
- struct pf_ksrc_node *sn = NULL;
- struct pf_srchash *snh = NULL;
/*
* Set act.rt here instead of in pf_rule_to_actions() because
* it is applied only from the last pass rule.
*/
pd->act.rt = r->rt;
- /* Don't use REASON_SET, pf_map_addr increases the reason counters */
- ctx.reason = pf_map_addr_sn(pd->af, r, pd->src, &pd->act.rt_addr,
- &pd->act.rt_kif, NULL, &sn, &snh, &(r->route), PF_SN_ROUTE);
- if (ctx.reason != 0)
+ if ((transerror = pf_map_addr_sn(pd->af, r, pd->src,
+ &pd->act.rt_addr, &pd->act.rt_kif, NULL, &(r->route),
+ PF_SN_ROUTE)) != PFRES_MATCH) {
+ REASON_SET(&ctx.reason, transerror);
goto cleanup;
+ }
}
if (pd->virtual_proto != PF_VPROTO_FRAGMENT &&
@@ -6056,9 +6056,16 @@ pf_create_state(struct pf_krule *r, struct pf_test_ctx *ctx,
/* src node for translation rule */
if (ctx->nr != NULL) {
KASSERT(ctx->nat_pool != NULL, ("%s: nat_pool is NULL", __func__));
+ /*
+ * The NAT addresses are chosen during ruleset parsing.
+ * The new afto code stores post-nat addresses in nsaddr.
+ * The old nat code (also used for new nat-to rules) creates
+ * state keys and stores addresses in them.
+ */
if ((ctx->nat_pool->opts & PF_POOL_STICKYADDR) &&
(sn_reason = pf_insert_src_node(sns, snhs, ctx->nr,
- &ctx->sk->addr[pd->sidx], pd->af, &ctx->nk->addr[1], NULL,
+ ctx->sk ? &(ctx->sk->addr[pd->sidx]) : pd->src, pd->af,
+ ctx->nk ? &(ctx->nk->addr[1]) : &(pd->nsaddr), NULL,
PF_SN_NAT)) != 0 ) {
REASON_SET(&ctx->reason, sn_reason);
goto csfailed;
@@ -6162,8 +6169,8 @@ pf_create_state(struct pf_krule *r, struct pf_test_ctx *ctx,
&s->src, &s->dst, &ctx->rewrite)) {
/* This really shouldn't happen!!! */
DPFPRINTF(PF_DEBUG_URGENT,
- ("%s: tcp normalize failed on first "
- "pkt\n", __func__));
+ "%s: tcp normalize failed on first "
+ "pkt", __func__);
goto csfailed;
}
} else if (pd->proto == IPPROTO_SCTP) {
@@ -6213,7 +6220,7 @@ pf_create_state(struct pf_krule *r, struct pf_test_ctx *ctx,
if (ctx->tag > 0)
s->tag = ctx->tag;
if (pd->proto == IPPROTO_TCP && (tcp_get_flags(th) & (TH_SYN|TH_ACK)) ==
- TH_SYN && r->keep_state == PF_STATE_SYNPROXY) {
+ TH_SYN && r->keep_state == PF_STATE_SYNPROXY && pd->dir == PF_IN) {
pf_set_protostate(s, PF_PEER_SRC, PF_TCPS_PROXY_SRC);
pf_undo_nat(ctx->nr, pd, bip_sum);
s->src.seqhi = arc4random();
@@ -7959,8 +7966,8 @@ pf_test_state_icmp(struct pf_kstate **state, struct pf_pdesc *pd,
if (!pf_pull_hdr(pd->m, ipoff2, &h2, sizeof(h2),
NULL, reason, pd2.af)) {
DPFPRINTF(PF_DEBUG_MISC,
- ("pf: ICMP error message too short "
- "(ip)\n"));
+ "pf: ICMP error message too short "
+ "(ip)");
return (PF_DROP);
}
/*
@@ -7990,8 +7997,8 @@ pf_test_state_icmp(struct pf_kstate **state, struct pf_pdesc *pd,
if (!pf_pull_hdr(pd->m, ipoff2, &h2_6, sizeof(h2_6),
NULL, reason, pd2.af)) {
DPFPRINTF(PF_DEBUG_MISC,
- ("pf: ICMP error message too short "
- "(ip6)\n"));
+ "pf: ICMP error message too short "
+ "(ip6)");
return (PF_DROP);
}
pd2.off = ipoff2;
@@ -8043,8 +8050,8 @@ pf_test_state_icmp(struct pf_kstate **state, struct pf_pdesc *pd,
if (!pf_pull_hdr(pd->m, pd2.off, th, 8, NULL, reason,
pd2.af)) {
DPFPRINTF(PF_DEBUG_MISC,
- ("pf: ICMP error message too short "
- "(tcp)\n"));
+ "pf: ICMP error message too short "
+ "(tcp)");
return (PF_DROP);
}
pd2.pcksum = &pd2.hdr.tcp.th_sum;
@@ -8238,8 +8245,8 @@ pf_test_state_icmp(struct pf_kstate **state, struct pf_pdesc *pd,
if (!pf_pull_hdr(pd->m, pd2.off, uh, sizeof(*uh),
NULL, reason, pd2.af)) {
DPFPRINTF(PF_DEBUG_MISC,
- ("pf: ICMP error message too short "
- "(udp)\n"));
+ "pf: ICMP error message too short "
+ "(udp)");
return (PF_DROP);
}
pd2.pcksum = &pd2.hdr.udp.uh_sum;
@@ -8370,8 +8377,8 @@ pf_test_state_icmp(struct pf_kstate **state, struct pf_pdesc *pd,
if (! pf_pull_hdr(pd->m, pd2.off, sh, sizeof(*sh), NULL, reason,
pd2.af)) {
DPFPRINTF(PF_DEBUG_MISC,
- ("pf: ICMP error message too short "
- "(sctp)\n"));
+ "pf: ICMP error message too short "
+ "(sctp)");
return (PF_DROP);
}
pd2.pcksum = &pd2.sctp_dummy_sum;
@@ -8401,8 +8408,8 @@ pf_test_state_icmp(struct pf_kstate **state, struct pf_pdesc *pd,
if (src->scrub->pfss_v_tag != sh->v_tag) {
DPFPRINTF(PF_DEBUG_MISC,
- ("pf: ICMP error message has incorrect "
- "SCTP v_tag\n"));
+ "pf: ICMP error message has incorrect "
+ "SCTP v_tag");
return (PF_DROP);
}
@@ -8525,8 +8532,8 @@ pf_test_state_icmp(struct pf_kstate **state, struct pf_pdesc *pd,
if (!pf_pull_hdr(pd->m, pd2.off, iih, ICMP_MINLEN,
NULL, reason, pd2.af)) {
DPFPRINTF(PF_DEBUG_MISC,
- ("pf: ICMP error message too short i"
- "(icmp)\n"));
+ "pf: ICMP error message too short i"
+ "(icmp)");
return (PF_DROP);
}
pd2.pcksum = &pd2.hdr.icmp.icmp_cksum;
@@ -8645,8 +8652,8 @@ pf_test_state_icmp(struct pf_kstate **state, struct pf_pdesc *pd,
if (!pf_pull_hdr(pd->m, pd2.off, iih,
sizeof(struct icmp6_hdr), NULL, reason, pd2.af)) {
DPFPRINTF(PF_DEBUG_MISC,
- ("pf: ICMP error message too short "
- "(icmp6)\n"));
+ "pf: ICMP error message too short "
+ "(icmp6)");
return (PF_DROP);
}
pd2.pcksum = &pd2.hdr.icmp6.icmp6_cksum;
@@ -9062,6 +9069,9 @@ pf_route(struct pf_krule *r, struct ifnet *oifp,
goto bad;
}
+ if (r->rt == PF_DUPTO)
+ skip_test = true;
+
if (pd->dir == PF_IN && !skip_test) {
if (pf_test(AF_INET, PF_OUT, PFIL_FWD, ifp, &m0, inp,
&pd->act) != PF_PASS) {
@@ -9073,7 +9083,7 @@ pf_route(struct pf_krule *r, struct ifnet *oifp,
}
if (m0->m_len < sizeof(struct ip)) {
DPFPRINTF(PF_DEBUG_URGENT,
- ("%s: m0->m_len < sizeof(struct ip)\n", __func__));
+ "%s: m0->m_len < sizeof(struct ip)", __func__);
SDT_PROBE1(pf, ip, route_to, drop, __LINE__);
goto bad;
}
@@ -9364,6 +9374,9 @@ pf_route6(struct pf_krule *r, struct ifnet *oifp,
goto bad;
}
+ if (r->rt == PF_DUPTO)
+ skip_test = true;
+
if (pd->dir == PF_IN && !skip_test) {
if (pf_test(AF_INET6, PF_OUT, PFIL_FWD | PF_PFIL_NOREFRAGMENT,
ifp, &m0, inp, &pd->act) != PF_PASS) {
@@ -9375,8 +9388,8 @@ pf_route6(struct pf_krule *r, struct ifnet *oifp,
}
if (m0->m_len < sizeof(struct ip6_hdr)) {
DPFPRINTF(PF_DEBUG_URGENT,
- ("%s: m0->m_len < sizeof(struct ip6_hdr)\n",
- __func__));
+ "%s: m0->m_len < sizeof(struct ip6_hdr)",
+ __func__);
SDT_PROBE1(pf, ip6, route_to, drop, __LINE__);
goto bad;
}
@@ -9671,7 +9684,7 @@ pf_test_eth(int dir, int pflags, struct ifnet *ifp, struct mbuf **m0,
if (kif == NULL) {
DPFPRINTF(PF_DEBUG_URGENT,
- ("%s: kif == NULL, if_xname %s\n", __func__, ifp->if_xname));
+ "%s: kif == NULL, if_xname %s", __func__, ifp->if_xname);
return (PF_DROP);
}
if (kif->pfik_flags & PFI_IFLAG_SKIP)
@@ -9786,6 +9799,62 @@ pf_dummynet_route(struct pf_pdesc *pd, struct pf_kstate *s,
}
static int
+pf_walk_option(struct pf_pdesc *pd, struct ip *h, int off, int end,
+ u_short *reason)
+{
+ uint8_t type, length, opts[15 * 4 - sizeof(struct ip)];
+
+ /* IP header in payload of ICMP packet may be too short */
+ if (pd->m->m_pkthdr.len < end) {
+ DPFPRINTF(PF_DEBUG_MISC, "IP option too short");
+ REASON_SET(reason, PFRES_SHORT);
+ return (PF_DROP);
+ }
+
+ MPASS(end - off <= sizeof(opts));
+ m_copydata(pd->m, off, end - off, opts);
+ end -= off;
+ off = 0;
+
+ while (off < end) {
+ type = opts[off];
+ if (type == IPOPT_EOL)
+ break;
+ if (type == IPOPT_NOP) {
+ off++;
+ continue;
+ }
+ if (off + 2 > end) {
+ DPFPRINTF(PF_DEBUG_MISC, "IP length opt");
+ REASON_SET(reason, PFRES_IPOPTIONS);
+ return (PF_DROP);
+ }
+ length = opts[off + 1];
+ if (length < 2) {
+ DPFPRINTF(PF_DEBUG_MISC, "IP short opt");
+ REASON_SET(reason, PFRES_IPOPTIONS);
+ return (PF_DROP);
+ }
+ if (off + length > end) {
+ DPFPRINTF(PF_DEBUG_MISC, "IP long opt");
+ REASON_SET(reason, PFRES_IPOPTIONS);
+ return (PF_DROP);
+ }
+ switch (type) {
+ case IPOPT_RA:
+ pd->badopts |= PF_OPT_ROUTER_ALERT;
+ break;
+ default:
+ pd->badopts |= PF_OPT_OTHER;
+ break;
+ }
+ off += length;
+ }
+
+ return (PF_PASS);
+}
+
+static int
pf_walk_header(struct pf_pdesc *pd, struct ip *h, u_short *reason)
{
struct ah ext;
@@ -9797,11 +9866,28 @@ pf_walk_header(struct pf_pdesc *pd, struct ip *h, u_short *reason)
REASON_SET(reason, PFRES_SHORT);
return (PF_DROP);
}
- if (hlen != sizeof(struct ip))
- pd->badopts++;
+ if (hlen != sizeof(struct ip)) {
+ if (pf_walk_option(pd, h, pd->off + sizeof(struct ip),
+ pd->off + hlen, reason) != PF_PASS)
+ return (PF_DROP);
+ /* header options which contain only padding is fishy */
+ if (pd->badopts == 0)
+ pd->badopts |= PF_OPT_OTHER;
+ }
end = pd->off + ntohs(h->ip_len);
pd->off += hlen;
pd->proto = h->ip_p;
+ /* IGMP packets have router alert options, allow them */
+ if (pd->proto == IPPROTO_IGMP) {
+ /* According to RFC 1112 ttl must be set to 1. */
+ if ((h->ip_ttl != 1) ||
+ !IN_MULTICAST(ntohl(h->ip_dst.s_addr))) {
+ DPFPRINTF(PF_DEBUG_MISC, "Invalid IGMP");
+ REASON_SET(reason, PFRES_IPOPTIONS);
+ return (PF_DROP);
+ }
+ pd->badopts &= ~PF_OPT_ROUTER_ALERT;
+ }
/* stop walking over non initial fragments */
if ((h->ip_off & htons(IP_OFFMASK)) != 0)
return (PF_PASS);
@@ -9814,7 +9900,7 @@ pf_walk_header(struct pf_pdesc *pd, struct ip *h, u_short *reason)
return (PF_PASS);
if (!pf_pull_hdr(pd->m, pd->off, &ext, sizeof(ext),
NULL, reason, AF_INET)) {
- DPFPRINTF(PF_DEBUG_MISC, ("IP short exthdr"));
+ DPFPRINTF(PF_DEBUG_MISC, "IP short exthdr");
return (PF_DROP);
}
pd->off += (ext.ah_len + 2) * 4;
@@ -9824,7 +9910,7 @@ pf_walk_header(struct pf_pdesc *pd, struct ip *h, u_short *reason)
return (PF_PASS);
}
}
- DPFPRINTF(PF_DEBUG_MISC, ("IPv4 nested authentication header limit"));
+ DPFPRINTF(PF_DEBUG_MISC, "IPv4 nested authentication header limit");
REASON_SET(reason, PFRES_IPOPTIONS);
return (PF_DROP);
}
@@ -9840,7 +9926,7 @@ pf_walk_option6(struct pf_pdesc *pd, struct ip6_hdr *h, int off, int end,
while (off < end) {
if (!pf_pull_hdr(pd->m, off, &opt.ip6o_type,
sizeof(opt.ip6o_type), NULL, reason, AF_INET6)) {
- DPFPRINTF(PF_DEBUG_MISC, ("IPv6 short opt type"));
+ DPFPRINTF(PF_DEBUG_MISC, "IPv6 short opt type");
return (PF_DROP);
}
if (opt.ip6o_type == IP6OPT_PAD1) {
@@ -9849,41 +9935,48 @@ pf_walk_option6(struct pf_pdesc *pd, struct ip6_hdr *h, int off, int end,
}
if (!pf_pull_hdr(pd->m, off, &opt, sizeof(opt), NULL,
reason, AF_INET6)) {
- DPFPRINTF(PF_DEBUG_MISC, ("IPv6 short opt"));
+ DPFPRINTF(PF_DEBUG_MISC, "IPv6 short opt");
return (PF_DROP);
}
if (off + sizeof(opt) + opt.ip6o_len > end) {
- DPFPRINTF(PF_DEBUG_MISC, ("IPv6 long opt"));
+ DPFPRINTF(PF_DEBUG_MISC, "IPv6 long opt");
REASON_SET(reason, PFRES_IPOPTIONS);
return (PF_DROP);
}
switch (opt.ip6o_type) {
+ case IP6OPT_PADN:
+ break;
case IP6OPT_JUMBO:
+ pd->badopts |= PF_OPT_JUMBO;
if (pd->jumbolen != 0) {
- DPFPRINTF(PF_DEBUG_MISC, ("IPv6 multiple jumbo"));
+ DPFPRINTF(PF_DEBUG_MISC, "IPv6 multiple jumbo");
REASON_SET(reason, PFRES_IPOPTIONS);
return (PF_DROP);
}
if (ntohs(h->ip6_plen) != 0) {
- DPFPRINTF(PF_DEBUG_MISC, ("IPv6 bad jumbo plen"));
+ DPFPRINTF(PF_DEBUG_MISC, "IPv6 bad jumbo plen");
REASON_SET(reason, PFRES_IPOPTIONS);
return (PF_DROP);
}
if (!pf_pull_hdr(pd->m, off, &jumbo, sizeof(jumbo), NULL,
reason, AF_INET6)) {
- DPFPRINTF(PF_DEBUG_MISC, ("IPv6 short jumbo"));
+ DPFPRINTF(PF_DEBUG_MISC, "IPv6 short jumbo");
return (PF_DROP);
}
memcpy(&pd->jumbolen, jumbo.ip6oj_jumbo_len,
sizeof(pd->jumbolen));
pd->jumbolen = ntohl(pd->jumbolen);
if (pd->jumbolen < IPV6_MAXPACKET) {
- DPFPRINTF(PF_DEBUG_MISC, ("IPv6 short jumbolen"));
+ DPFPRINTF(PF_DEBUG_MISC, "IPv6 short jumbolen");
REASON_SET(reason, PFRES_IPOPTIONS);
return (PF_DROP);
}
break;
+ case IP6OPT_ROUTER_ALERT:
+ pd->badopts |= PF_OPT_ROUTER_ALERT;
+ break;
default:
+ pd->badopts |= PF_OPT_OTHER;
break;
}
off += sizeof(opt) + opt.ip6o_len;
@@ -9897,6 +9990,7 @@ pf_walk_header6(struct pf_pdesc *pd, struct ip6_hdr *h, u_short *reason)
{
struct ip6_frag frag;
struct ip6_ext ext;
+ struct icmp6_hdr icmp6;
struct ip6_rthdr rthdr;
uint32_t end;
int hdr_cnt, fraghdr_cnt = 0, rthdr_cnt = 0;
@@ -9908,27 +10002,40 @@ pf_walk_header6(struct pf_pdesc *pd, struct ip6_hdr *h, u_short *reason)
for (hdr_cnt = 0; hdr_cnt < PF_HDR_LIMIT; hdr_cnt++) {
switch (pd->proto) {
case IPPROTO_ROUTING:
- case IPPROTO_HOPOPTS:
case IPPROTO_DSTOPTS:
- pd->badopts++;
+ pd->badopts |= PF_OPT_OTHER;
+ break;
+ case IPPROTO_HOPOPTS:
+ if (!pf_pull_hdr(pd->m, pd->off, &ext, sizeof(ext),
+ NULL, reason, AF_INET6)) {
+ DPFPRINTF(PF_DEBUG_MISC, "IPv6 short exthdr");
+ return (PF_DROP);
+ }
+ if (pf_walk_option6(pd, h, pd->off + sizeof(ext),
+ pd->off + (ext.ip6e_len + 1) * 8,
+ reason) != PF_PASS)
+ return (PF_DROP);
+ /* option header which contains only padding is fishy */
+ if (pd->badopts == 0)
+ pd->badopts |= PF_OPT_OTHER;
break;
}
switch (pd->proto) {
case IPPROTO_FRAGMENT:
if (fraghdr_cnt++) {
- DPFPRINTF(PF_DEBUG_MISC, ("IPv6 multiple fragment"));
+ DPFPRINTF(PF_DEBUG_MISC, "IPv6 multiple fragment");
REASON_SET(reason, PFRES_FRAG);
return (PF_DROP);
}
/* jumbo payload packets cannot be fragmented */
if (pd->jumbolen != 0) {
- DPFPRINTF(PF_DEBUG_MISC, ("IPv6 fragmented jumbo"));
+ DPFPRINTF(PF_DEBUG_MISC, "IPv6 fragmented jumbo");
REASON_SET(reason, PFRES_FRAG);
return (PF_DROP);
}
if (!pf_pull_hdr(pd->m, pd->off, &frag, sizeof(frag),
NULL, reason, AF_INET6)) {
- DPFPRINTF(PF_DEBUG_MISC, ("IPv6 short fragment"));
+ DPFPRINTF(PF_DEBUG_MISC, "IPv6 short fragment");
return (PF_DROP);
}
/* stop walking over non initial fragments */
@@ -9944,7 +10051,7 @@ pf_walk_header6(struct pf_pdesc *pd, struct ip6_hdr *h, u_short *reason)
break;
case IPPROTO_ROUTING:
if (rthdr_cnt++) {
- DPFPRINTF(PF_DEBUG_MISC, ("IPv6 multiple rthdr"));
+ DPFPRINTF(PF_DEBUG_MISC, "IPv6 multiple rthdr");
REASON_SET(reason, PFRES_IPOPTIONS);
return (PF_DROP);
}
@@ -9956,11 +10063,11 @@ pf_walk_header6(struct pf_pdesc *pd, struct ip6_hdr *h, u_short *reason)
}
if (!pf_pull_hdr(pd->m, pd->off, &rthdr, sizeof(rthdr),
NULL, reason, AF_INET6)) {
- DPFPRINTF(PF_DEBUG_MISC, ("IPv6 short rthdr"));
+ DPFPRINTF(PF_DEBUG_MISC, "IPv6 short rthdr");
return (PF_DROP);
}
if (rthdr.ip6r_type == IPV6_RTHDR_TYPE_0) {
- DPFPRINTF(PF_DEBUG_MISC, ("IPv6 rthdr0"));
+ DPFPRINTF(PF_DEBUG_MISC, "IPv6 rthdr0");
REASON_SET(reason, PFRES_IPOPTIONS);
return (PF_DROP);
}
@@ -9968,7 +10075,7 @@ pf_walk_header6(struct pf_pdesc *pd, struct ip6_hdr *h, u_short *reason)
case IPPROTO_HOPOPTS:
/* RFC2460 4.1: Hop-by-Hop only after IPv6 header */
if (pd->proto == IPPROTO_HOPOPTS && hdr_cnt > 0) {
- DPFPRINTF(PF_DEBUG_MISC, ("IPv6 hopopts not first"));
+ DPFPRINTF(PF_DEBUG_MISC, "IPv6 hopopts not first");
REASON_SET(reason, PFRES_IPOPTIONS);
return (PF_DROP);
}
@@ -9977,7 +10084,7 @@ pf_walk_header6(struct pf_pdesc *pd, struct ip6_hdr *h, u_short *reason)
case IPPROTO_DSTOPTS:
if (!pf_pull_hdr(pd->m, pd->off, &ext, sizeof(ext),
NULL, reason, AF_INET6)) {
- DPFPRINTF(PF_DEBUG_MISC, ("IPv6 short exthdr"));
+ DPFPRINTF(PF_DEBUG_MISC, "IPv6 short exthdr");
return (PF_DROP);
}
/* fragments may be short */
@@ -9989,18 +10096,11 @@ pf_walk_header6(struct pf_pdesc *pd, struct ip6_hdr *h, u_short *reason)
/* reassembly needs the ext header before the frag */
if (pd->fragoff == 0)
pd->extoff = pd->off;
- if (pd->proto == IPPROTO_HOPOPTS && pd->fragoff == 0) {
- if (pf_walk_option6(pd, h,
- pd->off + sizeof(ext),
- pd->off + (ext.ip6e_len + 1) * 8, reason)
- != PF_PASS)
- return (PF_DROP);
- if (ntohs(h->ip6_plen) == 0 && pd->jumbolen != 0) {
- DPFPRINTF(PF_DEBUG_MISC,
- ("IPv6 missing jumbo"));
- REASON_SET(reason, PFRES_IPOPTIONS);
- return (PF_DROP);
- }
+ if (pd->proto == IPPROTO_HOPOPTS && pd->fragoff == 0 &&
+ ntohs(h->ip6_plen) == 0 && pd->jumbolen != 0) {
+ DPFPRINTF(PF_DEBUG_MISC, "IPv6 missing jumbo");
+ REASON_SET(reason, PFRES_IPOPTIONS);
+ return (PF_DROP);
}
if (pd->proto == IPPROTO_AH)
pd->off += (ext.ip6e_len + 2) * 4;
@@ -10008,10 +10108,45 @@ pf_walk_header6(struct pf_pdesc *pd, struct ip6_hdr *h, u_short *reason)
pd->off += (ext.ip6e_len + 1) * 8;
pd->proto = ext.ip6e_nxt;
break;
+ case IPPROTO_ICMPV6:
+ /* fragments may be short, ignore inner header then */
+ if (pd->fragoff != 0 && end < pd->off + sizeof(icmp6)) {
+ pd->off = pd->fragoff;
+ pd->proto = IPPROTO_FRAGMENT;
+ return (PF_PASS);
+ }
+ if (!pf_pull_hdr(pd->m, pd->off, &icmp6, sizeof(icmp6),
+ NULL, reason, AF_INET6)) {
+ DPFPRINTF(PF_DEBUG_MISC,
+ "IPv6 short icmp6hdr");
+ return (PF_DROP);
+ }
+ /* ICMP multicast packets have router alert options */
+ switch (icmp6.icmp6_type) {
+ case MLD_LISTENER_QUERY:
+ case MLD_LISTENER_REPORT:
+ case MLD_LISTENER_DONE:
+ case MLDV2_LISTENER_REPORT:
+ /*
+ * According to RFC 2710 all MLD messages are
+ * sent with hop-limit (ttl) set to 1, and link
+ * local source address. If either one is
+ * missing then MLD message is invalid and
+ * should be discarded.
+ */
+ if ((h->ip6_hlim != 1) ||
+ !IN6_IS_ADDR_LINKLOCAL(&h->ip6_src)) {
+ DPFPRINTF(PF_DEBUG_MISC, "Invalid MLD");
+ REASON_SET(reason, PFRES_IPOPTIONS);
+ return (PF_DROP);
+ }
+ pd->badopts &= ~PF_OPT_ROUTER_ALERT;
+ break;
+ }
+ return (PF_PASS);
case IPPROTO_TCP:
case IPPROTO_UDP:
case IPPROTO_SCTP:
- case IPPROTO_ICMPV6:
/* fragments may be short, ignore inner header then */
if (pd->fragoff != 0 && end < pd->off +
(pd->proto == IPPROTO_TCP ? sizeof(struct tcphdr) :
@@ -10026,7 +10161,7 @@ pf_walk_header6(struct pf_pdesc *pd, struct ip6_hdr *h, u_short *reason)
return (PF_PASS);
}
}
- DPFPRINTF(PF_DEBUG_MISC, ("IPv6 nested extension header limit"));
+ DPFPRINTF(PF_DEBUG_MISC, "IPv6 nested extension header limit");
REASON_SET(reason, PFRES_IPOPTIONS);
return (PF_DROP);
}
@@ -10052,6 +10187,8 @@ pf_setup_pdesc(sa_family_t af, int dir, struct pf_pdesc *pd, struct mbuf **m0,
pd->didx = (dir == PF_IN) ? 1 : 0;
pd->af = pd->naf = af;
+ PF_RULES_ASSERT();
+
TAILQ_INIT(&pd->sctp_multihome_jobs);
if (default_actions != NULL)
memcpy(&pd->act, default_actions, sizeof(pd->act));
@@ -10069,8 +10206,15 @@ pf_setup_pdesc(sa_family_t af, int dir, struct pf_pdesc *pd, struct mbuf **m0,
if (__predict_false((*m0)->m_len < sizeof(struct ip)) &&
(pd->m = *m0 = m_pullup(*m0, sizeof(struct ip))) == NULL) {
DPFPRINTF(PF_DEBUG_URGENT,
- ("%s: m_len < sizeof(struct ip), pullup failed\n",
- __func__));
+ "%s: m_len < sizeof(struct ip), pullup failed",
+ __func__);
+ *action = PF_DROP;
+ REASON_SET(reason, PFRES_SHORT);
+ return (-1);
+ }
+
+ h = mtod(pd->m, struct ip *);
+ if (pd->m->m_pkthdr.len < ntohs(h->ip_len)) {
*action = PF_DROP;
REASON_SET(reason, PFRES_SHORT);
return (-1);
@@ -10083,13 +10227,7 @@ pf_setup_pdesc(sa_family_t af, int dir, struct pf_pdesc *pd, struct mbuf **m0,
return (-1);
}
*m0 = pd->m;
-
h = mtod(pd->m, struct ip *);
- if (pd->m->m_pkthdr.len < ntohs(h->ip_len)) {
- *action = PF_DROP;
- REASON_SET(reason, PFRES_SHORT);
- return (-1);
- }
if (pf_walk_header(pd, h, reason) != PF_PASS) {
*action = PF_DROP;
@@ -10119,14 +10257,29 @@ pf_setup_pdesc(sa_family_t af, int dir, struct pf_pdesc *pd, struct mbuf **m0,
if (__predict_false((*m0)->m_len < sizeof(struct ip6_hdr)) &&
(pd->m = *m0 = m_pullup(*m0, sizeof(struct ip6_hdr))) == NULL) {
DPFPRINTF(PF_DEBUG_URGENT,
- ("%s: m_len < sizeof(struct ip6_hdr)"
- ", pullup failed\n", __func__));
+ "%s: m_len < sizeof(struct ip6_hdr)"
+ ", pullup failed", __func__);
*action = PF_DROP;
REASON_SET(reason, PFRES_SHORT);
return (-1);
}
h = mtod(pd->m, struct ip6_hdr *);
+ if (pd->m->m_pkthdr.len <
+ sizeof(struct ip6_hdr) + ntohs(h->ip6_plen)) {
+ *action = PF_DROP;
+ REASON_SET(reason, PFRES_SHORT);
+ return (-1);
+ }
+
+ /*
+ * we do not support jumbogram. if we keep going, zero ip6_plen
+ * will do something bad, so drop the packet for now.
+ */
+ if (htons(h->ip6_plen) == 0) {
+ *action = PF_DROP;
+ return (-1);
+ }
if (pf_walk_header6(pd, h, reason) != PF_PASS) {
*action = PF_DROP;
@@ -10147,15 +10300,6 @@ pf_setup_pdesc(sa_family_t af, int dir, struct pf_pdesc *pd, struct mbuf **m0,
pd->virtual_proto = (pd->fragoff != 0) ?
PF_VPROTO_FRAGMENT : pd->proto;
- /*
- * we do not support jumbogram. if we keep going, zero ip6_plen
- * will do something bad, so drop the packet for now.
- */
- if (htons(h->ip6_plen) == 0) {
- *action = PF_DROP;
- return (-1);
- }
-
/* We do IP header normalization and packet reassembly here */
if (pf_normalize_ip6(pd->fragoff, reason, pd) !=
PF_PASS) {
@@ -10465,35 +10609,30 @@ pf_test(sa_family_t af, int dir, int pflags, struct ifnet *ifp, struct mbuf **m0
PF_RULES_RLOCK_TRACKER;
KASSERT(dir == PF_IN || dir == PF_OUT, ("%s: bad direction %d\n", __func__, dir));
M_ASSERTPKTHDR(*m0);
+ NET_EPOCH_ASSERT();
if (!V_pf_status.running)
return (PF_PASS);
- PF_RULES_RLOCK();
-
kif = (struct pfi_kkif *)ifp->if_pf_kif;
if (__predict_false(kif == NULL)) {
DPFPRINTF(PF_DEBUG_URGENT,
- ("%s: kif == NULL, if_xname %s\n",
- __func__, ifp->if_xname));
- PF_RULES_RUNLOCK();
+ "%s: kif == NULL, if_xname %s",
+ __func__, ifp->if_xname);
return (PF_DROP);
}
if (kif->pfik_flags & PFI_IFLAG_SKIP) {
- PF_RULES_RUNLOCK();
return (PF_PASS);
}
if ((*m0)->m_flags & M_SKIP_FIREWALL) {
- PF_RULES_RUNLOCK();
return (PF_PASS);
}
if (__predict_false(! M_WRITABLE(*m0))) {
*m0 = m_unshare(*m0, M_NOWAIT);
if (*m0 == NULL) {
- PF_RULES_RUNLOCK();
return (PF_DROP);
}
}
@@ -10506,12 +10645,10 @@ pf_test(sa_family_t af, int dir, int pflags, struct ifnet *ifp, struct mbuf **m0
ifp = ifnet_byindexgen(pd.pf_mtag->if_index,
pd.pf_mtag->if_idxgen);
if (ifp == NULL || ifp->if_flags & IFF_DYING) {
- PF_RULES_RUNLOCK();
m_freem(*m0);
*m0 = NULL;
return (PF_PASS);
}
- PF_RULES_RUNLOCK();
(ifp->if_output)(ifp, *m0, sintosa(&pd.pf_mtag->dst), NULL);
*m0 = NULL;
return (PF_PASS);
@@ -10526,11 +10663,12 @@ pf_test(sa_family_t af, int dir, int pflags, struct ifnet *ifp, struct mbuf **m0
/* But only once. We may see the packet multiple times (e.g.
* PFIL_IN/PFIL_OUT). */
pf_dummynet_flag_remove(pd.m, pd.pf_mtag);
- PF_RULES_RUNLOCK();
return (PF_PASS);
}
+ PF_RULES_RLOCK();
+
if (pf_setup_pdesc(af, dir, &pd, m0, &action, &reason,
kif, default_actions) == -1) {
if (action != PF_PASS)
@@ -10685,14 +10823,14 @@ pf_test(sa_family_t af, int dir, int pflags, struct ifnet *ifp, struct mbuf **m0
action = PF_DROP;
REASON_SET(&reason, PFRES_NORM);
DPFPRINTF(PF_DEBUG_MISC,
- ("dropping IPv6 packet with ICMPv4 payload"));
+ "dropping IPv6 packet with ICMPv4 payload");
break;
}
if (pd.virtual_proto == IPPROTO_ICMPV6 && af != AF_INET6) {
action = PF_DROP;
REASON_SET(&reason, PFRES_NORM);
DPFPRINTF(PF_DEBUG_MISC,
- ("pf: dropping IPv4 packet with ICMPv6 payload\n"));
+ "pf: dropping IPv4 packet with ICMPv6 payload");
break;
}
action = pf_test_state_icmp(&s, &pd, &reason);
@@ -10718,12 +10856,12 @@ done:
if (s)
memcpy(&pd.act, &s->act, sizeof(s->act));
- if (action == PF_PASS && pd.badopts && !pd.act.allow_opts) {
+ if (action == PF_PASS && pd.badopts != 0 && !pd.act.allow_opts) {
action = PF_DROP;
REASON_SET(&reason, PFRES_IPOPTIONS);
pd.act.log = PF_LOG_FORCE;
DPFPRINTF(PF_DEBUG_MISC,
- ("pf: dropping packet with dangerous headers\n"));
+ "pf: dropping packet with dangerous headers");
}
if (pd.act.max_pkt_size && pd.act.max_pkt_size &&
@@ -10732,7 +10870,7 @@ done:
REASON_SET(&reason, PFRES_NORM);
pd.act.log = PF_LOG_FORCE;
DPFPRINTF(PF_DEBUG_MISC,
- ("pf: dropping overly long packet\n"));
+ "pf: dropping overly long packet");
}
if (s) {
@@ -10764,7 +10902,7 @@ done:
REASON_SET(&reason, PFRES_MEMORY);
pd.act.log = PF_LOG_FORCE;
DPFPRINTF(PF_DEBUG_MISC,
- ("pf: failed to allocate 802.1q mtag\n"));
+ "pf: failed to allocate 802.1q mtag");
}
}
@@ -10821,7 +10959,7 @@ done:
REASON_SET(&reason, PFRES_MEMORY);
pd.act.log = PF_LOG_FORCE;
DPFPRINTF(PF_DEBUG_MISC,
- ("pf: failed to allocate tag\n"));
+ "pf: failed to allocate tag");
} else {
pd.pf_mtag->flags |=
PF_MTAG_FLAG_FASTFWD_OURS_PRESENT;
@@ -10838,7 +10976,7 @@ done:
REASON_SET(&reason, PFRES_MEMORY);
pd.act.log = PF_LOG_FORCE;
DPFPRINTF(PF_DEBUG_MISC,
- ("pf: failed to allocate divert tag\n"));
+ "pf: failed to allocate divert tag");
}
}
/* XXX: Anybody working on it?! */
diff --git a/sys/netpfil/pf/pf.h b/sys/netpfil/pf/pf.h
index 2009d2907985..cfff58064922 100644
--- a/sys/netpfil/pf/pf.h
+++ b/sys/netpfil/pf/pf.h
@@ -140,7 +140,7 @@ enum { PF_ADDR_ADDRMASK, PF_ADDR_NOROUTE, PF_ADDR_DYNIFTL,
#define PF_LOG 0x01
#define PF_LOG_ALL 0x02
-#define PF_LOG_SOCKET_LOOKUP 0x04
+#define PF_LOG_USER 0x04
#define PF_LOG_FORCE 0x08
#define PF_LOG_MATCHES 0x10
@@ -490,6 +490,7 @@ struct pf_osfp_ioctl {
#define PF_ANCHOR_NAME_SIZE 64
#define PF_ANCHOR_MAXPATH (MAXPATHLEN - PF_ANCHOR_NAME_SIZE - 1)
+#define PF_OPTIMIZER_TABLE_PFX "__automatic_"
struct pf_rule {
struct pf_rule_addr src;
diff --git a/sys/netpfil/pf/pf_ioctl.c b/sys/netpfil/pf/pf_ioctl.c
index c96741023db9..9abc07c36788 100644
--- a/sys/netpfil/pf/pf_ioctl.c
+++ b/sys/netpfil/pf/pf_ioctl.c
@@ -217,8 +217,6 @@ static u_int16_t tagname2tag(struct pf_tagset *, const char *);
static u_int16_t pf_tagname2tag(const char *);
static void tag_unref(struct pf_tagset *, u_int16_t);
-#define DPFPRINTF(n, x) if (V_pf_status.debug >= (n)) printf x
-
struct cdev *pf_dev;
/*
@@ -1274,7 +1272,9 @@ pf_hash_rule_addr(MD5_CTX *ctx, struct pf_rule_addr *pfr)
PF_MD5_UPD(pfr, addr.iflags);
break;
case PF_ADDR_TABLE:
- PF_MD5_UPD(pfr, addr.v.tblname);
+ if (strncmp(pfr->addr.v.tblname, PF_OPTIMIZER_TABLE_PFX,
+ strlen(PF_OPTIMIZER_TABLE_PFX)))
+ PF_MD5_UPD(pfr, addr.v.tblname);
break;
case PF_ADDR_ADDRMASK:
/* XXX ignore af? */
@@ -1357,7 +1357,7 @@ static int
pf_commit_rules(u_int32_t ticket, int rs_num, char *anchor)
{
struct pf_kruleset *rs;
- struct pf_krule *rule, **old_array, *old_rule;
+ struct pf_krule *rule, *old_rule;
struct pf_krulequeue *old_rules;
struct pf_krule_global *old_tree;
int error;
@@ -1382,13 +1382,10 @@ pf_commit_rules(u_int32_t ticket, int rs_num, char *anchor)
/* Swap rules, keep the old. */
old_rules = rs->rules[rs_num].active.ptr;
old_rcount = rs->rules[rs_num].active.rcount;
- old_array = rs->rules[rs_num].active.ptr_array;
old_tree = rs->rules[rs_num].active.tree;
rs->rules[rs_num].active.ptr =
rs->rules[rs_num].inactive.ptr;
- rs->rules[rs_num].active.ptr_array =
- rs->rules[rs_num].inactive.ptr_array;
rs->rules[rs_num].active.tree =
rs->rules[rs_num].inactive.tree;
rs->rules[rs_num].active.rcount =
@@ -1418,7 +1415,6 @@ pf_commit_rules(u_int32_t ticket, int rs_num, char *anchor)
}
rs->rules[rs_num].inactive.ptr = old_rules;
- rs->rules[rs_num].inactive.ptr_array = old_array;
rs->rules[rs_num].inactive.tree = NULL; /* important for pf_ioctl_addrule */
rs->rules[rs_num].inactive.rcount = old_rcount;
@@ -1431,9 +1427,6 @@ pf_commit_rules(u_int32_t ticket, int rs_num, char *anchor)
while ((rule = TAILQ_FIRST(old_rules)) != NULL)
pf_unlink_rule_locked(old_rules, rule);
PF_UNLNKDRULES_UNLOCK();
- if (rs->rules[rs_num].inactive.ptr_array)
- free(rs->rules[rs_num].inactive.ptr_array, M_TEMP);
- rs->rules[rs_num].inactive.ptr_array = NULL;
rs->rules[rs_num].inactive.rcount = 0;
rs->rules[rs_num].inactive.open = 0;
pf_remove_if_empty_kruleset(rs);
@@ -1456,24 +1449,11 @@ pf_setup_pfsync_matching(struct pf_kruleset *rs)
if (rs_cnt == PF_RULESET_SCRUB)
continue;
- if (rs->rules[rs_cnt].inactive.ptr_array)
- free(rs->rules[rs_cnt].inactive.ptr_array, M_TEMP);
- rs->rules[rs_cnt].inactive.ptr_array = NULL;
-
if (rs->rules[rs_cnt].inactive.rcount) {
- rs->rules[rs_cnt].inactive.ptr_array =
- mallocarray(rs->rules[rs_cnt].inactive.rcount,
- sizeof(struct pf_rule **),
- M_TEMP, M_NOWAIT);
-
- if (!rs->rules[rs_cnt].inactive.ptr_array)
- return (ENOMEM);
- }
-
- TAILQ_FOREACH(rule, rs->rules[rs_cnt].inactive.ptr,
- entries) {
- pf_hash_rule_rolling(&ctx, rule);
- (rs->rules[rs_cnt].inactive.ptr_array)[rule->nr] = rule;
+ TAILQ_FOREACH(rule, rs->rules[rs_cnt].inactive.ptr,
+ entries) {
+ pf_hash_rule_rolling(&ctx, rule);
+ }
}
}
@@ -2059,6 +2039,47 @@ pf_ioctl_getrules(struct pfioc_rule *pr)
return (0);
}
+static int
+pf_rule_checkaf(struct pf_krule *r)
+{
+ switch (r->af) {
+ case 0:
+ if (r->rule_flag & PFRULE_AFTO)
+ return (EPFNOSUPPORT);
+ break;
+ case AF_INET:
+ if ((r->rule_flag & PFRULE_AFTO) && r->naf != AF_INET6)
+ return (EPFNOSUPPORT);
+ break;
+#ifdef INET6
+ case AF_INET6:
+ if ((r->rule_flag & PFRULE_AFTO) && r->naf != AF_INET)
+ return (EPFNOSUPPORT);
+ break;
+#endif /* INET6 */
+ default:
+ return (EPFNOSUPPORT);
+ }
+
+ if ((r->rule_flag & PFRULE_AFTO) == 0 && r->naf != 0)
+ return (EPFNOSUPPORT);
+
+ return (0);
+}
+
+static int
+pf_validate_range(uint8_t op, uint16_t port[2])
+{
+ uint16_t a = ntohs(port[0]);
+ uint16_t b = ntohs(port[1]);
+
+ if ((op == PF_OP_RRG && a > b) || /* 34:12, i.e. none */
+ (op == PF_OP_IRG && a >= b) || /* 34><12, i.e. none */
+ (op == PF_OP_XRG && a > b)) /* 34<>22, i.e. all */
+ return 1;
+ return 0;
+}
+
int
pf_ioctl_addrule(struct pf_krule *rule, uint32_t ticket,
uint32_t pool_ticket, const char *anchor, const char *anchor_call,
@@ -2071,12 +2092,18 @@ pf_ioctl_addrule(struct pf_krule *rule, uint32_t ticket,
int rs_num;
int error = 0;
- if ((rule->return_icmp >> 8) > ICMP_MAXTYPE) {
- error = EINVAL;
- goto errout_unlocked;
- }
+#define ERROUT(x) ERROUT_FUNCTION(errout, x)
+#define ERROUT_UNLOCKED(x) ERROUT_FUNCTION(errout_unlocked, x)
-#define ERROUT(x) ERROUT_FUNCTION(errout, x)
+ if ((rule->return_icmp >> 8) > ICMP_MAXTYPE)
+ ERROUT_UNLOCKED(EINVAL);
+
+ if ((error = pf_rule_checkaf(rule)))
+ ERROUT_UNLOCKED(error);
+ if (pf_validate_range(rule->src.port_op, rule->src.port))
+ ERROUT_UNLOCKED(EINVAL);
+ if (pf_validate_range(rule->dst.port_op, rule->dst.port))
+ ERROUT_UNLOCKED(EINVAL);
if (rule->ifname[0])
kif = pf_kkif_create(M_WAITOK);
@@ -2113,14 +2140,14 @@ pf_ioctl_addrule(struct pf_krule *rule, uint32_t ticket,
ERROUT(EINVAL);
if (ticket != ruleset->rules[rs_num].inactive.ticket) {
DPFPRINTF(PF_DEBUG_MISC,
- ("ticket: %d != [%d]%d\n", ticket, rs_num,
- ruleset->rules[rs_num].inactive.ticket));
+ "ticket: %d != [%d]%d", ticket, rs_num,
+ ruleset->rules[rs_num].inactive.ticket);
ERROUT(EBUSY);
}
if (pool_ticket != V_ticket_pabuf) {
DPFPRINTF(PF_DEBUG_MISC,
- ("pool_ticket: %d != %d\n", pool_ticket,
- V_ticket_pabuf));
+ "pool_ticket: %d != %d", pool_ticket,
+ V_ticket_pabuf);
ERROUT(EBUSY);
}
/*
@@ -2266,6 +2293,7 @@ pf_ioctl_addrule(struct pf_krule *rule, uint32_t ticket,
return (0);
#undef ERROUT
+#undef ERROUT_UNLOCKED
errout:
PF_RULES_WUNLOCK();
PF_CONFIG_UNLOCK();
@@ -2439,7 +2467,7 @@ pf_start(void)
V_pf_status.since = time_uptime;
new_unrhdr64(&V_pf_stateid, time_second);
- DPFPRINTF(PF_DEBUG_MISC, ("pf: started\n"));
+ DPFPRINTF(PF_DEBUG_MISC, "pf: started");
}
sx_xunlock(&V_pf_ioctl_lock);
@@ -2459,7 +2487,7 @@ pf_stop(void)
dehook_pf();
dehook_pf_eth();
V_pf_status.since = time_uptime;
- DPFPRINTF(PF_DEBUG_MISC, ("pf: stopped\n"));
+ DPFPRINTF(PF_DEBUG_MISC, "pf: stopped");
}
sx_xunlock(&V_pf_ioctl_lock);
@@ -3234,9 +3262,9 @@ DIOCGETETHRULE_error:
if (nvlist_get_number(nvl, "ticket") !=
ruleset->inactive.ticket) {
DPFPRINTF(PF_DEBUG_MISC,
- ("ticket: %d != %d\n",
+ "ticket: %d != %d",
(u_int32_t)nvlist_get_number(nvl, "ticket"),
- ruleset->inactive.ticket));
+ ruleset->inactive.ticket);
ERROUT(EBUSY);
}
@@ -3567,7 +3595,7 @@ DIOCADDRULENV_error:
error = pf_rule_to_krule(&pr->rule, rule);
if (error != 0) {
pf_krule_free(rule);
- break;
+ goto fail;
}
pr->anchor[sizeof(pr->anchor) - 1] = '\0';
@@ -3726,11 +3754,11 @@ DIOCGETRULENV_error:
if (pcr->action < PF_CHANGE_ADD_HEAD ||
pcr->action > PF_CHANGE_GET_TICKET) {
error = EINVAL;
- break;
+ goto fail;
}
if (pcr->rule.return_icmp >> 8 > ICMP_MAXTYPE) {
error = EINVAL;
- break;
+ goto fail;
}
if (pcr->action != PF_CHANGE_REMOVE) {
@@ -3738,9 +3766,13 @@ DIOCGETRULENV_error:
error = pf_rule_to_krule(&pcr->rule, newrule);
if (error != 0) {
pf_krule_free(newrule);
- break;
+ goto fail;
}
+ if ((error = pf_rule_checkaf(newrule))) {
+ pf_krule_free(newrule);
+ goto fail;
+ }
if (newrule->ifname[0])
kif = pf_kkif_create(M_WAITOK);
pf_counter_u64_init(&newrule->evaluations, M_WAITOK);
@@ -3888,7 +3920,7 @@ DIOCGETRULENV_error:
pf_free_rule(newrule);
PF_RULES_WUNLOCK();
PF_CONFIG_UNLOCK();
- break;
+ goto fail;
}
newrule->nat.cur = TAILQ_FIRST(&newrule->nat.list);
@@ -3915,7 +3947,7 @@ DIOCGETRULENV_error:
PF_RULES_WUNLOCK();
PF_CONFIG_UNLOCK();
error = EINVAL;
- break;
+ goto fail;
}
}
@@ -3933,7 +3965,7 @@ DIOCGETRULENV_error:
PF_RULES_WUNLOCK();
PF_CONFIG_UNLOCK();
error = EEXIST;
- break;
+ goto fail;
}
if (oldrule == NULL)
@@ -3989,7 +4021,7 @@ DIOCCHANGERULE_error:
if (sp->timeout >= PFTM_MAX) {
error = EINVAL;
- break;
+ goto fail;
}
if (V_pfsync_state_import_ptr != NULL) {
PF_RULES_RLOCK();
@@ -4009,7 +4041,7 @@ DIOCCHANGERULE_error:
s = pf_find_state_byid(ps->state.id, ps->state.creatorid);
if (s == NULL) {
error = ENOENT;
- break;
+ goto fail;
}
pfsync_state_export((union pfsync_state_union*)&ps->state,
@@ -4088,7 +4120,7 @@ DIOCGETSTATES_retry:
error = copyout(pstore, out,
sizeof(struct pfsync_state_1301) * count);
if (error)
- break;
+ goto fail;
out = ps->ps_states + nr;
}
DIOCGETSTATES_full:
@@ -4108,7 +4140,7 @@ DIOCGETSTATES_full:
if (ps->ps_req_version > PF_STATE_VERSION) {
error = ENOTSUP;
- break;
+ goto fail;
}
if (ps->ps_len <= 0) {
@@ -4166,7 +4198,7 @@ DIOCGETSTATESV2_retry:
error = copyout(pstore, out,
sizeof(struct pf_state_export) * count);
if (error)
- break;
+ goto fail;
out = ps->ps_states + nr;
}
DIOCGETSTATESV2_full:
@@ -4272,12 +4304,12 @@ DIOCGETSTATESV2_full:
if (psp->ifname[0] == '\0') {
error = EINVAL;
- break;
+ goto fail;
}
error = pf_user_strcpy(ps.ifname, psp->ifname, IFNAMSIZ);
if (error != 0)
- break;
+ goto fail;
ifp = ifunit(ps.ifname);
if (ifp != NULL) {
psp->baudrate32 =
@@ -4306,7 +4338,7 @@ DIOCGETSTATESV2_full:
if (error == 0)
V_pf_altq_running = 1;
PF_RULES_WUNLOCK();
- DPFPRINTF(PF_DEBUG_MISC, ("altq: started\n"));
+ DPFPRINTF(PF_DEBUG_MISC, "altq: started");
break;
}
@@ -4325,7 +4357,7 @@ DIOCGETSTATESV2_full:
if (error == 0)
V_pf_altq_running = 0;
PF_RULES_WUNLOCK();
- DPFPRINTF(PF_DEBUG_MISC, ("altq: stopped\n"));
+ DPFPRINTF(PF_DEBUG_MISC, "altq: stopped");
break;
}
@@ -4338,7 +4370,7 @@ DIOCGETSTATESV2_full:
altq = malloc(sizeof(*altq), M_PFALTQ, M_WAITOK | M_ZERO);
error = pf_import_kaltq(pa, altq, IOCPARM_LEN(cmd));
if (error)
- break;
+ goto fail;
altq->local_flags = 0;
PF_RULES_WLOCK();
@@ -4346,7 +4378,7 @@ DIOCGETSTATESV2_full:
PF_RULES_WUNLOCK();
free(altq, M_PFALTQ);
error = EBUSY;
- break;
+ goto fail;
}
/*
@@ -4358,7 +4390,7 @@ DIOCGETSTATESV2_full:
PF_RULES_WUNLOCK();
error = EBUSY;
free(altq, M_PFALTQ);
- break;
+ goto fail;
}
altq->altq_disc = NULL;
TAILQ_FOREACH(a, V_pf_altq_ifs_inactive, entries) {
@@ -4378,7 +4410,7 @@ DIOCGETSTATESV2_full:
if (error) {
PF_RULES_WUNLOCK();
free(altq, M_PFALTQ);
- break;
+ goto fail;
}
if (altq->qname[0] != 0)
@@ -4416,13 +4448,13 @@ DIOCGETSTATESV2_full:
if (pa->ticket != V_ticket_altqs_active) {
PF_RULES_RUNLOCK();
error = EBUSY;
- break;
+ goto fail;
}
altq = pf_altq_get_nth_active(pa->nr);
if (altq == NULL) {
PF_RULES_RUNLOCK();
error = EBUSY;
- break;
+ goto fail;
}
pf_export_kaltq(altq, pa, IOCPARM_LEN(cmd));
PF_RULES_RUNLOCK();
@@ -4446,20 +4478,20 @@ DIOCGETSTATESV2_full:
if (pq->ticket != V_ticket_altqs_active) {
PF_RULES_RUNLOCK();
error = EBUSY;
- break;
+ goto fail;
}
nbytes = pq->nbytes;
altq = pf_altq_get_nth_active(pq->nr);
if (altq == NULL) {
PF_RULES_RUNLOCK();
error = EBUSY;
- break;
+ goto fail;
}
if ((altq->local_flags & PFALTQ_FLAG_IF_REMOVED) != 0) {
PF_RULES_RUNLOCK();
error = ENXIO;
- break;
+ goto fail;
}
PF_RULES_RUNLOCK();
if (cmd == DIOCGETQSTATSV0)
@@ -4528,30 +4560,30 @@ DIOCGETSTATESV2_full:
if (pca->action < PF_CHANGE_ADD_HEAD ||
pca->action > PF_CHANGE_REMOVE) {
error = EINVAL;
- break;
+ goto fail;
}
if (pca->addr.addr.type != PF_ADDR_ADDRMASK &&
pca->addr.addr.type != PF_ADDR_DYNIFTL &&
pca->addr.addr.type != PF_ADDR_TABLE) {
error = EINVAL;
- break;
+ goto fail;
}
if (pca->addr.addr.p.dyn != NULL) {
error = EINVAL;
- break;
+ goto fail;
}
if (pca->action != PF_CHANGE_REMOVE) {
#ifndef INET
if (pca->af == AF_INET) {
error = EAFNOSUPPORT;
- break;
+ goto fail;
}
#endif /* INET */
#ifndef INET6
if (pca->af == AF_INET6) {
error = EAFNOSUPPORT;
- break;
+ goto fail;
}
#endif /* INET6 */
newpa = malloc(sizeof(*newpa), M_PFRULE, M_WAITOK);
@@ -4674,7 +4706,7 @@ DIOCCHANGEADDR_error:
if (io->pfrio_esize != 0) {
error = ENODEV;
- break;
+ goto fail;
}
PF_RULES_WLOCK();
error = pfr_clr_tables(&io->pfrio_table, &io->pfrio_ndel,
@@ -4690,13 +4722,13 @@ DIOCCHANGEADDR_error:
if (io->pfrio_esize != sizeof(struct pfr_table)) {
error = ENODEV;
- break;
+ goto fail;
}
if (io->pfrio_size < 0 || io->pfrio_size > pf_ioctl_maxcount ||
WOULD_OVERFLOW(io->pfrio_size, sizeof(struct pfr_table))) {
error = ENOMEM;
- break;
+ goto fail;
}
totlen = io->pfrio_size * sizeof(struct pfr_table);
@@ -4705,7 +4737,7 @@ DIOCCHANGEADDR_error:
error = copyin(io->pfrio_buffer, pfrts, totlen);
if (error) {
free(pfrts, M_TEMP);
- break;
+ goto fail;
}
PF_RULES_WLOCK();
error = pfr_add_tables(pfrts, io->pfrio_size,
@@ -4722,13 +4754,13 @@ DIOCCHANGEADDR_error:
if (io->pfrio_esize != sizeof(struct pfr_table)) {
error = ENODEV;
- break;
+ goto fail;
}
if (io->pfrio_size < 0 || io->pfrio_size > pf_ioctl_maxcount ||
WOULD_OVERFLOW(io->pfrio_size, sizeof(struct pfr_table))) {
error = ENOMEM;
- break;
+ goto fail;
}
totlen = io->pfrio_size * sizeof(struct pfr_table);
@@ -4737,7 +4769,7 @@ DIOCCHANGEADDR_error:
error = copyin(io->pfrio_buffer, pfrts, totlen);
if (error) {
free(pfrts, M_TEMP);
- break;
+ goto fail;
}
PF_RULES_WLOCK();
error = pfr_del_tables(pfrts, io->pfrio_size,
@@ -4755,14 +4787,14 @@ DIOCCHANGEADDR_error:
if (io->pfrio_esize != sizeof(struct pfr_table)) {
error = ENODEV;
- break;
+ goto fail;
}
PF_RULES_RLOCK();
n = pfr_table_count(&io->pfrio_table, io->pfrio_flags);
if (n < 0) {
PF_RULES_RUNLOCK();
error = EINVAL;
- break;
+ goto fail;
}
io->pfrio_size = min(io->pfrio_size, n);
@@ -4773,7 +4805,7 @@ DIOCCHANGEADDR_error:
if (pfrts == NULL) {
error = ENOMEM;
PF_RULES_RUNLOCK();
- break;
+ goto fail;
}
error = pfr_get_tables(&io->pfrio_table, pfrts,
&io->pfrio_size, io->pfrio_flags | PFR_FLAG_USERIOCTL);
@@ -4792,7 +4824,7 @@ DIOCCHANGEADDR_error:
if (io->pfrio_esize != sizeof(struct pfr_tstats)) {
error = ENODEV;
- break;
+ goto fail;
}
PF_TABLE_STATS_LOCK();
PF_RULES_RLOCK();
@@ -4801,7 +4833,7 @@ DIOCCHANGEADDR_error:
PF_RULES_RUNLOCK();
PF_TABLE_STATS_UNLOCK();
error = EINVAL;
- break;
+ goto fail;
}
io->pfrio_size = min(io->pfrio_size, n);
@@ -4812,7 +4844,7 @@ DIOCCHANGEADDR_error:
error = ENOMEM;
PF_RULES_RUNLOCK();
PF_TABLE_STATS_UNLOCK();
- break;
+ goto fail;
}
error = pfr_get_tstats(&io->pfrio_table, pfrtstats,
&io->pfrio_size, io->pfrio_flags | PFR_FLAG_USERIOCTL);
@@ -4831,7 +4863,7 @@ DIOCCHANGEADDR_error:
if (io->pfrio_esize != sizeof(struct pfr_table)) {
error = ENODEV;
- break;
+ goto fail;
}
if (io->pfrio_size < 0 || io->pfrio_size > pf_ioctl_maxcount ||
@@ -4840,7 +4872,7 @@ DIOCCHANGEADDR_error:
* size, so we didn't fail on overly large requests.
* Keep doing so. */
io->pfrio_size = pf_ioctl_maxcount;
- break;
+ goto fail;
}
totlen = io->pfrio_size * sizeof(struct pfr_table);
@@ -4849,7 +4881,7 @@ DIOCCHANGEADDR_error:
error = copyin(io->pfrio_buffer, pfrts, totlen);
if (error) {
free(pfrts, M_TEMP);
- break;
+ goto fail;
}
PF_TABLE_STATS_LOCK();
@@ -4870,7 +4902,7 @@ DIOCCHANGEADDR_error:
if (io->pfrio_esize != sizeof(struct pfr_table)) {
error = ENODEV;
- break;
+ goto fail;
}
PF_RULES_RLOCK();
@@ -4878,7 +4910,7 @@ DIOCCHANGEADDR_error:
if (n < 0) {
PF_RULES_RUNLOCK();
error = EINVAL;
- break;
+ goto fail;
}
io->pfrio_size = min(io->pfrio_size, n);
@@ -4890,7 +4922,7 @@ DIOCCHANGEADDR_error:
error = copyin(io->pfrio_buffer, pfrts, totlen);
if (error) {
free(pfrts, M_TEMP);
- break;
+ goto fail;
}
PF_RULES_WLOCK();
error = pfr_set_tflags(pfrts, io->pfrio_size,
@@ -4906,7 +4938,7 @@ DIOCCHANGEADDR_error:
if (io->pfrio_esize != 0) {
error = ENODEV;
- break;
+ goto fail;
}
PF_RULES_WLOCK();
error = pfr_clr_addrs(&io->pfrio_table, &io->pfrio_ndel,
@@ -4922,13 +4954,13 @@ DIOCCHANGEADDR_error:
if (io->pfrio_esize != sizeof(struct pfr_addr)) {
error = ENODEV;
- break;
+ goto fail;
}
if (io->pfrio_size < 0 ||
io->pfrio_size > pf_ioctl_maxcount ||
WOULD_OVERFLOW(io->pfrio_size, sizeof(struct pfr_addr))) {
error = EINVAL;
- break;
+ goto fail;
}
totlen = io->pfrio_size * sizeof(struct pfr_addr);
pfras = mallocarray(io->pfrio_size, sizeof(struct pfr_addr),
@@ -4936,7 +4968,7 @@ DIOCCHANGEADDR_error:
error = copyin(io->pfrio_buffer, pfras, totlen);
if (error) {
free(pfras, M_TEMP);
- break;
+ goto fail;
}
PF_RULES_WLOCK();
error = pfr_add_addrs(&io->pfrio_table, pfras,
@@ -4956,13 +4988,13 @@ DIOCCHANGEADDR_error:
if (io->pfrio_esize != sizeof(struct pfr_addr)) {
error = ENODEV;
- break;
+ goto fail;
}
if (io->pfrio_size < 0 ||
io->pfrio_size > pf_ioctl_maxcount ||
WOULD_OVERFLOW(io->pfrio_size, sizeof(struct pfr_addr))) {
error = EINVAL;
- break;
+ goto fail;
}
totlen = io->pfrio_size * sizeof(struct pfr_addr);
pfras = mallocarray(io->pfrio_size, sizeof(struct pfr_addr),
@@ -4970,7 +5002,7 @@ DIOCCHANGEADDR_error:
error = copyin(io->pfrio_buffer, pfras, totlen);
if (error) {
free(pfras, M_TEMP);
- break;
+ goto fail;
}
PF_RULES_WLOCK();
error = pfr_del_addrs(&io->pfrio_table, pfras,
@@ -4990,17 +5022,17 @@ DIOCCHANGEADDR_error:
if (io->pfrio_esize != sizeof(struct pfr_addr)) {
error = ENODEV;
- break;
+ goto fail;
}
if (io->pfrio_size < 0 || io->pfrio_size2 < 0) {
error = EINVAL;
- break;
+ goto fail;
}
count = max(io->pfrio_size, io->pfrio_size2);
if (count > pf_ioctl_maxcount ||
WOULD_OVERFLOW(count, sizeof(struct pfr_addr))) {
error = EINVAL;
- break;
+ goto fail;
}
totlen = count * sizeof(struct pfr_addr);
pfras = mallocarray(count, sizeof(struct pfr_addr), M_TEMP,
@@ -5008,7 +5040,7 @@ DIOCCHANGEADDR_error:
error = copyin(io->pfrio_buffer, pfras, totlen);
if (error) {
free(pfras, M_TEMP);
- break;
+ goto fail;
}
PF_RULES_WLOCK();
error = pfr_set_addrs(&io->pfrio_table, pfras,
@@ -5029,13 +5061,13 @@ DIOCCHANGEADDR_error:
if (io->pfrio_esize != sizeof(struct pfr_addr)) {
error = ENODEV;
- break;
+ goto fail;
}
if (io->pfrio_size < 0 ||
io->pfrio_size > pf_ioctl_maxcount ||
WOULD_OVERFLOW(io->pfrio_size, sizeof(struct pfr_addr))) {
error = EINVAL;
- break;
+ goto fail;
}
totlen = io->pfrio_size * sizeof(struct pfr_addr);
pfras = mallocarray(io->pfrio_size, sizeof(struct pfr_addr),
@@ -5057,13 +5089,13 @@ DIOCCHANGEADDR_error:
if (io->pfrio_esize != sizeof(struct pfr_astats)) {
error = ENODEV;
- break;
+ goto fail;
}
if (io->pfrio_size < 0 ||
io->pfrio_size > pf_ioctl_maxcount ||
WOULD_OVERFLOW(io->pfrio_size, sizeof(struct pfr_astats))) {
error = EINVAL;
- break;
+ goto fail;
}
totlen = io->pfrio_size * sizeof(struct pfr_astats);
pfrastats = mallocarray(io->pfrio_size,
@@ -5085,13 +5117,13 @@ DIOCCHANGEADDR_error:
if (io->pfrio_esize != sizeof(struct pfr_addr)) {
error = ENODEV;
- break;
+ goto fail;
}
if (io->pfrio_size < 0 ||
io->pfrio_size > pf_ioctl_maxcount ||
WOULD_OVERFLOW(io->pfrio_size, sizeof(struct pfr_addr))) {
error = EINVAL;
- break;
+ goto fail;
}
totlen = io->pfrio_size * sizeof(struct pfr_addr);
pfras = mallocarray(io->pfrio_size, sizeof(struct pfr_addr),
@@ -5099,7 +5131,7 @@ DIOCCHANGEADDR_error:
error = copyin(io->pfrio_buffer, pfras, totlen);
if (error) {
free(pfras, M_TEMP);
- break;
+ goto fail;
}
PF_RULES_WLOCK();
error = pfr_clr_astats(&io->pfrio_table, pfras,
@@ -5119,13 +5151,13 @@ DIOCCHANGEADDR_error:
if (io->pfrio_esize != sizeof(struct pfr_addr)) {
error = ENODEV;
- break;
+ goto fail;
}
if (io->pfrio_size < 0 ||
io->pfrio_size > pf_ioctl_maxcount ||
WOULD_OVERFLOW(io->pfrio_size, sizeof(struct pfr_addr))) {
error = EINVAL;
- break;
+ goto fail;
}
totlen = io->pfrio_size * sizeof(struct pfr_addr);
pfras = mallocarray(io->pfrio_size, sizeof(struct pfr_addr),
@@ -5133,7 +5165,7 @@ DIOCCHANGEADDR_error:
error = copyin(io->pfrio_buffer, pfras, totlen);
if (error) {
free(pfras, M_TEMP);
- break;
+ goto fail;
}
PF_RULES_RLOCK();
error = pfr_tst_addrs(&io->pfrio_table, pfras,
@@ -5153,13 +5185,13 @@ DIOCCHANGEADDR_error:
if (io->pfrio_esize != sizeof(struct pfr_addr)) {
error = ENODEV;
- break;
+ goto fail;
}
if (io->pfrio_size < 0 ||
io->pfrio_size > pf_ioctl_maxcount ||
WOULD_OVERFLOW(io->pfrio_size, sizeof(struct pfr_addr))) {
error = EINVAL;
- break;
+ goto fail;
}
totlen = io->pfrio_size * sizeof(struct pfr_addr);
pfras = mallocarray(io->pfrio_size, sizeof(struct pfr_addr),
@@ -5167,7 +5199,7 @@ DIOCCHANGEADDR_error:
error = copyin(io->pfrio_buffer, pfras, totlen);
if (error) {
free(pfras, M_TEMP);
- break;
+ goto fail;
}
PF_RULES_WLOCK();
error = pfr_ina_define(&io->pfrio_table, pfras,
@@ -5202,13 +5234,13 @@ DIOCCHANGEADDR_error:
if (io->esize != sizeof(*ioe)) {
error = ENODEV;
- break;
+ goto fail;
}
if (io->size < 0 ||
io->size > pf_ioctl_maxcount ||
WOULD_OVERFLOW(io->size, sizeof(struct pfioc_trans_e))) {
error = EINVAL;
- break;
+ goto fail;
}
totlen = sizeof(struct pfioc_trans_e) * io->size;
ioes = mallocarray(io->size, sizeof(struct pfioc_trans_e),
@@ -5216,7 +5248,7 @@ DIOCCHANGEADDR_error:
error = copyin(io->array, ioes, totlen);
if (error) {
free(ioes, M_TEMP);
- break;
+ goto fail;
}
PF_RULES_WLOCK();
for (i = 0, ioe = ioes; i < io->size; i++, ioe++) {
@@ -5283,13 +5315,13 @@ DIOCCHANGEADDR_error:
if (io->esize != sizeof(*ioe)) {
error = ENODEV;
- break;
+ goto fail;
}
if (io->size < 0 ||
io->size > pf_ioctl_maxcount ||
WOULD_OVERFLOW(io->size, sizeof(struct pfioc_trans_e))) {
error = EINVAL;
- break;
+ goto fail;
}
totlen = sizeof(struct pfioc_trans_e) * io->size;
ioes = mallocarray(io->size, sizeof(struct pfioc_trans_e),
@@ -5297,7 +5329,7 @@ DIOCCHANGEADDR_error:
error = copyin(io->array, ioes, totlen);
if (error) {
free(ioes, M_TEMP);
- break;
+ goto fail;
}
PF_RULES_WLOCK();
for (i = 0, ioe = ioes; i < io->size; i++, ioe++) {
@@ -5366,14 +5398,14 @@ DIOCCHANGEADDR_error:
if (io->esize != sizeof(*ioe)) {
error = ENODEV;
- break;
+ goto fail;
}
if (io->size < 0 ||
io->size > pf_ioctl_maxcount ||
WOULD_OVERFLOW(io->size, sizeof(struct pfioc_trans_e))) {
error = EINVAL;
- break;
+ goto fail;
}
totlen = sizeof(struct pfioc_trans_e) * io->size;
@@ -5382,7 +5414,7 @@ DIOCCHANGEADDR_error:
error = copyin(io->array, ioes, totlen);
if (error) {
free(ioes, M_TEMP);
- break;
+ goto fail;
}
PF_RULES_WLOCK();
/* First makes sure everything will succeed. */
@@ -5523,7 +5555,7 @@ DIOCCHANGEADDR_error:
if (psn->psn_len == 0) {
psn->psn_len = sizeof(struct pf_src_node) * nr;
- break;
+ goto fail;
}
nr = 0;
@@ -5548,7 +5580,7 @@ DIOCCHANGEADDR_error:
sizeof(struct pf_src_node) * nr);
if (error) {
free(pstore, M_TEMP);
- break;
+ goto fail;
}
psn->psn_len = sizeof(struct pf_src_node) * nr;
free(pstore, M_TEMP);
@@ -5604,14 +5636,14 @@ DIOCCHANGEADDR_error:
if (io->pfiio_esize != sizeof(struct pfi_kif)) {
error = ENODEV;
- break;
+ goto fail;
}
if (io->pfiio_size < 0 ||
io->pfiio_size > pf_ioctl_maxcount ||
WOULD_OVERFLOW(io->pfiio_size, sizeof(struct pfi_kif))) {
error = EINVAL;
- break;
+ goto fail;
}
io->pfiio_name[sizeof(io->pfiio_name) - 1] = '\0';
@@ -6423,9 +6455,9 @@ shutdown_pf(void)
for (rs_num = 0; rs_num < PF_RULESET_MAX; ++rs_num) {
if ((error = pf_begin_rules(&t[rs_num], rs_num,
anchor->path)) != 0) {
- DPFPRINTF(PF_DEBUG_MISC, ("%s: "
- "anchor.path=%s rs_num=%d\n",
- __func__, anchor->path, rs_num));
+ DPFPRINTF(PF_DEBUG_MISC, "%s: "
+ "anchor.path=%s rs_num=%d",
+ __func__, anchor->path, rs_num);
goto error; /* XXX: rollback? */
}
}
@@ -6447,9 +6479,9 @@ shutdown_pf(void)
eth_anchor->refcnt = 1;
if ((error = pf_begin_eth(&t[0], eth_anchor->path))
!= 0) {
- DPFPRINTF(PF_DEBUG_MISC, ("%s: eth "
- "anchor.path=%s\n", __func__,
- eth_anchor->path));
+ DPFPRINTF(PF_DEBUG_MISC, "%s: eth "
+ "anchor.path=%s", __func__,
+ eth_anchor->path);
goto error;
}
error = pf_commit_eth(t[0], eth_anchor->path);
@@ -6458,27 +6490,27 @@ shutdown_pf(void)
if ((error = pf_begin_rules(&t[0], PF_RULESET_SCRUB, &nn))
!= 0) {
- DPFPRINTF(PF_DEBUG_MISC, ("%s: SCRUB\n", __func__));
+ DPFPRINTF(PF_DEBUG_MISC, "%s: SCRUB", __func__);
break;
}
if ((error = pf_begin_rules(&t[1], PF_RULESET_FILTER, &nn))
!= 0) {
- DPFPRINTF(PF_DEBUG_MISC, ("%s: FILTER\n", __func__));
+ DPFPRINTF(PF_DEBUG_MISC, "%s: FILTER", __func__);
break; /* XXX: rollback? */
}
if ((error = pf_begin_rules(&t[2], PF_RULESET_NAT, &nn))
!= 0) {
- DPFPRINTF(PF_DEBUG_MISC, ("%s: NAT\n", __func__));
+ DPFPRINTF(PF_DEBUG_MISC, "%s: NAT", __func__);
break; /* XXX: rollback? */
}
if ((error = pf_begin_rules(&t[3], PF_RULESET_BINAT, &nn))
!= 0) {
- DPFPRINTF(PF_DEBUG_MISC, ("%s: BINAT\n", __func__));
+ DPFPRINTF(PF_DEBUG_MISC, "%s: BINAT", __func__);
break; /* XXX: rollback? */
}
if ((error = pf_begin_rules(&t[4], PF_RULESET_RDR, &nn))
!= 0) {
- DPFPRINTF(PF_DEBUG_MISC, ("%s: RDR\n", __func__));
+ DPFPRINTF(PF_DEBUG_MISC, "%s: RDR", __func__);
break; /* XXX: rollback? */
}
@@ -6497,7 +6529,7 @@ shutdown_pf(void)
break;
if ((error = pf_begin_eth(&t[0], &nn)) != 0) {
- DPFPRINTF(PF_DEBUG_MISC, ("%s: eth\n", __func__));
+ DPFPRINTF(PF_DEBUG_MISC, "%s: eth", __func__);
break;
}
error = pf_commit_eth(t[0], &nn);
@@ -6505,7 +6537,7 @@ shutdown_pf(void)
#ifdef ALTQ
if ((error = pf_begin_altq(&t[0])) != 0) {
- DPFPRINTF(PF_DEBUG_MISC, ("%s: ALTQ\n", __func__));
+ DPFPRINTF(PF_DEBUG_MISC, "%s: ALTQ", __func__);
break;
}
pf_commit_altq(t[0]);
diff --git a/sys/netpfil/pf/pf_lb.c b/sys/netpfil/pf/pf_lb.c
index 308d76c46e5b..ea0d6facf695 100644
--- a/sys/netpfil/pf/pf_lb.c
+++ b/sys/netpfil/pf/pf_lb.c
@@ -71,8 +71,6 @@
#define V_pf_rdr_srcport_rewrite_tries VNET(pf_rdr_srcport_rewrite_tries)
VNET_DEFINE_STATIC(int, pf_rdr_srcport_rewrite_tries) = 16;
-#define DPFPRINTF(n, x) if (V_pf_status.debug >= (n)) printf x
-
static uint64_t pf_hash(struct pf_addr *, struct pf_addr *,
struct pf_poolhashkey *, sa_family_t);
struct pf_krule *pf_match_translation(int, struct pf_test_ctx *);
@@ -80,7 +78,6 @@ static enum pf_test_status pf_step_into_translation_anchor(int, struct pf_test_c
struct pf_krule *);
static int pf_get_sport(struct pf_pdesc *, struct pf_krule *,
struct pf_addr *, uint16_t *, uint16_t, uint16_t,
- struct pf_ksrc_node **, struct pf_srchash **,
struct pf_kpool *, struct pf_udp_mapping **,
pf_sn_types_t);
static bool pf_islinklocal(const sa_family_t, const struct pf_addr *);
@@ -291,10 +288,8 @@ pf_match_translation(int rs_num, struct pf_test_ctx *ctx)
}
static int
-pf_get_sport(struct pf_pdesc *pd, struct pf_krule *r,
- struct pf_addr *naddr, uint16_t *nport, uint16_t low,
- uint16_t high, struct pf_ksrc_node **sn,
- struct pf_srchash **sh, struct pf_kpool *rpool,
+pf_get_sport(struct pf_pdesc *pd, struct pf_krule *r, struct pf_addr *naddr,
+ uint16_t *nport, uint16_t low, uint16_t high, struct pf_kpool *rpool,
struct pf_udp_mapping **udp_mapping, pf_sn_types_t sn_type)
{
struct pf_state_key_cmp key;
@@ -322,19 +317,24 @@ pf_get_sport(struct pf_pdesc *pd, struct pf_krule *r,
pf_addrcpy(&udp_source.addr, &pd->nsaddr, pd->af);
udp_source.port = pd->nsport;
if (udp_mapping) {
+ struct pf_ksrc_node *sn = NULL;
+ struct pf_srchash *sh = NULL;
*udp_mapping = pf_udp_mapping_find(&udp_source);
if (*udp_mapping) {
pf_addrcpy(naddr,
&(*udp_mapping)->endpoints[1].addr,
pd->af);
*nport = (*udp_mapping)->endpoints[1].port;
- /* Try to find a src_node as per pf_map_addr(). */
- if (*sn == NULL && rpool->opts & PF_POOL_STICKYADDR &&
+ /*
+ * Try to find a src_node as per pf_map_addr().
+ * XXX: Why? This code seems to do nothing.
+ */
+ if (rpool->opts & PF_POOL_STICKYADDR &&
(rpool->opts & PF_POOL_TYPEMASK) != PF_POOL_NONE)
- *sn = pf_find_src_node(&pd->nsaddr, r,
- pd->af, sh, sn_type, false);
- if (*sn != NULL)
- PF_SRC_NODE_UNLOCK(*sn);
+ sn = pf_find_src_node(&pd->nsaddr, r,
+ pd->af, &sh, sn_type, false);
+ if (sn != NULL)
+ PF_SRC_NODE_UNLOCK(sn);
return (0);
} else {
*udp_mapping = pf_udp_mapping_create(pd->af, &pd->nsaddr,
@@ -346,7 +346,7 @@ pf_get_sport(struct pf_pdesc *pd, struct pf_krule *r,
}
if (pf_map_addr_sn(pd->naf, r, &pd->nsaddr, naddr, NULL, &init_addr,
- sn, sh, rpool, sn_type))
+ rpool, sn_type))
goto failed;
if (pd->proto == IPPROTO_ICMP) {
@@ -470,9 +470,8 @@ pf_get_sport(struct pf_pdesc *pd, struct pf_krule *r,
* pick a different source address since we're out
* of free port choices for the current one.
*/
- (*sn) = NULL;
if (pf_map_addr_sn(pd->naf, r, &pd->nsaddr, naddr, NULL,
- &init_addr, sn, sh, rpool, sn_type))
+ &init_addr, rpool, sn_type))
return (1);
break;
case PF_POOL_NONE:
@@ -503,7 +502,6 @@ pf_islinklocal(const sa_family_t af, const struct pf_addr *addr)
static int
pf_get_mape_sport(struct pf_pdesc *pd, struct pf_krule *r,
struct pf_addr *naddr, uint16_t *nport,
- struct pf_ksrc_node **sn, struct pf_srchash **sh,
struct pf_udp_mapping **udp_mapping, struct pf_kpool *rpool)
{
uint16_t psmask, low, highmask;
@@ -523,16 +521,14 @@ pf_get_mape_sport(struct pf_pdesc *pd, struct pf_krule *r,
for (i = cut; i <= ahigh; i++) {
low = (i << ashift) | psmask;
- if (!pf_get_sport(pd, r,
- naddr, nport, low, low | highmask, sn, sh, rpool,
- udp_mapping, PF_SN_NAT))
+ if (!pf_get_sport(pd, r, naddr, nport, low, low | highmask,
+ rpool, udp_mapping, PF_SN_NAT))
return (0);
}
for (i = cut - 1; i > 0; i--) {
low = (i << ashift) | psmask;
- if (!pf_get_sport(pd, r,
- naddr, nport, low, low | highmask, sn, sh, rpool,
- udp_mapping, PF_SN_NAT))
+ if (!pf_get_sport(pd, r, naddr, nport, low, low | highmask,
+ rpool, udp_mapping, PF_SN_NAT))
return (0);
}
return (1);
@@ -545,6 +541,7 @@ pf_map_addr(sa_family_t af, struct pf_krule *r, struct pf_addr *saddr,
{
u_short reason = PFRES_MATCH;
struct pf_addr *raddr = NULL, *rmask = NULL;
+ struct pfr_ktable *kt;
uint64_t hashidx;
int cnt;
@@ -600,29 +597,25 @@ pf_map_addr(sa_family_t af, struct pf_krule *r, struct pf_addr *saddr,
pf_poolmask(naddr, raddr, rmask, saddr, af);
break;
case PF_POOL_RANDOM:
- if (rpool->cur->addr.type == PF_ADDR_TABLE) {
- cnt = rpool->cur->addr.p.tbl->pfrkt_cnt;
- if (cnt == 0)
- rpool->tblidx = 0;
+ if (rpool->cur->addr.type == PF_ADDR_TABLE ||
+ rpool->cur->addr.type == PF_ADDR_DYNIFTL) {
+ if (rpool->cur->addr.type == PF_ADDR_TABLE)
+ kt = rpool->cur->addr.p.tbl;
else
- rpool->tblidx = (int)arc4random_uniform(cnt);
- memset(&rpool->counter, 0, sizeof(rpool->counter));
- if (pfr_pool_get(rpool->cur->addr.p.tbl,
- &rpool->tblidx, &rpool->counter, af, NULL)) {
+ kt = rpool->cur->addr.p.dyn->pfid_kt;
+ kt = pfr_ktable_select_active(kt);
+ if (kt == NULL) {
reason = PFRES_MAPFAILED;
goto done_pool_mtx; /* unsupported */
}
- pf_addrcpy(naddr, &rpool->counter, af);
- } else if (rpool->cur->addr.type == PF_ADDR_DYNIFTL) {
- cnt = rpool->cur->addr.p.dyn->pfid_kt->pfrkt_cnt;
+ cnt = kt->pfrkt_cnt;
if (cnt == 0)
rpool->tblidx = 0;
else
rpool->tblidx = (int)arc4random_uniform(cnt);
memset(&rpool->counter, 0, sizeof(rpool->counter));
- if (pfr_pool_get(rpool->cur->addr.p.dyn->pfid_kt,
- &rpool->tblidx, &rpool->counter, af,
- pf_islinklocal)) {
+ if (pfr_pool_get(kt, &rpool->tblidx, &rpool->counter,
+ af, pf_islinklocal, false)) {
reason = PFRES_MAPFAILED;
goto done_pool_mtx; /* unsupported */
}
@@ -671,29 +664,25 @@ pf_map_addr(sa_family_t af, struct pf_krule *r, struct pf_addr *saddr,
hashidx =
pf_hash(saddr, (struct pf_addr *)&hash, &rpool->key, af);
- if (rpool->cur->addr.type == PF_ADDR_TABLE) {
- cnt = rpool->cur->addr.p.tbl->pfrkt_cnt;
- if (cnt == 0)
- rpool->tblidx = 0;
+ if (rpool->cur->addr.type == PF_ADDR_TABLE ||
+ rpool->cur->addr.type == PF_ADDR_DYNIFTL) {
+ if (rpool->cur->addr.type == PF_ADDR_TABLE)
+ kt = rpool->cur->addr.p.tbl;
else
- rpool->tblidx = (int)(hashidx % cnt);
- memset(&rpool->counter, 0, sizeof(rpool->counter));
- if (pfr_pool_get(rpool->cur->addr.p.tbl,
- &rpool->tblidx, &rpool->counter, af, NULL)) {
+ kt = rpool->cur->addr.p.dyn->pfid_kt;
+ kt = pfr_ktable_select_active(kt);
+ if (kt == NULL) {
reason = PFRES_MAPFAILED;
goto done_pool_mtx; /* unsupported */
}
- pf_addrcpy(naddr, &rpool->counter, af);
- } else if (rpool->cur->addr.type == PF_ADDR_DYNIFTL) {
- cnt = rpool->cur->addr.p.dyn->pfid_kt->pfrkt_cnt;
+ cnt = kt->pfrkt_cnt;
if (cnt == 0)
rpool->tblidx = 0;
else
rpool->tblidx = (int)(hashidx % cnt);
memset(&rpool->counter, 0, sizeof(rpool->counter));
- if (pfr_pool_get(rpool->cur->addr.p.dyn->pfid_kt,
- &rpool->tblidx, &rpool->counter, af,
- pf_islinklocal)) {
+ if (pfr_pool_get(kt, &rpool->tblidx, &rpool->counter,
+ af, pf_islinklocal, false)) {
reason = PFRES_MAPFAILED;
goto done_pool_mtx; /* unsupported */
}
@@ -710,11 +699,12 @@ pf_map_addr(sa_family_t af, struct pf_krule *r, struct pf_addr *saddr,
if (rpool->cur->addr.type == PF_ADDR_TABLE) {
if (!pfr_pool_get(rpool->cur->addr.p.tbl,
- &rpool->tblidx, &rpool->counter, af, NULL))
+ &rpool->tblidx, &rpool->counter, af, NULL, true))
goto get_addr;
} else if (rpool->cur->addr.type == PF_ADDR_DYNIFTL) {
if (!pfr_pool_get(rpool->cur->addr.p.dyn->pfid_kt,
- &rpool->tblidx, &rpool->counter, af, pf_islinklocal))
+ &rpool->tblidx, &rpool->counter, af, pf_islinklocal,
+ true))
goto get_addr;
} else if (pf_match_addr(0, raddr, rmask, &rpool->counter, af))
goto get_addr;
@@ -724,9 +714,10 @@ pf_map_addr(sa_family_t af, struct pf_krule *r, struct pf_addr *saddr,
rpool->cur = TAILQ_FIRST(&rpool->list);
else
rpool->cur = TAILQ_NEXT(rpool->cur, entries);
+ rpool->tblidx = -1;
if (rpool->cur->addr.type == PF_ADDR_TABLE) {
if (pfr_pool_get(rpool->cur->addr.p.tbl,
- &rpool->tblidx, &rpool->counter, af, NULL)) {
+ &rpool->tblidx, &rpool->counter, af, NULL, true)) {
/* table contains no address of type 'af' */
if (rpool->cur != acur)
goto try_next;
@@ -734,9 +725,9 @@ pf_map_addr(sa_family_t af, struct pf_krule *r, struct pf_addr *saddr,
goto done_pool_mtx;
}
} else if (rpool->cur->addr.type == PF_ADDR_DYNIFTL) {
- rpool->tblidx = -1;
if (pfr_pool_get(rpool->cur->addr.p.dyn->pfid_kt,
- &rpool->tblidx, &rpool->counter, af, pf_islinklocal)) {
+ &rpool->tblidx, &rpool->counter, af, pf_islinklocal,
+ true)) {
/* table contains no address of type 'af' */
if (rpool->cur != acur)
goto try_next;
@@ -764,48 +755,41 @@ pf_map_addr(sa_family_t af, struct pf_krule *r, struct pf_addr *saddr,
done_pool_mtx:
mtx_unlock(&rpool->mtx);
- if (reason) {
- counter_u64_add(V_pf_status.counters[reason], 1);
- }
-
return (reason);
}
u_short
pf_map_addr_sn(sa_family_t af, struct pf_krule *r, struct pf_addr *saddr,
struct pf_addr *naddr, struct pfi_kkif **nkif, struct pf_addr *init_addr,
- struct pf_ksrc_node **sn, struct pf_srchash **sh, struct pf_kpool *rpool,
- pf_sn_types_t sn_type)
+ struct pf_kpool *rpool, pf_sn_types_t sn_type)
{
+ struct pf_ksrc_node *sn = NULL;
+ struct pf_srchash *sh = NULL;
u_short reason = 0;
- KASSERT(*sn == NULL, ("*sn not NULL"));
-
/*
* If this is a sticky-address rule, try to find an existing src_node.
- * Request the sh to be unlocked if sn was not found, as we never
- * insert a new sn when parsing the ruleset.
*/
if (rpool->opts & PF_POOL_STICKYADDR &&
(rpool->opts & PF_POOL_TYPEMASK) != PF_POOL_NONE)
- *sn = pf_find_src_node(saddr, r, af, sh, sn_type, false);
+ sn = pf_find_src_node(saddr, r, af, &sh, sn_type, false);
- if (*sn != NULL) {
- PF_SRC_NODE_LOCK_ASSERT(*sn);
+ if (sn != NULL) {
+ PF_SRC_NODE_LOCK_ASSERT(sn);
/* If the supplied address is the same as the current one we've
* been asked before, so tell the caller that there's no other
* address to be had. */
- if (PF_AEQ(naddr, &(*sn)->raddr, af)) {
+ if (PF_AEQ(naddr, &(sn->raddr), af)) {
reason = PFRES_MAPFAILED;
goto done;
}
- pf_addrcpy(naddr, &(*sn)->raddr, af);
+ pf_addrcpy(naddr, &(sn->raddr), af);
if (nkif)
- *nkif = (*sn)->rkif;
+ *nkif = sn->rkif;
if (V_pf_status.debug >= PF_DEBUG_NOISY) {
- printf("pf_map_addr: src tracking maps ");
+ printf("%s: src tracking maps ", __func__);
pf_print_host(saddr, 0, af);
printf(" to ");
pf_print_host(naddr, 0, af);
@@ -820,14 +804,16 @@ pf_map_addr_sn(sa_family_t af, struct pf_krule *r, struct pf_addr *saddr,
* Source node has not been found. Find a new address and store it
* in variables given by the caller.
*/
- if (pf_map_addr(af, r, saddr, naddr, nkif, init_addr, rpool) != 0) {
- /* pf_map_addr() sets reason counters on its own */
+ if ((reason = pf_map_addr(af, r, saddr, naddr, nkif, init_addr,
+ rpool)) != 0) {
+ if (V_pf_status.debug >= PF_DEBUG_MISC)
+ printf("%s: pf_map_addr has failed\n", __func__);
goto done;
}
if (V_pf_status.debug >= PF_DEBUG_NOISY &&
(rpool->opts & PF_POOL_TYPEMASK) != PF_POOL_NONE) {
- printf("pf_map_addr: selected address ");
+ printf("%s: selected address ", __func__);
pf_print_host(naddr, 0, af);
if (nkif)
printf("@%s", (*nkif)->pfik_name);
@@ -835,12 +821,8 @@ pf_map_addr_sn(sa_family_t af, struct pf_krule *r, struct pf_addr *saddr,
}
done:
- if ((*sn) != NULL)
- PF_SRC_NODE_UNLOCK(*sn);
-
- if (reason) {
- counter_u64_add(V_pf_status.counters[reason], 1);
- }
+ if (sn != NULL)
+ PF_SRC_NODE_UNLOCK(sn);
return (reason);
}
@@ -890,8 +872,6 @@ pf_get_transaddr(struct pf_test_ctx *ctx, struct pf_krule *r,
{
struct pf_pdesc *pd = ctx->pd;
struct pf_addr *naddr;
- struct pf_ksrc_node *sn = NULL;
- struct pf_srchash *sh = NULL;
uint16_t *nportp;
uint16_t low, high;
u_short reason;
@@ -919,22 +899,22 @@ pf_get_transaddr(struct pf_test_ctx *ctx, struct pf_krule *r,
high = rpool->proxy_port[1];
}
if (rpool->mape.offset > 0) {
- if (pf_get_mape_sport(pd, r, naddr, nportp, &sn,
- &sh, &ctx->udp_mapping, rpool)) {
+ if (pf_get_mape_sport(pd, r, naddr, nportp,
+ &ctx->udp_mapping, rpool)) {
DPFPRINTF(PF_DEBUG_MISC,
- ("pf: MAP-E port allocation (%u/%u/%u)"
- " failed\n",
+ "pf: MAP-E port allocation (%u/%u/%u)"
+ " failed",
rpool->mape.offset,
rpool->mape.psidlen,
- rpool->mape.psid));
+ rpool->mape.psid);
reason = PFRES_MAPFAILED;
goto notrans;
}
- } else if (pf_get_sport(pd, r, naddr, nportp, low, high, &sn,
- &sh, rpool, &ctx->udp_mapping, PF_SN_NAT)) {
+ } else if (pf_get_sport(pd, r, naddr, nportp, low, high,
+ rpool, &ctx->udp_mapping, PF_SN_NAT)) {
DPFPRINTF(PF_DEBUG_MISC,
- ("pf: NAT proxy port allocation (%u-%u) failed\n",
- rpool->proxy_port[0], rpool->proxy_port[1]));
+ "pf: NAT proxy port allocation (%u-%u) failed",
+ rpool->proxy_port[0], rpool->proxy_port[1]);
reason = PFRES_MAPFAILED;
goto notrans;
}
@@ -1017,7 +997,7 @@ pf_get_transaddr(struct pf_test_ctx *ctx, struct pf_krule *r,
uint16_t cut, low, high, nport;
reason = pf_map_addr_sn(pd->af, r, &pd->nsaddr, naddr, NULL,
- NULL, &sn, &sh, rpool, PF_SN_NAT);
+ NULL, rpool, PF_SN_NAT);
if (reason != 0)
goto notrans;
if ((rpool->opts & PF_POOL_TYPEMASK) == PF_POOL_BITMASK)
@@ -1030,10 +1010,13 @@ pf_get_transaddr(struct pf_test_ctx *ctx, struct pf_krule *r,
if (rpool->proxy_port[1]) {
uint32_t tmp_nport;
+ uint16_t div;
- tmp_nport = ((ntohs(pd->ndport) - ntohs(r->dst.port[0])) %
- (rpool->proxy_port[1] - rpool->proxy_port[0] +
- 1)) + rpool->proxy_port[0];
+ div = r->rdr.proxy_port[1] - r->rdr.proxy_port[0] + 1;
+ div = (div == 0) ? 1 : div;
+
+ tmp_nport = ((ntohs(pd->ndport) - ntohs(r->dst.port[0])) % div) +
+ rpool->proxy_port[0];
/* Wrap around if necessary. */
if (tmp_nport > 65535)
@@ -1100,13 +1083,13 @@ pf_get_transaddr(struct pf_test_ctx *ctx, struct pf_krule *r,
* the state may be reused if the TCP state is terminal.
*/
DPFPRINTF(PF_DEBUG_MISC,
- ("pf: RDR source port allocation failed\n"));
+ "pf: RDR source port allocation failed");
break;
out:
DPFPRINTF(PF_DEBUG_MISC,
- ("pf: RDR source port allocation %u->%u\n",
- ntohs(pd->nsport), ntohs(ctx->nk->port[0])));
+ "pf: RDR source port allocation %u->%u",
+ ntohs(pd->nsport), ntohs(ctx->nk->port[0]));
break;
}
default:
@@ -1134,8 +1117,6 @@ pf_get_transaddr_af(struct pf_krule *r, struct pf_pdesc *pd)
struct pf_addr ndaddr, nsaddr, naddr;
u_int16_t nport = 0;
int prefixlen = 96;
- struct pf_srchash *sh = NULL;
- struct pf_ksrc_node *sns = NULL;
bzero(&nsaddr, sizeof(nsaddr));
bzero(&ndaddr, sizeof(ndaddr));
@@ -1154,12 +1135,11 @@ pf_get_transaddr_af(struct pf_krule *r, struct pf_pdesc *pd)
panic("pf_get_transaddr_af: no nat pool for source address");
/* get source address and port */
- if (pf_get_sport(pd, r, &nsaddr, &nport,
- r->nat.proxy_port[0], r->nat.proxy_port[1], &sns, &sh, &r->nat,
- NULL, PF_SN_NAT)) {
+ if (pf_get_sport(pd, r, &nsaddr, &nport, r->nat.proxy_port[0],
+ r->nat.proxy_port[1], &r->nat, NULL, PF_SN_NAT)) {
DPFPRINTF(PF_DEBUG_MISC,
- ("pf: af-to NAT proxy port allocation (%u-%u) failed",
- r->nat.proxy_port[0], r->nat.proxy_port[1]));
+ "pf: af-to NAT proxy port allocation (%u-%u) failed",
+ r->nat.proxy_port[0], r->nat.proxy_port[1]);
return (-1);
}
@@ -1182,7 +1162,7 @@ pf_get_transaddr_af(struct pf_krule *r, struct pf_pdesc *pd)
/* get the destination address and port */
if (! TAILQ_EMPTY(&r->rdr.list)) {
if (pf_map_addr_sn(pd->naf, r, &nsaddr, &naddr, NULL, NULL,
- &sns, NULL, &r->rdr, PF_SN_NAT))
+ &r->rdr, PF_SN_NAT))
return (-1);
if (r->rdr.proxy_port[0])
pd->ndport = htons(r->rdr.proxy_port[0]);
diff --git a/sys/netpfil/pf/pf_norm.c b/sys/netpfil/pf/pf_norm.c
index 369292ca365e..8cea9557633c 100644
--- a/sys/netpfil/pf/pf_norm.c
+++ b/sys/netpfil/pf/pf_norm.c
@@ -160,13 +160,6 @@ static int pf_reassemble6(struct mbuf **,
struct ip6_frag *, uint16_t, uint16_t, u_short *);
#endif /* INET6 */
-#define DPFPRINTF(x) do { \
- if (V_pf_status.debug >= PF_DEBUG_MISC) { \
- printf("%s: ", __func__); \
- printf x ; \
- } \
-} while(0)
-
#ifdef INET
static void
pf_ip2key(struct ip *ip, struct pf_frnode *key)
@@ -262,7 +255,8 @@ pf_purge_fragments(uint32_t expire)
if (frag->fr_timeout > expire)
break;
- DPFPRINTF(("expiring %d(%p)\n", frag->fr_id, frag));
+ DPFPRINTF(PF_DEBUG_MISC, "expiring %d(%p)",
+ frag->fr_id, frag);
pf_free_fragment(frag);
}
@@ -281,7 +275,7 @@ pf_flush_fragments(void)
PF_FRAG_ASSERT();
goal = uma_zone_get_cur(V_pf_frent_z) * 9 / 10;
- DPFPRINTF(("trying to free %d frag entriess\n", goal));
+ DPFPRINTF(PF_DEBUG_MISC, "trying to free %d frag entriess", goal);
while (goal < uma_zone_get_cur(V_pf_frent_z)) {
frag = TAILQ_LAST(&V_pf_fragqueue, pf_fragqueue);
if (frag)
@@ -573,26 +567,30 @@ pf_fillup_fragment(struct pf_frnode *key, uint32_t id,
/* No empty fragments. */
if (frent->fe_len == 0) {
- DPFPRINTF(("bad fragment: len 0\n"));
+ DPFPRINTF(PF_DEBUG_MISC, "bad fragment: len 0");
goto bad_fragment;
}
/* All fragments are 8 byte aligned. */
if (frent->fe_mff && (frent->fe_len & 0x7)) {
- DPFPRINTF(("bad fragment: mff and len %d\n", frent->fe_len));
+ DPFPRINTF(PF_DEBUG_MISC, "bad fragment: mff and len %d",
+ frent->fe_len);
goto bad_fragment;
}
/* Respect maximum length, IP_MAXPACKET == IPV6_MAXPACKET. */
if (frent->fe_off + frent->fe_len > IP_MAXPACKET) {
- DPFPRINTF(("bad fragment: max packet %d\n",
- frent->fe_off + frent->fe_len));
+ DPFPRINTF(PF_DEBUG_MISC, "bad fragment: max packet %d",
+ frent->fe_off + frent->fe_len);
goto bad_fragment;
}
- DPFPRINTF((key->fn_af == AF_INET ?
- "reass frag %d @ %d-%d\n" : "reass frag %#08x @ %d-%d\n",
- id, frent->fe_off, frent->fe_off + frent->fe_len));
+ if (key->fn_af == AF_INET)
+ DPFPRINTF(PF_DEBUG_MISC, "reass frag %d @ %d-%d\n",
+ id, frent->fe_off, frent->fe_off + frent->fe_len);
+ else
+ DPFPRINTF(PF_DEBUG_MISC, "reass frag %#08x @ %d-%d",
+ id, frent->fe_off, frent->fe_off + frent->fe_len);
/* Fully buffer all of the fragments in this fragment queue. */
frag = pf_find_fragment(key, id);
@@ -690,10 +688,10 @@ pf_fillup_fragment(struct pf_frnode *key, uint32_t id,
precut = prev->fe_off + prev->fe_len - frent->fe_off;
if (precut >= frent->fe_len) {
- DPFPRINTF(("new frag overlapped\n"));
+ DPFPRINTF(PF_DEBUG_MISC, "new frag overlapped");
goto drop_fragment;
}
- DPFPRINTF(("frag head overlap %d\n", precut));
+ DPFPRINTF(PF_DEBUG_MISC, "frag head overlap %d", precut);
m_adj(frent->fe_m, precut);
frent->fe_off += precut;
frent->fe_len -= precut;
@@ -705,7 +703,8 @@ pf_fillup_fragment(struct pf_frnode *key, uint32_t id,
aftercut = frent->fe_off + frent->fe_len - after->fe_off;
if (aftercut < after->fe_len) {
- DPFPRINTF(("frag tail overlap %d", aftercut));
+ DPFPRINTF(PF_DEBUG_MISC, "frag tail overlap %d",
+ aftercut);
m_adj(after->fe_m, aftercut);
/* Fragment may switch queue as fe_off changes */
pf_frent_remove(frag, after);
@@ -713,7 +712,8 @@ pf_fillup_fragment(struct pf_frnode *key, uint32_t id,
after->fe_len -= aftercut;
/* Insert into correct queue */
if (pf_frent_insert(frag, after, prev)) {
- DPFPRINTF(("fragment requeue limit exceeded"));
+ DPFPRINTF(PF_DEBUG_MISC,
+ "fragment requeue limit exceeded");
m_freem(after->fe_m);
uma_zfree(V_pf_frent_z, after);
/* There is not way to recover */
@@ -723,7 +723,7 @@ pf_fillup_fragment(struct pf_frnode *key, uint32_t id,
}
/* This fragment is completely overlapped, lose it. */
- DPFPRINTF(("old frag overlapped\n"));
+ DPFPRINTF(PF_DEBUG_MISC, "old frag overlapped");
next = TAILQ_NEXT(after, fr_next);
pf_frent_remove(frag, after);
m_freem(after->fe_m);
@@ -732,7 +732,7 @@ pf_fillup_fragment(struct pf_frnode *key, uint32_t id,
/* If part of the queue gets too long, there is not way to recover. */
if (pf_frent_insert(frag, frent, prev)) {
- DPFPRINTF(("fragment queue limit exceeded\n"));
+ DPFPRINTF(PF_DEBUG_MISC, "fragment queue limit exceeded");
goto bad_fragment;
}
@@ -748,7 +748,7 @@ free_fragment:
* fragment, the entire datagram (and any constituent fragments) MUST
* be silently discarded.
*/
- DPFPRINTF(("flush overlapping fragments\n"));
+ DPFPRINTF(PF_DEBUG_MISC, "flush overlapping fragments");
pf_free_fragment(frag);
bad_fragment:
@@ -826,7 +826,8 @@ pf_reassemble(struct mbuf **m0, u_short *reason)
m = *m0 = NULL;
if (frag->fr_holes) {
- DPFPRINTF(("frag %d, holes %d\n", frag->fr_id, frag->fr_holes));
+ DPFPRINTF(PF_DEBUG_MISC, "frag %d, holes %d",
+ frag->fr_id, frag->fr_holes);
return (PF_PASS); /* drop because *m0 is NULL, no error */
}
@@ -872,14 +873,14 @@ pf_reassemble(struct mbuf **m0, u_short *reason)
ip->ip_off &= ~(IP_MF|IP_OFFMASK);
if (hdrlen + total > IP_MAXPACKET) {
- DPFPRINTF(("drop: too big: %d\n", total));
+ DPFPRINTF(PF_DEBUG_MISC, "drop: too big: %d", total);
ip->ip_len = 0;
REASON_SET(reason, PFRES_SHORT);
/* PF_DROP requires a valid mbuf *m0 in pf_test() */
return (PF_DROP);
}
- DPFPRINTF(("complete: %p(%d)\n", m, ntohs(ip->ip_len)));
+ DPFPRINTF(PF_DEBUG_MISC, "complete: %p(%d)", m, ntohs(ip->ip_len));
return (PF_PASS);
}
#endif /* INET */
@@ -931,8 +932,8 @@ pf_reassemble6(struct mbuf **m0, struct ip6_frag *fraghdr,
m = *m0 = NULL;
if (frag->fr_holes) {
- DPFPRINTF(("frag %d, holes %d\n", frag->fr_id,
- frag->fr_holes));
+ DPFPRINTF(PF_DEBUG_MISC, "frag %d, holes %d", frag->fr_id,
+ frag->fr_holes);
PF_FRAG_UNLOCK();
return (PF_PASS); /* Drop because *m0 is NULL, no error. */
}
@@ -993,14 +994,15 @@ pf_reassemble6(struct mbuf **m0, struct ip6_frag *fraghdr,
ip6->ip6_nxt = proto;
if (hdrlen - sizeof(struct ip6_hdr) + total > IPV6_MAXPACKET) {
- DPFPRINTF(("drop: too big: %d\n", total));
+ DPFPRINTF(PF_DEBUG_MISC, "drop: too big: %d", total);
ip6->ip6_plen = 0;
REASON_SET(reason, PFRES_SHORT);
/* PF_DROP requires a valid mbuf *m0 in pf_test6(). */
return (PF_DROP);
}
- DPFPRINTF(("complete: %p(%d)\n", m, ntohs(ip6->ip6_plen)));
+ DPFPRINTF(PF_DEBUG_MISC, "complete: %p(%d)", m,
+ ntohs(ip6->ip6_plen));
return (PF_PASS);
fail:
@@ -1090,7 +1092,7 @@ pf_refragment6(struct ifnet *ifp, struct mbuf **m0, struct m_tag *mtag,
action = PF_PASS;
} else {
/* Drop expects an mbuf to free. */
- DPFPRINTF(("refragment error %d\n", error));
+ DPFPRINTF(PF_DEBUG_MISC, "refragment error %d", error);
action = PF_DROP;
}
for (; m; m = t) {
@@ -1230,7 +1232,7 @@ pf_normalize_ip(u_short *reason, struct pf_pdesc *pd)
* no-df above, fine. Otherwise drop it.
*/
if (h->ip_off & htons(IP_DF)) {
- DPFPRINTF(("IP_DF\n"));
+ DPFPRINTF(PF_DEBUG_MISC, "IP_DF");
goto bad;
}
@@ -1238,13 +1240,13 @@ pf_normalize_ip(u_short *reason, struct pf_pdesc *pd)
/* All fragments are 8 byte aligned */
if (mff && (ip_len & 0x7)) {
- DPFPRINTF(("mff and %d\n", ip_len));
+ DPFPRINTF(PF_DEBUG_MISC, "mff and %d", ip_len);
goto bad;
}
/* Respect maximum length */
if (fragoff + ip_len > IP_MAXPACKET) {
- DPFPRINTF(("max packet %d\n", fragoff + ip_len));
+ DPFPRINTF(PF_DEBUG_MISC, "max packet %d", fragoff + ip_len);
goto bad;
}
@@ -1256,7 +1258,8 @@ pf_normalize_ip(u_short *reason, struct pf_pdesc *pd)
/* Fully buffer all of the fragments
* Might return a completely reassembled mbuf, or NULL */
PF_FRAG_LOCK();
- DPFPRINTF(("reass frag %d @ %d-%d\n", h->ip_id, fragoff, max));
+ DPFPRINTF(PF_DEBUG_MISC, "reass frag %d @ %d-%d",
+ h->ip_id, fragoff, max);
verdict = pf_reassemble(&pd->m, reason);
PF_FRAG_UNLOCK();
@@ -1282,7 +1285,7 @@ pf_normalize_ip(u_short *reason, struct pf_pdesc *pd)
return (PF_PASS);
bad:
- DPFPRINTF(("dropping bad fragment\n"));
+ DPFPRINTF(PF_DEBUG_MISC, "dropping bad fragment");
REASON_SET(reason, PFRES_FRAG);
drop:
if (r != NULL && r->log)
@@ -1711,7 +1714,7 @@ pf_normalize_tcp_stateful(struct pf_pdesc *pd,
(uptime.tv_sec - src->scrub->pfss_last.tv_sec > TS_MAX_IDLE ||
time_uptime - (state->creation / 1000) > TS_MAX_CONN)) {
if (V_pf_status.debug >= PF_DEBUG_MISC) {
- DPFPRINTF(("src idled out of PAWS\n"));
+ DPFPRINTF(PF_DEBUG_MISC, "src idled out of PAWS");
pf_print_state(state);
printf("\n");
}
@@ -1721,7 +1724,7 @@ pf_normalize_tcp_stateful(struct pf_pdesc *pd,
if (dst->scrub && (dst->scrub->pfss_flags & PFSS_PAWS) &&
uptime.tv_sec - dst->scrub->pfss_last.tv_sec > TS_MAX_IDLE) {
if (V_pf_status.debug >= PF_DEBUG_MISC) {
- DPFPRINTF(("dst idled out of PAWS\n"));
+ DPFPRINTF(PF_DEBUG_MISC, "dst idled out of PAWS");
pf_print_state(state);
printf("\n");
}
@@ -1826,22 +1829,22 @@ pf_normalize_tcp_stateful(struct pf_pdesc *pd,
* an old timestamp.
*/
- DPFPRINTF(("Timestamp failed %c%c%c%c\n",
+ DPFPRINTF(PF_DEBUG_MISC, "Timestamp failed %c%c%c%c",
SEQ_LT(tsval, dst->scrub->pfss_tsecr) ? '0' : ' ',
SEQ_GT(tsval, src->scrub->pfss_tsval +
tsval_from_last) ? '1' : ' ',
SEQ_GT(tsecr, dst->scrub->pfss_tsval) ? '2' : ' ',
- SEQ_LT(tsecr, dst->scrub->pfss_tsval0)? '3' : ' '));
- DPFPRINTF((" tsval: %u tsecr: %u +ticks: %u "
- "idle: %jus %lums\n",
+ SEQ_LT(tsecr, dst->scrub->pfss_tsval0)? '3' : ' ');
+ DPFPRINTF(PF_DEBUG_MISC, " tsval: %u tsecr: %u +ticks: "
+ "%u idle: %jus %lums",
tsval, tsecr, tsval_from_last,
(uintmax_t)delta_ts.tv_sec,
- delta_ts.tv_usec / 1000));
- DPFPRINTF((" src->tsval: %u tsecr: %u\n",
- src->scrub->pfss_tsval, src->scrub->pfss_tsecr));
- DPFPRINTF((" dst->tsval: %u tsecr: %u tsval0: %u"
- "\n", dst->scrub->pfss_tsval,
- dst->scrub->pfss_tsecr, dst->scrub->pfss_tsval0));
+ delta_ts.tv_usec / 1000);
+ DPFPRINTF(PF_DEBUG_MISC, " src->tsval: %u tsecr: %u",
+ src->scrub->pfss_tsval, src->scrub->pfss_tsecr);
+ DPFPRINTF(PF_DEBUG_MISC, " dst->tsval: %u tsecr: %u "
+ "tsval0: %u", dst->scrub->pfss_tsval,
+ dst->scrub->pfss_tsecr, dst->scrub->pfss_tsval0);
if (V_pf_status.debug >= PF_DEBUG_MISC) {
pf_print_state(state);
pf_print_flags(tcp_get_flags(th));
@@ -1891,8 +1894,8 @@ pf_normalize_tcp_stateful(struct pf_pdesc *pd,
* stack changed its RFC1323 behavior?!?!
*/
if (V_pf_status.debug >= PF_DEBUG_MISC) {
- DPFPRINTF(("Did not receive expected RFC1323 "
- "timestamp\n"));
+ DPFPRINTF(PF_DEBUG_MISC, "Did not receive expected "
+ "RFC1323 timestamp");
pf_print_state(state);
pf_print_flags(tcp_get_flags(th));
printf("\n");
@@ -1919,9 +1922,9 @@ pf_normalize_tcp_stateful(struct pf_pdesc *pd,
if (V_pf_status.debug >= PF_DEBUG_MISC && dst->scrub &&
(dst->scrub->pfss_flags & PFSS_TIMESTAMP)) {
/* Don't warn if other host rejected RFC1323 */
- DPFPRINTF(("Broken RFC1323 stack did not "
- "timestamp data packet. Disabled PAWS "
- "security.\n"));
+ DPFPRINTF(PF_DEBUG_MISC, "Broken RFC1323 stack did "
+ "not timestamp data packet. Disabled PAWS "
+ "security.");
pf_print_state(state);
pf_print_flags(tcp_get_flags(th));
printf("\n");
diff --git a/sys/netpfil/pf/pf_osfp.c b/sys/netpfil/pf/pf_osfp.c
index 3e00cc7c80a2..150626c5f3fb 100644
--- a/sys/netpfil/pf/pf_osfp.c
+++ b/sys/netpfil/pf/pf_osfp.c
@@ -40,9 +40,6 @@
#endif
static MALLOC_DEFINE(M_PFOSFP, "pf_osfp", "pf(4) operating system fingerprints");
-#define DPFPRINTF(format, x...) \
- if (V_pf_status.debug >= PF_DEBUG_NOISY) \
- printf(format , ##x)
SLIST_HEAD(pf_osfp_list, pf_os_fingerprint);
VNET_DEFINE_STATIC(struct pf_osfp_list, pf_osfp_list) =
@@ -189,8 +186,8 @@ pf_osfp_fingerprint_hdr(const struct ip *ip, const struct ip6_hdr *ip6, const st
optlen = MAX(optlen, 1); /* paranoia */
}
- DPFPRINTF("fingerprinted %s:%d %d:%d:%d:%d:%llx (%d) "
- "(TS=%s,M=%s%d,W=%s%d)\n",
+ DPFPRINTF(PF_DEBUG_NOISY, "fingerprinted %s:%d %d:%d:%d:%d:%llx (%d) "
+ "(TS=%s,M=%s%d,W=%s%d)",
srcname, ntohs(tcp->th_sport),
fp.fp_wsize, fp.fp_ttl, (fp.fp_flags & PF_OSFP_DF) != 0,
fp.fp_psize, (long long int)fp.fp_tcpopts, fp.fp_optcnt,
@@ -219,7 +216,7 @@ pf_osfp_match(struct pf_osfp_enlist *list, pf_osfp_t os)
if (os == PF_OSFP_ANY)
return (1);
if (list == NULL) {
- DPFPRINTF("osfp no match against %x\n", os);
+ DPFPRINTF(PF_DEBUG_NOISY, "osfp no match against %x", os);
return (os == PF_OSFP_UNKNOWN);
}
PF_OSFP_UNPACK(os, os_class, os_version, os_subtype);
@@ -228,13 +225,13 @@ pf_osfp_match(struct pf_osfp_enlist *list, pf_osfp_t os)
if ((os_class == PF_OSFP_ANY || en_class == os_class) &&
(os_version == PF_OSFP_ANY || en_version == os_version) &&
(os_subtype == PF_OSFP_ANY || en_subtype == os_subtype)) {
- DPFPRINTF("osfp matched %s %s %s %x==%x\n",
+ DPFPRINTF(PF_DEBUG_NOISY, "osfp matched %s %s %s %x==%x",
entry->fp_class_nm, entry->fp_version_nm,
entry->fp_subtype_nm, os, entry->fp_os);
return (1);
}
}
- DPFPRINTF("fingerprint 0x%x didn't match\n", os);
+ DPFPRINTF(PF_DEBUG_NOISY, "fingerprint 0x%x didn't match", os);
return (0);
}
@@ -275,8 +272,8 @@ pf_osfp_add(struct pf_osfp_ioctl *fpioc)
fpadd.fp_ttl = fpioc->fp_ttl;
#if 0 /* XXX RYAN wants to fix logging */
- DPFPRINTF("adding osfp %s %s %s = %s%d:%d:%d:%s%d:0x%llx %d "
- "(TS=%s,M=%s%d,W=%s%d) %x\n",
+ DPFPRINTF(PF_DEBUG_NOISY, "adding osfp %s %s %s ="
+ " %s%d:%d:%d:%s%d:0x%llx %d (TS=%s,M=%s%d,W=%s%d) %x",
fpioc->fp_os.fp_class_nm, fpioc->fp_os.fp_version_nm,
fpioc->fp_os.fp_subtype_nm,
(fpadd.fp_flags & PF_OSFP_WSIZE_MOD) ? "%" :
diff --git a/sys/netpfil/pf/pf_ruleset.c b/sys/netpfil/pf/pf_ruleset.c
index 2e5165a9900c..43b51f2933f4 100644
--- a/sys/netpfil/pf/pf_ruleset.c
+++ b/sys/netpfil/pf/pf_ruleset.c
@@ -59,9 +59,6 @@
#error "Kernel only file. Please use sbin/pfctl/pf_ruleset.c instead."
#endif
-#define DPFPRINTF(format, x...) \
- if (V_pf_status.debug >= PF_DEBUG_NOISY) \
- printf(format , ##x)
#define rs_malloc(x) malloc(x, M_TEMP, M_NOWAIT|M_ZERO)
#define rs_free(x) free(x, M_TEMP)
@@ -386,7 +383,8 @@ pf_kanchor_setup(struct pf_krule *r, const struct pf_kruleset *s,
strlcpy(path, s->anchor->path, MAXPATHLEN);
while (name[0] == '.' && name[1] == '.' && name[2] == '/') {
if (!path[0]) {
- DPFPRINTF("%s: .. beyond root\n", __func__);
+ DPFPRINTF(PF_DEBUG_NOISY, "%s: .. beyond root",
+ __func__);
rs_free(path);
return (1);
}
@@ -408,7 +406,7 @@ pf_kanchor_setup(struct pf_krule *r, const struct pf_kruleset *s,
ruleset = pf_find_or_create_kruleset(path);
rs_free(path);
if (ruleset == NULL || ruleset == &pf_main_ruleset) {
- DPFPRINTF("%s: ruleset\n", __func__);
+ DPFPRINTF(PF_DEBUG_NOISY, "%s: ruleset", __func__);
return (1);
}
r->anchor = ruleset->anchor;
@@ -690,7 +688,8 @@ pf_keth_anchor_setup(struct pf_keth_rule *r, const struct pf_keth_ruleset *s,
strlcpy(path, s->anchor->path, MAXPATHLEN);
while (name[0] == '.' && name[1] == '.' && name[2] == '/') {
if (!path[0]) {
- DPFPRINTF("%s: .. beyond root\n", __func__);
+ DPFPRINTF(PF_DEBUG_NOISY, "%s: .. beyond root",
+ __func__);
rs_free(path);
return (1);
}
@@ -712,7 +711,7 @@ pf_keth_anchor_setup(struct pf_keth_rule *r, const struct pf_keth_ruleset *s,
ruleset = pf_find_or_create_keth_ruleset(path);
rs_free(path);
if (ruleset == NULL || ruleset->anchor == NULL) {
- DPFPRINTF("%s: ruleset\n", __func__);
+ DPFPRINTF(PF_DEBUG_NOISY, "%s: ruleset", __func__);
return (1);
}
r->anchor = ruleset->anchor;
diff --git a/sys/netpfil/pf/pf_syncookies.c b/sys/netpfil/pf/pf_syncookies.c
index 66757fa4b756..4a935bc65767 100644
--- a/sys/netpfil/pf/pf_syncookies.c
+++ b/sys/netpfil/pf/pf_syncookies.c
@@ -88,8 +88,6 @@
#include <net/pfvar.h>
#include <netpfil/pf/pf_nv.h>
-#define DPFPRINTF(n, x) if (V_pf_status.debug >= (n)) printf x
-
union pf_syncookie {
uint8_t cookie;
struct {
@@ -281,7 +279,7 @@ pf_synflood_check(struct pf_pdesc *pd)
pf_syncookie_rotate, curvnet);
V_pf_status.syncookies_active = true;
DPFPRINTF(LOG_WARNING,
- ("synflood detected, enabling syncookies\n"));
+ "synflood detected, enabling syncookies");
// XXXTODO V_pf_status.lcounters[LCNT_SYNFLOODS]++;
}
@@ -367,7 +365,7 @@ pf_syncookie_rotate(void *arg)
V_pf_status.syncookies_mode == PF_SYNCOOKIES_NEVER)
) {
V_pf_status.syncookies_active = false;
- DPFPRINTF(PF_DEBUG_MISC, ("syncookies disabled\n"));
+ DPFPRINTF(PF_DEBUG_MISC, "syncookies disabled");
}
/* nothing in flight any more? delete keys and return */
diff --git a/sys/netpfil/pf/pf_table.c b/sys/netpfil/pf/pf_table.c
index 43e4366845a2..ecc185f89ad7 100644
--- a/sys/netpfil/pf/pf_table.c
+++ b/sys/netpfil/pf/pf_table.c
@@ -49,8 +49,6 @@
#include <net/vnet.h>
#include <net/pfvar.h>
-#define DPFPRINTF(n, x) if (V_pf_status.debug >= (n)) printf x
-
#define ACCEPT_FLAGS(flags, oklist) \
do { \
if ((flags & ~(oklist)) & \
@@ -819,10 +817,10 @@ pfr_create_kentry(struct pfr_addr *ad, bool counters)
static void
pfr_destroy_kentries(struct pfr_kentryworkq *workq)
{
- struct pfr_kentry *p, *q;
+ struct pfr_kentry *p;
- for (p = SLIST_FIRST(workq); p != NULL; p = q) {
- q = SLIST_NEXT(p, pfrke_workq);
+ while ((p = SLIST_FIRST(workq)) != NULL) {
+ SLIST_REMOVE_HEAD(workq, pfrke_workq);
pfr_destroy_kentry(p);
}
}
@@ -1680,8 +1678,7 @@ pfr_ina_commit(struct pfr_table *trs, u_int32_t ticket, int *nadd,
}
if (!(flags & PFR_FLAG_DUMMY)) {
- for (p = SLIST_FIRST(&workq); p != NULL; p = q) {
- q = SLIST_NEXT(p, pfrkt_workq);
+ SLIST_FOREACH_SAFE(p, &workq, pfrkt_workq, q) {
pfr_commit_ktable(p, tzero);
}
rs->topen = 0;
@@ -1710,7 +1707,7 @@ pfr_commit_ktable(struct pfr_ktable *kt, time_t tzero)
} else if (kt->pfrkt_flags & PFR_TFLAG_ACTIVE) {
/* kt might contain addresses */
struct pfr_kentryworkq addrq, addq, changeq, delq, garbageq;
- struct pfr_kentry *p, *q, *next;
+ struct pfr_kentry *p, *q;
struct pfr_addr ad;
pfr_enqueue_addrs(shadow, &addrq, NULL, 0);
@@ -1720,7 +1717,8 @@ pfr_commit_ktable(struct pfr_ktable *kt, time_t tzero)
SLIST_INIT(&delq);
SLIST_INIT(&garbageq);
pfr_clean_node_mask(shadow, &addrq);
- SLIST_FOREACH_SAFE(p, &addrq, pfrke_workq, next) {
+ while ((p = SLIST_FIRST(&addrq)) != NULL) {
+ SLIST_REMOVE_HEAD(&addrq, pfrke_workq);
pfr_copyout_addr(&ad, p);
q = pfr_lookup_addr(kt, &ad, 1);
if (q != NULL) {
@@ -1864,8 +1862,7 @@ pfr_setflags_ktables(struct pfr_ktableworkq *workq)
{
struct pfr_ktable *p, *q;
- for (p = SLIST_FIRST(workq); p; p = q) {
- q = SLIST_NEXT(p, pfrkt_workq);
+ SLIST_FOREACH_SAFE(p, workq, pfrkt_workq, q) {
pfr_setflags_ktable(p, p->pfrkt_nflags);
}
}
@@ -2015,10 +2012,10 @@ pfr_create_ktable(struct pfr_table *tbl, time_t tzero, int attachruleset)
static void
pfr_destroy_ktables(struct pfr_ktableworkq *workq, int flushaddr)
{
- struct pfr_ktable *p, *q;
+ struct pfr_ktable *p;
- for (p = SLIST_FIRST(workq); p; p = q) {
- q = SLIST_NEXT(p, pfrkt_workq);
+ while ((p = SLIST_FIRST(workq)) != NULL) {
+ SLIST_REMOVE_HEAD(workq, pfrkt_workq);
pfr_destroy_ktable(p, flushaddr);
}
}
@@ -2074,17 +2071,16 @@ pfr_lookup_table(struct pfr_table *tbl)
(struct pfr_ktable *)tbl));
}
-int
-pfr_match_addr(struct pfr_ktable *kt, struct pf_addr *a, sa_family_t af)
+static struct pfr_kentry *
+pfr_kentry_byaddr(struct pfr_ktable *kt, struct pf_addr *a, sa_family_t af,
+ int exact)
{
struct pfr_kentry *ke = NULL;
- int match;
PF_RULES_RASSERT();
- if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE) && kt->pfrkt_root != NULL)
- kt = kt->pfrkt_root;
- if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE))
+ kt = pfr_ktable_select_active(kt);
+ if (kt == NULL)
return (0);
switch (af) {
@@ -2121,11 +2117,26 @@ pfr_match_addr(struct pfr_ktable *kt, struct pf_addr *a, sa_family_t af)
default:
unhandled_af(af);
}
+ if (exact && ke && KENTRY_NETWORK(ke))
+ ke = NULL;
+
+ return (ke);
+}
+
+int
+pfr_match_addr(struct pfr_ktable *kt, struct pf_addr *a, sa_family_t af)
+{
+ struct pfr_kentry *ke = NULL;
+ int match;
+
+ ke = pfr_kentry_byaddr(kt, a, af, 0);
+
match = (ke && !ke->pfrke_not);
if (match)
pfr_kstate_counter_add(&kt->pfrkt_match, 1);
else
pfr_kstate_counter_add(&kt->pfrkt_nomatch, 1);
+
return (match);
}
@@ -2135,9 +2146,8 @@ pfr_update_stats(struct pfr_ktable *kt, struct pf_addr *a, sa_family_t af,
{
struct pfr_kentry *ke = NULL;
- if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE) && kt->pfrkt_root != NULL)
- kt = kt->pfrkt_root;
- if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE))
+ kt = pfr_ktable_select_active(kt);
+ if (kt == NULL)
return;
switch (af) {
@@ -2177,7 +2187,7 @@ pfr_update_stats(struct pfr_ktable *kt, struct pf_addr *a, sa_family_t af,
if ((ke == NULL || ke->pfrke_not) != notrule) {
if (op_pass != PFR_OP_PASS)
DPFPRINTF(PF_DEBUG_URGENT,
- ("pfr_update_stats: assertion failed.\n"));
+ "pfr_update_stats: assertion failed.");
op_pass = PFR_OP_XPASS;
}
pfr_kstate_counter_add(&kt->pfrkt_packets[dir_out][op_pass], 1);
@@ -2281,7 +2291,7 @@ pfr_detach_table(struct pfr_ktable *kt)
int
pfr_pool_get(struct pfr_ktable *kt, int *pidx, struct pf_addr *counter,
- sa_family_t af, pf_addr_filter_func_t filter)
+ sa_family_t af, pf_addr_filter_func_t filter, bool loop_once)
{
struct pf_addr *addr, cur, mask, umask_addr;
union sockaddr_union uaddr, umask;
@@ -2306,9 +2316,8 @@ pfr_pool_get(struct pfr_ktable *kt, int *pidx, struct pf_addr *counter,
unhandled_af(af);
}
- if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE) && kt->pfrkt_root != NULL)
- kt = kt->pfrkt_root;
- if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE))
+ kt = pfr_ktable_select_active(kt);
+ if (kt == NULL)
return (-1);
idx = *pidx;
@@ -2327,7 +2336,7 @@ _next_block:
ke = pfr_kentry_byidx(kt, idx, af);
if (ke == NULL) {
/* we don't have this idx, try looping */
- if (loop || (ke = pfr_kentry_byidx(kt, 0, af)) == NULL) {
+ if ((loop || loop_once) || (ke = pfr_kentry_byidx(kt, 0, af)) == NULL) {
pfr_kstate_counter_add(&kt->pfrkt_nomatch, 1);
return (1);
}
@@ -2455,3 +2464,14 @@ pfr_dynaddr_update(struct pfr_ktable *kt, struct pfi_dynaddr *dyn)
unhandled_af(dyn->pfid_af);
}
}
+
+struct pfr_ktable *
+pfr_ktable_select_active(struct pfr_ktable *kt)
+{
+ if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE) && kt->pfrkt_root != NULL)
+ kt = kt->pfrkt_root;
+ if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE))
+ return (NULL);
+
+ return (kt);
+}
diff --git a/sys/netsmb/smb_conn.c b/sys/netsmb/smb_conn.c
index 259635e2d8d5..ab6cd130a057 100644
--- a/sys/netsmb/smb_conn.c
+++ b/sys/netsmb/smb_conn.c
@@ -422,7 +422,7 @@ smb_vc_create(struct smb_vcspec *vcspec,
if (uid == SMBM_ANY_OWNER)
uid = realuid;
if (gid == SMBM_ANY_GROUP)
- gid = cred->cr_groups[0];
+ gid = cred->cr_gid;
vcp->vc_uid = uid;
vcp->vc_grp = gid;
@@ -765,7 +765,7 @@ smb_share_create(struct smb_vc *vcp, struct smb_sharespec *shspec,
if (uid == SMBM_ANY_OWNER)
uid = realuid;
if (gid == SMBM_ANY_GROUP)
- gid = cred->cr_groups[0];
+ gid = cred->cr_gid;
ssp = smb_zmalloc(sizeof(*ssp), M_SMBCONN, M_WAITOK);
smb_co_init(SSTOCP(ssp), SMBL_SHARE, "smbss ilock", "smbss");
ssp->obj.co_free = smb_share_free;
diff --git a/sys/powerpc/aim/mmu_oea.c b/sys/powerpc/aim/mmu_oea.c
index 7746b668265d..ae17b3289593 100644
--- a/sys/powerpc/aim/mmu_oea.c
+++ b/sys/powerpc/aim/mmu_oea.c
@@ -1469,6 +1469,9 @@ moea_page_set_memattr(vm_page_t m, vm_memattr_t ma)
pmap_t pmap;
u_int lo;
+ if (m->md.mdpg_cache_attrs == ma)
+ return;
+
if ((m->oflags & VPO_UNMANAGED) != 0) {
m->md.mdpg_cache_attrs = ma;
return;
diff --git a/sys/powerpc/aim/mmu_oea64.c b/sys/powerpc/aim/mmu_oea64.c
index 79cea408bb5f..796b1719b8ba 100644
--- a/sys/powerpc/aim/mmu_oea64.c
+++ b/sys/powerpc/aim/mmu_oea64.c
@@ -2134,6 +2134,9 @@ moea64_page_set_memattr(vm_page_t m, vm_memattr_t ma)
CTR3(KTR_PMAP, "%s: pa=%#jx, ma=%#x",
__func__, (uintmax_t)VM_PAGE_TO_PHYS(m), ma);
+ if (m->md.mdpg_cache_attrs == ma)
+ return;
+
if ((m->oflags & VPO_UNMANAGED) != 0) {
m->md.mdpg_cache_attrs = ma;
return;
diff --git a/sys/powerpc/aim/mmu_radix.c b/sys/powerpc/aim/mmu_radix.c
index 45f7bef8bcc9..a12142fc2d7b 100644
--- a/sys/powerpc/aim/mmu_radix.c
+++ b/sys/powerpc/aim/mmu_radix.c
@@ -5937,6 +5937,10 @@ mmu_radix_page_set_memattr(vm_page_t m, vm_memattr_t ma)
{
CTR3(KTR_PMAP, "%s(%p, %#x)", __func__, m, ma);
+
+ if (m->md.mdpg_cache_attrs == ma)
+ return;
+
m->md.mdpg_cache_attrs = ma;
/*
diff --git a/sys/powerpc/include/pcb.h b/sys/powerpc/include/pcb.h
index 050ada6b0f64..0230cf78aba7 100644
--- a/sys/powerpc/include/pcb.h
+++ b/sys/powerpc/include/pcb.h
@@ -66,16 +66,8 @@ struct pcb {
#define PCB_VECREGS 0x200 /* Process had Altivec registers initialized */
struct fpu {
union {
-#if _BYTE_ORDER == _BIG_ENDIAN
- double fpr;
- uint32_t vsr[4];
-#else
uint32_t vsr[4];
- struct {
- double padding;
- double fpr;
- };
-#endif
+ double fpr;
} fpr[32];
double fpscr; /* FPSCR stored as double for easier access */
} pcb_fpu; /* Floating point processor */
diff --git a/sys/powerpc/include/ucontext.h b/sys/powerpc/include/ucontext.h
index d35c6c773fe0..dc87edd578bc 100644
--- a/sys/powerpc/include/ucontext.h
+++ b/sys/powerpc/include/ucontext.h
@@ -41,6 +41,7 @@ typedef struct __mcontext {
int mc_flags;
#define _MC_FP_VALID 0x01
#define _MC_AV_VALID 0x02
+#define _MC_VS_VALID 0x04
int mc_onstack; /* saved onstack flag */
int mc_len; /* sizeof(__mcontext) */
__uint64_t mc_avec[32*2]; /* vector register file */
@@ -56,6 +57,7 @@ typedef struct __mcontext32 {
int mc_flags;
#define _MC_FP_VALID 0x01
#define _MC_AV_VALID 0x02
+#define _MC_VS_VALID 0x04
int mc_onstack; /* saved onstack flag */
int mc_len; /* sizeof(__mcontext) */
uint64_t mc_avec[32*2]; /* vector register file */
diff --git a/sys/powerpc/powerpc/exec_machdep.c b/sys/powerpc/powerpc/exec_machdep.c
index 1893d79f29a8..8a33d0f589a7 100644
--- a/sys/powerpc/powerpc/exec_machdep.c
+++ b/sys/powerpc/powerpc/exec_machdep.c
@@ -214,10 +214,10 @@ sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
sfpsize = sizeof(sf);
#ifdef __powerpc64__
/*
- * 64-bit PPC defines a 288 byte scratch region
- * below the stack.
+ * 64-bit PPC defines a 512 byte red zone below
+ * the existing stack (ELF ABI v2 §2.2.2.4)
*/
- rndfsize = 288 + roundup(sizeof(sf), 48);
+ rndfsize = 512 + roundup(sizeof(sf), 48);
#else
rndfsize = roundup(sizeof(sf), 16);
#endif
@@ -349,13 +349,6 @@ sys_sigreturn(struct thread *td, struct sigreturn_args *uap)
if (error != 0)
return (error);
- /*
- * Save FPU state if needed. User may have changed it on
- * signal handler
- */
- if (uc.uc_mcontext.mc_srr1 & PSL_FP)
- save_fpu(td);
-
kern_sigprocmask(td, SIG_SETMASK, &uc.uc_sigmask, NULL, 0);
CTR3(KTR_SIG, "sigreturn: return td=%p pc=%#x sp=%#x",
@@ -432,6 +425,7 @@ grab_mcontext(struct thread *td, mcontext_t *mcp, int flags)
}
if (pcb->pcb_flags & PCB_VSX) {
+ mcp->mc_flags |= _MC_VS_VALID;
for (i = 0; i < 32; i++)
memcpy(&mcp->mc_vsxfpreg[i],
&pcb->pcb_fpu.fpr[i].vsr[2], sizeof(double));
@@ -481,6 +475,7 @@ set_mcontext(struct thread *td, mcontext_t *mcp)
struct pcb *pcb;
struct trapframe *tf;
register_t tls;
+ register_t msr;
int i;
pcb = td->td_pcb;
@@ -531,6 +526,22 @@ set_mcontext(struct thread *td, mcontext_t *mcp)
tf->srr1 &= ~(PSL_FP | PSL_VSX | PSL_VEC);
pcb->pcb_flags &= ~(PCB_FPU | PCB_VSX | PCB_VEC);
+ /*
+ * Ensure the FPU is also disabled in hardware.
+ *
+ * Without this, it's possible for the register reload to fail if we
+ * don't switch to a FPU disabled context before resuming the original
+ * thread. Specifically, if the FPU/VSX unavailable exception is never
+ * hit, then whatever data is still in the FP/VSX registers when
+ * sigresume is callled will used by the resumed thread, instead of the
+ * previously saved data from the mcontext.
+ */
+ critical_enter();
+ msr = mfmsr() & ~(PSL_FP | PSL_VSX | PSL_VEC);
+ isync();
+ mtmsr(msr);
+ critical_exit();
+
if (mcp->mc_flags & _MC_FP_VALID) {
/* enable_fpu() will happen lazily on a fault */
pcb->pcb_flags |= PCB_FPREGS;
@@ -539,8 +550,12 @@ set_mcontext(struct thread *td, mcontext_t *mcp)
for (i = 0; i < 32; i++) {
memcpy(&pcb->pcb_fpu.fpr[i].fpr, &mcp->mc_fpreg[i],
sizeof(double));
- memcpy(&pcb->pcb_fpu.fpr[i].vsr[2],
- &mcp->mc_vsxfpreg[i], sizeof(double));
+ }
+ if (mcp->mc_flags & _MC_VS_VALID) {
+ for (i = 0; i < 32; i++) {
+ memcpy(&pcb->pcb_fpu.fpr[i].vsr[2],
+ &mcp->mc_vsxfpreg[i], sizeof(double));
+ }
}
}
diff --git a/sys/powerpc/powerpc/fpu.c b/sys/powerpc/powerpc/fpu.c
index 0eaff2ea4932..cc8f22f7dda3 100644
--- a/sys/powerpc/powerpc/fpu.c
+++ b/sys/powerpc/powerpc/fpu.c
@@ -64,8 +64,19 @@ save_fpu_int(struct thread *td)
* Save the floating-point registers and FPSCR to the PCB
*/
if (pcb->pcb_flags & PCB_VSX) {
- #define SFP(n) __asm ("stxvw4x " #n ", 0,%0" \
+#if _BYTE_ORDER == _BIG_ENDIAN
+ #define SFP(n) __asm("stxvw4x " #n ", 0,%0" \
:: "b"(&pcb->pcb_fpu.fpr[n]));
+#else
+ /*
+ * stxvw2x will swap words within the FP double word on LE systems,
+ * leading to corruption if VSX is used to store state and FP is
+ * subsequently used to restore state.
+ * Use stxvd2x instead.
+ */
+ #define SFP(n) __asm("stxvd2x " #n ", 0,%0" \
+ :: "b"(&pcb->pcb_fpu.fpr[n]));
+#endif
SFP(0); SFP(1); SFP(2); SFP(3);
SFP(4); SFP(5); SFP(6); SFP(7);
SFP(8); SFP(9); SFP(10); SFP(11);
@@ -76,7 +87,7 @@ save_fpu_int(struct thread *td)
SFP(28); SFP(29); SFP(30); SFP(31);
#undef SFP
} else {
- #define SFP(n) __asm ("stfd " #n ", 0(%0)" \
+ #define SFP(n) __asm("stfd " #n ", 0(%0)" \
:: "b"(&pcb->pcb_fpu.fpr[n].fpr));
SFP(0); SFP(1); SFP(2); SFP(3);
SFP(4); SFP(5); SFP(6); SFP(7);
@@ -149,8 +160,19 @@ enable_fpu(struct thread *td)
:: "b"(&pcb->pcb_fpu.fpscr));
if (pcb->pcb_flags & PCB_VSX) {
- #define LFP(n) __asm ("lxvw4x " #n ", 0,%0" \
+#if _BYTE_ORDER == _BIG_ENDIAN
+ #define LFP(n) __asm("lxvw4x " #n ", 0,%0" \
+ :: "b"(&pcb->pcb_fpu.fpr[n]));
+#else
+ /*
+ * lxvw4x will swap words within the FP double word on LE systems,
+ * leading to corruption if FP is used to store state and VSX is
+ * subsequently used to restore state.
+ * Use lxvd2x instead.
+ */
+ #define LFP(n) __asm("lxvd2x " #n ", 0,%0" \
:: "b"(&pcb->pcb_fpu.fpr[n]));
+#endif
LFP(0); LFP(1); LFP(2); LFP(3);
LFP(4); LFP(5); LFP(6); LFP(7);
LFP(8); LFP(9); LFP(10); LFP(11);
@@ -161,7 +183,7 @@ enable_fpu(struct thread *td)
LFP(28); LFP(29); LFP(30); LFP(31);
#undef LFP
} else {
- #define LFP(n) __asm ("lfd " #n ", 0(%0)" \
+ #define LFP(n) __asm("lfd " #n ", 0(%0)" \
:: "b"(&pcb->pcb_fpu.fpr[n].fpr));
LFP(0); LFP(1); LFP(2); LFP(3);
LFP(4); LFP(5); LFP(6); LFP(7);
diff --git a/sys/riscv/allwinner/files.allwinner b/sys/riscv/allwinner/files.allwinner
index 73fa9660e2d2..7a4ff6b9c62e 100644
--- a/sys/riscv/allwinner/files.allwinner
+++ b/sys/riscv/allwinner/files.allwinner
@@ -1,5 +1,6 @@
arm/allwinner/aw_gpio.c optional gpio aw_gpio fdt
+arm/allwinner/aw_mmc.c optional mmc aw_mmc fdt | mmccam aw_mmc fdt
arm/allwinner/aw_rtc.c optional aw_rtc fdt
arm/allwinner/aw_syscon.c optional syscon
arm/allwinner/aw_sid.c optional aw_sid nvmem
diff --git a/sys/riscv/conf/std.allwinner b/sys/riscv/conf/std.allwinner
index 2b1e0d4e09dc..34fe195b01ba 100644
--- a/sys/riscv/conf/std.allwinner
+++ b/sys/riscv/conf/std.allwinner
@@ -7,6 +7,7 @@ options SOC_ALLWINNER_D1
device aw_ccu # Allwinner clock controller
device aw_gpio # Allwinner GPIO controller
+device aw_mmc # Allwinner SD/MMC controller
device aw_rtc # Allwinner Real-time Clock
device aw_sid # Allwinner Secure ID EFUSE
device aw_timer # Allwinner Timer
diff --git a/sys/riscv/include/vmm_dev.h b/sys/riscv/include/vmm_dev.h
index 856ff0778b95..4d30d5a1c35b 100644
--- a/sys/riscv/include/vmm_dev.h
+++ b/sys/riscv/include/vmm_dev.h
@@ -34,6 +34,8 @@
#ifndef _VMM_DEV_H_
#define _VMM_DEV_H_
+#include <sys/domainset.h>
+
#include <machine/vmm.h>
struct vm_memmap {
@@ -56,6 +58,9 @@ struct vm_memseg {
int segid;
size_t len;
char name[VM_MAX_SUFFIXLEN + 1];
+ domainset_t *ds_mask;
+ size_t ds_mask_size;
+ int ds_policy;
};
struct vm_register {
diff --git a/sys/riscv/riscv/pmap.c b/sys/riscv/riscv/pmap.c
index 5d15bd671285..26efaecc64d1 100644
--- a/sys/riscv/riscv/pmap.c
+++ b/sys/riscv/riscv/pmap.c
@@ -4838,6 +4838,8 @@ pmap_unmapbios(void *p, vm_size_t size)
void
pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
{
+ if (m->md.pv_memattr == ma)
+ return;
m->md.pv_memattr = ma;
diff --git a/sys/rpc/authunix_prot.c b/sys/rpc/authunix_prot.c
index 91fb96f44397..7b531946488a 100644
--- a/sys/rpc/authunix_prot.c
+++ b/sys/rpc/authunix_prot.c
@@ -93,9 +93,10 @@ xdr_authunix_parms(XDR *xdrs, uint32_t *time, struct xucred *cred)
if (!xdr_uint32_t(xdrs, &cred->cr_uid))
return (FALSE);
- if (!xdr_uint32_t(xdrs, &cred->cr_groups[0]))
+ if (!xdr_uint32_t(xdrs, &cred->cr_gid))
return (FALSE);
+ /* XXXKE Fix this is cr_gid gets separated out. */
if (xdrs->x_op == XDR_ENCODE) {
ngroups = cred->cr_ngroups - 1;
if (ngroups > NGRPS)
@@ -105,7 +106,7 @@ xdr_authunix_parms(XDR *xdrs, uint32_t *time, struct xucred *cred)
if (!xdr_uint32_t(xdrs, &ngroups))
return (FALSE);
for (i = 0; i < ngroups; i++) {
- if (i + 1 < ngroups_max + 1) {
+ if (i < ngroups_max) {
if (!xdr_uint32_t(xdrs, &cred->cr_groups[i + 1]))
return (FALSE);
} else {
@@ -115,7 +116,7 @@ xdr_authunix_parms(XDR *xdrs, uint32_t *time, struct xucred *cred)
}
if (xdrs->x_op == XDR_DECODE) {
- if (ngroups + 1 > ngroups_max + 1)
+ if (ngroups > ngroups_max)
cred->cr_ngroups = ngroups_max + 1;
else
cred->cr_ngroups = ngroups + 1;
diff --git a/sys/rpc/clnt_rc.c b/sys/rpc/clnt_rc.c
index 9e87af578885..44b63e38a8e6 100644
--- a/sys/rpc/clnt_rc.c
+++ b/sys/rpc/clnt_rc.c
@@ -198,6 +198,12 @@ clnt_reconnect_connect(CLIENT *cl)
newclient = clnt_vc_create(so,
(struct sockaddr *) &rc->rc_addr, rc->rc_prog, rc->rc_vers,
rc->rc_sendsz, rc->rc_recvsz, rc->rc_intr);
+ /*
+ * CLSET_FD_CLOSE must be done now, in case rpctls_connect()
+ * fails just below.
+ */
+ if (newclient != NULL)
+ CLNT_CONTROL(newclient, CLSET_FD_CLOSE, 0);
if (rc->rc_tls && newclient != NULL) {
CURVNET_SET(so->so_vnet);
stat = rpctls_connect(newclient, rc->rc_tlscertname, so,
@@ -236,7 +242,6 @@ clnt_reconnect_connect(CLIENT *cl)
goto out;
}
- CLNT_CONTROL(newclient, CLSET_FD_CLOSE, 0);
CLNT_CONTROL(newclient, CLSET_CONNECT, &one);
CLNT_CONTROL(newclient, CLSET_TIMEOUT, &rc->rc_timeout);
CLNT_CONTROL(newclient, CLSET_RETRY_TIMEOUT, &rc->rc_retry);
diff --git a/sys/rpc/rpcsec_gss/rpcsec_gss.c b/sys/rpc/rpcsec_gss/rpcsec_gss.c
index 62c71937a185..983dd251f81f 100644
--- a/sys/rpc/rpcsec_gss/rpcsec_gss.c
+++ b/sys/rpc/rpcsec_gss/rpcsec_gss.c
@@ -67,6 +67,7 @@
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/hash.h>
+#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/kobj.h>
#include <sys/lock.h>
@@ -772,6 +773,17 @@ rpc_gss_init(AUTH *auth, rpc_gss_options_ret_t *options_ret)
gd->gd_cred.gc_seq = 0;
/*
+ * XXX Threads from inside jails can get here via calls
+ * to clnt_vc_call()->AUTH_REFRESH()->rpc_gss_refresh()
+ * but the NFS mount is always done outside of the
+ * jails in vnet0. Since the thread credentials won't
+ * necessarily have cr_prison == vnet0 and this function
+ * has no access to the socket, using vnet0 seems the
+ * only option. This is broken if NFS mounts are enabled
+ * within vnet prisons.
+ */
+ KGSS_CURVNET_SET_QUIET(vnet0);
+ /*
* For KerberosV, if there is a client principal name, that implies
* that this is a host based initiator credential in the default
* keytab file. For this case, it is necessary to do a
@@ -994,12 +1006,14 @@ out:
gss_delete_sec_context(&min_stat, &gd->gd_ctx,
GSS_C_NO_BUFFER);
}
+ KGSS_CURVNET_RESTORE();
mtx_lock(&gd->gd_lock);
gd->gd_state = RPCSEC_GSS_START;
wakeup(gd);
mtx_unlock(&gd->gd_lock);
return (FALSE);
}
+ KGSS_CURVNET_RESTORE();
mtx_lock(&gd->gd_lock);
gd->gd_state = RPCSEC_GSS_ESTABLISHED;
diff --git a/sys/rpc/rpcsec_tls/rpctls_impl.c b/sys/rpc/rpcsec_tls/rpctls_impl.c
index 93fe283e65fd..51fe270b13d9 100644
--- a/sys/rpc/rpcsec_tls/rpctls_impl.c
+++ b/sys/rpc/rpcsec_tls/rpctls_impl.c
@@ -240,6 +240,14 @@ rpctls_rpc_failed(struct upsock *ups, struct socket *so)
* failed to do the handshake.
*/
mtx_unlock(&rpctls_lock);
+ /*
+ * Do a shutdown on the socket, since the daemon is
+ * probably stuck in SSL_accept() or SSL_connect() trying to
+ * read the socket. Do not soclose() the socket, since the
+ * daemon will close() the socket after SSL_accept()
+ * returns an error.
+ */
+ soshutdown(so, SHUT_RD);
}
}
diff --git a/sys/rpc/svc_auth_unix.c b/sys/rpc/svc_auth_unix.c
index 5d6402a05006..b10ef33be704 100644
--- a/sys/rpc/svc_auth_unix.c
+++ b/sys/rpc/svc_auth_unix.c
@@ -83,12 +83,13 @@ _svcauth_unix(struct svc_req *rqst, struct rpc_msg *msg)
str_len = RNDUP(str_len);
buf += str_len / sizeof (int32_t);
xcr->cr_uid = IXDR_GET_UINT32(buf);
- xcr->cr_groups[0] = IXDR_GET_UINT32(buf);
+ xcr->cr_gid = IXDR_GET_UINT32(buf);
gid_len = (size_t)IXDR_GET_UINT32(buf);
if (gid_len > NGRPS) {
stat = AUTH_BADCRED;
goto done;
}
+ /* XXXKE Fix this if cr_gid gets separated out. */
for (i = 0; i < gid_len; i++) {
if (i + 1 < XU_NGROUPS)
xcr->cr_groups[i + 1] = IXDR_GET_INT32(buf);
diff --git a/sys/security/audit/audit.c b/sys/security/audit/audit.c
index 05928f1c33e8..7ec50d990d4e 100644
--- a/sys/security/audit/audit.c
+++ b/sys/security/audit/audit.c
@@ -279,7 +279,7 @@ audit_record_ctor(void *mem, int size, void *arg, int flags)
cru2x(cred, &ar->k_ar.ar_subj_cred);
ar->k_ar.ar_subj_ruid = cred->cr_ruid;
ar->k_ar.ar_subj_rgid = cred->cr_rgid;
- ar->k_ar.ar_subj_egid = cred->cr_groups[0];
+ ar->k_ar.ar_subj_egid = cred->cr_gid;
ar->k_ar.ar_subj_auid = cred->cr_audit.ai_auid;
ar->k_ar.ar_subj_asid = cred->cr_audit.ai_asid;
ar->k_ar.ar_subj_pid = td->td_proc->p_pid;
diff --git a/sys/security/audit/audit_arg.c b/sys/security/audit/audit_arg.c
index c667d3968817..3ea645373dbe 100644
--- a/sys/security/audit/audit_arg.c
+++ b/sys/security/audit/audit_arg.c
@@ -408,7 +408,7 @@ audit_arg_process(struct proc *p)
cred = p->p_ucred;
ar->k_ar.ar_arg_auid = cred->cr_audit.ai_auid;
ar->k_ar.ar_arg_euid = cred->cr_uid;
- ar->k_ar.ar_arg_egid = cred->cr_groups[0];
+ ar->k_ar.ar_arg_egid = cred->cr_gid;
ar->k_ar.ar_arg_ruid = cred->cr_ruid;
ar->k_ar.ar_arg_rgid = cred->cr_rgid;
ar->k_ar.ar_arg_asid = cred->cr_audit.ai_asid;
diff --git a/sys/sys/compressor.h b/sys/sys/compressor.h
index cad9080b46ff..e59eeabec2cd 100644
--- a/sys/sys/compressor.h
+++ b/sys/sys/compressor.h
@@ -42,6 +42,7 @@ struct compressor;
bool compressor_avail(int format);
struct compressor *compressor_init(compressor_cb_t cb, int format,
size_t maxiosize, int level, void *arg);
+int compressor_format(const struct compressor *stream);
void compressor_reset(struct compressor *stream);
int compressor_write(struct compressor *stream, void *data,
size_t len);
diff --git a/sys/sys/domainset.h b/sys/sys/domainset.h
index f98b175e9bc8..f3dc92ec6383 100644
--- a/sys/sys/domainset.h
+++ b/sys/sys/domainset.h
@@ -113,6 +113,20 @@ void domainset_zero(void);
* returned value will not match the key pointer.
*/
struct domainset *domainset_create(const struct domainset *);
+
+/*
+ * Remove empty domains from a given domainset.
+ * Returns 'false' if the domainset consists entirely of empty domains.
+ */
+bool domainset_empty_vm(struct domainset *domain);
+
+/*
+ * Validate and populate a domainset structure according to the specified
+ * policy and mask.
+ */
+int domainset_populate(struct domainset *domain, const domainset_t *mask, int policy,
+ size_t mask_size);
+
#ifdef _SYS_SYSCTL_H_
int sysctl_handle_domainset(SYSCTL_HANDLER_ARGS);
#endif
diff --git a/sys/sys/efi.h b/sys/sys/efi.h
index 95a433a950db..89c8b15519de 100644
--- a/sys/sys/efi.h
+++ b/sys/sys/efi.h
@@ -42,6 +42,8 @@
{0xb122a263,0x3661,0x4f68,{0x99,0x29,0x78,0xf8,0xb0,0xd6,0x21,0x80}}
#define EFI_PROPERTIES_TABLE \
{0x880aaca3,0x4adc,0x4a04,{0x90,0x79,0xb7,0x47,0x34,0x08,0x25,0xe5}}
+#define EFI_MEMORY_ATTRIBUTES_TABLE \
+ {0xdcfa911d,0x26eb,0x469f,{0xa2,0x20,0x38,0xb7,0xdc,0x46,0x12,0x20}}
#define LINUX_EFI_MEMRESERVE_TABLE \
{0x888eb0c6,0x8ede,0x4ff5,{0xa8,0xf0,0x9a,0xee,0x5c,0xb9,0x77,0xc2}}
@@ -166,6 +168,22 @@ struct efi_prop_table {
uint64_t memory_protection_attribute;
};
+struct efi_memory_descriptor {
+ uint32_t type;
+ caddr_t phy_addr;
+ caddr_t virt_addr;
+ uint64_t pages;
+ uint64_t attrs;
+};
+
+struct efi_memory_attribute_table {
+ uint32_t version;
+ uint32_t num_ents;
+ uint32_t descriptor_size;
+ uint32_t flags;
+ struct efi_memory_descriptor tables[];
+};
+
#ifdef _KERNEL
#ifdef EFIABI_ATTR
diff --git a/sys/sys/elf_common.h b/sys/sys/elf_common.h
index 87460aae2dd4..efda38279848 100644
--- a/sys/sys/elf_common.h
+++ b/sys/sys/elf_common.h
@@ -306,7 +306,7 @@ typedef struct {
and MPRC of Peking University */
#define EM_AARCH64 183 /* AArch64 (64-bit ARM) */
#define EM_RISCV 243 /* RISC-V */
-#define EM_LOONGARCH 258 /* Loongson LoongArch */
+#define EM_LOONGARCH 258 /* Loongson LoongArch */
/* Non-standard or deprecated. */
#define EM_486 6 /* Intel i486. */
@@ -392,15 +392,15 @@ typedef struct {
*/
/* LoongArch Base ABI Modifiers */
-#define EF_LOONGARCH_ABI_SOFT_FLOAT 0x00000001
-#define EF_LOONGARCH_ABI_SINGLE_FLOAT 0x00000002
-#define EF_LOONGARCH_ABI_DOUBLE_FLOAT 0x00000003
-#define EF_LOONGARCH_ABI_MODIFIER_MASK 0x00000007
+#define EF_LOONGARCH_ABI_SOFT_FLOAT 0x00000001
+#define EF_LOONGARCH_ABI_SINGLE_FLOAT 0x00000002
+#define EF_LOONGARCH_ABI_DOUBLE_FLOAT 0x00000003
+#define EF_LOONGARCH_ABI_MODIFIER_MASK 0x00000007
/* LoongArch Object file ABI versions */
-#define EF_LOONGARCH_OBJABI_V0 0x00000000
-#define EF_LOONGARCH_OBJABI_V1 0x00000040
-#define EF_LOONGARCH_OBJABI_MASK 0x000000C0
+#define EF_LOONGARCH_OBJABI_V0 0x00000000
+#define EF_LOONGARCH_OBJABI_V1 0x00000040
+#define EF_LOONGARCH_OBJABI_MASK 0x000000C0
#define EF_SPARC_EXT_MASK 0x00ffff00
#define EF_SPARC_32PLUS 0x00000100
@@ -470,12 +470,12 @@ typedef struct {
#define SHT_HIOS 0x6fffffff /* Last of OS specific semantics */
#define SHT_LOPROC 0x70000000 /* reserved range for processor */
#define SHT_X86_64_UNWIND 0x70000001 /* unwind information */
-#define SHT_AMD64_UNWIND SHT_X86_64_UNWIND
+#define SHT_AMD64_UNWIND SHT_X86_64_UNWIND
#define SHT_ARM_EXIDX 0x70000001 /* Exception index table. */
-#define SHT_ARM_PREEMPTMAP 0x70000002 /* BPABI DLL dynamic linking
+#define SHT_ARM_PREEMPTMAP 0x70000002 /* BPABI DLL dynamic linking
pre-emption map. */
-#define SHT_ARM_ATTRIBUTES 0x70000003 /* Object file compatibility
+#define SHT_ARM_ATTRIBUTES 0x70000003 /* Object file compatibility
attributes. */
#define SHT_ARM_DEBUGOVERLAY 0x70000004 /* See DBGOVL for details. */
#define SHT_ARM_OVERLAYSECTION 0x70000005 /* See DBGOVL for details. */
@@ -791,7 +791,7 @@ typedef struct {
#define DF_1_NODELETE 0x00000008 /* Set the RTLD_NODELETE for object */
#define DF_1_LOADFLTR 0x00000010 /* Immediate loading of filtees */
#define DF_1_INITFIRST 0x00000020 /* Initialize DSO first at runtime */
-#define DF_1_NOOPEN 0x00000040 /* Do not allow loading on dlopen() */
+#define DF_1_NOOPEN 0x00000040 /* Do not allow loading on dlopen() */
#define DF_1_ORIGIN 0x00000080 /* Process $ORIGIN */
#define DF_1_INTERPOSE 0x00000400 /* Interpose all objects but main */
#define DF_1_NODEFLIB 0x00000800 /* Do not search default paths */
@@ -908,7 +908,7 @@ typedef struct {
#define STV_ELIMINATE 0x6
/* Architecture specific data - st_other */
-#define STO_AARCH64_VARIANT_PCS 0x80
+#define STO_AARCH64_VARIANT_PCS 0x80
/* Special symbol table indexes. */
#define STN_UNDEF 0 /* Undefined symbol index. */
@@ -1084,11 +1084,11 @@ typedef struct {
#define R_AARCH64_COPY 1024 /* Copy data from shared object */
#define R_AARCH64_GLOB_DAT 1025 /* Set GOT entry to data address */
#define R_AARCH64_JUMP_SLOT 1026 /* Set GOT entry to code address */
-#define R_AARCH64_RELATIVE 1027 /* Add load address of shared object */
+#define R_AARCH64_RELATIVE 1027 /* Add load address of shared object */
#define R_AARCH64_TLS_DTPREL64 1028
#define R_AARCH64_TLS_DTPMOD64 1029
-#define R_AARCH64_TLS_TPREL64 1030
-#define R_AARCH64_TLSDESC 1031 /* Identify the TLS descriptor */
+#define R_AARCH64_TLS_TPREL64 1030
+#define R_AARCH64_TLSDESC 1031 /* Identify the TLS descriptor */
#define R_AARCH64_IRELATIVE 1032
#define R_ARM_NONE 0 /* No relocation. */
@@ -1231,8 +1231,8 @@ typedef struct {
#define R_MIPS_GOT_HI16 22 /* GOT HI 16 bit */
#define R_MIPS_GOT_LO16 23 /* GOT LO 16 bit */
#define R_MIPS_SUB 24
-#define R_MIPS_CALLHI16 30 /* upper 16 bit GOT entry for function */
-#define R_MIPS_CALLLO16 31 /* lower 16 bit GOT entry for function */
+#define R_MIPS_CALLHI16 30 /* upper 16 bit GOT entry for function */
+#define R_MIPS_CALLLO16 31 /* lower 16 bit GOT entry for function */
#define R_MIPS_JALR 37
#define R_MIPS_TLS_GD 42
#define R_MIPS_COPY 126
@@ -1352,7 +1352,6 @@ typedef struct {
* RISC-V relocation types.
*/
-/* Relocation types used by the dynamic linker. */
#define R_RISCV_NONE 0
#define R_RISCV_32 1
#define R_RISCV_64 2
@@ -1365,8 +1364,7 @@ typedef struct {
#define R_RISCV_TLS_DTPREL64 9
#define R_RISCV_TLS_TPREL32 10
#define R_RISCV_TLS_TPREL64 11
-
-/* Relocation types not used by the dynamic linker. */
+#define R_RISCV_TLSDESC 12
#define R_RISCV_BRANCH 16
#define R_RISCV_JAL 17
#define R_RISCV_CALL 18
@@ -1392,10 +1390,10 @@ typedef struct {
#define R_RISCV_SUB16 38
#define R_RISCV_SUB32 39
#define R_RISCV_SUB64 40
+#define R_RISCV_GOT32_PCREL 41
#define R_RISCV_ALIGN 43
#define R_RISCV_RVC_BRANCH 44
#define R_RISCV_RVC_JUMP 45
-#define R_RISCV_RVC_LUI 46
#define R_RISCV_RELAX 51
#define R_RISCV_SUB6 52
#define R_RISCV_SET6 53
@@ -1404,6 +1402,14 @@ typedef struct {
#define R_RISCV_SET32 56
#define R_RISCV_32_PCREL 57
#define R_RISCV_IRELATIVE 58
+#define R_RISCV_PLT32 59
+#define R_RISCV_SET_ULEB128 60
+#define R_RISCV_SUB_ULEB128 61
+#define R_RISCV_TLSDESC_HI20 62
+#define R_RISCV_TLSDESC_LOAD_LO12 63
+#define R_RISCV_TLSDESC_ADD_LO12 64
+#define R_RISCV_TLSDESC_CALL 65
+#define R_RISCV_VENDOR 191
/*
* Loongson LoongArch relocation types.
@@ -1413,101 +1419,101 @@ typedef struct {
*/
/* Relocation types used by the dynamic linker */
-#define R_LARCH_NONE 0
-#define R_LARCH_32 1
-#define R_LARCH_64 2
-#define R_LARCH_RELATIVE 3
-#define R_LARCH_COPY 4
-#define R_LARCH_JUMP_SLOT 5
-#define R_LARCH_TLS_DTPMOD32 6
-#define R_LARCH_TLS_DTPMOD64 7
-#define R_LARCH_TLS_DTPREL32 8
-#define R_LARCH_TLS_DTPREL64 9
-#define R_LARCH_TLS_TPREL32 10
-#define R_LARCH_TLS_TPREL64 11
-#define R_LARCH_IRELATIVE 12
-#define R_LARCH_MARK_LA 20
-#define R_LARCH_MARK_PCREL 21
-#define R_LARCH_SOP_PUSH_PCREL 22
-#define R_LARCH_SOP_PUSH_ABSOLUTE 23
-#define R_LARCH_SOP_PUSH_DUP 24
-#define R_LARCH_SOP_PUSH_GPREL 25
-#define R_LARCH_SOP_PUSH_TLS_TPREL 26
-#define R_LARCH_SOP_PUSH_TLS_GOT 27
-#define R_LARCH_SOP_PUSH_TLS_GD 28
-#define R_LARCH_SOP_PUSH_PLT_PCREL 29
-#define R_LARCH_SOP_ASSERT 30
-#define R_LARCH_SOP_NOT 31
-#define R_LARCH_SOP_SUB 32
-#define R_LARCH_SOP_SL 33
-#define R_LARCH_SOP_SR 34
-#define R_LARCH_SOP_ADD 35
-#define R_LARCH_SOP_AND 36
-#define R_LARCH_SOP_IF_ELSE 37
-#define R_LARCH_SOP_POP_32_S_10_5 38
-#define R_LARCH_SOP_POP_32_U_10_12 39
-#define R_LARCH_SOP_POP_32_S_10_12 40
-#define R_LARCH_SOP_POP_32_S_10_16 41
-#define R_LARCH_SOP_POP_32_S_10_16_S2 42
-#define R_LARCH_SOP_POP_32_S_5_20 43
-#define R_LARCH_SOP_POP_32_S_0_5_10_16_S2 44
-#define R_LARCH_SOP_POP_32_S_0_10_10_16_S2 45
-#define R_LARCH_SOP_POP_32_U 46
-#define R_LARCH_ADD8 47
-#define R_LARCH_ADD16 48
-#define R_LARCH_ADD24 49
-#define R_LARCH_ADD32 50
-#define R_LARCH_ADD64 51
-#define R_LARCH_SUB8 52
-#define R_LARCH_SUB16 53
-#define R_LARCH_SUB24 54
-#define R_LARCH_SUB32 55
-#define R_LARCH_SUB64 56
-#define R_LARCH_GNU_VTINHERIT 57
-#define R_LARCH_GNU_VTENTRY 58
+#define R_LARCH_NONE 0
+#define R_LARCH_32 1
+#define R_LARCH_64 2
+#define R_LARCH_RELATIVE 3
+#define R_LARCH_COPY 4
+#define R_LARCH_JUMP_SLOT 5
+#define R_LARCH_TLS_DTPMOD32 6
+#define R_LARCH_TLS_DTPMOD64 7
+#define R_LARCH_TLS_DTPREL32 8
+#define R_LARCH_TLS_DTPREL64 9
+#define R_LARCH_TLS_TPREL32 10
+#define R_LARCH_TLS_TPREL64 11
+#define R_LARCH_IRELATIVE 12
+#define R_LARCH_MARK_LA 20
+#define R_LARCH_MARK_PCREL 21
+#define R_LARCH_SOP_PUSH_PCREL 22
+#define R_LARCH_SOP_PUSH_ABSOLUTE 23
+#define R_LARCH_SOP_PUSH_DUP 24
+#define R_LARCH_SOP_PUSH_GPREL 25
+#define R_LARCH_SOP_PUSH_TLS_TPREL 26
+#define R_LARCH_SOP_PUSH_TLS_GOT 27
+#define R_LARCH_SOP_PUSH_TLS_GD 28
+#define R_LARCH_SOP_PUSH_PLT_PCREL 29
+#define R_LARCH_SOP_ASSERT 30
+#define R_LARCH_SOP_NOT 31
+#define R_LARCH_SOP_SUB 32
+#define R_LARCH_SOP_SL 33
+#define R_LARCH_SOP_SR 34
+#define R_LARCH_SOP_ADD 35
+#define R_LARCH_SOP_AND 36
+#define R_LARCH_SOP_IF_ELSE 37
+#define R_LARCH_SOP_POP_32_S_10_5 38
+#define R_LARCH_SOP_POP_32_U_10_12 39
+#define R_LARCH_SOP_POP_32_S_10_12 40
+#define R_LARCH_SOP_POP_32_S_10_16 41
+#define R_LARCH_SOP_POP_32_S_10_16_S2 42
+#define R_LARCH_SOP_POP_32_S_5_20 43
+#define R_LARCH_SOP_POP_32_S_0_5_10_16_S2 44
+#define R_LARCH_SOP_POP_32_S_0_10_10_16_S2 45
+#define R_LARCH_SOP_POP_32_U 46
+#define R_LARCH_ADD8 47
+#define R_LARCH_ADD16 48
+#define R_LARCH_ADD24 49
+#define R_LARCH_ADD32 50
+#define R_LARCH_ADD64 51
+#define R_LARCH_SUB8 52
+#define R_LARCH_SUB16 53
+#define R_LARCH_SUB24 54
+#define R_LARCH_SUB32 55
+#define R_LARCH_SUB64 56
+#define R_LARCH_GNU_VTINHERIT 57
+#define R_LARCH_GNU_VTENTRY 58
/*
* Relocs whose processing do not require a stack machine.
*
* Spec addition: https://github.com/loongson/LoongArch-Documentation/pull/57
*/
-#define R_LARCH_B16 64
-#define R_LARCH_B21 65
-#define R_LARCH_B26 66
-#define R_LARCH_ABS_HI20 67
-#define R_LARCH_ABS_LO12 68
-#define R_LARCH_ABS64_LO20 69
-#define R_LARCH_ABS64_HI12 70
-#define R_LARCH_PCALA_HI20 71
-#define R_LARCH_PCALA_LO12 72
-#define R_LARCH_PCALA64_LO20 73
-#define R_LARCH_PCALA64_HI12 74
-#define R_LARCH_GOT_PC_HI20 75
-#define R_LARCH_GOT_PC_LO12 76
-#define R_LARCH_GOT64_PC_LO20 77
-#define R_LARCH_GOT64_PC_HI12 78
-#define R_LARCH_GOT_HI20 79
-#define R_LARCH_GOT_LO12 80
-#define R_LARCH_GOT64_LO20 81
-#define R_LARCH_GOT64_HI12 82
-#define R_LARCH_TLS_LE_HI20 83
-#define R_LARCH_TLS_LE_LO12 84
-#define R_LARCH_TLS_LE64_LO20 85
-#define R_LARCH_TLS_LE64_HI12 86
-#define R_LARCH_TLS_IE_PC_HI20 87
-#define R_LARCH_TLS_IE_PC_LO12 88
-#define R_LARCH_TLS_IE64_PC_LO20 89
-#define R_LARCH_TLS_IE64_PC_HI12 90
-#define R_LARCH_TLS_IE_HI20 91
-#define R_LARCH_TLS_IE_LO12 92
-#define R_LARCH_TLS_IE64_LO20 93
-#define R_LARCH_TLS_IE64_HI12 94
-#define R_LARCH_TLS_LD_PC_HI20 95
-#define R_LARCH_TLS_LD_HI20 96
-#define R_LARCH_TLS_GD_PC_HI20 97
-#define R_LARCH_TLS_GD_HI20 98
-#define R_LARCH_32_PCREL 99
-#define R_LARCH_RELAX 100
+#define R_LARCH_B16 64
+#define R_LARCH_B21 65
+#define R_LARCH_B26 66
+#define R_LARCH_ABS_HI20 67
+#define R_LARCH_ABS_LO12 68
+#define R_LARCH_ABS64_LO20 69
+#define R_LARCH_ABS64_HI12 70
+#define R_LARCH_PCALA_HI20 71
+#define R_LARCH_PCALA_LO12 72
+#define R_LARCH_PCALA64_LO20 73
+#define R_LARCH_PCALA64_HI12 74
+#define R_LARCH_GOT_PC_HI20 75
+#define R_LARCH_GOT_PC_LO12 76
+#define R_LARCH_GOT64_PC_LO20 77
+#define R_LARCH_GOT64_PC_HI12 78
+#define R_LARCH_GOT_HI20 79
+#define R_LARCH_GOT_LO12 80
+#define R_LARCH_GOT64_LO20 81
+#define R_LARCH_GOT64_HI12 82
+#define R_LARCH_TLS_LE_HI20 83
+#define R_LARCH_TLS_LE_LO12 84
+#define R_LARCH_TLS_LE64_LO20 85
+#define R_LARCH_TLS_LE64_HI12 86
+#define R_LARCH_TLS_IE_PC_HI20 87
+#define R_LARCH_TLS_IE_PC_LO12 88
+#define R_LARCH_TLS_IE64_PC_LO20 89
+#define R_LARCH_TLS_IE64_PC_HI12 90
+#define R_LARCH_TLS_IE_HI20 91
+#define R_LARCH_TLS_IE_LO12 92
+#define R_LARCH_TLS_IE64_LO20 93
+#define R_LARCH_TLS_IE64_HI12 94
+#define R_LARCH_TLS_LD_PC_HI20 95
+#define R_LARCH_TLS_LD_HI20 96
+#define R_LARCH_TLS_GD_PC_HI20 97
+#define R_LARCH_TLS_GD_HI20 98
+#define R_LARCH_32_PCREL 99
+#define R_LARCH_RELAX 100
/*
* Relocs added in ELF for the LoongArchâ„¢ Architecture v20230519, part of the
@@ -1520,13 +1526,13 @@ typedef struct {
* in psABI v2.20 because they were proved not necessary to be exposed outside
* of the linker.
*/
-#define R_LARCH_ALIGN 102
-#define R_LARCH_PCREL20_S2 103
-#define R_LARCH_ADD6 105
-#define R_LARCH_SUB6 106
-#define R_LARCH_ADD_ULEB128 107
-#define R_LARCH_SUB_ULEB128 108
-#define R_LARCH_64_PCREL 109
+#define R_LARCH_ALIGN 102
+#define R_LARCH_PCREL20_S2 103
+#define R_LARCH_ADD6 105
+#define R_LARCH_SUB6 106
+#define R_LARCH_ADD_ULEB128 107
+#define R_LARCH_SUB_ULEB128 108
+#define R_LARCH_64_PCREL 109
/*
* Relocs added in ELF for the LoongArchâ„¢ Architecture v20231102, part of the
@@ -1534,7 +1540,7 @@ typedef struct {
*
* Spec addition: https://github.com/loongson/la-abi-specs/pull/4
*/
-#define R_LARCH_CALL36 110
+#define R_LARCH_CALL36 110
/*
* Relocs added in ELF for the LoongArchâ„¢ Architecture v20231219, part of the
@@ -1542,24 +1548,24 @@ typedef struct {
*
* Spec addition: https://github.com/loongson/la-abi-specs/pull/5
*/
-#define R_LARCH_TLS_DESC32 13
-#define R_LARCH_TLS_DESC64 14
-#define R_LARCH_TLS_DESC_PC_HI20 111
-#define R_LARCH_TLS_DESC_PC_LO12 112
-#define R_LARCH_TLS_DESC64_PC_LO20 113
-#define R_LARCH_TLS_DESC64_PC_HI12 114
-#define R_LARCH_TLS_DESC_HI20 115
-#define R_LARCH_TLS_DESC_LO12 116
-#define R_LARCH_TLS_DESC64_LO20 117
-#define R_LARCH_TLS_DESC64_HI12 118
-#define R_LARCH_TLS_DESC_LD 119
-#define R_LARCH_TLS_DESC_CALL 120
-#define R_LARCH_TLS_LE_HI20_R 121
-#define R_LARCH_TLS_LE_ADD_R 122
-#define R_LARCH_TLS_LE_LO12_R 123
-#define R_LARCH_TLS_LD_PCREL20_S2 124
-#define R_LARCH_TLS_GD_PCREL20_S2 125
-#define R_LARCH_TLS_DESC_PCREL20_S2 126
+#define R_LARCH_TLS_DESC32 13
+#define R_LARCH_TLS_DESC64 14
+#define R_LARCH_TLS_DESC_PC_HI20 111
+#define R_LARCH_TLS_DESC_PC_LO12 112
+#define R_LARCH_TLS_DESC64_PC_LO20 113
+#define R_LARCH_TLS_DESC64_PC_HI12 114
+#define R_LARCH_TLS_DESC_HI20 115
+#define R_LARCH_TLS_DESC_LO12 116
+#define R_LARCH_TLS_DESC64_LO20 117
+#define R_LARCH_TLS_DESC64_HI12 118
+#define R_LARCH_TLS_DESC_LD 119
+#define R_LARCH_TLS_DESC_CALL 120
+#define R_LARCH_TLS_LE_HI20_R 121
+#define R_LARCH_TLS_LE_ADD_R 122
+#define R_LARCH_TLS_LE_LO12_R 123
+#define R_LARCH_TLS_LD_PCREL20_S2 124
+#define R_LARCH_TLS_GD_PCREL20_S2 125
+#define R_LARCH_TLS_DESC_PCREL20_S2 126
#define R_SPARC_NONE 0
#define R_SPARC_8 1
diff --git a/sys/sys/exec.h b/sys/sys/exec.h
index 4bf114a7c698..580a5372c4db 100644
--- a/sys/sys/exec.h
+++ b/sys/sys/exec.h
@@ -57,16 +57,6 @@ struct ps_strings {
unsigned int ps_nenvstr; /* the number of environment strings */
};
-/* Coredump output parameters. */
-struct coredump_params {
- off_t offset;
- struct ucred *active_cred;
- struct ucred *file_cred;
- struct thread *td;
- struct vnode *vp;
- struct compressor *comp;
-};
-
struct image_params;
struct execsw {
@@ -105,16 +95,6 @@ int exec_unregister(const struct execsw *);
enum uio_seg;
-#define CORE_BUF_SIZE (16 * 1024)
-
-int core_write(struct coredump_params *, const void *, size_t, off_t,
- enum uio_seg, size_t *);
-int core_output(char *, size_t, off_t, struct coredump_params *, void *);
-int sbuf_drain_core_output(void *, const char *, int);
-
-extern int coredump_pack_fileinfo;
-extern int coredump_pack_vmmapinfo;
-
/*
* note: name##_mod cannot be const storage because the
* linker_file_sysinit() function modifies _file in the
diff --git a/sys/sys/exterr_cat.h b/sys/sys/exterr_cat.h
index cab94ac511a5..a8e1f56e132e 100644
--- a/sys/sys/exterr_cat.h
+++ b/sys/sys/exterr_cat.h
@@ -18,6 +18,7 @@
#define EXTERR_CAT_FUSE 4
#define EXTERR_CAT_INOTIFY 5
#define EXTERR_CAT_GENIO 6
+#define EXTERR_CAT_BRIDGE 7
#endif
diff --git a/sys/sys/exterrvar.h b/sys/sys/exterrvar.h
index 15557c614f88..7bf1d264ff5e 100644
--- a/sys/sys/exterrvar.h
+++ b/sys/sys/exterrvar.h
@@ -21,6 +21,7 @@
#define EXTERRCTL_ENABLE 1
#define EXTERRCTL_DISABLE 2
+#define EXTERRCTL_UD 3
#define EXTERRCTLF_FORCE 0x00000001
diff --git a/sys/sys/imgact_elf.h b/sys/sys/imgact_elf.h
index c9444e5aec41..2845a9dbc1e2 100644
--- a/sys/sys/imgact_elf.h
+++ b/sys/sys/imgact_elf.h
@@ -45,6 +45,7 @@
{(pos)->a_type = (id); (pos)->a_un.a_ptr = (ptr); (pos)++;}
#endif
+struct coredump_writer;
struct image_params;
struct thread;
struct vnode;
@@ -114,7 +115,7 @@ bool __elfN(brand_inuse)(Elf_Brandinfo *entry);
int __elfN(insert_brand_entry)(Elf_Brandinfo *entry);
int __elfN(remove_brand_entry)(Elf_Brandinfo *entry);
int __elfN(freebsd_fixup)(uintptr_t *, struct image_params *);
-int __elfN(coredump)(struct thread *, struct vnode *, off_t, int);
+int __elfN(coredump)(struct thread *, struct coredump_writer *, off_t, int);
size_t __elfN(populate_note)(int, void *, void *, size_t, void **);
int __elfN(freebsd_copyout_auxargs)(struct image_params *, uintptr_t);
void __elfN(puthdr)(struct thread *, void *, size_t, int, size_t, int);
diff --git a/sys/sys/inotify.h b/sys/sys/inotify.h
index 65dc5dba43f3..d1f23d5898bb 100644
--- a/sys/sys/inotify.h
+++ b/sys/sys/inotify.h
@@ -107,11 +107,18 @@ void vn_inotify_revoke(struct vnode *);
} while (0)
/* Log an inotify event using a specific name for the vnode. */
-#define INOTIFY_NAME(vp, dvp, cnp, ev) do { \
+#define INOTIFY_NAME_LOCK(vp, dvp, cnp, ev, lock) do { \
if (__predict_false((vn_irflag_read(vp) & VIRF_INOTIFY) != 0 || \
- (vn_irflag_read(dvp) & VIRF_INOTIFY) != 0)) \
+ (vn_irflag_read(dvp) & VIRF_INOTIFY) != 0)) { \
+ if (lock) \
+ vn_lock((vp), LK_SHARED | LK_RETRY); \
VOP_INOTIFY((vp), (dvp), (cnp), (ev), 0); \
+ if (lock) \
+ VOP_UNLOCK(vp); \
+ } \
} while (0)
+#define INOTIFY_NAME(vp, dvp, cnp, ev) \
+ INOTIFY_NAME_LOCK((vp), (dvp), (cnp), (ev), false)
extern __uint32_t inotify_rename_cookie;
@@ -126,7 +133,8 @@ extern __uint32_t inotify_rename_cookie;
VOP_INOTIFY((vp), (tdvp), (tcnp), IN_MOVED_TO, cookie); \
} \
if ((tvp) != NULL) \
- INOTIFY_NAME((tvp), (tdvp), (tcnp), _IN_MOVE_DELETE); \
+ INOTIFY_NAME_LOCK((tvp), (tdvp), (tcnp), \
+ _IN_MOVE_DELETE, true); \
} while (0)
#define INOTIFY_REVOKE(vp) do { \
diff --git a/sys/sys/jail.h b/sys/sys/jail.h
index 08caa9f49270..24c420e2c976 100644
--- a/sys/sys/jail.h
+++ b/sys/sys/jail.h
@@ -435,7 +435,7 @@ void prison0_init(void);
bool prison_allow(struct ucred *, unsigned);
int prison_check(struct ucred *cred1, struct ucred *cred2);
bool prison_check_nfsd(struct ucred *cred);
-bool prison_owns_vnet(struct ucred *);
+bool prison_owns_vnet(struct prison *pr);
int prison_canseemount(struct ucred *cred, struct mount *mp);
void prison_enforce_statfs(struct ucred *cred, struct mount *mp,
struct statfs *sp);
diff --git a/sys/sys/mbuf.h b/sys/sys/mbuf.h
index c75094aea450..304bd019c9fc 100644
--- a/sys/sys/mbuf.h
+++ b/sys/sys/mbuf.h
@@ -1391,6 +1391,7 @@ extern bool mb_use_ext_pgs; /* Use ext_pgs for sendfile */
#define PACKET_TAG_PF_REASSEMBLED 31
#define PACKET_TAG_IPSEC_ACCEL_OUT 32 /* IPSEC accel out */
#define PACKET_TAG_IPSEC_ACCEL_IN 33 /* IPSEC accel in */
+#define PACKET_TAG_OVPN 34 /* if_ovpn */
/* Specific cookies and tags. */
diff --git a/sys/sys/param.h b/sys/sys/param.h
index af116d6e3f7a..33d61e8a1619 100644
--- a/sys/sys/param.h
+++ b/sys/sys/param.h
@@ -74,7 +74,7 @@
* cannot include sys/param.h and should only be updated here.
*/
#undef __FreeBSD_version
-#define __FreeBSD_version 1500051
+#define __FreeBSD_version 1500055
/*
* __FreeBSD_kernel__ indicates that this system uses the kernel of FreeBSD,
diff --git a/sys/sys/random.h b/sys/sys/random.h
index 254ba9451d0a..5abf762cd200 100644
--- a/sys/sys/random.h
+++ b/sys/sys/random.h
@@ -85,7 +85,8 @@ enum random_entropy_source {
RANDOM_FS_ATIME,
RANDOM_UMA, /* Special!! UMA/SLAB Allocator */
RANDOM_CALLOUT,
- RANDOM_ENVIRONMENTAL_END = RANDOM_CALLOUT,
+ RANDOM_RANDOMDEV,
+ RANDOM_ENVIRONMENTAL_END = RANDOM_RANDOMDEV,
/* Fast hardware random-number sources from here on. */
RANDOM_PURE_START,
RANDOM_PURE_OCTEON = RANDOM_PURE_START,
diff --git a/sys/sys/signalvar.h b/sys/sys/signalvar.h
index 23e8426b26ee..8f181b7beee6 100644
--- a/sys/sys/signalvar.h
+++ b/sys/sys/signalvar.h
@@ -403,6 +403,7 @@ int sigev_findtd(struct proc *p, struct sigevent *sigev, struct thread **);
void sigfastblock_clear(struct thread *td);
void sigfastblock_fetch(struct thread *td);
int sig_intr(void);
+bool sig_do_core(int);
void siginit(struct proc *p);
void signotify(struct thread *td);
void sigqueue_delete(struct sigqueue *queue, int sig);
diff --git a/sys/sys/sockbuf.h b/sys/sys/sockbuf.h
index 7f6234ade6f4..b4593f38f592 100644
--- a/sys/sys/sockbuf.h
+++ b/sys/sys/sockbuf.h
@@ -40,7 +40,7 @@
#define SB_SEL 0x08 /* someone is selecting */
#define SB_ASYNC 0x10 /* ASYNC I/O, need signals */
#define SB_UPCALL 0x20 /* someone wants an upcall */
-/* was SB_NOINTR 0x40 */
+#define SB_AUTOLOWAT 0x40 /* sendfile(2) may autotune sb_lowat */
#define SB_AIO 0x80 /* AIO operations queued */
#define SB_KNOTE 0x100 /* kernel note attached */
#define SB_NOCOALESCE 0x200 /* don't coalesce new data into existing mbufs */
@@ -210,8 +210,6 @@ typedef enum { SO_RCV, SO_SND } sb_which;
* Socket buffer private mbuf(9) flags.
*/
#define M_NOTREADY M_PROTO1 /* m_data not populated yet */
-#define M_BLOCKED M_PROTO2 /* M_NOTREADY in front of m */
-#define M_NOTAVAIL (M_NOTREADY | M_BLOCKED)
void sbappend(struct sockbuf *sb, struct mbuf *m, int flags);
void sbappend_locked(struct sockbuf *sb, struct mbuf *m, int flags);
diff --git a/sys/sys/syscallsubr.h b/sys/sys/syscallsubr.h
index fd183ffbc7a4..8237165b84ce 100644
--- a/sys/sys/syscallsubr.h
+++ b/sys/sys/syscallsubr.h
@@ -60,6 +60,7 @@ struct rusage;
struct sched_param;
struct sembuf;
union semun;
+struct shmfd;
struct sockaddr;
struct spacectl_range;
struct stat;
@@ -337,7 +338,7 @@ int kern_shm_open(struct thread *td, const char *userpath, int flags,
mode_t mode, struct filecaps *fcaps);
int kern_shm_open2(struct thread *td, const char *path, int flags,
mode_t mode, int shmflags, struct filecaps *fcaps,
- const char *name);
+ const char *name, struct shmfd *shmfd);
int kern_shmat(struct thread *td, int shmid, const void *shmaddr,
int shmflg);
int kern_shmctl(struct thread *td, int shmid, int cmd, void *buf,
diff --git a/sys/sys/sysent.h b/sys/sys/sysent.h
index 4ddfc8516053..1714fa5a7416 100644
--- a/sys/sys/sysent.h
+++ b/sys/sys/sysent.h
@@ -90,6 +90,7 @@ struct sysent { /* system call table */
#define SY_THR_STATIC_KLD SY_THR_STATIC
#endif
+struct coredump_writer;
struct image_params;
struct proc;
struct __sigset;
@@ -108,7 +109,8 @@ struct sysentvec {
int *sv_szsigcode; /* size of sigtramp code */
int sv_sigcodeoff;
char *sv_name; /* name of binary type */
- int (*sv_coredump)(struct thread *, struct vnode *, off_t, int);
+ int (*sv_coredump)(struct thread *, struct coredump_writer *,
+ off_t, int);
/* function to dump core, or NULL */
int sv_elf_core_osabi;
const char *sv_elf_core_abi_vendor;
diff --git a/sys/sys/ucoredump.h b/sys/sys/ucoredump.h
new file mode 100644
index 000000000000..0a51ee7f50c8
--- /dev/null
+++ b/sys/sys/ucoredump.h
@@ -0,0 +1,99 @@
+/*
+ *
+ * Copyright (c) 2015 Mark Johnston <markj@FreeBSD.org>
+ * Copyright (c) 2025 Kyle Evans <kevans@FreeBSD.org>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ */
+
+#ifndef _SYS_UCOREDUMP_H_
+#define _SYS_UCOREDUMP_H_
+
+#ifdef _KERNEL
+
+#include <sys/_uio.h>
+#include <sys/blockcount.h>
+#include <sys/queue.h>
+
+/* Coredump output parameters. */
+struct coredump_params;
+struct coredump_writer;
+struct thread;
+struct ucred;
+
+typedef int coredump_init_fn(const struct coredump_writer *,
+ const struct coredump_params *);
+typedef int coredump_write_fn(const struct coredump_writer *, const void *, size_t,
+ off_t, enum uio_seg, struct ucred *, size_t *, struct thread *);
+typedef int coredump_extend_fn(const struct coredump_writer *, off_t,
+ struct ucred *);
+
+struct coredump_vnode_ctx {
+ struct vnode *vp;
+ struct ucred *fcred;
+};
+
+coredump_write_fn core_vn_write;
+coredump_extend_fn core_vn_extend;
+
+struct coredump_writer {
+ void *ctx;
+ coredump_init_fn *init_fn;
+ coredump_write_fn *write_fn;
+ coredump_extend_fn *extend_fn;
+};
+
+struct coredump_params {
+ off_t offset;
+ struct ucred *active_cred;
+ struct thread *td;
+ const struct coredump_writer *cdw;
+ struct compressor *comp;
+};
+
+#define CORE_BUF_SIZE (16 * 1024)
+
+int core_write(struct coredump_params *, const void *, size_t, off_t,
+ enum uio_seg, size_t *);
+int core_output(char *, size_t, off_t, struct coredump_params *, void *);
+int sbuf_drain_core_output(void *, const char *, int);
+
+extern int coredump_pack_fileinfo;
+extern int coredump_pack_vmmapinfo;
+
+extern int compress_user_cores;
+extern int compress_user_cores_level;
+
+typedef int coredumper_probe_fn(struct thread *);
+
+/*
+ * Some arbitrary values for coredumper probes to return. The highest priority
+ * we can find wins. It's somewhat expected that a coredumper may want to bid
+ * differently based on the process in question. Note that probe functions will
+ * be called with the proc lock held, so they must not sleep.
+ */
+#define COREDUMPER_NOMATCH (-1) /* Decline to touch it */
+#define COREDUMPER_GENERIC (0) /* I handle coredumps */
+#define COREDUMPER_SPECIAL (50) /* Special handler */
+#define COREDUMPER_HIGH_PRIORITY (100) /* High-priority handler */
+
+/*
+ * The handle functions will be called with the proc lock held, and should
+ * return with the proc lock dropped.
+ */
+typedef int coredumper_handle_fn(struct thread *, off_t);
+
+struct coredumper {
+ SLIST_ENTRY(coredumper) cd_entry;
+ const char *cd_name;
+ coredumper_probe_fn *cd_probe;
+ coredumper_handle_fn *cd_handle;
+ blockcount_t cd_refcount;
+};
+
+void coredumper_register(struct coredumper *);
+void coredumper_unregister(struct coredumper *);
+
+#endif /* _KERNEL */
+#endif /* _SYS_UCOREDUMP_H_ */
diff --git a/sys/sys/unistd.h b/sys/sys/unistd.h
index c291c1dc2b95..85ed93fd359d 100644
--- a/sys/sys/unistd.h
+++ b/sys/sys/unistd.h
@@ -156,6 +156,8 @@
#define _PC_DEALLOC_PRESENT 65
#define _PC_NAMEDATTR_ENABLED 66
#define _PC_HAS_NAMEDATTR 67
+#define _PC_XATTR_ENABLED _PC_NAMEDATTR_ENABLED /* Solaris Compatible */
+#define _PC_XATTR_EXISTS _PC_HAS_NAMEDATTR /* Solaris Compatible */
#define _PC_HAS_HIDDENSYSTEM 68
#endif
diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h
index 2c6947103c94..a416fddcddc3 100644
--- a/sys/sys/vnode.h
+++ b/sys/sys/vnode.h
@@ -939,7 +939,6 @@ void vop_mknod_post(void *a, int rc);
void vop_open_post(void *a, int rc);
void vop_read_post(void *a, int rc);
void vop_read_pgcache_post(void *ap, int rc);
-void vop_readdir_post(void *a, int rc);
void vop_reclaim_post(void *a, int rc);
void vop_remove_pre(void *a);
void vop_remove_post(void *a, int rc);
@@ -1015,7 +1014,36 @@ void vop_rename_fail(struct vop_rename_args *ap);
_error; \
})
-#define VOP_WRITE_PRE(ap) \
+#ifdef INVARIANTS
+#define vop_readdir_pre_assert(ap) \
+ ssize_t nresid, oresid; \
+ \
+ oresid = (ap)->a_uio->uio_resid;
+
+#define vop_readdir_post_assert(ap, ret) \
+ nresid = (ap)->a_uio->uio_resid; \
+ if ((ret) == 0 && (ap)->a_eofflag != NULL) { \
+ VNASSERT(oresid == 0 || nresid != oresid || \
+ *(ap)->a_eofflag == 1, \
+ (ap)->a_vp, ("VOP_READDIR: eofflag not set")); \
+ }
+#else
+#define vop_readdir_pre_assert(ap)
+#define vop_readdir_post_assert(ap, ret)
+#endif
+
+#define vop_readdir_pre(ap) do { \
+ vop_readdir_pre_assert(ap)
+
+#define vop_readdir_post(ap, ret) \
+ vop_readdir_post_assert(ap, ret); \
+ if ((ret) == 0) { \
+ VFS_KNOTE_LOCKED((ap)->a_vp, NOTE_READ); \
+ INOTIFY((ap)->a_vp, IN_ACCESS); \
+ } \
+} while (0)
+
+#define vop_write_pre(ap) \
struct vattr va; \
int error; \
off_t osize, ooffset, noffset; \
@@ -1029,7 +1057,7 @@ void vop_rename_fail(struct vop_rename_args *ap);
osize = (off_t)va.va_size; \
}
-#define VOP_WRITE_POST(ap, ret) \
+#define vop_write_post(ap, ret) \
noffset = (ap)->a_uio->uio_offset; \
if (noffset > ooffset) { \
if (!VN_KNLIST_EMPTY((ap)->a_vp)) { \
diff --git a/sys/ufs/ufs/ufs_vnops.c b/sys/ufs/ufs/ufs_vnops.c
index 53fac4b0665e..2757fb066981 100644
--- a/sys/ufs/ufs/ufs_vnops.c
+++ b/sys/ufs/ufs/ufs_vnops.c
@@ -1268,7 +1268,8 @@ ufs_rename(
struct inode *fip, *tip, *tdp, *fdp;
struct direct newdir;
off_t endoff;
- int doingdirectory, newparent;
+ int doingdirectory;
+ u_int newparent;
int error = 0;
struct mount *mp;
ino_t ino;
@@ -1475,7 +1476,7 @@ relock:
* the user must have write permission in the source so
* as to be able to change "..".
*/
- if (doingdirectory && newparent) {
+ if (doingdirectory && newparent != 0) {
error = VOP_ACCESS(fvp, VWRITE, tcnp->cn_cred, curthread);
if (error)
goto unlockout;
@@ -1538,7 +1539,7 @@ relock:
if (tip == NULL) {
if (ITODEV(tdp) != ITODEV(fip))
panic("ufs_rename: EXDEV");
- if (doingdirectory && newparent) {
+ if (doingdirectory && newparent != 0) {
/*
* Account for ".." in new directory.
* When source and destination have the same
@@ -1631,7 +1632,7 @@ relock:
goto bad;
}
if (doingdirectory) {
- if (!newparent) {
+ if (newparent == 0) {
tdp->i_effnlink--;
if (DOINGSOFTDEP(tdvp))
softdep_change_linkcnt(tdp);
@@ -1641,11 +1642,10 @@ relock:
softdep_change_linkcnt(tip);
}
error = ufs_dirrewrite(tdp, tip, fip->i_number,
- IFTODT(fip->i_mode),
- (doingdirectory && newparent) ? newparent : doingdirectory);
+ IFTODT(fip->i_mode), doingdirectory);
if (error) {
if (doingdirectory) {
- if (!newparent) {
+ if (newparent == 0) {
tdp->i_effnlink++;
if (DOINGSOFTDEP(tdvp))
softdep_change_linkcnt(tdp);
@@ -1668,7 +1668,7 @@ relock:
* disk, so when running with that code we avoid doing
* them now.
*/
- if (!newparent) {
+ if (newparent == 0) {
tdp->i_nlink--;
DIP_SET_NLINK(tdp, tdp->i_nlink);
UFS_INODE_SET_FLAG(tdp, IN_CHANGE);
@@ -1697,7 +1697,7 @@ relock:
* parent directory must be decremented
* and ".." set to point to the new parent.
*/
- if (doingdirectory && newparent) {
+ if (doingdirectory && newparent != 0) {
/*
* Set the directory depth based on its new parent.
*/
@@ -2064,9 +2064,13 @@ ufs_mkdir(
*/
ucred.cr_ref = 1;
ucred.cr_uid = ip->i_uid;
+
+ /*
+ * XXXKE Fix this is cr_gid gets separated out
+ */
ucred.cr_ngroups = 1;
ucred.cr_groups = &ucred_group;
- ucred.cr_groups[0] = dp->i_gid;
+ ucred.cr_gid = ucred_group = dp->i_gid;
ucp = &ucred;
}
#endif
@@ -2823,9 +2827,13 @@ ufs_makeinode(int mode, struct vnode *dvp, struct vnode **vpp,
*/
ucred.cr_ref = 1;
ucred.cr_uid = ip->i_uid;
+
+ /*
+ * XXXKE Fix this is cr_gid gets separated out
+ */
ucred.cr_ngroups = 1;
ucred.cr_groups = &ucred_group;
- ucred.cr_groups[0] = pdir->i_gid;
+ ucred.cr_gid = ucred_group = pdir->i_gid;
ucp = &ucred;
#endif
} else {
diff --git a/sys/vm/swap_pager.c b/sys/vm/swap_pager.c
index 86b75a2d7989..d6bd06226d04 100644
--- a/sys/vm/swap_pager.c
+++ b/sys/vm/swap_pager.c
@@ -384,8 +384,8 @@ swap_release_by_cred(vm_ooffset_t decr, struct ucred *cred)
#endif
}
-static int swap_pager_full = 2; /* swap space exhaustion (task killing) */
-static int swap_pager_almost_full = 1; /* swap space exhaustion (w/hysteresis)*/
+static bool swap_pager_full = true; /* swap space exhaustion (task killing) */
+static bool swap_pager_almost_full = true; /* swap space exhaustion (w/hysteresis) */
static struct mtx swbuf_mtx; /* to sync nsw_wcount_async */
static int nsw_wcount_async; /* limit async write buffers */
static int nsw_wcount_async_max;/* assigned maximum */
@@ -642,14 +642,14 @@ swp_sizecheck(void)
{
if (swap_pager_avail < nswap_lowat) {
- if (swap_pager_almost_full == 0) {
+ if (!swap_pager_almost_full) {
printf("swap_pager: out of swap space\n");
- swap_pager_almost_full = 1;
+ swap_pager_almost_full = true;
}
} else {
- swap_pager_full = 0;
+ swap_pager_full = false;
if (swap_pager_avail > nswap_hiwat)
- swap_pager_almost_full = 0;
+ swap_pager_almost_full = false;
}
}
@@ -958,11 +958,10 @@ swp_pager_getswapspace(int *io_npages)
swp_sizecheck();
swdevhd = TAILQ_NEXT(sp, sw_list);
} else {
- if (swap_pager_full != 2) {
+ if (!swap_pager_full) {
printf("swp_pager_getswapspace(%d): failed\n",
*io_npages);
- swap_pager_full = 2;
- swap_pager_almost_full = 1;
+ swap_pager_full = swap_pager_almost_full = true;
}
swdevhd = NULL;
}
@@ -2863,10 +2862,8 @@ swapoff_one(struct swdevt *sp, struct ucred *cred, u_int flags)
sp->sw_id = NULL;
TAILQ_REMOVE(&swtailq, sp, sw_list);
nswapdev--;
- if (nswapdev == 0) {
- swap_pager_full = 2;
- swap_pager_almost_full = 1;
- }
+ if (nswapdev == 0)
+ swap_pager_full = swap_pager_almost_full = true;
if (swdevhd == sp)
swdevhd = NULL;
mtx_unlock(&sw_dev_mtx);
diff --git a/sys/vm/vm_domainset.c b/sys/vm/vm_domainset.c
index 7b8bf4c77663..b44bdb96b0d4 100644
--- a/sys/vm/vm_domainset.c
+++ b/sys/vm/vm_domainset.c
@@ -131,8 +131,7 @@ static void
vm_domainset_iter_next(struct vm_domainset_iter *di, int *domain)
{
- KASSERT(di->di_n > 0,
- ("vm_domainset_iter_first: Invalid n %d", di->di_n));
+ KASSERT(di->di_n > 0, ("%s: Invalid n %d", __func__, di->di_n));
switch (di->di_policy) {
case DOMAINSET_POLICY_FIRSTTOUCH:
/*
@@ -149,11 +148,10 @@ vm_domainset_iter_next(struct vm_domainset_iter *di, int *domain)
vm_domainset_iter_prefer(di, domain);
break;
default:
- panic("vm_domainset_iter_first: Unknown policy %d",
- di->di_policy);
+ panic("%s: Unknown policy %d", __func__, di->di_policy);
}
KASSERT(*domain < vm_ndomains,
- ("vm_domainset_iter_next: Invalid domain %d", *domain));
+ ("%s: Invalid domain %d", __func__, *domain));
}
static void
@@ -189,13 +187,11 @@ vm_domainset_iter_first(struct vm_domainset_iter *di, int *domain)
di->di_n = di->di_domain->ds_cnt;
break;
default:
- panic("vm_domainset_iter_first: Unknown policy %d",
- di->di_policy);
+ panic("%s: Unknown policy %d", __func__, di->di_policy);
}
- KASSERT(di->di_n > 0,
- ("vm_domainset_iter_first: Invalid n %d", di->di_n));
+ KASSERT(di->di_n > 0, ("%s: Invalid n %d", __func__, di->di_n));
KASSERT(*domain < vm_ndomains,
- ("vm_domainset_iter_first: Invalid domain %d", *domain));
+ ("%s: Invalid domain %d", __func__, *domain));
}
void
diff --git a/sys/vm/vm_kern.c b/sys/vm/vm_kern.c
index 875c22d27628..e7d7b6726d2c 100644
--- a/sys/vm/vm_kern.c
+++ b/sys/vm/vm_kern.c
@@ -110,11 +110,18 @@ u_int exec_map_entry_size;
u_int exec_map_entries;
SYSCTL_ULONG(_vm, OID_AUTO, min_kernel_address, CTLFLAG_RD,
- SYSCTL_NULL_ULONG_PTR, VM_MIN_KERNEL_ADDRESS, "Min kernel address");
+#if defined(__amd64__)
+ &kva_layout.km_low, 0,
+#else
+ SYSCTL_NULL_ULONG_PTR, VM_MIN_KERNEL_ADDRESS,
+#endif
+ "Min kernel address");
SYSCTL_ULONG(_vm, OID_AUTO, max_kernel_address, CTLFLAG_RD,
#if defined(__arm__)
&vm_max_kernel_address, 0,
+#elif defined(__amd64__)
+ &kva_layout.km_high, 0,
#else
SYSCTL_NULL_ULONG_PTR, VM_MAX_KERNEL_ADDRESS,
#endif
diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c
index bbae55895c2c..b239a6ffb4ce 100644
--- a/sys/vm/vm_page.c
+++ b/sys/vm/vm_page.c
@@ -396,7 +396,7 @@ vm_page_blacklist_load(char **list, char **end)
}
*list = ptr;
if (ptr != NULL)
- *end = ptr + len;
+ *end = ptr + len - 1;
else
*end = NULL;
return;
diff --git a/sys/vm/vm_pagequeue.h b/sys/vm/vm_pagequeue.h
index cbbd27389662..9bd3b389fb60 100644
--- a/sys/vm/vm_pagequeue.h
+++ b/sys/vm/vm_pagequeue.h
@@ -260,9 +260,9 @@ struct vm_domain {
u_int vmd_inactive_shortage; /* Per-thread shortage. */
blockcount_t vmd_inactive_running; /* Number of inactive threads. */
blockcount_t vmd_inactive_starting; /* Number of threads started. */
- volatile u_int vmd_addl_shortage; /* Shortage accumulator. */
- volatile u_int vmd_inactive_freed; /* Successful inactive frees. */
- volatile u_int vmd_inactive_us; /* Microseconds for above. */
+ u_int vmd_addl_shortage; /* (a) Shortage accumulator. */
+ u_int vmd_inactive_freed; /* (a) Successful inactive frees. */
+ u_int vmd_inactive_us; /* (a) Microseconds for above. */
u_int vmd_inactive_pps; /* Exponential decay frees/second. */
int vmd_oom_seq;
int vmd_last_active_scan;