26 files changed, 1113 insertions, 770 deletions
diff --git a/sys/kern/coredump_vnode.c b/sys/kern/coredump_vnode.c
new file mode 100644
index 000000000000..8b857e9aa4a2
--- /dev/null
+++ b/sys/kern/coredump_vnode.c
@@ -0,0 +1,562 @@
+/*
+ * SPDX-License-Identifier: BSD-3-Clause AND BSD-2-Clause
+ *
+ * Copyright (c) 1982, 1986, 1989, 1991, 1993
+ *	The Regents of the University of California.  All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * - kern_sig.c
+ */
+/*
+ * Copyright (c) 1993, David Greenman
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * -kern_exec.c
+ */
+
+#include <sys/systm.h>
+#include <sys/acct.h>
+#include <sys/compressor.h>
+#include <sys/devctl.h>
+#include <sys/fcntl.h>
+#include <sys/jail.h>
+#include <sys/limits.h>
+#include <sys/namei.h>
+#include <sys/proc.h>
+#include <sys/sbuf.h>
+#include <sys/stat.h>
+#include <sys/sysctl.h>
+#include <sys/sysent.h>
+#include <sys/syslog.h>
+#include <sys/ucoredump.h>
+#include <sys/unistd.h>
+#include <sys/vnode.h>
+
+#include <security/audit/audit.h>
+
+#define	GZIP_SUFFIX	".gz"
+#define	ZSTD_SUFFIX	".zst"
+
+#define	MAX_NUM_CORE_FILES 100000
+#ifndef NUM_CORE_FILES
+#define	NUM_CORE_FILES 5
+#endif
+
+static coredumper_handle_fn	coredump_vnode;	/* defined below */
+static struct coredumper vnode_coredumper = {
+	.cd_name = "vnode_coredumper",
+	.cd_handle = coredump_vnode,	/* dumps to a regular file vnode */
+};
+
+SYSINIT(vnode_coredumper_register, SI_SUB_EXEC, SI_ORDER_ANY,
+    coredumper_register, &vnode_coredumper);	/* SYSINIT-time hookup */
+
+_Static_assert(NUM_CORE_FILES >= 0 && NUM_CORE_FILES <= MAX_NUM_CORE_FILES,
+    "NUM_CORE_FILES is out of range (0 to " __STRING(MAX_NUM_CORE_FILES) ")");
+static int num_cores = NUM_CORE_FILES;	/* debug.ncores: %I slots to try */
+
+static int capmode_coredump;	/* nonzero adds VN_OPEN_NOCAPCHECK on open */
+SYSCTL_INT(_kern, OID_AUTO, capmode_coredump, CTLFLAG_RWTUN,
+    &capmode_coredump, 0, "Allow processes in capability mode to dump core");
+
+static int set_core_nodump_flag = 0;	/* nonzero sets UF_NODUMP on the core */
+SYSCTL_INT(_kern, OID_AUTO, nodump_coredump, CTLFLAG_RW, &set_core_nodump_flag,
+	0, "Enable setting the NODUMP flag on coredump files");
+
+static int coredump_devctl = 0;	/* nonzero enables the devctl_notify() */
+SYSCTL_INT(_kern, OID_AUTO, coredump_devctl, CTLFLAG_RW, &coredump_devctl,
+	0, "Generate a devctl notification when processes coredump");
+
+/*
+ * corefilename[] is protected by the allproc_lock (sysctl: xlock, use: slock).
+ */
+static char corefilename[MAXPATHLEN] = { "%N.core" };
+TUNABLE_STR("kern.corefile", corefilename, sizeof(corefilename));
+
+/* kern.corefile: read or replace corefilename[] with allproc_lock held. */
+static int
+sysctl_kern_corefile(SYSCTL_HANDLER_ARGS)
+{
+	int rv;
+
+	sx_xlock(&allproc_lock);
+	rv = sysctl_handle_string(oidp, corefilename, sizeof(corefilename),
+	    req);
+	sx_xunlock(&allproc_lock);
+	return (rv);
+}
+SYSCTL_PROC(_kern, OID_AUTO, corefile, CTLTYPE_STRING | CTLFLAG_RW |
+    CTLFLAG_MPSAFE, 0, 0, sysctl_kern_corefile, "A",
+    "Process corefile name format string");
+
+/* debug.ncores handler; new values are clamped to 0..MAX_NUM_CORE_FILES. */
+static int
+sysctl_debug_num_cores_check(SYSCTL_HANDLER_ARGS)
+{
+	int error, val;
+
+	val = num_cores;
+	error = sysctl_handle_int(oidp, &val, 0, req);
+	if (error != 0 || req->newptr == NULL)
+		return (error);
+	if (val < 0)
+		val = 0;
+	else if (val > MAX_NUM_CORE_FILES)
+		val = MAX_NUM_CORE_FILES;
+	num_cores = val;
+	return (0);
+}
+SYSCTL_PROC(_debug, OID_AUTO, ncores,
+    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, sizeof(int),
+    sysctl_debug_num_cores_check, "I",
+    "Maximum number of generated process corefiles while using index format");
+
+static void
+vnode_close_locked(struct thread *td, struct vnode *vp)
+{
+	/* vp is locked by the caller: drop the lock, then close it. */
+	VOP_UNLOCK(vp);
+	vn_close(vp, FWRITE, td->td_ucred, td);
+}
+
+int
+core_vn_write(const struct coredump_writer *cdw, const void *base, size_t len,
+    off_t offset, enum uio_seg seg, struct ucred *cred, size_t *resid,
+    struct thread *td)
+{
+	struct coredump_vnode_ctx *ctx = cdw->ctx;	/* vnode + file cred */
+	/* write_fn hook: write len bytes from base at offset of ctx->vp. */
+	return (vn_rdwr_inchunks(UIO_WRITE, ctx->vp, __DECONST(void *, base),
+	    len, offset, seg, IO_UNIT | IO_DIRECT | IO_RANGELOCKED,
+	    cred, ctx->fcred, resid, td));	/* active cred, file cred */
+}
+
+int
+core_vn_extend(const struct coredump_writer *cdw, off_t newsz,
+    struct ucred *cred)
+{
+	struct coredump_vnode_ctx *ctx = cdw->ctx;
+	struct mount *mp;	/* filled in by vn_start_write() */
+	int error;
+	/* extend_fn hook: set the size of ctx->vp to newsz; returns errno. */
+	error = vn_start_write(ctx->vp, &mp, V_WAIT);
+	if (error != 0)
+		return (error);
+	vn_lock(ctx->vp, LK_EXCLUSIVE | LK_RETRY);	/* for the truncate */
+	error = vn_truncate_locked(ctx->vp, newsz, false, cred);
+	VOP_UNLOCK(ctx->vp);
+	vn_finished_write(mp);	/* pairs with vn_start_write() */
+	return (error);
+}
+
+/*
+ * Open the core file for a name containing %I: rewrite the index digits at
+ * name[indexpos] (indexlen wide) for 0..ncores-1 until a not-yet-existing file
+ * is created.  If none can be created, the existing file with the oldest
+ * mtime is reused, unless the process is P_SUGID (EFAULT).  Returns 0 and
+ * sets *vpp, or an errno; EINVAL if ncores <= 0 leaves nothing to try.
+ */
+static int
+corefile_open_last(struct thread *td, char *name, int indexpos,
+    int indexlen, int ncores, struct vnode **vpp)
+{
+	struct vnode *oldvp, *nextvp, *vp;
+	struct vattr vattr;
+	struct nameidata nd;
+	int error, i, flags, oflags, cmode;
+	char ch;
+	struct timespec lasttime;
+
+	nextvp = oldvp = NULL;
+	cmode = S_IRUSR | S_IWUSR;
+	oflags = VN_OPEN_NOAUDIT | VN_OPEN_NAMECACHE |
+	    (capmode_coredump ? VN_OPEN_NOCAPCHECK : 0);
+
+	for (i = 0; i < ncores; i++) {
+		flags = O_CREAT | FWRITE | O_NOFOLLOW;
+
+		ch = name[indexpos + indexlen];
+		(void)snprintf(name + indexpos, indexlen + 1, "%.*u", indexlen,
+		    i);
+		name[indexpos + indexlen] = ch;
+
+		NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, name);
+		error = vn_open_cred(&nd, &flags, cmode, oflags, td->td_ucred,
+		    NULL);
+		if (error != 0)
+			break;
+
+		vp = nd.ni_vp;
+		NDFREE_PNBUF(&nd);
+		if ((flags & O_CREAT) == O_CREAT) {
+			nextvp = vp;
+			break;
+		}
+
+		error = VOP_GETATTR(vp, &vattr, td->td_ucred);
+		if (error != 0) {
+			vnode_close_locked(td, vp);
+			break;
+		}
+
+		if (oldvp == NULL ||
+		    lasttime.tv_sec > vattr.va_mtime.tv_sec ||
+		    (lasttime.tv_sec == vattr.va_mtime.tv_sec &&
+		    lasttime.tv_nsec >= vattr.va_mtime.tv_nsec)) {
+			if (oldvp != NULL)
+				vn_close(oldvp, FWRITE, td->td_ucred, td);
+			oldvp = vp;
+			VOP_UNLOCK(oldvp);
+			lasttime = vattr.va_mtime;
+		} else {
+			vnode_close_locked(td, vp);
+		}
+	}
+
+	if (oldvp != NULL) {
+		if (nextvp == NULL) {
+			if ((td->td_proc->p_flag & P_SUGID) != 0) {
+				error = EFAULT;
+				vn_close(oldvp, FWRITE, td->td_ucred, td);
+			} else {
+				nextvp = oldvp;
+				error = vn_lock(nextvp, LK_EXCLUSIVE);
+				if (error != 0) {
+					vn_close(nextvp, FWRITE, td->td_ucred,
+					    td);
+					nextvp = NULL;
+				}
+			}
+		} else {
+			vn_close(oldvp, FWRITE, td->td_ucred, td);
+		}
+	}
+	/* ncores <= 0: the loop never ran, so 'error' was never assigned. */
+	if (ncores <= 0)
+		error = EINVAL;
+	/* Every failure path above leaves nextvp NULL: nothing to close. */
+	if (error == 0)
+		*vpp = nextvp;
+
+	return (error);
+}
+
+/*
+ * corefile_open(comm, uid, pid, td, compress, signum, vpp, namep)
+ * Expand corefilename (sysctl kern.corefile, see above) and open/create the
+ * core file; on success set *vpp and *namep (M_TEMP, freed by the caller).
+ * corefilename is a printf-like string with these format specifiers:
+ *	%%	literal '%'		%H	hostname
+ *	%I	autoincrementing index	%N	name of process (comm)
+ *	%P	process id (pid)	%S	signal number (signum)
+ *	%U	user id (uid)
+ * For example, "%N.core" is the default; cores can be disabled completely
+ * by using "/dev/null", or all core files can be stored in "/cores/%U/%N-%P".
+ */
+static int
+corefile_open(const char *comm, uid_t uid, pid_t pid, struct thread *td,
+    int compress, int signum, struct vnode **vpp, char **namep)
+{
+	struct sbuf sb;
+	struct nameidata nd;
+	const char *format;
+	char *hostname, *name;
+	int cmode, error, flags, i, indexpos, indexlen, oflags, ncores;
+
+	hostname = NULL;
+	format = corefilename;
+	name = malloc(MAXPATHLEN, M_TEMP, M_WAITOK | M_ZERO);
+	indexlen = 0;
+	indexpos = -1;
+	ncores = num_cores;
+	(void)sbuf_new(&sb, name, MAXPATHLEN, SBUF_FIXEDLEN);
+	sx_slock(&allproc_lock);
+	for (i = 0; format[i] != '\0'; i++) {
+		switch (format[i]) {
+		case '%':	/* Format character */
+			i++;
+			switch (format[i]) {
+			case '%':
+				sbuf_putc(&sb, '%');
+				break;
+			case 'H':	/* hostname */
+				if (hostname == NULL)
+					hostname = malloc(MAXHOSTNAMELEN,
+					    M_TEMP, M_WAITOK);
+				getcredhostname(td->td_ucred, hostname,
+				    MAXHOSTNAMELEN);
+				sbuf_cat(&sb, hostname);
+				break;
+			case 'I':	/* autoincrementing index */
+				if (indexpos != -1) {
+					sbuf_printf(&sb, "%%I");
+					break;
+				}
+				indexpos = sbuf_len(&sb);
+				sbuf_printf(&sb, "%u", ncores - 1);
+				indexlen = sbuf_len(&sb) - indexpos;
+				break;
+			case 'N':	/* process name */
+				sbuf_printf(&sb, "%s", comm);
+				break;
+			case 'P':	/* process id */
+				sbuf_printf(&sb, "%u", pid);
+				break;
+			case 'S':	/* signal number */
+				sbuf_printf(&sb, "%i", signum);
+				break;
+			case 'U':	/* user id */
+				sbuf_printf(&sb, "%u", uid);
+				break;
+			default:
+				log(LOG_ERR, "Unknown format character %c in "
+				    "corename `%s'\n", format[i], format);
+				/* Lone trailing '%': stay on the NUL. */
+				if (format[i] == '\0')
+					i--;
+				break;
+			}
+			break;
+		default:
+			sbuf_putc(&sb, format[i]);
+			break;
+		}
+	}
+	sx_sunlock(&allproc_lock);
+	free(hostname, M_TEMP);
+	if (compress == COMPRESS_GZIP)
+		sbuf_cat(&sb, GZIP_SUFFIX);
+	else if (compress == COMPRESS_ZSTD)
+		sbuf_cat(&sb, ZSTD_SUFFIX);
+	if (sbuf_error(&sb) != 0) {
+		log(LOG_ERR, "pid %ld (%s), uid (%lu): corename is too "
+		    "long\n", (long)pid, comm, (u_long)uid);
+		sbuf_delete(&sb);
+		free(name, M_TEMP);
+		return (ENOMEM);
+	}
+	sbuf_finish(&sb);
+	sbuf_delete(&sb);
+
+	if (indexpos != -1) {
+		error = corefile_open_last(td, name, indexpos, indexlen, ncores,
+		    vpp);
+		if (error != 0) {
+			log(LOG_ERR,
+			    "pid %d (%s), uid (%u):  Path `%s' failed "
+			    "on initial open test, error = %d\n",
+			    pid, comm, uid, name, error);
+		}
+	} else {
+		cmode = S_IRUSR | S_IWUSR;
+		oflags = VN_OPEN_NOAUDIT | VN_OPEN_NAMECACHE |
+		    (capmode_coredump ? VN_OPEN_NOCAPCHECK : 0);
+		flags = O_CREAT | FWRITE | O_NOFOLLOW;
+		if ((td->td_proc->p_flag & P_SUGID) != 0)
+			flags |= O_EXCL;
+
+		NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, name);
+		error = vn_open_cred(&nd, &flags, cmode, oflags, td->td_ucred,
+		    NULL);
+		if (error == 0) {
+			*vpp = nd.ni_vp;
+			NDFREE_PNBUF(&nd);
+		}
+	}
+
+	if (error != 0) {
+#ifdef AUDIT
+		audit_proc_coredump(td, name, error);
+#endif
+		free(name, M_TEMP);
+		return (error);
+	}
+	*namep = name;
+	return (0);
+}
+
+/*
+ * The vnode dumper is the traditional coredump handler.  Our policy and limits
+ * are generally checked already, so it creates the coredump name and passes a
+ * vnode-backed coredump_writer and a size limit to the process-specific
+ * coredump routine (ENOSYS if there is none).  Open, validation (EFAULT) and
+ * close errors are returned too.  The proc lock is held on entry, not on exit.
+ */
+static int
+coredump_vnode(struct thread *td, off_t limit)
+{
+	struct proc *p = td->td_proc;
+	struct ucred *cred = td->td_ucred;
+	struct vnode *vp;
+	struct coredump_vnode_ctx wctx;
+	struct coredump_writer cdw = { };
+	struct flock lf;
+	struct vattr vattr;
+	size_t fullpathsize;
+	int error, error1, jid, locked, ppid, sig;
+	char *name;			/* name of corefile */
+	void *rl_cookie;
+	char *fullpath, *freepath = NULL;
+	struct sbuf *sb;
+
+	PROC_LOCK_ASSERT(p, MA_OWNED);
+
+	ppid = p->p_oppid;
+	sig = p->p_sig;
+	jid = p->p_ucred->cr_prison->pr_id;
+	PROC_UNLOCK(p);
+
+	error = corefile_open(p->p_comm, cred->cr_uid, p->p_pid, td,
+	    compress_user_cores, sig, &vp, &name);
+	if (error != 0)
+		return (error);
+
+	/*
+	 * Don't dump to non-regular files or files with links.
+	 * Do not dump into system files. Effective user must own the corefile.
+	 */
+	if (vp->v_type != VREG || VOP_GETATTR(vp, &vattr, cred) != 0 ||
+	    vattr.va_nlink != 1 || (vp->v_vflag & VV_SYSTEM) != 0 ||
+	    vattr.va_uid != cred->cr_uid) {
+		VOP_UNLOCK(vp);
+		error = EFAULT;
+		goto out;
+	}
+
+	VOP_UNLOCK(vp);
+
+	/* Postpone other writers, including core dumps of other processes. */
+	rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
+
+	lf.l_whence = SEEK_SET;
+	lf.l_start = 0;
+	lf.l_len = 0;
+	lf.l_type = F_WRLCK;
+	locked = (VOP_ADVLOCK(vp, (caddr_t)p, F_SETLK, &lf, F_FLOCK) == 0);
+
+	VATTR_NULL(&vattr);
+	vattr.va_size = 0;
+	if (set_core_nodump_flag)
+		vattr.va_flags = UF_NODUMP;
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+	VOP_SETATTR(vp, &vattr, cred);
+	VOP_UNLOCK(vp);
+	PROC_LOCK(p);
+	p->p_acflag |= ACORE;
+	PROC_UNLOCK(p);
+
+	wctx.vp = vp;
+	wctx.fcred = NOCRED;
+
+	cdw.ctx = &wctx;
+	cdw.write_fn = core_vn_write;
+	cdw.extend_fn = core_vn_extend;
+
+	if (p->p_sysent->sv_coredump != NULL) {
+		error = p->p_sysent->sv_coredump(td, &cdw, limit, 0);
+	} else {
+		error = ENOSYS;
+	}
+
+	if (locked) {
+		lf.l_type = F_UNLCK;
+		VOP_ADVLOCK(vp, (caddr_t)p, F_UNLCK, &lf, F_FLOCK);
+	}
+	vn_rangelock_unlock(vp, rl_cookie);
+
+	/*
+	 * Notify the userland helper that a process triggered a core dump.
+	 * This allows the helper to run an automated debugging session.
+	 */
+	if (error != 0 || coredump_devctl == 0)
+		goto out;
+	sb = sbuf_new_auto();
+	if (vn_fullpath_global(p->p_textvp, &fullpath, &freepath) != 0)
+		goto out2;
+	sbuf_cat(sb, "comm=\"");
+	devctl_safe_quote_sb(sb, fullpath);
+	free(freepath, M_TEMP);
+	sbuf_cat(sb, "\" core=\"");
+
+	/*
+	 * We can't look up the core file vp directly: when replacing a core,
+	 * and at other random times, the name cache is flushed and the lookup
+	 * fails.  Instead, if the core path is relative, prepend the cwd.
+	 */
+	if (name[0] != '/') {
+		fullpathsize = MAXPATHLEN;
+		freepath = malloc(fullpathsize, M_TEMP, M_WAITOK);
+		if (vn_getcwd(freepath, &fullpath, &fullpathsize) != 0) {
+			free(freepath, M_TEMP);
+			goto out2;
+		}
+		devctl_safe_quote_sb(sb, fullpath);
+		free(freepath, M_TEMP);
+		sbuf_putc(sb, '/');
+	}
+	devctl_safe_quote_sb(sb, name);
+	sbuf_putc(sb, '"');
+
+	sbuf_printf(sb, " jid=%d pid=%d ppid=%d signo=%d",
+	    jid, p->p_pid, ppid, sig);
+	if (sbuf_finish(sb) == 0)
+		devctl_notify("kernel", "signal", "coredump", sbuf_data(sb));
+out2:
+	sbuf_delete(sb);
+out:
+	error1 = vn_close(vp, FWRITE, cred, td);
+	if (error == 0)
+		error = error1;
+#ifdef AUDIT
+	audit_proc_coredump(td, name, error);
+#endif
+	free(name, M_TEMP);
+	return (error);
+}
diff --git a/sys/kern/imgact_elf.c b/sys/kern/imgact_elf.c
index b7ffbe68b483..2690ad3b2679 100644
--- a/sys/kern/imgact_elf.c
+++ b/sys/kern/imgact_elf.c
@@ -64,6 +64,7 @@
 #include <sys/syscall.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
+#include <sys/ucoredump.h>
 #include <sys/vnode.h>
 #include <sys/syslog.h>
 #include <sys/eventhandler.h>
@@ -1562,9 +1563,6 @@ struct note_info {
 
 TAILQ_HEAD(note_info_list, note_info);
 
-extern int compress_user_cores;
-extern int compress_user_cores_level;
-
 static void cb_put_phdr(vm_map_entry_t, void *);
 static void cb_size_segment(vm_map_entry_t, void *);
 static void each_dumpable_segment(struct thread *, segment_callback, void *,
@@ -1595,7 +1593,7 @@ core_compressed_write(void *base, size_t len, off_t offset, void *arg)
 }
 
 int
-__elfN(coredump)(struct thread *td, struct vnode *vp, off_t limit, int flags)
+__elfN(coredump)(struct thread *td, struct coredump_writer *cdw, off_t limit, int flags)
 {
 	struct ucred *cred = td->td_ucred;
 	int compm, error = 0;
@@ -1625,9 +1623,8 @@ __elfN(coredump)(struct thread *td, struct vnode *vp, off_t limit, int flags)
 	/* Set up core dump parameters. */
 	params.offset = 0;
 	params.active_cred = cred;
-	params.file_cred = NOCRED;
 	params.td = td;
-	params.vp = vp;
+	params.cdw = cdw;
 	params.comp = NULL;
 
 #ifdef RACCT
@@ -1662,6 +1659,12 @@ __elfN(coredump)(struct thread *td, struct vnode *vp, off_t limit, int flags)
 		tmpbuf = malloc(CORE_BUF_SIZE, M_TEMP, M_WAITOK | M_ZERO);
         }
 
+	if (cdw->init_fn != NULL) {
+		error = (*cdw->init_fn)(cdw, &params);
+		if (error != 0)
+			goto done;
+	}
+
 	/*
 	 * Allocate memory for building the header, fill it up,
 	 * and write it out following the notes.
diff --git a/sys/kern/kern_cpuset.c b/sys/kern/kern_cpuset.c
index 5d9e2f2f326b..d7eb82d5f259 100644
--- a/sys/kern/kern_cpuset.c
+++ b/sys/kern/kern_cpuset.c
@@ -530,7 +530,7 @@ _domainset_create(struct domainset *domain, struct domainlist *freelist)
  * remove them and update the domainset accordingly.  If only empty
  * domains are present, we must return failure.
  */
-static bool
+bool
 domainset_empty_vm(struct domainset *domain)
 {
 	domainset_t empty;
@@ -2409,82 +2409,92 @@ sys_cpuset_setdomain(struct thread *td, struct cpuset_setdomain_args *uap)
 }
 
 int
-kern_cpuset_setdomain(struct thread *td, cpulevel_t level, cpuwhich_t which,
-    id_t id, size_t domainsetsize, const domainset_t *maskp, int policy,
-    const struct cpuset_copy_cb *cb)
+domainset_populate(struct domainset *domain, const domainset_t *mask, int policy,
+    size_t mask_size)
 {
-	struct cpuset *nset;
-	struct cpuset *set;
-	struct thread *ttd;
-	struct proc *p;
-	struct domainset domain;
-	domainset_t *mask;
-	int error;
 
-	if (domainsetsize < sizeof(domainset_t) ||
-	    domainsetsize > DOMAINSET_MAXSIZE / NBBY)
-		return (ERANGE);
 	if (policy <= DOMAINSET_POLICY_INVALID ||
-	    policy > DOMAINSET_POLICY_MAX)
+	    policy > DOMAINSET_POLICY_MAX) {
 		return (EINVAL);
-	error = cpuset_check_capabilities(td, level, which, id);
-	if (error != 0)
-		return (error);
-	memset(&domain, 0, sizeof(domain));
-	mask = malloc(domainsetsize, M_TEMP, M_WAITOK | M_ZERO);
-	error = cb->cpuset_copyin(maskp, mask, domainsetsize);
-	if (error)
-		goto out;
+	}
+
 	/*
 	 * Verify that no high bits are set.
 	 */
-	if (domainsetsize > sizeof(domainset_t)) {
-		char *end;
-		char *cp;
+	if (mask_size > sizeof(domainset_t)) {
+		const char *end;
+		const char *cp;
 
-		end = cp = (char *)&mask->__bits;
-		end += domainsetsize;
+		end = cp = (const char *)&mask->__bits;
+		end += mask_size;
 		cp += sizeof(domainset_t);
-		while (cp != end)
+		while (cp != end) {
 			if (*cp++ != 0) {
-				error = EINVAL;
-				goto out;
+				return (EINVAL);
 			}
+		}
 	}
 	if (DOMAINSET_EMPTY(mask)) {
-		error = EDEADLK;
-		goto out;
+		return (EDEADLK);
 	}
-	DOMAINSET_COPY(mask, &domain.ds_mask);
-	domain.ds_policy = policy;
+	DOMAINSET_COPY(mask, &domain->ds_mask);
+	domain->ds_policy = policy;
 
 	/*
 	 * Sanitize the provided mask.
 	 */
-	if (!DOMAINSET_SUBSET(&all_domains, &domain.ds_mask)) {
-		error = EINVAL;
-		goto out;
+	if (!DOMAINSET_SUBSET(&all_domains, &domain->ds_mask)) {
+		return (EINVAL);
 	}
 
 	/* Translate preferred policy into a mask and fallback. */
 	if (policy == DOMAINSET_POLICY_PREFER) {
 		/* Only support a single preferred domain. */
-		if (DOMAINSET_COUNT(&domain.ds_mask) != 1) {
-			error = EINVAL;
-			goto out;
+		if (DOMAINSET_COUNT(&domain->ds_mask) != 1) {
+			return (EINVAL);
 		}
-		domain.ds_prefer = DOMAINSET_FFS(&domain.ds_mask) - 1;
+		domain->ds_prefer = DOMAINSET_FFS(&domain->ds_mask) - 1;
 		/* This will be constrained by domainset_shadow(). */
-		DOMAINSET_COPY(&all_domains, &domain.ds_mask);
+		DOMAINSET_COPY(&all_domains, &domain->ds_mask);
 	}
 
+	return (0);
+}
+
+int
+kern_cpuset_setdomain(struct thread *td, cpulevel_t level, cpuwhich_t which,
+    id_t id, size_t domainsetsize, const domainset_t *maskp, int policy,
+    const struct cpuset_copy_cb *cb)
+{
+	struct cpuset *nset;
+	struct cpuset *set;
+	struct thread *ttd;
+	struct proc *p;
+	struct domainset domain;
+	domainset_t *mask;
+	int error;
+
+	error = cpuset_check_capabilities(td, level, which, id);
+	if (error != 0)
+		return (error);
+	if (domainsetsize < sizeof(domainset_t) ||
+	    domainsetsize > DOMAINSET_MAXSIZE / NBBY)
+		return (ERANGE);
+	memset(&domain, 0, sizeof(domain));
+	mask = malloc(domainsetsize, M_TEMP, M_WAITOK | M_ZERO);
+	error = cb->cpuset_copyin(maskp, mask, domainsetsize);
+	if (error)
+		goto out;
+	error = domainset_populate(&domain, mask, policy, domainsetsize);
+	if (error)
+		goto out;
+
 	/*
 	 * When given an impossible policy, fall back to interleaving
 	 * across all domains.
 	 */
 	if (domainset_empty_vm(&domain))
 		domainset_copy(domainset2, &domain);
-
 	switch (level) {
 	case CPU_LEVEL_ROOT:
 	case CPU_LEVEL_CPUSET:
diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c
index 93bdd41d1515..a27ab33b34da 100644
--- a/sys/kern/kern_descrip.c
+++ b/sys/kern/kern_descrip.c
@@ -557,8 +557,10 @@ open_to_fde_flags(int open_flags, bool sticky_orb)
 		{ .f = O_CLOFORK,		.t = UF_FOCLOSE },
 		{ .f = O_RESOLVE_BENEATH,	.t = UF_RESOLVE_BENEATH },
 	};
+#if defined(__clang__) && __clang_major__ >= 19
 	_Static_assert(open_to_fde_flags_s[nitems(open_to_fde_flags_s) - 1].f ==
 	    O_RESOLVE_BENEATH, "O_RESOLVE_BENEATH must be last, for sticky_orb");
+#endif
 
 	return (flags_trans(open_to_fde_flags_s, nitems(open_to_fde_flags_s) -
 	    (sticky_orb ? 0 : 1), open_flags));
diff --git a/sys/kern/kern_exec.c b/sys/kern/kern_exec.c
index 03268365891e..0fc2d0e7f1bc 100644
--- a/sys/kern/kern_exec.c
+++ b/sys/kern/kern_exec.c
@@ -70,6 +70,7 @@
 #include <sys/sysent.h>
 #include <sys/sysproto.h>
 #include <sys/timers.h>
+#include <sys/ucoredump.h>
 #include <sys/umtxvar.h>
 #include <sys/vnode.h>
 #include <sys/wait.h>
@@ -2002,10 +2003,14 @@ int
 core_write(struct coredump_params *cp, const void *base, size_t len,
     off_t offset, enum uio_seg seg, size_t *resid)
 {
+	return ((*cp->cdw->write_fn)(cp->cdw, base, len, offset, seg,
+	    cp->active_cred, resid, cp->td));
+}
 
-	return (vn_rdwr_inchunks(UIO_WRITE, cp->vp, __DECONST(void *, base),
-	    len, offset, seg, IO_UNIT | IO_DIRECT | IO_RANGELOCKED,
-	    cp->active_cred, cp->file_cred, resid, cp->td));
+static int
+core_extend(struct coredump_params *cp, off_t newsz)
+{
+	return ((*cp->cdw->extend_fn)(cp->cdw, newsz, cp->active_cred));
 }
 
 int
@@ -2013,7 +2018,6 @@ core_output(char *base, size_t len, off_t offset, struct coredump_params *cp,
     void *tmpbuf)
 {
 	vm_map_t map;
-	struct mount *mp;
 	size_t resid, runlen;
 	int error;
 	bool success;
@@ -2068,14 +2072,7 @@ core_output(char *base, size_t len, off_t offset, struct coredump_params *cp,
 			}
 		}
 		if (!success) {
-			error = vn_start_write(cp->vp, &mp, V_WAIT);
-			if (error != 0)
-				break;
-			vn_lock(cp->vp, LK_EXCLUSIVE | LK_RETRY);
-			error = vn_truncate_locked(cp->vp, offset + runlen,
-			    false, cp->td->td_ucred);
-			VOP_UNLOCK(cp->vp);
-			vn_finished_write(mp);
+			error = core_extend(cp, offset + runlen);
 			if (error != 0)
 				break;
 		}
diff --git a/sys/kern/kern_jail.c b/sys/kern/kern_jail.c
index d4529e096929..7ef1d19f0ea8 100644
--- a/sys/kern/kern_jail.c
+++ b/sys/kern/kern_jail.c
@@ -3466,7 +3466,7 @@ prison_check_af(struct ucred *cred, int af)
 	pr = cred->cr_prison;
 #ifdef VIMAGE
 	/* Prisons with their own network stack are not limited. */
-	if (prison_owns_vnet(cred))
+	if (prison_owns_vnet(pr))
 		return (0);
 #endif
 
@@ -3531,7 +3531,7 @@ prison_if(struct ucred *cred, const struct sockaddr *sa)
 	KASSERT(sa != NULL, ("%s: sa is NULL", __func__));
 
 #ifdef VIMAGE
-	if (prison_owns_vnet(cred))
+	if (prison_owns_vnet(cred->cr_prison))
 		return (0);
 #endif
 
@@ -3648,7 +3648,7 @@ jailed_without_vnet(struct ucred *cred)
 	if (!jailed(cred))
 		return (false);
 #ifdef VIMAGE
-	if (prison_owns_vnet(cred))
+	if (prison_owns_vnet(cred->cr_prison))
 		return (false);
 #endif
 
@@ -3711,20 +3711,17 @@ getjailname(struct ucred *cred, char *name, size_t len)
 
 #ifdef VIMAGE
 /*
- * Determine whether the prison represented by cred owns
- * its vnet rather than having it inherited.
- *
- * Returns true in case the prison owns the vnet, false otherwise.
+ * Determine whether the prison owns its VNET.
  */
 bool
-prison_owns_vnet(struct ucred *cred)
+prison_owns_vnet(struct prison *pr)
 {
 
 	/*
 	 * vnets cannot be added/removed after jail creation,
 	 * so no need to lock here.
 	 */
-	return ((cred->cr_prison->pr_flags & PR_VNET) != 0);
+	return ((pr->pr_flags & PR_VNET) != 0);
 }
 #endif
 
@@ -4425,7 +4422,7 @@ sysctl_jail_vnet(SYSCTL_HANDLER_ARGS)
 #ifdef VIMAGE
 	struct ucred *cred = req->td->td_ucred;
 
-	havevnet = jailed(cred) && prison_owns_vnet(cred);
+	havevnet = jailed(cred) && prison_owns_vnet(cred->cr_prison);
 #else
 	havevnet = 0;
 #endif
diff --git a/sys/kern/kern_prot.c b/sys/kern/kern_prot.c
index d9aeec68e620..0f0bc056cafd 100644
--- a/sys/kern/kern_prot.c
+++ b/sys/kern/kern_prot.c
@@ -287,7 +287,7 @@ sys_getgid(struct thread *td, struct getgid_args *uap)
 
 	td->td_retval[0] = td->td_ucred->cr_rgid;
 #if defined(COMPAT_43)
-	td->td_retval[1] = td->td_ucred->cr_groups[0];
+	td->td_retval[1] = td->td_ucred->cr_gid;
 #endif
 	return (0);
 }
@@ -307,7 +307,7 @@ int
 sys_getegid(struct thread *td, struct getegid_args *uap)
 {
 
-	td->td_retval[0] = td->td_ucred->cr_groups[0];
+	td->td_retval[0] = td->td_ucred->cr_gid;
 	return (0);
 }
 
@@ -1080,7 +1080,7 @@ sys_setgid(struct thread *td, struct setgid_args *uap)
 	    gid != oldcred->cr_svgid &&		/* allow setgid(saved gid) */
 #endif
 #ifdef POSIX_APPENDIX_B_4_2_2	/* Use BSD-compat clause from B.4.2.2 */
-	    gid != oldcred->cr_groups[0] && /* allow setgid(getegid()) */
+	    gid != oldcred->cr_gid && /* allow setgid(getegid()) */
 #endif
 	    (error = priv_check_cred(oldcred, PRIV_CRED_SETGID)) != 0)
 		goto fail;
@@ -1092,7 +1092,7 @@ sys_setgid(struct thread *td, struct setgid_args *uap)
 	 */
 	if (
 #ifdef POSIX_APPENDIX_B_4_2_2	/* use the clause from B.4.2.2 */
-	    gid == oldcred->cr_groups[0] ||
+	    gid == oldcred->cr_gid ||
 #endif
 	    /* We are using privs. */
 	    priv_check_cred(oldcred, PRIV_CRED_SETGID) == 0)
@@ -1121,7 +1121,7 @@ sys_setgid(struct thread *td, struct setgid_args *uap)
 	 * In all cases permitted cases, we are changing the egid.
 	 * Copy credentials so other references do not see our changes.
 	 */
-	if (oldcred->cr_groups[0] != gid) {
+	if (oldcred->cr_gid != gid) {
 		change_egid(newcred, gid);
 		setsugid(p);
 	}
@@ -1167,7 +1167,7 @@ sys_setegid(struct thread *td, struct setegid_args *uap)
 	    (error = priv_check_cred(oldcred, PRIV_CRED_SETEGID)) != 0)
 		goto fail;
 
-	if (oldcred->cr_groups[0] != egid) {
+	if (oldcred->cr_gid != egid) {
 		change_egid(newcred, egid);
 		setsugid(p);
 	}
@@ -1393,12 +1393,12 @@ sys_setregid(struct thread *td, struct setregid_args *uap)
 
 	if (((rgid != (gid_t)-1 && rgid != oldcred->cr_rgid &&
 	    rgid != oldcred->cr_svgid) ||
-	     (egid != (gid_t)-1 && egid != oldcred->cr_groups[0] &&
+	     (egid != (gid_t)-1 && egid != oldcred->cr_gid &&
 	     egid != oldcred->cr_rgid && egid != oldcred->cr_svgid)) &&
 	    (error = priv_check_cred(oldcred, PRIV_CRED_SETREGID)) != 0)
 		goto fail;
 
-	if (egid != (gid_t)-1 && oldcred->cr_groups[0] != egid) {
+	if (egid != (gid_t)-1 && oldcred->cr_gid != egid) {
 		change_egid(newcred, egid);
 		setsugid(p);
 	}
@@ -1406,9 +1406,9 @@ sys_setregid(struct thread *td, struct setregid_args *uap)
 		change_rgid(newcred, rgid);
 		setsugid(p);
 	}
-	if ((rgid != (gid_t)-1 || newcred->cr_groups[0] != newcred->cr_rgid) &&
-	    newcred->cr_svgid != newcred->cr_groups[0]) {
-		change_svgid(newcred, newcred->cr_groups[0]);
+	if ((rgid != (gid_t)-1 || newcred->cr_gid != newcred->cr_rgid) &&
+	    newcred->cr_svgid != newcred->cr_gid) {
+		change_svgid(newcred, newcred->cr_gid);
 		setsugid(p);
 	}
 	proc_set_cred(p, newcred);
@@ -1547,17 +1547,17 @@ sys_setresgid(struct thread *td, struct setresgid_args *uap)
 
 	if (((rgid != (gid_t)-1 && rgid != oldcred->cr_rgid &&
 	      rgid != oldcred->cr_svgid &&
-	      rgid != oldcred->cr_groups[0]) ||
+	      rgid != oldcred->cr_gid) ||
 	     (egid != (gid_t)-1 && egid != oldcred->cr_rgid &&
 	      egid != oldcred->cr_svgid &&
-	      egid != oldcred->cr_groups[0]) ||
+	      egid != oldcred->cr_gid) ||
 	     (sgid != (gid_t)-1 && sgid != oldcred->cr_rgid &&
 	      sgid != oldcred->cr_svgid &&
-	      sgid != oldcred->cr_groups[0])) &&
+	      sgid != oldcred->cr_gid)) &&
 	    (error = priv_check_cred(oldcred, PRIV_CRED_SETRESGID)) != 0)
 		goto fail;
 
-	if (egid != (gid_t)-1 && oldcred->cr_groups[0] != egid) {
+	if (egid != (gid_t)-1 && oldcred->cr_gid != egid) {
 		change_egid(newcred, egid);
 		setsugid(p);
 	}
@@ -1626,8 +1626,8 @@ sys_getresgid(struct thread *td, struct getresgid_args *uap)
 		error1 = copyout(&cred->cr_rgid,
 		    uap->rgid, sizeof(cred->cr_rgid));
 	if (uap->egid)
-		error2 = copyout(&cred->cr_groups[0],
-		    uap->egid, sizeof(cred->cr_groups[0]));
+		error2 = copyout(&cred->cr_gid,
+		    uap->egid, sizeof(cred->cr_gid));
 	if (uap->sgid)
 		error3 = copyout(&cred->cr_svgid,
 		    uap->sgid, sizeof(cred->cr_svgid));
@@ -1737,7 +1737,7 @@ groupmember(gid_t gid, const struct ucred *cred)
 
 	groups_check_positive_len(cred->cr_ngroups);
 
-	if (gid == cred->cr_groups[0])
+	if (gid == cred->cr_gid)
 		return (true);
 
 	return (group_is_supplementary(gid, cred));
@@ -3015,7 +3015,7 @@ void
 change_egid(struct ucred *newcred, gid_t egid)
 {
 
-	newcred->cr_groups[0] = egid;
+	newcred->cr_gid = egid;
 }
 
 /*-
diff --git a/sys/kern/kern_sendfile.c b/sys/kern/kern_sendfile.c
index 35b258e68701..8438298afc0e 100644
--- a/sys/kern/kern_sendfile.c
+++ b/sys/kern/kern_sendfile.c
@@ -698,10 +698,13 @@ sendfile_wait_generic(struct socket *so, off_t need, int *space)
 	 */
 	error = 0;
 	SOCK_SENDBUF_LOCK(so);
-	if (so->so_snd.sb_lowat < so->so_snd.sb_hiwat / 2)
-		so->so_snd.sb_lowat = so->so_snd.sb_hiwat / 2;
-	if (so->so_snd.sb_lowat < PAGE_SIZE && so->so_snd.sb_hiwat >= PAGE_SIZE)
-		so->so_snd.sb_lowat = PAGE_SIZE;
+	if (so->so_snd.sb_flags & SB_AUTOLOWAT) {
+		if (so->so_snd.sb_lowat < so->so_snd.sb_hiwat / 2)
+			so->so_snd.sb_lowat = so->so_snd.sb_hiwat / 2;
+		if (so->so_snd.sb_lowat < PAGE_SIZE &&
+		    so->so_snd.sb_hiwat >= PAGE_SIZE)
+			so->so_snd.sb_lowat = PAGE_SIZE;
+	}
 retry_space:
 	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
 		error = EPIPE;
diff --git a/sys/kern/kern_sig.c b/sys/kern/kern_sig.c
index 5d51aa675cb7..da0efac0598d 100644
--- a/sys/kern/kern_sig.c
+++ b/sys/kern/kern_sig.c
@@ -45,10 +45,10 @@
 #include <sys/vnode.h>
 #include <sys/acct.h>
 #include <sys/capsicum.h>
-#include <sys/compressor.h>
 #include <sys/condvar.h>
 #include <sys/devctl.h>
 #include <sys/event.h>
+#include <sys/exec.h>
 #include <sys/fcntl.h>
 #include <sys/imgact.h>
 #include <sys/jail.h>
@@ -80,6 +80,7 @@
 #include <sys/syslog.h>
 #include <sys/sysproto.h>
 #include <sys/timers.h>
+#include <sys/ucoredump.h>
 #include <sys/unistd.h>
 #include <sys/vmmeter.h>
 #include <sys/wait.h>
@@ -101,7 +102,6 @@ SDT_PROBE_DEFINE2(proc, , , signal__clear,
 SDT_PROBE_DEFINE3(proc, , , signal__discard,
     "struct thread *", "struct proc *", "int");
 
-static int	coredump(struct thread *);
 static int	killpg1(struct thread *td, int sig, int pgid, int all,
 		    ksiginfo_t *ksi);
 static int	issignal(struct thread *td);
@@ -126,11 +126,6 @@ const struct filterops sig_filtops = {
 	.f_event = filt_signal,
 };
 
-static int	kern_logsigexit = 1;
-SYSCTL_INT(_kern, KERN_LOGSIGEXIT, logsigexit, CTLFLAG_RW,
-    &kern_logsigexit, 0,
-    "Log processes quitting on abnormal signals to syslog(3)");
-
 static int	kern_forcesigexit = 1;
 SYSCTL_INT(_kern, OID_AUTO, forcesigexit, CTLFLAG_RW,
     &kern_forcesigexit, 0, "Force trap signal to be handled");
@@ -193,26 +188,6 @@ SYSINIT(signal, SI_SUB_P1003_1B, SI_ORDER_FIRST+3, sigqueue_start, NULL);
 	    (cr1)->cr_ruid == (cr2)->cr_uid || \
 	    (cr1)->cr_uid == (cr2)->cr_uid)
 
-static int	sugid_coredump;
-SYSCTL_INT(_kern, OID_AUTO, sugid_coredump, CTLFLAG_RWTUN,
-    &sugid_coredump, 0, "Allow setuid and setgid processes to dump core");
-
-static int	capmode_coredump;
-SYSCTL_INT(_kern, OID_AUTO, capmode_coredump, CTLFLAG_RWTUN,
-    &capmode_coredump, 0, "Allow processes in capability mode to dump core");
-
-static int	do_coredump = 1;
-SYSCTL_INT(_kern, OID_AUTO, coredump, CTLFLAG_RW,
-	&do_coredump, 0, "Enable/Disable coredumps");
-
-static int	set_core_nodump_flag = 0;
-SYSCTL_INT(_kern, OID_AUTO, nodump_coredump, CTLFLAG_RW, &set_core_nodump_flag,
-	0, "Enable setting the NODUMP flag on coredump files");
-
-static int	coredump_devctl = 0;
-SYSCTL_INT(_kern, OID_AUTO, coredump_devctl, CTLFLAG_RW, &coredump_devctl,
-	0, "Generate a devctl notification when processes coredump");
-
 /*
  * Signal properties and actions.
  * The array below categorizes the signals and their default actions
@@ -784,6 +759,13 @@ sigprop(int sig)
 	return (0);
 }
 
+bool
+sig_do_core(int sig)
+{
+	/* True when sigprop() reports the SIGPROP_CORE property for sig. */
+	return ((sigprop(sig) & SIGPROP_CORE) != 0);
+}
+
 static bool
 sigact_flag_test(const struct sigaction *act, int flag)
 {
@@ -2665,6 +2647,8 @@ static void
 ptrace_coredumpreq(struct thread *td, struct proc *p,
     struct thr_coredump_req *tcq)
 {
+	struct coredump_vnode_ctx wctx;
+	struct coredump_writer cdw;
 	void *rl_cookie;
 
 	if (p->p_sysent->sv_coredump == NULL) {
@@ -2672,8 +2656,15 @@ ptrace_coredumpreq(struct thread *td, struct proc *p,
 		return;
 	}
 
+	wctx.vp = tcq->tc_vp;
+	wctx.fcred = NOCRED;
+
+	cdw.ctx = &wctx;
+	cdw.write_fn = core_vn_write;
+	cdw.extend_fn = core_vn_extend;
+
 	rl_cookie = vn_rangelock_wlock(tcq->tc_vp, 0, OFF_MAX);
-	tcq->tc_error = p->p_sysent->sv_coredump(td, tcq->tc_vp,
+	tcq->tc_error = p->p_sysent->sv_coredump(td, &cdw,
 	    tcq->tc_limit, tcq->tc_flags);
 	vn_rangelock_unlock(tcq->tc_vp, rl_cookie);
 }
@@ -3635,82 +3626,6 @@ killproc(struct proc *p, const char *why)
 }
 
 /*
- * Force the current process to exit with the specified signal, dumping core
- * if appropriate.  We bypass the normal tests for masked and caught signals,
- * allowing unrecoverable failures to terminate the process without changing
- * signal state.  Mark the accounting record with the signal termination.
- * If dumping core, save the signal number for the debugger.  Calls exit and
- * does not return.
- */
-void
-sigexit(struct thread *td, int sig)
-{
-	struct proc *p = td->td_proc;
-	const char *coreinfo;
-	int rv;
-	bool logexit;
-
-	PROC_LOCK_ASSERT(p, MA_OWNED);
-	proc_set_p2_wexit(p);
-
-	p->p_acflag |= AXSIG;
-	if ((p->p_flag2 & P2_LOGSIGEXIT_CTL) == 0)
-		logexit = kern_logsigexit != 0;
-	else
-		logexit = (p->p_flag2 & P2_LOGSIGEXIT_ENABLE) != 0;
-
-	/*
-	 * We must be single-threading to generate a core dump.  This
-	 * ensures that the registers in the core file are up-to-date.
-	 * Also, the ELF dump handler assumes that the thread list doesn't
-	 * change out from under it.
-	 *
-	 * XXX If another thread attempts to single-thread before us
-	 *     (e.g. via fork()), we won't get a dump at all.
-	 */
-	if ((sigprop(sig) & SIGPROP_CORE) &&
-	    thread_single(p, SINGLE_NO_EXIT) == 0) {
-		p->p_sig = sig;
-		/*
-		 * Log signals which would cause core dumps
-		 * (Log as LOG_INFO to appease those who don't want
-		 * these messages.)
-		 * XXX : Todo, as well as euid, write out ruid too
-		 * Note that coredump() drops proc lock.
-		 */
-		rv = coredump(td);
-		switch (rv) {
-		case 0:
-			sig |= WCOREFLAG;
-			coreinfo = " (core dumped)";
-			break;
-		case EFAULT:
-			coreinfo = " (no core dump - bad address)";
-			break;
-		case EINVAL:
-			coreinfo = " (no core dump - invalid argument)";
-			break;
-		case EFBIG:
-			coreinfo = " (no core dump - too large)";
-			break;
-		default:
-			coreinfo = " (no core dump - other error)";
-			break;
-		}
-		if (logexit)
-			log(LOG_INFO,
-			    "pid %d (%s), jid %d, uid %d: exited on "
-			    "signal %d%s\n", p->p_pid, p->p_comm,
-			    p->p_ucred->cr_prison->pr_id,
-			    td->td_ucred->cr_uid,
-			    sig &~ WCOREFLAG, coreinfo);
-	} else
-		PROC_UNLOCK(p);
-	exit1(td, 0, sig);
-	/* NOTREACHED */
-}
-
-/*
  * Send queued SIGCHLD to parent when child process's state
  * is changed.
  */
@@ -3803,477 +3718,6 @@ childproc_exited(struct proc *p)
 	sigparent(p, reason, status);
 }
 
-#define	MAX_NUM_CORE_FILES 100000
-#ifndef NUM_CORE_FILES
-#define	NUM_CORE_FILES 5
-#endif
-CTASSERT(NUM_CORE_FILES >= 0 && NUM_CORE_FILES <= MAX_NUM_CORE_FILES);
-static int num_cores = NUM_CORE_FILES;
-
-static int
-sysctl_debug_num_cores_check (SYSCTL_HANDLER_ARGS)
-{
-	int error;
-	int new_val;
-
-	new_val = num_cores;
-	error = sysctl_handle_int(oidp, &new_val, 0, req);
-	if (error != 0 || req->newptr == NULL)
-		return (error);
-	if (new_val > MAX_NUM_CORE_FILES)
-		new_val = MAX_NUM_CORE_FILES;
-	if (new_val < 0)
-		new_val = 0;
-	num_cores = new_val;
-	return (0);
-}
-SYSCTL_PROC(_debug, OID_AUTO, ncores,
-    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, sizeof(int),
-    sysctl_debug_num_cores_check, "I",
-    "Maximum number of generated process corefiles while using index format");
-
-#define	GZIP_SUFFIX	".gz"
-#define	ZSTD_SUFFIX	".zst"
-
-int compress_user_cores = 0;
-
-static int
-sysctl_compress_user_cores(SYSCTL_HANDLER_ARGS)
-{
-	int error, val;
-
-	val = compress_user_cores;
-	error = sysctl_handle_int(oidp, &val, 0, req);
-	if (error != 0 || req->newptr == NULL)
-		return (error);
-	if (val != 0 && !compressor_avail(val))
-		return (EINVAL);
-	compress_user_cores = val;
-	return (error);
-}
-SYSCTL_PROC(_kern, OID_AUTO, compress_user_cores,
-    CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_NEEDGIANT, 0, sizeof(int),
-    sysctl_compress_user_cores, "I",
-    "Enable compression of user corefiles ("
-    __XSTRING(COMPRESS_GZIP) " = gzip, "
-    __XSTRING(COMPRESS_ZSTD) " = zstd)");
-
-int compress_user_cores_level = 6;
-SYSCTL_INT(_kern, OID_AUTO, compress_user_cores_level, CTLFLAG_RWTUN,
-    &compress_user_cores_level, 0,
-    "Corefile compression level");
-
-/*
- * Protect the access to corefilename[] by allproc_lock.
- */
-#define	corefilename_lock	allproc_lock
-
-static char corefilename[MAXPATHLEN] = {"%N.core"};
-TUNABLE_STR("kern.corefile", corefilename, sizeof(corefilename));
-
-static int
-sysctl_kern_corefile(SYSCTL_HANDLER_ARGS)
-{
-	int error;
-
-	sx_xlock(&corefilename_lock);
-	error = sysctl_handle_string(oidp, corefilename, sizeof(corefilename),
-	    req);
-	sx_xunlock(&corefilename_lock);
-
-	return (error);
-}
-SYSCTL_PROC(_kern, OID_AUTO, corefile, CTLTYPE_STRING | CTLFLAG_RW |
-    CTLFLAG_MPSAFE, 0, 0, sysctl_kern_corefile, "A",
-    "Process corefile name format string");
-
-static void
-vnode_close_locked(struct thread *td, struct vnode *vp)
-{
-
-	VOP_UNLOCK(vp);
-	vn_close(vp, FWRITE, td->td_ucred, td);
-}
-
-/*
- * If the core format has a %I in it, then we need to check
- * for existing corefiles before defining a name.
- * To do this we iterate over 0..ncores to find a
- * non-existing core file name to use. If all core files are
- * already used we choose the oldest one.
- */
-static int
-corefile_open_last(struct thread *td, char *name, int indexpos,
-    int indexlen, int ncores, struct vnode **vpp)
-{
-	struct vnode *oldvp, *nextvp, *vp;
-	struct vattr vattr;
-	struct nameidata nd;
-	int error, i, flags, oflags, cmode;
-	char ch;
-	struct timespec lasttime;
-
-	nextvp = oldvp = NULL;
-	cmode = S_IRUSR | S_IWUSR;
-	oflags = VN_OPEN_NOAUDIT | VN_OPEN_NAMECACHE |
-	    (capmode_coredump ? VN_OPEN_NOCAPCHECK : 0);
-
-	for (i = 0; i < ncores; i++) {
-		flags = O_CREAT | FWRITE | O_NOFOLLOW;
-
-		ch = name[indexpos + indexlen];
-		(void)snprintf(name + indexpos, indexlen + 1, "%.*u", indexlen,
-		    i);
-		name[indexpos + indexlen] = ch;
-
-		NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, name);
-		error = vn_open_cred(&nd, &flags, cmode, oflags, td->td_ucred,
-		    NULL);
-		if (error != 0)
-			break;
-
-		vp = nd.ni_vp;
-		NDFREE_PNBUF(&nd);
-		if ((flags & O_CREAT) == O_CREAT) {
-			nextvp = vp;
-			break;
-		}
-
-		error = VOP_GETATTR(vp, &vattr, td->td_ucred);
-		if (error != 0) {
-			vnode_close_locked(td, vp);
-			break;
-		}
-
-		if (oldvp == NULL ||
-		    lasttime.tv_sec > vattr.va_mtime.tv_sec ||
-		    (lasttime.tv_sec == vattr.va_mtime.tv_sec &&
-		    lasttime.tv_nsec >= vattr.va_mtime.tv_nsec)) {
-			if (oldvp != NULL)
-				vn_close(oldvp, FWRITE, td->td_ucred, td);
-			oldvp = vp;
-			VOP_UNLOCK(oldvp);
-			lasttime = vattr.va_mtime;
-		} else {
-			vnode_close_locked(td, vp);
-		}
-	}
-
-	if (oldvp != NULL) {
-		if (nextvp == NULL) {
-			if ((td->td_proc->p_flag & P_SUGID) != 0) {
-				error = EFAULT;
-				vn_close(oldvp, FWRITE, td->td_ucred, td);
-			} else {
-				nextvp = oldvp;
-				error = vn_lock(nextvp, LK_EXCLUSIVE);
-				if (error != 0) {
-					vn_close(nextvp, FWRITE, td->td_ucred,
-					    td);
-					nextvp = NULL;
-				}
-			}
-		} else {
-			vn_close(oldvp, FWRITE, td->td_ucred, td);
-		}
-	}
-	if (error != 0) {
-		if (nextvp != NULL)
-			vnode_close_locked(td, oldvp);
-	} else {
-		*vpp = nextvp;
-	}
-
-	return (error);
-}
-
-/*
- * corefile_open(comm, uid, pid, td, compress, vpp, namep)
- * Expand the name described in corefilename, using name, uid, and pid
- * and open/create core file.
- * corefilename is a printf-like string, with three format specifiers:
- *	%N	name of process ("name")
- *	%P	process id (pid)
- *	%U	user id (uid)
- * For example, "%N.core" is the default; they can be disabled completely
- * by using "/dev/null", or all core files can be stored in "/cores/%U/%N-%P".
- * This is controlled by the sysctl variable kern.corefile (see above).
- */
-static int
-corefile_open(const char *comm, uid_t uid, pid_t pid, struct thread *td,
-    int compress, int signum, struct vnode **vpp, char **namep)
-{
-	struct sbuf sb;
-	struct nameidata nd;
-	const char *format;
-	char *hostname, *name;
-	int cmode, error, flags, i, indexpos, indexlen, oflags, ncores;
-
-	hostname = NULL;
-	format = corefilename;
-	name = malloc(MAXPATHLEN, M_TEMP, M_WAITOK | M_ZERO);
-	indexlen = 0;
-	indexpos = -1;
-	ncores = num_cores;
-	(void)sbuf_new(&sb, name, MAXPATHLEN, SBUF_FIXEDLEN);
-	sx_slock(&corefilename_lock);
-	for (i = 0; format[i] != '\0'; i++) {
-		switch (format[i]) {
-		case '%':	/* Format character */
-			i++;
-			switch (format[i]) {
-			case '%':
-				sbuf_putc(&sb, '%');
-				break;
-			case 'H':	/* hostname */
-				if (hostname == NULL) {
-					hostname = malloc(MAXHOSTNAMELEN,
-					    M_TEMP, M_WAITOK);
-				}
-				getcredhostname(td->td_ucred, hostname,
-				    MAXHOSTNAMELEN);
-				sbuf_cat(&sb, hostname);
-				break;
-			case 'I':	/* autoincrementing index */
-				if (indexpos != -1) {
-					sbuf_printf(&sb, "%%I");
-					break;
-				}
-
-				indexpos = sbuf_len(&sb);
-				sbuf_printf(&sb, "%u", ncores - 1);
-				indexlen = sbuf_len(&sb) - indexpos;
-				break;
-			case 'N':	/* process name */
-				sbuf_printf(&sb, "%s", comm);
-				break;
-			case 'P':	/* process id */
-				sbuf_printf(&sb, "%u", pid);
-				break;
-			case 'S':	/* signal number */
-				sbuf_printf(&sb, "%i", signum);
-				break;
-			case 'U':	/* user id */
-				sbuf_printf(&sb, "%u", uid);
-				break;
-			default:
-				log(LOG_ERR,
-				    "Unknown format character %c in "
-				    "corename `%s'\n", format[i], format);
-				break;
-			}
-			break;
-		default:
-			sbuf_putc(&sb, format[i]);
-			break;
-		}
-	}
-	sx_sunlock(&corefilename_lock);
-	free(hostname, M_TEMP);
-	if (compress == COMPRESS_GZIP)
-		sbuf_cat(&sb, GZIP_SUFFIX);
-	else if (compress == COMPRESS_ZSTD)
-		sbuf_cat(&sb, ZSTD_SUFFIX);
-	if (sbuf_error(&sb) != 0) {
-		log(LOG_ERR, "pid %ld (%s), uid (%lu): corename is too "
-		    "long\n", (long)pid, comm, (u_long)uid);
-		sbuf_delete(&sb);
-		free(name, M_TEMP);
-		return (ENOMEM);
-	}
-	sbuf_finish(&sb);
-	sbuf_delete(&sb);
-
-	if (indexpos != -1) {
-		error = corefile_open_last(td, name, indexpos, indexlen, ncores,
-		    vpp);
-		if (error != 0) {
-			log(LOG_ERR,
-			    "pid %d (%s), uid (%u):  Path `%s' failed "
-			    "on initial open test, error = %d\n",
-			    pid, comm, uid, name, error);
-		}
-	} else {
-		cmode = S_IRUSR | S_IWUSR;
-		oflags = VN_OPEN_NOAUDIT | VN_OPEN_NAMECACHE |
-		    (capmode_coredump ? VN_OPEN_NOCAPCHECK : 0);
-		flags = O_CREAT | FWRITE | O_NOFOLLOW;
-		if ((td->td_proc->p_flag & P_SUGID) != 0)
-			flags |= O_EXCL;
-
-		NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, name);
-		error = vn_open_cred(&nd, &flags, cmode, oflags, td->td_ucred,
-		    NULL);
-		if (error == 0) {
-			*vpp = nd.ni_vp;
-			NDFREE_PNBUF(&nd);
-		}
-	}
-
-	if (error != 0) {
-#ifdef AUDIT
-		audit_proc_coredump(td, name, error);
-#endif
-		free(name, M_TEMP);
-		return (error);
-	}
-	*namep = name;
-	return (0);
-}
-
-/*
- * Dump a process' core.  The main routine does some
- * policy checking, and creates the name of the coredump;
- * then it passes on a vnode and a size limit to the process-specific
- * coredump routine if there is one; if there _is not_ one, it returns
- * ENOSYS; otherwise it returns the error from the process-specific routine.
- */
-
-static int
-coredump(struct thread *td)
-{
-	struct proc *p = td->td_proc;
-	struct ucred *cred = td->td_ucred;
-	struct vnode *vp;
-	struct flock lf;
-	struct vattr vattr;
-	size_t fullpathsize;
-	int error, error1, jid, locked, ppid, sig;
-	char *name;			/* name of corefile */
-	void *rl_cookie;
-	off_t limit;
-	char *fullpath, *freepath = NULL;
-	struct sbuf *sb;
-
-	PROC_LOCK_ASSERT(p, MA_OWNED);
-	MPASS((p->p_flag & P_HADTHREADS) == 0 || p->p_singlethread == td);
-
-	if (!do_coredump || (!sugid_coredump && (p->p_flag & P_SUGID) != 0) ||
-	    (p->p_flag2 & P2_NOTRACE) != 0) {
-		PROC_UNLOCK(p);
-		return (EFAULT);
-	}
-
-	/*
-	 * Note that the bulk of limit checking is done after
-	 * the corefile is created.  The exception is if the limit
-	 * for corefiles is 0, in which case we don't bother
-	 * creating the corefile at all.  This layout means that
-	 * a corefile is truncated instead of not being created,
-	 * if it is larger than the limit.
-	 */
-	limit = (off_t)lim_cur(td, RLIMIT_CORE);
-	if (limit == 0 || racct_get_available(p, RACCT_CORE) == 0) {
-		PROC_UNLOCK(p);
-		return (EFBIG);
-	}
-
-	ppid = p->p_oppid;
-	sig = p->p_sig;
-	jid = p->p_ucred->cr_prison->pr_id;
-	PROC_UNLOCK(p);
-
-	error = corefile_open(p->p_comm, cred->cr_uid, p->p_pid, td,
-	    compress_user_cores, p->p_sig, &vp, &name);
-	if (error != 0)
-		return (error);
-
-	/*
-	 * Don't dump to non-regular files or files with links.
-	 * Do not dump into system files. Effective user must own the corefile.
-	 */
-	if (vp->v_type != VREG || VOP_GETATTR(vp, &vattr, cred) != 0 ||
-	    vattr.va_nlink != 1 || (vp->v_vflag & VV_SYSTEM) != 0 ||
-	    vattr.va_uid != cred->cr_uid) {
-		VOP_UNLOCK(vp);
-		error = EFAULT;
-		goto out;
-	}
-
-	VOP_UNLOCK(vp);
-
-	/* Postpone other writers, including core dumps of other processes. */
-	rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
-
-	lf.l_whence = SEEK_SET;
-	lf.l_start = 0;
-	lf.l_len = 0;
-	lf.l_type = F_WRLCK;
-	locked = (VOP_ADVLOCK(vp, (caddr_t)p, F_SETLK, &lf, F_FLOCK) == 0);
-
-	VATTR_NULL(&vattr);
-	vattr.va_size = 0;
-	if (set_core_nodump_flag)
-		vattr.va_flags = UF_NODUMP;
-	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
-	VOP_SETATTR(vp, &vattr, cred);
-	VOP_UNLOCK(vp);
-	PROC_LOCK(p);
-	p->p_acflag |= ACORE;
-	PROC_UNLOCK(p);
-
-	if (p->p_sysent->sv_coredump != NULL) {
-		error = p->p_sysent->sv_coredump(td, vp, limit, 0);
-	} else {
-		error = ENOSYS;
-	}
-
-	if (locked) {
-		lf.l_type = F_UNLCK;
-		VOP_ADVLOCK(vp, (caddr_t)p, F_UNLCK, &lf, F_FLOCK);
-	}
-	vn_rangelock_unlock(vp, rl_cookie);
-
-	/*
-	 * Notify the userland helper that a process triggered a core dump.
-	 * This allows the helper to run an automated debugging session.
-	 */
-	if (error != 0 || coredump_devctl == 0)
-		goto out;
-	sb = sbuf_new_auto();
-	if (vn_fullpath_global(p->p_textvp, &fullpath, &freepath) != 0)
-		goto out2;
-	sbuf_cat(sb, "comm=\"");
-	devctl_safe_quote_sb(sb, fullpath);
-	free(freepath, M_TEMP);
-	sbuf_cat(sb, "\" core=\"");
-
-	/*
-	 * We can't lookup core file vp directly. When we're replacing a core, and
-	 * other random times, we flush the name cache, so it will fail. Instead,
-	 * if the path of the core is relative, add the current dir in front if it.
-	 */
-	if (name[0] != '/') {
-		fullpathsize = MAXPATHLEN;
-		freepath = malloc(fullpathsize, M_TEMP, M_WAITOK);
-		if (vn_getcwd(freepath, &fullpath, &fullpathsize) != 0) {
-			free(freepath, M_TEMP);
-			goto out2;
-		}
-		devctl_safe_quote_sb(sb, fullpath);
-		free(freepath, M_TEMP);
-		sbuf_putc(sb, '/');
-	}
-	devctl_safe_quote_sb(sb, name);
-	sbuf_putc(sb, '"');
-
-	sbuf_printf(sb, " jid=%d pid=%d ppid=%d signo=%d",
-	    jid, p->p_pid, ppid, sig);
-	if (sbuf_finish(sb) == 0)
-		devctl_notify("kernel", "signal", "coredump", sbuf_data(sb));
-out2:
-	sbuf_delete(sb);
-out:
-	error1 = vn_close(vp, FWRITE, cred, td);
-	if (error == 0)
-		error = error1;
-#ifdef AUDIT
-	audit_proc_coredump(td, name, error);
-#endif
-	free(name, M_TEMP);
-	return (error);
-}
-
 /*
  * Nonexistent system call-- signal process (may want to handle it).  Flag
  * error in case process won't see signal immediately (blocked or ignored).
diff --git a/sys/kern/kern_sysctl.c b/sys/kern/kern_sysctl.c
index 46226cc31980..25da134661e9 100644
--- a/sys/kern/kern_sysctl.c
+++ b/sys/kern/kern_sysctl.c
@@ -2368,7 +2368,7 @@ sysctl_root(SYSCTL_HANDLER_ARGS)
 			priv = PRIV_SYSCTL_WRITEJAIL;
 #ifdef VIMAGE
 		else if ((oid->oid_kind & CTLFLAG_VNET) &&
-		     prison_owns_vnet(req->td->td_ucred))
+		     prison_owns_vnet(req->td->td_ucred->cr_prison))
 			priv = PRIV_SYSCTL_WRITEJAIL;
 #endif
 		else
diff --git a/sys/kern/kern_thread.c b/sys/kern/kern_thread.c
index f853af193016..50b040132396 100644
--- a/sys/kern/kern_thread.c
+++ b/sys/kern/kern_thread.c
@@ -571,7 +571,7 @@ threadinit(void)
 
 	/*
 	 * Thread structures are specially aligned so that (at least) the
-	 * 5 lower bits of a pointer to 'struct thead' must be 0.  These bits
+	 * 5 lower bits of a pointer to 'struct thread' must be 0.  These bits
 	 * are used by synchronization primitives to store flags in pointers to
 	 * such structures.
 	 */
diff --git a/sys/kern/kern_ucoredump.c b/sys/kern/kern_ucoredump.c
new file mode 100644
index 000000000000..d425596b5f24
--- /dev/null
+++ b/sys/kern/kern_ucoredump.c
@@ -0,0 +1,299 @@
+/*
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Copyright (c) 1982, 1986, 1989, 1991, 1993
+ *	The Regents of the University of California.  All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/param.h>
+#include <sys/acct.h>
+#include <sys/compressor.h>
+#include <sys/jail.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/signalvar.h>
+#include <sys/racct.h>
+#include <sys/resourcevar.h>
+#include <sys/rmlock.h>
+#include <sys/sysctl.h>
+#include <sys/syslog.h>
+#include <sys/ucoredump.h>
+#include <sys/wait.h>
+
+static int coredump(struct thread *td, const char **);
+
+int compress_user_cores = 0;
+
+static SLIST_HEAD(, coredumper)	coredumpers =
+    SLIST_HEAD_INITIALIZER(coredumpers);
+static struct rmlock	coredump_rmlock;
+RM_SYSINIT(coredump_lock, &coredump_rmlock, "coredump_lock");
+
+static int kern_logsigexit = 1;
+SYSCTL_INT(_kern, KERN_LOGSIGEXIT, logsigexit, CTLFLAG_RW,
+    &kern_logsigexit, 0,
+    "Log processes quitting on abnormal signals to syslog(3)");
+
+static int sugid_coredump;
+SYSCTL_INT(_kern, OID_AUTO, sugid_coredump, CTLFLAG_RWTUN,
+    &sugid_coredump, 0, "Allow setuid and setgid processes to dump core");
+
+static int do_coredump = 1;
+SYSCTL_INT(_kern, OID_AUTO, coredump, CTLFLAG_RW,
+	&do_coredump, 0, "Enable/Disable coredumps");
+
+static int
+sysctl_compress_user_cores(SYSCTL_HANDLER_ARGS)
+{
+	int error, format;
+
+	format = compress_user_cores;	/* start from the current setting */
+	error = sysctl_handle_int(oidp, &format, 0, req);
+	if (error != 0 || req->newptr == NULL)
+		return (error);		/* failure, or no new value supplied */
+	if (format != 0 && !compressor_avail(format))
+		return (EINVAL);	/* nonzero but unavailable format */
+	compress_user_cores = format;
+	return (0);
+}
+SYSCTL_PROC(_kern, OID_AUTO, compress_user_cores,
+    CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_NEEDGIANT, 0, sizeof(int),
+    sysctl_compress_user_cores, "I",
+    "Enable compression of user corefiles ("
+    __XSTRING(COMPRESS_GZIP) " = gzip, "
+    __XSTRING(COMPRESS_ZSTD) " = zstd)");
+
+int compress_user_cores_level = 6;
+SYSCTL_INT(_kern, OID_AUTO, compress_user_cores_level, CTLFLAG_RWTUN,
+    &compress_user_cores_level, 0,
+    "Corefile compression level");
+
+void
+coredumper_register(struct coredumper *cd)
+{
+	/* The refcount must be usable before the dumper becomes visible. */
+	blockcount_init(&cd->cd_refcount);
+	rm_wlock(&coredump_rmlock);
+	SLIST_INSERT_HEAD(&coredumpers, cd, cd_entry);
+	rm_wunlock(&coredump_rmlock);
+}
+
+void
+coredumper_unregister(struct coredumper *cd)
+{
+	/* Unlink first so that coredump() can no longer select this dumper. */
+	rm_wlock(&coredump_rmlock);
+	SLIST_REMOVE(&coredumpers, cd, coredumper, cd_entry);
+	rm_wunlock(&coredump_rmlock);
+
+	/*
+	 * Wait for any in-progress coredumps to finish before returning.
+	 */
+	blockcount_wait(&cd->cd_refcount, NULL, "dumpwait", 0);
+}
+
+/*
+ * Force the current process to exit with the specified signal, dumping core
+ * if appropriate.  We bypass the normal tests for masked and caught signals,
+ * allowing unrecoverable failures to terminate the process without changing
+ * signal state.  Mark the accounting record with the signal termination.
+ * If dumping core, save the signal number for the debugger.  Calls exit and
+ * does not return.
+ */
+void
+sigexit(struct thread *td, int sig)
+{
+	struct proc *p = td->td_proc;
+	int rv;
+	bool logexit;
+
+	PROC_LOCK_ASSERT(p, MA_OWNED);
+	proc_set_p2_wexit(p);
+
+	p->p_acflag |= AXSIG;
+	if ((p->p_flag2 & P2_LOGSIGEXIT_CTL) == 0)
+		logexit = kern_logsigexit != 0;
+	else
+		logexit = (p->p_flag2 & P2_LOGSIGEXIT_ENABLE) != 0;
+
+	/*
+	 * We must be single-threading to generate a core dump.  This
+	 * ensures that the registers in the core file are up-to-date.
+	 * Also, the ELF dump handler assumes that the thread list doesn't
+	 * change out from under it.
+	 *
+	 * XXX If another thread attempts to single-thread before us
+	 *     (e.g. via fork()), we won't get a dump at all.
+	 */
+	if (sig_do_core(sig) && thread_single(p, SINGLE_NO_EXIT) == 0) {
+		const char *err = NULL;	/* why no core was written, if so */
+
+		p->p_sig = sig;
+		/*
+		 * Log signals which would cause core dumps
+		 * (Log as LOG_INFO to appease those who don't want
+		 * these messages.)
+		 * XXX : Todo, as well as euid, write out ruid too
+		 * Note that coredump() drops proc lock.
+		 */
+		rv = coredump(td, &err);
+		if (rv == 0) {
+			MPASS(err == NULL);
+			sig |= WCOREFLAG;
+		} else if (err == NULL) {
+			switch (rv) {
+			case EFAULT:
+				err = "bad address";
+				break;
+			case EINVAL:
+				err = "invalid argument";
+				break;
+			case EFBIG:
+				err = "too large";
+				break;
+			default:
+				err = "other error";
+				break;
+			}
+		}
+		if (logexit)
+			log(LOG_INFO,
+			    "pid %d (%s), jid %d, uid %d: exited on "
+			    "signal %d (%s%s)\n", p->p_pid, p->p_comm,
+			    p->p_ucred->cr_prison->pr_id,
+			    td->td_ucred->cr_uid, sig &~ WCOREFLAG,
+			    err != NULL ? "no core dump - " : "core dumped",
+			    err != NULL ? err : "");
+	} else
+		PROC_UNLOCK(p);
+	exit1(td, 0, sig);
+	/* NOTREACHED */
+}
+
+
+/*
+ * Dump a process' core.  Called with the proc lock held; it is dropped on
+ * every return path (the chosen handler is expected to release it).  After
+ * policy checks (sysctls, P2_NOTRACE, core size limit) the registered
+ * coredumper with the highest probe priority is handed the thread and the
+ * limit, and its error is returned.  On denial, *errmsg names the reason.
+ */
+static int
+coredump(struct thread *td, const char **errmsg)
+{
+	struct coredumper *iter, *chosen;
+	struct proc *p = td->td_proc;
+	struct rm_priotracker tracker;
+	off_t limit;
+	int error, priority;
+
+	PROC_LOCK_ASSERT(p, MA_OWNED);
+	MPASS((p->p_flag & P_HADTHREADS) == 0 || p->p_singlethread == td);
+
+	if (!do_coredump || (!sugid_coredump && (p->p_flag & P_SUGID) != 0) ||
+	    (p->p_flag2 & P2_NOTRACE) != 0) {
+		PROC_UNLOCK(p);
+
+		if (!do_coredump)
+			*errmsg = "denied by kern.coredump";
+		else if ((p->p_flag2 & P2_NOTRACE) != 0)
+			*errmsg = "process has trace disabled";
+		else
+			*errmsg = "sugid process denied by kern.sugid_coredump";
+		return (EFAULT);
+	}
+
+	/*
+	 * Note that the bulk of limit checking is done after
+	 * the corefile is created.  The exception is if the limit
+	 * for corefiles is 0, in which case we don't bother
+	 * creating the corefile at all.  This layout means that
+	 * a corefile is truncated instead of not being created,
+	 * if it is larger than the limit.
+	 */
+	limit = (off_t)lim_cur(td, RLIMIT_CORE);
+	if (limit == 0 || racct_get_available(p, RACCT_CORE) == 0) {
+		PROC_UNLOCK(p);
+		*errmsg = "coredumpsize limit is 0";
+		return (EFBIG);
+	}
+
+	rm_rlock(&coredump_rmlock, &tracker);
+	priority = -1;
+	chosen = NULL;
+	SLIST_FOREACH(iter, &coredumpers, cd_entry) {
+		if (iter->cd_probe == NULL) {
+			/*
+			 * If we haven't found anything of a higher priority
+			 * yet, we'll call this a GENERIC.  Ideally, we want
+			 * coredumper modules to include a probe function.
+			 */
+			if (priority < 0) {
+				priority = COREDUMPER_GENERIC;
+				chosen = iter;
+			}
+
+			continue;
+		}
+
+		error = (*iter->cd_probe)(td);
+		if (error < 0)
+			continue;
+
+		/*
+		 * Higher priority than previous options.
+		 */
+		if (error > priority) {
+			priority = error;
+			chosen = iter;
+		}
+	}
+
+	/* Currently, we always have the vnode dumper built in. */
+	MPASS(chosen != NULL);
+
+	/*
+	 * Acquire our refcount before we drop the lock so that
+	 * coredumper_unregister() can safely assume that the refcount will only
+	 * go down once it's dropped the rmlock.
+	 */
+	blockcount_acquire(&chosen->cd_refcount, 1);
+	rm_runlock(&coredump_rmlock, &tracker);
+	error = ((*chosen->cd_handle)(td, limit));
+	PROC_LOCK_ASSERT(p, MA_NOTOWNED);
+
+	blockcount_release(&chosen->cd_refcount, 1);
+
+	return (error);
+}
diff --git a/sys/kern/subr_asan.c b/sys/kern/subr_asan.c
index 0edb631d1475..464efda1e91a 100644
--- a/sys/kern/subr_asan.c
+++ b/sys/kern/subr_asan.c
@@ -263,8 +263,7 @@ kasan_mark(const void *addr, size_t size, size_t redzsize, uint8_t code)
 	if (__predict_false(!kasan_enabled))
 		return;
 
-	if ((vm_offset_t)addr >= DMAP_MIN_ADDRESS &&
-	    (vm_offset_t)addr < DMAP_MAX_ADDRESS)
+	if (kasan_md_unsupported((vm_offset_t)addr))
 		return;
 
 	KASSERT((vm_offset_t)addr >= VM_MIN_KERNEL_ADDRESS &&
diff --git a/sys/kern/subr_compressor.c b/sys/kern/subr_compressor.c
index 280264881241..5d59622e0455 100644
--- a/sys/kern/subr_compressor.c
+++ b/sys/kern/subr_compressor.c
@@ -538,6 +538,12 @@ compressor_init(compressor_cb_t cb, int format, size_t maxiosize, int level,
 	return (s);
 }
 
+int
+compressor_format(const struct compressor *stream)
+{
+	return (stream->methods->format);
+}
+
 void
 compressor_reset(struct compressor *stream)
 {
diff --git a/sys/kern/subr_pctrie.c b/sys/kern/subr_pctrie.c
index 3a3548bad52b..bb86c779b936 100644
--- a/sys/kern/subr_pctrie.c
+++ b/sys/kern/subr_pctrie.c
@@ -691,21 +691,23 @@ _pctrie_lookup_ge(struct pctrie *ptree, struct pctrie_node *node,
 	 */
 	if (node == PCTRIE_NULL || *pctrie_toval(node) < index) {
 		/* Climb the path to find a node with a descendant > index. */
-		for (node = parent; node != NULL; node = pctrie_parent(node)) {
-			slot = pctrie_slot(node, index) + 1;
-			if ((node->pn_popmap >> slot) != 0)
+		node = NULL;
+		while (parent != NULL) {
+			slot = pctrie_slot(parent, index) + 1;
+			if ((parent->pn_popmap >> slot) != 0)
 				break;
+			node = parent;
+			parent = pctrie_parent(node);
 		}
-		if (node == NULL) {
+		if (parent == NULL) {
 			if (parent_out != NULL)
-				*parent_out = NULL;
+				*parent_out = node;
 			return (NULL);
 		}
 
 		/* Step to the least child with a descendant > index. */
-		slot += ffs(node->pn_popmap >> slot) - 1;
-		parent = node;
-		node = pctrie_node_load(&node->pn_child[slot], NULL,
+		slot += ffs(parent->pn_popmap >> slot) - 1;
+		node = pctrie_node_load(&parent->pn_child[slot], NULL,
 		    PCTRIE_LOCKED);
 	}
 	/* Descend to the least leaf of the subtrie. */
@@ -785,21 +787,23 @@ _pctrie_lookup_le(struct pctrie *ptree, struct pctrie_node *node,
 	 */
 	if (node == PCTRIE_NULL || *pctrie_toval(node) > index) {
 		/* Climb the path to find a node with a descendant < index. */
-		for (node = parent; node != NULL; node = pctrie_parent(node)) {
-			slot = pctrie_slot(node, index);
-			if ((node->pn_popmap & ((1 << slot) - 1)) != 0)
+		node = NULL;
+		while (parent != NULL) {
+			slot = pctrie_slot(parent, index);
+			if ((parent->pn_popmap & ((1 << slot) - 1)) != 0)
 				break;
+			node = parent;
+			parent = pctrie_parent(node);
 		}
-		if (node == NULL) {
+		if (parent == NULL) {
 			if (parent_out != NULL)
-				*parent_out = NULL;
+				*parent_out = node;
 			return (NULL);
 		}
 
 		/* Step to the greatest child with a descendant < index. */
-		slot = ilog2(node->pn_popmap & ((1 << slot) - 1));
-		parent = node;
-		node = pctrie_node_load(&node->pn_child[slot], NULL,
+		slot = ilog2(parent->pn_popmap & ((1 << slot) - 1));
+		node = pctrie_node_load(&parent->pn_child[slot], NULL,
 		    PCTRIE_LOCKED);
 	}
 	/* Descend to the greatest leaf of the subtrie. */
diff --git a/sys/kern/subr_trap.c b/sys/kern/subr_trap.c
index 18388ae5f232..bac7d0080c71 100644
--- a/sys/kern/subr_trap.c
+++ b/sys/kern/subr_trap.c
@@ -338,8 +338,9 @@ ast_handler(struct thread *td, struct trapframe *framep, bool dtor)
 		td->td_ast = 0;
 	}
 
-	CTR3(KTR_SYSC, "ast: thread %p (pid %d, %s)", td, td->td_proc->p_pid,
-            td->td_proc->p_comm);
+	CTR3(KTR_SYSC, "ast: thread %p (pid %d, %s)", td,
+	     td->td_proc == NULL ? -1 : td->td_proc->p_pid,
+	     td->td_proc == NULL ? "" : td->td_proc->p_comm);
 	KASSERT(framep == NULL || TRAPF_USERMODE(framep),
 	    ("ast in kernel mode"));
 
diff --git a/sys/kern/sys_generic.c b/sys/kern/sys_generic.c
index 94e44d888181..5606b36f772f 100644
--- a/sys/kern/sys_generic.c
+++ b/sys/kern/sys_generic.c
@@ -2269,6 +2269,7 @@ exterr_copyout(struct thread *td)
 		ue.error = 0;
 		sz = sizeof(ue.error);
 	} else {
+		ktrexterr(td);
 		sz = sizeof(ue) - __offsetof(struct uexterror, error);
 	}
 	error = copyout(&ue.error, uloc, sz);
@@ -2309,6 +2310,12 @@ sys_exterrctl(struct thread *td, struct exterrctl_args *uap)
 			return (EINVAL);
 		td->td_pflags2 &= ~TDP2_UEXTERR;
 		return (0);
+	case EXTERRCTL_UD:
+		/*
+		 * Important: this code must always return EINVAL and never any
+		 * extended error, for testing purposes.
+		 */
+		/* FALLTHROUGH */
 	default:
 		return (EINVAL);
 	}
@@ -2329,7 +2336,6 @@ exterr_set(int eerror, int category, const char *mmsg, uintptr_t pp1,
 		td->td_kexterr.p1 = pp1;
 		td->td_kexterr.p2 = pp2;
 		td->td_kexterr.src_line = line;
-		ktrexterr(td);
 	}
 	return (eerror);
 }
diff --git a/sys/kern/uipc_ktls.c b/sys/kern/uipc_ktls.c
index ce09042abdac..66ce1b5a081d 100644
--- a/sys/kern/uipc_ktls.c
+++ b/sys/kern/uipc_ktls.c
@@ -1207,7 +1207,7 @@ sb_mark_notready(struct sockbuf *sb)
 	for (; m != NULL; m = m->m_next) {
 		KASSERT(m->m_nextpkt == NULL, ("%s: m_nextpkt != NULL",
 		    __func__));
-		KASSERT((m->m_flags & M_NOTAVAIL) == 0, ("%s: mbuf not avail",
+		KASSERT((m->m_flags & M_NOTREADY) == 0, ("%s: mbuf not ready",
 		    __func__));
 		KASSERT(sb->sb_acc >= m->m_len, ("%s: sb_acc < m->m_len",
 		    __func__));
diff --git a/sys/kern/uipc_shm.c b/sys/kern/uipc_shm.c
index 6f83b875a6b6..85fe48ddd466 100644
--- a/sys/kern/uipc_shm.c
+++ b/sys/kern/uipc_shm.c
@@ -1134,10 +1134,10 @@ shm_doremove(struct shm_mapping *map)
 
 int
 kern_shm_open2(struct thread *td, const char *userpath, int flags, mode_t mode,
-    int shmflags, struct filecaps *fcaps, const char *name __unused)
+    int shmflags, struct filecaps *fcaps, const char *name __unused,
+    struct shmfd *shmfd)
 {
 	struct pwddesc *pdp;
-	struct shmfd *shmfd;
 	struct file *fp;
 	char *path;
 	void *rl_cookie;
@@ -1214,23 +1214,41 @@ kern_shm_open2(struct thread *td, const char *userpath, int flags, mode_t mode,
 	if (error != 0)
 		goto outnofp;
 
-	/* A SHM_ANON path pointer creates an anonymous object. */
+	/*
+	 * A SHM_ANON path pointer creates an anonymous object.  We allow other
+	 * parts of the kernel to pre-populate a shmfd and then materialize an
+	 * fd for it here as a means to pass data back up to userland.  This
+	 * doesn't really make sense for named shm objects, but it makes plenty
+	 * of sense for anonymous objects.
+	 */
 	if (userpath == SHM_ANON) {
-		/* A read-only anonymous object is pointless. */
-		if ((flags & O_ACCMODE) == O_RDONLY) {
-			error = EINVAL;
-			goto out;
-		}
-		shmfd = shm_alloc(td->td_ucred, cmode, largepage);
-		if (shmfd == NULL) {
-			error = ENOMEM;
-			goto out;
+		if (shmfd != NULL) {
+			shm_hold(shmfd);
+		} else {
+			/*
+			 * A read-only anonymous object is pointless, unless it
+			 * was pre-populated by the kernel with the expectation
+			 * that an fd would later be created for userland to
+			 * access it through.
+			 */
+			if ((flags & O_ACCMODE) == O_RDONLY) {
+				error = EINVAL;
+				goto out;
+			}
+			shmfd = shm_alloc(td->td_ucred, cmode, largepage);
+			if (shmfd == NULL) {
+				error = ENOMEM;
+				goto out;
+			}
+
+			shmfd->shm_seals = initial_seals;
+			shmfd->shm_flags = shmflags;
 		}
-		shmfd->shm_seals = initial_seals;
-		shmfd->shm_flags = shmflags;
 	} else {
 		fnv = fnv_32_str(path, FNV1_32_INIT);
 		sx_xlock(&shm_dict_lock);
+
+		MPASS(shmfd == NULL);
 		shmfd = shm_lookup(path, fnv);
 		if (shmfd == NULL) {
 			/* Object does not yet exist, create it if requested. */
@@ -2173,7 +2191,7 @@ kern_shm_open(struct thread *td, const char *path, int flags, mode_t mode,
     struct filecaps *caps)
 {
 
-	return (kern_shm_open2(td, path, flags, mode, 0, caps, NULL));
+	return (kern_shm_open2(td, path, flags, mode, 0, caps, NULL, NULL));
 }
 
 /*
@@ -2191,7 +2209,7 @@ sys_shm_open2(struct thread *td, struct shm_open2_args *uap)
 {
 
 	return (kern_shm_open2(td, uap->path, uap->flags, uap->mode,
-	    uap->shmflags, NULL, uap->name));
+	    uap->shmflags, NULL, uap->name, NULL));
 }
 
 int
diff --git a/sys/kern/uipc_sockbuf.c b/sys/kern/uipc_sockbuf.c
index ec00878cd9a5..745702bd4a4f 100644
--- a/sys/kern/uipc_sockbuf.c
+++ b/sys/kern/uipc_sockbuf.c
@@ -195,14 +195,14 @@ int
 sbready(struct sockbuf *sb, struct mbuf *m0, int count)
 {
 	struct mbuf *m;
-	u_int blocker;
+	bool blocker;
 
 	SOCKBUF_LOCK_ASSERT(sb);
 	KASSERT(sb->sb_fnrdy != NULL, ("%s: sb %p NULL fnrdy", __func__, sb));
 	KASSERT(count > 0, ("%s: invalid count %d", __func__, count));
 
 	m = m0;
-	blocker = (sb->sb_fnrdy == m) ? M_BLOCKED : 0;
+	blocker = (sb->sb_fnrdy == m);
 
 	while (count > 0) {
 		KASSERT(m->m_flags & M_NOTREADY,
@@ -217,8 +217,7 @@ sbready(struct sockbuf *sb, struct mbuf *m0, int count)
 			m->m_epg_nrdy = 0;
 		} else
 			count--;
-
-		m->m_flags &= ~(M_NOTREADY | blocker);
+		m->m_flags &= ~M_NOTREADY;
 		if (blocker)
 			sb->sb_acc += m->m_len;
 		m = m->m_next;
@@ -240,12 +239,8 @@ sbready(struct sockbuf *sb, struct mbuf *m0, int count)
 	}
 
 	/* This one was blocking all the queue. */
-	for (; m && (m->m_flags & M_NOTREADY) == 0; m = m->m_next) {
-		KASSERT(m->m_flags & M_BLOCKED,
-		    ("%s: m %p !M_BLOCKED", __func__, m));
-		m->m_flags &= ~M_BLOCKED;
+	for (; m && (m->m_flags & M_NOTREADY) == 0; m = m->m_next)
 		sb->sb_acc += m->m_len;
-	}
 
 	sb->sb_fnrdy = m;
 	sbready_compress(sb, m0, m);
@@ -269,8 +264,7 @@ sballoc(struct sockbuf *sb, struct mbuf *m)
 			sb->sb_fnrdy = m;
 		else
 			sb->sb_acc += m->m_len;
-	} else
-		m->m_flags |= M_BLOCKED;
+	}
 
 	if (m->m_type != MT_DATA && m->m_type != MT_OOBDATA)
 		sb->sb_ctl += m->m_len;
@@ -287,29 +281,29 @@ sballoc(struct sockbuf *sb, struct mbuf *m)
 void
 sbfree(struct sockbuf *sb, struct mbuf *m)
 {
+	struct mbuf *n;
 
 #if 0	/* XXX: not yet: soclose() call path comes here w/o lock. */
 	SOCKBUF_LOCK_ASSERT(sb);
 #endif
-
 	sb->sb_ccc -= m->m_len;
 
-	if (!(m->m_flags & M_NOTAVAIL))
-		sb->sb_acc -= m->m_len;
-
 	if (m == sb->sb_fnrdy) {
-		struct mbuf *n;
-
 		KASSERT(m->m_flags & M_NOTREADY,
 		    ("%s: m %p !M_NOTREADY", __func__, m));
 
 		n = m->m_next;
 		while (n != NULL && !(n->m_flags & M_NOTREADY)) {
-			n->m_flags &= ~M_BLOCKED;
 			sb->sb_acc += n->m_len;
 			n = n->m_next;
 		}
 		sb->sb_fnrdy = n;
+	} else {
+		/* Assert that mbuf is not behind sb_fnrdy. */
+		for (n = sb->sb_fnrdy; n != NULL; n = n->m_next)
+			KASSERT(n != m, ("%s: sb %p freeing %p behind sb_fnrdy",
+			    __func__, sb, m));
+		sb->sb_acc -= m->m_len;
 	}
 
 	if (m->m_type != MT_DATA && m->m_type != MT_OOBDATA)
@@ -779,6 +773,7 @@ sbsetopt(struct socket *so, struct sockopt *sopt)
 		 * high-water.
 		 */
 		*lowat = (cc > *hiwat) ? *hiwat : cc;
+		*flags &= ~SB_AUTOLOWAT;
 		break;
 	}
 
@@ -1128,13 +1123,7 @@ sbcheck(struct sockbuf *sb, const char *file, int line)
 			}
 			fnrdy = m;
 		}
-		if (fnrdy) {
-			if (!(m->m_flags & M_NOTAVAIL)) {
-				printf("sb %p: fnrdy %p, m %p is avail\n",
-				    sb, sb->sb_fnrdy, m);
-				goto fail;
-			}
-		} else
+		if (fnrdy == NULL)
 			acc += m->m_len;
 		ccc += m->m_len;
 		mbcnt += MSIZE;
@@ -1601,8 +1590,8 @@ sbcut_internal(struct sockbuf *sb, int len)
 			next = m->m_nextpkt;
 		}
 		if (m->m_len > len) {
-			KASSERT(!(m->m_flags & M_NOTAVAIL),
-			    ("%s: m %p M_NOTAVAIL", __func__, m));
+			KASSERT(!(m->m_flags & M_NOTREADY),
+			    ("%s: m %p M_NOTREADY", __func__, m));
 			m->m_len -= len;
 			m->m_data += len;
 			sb->sb_ccc -= len;
diff --git a/sys/kern/uipc_socket.c b/sys/kern/uipc_socket.c
index 6c9eb7139cd1..fe2d8d056062 100644
--- a/sys/kern/uipc_socket.c
+++ b/sys/kern/uipc_socket.c
@@ -1211,7 +1211,8 @@ solisten_clone(struct socket *head)
 	so->so_rcv.sb_timeo = head->sol_sbrcv_timeo;
 	so->so_snd.sb_timeo = head->sol_sbsnd_timeo;
 	so->so_rcv.sb_flags = head->sol_sbrcv_flags & SB_AUTOSIZE;
-	so->so_snd.sb_flags = head->sol_sbsnd_flags & SB_AUTOSIZE;
+	so->so_snd.sb_flags = head->sol_sbsnd_flags &
+	    (SB_AUTOSIZE | SB_AUTOLOWAT);
 	if ((so->so_proto->pr_flags & PR_SOCKBUF) == 0) {
 		so->so_snd.sb_mtx = &so->so_snd_mtx;
 		so->so_rcv.sb_mtx = &so->so_rcv_mtx;
@@ -2988,8 +2989,8 @@ dontblock:
 	 */
 	moff = 0;
 	offset = 0;
-	while (m != NULL && !(m->m_flags & M_NOTAVAIL) && uio->uio_resid > 0
-	    && error == 0) {
+	while (m != NULL && !(m->m_flags & M_NOTREADY) && uio->uio_resid > 0 &&
+	    error == 0) {
 		/*
 		 * If the type of mbuf has changed since the last mbuf
 		 * examined ('type'), end the receive operation.
@@ -3341,7 +3342,7 @@ deliver:
 			for (m = sb->sb_mb;
 			     m != NULL && m->m_len <= len;
 			     m = m->m_next) {
-				KASSERT(!(m->m_flags & M_NOTAVAIL),
+				KASSERT(!(m->m_flags & M_NOTREADY),
 				    ("%s: m %p not available", __func__, m));
 				len -= m->m_len;
 				uio->uio_resid -= m->m_len;
@@ -4514,6 +4515,9 @@ sokqfilter_generic(struct socket *so, struct knote *kn)
 		SOCK_BUF_LOCK(so, which);
 		knlist_add(knl, kn, 1);
 		sb->sb_flags |= SB_KNOTE;
+		if ((kn->kn_sfflags & NOTE_LOWAT) &&
+		    (sb->sb_flags & SB_AUTOLOWAT))
+			sb->sb_flags &= ~SB_AUTOLOWAT;
 		SOCK_BUF_UNLOCK(so, which);
 	}
 	SOCK_UNLOCK(so);
diff --git a/sys/kern/vfs_cache.c b/sys/kern/vfs_cache.c
index 3d455b3874cc..89c1d779f04c 100644
--- a/sys/kern/vfs_cache.c
+++ b/sys/kern/vfs_cache.c
@@ -332,7 +332,8 @@ SDT_PROBE_DEFINE2(vfs, namecache, evict_negative, done, "struct vnode *",
     "char *");
 SDT_PROBE_DEFINE1(vfs, namecache, symlink, alloc__fail, "size_t");
 
-SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool");
+SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata *", "int",
+    "enum cache_fpl_status");
 SDT_PROBE_DECLARE(vfs, namei, lookup, entry);
 SDT_PROBE_DECLARE(vfs, namei, lookup, return);
 
@@ -6420,15 +6421,11 @@ out:
 	cache_fpl_smr_assert_not_entered(&fpl);
 	cache_fpl_assert_status(&fpl);
 	*status = fpl.status;
-	if (SDT_PROBES_ENABLED()) {
-		SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
-		if (fpl.status == CACHE_FPL_STATUS_HANDLED)
-			SDT_PROBE4(vfs, namei, lookup, return, error, ndp->ni_vp, true,
-			    ndp);
-	}
-
+	SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
 	if (__predict_true(fpl.status == CACHE_FPL_STATUS_HANDLED)) {
 		MPASS(error != CACHE_FPL_FAILED);
+		SDT_PROBE4(vfs, namei, lookup, return, error, ndp->ni_vp, true,
+		    ndp);
 		if (error != 0) {
 			cache_fpl_cleanup_cnp(fpl.cnp);
 			MPASS(fpl.dvp == NULL);
diff --git a/sys/kern/vfs_inotify.c b/sys/kern/vfs_inotify.c
index 41e73bb41a49..d3cd0d1f9832 100644
--- a/sys/kern/vfs_inotify.c
+++ b/sys/kern/vfs_inotify.c
@@ -371,7 +371,7 @@ inotify_unlink_watch_locked(struct inotify_softc *sc, struct inotify_watch *watc
 
 	TAILQ_REMOVE(&vp->v_pollinfo->vpi_inotify, watch, vlink);
 	if (TAILQ_EMPTY(&vp->v_pollinfo->vpi_inotify))
-		vn_irflag_unset_locked(vp, VIRF_INOTIFY);
+		vn_irflag_unset(vp, VIRF_INOTIFY);
 }
 
 /*
@@ -675,7 +675,8 @@ vn_inotify(struct vnode *vp, struct vnode *dvp, struct componentname *cnp,
 					struct vattr va;
 					int error;
 
-					error = VOP_GETATTR(vp, &va, cnp->cn_cred);
+					error = VOP_GETATTR(vp, &va,
+					    cnp->cn_cred);
 					if (error == 0 && va.va_nlink != 0)
 						selfevent = 0;
 				}
@@ -760,9 +761,11 @@ vn_inotify_add_watch(struct vnode *vp, struct inotify_softc *sc, uint32_t mask,
 			 * directory if it's specified as a vnode.
 			 */
 			vrefact(vp);
+			VOP_UNLOCK(vp);
 			NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE,
 			    dp->d_name, vp);
 			error = namei(&nd);
+			vn_lock(vp, LK_SHARED | LK_RETRY);
 			if (error != 0)
 				break;
 			vn_irflag_set_cond(nd.ni_vp, VIRF_INOTIFY_PARENT);
diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c
index 918b256e6c59..29774cf87393 100644
--- a/sys/kern/vfs_subr.c
+++ b/sys/kern/vfs_subr.c
@@ -6533,17 +6533,6 @@ vop_read_pgcache_post(void *ap, int rc)
 		VFS_KNOTE_UNLOCKED(a->a_vp, NOTE_READ);
 }
 
-void
-vop_readdir_post(void *ap, int rc)
-{
-	struct vop_readdir_args *a = ap;
-
-	if (!rc) {
-		VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ);
-		INOTIFY(a->a_vp, IN_ACCESS);
-	}
-}
-
 static struct knlist fs_knlist;
 
 static void
diff --git a/sys/kern/vfs_syscalls.c b/sys/kern/vfs_syscalls.c
index d880733cbfe7..25d40a9806cb 100644
--- a/sys/kern/vfs_syscalls.c
+++ b/sys/kern/vfs_syscalls.c
@@ -2253,10 +2253,10 @@ kern_accessat(struct thread *td, int fd, const char *path,
 	cred = td->td_ucred;
 	if ((flag & AT_EACCESS) == 0 &&
 	    ((cred->cr_uid != cred->cr_ruid ||
-	    cred->cr_rgid != cred->cr_groups[0]))) {
+	    cred->cr_rgid != cred->cr_gid))) {
 		usecred = crdup(cred);
 		usecred->cr_uid = cred->cr_ruid;
-		usecred->cr_groups[0] = cred->cr_rgid;
+		usecred->cr_gid = cred->cr_rgid;
 		td->td_ucred = usecred;
 	} else
 		usecred = cred;
@@ -4314,10 +4314,6 @@ kern_getdirentries(struct thread *td, int fd, char *buf, size_t count,
 	vp = fp->f_vnode;
 	foffset = foffset_lock(fp, 0);
 unionread:
-	if (vp->v_type != VDIR) {
-		error = EINVAL;
-		goto fail;
-	}
 	if (__predict_false((vp->v_vflag & VV_UNLINKED) != 0)) {
 		error = ENOENT;
 		goto fail;
@@ -4330,6 +4326,19 @@ unionread:
 	auio.uio_segflg = bufseg;
 	auio.uio_td = td;
 	vn_lock(vp, LK_SHARED | LK_RETRY);
+	/*
+	 * We want to return ENOTDIR for anything that is not VDIR, but
+	 * not for VBAD, and we can't check for VBAD while the vnode is
+	 * unlocked.
+	 */
+	if (vp->v_type != VDIR) {
+		if (vp->v_type == VBAD)
+			error = EBADF;
+		else
+			error = ENOTDIR;
+		VOP_UNLOCK(vp);
+		goto fail;
+	}
 	AUDIT_ARG_VNODE1(vp);
 	loff = auio.uio_offset = foffset;
 #ifdef MAC
diff --git a/sys/kern/vnode_if.src b/sys/kern/vnode_if.src
index 38138a4af921..2e63215b2f97 100644
--- a/sys/kern/vnode_if.src
+++ b/sys/kern/vnode_if.src
@@ -242,8 +242,8 @@ vop_read_pgcache {
 
 
 %% write	vp	L L L
-%! write	pre	VOP_WRITE_PRE
-%! write	post	VOP_WRITE_POST
+%! write	pre	vop_write_pre
+%! write	post	vop_write_post
 
 vop_write {
 	IN struct vnode *vp;
@@ -380,6 +380,7 @@ vop_symlink {
 
 
 %% readdir	vp	L L L
+%! readdir	pre	vop_readdir_pre
 %! readdir	post	vop_readdir_post
 
 vop_readdir {