path: root/sys/kern
Diffstat (limited to 'sys/kern')
-rw-r--r--  sys/kern/coredump_vnode.c   |  562
-rw-r--r--  sys/kern/imgact_elf.c       |   15
-rw-r--r--  sys/kern/init_sysent.c      |    4
-rw-r--r--  sys/kern/kern_cpuset.c      |   98
-rw-r--r--  sys/kern/kern_descrip.c     |  151
-rw-r--r--  sys/kern/kern_exec.c        |   21
-rw-r--r--  sys/kern/kern_jail.c        |   17
-rw-r--r--  sys/kern/kern_prot.c        |   38
-rw-r--r--  sys/kern/kern_resource.c    |   21
-rw-r--r--  sys/kern/kern_sendfile.c    |   15
-rw-r--r--  sys/kern/kern_sig.c         |  596
-rw-r--r--  sys/kern/kern_syscalls.c    |    5
-rw-r--r--  sys/kern/kern_sysctl.c      |    2
-rw-r--r--  sys/kern/kern_ucoredump.c   |  299
-rw-r--r--  sys/kern/subr_asan.c        |    3
-rw-r--r--  sys/kern/subr_capability.c  |    4
-rw-r--r--  sys/kern/subr_compressor.c  |    6
-rw-r--r--  sys/kern/subr_pctrie.c      |   36
-rw-r--r--  sys/kern/subr_trap.c        |    5
-rw-r--r--  sys/kern/sys_generic.c      |  106
-rw-r--r--  sys/kern/sys_pipe.c         |    2
-rw-r--r--  sys/kern/syscalls.c         |    2
-rw-r--r--  sys/kern/syscalls.master    |   17
-rw-r--r--  sys/kern/systrace_args.c    |   60
-rw-r--r--  sys/kern/sysv_msg.c         |    2
-rw-r--r--  sys/kern/sysv_sem.c         |    2
-rw-r--r--  sys/kern/sysv_shm.c         |    2
-rw-r--r--  sys/kern/uipc_ktls.c        |    2
-rw-r--r--  sys/kern/uipc_shm.c         |   50
-rw-r--r--  sys/kern/uipc_sockbuf.c     |   43
-rw-r--r--  sys/kern/uipc_socket.c      |   12
-rw-r--r--  sys/kern/uipc_syscalls.c    |   13
-rw-r--r--  sys/kern/uipc_usrreq.c      |    3
-rw-r--r--  sys/kern/vfs_aio.c          |   46
-rw-r--r--  sys/kern/vfs_cache.c        |   72
-rw-r--r--  sys/kern/vfs_default.c      |   18
-rw-r--r--  sys/kern/vfs_inotify.c      | 1011
-rw-r--r--  sys/kern/vfs_lookup.c       |  177
-rw-r--r--  sys/kern/vfs_mount.c        |    2
-rw-r--r--  sys/kern/vfs_subr.c         |  105
-rw-r--r--  sys/kern/vfs_syscalls.c     |   41
-rw-r--r--  sys/kern/vfs_vnops.c        |    7
-rw-r--r--  sys/kern/vnode_if.src       |   29
43 files changed, 2797 insertions, 925 deletions
diff --git a/sys/kern/coredump_vnode.c b/sys/kern/coredump_vnode.c
new file mode 100644
index 000000000000..8b857e9aa4a2
--- /dev/null
+++ b/sys/kern/coredump_vnode.c
@@ -0,0 +1,562 @@
+/*
+ * SPDX-License-Identifier: BSD-3-Clause AND BSD-2-Clause
+ *
+ * Copyright (c) 1982, 1986, 1989, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * - kern_sig.c
+ */
+/*
+ * Copyright (c) 1993, David Greenman
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * - kern_exec.c
+ */
+
+#include <sys/systm.h>
+#include <sys/acct.h>
+#include <sys/compressor.h>
+#include <sys/devctl.h>
+#include <sys/fcntl.h>
+#include <sys/jail.h>
+#include <sys/limits.h>
+#include <sys/namei.h>
+#include <sys/proc.h>
+#include <sys/sbuf.h>
+#include <sys/stat.h>
+#include <sys/sysctl.h>
+#include <sys/sysent.h>
+#include <sys/syslog.h>
+#include <sys/ucoredump.h>
+#include <sys/unistd.h>
+#include <sys/vnode.h>
+
+#include <security/audit/audit.h>
+
+#define GZIP_SUFFIX ".gz"
+#define ZSTD_SUFFIX ".zst"
+
+#define MAX_NUM_CORE_FILES 100000
+#ifndef NUM_CORE_FILES
+#define NUM_CORE_FILES 5
+#endif
+
+static coredumper_handle_fn coredump_vnode;
+static struct coredumper vnode_coredumper = {
+ .cd_name = "vnode_coredumper",
+ .cd_handle = coredump_vnode,
+};
+
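+/*
+ * Register the vnode-backed dumper with the generic coredump machinery
+ * at boot; it provides the traditional dump-to-file behavior.
+ */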
+SYSINIT(vnode_coredumper_register, SI_SUB_EXEC, SI_ORDER_ANY,
+ coredumper_register, &vnode_coredumper);
+
+_Static_assert(NUM_CORE_FILES >= 0 && NUM_CORE_FILES <= MAX_NUM_CORE_FILES,
+ "NUM_CORE_FILES is out of range (0 to " __STRING(MAX_NUM_CORE_FILES) ")");
+static int num_cores = NUM_CORE_FILES;
+
+static int capmode_coredump;
+SYSCTL_INT(_kern, OID_AUTO, capmode_coredump, CTLFLAG_RWTUN,
+ &capmode_coredump, 0, "Allow processes in capability mode to dump core");
+
+static int set_core_nodump_flag = 0;
+SYSCTL_INT(_kern, OID_AUTO, nodump_coredump, CTLFLAG_RW, &set_core_nodump_flag,
+ 0, "Enable setting the NODUMP flag on coredump files");
+
+static int coredump_devctl = 0;
+SYSCTL_INT(_kern, OID_AUTO, coredump_devctl, CTLFLAG_RW, &coredump_devctl,
+ 0, "Generate a devctl notification when processes coredump");
+
+/*
+ * corefilename[] is protected by the allproc_lock.
+ */
+static char corefilename[MAXPATHLEN] = { "%N.core" };
+TUNABLE_STR("kern.corefile", corefilename, sizeof(corefilename));
+
+static int
+sysctl_kern_corefile(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+
+ sx_xlock(&allproc_lock);
+ error = sysctl_handle_string(oidp, corefilename, sizeof(corefilename),
+ req);
+ sx_xunlock(&allproc_lock);
+
+ return (error);
+}
+SYSCTL_PROC(_kern, OID_AUTO, corefile, CTLTYPE_STRING | CTLFLAG_RW |
+ CTLFLAG_MPSAFE, 0, 0, sysctl_kern_corefile, "A",
+ "Process corefile name format string");
+
+static int
+sysctl_debug_num_cores_check(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+ int new_val;
+
+ new_val = num_cores;
+ error = sysctl_handle_int(oidp, &new_val, 0, req);
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+ if (new_val > MAX_NUM_CORE_FILES)
+ new_val = MAX_NUM_CORE_FILES;
+ if (new_val < 0)
+ new_val = 0;
+ num_cores = new_val;
+ return (0);
+}
+SYSCTL_PROC(_debug, OID_AUTO, ncores,
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, sizeof(int),
+ sysctl_debug_num_cores_check, "I",
+ "Maximum number of generated process corefiles while using index format");
+
+static void
+vnode_close_locked(struct thread *td, struct vnode *vp)
+{
+
+ VOP_UNLOCK(vp);
+ vn_close(vp, FWRITE, td->td_ucred, td);
+}
+
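+/*
+ * Write a buffer to the core vnode at the given offset; IO_RANGELOCKED
+ * tells the VFS that the caller already holds the vnode's range lock.
+ */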
+int
+core_vn_write(const struct coredump_writer *cdw, const void *base, size_t len,
+ off_t offset, enum uio_seg seg, struct ucred *cred, size_t *resid,
+ struct thread *td)
+{
+ struct coredump_vnode_ctx *ctx = cdw->ctx;
+
+ return (vn_rdwr_inchunks(UIO_WRITE, ctx->vp, __DECONST(void *, base),
+ len, offset, seg, IO_UNIT | IO_DIRECT | IO_RANGELOCKED,
+ cred, ctx->fcred, resid, td));
+}
+
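+/*
+ * Grow the core file to newsz, bracketing the size change with
+ * vn_start_write()/vn_finished_write() to cooperate with suspension.
+ */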
+int
+core_vn_extend(const struct coredump_writer *cdw, off_t newsz,
+ struct ucred *cred)
+{
+ struct coredump_vnode_ctx *ctx = cdw->ctx;
+ struct mount *mp;
+ int error;
+
+ error = vn_start_write(ctx->vp, &mp, V_WAIT);
+ if (error != 0)
+ return (error);
+ vn_lock(ctx->vp, LK_EXCLUSIVE | LK_RETRY);
+ error = vn_truncate_locked(ctx->vp, newsz, false, cred);
+ VOP_UNLOCK(ctx->vp);
+ vn_finished_write(mp);
+ return (error);
+}
+
+/*
+ * If the core format has a %I in it, then we need to check
+ * for existing corefiles before choosing a name.
+ * To do this we iterate over 0..ncores-1 looking for a core
+ * file name that does not yet exist. If all candidate names
+ * are already in use we choose the oldest file.
+ */
+static int
+corefile_open_last(struct thread *td, char *name, int indexpos,
+ int indexlen, int ncores, struct vnode **vpp)
+{
+ struct vnode *oldvp, *nextvp, *vp;
+ struct vattr vattr;
+ struct nameidata nd;
+ int error, i, flags, oflags, cmode;
+ char ch;
+ struct timespec lasttime;
+
+ nextvp = oldvp = NULL;
+ cmode = S_IRUSR | S_IWUSR;
+ oflags = VN_OPEN_NOAUDIT | VN_OPEN_NAMECACHE |
+ (capmode_coredump ? VN_OPEN_NOCAPCHECK : 0);
+
+ for (i = 0; i < ncores; i++) {
+ flags = O_CREAT | FWRITE | O_NOFOLLOW;
+
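+		/*
+		 * Splice the zero-padded index into the name in place,
+		 * saving and restoring the byte that snprintf's NUL
+		 * terminator would otherwise clobber.
+		 */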
+ ch = name[indexpos + indexlen];
+ (void)snprintf(name + indexpos, indexlen + 1, "%.*u", indexlen,
+ i);
+ name[indexpos + indexlen] = ch;
+
+ NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, name);
+ error = vn_open_cred(&nd, &flags, cmode, oflags, td->td_ucred,
+ NULL);
+ if (error != 0)
+ break;
+
+ vp = nd.ni_vp;
+ NDFREE_PNBUF(&nd);
+ if ((flags & O_CREAT) == O_CREAT) {
+ nextvp = vp;
+ break;
+ }
+
+ error = VOP_GETATTR(vp, &vattr, td->td_ucred);
+ if (error != 0) {
+ vnode_close_locked(td, vp);
+ break;
+ }
+
+ if (oldvp == NULL ||
+ lasttime.tv_sec > vattr.va_mtime.tv_sec ||
+ (lasttime.tv_sec == vattr.va_mtime.tv_sec &&
+ lasttime.tv_nsec >= vattr.va_mtime.tv_nsec)) {
+ if (oldvp != NULL)
+ vn_close(oldvp, FWRITE, td->td_ucred, td);
+ oldvp = vp;
+ VOP_UNLOCK(oldvp);
+ lasttime = vattr.va_mtime;
+ } else {
+ vnode_close_locked(td, vp);
+ }
+ }
+
+ if (oldvp != NULL) {
+ if (nextvp == NULL) {
+ if ((td->td_proc->p_flag & P_SUGID) != 0) {
+ error = EFAULT;
+ vn_close(oldvp, FWRITE, td->td_ucred, td);
+ } else {
+ nextvp = oldvp;
+ error = vn_lock(nextvp, LK_EXCLUSIVE);
+ if (error != 0) {
+ vn_close(nextvp, FWRITE, td->td_ucred,
+ td);
+ nextvp = NULL;
+ }
+ }
+ } else {
+ vn_close(oldvp, FWRITE, td->td_ucred, td);
+ }
+ }
+ if (error != 0) {
+ if (nextvp != NULL)
+ vnode_close_locked(td, oldvp);
+ } else {
+ *vpp = nextvp;
+ }
+
+ return (error);
+}
+
+/*
+ * corefile_open(comm, uid, pid, td, compress, signum, vpp, namep)
+ * Expand the name described in corefilename, using name, uid, pid, and
+ * signal number, and open/create the core file.
+ * corefilename is a printf-like string, with these format specifiers:
+ *	%H	hostname
+ *	%I	autoincrementing index
+ *	%N	name of process ("name")
+ *	%P	process id (pid)
+ *	%S	signal number (signum)
+ *	%U	user id (uid)
+ * For example, "%N.core" is the default; they can be disabled completely
+ * by using "/dev/null", or all core files can be stored in "/cores/%U/%N-%P".
+ * This is controlled by the sysctl variable kern.corefile (see above).
+ */
+static int
+corefile_open(const char *comm, uid_t uid, pid_t pid, struct thread *td,
+ int compress, int signum, struct vnode **vpp, char **namep)
+{
+ struct sbuf sb;
+ struct nameidata nd;
+ const char *format;
+ char *hostname, *name;
+ int cmode, error, flags, i, indexpos, indexlen, oflags, ncores;
+
+ hostname = NULL;
+ format = corefilename;
+ name = malloc(MAXPATHLEN, M_TEMP, M_WAITOK | M_ZERO);
+ indexlen = 0;
+ indexpos = -1;
+ ncores = num_cores;
+ (void)sbuf_new(&sb, name, MAXPATHLEN, SBUF_FIXEDLEN);
+ sx_slock(&allproc_lock);
+ for (i = 0; format[i] != '\0'; i++) {
+ switch (format[i]) {
+ case '%': /* Format character */
+ i++;
+ switch (format[i]) {
+ case '%':
+ sbuf_putc(&sb, '%');
+ break;
+ case 'H': /* hostname */
+ if (hostname == NULL) {
+ hostname = malloc(MAXHOSTNAMELEN,
+ M_TEMP, M_WAITOK);
+ }
+ getcredhostname(td->td_ucred, hostname,
+ MAXHOSTNAMELEN);
+ sbuf_cat(&sb, hostname);
+ break;
+ case 'I': /* autoincrementing index */
+ if (indexpos != -1) {
+ sbuf_printf(&sb, "%%I");
+ break;
+ }
+
+ indexpos = sbuf_len(&sb);
+ sbuf_printf(&sb, "%u", ncores - 1);
+ indexlen = sbuf_len(&sb) - indexpos;
+ break;
+ case 'N': /* process name */
+ sbuf_printf(&sb, "%s", comm);
+ break;
+ case 'P': /* process id */
+ sbuf_printf(&sb, "%u", pid);
+ break;
+ case 'S': /* signal number */
+ sbuf_printf(&sb, "%i", signum);
+ break;
+ case 'U': /* user id */
+ sbuf_printf(&sb, "%u", uid);
+ break;
+ default:
+ log(LOG_ERR,
+ "Unknown format character %c in "
+ "corename `%s'\n", format[i], format);
+ break;
+ }
+ break;
+ default:
+ sbuf_putc(&sb, format[i]);
+ break;
+ }
+ }
+ sx_sunlock(&allproc_lock);
+ free(hostname, M_TEMP);
+ if (compress == COMPRESS_GZIP)
+ sbuf_cat(&sb, GZIP_SUFFIX);
+ else if (compress == COMPRESS_ZSTD)
+ sbuf_cat(&sb, ZSTD_SUFFIX);
+ if (sbuf_error(&sb) != 0) {
+ log(LOG_ERR, "pid %ld (%s), uid (%lu): corename is too "
+ "long\n", (long)pid, comm, (u_long)uid);
+ sbuf_delete(&sb);
+ free(name, M_TEMP);
+ return (ENOMEM);
+ }
+ sbuf_finish(&sb);
+ sbuf_delete(&sb);
+
+ if (indexpos != -1) {
+ error = corefile_open_last(td, name, indexpos, indexlen, ncores,
+ vpp);
+ if (error != 0) {
+ log(LOG_ERR,
+ "pid %d (%s), uid (%u): Path `%s' failed "
+ "on initial open test, error = %d\n",
+ pid, comm, uid, name, error);
+ }
+ } else {
+ cmode = S_IRUSR | S_IWUSR;
+ oflags = VN_OPEN_NOAUDIT | VN_OPEN_NAMECACHE |
+ (capmode_coredump ? VN_OPEN_NOCAPCHECK : 0);
+ flags = O_CREAT | FWRITE | O_NOFOLLOW;
+ if ((td->td_proc->p_flag & P_SUGID) != 0)
+ flags |= O_EXCL;
+
+ NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, name);
+ error = vn_open_cred(&nd, &flags, cmode, oflags, td->td_ucred,
+ NULL);
+ if (error == 0) {
+ *vpp = nd.ni_vp;
+ NDFREE_PNBUF(&nd);
+ }
+ }
+
+ if (error != 0) {
+#ifdef AUDIT
+ audit_proc_coredump(td, name, error);
+#endif
+ free(name, M_TEMP);
+ return (error);
+ }
+ *namep = name;
+ return (0);
+}
+
+/*
+ * The vnode dumper is the traditional coredump handler. Policy and limits
+ * have generally been checked by the caller already, so this handler creates
+ * the coredump name and passes a vnode and a size limit on to the
+ * process-specific coredump routine, if there is one. If there _is not_ one,
+ * it returns ENOSYS; otherwise it returns the error from the process-specific
+ * routine.
+ */
+static int
+coredump_vnode(struct thread *td, off_t limit)
+{
+ struct proc *p = td->td_proc;
+ struct ucred *cred = td->td_ucred;
+ struct vnode *vp;
+ struct coredump_vnode_ctx wctx;
+ struct coredump_writer cdw = { };
+ struct flock lf;
+ struct vattr vattr;
+ size_t fullpathsize;
+ int error, error1, jid, locked, ppid, sig;
+ char *name; /* name of corefile */
+ void *rl_cookie;
+ char *fullpath, *freepath = NULL;
+ struct sbuf *sb;
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+
+ ppid = p->p_oppid;
+ sig = p->p_sig;
+ jid = p->p_ucred->cr_prison->pr_id;
+ PROC_UNLOCK(p);
+
+ error = corefile_open(p->p_comm, cred->cr_uid, p->p_pid, td,
+ compress_user_cores, sig, &vp, &name);
+ if (error != 0)
+ return (error);
+
+ /*
+ * Don't dump to non-regular files or files with links.
+ * Do not dump into system files. Effective user must own the corefile.
+ */
+ if (vp->v_type != VREG || VOP_GETATTR(vp, &vattr, cred) != 0 ||
+ vattr.va_nlink != 1 || (vp->v_vflag & VV_SYSTEM) != 0 ||
+ vattr.va_uid != cred->cr_uid) {
+ VOP_UNLOCK(vp);
+ error = EFAULT;
+ goto out;
+ }
+
+ VOP_UNLOCK(vp);
+
+ /* Postpone other writers, including core dumps of other processes. */
+ rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
+
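+	/*
+	 * Also take a best-effort advisory write lock on the file;
+	 * on failure, remember that so the unlock is skipped later.
+	 */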
+ lf.l_whence = SEEK_SET;
+ lf.l_start = 0;
+ lf.l_len = 0;
+ lf.l_type = F_WRLCK;
+ locked = (VOP_ADVLOCK(vp, (caddr_t)p, F_SETLK, &lf, F_FLOCK) == 0);
+
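+	/*
+	 * Truncate any existing core file and, if enabled, mark it with
+	 * UF_NODUMP so that dump(8)-style backups skip it.
+	 */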
+ VATTR_NULL(&vattr);
+ vattr.va_size = 0;
+ if (set_core_nodump_flag)
+ vattr.va_flags = UF_NODUMP;
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+ VOP_SETATTR(vp, &vattr, cred);
+ VOP_UNLOCK(vp);
+ PROC_LOCK(p);
+ p->p_acflag |= ACORE;
+ PROC_UNLOCK(p);
+
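+	/* Hand the vnode and its writer callbacks to the ABI's dumper. */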
+ wctx.vp = vp;
+ wctx.fcred = NOCRED;
+
+ cdw.ctx = &wctx;
+ cdw.write_fn = core_vn_write;
+ cdw.extend_fn = core_vn_extend;
+
+ if (p->p_sysent->sv_coredump != NULL) {
+ error = p->p_sysent->sv_coredump(td, &cdw, limit, 0);
+ } else {
+ error = ENOSYS;
+ }
+
+ if (locked) {
+ lf.l_type = F_UNLCK;
+ VOP_ADVLOCK(vp, (caddr_t)p, F_UNLCK, &lf, F_FLOCK);
+ }
+ vn_rangelock_unlock(vp, rl_cookie);
+
+ /*
+ * Notify the userland helper that a process triggered a core dump.
+ * This allows the helper to run an automated debugging session.
+ */
+ if (error != 0 || coredump_devctl == 0)
+ goto out;
+ sb = sbuf_new_auto();
+ if (vn_fullpath_global(p->p_textvp, &fullpath, &freepath) != 0)
+ goto out2;
+ sbuf_cat(sb, "comm=\"");
+ devctl_safe_quote_sb(sb, fullpath);
+ free(freepath, M_TEMP);
+ sbuf_cat(sb, "\" core=\"");
+
+	/*
+	 * We can't look up the core file's vnode directly. When we're
+	 * replacing a core, among other times, we flush the name cache,
+	 * so the lookup would fail. Instead, if the path of the core is
+	 * relative, prepend the current directory to it.
+	 */
+ if (name[0] != '/') {
+ fullpathsize = MAXPATHLEN;
+ freepath = malloc(fullpathsize, M_TEMP, M_WAITOK);
+ if (vn_getcwd(freepath, &fullpath, &fullpathsize) != 0) {
+ free(freepath, M_TEMP);
+ goto out2;
+ }
+ devctl_safe_quote_sb(sb, fullpath);
+ free(freepath, M_TEMP);
+ sbuf_putc(sb, '/');
+ }
+ devctl_safe_quote_sb(sb, name);
+ sbuf_putc(sb, '"');
+
+ sbuf_printf(sb, " jid=%d pid=%d ppid=%d signo=%d",
+ jid, p->p_pid, ppid, sig);
+ if (sbuf_finish(sb) == 0)
+ devctl_notify("kernel", "signal", "coredump", sbuf_data(sb));
+out2:
+ sbuf_delete(sb);
+out:
+ error1 = vn_close(vp, FWRITE, cred, td);
+ if (error == 0)
+ error = error1;
+#ifdef AUDIT
+ audit_proc_coredump(td, name, error);
+#endif
+ free(name, M_TEMP);
+ return (error);
+}
diff --git a/sys/kern/imgact_elf.c b/sys/kern/imgact_elf.c
index b7ffbe68b483..2690ad3b2679 100644
--- a/sys/kern/imgact_elf.c
+++ b/sys/kern/imgact_elf.c
@@ -64,6 +64,7 @@
#include <sys/syscall.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
+#include <sys/ucoredump.h>
#include <sys/vnode.h>
#include <sys/syslog.h>
#include <sys/eventhandler.h>
@@ -1562,9 +1563,6 @@ struct note_info {
TAILQ_HEAD(note_info_list, note_info);
-extern int compress_user_cores;
-extern int compress_user_cores_level;
-
static void cb_put_phdr(vm_map_entry_t, void *);
static void cb_size_segment(vm_map_entry_t, void *);
static void each_dumpable_segment(struct thread *, segment_callback, void *,
@@ -1595,7 +1593,7 @@ core_compressed_write(void *base, size_t len, off_t offset, void *arg)
}
int
-__elfN(coredump)(struct thread *td, struct vnode *vp, off_t limit, int flags)
+__elfN(coredump)(struct thread *td, struct coredump_writer *cdw, off_t limit, int flags)
{
struct ucred *cred = td->td_ucred;
int compm, error = 0;
@@ -1625,9 +1623,8 @@ __elfN(coredump)(struct thread *td, struct vnode *vp, off_t limit, int flags)
/* Set up core dump parameters. */
params.offset = 0;
params.active_cred = cred;
- params.file_cred = NOCRED;
params.td = td;
- params.vp = vp;
+ params.cdw = cdw;
params.comp = NULL;
#ifdef RACCT
@@ -1662,6 +1659,12 @@ __elfN(coredump)(struct thread *td, struct vnode *vp, off_t limit, int flags)
tmpbuf = malloc(CORE_BUF_SIZE, M_TEMP, M_WAITOK | M_ZERO);
}
+ if (cdw->init_fn != NULL) {
+ error = (*cdw->init_fn)(cdw, &params);
+ if (error != 0)
+ goto done;
+ }
+
/*
* Allocate memory for building the header, fill it up,
* and write it out following the notes.
diff --git a/sys/kern/init_sysent.c b/sys/kern/init_sysent.c
index a48a513aa3b5..91792430d24c 100644
--- a/sys/kern/init_sysent.c
+++ b/sys/kern/init_sysent.c
@@ -658,5 +658,7 @@ struct sysent sysent[] = {
{ .sy_narg = AS(getrlimitusage_args), .sy_call = (sy_call_t *)sys_getrlimitusage, .sy_auevent = AUE_NULL, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 589 = getrlimitusage */
{ .sy_narg = AS(fchroot_args), .sy_call = (sy_call_t *)sys_fchroot, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 590 = fchroot */
{ .sy_narg = AS(setcred_args), .sy_call = (sy_call_t *)sys_setcred, .sy_auevent = AUE_SETCRED, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 591 = setcred */
- { .sy_narg = AS(exterrctl_args), .sy_call = (sy_call_t *)sys_exterrctl, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 592 = exterrctl */
+ { .sy_narg = AS(exterrctl_args), .sy_call = (sy_call_t *)sys_exterrctl, .sy_auevent = AUE_NULL, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 592 = exterrctl */
+ { .sy_narg = AS(inotify_add_watch_at_args), .sy_call = (sy_call_t *)sys_inotify_add_watch_at, .sy_auevent = AUE_INOTIFY, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 593 = inotify_add_watch_at */
+ { .sy_narg = AS(inotify_rm_watch_args), .sy_call = (sy_call_t *)sys_inotify_rm_watch, .sy_auevent = AUE_INOTIFY, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 594 = inotify_rm_watch */
};
diff --git a/sys/kern/kern_cpuset.c b/sys/kern/kern_cpuset.c
index 5d9e2f2f326b..d7eb82d5f259 100644
--- a/sys/kern/kern_cpuset.c
+++ b/sys/kern/kern_cpuset.c
@@ -530,7 +530,7 @@ _domainset_create(struct domainset *domain, struct domainlist *freelist)
* remove them and update the domainset accordingly. If only empty
* domains are present, we must return failure.
*/
-static bool
+bool
domainset_empty_vm(struct domainset *domain)
{
domainset_t empty;
@@ -2409,82 +2409,92 @@ sys_cpuset_setdomain(struct thread *td, struct cpuset_setdomain_args *uap)
}
int
-kern_cpuset_setdomain(struct thread *td, cpulevel_t level, cpuwhich_t which,
- id_t id, size_t domainsetsize, const domainset_t *maskp, int policy,
- const struct cpuset_copy_cb *cb)
+domainset_populate(struct domainset *domain, const domainset_t *mask, int policy,
+ size_t mask_size)
{
- struct cpuset *nset;
- struct cpuset *set;
- struct thread *ttd;
- struct proc *p;
- struct domainset domain;
- domainset_t *mask;
- int error;
- if (domainsetsize < sizeof(domainset_t) ||
- domainsetsize > DOMAINSET_MAXSIZE / NBBY)
- return (ERANGE);
if (policy <= DOMAINSET_POLICY_INVALID ||
- policy > DOMAINSET_POLICY_MAX)
+ policy > DOMAINSET_POLICY_MAX) {
return (EINVAL);
- error = cpuset_check_capabilities(td, level, which, id);
- if (error != 0)
- return (error);
- memset(&domain, 0, sizeof(domain));
- mask = malloc(domainsetsize, M_TEMP, M_WAITOK | M_ZERO);
- error = cb->cpuset_copyin(maskp, mask, domainsetsize);
- if (error)
- goto out;
+ }
+
/*
* Verify that no high bits are set.
*/
- if (domainsetsize > sizeof(domainset_t)) {
- char *end;
- char *cp;
+ if (mask_size > sizeof(domainset_t)) {
+ const char *end;
+ const char *cp;
- end = cp = (char *)&mask->__bits;
- end += domainsetsize;
+ end = cp = (const char *)&mask->__bits;
+ end += mask_size;
cp += sizeof(domainset_t);
- while (cp != end)
+ while (cp != end) {
if (*cp++ != 0) {
- error = EINVAL;
- goto out;
+ return (EINVAL);
}
+ }
}
if (DOMAINSET_EMPTY(mask)) {
- error = EDEADLK;
- goto out;
+ return (EDEADLK);
}
- DOMAINSET_COPY(mask, &domain.ds_mask);
- domain.ds_policy = policy;
+ DOMAINSET_COPY(mask, &domain->ds_mask);
+ domain->ds_policy = policy;
/*
* Sanitize the provided mask.
*/
- if (!DOMAINSET_SUBSET(&all_domains, &domain.ds_mask)) {
- error = EINVAL;
- goto out;
+ if (!DOMAINSET_SUBSET(&all_domains, &domain->ds_mask)) {
+ return (EINVAL);
}
/* Translate preferred policy into a mask and fallback. */
if (policy == DOMAINSET_POLICY_PREFER) {
/* Only support a single preferred domain. */
- if (DOMAINSET_COUNT(&domain.ds_mask) != 1) {
- error = EINVAL;
- goto out;
+ if (DOMAINSET_COUNT(&domain->ds_mask) != 1) {
+ return (EINVAL);
}
- domain.ds_prefer = DOMAINSET_FFS(&domain.ds_mask) - 1;
+ domain->ds_prefer = DOMAINSET_FFS(&domain->ds_mask) - 1;
/* This will be constrained by domainset_shadow(). */
- DOMAINSET_COPY(&all_domains, &domain.ds_mask);
+ DOMAINSET_COPY(&all_domains, &domain->ds_mask);
}
+ return (0);
+}
+
+int
+kern_cpuset_setdomain(struct thread *td, cpulevel_t level, cpuwhich_t which,
+ id_t id, size_t domainsetsize, const domainset_t *maskp, int policy,
+ const struct cpuset_copy_cb *cb)
+{
+ struct cpuset *nset;
+ struct cpuset *set;
+ struct thread *ttd;
+ struct proc *p;
+ struct domainset domain;
+ domainset_t *mask;
+ int error;
+
+ error = cpuset_check_capabilities(td, level, which, id);
+ if (error != 0)
+ return (error);
+ if (domainsetsize < sizeof(domainset_t) ||
+ domainsetsize > DOMAINSET_MAXSIZE / NBBY)
+ return (ERANGE);
+ memset(&domain, 0, sizeof(domain));
+ mask = malloc(domainsetsize, M_TEMP, M_WAITOK | M_ZERO);
+ error = cb->cpuset_copyin(maskp, mask, domainsetsize);
+ if (error)
+ goto out;
+ error = domainset_populate(&domain, mask, policy, domainsetsize);
+ if (error)
+ goto out;
+
/*
* When given an impossible policy, fall back to interleaving
* across all domains.
*/
if (domainset_empty_vm(&domain))
domainset_copy(domainset2, &domain);
-
switch (level) {
case CPU_LEVEL_ROOT:
case CPU_LEVEL_CPUSET:
diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c
index ac4b6ac3f457..a27ab33b34da 100644
--- a/sys/kern/kern_descrip.c
+++ b/sys/kern/kern_descrip.c
@@ -38,9 +38,11 @@
#include "opt_ddb.h"
#include "opt_ktrace.h"
+#define EXTERR_CATEGORY EXTERR_CAT_FILEDESC
#include <sys/systm.h>
#include <sys/capsicum.h>
#include <sys/conf.h>
+#include <sys/exterrvar.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
@@ -478,6 +480,92 @@ kern_fcntl_freebsd(struct thread *td, int fd, int cmd, intptr_t arg)
return (error);
}
+struct flags_trans_elem {
+ u_int f;
+ u_int t;
+};
+
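+/*
+ * Translate a set of flags from one namespace to another via a table
+ * of (from, to) bit pairs.
+ */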
+static u_int
+flags_trans(const struct flags_trans_elem *ftes, int nitems, u_int from_flags)
+{
+ u_int res;
+ int i;
+
+ res = 0;
+ for (i = 0; i < nitems; i++) {
+ if ((from_flags & ftes[i].f) != 0)
+ res |= ftes[i].t;
+ }
+ return (res);
+}
+
+static uint8_t
+fd_to_fde_flags(int fd_flags)
+{
+ static const struct flags_trans_elem fd_to_fde_flags_s[] = {
+ { .f = FD_CLOEXEC, .t = UF_EXCLOSE },
+ { .f = FD_CLOFORK, .t = UF_FOCLOSE },
+ { .f = FD_RESOLVE_BENEATH, .t = UF_RESOLVE_BENEATH },
+ };
+
+ return (flags_trans(fd_to_fde_flags_s, nitems(fd_to_fde_flags_s),
+ fd_flags));
+}
+
+static int
+fde_to_fd_flags(uint8_t fde_flags)
+{
+ static const struct flags_trans_elem fde_to_fd_flags_s[] = {
+ { .f = UF_EXCLOSE, .t = FD_CLOEXEC },
+ { .f = UF_FOCLOSE, .t = FD_CLOFORK },
+ { .f = UF_RESOLVE_BENEATH, .t = FD_RESOLVE_BENEATH },
+ };
+
+ return (flags_trans(fde_to_fd_flags_s, nitems(fde_to_fd_flags_s),
+ fde_flags));
+}
+
+static uint8_t
+fddup_to_fde_flags(int fddup_flags)
+{
+ static const struct flags_trans_elem fddup_to_fde_flags_s[] = {
+ { .f = FDDUP_FLAG_CLOEXEC, .t = UF_EXCLOSE },
+ { .f = FDDUP_FLAG_CLOFORK, .t = UF_FOCLOSE },
+ };
+
+ return (flags_trans(fddup_to_fde_flags_s, nitems(fddup_to_fde_flags_s),
+ fddup_flags));
+}
+
+static uint8_t
+close_range_to_fde_flags(int close_range_flags)
+{
+ static const struct flags_trans_elem close_range_to_fde_flags_s[] = {
+ { .f = CLOSE_RANGE_CLOEXEC, .t = UF_EXCLOSE },
+ { .f = CLOSE_RANGE_CLOFORK, .t = UF_FOCLOSE },
+ };
+
+ return (flags_trans(close_range_to_fde_flags_s,
+ nitems(close_range_to_fde_flags_s), close_range_flags));
+}
+
+static uint8_t
+open_to_fde_flags(int open_flags, bool sticky_orb)
+{
+ static const struct flags_trans_elem open_to_fde_flags_s[] = {
+ { .f = O_CLOEXEC, .t = UF_EXCLOSE },
+ { .f = O_CLOFORK, .t = UF_FOCLOSE },
+ { .f = O_RESOLVE_BENEATH, .t = UF_RESOLVE_BENEATH },
+ };
+#if defined(__clang__) && __clang_major__ >= 19
+ _Static_assert(open_to_fde_flags_s[nitems(open_to_fde_flags_s) - 1].f ==
+ O_RESOLVE_BENEATH, "O_RESOLVE_BENEATH must be last, for sticky_orb");
+#endif
+
+ return (flags_trans(open_to_fde_flags_s, nitems(open_to_fde_flags_s) -
+ (sticky_orb ? 0 : 1), open_flags));
+}
+
int
kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg)
{
@@ -492,6 +580,7 @@ kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg)
int error, flg, kif_sz, seals, tmp, got_set, got_cleared;
uint64_t bsize;
off_t foffset;
+ int flags;
error = 0;
flg = F_POSIX;
@@ -511,6 +600,11 @@ kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg)
error = kern_dup(td, FDDUP_FCNTL, FDDUP_FLAG_CLOEXEC, fd, tmp);
break;
+ case F_DUPFD_CLOFORK:
+ tmp = arg;
+ error = kern_dup(td, FDDUP_FCNTL, FDDUP_FLAG_CLOFORK, fd, tmp);
+ break;
+
case F_DUP2FD:
tmp = arg;
error = kern_dup(td, FDDUP_FIXED, 0, fd, tmp);
@@ -526,10 +620,7 @@ kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg)
FILEDESC_SLOCK(fdp);
fde = fdeget_noref(fdp, fd);
if (fde != NULL) {
- td->td_retval[0] =
- ((fde->fde_flags & UF_EXCLOSE) ? FD_CLOEXEC : 0) |
- ((fde->fde_flags & UF_RESOLVE_BENEATH) ?
- FD_RESOLVE_BENEATH : 0);
+ td->td_retval[0] = fde_to_fd_flags(fde->fde_flags);
error = 0;
}
FILEDESC_SUNLOCK(fdp);
@@ -543,10 +634,8 @@ kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg)
/*
* UF_RESOLVE_BENEATH is sticky and cannot be cleared.
*/
- fde->fde_flags = (fde->fde_flags & ~UF_EXCLOSE) |
- ((arg & FD_CLOEXEC) != 0 ? UF_EXCLOSE : 0) |
- ((arg & FD_RESOLVE_BENEATH) != 0 ?
- UF_RESOLVE_BENEATH : 0);
+ fde->fde_flags = (fde->fde_flags &
+ ~(UF_EXCLOSE | UF_FOCLOSE)) | fd_to_fde_flags(arg);
error = 0;
}
FILEDESC_XUNLOCK(fdp);
@@ -916,7 +1005,17 @@ revert_f_setfl:
break;
default:
- error = EINVAL;
+ if ((cmd & ((1u << F_DUP3FD_SHIFT) - 1)) != F_DUP3FD)
+ return (EXTERROR(EINVAL, "invalid fcntl cmd"));
+ /* Handle F_DUP3FD */
+ flags = (cmd >> F_DUP3FD_SHIFT);
+ if ((flags & ~(FD_CLOEXEC | FD_CLOFORK)) != 0)
+ return (EXTERROR(EINVAL, "invalid flags for F_DUP3FD"));
+ tmp = arg;
+ error = kern_dup(td, FDDUP_FIXED,
+ ((flags & FD_CLOEXEC) != 0 ? FDDUP_FLAG_CLOEXEC : 0) |
+ ((flags & FD_CLOFORK) != 0 ? FDDUP_FLAG_CLOFORK : 0),
+ fd, tmp);
break;
}
return (error);
@@ -946,7 +1045,7 @@ kern_dup(struct thread *td, u_int mode, int flags, int old, int new)
fdp = p->p_fd;
oioctls = NULL;
- MPASS((flags & ~(FDDUP_FLAG_CLOEXEC)) == 0);
+ MPASS((flags & ~(FDDUP_FLAG_CLOEXEC | FDDUP_FLAG_CLOFORK)) == 0);
MPASS(mode < FDDUP_LASTMODE);
AUDIT_ARG_FD(old);
@@ -971,8 +1070,7 @@ kern_dup(struct thread *td, u_int mode, int flags, int old, int new)
goto unlock;
if (mode == FDDUP_FIXED && old == new) {
td->td_retval[0] = new;
- if (flags & FDDUP_FLAG_CLOEXEC)
- fdp->fd_ofiles[new].fde_flags |= UF_EXCLOSE;
+ fdp->fd_ofiles[new].fde_flags |= fddup_to_fde_flags(flags);
error = 0;
goto unlock;
}
@@ -1047,10 +1145,8 @@ kern_dup(struct thread *td, u_int mode, int flags, int old, int new)
fde_copy(oldfde, newfde);
filecaps_copy_finish(&oldfde->fde_caps, &newfde->fde_caps,
nioctls);
- if ((flags & FDDUP_FLAG_CLOEXEC) != 0)
- newfde->fde_flags = oldfde->fde_flags | UF_EXCLOSE;
- else
- newfde->fde_flags = oldfde->fde_flags & ~UF_EXCLOSE;
+ newfde->fde_flags = (oldfde->fde_flags & ~(UF_EXCLOSE | UF_FOCLOSE)) |
+ fddup_to_fde_flags(flags);
#ifdef CAPABILITIES
seqc_write_end(&newfde->fde_seqc);
#endif
@@ -1416,13 +1512,14 @@ kern_close(struct thread *td, int fd)
}
static int
-close_range_cloexec(struct thread *td, u_int lowfd, u_int highfd)
+close_range_flags(struct thread *td, u_int lowfd, u_int highfd, int flags)
{
struct filedesc *fdp;
struct fdescenttbl *fdt;
struct filedescent *fde;
- int fd;
+ int fd, fde_flags;
+ fde_flags = close_range_to_fde_flags(flags);
fdp = td->td_proc->p_fd;
FILEDESC_XLOCK(fdp);
fdt = atomic_load_ptr(&fdp->fd_files);
@@ -1434,7 +1531,7 @@ close_range_cloexec(struct thread *td, u_int lowfd, u_int highfd)
for (; fd <= highfd; fd++) {
fde = &fdt->fdt_ofiles[fd];
if (fde->fde_file != NULL)
- fde->fde_flags |= UF_EXCLOSE;
+ fde->fde_flags |= fde_flags;
}
out_locked:
FILEDESC_XUNLOCK(fdp);
@@ -1492,8 +1589,8 @@ kern_close_range(struct thread *td, int flags, u_int lowfd, u_int highfd)
return (EINVAL);
}
- if ((flags & CLOSE_RANGE_CLOEXEC) != 0)
- return (close_range_cloexec(td, lowfd, highfd));
+ if ((flags & (CLOSE_RANGE_CLOEXEC | CLOSE_RANGE_CLOFORK)) != 0)
+ return (close_range_flags(td, lowfd, highfd, flags));
return (close_range_impl(td, lowfd, highfd));
}
@@ -1513,7 +1610,7 @@ sys_close_range(struct thread *td, struct close_range_args *uap)
AUDIT_ARG_CMD(uap->highfd);
AUDIT_ARG_FFLAGS(uap->flags);
- if ((uap->flags & ~(CLOSE_RANGE_CLOEXEC)) != 0)
+ if ((uap->flags & ~(CLOSE_RANGE_CLOEXEC | CLOSE_RANGE_CLOFORK)) != 0)
return (EINVAL);
return (kern_close_range(td, uap->flags, uap->lowfd, uap->highfd));
}
@@ -2171,8 +2268,7 @@ _finstall(struct filedesc *fdp, struct file *fp, int fd, int flags,
seqc_write_begin(&fde->fde_seqc);
#endif
fde->fde_file = fp;
- fde->fde_flags = ((flags & O_CLOEXEC) != 0 ? UF_EXCLOSE : 0) |
- ((flags & O_RESOLVE_BENEATH) != 0 ? UF_RESOLVE_BENEATH : 0);
+ fde->fde_flags = open_to_fde_flags(flags, true);
if (fcaps != NULL)
filecaps_move(fcaps, &fde->fde_caps);
else
@@ -2432,6 +2528,7 @@ fdcopy(struct filedesc *fdp)
newfdp->fd_freefile = fdp->fd_freefile;
FILEDESC_FOREACH_FDE(fdp, i, ofde) {
if ((ofde->fde_file->f_ops->fo_flags & DFLAG_PASSABLE) == 0 ||
+ (ofde->fde_flags & UF_FOCLOSE) != 0 ||
!fhold(ofde->fde_file)) {
if (newfdp->fd_freefile == fdp->fd_freefile)
newfdp->fd_freefile = i;
@@ -2729,6 +2826,12 @@ fdcloseexec(struct thread *td)
fdfree(fdp, i);
(void) closefp(fdp, i, fp, td, false, false);
FILEDESC_UNLOCK_ASSERT(fdp);
+ } else if (fde->fde_flags & UF_FOCLOSE) {
+ /*
+ * https://austingroupbugs.net/view.php?id=1851
+ * FD_CLOFORK should not be preserved across exec
+ */
+ fde->fde_flags &= ~UF_FOCLOSE;
}
}
}
diff --git a/sys/kern/kern_exec.c b/sys/kern/kern_exec.c
index 03268365891e..0fc2d0e7f1bc 100644
--- a/sys/kern/kern_exec.c
+++ b/sys/kern/kern_exec.c
@@ -70,6 +70,7 @@
#include <sys/sysent.h>
#include <sys/sysproto.h>
#include <sys/timers.h>
+#include <sys/ucoredump.h>
#include <sys/umtxvar.h>
#include <sys/vnode.h>
#include <sys/wait.h>
@@ -2002,10 +2003,14 @@ int
core_write(struct coredump_params *cp, const void *base, size_t len,
off_t offset, enum uio_seg seg, size_t *resid)
{
+ return ((*cp->cdw->write_fn)(cp->cdw, base, len, offset, seg,
+ cp->active_cred, resid, cp->td));
+}
- return (vn_rdwr_inchunks(UIO_WRITE, cp->vp, __DECONST(void *, base),
- len, offset, seg, IO_UNIT | IO_DIRECT | IO_RANGELOCKED,
- cp->active_cred, cp->file_cred, resid, cp->td));
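+/*
+ * Extend the core file to newsz through the writer's extend callback
+ * rather than truncating a vnode directly.
+ */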
+static int
+core_extend(struct coredump_params *cp, off_t newsz)
+{
+ return ((*cp->cdw->extend_fn)(cp->cdw, newsz, cp->active_cred));
}
int
@@ -2013,7 +2018,6 @@ core_output(char *base, size_t len, off_t offset, struct coredump_params *cp,
void *tmpbuf)
{
vm_map_t map;
- struct mount *mp;
size_t resid, runlen;
int error;
bool success;
@@ -2068,14 +2072,7 @@ core_output(char *base, size_t len, off_t offset, struct coredump_params *cp,
}
}
if (!success) {
- error = vn_start_write(cp->vp, &mp, V_WAIT);
- if (error != 0)
- break;
- vn_lock(cp->vp, LK_EXCLUSIVE | LK_RETRY);
- error = vn_truncate_locked(cp->vp, offset + runlen,
- false, cp->td->td_ucred);
- VOP_UNLOCK(cp->vp);
- vn_finished_write(mp);
+ error = core_extend(cp, offset + runlen);
if (error != 0)
break;
}
diff --git a/sys/kern/kern_jail.c b/sys/kern/kern_jail.c
index d4529e096929..7ef1d19f0ea8 100644
--- a/sys/kern/kern_jail.c
+++ b/sys/kern/kern_jail.c
@@ -3466,7 +3466,7 @@ prison_check_af(struct ucred *cred, int af)
pr = cred->cr_prison;
#ifdef VIMAGE
/* Prisons with their own network stack are not limited. */
- if (prison_owns_vnet(cred))
+ if (prison_owns_vnet(pr))
return (0);
#endif
@@ -3531,7 +3531,7 @@ prison_if(struct ucred *cred, const struct sockaddr *sa)
KASSERT(sa != NULL, ("%s: sa is NULL", __func__));
#ifdef VIMAGE
- if (prison_owns_vnet(cred))
+ if (prison_owns_vnet(cred->cr_prison))
return (0);
#endif
@@ -3648,7 +3648,7 @@ jailed_without_vnet(struct ucred *cred)
if (!jailed(cred))
return (false);
#ifdef VIMAGE
- if (prison_owns_vnet(cred))
+ if (prison_owns_vnet(cred->cr_prison))
return (false);
#endif
@@ -3711,20 +3711,17 @@ getjailname(struct ucred *cred, char *name, size_t len)
#ifdef VIMAGE
/*
- * Determine whether the prison represented by cred owns
- * its vnet rather than having it inherited.
- *
- * Returns true in case the prison owns the vnet, false otherwise.
+ * Determine whether the prison owns its VNET.
*/
bool
-prison_owns_vnet(struct ucred *cred)
+prison_owns_vnet(struct prison *pr)
{
/*
* vnets cannot be added/removed after jail creation,
* so no need to lock here.
*/
- return ((cred->cr_prison->pr_flags & PR_VNET) != 0);
+ return ((pr->pr_flags & PR_VNET) != 0);
}
#endif
@@ -4425,7 +4422,7 @@ sysctl_jail_vnet(SYSCTL_HANDLER_ARGS)
#ifdef VIMAGE
struct ucred *cred = req->td->td_ucred;
- havevnet = jailed(cred) && prison_owns_vnet(cred);
+ havevnet = jailed(cred) && prison_owns_vnet(cred->cr_prison);
#else
havevnet = 0;
#endif
diff --git a/sys/kern/kern_prot.c b/sys/kern/kern_prot.c
index d9aeec68e620..0f0bc056cafd 100644
--- a/sys/kern/kern_prot.c
+++ b/sys/kern/kern_prot.c
@@ -287,7 +287,7 @@ sys_getgid(struct thread *td, struct getgid_args *uap)
td->td_retval[0] = td->td_ucred->cr_rgid;
#if defined(COMPAT_43)
- td->td_retval[1] = td->td_ucred->cr_groups[0];
+ td->td_retval[1] = td->td_ucred->cr_gid;
#endif
return (0);
}
@@ -307,7 +307,7 @@ int
sys_getegid(struct thread *td, struct getegid_args *uap)
{
- td->td_retval[0] = td->td_ucred->cr_groups[0];
+ td->td_retval[0] = td->td_ucred->cr_gid;
return (0);
}
@@ -1080,7 +1080,7 @@ sys_setgid(struct thread *td, struct setgid_args *uap)
gid != oldcred->cr_svgid && /* allow setgid(saved gid) */
#endif
#ifdef POSIX_APPENDIX_B_4_2_2 /* Use BSD-compat clause from B.4.2.2 */
- gid != oldcred->cr_groups[0] && /* allow setgid(getegid()) */
+ gid != oldcred->cr_gid && /* allow setgid(getegid()) */
#endif
(error = priv_check_cred(oldcred, PRIV_CRED_SETGID)) != 0)
goto fail;
@@ -1092,7 +1092,7 @@ sys_setgid(struct thread *td, struct setgid_args *uap)
*/
if (
#ifdef POSIX_APPENDIX_B_4_2_2 /* use the clause from B.4.2.2 */
- gid == oldcred->cr_groups[0] ||
+ gid == oldcred->cr_gid ||
#endif
/* We are using privs. */
priv_check_cred(oldcred, PRIV_CRED_SETGID) == 0)
@@ -1121,7 +1121,7 @@ sys_setgid(struct thread *td, struct setgid_args *uap)
* In all cases permitted cases, we are changing the egid.
* Copy credentials so other references do not see our changes.
*/
- if (oldcred->cr_groups[0] != gid) {
+ if (oldcred->cr_gid != gid) {
change_egid(newcred, gid);
setsugid(p);
}
@@ -1167,7 +1167,7 @@ sys_setegid(struct thread *td, struct setegid_args *uap)
(error = priv_check_cred(oldcred, PRIV_CRED_SETEGID)) != 0)
goto fail;
- if (oldcred->cr_groups[0] != egid) {
+ if (oldcred->cr_gid != egid) {
change_egid(newcred, egid);
setsugid(p);
}
@@ -1393,12 +1393,12 @@ sys_setregid(struct thread *td, struct setregid_args *uap)
if (((rgid != (gid_t)-1 && rgid != oldcred->cr_rgid &&
rgid != oldcred->cr_svgid) ||
- (egid != (gid_t)-1 && egid != oldcred->cr_groups[0] &&
+ (egid != (gid_t)-1 && egid != oldcred->cr_gid &&
egid != oldcred->cr_rgid && egid != oldcred->cr_svgid)) &&
(error = priv_check_cred(oldcred, PRIV_CRED_SETREGID)) != 0)
goto fail;
- if (egid != (gid_t)-1 && oldcred->cr_groups[0] != egid) {
+ if (egid != (gid_t)-1 && oldcred->cr_gid != egid) {
change_egid(newcred, egid);
setsugid(p);
}
@@ -1406,9 +1406,9 @@ sys_setregid(struct thread *td, struct setregid_args *uap)
change_rgid(newcred, rgid);
setsugid(p);
}
- if ((rgid != (gid_t)-1 || newcred->cr_groups[0] != newcred->cr_rgid) &&
- newcred->cr_svgid != newcred->cr_groups[0]) {
- change_svgid(newcred, newcred->cr_groups[0]);
+ if ((rgid != (gid_t)-1 || newcred->cr_gid != newcred->cr_rgid) &&
+ newcred->cr_svgid != newcred->cr_gid) {
+ change_svgid(newcred, newcred->cr_gid);
setsugid(p);
}
proc_set_cred(p, newcred);
@@ -1547,17 +1547,17 @@ sys_setresgid(struct thread *td, struct setresgid_args *uap)
if (((rgid != (gid_t)-1 && rgid != oldcred->cr_rgid &&
rgid != oldcred->cr_svgid &&
- rgid != oldcred->cr_groups[0]) ||
+ rgid != oldcred->cr_gid) ||
(egid != (gid_t)-1 && egid != oldcred->cr_rgid &&
egid != oldcred->cr_svgid &&
- egid != oldcred->cr_groups[0]) ||
+ egid != oldcred->cr_gid) ||
(sgid != (gid_t)-1 && sgid != oldcred->cr_rgid &&
sgid != oldcred->cr_svgid &&
- sgid != oldcred->cr_groups[0])) &&
+ sgid != oldcred->cr_gid)) &&
(error = priv_check_cred(oldcred, PRIV_CRED_SETRESGID)) != 0)
goto fail;
- if (egid != (gid_t)-1 && oldcred->cr_groups[0] != egid) {
+ if (egid != (gid_t)-1 && oldcred->cr_gid != egid) {
change_egid(newcred, egid);
setsugid(p);
}
@@ -1626,8 +1626,8 @@ sys_getresgid(struct thread *td, struct getresgid_args *uap)
error1 = copyout(&cred->cr_rgid,
uap->rgid, sizeof(cred->cr_rgid));
if (uap->egid)
- error2 = copyout(&cred->cr_groups[0],
- uap->egid, sizeof(cred->cr_groups[0]));
+ error2 = copyout(&cred->cr_gid,
+ uap->egid, sizeof(cred->cr_gid));
if (uap->sgid)
error3 = copyout(&cred->cr_svgid,
uap->sgid, sizeof(cred->cr_svgid));
@@ -1737,7 +1737,7 @@ groupmember(gid_t gid, const struct ucred *cred)
groups_check_positive_len(cred->cr_ngroups);
- if (gid == cred->cr_groups[0])
+ if (gid == cred->cr_gid)
return (true);
return (group_is_supplementary(gid, cred));
@@ -3015,7 +3015,7 @@ void
change_egid(struct ucred *newcred, gid_t egid)
{
- newcred->cr_groups[0] = egid;
+ newcred->cr_gid = egid;
}
/*-
diff --git a/sys/kern/kern_resource.c b/sys/kern/kern_resource.c
index c8b01afeab4f..dcd38c6e6fbe 100644
--- a/sys/kern/kern_resource.c
+++ b/sys/kern/kern_resource.c
@@ -1637,6 +1637,12 @@ uifree(struct uidinfo *uip)
if (uip->ui_pipecnt != 0)
printf("freeing uidinfo: uid = %d, pipecnt = %ld\n",
uip->ui_uid, uip->ui_pipecnt);
+ if (uip->ui_inotifycnt != 0)
+ printf("freeing uidinfo: uid = %d, inotifycnt = %ld\n",
+ uip->ui_uid, uip->ui_inotifycnt);
+ if (uip->ui_inotifywatchcnt != 0)
+ printf("freeing uidinfo: uid = %d, inotifywatchcnt = %ld\n",
+ uip->ui_uid, uip->ui_inotifywatchcnt);
free(uip, M_UIDINFO);
}
@@ -1742,6 +1748,21 @@ chgpipecnt(struct uidinfo *uip, int diff, rlim_t max)
return (chglimit(uip, &uip->ui_pipecnt, diff, max, "pipecnt"));
}
+int
+chginotifycnt(struct uidinfo *uip, int diff, rlim_t max)
+{
+
+ return (chglimit(uip, &uip->ui_inotifycnt, diff, max, "inotifycnt"));
+}
+
+int
+chginotifywatchcnt(struct uidinfo *uip, int diff, rlim_t max)
+{
+
+ return (chglimit(uip, &uip->ui_inotifywatchcnt, diff, max,
+ "inotifywatchcnt"));
+}
+
static int
sysctl_kern_proc_rlimit_usage(SYSCTL_HANDLER_ARGS)
{
diff --git a/sys/kern/kern_sendfile.c b/sys/kern/kern_sendfile.c
index 17b53208157a..8438298afc0e 100644
--- a/sys/kern/kern_sendfile.c
+++ b/sys/kern/kern_sendfile.c
@@ -27,12 +27,12 @@
* SUCH DAMAGE.
*/
-#include <sys/cdefs.h>
#include "opt_kern_tls.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/capsicum.h>
+#include <sys/inotify.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/ktls.h>
@@ -698,10 +698,13 @@ sendfile_wait_generic(struct socket *so, off_t need, int *space)
*/
error = 0;
SOCK_SENDBUF_LOCK(so);
- if (so->so_snd.sb_lowat < so->so_snd.sb_hiwat / 2)
- so->so_snd.sb_lowat = so->so_snd.sb_hiwat / 2;
- if (so->so_snd.sb_lowat < PAGE_SIZE && so->so_snd.sb_hiwat >= PAGE_SIZE)
- so->so_snd.sb_lowat = PAGE_SIZE;
+ if (so->so_snd.sb_flags & SB_AUTOLOWAT) {
+ if (so->so_snd.sb_lowat < so->so_snd.sb_hiwat / 2)
+ so->so_snd.sb_lowat = so->so_snd.sb_hiwat / 2;
+ if (so->so_snd.sb_lowat < PAGE_SIZE &&
+ so->so_snd.sb_hiwat >= PAGE_SIZE)
+ so->so_snd.sb_lowat = PAGE_SIZE;
+ }
retry_space:
if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
error = EPIPE;
@@ -1246,6 +1249,8 @@ out:
*/
if (error == 0) {
td->td_retval[0] = 0;
+ if (sbytes > 0 && vp != NULL)
+ INOTIFY(vp, IN_ACCESS);
}
if (sent != NULL) {
(*sent) = sbytes;
diff --git a/sys/kern/kern_sig.c b/sys/kern/kern_sig.c
index 4565abc4b540..da0efac0598d 100644
--- a/sys/kern/kern_sig.c
+++ b/sys/kern/kern_sig.c
@@ -45,10 +45,10 @@
#include <sys/vnode.h>
#include <sys/acct.h>
#include <sys/capsicum.h>
-#include <sys/compressor.h>
#include <sys/condvar.h>
#include <sys/devctl.h>
#include <sys/event.h>
+#include <sys/exec.h>
#include <sys/fcntl.h>
#include <sys/imgact.h>
#include <sys/jail.h>
@@ -80,6 +80,7 @@
#include <sys/syslog.h>
#include <sys/sysproto.h>
#include <sys/timers.h>
+#include <sys/ucoredump.h>
#include <sys/unistd.h>
#include <sys/vmmeter.h>
#include <sys/wait.h>
@@ -101,7 +102,6 @@ SDT_PROBE_DEFINE2(proc, , , signal__clear,
SDT_PROBE_DEFINE3(proc, , , signal__discard,
"struct thread *", "struct proc *", "int");
-static int coredump(struct thread *);
static int killpg1(struct thread *td, int sig, int pgid, int all,
ksiginfo_t *ksi);
static int issignal(struct thread *td);
@@ -126,11 +126,6 @@ const struct filterops sig_filtops = {
.f_event = filt_signal,
};
-static int kern_logsigexit = 1;
-SYSCTL_INT(_kern, KERN_LOGSIGEXIT, logsigexit, CTLFLAG_RW,
- &kern_logsigexit, 0,
- "Log processes quitting on abnormal signals to syslog(3)");
-
static int kern_forcesigexit = 1;
SYSCTL_INT(_kern, OID_AUTO, forcesigexit, CTLFLAG_RW,
&kern_forcesigexit, 0, "Force trap signal to be handled");
@@ -193,26 +188,6 @@ SYSINIT(signal, SI_SUB_P1003_1B, SI_ORDER_FIRST+3, sigqueue_start, NULL);
(cr1)->cr_ruid == (cr2)->cr_uid || \
(cr1)->cr_uid == (cr2)->cr_uid)
-static int sugid_coredump;
-SYSCTL_INT(_kern, OID_AUTO, sugid_coredump, CTLFLAG_RWTUN,
- &sugid_coredump, 0, "Allow setuid and setgid processes to dump core");
-
-static int capmode_coredump;
-SYSCTL_INT(_kern, OID_AUTO, capmode_coredump, CTLFLAG_RWTUN,
- &capmode_coredump, 0, "Allow processes in capability mode to dump core");
-
-static int do_coredump = 1;
-SYSCTL_INT(_kern, OID_AUTO, coredump, CTLFLAG_RW,
- &do_coredump, 0, "Enable/Disable coredumps");
-
-static int set_core_nodump_flag = 0;
-SYSCTL_INT(_kern, OID_AUTO, nodump_coredump, CTLFLAG_RW, &set_core_nodump_flag,
- 0, "Enable setting the NODUMP flag on coredump files");
-
-static int coredump_devctl = 0;
-SYSCTL_INT(_kern, OID_AUTO, coredump_devctl, CTLFLAG_RW, &coredump_devctl,
- 0, "Generate a devctl notification when processes coredump");
-
/*
* Signal properties and actions.
* The array below categorizes the signals and their default actions
@@ -784,6 +759,13 @@ sigprop(int sig)
return (0);
}
+bool
+sig_do_core(int sig)
+{
+
+ return ((sigprop(sig) & SIGPROP_CORE) != 0);
+}
+
static bool
sigact_flag_test(const struct sigaction *act, int flag)
{
@@ -1050,8 +1032,7 @@ osigaction(struct thread *td, struct osigaction_args *uap)
int
osigreturn(struct thread *td, struct osigreturn_args *uap)
{
-
- return (nosys(td, (struct nosys_args *)uap));
+ return (kern_nosys(td, 0));
}
#endif
#endif /* COMPAT_43 */
@@ -2666,6 +2647,8 @@ static void
ptrace_coredumpreq(struct thread *td, struct proc *p,
struct thr_coredump_req *tcq)
{
+ struct coredump_vnode_ctx wctx;
+ struct coredump_writer cdw;
void *rl_cookie;
if (p->p_sysent->sv_coredump == NULL) {
@@ -2673,8 +2656,15 @@ ptrace_coredumpreq(struct thread *td, struct proc *p,
return;
}
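+	/* Wrap the tracer-supplied vnode in a vnode-writer context. */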
+ wctx.vp = tcq->tc_vp;
+ wctx.fcred = NOCRED;
+
+ cdw.ctx = &wctx;
+ cdw.write_fn = core_vn_write;
+ cdw.extend_fn = core_vn_extend;
+
rl_cookie = vn_rangelock_wlock(tcq->tc_vp, 0, OFF_MAX);
- tcq->tc_error = p->p_sysent->sv_coredump(td, tcq->tc_vp,
+ tcq->tc_error = p->p_sysent->sv_coredump(td, &cdw,
tcq->tc_limit, tcq->tc_flags);
vn_rangelock_unlock(tcq->tc_vp, rl_cookie);
}
@@ -3636,82 +3626,6 @@ killproc(struct proc *p, const char *why)
}
/*
- * Force the current process to exit with the specified signal, dumping core
- * if appropriate. We bypass the normal tests for masked and caught signals,
- * allowing unrecoverable failures to terminate the process without changing
- * signal state. Mark the accounting record with the signal termination.
- * If dumping core, save the signal number for the debugger. Calls exit and
- * does not return.
- */
-void
-sigexit(struct thread *td, int sig)
-{
- struct proc *p = td->td_proc;
- const char *coreinfo;
- int rv;
- bool logexit;
-
- PROC_LOCK_ASSERT(p, MA_OWNED);
- proc_set_p2_wexit(p);
-
- p->p_acflag |= AXSIG;
- if ((p->p_flag2 & P2_LOGSIGEXIT_CTL) == 0)
- logexit = kern_logsigexit != 0;
- else
- logexit = (p->p_flag2 & P2_LOGSIGEXIT_ENABLE) != 0;
-
- /*
- * We must be single-threading to generate a core dump. This
- * ensures that the registers in the core file are up-to-date.
- * Also, the ELF dump handler assumes that the thread list doesn't
- * change out from under it.
- *
- * XXX If another thread attempts to single-thread before us
- * (e.g. via fork()), we won't get a dump at all.
- */
- if ((sigprop(sig) & SIGPROP_CORE) &&
- thread_single(p, SINGLE_NO_EXIT) == 0) {
- p->p_sig = sig;
- /*
- * Log signals which would cause core dumps
- * (Log as LOG_INFO to appease those who don't want
- * these messages.)
- * XXX : Todo, as well as euid, write out ruid too
- * Note that coredump() drops proc lock.
- */
- rv = coredump(td);
- switch (rv) {
- case 0:
- sig |= WCOREFLAG;
- coreinfo = " (core dumped)";
- break;
- case EFAULT:
- coreinfo = " (no core dump - bad address)";
- break;
- case EINVAL:
- coreinfo = " (no core dump - invalid argument)";
- break;
- case EFBIG:
- coreinfo = " (no core dump - too large)";
- break;
- default:
- coreinfo = " (no core dump - other error)";
- break;
- }
- if (logexit)
- log(LOG_INFO,
- "pid %d (%s), jid %d, uid %d: exited on "
- "signal %d%s\n", p->p_pid, p->p_comm,
- p->p_ucred->cr_prison->pr_id,
- td->td_ucred->cr_uid,
- sig &~ WCOREFLAG, coreinfo);
- } else
- PROC_UNLOCK(p);
- exit1(td, 0, sig);
- /* NOTREACHED */
-}
-
-/*
* Send queued SIGCHLD to parent when child process's state
* is changed.
*/
@@ -3804,470 +3718,6 @@ childproc_exited(struct proc *p)
sigparent(p, reason, status);
}
-#define MAX_NUM_CORE_FILES 100000
-#ifndef NUM_CORE_FILES
-#define NUM_CORE_FILES 5
-#endif
-CTASSERT(NUM_CORE_FILES >= 0 && NUM_CORE_FILES <= MAX_NUM_CORE_FILES);
-static int num_cores = NUM_CORE_FILES;
-
-static int
-sysctl_debug_num_cores_check (SYSCTL_HANDLER_ARGS)
-{
- int error;
- int new_val;
-
- new_val = num_cores;
- error = sysctl_handle_int(oidp, &new_val, 0, req);
- if (error != 0 || req->newptr == NULL)
- return (error);
- if (new_val > MAX_NUM_CORE_FILES)
- new_val = MAX_NUM_CORE_FILES;
- if (new_val < 0)
- new_val = 0;
- num_cores = new_val;
- return (0);
-}
-SYSCTL_PROC(_debug, OID_AUTO, ncores,
- CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, sizeof(int),
- sysctl_debug_num_cores_check, "I",
- "Maximum number of generated process corefiles while using index format");
-
-#define GZIP_SUFFIX ".gz"
-#define ZSTD_SUFFIX ".zst"
-
-int compress_user_cores = 0;
-
-static int
-sysctl_compress_user_cores(SYSCTL_HANDLER_ARGS)
-{
- int error, val;
-
- val = compress_user_cores;
- error = sysctl_handle_int(oidp, &val, 0, req);
- if (error != 0 || req->newptr == NULL)
- return (error);
- if (val != 0 && !compressor_avail(val))
- return (EINVAL);
- compress_user_cores = val;
- return (error);
-}
-SYSCTL_PROC(_kern, OID_AUTO, compress_user_cores,
- CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_NEEDGIANT, 0, sizeof(int),
- sysctl_compress_user_cores, "I",
- "Enable compression of user corefiles ("
- __XSTRING(COMPRESS_GZIP) " = gzip, "
- __XSTRING(COMPRESS_ZSTD) " = zstd)");
-
-int compress_user_cores_level = 6;
-SYSCTL_INT(_kern, OID_AUTO, compress_user_cores_level, CTLFLAG_RWTUN,
- &compress_user_cores_level, 0,
- "Corefile compression level");
-
-/*
- * Protect the access to corefilename[] by allproc_lock.
- */
-#define corefilename_lock allproc_lock
-
-static char corefilename[MAXPATHLEN] = {"%N.core"};
-TUNABLE_STR("kern.corefile", corefilename, sizeof(corefilename));
-
-static int
-sysctl_kern_corefile(SYSCTL_HANDLER_ARGS)
-{
- int error;
-
- sx_xlock(&corefilename_lock);
- error = sysctl_handle_string(oidp, corefilename, sizeof(corefilename),
- req);
- sx_xunlock(&corefilename_lock);
-
- return (error);
-}
-SYSCTL_PROC(_kern, OID_AUTO, corefile, CTLTYPE_STRING | CTLFLAG_RW |
- CTLFLAG_MPSAFE, 0, 0, sysctl_kern_corefile, "A",
- "Process corefile name format string");
-
-static void
-vnode_close_locked(struct thread *td, struct vnode *vp)
-{
-
- VOP_UNLOCK(vp);
- vn_close(vp, FWRITE, td->td_ucred, td);
-}
-
-/*
- * If the core format has a %I in it, then we need to check
- * for existing corefiles before defining a name.
- * To do this we iterate over 0..ncores to find a
- * non-existing core file name to use. If all core files are
- * already used we choose the oldest one.
- */
-static int
-corefile_open_last(struct thread *td, char *name, int indexpos,
- int indexlen, int ncores, struct vnode **vpp)
-{
- struct vnode *oldvp, *nextvp, *vp;
- struct vattr vattr;
- struct nameidata nd;
- int error, i, flags, oflags, cmode;
- char ch;
- struct timespec lasttime;
-
- nextvp = oldvp = NULL;
- cmode = S_IRUSR | S_IWUSR;
- oflags = VN_OPEN_NOAUDIT | VN_OPEN_NAMECACHE |
- (capmode_coredump ? VN_OPEN_NOCAPCHECK : 0);
-
- for (i = 0; i < ncores; i++) {
- flags = O_CREAT | FWRITE | O_NOFOLLOW;
-
- ch = name[indexpos + indexlen];
- (void)snprintf(name + indexpos, indexlen + 1, "%.*u", indexlen,
- i);
- name[indexpos + indexlen] = ch;
-
- NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, name);
- error = vn_open_cred(&nd, &flags, cmode, oflags, td->td_ucred,
- NULL);
- if (error != 0)
- break;
-
- vp = nd.ni_vp;
- NDFREE_PNBUF(&nd);
- if ((flags & O_CREAT) == O_CREAT) {
- nextvp = vp;
- break;
- }
-
- error = VOP_GETATTR(vp, &vattr, td->td_ucred);
- if (error != 0) {
- vnode_close_locked(td, vp);
- break;
- }
-
- if (oldvp == NULL ||
- lasttime.tv_sec > vattr.va_mtime.tv_sec ||
- (lasttime.tv_sec == vattr.va_mtime.tv_sec &&
- lasttime.tv_nsec >= vattr.va_mtime.tv_nsec)) {
- if (oldvp != NULL)
- vn_close(oldvp, FWRITE, td->td_ucred, td);
- oldvp = vp;
- VOP_UNLOCK(oldvp);
- lasttime = vattr.va_mtime;
- } else {
- vnode_close_locked(td, vp);
- }
- }
-
- if (oldvp != NULL) {
- if (nextvp == NULL) {
- if ((td->td_proc->p_flag & P_SUGID) != 0) {
- error = EFAULT;
- vn_close(oldvp, FWRITE, td->td_ucred, td);
- } else {
- nextvp = oldvp;
- error = vn_lock(nextvp, LK_EXCLUSIVE);
- if (error != 0) {
- vn_close(nextvp, FWRITE, td->td_ucred,
- td);
- nextvp = NULL;
- }
- }
- } else {
- vn_close(oldvp, FWRITE, td->td_ucred, td);
- }
- }
- if (error != 0) {
- if (nextvp != NULL)
- vnode_close_locked(td, oldvp);
- } else {
- *vpp = nextvp;
- }
-
- return (error);
-}
-
-/*
- * corefile_open(comm, uid, pid, td, compress, vpp, namep)
- * Expand the name described in corefilename, using name, uid, and pid
- * and open/create core file.
- * corefilename is a printf-like string, with three format specifiers:
- * %N name of process ("name")
- * %P process id (pid)
- * %U user id (uid)
- * For example, "%N.core" is the default; they can be disabled completely
- * by using "/dev/null", or all core files can be stored in "/cores/%U/%N-%P".
- * This is controlled by the sysctl variable kern.corefile (see above).
- */
-static int
-corefile_open(const char *comm, uid_t uid, pid_t pid, struct thread *td,
- int compress, int signum, struct vnode **vpp, char **namep)
-{
- struct sbuf sb;
- struct nameidata nd;
- const char *format;
- char *hostname, *name;
- int cmode, error, flags, i, indexpos, indexlen, oflags, ncores;
-
- hostname = NULL;
- format = corefilename;
- name = malloc(MAXPATHLEN, M_TEMP, M_WAITOK | M_ZERO);
- indexlen = 0;
- indexpos = -1;
- ncores = num_cores;
- (void)sbuf_new(&sb, name, MAXPATHLEN, SBUF_FIXEDLEN);
- sx_slock(&corefilename_lock);
- for (i = 0; format[i] != '\0'; i++) {
- switch (format[i]) {
- case '%': /* Format character */
- i++;
- switch (format[i]) {
- case '%':
- sbuf_putc(&sb, '%');
- break;
- case 'H': /* hostname */
- if (hostname == NULL) {
- hostname = malloc(MAXHOSTNAMELEN,
- M_TEMP, M_WAITOK);
- }
- getcredhostname(td->td_ucred, hostname,
- MAXHOSTNAMELEN);
- sbuf_cat(&sb, hostname);
- break;
- case 'I': /* autoincrementing index */
- if (indexpos != -1) {
- sbuf_printf(&sb, "%%I");
- break;
- }
-
- indexpos = sbuf_len(&sb);
- sbuf_printf(&sb, "%u", ncores - 1);
- indexlen = sbuf_len(&sb) - indexpos;
- break;
- case 'N': /* process name */
- sbuf_printf(&sb, "%s", comm);
- break;
- case 'P': /* process id */
- sbuf_printf(&sb, "%u", pid);
- break;
- case 'S': /* signal number */
- sbuf_printf(&sb, "%i", signum);
- break;
- case 'U': /* user id */
- sbuf_printf(&sb, "%u", uid);
- break;
- default:
- log(LOG_ERR,
- "Unknown format character %c in "
- "corename `%s'\n", format[i], format);
- break;
- }
- break;
- default:
- sbuf_putc(&sb, format[i]);
- break;
- }
- }
- sx_sunlock(&corefilename_lock);
- free(hostname, M_TEMP);
- if (compress == COMPRESS_GZIP)
- sbuf_cat(&sb, GZIP_SUFFIX);
- else if (compress == COMPRESS_ZSTD)
- sbuf_cat(&sb, ZSTD_SUFFIX);
- if (sbuf_error(&sb) != 0) {
- log(LOG_ERR, "pid %ld (%s), uid (%lu): corename is too "
- "long\n", (long)pid, comm, (u_long)uid);
- sbuf_delete(&sb);
- free(name, M_TEMP);
- return (ENOMEM);
- }
- sbuf_finish(&sb);
- sbuf_delete(&sb);
-
- if (indexpos != -1) {
- error = corefile_open_last(td, name, indexpos, indexlen, ncores,
- vpp);
- if (error != 0) {
- log(LOG_ERR,
- "pid %d (%s), uid (%u): Path `%s' failed "
- "on initial open test, error = %d\n",
- pid, comm, uid, name, error);
- }
- } else {
- cmode = S_IRUSR | S_IWUSR;
- oflags = VN_OPEN_NOAUDIT | VN_OPEN_NAMECACHE |
- (capmode_coredump ? VN_OPEN_NOCAPCHECK : 0);
- flags = O_CREAT | FWRITE | O_NOFOLLOW;
- if ((td->td_proc->p_flag & P_SUGID) != 0)
- flags |= O_EXCL;
-
- NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, name);
- error = vn_open_cred(&nd, &flags, cmode, oflags, td->td_ucred,
- NULL);
- if (error == 0) {
- *vpp = nd.ni_vp;
- NDFREE_PNBUF(&nd);
- }
- }
-
- if (error != 0) {
-#ifdef AUDIT
- audit_proc_coredump(td, name, error);
-#endif
- free(name, M_TEMP);
- return (error);
- }
- *namep = name;
- return (0);
-}
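
A worked illustration of the "%I" machinery above (every name and number here is invented for the example): with kern.corefile set to "/var/coredumps/%U/%N-%P.%I.core" and the core-count limit at 3, a crash of pid 1234 of "sh" running as uid 1001 makes corefile_open_last() probe

	/var/coredumps/1001/sh-1234.0.core
	/var/coredumps/1001/sh-1234.1.core
	/var/coredumps/1001/sh-1234.2.core

in order, taking the first name it can create anew (the O_CREAT case above); if every slot already exists, it reuses the one with the oldest modification time. The "%I" expansion in corefile_open() above prints ncores - 1 only to reserve the index field's width; corefile_open_last() then rewrites that field with "%.*u" as it probes.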
-
-/*
- * Dump a process' core. The main routine does some
- * policy checking, and creates the name of the coredump;
- * then it passes on a vnode and a size limit to the process-specific
- * coredump routine if there is one; if there _is not_ one, it returns
- * ENOSYS; otherwise it returns the error from the process-specific routine.
- */
-
-static int
-coredump(struct thread *td)
-{
- struct proc *p = td->td_proc;
- struct ucred *cred = td->td_ucred;
- struct vnode *vp;
- struct flock lf;
- struct vattr vattr;
- size_t fullpathsize;
- int error, error1, locked;
- char *name; /* name of corefile */
- void *rl_cookie;
- off_t limit;
- char *fullpath, *freepath = NULL;
- struct sbuf *sb;
-
- PROC_LOCK_ASSERT(p, MA_OWNED);
- MPASS((p->p_flag & P_HADTHREADS) == 0 || p->p_singlethread == td);
-
- if (!do_coredump || (!sugid_coredump && (p->p_flag & P_SUGID) != 0) ||
- (p->p_flag2 & P2_NOTRACE) != 0) {
- PROC_UNLOCK(p);
- return (EFAULT);
- }
-
- /*
- * Note that the bulk of limit checking is done after
- * the corefile is created. The exception is if the limit
- * for corefiles is 0, in which case we don't bother
- * creating the corefile at all. This layout means that
- * a corefile is truncated instead of not being created,
- * if it is larger than the limit.
- */
- limit = (off_t)lim_cur(td, RLIMIT_CORE);
- if (limit == 0 || racct_get_available(p, RACCT_CORE) == 0) {
- PROC_UNLOCK(p);
- return (EFBIG);
- }
- PROC_UNLOCK(p);
-
- error = corefile_open(p->p_comm, cred->cr_uid, p->p_pid, td,
- compress_user_cores, p->p_sig, &vp, &name);
- if (error != 0)
- return (error);
-
- /*
- * Don't dump to non-regular files or files with links.
- * Do not dump into system files. Effective user must own the corefile.
- */
- if (vp->v_type != VREG || VOP_GETATTR(vp, &vattr, cred) != 0 ||
- vattr.va_nlink != 1 || (vp->v_vflag & VV_SYSTEM) != 0 ||
- vattr.va_uid != cred->cr_uid) {
- VOP_UNLOCK(vp);
- error = EFAULT;
- goto out;
- }
-
- VOP_UNLOCK(vp);
-
- /* Postpone other writers, including core dumps of other processes. */
- rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
-
- lf.l_whence = SEEK_SET;
- lf.l_start = 0;
- lf.l_len = 0;
- lf.l_type = F_WRLCK;
- locked = (VOP_ADVLOCK(vp, (caddr_t)p, F_SETLK, &lf, F_FLOCK) == 0);
-
- VATTR_NULL(&vattr);
- vattr.va_size = 0;
- if (set_core_nodump_flag)
- vattr.va_flags = UF_NODUMP;
- vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
- VOP_SETATTR(vp, &vattr, cred);
- VOP_UNLOCK(vp);
- PROC_LOCK(p);
- p->p_acflag |= ACORE;
- PROC_UNLOCK(p);
-
- if (p->p_sysent->sv_coredump != NULL) {
- error = p->p_sysent->sv_coredump(td, vp, limit, 0);
- } else {
- error = ENOSYS;
- }
-
- if (locked) {
- lf.l_type = F_UNLCK;
- VOP_ADVLOCK(vp, (caddr_t)p, F_UNLCK, &lf, F_FLOCK);
- }
- vn_rangelock_unlock(vp, rl_cookie);
-
- /*
- * Notify the userland helper that a process triggered a core dump.
- * This allows the helper to run an automated debugging session.
- */
- if (error != 0 || coredump_devctl == 0)
- goto out;
- sb = sbuf_new_auto();
- if (vn_fullpath_global(p->p_textvp, &fullpath, &freepath) != 0)
- goto out2;
- sbuf_cat(sb, "comm=\"");
- devctl_safe_quote_sb(sb, fullpath);
- free(freepath, M_TEMP);
- sbuf_cat(sb, "\" core=\"");
-
- /*
- * We can't lookup core file vp directly. When we're replacing a core, and
- * other random times, we flush the name cache, so it will fail. Instead,
- * if the path of the core is relative, add the current dir in front of it.
- */
- if (name[0] != '/') {
- fullpathsize = MAXPATHLEN;
- freepath = malloc(fullpathsize, M_TEMP, M_WAITOK);
- if (vn_getcwd(freepath, &fullpath, &fullpathsize) != 0) {
- free(freepath, M_TEMP);
- goto out2;
- }
- devctl_safe_quote_sb(sb, fullpath);
- free(freepath, M_TEMP);
- sbuf_putc(sb, '/');
- }
- devctl_safe_quote_sb(sb, name);
- sbuf_putc(sb, '"');
- if (sbuf_finish(sb) == 0)
- devctl_notify("kernel", "signal", "coredump", sbuf_data(sb));
-out2:
- sbuf_delete(sb);
-out:
- error1 = vn_close(vp, FWRITE, cred, td);
- if (error == 0)
- error = error1;
-#ifdef AUDIT
- audit_proc_coredump(td, name, error);
-#endif
- free(name, M_TEMP);
- return (error);
-}
-
/*
* Nonexistent system call-- signal process (may want to handle it). Flag
* error in case process won't see signal immediately (blocked or ignored).
@@ -4281,6 +3731,12 @@ struct nosys_args {
int
nosys(struct thread *td, struct nosys_args *args)
{
+ return (kern_nosys(td, args->dummy));
+}
+
+int
+kern_nosys(struct thread *td, int dummy)
+{
struct proc *p;
p = td->td_proc;
diff --git a/sys/kern/kern_syscalls.c b/sys/kern/kern_syscalls.c
index 24406763a93a..a93d711e7597 100644
--- a/sys/kern/kern_syscalls.c
+++ b/sys/kern/kern_syscalls.c
@@ -35,6 +35,7 @@
#include <sys/resourcevar.h>
#include <sys/sx.h>
#include <sys/syscall.h>
+#include <sys/syscallsubr.h>
#include <sys/sysent.h>
#include <sys/sysproto.h>
#include <sys/systm.h>
@@ -50,14 +51,14 @@ int
lkmnosys(struct thread *td, struct nosys_args *args)
{
- return (nosys(td, args));
+ return (kern_nosys(td, 0));
}
int
lkmressys(struct thread *td, struct nosys_args *args)
{
- return (nosys(td, args));
+ return (kern_nosys(td, 0));
}
struct sysent nosys_sysent = {
diff --git a/sys/kern/kern_sysctl.c b/sys/kern/kern_sysctl.c
index 46226cc31980..25da134661e9 100644
--- a/sys/kern/kern_sysctl.c
+++ b/sys/kern/kern_sysctl.c
@@ -2368,7 +2368,7 @@ sysctl_root(SYSCTL_HANDLER_ARGS)
priv = PRIV_SYSCTL_WRITEJAIL;
#ifdef VIMAGE
else if ((oid->oid_kind & CTLFLAG_VNET) &&
- prison_owns_vnet(req->td->td_ucred))
+ prison_owns_vnet(req->td->td_ucred->cr_prison))
priv = PRIV_SYSCTL_WRITEJAIL;
#endif
else
diff --git a/sys/kern/kern_ucoredump.c b/sys/kern/kern_ucoredump.c
new file mode 100644
index 000000000000..d425596b5f24
--- /dev/null
+++ b/sys/kern/kern_ucoredump.c
@@ -0,0 +1,299 @@
+/*
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Copyright (c) 1982, 1986, 1989, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/param.h>
+#include <sys/acct.h>
+#include <sys/compressor.h>
+#include <sys/jail.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/racct.h>
+#include <sys/resourcevar.h>
+#include <sys/rmlock.h>
+#include <sys/signalvar.h>
+#include <sys/sysctl.h>
+#include <sys/syslog.h>
+#include <sys/ucoredump.h>
+#include <sys/wait.h>
+
+static int coredump(struct thread *td, const char **);
+
+int compress_user_cores = 0;
+
+static SLIST_HEAD(, coredumper) coredumpers =
+ SLIST_HEAD_INITIALIZER(coredumpers);
+static struct rmlock coredump_rmlock;
+RM_SYSINIT(coredump_lock, &coredump_rmlock, "coredump_lock");
+
+static int kern_logsigexit = 1;
+SYSCTL_INT(_kern, KERN_LOGSIGEXIT, logsigexit, CTLFLAG_RW,
+ &kern_logsigexit, 0,
+ "Log processes quitting on abnormal signals to syslog(3)");
+
+static int sugid_coredump;
+SYSCTL_INT(_kern, OID_AUTO, sugid_coredump, CTLFLAG_RWTUN,
+ &sugid_coredump, 0, "Allow setuid and setgid processes to dump core");
+
+static int do_coredump = 1;
+SYSCTL_INT(_kern, OID_AUTO, coredump, CTLFLAG_RW,
+ &do_coredump, 0, "Enable/Disable coredumps");
+
+static int
+sysctl_compress_user_cores(SYSCTL_HANDLER_ARGS)
+{
+ int error, val;
+
+ val = compress_user_cores;
+ error = sysctl_handle_int(oidp, &val, 0, req);
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+ if (val != 0 && !compressor_avail(val))
+ return (EINVAL);
+ compress_user_cores = val;
+ return (error);
+}
+SYSCTL_PROC(_kern, OID_AUTO, compress_user_cores,
+ CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_NEEDGIANT, 0, sizeof(int),
+ sysctl_compress_user_cores, "I",
+ "Enable compression of user corefiles ("
+ __XSTRING(COMPRESS_GZIP) " = gzip, "
+ __XSTRING(COMPRESS_ZSTD) " = zstd)");
+
+int compress_user_cores_level = 6;
+SYSCTL_INT(_kern, OID_AUTO, compress_user_cores_level, CTLFLAG_RWTUN,
+ &compress_user_cores_level, 0,
+ "Corefile compression level");
+
+void
+coredumper_register(struct coredumper *cd)
+{
+
+ blockcount_init(&cd->cd_refcount);
+ rm_wlock(&coredump_rmlock);
+ SLIST_INSERT_HEAD(&coredumpers, cd, cd_entry);
+ rm_wunlock(&coredump_rmlock);
+}
+
+void
+coredumper_unregister(struct coredumper *cd)
+{
+
+ rm_wlock(&coredump_rmlock);
+ SLIST_REMOVE(&coredumpers, cd, coredumper, cd_entry);
+ rm_wunlock(&coredump_rmlock);
+
+ /*
+ * Wait for any in-process coredumps to finish before returning.
+ */
+ blockcount_wait(&cd->cd_refcount, NULL, "dumpwait", 0);
+}
+
+/*
+ * Force the current process to exit with the specified signal, dumping core
+ * if appropriate. We bypass the normal tests for masked and caught signals,
+ * allowing unrecoverable failures to terminate the process without changing
+ * signal state. Mark the accounting record with the signal termination.
+ * If dumping core, save the signal number for the debugger. Calls exit and
+ * does not return.
+ */
+void
+sigexit(struct thread *td, int sig)
+{
+ struct proc *p = td->td_proc;
+ int rv;
+ bool logexit;
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ proc_set_p2_wexit(p);
+
+ p->p_acflag |= AXSIG;
+ if ((p->p_flag2 & P2_LOGSIGEXIT_CTL) == 0)
+ logexit = kern_logsigexit != 0;
+ else
+ logexit = (p->p_flag2 & P2_LOGSIGEXIT_ENABLE) != 0;
+
+ /*
+ * We must be single-threading to generate a core dump. This
+ * ensures that the registers in the core file are up-to-date.
+ * Also, the ELF dump handler assumes that the thread list doesn't
+ * change out from under it.
+ *
+ * XXX If another thread attempts to single-thread before us
+ * (e.g. via fork()), we won't get a dump at all.
+ */
+ if (sig_do_core(sig) && thread_single(p, SINGLE_NO_EXIT) == 0) {
+ const char *err = NULL;
+
+ p->p_sig = sig;
+ /*
+ * Log signals that would cause core dumps.
+ * (Log as LOG_INFO to appease those who don't want
+ * these messages.)
+ * XXX: TODO: write out the ruid as well as the euid.
+ * Note that coredump() drops the proc lock.
+ */
+ rv = coredump(td, &err);
+ if (rv == 0) {
+ MPASS(err == NULL);
+ sig |= WCOREFLAG;
+ } else if (err == NULL) {
+ switch (rv) {
+ case EFAULT:
+ err = "bad address";
+ break;
+ case EINVAL:
+ err = "invalild argument";
+ break;
+ case EFBIG:
+ err = "too large";
+ break;
+ default:
+ err = "other error";
+ break;
+ }
+ }
+ if (logexit)
+ log(LOG_INFO,
+ "pid %d (%s), jid %d, uid %d: exited on "
+ "signal %d (%s%s)\n", p->p_pid, p->p_comm,
+ p->p_ucred->cr_prison->pr_id,
+ td->td_ucred->cr_uid, sig &~ WCOREFLAG,
+ err != NULL ? "no core dump - " : "core dumped",
+ err != NULL ? err : "");
+ } else
+ PROC_UNLOCK(p);
+ exit1(td, 0, sig);
+ /* NOTREACHED */
+}
+
+/*
+ * Dump a process' core.  This routine does some policy checking, then
+ * selects the highest-priority registered coredumper and passes the size
+ * limit on to its handler, returning the handler's error.  The vnode-based
+ * dumper is always registered, so a dumper is always found.
+ */
+static int
+coredump(struct thread *td, const char **errmsg)
+{
+ struct coredumper *iter, *chosen;
+ struct proc *p = td->td_proc;
+ struct rm_priotracker tracker;
+ off_t limit;
+ int error, priority;
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ MPASS((p->p_flag & P_HADTHREADS) == 0 || p->p_singlethread == td);
+
+ if (!do_coredump || (!sugid_coredump && (p->p_flag & P_SUGID) != 0) ||
+ (p->p_flag2 & P2_NOTRACE) != 0) {
+ PROC_UNLOCK(p);
+
+ if (!do_coredump)
+ *errmsg = "denied by kern.coredump";
+ else if ((p->p_flag2 & P2_NOTRACE) != 0)
+ *errmsg = "process has trace disabled";
+ else
+ *errmsg = "sugid process denied by kern.sugid_coredump";
+ return (EFAULT);
+ }
+
+ /*
+ * Note that the bulk of limit checking is done after
+ * the corefile is created.  The exception is if the limit
+ * for corefiles is 0, in which case we don't bother
+ * creating the corefile at all.  This layout means that
+ * a corefile that exceeds the limit is truncated rather
+ * than not created at all.
+ */
+ limit = (off_t)lim_cur(td, RLIMIT_CORE);
+ if (limit == 0 || racct_get_available(p, RACCT_CORE) == 0) {
+ PROC_UNLOCK(p);
+ *errmsg = "coredumpsize limit is 0";
+ return (EFBIG);
+ }
+
+ rm_rlock(&coredump_rmlock, &tracker);
+ priority = -1;
+ chosen = NULL;
+ SLIST_FOREACH(iter, &coredumpers, cd_entry) {
+ if (iter->cd_probe == NULL) {
+ /*
+ * If we haven't found anything of a higher priority
+ * yet, we'll call this a GENERIC. Ideally, we want
+ * coredumper modules to include a probe function.
+ */
+ if (priority < 0) {
+ priority = COREDUMPER_GENERIC;
+ chosen = iter;
+ }
+
+ continue;
+ }
+
+ error = (*iter->cd_probe)(td);
+ if (error < 0)
+ continue;
+
+ /*
+ * Higher priority than previous options.
+ */
+ if (error > priority) {
+ priority = error;
+ chosen = iter;
+ }
+ }
+
+ /*
+ * Acquire our refcount before we drop the lock so that
+ * coredumper_unregister() can safely assume that the refcount will only
+ * go down once it's dropped the rmlock.
+ */
+ blockcount_acquire(&chosen->cd_refcount, 1);
+ rm_runlock(&coredump_rmlock, &tracker);
+
+ /* Currently, we always have the vnode dumper built in. */
+ MPASS(chosen != NULL);
+ error = ((*chosen->cd_handle)(td, limit));
+ PROC_LOCK_ASSERT(p, MA_NOTOWNED);
+
+ blockcount_release(&chosen->cd_refcount, 1);
+
+ return (error);
+}
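
To make the probe/priority dispatch above concrete, here is a minimal sketch of a kernel module plugging into the new interface. The cd_probe/cd_handle contract, coredumper_register()/coredumper_unregister(), and COREDUMPER_GENERIC are taken from this patch; the struct coredumper layout is inferred from its usage here, and everything named "example" is invented:

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/ucoredump.h>

static int
example_probe(struct thread *td)
{
	/* Claim every process at generic priority; < 0 would decline. */
	return (COREDUMPER_GENERIC);
}

static int
example_handle(struct thread *td, off_t limit)
{
	/*
	 * Entered with the proc lock held; coredump() asserts that it
	 * has been dropped on return.  Write at most 'limit' bytes.
	 */
	PROC_UNLOCK(td->td_proc);
	return (ENOSYS);
}

static struct coredumper example_coredumper = {
	.cd_probe = example_probe,
	.cd_handle = example_handle,
};

static int
example_modevent(module_t mod, int what, void *arg)
{
	switch (what) {
	case MOD_LOAD:
		/* coredumper_register() initializes cd_refcount itself. */
		coredumper_register(&example_coredumper);
		return (0);
	case MOD_UNLOAD:
		/* Blocks until in-flight dumps via this dumper finish. */
		coredumper_unregister(&example_coredumper);
		return (0);
	default:
		return (EOPNOTSUPP);
	}
}

static moduledata_t example_mod = {
	"example_coredumper", example_modevent, NULL
};
DECLARE_MODULE(example_coredumper, example_mod, SI_SUB_DRIVERS, SI_ORDER_ANY);

Among dumpers whose probe accepts the process, coredump() picks the one returning the highest priority; a probe-less entry is treated as COREDUMPER_GENERIC.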
diff --git a/sys/kern/subr_asan.c b/sys/kern/subr_asan.c
index 0edb631d1475..464efda1e91a 100644
--- a/sys/kern/subr_asan.c
+++ b/sys/kern/subr_asan.c
@@ -263,8 +263,7 @@ kasan_mark(const void *addr, size_t size, size_t redzsize, uint8_t code)
if (__predict_false(!kasan_enabled))
return;
- if ((vm_offset_t)addr >= DMAP_MIN_ADDRESS &&
- (vm_offset_t)addr < DMAP_MAX_ADDRESS)
+ if (kasan_md_unsupported((vm_offset_t)addr))
return;
KASSERT((vm_offset_t)addr >= VM_MIN_KERNEL_ADDRESS &&
diff --git a/sys/kern/subr_capability.c b/sys/kern/subr_capability.c
index 7cc6fb593697..5ad5b0af1681 100644
--- a/sys/kern/subr_capability.c
+++ b/sys/kern/subr_capability.c
@@ -74,6 +74,10 @@ const cap_rights_t cap_getsockopt_rights =
CAP_RIGHTS_INITIALIZER(CAP_GETSOCKOPT);
const cap_rights_t cap_getsockname_rights =
CAP_RIGHTS_INITIALIZER(CAP_GETSOCKNAME);
+const cap_rights_t cap_inotify_add_rights =
+ CAP_RIGHTS_INITIALIZER(CAP_INOTIFY_ADD);
+const cap_rights_t cap_inotify_rm_rights =
+ CAP_RIGHTS_INITIALIZER(CAP_INOTIFY_RM);
const cap_rights_t cap_ioctl_rights = CAP_RIGHTS_INITIALIZER(CAP_IOCTL);
const cap_rights_t cap_listen_rights = CAP_RIGHTS_INITIALIZER(CAP_LISTEN);
const cap_rights_t cap_linkat_source_rights =
diff --git a/sys/kern/subr_compressor.c b/sys/kern/subr_compressor.c
index 280264881241..5d59622e0455 100644
--- a/sys/kern/subr_compressor.c
+++ b/sys/kern/subr_compressor.c
@@ -538,6 +538,12 @@ compressor_init(compressor_cb_t cb, int format, size_t maxiosize, int level,
return (s);
}
+int
+compressor_format(const struct compressor *stream)
+{
+ return (stream->methods->format);
+}
+
void
compressor_reset(struct compressor *stream)
{
diff --git a/sys/kern/subr_pctrie.c b/sys/kern/subr_pctrie.c
index 3a3548bad52b..bb86c779b936 100644
--- a/sys/kern/subr_pctrie.c
+++ b/sys/kern/subr_pctrie.c
@@ -691,21 +691,23 @@ _pctrie_lookup_ge(struct pctrie *ptree, struct pctrie_node *node,
*/
if (node == PCTRIE_NULL || *pctrie_toval(node) < index) {
/* Climb the path to find a node with a descendant > index. */
- for (node = parent; node != NULL; node = pctrie_parent(node)) {
- slot = pctrie_slot(node, index) + 1;
- if ((node->pn_popmap >> slot) != 0)
+ node = NULL;
+ while (parent != NULL) {
+ slot = pctrie_slot(parent, index) + 1;
+ if ((parent->pn_popmap >> slot) != 0)
break;
+ node = parent;
+ parent = pctrie_parent(node);
}
- if (node == NULL) {
+ if (parent == NULL) {
if (parent_out != NULL)
- *parent_out = NULL;
+ *parent_out = node;
return (NULL);
}
/* Step to the least child with a descendant > index. */
- slot += ffs(node->pn_popmap >> slot) - 1;
- parent = node;
- node = pctrie_node_load(&node->pn_child[slot], NULL,
+ slot += ffs(parent->pn_popmap >> slot) - 1;
+ node = pctrie_node_load(&parent->pn_child[slot], NULL,
PCTRIE_LOCKED);
}
/* Descend to the least leaf of the subtrie. */
@@ -785,21 +787,23 @@ _pctrie_lookup_le(struct pctrie *ptree, struct pctrie_node *node,
*/
if (node == PCTRIE_NULL || *pctrie_toval(node) > index) {
/* Climb the path to find a node with a descendant < index. */
- for (node = parent; node != NULL; node = pctrie_parent(node)) {
- slot = pctrie_slot(node, index);
- if ((node->pn_popmap & ((1 << slot) - 1)) != 0)
+ node = NULL;
+ while (parent != NULL) {
+ slot = pctrie_slot(parent, index);
+ if ((parent->pn_popmap & ((1 << slot) - 1)) != 0)
break;
+ node = parent;
+ parent = pctrie_parent(node);
}
- if (node == NULL) {
+ if (parent == NULL) {
if (parent_out != NULL)
- *parent_out = NULL;
+ *parent_out = node;
return (NULL);
}
/* Step to the greatest child with a descendant < index. */
- slot = ilog2(node->pn_popmap & ((1 << slot) - 1));
- parent = node;
- node = pctrie_node_load(&node->pn_child[slot], NULL,
+ slot = ilog2(parent->pn_popmap & ((1 << slot) - 1));
+ node = pctrie_node_load(&parent->pn_child[slot], NULL,
PCTRIE_LOCKED);
}
/* Descend to the greatest leaf of the subtrie. */
diff --git a/sys/kern/subr_trap.c b/sys/kern/subr_trap.c
index 18388ae5f232..bac7d0080c71 100644
--- a/sys/kern/subr_trap.c
+++ b/sys/kern/subr_trap.c
@@ -338,8 +338,9 @@ ast_handler(struct thread *td, struct trapframe *framep, bool dtor)
td->td_ast = 0;
}
- CTR3(KTR_SYSC, "ast: thread %p (pid %d, %s)", td, td->td_proc->p_pid,
- td->td_proc->p_comm);
+ CTR3(KTR_SYSC, "ast: thread %p (pid %d, %s)", td,
+ td->td_proc == NULL ? -1 : td->td_proc->p_pid,
+ td->td_proc == NULL ? "" : td->td_proc->p_comm);
KASSERT(framep == NULL || TRAPF_USERMODE(framep),
("ast in kernel mode"));
diff --git a/sys/kern/sys_generic.c b/sys/kern/sys_generic.c
index d31ff3b939cc..5606b36f772f 100644
--- a/sys/kern/sys_generic.c
+++ b/sys/kern/sys_generic.c
@@ -37,16 +37,17 @@
#include "opt_capsicum.h"
#include "opt_ktrace.h"
-#define EXTERR_CATEGORY EXTERR_CAT_FILEDESC
+#define EXTERR_CATEGORY EXTERR_CAT_GENIO
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/capsicum.h>
+#include <sys/exterrvar.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/fcntl.h>
#include <sys/file.h>
-#include <sys/exterrvar.h>
+#include <sys/inotify.h>
#include <sys/lock.h>
#include <sys/proc.h>
#include <sys/signalvar.h>
@@ -195,7 +196,7 @@ sys_read(struct thread *td, struct read_args *uap)
int error;
if (uap->nbyte > IOSIZE_MAX)
- return (EINVAL);
+ return (EXTERROR(EINVAL, "length > iosize_max"));
aiov.iov_base = uap->buf;
aiov.iov_len = uap->nbyte;
auio.uio_iov = &aiov;
@@ -233,7 +234,7 @@ kern_pread(struct thread *td, int fd, void *buf, size_t nbyte, off_t offset)
int error;
if (nbyte > IOSIZE_MAX)
- return (EINVAL);
+ return (EXTERROR(EINVAL, "length > iosize_max"));
aiov.iov_base = buf;
aiov.iov_len = nbyte;
auio.uio_iov = &aiov;
@@ -329,7 +330,7 @@ kern_preadv(struct thread *td, int fd, struct uio *auio, off_t offset)
error = ESPIPE;
else if (offset < 0 &&
(fp->f_vnode == NULL || fp->f_vnode->v_type != VCHR))
- error = EINVAL;
+ error = EXTERROR(EINVAL, "neg offset");
else
error = dofileread(td, fd, fp, auio, offset, FOF_OFFSET);
fdrop(fp, td);
@@ -396,7 +397,7 @@ sys_write(struct thread *td, struct write_args *uap)
int error;
if (uap->nbyte > IOSIZE_MAX)
- return (EINVAL);
+ return (EXTERROR(EINVAL, "length > iosize_max"));
aiov.iov_base = (void *)(uintptr_t)uap->buf;
aiov.iov_len = uap->nbyte;
auio.uio_iov = &aiov;
@@ -435,7 +436,7 @@ kern_pwrite(struct thread *td, int fd, const void *buf, size_t nbyte,
int error;
if (nbyte > IOSIZE_MAX)
- return (EINVAL);
+ return (EXTERROR(EINVAL, "length > iosize_max"));
aiov.iov_base = (void *)(uintptr_t)buf;
aiov.iov_len = nbyte;
auio.uio_iov = &aiov;
@@ -531,7 +532,7 @@ kern_pwritev(struct thread *td, int fd, struct uio *auio, off_t offset)
error = ESPIPE;
else if (offset < 0 &&
(fp->f_vnode == NULL || fp->f_vnode->v_type != VCHR))
- error = EINVAL;
+ error = EXTERROR(EINVAL, "neg offset");
else
error = dofilewrite(td, fd, fp, auio, offset, FOF_OFFSET);
fdrop(fp, td);
@@ -602,14 +603,14 @@ kern_ftruncate(struct thread *td, int fd, off_t length)
AUDIT_ARG_FD(fd);
if (length < 0)
- return (EINVAL);
+ return (EXTERROR(EINVAL, "negative length"));
error = fget(td, fd, &cap_ftruncate_rights, &fp);
if (error)
return (error);
AUDIT_ARG_FILE(td->td_proc, fp);
if (!(fp->f_flag & FWRITE)) {
fdrop(fp, td);
- return (EINVAL);
+ return (EXTERROR(EINVAL, "non-writable"));
}
error = fo_truncate(fp, length, td->td_ucred, td);
fdrop(fp, td);
@@ -840,8 +841,10 @@ kern_posix_fallocate(struct thread *td, int fd, off_t offset, off_t len)
int error;
AUDIT_ARG_FD(fd);
- if (offset < 0 || len <= 0)
- return (EINVAL);
+ if (offset < 0)
+ return (EXTERROR(EINVAL, "negative offset"));
+ if (len <= 0)
+ return (EXTERROR(EINVAL, "negative length"));
/* Check for wrap. */
if (offset > OFF_MAX - len)
return (EFBIG);
@@ -898,16 +901,21 @@ kern_fspacectl(struct thread *td, int fd, int cmd,
AUDIT_ARG_FFLAGS(flags);
if (rqsr == NULL)
- return (EINVAL);
+ return (EXTERROR(EINVAL, "no range"));
rmsr = *rqsr;
if (rmsrp != NULL)
*rmsrp = rmsr;
- if (cmd != SPACECTL_DEALLOC ||
- rqsr->r_offset < 0 || rqsr->r_len <= 0 ||
- rqsr->r_offset > OFF_MAX - rqsr->r_len ||
- (flags & ~SPACECTL_F_SUPPORTED) != 0)
- return (EINVAL);
+ if (cmd != SPACECTL_DEALLOC)
+ return (EXTERROR(EINVAL, "cmd", cmd));
+ if (rqsr->r_offset < 0)
+ return (EXTERROR(EINVAL, "neg offset"));
+ if (rqsr->r_len <= 0)
+ return (EXTERROR(EINVAL, "neg len"));
+ if (rqsr->r_offset > OFF_MAX - rqsr->r_len)
+ return (EXTERROR(EINVAL, "offset too large"));
+ if ((flags & ~SPACECTL_F_SUPPORTED) != 0)
+ return (EXTERROR(EINVAL, "reserved flags", flags));
error = fget_write(td, fd, &cap_pwrite_rights, &fp);
if (error != 0)
@@ -939,7 +947,6 @@ int
kern_specialfd(struct thread *td, int type, void *arg)
{
struct file *fp;
- struct specialfd_eventfd *ae;
int error, fd, fflags;
fflags = 0;
@@ -948,14 +955,24 @@ kern_specialfd(struct thread *td, int type, void *arg)
return (error);
switch (type) {
- case SPECIALFD_EVENTFD:
+ case SPECIALFD_EVENTFD: {
+ struct specialfd_eventfd *ae;
+
ae = arg;
if ((ae->flags & EFD_CLOEXEC) != 0)
fflags |= O_CLOEXEC;
error = eventfd_create_file(td, fp, ae->initval, ae->flags);
break;
+ }
+ case SPECIALFD_INOTIFY: {
+ struct specialfd_inotify *si;
+
+ si = arg;
+ error = inotify_create_file(td, fp, si->flags, &fflags);
+ break;
+ }
default:
- error = EINVAL;
+ error = EXTERROR(EINVAL, "invalid type", type);
break;
}
@@ -970,13 +987,14 @@ kern_specialfd(struct thread *td, int type, void *arg)
int
sys___specialfd(struct thread *td, struct __specialfd_args *args)
{
- struct specialfd_eventfd ae;
int error;
switch (args->type) {
- case SPECIALFD_EVENTFD:
+ case SPECIALFD_EVENTFD: {
+ struct specialfd_eventfd ae;
+
if (args->len != sizeof(struct specialfd_eventfd)) {
- error = EINVAL;
+ error = EXTERROR(EINVAL, "eventfd params ABI");
break;
}
error = copyin(args->req, &ae, sizeof(ae));
@@ -984,13 +1002,27 @@ sys___specialfd(struct thread *td, struct __specialfd_args *args)
break;
if ((ae.flags & ~(EFD_CLOEXEC | EFD_NONBLOCK |
EFD_SEMAPHORE)) != 0) {
- error = EINVAL;
+ error = EXTERROR(EINVAL, "reserved flag");
break;
}
error = kern_specialfd(td, args->type, &ae);
break;
+ }
+ case SPECIALFD_INOTIFY: {
+ struct specialfd_inotify si;
+
+ if (args->len != sizeof(si)) {
+ error = EINVAL;
+ break;
+ }
+ error = copyin(args->req, &si, sizeof(si));
+ if (error != 0)
+ break;
+ error = kern_specialfd(td, args->type, &si);
+ break;
+ }
default:
- error = EINVAL;
+ error = EXTERROR(EINVAL, "unknown type", args->type);
break;
}
return (error);
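
For reference, a hedged sketch of how userland reaches the SPECIALFD_INOTIFY case above through the raw system call. The assumption that struct specialfd_inotify carries only the flags consumed by inotify_create_file() is mine; its definition is not part of this hunk:

#include <sys/types.h>
#include <sys/specialfd.h>	/* SPECIALFD_INOTIFY, struct specialfd_inotify */

int __specialfd(int type, const void *req, size_t len);	/* raw syscall */

int
example_inotify_fd(void)
{
	struct specialfd_inotify si;

	si.flags = 0;	/* inotify creation flags, if any are defined */
	return (__specialfd(SPECIALFD_INOTIFY, &si, sizeof(si)));
}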
@@ -1166,7 +1198,7 @@ kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou,
int error, lf, ndu;
if (nd < 0)
- return (EINVAL);
+ return (EXTERROR(EINVAL, "negative ndescs"));
fdp = td->td_proc->p_fd;
ndu = nd;
lf = fdp->fd_nfiles;
@@ -1259,7 +1291,7 @@ kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou,
rtv = *tvp;
if (rtv.tv_sec < 0 || rtv.tv_usec < 0 ||
rtv.tv_usec >= 1000000) {
- error = EINVAL;
+ error = EXTERROR(EINVAL, "invalid timeval");
goto done;
}
if (!timevalisset(&rtv))
@@ -1491,7 +1523,7 @@ sys_poll(struct thread *td, struct poll_args *uap)
if (uap->timeout != INFTIM) {
if (uap->timeout < 0)
- return (EINVAL);
+ return (EXTERROR(EINVAL, "invalid timeout"));
ts.tv_sec = uap->timeout / 1000;
ts.tv_nsec = (uap->timeout % 1000) * 1000000;
tsp = &ts;
@@ -1516,7 +1548,7 @@ kern_poll_kfds(struct thread *td, struct pollfd *kfds, u_int nfds,
precision = 0;
if (tsp != NULL) {
if (!timespecvalid_interval(tsp))
- return (EINVAL);
+ return (EXTERROR(EINVAL, "invalid timespec"));
if (tsp->tv_sec == 0 && tsp->tv_nsec == 0)
sbt = 0;
else {
@@ -1619,7 +1651,7 @@ kern_poll(struct thread *td, struct pollfd *ufds, u_int nfds,
int error;
if (kern_poll_maxfds(nfds))
- return (EINVAL);
+ return (EXTERROR(EINVAL, "too large nfds"));
if (nfds > nitems(stackfds))
kfds = mallocarray(nfds, sizeof(*kfds), M_TEMP, M_WAITOK);
else
@@ -1796,7 +1828,7 @@ selsocket(struct socket *so, int events, struct timeval *tvp, struct thread *td)
rtv = *tvp;
if (rtv.tv_sec < 0 || rtv.tv_usec < 0 ||
rtv.tv_usec >= 1000000)
- return (EINVAL);
+ return (EXTERROR(EINVAL, "invalid timeval"));
if (!timevalisset(&rtv))
asbt = 0;
else if (rtv.tv_sec <= INT32_MAX) {
@@ -2173,7 +2205,7 @@ kern_kcmp(struct thread *td, pid_t pid1, pid_t pid2, int type,
(uintptr_t)p2->p_vmspace);
break;
default:
- error = EINVAL;
+ error = EXTERROR(EINVAL, "unknown op");
break;
}
@@ -2237,6 +2269,7 @@ exterr_copyout(struct thread *td)
ue.error = 0;
sz = sizeof(ue.error);
} else {
+ ktrexterr(td);
sz = sizeof(ue) - __offsetof(struct uexterror, error);
}
error = copyout(&ue.error, uloc, sz);
@@ -2277,6 +2310,12 @@ sys_exterrctl(struct thread *td, struct exterrctl_args *uap)
return (EINVAL);
td->td_pflags2 &= ~TDP2_UEXTERR;
return (0);
+ case EXTERRCTL_UD:
+ /*
+ * Important: this code must always return EINVAL and never any
+ * extended error, for testing purposes.
+ */
+ /* FALLTHROUGH */
default:
return (EINVAL);
}
@@ -2297,7 +2336,6 @@ exterr_set(int eerror, int category, const char *mmsg, uintptr_t pp1,
td->td_kexterr.p1 = pp1;
td->td_kexterr.p2 = pp2;
td->td_kexterr.src_line = line;
- ktrexterr(td);
}
return (eerror);
}
diff --git a/sys/kern/sys_pipe.c b/sys/kern/sys_pipe.c
index 9340779918a2..ed651da96b14 100644
--- a/sys/kern/sys_pipe.c
+++ b/sys/kern/sys_pipe.c
@@ -548,7 +548,7 @@ sys_pipe2(struct thread *td, struct pipe2_args *uap)
{
int error, fildes[2];
- if (uap->flags & ~(O_CLOEXEC | O_NONBLOCK))
+ if ((uap->flags & ~(O_CLOEXEC | O_CLOFORK | O_NONBLOCK)) != 0)
return (EINVAL);
error = kern_pipe(td, fildes, uap->flags, NULL, NULL);
if (error)
diff --git a/sys/kern/syscalls.c b/sys/kern/syscalls.c
index fa36cc824078..90a4f3a7dad8 100644
--- a/sys/kern/syscalls.c
+++ b/sys/kern/syscalls.c
@@ -598,4 +598,6 @@ const char *syscallnames[] = {
"fchroot", /* 590 = fchroot */
"setcred", /* 591 = setcred */
"exterrctl", /* 592 = exterrctl */
+ "inotify_add_watch_at", /* 593 = inotify_add_watch_at */
+ "inotify_rm_watch", /* 594 = inotify_rm_watch */
};
diff --git a/sys/kern/syscalls.master b/sys/kern/syscalls.master
index 08b557a7a540..90559fab6086 100644
--- a/sys/kern/syscalls.master
+++ b/sys/kern/syscalls.master
@@ -3349,11 +3349,26 @@
size_t size
);
}
-592 AUE_NULL STD {
+592 AUE_NULL STD|CAPENABLED {
int exterrctl(
u_int op,
u_int flags,
_In_reads_bytes_(4) void *ptr
);
}
+593 AUE_INOTIFY STD|CAPENABLED {
+ int inotify_add_watch_at(
+ int fd,
+ int dfd,
+ _In_z_ const char *path,
+ uint32_t mask
+ );
+ }
+594 AUE_INOTIFY STD|CAPENABLED {
+ int inotify_rm_watch(
+ int fd,
+ int wd
+ );
+ }
+
; vim: syntax=off
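
End to end, the two new system calls are intended to be used from userland roughly as follows. This is a sketch, not ABI documentation: the inotify_init1() wrapper and the exact struct inotify_event layout (wd/mask/cookie/len plus len bytes of NUL-terminated name) are assumptions based on the Linux-compatible API this code mirrors; inotify_add_watch_at() and inotify_rm_watch() match the syscalls.master entries above:

#include <sys/inotify.h>
#include <fcntl.h>	/* AT_FDCWD */
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	char buf[8192], *p;
	ssize_t n;
	int fd, wd;

	fd = inotify_init1(0);		/* assumed libc wrapper */
	if (fd == -1)
		return (1);

	/* Watch ./logs, resolved relative to AT_FDCWD. */
	wd = inotify_add_watch_at(fd, AT_FDCWD, "logs",
	    IN_CREATE | IN_DELETE | IN_MODIFY);
	if (wd == -1)
		return (1);

	/* Each read(2) returns as many whole events as fit. */
	while ((n = read(fd, buf, sizeof(buf))) > 0) {
		for (p = buf; p < buf + n;) {
			struct inotify_event *ev;

			ev = (struct inotify_event *)(void *)p;
			printf("wd %d mask %#x name %s\n", ev->wd,
			    ev->mask, ev->len > 0 ? ev->name : "");
			p += sizeof(*ev) + ev->len;
		}
	}

	(void)inotify_rm_watch(fd, wd);
	(void)close(fd);
	return (0);
}

Per inotify_read() in vfs_inotify.c below, a read whose buffer cannot hold even the first pending event fails with EINVAL, so the buffer should be at least sizeof(struct inotify_event) + NAME_MAX + 1 bytes.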
diff --git a/sys/kern/systrace_args.c b/sys/kern/systrace_args.c
index 15789d3eb5fa..90b21616a558 100644
--- a/sys/kern/systrace_args.c
+++ b/sys/kern/systrace_args.c
@@ -3482,6 +3482,24 @@ systrace_args(int sysnum, void *params, uint64_t *uarg, int *n_args)
*n_args = 3;
break;
}
+ /* inotify_add_watch_at */
+ case 593: {
+ struct inotify_add_watch_at_args *p = params;
+ iarg[a++] = p->fd; /* int */
+ iarg[a++] = p->dfd; /* int */
+ uarg[a++] = (intptr_t)p->path; /* const char * */
+ uarg[a++] = p->mask; /* uint32_t */
+ *n_args = 4;
+ break;
+ }
+ /* inotify_rm_watch */
+ case 594: {
+ struct inotify_rm_watch_args *p = params;
+ iarg[a++] = p->fd; /* int */
+ iarg[a++] = p->wd; /* int */
+ *n_args = 2;
+ break;
+ }
default:
*n_args = 0;
break;
@@ -9317,6 +9335,38 @@ systrace_entry_setargdesc(int sysnum, int ndx, char *desc, size_t descsz)
break;
};
break;
+ /* inotify_add_watch_at */
+ case 593:
+ switch (ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "userland const char *";
+ break;
+ case 3:
+ p = "uint32_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* inotify_rm_watch */
+ case 594:
+ switch (ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
default:
break;
};
@@ -11305,6 +11355,16 @@ systrace_return_setargdesc(int sysnum, int ndx, char *desc, size_t descsz)
if (ndx == 0 || ndx == 1)
p = "int";
break;
+ /* inotify_add_watch_at */
+ case 593:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* inotify_rm_watch */
+ case 594:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
default:
break;
};
diff --git a/sys/kern/sysv_msg.c b/sys/kern/sysv_msg.c
index 11141d197aec..a545a0a54c25 100644
--- a/sys/kern/sysv_msg.c
+++ b/sys/kern/sysv_msg.c
@@ -1724,7 +1724,7 @@ freebsd32_msgsys(struct thread *td, struct freebsd32_msgsys_args *uap)
return (sys_msgsys(td, (struct msgsys_args *)uap));
}
#else
- return (nosys(td, NULL));
+ return (kern_nosys(td, 0));
#endif
}
diff --git a/sys/kern/sysv_sem.c b/sys/kern/sysv_sem.c
index e399517010fc..a99e1a4de14e 100644
--- a/sys/kern/sysv_sem.c
+++ b/sys/kern/sysv_sem.c
@@ -1904,7 +1904,7 @@ freebsd32_semsys(struct thread *td, struct freebsd32_semsys_args *uap)
return (sys_semsys(td, (struct semsys_args *)uap));
}
#else
- return (nosys(td, NULL));
+ return (kern_nosys(td, 0));
#endif
}
diff --git a/sys/kern/sysv_shm.c b/sys/kern/sysv_shm.c
index 60e3fe92a4b7..8d1a469127c6 100644
--- a/sys/kern/sysv_shm.c
+++ b/sys/kern/sysv_shm.c
@@ -1474,7 +1474,7 @@ freebsd32_shmsys(struct thread *td, struct freebsd32_shmsys_args *uap)
return (EINVAL);
}
#else
- return (nosys(td, NULL));
+ return (kern_nosys(td, 0));
#endif
}
diff --git a/sys/kern/uipc_ktls.c b/sys/kern/uipc_ktls.c
index ce09042abdac..66ce1b5a081d 100644
--- a/sys/kern/uipc_ktls.c
+++ b/sys/kern/uipc_ktls.c
@@ -1207,7 +1207,7 @@ sb_mark_notready(struct sockbuf *sb)
for (; m != NULL; m = m->m_next) {
KASSERT(m->m_nextpkt == NULL, ("%s: m_nextpkt != NULL",
__func__));
- KASSERT((m->m_flags & M_NOTAVAIL) == 0, ("%s: mbuf not avail",
+ KASSERT((m->m_flags & M_NOTREADY) == 0, ("%s: mbuf not ready",
__func__));
KASSERT(sb->sb_acc >= m->m_len, ("%s: sb_acc < m->m_len",
__func__));
diff --git a/sys/kern/uipc_shm.c b/sys/kern/uipc_shm.c
index 6f83b875a6b6..85fe48ddd466 100644
--- a/sys/kern/uipc_shm.c
+++ b/sys/kern/uipc_shm.c
@@ -1134,10 +1134,10 @@ shm_doremove(struct shm_mapping *map)
int
kern_shm_open2(struct thread *td, const char *userpath, int flags, mode_t mode,
- int shmflags, struct filecaps *fcaps, const char *name __unused)
+ int shmflags, struct filecaps *fcaps, const char *name __unused,
+ struct shmfd *shmfd)
{
struct pwddesc *pdp;
- struct shmfd *shmfd;
struct file *fp;
char *path;
void *rl_cookie;
@@ -1214,23 +1214,41 @@ kern_shm_open2(struct thread *td, const char *userpath, int flags, mode_t mode,
if (error != 0)
goto outnofp;
- /* A SHM_ANON path pointer creates an anonymous object. */
+ /*
+ * A SHM_ANON path pointer creates an anonymous object. We allow other
+ * parts of the kernel to pre-populate a shmfd and then materialize an
+ * fd for it here as a means to pass data back up to userland. This
+ * doesn't really make sense for named shm objects, but it makes plenty
+ * of sense for anonymous objects.
+ */
if (userpath == SHM_ANON) {
- /* A read-only anonymous object is pointless. */
- if ((flags & O_ACCMODE) == O_RDONLY) {
- error = EINVAL;
- goto out;
- }
- shmfd = shm_alloc(td->td_ucred, cmode, largepage);
- if (shmfd == NULL) {
- error = ENOMEM;
- goto out;
+ if (shmfd != NULL) {
+ shm_hold(shmfd);
+ } else {
+ /*
+ * A read-only anonymous object is pointless, unless it
+ * was pre-populated by the kernel with the expectation
+ * that a shmfd would later be created for userland to
+ * access it through.
+ */
+ if ((flags & O_ACCMODE) == O_RDONLY) {
+ error = EINVAL;
+ goto out;
+ }
+ shmfd = shm_alloc(td->td_ucred, cmode, largepage);
+ if (shmfd == NULL) {
+ error = ENOMEM;
+ goto out;
+ }
+
+ shmfd->shm_seals = initial_seals;
+ shmfd->shm_flags = shmflags;
}
- shmfd->shm_seals = initial_seals;
- shmfd->shm_flags = shmflags;
} else {
fnv = fnv_32_str(path, FNV1_32_INIT);
sx_xlock(&shm_dict_lock);
+
+ MPASS(shmfd == NULL);
shmfd = shm_lookup(path, fnv);
if (shmfd == NULL) {
/* Object does not yet exist, create it if requested. */
@@ -2173,7 +2191,7 @@ kern_shm_open(struct thread *td, const char *path, int flags, mode_t mode,
struct filecaps *caps)
{
- return (kern_shm_open2(td, path, flags, mode, 0, caps, NULL));
+ return (kern_shm_open2(td, path, flags, mode, 0, caps, NULL, NULL));
}
/*
@@ -2191,7 +2209,7 @@ sys_shm_open2(struct thread *td, struct shm_open2_args *uap)
{
return (kern_shm_open2(td, uap->path, uap->flags, uap->mode,
- uap->shmflags, NULL, uap->name));
+ uap->shmflags, NULL, uap->name, NULL));
}
int
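
A hedged sketch of the kernel-side path the new shmfd argument enables: a subsystem pre-populates an anonymous shmfd and then materializes a descriptor for userland through kern_shm_open2(). Only the updated signature comes from this diff; the helper and its caller are invented:

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/mman.h>		/* SHM_ANON */
#include <sys/proc.h>
#include <sys/syscallsubr.h>	/* assumed home of the prototype */

/*
 * Hypothetical helper; assumes 'shmfd' was already created (e.g. via
 * shm_alloc()) and filled in.  kern_shm_open2() takes its own reference
 * with shm_hold(), per the hunk above.
 */
static int
example_export_shmfd(struct thread *td, struct shmfd *shmfd)
{
	int error;

	error = kern_shm_open2(td, SHM_ANON, O_RDWR, 0, 0, NULL, NULL,
	    shmfd);
	/* On success, td->td_retval[0] holds the new fd for userland. */
	return (error);
}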
diff --git a/sys/kern/uipc_sockbuf.c b/sys/kern/uipc_sockbuf.c
index ec00878cd9a5..745702bd4a4f 100644
--- a/sys/kern/uipc_sockbuf.c
+++ b/sys/kern/uipc_sockbuf.c
@@ -195,14 +195,14 @@ int
sbready(struct sockbuf *sb, struct mbuf *m0, int count)
{
struct mbuf *m;
- u_int blocker;
+ bool blocker;
SOCKBUF_LOCK_ASSERT(sb);
KASSERT(sb->sb_fnrdy != NULL, ("%s: sb %p NULL fnrdy", __func__, sb));
KASSERT(count > 0, ("%s: invalid count %d", __func__, count));
m = m0;
- blocker = (sb->sb_fnrdy == m) ? M_BLOCKED : 0;
+ blocker = (sb->sb_fnrdy == m);
while (count > 0) {
KASSERT(m->m_flags & M_NOTREADY,
@@ -217,8 +217,7 @@ sbready(struct sockbuf *sb, struct mbuf *m0, int count)
m->m_epg_nrdy = 0;
} else
count--;
-
- m->m_flags &= ~(M_NOTREADY | blocker);
+ m->m_flags &= ~M_NOTREADY;
if (blocker)
sb->sb_acc += m->m_len;
m = m->m_next;
@@ -240,12 +239,8 @@ sbready(struct sockbuf *sb, struct mbuf *m0, int count)
}
/* This one was blocking all the queue. */
- for (; m && (m->m_flags & M_NOTREADY) == 0; m = m->m_next) {
- KASSERT(m->m_flags & M_BLOCKED,
- ("%s: m %p !M_BLOCKED", __func__, m));
- m->m_flags &= ~M_BLOCKED;
+ for (; m && (m->m_flags & M_NOTREADY) == 0; m = m->m_next)
sb->sb_acc += m->m_len;
- }
sb->sb_fnrdy = m;
sbready_compress(sb, m0, m);
@@ -269,8 +264,7 @@ sballoc(struct sockbuf *sb, struct mbuf *m)
sb->sb_fnrdy = m;
else
sb->sb_acc += m->m_len;
- } else
- m->m_flags |= M_BLOCKED;
+ }
if (m->m_type != MT_DATA && m->m_type != MT_OOBDATA)
sb->sb_ctl += m->m_len;
@@ -287,29 +281,29 @@ sballoc(struct sockbuf *sb, struct mbuf *m)
void
sbfree(struct sockbuf *sb, struct mbuf *m)
{
+ struct mbuf *n;
#if 0 /* XXX: not yet: soclose() call path comes here w/o lock. */
SOCKBUF_LOCK_ASSERT(sb);
#endif
-
sb->sb_ccc -= m->m_len;
- if (!(m->m_flags & M_NOTAVAIL))
- sb->sb_acc -= m->m_len;
-
if (m == sb->sb_fnrdy) {
- struct mbuf *n;
-
KASSERT(m->m_flags & M_NOTREADY,
("%s: m %p !M_NOTREADY", __func__, m));
n = m->m_next;
while (n != NULL && !(n->m_flags & M_NOTREADY)) {
- n->m_flags &= ~M_BLOCKED;
sb->sb_acc += n->m_len;
n = n->m_next;
}
sb->sb_fnrdy = n;
+ } else {
+ /* Assert that mbuf is not behind sb_fnrdy. */
+ for (n = sb->sb_fnrdy; n != NULL; n = n->m_next)
+ KASSERT(n != m, ("%s: sb %p freeing %p behind sb_fnrdy",
+ __func__, sb, m));
+ sb->sb_acc -= m->m_len;
}
if (m->m_type != MT_DATA && m->m_type != MT_OOBDATA)
@@ -779,6 +773,7 @@ sbsetopt(struct socket *so, struct sockopt *sopt)
* high-water.
*/
*lowat = (cc > *hiwat) ? *hiwat : cc;
+ *flags &= ~SB_AUTOLOWAT;
break;
}
@@ -1128,13 +1123,7 @@ sbcheck(struct sockbuf *sb, const char *file, int line)
}
fnrdy = m;
}
- if (fnrdy) {
- if (!(m->m_flags & M_NOTAVAIL)) {
- printf("sb %p: fnrdy %p, m %p is avail\n",
- sb, sb->sb_fnrdy, m);
- goto fail;
- }
- } else
+ if (fnrdy == NULL)
acc += m->m_len;
ccc += m->m_len;
mbcnt += MSIZE;
@@ -1601,8 +1590,8 @@ sbcut_internal(struct sockbuf *sb, int len)
next = m->m_nextpkt;
}
if (m->m_len > len) {
- KASSERT(!(m->m_flags & M_NOTAVAIL),
- ("%s: m %p M_NOTAVAIL", __func__, m));
+ KASSERT(!(m->m_flags & M_NOTREADY),
+ ("%s: m %p M_NOTREADY", __func__, m));
m->m_len -= len;
m->m_data += len;
sb->sb_ccc -= len;
diff --git a/sys/kern/uipc_socket.c b/sys/kern/uipc_socket.c
index 6c9eb7139cd1..fe2d8d056062 100644
--- a/sys/kern/uipc_socket.c
+++ b/sys/kern/uipc_socket.c
@@ -1211,7 +1211,8 @@ solisten_clone(struct socket *head)
so->so_rcv.sb_timeo = head->sol_sbrcv_timeo;
so->so_snd.sb_timeo = head->sol_sbsnd_timeo;
so->so_rcv.sb_flags = head->sol_sbrcv_flags & SB_AUTOSIZE;
- so->so_snd.sb_flags = head->sol_sbsnd_flags & SB_AUTOSIZE;
+ so->so_snd.sb_flags = head->sol_sbsnd_flags &
+ (SB_AUTOSIZE | SB_AUTOLOWAT);
if ((so->so_proto->pr_flags & PR_SOCKBUF) == 0) {
so->so_snd.sb_mtx = &so->so_snd_mtx;
so->so_rcv.sb_mtx = &so->so_rcv_mtx;
@@ -2988,8 +2989,8 @@ dontblock:
*/
moff = 0;
offset = 0;
- while (m != NULL && !(m->m_flags & M_NOTAVAIL) && uio->uio_resid > 0
- && error == 0) {
+ while (m != NULL && !(m->m_flags & M_NOTREADY) && uio->uio_resid > 0 &&
+ error == 0) {
/*
* If the type of mbuf has changed since the last mbuf
* examined ('type'), end the receive operation.
@@ -3341,7 +3342,7 @@ deliver:
for (m = sb->sb_mb;
m != NULL && m->m_len <= len;
m = m->m_next) {
- KASSERT(!(m->m_flags & M_NOTAVAIL),
+ KASSERT(!(m->m_flags & M_NOTREADY),
("%s: m %p not available", __func__, m));
len -= m->m_len;
uio->uio_resid -= m->m_len;
@@ -4514,6 +4515,9 @@ sokqfilter_generic(struct socket *so, struct knote *kn)
SOCK_BUF_LOCK(so, which);
knlist_add(knl, kn, 1);
sb->sb_flags |= SB_KNOTE;
+ if ((kn->kn_sfflags & NOTE_LOWAT) &&
+ (sb->sb_flags & SB_AUTOLOWAT))
+ sb->sb_flags &= ~SB_AUTOLOWAT;
SOCK_BUF_UNLOCK(so, which);
}
SOCK_UNLOCK(so);
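
For context, the knote hunk above means a consumer that sets its own low-water mark keeps it: registering EVFILT_READ with NOTE_LOWAT clears SB_AUTOLOWAT on that buffer, disabling the new auto-tuning. A minimal sketch using the standard kevent(2) API (the function and values are invented; only SB_AUTOLOWAT is new in this diff):

#include <sys/types.h>
#include <sys/event.h>
#include <err.h>

/* Require 4 KB on 'sock' before EVFILT_READ fires on kqueue 'kq'. */
static void
pin_read_lowat(int kq, int sock)
{
	struct kevent kev;

	EV_SET(&kev, sock, EVFILT_READ, EV_ADD, NOTE_LOWAT, 4096, NULL);
	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
		err(1, "kevent");
}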
diff --git a/sys/kern/uipc_syscalls.c b/sys/kern/uipc_syscalls.c
index ad8485028987..133724ac76c5 100644
--- a/sys/kern/uipc_syscalls.c
+++ b/sys/kern/uipc_syscalls.c
@@ -151,6 +151,10 @@ kern_socket(struct thread *td, int domain, int type, int protocol)
type &= ~SOCK_CLOEXEC;
oflag |= O_CLOEXEC;
}
+ if ((type & SOCK_CLOFORK) != 0) {
+ type &= ~SOCK_CLOFORK;
+ oflag |= O_CLOFORK;
+ }
if ((type & SOCK_NONBLOCK) != 0) {
type &= ~SOCK_NONBLOCK;
fflag |= FNONBLOCK;
@@ -352,7 +356,8 @@ kern_accept4(struct thread *td, int s, struct sockaddr *sa, int flags,
goto done;
#endif
error = falloc_caps(td, &nfp, &fd,
- (flags & SOCK_CLOEXEC) ? O_CLOEXEC : 0, &fcaps);
+ ((flags & SOCK_CLOEXEC) != 0 ? O_CLOEXEC : 0) |
+ ((flags & SOCK_CLOFORK) != 0 ? O_CLOFORK : 0), &fcaps);
if (error != 0)
goto done;
SOCK_LOCK(head);
@@ -435,7 +440,7 @@ int
sys_accept4(struct thread *td, struct accept4_args *uap)
{
- if (uap->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
+ if ((uap->flags & ~(SOCK_CLOEXEC | SOCK_CLOFORK | SOCK_NONBLOCK)) != 0)
return (EINVAL);
return (accept1(td, uap->s, uap->name, uap->anamelen, uap->flags));
@@ -557,6 +562,10 @@ kern_socketpair(struct thread *td, int domain, int type, int protocol,
type &= ~SOCK_CLOEXEC;
oflag |= O_CLOEXEC;
}
+ if ((type & SOCK_CLOFORK) != 0) {
+ type &= ~SOCK_CLOFORK;
+ oflag |= O_CLOFORK;
+ }
if ((type & SOCK_NONBLOCK) != 0) {
type &= ~SOCK_NONBLOCK;
fflag |= FNONBLOCK;
diff --git a/sys/kern/uipc_usrreq.c b/sys/kern/uipc_usrreq.c
index 72bd0246db11..0056dac65c7d 100644
--- a/sys/kern/uipc_usrreq.c
+++ b/sys/kern/uipc_usrreq.c
@@ -3463,7 +3463,8 @@ unp_externalize(struct mbuf *control, struct mbuf **controlp, int flags)
UNP_LINK_UNLOCK_ASSERT();
- fdflags = (flags & MSG_CMSG_CLOEXEC) ? O_CLOEXEC : 0;
+ fdflags = ((flags & MSG_CMSG_CLOEXEC) ? O_CLOEXEC : 0) |
+ ((flags & MSG_CMSG_CLOFORK) ? O_CLOFORK : 0);
error = 0;
if (controlp != NULL) /* controlp == NULL => free control messages */
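
Taken together with the uipc_syscalls.c hunks above, userland can now mark sockets close-on-fork at creation, accept, and fd-passing time. A small sketch (the scenario is invented; the flags are the ones added in this diff):

#include <sys/socket.h>

/*
 * SOCK_CLOFORK keeps the descriptor out of fork(2) children, the
 * close-on-fork analogue of SOCK_CLOEXEC; MSG_CMSG_CLOFORK does the
 * same for descriptors received via SCM_RIGHTS.
 */
static int
make_private_listener(void)
{
	return (socket(AF_INET, SOCK_STREAM | SOCK_CLOFORK, 0));
}

static int
accept_private(int s)
{
	return (accept4(s, NULL, NULL, SOCK_CLOEXEC | SOCK_CLOFORK));
}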
diff --git a/sys/kern/vfs_aio.c b/sys/kern/vfs_aio.c
index 97dc854c9386..02973146068d 100644
--- a/sys/kern/vfs_aio.c
+++ b/sys/kern/vfs_aio.c
@@ -301,7 +301,7 @@ static TAILQ_HEAD(,kaiocb) aio_jobs; /* (c) Async job list */
static struct unrhdr *aiod_unr;
static void aio_biocleanup(struct bio *bp);
-void aio_init_aioinfo(struct proc *p);
+static int aio_init_aioinfo(struct proc *p);
static int aio_onceonly(void);
static int aio_free_entry(struct kaiocb *job);
static void aio_process_rw(struct kaiocb *job);
@@ -309,7 +309,7 @@ static void aio_process_sync(struct kaiocb *job);
static void aio_process_mlock(struct kaiocb *job);
static void aio_schedule_fsync(void *context, int pending);
static int aio_newproc(int *);
-int aio_aqueue(struct thread *td, struct aiocb *ujob,
+static int aio_aqueue(struct thread *td, struct aiocb *ujob,
struct aioliojob *lio, int type, struct aiocb_ops *ops);
static int aio_queue_file(struct file *fp, struct kaiocb *job);
static void aio_biowakeup(struct bio *bp);
@@ -422,10 +422,11 @@ aio_onceonly(void)
* Init the per-process aioinfo structure. The aioinfo limits are set
* per-process for user limit (resource) management.
*/
-void
+static int
aio_init_aioinfo(struct proc *p)
{
struct kaioinfo *ki;
+ int error;
ki = uma_zalloc(kaio_zone, M_WAITOK);
mtx_init(&ki->kaio_mtx, "aiomtx", NULL, MTX_DEF | MTX_NEW);
@@ -451,8 +452,20 @@ aio_init_aioinfo(struct proc *p)
uma_zfree(kaio_zone, ki);
}
- while (num_aio_procs < MIN(target_aio_procs, max_aio_procs))
- aio_newproc(NULL);
+ error = 0;
+ while (num_aio_procs < MIN(target_aio_procs, max_aio_procs)) {
+ error = aio_newproc(NULL);
+ if (error != 0) {
+ /*
+ * At least one worker is enough to have AIO
+ * functional. Clear error in that case.
+ */
+ if (num_aio_procs > 0)
+ error = 0;
+ break;
+ }
+ }
+ return (error);
}
static int
@@ -1476,7 +1489,7 @@ static struct aiocb_ops aiocb_ops_osigevent = {
* Queue a new AIO request. Choosing either the threaded or direct bio VCHR
* technique is done in this code.
*/
-int
+static int
aio_aqueue(struct thread *td, struct aiocb *ujob, struct aioliojob *lj,
int type, struct aiocb_ops *ops)
{
@@ -1490,8 +1503,11 @@ aio_aqueue(struct thread *td, struct aiocb *ujob, struct aioliojob *lj,
int fd, kqfd;
u_short evflags;
- if (p->p_aioinfo == NULL)
- aio_init_aioinfo(p);
+ if (p->p_aioinfo == NULL) {
+ error = aio_init_aioinfo(p);
+ if (error != 0)
+ goto err1;
+ }
ki = p->p_aioinfo;
@@ -2213,8 +2229,11 @@ kern_lio_listio(struct thread *td, int mode, struct aiocb * const *uacb_list,
if (nent < 0 || nent > max_aio_queue_per_proc)
return (EINVAL);
- if (p->p_aioinfo == NULL)
- aio_init_aioinfo(p);
+ if (p->p_aioinfo == NULL) {
+ error = aio_init_aioinfo(p);
+ if (error != 0)
+ return (error);
+ }
ki = p->p_aioinfo;
@@ -2503,8 +2522,11 @@ kern_aio_waitcomplete(struct thread *td, struct aiocb **ujobp,
timo = tvtohz(&atv);
}
- if (p->p_aioinfo == NULL)
- aio_init_aioinfo(p);
+ if (p->p_aioinfo == NULL) {
+ error = aio_init_aioinfo(p);
+ if (error != 0)
+ return (error);
+ }
ki = p->p_aioinfo;
error = 0;
diff --git a/sys/kern/vfs_cache.c b/sys/kern/vfs_cache.c
index 883beaf6d1da..89c1d779f04c 100644
--- a/sys/kern/vfs_cache.c
+++ b/sys/kern/vfs_cache.c
@@ -41,6 +41,7 @@
#include <sys/counter.h>
#include <sys/filedesc.h>
#include <sys/fnv_hash.h>
+#include <sys/inotify.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
@@ -331,7 +332,8 @@ SDT_PROBE_DEFINE2(vfs, namecache, evict_negative, done, "struct vnode *",
"char *");
SDT_PROBE_DEFINE1(vfs, namecache, symlink, alloc__fail, "size_t");
-SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool");
+SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata *", "int",
+ "enum cache_fpl_status");
SDT_PROBE_DECLARE(vfs, namei, lookup, entry);
SDT_PROBE_DECLARE(vfs, namei, lookup, return);
@@ -2629,6 +2631,14 @@ cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
atomic_store_ptr(&dvp->v_cache_dd, ncp);
} else if (vp != NULL) {
/*
+ * Take the slow path in INOTIFY(). This flag will be lazily
+ * cleared by cache_vop_inotify() once all directories referring
+ * to vp are unwatched.
+ */
+ if (__predict_false((vn_irflag_read(dvp) & VIRF_INOTIFY) != 0))
+ vn_irflag_set_cond(vp, VIRF_INOTIFY_PARENT);
+
+ /*
* For this case, the cache entry maps both the
* directory name in it and the name ".." for the
* directory's parent.
@@ -4008,6 +4018,56 @@ out:
return (error);
}
+void
+cache_vop_inotify(struct vnode *vp, int event, uint32_t cookie)
+{
+ struct mtx *vlp;
+ struct namecache *ncp;
+ int isdir;
+ bool logged, self;
+
+ isdir = vp->v_type == VDIR ? IN_ISDIR : 0;
+ self = (vn_irflag_read(vp) & VIRF_INOTIFY) != 0 &&
+ (vp->v_type != VDIR || (event & ~_IN_DIR_EVENTS) != 0);
+
+ if (self) {
+ int selfevent;
+
+ if (event == _IN_ATTRIB_LINKCOUNT)
+ selfevent = IN_ATTRIB;
+ else
+ selfevent = event;
+ inotify_log(vp, NULL, 0, selfevent | isdir, cookie);
+ }
+ if ((event & IN_ALL_EVENTS) == 0)
+ return;
+
+ logged = false;
+ vlp = VP2VNODELOCK(vp);
+ mtx_lock(vlp);
+ TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) {
+ if ((ncp->nc_flag & NCF_ISDOTDOT) != 0)
+ continue;
+ if ((vn_irflag_read(ncp->nc_dvp) & VIRF_INOTIFY) != 0) {
+ /*
+ * XXX-MJ if the vnode has two links in the same
+ * dir, we'll log the same event twice.
+ */
+ inotify_log(ncp->nc_dvp, ncp->nc_name, ncp->nc_nlen,
+ event | isdir, cookie);
+ logged = true;
+ }
+ }
+ if (!logged && (vn_irflag_read(vp) & VIRF_INOTIFY_PARENT) != 0) {
+ /*
+ * We didn't find a watched directory that contains this vnode,
+ * so stop calling VOP_INOTIFY for operations on the vnode.
+ */
+ vn_irflag_unset(vp, VIRF_INOTIFY_PARENT);
+ }
+ mtx_unlock(vlp);
+}
+
#ifdef DDB
static void
db_print_vpath(struct vnode *vp)
@@ -6361,15 +6421,11 @@ out:
cache_fpl_smr_assert_not_entered(&fpl);
cache_fpl_assert_status(&fpl);
*status = fpl.status;
- if (SDT_PROBES_ENABLED()) {
- SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
- if (fpl.status == CACHE_FPL_STATUS_HANDLED)
- SDT_PROBE4(vfs, namei, lookup, return, error, ndp->ni_vp, true,
- ndp);
- }
-
+ SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
if (__predict_true(fpl.status == CACHE_FPL_STATUS_HANDLED)) {
MPASS(error != CACHE_FPL_FAILED);
+ SDT_PROBE4(vfs, namei, lookup, return, error, ndp->ni_vp, true,
+ ndp);
if (error != 0) {
cache_fpl_cleanup_cnp(fpl.cnp);
MPASS(fpl.dvp == NULL);
diff --git a/sys/kern/vfs_default.c b/sys/kern/vfs_default.c
index be49c0887609..fd6202a1424c 100644
--- a/sys/kern/vfs_default.c
+++ b/sys/kern/vfs_default.c
@@ -39,6 +39,7 @@
#include <sys/conf.h>
#include <sys/event.h>
#include <sys/filio.h>
+#include <sys/inotify.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/lock.h>
@@ -119,6 +120,8 @@ struct vop_vector default_vnodeops = {
.vop_getwritemount = vop_stdgetwritemount,
.vop_inactive = VOP_NULL,
.vop_need_inactive = vop_stdneed_inactive,
+ .vop_inotify = vop_stdinotify,
+ .vop_inotify_add_watch = vop_stdinotify_add_watch,
.vop_ioctl = vop_stdioctl,
.vop_kqfilter = vop_stdkqfilter,
.vop_islocked = vop_stdislocked,
@@ -453,6 +456,7 @@ vop_stdpathconf(struct vop_pathconf_args *ap)
case _PC_MAC_PRESENT:
case _PC_NAMEDATTR_ENABLED:
case _PC_HAS_NAMEDATTR:
+ case _PC_HAS_HIDDENSYSTEM:
*ap->a_retval = 0;
return (0);
default:
@@ -1306,6 +1310,20 @@ vop_stdneed_inactive(struct vop_need_inactive_args *ap)
}
int
+vop_stdinotify(struct vop_inotify_args *ap)
+{
+ vn_inotify(ap->a_vp, ap->a_dvp, ap->a_cnp, ap->a_event, ap->a_cookie);
+ return (0);
+}
+
+int
+vop_stdinotify_add_watch(struct vop_inotify_add_watch_args *ap)
+{
+ return (vn_inotify_add_watch(ap->a_vp, ap->a_sc, ap->a_mask,
+ ap->a_wdp, ap->a_td));
+}
+
+int
vop_stdioctl(struct vop_ioctl_args *ap)
{
struct vnode *vp;
diff --git a/sys/kern/vfs_inotify.c b/sys/kern/vfs_inotify.c
new file mode 100644
index 000000000000..d3cd0d1f9832
--- /dev/null
+++ b/sys/kern/vfs_inotify.c
@@ -0,0 +1,1011 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2025 Klara, Inc.
+ */
+
+#include "opt_ktrace.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/caprights.h>
+#include <sys/counter.h>
+#include <sys/dirent.h>
+#define EXTERR_CATEGORY EXTERR_CAT_INOTIFY
+#include <sys/exterrvar.h>
+#include <sys/fcntl.h>
+#include <sys/file.h>
+#include <sys/filio.h>
+#include <sys/inotify.h>
+#include <sys/kernel.h>
+#include <sys/ktrace.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/namei.h>
+#include <sys/poll.h>
+#include <sys/proc.h>
+#include <sys/queue.h>
+#include <sys/resourcevar.h>
+#include <sys/selinfo.h>
+#include <sys/stat.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysctl.h>
+#include <sys/sysent.h>
+#include <sys/syslimits.h>
+#include <sys/sysproto.h>
+#include <sys/tree.h>
+#include <sys/user.h>
+#include <sys/vnode.h>
+
+uint32_t inotify_rename_cookie;
+
+static SYSCTL_NODE(_vfs, OID_AUTO, inotify, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
+ "inotify configuration");
+
+static int inotify_max_queued_events = 16384;
+SYSCTL_INT(_vfs_inotify, OID_AUTO, max_queued_events, CTLFLAG_RWTUN,
+ &inotify_max_queued_events, 0,
+ "Maximum number of events to queue on an inotify descriptor");
+
+static int inotify_max_user_instances = 256;
+SYSCTL_INT(_vfs_inotify, OID_AUTO, max_user_instances, CTLFLAG_RWTUN,
+ &inotify_max_user_instances, 0,
+ "Maximum number of inotify descriptors per user");
+
+static int inotify_max_user_watches;
+SYSCTL_INT(_vfs_inotify, OID_AUTO, max_user_watches, CTLFLAG_RWTUN,
+ &inotify_max_user_watches, 0,
+ "Maximum number of inotify watches per user");
+
+static int inotify_max_watches;
+SYSCTL_INT(_vfs_inotify, OID_AUTO, max_watches, CTLFLAG_RWTUN,
+ &inotify_max_watches, 0,
+ "Maximum number of inotify watches system-wide");
+
+static int inotify_watches;
+SYSCTL_INT(_vfs_inotify, OID_AUTO, watches, CTLFLAG_RD,
+ &inotify_watches, 0,
+ "Total number of inotify watches currently in use");
+
+static int inotify_coalesce = 1;
+SYSCTL_INT(_vfs_inotify, OID_AUTO, coalesce, CTLFLAG_RWTUN,
+ &inotify_coalesce, 0,
+ "Coalesce inotify events when possible");
+
+static COUNTER_U64_DEFINE_EARLY(inotify_event_drops);
+SYSCTL_COUNTER_U64(_vfs_inotify, OID_AUTO, event_drops, CTLFLAG_RD,
+ &inotify_event_drops,
+ "Number of inotify events dropped due to limits or allocation failures");
+
+static fo_rdwr_t inotify_read;
+static fo_ioctl_t inotify_ioctl;
+static fo_poll_t inotify_poll;
+static fo_kqfilter_t inotify_kqfilter;
+static fo_stat_t inotify_stat;
+static fo_close_t inotify_close;
+static fo_fill_kinfo_t inotify_fill_kinfo;
+
+static const struct fileops inotifyfdops = {
+ .fo_read = inotify_read,
+ .fo_write = invfo_rdwr,
+ .fo_truncate = invfo_truncate,
+ .fo_ioctl = inotify_ioctl,
+ .fo_poll = inotify_poll,
+ .fo_kqfilter = inotify_kqfilter,
+ .fo_stat = inotify_stat,
+ .fo_close = inotify_close,
+ .fo_chmod = invfo_chmod,
+ .fo_chown = invfo_chown,
+ .fo_sendfile = invfo_sendfile,
+ .fo_fill_kinfo = inotify_fill_kinfo,
+ .fo_cmp = file_kcmp_generic,
+ .fo_flags = DFLAG_PASSABLE,
+};
+
+static void filt_inotifydetach(struct knote *kn);
+static int filt_inotifyevent(struct knote *kn, long hint);
+
+static const struct filterops inotify_rfiltops = {
+ .f_isfd = 1,
+ .f_detach = filt_inotifydetach,
+ .f_event = filt_inotifyevent,
+};
+
+static MALLOC_DEFINE(M_INOTIFY, "inotify", "inotify data structures");
+
+struct inotify_record {
+ STAILQ_ENTRY(inotify_record) link;
+ struct inotify_event ev;
+};
+
+static uint64_t inotify_ino = 1;
+
+/*
+ * On LP64 systems this occupies 64 bytes, so we don't get internal
+ * fragmentation by allocating watches with malloc(9). If the size changes,
+ * consider using a UMA zone to improve memory efficiency.
+ */
+struct inotify_watch {
+ struct inotify_softc *sc; /* back-pointer */
+ int wd; /* unique ID */
+ uint32_t mask; /* event mask */
+ struct vnode *vp; /* vnode being watched, refed */
+ RB_ENTRY(inotify_watch) ilink; /* inotify linkage */
+ TAILQ_ENTRY(inotify_watch) vlink; /* vnode linkage */
+};
+
+static void
+inotify_init(void *arg __unused)
+{
+ /* Don't let a user hold too many vnodes. */
+ inotify_max_user_watches = desiredvnodes / 3;
+ /* Don't let the system hold too many vnodes. */
+ inotify_max_watches = desiredvnodes / 2;
+}
+SYSINIT(inotify, SI_SUB_VFS, SI_ORDER_ANY, inotify_init, NULL);
+
+static int
+inotify_watch_cmp(const struct inotify_watch *a,
+ const struct inotify_watch *b)
+{
+ if (a->wd < b->wd)
+ return (-1);
+ else if (a->wd > b->wd)
+ return (1);
+ else
+ return (0);
+}
+RB_HEAD(inotify_watch_tree, inotify_watch);
+RB_GENERATE_STATIC(inotify_watch_tree, inotify_watch, ilink, inotify_watch_cmp);
+
+struct inotify_softc {
+ struct mtx lock; /* serialize all softc writes */
+ STAILQ_HEAD(, inotify_record) pending; /* events waiting to be read */
+ struct inotify_record overflow; /* preallocated record */
+ int nextwatch; /* next watch ID to try */
+ int npending; /* number of pending events */
+ size_t nbpending; /* bytes available to read */
+ uint64_t ino; /* unique identifier */
+ struct inotify_watch_tree watches; /* active watches */
+ struct selinfo sel; /* select/poll/kevent info */
+ struct ucred *cred; /* credential ref */
+};
+
+static struct inotify_record *
+inotify_dequeue(struct inotify_softc *sc)
+{
+ struct inotify_record *rec;
+
+ mtx_assert(&sc->lock, MA_OWNED);
+ KASSERT(!STAILQ_EMPTY(&sc->pending),
+ ("%s: queue for %p is empty", __func__, sc));
+
+ rec = STAILQ_FIRST(&sc->pending);
+ STAILQ_REMOVE_HEAD(&sc->pending, link);
+ sc->npending--;
+ sc->nbpending -= sizeof(rec->ev) + rec->ev.len;
+ return (rec);
+}
+
+static void
+inotify_enqueue(struct inotify_softc *sc, struct inotify_record *rec, bool head)
+{
+ mtx_assert(&sc->lock, MA_OWNED);
+
+ if (head)
+ STAILQ_INSERT_HEAD(&sc->pending, rec, link);
+ else
+ STAILQ_INSERT_TAIL(&sc->pending, rec, link);
+ sc->npending++;
+ sc->nbpending += sizeof(rec->ev) + rec->ev.len;
+}
+
+static int
+inotify_read(struct file *fp, struct uio *uio, struct ucred *cred, int flags,
+ struct thread *td)
+{
+ struct inotify_softc *sc;
+ struct inotify_record *rec;
+ int error;
+ bool first;
+
+ sc = fp->f_data;
+ error = 0;
+
+ mtx_lock(&sc->lock);
+ while (STAILQ_EMPTY(&sc->pending)) {
+ if ((flags & IO_NDELAY) != 0 || (fp->f_flag & FNONBLOCK) != 0) {
+ mtx_unlock(&sc->lock);
+ return (EWOULDBLOCK);
+ }
+ error = msleep(&sc->pending, &sc->lock, PCATCH, "inotify", 0);
+ if (error != 0) {
+ mtx_unlock(&sc->lock);
+ return (error);
+ }
+ }
+ for (first = true; !STAILQ_EMPTY(&sc->pending); first = false) {
+ size_t len;
+
+ rec = inotify_dequeue(sc);
+ len = sizeof(rec->ev) + rec->ev.len;
+ if (uio->uio_resid < (ssize_t)len) {
+ inotify_enqueue(sc, rec, true);
+ if (first) {
+ error = EXTERROR(EINVAL,
+ "read buffer is too small");
+ }
+ break;
+ }
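+ /* Drop the lock around uiomove(), which may fault and sleep. */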
+ mtx_unlock(&sc->lock);
+ error = uiomove(&rec->ev, len, uio);
+#ifdef KTRACE
+ if (error == 0 && KTRPOINT(td, KTR_STRUCT))
+ ktrstruct("inotify", &rec->ev, len);
+#endif
+ mtx_lock(&sc->lock);
+ if (error != 0) {
+ inotify_enqueue(sc, rec, true);
+ mtx_unlock(&sc->lock);
+ return (error);
+ }
+ if (rec == &sc->overflow) {
+ /*
+ * Signal to inotify_queue_record() that the overflow
+ * record can be reused.
+ */
+ memset(rec, 0, sizeof(*rec));
+ } else {
+ free(rec, M_INOTIFY);
+ }
+ }
+ mtx_unlock(&sc->lock);
+ return (error);
+}
+
+static int
+inotify_ioctl(struct file *fp, u_long com, void *data, struct ucred *cred,
+ struct thread *td)
+{
+ struct inotify_softc *sc;
+
+ sc = fp->f_data;
+
+ switch (com) {
+ case FIONREAD:
+ *(int *)data = (int)sc->nbpending;
+ return (0);
+ case FIONBIO:
+ case FIOASYNC:
+ return (0);
+ default:
+ return (ENOTTY);
+ }
+}
+
+static int
+inotify_poll(struct file *fp, int events, struct ucred *cred, struct thread *td)
+{
+ struct inotify_softc *sc;
+ int revents;
+
+ sc = fp->f_data;
+ revents = 0;
+
+ mtx_lock(&sc->lock);
+ if ((events & (POLLIN | POLLRDNORM)) != 0 && sc->npending > 0)
+ revents |= events & (POLLIN | POLLRDNORM);
+ else
+ selrecord(td, &sc->sel);
+ mtx_unlock(&sc->lock);
+ return (revents);
+}
+
+static void
+filt_inotifydetach(struct knote *kn)
+{
+ struct inotify_softc *sc;
+
+ sc = kn->kn_hook;
+ knlist_remove(&sc->sel.si_note, kn, 0);
+}
+
+static int
+filt_inotifyevent(struct knote *kn, long hint)
+{
+ struct inotify_softc *sc;
+
+ sc = kn->kn_hook;
+ mtx_assert(&sc->lock, MA_OWNED);
+ kn->kn_data = sc->nbpending;
+ return (kn->kn_data > 0);
+}
+
+static int
+inotify_kqfilter(struct file *fp, struct knote *kn)
+{
+ struct inotify_softc *sc;
+
+ if (kn->kn_filter != EVFILT_READ)
+ return (EINVAL);
+ sc = fp->f_data;
+ kn->kn_fop = &inotify_rfiltops;
+ kn->kn_hook = sc;
+ knlist_add(&sc->sel.si_note, kn, 0);
+ return (0);
+}
+
+static int
+inotify_stat(struct file *fp, struct stat *sb, struct ucred *cred)
+{
+ struct inotify_softc *sc;
+
+ sc = fp->f_data;
+
+ memset(sb, 0, sizeof(*sb));
+ sb->st_mode = S_IFREG | S_IRUSR;
+ sb->st_blksize = sizeof(struct inotify_event) + _IN_NAMESIZE(NAME_MAX);
+ mtx_lock(&sc->lock);
+ sb->st_size = sc->nbpending;
+ sb->st_blocks = sc->npending;
+ sb->st_uid = sc->cred->cr_ruid;
+ sb->st_gid = sc->cred->cr_rgid;
+ sb->st_ino = sc->ino;
+ mtx_unlock(&sc->lock);
+ return (0);
+}
+
+static void
+inotify_unlink_watch_locked(struct inotify_softc *sc, struct inotify_watch *watch)
+{
+ struct vnode *vp;
+
+ vp = watch->vp;
+ mtx_assert(&vp->v_pollinfo->vpi_lock, MA_OWNED);
+
+ atomic_subtract_int(&inotify_watches, 1);
+ (void)chginotifywatchcnt(sc->cred->cr_ruidinfo, -1, 0);
+
+ TAILQ_REMOVE(&vp->v_pollinfo->vpi_inotify, watch, vlink);
+ if (TAILQ_EMPTY(&vp->v_pollinfo->vpi_inotify))
+ vn_irflag_unset(vp, VIRF_INOTIFY);
+}
+
+/*
+ * Assumes that the watch has already been removed from its softc.
+ */
+static void
+inotify_remove_watch(struct inotify_watch *watch)
+{
+ struct inotify_softc *sc;
+ struct vnode *vp;
+
+ sc = watch->sc;
+
+ vp = watch->vp;
+ mtx_lock(&vp->v_pollinfo->vpi_lock);
+ inotify_unlink_watch_locked(sc, watch);
+ mtx_unlock(&vp->v_pollinfo->vpi_lock);
+
+ vrele(vp);
+ free(watch, M_INOTIFY);
+}
+
+static int
+inotify_close(struct file *fp, struct thread *td)
+{
+ struct inotify_softc *sc;
+ struct inotify_record *rec;
+ struct inotify_watch *watch;
+
+ sc = fp->f_data;
+
+ mtx_lock(&sc->lock);
+ (void)chginotifycnt(sc->cred->cr_ruidinfo, -1, 0);
+ while ((watch = RB_MIN(inotify_watch_tree, &sc->watches)) != NULL) {
+ RB_REMOVE(inotify_watch_tree, &sc->watches, watch);
+ mtx_unlock(&sc->lock);
+ inotify_remove_watch(watch);
+ mtx_lock(&sc->lock);
+ }
+ while (!STAILQ_EMPTY(&sc->pending)) {
+ rec = inotify_dequeue(sc);
+ if (rec != &sc->overflow)
+ free(rec, M_INOTIFY);
+ }
+ mtx_unlock(&sc->lock);
+ seldrain(&sc->sel);
+ knlist_destroy(&sc->sel.si_note);
+ mtx_destroy(&sc->lock);
+ crfree(sc->cred);
+ free(sc, M_INOTIFY);
+ return (0);
+}
+
+static int
+inotify_fill_kinfo(struct file *fp, struct kinfo_file *kif,
+ struct filedesc *fdp)
+{
+ struct inotify_softc *sc;
+
+ sc = fp->f_data;
+
+ kif->kf_type = KF_TYPE_INOTIFY;
+ kif->kf_un.kf_inotify.kf_inotify_npending = sc->npending;
+ kif->kf_un.kf_inotify.kf_inotify_nbpending = sc->nbpending;
+ return (0);
+}
+
+int
+inotify_create_file(struct thread *td, struct file *fp, int flags, int *fflagsp)
+{
+ struct inotify_softc *sc;
+ int fflags;
+
+ if ((flags & ~(IN_NONBLOCK | IN_CLOEXEC)) != 0)
+ return (EINVAL);
+
+ if (!chginotifycnt(td->td_ucred->cr_ruidinfo, 1,
+ inotify_max_user_instances))
+ return (EMFILE);
+
+ sc = malloc(sizeof(*sc), M_INOTIFY, M_WAITOK | M_ZERO);
+ sc->nextwatch = 1; /* Watch descriptors start at 1, matching Linux. */
+ STAILQ_INIT(&sc->pending);
+ RB_INIT(&sc->watches);
+ mtx_init(&sc->lock, "inotify", NULL, MTX_DEF);
+ knlist_init_mtx(&sc->sel.si_note, &sc->lock);
+ sc->cred = crhold(td->td_ucred);
+ sc->ino = atomic_fetchadd_64(&inotify_ino, 1);
+
+ fflags = FREAD;
+ if ((flags & IN_NONBLOCK) != 0)
+ fflags |= FNONBLOCK;
+ if ((flags & IN_CLOEXEC) != 0)
+ *fflagsp |= O_CLOEXEC;
+ finit(fp, fflags, DTYPE_INOTIFY, sc, &inotifyfdops);
+
+ return (0);
+}
+
+static struct inotify_record *
+inotify_alloc_record(uint32_t wd, const char *name, size_t namelen, int event,
+ uint32_t cookie, int waitok)
+{
+ struct inotify_event *evp;
+ struct inotify_record *rec;
+
+ rec = malloc(sizeof(*rec) + _IN_NAMESIZE(namelen), M_INOTIFY,
+ waitok | M_ZERO);
+ if (rec == NULL)
+ return (NULL);
+ evp = &rec->ev;
+ evp->wd = wd;
+ evp->mask = event;
+ evp->cookie = cookie;
+ evp->len = _IN_NAMESIZE(namelen);
+ if (name != NULL)
+ memcpy(evp->name, name, namelen);
+ return (rec);
+}
+
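+/*
+ * Check whether the event duplicates the most recently queued one, in which
+ * case it can be dropped.  This matches Linux, which coalesces consecutive
+ * identical events.
+ */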
+static bool
+inotify_can_coalesce(struct inotify_softc *sc, struct inotify_event *evp)
+{
+ struct inotify_record *prev;
+
+ mtx_assert(&sc->lock, MA_OWNED);
+
+ prev = STAILQ_LAST(&sc->pending, inotify_record, link);
+ return (prev != NULL && prev->ev.mask == evp->mask &&
+ prev->ev.wd == evp->wd && prev->ev.cookie == evp->cookie &&
+ prev->ev.len == evp->len &&
+ memcmp(prev->ev.name, evp->name, evp->len) == 0);
+}
+
+static void
+inotify_overflow_event(struct inotify_event *evp)
+{
+ evp->mask = IN_Q_OVERFLOW;
+ evp->wd = -1;
+ evp->cookie = 0;
+ evp->len = 0;
+}
+
+/*
+ * Put an event record on the queue for an inotify descriptor.  Return false
+ * if the record was not enqueued for some reason, true otherwise.
+ */
+static bool
+inotify_queue_record(struct inotify_softc *sc, struct inotify_record *rec)
+{
+ struct inotify_event *evp;
+
+ mtx_assert(&sc->lock, MA_OWNED);
+
+ evp = &rec->ev;
+ if (__predict_false(rec == &sc->overflow)) {
+ /*
+ * Is the overflow record already in the queue? If so, there's
+ * not much else we can do: we're here because a kernel memory
+ * shortage prevented new record allocations.
+ */
+ counter_u64_add(inotify_event_drops, 1);
+ if (evp->mask == IN_Q_OVERFLOW)
+ return (false);
+ inotify_overflow_event(evp);
+ } else {
+ /* Try to coalesce duplicate events. */
+ if (inotify_coalesce && inotify_can_coalesce(sc, evp))
+ return (false);
+
+ /*
+ * Would this one overflow the queue? If so, convert it to an
+ * overflow event and try again to coalesce.
+ */
+ if (sc->npending >= inotify_max_queued_events) {
+ counter_u64_add(inotify_event_drops, 1);
+ inotify_overflow_event(evp);
+ if (inotify_can_coalesce(sc, evp))
+ return (false);
+ }
+ }
+ inotify_enqueue(sc, rec, false);
+ selwakeup(&sc->sel);
+ KNOTE_LOCKED(&sc->sel.si_note, 0);
+ wakeup(&sc->pending);
+ return (true);
+}
+
+static int
+inotify_log_one(struct inotify_watch *watch, const char *name, size_t namelen,
+ int event, uint32_t cookie)
+{
+ struct inotify_watch key;
+ struct inotify_softc *sc;
+ struct inotify_record *rec;
+ int relecount;
+ bool allocfail;
+
+ relecount = 0;
+
+ sc = watch->sc;
+ rec = inotify_alloc_record(watch->wd, name, namelen, event, cookie,
+ M_NOWAIT);
+ if (rec == NULL) {
+ rec = &sc->overflow;
+ allocfail = true;
+ } else {
+ allocfail = false;
+ }
+
+ mtx_lock(&sc->lock);
+ if (!inotify_queue_record(sc, rec) && rec != &sc->overflow)
+ free(rec, M_INOTIFY);
+ if ((watch->mask & IN_ONESHOT) != 0 ||
+ (event & (IN_DELETE_SELF | IN_UNMOUNT)) != 0) {
+ if (!allocfail) {
+ rec = inotify_alloc_record(watch->wd, NULL, 0,
+ IN_IGNORED, 0, M_NOWAIT);
+ if (rec == NULL)
+ rec = &sc->overflow;
+ if (!inotify_queue_record(sc, rec) &&
+ rec != &sc->overflow)
+ free(rec, M_INOTIFY);
+ }
+
+ /*
+ * Remove the watch, taking care to handle races with
+ * inotify_close().
+ */
+ key.wd = watch->wd;
+ if (RB_FIND(inotify_watch_tree, &sc->watches, &key) != NULL) {
+ RB_REMOVE(inotify_watch_tree, &sc->watches, watch);
+ inotify_unlink_watch_locked(sc, watch);
+ free(watch, M_INOTIFY);
+
+ /* Defer vrele() until the locks are dropped. */
+ relecount++;
+ }
+ }
+ mtx_unlock(&sc->lock);
+ return (relecount);
+}
+
+void
+inotify_log(struct vnode *vp, const char *name, size_t namelen, int event,
+ uint32_t cookie)
+{
+ struct inotify_watch *watch, *tmp;
+ int relecount;
+
+ KASSERT((event & ~(IN_ALL_EVENTS | IN_ISDIR | IN_UNMOUNT)) == 0,
+ ("inotify_log: invalid event %#x", event));
+
+ relecount = 0;
+ mtx_lock(&vp->v_pollinfo->vpi_lock);
+ TAILQ_FOREACH_SAFE(watch, &vp->v_pollinfo->vpi_inotify, vlink, tmp) {
+ KASSERT(watch->vp == vp,
+ ("inotify_log: watch %p vp != vp", watch));
+ if ((watch->mask & event) != 0 || event == IN_UNMOUNT) {
+ relecount += inotify_log_one(watch, name, namelen, event,
+ cookie);
+ }
+ }
+ mtx_unlock(&vp->v_pollinfo->vpi_lock);
+
+ for (int i = 0; i < relecount; i++)
+ vrele(vp);
+}
+
+/*
+ * An inotify event occurred on a watched vnode.
+ */
+void
+vn_inotify(struct vnode *vp, struct vnode *dvp, struct componentname *cnp,
+ int event, uint32_t cookie)
+{
+ int isdir;
+
+ VNPASS(vp->v_holdcnt > 0, vp);
+
+ isdir = vp->v_type == VDIR ? IN_ISDIR : 0;
+
+ if (dvp != NULL) {
+ VNPASS(dvp->v_holdcnt > 0, dvp);
+
+ /*
+ * Should we log an event for the vnode itself?
+ */
+ if ((vn_irflag_read(vp) & VIRF_INOTIFY) != 0) {
+ int selfevent;
+
+ switch (event) {
+ case _IN_MOVE_DELETE:
+ case IN_DELETE:
+ /*
+ * IN_DELETE_SELF is only generated when the
+ * last hard link of a file is removed.
+ */
+ selfevent = IN_DELETE_SELF;
+ if (vp->v_type != VDIR) {
+ struct vattr va;
+ int error;
+
+ error = VOP_GETATTR(vp, &va,
+ cnp->cn_cred);
+ if (error == 0 && va.va_nlink != 0)
+ selfevent = 0;
+ }
+ break;
+ case IN_MOVED_FROM:
+ cookie = 0;
+ selfevent = IN_MOVE_SELF;
+ break;
+ case _IN_ATTRIB_LINKCOUNT:
+ selfevent = IN_ATTRIB;
+ break;
+ default:
+ selfevent = event;
+ break;
+ }
+
+ if ((selfevent & ~_IN_DIR_EVENTS) != 0) {
+ inotify_log(vp, NULL, 0, selfevent | isdir,
+ cookie);
+ }
+ }
+
+ /*
+ * Something is watching the directory through which this vnode
+ * was referenced, so we may need to log the event.
+ */
+ if ((event & IN_ALL_EVENTS) != 0 &&
+ (vn_irflag_read(dvp) & VIRF_INOTIFY) != 0) {
+ inotify_log(dvp, cnp->cn_nameptr,
+ cnp->cn_namelen, event | isdir, cookie);
+ }
+ } else {
+ /*
+ * We don't know which watched directory might contain the
+ * vnode, so we have to fall back to searching the name cache.
+ */
+ cache_vop_inotify(vp, event, cookie);
+ }
+}
+
+int
+vn_inotify_add_watch(struct vnode *vp, struct inotify_softc *sc, uint32_t mask,
+ uint32_t *wdp, struct thread *td)
+{
+ struct inotify_watch *watch, *watch1;
+ uint32_t wd;
+
+ /*
+ * If this is a directory, make sure all of its entries are present in
+ * the name cache so that we're able to look them up if an event occurs.
+ * The persistent reference on the directory prevents the outgoing name
+ * cache entries from being reclaimed.
+ */
+ if (vp->v_type == VDIR) {
+ struct dirent *dp;
+ char *buf;
+ off_t off;
+ size_t buflen, len;
+ int eof, error;
+
+ buflen = 128 * sizeof(struct dirent);
+ buf = malloc(buflen, M_TEMP, M_WAITOK);
+
+ error = 0;
+ len = off = eof = 0;
+ for (;;) {
+ struct nameidata nd;
+
+ error = vn_dir_next_dirent(vp, td, buf, buflen, &dp,
+ &len, &off, &eof);
+ if (error != 0)
+ break;
+ if (len == 0)
+ /* Finished reading. */
+ break;
+ if (strcmp(dp->d_name, ".") == 0 ||
+ strcmp(dp->d_name, "..") == 0)
+ continue;
+
+ /*
+ * namei() consumes a reference on the starting
+ * directory if it's specified as a vnode.
+ */
+ vrefact(vp);
+ VOP_UNLOCK(vp);
+ NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE,
+ dp->d_name, vp);
+ error = namei(&nd);
+ vn_lock(vp, LK_SHARED | LK_RETRY);
+ if (error != 0)
+ break;
+ vn_irflag_set_cond(nd.ni_vp, VIRF_INOTIFY_PARENT);
+ vrele(nd.ni_vp);
+ }
+ free(buf, M_TEMP);
+ if (error != 0)
+ return (error);
+ }
+
+ /*
+ * The vnode referenced in kern_inotify_add_watch() might differ from
+ * this one if nullfs is in the picture.
+ */
+ vrefact(vp);
+ watch = malloc(sizeof(*watch), M_INOTIFY, M_WAITOK | M_ZERO);
+ watch->sc = sc;
+ watch->vp = vp;
+ watch->mask = mask;
+
+ /*
+ * Are we updating an existing watch? Search the vnode's list rather
+ * than that of the softc, as the former is likely to be shorter.
+ */
+ v_addpollinfo(vp);
+ mtx_lock(&vp->v_pollinfo->vpi_lock);
+ TAILQ_FOREACH(watch1, &vp->v_pollinfo->vpi_inotify, vlink) {
+ if (watch1->sc == sc)
+ break;
+ }
+ mtx_lock(&sc->lock);
+ if (watch1 != NULL) {
+ mtx_unlock(&vp->v_pollinfo->vpi_lock);
+
+ /*
+ * We found an existing watch; update it according to our flags.
+ */
+ if ((mask & IN_MASK_CREATE) != 0) {
+ mtx_unlock(&sc->lock);
+ vrele(vp);
+ free(watch, M_INOTIFY);
+ return (EEXIST);
+ }
+ if ((mask & IN_MASK_ADD) != 0)
+ watch1->mask |= mask;
+ else
+ watch1->mask = mask;
+ *wdp = watch1->wd;
+ mtx_unlock(&sc->lock);
+ vrele(vp);
+ free(watch, M_INOTIFY);
+ return (EJUSTRETURN);
+ }
+
+ /*
+ * We're creating a new watch. Add it to the softc and vnode watch
+ * lists.
+ */
+ do {
+ struct inotify_watch key;
+
+ /*
+ * Search for the next available watch descriptor. This is
+ * implemented so as to avoid reusing watch descriptors for as
+
+ key.wd = wd = sc->nextwatch++;
+ watch1 = RB_FIND(inotify_watch_tree, &sc->watches, &key);
+ } while (watch1 != NULL || wd == 0);
+ watch->wd = wd;
+ RB_INSERT(inotify_watch_tree, &sc->watches, watch);
+ TAILQ_INSERT_TAIL(&vp->v_pollinfo->vpi_inotify, watch, vlink);
+ mtx_unlock(&sc->lock);
+ mtx_unlock(&vp->v_pollinfo->vpi_lock);
+ vn_irflag_set_cond(vp, VIRF_INOTIFY);
+
+ *wdp = wd;
+
+ return (0);
+}
+
+void
+vn_inotify_revoke(struct vnode *vp)
+{
+ if (vp->v_pollinfo == NULL) {
+ /* This is a nullfs vnode which shadows a watched vnode. */
+ return;
+ }
+ inotify_log(vp, NULL, 0, IN_UNMOUNT, 0);
+}
+
+static int
+fget_inotify(struct thread *td, int fd, const cap_rights_t *needrightsp,
+ struct file **fpp)
+{
+ struct file *fp;
+ int error;
+
+ error = fget(td, fd, needrightsp, &fp);
+ if (error != 0)
+ return (error);
+ if (fp->f_type != DTYPE_INOTIFY) {
+ fdrop(fp, td);
+ return (EINVAL);
+ }
+ *fpp = fp;
+ return (0);
+}
+
+int
+kern_inotify_add_watch(int fd, int dfd, const char *path, uint32_t mask,
+ struct thread *td)
+{
+ struct nameidata nd;
+ struct file *fp;
+ struct inotify_softc *sc;
+ struct vnode *vp;
+ uint32_t wd;
+ int count, error;
+
+ fp = NULL;
+ vp = NULL;
+
+ if ((mask & IN_ALL_EVENTS) == 0)
+ return (EXTERROR(EINVAL, "no events specified"));
+ if ((mask & (IN_MASK_ADD | IN_MASK_CREATE)) ==
+ (IN_MASK_ADD | IN_MASK_CREATE))
+ return (EXTERROR(EINVAL,
+ "IN_MASK_ADD and IN_MASK_CREATE are mutually exclusive"));
+ if ((mask & ~(IN_ALL_EVENTS | _IN_ALL_FLAGS | IN_UNMOUNT)) != 0)
+ return (EXTERROR(EINVAL, "unrecognized flag"));
+
+ error = fget_inotify(td, fd, &cap_inotify_add_rights, &fp);
+ if (error != 0)
+ return (error);
+ sc = fp->f_data;
+
+ NDINIT_AT(&nd, LOOKUP,
+ ((mask & IN_DONT_FOLLOW) ? NOFOLLOW : FOLLOW) | LOCKLEAF |
+ LOCKSHARED | AUDITVNODE1, UIO_USERSPACE, path, dfd);
+ error = namei(&nd);
+ if (error != 0)
+ goto out;
+ NDFREE_PNBUF(&nd);
+ vp = nd.ni_vp;
+
+ error = VOP_ACCESS(vp, VREAD, td->td_ucred, td);
+ if (error != 0)
+ goto out;
+
+ if ((mask & IN_ONLYDIR) != 0 && vp->v_type != VDIR) {
+ error = ENOTDIR;
+ goto out;
+ }
+
+ count = atomic_fetchadd_int(&inotify_watches, 1);
+ if (count > inotify_max_watches) {
+ atomic_subtract_int(&inotify_watches, 1);
+ error = ENOSPC;
+ goto out;
+ }
+ if (!chginotifywatchcnt(sc->cred->cr_ruidinfo, 1,
+ inotify_max_user_watches)) {
+ atomic_subtract_int(&inotify_watches, 1);
+ error = ENOSPC;
+ goto out;
+ }
+ error = VOP_INOTIFY_ADD_WATCH(vp, sc, mask, &wd, td);
+ if (error != 0) {
+ atomic_subtract_int(&inotify_watches, 1);
+ (void)chginotifywatchcnt(sc->cred->cr_ruidinfo, -1, 0);
+ if (error == EJUSTRETURN) {
+ /* We updated an existing watch, everything is ok. */
+ error = 0;
+ } else {
+ goto out;
+ }
+ }
+ td->td_retval[0] = wd;
+
+out:
+ if (vp != NULL)
+ vput(vp);
+ fdrop(fp, td);
+ return (error);
+}
+
+int
+sys_inotify_add_watch_at(struct thread *td,
+ struct inotify_add_watch_at_args *uap)
+{
+ return (kern_inotify_add_watch(uap->fd, uap->dfd, uap->path,
+ uap->mask, td));
+}
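+
+/*
+ * Illustrative userspace sketch (assumes the libc wrappers documented in
+ * inotify(2); "dirfd" stands for a caller-supplied directory descriptor):
+ *
+ *	int fd = inotify_init1(IN_NONBLOCK);
+ *	int wd = inotify_add_watch_at(fd, dirfd, "log.txt", IN_MODIFY);
+ *	char buf[sizeof(struct inotify_event) + NAME_MAX + 1];
+ *	ssize_t n = read(fd, buf, sizeof(buf));
+ *
+ * With IN_NONBLOCK set, the read fails with EWOULDBLOCK when no events are
+ * pending.
+ */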
+
+int
+kern_inotify_rm_watch(int fd, uint32_t wd, struct thread *td)
+{
+ struct file *fp;
+ struct inotify_softc *sc;
+ struct inotify_record *rec;
+ struct inotify_watch key, *watch;
+ int error;
+
+ error = fget_inotify(td, fd, &cap_inotify_rm_rights, &fp);
+ if (error != 0)
+ return (error);
+ sc = fp->f_data;
+
+ rec = inotify_alloc_record(wd, NULL, 0, IN_IGNORED, 0, M_WAITOK);
+
+ /*
+ * For compatibility with Linux, we do not remove pending events
+ * associated with the watch. Watch descriptors are implemented so as
+ * to avoid being reused for as long as possible, so one hopes that any
+ * pending events from the removed watch descriptor will be consumed
+ * before the watch descriptor is recycled.
+ */
+ key.wd = wd;
+ mtx_lock(&sc->lock);
+ watch = RB_FIND(inotify_watch_tree, &sc->watches, &key);
+ if (watch == NULL) {
+ free(rec, M_INOTIFY);
+ error = EINVAL;
+ } else {
+ RB_REMOVE(inotify_watch_tree, &sc->watches, watch);
+ if (!inotify_queue_record(sc, rec)) {
+ free(rec, M_INOTIFY);
+ error = 0;
+ }
+ }
+ mtx_unlock(&sc->lock);
+ if (watch != NULL)
+ inotify_remove_watch(watch);
+ fdrop(fp, td);
+ return (error);
+}
+
+int
+sys_inotify_rm_watch(struct thread *td, struct inotify_rm_watch_args *uap)
+{
+ return (kern_inotify_rm_watch(uap->fd, uap->wd, td));
+}
diff --git a/sys/kern/vfs_lookup.c b/sys/kern/vfs_lookup.c
index 86c7bdaa02c0..fb3e6a7a2534 100644
--- a/sys/kern/vfs_lookup.c
+++ b/sys/kern/vfs_lookup.c
@@ -75,14 +75,20 @@ static void NDVALIDATE_impl(struct nameidata *, int);
#endif
/*
+ * Reset ndp to its original state.
+ */
+#define NDRESET(ndp) do { \
+ NDREINIT_DBG(ndp); \
+ ndp->ni_resflags = 0; \
+ ndp->ni_cnd.cn_flags &= ~NAMEI_INTERNAL_FLAGS; \
+} while (0)
+/*
 * Prepare namei() to restart.  Reset the components to their original state
 * and set the ISRESTARTED flag, which signals the underlying lookup code to
 * change the root from the ABI root to the actual root and prevents further
 * restarts.
*/
#define NDRESTART(ndp) do { \
- NDREINIT_DBG(ndp); \
- ndp->ni_resflags = 0; \
- ndp->ni_cnd.cn_flags &= ~NAMEI_INTERNAL_FLAGS; \
+ NDRESET(ndp); \
ndp->ni_cnd.cn_flags |= ISRESTARTED; \
} while (0)
@@ -162,8 +168,8 @@ static struct vop_vector crossmp_vnodeops = {
*/
struct nameicap_tracker {
- struct vnode *dp;
TAILQ_ENTRY(nameicap_tracker) nm_link;
+ struct mount *mp;
};
/* Zone for cap mode tracker elements used for dotdot capability checks. */
@@ -192,49 +198,75 @@ SYSCTL_INT(_vfs, OID_AUTO, lookup_cap_dotdot_nonlocal, CTLFLAG_RWTUN,
"enables \"..\" components in path lookup in capability mode "
"on non-local mount");
-static void
+static int
nameicap_tracker_add(struct nameidata *ndp, struct vnode *dp)
{
struct nameicap_tracker *nt;
+ struct mount *mp;
+ int error;
if ((ndp->ni_lcf & NI_LCF_CAP_DOTDOT) == 0 || dp->v_type != VDIR)
- return;
+ return (0);
+ mp = NULL;
+ error = VOP_GETWRITEMOUNT(dp, &mp);
+ if (error != 0)
+ return (error);
nt = TAILQ_LAST(&ndp->ni_cap_tracker, nameicap_tracker_head);
- if (nt != NULL && nt->dp == dp)
- return;
+ if (nt != NULL && nt->mp == mp) {
+ vfs_rel(mp);
+ return (0);
+ }
nt = malloc(sizeof(*nt), M_NAMEITRACKER, M_WAITOK);
- vhold(dp);
- nt->dp = dp;
- TAILQ_INSERT_TAIL(&ndp->ni_cap_tracker, nt, nm_link);
+ nt->mp = mp;
+ error = lockmgr(&mp->mnt_renamelock, LK_SHARED | LK_NOWAIT, 0);
+ if (error != 0) {
+ MPASS(ndp->ni_nctrack_mnt == NULL);
+ ndp->ni_nctrack_mnt = mp;
+ free(nt, M_NAMEITRACKER);
+ error = ERESTART;
+ } else {
+ TAILQ_INSERT_TAIL(&ndp->ni_cap_tracker, nt, nm_link);
+ }
+ return (error);
}
static void
-nameicap_cleanup_from(struct nameidata *ndp, struct nameicap_tracker *first)
+nameicap_cleanup(struct nameidata *ndp, int error)
{
struct nameicap_tracker *nt, *nt1;
+ struct mount *mp;
+
+ KASSERT((ndp->ni_nctrack_mnt == NULL &&
+ TAILQ_EMPTY(&ndp->ni_cap_tracker)) ||
+ (ndp->ni_lcf & NI_LCF_CAP_DOTDOT) != 0,
+ ("tracker active and not strictrelative"));
- nt = first;
- TAILQ_FOREACH_FROM_SAFE(nt, &ndp->ni_cap_tracker, nm_link, nt1) {
+ TAILQ_FOREACH_SAFE(nt, &ndp->ni_cap_tracker, nm_link, nt1) {
+ mp = nt->mp;
+ lockmgr(&mp->mnt_renamelock, LK_RELEASE, 0);
+ vfs_rel(mp);
TAILQ_REMOVE(&ndp->ni_cap_tracker, nt, nm_link);
- vdrop(nt->dp);
free(nt, M_NAMEITRACKER);
}
-}
-static void
-nameicap_cleanup(struct nameidata *ndp)
-{
- KASSERT(TAILQ_EMPTY(&ndp->ni_cap_tracker) ||
- (ndp->ni_lcf & NI_LCF_CAP_DOTDOT) != 0, ("not strictrelative"));
- nameicap_cleanup_from(ndp, NULL);
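+ /*
+ * If nameicap_tracker_add() could not take mnt_renamelock shared, it
+ * stashed the mount in ni_nctrack_mnt and returned ERESTART; wait for
+ * the pending rename to finish before namei() restarts the lookup.
+ */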
+ mp = ndp->ni_nctrack_mnt;
+ if (mp != NULL) {
+ if (error == ERESTART) {
+ lockmgr(&mp->mnt_renamelock, LK_EXCLUSIVE, 0);
+ lockmgr(&mp->mnt_renamelock, LK_RELEASE, 0);
+ }
+ vfs_rel(mp);
+ ndp->ni_nctrack_mnt = NULL;
+ }
}
/*
- * For dotdot lookups in capability mode, only allow the component
- * lookup to succeed if the resulting directory was already traversed
- * during the operation. This catches situations where already
- * traversed directory is moved to different parent, and then we walk
- * over it with dotdots.
+ * For dotdot lookups in capability mode, disallow walking over the
+ * directory ni_rbeneath_dpp that was used as the starting point of
+ * the lookup.  Since we take the mnt_renamelock of every mount we
+ * walked over during the lookup, parallel renames are disabled, which
+ * prevents the situation where a walk over ni_rbeneath_dpp could be
+ * circumvented by following dotdots.
*
 * Also allow forcing failure of dotdot lookups for non-local
* filesystems, where external agents might assist local lookups to
@@ -243,7 +275,6 @@ nameicap_cleanup(struct nameidata *ndp)
static int
nameicap_check_dotdot(struct nameidata *ndp, struct vnode *dp)
{
- struct nameicap_tracker *nt;
struct mount *mp;
if (dp == NULL || dp->v_type != VDIR || (ndp->ni_lcf &
@@ -253,22 +284,16 @@ nameicap_check_dotdot(struct nameidata *ndp, struct vnode *dp)
NI_LCF_CAP_DOTDOT_KTR)) == NI_LCF_STRICTREL_KTR))
NI_CAP_VIOLATION(ndp, ndp->ni_cnd.cn_pnbuf);
if ((ndp->ni_lcf & NI_LCF_CAP_DOTDOT) == 0)
- return (ENOTCAPABLE);
+ goto violation;
+ if (dp == ndp->ni_rbeneath_dpp)
+ goto violation;
mp = dp->v_mount;
if (lookup_cap_dotdot_nonlocal == 0 && mp != NULL &&
(mp->mnt_flag & MNT_LOCAL) == 0)
- goto capfail;
- TAILQ_FOREACH_REVERSE(nt, &ndp->ni_cap_tracker, nameicap_tracker_head,
- nm_link) {
- if (dp == nt->dp) {
- nt = TAILQ_NEXT(nt, nm_link);
- if (nt != NULL)
- nameicap_cleanup_from(ndp, nt);
- return (0);
- }
- }
+ goto violation;
+ return (0);
-capfail:
+violation:
if (__predict_false((ndp->ni_lcf & NI_LCF_STRICTREL_KTR) != 0))
NI_CAP_VIOLATION(ndp, ndp->ni_cnd.cn_pnbuf);
return (ENOTCAPABLE);
@@ -394,6 +419,8 @@ namei_setup(struct nameidata *ndp, struct vnode **dpp, struct pwd **pwdp)
NI_LCF_CAP_DOTDOT;
}
}
+ if (error == 0 && (ndp->ni_lcf & NI_LCF_STRICTREL) != 0)
+ ndp->ni_rbeneath_dpp = *dpp;
/*
* If we are auditing the kernel pathname, save the user pathname.
@@ -631,6 +658,7 @@ restart:
error = namei_getpath(ndp);
if (__predict_false(error != 0)) {
namei_cleanup_cnp(cnp);
+ nameicap_cleanup(ndp, error);
SDT_PROBE4(vfs, namei, lookup, return, error, NULL,
false, ndp);
return (error);
@@ -661,12 +689,12 @@ restart:
else if (__predict_false(pwd->pwd_adir != pwd->pwd_rdir &&
(cnp->cn_flags & ISRESTARTED) == 0)) {
namei_cleanup_cnp(cnp);
+ nameicap_cleanup(ndp, ERESTART);
NDRESTART(ndp);
goto restart;
}
return (error);
case CACHE_FPL_STATUS_PARTIAL:
- TAILQ_INIT(&ndp->ni_cap_tracker);
dp = ndp->ni_startdir;
break;
case CACHE_FPL_STATUS_DESTROYED:
@@ -674,18 +702,21 @@ restart:
error = namei_getpath(ndp);
if (__predict_false(error != 0)) {
namei_cleanup_cnp(cnp);
+ nameicap_cleanup(ndp, error);
return (error);
}
cnp->cn_nameptr = cnp->cn_pnbuf;
/* FALLTHROUGH */
case CACHE_FPL_STATUS_ABORTED:
- TAILQ_INIT(&ndp->ni_cap_tracker);
MPASS(ndp->ni_lcf == 0);
if (*cnp->cn_pnbuf == '\0') {
if ((cnp->cn_flags & EMPTYPATH) != 0) {
- return (namei_emptypath(ndp));
+ error = namei_emptypath(ndp);
+ nameicap_cleanup(ndp, error);
+ return (error);
}
namei_cleanup_cnp(cnp);
+ nameicap_cleanup(ndp, ENOENT);
SDT_PROBE4(vfs, namei, lookup, return, ENOENT, NULL,
false, ndp);
return (ENOENT);
@@ -693,6 +724,7 @@ restart:
error = namei_setup(ndp, &dp, &pwd);
if (error != 0) {
namei_cleanup_cnp(cnp);
+ nameicap_cleanup(ndp, error);
return (error);
}
break;
@@ -705,16 +737,23 @@ restart:
ndp->ni_startdir = dp;
error = vfs_lookup(ndp);
if (error != 0) {
- if (__predict_false(pwd->pwd_adir != pwd->pwd_rdir &&
- error == ENOENT &&
- (cnp->cn_flags & ISRESTARTED) == 0)) {
- nameicap_cleanup(ndp);
- pwd_drop(pwd);
- namei_cleanup_cnp(cnp);
- NDRESTART(ndp);
- goto restart;
- } else
+ uint64_t was_restarted;
+ bool abi_restart;
+
+ was_restarted = ndp->ni_cnd.cn_flags &
+ ISRESTARTED;
+ abi_restart = pwd->pwd_adir != pwd->pwd_rdir &&
+ error == ENOENT && was_restarted == 0;
+ if (error != ERESTART && !abi_restart)
goto out;
+ nameicap_cleanup(ndp, error);
+ pwd_drop(pwd);
+ namei_cleanup_cnp(cnp);
+ NDRESET(ndp);
+ if (abi_restart)
+ was_restarted = ISRESTARTED;
+ ndp->ni_cnd.cn_flags |= was_restarted;
+ goto restart;
}
/*
@@ -723,7 +762,7 @@ restart:
if ((cnp->cn_flags & ISSYMLINK) == 0) {
SDT_PROBE4(vfs, namei, lookup, return, error,
ndp->ni_vp, false, ndp);
- nameicap_cleanup(ndp);
+ nameicap_cleanup(ndp, 0);
pwd_drop(pwd);
NDVALIDATE(ndp);
return (0);
@@ -756,10 +795,10 @@ restart:
ndp->ni_vp = NULL;
vrele(ndp->ni_dvp);
out:
- MPASS(error != 0);
+ MPASS(error != 0 && error != ERESTART);
SDT_PROBE4(vfs, namei, lookup, return, error, NULL, false, ndp);
namei_cleanup_cnp(cnp);
- nameicap_cleanup(ndp);
+ nameicap_cleanup(ndp, error);
pwd_drop(pwd);
return (error);
}
@@ -1185,7 +1224,9 @@ dirloop:
}
}
- nameicap_tracker_add(ndp, dp);
+ error = nameicap_tracker_add(ndp, dp);
+ if (error != 0)
+ goto bad;
/*
* Make sure degenerate names don't get here, their handling was
@@ -1210,9 +1251,7 @@ dirloop:
* the jail or chroot, don't let them out.
* 5. If doing a capability lookup and lookup_cap_dotdot is
* enabled, return ENOTCAPABLE if the lookup would escape
- * from the initial file descriptor directory. Checks are
- * done by ensuring that namei() already traversed the
- * result of dotdot lookup.
+ * from the initial file descriptor directory.
*/
if (cnp->cn_flags & ISDOTDOT) {
if (__predict_false((ndp->ni_lcf & (NI_LCF_STRICTREL_KTR |
@@ -1238,7 +1277,7 @@ dirloop:
NI_CAP_VIOLATION(ndp, cnp->cn_pnbuf);
if ((ndp->ni_lcf & NI_LCF_STRICTREL) != 0) {
error = ENOTCAPABLE;
- goto capdotdot;
+ goto bad;
}
}
if (isroot || ((dp->v_vflag & VV_ROOT) != 0 &&
@@ -1261,11 +1300,6 @@ dirloop:
vn_lock(dp,
enforce_lkflags(dp->v_mount, cnp->cn_lkflags |
LK_RETRY));
- error = nameicap_check_dotdot(ndp, dp);
- if (error != 0) {
-capdotdot:
- goto bad;
- }
}
}
@@ -1314,7 +1348,9 @@ unionlookup:
vn_lock(dp,
enforce_lkflags(dp->v_mount, cnp->cn_lkflags |
LK_RETRY));
- nameicap_tracker_add(ndp, dp);
+ error = nameicap_tracker_add(ndp, dp);
+ if (error != 0)
+ goto bad;
goto unionlookup;
}
@@ -1415,7 +1451,7 @@ nextname:
goto dirloop;
}
if (cnp->cn_flags & ISDOTDOT) {
- error = nameicap_check_dotdot(ndp, ndp->ni_vp);
+ error = nameicap_check_dotdot(ndp, ndp->ni_dvp);
if (error != 0)
goto bad2;
}
@@ -1485,8 +1521,11 @@ success:
}
success_right_lock:
if (ndp->ni_vp != NULL) {
- if ((cnp->cn_flags & ISDOTDOT) == 0)
- nameicap_tracker_add(ndp, ndp->ni_vp);
+ if ((cnp->cn_flags & ISDOTDOT) == 0) {
+ error = nameicap_tracker_add(ndp, ndp->ni_vp);
+ if (error != 0)
+ goto bad2;
+ }
if ((cnp->cn_flags & (FAILIFEXISTS | ISSYMLINK)) == FAILIFEXISTS)
return (vfs_lookup_failifexists(ndp));
}
diff --git a/sys/kern/vfs_mount.c b/sys/kern/vfs_mount.c
index cb18468d28bc..8e64a7fe966b 100644
--- a/sys/kern/vfs_mount.c
+++ b/sys/kern/vfs_mount.c
@@ -156,6 +156,7 @@ mount_init(void *mem, int size, int flags)
mtx_init(&mp->mnt_mtx, "struct mount mtx", NULL, MTX_DEF);
mtx_init(&mp->mnt_listmtx, "struct mount vlist mtx", NULL, MTX_DEF);
lockinit(&mp->mnt_explock, PVFS, "explock", 0, 0);
+ lockinit(&mp->mnt_renamelock, PVFS, "rename", 0, 0);
mp->mnt_pcpu = uma_zalloc_pcpu(pcpu_zone_16, M_WAITOK | M_ZERO);
mp->mnt_ref = 0;
mp->mnt_vfs_ops = 1;
@@ -170,6 +171,7 @@ mount_fini(void *mem, int size)
mp = (struct mount *)mem;
uma_zfree_pcpu(pcpu_zone_16, mp->mnt_pcpu);
+ lockdestroy(&mp->mnt_renamelock);
lockdestroy(&mp->mnt_explock);
mtx_destroy(&mp->mnt_listmtx);
mtx_destroy(&mp->mnt_mtx);
diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c
index dc2fb59fb81c..29774cf87393 100644
--- a/sys/kern/vfs_subr.c
+++ b/sys/kern/vfs_subr.c
@@ -38,7 +38,6 @@
* External virtual filesystem routines
*/
-#include <sys/cdefs.h>
#include "opt_ddb.h"
#include "opt_watchdog.h"
@@ -57,6 +56,7 @@
#include <sys/extattr.h>
#include <sys/file.h>
#include <sys/fcntl.h>
+#include <sys/inotify.h>
#include <sys/jail.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
@@ -5246,7 +5246,8 @@ destroy_vpollinfo_free(struct vpollinfo *vi)
static void
destroy_vpollinfo(struct vpollinfo *vi)
{
-
+ KASSERT(TAILQ_EMPTY(&vi->vpi_inotify),
+ ("%s: pollinfo %p has lingering watches", __func__, vi));
knlist_clear(&vi->vpi_selinfo.si_note, 1);
seldrain(&vi->vpi_selinfo);
destroy_vpollinfo_free(vi);
@@ -5260,12 +5261,13 @@ v_addpollinfo(struct vnode *vp)
{
struct vpollinfo *vi;
- if (vp->v_pollinfo != NULL)
+ if (atomic_load_ptr(&vp->v_pollinfo) != NULL)
return;
vi = malloc(sizeof(*vi), M_VNODEPOLL, M_WAITOK | M_ZERO);
mtx_init(&vi->vpi_lock, "vnode pollinfo", NULL, MTX_DEF);
knlist_init(&vi->vpi_selinfo.si_note, vp, vfs_knllock,
vfs_knlunlock, vfs_knl_assert_lock);
+ TAILQ_INIT(&vi->vpi_inotify);
VI_LOCK(vp);
if (vp->v_pollinfo != NULL) {
VI_UNLOCK(vp);
@@ -5851,6 +5853,8 @@ vop_rename_pre(void *ap)
struct vop_rename_args *a = ap;
#ifdef DEBUG_VFS_LOCKS
+ struct mount *tmp;
+
if (a->a_tvp)
ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME");
ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME");
@@ -5868,6 +5872,11 @@ vop_rename_pre(void *ap)
if (a->a_tvp)
ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked");
ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked");
+
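+ /* The caller must hold the destination mount's rename lock exclusively. */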
+ tmp = NULL;
+ VOP_GETWRITEMOUNT(a->a_tdvp, &tmp);
+ lockmgr_assert(&tmp->mnt_renamelock, KA_XLOCKED);
+ vfs_rel(tmp);
#endif
/*
* It may be tempting to add vn_seqc_write_begin/end calls here and
@@ -6057,6 +6066,28 @@ vop_need_inactive_debugpost(void *ap, int rc)
#endif
void
+vop_allocate_post(void *ap, int rc)
+{
+ struct vop_allocate_args *a;
+
+ a = ap;
+ if (rc == 0)
+ INOTIFY(a->a_vp, IN_MODIFY);
+}
+
+void
+vop_copy_file_range_post(void *ap, int rc)
+{
+ struct vop_copy_file_range_args *a;
+
+ a = ap;
+ if (rc == 0) {
+ INOTIFY(a->a_invp, IN_ACCESS);
+ INOTIFY(a->a_outvp, IN_MODIFY);
+ }
+}
+
+void
vop_create_pre(void *ap)
{
struct vop_create_args *a;
@@ -6076,8 +6107,20 @@ vop_create_post(void *ap, int rc)
a = ap;
dvp = a->a_dvp;
vn_seqc_write_end(dvp);
- if (!rc)
+ if (!rc) {
VFS_KNOTE_LOCKED(dvp, NOTE_WRITE);
+ INOTIFY_NAME(*a->a_vpp, dvp, a->a_cnp, IN_CREATE);
+ }
+}
+
+void
+vop_deallocate_post(void *ap, int rc)
+{
+ struct vop_deallocate_args *a;
+
+ a = ap;
+ if (rc == 0)
+ INOTIFY(a->a_vp, IN_MODIFY);
}
void
@@ -6122,8 +6165,10 @@ vop_deleteextattr_post(void *ap, int rc)
a = ap;
vp = a->a_vp;
vn_seqc_write_end(vp);
- if (!rc)
+ if (!rc) {
VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
+ INOTIFY(vp, IN_ATTRIB);
+ }
}
void
@@ -6153,6 +6198,8 @@ vop_link_post(void *ap, int rc)
if (!rc) {
VFS_KNOTE_LOCKED(vp, NOTE_LINK);
VFS_KNOTE_LOCKED(tdvp, NOTE_WRITE);
+ INOTIFY_NAME(vp, tdvp, a->a_cnp, _IN_ATTRIB_LINKCOUNT);
+ INOTIFY_NAME(vp, tdvp, a->a_cnp, IN_CREATE);
}
}
@@ -6176,8 +6223,10 @@ vop_mkdir_post(void *ap, int rc)
a = ap;
dvp = a->a_dvp;
vn_seqc_write_end(dvp);
- if (!rc)
+ if (!rc) {
VFS_KNOTE_LOCKED(dvp, NOTE_WRITE | NOTE_LINK);
+ INOTIFY_NAME(*a->a_vpp, dvp, a->a_cnp, IN_CREATE);
+ }
}
#ifdef DEBUG_VFS_LOCKS
@@ -6212,8 +6261,10 @@ vop_mknod_post(void *ap, int rc)
a = ap;
dvp = a->a_dvp;
vn_seqc_write_end(dvp);
- if (!rc)
+ if (!rc) {
VFS_KNOTE_LOCKED(dvp, NOTE_WRITE);
+ INOTIFY_NAME(*a->a_vpp, dvp, a->a_cnp, IN_CREATE);
+ }
}
void
@@ -6225,8 +6276,10 @@ vop_reclaim_post(void *ap, int rc)
a = ap;
vp = a->a_vp;
ASSERT_VOP_IN_SEQC(vp);
- if (!rc)
+ if (!rc) {
VFS_KNOTE_LOCKED(vp, NOTE_REVOKE);
+ INOTIFY_REVOKE(vp);
+ }
}
void
@@ -6257,6 +6310,8 @@ vop_remove_post(void *ap, int rc)
if (!rc) {
VFS_KNOTE_LOCKED(dvp, NOTE_WRITE);
VFS_KNOTE_LOCKED(vp, NOTE_DELETE);
+ INOTIFY_NAME(vp, dvp, a->a_cnp, _IN_ATTRIB_LINKCOUNT);
+ INOTIFY_NAME(vp, dvp, a->a_cnp, IN_DELETE);
}
}
@@ -6288,6 +6343,8 @@ vop_rename_post(void *ap, int rc)
VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME);
if (a->a_tvp)
VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE);
+ INOTIFY_MOVE(a->a_fvp, a->a_fdvp, a->a_fcnp, a->a_tvp,
+ a->a_tdvp, a->a_tcnp);
}
if (a->a_tdvp != a->a_fdvp)
vdrop(a->a_fdvp);
@@ -6327,6 +6384,7 @@ vop_rmdir_post(void *ap, int rc)
vp->v_vflag |= VV_UNLINKED;
VFS_KNOTE_LOCKED(dvp, NOTE_WRITE | NOTE_LINK);
VFS_KNOTE_LOCKED(vp, NOTE_DELETE);
+ INOTIFY_NAME(vp, dvp, a->a_cnp, IN_DELETE);
}
}
@@ -6350,8 +6408,10 @@ vop_setattr_post(void *ap, int rc)
a = ap;
vp = a->a_vp;
vn_seqc_write_end(vp);
- if (!rc)
+ if (!rc) {
VFS_KNOTE_LOCKED(vp, NOTE_ATTRIB);
+ INOTIFY(vp, IN_ATTRIB);
+ }
}
void
@@ -6396,8 +6456,10 @@ vop_setextattr_post(void *ap, int rc)
a = ap;
vp = a->a_vp;
vn_seqc_write_end(vp);
- if (!rc)
+ if (!rc) {
VFS_KNOTE_LOCKED(vp, NOTE_ATTRIB);
+ INOTIFY(vp, IN_ATTRIB);
+ }
}
void
@@ -6420,8 +6482,10 @@ vop_symlink_post(void *ap, int rc)
a = ap;
dvp = a->a_dvp;
vn_seqc_write_end(dvp);
- if (!rc)
+ if (!rc) {
VFS_KNOTE_LOCKED(dvp, NOTE_WRITE);
+ INOTIFY_NAME(*a->a_vpp, dvp, a->a_cnp, IN_CREATE);
+ }
}
void
@@ -6429,8 +6493,10 @@ vop_open_post(void *ap, int rc)
{
struct vop_open_args *a = ap;
- if (!rc)
+ if (!rc) {
VFS_KNOTE_LOCKED(a->a_vp, NOTE_OPEN);
+ INOTIFY(a->a_vp, IN_OPEN);
+ }
}
void
@@ -6442,6 +6508,8 @@ vop_close_post(void *ap, int rc)
!VN_IS_DOOMED(a->a_vp))) {
VFS_KNOTE_LOCKED(a->a_vp, (a->a_fflag & FWRITE) != 0 ?
NOTE_CLOSE_WRITE : NOTE_CLOSE);
+ INOTIFY(a->a_vp, (a->a_fflag & FWRITE) != 0 ?
+ IN_CLOSE_WRITE : IN_CLOSE_NOWRITE);
}
}
@@ -6450,8 +6518,10 @@ vop_read_post(void *ap, int rc)
{
struct vop_read_args *a = ap;
- if (!rc)
+ if (!rc) {
VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ);
+ INOTIFY(a->a_vp, IN_ACCESS);
+ }
}
void
@@ -6463,15 +6533,6 @@ vop_read_pgcache_post(void *ap, int rc)
VFS_KNOTE_UNLOCKED(a->a_vp, NOTE_READ);
}
-void
-vop_readdir_post(void *ap, int rc)
-{
- struct vop_readdir_args *a = ap;
-
- if (!rc)
- VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ);
-}
-
static struct knlist fs_knlist;
static void
diff --git a/sys/kern/vfs_syscalls.c b/sys/kern/vfs_syscalls.c
index c236f241bf20..25d40a9806cb 100644
--- a/sys/kern/vfs_syscalls.c
+++ b/sys/kern/vfs_syscalls.c
@@ -2253,10 +2253,10 @@ kern_accessat(struct thread *td, int fd, const char *path,
cred = td->td_ucred;
if ((flag & AT_EACCESS) == 0 &&
((cred->cr_uid != cred->cr_ruid ||
- cred->cr_rgid != cred->cr_groups[0]))) {
+ cred->cr_rgid != cred->cr_gid))) {
usecred = crdup(cred);
usecred->cr_uid = cred->cr_ruid;
- usecred->cr_groups[0] = cred->cr_rgid;
+ usecred->cr_gid = cred->cr_rgid;
td->td_ucred = usecred;
} else
usecred = cred;
@@ -3766,7 +3766,7 @@ int
kern_renameat(struct thread *td, int oldfd, const char *old, int newfd,
const char *new, enum uio_seg pathseg)
{
- struct mount *mp = NULL;
+ struct mount *mp, *tmp;
struct vnode *tvp, *fvp, *tdvp;
struct nameidata fromnd, tond;
uint64_t tondflags;
@@ -3774,6 +3774,7 @@ kern_renameat(struct thread *td, int oldfd, const char *old, int newfd,
short irflag;
again:
+ tmp = mp = NULL;
bwillwrite();
#ifdef MAC
if (mac_vnode_check_rename_from_enabled()) {
@@ -3809,6 +3810,7 @@ again:
tvp = tond.ni_vp;
error = vn_start_write(fvp, &mp, V_NOWAIT);
if (error != 0) {
+again1:
NDFREE_PNBUF(&fromnd);
NDFREE_PNBUF(&tond);
if (tvp != NULL)
@@ -3819,11 +3821,25 @@ again:
vput(tdvp);
vrele(fromnd.ni_dvp);
vrele(fvp);
+ if (tmp != NULL) {
+ lockmgr(&tmp->mnt_renamelock, LK_EXCLUSIVE, NULL);
+ lockmgr(&tmp->mnt_renamelock, LK_RELEASE, NULL);
+ vfs_rel(tmp);
+ tmp = NULL;
+ }
error = vn_start_write(NULL, &mp, V_XSLEEP | V_PCATCH);
if (error != 0)
return (error);
goto again;
}
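+ /*
+ * Take the rename lock of the destination mount exclusively; this
+ * serializes against capability-mode lookups, which hold it shared.
+ * On contention, drop everything and retry once the lock is free.
+ */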
+ error = VOP_GETWRITEMOUNT(tdvp, &tmp);
+ if (error != 0 || tmp == NULL)
+ goto again1;
+ error = lockmgr(&tmp->mnt_renamelock, LK_EXCLUSIVE | LK_NOWAIT, NULL);
+ if (error != 0) {
+ vn_finished_write(mp);
+ goto again1;
+ }
irflag = vn_irflag_read(fvp);
if (((irflag & VIRF_NAMEDATTR) != 0 && tdvp != fromnd.ni_dvp) ||
(irflag & VIRF_NAMEDDIR) != 0) {
@@ -3884,6 +3900,8 @@ out:
vrele(fromnd.ni_dvp);
vrele(fvp);
}
+ lockmgr(&tmp->mnt_renamelock, LK_RELEASE, 0);
+ vfs_rel(tmp);
vn_finished_write(mp);
out1:
if (error == ERESTART)
@@ -4296,10 +4314,6 @@ kern_getdirentries(struct thread *td, int fd, char *buf, size_t count,
vp = fp->f_vnode;
foffset = foffset_lock(fp, 0);
unionread:
- if (vp->v_type != VDIR) {
- error = EINVAL;
- goto fail;
- }
if (__predict_false((vp->v_vflag & VV_UNLINKED) != 0)) {
error = ENOENT;
goto fail;
@@ -4312,6 +4326,19 @@ unionread:
auio.uio_segflg = bufseg;
auio.uio_td = td;
vn_lock(vp, LK_SHARED | LK_RETRY);
+ /*
+ * We want to return ENOTDIR for anything that is not VDIR, but
+ * not for VBAD, and we can't check for VBAD while the vnode is
+ * unlocked.
+ */
+ if (vp->v_type != VDIR) {
+ if (vp->v_type == VBAD)
+ error = EBADF;
+ else
+ error = ENOTDIR;
+ VOP_UNLOCK(vp);
+ goto fail;
+ }
AUDIT_ARG_VNODE1(vp);
loff = auio.uio_offset = foffset;
#ifdef MAC
diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c
index 7487f93e4880..6451c9e07a60 100644
--- a/sys/kern/vfs_vnops.c
+++ b/sys/kern/vfs_vnops.c
@@ -52,6 +52,7 @@
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filio.h>
+#include <sys/inotify.h>
#include <sys/ktr.h>
#include <sys/ktrace.h>
#include <sys/limits.h>
@@ -308,7 +309,8 @@ restart:
NDREINIT(ndp);
goto restart;
}
- if ((vn_open_flags & VN_OPEN_NAMECACHE) != 0)
+ if ((vn_open_flags & VN_OPEN_NAMECACHE) != 0 ||
+ (vn_irflag_read(ndp->ni_dvp) & VIRF_INOTIFY) != 0)
ndp->ni_cnd.cn_flags |= MAKEENTRY;
#ifdef MAC
error = mac_vnode_check_create(cred, ndp->ni_dvp,
@@ -484,6 +486,7 @@ vn_open_vnode(struct vnode *vp, int fmode, struct ucred *cred,
if (vp->v_type != VFIFO && vp->v_type != VSOCK &&
VOP_ACCESS(vp, VREAD, cred, td) == 0)
fp->f_flag |= FKQALLOWED;
+ INOTIFY(vp, IN_OPEN);
return (0);
}
@@ -1746,6 +1749,8 @@ vn_truncate_locked(struct vnode *vp, off_t length, bool sync,
vattr.va_vaflags |= VA_SYNC;
error = VOP_SETATTR(vp, &vattr, cred);
VOP_ADD_WRITECOUNT_CHECKED(vp, -1);
+ if (error == 0)
+ INOTIFY(vp, IN_MODIFY);
}
return (error);
}
diff --git a/sys/kern/vnode_if.src b/sys/kern/vnode_if.src
index a2b6a7c8ff9f..2e63215b2f97 100644
--- a/sys/kern/vnode_if.src
+++ b/sys/kern/vnode_if.src
@@ -242,8 +242,8 @@ vop_read_pgcache {
%% write vp L L L
-%! write pre VOP_WRITE_PRE
-%! write post VOP_WRITE_POST
+%! write pre vop_write_pre
+%! write post vop_write_post
vop_write {
IN struct vnode *vp;
@@ -380,6 +380,7 @@ vop_symlink {
%% readdir vp L L L
+%! readdir pre vop_readdir_pre
%! readdir post vop_readdir_post
vop_readdir {
@@ -702,6 +703,7 @@ vop_vptocnp {
%% allocate vp E E E
+%! allocate post vop_allocate_post
vop_allocate {
IN struct vnode *vp;
@@ -786,6 +788,7 @@ vop_fdatasync {
%% copy_file_range invp U U U
%% copy_file_range outvp U U U
+%! copy_file_range post vop_copy_file_range_post
vop_copy_file_range {
IN struct vnode *invp;
@@ -810,6 +813,7 @@ vop_vput_pair {
%% deallocate vp L L L
+%! deallocate post vop_deallocate_post
vop_deallocate {
IN struct vnode *vp;
@@ -821,6 +825,27 @@ vop_deallocate {
};
+%% inotify vp - - -
+
+vop_inotify {
+ IN struct vnode *vp;
+ IN struct vnode *dvp;
+ IN struct componentname *cnp;
+ IN int event;
+ IN uint32_t cookie;
+};
+
+
+%% inotify_add_watch vp L L L
+
+vop_inotify_add_watch {
+ IN struct vnode *vp;
+ IN struct inotify_softc *sc;
+ IN uint32_t mask;
+ OUT uint32_t *wdp;
+ IN struct thread *td;
+};
+
# The VOPs below are spares at the end of the table to allow new VOPs to be
# added in stable branches without breaking the KBI. New VOPs in HEAD should
# be added above these spares. When merging a new VOP to a stable branch,