Diffstat (limited to 'sys/kern')
30 files changed, 1751 insertions, 201 deletions
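A minimal userspace sketch of the interfaces this commit adds: the descriptor is created through __specialfd(SPECIALFD_INOTIFY), and watches are managed with the new inotify_add_watch_at() and inotify_rm_watch() system calls. The inotify_init1() wrapper name and the IN_* constants are assumed to be exposed by <sys/inotify.h>; treat this as a sketch of the intended usage, not the committed userland API.

    #include <sys/inotify.h>    /* assumed userspace header */
    #include <err.h>
    #include <fcntl.h>
    #include <limits.h>
    #include <stdio.h>
    #include <unistd.h>

    int
    main(void)
    {
            /* Large enough for one event with a maximal name. */
            char buf[sizeof(struct inotify_event) + NAME_MAX + 1];
            const struct inotify_event *ev;
            ssize_t n;
            int fd, wd;

            fd = inotify_init1(IN_CLOEXEC);
            if (fd < 0)
                    err(1, "inotify_init1");
            /* Watch /tmp for file creation and deletion. */
            wd = inotify_add_watch_at(fd, AT_FDCWD, "/tmp",
                IN_CREATE | IN_DELETE);
            if (wd < 0)
                    err(1, "inotify_add_watch_at");
            /* Blocks until an event is pending. */
            n = read(fd, buf, sizeof(buf));
            if (n > 0) {
                    ev = (const struct inotify_event *)buf;
                    printf("wd=%d mask=%#x name=%.*s\n", ev->wd,
                        ev->mask, (int)ev->len, ev->name);
            }
            (void)inotify_rm_watch(fd, wd);
            (void)close(fd);
            return (0);
    }

Note that a read() with a buffer smaller than the next pending record fails with EINVAL, per inotify_read() in vfs_inotify.c below.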
diff --git a/sys/kern/init_sysent.c b/sys/kern/init_sysent.c index a48a513aa3b5..91792430d24c 100644 --- a/sys/kern/init_sysent.c +++ b/sys/kern/init_sysent.c @@ -658,5 +658,7 @@ struct sysent sysent[] = { { .sy_narg = AS(getrlimitusage_args), .sy_call = (sy_call_t *)sys_getrlimitusage, .sy_auevent = AUE_NULL, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 589 = getrlimitusage */ { .sy_narg = AS(fchroot_args), .sy_call = (sy_call_t *)sys_fchroot, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 590 = fchroot */ { .sy_narg = AS(setcred_args), .sy_call = (sy_call_t *)sys_setcred, .sy_auevent = AUE_SETCRED, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 591 = setcred */ - { .sy_narg = AS(exterrctl_args), .sy_call = (sy_call_t *)sys_exterrctl, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 592 = exterrctl */ + { .sy_narg = AS(exterrctl_args), .sy_call = (sy_call_t *)sys_exterrctl, .sy_auevent = AUE_NULL, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 592 = exterrctl */ + { .sy_narg = AS(inotify_add_watch_at_args), .sy_call = (sy_call_t *)sys_inotify_add_watch_at, .sy_auevent = AUE_INOTIFY, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 593 = inotify_add_watch_at */ + { .sy_narg = AS(inotify_rm_watch_args), .sy_call = (sy_call_t *)sys_inotify_rm_watch, .sy_auevent = AUE_INOTIFY, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 594 = inotify_rm_watch */ }; diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c index ac4b6ac3f457..a27ab33b34da 100644 --- a/sys/kern/kern_descrip.c +++ b/sys/kern/kern_descrip.c @@ -38,9 +38,11 @@ #include "opt_ddb.h" #include "opt_ktrace.h" +#define EXTERR_CATEGORY EXTERR_CAT_FILEDESC #include <sys/systm.h> #include <sys/capsicum.h> #include <sys/conf.h> +#include <sys/exterrvar.h> #include <sys/fcntl.h> #include <sys/file.h> #include <sys/filedesc.h> @@ -478,6 +480,92 @@ kern_fcntl_freebsd(struct thread *td, int fd, int cmd, intptr_t arg) return (error); } +struct flags_trans_elem { + u_int f; + u_int t; +}; + +static u_int +flags_trans(const struct flags_trans_elem *ftes, int nitems, u_int from_flags) +{ + u_int res; + int i; + + res = 0; + for (i = 0; i < nitems; i++) { + if ((from_flags & ftes[i].f) != 0) + res |= ftes[i].t; + } + return (res); +} + +static uint8_t +fd_to_fde_flags(int fd_flags) +{ + static const struct flags_trans_elem fd_to_fde_flags_s[] = { + { .f = FD_CLOEXEC, .t = UF_EXCLOSE }, + { .f = FD_CLOFORK, .t = UF_FOCLOSE }, + { .f = FD_RESOLVE_BENEATH, .t = UF_RESOLVE_BENEATH }, + }; + + return (flags_trans(fd_to_fde_flags_s, nitems(fd_to_fde_flags_s), + fd_flags)); +} + +static int +fde_to_fd_flags(uint8_t fde_flags) +{ + static const struct flags_trans_elem fde_to_fd_flags_s[] = { + { .f = UF_EXCLOSE, .t = FD_CLOEXEC }, + { .f = UF_FOCLOSE, .t = FD_CLOFORK }, + { .f = UF_RESOLVE_BENEATH, .t = FD_RESOLVE_BENEATH }, + }; + + return (flags_trans(fde_to_fd_flags_s, nitems(fde_to_fd_flags_s), + fde_flags)); +} + +static uint8_t +fddup_to_fde_flags(int fddup_flags) +{ + static const struct flags_trans_elem fddup_to_fde_flags_s[] = { + { .f = FDDUP_FLAG_CLOEXEC, .t = UF_EXCLOSE }, + { .f = FDDUP_FLAG_CLOFORK, .t = UF_FOCLOSE }, + }; + + return (flags_trans(fddup_to_fde_flags_s, nitems(fddup_to_fde_flags_s), + fddup_flags)); +} + +static uint8_t +close_range_to_fde_flags(int close_range_flags) +{ + static const struct flags_trans_elem close_range_to_fde_flags_s[] = { + { .f = CLOSE_RANGE_CLOEXEC, .t = UF_EXCLOSE }, + { .f = 
CLOSE_RANGE_CLOFORK, .t = UF_FOCLOSE }, + }; + + return (flags_trans(close_range_to_fde_flags_s, + nitems(close_range_to_fde_flags_s), close_range_flags)); +} + +static uint8_t +open_to_fde_flags(int open_flags, bool sticky_orb) +{ + static const struct flags_trans_elem open_to_fde_flags_s[] = { + { .f = O_CLOEXEC, .t = UF_EXCLOSE }, + { .f = O_CLOFORK, .t = UF_FOCLOSE }, + { .f = O_RESOLVE_BENEATH, .t = UF_RESOLVE_BENEATH }, + }; +#if defined(__clang__) && __clang_major__ >= 19 + _Static_assert(open_to_fde_flags_s[nitems(open_to_fde_flags_s) - 1].f == + O_RESOLVE_BENEATH, "O_RESOLVE_BENEATH must be last, for sticky_orb"); +#endif + + return (flags_trans(open_to_fde_flags_s, nitems(open_to_fde_flags_s) - + (sticky_orb ? 0 : 1), open_flags)); +} + int kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg) { @@ -492,6 +580,7 @@ kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg) int error, flg, kif_sz, seals, tmp, got_set, got_cleared; uint64_t bsize; off_t foffset; + int flags; error = 0; flg = F_POSIX; @@ -511,6 +600,11 @@ kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg) error = kern_dup(td, FDDUP_FCNTL, FDDUP_FLAG_CLOEXEC, fd, tmp); break; + case F_DUPFD_CLOFORK: + tmp = arg; + error = kern_dup(td, FDDUP_FCNTL, FDDUP_FLAG_CLOFORK, fd, tmp); + break; + case F_DUP2FD: tmp = arg; error = kern_dup(td, FDDUP_FIXED, 0, fd, tmp); @@ -526,10 +620,7 @@ kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg) FILEDESC_SLOCK(fdp); fde = fdeget_noref(fdp, fd); if (fde != NULL) { - td->td_retval[0] = - ((fde->fde_flags & UF_EXCLOSE) ? FD_CLOEXEC : 0) | - ((fde->fde_flags & UF_RESOLVE_BENEATH) ? - FD_RESOLVE_BENEATH : 0); + td->td_retval[0] = fde_to_fd_flags(fde->fde_flags); error = 0; } FILEDESC_SUNLOCK(fdp); @@ -543,10 +634,8 @@ kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg) /* * UF_RESOLVE_BENEATH is sticky and cannot be cleared. */ - fde->fde_flags = (fde->fde_flags & ~UF_EXCLOSE) | - ((arg & FD_CLOEXEC) != 0 ? UF_EXCLOSE : 0) | - ((arg & FD_RESOLVE_BENEATH) != 0 ? - UF_RESOLVE_BENEATH : 0); + fde->fde_flags = (fde->fde_flags & + ~(UF_EXCLOSE | UF_FOCLOSE)) | fd_to_fde_flags(arg); error = 0; } FILEDESC_XUNLOCK(fdp); @@ -916,7 +1005,17 @@ revert_f_setfl: break; default: - error = EINVAL; + if ((cmd & ((1u << F_DUP3FD_SHIFT) - 1)) != F_DUP3FD) + return (EXTERROR(EINVAL, "invalid fcntl cmd")); + /* Handle F_DUP3FD */ + flags = (cmd >> F_DUP3FD_SHIFT); + if ((flags & ~(FD_CLOEXEC | FD_CLOFORK)) != 0) + return (EXTERROR(EINVAL, "invalid flags for F_DUP3FD")); + tmp = arg; + error = kern_dup(td, FDDUP_FIXED, + ((flags & FD_CLOEXEC) != 0 ? FDDUP_FLAG_CLOEXEC : 0) | + ((flags & FD_CLOFORK) != 0 ? 
FDDUP_FLAG_CLOFORK : 0), + fd, tmp); break; } return (error); @@ -946,7 +1045,7 @@ kern_dup(struct thread *td, u_int mode, int flags, int old, int new) fdp = p->p_fd; oioctls = NULL; - MPASS((flags & ~(FDDUP_FLAG_CLOEXEC)) == 0); + MPASS((flags & ~(FDDUP_FLAG_CLOEXEC | FDDUP_FLAG_CLOFORK)) == 0); MPASS(mode < FDDUP_LASTMODE); AUDIT_ARG_FD(old); @@ -971,8 +1070,7 @@ kern_dup(struct thread *td, u_int mode, int flags, int old, int new) goto unlock; if (mode == FDDUP_FIXED && old == new) { td->td_retval[0] = new; - if (flags & FDDUP_FLAG_CLOEXEC) - fdp->fd_ofiles[new].fde_flags |= UF_EXCLOSE; + fdp->fd_ofiles[new].fde_flags |= fddup_to_fde_flags(flags); error = 0; goto unlock; } @@ -1047,10 +1145,8 @@ kern_dup(struct thread *td, u_int mode, int flags, int old, int new) fde_copy(oldfde, newfde); filecaps_copy_finish(&oldfde->fde_caps, &newfde->fde_caps, nioctls); - if ((flags & FDDUP_FLAG_CLOEXEC) != 0) - newfde->fde_flags = oldfde->fde_flags | UF_EXCLOSE; - else - newfde->fde_flags = oldfde->fde_flags & ~UF_EXCLOSE; + newfde->fde_flags = (oldfde->fde_flags & ~(UF_EXCLOSE | UF_FOCLOSE)) | + fddup_to_fde_flags(flags); #ifdef CAPABILITIES seqc_write_end(&newfde->fde_seqc); #endif @@ -1416,13 +1512,14 @@ kern_close(struct thread *td, int fd) } static int -close_range_cloexec(struct thread *td, u_int lowfd, u_int highfd) +close_range_flags(struct thread *td, u_int lowfd, u_int highfd, int flags) { struct filedesc *fdp; struct fdescenttbl *fdt; struct filedescent *fde; - int fd; + int fd, fde_flags; + fde_flags = close_range_to_fde_flags(flags); fdp = td->td_proc->p_fd; FILEDESC_XLOCK(fdp); fdt = atomic_load_ptr(&fdp->fd_files); @@ -1434,7 +1531,7 @@ close_range_cloexec(struct thread *td, u_int lowfd, u_int highfd) for (; fd <= highfd; fd++) { fde = &fdt->fdt_ofiles[fd]; if (fde->fde_file != NULL) - fde->fde_flags |= UF_EXCLOSE; + fde->fde_flags |= fde_flags; } out_locked: FILEDESC_XUNLOCK(fdp); @@ -1492,8 +1589,8 @@ kern_close_range(struct thread *td, int flags, u_int lowfd, u_int highfd) return (EINVAL); } - if ((flags & CLOSE_RANGE_CLOEXEC) != 0) - return (close_range_cloexec(td, lowfd, highfd)); + if ((flags & (CLOSE_RANGE_CLOEXEC | CLOSE_RANGE_CLOFORK)) != 0) + return (close_range_flags(td, lowfd, highfd, flags)); return (close_range_impl(td, lowfd, highfd)); } @@ -1513,7 +1610,7 @@ sys_close_range(struct thread *td, struct close_range_args *uap) AUDIT_ARG_CMD(uap->highfd); AUDIT_ARG_FFLAGS(uap->flags); - if ((uap->flags & ~(CLOSE_RANGE_CLOEXEC)) != 0) + if ((uap->flags & ~(CLOSE_RANGE_CLOEXEC | CLOSE_RANGE_CLOFORK)) != 0) return (EINVAL); return (kern_close_range(td, uap->flags, uap->lowfd, uap->highfd)); } @@ -2171,8 +2268,7 @@ _finstall(struct filedesc *fdp, struct file *fp, int fd, int flags, seqc_write_begin(&fde->fde_seqc); #endif fde->fde_file = fp; - fde->fde_flags = ((flags & O_CLOEXEC) != 0 ? UF_EXCLOSE : 0) | - ((flags & O_RESOLVE_BENEATH) != 0 ? 
UF_RESOLVE_BENEATH : 0); + fde->fde_flags = open_to_fde_flags(flags, true); if (fcaps != NULL) filecaps_move(fcaps, &fde->fde_caps); else @@ -2432,6 +2528,7 @@ fdcopy(struct filedesc *fdp) newfdp->fd_freefile = fdp->fd_freefile; FILEDESC_FOREACH_FDE(fdp, i, ofde) { if ((ofde->fde_file->f_ops->fo_flags & DFLAG_PASSABLE) == 0 || + (ofde->fde_flags & UF_FOCLOSE) != 0 || !fhold(ofde->fde_file)) { if (newfdp->fd_freefile == fdp->fd_freefile) newfdp->fd_freefile = i; @@ -2729,6 +2826,12 @@ fdcloseexec(struct thread *td) fdfree(fdp, i); (void) closefp(fdp, i, fp, td, false, false); FILEDESC_UNLOCK_ASSERT(fdp); + } else if (fde->fde_flags & UF_FOCLOSE) { + /* + * https://austingroupbugs.net/view.php?id=1851 + * FD_CLOFORK should not be preserved across exec + */ + fde->fde_flags &= ~UF_FOCLOSE; } } } diff --git a/sys/kern/kern_resource.c b/sys/kern/kern_resource.c index c8b01afeab4f..dcd38c6e6fbe 100644 --- a/sys/kern/kern_resource.c +++ b/sys/kern/kern_resource.c @@ -1637,6 +1637,12 @@ uifree(struct uidinfo *uip) if (uip->ui_pipecnt != 0) printf("freeing uidinfo: uid = %d, pipecnt = %ld\n", uip->ui_uid, uip->ui_pipecnt); + if (uip->ui_inotifycnt != 0) + printf("freeing uidinfo: uid = %d, inotifycnt = %ld\n", + uip->ui_uid, uip->ui_inotifycnt); + if (uip->ui_inotifywatchcnt != 0) + printf("freeing uidinfo: uid = %d, inotifywatchcnt = %ld\n", + uip->ui_uid, uip->ui_inotifywatchcnt); free(uip, M_UIDINFO); } @@ -1742,6 +1748,21 @@ chgpipecnt(struct uidinfo *uip, int diff, rlim_t max) return (chglimit(uip, &uip->ui_pipecnt, diff, max, "pipecnt")); } +int +chginotifycnt(struct uidinfo *uip, int diff, rlim_t max) +{ + + return (chglimit(uip, &uip->ui_inotifycnt, diff, max, "inotifycnt")); +} + +int +chginotifywatchcnt(struct uidinfo *uip, int diff, rlim_t max) +{ + + return (chglimit(uip, &uip->ui_inotifywatchcnt, diff, max, + "inotifywatchcnt")); +} + static int sysctl_kern_proc_rlimit_usage(SYSCTL_HANDLER_ARGS) { diff --git a/sys/kern/kern_sendfile.c b/sys/kern/kern_sendfile.c index 17b53208157a..35b258e68701 100644 --- a/sys/kern/kern_sendfile.c +++ b/sys/kern/kern_sendfile.c @@ -27,12 +27,12 @@ * SUCH DAMAGE. 
*/ -#include <sys/cdefs.h> #include "opt_kern_tls.h" #include <sys/param.h> #include <sys/systm.h> #include <sys/capsicum.h> +#include <sys/inotify.h> #include <sys/kernel.h> #include <sys/lock.h> #include <sys/ktls.h> @@ -1246,6 +1246,8 @@ out: */ if (error == 0) { td->td_retval[0] = 0; + if (sbytes > 0 && vp != NULL) + INOTIFY(vp, IN_ACCESS); } if (sent != NULL) { (*sent) = sbytes; diff --git a/sys/kern/kern_sig.c b/sys/kern/kern_sig.c index 4565abc4b540..5d51aa675cb7 100644 --- a/sys/kern/kern_sig.c +++ b/sys/kern/kern_sig.c @@ -1050,8 +1050,7 @@ osigaction(struct thread *td, struct osigaction_args *uap) int osigreturn(struct thread *td, struct osigreturn_args *uap) { - - return (nosys(td, (struct nosys_args *)uap)); + return (kern_nosys(td, 0)); } #endif #endif /* COMPAT_43 */ @@ -4139,7 +4138,7 @@ coredump(struct thread *td) struct flock lf; struct vattr vattr; size_t fullpathsize; - int error, error1, locked; + int error, error1, jid, locked, ppid, sig; char *name; /* name of corefile */ void *rl_cookie; off_t limit; @@ -4168,6 +4167,10 @@ coredump(struct thread *td) PROC_UNLOCK(p); return (EFBIG); } + + ppid = p->p_oppid; + sig = p->p_sig; + jid = p->p_ucred->cr_prison->pr_id; PROC_UNLOCK(p); error = corefile_open(p->p_comm, cred->cr_uid, p->p_pid, td, @@ -4253,6 +4256,9 @@ coredump(struct thread *td) } devctl_safe_quote_sb(sb, name); sbuf_putc(sb, '"'); + + sbuf_printf(sb, " jid=%d pid=%d ppid=%d signo=%d", + jid, p->p_pid, ppid, sig); if (sbuf_finish(sb) == 0) devctl_notify("kernel", "signal", "coredump", sbuf_data(sb)); out2: @@ -4281,6 +4287,12 @@ struct nosys_args { int nosys(struct thread *td, struct nosys_args *args) { + return (kern_nosys(td, args->dummy)); +} + +int +kern_nosys(struct thread *td, int dummy) +{ struct proc *p; p = td->td_proc; diff --git a/sys/kern/kern_syscalls.c b/sys/kern/kern_syscalls.c index 24406763a93a..a93d711e7597 100644 --- a/sys/kern/kern_syscalls.c +++ b/sys/kern/kern_syscalls.c @@ -35,6 +35,7 @@ #include <sys/resourcevar.h> #include <sys/sx.h> #include <sys/syscall.h> +#include <sys/syscallsubr.h> #include <sys/sysent.h> #include <sys/sysproto.h> #include <sys/systm.h> @@ -50,14 +51,14 @@ int lkmnosys(struct thread *td, struct nosys_args *args) { - return (nosys(td, args)); + return (kern_nosys(td, 0)); } int lkmressys(struct thread *td, struct nosys_args *args) { - return (nosys(td, args)); + return (kern_nosys(td, 0)); } struct sysent nosys_sysent = { diff --git a/sys/kern/subr_asan.c b/sys/kern/subr_asan.c index 0edb631d1475..464efda1e91a 100644 --- a/sys/kern/subr_asan.c +++ b/sys/kern/subr_asan.c @@ -263,8 +263,7 @@ kasan_mark(const void *addr, size_t size, size_t redzsize, uint8_t code) if (__predict_false(!kasan_enabled)) return; - if ((vm_offset_t)addr >= DMAP_MIN_ADDRESS && - (vm_offset_t)addr < DMAP_MAX_ADDRESS) + if (kasan_md_unsupported((vm_offset_t)addr)) return; KASSERT((vm_offset_t)addr >= VM_MIN_KERNEL_ADDRESS && diff --git a/sys/kern/subr_capability.c b/sys/kern/subr_capability.c index 7cc6fb593697..5ad5b0af1681 100644 --- a/sys/kern/subr_capability.c +++ b/sys/kern/subr_capability.c @@ -74,6 +74,10 @@ const cap_rights_t cap_getsockopt_rights = CAP_RIGHTS_INITIALIZER(CAP_GETSOCKOPT); const cap_rights_t cap_getsockname_rights = CAP_RIGHTS_INITIALIZER(CAP_GETSOCKNAME); +const cap_rights_t cap_inotify_add_rights = + CAP_RIGHTS_INITIALIZER(CAP_INOTIFY_ADD); +const cap_rights_t cap_inotify_rm_rights = + CAP_RIGHTS_INITIALIZER(CAP_INOTIFY_RM); const cap_rights_t cap_ioctl_rights = CAP_RIGHTS_INITIALIZER(CAP_IOCTL); const cap_rights_t 
cap_listen_rights = CAP_RIGHTS_INITIALIZER(CAP_LISTEN); const cap_rights_t cap_linkat_source_rights = diff --git a/sys/kern/subr_pctrie.c b/sys/kern/subr_pctrie.c index 3a3548bad52b..bb86c779b936 100644 --- a/sys/kern/subr_pctrie.c +++ b/sys/kern/subr_pctrie.c @@ -691,21 +691,23 @@ _pctrie_lookup_ge(struct pctrie *ptree, struct pctrie_node *node, */ if (node == PCTRIE_NULL || *pctrie_toval(node) < index) { /* Climb the path to find a node with a descendant > index. */ - for (node = parent; node != NULL; node = pctrie_parent(node)) { - slot = pctrie_slot(node, index) + 1; - if ((node->pn_popmap >> slot) != 0) + node = NULL; + while (parent != NULL) { + slot = pctrie_slot(parent, index) + 1; + if ((parent->pn_popmap >> slot) != 0) break; + node = parent; + parent = pctrie_parent(node); } - if (node == NULL) { + if (parent == NULL) { if (parent_out != NULL) - *parent_out = NULL; + *parent_out = node; return (NULL); } /* Step to the least child with a descendant > index. */ - slot += ffs(node->pn_popmap >> slot) - 1; - parent = node; - node = pctrie_node_load(&node->pn_child[slot], NULL, + slot += ffs(parent->pn_popmap >> slot) - 1; + node = pctrie_node_load(&parent->pn_child[slot], NULL, PCTRIE_LOCKED); } /* Descend to the least leaf of the subtrie. */ @@ -785,21 +787,23 @@ _pctrie_lookup_le(struct pctrie *ptree, struct pctrie_node *node, */ if (node == PCTRIE_NULL || *pctrie_toval(node) > index) { /* Climb the path to find a node with a descendant < index. */ - for (node = parent; node != NULL; node = pctrie_parent(node)) { - slot = pctrie_slot(node, index); - if ((node->pn_popmap & ((1 << slot) - 1)) != 0) + node = NULL; + while (parent != NULL) { + slot = pctrie_slot(parent, index); + if ((parent->pn_popmap & ((1 << slot) - 1)) != 0) break; + node = parent; + parent = pctrie_parent(node); } - if (node == NULL) { + if (parent == NULL) { if (parent_out != NULL) - *parent_out = NULL; + *parent_out = node; return (NULL); } /* Step to the greatest child with a descendant < index. */ - slot = ilog2(node->pn_popmap & ((1 << slot) - 1)); - parent = node; - node = pctrie_node_load(&node->pn_child[slot], NULL, + slot = ilog2(parent->pn_popmap & ((1 << slot) - 1)); + node = pctrie_node_load(&parent->pn_child[slot], NULL, PCTRIE_LOCKED); } /* Descend to the greatest leaf of the subtrie. */ diff --git a/sys/kern/subr_trap.c b/sys/kern/subr_trap.c index 18388ae5f232..bac7d0080c71 100644 --- a/sys/kern/subr_trap.c +++ b/sys/kern/subr_trap.c @@ -338,8 +338,9 @@ ast_handler(struct thread *td, struct trapframe *framep, bool dtor) td->td_ast = 0; } - CTR3(KTR_SYSC, "ast: thread %p (pid %d, %s)", td, td->td_proc->p_pid, - td->td_proc->p_comm); + CTR3(KTR_SYSC, "ast: thread %p (pid %d, %s)", td, + td->td_proc == NULL ? -1 : td->td_proc->p_pid, + td->td_proc == NULL ? 
"" : td->td_proc->p_comm); KASSERT(framep == NULL || TRAPF_USERMODE(framep), ("ast in kernel mode")); diff --git a/sys/kern/sys_generic.c b/sys/kern/sys_generic.c index d31ff3b939cc..b472aaea89e6 100644 --- a/sys/kern/sys_generic.c +++ b/sys/kern/sys_generic.c @@ -37,16 +37,17 @@ #include "opt_capsicum.h" #include "opt_ktrace.h" -#define EXTERR_CATEGORY EXTERR_CAT_FILEDESC +#define EXTERR_CATEGORY EXTERR_CAT_GENIO #include <sys/param.h> #include <sys/systm.h> #include <sys/sysproto.h> #include <sys/capsicum.h> +#include <sys/exterrvar.h> #include <sys/filedesc.h> #include <sys/filio.h> #include <sys/fcntl.h> #include <sys/file.h> -#include <sys/exterrvar.h> +#include <sys/inotify.h> #include <sys/lock.h> #include <sys/proc.h> #include <sys/signalvar.h> @@ -195,7 +196,7 @@ sys_read(struct thread *td, struct read_args *uap) int error; if (uap->nbyte > IOSIZE_MAX) - return (EINVAL); + return (EXTERROR(EINVAL, "length > iosize_max")); aiov.iov_base = uap->buf; aiov.iov_len = uap->nbyte; auio.uio_iov = &aiov; @@ -233,7 +234,7 @@ kern_pread(struct thread *td, int fd, void *buf, size_t nbyte, off_t offset) int error; if (nbyte > IOSIZE_MAX) - return (EINVAL); + return (EXTERROR(EINVAL, "length > iosize_max")); aiov.iov_base = buf; aiov.iov_len = nbyte; auio.uio_iov = &aiov; @@ -329,7 +330,7 @@ kern_preadv(struct thread *td, int fd, struct uio *auio, off_t offset) error = ESPIPE; else if (offset < 0 && (fp->f_vnode == NULL || fp->f_vnode->v_type != VCHR)) - error = EINVAL; + error = EXTERROR(EINVAL, "neg offset"); else error = dofileread(td, fd, fp, auio, offset, FOF_OFFSET); fdrop(fp, td); @@ -396,7 +397,7 @@ sys_write(struct thread *td, struct write_args *uap) int error; if (uap->nbyte > IOSIZE_MAX) - return (EINVAL); + return (EXTERROR(EINVAL, "length > iosize_max")); aiov.iov_base = (void *)(uintptr_t)uap->buf; aiov.iov_len = uap->nbyte; auio.uio_iov = &aiov; @@ -435,7 +436,7 @@ kern_pwrite(struct thread *td, int fd, const void *buf, size_t nbyte, int error; if (nbyte > IOSIZE_MAX) - return (EINVAL); + return (EXTERROR(EINVAL, "length > iosize_max")); aiov.iov_base = (void *)(uintptr_t)buf; aiov.iov_len = nbyte; auio.uio_iov = &aiov; @@ -531,7 +532,7 @@ kern_pwritev(struct thread *td, int fd, struct uio *auio, off_t offset) error = ESPIPE; else if (offset < 0 && (fp->f_vnode == NULL || fp->f_vnode->v_type != VCHR)) - error = EINVAL; + error = EXTERROR(EINVAL, "neg offset"); else error = dofilewrite(td, fd, fp, auio, offset, FOF_OFFSET); fdrop(fp, td); @@ -602,14 +603,14 @@ kern_ftruncate(struct thread *td, int fd, off_t length) AUDIT_ARG_FD(fd); if (length < 0) - return (EINVAL); + return (EXTERROR(EINVAL, "negative length")); error = fget(td, fd, &cap_ftruncate_rights, &fp); if (error) return (error); AUDIT_ARG_FILE(td->td_proc, fp); if (!(fp->f_flag & FWRITE)) { fdrop(fp, td); - return (EINVAL); + return (EXTERROR(EINVAL, "non-writable")); } error = fo_truncate(fp, length, td->td_ucred, td); fdrop(fp, td); @@ -840,8 +841,10 @@ kern_posix_fallocate(struct thread *td, int fd, off_t offset, off_t len) int error; AUDIT_ARG_FD(fd); - if (offset < 0 || len <= 0) - return (EINVAL); + if (offset < 0) + return (EXTERROR(EINVAL, "negative offset")); + if (len <= 0) + return (EXTERROR(EINVAL, "negative length")); /* Check for wrap. 
*/ if (offset > OFF_MAX - len) return (EFBIG); @@ -898,16 +901,21 @@ kern_fspacectl(struct thread *td, int fd, int cmd, AUDIT_ARG_FFLAGS(flags); if (rqsr == NULL) - return (EINVAL); + return (EXTERROR(EINVAL, "no range")); rmsr = *rqsr; if (rmsrp != NULL) *rmsrp = rmsr; - if (cmd != SPACECTL_DEALLOC || - rqsr->r_offset < 0 || rqsr->r_len <= 0 || - rqsr->r_offset > OFF_MAX - rqsr->r_len || - (flags & ~SPACECTL_F_SUPPORTED) != 0) - return (EINVAL); + if (cmd != SPACECTL_DEALLOC) + return (EXTERROR(EINVAL, "cmd", cmd)); + if (rqsr->r_offset < 0) + return (EXTERROR(EINVAL, "neg offset")); + if (rqsr->r_len <= 0) + return (EXTERROR(EINVAL, "neg len")); + if (rqsr->r_offset > OFF_MAX - rqsr->r_len) + return (EXTERROR(EINVAL, "offset too large")); + if ((flags & ~SPACECTL_F_SUPPORTED) != 0) + return (EXTERROR(EINVAL, "reserved flags", flags)); error = fget_write(td, fd, &cap_pwrite_rights, &fp); if (error != 0) @@ -939,7 +947,6 @@ int kern_specialfd(struct thread *td, int type, void *arg) { struct file *fp; - struct specialfd_eventfd *ae; int error, fd, fflags; fflags = 0; @@ -948,14 +955,24 @@ kern_specialfd(struct thread *td, int type, void *arg) return (error); switch (type) { - case SPECIALFD_EVENTFD: + case SPECIALFD_EVENTFD: { + struct specialfd_eventfd *ae; + ae = arg; if ((ae->flags & EFD_CLOEXEC) != 0) fflags |= O_CLOEXEC; error = eventfd_create_file(td, fp, ae->initval, ae->flags); break; + } + case SPECIALFD_INOTIFY: { + struct specialfd_inotify *si; + + si = arg; + error = inotify_create_file(td, fp, si->flags, &fflags); + break; + } default: - error = EINVAL; + error = EXTERROR(EINVAL, "invalid type", type); break; } @@ -970,13 +987,14 @@ kern_specialfd(struct thread *td, int type, void *arg) int sys___specialfd(struct thread *td, struct __specialfd_args *args) { - struct specialfd_eventfd ae; int error; switch (args->type) { - case SPECIALFD_EVENTFD: + case SPECIALFD_EVENTFD: { + struct specialfd_eventfd ae; + if (args->len != sizeof(struct specialfd_eventfd)) { - error = EINVAL; + error = EXTERROR(EINVAL, "eventfd params ABI"); break; } error = copyin(args->req, &ae, sizeof(ae)); @@ -984,13 +1002,27 @@ sys___specialfd(struct thread *td, struct __specialfd_args *args) break; if ((ae.flags & ~(EFD_CLOEXEC | EFD_NONBLOCK | EFD_SEMAPHORE)) != 0) { - error = EINVAL; + error = EXTERROR(EINVAL, "reserved flag"); break; } error = kern_specialfd(td, args->type, &ae); break; + } + case SPECIALFD_INOTIFY: { + struct specialfd_inotify si; + + if (args->len != sizeof(si)) { + error = EINVAL; + break; + } + error = copyin(args->req, &si, sizeof(si)); + if (error != 0) + break; + error = kern_specialfd(td, args->type, &si); + break; + } default: - error = EINVAL; + error = EXTERROR(EINVAL, "unknown type", args->type); break; } return (error); @@ -1166,7 +1198,7 @@ kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou, int error, lf, ndu; if (nd < 0) - return (EINVAL); + return (EXTERROR(EINVAL, "negative ndescs")); fdp = td->td_proc->p_fd; ndu = nd; lf = fdp->fd_nfiles; @@ -1259,7 +1291,7 @@ kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou, rtv = *tvp; if (rtv.tv_sec < 0 || rtv.tv_usec < 0 || rtv.tv_usec >= 1000000) { - error = EINVAL; + error = EXTERROR(EINVAL, "invalid timeval"); goto done; } if (!timevalisset(&rtv)) @@ -1491,7 +1523,7 @@ sys_poll(struct thread *td, struct poll_args *uap) if (uap->timeout != INFTIM) { if (uap->timeout < 0) - return (EINVAL); + return (EXTERROR(EINVAL, "invalid timeout")); ts.tv_sec = uap->timeout / 1000; ts.tv_nsec = 
(uap->timeout % 1000) * 1000000; tsp = &ts; @@ -1516,7 +1548,7 @@ kern_poll_kfds(struct thread *td, struct pollfd *kfds, u_int nfds, precision = 0; if (tsp != NULL) { if (!timespecvalid_interval(tsp)) - return (EINVAL); + return (EXTERROR(EINVAL, "invalid timespec")); if (tsp->tv_sec == 0 && tsp->tv_nsec == 0) sbt = 0; else { @@ -1619,7 +1651,7 @@ kern_poll(struct thread *td, struct pollfd *ufds, u_int nfds, int error; if (kern_poll_maxfds(nfds)) - return (EINVAL); + return (EXTERROR(EINVAL, "too large nfds")); if (nfds > nitems(stackfds)) kfds = mallocarray(nfds, sizeof(*kfds), M_TEMP, M_WAITOK); else @@ -1796,7 +1828,7 @@ selsocket(struct socket *so, int events, struct timeval *tvp, struct thread *td) rtv = *tvp; if (rtv.tv_sec < 0 || rtv.tv_usec < 0 || rtv.tv_usec >= 1000000) - return (EINVAL); + return (EXTERROR(EINVAL, "invalid timeval")); if (!timevalisset(&rtv)) asbt = 0; else if (rtv.tv_sec <= INT32_MAX) { @@ -2173,7 +2205,7 @@ kern_kcmp(struct thread *td, pid_t pid1, pid_t pid2, int type, (uintptr_t)p2->p_vmspace); break; default: - error = EINVAL; + error = EXTERROR(EINVAL, "unknown op"); break; } @@ -2277,6 +2309,12 @@ sys_exterrctl(struct thread *td, struct exterrctl_args *uap) return (EINVAL); td->td_pflags2 &= ~TDP2_UEXTERR; return (0); + case EXTERRCTL_UD: + /* + * Important: this code must always return EINVAL and never any + * extended error, for testing purposes. + */ + /* FALLTHROUGH */ default: return (EINVAL); } diff --git a/sys/kern/sys_pipe.c b/sys/kern/sys_pipe.c index 9340779918a2..ed651da96b14 100644 --- a/sys/kern/sys_pipe.c +++ b/sys/kern/sys_pipe.c @@ -548,7 +548,7 @@ sys_pipe2(struct thread *td, struct pipe2_args *uap) { int error, fildes[2]; - if (uap->flags & ~(O_CLOEXEC | O_NONBLOCK)) + if ((uap->flags & ~(O_CLOEXEC | O_CLOFORK | O_NONBLOCK)) != 0) return (EINVAL); error = kern_pipe(td, fildes, uap->flags, NULL, NULL); if (error) diff --git a/sys/kern/syscalls.c b/sys/kern/syscalls.c index fa36cc824078..90a4f3a7dad8 100644 --- a/sys/kern/syscalls.c +++ b/sys/kern/syscalls.c @@ -598,4 +598,6 @@ const char *syscallnames[] = { "fchroot", /* 590 = fchroot */ "setcred", /* 591 = setcred */ "exterrctl", /* 592 = exterrctl */ + "inotify_add_watch_at", /* 593 = inotify_add_watch_at */ + "inotify_rm_watch", /* 594 = inotify_rm_watch */ }; diff --git a/sys/kern/syscalls.master b/sys/kern/syscalls.master index 08b557a7a540..90559fab6086 100644 --- a/sys/kern/syscalls.master +++ b/sys/kern/syscalls.master @@ -3349,11 +3349,26 @@ size_t size ); } -592 AUE_NULL STD { +592 AUE_NULL STD|CAPENABLED { int exterrctl( u_int op, u_int flags, _In_reads_bytes_(4) void *ptr ); } +593 AUE_INOTIFY STD|CAPENABLED { + int inotify_add_watch_at( + int fd, + int dfd, + _In_z_ const char *path, + uint32_t mask + ); + } +594 AUE_INOTIFY STD|CAPENABLED { + int inotify_rm_watch( + int fd, + int wd + ); + } + ; vim: syntax=off diff --git a/sys/kern/systrace_args.c b/sys/kern/systrace_args.c index 15789d3eb5fa..90b21616a558 100644 --- a/sys/kern/systrace_args.c +++ b/sys/kern/systrace_args.c @@ -3482,6 +3482,24 @@ systrace_args(int sysnum, void *params, uint64_t *uarg, int *n_args) *n_args = 3; break; } + /* inotify_add_watch_at */ + case 593: { + struct inotify_add_watch_at_args *p = params; + iarg[a++] = p->fd; /* int */ + iarg[a++] = p->dfd; /* int */ + uarg[a++] = (intptr_t)p->path; /* const char * */ + uarg[a++] = p->mask; /* uint32_t */ + *n_args = 4; + break; + } + /* inotify_rm_watch */ + case 594: { + struct inotify_rm_watch_args *p = params; + iarg[a++] = p->fd; /* int */ + iarg[a++] 
= p->wd; /* int */ + *n_args = 2; + break; + } default: *n_args = 0; break; @@ -9317,6 +9335,38 @@ systrace_entry_setargdesc(int sysnum, int ndx, char *desc, size_t descsz) break; }; break; + /* inotify_add_watch_at */ + case 593: + switch (ndx) { + case 0: + p = "int"; + break; + case 1: + p = "int"; + break; + case 2: + p = "userland const char *"; + break; + case 3: + p = "uint32_t"; + break; + default: + break; + }; + break; + /* inotify_rm_watch */ + case 594: + switch (ndx) { + case 0: + p = "int"; + break; + case 1: + p = "int"; + break; + default: + break; + }; + break; default: break; }; @@ -11305,6 +11355,16 @@ systrace_return_setargdesc(int sysnum, int ndx, char *desc, size_t descsz) if (ndx == 0 || ndx == 1) p = "int"; break; + /* inotify_add_watch_at */ + case 593: + if (ndx == 0 || ndx == 1) + p = "int"; + break; + /* inotify_rm_watch */ + case 594: + if (ndx == 0 || ndx == 1) + p = "int"; + break; default: break; }; diff --git a/sys/kern/sysv_msg.c b/sys/kern/sysv_msg.c index 11141d197aec..a545a0a54c25 100644 --- a/sys/kern/sysv_msg.c +++ b/sys/kern/sysv_msg.c @@ -1724,7 +1724,7 @@ freebsd32_msgsys(struct thread *td, struct freebsd32_msgsys_args *uap) return (sys_msgsys(td, (struct msgsys_args *)uap)); } #else - return (nosys(td, NULL)); + return (kern_nosys(td, 0)); #endif } diff --git a/sys/kern/sysv_sem.c b/sys/kern/sysv_sem.c index e399517010fc..a99e1a4de14e 100644 --- a/sys/kern/sysv_sem.c +++ b/sys/kern/sysv_sem.c @@ -1904,7 +1904,7 @@ freebsd32_semsys(struct thread *td, struct freebsd32_semsys_args *uap) return (sys_semsys(td, (struct semsys_args *)uap)); } #else - return (nosys(td, NULL)); + return (kern_nosys(td, 0)); #endif } diff --git a/sys/kern/sysv_shm.c b/sys/kern/sysv_shm.c index 60e3fe92a4b7..8d1a469127c6 100644 --- a/sys/kern/sysv_shm.c +++ b/sys/kern/sysv_shm.c @@ -1474,7 +1474,7 @@ freebsd32_shmsys(struct thread *td, struct freebsd32_shmsys_args *uap) return (EINVAL); } #else - return (nosys(td, NULL)); + return (kern_nosys(td, 0)); #endif } diff --git a/sys/kern/uipc_syscalls.c b/sys/kern/uipc_syscalls.c index ad8485028987..133724ac76c5 100644 --- a/sys/kern/uipc_syscalls.c +++ b/sys/kern/uipc_syscalls.c @@ -151,6 +151,10 @@ kern_socket(struct thread *td, int domain, int type, int protocol) type &= ~SOCK_CLOEXEC; oflag |= O_CLOEXEC; } + if ((type & SOCK_CLOFORK) != 0) { + type &= ~SOCK_CLOFORK; + oflag |= O_CLOFORK; + } if ((type & SOCK_NONBLOCK) != 0) { type &= ~SOCK_NONBLOCK; fflag |= FNONBLOCK; @@ -352,7 +356,8 @@ kern_accept4(struct thread *td, int s, struct sockaddr *sa, int flags, goto done; #endif error = falloc_caps(td, &nfp, &fd, - (flags & SOCK_CLOEXEC) ? O_CLOEXEC : 0, &fcaps); + ((flags & SOCK_CLOEXEC) != 0 ? O_CLOEXEC : 0) | + ((flags & SOCK_CLOFORK) != 0 ? 
O_CLOFORK : 0), &fcaps); if (error != 0) goto done; SOCK_LOCK(head); @@ -435,7 +440,7 @@ int sys_accept4(struct thread *td, struct accept4_args *uap) { - if (uap->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) + if ((uap->flags & ~(SOCK_CLOEXEC | SOCK_CLOFORK | SOCK_NONBLOCK)) != 0) return (EINVAL); return (accept1(td, uap->s, uap->name, uap->anamelen, uap->flags)); @@ -557,6 +562,10 @@ kern_socketpair(struct thread *td, int domain, int type, int protocol, type &= ~SOCK_CLOEXEC; oflag |= O_CLOEXEC; } + if ((type & SOCK_CLOFORK) != 0) { + type &= ~SOCK_CLOFORK; + oflag |= O_CLOFORK; + } if ((type & SOCK_NONBLOCK) != 0) { type &= ~SOCK_NONBLOCK; fflag |= FNONBLOCK; diff --git a/sys/kern/uipc_usrreq.c b/sys/kern/uipc_usrreq.c index 72bd0246db11..0056dac65c7d 100644 --- a/sys/kern/uipc_usrreq.c +++ b/sys/kern/uipc_usrreq.c @@ -3463,7 +3463,8 @@ unp_externalize(struct mbuf *control, struct mbuf **controlp, int flags) UNP_LINK_UNLOCK_ASSERT(); - fdflags = (flags & MSG_CMSG_CLOEXEC) ? O_CLOEXEC : 0; + fdflags = ((flags & MSG_CMSG_CLOEXEC) ? O_CLOEXEC : 0) | + ((flags & MSG_CMSG_CLOFORK) ? O_CLOFORK : 0); error = 0; if (controlp != NULL) /* controlp == NULL => free control messages */ diff --git a/sys/kern/vfs_aio.c b/sys/kern/vfs_aio.c index 97dc854c9386..02973146068d 100644 --- a/sys/kern/vfs_aio.c +++ b/sys/kern/vfs_aio.c @@ -301,7 +301,7 @@ static TAILQ_HEAD(,kaiocb) aio_jobs; /* (c) Async job list */ static struct unrhdr *aiod_unr; static void aio_biocleanup(struct bio *bp); -void aio_init_aioinfo(struct proc *p); +static int aio_init_aioinfo(struct proc *p); static int aio_onceonly(void); static int aio_free_entry(struct kaiocb *job); static void aio_process_rw(struct kaiocb *job); @@ -309,7 +309,7 @@ static void aio_process_sync(struct kaiocb *job); static void aio_process_mlock(struct kaiocb *job); static void aio_schedule_fsync(void *context, int pending); static int aio_newproc(int *); -int aio_aqueue(struct thread *td, struct aiocb *ujob, +static int aio_aqueue(struct thread *td, struct aiocb *ujob, struct aioliojob *lio, int type, struct aiocb_ops *ops); static int aio_queue_file(struct file *fp, struct kaiocb *job); static void aio_biowakeup(struct bio *bp); @@ -422,10 +422,11 @@ aio_onceonly(void) * Init the per-process aioinfo structure. The aioinfo limits are set * per-process for user limit (resource) management. */ -void +static int aio_init_aioinfo(struct proc *p) { struct kaioinfo *ki; + int error; ki = uma_zalloc(kaio_zone, M_WAITOK); mtx_init(&ki->kaio_mtx, "aiomtx", NULL, MTX_DEF | MTX_NEW); @@ -451,8 +452,20 @@ aio_init_aioinfo(struct proc *p) uma_zfree(kaio_zone, ki); } - while (num_aio_procs < MIN(target_aio_procs, max_aio_procs)) - aio_newproc(NULL); + error = 0; + while (num_aio_procs < MIN(target_aio_procs, max_aio_procs)) { + error = aio_newproc(NULL); + if (error != 0) { + /* + * At least one worker is enough to have AIO + * functional. Clear error in that case. + */ + if (num_aio_procs > 0) + error = 0; + break; + } + } + return (error); } static int @@ -1476,7 +1489,7 @@ static struct aiocb_ops aiocb_ops_osigevent = { * Queue a new AIO request. Choosing either the threaded or direct bio VCHR * technique is done in this code. 
*/ -int +static int aio_aqueue(struct thread *td, struct aiocb *ujob, struct aioliojob *lj, int type, struct aiocb_ops *ops) { @@ -1490,8 +1503,11 @@ aio_aqueue(struct thread *td, struct aiocb *ujob, struct aioliojob *lj, int fd, kqfd; u_short evflags; - if (p->p_aioinfo == NULL) - aio_init_aioinfo(p); + if (p->p_aioinfo == NULL) { + error = aio_init_aioinfo(p); + if (error != 0) + goto err1; + } ki = p->p_aioinfo; @@ -2213,8 +2229,11 @@ kern_lio_listio(struct thread *td, int mode, struct aiocb * const *uacb_list, if (nent < 0 || nent > max_aio_queue_per_proc) return (EINVAL); - if (p->p_aioinfo == NULL) - aio_init_aioinfo(p); + if (p->p_aioinfo == NULL) { + error = aio_init_aioinfo(p); + if (error != 0) + return (error); + } ki = p->p_aioinfo; @@ -2503,8 +2522,11 @@ kern_aio_waitcomplete(struct thread *td, struct aiocb **ujobp, timo = tvtohz(&atv); } - if (p->p_aioinfo == NULL) - aio_init_aioinfo(p); + if (p->p_aioinfo == NULL) { + error = aio_init_aioinfo(p); + if (error != 0) + return (error); + } ki = p->p_aioinfo; error = 0; diff --git a/sys/kern/vfs_cache.c b/sys/kern/vfs_cache.c index 883beaf6d1da..89c1d779f04c 100644 --- a/sys/kern/vfs_cache.c +++ b/sys/kern/vfs_cache.c @@ -41,6 +41,7 @@ #include <sys/counter.h> #include <sys/filedesc.h> #include <sys/fnv_hash.h> +#include <sys/inotify.h> #include <sys/kernel.h> #include <sys/ktr.h> #include <sys/lock.h> @@ -331,7 +332,8 @@ SDT_PROBE_DEFINE2(vfs, namecache, evict_negative, done, "struct vnode *", "char *"); SDT_PROBE_DEFINE1(vfs, namecache, symlink, alloc__fail, "size_t"); -SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool"); +SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata *", "int", + "enum cache_fpl_status"); SDT_PROBE_DECLARE(vfs, namei, lookup, entry); SDT_PROBE_DECLARE(vfs, namei, lookup, return); @@ -2629,6 +2631,14 @@ cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, atomic_store_ptr(&dvp->v_cache_dd, ncp); } else if (vp != NULL) { /* + * Take the slow path in INOTIFY(). This flag will be lazily + * cleared by cache_vop_inotify() once all directories referring + * to vp are unwatched. + */ + if (__predict_false((vn_irflag_read(dvp) & VIRF_INOTIFY) != 0)) + vn_irflag_set_cond(vp, VIRF_INOTIFY_PARENT); + + /* * For this case, the cache entry maps both the * directory name in it and the name ".." for the * directory's parent. @@ -4008,6 +4018,56 @@ out: return (error); } +void +cache_vop_inotify(struct vnode *vp, int event, uint32_t cookie) +{ + struct mtx *vlp; + struct namecache *ncp; + int isdir; + bool logged, self; + + isdir = vp->v_type == VDIR ? IN_ISDIR : 0; + self = (vn_irflag_read(vp) & VIRF_INOTIFY) != 0 && + (vp->v_type != VDIR || (event & ~_IN_DIR_EVENTS) != 0); + + if (self) { + int selfevent; + + if (event == _IN_ATTRIB_LINKCOUNT) + selfevent = IN_ATTRIB; + else + selfevent = event; + inotify_log(vp, NULL, 0, selfevent | isdir, cookie); + } + if ((event & IN_ALL_EVENTS) == 0) + return; + + logged = false; + vlp = VP2VNODELOCK(vp); + mtx_lock(vlp); + TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) { + if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) + continue; + if ((vn_irflag_read(ncp->nc_dvp) & VIRF_INOTIFY) != 0) { + /* + * XXX-MJ if the vnode has two links in the same + * dir, we'll log the same event twice. 
+ */ + inotify_log(ncp->nc_dvp, ncp->nc_name, ncp->nc_nlen, + event | isdir, cookie); + logged = true; + } + } + if (!logged && (vn_irflag_read(vp) & VIRF_INOTIFY_PARENT) != 0) { + /* + * We didn't find a watched directory that contains this vnode, + * so stop calling VOP_INOTIFY for operations on the vnode. + */ + vn_irflag_unset(vp, VIRF_INOTIFY_PARENT); + } + mtx_unlock(vlp); +} + #ifdef DDB static void db_print_vpath(struct vnode *vp) @@ -6361,15 +6421,11 @@ out: cache_fpl_smr_assert_not_entered(&fpl); cache_fpl_assert_status(&fpl); *status = fpl.status; - if (SDT_PROBES_ENABLED()) { - SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status); - if (fpl.status == CACHE_FPL_STATUS_HANDLED) - SDT_PROBE4(vfs, namei, lookup, return, error, ndp->ni_vp, true, - ndp); - } - + SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status); if (__predict_true(fpl.status == CACHE_FPL_STATUS_HANDLED)) { MPASS(error != CACHE_FPL_FAILED); + SDT_PROBE4(vfs, namei, lookup, return, error, ndp->ni_vp, true, + ndp); if (error != 0) { cache_fpl_cleanup_cnp(fpl.cnp); MPASS(fpl.dvp == NULL); diff --git a/sys/kern/vfs_default.c b/sys/kern/vfs_default.c index be49c0887609..fd6202a1424c 100644 --- a/sys/kern/vfs_default.c +++ b/sys/kern/vfs_default.c @@ -39,6 +39,7 @@ #include <sys/conf.h> #include <sys/event.h> #include <sys/filio.h> +#include <sys/inotify.h> #include <sys/kernel.h> #include <sys/limits.h> #include <sys/lock.h> @@ -119,6 +120,8 @@ struct vop_vector default_vnodeops = { .vop_getwritemount = vop_stdgetwritemount, .vop_inactive = VOP_NULL, .vop_need_inactive = vop_stdneed_inactive, + .vop_inotify = vop_stdinotify, + .vop_inotify_add_watch = vop_stdinotify_add_watch, .vop_ioctl = vop_stdioctl, .vop_kqfilter = vop_stdkqfilter, .vop_islocked = vop_stdislocked, @@ -453,6 +456,7 @@ vop_stdpathconf(struct vop_pathconf_args *ap) case _PC_MAC_PRESENT: case _PC_NAMEDATTR_ENABLED: case _PC_HAS_NAMEDATTR: + case _PC_HAS_HIDDENSYSTEM: *ap->a_retval = 0; return (0); default: @@ -1306,6 +1310,20 @@ vop_stdneed_inactive(struct vop_need_inactive_args *ap) } int +vop_stdinotify(struct vop_inotify_args *ap) +{ + vn_inotify(ap->a_vp, ap->a_dvp, ap->a_cnp, ap->a_event, ap->a_cookie); + return (0); +} + +int +vop_stdinotify_add_watch(struct vop_inotify_add_watch_args *ap) +{ + return (vn_inotify_add_watch(ap->a_vp, ap->a_sc, ap->a_mask, + ap->a_wdp, ap->a_td)); +} + +int vop_stdioctl(struct vop_ioctl_args *ap) { struct vnode *vp; diff --git a/sys/kern/vfs_inotify.c b/sys/kern/vfs_inotify.c new file mode 100644 index 000000000000..d3cd0d1f9832 --- /dev/null +++ b/sys/kern/vfs_inotify.c @@ -0,0 +1,1011 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2025 Klara, Inc. 
+ */ + +#include "opt_ktrace.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/caprights.h> +#include <sys/counter.h> +#include <sys/dirent.h> +#define EXTERR_CATEGORY EXTERR_CAT_INOTIFY +#include <sys/exterrvar.h> +#include <sys/fcntl.h> +#include <sys/file.h> +#include <sys/filio.h> +#include <sys/inotify.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/ktrace.h> +#include <sys/malloc.h> +#include <sys/mutex.h> +#include <sys/namei.h> +#include <sys/poll.h> +#include <sys/proc.h> +#include <sys/queue.h> +#include <sys/resourcevar.h> +#include <sys/selinfo.h> +#include <sys/stat.h> +#include <sys/syscallsubr.h> +#include <sys/sysctl.h> +#include <sys/sysent.h> +#include <sys/syslimits.h> +#include <sys/sysproto.h> +#include <sys/tree.h> +#include <sys/user.h> +#include <sys/vnode.h> + +uint32_t inotify_rename_cookie; + +static SYSCTL_NODE(_vfs, OID_AUTO, inotify, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, + "inotify configuration"); + +static int inotify_max_queued_events = 16384; +SYSCTL_INT(_vfs_inotify, OID_AUTO, max_queued_events, CTLFLAG_RWTUN, + &inotify_max_queued_events, 0, + "Maximum number of events to queue on an inotify descriptor"); + +static int inotify_max_user_instances = 256; +SYSCTL_INT(_vfs_inotify, OID_AUTO, max_user_instances, CTLFLAG_RWTUN, + &inotify_max_user_instances, 0, + "Maximum number of inotify descriptors per user"); + +static int inotify_max_user_watches; +SYSCTL_INT(_vfs_inotify, OID_AUTO, max_user_watches, CTLFLAG_RWTUN, + &inotify_max_user_watches, 0, + "Maximum number of inotify watches per user"); + +static int inotify_max_watches; +SYSCTL_INT(_vfs_inotify, OID_AUTO, max_watches, CTLFLAG_RWTUN, + &inotify_max_watches, 0, + "Maximum number of inotify watches system-wide"); + +static int inotify_watches; +SYSCTL_INT(_vfs_inotify, OID_AUTO, watches, CTLFLAG_RD, + &inotify_watches, 0, + "Total number of inotify watches currently in use"); + +static int inotify_coalesce = 1; +SYSCTL_INT(_vfs_inotify, OID_AUTO, coalesce, CTLFLAG_RWTUN, + &inotify_coalesce, 0, + "Coalesce inotify events when possible"); + +static COUNTER_U64_DEFINE_EARLY(inotify_event_drops); +SYSCTL_COUNTER_U64(_vfs_inotify, OID_AUTO, event_drops, CTLFLAG_RD, + &inotify_event_drops, + "Number of inotify events dropped due to limits or allocation failures"); + +static fo_rdwr_t inotify_read; +static fo_ioctl_t inotify_ioctl; +static fo_poll_t inotify_poll; +static fo_kqfilter_t inotify_kqfilter; +static fo_stat_t inotify_stat; +static fo_close_t inotify_close; +static fo_fill_kinfo_t inotify_fill_kinfo; + +static const struct fileops inotifyfdops = { + .fo_read = inotify_read, + .fo_write = invfo_rdwr, + .fo_truncate = invfo_truncate, + .fo_ioctl = inotify_ioctl, + .fo_poll = inotify_poll, + .fo_kqfilter = inotify_kqfilter, + .fo_stat = inotify_stat, + .fo_close = inotify_close, + .fo_chmod = invfo_chmod, + .fo_chown = invfo_chown, + .fo_sendfile = invfo_sendfile, + .fo_fill_kinfo = inotify_fill_kinfo, + .fo_cmp = file_kcmp_generic, + .fo_flags = DFLAG_PASSABLE, +}; + +static void filt_inotifydetach(struct knote *kn); +static int filt_inotifyevent(struct knote *kn, long hint); + +static const struct filterops inotify_rfiltops = { + .f_isfd = 1, + .f_detach = filt_inotifydetach, + .f_event = filt_inotifyevent, +}; + +static MALLOC_DEFINE(M_INOTIFY, "inotify", "inotify data structures"); + +struct inotify_record { + STAILQ_ENTRY(inotify_record) link; + struct inotify_event ev; +}; + +static uint64_t inotify_ino = 1; + +/* + * On LP64 systems this occupies 64 bytes, so we 
don't get internal + * fragmentation by allocating watches with malloc(9). If the size changes, + * consider using a UMA zone to improve memory efficiency. + */ +struct inotify_watch { + struct inotify_softc *sc; /* back-pointer */ + int wd; /* unique ID */ + uint32_t mask; /* event mask */ + struct vnode *vp; /* vnode being watched, refed */ + RB_ENTRY(inotify_watch) ilink; /* inotify linkage */ + TAILQ_ENTRY(inotify_watch) vlink; /* vnode linkage */ +}; + +static void +inotify_init(void *arg __unused) +{ + /* Don't let a user hold too many vnodes. */ + inotify_max_user_watches = desiredvnodes / 3; + /* Don't let the system hold too many vnodes. */ + inotify_max_watches = desiredvnodes / 2; +} +SYSINIT(inotify, SI_SUB_VFS, SI_ORDER_ANY, inotify_init, NULL); + +static int +inotify_watch_cmp(const struct inotify_watch *a, + const struct inotify_watch *b) +{ + if (a->wd < b->wd) + return (-1); + else if (a->wd > b->wd) + return (1); + else + return (0); +} +RB_HEAD(inotify_watch_tree, inotify_watch); +RB_GENERATE_STATIC(inotify_watch_tree, inotify_watch, ilink, inotify_watch_cmp); + +struct inotify_softc { + struct mtx lock; /* serialize all softc writes */ + STAILQ_HEAD(, inotify_record) pending; /* events waiting to be read */ + struct inotify_record overflow; /* preallocated record */ + int nextwatch; /* next watch ID to try */ + int npending; /* number of pending events */ + size_t nbpending; /* bytes available to read */ + uint64_t ino; /* unique identifier */ + struct inotify_watch_tree watches; /* active watches */ + struct selinfo sel; /* select/poll/kevent info */ + struct ucred *cred; /* credential ref */ +}; + +static struct inotify_record * +inotify_dequeue(struct inotify_softc *sc) +{ + struct inotify_record *rec; + + mtx_assert(&sc->lock, MA_OWNED); + KASSERT(!STAILQ_EMPTY(&sc->pending), + ("%s: queue for %p is empty", __func__, sc)); + + rec = STAILQ_FIRST(&sc->pending); + STAILQ_REMOVE_HEAD(&sc->pending, link); + sc->npending--; + sc->nbpending -= sizeof(rec->ev) + rec->ev.len; + return (rec); +} + +static void +inotify_enqueue(struct inotify_softc *sc, struct inotify_record *rec, bool head) +{ + mtx_assert(&sc->lock, MA_OWNED); + + if (head) + STAILQ_INSERT_HEAD(&sc->pending, rec, link); + else + STAILQ_INSERT_TAIL(&sc->pending, rec, link); + sc->npending++; + sc->nbpending += sizeof(rec->ev) + rec->ev.len; +} + +static int +inotify_read(struct file *fp, struct uio *uio, struct ucred *cred, int flags, + struct thread *td) +{ + struct inotify_softc *sc; + struct inotify_record *rec; + int error; + bool first; + + sc = fp->f_data; + error = 0; + + mtx_lock(&sc->lock); + while (STAILQ_EMPTY(&sc->pending)) { + if ((flags & IO_NDELAY) != 0 || (fp->f_flag & FNONBLOCK) != 0) { + mtx_unlock(&sc->lock); + return (EWOULDBLOCK); + } + error = msleep(&sc->pending, &sc->lock, PCATCH, "inotify", 0); + if (error != 0) { + mtx_unlock(&sc->lock); + return (error); + } + } + for (first = true; !STAILQ_EMPTY(&sc->pending); first = false) { + size_t len; + + rec = inotify_dequeue(sc); + len = sizeof(rec->ev) + rec->ev.len; + if (uio->uio_resid < (ssize_t)len) { + inotify_enqueue(sc, rec, true); + if (first) { + error = EXTERROR(EINVAL, + "read buffer is too small"); + } + break; + } + mtx_unlock(&sc->lock); + error = uiomove(&rec->ev, len, uio); +#ifdef KTRACE + if (error == 0 && KTRPOINT(td, KTR_STRUCT)) + ktrstruct("inotify", &rec->ev, len); +#endif + mtx_lock(&sc->lock); + if (error != 0) { + inotify_enqueue(sc, rec, true); + mtx_unlock(&sc->lock); + return (error); + } + if (rec == 
&sc->overflow) { + /* + * Signal to inotify_queue_record() that the overflow + * record can be reused. + */ + memset(rec, 0, sizeof(*rec)); + } else { + free(rec, M_INOTIFY); + } + } + mtx_unlock(&sc->lock); + return (error); +} + +static int +inotify_ioctl(struct file *fp, u_long com, void *data, struct ucred *cred, + struct thread *td) +{ + struct inotify_softc *sc; + + sc = fp->f_data; + + switch (com) { + case FIONREAD: + *(int *)data = (int)sc->nbpending; + return (0); + case FIONBIO: + case FIOASYNC: + return (0); + default: + return (ENOTTY); + } + + return (0); +} + +static int +inotify_poll(struct file *fp, int events, struct ucred *cred, struct thread *td) +{ + struct inotify_softc *sc; + int revents; + + sc = fp->f_data; + revents = 0; + + mtx_lock(&sc->lock); + if ((events & (POLLIN | POLLRDNORM)) != 0 && sc->npending > 0) + revents |= events & (POLLIN | POLLRDNORM); + else + selrecord(td, &sc->sel); + mtx_unlock(&sc->lock); + return (revents); +} + +static void +filt_inotifydetach(struct knote *kn) +{ + struct inotify_softc *sc; + + sc = kn->kn_hook; + knlist_remove(&sc->sel.si_note, kn, 0); +} + +static int +filt_inotifyevent(struct knote *kn, long hint) +{ + struct inotify_softc *sc; + + sc = kn->kn_hook; + mtx_assert(&sc->lock, MA_OWNED); + kn->kn_data = sc->nbpending; + return (kn->kn_data > 0); +} + +static int +inotify_kqfilter(struct file *fp, struct knote *kn) +{ + struct inotify_softc *sc; + + if (kn->kn_filter != EVFILT_READ) + return (EINVAL); + sc = fp->f_data; + kn->kn_fop = &inotify_rfiltops; + kn->kn_hook = sc; + knlist_add(&sc->sel.si_note, kn, 0); + return (0); +} + +static int +inotify_stat(struct file *fp, struct stat *sb, struct ucred *cred) +{ + struct inotify_softc *sc; + + sc = fp->f_data; + + memset(sb, 0, sizeof(*sb)); + sb->st_mode = S_IFREG | S_IRUSR; + sb->st_blksize = sizeof(struct inotify_event) + _IN_NAMESIZE(NAME_MAX); + mtx_lock(&sc->lock); + sb->st_size = sc->nbpending; + sb->st_blocks = sc->npending; + sb->st_uid = sc->cred->cr_ruid; + sb->st_gid = sc->cred->cr_rgid; + sb->st_ino = sc->ino; + mtx_unlock(&sc->lock); + return (0); +} + +static void +inotify_unlink_watch_locked(struct inotify_softc *sc, struct inotify_watch *watch) +{ + struct vnode *vp; + + vp = watch->vp; + mtx_assert(&vp->v_pollinfo->vpi_lock, MA_OWNED); + + atomic_subtract_int(&inotify_watches, 1); + (void)chginotifywatchcnt(sc->cred->cr_ruidinfo, -1, 0); + + TAILQ_REMOVE(&vp->v_pollinfo->vpi_inotify, watch, vlink); + if (TAILQ_EMPTY(&vp->v_pollinfo->vpi_inotify)) + vn_irflag_unset(vp, VIRF_INOTIFY); +} + +/* + * Assumes that the watch has already been removed from its softc. 
+ */ +static void +inotify_remove_watch(struct inotify_watch *watch) +{ + struct inotify_softc *sc; + struct vnode *vp; + + sc = watch->sc; + + vp = watch->vp; + mtx_lock(&vp->v_pollinfo->vpi_lock); + inotify_unlink_watch_locked(sc, watch); + mtx_unlock(&vp->v_pollinfo->vpi_lock); + + vrele(vp); + free(watch, M_INOTIFY); +} + +static int +inotify_close(struct file *fp, struct thread *td) +{ + struct inotify_softc *sc; + struct inotify_record *rec; + struct inotify_watch *watch; + + sc = fp->f_data; + + mtx_lock(&sc->lock); + (void)chginotifycnt(sc->cred->cr_ruidinfo, -1, 0); + while ((watch = RB_MIN(inotify_watch_tree, &sc->watches)) != NULL) { + RB_REMOVE(inotify_watch_tree, &sc->watches, watch); + mtx_unlock(&sc->lock); + inotify_remove_watch(watch); + mtx_lock(&sc->lock); + } + while (!STAILQ_EMPTY(&sc->pending)) { + rec = inotify_dequeue(sc); + if (rec != &sc->overflow) + free(rec, M_INOTIFY); + } + mtx_unlock(&sc->lock); + seldrain(&sc->sel); + knlist_destroy(&sc->sel.si_note); + mtx_destroy(&sc->lock); + crfree(sc->cred); + free(sc, M_INOTIFY); + return (0); +} + +static int +inotify_fill_kinfo(struct file *fp, struct kinfo_file *kif, + struct filedesc *fdp) +{ + struct inotify_softc *sc; + + sc = fp->f_data; + + kif->kf_type = KF_TYPE_INOTIFY; + kif->kf_un.kf_inotify.kf_inotify_npending = sc->npending; + kif->kf_un.kf_inotify.kf_inotify_nbpending = sc->nbpending; + return (0); +} + +int +inotify_create_file(struct thread *td, struct file *fp, int flags, int *fflagsp) +{ + struct inotify_softc *sc; + int fflags; + + if ((flags & ~(IN_NONBLOCK | IN_CLOEXEC)) != 0) + return (EINVAL); + + if (!chginotifycnt(td->td_ucred->cr_ruidinfo, 1, + inotify_max_user_instances)) + return (EMFILE); + + sc = malloc(sizeof(*sc), M_INOTIFY, M_WAITOK | M_ZERO); + sc->nextwatch = 1; /* Required for compatibility. */ + STAILQ_INIT(&sc->pending); + RB_INIT(&sc->watches); + mtx_init(&sc->lock, "inotify", NULL, MTX_DEF); + knlist_init_mtx(&sc->sel.si_note, &sc->lock); + sc->cred = crhold(td->td_ucred); + sc->ino = atomic_fetchadd_64(&inotify_ino, 1); + + fflags = FREAD; + if ((flags & IN_NONBLOCK) != 0) + fflags |= FNONBLOCK; + if ((flags & IN_CLOEXEC) != 0) + *fflagsp |= O_CLOEXEC; + finit(fp, fflags, DTYPE_INOTIFY, sc, &inotifyfdops); + + return (0); +} + +static struct inotify_record * +inotify_alloc_record(uint32_t wd, const char *name, size_t namelen, int event, + uint32_t cookie, int waitok) +{ + struct inotify_event *evp; + struct inotify_record *rec; + + rec = malloc(sizeof(*rec) + _IN_NAMESIZE(namelen), M_INOTIFY, + waitok | M_ZERO); + if (rec == NULL) + return (NULL); + evp = &rec->ev; + evp->wd = wd; + evp->mask = event; + evp->cookie = cookie; + evp->len = _IN_NAMESIZE(namelen); + if (name != NULL) + memcpy(evp->name, name, namelen); + return (rec); +} + +static bool +inotify_can_coalesce(struct inotify_softc *sc, struct inotify_event *evp) +{ + struct inotify_record *prev; + + mtx_assert(&sc->lock, MA_OWNED); + + prev = STAILQ_LAST(&sc->pending, inotify_record, link); + return (prev != NULL && prev->ev.mask == evp->mask && + prev->ev.wd == evp->wd && prev->ev.cookie == evp->cookie && + prev->ev.len == evp->len && + memcmp(prev->ev.name, evp->name, evp->len) == 0); +} + +static void +inotify_overflow_event(struct inotify_event *evp) +{ + evp->mask = IN_Q_OVERFLOW; + evp->wd = -1; + evp->cookie = 0; + evp->len = 0; +} + +/* + * Put an event record on the queue for an inotify descriptor. Return false if + * the record was not enqueued for some reason, true otherwise. + */
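inotify_queue_record() below never blocks the event producer: allocation failures and the vfs.inotify.max_queued_events limit are converted into a single IN_Q_OVERFLOW record. A consumer therefore has to treat that mask bit as "events were lost"; a hedged sketch of the consumer side, where resync_watches() stands for a hypothetical application-defined rescan:

    #include <sys/inotify.h>    /* assumed userspace header */
    #include <unistd.h>

    extern void resync_watches(void);   /* hypothetical rescan of watched dirs */

    static void
    drain_events(int ifd)
    {
            char buf[16 * 1024];
            ssize_t n;

            while ((n = read(ifd, buf, sizeof(buf))) > 0) {
                    for (char *p = buf; p < buf + n;) {
                            const struct inotify_event *ev =
                                (const struct inotify_event *)p;
                            if ((ev->mask & IN_Q_OVERFLOW) != 0)
                                    resync_watches();   /* records dropped */
                            p += sizeof(*ev) + ev->len;
                    }
            }
    }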
+static bool +inotify_queue_record(struct inotify_softc *sc, struct inotify_record *rec) +{ + struct inotify_event *evp; + + mtx_assert(&sc->lock, MA_OWNED); + + evp = &rec->ev; + if (__predict_false(rec == &sc->overflow)) { + /* + * Is the overflow record already in the queue? If so, there's + * not much else we can do: we're here because a kernel memory + * shortage prevented new record allocations. + */ + counter_u64_add(inotify_event_drops, 1); + if (evp->mask == IN_Q_OVERFLOW) + return (false); + inotify_overflow_event(evp); + } else { + /* Try to coalesce duplicate events. */ + if (inotify_coalesce && inotify_can_coalesce(sc, evp)) + return (false); + + /* + * Would this one overflow the queue? If so, convert it to an + * overflow event and try again to coalesce. + */ + if (sc->npending >= inotify_max_queued_events) { + counter_u64_add(inotify_event_drops, 1); + inotify_overflow_event(evp); + if (inotify_can_coalesce(sc, evp)) + return (false); + } + } + inotify_enqueue(sc, rec, false); + selwakeup(&sc->sel); + KNOTE_LOCKED(&sc->sel.si_note, 0); + wakeup(&sc->pending); + return (true); +} + +static int +inotify_log_one(struct inotify_watch *watch, const char *name, size_t namelen, + int event, uint32_t cookie) +{ + struct inotify_watch key; + struct inotify_softc *sc; + struct inotify_record *rec; + int relecount; + bool allocfail; + + relecount = 0; + + sc = watch->sc; + rec = inotify_alloc_record(watch->wd, name, namelen, event, cookie, + M_NOWAIT); + if (rec == NULL) { + rec = &sc->overflow; + allocfail = true; + } else { + allocfail = false; + } + + mtx_lock(&sc->lock); + if (!inotify_queue_record(sc, rec) && rec != &sc->overflow) + free(rec, M_INOTIFY); + if ((watch->mask & IN_ONESHOT) != 0 || + (event & (IN_DELETE_SELF | IN_UNMOUNT)) != 0) { + if (!allocfail) { + rec = inotify_alloc_record(watch->wd, NULL, 0, + IN_IGNORED, 0, M_NOWAIT); + if (rec == NULL) + rec = &sc->overflow; + if (!inotify_queue_record(sc, rec) && + rec != &sc->overflow) + free(rec, M_INOTIFY); + } + + /* + * Remove the watch, taking care to handle races with + * inotify_close(). + */ + key.wd = watch->wd; + if (RB_FIND(inotify_watch_tree, &sc->watches, &key) != NULL) { + RB_REMOVE(inotify_watch_tree, &sc->watches, watch); + inotify_unlink_watch_locked(sc, watch); + free(watch, M_INOTIFY); + + /* Defer vrele() until locks are dropped. */ + relecount++; + } + } + mtx_unlock(&sc->lock); + return (relecount); +} + +void +inotify_log(struct vnode *vp, const char *name, size_t namelen, int event, + uint32_t cookie) +{ + struct inotify_watch *watch, *tmp; + int relecount; + + KASSERT((event & ~(IN_ALL_EVENTS | IN_ISDIR | IN_UNMOUNT)) == 0, + ("inotify_log: invalid event %#x", event)); + + relecount = 0; + mtx_lock(&vp->v_pollinfo->vpi_lock); + TAILQ_FOREACH_SAFE(watch, &vp->v_pollinfo->vpi_inotify, vlink, tmp) { + KASSERT(watch->vp == vp, + ("inotify_log: watch %p vp != vp", watch)); + if ((watch->mask & event) != 0 || event == IN_UNMOUNT) { + relecount += inotify_log_one(watch, name, namelen, event, + cookie); + } + } + mtx_unlock(&vp->v_pollinfo->vpi_lock); + + for (int i = 0; i < relecount; i++) + vrele(vp); +} + +/* + * An inotify event occurred on a watched vnode. + */ +void +vn_inotify(struct vnode *vp, struct vnode *dvp, struct componentname *cnp, + int event, uint32_t cookie) +{ + int isdir; + + VNPASS(vp->v_holdcnt > 0, vp); + + isdir = vp->v_type == VDIR ? IN_ISDIR : 0; + + if (dvp != NULL) { + VNPASS(dvp->v_holdcnt > 0, dvp); + + /* + * Should we log an event for the vnode itself? 
+ */ + if ((vn_irflag_read(vp) & VIRF_INOTIFY) != 0) { + int selfevent; + + switch (event) { + case _IN_MOVE_DELETE: + case IN_DELETE: + /* + * IN_DELETE_SELF is only generated when the + * last hard link of a file is removed. + */ + selfevent = IN_DELETE_SELF; + if (vp->v_type != VDIR) { + struct vattr va; + int error; + + error = VOP_GETATTR(vp, &va, + cnp->cn_cred); + if (error == 0 && va.va_nlink != 0) + selfevent = 0; + } + break; + case IN_MOVED_FROM: + cookie = 0; + selfevent = IN_MOVE_SELF; + break; + case _IN_ATTRIB_LINKCOUNT: + selfevent = IN_ATTRIB; + break; + default: + selfevent = event; + break; + } + + if ((selfevent & ~_IN_DIR_EVENTS) != 0) { + inotify_log(vp, NULL, 0, selfevent | isdir, + cookie); + } + } + + /* + * Something is watching the directory through which this vnode + * was referenced, so we may need to log the event. + */ + if ((event & IN_ALL_EVENTS) != 0 && + (vn_irflag_read(dvp) & VIRF_INOTIFY) != 0) { + inotify_log(dvp, cnp->cn_nameptr, + cnp->cn_namelen, event | isdir, cookie); + } + } else { + /* + * We don't know which watched directory might contain the + * vnode, so we have to fall back to searching the name cache. + */ + cache_vop_inotify(vp, event, cookie); + } +} + +int +vn_inotify_add_watch(struct vnode *vp, struct inotify_softc *sc, uint32_t mask, + uint32_t *wdp, struct thread *td) +{ + struct inotify_watch *watch, *watch1; + uint32_t wd; + + /* + * If this is a directory, make sure all of its entries are present in + * the name cache so that we're able to look them up if an event occurs. + * The persistent reference on the directory prevents the outgoing name + * cache entries from being reclaimed. + */ + if (vp->v_type == VDIR) { + struct dirent *dp; + char *buf; + off_t off; + size_t buflen, len; + int eof, error; + + buflen = 128 * sizeof(struct dirent); + buf = malloc(buflen, M_TEMP, M_WAITOK); + + error = 0; + len = off = eof = 0; + for (;;) { + struct nameidata nd; + + error = vn_dir_next_dirent(vp, td, buf, buflen, &dp, + &len, &off, &eof); + if (error != 0) + break; + if (len == 0) + /* Finished reading. */ + break; + if (strcmp(dp->d_name, ".") == 0 || + strcmp(dp->d_name, "..") == 0) + continue; + + /* + * namei() consumes a reference on the starting + * directory if it's specified as a vnode. + */ + vrefact(vp); + VOP_UNLOCK(vp); + NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, + dp->d_name, vp); + error = namei(&nd); + vn_lock(vp, LK_SHARED | LK_RETRY); + if (error != 0) + break; + vn_irflag_set_cond(nd.ni_vp, VIRF_INOTIFY_PARENT); + vrele(nd.ni_vp); + } + free(buf, M_TEMP); + if (error != 0) + return (error); + } + + /* + * The vnode referenced in kern_inotify_add_watch() might be different + * than this one if nullfs is in the picture. + */ + vrefact(vp); + watch = malloc(sizeof(*watch), M_INOTIFY, M_WAITOK | M_ZERO); + watch->sc = sc; + watch->vp = vp; + watch->mask = mask; + + /* + * Are we updating an existing watch? Search the vnode's list rather + * than that of the softc, as the former is likely to be shorter. + */ + v_addpollinfo(vp); + mtx_lock(&vp->v_pollinfo->vpi_lock); + TAILQ_FOREACH(watch1, &vp->v_pollinfo->vpi_inotify, vlink) { + if (watch1->sc == sc) + break; + } + mtx_lock(&sc->lock); + if (watch1 != NULL) { + mtx_unlock(&vp->v_pollinfo->vpi_lock); + + /* + * We found an existing watch, update it based on our flags. 
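 *
 * (Editorial sketch, not part of this commit, in terms of the Linux-style
 * wrapper: given an existing watch on the same path,
 *
 *	inotify_add_watch(fd, path, IN_MODIFY | IN_MASK_CREATE)
 *		fails with EEXIST;
 *	inotify_add_watch(fd, path, IN_MODIFY | IN_MASK_ADD)
 *		ORs IN_MODIFY into the existing mask;
 *	inotify_add_watch(fd, path, IN_MODIFY)
 *		replaces the existing mask;
 *
 * and the latter two return the existing watch descriptor.)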
+ */ + if ((mask & IN_MASK_CREATE) != 0) { + mtx_unlock(&sc->lock); + vrele(vp); + free(watch, M_INOTIFY); + return (EEXIST); + } + if ((mask & IN_MASK_ADD) != 0) + watch1->mask |= mask; + else + watch1->mask = mask; + *wdp = watch1->wd; + mtx_unlock(&sc->lock); + vrele(vp); + free(watch, M_INOTIFY); + return (EJUSTRETURN); + } + + /* + * We're creating a new watch. Add it to the softc and vnode watch + * lists. + */ + do { + struct inotify_watch key; + + /* + * Search for the next available watch descriptor. This is + * implemented so as to avoid reusing watch descriptors for as + * long as possible. + */ + key.wd = wd = sc->nextwatch++; + watch1 = RB_FIND(inotify_watch_tree, &sc->watches, &key); + } while (watch1 != NULL || wd == 0); + watch->wd = wd; + RB_INSERT(inotify_watch_tree, &sc->watches, watch); + TAILQ_INSERT_TAIL(&vp->v_pollinfo->vpi_inotify, watch, vlink); + mtx_unlock(&sc->lock); + mtx_unlock(&vp->v_pollinfo->vpi_lock); + vn_irflag_set_cond(vp, VIRF_INOTIFY); + + *wdp = wd; + + return (0); +} + +void +vn_inotify_revoke(struct vnode *vp) +{ + if (vp->v_pollinfo == NULL) { + /* This is a nullfs vnode which shadows a watched vnode. */ + return; + } + inotify_log(vp, NULL, 0, IN_UNMOUNT, 0); +} + +static int +fget_inotify(struct thread *td, int fd, const cap_rights_t *needrightsp, + struct file **fpp) +{ + struct file *fp; + int error; + + error = fget(td, fd, needrightsp, &fp); + if (error != 0) + return (error); + if (fp->f_type != DTYPE_INOTIFY) { + fdrop(fp, td); + return (EINVAL); + } + *fpp = fp; + return (0); +} + +int +kern_inotify_add_watch(int fd, int dfd, const char *path, uint32_t mask, + struct thread *td) +{ + struct nameidata nd; + struct file *fp; + struct inotify_softc *sc; + struct vnode *vp; + uint32_t wd; + int count, error; + + fp = NULL; + vp = NULL; + + if ((mask & IN_ALL_EVENTS) == 0) + return (EXTERROR(EINVAL, "no events specified")); + if ((mask & (IN_MASK_ADD | IN_MASK_CREATE)) == + (IN_MASK_ADD | IN_MASK_CREATE)) + return (EXTERROR(EINVAL, + "IN_MASK_ADD and IN_MASK_CREATE are mutually exclusive")); + if ((mask & ~(IN_ALL_EVENTS | _IN_ALL_FLAGS | IN_UNMOUNT)) != 0) + return (EXTERROR(EINVAL, "unrecognized flag")); + + error = fget_inotify(td, fd, &cap_inotify_add_rights, &fp); + if (error != 0) + return (error); + sc = fp->f_data; + + NDINIT_AT(&nd, LOOKUP, + ((mask & IN_DONT_FOLLOW) ? NOFOLLOW : FOLLOW) | LOCKLEAF | + LOCKSHARED | AUDITVNODE1, UIO_USERSPACE, path, dfd); + error = namei(&nd); + if (error != 0) + goto out; + NDFREE_PNBUF(&nd); + vp = nd.ni_vp; + + error = VOP_ACCESS(vp, VREAD, td->td_ucred, td); + if (error != 0) + goto out; + + if ((mask & IN_ONLYDIR) != 0 && vp->v_type != VDIR) { + error = ENOTDIR; + goto out; + } + + count = atomic_fetchadd_int(&inotify_watches, 1); + if (count > inotify_max_watches) { + atomic_subtract_int(&inotify_watches, 1); + error = ENOSPC; + goto out; + } + if (!chginotifywatchcnt(sc->cred->cr_ruidinfo, 1, + inotify_max_user_watches)) { + atomic_subtract_int(&inotify_watches, 1); + error = ENOSPC; + goto out; + } + error = VOP_INOTIFY_ADD_WATCH(vp, sc, mask, &wd, td); + if (error != 0) { + atomic_subtract_int(&inotify_watches, 1); + (void)chginotifywatchcnt(sc->cred->cr_ruidinfo, -1, 0); + if (error == EJUSTRETURN) { + /* We updated an existing watch, everything is ok. 
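 * (Editorial note: EJUSTRETURN is how vn_inotify_add_watch() above reports
 * that an existing watch was updated rather than a new one created; it is
 * mapped to success here, after the global and per-user watch counts
 * charged in advance have been rolled back.)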
 */
+			error = 0;
+		} else {
+			goto out;
+		}
+	}
+	td->td_retval[0] = wd;
+
+out:
+	if (vp != NULL)
+		vput(vp);
+	fdrop(fp, td);
+	return (error);
+}
+
+int
+sys_inotify_add_watch_at(struct thread *td,
+    struct inotify_add_watch_at_args *uap)
+{
+	return (kern_inotify_add_watch(uap->fd, uap->dfd, uap->path,
+	    uap->mask, td));
+}
+
+int
+kern_inotify_rm_watch(int fd, uint32_t wd, struct thread *td)
+{
+	struct file *fp;
+	struct inotify_softc *sc;
+	struct inotify_record *rec;
+	struct inotify_watch key, *watch;
+	int error;
+
+	error = fget_inotify(td, fd, &cap_inotify_rm_rights, &fp);
+	if (error != 0)
+		return (error);
+	sc = fp->f_data;
+
+	rec = inotify_alloc_record(wd, NULL, 0, IN_IGNORED, 0, M_WAITOK);
+
+	/*
+	 * For compatibility with Linux, we do not remove pending events
+	 * associated with the watch.  Watch descriptors are implemented so as
+	 * to avoid being reused for as long as possible, so one hopes that any
+	 * pending events from the removed watch descriptor will be consumed
+	 * before the watch descriptor is recycled.
+	 */
+	key.wd = wd;
+	mtx_lock(&sc->lock);
+	watch = RB_FIND(inotify_watch_tree, &sc->watches, &key);
+	if (watch == NULL) {
+		free(rec, M_INOTIFY);
+		error = EINVAL;
+	} else {
+		RB_REMOVE(inotify_watch_tree, &sc->watches, watch);
+		if (!inotify_queue_record(sc, rec)) {
+			free(rec, M_INOTIFY);
+			error = 0;
+		}
+	}
+	mtx_unlock(&sc->lock);
+	if (watch != NULL)
+		inotify_remove_watch(watch);
+	fdrop(fp, td);
+	return (error);
+}
+
+int
+sys_inotify_rm_watch(struct thread *td, struct inotify_rm_watch_args *uap)
+{
+	return (kern_inotify_rm_watch(uap->fd, uap->wd, td));
+}
diff --git a/sys/kern/vfs_lookup.c b/sys/kern/vfs_lookup.c
index 86c7bdaa02c0..fb3e6a7a2534 100644
--- a/sys/kern/vfs_lookup.c
+++ b/sys/kern/vfs_lookup.c
@@ -75,14 +75,20 @@ static void NDVALIDATE_impl(struct nameidata *, int);
 #endif
 
 /*
+ * Reset ndp to its original state.
+ */
+#define	NDRESET(ndp)	do {						\
+	NDREINIT_DBG(ndp);						\
+	ndp->ni_resflags = 0;						\
+	ndp->ni_cnd.cn_flags &= ~NAMEI_INTERNAL_FLAGS;			\
+} while (0)
+/*
 * Prepare namei() to restart.  Reset components to their original state and
 * set the ISRESTARTED flag, which signals the underlying lookup code to change
 * the root from the ABI root to the actual root and prevents further restarts.
 */
 #define	NDRESTART(ndp)	do {						\
-	NDREINIT_DBG(ndp);						\
-	ndp->ni_resflags = 0;						\
-	ndp->ni_cnd.cn_flags &= ~NAMEI_INTERNAL_FLAGS;			\
+	NDRESET(ndp);							\
 	ndp->ni_cnd.cn_flags |= ISRESTARTED;				\
 } while (0)
 
@@ -162,8 +168,8 @@ static struct vop_vector crossmp_vnodeops = {
 */
 
 struct nameicap_tracker {
-	struct vnode *dp;
 	TAILQ_ENTRY(nameicap_tracker) nm_link;
+	struct mount *mp;
 };
 
 /* Zone for cap mode tracker elements used for dotdot capability checks.
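 * (Editorial note, not part of this commit: after this change each tracker
 * element pins a struct mount and holds its mnt_renamelock shared for the
 * remainder of the lookup, instead of holding the traversed directory
 * vnode; see nameicap_tracker_add() below.)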
 */
@@ -192,49 +198,75 @@ SYSCTL_INT(_vfs, OID_AUTO, lookup_cap_dotdot_nonlocal, CTLFLAG_RWTUN,
     "enables \"..\" components in path lookup in capability mode "
     "on non-local mount");
 
-static void
+static int
 nameicap_tracker_add(struct nameidata *ndp, struct vnode *dp)
 {
 	struct nameicap_tracker *nt;
+	struct mount *mp;
+	int error;
 
 	if ((ndp->ni_lcf & NI_LCF_CAP_DOTDOT) == 0 || dp->v_type != VDIR)
-		return;
+		return (0);
+	mp = NULL;
+	error = VOP_GETWRITEMOUNT(dp, &mp);
+	if (error != 0)
+		return (error);
 	nt = TAILQ_LAST(&ndp->ni_cap_tracker, nameicap_tracker_head);
-	if (nt != NULL && nt->dp == dp)
-		return;
+	if (nt != NULL && nt->mp == mp) {
+		vfs_rel(mp);
+		return (0);
+	}
 	nt = malloc(sizeof(*nt), M_NAMEITRACKER, M_WAITOK);
-	vhold(dp);
-	nt->dp = dp;
-	TAILQ_INSERT_TAIL(&ndp->ni_cap_tracker, nt, nm_link);
+	nt->mp = mp;
+	error = lockmgr(&mp->mnt_renamelock, LK_SHARED | LK_NOWAIT, 0);
+	if (error != 0) {
+		MPASS(ndp->ni_nctrack_mnt == NULL);
+		ndp->ni_nctrack_mnt = mp;
+		free(nt, M_NAMEITRACKER);
+		error = ERESTART;
+	} else {
+		TAILQ_INSERT_TAIL(&ndp->ni_cap_tracker, nt, nm_link);
+	}
+	return (error);
 }
 
 static void
-nameicap_cleanup_from(struct nameidata *ndp, struct nameicap_tracker *first)
+nameicap_cleanup(struct nameidata *ndp, int error)
 {
 	struct nameicap_tracker *nt, *nt1;
+	struct mount *mp;
+
+	KASSERT((ndp->ni_nctrack_mnt == NULL &&
+	    TAILQ_EMPTY(&ndp->ni_cap_tracker)) ||
+	    (ndp->ni_lcf & NI_LCF_CAP_DOTDOT) != 0,
+	    ("tracker active and not strictrelative"));
 
-	nt = first;
-	TAILQ_FOREACH_FROM_SAFE(nt, &ndp->ni_cap_tracker, nm_link, nt1) {
+	TAILQ_FOREACH_SAFE(nt, &ndp->ni_cap_tracker, nm_link, nt1) {
+		mp = nt->mp;
+		lockmgr(&mp->mnt_renamelock, LK_RELEASE, 0);
+		vfs_rel(mp);
 		TAILQ_REMOVE(&ndp->ni_cap_tracker, nt, nm_link);
-		vdrop(nt->dp);
 		free(nt, M_NAMEITRACKER);
 	}
-}
 
-static void
-nameicap_cleanup(struct nameidata *ndp)
-{
-	KASSERT(TAILQ_EMPTY(&ndp->ni_cap_tracker) ||
-	    (ndp->ni_lcf & NI_LCF_CAP_DOTDOT) != 0, ("not strictrelative"));
-	nameicap_cleanup_from(ndp, NULL);
+	mp = ndp->ni_nctrack_mnt;
+	if (mp != NULL) {
+		if (error == ERESTART) {
+			lockmgr(&mp->mnt_renamelock, LK_EXCLUSIVE, 0);
+			lockmgr(&mp->mnt_renamelock, LK_RELEASE, 0);
+		}
+		vfs_rel(mp);
+		ndp->ni_nctrack_mnt = NULL;
+	}
 }
 
 /*
- * For dotdot lookups in capability mode, only allow the component
- * lookup to succeed if the resulting directory was already traversed
- * during the operation.  This catches situations where already
- * traversed directory is moved to different parent, and then we walk
- * over it with dotdots.
+ * For dotdot lookups in capability mode, disallow walking over the
+ * directory ni_rbeneath_dpp that was used as the starting point of
+ * the lookup.  Since we take the mnt_renamelocks of all mounts we
+ * ever walked over during lookup, parallel renames are disabled.
+ * This prevents the situation where the walk over ni_rbeneath_dpp
+ * could be circumvented by following dotdots.
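 *
 * (Editorial sketch, not part of this commit: nameicap_tracker_add() and
 * nameicap_cleanup() above implement a conventional trylock-or-restart
 * protocol against concurrent renames.  In outline:
 *
 *	error = lockmgr(&mp->mnt_renamelock, LK_SHARED | LK_NOWAIT, 0);
 *	if (error != 0)
 *		return (ERESTART);	-- a rename holds the lock exclusive
 *
 * and on ERESTART the caller unwinds the entire lookup, waits for the
 * rename to finish by briefly taking the lock exclusive,
 *
 *	lockmgr(&mp->mnt_renamelock, LK_EXCLUSIVE, 0);
 *	lockmgr(&mp->mnt_renamelock, LK_RELEASE, 0);
 *
 * and then restarts namei() from scratch.)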
* * Also allow to force failure of dotdot lookups for non-local * filesystems, where external agents might assist local lookups to @@ -243,7 +275,6 @@ nameicap_cleanup(struct nameidata *ndp) static int nameicap_check_dotdot(struct nameidata *ndp, struct vnode *dp) { - struct nameicap_tracker *nt; struct mount *mp; if (dp == NULL || dp->v_type != VDIR || (ndp->ni_lcf & @@ -253,22 +284,16 @@ nameicap_check_dotdot(struct nameidata *ndp, struct vnode *dp) NI_LCF_CAP_DOTDOT_KTR)) == NI_LCF_STRICTREL_KTR)) NI_CAP_VIOLATION(ndp, ndp->ni_cnd.cn_pnbuf); if ((ndp->ni_lcf & NI_LCF_CAP_DOTDOT) == 0) - return (ENOTCAPABLE); + goto violation; + if (dp == ndp->ni_rbeneath_dpp) + goto violation; mp = dp->v_mount; if (lookup_cap_dotdot_nonlocal == 0 && mp != NULL && (mp->mnt_flag & MNT_LOCAL) == 0) - goto capfail; - TAILQ_FOREACH_REVERSE(nt, &ndp->ni_cap_tracker, nameicap_tracker_head, - nm_link) { - if (dp == nt->dp) { - nt = TAILQ_NEXT(nt, nm_link); - if (nt != NULL) - nameicap_cleanup_from(ndp, nt); - return (0); - } - } + goto violation; + return (0); -capfail: +violation: if (__predict_false((ndp->ni_lcf & NI_LCF_STRICTREL_KTR) != 0)) NI_CAP_VIOLATION(ndp, ndp->ni_cnd.cn_pnbuf); return (ENOTCAPABLE); @@ -394,6 +419,8 @@ namei_setup(struct nameidata *ndp, struct vnode **dpp, struct pwd **pwdp) NI_LCF_CAP_DOTDOT; } } + if (error == 0 && (ndp->ni_lcf & NI_LCF_STRICTREL) != 0) + ndp->ni_rbeneath_dpp = *dpp; /* * If we are auditing the kernel pathname, save the user pathname. @@ -631,6 +658,7 @@ restart: error = namei_getpath(ndp); if (__predict_false(error != 0)) { namei_cleanup_cnp(cnp); + nameicap_cleanup(ndp, error); SDT_PROBE4(vfs, namei, lookup, return, error, NULL, false, ndp); return (error); @@ -661,12 +689,12 @@ restart: else if (__predict_false(pwd->pwd_adir != pwd->pwd_rdir && (cnp->cn_flags & ISRESTARTED) == 0)) { namei_cleanup_cnp(cnp); + nameicap_cleanup(ndp, ERESTART); NDRESTART(ndp); goto restart; } return (error); case CACHE_FPL_STATUS_PARTIAL: - TAILQ_INIT(&ndp->ni_cap_tracker); dp = ndp->ni_startdir; break; case CACHE_FPL_STATUS_DESTROYED: @@ -674,18 +702,21 @@ restart: error = namei_getpath(ndp); if (__predict_false(error != 0)) { namei_cleanup_cnp(cnp); + nameicap_cleanup(ndp, error); return (error); } cnp->cn_nameptr = cnp->cn_pnbuf; /* FALLTHROUGH */ case CACHE_FPL_STATUS_ABORTED: - TAILQ_INIT(&ndp->ni_cap_tracker); MPASS(ndp->ni_lcf == 0); if (*cnp->cn_pnbuf == '\0') { if ((cnp->cn_flags & EMPTYPATH) != 0) { - return (namei_emptypath(ndp)); + error = namei_emptypath(ndp); + nameicap_cleanup(ndp, error); + return (error); } namei_cleanup_cnp(cnp); + nameicap_cleanup(ndp, ENOENT); SDT_PROBE4(vfs, namei, lookup, return, ENOENT, NULL, false, ndp); return (ENOENT); @@ -693,6 +724,7 @@ restart: error = namei_setup(ndp, &dp, &pwd); if (error != 0) { namei_cleanup_cnp(cnp); + nameicap_cleanup(ndp, error); return (error); } break; @@ -705,16 +737,23 @@ restart: ndp->ni_startdir = dp; error = vfs_lookup(ndp); if (error != 0) { - if (__predict_false(pwd->pwd_adir != pwd->pwd_rdir && - error == ENOENT && - (cnp->cn_flags & ISRESTARTED) == 0)) { - nameicap_cleanup(ndp); - pwd_drop(pwd); - namei_cleanup_cnp(cnp); - NDRESTART(ndp); - goto restart; - } else + uint64_t was_restarted; + bool abi_restart; + + was_restarted = ndp->ni_cnd.cn_flags & + ISRESTARTED; + abi_restart = pwd->pwd_adir != pwd->pwd_rdir && + error == ENOENT && was_restarted == 0; + if (error != ERESTART && !abi_restart) goto out; + nameicap_cleanup(ndp, error); + pwd_drop(pwd); + namei_cleanup_cnp(cnp); + NDRESET(ndp); + if 
(abi_restart) + was_restarted = ISRESTARTED; + ndp->ni_cnd.cn_flags |= was_restarted; + goto restart; } /* @@ -723,7 +762,7 @@ restart: if ((cnp->cn_flags & ISSYMLINK) == 0) { SDT_PROBE4(vfs, namei, lookup, return, error, ndp->ni_vp, false, ndp); - nameicap_cleanup(ndp); + nameicap_cleanup(ndp, 0); pwd_drop(pwd); NDVALIDATE(ndp); return (0); @@ -756,10 +795,10 @@ restart: ndp->ni_vp = NULL; vrele(ndp->ni_dvp); out: - MPASS(error != 0); + MPASS(error != 0 && error != ERESTART); SDT_PROBE4(vfs, namei, lookup, return, error, NULL, false, ndp); namei_cleanup_cnp(cnp); - nameicap_cleanup(ndp); + nameicap_cleanup(ndp, error); pwd_drop(pwd); return (error); } @@ -1185,7 +1224,9 @@ dirloop: } } - nameicap_tracker_add(ndp, dp); + error = nameicap_tracker_add(ndp, dp); + if (error != 0) + goto bad; /* * Make sure degenerate names don't get here, their handling was @@ -1210,9 +1251,7 @@ dirloop: * the jail or chroot, don't let them out. * 5. If doing a capability lookup and lookup_cap_dotdot is * enabled, return ENOTCAPABLE if the lookup would escape - * from the initial file descriptor directory. Checks are - * done by ensuring that namei() already traversed the - * result of dotdot lookup. + * from the initial file descriptor directory. */ if (cnp->cn_flags & ISDOTDOT) { if (__predict_false((ndp->ni_lcf & (NI_LCF_STRICTREL_KTR | @@ -1238,7 +1277,7 @@ dirloop: NI_CAP_VIOLATION(ndp, cnp->cn_pnbuf); if ((ndp->ni_lcf & NI_LCF_STRICTREL) != 0) { error = ENOTCAPABLE; - goto capdotdot; + goto bad; } } if (isroot || ((dp->v_vflag & VV_ROOT) != 0 && @@ -1261,11 +1300,6 @@ dirloop: vn_lock(dp, enforce_lkflags(dp->v_mount, cnp->cn_lkflags | LK_RETRY)); - error = nameicap_check_dotdot(ndp, dp); - if (error != 0) { -capdotdot: - goto bad; - } } } @@ -1314,7 +1348,9 @@ unionlookup: vn_lock(dp, enforce_lkflags(dp->v_mount, cnp->cn_lkflags | LK_RETRY)); - nameicap_tracker_add(ndp, dp); + error = nameicap_tracker_add(ndp, dp); + if (error != 0) + goto bad; goto unionlookup; } @@ -1415,7 +1451,7 @@ nextname: goto dirloop; } if (cnp->cn_flags & ISDOTDOT) { - error = nameicap_check_dotdot(ndp, ndp->ni_vp); + error = nameicap_check_dotdot(ndp, ndp->ni_dvp); if (error != 0) goto bad2; } @@ -1485,8 +1521,11 @@ success: } success_right_lock: if (ndp->ni_vp != NULL) { - if ((cnp->cn_flags & ISDOTDOT) == 0) - nameicap_tracker_add(ndp, ndp->ni_vp); + if ((cnp->cn_flags & ISDOTDOT) == 0) { + error = nameicap_tracker_add(ndp, ndp->ni_vp); + if (error != 0) + goto bad2; + } if ((cnp->cn_flags & (FAILIFEXISTS | ISSYMLINK)) == FAILIFEXISTS) return (vfs_lookup_failifexists(ndp)); } diff --git a/sys/kern/vfs_mount.c b/sys/kern/vfs_mount.c index cb18468d28bc..8e64a7fe966b 100644 --- a/sys/kern/vfs_mount.c +++ b/sys/kern/vfs_mount.c @@ -156,6 +156,7 @@ mount_init(void *mem, int size, int flags) mtx_init(&mp->mnt_mtx, "struct mount mtx", NULL, MTX_DEF); mtx_init(&mp->mnt_listmtx, "struct mount vlist mtx", NULL, MTX_DEF); lockinit(&mp->mnt_explock, PVFS, "explock", 0, 0); + lockinit(&mp->mnt_renamelock, PVFS, "rename", 0, 0); mp->mnt_pcpu = uma_zalloc_pcpu(pcpu_zone_16, M_WAITOK | M_ZERO); mp->mnt_ref = 0; mp->mnt_vfs_ops = 1; @@ -170,6 +171,7 @@ mount_fini(void *mem, int size) mp = (struct mount *)mem; uma_zfree_pcpu(pcpu_zone_16, mp->mnt_pcpu); + lockdestroy(&mp->mnt_renamelock); lockdestroy(&mp->mnt_explock); mtx_destroy(&mp->mnt_listmtx); mtx_destroy(&mp->mnt_mtx); diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c index dc2fb59fb81c..918b256e6c59 100644 --- a/sys/kern/vfs_subr.c +++ b/sys/kern/vfs_subr.c @@ -38,7 +38,6 @@ * 
External virtual filesystem routines */ -#include <sys/cdefs.h> #include "opt_ddb.h" #include "opt_watchdog.h" @@ -57,6 +56,7 @@ #include <sys/extattr.h> #include <sys/file.h> #include <sys/fcntl.h> +#include <sys/inotify.h> #include <sys/jail.h> #include <sys/kdb.h> #include <sys/kernel.h> @@ -5246,7 +5246,8 @@ destroy_vpollinfo_free(struct vpollinfo *vi) static void destroy_vpollinfo(struct vpollinfo *vi) { - + KASSERT(TAILQ_EMPTY(&vi->vpi_inotify), + ("%s: pollinfo %p has lingering watches", __func__, vi)); knlist_clear(&vi->vpi_selinfo.si_note, 1); seldrain(&vi->vpi_selinfo); destroy_vpollinfo_free(vi); @@ -5260,12 +5261,13 @@ v_addpollinfo(struct vnode *vp) { struct vpollinfo *vi; - if (vp->v_pollinfo != NULL) + if (atomic_load_ptr(&vp->v_pollinfo) != NULL) return; vi = malloc(sizeof(*vi), M_VNODEPOLL, M_WAITOK | M_ZERO); mtx_init(&vi->vpi_lock, "vnode pollinfo", NULL, MTX_DEF); knlist_init(&vi->vpi_selinfo.si_note, vp, vfs_knllock, vfs_knlunlock, vfs_knl_assert_lock); + TAILQ_INIT(&vi->vpi_inotify); VI_LOCK(vp); if (vp->v_pollinfo != NULL) { VI_UNLOCK(vp); @@ -5851,6 +5853,8 @@ vop_rename_pre(void *ap) struct vop_rename_args *a = ap; #ifdef DEBUG_VFS_LOCKS + struct mount *tmp; + if (a->a_tvp) ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME"); ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME"); @@ -5868,6 +5872,11 @@ vop_rename_pre(void *ap) if (a->a_tvp) ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked"); ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked"); + + tmp = NULL; + VOP_GETWRITEMOUNT(a->a_tdvp, &tmp); + lockmgr_assert(&tmp->mnt_renamelock, KA_XLOCKED); + vfs_rel(tmp); #endif /* * It may be tempting to add vn_seqc_write_begin/end calls here and @@ -6057,6 +6066,28 @@ vop_need_inactive_debugpost(void *ap, int rc) #endif void +vop_allocate_post(void *ap, int rc) +{ + struct vop_allocate_args *a; + + a = ap; + if (rc == 0) + INOTIFY(a->a_vp, IN_MODIFY); +} + +void +vop_copy_file_range_post(void *ap, int rc) +{ + struct vop_copy_file_range_args *a; + + a = ap; + if (rc == 0) { + INOTIFY(a->a_invp, IN_ACCESS); + INOTIFY(a->a_outvp, IN_MODIFY); + } +} + +void vop_create_pre(void *ap) { struct vop_create_args *a; @@ -6076,8 +6107,20 @@ vop_create_post(void *ap, int rc) a = ap; dvp = a->a_dvp; vn_seqc_write_end(dvp); - if (!rc) + if (!rc) { VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); + INOTIFY_NAME(*a->a_vpp, dvp, a->a_cnp, IN_CREATE); + } +} + +void +vop_deallocate_post(void *ap, int rc) +{ + struct vop_deallocate_args *a; + + a = ap; + if (rc == 0) + INOTIFY(a->a_vp, IN_MODIFY); } void @@ -6122,8 +6165,10 @@ vop_deleteextattr_post(void *ap, int rc) a = ap; vp = a->a_vp; vn_seqc_write_end(vp); - if (!rc) + if (!rc) { VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB); + INOTIFY(vp, IN_ATTRIB); + } } void @@ -6153,6 +6198,8 @@ vop_link_post(void *ap, int rc) if (!rc) { VFS_KNOTE_LOCKED(vp, NOTE_LINK); VFS_KNOTE_LOCKED(tdvp, NOTE_WRITE); + INOTIFY_NAME(vp, tdvp, a->a_cnp, _IN_ATTRIB_LINKCOUNT); + INOTIFY_NAME(vp, tdvp, a->a_cnp, IN_CREATE); } } @@ -6176,8 +6223,10 @@ vop_mkdir_post(void *ap, int rc) a = ap; dvp = a->a_dvp; vn_seqc_write_end(dvp); - if (!rc) + if (!rc) { VFS_KNOTE_LOCKED(dvp, NOTE_WRITE | NOTE_LINK); + INOTIFY_NAME(*a->a_vpp, dvp, a->a_cnp, IN_CREATE); + } } #ifdef DEBUG_VFS_LOCKS @@ -6212,8 +6261,10 @@ vop_mknod_post(void *ap, int rc) a = ap; dvp = a->a_dvp; vn_seqc_write_end(dvp); - if (!rc) + if (!rc) { VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); + INOTIFY_NAME(*a->a_vpp, dvp, a->a_cnp, IN_CREATE); + } } void @@ -6225,8 +6276,10 @@ vop_reclaim_post(void *ap, int rc) a = ap; vp = a->a_vp; 
ASSERT_VOP_IN_SEQC(vp); - if (!rc) + if (!rc) { VFS_KNOTE_LOCKED(vp, NOTE_REVOKE); + INOTIFY_REVOKE(vp); + } } void @@ -6257,6 +6310,8 @@ vop_remove_post(void *ap, int rc) if (!rc) { VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); VFS_KNOTE_LOCKED(vp, NOTE_DELETE); + INOTIFY_NAME(vp, dvp, a->a_cnp, _IN_ATTRIB_LINKCOUNT); + INOTIFY_NAME(vp, dvp, a->a_cnp, IN_DELETE); } } @@ -6288,6 +6343,8 @@ vop_rename_post(void *ap, int rc) VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME); if (a->a_tvp) VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE); + INOTIFY_MOVE(a->a_fvp, a->a_fdvp, a->a_fcnp, a->a_tvp, + a->a_tdvp, a->a_tcnp); } if (a->a_tdvp != a->a_fdvp) vdrop(a->a_fdvp); @@ -6327,6 +6384,7 @@ vop_rmdir_post(void *ap, int rc) vp->v_vflag |= VV_UNLINKED; VFS_KNOTE_LOCKED(dvp, NOTE_WRITE | NOTE_LINK); VFS_KNOTE_LOCKED(vp, NOTE_DELETE); + INOTIFY_NAME(vp, dvp, a->a_cnp, IN_DELETE); } } @@ -6350,8 +6408,10 @@ vop_setattr_post(void *ap, int rc) a = ap; vp = a->a_vp; vn_seqc_write_end(vp); - if (!rc) + if (!rc) { VFS_KNOTE_LOCKED(vp, NOTE_ATTRIB); + INOTIFY(vp, IN_ATTRIB); + } } void @@ -6396,8 +6456,10 @@ vop_setextattr_post(void *ap, int rc) a = ap; vp = a->a_vp; vn_seqc_write_end(vp); - if (!rc) + if (!rc) { VFS_KNOTE_LOCKED(vp, NOTE_ATTRIB); + INOTIFY(vp, IN_ATTRIB); + } } void @@ -6420,8 +6482,10 @@ vop_symlink_post(void *ap, int rc) a = ap; dvp = a->a_dvp; vn_seqc_write_end(dvp); - if (!rc) + if (!rc) { VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); + INOTIFY_NAME(*a->a_vpp, dvp, a->a_cnp, IN_CREATE); + } } void @@ -6429,8 +6493,10 @@ vop_open_post(void *ap, int rc) { struct vop_open_args *a = ap; - if (!rc) + if (!rc) { VFS_KNOTE_LOCKED(a->a_vp, NOTE_OPEN); + INOTIFY(a->a_vp, IN_OPEN); + } } void @@ -6442,6 +6508,8 @@ vop_close_post(void *ap, int rc) !VN_IS_DOOMED(a->a_vp))) { VFS_KNOTE_LOCKED(a->a_vp, (a->a_fflag & FWRITE) != 0 ? NOTE_CLOSE_WRITE : NOTE_CLOSE); + INOTIFY(a->a_vp, (a->a_fflag & FWRITE) != 0 ? 
+ IN_CLOSE_WRITE : IN_CLOSE_NOWRITE); } } @@ -6450,8 +6518,10 @@ vop_read_post(void *ap, int rc) { struct vop_read_args *a = ap; - if (!rc) + if (!rc) { VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ); + INOTIFY(a->a_vp, IN_ACCESS); + } } void @@ -6468,8 +6538,10 @@ vop_readdir_post(void *ap, int rc) { struct vop_readdir_args *a = ap; - if (!rc) + if (!rc) { VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ); + INOTIFY(a->a_vp, IN_ACCESS); + } } static struct knlist fs_knlist; diff --git a/sys/kern/vfs_syscalls.c b/sys/kern/vfs_syscalls.c index c236f241bf20..c71e0d9ee569 100644 --- a/sys/kern/vfs_syscalls.c +++ b/sys/kern/vfs_syscalls.c @@ -3766,7 +3766,7 @@ int kern_renameat(struct thread *td, int oldfd, const char *old, int newfd, const char *new, enum uio_seg pathseg) { - struct mount *mp = NULL; + struct mount *mp, *tmp; struct vnode *tvp, *fvp, *tdvp; struct nameidata fromnd, tond; uint64_t tondflags; @@ -3774,6 +3774,7 @@ kern_renameat(struct thread *td, int oldfd, const char *old, int newfd, short irflag; again: + tmp = mp = NULL; bwillwrite(); #ifdef MAC if (mac_vnode_check_rename_from_enabled()) { @@ -3809,6 +3810,7 @@ again: tvp = tond.ni_vp; error = vn_start_write(fvp, &mp, V_NOWAIT); if (error != 0) { +again1: NDFREE_PNBUF(&fromnd); NDFREE_PNBUF(&tond); if (tvp != NULL) @@ -3819,11 +3821,25 @@ again: vput(tdvp); vrele(fromnd.ni_dvp); vrele(fvp); + if (tmp != NULL) { + lockmgr(&tmp->mnt_renamelock, LK_EXCLUSIVE, NULL); + lockmgr(&tmp->mnt_renamelock, LK_RELEASE, NULL); + vfs_rel(tmp); + tmp = NULL; + } error = vn_start_write(NULL, &mp, V_XSLEEP | V_PCATCH); if (error != 0) return (error); goto again; } + error = VOP_GETWRITEMOUNT(tdvp, &tmp); + if (error != 0 || tmp == NULL) + goto again1; + error = lockmgr(&tmp->mnt_renamelock, LK_EXCLUSIVE | LK_NOWAIT, NULL); + if (error != 0) { + vn_finished_write(mp); + goto again1; + } irflag = vn_irflag_read(fvp); if (((irflag & VIRF_NAMEDATTR) != 0 && tdvp != fromnd.ni_dvp) || (irflag & VIRF_NAMEDDIR) != 0) { @@ -3884,6 +3900,8 @@ out: vrele(fromnd.ni_dvp); vrele(fvp); } + lockmgr(&tmp->mnt_renamelock, LK_RELEASE, 0); + vfs_rel(tmp); vn_finished_write(mp); out1: if (error == ERESTART) @@ -4296,10 +4314,6 @@ kern_getdirentries(struct thread *td, int fd, char *buf, size_t count, vp = fp->f_vnode; foffset = foffset_lock(fp, 0); unionread: - if (vp->v_type != VDIR) { - error = EINVAL; - goto fail; - } if (__predict_false((vp->v_vflag & VV_UNLINKED) != 0)) { error = ENOENT; goto fail; @@ -4312,6 +4326,19 @@ unionread: auio.uio_segflg = bufseg; auio.uio_td = td; vn_lock(vp, LK_SHARED | LK_RETRY); + /* + * We want to return ENOTDIR for anything that is not VDIR, but + * not for VBAD, and we can't check for VBAD while the vnode is + * unlocked. 
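 *
 * (Editorial note: the pre-change code made this check before locking the
 * vnode and returned EINVAL for any non-directory; it now runs under the
 * vnode lock, returning the POSIX ENOTDIR for ordinary non-directories
 * and EBADF for a revoked, i.e. VBAD, vnode.)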
+ */ + if (vp->v_type != VDIR) { + if (vp->v_type == VBAD) + error = EBADF; + else + error = ENOTDIR; + VOP_UNLOCK(vp); + goto fail; + } AUDIT_ARG_VNODE1(vp); loff = auio.uio_offset = foffset; #ifdef MAC diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c index 7487f93e4880..6451c9e07a60 100644 --- a/sys/kern/vfs_vnops.c +++ b/sys/kern/vfs_vnops.c @@ -52,6 +52,7 @@ #include <sys/fcntl.h> #include <sys/file.h> #include <sys/filio.h> +#include <sys/inotify.h> #include <sys/ktr.h> #include <sys/ktrace.h> #include <sys/limits.h> @@ -308,7 +309,8 @@ restart: NDREINIT(ndp); goto restart; } - if ((vn_open_flags & VN_OPEN_NAMECACHE) != 0) + if ((vn_open_flags & VN_OPEN_NAMECACHE) != 0 || + (vn_irflag_read(ndp->ni_dvp) & VIRF_INOTIFY) != 0) ndp->ni_cnd.cn_flags |= MAKEENTRY; #ifdef MAC error = mac_vnode_check_create(cred, ndp->ni_dvp, @@ -484,6 +486,7 @@ vn_open_vnode(struct vnode *vp, int fmode, struct ucred *cred, if (vp->v_type != VFIFO && vp->v_type != VSOCK && VOP_ACCESS(vp, VREAD, cred, td) == 0) fp->f_flag |= FKQALLOWED; + INOTIFY(vp, IN_OPEN); return (0); } @@ -1746,6 +1749,8 @@ vn_truncate_locked(struct vnode *vp, off_t length, bool sync, vattr.va_vaflags |= VA_SYNC; error = VOP_SETATTR(vp, &vattr, cred); VOP_ADD_WRITECOUNT_CHECKED(vp, -1); + if (error == 0) + INOTIFY(vp, IN_MODIFY); } return (error); } diff --git a/sys/kern/vnode_if.src b/sys/kern/vnode_if.src index a2b6a7c8ff9f..38138a4af921 100644 --- a/sys/kern/vnode_if.src +++ b/sys/kern/vnode_if.src @@ -702,6 +702,7 @@ vop_vptocnp { %% allocate vp E E E +%! allocate post vop_allocate_post vop_allocate { IN struct vnode *vp; @@ -786,6 +787,7 @@ vop_fdatasync { %% copy_file_range invp U U U %% copy_file_range outvp U U U +%! copy_file_range post vop_copy_file_range_post vop_copy_file_range { IN struct vnode *invp; @@ -810,6 +812,7 @@ vop_vput_pair { %% deallocate vp L L L +%! deallocate post vop_deallocate_post vop_deallocate { IN struct vnode *vp; @@ -821,6 +824,27 @@ vop_deallocate { }; +%% inotify vp - - - + +vop_inotify { + IN struct vnode *vp; + IN struct vnode *dvp; + IN struct componentname *cnp; + IN int event; + IN uint32_t cookie; +}; + + +%% inotify_add_watch vp L L L + +vop_inotify_add_watch { + IN struct vnode *vp; + IN struct inotify_softc *sc; + IN uint32_t mask; + OUT uint32_t *wdp; + IN struct thread *td; +}; + # The VOPs below are spares at the end of the table to allow new VOPs to be # added in stable branches without breaking the KBI. New VOPs in HEAD should # be added above these spares. When merging a new VOP to a stable branch, |