Diffstat (limited to 'sys/kern/vfs_inotify.c')
-rw-r--r--  sys/kern/vfs_inotify.c  1008
1 files changed, 1008 insertions, 0 deletions
diff --git a/sys/kern/vfs_inotify.c b/sys/kern/vfs_inotify.c
new file mode 100644
index 000000000000..9562350c897f
--- /dev/null
+++ b/sys/kern/vfs_inotify.c
@@ -0,0 +1,1008 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2025 Klara, Inc.
+ */
+
+#include "opt_ktrace.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/caprights.h>
+#include <sys/counter.h>
+#include <sys/dirent.h>
+#define EXTERR_CATEGORY EXTERR_CAT_INOTIFY
+#include <sys/exterrvar.h>
+#include <sys/fcntl.h>
+#include <sys/file.h>
+#include <sys/filio.h>
+#include <sys/inotify.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/ktrace.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/namei.h>
+#include <sys/poll.h>
+#include <sys/proc.h>
+#include <sys/queue.h>
+#include <sys/resourcevar.h>
+#include <sys/selinfo.h>
+#include <sys/stat.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysctl.h>
+#include <sys/sysent.h>
+#include <sys/syslimits.h>
+#include <sys/sysproto.h>
+#include <sys/tree.h>
+#include <sys/user.h>
+#include <sys/vnode.h>
+
+uint32_t inotify_rename_cookie;
+
+static SYSCTL_NODE(_vfs, OID_AUTO, inotify, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
+    "inotify configuration");
+
+static int inotify_max_queued_events = 16384;
+SYSCTL_INT(_vfs_inotify, OID_AUTO, max_queued_events, CTLFLAG_RWTUN,
+    &inotify_max_queued_events, 0,
+    "Maximum number of events to queue on an inotify descriptor");
+
+static int inotify_max_user_instances = 256;
+SYSCTL_INT(_vfs_inotify, OID_AUTO, max_user_instances, CTLFLAG_RWTUN,
+    &inotify_max_user_instances, 0,
+    "Maximum number of inotify descriptors per user");
+
+static int inotify_max_user_watches;
+SYSCTL_INT(_vfs_inotify, OID_AUTO, max_user_watches, CTLFLAG_RWTUN,
+    &inotify_max_user_watches, 0,
+    "Maximum number of inotify watches per user");
+
+static int inotify_max_watches;
+SYSCTL_INT(_vfs_inotify, OID_AUTO, max_watches, CTLFLAG_RWTUN,
+    &inotify_max_watches, 0,
+    "Maximum number of inotify watches system-wide");
+
+static int inotify_watches;
+SYSCTL_INT(_vfs_inotify, OID_AUTO, watches, CTLFLAG_RD,
+    &inotify_watches, 0,
+    "Total number of inotify watches currently in use");
+
+static int inotify_coalesce = 1;
+SYSCTL_INT(_vfs_inotify, OID_AUTO, coalesce, CTLFLAG_RWTUN,
+    &inotify_coalesce, 0,
+    "Coalesce inotify events when possible");
+
+static COUNTER_U64_DEFINE_EARLY(inotify_event_drops);
+SYSCTL_COUNTER_U64(_vfs_inotify, OID_AUTO, event_drops, CTLFLAG_RD,
+    &inotify_event_drops,
+    "Number of inotify events dropped due to limits or allocation failures");
+
+static fo_rdwr_t inotify_read;
+static fo_ioctl_t inotify_ioctl;
+static fo_poll_t inotify_poll;
+static fo_kqfilter_t inotify_kqfilter;
+static fo_stat_t inotify_stat;
+static fo_close_t inotify_close;
+static fo_fill_kinfo_t inotify_fill_kinfo;
+
+static const struct fileops inotifyfdops = {
+	.fo_read = inotify_read,
+	.fo_write = invfo_rdwr,
+	.fo_truncate = invfo_truncate,
+	.fo_ioctl = inotify_ioctl,
+	.fo_poll = inotify_poll,
+	.fo_kqfilter = inotify_kqfilter,
+	.fo_stat = inotify_stat,
+	.fo_close = inotify_close,
+	.fo_chmod = invfo_chmod,
+	.fo_chown = invfo_chown,
+	.fo_sendfile = invfo_sendfile,
+	.fo_fill_kinfo = inotify_fill_kinfo,
+	.fo_cmp = file_kcmp_generic,
+	.fo_flags = DFLAG_PASSABLE,
+};
+
+static void filt_inotifydetach(struct knote *kn);
+static int filt_inotifyevent(struct knote *kn, long hint);
+
+static const struct filterops inotify_rfiltops = {
+	.f_isfd = 1,
+	.f_detach = filt_inotifydetach,
+	.f_event = filt_inotifyevent,
+};
+
+static MALLOC_DEFINE(M_INOTIFY, "inotify", "inotify data structures");
+
+struct inotify_record {
+	STAILQ_ENTRY(inotify_record) link;
+	struct inotify_event ev;
+};
+
+static uint64_t inotify_ino = 1;
+
+/*
+ * On LP64 systems this occupies 64 bytes, so we don't get internal
+ * fragmentation by allocating watches with malloc(9). If the size changes,
+ * consider using a UMA zone to improve memory efficiency.
+ */
+struct inotify_watch {
+	struct inotify_softc *sc;	/* back-pointer */
+	int wd;				/* unique ID */
+	uint32_t mask;			/* event mask */
+	struct vnode *vp;		/* vnode being watched, refed */
+	RB_ENTRY(inotify_watch) ilink;	/* inotify linkage */
+	TAILQ_ENTRY(inotify_watch) vlink; /* vnode linkage */
+};
+
+static void
+inotify_init(void *arg __unused)
+{
+	/* Don't let a user hold too many vnodes. */
+	inotify_max_user_watches = desiredvnodes / 3;
+	/* Don't let the system hold too many vnodes. */
+	inotify_max_watches = desiredvnodes / 2;
+}
+SYSINIT(inotify, SI_SUB_VFS, SI_ORDER_ANY, inotify_init, NULL);
+
+static int
+inotify_watch_cmp(const struct inotify_watch *a,
+    const struct inotify_watch *b)
+{
+	if (a->wd < b->wd)
+		return (-1);
+	else if (a->wd > b->wd)
+		return (1);
+	else
+		return (0);
+}
+RB_HEAD(inotify_watch_tree, inotify_watch);
+RB_GENERATE_STATIC(inotify_watch_tree, inotify_watch, ilink, inotify_watch_cmp);
+
+struct inotify_softc {
+	struct mtx lock;		/* serialize all softc writes */
+	STAILQ_HEAD(, inotify_record) pending; /* events waiting to be read */
+	struct inotify_record overflow;	/* preallocated record */
+	int nextwatch;			/* next watch ID to try */
+	int npending;			/* number of pending events */
+	size_t nbpending;		/* bytes available to read */
+	uint64_t ino;			/* unique identifier */
+	struct inotify_watch_tree watches; /* active watches */
+	struct selinfo sel;		/* select/poll/kevent info */
+	struct ucred *cred;		/* credential ref */
+};
+
+static struct inotify_record *
+inotify_dequeue(struct inotify_softc *sc)
+{
+	struct inotify_record *rec;
+
+	mtx_assert(&sc->lock, MA_OWNED);
+	KASSERT(!STAILQ_EMPTY(&sc->pending),
+	    ("%s: queue for %p is empty", __func__, sc));
+
+	rec = STAILQ_FIRST(&sc->pending);
+	STAILQ_REMOVE_HEAD(&sc->pending, link);
+	sc->npending--;
+	sc->nbpending -= sizeof(rec->ev) + rec->ev.len;
+	return (rec);
+}
+
+static void
+inotify_enqueue(struct inotify_softc *sc, struct inotify_record *rec, bool head)
+{
+	mtx_assert(&sc->lock, MA_OWNED);
+
+	if (head)
+		STAILQ_INSERT_HEAD(&sc->pending, rec, link);
+	else
+		STAILQ_INSERT_TAIL(&sc->pending, rec, link);
+	sc->npending++;
+	sc->nbpending += sizeof(rec->ev) + rec->ev.len;
+}
+
+static int
+inotify_read(struct file *fp, struct uio *uio, struct ucred *cred, int flags,
+    struct thread *td)
+{
+	struct inotify_softc *sc;
+	struct inotify_record *rec;
+	int error;
+	bool first;
+
+	sc = fp->f_data;
+	error = 0;
+
+	mtx_lock(&sc->lock);
+	while (STAILQ_EMPTY(&sc->pending)) {
+		if ((flags & IO_NDELAY) != 0 || (fp->f_flag & FNONBLOCK) != 0) {
+			mtx_unlock(&sc->lock);
+			return (EWOULDBLOCK);
+		}
+		error = msleep(&sc->pending, &sc->lock, PCATCH, "inotify", 0);
+		if (error != 0) {
+			mtx_unlock(&sc->lock);
+			return (error);
+		}
+	}
+	for (first = true; !STAILQ_EMPTY(&sc->pending); first = false) {
+		size_t len;
+
+		rec = inotify_dequeue(sc);
+		len = sizeof(rec->ev) + rec->ev.len;
+		if (uio->uio_resid < (ssize_t)len) {
+			inotify_enqueue(sc, rec, true);
+			if (first) {
+				error = EXTERROR(EINVAL,
+				    "read buffer is too small");
+			}
+			break;
+		}
+		mtx_unlock(&sc->lock);
+		error = uiomove(&rec->ev, len, uio);
+#ifdef KTRACE
+		if (error == 0 && KTRPOINT(td, KTR_STRUCT))
+			ktrstruct("inotify", &rec->ev, len);
+#endif
+		mtx_lock(&sc->lock);
+		if (error != 0) {
+			inotify_enqueue(sc, rec, true);
+			mtx_unlock(&sc->lock);
+			return (error);
+		}
+		if (rec == &sc->overflow) {
+			/*
+			 * Signal to inotify_queue_record() that the overflow
+			 * record can be reused.
+			 */
+			memset(rec, 0, sizeof(*rec));
+		} else {
+			free(rec, M_INOTIFY);
+		}
+	}
+	mtx_unlock(&sc->lock);
+	return (error);
+}
+
+static int
+inotify_ioctl(struct file *fp, u_long com, void *data, struct ucred *cred,
+    struct thread *td)
+{
+	struct inotify_softc *sc;
+
+	sc = fp->f_data;
+
+	switch (com) {
+	case FIONREAD:
+		*(int *)data = (int)sc->nbpending;
+		return (0);
+	case FIONBIO:
+	case FIOASYNC:
+		return (0);
+	default:
+		return (ENOTTY);
+	}
+
+	return (0);
+}
+
+static int
+inotify_poll(struct file *fp, int events, struct ucred *cred, struct thread *td)
+{
+	struct inotify_softc *sc;
+	int revents;
+
+	sc = fp->f_data;
+	revents = 0;
+
+	mtx_lock(&sc->lock);
+	if ((events & (POLLIN | POLLRDNORM)) != 0 && sc->npending > 0)
+		revents |= events & (POLLIN | POLLRDNORM);
+	else
+		selrecord(td, &sc->sel);
+	mtx_unlock(&sc->lock);
+	return (revents);
+}
+
+static void
+filt_inotifydetach(struct knote *kn)
+{
+	struct inotify_softc *sc;
+
+	sc = kn->kn_hook;
+	knlist_remove(&sc->sel.si_note, kn, 0);
+}
+
+static int
+filt_inotifyevent(struct knote *kn, long hint)
+{
+	struct inotify_softc *sc;
+
+	sc = kn->kn_hook;
+	mtx_assert(&sc->lock, MA_OWNED);
+	kn->kn_data = sc->nbpending;
+	return (kn->kn_data > 0);
+}
+
+static int
+inotify_kqfilter(struct file *fp, struct knote *kn)
+{
+	struct inotify_softc *sc;
+
+	if (kn->kn_filter != EVFILT_READ)
+		return (EINVAL);
+	sc = fp->f_data;
+	kn->kn_fop = &inotify_rfiltops;
+	kn->kn_hook = sc;
+	knlist_add(&sc->sel.si_note, kn, 0);
+	return (0);
+}
+
+static int
+inotify_stat(struct file *fp, struct stat *sb, struct ucred *cred)
+{
+	struct inotify_softc *sc;
+
+	sc = fp->f_data;
+
+	memset(sb, 0, sizeof(*sb));
+	sb->st_mode = S_IFREG | S_IRUSR;
+	sb->st_blksize = sizeof(struct inotify_event) + _IN_NAMESIZE(NAME_MAX);
+	mtx_lock(&sc->lock);
+	sb->st_size = sc->nbpending;
+	sb->st_blocks = sc->npending;
+	sb->st_uid = sc->cred->cr_ruid;
+	sb->st_gid = sc->cred->cr_rgid;
+	sb->st_ino = sc->ino;
+	mtx_unlock(&sc->lock);
+	return (0);
+}
+
+static void
+inotify_unlink_watch_locked(struct inotify_softc *sc, struct inotify_watch *watch)
+{
+	struct vnode *vp;
+
+	vp = watch->vp;
+	mtx_assert(&vp->v_pollinfo->vpi_lock, MA_OWNED);
+
+	atomic_subtract_int(&inotify_watches, 1);
+	(void)chginotifywatchcnt(sc->cred->cr_ruidinfo, -1, 0);
+
+	TAILQ_REMOVE(&vp->v_pollinfo->vpi_inotify, watch, vlink);
+	if (TAILQ_EMPTY(&vp->v_pollinfo->vpi_inotify))
+		vn_irflag_unset_locked(vp, VIRF_INOTIFY);
+}
+
+/*
+ * Assumes that the watch has already been removed from its softc.
+ */
+static void
+inotify_remove_watch(struct inotify_watch *watch)
+{
+	struct inotify_softc *sc;
+	struct vnode *vp;
+
+	sc = watch->sc;
+
+	vp = watch->vp;
+	mtx_lock(&vp->v_pollinfo->vpi_lock);
+	inotify_unlink_watch_locked(sc, watch);
+	mtx_unlock(&vp->v_pollinfo->vpi_lock);
+
+	vrele(vp);
+	free(watch, M_INOTIFY);
+}
+
+static int
+inotify_close(struct file *fp, struct thread *td)
+{
+	struct inotify_softc *sc;
+	struct inotify_record *rec;
+	struct inotify_watch *watch;
+
+	sc = fp->f_data;
+
+	mtx_lock(&sc->lock);
+	(void)chginotifycnt(sc->cred->cr_ruidinfo, -1, 0);
+	while ((watch = RB_MIN(inotify_watch_tree, &sc->watches)) != NULL) {
+		RB_REMOVE(inotify_watch_tree, &sc->watches, watch);
+		mtx_unlock(&sc->lock);
+		inotify_remove_watch(watch);
+		mtx_lock(&sc->lock);
+	}
+	while (!STAILQ_EMPTY(&sc->pending)) {
+		rec = inotify_dequeue(sc);
+		if (rec != &sc->overflow)
+			free(rec, M_INOTIFY);
+	}
+	mtx_unlock(&sc->lock);
+	seldrain(&sc->sel);
+	knlist_destroy(&sc->sel.si_note);
+	mtx_destroy(&sc->lock);
+	crfree(sc->cred);
+	free(sc, M_INOTIFY);
+	return (0);
+}
+
+static int
+inotify_fill_kinfo(struct file *fp, struct kinfo_file *kif,
+    struct filedesc *fdp)
+{
+	struct inotify_softc *sc;
+
+	sc = fp->f_data;
+
+	kif->kf_type = KF_TYPE_INOTIFY;
+	kif->kf_un.kf_inotify.kf_inotify_npending = sc->npending;
+	kif->kf_un.kf_inotify.kf_inotify_nbpending = sc->nbpending;
+	return (0);
+}
+
+int
+inotify_create_file(struct thread *td, struct file *fp, int flags, int *fflagsp)
+{
+	struct inotify_softc *sc;
+	int fflags;
+
+	if ((flags & ~(IN_NONBLOCK | IN_CLOEXEC)) != 0)
+		return (EINVAL);
+
+	if (!chginotifycnt(td->td_ucred->cr_ruidinfo, 1,
+	    inotify_max_user_instances))
+		return (EMFILE);
+
+	sc = malloc(sizeof(*sc), M_INOTIFY, M_WAITOK | M_ZERO);
+	sc->nextwatch = 1; /* Required for compatibility. */
+	STAILQ_INIT(&sc->pending);
+	RB_INIT(&sc->watches);
+	mtx_init(&sc->lock, "inotify", NULL, MTX_DEF);
+	knlist_init_mtx(&sc->sel.si_note, &sc->lock);
+	sc->cred = crhold(td->td_ucred);
+	sc->ino = atomic_fetchadd_64(&inotify_ino, 1);
+
+	fflags = FREAD;
+	if ((flags & IN_NONBLOCK) != 0)
+		fflags |= FNONBLOCK;
+	if ((flags & IN_CLOEXEC) != 0)
+		*fflagsp |= O_CLOEXEC;
+	finit(fp, fflags, DTYPE_INOTIFY, sc, &inotifyfdops);
+
+	return (0);
+}
+
+static struct inotify_record *
+inotify_alloc_record(uint32_t wd, const char *name, size_t namelen, int event,
+    uint32_t cookie, int waitok)
+{
+	struct inotify_event *evp;
+	struct inotify_record *rec;
+
+	rec = malloc(sizeof(*rec) + _IN_NAMESIZE(namelen), M_INOTIFY,
+	    waitok | M_ZERO);
+	if (rec == NULL)
+		return (NULL);
+	evp = &rec->ev;
+	evp->wd = wd;
+	evp->mask = event;
+	evp->cookie = cookie;
+	evp->len = _IN_NAMESIZE(namelen);
+	if (name != NULL)
+		memcpy(evp->name, name, namelen);
+	return (rec);
+}
+
+static bool
+inotify_can_coalesce(struct inotify_softc *sc, struct inotify_event *evp)
+{
+	struct inotify_record *prev;
+
+	mtx_assert(&sc->lock, MA_OWNED);
+
+	prev = STAILQ_LAST(&sc->pending, inotify_record, link);
+	return (prev != NULL && prev->ev.mask == evp->mask &&
+	    prev->ev.wd == evp->wd && prev->ev.cookie == evp->cookie &&
+	    prev->ev.len == evp->len &&
+	    (evp->len == 0 || strcmp(prev->ev.name, evp->name) == 0));
+}
+
+static void
+inotify_overflow_event(struct inotify_event *evp)
+{
+	evp->mask = IN_Q_OVERFLOW;
+	evp->wd = -1;
+	evp->cookie = 0;
+	evp->len = 0;
+}
+
+/*
+ * Put an event record on the queue for an inotify descriptor. Return false if
+ * the record was not enqueued for some reason, true otherwise.
+ */
+static bool
+inotify_queue_record(struct inotify_softc *sc, struct inotify_record *rec)
+{
+	struct inotify_event *evp;
+
+	mtx_assert(&sc->lock, MA_OWNED);
+
+	evp = &rec->ev;
+	if (__predict_false(rec == &sc->overflow)) {
+		/*
+		 * Is the overflow record already in the queue? If so, there's
+		 * not much else we can do: we're here because a kernel memory
+		 * shortage prevented new record allocations.
+		 */
+		counter_u64_add(inotify_event_drops, 1);
+		if (evp->mask == IN_Q_OVERFLOW)
+			return (false);
+		inotify_overflow_event(evp);
+	} else {
+		/* Try to coalesce duplicate events. */
+		if (inotify_coalesce && inotify_can_coalesce(sc, evp))
+			return (false);
+
+		/*
+		 * Would this one overflow the queue? If so, convert it to an
+		 * overflow event and try again to coalesce.
+		 */
+		if (sc->npending >= inotify_max_queued_events) {
+			counter_u64_add(inotify_event_drops, 1);
+			inotify_overflow_event(evp);
+			if (inotify_can_coalesce(sc, evp))
+				return (false);
+		}
+	}
+	inotify_enqueue(sc, rec, false);
+	selwakeup(&sc->sel);
+	KNOTE_LOCKED(&sc->sel.si_note, 0);
+	wakeup(&sc->pending);
+	return (true);
+}
+
+static int
+inotify_log_one(struct inotify_watch *watch, const char *name, size_t namelen,
+    int event, uint32_t cookie)
+{
+	struct inotify_watch key;
+	struct inotify_softc *sc;
+	struct inotify_record *rec;
+	int relecount;
+	bool allocfail;
+
+	relecount = 0;
+
+	sc = watch->sc;
+	rec = inotify_alloc_record(watch->wd, name, namelen, event, cookie,
+	    M_NOWAIT);
+	if (rec == NULL) {
+		rec = &sc->overflow;
+		allocfail = true;
+	} else {
+		allocfail = false;
+	}
+
+	mtx_lock(&sc->lock);
+	if (!inotify_queue_record(sc, rec) && rec != &sc->overflow)
+		free(rec, M_INOTIFY);
+	if ((watch->mask & IN_ONESHOT) != 0 ||
+	    (event & (IN_DELETE_SELF | IN_UNMOUNT)) != 0) {
+		if (!allocfail) {
+			rec = inotify_alloc_record(watch->wd, NULL, 0,
+			    IN_IGNORED, 0, M_NOWAIT);
+			if (rec == NULL)
+				rec = &sc->overflow;
+			if (!inotify_queue_record(sc, rec) &&
+			    rec != &sc->overflow)
+				free(rec, M_INOTIFY);
+		}
+
+		/*
+		 * Remove the watch, taking care to handle races with
+		 * inotify_close().
+		 */
+		key.wd = watch->wd;
+		if (RB_FIND(inotify_watch_tree, &sc->watches, &key) != NULL) {
+			RB_REMOVE(inotify_watch_tree, &sc->watches, watch);
+			inotify_unlink_watch_locked(sc, watch);
+			free(watch, M_INOTIFY);
+
+			/* Defer vrele() until locks are dropped. */
+			relecount++;
+		}
+	}
+	mtx_unlock(&sc->lock);
+	return (relecount);
+}
+
+void
+inotify_log(struct vnode *vp, const char *name, size_t namelen, int event,
+    uint32_t cookie)
+{
+	struct inotify_watch *watch, *tmp;
+	int relecount;
+
+	KASSERT((event & ~(IN_ALL_EVENTS | IN_ISDIR | IN_UNMOUNT)) == 0,
+	    ("inotify_log: invalid event %#x", event));
+
+	relecount = 0;
+	mtx_lock(&vp->v_pollinfo->vpi_lock);
+	TAILQ_FOREACH_SAFE(watch, &vp->v_pollinfo->vpi_inotify, vlink, tmp) {
+		KASSERT(watch->vp == vp,
+		    ("inotify_log: watch %p vp != vp", watch));
+		if ((watch->mask & event) != 0 || event == IN_UNMOUNT) {
+			relecount += inotify_log_one(watch, name, namelen, event,
+			    cookie);
+		}
+	}
+	mtx_unlock(&vp->v_pollinfo->vpi_lock);
+
+	for (int i = 0; i < relecount; i++)
+		vrele(vp);
+}
+
+/*
+ * An inotify event occurred on a watched vnode.
+ */
+void
+vn_inotify(struct vnode *vp, struct vnode *dvp, struct componentname *cnp,
+    int event, uint32_t cookie)
+{
+	int isdir;
+
+	VNPASS(vp->v_holdcnt > 0, vp);
+
+	isdir = vp->v_type == VDIR ? IN_ISDIR : 0;
+
+	if (dvp != NULL) {
+		VNPASS(dvp->v_holdcnt > 0, dvp);
+
+		/*
+		 * Should we log an event for the vnode itself?
+		 */
+		if ((vn_irflag_read(vp) & VIRF_INOTIFY) != 0) {
+			int selfevent;
+
+			switch (event) {
+			case _IN_MOVE_DELETE:
+			case IN_DELETE:
+				/*
+				 * IN_DELETE_SELF is only generated when the
+				 * last hard link of a file is removed.
+				 */
+				selfevent = IN_DELETE_SELF;
+				if (vp->v_type != VDIR) {
+					struct vattr va;
+					int error;
+
+					error = VOP_GETATTR(vp, &va, cnp->cn_cred);
+					if (error == 0 && va.va_nlink != 0)
+						selfevent = 0;
+				}
+				break;
+			case IN_MOVED_FROM:
+				cookie = 0;
+				selfevent = IN_MOVE_SELF;
+				break;
+			case _IN_ATTRIB_LINKCOUNT:
+				selfevent = IN_ATTRIB;
+				break;
+			default:
+				selfevent = event;
+				break;
+			}
+
+			if ((selfevent & ~_IN_DIR_EVENTS) != 0) {
+				inotify_log(vp, NULL, 0, selfevent | isdir,
+				    cookie);
+			}
+		}
+
+		/*
+		 * Something is watching the directory through which this vnode
+		 * was referenced, so we may need to log the event.
+		 */
+		if ((event & IN_ALL_EVENTS) != 0 &&
+		    (vn_irflag_read(dvp) & VIRF_INOTIFY) != 0) {
+			inotify_log(dvp, cnp->cn_nameptr,
+			    cnp->cn_namelen, event | isdir, cookie);
+		}
+	} else {
+		/*
+		 * We don't know which watched directory might contain the
+		 * vnode, so we have to fall back to searching the name cache.
+		 */
+		cache_vop_inotify(vp, event, cookie);
+	}
+}
+
+int
+vn_inotify_add_watch(struct vnode *vp, struct inotify_softc *sc, uint32_t mask,
+    uint32_t *wdp, struct thread *td)
+{
+	struct inotify_watch *watch, *watch1;
+	uint32_t wd;
+
+	/*
+	 * If this is a directory, make sure all of its entries are present in
+	 * the name cache so that we're able to look them up if an event occurs.
+	 * The persistent reference on the directory prevents the outgoing name
+	 * cache entries from being reclaimed.
+	 */
+	if (vp->v_type == VDIR) {
+		struct dirent *dp;
+		char *buf;
+		off_t off;
+		size_t buflen, len;
+		int eof, error;
+
+		buflen = 128 * sizeof(struct dirent);
+		buf = malloc(buflen, M_TEMP, M_WAITOK);
+
+		error = 0;
+		len = off = eof = 0;
+		for (;;) {
+			struct nameidata nd;
+
+			error = vn_dir_next_dirent(vp, td, buf, buflen, &dp,
+			    &len, &off, &eof);
+			if (error != 0)
+				break;
+			if (len == 0)
+				/* Finished reading. */
+				break;
+			if (strcmp(dp->d_name, ".") == 0 ||
+			    strcmp(dp->d_name, "..") == 0)
+				continue;
+
+			/*
+			 * namei() consumes a reference on the starting
+			 * directory if it's specified as a vnode.
+			 */
+			vrefact(vp);
+			NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE,
+			    dp->d_name, vp);
+			error = namei(&nd);
+			if (error != 0)
+				break;
+			vn_irflag_set_cond(nd.ni_vp, VIRF_INOTIFY_PARENT);
+			vrele(nd.ni_vp);
+		}
+		free(buf, M_TEMP);
+		if (error != 0)
+			return (error);
+	}
+
+	/*
+	 * The vnode referenced in kern_inotify_add_watch() might be different
+	 * than this one if nullfs is in the picture.
+	 */
+	vrefact(vp);
+	watch = malloc(sizeof(*watch), M_INOTIFY, M_WAITOK | M_ZERO);
+	watch->sc = sc;
+	watch->vp = vp;
+	watch->mask = mask;
+
+	/*
+	 * Are we updating an existing watch? Search the vnode's list rather
+	 * than that of the softc, as the former is likely to be shorter.
+	 */
+	v_addpollinfo(vp);
+	mtx_lock(&vp->v_pollinfo->vpi_lock);
+	TAILQ_FOREACH(watch1, &vp->v_pollinfo->vpi_inotify, vlink) {
+		if (watch1->sc == sc)
+			break;
+	}
+	mtx_lock(&sc->lock);
+	if (watch1 != NULL) {
+		mtx_unlock(&vp->v_pollinfo->vpi_lock);
+
+		/*
+		 * We found an existing watch, update it based on our flags.
+		 */
+		if ((mask & IN_MASK_CREATE) != 0) {
+			mtx_unlock(&sc->lock);
+			vrele(vp);
+			free(watch, M_INOTIFY);
+			return (EEXIST);
+		}
+		if ((mask & IN_MASK_ADD) != 0)
+			watch1->mask |= mask;
+		else
+			watch1->mask = mask;
+		*wdp = watch1->wd;
+		mtx_unlock(&sc->lock);
+		vrele(vp);
+		free(watch, M_INOTIFY);
+		return (EJUSTRETURN);
+	}
+
+	/*
+	 * We're creating a new watch. Add it to the softc and vnode watch
+	 * lists.
+	 */
+	do {
+		struct inotify_watch key;
+
+		/*
+		 * Search for the next available watch descriptor. This is
+		 * implemented so as to avoid reusing watch descriptors for as
+		 * long as possible.
+		 */
+		key.wd = wd = sc->nextwatch++;
+		watch1 = RB_FIND(inotify_watch_tree, &sc->watches, &key);
+	} while (watch1 != NULL || wd == 0);
+	watch->wd = wd;
+	RB_INSERT(inotify_watch_tree, &sc->watches, watch);
+	TAILQ_INSERT_TAIL(&vp->v_pollinfo->vpi_inotify, watch, vlink);
+	mtx_unlock(&sc->lock);
+	mtx_unlock(&vp->v_pollinfo->vpi_lock);
+	vn_irflag_set_cond(vp, VIRF_INOTIFY);
+
+	*wdp = wd;
+
+	return (0);
+}
+
+void
+vn_inotify_revoke(struct vnode *vp)
+{
+	if (vp->v_pollinfo == NULL) {
+		/* This is a nullfs vnode which shadows a watched vnode. */
+		return;
+	}
+	inotify_log(vp, NULL, 0, IN_UNMOUNT, 0);
+}
+
+static int
+fget_inotify(struct thread *td, int fd, const cap_rights_t *needrightsp,
+    struct file **fpp)
+{
+	struct file *fp;
+	int error;
+
+	error = fget(td, fd, needrightsp, &fp);
+	if (error != 0)
+		return (error);
+	if (fp->f_type != DTYPE_INOTIFY) {
+		fdrop(fp, td);
+		return (EINVAL);
+	}
+	*fpp = fp;
+	return (0);
+}
+
+int
+kern_inotify_add_watch(int fd, int dfd, const char *path, uint32_t mask,
+    struct thread *td)
+{
+	struct nameidata nd;
+	struct file *fp;
+	struct inotify_softc *sc;
+	struct vnode *vp;
+	uint32_t wd;
+	int count, error;
+
+	fp = NULL;
+	vp = NULL;
+
+	if ((mask & IN_ALL_EVENTS) == 0)
+		return (EXTERROR(EINVAL, "no events specified"));
+	if ((mask & (IN_MASK_ADD | IN_MASK_CREATE)) ==
+	    (IN_MASK_ADD | IN_MASK_CREATE))
+		return (EXTERROR(EINVAL,
+		    "IN_MASK_ADD and IN_MASK_CREATE are mutually exclusive"));
+	if ((mask & ~(IN_ALL_EVENTS | _IN_ALL_FLAGS | IN_UNMOUNT)) != 0)
+		return (EXTERROR(EINVAL, "unrecognized flag"));
+
+	error = fget_inotify(td, fd, &cap_inotify_add_rights, &fp);
+	if (error != 0)
+		return (error);
+	sc = fp->f_data;
+
+	NDINIT_AT(&nd, LOOKUP,
+	    ((mask & IN_DONT_FOLLOW) ? NOFOLLOW : FOLLOW) | LOCKLEAF |
+	    LOCKSHARED | AUDITVNODE1, UIO_USERSPACE, path, dfd);
+	error = namei(&nd);
+	if (error != 0)
+		goto out;
+	NDFREE_PNBUF(&nd);
+	vp = nd.ni_vp;
+
+	error = VOP_ACCESS(vp, VREAD, td->td_ucred, td);
+	if (error != 0)
+		goto out;
+
+	if ((mask & IN_ONLYDIR) != 0 && vp->v_type != VDIR) {
+		error = ENOTDIR;
+		goto out;
+	}
+
+	count = atomic_fetchadd_int(&inotify_watches, 1);
+	if (count > inotify_max_watches) {
+		atomic_subtract_int(&inotify_watches, 1);
+		error = ENOSPC;
+		goto out;
+	}
+	if (!chginotifywatchcnt(sc->cred->cr_ruidinfo, 1,
+	    inotify_max_user_watches)) {
+		atomic_subtract_int(&inotify_watches, 1);
+		error = ENOSPC;
+		goto out;
+	}
+	error = VOP_INOTIFY_ADD_WATCH(vp, sc, mask, &wd, td);
+	if (error != 0) {
+		atomic_subtract_int(&inotify_watches, 1);
+		(void)chginotifywatchcnt(sc->cred->cr_ruidinfo, -1, 0);
+		if (error == EJUSTRETURN) {
+			/* We updated an existing watch, everything is ok. */
+			error = 0;
+		} else {
+			goto out;
+		}
+	}
+	td->td_retval[0] = wd;
+
+out:
+	if (vp != NULL)
+		vput(vp);
+	fdrop(fp, td);
+	return (error);
+}
+
+int
+sys_inotify_add_watch_at(struct thread *td,
+    struct inotify_add_watch_at_args *uap)
+{
+	return (kern_inotify_add_watch(uap->fd, uap->dfd, uap->path,
+	    uap->mask, td));
+}
+
+int
+kern_inotify_rm_watch(int fd, uint32_t wd, struct thread *td)
+{
+	struct file *fp;
+	struct inotify_softc *sc;
+	struct inotify_record *rec;
+	struct inotify_watch key, *watch;
+	int error;
+
+	error = fget_inotify(td, fd, &cap_inotify_rm_rights, &fp);
+	if (error != 0)
+		return (error);
+	sc = fp->f_data;
+
+	rec = inotify_alloc_record(wd, NULL, 0, IN_IGNORED, 0, M_WAITOK);
+
+	/*
+	 * For compatibility with Linux, we do not remove pending events
+	 * associated with the watch. Watch descriptors are implemented so as
+	 * to avoid being reused for as long as possible, so one hopes that any
+	 * pending events from the removed watch descriptor will be removed
+	 * before the watch descriptor is recycled.
+	 */
+	key.wd = wd;
+	mtx_lock(&sc->lock);
+	watch = RB_FIND(inotify_watch_tree, &sc->watches, &key);
+	if (watch == NULL) {
+		free(rec, M_INOTIFY);
+		error = EINVAL;
+	} else {
+		RB_REMOVE(inotify_watch_tree, &sc->watches, watch);
+		if (!inotify_queue_record(sc, rec)) {
+			free(rec, M_INOTIFY);
+			error = 0;
+		}
+	}
+	mtx_unlock(&sc->lock);
+	if (watch != NULL)
+		inotify_remove_watch(watch);
+	fdrop(fp, td);
+	return (error);
+}
+
+int
+sys_inotify_rm_watch(struct thread *td, struct inotify_rm_watch_args *uap)
+{
+	return (kern_inotify_rm_watch(uap->fd, uap->wd, td));
+}
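For reference, below is a minimal userspace sketch of consuming events from a descriptor backed by the code above. It assumes the inotify_init1() and inotify_add_watch() libc wrappers that conventionally accompany this interface; only struct inotify_event, the IN_* masks, and the read(2) semantics are taken from the patch itself, and the watched path "/tmp" is arbitrary. Note that inotify_read() above fails with EINVAL if the supplied buffer cannot hold the first pending event, so the buffer is sized well above sizeof(struct inotify_event) plus NAME_MAX.

#include <sys/inotify.h>

#include <err.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	/* Large enough for several maximally sized events. */
	_Alignas(struct inotify_event) char buf[8192];
	ssize_t n;
	int fd, wd;

	fd = inotify_init1(0);		/* assumed wrapper */
	if (fd < 0)
		err(1, "inotify_init1");
	wd = inotify_add_watch(fd, "/tmp", IN_CREATE | IN_DELETE);
	if (wd < 0)
		err(1, "inotify_add_watch");

	/* Each read(2) returns one or more complete events. */
	while ((n = read(fd, buf, sizeof(buf))) > 0) {
		for (char *p = buf; p < buf + n;) {
			struct inotify_event *ev = (struct inotify_event *)p;

			if ((ev->mask & IN_Q_OVERFLOW) != 0)
				warnx("event queue overflowed");
			else
				printf("wd %d mask %#x name %s\n", ev->wd,
				    ev->mask, ev->len > 0 ? ev->name : "");
			p += sizeof(*ev) + ev->len;
		}
	}
	close(fd);
	return (0);
}

The blocking read loop pairs with the msleep()/wakeup() logic in inotify_read(); passing IN_NONBLOCK at creation time instead makes read(2) return EWOULDBLOCK when the queue is empty, which suits poll(2)- or kevent(2)-driven consumers.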