Diffstat (limited to 'sys/kern/vfs_inotify.c')
 -rw-r--r--  sys/kern/vfs_inotify.c | 1008
 1 file changed, 1008 insertions(+), 0 deletions(-)
diff --git a/sys/kern/vfs_inotify.c b/sys/kern/vfs_inotify.c
new file mode 100644
index 000000000000..9562350c897f
--- /dev/null
+++ b/sys/kern/vfs_inotify.c
@@ -0,0 +1,1008 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2025 Klara, Inc.
+ */
+
+#include "opt_ktrace.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/caprights.h>
+#include <sys/counter.h>
+#include <sys/dirent.h>
+#define EXTERR_CATEGORY EXTERR_CAT_INOTIFY
+#include <sys/exterrvar.h>
+#include <sys/fcntl.h>
+#include <sys/file.h>
+#include <sys/filio.h>
+#include <sys/inotify.h>
+#include <sys/kernel.h>
+#include <sys/ktrace.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/namei.h>
+#include <sys/poll.h>
+#include <sys/proc.h>
+#include <sys/queue.h>
+#include <sys/resourcevar.h>
+#include <sys/selinfo.h>
+#include <sys/stat.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysctl.h>
+#include <sys/sysent.h>
+#include <sys/syslimits.h>
+#include <sys/sysproto.h>
+#include <sys/tree.h>
+#include <sys/user.h>
+#include <sys/vnode.h>
+
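+/*
+ * The source of cookie values used to pair the IN_MOVED_FROM and IN_MOVED_TO
+ * events generated by a rename.
+ */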
+uint32_t inotify_rename_cookie;
+
+static SYSCTL_NODE(_vfs, OID_AUTO, inotify, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
+ "inotify configuration");
+
+static int inotify_max_queued_events = 16384;
+SYSCTL_INT(_vfs_inotify, OID_AUTO, max_queued_events, CTLFLAG_RWTUN,
+ &inotify_max_queued_events, 0,
+ "Maximum number of events to queue on an inotify descriptor");
+
+static int inotify_max_user_instances = 256;
+SYSCTL_INT(_vfs_inotify, OID_AUTO, max_user_instances, CTLFLAG_RWTUN,
+ &inotify_max_user_instances, 0,
+ "Maximum number of inotify descriptors per user");
+
+static int inotify_max_user_watches;
+SYSCTL_INT(_vfs_inotify, OID_AUTO, max_user_watches, CTLFLAG_RWTUN,
+ &inotify_max_user_watches, 0,
+ "Maximum number of inotify watches per user");
+
+static int inotify_max_watches;
+SYSCTL_INT(_vfs_inotify, OID_AUTO, max_watches, CTLFLAG_RWTUN,
+ &inotify_max_watches, 0,
+ "Maximum number of inotify watches system-wide");
+
+static int inotify_watches;
+SYSCTL_INT(_vfs_inotify, OID_AUTO, watches, CTLFLAG_RD,
+ &inotify_watches, 0,
+ "Total number of inotify watches currently in use");
+
+static int inotify_coalesce = 1;
+SYSCTL_INT(_vfs_inotify, OID_AUTO, coalesce, CTLFLAG_RWTUN,
+ &inotify_coalesce, 0,
+ "Coalesce inotify events when possible");
+
+static COUNTER_U64_DEFINE_EARLY(inotify_event_drops);
+SYSCTL_COUNTER_U64(_vfs_inotify, OID_AUTO, event_drops, CTLFLAG_RD,
+ &inotify_event_drops,
+ "Number of inotify events dropped due to limits or allocation failures");
+
+static fo_rdwr_t inotify_read;
+static fo_ioctl_t inotify_ioctl;
+static fo_poll_t inotify_poll;
+static fo_kqfilter_t inotify_kqfilter;
+static fo_stat_t inotify_stat;
+static fo_close_t inotify_close;
+static fo_fill_kinfo_t inotify_fill_kinfo;
+
+static const struct fileops inotifyfdops = {
+ .fo_read = inotify_read,
+ .fo_write = invfo_rdwr,
+ .fo_truncate = invfo_truncate,
+ .fo_ioctl = inotify_ioctl,
+ .fo_poll = inotify_poll,
+ .fo_kqfilter = inotify_kqfilter,
+ .fo_stat = inotify_stat,
+ .fo_close = inotify_close,
+ .fo_chmod = invfo_chmod,
+ .fo_chown = invfo_chown,
+ .fo_sendfile = invfo_sendfile,
+ .fo_fill_kinfo = inotify_fill_kinfo,
+ .fo_cmp = file_kcmp_generic,
+ .fo_flags = DFLAG_PASSABLE,
+};
+
+static void filt_inotifydetach(struct knote *kn);
+static int filt_inotifyevent(struct knote *kn, long hint);
+
+static const struct filterops inotify_rfiltops = {
+ .f_isfd = 1,
+ .f_detach = filt_inotifydetach,
+ .f_event = filt_inotifyevent,
+};
+
+static MALLOC_DEFINE(M_INOTIFY, "inotify", "inotify data structures");
+
+struct inotify_record {
+ STAILQ_ENTRY(inotify_record) link;
+ struct inotify_event ev;
+};
+
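+/* The source of unique st_ino values reported by inotify_stat(). */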
+static uint64_t inotify_ino = 1;
+
+/*
+ * On LP64 systems this occupies 64 bytes, so we don't get internal
+ * fragmentation by allocating watches with malloc(9). If the size changes,
+ * consider using a UMA zone to improve memory efficiency.
+ */
+struct inotify_watch {
+ struct inotify_softc *sc; /* back-pointer */
+ int wd; /* unique ID */
+ uint32_t mask; /* event mask */
+ struct vnode *vp; /* vnode being watched, refed */
+ RB_ENTRY(inotify_watch) ilink; /* inotify linkage */
+ TAILQ_ENTRY(inotify_watch) vlink; /* vnode linkage */
+};
+
+static void
+inotify_init(void *arg __unused)
+{
+ /* Don't let a user hold too many vnodes. */
+ inotify_max_user_watches = desiredvnodes / 3;
+ /* Don't let the system hold too many vnodes. */
+ inotify_max_watches = desiredvnodes / 2;
+}
+SYSINIT(inotify, SI_SUB_VFS, SI_ORDER_ANY, inotify_init, NULL);
+
+static int
+inotify_watch_cmp(const struct inotify_watch *a,
+ const struct inotify_watch *b)
+{
+ if (a->wd < b->wd)
+ return (-1);
+ else if (a->wd > b->wd)
+ return (1);
+ else
+ return (0);
+}
+RB_HEAD(inotify_watch_tree, inotify_watch);
+RB_GENERATE_STATIC(inotify_watch_tree, inotify_watch, ilink, inotify_watch_cmp);
+
+struct inotify_softc {
+ struct mtx lock; /* serialize all softc writes */
+ STAILQ_HEAD(, inotify_record) pending; /* events waiting to be read */
+ struct inotify_record overflow; /* preallocated record */
+ int nextwatch; /* next watch ID to try */
+ int npending; /* number of pending events */
+ size_t nbpending; /* bytes available to read */
+ uint64_t ino; /* unique identifier */
+ struct inotify_watch_tree watches; /* active watches */
+ struct selinfo sel; /* select/poll/kevent info */
+ struct ucred *cred; /* credential ref */
+};
+
+static struct inotify_record *
+inotify_dequeue(struct inotify_softc *sc)
+{
+ struct inotify_record *rec;
+
+ mtx_assert(&sc->lock, MA_OWNED);
+ KASSERT(!STAILQ_EMPTY(&sc->pending),
+ ("%s: queue for %p is empty", __func__, sc));
+
+ rec = STAILQ_FIRST(&sc->pending);
+ STAILQ_REMOVE_HEAD(&sc->pending, link);
+ sc->npending--;
+ sc->nbpending -= sizeof(rec->ev) + rec->ev.len;
+ return (rec);
+}
+
+static void
+inotify_enqueue(struct inotify_softc *sc, struct inotify_record *rec, bool head)
+{
+ mtx_assert(&sc->lock, MA_OWNED);
+
+ if (head)
+ STAILQ_INSERT_HEAD(&sc->pending, rec, link);
+ else
+ STAILQ_INSERT_TAIL(&sc->pending, rec, link);
+ sc->npending++;
+ sc->nbpending += sizeof(rec->ev) + rec->ev.len;
+}
+
+static int
+inotify_read(struct file *fp, struct uio *uio, struct ucred *cred, int flags,
+ struct thread *td)
+{
+ struct inotify_softc *sc;
+ struct inotify_record *rec;
+ int error;
+ bool first;
+
+ sc = fp->f_data;
+ error = 0;
+
+ mtx_lock(&sc->lock);
+ while (STAILQ_EMPTY(&sc->pending)) {
+ if ((flags & IO_NDELAY) != 0 || (fp->f_flag & FNONBLOCK) != 0) {
+ mtx_unlock(&sc->lock);
+ return (EWOULDBLOCK);
+ }
+ error = msleep(&sc->pending, &sc->lock, PCATCH, "inotify", 0);
+ if (error != 0) {
+ mtx_unlock(&sc->lock);
+ return (error);
+ }
+ }
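+	/*
+	 * Copy out as many whole records as the buffer will hold.  If even
+	 * the first record does not fit, fail with EINVAL rather than
+	 * truncating the event.
+	 */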
+ for (first = true; !STAILQ_EMPTY(&sc->pending); first = false) {
+ size_t len;
+
+ rec = inotify_dequeue(sc);
+ len = sizeof(rec->ev) + rec->ev.len;
+ if (uio->uio_resid < (ssize_t)len) {
+ inotify_enqueue(sc, rec, true);
+ if (first) {
+ error = EXTERROR(EINVAL,
+ "read buffer is too small");
+ }
+ break;
+ }
+ mtx_unlock(&sc->lock);
+ error = uiomove(&rec->ev, len, uio);
+#ifdef KTRACE
+ if (error == 0 && KTRPOINT(td, KTR_STRUCT))
+ ktrstruct("inotify", &rec->ev, len);
+#endif
+ mtx_lock(&sc->lock);
+ if (error != 0) {
+ inotify_enqueue(sc, rec, true);
+ mtx_unlock(&sc->lock);
+ return (error);
+ }
+ if (rec == &sc->overflow) {
+ /*
+ * Signal to inotify_queue_record() that the overflow
+ * record can be reused.
+ */
+ memset(rec, 0, sizeof(*rec));
+ } else {
+ free(rec, M_INOTIFY);
+ }
+ }
+ mtx_unlock(&sc->lock);
+ return (error);
+}
+
+static int
+inotify_ioctl(struct file *fp, u_long com, void *data, struct ucred *cred,
+ struct thread *td)
+{
+ struct inotify_softc *sc;
+
+ sc = fp->f_data;
+
+ switch (com) {
+ case FIONREAD:
+ *(int *)data = (int)sc->nbpending;
+ return (0);
+ case FIONBIO:
+ case FIOASYNC:
+ return (0);
+ default:
+ return (ENOTTY);
+ }
+}
+
+static int
+inotify_poll(struct file *fp, int events, struct ucred *cred, struct thread *td)
+{
+ struct inotify_softc *sc;
+ int revents;
+
+ sc = fp->f_data;
+ revents = 0;
+
+ mtx_lock(&sc->lock);
+ if ((events & (POLLIN | POLLRDNORM)) != 0 && sc->npending > 0)
+ revents |= events & (POLLIN | POLLRDNORM);
+ else
+ selrecord(td, &sc->sel);
+ mtx_unlock(&sc->lock);
+ return (revents);
+}
+
+static void
+filt_inotifydetach(struct knote *kn)
+{
+ struct inotify_softc *sc;
+
+ sc = kn->kn_hook;
+ knlist_remove(&sc->sel.si_note, kn, 0);
+}
+
+static int
+filt_inotifyevent(struct knote *kn, long hint)
+{
+ struct inotify_softc *sc;
+
+ sc = kn->kn_hook;
+ mtx_assert(&sc->lock, MA_OWNED);
+ kn->kn_data = sc->nbpending;
+ return (kn->kn_data > 0);
+}
+
+static int
+inotify_kqfilter(struct file *fp, struct knote *kn)
+{
+ struct inotify_softc *sc;
+
+ if (kn->kn_filter != EVFILT_READ)
+ return (EINVAL);
+ sc = fp->f_data;
+ kn->kn_fop = &inotify_rfiltops;
+ kn->kn_hook = sc;
+ knlist_add(&sc->sel.si_note, kn, 0);
+ return (0);
+}
+
+static int
+inotify_stat(struct file *fp, struct stat *sb, struct ucred *cred)
+{
+ struct inotify_softc *sc;
+
+ sc = fp->f_data;
+
+ memset(sb, 0, sizeof(*sb));
+ sb->st_mode = S_IFREG | S_IRUSR;
+ sb->st_blksize = sizeof(struct inotify_event) + _IN_NAMESIZE(NAME_MAX);
+ mtx_lock(&sc->lock);
+ sb->st_size = sc->nbpending;
+ sb->st_blocks = sc->npending;
+ sb->st_uid = sc->cred->cr_ruid;
+ sb->st_gid = sc->cred->cr_rgid;
+ sb->st_ino = sc->ino;
+ mtx_unlock(&sc->lock);
+ return (0);
+}
+
+static void
+inotify_unlink_watch_locked(struct inotify_softc *sc, struct inotify_watch *watch)
+{
+ struct vnode *vp;
+
+ vp = watch->vp;
+ mtx_assert(&vp->v_pollinfo->vpi_lock, MA_OWNED);
+
+ atomic_subtract_int(&inotify_watches, 1);
+ (void)chginotifywatchcnt(sc->cred->cr_ruidinfo, -1, 0);
+
+ TAILQ_REMOVE(&vp->v_pollinfo->vpi_inotify, watch, vlink);
+ if (TAILQ_EMPTY(&vp->v_pollinfo->vpi_inotify))
+ vn_irflag_unset_locked(vp, VIRF_INOTIFY);
+}
+
+/*
+ * Assumes that the watch has already been removed from its softc.
+ */
+static void
+inotify_remove_watch(struct inotify_watch *watch)
+{
+ struct inotify_softc *sc;
+ struct vnode *vp;
+
+ sc = watch->sc;
+
+ vp = watch->vp;
+ mtx_lock(&vp->v_pollinfo->vpi_lock);
+ inotify_unlink_watch_locked(sc, watch);
+ mtx_unlock(&vp->v_pollinfo->vpi_lock);
+
+ vrele(vp);
+ free(watch, M_INOTIFY);
+}
+
+static int
+inotify_close(struct file *fp, struct thread *td)
+{
+ struct inotify_softc *sc;
+ struct inotify_record *rec;
+ struct inotify_watch *watch;
+
+ sc = fp->f_data;
+
+ mtx_lock(&sc->lock);
+ (void)chginotifycnt(sc->cred->cr_ruidinfo, -1, 0);
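+	/*
+	 * Drop the softc lock around inotify_remove_watch(), which acquires
+	 * the vnode's pollinfo lock; the pollinfo lock is taken before the
+	 * softc lock elsewhere.
+	 */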
+ while ((watch = RB_MIN(inotify_watch_tree, &sc->watches)) != NULL) {
+ RB_REMOVE(inotify_watch_tree, &sc->watches, watch);
+ mtx_unlock(&sc->lock);
+ inotify_remove_watch(watch);
+ mtx_lock(&sc->lock);
+ }
+ while (!STAILQ_EMPTY(&sc->pending)) {
+ rec = inotify_dequeue(sc);
+ if (rec != &sc->overflow)
+ free(rec, M_INOTIFY);
+ }
+ mtx_unlock(&sc->lock);
+ seldrain(&sc->sel);
+ knlist_destroy(&sc->sel.si_note);
+ mtx_destroy(&sc->lock);
+ crfree(sc->cred);
+ free(sc, M_INOTIFY);
+ return (0);
+}
+
+static int
+inotify_fill_kinfo(struct file *fp, struct kinfo_file *kif,
+ struct filedesc *fdp)
+{
+ struct inotify_softc *sc;
+
+ sc = fp->f_data;
+
+ kif->kf_type = KF_TYPE_INOTIFY;
+ kif->kf_un.kf_inotify.kf_inotify_npending = sc->npending;
+ kif->kf_un.kf_inotify.kf_inotify_nbpending = sc->nbpending;
+ return (0);
+}
+
+int
+inotify_create_file(struct thread *td, struct file *fp, int flags, int *fflagsp)
+{
+ struct inotify_softc *sc;
+ int fflags;
+
+ if ((flags & ~(IN_NONBLOCK | IN_CLOEXEC)) != 0)
+ return (EINVAL);
+
+ if (!chginotifycnt(td->td_ucred->cr_ruidinfo, 1,
+ inotify_max_user_instances))
+ return (EMFILE);
+
+ sc = malloc(sizeof(*sc), M_INOTIFY, M_WAITOK | M_ZERO);
+	sc->nextwatch = 1; /* Watch descriptors start at 1, as on Linux. */
+ STAILQ_INIT(&sc->pending);
+ RB_INIT(&sc->watches);
+ mtx_init(&sc->lock, "inotify", NULL, MTX_DEF);
+ knlist_init_mtx(&sc->sel.si_note, &sc->lock);
+ sc->cred = crhold(td->td_ucred);
+ sc->ino = atomic_fetchadd_64(&inotify_ino, 1);
+
+ fflags = FREAD;
+ if ((flags & IN_NONBLOCK) != 0)
+ fflags |= FNONBLOCK;
+ if ((flags & IN_CLOEXEC) != 0)
+ *fflagsp |= O_CLOEXEC;
+ finit(fp, fflags, DTYPE_INOTIFY, sc, &inotifyfdops);
+
+ return (0);
+}
+
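+/*
+ * Allocate an event record, with room for a NUL-padded copy of the name if
+ * one is supplied; M_ZERO provides the padding.
+ */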
+static struct inotify_record *
+inotify_alloc_record(uint32_t wd, const char *name, size_t namelen, int event,
+ uint32_t cookie, int waitok)
+{
+ struct inotify_event *evp;
+ struct inotify_record *rec;
+
+ rec = malloc(sizeof(*rec) + _IN_NAMESIZE(namelen), M_INOTIFY,
+ waitok | M_ZERO);
+ if (rec == NULL)
+ return (NULL);
+ evp = &rec->ev;
+ evp->wd = wd;
+ evp->mask = event;
+ evp->cookie = cookie;
+ evp->len = _IN_NAMESIZE(namelen);
+ if (name != NULL)
+ memcpy(evp->name, name, namelen);
+ return (rec);
+}
+
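+/*
+ * Can the event be dropped because it duplicates the most recently queued
+ * record?
+ */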
+static bool
+inotify_can_coalesce(struct inotify_softc *sc, struct inotify_event *evp)
+{
+ struct inotify_record *prev;
+
+ mtx_assert(&sc->lock, MA_OWNED);
+
+ prev = STAILQ_LAST(&sc->pending, inotify_record, link);
+ return (prev != NULL && prev->ev.mask == evp->mask &&
+ prev->ev.wd == evp->wd && prev->ev.cookie == evp->cookie &&
+ prev->ev.len == evp->len &&
+ (evp->len == 0 || strcmp(prev->ev.name, evp->name) == 0));
+}
+
+static void
+inotify_overflow_event(struct inotify_event *evp)
+{
+ evp->mask = IN_Q_OVERFLOW;
+ evp->wd = -1;
+ evp->cookie = 0;
+ evp->len = 0;
+}
+
+/*
+ * Put an event record on the queue for an inotify descriptor.  Return false
+ * if the record was not enqueued for some reason, true otherwise.
+ */
+static bool
+inotify_queue_record(struct inotify_softc *sc, struct inotify_record *rec)
+{
+ struct inotify_event *evp;
+
+ mtx_assert(&sc->lock, MA_OWNED);
+
+ evp = &rec->ev;
+ if (__predict_false(rec == &sc->overflow)) {
+ /*
+ * Is the overflow record already in the queue? If so, there's
+ * not much else we can do: we're here because a kernel memory
+ * shortage prevented new record allocations.
+ */
+ counter_u64_add(inotify_event_drops, 1);
+ if (evp->mask == IN_Q_OVERFLOW)
+ return (false);
+ inotify_overflow_event(evp);
+ } else {
+ /* Try to coalesce duplicate events. */
+ if (inotify_coalesce && inotify_can_coalesce(sc, evp))
+ return (false);
+
+ /*
+ * Would this one overflow the queue? If so, convert it to an
+ * overflow event and try again to coalesce.
+ */
+ if (sc->npending >= inotify_max_queued_events) {
+ counter_u64_add(inotify_event_drops, 1);
+ inotify_overflow_event(evp);
+ if (inotify_can_coalesce(sc, evp))
+ return (false);
+ }
+ }
+ inotify_enqueue(sc, rec, false);
+ selwakeup(&sc->sel);
+ KNOTE_LOCKED(&sc->sel.si_note, 0);
+ wakeup(&sc->pending);
+ return (true);
+}
+
+static int
+inotify_log_one(struct inotify_watch *watch, const char *name, size_t namelen,
+ int event, uint32_t cookie)
+{
+ struct inotify_watch key;
+ struct inotify_softc *sc;
+ struct inotify_record *rec;
+ int relecount;
+ bool allocfail;
+
+ relecount = 0;
+
+ sc = watch->sc;
+ rec = inotify_alloc_record(watch->wd, name, namelen, event, cookie,
+ M_NOWAIT);
+ if (rec == NULL) {
+ rec = &sc->overflow;
+ allocfail = true;
+ } else {
+ allocfail = false;
+ }
+
+ mtx_lock(&sc->lock);
+ if (!inotify_queue_record(sc, rec) && rec != &sc->overflow)
+ free(rec, M_INOTIFY);
+ if ((watch->mask & IN_ONESHOT) != 0 ||
+ (event & (IN_DELETE_SELF | IN_UNMOUNT)) != 0) {
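+		/*
+		 * The watch is going away, so queue an IN_IGNORED event to
+		 * tell the consumer, unless a memory shortage already forced
+		 * use of the overflow record above.
+		 */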
+ if (!allocfail) {
+ rec = inotify_alloc_record(watch->wd, NULL, 0,
+ IN_IGNORED, 0, M_NOWAIT);
+ if (rec == NULL)
+ rec = &sc->overflow;
+ if (!inotify_queue_record(sc, rec) &&
+ rec != &sc->overflow)
+ free(rec, M_INOTIFY);
+ }
+
+ /*
+ * Remove the watch, taking care to handle races with
+ * inotify_close().
+ */
+ key.wd = watch->wd;
+ if (RB_FIND(inotify_watch_tree, &sc->watches, &key) != NULL) {
+ RB_REMOVE(inotify_watch_tree, &sc->watches, watch);
+ inotify_unlink_watch_locked(sc, watch);
+ free(watch, M_INOTIFY);
+
+			/* Defer the vrele() until the locks are dropped. */
+ relecount++;
+ }
+ }
+ mtx_unlock(&sc->lock);
+ return (relecount);
+}
+
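+/*
+ * Log an event on all watches attached to the vnode, dropping the deferred
+ * vnode references once the locks are released.
+ */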
+void
+inotify_log(struct vnode *vp, const char *name, size_t namelen, int event,
+ uint32_t cookie)
+{
+ struct inotify_watch *watch, *tmp;
+ int relecount;
+
+ KASSERT((event & ~(IN_ALL_EVENTS | IN_ISDIR | IN_UNMOUNT)) == 0,
+ ("inotify_log: invalid event %#x", event));
+
+ relecount = 0;
+ mtx_lock(&vp->v_pollinfo->vpi_lock);
+ TAILQ_FOREACH_SAFE(watch, &vp->v_pollinfo->vpi_inotify, vlink, tmp) {
+ KASSERT(watch->vp == vp,
+ ("inotify_log: watch %p vp != vp", watch));
+ if ((watch->mask & event) != 0 || event == IN_UNMOUNT) {
+ relecount += inotify_log_one(watch, name, namelen, event,
+ cookie);
+ }
+ }
+ mtx_unlock(&vp->v_pollinfo->vpi_lock);
+
+ for (int i = 0; i < relecount; i++)
+ vrele(vp);
+}
+
+/*
+ * An inotify event occurred on a watched vnode.
+ */
+void
+vn_inotify(struct vnode *vp, struct vnode *dvp, struct componentname *cnp,
+ int event, uint32_t cookie)
+{
+ int isdir;
+
+ VNPASS(vp->v_holdcnt > 0, vp);
+
+ isdir = vp->v_type == VDIR ? IN_ISDIR : 0;
+
+ if (dvp != NULL) {
+ VNPASS(dvp->v_holdcnt > 0, dvp);
+
+ /*
+ * Should we log an event for the vnode itself?
+ */
+ if ((vn_irflag_read(vp) & VIRF_INOTIFY) != 0) {
+ int selfevent;
+
+ switch (event) {
+ case _IN_MOVE_DELETE:
+ case IN_DELETE:
+ /*
+ * IN_DELETE_SELF is only generated when the
+ * last hard link of a file is removed.
+ */
+ selfevent = IN_DELETE_SELF;
+ if (vp->v_type != VDIR) {
+ struct vattr va;
+ int error;
+
+ error = VOP_GETATTR(vp, &va, cnp->cn_cred);
+ if (error == 0 && va.va_nlink != 0)
+ selfevent = 0;
+ }
+ break;
+ case IN_MOVED_FROM:
+ cookie = 0;
+ selfevent = IN_MOVE_SELF;
+ break;
+ case _IN_ATTRIB_LINKCOUNT:
+ selfevent = IN_ATTRIB;
+ break;
+ default:
+ selfevent = event;
+ break;
+ }
+
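+			/*
+			 * Skip events that are only meaningful for entries in
+			 * a directory, not for the vnode itself.
+			 */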
+ if ((selfevent & ~_IN_DIR_EVENTS) != 0) {
+ inotify_log(vp, NULL, 0, selfevent | isdir,
+ cookie);
+ }
+ }
+
+ /*
+ * Something is watching the directory through which this vnode
+ * was referenced, so we may need to log the event.
+ */
+ if ((event & IN_ALL_EVENTS) != 0 &&
+ (vn_irflag_read(dvp) & VIRF_INOTIFY) != 0) {
+ inotify_log(dvp, cnp->cn_nameptr,
+ cnp->cn_namelen, event | isdir, cookie);
+ }
+ } else {
+ /*
+ * We don't know which watched directory might contain the
+ * vnode, so we have to fall back to searching the name cache.
+ */
+ cache_vop_inotify(vp, event, cookie);
+ }
+}
+
+int
+vn_inotify_add_watch(struct vnode *vp, struct inotify_softc *sc, uint32_t mask,
+ uint32_t *wdp, struct thread *td)
+{
+ struct inotify_watch *watch, *watch1;
+ uint32_t wd;
+
+ /*
+ * If this is a directory, make sure all of its entries are present in
+ * the name cache so that we're able to look them up if an event occurs.
+ * The persistent reference on the directory prevents the outgoing name
+ * cache entries from being reclaimed.
+ */
+ if (vp->v_type == VDIR) {
+ struct dirent *dp;
+ char *buf;
+ off_t off;
+ size_t buflen, len;
+ int eof, error;
+
+ buflen = 128 * sizeof(struct dirent);
+ buf = malloc(buflen, M_TEMP, M_WAITOK);
+
+ error = 0;
+ len = off = eof = 0;
+ for (;;) {
+ struct nameidata nd;
+
+ error = vn_dir_next_dirent(vp, td, buf, buflen, &dp,
+ &len, &off, &eof);
+ if (error != 0)
+ break;
+ if (len == 0)
+ /* Finished reading. */
+ break;
+ if (strcmp(dp->d_name, ".") == 0 ||
+ strcmp(dp->d_name, "..") == 0)
+ continue;
+
+ /*
+ * namei() consumes a reference on the starting
+ * directory if it's specified as a vnode.
+ */
+ vrefact(vp);
+ NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE,
+ dp->d_name, vp);
+ error = namei(&nd);
+ if (error != 0)
+ break;
+ vn_irflag_set_cond(nd.ni_vp, VIRF_INOTIFY_PARENT);
+ vrele(nd.ni_vp);
+ }
+ free(buf, M_TEMP);
+ if (error != 0)
+ return (error);
+ }
+
+ /*
+ * The vnode referenced in kern_inotify_add_watch() might be different
+ * than this one if nullfs is in the picture.
+ */
+ vrefact(vp);
+ watch = malloc(sizeof(*watch), M_INOTIFY, M_WAITOK | M_ZERO);
+ watch->sc = sc;
+ watch->vp = vp;
+ watch->mask = mask;
+
+ /*
+ * Are we updating an existing watch? Search the vnode's list rather
+ * than that of the softc, as the former is likely to be shorter.
+ */
+ v_addpollinfo(vp);
+ mtx_lock(&vp->v_pollinfo->vpi_lock);
+ TAILQ_FOREACH(watch1, &vp->v_pollinfo->vpi_inotify, vlink) {
+ if (watch1->sc == sc)
+ break;
+ }
+ mtx_lock(&sc->lock);
+ if (watch1 != NULL) {
+ mtx_unlock(&vp->v_pollinfo->vpi_lock);
+
+ /*
+		 * We found an existing watch; update it based on our flags.
+ */
+ if ((mask & IN_MASK_CREATE) != 0) {
+ mtx_unlock(&sc->lock);
+ vrele(vp);
+ free(watch, M_INOTIFY);
+ return (EEXIST);
+ }
+ if ((mask & IN_MASK_ADD) != 0)
+ watch1->mask |= mask;
+ else
+ watch1->mask = mask;
+ *wdp = watch1->wd;
+ mtx_unlock(&sc->lock);
+ vrele(vp);
+ free(watch, M_INOTIFY);
+ return (EJUSTRETURN);
+ }
+
+ /*
+ * We're creating a new watch. Add it to the softc and vnode watch
+ * lists.
+ */
+ do {
+ struct inotify_watch key;
+
+ /*
+ * Search for the next available watch descriptor. This is
+ * implemented so as to avoid reusing watch descriptors for as
+ * long as possible.
+ */
+ key.wd = wd = sc->nextwatch++;
+ watch1 = RB_FIND(inotify_watch_tree, &sc->watches, &key);
+ } while (watch1 != NULL || wd == 0);
+ watch->wd = wd;
+ RB_INSERT(inotify_watch_tree, &sc->watches, watch);
+ TAILQ_INSERT_TAIL(&vp->v_pollinfo->vpi_inotify, watch, vlink);
+ mtx_unlock(&sc->lock);
+ mtx_unlock(&vp->v_pollinfo->vpi_lock);
+ vn_irflag_set_cond(vp, VIRF_INOTIFY);
+
+ *wdp = wd;
+
+ return (0);
+}
+
+void
+vn_inotify_revoke(struct vnode *vp)
+{
+ if (vp->v_pollinfo == NULL) {
+ /* This is a nullfs vnode which shadows a watched vnode. */
+ return;
+ }
+ inotify_log(vp, NULL, 0, IN_UNMOUNT, 0);
+}
+
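+/* Get a file pointer for an inotify descriptor, validating its type. */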
+static int
+fget_inotify(struct thread *td, int fd, const cap_rights_t *needrightsp,
+ struct file **fpp)
+{
+ struct file *fp;
+ int error;
+
+ error = fget(td, fd, needrightsp, &fp);
+ if (error != 0)
+ return (error);
+ if (fp->f_type != DTYPE_INOTIFY) {
+ fdrop(fp, td);
+ return (EINVAL);
+ }
+ *fpp = fp;
+ return (0);
+}
+
+int
+kern_inotify_add_watch(int fd, int dfd, const char *path, uint32_t mask,
+ struct thread *td)
+{
+ struct nameidata nd;
+ struct file *fp;
+ struct inotify_softc *sc;
+ struct vnode *vp;
+ uint32_t wd;
+ int count, error;
+
+ fp = NULL;
+ vp = NULL;
+
+ if ((mask & IN_ALL_EVENTS) == 0)
+ return (EXTERROR(EINVAL, "no events specified"));
+ if ((mask & (IN_MASK_ADD | IN_MASK_CREATE)) ==
+ (IN_MASK_ADD | IN_MASK_CREATE))
+ return (EXTERROR(EINVAL,
+ "IN_MASK_ADD and IN_MASK_CREATE are mutually exclusive"));
+ if ((mask & ~(IN_ALL_EVENTS | _IN_ALL_FLAGS | IN_UNMOUNT)) != 0)
+ return (EXTERROR(EINVAL, "unrecognized flag"));
+
+ error = fget_inotify(td, fd, &cap_inotify_add_rights, &fp);
+ if (error != 0)
+ return (error);
+ sc = fp->f_data;
+
+ NDINIT_AT(&nd, LOOKUP,
+ ((mask & IN_DONT_FOLLOW) ? NOFOLLOW : FOLLOW) | LOCKLEAF |
+ LOCKSHARED | AUDITVNODE1, UIO_USERSPACE, path, dfd);
+ error = namei(&nd);
+ if (error != 0)
+ goto out;
+ NDFREE_PNBUF(&nd);
+ vp = nd.ni_vp;
+
+ error = VOP_ACCESS(vp, VREAD, td->td_ucred, td);
+ if (error != 0)
+ goto out;
+
+ if ((mask & IN_ONLYDIR) != 0 && vp->v_type != VDIR) {
+ error = ENOTDIR;
+ goto out;
+ }
+
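+	/* Charge the new watch against the global and per-user limits. */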
+ count = atomic_fetchadd_int(&inotify_watches, 1);
+ if (count > inotify_max_watches) {
+ atomic_subtract_int(&inotify_watches, 1);
+ error = ENOSPC;
+ goto out;
+ }
+ if (!chginotifywatchcnt(sc->cred->cr_ruidinfo, 1,
+ inotify_max_user_watches)) {
+ atomic_subtract_int(&inotify_watches, 1);
+ error = ENOSPC;
+ goto out;
+ }
+ error = VOP_INOTIFY_ADD_WATCH(vp, sc, mask, &wd, td);
+ if (error != 0) {
+ atomic_subtract_int(&inotify_watches, 1);
+ (void)chginotifywatchcnt(sc->cred->cr_ruidinfo, -1, 0);
+ if (error == EJUSTRETURN) {
+			/* We updated an existing watch; everything is OK. */
+ error = 0;
+ } else {
+ goto out;
+ }
+ }
+ td->td_retval[0] = wd;
+
+out:
+ if (vp != NULL)
+ vput(vp);
+ fdrop(fp, td);
+ return (error);
+}
+
+int
+sys_inotify_add_watch_at(struct thread *td,
+ struct inotify_add_watch_at_args *uap)
+{
+ return (kern_inotify_add_watch(uap->fd, uap->dfd, uap->path,
+ uap->mask, td));
+}
+
+int
+kern_inotify_rm_watch(int fd, uint32_t wd, struct thread *td)
+{
+ struct file *fp;
+ struct inotify_softc *sc;
+ struct inotify_record *rec;
+ struct inotify_watch key, *watch;
+ int error;
+
+ error = fget_inotify(td, fd, &cap_inotify_rm_rights, &fp);
+ if (error != 0)
+ return (error);
+ sc = fp->f_data;
+
+ rec = inotify_alloc_record(wd, NULL, 0, IN_IGNORED, 0, M_WAITOK);
+
+	/*
+	 * For compatibility with Linux, we do not remove pending events
+	 * associated with the watch.  Watch descriptors are allocated so as
+	 * to avoid being reused for as long as possible, so one hopes that
+	 * any events still pending for the removed watch will have been read
+	 * before its descriptor is recycled.
+	 */
+ key.wd = wd;
+ mtx_lock(&sc->lock);
+ watch = RB_FIND(inotify_watch_tree, &sc->watches, &key);
+ if (watch == NULL) {
+ free(rec, M_INOTIFY);
+ error = EINVAL;
+ } else {
+ RB_REMOVE(inotify_watch_tree, &sc->watches, watch);
+ if (!inotify_queue_record(sc, rec)) {
+ free(rec, M_INOTIFY);
+ error = 0;
+ }
+ }
+ mtx_unlock(&sc->lock);
+ if (watch != NULL)
+ inotify_remove_watch(watch);
+ fdrop(fp, td);
+ return (error);
+}
+
+int
+sys_inotify_rm_watch(struct thread *td, struct inotify_rm_watch_args *uap)
+{
+ return (kern_inotify_rm_watch(uap->fd, uap->wd, td));
+}