/* $NetBSD: linux_inotify.c,v 1.6 2024/02/09 22:08:34 andvar Exp $ */ /*- * Copyright (c) 2023 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Theodore Preduta. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include __KERNEL_RCSID(0, "$NetBSD: linux_inotify.c,v 1.6 2024/02/09 22:08:34 andvar Exp $"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * inotify(2). This interface allows the user to get file system * events and (unlike kqueue(2)) their order is strictly preserved. * While nice, the API has sufficient gotchas that mean we don't want * to add native entry points for it. They are: * * - Because data is returned via read(2), this API is prone to * unaligned memory accesses. There is a note in the Linux man page * that says the name field of struct linux_inotify_event *can* be * used for alignment purposes. In practice, even Linux doesn't * always do this, so for simplicity, we don't ever do this. 
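 *
 *   For illustration only: a typical Linux consumer walks the byte
 *   stream returned by read(2) roughly like this (userland sketch
 *   using the userland struct inotify_event from <sys/inotify.h>,
 *   not part of this file; error handling omitted):
 *
 *	char buf[4096];
 *	ssize_t n = read(ifd, buf, sizeof(buf));
 *	for (char *p = buf; p < buf + n; ) {
 *		const struct inotify_event *ie = (const void *)p;
 *		... use ie->wd, ie->mask, ie->cookie, ie->len, ie->name ...
 *		p += sizeof(*ie) + ie->len;
 *	}
 *
 *   Nothing guarantees that each event starts at an aligned offset,
 *   which is exactly the alignment gotcha described above.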
*/ #define LINUX_INOTIFY_MAX_QUEUED 16384 #define LINUX_INOTIFY_MAX_FROM_KEVENT 3 #if DEBUG_LINUX #define DPRINTF(x) uprintf x #else #define DPRINTF(x) __nothing #endif struct inotify_entry { TAILQ_ENTRY(inotify_entry) ie_entries; char ie_name[NAME_MAX + 1]; struct linux_inotify_event ie_event; }; struct inotify_dir_entries { size_t ide_count; struct inotify_dir_entry { char name[NAME_MAX + 1]; ino_t fileno; } ide_entries[]; }; #define INOTIFY_DIR_ENTRIES_SIZE(count) (sizeof(struct inotify_dir_entries) \ + count * sizeof(struct inotify_dir_entry)) struct inotifyfd { int ifd_kqfd; /* kqueue fd used by this inotify */ /* instance */ struct selinfo ifd_sel; /* for EVFILT_READ by epoll */ kmutex_t ifd_lock; /* lock for ifd_sel, ifd_wds and */ /* ifd_nwds */ struct inotify_dir_entries **ifd_wds; /* keeps track of watch descriptors */ /* for directories: snapshot of the */ /* directory state */ /* for files: an inotify_dir_entries */ /* with ide_count == 0 */ size_t ifd_nwds; /* max watch descriptor that can be */ /* stored in ifd_wds + 1 */ TAILQ_HEAD(, inotify_entry) ifd_qhead; /* queue of pending events */ size_t ifd_qcount; /* number of pending events */ kcondvar_t ifd_qcv; /* condvar for blocking reads */ kmutex_t ifd_qlock; /* lock for ifd_q* and interlock */ /* for ifd_qcv */ }; struct inotify_kevent_mask_pair { uint32_t inotify; uint32_t kevent; }; static int inotify_kev_fetch_changes(void *, const struct kevent *, struct kevent *, size_t, int); static int do_inotify_init(struct lwp *, register_t *, int); static int inotify_close_wd(struct inotifyfd *, int); static uint32_t inotify_mask_to_kevent_fflags(uint32_t, enum vtype); static void do_kevent_to_inotify(int32_t, uint32_t, uint32_t, struct inotify_entry *, size_t *, char *); static int kevent_to_inotify(struct inotifyfd *, int, enum vtype, uint32_t, uint32_t, struct inotify_entry *, size_t *); static int inotify_readdir(file_t *, struct dirent *, int *, bool); static struct inotify_dir_entries *get_inotify_dir_entries(int, bool); static int inotify_filt_attach(struct knote *); static void inotify_filt_detach(struct knote *); static int inotify_filt_event(struct knote *, long); static void inotify_read_filt_detach(struct knote *); static int inotify_read_filt_event(struct knote *, long); static int inotify_read(file_t *, off_t *, struct uio *, kauth_cred_t, int); static int inotify_close(file_t *); static int inotify_poll(file_t *, int); static int inotify_kqfilter(file_t *, struct knote *); static void inotify_restart(file_t *); static const char inotify_filtname[] = "LINUX_INOTIFY"; static int inotify_filtid; /* "fake" EVFILT_VNODE that gets attached to ifd_deps */ static const struct filterops inotify_filtops = { .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE, .f_attach = inotify_filt_attach, .f_detach = inotify_filt_detach, .f_event = inotify_filt_event, .f_touch = NULL, }; /* EVFILT_READ attached to inotifyfd (to support watching via epoll) */ static const struct filterops inotify_read_filtops = { .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE, .f_attach = NULL, /* attached via .fo_kqfilter */ .f_detach = inotify_read_filt_detach, .f_event = inotify_read_filt_event, .f_touch = NULL, }; static const struct fileops inotify_fileops = { .fo_name = "inotify", .fo_read = inotify_read, .fo_write = fbadop_write, .fo_ioctl = fbadop_ioctl, .fo_fcntl = fnullop_fcntl, .fo_poll = inotify_poll, .fo_stat = fbadop_stat, .fo_close = inotify_close, .fo_kqfilter = inotify_kqfilter, .fo_restart = inotify_restart, .fo_fpathconf = (void *)eopnotsupp, }; /* 
basic flag translations */ static const struct inotify_kevent_mask_pair common_inotify_to_kevent[] = { { .inotify = LINUX_IN_ATTRIB, .kevent = NOTE_ATTRIB, }, { .inotify = LINUX_IN_CLOSE_NOWRITE, .kevent = NOTE_CLOSE, }, { .inotify = LINUX_IN_OPEN, .kevent = NOTE_OPEN, }, { .inotify = LINUX_IN_MOVE_SELF, .kevent = NOTE_RENAME, }, }; static const size_t common_inotify_to_kevent_len = __arraycount(common_inotify_to_kevent); static const struct inotify_kevent_mask_pair vreg_inotify_to_kevent[] = { { .inotify = LINUX_IN_ACCESS, .kevent = NOTE_READ, }, { .inotify = LINUX_IN_ATTRIB, .kevent = NOTE_ATTRIB|NOTE_LINK, }, { .inotify = LINUX_IN_CLOSE_WRITE, .kevent = NOTE_CLOSE_WRITE, }, { .inotify = LINUX_IN_MODIFY, .kevent = NOTE_WRITE, }, }; static const size_t vreg_inotify_to_kevent_len = __arraycount(vreg_inotify_to_kevent); static const struct inotify_kevent_mask_pair vdir_inotify_to_kevent[] = { { .inotify = LINUX_IN_ACCESS, .kevent = NOTE_READ, }, { .inotify = LINUX_IN_CREATE, .kevent = NOTE_WRITE, }, { .inotify = LINUX_IN_DELETE, .kevent = NOTE_WRITE, }, { .inotify = LINUX_IN_MOVED_FROM, .kevent = NOTE_WRITE, }, { .inotify = LINUX_IN_MOVED_TO, .kevent = NOTE_WRITE, }, }; static const size_t vdir_inotify_to_kevent_len = __arraycount(vdir_inotify_to_kevent); static const struct inotify_kevent_mask_pair common_kevent_to_inotify[] = { { .kevent = NOTE_ATTRIB, .inotify = LINUX_IN_ATTRIB, }, { .kevent = NOTE_CLOSE, .inotify = LINUX_IN_CLOSE_NOWRITE, }, { .kevent = NOTE_CLOSE_WRITE, .inotify = LINUX_IN_CLOSE_WRITE, }, { .kevent = NOTE_OPEN, .inotify = LINUX_IN_OPEN, }, { .kevent = NOTE_READ, .inotify = LINUX_IN_ACCESS, }, { .kevent = NOTE_RENAME, .inotify = LINUX_IN_MOVE_SELF, }, { .kevent = NOTE_REVOKE, .inotify = LINUX_IN_UNMOUNT, }, }; static const size_t common_kevent_to_inotify_len = __arraycount(common_kevent_to_inotify); static const struct inotify_kevent_mask_pair vreg_kevent_to_inotify[] = { { .kevent = NOTE_DELETE|NOTE_LINK, .inotify = LINUX_IN_ATTRIB, }, { .kevent = NOTE_WRITE, .inotify = LINUX_IN_MODIFY, }, }; static const size_t vreg_kevent_to_inotify_len = __arraycount(vreg_kevent_to_inotify); /* * Register the custom kfilter for inotify. */ int linux_inotify_init(void) { return kfilter_register(inotify_filtname, &inotify_filtops, &inotify_filtid); } /* * Unregister the custom kfilter for inotify. */ int linux_inotify_fini(void) { return kfilter_unregister(inotify_filtname); } /* * Copyin callback used by kevent. This copies already converted * filters from kernel memory to the kevent internal kernel memory. * Hence the memcpy instead of copyin. */ static int inotify_kev_fetch_changes(void *ctx, const struct kevent *changelist, struct kevent *changes, size_t index, int n) { memcpy(changes, changelist + index, n * sizeof(*changes)); return 0; } /* * Initialize a new inotify fd. 
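 *
 * An inotify descriptor is a DTYPE_MISC file whose f_data points at a
 * struct inotifyfd, layered over a private kqueue descriptor
 * (ifd_kqfd) created with sys_kqueue1().  LINUX_IN_NONBLOCK is mapped
 * onto O_NONBLOCK/FNONBLOCK and LINUX_IN_CLOEXEC onto close-on-exec
 * of the wrapper fd.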
 */
static int
do_inotify_init(struct lwp *l, register_t *retval, int flags)
{
	file_t *fp;
	int error, fd;
	struct proc *p = l->l_proc;
	struct inotifyfd *ifd;
	struct sys_kqueue1_args kqa;

	if (flags & ~(LINUX_IN_ALL_FLAGS))
		return EINVAL;

	ifd = kmem_zalloc(sizeof(*ifd), KM_SLEEP);
	mutex_init(&ifd->ifd_lock, MUTEX_DEFAULT, IPL_NONE);
	mutex_init(&ifd->ifd_qlock, MUTEX_DEFAULT, IPL_NONE);
	cv_init(&ifd->ifd_qcv, "inotify");
	selinit(&ifd->ifd_sel);
	TAILQ_INIT(&ifd->ifd_qhead);

	ifd->ifd_nwds = 1;
	ifd->ifd_wds = kmem_zalloc(ifd->ifd_nwds * sizeof(*ifd->ifd_wds),
	    KM_SLEEP);

	SCARG(&kqa, flags) = 0;
	if (flags & LINUX_IN_NONBLOCK)
		SCARG(&kqa, flags) |= O_NONBLOCK;
	error = sys_kqueue1(l, &kqa, retval);
	if (error != 0)
		goto leave0;
	ifd->ifd_kqfd = *retval;

	error = fd_allocfile(&fp, &fd);
	if (error != 0)
		goto leave1;

	fp->f_flag = FREAD;
	if (flags & LINUX_IN_NONBLOCK)
		fp->f_flag |= FNONBLOCK;
	fp->f_type = DTYPE_MISC;
	fp->f_ops = &inotify_fileops;
	fp->f_data = ifd;
	fd_set_exclose(l, fd, (flags & LINUX_IN_CLOEXEC) != 0);
	fd_affix(p, fp, fd);

	*retval = fd;
	return 0;

leave1:
	KASSERT(fd_getfile(ifd->ifd_kqfd) != NULL);
	fd_close(ifd->ifd_kqfd);
leave0:
	kmem_free(ifd->ifd_wds, ifd->ifd_nwds * sizeof(*ifd->ifd_wds));

	mutex_destroy(&ifd->ifd_lock);
	mutex_destroy(&ifd->ifd_qlock);
	cv_destroy(&ifd->ifd_qcv);
	seldestroy(&ifd->ifd_sel);

	kmem_free(ifd, sizeof(*ifd));
	return error;
}

#ifndef __aarch64__
/*
 * inotify_init(2).  Initialize a new inotify fd with flags=0.
 */
int
linux_sys_inotify_init(struct lwp *l, const void *v, register_t *retval)
{
	return do_inotify_init(l, retval, 0);
}
#endif

/*
 * inotify_init(2).  Initialize a new inotify fd with the given flags.
 */
int
linux_sys_inotify_init1(struct lwp *l,
    const struct linux_sys_inotify_init1_args *uap, register_t *retval)
{
	/* {
		syscallarg(int) flags;
	} */

	return do_inotify_init(l, retval, SCARG(uap, flags));
}

/*
 * Convert inotify mask to the fflags of an equivalent kevent.
 */
static uint32_t
inotify_mask_to_kevent_fflags(uint32_t mask, enum vtype type)
{
	const struct inotify_kevent_mask_pair *type_inotify_to_kevent;
	uint32_t fflags;
	size_t i, type_inotify_to_kevent_len;

	switch (type) {
	case VREG:
	case VDIR:
	case VLNK:
		break;

	default:
		return 0;
	}

	/* flags that all watches could have */
	fflags = NOTE_DELETE|NOTE_REVOKE;
	for (i = 0; i < common_inotify_to_kevent_len; i++)
		if (mask & common_inotify_to_kevent[i].inotify)
			fflags |= common_inotify_to_kevent[i].kevent;

	/* flags that depend on type */
	switch (type) {
	case VREG:
		type_inotify_to_kevent = vreg_inotify_to_kevent;
		type_inotify_to_kevent_len = vreg_inotify_to_kevent_len;
		break;

	case VDIR:
		type_inotify_to_kevent = vdir_inotify_to_kevent;
		type_inotify_to_kevent_len = vdir_inotify_to_kevent_len;
		break;

	default:
		type_inotify_to_kevent_len = 0;
		break;
	}
	for (i = 0; i < type_inotify_to_kevent_len; i++)
		if (mask & type_inotify_to_kevent[i].inotify)
			fflags |= type_inotify_to_kevent[i].kevent;

	return fflags;
}

/*
 * inotify_add_watch(2).  Open a fd for pathname (if we do not already
 * have one for it), track it, and add an equivalent kqueue event for
 * it in ifd->ifd_kqfd.
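 *
 * For example (per the translation tables above), LINUX_IN_CREATE on a
 * directory becomes a kevent with fflags NOTE_DELETE|NOTE_REVOKE|
 * NOTE_WRITE: NOTE_DELETE and NOTE_REVOKE are always requested so that
 * LINUX_IN_DELETE_SELF, LINUX_IN_UNMOUNT and the final LINUX_IN_IGNORED
 * can be generated, and NOTE_WRITE is how directory entry creation is
 * reported by the vnode filter.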
*/ int linux_sys_inotify_add_watch(struct lwp *l, const struct linux_sys_inotify_add_watch_args *uap, register_t *retval) { /* { syscallarg(int) fd; syscallarg(const char *) pathname; syscallarg(uint32_t) mask; } */ int wd, i, error = 0; file_t *fp, *wp, *cur_fp; struct inotifyfd *ifd; struct inotify_dir_entries **new_wds; struct knote *kn, *tmpkn; struct sys_open_args oa; struct kevent kev; struct vnode *wvp; namei_simple_flags_t sflags; struct kevent_ops k_ops = { .keo_private = NULL, .keo_fetch_timeout = NULL, .keo_fetch_changes = inotify_kev_fetch_changes, .keo_put_events = NULL, }; const int fd = SCARG(uap, fd); const uint32_t mask = SCARG(uap, mask); if (mask & ~LINUX_IN_ADD_KNOWN) return EINVAL; fp = fd_getfile(fd); if (fp == NULL) return EBADF; if (fp->f_ops != &inotify_fileops) { /* not an inotify fd */ error = EBADF; goto leave0; } ifd = fp->f_data; mutex_enter(&ifd->ifd_lock); if (mask & LINUX_IN_DONT_FOLLOW) sflags = NSM_NOFOLLOW_TRYEMULROOT; else sflags = NSM_FOLLOW_TRYEMULROOT; error = namei_simple_user(SCARG(uap, pathname), sflags, &wvp); if (error != 0) goto leave1; /* Check to see if we already have a descriptor to wd's file. */ wd = -1; for (i = 0; i < ifd->ifd_nwds; i++) { if (ifd->ifd_wds[i] != NULL) { cur_fp = fd_getfile(i); if (cur_fp == NULL) { DPRINTF(("%s: wd=%d was closed externally\n", __func__, i)); error = EBADF; goto leave1; } if (cur_fp->f_type != DTYPE_VNODE) { DPRINTF(("%s: wd=%d was replaced " "with a non-vnode\n", __func__, i)); error = EBADF; } if (error == 0 && cur_fp->f_vnode == wvp) wd = i; fd_putfile(i); if (error != 0) goto leave1; if (wd != -1) break; } } if (wd == -1) { /* * If we do not have a descriptor to wd's file, we * need to open the watch descriptor. */ SCARG(&oa, path) = SCARG(uap, pathname); SCARG(&oa, mode) = 0; SCARG(&oa, flags) = O_RDONLY; if (mask & LINUX_IN_DONT_FOLLOW) SCARG(&oa, flags) |= O_NOFOLLOW; if (mask & LINUX_IN_ONLYDIR) SCARG(&oa, flags) |= O_DIRECTORY; error = sys_open(l, &oa, retval); if (error != 0) goto leave1; wd = *retval; wp = fd_getfile(wd); KASSERT(wp != NULL); KASSERT(wp->f_type == DTYPE_VNODE); /* translate the flags */ memset(&kev, 0, sizeof(kev)); EV_SET(&kev, wd, inotify_filtid, EV_ADD|EV_ENABLE, NOTE_DELETE|NOTE_REVOKE, 0, ifd); if (mask & LINUX_IN_ONESHOT) kev.flags |= EV_ONESHOT; kev.fflags |= inotify_mask_to_kevent_fflags(mask, wp->f_vnode->v_type); error = kevent1(retval, ifd->ifd_kqfd, &kev, 1, NULL, 0, NULL, &k_ops); if (error != 0) { KASSERT(fd_getfile(wd) != NULL); fd_close(wd); } else { /* Success! */ *retval = wd; /* Resize ifd_nwds to accommodate wd. */ if (wd+1 > ifd->ifd_nwds) { new_wds = kmem_zalloc( (wd+1) * sizeof(*ifd->ifd_wds), KM_SLEEP); memcpy(new_wds, ifd->ifd_wds, ifd->ifd_nwds * sizeof(*ifd->ifd_wds)); kmem_free(ifd->ifd_wds, ifd->ifd_nwds * sizeof(*ifd->ifd_wds)); ifd->ifd_wds = new_wds; ifd->ifd_nwds = wd+1; } ifd->ifd_wds[wd] = get_inotify_dir_entries(wd, true); } } else { /* * If we do have a descriptor to wd's file, try to edit * the relevant knote. 
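		 *
		 * LINUX_IN_MASK_CREATE turns this case into an EEXIST
		 * error, and LINUX_IN_MASK_ADD ORs the new events into
		 * the existing watch mask instead of replacing it,
		 * matching Linux semantics.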
		 */
		if (mask & LINUX_IN_MASK_CREATE) {
			error = EEXIST;
			goto leave1;
		}

		wp = fd_getfile(wd);
		if (wp == NULL) {
			DPRINTF(("%s: wd=%d was closed externally "
			    "(race, probably)\n", __func__, wd));
			error = EBADF;
			goto leave1;
		}
		if (wp->f_type != DTYPE_VNODE) {
			DPRINTF(("%s: wd=%d was replaced with a non-vnode "
			    "(race, probably)\n", __func__, wd));
			error = EBADF;
			goto leave2;
		}

		kev.fflags = NOTE_DELETE | NOTE_REVOKE |
		    inotify_mask_to_kevent_fflags(mask, wp->f_vnode->v_type);

		mutex_enter(wp->f_vnode->v_interlock);

		/*
		 * XXX We are forced to find the appropriate knote
		 * manually because we cannot create a custom f_touch
		 * function for inotify_filtops.  See filter_touch()
		 * in kern_event.c for details.
		 */
		SLIST_FOREACH_SAFE(kn, &wp->f_vnode->v_klist->vk_klist,
		    kn_selnext, tmpkn) {
			if (kn->kn_fop == &inotify_filtops &&
			    ifd == kn->kn_kevent.udata) {
				mutex_enter(&kn->kn_kq->kq_lock);
				if (mask & LINUX_IN_MASK_ADD)
					kn->kn_sfflags |= kev.fflags;
				else
					kn->kn_sfflags = kev.fflags;
				wp->f_vnode->v_klist->vk_interest |=
				    kn->kn_sfflags;
				mutex_exit(&kn->kn_kq->kq_lock);
			}
		}

		mutex_exit(wp->f_vnode->v_interlock);

		/* Success! */
		*retval = wd;
	}

leave2:
	fd_putfile(wd);
leave1:
	mutex_exit(&ifd->ifd_lock);
leave0:
	fd_putfile(fd);
	return error;
}

/*
 * Remove a wd from ifd and close wd.
 */
static int
inotify_close_wd(struct inotifyfd *ifd, int wd)
{
	file_t *wp;
	int error;
	register_t retval;
	struct kevent kev;
	struct kevent_ops k_ops = {
		.keo_private = NULL,
		.keo_fetch_timeout = NULL,
		.keo_fetch_changes = inotify_kev_fetch_changes,
		.keo_put_events = NULL,
	};

	mutex_enter(&ifd->ifd_lock);

	KASSERT(0 <= wd && wd < ifd->ifd_nwds && ifd->ifd_wds[wd] != NULL);

	kmem_free(ifd->ifd_wds[wd],
	    INOTIFY_DIR_ENTRIES_SIZE(ifd->ifd_wds[wd]->ide_count));
	ifd->ifd_wds[wd] = NULL;

	mutex_exit(&ifd->ifd_lock);

	wp = fd_getfile(wd);
	if (wp == NULL) {
		DPRINTF(("%s: wd=%d is already closed\n", __func__, wd));
		return 0;
	}
	KASSERT(!mutex_owned(wp->f_vnode->v_interlock));

	memset(&kev, 0, sizeof(kev));
	EV_SET(&kev, wd, EVFILT_VNODE, EV_DELETE, 0, 0, 0);
	error = kevent1(&retval, ifd->ifd_kqfd, &kev, 1, NULL, 0, NULL,
	    &k_ops);
	if (error != 0)
		DPRINTF(("%s: attempt to disable all events for wd=%d "
		    "had error=%d\n", __func__, wd, error));

	return fd_close(wd);
}

/*
 * inotify_rm_watch(2).  Close wd and remove it from ifd->ifd_wds.
 */
int
linux_sys_inotify_rm_watch(struct lwp *l,
    const struct linux_sys_inotify_rm_watch_args *uap, register_t *retval)
{
	/* {
		syscallarg(int) fd;
		syscallarg(int) wd;
	} */
	struct inotifyfd *ifd;
	file_t *fp;
	int error = 0;
	const int fd = SCARG(uap, fd);
	const int wd = SCARG(uap, wd);

	fp = fd_getfile(fd);
	if (fp == NULL)
		return EBADF;
	if (fp->f_ops != &inotify_fileops) {
		/* not an inotify fd */
		error = EINVAL;
		goto leave;
	}

	ifd = fp->f_data;
	if (wd < 0 || wd >= ifd->ifd_nwds || ifd->ifd_wds[wd] == NULL) {
		error = EINVAL;
		goto leave;
	}

	error = inotify_close_wd(ifd, wd);

leave:
	fd_putfile(fd);
	return error;
}

/*
 * Attach the inotify filter.
 */
static int
inotify_filt_attach(struct knote *kn)
{
	file_t *fp = kn->kn_obj;
	struct vnode *vp;

	KASSERT(fp->f_type == DTYPE_VNODE);
	vp = fp->f_vnode;

	/*
	 * Needs to be set so that we get the same event handling as
	 * EVFILT_VNODE.  Otherwise we don't get any events.
	 *
	 * A consequence of this is that modifications/removals of
	 * this knote need to specify EVFILT_VNODE rather than
	 * inotify_filtid.
	 */
	kn->kn_filter = EVFILT_VNODE;
	kn->kn_fop = &inotify_filtops;
	kn->kn_hook = vp;
	vn_knote_attach(vp, kn);

	return 0;
}

/*
 * Detach the inotify filter.
 */
static void
inotify_filt_detach(struct knote *kn)
{
	struct vnode *vp = (struct vnode *)kn->kn_hook;

	vn_knote_detach(vp, kn);
}

/*
 * Create a single inotify event.
 */
static void
do_kevent_to_inotify(int32_t wd, uint32_t mask, uint32_t cookie,
    struct inotify_entry *buf, size_t *nbuf, char *name)
{
	KASSERT(*nbuf < LINUX_INOTIFY_MAX_FROM_KEVENT);

	buf += *nbuf;
	memset(buf, 0, sizeof(*buf));

	buf->ie_event.wd = wd;
	buf->ie_event.mask = mask;
	buf->ie_event.cookie = cookie;

	if (name != NULL) {
		buf->ie_event.len = strlen(name) + 1;
		KASSERT(buf->ie_event.len < sizeof(buf->ie_name));
		strcpy(buf->ie_name, name);
	}

	++(*nbuf);
}

/*
 * Like vn_readdir(), but with vnode locking only if needs_lock is
 * true (to avoid double locking in some situations).
 */
static int
inotify_readdir(file_t *fp, struct dirent *dep, int *done, bool needs_lock)
{
	struct vnode *vp;
	struct iovec iov;
	struct uio uio;
	int error, eofflag;

	KASSERT(fp->f_type == DTYPE_VNODE);
	vp = fp->f_vnode;
	KASSERT(vp->v_type == VDIR);

	iov.iov_base = dep;
	iov.iov_len = sizeof(*dep);

	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_rw = UIO_READ;
	uio.uio_resid = sizeof(*dep);
	UIO_SETUP_SYSSPACE(&uio);

	mutex_enter(&fp->f_lock);
	uio.uio_offset = fp->f_offset;
	mutex_exit(&fp->f_lock);

	/* XXX: should pass whether to lock or not */
	if (needs_lock)
		vn_lock(vp, LK_SHARED | LK_RETRY);
	else
		/*
		 * XXX We need to temporarily drop v_interlock because
		 * it may be temporarily acquired by biowait().
		 */
		mutex_exit(vp->v_interlock);

	KASSERT(!mutex_owned(vp->v_interlock));
	error = VOP_READDIR(vp, &uio, fp->f_cred, &eofflag, NULL, NULL);

	if (needs_lock)
		VOP_UNLOCK(vp);
	else
		mutex_enter(vp->v_interlock);

	mutex_enter(&fp->f_lock);
	fp->f_offset = uio.uio_offset;
	mutex_exit(&fp->f_lock);

	*done = sizeof(*dep) - uio.uio_resid;
	return error;
}

/*
 * Create (and allocate) an appropriate inotify_dir_entries struct for wd
 * to be used in ifd_wds of the inotifyfd.  If the entries of a directory
 * fail to be read, NULL is returned.  needs_lock indicates whether the
 * vnode's lock is not already held.
 */
static struct inotify_dir_entries *
get_inotify_dir_entries(int wd, bool needs_lock)
{
	struct dirent de;
	struct dirent *currdep;
	struct inotify_dir_entries *idep = NULL;
	file_t *wp;
	int done, error;
	size_t i, decount;

	wp = fd_getfile(wd);
	if (wp == NULL)
		return NULL;
	if (wp->f_type != DTYPE_VNODE)
		goto leave;

	/* for non-directories, we have 0 entries.
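	 * (The empty snapshot still marks the slot in ifd_wds as in use.)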
*/ if (wp->f_vnode->v_type != VDIR) { idep = kmem_zalloc(INOTIFY_DIR_ENTRIES_SIZE(0), KM_SLEEP); goto leave; } mutex_enter(&wp->f_lock); wp->f_offset = 0; mutex_exit(&wp->f_lock); decount = 0; for (;;) { error = inotify_readdir(wp, &de, &done, needs_lock); if (error != 0) goto leave; if (done == 0) break; currdep = &de; while ((char *)currdep < ((char *)&de) + done) { decount++; currdep = _DIRENT_NEXT(currdep); } } idep = kmem_zalloc(INOTIFY_DIR_ENTRIES_SIZE(decount), KM_SLEEP); idep->ide_count = decount; mutex_enter(&wp->f_lock); wp->f_offset = 0; mutex_exit(&wp->f_lock); for (i = 0; i < decount;) { error = inotify_readdir(wp, &de, &done, needs_lock); if (error != 0 || done == 0) { kmem_free(idep, INOTIFY_DIR_ENTRIES_SIZE(decount)); idep = NULL; goto leave; } currdep = &de; while ((char *)currdep < ((char *)&de) + done) { idep->ide_entries[i].fileno = currdep->d_fileno; strcpy(idep->ide_entries[i].name, currdep->d_name); currdep = _DIRENT_NEXT(currdep); i++; } } leave: fd_putfile(wd); return idep; } static size_t find_entry(struct inotify_dir_entries *i1, struct inotify_dir_entries *i2) { for (size_t i = 0; i < i2->ide_count; i++) if (i2->ide_entries[i].fileno != i1->ide_entries[i].fileno) return i; KASSERTMSG(0, "Entry not found"); return -1; } static void handle_write(struct inotifyfd *ifd, int wd, struct inotify_entry *buf, size_t *nbuf) { struct inotify_dir_entries *old_idep, *new_idep; size_t i; mutex_enter(&ifd->ifd_lock); old_idep = ifd->ifd_wds[wd]; KASSERT(old_idep != NULL); new_idep = get_inotify_dir_entries(wd, false); if (new_idep == NULL) { DPRINTF(("%s: directory for wd=%d could not be read\n", __func__, wd)); mutex_exit(&ifd->ifd_lock); return; } if (old_idep->ide_count < new_idep->ide_count) { KASSERT(old_idep->ide_count + 1 == new_idep->ide_count); /* Find the new entry. */ i = find_entry(new_idep, old_idep); do_kevent_to_inotify(wd, LINUX_IN_CREATE, 0, buf, nbuf, new_idep->ide_entries[i].name); goto out; } if (old_idep->ide_count > new_idep->ide_count) { KASSERT(old_idep->ide_count == new_idep->ide_count + 1); /* Find the deleted entry. */ i = find_entry(old_idep, new_idep); do_kevent_to_inotify(wd, LINUX_IN_DELETE, 0, buf, nbuf, old_idep->ide_entries[i].name); goto out; } /* * XXX Because we are not watching the entire * file system, the only time we know for sure * that the event is a LINUX_IN_MOVED_FROM/ * LINUX_IN_MOVED_TO is when the move happens * within a single directory... ie. the number * of directory entries has not changed. * * Otherwise all we can say for sure is that * something was created/deleted. So we issue a * LINUX_IN_CREATE/LINUX_IN_DELETE. */ ino_t changed = new_idep->ide_entries[new_idep->ide_count - 1].fileno; /* Find the deleted entry. */ for (i = 0; i < old_idep->ide_count; i++) if (old_idep->ide_entries[i].fileno == changed) break; KASSERT(i != old_idep->ide_count); do_kevent_to_inotify(wd, LINUX_IN_MOVED_FROM, changed, buf, nbuf, old_idep->ide_entries[i].name); do_kevent_to_inotify(wd, LINUX_IN_MOVED_TO, changed, buf, nbuf, new_idep->ide_entries[new_idep->ide_count - 1].name); out: ifd->ifd_wds[wd] = new_idep; mutex_exit(&ifd->ifd_lock); } /* * Convert a kevent flags and fflags for EVFILT_VNODE to some number * of inotify events. 
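 *
 * For example, NOTE_CLOSE_WRITE on a regular file becomes a single
 * LINUX_IN_CLOSE_WRITE entry, while NOTE_WRITE on a directory is
 * ambiguous and is resolved by handle_write(), which diffs the old and
 * new directory snapshots to emit LINUX_IN_CREATE, LINUX_IN_DELETE or
 * a LINUX_IN_MOVED_FROM/LINUX_IN_MOVED_TO pair.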
 */
static int
kevent_to_inotify(struct inotifyfd *ifd, int wd, enum vtype wtype,
    uint32_t flags, uint32_t fflags, struct inotify_entry *buf,
    size_t *nbuf)
{
	struct stat st;
	file_t *wp;
	size_t i;
	int error = 0;

	for (i = 0; i < common_kevent_to_inotify_len; i++)
		if (fflags & common_kevent_to_inotify[i].kevent)
			do_kevent_to_inotify(wd,
			    common_kevent_to_inotify[i].inotify, 0,
			    buf, nbuf, NULL);

	if (wtype == VREG) {
		for (i = 0; i < vreg_kevent_to_inotify_len; i++)
			if (fflags & vreg_kevent_to_inotify[i].kevent)
				do_kevent_to_inotify(wd,
				    vreg_kevent_to_inotify[i].inotify,
				    0, buf, nbuf, NULL);
	} else if (wtype == VDIR) {
		for (i = 0; i < *nbuf; i++)
			if (buf[i].ie_event.mask &
			    (LINUX_IN_ACCESS|LINUX_IN_ATTRIB
			    |LINUX_IN_CLOSE|LINUX_IN_OPEN))
				buf[i].ie_event.mask |= LINUX_IN_ISDIR;

		/* Need to disambiguate the possible NOTE_WRITEs. */
		if (fflags & NOTE_WRITE)
			handle_write(ifd, wd, buf, nbuf);
	}

	/*
	 * Need to check if wd actually has a link count of 0 to issue a
	 * LINUX_IN_DELETE_SELF.
	 */
	if (fflags & NOTE_DELETE) {
		wp = fd_getfile(wd);
		KASSERT(wp != NULL);
		KASSERT(wp->f_type == DTYPE_VNODE);
		vn_stat(wp->f_vnode, &st);
		fd_putfile(wd);

		if (st.st_nlink == 0)
			do_kevent_to_inotify(wd, LINUX_IN_DELETE_SELF, 0,
			    buf, nbuf, NULL);
	}

	/* LINUX_IN_IGNORED must be the last event issued for wd. */
	if ((flags & EV_ONESHOT) || (fflags & (NOTE_REVOKE|NOTE_DELETE))) {
		do_kevent_to_inotify(wd, LINUX_IN_IGNORED, 0, buf, nbuf,
		    NULL);
		/*
		 * XXX in theory we could call inotify_close_wd(ifd, wd) but
		 * if we get here we must already be holding v_interlock for
		 * wd... so we can't.
		 *
		 * For simplicity we do nothing, and so wd will only be
		 * closed when the inotify fd is closed.
		 */
	}

	return error;
}

/*
 * Handle an event.  Unlike EVFILT_VNODE, we translate the event to a
 * linux_inotify_event and put it in our own custom queue.
 */
static int
inotify_filt_event(struct knote *kn, long hint)
{
	struct vnode *vp = (struct vnode *)kn->kn_hook;
	struct inotifyfd *ifd;
	struct inotify_entry *cur_ie;
	size_t nbuf, i;
	uint32_t status;
	struct inotify_entry buf[LINUX_INOTIFY_MAX_FROM_KEVENT];

	/*
	 * If KN_WILLDETACH is set then
	 * 1. kn->kn_kevent.udata has already been trashed with a
	 *    struct lwp *, so we don't have access to a real ifd
	 *    anymore, and
	 * 2. we're about to detach anyway, so we don't really care
	 *    about the events.
	 * (Also because of this we need to get ifd under the same
	 * lock as kn->kn_status.)
	 */
	mutex_enter(&kn->kn_kq->kq_lock);
	status = kn->kn_status;
	ifd = kn->kn_kevent.udata;
	mutex_exit(&kn->kn_kq->kq_lock);

	if (status & KN_WILLDETACH)
		return 0;

	/*
	 * If we don't care about the NOTEs in hint, we don't generate
	 * any events.
	 */
	hint &= kn->kn_sfflags;
	if (hint == 0)
		return 0;

	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT(!mutex_owned(&ifd->ifd_lock));

	mutex_enter(&ifd->ifd_qlock);

	/*
	 * early out: there's no point even translating the event if we
	 * have nowhere to put it (and a LINUX_IN_Q_OVERFLOW has
	 * already been added).
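	 *
	 * (When the queue is full a single LINUX_IN_Q_OVERFLOW entry
	 * with wd = -1 is queued instead, mirroring Linux behaviour.)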
*/ if (ifd->ifd_qcount >= LINUX_INOTIFY_MAX_QUEUED) goto leave; nbuf = 0; (void)kevent_to_inotify(ifd, kn->kn_id, vp->v_type, kn->kn_flags, hint, buf, &nbuf); for (i = 0; i < nbuf && ifd->ifd_qcount < LINUX_INOTIFY_MAX_QUEUED-1; i++) { cur_ie = kmem_zalloc(sizeof(*cur_ie), KM_SLEEP); memcpy(cur_ie, &buf[i], sizeof(*cur_ie)); TAILQ_INSERT_TAIL(&ifd->ifd_qhead, cur_ie, ie_entries); ifd->ifd_qcount++; } /* handle early overflow, by adding an overflow event to the end */ if (i != nbuf) { nbuf = 0; cur_ie = kmem_zalloc(sizeof(*cur_ie), KM_SLEEP); do_kevent_to_inotify(-1, LINUX_IN_Q_OVERFLOW, 0, cur_ie, &nbuf, NULL); TAILQ_INSERT_TAIL(&ifd->ifd_qhead, cur_ie, ie_entries); ifd->ifd_qcount++; } if (nbuf > 0) { cv_signal(&ifd->ifd_qcv); mutex_enter(&ifd->ifd_lock); selnotify(&ifd->ifd_sel, 0, NOTE_LOWAT); mutex_exit(&ifd->ifd_lock); } else DPRINTF(("%s: hint=%lx resulted in 0 inotify events\n", __func__, hint)); leave: mutex_exit(&ifd->ifd_qlock); return 0; } /* * Read inotify events from the queue. */ static int inotify_read(file_t *fp, off_t *offp, struct uio *uio, kauth_cred_t cred, int flags) { struct inotify_entry *cur_iep; size_t cur_size, nread; int error = 0; struct inotifyfd *ifd = fp->f_data; mutex_enter(&ifd->ifd_qlock); if (ifd->ifd_qcount == 0) { if (fp->f_flag & O_NONBLOCK) { error = EAGAIN; goto leave; } while (ifd->ifd_qcount == 0) { /* wait until there is an event to read */ error = cv_wait_sig(&ifd->ifd_qcv, &ifd->ifd_qlock); if (error != 0) { error = EINTR; goto leave; } } } KASSERT(ifd->ifd_qcount > 0); KASSERT(mutex_owned(&ifd->ifd_qlock)); nread = 0; while (ifd->ifd_qcount > 0) { cur_iep = TAILQ_FIRST(&ifd->ifd_qhead); KASSERT(cur_iep != NULL); cur_size = sizeof(cur_iep->ie_event) + cur_iep->ie_event.len; if (cur_size > uio->uio_resid) { if (nread == 0) error = EINVAL; break; } error = uiomove(&cur_iep->ie_event, sizeof(cur_iep->ie_event), uio); if (error != 0) break; error = uiomove(&cur_iep->ie_name, cur_iep->ie_event.len, uio); if (error != 0) break; /* cleanup */ TAILQ_REMOVE(&ifd->ifd_qhead, cur_iep, ie_entries); kmem_free(cur_iep, sizeof(*cur_iep)); nread++; ifd->ifd_qcount--; } leave: /* Wake up the next reader, if the queue is not empty. */ if (ifd->ifd_qcount > 0) cv_signal(&ifd->ifd_qcv); mutex_exit(&ifd->ifd_qlock); return error; } /* * Close all the file descriptors associated with fp. */ static int inotify_close(file_t *fp) { int error; size_t i; file_t *kqfp; struct inotifyfd *ifd = fp->f_data; for (i = 0; i < ifd->ifd_nwds; i++) { if (ifd->ifd_wds[i] != NULL) { error = inotify_close_wd(ifd, i); if (error != 0) return error; } } /* the reference we need to hold is ifd->ifd_kqfp */ kqfp = fd_getfile(ifd->ifd_kqfd); if (kqfp == NULL) { DPRINTF(("%s: kqfp=%d is already closed\n", __func__, ifd->ifd_kqfd)); } else { error = fd_close(ifd->ifd_kqfd); if (error != 0) return error; } mutex_destroy(&ifd->ifd_lock); mutex_destroy(&ifd->ifd_qlock); cv_destroy(&ifd->ifd_qcv); seldestroy(&ifd->ifd_sel); kmem_free(ifd->ifd_wds, ifd->ifd_nwds * sizeof(*ifd->ifd_wds)); kmem_free(ifd, sizeof(*ifd)); fp->f_data = NULL; return 0; } /* * Check if there are pending read events. */ static int inotify_poll(file_t *fp, int events) { int revents; struct inotifyfd *ifd = fp->f_data; revents = 0; if (events & (POLLIN|POLLRDNORM)) { mutex_enter(&ifd->ifd_qlock); if (ifd->ifd_qcount > 0) revents |= events & (POLLIN|POLLRDNORM); mutex_exit(&ifd->ifd_qlock); } return revents; } /* * Attach EVFILT_READ to the inotify instance in fp. * * This is so you can watch inotify with epoll. 
No other kqueue * filter needs to be supported. */ static int inotify_kqfilter(file_t *fp, struct knote *kn) { struct inotifyfd *ifd = fp->f_data; KASSERT(fp == kn->kn_obj); if (kn->kn_filter != EVFILT_READ) return EINVAL; kn->kn_fop = &inotify_read_filtops; mutex_enter(&ifd->ifd_lock); selrecord_knote(&ifd->ifd_sel, kn); mutex_exit(&ifd->ifd_lock); return 0; } /* * Detach a filter from an inotify instance. */ static void inotify_read_filt_detach(struct knote *kn) { struct inotifyfd *ifd = ((file_t *)kn->kn_obj)->f_data; mutex_enter(&ifd->ifd_lock); selremove_knote(&ifd->ifd_sel, kn); mutex_exit(&ifd->ifd_lock); } /* * Handle EVFILT_READ events. Note that nothing is put in kn_data. */ static int inotify_read_filt_event(struct knote *kn, long hint) { struct inotifyfd *ifd = ((file_t *)kn->kn_obj)->f_data; if (hint != 0) { KASSERT(mutex_owned(&ifd->ifd_lock)); KASSERT(mutex_owned(&ifd->ifd_qlock)); KASSERT(hint == NOTE_LOWAT); kn->kn_data = ifd->ifd_qcount; } return kn->kn_data > 0; } /* * Restart the inotify instance. */ static void inotify_restart(file_t *fp) { struct inotifyfd *ifd = fp->f_data; mutex_enter(&ifd->ifd_qlock); cv_broadcast(&ifd->ifd_qcv); mutex_exit(&ifd->ifd_qlock); }