diff options
Diffstat (limited to 'sys/fs')
96 files changed, 10404 insertions, 2168 deletions
diff --git a/sys/fs/cd9660/cd9660_lookup.c b/sys/fs/cd9660/cd9660_lookup.c index 569ee631416c..75fcdc9152cd 100644 --- a/sys/fs/cd9660/cd9660_lookup.c +++ b/sys/fs/cd9660/cd9660_lookup.c @@ -47,8 +47,8 @@ #include <fs/cd9660/iso_rrip.h> struct cd9660_ino_alloc_arg { - cd_ino_t ino; - cd_ino_t i_ino; + ino_t ino; + ino_t i_ino; struct iso_directory_record *ep; }; @@ -115,7 +115,7 @@ cd9660_lookup(struct vop_cachedlookup_args *ap) struct cd9660_ino_alloc_arg dd_arg; u_long bmask; /* block offset mask */ int error; - cd_ino_t ino, i_ino; + ino_t ino, i_ino; int ltype, reclen; u_short namelen; int isoflags; @@ -125,7 +125,7 @@ cd9660_lookup(struct vop_cachedlookup_args *ap) char *name; struct vnode **vpp = ap->a_vpp; struct componentname *cnp = ap->a_cnp; - int flags = cnp->cn_flags; + uint64_t flags = cnp->cn_flags; int nameiop = cnp->cn_nameiop; ep2 = ep = NULL; diff --git a/sys/fs/cd9660/cd9660_node.c b/sys/fs/cd9660/cd9660_node.c index 67270b40f2b0..ce6ec3aa7a1c 100644 --- a/sys/fs/cd9660/cd9660_node.c +++ b/sys/fs/cd9660/cd9660_node.c @@ -281,10 +281,10 @@ cd9660_tstamp_conv17(u_char *pi, struct timespec *pu) return cd9660_tstamp_conv7(buf, pu, ISO_FTYPE_DEFAULT); } -cd_ino_t +ino_t isodirino(struct iso_directory_record *isodir, struct iso_mnt *imp) { - cd_ino_t ino; + ino_t ino; /* * Note there is an inverse calculation in @@ -293,7 +293,7 @@ isodirino(struct iso_directory_record *isodir, struct iso_mnt *imp) * and also a calculation of the isodir pointer * from an inode in cd9660_vnops.c:cd9660_readlink() */ - ino = ((cd_ino_t)isonum_733(isodir->extent) + + ino = ((ino_t)isonum_733(isodir->extent) + isonum_711(isodir->ext_attr_length)) << imp->im_bshift; return ino; } diff --git a/sys/fs/cd9660/cd9660_node.h b/sys/fs/cd9660/cd9660_node.h index 9dc84dd57c0e..6021c1681c5d 100644 --- a/sys/fs/cd9660/cd9660_node.h +++ b/sys/fs/cd9660/cd9660_node.h @@ -56,7 +56,7 @@ typedef struct { struct iso_node { struct vnode *i_vnode; /* vnode associated with this inode */ - 
cd_ino_t i_number; /* the identity of the inode */ + ino_t i_number; /* the identity of the inode */ /* we use the actual starting block of the file */ struct iso_mnt *i_mnt; /* filesystem associated with this inode */ struct lockf *i_lockf; /* head of byte-level lock list */ diff --git a/sys/fs/cd9660/cd9660_rrip.c b/sys/fs/cd9660/cd9660_rrip.c index 26825062d25a..d0b0008d10b2 100644 --- a/sys/fs/cd9660/cd9660_rrip.c +++ b/sys/fs/cd9660/cd9660_rrip.c @@ -593,7 +593,7 @@ static RRIP_TABLE rrip_table_getname[] = { int cd9660_rrip_getname(struct iso_directory_record *isodir, char *outbuf, - u_short *outlen, cd_ino_t *inump, struct iso_mnt *imp) + u_short *outlen, ino_t *inump, struct iso_mnt *imp) { ISO_RRIP_ANALYZE analyze; RRIP_TABLE *tab; diff --git a/sys/fs/cd9660/cd9660_vfsops.c b/sys/fs/cd9660/cd9660_vfsops.c index f067453d3458..b4db4c4f7331 100644 --- a/sys/fs/cd9660/cd9660_vfsops.c +++ b/sys/fs/cd9660/cd9660_vfsops.c @@ -394,7 +394,7 @@ iso_mountfs(struct vnode *devvp, struct mount *mp) isomp->im_mountp = mp; isomp->im_dev = dev; isomp->im_devvp = devvp; - isomp->im_fmask = isomp->im_dmask = ACCESSPERMS; + isomp->im_fmask = isomp->im_dmask = ALLPERMS; vfs_flagopt(mp->mnt_optnew, "norrip", &isomp->im_flags, ISOFSMNT_NORRIP); vfs_flagopt(mp->mnt_optnew, "gens", &isomp->im_flags, ISOFSMNT_GENS); @@ -560,7 +560,7 @@ cd9660_root(struct mount *mp, int flags, struct vnode **vpp) struct iso_mnt *imp = VFSTOISOFS(mp); struct iso_directory_record *dp = (struct iso_directory_record *)imp->root; - cd_ino_t ino = isodirino(dp, imp); + ino_t ino = isodirino(dp, imp); /* * With RRIP we must use the `.' entry of the root directory. 
@@ -660,15 +660,15 @@ static int cd9660_vfs_hash_cmp(struct vnode *vp, void *pino) { struct iso_node *ip; - cd_ino_t ino; + ino_t ino; ip = VTOI(vp); - ino = *(cd_ino_t *)pino; + ino = *(ino_t *)pino; return (ip->i_number != ino); } int -cd9660_vget_internal(struct mount *mp, cd_ino_t ino, int flags, +cd9660_vget_internal(struct mount *mp, ino_t ino, int flags, struct vnode **vpp, int relocated, struct iso_directory_record *isodir) { struct iso_mnt *imp; diff --git a/sys/fs/cd9660/cd9660_vnops.c b/sys/fs/cd9660/cd9660_vnops.c index 33ca58472490..c4d0e6ba7b30 100644 --- a/sys/fs/cd9660/cd9660_vnops.c +++ b/sys/fs/cd9660/cd9660_vnops.c @@ -443,7 +443,7 @@ cd9660_readdir(struct vop_readdir_args *ap) u_short namelen; u_int ncookies = 0; uint64_t *cookies = NULL; - cd_ino_t ino; + ino_t ino; dp = VTOI(vdp); imp = dp->i_mnt; @@ -758,6 +758,9 @@ cd9660_pathconf(struct vop_pathconf_args *ap) /* NOTREACHED */ } +_Static_assert(sizeof(struct ifid) <= sizeof(struct fid), + "struct ifid must be no larger than struct fid"); + /* * Vnode pointer to File handle */ diff --git a/sys/fs/cd9660/iso.h b/sys/fs/cd9660/iso.h index a9733f62c077..40047cc92de6 100644 --- a/sys/fs/cd9660/iso.h +++ b/sys/fs/cd9660/iso.h @@ -212,21 +212,12 @@ struct iso_extended_attributes { u_char len_au [ISODCL (247, 250)]; /* 723 */ }; -#ifdef _KERNEL +#if defined(_KERNEL) || defined(_WANT_ISO_MNT) /* CD-ROM Format type */ enum ISO_FTYPE { ISO_FTYPE_DEFAULT, ISO_FTYPE_9660, ISO_FTYPE_RRIP, ISO_FTYPE_JOLIET, ISO_FTYPE_ECMA, ISO_FTYPE_HIGH_SIERRA }; -#ifndef ISOFSMNT_ROOT -#define ISOFSMNT_ROOT 0 -#endif - -/* - * When ino_t becomes 64-bit, we can remove this definition in favor of ino_t. 
- */ -typedef __uint64_t cd_ino_t; - struct iso_mnt { uint64_t im_flags; @@ -262,12 +253,16 @@ struct iso_mnt { void *im_l2d; }; +#endif /* defined(_KERNEL) || defined(_WANT_ISO_MNT) */ + +#ifdef _KERNEL + struct ifid { u_short ifid_len; u_short ifid_pad; - cd_ino_t ifid_ino; + ino_t ifid_ino; long ifid_start; -}; +} __packed; #define VFSTOISOFS(mp) ((struct iso_mnt *)((mp)->mnt_data)) @@ -276,7 +271,7 @@ struct ifid { #define lblkno(imp, loc) ((loc) >> (imp)->im_bshift) #define blksize(imp, ip, lbn) ((imp)->logical_block_size) -int cd9660_vget_internal(struct mount *, cd_ino_t, int, struct vnode **, int, +int cd9660_vget_internal(struct mount *, ino_t , int, struct vnode **, int, struct iso_directory_record *); #define cd9660_sysctl ((int (*)(int *, u_int, void *, size_t *, void *, \ size_t, struct proc *))eopnotsupp) @@ -287,7 +282,7 @@ extern struct vop_vector cd9660_fifoops; int isochar(u_char *, u_char *, int, u_short *, int *, int, void *); int isofncmp(u_char *, int, u_char *, int, int, int, void *, void *); void isofntrans(u_char *, int, u_char *, u_short *, int, int, int, int, void *); -cd_ino_t isodirino(struct iso_directory_record *, struct iso_mnt *); +ino_t isodirino(struct iso_directory_record *, struct iso_mnt *); u_short sgetrune(const char *, size_t, char const **, int, void *); #endif /* _KERNEL */ diff --git a/sys/fs/cd9660/iso_rrip.h b/sys/fs/cd9660/iso_rrip.h index bea0811eccf4..5a75beb08d93 100644 --- a/sys/fs/cd9660/iso_rrip.h +++ b/sys/fs/cd9660/iso_rrip.h @@ -63,7 +63,7 @@ typedef struct { off_t iso_ce_off; /* offset of continuation area */ int iso_ce_len; /* length of continuation area */ struct iso_mnt *imp; /* mount structure */ - cd_ino_t *inump; /* inode number pointer */ + ino_t *inump; /* inode number pointer */ char *outbuf; /* name/symbolic link output area */ u_short *outlen; /* length of above */ u_short maxlen; /* maximum length of above */ @@ -76,7 +76,7 @@ int cd9660_rrip_analyze(struct iso_directory_record *isodir, struct 
iso_node *inop, struct iso_mnt *imp); int cd9660_rrip_getname(struct iso_directory_record *isodir, char *outbuf, u_short *outlen, - cd_ino_t *inump, struct iso_mnt *imp); + ino_t *inump, struct iso_mnt *imp); int cd9660_rrip_getsymname(struct iso_directory_record *isodir, char *outbuf, u_short *outlen, struct iso_mnt *imp); diff --git a/sys/fs/cuse/cuse.c b/sys/fs/cuse/cuse.c index 9ef234c35427..d63a7d4691cf 100644 --- a/sys/fs/cuse/cuse.c +++ b/sys/fs/cuse/cuse.c @@ -191,13 +191,13 @@ static void cuse_client_kqfilter_write_detach(struct knote *kn); static int cuse_client_kqfilter_read_event(struct knote *kn, long hint); static int cuse_client_kqfilter_write_event(struct knote *kn, long hint); -static struct filterops cuse_client_kqfilter_read_ops = { +static const struct filterops cuse_client_kqfilter_read_ops = { .f_isfd = 1, .f_detach = cuse_client_kqfilter_read_detach, .f_event = cuse_client_kqfilter_read_event, }; -static struct filterops cuse_client_kqfilter_write_ops = { +static const struct filterops cuse_client_kqfilter_write_ops = { .f_isfd = 1, .f_detach = cuse_client_kqfilter_write_detach, .f_event = cuse_client_kqfilter_write_event, @@ -332,7 +332,7 @@ cuse_kern_uninit(void *arg) mtx_destroy(&cuse_global_mtx); } -SYSUNINIT(cuse_kern_uninit, SI_SUB_DEVFS, SI_ORDER_ANY, cuse_kern_uninit, 0); +SYSUNINIT(cuse_kern_uninit, SI_SUB_DEVFS, SI_ORDER_ANY, cuse_kern_uninit, NULL); static int cuse_server_get(struct cuse_server **ppcs) diff --git a/sys/fs/devfs/devfs_devs.c b/sys/fs/devfs/devfs_devs.c index db879efe803a..124f9f0449af 100644 --- a/sys/fs/devfs/devfs_devs.c +++ b/sys/fs/devfs/devfs_devs.c @@ -86,6 +86,9 @@ sysctl_devname(SYSCTL_HANDLER_ARGS) struct cdev_priv *cdp; struct cdev *dev; + if (req->newptr == NULL) + return (EINVAL); + #ifdef COMPAT_FREEBSD11 if (req->newlen == sizeof(ud_compat)) { error = SYSCTL_IN(req, &ud_compat, sizeof(ud_compat)); @@ -118,11 +121,8 @@ SYSCTL_PROC(_kern, OID_AUTO, devname, 
CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_ANYBODY|CTLFLAG_MPSAFE, NULL, 0, sysctl_devname, "", "devname(3) handler"); -SYSCTL_INT(_debug_sizeof, OID_AUTO, cdev, CTLFLAG_RD, - SYSCTL_NULL_INT_PTR, sizeof(struct cdev), "sizeof(struct cdev)"); - -SYSCTL_INT(_debug_sizeof, OID_AUTO, cdev_priv, CTLFLAG_RD, - SYSCTL_NULL_INT_PTR, sizeof(struct cdev_priv), "sizeof(struct cdev_priv)"); +SYSCTL_SIZEOF_STRUCT(cdev); +SYSCTL_SIZEOF_STRUCT(cdev_priv); struct cdev * devfs_alloc(int flags) diff --git a/sys/fs/devfs/devfs_vnops.c b/sys/fs/devfs/devfs_vnops.c index a35f6dbf9520..1d744e6593c0 100644 --- a/sys/fs/devfs/devfs_vnops.c +++ b/sys/fs/devfs/devfs_vnops.c @@ -66,7 +66,7 @@ static struct vop_vector devfs_vnodeops; static struct vop_vector devfs_specops; -static struct fileops devfs_ops_f; +static const struct fileops devfs_ops_f; #include <fs/devfs/devfs.h> #include <fs/devfs/devfs_int.h> @@ -555,8 +555,7 @@ loop: if (devfs_allocv_drop_refs(0, dmp, de)) { vput(vp); return (ENOENT); - } - else if (VN_IS_DOOMED(vp)) { + } else if (VN_IS_DOOMED(vp)) { mtx_lock(&devfs_de_interlock); if (de->de_vnode == vp) { de->de_vnode = NULL; @@ -1516,6 +1515,8 @@ devfs_readdir(struct vop_readdir_args *ap) */ if (tmp_ncookies != NULL) ap->a_ncookies = tmp_ncookies; + if (dd == NULL && error == 0 && ap->a_eofflag != NULL) + *ap->a_eofflag = 1; return (error); } @@ -2038,7 +2039,7 @@ devfs_cmp_f(struct file *fp1, struct file *fp2, struct thread *td) return (kcmp_cmp((uintptr_t)fp1->f_data, (uintptr_t)fp2->f_data)); } -static struct fileops devfs_ops_f = { +static const struct fileops devfs_ops_f = { .fo_read = devfs_read_f, .fo_write = devfs_write_f, .fo_truncate = devfs_truncate_f, diff --git a/sys/fs/ext2fs/ext2_extents.c b/sys/fs/ext2fs/ext2_extents.c index 3ae1da4fe6b7..146aa48f6743 100644 --- a/sys/fs/ext2fs/ext2_extents.c +++ b/sys/fs/ext2fs/ext2_extents.c @@ -711,7 +711,7 @@ ext4_ext_tree_init(struct inode *ip) ip->i_flag |= IN_E4EXTENTS; - memset(ip->i_data, 0, EXT2_NDADDR + EXT2_NIADDR); + 
memset(ip->i_data, 0, sizeof(ip->i_data)); ehp = (struct ext4_extent_header *)ip->i_data; ehp->eh_magic = htole16(EXT4_EXT_MAGIC); ehp->eh_max = htole16(ext4_ext_space_root(ip)); diff --git a/sys/fs/ext2fs/ext2_vfsops.c b/sys/fs/ext2fs/ext2_vfsops.c index bffbf4546f37..9e7a03fffd71 100644 --- a/sys/fs/ext2fs/ext2_vfsops.c +++ b/sys/fs/ext2fs/ext2_vfsops.c @@ -1345,7 +1345,7 @@ ext2_fhtovp(struct mount *mp, struct fid *fhp, int flags, struct vnode **vpp) return (ESTALE); } *vpp = nvp; - vnode_create_vobject(*vpp, 0, curthread); + vnode_create_vobject(*vpp, ip->i_size, curthread); return (0); } diff --git a/sys/fs/ext2fs/ext2_vnops.c b/sys/fs/ext2fs/ext2_vnops.c index dfbb11f75421..064c10bd18b2 100644 --- a/sys/fs/ext2fs/ext2_vnops.c +++ b/sys/fs/ext2fs/ext2_vnops.c @@ -1889,6 +1889,8 @@ ext2_vptofh(struct vop_vptofh_args *ap) { struct inode *ip; struct ufid *ufhp; + _Static_assert(sizeof(struct ufid) <= sizeof(struct fid), + "struct ufid cannot be larger than struct fid"); ip = VTOI(ap->a_vp); ufhp = (struct ufid *)ap->a_fhp; diff --git a/sys/fs/ext2fs/inode.h b/sys/fs/ext2fs/inode.h index 9ee1b5672da6..c45339bfde40 100644 --- a/sys/fs/ext2fs/inode.h +++ b/sys/fs/ext2fs/inode.h @@ -187,10 +187,10 @@ struct indir { /* This overlays the fid structure (see mount.h). */ struct ufid { - uint16_t ufid_len; /* Length of structure. */ - uint16_t ufid_pad; /* Force 32-bit alignment. */ - ino_t ufid_ino; /* File number (ino). */ - uint32_t ufid_gen; /* Generation number. */ + uint16_t ufid_len; /* Length of structure. */ + uint16_t ufid_pad; /* Force 32-bit alignment. */ + uint32_t ufid_gen; /* Generation number. */ + ino_t ufid_ino; /* File number (ino). 
*/ }; #endif /* _KERNEL */ diff --git a/sys/fs/fdescfs/fdesc_vnops.c b/sys/fs/fdescfs/fdesc_vnops.c index 9ec80794e795..58a22b8bdc50 100644 --- a/sys/fs/fdescfs/fdesc_vnops.c +++ b/sys/fs/fdescfs/fdesc_vnops.c @@ -502,7 +502,7 @@ fdesc_setattr(struct vop_setattr_args *ap) cap_rights_init_one(&rights, CAP_EXTATTR_SET), &fp); } else { error = getvnode_path(td, fd, - cap_rights_init_one(&rights, CAP_EXTATTR_SET), &fp); + cap_rights_init_one(&rights, CAP_EXTATTR_SET), NULL, &fp); } if (error) { /* @@ -547,6 +547,8 @@ fdesc_readdir(struct vop_readdir_args *ap) fmp = VFSTOFDESC(ap->a_vp->v_mount); if (ap->a_ncookies != NULL) *ap->a_ncookies = 0; + if (ap->a_eofflag != NULL) + *ap->a_eofflag = 0; off = (int)uio->uio_offset; if (off != uio->uio_offset || off < 0 || (u_int)off % UIO_MX != 0 || @@ -559,7 +561,12 @@ fdesc_readdir(struct vop_readdir_args *ap) fcnt = i - 2; /* The first two nodes are `.' and `..' */ FILEDESC_SLOCK(fdp); - while (i < fdp->fd_nfiles + 2 && uio->uio_resid >= UIO_MX) { + while (uio->uio_resid >= UIO_MX) { + if (i >= fdp->fd_nfiles + 2) { + if (ap->a_eofflag != NULL) + *ap->a_eofflag = 1; + break; + } bzero((caddr_t)dp, UIO_MX); switch (i) { case 0: /* `.' 
*/ @@ -639,7 +646,7 @@ fdesc_readlink(struct vop_readlink_args *va) VOP_UNLOCK(vn); td = curthread; - error = fget_cap(td, fd_fd, &cap_no_rights, &fp, NULL); + error = fget_cap(td, fd_fd, &cap_no_rights, NULL, &fp, NULL); if (error != 0) goto out; diff --git a/sys/fs/fuse/fuse_device.c b/sys/fs/fuse/fuse_device.c index 892793993ecc..57b3559731f7 100644 --- a/sys/fs/fuse/fuse_device.c +++ b/sys/fs/fuse/fuse_device.c @@ -82,6 +82,8 @@ #include <sys/sysctl.h> #include <sys/poll.h> #include <sys/selinfo.h> +#define EXTERR_CATEGORY EXTERR_CAT_FUSE +#include <sys/exterrvar.h> #include "fuse.h" #include "fuse_internal.h" @@ -120,13 +122,13 @@ static int fuse_device_filt_read(struct knote *kn, long hint); static int fuse_device_filt_write(struct knote *kn, long hint); static void fuse_device_filt_detach(struct knote *kn); -struct filterops fuse_device_rfiltops = { +static const struct filterops fuse_device_rfiltops = { .f_isfd = 1, .f_detach = fuse_device_filt_detach, .f_event = fuse_device_filt_read, }; -struct filterops fuse_device_wfiltops = { +static const struct filterops fuse_device_wfiltops = { .f_isfd = 1, .f_event = fuse_device_filt_write, }; @@ -152,7 +154,7 @@ fdata_dtor(void *arg) FUSE_LOCK(); fuse_lck_mtx_lock(fdata->aw_mtx); /* wakup poll()ers */ - selwakeuppri(&fdata->ks_rsel, PZERO + 1); + selwakeuppri(&fdata->ks_rsel, PZERO); /* Don't let syscall handlers wait in vain */ while ((tick = fuse_aw_pop(fdata))) { fuse_lck_mtx_lock(tick->tk_aw_mtx); @@ -193,7 +195,7 @@ fuse_device_filter(struct cdev *dev, struct knote *kn) kn->kn_fop = &fuse_device_wfiltops; error = 0; } else if (error == 0) { - error = EINVAL; + error = EXTERROR(EINVAL, "Unsupported kevent filter"); kn->kn_data = error; } @@ -319,7 +321,7 @@ again: "we know early on that reader should be kicked so we " "don't wait for news"); fuse_lck_mtx_unlock(data->ms_mtx); - return (ENODEV); + return (EXTERROR(ENODEV, "This FUSE session is about to be closed")); } if (!(tick = fuse_ms_pop(data))) { /* check 
if we may block */ @@ -331,7 +333,10 @@ again: err = msleep(data, &data->ms_mtx, PCATCH, "fu_msg", 0); if (err != 0) { fuse_lck_mtx_unlock(data->ms_mtx); - return (fdata_get_dead(data) ? ENODEV : err); + if (fdata_get_dead(data)) + err = EXTERROR(ENODEV, + "This FUSE session is about to be closed"); + return (err); } tick = fuse_ms_pop(data); } @@ -361,8 +366,8 @@ again: FUSE_ASSERT_MS_DONE(tick); fuse_ticket_drop(tick); } - return (ENODEV); /* This should make the daemon get off - * of us */ + /* This should make the daemon get off of us */ + return (EXTERROR(ENODEV, "This FUSE session is about to be closed")); } SDT_PROBE2(fusefs, , device, trace, 1, "fuse device read message successfully"); @@ -385,7 +390,7 @@ again: fdata_set_dead(data); SDT_PROBE2(fusefs, , device, trace, 2, "daemon is stupid, kick it off..."); - err = ENODEV; + err = EXTERROR(ENODEV, "Partial read attempted"); } else { err = uiomove(buf, buflen, uio); } @@ -403,12 +408,14 @@ fuse_ohead_audit(struct fuse_out_header *ohead, struct uio *uio) SDT_PROBE2(fusefs, , device, trace, 1, "Format error: body size " "differs from size claimed by header"); - return (EINVAL); + return (EXTERROR(EINVAL, "Format error: body size " + "differs from size claimed by header")); } if (uio->uio_resid && ohead->unique != 0 && ohead->error) { SDT_PROBE2(fusefs, , device, trace, 1, "Format error: non zero error but message had a body"); - return (EINVAL); + return (EXTERROR(EINVAL, "Format error: non zero error, " + "but message had a body")); } return (0); @@ -439,13 +446,12 @@ fuse_device_write(struct cdev *dev, struct uio *uio, int ioflag) err = devfs_get_cdevpriv((void **)&data); if (err != 0) return (err); - mp = data->mp; if (uio->uio_resid < sizeof(struct fuse_out_header)) { SDT_PROBE2(fusefs, , device, trace, 1, "fuse_device_write got less than a header!"); fdata_set_dead(data); - return (EINVAL); + return (EXTERROR(EINVAL, "fuse_device_write got less than a header!")); } if ((err = uiomove(&ohead, 
sizeof(struct fuse_out_header), uio)) != 0) return (err); @@ -453,7 +459,7 @@ fuse_device_write(struct cdev *dev, struct uio *uio, int ioflag) if (data->linux_errnos != 0 && ohead.error != 0) { err = -ohead.error; if (err < 0 || err >= nitems(linux_to_bsd_errtbl)) - return (EINVAL); + return (EXTERROR(EINVAL, "Unknown Linux errno", err)); /* '-', because it will get flipped again below */ ohead.error = -linux_to_bsd_errtbl[err]; @@ -521,7 +527,7 @@ fuse_device_write(struct cdev *dev, struct uio *uio, int ioflag) memcpy(&tick->tk_aw_ohead, &ohead, sizeof(ohead)); tick->tk_aw_handler(tick, uio); - err = EINVAL; + err = EXTERROR(EINVAL, "Unknown errno", ohead.error); } else { memcpy(&tick->tk_aw_ohead, &ohead, sizeof(ohead)); @@ -542,6 +548,13 @@ fuse_device_write(struct cdev *dev, struct uio *uio, int ioflag) } else if (ohead.unique == 0){ /* unique == 0 means asynchronous notification */ SDT_PROBE1(fusefs, , device, fuse_device_write_notify, &ohead); + mp = data->mp; + vfs_ref(mp); + err = vfs_busy(mp, 0); + vfs_rel(mp); + if (err) + return (err); + switch (ohead.error) { case FUSE_NOTIFY_INVAL_ENTRY: err = fuse_internal_invalidate_entry(mp, uio); @@ -564,8 +577,10 @@ fuse_device_write(struct cdev *dev, struct uio *uio, int ioflag) /* Unimplemented. See comments in fuse_vnops */ default: /* Not implemented */ - err = ENOSYS; + err = EXTERROR(ENOSYS, "Unimplemented FUSE notification code", + ohead.error); } + vfs_unbusy(mp); } else { /* no callback at all! 
*/ SDT_PROBE1(fusefs, , device, fuse_device_write_missing_ticket, @@ -582,7 +597,7 @@ fuse_device_write(struct cdev *dev, struct uio *uio, int ioflag) */ err = 0; } else { - err = EINVAL; + err = EXTERROR(EINVAL, "FUSE ticket is missing"); } } diff --git a/sys/fs/fuse/fuse_file.c b/sys/fs/fuse/fuse_file.c index 88de12d59425..5f5819c2ccae 100644 --- a/sys/fs/fuse/fuse_file.c +++ b/sys/fs/fuse/fuse_file.c @@ -122,7 +122,6 @@ fuse_filehandle_open(struct vnode *vp, int a_mode, struct fuse_filehandle **fufhp, struct thread *td, struct ucred *cred) { struct mount *mp = vnode_mount(vp); - struct fuse_data *data = fuse_get_mpdata(mp); struct fuse_dispatcher fdi; const struct fuse_open_out default_foo = { .fh = 0, @@ -132,12 +131,10 @@ fuse_filehandle_open(struct vnode *vp, int a_mode, struct fuse_open_in *foi = NULL; const struct fuse_open_out *foo; fufh_type_t fufh_type; - int dataflags = data->dataflags; int err = 0; int oflags = 0; int op = FUSE_OPEN; int relop = FUSE_RELEASE; - int fsess_no_op_support = FSESS_NO_OPEN_SUPPORT; fufh_type = fflags_2_fufh_type(a_mode); oflags = fufh_type_2_fflags(fufh_type); @@ -145,12 +142,11 @@ fuse_filehandle_open(struct vnode *vp, int a_mode, if (vnode_isdir(vp)) { op = FUSE_OPENDIR; relop = FUSE_RELEASEDIR; - fsess_no_op_support = FSESS_NO_OPENDIR_SUPPORT; /* vn_open_vnode already rejects FWRITE on directories */ MPASS(fufh_type == FUFH_RDONLY || fufh_type == FUFH_EXEC); } fdisp_init(&fdi, sizeof(*foi)); - if (fsess_not_impl(mp, op) && dataflags & fsess_no_op_support) { + if (fsess_not_impl(mp, op)) { /* The operation implicitly succeeds */ foo = &default_foo; } else { @@ -160,7 +156,7 @@ fuse_filehandle_open(struct vnode *vp, int a_mode, foi->flags = oflags; err = fdisp_wait_answ(&fdi); - if (err == ENOSYS && dataflags & fsess_no_op_support) { + if (err == ENOSYS) { /* The operation implicitly succeeds */ foo = &default_foo; fsess_set_notimpl(mp, op); @@ -174,6 +170,7 @@ fuse_filehandle_open(struct vnode *vp, int a_mode, goto out; } 
else { foo = fdi.answ; + fsess_set_impl(mp, op); } } diff --git a/sys/fs/fuse/fuse_file.h b/sys/fs/fuse/fuse_file.h index 2a90e66d1b23..232132473953 100644 --- a/sys/fs/fuse/fuse_file.h +++ b/sys/fs/fuse/fuse_file.h @@ -139,7 +139,7 @@ struct fuse_filehandle { /* * flags returned by FUSE_OPEN - * Supported flags: FOPEN_DIRECT_IO, FOPEN_KEEP_CACHE + * Supported flags: FOPEN_DIRECT_IO, FOPEN_KEEP_CACHE, FOPEN_NOFLUSH * Unsupported: * FOPEN_NONSEEKABLE: Adding support would require a new per-file * or per-vnode attribute, which would have to be checked by diff --git a/sys/fs/fuse/fuse_internal.c b/sys/fs/fuse/fuse_internal.c index 29d88fc942f4..61fe2ed032f6 100644 --- a/sys/fs/fuse/fuse_internal.c +++ b/sys/fs/fuse/fuse_internal.c @@ -282,12 +282,12 @@ fuse_internal_cache_attrs(struct vnode *vp, struct fuse_attr *attr, * dirty writes! That's a server bug. */ if (fuse_libabi_geq(data, 7, 23)) { - msg = "writeback cache incoherent!." + msg = "writeback cache incoherent! " "To prevent data corruption, disable " "the writeback cache according to your " "FUSE server's documentation."; } else { - msg = "writeback cache incoherent!." + msg = "writeback cache incoherent! 
" "To prevent data corruption, disable " "the writeback cache by setting " "vfs.fusefs.data_cache_mode to 0 or 1."; @@ -979,6 +979,9 @@ fuse_internal_init_callback(struct fuse_ticket *tick, struct uio *uio) struct fuse_data *data = tick->tk_data; struct fuse_init_out *fiio = NULL; + if (fdata_get_dead(data)) + goto out; + if ((err = tick->tk_aw_ohead.error)) { goto out; } @@ -1010,10 +1013,6 @@ fuse_internal_init_callback(struct fuse_ticket *tick, struct uio *uio) data->dataflags |= FSESS_POSIX_LOCKS; if (fiio->flags & FUSE_EXPORT_SUPPORT) data->dataflags |= FSESS_EXPORT_SUPPORT; - if (fiio->flags & FUSE_NO_OPEN_SUPPORT) - data->dataflags |= FSESS_NO_OPEN_SUPPORT; - if (fiio->flags & FUSE_NO_OPENDIR_SUPPORT) - data->dataflags |= FSESS_NO_OPENDIR_SUPPORT; /* * Don't bother to check FUSE_BIG_WRITES, because it's * redundant with max_write diff --git a/sys/fs/fuse/fuse_io.c b/sys/fs/fuse/fuse_io.c index 00b348814642..0760d7641c7d 100644 --- a/sys/fs/fuse/fuse_io.c +++ b/sys/fs/fuse/fuse_io.c @@ -932,7 +932,7 @@ fuse_io_invalbuf(struct vnode *vp, struct thread *td) if (vp->v_mount->mnt_kern_flag & MNTK_UNMOUNTF) return EIO; fvdat->flag |= FN_FLUSHWANT; - tsleep(&fvdat->flag, PRIBIO + 2, "fusevinv", 2 * hz); + tsleep(&fvdat->flag, PRIBIO, "fusevinv", 2 * hz); error = 0; if (p != NULL) { PROC_LOCK(p); diff --git a/sys/fs/fuse/fuse_ipc.c b/sys/fs/fuse/fuse_ipc.c index f1f9f801bf4d..0b6048644d32 100644 --- a/sys/fs/fuse/fuse_ipc.c +++ b/sys/fs/fuse/fuse_ipc.c @@ -443,11 +443,6 @@ retry: if (err == EWOULDBLOCK) { SDT_PROBE2(fusefs, , ipc, trace, 3, "fticket_wait_answer: EWOULDBLOCK"); -#ifdef XXXIP /* die conditionally */ - if (!fdata_get_dead(data)) { - fdata_set_dead(data); - } -#endif err = ETIMEDOUT; fticket_set_answered(ftick); } else if ((err == EINTR || err == ERESTART)) { @@ -593,7 +588,7 @@ fdata_set_dead(struct fuse_data *data) fuse_lck_mtx_lock(data->ms_mtx); data->dataflags |= FSESS_DEAD; wakeup_one(data); - selwakeuppri(&data->ks_rsel, PZERO + 1); + 
selwakeuppri(&data->ks_rsel, PZERO); wakeup(&data->ticketer); fuse_lck_mtx_unlock(data->ms_mtx); FUSE_UNLOCK(); @@ -669,7 +664,7 @@ fuse_insert_message(struct fuse_ticket *ftick, bool urgent) else fuse_ms_push(ftick); wakeup_one(ftick->tk_data); - selwakeuppri(&ftick->tk_data->ks_rsel, PZERO + 1); + selwakeuppri(&ftick->tk_data->ks_rsel, PZERO); KNOTE_LOCKED(&ftick->tk_data->ks_rsel.si_note, 0); fuse_lck_mtx_unlock(ftick->tk_data->ms_mtx); } diff --git a/sys/fs/fuse/fuse_ipc.h b/sys/fs/fuse/fuse_ipc.h index 0ec556138be0..3bfc859dbac9 100644 --- a/sys/fs/fuse/fuse_ipc.h +++ b/sys/fs/fuse/fuse_ipc.h @@ -227,8 +227,6 @@ struct fuse_data { /* (and being observed by the daemon) */ #define FSESS_PUSH_SYMLINKS_IN 0x0020 /* prefix absolute symlinks with mp */ #define FSESS_DEFAULT_PERMISSIONS 0x0040 /* kernel does permission checking */ -#define FSESS_NO_OPEN_SUPPORT 0x0080 /* can elide FUSE_OPEN ops */ -#define FSESS_NO_OPENDIR_SUPPORT 0x0100 /* can elide FUSE_OPENDIR ops */ #define FSESS_ASYNC_READ 0x1000 /* allow multiple reads of some file */ #define FSESS_POSIX_LOCKS 0x2000 /* daemon supports POSIX locks */ #define FSESS_EXPORT_SUPPORT 0x10000 /* daemon supports NFS-style lookups */ @@ -240,6 +238,8 @@ struct fuse_data { #define FSESS_WARN_WB_CACHE_INCOHERENT 0x400000 /* WB cache incoherent */ #define FSESS_WARN_ILLEGAL_INODE 0x800000 /* Illegal inode for new file */ #define FSESS_WARN_READLINK_EMBEDDED_NUL 0x1000000 /* corrupt READLINK output */ +#define FSESS_WARN_DOT_LOOKUP 0x2000000 /* Inconsistent . 
LOOKUP response */ +#define FSESS_WARN_INODE_MISMATCH 0x4000000 /* ino != nodeid */ #define FSESS_MNTOPTS_MASK ( \ FSESS_DAEMON_CAN_SPY | FSESS_PUSH_SYMLINKS_IN | \ FSESS_DEFAULT_PERMISSIONS | FSESS_INTR) diff --git a/sys/fs/fuse/fuse_kernel.h b/sys/fs/fuse/fuse_kernel.h index ad93a26adaab..942448b47365 100644 --- a/sys/fs/fuse/fuse_kernel.h +++ b/sys/fs/fuse/fuse_kernel.h @@ -161,6 +161,33 @@ * - add FOPEN_CACHE_DIR * - add FUSE_MAX_PAGES, add max_pages to init_out * - add FUSE_CACHE_SYMLINKS + * + * 7.29 + * - add FUSE_NO_OPENDIR_SUPPORT flag + * + * 7.30 + * - add FUSE_EXPLICIT_INVAL_DATA + * - add FUSE_IOCTL_COMPAT_X32 + * + * 7.31 + * - add FUSE_WRITE_KILL_PRIV flag + * - add FUSE_SETUPMAPPING and FUSE_REMOVEMAPPING + * - add map_alignment to fuse_init_out, add FUSE_MAP_ALIGNMENT flag + * + * 7.32 + * - add flags to fuse_attr, add FUSE_ATTR_SUBMOUNT, add FUSE_SUBMOUNTS + * + * 7.33 + * - add FUSE_HANDLE_KILLPRIV_V2, FUSE_WRITE_KILL_SUIDGID, FATTR_KILL_SUIDGID + * - add FUSE_OPEN_KILL_SUIDGID + * - extend fuse_setxattr_in, add FUSE_SETXATTR_EXT + * - add FUSE_SETXATTR_ACL_KILL_SGID + * + * 7.34 + * - add FUSE_SYNCFS + * + * 7.35 + * - add FOPEN_NOFLUSH */ #ifndef _FUSE_FUSE_KERNEL_H @@ -196,7 +223,7 @@ #define FUSE_KERNEL_VERSION 7 /** Minor version number of this interface */ -#define FUSE_KERNEL_MINOR_VERSION 29 +#define FUSE_KERNEL_MINOR_VERSION 35 /** The node ID of the root inode */ #define FUSE_ROOT_ID 1 @@ -220,7 +247,7 @@ struct fuse_attr { uint32_t gid; uint32_t rdev; uint32_t blksize; - uint32_t padding; + uint32_t flags; }; struct fuse_kstatfs { @@ -257,6 +284,7 @@ struct fuse_file_lock { #define FATTR_MTIME_NOW (1 << 8) #define FATTR_LOCKOWNER (1 << 9) #define FATTR_CTIME (1 << 10) +#define FATTR_KILL_SUIDGID (1 << 11) /** * Flags returned by the OPEN request @@ -265,11 +293,15 @@ struct fuse_file_lock { * FOPEN_KEEP_CACHE: don't invalidate the data cache on open * FOPEN_NONSEEKABLE: the file is not seekable * FOPEN_CACHE_DIR: allow caching this 
directory + * FOPEN_STREAM: the file is stream-like (no file position at all) + * FOPEN_NOFLUSH: don't flush data cache on close (unless FUSE_WRITEBACK_CACHE) */ #define FOPEN_DIRECT_IO (1 << 0) #define FOPEN_KEEP_CACHE (1 << 1) #define FOPEN_NONSEEKABLE (1 << 2) #define FOPEN_CACHE_DIR (1 << 3) +#define FOPEN_STREAM (1 << 4) +#define FOPEN_NOFLUSH (1 << 5) /** * INIT request/reply flags @@ -299,6 +331,17 @@ struct fuse_file_lock { * FUSE_MAX_PAGES: init_out.max_pages contains the max number of req pages * FUSE_CACHE_SYMLINKS: cache READLINK responses * FUSE_NO_OPENDIR_SUPPORT: kernel supports zero-message opendir + * FUSE_EXPLICIT_INVAL_DATA: only invalidate cached pages on explicit request + * FUSE_MAP_ALIGNMENT: init_out.map_alignment contains log2(byte alignment) for + * foffset and moffset fields in struct + * fuse_setupmapping_out and fuse_removemapping_one. + * FUSE_SUBMOUNTS: kernel supports auto-mounting directory submounts + * FUSE_HANDLE_KILLPRIV_V2: fs kills suid/sgid/cap on write/chown/trunc. + * Upon write/truncate suid/sgid is only killed if caller + * does not have CAP_FSETID. Additionally upon + * write/truncate sgid is killed only if file has group + * execute permission. (Same as Linux VFS behavior). 
+ * FUSE_SETXATTR_EXT: Server supports extended struct fuse_setxattr_in */ #define FUSE_ASYNC_READ (1 << 0) #define FUSE_POSIX_LOCKS (1 << 1) @@ -325,6 +368,11 @@ struct fuse_file_lock { #define FUSE_MAX_PAGES (1 << 22) #define FUSE_CACHE_SYMLINKS (1 << 23) #define FUSE_NO_OPENDIR_SUPPORT (1 << 24) +#define FUSE_EXPLICIT_INVAL_DATA (1 << 25) +#define FUSE_MAP_ALIGNMENT (1 << 26) +#define FUSE_SUBMOUNTS (1 << 27) +#define FUSE_HANDLE_KILLPRIV_V2 (1 << 28) +#define FUSE_SETXATTR_EXT (1 << 29) #ifdef linux /** @@ -356,9 +404,14 @@ struct fuse_file_lock { * * FUSE_WRITE_CACHE: delayed write from page cache, file handle is guessed * FUSE_WRITE_LOCKOWNER: lock_owner field is valid + * FUSE_WRITE_KILL_SUIDGID: kill suid and sgid bits */ #define FUSE_WRITE_CACHE (1 << 0) #define FUSE_WRITE_LOCKOWNER (1 << 1) +#define FUSE_WRITE_KILL_SUIDGID (1 << 2) + +/* Obsolete alias; this flag implies killing suid/sgid only. */ +#define FUSE_WRITE_KILL_PRIV FUSE_WRITE_KILL_SUIDGID /** * Read flags @@ -373,6 +426,7 @@ struct fuse_file_lock { * FUSE_IOCTL_RETRY: retry with new iovecs * FUSE_IOCTL_32BIT: 32bit ioctl * FUSE_IOCTL_DIR: is a directory + * FUSE_IOCTL_COMPAT_X32: x32 compat ioctl on 64bit machine (64bit time_t) * * FUSE_IOCTL_MAX_IOV: maximum of in_iovecs + out_iovecs */ @@ -381,6 +435,7 @@ struct fuse_file_lock { #define FUSE_IOCTL_RETRY (1 << 2) #define FUSE_IOCTL_32BIT (1 << 3) #define FUSE_IOCTL_DIR (1 << 4) +#define FUSE_IOCTL_COMPAT_X32 (1 << 5) #define FUSE_IOCTL_MAX_IOV 256 @@ -404,6 +459,25 @@ struct fuse_file_lock { #define FUSE_FALLOC_FL_KEEP_SIZE 0x1 #define FUSE_FALLOC_FL_PUNCH_HOLE 0x2 +/** + * fuse_attr flags + * + * FUSE_ATTR_SUBMOUNT: Object is a submount root + */ +#define FUSE_ATTR_SUBMOUNT (1 << 0) + +/** + * Open flags + * FUSE_OPEN_KILL_SUIDGID: Kill suid and sgid if executable + */ +#define FUSE_OPEN_KILL_SUIDGID (1 << 0) + +/** + * setxattr flags + * FUSE_SETXATTR_ACL_KILL_SGID: Clear SGID when system.posix_acl_access is set + */ +#define 
FUSE_SETXATTR_ACL_KILL_SGID (1 << 0) + enum fuse_opcode { FUSE_LOOKUP = 1, FUSE_FORGET = 2, /* no reply */ @@ -450,10 +524,16 @@ enum fuse_opcode { FUSE_RENAME2 = 45, FUSE_LSEEK = 46, FUSE_COPY_FILE_RANGE = 47, + FUSE_SETUPMAPPING = 48, + FUSE_REMOVEMAPPING = 49, + FUSE_SYNCFS = 50, #ifdef linux /* CUSE specific operations */ CUSE_INIT = 4096, + /* Reserved opcodes: helpful to detect structure endian-ness */ + CUSE_INIT_BSWAP_RESERVED = 1048576, /* CUSE_INIT << 8 */ + FUSE_INIT_BSWAP_RESERVED = 436207616, /* FUSE_INIT << 24 */ #endif /* linux */ }; @@ -561,14 +641,14 @@ struct fuse_setattr_in { struct fuse_open_in { uint32_t flags; - uint32_t unused; + uint32_t open_flags; /* FUSE_OPEN_... */ }; struct fuse_create_in { uint32_t flags; uint32_t mode; uint32_t umask; - uint32_t padding; + uint32_t open_flags; /* FUSE_OPEN_... */ }; struct fuse_open_out { @@ -630,9 +710,13 @@ struct fuse_fsync_in { uint32_t padding; }; +#define FUSE_COMPAT_SETXATTR_IN_SIZE 8 + struct fuse_setxattr_in { uint32_t size; uint32_t flags; + uint32_t setxattr_flags; + uint32_t padding; }; struct fuse_listxattr_in { @@ -692,7 +776,7 @@ struct fuse_init_out { uint32_t max_write; uint32_t time_gran; uint16_t max_pages; - uint16_t padding; + uint16_t map_alignment; uint32_t unused[8]; }; @@ -863,6 +947,10 @@ struct fuse_notify_retrieve_in { uint64_t dummy4; }; +/* Device ioctls: */ +#define FUSE_DEV_IOC_MAGIC 229 +#define FUSE_DEV_IOC_CLONE _IOR(FUSE_DEV_IOC_MAGIC, 0, uint32_t) + struct fuse_lseek_in { uint64_t fh; uint64_t offset; @@ -884,4 +972,38 @@ struct fuse_copy_file_range_in { uint64_t flags; }; +#define FUSE_SETUPMAPPING_FLAG_WRITE (1ull << 0) +#define FUSE_SETUPMAPPING_FLAG_READ (1ull << 1) +struct fuse_setupmapping_in { + /* An already open handle */ + uint64_t fh; + /* Offset into the file to start the mapping */ + uint64_t foffset; + /* Length of mapping required */ + uint64_t len; + /* Flags, FUSE_SETUPMAPPING_FLAG_* */ + uint64_t flags; + /* Offset in Memory Window */ + uint64_t 
moffset; +}; + +struct fuse_removemapping_in { + /* number of fuse_removemapping_one entries that follow */ + uint32_t count; +}; + +struct fuse_removemapping_one { + /* Offset into the dax window to start the unmapping */ + uint64_t moffset; + /* Length of mapping required */ + uint64_t len; +}; + +#define FUSE_REMOVEMAPPING_MAX_ENTRY \ + (PAGE_SIZE / sizeof(struct fuse_removemapping_one)) + +struct fuse_syncfs_in { + uint64_t padding; +}; + #endif /* _FUSE_FUSE_KERNEL_H */ diff --git a/sys/fs/fuse/fuse_node.c b/sys/fs/fuse/fuse_node.c index 777519450954..742dc66bcafc 100644 --- a/sys/fs/fuse/fuse_node.c +++ b/sys/fs/fuse/fuse_node.c @@ -297,6 +297,8 @@ fuse_vnode_get(struct mount *mp, __enum_uint8(vtype) vtyp) { struct thread *td = curthread; + bool exportable = fuse_get_mpdata(mp)->dataflags & FSESS_EXPORT_SUPPORT; + /* * feo should only be NULL for the root directory, which (when libfuse * is used) always has generation 0 @@ -309,6 +311,23 @@ fuse_vnode_get(struct mount *mp, "Assigned same inode to both parent and child."); return EIO; } + if (feo && feo->nodeid != feo->attr.ino && exportable) { + /* + * NFS servers (both kernelspace and userspace) rely on + * VFS_VGET to look up inodes. But that's only possible if the + * file's inode number matches its nodeid, which isn't + * necessarily the case for FUSE. If they don't match, then we + * can complete the current operation, but future VFS_VGET + * operations will almost certainly return spurious results. + * Warn the operator. + * + * But only warn the operator if the file system reports + * NFS-compatibility, because that's the only time that this + * matters, and dumb fuse servers abound. 
+ */ + fuse_warn(fuse_get_mpdata(mp), FSESS_WARN_INODE_MISMATCH, + "file has different inode number and nodeid."); + } err = fuse_vnode_alloc(mp, td, nodeid, vtyp, vpp); if (err) { @@ -354,7 +373,7 @@ void fuse_vnode_open(struct vnode *vp, int32_t fuse_open_flags, struct thread *td) { if (vnode_vtype(vp) == VREG) - vnode_create_vobject(vp, 0, td); + vnode_create_vobject(vp, VNODE_NO_SIZE, td); } int diff --git a/sys/fs/fuse/fuse_vfsops.c b/sys/fs/fuse/fuse_vfsops.c index e088f92bf5bf..1b858a988289 100644 --- a/sys/fs/fuse/fuse_vfsops.c +++ b/sys/fs/fuse/fuse_vfsops.c @@ -81,6 +81,8 @@ #include <sys/mount.h> #include <sys/sysctl.h> #include <sys/fcntl.h> +#define EXTERR_CATEGORY EXTERR_CAT_FUSE +#include <sys/exterrvar.h> #include "fuse.h" #include "fuse_node.h" @@ -272,7 +274,7 @@ fuse_vfsop_fhtovp(struct mount *mp, struct fid *fhp, int flags, int error; if (!(fuse_get_mpdata(mp)->dataflags & FSESS_EXPORT_SUPPORT)) - return EOPNOTSUPP; + return (EXTERROR(EOPNOTSUPP, "NFS-style lookups are not supported")); error = VFS_VGET(mp, ffhp->nid, LK_EXCLUSIVE, &nvp); if (error) { @@ -286,7 +288,7 @@ fuse_vfsop_fhtovp(struct mount *mp, struct fid *fhp, int flags, return (ESTALE); } *vpp = nvp; - vnode_create_vobject(*vpp, 0, curthread); + vnode_create_vobject(*vpp, VNODE_NO_SIZE, curthread); return (0); } @@ -321,11 +323,11 @@ fuse_vfsop_mount(struct mount *mp) opts = mp->mnt_optnew; if (!opts) - return EINVAL; + return (EXTERROR(EINVAL, "Mount options were not supplied")); /* `fspath' contains the mount point (eg. /mnt/fuse/sshfs); REQUIRED */ if (!vfs_getopts(opts, "fspath", &err)) - return err; + return (EXTERROR(err, "Mount options are missing 'fspath'")); /* * With the help of underscored options the mount program @@ -358,11 +360,12 @@ fuse_vfsop_mount(struct mount *mp) /* `from' contains the device name (eg. 
/dev/fuse0); REQUIRED */ fspec = vfs_getopts(opts, "from", &err); if (!fspec) - return err; + return (EXTERROR(err, "Mount options are missing 'from'")); /* `fd' contains the filedescriptor for this session; REQUIRED */ if (vfs_scanopt(opts, "fd", "%d", &fd) != 1) - return EINVAL; + return (EXTERROR(EINVAL, "Mount options contain an invalid value " + "for 'fd'")); err = fuse_getdevice(fspec, td, &fdev); if (err != 0) @@ -398,11 +401,17 @@ fuse_vfsop_mount(struct mount *mp) /* Sanity + permission checks */ if (!data->daemoncred) panic("fuse daemon found, but identity unknown"); - if (mntopts & FSESS_DAEMON_CAN_SPY) + if (mntopts & FSESS_DAEMON_CAN_SPY) { err = priv_check(td, PRIV_VFS_FUSE_ALLOWOTHER); - if (err == 0 && td->td_ucred->cr_uid != data->daemoncred->cr_uid) + EXTERROR(err, "FUSE daemon requires privileges " + "due to 'allow_other' option"); + } + if (err == 0 && td->td_ucred->cr_uid != data->daemoncred->cr_uid) { /* are we allowed to do the first mount? */ err = priv_check(td, PRIV_VFS_FUSE_MOUNT_NONUSER); + EXTERROR(err, "Mounting as a user that is different from the FUSE " + "daemon's requires privileges"); + } if (err) { FUSE_UNLOCK(); goto out; @@ -549,7 +558,7 @@ fuse_vfsop_vget(struct mount *mp, ino_t ino, int flags, struct vnode **vpp) * nullfs mount of a fusefs file system. */ SDT_PROBE1(fusefs, , vfsops, invalidate_without_export, mp); - return (EOPNOTSUPP); + return (EXTERROR(EOPNOTSUPP, "NFS-style lookups are not supported")); } error = fuse_internal_get_cached_vnode(mp, ino, flags, vpp); @@ -565,15 +574,28 @@ fuse_vfsop_vget(struct mount *mp, ino_t ino, int flags, struct vnode **vpp) error = fdisp_wait_answ(&fdi); if (error) - return error; + goto out; feo = (struct fuse_entry_out *)fdi.answ; + if (feo->nodeid == 0) { /* zero nodeid means ENOENT and cache it */ error = ENOENT; goto out; } + if (feo->nodeid != nodeid) { + /* + * Something is very wrong with the server if "foo/." has a + * different inode number than "foo". 
+ */ + static const char exterr[] = "Inconsistent LOOKUP response: " + "\"FILE/.\" has a different inode number than \"FILE\"."; + fuse_warn(data, FSESS_WARN_DOT_LOOKUP, exterr); + error = EXTERROR(EIO, exterr); + goto out; + } + vtyp = IFTOVT(feo->attr.mode); error = fuse_vnode_get(mp, feo, nodeid, NULL, vpp, NULL, vtyp); if (error) diff --git a/sys/fs/fuse/fuse_vnops.c b/sys/fs/fuse/fuse_vnops.c index 3f8f3322162a..ae28617537fd 100644 --- a/sys/fs/fuse/fuse_vnops.c +++ b/sys/fs/fuse/fuse_vnops.c @@ -89,6 +89,8 @@ #include <sys/buf.h> #include <sys/sysctl.h> #include <sys/vmmeter.h> +#define EXTERR_CATEGORY EXTERR_CAT_FUSE +#include <sys/exterrvar.h> #include <vm/vm.h> #include <vm/vm_extern.h> @@ -289,6 +291,10 @@ fuse_flush(struct vnode *vp, struct ucred *cred, pid_t pid, int fflag) if (err) return err; + if (fufh->fuse_open_flags & FOPEN_NOFLUSH && + (!fsess_opt_writeback(vnode_mount(vp)))) + return (0); + fdisp_init(&fdi, sizeof(*ffi)); fdisp_make_vp(&fdi, FUSE_FLUSH, vp, td, cred); ffi = fdi.indata; @@ -395,6 +401,9 @@ fuse_vnop_do_lseek(struct vnode *vp, struct thread *td, struct ucred *cred, err = fdisp_wait_answ(&fdi); if (err == ENOSYS) { fsess_set_notimpl(mp, FUSE_LSEEK); + } else if (err == ENXIO) { + /* Note: ENXIO means "no more hole/data regions until EOF" */ + fsess_set_impl(mp, FUSE_LSEEK); } else if (err == 0) { fsess_set_impl(mp, FUSE_LSEEK); flso = fdi.answ; @@ -432,7 +441,8 @@ fuse_vnop_access(struct vop_access_args *ap) if (vnode_isvroot(vp)) { return 0; } - return ENXIO; + return (EXTERROR(ENXIO, "This FUSE session is about " + "to be closed")); } if (!(data->dataflags & FSESS_INITED)) { if (vnode_isvroot(vp)) { @@ -441,7 +451,8 @@ fuse_vnop_access(struct vop_access_args *ap) return 0; } } - return EBADF; + return (EXTERROR(EBADF, "Access denied until FUSE session " + "is initialized")); } if (vnode_islnk(vp)) { return 0; @@ -482,7 +493,8 @@ fuse_vnop_advlock(struct vop_advlock_args *ap) dataflags = 
fuse_get_mpdata(vnode_mount(vp))->dataflags; if (fuse_isdeadfs(vp)) { - return ENXIO; + return (EXTERROR(ENXIO, "This FUSE session is about " + "to be closed")); } switch(ap->a_op) { @@ -499,7 +511,7 @@ fuse_vnop_advlock(struct vop_advlock_args *ap) op = FUSE_SETLK; break; default: - return EINVAL; + return (EXTERROR(EINVAL, "Unsupported lock flags")); } if (!(dataflags & FSESS_POSIX_LOCKS)) @@ -527,14 +539,14 @@ fuse_vnop_advlock(struct vop_advlock_args *ap) size = vattr.va_size; if (size > OFF_MAX || (fl->l_start > 0 && size > OFF_MAX - fl->l_start)) { - err = EOVERFLOW; + err = EXTERROR(EOVERFLOW, "Offset is too large"); goto out; } start = size + fl->l_start; break; default: - return (EINVAL); + return (EXTERROR(EINVAL, "Unsupported offset type")); } err = fuse_filehandle_get_anyflags(vp, &fufh, cred, pid); @@ -596,15 +608,14 @@ fuse_vnop_allocate(struct vop_allocate_args *ap) int err; if (fuse_isdeadfs(vp)) - return (ENXIO); + return (EXTERROR(ENXIO, "This FUSE session is about " + "to be closed")); switch (vp->v_type) { case VFIFO: return (ESPIPE); case VLNK: case VREG: - if (vfs_isrdonly(mp)) - return (EROFS); break; default: return (ENODEV); @@ -614,7 +625,8 @@ fuse_vnop_allocate(struct vop_allocate_args *ap) return (EROFS); if (fsess_not_impl(mp, FUSE_FALLOCATE)) - return (EINVAL); + return (EXTERROR(EINVAL, "This server does not implement " + "FUSE_FALLOCATE")); io.uio_offset = *offset; io.uio_resid = *len; @@ -644,13 +656,14 @@ fuse_vnop_allocate(struct vop_allocate_args *ap) if (err == ENOSYS) { fsess_set_notimpl(mp, FUSE_FALLOCATE); - err = EINVAL; + err = EXTERROR(EINVAL, "This server does not implement " + "FUSE_ALLOCATE"); } else if (err == EOPNOTSUPP) { /* * The file system server does not support FUSE_FALLOCATE with * the supplied mode for this particular file. 
*/ - err = EINVAL; + err = EXTERROR(EINVAL, "This file can't be pre-allocated"); } else if (!err) { *offset += *len; *len = 0; @@ -696,7 +709,8 @@ fuse_vnop_bmap(struct vop_bmap_args *ap) int maxrun; if (fuse_isdeadfs(vp)) { - return ENXIO; + return (EXTERROR(ENXIO, "This FUSE session is about " + "to be closed")); } mp = vnode_mount(vp); @@ -793,6 +807,9 @@ fuse_vnop_close(struct vop_close_args *ap) if (fflag & IO_NDELAY) return 0; + if (cred == NULL) + cred = td->td_ucred; + err = fuse_flush(vp, cred, pid, fflag); if (err == 0 && (fvdat->flag & FN_ATIMECHANGE) && !vfs_isrdonly(mp)) { struct vattr vap; @@ -860,19 +877,21 @@ fuse_vnop_copy_file_range(struct vop_copy_file_range_args *ap) pid_t pid; int err; - err = ENOSYS; if (mp == NULL || mp != vnode_mount(outvp)) - goto fallback; + return (EXTERROR(ENOSYS, "Mount points do not match")); if (incred->cr_uid != outcred->cr_uid) - goto fallback; + return (EXTERROR(ENOSYS, "FUSE_COPY_FILE_RANGE does not " + "support different credentials for infd and outfd")); if (incred->cr_groups[0] != outcred->cr_groups[0]) - goto fallback; + return (EXTERROR(ENOSYS, "FUSE_COPY_FILE_RANGE does not " + "support different credentials for infd and outfd")); /* Caller busied mp, mnt_data can be safely accessed. 
*/ if (fsess_not_impl(mp, FUSE_COPY_FILE_RANGE)) - goto fallback; + return (EXTERROR(ENOSYS, "This daemon does not " + "implement COPY_FILE_RANGE")); if (ap->a_fsizetd == NULL) td = curthread; @@ -882,7 +901,7 @@ fuse_vnop_copy_file_range(struct vop_copy_file_range_args *ap) vn_lock_pair(invp, false, LK_SHARED, outvp, false, LK_EXCLUSIVE); if (invp->v_data == NULL || outvp->v_data == NULL) { - err = EBADF; + err = EXTERROR(EBADF, "vnode got reclaimed"); goto unlock; } @@ -946,7 +965,6 @@ unlock: if (err == ENOSYS) fsess_set_notimpl(mp, FUSE_COPY_FILE_RANGE); -fallback: /* * No need to call vn_rlimit_fsizex_res before return, since the uio is @@ -1014,7 +1032,8 @@ fuse_vnop_create(struct vop_create_args *ap) int flags; if (fuse_isdeadfs(dvp)) - return ENXIO; + return (EXTERROR(ENXIO, "This FUSE session is about " + "to be closed")); /* FUSE expects sockets to be created with FUSE_MKNOD */ if (vap->va_type == VSOCK) @@ -1030,7 +1049,7 @@ fuse_vnop_create(struct vop_create_args *ap) bzero(&fdi, sizeof(fdi)); if (vap->va_type != VREG) - return (EINVAL); + return (EXTERROR(EINVAL, "Only regular files can be created")); if (fsess_not_impl(mp, FUSE_CREATE) || vap->va_type == VSOCK) { /* Fallback to FUSE_MKNOD/FUSE_OPEN */ @@ -1211,8 +1230,8 @@ fuse_vnop_getattr(struct vop_getattr_args *ap) if (!(dataflags & FSESS_INITED)) { if (!vnode_isvroot(vp)) { fdata_set_dead(fuse_get_mpdata(vnode_mount(vp))); - err = ENOTCONN; - return err; + return (EXTERROR(ENOTCONN, "FUSE daemon is not " + "initialized")); } else { goto fake; } @@ -1341,10 +1360,11 @@ fuse_vnop_link(struct vop_link_args *ap) int err; if (fuse_isdeadfs(vp)) { - return ENXIO; + return (EXTERROR(ENXIO, "This FUSE session is about " + "to be closed")); } if (vnode_mount(tdvp) != vnode_mount(vp)) { - return EXDEV; + return (EXDEV); } /* @@ -1354,7 +1374,7 @@ fuse_vnop_link(struct vop_link_args *ap) * validating that nlink does not overflow. 
*/ if (vap != NULL && vap->va_nlink >= FUSE_LINK_MAX) - return EMLINK; + return (EMLINK); fli.oldnodeid = VTOI(vp); fdisp_init(&fdi, 0); @@ -1366,12 +1386,13 @@ fuse_vnop_link(struct vop_link_args *ap) feo = fdi.answ; if (fli.oldnodeid != feo->nodeid) { + static const char exterr[] = "Server assigned wrong inode " + "for a hard link."; struct fuse_data *data = fuse_get_mpdata(vnode_mount(vp)); - fuse_warn(data, FSESS_WARN_ILLEGAL_INODE, - "Assigned wrong inode for a hard link."); + fuse_warn(data, FSESS_WARN_ILLEGAL_INODE, exterr); fuse_vnode_clear_attr_cache(vp); fuse_vnode_clear_attr_cache(tdvp); - err = EIO; + err = EXTERROR(EIO, exterr); goto out; } @@ -1428,8 +1449,8 @@ fuse_vnop_lookup(struct vop_lookup_args *ap) struct timespec now; int nameiop = cnp->cn_nameiop; - int flags = cnp->cn_flags; - int islastcn = flags & ISLASTCN; + bool isdotdot = cnp->cn_flags & ISDOTDOT; + bool islastcn = cnp->cn_flags & ISLASTCN; struct mount *mp = vnode_mount(dvp); struct fuse_data *data = fuse_get_mpdata(mp); int default_permissions = data->dataflags & FSESS_DEFAULT_PERMISSIONS; @@ -1448,7 +1469,8 @@ fuse_vnop_lookup(struct vop_lookup_args *ap) if (fuse_isdeadfs(dvp)) { *vpp = NULL; - return ENXIO; + return (EXTERROR(ENXIO, "This FUSE session is about " + "to be closed")); } if (!vnode_isdir(dvp)) return ENOTDIR; @@ -1462,14 +1484,14 @@ fuse_vnop_lookup(struct vop_lookup_args *ap) return err; is_dot = cnp->cn_namelen == 1 && *(cnp->cn_nameptr) == '.'; - if ((flags & ISDOTDOT) && !(data->dataflags & FSESS_EXPORT_SUPPORT)) - { + if (isdotdot && !(data->dataflags & FSESS_EXPORT_SUPPORT)) { if (!(VTOFUD(dvp)->flag & FN_PARENT_NID)) { /* * Since the file system doesn't support ".." lookups, * we have no way to find this entry. */ - return ESTALE; + return (EXTERROR(ESTALE, "This server does not support " + "'..' 
lookups")); } nid = VTOFUD(dvp)->parent_nid; if (nid == 0) @@ -1577,7 +1599,7 @@ fuse_vnop_lookup(struct vop_lookup_args *ap) } } else { /* Entry was found */ - if (flags & ISDOTDOT) { + if (isdotdot) { struct fuse_lookup_alloc_arg flaa; flaa.nid = nid; @@ -1592,11 +1614,11 @@ fuse_vnop_lookup(struct vop_lookup_args *ap) vref(dvp); *vpp = dvp; } else { + static const char exterr[] = "Server assigned " + "same inode to both parent and child."; fuse_warn(fuse_get_mpdata(mp), - FSESS_WARN_ILLEGAL_INODE, - "Assigned same inode to both parent and " - "child."); - err = EIO; + FSESS_WARN_ILLEGAL_INODE, exterr); + err = EXTERROR(EIO, exterr); } } else { @@ -1684,7 +1706,8 @@ fuse_vnop_mkdir(struct vop_mkdir_args *ap) struct fuse_mkdir_in fmdi; if (fuse_isdeadfs(dvp)) { - return ENXIO; + return (EXTERROR(ENXIO, "This FUSE session is about " + "to be closed")); } fmdi.mode = MAKEIMODE(vap->va_type, vap->va_mode); fmdi.umask = curthread->td_proc->p_pd->pd_cmask; @@ -1711,7 +1734,8 @@ fuse_vnop_mknod(struct vop_mknod_args *ap) struct vattr *vap = ap->a_vap; if (fuse_isdeadfs(dvp)) - return ENXIO; + return (EXTERROR(ENXIO, "This FUSE session is about " + "to be closed")); return fuse_internal_mknod(dvp, vpp, cnp, vap); } @@ -1735,11 +1759,13 @@ fuse_vnop_open(struct vop_open_args *ap) pid_t pid = td->td_proc->p_pid; if (fuse_isdeadfs(vp)) - return ENXIO; + return (EXTERROR(ENXIO, "This FUSE session is about " + "to be closed")); if (vp->v_type == VCHR || vp->v_type == VBLK || vp->v_type == VFIFO) - return (EOPNOTSUPP); + return (EXTERROR(EOPNOTSUPP, "Unsupported vnode type", + vp->v_type)); if ((a_mode & (FREAD | FWRITE | FEXEC)) == 0) - return EINVAL; + return (EXTERROR(EINVAL, "Illegal mode", a_mode)); if (fuse_filehandle_validrw(vp, a_mode, cred, pid)) { fuse_vnode_open(vp, 0, td); @@ -1754,6 +1780,9 @@ fuse_vnop_pathconf(struct vop_pathconf_args *ap) { struct vnode *vp = ap->a_vp; struct mount *mp; + struct fuse_filehandle *fufh; + int err; + bool closefufh = false; switch 
(ap->a_name) { case _PC_FILESIZEBITS: @@ -1783,22 +1812,45 @@ fuse_vnop_pathconf(struct vop_pathconf_args *ap) !fsess_not_impl(mp, FUSE_LSEEK)) { off_t offset = 0; - /* Issue a FUSE_LSEEK to find out if it's implemented */ - fuse_vnop_do_lseek(vp, curthread, curthread->td_ucred, - curthread->td_proc->p_pid, &offset, SEEK_DATA); + /* + * Issue a FUSE_LSEEK to find out if it's supported. + * Use SEEK_DATA instead of SEEK_HOLE, because the + * latter generally requires sequential scans of file + * metadata, which can be slow. + */ + err = fuse_vnop_do_lseek(vp, curthread, + curthread->td_ucred, curthread->td_proc->p_pid, + &offset, SEEK_DATA); + if (err == EBADF) { + /* + * pathconf() doesn't necessarily open the + * file. So we may need to do it here. + */ + err = fuse_filehandle_open(vp, FREAD, &fufh, + curthread, curthread->td_ucred); + if (err == 0) { + closefufh = true; + err = fuse_vnop_do_lseek(vp, curthread, + curthread->td_ucred, + curthread->td_proc->p_pid, &offset, + SEEK_DATA); + } + if (closefufh) + fuse_filehandle_close(vp, fufh, + curthread, curthread->td_ucred); + } + } if (fsess_is_impl(mp, FUSE_LSEEK)) { *ap->a_retval = 1; return (0); + } else if (fsess_not_impl(mp, FUSE_LSEEK)) { + /* FUSE_LSEEK is not implemented */ + return (EXTERROR(EINVAL, "This server does not " + "implement FUSE_LSEEK")); } else { - /* - * Probably FUSE_LSEEK is not implemented. It might - * be, if the FUSE_LSEEK above returned an error like - * EACCES, but in that case we can't tell, so it's - * safest to report EINVAL anyway. 
- */ - return (EINVAL); + return (err); } default: return (vop_stdpathconf(ap)); @@ -1830,7 +1882,8 @@ fuse_vnop_read(struct vop_read_args *ap) MPASS(vp->v_type == VREG || vp->v_type == VDIR); if (fuse_isdeadfs(vp)) { - return ENXIO; + return (EXTERROR(ENXIO, "This FUSE session is about " + "to be closed")); } if (VTOFUD(vp)->flag & FN_DIRECTIO) { @@ -1907,20 +1960,18 @@ fuse_vnop_readdir(struct vop_readdir_args *ap) if (ap->a_eofflag) *ap->a_eofflag = 0; if (fuse_isdeadfs(vp)) { - return ENXIO; - } - if ( /* XXXIP ((uio_iovcnt(uio) > 1)) || */ - (uio_resid(uio) < sizeof(struct dirent))) { - return EINVAL; + return (EXTERROR(ENXIO, "This FUSE session is about " + "to be closed")); } + if (uio_resid(uio) < sizeof(struct dirent)) + return (EXTERROR(EINVAL, "Buffer is too small")); tresid = uio->uio_resid; err = fuse_filehandle_get_dir(vp, &fufh, cred, pid); if (err == EBADF && mp->mnt_flag & MNT_EXPORTED) { - KASSERT(fuse_get_mpdata(mp)->dataflags - & FSESS_NO_OPENDIR_SUPPORT, - ("FUSE file systems that don't set " - "FUSE_NO_OPENDIR_SUPPORT should not be exported")); + KASSERT(!fsess_is_impl(mp, FUSE_OPENDIR), + ("FUSE file systems that implement " + "FUSE_OPENDIR should not be exported")); /* * nfsd will do VOP_READDIR without first doing VOP_OPEN. We * must implicitly open the directory here. 
@@ -1983,7 +2034,8 @@ fuse_vnop_readlink(struct vop_readlink_args *ap) int err; if (fuse_isdeadfs(vp)) { - return ENXIO; + return (EXTERROR(ENXIO, "This FUSE session is about " + "to be closed")); } if (!vnode_islnk(vp)) { return EINVAL; @@ -1994,10 +2046,11 @@ fuse_vnop_readlink(struct vop_readlink_args *ap) goto out; } if (strnlen(fdi.answ, fdi.iosize) + 1 < fdi.iosize) { + static const char exterr[] = "Server returned an embedded NUL " + "from FUSE_READLINK."; struct fuse_data *data = fuse_get_mpdata(vnode_mount(vp)); - fuse_warn(data, FSESS_WARN_READLINK_EMBEDDED_NUL, - "Returned an embedded NUL from FUSE_READLINK."); - err = EIO; + fuse_warn(data, FSESS_WARN_READLINK_EMBEDDED_NUL, exterr); + err = EXTERROR(EIO, exterr); goto out; } if (((char *)fdi.answ)[0] == '/' && @@ -2081,10 +2134,11 @@ fuse_vnop_remove(struct vop_remove_args *ap) int err; if (fuse_isdeadfs(vp)) { - return ENXIO; + return (EXTERROR(ENXIO, "This FUSE session is about " + "to be closed")); } if (vnode_isdir(vp)) { - return EPERM; + return (EXTERROR(EPERM, "vnode is a directory")); } err = fuse_internal_remove(dvp, vp, cnp, FUSE_UNLINK); @@ -2117,12 +2171,13 @@ fuse_vnop_rename(struct vop_rename_args *ap) int err = 0; if (fuse_isdeadfs(fdvp)) { - return ENXIO; + return (EXTERROR(ENXIO, "This FUSE session is about " + "to be closed")); } if (fvp->v_mount != tdvp->v_mount || (tvp && fvp->v_mount != tvp->v_mount)) { SDT_PROBE2(fusefs, , vnops, trace, 1, "cross-device rename"); - err = EXDEV; + err = EXTERROR(EXDEV, "Cross-device rename"); goto out; } cache_purge(fvp); @@ -2193,10 +2248,12 @@ fuse_vnop_rmdir(struct vop_rmdir_args *ap) int err; if (fuse_isdeadfs(vp)) { - return ENXIO; + return (EXTERROR(ENXIO, "This FUSE session is about " + "to be closed")); } if (VTOFUD(vp) == VTOFUD(dvp)) { - return EINVAL; + return (EXTERROR(EINVAL, "Directory to be removed " + "contains itself")); } err = fuse_internal_remove(dvp, vp, ap->a_cnp, FUSE_RMDIR); @@ -2233,7 +2290,8 @@ fuse_vnop_setattr(struct 
vop_setattr_args *ap) checkperm = dataflags & FSESS_DEFAULT_PERMISSIONS; if (fuse_isdeadfs(vp)) { - return ENXIO; + return (EXTERROR(ENXIO, "This FUSE session is about " + "to be closed")); } if (vap->va_uid != (uid_t)VNOVAL) { @@ -2248,19 +2306,15 @@ fuse_vnop_setattr(struct vop_setattr_args *ap) return (err2); if (vap->va_uid != old_va.va_uid) return err; - else - accmode |= VADMIN; drop_suid = true; - } else - accmode |= VADMIN; - } else - accmode |= VADMIN; + } + } + accmode |= VADMIN; } if (vap->va_gid != (gid_t)VNOVAL) { if (checkperm && priv_check_cred(cred, PRIV_VFS_CHOWN)) drop_suid = true; - if (checkperm && !groupmember(vap->va_gid, cred)) - { + if (checkperm && !groupmember(vap->va_gid, cred)) { /* * Non-root users may only chgrp to one of their own * groups @@ -2274,11 +2328,9 @@ fuse_vnop_setattr(struct vop_setattr_args *ap) return (err2); if (vap->va_gid != old_va.va_gid) return err; - accmode |= VADMIN; - } else - accmode |= VADMIN; - } else - accmode |= VADMIN; + } + } + accmode |= VADMIN; } if (vap->va_size != VNOVAL) { switch (vp->v_type) { @@ -2404,7 +2456,8 @@ fuse_vnop_symlink(struct vop_symlink_args *ap) size_t len; if (fuse_isdeadfs(dvp)) { - return ENXIO; + return (EXTERROR(ENXIO, "This FUSE session is about " + "to be closed")); } /* * Unlike the other creator type calls, here we have to create a message @@ -2450,7 +2503,8 @@ fuse_vnop_write(struct vop_write_args *ap) MPASS(vp->v_type == VREG || vp->v_type == VDIR); if (fuse_isdeadfs(vp)) { - return ENXIO; + return (EXTERROR(ENXIO, "This FUSE session is about " + "to be closed")); } if (VTOFUD(vp)->flag & FN_DIRECTIO) { @@ -2603,10 +2657,12 @@ fuse_vnop_getextattr(struct vop_getextattr_args *ap) int err; if (fuse_isdeadfs(vp)) - return (ENXIO); + return (EXTERROR(ENXIO, "This FUSE session is about " + "to be closed")); if (fsess_not_impl(mp, FUSE_GETXATTR)) - return EOPNOTSUPP; + return (EXTERROR(EOPNOTSUPP, "This server does not implement " + "extended attributes")); err = 
fuse_extattr_check_cred(vp, ap->a_attrnamespace, cred, td, VREAD); if (err) @@ -2644,7 +2700,8 @@ fuse_vnop_getextattr(struct vop_getextattr_args *ap) if (err != 0) { if (err == ENOSYS) { fsess_set_notimpl(mp, FUSE_GETXATTR); - err = EOPNOTSUPP; + err = (EXTERROR(EOPNOTSUPP, "This server does not " + "implement extended attributes")); } goto out; } @@ -2683,16 +2740,19 @@ fuse_vnop_setextattr(struct vop_setextattr_args *ap) struct mount *mp = vnode_mount(vp); struct thread *td = ap->a_td; struct ucred *cred = ap->a_cred; + size_t struct_size = FUSE_COMPAT_SETXATTR_IN_SIZE; char *prefix; size_t len; char *attr_str; int err; if (fuse_isdeadfs(vp)) - return (ENXIO); + return (EXTERROR(ENXIO, "This FUSE session is about " + "to be closed")); if (fsess_not_impl(mp, FUSE_SETXATTR)) - return EOPNOTSUPP; + return (EXTERROR(EOPNOTSUPP, "This server does not implement " + "setting extended attributes")); if (vfs_isrdonly(mp)) return EROFS; @@ -2704,9 +2764,11 @@ fuse_vnop_setextattr(struct vop_setextattr_args *ap) * return EOPNOTSUPP. 
*/ if (fsess_not_impl(mp, FUSE_REMOVEXATTR)) - return (EOPNOTSUPP); + return (EXTERROR(EOPNOTSUPP, "This server does not " + "implement removing extended attributess")); else - return (EINVAL); + return (EXTERROR(EINVAL, "DELETEEXTATTR should be used " + "to remove extattrs")); } err = fuse_extattr_check_cred(vp, ap->a_attrnamespace, cred, td, @@ -2723,17 +2785,26 @@ fuse_vnop_setextattr(struct vop_setextattr_args *ap) len = strlen(prefix) + sizeof(extattr_namespace_separator) + strlen(ap->a_name) + 1; - fdisp_init(&fdi, len + sizeof(*set_xattr_in) + uio->uio_resid); + /* Older FUSE servers use a smaller fuse_setxattr_in struct */ + if (fuse_libabi_geq(fuse_get_mpdata(mp), 7, 33)) + struct_size = sizeof(*set_xattr_in); + + fdisp_init(&fdi, len + struct_size + uio->uio_resid); fdisp_make_vp(&fdi, FUSE_SETXATTR, vp, td, cred); set_xattr_in = fdi.indata; set_xattr_in->size = uio->uio_resid; - attr_str = (char *)fdi.indata + sizeof(*set_xattr_in); + if (fuse_libabi_geq(fuse_get_mpdata(mp), 7, 33)) { + set_xattr_in->setxattr_flags = 0; + set_xattr_in->padding = 0; + } + + attr_str = (char *)fdi.indata + struct_size; snprintf(attr_str, len, "%s%c%s", prefix, extattr_namespace_separator, ap->a_name); - err = uiomove((char *)fdi.indata + sizeof(*set_xattr_in) + len, + err = uiomove((char *)fdi.indata + struct_size + len, uio->uio_resid, uio); if (err != 0) { goto out; @@ -2743,7 +2814,8 @@ fuse_vnop_setextattr(struct vop_setextattr_args *ap) if (err == ENOSYS) { fsess_set_notimpl(mp, FUSE_SETXATTR); - err = EOPNOTSUPP; + err = EXTERROR(EOPNOTSUPP, "This server does not implement " + "setting extended attributes"); } if (err == ERESTART) { /* Can't restart after calling uiomove */ @@ -2854,10 +2926,12 @@ fuse_vnop_listextattr(struct vop_listextattr_args *ap) int err; if (fuse_isdeadfs(vp)) - return (ENXIO); + return (EXTERROR(ENXIO, "This FUSE session is about " + "to be closed")); if (fsess_not_impl(mp, FUSE_LISTXATTR)) - return EOPNOTSUPP; + return (EXTERROR(EOPNOTSUPP, 
"This server does not implement " + "extended attributes")); err = fuse_extattr_check_cred(vp, ap->a_attrnamespace, cred, td, VREAD); if (err) @@ -2885,7 +2959,8 @@ fuse_vnop_listextattr(struct vop_listextattr_args *ap) if (err != 0) { if (err == ENOSYS) { fsess_set_notimpl(mp, FUSE_LISTXATTR); - err = EOPNOTSUPP; + err = EXTERROR(EOPNOTSUPP, "This server does not " + "implement extended attributes"); } goto out; } @@ -2985,7 +3060,8 @@ fuse_vnop_deallocate(struct vop_deallocate_args *ap) bool closefufh = false; if (fuse_isdeadfs(vp)) - return (ENXIO); + return (EXTERROR(ENXIO, "This FUSE session is about " + "to be closed")); if (vfs_isrdonly(mp)) return (EROFS); @@ -3053,8 +3129,8 @@ fuse_vnop_deallocate(struct vop_deallocate_args *ap) false); } -out: fdisp_destroy(&fdi); +out: if (closefufh) fuse_filehandle_close(vp, fufh, curthread, cred); @@ -3091,10 +3167,12 @@ fuse_vnop_deleteextattr(struct vop_deleteextattr_args *ap) int err; if (fuse_isdeadfs(vp)) - return (ENXIO); + return (EXTERROR(ENXIO, "This FUSE session is about " + "to be closed")); if (fsess_not_impl(mp, FUSE_REMOVEXATTR)) - return EOPNOTSUPP; + return (EXTERROR(EOPNOTSUPP, "This server does not implement " + "removing extended attributes")); if (vfs_isrdonly(mp)) return EROFS; @@ -3123,7 +3201,8 @@ fuse_vnop_deleteextattr(struct vop_deleteextattr_args *ap) err = fdisp_wait_answ(&fdi); if (err == ENOSYS) { fsess_set_notimpl(mp, FUSE_REMOVEXATTR); - err = EOPNOTSUPP; + err = EXTERROR(EOPNOTSUPP, "This server does not implement " + "removing extended attributes"); } fdisp_destroy(&fdi); @@ -3177,25 +3256,27 @@ fuse_vnop_vptofh(struct vop_vptofh_args *ap) /* NFS requires lookups for "." and ".." 
*/ SDT_PROBE2(fusefs, , vnops, trace, 1, "VOP_VPTOFH without FUSE_EXPORT_SUPPORT"); - return EOPNOTSUPP; + return (EXTERROR(EOPNOTSUPP, "This server is " + "missing FUSE_EXPORT_SUPPORT")); } if ((mp->mnt_flag & MNT_EXPORTED) && - !(data->dataflags & FSESS_NO_OPENDIR_SUPPORT)) + fsess_is_impl(mp, FUSE_OPENDIR)) { /* * NFS is stateless, so nfsd must reopen a directory on every * call to VOP_READDIR, passing in the d_off field from the - * final dirent of the previous invocation. But without - * FUSE_NO_OPENDIR_SUPPORT, the FUSE protocol does not + * final dirent of the previous invocation. But if the server + * implements FUSE_OPENDIR, the FUSE protocol does not * guarantee that d_off will be valid after a directory is * closed and reopened. So prohibit exporting FUSE file - * systems that don't set that flag. + * systems that implement FUSE_OPENDIR. * * But userspace NFS servers don't have this problem. */ SDT_PROBE2(fusefs, , vnops, trace, 1, - "VOP_VPTOFH without FUSE_NO_OPENDIR_SUPPORT"); - return EOPNOTSUPP; + "VOP_VPTOFH with FUSE_OPENDIR"); + return (EXTERROR(EOPNOTSUPP, "This server implements " + "FUSE_OPENDIR so is not compatible with getfh")); } err = fuse_internal_getattr(vp, &va, curthread->td_ucred, curthread); @@ -3209,6 +3290,7 @@ fuse_vnop_vptofh(struct vop_vptofh_args *ap) if (fvdat->generation <= UINT32_MAX) fhp->gen = fvdat->generation; else - return EOVERFLOW; + return (EXTERROR(EOVERFLOW, "inode generation " + "number overflow")); return (0); } diff --git a/sys/fs/msdosfs/denode.h b/sys/fs/msdosfs/denode.h index 0d31b0583fa6..e6928fb46052 100644 --- a/sys/fs/msdosfs/denode.h +++ b/sys/fs/msdosfs/denode.h @@ -212,7 +212,7 @@ struct denode { ((dep)->de_Attributes & ATTR_DIRECTORY) ? 
0 : (dep)->de_FileSize), \ putushort((dp)->deHighClust, (dep)->de_StartCluster >> 16)) -#if defined(_KERNEL) || defined(MAKEFS) +#if defined(_KERNEL) || defined(_WANT_MSDOSFS_INTERNALS) #define VTODE(vp) ((struct denode *)(vp)->v_data) #define DETOV(de) ((de)->de_vnode) @@ -294,5 +294,5 @@ int removede(struct denode *pdep, struct denode *dep); int detrunc(struct denode *dep, u_long length, int flags, struct ucred *cred); int doscheckpath( struct denode *source, struct denode *target, daddr_t *wait_scn); -#endif /* _KERNEL || MAKEFS */ +#endif /* _KERNEL || _WANT_MSDOSFS_INTERNALS */ #endif /* !_FS_MSDOSFS_DENODE_H_ */ diff --git a/sys/fs/msdosfs/fat.h b/sys/fs/msdosfs/fat.h index a88bfb94e91d..344cd5a9416d 100644 --- a/sys/fs/msdosfs/fat.h +++ b/sys/fs/msdosfs/fat.h @@ -81,7 +81,7 @@ #define MSDOSFSEOF(pmp, cn) ((((cn) | ~(pmp)->pm_fatmask) & CLUST_EOFS) == CLUST_EOFS) -#if defined (_KERNEL) || defined(MAKEFS) +#if defined (_KERNEL) || defined(_WANT_MSDOSFS_INTERNALS) /* * These are the values for the function argument to the function * fatentry(). 
@@ -110,5 +110,5 @@ markvoldirty(struct msdosfsmount *pmp, bool dirty) return (markvoldirty_upgrade(pmp, dirty, false)); } -#endif /* _KERNEL || MAKEFS */ +#endif /* _KERNEL || _WANT_MSDOSFS_INTERNALS */ #endif /* !_FS_MSDOSFS_FAT_H_ */ diff --git a/sys/fs/msdosfs/msdosfs_conv.c b/sys/fs/msdosfs/msdosfs_conv.c index da4848169173..208b64930e61 100644 --- a/sys/fs/msdosfs/msdosfs_conv.c +++ b/sys/fs/msdosfs/msdosfs_conv.c @@ -797,19 +797,24 @@ mbsadjpos(const char **instr, size_t inlen, size_t outlen, int weight, int flag, static u_char * dos2unixchr(u_char *outbuf, const u_char **instr, size_t *ilen, int lower, struct msdosfsmount *pmp) { - u_char c, *outp; - size_t len, olen; + u_char c, *outp, *outp1; + size_t i, len, olen; outp = outbuf; if (pmp->pm_flags & MSDOSFSMNT_KICONV && msdosfs_iconv) { olen = len = 4; + outp1 = outp; if (lower & (LCASE_BASE | LCASE_EXT)) msdosfs_iconv->convchr_case(pmp->pm_d2u, (const char **)instr, ilen, (char **)&outp, &olen, KICONV_LOWER); else msdosfs_iconv->convchr(pmp->pm_d2u, (const char **)instr, ilen, (char **)&outp, &olen); + for (i = 0; i < outp - outp1; i++) { + if (outp1[i] == '/') + outp1[i] = '?'; + } len -= olen; /* @@ -826,6 +831,8 @@ dos2unixchr(u_char *outbuf, const u_char **instr, size_t *ilen, int lower, struc c = dos2unix[c]; if (lower & (LCASE_BASE | LCASE_EXT)) c = u2l[c]; + if (c == '/') + c = '?'; *outp++ = c; outbuf[1] = '\0'; } diff --git a/sys/fs/msdosfs/msdosfs_lookup.c b/sys/fs/msdosfs/msdosfs_lookup.c index 2a90339d0878..8ab6d35a2685 100644 --- a/sys/fs/msdosfs/msdosfs_lookup.c +++ b/sys/fs/msdosfs/msdosfs_lookup.c @@ -198,7 +198,9 @@ msdosfs_lookup_ino(struct vnode *vdp, struct vnode **vpp, struct componentname switch (unix2dosfn((const u_char *)cnp->cn_nameptr, dosfilename, cnp->cn_namelen, 0, pmp)) { case 0: - return (EINVAL); + if (nameiop == CREATE || nameiop == RENAME) + return (EINVAL); + return (ENOENT); case 1: break; case 2: @@ -843,7 +845,6 @@ doscheckpath(struct denode *source, struct denode 
*target, daddr_t *wait_scn) *wait_scn = 0; pmp = target->de_pmp; - lockmgr_assert(&pmp->pm_checkpath_lock, KA_XLOCKED); KASSERT(pmp == source->de_pmp, ("doscheckpath: source and target on different filesystems")); diff --git a/sys/fs/msdosfs/msdosfs_vfsops.c b/sys/fs/msdosfs/msdosfs_vfsops.c index 258c701bd300..4431d36c8a8e 100644 --- a/sys/fs/msdosfs/msdosfs_vfsops.c +++ b/sys/fs/msdosfs/msdosfs_vfsops.c @@ -575,7 +575,6 @@ mountmsdosfs(struct vnode *odevvp, struct mount *mp) pmp->pm_bo = bo; lockinit(&pmp->pm_fatlock, 0, msdosfs_lock_msg, 0, 0); - lockinit(&pmp->pm_checkpath_lock, 0, "msdoscp", 0, 0); TASK_INIT(&pmp->pm_rw2ro_task, 0, msdosfs_remount_ro, pmp); @@ -722,7 +721,9 @@ mountmsdosfs(struct vnode *odevvp, struct mount *mp) } } - clusters = (pmp->pm_fatsize / pmp->pm_fatmult) * pmp->pm_fatdiv ; + clusters = (pmp->pm_fatsize / pmp->pm_fatmult) * pmp->pm_fatdiv; + if (clusters >= (CLUST_RSRVD & pmp->pm_fatmask)) + clusters = CLUST_RSRVD & pmp->pm_fatmask; if (pmp->pm_maxcluster >= clusters) { #ifdef MSDOSFS_DEBUG printf("Warning: number of clusters (%ld) exceeds FAT " @@ -869,7 +870,6 @@ error_exit: } if (pmp != NULL) { lockdestroy(&pmp->pm_fatlock); - lockdestroy(&pmp->pm_checkpath_lock); free(pmp->pm_inusemap, M_MSDOSFSFAT); free(pmp, M_MSDOSFSMNT); mp->mnt_data = NULL; @@ -969,7 +969,6 @@ msdosfs_unmount(struct mount *mp, int mntflags) dev_rel(pmp->pm_dev); free(pmp->pm_inusemap, M_MSDOSFSFAT); lockdestroy(&pmp->pm_fatlock); - lockdestroy(&pmp->pm_checkpath_lock); free(pmp, M_MSDOSFSMNT); mp->mnt_data = NULL; return (error); diff --git a/sys/fs/msdosfs/msdosfs_vnops.c b/sys/fs/msdosfs/msdosfs_vnops.c index 078ea5e52312..33e0d94954d7 100644 --- a/sys/fs/msdosfs/msdosfs_vnops.c +++ b/sys/fs/msdosfs/msdosfs_vnops.c @@ -945,7 +945,7 @@ msdosfs_rename(struct vop_rename_args *ap) struct denode *fdip, *fip, *tdip, *tip, *nip; u_char toname[12], oldname[11]; u_long to_diroffset; - bool checkpath_locked, doingdirectory, newparent; + bool doingdirectory, 
newparent; int error; u_long cn, pcl, blkoff; daddr_t bn, wait_scn, scn; @@ -986,8 +986,6 @@ msdosfs_rename(struct vop_rename_args *ap) if (tvp != NULL && tvp != tdvp) VOP_UNLOCK(tvp); - checkpath_locked = false; - relock: doingdirectory = newparent = false; @@ -1108,12 +1106,8 @@ relock: if (doingdirectory && newparent) { if (error != 0) /* write access check above */ goto unlock; - lockmgr(&pmp->pm_checkpath_lock, LK_EXCLUSIVE, NULL); - checkpath_locked = true; error = doscheckpath(fip, tdip, &wait_scn); if (wait_scn != 0) { - lockmgr(&pmp->pm_checkpath_lock, LK_RELEASE, NULL); - checkpath_locked = false; VOP_UNLOCK(fdvp); VOP_UNLOCK(tdvp); VOP_UNLOCK(fvp); @@ -1276,8 +1270,6 @@ relock: cache_purge(fvp); unlock: - if (checkpath_locked) - lockmgr(&pmp->pm_checkpath_lock, LK_RELEASE, NULL); vput(fdvp); vput(fvp); if (tvp != NULL) { @@ -1289,7 +1281,6 @@ unlock: vput(tdvp); return (error); releout: - MPASS(!checkpath_locked); vrele(tdvp); if (tvp != NULL) vrele(tvp); @@ -1530,6 +1521,9 @@ msdosfs_readdir(struct vop_readdir_args *ap) ap->a_vp, uio, ap->a_cred, ap->a_eofflag); #endif + if (ap->a_eofflag != NULL) + *ap->a_eofflag = 0; + /* * msdosfs_readdir() won't operate properly on regular files since * it does i/o only with the filesystem vnode, and hence can @@ -1623,8 +1617,11 @@ msdosfs_readdir(struct vop_readdir_args *ap) on = (offset - bias) & pmp->pm_crbomask; n = min(pmp->pm_bpcluster - on, uio->uio_resid); diff = dep->de_FileSize - (offset - bias); - if (diff <= 0) - break; + if (diff <= 0) { + if (ap->a_eofflag != NULL) + *ap->a_eofflag = 1; + goto out; + } n = min(n, diff); error = pcbmap(dep, lbn, &bn, &cn, &blsize); if (error) @@ -1655,6 +1652,8 @@ msdosfs_readdir(struct vop_readdir_args *ap) */ if (dentp->deName[0] == SLOT_EMPTY) { brelse(bp); + if (ap->a_eofflag != NULL) + *ap->a_eofflag = 1; goto out; } /* @@ -1752,15 +1751,6 @@ out: uio->uio_offset = off; - /* - * Set the eofflag (NFS uses it) - */ - if (ap->a_eofflag) { - if (dep->de_FileSize - 
(offset - bias) <= 0) - *ap->a_eofflag = 1; - else - *ap->a_eofflag = 0; - } return (error); } @@ -1951,6 +1941,9 @@ msdosfs_pathconf(struct vop_pathconf_args *ap) case _PC_NO_TRUNC: *ap->a_retval = 0; return (0); + case _PC_HAS_HIDDENSYSTEM: + *ap->a_retval = 1; + return (0); default: return (vop_stdpathconf(ap)); } @@ -1962,6 +1955,8 @@ msdosfs_vptofh(struct vop_vptofh_args *ap) { struct denode *dep; struct defid *defhp; + _Static_assert(sizeof(struct defid) <= sizeof(struct fid), + "struct defid cannot be larger than struct fid"); dep = VTODE(ap->a_vp); defhp = (struct defid *)ap->a_fhp; diff --git a/sys/fs/msdosfs/msdosfsmount.h b/sys/fs/msdosfs/msdosfsmount.h index 8f15bc2eaf42..04e6b75bea2a 100644 --- a/sys/fs/msdosfs/msdosfsmount.h +++ b/sys/fs/msdosfs/msdosfsmount.h @@ -52,14 +52,17 @@ #ifndef _MSDOSFS_MSDOSFSMOUNT_H_ #define _MSDOSFS_MSDOSFSMOUNT_H_ -#if defined (_KERNEL) || defined(MAKEFS) +#if defined(_KERNEL) || defined(_WANT_MSDOSFS_INTERNALS) #include <sys/types.h> -#ifndef MAKEFS +#ifdef _KERNEL #include <sys/lock.h> #include <sys/lockmgr.h> -#include <sys/_task.h> +#else +#include <sys/_lock.h> +#include <sys/_lockmgr.h> #endif +#include <sys/_task.h> #include <sys/tree.h> #ifdef MALLOC_DECLARE @@ -114,11 +117,8 @@ struct msdosfsmount { void *pm_w2u; /* Unicode->Local iconv handle */ void *pm_u2d; /* Unicode->DOS iconv handle */ void *pm_d2u; /* DOS->Local iconv handle */ -#ifndef MAKEFS struct lock pm_fatlock; /* lockmgr protecting allocations */ - struct lock pm_checkpath_lock; /* protects doscheckpath result */ struct task pm_rw2ro_task; /* context for emergency remount ro */ -#endif }; /* @@ -245,9 +245,9 @@ struct msdosfs_fileno { #define MSDOSFS_ASSERT_MP_LOCKED(pmp) \ lockmgr_assert(&(pmp)->pm_fatlock, KA_XLOCKED) -#endif /* _KERNEL || MAKEFS */ +#endif /* _KERNEL || _WANT_MSDOSFS_INTERNALS */ -#ifndef MAKEFS +#ifdef _KERNEL /* * Arguments to mount MSDOS filesystems. 
*/ @@ -265,7 +265,7 @@ struct msdosfs_args { char *cs_local; /* Local Charset */ mode_t dirmask; /* dir mask to be applied for msdosfs perms */ }; -#endif /* MAKEFS */ +#endif /* _KERNEL */ /* * Msdosfs mount options: diff --git a/sys/fs/nfs/nfs.h b/sys/fs/nfs/nfs.h index 9b09520b3257..e6a125b388a8 100644 --- a/sys/fs/nfs/nfs.h +++ b/sys/fs/nfs/nfs.h @@ -865,6 +865,8 @@ struct nfsslot { /* Enumerated type for nfsuserd state. */ typedef enum { NOTRUNNING=0, STARTSTOP=1, RUNNING=2 } nfsuserd_state; +typedef enum { UNKNOWN=0, DELETED=1, NLINK_ZERO=2, VALID=3 } nfsremove_status; + #endif /* _KERNEL */ #endif /* _NFS_NFS_H */ diff --git a/sys/fs/nfs/nfs_commonacl.c b/sys/fs/nfs/nfs_commonacl.c index 55e6f89dd8ec..bba1d8821a9b 100644 --- a/sys/fs/nfs/nfs_commonacl.c +++ b/sys/fs/nfs/nfs_commonacl.c @@ -65,7 +65,7 @@ nfsrv_dissectace(struct nfsrv_descript *nd, struct acl_entry *acep, goto nfsmout; } else if (len == 0) { /* Netapp filers return a 0 length who for nil users */ - acep->ae_tag = ACL_UNDEFINED_TAG; + acep->ae_tag = ACL_EVERYONE; /* Avoid panics. 
*/ acep->ae_id = ACL_UNDEFINED_ID; acep->ae_perm = (acl_perm_t)0; acep->ae_entry_type = ACL_ENTRY_TYPE_DENY; @@ -352,32 +352,7 @@ nfsrv_buildace(struct nfsrv_descript *nd, u_char *name, int namelen, if (ace->ae_perm & ACL_SYNCHRONIZE) acemask |= NFSV4ACE_SYNCHRONIZE; } else { - if (ace->ae_perm & ACL_READ_DATA) - acemask |= NFSV4ACE_READDATA; - if (ace->ae_perm & ACL_WRITE_DATA) - acemask |= NFSV4ACE_WRITEDATA; - if (ace->ae_perm & ACL_APPEND_DATA) - acemask |= NFSV4ACE_APPENDDATA; - if (ace->ae_perm & ACL_READ_NAMED_ATTRS) - acemask |= NFSV4ACE_READNAMEDATTR; - if (ace->ae_perm & ACL_WRITE_NAMED_ATTRS) - acemask |= NFSV4ACE_WRITENAMEDATTR; - if (ace->ae_perm & ACL_EXECUTE) - acemask |= NFSV4ACE_EXECUTE; - if (ace->ae_perm & ACL_READ_ATTRIBUTES) - acemask |= NFSV4ACE_READATTRIBUTES; - if (ace->ae_perm & ACL_WRITE_ATTRIBUTES) - acemask |= NFSV4ACE_WRITEATTRIBUTES; - if (ace->ae_perm & ACL_DELETE) - acemask |= NFSV4ACE_DELETE; - if (ace->ae_perm & ACL_READ_ACL) - acemask |= NFSV4ACE_READACL; - if (ace->ae_perm & ACL_WRITE_ACL) - acemask |= NFSV4ACE_WRITEACL; - if (ace->ae_perm & ACL_WRITE_OWNER) - acemask |= NFSV4ACE_WRITEOWNER; - if (ace->ae_perm & ACL_SYNCHRONIZE) - acemask |= NFSV4ACE_SYNCHRONIZE; + acemask = nfs_aceperm(ace->ae_perm); } *tl++ = txdr_unsigned(acemask); *tl++ = txdr_unsigned(namelen); @@ -388,6 +363,43 @@ nfsrv_buildace(struct nfsrv_descript *nd, u_char *name, int namelen, } /* + * Convert ae_perm to NFSv4 ACL acemask4 for regular files. 
+ */ +uint32_t +nfs_aceperm(acl_perm_t ae_perm) +{ + uint32_t acemask = 0x0; + + if (ae_perm & ACL_READ_DATA) + acemask |= NFSV4ACE_READDATA; + if (ae_perm & ACL_WRITE_DATA) + acemask |= NFSV4ACE_WRITEDATA; + if (ae_perm & ACL_APPEND_DATA) + acemask |= NFSV4ACE_APPENDDATA; + if (ae_perm & ACL_READ_NAMED_ATTRS) + acemask |= NFSV4ACE_READNAMEDATTR; + if (ae_perm & ACL_WRITE_NAMED_ATTRS) + acemask |= NFSV4ACE_WRITENAMEDATTR; + if (ae_perm & ACL_EXECUTE) + acemask |= NFSV4ACE_EXECUTE; + if (ae_perm & ACL_READ_ATTRIBUTES) + acemask |= NFSV4ACE_READATTRIBUTES; + if (ae_perm & ACL_WRITE_ATTRIBUTES) + acemask |= NFSV4ACE_WRITEATTRIBUTES; + if (ae_perm & ACL_DELETE) + acemask |= NFSV4ACE_DELETE; + if (ae_perm & ACL_READ_ACL) + acemask |= NFSV4ACE_READACL; + if (ae_perm & ACL_WRITE_ACL) + acemask |= NFSV4ACE_WRITEACL; + if (ae_perm & ACL_WRITE_OWNER) + acemask |= NFSV4ACE_WRITEOWNER; + if (ae_perm & ACL_SYNCHRONIZE) + acemask |= NFSV4ACE_SYNCHRONIZE; + return (acemask); +} + +/* * Build an NFSv4 ACL. */ int diff --git a/sys/fs/nfs/nfs_commonkrpc.c b/sys/fs/nfs/nfs_commonkrpc.c index e5c658ce76d2..0ae3b94bef89 100644 --- a/sys/fs/nfs/nfs_commonkrpc.c +++ b/sys/fs/nfs/nfs_commonkrpc.c @@ -670,7 +670,7 @@ newnfs_request(struct nfsrv_descript *nd, struct nfsmount *nmp, struct thread *td, struct ucred *cred, u_int32_t prog, u_int32_t vers, u_char *retsum, int toplevel, u_int64_t *xidp, struct nfsclsession *dssep) { - uint32_t retseq, retval, slotseq, *tl; + uint32_t retseq, retval, retval0, slotseq, *tl; int i = 0, j = 0, opcnt, set_sigset = 0, slot; int error = 0, usegssname = 0, secflavour = AUTH_SYS; int freeslot, maxslot, reterr, slotpos, timeo; @@ -1039,7 +1039,7 @@ tryagain: sep->nfsess_badslots |= (0x1ULL << nd->nd_slotid); mtx_unlock(&sep->nfsess_mtx); /* And free the slot. 
*/ - nfsv4_freeslot(sep, nd->nd_slotid, false); + nfsv4_freeslot(sep, nd->nd_slotid, true); } if (stat == RPC_INTR) error = EINTR; @@ -1192,15 +1192,22 @@ tryagain: if (retseq != sep->nfsess_slotseq[slot]) printf("retseq diff 0x%x\n", retseq); - retval = fxdr_unsigned(uint32_t, *++tl); + retval0 = fxdr_unsigned(uint32_t,*tl++); + retval = fxdr_unsigned(uint32_t, *tl); if ((retval + 1) < sep->nfsess_foreslots - ) + ) { sep->nfsess_foreslots = (retval + 1); - else if ((retval + 1) > - sep->nfsess_foreslots) - sep->nfsess_foreslots = (retval - < 64) ? (retval + 1) : 64; + nfs_resetslots(sep); + } else if ((retval + 1) > + sep->nfsess_foreslots) { + if (retval0 > retval) + printf("Sess:highest > " + "target_highest\n"); + sep->nfsess_foreslots = + (retval < NFSV4_SLOTS) ? + (retval + 1) : NFSV4_SLOTS; + } } mtx_unlock(&sep->nfsess_mtx); @@ -1464,6 +1471,25 @@ nfsmout: } /* + * Reset slots above nfsess_foreslots that are not busy. + */ +void +nfs_resetslots(struct nfsclsession *sep) +{ + int i; + uint64_t bitval; + + mtx_assert(&sep->nfsess_mtx, MA_OWNED); + bitval = (1 << sep->nfsess_foreslots); + for (i = sep->nfsess_foreslots; i < NFSV4_SLOTS; i++) { + if ((sep->nfsess_slots & bitval) == 0 && + (sep->nfsess_badslots & bitval) == 0) + sep->nfsess_slotseq[i] = 0; + bitval <<= 1; + } +} + +/* * Mark all of an nfs mount's outstanding requests with R_SOFTTERM and * wait for all requests to complete. This is used by forced unmounts * to terminate any outstanding RPCs. 
diff --git a/sys/fs/nfs/nfs_commonport.c b/sys/fs/nfs/nfs_commonport.c index 2db9af5b9ea9..0c94f4e7dc52 100644 --- a/sys/fs/nfs/nfs_commonport.c +++ b/sys/fs/nfs/nfs_commonport.c @@ -258,7 +258,8 @@ newnfs_copycred(struct nfscred *nfscr, struct ucred *cr) KASSERT(nfscr->nfsc_ngroups >= 0, ("newnfs_copycred: negative nfsc_ngroups")); cr->cr_uid = nfscr->nfsc_uid; - crsetgroups(cr, nfscr->nfsc_ngroups, nfscr->nfsc_groups); + crsetgroups_fallback(cr, nfscr->nfsc_ngroups, nfscr->nfsc_groups, + GID_NOGROUP); } /* diff --git a/sys/fs/nfs/nfs_commonsubs.c b/sys/fs/nfs/nfs_commonsubs.c index 3c9af40253ad..a957315aaa12 100644 --- a/sys/fs/nfs/nfs_commonsubs.c +++ b/sys/fs/nfs/nfs_commonsubs.c @@ -135,7 +135,7 @@ struct nfsv4_opflag nfsv4_opflag[NFSV42_NOPS] = { { 1, 2, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* Lookupp */ { 0, 1, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* NVerify */ { 1, 1, 0, 1, LK_EXCLUSIVE, 1, 0 }, /* Open */ - { 1, 1, 0, 0, LK_EXCLUSIVE, 1, 0 }, /* OpenAttr */ + { 1, 1, 1, 1, LK_EXCLUSIVE, 1, 1 }, /* OpenAttr */ { 0, 1, 0, 0, LK_EXCLUSIVE, 1, 0 }, /* OpenConfirm */ { 0, 1, 0, 0, LK_EXCLUSIVE, 1, 0 }, /* OpenDowngrade */ { 1, 0, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* PutFH */ @@ -219,18 +219,19 @@ NFSD_VNET_DEFINE_STATIC(u_char *, nfsrv_dnsname) = NULL; static int nfs_bigreply[NFSV42_NPROCS] = { 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, - 1, 0, 0, 1, 0, 0, 0, 0, 0 }; + 1, 0, 0, 1, 0, 0, 0, 0, 0, 0 }; /* local functions */ static int nfsrv_skipace(struct nfsrv_descript *nd, int *acesizep); static void nfsv4_wanted(struct nfsv4lock *lp); static uint32_t nfsv4_filesavail(struct statfs *, struct mount *); -static int nfsrv_cmpmixedcase(u_char *cp, u_char *cp2, int len); static int nfsrv_getuser(int procnum, uid_t uid, gid_t gid, char *name); static void nfsrv_removeuser(struct nfsusrgrp *usrp, int isuser); static int nfsrv_getrefstr(struct 
nfsrv_descript *, u_char **, u_char **, int *, int *); static void nfsrv_refstrbigenough(int, u_char **, u_char **, int *); +static uint32_t vtonfsv4_type(struct vattr *); +static __enum_uint8(vtype) nfsv4tov_type(uint32_t, uint16_t *); static struct { int op; @@ -250,10 +251,10 @@ static struct { { NFSV4OP_CREATE, 5, "Create", 6, }, { NFSV4OP_CREATE, 1, "Create", 6, }, { NFSV4OP_CREATE, 3, "Create", 6, }, + { NFSV4OP_REMOVE, 3, "Remove", 6, }, { NFSV4OP_REMOVE, 1, "Remove", 6, }, - { NFSV4OP_REMOVE, 1, "Remove", 6, }, - { NFSV4OP_SAVEFH, 5, "Rename", 6, }, - { NFSV4OP_SAVEFH, 4, "Link", 4, }, + { NFSV4OP_SAVEFH, 7, "Rename", 6, }, + { NFSV4OP_SAVEFH, 6, "Link", 4, }, { NFSV4OP_READDIR, 2, "Readdir", 7, }, { NFSV4OP_READDIR, 2, "Readdir", 7, }, { NFSV4OP_GETATTR, 1, "Getattr", 7, }, @@ -308,6 +309,7 @@ static struct { { NFSV4OP_DEALLOCATE, 2, "Deallocate", 10, }, { NFSV4OP_LAYOUTERROR, 1, "LayoutError", 11, }, { NFSV4OP_VERIFY, 3, "AppendWrite", 11, }, + { NFSV4OP_OPENATTR, 3, "OpenAttr", 8, }, }; /* @@ -317,7 +319,7 @@ static int nfs_bigrequest[NFSV42_NPROCS] = { 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, - 0, 1 + 0, 1, 0 }; /* @@ -610,32 +612,43 @@ nfscl_fillsattr(struct nfsrv_descript *nd, struct vattr *vap, break; case ND_NFSV4: NFSZERO_ATTRBIT(&attrbits); - if (vap->va_mode != (mode_t)VNOVAL) - NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_MODE); + np = NULL; + if (strcmp(vp->v_mount->mnt_vfc->vfc_name, "nfs") == 0) + np = VTONFS(vp); + if (vap->va_mode != (mode_t)VNOVAL) { + if ((flags & NFSSATTR_NEWFILE) != 0 && np != NULL && + NFSISSET_ATTRBIT(&np->n_vattr.na_suppattr, + NFSATTRBIT_MODEUMASK)) + NFSSETBIT_ATTRBIT(&attrbits, + NFSATTRBIT_MODEUMASK); + else + NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_MODE); + } if ((flags & NFSSATTR_FULL) && vap->va_uid != (uid_t)VNOVAL) NFSSETBIT_ATTRBIT(&attrbits, 
NFSATTRBIT_OWNER); if ((flags & NFSSATTR_FULL) && vap->va_gid != (gid_t)VNOVAL) NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_OWNERGROUP); if ((flags & NFSSATTR_FULL) && vap->va_size != VNOVAL) NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_SIZE); + if ((flags & NFSSATTR_FULL) && vap->va_flags != VNOVAL) { + NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_HIDDEN); + NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_SYSTEM); + } if (vap->va_atime.tv_sec != VNOVAL) NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_TIMEACCESSSET); if (vap->va_mtime.tv_sec != VNOVAL) NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_TIMEMODIFYSET); - if (vap->va_birthtime.tv_sec != VNOVAL && - strcmp(vp->v_mount->mnt_vfc->vfc_name, "nfs") == 0) { - /* - * We can only test for support of TimeCreate if - * the "vp" argument is for an NFS vnode. - */ - np = VTONFS(vp); - if (NFSISSET_ATTRBIT(&np->n_vattr.na_suppattr, - NFSATTRBIT_TIMECREATE)) - NFSSETBIT_ATTRBIT(&attrbits, - NFSATTRBIT_TIMECREATE); - } + /* + * We can only test for support of TimeCreate if + * the "vp" argument is for an NFS vnode. 
+ */ + if (vap->va_birthtime.tv_sec != VNOVAL && np != NULL && + NFSISSET_ATTRBIT(&np->n_vattr.na_suppattr, + NFSATTRBIT_TIMECREATE)) + NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_TIMECREATE); (void) nfsv4_fillattr(nd, vp->v_mount, vp, NULL, vap, NULL, 0, - &attrbits, NULL, NULL, 0, 0, 0, 0, (uint64_t)0, NULL); + &attrbits, NULL, NULL, 0, 0, 0, 0, (uint64_t)0, NULL, + false, false, false); break; } } @@ -980,6 +993,17 @@ nfsm_fhtom(struct nfsmount *nmp, struct nfsrv_descript *nd, u_int8_t *fhp, (nmp->nm_privflag & NFSMNTP_FAKEROOTFH) != 0) { fhp = nmp->nm_fh; size = nmp->nm_fhsize; + } else if (size >= NFSX_FHMAX + NFSX_V4NAMEDDIRFH && + size <= NFSX_FHMAX + NFSX_V4NAMEDATTRFH) { + size -= (NFSX_FHMAX - NFSX_MYFH); + NFSM_BUILD(tl, uint32_t *, NFSX_MYFH + + 2 * NFSX_UNSIGNED); + *tl++ = txdr_unsigned(size); + NFSBCOPY(fhp, tl, NFSX_MYFH); + tl += (NFSX_MYFH / NFSX_UNSIGNED); + *tl = 0; + bytesize = NFSX_MYFH + 2 * NFSX_UNSIGNED; + break; } fullsiz = NFSM_RNDUP(size); if (set_true) { @@ -1277,7 +1301,8 @@ nfsv4_loadattr(struct nfsrv_descript *nd, vnode_t vp, struct nfsvattr *nap, struct nfsfh **nfhpp, fhandle_t *fhp, int fhsize, struct nfsv3_pathconf *pc, struct statfs *sbp, struct nfsstatfs *sfp, struct nfsfsinfo *fsp, NFSACL_T *aclp, int compare, int *retcmpp, - u_int32_t *leasep, u_int32_t *rderrp, NFSPROC_T *p, struct ucred *cred) + u_int32_t *leasep, u_int32_t *rderrp, bool *has_namedattrp, + NFSPROC_T *p, struct ucred *cred) { u_int32_t *tl; int i = 0, j, k, l = 0, m, bitpos, attrsum = 0; @@ -1293,6 +1318,8 @@ nfsv4_loadattr(struct nfsrv_descript *nd, vnode_t vp, gid_t gid; u_int32_t freenum = 0, tuint; u_int64_t uquad = 0, thyp, thyp2; + uint16_t tui16; + long has_pathconf; #ifdef QUOTA struct dqblk dqb; uid_t savuid; @@ -1316,6 +1343,7 @@ nfsv4_loadattr(struct nfsrv_descript *nd, vnode_t vp, * Just set default values to some of the important ones. 
*/ if (nap != NULL) { + VATTR_NULL(&nap->na_vattr); nap->na_type = VREG; nap->na_mode = 0; nap->na_rdev = (NFSDEV_T)0; @@ -1365,6 +1393,8 @@ nfsv4_loadattr(struct nfsrv_descript *nd, vnode_t vp, sfp->sf_tbytes = UINT64_MAX; sfp->sf_abytes = UINT64_MAX; } + if (has_namedattrp != NULL) + *has_namedattrp = false; } /* @@ -1397,6 +1427,16 @@ nfsv4_loadattr(struct nfsrv_descript *nd, vnode_t vp, NFSCLRBIT_ATTRBIT(&checkattrbits, NFSATTRBIT_ACL); NFSCLRBIT_ATTRBIT(&checkattrbits, NFSATTRBIT_ACLSUPPORT); } + /* Some filesystems do not support uf_hidden */ + if (vp == NULL || VOP_PATHCONF(vp, + _PC_HAS_HIDDENSYSTEM, &has_pathconf) != 0) + has_pathconf = 0; + if (has_pathconf == 0) { + NFSCLRBIT_ATTRBIT(&checkattrbits, + NFSATTRBIT_HIDDEN); + NFSCLRBIT_ATTRBIT(&checkattrbits, + NFSATTRBIT_SYSTEM); + } if (!NFSEQUAL_ATTRBIT(&retattrbits, &checkattrbits) || retnotsup) *retcmpp = NFSERR_NOTSAME; @@ -1407,11 +1447,16 @@ nfsv4_loadattr(struct nfsrv_descript *nd, vnode_t vp, NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); if (compare) { if (!(*retcmpp)) { - if (nap->na_type != nfsv34tov_type(*tl)) + tui16 = 0; + if (nap->na_type != nfsv4tov_type(*tl, + &tui16) || + ((nap->na_bsdflags & SFBSD_NAMEDATTR) ^ + tui16) != 0) *retcmpp = NFSERR_NOTSAME; } } else if (nap != NULL) { - nap->na_type = nfsv34tov_type(*tl); + nap->na_type = nfsv4tov_type(*tl, + &nap->na_bsdflags); } attrsum += NFSX_UNSIGNED; break; @@ -1490,9 +1535,23 @@ nfsv4_loadattr(struct nfsrv_descript *nd, vnode_t vp, break; case NFSATTRBIT_NAMEDATTR: NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); - if (compare && !(*retcmpp)) { - if (*tl != newnfs_false) - *retcmpp = NFSERR_NOTSAME; + if (compare) { + if (!(*retcmpp)) { + if (vp == NULL || VOP_PATHCONF(vp, + _PC_HAS_NAMEDATTR, &has_pathconf) + != 0) + has_pathconf = 0; + if ((has_pathconf != 0 && + *tl != newnfs_true) || + (has_pathconf == 0 && + *tl != newnfs_false)) + *retcmpp = NFSERR_NOTSAME; + } + } else if (has_namedattrp != NULL) { + if (*tl == newnfs_true) + 
*has_namedattrp = true; + else + *has_namedattrp = false; } attrsum += NFSX_UNSIGNED; break; @@ -1666,6 +1725,8 @@ nfsv4_loadattr(struct nfsrv_descript *nd, vnode_t vp, goto nfsmout; tfhsize = tnfhp->nfh_len; if (compare) { + if (tfhsize > NFSX_MYFH) + tfhsize = NFSX_MYFH; if (!(*retcmpp) && !NFSRV_CMPFH(tnfhp->nfh_fh, tfhsize, fhp, fhsize)) @@ -1745,9 +1806,17 @@ nfsv4_loadattr(struct nfsrv_descript *nd, vnode_t vp, free(cp2, M_NFSSTRING); break; case NFSATTRBIT_HIDDEN: - NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); - if (compare && !(*retcmpp)) - *retcmpp = NFSERR_ATTRNOTSUPP; + NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED); + if (compare) { + if (!(*retcmpp) && ((*tl == newnfs_true && + (nap->na_flags & UF_HIDDEN) == 0) || + (*tl == newnfs_false && + (nap->na_flags & UF_HIDDEN) != 0))) + *retcmpp = NFSERR_NOTSAME; + } else if (nap != NULL) { + if (*tl == newnfs_true) + nap->na_flags |= UF_HIDDEN; + } attrsum += NFSX_UNSIGNED; break; case NFSATTRBIT_HOMOGENEOUS: @@ -2119,9 +2188,17 @@ nfsv4_loadattr(struct nfsrv_descript *nd, vnode_t vp, attrsum += NFSX_HYPER; break; case NFSATTRBIT_SYSTEM: - NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); - if (compare && !(*retcmpp)) - *retcmpp = NFSERR_ATTRNOTSUPP; + NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED); + if (compare) { + if (!(*retcmpp) && ((*tl == newnfs_true && + (nap->na_flags & UF_SYSTEM) == 0) || + (*tl == newnfs_false && + (nap->na_flags & UF_SYSTEM) != 0))) + *retcmpp = NFSERR_NOTSAME; + } else if (nap != NULL) { + if (*tl == newnfs_true) + nap->na_flags |= UF_SYSTEM; + } attrsum += NFSX_UNSIGNED; break; case NFSATTRBIT_TIMEACCESS: @@ -2297,6 +2374,23 @@ nfsv4_loadattr(struct nfsrv_descript *nd, vnode_t vp, if (compare && !(*retcmpp) && i != nfs_srvmaxio) *retcmpp = NFSERR_NOTSAME; break; + case NFSATTRBIT_CHANGEATTRTYPE: + NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED); + if (compare) { + if (!(*retcmpp)) { + tuint = NFSV4CHANGETYPE_UNDEFINED; + if ((vp->v_mount->mnt_vfc->vfc_flags & + VFCF_FILEREVINC) != 0) + tuint 
= NFSV4CHANGETYPE_VERS_COUNTER_NOPNFS; + else if ((vp->v_mount->mnt_vfc->vfc_flags & + VFCF_FILEREVCT) != 0) + tuint = NFSV4CHANGETYPE_TIME_METADATA; + if (fxdr_unsigned(uint32_t, *tl) != tuint) + *retcmpp = NFSERR_NOTSAME; + } + } + attrsum += NFSX_UNSIGNED; + break; default: printf("EEK! nfsv4_loadattr unknown attr=%d\n", bitpos); @@ -2553,7 +2647,8 @@ nfsv4_fillattr(struct nfsrv_descript *nd, struct mount *mp, vnode_t vp, NFSACL_T *saclp, struct vattr *vap, fhandle_t *fhp, int rderror, nfsattrbit_t *attrbitp, struct ucred *cred, NFSPROC_T *p, int isdgram, int reterr, int supports_nfsv4acls, int at_root, uint64_t mounted_on_fileno, - struct statfs *pnfssf) + struct statfs *pnfssf, bool xattrsupp, bool has_hiddensystem, + bool has_namedattr) { int bitpos, retnum = 0; u_int32_t *tl; @@ -2567,8 +2662,7 @@ nfsv4_fillattr(struct nfsrv_descript *nd, struct mount *mp, vnode_t vp, struct nfsfsinfo fsinf; struct timespec temptime; NFSACL_T *aclp, *naclp = NULL; - size_t atsiz; - bool xattrsupp; + short irflag; #ifdef QUOTA struct dqblk dqb; uid_t savuid; @@ -2652,18 +2746,6 @@ nfsv4_fillattr(struct nfsrv_descript *nd, struct mount *mp, vnode_t vp, } } - /* Check to see if Extended Attributes are supported. */ - xattrsupp = false; - if (NFSISSET_ATTRBIT(retbitp, NFSATTRBIT_XATTRSUPPORT)) { - if (NFSVOPLOCK(vp, LK_SHARED) == 0) { - error = VOP_GETEXTATTR(vp, EXTATTR_NAMESPACE_USER, - "xxx", NULL, &atsiz, cred, p); - NFSVOPUNLOCK(vp); - if (error != EOPNOTSUPP) - xattrsupp = true; - } - } - /* * Put out the attribute bitmap for the ones being filled in * and get the field for the number of attributes returned. 
@@ -2685,11 +2767,15 @@ nfsv4_fillattr(struct nfsrv_descript *nd, struct mount *mp, vnode_t vp, NFSCLRBIT_ATTRBIT(&attrbits,NFSATTRBIT_ACLSUPPORT); NFSCLRBIT_ATTRBIT(&attrbits,NFSATTRBIT_ACL); } + if (!has_hiddensystem) { + NFSCLRBIT_ATTRBIT(&attrbits, NFSATTRBIT_HIDDEN); + NFSCLRBIT_ATTRBIT(&attrbits, NFSATTRBIT_SYSTEM); + } retnum += nfsrv_putattrbit(nd, &attrbits); break; case NFSATTRBIT_TYPE: NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); - *tl = vtonfsv34_type(vap->va_type); + *tl = vtonfsv4_type(vap); retnum += NFSX_UNSIGNED; break; case NFSATTRBIT_FHEXPIRETYPE: @@ -2725,7 +2811,10 @@ nfsv4_fillattr(struct nfsrv_descript *nd, struct mount *mp, vnode_t vp, break; case NFSATTRBIT_NAMEDATTR: NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); - *tl = newnfs_false; + if (has_namedattr) + *tl = newnfs_true; + else + *tl = newnfs_false; retnum += NFSX_UNSIGNED; break; case NFSATTRBIT_FSID: @@ -2786,7 +2875,15 @@ nfsv4_fillattr(struct nfsrv_descript *nd, struct mount *mp, vnode_t vp, retnum += NFSX_UNSIGNED; break; case NFSATTRBIT_FILEHANDLE: - retnum += nfsm_fhtom(NULL, nd, (u_int8_t *)fhp, 0, 0); + siz = 0; + if (vp != NULL) { + irflag = vn_irflag_read(vp); + if ((irflag & VIRF_NAMEDDIR) != 0) + siz = NFSX_FHMAX + 2; + else if ((irflag & VIRF_NAMEDATTR) != 0) + siz = NFSX_FHMAX + 3; + } + retnum += nfsm_fhtom(NULL, nd, (u_int8_t *)fhp, siz, 0); break; case NFSATTRBIT_FILEID: NFSM_BUILD(tl, u_int32_t *, NFSX_HYPER); @@ -2819,6 +2916,14 @@ nfsv4_fillattr(struct nfsrv_descript *nd, struct mount *mp, vnode_t vp, *tl = 0; retnum += 2 * NFSX_UNSIGNED; break; + case NFSATTRBIT_HIDDEN: + NFSM_BUILD(tl, uint32_t *, NFSX_UNSIGNED); + if ((vap->va_flags & UF_HIDDEN) != 0) + *tl = newnfs_true; + else + *tl = newnfs_false; + retnum += NFSX_UNSIGNED; + break; case NFSATTRBIT_HOMOGENEOUS: NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); if (fsinf.fs_properties & NFSV3FSINFO_HOMOGENEOUS) @@ -3008,6 +3113,14 @@ nfsv4_fillattr(struct nfsrv_descript *nd, struct mount *mp, vnode_t vp, 
txdr_hyper(vap->va_bytes, tl); retnum += NFSX_HYPER; break; + case NFSATTRBIT_SYSTEM: + NFSM_BUILD(tl, uint32_t *, NFSX_UNSIGNED); + if ((vap->va_flags & UF_SYSTEM) != 0) + *tl = newnfs_true; + else + *tl = newnfs_false; + retnum += NFSX_UNSIGNED; + break; case NFSATTRBIT_TIMEACCESS: NFSM_BUILD(tl, u_int32_t *, NFSX_V4TIME); txdr_nfsv4time(&vap->va_atime, tl); @@ -3109,6 +3222,33 @@ nfsv4_fillattr(struct nfsrv_descript *nd, struct mount *mp, vnode_t vp, *tl = newnfs_false; retnum += NFSX_UNSIGNED; break; + case NFSATTRBIT_MODEUMASK: + NFSM_BUILD(tl, uint32_t *, 2 * NFSX_UNSIGNED); + /* + * Since FreeBSD applies the umask above the VFS/VOP, + * there is no umask to handle here. If FreeBSD + * moves handling of umask to below the VFS/VOP, + * this could change. + */ + *tl++ = vtonfsv34_mode(vap->va_mode); + *tl = 0; + retnum += 2 * NFSX_UNSIGNED; + break; + case NFSATTRBIT_CHANGEATTRTYPE: + NFSM_BUILD(tl, uint32_t *, NFSX_UNSIGNED); + *tl = txdr_unsigned(NFSV4CHANGETYPE_UNDEFINED); + if (mp != NULL) { + if ((mp->mnt_vfc->vfc_flags & + VFCF_FILEREVINC) != 0) + *tl = txdr_unsigned( + NFSV4CHANGETYPE_VERS_COUNTER_NOPNFS); + else if ((mp->mnt_vfc->vfc_flags & + VFCF_FILEREVCT) != 0) + *tl = txdr_unsigned( + NFSV4CHANGETYPE_TIME_METADATA); + } + retnum += NFSX_UNSIGNED; + break; default: printf("EEK! Bad V4 attribute bitpos=%d\n", bitpos); } @@ -3419,13 +3559,13 @@ tryagain: /* * If an '@' is found and the domain name matches, search for * the name with dns stripped off. - * Mixed case alpahbetics will match for the domain name, but - * all upper case will not. + * The match for alphabetics in now case insensitive, + * since RFC8881 defines this string as a DNS domain name. 
*/ if (cnt == 0 && i < len && i > 0 && (len - 1 - i) == NFSD_VNET(nfsrv_dnsnamelen) && - !nfsrv_cmpmixedcase(cp, - NFSD_VNET(nfsrv_dnsname), NFSD_VNET(nfsrv_dnsnamelen))) { + strncasecmp(cp, NFSD_VNET(nfsrv_dnsname), + NFSD_VNET(nfsrv_dnsnamelen)) == 0) { len -= (NFSD_VNET(nfsrv_dnsnamelen) + 1); *(cp - 1) = '\0'; } @@ -3646,8 +3786,8 @@ tryagain: */ if (cnt == 0 && i < len && i > 0 && (len - 1 - i) == NFSD_VNET(nfsrv_dnsnamelen) && - !nfsrv_cmpmixedcase(cp, - NFSD_VNET(nfsrv_dnsname), NFSD_VNET(nfsrv_dnsnamelen))) { + strncasecmp(cp, NFSD_VNET(nfsrv_dnsname), + NFSD_VNET(nfsrv_dnsnamelen)) == 0) { len -= (NFSD_VNET(nfsrv_dnsnamelen) + 1); *(cp - 1) = '\0'; } @@ -3696,35 +3836,6 @@ out: } /* - * Cmp len chars, allowing mixed case in the first argument to match lower - * case in the second, but not if the first argument is all upper case. - * Return 0 for a match, 1 otherwise. - */ -static int -nfsrv_cmpmixedcase(u_char *cp, u_char *cp2, int len) -{ - int i; - u_char tmp; - int fndlower = 0; - - for (i = 0; i < len; i++) { - if (*cp >= 'A' && *cp <= 'Z') { - tmp = *cp++ + ('a' - 'A'); - } else { - tmp = *cp++; - if (tmp >= 'a' && tmp <= 'z') - fndlower = 1; - } - if (tmp != *cp2++) - return (1); - } - if (fndlower) - return (0); - else - return (1); -} - -/* * Set the port for the nfsuserd. 
*/ int @@ -4032,8 +4143,9 @@ nfssvc_idname(struct nfsd_idargs *nidp) */ cr = crget(); cr->cr_uid = cr->cr_ruid = cr->cr_svuid = nidp->nid_uid; - crsetgroups(cr, nidp->nid_ngroup, grps); - cr->cr_rgid = cr->cr_svgid = cr->cr_groups[0]; + crsetgroups_fallback(cr, nidp->nid_ngroup, grps, + GID_NOGROUP); + cr->cr_rgid = cr->cr_svgid = cr->cr_gid; cr->cr_prison = curthread->td_ucred->cr_prison; prison_hold(cr->cr_prison); #ifdef MAC @@ -4644,7 +4756,7 @@ newnfs_sndlock(int *flagp) ts.tv_sec = 0; ts.tv_nsec = 0; (void) nfsmsleep((caddr_t)flagp, NFSSOCKMUTEXPTR, - PZERO - 1, "nfsndlck", &ts); + PVFS, "nfsndlck", &ts); } *flagp |= NFSR_SNDLOCK; NFSUNLOCKSOCK(); @@ -5025,6 +5137,8 @@ nfsv4_freeslot(struct nfsclsession *sep, int slot, bool resetseq) mtx_lock(&sep->nfsess_mtx); if (resetseq) sep->nfsess_slotseq[slot]--; + else if (slot > sep->nfsess_foreslots) + sep->nfsess_slotseq[slot] = 0; if ((bitval & sep->nfsess_slots) == 0) printf("freeing free slot!!\n"); sep->nfsess_slots &= ~bitval; @@ -5154,3 +5268,46 @@ nfsrpc_destroysession(struct nfsmount *nmp, struct nfsclsession *tsep, m_freem(nd->nd_mrep); return (error); } + +/* + * Translate a vnode type into an NFSv4 type, including the named + * attribute types. + */ +static uint32_t +vtonfsv4_type(struct vattr *vap) +{ + nfstype ntyp; + + if (vap->va_type >= 9) + ntyp = NFNON; + else + ntyp = nfsv34_type[vap->va_type]; + if ((vap->va_bsdflags & SFBSD_NAMEDATTR) != 0) { + if (ntyp == NFDIR) + ntyp = NFATTRDIR; + else if (ntyp == NFREG) + ntyp = NFNAMEDATTR; + } + return (txdr_unsigned((uint32_t)ntyp)); +} + +/* + * Translate an NFS type to a vnode type. 
+ */ +static __enum_uint8(vtype) +nfsv4tov_type(uint32_t ntyp, uint16_t *bsdflags) +{ + __enum_uint8(vtype) vtyp; + + ntyp = fxdr_unsigned(uint32_t, ntyp) % (NFNAMEDATTR + 1); + if (ntyp == NFATTRDIR) { + vtyp = VDIR; + *bsdflags |= SFBSD_NAMEDATTR; + } else if (ntyp == NFNAMEDATTR) { + vtyp = VREG; + *bsdflags |= SFBSD_NAMEDATTR; + } else { + vtyp = nv34tov_type[ntyp]; + } + return (vtyp); +} diff --git a/sys/fs/nfs/nfs_var.h b/sys/fs/nfs/nfs_var.h index 950e0c097457..54f60a753c50 100644 --- a/sys/fs/nfs/nfs_var.h +++ b/sys/fs/nfs/nfs_var.h @@ -169,6 +169,7 @@ int nfsrv_mdscopymr(char *, char *, char *, char *, int *, char *, NFSPROC_T *, struct vnode **, struct vnode **, struct pnfsdsfile **, struct nfsdevice **, struct nfsdevice **); void nfsrv_marknospc(char *, bool); +void nfsrv_removedeleg(fhandle_t *, struct nfsrv_descript *, NFSPROC_T *); /* nfs_nfsdserv.c */ int nfsrvd_access(struct nfsrv_descript *, int, @@ -340,7 +341,7 @@ int nfsv4_loadattr(struct nfsrv_descript *, vnode_t, struct nfsvattr *, struct nfsfh **, fhandle_t *, int, struct nfsv3_pathconf *, struct statfs *, struct nfsstatfs *, struct nfsfsinfo *, NFSACL_T *, - int, int *, u_int32_t *, u_int32_t *, NFSPROC_T *, struct ucred *); + int, int *, u_int32_t *, u_int32_t *, bool *, NFSPROC_T *, struct ucred *); int nfsv4_lock(struct nfsv4lock *, int, int *, struct mtx *, struct mount *); void nfsv4_unlock(struct nfsv4lock *, int); void nfsv4_relref(struct nfsv4lock *); @@ -394,8 +395,9 @@ int nfsrv_putopbit(struct nfsrv_descript *, nfsopbit_t *); void nfsrv_wcc(struct nfsrv_descript *, int, struct nfsvattr *, int, struct nfsvattr *); int nfsv4_fillattr(struct nfsrv_descript *, struct mount *, vnode_t, NFSACL_T *, - struct vattr *, fhandle_t *, int, nfsattrbit_t *, - struct ucred *, NFSPROC_T *, int, int, int, int, uint64_t, struct statfs *); + struct vattr *, fhandle_t *, int, nfsattrbit_t *, struct ucred *, + NFSPROC_T *, int, int, int, int, uint64_t, struct statfs *, bool, bool, + bool); void 
nfsrv_fillattr(struct nfsrv_descript *, struct nfsvattr *); struct mbuf *nfsrv_adj(struct mbuf *, int, int); void nfsrv_postopattr(struct nfsrv_descript *, int, struct nfsvattr *); @@ -438,6 +440,7 @@ int nfs_supportsnfsv4acls(vnode_t); /* nfs_commonacl.c */ int nfsrv_dissectace(struct nfsrv_descript *, struct acl_entry *, bool, int *, int *, NFSPROC_T *); +uint32_t nfs_aceperm(acl_perm_t); int nfsrv_buildacl(struct nfsrv_descript *, NFSACL_T *, __enum_uint8(vtype), NFSPROC_T *); int nfsrv_compareacl(NFSACL_T *, NFSACL_T *); @@ -481,11 +484,13 @@ int nfsrpc_mknod(vnode_t, char *, int, struct vattr *, u_int32_t, int nfsrpc_create(vnode_t, char *, int, struct vattr *, nfsquad_t, int, struct ucred *, NFSPROC_T *, struct nfsvattr *, struct nfsvattr *, struct nfsfh **, int *, int *); -int nfsrpc_remove(vnode_t, char *, int, vnode_t, struct ucred *, NFSPROC_T *, - struct nfsvattr *, int *); -int nfsrpc_rename(vnode_t, vnode_t, char *, int, vnode_t, vnode_t, char *, int, - struct ucred *, NFSPROC_T *, struct nfsvattr *, struct nfsvattr *, - int *, int *); +int nfsrpc_remove(struct vnode *, char *, int, struct vnode *, + struct nfsvattr *, int *, nfsremove_status *, struct nfsvattr *, int *, + struct ucred *, NFSPROC_T *); +int nfsrpc_rename(struct vnode *, struct vnode *, char *, int, struct vnode *, + struct vnode *, char *, int, nfsremove_status *, struct nfsvattr *, + struct nfsvattr *, int *, int *, struct nfsvattr *, int *, struct ucred *, + NFSPROC_T *); int nfsrpc_link(vnode_t, vnode_t, char *, int, struct ucred *, NFSPROC_T *, struct nfsvattr *, struct nfsvattr *, int *, int *); @@ -515,7 +520,7 @@ int nfsrpc_statfs(vnode_t, struct nfsstatfs *, struct nfsfsinfo *, uint32_t *, struct ucred *, NFSPROC_T *, struct nfsvattr *, int *); int nfsrpc_fsinfo(vnode_t, struct nfsfsinfo *, struct ucred *, NFSPROC_T *, struct nfsvattr *, int *); -int nfsrpc_pathconf(vnode_t, struct nfsv3_pathconf *, +int nfsrpc_pathconf(vnode_t, struct nfsv3_pathconf *, bool *, struct ucred *, 
NFSPROC_T *, struct nfsvattr *, int *); int nfsrpc_renew(struct nfsclclient *, struct nfsclds *, struct ucred *, NFSPROC_T *); @@ -568,6 +573,9 @@ int nfsrpc_listextattr(vnode_t, uint64_t *, struct uio *, size_t *, bool *, int nfsrpc_rmextattr(vnode_t, const char *, struct nfsvattr *, int *, struct ucred *, NFSPROC_T *); void nfsrpc_bindconnsess(CLIENT *, void *, struct ucred *); +int nfsrpc_openattr(struct nfsmount *, struct vnode *, uint8_t *, int, + bool, struct ucred *, NFSPROC_T *, struct nfsvattr *, struct nfsfh **, + int *); /* nfs_clstate.c */ int nfscl_open(vnode_t, u_int8_t *, int, u_int32_t, int, @@ -606,12 +614,12 @@ int nfscl_doclose(vnode_t, struct nfsclclient **, NFSPROC_T *); int nfsrpc_doclose(struct nfsmount *, struct nfsclopen *, NFSPROC_T *, bool, bool); int nfscl_deleg(mount_t, struct nfsclclient *, u_int8_t *, int, - struct ucred *, NFSPROC_T *, struct nfscldeleg **); + struct ucred *, NFSPROC_T *, struct nfscldeleg *); void nfscl_lockinit(struct nfsv4lock *); void nfscl_lockexcl(struct nfsv4lock *, void *); void nfscl_lockunlock(struct nfsv4lock *); void nfscl_lockderef(struct nfsv4lock *); -void nfscl_delegreturnvp(vnode_t, NFSPROC_T *); +void nfscl_delegreturnvp(struct vnode *, bool, NFSPROC_T *); void nfscl_docb(struct nfsrv_descript *, NFSPROC_T *); void nfscl_releasealllocks(struct nfsclclient *, vnode_t, NFSPROC_T *, void *, int); @@ -626,7 +634,7 @@ int nfscl_renamedeleg(vnode_t, nfsv4stateid_t *, int *, vnode_t, nfsv4stateid_t *, int *, NFSPROC_T *); void nfscl_reclaimnode(vnode_t); void nfscl_newnode(vnode_t); -void nfscl_delegmodtime(vnode_t); +void nfscl_delegmodtime(struct vnode *, struct timespec *); void nfscl_deleggetmodtime(vnode_t, struct timespec *); int nfscl_trydelegreturn(struct nfscldeleg *, struct ucred *, struct nfsmount *, NFSPROC_T *); @@ -651,6 +659,8 @@ void nfscl_freelayout(struct nfscllayout *); void nfscl_freeflayout(struct nfsclflayout *); void nfscl_freedevinfo(struct nfscldevinfo *); int 
nfscl_layoutcommit(vnode_t, NFSPROC_T *); +int nfscl_delegacecheck(struct vnode *, accmode_t, struct ucred *); +void nfscl_startdelegrecall(struct nfsclclient *, struct nfsfh *); /* nfs_clport.c */ int nfscl_nget(mount_t, vnode_t, struct nfsfh *, @@ -707,12 +717,12 @@ int nfsvno_symlink(struct nameidata *, struct nfsvattr *, char *, int, int, uid_t, struct ucred *, NFSPROC_T *, struct nfsexstuff *); int nfsvno_getsymlink(struct nfsrv_descript *, struct nfsvattr *, NFSPROC_T *, char **, int *); -int nfsvno_removesub(struct nameidata *, int, struct ucred *, NFSPROC_T *, - struct nfsexstuff *); +int nfsvno_removesub(struct nameidata *, bool, struct nfsrv_descript *, + NFSPROC_T *, struct nfsexstuff *); int nfsvno_rmdirsub(struct nameidata *, int, struct ucred *, NFSPROC_T *, struct nfsexstuff *); -int nfsvno_rename(struct nameidata *, struct nameidata *, u_int32_t, - u_int32_t, struct ucred *, NFSPROC_T *); +int nfsvno_rename(struct nameidata *, struct nameidata *, + struct nfsrv_descript *, NFSPROC_T *); int nfsvno_link(struct nameidata *, vnode_t, nfsquad_t, struct ucred *, NFSPROC_T *, struct nfsexstuff *); int nfsvno_fsync(vnode_t, u_int64_t, int, struct ucred *, NFSPROC_T *); @@ -726,7 +736,8 @@ int nfsvno_updfilerev(vnode_t, struct nfsvattr *, struct nfsrv_descript *, NFSPROC_T *); int nfsvno_fillattr(struct nfsrv_descript *, struct mount *, vnode_t, struct nfsvattr *, fhandle_t *, int, nfsattrbit_t *, - struct ucred *, NFSPROC_T *, int, int, int, int, uint64_t); + struct ucred *, NFSPROC_T *, int, int, int, int, uint64_t, bool, bool, + bool); int nfsrv_sattr(struct nfsrv_descript *, vnode_t, struct nfsvattr *, nfsattrbit_t *, NFSACL_T *, NFSPROC_T *); int nfsv4_sattr(struct nfsrv_descript *, vnode_t, struct nfsvattr *, nfsattrbit_t *, @@ -780,6 +791,7 @@ int newnfs_request(struct nfsrv_descript *, struct nfsmount *, struct nfsclient *, struct nfssockreq *, vnode_t, NFSPROC_T *, struct ucred *, u_int32_t, u_int32_t, u_char *, int, u_int64_t *, struct 
nfsclsession *); +void nfs_resetslots(struct nfsclsession *); int newnfs_connect(struct nfsmount *, struct nfssockreq *, struct ucred *, NFSPROC_T *, int, bool, struct __rpc_client **); void newnfs_disconnect(struct nfsmount *, struct nfssockreq *); diff --git a/sys/fs/nfs/nfscl.h b/sys/fs/nfs/nfscl.h index a52b9e433145..3b1445e1923c 100644 --- a/sys/fs/nfs/nfscl.h +++ b/sys/fs/nfs/nfscl.h @@ -68,10 +68,11 @@ struct nfsv4node { * These flag bits are used for the argument to nfscl_fillsattr() to * indicate special handling of the attributes. */ -#define NFSSATTR_FULL 0x1 -#define NFSSATTR_SIZE0 0x2 -#define NFSSATTR_SIZENEG1 0x4 -#define NFSSATTR_SIZERDEV 0x8 +#define NFSSATTR_FULL 0x01 +#define NFSSATTR_SIZE0 0x02 +#define NFSSATTR_SIZENEG1 0x04 +#define NFSSATTR_SIZERDEV 0x08 +#define NFSSATTR_NEWFILE 0x10 /* Use this macro for debug printfs. */ #define NFSCL_DEBUG(level, ...) do { \ diff --git a/sys/fs/nfs/nfsclstate.h b/sys/fs/nfs/nfsclstate.h index d9f5ed13b54f..92669ff8d1aa 100644 --- a/sys/fs/nfs/nfsclstate.h +++ b/sys/fs/nfs/nfsclstate.h @@ -116,6 +116,10 @@ struct nfsclclient { struct proc *nfsc_renewthread; struct nfsmount *nfsc_nmp; time_t nfsc_expire; + int nfsc_delegcnt; + int nfsc_deleghighwater; + int nfsc_layoutcnt; + int nfsc_layouthighwater; u_int32_t nfsc_clientidrev; u_int32_t nfsc_rev; u_int32_t nfsc_renew; diff --git a/sys/fs/nfs/nfsport.h b/sys/fs/nfs/nfsport.h index 0b16ba9b85a8..c30b46261df0 100644 --- a/sys/fs/nfs/nfsport.h +++ b/sys/fs/nfs/nfsport.h @@ -439,10 +439,13 @@ /* Do an NFSv4 Verify+Write. */ #define NFSPROC_APPENDWRITE 69 +/* Do a NFSv4 Openattr. */ +#define NFSPROC_OPENATTR 70 + /* * Must be defined as one higher than the last NFSv4.2 Proc# above. */ -#define NFSV42_NPROCS 70 +#define NFSV42_NPROCS 71 /* Value of NFSV42_NPROCS for old nfsstats structure. 
(Always 69) */ #define NFSV42_OLDNPROCS 69 @@ -474,7 +477,7 @@ struct nfsstatsv1 { uint64_t readlink_bios; uint64_t biocache_readdirs; uint64_t readdir_bios; - uint64_t rpccnt[NFSV42_NPROCS + 10]; + uint64_t rpccnt[NFSV42_NPROCS + 9]; uint64_t rpcretries; uint64_t srvrpccnt[NFSV42_NOPS + NFSV4OP_FAKENOPS + 15]; uint64_t srvlayouts; @@ -690,6 +693,7 @@ struct nfsvattr { #define na_bytes na_vattr.va_bytes #define na_filerev na_vattr.va_filerev #define na_vaflags na_vattr.va_vaflags +#define na_bsdflags na_vattr.va_bsdflags #include <fs/nfsclient/nfsnode.h> @@ -1180,9 +1184,11 @@ struct nfsreq { */ #ifdef VV_DISABLEDELEG #define NFSVNO_DELEGOK(v) \ - ((v) == NULL || ((v)->v_vflag & VV_DISABLEDELEG) == 0) + ((v) == NULL || ((v)->v_vflag & VV_DISABLEDELEG) == 0 || \ + (vn_irflag_read(v) & VIRF_NAMEDATTR) == 0) #else -#define NFSVNO_DELEGOK(v) (1) +#define NFSVNO_DELEGOK(v) \ + ((v) == NULL || (vn_irflag_read(v) & VIRF_NAMEDATTR) == 0) #endif /* diff --git a/sys/fs/nfs/nfsproto.h b/sys/fs/nfs/nfsproto.h index cef886755d5a..cb5a80e8df73 100644 --- a/sys/fs/nfs/nfsproto.h +++ b/sys/fs/nfs/nfsproto.h @@ -275,6 +275,8 @@ #define NFSX_V4SESSIONID 16 #define NFSX_V4DEVICEID 16 #define NFSX_V4PNFSFH (sizeof(fhandle_t) + 1) +#define NFSX_V4NAMEDDIRFH 2 +#define NFSX_V4NAMEDATTRFH 3 #define NFSX_V4FILELAYOUT (4 * NFSX_UNSIGNED + NFSX_V4DEVICEID + \ NFSX_HYPER + NFSM_RNDUP(NFSX_V4PNFSFH)) #define NFSX_V4FLEXLAYOUT(m) (NFSX_HYPER + 3 * NFSX_UNSIGNED + \ @@ -406,10 +408,13 @@ /* Do an NFSv4 Verify+Write. */ #define NFSPROC_APPENDWRITE 69 +/* Do a NFSv4 Openattr. */ +#define NFSPROC_OPENATTR 70 + /* * Must be defined as one higher than the last NFSv4.2 Proc# above. */ -#define NFSV42_NPROCS 70 +#define NFSV42_NPROCS 71 /* Value of NFSV42_NPROCS for old nfsstats structure. 
(Always 69) */ #define NFSV42_OLDNPROCS 69 @@ -619,6 +624,8 @@ #define NFSV4OPEN_WDCONTENTION 0x00100000 #define NFSV4OPEN_WDNOTWANTED 0x00200000 #define NFSV4OPEN_WDSUPPFTYPE 0x00400000 +#define NFSV4OPEN_WDNOTSUPPDOWNGRADE 0x00800000 +#define NFSV4OPEN_WDNOTSUPPUPGRADE 0x01000000 /* * NFS V4 File Handle types @@ -742,6 +749,17 @@ #define NFSSECINFONONAME_CURFH 0 #define NFSSECINFONONAME_PARENT 1 +/* Bits for CB_RECALL_ANY. */ +#define NFSRCA4_RDATA_DLG 0x00000001 +#define NFSRCA4_WDATA_DLG 0x00000002 +#define NFSRCA4_DIR_DLG 0x00000004 +#define NFSRCA4_FILE_LAYOUT 0x00000008 +#define NFSRCA4_BLK_LAYOUT 0x00000010 +#define NFSRCA4_OBJ_LAYOUT_MIN 0x00000100 +#define NFSRCA4_OBJ_LAYOUT_MAX 0x00000200 +#define NFSRCA4_FF_LAYOUT_READ 0x00010000 +#define NFSRCA4_FF_LAYOUT_RW 0x00020000 + #if defined(_KERNEL) || defined(KERNEL) /* Conversion macros */ #define vtonfsv2_mode(t,m) \ @@ -1002,7 +1020,7 @@ struct nfsv3_sattr { #define NFSATTRBIT_SPACEFREED 78 #define NFSATTRBIT_CHANGEATTRTYPE 79 #define NFSATTRBIT_SECLABEL 80 -/* Not sure what attribute bit #81 is? */ +#define NFSATTRBIT_MODEUMASK 81 #define NFSATTRBIT_XATTRSUPPORT 82 #define NFSATTRBM_SUPPORTEDATTRS 0x00000001 @@ -1086,7 +1104,7 @@ struct nfsv3_sattr { #define NFSATTRBM_SPACEFREED 0x00004000 #define NFSATTRBM_CHANGEATTRTYPE 0x00008000 #define NFSATTRBM_SECLABEL 0x00010000 -/* Not sure what attribute bit#81/0x00020000 is? 
*/ +#define NFSATTRBM_MODEUMASK 0x00020000 #define NFSATTRBM_XATTRSUPPORT 0x00040000 #define NFSATTRBIT_MAX 83 @@ -1124,6 +1142,7 @@ struct nfsv3_sattr { NFSATTRBM_FILESFREE | \ NFSATTRBM_FILESTOTAL | \ NFSATTRBM_FSLOCATIONS | \ + NFSATTRBM_HIDDEN | \ NFSATTRBM_HOMOGENEOUS | \ NFSATTRBM_MAXFILESIZE | \ NFSATTRBM_MAXLINK | \ @@ -1145,6 +1164,7 @@ struct nfsv3_sattr { NFSATTRBM_SPACEFREE | \ NFSATTRBM_SPACETOTAL | \ NFSATTRBM_SPACEUSED | \ + NFSATTRBM_SYSTEM | \ NFSATTRBM_TIMEACCESS | \ NFSATTRBM_TIMECREATE | \ NFSATTRBM_TIMEDELTA | \ @@ -1174,6 +1194,7 @@ struct nfsv3_sattr { NFSATTRBM_LAYOUTBLKSIZE | \ NFSATTRBM_LAYOUTALIGNMENT | \ NFSATTRBM_SUPPATTREXCLCREAT | \ + NFSATTRBM_CHANGEATTRTYPE | \ NFSATTRBM_XATTRSUPPORT) /* @@ -1181,7 +1202,8 @@ struct nfsv3_sattr { */ #define NFSATTRBIT_SUPPSETONLY1 (NFSATTRBM_TIMEACCESSSET | \ NFSATTRBM_TIMEMODIFYSET) -#define NFSATTRBIT_SUPPSETONLY2 (NFSATTRBM_MODESETMASKED) +#define NFSATTRBIT_SUPPSETONLY2 (NFSATTRBM_MODESETMASKED | \ + NFSATTRBM_MODEUMASK) /* * NFSATTRBIT_SETABLE - SETABLE0 - bits 0<->31 @@ -1190,16 +1212,19 @@ struct nfsv3_sattr { */ #define NFSATTRBIT_SETABLE0 \ (NFSATTRBM_SIZE | \ + NFSATTRBM_HIDDEN | \ NFSATTRBM_ACL) #define NFSATTRBIT_SETABLE1 \ (NFSATTRBM_MODE | \ NFSATTRBM_OWNER | \ NFSATTRBM_OWNERGROUP | \ - NFSATTRBM_TIMECREATE | \ + NFSATTRBM_SYSTEM | \ + NFSATTRBM_TIMECREATE | \ NFSATTRBM_TIMEACCESSSET | \ NFSATTRBM_TIMEMODIFYSET) #define NFSATTRBIT_SETABLE2 \ - (NFSATTRBM_MODESETMASKED) + (NFSATTRBM_MODESETMASKED | \ + NFSATTRBM_MODEUMASK) /* * NFSATTRBIT_NFSV41 - Attributes only supported by NFSv4.1. @@ -1216,7 +1241,10 @@ struct nfsv3_sattr { /* * NFSATTRBIT_NFSV42 - Attributes only supported by NFSv4.2. */ -#define NFSATTRBIT_NFSV42_2 NFSATTRBM_XATTRSUPPORT +#define NFSATTRBIT_NFSV42_2 \ + (NFSATTRBM_CHANGEATTRTYPE | \ + NFSATTRBM_XATTRSUPPORT | \ + NFSATTRBM_MODEUMASK) /* * Set of attributes that the getattr vnode op needs. 
@@ -1230,6 +1258,7 @@ struct nfsv3_sattr { NFSATTRBM_SIZE | \ NFSATTRBM_FSID | \ NFSATTRBM_FILEID | \ + NFSATTRBM_HIDDEN | \ NFSATTRBM_MAXREAD) /* @@ -1242,6 +1271,7 @@ struct nfsv3_sattr { NFSATTRBM_OWNERGROUP | \ NFSATTRBM_RAWDEV | \ NFSATTRBM_SPACEUSED | \ + NFSATTRBM_SYSTEM | \ NFSATTRBM_TIMEACCESS | \ NFSATTRBM_TIMECREATE | \ NFSATTRBM_TIMEMETADATA | \ @@ -1264,6 +1294,7 @@ struct nfsv3_sattr { NFSATTRBM_SIZE | \ NFSATTRBM_FSID | \ NFSATTRBM_FILEID | \ + NFSATTRBM_HIDDEN | \ NFSATTRBM_MAXREAD) /* @@ -1274,6 +1305,7 @@ struct nfsv3_sattr { NFSATTRBM_NUMLINKS | \ NFSATTRBM_RAWDEV | \ NFSATTRBM_SPACEUSED | \ + NFSATTRBM_SYSTEM | \ NFSATTRBM_TIMEACCESS | \ NFSATTRBM_TIMECREATE | \ NFSATTRBM_TIMEMETADATA | \ @@ -1390,6 +1422,7 @@ struct nfsv3_sattr { * NFSGETATTRBIT_PATHCONF0 - bits 0<->31 */ #define NFSGETATTRBIT_PATHCONF0 (NFSATTRBIT_GETATTR0 | \ + NFSATTRBM_NAMEDATTR | \ NFSATTRBM_CASEINSENSITIVE | \ NFSATTRBM_CASEPRESERVING | \ NFSATTRBM_CHOWNRESTRICTED | \ @@ -1651,4 +1684,11 @@ typedef struct nfsv4stateid nfsv4stateid_t; #define NFSV4SXATTR_CREATE 1 #define NFSV4SXATTR_REPLACE 2 +/* Values for ChangeAttrType (RFC-7862). 
*/ +#define NFSV4CHANGETYPE_MONOTONIC_INCR 0 +#define NFSV4CHANGETYPE_VERS_COUNTER 1 +#define NFSV4CHANGETYPE_VERS_COUNTER_NOPNFS 2 +#define NFSV4CHANGETYPE_TIME_METADATA 3 +#define NFSV4CHANGETYPE_UNDEFINED 4 + #endif /* _NFS_NFSPROTO_H_ */ diff --git a/sys/fs/nfs/nfsrvstate.h b/sys/fs/nfs/nfsrvstate.h index da214ae9d4e9..cc19ed6fa1d2 100644 --- a/sys/fs/nfs/nfsrvstate.h +++ b/sys/fs/nfs/nfsrvstate.h @@ -333,7 +333,7 @@ struct nfsf_rec { u_int32_t numboots; /* Number of boottimes */ }; -void nfsrv_cleanclient(struct nfsclient *, NFSPROC_T *); +void nfsrv_cleanclient(struct nfsclient *, NFSPROC_T *, bool, SVCXPRT **); void nfsrv_freedeleglist(struct nfsstatehead *); /* diff --git a/sys/fs/nfsclient/nfs_clbio.c b/sys/fs/nfsclient/nfs_clbio.c index c691e797aa01..e181bf593e23 100644 --- a/sys/fs/nfsclient/nfs_clbio.c +++ b/sys/fs/nfsclient/nfs_clbio.c @@ -366,7 +366,7 @@ nfs_bioread_check_cons(struct vnode *vp, struct thread *td, struct ucred *cred) bool old_lock; /* - * Ensure the exclusove access to the node before checking + * Ensure the exclusive access to the node before checking * whether the cache is consistent. 
*/ old_lock = ncl_excl_start(vp); diff --git a/sys/fs/nfsclient/nfs_clcomsubs.c b/sys/fs/nfsclient/nfs_clcomsubs.c index 270f39d03c90..bca0bdcd0df1 100644 --- a/sys/fs/nfsclient/nfs_clcomsubs.c +++ b/sys/fs/nfsclient/nfs_clcomsubs.c @@ -271,7 +271,8 @@ nfsm_loadattr(struct nfsrv_descript *nd, struct nfsvattr *nap) if (nd->nd_flag & ND_NFSV4) { error = nfsv4_loadattr(nd, NULL, nap, NULL, NULL, 0, NULL, - NULL, NULL, NULL, NULL, 0, NULL, NULL, NULL, NULL, NULL); + NULL, NULL, NULL, NULL, 0, NULL, NULL, NULL, NULL, NULL, + NULL); } else if (nd->nd_flag & ND_NFSV3) { NFSM_DISSECT(fp, struct nfs_fattr *, NFSX_V3FATTR); nap->na_type = nfsv34tov_type(fp->fa_type); diff --git a/sys/fs/nfsclient/nfs_clnode.c b/sys/fs/nfsclient/nfs_clnode.c index be2024730cf0..f85f961d424e 100644 --- a/sys/fs/nfsclient/nfs_clnode.c +++ b/sys/fs/nfsclient/nfs_clnode.c @@ -205,7 +205,7 @@ nfs_freesillyrename(void *arg, __unused int pending) } static void -ncl_releasesillyrename(struct vnode *vp, struct thread *td) +ncl_releasesillyrename(struct vnode *vp, bool flushed, struct thread *td) { struct nfsnode *np; struct sillyrename *sp; @@ -220,7 +220,8 @@ ncl_releasesillyrename(struct vnode *vp, struct thread *td) sp = NULL; if (sp != NULL) { NFSUNLOCKNODE(np); - (void) ncl_vinvalbuf(vp, 0, td, 1); + if (flushed) + (void)ncl_vinvalbuf(vp, 0, td, 1); /* * Remove the silly file that was rename'd earlier */ @@ -238,9 +239,13 @@ ncl_inactive(struct vop_inactive_args *ap) struct vnode *vp = ap->a_vp; struct nfsnode *np; struct thread *td; + struct nfsmount *nmp; + bool flushed; td = curthread; np = VTONFS(vp); + nmp = VFSTONFS(vp->v_mount); + flushed = true; if (NFS_ISV4(vp) && vp->v_type == VREG) { NFSLOCKNODE(np); np->n_openstateid = NULL; @@ -251,13 +256,18 @@ ncl_inactive(struct vop_inactive_args *ap) * buffers/pages must be flushed before the close, so that the * stateid is available for the writes. 
*/ - vnode_pager_clean_sync(vp); - (void)ncl_flush(vp, MNT_WAIT, td, 1, 0); + if ((nmp->nm_flag & NFSMNT_NOCTO) == 0 || !NFSHASNFSV4N(nmp) || + nfscl_mustflush(vp) != 0) { + vnode_pager_clean_sync(vp); + (void)ncl_flush(vp, MNT_WAIT, td, 1, 0); + } else { + flushed = false; + } (void)nfsrpc_close(vp, 1, td); } NFSLOCKNODE(np); - ncl_releasesillyrename(vp, td); + ncl_releasesillyrename(vp, flushed, td); /* * NMODIFIED means that there might be dirty/stale buffers @@ -294,7 +304,7 @@ ncl_reclaim(struct vop_reclaim_args *ap) nfs_reclaim_p(ap); NFSLOCKNODE(np); - ncl_releasesillyrename(vp, td); + ncl_releasesillyrename(vp, true, td); if (NFS_ISV4(vp) && vp->v_type == VREG) { np->n_openstateid = NULL; @@ -315,7 +325,7 @@ ncl_reclaim(struct vop_reclaim_args *ap) MNT_ILOCK(mp); if ((mp->mnt_kern_flag & MNTK_UNMOUNTF) == 0) { MNT_IUNLOCK(mp); - nfscl_delegreturnvp(vp, td); + nfscl_delegreturnvp(vp, true, td); } else MNT_IUNLOCK(mp); } else diff --git a/sys/fs/nfsclient/nfs_clport.c b/sys/fs/nfsclient/nfs_clport.c index 4e3a699fb170..b25d967982a1 100644 --- a/sys/fs/nfsclient/nfs_clport.c +++ b/sys/fs/nfsclient/nfs_clport.c @@ -828,7 +828,7 @@ nfscl_wcc_data(struct nfsrv_descript *nd, struct vnode *vp, == (ND_NFSV4 | ND_V4WCCATTR)) { error = nfsv4_loadattr(nd, NULL, &nfsva, NULL, NULL, 0, NULL, NULL, NULL, NULL, NULL, 0, - NULL, NULL, NULL, NULL, NULL); + NULL, NULL, NULL, NULL, NULL, NULL); if (error) return (error); /* @@ -1489,3 +1489,4 @@ MODULE_DEPEND(nfscl, nfscommon, 1, 1, 1); MODULE_DEPEND(nfscl, krpc, 1, 1, 1); MODULE_DEPEND(nfscl, nfssvc, 1, 1, 1); MODULE_DEPEND(nfscl, xdr, 1, 1, 1); +MODULE_DEPEND(nfscl, acl_nfs4, 1, 1, 1); diff --git a/sys/fs/nfsclient/nfs_clrpcops.c b/sys/fs/nfsclient/nfs_clrpcops.c index 8c5532268287..2f3c59b68518 100644 --- a/sys/fs/nfsclient/nfs_clrpcops.c +++ b/sys/fs/nfsclient/nfs_clrpcops.c @@ -142,6 +142,7 @@ static int nfsrpc_createv4(vnode_t , char *, int, struct vattr *, nfsquad_t, int, struct nfsclowner *, struct nfscldeleg **, 
struct ucred *, NFSPROC_T *, struct nfsvattr *, struct nfsvattr *, struct nfsfh **, int *, int *, int *); +static bool nfscl_invalidfname(bool, char *, int); static int nfsrpc_locku(struct nfsrv_descript *, struct nfsmount *, struct nfscllockowner *, u_int64_t, u_int64_t, u_int32_t, struct ucred *, NFSPROC_T *, int); @@ -389,13 +390,25 @@ nfsrpc_open(vnode_t vp, int amode, struct ucred *cred, NFSPROC_T *p) mode |= NFSV4OPEN_ACCESSREAD; if (amode & FWRITE) mode |= NFSV4OPEN_ACCESSWRITE; + if (NFSHASNFSV4N(nmp)) { + if (!NFSHASPNFS(nmp) && nfscl_enablecallb != 0 && + nfs_numnfscbd > 0 && + (vn_irflag_read(vp) & VIRF_NAMEDATTR) == 0) { + if ((mode & NFSV4OPEN_ACCESSWRITE) != 0) + mode |= NFSV4OPEN_WANTWRITEDELEG; + else + mode |= NFSV4OPEN_WANTANYDELEG; + } else + mode |= NFSV4OPEN_WANTNODELEG; + } nfhp = np->n_fhp; retrycnt = 0; do { dp = NULL; - error = nfscl_open(vp, nfhp->nfh_fh, nfhp->nfh_len, mode, 1, - cred, p, NULL, &op, &newone, &ret, 1, true); + error = nfscl_open(vp, nfhp->nfh_fh, nfhp->nfh_len, + (mode & NFSV4OPEN_ACCESSBOTH), 1, cred, p, NULL, + &op, &newone, &ret, 1, true); if (error) { return (error); } @@ -440,7 +453,7 @@ nfsrpc_open(vnode_t vp, int amode, struct ucred *cred, NFSPROC_T *p) NFSUNLOCKNODE(np); (void) nfscl_deleg(nmp->nm_mountp, op->nfso_own->nfsow_clp, - nfhp->nfh_fh, nfhp->nfh_len, cred, p, &dp); + nfhp->nfh_fh, nfhp->nfh_len, cred, p, dp); } } else if (NFSHASNFSV4N(nmp)) { /* @@ -473,7 +486,7 @@ nfsrpc_open(vnode_t vp, int amode, struct ucred *cred, NFSPROC_T *p) NFSUNLOCKNODE(np); (void) nfscl_deleg(nmp->nm_mountp, op->nfso_own->nfsow_clp, - nfhp->nfh_fh, nfhp->nfh_len, cred, p, &dp); + nfhp->nfh_fh, nfhp->nfh_len, cred, p, dp); } } else { error = EIO; @@ -547,7 +560,8 @@ nfsrpc_openrpc(struct nfsmount *nmp, vnode_t vp, u_int8_t *nfhp, int fhlen, cred); NFSM_BUILD(tl, u_int32_t *, 5 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(op->nfso_own->nfsow_seqid); - *tl++ = txdr_unsigned(mode & NFSV4OPEN_ACCESSBOTH); + *tl++ = txdr_unsigned(mode & 
(NFSV4OPEN_ACCESSBOTH | + NFSV4OPEN_WANTDELEGMASK)); *tl++ = txdr_unsigned((mode >> NFSLCK_SHIFT) & NFSV4OPEN_DENYBOTH); tsep = nfsmnt_mdssession(nmp); *tl++ = tsep->nfsess_clientid.lval[0]; @@ -664,6 +678,13 @@ nfsrpc_openrpc(struct nfsmount *nmp, vnode_t vp, u_int8_t *nfhp, int fhlen, &ret, &acesize, p); if (error) goto nfsmout; + } else if (deleg == NFSV4OPEN_DELEGATENONEEXT && + NFSHASNFSV4N(nmp)) { + NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED); + deleg = fxdr_unsigned(uint32_t, *tl); + if (deleg == NFSV4OPEN_CONTENTION || + deleg == NFSV4OPEN_RESOURCE) + NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED); } else if (deleg != NFSV4OPEN_DELEGATENONE) { error = NFSERR_BADXDR; goto nfsmout; @@ -675,7 +696,7 @@ nfsrpc_openrpc(struct nfsmount *nmp, vnode_t vp, u_int8_t *nfhp, int fhlen, ("nfsrpc_openrpc: Getattr repstat")); error = nfsv4_loadattr(nd, NULL, &nfsva, NULL, NULL, 0, NULL, NULL, NULL, NULL, NULL, 0, - NULL, NULL, NULL, p, cred); + NULL, NULL, NULL, NULL, p, cred); if (error) goto nfsmout; } @@ -1334,7 +1355,7 @@ nfsrpc_getattrnovp(struct nfsmount *nmp, u_int8_t *fhp, int fhlen, int syscred, if ((nd->nd_flag & ND_NFSV4) != 0) error = nfsv4_loadattr(nd, NULL, nap, NULL, NULL, 0, NULL, NULL, NULL, NULL, NULL, 0, NULL, leasep, NULL, - NULL, NULL); + NULL, NULL, NULL); else error = nfsm_loadattr(nd, nap); } else @@ -1546,7 +1567,7 @@ nfsrpc_lookup(vnode_t dvp, char *name, int len, struct ucred *cred, NFSM_BUILD(tl, uint32_t *, 6 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(NFSV4OP_OPEN); *tl++ = 0; /* seqid, ignored. */ - *tl++ = txdr_unsigned(openmode); + *tl++ = txdr_unsigned(openmode | NFSV4OPEN_WANTNODELEG); *tl++ = txdr_unsigned(NFSV4OPEN_DENYNONE); *tl++ = 0; /* ClientID, ignored. 
*/ *tl = 0; @@ -1668,6 +1689,13 @@ nfsrpc_lookup(vnode_t dvp, char *name, int len, struct ucred *cred, ndp->nfsdl_stateid.other[0] = *tl++; ndp->nfsdl_stateid.other[1] = *tl++; ndp->nfsdl_stateid.other[2] = *tl++; + } else if (deleg == NFSV4OPEN_DELEGATENONEEXT && + NFSHASNFSV4N(nmp)) { + NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED); + deleg = fxdr_unsigned(uint32_t, *tl); + if (deleg == NFSV4OPEN_CONTENTION || + deleg == NFSV4OPEN_RESOURCE) + NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED); } else if (deleg != NFSV4OPEN_DELEGATENONE) { error = NFSERR_BADXDR; goto nfsmout; @@ -2396,7 +2424,7 @@ nfsrpc_mknod(vnode_t dvp, char *name, int namelen, struct vattr *vap, *tl = vtonfsv34_type(vtyp); } if (nd->nd_flag & (ND_NFSV3 | ND_NFSV4)) - nfscl_fillsattr(nd, vap, dvp, 0, 0); + nfscl_fillsattr(nd, vap, dvp, NFSSATTR_NEWFILE, 0); if ((nd->nd_flag & ND_NFSV3) && (vtyp == VCHR || vtyp == VBLK)) { NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED); @@ -2484,7 +2512,7 @@ nfsrpc_create(vnode_t dvp, char *name, int namelen, struct vattr *vap, */ if (dp != NULL) (void) nfscl_deleg(nmp->nm_mountp, owp->nfsow_clp, - (*nfhpp)->nfh_fh, (*nfhpp)->nfh_len, cred, p, &dp); + (*nfhpp)->nfh_fh, (*nfhpp)->nfh_len, cred, p, dp); nfscl_ownerrelease(nmp, owp, error, newone, unlocked); if (error == NFSERR_GRACE || error == NFSERR_STALECLIENTID || error == NFSERR_STALEDONTRECOVER || error == NFSERR_DELAY || @@ -2595,8 +2623,17 @@ nfsrpc_createv4(vnode_t dvp, char *name, int namelen, struct vattr *vap, */ NFSM_BUILD(tl, u_int32_t *, 5 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(owp->nfsow_seqid); - *tl++ = txdr_unsigned(NFSV4OPEN_ACCESSWRITE | - NFSV4OPEN_ACCESSREAD); + if (NFSHASNFSV4N(nmp)) { + if (!NFSHASPNFS(nmp) && nfscl_enablecallb != 0 && + nfs_numnfscbd > 0) + *tl++ = txdr_unsigned(NFSV4OPEN_ACCESSWRITE | + NFSV4OPEN_ACCESSREAD | NFSV4OPEN_WANTWRITEDELEG); + else + *tl++ = txdr_unsigned(NFSV4OPEN_ACCESSWRITE | + NFSV4OPEN_ACCESSREAD | NFSV4OPEN_WANTNODELEG); + } else + *tl++ = 
txdr_unsigned(NFSV4OPEN_ACCESSWRITE | + NFSV4OPEN_ACCESSREAD); *tl++ = txdr_unsigned(NFSV4OPEN_DENYNONE); tsep = nfsmnt_mdssession(nmp); *tl++ = tsep->nfsess_clientid.lval[0]; @@ -2609,14 +2646,16 @@ nfsrpc_createv4(vnode_t dvp, char *name, int namelen, struct vattr *vap, if (NFSHASSESSPERSIST(nmp)) { /* Use GUARDED for persistent sessions. */ *tl = txdr_unsigned(NFSCREATE_GUARDED); - nfscl_fillsattr(nd, vap, dvp, 0, 0); + nfscl_fillsattr(nd, vap, dvp, NFSSATTR_NEWFILE, + 0); } else { /* Otherwise, use EXCLUSIVE4_1. */ *tl = txdr_unsigned(NFSCREATE_EXCLUSIVE41); NFSM_BUILD(tl, u_int32_t *, NFSX_VERF); *tl++ = cverf.lval[0]; *tl = cverf.lval[1]; - nfscl_fillsattr(nd, vap, dvp, 0, 0); + nfscl_fillsattr(nd, vap, dvp, NFSSATTR_NEWFILE, + 0); } } else { /* NFSv4.0 */ @@ -2627,7 +2666,7 @@ nfsrpc_createv4(vnode_t dvp, char *name, int namelen, struct vattr *vap, } } else { *tl = txdr_unsigned(NFSCREATE_UNCHECKED); - nfscl_fillsattr(nd, vap, dvp, 0, 0); + nfscl_fillsattr(nd, vap, dvp, NFSSATTR_NEWFILE, 0); } NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OPEN_CLAIMNULL); @@ -2714,6 +2753,13 @@ nfsrpc_createv4(vnode_t dvp, char *name, int namelen, struct vattr *vap, &ret, &acesize, p); if (error) goto nfsmout; + } else if (deleg == NFSV4OPEN_DELEGATENONEEXT && + NFSHASNFSV4N(nmp)) { + NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED); + deleg = fxdr_unsigned(uint32_t, *tl); + if (deleg == NFSV4OPEN_CONTENTION || + deleg == NFSV4OPEN_RESOURCE) + NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED); } else if (deleg != NFSV4OPEN_DELEGATENONE) { error = NFSERR_BADXDR; goto nfsmout; @@ -2813,22 +2859,28 @@ nfsmout: * Nfs remove rpc */ int -nfsrpc_remove(vnode_t dvp, char *name, int namelen, vnode_t vp, - struct ucred *cred, NFSPROC_T *p, struct nfsvattr *dnap, int *dattrflagp) +nfsrpc_remove(struct vnode *dvp, char *name, int namelen, struct vnode *vp, + struct nfsvattr *nap, int *attrflagp, nfsremove_status *file_status, + struct nfsvattr *dnap, int *dattrflagp, struct 
ucred *cred, NFSPROC_T *p) { - u_int32_t *tl; + uint32_t *tl; struct nfsrv_descript nfsd, *nd = &nfsd; struct nfsnode *np; struct nfsmount *nmp; nfsv4stateid_t dstateid; - int error, ret = 0, i; + nfsattrbit_t attrbits; + int error, i, ret; *dattrflagp = 0; + *attrflagp = 0; + *file_status = UNKNOWN; + ret = 0; if (namelen > NFS_MAXNAMLEN) return (ENAMETOOLONG); nmp = VFSTONFS(dvp->v_mount); tryagain: - if (NFSHASNFSV4(nmp) && ret == 0) { + if (NFSHASNFSV4(nmp) && ((nmp->nm_flag & NFSMNT_NOCTO) == 0 || + !NFSHASNFSV4N(nmp)) && ret == 0) { ret = nfscl_removedeleg(vp, p, &dstateid); if (ret == 1) { NFSCL_REQSTART(nd, NFSPROC_RETDELEGREMOVE, vp, cred); @@ -2853,9 +2905,19 @@ tryagain: } if (ret == 0) NFSCL_REQSTART(nd, NFSPROC_REMOVE, dvp, cred); - (void) nfsm_strtom(nd, name, namelen); + (void)nfsm_strtom(nd, name, namelen); + if (ret == 0 && (nd->nd_flag & ND_NFSV4) != 0) { + NFSM_BUILD(tl, uint32_t *, NFSX_UNSIGNED); + *tl = txdr_unsigned(NFSV4OP_PUTFH); + np = VTONFS(vp); + (void)nfsm_fhtom(nmp, nd, np->n_fhp->nfh_fh, np->n_fhp->nfh_len, 0); + NFSM_BUILD(tl, uint32_t *, NFSX_UNSIGNED); + NFSGETATTR_ATTRBIT(&attrbits); + *tl = txdr_unsigned(NFSV4OP_GETATTR); + (void)nfsrv_putattrbit(nd, &attrbits); + } error = nfscl_request(nd, dvp, p, cred); - if (error) + if (error != 0) return (error); if (nd->nd_flag & (ND_NFSV3 | ND_NFSV4)) { /* For NFSv4, parse out any Delereturn replies. */ @@ -2878,7 +2940,41 @@ tryagain: } error = nfscl_wcc_data(nd, dvp, dnap, dattrflagp, NULL, NULL); } - if (nd->nd_repstat && !error) + if (ret == 0 && (nd->nd_flag & (ND_NFSV4 | + ND_NOMOREDATA)) == ND_NFSV4) { + /* Parse out the Remove reply for NFSPROC_REMOVE. */ + NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED + 2 * NFSX_HYPER); + /* No use for change info for now. */ + /* The Remove succeeded. */ + nd->nd_repstat = 0; + } + if (ret == 0 && (nd->nd_flag & (ND_NFSV4 | + ND_NOMOREDATA)) == ND_NFSV4) { + /* Parse out the PutFH, Getattr for NFSPROC_REMOVE. 
*/ + NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED); + if (*(tl + 1) != 0) { + i = fxdr_unsigned(int, *(tl + 1)); + if (i == NFSERR_STALE) + *file_status = DELETED; + } else { + NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED); + if (*(tl + 1) != 0) { + i = fxdr_unsigned(int, *(tl + 1)); + if (i == NFSERR_STALE) + *file_status = DELETED; + } else { + error = nfsm_loadattr(nd, nap); + if (error == 0) { + *attrflagp = 1; + if (nap->na_nlink == 0) + *file_status = NLINK_ZERO; + else + *file_status = VALID; + } + } + } + } + if (nd->nd_repstat != 0 && error == 0) error = nd->nd_repstat; nfsmout: m_freem(nd->nd_mrep); @@ -2889,12 +2985,14 @@ nfsmout: * Do an nfs rename rpc. */ int -nfsrpc_rename(vnode_t fdvp, vnode_t fvp, char *fnameptr, int fnamelen, - vnode_t tdvp, vnode_t tvp, char *tnameptr, int tnamelen, struct ucred *cred, - NFSPROC_T *p, struct nfsvattr *fnap, struct nfsvattr *tnap, - int *fattrflagp, int *tattrflagp) +nfsrpc_rename(struct vnode *fdvp, struct vnode *fvp, char *fnameptr, + int fnamelen, struct vnode *tdvp, struct vnode *tvp, char *tnameptr, + int tnamelen, nfsremove_status *tvp_status, struct nfsvattr *fnap, + struct nfsvattr *tnap, int *fattrflagp, int *tattrflagp, + struct nfsvattr *tvpnap, int *tvpattrflagp, struct ucred *cred, + NFSPROC_T *p) { - u_int32_t *tl; + uint32_t *tl; struct nfsrv_descript nfsd, *nd = &nfsd; struct nfsmount *nmp; struct nfsnode *np; @@ -2904,11 +3002,14 @@ nfsrpc_rename(vnode_t fdvp, vnode_t fvp, char *fnameptr, int fnamelen, *fattrflagp = 0; *tattrflagp = 0; + *tvpattrflagp = 0; + *tvp_status = UNKNOWN; nmp = VFSTONFS(fdvp->v_mount); if (fnamelen > NFS_MAXNAMLEN || tnamelen > NFS_MAXNAMLEN) return (ENAMETOOLONG); tryagain: - if (NFSHASNFSV4(nmp) && ret == 0) { + if (NFSHASNFSV4(nmp) && ((nmp->nm_flag & NFSMNT_NOCTO) == 0 || + !NFSHASNFSV4N(nmp)) && ret == 0) { ret = nfscl_renamedeleg(fvp, &fdstateid, &gotfd, tvp, &tdstateid, &gottd, p); if (gotfd && gottd) { @@ -2961,29 +3062,44 @@ tryagain: } if (ret == 0) 
NFSCL_REQSTART(nd, NFSPROC_RENAME, fdvp, cred); - if (nd->nd_flag & ND_NFSV4) { + if ((nd->nd_flag & ND_NFSV4) != 0) { NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OP_GETATTR); NFSWCCATTR_ATTRBIT(&attrbits); - (void) nfsrv_putattrbit(nd, &attrbits); + (void)nfsrv_putattrbit(nd, &attrbits); NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OP_PUTFH); (void)nfsm_fhtom(nmp, nd, VTONFS(tdvp)->n_fhp->nfh_fh, VTONFS(tdvp)->n_fhp->nfh_len, 0); NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OP_GETATTR); - (void) nfsrv_putattrbit(nd, &attrbits); + (void)nfsrv_putattrbit(nd, &attrbits); nd->nd_flag |= ND_V4WCCATTR; NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OP_RENAME); } - (void) nfsm_strtom(nd, fnameptr, fnamelen); - if (!(nd->nd_flag & ND_NFSV4)) + (void)nfsm_strtom(nd, fnameptr, fnamelen); + if ((nd->nd_flag & ND_NFSV4) == 0) (void)nfsm_fhtom(nmp, nd, VTONFS(tdvp)->n_fhp->nfh_fh, VTONFS(tdvp)->n_fhp->nfh_len, 0); - (void) nfsm_strtom(nd, tnameptr, tnamelen); + (void)nfsm_strtom(nd, tnameptr, tnamelen); + if (ret == 0 && (nd->nd_flag & ND_NFSV4) != 0) { + NFSM_BUILD(tl, uint32_t *, NFSX_UNSIGNED); + /* When tvp == NULL, it doesn't matter which dvp is used. */ + *tl = txdr_unsigned(NFSV4OP_PUTFH); + if (tvp != NULL) + (void)nfsm_fhtom(nmp, nd, VTONFS(tvp)->n_fhp->nfh_fh, + VTONFS(tvp)->n_fhp->nfh_len, 0); + else + (void)nfsm_fhtom(nmp, nd, VTONFS(tdvp)->n_fhp->nfh_fh, + VTONFS(tdvp)->n_fhp->nfh_len, 0); + NFSM_BUILD(tl, uint32_t *, NFSX_UNSIGNED); + *tl = txdr_unsigned(NFSV4OP_GETATTR); + NFSGETATTR_ATTRBIT(&attrbits); + (void)nfsrv_putattrbit(nd, &attrbits); + } error = nfscl_request(nd, fdvp, p, cred); - if (error) + if (error != 0) return (error); if (nd->nd_flag & (ND_NFSV3 | ND_NFSV4)) { /* For NFSv4, parse out any Delereturn replies. 
*/ @@ -2999,7 +3115,7 @@ tryagain: for (i = 0; i < (ret * 2); i++) { if ((nd->nd_flag & (ND_NFSV4 | ND_NOMOREDATA)) == ND_NFSV4) { - NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED); + NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED); if (*(tl + 1)) { if (i == 1 && ret > 1) { /* @@ -3019,23 +3135,57 @@ tryagain: } /* Now, the first wcc attribute reply. */ if ((nd->nd_flag & (ND_NFSV4 | ND_NOMOREDATA)) == ND_NFSV4) { - NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED); + NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED); if (*(tl + 1)) nd->nd_flag |= ND_NOMOREDATA; } error = nfscl_wcc_data(nd, fdvp, fnap, fattrflagp, NULL, NULL); /* and the second wcc attribute reply. */ if ((nd->nd_flag & (ND_NFSV4 | ND_NOMOREDATA)) == ND_NFSV4 && - !error) { - NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED); + error == 0) { + NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED); if (*(tl + 1)) nd->nd_flag |= ND_NOMOREDATA; } - if (!error) + if (error == 0) error = nfscl_wcc_data(nd, tdvp, tnap, tattrflagp, NULL, NULL); } - if (nd->nd_repstat && !error) + if ((nd->nd_flag & (ND_NFSV4 | ND_NOMOREDATA)) == ND_NFSV4 && + ret == 0 && error == 0) { + /* Parse out the rename successful reply. */ + NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED + + 4 * NFSX_HYPER); + nd->nd_repstat = 0; /* Rename succeeded. */ + /* Parse PutFH reply for tvp. 
*/ + NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED); + if (*(tl + 1) != 0) { + if (tvp != NULL) { + i = fxdr_unsigned(int, *(tl + 1)); + if (i == NFSERR_STALE) + *tvp_status = DELETED; + } + } else { + NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED); + if (*(tl + 1) != 0) { + if (tvp != NULL) { + i = fxdr_unsigned(int, *(tl + 1)); + if (i == NFSERR_STALE) + *tvp_status = DELETED; + } + } else { + error = nfsm_loadattr(nd, tvpnap); + if (error == 0 && tvp != NULL) { + *tvpattrflagp = 1; + if (tvpnap->na_nlink == 0) + *tvp_status = NLINK_ZERO; + else + *tvp_status = VALID; + } + } + } + } + if (nd->nd_repstat != 0 && error == 0) error = nd->nd_repstat; nfsmout: m_freem(nd->nd_mrep); @@ -3068,14 +3218,19 @@ nfsrpc_link(vnode_t dvp, vnode_t vp, char *name, int namelen, VTONFS(dvp)->n_fhp->nfh_len, 0); if (nd->nd_flag & ND_NFSV4) { NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); - *tl = txdr_unsigned(NFSV4OP_GETATTR); - NFSWCCATTR_ATTRBIT(&attrbits); - (void) nfsrv_putattrbit(nd, &attrbits); - nd->nd_flag |= ND_V4WCCATTR; - NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OP_LINK); } (void) nfsm_strtom(nd, name, namelen); + if (nd->nd_flag & ND_NFSV4) { + NFSM_BUILD(tl, uint32_t *, NFSX_UNSIGNED); + *tl = txdr_unsigned(NFSV4OP_GETATTR); + NFSGETATTR_ATTRBIT(&attrbits); + (void)nfsrv_putattrbit(nd, &attrbits); + NFSM_BUILD(tl, uint32_t *, 2 * NFSX_UNSIGNED); + *tl++ = txdr_unsigned(NFSV4OP_RESTOREFH); + *tl = txdr_unsigned(NFSV4OP_GETATTR); + (void)nfsrv_putattrbit(nd, &attrbits); + } error = nfscl_request(nd, vp, p, cred); if (error) return (error); @@ -3084,19 +3239,28 @@ nfsrpc_link(vnode_t dvp, vnode_t vp, char *name, int namelen, if (!error) error = nfscl_wcc_data(nd, dvp, dnap, dattrflagp, NULL, NULL); - } else if ((nd->nd_flag & (ND_NFSV4 | ND_NOMOREDATA)) == ND_NFSV4) { + } else if (nd->nd_repstat == 0 && (nd->nd_flag & ND_NFSV4) != 0) { /* - * First, parse out the PutFH and Getattr result. + * First and parse out the PutFH and Link results. 
*/ - NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED); - if (!(*(tl + 1))) - NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED); - if (*(tl + 1)) + NFSM_DISSECT(tl, uint32_t *, 5 * NFSX_UNSIGNED + + 2 * NFSX_HYPER); + if (*(tl + 3)) nd->nd_flag |= ND_NOMOREDATA; /* - * Get the pre-op attributes. + * Get the directory post-op attributes. */ - error = nfscl_wcc_data(nd, dvp, dnap, dattrflagp, NULL, NULL); + if ((nd->nd_flag & ND_NOMOREDATA) == 0) + error = nfscl_postop_attr(nd, dnap, dattrflagp); + if (error == 0 && (nd->nd_flag & ND_NOMOREDATA) == 0) { + /* Get rid of the RestoreFH reply. */ + NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED); + if (*(tl + 1)) + nd->nd_flag |= ND_NOMOREDATA; + } + /* Get the file's post-op attributes. */ + if (error == 0 && (nd->nd_flag & ND_NOMOREDATA) == 0) + error = nfscl_postop_attr(nd, nap, attrflagp); } if (nd->nd_repstat && !error) error = nd->nd_repstat; @@ -3195,7 +3359,7 @@ nfsrpc_mkdir(vnode_t dvp, char *name, int namelen, struct vattr *vap, *tl = txdr_unsigned(NFDIR); } (void) nfsm_strtom(nd, name, namelen); - nfscl_fillsattr(nd, vap, dvp, NFSSATTR_SIZENEG1, 0); + nfscl_fillsattr(nd, vap, dvp, NFSSATTR_SIZENEG1 | NFSSATTR_NEWFILE, 0); if (nd->nd_flag & ND_NFSV4) { NFSGETATTR_ATTRBIT(&attrbits); NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED); @@ -3280,6 +3444,31 @@ nfsrpc_rmdir(vnode_t dvp, char *name, int namelen, struct ucred *cred, } /* + * Check to make sure the file name in a Readdir reply is valid. + */ +static bool +nfscl_invalidfname(bool is_v4, char *name, int len) +{ + int i; + char *cp; + + if (is_v4 && ((len == 1 && name[0] == '.') || + (len == 2 && name[0] == '.' && name[1] == '.'))) { + printf("Readdir NFSv4 reply has dot or dotdot in it\n"); + return (true); + } + cp = name; + for (i = 0; i < len; i++, cp++) { + if (*cp == '/' || *cp == '\0') { + printf("Readdir reply file name had imbedded / or nul" + " byte\n"); + return (true); + } + } + return (false); +} + +/* * Readdir rpc. 
* Always returns with either uio_resid unchanged, if you are at the * end of the directory, or uio_resid == 0, with all DIRBLKSIZ chunks @@ -3327,10 +3516,13 @@ nfsrpc_readdir(vnode_t vp, struct uio *uiop, nfsuint64 *cookiep, nfsattrbit_t attrbits, dattrbits; u_int32_t rderr, *tl2 = NULL; size_t tresid; + bool validentry; KASSERT(uiop->uio_iovcnt == 1 && (uiop->uio_resid & (DIRBLKSIZ - 1)) == 0, ("nfs readdirrpc bad uio")); + KASSERT(uiop->uio_segflg == UIO_SYSSPACE, + ("nfsrpc_readdir: uio userspace")); ncookie.lval[0] = ncookie.lval[1] = 0; /* * There is no point in reading a lot more than uio_resid, however @@ -3405,7 +3597,7 @@ nfsrpc_readdir(vnode_t vp, struct uio *uiop, nfsuint64 *cookiep, nfsva.na_mntonfileno = UINT64_MAX; error = nfsv4_loadattr(nd, NULL, &nfsva, NULL, NULL, 0, NULL, NULL, NULL, NULL, NULL, 0, - NULL, NULL, NULL, p, cred); + NULL, NULL, NULL, NULL, p, cred); if (error) { dotdotfileid = dotfileid; } else if (gotmnton) { @@ -3550,6 +3742,7 @@ nfsrpc_readdir(vnode_t vp, struct uio *uiop, nfsuint64 *cookiep, /* loop through the dir entries, doctoring them to 4bsd form */ while (more_dirs && bigenough) { + validentry = true; if (nd->nd_flag & ND_NFSV4) { NFSM_DISSECT(tl, u_int32_t *, 3*NFSX_UNSIGNED); ncookie.lval[0] = *tl++; @@ -3588,6 +3781,17 @@ nfsrpc_readdir(vnode_t vp, struct uio *uiop, nfsuint64 *cookiep, uiop->uio_resid) bigenough = 0; if (bigenough) { + struct iovec saviov; + off_t savoff; + ssize_t savresid; + int savblksiz; + + saviov.iov_base = uiop->uio_iov->iov_base; + saviov.iov_len = uiop->uio_iov->iov_len; + savoff = uiop->uio_offset; + savresid = uiop->uio_resid; + savblksiz = blksiz; + dp = (struct dirent *)uiop->uio_iov->iov_base; dp->d_pad0 = dp->d_pad1 = 0; dp->d_off = 0; @@ -3603,20 +3807,36 @@ nfsrpc_readdir(vnode_t vp, struct uio *uiop, nfsuint64 *cookiep, uiop->uio_iov->iov_base = (char *)uiop->uio_iov->iov_base + DIRHDSIZ; uiop->uio_iov->iov_len -= DIRHDSIZ; + cp = uiop->uio_iov->iov_base; error = nfsm_mbufuio(nd, uiop, 
len); if (error) goto nfsmout; - cp = uiop->uio_iov->iov_base; - tlen -= len; - NFSBZERO(cp, tlen); - cp += tlen; /* points to cookie storage */ - tl2 = (u_int32_t *)cp; - uiop->uio_iov->iov_base = - (char *)uiop->uio_iov->iov_base + tlen + - NFSX_HYPER; - uiop->uio_iov->iov_len -= tlen + NFSX_HYPER; - uiop->uio_resid -= tlen + NFSX_HYPER; - uiop->uio_offset += (tlen + NFSX_HYPER); + /* Check for an invalid file name. */ + if (nfscl_invalidfname( + (nd->nd_flag & ND_NFSV4) != 0, cp, len)) { + /* Skip over this entry. */ + uiop->uio_iov->iov_base = + saviov.iov_base; + uiop->uio_iov->iov_len = + saviov.iov_len; + uiop->uio_offset = savoff; + uiop->uio_resid = savresid; + blksiz = savblksiz; + validentry = false; + } else { + cp = uiop->uio_iov->iov_base; + tlen -= len; + NFSBZERO(cp, tlen); + cp += tlen; /* points to cookie store */ + tl2 = (u_int32_t *)cp; + uiop->uio_iov->iov_base = + (char *)uiop->uio_iov->iov_base + + tlen + NFSX_HYPER; + uiop->uio_iov->iov_len -= tlen + + NFSX_HYPER; + uiop->uio_resid -= tlen + NFSX_HYPER; + uiop->uio_offset += (tlen + NFSX_HYPER); + } } else { error = nfsm_advance(nd, NFSM_RNDUP(len), -1); if (error) @@ -3627,7 +3847,7 @@ nfsrpc_readdir(vnode_t vp, struct uio *uiop, nfsuint64 *cookiep, nfsva.na_mntonfileno = UINT64_MAX; error = nfsv4_loadattr(nd, NULL, &nfsva, NULL, NULL, 0, NULL, NULL, NULL, NULL, NULL, 0, - NULL, NULL, &rderr, p, cred); + NULL, NULL, &rderr, NULL, p, cred); if (error) goto nfsmout; NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); @@ -3640,7 +3860,7 @@ nfsrpc_readdir(vnode_t vp, struct uio *uiop, nfsuint64 *cookiep, ncookie.lval[0] = 0; ncookie.lval[1] = *tl++; } - if (bigenough) { + if (bigenough && validentry) { if (nd->nd_flag & ND_NFSV4) { if (rderr) { dp->d_fileno = 0; @@ -3777,11 +3997,16 @@ nfsrpc_readdirplus(vnode_t vp, struct uio *uiop, nfsuint64 *cookiep, size_t tresid; u_int32_t *tl2 = NULL, rderr; struct timespec dctime, ts; - bool attr_ok; + bool attr_ok, named_dir, validentry; 
KASSERT(uiop->uio_iovcnt == 1 && (uiop->uio_resid & (DIRBLKSIZ - 1)) == 0, ("nfs readdirplusrpc bad uio")); + KASSERT(uiop->uio_segflg == UIO_SYSSPACE, + ("nfsrpc_readdirplus: uio userspace")); + named_dir = false; + if ((vp->v_irflag & VIRF_NAMEDDIR) != 0) + named_dir = true; ncookie.lval[0] = ncookie.lval[1] = 0; timespecclear(&dctime); *attrflagp = 0; @@ -3847,7 +4072,7 @@ nfsrpc_readdirplus(vnode_t vp, struct uio *uiop, nfsuint64 *cookiep, nfsva.na_mntonfileno = UINT64_MAX; error = nfsv4_loadattr(nd, NULL, &nfsva, NULL, NULL, 0, NULL, NULL, NULL, NULL, NULL, 0, - NULL, NULL, NULL, p, cred); + NULL, NULL, NULL, NULL, p, cred); if (error) { dotdotfileid = dotfileid; } else if (gotmnton) { @@ -3933,6 +4158,13 @@ nfsrpc_readdirplus(vnode_t vp, struct uio *uiop, nfsuint64 *cookiep, if (!NFSISSET_ATTRBIT(&dnp->n_vattr.na_suppattr, NFSATTRBIT_TIMECREATE)) NFSCLRBIT_ATTRBIT(&attrbits, NFSATTRBIT_TIMECREATE); + if (!NFSISSET_ATTRBIT(&dnp->n_vattr.na_suppattr, + NFSATTRBIT_HIDDEN) || + !NFSISSET_ATTRBIT(&dnp->n_vattr.na_suppattr, + NFSATTRBIT_SYSTEM)) { + NFSCLRBIT_ATTRBIT(&attrbits, NFSATTRBIT_HIDDEN); + NFSCLRBIT_ATTRBIT(&attrbits, NFSATTRBIT_SYSTEM); + } } /* @@ -3986,6 +4218,7 @@ nfsrpc_readdirplus(vnode_t vp, struct uio *uiop, nfsuint64 *cookiep, /* loop through the dir entries, doctoring them to 4bsd form */ while (more_dirs && bigenough) { + validentry = true; NFSM_DISSECT(tl, u_int32_t *, 3 * NFSX_UNSIGNED); if (nd->nd_flag & ND_NFSV4) { ncookie.lval[0] = *tl++; @@ -4017,6 +4250,17 @@ nfsrpc_readdirplus(vnode_t vp, struct uio *uiop, nfsuint64 *cookiep, uiop->uio_resid) bigenough = 0; if (bigenough) { + struct iovec saviov; + off_t savoff; + ssize_t savresid; + int savblksiz; + + saviov.iov_base = uiop->uio_iov->iov_base; + saviov.iov_len = uiop->uio_iov->iov_len; + savoff = uiop->uio_offset; + savresid = uiop->uio_resid; + savblksiz = blksiz; + dp = (struct dirent *)uiop->uio_iov->iov_base; dp->d_pad0 = dp->d_pad1 = 0; dp->d_off = 0; @@ -4035,25 +4279,42 @@ 
nfsrpc_readdirplus(vnode_t vp, struct uio *uiop, nfsuint64 *cookiep, cnp->cn_nameptr = uiop->uio_iov->iov_base; cnp->cn_namelen = len; NFSCNHASHZERO(cnp); + cp = uiop->uio_iov->iov_base; error = nfsm_mbufuio(nd, uiop, len); if (error) goto nfsmout; - cp = uiop->uio_iov->iov_base; - tlen -= len; - NFSBZERO(cp, tlen); - cp += tlen; /* points to cookie storage */ - tl2 = (u_int32_t *)cp; - if (len == 2 && cnp->cn_nameptr[0] == '.' && - cnp->cn_nameptr[1] == '.') - isdotdot = 1; - else - isdotdot = 0; - uiop->uio_iov->iov_base = - (char *)uiop->uio_iov->iov_base + tlen + - NFSX_HYPER; - uiop->uio_iov->iov_len -= tlen + NFSX_HYPER; - uiop->uio_resid -= tlen + NFSX_HYPER; - uiop->uio_offset += (tlen + NFSX_HYPER); + /* Check for an invalid file name. */ + if (nfscl_invalidfname( + (nd->nd_flag & ND_NFSV4) != 0, cp, len)) { + /* Skip over this entry. */ + uiop->uio_iov->iov_base = + saviov.iov_base; + uiop->uio_iov->iov_len = + saviov.iov_len; + uiop->uio_offset = savoff; + uiop->uio_resid = savresid; + blksiz = savblksiz; + validentry = false; + } else { + cp = uiop->uio_iov->iov_base; + tlen -= len; + NFSBZERO(cp, tlen); + cp += tlen; /* points to cookie store */ + tl2 = (u_int32_t *)cp; + if (len == 2 && + cnp->cn_nameptr[0] == '.' 
&& + cnp->cn_nameptr[1] == '.') + isdotdot = 1; + else + isdotdot = 0; + uiop->uio_iov->iov_base = + (char *)uiop->uio_iov->iov_base + + tlen + NFSX_HYPER; + uiop->uio_iov->iov_len -= tlen + + NFSX_HYPER; + uiop->uio_resid -= tlen + NFSX_HYPER; + uiop->uio_offset += (tlen + NFSX_HYPER); + } } else { error = nfsm_advance(nd, NFSM_RNDUP(len), -1); if (error) @@ -4085,12 +4346,12 @@ nfsrpc_readdirplus(vnode_t vp, struct uio *uiop, nfsuint64 *cookiep, nfsva.na_mntonfileno = 0xffffffff; error = nfsv4_loadattr(nd, NULL, &nfsva, &nfhp, NULL, 0, NULL, NULL, NULL, NULL, NULL, 0, - NULL, NULL, &rderr, p, cred); + NULL, NULL, &rderr, NULL, p, cred); if (error) goto nfsmout; } - if (bigenough) { + if (bigenough && validentry) { if (nd->nd_flag & ND_NFSV4) { if (rderr) { dp->d_fileno = 0; @@ -4190,7 +4451,8 @@ nfsrpc_readdirplus(vnode_t vp, struct uio *uiop, nfsuint64 *cookiep, if (cnp->cn_namelen <= NCHNAMLEN && ndp->ni_dvp != ndp->ni_vp && (newvp->v_type != VDIR || - dctime.tv_sec != 0)) { + dctime.tv_sec != 0) && + !named_dir) { cache_enter_time_flags(ndp->ni_dvp, ndp->ni_vp, cnp, &nfsva.na_ctime, @@ -4747,7 +5009,7 @@ nfsrpc_statfs(vnode_t vp, struct nfsstatfs *sbp, struct nfsfsinfo *fsp, if (nd->nd_repstat == 0) { error = nfsv4_loadattr(nd, NULL, nap, NULL, NULL, 0, NULL, NULL, sbp, fsp, NULL, 0, NULL, leasep, NULL, - p, cred); + NULL, p, cred); if (!error) { nmp->nm_fsid[0] = nap->na_filesid[0]; nmp->nm_fsid[1] = nap->na_filesid[1]; @@ -4800,7 +5062,7 @@ nfsmout: * nfs pathconf rpc */ int -nfsrpc_pathconf(vnode_t vp, struct nfsv3_pathconf *pc, +nfsrpc_pathconf(vnode_t vp, struct nfsv3_pathconf *pc, bool *has_namedattrp, struct ucred *cred, NFSPROC_T *p, struct nfsvattr *nap, int *attrflagp) { struct nfsrv_descript nfsd, *nd = &nfsd; @@ -4810,6 +5072,7 @@ nfsrpc_pathconf(vnode_t vp, struct nfsv3_pathconf *pc, int error; struct nfsnode *np; + *has_namedattrp = false; *attrflagp = 0; nmp = VFSTONFS(vp->v_mount); if (NFSHASNFSV4(nmp)) { @@ -4836,8 +5099,8 @@ 
nfsrpc_pathconf(vnode_t vp, struct nfsv3_pathconf *pc, return (error); if (nd->nd_repstat == 0) { error = nfsv4_loadattr(nd, NULL, nap, NULL, NULL, 0, - pc, NULL, NULL, NULL, NULL, 0, NULL, NULL, NULL, p, - cred); + pc, NULL, NULL, NULL, NULL, 0, NULL, NULL, NULL, + has_namedattrp, p, cred); if (!error) *attrflagp = 1; } else { @@ -5132,7 +5395,7 @@ nfsrpc_getacl(vnode_t vp, struct ucred *cred, NFSPROC_T *p, struct acl *aclp) return (error); if (!nd->nd_repstat) error = nfsv4_loadattr(nd, vp, NULL, NULL, NULL, 0, NULL, - NULL, NULL, NULL, aclp, 0, NULL, NULL, NULL, p, cred); + NULL, NULL, NULL, aclp, 0, NULL, NULL, NULL, NULL, p, cred); else error = nd->nd_repstat; m_freem(nd->nd_mrep); @@ -5173,7 +5436,8 @@ nfsrpc_setaclrpc(vnode_t vp, struct ucred *cred, NFSPROC_T *p, NFSZERO_ATTRBIT(&attrbits); NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_ACL); (void) nfsv4_fillattr(nd, vp->v_mount, vp, aclp, NULL, NULL, 0, - &attrbits, NULL, NULL, 0, 0, 0, 0, (uint64_t)0, NULL); + &attrbits, NULL, NULL, 0, 0, 0, 0, (uint64_t)0, NULL, false, false, + false); error = nfscl_request(nd, vp, p, cred); if (error) return (error); @@ -8109,7 +8373,8 @@ nfsrpc_openlayoutrpc(struct nfsmount *nmp, vnode_t vp, u_int8_t *nfhp, 0, 0, cred); NFSM_BUILD(tl, uint32_t *, 5 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(op->nfso_own->nfsow_seqid); - *tl++ = txdr_unsigned(mode & NFSV4OPEN_ACCESSBOTH); + *tl++ = txdr_unsigned(mode & (NFSV4OPEN_ACCESSBOTH | + NFSV4OPEN_WANTDELEGMASK)); *tl++ = txdr_unsigned((mode >> NFSLCK_SHIFT) & NFSV4OPEN_DENYBOTH); tsep = nfsmnt_mdssession(nmp); *tl++ = tsep->nfsess_clientid.lval[0]; @@ -8210,6 +8475,13 @@ nfsrpc_openlayoutrpc(struct nfsmount *nmp, vnode_t vp, u_int8_t *nfhp, &ret, &acesize, p); if (error != 0) goto nfsmout; + } else if (deleg == NFSV4OPEN_DELEGATENONEEXT && + NFSHASNFSV4N(nmp)) { + NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED); + deleg = fxdr_unsigned(uint32_t, *tl); + if (deleg == NFSV4OPEN_CONTENTION || + deleg == NFSV4OPEN_RESOURCE) + NFSM_DISSECT(tl, 
uint32_t *, NFSX_UNSIGNED); } else if (deleg != NFSV4OPEN_DELEGATENONE) { error = NFSERR_BADXDR; goto nfsmout; @@ -8224,7 +8496,7 @@ nfsrpc_openlayoutrpc(struct nfsmount *nmp, vnode_t vp, u_int8_t *nfhp, if (*++tl == 0) { error = nfsv4_loadattr(nd, NULL, &nfsva, NULL, NULL, 0, NULL, NULL, NULL, NULL, NULL, 0, - NULL, NULL, NULL, p, cred); + NULL, NULL, NULL, NULL, p, cred); if (error != 0) goto nfsmout; if (ndp != NULL) { @@ -8301,8 +8573,17 @@ nfsrpc_createlayout(vnode_t dvp, char *name, int namelen, struct vattr *vap, */ NFSM_BUILD(tl, u_int32_t *, 5 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(owp->nfsow_seqid); - *tl++ = txdr_unsigned(NFSV4OPEN_ACCESSWRITE | - NFSV4OPEN_ACCESSREAD); + if (NFSHASNFSV4N(nmp)) { + if (!NFSHASPNFS(nmp) && nfscl_enablecallb != 0 && + nfs_numnfscbd > 0) + *tl++ = txdr_unsigned(NFSV4OPEN_ACCESSWRITE | + NFSV4OPEN_ACCESSREAD | NFSV4OPEN_WANTWRITEDELEG); + else + *tl++ = txdr_unsigned(NFSV4OPEN_ACCESSWRITE | + NFSV4OPEN_ACCESSREAD | NFSV4OPEN_WANTNODELEG); + } else + *tl++ = txdr_unsigned(NFSV4OPEN_ACCESSWRITE | + NFSV4OPEN_ACCESSREAD); *tl++ = txdr_unsigned(NFSV4OPEN_DENYNONE); tsep = nfsmnt_mdssession(nmp); *tl++ = tsep->nfsess_clientid.lval[0]; @@ -8314,18 +8595,18 @@ nfsrpc_createlayout(vnode_t dvp, char *name, int namelen, struct vattr *vap, if (NFSHASSESSPERSIST(nmp)) { /* Use GUARDED for persistent sessions. */ *tl = txdr_unsigned(NFSCREATE_GUARDED); - nfscl_fillsattr(nd, vap, dvp, 0, 0); + nfscl_fillsattr(nd, vap, dvp, NFSSATTR_NEWFILE, 0); } else { /* Otherwise, use EXCLUSIVE4_1. 
*/ *tl = txdr_unsigned(NFSCREATE_EXCLUSIVE41); NFSM_BUILD(tl, u_int32_t *, NFSX_VERF); *tl++ = cverf.lval[0]; *tl = cverf.lval[1]; - nfscl_fillsattr(nd, vap, dvp, 0, 0); + nfscl_fillsattr(nd, vap, dvp, NFSSATTR_NEWFILE, 0); } } else { *tl = txdr_unsigned(NFSCREATE_UNCHECKED); - nfscl_fillsattr(nd, vap, dvp, 0, 0); + nfscl_fillsattr(nd, vap, dvp, NFSSATTR_NEWFILE, 0); } NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OPEN_CLAIMNULL); @@ -8421,6 +8702,13 @@ nfsrpc_createlayout(vnode_t dvp, char *name, int namelen, struct vattr *vap, &ret, &acesize, p); if (error != 0) goto nfsmout; + } else if (deleg == NFSV4OPEN_DELEGATENONEEXT && + NFSHASNFSV4N(nmp)) { + NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED); + deleg = fxdr_unsigned(uint32_t, *tl); + if (deleg == NFSV4OPEN_CONTENTION || + deleg == NFSV4OPEN_RESOURCE) + NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED); } else if (deleg != NFSV4OPEN_DELEGATENONE) { error = NFSERR_BADXDR; goto nfsmout; @@ -9258,7 +9546,7 @@ nfsm_split(struct mbuf *mp, uint64_t xfer) if (pgno == m->m_epg_npgs) panic("nfsm_split: eroneous ext_pgs mbuf"); - m2 = mb_alloc_ext_pgs(M_WAITOK, mb_free_mext_pgs); + m2 = mb_alloc_ext_pgs(M_WAITOK, mb_free_mext_pgs, 0); m2->m_epg_flags |= EPG_FLAG_ANON; /* @@ -9381,6 +9669,50 @@ nfsmout: } /* + * nfs opeattr rpc + */ +int +nfsrpc_openattr(struct nfsmount *nmp, struct vnode *vp, uint8_t *fhp, int fhlen, + bool createit, struct ucred *cred, NFSPROC_T *p, struct nfsvattr *nap, + struct nfsfh **nfhpp, int *attrflagp) +{ + uint32_t *tl; + struct nfsrv_descript nfsd, *nd = &nfsd; + nfsattrbit_t attrbits; + int error = 0; + + *attrflagp = 0; + nfscl_reqstart(nd, NFSPROC_OPENATTR, nmp, fhp, fhlen, NULL, NULL, 0, 0, + cred); + NFSM_BUILD(tl, uint32_t *, NFSX_UNSIGNED); + if (createit) + *tl = newnfs_true; + else + *tl = newnfs_false; + NFSGETATTR_ATTRBIT(&attrbits); + NFSM_BUILD(tl, uint32_t *, 2 * NFSX_UNSIGNED); + *tl++ = txdr_unsigned(NFSV4OP_GETFH); + *tl = txdr_unsigned(NFSV4OP_GETATTR); + 
(void)nfsrv_putattrbit(nd, &attrbits); + error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, vp, p, cred, + NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL); + if (error != 0) + return (error); + if (nd->nd_repstat == 0) { + NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED); + error = nfsm_getfh(nd, nfhpp); + if (error != 0) + goto nfsmout; + error = nfscl_postop_attr(nd, nap, attrflagp); + } +nfsmout: + m_freem(nd->nd_mrep); + if (error == 0 && nd->nd_repstat != 0) + error = nd->nd_repstat; + return (error); +} + +/* * Do roughly what nfs_statfs() does for NFSv4, but when called with a shared * locked vnode. */ diff --git a/sys/fs/nfsclient/nfs_clstate.c b/sys/fs/nfsclient/nfs_clstate.c index 9fbaa6e63a56..99a781640c53 100644 --- a/sys/fs/nfsclient/nfs_clstate.c +++ b/sys/fs/nfsclient/nfs_clstate.c @@ -93,11 +93,7 @@ NFSREQSPINLOCK; NFSCLSTATEMUTEX; int nfscl_inited = 0; struct nfsclhead nfsclhead; /* Head of clientid list */ -int nfscl_deleghighwater = NFSCLDELEGHIGHWATER; -int nfscl_layouthighwater = NFSCLLAYOUTHIGHWATER; -static int nfscl_delegcnt = 0; -static int nfscl_layoutcnt = 0; static int nfscl_getopen(struct nfsclownerhead *, struct nfsclopenhash *, u_int8_t *, int, u_int8_t *, u_int8_t *, u_int32_t, struct nfscllockowner **, struct nfsclopen **); @@ -433,25 +429,13 @@ nfscl_newopen(struct nfsclclient *clp, struct nfscldeleg *dp, */ int nfscl_deleg(mount_t mp, struct nfsclclient *clp, u_int8_t *nfhp, - int fhlen, struct ucred *cred, NFSPROC_T *p, struct nfscldeleg **dpp) + int fhlen, struct ucred *cred, NFSPROC_T *p, struct nfscldeleg *dp) { - struct nfscldeleg *dp = *dpp, *tdp; + struct nfscldeleg *tdp; struct nfsmount *nmp; KASSERT(mp != NULL, ("nfscl_deleg: mp NULL")); nmp = VFSTONFS(mp); - /* - * First, if we have received a Read delegation for a file on a - * read/write file system, just return it, because they aren't - * useful, imho. 
- */ - if (dp != NULL && !NFSMNT_RDONLY(mp) && - (dp->nfsdl_flags & NFSCLDL_READ)) { - nfscl_trydelegreturn(dp, cred, nmp, p); - free(dp, M_NFSCLDELEG); - *dpp = NULL; - return (0); - } /* * Since a delegation might be added to the mount, @@ -470,26 +454,40 @@ nfscl_deleg(mount_t mp, struct nfsclclient *clp, u_int8_t *nfhp, NFSUNLOCKCLSTATE(); return (NFSERR_BADSTATEID); } - *dpp = NULL; TAILQ_INSERT_HEAD(&clp->nfsc_deleg, dp, nfsdl_list); LIST_INSERT_HEAD(NFSCLDELEGHASH(clp, nfhp, fhlen), dp, nfsdl_hash); dp->nfsdl_timestamp = NFSD_MONOSEC + 120; nfsstatsv1.cldelegates++; - nfscl_delegcnt++; + clp->nfsc_delegcnt++; } else { /* - * Delegation already exists, what do we do if a new one?? + * A delegation already exists. If the new one is a Write + * delegation and the old one a Read delegation, return the + * Read delegation. Otherwise, return the new delegation. */ if (dp != NULL) { - printf("Deleg already exists!\n"); - free(dp, M_NFSCLDELEG); - *dpp = NULL; + if ((dp->nfsdl_flags & NFSCLDL_WRITE) != 0 && + (tdp->nfsdl_flags & NFSCLDL_READ) != 0) { + TAILQ_REMOVE(&clp->nfsc_deleg, tdp, nfsdl_list); + LIST_REMOVE(tdp, nfsdl_hash); + TAILQ_INSERT_HEAD(&clp->nfsc_deleg, dp, + nfsdl_list); + LIST_INSERT_HEAD(NFSCLDELEGHASH(clp, nfhp, + fhlen), dp, nfsdl_hash); + dp->nfsdl_timestamp = NFSD_MONOSEC + 120; + } else { + tdp = dp; /* Return this one. 
*/ + } } else { - *dpp = tdp; + tdp = NULL; } } NFSUNLOCKCLSTATE(); + if (tdp != NULL) { + nfscl_trydelegreturn(tdp, cred, nmp, p); + free(tdp, M_NFSCLDELEG); + } return (0); } @@ -918,6 +916,10 @@ nfscl_getcl(struct mount *mp, struct ucred *cred, NFSPROC_T *p, for (i = 0; i < NFSCLLAYOUTHASHSIZE; i++) LIST_INIT(&clp->nfsc_layouthash[i]); clp->nfsc_flags = NFSCLFLAGS_INITED; + clp->nfsc_delegcnt = 0; + clp->nfsc_deleghighwater = NFSCLDELEGHIGHWATER; + clp->nfsc_layoutcnt = 0; + clp->nfsc_layouthighwater = NFSCLLAYOUTHIGHWATER; clp->nfsc_clientidrev = 1; clp->nfsc_cbident = nfscl_nextcbident(); nfscl_fillclid(nmp->nm_clval, uuid, clp->nfsc_id, @@ -1632,7 +1634,7 @@ nfscl_expireopen(struct nfsclclient *clp, struct nfsclopen *op, } if (dp != NULL) nfscl_deleg(nmp->nm_mountp, clp, op->nfso_fh, - op->nfso_fhlen, cred, p, &dp); + op->nfso_fhlen, cred, p, dp); } /* @@ -1750,10 +1752,10 @@ nfscl_freedeleg(struct nfscldeleghead *hdp, struct nfscldeleg *dp, bool freeit) TAILQ_REMOVE(hdp, dp, nfsdl_list); LIST_REMOVE(dp, nfsdl_hash); + dp->nfsdl_clp->nfsc_delegcnt--; if (freeit) free(dp, M_NFSCLDELEG); nfsstatsv1.cldelegates--; - nfscl_delegcnt--; } /* @@ -2863,7 +2865,7 @@ tryagain: nfsdl_list); LIST_REMOVE(dp, nfsdl_hash); TAILQ_INSERT_HEAD(&dh, dp, nfsdl_list); - nfscl_delegcnt--; + clp->nfsc_delegcnt--; nfsstatsv1.cldelegates--; } NFSLOCKCLSTATE(); @@ -2893,7 +2895,8 @@ tryagain: * The tailq list is in LRU order. 
*/ dp = TAILQ_LAST(&clp->nfsc_deleg, nfscldeleghead); - while (nfscl_delegcnt > nfscl_deleghighwater && dp != NULL) { + while (clp->nfsc_delegcnt > clp->nfsc_deleghighwater && + dp != NULL) { ndp = TAILQ_PREV(dp, nfscldeleghead, nfsdl_list); if (dp->nfsdl_rwlock.nfslock_usecnt == 0 && dp->nfsdl_rwlock.nfslock_lock == 0 && @@ -2920,7 +2923,7 @@ tryagain: TAILQ_REMOVE(&clp->nfsc_deleg, dp, nfsdl_list); LIST_REMOVE(dp, nfsdl_hash); TAILQ_INSERT_HEAD(&dh, dp, nfsdl_list); - nfscl_delegcnt--; + clp->nfsc_delegcnt--; nfsstatsv1.cldelegates--; } } @@ -2976,13 +2979,14 @@ tryagain2: lyp = TAILQ_LAST(&clp->nfsc_layout, nfscllayouthead); while (lyp != NULL) { nlyp = TAILQ_PREV(lyp, nfscllayouthead, nfsly_list); - if (lyp->nfsly_timestamp < NFSD_MONOSEC && + if ((lyp->nfsly_timestamp < NFSD_MONOSEC || + clp->nfsc_layoutcnt > clp->nfsc_layouthighwater) && (lyp->nfsly_flags & (NFSLY_RECALL | NFSLY_RETONCLOSE)) == 0 && lyp->nfsly_lock.nfslock_usecnt == 0 && lyp->nfsly_lock.nfslock_lock == 0) { NFSCL_DEBUG(4, "ret stale lay=%d\n", - nfscl_layoutcnt); + clp->nfsc_layoutcnt); recallp = malloc(sizeof(*recallp), M_NFSLAYRECALL, M_NOWAIT); if (recallp == NULL) @@ -3504,7 +3508,7 @@ nfscl_delegreturnall(struct nfsclclient *clp, NFSPROC_T *p, * Return any delegation for this vp. 
*/ void -nfscl_delegreturnvp(vnode_t vp, NFSPROC_T *p) +nfscl_delegreturnvp(struct vnode *vp, bool retdeleg, NFSPROC_T *p) { struct nfsclclient *clp; struct nfscldeleg *dp; @@ -3527,12 +3531,15 @@ nfscl_delegreturnvp(vnode_t vp, NFSPROC_T *p) if (clp != NULL) dp = nfscl_finddeleg(clp, np->n_fhp->nfh_fh, np->n_fhp->nfh_len); - if (dp != NULL) { + if (dp != NULL && + (dp->nfsdl_flags & (NFSCLDL_RECALL | NFSCLDL_DELEGRET)) == 0) { nfscl_cleandeleg(dp); nfscl_freedeleg(&clp->nfsc_deleg, dp, false); NFSUNLOCKCLSTATE(); - newnfs_copycred(&dp->nfsdl_cred, cred); - nfscl_trydelegreturn(dp, cred, clp->nfsc_nmp, p); + if (retdeleg) { + newnfs_copycred(&dp->nfsdl_cred, cred); + nfscl_trydelegreturn(dp, cred, clp->nfsc_nmp, p); + } free(dp, M_NFSCLDELEG); } else NFSUNLOCKCLSTATE(); @@ -3694,7 +3701,7 @@ nfscl_docb(struct nfsrv_descript *nd, NFSPROC_T *p) if (!error) (void) nfsv4_fillattr(nd, NULL, NULL, NULL, &va, NULL, 0, &rattrbits, NULL, p, 0, 0, 0, 0, - (uint64_t)0, NULL); + (uint64_t)0, NULL, false, false, false); break; case NFSV4OP_CBRECALL: NFSCL_DEBUG(4, "cbrecall\n"); @@ -3712,18 +3719,10 @@ nfscl_docb(struct nfsrv_descript *nd, NFSPROC_T *p) clp = nfscl_getclnt(cbident); else clp = nfscl_getclntsess(sessionid); - if (clp != NULL) { - dp = nfscl_finddeleg(clp, nfhp->nfh_fh, - nfhp->nfh_len); - if (dp != NULL && (dp->nfsdl_flags & - NFSCLDL_DELEGRET) == 0) { - dp->nfsdl_flags |= - NFSCLDL_RECALL; - wakeup((caddr_t)clp); - } - } else { + if (clp != NULL) + nfscl_startdelegrecall(clp, nfhp); + else error = NFSERR_SERVERFAULT; - } NFSUNLOCKCLSTATE(); } if (nfhp != NULL) @@ -3933,6 +3932,77 @@ nfscl_docb(struct nfsrv_descript *nd, NFSPROC_T *p) *tl = txdr_unsigned(NFSV4_CBSLOTS - 1); } break; + case NFSV4OP_CBRECALLSLOT: + NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED); + highslot = fxdr_unsigned(uint32_t, *tl); + NFSLOCKCLSTATE(); + clp = nfscl_getclntsess(sessionid); + if (clp == NULL) + error = NFSERR_SERVERFAULT; + if (error == 0) { + tsep = 
nfsmnt_mdssession(clp->nfsc_nmp); + mtx_lock(&tsep->nfsess_mtx); + if ((highslot + 1) < tsep->nfsess_foreslots) { + tsep->nfsess_foreslots = (highslot + 1); + nfs_resetslots(tsep); + } + mtx_unlock(&tsep->nfsess_mtx); + } + NFSUNLOCKCLSTATE(); + break; + case NFSV4OP_CBRECALLANY: + NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED); + i = fxdr_unsigned(int, *tl++); + j = fxdr_unsigned(int, *tl); + if (i < 0 || j != 1) + error = NFSERR_BADXDR; + if (error == 0) { + NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED); + j = fxdr_unsigned(int, *tl); + if (i < 100) + i = 100; + else if (i > 100000) + i = 100000; + NFSLOCKCLSTATE(); + clp = nfscl_getclntsess(sessionid); + if (clp == NULL) + error = NFSERR_SERVERFAULT; + if (((j & NFSRCA4_RDATA_DLG) != 0 || + (j & NFSRCA4_WDATA_DLG) != 0) && + error == 0 && i < + clp->nfsc_deleghighwater) + clp->nfsc_deleghighwater = i; + if (error == 0 && + ((!NFSHASFLEXFILE(clp->nfsc_nmp) && + (j & NFSRCA4_FILE_LAYOUT) != 0 && + i < clp->nfsc_layouthighwater) || + (NFSHASFLEXFILE(clp->nfsc_nmp) && + (j & (NFSRCA4_FF_LAYOUT_READ | + NFSRCA4_FF_LAYOUT_RW)) != 0 && + i < clp->nfsc_layouthighwater))) + clp->nfsc_layouthighwater = i; + NFSUNLOCKCLSTATE(); + } + break; + case NFSV4OP_CBNOTIFY: + case NFSV4OP_CBRECALLOBJAVAIL: + case NFSV4OP_CBNOTIFYLOCK: + /* + * These callbacks are not necessarily optional, + * so I think it is better to reply NFS_OK than + * NFSERR_NOTSUPP. + * All provide information for which the FreeBSD client + * does not currently have a use. + * I am not sure if any of these could be generated + * by a NFSv4.1/4.2 server for this client? 
+ */ + error = 0; + NFSCL_DEBUG(1, "unsupp callback %d\n", op); + break; + case NFSV4OP_CBPUSHDELEG: + error = NFSERR_REJECTDELEG; + NFSCL_DEBUG(1, "unsupp callback %d\n", op); + break; default: if (i == 0 && minorvers != NFSV4_MINORVERSION) error = NFSERR_OPNOTINSESS; @@ -4647,7 +4717,7 @@ nfscl_mustflush(vnode_t vp) np = VTONFS(vp); nmp = VFSTONFS(vp->v_mount); - if (!NFSHASNFSV4(nmp)) + if (!NFSHASNFSV4(nmp) || vp->v_type != VREG) return (1); NFSLOCKMNT(nmp); if ((nmp->nm_privflag & NFSMNTP_DELEGISSUED) == 0) { @@ -4687,7 +4757,7 @@ nfscl_nodeleg(vnode_t vp, int writedeleg) np = VTONFS(vp); nmp = VFSTONFS(vp->v_mount); - if (!NFSHASNFSV4(nmp)) + if (!NFSHASNFSV4(nmp) || vp->v_type != VREG) return (1); NFSLOCKMNT(nmp); if ((nmp->nm_privflag & NFSMNTP_DELEGISSUED) == 0) { @@ -5099,7 +5169,7 @@ nfscl_newnode(vnode_t vp) * to the local clock time. */ void -nfscl_delegmodtime(vnode_t vp) +nfscl_delegmodtime(struct vnode *vp, struct timespec *mtime) { struct nfsclclient *clp; struct nfscldeleg *dp; @@ -5123,7 +5193,10 @@ nfscl_delegmodtime(vnode_t vp) } dp = nfscl_finddeleg(clp, np->n_fhp->nfh_fh, np->n_fhp->nfh_len); if (dp != NULL && (dp->nfsdl_flags & NFSCLDL_WRITE)) { - nanotime(&dp->nfsdl_modtime); + if (mtime != NULL) + dp->nfsdl_modtime = *mtime; + else + nanotime(&dp->nfsdl_modtime); dp->nfsdl_flags |= NFSCLDL_MODTIMESET; } NFSUNLOCKCLSTATE(); @@ -5266,7 +5339,7 @@ nfscl_layout(struct nfsmount *nmp, vnode_t vp, u_int8_t *fhp, int fhlen, LIST_INSERT_HEAD(NFSCLLAYOUTHASH(clp, fhp, fhlen), lyp, nfsly_hash); lyp->nfsly_timestamp = NFSD_MONOSEC + 120; - nfscl_layoutcnt++; + clp->nfsc_layoutcnt++; nfsstatsv1.cllayouts++; } else { if (retonclose != 0) @@ -5641,7 +5714,7 @@ nfscl_freelayout(struct nfscllayout *layp) LIST_REMOVE(rp, nfsrecly_list); free(rp, M_NFSLAYRECALL); } - nfscl_layoutcnt--; + layp->nfsly_clp->nfsc_layoutcnt--; nfsstatsv1.cllayouts--; free(layp, M_NFSLAYOUT); } @@ -5879,3 +5952,69 @@ tryagain: NFSUNLOCKCLSTATE(); return (0); } + +/* + * Check 
access against a delegation ace. + * Return EINVAL for any case where the check cannot be completed. + */ +int +nfscl_delegacecheck(struct vnode *vp, accmode_t accmode, struct ucred *cred) +{ + struct nfsclclient *clp; + struct nfscldeleg *dp; + struct nfsnode *np; + struct nfsmount *nmp; + struct acl *aclp; + int error; + + np = VTONFS(vp); + nmp = VFSTONFS(vp->v_mount); + if (!NFSHASNFSV4(nmp) || !NFSHASNFSV4N(nmp) || vp->v_type != VREG) + return (EINVAL); + NFSLOCKMNT(nmp); + if ((nmp->nm_privflag & NFSMNTP_DELEGISSUED) == 0) { + NFSUNLOCKMNT(nmp); + return (EINVAL); + } + NFSUNLOCKMNT(nmp); + aclp = acl_alloc(M_WAITOK); + NFSLOCKCLSTATE(); + clp = nfscl_findcl(nmp); + if (clp == NULL) { + NFSUNLOCKCLSTATE(); + acl_free(aclp); + return (EINVAL); + } + dp = nfscl_finddeleg(clp, np->n_fhp->nfh_fh, np->n_fhp->nfh_len); + if (dp != NULL && (dp->nfsdl_flags & (NFSCLDL_RECALL | + NFSCLDL_DELEGRET)) == 0) { + memcpy(&aclp->acl_entry[0], &dp->nfsdl_ace, + sizeof(struct acl_entry)); + NFSUNLOCKCLSTATE(); + aclp->acl_cnt = 1; + error = vaccess_acl_nfs4(vp->v_type, np->n_vattr.na_uid, + np->n_vattr.na_gid, aclp, accmode, cred); + acl_free(aclp); + if (error == 0 || error == EACCES) + return (error); + } else { + NFSUNLOCKCLSTATE(); + acl_free(aclp); + } + return (EINVAL); +} + +/* + * Start the recall of a delegation. Called for CB_RECALL and REMOVE + * when nlink == 0 after the REMOVE. 
+ */ +void nfscl_startdelegrecall(struct nfsclclient *clp, struct nfsfh *nfhp) +{ + struct nfscldeleg *dp; + + dp = nfscl_finddeleg(clp, nfhp->nfh_fh, nfhp->nfh_len); + if (dp != NULL && (dp->nfsdl_flags & NFSCLDL_DELEGRET) == 0) { + dp->nfsdl_flags |= NFSCLDL_RECALL; + wakeup((caddr_t)clp); + } +} diff --git a/sys/fs/nfsclient/nfs_clsubs.c b/sys/fs/nfsclient/nfs_clsubs.c index 80ab979d22d7..ae9fa51947cc 100644 --- a/sys/fs/nfsclient/nfs_clsubs.c +++ b/sys/fs/nfsclient/nfs_clsubs.c @@ -54,6 +54,7 @@ #include <sys/socket.h> #include <sys/stat.h> #include <sys/malloc.h> +#include <sys/stdarg.h> #include <sys/syscall.h> #include <sys/sysproto.h> #include <sys/taskqueue.h> @@ -71,12 +72,6 @@ #include <netinet/in.h> -/* - * Note that stdarg.h and the ANSI style va_start macro is used for both - * ANSI and traditional C compilers. - */ -#include <machine/stdarg.h> - extern struct mtx ncl_iod_mutex; extern enum nfsiod_state ncl_iodwant[NFS_MAXASYNCDAEMON]; extern struct nfsmount *ncl_iodmount[NFS_MAXASYNCDAEMON]; @@ -188,7 +183,7 @@ ncl_getattrcache(struct vnode *vp, struct vattr *vaper) np = VTONFS(vp); vap = &np->n_vattr.na_vattr; nmp = VFSTONFS(vp->v_mount); - mustflush = nfscl_mustflush(vp); /* must be before mtx_lock() */ + mustflush = nfscl_nodeleg(vp, 0); /* must be before mtx_lock() */ NFSLOCKNODE(np); /* XXX n_mtime doesn't seem to be updated on a miss-and-reload */ timeo = (time_second - np->n_mtime.tv_sec) / 10; @@ -221,8 +216,8 @@ ncl_getattrcache(struct vnode *vp, struct vattr *vaper) (time_second - np->n_attrstamp), timeo); #endif - if ((time_second - np->n_attrstamp) >= timeo && - (mustflush != 0 || np->n_attrstamp == 0)) { + if (mustflush != 0 && (np->n_attrstamp == 0 || + time_second - np->n_attrstamp >= timeo)) { nfsstatsv1.attrcache_misses++; NFSUNLOCKNODE(np); KDTRACE_NFS_ATTRCACHE_GET_MISS(vp); diff --git a/sys/fs/nfsclient/nfs_clvfsops.c b/sys/fs/nfsclient/nfs_clvfsops.c index c050eef7d4c3..0bd05c03885b 100644 --- a/sys/fs/nfsclient/nfs_clvfsops.c 
+++ b/sys/fs/nfsclient/nfs_clvfsops.c @@ -415,7 +415,7 @@ ncl_fsinfo(struct nfsmount *nmp, struct vnode *vp, struct ucred *cred, } /* - * Mount a remote root fs via. nfs. This depends on the info in the + * Mount a remote root fs via nfs. This depends on the info in the * nfs_diskless structure that has been filled in properly by some primary * bootstrap. * It goes something like this: @@ -1524,12 +1524,14 @@ mountnfs(struct nfs_args *argp, struct mount *mp, struct sockaddr *nam, #endif NFSCL_DEBUG(3, "in mnt\n"); + CURVNET_SET(CRED_TO_VNET(cred)); clp = NULL; if (mp->mnt_flag & MNT_UPDATE) { nmp = VFSTONFS(mp); printf("%s: MNT_UPDATE is no longer handled here\n", __func__); free(nam, M_SONAME); free(tlscertname, M_NEWNFSMNT); + CURVNET_RESTORE(); return (0); } else { /* NFS-over-TLS requires that rpctls be functioning. */ @@ -1544,6 +1546,7 @@ mountnfs(struct nfs_args *argp, struct mount *mp, struct sockaddr *nam, if (error != 0) { free(nam, M_SONAME); free(tlscertname, M_NEWNFSMNT); + CURVNET_RESTORE(); return (error); } } @@ -1798,12 +1801,18 @@ mountnfs(struct nfs_args *argp, struct mount *mp, struct sockaddr *nam, if (argp->flags & NFSMNT_NFSV3) ncl_fsinfo(nmp, *vpp, cred, td); - /* Mark if the mount point supports NFSv4 ACLs. */ - if ((argp->flags & NFSMNT_NFSV4) != 0 && nfsrv_useacl != 0 && - ret == 0 && - NFSISSET_ATTRBIT(&nfsva.na_suppattr, NFSATTRBIT_ACL)) { + /* + * Mark if the mount point supports NFSv4 ACLs and + * named attributes. 
+ */ + if ((argp->flags & NFSMNT_NFSV4) != 0) { MNT_ILOCK(mp); - mp->mnt_flag |= MNT_NFS4ACLS; + if (ret == 0 && nfsrv_useacl != 0 && + NFSISSET_ATTRBIT(&nfsva.na_suppattr, + NFSATTRBIT_ACL)) + mp->mnt_flag |= MNT_NFS4ACLS; + if (nmp->nm_minorvers > 0) + mp->mnt_flag |= MNT_NAMEDATTR; MNT_IUNLOCK(mp); } @@ -1816,6 +1825,7 @@ mountnfs(struct nfs_args *argp, struct mount *mp, struct sockaddr *nam, */ NFSVOPUNLOCK(*vpp); vfs_cache_root_set(mp, *vpp); + CURVNET_RESTORE(); return (0); } error = EIO; @@ -1844,6 +1854,7 @@ bad: free(nmp->nm_tlscertname, M_NEWNFSMNT); free(nmp, M_NEWNFSMNT); free(nam, M_SONAME); + CURVNET_RESTORE(); return (error); } diff --git a/sys/fs/nfsclient/nfs_clvnops.c b/sys/fs/nfsclient/nfs_clvnops.c index 76a3cdf9281e..fa451887e73e 100644 --- a/sys/fs/nfsclient/nfs_clvnops.c +++ b/sys/fs/nfsclient/nfs_clvnops.c @@ -106,6 +106,7 @@ uint32_t nfscl_accesscache_load_done_id; extern struct nfsstatsv1 nfsstatsv1; extern int nfsrv_useacl; extern int nfscl_debuglevel; +NFSCLSTATEMUTEX; MALLOC_DECLARE(M_NEWNFSREQ); static vop_read_t nfsfifo_read; @@ -113,6 +114,8 @@ static vop_write_t nfsfifo_write; static vop_close_t nfsfifo_close; static int nfs_setattrrpc(struct vnode *, struct vattr *, struct ucred *, struct thread *); +static int nfs_get_namedattrdir(struct vnode *, struct componentname *, + struct vnode **); static vop_lookup_t nfs_lookup; static vop_create_t nfs_create; static vop_mknod_t nfs_mknod; @@ -248,10 +251,13 @@ VFS_VOP_VECTOR_REGISTER(newnfs_fifoops); static int nfs_mknodrpc(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, struct vattr *vap); static int nfs_removerpc(struct vnode *dvp, struct vnode *vp, char *name, - int namelen, struct ucred *cred, struct thread *td); + int namelen, struct ucred *cred, struct thread *td, bool silly); +static void nfs_removestatus(struct vnode *vp, nfsremove_status file_status, + bool silly, struct thread *td); static int nfs_renamerpc(struct vnode *fdvp, struct vnode *fvp, char 
*fnameptr, int fnamelen, struct vnode *tdvp, struct vnode *tvp, - char *tnameptr, int tnamelen, struct ucred *cred, struct thread *td); + char *tnameptr, int tnamelen, bool silly, struct ucred *cred, + struct thread *td); static int nfs_renameit(struct vnode *sdvp, struct vnode *svp, struct componentname *scnp, struct sillyrename *sp); @@ -474,6 +480,18 @@ nfs_access(struct vop_access_args *ap) break; } } + + /* + * For NFSv4, check for a delegation with an Allow ACE, to see + * if that permits access. + */ + if ((VFSTONFS(vp->v_mount)->nm_flag & NFSMNT_NOCTO) != 0) { + error = nfscl_delegacecheck(vp, ap->a_accmode, ap->a_cred); + if (error == 0) + return (error); + error = 0; + } + /* * For nfs v3 or v4, check to see if we have done this recently, and if * so return our cached result instead of making an ACCESS call. @@ -827,9 +845,11 @@ nfs_close(struct vop_close_args *ap) struct ucred *cred; int error = 0, ret, localcred = 0; int fmode = ap->a_fflag; + struct nfsmount *nmp; if (NFSCL_FORCEDISM(vp->v_mount)) return (0); + nmp = VFSTONFS(vp->v_mount); /* * During shutdown, a_cred isn't valid, so just use root. */ @@ -883,7 +903,9 @@ nfs_close(struct vop_close_args *ap) error = ncl_flush(vp, MNT_WAIT, ap->a_td, cm, 0); /* np->n_flag &= ~NMODIFIED; */ } else if (NFS_ISV4(vp)) { - if (nfscl_mustflush(vp) != 0) { + if (!NFSHASNFSV4N(nmp) || + (nmp->nm_flag & NFSMNT_NOCTO) == 0 || + nfscl_mustflush(vp) != 0) { int cm = newnfs_commit_on_close ? 1 : 0; if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { NFSVOPLOCK(vp, LK_UPGRADE | LK_RETRY); @@ -925,7 +947,7 @@ nfs_close(struct vop_close_args *ap) * is the cause of some caching/coherency issue that might * crop up.) */ - if (VFSTONFS(vp->v_mount)->nm_negnametimeo == 0) { + if (nmp->nm_negnametimeo == 0) { np->n_attrstamp = 0; KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp); } @@ -940,9 +962,9 @@ nfs_close(struct vop_close_args *ap) /* * Get attributes so "change" is up to date. 
*/ - if (error == 0 && nfscl_mustflush(vp) != 0 && + if (error == 0 && nfscl_nodeleg(vp, 0) != 0 && vp->v_type == VREG && - (VFSTONFS(vp->v_mount)->nm_flag & NFSMNT_NOCTO) == 0) { + (nmp->nm_flag & NFSMNT_NOCTO) == 0) { ret = nfsrpc_getattr(vp, cred, ap->a_td, &nfsva); if (!ret) { np->n_change = nfsva.na_filerev; @@ -1023,8 +1045,9 @@ nfs_getattr(struct vop_getattr_args *ap) return (0); } } + error = nfsrpc_getattr(vp, ap->a_cred, td, &nfsva); - if (!error) + if (error == 0) error = nfscl_loadattrcache(&vp, &nfsva, vap, 0, 0); if (!error) { /* @@ -1051,21 +1074,29 @@ nfs_setattr(struct vop_setattr_args *ap) int error = 0; u_quad_t tsize; struct timespec ts; + struct nfsmount *nmp; #ifndef nolint tsize = (u_quad_t)0; #endif /* - * Setting of flags and marking of atimes are not supported. + * Only setting of UF_HIDDEN and UF_SYSTEM are supported and + * only for NFSv4 servers that support them. */ - if (vap->va_flags != VNOVAL) + nmp = VFSTONFS(vp->v_mount); + if (vap->va_flags != VNOVAL && (!NFSHASNFSV4(nmp) || + (vap->va_flags & ~(UF_HIDDEN | UF_SYSTEM)) != 0 || + ((vap->va_flags & UF_HIDDEN) != 0 && + !NFSISSET_ATTRBIT(&np->n_vattr.na_suppattr, NFSATTRBIT_HIDDEN)) || + ((vap->va_flags & UF_SYSTEM) != 0 && + !NFSISSET_ATTRBIT(&np->n_vattr.na_suppattr, NFSATTRBIT_SYSTEM)))) return (EOPNOTSUPP); /* * Disallow write attempts if the filesystem is mounted read-only. */ - if ((vap->va_flags != VNOVAL || vap->va_uid != (uid_t)VNOVAL || + if ((vap->va_flags != (u_long)VNOVAL || vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL || vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL || vap->va_birthtime.tv_sec != VNOVAL || @@ -1120,7 +1151,7 @@ nfs_setattr(struct vop_setattr_args *ap) * Call nfscl_delegmodtime() to set the modify time * locally, as required. 
*/ - nfscl_delegmodtime(vp); + nfscl_delegmodtime(vp, NULL); } else NFSUNLOCKNODE(np); /* @@ -1158,6 +1189,8 @@ nfs_setattr(struct vop_setattr_args *ap) NFSUNLOCKNODE(np); } } + if (vap->va_mtime.tv_sec != VNOVAL && error == 0) + nfscl_delegmodtime(vp, &vap->va_mtime); return (error); } @@ -1192,6 +1225,40 @@ nfs_setattrrpc(struct vnode *vp, struct vattr *vap, struct ucred *cred, } /* + * Get a named attribute directory for the vnode. + */ +static int +nfs_get_namedattrdir(struct vnode *vp, struct componentname *cnp, + struct vnode **vpp) +{ + struct nfsfh *nfhp; + struct nfsnode *np; + struct vnode *newvp; + struct nfsvattr nfsva; + int attrflag, error; + + attrflag = 0; + *vpp = NULL; + np = VTONFS(vp); + error = nfsrpc_openattr(VFSTONFS(vp->v_mount), vp, np->n_fhp->nfh_fh, + np->n_fhp->nfh_len, (cnp->cn_flags & CREATENAMED), + cnp->cn_cred, curthread, &nfsva, &nfhp, &attrflag); + if (error == NFSERR_NOTSUPP) + error = ENOATTR; + if (error == 0) + error = nfscl_nget(vp->v_mount, vp, nfhp, cnp, curthread, &np, + cnp->cn_lkflags); + if (error != 0) + return (error); + newvp = NFSTOV(np); + vn_irflag_set_cond(newvp, VIRF_NAMEDDIR); + if (attrflag != 0) + (void)nfscl_loadattrcache(&newvp, &nfsva, NULL, 0, 1); + *vpp = newvp; + return (0); +} + +/* * nfs lookup call, one step at a time... 
* First look in cache * If not found, unlock the directory nfsnode and do the rpc @@ -1203,7 +1270,7 @@ nfs_lookup(struct vop_lookup_args *ap) struct vnode *dvp = ap->a_dvp; struct vnode **vpp = ap->a_vpp; struct mount *mp = dvp->v_mount; - int flags = cnp->cn_flags; + uint64_t flags = cnp->cn_flags; struct vnode *newvp; struct nfsmount *nmp; struct nfsnode *np, *newnp; @@ -1214,15 +1281,57 @@ nfs_lookup(struct vop_lookup_args *ap) struct vattr vattr; struct timespec nctime, ts; uint32_t openmode; + bool is_nameddir, needs_nameddir, opennamed; + dattrflag = 0; *vpp = NULLVP; + nmp = VFSTONFS(mp); + opennamed = (flags & (OPENNAMED | ISLASTCN)) == (OPENNAMED | ISLASTCN); + if (opennamed && (!NFSHASNFSV4(nmp) || !NFSHASNFSV4N(nmp))) + return (ENOATTR); + is_nameddir = (vn_irflag_read(dvp) & VIRF_NAMEDDIR) != 0; + if ((is_nameddir && (flags & ISLASTCN) == 0 && (cnp->cn_namelen > 1 || + *cnp->cn_nameptr != '.')) || + (opennamed && !is_nameddir && (flags & ISDOTDOT) != 0)) + return (ENOATTR); if ((flags & ISLASTCN) && (mp->mnt_flag & MNT_RDONLY) && (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) return (EROFS); + np = VTONFS(dvp); + + needs_nameddir = false; + if (opennamed || is_nameddir) { + cnp->cn_flags &= ~MAKEENTRY; + if (!is_nameddir) + needs_nameddir = true; + } + + /* + * If the named attribute directory is needed, acquire it now. + */ + newvp = NULLVP; + if (needs_nameddir) { + KASSERT(np->n_v4 == NULL, ("nfs_lookup: O_NAMEDATTR when" + " n_v4 not NULL")); + error = nfs_get_namedattrdir(dvp, cnp, &newvp); + if (error != 0) + goto handle_error; + if (cnp->cn_namelen == 1 && *cnp->cn_nameptr == '.') { + *vpp = newvp; + return (0); + } + dvp = newvp; + np = VTONFS(dvp); + newvp = NULLVP; + } else if (opennamed && cnp->cn_namelen == 1 && + *cnp->cn_nameptr == '.') { + VREF(dvp); + *vpp = dvp; + return (0); + } + if (dvp->v_type != VDIR) return (ENOTDIR); - nmp = VFSTONFS(mp); - np = VTONFS(dvp); /* For NFSv4, wait until any remove is done. 
*/ NFSLOCKNODE(np); @@ -1235,80 +1344,91 @@ nfs_lookup(struct vop_lookup_args *ap) error = vn_dir_check_exec(dvp, cnp); if (error != 0) return (error); - error = cache_lookup(dvp, vpp, cnp, &nctime, &ncticks); - if (error > 0 && error != ENOENT) - return (error); - if (error == -1) { - /* - * Lookups of "." are special and always return the - * current directory. cache_lookup() already handles - * associated locking bookkeeping, etc. - */ - if (cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.') { - return (0); - } - /* - * We only accept a positive hit in the cache if the - * change time of the file matches our cached copy. - * Otherwise, we discard the cache entry and fallback - * to doing a lookup RPC. We also only trust cache - * entries for less than nm_nametimeo seconds. - * - * To better handle stale file handles and attributes, - * clear the attribute cache of this node if it is a - * leaf component, part of an open() call, and not - * locally modified before fetching the attributes. - * This should allow stale file handles to be detected - * here where we can fall back to a LOOKUP RPC to - * recover rather than having nfs_open() detect the - * stale file handle and failing open(2) with ESTALE. 
- */ - newvp = *vpp; - newnp = VTONFS(newvp); - if (!(nmp->nm_flag & NFSMNT_NOCTO) && - (flags & (ISLASTCN | ISOPEN)) == (ISLASTCN | ISOPEN) && - !(newnp->n_flag & NMODIFIED)) { - NFSLOCKNODE(newnp); - newnp->n_attrstamp = 0; - KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(newvp); - NFSUNLOCKNODE(newnp); - } - if (nfscl_nodeleg(newvp, 0) == 0 || - ((u_int)(ticks - ncticks) < (nmp->nm_nametimeo * hz) && - VOP_GETATTR(newvp, &vattr, cnp->cn_cred) == 0 && - timespeccmp(&vattr.va_ctime, &nctime, ==))) { - NFSINCRGLOBAL(nfsstatsv1.lookupcache_hits); - return (0); - } - cache_purge(newvp); - if (dvp != newvp) - vput(newvp); - else - vrele(newvp); - *vpp = NULLVP; - } else if (error == ENOENT) { - if (VN_IS_DOOMED(dvp)) - return (ENOENT); - /* - * We only accept a negative hit in the cache if the - * modification time of the parent directory matches - * the cached copy in the name cache entry. - * Otherwise, we discard all of the negative cache - * entries for this directory. We also only trust - * negative cache entries for up to nm_negnametimeo - * seconds. - */ - if ((u_int)(ticks - ncticks) < (nmp->nm_negnametimeo * hz) && - VOP_GETATTR(dvp, &vattr, cnp->cn_cred) == 0 && - timespeccmp(&vattr.va_mtime, &nctime, ==)) { - NFSINCRGLOBAL(nfsstatsv1.lookupcache_hits); - return (ENOENT); + if (!opennamed && !is_nameddir) { + error = cache_lookup(dvp, vpp, cnp, &nctime, &ncticks); + if (error > 0 && error != ENOENT) + return (error); + if (error == -1) { + /* + * Lookups of "." are special and always return the + * current directory. cache_lookup() already handles + * associated locking bookkeeping, etc. + */ + if (cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.') { + return (0); + } + + /* + * We only accept a positive hit in the cache if the + * change time of the file matches our cached copy. + * Otherwise, we discard the cache entry and fallback + * to doing a lookup RPC. We also only trust cache + * entries for less than nm_nametimeo seconds. 
+ * + * To better handle stale file handles and attributes, + * clear the attribute cache of this node if it is a + * leaf component, part of an open() call, and not + * locally modified before fetching the attributes. + * This should allow stale file handles to be detected + * here where we can fall back to a LOOKUP RPC to + * recover rather than having nfs_open() detect the + * stale file handle and failing open(2) with ESTALE. + */ + newvp = *vpp; + newnp = VTONFS(newvp); + if (!(nmp->nm_flag & NFSMNT_NOCTO) && + (flags & (ISLASTCN | ISOPEN)) == + (ISLASTCN | ISOPEN) && + !(newnp->n_flag & NMODIFIED)) { + NFSLOCKNODE(newnp); + newnp->n_attrstamp = 0; + KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(newvp); + NFSUNLOCKNODE(newnp); + } + if (nfscl_nodeleg(newvp, 0) == 0 || + ((u_int)(ticks - ncticks) < + (nmp->nm_nametimeo * hz) && + VOP_GETATTR(newvp, &vattr, cnp->cn_cred) == 0 && + timespeccmp(&vattr.va_ctime, &nctime, ==))) { + NFSINCRGLOBAL(nfsstatsv1.lookupcache_hits); + return (0); + } + cache_purge(newvp); + if (dvp != newvp) + vput(newvp); + else + vrele(newvp); + *vpp = NULLVP; + } else if (error == ENOENT) { + if (VN_IS_DOOMED(dvp)) + return (ENOENT); + /* + * We only accept a negative hit in the cache if the + * modification time of the parent directory matches + * the cached copy in the name cache entry. + * Otherwise, we discard all of the negative cache + * entries for this directory. We also only trust + * negative cache entries for up to nm_negnametimeo + * seconds. + */ + if ((u_int)(ticks - ncticks) < + (nmp->nm_negnametimeo * hz) && + VOP_GETATTR(dvp, &vattr, cnp->cn_cred) == 0 && + timespeccmp(&vattr.va_mtime, &nctime, ==)) { + NFSINCRGLOBAL(nfsstatsv1.lookupcache_hits); + return (ENOENT); + } + cache_purge_negative(dvp); } - cache_purge_negative(dvp); } openmode = 0; +#if 0 + /* + * The use of LookupOpen breaks some builds. It is disabled + * until that is fixed. 
+ */ /* * If this an NFSv4.1/4.2 mount using the "oneopenown" mount * option, it is possible to do the Open operation in the same @@ -1321,13 +1441,14 @@ nfs_lookup(struct vop_lookup_args *ap) if (NFSHASNFSV4N(nmp) && NFSHASONEOPENOWN(nmp) && !NFSHASPNFS(nmp) && (nmp->nm_privflag & NFSMNTP_DELEGISSUED) == 0 && (!NFSMNT_RDONLY(mp) || (flags & OPENWRITE) == 0) && - (flags & (ISLASTCN | ISOPEN)) == (ISLASTCN | ISOPEN)) { + (flags & (ISLASTCN | ISOPEN | OPENNAMED))) == (ISLASTCN | ISOPEN)) { if ((flags & OPENREAD) != 0) openmode |= NFSV4OPEN_ACCESSREAD; if ((flags & OPENWRITE) != 0) openmode |= NFSV4OPEN_ACCESSWRITE; } NFSUNLOCKMNT(nmp); +#endif newvp = NULLVP; NFSINCRGLOBAL(nfsstatsv1.lookupcache_misses); @@ -1337,6 +1458,11 @@ nfs_lookup(struct vop_lookup_args *ap) openmode); if (dattrflag) (void) nfscl_loadattrcache(&dvp, &dnfsva, NULL, 0, 1); + if (needs_nameddir) { + vput(dvp); + dvp = ap->a_dvp; + } +handle_error: if (error) { if (newvp != NULLVP) { vput(newvp); @@ -1345,13 +1471,14 @@ nfs_lookup(struct vop_lookup_args *ap) if (error != ENOENT) { if (NFS_ISV4(dvp)) - error = nfscl_maperr(td, error, (uid_t)0, - (gid_t)0); + error = nfscl_maperr(td, error, + (uid_t)0, (gid_t)0); return (error); } /* The requested file was not found. 
*/ - if ((cnp->cn_nameiop == CREATE || cnp->cn_nameiop == RENAME) && + if ((cnp->cn_nameiop == CREATE || + cnp->cn_nameiop == RENAME) && (flags & ISLASTCN)) { /* * XXX: UFS does a full VOP_ACCESS(dvp, @@ -1392,7 +1519,8 @@ nfs_lookup(struct vop_lookup_args *ap) free(nfhp, M_NFSFH); return (EISDIR); } - error = nfscl_nget(mp, dvp, nfhp, cnp, td, &np, LK_EXCLUSIVE); + error = nfscl_nget(mp, dvp, nfhp, cnp, td, &np, + LK_EXCLUSIVE); if (error) return (error); newvp = NFSTOV(np); @@ -1413,7 +1541,8 @@ nfs_lookup(struct vop_lookup_args *ap) } NFSUNLOCKNODE(np); if (attrflag) - (void) nfscl_loadattrcache(&newvp, &nfsva, NULL, 0, 1); + (void) nfscl_loadattrcache(&newvp, &nfsva, NULL, + 0, 1); *vpp = newvp; return (0); } @@ -1454,19 +1583,23 @@ nfs_lookup(struct vop_lookup_args *ap) if (error != 0) return (error); if (attrflag) - (void) nfscl_loadattrcache(&newvp, &nfsva, NULL, 0, 1); + (void) nfscl_loadattrcache(&newvp, &nfsva, NULL, + 0, 1); } else if (NFS_CMPFH(np, nfhp->nfh_fh, nfhp->nfh_len)) { free(nfhp, M_NFSFH); VREF(dvp); newvp = dvp; if (attrflag) - (void) nfscl_loadattrcache(&newvp, &nfsva, NULL, 0, 1); + (void) nfscl_loadattrcache(&newvp, &nfsva, NULL, + 0, 1); } else { error = nfscl_nget(mp, dvp, nfhp, cnp, td, &np, cnp->cn_lkflags); if (error) return (error); newvp = NFSTOV(np); + if (opennamed) + vn_irflag_set_cond(newvp, VIRF_NAMEDATTR); /* * If n_localmodtime >= time before RPC, then * a file modification operation, such as @@ -1484,8 +1617,10 @@ nfs_lookup(struct vop_lookup_args *ap) } NFSUNLOCKNODE(np); if (attrflag) - (void) nfscl_loadattrcache(&newvp, &nfsva, NULL, 0, 1); - else if ((flags & (ISLASTCN | ISOPEN)) == (ISLASTCN | ISOPEN) && + (void)nfscl_loadattrcache(&newvp, &nfsva, NULL, + 0, 1); + else if ((flags & (ISLASTCN | ISOPEN)) == + (ISLASTCN | ISOPEN) && !(np->n_flag & NMODIFIED)) { /* * Flush the attribute cache when opening a @@ -1746,6 +1881,7 @@ nfs_create(struct vop_create_args *ap) nfsquad_t cverf; int error = 0, attrflag, dattrflag, 
fmode = 0; struct vattr vattr; + bool is_nameddir, needs_nameddir, opennamed; /* * Oops, not for me.. @@ -1759,6 +1895,32 @@ nfs_create(struct vop_create_args *ap) fmode |= O_EXCL; dnp = VTONFS(dvp); nmp = VFSTONFS(dvp->v_mount); + needs_nameddir = false; + if (NFSHASNFSV4(nmp) && NFSHASNFSV4N(nmp)) { + opennamed = (cnp->cn_flags & (OPENNAMED | ISLASTCN)) == + (OPENNAMED | ISLASTCN); + is_nameddir = (vn_irflag_read(dvp) & VIRF_NAMEDDIR) != 0; + if (opennamed || is_nameddir) { + cnp->cn_flags &= ~MAKEENTRY; + if (!is_nameddir) + needs_nameddir = true; + } + } + + /* + * If the named attribute directory is needed, acquire it now. + */ + if (needs_nameddir) { + KASSERT(dnp->n_v4 == NULL, ("nfs_create: O_NAMEDATTR when" + " n_v4 not NULL")); + error = nfs_get_namedattrdir(dvp, cnp, &newvp); + if (error != 0) + return (error); + dvp = newvp; + dnp = VTONFS(dvp); + newvp = NULL; + } + again: /* For NFSv4, wait until any remove is done. */ NFSLOCKNODE(dnp); @@ -1841,6 +2003,8 @@ again: KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(dvp); } NFSUNLOCKNODE(dnp); + if (needs_nameddir) + vput(dvp); return (error); } @@ -1864,6 +2028,7 @@ nfs_remove(struct vop_remove_args *ap) struct nfsnode *np = VTONFS(vp); int error = 0; struct vattr vattr; + struct nfsmount *nmp; KASSERT(vrefcnt(vp) > 0, ("nfs_remove: bad v_usecount")); if (vp->v_type == VDIR) @@ -1871,6 +2036,7 @@ nfs_remove(struct vop_remove_args *ap) else if (vrefcnt(vp) == 1 || (np->n_sillyrename && VOP_GETATTR(vp, &vattr, cnp->cn_cred) == 0 && vattr.va_nlink > 1)) { + nmp = VFSTONFS(vp->v_mount); /* * Purge the name cache so that the chance of a lookup for * the name succeeding while the remove is in progress is @@ -1882,12 +2048,19 @@ nfs_remove(struct vop_remove_args *ap) /* * throw away biocache buffers, mainly to avoid * unnecessary delayed writes later. + * Flushing here would be more correct for the case + * where nfs_close() did not do a flush. 
However, it + * could be a large performance hit for some servers + * and only matters when the file name being removed is + * one of multiple hard links. */ - error = ncl_vinvalbuf(vp, 0, curthread, 1); + if (!NFSHASNFSV4(nmp) || !NFSHASNFSV4N(nmp) || + (nmp->nm_flag & NFSMNT_NOCTO) == 0) + error = ncl_vinvalbuf(vp, 0, curthread, 1); if (error != EINTR && error != EIO) /* Do the rpc */ error = nfs_removerpc(dvp, vp, cnp->cn_nameptr, - cnp->cn_namelen, cnp->cn_cred, curthread); + cnp->cn_namelen, cnp->cn_cred, curthread, false); /* * Kludge City: If the first reply to the remove rpc is lost.. * the reply to the retransmitted request will be ENOENT @@ -1918,7 +2091,32 @@ ncl_removeit(struct sillyrename *sp, struct vnode *vp) if (sp->s_dvp->v_type == VBAD) return (0); return (nfs_removerpc(sp->s_dvp, vp, sp->s_name, sp->s_namlen, - sp->s_cred, NULL)); + sp->s_cred, NULL, true)); +} + +/* + * Handle the nfsremove_status reply from the RPC function. + */ +static void +nfs_removestatus(struct vnode *vp, nfsremove_status file_status, + bool silly, struct thread *td) +{ + + switch (file_status) { + case NLINK_ZERO: + /* Get rid of any delegation. */ + nfscl_delegreturnvp(vp, false, td); + /* FALLTHROUGH */ + case DELETED: + /* Throw away buffer cache blocks. */ + (void)ncl_vinvalbuf(vp, 0, td, 1); + break; + case VALID: + /* Nothing to do, delegation is still ok. 
*/ + break; + default: + break; + } } /* @@ -1926,17 +2124,20 @@ ncl_removeit(struct sillyrename *sp, struct vnode *vp) */ static int nfs_removerpc(struct vnode *dvp, struct vnode *vp, char *name, - int namelen, struct ucred *cred, struct thread *td) + int namelen, struct ucred *cred, struct thread *td, bool silly) { - struct nfsvattr dnfsva; + struct nfsvattr dnfsva, nfsva; struct nfsnode *dnp = VTONFS(dvp); - int error = 0, dattrflag; + struct nfsmount *nmp; + int attrflag, error = 0, dattrflag; + nfsremove_status file_status; + nmp = VFSTONFS(dvp->v_mount); NFSLOCKNODE(dnp); dnp->n_flag |= NREMOVEINPROG; NFSUNLOCKNODE(dnp); - error = nfsrpc_remove(dvp, name, namelen, vp, cred, td, &dnfsva, - &dattrflag); + error = nfsrpc_remove(dvp, name, namelen, vp, &nfsva, &attrflag, + &file_status, &dnfsva, &dattrflag, cred, td); NFSLOCKNODE(dnp); if ((dnp->n_flag & NREMOVEWANT)) { dnp->n_flag &= ~(NREMOVEWANT | NREMOVEINPROG); @@ -1946,11 +2147,19 @@ nfs_removerpc(struct vnode *dvp, struct vnode *vp, char *name, dnp->n_flag &= ~NREMOVEINPROG; NFSUNLOCKNODE(dnp); } - if (dattrflag) + + if (NFSHASNFSV4(nmp) && NFSHASNFSV4N(nmp)) { + if (file_status != DELETED && attrflag != 0) + (void)nfscl_loadattrcache(&vp, &nfsva, NULL, 0, 1); + if ((nmp->nm_flag & NFSMNT_NOCTO) != 0) + nfs_removestatus(vp, file_status, silly, td); + } + + if (dattrflag != 0) (void) nfscl_loadattrcache(&dvp, &dnfsva, NULL, 0, 1); NFSLOCKNODE(dnp); dnp->n_flag |= NMODIFIED; - if (!dattrflag) { + if (dattrflag == 0) { dnp->n_attrstamp = 0; KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(dvp); } @@ -1975,6 +2184,7 @@ nfs_rename(struct vop_rename_args *ap) struct nfsnode *fnp = VTONFS(ap->a_fvp); struct nfsnode *tdnp = VTONFS(ap->a_tdvp); struct nfsv4node *newv4 = NULL; + struct nfsmount *nmp; int error; /* Check for cross-device rename */ @@ -1983,6 +2193,7 @@ nfs_rename(struct vop_rename_args *ap) error = EXDEV; goto out; } + nmp = VFSTONFS(fvp->v_mount); if (fvp == tvp) { printf("nfs_rename: fvp == tvp (can't happen)\n"); 
@@ -2005,11 +2216,15 @@ nfs_rename(struct vop_rename_args *ap) * that was written back to our cache earlier. Not checking for * this condition can result in potential (silent) data loss. */ - error = VOP_FSYNC(fvp, MNT_WAIT, curthread); + if ((nmp->nm_flag & NFSMNT_NOCTO) == 0 || !NFSHASNFSV4(nmp) || + !NFSHASNFSV4N(nmp) || nfscl_mustflush(fvp) != 0) + error = VOP_FSYNC(fvp, MNT_WAIT, curthread); NFSVOPUNLOCK(fvp); - if (!error && tvp) + if (error == 0 && tvp != NULL && ((nmp->nm_flag & NFSMNT_NOCTO) == 0 || + !NFSHASNFSV4(nmp) || !NFSHASNFSV4N(nmp) || + nfscl_mustflush(tvp) != 0)) error = VOP_FSYNC(tvp, MNT_WAIT, curthread); - if (error) + if (error != 0) goto out; /* @@ -2024,7 +2239,7 @@ nfs_rename(struct vop_rename_args *ap) } error = nfs_renamerpc(fdvp, fvp, fcnp->cn_nameptr, fcnp->cn_namelen, - tdvp, tvp, tcnp->cn_nameptr, tcnp->cn_namelen, tcnp->cn_cred, + tdvp, tvp, tcnp->cn_nameptr, tcnp->cn_namelen, false, tcnp->cn_cred, curthread); if (error == 0 && NFS_ISV4(tdvp)) { @@ -2093,7 +2308,7 @@ nfs_renameit(struct vnode *sdvp, struct vnode *svp, struct componentname *scnp, { return (nfs_renamerpc(sdvp, svp, scnp->cn_nameptr, scnp->cn_namelen, - sdvp, NULL, sp->s_name, sp->s_namlen, scnp->cn_cred, + sdvp, NULL, sp->s_name, sp->s_namlen, true, scnp->cn_cred, curthread)); } @@ -2103,16 +2318,19 @@ nfs_renameit(struct vnode *sdvp, struct vnode *svp, struct componentname *scnp, static int nfs_renamerpc(struct vnode *fdvp, struct vnode *fvp, char *fnameptr, int fnamelen, struct vnode *tdvp, struct vnode *tvp, char *tnameptr, - int tnamelen, struct ucred *cred, struct thread *td) + int tnamelen, bool silly, struct ucred *cred, struct thread *td) { - struct nfsvattr fnfsva, tnfsva; + struct nfsvattr fnfsva, tnfsva, tvpnfsva; struct nfsnode *fdnp = VTONFS(fdvp); struct nfsnode *tdnp = VTONFS(tdvp); - int error = 0, fattrflag, tattrflag; + struct nfsmount *nmp; + int error = 0, fattrflag, tattrflag, tvpattrflag; + nfsremove_status tvp_status; + nmp = 
VFSTONFS(fdvp->v_mount); error = nfsrpc_rename(fdvp, fvp, fnameptr, fnamelen, tdvp, tvp, - tnameptr, tnamelen, cred, td, &fnfsva, &tnfsva, &fattrflag, - &tattrflag); + tnameptr, tnamelen, &tvp_status, &fnfsva, &tnfsva, &fattrflag, + &tattrflag, &tvpnfsva, &tvpattrflag, cred, td); NFSLOCKNODE(fdnp); fdnp->n_flag |= NMODIFIED; if (fattrflag != 0) { @@ -2133,6 +2351,15 @@ nfs_renamerpc(struct vnode *fdvp, struct vnode *fvp, char *fnameptr, NFSUNLOCKNODE(tdnp); KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(tdvp); } + + if (tvp != NULL) { + if (NFSHASNFSV4(nmp) && NFSHASNFSV4N(nmp) && + (nmp->nm_flag & NFSMNT_NOCTO) != 0) + nfs_removestatus(tvp, tvp_status, silly, td); + if (!silly && tvpattrflag != 0) + (void)nfscl_loadattrcache(&tvp, &tvpnfsva, NULL, 0, 1); + } + if (error && NFS_ISV4(fdvp)) error = nfscl_maperr(td, error, (uid_t)0, (gid_t)0); return (error); @@ -2156,7 +2383,9 @@ nfs_link(struct vop_link_args *ap) * doesn't get "out of sync" with the server. * XXX There should be a better way! */ +#ifdef notnow VOP_FSYNC(vp, MNT_WAIT, curthread); +#endif error = nfsrpc_link(tdvp, vp, cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_cred, curthread, &dnfsva, &nfsva, &attrflag, &dattrflag); @@ -4367,25 +4596,48 @@ nfs_pathconf(struct vop_pathconf_args *ap) struct nfsmount *nmp; struct thread *td = curthread; off_t off; - bool eof; + bool eof, has_namedattr, named_enabled; int attrflag, error; + struct nfsnode *np; + nmp = VFSTONFS(vp->v_mount); + np = VTONFS(vp); + named_enabled = false; + has_namedattr = false; if ((NFS_ISV34(vp) && (ap->a_name == _PC_LINK_MAX || ap->a_name == _PC_NAME_MAX || ap->a_name == _PC_CHOWN_RESTRICTED || ap->a_name == _PC_NO_TRUNC)) || - (NFS_ISV4(vp) && ap->a_name == _PC_ACL_NFS4)) { + (NFS_ISV4(vp) && (ap->a_name == _PC_ACL_NFS4 || + ap->a_name == _PC_HAS_NAMEDATTR))) { /* * Since only the above 4 a_names are returned by the NFSv3 * Pathconf RPC, there is no point in doing it for others. * For NFSv4, the Pathconf RPC (actually a Getattr Op.) 
can - * be used for _PC_NFS4_ACL as well. + * be used for _PC_ACL_NFS4 and _PC_HAS_NAMEDATTR as well. */ - error = nfsrpc_pathconf(vp, &pc, td->td_ucred, td, &nfsva, - &attrflag); + error = nfsrpc_pathconf(vp, &pc, &has_namedattr, td->td_ucred, + td, &nfsva, &attrflag); if (attrflag != 0) (void) nfscl_loadattrcache(&vp, &nfsva, NULL, 0, 1); if (error != 0) return (error); + } else if (NFS_ISV4(vp) && ap->a_name == _PC_NAMEDATTR_ENABLED && + (np->n_flag & NNAMEDNOTSUPP) == 0) { + struct nfsfh *nfhp; + + error = nfsrpc_openattr(nmp, vp, np->n_fhp->nfh_fh, + np->n_fhp->nfh_len, false, td->td_ucred, td, &nfsva, &nfhp, + &attrflag); + named_enabled = true; + if (error == 0) { + free(nfhp, M_NFSFH); + } else if (error == NFSERR_NOTSUPP) { + named_enabled = false; + NFSLOCKNODE(np); + np->n_flag |= NNAMEDNOTSUPP; + NFSUNLOCKNODE(np); + } + error = 0; } else { /* * For NFSv2 (or NFSv3 when not one of the above 4 a_names), @@ -4468,7 +4720,6 @@ nfs_pathconf(struct vop_pathconf_args *ap) case _PC_MIN_HOLE_SIZE: /* Only some NFSv4.2 servers support Seek for Holes. 
*/ *ap->a_retval = 0; - nmp = VFSTONFS(vp->v_mount); if (NFS_ISV4(vp) && nmp->nm_minorvers == NFSV42_MINORVERSION) { /* * NFSv4.2 doesn't have an attribute for hole size, @@ -4499,6 +4750,27 @@ nfs_pathconf(struct vop_pathconf_args *ap) mtx_unlock(&nmp->nm_mtx); } break; + case _PC_NAMEDATTR_ENABLED: + if (named_enabled) + *ap->a_retval = 1; + else + *ap->a_retval = 0; + break; + case _PC_HAS_NAMEDATTR: + if (has_namedattr) + *ap->a_retval = 1; + else + *ap->a_retval = 0; + break; + case _PC_HAS_HIDDENSYSTEM: + if (NFS_ISV4(vp) && NFSISSET_ATTRBIT(&np->n_vattr.na_suppattr, + NFSATTRBIT_HIDDEN) && + NFSISSET_ATTRBIT(&np->n_vattr.na_suppattr, + NFSATTRBIT_SYSTEM)) + *ap->a_retval = 1; + else + *ap->a_retval = 0; + break; default: error = vop_stdpathconf(ap); diff --git a/sys/fs/nfsclient/nfsnode.h b/sys/fs/nfsclient/nfsnode.h index cc1959b7bf79..9b2627015612 100644 --- a/sys/fs/nfsclient/nfsnode.h +++ b/sys/fs/nfsclient/nfsnode.h @@ -162,6 +162,7 @@ struct nfsnode { #define NDSCOMMIT 0x00100000 /* Commit is done via the DS. */ #define NVNSETSZSKIP 0x00200000 /* Skipped vnode_pager_setsize() */ #define NMIGHTBELOCKED 0x00400000 /* Might be file locked. */ +#define NNAMEDNOTSUPP 0x00800000 /* Openattr is not supported. 
*/ /* * Convert between nfsnode pointers and vnode pointers diff --git a/sys/fs/nfsserver/nfs_nfsdcache.c b/sys/fs/nfsserver/nfs_nfsdcache.c index bf0ff4e84d98..de72187bbb91 100644 --- a/sys/fs/nfsserver/nfs_nfsdcache.c +++ b/sys/fs/nfsserver/nfs_nfsdcache.c @@ -392,7 +392,7 @@ loop: nfsaddr_match(NETFAMILY(rp), &rp->rc_haddr, nd->nd_nam)) { if ((rp->rc_flag & RC_LOCKED) != 0) { rp->rc_flag |= RC_WANTED; - (void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP, + (void)mtx_sleep(rp, mutex, PVFS | PDROP, "nfsrc", 10 * hz); goto loop; } @@ -678,7 +678,7 @@ tryagain: rp = hitrp; if ((rp->rc_flag & RC_LOCKED) != 0) { rp->rc_flag |= RC_WANTED; - (void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP, + (void)mtx_sleep(rp, mutex, PVFS | PDROP, "nfsrc", 10 * hz); goto tryagain; } @@ -750,7 +750,7 @@ nfsrc_lock(struct nfsrvcache *rp) mtx_assert(mutex, MA_OWNED); while ((rp->rc_flag & RC_LOCKED) != 0) { rp->rc_flag |= RC_WANTED; - (void)mtx_sleep(rp, mutex, PZERO - 1, "nfsrc", 0); + (void)mtx_sleep(rp, mutex, PVFS, "nfsrc", 0); } rp->rc_flag |= RC_LOCKED; } diff --git a/sys/fs/nfsserver/nfs_nfsdport.c b/sys/fs/nfsserver/nfs_nfsdport.c index 6f5b2855bcf0..4f0d5946d6b9 100644 --- a/sys/fs/nfsserver/nfs_nfsdport.c +++ b/sys/fs/nfsserver/nfs_nfsdport.c @@ -69,6 +69,7 @@ extern int nfsrv_maxpnfsmirror; extern uint32_t nfs_srvmaxio; extern int nfs_bufpackets; extern u_long sb_max_adj; +extern struct nfsv4lock nfsv4rootfs_lock; NFSD_VNET_DECLARE(int, nfsrv_numnfsd); NFSD_VNET_DECLARE(struct nfsrv_stablefirst, nfsrv_stablefirst); @@ -121,7 +122,6 @@ extern struct nfsdevicehead nfsrv_devidhead; /* Map d_type to vnode type. */ static uint8_t dtype_to_vnode[DT_WHT + 1] = { VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON, VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON }; -#define NFS_DTYPETOVTYPE(t) ((t) <= DT_WHT ? 
dtype_to_vnode[(t)] : VNON) static int nfsrv_createiovec(int, struct mbuf **, struct mbuf **, struct iovec **); @@ -129,6 +129,7 @@ static int nfsrv_createiovec_extpgs(int, int, struct mbuf **, struct mbuf **, struct iovec **); static int nfsrv_createiovecw(int, struct mbuf *, char *, struct iovec **, int *); +static void nfs_dtypetovtype(struct nfsvattr *, struct vnode *, uint8_t); static void nfsrv_pnfscreate(struct vnode *, struct vattr *, struct ucred *, NFSPROC_T *); static void nfsrv_pnfsremovesetup(struct vnode *, NFSPROC_T *, struct vnode **, @@ -178,8 +179,6 @@ SYSCTL_INT(_vfs_nfsd, OID_AUTO, commit_miss, CTLFLAG_RW, &nfs_commit_miss, 0, ""); SYSCTL_INT(_vfs_nfsd, OID_AUTO, issue_delegations, CTLFLAG_RW, &nfsrv_issuedelegs, 0, "Enable nfsd to issue delegations"); -SYSCTL_INT(_vfs_nfsd, OID_AUTO, enable_locallocks, CTLFLAG_RW, - &nfsrv_dolocallocks, 0, "Enable nfsd to acquire local locks on files"); SYSCTL_INT(_vfs_nfsd, OID_AUTO, debuglevel, CTLFLAG_RW, &nfsd_debuglevel, 0, "Debug level for NFS server"); NFSD_VNET_DECLARE(int, nfsd_enable_stringtouid); @@ -189,6 +188,10 @@ SYSCTL_INT(_vfs_nfsd, OID_AUTO, enable_stringtouid, static int nfsrv_pnfsgetdsattr = 1; SYSCTL_INT(_vfs_nfsd, OID_AUTO, pnfsgetdsattr, CTLFLAG_RW, &nfsrv_pnfsgetdsattr, 0, "When set getattr gets DS attributes via RPC"); +static bool nfsrv_recalldeleg = false; +SYSCTL_BOOL(_vfs_nfsd, OID_AUTO, recalldeleg, CTLFLAG_RW, + &nfsrv_recalldeleg, 0, + "When set remove/rename recalls delegations for same client"); /* * nfsrv_dsdirsize can only be increased and only when the nfsd threads are @@ -294,6 +297,38 @@ SYSCTL_PROC(_vfs_nfsd, OID_AUTO, srvmaxio, CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_srvmaxio, "IU", "Maximum I/O size in bytes"); +static int +sysctl_dolocallocks(SYSCTL_HANDLER_ARGS) +{ + int error, igotlock, newdolocallocks; + + newdolocallocks = nfsrv_dolocallocks; + error = sysctl_handle_int(oidp, &newdolocallocks, 0, req); + if (error != 0 || req->newptr == NULL) + 
return (error); + if (newdolocallocks == nfsrv_dolocallocks) + return (0); + if (jailed(curthread->td_ucred)) + return (EINVAL); + + NFSLOCKV4ROOTMUTEX(); + do { + igotlock = nfsv4_lock(&nfsv4rootfs_lock, 1, NULL, + NFSV4ROOTLOCKMUTEXPTR, NULL); + } while (!igotlock); + NFSUNLOCKV4ROOTMUTEX(); + + nfsrv_dolocallocks = newdolocallocks; + + NFSLOCKV4ROOTMUTEX(); + nfsv4_unlock(&nfsv4rootfs_lock, 0); + NFSUNLOCKV4ROOTMUTEX(); + return (0); +} +SYSCTL_PROC(_vfs_nfsd, OID_AUTO, enable_locallocks, + CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, + sysctl_dolocallocks, "IU", "Enable nfsd to acquire local locks on files"); + #define MAX_REORDERED_RPC 16 #define NUM_HEURISTIC 1031 #define NHUSE_INIT 64 @@ -413,6 +448,8 @@ nfsvno_getattr(struct vnode *vp, struct nfsvattr *nvap, gotattr = 1; } + nvap->na_bsdflags = 0; + nvap->na_flags = 0; error = VOP_GETATTR(vp, &nvap->na_vattr, nd->nd_cred); if (lockedit != 0) NFSVOPUNLOCK(vp); @@ -1451,32 +1488,61 @@ nfsmout: * Remove a non-directory object. 
*/ int -nfsvno_removesub(struct nameidata *ndp, int is_v4, struct ucred *cred, +nfsvno_removesub(struct nameidata *ndp, bool is_v4, struct nfsrv_descript *nd, struct thread *p, struct nfsexstuff *exp) { - struct vnode *vp, *dsdvp[NFSDEV_MAXMIRRORS]; - int error = 0, mirrorcnt; + struct vnode *vp, *dsdvp[NFSDEV_MAXMIRRORS], *newvp; + struct mount *mp; + int error = 0, mirrorcnt, ret; char fname[PNFS_FILENAME_LEN + 1]; fhandle_t fh; vp = ndp->ni_vp; dsdvp[0] = NULL; - if (vp->v_type == VDIR) + if (vp->v_type == VDIR) { error = NFSERR_ISDIR; - else if (is_v4) - error = nfsrv_checkremove(vp, 1, NULL, (nfsquad_t)((u_quad_t)0), - p); + } else if (is_v4) { + if (nfsrv_recalldeleg || (nd->nd_flag & ND_NFSV41) == 0) + error = nfsrv_checkremove(vp, 1, NULL, + (nfsquad_t)((u_quad_t)0), p); + else + error = nfsrv_checkremove(vp, 1, NULL, nd->nd_clientid, + p); + } if (error == 0) nfsrv_pnfsremovesetup(vp, p, dsdvp, &mirrorcnt, fname, &fh); if (!error) error = VOP_REMOVE(ndp->ni_dvp, vp, &ndp->ni_cnd); if (error == 0 && dsdvp[0] != NULL) nfsrv_pnfsremove(dsdvp, mirrorcnt, fname, &fh, p); + if (is_v4 && (nd->nd_flag & ND_NFSV41) != 0 && error == 0) + error = nfsvno_getfh(vp, &fh, p); if (ndp->ni_dvp == vp) vrele(ndp->ni_dvp); else vput(ndp->ni_dvp); vput(vp); + + /* Use ret to determine if the file still exists. */ + if (is_v4 && (nd->nd_flag & ND_NFSV41) != 0 && error == 0) { + mp = vfs_busyfs(&fh.fh_fsid); + if (mp != NULL) { + /* Find out if the file still exists. */ + ret = VFS_FHTOVP(mp, &fh.fh_fid, LK_SHARED, &newvp); + if (ret == 0) + vput(newvp); + else + ret = ESTALE; + vfs_unbusy(mp); + } else { + ret = ESTALE; + } + if (ret == ESTALE) { + /* Get rid of any delegation. 
*/ + nfsrv_removedeleg(&fh, nd, p); + } + } + nfsvno_relpathbuf(ndp); NFSEXITCODE(error); return (error); @@ -1527,33 +1593,34 @@ out: */ int nfsvno_rename(struct nameidata *fromndp, struct nameidata *tondp, - u_int32_t ndstat, u_int32_t ndflag, struct ucred *cred, struct thread *p) + struct nfsrv_descript *nd, struct thread *p) { - struct vnode *fvp, *tvp, *tdvp, *dsdvp[NFSDEV_MAXMIRRORS]; - int error = 0, mirrorcnt; + struct vnode *fvp, *tvp, *tdvp, *dsdvp[NFSDEV_MAXMIRRORS], *newvp; + struct mount *mp; + int error = 0, mirrorcnt, ret; char fname[PNFS_FILENAME_LEN + 1]; - fhandle_t fh; + fhandle_t fh, fh2; dsdvp[0] = NULL; fvp = fromndp->ni_vp; - if (ndstat) { + if (nd->nd_repstat != 0) { vrele(fromndp->ni_dvp); vrele(fvp); - error = ndstat; + error = nd->nd_repstat; goto out1; } tdvp = tondp->ni_dvp; tvp = tondp->ni_vp; if (tvp != NULL) { if (fvp->v_type == VDIR && tvp->v_type != VDIR) { - error = (ndflag & ND_NFSV2) ? EISDIR : EEXIST; + error = (nd->nd_flag & ND_NFSV2) ? EISDIR : EEXIST; goto out; } else if (fvp->v_type != VDIR && tvp->v_type == VDIR) { - error = (ndflag & ND_NFSV2) ? ENOTDIR : EEXIST; + error = (nd->nd_flag & ND_NFSV2) ? ENOTDIR : EEXIST; goto out; } if (tvp->v_type == VDIR && tvp->v_mountedhere) { - error = (ndflag & ND_NFSV2) ? ENOTEMPTY : EXDEV; + error = (nd->nd_flag & ND_NFSV2) ? ENOTEMPTY : EXDEV; goto out; } @@ -1572,35 +1639,45 @@ nfsvno_rename(struct nameidata *fromndp, struct nameidata *tondp, } } if (fvp->v_type == VDIR && fvp->v_mountedhere) { - error = (ndflag & ND_NFSV2) ? ENOTEMPTY : EXDEV; + error = (nd->nd_flag & ND_NFSV2) ? ENOTEMPTY : EXDEV; goto out; } if (fvp->v_mount != tdvp->v_mount) { - error = (ndflag & ND_NFSV2) ? ENOTEMPTY : EXDEV; + error = (nd->nd_flag & ND_NFSV2) ? ENOTEMPTY : EXDEV; goto out; } if (fvp == tdvp) { - error = (ndflag & ND_NFSV2) ? ENOTEMPTY : EINVAL; + error = (nd->nd_flag & ND_NFSV2) ? 
ENOTEMPTY : EINVAL; goto out; } if (fvp == tvp) { /* - * If source and destination are the same, there is nothing to - * do. Set error to -1 to indicate this. + * If source and destination are the same, there is + * nothing to do. Set error to EJUSTRETURN to indicate + * this. */ - error = -1; + error = EJUSTRETURN; goto out; } - if (ndflag & ND_NFSV4) { + if (nd->nd_flag & ND_NFSV4) { if (NFSVOPLOCK(fvp, LK_EXCLUSIVE) == 0) { - error = nfsrv_checkremove(fvp, 0, NULL, - (nfsquad_t)((u_quad_t)0), p); + if (nfsrv_recalldeleg || (nd->nd_flag & ND_NFSV41) == 0) + error = nfsrv_checkremove(fvp, 0, NULL, + (nfsquad_t)((u_quad_t)0), p); + else + error = nfsrv_checkremove(fvp, 0, NULL, + nd->nd_clientid, p); NFSVOPUNLOCK(fvp); } else error = EPERM; - if (tvp && !error) - error = nfsrv_checkremove(tvp, 1, NULL, - (nfsquad_t)((u_quad_t)0), p); + if (tvp && !error) { + if (nfsrv_recalldeleg || (nd->nd_flag & ND_NFSV41) == 0) + error = nfsrv_checkremove(tvp, 1, NULL, + (nfsquad_t)((u_quad_t)0), p); + else + error = nfsrv_checkremove(tvp, 1, NULL, + nd->nd_clientid, p); + } } else { /* * For NFSv2 and NFSv3, try to get rid of the delegation, so @@ -1612,15 +1689,35 @@ nfsvno_rename(struct nameidata *fromndp, struct nameidata *tondp, nfsd_recalldelegation(fvp, p); } if (error == 0 && tvp != NULL) { - nfsrv_pnfsremovesetup(tvp, p, dsdvp, &mirrorcnt, fname, &fh); + if ((nd->nd_flag & ND_NFSV41) != 0) + error = nfsvno_getfh(tvp, &fh2, p); + if (error == 0) + nfsrv_pnfsremovesetup(tvp, p, dsdvp, &mirrorcnt, fname, + &fh); NFSD_DEBUG(4, "nfsvno_rename: pnfsremovesetup" " dsdvp=%p\n", dsdvp[0]); } out: - if (!error) { + mp = NULL; + if (error == 0) { + error = VOP_GETWRITEMOUNT(tondp->ni_dvp, &mp); + if (error == 0) { + if (mp == NULL) { + error = ENOENT; + } else { + error = lockmgr(&mp->mnt_renamelock, + LK_EXCLUSIVE | LK_NOWAIT, NULL); + if (error != 0) + error = ERELOOKUP; + } + } + } + if (error == 0) { error = VOP_RENAME(fromndp->ni_dvp, fromndp->ni_vp, &fromndp->ni_cnd, 
tondp->ni_dvp, tondp->ni_vp, &tondp->ni_cnd); + lockmgr(&mp->mnt_renamelock, LK_RELEASE, 0); + vfs_rel(mp); } else { if (tdvp == tvp) vrele(tdvp); @@ -1630,8 +1727,13 @@ out: vput(tvp); vrele(fromndp->ni_dvp); vrele(fvp); - if (error == -1) + if (error == EJUSTRETURN) { error = 0; + } else if (error == ERELOOKUP && mp != NULL) { + lockmgr(&mp->mnt_renamelock, LK_EXCLUSIVE, 0); + lockmgr(&mp->mnt_renamelock, LK_RELEASE, 0); + vfs_rel(mp); + } } /* @@ -1644,6 +1746,26 @@ out: NFSD_DEBUG(4, "nfsvno_rename: pnfsremove\n"); } + /* Use ret to determine if the file still exists. */ + if ((nd->nd_flag & ND_NFSV41) != 0 && error == 0) { + mp = vfs_busyfs(&fh2.fh_fsid); + if (mp != NULL) { + /* Find out if the file still exists. */ + ret = VFS_FHTOVP(mp, &fh2.fh_fid, LK_SHARED, &newvp); + if (ret == 0) + vput(newvp); + else + ret = ESTALE; + vfs_unbusy(mp); + } else { + ret = ESTALE; + } + if (ret == ESTALE) { + /* Get rid of any delegation. */ + nfsrv_removedeleg(&fh2, nd, p); + } + } + nfsvno_relpathbuf(tondp); out1: nfsvno_relpathbuf(fromndp); @@ -1990,7 +2112,8 @@ int nfsvno_fillattr(struct nfsrv_descript *nd, struct mount *mp, struct vnode *vp, struct nfsvattr *nvap, fhandle_t *fhp, int rderror, nfsattrbit_t *attrbitp, struct ucred *cred, struct thread *p, int isdgram, int reterr, - int supports_nfsv4acls, int at_root, uint64_t mounted_on_fileno) + int supports_nfsv4acls, int at_root, uint64_t mounted_on_fileno, + bool xattrsupp, bool has_hiddensystem, bool has_namedattr) { struct statfs *sf; int error; @@ -2009,12 +2132,29 @@ nfsvno_fillattr(struct nfsrv_descript *nd, struct mount *mp, struct vnode *vp, } error = nfsv4_fillattr(nd, mp, vp, NULL, &nvap->na_vattr, fhp, rderror, attrbitp, cred, p, isdgram, reterr, supports_nfsv4acls, at_root, - mounted_on_fileno, sf); + mounted_on_fileno, sf, xattrsupp, has_hiddensystem, has_namedattr); free(sf, M_TEMP); NFSEXITCODE2(0, nd); return (error); } +/* + * Convert a dirent d_type to a vnode type. 
+ */ +static void nfs_dtypetovtype(struct nfsvattr *nvap, struct vnode *vp, + uint8_t dtype) +{ + + if ((vn_irflag_read(vp) & VIRF_NAMEDDIR) != 0) { + nvap->na_type = VREG; + nvap->na_bsdflags |= SFBSD_NAMEDATTR; + } else if (dtype <= DT_WHT) { + nvap->na_type = dtype_to_vnode[dtype]; + } else { + nvap->na_type = VNON; + } +} + /* Since the Readdir vnode ops vary, put the entire functions in here. */ /* * nfs readdir service @@ -2309,7 +2449,7 @@ nfsrvd_readdirplus(struct nfsrv_descript *nd, int isdgram, struct nfsvattr nva, at, *nvap = &nva; struct mbuf *mb0, *mb1; struct nfsreferral *refp; - int nlen, r, error = 0, getret = 1, usevget = 1; + int nlen, r, error = 0, getret = 1, ret, usevget = 1; int siz, cnt, fullsiz, eofflag, ncookies, entrycnt; caddr_t bpos0, bpos1; u_int64_t off, toff, verf __unused; @@ -2323,6 +2463,9 @@ nfsrvd_readdirplus(struct nfsrv_descript *nd, int isdgram, uint64_t mounted_on_fileno; struct thread *p = curthread; int bextpg0, bextpg1, bextpgsiz0, bextpgsiz1; + size_t atsiz; + long pathval; + bool has_hiddensystem, has_namedattr, xattrsupp; if (nd->nd_repstat) { nfsrv_postopattr(nd, getret, &at); @@ -2634,6 +2777,10 @@ again: LK_SHARED, &nvp); else r = EOPNOTSUPP; + if (r == 0 && (vn_irflag_read(vp) & + VIRF_NAMEDDIR) != 0) + vn_irflag_set_cond(nvp, + VIRF_NAMEDATTR); if (r == EOPNOTSUPP) { if (usevget) { usevget = 0; @@ -2648,6 +2795,10 @@ again: cn.cn_namelen = nlen; cn.cn_flags = ISLASTCN | NOFOLLOW | LOCKLEAF; + if ((vn_irflag_read(vp) & + VIRF_NAMEDDIR) != 0) + cn.cn_flags |= + OPENNAMED; if (nlen == 2 && dp->d_name[0] == '.' && dp->d_name[1] == '.') @@ -2765,7 +2916,7 @@ again: /* Only need Type and/or Fileid. 
*/ VATTR_NULL(&nvap->na_vattr); nvap->na_fileid = dp->d_fileno; - nvap->na_type = NFS_DTYPETOVTYPE(dp->d_type); + nfs_dtypetovtype(nvap, vp, dp->d_type); } /* @@ -2789,9 +2940,32 @@ again: *tl++ = newnfs_true; txdr_hyper(*cookiep, tl); dirlen += nfsm_strtom(nd, dp->d_name, nlen); + xattrsupp = false; + has_hiddensystem = false; + has_namedattr = false; if (nvp != NULL) { supports_nfsv4acls = nfs_supportsnfsv4acls(nvp); + if (NFSISSET_ATTRBIT(&attrbits, + NFSATTRBIT_XATTRSUPPORT)) { + ret = VOP_GETEXTATTR(nvp, + EXTATTR_NAMESPACE_USER, + "xxx", NULL, &atsiz, + nd->nd_cred, p); + xattrsupp = ret != EOPNOTSUPP; + } + if (VOP_PATHCONF(nvp, + _PC_HAS_HIDDENSYSTEM, &pathval) != + 0) + pathval = 0; + has_hiddensystem = pathval > 0; + pathval = 0; + if (NFSISSET_ATTRBIT(&attrbits, + NFSATTRBIT_NAMEDATTR) && + VOP_PATHCONF(nvp, _PC_HAS_NAMEDATTR, + &pathval) != 0) + pathval = 0; + has_namedattr = pathval > 0; NFSVOPUNLOCK(nvp); } else supports_nfsv4acls = 0; @@ -2811,13 +2985,15 @@ again: nvp, nvap, &nfh, r, &rderrbits, nd->nd_cred, p, isdgram, 0, supports_nfsv4acls, at_root, - mounted_on_fileno); + mounted_on_fileno, xattrsupp, + has_hiddensystem, has_namedattr); } else { dirlen += nfsvno_fillattr(nd, new_mp, nvp, nvap, &nfh, r, &attrbits, nd->nd_cred, p, isdgram, 0, supports_nfsv4acls, at_root, - mounted_on_fileno); + mounted_on_fileno, xattrsupp, + has_hiddensystem, has_namedattr); } if (nvp != NULL) vrele(nvp); @@ -2995,12 +3171,17 @@ nfsv4_sattr(struct nfsrv_descript *nd, vnode_t vp, struct nfsvattr *nvap, /* * Loop around getting the setable attributes. If an unsupported * one is found, set nd_repstat == NFSERR_ATTRNOTSUPP and return. + * Once nd_repstat != 0, do not set the attribute value, but keep + * parsing the attribute(s). 
*/ if (retnotsup) { nd->nd_repstat = NFSERR_ATTRNOTSUPP; bitpos = NFSATTRBIT_MAX; } else { bitpos = 0; + if (NFSISSET_ATTRBIT(attrbitp, NFSATTRBIT_HIDDEN) || + NFSISSET_ATTRBIT(attrbitp, NFSATTRBIT_SYSTEM)) + nvap->na_flags = 0; } moderet = 0; for (; bitpos < NFSATTRBIT_MAX; bitpos++) { @@ -3012,12 +3193,13 @@ nfsv4_sattr(struct nfsrv_descript *nd, vnode_t vp, struct nfsvattr *nvap, switch (bitpos) { case NFSATTRBIT_SIZE: NFSM_DISSECT(tl, u_int32_t *, NFSX_HYPER); - if (vp != NULL && vp->v_type != VREG) { - error = (vp->v_type == VDIR) ? NFSERR_ISDIR : - NFSERR_INVAL; - goto nfsmout; + if (!nd->nd_repstat) { + if (vp != NULL && vp->v_type != VREG) + nd->nd_repstat = (vp->v_type == VDIR) ? + NFSERR_ISDIR : NFSERR_INVAL; + else + nvap->na_size = fxdr_hyper(tl); } - nvap->na_size = fxdr_hyper(tl); attrsum += NFSX_HYPER; break; case NFSATTRBIT_ACL: @@ -3036,9 +3218,11 @@ nfsv4_sattr(struct nfsrv_descript *nd, vnode_t vp, struct nfsvattr *nvap, attrsum += NFSX_UNSIGNED; break; case NFSATTRBIT_HIDDEN: - NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); - if (!nd->nd_repstat) - nd->nd_repstat = NFSERR_ATTRNOTSUPP; + NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED); + if (nd->nd_repstat == 0) { + if (*tl == newnfs_true) + nvap->na_flags |= UF_HIDDEN; + } attrsum += NFSX_UNSIGNED; break; case NFSATTRBIT_MIMETYPE: @@ -3054,7 +3238,8 @@ nfsv4_sattr(struct nfsrv_descript *nd, vnode_t vp, struct nfsvattr *nvap, case NFSATTRBIT_MODE: moderet = NFSERR_INVAL; /* Can't do MODESETMASKED. 
*/ NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); - nvap->na_mode = nfstov_mode(*tl); + if (!nd->nd_repstat) + nvap->na_mode = nfstov_mode(*tl); attrsum += NFSX_UNSIGNED; break; case NFSATTRBIT_OWNER: @@ -3112,9 +3297,11 @@ nfsv4_sattr(struct nfsrv_descript *nd, vnode_t vp, struct nfsvattr *nvap, attrsum += (NFSX_UNSIGNED + NFSM_RNDUP(j)); break; case NFSATTRBIT_SYSTEM: - NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); - if (!nd->nd_repstat) - nd->nd_repstat = NFSERR_ATTRNOTSUPP; + NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED); + if (nd->nd_repstat == 0) { + if (*tl == newnfs_true) + nvap->na_flags |= UF_SYSTEM; + } attrsum += NFSX_UNSIGNED; break; case NFSATTRBIT_TIMEACCESSSET: @@ -3122,10 +3309,11 @@ nfsv4_sattr(struct nfsrv_descript *nd, vnode_t vp, struct nfsvattr *nvap, attrsum += NFSX_UNSIGNED; if (fxdr_unsigned(int, *tl)==NFSV4SATTRTIME_TOCLIENT) { NFSM_DISSECT(tl, u_int32_t *, NFSX_V4TIME); - fxdr_nfsv4time(tl, &nvap->na_atime); + if (!nd->nd_repstat) + fxdr_nfsv4time(tl, &nvap->na_atime); toclient = 1; attrsum += NFSX_V4TIME; - } else { + } else if (!nd->nd_repstat) { vfs_timestamp(&nvap->na_atime); nvap->na_vaflags |= VA_UTIMES_NULL; } @@ -3138,7 +3326,8 @@ nfsv4_sattr(struct nfsrv_descript *nd, vnode_t vp, struct nfsvattr *nvap, break; case NFSATTRBIT_TIMECREATE: NFSM_DISSECT(tl, u_int32_t *, NFSX_V4TIME); - fxdr_nfsv4time(tl, &nvap->na_btime); + if (!nd->nd_repstat) + fxdr_nfsv4time(tl, &nvap->na_btime); attrsum += NFSX_V4TIME; break; case NFSATTRBIT_TIMEMODIFYSET: @@ -3146,10 +3335,11 @@ nfsv4_sattr(struct nfsrv_descript *nd, vnode_t vp, struct nfsvattr *nvap, attrsum += NFSX_UNSIGNED; if (fxdr_unsigned(int, *tl)==NFSV4SATTRTIME_TOCLIENT) { NFSM_DISSECT(tl, u_int32_t *, NFSX_V4TIME); - fxdr_nfsv4time(tl, &nvap->na_mtime); + if (!nd->nd_repstat) + fxdr_nfsv4time(tl, &nvap->na_mtime); nvap->na_vaflags &= ~VA_UTIMES_NULL; attrsum += NFSX_V4TIME; - } else { + } else if (!nd->nd_repstat) { vfs_timestamp(&nvap->na_mtime); if (!toclient) nvap->na_vaflags |= 
VA_UTIMES_NULL; @@ -3167,18 +3357,40 @@ nfsv4_sattr(struct nfsrv_descript *nd, vnode_t vp, struct nfsvattr *nvap, * specified and this attribute cannot be done in the * same Setattr operation. */ - if ((nd->nd_flag & ND_NFSV41) == 0) - nd->nd_repstat = NFSERR_ATTRNOTSUPP; - else if ((mode & ~07777) != 0 || (mask & ~07777) != 0 || - vp == NULL) - nd->nd_repstat = NFSERR_INVAL; - else if (moderet == 0) - moderet = VOP_GETATTR(vp, &va, nd->nd_cred); - if (moderet == 0) - nvap->na_mode = (mode & mask) | - (va.va_mode & ~mask); - else - nd->nd_repstat = moderet; + if (!nd->nd_repstat) { + if ((nd->nd_flag & ND_NFSV41) == 0) + nd->nd_repstat = NFSERR_ATTRNOTSUPP; + else if ((mode & ~07777) != 0 || + (mask & ~07777) != 0 || vp == NULL) + nd->nd_repstat = NFSERR_INVAL; + else if (moderet == 0) + moderet = VOP_GETATTR(vp, &va, + nd->nd_cred); + if (moderet == 0) + nvap->na_mode = (mode & mask) | + (va.va_mode & ~mask); + else + nd->nd_repstat = moderet; + } + attrsum += 2 * NFSX_UNSIGNED; + break; + case NFSATTRBIT_MODEUMASK: + NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED); + mode = fxdr_unsigned(u_short, *tl++); + mask = fxdr_unsigned(u_short, *tl); + /* + * If moderet != 0, mode has already been done. + * If vp != NULL, this is not a file object creation. + */ + if (!nd->nd_repstat) { + if ((nd->nd_flag & ND_NFSV42) == 0) + nd->nd_repstat = NFSERR_ATTRNOTSUPP; + else if ((mask & ~0777) != 0 || vp != NULL || + moderet != 0) + nd->nd_repstat = NFSERR_INVAL; + else + nvap->na_mode = (mode & ~mask); + } attrsum += 2 * NFSX_UNSIGNED; break; default: @@ -3193,7 +3405,7 @@ nfsv4_sattr(struct nfsrv_descript *nd, vnode_t vp, struct nfsvattr *nvap, /* * some clients pad the attrlist, so we need to skip over the - * padding. + * padding. This also skips over unparsed non-supported attributes. 
*/ if (attrsum > attrsize) { error = NFSERR_BADXDR; @@ -3251,7 +3463,11 @@ nfsd_excred(struct nfsrv_descript *nd, struct nfsexstuff *exp, NFSVNO_EXPORTANON(exp) || (nd->nd_flag & ND_AUTHNONE) != 0) { nd->nd_cred->cr_uid = credanon->cr_uid; - nd->nd_cred->cr_gid = credanon->cr_gid; + /* + * 'credanon' is already a 'struct ucred' that was built + * internally with calls to crsetgroups_fallback(), so + * we don't need a fallback here. + */ crsetgroups(nd->nd_cred, credanon->cr_ngroups, credanon->cr_groups); } else if ((nd->nd_flag & ND_GSS) == 0) { @@ -3398,6 +3614,15 @@ nfsd_fhtovp(struct nfsrv_descript *nd, struct nfsrvfh *nfp, int lktype, &credanon); vfs_unbusy(mp); + if (nd->nd_repstat == 0 && + nfp->nfsrvfh_len >= NFSX_MYFH + NFSX_V4NAMEDDIRFH && + nfp->nfsrvfh_len <= NFSX_MYFH + NFSX_V4NAMEDATTRFH) { + if (nfp->nfsrvfh_len == NFSX_MYFH + NFSX_V4NAMEDDIRFH) + vn_irflag_set_cond(*vpp, VIRF_NAMEDDIR); + else + vn_irflag_set_cond(*vpp, VIRF_NAMEDATTR); + } + /* * For NFSv4 without a pseudo root fs, unexported file handles * can be returned, so that Lookup works everywhere. 
@@ -5464,7 +5689,7 @@ nfsrv_writedsdorpc(struct nfsmount *nmp, fhandle_t *fhp, off_t off, int len, if ((nd->nd_flag & (ND_NOMOREDATA | ND_NFSV4 | ND_V4WCCATTR)) == (ND_NFSV4 | ND_V4WCCATTR)) { error = nfsv4_loadattr(nd, NULL, nap, NULL, NULL, 0, NULL, NULL, - NULL, NULL, NULL, 0, NULL, NULL, NULL, NULL, NULL); + NULL, NULL, NULL, 0, NULL, NULL, NULL, NULL, NULL, NULL); NFSD_DEBUG(4, "nfsrv_writedsdorpc: wcc attr=%d\n", error); if (error != 0) goto nfsmout; @@ -5495,7 +5720,7 @@ nfsrv_writedsdorpc(struct nfsmount *nmp, fhandle_t *fhp, off_t off, int len, if (error == 0) { NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED); error = nfsv4_loadattr(nd, NULL, nap, NULL, NULL, 0, NULL, NULL, - NULL, NULL, NULL, 0, NULL, NULL, NULL, NULL, NULL); + NULL, NULL, NULL, 0, NULL, NULL, NULL, NULL, NULL, NULL); } NFSD_DEBUG(4, "nfsrv_writedsdorpc: aft loadattr=%d\n", error); nfsmout: @@ -5661,7 +5886,7 @@ nfsrv_allocatedsdorpc(struct nfsmount *nmp, fhandle_t *fhp, off_t off, if (nd->nd_repstat == 0) { NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED); error = nfsv4_loadattr(nd, NULL, nap, NULL, NULL, 0, NULL, NULL, - NULL, NULL, NULL, 0, NULL, NULL, NULL, NULL, NULL); + NULL, NULL, NULL, 0, NULL, NULL, NULL, NULL, NULL, NULL); } else error = nd->nd_repstat; NFSD_DEBUG(4, "nfsrv_allocatedsdorpc: aft loadattr=%d\n", error); @@ -5828,7 +6053,7 @@ nfsrv_deallocatedsdorpc(struct nfsmount *nmp, fhandle_t *fhp, off_t off, if ((nd->nd_flag & (ND_NOMOREDATA | ND_NFSV4 | ND_V4WCCATTR)) == (ND_NFSV4 | ND_V4WCCATTR)) { error = nfsv4_loadattr(nd, NULL, nap, NULL, NULL, 0, NULL, NULL, - NULL, NULL, NULL, 0, NULL, NULL, NULL, NULL, NULL); + NULL, NULL, NULL, 0, NULL, NULL, NULL, NULL, NULL, NULL); NFSD_DEBUG(4, "nfsrv_deallocatedsdorpc: wcc attr=%d\n", error); if (error != 0) goto nfsmout; @@ -5842,7 +6067,7 @@ nfsrv_deallocatedsdorpc(struct nfsmount *nmp, fhandle_t *fhp, off_t off, if (nd->nd_repstat == 0) { NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED); error = nfsv4_loadattr(nd, NULL, nap, 
NULL, NULL, 0, NULL, NULL, - NULL, NULL, NULL, 0, NULL, NULL, NULL, NULL, NULL); + NULL, NULL, NULL, 0, NULL, NULL, NULL, NULL, NULL, NULL); } else error = nd->nd_repstat; NFSD_DEBUG(4, "nfsrv_deallocatedsdorpc: aft loadattr=%d\n", error); @@ -5990,7 +6215,7 @@ nfsrv_setattrdsdorpc(fhandle_t *fhp, struct ucred *cred, NFSPROC_T *p, if ((nd->nd_flag & (ND_NOMOREDATA | ND_NFSV4 | ND_V4WCCATTR)) == (ND_NFSV4 | ND_V4WCCATTR)) { error = nfsv4_loadattr(nd, NULL, dsnap, NULL, NULL, 0, NULL, - NULL, NULL, NULL, NULL, 0, NULL, NULL, NULL, NULL, NULL); + NULL, NULL, NULL, NULL, 0, NULL, NULL, NULL, NULL, NULL, NULL); NFSD_DEBUG(4, "nfsrv_setattrdsdorpc: wcc attr=%d\n", error); if (error != 0) goto nfsmout; @@ -6014,7 +6239,8 @@ nfsrv_setattrdsdorpc(fhandle_t *fhp, struct ucred *cred, NFSPROC_T *p, if (error == 0) { NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED); error = nfsv4_loadattr(nd, NULL, dsnap, NULL, NULL, 0, NULL, - NULL, NULL, NULL, NULL, 0, NULL, NULL, NULL, NULL, NULL); + NULL, NULL, NULL, NULL, 0, NULL, NULL, NULL, NULL, NULL, + NULL); } NFSD_DEBUG(4, "nfsrv_setattrdsdorpc: aft setattr loadattr=%d\n", error); nfsmout: @@ -6159,7 +6385,7 @@ nfsrv_setacldsdorpc(fhandle_t *fhp, struct ucred *cred, NFSPROC_T *p, * the same type (VREG). */ nfsv4_fillattr(nd, NULL, vp, aclp, NULL, NULL, 0, &attrbits, NULL, - NULL, 0, 0, 0, 0, 0, NULL); + NULL, 0, 0, 0, 0, 0, NULL, false, false, false); error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p, cred, NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL); if (error != 0) { @@ -6303,7 +6529,7 @@ nfsrv_getattrdsrpc(fhandle_t *fhp, struct ucred *cred, NFSPROC_T *p, if (nd->nd_repstat == 0) { error = nfsv4_loadattr(nd, NULL, nap, NULL, NULL, 0, NULL, NULL, NULL, NULL, NULL, 0, NULL, NULL, NULL, - NULL, NULL); + NULL, NULL, NULL); /* * We can only save the updated values in the extended * attribute if the vp is exclusively locked. 
diff --git a/sys/fs/nfsserver/nfs_nfsdserv.c b/sys/fs/nfsserver/nfs_nfsdserv.c index 0c8bda6dc6a6..9eebcda548c6 100644 --- a/sys/fs/nfsserver/nfs_nfsdserv.c +++ b/sys/fs/nfsserver/nfs_nfsdserv.c @@ -64,6 +64,7 @@ extern u_long sb_max_adj; extern int nfsrv_pnfsatime; extern int nfsrv_maxpnfsmirror; extern uint32_t nfs_srvmaxio; +extern int nfsrv_issuedelegs; static int nfs_async = 0; SYSCTL_DECL(_vfs_nfsd); @@ -240,7 +241,7 @@ nfsrvd_getattr(struct nfsrv_descript *nd, int isdgram, { struct nfsvattr nva; fhandle_t fh; - int at_root = 0, error = 0, supports_nfsv4acls; + int at_root = 0, error = 0, ret, supports_nfsv4acls; struct nfsreferral *refp; nfsattrbit_t attrbits, tmpbits; struct mount *mp; @@ -249,6 +250,9 @@ nfsrvd_getattr(struct nfsrv_descript *nd, int isdgram, uint64_t mounted_on_fileno = 0; accmode_t accmode; struct thread *p = curthread; + size_t atsiz; + long pathval; + bool has_hiddensystem, has_namedattr, xattrsupp; if (nd->nd_repstat) goto out; @@ -306,6 +310,26 @@ nfsrvd_getattr(struct nfsrv_descript *nd, int isdgram, &nva, &attrbits, p); if (nd->nd_repstat == 0) { supports_nfsv4acls = nfs_supportsnfsv4acls(vp); + xattrsupp = false; + if (NFSISSET_ATTRBIT(&attrbits, + NFSATTRBIT_XATTRSUPPORT)) { + ret = VOP_GETEXTATTR(vp, + EXTATTR_NAMESPACE_USER, + "xxx", NULL, &atsiz, nd->nd_cred, + p); + xattrsupp = ret != EOPNOTSUPP; + } + if (VOP_PATHCONF(vp, _PC_HAS_HIDDENSYSTEM, + &pathval) != 0) + pathval = 0; + has_hiddensystem = pathval > 0; + pathval = 0; + if (NFSISSET_ATTRBIT(&attrbits, + NFSATTRBIT_NAMEDATTR) && + VOP_PATHCONF(vp, _PC_HAS_NAMEDATTR, + &pathval) != 0) + pathval = 0; + has_namedattr = pathval > 0; mp = vp->v_mount; if (nfsrv_enable_crossmntpt != 0 && vp->v_type == VDIR && @@ -339,7 +363,9 @@ nfsrvd_getattr(struct nfsrv_descript *nd, int isdgram, (void)nfsvno_fillattr(nd, mp, vp, &nva, &fh, 0, &attrbits, nd->nd_cred, p, isdgram, 1, supports_nfsv4acls, - at_root, mounted_on_fileno); + at_root, mounted_on_fileno, + xattrsupp, 
has_hiddensystem, + has_namedattr); vfs_unbusy(mp); } vrele(vp); @@ -375,6 +401,7 @@ nfsrvd_setattr(struct nfsrv_descript *nd, __unused int isdgram, NFSACL_T *aclp = NULL; struct thread *p = curthread; + NFSZERO_ATTRBIT(&retbits); if (nd->nd_repstat) { nfsrv_wcc(nd, preat_ret, &nva2, postat_ret, &nva); goto out; @@ -401,9 +428,10 @@ nfsrvd_setattr(struct nfsrv_descript *nd, __unused int isdgram, if (error) goto nfsmout; - /* For NFSv4, only va_uid is used from nva2. */ - NFSZERO_ATTRBIT(&retbits); + /* For NFSv4, only va_uid and va_flags is used from nva2. */ NFSSETBIT_ATTRBIT(&retbits, NFSATTRBIT_OWNER); + NFSSETBIT_ATTRBIT(&retbits, NFSATTRBIT_HIDDEN); + NFSSETBIT_ATTRBIT(&retbits, NFSATTRBIT_SYSTEM); preat_ret = nfsvno_getattr(vp, &nva2, nd, p, 1, &retbits); if (!nd->nd_repstat) nd->nd_repstat = preat_ret; @@ -462,6 +490,9 @@ nfsrvd_setattr(struct nfsrv_descript *nd, __unused int isdgram, &nva, &attrbits, exp, p); if (!nd->nd_repstat && (nd->nd_flag & ND_NFSV4)) { + u_long oldflags; + + oldflags = nva2.na_flags; /* * For V4, try setting the attributes in sets, so that the * reply bitmap will be correct for an error case. 
@@ -531,6 +562,32 @@ nfsrvd_setattr(struct nfsrv_descript *nd, __unused int isdgram, NFSSETBIT_ATTRBIT(&retbits, NFSATTRBIT_MODESETMASKED); } } + if (!nd->nd_repstat && + (NFSISSET_ATTRBIT(&attrbits, NFSATTRBIT_HIDDEN) || + NFSISSET_ATTRBIT(&attrbits, NFSATTRBIT_SYSTEM))) { + if (NFSISSET_ATTRBIT(&attrbits, NFSATTRBIT_HIDDEN)) { + if ((nva.na_flags & UF_HIDDEN) != 0) + oldflags |= UF_HIDDEN; + else + oldflags &= ~UF_HIDDEN; + } + if (NFSISSET_ATTRBIT(&attrbits, NFSATTRBIT_SYSTEM)) { + if ((nva.na_flags & UF_SYSTEM) != 0) + oldflags |= UF_SYSTEM; + else + oldflags &= ~UF_SYSTEM; + } + NFSVNO_ATTRINIT(&nva2); + NFSVNO_SETATTRVAL(&nva2, flags, oldflags); + nd->nd_repstat = nfsvno_setattr(vp, &nva2, nd->nd_cred, p, + exp); + if (!nd->nd_repstat) { + if (NFSISSET_ATTRBIT(&attrbits, NFSATTRBIT_HIDDEN)) + NFSSETBIT_ATTRBIT(&retbits, NFSATTRBIT_HIDDEN); + if (NFSISSET_ATTRBIT(&attrbits, NFSATTRBIT_SYSTEM)) + NFSSETBIT_ATTRBIT(&retbits, NFSATTRBIT_SYSTEM); + } + } #ifdef NFS4_ACL_EXTATTR_NAME if (!nd->nd_repstat && aclp->acl_cnt > 0 && @@ -595,6 +652,8 @@ nfsrvd_lookup(struct nfsrv_descript *nd, __unused int isdgram, char *bufp; u_long *hashp; struct thread *p = curthread; + struct componentname *cnp; + short irflag; if (nd->nd_repstat) { nfsrv_postopattr(nd, dattr_ret, &dattr); @@ -611,8 +670,12 @@ nfsrvd_lookup(struct nfsrv_descript *nd, __unused int isdgram, goto out; } - NFSNAMEICNDSET(&named.ni_cnd, nd->nd_cred, LOOKUP, - LOCKLEAF); + cnp = &named.ni_cnd; + irflag = vn_irflag_read(dp); + if ((irflag & VIRF_NAMEDDIR) != 0) + NFSNAMEICNDSET(cnp, nd->nd_cred, LOOKUP, LOCKLEAF | OPENNAMED); + else + NFSNAMEICNDSET(cnp, nd->nd_cred, LOOKUP, LOCKLEAF); nfsvno_setpathbuf(&named, &bufp, &hashp); error = nfsrv_parsename(nd, bufp, hashp, &named.ni_pathlen); if (error) { @@ -621,6 +684,10 @@ nfsrvd_lookup(struct nfsrv_descript *nd, __unused int isdgram, goto out; } if (!nd->nd_repstat) { + /* Don't set OPENNAMED for Lookupp (".."). 
*/ + if (cnp->cn_namelen == 2 && *cnp->cn_pnbuf == '.' && + *(cnp->cn_pnbuf + 1) == '.') + cnp->cn_flags &= ~OPENNAMED; nd->nd_repstat = nfsvno_namei(nd, &named, dp, 0, exp, &dirp); } else { vrele(dp); @@ -1348,6 +1415,18 @@ nfsrvd_mknod(struct nfsrv_descript *nd, __unused int isdgram, NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); vtyp = nfsv34tov_type(*tl); nfs4type = fxdr_unsigned(nfstype, *tl); + if ((vn_irflag_read(dp) & VIRF_NAMEDDIR) != 0) { + /* + * Don't allow creation of non-regular file objects + * in a named attribute directory. + */ + nd->nd_repstat = NFSERR_INVAL; + vrele(dp); +#ifdef NFS4_ACL_EXTATTR_NAME + acl_free(aclp); +#endif + goto out; + } switch (nfs4type) { case NFLNK: error = nfsvno_getsymlink(nd, &nva, p, &pathcp, @@ -1577,14 +1656,14 @@ nfsrvd_remove(struct nfsrv_descript *nd, __unused int isdgram, nd->nd_repstat = nfsvno_rmdirsub(&named, 1, nd->nd_cred, p, exp); else - nd->nd_repstat = nfsvno_removesub(&named, 1, - nd->nd_cred, p, exp); + nd->nd_repstat = nfsvno_removesub(&named, true, + nd, p, exp); } else if (nd->nd_procnum == NFSPROC_RMDIR) { nd->nd_repstat = nfsvno_rmdirsub(&named, 0, nd->nd_cred, p, exp); } else { - nd->nd_repstat = nfsvno_removesub(&named, 0, - nd->nd_cred, p, exp); + nd->nd_repstat = nfsvno_removesub(&named, false, nd, p, + exp); } } if (!(nd->nd_flag & ND_NFSV2)) { @@ -1680,8 +1759,7 @@ nfsrvd_rename(struct nfsrv_descript *nd, int isdgram, } /* If this is the same file handle, just VREF() the vnode. 
*/ - if (tfh.nfsrvfh_len == NFSX_MYFH && - !NFSBCMP(tfh.nfsrvfh_data, &fh, NFSX_MYFH)) { + if (!NFSBCMP(tfh.nfsrvfh_data, &fh, NFSX_MYFH)) { VREF(dp); tdp = dp; tnes = *exp; @@ -1749,8 +1827,7 @@ nfsrvd_rename(struct nfsrv_descript *nd, int isdgram, if (fromnd.ni_vp->v_type == VDIR) tond.ni_cnd.cn_flags |= WILLBEDIR; nd->nd_repstat = nfsvno_namei(nd, &tond, tdp, 0, &tnes, &tdirp); - nd->nd_repstat = nfsvno_rename(&fromnd, &tond, nd->nd_repstat, - nd->nd_flag, nd->nd_cred, p); + nd->nd_repstat = nfsvno_rename(&fromnd, &tond, nd, p); if (fdirp) fdiraft_ret = nfsvno_getattr(fdirp, &fdiraft, nd, p, 0, NULL); if (tdirp) @@ -1804,8 +1881,15 @@ nfsrvd_link(struct nfsrv_descript *nd, int isdgram, nfsrv_wcc(nd, dirfor_ret, &dirfor, diraft_ret, &diraft); goto out; } + if ((vn_irflag_read(vp) & (VIRF_NAMEDDIR | VIRF_NAMEDATTR)) != 0 || + (tovp != NULL && + (vn_irflag_read(tovp) & (VIRF_NAMEDDIR | VIRF_NAMEDATTR)) != 0)) { + nd->nd_repstat = NFSERR_INVAL; + if (tovp != NULL) + vrele(tovp); + } NFSVOPUNLOCK(vp); - if (vp->v_type == VDIR) { + if (!nd->nd_repstat && vp->v_type == VDIR) { if (nd->nd_flag & ND_NFSV4) nd->nd_repstat = NFSERR_ISDIR; else @@ -2829,7 +2913,7 @@ nfsrvd_open(struct nfsrv_descript *nd, __unused int isdgram, int how = NFSCREATE_UNCHECKED; int32_t cverf[2], tverf[2] = { 0, 0 }; vnode_t vp = NULL, dirp = NULL; - struct nfsvattr nva, dirfor, diraft; + struct nfsvattr nva, dirfor, diraft, nva2; struct nameidata named; nfsv4stateid_t stateid, delegstateid; nfsattrbit_t attrbits; @@ -2839,6 +2923,8 @@ nfsrvd_open(struct nfsrv_descript *nd, __unused int isdgram, NFSACL_T *aclp = NULL; struct thread *p = curthread; bool done_namei; + __enum_uint8_decl(wdelegace) { USENONE, USEMODE, USENFSV4ACL } + delegace; #ifdef NFS4_ACL_EXTATTR_NAME aclp = acl_alloc(M_WAITOK); @@ -2846,6 +2932,7 @@ nfsrvd_open(struct nfsrv_descript *nd, __unused int isdgram, #endif NFSZERO_ATTRBIT(&attrbits); done_namei = false; + delegace = USEMODE; named.ni_cnd.cn_nameiop = 0; 
NFSM_DISSECT(tl, u_int32_t *, 6 * NFSX_UNSIGNED); i = fxdr_unsigned(int, *(tl + 5)); @@ -2971,6 +3058,8 @@ nfsrvd_open(struct nfsrv_descript *nd, __unused int isdgram, NFSM_DISSECT(tl, u_int32_t *, NFSX_VERF); cverf[0] = *tl++; cverf[1] = *tl; + if ((vn_irflag_read(dp) & VIRF_NAMEDDIR) != 0) + nd->nd_repstat = NFSERR_INVAL; break; case NFSCREATE_EXCLUSIVE41: NFSM_DISSECT(tl, u_int32_t *, NFSX_VERF); @@ -2979,7 +3068,8 @@ nfsrvd_open(struct nfsrv_descript *nd, __unused int isdgram, error = nfsv4_sattr(nd, NULL, &nva, &attrbits, aclp, p); if (error != 0) goto nfsmout; - if (NFSISSET_ATTRBIT(&attrbits, + if ((vn_irflag_read(dp) & VIRF_NAMEDDIR) != 0 || + NFSISSET_ATTRBIT(&attrbits, NFSATTRBIT_TIMEACCESSSET)) nd->nd_repstat = NFSERR_INVAL; /* @@ -3076,11 +3166,23 @@ nfsrvd_open(struct nfsrv_descript *nd, __unused int isdgram, } break; case NFSCREATE_EXCLUSIVE: - exclusive_flag = 1; if (nd->nd_repstat == 0 && named.ni_vp == NULL) nva.na_mode = 0; - break; + /* FALLTHROUGH */ case NFSCREATE_EXCLUSIVE41: + if (nd->nd_repstat == 0 && named.ni_vp != NULL) { + nd->nd_repstat = nfsvno_getattr(named.ni_vp, + &nva2, nd, p, 1, NULL); + if (nd->nd_repstat == 0) { + tverf[0] = nva2.na_atime.tv_sec; + tverf[1] = nva2.na_atime.tv_nsec; + if (cverf[0] != tverf[0] || + cverf[1] != tverf[1]) + nd->nd_repstat = EEXIST; + } + if (nd->nd_repstat != 0) + done_namei = true; + } exclusive_flag = 1; break; } @@ -3170,16 +3272,27 @@ nfsrvd_open(struct nfsrv_descript *nd, __unused int isdgram, NFSACCCHK_VPISLOCKED, NULL); } - if (!nd->nd_repstat) { + if (!nd->nd_repstat) nd->nd_repstat = nfsvno_getattr(vp, &nva, nd, p, 1, NULL); - if (!nd->nd_repstat) { - tverf[0] = nva.na_atime.tv_sec; - tverf[1] = nva.na_atime.tv_nsec; + + if (nd->nd_repstat == 0 && aclp != NULL && nfsrv_issuedelegs != 0 && + (dp->v_mount->mnt_flag & MNT_NFS4ACLS) != 0) { + if (aclp->acl_cnt == 0 && create == NFSV4OPEN_NOCREATE) { + int retacl; + + /* We do not yet have an ACL, so try and get one. 
*/ + retacl = VOP_GETACL(vp, ACL_TYPE_NFS4, aclp, + nd->nd_cred, p); + if (retacl != 0 && retacl != ENOATTR && + retacl != EOPNOTSUPP && retacl != EINVAL) + delegace = USENONE; + else if (retacl == 0 && aclp->acl_cnt > 0) + delegace = USENFSV4ACL; + } else if (aclp->acl_cnt > 0 && create == NFSV4OPEN_CREATE) { + delegace = USENFSV4ACL; } } - if (!nd->nd_repstat && exclusive_flag && (cverf[0] != tverf[0] || - cverf[1] != tverf[1])) - nd->nd_repstat = EEXIST; + /* * Do the open locking/delegation stuff. */ @@ -3244,6 +3357,13 @@ nfsrvd_open(struct nfsrv_descript *nd, __unused int isdgram, NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(NFSV4OPEN_RESOURCE); *tl = newnfs_false; + } else if ((rflags & + NFSV4OPEN_WDNOTSUPPDOWNGRADE) != 0) { + NFSM_BUILD(tl, uint32_t *, NFSX_UNSIGNED); + *tl = txdr_unsigned(NFSV4OPEN_NOTSUPPDOWNGRADE); + } else if ((rflags & NFSV4OPEN_WDNOTSUPPUPGRADE) != 0) { + NFSM_BUILD(tl, uint32_t *, NFSX_UNSIGNED); + *tl = txdr_unsigned(NFSV4OPEN_NOTSUPPUPGRADE); } else { NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OPEN_NOTWANTED); @@ -3265,18 +3385,56 @@ nfsrvd_open(struct nfsrv_descript *nd, __unused int isdgram, *tl++ = txdr_unsigned(NFSV4OPEN_LIMITSIZE); txdr_hyper(nva.na_size, tl); } - NFSM_BUILD(tl, u_int32_t *, 3 * NFSX_UNSIGNED); - *tl++ = txdr_unsigned(NFSV4ACE_ALLOWEDTYPE); - *tl++ = txdr_unsigned(0x0); - acemask = NFSV4ACE_ALLFILESMASK; - if (nva.na_mode & S_IRUSR) - acemask |= NFSV4ACE_READMASK; - if (nva.na_mode & S_IWUSR) - acemask |= NFSV4ACE_WRITEMASK; - if (nva.na_mode & S_IXUSR) - acemask |= NFSV4ACE_EXECUTEMASK; - *tl = txdr_unsigned(acemask); - (void) nfsm_strtom(nd, "OWNER@", 6); + + /* Set up the write delegation ACE. 
*/ + NFSM_BUILD(tl, uint32_t *, 3 * NFSX_UNSIGNED); + if (delegace == USENFSV4ACL) { + int j; + + for (j = 0; j < aclp->acl_cnt; j++) { + if (aclp->acl_entry[j].ae_tag == + ACL_USER_OBJ || + aclp->acl_entry[j].ae_entry_type != + ACL_ENTRY_TYPE_ALLOW) + break; + } + if (j < aclp->acl_cnt && + aclp->acl_entry[j].ae_tag == + ACL_USER_OBJ && + aclp->acl_entry[j].ae_entry_type == + ACL_ENTRY_TYPE_ALLOW) { + /* Use this ACE. */ + *tl++ = txdr_unsigned( + NFSV4ACE_ALLOWEDTYPE); + *tl++ = txdr_unsigned(0x0); + *tl = txdr_unsigned( + nfs_aceperm( + aclp->acl_entry[j].ae_perm)); + (void)nfsm_strtom(nd, "OWNER@", 6); + } else + delegace = USENONE; + } + if (delegace == USENONE) { + /* Don't allow anything. */ + *tl++ = 0x0; + *tl++ = 0x0; + *tl = 0x0; + NFSM_BUILD(tl, uint32_t *, NFSX_UNSIGNED); + *tl = 0; + } else if (delegace == USEMODE) { + /* Build from mode. */ + *tl++ = txdr_unsigned(NFSV4ACE_ALLOWEDTYPE); + *tl++ = txdr_unsigned(0x0); + acemask = NFSV4ACE_ALLFILESMASK; + if (nva.na_mode & S_IRUSR) + acemask |= NFSV4ACE_READMASK; + if (nva.na_mode & S_IWUSR) + acemask |= NFSV4ACE_WRITEMASK; + if (nva.na_mode & S_IXUSR) + acemask |= NFSV4ACE_EXECUTEMASK; + *tl = txdr_unsigned(acemask); + (void)nfsm_strtom(nd, "OWNER@", 6); + } } *vpp = vp; } else if (vp) { @@ -3466,11 +3624,20 @@ nfsrvd_getfh(struct nfsrv_descript *nd, __unused int isdgram, { fhandle_t fh; struct thread *p = curthread; + int siz; + short irflag; nd->nd_repstat = nfsvno_getfh(vp, &fh, p); + irflag = vn_irflag_read(vp); vput(vp); - if (!nd->nd_repstat) - (void)nfsm_fhtom(NULL, nd, (u_int8_t *)&fh, 0, 0); + if (nd->nd_repstat == 0) { + siz = 0; + if ((irflag & VIRF_NAMEDDIR) != 0) + siz = NFSX_FHMAX + NFSX_V4NAMEDDIRFH; + else if ((irflag & VIRF_NAMEDATTR) != 0) + siz = NFSX_FHMAX + NFSX_V4NAMEDATTRFH; + (void)nfsm_fhtom(NULL, nd, (u_int8_t *)&fh, siz, 0); + } NFSEXITCODE2(0, nd); return (0); } @@ -4180,7 +4347,8 @@ nfsrvd_verify(struct nfsrv_descript *nd, int isdgram, if (!nd->nd_repstat) { 
nfsvno_getfs(&fs, isdgram); error = nfsv4_loadattr(nd, vp, &nva, NULL, &fh, fhsize, NULL, - sf, NULL, &fs, NULL, 1, &ret, NULL, NULL, p, nd->nd_cred); + sf, NULL, &fs, NULL, 1, &ret, NULL, NULL, NULL, p, + nd->nd_cred); if (!error) { if (nd->nd_procnum == NFSV4OP_NVERIFY) { if (ret == 0) @@ -4202,15 +4370,42 @@ nfsrvd_verify(struct nfsrv_descript *nd, int isdgram, */ int nfsrvd_openattr(struct nfsrv_descript *nd, __unused int isdgram, - vnode_t dp, __unused vnode_t *vpp, __unused fhandle_t *fhp, + struct vnode *dp, struct vnode **vpp, __unused fhandle_t *fhp, __unused struct nfsexstuff *exp) { - u_int32_t *tl; - int error = 0, createdir __unused; + uint32_t *tl; + struct componentname cn; + int error = 0; - NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); - createdir = fxdr_unsigned(int, *tl); - nd->nd_repstat = NFSERR_NOTSUPP; + NFSNAMEICNDSET(&cn, nd->nd_cred, LOOKUP, OPENNAMED | ISLASTCN | + NOFOLLOW | LOCKLEAF); + cn.cn_nameptr = "."; + cn.cn_namelen = 1; + cn.cn_lkflags = LK_SHARED; + NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED); + if (*tl == newnfs_true) + cn.cn_flags |= CREATENAMED; + + nd->nd_repstat = vn_lock(dp, LK_SHARED); + if (nd->nd_repstat != 0) + goto nfsmout; + + if ((dp->v_mount->mnt_flag & MNT_NAMEDATTR) == 0) + nd->nd_repstat = NFSERR_NOTSUPP; + if (nd->nd_repstat == 0 && (vn_irflag_read(dp) & (VIRF_NAMEDDIR | + VIRF_NAMEDATTR)) != 0) + nd->nd_repstat = NFSERR_WRONGTYPE; + if (nd->nd_repstat == 0) { + nd->nd_repstat = VOP_LOOKUP(dp, vpp, &cn); + if (nd->nd_repstat == ENOATTR) + nd->nd_repstat = NFSERR_NOENT; + } + if (nd->nd_repstat == 0) + NFSVOPUNLOCK(*vpp); + + vput(dp); + NFSEXITCODE2(0, nd); + return (0); nfsmout: vrele(dp); NFSEXITCODE2(error, nd); diff --git a/sys/fs/nfsserver/nfs_nfsdsocket.c b/sys/fs/nfsserver/nfs_nfsdsocket.c index 1f50634405d0..d1b6198ba0e1 100644 --- a/sys/fs/nfsserver/nfs_nfsdsocket.c +++ b/sys/fs/nfsserver/nfs_nfsdsocket.c @@ -797,7 +797,7 @@ nfsrvd_compound(struct nfsrv_descript *nd, int isdgram, u_char *tag, 
!LIST_EMPTY(&clp->lc_deleg)) nfsrv_writestable(clp->lc_id, clp->lc_idlen, NFSNST_REVOKE, p); - nfsrv_cleanclient(clp, p); + nfsrv_cleanclient(clp, p, false, NULL); nfsrv_freedeleglist(&clp->lc_deleg); nfsrv_freedeleglist(&clp->lc_olddeleg); LIST_REMOVE(clp, lc_hash); @@ -1422,13 +1422,11 @@ static struct ucred * nfsrv_createrootcred(void) { struct ucred *cr; - gid_t grp; cr = crget(); cr->cr_uid = cr->cr_ruid = cr->cr_svuid = UID_ROOT; - grp = GID_WHEEL; - crsetgroups(cr, 1, &grp); - cr->cr_rgid = cr->cr_svgid = cr->cr_groups[0]; + crsetgroups_fallback(cr, 0, NULL, GID_WHEEL); + cr->cr_rgid = cr->cr_svgid = cr->cr_gid; cr->cr_prison = curthread->td_ucred->cr_prison; prison_hold(cr->cr_prison); #ifdef MAC diff --git a/sys/fs/nfsserver/nfs_nfsdstate.c b/sys/fs/nfsserver/nfs_nfsdstate.c index c73840277022..2e27817389dd 100644 --- a/sys/fs/nfsserver/nfs_nfsdstate.c +++ b/sys/fs/nfsserver/nfs_nfsdstate.c @@ -115,6 +115,11 @@ SYSCTL_INT(_vfs_nfsd, OID_AUTO, flexlinuxhack, CTLFLAG_RW, &nfsrv_flexlinuxhack, 0, "For Linux clients, hack around Flex File Layout bug"); +NFSD_VNET_DEFINE_STATIC(bool, nfsd_disable_grace) = false; +SYSCTL_BOOL(_vfs_nfsd, OID_AUTO, testing_disable_grace, + CTLFLAG_NFSD_VNET | CTLFLAG_RW, &NFSD_VNET_NAME(nfsd_disable_grace), + 0, "Disable grace for testing"); + /* * Hash lists for nfs V4. 
*/ @@ -139,7 +144,7 @@ static void nfsrv_dumpaclient(struct nfsclient *clp, struct nfsd_dumpclients *dumpp); static void nfsrv_freeopenowner(struct nfsstate *stp, int cansleep, NFSPROC_T *p); -static int nfsrv_freeopen(struct nfsstate *stp, vnode_t vp, int cansleep, +static void nfsrv_freeopen(struct nfsstate *stp, vnode_t vp, int cansleep, NFSPROC_T *p); static void nfsrv_freelockowner(struct nfsstate *stp, vnode_t vp, int cansleep, NFSPROC_T *p); @@ -204,7 +209,7 @@ static void nfsrv_locklf(struct nfslockfile *lfp); static void nfsrv_unlocklf(struct nfslockfile *lfp); static struct nfsdsession *nfsrv_findsession(uint8_t *sessionid); static int nfsrv_freesession(struct nfsrv_descript *nd, struct nfsdsession *sep, - uint8_t *sessionid); + uint8_t *sessionid, bool locked, SVCXPRT **old_xprtp); static int nfsv4_setcbsequence(struct nfsrv_descript *nd, struct nfsclient *clp, int dont_replycache, struct nfsdsession **sepp, int *slotposp); static int nfsv4_getcbsession(struct nfsclient *clp, struct nfsdsession **sepp); @@ -240,6 +245,50 @@ static int nfsrv_createdsfile(vnode_t vp, fhandle_t *fhp, struct pnfsdsfile *pf, static struct nfsdevice *nfsrv_findmirroredds(struct nfsmount *nmp); static int nfsrv_checkmachcred(int op, struct nfsrv_descript *nd, struct nfsclient *clp); +static void nfsrv_issuedelegation(struct vnode *vp, struct nfsclient *clp, + struct nfsrv_descript *nd, int delegate, int writedeleg, int readonly, + u_quad_t filerev, uint64_t rdonly, struct nfsstate **new_delegp, + struct nfsstate *new_stp, struct nfslockfile *lfp, uint32_t *rflagsp, + nfsv4stateid_t *delegstateidp); +static void nfsrv_clientlock(bool mlocked); +static void nfsrv_clientunlock(bool mlocked); + +/* + * Lock the client structure, either with the mutex or the exclusive nfsd lock. 
+ */ +static void +nfsrv_clientlock(bool mlocked) +{ + int igotlock; + + if (mlocked) { + NFSLOCKSTATE(); + } else { + NFSLOCKV4ROOTMUTEX(); + nfsv4_relref(&nfsv4rootfs_lock); + do { + igotlock = nfsv4_lock(&nfsv4rootfs_lock, 1, NULL, + NFSV4ROOTLOCKMUTEXPTR, NULL); + } while (!igotlock); + NFSUNLOCKV4ROOTMUTEX(); + } +} + +/* + * Unlock the client structure. + */ +static void +nfsrv_clientunlock(bool mlocked) +{ + + if (mlocked) { + NFSUNLOCKSTATE(); + } else { + NFSLOCKV4ROOTMUTEX(); + nfsv4_unlock(&nfsv4rootfs_lock, 1); + NFSUNLOCKV4ROOTMUTEX(); + } +} /* * Scan the client list for a match and either return the current one, @@ -261,7 +310,10 @@ nfsrv_setclient(struct nfsrv_descript *nd, struct nfsclient **new_clpp, struct sockaddr_in6 *sin6, *rin6; #endif struct nfsdsession *sep, *nsep; - int zapit = 0, gotit, hasstate = 0, igotlock; + SVCXPRT *old_xprt; + struct nfssessionhead old_sess; + int zapit = 0, gotit, hasstate = 0; + bool mlocked; static u_int64_t confirm_index = 0; /* @@ -289,14 +341,11 @@ nfsrv_setclient(struct nfsrv_descript *nd, struct nfsclient **new_clpp, */ new_clp->lc_program = 0; + mlocked = true; + if (nfsrv_dolocallocks != 0) + mlocked = false; /* Lock out other nfsd threads */ - NFSLOCKV4ROOTMUTEX(); - nfsv4_relref(&nfsv4rootfs_lock); - do { - igotlock = nfsv4_lock(&nfsv4rootfs_lock, 1, NULL, - NFSV4ROOTLOCKMUTEXPTR, NULL); - } while (!igotlock); - NFSUNLOCKV4ROOTMUTEX(); + nfsrv_clientlock(mlocked); /* * Search for a match in the client list. @@ -313,6 +362,7 @@ nfsrv_setclient(struct nfsrv_descript *nd, struct nfsclient **new_clpp, if (gotit == 0) i++; } + old_xprt = NULL; if (!gotit || (clp->lc_flags & (LCL_NEEDSCONFIRM | LCL_ADMINREVOKED))) { if ((nd->nd_flag & ND_NFSV41) != 0 && confirmp->lval[1] != 0) { @@ -320,9 +370,7 @@ nfsrv_setclient(struct nfsrv_descript *nd, struct nfsclient **new_clpp, * For NFSv4.1, if confirmp->lval[1] is non-zero, the * client is trying to update a confirmed clientid. 
*/ - NFSLOCKV4ROOTMUTEX(); - nfsv4_unlock(&nfsv4rootfs_lock, 1); - NFSUNLOCKV4ROOTMUTEX(); + nfsrv_clientunlock(mlocked); confirmp->lval[1] = 0; error = NFSERR_NOENT; goto out; @@ -332,7 +380,10 @@ nfsrv_setclient(struct nfsrv_descript *nd, struct nfsclient **new_clpp, */ if (i != nfsrv_clienthashsize) { LIST_REMOVE(clp, lc_hash); - nfsrv_cleanclient(clp, p); + if (mlocked) + nfsrv_cleanclient(clp, p, true, &old_xprt); + else + nfsrv_cleanclient(clp, p, false, NULL); nfsrv_freedeleglist(&clp->lc_deleg); nfsrv_freedeleglist(&clp->lc_olddeleg); zapit = 1; @@ -367,11 +418,12 @@ nfsrv_setclient(struct nfsrv_descript *nd, struct nfsclient **new_clpp, NFSD_VNET(nfsstatsv1_p)->srvclients++; nfsrv_openpluslock++; nfsrv_clients++; - NFSLOCKV4ROOTMUTEX(); - nfsv4_unlock(&nfsv4rootfs_lock, 1); - NFSUNLOCKV4ROOTMUTEX(); - if (zapit) + nfsrv_clientunlock(mlocked); + if (zapit != 0) { + if (old_xprt != NULL) + SVC_RELEASE(old_xprt); nfsrv_zapclient(clp, p); + } *new_clpp = NULL; goto out; } @@ -385,7 +437,10 @@ nfsrv_setclient(struct nfsrv_descript *nd, struct nfsclient **new_clpp, */ if (clp->lc_expiry < NFSD_MONOSEC && (!LIST_EMPTY(&clp->lc_open) || !LIST_EMPTY(&clp->lc_deleg))) { - nfsrv_cleanclient(clp, p); + if (mlocked) + nfsrv_cleanclient(clp, p, true, &old_xprt); + else + nfsrv_cleanclient(clp, p, false, NULL); nfsrv_freedeleglist(&clp->lc_deleg); } @@ -430,9 +485,9 @@ nfsrv_setclient(struct nfsrv_descript *nd, struct nfsclient **new_clpp, break; #endif } - NFSLOCKV4ROOTMUTEX(); - nfsv4_unlock(&nfsv4rootfs_lock, 1); - NFSUNLOCKV4ROOTMUTEX(); + nfsrv_clientunlock(mlocked); + if (old_xprt != NULL) + SVC_RELEASE(old_xprt); error = NFSERR_CLIDINUSE; goto out; } @@ -442,17 +497,12 @@ nfsrv_setclient(struct nfsrv_descript *nd, struct nfsclient **new_clpp, /* * If the verifier has changed, the client has rebooted * and a new client id is issued. The old state info - * can be thrown away once the SETCLIENTID_CONFIRM occurs. 
+ * can be thrown away once the SetClientID_Confirm or + * Create_Session that confirms the clientid occurs. */ LIST_REMOVE(clp, lc_hash); - /* Get rid of all sessions on this clientid. */ - LIST_FOREACH_SAFE(sep, &clp->lc_session, sess_list, nsep) { - ret = nfsrv_freesession(NULL, sep, NULL); - if (ret != 0) - printf("nfsrv_setclient: verifier changed free" - " session failed=%d\n", ret); - } + LIST_NEWHEAD(&old_sess, &clp->lc_session, sess_list); new_clp->lc_flags |= LCL_NEEDSCONFIRM; if ((nd->nd_flag & ND_NFSV41) != 0) { @@ -496,21 +546,31 @@ nfsrv_setclient(struct nfsrv_descript *nd, struct nfsclient **new_clpp, NFSD_VNET(nfsstatsv1_p)->srvclients++; nfsrv_openpluslock++; nfsrv_clients++; - NFSLOCKV4ROOTMUTEX(); - nfsv4_unlock(&nfsv4rootfs_lock, 1); - NFSUNLOCKV4ROOTMUTEX(); + if (!mlocked) { + nfsrv_clientunlock(mlocked); + NFSLOCKSTATE(); + } /* * Must wait until any outstanding callback on the old clp * completes. */ - NFSLOCKSTATE(); while (clp->lc_cbref) { clp->lc_flags |= LCL_WAKEUPWANTED; - (void)mtx_sleep(clp, NFSSTATEMUTEXPTR, PZERO - 1, + (void)mtx_sleep(clp, NFSSTATEMUTEXPTR, PVFS, "nfsd clp", 10 * hz); } NFSUNLOCKSTATE(); + if (old_xprt != NULL) + SVC_RELEASE(old_xprt); + /* Get rid of all sessions on this clientid. */ + LIST_FOREACH_SAFE(sep, &old_sess, sess_list, nsep) { + ret = nfsrv_freesession(NULL, sep, NULL, false, NULL); + if (ret != 0) + printf("nfsrv_setclient: verifier changed free" + " session failed=%d\n", ret); + } + nfsrv_zapclient(clp, p); *new_clpp = NULL; goto out; @@ -562,24 +622,31 @@ nfsrv_setclient(struct nfsrv_descript *nd, struct nfsclient **new_clpp, nfsrv_openpluslock++; nfsrv_clients++; } - NFSLOCKV4ROOTMUTEX(); - nfsv4_unlock(&nfsv4rootfs_lock, 1); - NFSUNLOCKV4ROOTMUTEX(); + if (!mlocked) + nfsrv_clientunlock(mlocked); if ((nd->nd_flag & ND_NFSV41) == 0) { /* * Must wait until any outstanding callback on the old clp * completes. 
*/ - NFSLOCKSTATE(); + if (!mlocked) + NFSLOCKSTATE(); while (clp->lc_cbref) { clp->lc_flags |= LCL_WAKEUPWANTED; - (void)mtx_sleep(clp, NFSSTATEMUTEXPTR, PZERO - 1, + (void)mtx_sleep(clp, NFSSTATEMUTEXPTR, PVFS, "nfsdclp", 10 * hz); } NFSUNLOCKSTATE(); + if (old_xprt != NULL) + SVC_RELEASE(old_xprt); nfsrv_zapclient(clp, p); *new_clpp = NULL; + } else { + if (mlocked) + NFSUNLOCKSTATE(); + if (old_xprt != NULL) + SVC_RELEASE(old_xprt); } out: @@ -599,11 +666,13 @@ nfsrv_getclient(nfsquad_t clientid, int opflags, struct nfsclient **clpp, struct nfsstate *stp; int i; struct nfsclienthashhead *hp; - int error = 0, igotlock, doneok; + int error = 0, doneok, igotlock; struct nfssessionhash *shp; struct nfsdsession *sep; uint64_t sessid[2]; - bool sess_replay; + CLIENT *client; + SVCXPRT *old_xprt; + bool mlocked, sess_replay; static uint64_t next_sess = 0; if (clpp) @@ -620,13 +689,27 @@ nfsrv_getclient(nfsquad_t clientid, int opflags, struct nfsclient **clpp, * already held. Otherwise, we need to get either that or, * for the case of Confirm, lock out the nfsd threads. */ + client = NULL; + old_xprt = NULL; + mlocked = true; + if (nfsrv_dolocallocks != 0) + mlocked = false; if (opflags & CLOPS_CONFIRM) { - NFSLOCKV4ROOTMUTEX(); - nfsv4_relref(&nfsv4rootfs_lock); - do { - igotlock = nfsv4_lock(&nfsv4rootfs_lock, 1, NULL, - NFSV4ROOTLOCKMUTEXPTR, NULL); - } while (!igotlock); + if (nsep != NULL && + (nsep->sess_crflags & NFSV4CRSESS_CONNBACKCHAN) != 0) + client = (struct __rpc_client *) + clnt_bck_create(nd->nd_xprt->xp_socket, + cbprogram, NFSV4_CBVERS); + if (mlocked) { + nfsrv_clientlock(mlocked); + } else { + NFSLOCKV4ROOTMUTEX(); + nfsv4_relref(&nfsv4rootfs_lock); + do { + igotlock = nfsv4_lock(&nfsv4rootfs_lock, 1, + NULL, NFSV4ROOTLOCKMUTEXPTR, NULL); + } while (!igotlock); + } /* * Create a new sessionid here, since we need to do it where * there is a mutex held to serialize update of next_sess. 
@@ -635,7 +718,8 @@ nfsrv_getclient(nfsquad_t clientid, int opflags, struct nfsclient **clpp, sessid[0] = ++next_sess; sessid[1] = clientid.qval; } - NFSUNLOCKV4ROOTMUTEX(); + if (!mlocked) + NFSUNLOCKV4ROOTMUTEX(); } else if (opflags != CLOPS_RENEW) { NFSLOCKSTATE(); } @@ -672,9 +756,9 @@ nfsrv_getclient(nfsquad_t clientid, int opflags, struct nfsclient **clpp, } if (error) { if (opflags & CLOPS_CONFIRM) { - NFSLOCKV4ROOTMUTEX(); - nfsv4_unlock(&nfsv4rootfs_lock, 1); - NFSUNLOCKV4ROOTMUTEX(); + nfsrv_clientunlock(mlocked); + if (client != NULL) + CLNT_RELEASE(client); } else if (opflags != CLOPS_RENEW) { NFSUNLOCKSTATE(); } @@ -719,7 +803,10 @@ nfsrv_getclient(nfsquad_t clientid, int opflags, struct nfsclient **clpp, * for an Open with CLAIM_DELEGATE_PREV unless in * grace, but get rid of the rest of the state. */ - nfsrv_cleanclient(clp, p); + if (mlocked) + nfsrv_cleanclient(clp, p, true, &old_xprt); + else + nfsrv_cleanclient(clp, p, false, NULL); nfsrv_freedeleglist(&clp->lc_olddeleg); if (nfsrv_checkgrace(nd, clp, 0)) { /* In grace, so just delete delegations */ @@ -743,10 +830,10 @@ nfsrv_getclient(nfsquad_t clientid, int opflags, struct nfsclient **clpp, /* Hold a reference on the xprt for a backchannel. 
*/ if ((nsep->sess_crflags & NFSV4CRSESS_CONNBACKCHAN) != 0 && !sess_replay) { - if (clp->lc_req.nr_client == NULL) - clp->lc_req.nr_client = (struct __rpc_client *) - clnt_bck_create(nd->nd_xprt->xp_socket, - cbprogram, NFSV4_CBVERS); + if (clp->lc_req.nr_client == NULL) { + clp->lc_req.nr_client = client; + client = NULL; + } if (clp->lc_req.nr_client != NULL) { SVC_ACQUIRE(nd->nd_xprt); CLNT_ACQUIRE(clp->lc_req.nr_client); @@ -763,13 +850,15 @@ nfsrv_getclient(nfsquad_t clientid, int opflags, struct nfsclient **clpp, NFSX_V4SESSIONID); if (!sess_replay) { shp = NFSSESSIONHASH(nsep->sess_sessionid); - NFSLOCKSTATE(); + if (!mlocked) + NFSLOCKSTATE(); NFSLOCKSESSION(shp); LIST_INSERT_HEAD(&shp->list, nsep, sess_hash); LIST_INSERT_HEAD(&clp->lc_session, nsep, sess_list); nsep->sess_clp = clp; NFSUNLOCKSESSION(shp); - NFSUNLOCKSTATE(); + if (!mlocked) + NFSUNLOCKSTATE(); } } } @@ -803,9 +892,11 @@ nfsrv_getclient(nfsquad_t clientid, int opflags, struct nfsclient **clpp, clp->lc_expiry = nfsrv_leaseexpiry(); } if (opflags & CLOPS_CONFIRM) { - NFSLOCKV4ROOTMUTEX(); - nfsv4_unlock(&nfsv4rootfs_lock, 1); - NFSUNLOCKV4ROOTMUTEX(); + nfsrv_clientunlock(mlocked); + if (client != NULL) + CLNT_RELEASE(client); + if (old_xprt != NULL) + SVC_RELEASE(old_xprt); } else if (opflags != CLOPS_RENEW) { NFSUNLOCKSTATE(); } @@ -825,21 +916,20 @@ nfsrv_destroyclient(struct nfsrv_descript *nd, nfsquad_t clientid, NFSPROC_T *p) { struct nfsclient *clp; struct nfsclienthashhead *hp; - int error = 0, i, igotlock; + SVCXPRT *old_xprt; + int error = 0, i; + bool mlocked; if (NFSD_VNET(nfsrvboottime) != clientid.lval[0]) { error = NFSERR_STALECLIENTID; goto out; } + mlocked = true; + if (nfsrv_dolocallocks != 0) + mlocked = false; /* Lock out other nfsd threads */ - NFSLOCKV4ROOTMUTEX(); - nfsv4_relref(&nfsv4rootfs_lock); - do { - igotlock = nfsv4_lock(&nfsv4rootfs_lock, 1, NULL, - NFSV4ROOTLOCKMUTEXPTR, NULL); - } while (igotlock == 0); - NFSUNLOCKV4ROOTMUTEX(); + nfsrv_clientlock(mlocked); 
hp = NFSCLIENTHASH(clientid); LIST_FOREACH(clp, hp, lc_hash) { @@ -847,9 +937,7 @@ nfsrv_destroyclient(struct nfsrv_descript *nd, nfsquad_t clientid, NFSPROC_T *p) break; } if (clp == NULL) { - NFSLOCKV4ROOTMUTEX(); - nfsv4_unlock(&nfsv4rootfs_lock, 1); - NFSUNLOCKV4ROOTMUTEX(); + nfsrv_clientunlock(mlocked); /* Just return ok, since it is gone. */ goto out; } @@ -857,9 +945,7 @@ nfsrv_destroyclient(struct nfsrv_descript *nd, nfsquad_t clientid, NFSPROC_T *p) /* Check for the SP4_MACH_CRED case. */ error = nfsrv_checkmachcred(NFSV4OP_DESTROYCLIENTID, nd, clp); if (error != 0) { - NFSLOCKV4ROOTMUTEX(); - nfsv4_unlock(&nfsv4rootfs_lock, 1); - NFSUNLOCKV4ROOTMUTEX(); + nfsrv_clientunlock(mlocked); goto out; } @@ -872,28 +958,28 @@ nfsrv_destroyclient(struct nfsrv_descript *nd, nfsquad_t clientid, NFSPROC_T *p) /* Scan for state on the clientid. */ for (i = 0; i < nfsrv_statehashsize; i++) if (!LIST_EMPTY(&clp->lc_stateid[i])) { - NFSLOCKV4ROOTMUTEX(); - nfsv4_unlock(&nfsv4rootfs_lock, 1); - NFSUNLOCKV4ROOTMUTEX(); + nfsrv_clientunlock(mlocked); error = NFSERR_CLIENTIDBUSY; goto out; } if (!LIST_EMPTY(&clp->lc_session) || !LIST_EMPTY(&clp->lc_deleg)) { - NFSLOCKV4ROOTMUTEX(); - nfsv4_unlock(&nfsv4rootfs_lock, 1); - NFSUNLOCKV4ROOTMUTEX(); + nfsrv_clientunlock(mlocked); error = NFSERR_CLIENTIDBUSY; goto out; } /* Destroy the clientid and return ok. 
*/ - nfsrv_cleanclient(clp, p); + old_xprt = NULL; + if (mlocked) + nfsrv_cleanclient(clp, p, true, &old_xprt); + else + nfsrv_cleanclient(clp, p, false, NULL); nfsrv_freedeleglist(&clp->lc_deleg); nfsrv_freedeleglist(&clp->lc_olddeleg); LIST_REMOVE(clp, lc_hash); - NFSLOCKV4ROOTMUTEX(); - nfsv4_unlock(&nfsv4rootfs_lock, 1); - NFSUNLOCKV4ROOTMUTEX(); + nfsrv_clientunlock(mlocked); + if (old_xprt != NULL) + SVC_RELEASE(old_xprt); nfsrv_zapclient(clp, p); out: NFSEXITCODE2(error, nd); @@ -956,7 +1042,7 @@ nfsrv_adminrevoke(struct nfsd_clid *revokep, NFSPROC_T *p) */ clp->lc_flags &= ~LCL_CALLBACKSON; clp->lc_flags |= LCL_ADMINREVOKED; - nfsrv_cleanclient(clp, p); + nfsrv_cleanclient(clp, p, false, NULL); nfsrv_freedeleglist(&clp->lc_deleg); nfsrv_freedeleglist(&clp->lc_olddeleg); NFSLOCKV4ROOTMUTEX(); @@ -1376,16 +1462,22 @@ nfsrv_servertimer(void *arg __unused) * there are no other active nfsd threads. */ void -nfsrv_cleanclient(struct nfsclient *clp, NFSPROC_T *p) +nfsrv_cleanclient(struct nfsclient *clp, NFSPROC_T *p, bool locked, + SVCXPRT **old_xprtp) { struct nfsstate *stp, *nstp; struct nfsdsession *sep, *nsep; - LIST_FOREACH_SAFE(stp, &clp->lc_open, ls_list, nstp) - nfsrv_freeopenowner(stp, 1, p); + LIST_FOREACH_SAFE(stp, &clp->lc_open, ls_list, nstp) { + if (locked) + nfsrv_freeopenowner(stp, 0, p); + else + nfsrv_freeopenowner(stp, 1, p); + } if ((clp->lc_flags & LCL_ADMINREVOKED) == 0) LIST_FOREACH_SAFE(sep, &clp->lc_session, sess_list, nsep) - (void)nfsrv_freesession(NULL, sep, NULL); + (void)nfsrv_freesession(NULL, sep, NULL, locked, + old_xprtp); } /* @@ -1479,7 +1571,7 @@ nfsrv_freeopenowner(struct nfsstate *stp, int cansleep, NFSPROC_T *p) while (nstp != LIST_END(&stp->ls_open)) { tstp = nstp; nstp = LIST_NEXT(nstp, ls_list); - (void) nfsrv_freeopen(tstp, NULL, cansleep, p); + nfsrv_freeopen(tstp, NULL, cansleep, p); } if (stp->ls_op) nfsrvd_derefcache(stp->ls_op); @@ -1494,12 +1586,11 @@ nfsrv_freeopenowner(struct nfsstate *stp, int cansleep, 
NFSPROC_T *p) * are no other opens on the file. * Returns 1 if it free'd the nfslockfile, 0 otherwise. */ -static int +static void nfsrv_freeopen(struct nfsstate *stp, vnode_t vp, int cansleep, NFSPROC_T *p) { struct nfsstate *nstp, *tstp; struct nfslockfile *lfp; - int ret; LIST_REMOVE(stp, ls_hash); LIST_REMOVE(stp, ls_list); @@ -1508,35 +1599,46 @@ nfsrv_freeopen(struct nfsstate *stp, vnode_t vp, int cansleep, NFSPROC_T *p) lfp = stp->ls_lfp; /* * Now, free all lockowners associated with this open. + * Note that, if vp != NULL, nfsrv_freelockowner() will + * not call nfsrv_freeallnfslocks(), so it needs to be called, below. */ LIST_FOREACH_SAFE(tstp, &stp->ls_open, ls_list, nstp) nfsrv_freelockowner(tstp, vp, cansleep, p); + if (vp != NULL) { + KASSERT(cansleep != 0, ("nfsrv_freeopen: cansleep == 0")); + mtx_assert(NFSSTATEMUTEXPTR, MA_OWNED); + /* + * Only called with vp != NULL for Close when + * vfs.nfsd.enable_locallocks != 0. + * Lock the lfp so that it will not go away and do the + * nfsrv_freeallnfslocks() call that was not done by + * nfsrv_freelockowner(). + */ + nfsrv_locklf(lfp); + NFSUNLOCKSTATE(); + NFSVOPUNLOCK(vp); + nfsrv_freeallnfslocks(stp, vp, cansleep, p); + NFSVOPLOCK(vp, LK_EXCLUSIVE | LK_RETRY); + NFSLOCKSTATE(); + nfsrv_unlocklf(lfp); + } + /* * The nfslockfile is freed here if there are no locks * associated with the open. * If there are locks associated with the open, the * nfslockfile structure can be freed via nfsrv_freelockowner(). - * Acquire the state mutex to avoid races with calls to - * nfsrv_getlockfile(). 
*/ - if (cansleep != 0) - NFSLOCKSTATE(); if (lfp != NULL && LIST_EMPTY(&lfp->lf_open) && LIST_EMPTY(&lfp->lf_deleg) && LIST_EMPTY(&lfp->lf_lock) && LIST_EMPTY(&lfp->lf_locallock) && LIST_EMPTY(&lfp->lf_rollback) && lfp->lf_usecount == 0 && - (cansleep != 0 || nfsv4_testlock(&lfp->lf_locallock_lck) == 0)) { + nfsv4_testlock(&lfp->lf_locallock_lck) == 0) nfsrv_freenfslockfile(lfp); - ret = 1; - } else - ret = 0; - if (cansleep != 0) - NFSUNLOCKSTATE(); free(stp, M_NFSDSTATE); NFSD_VNET(nfsstatsv1_p)->srvopens--; nfsrv_openpluslock--; - return (ret); } /* @@ -1549,7 +1651,8 @@ nfsrv_freelockowner(struct nfsstate *stp, vnode_t vp, int cansleep, LIST_REMOVE(stp, ls_hash); LIST_REMOVE(stp, ls_list); - nfsrv_freeallnfslocks(stp, vp, cansleep, p); + if (vp == NULL) + nfsrv_freeallnfslocks(stp, vp, cansleep, p); if (stp->ls_op) nfsrvd_derefcache(stp->ls_op); free(stp, M_NFSDSTATE); @@ -2648,6 +2751,8 @@ tryagain: * considered a conflict since the client with a read delegation * could have done an Open with ReadAccess and WriteDeny * locally and then not have checked for the WriteDeny.) + * The exception is a NFSv4.1/4.2 client that has requested + * an atomic upgrade to a write delegation. * Don't check for a Reclaim, since that will be dealt with * by nfsrv_openctrl(). */ @@ -2657,9 +2762,10 @@ tryagain: while (stp != LIST_END(&lfp->lf_deleg)) { nstp = LIST_NEXT(stp, ls_file); if ((readonly && stp->ls_clp != clp && - (stp->ls_flags & NFSLCK_DELEGWRITE)) || + (stp->ls_flags & NFSLCK_DELEGWRITE) != 0) || (!readonly && (stp->ls_clp != clp || - (stp->ls_flags & NFSLCK_DELEGREAD)))) { + ((stp->ls_flags & NFSLCK_DELEGREAD) != 0 && + (new_stp->ls_flags & NFSLCK_WANTWDELEG) == 0)))) { ret = nfsrv_delegconflict(stp, &haslock, p, vp); if (ret) { /* @@ -2944,6 +3050,8 @@ tryagain: * considered a conflict since the client with a read delegation * could have done an Open with ReadAccess and WriteDeny * locally and then not have checked for the WriteDeny.) 
+ * The exception is a NFSv4.1/4.2 client that has requested + * an atomic upgrade to a write delegation. */ if (!(new_stp->ls_flags & (NFSLCK_DELEGPREV | NFSLCK_DELEGCUR))) { stp = LIST_FIRST(&lfp->lf_deleg); @@ -2951,12 +3059,15 @@ tryagain: nstp = LIST_NEXT(stp, ls_file); if (stp->ls_clp != clp && (stp->ls_flags & NFSLCK_DELEGREAD)) writedeleg = 0; - else + else if (stp->ls_clp != clp || + (stp->ls_flags & NFSLCK_DELEGWRITE) != 0 || + (new_stp->ls_flags & NFSLCK_WANTWDELEG) == 0) delegate = 0; if ((readonly && stp->ls_clp != clp && - (stp->ls_flags & NFSLCK_DELEGWRITE)) || + (stp->ls_flags & NFSLCK_DELEGWRITE) != 0) || (!readonly && (stp->ls_clp != clp || - (stp->ls_flags & NFSLCK_DELEGREAD)))) { + ((stp->ls_flags & NFSLCK_DELEGREAD) != 0 && + (new_stp->ls_flags & NFSLCK_WANTWDELEG) == 0)))) { if (new_stp->ls_flags & NFSLCK_RECLAIM) { delegate = 2; } else { @@ -3204,47 +3315,9 @@ tryagain: /* * This is where we can choose to issue a delegation. */ - if ((new_stp->ls_flags & NFSLCK_WANTNODELEG) != 0) - *rflagsp |= NFSV4OPEN_WDNOTWANTED; - else if (nfsrv_issuedelegs == 0) - *rflagsp |= NFSV4OPEN_WDSUPPFTYPE; - else if (NFSRV_V4DELEGLIMIT(nfsrv_delegatecnt)) - *rflagsp |= NFSV4OPEN_WDRESOURCE; - else if (delegate == 0 || writedeleg == 0 || - NFSVNO_EXRDONLY(exp) || (readonly != 0 && - nfsrv_writedelegifpos == 0) || - !NFSVNO_DELEGOK(vp) || - (new_stp->ls_flags & NFSLCK_WANTRDELEG) != 0 || - (clp->lc_flags & (LCL_CALLBACKSON | LCL_CBDOWN)) != - LCL_CALLBACKSON) - *rflagsp |= NFSV4OPEN_WDCONTENTION; - else { - new_deleg->ls_stateid.seqid = delegstateidp->seqid = 1; - new_deleg->ls_stateid.other[0] = delegstateidp->other[0] - = clp->lc_clientid.lval[0]; - new_deleg->ls_stateid.other[1] = delegstateidp->other[1] - = clp->lc_clientid.lval[1]; - new_deleg->ls_stateid.other[2] = delegstateidp->other[2] - = nfsrv_nextstateindex(clp); - new_deleg->ls_flags = (NFSLCK_DELEGWRITE | - NFSLCK_READACCESS | NFSLCK_WRITEACCESS); - *rflagsp |= NFSV4OPEN_WRITEDELEGATE; - 
new_deleg->ls_uid = new_stp->ls_uid; - new_deleg->ls_lfp = lfp; - new_deleg->ls_clp = clp; - new_deleg->ls_filerev = filerev; - new_deleg->ls_compref = nd->nd_compref; - new_deleg->ls_lastrecall = 0; - nfsrv_writedelegcnt++; - LIST_INSERT_HEAD(&lfp->lf_deleg, new_deleg, ls_file); - LIST_INSERT_HEAD(NFSSTATEHASH(clp, - new_deleg->ls_stateid), new_deleg, ls_hash); - LIST_INSERT_HEAD(&clp->lc_deleg, new_deleg, ls_list); - new_deleg = NULL; - NFSD_VNET(nfsstatsv1_p)->srvdelegates++; - nfsrv_openpluslock++; - nfsrv_delegatecnt++; - } + nfsrv_issuedelegation(vp, clp, nd, delegate, writedeleg, + readonly, filerev, NFSVNO_EXRDONLY(exp), &new_deleg, + new_stp, lfp, rflagsp, delegstateidp); } else { new_open->ls_stateid.seqid = 1; new_open->ls_stateid.other[0] = clp->lc_clientid.lval[0]; @@ -3269,52 +3342,9 @@ tryagain: /* * This is where we can choose to issue a delegation. */ - if ((new_stp->ls_flags & NFSLCK_WANTNODELEG) != 0) - *rflagsp |= NFSV4OPEN_WDNOTWANTED; - else if (nfsrv_issuedelegs == 0) - *rflagsp |= NFSV4OPEN_WDSUPPFTYPE; - else if (NFSRV_V4DELEGLIMIT(nfsrv_delegatecnt)) - *rflagsp |= NFSV4OPEN_WDRESOURCE; - else if (delegate == 0 || (writedeleg == 0 && - readonly == 0) || !NFSVNO_DELEGOK(vp) || - (clp->lc_flags & (LCL_CALLBACKSON | LCL_CBDOWN)) != - LCL_CALLBACKSON) - *rflagsp |= NFSV4OPEN_WDCONTENTION; - else { - new_deleg->ls_stateid.seqid = delegstateidp->seqid = 1; - new_deleg->ls_stateid.other[0] = delegstateidp->other[0] - = clp->lc_clientid.lval[0]; - new_deleg->ls_stateid.other[1] = delegstateidp->other[1] - = clp->lc_clientid.lval[1]; - new_deleg->ls_stateid.other[2] = delegstateidp->other[2] - = nfsrv_nextstateindex(clp); - if (writedeleg && !NFSVNO_EXRDONLY(exp) && - (nfsrv_writedelegifpos || !readonly) && - (new_stp->ls_flags & NFSLCK_WANTRDELEG) == 0) { - new_deleg->ls_flags = (NFSLCK_DELEGWRITE | - NFSLCK_READACCESS | NFSLCK_WRITEACCESS); - *rflagsp |= NFSV4OPEN_WRITEDELEGATE; - nfsrv_writedelegcnt++; - } else { - new_deleg->ls_flags = 
(NFSLCK_DELEGREAD | - NFSLCK_READACCESS); - *rflagsp |= NFSV4OPEN_READDELEGATE; - } - new_deleg->ls_uid = new_stp->ls_uid; - new_deleg->ls_lfp = lfp; - new_deleg->ls_clp = clp; - new_deleg->ls_filerev = filerev; - new_deleg->ls_compref = nd->nd_compref; - new_deleg->ls_lastrecall = 0; - LIST_INSERT_HEAD(&lfp->lf_deleg, new_deleg, ls_file); - LIST_INSERT_HEAD(NFSSTATEHASH(clp, - new_deleg->ls_stateid), new_deleg, ls_hash); - LIST_INSERT_HEAD(&clp->lc_deleg, new_deleg, ls_list); - new_deleg = NULL; - NFSD_VNET(nfsstatsv1_p)->srvdelegates++; - nfsrv_openpluslock++; - nfsrv_delegatecnt++; - } + nfsrv_issuedelegation(vp, clp, nd, delegate, writedeleg, + readonly, filerev, NFSVNO_EXRDONLY(exp), &new_deleg, + new_stp, lfp, rflagsp, delegstateidp); } } else { /* @@ -3337,78 +3367,28 @@ tryagain: if (new_stp->ls_flags & NFSLCK_RECLAIM) { new_stp->ls_flags = 0; } else if ((nd->nd_flag & ND_NFSV41) != 0) { - /* NFSv4.1 never needs confirmation. */ - new_stp->ls_flags = 0; + /* + * This is where we can choose to issue a delegation. + */ + nfsrv_issuedelegation(vp, clp, nd, delegate, writedeleg, + readonly, filerev, NFSVNO_EXRDONLY(exp), &new_deleg, + new_stp, lfp, rflagsp, delegstateidp); + /* NFSv4.1 never needs confirmation. */ + new_stp->ls_flags = 0; - /* - * This is where we can choose to issue a delegation. 
- */ - if (delegate && nfsrv_issuedelegs && - (writedeleg || readonly) && - (clp->lc_flags & (LCL_CALLBACKSON | LCL_CBDOWN)) == - LCL_CALLBACKSON && - !NFSRV_V4DELEGLIMIT(nfsrv_delegatecnt) && - NFSVNO_DELEGOK(vp) && - ((nd->nd_flag & ND_NFSV41) == 0 || - (new_stp->ls_flags & NFSLCK_WANTNODELEG) == 0)) { - new_deleg->ls_stateid.seqid = - delegstateidp->seqid = 1; - new_deleg->ls_stateid.other[0] = - delegstateidp->other[0] - = clp->lc_clientid.lval[0]; - new_deleg->ls_stateid.other[1] = - delegstateidp->other[1] - = clp->lc_clientid.lval[1]; - new_deleg->ls_stateid.other[2] = - delegstateidp->other[2] - = nfsrv_nextstateindex(clp); - if (writedeleg && !NFSVNO_EXRDONLY(exp) && - (nfsrv_writedelegifpos || !readonly) && - ((nd->nd_flag & ND_NFSV41) == 0 || - (new_stp->ls_flags & NFSLCK_WANTRDELEG) == - 0)) { - new_deleg->ls_flags = - (NFSLCK_DELEGWRITE | - NFSLCK_READACCESS | - NFSLCK_WRITEACCESS); - *rflagsp |= NFSV4OPEN_WRITEDELEGATE; - nfsrv_writedelegcnt++; - } else { - new_deleg->ls_flags = - (NFSLCK_DELEGREAD | - NFSLCK_READACCESS); - *rflagsp |= NFSV4OPEN_READDELEGATE; - } - new_deleg->ls_uid = new_stp->ls_uid; - new_deleg->ls_lfp = lfp; - new_deleg->ls_clp = clp; - new_deleg->ls_filerev = filerev; - new_deleg->ls_compref = nd->nd_compref; - new_deleg->ls_lastrecall = 0; - LIST_INSERT_HEAD(&lfp->lf_deleg, new_deleg, - ls_file); - LIST_INSERT_HEAD(NFSSTATEHASH(clp, - new_deleg->ls_stateid), new_deleg, ls_hash); - LIST_INSERT_HEAD(&clp->lc_deleg, new_deleg, - ls_list); - new_deleg = NULL; - NFSD_VNET(nfsstatsv1_p)->srvdelegates++; - nfsrv_openpluslock++; - nfsrv_delegatecnt++; - } - /* - * Since NFSv4.1 never does an OpenConfirm, the first - * open state will be acquired here. - */ - if (!(clp->lc_flags & LCL_STAMPEDSTABLE)) { - clp->lc_flags |= LCL_STAMPEDSTABLE; - len = clp->lc_idlen; - NFSBCOPY(clp->lc_id, clidp, len); - gotstate = 1; - } + /* + * Since NFSv4.1 never does an OpenConfirm, the first + * open state will be acquired here. 
+ */ + if (!(clp->lc_flags & LCL_STAMPEDSTABLE)) { + clp->lc_flags |= LCL_STAMPEDSTABLE; + len = clp->lc_idlen; + NFSBCOPY(clp->lc_id, clidp, len); + gotstate = 1; + } } else { - *rflagsp |= NFSV4OPEN_RESULTCONFIRM; - new_stp->ls_flags = NFSLCK_NEEDSCONFIRM; + *rflagsp |= NFSV4OPEN_RESULTCONFIRM; + new_stp->ls_flags = NFSLCK_NEEDSCONFIRM; } nfsrvd_refcache(new_stp->ls_op); new_stp->ls_noopens = 0; @@ -3467,7 +3447,6 @@ nfsrv_openupdate(vnode_t vp, struct nfsstate *new_stp, nfsquad_t clientid, { struct nfsstate *stp; struct nfsclient *clp; - struct nfslockfile *lfp; u_int32_t bits; int error = 0, gotstate = 0, len = 0; u_char *clidp = NULL; @@ -3562,9 +3541,7 @@ nfsrv_openupdate(vnode_t vp, struct nfsstate *new_stp, nfsquad_t clientid, NFSBCOPY(clp->lc_id, clidp, len); gotstate = 1; } - NFSUNLOCKSTATE(); } else if (new_stp->ls_flags & NFSLCK_CLOSE) { - lfp = stp->ls_lfp; if (retwriteaccessp != NULL) { if ((stp->ls_flags & NFSLCK_WRITEACCESS) != 0) *retwriteaccessp = 1; @@ -3572,20 +3549,10 @@ nfsrv_openupdate(vnode_t vp, struct nfsstate *new_stp, nfsquad_t clientid, *retwriteaccessp = 0; } if (nfsrv_dolocallocks != 0 && !LIST_EMPTY(&stp->ls_open)) { - /* Get the lf lock */ - nfsrv_locklf(lfp); - NFSUNLOCKSTATE(); ASSERT_VOP_ELOCKED(vp, "nfsrv_openupdate"); - NFSVOPUNLOCK(vp); - if (nfsrv_freeopen(stp, vp, 1, p) == 0) { - NFSLOCKSTATE(); - nfsrv_unlocklf(lfp); - NFSUNLOCKSTATE(); - } - NFSVOPLOCK(vp, LK_EXCLUSIVE | LK_RETRY); + nfsrv_freeopen(stp, vp, 1, p); } else { - (void) nfsrv_freeopen(stp, NULL, 0, p); - NFSUNLOCKSTATE(); + nfsrv_freeopen(stp, NULL, 0, p); } } else { /* @@ -3603,8 +3570,8 @@ nfsrv_openupdate(vnode_t vp, struct nfsstate *new_stp, nfsquad_t clientid, if ((nd->nd_flag & ND_NFSV41) != 0 && stp->ls_stateid.seqid == 0) stp->ls_stateid.seqid = 1; - NFSUNLOCKSTATE(); } + NFSUNLOCKSTATE(); /* * If the client just confirmed its first open, write a timestamp @@ -4419,11 +4386,13 @@ nfsrv_checkgrace(struct nfsrv_descript *nd, struct nfsclient *clp, * 
ReclaimComplete. If so, grace can end now. */ notreclaimed = 0; - LIST_FOREACH(sp, &NFSD_VNET(nfsrv_stablefirst).nsf_head, - nst_list) { - if ((sp->nst_flag & NFSNST_RECLAIMED) == 0) { - notreclaimed = 1; - break; + if (!NFSD_VNET(nfsd_disable_grace)) { + LIST_FOREACH(sp, &NFSD_VNET(nfsrv_stablefirst).nsf_head, + nst_list) { + if ((sp->nst_flag & NFSNST_RECLAIMED) == 0) { + notreclaimed = 1; + break; + } } } if (notreclaimed == 0) @@ -4616,7 +4585,7 @@ nfsrv_docallback(struct nfsclient *clp, int procnum, nfsv4stateid_t *stateidp, if (procnum != NFSV4PROC_CBNULL) nfsv4_freeslot(&sep->sess_cbsess, slotpos, true); - nfsrv_freesession(NULL, sep, NULL); + nfsrv_freesession(NULL, sep, NULL, false, NULL); } else if (nd->nd_procnum == NFSV4PROC_CBNULL) error = newnfs_connect(NULL, &clp->lc_req, cred, NULL, 1, dotls, &clp->lc_req.nr_client); @@ -4665,7 +4634,7 @@ nfsrv_docallback(struct nfsclient *clp, int procnum, nfsv4stateid_t *stateidp, nfsv4_freeslot(&sep->sess_cbsess, slotpos, true); } - nfsrv_freesession(NULL, sep, NULL); + nfsrv_freesession(NULL, sep, NULL, false, NULL); } else error = newnfs_request(nd, NULL, clp, &clp->lc_req, NULL, NULL, cred, clp->lc_program, @@ -4706,7 +4675,7 @@ errout: } else if (error == 0 && procnum == NFSV4OP_CBGETATTR) error = nfsv4_loadattr(nd, NULL, nap, NULL, NULL, 0, NULL, NULL, NULL, NULL, NULL, 0, NULL, NULL, NULL, - p, NULL); + NULL, p, NULL); m_freem(nd->nd_mrep); } NFSLOCKSTATE(); @@ -5179,6 +5148,11 @@ nfsrv_markreclaim(struct nfsclient *clp) * Now, just set the flag. */ sp->nst_flag |= NFSNST_RECLAIMED; + + /* + * Free up any old delegations. 
+ */ + nfsrv_freedeleglist(&clp->lc_olddeleg); } /* @@ -5263,7 +5237,7 @@ nfsrv_clientconflict(struct nfsclient *clp, int *haslockp, vnode_t vp, */ nfsrv_writestable(clp->lc_id, clp->lc_idlen, NFSNST_REVOKE, p); nfsrv_backupstable(); - nfsrv_cleanclient(clp, p); + nfsrv_cleanclient(clp, p, false, NULL); nfsrv_freedeleglist(&clp->lc_deleg); nfsrv_freedeleglist(&clp->lc_olddeleg); LIST_REMOVE(clp, lc_hash); @@ -5455,7 +5429,7 @@ nfsrv_delegconflict(struct nfsstate *stp, int *haslockp, NFSPROC_T *p, nfsrv_writestable(clp->lc_id, clp->lc_idlen, NFSNST_REVOKE, p); nfsrv_backupstable(); if (clp->lc_expiry < NFSD_MONOSEC) { - nfsrv_cleanclient(clp, p); + nfsrv_cleanclient(clp, p, false, NULL); nfsrv_freedeleglist(&clp->lc_deleg); nfsrv_freedeleglist(&clp->lc_olddeleg); LIST_REMOVE(clp, lc_hash); @@ -6262,7 +6236,7 @@ nfsrv_throwawayallstate(NFSPROC_T *p) for (i = 0; i < nfsrv_clienthashsize; i++) { LIST_FOREACH_SAFE(clp, &NFSD_VNET(nfsclienthash)[i], lc_hash, nclp) { - nfsrv_cleanclient(clp, p); + nfsrv_cleanclient(clp, p, false, NULL); nfsrv_freedeleglist(&clp->lc_deleg); nfsrv_freedeleglist(&clp->lc_olddeleg); free(clp->lc_stateid, M_NFSDCLIENT); @@ -6485,7 +6459,7 @@ nfsrv_destroysession(struct nfsrv_descript *nd, uint8_t *sessionid) } while (igotlock == 0); NFSUNLOCKV4ROOTMUTEX(); - error = nfsrv_freesession(nd, NULL, sessionid); + error = nfsrv_freesession(nd, NULL, sessionid, false, NULL); if (error == 0 && samesess != 0) nd->nd_flag &= ~ND_HASSEQUENCE; @@ -6581,12 +6555,13 @@ out: */ static int nfsrv_freesession(struct nfsrv_descript *nd, struct nfsdsession *sep, - uint8_t *sessionid) + uint8_t *sessionid, bool locked, SVCXPRT **old_xprtp) { struct nfssessionhash *shp; int i; - NFSLOCKSTATE(); + if (!locked) + NFSLOCKSTATE(); if (sep == NULL) { shp = NFSSESSIONHASH(sessionid); NFSLOCKSESSION(shp); @@ -6600,28 +6575,36 @@ nfsrv_freesession(struct nfsrv_descript *nd, struct nfsdsession *sep, if (nd != NULL && nfsrv_checkmachcred(NFSV4OP_DESTROYSESSION, nd, 
sep->sess_clp) != 0) { NFSUNLOCKSESSION(shp); - NFSUNLOCKSTATE(); + if (!locked) + NFSUNLOCKSTATE(); return (NFSERR_AUTHERR | AUTH_TOOWEAK); } sep->sess_refcnt--; if (sep->sess_refcnt > 0) { NFSUNLOCKSESSION(shp); - NFSUNLOCKSTATE(); + if (!locked) + NFSUNLOCKSTATE(); return (NFSERR_BACKCHANBUSY); } LIST_REMOVE(sep, sess_hash); LIST_REMOVE(sep, sess_list); } NFSUNLOCKSESSION(shp); - NFSUNLOCKSTATE(); + if (!locked) + NFSUNLOCKSTATE(); if (sep == NULL) return (NFSERR_BADSESSION); for (i = 0; i < NFSV4_SLOTS; i++) if (sep->sess_slots[i].nfssl_reply != NULL) m_freem(sep->sess_slots[i].nfssl_reply); - if (sep->sess_cbsess.nfsess_xprt != NULL) - SVC_RELEASE(sep->sess_cbsess.nfsess_xprt); + if (!locked) { + if (sep->sess_cbsess.nfsess_xprt != NULL) + SVC_RELEASE(sep->sess_cbsess.nfsess_xprt); + if (old_xprtp != NULL) + *old_xprtp = NULL; + } else if (old_xprtp != NULL) + *old_xprtp = sep->sess_cbsess.nfsess_xprt; free(sep, M_NFSDSESSION); return (0); } @@ -8943,3 +8926,112 @@ nfsrv_checkmachcred(int op, struct nfsrv_descript *nd, struct nfsclient *clp) return (0); return (NFSERR_AUTHERR | AUTH_TOOWEAK); } + +/* + * Issue a delegation and, optionally set rflagsp for why not. 
+ */ +static void +nfsrv_issuedelegation(struct vnode *vp, struct nfsclient *clp, + struct nfsrv_descript *nd, int delegate, int writedeleg, int readonly, + u_quad_t filerev, uint64_t rdonly, struct nfsstate **new_delegp, + struct nfsstate *new_stp, struct nfslockfile *lfp, uint32_t *rflagsp, + nfsv4stateid_t *delegstateidp) +{ + struct nfsstate *up_deleg, *new_deleg; + + new_deleg = *new_delegp; + up_deleg = LIST_FIRST(&lfp->lf_deleg); + if ((new_stp->ls_flags & NFSLCK_WANTNODELEG) != 0) + *rflagsp |= NFSV4OPEN_WDNOTWANTED; + else if (nfsrv_issuedelegs == 0) + *rflagsp |= NFSV4OPEN_WDSUPPFTYPE; + else if (NFSRV_V4DELEGLIMIT(nfsrv_delegatecnt)) + *rflagsp |= NFSV4OPEN_WDRESOURCE; + else if (delegate == 0 || !NFSVNO_DELEGOK(vp) || + (writedeleg == 0 && (readonly == 0 || + (new_stp->ls_flags & NFSLCK_WANTWDELEG) != 0)) || + (clp->lc_flags & (LCL_CALLBACKSON | LCL_CBDOWN)) != + LCL_CALLBACKSON) { + /* Is this a downgrade attempt? */ + if (up_deleg != NULL && up_deleg->ls_clp == clp && + (up_deleg->ls_flags & NFSLCK_DELEGWRITE) != 0 && + (new_stp->ls_flags & NFSLCK_WANTRDELEG) != 0) + *rflagsp |= NFSV4OPEN_WDNOTSUPPDOWNGRADE; + else + *rflagsp |= NFSV4OPEN_WDCONTENTION; + } else if (up_deleg != NULL && + (up_deleg->ls_flags & NFSLCK_DELEGREAD) != 0 && + (new_stp->ls_flags & NFSLCK_WANTWDELEG) != 0) { + /* This is an atomic upgrade. 
*/ + up_deleg->ls_stateid.seqid++; + delegstateidp->seqid = up_deleg->ls_stateid.seqid; + delegstateidp->other[0] = up_deleg->ls_stateid.other[0]; + delegstateidp->other[1] = up_deleg->ls_stateid.other[1]; + delegstateidp->other[2] = up_deleg->ls_stateid.other[2]; + up_deleg->ls_flags = (NFSLCK_DELEGWRITE | + NFSLCK_READACCESS | NFSLCK_WRITEACCESS); + *rflagsp |= NFSV4OPEN_WRITEDELEGATE; + nfsrv_writedelegcnt++; + } else { + new_deleg->ls_stateid.seqid = delegstateidp->seqid = 1; + new_deleg->ls_stateid.other[0] = delegstateidp->other[0] + = clp->lc_clientid.lval[0]; + new_deleg->ls_stateid.other[1] = delegstateidp->other[1] + = clp->lc_clientid.lval[1]; + new_deleg->ls_stateid.other[2] = delegstateidp->other[2] + = nfsrv_nextstateindex(clp); + if (writedeleg && !rdonly && + (nfsrv_writedelegifpos || !readonly) && + (new_stp->ls_flags & (NFSLCK_WANTRDELEG | + NFSLCK_WANTWDELEG)) != NFSLCK_WANTRDELEG) { + new_deleg->ls_flags = (NFSLCK_DELEGWRITE | + NFSLCK_READACCESS | NFSLCK_WRITEACCESS); + *rflagsp |= NFSV4OPEN_WRITEDELEGATE; + nfsrv_writedelegcnt++; + } else { + new_deleg->ls_flags = (NFSLCK_DELEGREAD | + NFSLCK_READACCESS); + *rflagsp |= NFSV4OPEN_READDELEGATE; + } + new_deleg->ls_uid = new_stp->ls_uid; + new_deleg->ls_lfp = lfp; + new_deleg->ls_clp = clp; + new_deleg->ls_filerev = filerev; + new_deleg->ls_compref = nd->nd_compref; + new_deleg->ls_lastrecall = 0; + LIST_INSERT_HEAD(&lfp->lf_deleg, new_deleg, ls_file); + LIST_INSERT_HEAD(NFSSTATEHASH(clp, new_deleg->ls_stateid), + new_deleg, ls_hash); + LIST_INSERT_HEAD(&clp->lc_deleg, new_deleg, ls_list); + *new_delegp = NULL; + NFSD_VNET(nfsstatsv1_p)->srvdelegates++; + nfsrv_openpluslock++; + nfsrv_delegatecnt++; + } +} + +/* + * Find and remove any delegations for the fh. 
+ */ +void +nfsrv_removedeleg(fhandle_t *fhp, struct nfsrv_descript *nd, NFSPROC_T *p) +{ + struct nfsclient *clp; + struct nfsstate *stp, *nstp; + struct nfslockfile *lfp; + int error; + + NFSLOCKSTATE(); + error = nfsrv_getclient(nd->nd_clientid, CLOPS_RENEW, &clp, NULL, + (nfsquad_t)((u_quad_t)0), 0, nd, p); + if (error == 0) + error = nfsrv_getlockfile(NFSLCK_CHECK, NULL, &lfp, fhp, 0); + /* + * Now we must free any delegations. + */ + if (error == 0) { + LIST_FOREACH_SAFE(stp, &lfp->lf_deleg, ls_file, nstp) + nfsrv_freedeleg(stp); + } + NFSUNLOCKSTATE(); +} diff --git a/sys/fs/nfsserver/nfs_nfsdsubs.c b/sys/fs/nfsserver/nfs_nfsdsubs.c index 0d7e4c73fe69..b09ec1b3a062 100644 --- a/sys/fs/nfsserver/nfs_nfsdsubs.c +++ b/sys/fs/nfsserver/nfs_nfsdsubs.c @@ -57,9 +57,6 @@ NFSD_VNET_DECLARE(int, nfs_rootfhset); NFSD_VNET_DECLARE(uid_t, nfsrv_defaultuid); NFSD_VNET_DECLARE(gid_t, nfsrv_defaultgid); -NFSD_VNET_DEFINE(struct nfsdontlisthead, nfsrv_dontlisthead); - - char nfs_v2pubfh[NFSX_V2FH]; struct nfsdontlisthead nfsrv_dontlisthead; struct nfslayouthead nfsrv_recalllisthead; @@ -1476,8 +1473,9 @@ int nfsrv_mtofh(struct nfsrv_descript *nd, struct nfsrvfh *fhp) { u_int32_t *tl; - int error = 0, len, copylen; + int error = 0, len, copylen, namedlen; + namedlen = 0; if (nd->nd_flag & (ND_NFSV3 | ND_NFSV4)) { NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); len = fxdr_unsigned(int, *tl); @@ -1493,6 +1491,11 @@ nfsrv_mtofh(struct nfsrv_descript *nd, struct nfsrvfh *fhp) copylen = NFSX_MYFH; len = NFSM_RNDUP(len); nd->nd_flag |= ND_DSSERVER; + } else if (len >= NFSX_MYFH + NFSX_V4NAMEDDIRFH && + len <= NFSX_MYFH + NFSX_V4NAMEDATTRFH) { + copylen = NFSX_MYFH; + namedlen = len; + len = NFSM_RNDUP(len); } else if (len < NFSRV_MINFH || len > NFSRV_MAXFH) { if (nd->nd_flag & ND_NFSV4) { if (len > 0 && len <= NFSX_V4FHMAX) { @@ -1527,7 +1530,10 @@ nfsrv_mtofh(struct nfsrv_descript *nd, struct nfsrvfh *fhp) goto nfsmout; } NFSBCOPY(tl, (caddr_t)fhp->nfsrvfh_data, copylen); - 
fhp->nfsrvfh_len = copylen; + if (namedlen > 0) + fhp->nfsrvfh_len = namedlen; + else + fhp->nfsrvfh_len = copylen; nfsmout: NFSEXITCODE2(error, nd); return (error); @@ -1623,7 +1629,7 @@ nfsrv_checkuidgid(struct nfsrv_descript *nd, struct nfsvattr *nvap) if (nd->nd_cred->cr_uid == 0) goto out; if ((NFSVNO_ISSETUID(nvap) && nvap->na_uid != nd->nd_cred->cr_uid) || - (NFSVNO_ISSETGID(nvap) && nvap->na_gid != nd->nd_cred->cr_gid && + (NFSVNO_ISSETGID(nvap) && !groupmember(nvap->na_gid, nd->nd_cred))) error = NFSERR_PERM; @@ -1682,8 +1688,7 @@ nfsrv_fixattr(struct nfsrv_descript *nd, vnode_t vp, } if (NFSISSET_ATTRBIT(attrbitp, NFSATTRBIT_OWNERGROUP) && NFSVNO_ISSETGID(nvap)) { - if (nvap->na_gid == nd->nd_cred->cr_gid || - groupmember(nvap->na_gid, nd->nd_cred)) { + if (groupmember(nvap->na_gid, nd->nd_cred)) { nd->nd_cred->cr_uid = 0; nva.na_gid = nvap->na_gid; change++; diff --git a/sys/fs/nullfs/null_subr.c b/sys/fs/nullfs/null_subr.c index 0356877eaf05..7dcc83880bb9 100644 --- a/sys/fs/nullfs/null_subr.c +++ b/sys/fs/nullfs/null_subr.c @@ -245,6 +245,10 @@ null_nodeget(struct mount *mp, struct vnode *lowervp, struct vnode **vpp) vp->v_object = lowervp->v_object; vn_irflag_set(vp, VIRF_PGREAD); } + if ((vn_irflag_read(lowervp) & VIRF_INOTIFY) != 0) + vn_irflag_set(vp, VIRF_INOTIFY); + if ((vn_irflag_read(lowervp) & VIRF_INOTIFY_PARENT) != 0) + vn_irflag_set(vp, VIRF_INOTIFY_PARENT); if (lowervp == MOUNTTONULLMOUNT(mp)->nullm_lowerrootvp) vp->v_vflag |= VV_ROOT; diff --git a/sys/fs/nullfs/null_vfsops.c b/sys/fs/nullfs/null_vfsops.c index 7ab1fb6c1a25..4cddf24a5745 100644 --- a/sys/fs/nullfs/null_vfsops.c +++ b/sys/fs/nullfs/null_vfsops.c @@ -365,12 +365,7 @@ nullfs_statfs(struct mount *mp, struct statfs *sbp) return (error); } - /* now copy across the "interesting" information and fake the rest */ sbp->f_type = mstat->f_type; - sbp->f_flags &= MNT_RDONLY | MNT_NOEXEC | MNT_NOSUID | MNT_UNION | - MNT_NOSYMFOLLOW | MNT_AUTOMOUNTED | MNT_EXPORTED | MNT_IGNORE; - 
mstat->f_flags &= ~(MNT_ROOTFS | MNT_AUTOMOUNTED | MNT_EXPORTED); - sbp->f_flags |= mstat->f_flags; sbp->f_bsize = mstat->f_bsize; sbp->f_iosize = mstat->f_iosize; sbp->f_blocks = mstat->f_blocks; diff --git a/sys/fs/nullfs/null_vnops.c b/sys/fs/nullfs/null_vnops.c index 4747b1dd5b82..74c1a8f3acb6 100644 --- a/sys/fs/nullfs/null_vnops.c +++ b/sys/fs/nullfs/null_vnops.c @@ -190,6 +190,26 @@ SYSCTL_INT(_debug, OID_AUTO, nullfs_bug_bypass, CTLFLAG_RW, &null_bug_bypass, 0, ""); /* + * Synchronize inotify flags with the lower vnode: + * - If the upper vnode has the flag set and the lower does not, then the lower + * vnode is unwatched and the upper vnode does not need to go through + * VOP_INOTIFY. + * - If the lower vnode is watched, then the upper vnode should go through + * VOP_INOTIFY, so copy the flag up. + */ +static void +null_copy_inotify(struct vnode *vp, struct vnode *lvp, short flag) +{ + if ((vn_irflag_read(vp) & flag) != 0) { + if (__predict_false((vn_irflag_read(lvp) & flag) == 0)) + vn_irflag_unset(vp, flag); + } else if ((vn_irflag_read(lvp) & flag) != 0) { + if (__predict_false((vn_irflag_read(vp) & flag) == 0)) + vn_irflag_set(vp, flag); + } +} + +/* * This is the 10-Apr-92 bypass routine. * This version has been optimized for speed, throwing away some * safety checks. It should still always work, but it's not as @@ -305,7 +325,10 @@ null_bypass(struct vop_generic_args *ap) lvp = *(vps_p[i]); /* - * Get rid of the transient hold on lvp. + * Get rid of the transient hold on lvp. Copy inotify + * flags up in case something is watching the lower + * layer. + * * If lowervp was unlocked during VOP * operation, nullfs upper vnode could have * been reclaimed, which changes its v_vnlock @@ -314,6 +337,10 @@ null_bypass(struct vop_generic_args *ap) * upper (reclaimed) vnode. 
*/ if (lvp != NULLVP) { + null_copy_inotify(old_vps[i], lvp, + VIRF_INOTIFY); + null_copy_inotify(old_vps[i], lvp, + VIRF_INOTIFY_PARENT); if (VOP_ISLOCKED(lvp) == LK_EXCLUSIVE && old_vps[i]->v_vnlock != lvp->v_vnlock) { VOP_UNLOCK(lvp); @@ -385,7 +412,7 @@ null_lookup(struct vop_lookup_args *ap) { struct componentname *cnp = ap->a_cnp; struct vnode *dvp = ap->a_dvp; - int flags = cnp->cn_flags; + uint64_t flags = cnp->cn_flags; struct vnode *vp, *ldvp, *lvp; struct mount *mp; int error; @@ -403,17 +430,25 @@ null_lookup(struct vop_lookup_args *ap) /* * Renames in the lower mounts might create an inconsistent - * configuration where lower vnode is moved out of the - * directory tree remounted by our null mount. Do not try to - * handle it fancy, just avoid VOP_LOOKUP() with DOTDOT name - * which cannot be handled by VOP, at least passing over lower - * root. + * configuration where lower vnode is moved out of the directory tree + * remounted by our null mount. + * + * Do not try to handle it fancy, just avoid VOP_LOOKUP() with DOTDOT + * name which cannot be handled by the VOP. 
*/ - if ((ldvp->v_vflag & VV_ROOT) != 0 && (flags & ISDOTDOT) != 0) { - KASSERT((dvp->v_vflag & VV_ROOT) == 0, - ("ldvp %p fl %#x dvp %p fl %#x flags %#x", - ldvp, ldvp->v_vflag, dvp, dvp->v_vflag, flags)); - return (ENOENT); + if ((flags & ISDOTDOT) != 0) { + struct nameidata *ndp; + + if ((ldvp->v_vflag & VV_ROOT) != 0) { + KASSERT((dvp->v_vflag & VV_ROOT) == 0, + ("ldvp %p fl %#x dvp %p fl %#x flags %#jx", + ldvp, ldvp->v_vflag, dvp, dvp->v_vflag, + (uintmax_t)flags)); + return (ENOENT); + } + ndp = vfs_lookup_nameidata(cnp); + if (ndp != NULL && vfs_lookup_isroot(ndp, ldvp)) + return (ENOENT); } /* @@ -528,7 +563,7 @@ null_setattr(struct vop_setattr_args *ap) } } - return (null_bypass((struct vop_generic_args *)ap)); + return (null_bypass(&ap->a_gen)); } /* @@ -539,7 +574,7 @@ null_stat(struct vop_stat_args *ap) { int error; - if ((error = null_bypass((struct vop_generic_args *)ap)) != 0) + if ((error = null_bypass(&ap->a_gen)) != 0) return (error); ap->a_sb->st_dev = ap->a_vp->v_mount->mnt_stat.f_fsid.val[0]; @@ -551,7 +586,7 @@ null_getattr(struct vop_getattr_args *ap) { int error; - if ((error = null_bypass((struct vop_generic_args *)ap)) != 0) + if ((error = null_bypass(&ap->a_gen)) != 0) return (error); ap->a_vap->va_fsid = ap->a_vp->v_mount->mnt_stat.f_fsid.val[0]; @@ -584,7 +619,7 @@ null_access(struct vop_access_args *ap) break; } } - return (null_bypass((struct vop_generic_args *)ap)); + return (null_bypass(&ap->a_gen)); } static int @@ -610,7 +645,7 @@ null_accessx(struct vop_accessx_args *ap) break; } } - return (null_bypass((struct vop_generic_args *)ap)); + return (null_bypass(&ap->a_gen)); } /* diff --git a/sys/fs/p9fs/p9_client.c b/sys/fs/p9fs/p9_client.c new file mode 100644 index 000000000000..547de98c4c03 --- /dev/null +++ b/sys/fs/p9fs/p9_client.c @@ -0,0 +1,1332 @@ +/*- + * Copyright (c) 2017 Juniper Networks, Inc. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +/* + * This file contains 9P client functions which prepares message to be sent to + * the server. Every fileop typically has a function defined here to interact + * with the host. 
+ */ + +#include <vm/uma.h> +#include <sys/systm.h> +#include <sys/dirent.h> +#include <sys/fcntl.h> +#include <sys/param.h> +#include <sys/malloc.h> +#include <sys/mount.h> +#include <sys/sysctl.h> + +#include <fs/p9fs/p9_client.h> +#include <fs/p9fs/p9_debug.h> +#include <fs/p9fs/p9_transport.h> + +#define QEMU_HEADER 7 +#define P9FS_MAX_FID_CNT (1024 * 1024 * 1024) +#define P9FS_ROOT_FID_NO 2 +#define P9FS_MIN_TAG 1 +#define P9FS_MAX_TAG 65535 +#define WSTAT_SIZE 47 +#define WSTAT_EXTENSION_SIZE 14 + +static MALLOC_DEFINE(M_P9CLNT, "p9_client", "p9fs client structure"); +static uma_zone_t p9fs_fid_zone; +static uma_zone_t p9fs_req_zone; +static uma_zone_t p9fs_buf_zone; + +SYSCTL_DECL(_vfs_p9fs); +int p9_debug_level = 0; +SYSCTL_INT(_vfs_p9fs, OID_AUTO, debug_level, CTLFLAG_RW, + &p9_debug_level, 0, "p9fs debug logging"); + +static struct p9_req_t *p9_get_request(struct p9_client *c, int *error); +static struct p9_req_t *p9_client_request( + struct p9_client *c, int8_t type, int *error, const char *fmt, ...); + +inline int +p9_is_proto_dotl(struct p9_client *clnt) +{ + + return (clnt->proto_version == p9_proto_2000L); +} + +inline int +p9_is_proto_dotu(struct p9_client *clnt) +{ + + return (clnt->proto_version == p9_proto_2000u); +} + +/* Parse mount options into client structure */ +static int +p9_parse_opts(struct mount *mp, struct p9_client *clnt) +{ + int error, len; + char *trans; + + /* + * Default to virtio since thats the only transport we have for now. 
+ */ + error = vfs_getopt(mp->mnt_optnew, "trans", (void **)&trans, &len); + if (error == ENOENT) + trans = "virtio"; + + /* These are defaults for now */ + clnt->proto_version = p9_proto_2000L; + clnt->msize = 8192; + + /* Get the default trans callback */ + clnt->ops = p9_get_trans_by_name(trans); + + return (0); +} + +/* Allocate buffer for sending request and getting responses */ +static struct p9_buffer * +p9_buffer_alloc(int alloc_msize) +{ + struct p9_buffer *fc; + + fc = uma_zalloc(p9fs_buf_zone, M_WAITOK | M_ZERO); + fc->capacity = alloc_msize; + fc->offset = 0; + fc->size = 0; + fc->sdata = (char *)fc + sizeof(struct p9_buffer); + + return (fc); +} + +/* Free memory used by request and response buffers */ +static void +p9_buffer_free(struct p9_buffer **buf) +{ + + /* Free the sdata buffers first, then the whole structure*/ + uma_zfree(p9fs_buf_zone, *buf); + *buf = NULL; +} + +/* Free the request */ +static void +p9_free_req(struct p9_client *clnt, struct p9_req_t *req) +{ + + if (req->tc != NULL) { + if (req->tc->tag != P9_NOTAG) + p9_tag_destroy(clnt, req->tc->tag); + p9_buffer_free(&req->tc); + } + + if (req->rc != NULL) + p9_buffer_free(&req->rc); + + uma_zfree(p9fs_req_zone, req); +} + +/* Allocate a request by tag */ +static struct p9_req_t * +p9_get_request(struct p9_client *clnt, int *error) +{ + struct p9_req_t *req; + int alloc_msize; + uint16_t tag; + + alloc_msize = P9FS_MTU; + + req = uma_zalloc(p9fs_req_zone, M_WAITOK | M_ZERO); + req->tc = p9_buffer_alloc(alloc_msize); + req->rc = p9_buffer_alloc(alloc_msize); + + tag = p9_tag_create(clnt); + if (tag == P9_NOTAG) { + *error = EAGAIN; + req->tc->tag = P9_NOTAG; + p9_free_req(clnt, req); + return (NULL); + } + req->tc->tag = tag; + return (req); +} + +/* Parse header arguments of the response buffer */ +static int +p9_parse_receive(struct p9_buffer *buf, struct p9_client *clnt) +{ + int8_t type; + int16_t tag; + int32_t size; + int error; + + buf->offset = 0; + + /* This value is set by QEMU 
for the header.*/ + if (buf->size == 0) + buf->size = QEMU_HEADER; + + /* This is the initial header. Parse size, type, and tag .*/ + error = p9_buf_readf(buf, 0, "dbw", &size, &type, &tag); + if (error != 0) + goto out; + + buf->size = size; + buf->id = type; + buf->tag = tag; + P9_DEBUG(TRANS, "%s: size=%d type: %d tag: %d\n", + __func__, buf->size, buf->id, buf->tag); +out: + return (error); +} + +/* Check 9P response for any errors returned and process it */ +static int +p9_client_check_return(struct p9_client *c, struct p9_req_t *req) +{ + int error; + int ecode; + char *ename; + + /* Check what we have in the receive bufer .*/ + error = p9_parse_receive(req->rc, c); + if (error != 0) + goto out; + + /* + * No error, We are done with the preprocessing. Return to the caller + * and process the actual data. + */ + if (req->rc->id != P9PROTO_RERROR && req->rc->id != P9PROTO_RLERROR) + return (0); + + /* + * Interpreting the error is done in different ways for Linux and + * Unix version. Make sure you interpret it right. + */ + if (req->rc->id == P9PROTO_RERROR) { + error = p9_buf_readf(req->rc, c->proto_version, "s?d", &ename, &ecode); + } else if (req->rc->id == P9PROTO_RLERROR) { + error = p9_buf_readf(req->rc, c->proto_version, "d", &ecode); + } else { + goto out; + } + if (error != 0) + goto out; + + /* if there was an ecode error make this the err now */ + error = ecode; + + /* + * Note this is still not completely an error, as lookups for files + * not present can hit this and return. Hence it is made a debug print. 
+ */ + if (error != 0) { + if (req->rc->id == P9PROTO_RERROR) { + P9_DEBUG(PROTO, "RERROR error %d ename %s\n", + error, ename); + } else if (req->rc->id == P9PROTO_RLERROR) { + P9_DEBUG(PROTO, "RLERROR error %d\n", error); + } + } + + if (req->rc->id == P9PROTO_RERROR) { + free(ename, M_TEMP); + } + return (error); + +out: + P9_DEBUG(ERROR, "couldn't parse receive buffer error%d\n", error); + return (error); +} + +/* State machine changing helpers */ +void p9_client_disconnect(struct p9_client *clnt) +{ + + P9_DEBUG(TRANS, "%s: clnt %p\n", __func__, clnt); + clnt->trans_status = P9FS_DISCONNECT; +} + +void p9_client_begin_disconnect(struct p9_client *clnt) +{ + + P9_DEBUG(TRANS, "%s: clnt %p\n", __func__, clnt); + clnt->trans_status = P9FS_BEGIN_DISCONNECT; +} + +static struct p9_req_t * +p9_client_prepare_req(struct p9_client *c, int8_t type, + int req_size, int *error, const char *fmt, __va_list ap) +{ + struct p9_req_t *req; + + P9_DEBUG(TRANS, "%s: client %p op %d\n", __func__, c, type); + + /* + * Before we start with the request, check if its possible to finish + * this request. We are allowed to submit the request only if there + * are no close sessions happening or else there can be race. If the + * status is Disconnected, we stop any requests coming in after that. + */ + if (c->trans_status == P9FS_DISCONNECT) { + *error = EIO; + return (NULL); + } + + /* Allow only cleanup clunk messages once teardown has started. 
*/ + if ((c->trans_status == P9FS_BEGIN_DISCONNECT) && + (type != P9PROTO_TCLUNK)) { + *error = EIO; + return (NULL); + } + + /* Allocate buffer for transferring and receiving data from host */ + req = p9_get_request(c, error); + if (*error != 0) { + P9_DEBUG(ERROR, "%s: request allocation failed.\n", __func__); + return (NULL); + } + + /* Marshall the data according to QEMU standards */ + *error = p9_buf_prepare(req->tc, type); + if (*error != 0) { + P9_DEBUG(ERROR, "%s: p9_buf_prepare failed: %d\n", + __func__, *error); + goto out; + } + + *error = p9_buf_vwritef(req->tc, c->proto_version, fmt, ap); + if (*error != 0) { + P9_DEBUG(ERROR, "%s: p9_buf_vwrite failed: %d\n", + __func__, *error); + goto out; + } + + *error = p9_buf_finalize(c, req->tc); + if (*error != 0) { + P9_DEBUG(ERROR, "%s: p9_buf_finalize failed: %d \n", + __func__, *error); + goto out; + } + + return (req); +out: + p9_free_req(c, req); + return (NULL); +} + +/* + * Issue a request and wait for response. The routine takes care of preparing + * the 9P request header to be sent, parsing and checking for error conditions + * in the received buffer. It returns the request structure. + */ +static struct p9_req_t * +p9_client_request(struct p9_client *c, int8_t type, int *error, + const char *fmt, ...) +{ + va_list ap; + struct p9_req_t *req; + + va_start(ap, fmt); + req = p9_client_prepare_req(c, type, c->msize, error, fmt, ap); + va_end(ap); + + /* Issue with allocation of request buffer */ + if (*error != 0) + return (NULL); + + /* Call into the transport for submission. */ + *error = c->ops->request(c->handle, req); + if (*error != 0) { + P9_DEBUG(ERROR, "%s: failed: %d\n", __func__, *error); + goto out; + } + + /* + * Before we return, pre process the header and the rc buffer before + * calling into the protocol infra to analyze the data in rc. 
+ */ + *error = p9_client_check_return(c, req); + if (*error != 0) + goto out; + + return (req); +out: + p9_free_req(c, req); + return (NULL); +} + +/* Setup tag contents and structure */ +uint16_t +p9_tag_create(struct p9_client *clnt) +{ + int tag; + + tag = alloc_unr(&clnt->tagpool); + P9_DEBUG(LPROTO, "%s: clnt %p: tag %d\n", __func__, clnt, tag); + + /* Alloc_unr returning -1 is an error for no units left */ + if (tag == -1) { + return (P9_NOTAG); + } + return (tag); +} + +/* Clean up tag structures */ +void +p9_tag_destroy(struct p9_client *clnt, uint16_t tag) +{ + + P9_DEBUG(LPROTO, "%s: clnt %p: tag %d\n", __func__, clnt, tag); + + /* Release to the pool */ + free_unr(&clnt->tagpool, tag); +} + +/* Allocate a new fid from the fidpool */ +struct p9_fid * +p9_fid_create(struct p9_client *clnt) +{ + struct p9_fid *fid; + + + fid = uma_zalloc(p9fs_fid_zone, M_WAITOK | M_ZERO); + fid->fid = alloc_unr(&clnt->fidpool); + P9_DEBUG(LPROTO, "%s: fid %d\n", __func__, fid->fid); + + /* Alloc_unr returning -1 is an error for no units left */ + if (fid->fid == -1) { + uma_zfree(p9fs_fid_zone, fid); + return (NULL); + } + fid->mode = -1; + fid->uid = -1; + fid->clnt = clnt; + + return (fid); +} + +/* Free the fid by releasing it to fidpool */ +void +p9_fid_destroy(struct p9_fid *fid) +{ + struct p9_client *clnt; + + P9_DEBUG(LPROTO, "%s: fid %d\n", __func__, fid->fid); + clnt = fid->clnt; + /* Release to the pool */ + free_unr(&clnt->fidpool, fid->fid); + uma_zfree(p9fs_fid_zone, fid); +} + +/* Request the version of 9P protocol */ +int +p9_client_version(struct p9_client *c) +{ + int error; + struct p9_req_t *req; + char *version; + int msize; + + error = 0; + + P9_DEBUG(PROTO, "TVERSION msize %d protocol %d\n", + c->msize, c->proto_version); + + switch (c->proto_version) { + case p9_proto_2000L: + req = p9_client_request(c, P9PROTO_TVERSION, &error, "ds", + c->msize, "9P2000.L"); + break; + case p9_proto_2000u: + req = p9_client_request(c, P9PROTO_TVERSION, &error, 
"ds", + c->msize, "9P2000.u"); + break; + case p9_proto_legacy: + req = p9_client_request(c, P9PROTO_TVERSION, &error, "ds", + c->msize, "9P2000"); + break; + default: + return (EINVAL); + } + + /* Always return the relevant error code */ + if (error != 0) + return (error); + + error = p9_buf_readf(req->rc, c->proto_version, "ds", &msize, &version); + if (error != 0) { + P9_DEBUG(ERROR, "%s: version error: %d\n", __func__, error); + goto out; + } + + P9_DEBUG(PROTO, "RVERSION msize %d %s\n", msize, version); + + if (!strncmp(version, "9P2000.L", 8)) + c->proto_version = p9_proto_2000L; + else if (!strncmp(version, "9P2000.u", 8)) + c->proto_version = p9_proto_2000u; + else if (!strncmp(version, "9P2000", 6)) + c->proto_version = p9_proto_legacy; + else { + error = ENOMEM; + goto out; + } + + /* limit the msize .*/ + if (msize < c->msize) + c->msize = msize; +out: + p9_free_req(c, req); + return (error); +} + +/* + * Initialize zones for different things. This is called from Init module + * so that we just have them initalized once. 
+ */ +void +p9_init_zones(void) +{ + + /* Create the request and the fid zones */ + p9fs_fid_zone = uma_zcreate("p9fs fid zone", + sizeof(struct p9_fid), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); + + /* Create the request and the fid zones */ + p9fs_req_zone = uma_zcreate("p9fs req zone", + sizeof(struct p9_req_t), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); + + /* Create the buffer zone */ + p9fs_buf_zone = uma_zcreate("p9fs buf zone", + sizeof(struct p9_buffer) + P9FS_MTU, NULL, NULL, + NULL, NULL, UMA_ALIGN_PTR, 0); +} + +void +p9_destroy_zones(void) +{ + + uma_zdestroy(p9fs_fid_zone); + uma_zdestroy(p9fs_req_zone); + uma_zdestroy(p9fs_buf_zone); +} + +/* Return the client to the session in the FS to hold it */ +struct p9_client * +p9_client_create(struct mount *mp, int *error, const char *mount_tag) +{ + struct p9_client *clnt; + + clnt = malloc(sizeof(struct p9_client), M_P9CLNT, M_WAITOK | M_ZERO); + mtx_init(&clnt->clnt_mtx, "p9clnt", NULL, MTX_DEF); + + /* Parse should have set trans_mod */ + *error = p9_parse_opts(mp, clnt); + if (*error != 0) + goto out; + + if (clnt->ops == NULL) { + *error = EINVAL; + P9_DEBUG(ERROR, "%s: no transport\n", __func__); + goto out; + } + + /* All the structures from here are protected by the lock clnt_mtx */ + init_unrhdr(&clnt->fidpool, P9FS_ROOT_FID_NO, P9FS_MAX_FID_CNT, + &clnt->clnt_mtx); + init_unrhdr(&clnt->tagpool, P9FS_MIN_TAG, P9FS_MAX_TAG, + &clnt->clnt_mtx); + + P9_DEBUG(TRANS, "%s: clnt %p trans %p msize %d protocol %d\n", + __func__, clnt, clnt->ops, clnt->msize, clnt->proto_version); + + *error = clnt->ops->create(mount_tag, &clnt->handle); + if (*error != 0) { + P9_DEBUG(ERROR, "%s: transport create failed .%d \n", + __func__, *error); + goto out; + } + clnt->trans_status = P9FS_CONNECT; + + *error = p9_client_version(clnt); + if (*error != 0) + goto out; + + P9_DEBUG(TRANS, "%s: client creation succeeded.\n", __func__); + return (clnt); +out: + free(clnt, M_P9CLNT); + return (NULL); +} + +/* Destroy the 
client by destroying associated fidpool and tagpool */ +void +p9_client_destroy(struct p9_client *clnt) +{ + + P9_DEBUG(TRANS, "%s: client %p\n", __func__, clnt); + clnt->ops->close(clnt->handle); + + P9_DEBUG(TRANS, "%s : Destroying fidpool\n", __func__); + clear_unrhdr(&clnt->fidpool); + + P9_DEBUG(TRANS, "%s : Destroying tagpool\n", __func__); + clear_unrhdr(&clnt->tagpool); + + free(clnt, M_P9CLNT); +} + +/* + * Attach a user to the filesystem. Create a fid for that user to access + * the root of the filesystem. + */ +struct p9_fid * +p9_client_attach(struct p9_client *clnt, struct p9_fid *afid, + const char *uname, uid_t n_uname, const char *aname, int *error) +{ + struct p9_req_t *req; + struct p9_fid *fid; + struct p9_qid qid; + + P9_DEBUG(PROTO, "TATTACH uname=%s aname=%s, n_uname=%d\n", + uname, aname, n_uname); + fid = p9_fid_create(clnt); + if (fid == NULL) { + *error = ENOMEM; + return (NULL); + } + fid->uid = n_uname; + + req = p9_client_request(clnt, P9PROTO_TATTACH, error, "ddssd", fid->fid, + P9PROTO_NOFID, uname, aname, n_uname); + if (*error != 0) + goto out; + + *error = p9_buf_readf(req->rc, clnt->proto_version, "Q", &qid); + if (*error != 0) { + P9_DEBUG(ERROR, "%s: p9_buf_readf failed: %d \n", + __func__, *error); + goto out; + } + + P9_DEBUG(PROTO, "RATTACH qid %x.%llx.%x\n", + qid.type, (unsigned long long)qid.path, qid.version); + + memmove(&fid->qid, &qid, sizeof(struct p9_qid)); + p9_free_req(clnt, req); + + return (fid); +out: + if (req != NULL) + p9_free_req(clnt, req); + if (fid != NULL) + p9_fid_destroy(fid); + + return (NULL); +} + +/* Delete a file/directory. 
Corresponding fid will be cluncked too */ +int +p9_client_remove(struct p9_fid *fid) +{ + int error; + struct p9_client *clnt; + struct p9_req_t *req; + + P9_DEBUG(PROTO, "TREMOVE fid %d\n", fid->fid); + + error = 0; + clnt = fid->clnt; + + req = p9_client_request(clnt, P9PROTO_TREMOVE, &error, "d", fid->fid); + if (error != 0) { + P9_DEBUG(PROTO, "RREMOVE fid %d\n", fid->fid); + return (error); + } + + p9_free_req(clnt, req); + return (error); +} + +int +p9_client_unlink(struct p9_fid *dfid, const char *name, int32_t flags) +{ + int error; + struct p9_client *clnt; + struct p9_req_t *req; + + error = 0; + clnt = dfid->clnt; + + req = p9_client_request(clnt, P9PROTO_TUNLINKAT, &error, "dsd", + dfid->fid, name, flags); + if (error != 0) { + P9_DEBUG(PROTO, "RUNLINKAT fid %d\n", dfid->fid); + return (error); + } + + p9_free_req(clnt, req); + return (error); +} + +/* Inform the file server that the current file represented by fid is no longer + * needed by the client. Any allocated fid on the server needs a clunk to be + * destroyed. + */ +int +p9_client_clunk(struct p9_fid *fid) +{ + int error; + struct p9_client *clnt; + struct p9_req_t *req; + + error = 0; + + if (fid == NULL) { + P9_DEBUG(ERROR, "%s: clunk with NULL fid is bad\n", __func__); + return (0); + } + + P9_DEBUG(PROTO, "TCLUNK fid %d \n", fid->fid); + + clnt = fid->clnt; + req = p9_client_request(clnt, P9PROTO_TCLUNK, &error, "d", fid->fid); + if (req != NULL) { + P9_DEBUG(PROTO, "RCLUNK fid %d\n", fid->fid); + p9_free_req(clnt, req); + } + + p9_fid_destroy(fid); + return (error); +} + +/* + * Client_walk is for searching any component name in a directory. + * This is usually called on lookups. Also when we need a new open fid + * as 9p needs to have an open fid for every file to fileops, we call this + * validate the component of the file and return the newfid(openfid) created. 
+ */ +struct p9_fid * +p9_client_walk(struct p9_fid *oldfid, uint16_t nwnames, char **wnames, + int clone, int *error) +{ + struct p9_client *clnt; + struct p9_fid *fid; + struct p9_qid *wqids; + struct p9_req_t *req; + uint16_t nwqids, count; + + clnt = oldfid->clnt; + wqids = NULL; + nwqids = 0; + + /* + * Before, we go and create fid, make sure we are not tearing + * down. Only then we create. + * Allow only cleanup clunk messages once we are starting to teardown. + */ + if (clnt->trans_status != P9FS_CONNECT) { + *error = EIO; + return (NULL); + } + + if (clone) { + fid = p9_fid_create(clnt); + if (fid == NULL) { + *error = ENOMEM; + return (NULL); + } + fid->uid = oldfid->uid; + } else + fid = oldfid; + + P9_DEBUG(PROTO, "TWALK fids %d,%d nwnames %u wname %s\n", + oldfid->fid, fid->fid, nwnames, + wnames != NULL ? wnames[nwnames-1] : NULL); + + /* + * The newfid is for the component in search. We are preallocating as + * qemu on other side allocates or returns a fid if it sees a match + */ + req = p9_client_request(clnt, P9PROTO_TWALK, error, "ddT", oldfid->fid, + fid->fid, wnames, nwnames); + if (*error != 0) { + if (fid != oldfid) + p9_fid_destroy(fid); + return (NULL); + } + + *error = p9_buf_readf(req->rc, clnt->proto_version, "R", &nwqids, + &wqids); + if (*error != 0) + goto out; + + P9_DEBUG(PROTO, "RWALK nwqid %d:\n", nwqids); + + if (nwqids != nwnames) { + *error = ENOENT; + goto out; + } + + for (count = 0; count < nwqids; count++) + P9_DEBUG(TRANS, "%s: [%d] %x.%llx.%x\n", + __func__, count, wqids[count].type, + (unsigned long long)wqids[count].path, + wqids[count].version); + + if (nwnames) + memmove(&fid->qid, &wqids[nwqids - 1], sizeof(struct p9_qid)); + else + fid->qid = oldfid->qid; + + p9_free_req(clnt, req); + free(wqids, M_TEMP); + return (fid); + +out: + p9_free_req(clnt, req); + if (wqids) + free(wqids, M_TEMP); + if (fid && fid != oldfid) + p9_client_clunk(fid); + return (NULL); +} + +/* Open a file with given fid and mode */ +int 
+p9_client_open(struct p9_fid *fid, int mode) +{ + int error, mtu; + struct p9_client *clnt; + struct p9_req_t *req; + + error = 0; + clnt = fid->clnt; + mtu = 0; + + P9_DEBUG(PROTO, "%s fid %d mode %d\n", + p9_is_proto_dotl(clnt) ? "TLOPEN" : "TOPEN", + fid->fid, mode); + + if (fid->mode != -1) + return (EINVAL); + + if (p9_is_proto_dotl(clnt)) + req = p9_client_request(clnt, P9PROTO_TLOPEN, &error, "dd", + fid->fid, mode); + else + req = p9_client_request(clnt, P9PROTO_TOPEN, &error, "db", + fid->fid, mode); + + if (error != 0) + return (error); + + error = p9_buf_readf(req->rc, clnt->proto_version, "Qd", &fid->qid, + &mtu); + if (error != 0) + goto out; + + P9_DEBUG(PROTO, "%s qid %x.%llx.%x mtu %x\n", + p9_is_proto_dotl(clnt) ? "RLOPEN" : "ROPEN", + (fid->qid).type, (unsigned long long)(fid->qid).path, + (fid->qid).version, mtu); + + fid->mode = mode; + fid->mtu = mtu; +out: + p9_free_req(clnt, req); + return (error); +} + +/* Request to get directory entries */ +int +p9_client_readdir(struct p9_fid *fid, char *data, uint64_t offset, + uint32_t count) +{ + int error; + uint32_t rsize; + struct p9_client *clnt; + struct p9_req_t *req; + char *dataptr; + + P9_DEBUG(PROTO, "TREADDIR fid %d offset %llu count %d\n", + fid->fid, (unsigned long long) offset, count); + + error = 0; + rsize = fid->mtu; + clnt = fid->clnt; + + if (rsize == 0 || rsize > clnt->msize) + rsize = clnt->msize; + + if (count < rsize) + rsize = count; + + req = p9_client_request(clnt, P9PROTO_TREADDIR, &error, "dqd", + fid->fid, offset, rsize); + + if (error != 0) { + P9_DEBUG(ERROR, "%s: couldn't allocate req in client_readdir\n", + __func__); + return (-error); + } + + error = p9_buf_readf(req->rc, clnt->proto_version, "D", &count, + &dataptr); + if (error != 0) { + P9_DEBUG(ERROR, "%s: p0_buf_readf failed: %d\n", + __func__, error); + p9_free_req(clnt, req); + return (-error); + } + + P9_DEBUG(PROTO, "RREADDIR count %u\n", count); + + /* Copy back the data into the input buffer. 
*/ + memmove(data, dataptr, count); + p9_free_req(clnt, req); + return (count); +} + +/* + * Read count bytes from offset for the file fid into the character + * buffer data. This buffer is handed over to p9fs to process into user + * buffers. Note that this function typically returns the number of bytes read + * so in case of an error we return -error so that we can distinguish between + * error codes and bytes. + */ +int +p9_client_read(struct p9_fid *fid, uint64_t offset, uint32_t count, char *data) +{ + struct p9_client *clnt; + struct p9_req_t *req; + char *dataptr; + int error, rsize; + + clnt = fid->clnt; + rsize = fid->mtu; + error = 0; + + P9_DEBUG(PROTO, "TREAD fid %d offset %llu %u\n", + fid->fid, (unsigned long long) offset, count); + + if (!rsize || rsize > clnt->msize) + rsize = clnt->msize; + + if (count < rsize) + rsize = count; + + /* At this stage, we only have 8K buffers so only transfer */ + req = p9_client_request(clnt, P9PROTO_TREAD, &error, "dqd", fid->fid, + offset, rsize); + if (error != 0) { + P9_DEBUG(ERROR, "%s: failed allocate request\n", __func__); + return (-error); + } + + error = p9_buf_readf(req->rc, clnt->proto_version, "D", &count, + &dataptr); + if (error != 0) { + P9_DEBUG(ERROR, "%s: p9_buf_readf failed: %d\n", + __func__, error); + goto out; + } + + if (rsize < count) { + P9_DEBUG(PROTO, "RREAD count (%d > %d)\n", count, rsize); + count = rsize; + } + + P9_DEBUG(PROTO, "RREAD count %d\n", count); + + if (count == 0) { + error = -EIO; + P9_DEBUG(ERROR, "%s: EIO error in client_read \n", __func__); + goto out; + } + + /* Copy back the data into the input buffer. 
*/ + memmove(data, dataptr, count); + p9_free_req(clnt, req); + return (count); +out: + p9_free_req(clnt, req); + return (-error); +} + +/* + * Write count bytes from buffer to the offset for the file fid + * Note that this function typically returns the number of bytes written + * so in case of an error we return -error so that we can distinguish between + * error codes and bytes. + */ + +int +p9_client_write(struct p9_fid *fid, uint64_t offset, uint32_t count, char *data) +{ + struct p9_client *clnt; + struct p9_req_t *req; + int ret, error, rsize; + + clnt = fid->clnt; + rsize = fid->mtu; + ret = 0; + error = 0; + + P9_DEBUG(PROTO, "TWRITE fid %d offset %llu %u\n", + fid->fid, (unsigned long long) offset, count); + + if (!rsize || rsize > clnt->msize) + rsize = clnt->msize; + + /* Limit set by Qemu ,8168 */ + if (count > rsize) { + count = rsize; + } + + /* + * Doing the Data blob instead. If at all we add the zerocopy, we can + * change it to uio direct copy + */ + req = p9_client_request(clnt, P9PROTO_TWRITE, &error, "dqD", fid->fid, + offset, count, data); + if (error != 0) { + P9_DEBUG(ERROR, "%s: failed allocate request: %d\n", + __func__, error); + return (-error); + } + + error = p9_buf_readf(req->rc, clnt->proto_version, "d", &ret); + if (error != 0) { + P9_DEBUG(ERROR, "%s: p9_buf_readf error: %d\n", + __func__, error); + goto out; + } + + if (count < ret) { + P9_DEBUG(PROTO, "RWRITE count (%d > %d)\n", count, ret); + ret = count; + } + P9_DEBUG(PROTO, "RWRITE count %d\n", ret); + + if (count == 0) { + error = EIO; + P9_DEBUG(ERROR, "%s: EIO error\n", __func__); + goto out; + } + + p9_free_req(clnt, req); + return (ret); +out: + p9_free_req(clnt, req); + return (-error); +} + + +/* Create file under directory fid, with name, permissions, mode. 
*/ +int +p9_client_file_create(struct p9_fid *fid, char *name, uint32_t perm, int mode, + char *extension) +{ + int error; + struct p9_client *clnt; + struct p9_req_t *req; + struct p9_qid qid; + int mtu; + + P9_DEBUG(PROTO, "TCREATE fid %d name %s perm %d mode %d\n", + fid->fid, name, perm, mode); + + clnt = fid->clnt; + error = 0; + + if (fid->mode != -1) + return (EINVAL); + + req = p9_client_request(clnt, P9PROTO_TCREATE, &error, "dsdb?s", + fid->fid, name, perm, mode, extension); + if (error != 0) + return (error); + + error = p9_buf_readf(req->rc, clnt->proto_version, "Qd", &qid, &mtu); + if (error != 0) + goto out; + + P9_DEBUG(PROTO, "RCREATE qid %x.%jx.%x mtu %x\n", + qid.type, (uintmax_t)qid.path, qid.version, mtu); + fid->mode = mode; + fid->mtu = mtu; + +out: + p9_free_req(clnt, req); + return (error); +} + +/* Request file system information of the file system */ +int +p9_client_statfs(struct p9_fid *fid, struct p9_statfs *stat) +{ + int error; + struct p9_req_t *req; + struct p9_client *clnt; + + error = 0; + clnt = fid->clnt; + + P9_DEBUG(PROTO, "TSTATFS fid %d\n", fid->fid); + + req = p9_client_request(clnt, P9PROTO_TSTATFS, &error, "d", fid->fid); + if (error != 0) { + return (error); + } + + error = p9_buf_readf(req->rc, clnt->proto_version, "ddqqqqqqd", + &stat->type, &stat->bsize, &stat->blocks, &stat->bfree, + &stat->bavail, &stat->files, &stat->ffree, &stat->fsid, + &stat->namelen); + + if (error != 0) + goto out; + + P9_DEBUG(PROTO, "RSTATFS fid %d type 0x%jx bsize %ju " + "blocks %ju bfree %ju bavail %ju files %ju ffree %ju " + "fsid %ju namelen %ju\n", + fid->fid, (uintmax_t)stat->type, + (uintmax_t)stat->bsize, (uintmax_t)stat->blocks, + (uintmax_t)stat->bfree, (uintmax_t)stat->bavail, + (uintmax_t)stat->files, (uintmax_t)stat->ffree, + (uintmax_t)stat->fsid, (uintmax_t)stat->namelen); + +out: + p9_free_req(clnt, req); + return (error); +} + +/* Rename file referenced by the fid */ +int +p9_client_renameat(struct p9_fid *oldfid, char 
*oldname, struct p9_fid *newfid, + char *newname) +{ + int error; + struct p9_client *clnt; + struct p9_req_t *req; + + P9_DEBUG(PROTO, "TRENAMEAT oldfid %d oldname %s newfid %d newfid %s", + oldfid->fid, oldname, newfid->fid, newname); + + error = 0; + clnt = oldfid->clnt; + + /* + * we are calling the request with TRENAMEAT tag and not TRENAME with + * the 9p protocol version 9p2000.u as the QEMU version supports this + * version of renaming + */ + req = p9_client_request(clnt, P9PROTO_TRENAMEAT, &error, "dsds", + oldfid->fid, oldname, newfid->fid, newname); + + if (error != 0) + return (error); + + p9_free_req(clnt, req); + return (error); +} + +/* Request to create symbolic link */ +int +p9_create_symlink(struct p9_fid *fid, char *name, char *symtgt, gid_t gid) +{ + int error; + struct p9_req_t *req; + struct p9_client *clnt; + struct p9_qid qid; + + error = 0; + clnt = fid->clnt; + + P9_DEBUG(PROTO, "TSYMLINK fid %d name %s\n", fid->fid, name); + + req = p9_client_request(clnt, P9PROTO_TSYMLINK, &error, "dssd", + fid->fid, name, symtgt, gid); + + if (error != 0) + return (error); + + error = p9_buf_readf(req->rc, clnt->proto_version, "Q", &qid); + if (error != 0) { + P9_DEBUG(ERROR, "%s: buf_readf failed %d\n", __func__, error); + return (error); + } + + P9_DEBUG(PROTO, "RSYMLINK qid %x.%jx.%x\n", + qid.type, (uintmax_t)qid.path, qid.version); + + p9_free_req(clnt, req); + return (0); +} + +/* Request to create hard link */ +int +p9_create_hardlink(struct p9_fid *dfid, struct p9_fid *oldfid, char *name) +{ + int error; + struct p9_req_t *req; + struct p9_client *clnt; + + error = 0; + clnt = dfid->clnt; + + P9_DEBUG(PROTO, "TLINK dfid %d oldfid %d name %s\n", + dfid->fid, oldfid->fid, name); + + req = p9_client_request(clnt, P9PROTO_TLINK, &error, "dds", dfid->fid, + oldfid->fid, name); + if (error != 0) + return (error); + + p9_free_req(clnt, req); + return (0); +} + +/* Request to read contents of symbolic link */ +int +p9_readlink(struct p9_fid *fid, char 
**target) +{ + int error; + struct p9_client *clnt; + struct p9_req_t *req; + + error = 0; + clnt = fid->clnt; + + P9_DEBUG(PROTO, "TREADLINK fid %d\n", fid->fid); + + req = p9_client_request(clnt, P9PROTO_TREADLINK, &error, "d", fid->fid); + if (error != 0) + return (error); + + error = p9_buf_readf(req->rc, clnt->proto_version, "s", target); + if (error != 0) { + P9_DEBUG(ERROR, "%s: buf_readf failed %d\n", __func__, error); + return (error); + } + + P9_DEBUG(PROTO, "RREADLINK target %s \n", *target); + + p9_free_req(clnt, req); + return (0); +} + +/* Get file attributes of the file referenced by the fid */ +int +p9_client_getattr(struct p9_fid *fid, struct p9_stat_dotl *stat_dotl, + uint64_t request_mask) +{ + int err; + struct p9_client *clnt; + struct p9_req_t *req; + + err = 0; + + P9_DEBUG(PROTO, "TGETATTR fid %d mask %ju\n", + fid->fid, (uintmax_t)request_mask); + + clnt = fid->clnt; + req = p9_client_request(clnt, P9PROTO_TGETATTR, &err, "dq", fid->fid, + request_mask); + if (req == NULL) { + P9_DEBUG(ERROR, "%s: allocation failed %d", __func__, err); + goto error; + } + + err = p9_buf_readf(req->rc, clnt->proto_version, "A", stat_dotl); + if (err != 0) { + P9_DEBUG(ERROR, "%s: buf_readf failed %d\n", __func__, err); + goto error; + } + + p9_free_req(clnt, req); + P9_DEBUG(PROTO, "RGETATTR fid %d qid %x.%jx.%x st_mode %8.8x " + "uid %d gid %d nlink %ju rdev %jx st_size %jx blksize %ju " + "blocks %ju st_atime_sec %ju, st_atime_nsec %ju " + "st_mtime_sec %ju, st_mtime_nsec %ju st_ctime_sec %ju " + "st_ctime_nsec %ju st_btime_sec %ju, st_btime_nsec %ju " + "st_stat %ju, st_data_version %ju \n", fid->fid, + stat_dotl->qid.type, (uintmax_t)stat_dotl->qid.path, + stat_dotl->qid.version, stat_dotl->st_mode, stat_dotl->st_uid, + stat_dotl->st_gid, (uintmax_t)stat_dotl->st_nlink, + (uintmax_t)stat_dotl->st_rdev, (uintmax_t)stat_dotl->st_size, + (uintmax_t)stat_dotl->st_blksize, + (uintmax_t)stat_dotl->st_blocks, (uintmax_t)stat_dotl->st_atime_sec, + 
(uintmax_t)stat_dotl->st_atime_nsec, (uintmax_t)stat_dotl->st_mtime_sec, + (uintmax_t)stat_dotl->st_mtime_nsec, (uintmax_t)stat_dotl->st_ctime_sec, + (uintmax_t)stat_dotl->st_ctime_nsec, (uintmax_t)stat_dotl->st_btime_sec, + (uintmax_t)stat_dotl->st_btime_nsec, (uintmax_t)stat_dotl->st_gen, + (uintmax_t)stat_dotl->st_data_version); + + return (err); + +error: + if (req != NULL) + p9_free_req(clnt, req); + + return (err); +} + +/* Set file attributes of the file referenced by the fid */ +int +p9_client_setattr(struct p9_fid *fid, struct p9_iattr_dotl *p9attr) +{ + int err; + struct p9_req_t *req; + struct p9_client *clnt; + + err = 0; + + P9_DEBUG(PROTO, "TSETATTR fid %d" + " valid %x mode %x uid %d gid %d size %ju" + " atime_sec %ju atime_nsec %ju" + " mtime_sec %ju mtime_nsec %ju\n", + fid->fid, + p9attr->valid, p9attr->mode, p9attr->uid, p9attr->gid, + (uintmax_t)p9attr->size, (uintmax_t)p9attr->atime_sec, + (uintmax_t)p9attr->atime_nsec, (uintmax_t)p9attr->mtime_sec, + (uintmax_t)p9attr->mtime_nsec); + + clnt = fid->clnt; + + /* Any client_request error is converted to req == NULL error*/ + req = p9_client_request(clnt, P9PROTO_TSETATTR, &err, "dA", fid->fid, + p9attr); + + if (req == NULL) { + P9_DEBUG(ERROR, "%s: allocation failed %d\n", __func__, err); + goto error; + } + + p9_free_req(clnt, req); +error: + return (err); +} + diff --git a/sys/fs/p9fs/p9_client.h b/sys/fs/p9fs/p9_client.h new file mode 100644 index 000000000000..4eb82c0232f4 --- /dev/null +++ b/sys/fs/p9fs/p9_client.h @@ -0,0 +1,169 @@ +/*- + * Copyright (c) 2017 Juniper Networks, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* 9P client definitions */ + +#ifndef FS_P9FS_P9_CLIENT_H +#define FS_P9FS_P9_CLIENT_H + +#include <sys/errno.h> +#include <sys/types.h> +#include <sys/_unrhdr.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/condvar.h> +#include <sys/systm.h> +#include <sys/mount.h> +#include <sys/malloc.h> +#include <sys/kernel.h> +#include <sys/proc.h> +#include <sys/dirent.h> +#include <sys/stdarg.h> + +#include <fs/p9fs/p9_protocol.h> + +/* 9P protocol versions */ +enum p9_proto_versions { + p9_proto_legacy, /* legacy version */ + p9_proto_2000u, /* Unix version */ + p9_proto_2000L, /* Linux version */ +}; + +/* P9 Request exchanged between Host and Guest */ +struct p9_req_t { + struct p9_buffer *tc; /* request buffer */ + struct p9_buffer *rc; /* response buffer */ +}; + +/* 9P transport status */ +enum transport_status { + P9FS_CONNECT, /* transport is connected */ + P9FS_BEGIN_DISCONNECT,/* transport has begun to disconnect */ + P9FS_DISCONNECT, /* transport has been 
disconnected */ +}; + +/* This is set by QEMU so we will oblige */ +#define P9FS_MTU 8192 + +/* + * Even though we have an 8k buffer, QEMU is typically doing 8168 + * because of a HDR of 24. Use that amount for transfers so that we don't + * drop anything. + */ +#define P9FS_IOUNIT (P9FS_MTU - 24) +#define P9FS_DIRENT_LEN 256 +#define P9_NOTAG 0 + +/* Client state information */ +struct p9_client { + struct p9_trans_module *ops; /* module API instantiated with this client */ + void *handle; /* module-specific client handle */ + struct mtx clnt_mtx; /* mutex to lock the client */ + struct mtx req_mtx; /* mutex to lock the request buffer */ + struct cv req_cv; /* condition variable on which to wake up thread */ + unsigned int msize; /* maximum data size */ + unsigned char proto_version; /* 9P version to use */ + struct unrhdr fidpool; /* fid handle accounting for session */ + struct unrhdr tagpool; /* transaction id accounting for session */ + enum transport_status trans_status; /* transport instance state */ +}; + +/* The main fid structure which keeps track of the file. */ +struct p9_fid { + struct p9_client *clnt; /* the instantiating 9P client */ + uint32_t fid; /* numeric identifier */ + int mode; /* current mode of this fid */ + struct p9_qid qid; /* server identifier */ + uint32_t mtu; /* max transferable unit at a time */ + uid_t uid; /* numeric uid of the local user who owns this handle */ + int v_opens; /* keep count on the number of opens called with this file handle */ + STAILQ_ENTRY(p9_fid) fid_next; /* points to next fid in the list */ +}; + +/* Directory entry structure */ +struct p9_dirent { + struct p9_qid qid; /* 9P server qid for this dirent */ + uint64_t d_off; /* offset to the next dirent */ + unsigned char d_type; /* file type */ + char d_name[P9FS_DIRENT_LEN]; /* file name */ + int len; /* length of d_name, set by p9_dirent_read() */ +}; + +void p9_init_zones(void); +void p9_destroy_zones(void); + +/* Session and client Init Ops */ +struct p9_client *p9_client_create(struct mount *mp, int 
*error, + const char *mount_tag); +void p9_client_destroy(struct p9_client *clnt); +struct p9_fid *p9_client_attach(struct p9_client *clnt, struct p9_fid *fid, + const char *uname, uid_t n_uname, const char *aname, int *error); + +/* FILE OPS - These are individually called from the specific vop function */ + +int p9_client_open(struct p9_fid *fid, int mode); +int p9_client_close(struct p9_fid *fid); +struct p9_fid *p9_client_walk(struct p9_fid *oldfid, uint16_t nwnames, + char **wnames, int clone, int *error); +struct p9_fid *p9_fid_create(struct p9_client *clnt); +void p9_fid_destroy(struct p9_fid *fid); +uint16_t p9_tag_create(struct p9_client *clnt); +void p9_tag_destroy(struct p9_client *clnt, uint16_t tag); +int p9_client_clunk(struct p9_fid *fid); +int p9_client_version(struct p9_client *clnt); +int p9_client_readdir(struct p9_fid *fid, char *data, uint64_t offset, uint32_t count); +int p9_client_read(struct p9_fid *fid, uint64_t offset, uint32_t count, char *data); +int p9_client_write(struct p9_fid *fid, uint64_t offset, uint32_t count, char *data); +int p9_client_file_create(struct p9_fid *fid, char *name, uint32_t perm, int mode, + char *extension); +int p9_client_remove(struct p9_fid *fid); +int p9_client_unlink(struct p9_fid *dfid, const char *name, int32_t flags); +int p9_dirent_read(struct p9_client *clnt, char *buf, int start, int len, + struct p9_dirent *dirent); +int p9_client_statfs(struct p9_fid *fid, struct p9_statfs *stat); +int p9_client_statread(struct p9_client *clnt, char *data, size_t len, struct p9_wstat *st); +int p9_is_proto_dotu(struct p9_client *clnt); +int p9_is_proto_dotl(struct p9_client *clnt); +void p9_client_cb(struct p9_client *c, struct p9_req_t *req); +int p9stat_read(struct p9_client *clnt, char *data, size_t len, struct p9_wstat *st); +void p9_client_disconnect(struct p9_client *clnt); +void p9_client_begin_disconnect(struct p9_client *clnt); +int p9_create_symlink(struct p9_fid *fid, char *name, char *symtgt, gid_t gid); 
+int p9_create_hardlink(struct p9_fid *dfid, struct p9_fid *oldfid, char *name); +int p9_readlink(struct p9_fid *fid, char **target); +int p9_client_renameat(struct p9_fid *oldfid, char *oldname, struct p9_fid *newfid, char *newname); +int p9_client_getattr(struct p9_fid *fid, struct p9_stat_dotl *stat_dotl, + uint64_t request_mask); +int p9_client_setattr(struct p9_fid *fid, struct p9_iattr_dotl *p9attr); + +int p9_buf_vwritef(struct p9_buffer *buf, int proto_version, const char *fmt, + va_list ap); +int p9_buf_readf(struct p9_buffer *buf, int proto_version, const char *fmt, ...); +int p9_buf_prepare(struct p9_buffer *buf, int8_t type); +int p9_buf_finalize(struct p9_client *clnt, struct p9_buffer *buf); +void p9_buf_reset(struct p9_buffer *buf); + +#endif /* FS_P9FS_P9_CLIENT_H */ diff --git a/sys/fs/p9fs/p9_debug.h b/sys/fs/p9fs/p9_debug.h new file mode 100644 index 000000000000..463b009d00ad --- /dev/null +++ b/sys/fs/p9fs/p9_debug.h @@ -0,0 +1,45 @@ +/*- + * Copyright (c) 2017 Juniper Networks, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef FS_P9FS_P9_DEBUG_H +#define FS_P9FS_P9_DEBUG_H + +extern int p9_debug_level; /* All debugs on now */ + +/* 9P debug flags */ +#define P9_DEBUG_TRANS 0x0001 /* Trace transport */ +#define P9_DEBUG_SUBR 0x0002 /* Trace driver submissions */ +#define P9_DEBUG_LPROTO 0x0004 /* Low level protocol tracing */ +#define P9_DEBUG_PROTO 0x0008 /* High level protocol tracing */ +#define P9_DEBUG_VOPS 0x0010 /* VOPs tracing */ +#define P9_DEBUG_ERROR 0x0020 /* verbose error messages */ + +#define P9_DEBUG(category, fmt, ...) do { \ + if ((p9_debug_level & P9_DEBUG_##category) != 0) \ + printf(fmt, ##__VA_ARGS__); \ +} while (0) + +#endif /* FS_P9FS_P9_DEBUG_H */ diff --git a/sys/fs/p9fs/p9_protocol.c b/sys/fs/p9fs/p9_protocol.c new file mode 100644 index 000000000000..e0045f67993d --- /dev/null +++ b/sys/fs/p9fs/p9_protocol.c @@ -0,0 +1,632 @@ +/*- + * Copyright (c) 2017 Juniper Networks, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +/* + * 9P Protocol Support Code + * This file provides the standard for the FS interactions with the server + * interface as it can understand only this protocol. The details of the + * protocol can be found here + * XXX (link to protocol details page on FreeBSD wiki) + */ + +#include <sys/types.h> +#include <fs/p9fs/p9_client.h> +#include <fs/p9fs/p9_debug.h> +#include <fs/p9fs/p9_protocol.h> + +#define P9FS_MAXLEN 255 + +static int p9_buf_writef(struct p9_buffer *buf, int proto_version, + const char *fmt, ...); +static void stat_free(struct p9_wstat *sbuf); + +static void +stat_free(struct p9_wstat *stbuf) +{ + + free(stbuf->name, M_TEMP); + free(stbuf->uid, M_TEMP); + free(stbuf->gid, M_TEMP); + free(stbuf->muid, M_TEMP); + free(stbuf->extension, M_TEMP); +} + +static size_t +buf_read(struct p9_buffer *buf, void *data, size_t size) +{ + size_t len; + + len = min(buf->size - buf->offset, size); + + memcpy(data, &buf->sdata[buf->offset], len); + buf->offset += len; + + return (size - len); +} + +static size_t +buf_write(struct p9_buffer *buf, const void *data, size_t size) +{ + size_t len; + + len = min(buf->capacity - buf->size, size); + + memcpy(&buf->sdata[buf->size], data, len); + buf->size += len; + + return (size 
- len); +} + +/* + * Main buf_read routine. This copies the data from the buffer into the + * respective values based on the data type. + * Here + * b - int8_t + * w - int16_t + * d - int32_t + * q - int64_t + * s - string + * u - uid + * g - gid + * Q - qid + * S - stat + * A - getattr (9P2000.L) + * D - data blob (int32_t size followed by void *, results are not freed) + * T - array of strings (int16_t count, followed by strings) + * R - array of qids (int16_t count, followed by qids) + * ? - return if version is not .u or .l + */ +static int +p9_buf_vreadf(struct p9_buffer *buf, int proto_version, const char *fmt, + va_list ap) +{ + const char *ptr; + int error; + + error = 0; + + for (ptr = fmt; *ptr; ptr++) { + switch (*ptr) { + case 'b': + { + int8_t *val = va_arg(ap, int8_t *); + + if (buf_read(buf, val, sizeof(*val))) + error = EFAULT; + break; + } + case 'w': + { + int16_t *val = va_arg(ap, int16_t *); + + if (buf_read(buf, val, sizeof(*val))) + error = EFAULT; + break; + } + case 'd': + { + int32_t *val = va_arg(ap, int32_t *); + + if (buf_read(buf, val, sizeof(*val))) + error = EFAULT; + break; + } + case 'q': + { + int64_t *val = va_arg(ap, int64_t *); + + if (buf_read(buf, val, sizeof(*val))) + error = EFAULT; + break; + } + case 's': + { + char **sptr_p = va_arg(ap, char **); + uint16_t len; + char *sptr; + + error = buf_read(buf, &len, sizeof(uint16_t)); + if (error) + break; + + sptr = malloc(len + 1, M_TEMP, M_NOWAIT | M_ZERO); + + if (buf_read(buf, sptr, len)) { + error = EFAULT; + free(sptr, M_TEMP); + sptr = NULL; + } else { + (sptr)[len] = 0; + *sptr_p = sptr; + } + break; + } + case 'u': + { + uid_t *val = va_arg(ap, uid_t *); + + if (buf_read(buf, val, sizeof(*val))) + error = EFAULT; + break; + + } + case 'g': + { + gid_t *val = va_arg(ap, gid_t *); + + if (buf_read(buf, val, sizeof(*val))) + error = EFAULT; + break; + + } + case 'Q': + { + struct p9_qid *qid = va_arg(ap, struct p9_qid *); + + error = p9_buf_readf(buf, proto_version, "bdq", 
+ &qid->type, &qid->version, &qid->path); + + break; + } + case 'S': + { + struct p9_wstat *stbuf = va_arg(ap, struct p9_wstat *); + + error = p9_buf_readf(buf, proto_version, "wwdQdddqssss?sddd", + &stbuf->size, &stbuf->type, &stbuf->dev, &stbuf->qid, + &stbuf->mode, &stbuf->atime, &stbuf->mtime, &stbuf->length, + &stbuf->name, &stbuf->uid, &stbuf->gid, &stbuf->muid, + &stbuf->extension, &stbuf->n_uid, &stbuf->n_gid, &stbuf->n_muid); + + if (error != 0) + stat_free(stbuf); + break; + } + case 'A': + { + struct p9_stat_dotl *stbuf = va_arg(ap, struct p9_stat_dotl *); + + error = p9_buf_readf(buf, proto_version, "qQdugqqqqqqqqqqqqqqq", + &stbuf->st_result_mask, &stbuf->qid, &stbuf->st_mode, + &stbuf->st_uid,&stbuf->st_gid, &stbuf->st_nlink, + &stbuf->st_rdev, &stbuf->st_size, &stbuf->st_blksize, + &stbuf->st_blocks, &stbuf->st_atime_sec, + &stbuf->st_atime_nsec, &stbuf->st_mtime_sec, + &stbuf->st_mtime_nsec, &stbuf->st_ctime_sec, + &stbuf->st_ctime_nsec, &stbuf->st_btime_sec, + &stbuf->st_btime_nsec, &stbuf->st_gen, + &stbuf->st_data_version); + + break; + } + case 'D': + { + uint32_t *count = va_arg(ap, uint32_t *); + void **data = va_arg(ap, void **); + + error = buf_read(buf, count, sizeof(uint32_t)); + if (error == 0) { + *count = MIN(*count, buf->size - buf->offset); + *data = &buf->sdata[buf->offset]; + } + break; + } + case 'T': + { + uint16_t *nwname_p = va_arg(ap, uint16_t *); + char ***wnames_p = va_arg(ap, char ***); + uint16_t nwname; + char **wnames; + int i; + + error = buf_read(buf, nwname_p, sizeof(uint16_t)); + if (error != 0) + break; + + nwname = *nwname_p; + wnames = malloc(sizeof(char *) * nwname, M_TEMP, M_NOWAIT | M_ZERO); + + for (i = 0; i < nwname && (error == 0); i++) + error = p9_buf_readf(buf, proto_version, "s", &wnames[i]); + + if (error != 0) { + for (i = 0; i < nwname; i++) + free((wnames)[i], M_TEMP); + free(wnames, M_TEMP); + } else + *wnames_p = wnames; + break; + } + case 'R': + { + uint16_t *nwqid_p = va_arg(ap, uint16_t *); + 
struct p9_qid **wqids_p = va_arg(ap, struct p9_qid **); + uint16_t nwqid; + struct p9_qid *wqids; + int i; + + wqids = NULL; + error = buf_read(buf, nwqid_p, sizeof(uint16_t)); + if (error != 0) + break; + + nwqid = *nwqid_p; + wqids = malloc(nwqid * sizeof(struct p9_qid), M_TEMP, M_NOWAIT | M_ZERO); + if (wqids == NULL) { + error = ENOMEM; + break; + } + for (i = 0; i < nwqid && (error == 0); i++) + error = p9_buf_readf(buf, proto_version, "Q", &(wqids)[i]); + + if (error != 0) { + free(wqids, M_TEMP); + } else + *wqids_p = wqids; + + break; + } + case '?': + { + if ((proto_version != p9_proto_2000u) && (proto_version != p9_proto_2000L)) + return (0); + break; + } + default: + break; + } + + if (error != 0) + break; + } + + return (error); +} + +/* + * Main buf_write routine. This copies the data into the buffer from the + * respective values based on the data type. + * Here + * b - int8_t + * w - int16_t + * d - int32_t + * q - int64_t + * s - string + * u - uid + * g - gid + * Q - qid + * S - stat + * D - data blob (int32_t size followed by void *, results are not freed) + * T - array of strings (int16_t count, followed by strings) + * W - string of a specific length + * R - array of qids (int16_t count, followed by qids) + * A - setattr (9P2000.L) + * ? 
- return if version is not .u or .l + */ + +int +p9_buf_vwritef(struct p9_buffer *buf, int proto_version, const char *fmt, + va_list ap) +{ + const char *ptr; + int error; + + error = 0; + + for (ptr = fmt; *ptr; ptr++) { + switch (*ptr) { + case 'b': + { + int8_t val = va_arg(ap, int); + + if (buf_write(buf, &val, sizeof(val))) + error = EFAULT; + break; + } + case 'w': + { + int16_t val = va_arg(ap, int); + + if (buf_write(buf, &val, sizeof(val))) + error = EFAULT; + break; + } + case 'd': + { + int32_t val = va_arg(ap, int32_t); + + if (buf_write(buf, &val, sizeof(val))) + error = EFAULT; + break; + } + case 'q': + { + int64_t val = va_arg(ap, int64_t); + + if (buf_write(buf, &val, sizeof(val))) + error = EFAULT; + + break; + } + case 's': + { + const char *sptr = va_arg(ap, const char *); + uint16_t len = 0; + + if (sptr) + len = MIN(strlen(sptr), P9FS_MAXLEN); + + error = buf_write(buf, &len, sizeof(uint16_t)); + if (error == 0 && buf_write(buf, sptr, len)) + error = EFAULT; + break; + } + case 'u': + { + uid_t val = va_arg(ap, uid_t); + + if (buf_write(buf, &val, sizeof(val))) + error = EFAULT; + break; + + } + case 'g': + { + gid_t val = va_arg(ap, gid_t); + + if (buf_write(buf, &val, sizeof(val))) + error = EFAULT; + break; + + } + case 'Q': + { + const struct p9_qid *qid = va_arg(ap, const struct p9_qid *); + + error = p9_buf_writef(buf, proto_version, "bdq", + qid->type, qid->version, qid->path); + break; + } + case 'S': + { + struct p9_wstat *stbuf = va_arg(ap, struct p9_wstat *); + + error = p9_buf_writef(buf, proto_version, + "wwdQdddqssss?sddd", stbuf->size, stbuf->type, stbuf->dev, &stbuf->qid, + stbuf->mode, stbuf->atime, stbuf->mtime, stbuf->length, stbuf->name, + stbuf->uid, stbuf->gid, stbuf->muid, stbuf->extension, stbuf->n_uid, + stbuf->n_gid, stbuf->n_muid); + + if (error != 0) + stat_free(stbuf); + + break; + } + case 'D': + { + uint32_t count = va_arg(ap, uint32_t); + void *data = va_arg(ap, void *); + + error = buf_write(buf, &count, 
sizeof(uint32_t)); + if ((error == 0) && buf_write(buf, data, count)) + error = EFAULT; + + break; + } + case 'T': + { + char **wnames = va_arg(ap, char **); + uint16_t nwnames = va_arg(ap, int); + + error = buf_write(buf, &nwnames, sizeof(uint16_t)); + if (error == 0) { + int i = 0; + for (i = 0; i < nwnames; i++) { + error = p9_buf_writef(buf, proto_version, "s", wnames[i]); + if (error != 0) + break; + } + } + break; + } + case 'W': + { + const char *sptr = va_arg(ap, const char*); + uint16_t len = va_arg(ap, int); + + error = buf_write(buf, &len, sizeof(uint16_t)); + if (error == 0 && buf_write(buf, sptr, len)) + error = EFAULT; + break; + + } + case 'R': + { + uint16_t nwqid = va_arg(ap, int); + struct p9_qid *wqids = va_arg(ap, struct p9_qid *); + int i; + + error = buf_write(buf, &nwqid, sizeof(uint16_t)); + if (error == 0) { + + for (i = 0; i < nwqid; i++) { + error = p9_buf_writef(buf, proto_version, "Q", &wqids[i]); + if (error != 0) + break; + } + } + break; + } + case 'A': + { + struct p9_iattr_dotl *p9attr = va_arg(ap, struct p9_iattr_dotl *); + + error = p9_buf_writef(buf, proto_version, "ddugqqqqq", + p9attr->valid, p9attr->mode, p9attr->uid, + p9attr->gid, p9attr->size, p9attr->atime_sec, + p9attr->atime_nsec, p9attr->mtime_sec, + p9attr->mtime_nsec); + + break; + } + case '?': + { + if ((proto_version != p9_proto_2000u) && (proto_version != p9_proto_2000L)) + return (0); + break; + } + default: + break; + } + + if (error != 0) + break; + } + + return (error); +} + +/* Variadic form of buf_read */ +int +p9_buf_readf(struct p9_buffer *buf, int proto_version, const char *fmt, ...) +{ + va_list ap; + int ret; + + va_start(ap, fmt); + ret = p9_buf_vreadf(buf, proto_version, fmt, ap); + va_end(ap); + + return (ret); +} + +/* Variadic form of buf_write */ +static int +p9_buf_writef(struct p9_buffer *buf, int proto_version, const char *fmt, ...) 
+{ + va_list ap; + int ret; + + va_start(ap, fmt); + ret = p9_buf_vwritef(buf, proto_version, fmt, ap); + va_end(ap); + + return (ret); +} + +/* File stats read routine for P9 to get attributes of files */ +int +p9stat_read(struct p9_client *clnt, char *buf, size_t len, struct p9_wstat *st) +{ + struct p9_buffer msg_buf; + int ret; + + msg_buf.size = len; + msg_buf.capacity = len; + msg_buf.sdata = buf; + msg_buf.offset = 0; + + ret = p9_buf_readf(&msg_buf, clnt->proto_version, "S", st); + if (ret) { + P9_DEBUG(ERROR, "%s: failed: %d\n", __func__, ret); + } + + return (ret); +} + +/* + * P9_header preparation routine. All p9 buffers have to have this header(QEMU_HEADER) at the + * front of the buffer. + */ +int +p9_buf_prepare(struct p9_buffer *buf, int8_t type) +{ + buf->id = type; + return (p9_buf_writef(buf, 0, "dbw", 0, type, buf->tag)); +} + +/* + * Final write to the buffer, this is the total size of the buffer. Since the buffer length can + * vary with request, this is computed at the end just before sending the request to the driver + */ +int +p9_buf_finalize(struct p9_client *clnt, struct p9_buffer *buf) +{ + int size; + int error; + + size = buf->size; + buf->size = 0; + error = p9_buf_writef(buf, 0, "d", size); + buf->size = size; + + P9_DEBUG(LPROTO, "%s: size=%d type: %d tag: %d\n", + __func__, buf->size, buf->id, buf->tag); + + return (error); +} + +/* Reset values of the buffer */ +void +p9_buf_reset(struct p9_buffer *buf) +{ + + buf->offset = 0; + buf->size = 0; +} + +/* + * Directory entry read with the buf we have. Call this once we have the buf to parse. + * This buf, obtained from the server, is parsed to make dirent in readdir. 
+ */ +int +p9_dirent_read(struct p9_client *clnt, char *buf, int start, int len, + struct p9_dirent *dent) +{ + struct p9_buffer msg_buf; + int ret; + char *nameptr; + uint16_t sle; + + msg_buf.size = len; + msg_buf.capacity = len; + msg_buf.sdata = buf; + msg_buf.offset = start; + + ret = p9_buf_readf(&msg_buf, clnt->proto_version, "Qqbs", &dent->qid, + &dent->d_off, &dent->d_type, &nameptr); + if (ret) { + P9_DEBUG(ERROR, "%s: failed: %d\n", __func__, ret); + goto out; + } + + sle = strlen(nameptr); + strncpy(dent->d_name, nameptr, sle); + dent->len = sle; + free(nameptr, M_TEMP); +out: + return (msg_buf.offset); +} diff --git a/sys/fs/p9fs/p9_protocol.h b/sys/fs/p9fs/p9_protocol.h new file mode 100644 index 000000000000..7ffd7dd67bcf --- /dev/null +++ b/sys/fs/p9fs/p9_protocol.h @@ -0,0 +1,282 @@ +/*- + * Copyright (c) 2017 Juniper Networks, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* File contains 9P protocol definitions */ + +#ifndef FS_P9FS_P9_PROTOCOL_H +#define FS_P9FS_P9_PROTOCOL_H + +#include <sys/types.h> + +/* 9P message types */ +enum p9_cmds_t { + P9PROTO_TLERROR = 6, /* not used */ + P9PROTO_RLERROR, /* response for any failed request */ + P9PROTO_TSTATFS = 8, /* file system status request */ + P9PROTO_RSTATFS, /* file system status response */ + P9PROTO_TLOPEN = 12, /* open a file (9P2000.L) */ + P9PROTO_RLOPEN, /* response to open request (9P2000.L) */ + P9PROTO_TLCREATE = 14, /* prepare a handle for I/O on a new file (9P2000.L) */ + P9PROTO_RLCREATE, /* response with file access information (9P2000.L) */ + P9PROTO_TSYMLINK = 16, /* symlink creation request */ + P9PROTO_RSYMLINK, /* symlink creation response */ + P9PROTO_TMKNOD = 18, /* create a special file object request */ + P9PROTO_RMKNOD, /* create a special file object response */ + P9PROTO_TRENAME = 20, /* rename a file request */ + P9PROTO_RRENAME, /* rename a file response */ + P9PROTO_TREADLINK = 22, /* request to read value of symbolic link */ + P9PROTO_RREADLINK, /* response to read value of symbolic link request */ + P9PROTO_TGETATTR = 24, /* get file attributes request */ + P9PROTO_RGETATTR, /* get file attributes response */ + P9PROTO_TSETATTR = 26, /* set file attributes request */ + P9PROTO_RSETATTR, /* set file attributes response */ + P9PROTO_TXATTRWALK = 30,/* request to read extended attributes */ + P9PROTO_RXATTRWALK, /* 
response from server with attributes */ + P9PROTO_TXATTRCREATE = 32,/* request to set extended attribute */ + P9PROTO_RXATTRCREATE, /* response from server for setting extended attribute */ + P9PROTO_TREADDIR = 40, /* request to read a directory */ + P9PROTO_RREADDIR, /* response from server for read request */ + P9PROTO_TFSYNC = 50, /* request to flush any cached data to disk */ + P9PROTO_RFSYNC, /* response when cached data is flushed */ + P9PROTO_TLOCK = 52, /* acquire or release a POSIX record lock */ + P9PROTO_RLOCK, /* response with the status of the lock */ + P9PROTO_TGETLOCK = 54, /* request to check for presence of a POSIX record lock */ + P9PROTO_RGETLOCK, /* response with the details of the lock if acquired */ + P9PROTO_TLINK = 70, /* request to create hard link */ + P9PROTO_RLINK, /* create hard link response */ + P9PROTO_TMKDIR = 72, /* create a directory request */ + P9PROTO_RMKDIR, /* create a directory response */ + P9PROTO_TRENAMEAT = 74, /* request to rename a file or directory */ + P9PROTO_RRENAMEAT, /* response to rename request */ + P9PROTO_TUNLINKAT = 76, /* unlink a file or directory */ + P9PROTO_RUNLINKAT, /* response to unlink request */ + P9PROTO_TVERSION = 100, /* request for version handshake */ + P9PROTO_RVERSION, /* response for version handshake */ + P9PROTO_TAUTH = 102, /* request to establish authentication channel */ + P9PROTO_RAUTH, /* response with authentication information */ + P9PROTO_TATTACH = 104, /* establish a user access to a file system */ + P9PROTO_RATTACH, /* response with top level handle to file hierarchy */ + P9PROTO_TERROR = 106, /* not used */ + P9PROTO_RERROR, /* response for any failed request */ + P9PROTO_TFLUSH = 108, /* request to abort a previous request */ + P9PROTO_RFLUSH, /* response when previous request has been cancelled */ + P9PROTO_TWALK = 110, /* descend a directory hierarchy */ + P9PROTO_RWALK, /* response with new handle for position within hierarchy */ + P9PROTO_TOPEN = 112, /* prepare file handle for 
I/O for an existing file */ + P9PROTO_ROPEN, /* response with file access information */ + P9PROTO_TCREATE = 114, /* prepare a handle for I/O on a new file */ + P9PROTO_RCREATE, /* response with file access information */ + P9PROTO_TREAD = 116, /* request to transfer data from a file */ + P9PROTO_RREAD, /* response with data requested */ + P9PROTO_TWRITE = 118, /* request to transfer data to a file */ + P9PROTO_RWRITE, /* response with how much data was written to the file */ + P9PROTO_TCLUNK = 120, /* forget about a handle to a file within the File System */ + P9PROTO_RCLUNK, /* response from the server for forgetting the file handle */ + P9PROTO_TREMOVE = 122, /* request to remove a file */ + P9PROTO_RREMOVE, /* response when server has removed the file */ + P9PROTO_TSTAT = 124, /* request file entity attributes */ + P9PROTO_RSTAT, /* response with file entity attributes */ + P9PROTO_TWSTAT = 126, /* request to update file entity attributes */ + P9PROTO_RWSTAT, /* response when file entity attributes are updated */ +}; + +/* File Open Modes */ +enum p9_open_mode_t { + P9PROTO_OREAD = 0x00, /* open file for reading only */ + P9PROTO_OWRITE = 0x01, /* open file for writing only */ + P9PROTO_ORDWR = 0x02, /* open file for both reading and writing */ + P9PROTO_OEXEC = 0x03, /* open file for execution */ + P9PROTO_OTRUNC = 0x10, /* truncate file to zero length before opening it */ + P9PROTO_OREXEC = 0x20, /* close the file when exec system call is made */ + P9PROTO_ORCLOSE = 0x40, /* remove the file when it is closed */ + P9PROTO_OAPPEND = 0x80, /* open the file and seek to the end of the file */ + P9PROTO_OEXCL = 0x1000, /* only create a file and not open it */ +}; + +/* File Permissions */ +enum p9_perm_t { + P9PROTO_DMDIR = 0x80000000, /* permission bit for directories */ + P9PROTO_DMAPPEND = 0x40000000, /* permission bit for append-only files */ + P9PROTO_DMEXCL = 0x20000000, /* permission bit for exclusive use (only one open handle allowed) */ + P9PROTO_DMMOUNT = 
0x10000000, /* permission bit for mount points */ + P9PROTO_DMAUTH = 0x08000000, /* permission bit for authentication file */ + P9PROTO_DMTMP = 0x04000000, /* permission bit for non-backed-up files */ + P9PROTO_DMSYMLINK = 0x02000000, /* permission bit for symbolic link (9P2000.u) */ + P9PROTO_DMLINK = 0x01000000, /* permission bit for hard-link (9P2000.u) */ + P9PROTO_DMDEVICE = 0x00800000, /* permission bit for device files (9P2000.u) */ + P9PROTO_DMNAMEDPIPE = 0x00200000,/* permission bit for named pipe (9P2000.u) */ + P9PROTO_DMSOCKET = 0x00100000, /* permission bit for socket (9P2000.u) */ + P9PROTO_DMSETUID = 0x00080000, /* permission bit for setuid (9P2000.u) */ + P9PROTO_DMSETGID = 0x00040000, /* permission bit for setgid (9P2000.u) */ + P9PROTO_DMSETVTX = 0x00010000, /* permission bit for sticky bit (9P2000.u) */ +}; + +/* + * QID types - they are primarly used to + * differentiate semantics for a file system + */ +enum p9_qid_t { + P9PROTO_QTDIR = 0x80, /* directory */ + P9PROTO_QTAPPEND = 0x40, /* append-only */ + P9PROTO_QTEXCL = 0x20, /* exclusive use (only one open handle allowed)*/ + P9PROTO_QTMOUNT = 0x10, /* mount points */ + P9PROTO_QTAUTH = 0x08, /* authentication file */ + P9PROTO_QTTMP = 0x04, /* non-backed-up files */ + P9PROTO_QTSYMLINK = 0x02, /* symbolic links */ + P9PROTO_QTLINK = 0x01, /* hard link */ + P9PROTO_QTFILE = 0x00, /* normal files */ +}; + +/* P9 Magic Numbers */ +#define P9PROTO_NOFID (uint32_t)(~0) +#define P9_DEFUNAME "nobody" +#define P9_DEFANAME "" +#define P9_NONUNAME (uint32_t)(~0) +#define P9_MAXWELEM 16 + +/* Exchange unit between Qemu and Client */ +struct p9_qid { + uint8_t type; /* the type of the file */ + uint32_t version; /* version number for given path */ + uint64_t path; /* the file servers unique id for file */ +}; + +/* FS information stat structure */ +struct p9_statfs { + uint32_t type; /* type of file system */ + uint32_t bsize; /* optimal transfer block size */ + uint64_t blocks; /* total data blocks in 
file system */ + uint64_t bfree; /* free blocks in fs */ + uint64_t bavail; /* free blocks avail to non-superuser */ + uint64_t files; /* total file nodes in file system */ + uint64_t ffree; /* free file nodes in fs */ + uint64_t fsid; /* file system id */ + uint32_t namelen; /* maximum length of filenames */ +}; + + +/* File system metadata information */ +struct p9_wstat { + uint16_t size; /* total byte count of the following data */ + uint16_t type; /* type of file */ + uint32_t dev; /* id of device containing file */ + struct p9_qid qid; /* identifier used by server for file system entity information */ + uint32_t mode; /* protection */ + uint32_t atime; /* time of last access */ + uint32_t mtime; /* time of last modification */ + uint64_t length; /* length of file in bytes */ + char *name; /* file name */ + char *uid; /* user ID of owner */ + char *gid; /* group ID of owner */ + char *muid; /* name of the user who last modified the file */ + char *extension; /* 9p2000.u extensions */ + uid_t n_uid; /* 9p2000.u extensions */ + gid_t n_gid; /* 9p2000.u extensions */ + uid_t n_muid; /* 9p2000.u extensions */ +}; + +/* The linux version of FS information stat structure*/ +struct p9_stat_dotl { + uint64_t st_result_mask;/* indicates fields that are requested */ + struct p9_qid qid; /* identifier used by server for file system entity information */ + uint32_t st_mode; /* protection */ + uid_t st_uid; /* user ID of owner */ + gid_t st_gid; /* group ID of owner */ + uint64_t st_nlink; /* number of hard links */ + uint64_t st_rdev; /* device ID (if special file) */ + uint64_t st_size; /* total size, in bytes */ + uint64_t st_blksize; /* blocksize for file system I/O */ + uint64_t st_blocks; /* number of 512B blocks allocated */ + uint64_t st_atime_sec; /* time of last access, seconds */ + uint64_t st_atime_nsec; /* time of last access, nanoseconds */ + uint64_t st_mtime_sec; /* time of last modification, seconds */ + uint64_t st_mtime_nsec; /* time of last 
modifictaion, nanoseconds */ + uint64_t st_ctime_sec; /* time of last status change, seconds*/ + uint64_t st_ctime_nsec; /* time of last status change, nanoseconds*/ + uint64_t st_btime_sec; /* following memebers are reserved for future use */ + uint64_t st_btime_nsec; + uint64_t st_gen; + uint64_t st_data_version; +}; + +/* P9 inode attribute for setattr */ +struct p9_iattr_dotl { + uint32_t valid; /* bit fields specifying which fields are valid */ + uint32_t mode; /* protection */ + uid_t uid; /* user id of owner */ + gid_t gid; /* group id */ + uint64_t size; /* file size */ + uint64_t atime_sec; /* last access time in seconds */ + uint64_t atime_nsec; /* last access time in nanoseconds */ + uint64_t mtime_sec; /* last modification time in seconds */ + uint64_t mtime_nsec; /* last modification time in nanoseconds */ +}; + +#define P9PROTO_STATS_MODE 0x00000001ULL +#define P9PROTO_STATS_NLINK 0x00000002ULL +#define P9PROTO_STATS_UID 0x00000004ULL +#define P9PROTO_STATS_GID 0x00000008ULL +#define P9PROTO_STATS_RDEV 0x00000010ULL +#define P9PROTO_STATS_ATIME 0x00000020ULL +#define P9PROTO_STATS_MTIME 0x00000040ULL +#define P9PROTO_STATS_CTIME 0x00000080ULL +#define P9PROTO_STATS_INO 0x00000100ULL +#define P9PROTO_STATS_SIZE 0x00000200ULL +#define P9PROTO_STATS_BLOCKS 0x00000400ULL + +#define P9PROTO_STATS_BTIME 0x00000800ULL +#define P9PROTO_STATS_GEN 0x00001000ULL +#define P9PROTO_STATS_DATA_VERSION 0x00002000ULL + +#define P9PROTO_STATS_BASIC 0x000007ffULL /* Mask for fields up to BLOCKS */ +#define P9PROTO_STATS_ALL 0x00003fffULL /* Mask for All fields above */ + +#define P9PROTO_SETATTR_MODE 0x00000001UL +#define P9PROTO_SETATTR_UID 0x00000002UL +#define P9PROTO_SETATTR_GID 0x00000004UL +#define P9PROTO_SETATTR_SIZE 0x00000008UL +#define P9PROTO_SETATTR_ATIME 0x00000010UL +#define P9PROTO_SETATTR_MTIME 0x00000020UL +#define P9PROTO_SETATTR_CTIME 0x00000040UL +#define P9PROTO_SETATTR_ATIME_SET 0x00000080UL +#define P9PROTO_SETATTR_MTIME_SET 0x00000100UL +#define 
P9PROTO_SETATTR_MASK 0x000001bfUL + +#define P9PROTO_TGETATTR_BLK 512 + +#define P9PROTO_UNLINKAT_REMOVEDIR 0x200 + +/* PDU buffer used for SG lists. */ +struct p9_buffer { + uint32_t size; + uint16_t tag; + uint8_t id; + size_t offset; + size_t capacity; + uint8_t *sdata; +}; + +#endif /* FS_P9FS_P9_PROTOCOL_H */ diff --git a/sys/fs/p9fs/p9_transport.c b/sys/fs/p9fs/p9_transport.c new file mode 100644 index 000000000000..c82d81fedcd7 --- /dev/null +++ b/sys/fs/p9fs/p9_transport.c @@ -0,0 +1,70 @@ +/*- + * Copyright (c) 2022-present Doug Rabson + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/kassert.h>
+#include <sys/libkern.h>
+
+#include <fs/p9fs/p9_transport.h>
+
+/*
+ * List of registered 9P transports.  It is private to this file; other code
+ * reaches it only through p9_register_trans(), p9_unregister_trans() and
+ * p9_get_trans_by_name(), so it is given internal linkage to keep the
+ * generic name "transports" out of the global namespace.
+ */
+static TAILQ_HEAD(, p9_trans_module) transports;
+
+/* Set up the empty transport list; invoked through the SYSINIT below. */
+static void
+p9_transport_init(void)
+{
+
+	TAILQ_INIT(&transports);
+}
+
+SYSINIT(p9_transport, SI_SUB_DRIVERS, SI_ORDER_FIRST, p9_transport_init, NULL);
+
+/*
+ * Append transport 'm' to the list of registered transports.
+ * No locking is done here.
+ */
+void
+p9_register_trans(struct p9_trans_module *m)
+{
+
+	TAILQ_INSERT_TAIL(&transports, m, link);
+}
+
+/*
+ * Remove transport 'm' from the list of registered transports.
+ * No locking is done here.
+ */
+void
+p9_unregister_trans(struct p9_trans_module *m)
+{
+
+	TAILQ_REMOVE(&transports, m, link);
+}
+
+/*
+ * Look up a registered transport by name.
+ *
+ * Returns the first entry whose 'name' compares equal to 'name' with
+ * strcmp(), or NULL when no registered transport matches.
+ */
+struct p9_trans_module *
+p9_get_trans_by_name(char *name)
+{
+	struct p9_trans_module *m;
+
+	TAILQ_FOREACH(m, &transports, link) {
+		if (strcmp(m->name, name) == 0)
+			return (m);
+	}
+	return (NULL);
+}
+
diff --git a/sys/fs/p9fs/p9_transport.h b/sys/fs/p9fs/p9_transport.h
new file mode 100644
index 000000000000..143c29f2382e
--- /dev/null
+++ b/sys/fs/p9fs/p9_transport.h
@@ -0,0 +1,53 @@
+/*-
+ * Copyright (c) 2017 Juniper Networks, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+/* Transport definitions */
+#ifndef FS_P9FS_P9_TRANSPORT_H
+#define FS_P9FS_P9_TRANSPORT_H
+
+#include <sys/queue.h>
+
+/* Only pointers to requests are used here, so a forward declaration suffices. */
+struct p9_req_t;
+
+/*
+ * Transport module interface: one instance describes one transport and
+ * supplies the callbacks used to drive it.
+ */
+struct p9_trans_module {
+	TAILQ_ENTRY(p9_trans_module) link;	/* tail-queue linkage */
+	char *name;	/* name of transport */
+	/* member function to create a new connection on this transport */
+	int (*create)(const char *mount_tag, void **handlep);
+	/* member function to terminate a connection on this transport */
+	void (*close) (void *handle);
+	/* member function to issue a request to the transport */
+	int (*request) (void *handle, struct p9_req_t *req);
+	/* member function to cancel a request if it has been sent */
+	int (*cancel) (void *handle, struct p9_req_t *req);
+};
+
+/* Transport registry: add, remove and look up transport modules. */
+void p9_register_trans(struct p9_trans_module *m);
+void p9_unregister_trans(struct p9_trans_module *m);
+struct p9_trans_module *p9_get_trans_by_name(char *s);
+
+#endif /* FS_P9FS_P9_TRANSPORT_H */
diff --git a/sys/fs/p9fs/p9fs.h b/sys/fs/p9fs/p9fs.h
new file mode 100644
index 000000000000..a270d8b5ce5f
--- /dev/null
+++ b/sys/fs/p9fs/p9fs.h
@@ -0,0 +1,203 @@
+/*-
+ * Copyright (c) 2017-2020 Juniper Networks, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1.
Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+/* This file has prototypes specific to the p9fs file system */
+
+#ifndef FS_P9FS_P9FS_H
+#define FS_P9FS_P9FS_H
+
+/* Forward declaration; the full definition appears later in this header. */
+struct p9fs_session;
+
+/* QID: Unique identification for the file being accessed */
+struct p9fs_qid {
+	uint8_t qid_mode;	/* file mode specifying the file type */
+	uint32_t qid_version;	/* version of the file */
+	uint64_t qid_path;	/* unique integer among all files in hierarchy */
+};
+
+/*
+ * The in memory representation of the on disk inode. Save the current
+ * fields to write it back later.
+ */
+struct p9fs_inode {
+	/* Make it simple first, Add more fields later */
+	uint64_t i_size;	/* size of the inode */
+	uint16_t i_type;	/* type of inode */
+	uint32_t i_dev;		/* type of device */
+	uint32_t i_mode;	/* mode of the inode */
+	uint32_t i_atime;	/* time of last access */
+	uint32_t i_mtime;	/* time of last modification */
+	uint32_t i_ctime;	/* time of last status change */
+	uint32_t i_atime_nsec;	/* time of last access in nanoseconds resolution */
+	uint32_t i_mtime_nsec;	/* time of last modification in nanoseconds resolution */
+	uint32_t i_ctime_nsec;	/* time of last status change in nanoseconds resolution */
+	uint64_t i_length;
+	char *i_name;		/* inode name */
+	char *i_uid;		/* inode user id */
+	char *i_gid;		/* inode group id */
+	char *i_muid;
+	char *i_extension;	/* 9p2000.u extensions */
+	uid_t n_uid;		/* 9p2000.u extensions */
+	gid_t n_gid;		/* 9p2000.u extensions */
+	uid_t n_muid;		/* 9p2000.u extensions */
+	/* bookkeeping info on the client. */
+	uint16_t i_links_count;	/* number of references to the inode */
+	uint64_t i_qid_path;	/* using inode number for reference. */
+	uint64_t i_flags;
+	uint64_t blksize;	/* block size for file system */
+	uint64_t blocks;	/* number of 512B blocks allocated */
+	uint64_t gen;		/* reserved for future use */
+	uint64_t data_version;	/* reserved for future use */
+
+};
+
+/* Helpers for the mutex that guards a node's vfid list. */
+#define P9FS_VFID_MTX(_sc)		(&(_sc)->vfid_mtx)
+#define P9FS_VFID_LOCK(_sc)		mtx_lock(P9FS_VFID_MTX(_sc))
+#define P9FS_VFID_UNLOCK(_sc)		mtx_unlock(P9FS_VFID_MTX(_sc))
+#define P9FS_VFID_LOCK_INIT(_sc)	mtx_init(P9FS_VFID_MTX(_sc), \
+    "VFID List lock", NULL, MTX_DEF)
+#define P9FS_VFID_LOCK_DESTROY(_sc)	mtx_destroy(P9FS_VFID_MTX(_sc))
+
+/* Helpers for the mutex that guards a node's vofid list. */
+#define P9FS_VOFID_MTX(_sc)		(&(_sc)->vofid_mtx)
+#define P9FS_VOFID_LOCK(_sc)		mtx_lock(P9FS_VOFID_MTX(_sc))
+#define P9FS_VOFID_UNLOCK(_sc)		mtx_unlock(P9FS_VOFID_MTX(_sc))
+#define P9FS_VOFID_LOCK_INIT(_sc)	mtx_init(P9FS_VOFID_MTX(_sc), \
+    "VOFID List lock", NULL, MTX_DEF)
+#define P9FS_VOFID_LOCK_DESTROY(_sc)	mtx_destroy(P9FS_VOFID_MTX(_sc))
+
+/* fid_type selectors: which of a node's two fid lists an operation targets. */
+#define VFID	0x01
+#define VOFID	0x02
+
+/* A Plan9 node. */
+struct p9fs_node {
+	STAILQ_HEAD( ,p9_fid) vfid_list;	/* vfid related to uid */
+	struct mtx vfid_mtx;			/* mutex for vfid list */
+	STAILQ_HEAD( ,p9_fid) vofid_list;	/* vofid related to uid */
+	struct mtx vofid_mtx;			/* mutex for vofid list */
+	struct p9fs_node *parent;		/* pointer to parent p9fs node */
+	struct p9fs_qid vqid;			/* the server qid, will be from the host */
+	struct vnode *v_node;			/* vnode for this fs_node. */
+	struct p9fs_inode inode;		/* in memory representation of ondisk information */
+	struct p9fs_session *p9fs_ses;		/* Session_ptr for this node */
+	STAILQ_ENTRY(p9fs_node) p9fs_node_next;
+	uint64_t flags;				/* node state bits, tested by IS_ROOT() */
+};
+
+#define P9FS_VTON(vp)	((struct p9fs_node *)(vp)->v_data)
+#define P9FS_NTOV(node)	((node)->v_node)
+#define VFSTOP9(mp)	((struct p9fs_mount *)(mp)->mnt_data)
+#define QEMU_DIRENTRY_SZ	25
+#define P9FS_NODE_MODIFIED	0x1 /* indicating file change */
+#define P9FS_ROOT		0x2 /* indicating root p9fs node */
+#define P9FS_NODE_DELETED	0x4 /* indicating file or directory delete */
+#define P9FS_NODE_IN_SESSION	0x8 /* p9fs_node is in the session - virt_node_list */
+/* The argument is parenthesized so any pointer expression can be passed. */
+#define IS_ROOT(node)	((node)->flags & P9FS_ROOT)
+
+/*
+ * Link-count helpers for struct p9fs_inode.  Each expands to a single
+ * statement; there is deliberately no line continuation after "while (0)",
+ * so the definition ends on that line.
+ */
+#define P9FS_SET_LINKS(inode) do {	\
+	(inode)->i_links_count = 1;	\
+} while (0)
+
+#define P9FS_INCR_LINKS(inode) do {	\
+	(inode)->i_links_count++;	\
+} while (0)
+
+#define P9FS_DECR_LINKS(inode) do {	\
+	(inode)->i_links_count--;	\
+} while (0)
+
+#define P9FS_CLR_LINKS(inode) do {	\
+	(inode)->i_links_count = 0;	\
+} while (0)
+
+/* Helpers for the mutex that guards a session's node list. */
+#define P9FS_MTX(_sc)		(&(_sc)->p9fs_mtx)
+#define P9FS_LOCK(_sc)		mtx_lock(P9FS_MTX(_sc))
+#define P9FS_UNLOCK(_sc)	mtx_unlock(P9FS_MTX(_sc))
+#define P9FS_LOCK_INIT(_sc)	mtx_init(P9FS_MTX(_sc), \
+    "P9FS session chain lock", NULL, MTX_DEF)
+#define P9FS_LOCK_DESTROY(_sc)	mtx_destroy(P9FS_MTX(_sc))
+
+/* Session structure for the FS */
+struct p9fs_session {
+	unsigned char flags;		/* P9FS_PROTO_* and P9_ACCESS_* bits for the session */
+	struct mount *p9fs_mount;	/* mount point */
+	struct p9fs_node rnp;		/* root p9fs node for this session */
+	uid_t uid;			/* the uid that has access */
+	const char *uname;		/* user name to mount as */
+	const char *aname;		/* name of remote file tree being mounted */
+	struct p9_client *clnt;		/* 9p client */
+	struct mtx p9fs_mtx;		/* mutex used for guarding the chain. */
+	STAILQ_HEAD( ,p9fs_node) virt_node_list; /* list of p9fs nodes in this session */
+	struct p9_fid *mnt_fid;		/* to save nobody 's fid for unmounting as root user */
+};
+
+/* Per-mount data; VFSTOP9() casts mnt_data to this type. */
+struct p9fs_mount {
+	struct p9fs_session p9fs_session;	/* per instance session information */
+	struct mount *p9fs_mountp;		/* mount point */
+	int mount_tag_len;			/* length of the mount tag */
+	char *mount_tag;			/* mount tag used */
+};
+
+/* All session flags based on 9p versions */
+enum virt_session_flags {
+	P9FS_PROTO_2000U = 0x01,
+	P9FS_PROTO_2000L = 0x02,
+};
+
+/* Session access flags */
+#define P9_ACCESS_ANY		0x04 /* single attach for all users */
+#define P9_ACCESS_SINGLE	0x08 /* access to only the user who mounts */
+#define P9_ACCESS_USER		0x10 /* new attach established for every user */
+#define P9_ACCESS_MASK		(P9_ACCESS_ANY|P9_ACCESS_SINGLE|P9_ACCESS_USER)
+
+u_quad_t p9fs_round_filesize_to_bytes(uint64_t filesize, uint64_t bsize);
+u_quad_t p9fs_pow2_filesize_to_bytes(uint64_t filesize, uint64_t bsize);
+
+/* These are all the P9FS specific vops */
+int p9fs_stat_vnode_l(void);
+int p9fs_stat_vnode_dotl(struct p9_stat_dotl *st, struct vnode *vp);
+int p9fs_reload_stats_dotl(struct vnode *vp, struct ucred *cred);
+int p9fs_proto_dotl(struct p9fs_session *vses);
+struct p9_fid *p9fs_init_session(struct mount *mp, int *error);
+void p9fs_close_session(struct mount *mp);
+void p9fs_prepare_to_close(struct mount *mp);
+void p9fs_complete_close(struct mount *mp);
+int p9fs_vget(struct mount *mp, ino_t ino, int flags, struct vnode **vpp);
+int p9fs_vget_common(struct mount *mp, struct p9fs_node *np, int flags,
+    struct p9fs_node *parent, struct p9_fid *fid, struct vnode **vpp,
+    char *name);
+int p9fs_node_cmp(struct vnode *vp, void *arg);
+void p9fs_destroy_node(struct p9fs_node **npp);
+void p9fs_dispose_node(struct p9fs_node **npp);
+void p9fs_cleanup(struct p9fs_node *vp);
+void p9fs_fid_remove_all(struct p9fs_node *np, int leave_ofids);
+void p9fs_fid_remove(struct p9fs_node *np, struct p9_fid *vfid,
+    int fid_type);
+void p9fs_fid_add(struct p9fs_node *np, struct p9_fid *fid,
+    int fid_type);
+struct p9_fid *p9fs_get_fid(struct p9_client *clnt,
+    struct p9fs_node *np, struct ucred *cred, int fid_type, int mode, int *error);
+
+#endif /* FS_P9FS_P9FS_H */
diff --git a/sys/fs/p9fs/p9fs_proto.h b/sys/fs/p9fs/p9fs_proto.h
new file mode 100644
index 000000000000..d78caa686f36
--- /dev/null
+++ b/sys/fs/p9fs/p9fs_proto.h
@@ -0,0 +1,42 @@
+/*-
+ * Copyright (c) 2017 Juniper Networks, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+/*
+ * Plan9 filesystem (9P2000.u) protocol definitions.
+ */
+
+#ifndef FS_P9FS_P9FS_PROTO_H
+#define FS_P9FS_P9FS_PROTO_H
+
+//#include <dev/virtio/virtio_fs_9p.h>
+
+/*
+ * File open modes (not permission bits).  The values are the same as
+ * P9PROTO_OREAD, P9PROTO_OWRITE, P9PROTO_ORDWR, P9PROTO_OEXEC and
+ * P9PROTO_OTRUNC in enum p9_open_mode_t.
+ */
+#define P9FS_OREAD	0
+#define P9FS_OWRITE	1
+#define P9FS_ORDWR	2
+#define P9FS_OEXEC	3
+#define P9FS_OTRUNC	0x10
+
+#endif /* FS_P9FS_P9FS_PROTO_H */
diff --git a/sys/fs/p9fs/p9fs_subr.c b/sys/fs/p9fs/p9fs_subr.c
new file mode 100644
index 000000000000..d0f04f6c5e97
--- /dev/null
+++ b/sys/fs/p9fs/p9fs_subr.c
@@ -0,0 +1,411 @@
+/*-
+ * Copyright (c) 2017 Juniper Networks, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+/*-
+ * 9P filesystem subroutines. This file consists of all the Non VFS subroutines.
+ * It contains all of the functions related to the driver submission which form
+ * the upper layer i.e, p9fs driver. This will interact with the client to make
+ * sure we have correct API calls in the header.
+ */
+
+#include <sys/cdefs.h>
+#include <sys/systm.h>
+#include <sys/limits.h>
+#include <sys/mount.h>
+#include <sys/sysctl.h>
+#include <sys/vnode.h>
+
+#include "p9fs_proto.h"
+
+#include <fs/p9fs/p9_client.h>
+#include <fs/p9fs/p9_debug.h>
+#include <fs/p9fs/p9_protocol.h>
+#include <fs/p9fs/p9fs.h>
+
+/* Return non-zero when the session has its P9FS_PROTO_2000L flag set. */
+int
+p9fs_proto_dotl(struct p9fs_session *vses)
+{
+
+	return (vses->flags & P9FS_PROTO_2000L);
+}
+
+/*
+ * Initialize a p9fs session.
+ *
+ * Fills in the session embedded in the mount's p9fs_mount with the default
+ * uid, user name and attach name, creates the 9P client, records the
+ * protocol variant and the "access" mount option in the session flags and
+ * attaches to the server.
+ *
+ * Returns the fid obtained from the attach, which is also saved in
+ * vses->mnt_fid.  On failure NULL is returned; for failures after the client
+ * was created, *error is set and the client is destroyed again.
+ * NOTE(review): when p9_client_create() itself fails, *error is presumably
+ * set by that function -- confirm.
+ */
+struct p9_fid *
+p9fs_init_session(struct mount *mp, int *error)
+{
+	struct p9fs_session *vses;
+	struct p9fs_mount *virtmp;
+	struct p9_fid *fid;
+	char *access;
+
+	virtmp = VFSTOP9(mp);
+	vses = &virtmp->p9fs_session;
+	vses->uid = P9_NONUNAME;
+	vses->uname = P9_DEFUNAME;
+	vses->aname = P9_DEFANAME;
+
+	/*
+	 * Create the client structure. Call into the driver to create
+	 * driver structures for the actual IO transfer.
+	 */
+	vses->clnt = p9_client_create(mp, error, virtmp->mount_tag);
+
+	if (vses->clnt == NULL) {
+		P9_DEBUG(ERROR, "%s: p9_client_create failed\n", __func__);
+		return (NULL);
+	}
+	/*
+	 * Find the client version and cache the copy. We will use this copy
+	 * throughout FS layer.
+	 */
+	if (p9_is_proto_dotl(vses->clnt))
+		vses->flags |= P9FS_PROTO_2000L;
+	else if (p9_is_proto_dotu(vses->clnt))
+		vses->flags |= P9FS_PROTO_2000U;
+
+	/*
+	 * Set the access mode.  A missing "access" option selects per-user
+	 * access, the same as an explicit "user".
+	 */
+	access = vfs_getopts(mp->mnt_optnew, "access", error);
+	if (access == NULL)
+		vses->flags |= P9_ACCESS_USER;
+	else if (!strcmp(access, "any"))
+		vses->flags |= P9_ACCESS_ANY;
+	else if (!strcmp(access, "single"))
+		vses->flags |= P9_ACCESS_SINGLE;
+	else if (!strcmp(access, "user"))
+		vses->flags |= P9_ACCESS_USER;
+	else {
+		P9_DEBUG(ERROR, "%s: unknown access mode\n", __func__);
+		*error = EINVAL;
+		goto out;
+	}
+
+	/* Discard whatever the option lookup stored in *error. */
+	*error = 0;
+	/* Attach with the backend host */
+	fid = p9_client_attach(vses->clnt, NULL, vses->uname, P9_NONUNAME,
+	    vses->aname, error);
+	/* Saved before the error check, so mnt_fid is written on failure too. */
+	vses->mnt_fid = fid;
+
+	if (*error != 0) {
+		P9_DEBUG(ERROR, "%s: attach failed: %d\n", __func__, *error);
+		goto out;
+	}
+	P9_DEBUG(SUBR, "%s: attach successful fid :%p\n", __func__, fid);
+	fid->uid = vses->uid;
+
+	/* initialize the node list for the session */
+	STAILQ_INIT(&vses->virt_node_list);
+	P9FS_LOCK_INIT(vses);
+
+	P9_DEBUG(SUBR, "%s: INIT session successful\n", __func__);
+
+	return (fid);
+out:
+	/* Failure after the client was created: tear it down again. */
+	p9_client_destroy(vses->clnt);
+	return (NULL);
+}
+
+/*
+ * Begin to terminate a session: drop the vnode reference each node holds on
+ * its parent, then tell the client that a disconnect is starting.
+ */
+void
+p9fs_prepare_to_close(struct mount *mp)
+{
+	struct p9fs_session *vses;
+	struct p9fs_mount *vmp;
+	struct p9fs_node *np, *pnp, *tmp;
+
+	vmp = VFSTOP9(mp);
+	vses = &vmp->p9fs_session;
+
+	/*
+	 * Break the node->parent references.  Nodes without a parent and
+	 * nodes that are their own parent (presumably the root -- confirm)
+	 * are skipped.
+	 */
+	STAILQ_FOREACH_SAFE(np, &vses->virt_node_list, p9fs_node_next, tmp) {
+		if (np->parent && np->parent != np) {
+			pnp = np->parent;
+			np->parent = NULL;
+			vrele(P9FS_NTOV(pnp));
+		}
+	}
+
+	/* We are about to tear down; nothing other than clunk is allowed after this. */
+	p9_client_begin_disconnect(vses->clnt);
+}
+
+/* Shutdown a session: finish the disconnect of the session's client. */
+void
+p9fs_complete_close(struct mount *mp)
+{
+	struct p9fs_session *vses;
+	struct p9fs_mount *vmp;
+
+	vmp = VFSTOP9(mp);
+	vses = &vmp->p9fs_session;
+
+	/* Finish the close */
+	p9_client_disconnect(vses->clnt);
+}
+
+
+/*
+ * Call from unmount. Close the session: complete the disconnect, destroy the
+ * client and destroy the session mutex.
+ */
+void
+p9fs_close_session(struct mount *mp)
+{
+	struct p9fs_session *vses;
+	struct p9fs_mount *vmp;
+
+	vmp = VFSTOP9(mp);
+	vses = &vmp->p9fs_session;
+
+	p9fs_complete_close(mp);
+	/* Clean up the clnt structure. */
+	p9_client_destroy(vses->clnt);
+	P9FS_LOCK_DESTROY(vses);
+	P9_DEBUG(SUBR, "%s: Clean close session .\n", __func__);
+}
+
+/*
+ * Remove every fid on the node's vfid list and clunk it.  Unless
+ * leave_ofids is non-zero, do the same for the vofid list.
+ * NOTE(review): the lists are walked without taking the VFID/VOFID mutexes.
+ */
+void
+p9fs_fid_remove_all(struct p9fs_node *np, int leave_ofids)
+{
+	struct p9_fid *fid, *tfid;
+
+	STAILQ_FOREACH_SAFE(fid, &np->vfid_list, fid_next, tfid) {
+		STAILQ_REMOVE(&np->vfid_list, fid, p9_fid, fid_next);
+		p9_client_clunk(fid);
+	}
+
+	if (!leave_ofids) {
+		STAILQ_FOREACH_SAFE(fid, &np->vofid_list, fid_next, tfid) {
+			STAILQ_REMOVE(&np->vofid_list, fid, p9_fid, fid_next);
+			p9_client_clunk(fid);
+		}
+	}
+}
+
+
+/*
+ * Remove a fid from its corresponding fid list, under that list's mutex.
+ * The fid itself is not clunked.  Any fid_type other than VFID or VOFID is
+ * silently ignored.
+ */
+void
+p9fs_fid_remove(struct p9fs_node *np, struct p9_fid *fid, int fid_type)
+{
+
+	switch (fid_type) {
+	case VFID:
+		P9FS_VFID_LOCK(np);
+		STAILQ_REMOVE(&np->vfid_list, fid, p9_fid, fid_next);
+		P9FS_VFID_UNLOCK(np);
+		break;
+	case VOFID:
+		P9FS_VOFID_LOCK(np);
+		STAILQ_REMOVE(&np->vofid_list, fid, p9_fid, fid_next);
+		P9FS_VOFID_UNLOCK(np);
+		break;
+	}
+}
+
+/*
+ * Add a fid to the tail of the corresponding fid list, under that list's
+ * mutex.  Any fid_type other than VFID or VOFID is silently ignored.
+ */
+void
+p9fs_fid_add(struct p9fs_node *np, struct p9_fid *fid, int fid_type)
+{
+
+	switch (fid_type) {
+	case VFID:
+		P9FS_VFID_LOCK(np);
+		STAILQ_INSERT_TAIL(&np->vfid_list, fid, fid_next);
+		P9FS_VFID_UNLOCK(np);
+		break;
+	case VOFID:
+		P9FS_VOFID_LOCK(np);
+		STAILQ_INSERT_TAIL(&np->vofid_list, fid, fid_next);
+		P9FS_VOFID_UNLOCK(np);
+		break;
+	}
+}
+
+/*
+ * Build the path from root to current directory.
+ *
+ * Counts the nodes between np and the root by following the parent links,
+ * then stores an array of that many name pointers, ordered from the
+ * component just below the root down to np itself, in *names.  The array is
+ * allocated from M_TEMP and is released by the caller; the strings are the
+ * nodes' own inode.i_name pointers, not copies.
+ *
+ * Returns the number of components.  Returns 0, leaving *names untouched,
+ * when the parent chain ends before a root node is reached.
+ */
+static int
+p9fs_get_full_path(struct p9fs_node *np, char ***names)
+{
+	int i, n;
+	struct p9fs_node *node;
+	char **wnames;
+
+	n = 0;
+	for (node = np ; (node != NULL) && !IS_ROOT(node) ; node = node->parent)
+		n++;
+
+	if (node == NULL)
+		return (0);
+
+	wnames = malloc(n * sizeof(char *), M_TEMP, M_ZERO|M_WAITOK);
+
+	/* Fill from the last slot backwards while walking up towards the root. */
+	for (i = n-1, node = np; i >= 0 ; i--, node = node->parent)
+		wnames[i] = node->inode.i_name;
+
+	*names = wnames;
+	return (n);
+}
+
+/*
+ * Return TRUE if this fid can be used for the requested mode.
+ */
+static int
+p9fs_compatible_mode(struct p9_fid *fid, int mode)
+{
+	/*
+	 * Return TRUE for an exact match. For OREAD and OWRITE, allow
+	 * existing ORDWR fids to match. Only the low two bits of the
+	 * fid's mode are checked; the requested mode is compared as
+	 * passed in.
+	 *
+	 * TODO: figure out if this is correct for O_APPEND
+	 */
+	int fid_mode = fid->mode & 3;
+	if (fid_mode == mode)
+		return (TRUE);
+	if (fid_mode == P9PROTO_ORDWR)
+		return (mode == P9PROTO_OREAD || mode == P9PROTO_OWRITE);
+	return (FALSE);
+}
+
+/*
+ * Retrieve fid structure corresponding to a particular
+ * uid and fid type for a p9fs node.
+ *
+ * For VFID the first fid owned by uid is returned.  For VOFID the fid must
+ * also pass p9fs_compatible_mode() for the requested mode; mode is not
+ * looked at for VFID.  The list mutex is held only during the search.
+ * Returns NULL when nothing matches or fid_type is neither VFID nor VOFID.
+ */
+static struct p9_fid *
+p9fs_get_fid_from_uid(struct p9fs_node *np, uid_t uid, int fid_type, int mode)
+{
+	struct p9_fid *fid;
+
+	switch (fid_type) {
+	case VFID:
+		P9FS_VFID_LOCK(np);
+		STAILQ_FOREACH(fid, &np->vfid_list, fid_next) {
+			if (fid->uid == uid) {
+				P9FS_VFID_UNLOCK(np);
+				return (fid);
+			}
+		}
+		P9FS_VFID_UNLOCK(np);
+		break;
+	case VOFID:
+		P9FS_VOFID_LOCK(np);
+		STAILQ_FOREACH(fid, &np->vofid_list, fid_next) {
+			if (fid->uid == uid && p9fs_compatible_mode(fid, mode)) {
+				P9FS_VOFID_UNLOCK(np);
+				return (fid);
+			}
+		}
+		P9FS_VOFID_UNLOCK(np);
+		break;
+	}
+
+	return (NULL);
+}
+
+/*
+ * Function returns the fid structure for a file corresponding to current user id.
+ * First it searches in the fid list of the corresponding p9fs node.
+ * New fid will be created if not already present and added in the corresponding
+ * fid list in the p9fs node.
+ * If the user is not already attached then this will attach the user first
+ * and then create a new fid for this particular file by doing dir walk.
+ */
struct p9_fid *
p9fs_get_fid(struct p9_client *clnt, struct p9fs_node *np, struct ucred *cred,
    int fid_type, int mode, int *error)
{
	uid_t uid;
	struct p9_fid *fid, *oldfid;
	struct p9fs_node *root;
	struct p9fs_session *vses;
	int i, l, clone;
	char **wnames = NULL;
	uint16_t nwnames;

	oldfid = NULL;
	vses = np->p9fs_ses;

	/* Pick the uid whose fid we are looking for. */
	if (vses->flags & P9_ACCESS_ANY)
		uid = vses->uid;
	else if (cred)
		uid = cred->cr_uid;
	else
		uid = 0;

	/*
	 * Search for the fid in corresponding fid list.
	 * We should return NULL for VOFID if it is not present in the list.
	 * Because VOFID should have been created during the file open.
	 * If VFID is not present in the list then we should create one.
	 */
	fid = p9fs_get_fid_from_uid(np, uid, fid_type, mode);
	if (fid != NULL || fid_type == VOFID)
		return (fid);

	/* Check root if the user is attached */
	root = &np->p9fs_ses->rnp;
	fid = p9fs_get_fid_from_uid(root, uid, fid_type, mode);
	if (fid == NULL) {
		/* Attach the user */
		fid = p9_client_attach(clnt, NULL, NULL, uid,
		    vses->aname, error);
		if (*error != 0)
			return (NULL);
		p9fs_fid_add(root, fid, fid_type);
	}

	/* If we are looking for root then return it */
	if (IS_ROOT(np))
		return (fid);

	/* Get full path from root to p9fs node */
	nwnames = p9fs_get_full_path(np, &wnames);

	/*
	 * Could not get full path.
	 * If p9fs node is not deleted, parent should exist.
	 */
	KASSERT(nwnames != 0, ("%s: Directory of %s doesn't exist", __func__, np->inode.i_name));

	/*
	 * Without INVARIANTS the assertion above compiles away.  Falling
	 * through with an empty path would insert the root's fid, which is
	 * already linked on the root's list, into np's list as well, so
	 * fail the request instead.
	 */
	if (nwnames == 0) {
		*error = ENOENT;
		return (NULL);
	}

	/*
	 * Walk from the root fid down to the node, at most P9_MAXWELEM
	 * components per request.  Only the first walk clones the starting
	 * fid; later rounds advance the cloned fid in place.
	 */
	clone = 1;
	i = 0;
	while (i < nwnames) {
		l = MIN(nwnames - i, P9_MAXWELEM);

		/* Each round must continue at component i, not restart at 0. */
		fid = p9_client_walk(fid, l, &wnames[i], clone, error);
		if (*error != 0) {
			if (oldfid)
				p9_client_clunk(oldfid);
			fid = NULL;
			goto bail_out;
		}
		oldfid = fid;
		clone = 0;
		i += l;
	}
	p9fs_fid_add(np, fid, fid_type);
bail_out:
	/* The array only aliases the nodes' names; free the array alone. */
	free(wnames, M_TEMP);
	return (fid);
}
diff --git a/sys/fs/p9fs/p9fs_vfsops.c b/sys/fs/p9fs/p9fs_vfsops.c new file mode 100644 index 000000000000..3451bc052187 --- /dev/null +++ b/sys/fs/p9fs/p9fs_vfsops.c @@ -0,0 +1,610 @@ +/*- + * Copyright (c) 2017-2020 Juniper Networks, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + */ + +/* + * This file consists of all the VFS interactions of VFS ops which include + * mount, unmount, initialize etc. for p9fs. + */ + +#include <sys/cdefs.h> +#include <sys/systm.h> +#include <sys/fnv_hash.h> +#include <sys/mount.h> +#include <sys/sysctl.h> +#include <sys/vnode.h> +#include <sys/buf.h> +#include <vm/uma.h> + +#include <fs/p9fs/p9fs_proto.h> +#include <fs/p9fs/p9_client.h> +#include <fs/p9fs/p9_debug.h> +#include <fs/p9fs/p9fs.h> + +SYSCTL_NODE(_vfs, OID_AUTO, p9fs, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, + "Plan 9 filesystem"); + +/* This count is static now. Can be made tunable later */ +#define P9FS_FLUSH_RETRIES 10 + +static MALLOC_DEFINE(M_P9MNT, "p9fs_mount", "Mount structures for p9fs"); +static uma_zone_t p9fs_node_zone; +uma_zone_t p9fs_io_buffer_zone; +uma_zone_t p9fs_getattr_zone; +uma_zone_t p9fs_setattr_zone; +uma_zone_t p9fs_pbuf_zone; +extern struct vop_vector p9fs_vnops; + +/* option parsing */ +static const char *p9fs_opts[] = { + "from", "trans", "access", NULL +}; +

/*
 * Dispose of a p9fs node: drop the reference held on the parent's vnode,
 * detach the node from its vnode and, unless the vnode is the root, free
 * the name and return the node to the UMA zone.  *npp is set to NULL.
 */
void
p9fs_dispose_node(struct p9fs_node **npp)
{
	struct p9fs_node *np;
	struct vnode *vp;

	np = *npp;
	if (np == NULL)
		return;

	/* A node that is its own parent holds no parent reference. */
	if (np->parent && np->parent != np)
		vrele(P9FS_NTOV(np->parent));

	P9_DEBUG(VOPS, "%s: node: %p\n", __func__, *npp);

	vp = P9FS_NTOV(np);
	vp->v_data = NULL;

	/* Free our associated memory */
	if (!(vp->v_vflag & VV_ROOT)) {
		free(np->inode.i_name, M_TEMP);
		uma_zfree(p9fs_node_zone, np);
	}

	*npp = NULL;
}

/* Create the UMA zones used by p9fs. */
static int
p9fs_init(struct vfsconf *vfsp)
{

	/* Zone for the p9fs nodes */
	p9fs_node_zone = uma_zcreate("p9fs node zone",
	    sizeof(struct p9fs_node), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);

	/* Zone for the getattr_dotl results */
	p9fs_getattr_zone = uma_zcreate("p9fs getattr zone",
	    sizeof(struct p9_stat_dotl), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);

	/* Zone for the setattr_dotl requests */
	p9fs_setattr_zone = uma_zcreate("p9fs setattr zone",
	    sizeof(struct p9_iattr_dotl), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);

	/* Zone for the putpages buffers */
	p9fs_pbuf_zone = pbuf_zsecond_create("p9fs pbuf zone", nswbuf / 2);

	/*
	 * Create the io_buffer zone pool to keep things simpler in case of
	 * multiple threads. Each thread works with its own so there is no
	 * contention.
	 */
	p9fs_io_buffer_zone = uma_zcreate("p9fs io_buffer zone",
	    P9FS_MTU, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);

	return (0);
}

/* Destroy the UMA zones created by p9fs_init(). */
static int
p9fs_uninit(struct vfsconf *vfsp)
{

	uma_zdestroy(p9fs_node_zone);
	uma_zdestroy(p9fs_io_buffer_zone);
	uma_zdestroy(p9fs_getattr_zone);
	uma_zdestroy(p9fs_setattr_zone);
	uma_zdestroy(p9fs_pbuf_zone);

	return (0);
}

/*
 * Unmount p9fs.
 *
 * Flushes the vnodes of the mount, retrying up to P9FS_FLUSH_RETRIES
 * times when MNT_FORCE is given, then closes the session and frees the
 * mount structure.  If the flush fails, the transport status is set back
 * to P9FS_CONNECT and the error is returned.
 */
static int
p9fs_unmount(struct mount *mp, int mntflags)
{
	struct p9fs_mount *vmp;
	struct p9fs_session *vses;
	int attempt, error, flags;

	vmp = VFSTOP9(mp);
	if (vmp == NULL)
		return (0);

	vses = &vmp->p9fs_session;
	error = 0;
	flags = (mntflags & MNT_FORCE) ? FORCECLOSE : 0;

	p9fs_prepare_to_close(mp);
	for (attempt = 0; attempt < P9FS_FLUSH_RETRIES; attempt++) {
		/* Flush everything on this mount point.*/
		error = vflush(mp, 1, flags, curthread);

		/* Only a forced unmount retries a failed flush. */
		if (error == 0 || (mntflags & MNT_FORCE) == 0)
			break;
		/* Sleep until interrupted or 1 tick expires. */
		error = tsleep(&error, PSOCK, "p9unmnt", 1);
		if (error == EINTR)
			break;
		error = EBUSY;
	}

	if (error != 0) {
		/* Restore the flag in case of error */
		vses->clnt->trans_status = P9FS_CONNECT;
		return (error);
	}

	p9fs_close_session(mp);
	/* Cleanup the mount structure. */
	free(vmp, M_P9MNT);
	mp->mnt_data = NULL;
	return (0);
}

/*
 * Compare the qid stored in the p9fs node of vp with the qid in arg.
 * Return 0 on a match and 1 otherwise.  For the root vnode only the
 * qid path is compared; other vnodes must also match type and version.
 */
int
p9fs_node_cmp(struct vnode *vp, void *arg)
{
	struct p9fs_node *np = vp->v_data;
	struct p9_qid *qid = (struct p9_qid *)arg;

	if (np == NULL || np->vqid.qid_path != qid->path)
		return (1);

	if (vp->v_vflag & VV_ROOT)
		return (0);

	if (np->vqid.qid_mode == qid->type &&
	    np->vqid.qid_version == qid->version)
		return (0);

	return (1);
}

/*
 * Cleanup p9fs node
 *  - Destroy the FID LIST locks
 *  - Dispose all node knowledge
 */
void
p9fs_destroy_node(struct p9fs_node **npp)
{
	struct p9fs_node *np = *npp;

	if (np == NULL)
		return;

	/* Destroy the FID LIST locks */
	P9FS_VFID_LOCK_DESTROY(np);
	P9FS_VOFID_LOCK_DESTROY(np);

	/* Dispose all node knowledge.*/
	p9fs_dispose_node(&np);
}

/* + * Common code used across p9fs to return vnode for the file represented + * by the fid. + * Lookup for the vnode in hash_list. This lookup is based on the qid path + * which is unique to a file. p9fs_node_cmp is called in this lookup process. + * I. If the vnode we are looking for is found in the hash list + * 1. Check if the vnode is a valid vnode by reloading its stats + * a. if the reloading of the vnode stats returns error then remove the + * vnode from hash list and return + * b. If reloading of vnode stats returns without any error then, clunk the + * new fid which was created for the vnode as we know that the vnode + * already has a fid associated with it and return the vnode. + * This is to avoid fid leaks + * II. 
If vnode is not found in the hash list then, create new vnode, p9fs + * node and return the vnode + */ +int +p9fs_vget_common(struct mount *mp, struct p9fs_node *np, int flags, + struct p9fs_node *parent, struct p9_fid *fid, struct vnode **vpp, + char *name) +{ + struct p9fs_mount *vmp; + struct p9fs_session *vses; + struct vnode *vp; + struct p9fs_node *node; + struct thread *td; + uint32_t hash; + int error, error_reload = 0; + struct p9fs_inode *inode; + + td = curthread; + vmp = VFSTOP9(mp); + vses = &vmp->p9fs_session; + + /* Look for vp in the hash_list */ + hash = fnv_32_buf(&fid->qid.path, sizeof(uint64_t), FNV1_32_INIT); + error = vfs_hash_get(mp, hash, flags, td, &vp, p9fs_node_cmp, + &fid->qid); + if (error != 0) + return (error); + else if (vp != NULL) { + if (vp->v_vflag & VV_ROOT) { + if (np == NULL) + p9_client_clunk(fid); + *vpp = vp; + return (0); + } + error = p9fs_reload_stats_dotl(vp, curthread->td_ucred); + if (error != 0) { + node = vp->v_data; + /* Remove stale vnode from hash list */ + vfs_hash_remove(vp); + node->flags |= P9FS_NODE_DELETED; + + vput(vp); + *vpp = NULLVP; + vp = NULL; + } else { + *vpp = vp; + /* Clunk the new fid if not root */ + p9_client_clunk(fid); + return (0); + } + } + + /* + * We must promote to an exclusive lock for vnode creation. This + * can happen if lookup is passed LOCKSHARED. + */ + if ((flags & LK_TYPE_MASK) == LK_SHARED) { + flags &= ~LK_TYPE_MASK; + flags |= LK_EXCLUSIVE; + } + + /* Allocate a new vnode. */ + if ((error = getnewvnode("p9fs", mp, &p9fs_vnops, &vp)) != 0) { + *vpp = NULLVP; + P9_DEBUG(ERROR, "%s: getnewvnode failed: %d\n", __func__, error); + return (error); + } + + /* If we dont have it, create one. 
*/ + if (np == NULL) { + np = uma_zalloc(p9fs_node_zone, M_WAITOK | M_ZERO); + /* Initialize the VFID list */ + P9FS_VFID_LOCK_INIT(np); + STAILQ_INIT(&np->vfid_list); + p9fs_fid_add(np, fid, VFID); + + /* Initialize the VOFID list */ + P9FS_VOFID_LOCK_INIT(np); + STAILQ_INIT(&np->vofid_list); + + vref(P9FS_NTOV(parent)); + np->parent = parent; + np->p9fs_ses = vses; /* Map the current session */ + inode = &np->inode; + /*Fill the name of the file in inode */ + inode->i_name = malloc(strlen(name)+1, M_TEMP, M_NOWAIT | M_ZERO); + strlcpy(inode->i_name, name, strlen(name)+1); + } else { + vp->v_type = VDIR; /* root vp is a directory */ + vp->v_vflag |= VV_ROOT; + vref(vp); /* Increment a reference on root vnode during mount */ + } + + vp->v_data = np; + np->v_node = vp; + inode = &np->inode; + inode->i_qid_path = fid->qid.path; + P9FS_SET_LINKS(inode); + + lockmgr(vp->v_vnlock, LK_EXCLUSIVE, NULL); + if (vp->v_type != VFIFO) + VN_LOCK_ASHARE(vp); + error = insmntque(vp, mp); + if (error != 0) { + /* + * vput(vp) is already called from insmntque_stddtr(). + * Just goto 'out' to dispose the node. + */ + goto out; + } + + /* Init the vnode with the disk info*/ + error = p9fs_reload_stats_dotl(vp, curthread->td_ucred); + if (error != 0) { + error_reload = 1; + goto out; + } + + error = vfs_hash_insert(vp, hash, flags, td, vpp, + p9fs_node_cmp, &fid->qid); + if (error != 0) { + goto out; + } + + if (*vpp == NULL) { + P9FS_LOCK(vses); + STAILQ_INSERT_TAIL(&vses->virt_node_list, np, p9fs_node_next); + np->flags |= P9FS_NODE_IN_SESSION; + P9FS_UNLOCK(vses); + + *vpp = vp; + } else { + /* + * Returning matching vp found in hashlist. + * So cleanup the np allocated above in this context. 
+ */ + if (!IS_ROOT(np)) { + p9fs_destroy_node(&np); + } + } + + return (0); +out: + /* Something went wrong, dispose the node */ + if (!IS_ROOT(np)) { + p9fs_destroy_node(&np); + } + + if (error_reload) { + vput(vp); + } + + *vpp = NULLVP; + return (error); +} + +/* Main mount function for 9pfs */ +static int +p9_mount(struct mount *mp) +{ + struct p9_fid *fid; + struct p9fs_mount *vmp; + struct p9fs_session *vses; + struct p9fs_node *p9fs_root; + int error; + char *from; + int len; + + /* Verify the validity of mount options */ + if (vfs_filteropt(mp->mnt_optnew, p9fs_opts)) + return (EINVAL); + + /* Extract NULL terminated mount tag from mount options */ + error = vfs_getopt(mp->mnt_optnew, "from", (void **)&from, &len); + if (error != 0 || from[len - 1] != '\0') + return (EINVAL); + + /* Allocate and initialize the private mount structure. */ + vmp = malloc(sizeof (struct p9fs_mount), M_P9MNT, M_WAITOK | M_ZERO); + mp->mnt_data = vmp; + vmp->p9fs_mountp = mp; + vmp->mount_tag = from; + vmp->mount_tag_len = len; + vses = &vmp->p9fs_session; + vses->p9fs_mount = mp; + p9fs_root = &vses->rnp; + /* Hardware iosize from the Qemu */ + mp->mnt_iosize_max = PAGE_SIZE; + /* + * Init the session for the p9fs root. This creates a new root fid and + * attaches the client and server. + */ + fid = p9fs_init_session(mp, &error); + if (fid == NULL) { + goto out; + } + + P9FS_VFID_LOCK_INIT(p9fs_root); + STAILQ_INIT(&p9fs_root->vfid_list); + p9fs_fid_add(p9fs_root, fid, VFID); + P9FS_VOFID_LOCK_INIT(p9fs_root); + STAILQ_INIT(&p9fs_root->vofid_list); + p9fs_root->parent = p9fs_root; + p9fs_root->flags |= P9FS_ROOT; + p9fs_root->p9fs_ses = vses; + vfs_getnewfsid(mp); + strlcpy(mp->mnt_stat.f_mntfromname, from, + sizeof(mp->mnt_stat.f_mntfromname)); + MNT_ILOCK(mp); + mp->mnt_flag |= MNT_LOCAL; + mp->mnt_kern_flag |= MNTK_LOOKUP_SHARED | MNTK_EXTENDED_SHARED; + MNT_IUNLOCK(mp); + P9_DEBUG(VOPS, "%s: Mount successful\n", __func__); + /* Mount structures created. 
*/ + + return (0); +out: + P9_DEBUG(ERROR, "%s: Mount Failed \n", __func__); + if (vmp != NULL) { + free(vmp, M_P9MNT); + mp->mnt_data = NULL; + } + return (error); +} + +/* Mount entry point */ +static int +p9fs_mount(struct mount *mp) +{ + int error; + + /* + * Minimal support for MNT_UPDATE - allow changing from + * readonly. + */ + if (mp->mnt_flag & MNT_UPDATE) { + if ((mp->mnt_flag & MNT_RDONLY) && !vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0)) { + mp->mnt_flag &= ~MNT_RDONLY; + } + return (0); + } + + error = p9_mount(mp); + if (error != 0) + (void) p9fs_unmount(mp, MNT_FORCE); + + return (error); +} + +/* + * Retrieve the root vnode of this mount. After filesystem is mounted, the root + * vnode is created for the first time. Subsequent calls to p9fs root will + * return the same vnode created during mount. + */ +static int +p9fs_root(struct mount *mp, int lkflags, struct vnode **vpp) +{ + struct p9fs_mount *vmp; + struct p9fs_node *np; + struct p9_client *clnt; + struct p9_fid *vfid; + int error; + + vmp = VFSTOP9(mp); + np = &vmp->p9fs_session.rnp; + clnt = vmp->p9fs_session.clnt; + error = 0; + + P9_DEBUG(VOPS, "%s: node=%p name=%s\n",__func__, np, np->inode.i_name); + + vfid = p9fs_get_fid(clnt, np, curthread->td_ucred, VFID, -1, &error); + + if (error != 0) { + /* for root use the nobody user's fid as vfid. 
+ * This is used while unmounting as root when non-root + * user has mounted p9fs + */ + if (vfid == NULL && clnt->trans_status == P9FS_BEGIN_DISCONNECT) + vfid = vmp->p9fs_session.mnt_fid; + else { + *vpp = NULLVP; + return (error); + } + } + + error = p9fs_vget_common(mp, np, lkflags, np, vfid, vpp, NULL); + if (error != 0) { + *vpp = NULLVP; + return (error); + } + np->v_node = *vpp; + return (error); +} + +/* Retrieve the file system statistics */ +static int +p9fs_statfs(struct mount *mp __unused, struct statfs *buf) +{ + struct p9fs_mount *vmp; + struct p9fs_node *np; + struct p9_client *clnt; + struct p9_fid *vfid; + struct p9_statfs statfs; + int res, error; + + vmp = VFSTOP9(mp); + np = &vmp->p9fs_session.rnp; + clnt = vmp->p9fs_session.clnt; + error = 0; + + vfid = p9fs_get_fid(clnt, np, curthread->td_ucred, VFID, -1, &error); + if (error != 0) { + return (error); + } + + res = p9_client_statfs(vfid, &statfs); + + if (res == 0) { + buf->f_type = statfs.type; + /* + * We have a limit of 4k irrespective of what the + * Qemu server can do. 
+ */ + if (statfs.bsize > PAGE_SIZE) + buf->f_bsize = PAGE_SIZE; + else + buf->f_bsize = statfs.bsize; + + buf->f_iosize = buf->f_bsize; + buf->f_blocks = statfs.blocks; + buf->f_bfree = statfs.bfree; + buf->f_bavail = statfs.bavail; + buf->f_files = statfs.files; + buf->f_ffree = statfs.ffree; + } + else { + /* Atleast set these if stat fail */ + buf->f_bsize = PAGE_SIZE; + buf->f_iosize = buf->f_bsize; /* XXX */ + } + + return (0); +} + +static int +p9fs_fhtovp(struct mount *mp, struct fid *fhp, int flags, struct vnode **vpp) +{ + + return (EINVAL); +} + +struct vfsops p9fs_vfsops = { + .vfs_init = p9fs_init, + .vfs_uninit = p9fs_uninit, + .vfs_mount = p9fs_mount, + .vfs_unmount = p9fs_unmount, + .vfs_root = p9fs_root, + .vfs_statfs = p9fs_statfs, + .vfs_fhtovp = p9fs_fhtovp, +}; + +VFS_SET(p9fs_vfsops, p9fs, VFCF_JAIL); +MODULE_VERSION(p9fs, 1); diff --git a/sys/fs/p9fs/p9fs_vnops.c b/sys/fs/p9fs/p9fs_vnops.c new file mode 100644 index 000000000000..227e2b93883e --- /dev/null +++ b/sys/fs/p9fs/p9fs_vnops.c @@ -0,0 +1,2236 @@ +/* + * Copyright (c) 2017-2020 Juniper Networks, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright +* notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +/* This file contains VFS file ops for the 9P protocol. + * This makes the upper layer of the p9fs driver. These functions interact + * with the VFS layer and lower layer of p9fs driver which is 9Pnet. All + * the user file operations are handled here. + */ +#include <sys/cdefs.h> +#include <sys/systm.h> +#include <sys/bio.h> +#include <sys/buf.h> +#include <sys/dirent.h> +#include <sys/fcntl.h> +#include <sys/namei.h> +#include <sys/priv.h> +#include <sys/stat.h> +#include <sys/vnode.h> +#include <sys/rwlock.h> +#include <sys/vmmeter.h> + +#include <vm/vm.h> +#include <vm/vm_extern.h> +#include <vm/vm_object.h> +#include <vm/vm_page.h> +#include <vm/vm_pager.h> +#include <vm/vnode_pager.h> + +#include <fs/p9fs/p9_client.h> +#include <fs/p9fs/p9_debug.h> +#include <fs/p9fs/p9fs.h> +#include <fs/p9fs/p9fs_proto.h> + +/* File permissions. */ +#define IEXEC 0000100 /* Executable. */ +#define IWRITE 0000200 /* Writeable. */ +#define IREAD 0000400 /* Readable. */ +#define ISVTX 0001000 /* Sticky bit. */ +#define ISGID 0002000 /* Set-gid. */ +#define ISUID 0004000 /* Set-uid. */ + +static MALLOC_DEFINE(M_P9UIOV, "uio", "UIOV structures for strategy in p9fs"); +extern uma_zone_t p9fs_io_buffer_zone; +extern uma_zone_t p9fs_getattr_zone; +extern uma_zone_t p9fs_setattr_zone; +extern uma_zone_t p9fs_pbuf_zone; +/* For the root vnode's vnops. 
*/ +struct vop_vector p9fs_vnops; + +static uint32_t p9fs_unix2p9_mode(uint32_t mode); + +static void +p9fs_itimes(struct vnode *vp) +{ + struct p9fs_node *node; + struct timespec ts; + struct p9fs_inode *inode; + + node = P9FS_VTON(vp); + inode = &node->inode; + + vfs_timestamp(&ts); + inode->i_mtime = ts.tv_sec; +} + +/* + * Cleanup the p9fs node, the in memory representation of a vnode for p9fs. + * The cleanup includes invalidating all cache entries for the vnode, + * destroying the vobject, removing vnode from hashlist, removing p9fs node + * from the list of session p9fs nodes, and disposing of the p9fs node. + * Basically it is doing a reverse of what a create/vget does. + */ +void +p9fs_cleanup(struct p9fs_node *np) +{ + struct vnode *vp; + struct p9fs_session *vses; + + if (np == NULL) + return; + + vp = P9FS_NTOV(np); + vses = np->p9fs_ses; + + /* Remove the vnode from hash list if vnode is not already deleted */ + if ((np->flags & P9FS_NODE_DELETED) == 0) + vfs_hash_remove(vp); + + P9FS_LOCK(vses); + if ((np->flags & P9FS_NODE_IN_SESSION) != 0) { + np->flags &= ~P9FS_NODE_IN_SESSION; + STAILQ_REMOVE(&vses->virt_node_list, np, p9fs_node, p9fs_node_next); + } else { + P9FS_UNLOCK(vses); + return; + } + P9FS_UNLOCK(vses); + + /* Invalidate all entries to a particular vnode. */ + cache_purge(vp); + + /* Destroy the vm object and flush associated pages. */ + vnode_destroy_vobject(vp); + + /* Remove all the FID */ + p9fs_fid_remove_all(np, FALSE); + + /* Dispose all node knowledge.*/ + p9fs_destroy_node(&np); +} + +/* + * Reclaim VOP is defined to be called for every vnode. This starts off + * the cleanup by clunking(remove the fid on the server) and calls + * p9fs_cleanup to free all the resources allocated for p9fs node. 
+ */ +static int +p9fs_reclaim(struct vop_reclaim_args *ap) +{ + struct vnode *vp; + struct p9fs_node *np; + + vp = ap->a_vp; + np = P9FS_VTON(vp); + + P9_DEBUG(VOPS, "%s: vp:%p node:%p\n", __func__, vp, np); + p9fs_cleanup(np); + + return (0); +} + +/* + * recycle vnodes which are no longer referenced i.e, their usecount is zero + */ +static int +p9fs_inactive(struct vop_inactive_args *ap) +{ + struct vnode *vp; + struct p9fs_node *np; + + vp = ap->a_vp; + np = P9FS_VTON(vp); + + P9_DEBUG(VOPS, "%s: vp:%p node:%p file:%s\n", __func__, vp, np, np->inode.i_name); + if (np->flags & P9FS_NODE_DELETED) + vrecycle(vp); + + return (0); +} + +struct p9fs_lookup_alloc_arg { + struct componentname *cnp; + struct p9fs_node *dnp; + struct p9_fid *newfid; +}; + +/* Callback for vn_get_ino */ +static int +p9fs_lookup_alloc(struct mount *mp, void *arg, int lkflags, struct vnode **vpp) +{ + struct p9fs_lookup_alloc_arg *p9aa = arg; + + return (p9fs_vget_common(mp, NULL, p9aa->cnp->cn_lkflags, p9aa->dnp, + p9aa->newfid, vpp, p9aa->cnp->cn_nameptr)); +} + +/* + * p9fs_lookup is called for every component name that is being searched for. + * + * I. If component is found on the server, we look for the in-memory + * repesentation(vnode) of this component in namecache. + * A. If the node is found in the namecache, we check is the vnode is still + * valid. + * 1. If it is still valid, return vnode. + * 2. If it is not valid, we remove this vnode from the name cache and + * create a new vnode for the component and return that vnode. + * B. If the vnode is not found in the namecache, we look for it in the + * hash list. + * 1. If the vnode is in the hash list, we check if the vnode is still + * valid. + * a. If it is still valid, we add that vnode to the namecache for + * future lookups and return the vnode. + * b. If it is not valid, create a new vnode and p9fs node, + * initialize them and return the vnode. + * 2. 
If the vnode is not found in the hash list, we create a new vnode + * and p9fs node, initialize them and return the vnode. + * II. If the component is not found on the server, an error code is returned. + * A. For the creation case, we return EJUSTRETURN so VFS can handle it. + * B. For all other cases, ENOENT is returned. + */ +static int +p9fs_lookup(struct vop_lookup_args *ap) +{ + struct vnode *dvp; + struct vnode **vpp, *vp; + struct componentname *cnp; + struct p9fs_node *dnp; /* p9fs node of the directory being searched */ + struct p9fs_node *np; + struct p9fs_session *vses; + struct mount *mp; /* Get the mount point */ + struct p9_fid *dvfid, *newfid; + uint64_t flags; + int error; + struct vattr vattr; + char tmpchr; + + dvp = ap->a_dvp; + vpp = ap->a_vpp; + cnp = ap->a_cnp; + dnp = P9FS_VTON(dvp); + error = 0; + flags = cnp->cn_flags; + *vpp = NULLVP; + + if (dnp == NULL) + return (ENOENT); + + if (cnp->cn_nameptr[0] == '.' && cnp->cn_namelen == 1) { + vref(dvp); + *vpp = dvp; + return (0); + } + + vses = dnp->p9fs_ses; + mp = vses->p9fs_mount; + + /* Do the cache part ourselves */ + if ((flags & ISLASTCN) && (mp->mnt_flag & MNT_RDONLY) && + (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) + return (EROFS); + + if (dvp->v_type != VDIR) + return (ENOTDIR); + + error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred, curthread); + if (error) + return (error); + + /* Do the directory walk on host to check if the file exists */ + dvfid = p9fs_get_fid(vses->clnt, dnp, cnp->cn_cred, VFID, -1, &error); + if (error) + return (error); + + /* + * Save the character present at namelen in nameptr string and + * null terminate the character to get the search name for p9_dir_walk + * This is done to handle when lookup is for "a" and component + * name contains a/b/c + */ + tmpchr = cnp->cn_nameptr[cnp->cn_namelen]; + cnp->cn_nameptr[cnp->cn_namelen] = '\0'; + + /* + * If the client_walk fails, it means the file we are looking for doesn't exist. 
+ * For a create or rename of the last component return EJUSTRETURN so that + * VFS can create the entry; otherwise just return the error + */ + newfid = p9_client_walk(dvfid, 1, &cnp->cn_nameptr, 1, &error); + + cnp->cn_nameptr[cnp->cn_namelen] = tmpchr; + + if (error != 0 || newfid == NULL) { + /* Clunk the newfid if it is not NULL */ + if (newfid != NULL) + p9_client_clunk(newfid); + + if (error != ENOENT) + return (error); + + /* The requested file was not found. */ + if ((cnp->cn_nameiop == CREATE || cnp->cn_nameiop == RENAME) && + (flags & ISLASTCN)) { + + if (mp->mnt_flag & MNT_RDONLY) + return (EROFS); + + error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, + curthread); + if (!error) { + return (EJUSTRETURN); + } + } + return (error); + } + + /* Look for the entry in the component cache */ + error = cache_lookup(dvp, vpp, cnp, NULL, NULL); + if (error > 0 && error != ENOENT) { + P9_DEBUG(VOPS, "%s: Cache lookup error %d \n", __func__, error); + goto out; + } + + if (error == -1) { + vp = *vpp; + /* Check if the entry in cache is stale or not */ + if ((p9fs_node_cmp(vp, &newfid->qid) == 0) && + ((error = VOP_GETATTR(vp, &vattr, cnp->cn_cred)) == 0)) { + goto out; + } + /* + * Either the qid no longer matches or getattr failed: + * drop the cached vnode and fall through to a fresh vget. + */ + cache_purge(vp); + if (dvp != vp) + vput(vp); + else + vrele(vp); + + *vpp = NULLVP; + } else if (error == ENOENT) { + if (VN_IS_DOOMED(dvp)) + goto out; + if (VOP_GETATTR(dvp, &vattr, cnp->cn_cred) == 0) { + error = ENOENT; + goto out; + } + cache_purge_negative(dvp); + } + /* Reset values */ + error = 0; + vp = NULLVP; + + tmpchr = cnp->cn_nameptr[cnp->cn_namelen]; + cnp->cn_nameptr[cnp->cn_namelen] = '\0'; + + /* + * Looks like we have found an entry. Now take care of all other cases. 
+ */ + if (flags & ISDOTDOT) { + struct p9fs_lookup_alloc_arg p9aa; + p9aa.cnp = cnp; + p9aa.dnp = dnp; + p9aa.newfid = newfid; + error = vn_vget_ino_gen(dvp, p9fs_lookup_alloc, &p9aa, 0, &vp); + if (error) + goto out; + *vpp = vp; + } else { + /* + * client_walk is equivalent to searching a component name in a + * directory(fid) here. If new fid is returned, we have found an + * entry for this component name so, go and create the rest of + * the vnode infra(vget_common) for the returned newfid. + */ + if ((cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME) + && (flags & ISLASTCN)) { + error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, + curthread); + if (error) + goto out; + + error = p9fs_vget_common(mp, NULL, cnp->cn_lkflags, + dnp, newfid, &vp, cnp->cn_nameptr); + if (error) + goto out; + + *vpp = vp; + np = P9FS_VTON(vp); + if ((dnp->inode.i_mode & ISVTX) && + cnp->cn_cred->cr_uid != 0 && + cnp->cn_cred->cr_uid != dnp->inode.n_uid && + cnp->cn_cred->cr_uid != np->inode.n_uid) { + vput(*vpp); + *vpp = NULL; + cnp->cn_nameptr[cnp->cn_namelen] = tmpchr; + return (EPERM); + } + } else { + error = p9fs_vget_common(mp, NULL, cnp->cn_lkflags, + dnp, newfid, &vp, cnp->cn_nameptr); + if (error) + goto out; + *vpp = vp; + } + } + + cnp->cn_nameptr[cnp->cn_namelen] = tmpchr; + + /* Store the result in the name cache if MAKEENTRY is specified in flags */ + if ((cnp->cn_flags & MAKEENTRY) != 0) + cache_enter(dvp, *vpp, cnp); + return (error); +out: + cnp->cn_nameptr[cnp->cn_namelen] = tmpchr; + p9_client_clunk(newfid); + return (error); +} + +/* + * Common creation function for file/directory with respective flags. We first + * open the parent directory in order to create the file under it. For this, + * as 9P protocol suggests, we need to call client_walk to create the open fid. + * Once we have the open fid, the file_create function creates the direntry with + * the name and perm specified under the parent dir. 
If this succeeds (an entry + * is created for the new file on the server), we create our metadata for this + * file (vnode, p9fs node calling vget). Once we are done, we clunk the open + * fid of the parent directory. + */ +static int +create_common(struct p9fs_node *dnp, struct componentname *cnp, + char *extension, uint32_t perm, uint8_t mode, struct vnode **vpp) +{ + char tmpchr; + struct p9_fid *dvfid, *ofid, *newfid; + struct p9fs_session *vses; + struct mount *mp; + int error; + + P9_DEBUG(VOPS, "%s: name %s\n", __func__, cnp->cn_nameptr); + + vses = dnp->p9fs_ses; + mp = vses->p9fs_mount; + newfid = NULL; + error = 0; + + dvfid = p9fs_get_fid(vses->clnt, dnp, cnp->cn_cred, VFID, -1, &error); + if (error != 0) + return (error); + + /* Clone the directory fid to create the new file */ + ofid = p9_client_walk(dvfid, 0, NULL, 1, &error); + if (error != 0) + return (error); + + /* + * Save the character present at namelen in nameptr string and + * null terminate the character to get the search name for p9_dir_walk + */ + tmpchr = cnp->cn_nameptr[cnp->cn_namelen]; + cnp->cn_nameptr[cnp->cn_namelen] = '\0'; + + error = p9_client_file_create(ofid, cnp->cn_nameptr, perm, mode, + extension); + if (error != 0) { + P9_DEBUG(ERROR, "%s: p9_client_fcreate failed %d\n", __func__, error); + goto out; + } + + /* If its not hardlink only then do the walk, else we are done. */ + if (!(perm & P9PROTO_DMLINK)) { + /* + * Do the lookup part and add the vnode, p9fs node. Note that vpp + * is filled in here. 
+ */ + newfid = p9_client_walk(dvfid, 1, &cnp->cn_nameptr, 1, &error); + if (newfid != NULL) { + error = p9fs_vget_common(mp, NULL, cnp->cn_lkflags, + dnp, newfid, vpp, cnp->cn_nameptr); + if (error != 0) + goto out; + } else { + /* Not found return NOENTRY.*/ + goto out; + } + + if ((cnp->cn_flags & MAKEENTRY) != 0) + cache_enter(P9FS_NTOV(dnp), *vpp, cnp); + } + P9_DEBUG(VOPS, "%s: created file under vp %p node %p fid %ju\n", + __func__, *vpp, dnp, (uintmax_t)dvfid->fid); + /* Clunk the open ofid. */ + if (ofid != NULL) + (void)p9_client_clunk(ofid); + + cnp->cn_nameptr[cnp->cn_namelen] = tmpchr; + return (0); +out: + if (ofid != NULL) + (void)p9_client_clunk(ofid); + + if (newfid != NULL) + (void)p9_client_clunk(newfid); + + cnp->cn_nameptr[cnp->cn_namelen] = tmpchr; + return (error); +} + +/* + * This is the main file creation VOP. Make the permissions of the new + * file and call the create_common common code to complete the create. + */ +static int +p9fs_create(struct vop_create_args *ap) +{ + struct vnode *dvp; + struct vnode **vpp; + struct componentname *cnp; + uint32_t mode; + struct p9fs_node *dnp; + struct p9fs_inode *dinode; + uint32_t perm; + int ret; + + dvp = ap->a_dvp; + vpp = ap->a_vpp; + cnp = ap->a_cnp; + dnp = P9FS_VTON(dvp); + dinode = &dnp->inode; + mode = MAKEIMODE(ap->a_vap->va_type, ap->a_vap->va_mode); + perm = p9fs_unix2p9_mode(mode); + + P9_DEBUG(VOPS, "%s: dvp %p\n", __func__, dvp); + + ret = create_common(dnp, cnp, NULL, perm, P9PROTO_ORDWR, vpp); + if (ret == 0) { + P9FS_INCR_LINKS(dinode); + } + + return (ret); +} + +/* + * p9fs_mkdir is the main directory creation vop. Make the permissions of the new dir + * and call the create_common common code to complete the create. 
+ */ +static int +p9fs_mkdir(struct vop_mkdir_args *ap) +{ + struct vnode *dvp; + struct vnode **vpp; + struct componentname *cnp; + uint32_t mode; + struct p9fs_node *dnp; + struct p9fs_inode *dinode; + uint32_t perm; + int ret; + + dvp = ap->a_dvp; + vpp = ap->a_vpp; + cnp = ap->a_cnp; + dnp = P9FS_VTON(dvp); + dinode = &dnp->inode; + mode = MAKEIMODE(ap->a_vap->va_type, ap->a_vap->va_mode); + perm = p9fs_unix2p9_mode(mode | S_IFDIR); + + P9_DEBUG(VOPS, "%s: dvp %p\n", __func__, dvp); + + ret = create_common(dnp, cnp, NULL, perm, P9PROTO_ORDWR, vpp); + if (ret == 0) + P9FS_INCR_LINKS(dinode); + + return (ret); +} + +/* + * p9fs_mknod is the main node creation vop. Make the permissions of the new node + * and call the create_common common code to complete the create. + */ +static int +p9fs_mknod(struct vop_mknod_args *ap) +{ + struct vnode *dvp; + struct vnode **vpp; + struct componentname *cnp; + uint32_t mode; + struct p9fs_node *dnp; + struct p9fs_inode *dinode; + uint32_t perm; + int ret; + + dvp = ap->a_dvp; + vpp = ap->a_vpp; + cnp = ap->a_cnp; + dnp = P9FS_VTON(dvp); + dinode = &dnp->inode; + mode = MAKEIMODE(ap->a_vap->va_type, ap->a_vap->va_mode); + perm = p9fs_unix2p9_mode(mode); + + P9_DEBUG(VOPS, "%s: dvp %p\n", __func__, dvp); + + ret = create_common(dnp, cnp, NULL, perm, P9PROTO_OREAD, vpp); + if (ret == 0) { + P9FS_INCR_LINKS(dinode); + } + + return (ret); +} + +/* Convert open mode permissions to P9 */ +static int +p9fs_uflags_mode(int uflags, int extended) +{ + uint32_t ret; + + /* Convert first to O flags.*/ + uflags = OFLAGS(uflags); + + switch (uflags & 3) { + + case O_RDONLY: + ret = P9PROTO_OREAD; + break; + + case O_WRONLY: + ret = P9PROTO_OWRITE; + break; + + case O_RDWR: + ret = P9PROTO_ORDWR; + break; + } + + if (extended) { + if (uflags & O_EXCL) + ret |= P9PROTO_OEXCL; + + if (uflags & O_APPEND) + ret |= P9PROTO_OAPPEND; + } + + return (ret); +} + +/* + * This is the main open VOP for every file open. 
If the file is already + * open, then increment and return. If there is no open fid for this file, + * there needs to be a client_walk which creates a new open fid for this file. + * Once we have a open fid, call the open on this file with the mode creating + * the vobject. + */ +static int +p9fs_open(struct vop_open_args *ap) +{ + int error; + struct vnode *vp; + struct p9fs_node *np; + struct p9fs_session *vses; + struct p9_fid *vofid, *vfid; + size_t filesize; + uint32_t mode; + + error = 0; + vp = ap->a_vp; + np = P9FS_VTON(vp); + vses = np->p9fs_ses; + + P9_DEBUG(VOPS, "%s: vp %p\n", __func__, vp); + + if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VLNK) + return (EOPNOTSUPP); + + error = p9fs_reload_stats_dotl(vp, ap->a_cred); + if (error != 0) + return (error); + + ASSERT_VOP_LOCKED(vp, __func__); + /* + * Invalidate the pages of the vm_object cache if the file is modified + * based on the flag set in reload stats + */ + if (vp->v_type == VREG && (np->flags & P9FS_NODE_MODIFIED) != 0) { + error = vinvalbuf(vp, 0, 0, 0); + if (error != 0) + return (error); + np->flags &= ~P9FS_NODE_MODIFIED; + } + + vfid = p9fs_get_fid(vses->clnt, np, ap->a_cred, VFID, -1, &error); + if (error != 0) + return (error); + + /* + * Translate kernel fflags to 9p mode + */ + mode = p9fs_uflags_mode(ap->a_mode, 1); + + /* + * Search the fid in vofid_list for current user. If found increase the open + * count and return. If not found clone a new fid and open the file using + * that cloned fid. 
+ */ + vofid = p9fs_get_fid(vses->clnt, np, ap->a_cred, VOFID, mode, &error); + if (vofid != NULL) { + vofid->v_opens++; + return (0); + } else { + /*vofid is the open fid for this file.*/ + vofid = p9_client_walk(vfid, 0, NULL, 1, &error); + if (error != 0) + return (error); + } + + error = p9_client_open(vofid, mode); + if (error != 0) + p9_client_clunk(vofid); + else { + vofid->v_opens = 1; + filesize = np->inode.i_size; + vnode_create_vobject(vp, filesize, ap->a_td); + p9fs_fid_add(np, vofid, VOFID); + } + + return (error); +} + +/* + * Close the open references. Just reduce the open count on vofid and return. + * Let clunking of VOFID happen in p9fs_reclaim. + */ +static int +p9fs_close(struct vop_close_args *ap) +{ + struct vnode *vp; + struct p9fs_node *np; + struct p9fs_session *vses; + struct p9_fid *vofid; + int error; + + vp = ap->a_vp; + np = P9FS_VTON(vp); + + if (np == NULL) + return (0); + + vses = np->p9fs_ses; + error = 0; + + P9_DEBUG(VOPS, "%s: file_name %s\n", __func__, np->inode.i_name); + + /* + * Translate kernel fflags to 9p mode + */ + vofid = p9fs_get_fid(vses->clnt, np, ap->a_cred, VOFID, + p9fs_uflags_mode(ap->a_fflag, 1), &error); + if (vofid == NULL) + return (0); + + vofid->v_opens--; + + return (0); +} + +/* Helper routine for checking if fileops are possible on this file */ +static int +p9fs_check_possible(struct vnode *vp, struct vattr *vap, mode_t mode) +{ + + /* Check if we are allowed to write */ + switch (vap->va_type) { + case VDIR: + case VLNK: + case VREG: + /* + * Normal nodes: check if we're on a read-only mounted + * file system and bail out if we're trying to write. + */ + if ((mode & VMODIFY_PERMS) && (vp->v_mount->mnt_flag & MNT_RDONLY)) + return (EROFS); + break; + case VBLK: + case VCHR: + case VSOCK: + case VFIFO: + /* + * Special nodes: even on read-only mounted file systems + * these are allowed to be written to if permissions allow. 
+ */ + break; + default: + /* No idea what this is */ + return (EINVAL); + } + + return (0); +} + +/* Check the access permissions of the file. */ +static int +p9fs_access(struct vop_access_args *ap) +{ + struct vnode *vp; + accmode_t accmode; + struct ucred *cred; + struct vattr vap; + int error; + + vp = ap->a_vp; + accmode = ap->a_accmode; + cred = ap->a_cred; + + P9_DEBUG(VOPS, "%s: vp %p\n", __func__, vp); + + /* make sure getattr is working correctly and is defined.*/ + error = VOP_GETATTR(vp, &vap, cred); + if (error != 0) + return (error); + + error = p9fs_check_possible(vp, &vap, accmode); + if (error != 0) + return (error); + + /* Call the Generic Access check in VOPS*/ + error = vaccess(vp->v_type, vap.va_mode, vap.va_uid, vap.va_gid, accmode, + cred); + + + return (error); +} + +/* + * Reload the file stats from the server and update the inode structure present + * in p9fs node. + */ +int +p9fs_reload_stats_dotl(struct vnode *vp, struct ucred *cred) +{ + struct p9_stat_dotl *stat; + int error; + struct p9fs_node *node; + struct p9fs_session *vses; + struct p9_fid *vfid; + + error = 0; + node = P9FS_VTON(vp); + vses = node->p9fs_ses; + + vfid = p9fs_get_fid(vses->clnt, node, cred, VOFID, P9PROTO_OREAD, &error); + if (vfid == NULL) { + vfid = p9fs_get_fid(vses->clnt, node, cred, VFID, -1, &error); + if (error) + return (error); + } + + stat = uma_zalloc(p9fs_getattr_zone, M_WAITOK | M_ZERO); + + error = p9_client_getattr(vfid, stat, P9PROTO_STATS_ALL); + if (error != 0) { + P9_DEBUG(ERROR, "%s: p9_client_getattr failed: %d\n", __func__, error); + goto out; + } + + /* Init the vnode with the disk info */ + p9fs_stat_vnode_dotl(stat, vp); +out: + if (stat != NULL) { + uma_zfree(p9fs_getattr_zone, stat); + } + + return (error); +} + +/* + * Read the current inode values into the vap attr. We reload the stats from + * the server. 
+ */ +static int +p9fs_getattr_dotl(struct vop_getattr_args *ap) +{ + struct vnode *vp; + struct vattr *vap; + struct p9fs_node *node; + struct p9fs_inode *inode; + int error; + + vp = ap->a_vp; + vap = ap->a_vap; + node = P9FS_VTON(vp); + + if (node == NULL) + return (ENOENT); + + inode = &node->inode; + + P9_DEBUG(VOPS, "%s: %u %u\n", __func__, inode->i_mode, IFTOVT(inode->i_mode)); + + /* Reload our stats once to get the right values.*/ + error = p9fs_reload_stats_dotl(vp, ap->a_cred); + if (error != 0) { + P9_DEBUG(ERROR, "%s: failed: %d\n", __func__, error); + return (error); + } + + /* Basic info */ + VATTR_NULL(vap); + + vap->va_atime.tv_sec = inode->i_atime; + vap->va_mtime.tv_sec = inode->i_mtime; + vap->va_ctime.tv_sec = inode->i_ctime; + vap->va_atime.tv_nsec = inode->i_atime_nsec; + vap->va_mtime.tv_nsec = inode->i_mtime_nsec; + vap->va_ctime.tv_nsec = inode->i_ctime_nsec; + vap->va_type = IFTOVT(inode->i_mode); + vap->va_mode = inode->i_mode; + vap->va_uid = inode->n_uid; + vap->va_gid = inode->n_gid; + vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0]; + vap->va_size = inode->i_size; + vap->va_nlink = inode->i_links_count; + vap->va_blocksize = inode->blksize; + vap->va_fileid = inode->i_qid_path; + vap->va_flags = inode->i_flags; + vap->va_gen = inode->gen; + vap->va_filerev = inode->data_version; + vap->va_vaflags = 0; + vap->va_bytes = inode->blocks * P9PROTO_TGETATTR_BLK; + + return (0); +} + +/* Convert a standard FreeBSD permission to P9. 
*/ +static uint32_t +p9fs_unix2p9_mode(uint32_t mode) +{ + uint32_t res; + + res = mode & 0777; + if (S_ISDIR(mode)) + res |= P9PROTO_DMDIR; + if (S_ISSOCK(mode)) + res |= P9PROTO_DMSOCKET; + if (S_ISLNK(mode)) + res |= P9PROTO_DMSYMLINK; + if (S_ISFIFO(mode)) + res |= P9PROTO_DMNAMEDPIPE; + if ((mode & S_ISUID) == S_ISUID) + res |= P9PROTO_DMSETUID; + if ((mode & S_ISGID) == S_ISGID) + res |= P9PROTO_DMSETGID; + if ((mode & S_ISVTX) == S_ISVTX) + res |= P9PROTO_DMSETVTX; + + return (res); +} + +/* Update inode with the stats read from server.(9P2000.L version) */ +int +p9fs_stat_vnode_dotl(struct p9_stat_dotl *stat, struct vnode *vp) +{ + struct p9fs_node *np; + struct p9fs_inode *inode; + + np = P9FS_VTON(vp); + inode = &np->inode; + + ASSERT_VOP_LOCKED(vp, __func__); + /* Update the pager size if file size changes on host */ + if (inode->i_size != stat->st_size) { + inode->i_size = stat->st_size; + if (vp->v_type == VREG) + vnode_pager_setsize(vp, inode->i_size); + } + + inode->i_mtime = stat->st_mtime_sec; + inode->i_atime = stat->st_atime_sec; + inode->i_ctime = stat->st_ctime_sec; + inode->i_mtime_nsec = stat->st_mtime_nsec; + inode->i_atime_nsec = stat->st_atime_nsec; + inode->i_ctime_nsec = stat->st_ctime_nsec; + inode->n_uid = stat->st_uid; + inode->n_gid = stat->st_gid; + inode->i_mode = stat->st_mode; + vp->v_type = IFTOVT(inode->i_mode); + inode->i_links_count = stat->st_nlink; + inode->blksize = stat->st_blksize; + inode->blocks = stat->st_blocks; + inode->gen = stat->st_gen; + inode->data_version = stat->st_data_version; + + ASSERT_VOP_LOCKED(vp, __func__); + /* Setting a flag if file changes based on qid version */ + if (np->vqid.qid_version != stat->qid.version) + np->flags |= P9FS_NODE_MODIFIED; + memcpy(&np->vqid, &stat->qid, sizeof(stat->qid)); + + return (0); +} + +/* + * Write the current in memory inode stats into persistent stats structure + * to write to the server(for linux version). 
+ */ +static int +p9fs_inode_to_iattr(struct p9fs_inode *inode, struct p9_iattr_dotl *p9attr) +{ + p9attr->size = inode->i_size; + p9attr->mode = inode->i_mode; + p9attr->uid = inode->n_uid; + p9attr->gid = inode->n_gid; + p9attr->atime_sec = inode->i_atime; + p9attr->atime_nsec = inode->i_atime_nsec; + p9attr->mtime_sec = inode->i_mtime; + p9attr->mtime_nsec = inode->i_mtime_nsec; + + return (0); +} + +/* + * Modify the ownership of a file whenever the chown is called on the + * file. + */ +static int +p9fs_chown(struct vnode *vp, uid_t uid, gid_t gid, struct ucred *cred, + struct thread *td) +{ + struct p9fs_node *np; + struct p9fs_inode *inode; + uid_t ouid; + gid_t ogid; + int error; + + np = P9FS_VTON(vp); + inode = &np->inode; + + if (uid == (uid_t)VNOVAL) + uid = inode->n_uid; + if (gid == (gid_t)VNOVAL) + gid = inode->n_gid; + /* + * To modify the ownership of a file, must possess VADMIN for that + * file. + */ + if ((error = VOP_ACCESSX(vp, VWRITE_OWNER, cred, td))) + return (error); + /* + * To change the owner of a file, or change the group of a file to a + * group of which we are not a member, the caller must have + * privilege. + */ + if (((uid != inode->n_uid && uid != cred->cr_uid) || + (gid != inode->n_gid && !groupmember(gid, cred))) && + (error = priv_check_cred(cred, PRIV_VFS_CHOWN))) + return (error); + + ogid = inode->n_gid; + ouid = inode->n_uid; + + inode->n_gid = gid; + inode->n_uid = uid; + + if ((inode->i_mode & (ISUID | ISGID)) && + (ouid != uid || ogid != gid)) { + + if (priv_check_cred(cred, PRIV_VFS_RETAINSUGID)) + inode->i_mode &= ~(ISUID | ISGID); + } + P9_DEBUG(VOPS, "%s: vp %p, cred %p, td %p - ret OK\n", __func__, vp, cred, td); + + return (0); +} + +/* + * Update the in memory inode with all chmod new permissions/mode. Typically a + * setattr is called to update it to server. 
+ */ +static int +p9fs_chmod(struct vnode *vp, uint32_t mode, struct ucred *cred, struct thread *td) +{ + struct p9fs_node *np; + struct p9fs_inode *inode; + uint32_t nmode; + int error; + + np = P9FS_VTON(vp); + inode = &np->inode; + + P9_DEBUG(VOPS, "%s: vp %p, mode %x, cred %p, td %p\n", __func__, vp, mode, cred, td); + /* + * To modify the permissions on a file, must possess VADMIN + * for that file. + */ + if ((error = VOP_ACCESS(vp, VADMIN, cred, td))) + return (error); + + /* + * Privileged processes may set the sticky bit on non-directories, + * as well as set the setgid bit on a file with a group that the + * process is not a member of. Both of these are allowed in + * jail(8). + */ + if (vp->v_type != VDIR && (mode & S_ISTXT)) { + if (priv_check_cred(cred, PRIV_VFS_STICKYFILE)) + return (EFTYPE); + } + if (!groupmember(inode->n_gid, cred) && (mode & ISGID)) { + error = priv_check_cred(cred, PRIV_VFS_SETGID); + if (error != 0) + return (error); + } + + /* + * Deny setting setuid if we are not the file owner. + */ + if ((mode & ISUID) && inode->n_uid != cred->cr_uid) { + error = priv_check_cred(cred, PRIV_VFS_ADMIN); + if (error != 0) + return (error); + } + nmode = inode->i_mode; + nmode &= ~ALLPERMS; + nmode |= (mode & ALLPERMS); + inode->i_mode = nmode; + + P9_DEBUG(VOPS, "%s: to mode %x %d \n ", __func__, nmode, error); + + return (error); +} + +/* + * Set the attributes of a file referenced by fid. 
A valid bitmask is sent + * in request selecting which fields to set + */ +static int +p9fs_setattr_dotl(struct vop_setattr_args *ap) +{ + struct vnode *vp; + struct vattr *vap; + struct p9fs_node *node; + struct p9fs_inode *inode; + struct ucred *cred; + struct thread *td; + struct p9_iattr_dotl *p9attr; + struct p9fs_session *vses; + struct p9_fid *vfid; + uint64_t oldfilesize; + int error; + + vp = ap->a_vp; + vap = ap->a_vap; + node = P9FS_VTON(vp); + inode = &node->inode; + cred = ap->a_cred; + td = curthread; + vses = node->p9fs_ses; + error = 0; + + if ((vap->va_type != VNON) || (vap->va_nlink != VNOVAL) || + (vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) || + (vap->va_blocksize != VNOVAL) || (vap->va_rdev != VNOVAL) || + (vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL)) { + P9_DEBUG(ERROR, "%s: unsettable attribute\n", __func__); + return (EINVAL); + } + /* Disallow write attempts on read only filesystem */ + if (vp->v_mount->mnt_flag & MNT_RDONLY) + return (EROFS); + + /* Setting of flags is not supported */ + if (vap->va_flags != VNOVAL) + return (EOPNOTSUPP); + + /* Allocate p9attr struct */ + p9attr = uma_zalloc(p9fs_setattr_zone, M_WAITOK | M_ZERO); + if (p9attr == NULL) + return (ENOMEM); + + /* Check if we need to change the ownership of the file*/ + if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) { + P9_DEBUG(VOPS, "%s: vp:%p td:%p uid/gid %x/%x\n", __func__, + vp, td, vap->va_uid, vap->va_gid); + + error = p9fs_chown(vp, vap->va_uid, vap->va_gid, cred, td); + p9attr->valid |= P9PROTO_SETATTR_UID | P9PROTO_SETATTR_GID | + P9PROTO_SETATTR_MODE; + if (error) + goto out; + } + + /* Check for mode changes */ + if (vap->va_mode != (mode_t)VNOVAL) { + P9_DEBUG(VOPS, "%s: vp:%p td:%p mode %x\n", __func__, vp, td, + vap->va_mode); + + error = p9fs_chmod(vp, (int)vap->va_mode, cred, td); + p9attr->valid |= P9PROTO_SETATTR_MODE; + if (error) + goto out; + } + + /* Update the size of the file and update mtime */ + if (vap->va_size 
!= (uint64_t)VNOVAL) { + P9_DEBUG(VOPS, "%s: vp:%p td:%p size:%jx\n", __func__, + vp, td, (uintmax_t)vap->va_size); + switch (vp->v_type) { + case VDIR: + error = EISDIR; + goto out; + case VLNK: + case VREG: + /* Invalidate cached pages of vp */ + error = vinvalbuf(vp, 0, 0, 0); + if (error) + goto out; + oldfilesize = inode->i_size; + inode->i_size = vap->va_size; + /* Update the p9fs_inode time */ + p9fs_itimes(vp); + p9attr->valid |= P9PROTO_SETATTR_SIZE | + P9PROTO_SETATTR_ATIME | + P9PROTO_SETATTR_MTIME | + P9PROTO_SETATTR_ATIME_SET | + P9PROTO_SETATTR_MTIME_SET ; + break; + default: + goto out; + } + } else if (vap->va_atime.tv_sec != VNOVAL || + vap->va_mtime.tv_sec != VNOVAL) { + P9_DEBUG(VOPS, "%s: vp:%p td:%p time a/m %jx/%jx/\n", + __func__, vp, td, (uintmax_t)vap->va_atime.tv_sec, + (uintmax_t)vap->va_mtime.tv_sec); + /* Update the p9fs_inode times */ + p9fs_itimes(vp); + p9attr->valid |= P9PROTO_SETATTR_ATIME | + P9PROTO_SETATTR_MTIME | P9PROTO_SETATTR_ATIME_SET | + P9PROTO_SETATTR_MTIME_SET; + } + + vfid = p9fs_get_fid(vses->clnt, node, cred, VOFID, P9PROTO_OWRITE, &error); + if (vfid == NULL) { + vfid = p9fs_get_fid(vses->clnt, node, cred, VFID, -1, &error); + if (error) + goto out; + } + + /* Write the inode structure values into p9attr */ + p9fs_inode_to_iattr(inode, p9attr); + error = p9_client_setattr(vfid, p9attr); + if (vap->va_size != (uint64_t)VNOVAL && vp->v_type == VREG) { + if (error) + inode->i_size = oldfilesize; + else + vnode_pager_setsize(vp, inode->i_size); + } +out: + if (p9attr) { + uma_zfree(p9fs_setattr_zone, p9attr); + } + P9_DEBUG(VOPS, "%s: error: %d\n", __func__, error); + return (error); +} + +struct open_fid_state { + struct p9_fid *vofid; + int fflags; + int opened; +}; + +/* + * TODO: change this to take P9PROTO_* mode and avoid routing through + * VOP_OPEN, factoring out implementation of p9fs_open. 
+ */ +static int +p9fs_get_open_fid(struct vnode *vp, int fflags, struct ucred *cr, struct open_fid_state *statep) +{ + struct p9fs_node *np; + struct p9fs_session *vses; + struct p9_fid *vofid; + int mode = p9fs_uflags_mode(fflags, TRUE); + int error = 0; + + statep->opened = FALSE; + + np = P9FS_VTON(vp); + vses = np->p9fs_ses; + vofid = p9fs_get_fid(vses->clnt, np, cr, VOFID, mode, &error); + if (vofid == NULL) { + error = VOP_OPEN(vp, fflags, cr, curthread, NULL); + if (error) { + return (error); + } + vofid = p9fs_get_fid(vses->clnt, np, cr, VOFID, mode, &error); + if (vofid == NULL) { + return (EBADF); + } + statep->fflags = fflags; + statep->opened = TRUE; + } + statep->vofid = vofid; + return (0); +} + +static void +p9fs_release_open_fid(struct vnode *vp, struct ucred *cr, struct open_fid_state *statep) +{ + if (statep->opened) { + (void) VOP_CLOSE(vp, statep->fflags, cr, curthread); + } +} + +/* + * An I/O buffer is used to to do any transfer. The uio is the vfs structure we + * need to copy data into. As long as resid is greater than zero, we call + * client_read to read data from offset(offset into the file) in the open fid + * for the file into the I/O buffer. The data is read into the user data buffer. 
+ */ +static int +p9fs_read(struct vop_read_args *ap) +{ + struct vnode *vp; + struct uio *uio; + struct p9fs_node *np; + uint64_t offset; + int64_t ret; + uint64_t resid; + uint32_t count; + int error; + char *io_buffer = NULL; + uint64_t filesize; + struct open_fid_state ostate; + + vp = ap->a_vp; + uio = ap->a_uio; + np = P9FS_VTON(vp); + error = 0; + + if (vp->v_type == VCHR || vp->v_type == VBLK) + return (EOPNOTSUPP); + if (vp->v_type != VREG) + return (EISDIR); + if (uio->uio_resid == 0) + return (0); + if (uio->uio_offset < 0) + return (EINVAL); + + error = p9fs_get_open_fid(vp, FREAD, ap->a_cred, &ostate); + if (error) + return (error); + + /* where in the file are we to start reading */ + offset = uio->uio_offset; + filesize = np->inode.i_size; + if (uio->uio_offset >= filesize) + goto out; + + P9_DEBUG(VOPS, "%s: called %jd at %ju\n", + __func__, (intmax_t)uio->uio_resid, (uintmax_t)uio->uio_offset); + + /* Work with a local buffer from the pool for this vop */ + + io_buffer = uma_zalloc(p9fs_io_buffer_zone, M_WAITOK | M_ZERO); + while ((resid = uio->uio_resid) > 0) { + if (offset >= filesize) + break; + count = MIN(filesize - uio->uio_offset , resid); + if (count == 0) + break; + + /* Copy count bytes into the uio */ + ret = p9_client_read(ostate.vofid, offset, count, io_buffer); + /* + * This is the only place in the entire p9fs where we check the + * error for < 0 as p9_client_read/write return the number of + * bytes instead of an error code. In this case if ret is < 0, + * it means there is an IO error. + */ + if (ret < 0) { + error = -ret; + goto out; + } + error = uiomove(io_buffer, ret, uio); + if (error != 0) + goto out; + + offset += ret; + } + uio->uio_offset = offset; +out: + uma_zfree(p9fs_io_buffer_zone, io_buffer); + p9fs_release_open_fid(vp, ap->a_cred, &ostate); + + return (error); +} + +/* + * The user buffer contains the data to be written. This data is copied first + * from uio into I/O buffer. 
This I/O buffer is used to do the client_write to + * the fid of the file starting from the offset given upto count bytes. The + * number of bytes written is returned to the caller. + */ +static int +p9fs_write(struct vop_write_args *ap) +{ + struct vnode *vp; + struct uio *uio; + struct p9fs_node *np; + uint64_t off, offset; + int64_t ret; + uint64_t resid, bytes_written; + uint32_t count; + int error, ioflag; + uint64_t file_size; + char *io_buffer = NULL; + struct open_fid_state ostate; + + vp = ap->a_vp; + uio = ap->a_uio; + np = P9FS_VTON(vp); + error = 0; + ioflag = ap->a_ioflag; + + error = p9fs_get_open_fid(vp, FWRITE, ap->a_cred, &ostate); + if (error) + return (error); + + P9_DEBUG(VOPS, "%s: %#zx at %#jx\n", + __func__, uio->uio_resid, (uintmax_t)uio->uio_offset); + + if (uio->uio_offset < 0) { + error = EINVAL; + goto out; + } + if (uio->uio_resid == 0) + goto out; + + file_size = np->inode.i_size; + + switch (vp->v_type) { + case VREG: + if (ioflag & IO_APPEND) + uio->uio_offset = file_size; + break; + case VDIR: + return (EISDIR); + case VLNK: + break; + default: + panic("%s: bad file type vp: %p", __func__, vp); + } + + resid = uio->uio_resid; + offset = uio->uio_offset; + bytes_written = 0; + error = 0; + + io_buffer = uma_zalloc(p9fs_io_buffer_zone, M_WAITOK | M_ZERO); + while ((resid = uio->uio_resid) > 0) { + off = 0; + count = MIN(resid, P9FS_IOUNIT); + error = uiomove(io_buffer, count, uio); + + if (error != 0) { + P9_DEBUG(ERROR, "%s: uiomove failed: %d\n", __func__, error); + goto out; + } + + /* While count still exists, keep writing.*/ + while (count > 0) { + /* Copy count bytes from the uio */ + ret = p9_client_write(ostate.vofid, offset, count, + io_buffer + off); + if (ret < 0) { + if (bytes_written == 0) { + error = -ret; + goto out; + } else { + break; + } + } + P9_DEBUG(VOPS, "%s: write %#zx at %#jx\n", + __func__, uio->uio_resid, (uintmax_t)uio->uio_offset); + + off += ret; + offset += ret; + bytes_written += ret; + count -= ret; + } 
+ } + /* Update the fields in the node to reflect the change*/ + if (file_size < uio->uio_offset + uio->uio_resid) { + np->inode.i_size = uio->uio_offset + uio->uio_resid; + vnode_pager_setsize(vp, uio->uio_offset + uio->uio_resid); + } +out: + if (io_buffer) + uma_zfree(p9fs_io_buffer_zone, io_buffer); + p9fs_release_open_fid(vp, ap->a_cred, &ostate); + + return (error); +} + +/* + * Common handler of all removal-related VOPs (e.g. rmdir, rm). Perform the + * client_remove op to send messages to remove the node's fid on the server. + * After that, does a node metadata cleanup on client side. + */ +static int +remove_common(struct p9fs_node *dnp, struct p9fs_node *np, const char *name, + struct ucred *cred) +{ + int error; + struct p9fs_session *vses; + struct vnode *vp; + struct p9_fid *vfid; + + error = 0; + vses = np->p9fs_ses; + vp = P9FS_NTOV(np); + + vfid = p9fs_get_fid(vses->clnt, dnp, cred, VFID, -1, &error); + if (error != 0) + return (error); + + error = p9_client_unlink(vfid, name, + np->v_node->v_type == VDIR ? P9PROTO_UNLINKAT_REMOVEDIR : 0); + if (error != 0) + return (error); + + /* Remove all non-open fids associated with the vp */ + if (np->inode.i_links_count == 1) + p9fs_fid_remove_all(np, TRUE); + + /* Invalidate all entries of vnode from name cache and hash list. */ + cache_purge(vp); + vfs_hash_remove(vp); + + np->flags |= P9FS_NODE_DELETED; + + return (error); +} + +/* Remove vop for all files. 
Call common code for remove and adjust links */ +static int +p9fs_remove(struct vop_remove_args *ap) +{ + struct vnode *vp; + struct p9fs_node *np; + struct vnode *dvp; + struct p9fs_node *dnp; + struct p9fs_inode *dinode; + struct componentname *cnp; + int error; + + cnp = ap->a_cnp; + vp = ap->a_vp; + np = P9FS_VTON(vp); + dvp = ap->a_dvp; + dnp = P9FS_VTON(dvp); + dinode = &dnp->inode; + + P9_DEBUG(VOPS, "%s: vp %p node %p \n", __func__, vp, np); + + if (vp->v_type == VDIR) + return (EISDIR); + + error = remove_common(dnp, np, cnp->cn_nameptr, cnp->cn_cred); + if (error == 0) + P9FS_DECR_LINKS(dinode); + + return (error); +} + +/* Remove vop for all directories. Call common code for remove and adjust links */ +static int +p9fs_rmdir(struct vop_rmdir_args *ap) +{ + struct vnode *vp; + struct p9fs_node *np; + struct vnode *dvp; + struct p9fs_node *dnp; + struct p9fs_inode *dinode; + struct componentname *cnp; + int error; + + cnp = ap->a_cnp; + vp = ap->a_vp; + np = P9FS_VTON(vp); + dvp = ap->a_dvp; + dnp = P9FS_VTON(dvp); + dinode = &dnp->inode; + + P9_DEBUG(VOPS, "%s: vp %p node %p \n", __func__, vp, np); + + error = remove_common(dnp, np, cnp->cn_nameptr, cnp->cn_cred); + if (error == 0) + P9FS_DECR_LINKS(dinode); + + return (error); +} + +/* + * Create symlinks. Make the permissions and call create_common code + * for Soft links. 
+ */ +static int +p9fs_symlink(struct vop_symlink_args *ap) +{ + struct vnode *dvp; + struct vnode **vpp; + struct vattr *vap; + struct componentname *cnp; + char *symtgt; + struct p9fs_node *dnp; + struct p9fs_session *vses; + struct mount *mp; + struct p9_fid *dvfid, *newfid; + int error; + char tmpchr; + gid_t gid; + + dvp = ap->a_dvp; + vpp = ap->a_vpp; + vap = ap->a_vap; + cnp = ap->a_cnp; + symtgt = (char*)(uintptr_t) ap->a_target; + dnp = P9FS_VTON(dvp); + vses = dnp->p9fs_ses; + mp = vses->p9fs_mount; + newfid = NULL; + error = 0; + gid = vap->va_gid; + + P9_DEBUG(VOPS, "%s: dvp %p\n", __func__, dvp); + + /* + * Save the character present at namelen in nameptr string and + * null terminate the character to get the search name for p9_dir_walk + */ + tmpchr = cnp->cn_nameptr[cnp->cn_namelen]; + cnp->cn_nameptr[cnp->cn_namelen] = '\0'; + + dvfid = p9fs_get_fid(vses->clnt, dnp, cnp->cn_cred, VFID, -1, &error); + if (error != 0) + goto out; + + error = p9_create_symlink(dvfid, cnp->cn_nameptr, symtgt, gid); + if (error != 0) + goto out; + + /*create vnode for symtgt */ + newfid = p9_client_walk(dvfid, 1, &cnp->cn_nameptr, 1, &error); + if (newfid != NULL) { + error = p9fs_vget_common(mp, NULL, cnp->cn_lkflags, + dnp, newfid, vpp, cnp->cn_nameptr); + if (error != 0) + goto out; + } else + goto out; + + if ((cnp->cn_flags & MAKEENTRY) != 0) { + cache_enter(P9FS_NTOV(dnp), *vpp, cnp); + } + P9_DEBUG(VOPS, "%s: created file under vp %p node %p fid %ju\n", + __func__, *vpp, dnp, (uintmax_t)dvfid->fid); + + cnp->cn_nameptr[cnp->cn_namelen] = tmpchr; + return (error); + +out: + if (newfid != NULL) + p9_client_clunk(newfid); + cnp->cn_nameptr[cnp->cn_namelen] = tmpchr; + return (error); +} + +/* Create hard link */ +static int +p9fs_link(struct vop_link_args *ap) +{ + struct vnode *vp; + struct vnode *tdvp; + struct componentname *cnp; + struct p9fs_node *dnp; + struct p9fs_node *np; + struct p9fs_inode *inode; + struct p9fs_session *vses; + struct p9_fid *dvfid, 
*oldvfid; + int error; + + vp = ap->a_vp; + tdvp = ap->a_tdvp; + cnp = ap->a_cnp; + dnp = P9FS_VTON(tdvp); + np = P9FS_VTON(vp); + inode = &np->inode; + vses = np->p9fs_ses; + error = 0; + + P9_DEBUG(VOPS, "%s: tdvp %p vp %p\n", __func__, tdvp, vp); + + dvfid = p9fs_get_fid(vses->clnt, dnp, cnp->cn_cred, VFID, -1, &error); + if (error != 0) + return (error); + oldvfid = p9fs_get_fid(vses->clnt, np, cnp->cn_cred, VFID, -1, &error); + if (error != 0) + return (error); + + error = p9_create_hardlink(dvfid, oldvfid, cnp->cn_nameptr); + if (error != 0) + return (error); + /* Increment ref count on the inode */ + P9FS_INCR_LINKS(inode); + + return (0); +} + +/* Read contents of the symbolic link */ +static int +p9fs_readlink(struct vop_readlink_args *ap) +{ + struct vnode *vp; + struct uio *uio; + struct p9fs_node *dnp; + struct p9fs_session *vses; + struct p9_fid *dvfid; + int error, len; + char *target; + + vp = ap->a_vp; + uio = ap->a_uio; + dnp = P9FS_VTON(vp); + vses = dnp->p9fs_ses; + error = 0; + + P9_DEBUG(VOPS, "%s: vp %p\n", __func__, vp); + + dvfid = p9fs_get_fid(vses->clnt, dnp, ap->a_cred, VFID, -1, &error); + if (error != 0) + return (error); + + error = p9_readlink(dvfid, &target); + if (error != 0) + return (error); + + len = strlen(target); + error = uiomove(target, len, uio); + + return (0); +} + +/* + * Iterate through a directory. An entire 8k data is read into the I/O buffer. + * This buffer is parsed to make dir entries and fed to the user buffer to + * complete it to the VFS. 
+ */ +static int +p9fs_readdir(struct vop_readdir_args *ap) +{ + struct uio *uio; + struct vnode *vp; + struct dirent cde; + int64_t offset; + uint64_t diroffset; + struct p9fs_node *np; + int error; + int32_t count; + struct p9_client *clnt; + struct p9_dirent dent; + char *io_buffer; + struct p9_fid *vofid; + + uio = ap->a_uio; + vp = ap->a_vp; + np = P9FS_VTON(ap->a_vp); + offset = 0; + diroffset = 0; + error = 0; + count = 0; + clnt = np->p9fs_ses->clnt; + + P9_DEBUG(VOPS, "%s: vp %p, offset %jd, resid %zd\n", __func__, vp, (intmax_t) uio->uio_offset, uio->uio_resid); + + if (ap->a_uio->uio_iov->iov_len <= 0) + return (EINVAL); + + if (vp->v_type != VDIR) + return (ENOTDIR); + + vofid = p9fs_get_fid(clnt, np, ap->a_cred, VOFID, P9PROTO_OREAD, &error); + if (vofid == NULL) { + P9_DEBUG(ERROR, "%s: NULL FID\n", __func__); + return (EBADF); + } + + if (ap->a_eofflag != NULL) + *ap->a_eofflag = 0; + + io_buffer = uma_zalloc(p9fs_io_buffer_zone, M_WAITOK); + + /* We haven't reached the end yet. read more. */ + diroffset = uio->uio_offset; + while (uio->uio_resid >= sizeof(struct dirent)) { + /* + * We need to read more data as what is indicated by filesize because + * filesize is based on data stored in struct dirent structure but + * we read data in struct p9_dirent format which has different size. + * Hence we read max data(P9FS_IOUNIT) everytime from host, convert + * it into struct dirent structure and send it back. + */ + count = P9FS_IOUNIT; + bzero(io_buffer, P9FS_MTU); + count = p9_client_readdir(vofid, (char *)io_buffer, + diroffset, count); + + if (count == 0) { + if (ap->a_eofflag != NULL) + *ap->a_eofflag = 1; + break; + } + + if (count < 0) { + error = EIO; + goto out; + } + + offset = 0; + while (offset + QEMU_DIRENTRY_SZ <= count) { + + /* + * Read and make sense out of the buffer in one dirent + * This is part of 9p protocol read. This reads one p9_dirent, + * appends it to dirent(FREEBSD specifc) and continues to parse the buffer. 
+ */ + bzero(&dent, sizeof(dent)); + offset = p9_dirent_read(clnt, io_buffer, offset, count, + &dent); + if (offset < 0 || offset > count) { + error = EIO; + goto out; + } + + bzero(&cde, sizeof(cde)); + strncpy(cde.d_name, dent.d_name, dent.len); + cde.d_fileno = dent.qid.path; + cde.d_type = dent.d_type; + cde.d_namlen = dent.len; + cde.d_reclen = GENERIC_DIRSIZ(&cde); + + /* + * If there isn't enough space in the uio to return a + * whole dirent, break off read + */ + if (uio->uio_resid < GENERIC_DIRSIZ(&cde)) + break; + + /* Transfer */ + error = uiomove(&cde, GENERIC_DIRSIZ(&cde), uio); + if (error != 0) { + error = EIO; + goto out; + } + diroffset = dent.d_off; + } + } + /* Pass on last transferred offset */ + uio->uio_offset = diroffset; + +out: + uma_zfree(p9fs_io_buffer_zone, io_buffer); + + return (error); +} + +static void +p9fs_doio(struct vnode *vp, struct buf *bp, struct p9_fid *vofid, struct ucred *cr) +{ + struct uio *uiov; + struct iovec io; + int error; + uint64_t off, offset; + uint64_t filesize; + uint64_t resid; + uint32_t count; + int64_t ret; + struct p9fs_node *np; + char *io_buffer; + + error = 0; + np = P9FS_VTON(vp); + + filesize = np->inode.i_size; + uiov = malloc(sizeof(struct uio), M_P9UIOV, M_WAITOK); + uiov->uio_iov = &io; + uiov->uio_iovcnt = 1; + uiov->uio_segflg = UIO_SYSSPACE; + io_buffer = uma_zalloc(p9fs_io_buffer_zone, M_WAITOK | M_ZERO); + + if (bp->b_iocmd == BIO_READ) { + io.iov_len = uiov->uio_resid = bp->b_bcount; + io.iov_base = bp->b_data; + uiov->uio_rw = UIO_READ; + + switch (vp->v_type) { + + case VREG: + { + uiov->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE; + + if (uiov->uio_resid) { + int left = uiov->uio_resid; + int nread = bp->b_bcount - left; + + if (left > 0) + bzero((char *)bp->b_data + nread, left); + } + /* where in the file are we to start reading */ + offset = uiov->uio_offset; + if (uiov->uio_offset >= filesize) + goto out; + + while ((resid = uiov->uio_resid) > 0) { + if (offset >= filesize) + break; 
+ count = min(filesize - uiov->uio_offset, resid); + if (count == 0) + break; + + P9_DEBUG(VOPS, "%s: read called %#zx at %#jx\n", + __func__, uiov->uio_resid, (uintmax_t)uiov->uio_offset); + + /* Copy count bytes into the uio */ + ret = p9_client_read(vofid, offset, count, io_buffer); + error = uiomove(io_buffer, ret, uiov); + + if (error != 0) + goto out; + offset += ret; + } + break; + } + default: + printf("vfs: type %x unexpected\n", vp->v_type); + break; + } + } else { + if (bp->b_dirtyend > bp->b_dirtyoff) { + io.iov_len = uiov->uio_resid = bp->b_dirtyend - bp->b_dirtyoff; + uiov->uio_offset = ((off_t)bp->b_blkno) * PAGE_SIZE + bp->b_dirtyoff; + io.iov_base = (char *)bp->b_data + bp->b_dirtyoff; + uiov->uio_rw = UIO_WRITE; + + if (uiov->uio_offset < 0) { + error = EINVAL; + goto out; + } + + if (uiov->uio_resid == 0) + goto out; + + resid = uiov->uio_resid; + offset = uiov->uio_offset; + error = 0; + + while ((resid = uiov->uio_resid) > 0) { + off = 0; + count = MIN(resid, P9FS_IOUNIT); + error = uiomove(io_buffer, count, uiov); + if (error != 0) { + goto out; + } + + while (count > 0) { + /* Copy count bytes from the uio */ + ret = p9_client_write(vofid, offset, count, + io_buffer + off); + if (ret < 0) + goto out; + + P9_DEBUG(VOPS, "%s: write called %#zx at %#jx\n", + __func__, uiov->uio_resid, (uintmax_t)uiov->uio_offset); + off += ret; + offset += ret; + count -= ret; + } + } + + /* Update the fields in the node to reflect the change */ + if (filesize < uiov->uio_offset + uiov->uio_resid) { + np->inode.i_size = uiov->uio_offset + uiov->uio_resid; + vnode_pager_setsize(vp, uiov->uio_offset + uiov->uio_resid); + /* update the modified timers. 
*/ + p9fs_itimes(vp); + } + } else { + bp->b_resid = 0; + goto out1; + } + } +out: + /* Set the error */ + if (error != 0) { + bp->b_error = error; + bp->b_ioflags |= BIO_ERROR; + } + bp->b_resid = uiov->uio_resid; +out1: + bufdone(bp); + uma_zfree(p9fs_io_buffer_zone, io_buffer); + free(uiov, M_P9UIOV); +} + +/* + * The I/O buffer is mapped to a uio and a client_write/client_read is performed + * the same way as p9fs_read and p9fs_write. + */ +static int +p9fs_strategy(struct vop_strategy_args *ap) +{ + struct vnode *vp; + struct buf *bp; + struct ucred *cr; + int error; + struct open_fid_state ostate; + + vp = ap->a_vp; + bp = ap->a_bp; + error = 0; + + P9_DEBUG(VOPS, "%s: vp %p, iocmd %d\n ", __func__, vp, bp->b_iocmd); + + if (bp->b_iocmd == BIO_READ) + cr = bp->b_rcred; + else + cr = bp->b_wcred; + + error = p9fs_get_open_fid(vp, bp->b_iocmd == BIO_READ ? FREAD : FWRITE, cr, &ostate); + if (error) { + P9_DEBUG(ERROR, "%s: p9fs_get_open_fid failed: %d\n", __func__, error); + bp->b_error = error; + bp->b_ioflags |= BIO_ERROR; + bufdone(bp); + return (0); + } + + p9fs_doio(vp, bp, ostate.vofid, cr); + p9fs_release_open_fid(vp, cr, &ostate); + + return (0); +} + +/* Rename a file */ +static int +p9fs_rename(struct vop_rename_args *ap) +{ + struct vnode *tvp; + struct vnode *tdvp; + struct vnode *fvp; + struct vnode *fdvp; + struct componentname *tcnp; + struct componentname *fcnp; + struct p9fs_node *tdnode; + struct p9fs_node *fdnode; + struct p9fs_inode *fdinode; + struct p9fs_node *fnode; + struct p9fs_inode *finode; + struct p9fs_session *vses; + struct p9fs_node *tnode; + struct p9fs_inode *tinode; + struct p9_fid *olddirvfid, *newdirvfid ; + int error; + + tvp = ap->a_tvp; + tdvp = ap->a_tdvp; + fvp = ap->a_fvp; + fdvp = ap->a_fdvp; + tcnp = ap->a_tcnp; + fcnp = ap->a_fcnp; + tdnode = P9FS_VTON(tdvp); + fdnode = P9FS_VTON(fdvp); + fdinode = &fdnode->inode; + fnode = P9FS_VTON(fvp); + finode = &fnode->inode; + vses = fnode->p9fs_ses; + error = 0; + + 
P9_DEBUG(VOPS, "%s: tvp %p, tdvp %p, fvp %p, fdvp %p\n ", __func__, tvp, tdvp, fvp, fdvp); + + /* Check for cross mount operation */ + if (fvp->v_mount != tdvp->v_mount || + (tvp && (fvp->v_mount != tvp->v_mount))) { + error = EXDEV; + goto out; + } + + /* warning if you are renaming to the same name */ + if (fvp == tvp) + error = 0; + + olddirvfid = p9fs_get_fid(vses->clnt, fdnode, fcnp->cn_cred, VFID, -1, &error); + if (error != 0) + goto out; + newdirvfid = p9fs_get_fid(vses->clnt, tdnode, tcnp->cn_cred, VFID, -1, &error); + if (error != 0) + goto out; + + error = p9_client_renameat(olddirvfid, fcnp->cn_nameptr, newdirvfid, tcnp->cn_nameptr); + if (error != 0) + goto out; + + /* + * decrement the link count on the "from" file whose name is going + * to be changed if its a directory + */ + if (fvp->v_type == VDIR) { + if (tvp && tvp->v_type == VDIR) + cache_purge(tdvp); + P9FS_DECR_LINKS(fdinode); + cache_purge(fdvp); + } + + /* Taking exclusive lock on the from node before decrementing the link count */ + if ((error = vn_lock(fvp, LK_EXCLUSIVE)) != 0) + goto out; + P9FS_DECR_LINKS(finode); + VOP_UNLOCK(fvp); + + if (tvp) { + tnode = P9FS_VTON(tvp); + tinode = &tnode->inode; + P9FS_DECR_LINKS(tinode); + } + +out: + if (tdvp == tvp) + vrele(tdvp); + else + vput(tdvp); + if (tvp) + vput(tvp); + vrele(fdvp); + vrele(fvp); + return (error); +} + +/* + * Put VM pages, synchronously. 
+ * XXX: like smbfs, cannot use vop_stdputpages due to mapping requirement + */ +static int +p9fs_putpages(struct vop_putpages_args *ap) +{ + struct uio uio; + struct iovec iov; + int i, error, npages, count; + off_t offset; + int *rtvals; + struct vnode *vp; + struct thread *td; + struct ucred *cred; + struct p9fs_node *np; + vm_page_t *pages; + vm_offset_t kva; + struct buf *bp; + + vp = ap->a_vp; + np = P9FS_VTON(vp); + td = curthread; + cred = curthread->td_ucred; + pages = ap->a_m; + count = ap->a_count; + rtvals = ap->a_rtvals; + npages = btoc(count); + offset = IDX_TO_OFF(pages[0]->pindex); + + /* + * When putting pages, do not extend file past EOF. + */ + if (offset + count > np->inode.i_size) { + count = np->inode.i_size - offset; + if (count < 0) + count = 0; + } + + for (i = 0; i < npages; i++) + rtvals[i] = VM_PAGER_ERROR; + + bp = uma_zalloc(p9fs_pbuf_zone, M_WAITOK); + kva = (vm_offset_t) bp->b_data; + pmap_qenter(kva, pages, npages); + + VM_CNT_INC(v_vnodeout); + VM_CNT_ADD(v_vnodepgsout, count); + + iov.iov_base = (caddr_t) kva; + iov.iov_len = count; + uio.uio_iov = &iov; + uio.uio_iovcnt = 1; + uio.uio_offset = offset; + uio.uio_resid = count; + uio.uio_segflg = UIO_SYSSPACE; + uio.uio_rw = UIO_WRITE; + uio.uio_td = td; + + P9_DEBUG(VOPS, "of=%jd,resid=%zd\n", (intmax_t)uio.uio_offset, uio.uio_resid); + + error = VOP_WRITE(vp, &uio, vnode_pager_putpages_ioflags(ap->a_sync), + cred); + + pmap_qremove(kva, npages); + uma_zfree(p9fs_pbuf_zone, bp); + + if (error == 0) + vnode_pager_undirty_pages(pages, rtvals, count - uio.uio_resid, + np->inode.i_size - offset, npages * PAGE_SIZE); + + return (rtvals[0]); +} + +struct vop_vector p9fs_vnops = { + .vop_default = &default_vnodeops, + .vop_lookup = p9fs_lookup, + .vop_open = p9fs_open, + .vop_close = p9fs_close, + .vop_access = p9fs_access, + .vop_getattr = p9fs_getattr_dotl, + .vop_setattr = p9fs_setattr_dotl, + .vop_reclaim = p9fs_reclaim, + .vop_inactive = p9fs_inactive, + .vop_readdir = p9fs_readdir, 
+ .vop_create = p9fs_create, + .vop_mknod = p9fs_mknod, + .vop_read = p9fs_read, + .vop_write = p9fs_write, + .vop_remove = p9fs_remove, + .vop_mkdir = p9fs_mkdir, + .vop_rmdir = p9fs_rmdir, + .vop_strategy = p9fs_strategy, + .vop_symlink = p9fs_symlink, + .vop_rename = p9fs_rename, + .vop_link = p9fs_link, + .vop_readlink = p9fs_readlink, + .vop_putpages = p9fs_putpages, +}; +VFS_VOP_VECTOR_REGISTER(p9fs_vnops); diff --git a/sys/fs/procfs/procfs_mem.c b/sys/fs/procfs/procfs_mem.c index 6ef725ee0ee7..0020b8f8a8d8 100644 --- a/sys/fs/procfs/procfs_mem.c +++ b/sys/fs/procfs/procfs_mem.c @@ -41,6 +41,7 @@ #include <sys/ptrace.h> #include <sys/systm.h> #include <sys/uio.h> +#include <sys/priv.h> #include <fs/pseudofs/pseudofs.h> #include <fs/procfs/procfs.h> diff --git a/sys/fs/procfs/procfs_osrel.c b/sys/fs/procfs/procfs_osrel.c index fd6a4d7e0eea..0102090de4da 100644 --- a/sys/fs/procfs/procfs_osrel.c +++ b/sys/fs/procfs/procfs_osrel.c @@ -45,9 +45,11 @@ procfs_doosrel(PFS_FILL_ARGS) if (uio == NULL) return (EOPNOTSUPP); - if (uio->uio_rw == UIO_READ) { + switch (uio->uio_rw) { + case UIO_READ: sbuf_printf(sb, "%d\n", p->p_osrel); - } else { + break; + case UIO_WRITE: sbuf_trim(sb); sbuf_finish(sb); pp = sbuf_data(sb); @@ -62,6 +64,7 @@ procfs_doosrel(PFS_FILL_ARGS) osrel = ov; } p->p_osrel = osrel; + break; } return (0); } diff --git a/sys/fs/procfs/procfs_rlimit.c b/sys/fs/procfs/procfs_rlimit.c index 83e11f44b3f8..6be933ac6e44 100644 --- a/sys/fs/procfs/procfs_rlimit.c +++ b/sys/fs/procfs/procfs_rlimit.c @@ -57,6 +57,9 @@ #include <fs/pseudofs/pseudofs.h> #include <fs/procfs/procfs.h> +_Static_assert(nitems(rlimit_ident) == RLIM_NLIMITS, + "resource.h RLIMIT_IDENT needs update"); + int procfs_doprocrlimit(PFS_FILL_ARGS) { diff --git a/sys/fs/procfs/procfs_status.c b/sys/fs/procfs/procfs_status.c index 9c2f42a45102..38070e0946bb 100644 --- a/sys/fs/procfs/procfs_status.c +++ b/sys/fs/procfs/procfs_status.c @@ -61,6 +61,7 @@ int procfs_doprocstatus(PFS_FILL_ARGS) { 
+ struct timeval start, ut, st; struct session *sess; struct thread *tdfirst; struct tty *tp; @@ -121,21 +122,16 @@ procfs_doprocstatus(PFS_FILL_ARGS) wmesg = "nochan"; thread_unlock(tdfirst); - if (p->p_flag & P_INMEM) { - struct timeval start, ut, st; - - PROC_STATLOCK(p); - calcru(p, &ut, &st); - PROC_STATUNLOCK(p); - start = p->p_stats->p_start; - getboottime(&boottime); - timevaladd(&start, &boottime); - sbuf_printf(sb, " %jd,%ld %jd,%ld %jd,%ld", - (intmax_t)start.tv_sec, start.tv_usec, - (intmax_t)ut.tv_sec, ut.tv_usec, - (intmax_t)st.tv_sec, st.tv_usec); - } else - sbuf_printf(sb, " -1,-1 -1,-1 -1,-1"); + PROC_STATLOCK(p); + calcru(p, &ut, &st); + PROC_STATUNLOCK(p); + start = p->p_stats->p_start; + getboottime(&boottime); + timevaladd(&start, &boottime); + sbuf_printf(sb, " %jd,%ld %jd,%ld %jd,%ld", + (intmax_t)start.tv_sec, start.tv_usec, + (intmax_t)ut.tv_sec, ut.tv_usec, + (intmax_t)st.tv_sec, st.tv_usec); sbuf_printf(sb, " %s", wmesg); diff --git a/sys/fs/pseudofs/pseudofs.c b/sys/fs/pseudofs/pseudofs.c index eb4ca8a82456..ef45f96a6192 100644 --- a/sys/fs/pseudofs/pseudofs.c +++ b/sys/fs/pseudofs/pseudofs.c @@ -98,12 +98,10 @@ pfs_alloc_node(struct pfs_info *pi, const char *name, pfs_type_t type) /* * Add a node to a directory */ -static void +static int pfs_add_node(struct pfs_node *parent, struct pfs_node *pn) { -#ifdef INVARIANTS struct pfs_node *iter; -#endif KASSERT(parent != NULL, ("%s(): parent is NULL", __func__)); @@ -123,8 +121,6 @@ pfs_add_node(struct pfs_node *parent, struct pfs_node *pn) KASSERT(iter->pn_type != pfstype_procdir, ("%s(): nested process directories", __func__)); for (iter = parent->pn_nodes; iter != NULL; iter = iter->pn_next) { - KASSERT(strcmp(pn->pn_name, iter->pn_name) != 0, - ("%s(): homonymous siblings", __func__)); if (pn->pn_type == pfstype_procdir) KASSERT(iter->pn_type != pfstype_procdir, ("%s(): sibling process directories", __func__)); @@ -133,8 +129,19 @@ pfs_add_node(struct pfs_node *parent, struct pfs_node 
*pn) pn->pn_parent = parent; pfs_fileno_alloc(pn); - pfs_lock(parent); + for (iter = parent->pn_nodes; iter != NULL; iter = iter->pn_next) { + if (strcmp(pn->pn_name, iter->pn_name) != 0) + continue; + printf("pfs_add_node: homonymous siblings: '%s/%s' type %d", + parent->pn_name, pn->pn_name, pn->pn_type); + /* Do not detach, because we are not yet attached. */ + pn->pn_parent = NULL; + pfs_unlock(parent); + return (EEXIST); + } + + if ((parent->pn_flags & PFS_PROCDEP) != 0) pn->pn_flags |= PFS_PROCDEP; if (parent->pn_nodes == NULL) { @@ -151,10 +158,11 @@ pfs_add_node(struct pfs_node *parent, struct pfs_node *pn) parent->pn_last_node = pn; } pfs_unlock(parent); + return (0); } /* - * Detach a node from its aprent + * Detach a node from its parent */ static void pfs_detach_node(struct pfs_node *pn) @@ -196,6 +204,7 @@ static int pfs_fixup_dir_flags(struct pfs_node *parent, int flags) { struct pfs_node *dot, *dotdot; + int rc; dot = pfs_alloc_node_flags(parent->pn_info, ".", pfstype_this, flags); if (dot == NULL) @@ -205,9 +214,14 @@ pfs_fixup_dir_flags(struct pfs_node *parent, int flags) pfs_destroy(dot); return (ENOMEM); } - pfs_add_node(parent, dot); - pfs_add_node(parent, dotdot); - return (0); + rc = pfs_add_node(parent, dot); + if (rc == 0) + rc = pfs_add_node(parent, dotdot); + if (rc != 0) { + pfs_destroy(dot); + pfs_destroy(dotdot); + } + return (rc); } static void @@ -236,11 +250,12 @@ pfs_create_dir(struct pfs_node *parent, const char *name, pn->pn_vis = vis; pn->pn_destroy = destroy; pn->pn_flags = flags; - pfs_add_node(parent, pn); - rc = pfs_fixup_dir_flags(pn, flags); - if (rc) { + rc = pfs_add_node(parent, pn); + if (rc == 0) + rc = pfs_fixup_dir_flags(pn, flags); + if (rc != 0) { pfs_destroy(pn); - return (NULL); + pn = NULL; } return (pn); } @@ -263,8 +278,10 @@ pfs_create_file(struct pfs_node *parent, const char *name, pfs_fill_t fill, pn->pn_vis = vis; pn->pn_destroy = destroy; pn->pn_flags = flags; - pfs_add_node(parent, pn); - + if 
(pfs_add_node(parent, pn) != 0) { + pfs_destroy(pn); + pn = NULL; + } return (pn); } @@ -286,7 +303,10 @@ pfs_create_link(struct pfs_node *parent, const char *name, pfs_fill_t fill, pn->pn_vis = vis; pn->pn_destroy = destroy; pn->pn_flags = flags; - pfs_add_node(parent, pn); + if (pfs_add_node(parent, pn) != 0) { + pfs_destroy(pn); + pn = NULL; + } return (pn); } diff --git a/sys/fs/smbfs/smbfs_io.c b/sys/fs/smbfs/smbfs_io.c index 324f38abd10e..35454998fc8e 100644 --- a/sys/fs/smbfs/smbfs_io.c +++ b/sys/fs/smbfs/smbfs_io.c @@ -629,7 +629,7 @@ smbfs_vinvalbuf(struct vnode *vp, struct thread *td) while (np->n_flag & NFLUSHINPROG) { np->n_flag |= NFLUSHWANT; - error = tsleep(&np->n_flag, PRIBIO + 2, "smfsvinv", 2 * hz); + error = tsleep(&np->n_flag, PRIBIO, "smfsvinv", 2 * hz); error = smb_td_intr(td); if (error == EINTR) return EINTR; diff --git a/sys/fs/smbfs/smbfs_vnops.c b/sys/fs/smbfs/smbfs_vnops.c index 1e7dcafb1121..5d412cabadb8 100644 --- a/sys/fs/smbfs/smbfs_vnops.c +++ b/sys/fs/smbfs/smbfs_vnops.c @@ -810,6 +810,9 @@ smbfs_pathconf(struct vop_pathconf_args *ap) case _PC_NO_TRUNC: *retval = 1; break; + case _PC_HAS_HIDDENSYSTEM: + *retval = 1; + break; default: error = vop_stdpathconf(ap); } @@ -1051,7 +1054,7 @@ smbfs_lookup(struct vop_lookup_args *ap) struct smbfattr fattr, *fap; struct smb_cred *scred; char *name = cnp->cn_nameptr; - int flags = cnp->cn_flags; + uint64_t flags = cnp->cn_flags; int nameiop = cnp->cn_nameiop; int nmlen = cnp->cn_namelen; int error, islastcn, isdot; diff --git a/sys/fs/tarfs/tarfs.h b/sys/fs/tarfs/tarfs.h index ff1985e488cd..46fa8b55b3ad 100644 --- a/sys/fs/tarfs/tarfs.h +++ b/sys/fs/tarfs/tarfs.h @@ -74,7 +74,7 @@ struct tarfs_node { struct timespec mtime; struct timespec ctime; struct timespec birthtime; - unsigned long gen; + uint32_t gen; /* Block map */ size_t nblk; @@ -161,10 +161,9 @@ struct tarfs_zio { }; struct tarfs_fid { - u_short len; /* length of data in bytes */ - u_short data0; /* force alignment */ - ino_t 
ino; - unsigned long gen; + u_short len; /* length of data in bytes */ + uint32_t gen; + ino_t ino; }; #define TARFS_NODE_LOCK(tnp) \ diff --git a/sys/fs/tarfs/tarfs_vnops.c b/sys/fs/tarfs/tarfs_vnops.c index 8c97fab185fc..afb8e05f5929 100644 --- a/sys/fs/tarfs/tarfs_vnops.c +++ b/sys/fs/tarfs/tarfs_vnops.c @@ -668,6 +668,8 @@ tarfs_vptofh(struct vop_vptofh_args *ap) { struct tarfs_fid *tfp; struct tarfs_node *tnp; + _Static_assert(sizeof(struct tarfs_fid) <= sizeof(struct fid), + "struct tarfs_fid cannot be larger than struct fid"); tfp = (struct tarfs_fid *)ap->a_fhp; tnp = VP_TO_TARFS_NODE(ap->a_vp); diff --git a/sys/fs/tmpfs/tmpfs.h b/sys/fs/tmpfs/tmpfs.h index c28f3a02a7bf..52307cc7c7b2 100644 --- a/sys/fs/tmpfs/tmpfs.h +++ b/sys/fs/tmpfs/tmpfs.h @@ -292,6 +292,15 @@ struct tmpfs_node { */ off_t tn_readdir_lastn; struct tmpfs_dirent * tn_readdir_lastp; + + /* + * Total size of whiteout directory entries. This + * must be a multiple of sizeof(struct tmpfs_dirent) + * and is used to determine whether a directory is + * empty (excluding whiteout entries) during rename/ + * rmdir operations. + */ + off_t tn_wht_size; /* (v) */ } tn_dir; /* Valid when tn_type == VLNK. */ @@ -439,11 +448,10 @@ struct tmpfs_mount { * NFS code. 
*/ struct tmpfs_fid_data { + unsigned short tfd_len; ino_t tfd_id; unsigned long tfd_gen; -}; -_Static_assert(sizeof(struct tmpfs_fid_data) <= MAXFIDSZ, - "(struct tmpfs_fid_data) is larger than (struct fid).fid_data"); +} __packed; struct tmpfs_dir_cursor { struct tmpfs_dirent *tdc_current; @@ -484,6 +492,7 @@ int tmpfs_dir_getdents(struct tmpfs_mount *, struct tmpfs_node *, struct uio *, int, uint64_t *, int *); int tmpfs_dir_whiteout_add(struct vnode *, struct componentname *); void tmpfs_dir_whiteout_remove(struct vnode *, struct componentname *); +void tmpfs_dir_clear_whiteouts(struct vnode *); int tmpfs_reg_resize(struct vnode *, off_t, boolean_t); int tmpfs_reg_punch_hole(struct vnode *vp, off_t *, off_t *); int tmpfs_chflags(struct vnode *, u_long, struct ucred *, struct thread *); @@ -533,6 +542,8 @@ tmpfs_update(struct vnode *vp) #define TMPFS_VALIDATE_DIR(node) do { \ MPASS((node)->tn_type == VDIR); \ MPASS((node)->tn_size % sizeof(struct tmpfs_dirent) == 0); \ + MPASS((node)->tn_dir.tn_wht_size % sizeof(struct tmpfs_dirent) == 0); \ + MPASS((node)->tn_dir.tn_wht_size <= (node)->tn_size); \ } while (0) /* diff --git a/sys/fs/tmpfs/tmpfs_subr.c b/sys/fs/tmpfs/tmpfs_subr.c index 9bdcc4575511..1237f6b92cdb 100644 --- a/sys/fs/tmpfs/tmpfs_subr.c +++ b/sys/fs/tmpfs/tmpfs_subr.c @@ -120,7 +120,7 @@ tmpfs_pager_writecount_recalc(vm_object_t object, vm_offset_t old, /* * Forced unmount? 
*/ - if (vp == NULL) { + if (vp == NULL || vp->v_object == NULL) { KASSERT((object->flags & OBJ_TMPFS_VREF) == 0, ("object %p with OBJ_TMPFS_VREF but without vnode", object)); @@ -183,6 +183,9 @@ tmpfs_pager_release_writecount(vm_object_t object, vm_offset_t start, KASSERT((object->flags & OBJ_ANON) == 0, ("%s: object %p with OBJ_ANON", __func__, object)); old = object->un_pager.swp.writemappings; + KASSERT(old >= (vm_ooffset_t)end - start, + ("tmpfs obj %p writecount %jx dec %jx", object, (uintmax_t)old, + (uintmax_t)((vm_ooffset_t)end - start))); object->un_pager.swp.writemappings -= (vm_ooffset_t)end - start; new = object->un_pager.swp.writemappings; tmpfs_pager_writecount_recalc(object, old, new); @@ -346,7 +349,7 @@ tmpfs_node_init(void *mem, int size, int flags) node = mem; node->tn_id = 0; - mtx_init(&node->tn_interlock, "tmpfsni", NULL, MTX_DEF); + mtx_init(&node->tn_interlock, "tmpfsni", NULL, MTX_DEF | MTX_NEW); node->tn_gen = arc4random(); return (0); } @@ -425,7 +428,7 @@ sysctl_mem_percent(SYSCTL_HANDLER_ARGS) if ((unsigned) percent > 100) return (EINVAL); - *(long *)arg1 = percent; + *(int *)arg1 = percent; tmpfs_set_reserve_from_percent(); return (0); } @@ -440,7 +443,7 @@ tmpfs_set_reserve_from_percent(void) } SYSCTL_PROC(_vfs_tmpfs, OID_AUTO, memory_percent, - CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, &tmpfs_mem_percent, 0, + CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, &tmpfs_mem_percent, 0, sysctl_mem_percent, "I", "Percent of available memory that can be used if no size limit"); @@ -490,50 +493,11 @@ static int tmpfs_partial_page_invalidate(vm_object_t object, vm_pindex_t idx, int base, int end, boolean_t ignerr) { - vm_page_t m; - int rv, error; - - VM_OBJECT_ASSERT_WLOCKED(object); - KASSERT(base >= 0, ("%s: base %d", __func__, base)); - KASSERT(end - base <= PAGE_SIZE, ("%s: base %d end %d", __func__, base, - end)); - error = 0; - -retry: - m = vm_page_grab(object, idx, VM_ALLOC_NOCREAT); - if (m != NULL) { - MPASS(vm_page_all_valid(m)); - 
} else if (vm_pager_has_page(object, idx, NULL, NULL)) { - m = vm_page_alloc(object, idx, VM_ALLOC_NORMAL | - VM_ALLOC_WAITFAIL); - if (m == NULL) - goto retry; - vm_object_pip_add(object, 1); - VM_OBJECT_WUNLOCK(object); - rv = vm_pager_get_pages(object, &m, 1, NULL, NULL); - VM_OBJECT_WLOCK(object); - vm_object_pip_wakeup(object); - if (rv == VM_PAGER_OK) { - /* - * Since the page was not resident, and therefore not - * recently accessed, immediately enqueue it for - * asynchronous laundering. The current operation is - * not regarded as an access. - */ - vm_page_launder(m); - } else { - vm_page_free(m); - m = NULL; - if (!ignerr) - error = EIO; - } - } - if (m != NULL) { - pmap_zero_page_area(m, base, end - base); - vm_page_set_dirty(m); - vm_page_xunbusy(m); - } + int error; + error = vm_page_grab_zero_partial(object, idx, base, end); + if (ignerr) + error = 0; return (error); } @@ -643,6 +607,7 @@ tmpfs_alloc_node(struct mount *mp, struct tmpfs_mount *tmp, __enum_uint8(vtype) nnode->tn_dir.tn_parent = (parent == NULL) ? nnode : parent; nnode->tn_dir.tn_readdir_lastn = 0; nnode->tn_dir.tn_readdir_lastp = NULL; + nnode->tn_dir.tn_wht_size = 0; nnode->tn_links++; TMPFS_NODE_LOCK(nnode->tn_dir.tn_parent); nnode->tn_dir.tn_parent->tn_links++; @@ -954,6 +919,8 @@ tmpfs_destroy_vobject(struct vnode *vp, vm_object_t obj) VM_OBJECT_WLOCK(obj); VI_LOCK(vp); + vp->v_object = NULL; + /* * May be going through forced unmount. */ @@ -1094,15 +1061,19 @@ loop: KASSERT((object->flags & OBJ_TMPFS_VREF) == 0, ("%s: object %p with OBJ_TMPFS_VREF but without vnode", __func__, object)); - KASSERT(object->un_pager.swp.writemappings == 0, - ("%s: object %p has writemappings", - __func__, object)); VI_LOCK(vp); KASSERT(vp->v_object == NULL, ("Not NULL v_object in tmpfs")); vp->v_object = object; vn_irflag_set_locked(vp, (tm->tm_pgread ? 
VIRF_PGREAD : 0) | VIRF_TEXT_REF); VI_UNLOCK(vp); + VNASSERT((object->flags & OBJ_TMPFS_VREF) == 0, vp, + ("leaked OBJ_TMPFS_VREF")); + if (object->un_pager.swp.writemappings > 0) { + vrefact(vp); + vlazy(vp); + vm_object_set_flag(object, OBJ_TMPFS_VREF); + } VM_OBJECT_WUNLOCK(object); break; case VDIR: @@ -1822,13 +1793,16 @@ int tmpfs_dir_whiteout_add(struct vnode *dvp, struct componentname *cnp) { struct tmpfs_dirent *de; + struct tmpfs_node *dnode; int error; error = tmpfs_alloc_dirent(VFS_TO_TMPFS(dvp->v_mount), NULL, cnp->cn_nameptr, cnp->cn_namelen, &de); if (error != 0) return (error); + dnode = VP_TO_TMPFS_DIR(dvp); tmpfs_dir_attach(dvp, de); + dnode->tn_dir.tn_wht_size += sizeof(*de); return (0); } @@ -1836,14 +1810,44 @@ void tmpfs_dir_whiteout_remove(struct vnode *dvp, struct componentname *cnp) { struct tmpfs_dirent *de; + struct tmpfs_node *dnode; - de = tmpfs_dir_lookup(VP_TO_TMPFS_DIR(dvp), NULL, cnp); + dnode = VP_TO_TMPFS_DIR(dvp); + de = tmpfs_dir_lookup(dnode, NULL, cnp); MPASS(de != NULL && de->td_node == NULL); + MPASS(dnode->tn_dir.tn_wht_size >= sizeof(*de)); + dnode->tn_dir.tn_wht_size -= sizeof(*de); tmpfs_dir_detach(dvp, de); tmpfs_free_dirent(VFS_TO_TMPFS(dvp->v_mount), de); } /* + * Frees any dirents still associated with the directory represented + * by dvp in preparation for the removal of the directory. This is + * required when removing a directory which contains only whiteout + * entries. 
+ */ +void +tmpfs_dir_clear_whiteouts(struct vnode *dvp) +{ + struct tmpfs_dir_cursor dc; + struct tmpfs_dirent *de; + struct tmpfs_node *dnode; + + dnode = VP_TO_TMPFS_DIR(dvp); + + while ((de = tmpfs_dir_first(dnode, &dc)) != NULL) { + KASSERT(de->td_node == NULL, ("%s: non-whiteout dirent %p", + __func__, de)); + dnode->tn_dir.tn_wht_size -= sizeof(*de); + tmpfs_dir_detach(dvp, de); + tmpfs_free_dirent(VFS_TO_TMPFS(dvp->v_mount), de); + } + MPASS(dnode->tn_size == 0); + MPASS(dnode->tn_dir.tn_wht_size == 0); +} + +/* * Resizes the aobj associated with the regular file pointed to by 'vp' to the * size 'newsize'. 'vp' must point to a vnode that represents a regular file. * 'newsize' must be positive. diff --git a/sys/fs/tmpfs/tmpfs_vfsops.c b/sys/fs/tmpfs/tmpfs_vfsops.c index 32eb9c958df1..431893b77bb9 100644 --- a/sys/fs/tmpfs/tmpfs_vfsops.c +++ b/sys/fs/tmpfs/tmpfs_vfsops.c @@ -208,7 +208,7 @@ again: continue; } vm = vmspace_acquire_ref(p); - _PHOLD_LITE(p); + _PHOLD(p); PROC_UNLOCK(p); if (vm == NULL) { PRELE(p); @@ -585,29 +585,25 @@ static int tmpfs_fhtovp(struct mount *mp, struct fid *fhp, int flags, struct vnode **vpp) { - struct tmpfs_fid_data tfd; + struct tmpfs_fid_data *tfd; struct tmpfs_mount *tmp; struct tmpfs_node *node; int error; - if (fhp->fid_len != sizeof(tfd)) + if (fhp->fid_len != sizeof(*tfd)) return (EINVAL); - /* - * Copy from fid_data onto the stack to avoid unaligned pointer use. - * See the comment in sys/mount.h on struct fid for details. 
- */ - memcpy(&tfd, fhp->fid_data, fhp->fid_len); + tfd = (struct tmpfs_fid_data *)fhp; tmp = VFS_TO_TMPFS(mp); - if (tfd.tfd_id >= tmp->tm_nodes_max) + if (tfd->tfd_id >= tmp->tm_nodes_max) return (EINVAL); TMPFS_LOCK(tmp); LIST_FOREACH(node, &tmp->tm_nodes_used, tn_entries) { - if (node->tn_id == tfd.tfd_id && - node->tn_gen == tfd.tfd_gen) { + if (node->tn_id == tfd->tfd_id && + node->tn_gen == tfd->tfd_gen) { tmpfs_ref_node(node); break; } diff --git a/sys/fs/tmpfs/tmpfs_vnops.c b/sys/fs/tmpfs/tmpfs_vnops.c index 718cfef6bfa3..9d2a587b177a 100644 --- a/sys/fs/tmpfs/tmpfs_vnops.c +++ b/sys/fs/tmpfs/tmpfs_vnops.c @@ -476,6 +476,7 @@ tmpfs_stat(struct vop_stat_args *v) sb->st_blksize = PAGE_SIZE; sb->st_flags = node->tn_flags; sb->st_gen = node->tn_gen; + sb->st_filerev = 0; if (vp->v_type == VREG) { #ifdef __ILP32__ vm_object_t obj = node->tn_reg.tn_aobj; @@ -1078,7 +1079,9 @@ tmpfs_rename(struct vop_rename_args *v) } if (fnode->tn_type == VDIR && tnode->tn_type == VDIR) { - if (tnode->tn_size > 0) { + if (tnode->tn_size != 0 && + ((tcnp->cn_flags & IGNOREWHITEOUT) == 0 || + tnode->tn_size > tnode->tn_dir.tn_wht_size)) { error = ENOTEMPTY; goto out_locked; } @@ -1239,6 +1242,16 @@ tmpfs_rename(struct vop_rename_args *v) tde = tmpfs_dir_lookup(tdnode, tnode, tcnp); tmpfs_dir_detach(tdvp, tde); + /* + * If we are overwriting a directory, per the ENOTEMPTY check + * above it must either be empty or contain only whiteout + * entries. In the latter case (which can only happen if + * IGNOREWHITEOUT was passed in tcnp->cn_flags), clear the + * whiteout entries to avoid leaking memory. + */ + if (tnode->tn_type == VDIR && tnode->tn_size > 0) + tmpfs_dir_clear_whiteouts(tvp); + /* Update node's ctime because of possible hardlinks. 
*/ tnode->tn_status |= TMPFS_NODE_CHANGED; tmpfs_update(tvp); @@ -1309,6 +1322,7 @@ tmpfs_rmdir(struct vop_rmdir_args *v) { struct vnode *dvp = v->a_dvp; struct vnode *vp = v->a_vp; + struct componentname *cnp = v->a_cnp; int error; struct tmpfs_dirent *de; @@ -1320,13 +1334,18 @@ tmpfs_rmdir(struct vop_rmdir_args *v) dnode = VP_TO_TMPFS_DIR(dvp); node = VP_TO_TMPFS_DIR(vp); - /* Directories with more than two entries ('.' and '..') cannot be - * removed. */ - if (node->tn_size > 0) { - error = ENOTEMPTY; - goto out; - } + /* + * Directories with more than two non-whiteout entries ('.' and '..') + * cannot be removed. + */ + if (node->tn_size != 0 && + ((cnp->cn_flags & IGNOREWHITEOUT) == 0 || + node->tn_size > node->tn_dir.tn_wht_size)) { + error = ENOTEMPTY; + goto out; + } + /* Check flags to see if we are allowed to remove the directory. */ if ((dnode->tn_flags & APPEND) || (node->tn_flags & (NOUNLINK | IMMUTABLE | APPEND))) { error = EPERM; @@ -1334,27 +1353,31 @@ tmpfs_rmdir(struct vop_rmdir_args *v) } /* This invariant holds only if we are not trying to remove "..". - * We checked for that above so this is safe now. */ + * We checked for that above so this is safe now. */ MPASS(node->tn_dir.tn_parent == dnode); /* Get the directory entry associated with node (vp). This was * filled by tmpfs_lookup while looking up the entry. */ - de = tmpfs_dir_lookup(dnode, node, v->a_cnp); + de = tmpfs_dir_lookup(dnode, node, cnp); MPASS(TMPFS_DIRENT_MATCHES(de, - v->a_cnp->cn_nameptr, - v->a_cnp->cn_namelen)); - - /* Check flags to see if we are allowed to remove the directory. */ - if ((dnode->tn_flags & APPEND) != 0 || - (node->tn_flags & (NOUNLINK | IMMUTABLE | APPEND)) != 0) { - error = EPERM; - goto out; - } + cnp->cn_nameptr, + cnp->cn_namelen)); /* Detach the directory entry from the directory (dnode). 
*/ tmpfs_dir_detach(dvp, de); - if (v->a_cnp->cn_flags & DOWHITEOUT) - tmpfs_dir_whiteout_add(dvp, v->a_cnp); + + /* + * If we are removing a directory, per the ENOTEMPTY check above it + * must either be empty or contain only whiteout entries. In the + * latter case (which can only happen if IGNOREWHITEOUT was passed + * in cnp->cn_flags), clear the whiteout entries to avoid leaking + * memory. + */ + if (node->tn_size > 0) + tmpfs_dir_clear_whiteouts(vp); + + if (cnp->cn_flags & DOWHITEOUT) + tmpfs_dir_whiteout_add(dvp, cnp); /* No vnode should be allocated for this entry from this point */ TMPFS_NODE_LOCK(node); @@ -1668,6 +1691,10 @@ tmpfs_pathconf(struct vop_pathconf_args *v) *retval = PAGE_SIZE; break; + case _PC_HAS_HIDDENSYSTEM: + *retval = 1; + break; + default: error = vop_stdpathconf(v); } @@ -1684,21 +1711,15 @@ vop_vptofh { }; */ { - struct tmpfs_fid_data tfd; + struct tmpfs_fid_data *const tfd = (struct tmpfs_fid_data *)ap->a_fhp; struct tmpfs_node *node; - struct fid *fhp; + _Static_assert(sizeof(struct tmpfs_fid_data) <= sizeof(struct fid), + "struct tmpfs_fid_data cannot be larger than struct fid"); node = VP_TO_TMPFS_NODE(ap->a_vp); - fhp = ap->a_fhp; - fhp->fid_len = sizeof(tfd); - - /* - * Copy into fid_data from the stack to avoid unaligned pointer use. - * See the comment in sys/mount.h on struct fid for details. - */ - tfd.tfd_id = node->tn_id; - tfd.tfd_gen = node->tn_gen; - memcpy(fhp->fid_data, &tfd, fhp->fid_len); + tfd->tfd_len = sizeof(*tfd); + tfd->tfd_gen = node->tn_gen; + tfd->tfd_id = node->tn_id; return (0); } @@ -2070,31 +2091,10 @@ tmpfs_setextattr(struct vop_setextattr_args *ap) static off_t tmpfs_seek_data_locked(vm_object_t obj, off_t noff) { - vm_page_t m; - vm_pindex_t p, p_m, p_swp; - - p = OFF_TO_IDX(noff); - m = vm_page_find_least(obj, p); - - /* - * Microoptimize the most common case for SEEK_DATA, where - * there is no hole and the page is resident. 
- */ - if (m != NULL && vm_page_any_valid(m) && m->pindex == p) - return (noff); - - p_swp = swap_pager_find_least(obj, p); - if (p_swp == p) - return (noff); - - p_m = m == NULL ? obj->size : m->pindex; - return (IDX_TO_OFF(MIN(p_m, p_swp))); -} + vm_pindex_t p; -static off_t -tmpfs_seek_next(off_t noff) -{ - return (noff + PAGE_SIZE - (noff & PAGE_MASK)); + p = swap_pager_seek_data(obj, OFF_TO_IDX(noff)); + return (p == OFF_TO_IDX(noff) ? noff : IDX_TO_OFF(p)); } static int @@ -2111,30 +2111,8 @@ tmpfs_seek_clamp(struct tmpfs_node *tn, off_t *noff, bool seekdata) static off_t tmpfs_seek_hole_locked(vm_object_t obj, off_t noff) { - vm_page_t m; - vm_pindex_t p, p_swp; - - for (;; noff = tmpfs_seek_next(noff)) { - /* - * Walk over the largest sequential run of the valid pages. - */ - for (m = vm_page_lookup(obj, OFF_TO_IDX(noff)); - m != NULL && vm_page_any_valid(m); - m = vm_page_next(m), noff = tmpfs_seek_next(noff)) - ; - /* - * Found a hole in the object's page queue. Check if - * there is a hole in the swap at the same place. 
- */ - p = OFF_TO_IDX(noff); - p_swp = swap_pager_find_least(obj, p); - if (p_swp != p) { - noff = IDX_TO_OFF(p); - break; - } - } - return (noff); + return (IDX_TO_OFF(swap_pager_seek_hole(obj, OFF_TO_IDX(noff)))); } static int diff --git a/sys/fs/udf/ecma167-udf.h b/sys/fs/udf/ecma167-udf.h index 839bbec08254..19e114763cac 100644 --- a/sys/fs/udf/ecma167-udf.h +++ b/sys/fs/udf/ecma167-udf.h @@ -243,7 +243,7 @@ struct part_map_spare { uint8_t n_st; /* Number of Sparing Tables */ uint8_t reserved1; uint32_t st_size; - uint32_t st_loc[1]; + uint32_t st_loc[]; } __packed; union udf_pmap { @@ -266,7 +266,7 @@ struct udf_sparing_table { uint16_t rt_l; /* Relocation Table len */ uint8_t reserved[2]; uint32_t seq_num; - struct spare_map_entry entries[1]; + struct spare_map_entry entries[]; } __packed; /* Partition Descriptor [3/10.5] */ diff --git a/sys/fs/udf/udf_vfsops.c b/sys/fs/udf/udf_vfsops.c index 866d0172f745..c5ef1f686093 100644 --- a/sys/fs/udf/udf_vfsops.c +++ b/sys/fs/udf/udf_vfsops.c @@ -32,7 +32,7 @@ /* * Ok, here's how it goes. The UDF specs are pretty clear on how each data * structure is made up, but not very clear on how they relate to each other. - * Here is the skinny... This demostrates a filesystem with one file in the + * Here is the skinny... This demonstrates a filesystem with one file in the * root directory. Subdirectories are treated just as normal files, but they * have File Id Descriptors of their children as their file data. 
As for the * Anchor Volume Descriptor Pointer, it can exist in two of the following three @@ -81,6 +81,7 @@ #include <sys/fcntl.h> #include <sys/iconv.h> #include <sys/kernel.h> +#include <sys/limits.h> #include <sys/malloc.h> #include <sys/mount.h> #include <sys/namei.h> @@ -729,7 +730,7 @@ udf_fhtovp(struct mount *mp, struct fid *fhp, int flags, struct vnode **vpp) struct ifid *ifhp; struct vnode *nvp; struct udf_node *np; - off_t fsize; + uint64_t fsize; int error; ifhp = (struct ifid *)fhp; @@ -741,6 +742,10 @@ udf_fhtovp(struct mount *mp, struct fid *fhp, int flags, struct vnode **vpp) np = VTON(nvp); fsize = le64toh(np->fentry->inf_len); + if (fsize > OFF_MAX) { + *vpp = NULLVP; + return (EIO); + } *vpp = nvp; vnode_create_vobject(*vpp, fsize, curthread); diff --git a/sys/fs/udf/udf_vnops.c b/sys/fs/udf/udf_vnops.c index f230ca0c72fa..37889241e8c3 100644 --- a/sys/fs/udf/udf_vnops.c +++ b/sys/fs/udf/udf_vnops.c @@ -39,6 +39,7 @@ #include <sys/conf.h> #include <sys/buf.h> #include <sys/iconv.h> +#include <sys/limits.h> #include <sys/mount.h> #include <sys/vnode.h> #include <sys/dirent.h> @@ -182,11 +183,14 @@ udf_access(struct vop_access_args *a) } static int -udf_open(struct vop_open_args *ap) { +udf_open(struct vop_open_args *ap) +{ struct udf_node *np = VTON(ap->a_vp); - off_t fsize; + uint64_t fsize; fsize = le64toh(np->fentry->inf_len); + if (fsize > OFF_MAX) + return (EIO); vnode_create_vobject(ap->a_vp, fsize, ap->a_td); return 0; } @@ -314,12 +318,13 @@ udf_getattr(struct vop_getattr_args *a) * that directories consume at least one logical block, * make it appear so. 
*/ - if (fentry->logblks_rec != 0) { - vap->va_size = - le64toh(fentry->logblks_rec) * node->udfmp->bsize; - } else { + vap->va_size = le64toh(fentry->logblks_rec); + if (vap->va_size == 0) vap->va_size = node->udfmp->bsize; - } + else if (vap->va_size > UINT64_MAX / node->udfmp->bsize) + vap->va_size = UINT64_MAX; + else + vap->va_size *= node->udfmp->bsize; } else { vap->va_size = le64toh(fentry->inf_len); } @@ -446,6 +451,7 @@ udf_read(struct vop_read_args *ap) struct buf *bp; uint8_t *data; daddr_t lbn, rablock; + uint64_t len; off_t diff, fsize; ssize_t n; int error = 0; @@ -471,7 +477,12 @@ udf_read(struct vop_read_args *ap) return (error); } - fsize = le64toh(node->fentry->inf_len); + len = le64toh(node->fentry->inf_len); + if (len > OFF_MAX) { + /* too big, just cap to the requested length */ + len = uio->uio_resid; + } + fsize = len; udfmp = node->udfmp; do { lbn = lblkno(udfmp, uio->uio_offset); @@ -783,6 +794,7 @@ udf_readdir(struct vop_readdir_args *a) struct udf_uiodir uiodir; struct udf_dirstream *ds; uint64_t *cookies = NULL; + uint64_t len; int ncookies; int error = 0; @@ -800,8 +812,6 @@ udf_readdir(struct vop_readdir_args *a) */ ncookies = uio->uio_resid / 8; cookies = malloc(sizeof(*cookies) * ncookies, M_TEMP, M_WAITOK); - if (cookies == NULL) - return (ENOMEM); uiodir.ncookies = ncookies; uiodir.cookies = cookies; uiodir.acookies = 0; @@ -813,8 +823,12 @@ udf_readdir(struct vop_readdir_args *a) * Iterate through the file id descriptors. Give the parent dir * entry special attention. */ - ds = udf_opendir(node, uio->uio_offset, le64toh(node->fentry->inf_len), - node->udfmp); + len = le64toh(node->fentry->inf_len); + if (len > INT_MAX) { + /* too big, just cap to INT_MAX */ + len = INT_MAX; + } + ds = udf_opendir(node, uio->uio_offset, len, node->udfmp); while ((fid = udf_getfid(ds)) != NULL) { /* XXX Should we return an error on a bad fid? 
*/ @@ -906,7 +920,8 @@ udf_readlink(struct vop_readlink_args *ap) struct udf_node *node; void *buf; char *cp; - int error, len, root; + uint64_t len; + int error, root; /* * A symbolic link in UDF is a list of variable-length path @@ -916,6 +931,8 @@ udf_readlink(struct vop_readlink_args *ap) vp = ap->a_vp; node = VTON(vp); len = le64toh(node->fentry->inf_len); + if (len > MAXPATHLEN) + return (EIO); buf = malloc(len, M_DEVBUF, M_WAITOK); iov[0].iov_len = len; iov[0].iov_base = buf; @@ -1118,13 +1135,14 @@ udf_lookup(struct vop_cachedlookup_args *a) struct udf_mnt *udfmp; struct fileid_desc *fid = NULL; struct udf_dirstream *ds; + uint64_t fsize; u_long nameiop; u_long flags; char *nameptr; long namelen; ino_t id = 0; int offset, error = 0; - int fsize, lkflags, ltype, numdirpasses; + int lkflags, ltype, numdirpasses; dvp = a->a_dvp; node = VTON(dvp); @@ -1135,6 +1153,10 @@ udf_lookup(struct vop_cachedlookup_args *a) nameptr = a->a_cnp->cn_nameptr; namelen = a->a_cnp->cn_namelen; fsize = le64toh(node->fentry->inf_len); + if (fsize > INT_MAX) { + /* too big, just cap to INT_MAX */ + fsize = INT_MAX; + } /* * If this is a LOOKUP and we've already partially searched through @@ -1276,6 +1298,8 @@ udf_vptofh(struct vop_vptofh_args *a) { struct udf_node *node; struct ifid *ifhp; + _Static_assert(sizeof(struct ifid) <= sizeof(struct fid), + "struct ifid cannot be larger than struct fid"); node = VTON(a->a_vp); ifhp = (struct ifid *)a->a_fhp; diff --git a/sys/fs/unionfs/union.h b/sys/fs/unionfs/union.h index 467db3b29ff8..0bd1894a2195 100644 --- a/sys/fs/unionfs/union.h +++ b/sys/fs/unionfs/union.h @@ -97,15 +97,17 @@ struct unionfs_node { char *un_path; /* path */ int un_pathlen; /* strlen of path */ - int un_flag; /* unionfs node flag */ -}; -/* - * unionfs node flags - * It needs the vnode with exclusive lock, when changing the un_flag variable. 
- */ -#define UNIONFS_OPENEXTL 0x01 /* openextattr (lower) */ -#define UNIONFS_OPENEXTU 0x02 /* openextattr (upper) */ + /* + * unionfs node flags + * Changing these flags requires the vnode to be locked exclusive. + */ + #define UNIONFS_OPENEXTL 0x01 /* openextattr (lower) */ + #define UNIONFS_OPENEXTU 0x02 /* openextattr (upper) */ + #define UNIONFS_COPY_IN_PROGRESS 0x04 /* copy/dir shadow in progres */ + #define UNIONFS_LOOKUP_IN_PROGRESS 0x08 + unsigned int un_flag; /* unionfs node flag */ +}; extern struct vop_vector unionfs_vnodeops; @@ -131,34 +133,32 @@ int unionfs_uninit(struct vfsconf *); int unionfs_nodeget(struct mount *, struct vnode *, struct vnode *, struct vnode *, struct vnode **, struct componentname *); void unionfs_noderem(struct vnode *); +struct unionfs_node_status * unionfs_find_node_status(struct unionfs_node *, + struct thread *td); void unionfs_get_node_status(struct unionfs_node *, struct thread *, struct unionfs_node_status **); void unionfs_tryrem_node_status(struct unionfs_node *, struct unionfs_node_status *); int unionfs_check_rmdir(struct vnode *, struct ucred *, struct thread *td); -int unionfs_copyfile(struct unionfs_node *, int, struct ucred *, +int unionfs_copyfile(struct vnode *, int, struct ucred *, struct thread *); void unionfs_create_uppervattr_core(struct unionfs_mount *, struct vattr *, struct vattr *, struct thread *); int unionfs_create_uppervattr(struct unionfs_mount *, struct vnode *, struct vattr *, struct ucred *, struct thread *); -int unionfs_mkshadowdir(struct unionfs_mount *, struct vnode *, - struct unionfs_node *, struct componentname *, struct thread *); +int unionfs_mkshadowdir(struct vnode *, struct vnode *, + struct componentname *, struct thread *); int unionfs_mkwhiteout(struct vnode *, struct vnode *, struct componentname *, struct thread *, char *, int); int unionfs_relookup(struct vnode *, struct vnode **, struct componentname *, struct componentname *, struct thread *, char *, int, u_long); -int 
unionfs_relookup_for_create(struct vnode *, struct componentname *, - struct thread *); -int unionfs_relookup_for_delete(struct vnode *, struct componentname *, - struct thread *); -int unionfs_relookup_for_rename(struct vnode *, struct componentname *, - struct thread *); void unionfs_forward_vop_start_pair(struct vnode *, int *, struct vnode *, int *); bool unionfs_forward_vop_finish_pair(struct vnode *, struct vnode *, int, struct vnode *, struct vnode *, int); +int unionfs_set_in_progress_flag(struct vnode *, unsigned int); +void unionfs_clear_in_progress_flag(struct vnode *, unsigned int); static inline void unionfs_forward_vop_start(struct vnode *basevp, int *lkflags) diff --git a/sys/fs/unionfs/union_subr.c b/sys/fs/unionfs/union_subr.c index bb57f3d56ade..edcc6716b674 100644 --- a/sys/fs/unionfs/union_subr.c +++ b/sys/fs/unionfs/union_subr.c @@ -203,19 +203,19 @@ unionfs_ins_cached_vnode(struct unionfs_node *uncp, struct unionfs_node_hashhead *hd; struct vnode *vp; - ASSERT_VOP_ELOCKED(uncp->un_uppervp, __func__); - ASSERT_VOP_ELOCKED(uncp->un_lowervp, __func__); - KASSERT(uncp->un_uppervp == NULLVP || uncp->un_uppervp->v_type == VDIR, - ("%s: v_type != VDIR", __func__)); - KASSERT(uncp->un_lowervp == NULLVP || uncp->un_lowervp->v_type == VDIR, - ("%s: v_type != VDIR", __func__)); - vp = NULLVP; VI_LOCK(dvp); - if (uncp->un_uppervp != NULL) + if (uncp->un_uppervp != NULLVP) { + ASSERT_VOP_ELOCKED(uncp->un_uppervp, __func__); + KASSERT(uncp->un_uppervp->v_type == VDIR, + ("%s: v_type != VDIR", __func__)); vp = unionfs_get_cached_vnode_locked(uncp->un_uppervp, dvp); - else if (uncp->un_lowervp != NULL) + } else if (uncp->un_lowervp != NULLVP) { + ASSERT_VOP_ELOCKED(uncp->un_lowervp, __func__); + KASSERT(uncp->un_lowervp->v_type == VDIR, + ("%s: v_type != VDIR", __func__)); vp = unionfs_get_cached_vnode_locked(uncp->un_lowervp, dvp); + } if (vp == NULLVP) { hd = unionfs_get_hashhead(dvp, (uncp->un_uppervp != NULLVP ? 
uncp->un_uppervp : uncp->un_lowervp)); @@ -276,9 +276,11 @@ unionfs_nodeget_cleanup(struct vnode *vp, struct unionfs_node *unp) if (unp->un_dvp != NULLVP) vrele(unp->un_dvp); - if (unp->un_uppervp != NULLVP) + if (unp->un_uppervp != NULLVP) { vput(unp->un_uppervp); - if (unp->un_lowervp != NULLVP) + if (unp->un_lowervp != NULLVP) + vrele(unp->un_lowervp); + } else if (unp->un_lowervp != NULLVP) vput(unp->un_lowervp); if (unp->un_hashtbl != NULL) hashdestroy(unp->un_hashtbl, M_UNIONFSHASH, UNIONFSHASHMASK); @@ -314,7 +316,7 @@ unionfs_nodeget(struct mount *mp, struct vnode *uppervp, *vpp = NULLVP; if (uppervp == NULLVP && lowervp == NULLVP) - panic("%s: upper and lower is null", __func__); + panic("%s: upper and lower are both null", __func__); vt = (uppervp != NULLVP ? uppervp->v_type : lowervp->v_type); @@ -327,7 +329,9 @@ unionfs_nodeget(struct mount *mp, struct vnode *uppervp, vp = unionfs_get_cached_vnode(uppervp, lowervp, dvp); if (vp != NULLVP) { *vpp = vp; - goto unionfs_nodeget_out; + if (lkflags != 0) + vn_lock(*vpp, lkflags | LK_RETRY); + return (0); } } @@ -385,27 +389,47 @@ unionfs_nodeget(struct mount *mp, struct vnode *uppervp, KASSERT(dvp != NULL || (vp->v_vflag & VV_ROOT) != 0, ("%s: NULL dvp for non-root vp %p", __func__, vp)); - vn_lock_pair(lowervp, false, LK_EXCLUSIVE, uppervp, false, - LK_EXCLUSIVE); + + /* + * NOTE: There is still a possibility for cross-filesystem locking here. + * If dvp has an upper FS component and is locked, while the new vnode + * created here only has a lower-layer FS component, then we will end + * up taking a lower-FS lock while holding an upper-FS lock. + * That situation could be dealt with here using vn_lock_pair(). + * However, that would only address one instance out of many in which + * a child vnode lock is taken while holding a lock on its parent + * directory. 
This is done in many places in common VFS code, as well as + * a few places within unionfs (which could lead to the same cross-FS + * locking issue if, for example, the upper FS is another nested unionfs + * instance). Additionally, it is unclear under what circumstances this + * specific lock sequence (a directory on one FS followed by a child of + * its 'peer' directory on another FS) would present the practical + * possibility of deadlock due to some other agent on the system + * attempting to lock those two specific vnodes in the opposite order. + */ + if (uppervp != NULLVP) + vn_lock(uppervp, LK_EXCLUSIVE | LK_RETRY); + else + vn_lock(lowervp, LK_EXCLUSIVE | LK_RETRY); error = insmntque1(vp, mp); if (error != 0) { unionfs_nodeget_cleanup(vp, unp); return (error); } - if (lowervp != NULL && VN_IS_DOOMED(lowervp)) { - vput(lowervp); - unp->un_lowervp = lowervp = NULL; - } - if (uppervp != NULL && VN_IS_DOOMED(uppervp)) { - vput(uppervp); - unp->un_uppervp = uppervp = NULL; - if (lowervp != NULLVP) - vp->v_vnlock = lowervp->v_vnlock; - } - if (lowervp == NULL && uppervp == NULL) { - unionfs_nodeget_cleanup(vp, unp); - return (ENOENT); - } + /* + * lowervp and uppervp should only be doomed by a forced unmount of + * their respective filesystems, but that can only happen if the + * unionfs instance is first unmounted. We also effectively hold the + * lock on the new unionfs vnode at this point. Therefore, if a + * unionfs umount has not yet reached the point at which the above + * insmntque1() would fail, then its vflush() call will end up + * blocked on our vnode lock, effectively also preventing unmount + * of the underlying filesystems. 
+ */ + VNASSERT(lowervp == NULLVP || !VN_IS_DOOMED(lowervp), vp, + ("%s: doomed lowervp %p", __func__, lowervp)); + VNASSERT(uppervp == NULLVP || !VN_IS_DOOMED(uppervp), vp, + ("%s: doomed lowervp %p", __func__, uppervp)); vn_set_state(vp, VSTATE_CONSTRUCTED); @@ -413,18 +437,16 @@ unionfs_nodeget(struct mount *mp, struct vnode *uppervp, *vpp = unionfs_ins_cached_vnode(unp, dvp); if (*vpp != NULLVP) { unionfs_nodeget_cleanup(vp, unp); - vp = *vpp; - } else { - if (uppervp != NULL) - VOP_UNLOCK(uppervp); - if (lowervp != NULL) - VOP_UNLOCK(lowervp); + if (lkflags != 0) + vn_lock(*vpp, lkflags | LK_RETRY); + return (0); + } else *vpp = vp; - } -unionfs_nodeget_out: - if (lkflags & LK_TYPE_MASK) - vn_lock(vp, lkflags | LK_RETRY); + if ((lkflags & LK_SHARED) != 0) + vn_lock(vp, LK_DOWNGRADE); + else if ((lkflags & LK_EXCLUSIVE) == 0) + VOP_UNLOCK(vp); return (0); } @@ -443,6 +465,7 @@ unionfs_noderem(struct vnode *vp) struct vnode *dvp; int count; int writerefs; + bool unlock_lvp; /* * The root vnode lock may be recursed during unmount, because @@ -455,18 +478,36 @@ unionfs_noderem(struct vnode *vp) */ KASSERT(vp->v_vnlock->lk_recurse == 0 || (vp->v_vflag & VV_ROOT) != 0, ("%s: vnode %p locked recursively", __func__, vp)); + + unp = VTOUNIONFS(vp); + VNASSERT(unp != NULL, vp, ("%s: already reclaimed", __func__)); + lvp = unp->un_lowervp; + uvp = unp->un_uppervp; + dvp = unp->un_dvp; + unlock_lvp = (uvp == NULLVP); + + /* + * Lock the lower vnode in addition to the upper vnode lock in order + * to synchronize against any unionfs_lock() operation which may still + * hold the lower vnode lock. We do not need to do this for the root + * vnode, as the root vnode should always have both upper and lower + * base vnodes for its entire lifecycled, so unionfs_lock() should + * never attempt to lock its lower vnode in the first place. + * Moreover, during unmount of a non-"below" unionfs mount, the lower + * root vnode will already be locked as it is the covered vnode. 
+ */ + if (uvp != NULLVP && lvp != NULLVP && (vp->v_vflag & VV_ROOT) == 0) { + vn_lock_pair(uvp, true, LK_EXCLUSIVE, lvp, false, LK_EXCLUSIVE); + unlock_lvp = true; + } + if (lockmgr(&vp->v_lock, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) panic("%s: failed to acquire lock for vnode lock", __func__); - /* * Use the interlock to protect the clearing of v_data to * prevent faults in unionfs_lock(). */ VI_LOCK(vp); - unp = VTOUNIONFS(vp); - lvp = unp->un_lowervp; - uvp = unp->un_uppervp; - dvp = unp->un_dvp; unp->un_lowervp = unp->un_uppervp = NULLVP; vp->v_vnlock = &(vp->v_lock); vp->v_data = NULL; @@ -502,18 +543,16 @@ unionfs_noderem(struct vnode *vp) ("%s: write reference without upper vnode", __func__)); VOP_ADD_WRITECOUNT(uvp, -writerefs); } - if (lvp != NULLVP) - VOP_UNLOCK(lvp); if (uvp != NULLVP) - VOP_UNLOCK(uvp); + vput(uvp); + if (unlock_lvp) + vput(lvp); + else if (lvp != NULLVP) + vrele(lvp); if (dvp != NULLVP) unionfs_rem_cached_vnode(unp, dvp); - if (lvp != NULLVP) - vrele(lvp); - if (uvp != NULLVP) - vrele(uvp); if (unp->un_path != NULL) { free(unp->un_path, M_UNIONFSPATH); unp->un_path = NULL; @@ -539,35 +578,52 @@ unionfs_noderem(struct vnode *vp) } /* - * Get the unionfs node status object for the vnode corresponding to unp, - * for the process that owns td. Allocate a new status object if one - * does not already exist. + * Find the unionfs node status object for the vnode corresponding to unp, + * for the process that owns td. Return NULL if no such object exists. 
*/ -void -unionfs_get_node_status(struct unionfs_node *unp, struct thread *td, - struct unionfs_node_status **unspp) +struct unionfs_node_status * +unionfs_find_node_status(struct unionfs_node *unp, struct thread *td) { struct unionfs_node_status *unsp; pid_t pid; pid = td->td_proc->p_pid; - KASSERT(NULL != unspp, ("%s: NULL status", __func__)); ASSERT_VOP_ELOCKED(UNIONFSTOV(unp), __func__); LIST_FOREACH(unsp, &(unp->un_unshead), uns_list) { if (unsp->uns_pid == pid) { - *unspp = unsp; - return; + return (unsp); } } - /* create a new unionfs node status */ - unsp = malloc(sizeof(struct unionfs_node_status), - M_TEMP, M_WAITOK | M_ZERO); + return (NULL); +} + +/* + * Get the unionfs node status object for the vnode corresponding to unp, + * for the process that owns td. Allocate a new status object if one + * does not already exist. + */ +void +unionfs_get_node_status(struct unionfs_node *unp, struct thread *td, + struct unionfs_node_status **unspp) +{ + struct unionfs_node_status *unsp; + pid_t pid; + + pid = td->td_proc->p_pid; - unsp->uns_pid = pid; - LIST_INSERT_HEAD(&(unp->un_unshead), unsp, uns_list); + KASSERT(NULL != unspp, ("%s: NULL status", __func__)); + unsp = unionfs_find_node_status(unp, td); + if (unsp == NULL) { + /* create a new unionfs node status */ + unsp = malloc(sizeof(struct unionfs_node_status), + M_TEMP, M_WAITOK | M_ZERO); + + unsp->uns_pid = pid; + LIST_INSERT_HEAD(&(unp->un_unshead), unsp, uns_list); + } *unspp = unsp; } @@ -697,110 +753,6 @@ unionfs_relookup(struct vnode *dvp, struct vnode **vpp, } /* - * relookup for CREATE namei operation. - * - * dvp is unionfs vnode. dvp should be locked. - * - * If it called 'unionfs_copyfile' function by unionfs_link etc, - * VOP_LOOKUP information is broken. - * So it need relookup in order to create link etc. 
- */ -int -unionfs_relookup_for_create(struct vnode *dvp, struct componentname *cnp, - struct thread *td) -{ - struct vnode *udvp; - struct vnode *vp; - struct componentname cn; - int error; - - udvp = UNIONFSVPTOUPPERVP(dvp); - vp = NULLVP; - - error = unionfs_relookup(udvp, &vp, cnp, &cn, td, cnp->cn_nameptr, - cnp->cn_namelen, CREATE); - if (error) - return (error); - - if (vp != NULLVP) { - if (udvp == vp) - vrele(vp); - else - vput(vp); - - error = EEXIST; - } - - return (error); -} - -/* - * relookup for DELETE namei operation. - * - * dvp is unionfs vnode. dvp should be locked. - */ -int -unionfs_relookup_for_delete(struct vnode *dvp, struct componentname *cnp, - struct thread *td) -{ - struct vnode *udvp; - struct vnode *vp; - struct componentname cn; - int error; - - udvp = UNIONFSVPTOUPPERVP(dvp); - vp = NULLVP; - - error = unionfs_relookup(udvp, &vp, cnp, &cn, td, cnp->cn_nameptr, - cnp->cn_namelen, DELETE); - if (error) - return (error); - - if (vp == NULLVP) - error = ENOENT; - else { - if (udvp == vp) - vrele(vp); - else - vput(vp); - } - - return (error); -} - -/* - * relookup for RENAME namei operation. - * - * dvp is unionfs vnode. dvp should be locked. - */ -int -unionfs_relookup_for_rename(struct vnode *dvp, struct componentname *cnp, - struct thread *td) -{ - struct vnode *udvp; - struct vnode *vp; - struct componentname cn; - int error; - - udvp = UNIONFSVPTOUPPERVP(dvp); - vp = NULLVP; - - error = unionfs_relookup(udvp, &vp, cnp, &cn, td, cnp->cn_nameptr, - cnp->cn_namelen, RENAME); - if (error) - return (error); - - if (vp != NULLVP) { - if (udvp == vp) - vrele(vp); - else - vput(vp); - } - - return (error); -} - -/* * Update the unionfs_node. * * uvp is new locked upper vnode. 
unionfs vnode's lock will be exchanged to the @@ -836,6 +788,8 @@ unionfs_node_update(struct unionfs_node *unp, struct vnode *uvp, vp->v_vnlock = uvp->v_vnlock; VI_UNLOCK(vp); + for (count = 0; count < lockrec + 1; count++) + VOP_UNLOCK(lvp); /* * Re-cache the unionfs vnode against the upper vnode */ @@ -851,18 +805,87 @@ unionfs_node_update(struct unionfs_node *unp, struct vnode *uvp, } /* + * Mark a unionfs operation as being in progress, sleeping if the + * same operation is already in progress. + * This is useful, for example, during copy-up operations in which + * we may drop the target vnode lock, but we want to avoid the + * possibility of a concurrent copy-up on the same vnode triggering + * a spurious failure. + */ +int +unionfs_set_in_progress_flag(struct vnode *vp, unsigned int flag) +{ + struct unionfs_node *unp; + int error; + + error = 0; + ASSERT_VOP_ELOCKED(vp, __func__); + VI_LOCK(vp); + unp = VTOUNIONFS(vp); + while (error == 0 && (unp->un_flag & flag) != 0) { + VOP_UNLOCK(vp); + error = msleep(vp, VI_MTX(vp), PCATCH | PDROP, "unioncp", 0); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + VI_LOCK(vp); + if (error == 0) { + /* + * If we waited on a concurrent copy-up and that + * copy-up was successful, return a non-fatal + * indication that the desired operation is already + * complete. If we waited on a concurrent lookup, + * return ERELOOKUP to indicate the VFS cache should + * be re-queried to avoid creating a duplicate unionfs + * vnode. 
+ */ + unp = VTOUNIONFS(vp); + if (unp == NULL) + error = ENOENT; + else if (flag == UNIONFS_COPY_IN_PROGRESS && + unp->un_uppervp != NULLVP) + error = EJUSTRETURN; + else if (flag == UNIONFS_LOOKUP_IN_PROGRESS) + error = ERELOOKUP; + } + } + if (error == 0) + unp->un_flag |= flag; + VI_UNLOCK(vp); + + return (error); +} + +void +unionfs_clear_in_progress_flag(struct vnode *vp, unsigned int flag) +{ + struct unionfs_node *unp; + + ASSERT_VOP_ELOCKED(vp, __func__); + unp = VTOUNIONFS(vp); + VI_LOCK(vp); + if (unp != NULL) { + VNASSERT((unp->un_flag & flag) != 0, vp, + ("%s: copy not in progress", __func__)); + unp->un_flag &= ~flag; + } + wakeup(vp); + VI_UNLOCK(vp); +} + +/* * Create a new shadow dir. * - * udvp should be locked on entry and will be locked on return. + * dvp and vp are unionfs vnodes representing a parent directory and + * child file, should be locked on entry, and will be locked on return. * * If no error returned, unp will be updated. */ int -unionfs_mkshadowdir(struct unionfs_mount *ump, struct vnode *udvp, - struct unionfs_node *unp, struct componentname *cnp, struct thread *td) +unionfs_mkshadowdir(struct vnode *dvp, struct vnode *vp, + struct componentname *cnp, struct thread *td) { struct vnode *lvp; struct vnode *uvp; + struct vnode *udvp; struct vattr va; struct vattr lva; struct nameidata nd; @@ -870,10 +893,25 @@ unionfs_mkshadowdir(struct unionfs_mount *ump, struct vnode *udvp, struct ucred *cred; struct ucred *credbk; struct uidinfo *rootinfo; + struct unionfs_mount *ump; + struct unionfs_node *dunp; + struct unionfs_node *unp; int error; + ASSERT_VOP_ELOCKED(dvp, __func__); + ASSERT_VOP_ELOCKED(vp, __func__); + ump = MOUNTTOUNIONFSMOUNT(vp->v_mount); + unp = VTOUNIONFS(vp); if (unp->un_uppervp != NULLVP) return (EEXIST); + dunp = VTOUNIONFS(dvp); + udvp = dunp->un_uppervp; + + error = unionfs_set_in_progress_flag(vp, UNIONFS_COPY_IN_PROGRESS); + if (error == EJUSTRETURN) + return (0); + else if (error != 0) + return (error); lvp = 
unp->un_lowervp; uvp = NULLVP; @@ -882,11 +920,6 @@ unionfs_mkshadowdir(struct unionfs_mount *ump, struct vnode *udvp, /* Authority change to root */ rootinfo = uifind((uid_t)0); cred = crdup(cnp->cn_cred); - /* - * The calls to chgproccnt() are needed to compensate for change_ruid() - * calling chgproccnt(). - */ - chgproccnt(cred->cr_ruidinfo, 1, 0); change_euid(cred, rootinfo); change_ruid(cred, rootinfo); change_svuid(cred, (uid_t)0); @@ -897,11 +930,29 @@ unionfs_mkshadowdir(struct unionfs_mount *ump, struct vnode *udvp, NDPREINIT(&nd); if ((error = VOP_GETATTR(lvp, &lva, cnp->cn_cred))) - goto unionfs_mkshadowdir_abort; + goto unionfs_mkshadowdir_finish; + vref(udvp); + VOP_UNLOCK(vp); if ((error = unionfs_relookup(udvp, &uvp, cnp, &nd.ni_cnd, td, - cnp->cn_nameptr, cnp->cn_namelen, CREATE))) - goto unionfs_mkshadowdir_abort; + cnp->cn_nameptr, cnp->cn_namelen, CREATE))) { + /* + * When handling error cases here, we drop udvp's lock and + * then jump to exit code that relocks dvp, which in most + * cases will effectively relock udvp. However, this is + * not guaranteed to be the case, as various calls made + * here (such as unionfs_relookup() above and VOP_MKDIR() + * below) may unlock and then relock udvp, allowing dvp to + * be reclaimed in the meantime. In such a situation dvp + * will no longer share its lock with udvp. Since + * performance isn't a concern for these error cases, it + * makes more sense to reuse the common code that locks + * dvp on exit than to explicitly check for reclamation + * of dvp. 
+ */ + vput(udvp); + goto unionfs_mkshadowdir_relock; + } if (uvp != NULLVP) { if (udvp == uvp) vrele(uvp); @@ -909,11 +960,14 @@ unionfs_mkshadowdir(struct unionfs_mount *ump, struct vnode *udvp, vput(uvp); error = EEXIST; - goto unionfs_mkshadowdir_abort; + vput(udvp); + goto unionfs_mkshadowdir_relock; } - if ((error = vn_start_write(udvp, &mp, V_WAIT | V_PCATCH))) - goto unionfs_mkshadowdir_abort; + if ((error = vn_start_write(udvp, &mp, V_WAIT | V_PCATCH))) { + vput(udvp); + goto unionfs_mkshadowdir_relock; + } unionfs_create_uppervattr_core(ump, &lva, &va, td); /* @@ -924,7 +978,7 @@ unionfs_mkshadowdir(struct unionfs_mount *ump, struct vnode *udvp, * component. This *should* be fine, as cn_namelen will still * correctly indicate the length of only the current component, * but ZFS in particular does not respect cn_namelen in its VOP_MKDIR - * implementation + * implementation. * Note that this assumes nd.ni_cnd.cn_pnbuf was allocated by * something like a local namei() operation and the temporary * NUL-termination will not have an effect on other threads. @@ -934,29 +988,59 @@ unionfs_mkshadowdir(struct unionfs_mount *ump, struct vnode *udvp, *pathend = '\0'; error = VOP_MKDIR(udvp, &uvp, &nd.ni_cnd, &va); *pathend = pathterm; - - if (!error) { - /* - * XXX The bug which cannot set uid/gid was corrected. - * Ignore errors. - */ - va.va_type = VNON; - VOP_SETATTR(uvp, &va, nd.ni_cnd.cn_cred); - + if (error != 0) { /* - * VOP_SETATTR() may transiently drop uvp's lock, so it's - * important to call it before unionfs_node_update() transfers - * the unionfs vnode's lock from lvp to uvp; otherwise the - * unionfs vnode itself would be transiently unlocked and - * potentially doomed. + * See the comment after unionfs_relookup() above for an + * explanation of why we unlock udvp here only to relock + * dvp on exit. 
*/ - unionfs_node_update(unp, uvp, td); + vput(udvp); + vn_finished_write(mp); + goto unionfs_mkshadowdir_relock; } + + /* + * XXX The bug which cannot set uid/gid was corrected. + * Ignore errors. + */ + va.va_type = VNON; + /* + * VOP_SETATTR() may transiently drop uvp's lock, so it's + * important to call it before unionfs_node_update() transfers + * the unionfs vnode's lock from lvp to uvp; otherwise the + * unionfs vnode itself would be transiently unlocked and + * potentially doomed. + */ + VOP_SETATTR(uvp, &va, nd.ni_cnd.cn_cred); + + /* + * uvp may become doomed during VOP_VPUT_PAIR() if the implementation + * must temporarily drop uvp's lock. However, since we hold a + * reference to uvp from the VOP_MKDIR() call above, this would require + * a forcible unmount of uvp's filesystem, which in turn can only + * happen if our unionfs instance is first forcibly unmounted. We'll + * therefore catch this case in the NULL check of unp below. + */ + VOP_VPUT_PAIR(udvp, &uvp, false); vn_finished_write(mp); + vn_lock_pair(vp, false, LK_EXCLUSIVE, uvp, true, LK_EXCLUSIVE); + unp = VTOUNIONFS(vp); + if (unp == NULL) { + vput(uvp); + error = ENOENT; + } else + unionfs_node_update(unp, uvp, td); + VOP_UNLOCK(vp); + +unionfs_mkshadowdir_relock: + vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + if (error == 0 && (VN_IS_DOOMED(dvp) || VN_IS_DOOMED(vp))) + error = ENOENT; -unionfs_mkshadowdir_abort: +unionfs_mkshadowdir_finish: + unionfs_clear_in_progress_flag(vp, UNIONFS_COPY_IN_PROGRESS); cnp->cn_cred = credbk; - chgproccnt(cred->cr_ruidinfo, -1, 0); crfree(cred); return (error); @@ -1116,23 +1200,31 @@ unionfs_forward_vop_finish_pair( /* * Create a new whiteout. * - * udvp and dvp should be locked on entry and will be locked on return. + * dvp and vp are unionfs vnodes representing a parent directory and + * child file, should be locked on entry, and will be locked on return. 
*/ int -unionfs_mkwhiteout(struct vnode *dvp, struct vnode *udvp, +unionfs_mkwhiteout(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, struct thread *td, char *path, int pathlen) { + struct vnode *udvp; struct vnode *wvp; struct nameidata nd; struct mount *mp; int error; - int lkflags; + bool dvp_locked; + + ASSERT_VOP_ELOCKED(dvp, __func__); + ASSERT_VOP_ELOCKED(vp, __func__); + udvp = VTOUNIONFS(dvp)->un_uppervp; wvp = NULLVP; NDPREINIT(&nd); + vref(udvp); + VOP_UNLOCK(vp); if ((error = unionfs_relookup(udvp, &wvp, cnp, &nd.ni_cnd, td, path, pathlen, CREATE))) { - return (error); + goto unionfs_mkwhiteout_cleanup; } if (wvp != NULLVP) { if (udvp == wvp) @@ -1140,18 +1232,27 @@ unionfs_mkwhiteout(struct vnode *dvp, struct vnode *udvp, else vput(wvp); - return (EEXIST); + if (nd.ni_cnd.cn_flags & ISWHITEOUT) + error = 0; + else + error = EEXIST; + goto unionfs_mkwhiteout_cleanup; } if ((error = vn_start_write(udvp, &mp, V_WAIT | V_PCATCH))) - goto unionfs_mkwhiteout_free_out; - unionfs_forward_vop_start(udvp, &lkflags); + goto unionfs_mkwhiteout_cleanup; error = VOP_WHITEOUT(udvp, &nd.ni_cnd, CREATE); - unionfs_forward_vop_finish(dvp, udvp, lkflags); - vn_finished_write(mp); -unionfs_mkwhiteout_free_out: +unionfs_mkwhiteout_cleanup: + if (VTOUNIONFS(dvp) == NULL) { + vput(udvp); + dvp_locked = false; + } else { + vrele(udvp); + dvp_locked = true; + } + vn_lock_pair(dvp, dvp_locked, LK_EXCLUSIVE, vp, false, LK_EXCLUSIVE); return (error); } @@ -1165,10 +1266,11 @@ unionfs_mkwhiteout_free_out: */ static int unionfs_vn_create_on_upper(struct vnode **vpp, struct vnode *udvp, - struct unionfs_node *unp, struct vattr *uvap, struct thread *td) + struct vnode *vp, struct vattr *uvap, struct thread *td) { struct unionfs_mount *ump; - struct vnode *vp; + struct unionfs_node *unp; + struct vnode *uvp; struct vnode *lvp; struct ucred *cred; struct vattr lva; @@ -1176,8 +1278,10 @@ unionfs_vn_create_on_upper(struct vnode **vpp, struct vnode *udvp, int fmode; int 
error; + ASSERT_VOP_ELOCKED(vp, __func__); + unp = VTOUNIONFS(vp); ump = MOUNTTOUNIONFSMOUNT(UNIONFSTOV(unp)->v_mount); - vp = NULLVP; + uvp = NULLVP; lvp = unp->un_lowervp; cred = td->td_ucred; fmode = FFLAGS(O_WRONLY | O_CREAT | O_TRUNC | O_EXCL); @@ -1200,42 +1304,39 @@ unionfs_vn_create_on_upper(struct vnode **vpp, struct vnode *udvp, NDPREINIT(&nd); vref(udvp); - if ((error = vfs_relookup(udvp, &vp, &nd.ni_cnd, false)) != 0) - goto unionfs_vn_create_on_upper_free_out2; - vrele(udvp); + VOP_UNLOCK(vp); + if ((error = vfs_relookup(udvp, &uvp, &nd.ni_cnd, false)) != 0) { + vrele(udvp); + return (error); + } - if (vp != NULLVP) { - if (vp == udvp) - vrele(vp); + if (uvp != NULLVP) { + if (uvp == udvp) + vrele(uvp); else - vput(vp); + vput(uvp); error = EEXIST; - goto unionfs_vn_create_on_upper_free_out1; + goto unionfs_vn_create_on_upper_cleanup; } - if ((error = VOP_CREATE(udvp, &vp, &nd.ni_cnd, uvap)) != 0) - goto unionfs_vn_create_on_upper_free_out1; + if ((error = VOP_CREATE(udvp, &uvp, &nd.ni_cnd, uvap)) != 0) + goto unionfs_vn_create_on_upper_cleanup; - if ((error = VOP_OPEN(vp, fmode, cred, td, NULL)) != 0) { - vput(vp); - goto unionfs_vn_create_on_upper_free_out1; + if ((error = VOP_OPEN(uvp, fmode, cred, td, NULL)) != 0) { + vput(uvp); + goto unionfs_vn_create_on_upper_cleanup; } - error = VOP_ADD_WRITECOUNT(vp, 1); + error = VOP_ADD_WRITECOUNT(uvp, 1); CTR3(KTR_VFS, "%s: vp %p v_writecount increased to %d", - __func__, vp, vp->v_writecount); + __func__, uvp, uvp->v_writecount); if (error == 0) { - *vpp = vp; + *vpp = uvp; } else { - VOP_CLOSE(vp, fmode, cred, td); + VOP_CLOSE(uvp, fmode, cred, td); } -unionfs_vn_create_on_upper_free_out1: - VOP_UNLOCK(udvp); - -unionfs_vn_create_on_upper_free_out2: - KASSERT(nd.ni_cnd.cn_pnbuf == unp->un_path, - ("%s: cn_pnbuf changed", __func__)); - +unionfs_vn_create_on_upper_cleanup: + vput(udvp); return (error); } @@ -1310,13 +1411,18 @@ unionfs_copyfile_core(struct vnode *lvp, struct vnode *uvp, * * If you need copy 
of the contents, set 1 to docopy. Otherwise, set 0 to * docopy. + * + * vp is a unionfs vnode that should be locked on entry and will be + * locked on return. * * If no error returned, unp will be updated. */ int -unionfs_copyfile(struct unionfs_node *unp, int docopy, struct ucred *cred, +unionfs_copyfile(struct vnode *vp, int docopy, struct ucred *cred, struct thread *td) { + struct unionfs_node *unp; + struct unionfs_node *dunp; struct mount *mp; struct vnode *udvp; struct vnode *lvp; @@ -1324,6 +1430,8 @@ unionfs_copyfile(struct unionfs_node *unp, int docopy, struct ucred *cred, struct vattr uva; int error; + ASSERT_VOP_ELOCKED(vp, __func__); + unp = VTOUNIONFS(vp); lvp = unp->un_lowervp; uvp = NULLVP; @@ -1333,22 +1441,51 @@ unionfs_copyfile(struct unionfs_node *unp, int docopy, struct ucred *cred, return (EINVAL); if (unp->un_uppervp != NULLVP) return (EEXIST); - udvp = VTOUNIONFS(unp->un_dvp)->un_uppervp; + + udvp = NULLVP; + VI_LOCK(unp->un_dvp); + dunp = VTOUNIONFS(unp->un_dvp); + if (dunp != NULL) + udvp = dunp->un_uppervp; + VI_UNLOCK(unp->un_dvp); + if (udvp == NULLVP) return (EROFS); if ((udvp->v_mount->mnt_flag & MNT_RDONLY)) return (EROFS); + ASSERT_VOP_UNLOCKED(udvp, __func__); + + error = unionfs_set_in_progress_flag(vp, UNIONFS_COPY_IN_PROGRESS); + if (error == EJUSTRETURN) + return (0); + else if (error != 0) + return (error); error = VOP_ACCESS(lvp, VREAD, cred, td); if (error != 0) - return (error); + goto unionfs_copyfile_cleanup; if ((error = vn_start_write(udvp, &mp, V_WAIT | V_PCATCH)) != 0) - return (error); - error = unionfs_vn_create_on_upper(&uvp, udvp, unp, &uva, td); + goto unionfs_copyfile_cleanup; + error = unionfs_vn_create_on_upper(&uvp, udvp, vp, &uva, td); if (error != 0) { vn_finished_write(mp); - return (error); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + goto unionfs_copyfile_cleanup; + } + + /* + * Note that it's still possible for e.g. VOP_WRITE to relock + * uvp below while holding vp[=lvp] locked. 
Replacing + * unionfs_copyfile_core with vn_generic_copy_file_range() will + * allow us to avoid the problem by moving this vn_lock_pair() + * call much later. + */ + vn_lock_pair(vp, false, LK_EXCLUSIVE, uvp, true, LK_EXCLUSIVE); + unp = VTOUNIONFS(vp); + if (unp == NULL) { + error = ENOENT; + goto unionfs_copyfile_cleanup; } if (docopy != 0) { @@ -1369,18 +1506,30 @@ unionfs_copyfile(struct unionfs_node *unp, int docopy, struct ucred *cred, /* Reset the attributes. Ignore errors. */ uva.va_type = VNON; VOP_SETATTR(uvp, &uva, cred); + unionfs_node_update(unp, uvp, td); } - unionfs_node_update(unp, uvp, td); - +unionfs_copyfile_cleanup: + unionfs_clear_in_progress_flag(vp, UNIONFS_COPY_IN_PROGRESS); return (error); } /* - * It checks whether vp can rmdir. (check empty) + * Determine if the unionfs view of a directory is empty such that + * an rmdir operation can be permitted. + * + * We assume the VOP_RMDIR() against the upper layer vnode will take + * care of this check for us where the upper FS is concerned, so here + * we concentrate on the lower FS. We need to check for the presence + * of files other than "." and ".." in the lower FS directory and + * then cross-check any files we find against the upper FS to see if + * a whiteout is present (in which case we treat the lower file as + * non-present). + * + * The logic here is based heavily on vn_dir_check_empty(). * - * vp is unionfs vnode. - * vp should be locked. + * vp should be a locked unionfs node, and vp's lowervp should also be + * locked. 
*/ int unionfs_check_rmdir(struct vnode *vp, struct ucred *cred, struct thread *td) @@ -1388,115 +1537,127 @@ unionfs_check_rmdir(struct vnode *vp, struct ucred *cred, struct thread *td) struct vnode *uvp; struct vnode *lvp; struct vnode *tvp; + char *dirbuf; + size_t dirbuflen, len; + off_t off; struct dirent *dp; - struct dirent *edp; struct componentname cn; - struct iovec iov; - struct uio uio; struct vattr va; int error; int eofflag; - int lookuperr; - - /* - * The size of buf needs to be larger than DIRBLKSIZ. - */ - char buf[256 * 6]; - - ASSERT_VOP_ELOCKED(vp, __func__); eofflag = 0; - uvp = UNIONFSVPTOUPPERVP(vp); lvp = UNIONFSVPTOLOWERVP(vp); + uvp = UNIONFSVPTOUPPERVP(vp); + + /* + * Note that the locking here still isn't ideal: We expect the caller + * to hold both the upper and lower layer locks as well as the upper + * parent directory lock, which it can do in a manner that avoids + * deadlock. However, if the cross-check logic below needs to call + * VOP_LOOKUP(), that may relock the upper vnode and lock any found + * child vnode in a way that doesn't protect against deadlock given + * the other held locks. Beyond that, the various other VOPs we issue + * below, such as VOP_OPEN() and VOP_READDIR(), may also re-lock the + * lower vnode. + * We might instead just handoff between the upper vnode lock + * (and its parent directory lock) and the lower vnode lock as needed, + * so that the lower lock is never held at the same time as the upper + * locks, but that opens up a wider window in which the upper + * directory (and also the lower directory if it isn't truly + * read-only) may change while the relevant lock is dropped. But + * since re-locking may happen here and open up such a window anyway, + * perhaps that is a worthwile tradeoff? 
Or perhaps we can ultimately + * do sufficient tracking of empty state within the unionfs vnode + * (in conjunction with upcalls from the lower FSes to notify us + * of out-of-band state changes) that we can avoid these costly checks + * altogether. + */ + ASSERT_VOP_LOCKED(lvp, __func__); + ASSERT_VOP_ELOCKED(uvp, __func__); - /* check opaque */ if ((error = VOP_GETATTR(uvp, &va, cred)) != 0) return (error); if (va.va_flags & OPAQUE) return (0); - /* open vnode */ #ifdef MAC - if ((error = mac_vnode_check_open(cred, vp, VEXEC|VREAD)) != 0) + if ((error = mac_vnode_check_open(cred, lvp, VEXEC | VREAD)) != 0) return (error); #endif - if ((error = VOP_ACCESS(vp, VEXEC|VREAD, cred, td)) != 0) + if ((error = VOP_ACCESS(lvp, VEXEC | VREAD, cred, td)) != 0) + return (error); + if ((error = VOP_OPEN(lvp, FREAD, cred, td, NULL)) != 0) return (error); - if ((error = VOP_OPEN(vp, FREAD, cred, td, NULL)) != 0) + if ((error = VOP_GETATTR(lvp, &va, cred)) != 0) return (error); - uio.uio_rw = UIO_READ; - uio.uio_segflg = UIO_SYSSPACE; - uio.uio_td = td; - uio.uio_offset = 0; + dirbuflen = max(DEV_BSIZE, GENERIC_MAXDIRSIZ); + if (dirbuflen < va.va_blocksize) + dirbuflen = va.va_blocksize; + dirbuf = malloc(dirbuflen, M_TEMP, M_WAITOK); -#ifdef MAC - error = mac_vnode_check_readdir(td->td_ucred, lvp); -#endif - while (!error && !eofflag) { - iov.iov_base = buf; - iov.iov_len = sizeof(buf); - uio.uio_iov = &iov; - uio.uio_iovcnt = 1; - uio.uio_resid = iov.iov_len; + len = 0; + off = 0; + eofflag = 0; - error = VOP_READDIR(lvp, &uio, cred, &eofflag, NULL, NULL); + for (;;) { + error = vn_dir_next_dirent(lvp, td, dirbuf, dirbuflen, + &dp, &len, &off, &eofflag); if (error != 0) break; - KASSERT(eofflag != 0 || uio.uio_resid < sizeof(buf), - ("%s: empty read from lower FS", __func__)); - - edp = (struct dirent*)&buf[sizeof(buf) - uio.uio_resid]; - for (dp = (struct dirent*)buf; !error && dp < edp; - dp = (struct dirent*)((caddr_t)dp + dp->d_reclen)) { - if (dp->d_type == DT_WHT || 
dp->d_fileno == 0 || - (dp->d_namlen == 1 && dp->d_name[0] == '.') || - (dp->d_namlen == 2 && !bcmp(dp->d_name, "..", 2))) - continue; - - cn.cn_namelen = dp->d_namlen; - cn.cn_pnbuf = NULL; - cn.cn_nameptr = dp->d_name; - cn.cn_nameiop = LOOKUP; - cn.cn_flags = LOCKPARENT | LOCKLEAF | RDONLY | ISLASTCN; - cn.cn_lkflags = LK_EXCLUSIVE; - cn.cn_cred = cred; - - /* - * check entry in lower. - * Sometimes, readdir function returns - * wrong entry. - */ - lookuperr = VOP_LOOKUP(lvp, &tvp, &cn); - if (!lookuperr) - vput(tvp); - else - continue; /* skip entry */ - - /* - * check entry - * If it has no exist/whiteout entry in upper, - * directory is not empty. - */ - cn.cn_flags = LOCKPARENT | LOCKLEAF | RDONLY | ISLASTCN; - lookuperr = VOP_LOOKUP(uvp, &tvp, &cn); + if (len == 0) { + /* EOF */ + error = 0; + break; + } - if (!lookuperr) - vput(tvp); + if (dp->d_type == DT_WHT) + continue; - /* ignore exist or whiteout entry */ - if (!lookuperr || - (lookuperr == ENOENT && (cn.cn_flags & ISWHITEOUT))) - continue; + /* + * Any file in the directory which is not '.' or '..' indicates + * the directory is not empty. + */ + switch (dp->d_namlen) { + case 2: + if (dp->d_name[1] != '.') { + /* Can't be '..' (nor '.') */ + break; + } + /* FALLTHROUGH */ + case 1: + if (dp->d_name[0] != '.') { + /* Can't be '..' nor '.' 
*/ + break; + } + continue; + default: + break; + } + cn.cn_namelen = dp->d_namlen; + cn.cn_pnbuf = NULL; + cn.cn_nameptr = dp->d_name; + cn.cn_nameiop = LOOKUP; + cn.cn_flags = LOCKPARENT | LOCKLEAF | RDONLY | ISLASTCN; + cn.cn_lkflags = LK_EXCLUSIVE; + cn.cn_cred = cred; + + error = VOP_LOOKUP(uvp, &tvp, &cn); + if (tvp != NULLVP) + vput(tvp); + if (error != 0 && error != ENOENT && error != EJUSTRETURN) + break; + else if ((cn.cn_flags & ISWHITEOUT) == 0) { error = ENOTEMPTY; - } + break; + } else + error = 0; } - /* close vnode */ - VOP_CLOSE(vp, FREAD, cred, td); - + VOP_CLOSE(lvp, FREAD, cred, td); + free(dirbuf, M_TEMP); return (error); } - diff --git a/sys/fs/unionfs/union_vfsops.c b/sys/fs/unionfs/union_vfsops.c index cb55c2dd6474..9342317ad08e 100644 --- a/sys/fs/unionfs/union_vfsops.c +++ b/sys/fs/unionfs/union_vfsops.c @@ -327,18 +327,15 @@ unionfs_domount(struct mount *mp) * unionfs_lock()) and the mountpoint's busy count. Without this, * unmount will lock the covered vnode lock (directly through the * covered vnode) and wait for the busy count to drain, while a - * concurrent lookup will increment the busy count and then lock + * concurrent lookup will increment the busy count and then may lock * the covered vnode lock (indirectly through unionfs_lock()). * - * Note that we can't yet use this facility for the 'below' case - * in which the upper vnode is the covered vnode, because that would - * introduce a different LOR in which the cross-mount lookup would - * effectively hold the upper vnode lock before acquiring the lower - * vnode lock, while an unrelated lock operation would still acquire - * the lower vnode lock before the upper vnode lock, which is the - * order unionfs currently requires. 
+ * Note that this is only needed for the 'below' case in which the + * upper vnode is also the covered vnode, because unionfs_lock() + * only locks the upper vnode as long as both lower and upper vnodes + * are present (which they will always be for the unionfs mount root). */ - if (!below) { + if (below) { vn_lock(mp->mnt_vnodecovered, LK_EXCLUSIVE | LK_RETRY | LK_CANRECURSE); mp->mnt_vnodecovered->v_vflag |= VV_CROSSLOCK; VOP_UNLOCK(mp->mnt_vnodecovered); diff --git a/sys/fs/unionfs/union_vnops.c b/sys/fs/unionfs/union_vnops.c index aa2a7273825a..03130f0ca949 100644 --- a/sys/fs/unionfs/union_vnops.c +++ b/sys/fs/unionfs/union_vnops.c @@ -76,6 +76,21 @@ VNASSERT(((vp)->v_op == &unionfs_vnodeops), vp, \ ("%s: non-unionfs vnode", __func__)) +static bool +unionfs_lookup_isroot(struct componentname *cnp, struct vnode *dvp) +{ + struct nameidata *ndp; + + if (dvp == NULL) + return (false); + if ((dvp->v_vflag & VV_ROOT) != 0) + return (true); + ndp = vfs_lookup_nameidata(cnp); + if (ndp == NULL) + return (false); + return (vfs_lookup_isroot(ndp, dvp)); +} + static int unionfs_lookup(struct vop_cachedlookup_args *ap) { @@ -84,13 +99,12 @@ unionfs_lookup(struct vop_cachedlookup_args *ap) struct vattr va; struct componentname *cnp; struct thread *td; + uint64_t cnflags; u_long nameiop; - u_long cnflags, cnflagsbk; - int iswhiteout; int lockflag; - int error , uerror, lerror; + int lkflags; + int error, uerror, lerror; - iswhiteout = 0; lockflag = 0; error = uerror = lerror = ENOENT; cnp = ap->a_cnp; @@ -120,87 +134,185 @@ unionfs_lookup(struct vop_cachedlookup_args *ap) return (EROFS); /* + * Note that a lookup is in-flight, and block if another lookup + * is already in-flight against dvp. This is done because we may + * end up dropping dvp's lock to look up a lower vnode or to create + * a shadow directory, opening up the possibility of parallel lookups + * against the same directory creating duplicate unionfs vnodes for + * the same file(s). 
Note that if this function encounters an + * in-progress lookup for the directory, it will block until the + * lookup is complete and then return ERELOOKUP to allow any + * existing unionfs vnode to be loaded from the VFS cache. + * This is really a hack; filesystems that support MNTK_LOOKUP_SHARED + * (which unionfs currently doesn't) seem to deal with this by using + * the vfs_hash_* functions to manage a per-mount vnode cache keyed + * by the inode number (or some roughly equivalent unique ID + * usually assocated with the storage medium). It may make sense + * for unionfs to adopt something similar as a replacement for its + * current half-baked directory-only cache implementation, particularly + * if we want to support MNTK_LOOKUP_SHARED here. + */ + error = unionfs_set_in_progress_flag(dvp, UNIONFS_LOOKUP_IN_PROGRESS); + if (error != 0) + return (error); + /* * lookup dotdot */ if (cnflags & ISDOTDOT) { - if (LOOKUP != nameiop && udvp == NULLVP) - return (EROFS); + if (LOOKUP != nameiop && udvp == NULLVP) { + error = EROFS; + goto unionfs_lookup_return; + } - if (udvp != NULLVP) { - dtmpvp = udvp; - if (ldvp != NULLVP) - VOP_UNLOCK(ldvp); + if (unionfs_lookup_isroot(cnp, udvp) || + unionfs_lookup_isroot(cnp, ldvp)) { + error = ENOENT; + goto unionfs_lookup_return; } + + if (udvp != NULLVP) + dtmpvp = udvp; else dtmpvp = ldvp; + unionfs_forward_vop_start(dtmpvp, &lkflags); error = VOP_LOOKUP(dtmpvp, &vp, cnp); + unionfs_forward_vop_finish(dvp, dtmpvp, lkflags); - if (dtmpvp == udvp && ldvp != NULLVP) { - VOP_UNLOCK(udvp); - vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY); - dunp = VTOUNIONFS(dvp); - if (error == 0 && dunp == NULL) - error = ENOENT; - } + /* + * Drop the lock and reference on vp. If the lookup was + * successful, we'll either need to exchange vp's lock and + * reference for the unionfs parent vnode's lock and + * reference, or (if dvp was reclaimed) we'll need to drop + * vp's lock and reference to return early. 
+ */ + if (vp != NULLVP) + vput(vp); + dunp = VTOUNIONFS(dvp); + if (error == 0 && dunp == NULL) + error = ENOENT; if (error == 0) { - /* - * Exchange lock and reference from vp to - * dunp->un_dvp. vp is upper/lower vnode, but it - * will need to return the unionfs vnode. - */ - if (nameiop == DELETE || nameiop == RENAME || - (cnp->cn_lkflags & LK_TYPE_MASK)) - VOP_UNLOCK(vp); - vrele(vp); - dtmpvp = dunp->un_dvp; vref(dtmpvp); VOP_UNLOCK(dvp); *(ap->a_vpp) = dtmpvp; - if (nameiop == DELETE || nameiop == RENAME) - vn_lock(dtmpvp, LK_EXCLUSIVE | LK_RETRY); - else if (cnp->cn_lkflags & LK_TYPE_MASK) - vn_lock(dtmpvp, cnp->cn_lkflags | - LK_RETRY); + vn_lock(dtmpvp, cnp->cn_lkflags | LK_RETRY); + if (VN_IS_DOOMED(dtmpvp)) { + vput(dtmpvp); + *(ap->a_vpp) = NULLVP; + error = ENOENT; + } vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY); - } else if (error == ENOENT && (cnflags & MAKEENTRY) != 0) - cache_enter(dvp, NULLVP, cnp); + } - goto unionfs_lookup_return; + goto unionfs_lookup_cleanup; } /* + * Lookup lower layer. We do this before looking up the the upper + * layer, as we may drop the upper parent directory's lock, and we + * want to ensure the upper parent remains locked from the point of + * lookup through any ensuing VOP that may require it to be locked. + * The cost of this is that we may end up performing an unnecessary + * lower layer lookup if a whiteout is present in the upper layer. 
+ */ + if (ldvp != NULLVP && !(cnflags & DOWHITEOUT)) { + struct componentname lcn; + bool is_dot; + + if (udvp != NULLVP) { + vref(ldvp); + VOP_UNLOCK(dvp); + vn_lock(ldvp, LK_EXCLUSIVE | LK_RETRY); + } + + lcn = *cnp; + /* always op is LOOKUP */ + lcn.cn_nameiop = LOOKUP; + lcn.cn_flags = cnflags; + is_dot = false; + + if (udvp == NULLVP) + unionfs_forward_vop_start(ldvp, &lkflags); + lerror = VOP_LOOKUP(ldvp, &lvp, &lcn); + if (udvp == NULLVP && + unionfs_forward_vop_finish(dvp, ldvp, lkflags)) { + if (lvp != NULLVP) + VOP_UNLOCK(lvp); + error = ENOENT; + goto unionfs_lookup_cleanup; + } + + if (udvp == NULLVP) + cnp->cn_flags = lcn.cn_flags; + + if (lerror == 0) { + if (ldvp == lvp) { /* is dot */ + vrele(lvp); + *(ap->a_vpp) = dvp; + vref(dvp); + is_dot = true; + error = lerror; + } else if (lvp != NULLVP) + VOP_UNLOCK(lvp); + } + + if (udvp != NULLVP) { + vput(ldvp); + vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY); + if (VN_IS_DOOMED(dvp)) + error = ENOENT; + } + if (is_dot) + goto unionfs_lookup_return; + else if (error != 0) + goto unionfs_lookup_cleanup; + } + /* * lookup upper layer */ if (udvp != NULLVP) { + bool iswhiteout = false; + + unionfs_forward_vop_start(udvp, &lkflags); uerror = VOP_LOOKUP(udvp, &uvp, cnp); + if (unionfs_forward_vop_finish(dvp, udvp, lkflags)) { + if (uvp != NULLVP) + VOP_UNLOCK(uvp); + error = ENOENT; + goto unionfs_lookup_cleanup; + } if (uerror == 0) { if (udvp == uvp) { /* is dot */ + if (lvp != NULLVP) + vrele(lvp); vrele(uvp); *(ap->a_vpp) = dvp; vref(dvp); error = uerror; goto unionfs_lookup_return; - } - if (nameiop == DELETE || nameiop == RENAME || - (cnp->cn_lkflags & LK_TYPE_MASK)) + } else if (uvp != NULLVP) VOP_UNLOCK(uvp); } /* check whiteout */ - if (uerror == ENOENT || uerror == EJUSTRETURN) - if (cnp->cn_flags & ISWHITEOUT) - iswhiteout = 1; /* don't lookup lower */ - if (iswhiteout == 0 && ldvp != NULLVP) - if (!VOP_GETATTR(udvp, &va, cnp->cn_cred) && - (va.va_flags & OPAQUE)) - iswhiteout = 1; /* don't lookup lower */ 
+ if ((uerror == ENOENT || uerror == EJUSTRETURN) && + (cnp->cn_flags & ISWHITEOUT)) + iswhiteout = true; + else if (VOP_GETATTR(udvp, &va, cnp->cn_cred) == 0 && + (va.va_flags & OPAQUE)) + iswhiteout = true; + + if (iswhiteout && lvp != NULLVP) { + vrele(lvp); + lvp = NULLVP; + } + #if 0 UNIONFS_INTERNAL_DEBUG( "unionfs_lookup: debug: whiteout=%d, path=%s\n", @@ -209,39 +321,6 @@ unionfs_lookup(struct vop_cachedlookup_args *ap) } /* - * lookup lower layer - */ - if (ldvp != NULLVP && !(cnflags & DOWHITEOUT) && iswhiteout == 0) { - /* always op is LOOKUP */ - cnp->cn_nameiop = LOOKUP; - cnflagsbk = cnp->cn_flags; - cnp->cn_flags = cnflags; - - lerror = VOP_LOOKUP(ldvp, &lvp, cnp); - - cnp->cn_nameiop = nameiop; - if (udvp != NULLVP && (uerror == 0 || uerror == EJUSTRETURN)) - cnp->cn_flags = cnflagsbk; - - if (lerror == 0) { - if (ldvp == lvp) { /* is dot */ - if (uvp != NULLVP) - vrele(uvp); /* no need? */ - vrele(lvp); - *(ap->a_vpp) = dvp; - vref(dvp); - - UNIONFS_INTERNAL_DEBUG( - "unionfs_lookup: leave (%d)\n", lerror); - - return (lerror); - } - if (cnp->cn_lkflags & LK_TYPE_MASK) - VOP_UNLOCK(lvp); - } - } - - /* * check lookup result */ if (uvp == NULLVP && lvp == NULLVP) { @@ -280,8 +359,7 @@ unionfs_lookup(struct vop_cachedlookup_args *ap) if (unp == NULL) error = ENOENT; else - error = unionfs_mkshadowdir(MOUNTTOUNIONFSMOUNT(dvp->v_mount), - udvp, unp, cnp, td); + error = unionfs_mkshadowdir(dvp, vp, cnp, td); if (lockflag != 0) VOP_UNLOCK(vp); if (error != 0) { @@ -293,6 +371,10 @@ unionfs_lookup(struct vop_cachedlookup_args *ap) vrele(vp); goto unionfs_lookup_cleanup; } + /* + * TODO: Since unionfs_mkshadowdir() relocks udvp after + * creating the new directory, return ERELOOKUP here? 
+ */ if ((cnp->cn_lkflags & LK_TYPE_MASK) == LK_SHARED) vn_lock(vp, LK_SHARED | LK_RETRY); } @@ -313,9 +395,12 @@ unionfs_lookup(struct vop_cachedlookup_args *ap) "unionfs_lookup: Unable to create unionfs vnode."); goto unionfs_lookup_cleanup; } - if ((nameiop == DELETE || nameiop == RENAME) && - (cnp->cn_lkflags & LK_TYPE_MASK) == 0) - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + } + + if (VN_IS_DOOMED(dvp) || VN_IS_DOOMED(vp)) { + error = ENOENT; + vput(vp); + goto unionfs_lookup_cleanup; } *(ap->a_vpp) = vp; @@ -329,10 +414,12 @@ unionfs_lookup_cleanup: if (lvp != NULLVP) vrele(lvp); - if (error == ENOENT && (cnflags & MAKEENTRY) != 0) + if (error == ENOENT && (cnflags & MAKEENTRY) != 0 && + !VN_IS_DOOMED(dvp)) cache_enter(dvp, NULLVP, cnp); unionfs_lookup_return: + unionfs_clear_in_progress_flag(dvp, UNIONFS_LOOKUP_IN_PROGRESS); UNIONFS_INTERNAL_DEBUG("unionfs_lookup: leave (%d)\n", error); @@ -492,6 +579,61 @@ unionfs_downgrade_lock(struct vnode *vp, enum unionfs_lkupgrade status) vn_lock(vp, LK_DOWNGRADE | LK_RETRY); } +/* + * Exchange the default (upper vnode) lock on a unionfs vnode for the lower + * vnode lock, in support of operations that require access to the lower vnode + * even when an upper vnode is present. We don't use vn_lock_pair() to hold + * both vnodes at the same time, primarily because the caller may proceed + * to issue VOPs to the lower layer which re-lock or perform other operations + * which may not be safe in the presence of a locked vnode from another FS. + * Moreover, vn_lock_pair()'s deadlock resolution approach can introduce + * additional overhead that isn't necessary on these paths. + * + * vp must be a locked unionfs vnode; the lock state of this vnode is + * returned through *lkflags for later use in unionfs_unlock_lvp(). + * + * Returns the locked lower vnode, or NULL if the lower vnode (and therefore + * also the unionfs vnode above it) has been doomed. 
+ */ +static struct vnode * +unionfs_lock_lvp(struct vnode *vp, int *lkflags) +{ + struct unionfs_node *unp; + struct vnode *lvp; + + unp = VTOUNIONFS(vp); + lvp = unp->un_lowervp; + ASSERT_VOP_LOCKED(vp, __func__); + ASSERT_VOP_UNLOCKED(lvp, __func__); + *lkflags = VOP_ISLOCKED(vp); + vref(lvp); + VOP_UNLOCK(vp); + vn_lock(lvp, *lkflags | LK_RETRY); + if (VN_IS_DOOMED(lvp)) { + vput(lvp); + lvp = NULLVP; + vn_lock(vp, *lkflags | LK_RETRY); + } + return (lvp); +} + +/* + * Undo a previous call to unionfs_lock_lvp(), restoring the default lock + * on the unionfs vnode. This function reloads and returns the vnode + * private data for the unionfs vnode, which will be NULL if the unionfs + * vnode became doomed while its lock was dropped. The caller must check + * for this case. + */ +static struct unionfs_node * +unionfs_unlock_lvp(struct vnode *vp, struct vnode *lvp, int lkflags) +{ + ASSERT_VOP_LOCKED(lvp, __func__); + ASSERT_VOP_UNLOCKED(vp, __func__); + vput(lvp); + vn_lock(vp, lkflags | LK_RETRY); + return (VTOUNIONFS(vp)); +} + static int unionfs_open(struct vop_open_args *ap) { @@ -504,7 +646,9 @@ unionfs_open(struct vop_open_args *ap) struct ucred *cred; struct thread *td; int error; + int lkflags; enum unionfs_lkupgrade lkstatus; + bool lock_lvp, open_lvp; UNIONFS_INTERNAL_DEBUG("unionfs_open: enter\n"); @@ -515,6 +659,7 @@ unionfs_open(struct vop_open_args *ap) targetvp = NULLVP; cred = ap->a_cred; td = ap->a_td; + open_lvp = lock_lvp = false; /* * The executable loader path may call this function with vp locked @@ -546,10 +691,12 @@ unionfs_open(struct vop_open_args *ap) if (targetvp == NULLVP) { if (uvp == NULLVP) { if ((ap->a_mode & FWRITE) && lvp->v_type == VREG) { - error = unionfs_copyfile(unp, + error = unionfs_copyfile(vp, !(ap->a_mode & O_TRUNC), cred, td); - if (error != 0) + if (error != 0) { + unp = VTOUNIONFS(vp); goto unionfs_open_abort; + } targetvp = uvp = unp->un_uppervp; } else targetvp = lvp; @@ -557,30 +704,69 @@ unionfs_open(struct 
vop_open_args *ap) targetvp = uvp; } + if (targetvp == uvp && uvp->v_type == VDIR && lvp != NULLVP && + unsp->uns_lower_opencnt <= 0) + open_lvp = true; + else if (targetvp == lvp && uvp != NULLVP) + lock_lvp = true; + + if (lock_lvp) { + unp = NULL; + lvp = unionfs_lock_lvp(vp, &lkflags); + if (lvp == NULLVP) { + error = ENOENT; + goto unionfs_open_abort; + } + } else + unionfs_forward_vop_start(targetvp, &lkflags); + error = VOP_OPEN(targetvp, ap->a_mode, cred, td, ap->a_fp); - if (error == 0) { - if (targetvp == uvp) { - if (uvp->v_type == VDIR && lvp != NULLVP && - unsp->uns_lower_opencnt <= 0) { - /* open lower for readdir */ - error = VOP_OPEN(lvp, FREAD, cred, td, NULL); - if (error != 0) { - VOP_CLOSE(uvp, ap->a_mode, cred, td); - goto unionfs_open_abort; - } - unsp->uns_node_flag |= UNS_OPENL_4_READDIR; - unsp->uns_lower_opencnt++; + + if (lock_lvp) { + unp = unionfs_unlock_lvp(vp, lvp, lkflags); + if (unp == NULL && error == 0) + error = ENOENT; + } else if (unionfs_forward_vop_finish(vp, targetvp, lkflags)) + error = error ? error : ENOENT; + + if (error != 0) + goto unionfs_open_abort; + + if (targetvp == uvp) { + if (open_lvp) { + unp = NULL; + lvp = unionfs_lock_lvp(vp, &lkflags); + if (lvp == NULLVP) { + error = ENOENT; + goto unionfs_open_abort; } - unsp->uns_upper_opencnt++; - } else { + /* open lower for readdir */ + error = VOP_OPEN(lvp, FREAD, cred, td, NULL); + unp = unionfs_unlock_lvp(vp, lvp, lkflags); + if (unp == NULL) { + error = error ? 
error : ENOENT; + goto unionfs_open_abort; + } + if (error != 0) { + unionfs_forward_vop_start(uvp, &lkflags); + VOP_CLOSE(uvp, ap->a_mode, cred, td); + if (unionfs_forward_vop_finish(vp, uvp, lkflags)) + unp = NULL; + goto unionfs_open_abort; + } + unsp->uns_node_flag |= UNS_OPENL_4_READDIR; unsp->uns_lower_opencnt++; - unsp->uns_lower_openmode = ap->a_mode; } - vp->v_object = targetvp->v_object; + unsp->uns_upper_opencnt++; + } else { + unsp->uns_lower_opencnt++; + unsp->uns_lower_openmode = ap->a_mode; } + vp->v_object = targetvp->v_object; unionfs_open_abort: - if (error != 0) + + if (error != 0 && unp != NULL) unionfs_tryrem_node_status(unp, unsp); unionfs_open_cleanup: @@ -599,9 +785,13 @@ unionfs_close(struct vop_close_args *ap) struct ucred *cred; struct thread *td; struct vnode *vp; + struct vnode *uvp; + struct vnode *lvp; struct vnode *ovp; int error; + int lkflags; enum unionfs_lkupgrade lkstatus; + bool lock_lvp; UNIONFS_INTERNAL_DEBUG("unionfs_close: enter\n"); @@ -611,6 +801,7 @@ unionfs_close(struct vop_close_args *ap) cred = ap->a_cred; td = ap->a_td; error = 0; + lock_lvp = false; /* * If the vnode is reclaimed while upgrading, we can't safely use unp @@ -621,44 +812,77 @@ unionfs_close(struct vop_close_args *ap) goto unionfs_close_cleanup; unp = VTOUNIONFS(vp); - unionfs_get_node_status(unp, td, &unsp); + lvp = unp->un_lowervp; + uvp = unp->un_uppervp; + unsp = unionfs_find_node_status(unp, td); - if (unsp->uns_lower_opencnt <= 0 && unsp->uns_upper_opencnt <= 0) { + if (unsp == NULL || + (unsp->uns_lower_opencnt <= 0 && unsp->uns_upper_opencnt <= 0)) { #ifdef DIAGNOSTIC - printf("unionfs_close: warning: open count is 0\n"); + if (unsp != NULL) + printf("unionfs_close: warning: open count is 0\n"); #endif - if (unp->un_uppervp != NULLVP) - ovp = unp->un_uppervp; + if (uvp != NULLVP) + ovp = uvp; else - ovp = unp->un_lowervp; + ovp = lvp; } else if (unsp->uns_upper_opencnt > 0) - ovp = unp->un_uppervp; + ovp = uvp; else - ovp = unp->un_lowervp; + 
ovp = lvp; + + if (ovp == lvp && uvp != NULLVP) { + lock_lvp = true; + unp = NULL; + lvp = unionfs_lock_lvp(vp, &lkflags); + if (lvp == NULLVP) { + error = ENOENT; + goto unionfs_close_abort; + } + } else + unionfs_forward_vop_start(ovp, &lkflags); error = VOP_CLOSE(ovp, ap->a_fflag, cred, td); + if (lock_lvp) { + unp = unionfs_unlock_lvp(vp, lvp, lkflags); + if (unp == NULL && error == 0) + error = ENOENT; + } else if (unionfs_forward_vop_finish(vp, ovp, lkflags)) + error = error ? error : ENOENT; + if (error != 0) goto unionfs_close_abort; vp->v_object = ovp->v_object; - if (ovp == unp->un_uppervp) { - unsp->uns_upper_opencnt--; - if (unsp->uns_upper_opencnt == 0) { + if (ovp == uvp) { + if (unsp != NULL && ((--unsp->uns_upper_opencnt) == 0)) { if (unsp->uns_node_flag & UNS_OPENL_4_READDIR) { - VOP_CLOSE(unp->un_lowervp, FREAD, cred, td); + unp = NULL; + lvp = unionfs_lock_lvp(vp, &lkflags); + if (lvp == NULLVP) { + error = ENOENT; + goto unionfs_close_abort; + } + VOP_CLOSE(lvp, FREAD, cred, td); + unp = unionfs_unlock_lvp(vp, lvp, lkflags); + if (unp == NULL) { + error = ENOENT; + goto unionfs_close_abort; + } unsp->uns_node_flag &= ~UNS_OPENL_4_READDIR; unsp->uns_lower_opencnt--; } if (unsp->uns_lower_opencnt > 0) - vp->v_object = unp->un_lowervp->v_object; + vp->v_object = lvp->v_object; } - } else + } else if (unsp != NULL) unsp->uns_lower_opencnt--; unionfs_close_abort: - unionfs_tryrem_node_status(unp, unsp); + if (unp != NULL && unsp != NULL) + unionfs_tryrem_node_status(unp, unsp); unionfs_close_cleanup: unionfs_downgrade_lock(vp, lkstatus); @@ -883,7 +1107,7 @@ unionfs_setattr(struct vop_setattr_args *ap) return (EROFS); if (uvp == NULLVP && lvp->v_type == VREG) { - error = unionfs_copyfile(unp, (vap->va_size != 0), + error = unionfs_copyfile(ap->a_vp, (vap->va_size != 0), ap->a_cred, td); if (error != 0) return (error); @@ -1078,8 +1302,10 @@ unionfs_remove(struct vop_remove_args *ap) error = VOP_REMOVE(udvp, uvp, cnp); 
unionfs_forward_vop_finish_pair(ap->a_dvp, udvp, udvp_lkflags, ap->a_vp, uvp, uvp_lkflags); - } else if (lvp != NULLVP) - error = unionfs_mkwhiteout(ap->a_dvp, udvp, cnp, td, path, pathlen); + } else if (lvp != NULLVP) { + error = unionfs_mkwhiteout(ap->a_dvp, ap->a_vp, cnp, td, + path, pathlen); + } UNIONFS_INTERNAL_DEBUG("unionfs_remove: leave (%d)\n", error); @@ -1096,7 +1322,6 @@ unionfs_link(struct vop_link_args *ap) struct componentname *cnp; struct thread *td; int error; - int needrelookup; UNIONFS_INTERNAL_DEBUG("unionfs_link: enter\n"); @@ -1104,7 +1329,6 @@ unionfs_link(struct vop_link_args *ap) KASSERT_UNIONFS_VNODE(ap->a_vp); error = 0; - needrelookup = 0; dunp = VTOUNIONFS(ap->a_tdvp); unp = NULL; udvp = dunp->un_uppervp; @@ -1121,16 +1345,15 @@ unionfs_link(struct vop_link_args *ap) if (ap->a_vp->v_type != VREG) return (EOPNOTSUPP); - error = unionfs_copyfile(unp, 1, cnp->cn_cred, td); - if (error != 0) - return (error); - needrelookup = 1; + VOP_UNLOCK(ap->a_tdvp); + error = unionfs_copyfile(ap->a_vp, 1, cnp->cn_cred, td); + vn_lock(ap->a_tdvp, LK_EXCLUSIVE | LK_RETRY); + if (error == 0) + error = ERELOOKUP; + return (error); } uvp = unp->un_uppervp; - if (needrelookup != 0) - error = unionfs_relookup_for_create(ap->a_tdvp, cnp, td); - if (error == 0) { int udvp_lkflags, uvp_lkflags; unionfs_forward_vop_start_pair(udvp, &udvp_lkflags, @@ -1154,8 +1377,6 @@ unionfs_rename(struct vop_rename_args *ap) struct vnode *tdvp; struct vnode *tvp; struct componentname *tcnp; - struct vnode *ltdvp; - struct vnode *ltvp; struct thread *td; /* rename target vnodes */ @@ -1164,7 +1385,6 @@ unionfs_rename(struct vop_rename_args *ap) struct vnode *rtdvp; struct vnode *rtvp; - struct unionfs_mount *ump; struct unionfs_node *unp; int error; @@ -1177,8 +1397,6 @@ unionfs_rename(struct vop_rename_args *ap) tdvp = ap->a_tdvp; tvp = ap->a_tvp; tcnp = ap->a_tcnp; - ltdvp = NULLVP; - ltvp = NULLVP; td = curthread; rfdvp = fdvp; rfvp = fvp; @@ -1238,7 +1456,6 @@ 
unionfs_rename(struct vop_rename_args *ap) UNIONFS_INTERNAL_DEBUG("fvp=%p, ufvp=%p, lfvp=%p\n", fvp, unp->un_uppervp, unp->un_lowervp); #endif - ump = MOUNTTOUNIONFSMOUNT(fvp->v_mount); /* * If we only have a lower vnode, copy the source file to the upper * FS so that the rename operation can be issued against the upper FS. @@ -1282,10 +1499,10 @@ unionfs_rename(struct vop_rename_args *ap) else if (unp->un_uppervp == NULLVP) { switch (fvp->v_type) { case VREG: - error = unionfs_copyfile(unp, 1, fcnp->cn_cred, td); + error = unionfs_copyfile(fvp, 1, fcnp->cn_cred, td); break; case VDIR: - error = unionfs_mkshadowdir(ump, rfdvp, unp, fcnp, td); + error = unionfs_mkshadowdir(fdvp, fvp, fcnp, td); break; default: error = ENODEV; @@ -1327,7 +1544,6 @@ unionfs_rename(struct vop_rename_args *ap) goto unionfs_rename_abort; } rtdvp = unp->un_uppervp; - ltdvp = unp->un_lowervp; vref(rtdvp); if (tvp != NULLVP) { @@ -1348,7 +1564,6 @@ unionfs_rename(struct vop_rename_args *ap) goto unionfs_rename_abort; } rtvp = unp->un_uppervp; - ltvp = unp->un_lowervp; vref(rtvp); } } @@ -1365,12 +1580,8 @@ unionfs_rename(struct vop_rename_args *ap) cache_purge(fdvp); } - if (ltdvp != NULLVP) - VOP_UNLOCK(ltdvp); if (tdvp != rtdvp) vrele(tdvp); - if (ltvp != NULLVP) - VOP_UNLOCK(ltvp); if (tvp != rtvp && tvp != NULLVP) { if (rtvp == NULLVP) vput(tvp); @@ -1504,43 +1715,55 @@ unionfs_rmdir(struct vop_rmdir_args *ap) if (uvp != NULLVP) { if (lvp != NULLVP) { + /* + * We need to keep dvp and vp's upper vnodes locked + * going into the VOP_RMDIR() call, but the empty + * directory check also requires the lower vnode lock. + * For this third, cross-filesystem lock we use a + * similar approach taken by various FS' VOP_RENAME + * implementations (which require 2-4 vnode locks). 
+ * First we attempt a NOWAIT acquisition, then if + * that fails we drop the other two vnode locks, + * acquire lvp's lock in the normal fashion to reduce + * the likelihood of spinning on it in the future, + * then drop, reacquire the other locks, and return + * ERELOOKUP to re-drive the lookup in case the dvp-> + * vp relationship has changed. + */ + if (vn_lock(lvp, LK_SHARED | LK_NOWAIT) != 0) { + VOP_UNLOCK(ap->a_vp); + VOP_UNLOCK(ap->a_dvp); + vn_lock(lvp, LK_SHARED | LK_RETRY); + VOP_UNLOCK(lvp); + vn_lock(ap->a_dvp, LK_EXCLUSIVE | LK_RETRY); + vn_lock(ap->a_vp, LK_EXCLUSIVE | LK_RETRY); + return (ERELOOKUP); + } error = unionfs_check_rmdir(ap->a_vp, cnp->cn_cred, td); + /* + * It's possible for a direct operation on the lower FS + * to make the lower directory non-empty after we drop + * the lock, but it's also possible for the upper-layer + * VOP_RMDIR to relock udvp/uvp which would lead to + * LOR if we kept lvp locked across that call. + */ + VOP_UNLOCK(lvp); if (error != 0) return (error); } ump = MOUNTTOUNIONFSMOUNT(ap->a_vp->v_mount); if (ump->um_whitemode == UNIONFS_WHITE_ALWAYS || lvp != NULLVP) - cnp->cn_flags |= DOWHITEOUT; - /* - * The relookup path will need to relock the parent dvp and - * possibly the vp as well. Locking is expected to be done - * in parent->child order; drop the lock on vp to avoid LOR - * and potential recursion on vp's lock. - * vp is expected to remain referenced during VOP_RMDIR(), - * so vref/vrele should not be necessary here. - */ - VOP_UNLOCK(ap->a_vp); - VNPASS(vrefcnt(ap->a_vp) > 0, ap->a_vp); - error = unionfs_relookup_for_delete(ap->a_dvp, cnp, td); - vn_lock(ap->a_vp, LK_EXCLUSIVE | LK_RETRY); - /* - * VOP_RMDIR is dispatched against udvp, so if uvp became - * doomed while the lock was dropped above the target - * filesystem may not be able to cope. 
- */ - if (error == 0 && VN_IS_DOOMED(uvp)) - error = ENOENT; - if (error == 0) { - int udvp_lkflags, uvp_lkflags; - unionfs_forward_vop_start_pair(udvp, &udvp_lkflags, - uvp, &uvp_lkflags); - error = VOP_RMDIR(udvp, uvp, cnp); - unionfs_forward_vop_finish_pair(ap->a_dvp, udvp, udvp_lkflags, - ap->a_vp, uvp, uvp_lkflags); - } - } else if (lvp != NULLVP) - error = unionfs_mkwhiteout(ap->a_dvp, udvp, cnp, td, + cnp->cn_flags |= (DOWHITEOUT | IGNOREWHITEOUT); + int udvp_lkflags, uvp_lkflags; + unionfs_forward_vop_start_pair(udvp, &udvp_lkflags, + uvp, &uvp_lkflags); + error = VOP_RMDIR(udvp, uvp, cnp); + unionfs_forward_vop_finish_pair(ap->a_dvp, udvp, udvp_lkflags, + ap->a_vp, uvp, uvp_lkflags); + } else if (lvp != NULLVP) { + error = unionfs_mkwhiteout(ap->a_dvp, ap->a_vp, cnp, td, unp->un_path, unp->un_pathlen); + } if (error == 0) { cache_purge(ap->a_dvp); @@ -1613,6 +1836,7 @@ unionfs_readdir(struct vop_readdir_args *ap) uint64_t *cookies_bk; int error; int eofflag; + int lkflags; int ncookies_bk; int uio_offset_bk; enum unionfs_lkupgrade lkstatus; @@ -1668,18 +1892,26 @@ unionfs_readdir(struct vop_readdir_args *ap) /* upper only */ if (uvp != NULLVP && lvp == NULLVP) { + unionfs_forward_vop_start(uvp, &lkflags); error = VOP_READDIR(uvp, uio, ap->a_cred, ap->a_eofflag, ap->a_ncookies, ap->a_cookies); - unsp->uns_readdir_status = 0; + if (unionfs_forward_vop_finish(vp, uvp, lkflags)) + error = error ? error : ENOENT; + else + unsp->uns_readdir_status = 0; goto unionfs_readdir_exit; } /* lower only */ if (uvp == NULLVP && lvp != NULLVP) { + unionfs_forward_vop_start(lvp, &lkflags); error = VOP_READDIR(lvp, uio, ap->a_cred, ap->a_eofflag, ap->a_ncookies, ap->a_cookies); - unsp->uns_readdir_status = 2; + if (unionfs_forward_vop_finish(vp, lvp, lkflags)) + error = error ? 
error : ENOENT; + else + unsp->uns_readdir_status = 2; goto unionfs_readdir_exit; } @@ -1689,14 +1921,17 @@ unionfs_readdir(struct vop_readdir_args *ap) */ KASSERT(uvp != NULLVP, ("unionfs_readdir: null upper vp")); KASSERT(lvp != NULLVP, ("unionfs_readdir: null lower vp")); + if (uio->uio_offset == 0) unsp->uns_readdir_status = 0; if (unsp->uns_readdir_status == 0) { /* read upper */ + unionfs_forward_vop_start(uvp, &lkflags); error = VOP_READDIR(uvp, uio, ap->a_cred, &eofflag, ap->a_ncookies, ap->a_cookies); - + if (unionfs_forward_vop_finish(vp, uvp, lkflags) && error == 0) + error = ENOENT; if (error != 0 || eofflag == 0) goto unionfs_readdir_exit; unsp->uns_readdir_status = 1; @@ -1735,14 +1970,22 @@ unionfs_readdir(struct vop_readdir_args *ap) uio->uio_offset = 0; } - if (lvp == NULLVP) { - error = EBADF; + lvp = unionfs_lock_lvp(vp, &lkflags); + if (lvp == NULL) { + error = ENOENT; goto unionfs_readdir_exit; } + /* read lower */ error = VOP_READDIR(lvp, uio, ap->a_cred, ap->a_eofflag, ap->a_ncookies, ap->a_cookies); + + unp = unionfs_unlock_lvp(vp, lvp, lkflags); + if (unp == NULL && error == 0) + error = ENOENT; + + /* * We can't return an uio_offset of 0: this would trigger an * infinite loop, because the next call to unionfs_readdir would @@ -1907,96 +2150,49 @@ unionfs_print(struct vop_print_args *ap) } static int -unionfs_get_llt_revlock(struct vnode *vp, int flags) -{ - int revlock; - - revlock = 0; - - switch (flags & LK_TYPE_MASK) { - case LK_SHARED: - if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE) - revlock = LK_UPGRADE; - else - revlock = LK_RELEASE; - break; - case LK_EXCLUSIVE: - case LK_UPGRADE: - revlock = LK_RELEASE; - break; - case LK_DOWNGRADE: - revlock = LK_UPGRADE; - break; - default: - break; - } - - return (revlock); -} - -/* - * The state of an acquired lock is adjusted similarly to - * the time of error generating. 
- * flags: LK_RELEASE or LK_UPGRADE - */ -static void -unionfs_revlock(struct vnode *vp, int flags) -{ - if (flags & LK_RELEASE) - VOP_UNLOCK_FLAGS(vp, flags); - else { - /* UPGRADE */ - if (vn_lock(vp, flags) != 0) - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); - } -} - -static int unionfs_lock(struct vop_lock1_args *ap) { struct unionfs_node *unp; struct vnode *vp; - struct vnode *uvp; - struct vnode *lvp; + struct vnode *tvp; int error; int flags; - int revlock; - int interlock; - int uhold; + bool lvp_locked; - /* - * TODO: rework the unionfs locking scheme. - * It's not guaranteed to be safe to blindly lock two vnodes on - * different mounts as is done here. Further, the entanglement - * of locking both vnodes with the various options that can be - * passed to VOP_LOCK() makes this code hard to reason about. - * Instead, consider locking only the upper vnode, or the lower - * vnode is the upper is not present, and taking separate measures - * to lock both vnodes in the few cases when that is needed. - */ error = 0; - interlock = 1; - uhold = 0; flags = ap->a_flags; vp = ap->a_vp; if (LK_RELEASE == (flags & LK_TYPE_MASK) || !(flags & LK_TYPE_MASK)) return (VOP_UNLOCK_FLAGS(vp, flags | LK_RELEASE)); +unionfs_lock_restart: + /* + * We currently need the interlock here to ensure we can safely + * access the unionfs vnode's private data. We may be able to + * eliminate this extra locking by instead using vfs_smr_enter() + * and vn_load_v_data_smr() here in conjunction with an SMR UMA + * zone for unionfs nodes. 
+ */ if ((flags & LK_INTERLOCK) == 0) VI_LOCK(vp); + else + flags &= ~LK_INTERLOCK; unp = VTOUNIONFS(vp); - if (unp == NULL) - goto unionfs_lock_null_vnode; - - KASSERT_UNIONFS_VNODE(ap->a_vp); - - lvp = unp->un_lowervp; - uvp = unp->un_uppervp; + if (unp == NULL) { + VI_UNLOCK(vp); + ap->a_flags = flags; + return (vop_stdlock(ap)); + } - if ((revlock = unionfs_get_llt_revlock(vp, flags)) == 0) - panic("unknown lock type: 0x%x", flags & LK_TYPE_MASK); + if (unp->un_uppervp != NULL) { + tvp = unp->un_uppervp; + lvp_locked = false; + } else { + tvp = unp->un_lowervp; + lvp_locked = true; + } /* * During unmount, the root vnode lock may be taken recursively, @@ -2009,150 +2205,77 @@ unionfs_lock(struct vop_lock1_args *ap) (vp->v_vflag & VV_ROOT) != 0) flags |= LK_CANRECURSE; - if (lvp != NULLVP) { - if (uvp != NULLVP && flags & LK_UPGRADE) { + vholdnz(tvp); + VI_UNLOCK(vp); + error = VOP_LOCK(tvp, flags); + vdrop(tvp); + if (error == 0 && (lvp_locked || VTOUNIONFS(vp) == NULL)) { + /* + * After dropping the interlock above, there exists a window + * in which another thread may acquire the lower vnode lock + * and then either doom the unionfs vnode or create an upper + * vnode. In either case, we will effectively be holding the + * wrong lock, so we must drop the lower vnode lock and + * restart the lock operation. + * + * If unp is not already NULL, we assume that we can safely + * access it because we currently hold lvp's lock. + * unionfs_noderem() acquires lvp's lock before freeing + * the vnode private data, ensuring it can't be concurrently + * freed while we are using it here. Likewise, + * unionfs_node_update() acquires lvp's lock before installing + * an upper vnode. Without those guarantees, we would need to + * reacquire the vnode interlock here. + * Note that unionfs_noderem() doesn't acquire lvp's lock if + * this is the root vnode, but the root vnode should always + * have an upper vnode and therefore we should never use its + * lower vnode lock here. 
+ */ + unp = VTOUNIONFS(vp); + if (unp == NULL || unp->un_uppervp != NULLVP) { + VOP_UNLOCK(tvp); /* - * Share Lock is once released and a deadlock is - * avoided. + * If we previously held the lock, the upgrade may + * have temporarily dropped the lock, in which case + * concurrent dooming or copy-up will necessitate + * acquiring a different lock. Since we never held + * the new lock, LK_UPGRADE must be cleared here to + * avoid triggering a lockmgr panic. */ - vholdnz(uvp); - uhold = 1; - VOP_UNLOCK(uvp); - } - VI_LOCK_FLAGS(lvp, MTX_DUPOK); - flags |= LK_INTERLOCK; - vholdl(lvp); - - VI_UNLOCK(vp); - ap->a_flags &= ~LK_INTERLOCK; - - error = VOP_LOCK(lvp, flags); - - VI_LOCK(vp); - unp = VTOUNIONFS(vp); - if (unp == NULL) { - /* vnode is released. */ - VI_UNLOCK(vp); - if (error == 0) - VOP_UNLOCK(lvp); - vdrop(lvp); - if (uhold != 0) - vdrop(uvp); - goto unionfs_lock_fallback; + if (flags & LK_UPGRADE) + flags = (flags & ~LK_TYPE_MASK) | LK_EXCLUSIVE; + VNASSERT((flags & LK_DOWNGRADE) == 0, vp, + ("%s: vnode doomed during downgrade", __func__)); + goto unionfs_lock_restart; } } - if (error == 0 && uvp != NULLVP) { - if (uhold && flags & LK_UPGRADE) { - flags &= ~LK_TYPE_MASK; - flags |= LK_EXCLUSIVE; - } - VI_LOCK_FLAGS(uvp, MTX_DUPOK); - flags |= LK_INTERLOCK; - if (uhold == 0) { - vholdl(uvp); - uhold = 1; - } - - VI_UNLOCK(vp); - ap->a_flags &= ~LK_INTERLOCK; - - error = VOP_LOCK(uvp, flags); - - VI_LOCK(vp); - unp = VTOUNIONFS(vp); - if (unp == NULL) { - /* vnode is released. 
*/ - VI_UNLOCK(vp); - if (error == 0) - VOP_UNLOCK(uvp); - vdrop(uvp); - if (lvp != NULLVP) { - VOP_UNLOCK(lvp); - vdrop(lvp); - } - goto unionfs_lock_fallback; - } - if (error != 0 && lvp != NULLVP) { - /* rollback */ - VI_UNLOCK(vp); - unionfs_revlock(lvp, revlock); - interlock = 0; - } - } - - if (interlock) - VI_UNLOCK(vp); - if (lvp != NULLVP) - vdrop(lvp); - if (uhold != 0) - vdrop(uvp); - return (error); - -unionfs_lock_null_vnode: - ap->a_flags |= LK_INTERLOCK; - return (vop_stdlock(ap)); - -unionfs_lock_fallback: - /* - * If we reach this point, we've discovered the unionfs vnode - * has been reclaimed while the upper/lower vnode locks were - * temporarily dropped. Such temporary droppage may happen - * during the course of an LK_UPGRADE operation itself, and in - * that case LK_UPGRADE must be cleared as the unionfs vnode's - * lock has been reset to point to the standard v_lock field, - * which has not previously been held. - */ - if (flags & LK_UPGRADE) { - ap->a_flags &= ~LK_TYPE_MASK; - ap->a_flags |= LK_EXCLUSIVE; - } - return (vop_stdlock(ap)); } static int unionfs_unlock(struct vop_unlock_args *ap) { struct vnode *vp; - struct vnode *lvp; - struct vnode *uvp; + struct vnode *tvp; struct unionfs_node *unp; int error; - int uhold; KASSERT_UNIONFS_VNODE(ap->a_vp); - error = 0; - uhold = 0; vp = ap->a_vp; unp = VTOUNIONFS(vp); if (unp == NULL) - goto unionfs_unlock_null_vnode; - lvp = unp->un_lowervp; - uvp = unp->un_uppervp; + return (vop_stdunlock(ap)); - if (lvp != NULLVP) { - vholdnz(lvp); - error = VOP_UNLOCK(lvp); - } + tvp = (unp->un_uppervp != NULL ? 
unp->un_uppervp : unp->un_lowervp); - if (error == 0 && uvp != NULLVP) { - vholdnz(uvp); - uhold = 1; - error = VOP_UNLOCK(uvp); - } + vholdnz(tvp); + error = VOP_UNLOCK(tvp); + vdrop(tvp); - if (lvp != NULLVP) - vdrop(lvp); - if (uhold != 0) - vdrop(uvp); - - return error; - -unionfs_unlock_null_vnode: - return (vop_stdunlock(ap)); + return (error); } static int @@ -2192,7 +2315,7 @@ unionfs_advlock(struct vop_advlock_args *ap) uvp = unp->un_uppervp; if (uvp == NULLVP) { - error = unionfs_copyfile(unp, 1, td->td_ucred, td); + error = unionfs_copyfile(ap->a_vp, 1, td->td_ucred, td); if (error != 0) goto unionfs_advlock_abort; uvp = unp->un_uppervp; @@ -2294,7 +2417,7 @@ unionfs_setacl(struct vop_setacl_args *ap) return (EROFS); if (uvp == NULLVP && lvp->v_type == VREG) { - if ((error = unionfs_copyfile(unp, 1, ap->a_cred, td)) != 0) + if ((error = unionfs_copyfile(ap->a_vp, 1, ap->a_cred, td)) != 0) return (error); uvp = unp->un_uppervp; } @@ -2467,9 +2590,10 @@ unionfs_setextattr(struct vop_setextattr_args *ap) if (ovp == lvp && lvp->v_type == VREG) { VOP_CLOSEEXTATTR(lvp, 0, cred, td); if (uvp == NULLVP && - (error = unionfs_copyfile(unp, 1, cred, td)) != 0) { + (error = unionfs_copyfile(ap->a_vp, 1, cred, td)) != 0) { unionfs_setextattr_reopen: - if ((unp->un_flag & UNIONFS_OPENEXTL) && + unp = VTOUNIONFS(ap->a_vp); + if (unp != NULL && (unp->un_flag & UNIONFS_OPENEXTL) && VOP_OPENEXTATTR(lvp, cred, td)) { #ifdef DIAGNOSTIC panic("unionfs: VOP_OPENEXTATTR failed"); @@ -2561,9 +2685,10 @@ unionfs_deleteextattr(struct vop_deleteextattr_args *ap) if (ovp == lvp && lvp->v_type == VREG) { VOP_CLOSEEXTATTR(lvp, 0, cred, td); if (uvp == NULLVP && - (error = unionfs_copyfile(unp, 1, cred, td)) != 0) { + (error = unionfs_copyfile(ap->a_vp, 1, cred, td)) != 0) { unionfs_deleteextattr_reopen: - if ((unp->un_flag & UNIONFS_OPENEXTL) && + unp = VTOUNIONFS(ap->a_vp); + if (unp != NULL && (unp->un_flag & UNIONFS_OPENEXTL) && VOP_OPENEXTATTR(lvp, cred, td)) { #ifdef DIAGNOSTIC 
panic("unionfs: VOP_OPENEXTATTR failed"); @@ -2613,7 +2738,7 @@ unionfs_setlabel(struct vop_setlabel_args *ap) return (EROFS); if (uvp == NULLVP && lvp->v_type == VREG) { - if ((error = unionfs_copyfile(unp, 1, ap->a_cred, td)) != 0) + if ((error = unionfs_copyfile(ap->a_vp, 1, ap->a_cred, td)) != 0) return (error); uvp = unp->un_uppervp; } @@ -2665,7 +2790,7 @@ static int unionfs_vput_pair(struct vop_vput_pair_args *ap) { struct mount *mp; - struct vnode *dvp, *vp, **vpp, *lvp, *ldvp, *uvp, *udvp, *tempvp; + struct vnode *dvp, *vp, **vpp, *lvp, *uvp, *tvp, *tdvp, *tempvp; struct unionfs_node *dunp, *unp; int error, res; @@ -2674,11 +2799,14 @@ unionfs_vput_pair(struct vop_vput_pair_args *ap) vp = NULLVP; lvp = NULLVP; uvp = NULLVP; + tvp = NULLVP; unp = NULL; dunp = VTOUNIONFS(dvp); - udvp = dunp->un_uppervp; - ldvp = dunp->un_lowervp; + if (dunp->un_uppervp != NULL) + tdvp = dunp->un_uppervp; + else + tdvp = dunp->un_lowervp; /* * Underlying vnodes should be locked because the encompassing unionfs @@ -2686,10 +2814,7 @@ unionfs_vput_pair(struct vop_vput_pair_args *ap) * only be on the unionfs node. Reference them now so that the vput()s * performed by VOP_VPUT_PAIR() will have a reference to drop. */ - if (udvp != NULLVP) - vref(udvp); - if (ldvp != NULLVP) - vref(ldvp); + vref(tdvp); if (vpp != NULL) vp = *vpp; @@ -2699,9 +2824,10 @@ unionfs_vput_pair(struct vop_vput_pair_args *ap) uvp = unp->un_uppervp; lvp = unp->un_lowervp; if (uvp != NULLVP) - vref(uvp); - if (lvp != NULLVP) - vref(lvp); + tvp = uvp; + else + tvp = lvp; + vref(tvp); /* * If we're being asked to return a locked child vnode, then @@ -2721,31 +2847,19 @@ unionfs_vput_pair(struct vop_vput_pair_args *ap) } } - /* - * TODO: Because unionfs_lock() locks both the lower and upper vnodes - * (if available), we must also call VOP_VPUT_PAIR() on both the lower - * and upper parent/child pairs. 
If unionfs_lock() is reworked to lock - * only a single vnode, this code will need to change to also only - * operate on one vnode pair. - */ - ASSERT_VOP_LOCKED(ldvp, __func__); - ASSERT_VOP_LOCKED(udvp, __func__); - ASSERT_VOP_LOCKED(lvp, __func__); - ASSERT_VOP_LOCKED(uvp, __func__); - - KASSERT(lvp == NULLVP || ldvp != NULLVP, - ("%s: NULL ldvp with non-NULL lvp", __func__)); - if (ldvp != NULLVP) - res = VOP_VPUT_PAIR(ldvp, lvp != NULLVP ? &lvp : NULL, true); - KASSERT(uvp == NULLVP || udvp != NULLVP, - ("%s: NULL udvp with non-NULL uvp", __func__)); - if (udvp != NULLVP) - res = VOP_VPUT_PAIR(udvp, uvp != NULLVP ? &uvp : NULL, true); - - ASSERT_VOP_UNLOCKED(ldvp, __func__); - ASSERT_VOP_UNLOCKED(udvp, __func__); - ASSERT_VOP_UNLOCKED(lvp, __func__); - ASSERT_VOP_UNLOCKED(uvp, __func__); + ASSERT_VOP_LOCKED(tdvp, __func__); + ASSERT_VOP_LOCKED(tvp, __func__); + + if (tdvp == dunp->un_uppervp && tvp != NULLVP && tvp == lvp) { + vput(tvp); + vput(tdvp); + res = 0; + } else { + res = VOP_VPUT_PAIR(tdvp, tvp != NULLVP ? &tvp : NULL, true); + } + + ASSERT_VOP_UNLOCKED(tdvp, __func__); + ASSERT_VOP_UNLOCKED(tvp, __func__); /* * VOP_VPUT_PAIR() dropped the references we added to the underlying |