aboutsummaryrefslogtreecommitdiff
path: root/sys/fs/unionfs/union_subr.c
diff options
context:
space:
mode:
Diffstat (limited to 'sys/fs/unionfs/union_subr.c')
-rw-r--r--sys/fs/unionfs/union_subr.c817
1 files changed, 489 insertions, 328 deletions
diff --git a/sys/fs/unionfs/union_subr.c b/sys/fs/unionfs/union_subr.c
index bb57f3d56ade..edcc6716b674 100644
--- a/sys/fs/unionfs/union_subr.c
+++ b/sys/fs/unionfs/union_subr.c
@@ -203,19 +203,19 @@ unionfs_ins_cached_vnode(struct unionfs_node *uncp,
struct unionfs_node_hashhead *hd;
struct vnode *vp;
- ASSERT_VOP_ELOCKED(uncp->un_uppervp, __func__);
- ASSERT_VOP_ELOCKED(uncp->un_lowervp, __func__);
- KASSERT(uncp->un_uppervp == NULLVP || uncp->un_uppervp->v_type == VDIR,
- ("%s: v_type != VDIR", __func__));
- KASSERT(uncp->un_lowervp == NULLVP || uncp->un_lowervp->v_type == VDIR,
- ("%s: v_type != VDIR", __func__));
-
vp = NULLVP;
VI_LOCK(dvp);
- if (uncp->un_uppervp != NULL)
+ if (uncp->un_uppervp != NULLVP) {
+ ASSERT_VOP_ELOCKED(uncp->un_uppervp, __func__);
+ KASSERT(uncp->un_uppervp->v_type == VDIR,
+ ("%s: v_type != VDIR", __func__));
vp = unionfs_get_cached_vnode_locked(uncp->un_uppervp, dvp);
- else if (uncp->un_lowervp != NULL)
+ } else if (uncp->un_lowervp != NULLVP) {
+ ASSERT_VOP_ELOCKED(uncp->un_lowervp, __func__);
+ KASSERT(uncp->un_lowervp->v_type == VDIR,
+ ("%s: v_type != VDIR", __func__));
vp = unionfs_get_cached_vnode_locked(uncp->un_lowervp, dvp);
+ }
if (vp == NULLVP) {
hd = unionfs_get_hashhead(dvp, (uncp->un_uppervp != NULLVP ?
uncp->un_uppervp : uncp->un_lowervp));
@@ -276,9 +276,11 @@ unionfs_nodeget_cleanup(struct vnode *vp, struct unionfs_node *unp)
if (unp->un_dvp != NULLVP)
vrele(unp->un_dvp);
- if (unp->un_uppervp != NULLVP)
+ if (unp->un_uppervp != NULLVP) {
vput(unp->un_uppervp);
- if (unp->un_lowervp != NULLVP)
+ if (unp->un_lowervp != NULLVP)
+ vrele(unp->un_lowervp);
+ } else if (unp->un_lowervp != NULLVP)
vput(unp->un_lowervp);
if (unp->un_hashtbl != NULL)
hashdestroy(unp->un_hashtbl, M_UNIONFSHASH, UNIONFSHASHMASK);
@@ -314,7 +316,7 @@ unionfs_nodeget(struct mount *mp, struct vnode *uppervp,
*vpp = NULLVP;
if (uppervp == NULLVP && lowervp == NULLVP)
- panic("%s: upper and lower is null", __func__);
+ panic("%s: upper and lower are both null", __func__);
vt = (uppervp != NULLVP ? uppervp->v_type : lowervp->v_type);
@@ -327,7 +329,9 @@ unionfs_nodeget(struct mount *mp, struct vnode *uppervp,
vp = unionfs_get_cached_vnode(uppervp, lowervp, dvp);
if (vp != NULLVP) {
*vpp = vp;
- goto unionfs_nodeget_out;
+ if (lkflags != 0)
+ vn_lock(*vpp, lkflags | LK_RETRY);
+ return (0);
}
}
@@ -385,27 +389,47 @@ unionfs_nodeget(struct mount *mp, struct vnode *uppervp,
KASSERT(dvp != NULL || (vp->v_vflag & VV_ROOT) != 0,
("%s: NULL dvp for non-root vp %p", __func__, vp));
- vn_lock_pair(lowervp, false, LK_EXCLUSIVE, uppervp, false,
- LK_EXCLUSIVE);
+
+ /*
+ * NOTE: There is still a possibility for cross-filesystem locking here.
+ * If dvp has an upper FS component and is locked, while the new vnode
+ * created here only has a lower-layer FS component, then we will end
+ * up taking a lower-FS lock while holding an upper-FS lock.
+ * That situation could be dealt with here using vn_lock_pair().
+ * However, that would only address one instance out of many in which
+ * a child vnode lock is taken while holding a lock on its parent
+ * directory. This is done in many places in common VFS code, as well as
+ * a few places within unionfs (which could lead to the same cross-FS
+ * locking issue if, for example, the upper FS is another nested unionfs
+ * instance). Additionally, it is unclear under what circumstances this
+ * specific lock sequence (a directory on one FS followed by a child of
+ * its 'peer' directory on another FS) would present the practical
+ * possibility of deadlock due to some other agent on the system
+ * attempting to lock those two specific vnodes in the opposite order.
+ */
+ if (uppervp != NULLVP)
+ vn_lock(uppervp, LK_EXCLUSIVE | LK_RETRY);
+ else
+ vn_lock(lowervp, LK_EXCLUSIVE | LK_RETRY);
error = insmntque1(vp, mp);
if (error != 0) {
unionfs_nodeget_cleanup(vp, unp);
return (error);
}
- if (lowervp != NULL && VN_IS_DOOMED(lowervp)) {
- vput(lowervp);
- unp->un_lowervp = lowervp = NULL;
- }
- if (uppervp != NULL && VN_IS_DOOMED(uppervp)) {
- vput(uppervp);
- unp->un_uppervp = uppervp = NULL;
- if (lowervp != NULLVP)
- vp->v_vnlock = lowervp->v_vnlock;
- }
- if (lowervp == NULL && uppervp == NULL) {
- unionfs_nodeget_cleanup(vp, unp);
- return (ENOENT);
- }
+ /*
+ * lowervp and uppervp should only be doomed by a forced unmount of
+ * their respective filesystems, but that can only happen if the
+ * unionfs instance is first unmounted. We also effectively hold the
+ * lock on the new unionfs vnode at this point. Therefore, if a
+ * unionfs umount has not yet reached the point at which the above
+ * insmntque1() would fail, then its vflush() call will end up
+ * blocked on our vnode lock, effectively also preventing unmount
+ * of the underlying filesystems.
+ */
+ VNASSERT(lowervp == NULLVP || !VN_IS_DOOMED(lowervp), vp,
+ ("%s: doomed lowervp %p", __func__, lowervp));
+ VNASSERT(uppervp == NULLVP || !VN_IS_DOOMED(uppervp), vp,
+ ("%s: doomed lowervp %p", __func__, uppervp));
vn_set_state(vp, VSTATE_CONSTRUCTED);
@@ -413,18 +437,16 @@ unionfs_nodeget(struct mount *mp, struct vnode *uppervp,
*vpp = unionfs_ins_cached_vnode(unp, dvp);
if (*vpp != NULLVP) {
unionfs_nodeget_cleanup(vp, unp);
- vp = *vpp;
- } else {
- if (uppervp != NULL)
- VOP_UNLOCK(uppervp);
- if (lowervp != NULL)
- VOP_UNLOCK(lowervp);
+ if (lkflags != 0)
+ vn_lock(*vpp, lkflags | LK_RETRY);
+ return (0);
+ } else
*vpp = vp;
- }
-unionfs_nodeget_out:
- if (lkflags & LK_TYPE_MASK)
- vn_lock(vp, lkflags | LK_RETRY);
+ if ((lkflags & LK_SHARED) != 0)
+ vn_lock(vp, LK_DOWNGRADE);
+ else if ((lkflags & LK_EXCLUSIVE) == 0)
+ VOP_UNLOCK(vp);
return (0);
}
@@ -443,6 +465,7 @@ unionfs_noderem(struct vnode *vp)
struct vnode *dvp;
int count;
int writerefs;
+ bool unlock_lvp;
/*
* The root vnode lock may be recursed during unmount, because
@@ -455,18 +478,36 @@ unionfs_noderem(struct vnode *vp)
*/
KASSERT(vp->v_vnlock->lk_recurse == 0 || (vp->v_vflag & VV_ROOT) != 0,
("%s: vnode %p locked recursively", __func__, vp));
+
+ unp = VTOUNIONFS(vp);
+ VNASSERT(unp != NULL, vp, ("%s: already reclaimed", __func__));
+ lvp = unp->un_lowervp;
+ uvp = unp->un_uppervp;
+ dvp = unp->un_dvp;
+ unlock_lvp = (uvp == NULLVP);
+
+ /*
+ * Lock the lower vnode in addition to the upper vnode lock in order
+ * to synchronize against any unionfs_lock() operation which may still
+ * hold the lower vnode lock. We do not need to do this for the root
+ * vnode, as the root vnode should always have both upper and lower
+ * base vnodes for its entire lifecycle, so unionfs_lock() should
+ * never attempt to lock its lower vnode in the first place.
+ * Moreover, during unmount of a non-"below" unionfs mount, the lower
+ * root vnode will already be locked as it is the covered vnode.
+ */
+ if (uvp != NULLVP && lvp != NULLVP && (vp->v_vflag & VV_ROOT) == 0) {
+ vn_lock_pair(uvp, true, LK_EXCLUSIVE, lvp, false, LK_EXCLUSIVE);
+ unlock_lvp = true;
+ }
+
if (lockmgr(&vp->v_lock, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
panic("%s: failed to acquire lock for vnode lock", __func__);
-
/*
* Use the interlock to protect the clearing of v_data to
* prevent faults in unionfs_lock().
*/
VI_LOCK(vp);
- unp = VTOUNIONFS(vp);
- lvp = unp->un_lowervp;
- uvp = unp->un_uppervp;
- dvp = unp->un_dvp;
unp->un_lowervp = unp->un_uppervp = NULLVP;
vp->v_vnlock = &(vp->v_lock);
vp->v_data = NULL;
@@ -502,18 +543,16 @@ unionfs_noderem(struct vnode *vp)
("%s: write reference without upper vnode", __func__));
VOP_ADD_WRITECOUNT(uvp, -writerefs);
}
- if (lvp != NULLVP)
- VOP_UNLOCK(lvp);
if (uvp != NULLVP)
- VOP_UNLOCK(uvp);
+ vput(uvp);
+ if (unlock_lvp)
+ vput(lvp);
+ else if (lvp != NULLVP)
+ vrele(lvp);
if (dvp != NULLVP)
unionfs_rem_cached_vnode(unp, dvp);
- if (lvp != NULLVP)
- vrele(lvp);
- if (uvp != NULLVP)
- vrele(uvp);
if (unp->un_path != NULL) {
free(unp->un_path, M_UNIONFSPATH);
unp->un_path = NULL;
@@ -539,35 +578,52 @@ unionfs_noderem(struct vnode *vp)
}
/*
- * Get the unionfs node status object for the vnode corresponding to unp,
- * for the process that owns td. Allocate a new status object if one
- * does not already exist.
+ * Find the unionfs node status object for the vnode corresponding to unp,
+ * for the process that owns td. Return NULL if no such object exists.
*/
-void
-unionfs_get_node_status(struct unionfs_node *unp, struct thread *td,
- struct unionfs_node_status **unspp)
+struct unionfs_node_status *
+unionfs_find_node_status(struct unionfs_node *unp, struct thread *td)
{
struct unionfs_node_status *unsp;
pid_t pid;
pid = td->td_proc->p_pid;
- KASSERT(NULL != unspp, ("%s: NULL status", __func__));
ASSERT_VOP_ELOCKED(UNIONFSTOV(unp), __func__);
LIST_FOREACH(unsp, &(unp->un_unshead), uns_list) {
if (unsp->uns_pid == pid) {
- *unspp = unsp;
- return;
+ return (unsp);
}
}
- /* create a new unionfs node status */
- unsp = malloc(sizeof(struct unionfs_node_status),
- M_TEMP, M_WAITOK | M_ZERO);
+ return (NULL);
+}
+
+/*
+ * Get the unionfs node status object for the vnode corresponding to unp,
+ * for the process that owns td. Allocate a new status object if one
+ * does not already exist.
+ */
+void
+unionfs_get_node_status(struct unionfs_node *unp, struct thread *td,
+ struct unionfs_node_status **unspp)
+{
+ struct unionfs_node_status *unsp;
+ pid_t pid;
+
+ pid = td->td_proc->p_pid;
- unsp->uns_pid = pid;
- LIST_INSERT_HEAD(&(unp->un_unshead), unsp, uns_list);
+ KASSERT(NULL != unspp, ("%s: NULL status", __func__));
+ unsp = unionfs_find_node_status(unp, td);
+ if (unsp == NULL) {
+ /* create a new unionfs node status */
+ unsp = malloc(sizeof(struct unionfs_node_status),
+ M_TEMP, M_WAITOK | M_ZERO);
+
+ unsp->uns_pid = pid;
+ LIST_INSERT_HEAD(&(unp->un_unshead), unsp, uns_list);
+ }
*unspp = unsp;
}
@@ -697,110 +753,6 @@ unionfs_relookup(struct vnode *dvp, struct vnode **vpp,
}
/*
- * relookup for CREATE namei operation.
- *
- * dvp is unionfs vnode. dvp should be locked.
- *
- * If it called 'unionfs_copyfile' function by unionfs_link etc,
- * VOP_LOOKUP information is broken.
- * So it need relookup in order to create link etc.
- */
-int
-unionfs_relookup_for_create(struct vnode *dvp, struct componentname *cnp,
- struct thread *td)
-{
- struct vnode *udvp;
- struct vnode *vp;
- struct componentname cn;
- int error;
-
- udvp = UNIONFSVPTOUPPERVP(dvp);
- vp = NULLVP;
-
- error = unionfs_relookup(udvp, &vp, cnp, &cn, td, cnp->cn_nameptr,
- cnp->cn_namelen, CREATE);
- if (error)
- return (error);
-
- if (vp != NULLVP) {
- if (udvp == vp)
- vrele(vp);
- else
- vput(vp);
-
- error = EEXIST;
- }
-
- return (error);
-}
-
-/*
- * relookup for DELETE namei operation.
- *
- * dvp is unionfs vnode. dvp should be locked.
- */
-int
-unionfs_relookup_for_delete(struct vnode *dvp, struct componentname *cnp,
- struct thread *td)
-{
- struct vnode *udvp;
- struct vnode *vp;
- struct componentname cn;
- int error;
-
- udvp = UNIONFSVPTOUPPERVP(dvp);
- vp = NULLVP;
-
- error = unionfs_relookup(udvp, &vp, cnp, &cn, td, cnp->cn_nameptr,
- cnp->cn_namelen, DELETE);
- if (error)
- return (error);
-
- if (vp == NULLVP)
- error = ENOENT;
- else {
- if (udvp == vp)
- vrele(vp);
- else
- vput(vp);
- }
-
- return (error);
-}
-
-/*
- * relookup for RENAME namei operation.
- *
- * dvp is unionfs vnode. dvp should be locked.
- */
-int
-unionfs_relookup_for_rename(struct vnode *dvp, struct componentname *cnp,
- struct thread *td)
-{
- struct vnode *udvp;
- struct vnode *vp;
- struct componentname cn;
- int error;
-
- udvp = UNIONFSVPTOUPPERVP(dvp);
- vp = NULLVP;
-
- error = unionfs_relookup(udvp, &vp, cnp, &cn, td, cnp->cn_nameptr,
- cnp->cn_namelen, RENAME);
- if (error)
- return (error);
-
- if (vp != NULLVP) {
- if (udvp == vp)
- vrele(vp);
- else
- vput(vp);
- }
-
- return (error);
-}
-
-/*
* Update the unionfs_node.
*
* uvp is new locked upper vnode. unionfs vnode's lock will be exchanged to the
@@ -836,6 +788,8 @@ unionfs_node_update(struct unionfs_node *unp, struct vnode *uvp,
vp->v_vnlock = uvp->v_vnlock;
VI_UNLOCK(vp);
+ for (count = 0; count < lockrec + 1; count++)
+ VOP_UNLOCK(lvp);
/*
* Re-cache the unionfs vnode against the upper vnode
*/
@@ -851,18 +805,87 @@ unionfs_node_update(struct unionfs_node *unp, struct vnode *uvp,
}
/*
+ * Mark a unionfs operation as being in progress, sleeping if the
+ * same operation is already in progress.
+ * This is useful, for example, during copy-up operations in which
+ * we may drop the target vnode lock, but we want to avoid the
+ * possibility of a concurrent copy-up on the same vnode triggering
+ * a spurious failure.
+ */
+int
+unionfs_set_in_progress_flag(struct vnode *vp, unsigned int flag)
+{
+ struct unionfs_node *unp;
+ int error;
+
+ error = 0;
+ ASSERT_VOP_ELOCKED(vp, __func__);
+ VI_LOCK(vp);
+ unp = VTOUNIONFS(vp);
+ while (error == 0 && (unp->un_flag & flag) != 0) {
+ VOP_UNLOCK(vp);
+ error = msleep(vp, VI_MTX(vp), PCATCH | PDROP, "unioncp", 0);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+ VI_LOCK(vp);
+ if (error == 0) {
+ /*
+ * If we waited on a concurrent copy-up and that
+ * copy-up was successful, return a non-fatal
+ * indication that the desired operation is already
+ * complete. If we waited on a concurrent lookup,
+ * return ERELOOKUP to indicate the VFS cache should
+ * be re-queried to avoid creating a duplicate unionfs
+ * vnode.
+ */
+ unp = VTOUNIONFS(vp);
+ if (unp == NULL)
+ error = ENOENT;
+ else if (flag == UNIONFS_COPY_IN_PROGRESS &&
+ unp->un_uppervp != NULLVP)
+ error = EJUSTRETURN;
+ else if (flag == UNIONFS_LOOKUP_IN_PROGRESS)
+ error = ERELOOKUP;
+ }
+ }
+ if (error == 0)
+ unp->un_flag |= flag;
+ VI_UNLOCK(vp);
+
+ return (error);
+}
+
+void
+unionfs_clear_in_progress_flag(struct vnode *vp, unsigned int flag)
+{
+ struct unionfs_node *unp;
+
+ ASSERT_VOP_ELOCKED(vp, __func__);
+ unp = VTOUNIONFS(vp);
+ VI_LOCK(vp);
+ if (unp != NULL) {
+ VNASSERT((unp->un_flag & flag) != 0, vp,
+ ("%s: copy not in progress", __func__));
+ unp->un_flag &= ~flag;
+ }
+ wakeup(vp);
+ VI_UNLOCK(vp);
+}
+
+/*
* Create a new shadow dir.
*
- * udvp should be locked on entry and will be locked on return.
+ * dvp and vp are unionfs vnodes representing a parent directory and
+ * child file; both should be locked on entry and will be locked on return.
*
* If no error returned, unp will be updated.
*/
int
-unionfs_mkshadowdir(struct unionfs_mount *ump, struct vnode *udvp,
- struct unionfs_node *unp, struct componentname *cnp, struct thread *td)
+unionfs_mkshadowdir(struct vnode *dvp, struct vnode *vp,
+ struct componentname *cnp, struct thread *td)
{
struct vnode *lvp;
struct vnode *uvp;
+ struct vnode *udvp;
struct vattr va;
struct vattr lva;
struct nameidata nd;
@@ -870,10 +893,25 @@ unionfs_mkshadowdir(struct unionfs_mount *ump, struct vnode *udvp,
struct ucred *cred;
struct ucred *credbk;
struct uidinfo *rootinfo;
+ struct unionfs_mount *ump;
+ struct unionfs_node *dunp;
+ struct unionfs_node *unp;
int error;
+ ASSERT_VOP_ELOCKED(dvp, __func__);
+ ASSERT_VOP_ELOCKED(vp, __func__);
+ ump = MOUNTTOUNIONFSMOUNT(vp->v_mount);
+ unp = VTOUNIONFS(vp);
if (unp->un_uppervp != NULLVP)
return (EEXIST);
+ dunp = VTOUNIONFS(dvp);
+ udvp = dunp->un_uppervp;
+
+ error = unionfs_set_in_progress_flag(vp, UNIONFS_COPY_IN_PROGRESS);
+ if (error == EJUSTRETURN)
+ return (0);
+ else if (error != 0)
+ return (error);
lvp = unp->un_lowervp;
uvp = NULLVP;
@@ -882,11 +920,6 @@ unionfs_mkshadowdir(struct unionfs_mount *ump, struct vnode *udvp,
/* Authority change to root */
rootinfo = uifind((uid_t)0);
cred = crdup(cnp->cn_cred);
- /*
- * The calls to chgproccnt() are needed to compensate for change_ruid()
- * calling chgproccnt().
- */
- chgproccnt(cred->cr_ruidinfo, 1, 0);
change_euid(cred, rootinfo);
change_ruid(cred, rootinfo);
change_svuid(cred, (uid_t)0);
@@ -897,11 +930,29 @@ unionfs_mkshadowdir(struct unionfs_mount *ump, struct vnode *udvp,
NDPREINIT(&nd);
if ((error = VOP_GETATTR(lvp, &lva, cnp->cn_cred)))
- goto unionfs_mkshadowdir_abort;
+ goto unionfs_mkshadowdir_finish;
+ vref(udvp);
+ VOP_UNLOCK(vp);
if ((error = unionfs_relookup(udvp, &uvp, cnp, &nd.ni_cnd, td,
- cnp->cn_nameptr, cnp->cn_namelen, CREATE)))
- goto unionfs_mkshadowdir_abort;
+ cnp->cn_nameptr, cnp->cn_namelen, CREATE))) {
+ /*
+ * When handling error cases here, we drop udvp's lock and
+ * then jump to exit code that relocks dvp, which in most
+ * cases will effectively relock udvp. However, this is
+ * not guaranteed to be the case, as various calls made
+ * here (such as unionfs_relookup() above and VOP_MKDIR()
+ * below) may unlock and then relock udvp, allowing dvp to
+ * be reclaimed in the meantime. In such a situation dvp
+ * will no longer share its lock with udvp. Since
+ * performance isn't a concern for these error cases, it
+ * makes more sense to reuse the common code that locks
+ * dvp on exit than to explicitly check for reclamation
+ * of dvp.
+ */
+ vput(udvp);
+ goto unionfs_mkshadowdir_relock;
+ }
if (uvp != NULLVP) {
if (udvp == uvp)
vrele(uvp);
@@ -909,11 +960,14 @@ unionfs_mkshadowdir(struct unionfs_mount *ump, struct vnode *udvp,
vput(uvp);
error = EEXIST;
- goto unionfs_mkshadowdir_abort;
+ vput(udvp);
+ goto unionfs_mkshadowdir_relock;
}
- if ((error = vn_start_write(udvp, &mp, V_WAIT | V_PCATCH)))
- goto unionfs_mkshadowdir_abort;
+ if ((error = vn_start_write(udvp, &mp, V_WAIT | V_PCATCH))) {
+ vput(udvp);
+ goto unionfs_mkshadowdir_relock;
+ }
unionfs_create_uppervattr_core(ump, &lva, &va, td);
/*
@@ -924,7 +978,7 @@ unionfs_mkshadowdir(struct unionfs_mount *ump, struct vnode *udvp,
* component. This *should* be fine, as cn_namelen will still
* correctly indicate the length of only the current component,
* but ZFS in particular does not respect cn_namelen in its VOP_MKDIR
- * implementation
+ * implementation.
* Note that this assumes nd.ni_cnd.cn_pnbuf was allocated by
* something like a local namei() operation and the temporary
* NUL-termination will not have an effect on other threads.
@@ -934,29 +988,59 @@ unionfs_mkshadowdir(struct unionfs_mount *ump, struct vnode *udvp,
*pathend = '\0';
error = VOP_MKDIR(udvp, &uvp, &nd.ni_cnd, &va);
*pathend = pathterm;
-
- if (!error) {
- /*
- * XXX The bug which cannot set uid/gid was corrected.
- * Ignore errors.
- */
- va.va_type = VNON;
- VOP_SETATTR(uvp, &va, nd.ni_cnd.cn_cred);
-
+ if (error != 0) {
/*
- * VOP_SETATTR() may transiently drop uvp's lock, so it's
- * important to call it before unionfs_node_update() transfers
- * the unionfs vnode's lock from lvp to uvp; otherwise the
- * unionfs vnode itself would be transiently unlocked and
- * potentially doomed.
+ * See the comment after unionfs_relookup() above for an
+ * explanation of why we unlock udvp here only to relock
+ * dvp on exit.
*/
- unionfs_node_update(unp, uvp, td);
+ vput(udvp);
+ vn_finished_write(mp);
+ goto unionfs_mkshadowdir_relock;
}
+
+ /*
+ * XXX The bug which cannot set uid/gid was corrected.
+ * Ignore errors.
+ */
+ va.va_type = VNON;
+ /*
+ * VOP_SETATTR() may transiently drop uvp's lock, so it's
+ * important to call it before unionfs_node_update() transfers
+ * the unionfs vnode's lock from lvp to uvp; otherwise the
+ * unionfs vnode itself would be transiently unlocked and
+ * potentially doomed.
+ */
+ VOP_SETATTR(uvp, &va, nd.ni_cnd.cn_cred);
+
+ /*
+ * uvp may become doomed during VOP_VPUT_PAIR() if the implementation
+ * must temporarily drop uvp's lock. However, since we hold a
+ * reference to uvp from the VOP_MKDIR() call above, this would require
+ * a forcible unmount of uvp's filesystem, which in turn can only
+ * happen if our unionfs instance is first forcibly unmounted. We'll
+ * therefore catch this case in the NULL check of unp below.
+ */
+ VOP_VPUT_PAIR(udvp, &uvp, false);
vn_finished_write(mp);
+ vn_lock_pair(vp, false, LK_EXCLUSIVE, uvp, true, LK_EXCLUSIVE);
+ unp = VTOUNIONFS(vp);
+ if (unp == NULL) {
+ vput(uvp);
+ error = ENOENT;
+ } else
+ unionfs_node_update(unp, uvp, td);
+ VOP_UNLOCK(vp);
+
+unionfs_mkshadowdir_relock:
+ vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+ if (error == 0 && (VN_IS_DOOMED(dvp) || VN_IS_DOOMED(vp)))
+ error = ENOENT;
-unionfs_mkshadowdir_abort:
+unionfs_mkshadowdir_finish:
+ unionfs_clear_in_progress_flag(vp, UNIONFS_COPY_IN_PROGRESS);
cnp->cn_cred = credbk;
- chgproccnt(cred->cr_ruidinfo, -1, 0);
crfree(cred);
return (error);
@@ -1116,23 +1200,31 @@ unionfs_forward_vop_finish_pair(
/*
* Create a new whiteout.
*
- * udvp and dvp should be locked on entry and will be locked on return.
+ * dvp and vp are unionfs vnodes representing a parent directory and
+ * child file; both should be locked on entry and will be locked on return.
*/
int
-unionfs_mkwhiteout(struct vnode *dvp, struct vnode *udvp,
+unionfs_mkwhiteout(struct vnode *dvp, struct vnode *vp,
struct componentname *cnp, struct thread *td, char *path, int pathlen)
{
+ struct vnode *udvp;
struct vnode *wvp;
struct nameidata nd;
struct mount *mp;
int error;
- int lkflags;
+ bool dvp_locked;
+
+ ASSERT_VOP_ELOCKED(dvp, __func__);
+ ASSERT_VOP_ELOCKED(vp, __func__);
+ udvp = VTOUNIONFS(dvp)->un_uppervp;
wvp = NULLVP;
NDPREINIT(&nd);
+ vref(udvp);
+ VOP_UNLOCK(vp);
if ((error = unionfs_relookup(udvp, &wvp, cnp, &nd.ni_cnd, td, path,
pathlen, CREATE))) {
- return (error);
+ goto unionfs_mkwhiteout_cleanup;
}
if (wvp != NULLVP) {
if (udvp == wvp)
@@ -1140,18 +1232,27 @@ unionfs_mkwhiteout(struct vnode *dvp, struct vnode *udvp,
else
vput(wvp);
- return (EEXIST);
+ if (nd.ni_cnd.cn_flags & ISWHITEOUT)
+ error = 0;
+ else
+ error = EEXIST;
+ goto unionfs_mkwhiteout_cleanup;
}
if ((error = vn_start_write(udvp, &mp, V_WAIT | V_PCATCH)))
- goto unionfs_mkwhiteout_free_out;
- unionfs_forward_vop_start(udvp, &lkflags);
+ goto unionfs_mkwhiteout_cleanup;
error = VOP_WHITEOUT(udvp, &nd.ni_cnd, CREATE);
- unionfs_forward_vop_finish(dvp, udvp, lkflags);
-
vn_finished_write(mp);
-unionfs_mkwhiteout_free_out:
+unionfs_mkwhiteout_cleanup:
+ if (VTOUNIONFS(dvp) == NULL) {
+ vput(udvp);
+ dvp_locked = false;
+ } else {
+ vrele(udvp);
+ dvp_locked = true;
+ }
+ vn_lock_pair(dvp, dvp_locked, LK_EXCLUSIVE, vp, false, LK_EXCLUSIVE);
return (error);
}
@@ -1165,10 +1266,11 @@ unionfs_mkwhiteout_free_out:
*/
static int
unionfs_vn_create_on_upper(struct vnode **vpp, struct vnode *udvp,
- struct unionfs_node *unp, struct vattr *uvap, struct thread *td)
+ struct vnode *vp, struct vattr *uvap, struct thread *td)
{
struct unionfs_mount *ump;
- struct vnode *vp;
+ struct unionfs_node *unp;
+ struct vnode *uvp;
struct vnode *lvp;
struct ucred *cred;
struct vattr lva;
@@ -1176,8 +1278,10 @@ unionfs_vn_create_on_upper(struct vnode **vpp, struct vnode *udvp,
int fmode;
int error;
+ ASSERT_VOP_ELOCKED(vp, __func__);
+ unp = VTOUNIONFS(vp);
ump = MOUNTTOUNIONFSMOUNT(UNIONFSTOV(unp)->v_mount);
- vp = NULLVP;
+ uvp = NULLVP;
lvp = unp->un_lowervp;
cred = td->td_ucred;
fmode = FFLAGS(O_WRONLY | O_CREAT | O_TRUNC | O_EXCL);
@@ -1200,42 +1304,39 @@ unionfs_vn_create_on_upper(struct vnode **vpp, struct vnode *udvp,
NDPREINIT(&nd);
vref(udvp);
- if ((error = vfs_relookup(udvp, &vp, &nd.ni_cnd, false)) != 0)
- goto unionfs_vn_create_on_upper_free_out2;
- vrele(udvp);
+ VOP_UNLOCK(vp);
+ if ((error = vfs_relookup(udvp, &uvp, &nd.ni_cnd, false)) != 0) {
+ vrele(udvp);
+ return (error);
+ }
- if (vp != NULLVP) {
- if (vp == udvp)
- vrele(vp);
+ if (uvp != NULLVP) {
+ if (uvp == udvp)
+ vrele(uvp);
else
- vput(vp);
+ vput(uvp);
error = EEXIST;
- goto unionfs_vn_create_on_upper_free_out1;
+ goto unionfs_vn_create_on_upper_cleanup;
}
- if ((error = VOP_CREATE(udvp, &vp, &nd.ni_cnd, uvap)) != 0)
- goto unionfs_vn_create_on_upper_free_out1;
+ if ((error = VOP_CREATE(udvp, &uvp, &nd.ni_cnd, uvap)) != 0)
+ goto unionfs_vn_create_on_upper_cleanup;
- if ((error = VOP_OPEN(vp, fmode, cred, td, NULL)) != 0) {
- vput(vp);
- goto unionfs_vn_create_on_upper_free_out1;
+ if ((error = VOP_OPEN(uvp, fmode, cred, td, NULL)) != 0) {
+ vput(uvp);
+ goto unionfs_vn_create_on_upper_cleanup;
}
- error = VOP_ADD_WRITECOUNT(vp, 1);
+ error = VOP_ADD_WRITECOUNT(uvp, 1);
CTR3(KTR_VFS, "%s: vp %p v_writecount increased to %d",
- __func__, vp, vp->v_writecount);
+ __func__, uvp, uvp->v_writecount);
if (error == 0) {
- *vpp = vp;
+ *vpp = uvp;
} else {
- VOP_CLOSE(vp, fmode, cred, td);
+ VOP_CLOSE(uvp, fmode, cred, td);
}
-unionfs_vn_create_on_upper_free_out1:
- VOP_UNLOCK(udvp);
-
-unionfs_vn_create_on_upper_free_out2:
- KASSERT(nd.ni_cnd.cn_pnbuf == unp->un_path,
- ("%s: cn_pnbuf changed", __func__));
-
+unionfs_vn_create_on_upper_cleanup:
+ vput(udvp);
return (error);
}
@@ -1310,13 +1411,18 @@ unionfs_copyfile_core(struct vnode *lvp, struct vnode *uvp,
*
* If you need copy of the contents, set 1 to docopy. Otherwise, set 0 to
* docopy.
+ *
+ * vp is a unionfs vnode that should be locked on entry and will be
+ * locked on return.
*
* If no error returned, unp will be updated.
*/
int
-unionfs_copyfile(struct unionfs_node *unp, int docopy, struct ucred *cred,
+unionfs_copyfile(struct vnode *vp, int docopy, struct ucred *cred,
struct thread *td)
{
+ struct unionfs_node *unp;
+ struct unionfs_node *dunp;
struct mount *mp;
struct vnode *udvp;
struct vnode *lvp;
@@ -1324,6 +1430,8 @@ unionfs_copyfile(struct unionfs_node *unp, int docopy, struct ucred *cred,
struct vattr uva;
int error;
+ ASSERT_VOP_ELOCKED(vp, __func__);
+ unp = VTOUNIONFS(vp);
lvp = unp->un_lowervp;
uvp = NULLVP;
@@ -1333,22 +1441,51 @@ unionfs_copyfile(struct unionfs_node *unp, int docopy, struct ucred *cred,
return (EINVAL);
if (unp->un_uppervp != NULLVP)
return (EEXIST);
- udvp = VTOUNIONFS(unp->un_dvp)->un_uppervp;
+
+ udvp = NULLVP;
+ VI_LOCK(unp->un_dvp);
+ dunp = VTOUNIONFS(unp->un_dvp);
+ if (dunp != NULL)
+ udvp = dunp->un_uppervp;
+ VI_UNLOCK(unp->un_dvp);
+
if (udvp == NULLVP)
return (EROFS);
if ((udvp->v_mount->mnt_flag & MNT_RDONLY))
return (EROFS);
+ ASSERT_VOP_UNLOCKED(udvp, __func__);
+
+ error = unionfs_set_in_progress_flag(vp, UNIONFS_COPY_IN_PROGRESS);
+ if (error == EJUSTRETURN)
+ return (0);
+ else if (error != 0)
+ return (error);
error = VOP_ACCESS(lvp, VREAD, cred, td);
if (error != 0)
- return (error);
+ goto unionfs_copyfile_cleanup;
if ((error = vn_start_write(udvp, &mp, V_WAIT | V_PCATCH)) != 0)
- return (error);
- error = unionfs_vn_create_on_upper(&uvp, udvp, unp, &uva, td);
+ goto unionfs_copyfile_cleanup;
+ error = unionfs_vn_create_on_upper(&uvp, udvp, vp, &uva, td);
if (error != 0) {
vn_finished_write(mp);
- return (error);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+ goto unionfs_copyfile_cleanup;
+ }
+
+ /*
+ * Note that it's still possible for e.g. VOP_WRITE to relock
+ * uvp below while holding vp[=lvp] locked. Replacing
+ * unionfs_copyfile_core with vn_generic_copy_file_range() will
+ * allow us to avoid the problem by moving this vn_lock_pair()
+ * call much later.
+ */
+ vn_lock_pair(vp, false, LK_EXCLUSIVE, uvp, true, LK_EXCLUSIVE);
+ unp = VTOUNIONFS(vp);
+ if (unp == NULL) {
+ error = ENOENT;
+ goto unionfs_copyfile_cleanup;
}
if (docopy != 0) {
@@ -1369,18 +1506,30 @@ unionfs_copyfile(struct unionfs_node *unp, int docopy, struct ucred *cred,
/* Reset the attributes. Ignore errors. */
uva.va_type = VNON;
VOP_SETATTR(uvp, &uva, cred);
+ unionfs_node_update(unp, uvp, td);
}
- unionfs_node_update(unp, uvp, td);
-
+unionfs_copyfile_cleanup:
+ unionfs_clear_in_progress_flag(vp, UNIONFS_COPY_IN_PROGRESS);
return (error);
}
/*
- * It checks whether vp can rmdir. (check empty)
+ * Determine if the unionfs view of a directory is empty such that
+ * an rmdir operation can be permitted.
+ *
+ * We assume the VOP_RMDIR() against the upper layer vnode will take
+ * care of this check for us where the upper FS is concerned, so here
+ * we concentrate on the lower FS. We need to check for the presence
+ * of files other than "." and ".." in the lower FS directory and
+ * then cross-check any files we find against the upper FS to see if
+ * a whiteout is present (in which case we treat the lower file as
+ * non-present).
+ *
+ * The logic here is based heavily on vn_dir_check_empty().
*
- * vp is unionfs vnode.
- * vp should be locked.
+ * vp should be a locked unionfs node, and vp's lowervp should also be
+ * locked.
*/
int
unionfs_check_rmdir(struct vnode *vp, struct ucred *cred, struct thread *td)
@@ -1388,115 +1537,127 @@ unionfs_check_rmdir(struct vnode *vp, struct ucred *cred, struct thread *td)
struct vnode *uvp;
struct vnode *lvp;
struct vnode *tvp;
+ char *dirbuf;
+ size_t dirbuflen, len;
+ off_t off;
struct dirent *dp;
- struct dirent *edp;
struct componentname cn;
- struct iovec iov;
- struct uio uio;
struct vattr va;
int error;
int eofflag;
- int lookuperr;
-
- /*
- * The size of buf needs to be larger than DIRBLKSIZ.
- */
- char buf[256 * 6];
-
- ASSERT_VOP_ELOCKED(vp, __func__);
eofflag = 0;
- uvp = UNIONFSVPTOUPPERVP(vp);
lvp = UNIONFSVPTOLOWERVP(vp);
+ uvp = UNIONFSVPTOUPPERVP(vp);
+
+ /*
+ * Note that the locking here still isn't ideal: We expect the caller
+ * to hold both the upper and lower layer locks as well as the upper
+ * parent directory lock, which it can do in a manner that avoids
+ * deadlock. However, if the cross-check logic below needs to call
+ * VOP_LOOKUP(), that may relock the upper vnode and lock any found
+ * child vnode in a way that doesn't protect against deadlock given
+ * the other held locks. Beyond that, the various other VOPs we issue
+ * below, such as VOP_OPEN() and VOP_READDIR(), may also re-lock the
+ * lower vnode.
+ * We might instead just hand off between the upper vnode lock
+ * (and its parent directory lock) and the lower vnode lock as needed,
+ * so that the lower lock is never held at the same time as the upper
+ * locks, but that opens up a wider window in which the upper
+ * directory (and also the lower directory if it isn't truly
+ * read-only) may change while the relevant lock is dropped. But
+ * since re-locking may happen here and open up such a window anyway,
+ * perhaps that is a worthwhile tradeoff? Or perhaps we can ultimately
+ * do sufficient tracking of empty state within the unionfs vnode
+ * (in conjunction with upcalls from the lower FSes to notify us
+ * of out-of-band state changes) that we can avoid these costly checks
+ * altogether.
+ */
+ ASSERT_VOP_LOCKED(lvp, __func__);
+ ASSERT_VOP_ELOCKED(uvp, __func__);
- /* check opaque */
if ((error = VOP_GETATTR(uvp, &va, cred)) != 0)
return (error);
if (va.va_flags & OPAQUE)
return (0);
- /* open vnode */
#ifdef MAC
- if ((error = mac_vnode_check_open(cred, vp, VEXEC|VREAD)) != 0)
+ if ((error = mac_vnode_check_open(cred, lvp, VEXEC | VREAD)) != 0)
return (error);
#endif
- if ((error = VOP_ACCESS(vp, VEXEC|VREAD, cred, td)) != 0)
+ if ((error = VOP_ACCESS(lvp, VEXEC | VREAD, cred, td)) != 0)
+ return (error);
+ if ((error = VOP_OPEN(lvp, FREAD, cred, td, NULL)) != 0)
return (error);
- if ((error = VOP_OPEN(vp, FREAD, cred, td, NULL)) != 0)
+ if ((error = VOP_GETATTR(lvp, &va, cred)) != 0)
return (error);
- uio.uio_rw = UIO_READ;
- uio.uio_segflg = UIO_SYSSPACE;
- uio.uio_td = td;
- uio.uio_offset = 0;
+ dirbuflen = max(DEV_BSIZE, GENERIC_MAXDIRSIZ);
+ if (dirbuflen < va.va_blocksize)
+ dirbuflen = va.va_blocksize;
+ dirbuf = malloc(dirbuflen, M_TEMP, M_WAITOK);
-#ifdef MAC
- error = mac_vnode_check_readdir(td->td_ucred, lvp);
-#endif
- while (!error && !eofflag) {
- iov.iov_base = buf;
- iov.iov_len = sizeof(buf);
- uio.uio_iov = &iov;
- uio.uio_iovcnt = 1;
- uio.uio_resid = iov.iov_len;
+ len = 0;
+ off = 0;
+ eofflag = 0;
- error = VOP_READDIR(lvp, &uio, cred, &eofflag, NULL, NULL);
+ for (;;) {
+ error = vn_dir_next_dirent(lvp, td, dirbuf, dirbuflen,
+ &dp, &len, &off, &eofflag);
if (error != 0)
break;
- KASSERT(eofflag != 0 || uio.uio_resid < sizeof(buf),
- ("%s: empty read from lower FS", __func__));
-
- edp = (struct dirent*)&buf[sizeof(buf) - uio.uio_resid];
- for (dp = (struct dirent*)buf; !error && dp < edp;
- dp = (struct dirent*)((caddr_t)dp + dp->d_reclen)) {
- if (dp->d_type == DT_WHT || dp->d_fileno == 0 ||
- (dp->d_namlen == 1 && dp->d_name[0] == '.') ||
- (dp->d_namlen == 2 && !bcmp(dp->d_name, "..", 2)))
- continue;
-
- cn.cn_namelen = dp->d_namlen;
- cn.cn_pnbuf = NULL;
- cn.cn_nameptr = dp->d_name;
- cn.cn_nameiop = LOOKUP;
- cn.cn_flags = LOCKPARENT | LOCKLEAF | RDONLY | ISLASTCN;
- cn.cn_lkflags = LK_EXCLUSIVE;
- cn.cn_cred = cred;
-
- /*
- * check entry in lower.
- * Sometimes, readdir function returns
- * wrong entry.
- */
- lookuperr = VOP_LOOKUP(lvp, &tvp, &cn);
- if (!lookuperr)
- vput(tvp);
- else
- continue; /* skip entry */
-
- /*
- * check entry
- * If it has no exist/whiteout entry in upper,
- * directory is not empty.
- */
- cn.cn_flags = LOCKPARENT | LOCKLEAF | RDONLY | ISLASTCN;
- lookuperr = VOP_LOOKUP(uvp, &tvp, &cn);
+ if (len == 0) {
+ /* EOF */
+ error = 0;
+ break;
+ }
- if (!lookuperr)
- vput(tvp);
+ if (dp->d_type == DT_WHT)
+ continue;
- /* ignore exist or whiteout entry */
- if (!lookuperr ||
- (lookuperr == ENOENT && (cn.cn_flags & ISWHITEOUT)))
- continue;
+ /*
+ * Any file in the directory which is not '.' or '..' indicates
+ * the directory is not empty.
+ */
+ switch (dp->d_namlen) {
+ case 2:
+ if (dp->d_name[1] != '.') {
+ /* Can't be '..' (nor '.') */
+ break;
+ }
+ /* FALLTHROUGH */
+ case 1:
+ if (dp->d_name[0] != '.') {
+ /* Can't be '..' nor '.' */
+ break;
+ }
+ continue;
+ default:
+ break;
+ }
+ cn.cn_namelen = dp->d_namlen;
+ cn.cn_pnbuf = NULL;
+ cn.cn_nameptr = dp->d_name;
+ cn.cn_nameiop = LOOKUP;
+ cn.cn_flags = LOCKPARENT | LOCKLEAF | RDONLY | ISLASTCN;
+ cn.cn_lkflags = LK_EXCLUSIVE;
+ cn.cn_cred = cred;
+
+ error = VOP_LOOKUP(uvp, &tvp, &cn);
+ if (tvp != NULLVP)
+ vput(tvp);
+ if (error != 0 && error != ENOENT && error != EJUSTRETURN)
+ break;
+ else if ((cn.cn_flags & ISWHITEOUT) == 0) {
error = ENOTEMPTY;
- }
+ break;
+ } else
+ error = 0;
}
- /* close vnode */
- VOP_CLOSE(vp, FREAD, cred, td);
-
+ VOP_CLOSE(lvp, FREAD, cred, td);
+ free(dirbuf, M_TEMP);
return (error);
}
-