From b363d3d52a4977fa6e3856bb88fca1224e043234 Mon Sep 17 00:00:00 2001
From: Matt Macy
Date: Thu, 7 Jan 2021 16:03:48 -0800
Subject: OpenZFS: Fix issues caused by reversed commits + rebase

---
 .../openzfs/module/os/linux/zfs/zfs_vnops.c    | 4000 --------------------
 .../openzfs/module/os/linux/zfs/zfs_vnops_os.c | 4000 ++++++++++++++++++++
 2 files changed, 4000 insertions(+), 4000 deletions(-)
 delete mode 100644 sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops.c
 create mode 100644 sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c

(limited to 'sys/contrib/openzfs/module/os/linux')

diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops.c
deleted file mode 100644
index 3be387a30e5c..000000000000
--- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops.c
+++ /dev/null
@@ -1,4000 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
- * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
- * Copyright 2017 Nexenta Systems, Inc.
- */
-
-/* Portions Copyright 2007 Jeremy Teo */
-/* Portions Copyright 2010 Robert Milkowski */
-
-
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-/*
- * Programming rules.
- *
- * Each vnode op performs some logical unit of work. To do this, the ZPL must
- * properly lock its in-core state, create a DMU transaction, do the work,
- * record this work in the intent log (ZIL), commit the DMU transaction,
- * and wait for the intent log to commit if it is a synchronous operation.
- * Moreover, the vnode ops must work in both normal and log replay context.
- * The ordering of events is important to avoid deadlocks and references
- * to freed memory. The example below illustrates the following Big Rules:
- *
- * (1) A check must be made in each zfs thread for a mounted file system.
- *     This is done, while avoiding races, by using ZFS_ENTER(zfsvfs).
- *     A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes
- *     must be checked with ZFS_VERIFY_ZP(zp). Both of these macros
- *     can return EIO from the calling function.
- *
- * (2) zrele() should always be the last thing except for zil_commit()
- *     (if necessary) and ZFS_EXIT().
This is for 3 reasons: - * First, if it's the last reference, the vnode/znode - * can be freed, so the zp may point to freed memory. Second, the last - * reference will call zfs_zinactive(), which may induce a lot of work -- - * pushing cached pages (which acquires range locks) and syncing out - * cached atime changes. Third, zfs_zinactive() may require a new tx, - * which could deadlock the system if you were already holding one. - * If you must call zrele() within a tx then use zfs_zrele_async(). - * - * (3) All range locks must be grabbed before calling dmu_tx_assign(), - * as they can span dmu_tx_assign() calls. - * - * (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to - * dmu_tx_assign(). This is critical because we don't want to block - * while holding locks. - * - * If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT. This - * reduces lock contention and CPU usage when we must wait (note that if - * throughput is constrained by the storage, nearly every transaction - * must wait). - * - * Note, in particular, that if a lock is sometimes acquired before - * the tx assigns, and sometimes after (e.g. z_lock), then failing - * to use a non-blocking assign can deadlock the system. The scenario: - * - * Thread A has grabbed a lock before calling dmu_tx_assign(). - * Thread B is in an already-assigned tx, and blocks for this lock. - * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open() - * forever, because the previous txg can't quiesce until B's tx commits. - * - * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT, - * then drop all locks, call dmu_tx_wait(), and try again. On subsequent - * calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT, - * to indicate that this operation has already called dmu_tx_wait(). - * This will ensure that we don't retry forever, waiting a short bit - * each time. - * - * (5) If the operation succeeded, generate the intent log entry for it - * before dropping locks. This ensures that the ordering of events - * in the intent log matches the order in which they actually occurred. - * During ZIL replay the zfs_log_* functions will update the sequence - * number to indicate the zil transaction has replayed. - * - * (6) At the end of each vnode op, the DMU tx must always commit, - * regardless of whether there were any errors. - * - * (7) After dropping all locks, invoke zil_commit(zilog, foid) - * to ensure that synchronous semantics are provided when necessary. - * - * In general, this is how things should be ordered in each vnode op: - * - * ZFS_ENTER(zfsvfs); // exit if unmounted - * top: - * zfs_dirent_lock(&dl, ...) // lock directory entry (may igrab()) - * rw_enter(...); // grab any other locks you need - * tx = dmu_tx_create(...); // get DMU tx - * dmu_tx_hold_*(); // hold each object you might modify - * error = dmu_tx_assign(tx, (waited ? 
TXG_NOTHROTTLE : 0) | TXG_NOWAIT); - * if (error) { - * rw_exit(...); // drop locks - * zfs_dirent_unlock(dl); // unlock directory entry - * zrele(...); // release held znodes - * if (error == ERESTART) { - * waited = B_TRUE; - * dmu_tx_wait(tx); - * dmu_tx_abort(tx); - * goto top; - * } - * dmu_tx_abort(tx); // abort DMU tx - * ZFS_EXIT(zfsvfs); // finished in zfs - * return (error); // really out of space - * } - * error = do_real_work(); // do whatever this VOP does - * if (error == 0) - * zfs_log_*(...); // on success, make ZIL entry - * dmu_tx_commit(tx); // commit DMU tx -- error or not - * rw_exit(...); // drop locks - * zfs_dirent_unlock(dl); // unlock directory entry - * zrele(...); // release held znodes - * zil_commit(zilog, foid); // synchronous when necessary - * ZFS_EXIT(zfsvfs); // finished in zfs - * return (error); // done, report error - */ - -/* - * Virus scanning is unsupported. It would be possible to add a hook - * here to perform the required virus scan. This could be done - * entirely in the kernel or potentially as an update to invoke a - * scanning utility. - */ -static int -zfs_vscan(struct inode *ip, cred_t *cr, int async) -{ - return (0); -} - -/* ARGSUSED */ -int -zfs_open(struct inode *ip, int mode, int flag, cred_t *cr) -{ - znode_t *zp = ITOZ(ip); - zfsvfs_t *zfsvfs = ITOZSB(ip); - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - /* Honor ZFS_APPENDONLY file attribute */ - if ((mode & FMODE_WRITE) && (zp->z_pflags & ZFS_APPENDONLY) && - ((flag & O_APPEND) == 0)) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EPERM)); - } - - /* Virus scan eligible files on open */ - if (!zfs_has_ctldir(zp) && zfsvfs->z_vscan && S_ISREG(ip->i_mode) && - !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) { - if (zfs_vscan(ip, cr, 0) != 0) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EACCES)); - } - } - - /* Keep a count of the synchronous opens in the znode */ - if (flag & O_SYNC) - atomic_inc_32(&zp->z_sync_cnt); - - ZFS_EXIT(zfsvfs); - return (0); -} - -/* ARGSUSED */ -int -zfs_close(struct inode *ip, int flag, cred_t *cr) -{ - znode_t *zp = ITOZ(ip); - zfsvfs_t *zfsvfs = ITOZSB(ip); - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - /* Decrement the synchronous opens in the znode */ - if (flag & O_SYNC) - atomic_dec_32(&zp->z_sync_cnt); - - if (!zfs_has_ctldir(zp) && zfsvfs->z_vscan && S_ISREG(ip->i_mode) && - !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) - VERIFY(zfs_vscan(ip, cr, 1) == 0); - - ZFS_EXIT(zfsvfs); - return (0); -} - -#if defined(_KERNEL) -/* - * When a file is memory mapped, we must keep the IO data synchronized - * between the DMU cache and the memory mapped pages. What this means: - * - * On Write: If we find a memory mapped page, we write to *both* - * the page and the dmu buffer.
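To make the dmu_tx_assign() retry protocol from the programming-rules comment above easier to follow in isolation, here is a minimal sketch; it is an editorial illustration, not code from this file, and example_op() with its single SA hold is hypothetical (ZFS_ENTER/ZFS_EXIT and lock handling are elided):

static int
example_op(zfsvfs_t *zfsvfs, znode_t *zp)
{
        dmu_tx_t *tx;
        boolean_t waited = B_FALSE;
        int error;

top:
        tx = dmu_tx_create(zfsvfs->z_os);
        dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
        error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
        if (error) {
                if (error == ERESTART) {
                        /*
                         * Throttled: drop any ZPL locks here, wait for
                         * the DMU, and retry. TXG_NOTHROTTLE records
                         * that this operation already waited once, so
                         * it is not throttled again on later attempts.
                         */
                        waited = B_TRUE;
                        dmu_tx_wait(tx);
                        dmu_tx_abort(tx);
                        goto top;
                }
                dmu_tx_abort(tx);       /* really out of space */
                return (error);
        }
        /* ... do the actual work, then zfs_log_*() on success ... */
        dmu_tx_commit(tx);      /* commit whether or not the work failed */
        return (0);
}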
- */ -void -update_pages(znode_t *zp, int64_t start, int len, objset_t *os) -{ - struct inode *ip = ZTOI(zp); - struct address_space *mp = ip->i_mapping; - struct page *pp; - uint64_t nbytes; - int64_t off; - void *pb; - - off = start & (PAGE_SIZE-1); - for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) { - nbytes = MIN(PAGE_SIZE - off, len); - - pp = find_lock_page(mp, start >> PAGE_SHIFT); - if (pp) { - if (mapping_writably_mapped(mp)) - flush_dcache_page(pp); - - pb = kmap(pp); - (void) dmu_read(os, zp->z_id, start + off, nbytes, - pb + off, DMU_READ_PREFETCH); - kunmap(pp); - - if (mapping_writably_mapped(mp)) - flush_dcache_page(pp); - - mark_page_accessed(pp); - SetPageUptodate(pp); - ClearPageError(pp); - unlock_page(pp); - put_page(pp); - } - - len -= nbytes; - off = 0; - } -} - -/* - * When a file is memory mapped, we must keep the IO data synchronized - * between the DMU cache and the memory mapped pages. What this means: - * - * On Read: We "read" preferentially from memory mapped pages, - * else we default from the dmu buffer. - * - * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when - * the file is memory mapped. - */ -int -mappedread(znode_t *zp, int nbytes, uio_t *uio) -{ - struct inode *ip = ZTOI(zp); - struct address_space *mp = ip->i_mapping; - struct page *pp; - int64_t start, off; - uint64_t bytes; - int len = nbytes; - int error = 0; - void *pb; - - start = uio->uio_loffset; - off = start & (PAGE_SIZE-1); - for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) { - bytes = MIN(PAGE_SIZE - off, len); - - pp = find_lock_page(mp, start >> PAGE_SHIFT); - if (pp) { - ASSERT(PageUptodate(pp)); - unlock_page(pp); - - pb = kmap(pp); - error = uiomove(pb + off, bytes, UIO_READ, uio); - kunmap(pp); - - if (mapping_writably_mapped(mp)) - flush_dcache_page(pp); - - mark_page_accessed(pp); - put_page(pp); - } else { - error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), - uio, bytes); - } - - len -= bytes; - off = 0; - if (error) - break; - } - return (error); -} -#endif /* _KERNEL */ - -unsigned long zfs_delete_blocks = DMU_MAX_DELETEBLKCNT; - -/* - * Write the bytes to a file. - * - * IN: zp - znode of file to be written to - * data - bytes to write - * len - number of bytes to write - * pos - offset to start writing at - * - * OUT: resid - remaining bytes to write - * - * RETURN: 0 if success - * positive error code if failure. EIO is returned - * for a short write when residp isn't provided. - * - * Timestamps: - * zp - ctime|mtime updated if byte count > 0 - */ -int -zfs_write_simple(znode_t *zp, const void *data, size_t len, - loff_t pos, size_t *residp) -{ - fstrans_cookie_t cookie; - int error; - - struct iovec iov; - iov.iov_base = (void *)data; - iov.iov_len = len; - - uio_t uio; - uio_iovec_init(&uio, &iov, 1, pos, UIO_SYSSPACE, len, 0); - - cookie = spl_fstrans_mark(); - error = zfs_write(zp, &uio, 0, kcred); - spl_fstrans_unmark(cookie); - - if (error == 0) { - if (residp != NULL) - *residp = uio_resid(&uio); - else if (uio_resid(&uio) != 0) - error = SET_ERROR(EIO); - } - - return (error); -} - -void -zfs_zrele_async(znode_t *zp) -{ - struct inode *ip = ZTOI(zp); - objset_t *os = ITOZSB(ip)->z_os; - - ASSERT(atomic_read(&ip->i_count) > 0); - ASSERT(os != NULL); - - if (atomic_read(&ip->i_count) == 1) - VERIFY(taskq_dispatch(dsl_pool_zrele_taskq(dmu_objset_pool(os)), - (task_func_t *)iput, ip, TQ_SLEEP) != TASKQID_INVALID); - else - zrele(zp); -} - - -/* - * Lookup an entry in a directory, or an extended attribute directory. 
- * If it exists, return a held inode reference for it. - * - * IN: zdp - znode of directory to search. - * nm - name of entry to lookup. - * flags - LOOKUP_XATTR set if looking for an attribute. - * cr - credentials of caller. - * direntflags - directory lookup flags - * realpnp - returned pathname. - * - * OUT: zpp - znode of located entry, NULL if not found. - * - * RETURN: 0 on success, error code on failure. - * - * Timestamps: - * NA - */ -/* ARGSUSED */ -int -zfs_lookup(znode_t *zdp, char *nm, znode_t **zpp, int flags, cred_t *cr, - int *direntflags, pathname_t *realpnp) -{ - zfsvfs_t *zfsvfs = ZTOZSB(zdp); - int error = 0; - - /* - * Fast path lookup, however we must skip DNLC lookup - * for case folding or normalizing lookups because the - * DNLC code only stores the passed in name. This means - * creating 'a' and removing 'A' on a case insensitive - * file system would work, but DNLC still thinks 'a' - * exists and won't let you create it again on the next - * pass through fast path. - */ - if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) { - - if (!S_ISDIR(ZTOI(zdp)->i_mode)) { - return (SET_ERROR(ENOTDIR)); - } else if (zdp->z_sa_hdl == NULL) { - return (SET_ERROR(EIO)); - } - - if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) { - error = zfs_fastaccesschk_execute(zdp, cr); - if (!error) { - *zpp = zdp; - zhold(*zpp); - return (0); - } - return (error); - } - } - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zdp); - - *zpp = NULL; - - if (flags & LOOKUP_XATTR) { - /* - * We don't allow recursive attributes.. - * Maybe someday we will. - */ - if (zdp->z_pflags & ZFS_XATTR) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } - - if ((error = zfs_get_xattrdir(zdp, zpp, cr, flags))) { - ZFS_EXIT(zfsvfs); - return (error); - } - - /* - * Do we have permission to get into attribute directory? - */ - - if ((error = zfs_zaccess(*zpp, ACE_EXECUTE, 0, - B_FALSE, cr))) { - zrele(*zpp); - *zpp = NULL; - } - - ZFS_EXIT(zfsvfs); - return (error); - } - - if (!S_ISDIR(ZTOI(zdp)->i_mode)) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(ENOTDIR)); - } - - /* - * Check accessibility of directory. - */ - - if ((error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr))) { - ZFS_EXIT(zfsvfs); - return (error); - } - - if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm), - NULL, U8_VALIDATE_ENTIRE, &error) < 0) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EILSEQ)); - } - - error = zfs_dirlook(zdp, nm, zpp, flags, direntflags, realpnp); - if ((error == 0) && (*zpp)) - zfs_inode_update(*zpp); - - ZFS_EXIT(zfsvfs); - return (error); -} - -/* - * Attempt to create a new entry in a directory. If the entry - * already exists, truncate the file if permissible, else return - * an error. Return the ip of the created or trunc'd file. - * - * IN: dzp - znode of directory to put new file entry in. - * name - name of new file entry. - * vap - attributes of new file. - * excl - flag indicating exclusive or non-exclusive mode. - * mode - mode to open file with. - * cr - credentials of caller. - * flag - file flag. - * vsecp - ACL to be set - * - * OUT: zpp - znode of created or trunc'd entry. - * - * RETURN: 0 on success, error code on failure. 
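As a usage illustration for zfs_lookup() above (an editorial sketch, not code from this file; the entry name and caller are hypothetical), note that the located znode comes back held and must be released with zrele():

static int
example_lookup(znode_t *zdp, cred_t *cr)
{
        char nm[] = "somefile";         /* hypothetical entry name */
        znode_t *zp = NULL;
        int error;

        /* NULL direntflags/realpnp: a plain, case-sensitive lookup. */
        error = zfs_lookup(zdp, nm, &zp, 0, cr, NULL, NULL);
        if (error == 0) {
                /* ... use zp ... */
                zrele(zp);
        }
        return (error);
}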
- * - * Timestamps: - * dzp - ctime|mtime updated if new entry created - * zp - ctime|mtime always, atime if new - */ - -/* ARGSUSED */ -int -zfs_create(znode_t *dzp, char *name, vattr_t *vap, int excl, - int mode, znode_t **zpp, cred_t *cr, int flag, vsecattr_t *vsecp) -{ - znode_t *zp; - zfsvfs_t *zfsvfs = ZTOZSB(dzp); - zilog_t *zilog; - objset_t *os; - zfs_dirlock_t *dl; - dmu_tx_t *tx; - int error; - uid_t uid; - gid_t gid; - zfs_acl_ids_t acl_ids; - boolean_t fuid_dirtied; - boolean_t have_acl = B_FALSE; - boolean_t waited = B_FALSE; - - /* - * If we have an ephemeral id, ACL, or XVATTR then - * make sure file system is at proper version - */ - - gid = crgetgid(cr); - uid = crgetuid(cr); - - if (zfsvfs->z_use_fuids == B_FALSE && - (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) - return (SET_ERROR(EINVAL)); - - if (name == NULL) - return (SET_ERROR(EINVAL)); - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(dzp); - os = zfsvfs->z_os; - zilog = zfsvfs->z_log; - - if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), - NULL, U8_VALIDATE_ENTIRE, &error) < 0) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EILSEQ)); - } - - if (vap->va_mask & ATTR_XVATTR) { - if ((error = secpolicy_xvattr((xvattr_t *)vap, - crgetuid(cr), cr, vap->va_mode)) != 0) { - ZFS_EXIT(zfsvfs); - return (error); - } - } - -top: - *zpp = NULL; - if (*name == '\0') { - /* - * Null component name refers to the directory itself. - */ - zhold(dzp); - zp = dzp; - dl = NULL; - error = 0; - } else { - /* possible igrab(zp) */ - int zflg = 0; - - if (flag & FIGNORECASE) - zflg |= ZCILOOK; - - error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, - NULL, NULL); - if (error) { - if (have_acl) - zfs_acl_ids_free(&acl_ids); - if (strcmp(name, "..") == 0) - error = SET_ERROR(EISDIR); - ZFS_EXIT(zfsvfs); - return (error); - } - } - - if (zp == NULL) { - uint64_t txtype; - uint64_t projid = ZFS_DEFAULT_PROJID; - - /* - * Create a new file object and update the directory - * to reference it. - */ - if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) { - if (have_acl) - zfs_acl_ids_free(&acl_ids); - goto out; - } - - /* - * We only support the creation of regular files in - * extended attribute directories. - */ - - if ((dzp->z_pflags & ZFS_XATTR) && !S_ISREG(vap->va_mode)) { - if (have_acl) - zfs_acl_ids_free(&acl_ids); - error = SET_ERROR(EINVAL); - goto out; - } - - if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap, - cr, vsecp, &acl_ids)) != 0) - goto out; - have_acl = B_TRUE; - - if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode)) - projid = zfs_inherit_projid(dzp); - if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) { - zfs_acl_ids_free(&acl_ids); - error = SET_ERROR(EDQUOT); - goto out; - } - - tx = dmu_tx_create(os); - - dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + - ZFS_SA_BASE_ATTR_SIZE); - - fuid_dirtied = zfsvfs->z_fuid_dirty; - if (fuid_dirtied) - zfs_fuid_txhold(zfsvfs, tx); - dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); - dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); - if (!zfsvfs->z_use_sa && - acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, - 0, acl_ids.z_aclp->z_acl_bytes); - } - - error = dmu_tx_assign(tx, - (waited ? 
TXG_NOTHROTTLE : 0) | TXG_NOWAIT); - if (error) { - zfs_dirent_unlock(dl); - if (error == ERESTART) { - waited = B_TRUE; - dmu_tx_wait(tx); - dmu_tx_abort(tx); - goto top; - } - zfs_acl_ids_free(&acl_ids); - dmu_tx_abort(tx); - ZFS_EXIT(zfsvfs); - return (error); - } - zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); - - error = zfs_link_create(dl, zp, tx, ZNEW); - if (error != 0) { - /* - * Since, we failed to add the directory entry for it, - * delete the newly created dnode. - */ - zfs_znode_delete(zp, tx); - remove_inode_hash(ZTOI(zp)); - zfs_acl_ids_free(&acl_ids); - dmu_tx_commit(tx); - goto out; - } - - if (fuid_dirtied) - zfs_fuid_sync(zfsvfs, tx); - - txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap); - if (flag & FIGNORECASE) - txtype |= TX_CI; - zfs_log_create(zilog, tx, txtype, dzp, zp, name, - vsecp, acl_ids.z_fuidp, vap); - zfs_acl_ids_free(&acl_ids); - dmu_tx_commit(tx); - } else { - int aflags = (flag & O_APPEND) ? V_APPEND : 0; - - if (have_acl) - zfs_acl_ids_free(&acl_ids); - have_acl = B_FALSE; - - /* - * A directory entry already exists for this name. - */ - /* - * Can't truncate an existing file if in exclusive mode. - */ - if (excl) { - error = SET_ERROR(EEXIST); - goto out; - } - /* - * Can't open a directory for writing. - */ - if (S_ISDIR(ZTOI(zp)->i_mode)) { - error = SET_ERROR(EISDIR); - goto out; - } - /* - * Verify requested access to file. - */ - if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) { - goto out; - } - - mutex_enter(&dzp->z_lock); - dzp->z_seq++; - mutex_exit(&dzp->z_lock); - - /* - * Truncate regular files if requested. - */ - if (S_ISREG(ZTOI(zp)->i_mode) && - (vap->va_mask & ATTR_SIZE) && (vap->va_size == 0)) { - /* we can't hold any locks when calling zfs_freesp() */ - if (dl) { - zfs_dirent_unlock(dl); - dl = NULL; - } - error = zfs_freesp(zp, 0, 0, mode, TRUE); - } - } -out: - - if (dl) - zfs_dirent_unlock(dl); - - if (error) { - if (zp) - zrele(zp); - } else { - zfs_inode_update(dzp); - zfs_inode_update(zp); - *zpp = zp; - } - - if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, 0); - - ZFS_EXIT(zfsvfs); - return (error); -} - -/* ARGSUSED */ -int -zfs_tmpfile(struct inode *dip, vattr_t *vap, int excl, - int mode, struct inode **ipp, cred_t *cr, int flag, vsecattr_t *vsecp) -{ - znode_t *zp = NULL, *dzp = ITOZ(dip); - zfsvfs_t *zfsvfs = ITOZSB(dip); - objset_t *os; - dmu_tx_t *tx; - int error; - uid_t uid; - gid_t gid; - zfs_acl_ids_t acl_ids; - uint64_t projid = ZFS_DEFAULT_PROJID; - boolean_t fuid_dirtied; - boolean_t have_acl = B_FALSE; - boolean_t waited = B_FALSE; - - /* - * If we have an ephemeral id, ACL, or XVATTR then - * make sure file system is at proper version - */ - - gid = crgetgid(cr); - uid = crgetuid(cr); - - if (zfsvfs->z_use_fuids == B_FALSE && - (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) - return (SET_ERROR(EINVAL)); - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(dzp); - os = zfsvfs->z_os; - - if (vap->va_mask & ATTR_XVATTR) { - if ((error = secpolicy_xvattr((xvattr_t *)vap, - crgetuid(cr), cr, vap->va_mode)) != 0) { - ZFS_EXIT(zfsvfs); - return (error); - } - } - -top: - *ipp = NULL; - - /* - * Create a new file object and update the directory - * to reference it. 
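For reference, a minimal caller of zfs_create() above might look like the following editorial sketch (the name and mode are hypothetical; excl = 0 permits truncating an existing file when the mode allows it):

static int
example_create(znode_t *dzp, cred_t *cr)
{
        char name[] = "newfile";        /* hypothetical */
        vattr_t va = { 0 };
        znode_t *zp = NULL;
        int error;

        va.va_mask = ATTR_MODE;
        va.va_mode = S_IFREG | 0644;

        error = zfs_create(dzp, name, &va, 0, 0644, &zp, cr, 0, NULL);
        if (error == 0)
                zrele(zp);      /* created (or truncated) znode is held */
        return (error);
}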
- */ - if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) { - if (have_acl) - zfs_acl_ids_free(&acl_ids); - goto out; - } - - if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap, - cr, vsecp, &acl_ids)) != 0) - goto out; - have_acl = B_TRUE; - - if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode)) - projid = zfs_inherit_projid(dzp); - if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) { - zfs_acl_ids_free(&acl_ids); - error = SET_ERROR(EDQUOT); - goto out; - } - - tx = dmu_tx_create(os); - - dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + - ZFS_SA_BASE_ATTR_SIZE); - dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); - - fuid_dirtied = zfsvfs->z_fuid_dirty; - if (fuid_dirtied) - zfs_fuid_txhold(zfsvfs, tx); - if (!zfsvfs->z_use_sa && - acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, - 0, acl_ids.z_aclp->z_acl_bytes); - } - error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); - if (error) { - if (error == ERESTART) { - waited = B_TRUE; - dmu_tx_wait(tx); - dmu_tx_abort(tx); - goto top; - } - zfs_acl_ids_free(&acl_ids); - dmu_tx_abort(tx); - ZFS_EXIT(zfsvfs); - return (error); - } - zfs_mknode(dzp, vap, tx, cr, IS_TMPFILE, &zp, &acl_ids); - - if (fuid_dirtied) - zfs_fuid_sync(zfsvfs, tx); - - /* Add to unlinked set */ - zp->z_unlinked = B_TRUE; - zfs_unlinked_add(zp, tx); - zfs_acl_ids_free(&acl_ids); - dmu_tx_commit(tx); -out: - - if (error) { - if (zp) - zrele(zp); - } else { - zfs_inode_update(dzp); - zfs_inode_update(zp); - *ipp = ZTOI(zp); - } - - ZFS_EXIT(zfsvfs); - return (error); -} - -/* - * Remove an entry from a directory. - * - * IN: dzp - znode of directory to remove entry from. - * name - name of entry to remove. - * cr - credentials of caller. - * flags - case flags. - * - * RETURN: 0 if success - * error code if failure - * - * Timestamps: - * dzp - ctime|mtime - * ip - ctime (if nlink > 0) - */ - -uint64_t null_xattr = 0; - -/*ARGSUSED*/ -int -zfs_remove(znode_t *dzp, char *name, cred_t *cr, int flags) -{ - znode_t *zp; - znode_t *xzp; - zfsvfs_t *zfsvfs = ZTOZSB(dzp); - zilog_t *zilog; - uint64_t acl_obj, xattr_obj; - uint64_t xattr_obj_unlinked = 0; - uint64_t obj = 0; - uint64_t links; - zfs_dirlock_t *dl; - dmu_tx_t *tx; - boolean_t may_delete_now, delete_now = FALSE; - boolean_t unlinked, toobig = FALSE; - uint64_t txtype; - pathname_t *realnmp = NULL; - pathname_t realnm; - int error; - int zflg = ZEXISTS; - boolean_t waited = B_FALSE; - - if (name == NULL) - return (SET_ERROR(EINVAL)); - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(dzp); - zilog = zfsvfs->z_log; - - if (flags & FIGNORECASE) { - zflg |= ZCILOOK; - pn_alloc(&realnm); - realnmp = &realnm; - } - -top: - xattr_obj = 0; - xzp = NULL; - /* - * Attempt to lock directory; fail if entry doesn't exist. - */ - if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, - NULL, realnmp))) { - if (realnmp) - pn_free(realnmp); - ZFS_EXIT(zfsvfs); - return (error); - } - - if ((error = zfs_zaccess_delete(dzp, zp, cr))) { - goto out; - } - - /* - * Need to use rmdir for removing directories. - */ - if (S_ISDIR(ZTOI(zp)->i_mode)) { - error = SET_ERROR(EPERM); - goto out; - } - - mutex_enter(&zp->z_lock); - may_delete_now = atomic_read(&ZTOI(zp)->i_count) == 1 && - !(zp->z_is_mapped); - mutex_exit(&zp->z_lock); - - /* - * We may delete the znode now, or we may put it in the unlinked set; - * it depends on whether we're the last link, and on whether there are - * other holds on the inode. 
So we dmu_tx_hold() the right things to - * allow for either case. - */ - obj = zp->z_id; - tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); - dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); - zfs_sa_upgrade_txholds(tx, zp); - zfs_sa_upgrade_txholds(tx, dzp); - if (may_delete_now) { - toobig = zp->z_size > zp->z_blksz * zfs_delete_blocks; - /* if the file is too big, only hold_free a token amount */ - dmu_tx_hold_free(tx, zp->z_id, 0, - (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END)); - } - - /* are there any extended attributes? */ - error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), - &xattr_obj, sizeof (xattr_obj)); - if (error == 0 && xattr_obj) { - error = zfs_zget(zfsvfs, xattr_obj, &xzp); - ASSERT0(error); - dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); - dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); - } - - mutex_enter(&zp->z_lock); - if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now) - dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); - mutex_exit(&zp->z_lock); - - /* charge as an update -- would be nice not to charge at all */ - dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); - - /* - * Mark this transaction as typically resulting in a net free of space - */ - dmu_tx_mark_netfree(tx); - - error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); - if (error) { - zfs_dirent_unlock(dl); - if (error == ERESTART) { - waited = B_TRUE; - dmu_tx_wait(tx); - dmu_tx_abort(tx); - zrele(zp); - if (xzp) - zrele(xzp); - goto top; - } - if (realnmp) - pn_free(realnmp); - dmu_tx_abort(tx); - zrele(zp); - if (xzp) - zrele(xzp); - ZFS_EXIT(zfsvfs); - return (error); - } - - /* - * Remove the directory entry. - */ - error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked); - - if (error) { - dmu_tx_commit(tx); - goto out; - } - - if (unlinked) { - /* - * Hold z_lock so that we can make sure that the ACL obj - * hasn't changed. Could have been deleted due to - * zfs_sa_upgrade(). - */ - mutex_enter(&zp->z_lock); - (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), - &xattr_obj_unlinked, sizeof (xattr_obj_unlinked)); - delete_now = may_delete_now && !toobig && - atomic_read(&ZTOI(zp)->i_count) == 1 && - !(zp->z_is_mapped) && xattr_obj == xattr_obj_unlinked && - zfs_external_acl(zp) == acl_obj; - } - - if (delete_now) { - if (xattr_obj_unlinked) { - ASSERT3U(ZTOI(xzp)->i_nlink, ==, 2); - mutex_enter(&xzp->z_lock); - xzp->z_unlinked = B_TRUE; - clear_nlink(ZTOI(xzp)); - links = 0; - error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs), - &links, sizeof (links), tx); - ASSERT3U(error, ==, 0); - mutex_exit(&xzp->z_lock); - zfs_unlinked_add(xzp, tx); - - if (zp->z_is_sa) - error = sa_remove(zp->z_sa_hdl, - SA_ZPL_XATTR(zfsvfs), tx); - else - error = sa_update(zp->z_sa_hdl, - SA_ZPL_XATTR(zfsvfs), &null_xattr, - sizeof (uint64_t), tx); - ASSERT0(error); - } - /* - * Add to the unlinked set because a new reference could be - * taken concurrently resulting in a deferred destruction. 
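The "delete now vs. defer to the unlinked set" decision above can be distilled into a single predicate; this is an editorial condensation of the checks made under z_lock, not a helper that exists in this file:

static boolean_t
example_can_delete_now(znode_t *zp, boolean_t toobig,
    uint64_t xattr_before, uint64_t xattr_after, uint64_t acl_before)
{
        /* Last hold, not mmapped, small enough, and SA objects unchanged. */
        return (atomic_read(&ZTOI(zp)->i_count) == 1 &&
            !zp->z_is_mapped && !toobig &&
            xattr_before == xattr_after &&
            zfs_external_acl(zp) == acl_before);
}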
- */ - zfs_unlinked_add(zp, tx); - mutex_exit(&zp->z_lock); - } else if (unlinked) { - mutex_exit(&zp->z_lock); - zfs_unlinked_add(zp, tx); - } - - txtype = TX_REMOVE; - if (flags & FIGNORECASE) - txtype |= TX_CI; - zfs_log_remove(zilog, tx, txtype, dzp, name, obj, unlinked); - - dmu_tx_commit(tx); -out: - if (realnmp) - pn_free(realnmp); - - zfs_dirent_unlock(dl); - zfs_inode_update(dzp); - zfs_inode_update(zp); - - if (delete_now) - zrele(zp); - else - zfs_zrele_async(zp); - - if (xzp) { - zfs_inode_update(xzp); - zfs_zrele_async(xzp); - } - - if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, 0); - - ZFS_EXIT(zfsvfs); - return (error); -} - -/* - * Create a new directory and insert it into dzp using the name - * provided. Return a pointer to the inserted directory. - * - * IN: dzp - znode of directory to add subdir to. - * dirname - name of new directory. - * vap - attributes of new directory. - * cr - credentials of caller. - * flags - case flags. - * vsecp - ACL to be set - * - * OUT: zpp - znode of created directory. - * - * RETURN: 0 if success - * error code if failure - * - * Timestamps: - * dzp - ctime|mtime updated - * zpp - ctime|mtime|atime updated - */ -/*ARGSUSED*/ -int -zfs_mkdir(znode_t *dzp, char *dirname, vattr_t *vap, znode_t **zpp, - cred_t *cr, int flags, vsecattr_t *vsecp) -{ - znode_t *zp; - zfsvfs_t *zfsvfs = ZTOZSB(dzp); - zilog_t *zilog; - zfs_dirlock_t *dl; - uint64_t txtype; - dmu_tx_t *tx; - int error; - int zf = ZNEW; - uid_t uid; - gid_t gid = crgetgid(cr); - zfs_acl_ids_t acl_ids; - boolean_t fuid_dirtied; - boolean_t waited = B_FALSE; - - ASSERT(S_ISDIR(vap->va_mode)); - - /* - * If we have an ephemeral id, ACL, or XVATTR then - * make sure file system is at proper version - */ - - uid = crgetuid(cr); - if (zfsvfs->z_use_fuids == B_FALSE && - (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) - return (SET_ERROR(EINVAL)); - - if (dirname == NULL) - return (SET_ERROR(EINVAL)); - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(dzp); - zilog = zfsvfs->z_log; - - if (dzp->z_pflags & ZFS_XATTR) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } - - if (zfsvfs->z_utf8 && u8_validate(dirname, - strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EILSEQ)); - } - if (flags & FIGNORECASE) - zf |= ZCILOOK; - - if (vap->va_mask & ATTR_XVATTR) { - if ((error = secpolicy_xvattr((xvattr_t *)vap, - crgetuid(cr), cr, vap->va_mode)) != 0) { - ZFS_EXIT(zfsvfs); - return (error); - } - } - - if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, - vsecp, &acl_ids)) != 0) { - ZFS_EXIT(zfsvfs); - return (error); - } - /* - * First make sure the new directory doesn't exist. - * - * Existence is checked first to make sure we don't return - * EACCES instead of EEXIST which can cause some applications - * to fail. - */ -top: - *zpp = NULL; - - if ((error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf, - NULL, NULL))) { - zfs_acl_ids_free(&acl_ids); - ZFS_EXIT(zfsvfs); - return (error); - } - - if ((error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr))) { - zfs_acl_ids_free(&acl_ids); - zfs_dirent_unlock(dl); - ZFS_EXIT(zfsvfs); - return (error); - } - - if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zfs_inherit_projid(dzp))) { - zfs_acl_ids_free(&acl_ids); - zfs_dirent_unlock(dl); - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EDQUOT)); - } - - /* - * Add a new entry to the directory. 
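A minimal caller of zfs_mkdir() above, as an editorial sketch (the directory name is hypothetical; zfs_mkdir() asserts S_ISDIR on va_mode):

static int
example_mkdir(znode_t *dzp, cred_t *cr)
{
        char dirname[] = "subdir";      /* hypothetical */
        vattr_t va = { 0 };
        znode_t *zp = NULL;
        int error;

        va.va_mask = ATTR_MODE;
        va.va_mode = S_IFDIR | 0755;

        error = zfs_mkdir(dzp, dirname, &va, &zp, cr, 0, NULL);
        if (error == 0)
                zrele(zp);      /* new directory comes back held */
        return (error);
}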
- */ - tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname); - dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); - fuid_dirtied = zfsvfs->z_fuid_dirty; - if (fuid_dirtied) - zfs_fuid_txhold(zfsvfs, tx); - if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, - acl_ids.z_aclp->z_acl_bytes); - } - - dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + - ZFS_SA_BASE_ATTR_SIZE); - - error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); - if (error) { - zfs_dirent_unlock(dl); - if (error == ERESTART) { - waited = B_TRUE; - dmu_tx_wait(tx); - dmu_tx_abort(tx); - goto top; - } - zfs_acl_ids_free(&acl_ids); - dmu_tx_abort(tx); - ZFS_EXIT(zfsvfs); - return (error); - } - - /* - * Create new node. - */ - zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); - - /* - * Now put new name in parent dir. - */ - error = zfs_link_create(dl, zp, tx, ZNEW); - if (error != 0) { - zfs_znode_delete(zp, tx); - remove_inode_hash(ZTOI(zp)); - goto out; - } - - if (fuid_dirtied) - zfs_fuid_sync(zfsvfs, tx); - - *zpp = zp; - - txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap); - if (flags & FIGNORECASE) - txtype |= TX_CI; - zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp, - acl_ids.z_fuidp, vap); - -out: - zfs_acl_ids_free(&acl_ids); - - dmu_tx_commit(tx); - - zfs_dirent_unlock(dl); - - if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, 0); - - if (error != 0) { - zrele(zp); - } else { - zfs_inode_update(dzp); - zfs_inode_update(zp); - } - ZFS_EXIT(zfsvfs); - return (error); -} - -/* - * Remove a directory subdir entry. If the current working - * directory is the same as the subdir to be removed, the - * remove will fail. - * - * IN: dzp - znode of directory to remove from. - * name - name of directory to be removed. - * cwd - inode of current working directory. - * cr - credentials of caller. - * flags - case flags - * - * RETURN: 0 on success, error code on failure. - * - * Timestamps: - * dzp - ctime|mtime updated - */ -/*ARGSUSED*/ -int -zfs_rmdir(znode_t *dzp, char *name, znode_t *cwd, cred_t *cr, - int flags) -{ - znode_t *zp; - zfsvfs_t *zfsvfs = ZTOZSB(dzp); - zilog_t *zilog; - zfs_dirlock_t *dl; - dmu_tx_t *tx; - int error; - int zflg = ZEXISTS; - boolean_t waited = B_FALSE; - - if (name == NULL) - return (SET_ERROR(EINVAL)); - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(dzp); - zilog = zfsvfs->z_log; - - if (flags & FIGNORECASE) - zflg |= ZCILOOK; -top: - zp = NULL; - - /* - * Attempt to lock directory; fail if entry doesn't exist. - */ - if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, - NULL, NULL))) { - ZFS_EXIT(zfsvfs); - return (error); - } - - if ((error = zfs_zaccess_delete(dzp, zp, cr))) { - goto out; - } - - if (!S_ISDIR(ZTOI(zp)->i_mode)) { - error = SET_ERROR(ENOTDIR); - goto out; - } - - if (zp == cwd) { - error = SET_ERROR(EINVAL); - goto out; - } - - /* - * Grab a lock on the directory to make sure that no one is - * trying to add (or lookup) entries while we are removing it. - */ - rw_enter(&zp->z_name_lock, RW_WRITER); - - /* - * Grab a lock on the parent pointer to make sure we play well - * with the treewalk and directory rename code. 
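And the matching removal, sketched the same way (a hypothetical caller; as the check above shows, removing the caller's own working directory fails with EINVAL):

static int
example_rmdir(znode_t *dzp, znode_t *cwd, cred_t *cr)
{
        char name[] = "subdir";         /* hypothetical */

        return (zfs_rmdir(dzp, name, cwd, cr, 0));
}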
- */ - rw_enter(&zp->z_parent_lock, RW_WRITER); - - tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); - dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); - dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); - zfs_sa_upgrade_txholds(tx, zp); - zfs_sa_upgrade_txholds(tx, dzp); - dmu_tx_mark_netfree(tx); - error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); - if (error) { - rw_exit(&zp->z_parent_lock); - rw_exit(&zp->z_name_lock); - zfs_dirent_unlock(dl); - if (error == ERESTART) { - waited = B_TRUE; - dmu_tx_wait(tx); - dmu_tx_abort(tx); - zrele(zp); - goto top; - } - dmu_tx_abort(tx); - zrele(zp); - ZFS_EXIT(zfsvfs); - return (error); - } - - error = zfs_link_destroy(dl, zp, tx, zflg, NULL); - - if (error == 0) { - uint64_t txtype = TX_RMDIR; - if (flags & FIGNORECASE) - txtype |= TX_CI; - zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT, - B_FALSE); - } - - dmu_tx_commit(tx); - - rw_exit(&zp->z_parent_lock); - rw_exit(&zp->z_name_lock); -out: - zfs_dirent_unlock(dl); - - zfs_inode_update(dzp); - zfs_inode_update(zp); - zrele(zp); - - if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, 0); - - ZFS_EXIT(zfsvfs); - return (error); -} - -/* - * Read directory entries from the given directory cursor position and emit - * name and position for each entry. - * - * IN: ip - inode of directory to read. - * ctx - directory entry context. - * cr - credentials of caller. - * - * RETURN: 0 if success - * error code if failure - * - * Timestamps: - * ip - atime updated - * - * Note that the low 4 bits of the cookie returned by zap is always zero. - * This allows us to use the low range for "special" directory entries: - * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem, - * we use the offset 2 for the '.zfs' directory. - */ -/* ARGSUSED */ -int -zfs_readdir(struct inode *ip, zpl_dir_context_t *ctx, cred_t *cr) -{ - znode_t *zp = ITOZ(ip); - zfsvfs_t *zfsvfs = ITOZSB(ip); - objset_t *os; - zap_cursor_t zc; - zap_attribute_t zap; - int error; - uint8_t prefetch; - uint8_t type; - int done = 0; - uint64_t parent; - uint64_t offset; /* must be unsigned; checks for < 1 */ - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), - &parent, sizeof (parent))) != 0) - goto out; - - /* - * Quit if directory has been removed (posix) - */ - if (zp->z_unlinked) - goto out; - - error = 0; - os = zfsvfs->z_os; - offset = ctx->pos; - prefetch = zp->z_zn_prefetch; - - /* - * Initialize the iterator cursor. - */ - if (offset <= 3) { - /* - * Start iteration from the beginning of the directory. - */ - zap_cursor_init(&zc, os, zp->z_id); - } else { - /* - * The offset is a serialized cursor. - */ - zap_cursor_init_serialized(&zc, os, zp->z_id, offset); - } - - /* - * Transform to file-system independent format - */ - while (!done) { - uint64_t objnum; - /* - * Special case `.', `..', and `.zfs'. - */ - if (offset == 0) { - (void) strcpy(zap.za_name, "."); - zap.za_normalization_conflict = 0; - objnum = zp->z_id; - type = DT_DIR; - } else if (offset == 1) { - (void) strcpy(zap.za_name, ".."); - zap.za_normalization_conflict = 0; - objnum = parent; - type = DT_DIR; - } else if (offset == 2 && zfs_show_ctldir(zp)) { - (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME); - zap.za_normalization_conflict = 0; - objnum = ZFSCTL_INO_ROOT; - type = DT_DIR; - } else { - /* - * Grab next entry. 
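The cookie scheme described in the zfs_readdir() comment above can be summarized, as an editorial aid, by how a saved position is resumed (a sketch, not a helper in this file):

static void
example_resume_cursor(objset_t *os, znode_t *zp, zap_cursor_t *zc,
    uint64_t saved_pos)
{
        /*
         * pos 0 is ".", 1 is "..", 2 is ".zfs" (root only); anything
         * larger is a serialized ZAP cursor whose low 4 bits are zero.
         */
        if (saved_pos <= 3)
                zap_cursor_init(zc, os, zp->z_id);
        else
                zap_cursor_init_serialized(zc, os, zp->z_id, saved_pos);
}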
- */ - if ((error = zap_cursor_retrieve(&zc, &zap))) { - if (error == ENOENT) - break; - else - goto update; - } - - /* - * Allow multiple entries provided the first entry is - * the object id. Non-zpl consumers may safely make - * use of the additional space. - * - * XXX: This should be a feature flag for compatibility - */ - if (zap.za_integer_length != 8 || - zap.za_num_integers == 0) { - cmn_err(CE_WARN, "zap_readdir: bad directory " - "entry, obj = %lld, offset = %lld, " - "length = %d, num = %lld\n", - (u_longlong_t)zp->z_id, - (u_longlong_t)offset, - zap.za_integer_length, - (u_longlong_t)zap.za_num_integers); - error = SET_ERROR(ENXIO); - goto update; - } - - objnum = ZFS_DIRENT_OBJ(zap.za_first_integer); - type = ZFS_DIRENT_TYPE(zap.za_first_integer); - } - - done = !zpl_dir_emit(ctx, zap.za_name, strlen(zap.za_name), - objnum, type); - if (done) - break; - - /* Prefetch znode */ - if (prefetch) { - dmu_prefetch(os, objnum, 0, 0, 0, - ZIO_PRIORITY_SYNC_READ); - } - - /* - * Move to the next entry, fill in the previous offset. - */ - if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) { - zap_cursor_advance(&zc); - offset = zap_cursor_serialize(&zc); - } else { - offset += 1; - } - ctx->pos = offset; - } - zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */ - -update: - zap_cursor_fini(&zc); - if (error == ENOENT) - error = 0; -out: - ZFS_EXIT(zfsvfs); - - return (error); -} - -/* - * Get the basic file attributes and place them in the provided kstat - * structure. The inode is assumed to be the authoritative source - * for most of the attributes. However, the znode currently has the - * authoritative atime, blksize, and block count. - * - * IN: ip - inode of file. - * - * OUT: sp - kstat values. - * - * RETURN: 0 (always succeeds) - */ -/* ARGSUSED */ -int -zfs_getattr_fast(struct inode *ip, struct kstat *sp) -{ - znode_t *zp = ITOZ(ip); - zfsvfs_t *zfsvfs = ITOZSB(ip); - uint32_t blksize; - u_longlong_t nblocks; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - mutex_enter(&zp->z_lock); - - generic_fillattr(ip, sp); - /* - * +1 link count for root inode with visible '.zfs' directory. - */ - if ((zp->z_id == zfsvfs->z_root) && zfs_show_ctldir(zp)) - if (sp->nlink < ZFS_LINK_MAX) - sp->nlink++; - - sa_object_size(zp->z_sa_hdl, &blksize, &nblocks); - sp->blksize = blksize; - sp->blocks = nblocks; - - if (unlikely(zp->z_blksz == 0)) { - /* - * Block size hasn't been set; suggest maximal I/O transfers. - */ - sp->blksize = zfsvfs->z_max_blksz; - } - - mutex_exit(&zp->z_lock); - - /* - * Required to prevent NFS client from detecting different inode - * numbers of snapshot root dentry before and after snapshot mount. - */ - if (zfsvfs->z_issnap) { - if (ip->i_sb->s_root->d_inode == ip) - sp->ino = ZFSCTL_INO_SNAPDIRS - - dmu_objset_id(zfsvfs->z_os); - } - - ZFS_EXIT(zfsvfs); - - return (0); -} - -/* - * When changing a file's user/group/project, we need to handle not only - * the main object that is assigned to the file directly, but also the - * objects that the file uses via its hidden xattr directory. - * - * Because the xattr directory may contain many EA entries, it may be - * impossible to change all of them within the single transaction that - * changes the main object's user/group/project attributes. Instead, we - * change them one by one via multiple independent transactions. This is - * not an ideal solution, but we have no better idea yet.
- */ -static int -zfs_setattr_dir(znode_t *dzp) -{ - struct inode *dxip = ZTOI(dzp); - struct inode *xip = NULL; - zfsvfs_t *zfsvfs = ZTOZSB(dzp); - objset_t *os = zfsvfs->z_os; - zap_cursor_t zc; - zap_attribute_t zap; - zfs_dirlock_t *dl; - znode_t *zp = NULL; - dmu_tx_t *tx = NULL; - uint64_t uid, gid; - sa_bulk_attr_t bulk[4]; - int count; - int err; - - zap_cursor_init(&zc, os, dzp->z_id); - while ((err = zap_cursor_retrieve(&zc, &zap)) == 0) { - count = 0; - if (zap.za_integer_length != 8 || zap.za_num_integers != 1) { - err = ENXIO; - break; - } - - err = zfs_dirent_lock(&dl, dzp, (char *)zap.za_name, &zp, - ZEXISTS, NULL, NULL); - if (err == ENOENT) - goto next; - if (err) - break; - - xip = ZTOI(zp); - if (KUID_TO_SUID(xip->i_uid) == KUID_TO_SUID(dxip->i_uid) && - KGID_TO_SGID(xip->i_gid) == KGID_TO_SGID(dxip->i_gid) && - zp->z_projid == dzp->z_projid) - goto next; - - tx = dmu_tx_create(os); - if (!(zp->z_pflags & ZFS_PROJID)) - dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); - else - dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); - - err = dmu_tx_assign(tx, TXG_WAIT); - if (err) - break; - - mutex_enter(&dzp->z_lock); - - if (KUID_TO_SUID(xip->i_uid) != KUID_TO_SUID(dxip->i_uid)) { - xip->i_uid = dxip->i_uid; - uid = zfs_uid_read(dxip); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, - &uid, sizeof (uid)); - } - - if (KGID_TO_SGID(xip->i_gid) != KGID_TO_SGID(dxip->i_gid)) { - xip->i_gid = dxip->i_gid; - gid = zfs_gid_read(dxip); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, - &gid, sizeof (gid)); - } - - if (zp->z_projid != dzp->z_projid) { - if (!(zp->z_pflags & ZFS_PROJID)) { - zp->z_pflags |= ZFS_PROJID; - SA_ADD_BULK_ATTR(bulk, count, - SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, - sizeof (zp->z_pflags)); - } - - zp->z_projid = dzp->z_projid; - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PROJID(zfsvfs), - NULL, &zp->z_projid, sizeof (zp->z_projid)); - } - - mutex_exit(&dzp->z_lock); - - if (likely(count > 0)) { - err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); - dmu_tx_commit(tx); - } else { - dmu_tx_abort(tx); - } - tx = NULL; - if (err != 0 && err != ENOENT) - break; - -next: - if (zp) { - zrele(zp); - zp = NULL; - zfs_dirent_unlock(dl); - } - zap_cursor_advance(&zc); - } - - if (tx) - dmu_tx_abort(tx); - if (zp) { - zrele(zp); - zfs_dirent_unlock(dl); - } - zap_cursor_fini(&zc); - - return (err == ENOENT ? 0 : err); -} - -/* - * Set the file attributes to the values contained in the - * vattr structure. - * - * IN: zp - znode of file to be modified. - * vap - new attribute values. - * If ATTR_XVATTR set, then optional attrs are being set - * flags - ATTR_UTIME set if non-default time values provided. - * - ATTR_NOACLCHECK (CIFS context only). - * cr - credentials of caller. - * - * RETURN: 0 if success - * error code if failure - * - * Timestamps: - * ip - ctime updated, mtime updated if size changed. 
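A chmod-style caller of zfs_setattr() below reduces to filling in va_mask and va_mode; an editorial sketch with a hypothetical helper and mode:

static int
example_chmod(znode_t *zp, mode_t mode, cred_t *cr)
{
        vattr_t va = { 0 };

        va.va_mask = ATTR_MODE;
        va.va_mode = mode;

        return (zfs_setattr(zp, &va, 0, cr));
}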
- */ -/* ARGSUSED */ -int -zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr) -{ - struct inode *ip; - zfsvfs_t *zfsvfs = ZTOZSB(zp); - objset_t *os = zfsvfs->z_os; - zilog_t *zilog; - dmu_tx_t *tx; - vattr_t oldva; - xvattr_t *tmpxvattr; - uint_t mask = vap->va_mask; - uint_t saved_mask = 0; - int trim_mask = 0; - uint64_t new_mode; - uint64_t new_kuid = 0, new_kgid = 0, new_uid, new_gid; - uint64_t xattr_obj; - uint64_t mtime[2], ctime[2], atime[2]; - uint64_t projid = ZFS_INVALID_PROJID; - znode_t *attrzp; - int need_policy = FALSE; - int err, err2 = 0; - zfs_fuid_info_t *fuidp = NULL; - xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ - xoptattr_t *xoap; - zfs_acl_t *aclp; - boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; - boolean_t fuid_dirtied = B_FALSE; - boolean_t handle_eadir = B_FALSE; - sa_bulk_attr_t *bulk, *xattr_bulk; - int count = 0, xattr_count = 0, bulks = 8; - - if (mask == 0) - return (0); - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - ip = ZTOI(zp); - - /* - * If this is a xvattr_t, then get a pointer to the structure of - * optional attributes. If this is NULL, then we have a vattr_t. - */ - xoap = xva_getxoptattr(xvap); - if (xoap != NULL && (mask & ATTR_XVATTR)) { - if (XVA_ISSET_REQ(xvap, XAT_PROJID)) { - if (!dmu_objset_projectquota_enabled(os) || - (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode))) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(ENOTSUP)); - } - - projid = xoap->xoa_projid; - if (unlikely(projid == ZFS_INVALID_PROJID)) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } - - if (projid == zp->z_projid && zp->z_pflags & ZFS_PROJID) - projid = ZFS_INVALID_PROJID; - else - need_policy = TRUE; - } - - if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT) && - (xoap->xoa_projinherit != - ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) && - (!dmu_objset_projectquota_enabled(os) || - (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode)))) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(ENOTSUP)); - } - } - - zilog = zfsvfs->z_log; - - /* - * Make sure that if we have ephemeral uid/gid or xvattr specified - * that file system is at proper version level - */ - - if (zfsvfs->z_use_fuids == B_FALSE && - (((mask & ATTR_UID) && IS_EPHEMERAL(vap->va_uid)) || - ((mask & ATTR_GID) && IS_EPHEMERAL(vap->va_gid)) || - (mask & ATTR_XVATTR))) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } - - if (mask & ATTR_SIZE && S_ISDIR(ip->i_mode)) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EISDIR)); - } - - if (mask & ATTR_SIZE && !S_ISREG(ip->i_mode) && !S_ISFIFO(ip->i_mode)) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } - - tmpxvattr = kmem_alloc(sizeof (xvattr_t), KM_SLEEP); - xva_init(tmpxvattr); - - bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP); - xattr_bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP); - - /* - * Immutable files can only alter immutable bit and atime - */ - if ((zp->z_pflags & ZFS_IMMUTABLE) && - ((mask & (ATTR_SIZE|ATTR_UID|ATTR_GID|ATTR_MTIME|ATTR_MODE)) || - ((mask & ATTR_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) { - err = SET_ERROR(EPERM); - goto out3; - } - - if ((mask & ATTR_SIZE) && (zp->z_pflags & ZFS_READONLY)) { - err = SET_ERROR(EPERM); - goto out3; - } - - /* - * Verify timestamps doesn't overflow 32 bits. - * ZFS can handle large timestamps, but 32bit syscalls can't - * handle times greater than 2039. This check should be removed - * once large timestamps are fully supported. 
- */ - if (mask & (ATTR_ATIME | ATTR_MTIME)) { - if (((mask & ATTR_ATIME) && - TIMESPEC_OVERFLOW(&vap->va_atime)) || - ((mask & ATTR_MTIME) && - TIMESPEC_OVERFLOW(&vap->va_mtime))) { - err = SET_ERROR(EOVERFLOW); - goto out3; - } - } - -top: - attrzp = NULL; - aclp = NULL; - - /* Can this be moved to before the top label? */ - if (zfs_is_readonly(zfsvfs)) { - err = SET_ERROR(EROFS); - goto out3; - } - - /* - * First validate permissions - */ - - if (mask & ATTR_SIZE) { - err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr); - if (err) - goto out3; - - /* - * XXX - Note, we are not providing any open - * mode flags here (like FNDELAY), so we may - * block if there are locks present... this - * should be addressed in openat(). - */ - /* XXX - would it be OK to generate a log record here? */ - err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE); - if (err) - goto out3; - } - - if (mask & (ATTR_ATIME|ATTR_MTIME) || - ((mask & ATTR_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) || - XVA_ISSET_REQ(xvap, XAT_READONLY) || - XVA_ISSET_REQ(xvap, XAT_ARCHIVE) || - XVA_ISSET_REQ(xvap, XAT_OFFLINE) || - XVA_ISSET_REQ(xvap, XAT_SPARSE) || - XVA_ISSET_REQ(xvap, XAT_CREATETIME) || - XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) { - need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0, - skipaclchk, cr); - } - - if (mask & (ATTR_UID|ATTR_GID)) { - int idmask = (mask & (ATTR_UID|ATTR_GID)); - int take_owner; - int take_group; - - /* - * NOTE: even if a new mode is being set, - * we may clear S_ISUID/S_ISGID bits. - */ - - if (!(mask & ATTR_MODE)) - vap->va_mode = zp->z_mode; - - /* - * Take ownership or chgrp to group we are a member of - */ - - take_owner = (mask & ATTR_UID) && (vap->va_uid == crgetuid(cr)); - take_group = (mask & ATTR_GID) && - zfs_groupmember(zfsvfs, vap->va_gid, cr); - - /* - * If both ATTR_UID and ATTR_GID are set then take_owner and - * take_group must both be set in order to allow taking - * ownership. - * - * Otherwise, send the check through secpolicy_vnode_setattr() - * - */ - - if (((idmask == (ATTR_UID|ATTR_GID)) && - take_owner && take_group) || - ((idmask == ATTR_UID) && take_owner) || - ((idmask == ATTR_GID) && take_group)) { - if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0, - skipaclchk, cr) == 0) { - /* - * Remove setuid/setgid for non-privileged users - */ - (void) secpolicy_setid_clear(vap, cr); - trim_mask = (mask & (ATTR_UID|ATTR_GID)); - } else { - need_policy = TRUE; - } - } else { - need_policy = TRUE; - } - } - - mutex_enter(&zp->z_lock); - oldva.va_mode = zp->z_mode; - zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid); - if (mask & ATTR_XVATTR) { - /* - * Update xvattr mask to include only those attributes - * that are actually changing. - * - * the bits will be restored prior to actually setting - * the attributes so the caller thinks they were set. 
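The owner/group self-service rule spelled out above (when both IDs change, take_owner and take_group must both hold) reduces to this predicate; an editorial condensation, not a helper in this file:

static boolean_t
example_id_change_self_allowed(int idmask, boolean_t take_owner,
    boolean_t take_group)
{
        return ((idmask == (ATTR_UID | ATTR_GID) &&
            take_owner && take_group) ||
            (idmask == ATTR_UID && take_owner) ||
            (idmask == ATTR_GID && take_group));
}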
- */ - if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { - if (xoap->xoa_appendonly != - ((zp->z_pflags & ZFS_APPENDONLY) != 0)) { - need_policy = TRUE; - } else { - XVA_CLR_REQ(xvap, XAT_APPENDONLY); - XVA_SET_REQ(tmpxvattr, XAT_APPENDONLY); - } - } - - if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) { - if (xoap->xoa_projinherit != - ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) { - need_policy = TRUE; - } else { - XVA_CLR_REQ(xvap, XAT_PROJINHERIT); - XVA_SET_REQ(tmpxvattr, XAT_PROJINHERIT); - } - } - - if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { - if (xoap->xoa_nounlink != - ((zp->z_pflags & ZFS_NOUNLINK) != 0)) { - need_policy = TRUE; - } else { - XVA_CLR_REQ(xvap, XAT_NOUNLINK); - XVA_SET_REQ(tmpxvattr, XAT_NOUNLINK); - } - } - - if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { - if (xoap->xoa_immutable != - ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) { - need_policy = TRUE; - } else { - XVA_CLR_REQ(xvap, XAT_IMMUTABLE); - XVA_SET_REQ(tmpxvattr, XAT_IMMUTABLE); - } - } - - if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { - if (xoap->xoa_nodump != - ((zp->z_pflags & ZFS_NODUMP) != 0)) { - need_policy = TRUE; - } else { - XVA_CLR_REQ(xvap, XAT_NODUMP); - XVA_SET_REQ(tmpxvattr, XAT_NODUMP); - } - } - - if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { - if (xoap->xoa_av_modified != - ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) { - need_policy = TRUE; - } else { - XVA_CLR_REQ(xvap, XAT_AV_MODIFIED); - XVA_SET_REQ(tmpxvattr, XAT_AV_MODIFIED); - } - } - - if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { - if ((!S_ISREG(ip->i_mode) && - xoap->xoa_av_quarantined) || - xoap->xoa_av_quarantined != - ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) { - need_policy = TRUE; - } else { - XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED); - XVA_SET_REQ(tmpxvattr, XAT_AV_QUARANTINED); - } - } - - if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { - mutex_exit(&zp->z_lock); - err = SET_ERROR(EPERM); - goto out3; - } - - if (need_policy == FALSE && - (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) || - XVA_ISSET_REQ(xvap, XAT_OPAQUE))) { - need_policy = TRUE; - } - } - - mutex_exit(&zp->z_lock); - - if (mask & ATTR_MODE) { - if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) { - err = secpolicy_setid_setsticky_clear(ip, vap, - &oldva, cr); - if (err) - goto out3; - - trim_mask |= ATTR_MODE; - } else { - need_policy = TRUE; - } - } - - if (need_policy) { - /* - * If trim_mask is set then take ownership - * has been granted or write_acl is present and user - * has the ability to modify mode. In that case remove - * UID|GID and or MODE from mask so that - * secpolicy_vnode_setattr() doesn't revoke it. 
- */ - - if (trim_mask) { - saved_mask = vap->va_mask; - vap->va_mask &= ~trim_mask; - } - err = secpolicy_vnode_setattr(cr, ip, vap, &oldva, flags, - (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp); - if (err) - goto out3; - - if (trim_mask) - vap->va_mask |= saved_mask; - } - - /* - * secpolicy_vnode_setattr, or take ownership may have - * changed va_mask - */ - mask = vap->va_mask; - - if ((mask & (ATTR_UID | ATTR_GID)) || projid != ZFS_INVALID_PROJID) { - handle_eadir = B_TRUE; - err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), - &xattr_obj, sizeof (xattr_obj)); - - if (err == 0 && xattr_obj) { - err = zfs_zget(ZTOZSB(zp), xattr_obj, &attrzp); - if (err) - goto out2; - } - if (mask & ATTR_UID) { - new_kuid = zfs_fuid_create(zfsvfs, - (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp); - if (new_kuid != KUID_TO_SUID(ZTOI(zp)->i_uid) && - zfs_id_overquota(zfsvfs, DMU_USERUSED_OBJECT, - new_kuid)) { - if (attrzp) - zrele(attrzp); - err = SET_ERROR(EDQUOT); - goto out2; - } - } - - if (mask & ATTR_GID) { - new_kgid = zfs_fuid_create(zfsvfs, - (uint64_t)vap->va_gid, cr, ZFS_GROUP, &fuidp); - if (new_kgid != KGID_TO_SGID(ZTOI(zp)->i_gid) && - zfs_id_overquota(zfsvfs, DMU_GROUPUSED_OBJECT, - new_kgid)) { - if (attrzp) - zrele(attrzp); - err = SET_ERROR(EDQUOT); - goto out2; - } - } - - if (projid != ZFS_INVALID_PROJID && - zfs_id_overquota(zfsvfs, DMU_PROJECTUSED_OBJECT, projid)) { - if (attrzp) - zrele(attrzp); - err = EDQUOT; - goto out2; - } - } - tx = dmu_tx_create(os); - - if (mask & ATTR_MODE) { - uint64_t pmode = zp->z_mode; - uint64_t acl_obj; - new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT); - - if (ZTOZSB(zp)->z_acl_mode == ZFS_ACL_RESTRICTED && - !(zp->z_pflags & ZFS_ACL_TRIVIAL)) { - err = EPERM; - goto out; - } - - if ((err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))) - goto out; - - mutex_enter(&zp->z_lock); - if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) { - /* - * Are we upgrading ACL from old V0 format - * to V1 format? - */ - if (zfsvfs->z_version >= ZPL_VERSION_FUID && - zfs_znode_acl_version(zp) == - ZFS_ACL_VERSION_INITIAL) { - dmu_tx_hold_free(tx, acl_obj, 0, - DMU_OBJECT_END); - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, - 0, aclp->z_acl_bytes); - } else { - dmu_tx_hold_write(tx, acl_obj, 0, - aclp->z_acl_bytes); - } - } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) { - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, - 0, aclp->z_acl_bytes); - } - mutex_exit(&zp->z_lock); - dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); - } else { - if (((mask & ATTR_XVATTR) && - XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) || - (projid != ZFS_INVALID_PROJID && - !(zp->z_pflags & ZFS_PROJID))) - dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); - else - dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); - } - - if (attrzp) { - dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE); - } - - fuid_dirtied = zfsvfs->z_fuid_dirty; - if (fuid_dirtied) - zfs_fuid_txhold(zfsvfs, tx); - - zfs_sa_upgrade_txholds(tx, zp); - - err = dmu_tx_assign(tx, TXG_WAIT); - if (err) - goto out; - - count = 0; - /* - * Set each attribute requested. - * We group settings according to the locks they need to acquire. - * - * Note: you cannot set ctime directly, although it will be - * updated as a side-effect of calling this function. - */ - - if (projid != ZFS_INVALID_PROJID && !(zp->z_pflags & ZFS_PROJID)) { - /* - * For the existed object that is upgraded from old system, - * its on-disk layout has no slot for the project ID attribute. 
- * But quota accounting logic needs to access related slots by - * offset directly. So we need to adjust old objects' layout - * to make the project ID to some unified and fixed offset. - */ - if (attrzp) - err = sa_add_projid(attrzp->z_sa_hdl, tx, projid); - if (err == 0) - err = sa_add_projid(zp->z_sa_hdl, tx, projid); - - if (unlikely(err == EEXIST)) - err = 0; - else if (err != 0) - goto out; - else - projid = ZFS_INVALID_PROJID; - } - - if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) - mutex_enter(&zp->z_acl_lock); - mutex_enter(&zp->z_lock); - - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, - &zp->z_pflags, sizeof (zp->z_pflags)); - - if (attrzp) { - if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) - mutex_enter(&attrzp->z_acl_lock); - mutex_enter(&attrzp->z_lock); - SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, - SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags, - sizeof (attrzp->z_pflags)); - if (projid != ZFS_INVALID_PROJID) { - attrzp->z_projid = projid; - SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, - SA_ZPL_PROJID(zfsvfs), NULL, &attrzp->z_projid, - sizeof (attrzp->z_projid)); - } - } - - if (mask & (ATTR_UID|ATTR_GID)) { - - if (mask & ATTR_UID) { - ZTOI(zp)->i_uid = SUID_TO_KUID(new_kuid); - new_uid = zfs_uid_read(ZTOI(zp)); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, - &new_uid, sizeof (new_uid)); - if (attrzp) { - SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, - SA_ZPL_UID(zfsvfs), NULL, &new_uid, - sizeof (new_uid)); - ZTOI(attrzp)->i_uid = SUID_TO_KUID(new_uid); - } - } - - if (mask & ATTR_GID) { - ZTOI(zp)->i_gid = SGID_TO_KGID(new_kgid); - new_gid = zfs_gid_read(ZTOI(zp)); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), - NULL, &new_gid, sizeof (new_gid)); - if (attrzp) { - SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, - SA_ZPL_GID(zfsvfs), NULL, &new_gid, - sizeof (new_gid)); - ZTOI(attrzp)->i_gid = SGID_TO_KGID(new_kgid); - } - } - if (!(mask & ATTR_MODE)) { - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), - NULL, &new_mode, sizeof (new_mode)); - new_mode = zp->z_mode; - } - err = zfs_acl_chown_setattr(zp); - ASSERT(err == 0); - if (attrzp) { - err = zfs_acl_chown_setattr(attrzp); - ASSERT(err == 0); - } - } - - if (mask & ATTR_MODE) { - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, - &new_mode, sizeof (new_mode)); - zp->z_mode = ZTOI(zp)->i_mode = new_mode; - ASSERT3P(aclp, !=, NULL); - err = zfs_aclset_common(zp, aclp, cr, tx); - ASSERT0(err); - if (zp->z_acl_cached) - zfs_acl_free(zp->z_acl_cached); - zp->z_acl_cached = aclp; - aclp = NULL; - } - - if ((mask & ATTR_ATIME) || zp->z_atime_dirty) { - zp->z_atime_dirty = B_FALSE; - ZFS_TIME_ENCODE(&ip->i_atime, atime); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, - &atime, sizeof (atime)); - } - - if (mask & (ATTR_MTIME | ATTR_SIZE)) { - ZFS_TIME_ENCODE(&vap->va_mtime, mtime); - ZTOI(zp)->i_mtime = zpl_inode_timestamp_truncate( - vap->va_mtime, ZTOI(zp)); - - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, - mtime, sizeof (mtime)); - } - - if (mask & (ATTR_CTIME | ATTR_SIZE)) { - ZFS_TIME_ENCODE(&vap->va_ctime, ctime); - ZTOI(zp)->i_ctime = zpl_inode_timestamp_truncate(vap->va_ctime, - ZTOI(zp)); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, - ctime, sizeof (ctime)); - } - - if (projid != ZFS_INVALID_PROJID) { - zp->z_projid = projid; - SA_ADD_BULK_ATTR(bulk, count, - SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid, - sizeof (zp->z_projid)); - } - - if (attrzp && mask) { - SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, - SA_ZPL_CTIME(zfsvfs), NULL, &ctime, - sizeof (ctime)); - 
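/*
 * Editor's sketch: SA_ADD_BULK_ATTR above only appends an (attr, buf, len)
 * record to a local array; nothing is written until sa_bulk_update()
 * submits the whole batch inside the one open transaction. A toy version
 * of that accumulate-then-flush idiom (bulk_rec_t and bulk_add are made
 * up, not the SA framework):
 */
#include <stddef.h>
#include <assert.h>

typedef struct bulk_rec {
	int	br_attr;	/* which attribute */
	void	*br_buf;	/* where the new value lives */
	size_t	br_len;		/* its size */
} bulk_rec_t;

static void
bulk_add(bulk_rec_t *tab, int *count, int max, int attr, void *buf, size_t len)
{
	assert(*count < max);
	tab[*count].br_attr = attr;
	tab[*count].br_buf = buf;
	tab[*count].br_len = len;
	(*count)++;		/* deferred: flushed later in one bulk update */
}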
} - - /* - * Do this after setting timestamps to prevent timestamp - * update from toggling bit - */ - - if (xoap && (mask & ATTR_XVATTR)) { - - /* - * restore trimmed off masks - * so that return masks can be set for caller. - */ - - if (XVA_ISSET_REQ(tmpxvattr, XAT_APPENDONLY)) { - XVA_SET_REQ(xvap, XAT_APPENDONLY); - } - if (XVA_ISSET_REQ(tmpxvattr, XAT_NOUNLINK)) { - XVA_SET_REQ(xvap, XAT_NOUNLINK); - } - if (XVA_ISSET_REQ(tmpxvattr, XAT_IMMUTABLE)) { - XVA_SET_REQ(xvap, XAT_IMMUTABLE); - } - if (XVA_ISSET_REQ(tmpxvattr, XAT_NODUMP)) { - XVA_SET_REQ(xvap, XAT_NODUMP); - } - if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_MODIFIED)) { - XVA_SET_REQ(xvap, XAT_AV_MODIFIED); - } - if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_QUARANTINED)) { - XVA_SET_REQ(xvap, XAT_AV_QUARANTINED); - } - if (XVA_ISSET_REQ(tmpxvattr, XAT_PROJINHERIT)) { - XVA_SET_REQ(xvap, XAT_PROJINHERIT); - } - - if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) - ASSERT(S_ISREG(ip->i_mode)); - - zfs_xvattr_set(zp, xvap, tx); - } - - if (fuid_dirtied) - zfs_fuid_sync(zfsvfs, tx); - - if (mask != 0) - zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp); - - mutex_exit(&zp->z_lock); - if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) - mutex_exit(&zp->z_acl_lock); - - if (attrzp) { - if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) - mutex_exit(&attrzp->z_acl_lock); - mutex_exit(&attrzp->z_lock); - } -out: - if (err == 0 && xattr_count > 0) { - err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk, - xattr_count, tx); - ASSERT(err2 == 0); - } - - if (aclp) - zfs_acl_free(aclp); - - if (fuidp) { - zfs_fuid_info_free(fuidp); - fuidp = NULL; - } - - if (err) { - dmu_tx_abort(tx); - if (attrzp) - zrele(attrzp); - if (err == ERESTART) - goto top; - } else { - if (count > 0) - err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); - dmu_tx_commit(tx); - if (attrzp) { - if (err2 == 0 && handle_eadir) - err2 = zfs_setattr_dir(attrzp); - zrele(attrzp); - } - zfs_inode_update(zp); - } - -out2: - if (os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, 0); - -out3: - kmem_free(xattr_bulk, sizeof (sa_bulk_attr_t) * bulks); - kmem_free(bulk, sizeof (sa_bulk_attr_t) * bulks); - kmem_free(tmpxvattr, sizeof (xvattr_t)); - ZFS_EXIT(zfsvfs); - return (err); -} - -typedef struct zfs_zlock { - krwlock_t *zl_rwlock; /* lock we acquired */ - znode_t *zl_znode; /* znode we held */ - struct zfs_zlock *zl_next; /* next in list */ -} zfs_zlock_t; - -/* - * Drop locks and release vnodes that were held by zfs_rename_lock(). - */ -static void -zfs_rename_unlock(zfs_zlock_t **zlpp) -{ - zfs_zlock_t *zl; - - while ((zl = *zlpp) != NULL) { - if (zl->zl_znode != NULL) - zfs_zrele_async(zl->zl_znode); - rw_exit(zl->zl_rwlock); - *zlpp = zl->zl_next; - kmem_free(zl, sizeof (*zl)); - } -} - -/* - * Search back through the directory tree, using the ".." entries. - * Lock each directory in the chain to prevent concurrent renames. - * Fail any attempt to move a directory into one of its own descendants. - * XXX - z_parent_lock can overlap with map or grow locks - */ -static int -zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp) -{ - zfs_zlock_t *zl; - znode_t *zp = tdzp; - uint64_t rootid = ZTOZSB(zp)->z_root; - uint64_t oidp = zp->z_id; - krwlock_t *rwlp = &szp->z_parent_lock; - krw_t rw = RW_WRITER; - - /* - * First pass write-locks szp and compares to zp->z_id. - * Later passes read-lock zp and compare to zp->z_parent. - */ - do { - if (!rw_tryenter(rwlp, rw)) { - /* - * Another thread is renaming in this path. 
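/*
 * Editor's sketch: zfs_rename_unlock() above walks a singly-linked list of
 * lock records, releasing each lock (and any held znode) and freeing the
 * record as it goes. The same unwind with POSIX rwlocks standing in for
 * krwlock_t; the zl_* naming mirrors the original but the types are
 * illustrative.
 */
#include <pthread.h>
#include <stdlib.h>

typedef struct zlock {
	pthread_rwlock_t *zl_rwlock;	/* lock we acquired */
	struct zlock	*zl_next;	/* next record in the chain */
} zlock_t;

static void
unwind_locks(zlock_t **zlpp)
{
	zlock_t *zl;

	while ((zl = *zlpp) != NULL) {
		pthread_rwlock_unlock(zl->zl_rwlock);
		*zlpp = zl->zl_next;
		free(zl);
	}
}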
- * Note that if we are a WRITER, we don't have any - * parent_locks held yet. - */ - if (rw == RW_READER && zp->z_id > szp->z_id) { - /* - * Drop our locks and restart - */ - zfs_rename_unlock(&zl); - *zlpp = NULL; - zp = tdzp; - oidp = zp->z_id; - rwlp = &szp->z_parent_lock; - rw = RW_WRITER; - continue; - } else { - /* - * Wait for other thread to drop its locks - */ - rw_enter(rwlp, rw); - } - } - - zl = kmem_alloc(sizeof (*zl), KM_SLEEP); - zl->zl_rwlock = rwlp; - zl->zl_znode = NULL; - zl->zl_next = *zlpp; - *zlpp = zl; - - if (oidp == szp->z_id) /* We're a descendant of szp */ - return (SET_ERROR(EINVAL)); - - if (oidp == rootid) /* We've hit the top */ - return (0); - - if (rw == RW_READER) { /* i.e. not the first pass */ - int error = zfs_zget(ZTOZSB(zp), oidp, &zp); - if (error) - return (error); - zl->zl_znode = zp; - } - (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(ZTOZSB(zp)), - &oidp, sizeof (oidp)); - rwlp = &zp->z_parent_lock; - rw = RW_READER; - - } while (zp->z_id != sdzp->z_id); - - return (0); -} - -/* - * Move an entry from the provided source directory to the target - * directory. Change the entry name as indicated. - * - * IN: sdzp - Source directory containing the "old entry". - * snm - Old entry name. - * tdzp - Target directory to contain the "new entry". - * tnm - New entry name. - * cr - credentials of caller. - * flags - case flags - * - * RETURN: 0 on success, error code on failure. - * - * Timestamps: - * sdzp,tdzp - ctime|mtime updated - */ -/*ARGSUSED*/ -int -zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm, - cred_t *cr, int flags) -{ - znode_t *szp, *tzp; - zfsvfs_t *zfsvfs = ZTOZSB(sdzp); - zilog_t *zilog; - zfs_dirlock_t *sdl, *tdl; - dmu_tx_t *tx; - zfs_zlock_t *zl; - int cmp, serr, terr; - int error = 0; - int zflg = 0; - boolean_t waited = B_FALSE; - - if (snm == NULL || tnm == NULL) - return (SET_ERROR(EINVAL)); - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(sdzp); - zilog = zfsvfs->z_log; - - ZFS_VERIFY_ZP(tdzp); - - /* - * We check i_sb because snapshots and the ctldir must have different - * super blocks. - */ - if (ZTOI(tdzp)->i_sb != ZTOI(sdzp)->i_sb || - zfsctl_is_node(ZTOI(tdzp))) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EXDEV)); - } - - if (zfsvfs->z_utf8 && u8_validate(tnm, - strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EILSEQ)); - } - - if (flags & FIGNORECASE) - zflg |= ZCILOOK; - -top: - szp = NULL; - tzp = NULL; - zl = NULL; - - /* - * This is to prevent the creation of links into attribute space - * by renaming a linked file into/outof an attribute directory. - * See the comment in zfs_link() for why this is considered bad. - */ - if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } - - /* - * Lock source and target directory entries. To prevent deadlock, - * a lock ordering must be defined. We lock the directory with - * the smallest object id first, or if it's a tie, the one with - * the lexically first name. - */ - if (sdzp->z_id < tdzp->z_id) { - cmp = -1; - } else if (sdzp->z_id > tdzp->z_id) { - cmp = 1; - } else { - /* - * First compare the two name arguments without - * considering any case folding. 
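/*
 * Editor's sketch: the restart path above ("drop our locks and restart")
 * is the classic deadlock-avoidance move -- when a try-lock fails, back
 * out everything already held and begin again rather than blocking while
 * holding part of the chain. Illustrative standalone version using POSIX
 * rwlocks; the real code additionally ranks the two competing renames by
 * object id to decide who restarts.
 */
#include <pthread.h>
#include <sched.h>

static void
lock_chain_or_restart(pthread_rwlock_t **locks, int n)
{
	for (;;) {
		int got = 0;

		while (got < n && pthread_rwlock_tryrdlock(locks[got]) == 0)
			got++;
		if (got == n)
			return;			/* whole chain held */
		while (got-- > 0)		/* back out, then retry */
			pthread_rwlock_unlock(locks[got]);
		sched_yield();
	}
}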
- */ - int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER); - - cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error); - ASSERT(error == 0 || !zfsvfs->z_utf8); - if (cmp == 0) { - /* - * POSIX: "If the old argument and the new argument - * both refer to links to the same existing file, - * the rename() function shall return successfully - * and perform no other action." - */ - ZFS_EXIT(zfsvfs); - return (0); - } - /* - * If the file system is case-folding, then we may - * have some more checking to do. A case-folding file - * system is either supporting mixed case sensitivity - * access or is completely case-insensitive. Note - * that the file system is always case preserving. - * - * In mixed sensitivity mode case sensitive behavior - * is the default. FIGNORECASE must be used to - * explicitly request case insensitive behavior. - * - * If the source and target names provided differ only - * by case (e.g., a request to rename 'tim' to 'Tim'), - * we will treat this as a special case in the - * case-insensitive mode: as long as the source name - * is an exact match, we will allow this to proceed as - * a name-change request. - */ - if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE || - (zfsvfs->z_case == ZFS_CASE_MIXED && - flags & FIGNORECASE)) && - u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST, - &error) == 0) { - /* - * case preserving rename request, require exact - * name matches - */ - zflg |= ZCIEXACT; - zflg &= ~ZCILOOK; - } - } - - /* - * If the source and destination directories are the same, we should - * grab the z_name_lock of that directory only once. - */ - if (sdzp == tdzp) { - zflg |= ZHAVELOCK; - rw_enter(&sdzp->z_name_lock, RW_READER); - } - - if (cmp < 0) { - serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, - ZEXISTS | zflg, NULL, NULL); - terr = zfs_dirent_lock(&tdl, - tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL); - } else { - terr = zfs_dirent_lock(&tdl, - tdzp, tnm, &tzp, zflg, NULL, NULL); - serr = zfs_dirent_lock(&sdl, - sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg, - NULL, NULL); - } - - if (serr) { - /* - * Source entry invalid or not there. - */ - if (!terr) { - zfs_dirent_unlock(tdl); - if (tzp) - zrele(tzp); - } - - if (sdzp == tdzp) - rw_exit(&sdzp->z_name_lock); - - if (strcmp(snm, "..") == 0) - serr = EINVAL; - ZFS_EXIT(zfsvfs); - return (serr); - } - if (terr) { - zfs_dirent_unlock(sdl); - zrele(szp); - - if (sdzp == tdzp) - rw_exit(&sdzp->z_name_lock); - - if (strcmp(tnm, "..") == 0) - terr = EINVAL; - ZFS_EXIT(zfsvfs); - return (terr); - } - - /* - * If we are using project inheritance, means if the directory has - * ZFS_PROJINHERIT set, then its descendant directories will inherit - * not only the project ID, but also the ZFS_PROJINHERIT flag. Under - * such case, we only allow renames into our tree when the project - * IDs are the same. - */ - if (tdzp->z_pflags & ZFS_PROJINHERIT && - tdzp->z_projid != szp->z_projid) { - error = SET_ERROR(EXDEV); - goto out; - } - - /* - * Must have write access at the source to remove the old entry - * and write access at the target to create the new entry. - * Note that if target and source are the same, this can be - * done in a single check. - */ - - if ((error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))) - goto out; - - if (S_ISDIR(ZTOI(szp)->i_mode)) { - /* - * Check to make sure rename is valid. - * Can't do a move like this: /usr/a/b to /usr/a/b/c/d - */ - if ((error = zfs_rename_lock(szp, tdzp, sdzp, &zl))) - goto out; - } - - /* - * Does target exist? 
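/*
 * Editor's sketch: the cmp computation above defines a total order on the
 * two directories (lowest object id first, ties broken by name) so every
 * rename acquires its entry locks in the same sequence. Plain strcmp()
 * stands in here for the normalization-aware u8_strcmp() the real code
 * uses.
 */
#include <stdint.h>
#include <string.h>

/* < 0: lock source first; > 0: lock target first; 0: same entry. */
static int
rename_lock_order(uint64_t src_id, const char *snm,
    uint64_t tgt_id, const char *tnm)
{
	if (src_id < tgt_id)
		return (-1);
	if (src_id > tgt_id)
		return (1);
	return (strcmp(snm, tnm));
}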
- */ - if (tzp) { - /* - * Source and target must be the same type. - */ - if (S_ISDIR(ZTOI(szp)->i_mode)) { - if (!S_ISDIR(ZTOI(tzp)->i_mode)) { - error = SET_ERROR(ENOTDIR); - goto out; - } - } else { - if (S_ISDIR(ZTOI(tzp)->i_mode)) { - error = SET_ERROR(EISDIR); - goto out; - } - } - /* - * POSIX dictates that when the source and target - * entries refer to the same file object, rename - * must do nothing and exit without error. - */ - if (szp->z_id == tzp->z_id) { - error = 0; - goto out; - } - } - - tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); - dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE); - dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm); - dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm); - if (sdzp != tdzp) { - dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE); - zfs_sa_upgrade_txholds(tx, tdzp); - } - if (tzp) { - dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE); - zfs_sa_upgrade_txholds(tx, tzp); - } - - zfs_sa_upgrade_txholds(tx, szp); - dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); - error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); - if (error) { - if (zl != NULL) - zfs_rename_unlock(&zl); - zfs_dirent_unlock(sdl); - zfs_dirent_unlock(tdl); - - if (sdzp == tdzp) - rw_exit(&sdzp->z_name_lock); - - if (error == ERESTART) { - waited = B_TRUE; - dmu_tx_wait(tx); - dmu_tx_abort(tx); - zrele(szp); - if (tzp) - zrele(tzp); - goto top; - } - dmu_tx_abort(tx); - zrele(szp); - if (tzp) - zrele(tzp); - ZFS_EXIT(zfsvfs); - return (error); - } - - if (tzp) /* Attempt to remove the existing target */ - error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL); - - if (error == 0) { - error = zfs_link_create(tdl, szp, tx, ZRENAMING); - if (error == 0) { - szp->z_pflags |= ZFS_AV_MODIFIED; - if (tdzp->z_pflags & ZFS_PROJINHERIT) - szp->z_pflags |= ZFS_PROJINHERIT; - - error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), - (void *)&szp->z_pflags, sizeof (uint64_t), tx); - ASSERT0(error); - - error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL); - if (error == 0) { - zfs_log_rename(zilog, tx, TX_RENAME | - (flags & FIGNORECASE ? TX_CI : 0), sdzp, - sdl->dl_name, tdzp, tdl->dl_name, szp); - } else { - /* - * At this point, we have successfully created - * the target name, but have failed to remove - * the source name. Since the create was done - * with the ZRENAMING flag, there are - * complications; for one, the link count is - * wrong. The easiest way to deal with this - * is to remove the newly created target, and - * return the original error. This must - * succeed; fortunately, it is very unlikely to - * fail, since we just created it. - */ - VERIFY3U(zfs_link_destroy(tdl, szp, tx, - ZRENAMING, NULL), ==, 0); - } - } else { - /* - * If we had removed the existing target, subsequent - * call to zfs_link_create() to add back the same entry - * but, the new dnode (szp) should not fail. - */ - ASSERT(tzp == NULL); - } - } - - dmu_tx_commit(tx); -out: - if (zl != NULL) - zfs_rename_unlock(&zl); - - zfs_dirent_unlock(sdl); - zfs_dirent_unlock(tdl); - - zfs_inode_update(sdzp); - if (sdzp == tdzp) - rw_exit(&sdzp->z_name_lock); - - if (sdzp != tdzp) - zfs_inode_update(tdzp); - - zfs_inode_update(szp); - zrele(szp); - if (tzp) { - zfs_inode_update(tzp); - zrele(tzp); - } - - if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, 0); - - ZFS_EXIT(zfsvfs); - return (error); -} - -/* - * Insert the indicated symbolic reference entry into the directory. - * - * IN: dzp - Directory to contain new symbolic link. 
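/*
 * Editor's sketch: the dmu_tx_assign() failure path above is the standard
 * ZPL retry shape -- on ERESTART, drop every lock, wait for the txg, abort
 * the tx, and rebuild from "top:", passing TXG_NOTHROTTLE on later tries.
 * Skeleton of that control flow only; tx_assign() and friends are trivial
 * stand-ins, not DMU calls.
 */
#include <errno.h>
#include <stdbool.h>

#ifndef ERESTART
#define	ERESTART	85	/* fallback definition for this sketch */
#endif

static int  tx_assign(bool nothrottle) { (void) nothrottle; return (0); }
static void tx_wait(void) { }
static void tx_abort(void) { }
static void drop_all_locks(void) { }

static int
assign_with_retry(void)
{
	bool waited = false;
	int err;
top:
	err = tx_assign(waited);
	if (err != 0) {
		drop_all_locks();
		if (err == ERESTART) {
			waited = true;	/* disables throttling next time */
			tx_wait();
			tx_abort();
			goto top;	/* retake locks, rebuild the tx */
		}
		tx_abort();
		return (err);		/* really out of space */
	}
	return (0);
}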
- * name - Name of directory entry in dip. - * vap - Attributes of new entry. - * link - Name for new symlink entry. - * cr - credentials of caller. - * flags - case flags - * - * OUT: zpp - Znode for new symbolic link. - * - * RETURN: 0 on success, error code on failure. - * - * Timestamps: - * dip - ctime|mtime updated - */ -/*ARGSUSED*/ -int -zfs_symlink(znode_t *dzp, char *name, vattr_t *vap, char *link, - znode_t **zpp, cred_t *cr, int flags) -{ - znode_t *zp; - zfs_dirlock_t *dl; - dmu_tx_t *tx; - zfsvfs_t *zfsvfs = ZTOZSB(dzp); - zilog_t *zilog; - uint64_t len = strlen(link); - int error; - int zflg = ZNEW; - zfs_acl_ids_t acl_ids; - boolean_t fuid_dirtied; - uint64_t txtype = TX_SYMLINK; - boolean_t waited = B_FALSE; - - ASSERT(S_ISLNK(vap->va_mode)); - - if (name == NULL) - return (SET_ERROR(EINVAL)); - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(dzp); - zilog = zfsvfs->z_log; - - if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), - NULL, U8_VALIDATE_ENTIRE, &error) < 0) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EILSEQ)); - } - if (flags & FIGNORECASE) - zflg |= ZCILOOK; - - if (len > MAXPATHLEN) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(ENAMETOOLONG)); - } - - if ((error = zfs_acl_ids_create(dzp, 0, - vap, cr, NULL, &acl_ids)) != 0) { - ZFS_EXIT(zfsvfs); - return (error); - } -top: - *zpp = NULL; - - /* - * Attempt to lock directory; fail if entry already exists. - */ - error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL); - if (error) { - zfs_acl_ids_free(&acl_ids); - ZFS_EXIT(zfsvfs); - return (error); - } - - if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) { - zfs_acl_ids_free(&acl_ids); - zfs_dirent_unlock(dl); - ZFS_EXIT(zfsvfs); - return (error); - } - - if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, ZFS_DEFAULT_PROJID)) { - zfs_acl_ids_free(&acl_ids); - zfs_dirent_unlock(dl); - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EDQUOT)); - } - tx = dmu_tx_create(zfsvfs->z_os); - fuid_dirtied = zfsvfs->z_fuid_dirty; - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len)); - dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); - dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + - ZFS_SA_BASE_ATTR_SIZE + len); - dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); - if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, - acl_ids.z_aclp->z_acl_bytes); - } - if (fuid_dirtied) - zfs_fuid_txhold(zfsvfs, tx); - error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); - if (error) { - zfs_dirent_unlock(dl); - if (error == ERESTART) { - waited = B_TRUE; - dmu_tx_wait(tx); - dmu_tx_abort(tx); - goto top; - } - zfs_acl_ids_free(&acl_ids); - dmu_tx_abort(tx); - ZFS_EXIT(zfsvfs); - return (error); - } - - /* - * Create a new object for the symlink. - * for version 4 ZPL datsets the symlink will be an SA attribute - */ - zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); - - if (fuid_dirtied) - zfs_fuid_sync(zfsvfs, tx); - - mutex_enter(&zp->z_lock); - if (zp->z_is_sa) - error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs), - link, len, tx); - else - zfs_sa_symlink(zp, link, len, tx); - mutex_exit(&zp->z_lock); - - zp->z_size = len; - (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), - &zp->z_size, sizeof (zp->z_size), tx); - /* - * Insert the new object into the directory. 
- */ - error = zfs_link_create(dl, zp, tx, ZNEW); - if (error != 0) { - zfs_znode_delete(zp, tx); - remove_inode_hash(ZTOI(zp)); - } else { - if (flags & FIGNORECASE) - txtype |= TX_CI; - zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link); - - zfs_inode_update(dzp); - zfs_inode_update(zp); - } - - zfs_acl_ids_free(&acl_ids); - - dmu_tx_commit(tx); - - zfs_dirent_unlock(dl); - - if (error == 0) { - *zpp = zp; - - if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, 0); - } else { - zrele(zp); - } - - ZFS_EXIT(zfsvfs); - return (error); -} - -/* - * Return, in the buffer contained in the provided uio structure, - * the symbolic path referred to by ip. - * - * IN: ip - inode of symbolic link - * uio - structure to contain the link path. - * cr - credentials of caller. - * - * RETURN: 0 if success - * error code if failure - * - * Timestamps: - * ip - atime updated - */ -/* ARGSUSED */ -int -zfs_readlink(struct inode *ip, uio_t *uio, cred_t *cr) -{ - znode_t *zp = ITOZ(ip); - zfsvfs_t *zfsvfs = ITOZSB(ip); - int error; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - mutex_enter(&zp->z_lock); - if (zp->z_is_sa) - error = sa_lookup_uio(zp->z_sa_hdl, - SA_ZPL_SYMLINK(zfsvfs), uio); - else - error = zfs_sa_readlink(zp, uio); - mutex_exit(&zp->z_lock); - - ZFS_EXIT(zfsvfs); - return (error); -} - -/* - * Insert a new entry into directory tdzp referencing szp. - * - * IN: tdzp - Directory to contain new entry. - * szp - znode of new entry. - * name - name of new entry. - * cr - credentials of caller. - * flags - case flags. - * - * RETURN: 0 if success - * error code if failure - * - * Timestamps: - * tdzp - ctime|mtime updated - * szp - ctime updated - */ -/* ARGSUSED */ -int -zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr, - int flags) -{ - struct inode *sip = ZTOI(szp); - znode_t *tzp; - zfsvfs_t *zfsvfs = ZTOZSB(tdzp); - zilog_t *zilog; - zfs_dirlock_t *dl; - dmu_tx_t *tx; - int error; - int zf = ZNEW; - uint64_t parent; - uid_t owner; - boolean_t waited = B_FALSE; - boolean_t is_tmpfile = 0; - uint64_t txg; -#ifdef HAVE_TMPFILE - is_tmpfile = (sip->i_nlink == 0 && (sip->i_state & I_LINKABLE)); -#endif - ASSERT(S_ISDIR(ZTOI(tdzp)->i_mode)); - - if (name == NULL) - return (SET_ERROR(EINVAL)); - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(tdzp); - zilog = zfsvfs->z_log; - - /* - * POSIX dictates that we return EPERM here. - * Better choices include ENOTSUP or EISDIR. - */ - if (S_ISDIR(sip->i_mode)) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EPERM)); - } - - ZFS_VERIFY_ZP(szp); - - /* - * If we are using project inheritance, means if the directory has - * ZFS_PROJINHERIT set, then its descendant directories will inherit - * not only the project ID, but also the ZFS_PROJINHERIT flag. Under - * such case, we only allow hard link creation in our tree when the - * project IDs are the same. - */ - if (tdzp->z_pflags & ZFS_PROJINHERIT && - tdzp->z_projid != szp->z_projid) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EXDEV)); - } - - /* - * We check i_sb because snapshots and the ctldir must have different - * super blocks. 
- */ - if (sip->i_sb != ZTOI(tdzp)->i_sb || zfsctl_is_node(sip)) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EXDEV)); - } - - /* Prevent links to .zfs/shares files */ - - if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), - &parent, sizeof (uint64_t))) != 0) { - ZFS_EXIT(zfsvfs); - return (error); - } - if (parent == zfsvfs->z_shares_dir) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EPERM)); - } - - if (zfsvfs->z_utf8 && u8_validate(name, - strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EILSEQ)); - } - if (flags & FIGNORECASE) - zf |= ZCILOOK; - - /* - * We do not support links between attributes and non-attributes - * because of the potential security risk of creating links - * into "normal" file space in order to circumvent restrictions - * imposed in attribute space. - */ - if ((szp->z_pflags & ZFS_XATTR) != (tdzp->z_pflags & ZFS_XATTR)) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } - - owner = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(sip->i_uid), - cr, ZFS_OWNER); - if (owner != crgetuid(cr) && secpolicy_basic_link(cr) != 0) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EPERM)); - } - - if ((error = zfs_zaccess(tdzp, ACE_ADD_FILE, 0, B_FALSE, cr))) { - ZFS_EXIT(zfsvfs); - return (error); - } - -top: - /* - * Attempt to lock directory; fail if entry already exists. - */ - error = zfs_dirent_lock(&dl, tdzp, name, &tzp, zf, NULL, NULL); - if (error) { - ZFS_EXIT(zfsvfs); - return (error); - } - - tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); - dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, name); - if (is_tmpfile) - dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); - - zfs_sa_upgrade_txholds(tx, szp); - zfs_sa_upgrade_txholds(tx, tdzp); - error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); - if (error) { - zfs_dirent_unlock(dl); - if (error == ERESTART) { - waited = B_TRUE; - dmu_tx_wait(tx); - dmu_tx_abort(tx); - goto top; - } - dmu_tx_abort(tx); - ZFS_EXIT(zfsvfs); - return (error); - } - /* unmark z_unlinked so zfs_link_create will not reject */ - if (is_tmpfile) - szp->z_unlinked = B_FALSE; - error = zfs_link_create(dl, szp, tx, 0); - - if (error == 0) { - uint64_t txtype = TX_LINK; - /* - * tmpfile is created to be in z_unlinkedobj, so remove it. - * Also, we don't log in ZIL, because all previous file - * operation on the tmpfile are ignored by ZIL. Instead we - * always wait for txg to sync to make sure all previous - * operation are sync safe. - */ - if (is_tmpfile) { - VERIFY(zap_remove_int(zfsvfs->z_os, - zfsvfs->z_unlinkedobj, szp->z_id, tx) == 0); - } else { - if (flags & FIGNORECASE) - txtype |= TX_CI; - zfs_log_link(zilog, tx, txtype, tdzp, szp, name); - } - } else if (is_tmpfile) { - /* restore z_unlinked since when linking failed */ - szp->z_unlinked = B_TRUE; - } - txg = dmu_tx_get_txg(tx); - dmu_tx_commit(tx); - - zfs_dirent_unlock(dl); - - if (!is_tmpfile && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, 0); - - if (is_tmpfile && zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) - txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), txg); - - zfs_inode_update(tdzp); - zfs_inode_update(szp); - ZFS_EXIT(zfsvfs); - return (error); -} - -static void -zfs_putpage_commit_cb(void *arg) -{ - struct page *pp = arg; - - ClearPageError(pp); - end_page_writeback(pp); -} - -/* - * Push a page out to disk, once the page is on stable storage the - * registered commit callback will be run as notification of completion. - * - * IN: ip - page mapped for inode. 
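/*
 * Editor's sketch: zfs_link() above layers several independent gates
 * before doing any work: same superblock (else EXDEV), no links between
 * attribute and normal space (EINVAL), and only the owner -- or a caller
 * with link privilege -- may link someone else's file (EPERM). Condensed
 * standalone version; the struct and its field names are illustrative.
 */
#include <errno.h>
#include <stdbool.h>
#include <stdint.h>

struct linkargs {
	uintptr_t src_sb, dst_sb;	/* superblock identities */
	bool src_xattr, dst_xattr;	/* in attribute space? */
	uint32_t owner, caller;		/* uids */
	bool caller_has_link_priv;
};

static int
link_allowed(const struct linkargs *a)
{
	if (a->src_sb != a->dst_sb)
		return (EXDEV);
	if (a->src_xattr != a->dst_xattr)
		return (EINVAL);
	if (a->owner != a->caller && !a->caller_has_link_priv)
		return (EPERM);
	return (0);
}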
- * pp - page to push (page is locked) - * wbc - writeback control data - * - * RETURN: 0 if success - * error code if failure - * - * Timestamps: - * ip - ctime|mtime updated - */ -/* ARGSUSED */ -int -zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc) -{ - znode_t *zp = ITOZ(ip); - zfsvfs_t *zfsvfs = ITOZSB(ip); - loff_t offset; - loff_t pgoff; - unsigned int pglen; - dmu_tx_t *tx; - caddr_t va; - int err = 0; - uint64_t mtime[2], ctime[2]; - sa_bulk_attr_t bulk[3]; - int cnt = 0; - struct address_space *mapping; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - ASSERT(PageLocked(pp)); - - pgoff = page_offset(pp); /* Page byte-offset in file */ - offset = i_size_read(ip); /* File length in bytes */ - pglen = MIN(PAGE_SIZE, /* Page length in bytes */ - P2ROUNDUP(offset, PAGE_SIZE)-pgoff); - - /* Page is beyond end of file */ - if (pgoff >= offset) { - unlock_page(pp); - ZFS_EXIT(zfsvfs); - return (0); - } - - /* Truncate page length to end of file */ - if (pgoff + pglen > offset) - pglen = offset - pgoff; - -#if 0 - /* - * FIXME: Allow mmap writes past its quota. The correct fix - * is to register a page_mkwrite() handler to count the page - * against its quota when it is about to be dirtied. - */ - if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, - KUID_TO_SUID(ip->i_uid)) || - zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, - KGID_TO_SGID(ip->i_gid)) || - (zp->z_projid != ZFS_DEFAULT_PROJID && - zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT, - zp->z_projid))) { - err = EDQUOT; - } -#endif - - /* - * The ordering here is critical and must adhere to the following - * rules in order to avoid deadlocking in either zfs_read() or - * zfs_free_range() due to a lock inversion. - * - * 1) The page must be unlocked prior to acquiring the range lock. - * This is critical because zfs_read() calls find_lock_page() - * which may block on the page lock while holding the range lock. - * - * 2) Before setting or clearing write back on a page the range lock - * must be held in order to prevent a lock inversion with the - * zfs_free_range() function. - * - * This presents a problem because upon entering this function the - * page lock is already held. To safely acquire the range lock the - * page lock must be dropped. This creates a window where another - * process could truncate, invalidate, dirty, or write out the page. - * - * Therefore, after successfully reacquiring the range and page locks - * the current page state is checked. In the common case everything - * will be as is expected and it can be written out. However, if - * the page state has changed it must be handled accordingly. 
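/*
 * Editor's sketch: the ordering rules above boil down to "never take the
 * range lock while holding the page lock; drop the page lock, take the
 * range lock, retake the page lock, then re-check that nothing changed in
 * the window". Standalone shape of that drop-and-revalidate pattern, with
 * pthread mutexes standing in for the page and range locks; the caller is
 * assumed to enter holding page_lk.
 */
#include <pthread.h>
#include <stdbool.h>

static bool
lock_both_and_revalidate(pthread_mutex_t *page_lk, pthread_mutex_t *range_lk,
    bool (*still_valid)(void))
{
	pthread_mutex_unlock(page_lk);	/* must drop before the range lock */
	pthread_mutex_lock(range_lk);
	pthread_mutex_lock(page_lk);
	if (!still_valid()) {		/* truncated/redirtied meanwhile? */
		pthread_mutex_unlock(page_lk);
		pthread_mutex_unlock(range_lk);
		return (false);		/* caller gives up on this page */
	}
	return (true);			/* both held, state re-verified */
}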
- */ - mapping = pp->mapping; - redirty_page_for_writepage(wbc, pp); - unlock_page(pp); - - zfs_locked_range_t *lr = zfs_rangelock_enter(&zp->z_rangelock, - pgoff, pglen, RL_WRITER); - lock_page(pp); - - /* Page mapping changed or it was no longer dirty, we're done */ - if (unlikely((mapping != pp->mapping) || !PageDirty(pp))) { - unlock_page(pp); - zfs_rangelock_exit(lr); - ZFS_EXIT(zfsvfs); - return (0); - } - - /* Another process started write block if required */ - if (PageWriteback(pp)) { - unlock_page(pp); - zfs_rangelock_exit(lr); - - if (wbc->sync_mode != WB_SYNC_NONE) { - if (PageWriteback(pp)) - wait_on_page_bit(pp, PG_writeback); - } - - ZFS_EXIT(zfsvfs); - return (0); - } - - /* Clear the dirty flag the required locks are held */ - if (!clear_page_dirty_for_io(pp)) { - unlock_page(pp); - zfs_rangelock_exit(lr); - ZFS_EXIT(zfsvfs); - return (0); - } - - /* - * Counterpart for redirty_page_for_writepage() above. This page - * was in fact not skipped and should not be counted as if it were. - */ - wbc->pages_skipped--; - set_page_writeback(pp); - unlock_page(pp); - - tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_write(tx, zp->z_id, pgoff, pglen); - dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); - zfs_sa_upgrade_txholds(tx, zp); - - err = dmu_tx_assign(tx, TXG_NOWAIT); - if (err != 0) { - if (err == ERESTART) - dmu_tx_wait(tx); - - dmu_tx_abort(tx); - __set_page_dirty_nobuffers(pp); - ClearPageError(pp); - end_page_writeback(pp); - zfs_rangelock_exit(lr); - ZFS_EXIT(zfsvfs); - return (err); - } - - va = kmap(pp); - ASSERT3U(pglen, <=, PAGE_SIZE); - dmu_write(zfsvfs->z_os, zp->z_id, pgoff, pglen, va, tx); - kunmap(pp); - - SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); - SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); - SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_FLAGS(zfsvfs), NULL, - &zp->z_pflags, 8); - - /* Preserve the mtime and ctime provided by the inode */ - ZFS_TIME_ENCODE(&ip->i_mtime, mtime); - ZFS_TIME_ENCODE(&ip->i_ctime, ctime); - zp->z_atime_dirty = B_FALSE; - zp->z_seq++; - - err = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx); - - zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, 0, - zfs_putpage_commit_cb, pp); - dmu_tx_commit(tx); - - zfs_rangelock_exit(lr); - - if (wbc->sync_mode != WB_SYNC_NONE) { - /* - * Note that this is rarely called under writepages(), because - * writepages() normally handles the entire commit for - * performance reasons. - */ - zil_commit(zfsvfs->z_log, zp->z_id); - } - - ZFS_EXIT(zfsvfs); - return (err); -} - -/* - * Update the system attributes when the inode has been dirtied. For the - * moment we only update the mode, atime, mtime, and ctime. - */ -int -zfs_dirty_inode(struct inode *ip, int flags) -{ - znode_t *zp = ITOZ(ip); - zfsvfs_t *zfsvfs = ITOZSB(ip); - dmu_tx_t *tx; - uint64_t mode, atime[2], mtime[2], ctime[2]; - sa_bulk_attr_t bulk[4]; - int error = 0; - int cnt = 0; - - if (zfs_is_readonly(zfsvfs) || dmu_objset_is_snapshot(zfsvfs->z_os)) - return (0); - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - -#ifdef I_DIRTY_TIME - /* - * This is the lazytime semantic introduced in Linux 4.0 - * This flag will only be called from update_time when lazytime is set. - * (Note, I_DIRTY_SYNC will also set if not lazytime) - * Fortunately mtime and ctime are managed within ZFS itself, so we - * only need to dirty atime. 
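/*
 * Editor's sketch: ZFS_TIME_ENCODE above packs a kernel timestamp into
 * the two-uint64 form carried by SA attributes (the bulk updates pass
 * "16" -- two 8-byte words -- as the length). A plausible standalone
 * encoder; the actual on-disk macro may differ in detail.
 */
#include <stdint.h>
#include <time.h>

static void
time_encode(const struct timespec *ts, uint64_t out[2])
{
	out[0] = (uint64_t)ts->tv_sec;	/* seconds word */
	out[1] = (uint64_t)ts->tv_nsec;	/* nanoseconds word */
}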
- */ - if (flags == I_DIRTY_TIME) { - zp->z_atime_dirty = B_TRUE; - goto out; - } -#endif - - tx = dmu_tx_create(zfsvfs->z_os); - - dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); - zfs_sa_upgrade_txholds(tx, zp); - - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - goto out; - } - - mutex_enter(&zp->z_lock); - zp->z_atime_dirty = B_FALSE; - - SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); - SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16); - SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); - SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); - - /* Preserve the mode, mtime and ctime provided by the inode */ - ZFS_TIME_ENCODE(&ip->i_atime, atime); - ZFS_TIME_ENCODE(&ip->i_mtime, mtime); - ZFS_TIME_ENCODE(&ip->i_ctime, ctime); - mode = ip->i_mode; - - zp->z_mode = mode; - - error = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx); - mutex_exit(&zp->z_lock); - - dmu_tx_commit(tx); -out: - ZFS_EXIT(zfsvfs); - return (error); -} - -/*ARGSUSED*/ -void -zfs_inactive(struct inode *ip) -{ - znode_t *zp = ITOZ(ip); - zfsvfs_t *zfsvfs = ITOZSB(ip); - uint64_t atime[2]; - int error; - int need_unlock = 0; - - /* Only read lock if we haven't already write locked, e.g. rollback */ - if (!RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)) { - need_unlock = 1; - rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); - } - if (zp->z_sa_hdl == NULL) { - if (need_unlock) - rw_exit(&zfsvfs->z_teardown_inactive_lock); - return; - } - - if (zp->z_atime_dirty && zp->z_unlinked == B_FALSE) { - dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); - - dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); - zfs_sa_upgrade_txholds(tx, zp); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - } else { - ZFS_TIME_ENCODE(&ip->i_atime, atime); - mutex_enter(&zp->z_lock); - (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs), - (void *)&atime, sizeof (atime), tx); - zp->z_atime_dirty = B_FALSE; - mutex_exit(&zp->z_lock); - dmu_tx_commit(tx); - } - } - - zfs_zinactive(zp); - if (need_unlock) - rw_exit(&zfsvfs->z_teardown_inactive_lock); -} - -/* - * Fill pages with data from the disk. - */ -static int -zfs_fillpage(struct inode *ip, struct page *pl[], int nr_pages) -{ - znode_t *zp = ITOZ(ip); - zfsvfs_t *zfsvfs = ITOZSB(ip); - objset_t *os; - struct page *cur_pp; - u_offset_t io_off, total; - size_t io_len; - loff_t i_size; - unsigned page_idx; - int err; - - os = zfsvfs->z_os; - io_len = nr_pages << PAGE_SHIFT; - i_size = i_size_read(ip); - io_off = page_offset(pl[0]); - - if (io_off + io_len > i_size) - io_len = i_size - io_off; - - /* - * Iterate over list of pages and read each page individually. - */ - page_idx = 0; - for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) { - caddr_t va; - - cur_pp = pl[page_idx++]; - va = kmap(cur_pp); - err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va, - DMU_READ_PREFETCH); - kunmap(cur_pp); - if (err) { - /* convert checksum errors into IO errors */ - if (err == ECKSUM) - err = SET_ERROR(EIO); - return (err); - } - } - - return (0); -} - -/* - * Uses zfs_fillpage to read data from the file and fill the pages. - * - * IN: ip - inode of file to get data from. - * pl - list of pages to read - * nr_pages - number of pages to read - * - * RETURN: 0 on success, error code on failure. 
- * - * Timestamps: - * vp - atime updated - */ -/* ARGSUSED */ -int -zfs_getpage(struct inode *ip, struct page *pl[], int nr_pages) -{ - znode_t *zp = ITOZ(ip); - zfsvfs_t *zfsvfs = ITOZSB(ip); - int err; - - if (pl == NULL) - return (0); - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - err = zfs_fillpage(ip, pl, nr_pages); - - ZFS_EXIT(zfsvfs); - return (err); -} - -/* - * Check ZFS specific permissions to memory map a section of a file. - * - * IN: ip - inode of the file to mmap - * off - file offset - * addrp - start address in memory region - * len - length of memory region - * vm_flags- address flags - * - * RETURN: 0 if success - * error code if failure - */ -/*ARGSUSED*/ -int -zfs_map(struct inode *ip, offset_t off, caddr_t *addrp, size_t len, - unsigned long vm_flags) -{ - znode_t *zp = ITOZ(ip); - zfsvfs_t *zfsvfs = ITOZSB(ip); - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - if ((vm_flags & VM_WRITE) && (zp->z_pflags & - (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EPERM)); - } - - if ((vm_flags & (VM_READ | VM_EXEC)) && - (zp->z_pflags & ZFS_AV_QUARANTINED)) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EACCES)); - } - - if (off < 0 || len > MAXOFFSET_T - off) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(ENXIO)); - } - - ZFS_EXIT(zfsvfs); - return (0); -} - -/* - * Free or allocate space in a file. Currently, this function only - * supports the `F_FREESP' command. However, this command is somewhat - * misnamed, as its functionality includes the ability to allocate as - * well as free space. - * - * IN: zp - znode of file to free data in. - * cmd - action to take (only F_FREESP supported). - * bfp - section of file to free/alloc. - * flag - current file open mode flags. - * offset - current file offset. - * cr - credentials of caller. - * - * RETURN: 0 on success, error code on failure. - * - * Timestamps: - * zp - ctime|mtime updated - */ -/* ARGSUSED */ -int -zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag, - offset_t offset, cred_t *cr) -{ - zfsvfs_t *zfsvfs = ZTOZSB(zp); - uint64_t off, len; - int error; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - if (cmd != F_FREESP) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } - - /* - * Callers might not be able to detect properly that we are read-only, - * so check it explicitly here. - */ - if (zfs_is_readonly(zfsvfs)) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EROFS)); - } - - if (bfp->l_len < 0) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } - - /* - * Permissions aren't checked on Solaris because on this OS - * zfs_space() can only be called with an opened file handle. - * On Linux we can get here through truncate_range() which - * operates directly on inodes, so we need to check access rights. 
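/*
 * Editor's sketch: zfs_map() above denies write mappings of files marked
 * immutable/read-only/append-only and denies read or execute mappings of
 * quarantined files. The checks are pure flag tests, roughly as follows
 * (the F_* constants are stand-ins for the ZFS_* pflags):
 */
#include <errno.h>
#include <stdint.h>

#define	F_IMMUTABLE	(1ULL << 0)
#define	F_READONLY	(1ULL << 1)
#define	F_APPENDONLY	(1ULL << 2)
#define	F_QUARANTINED	(1ULL << 3)

static int
map_allowed(uint64_t pflags, int want_write, int want_read_or_exec)
{
	if (want_write &&
	    (pflags & (F_IMMUTABLE | F_READONLY | F_APPENDONLY)))
		return (EPERM);
	if (want_read_or_exec && (pflags & F_QUARANTINED))
		return (EACCES);
	return (0);
}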
- */ - if ((error = zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr))) { - ZFS_EXIT(zfsvfs); - return (error); - } - - off = bfp->l_start; - len = bfp->l_len; /* 0 means from off to end of file */ - - error = zfs_freesp(zp, off, len, flag, TRUE); - - ZFS_EXIT(zfsvfs); - return (error); -} - -/*ARGSUSED*/ -int -zfs_fid(struct inode *ip, fid_t *fidp) -{ - znode_t *zp = ITOZ(ip); - zfsvfs_t *zfsvfs = ITOZSB(ip); - uint32_t gen; - uint64_t gen64; - uint64_t object = zp->z_id; - zfid_short_t *zfid; - int size, i, error; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), - &gen64, sizeof (uint64_t))) != 0) { - ZFS_EXIT(zfsvfs); - return (error); - } - - gen = (uint32_t)gen64; - - size = SHORT_FID_LEN; - - zfid = (zfid_short_t *)fidp; - - zfid->zf_len = size; - - for (i = 0; i < sizeof (zfid->zf_object); i++) - zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); - - /* Must have a non-zero generation number to distinguish from .zfs */ - if (gen == 0) - gen = 1; - for (i = 0; i < sizeof (zfid->zf_gen); i++) - zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i)); - - ZFS_EXIT(zfsvfs); - return (0); -} - -#if defined(_KERNEL) -EXPORT_SYMBOL(zfs_open); -EXPORT_SYMBOL(zfs_close); -EXPORT_SYMBOL(zfs_lookup); -EXPORT_SYMBOL(zfs_create); -EXPORT_SYMBOL(zfs_tmpfile); -EXPORT_SYMBOL(zfs_remove); -EXPORT_SYMBOL(zfs_mkdir); -EXPORT_SYMBOL(zfs_rmdir); -EXPORT_SYMBOL(zfs_readdir); -EXPORT_SYMBOL(zfs_getattr_fast); -EXPORT_SYMBOL(zfs_setattr); -EXPORT_SYMBOL(zfs_rename); -EXPORT_SYMBOL(zfs_symlink); -EXPORT_SYMBOL(zfs_readlink); -EXPORT_SYMBOL(zfs_link); -EXPORT_SYMBOL(zfs_inactive); -EXPORT_SYMBOL(zfs_space); -EXPORT_SYMBOL(zfs_fid); -EXPORT_SYMBOL(zfs_getpage); -EXPORT_SYMBOL(zfs_putpage); -EXPORT_SYMBOL(zfs_dirty_inode); -EXPORT_SYMBOL(zfs_map); - -/* BEGIN CSTYLED */ -module_param(zfs_delete_blocks, ulong, 0644); -MODULE_PARM_DESC(zfs_delete_blocks, "Delete files larger than N blocks async"); -/* END CSTYLED */ - -#endif diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c new file mode 100644 index 000000000000..3be387a30e5c --- /dev/null +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c @@ -0,0 +1,4000 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + * Copyright (c) 2015 by Chunwei Chen. All rights reserved. + * Copyright 2017 Nexenta Systems, Inc. 
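/*
 * Editor's sketch: zfs_fid() above serializes the object number and a
 * non-zero generation into the fid byte arrays one little-endian byte at
 * a time, so the layout is stable regardless of host endianness. The same
 * loop, standalone (field widths per zfid_short_t, as an assumption):
 */
#include <stdint.h>
#include <stddef.h>

static void
pack_le(uint8_t *dst, size_t n, uint64_t val)
{
	size_t i;

	for (i = 0; i < n; i++)
		dst[i] = (uint8_t)(val >> (8 * i));
}

/* Usage: pack_le(zf_object, 6, object); pack_le(zf_gen, 4, gen ? gen : 1); */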
+ */ + +/* Portions Copyright 2007 Jeremy Teo */ +/* Portions Copyright 2010 Robert Milkowski */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Programming rules. + * + * Each vnode op performs some logical unit of work. To do this, the ZPL must + * properly lock its in-core state, create a DMU transaction, do the work, + * record this work in the intent log (ZIL), commit the DMU transaction, + * and wait for the intent log to commit if it is a synchronous operation. + * Moreover, the vnode ops must work in both normal and log replay context. + * The ordering of events is important to avoid deadlocks and references + * to freed memory. The example below illustrates the following Big Rules: + * + * (1) A check must be made in each zfs thread for a mounted file system. + * This is done avoiding races using ZFS_ENTER(zfsvfs). + * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes + * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros + * can return EIO from the calling function. + * + * (2) zrele() should always be the last thing except for zil_commit() + * (if necessary) and ZFS_EXIT(). This is for 3 reasons: + * First, if it's the last reference, the vnode/znode + * can be freed, so the zp may point to freed memory. Second, the last + * reference will call zfs_zinactive(), which may induce a lot of work -- + * pushing cached pages (which acquires range locks) and syncing out + * cached atime changes. Third, zfs_zinactive() may require a new tx, + * which could deadlock the system if you were already holding one. + * If you must call zrele() within a tx then use zfs_zrele_async(). + * + * (3) All range locks must be grabbed before calling dmu_tx_assign(), + * as they can span dmu_tx_assign() calls. + * + * (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to + * dmu_tx_assign(). This is critical because we don't want to block + * while holding locks. + * + * If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT. This + * reduces lock contention and CPU usage when we must wait (note that if + * throughput is constrained by the storage, nearly every transaction + * must wait). + * + * Note, in particular, that if a lock is sometimes acquired before + * the tx assigns, and sometimes after (e.g. z_lock), then failing + * to use a non-blocking assign can deadlock the system. The scenario: + * + * Thread A has grabbed a lock before calling dmu_tx_assign(). + * Thread B is in an already-assigned tx, and blocks for this lock. + * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open() + * forever, because the previous txg can't quiesce until B's tx commits. + * + * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT, + * then drop all locks, call dmu_tx_wait(), and try again. On subsequent + * calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT, + * to indicate that this operation has already called dmu_tx_wait(). + * This will ensure that we don't retry forever, waiting a short bit + * each time. + * + * (5) If the operation succeeded, generate the intent log entry for it + * before dropping locks. 
This ensures that the ordering of events + * in the intent log matches the order in which they actually occurred. + * During ZIL replay the zfs_log_* functions will update the sequence + * number to indicate the zil transaction has replayed. + * + * (6) At the end of each vnode op, the DMU tx must always commit, + * regardless of whether there were any errors. + * + * (7) After dropping all locks, invoke zil_commit(zilog, foid) + * to ensure that synchronous semantics are provided when necessary. + * + * In general, this is how things should be ordered in each vnode op: + * + * ZFS_ENTER(zfsvfs); // exit if unmounted + * top: + * zfs_dirent_lock(&dl, ...) // lock directory entry (may igrab()) + * rw_enter(...); // grab any other locks you need + * tx = dmu_tx_create(...); // get DMU tx + * dmu_tx_hold_*(); // hold each object you might modify + * error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); + * if (error) { + * rw_exit(...); // drop locks + * zfs_dirent_unlock(dl); // unlock directory entry + * zrele(...); // release held znodes + * if (error == ERESTART) { + * waited = B_TRUE; + * dmu_tx_wait(tx); + * dmu_tx_abort(tx); + * goto top; + * } + * dmu_tx_abort(tx); // abort DMU tx + * ZFS_EXIT(zfsvfs); // finished in zfs + * return (error); // really out of space + * } + * error = do_real_work(); // do whatever this VOP does + * if (error == 0) + * zfs_log_*(...); // on success, make ZIL entry + * dmu_tx_commit(tx); // commit DMU tx -- error or not + * rw_exit(...); // drop locks + * zfs_dirent_unlock(dl); // unlock directory entry + * zrele(...); // release held znodes + * zil_commit(zilog, foid); // synchronous when necessary + * ZFS_EXIT(zfsvfs); // finished in zfs + * return (error); // done, report error + */ + +/* + * Virus scanning is unsupported. It would be possible to add a hook + * here to performance the required virus scan. This could be done + * entirely in the kernel or potentially as an update to invoke a + * scanning utility. + */ +static int +zfs_vscan(struct inode *ip, cred_t *cr, int async) +{ + return (0); +} + +/* ARGSUSED */ +int +zfs_open(struct inode *ip, int mode, int flag, cred_t *cr) +{ + znode_t *zp = ITOZ(ip); + zfsvfs_t *zfsvfs = ITOZSB(ip); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + /* Honor ZFS_APPENDONLY file attribute */ + if ((mode & FMODE_WRITE) && (zp->z_pflags & ZFS_APPENDONLY) && + ((flag & O_APPEND) == 0)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EPERM)); + } + + /* Virus scan eligible files on open */ + if (!zfs_has_ctldir(zp) && zfsvfs->z_vscan && S_ISREG(ip->i_mode) && + !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) { + if (zfs_vscan(ip, cr, 0) != 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EACCES)); + } + } + + /* Keep a count of the synchronous opens in the znode */ + if (flag & O_SYNC) + atomic_inc_32(&zp->z_sync_cnt); + + ZFS_EXIT(zfsvfs); + return (0); +} + +/* ARGSUSED */ +int +zfs_close(struct inode *ip, int flag, cred_t *cr) +{ + znode_t *zp = ITOZ(ip); + zfsvfs_t *zfsvfs = ITOZSB(ip); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + /* Decrement the synchronous opens in the znode */ + if (flag & O_SYNC) + atomic_dec_32(&zp->z_sync_cnt); + + if (!zfs_has_ctldir(zp) && zfsvfs->z_vscan && S_ISREG(ip->i_mode) && + !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) + VERIFY(zfs_vscan(ip, cr, 1) == 0); + + ZFS_EXIT(zfsvfs); + return (0); +} + +#if defined(_KERNEL) +/* + * When a file is memory mapped, we must keep the IO data synchronized + * between the DMU cache and the memory mapped pages. 
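/*
 * Editor's sketch: the ZFS_APPENDONLY check in zfs_open() above rejects
 * any write-mode open of an append-only file that did not also request
 * O_APPEND. Reduced to its flag logic; the constants are stand-ins for
 * FMODE_WRITE, O_APPEND, and ZFS_APPENDONLY.
 */
#include <errno.h>
#include <stdint.h>

#define	MODE_WRITE	0x2
#define	FL_APPEND	0x400
#define	PF_APPENDONLY	(1ULL << 0)

static int
open_allowed(int mode, int flag, uint64_t pflags)
{
	if ((mode & MODE_WRITE) && (pflags & PF_APPENDONLY) &&
	    (flag & FL_APPEND) == 0)
		return (EPERM);
	return (0);
}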
What this means: + * + * On Write: If we find a memory mapped page, we write to *both* + * the page and the dmu buffer. + */ +void +update_pages(znode_t *zp, int64_t start, int len, objset_t *os) +{ + struct inode *ip = ZTOI(zp); + struct address_space *mp = ip->i_mapping; + struct page *pp; + uint64_t nbytes; + int64_t off; + void *pb; + + off = start & (PAGE_SIZE-1); + for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) { + nbytes = MIN(PAGE_SIZE - off, len); + + pp = find_lock_page(mp, start >> PAGE_SHIFT); + if (pp) { + if (mapping_writably_mapped(mp)) + flush_dcache_page(pp); + + pb = kmap(pp); + (void) dmu_read(os, zp->z_id, start + off, nbytes, + pb + off, DMU_READ_PREFETCH); + kunmap(pp); + + if (mapping_writably_mapped(mp)) + flush_dcache_page(pp); + + mark_page_accessed(pp); + SetPageUptodate(pp); + ClearPageError(pp); + unlock_page(pp); + put_page(pp); + } + + len -= nbytes; + off = 0; + } +} + +/* + * When a file is memory mapped, we must keep the IO data synchronized + * between the DMU cache and the memory mapped pages. What this means: + * + * On Read: We "read" preferentially from memory mapped pages, + * else we default from the dmu buffer. + * + * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when + * the file is memory mapped. + */ +int +mappedread(znode_t *zp, int nbytes, uio_t *uio) +{ + struct inode *ip = ZTOI(zp); + struct address_space *mp = ip->i_mapping; + struct page *pp; + int64_t start, off; + uint64_t bytes; + int len = nbytes; + int error = 0; + void *pb; + + start = uio->uio_loffset; + off = start & (PAGE_SIZE-1); + for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) { + bytes = MIN(PAGE_SIZE - off, len); + + pp = find_lock_page(mp, start >> PAGE_SHIFT); + if (pp) { + ASSERT(PageUptodate(pp)); + unlock_page(pp); + + pb = kmap(pp); + error = uiomove(pb + off, bytes, UIO_READ, uio); + kunmap(pp); + + if (mapping_writably_mapped(mp)) + flush_dcache_page(pp); + + mark_page_accessed(pp); + put_page(pp); + } else { + error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), + uio, bytes); + } + + len -= bytes; + off = 0; + if (error) + break; + } + return (error); +} +#endif /* _KERNEL */ + +unsigned long zfs_delete_blocks = DMU_MAX_DELETEBLKCNT; + +/* + * Write the bytes to a file. + * + * IN: zp - znode of file to be written to + * data - bytes to write + * len - number of bytes to write + * pos - offset to start writing at + * + * OUT: resid - remaining bytes to write + * + * RETURN: 0 if success + * positive error code if failure. EIO is returned + * for a short write when residp isn't provided. 
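/*
 * Editor's sketch: update_pages() and mappedread() below both walk an
 * arbitrary (start, len) byte range page by page: only the first page can
 * have a nonzero intra-page offset, and each step copies at most
 * PAGE_SIZE - off bytes. The loop skeleton, standalone:
 */
#include <stdint.h>
#include <stdio.h>

#define	PG_SIZE	4096ULL
#define	PG_MASK	(~(PG_SIZE - 1))

static void
walk_pages(uint64_t start, uint64_t len)
{
	uint64_t off = start & (PG_SIZE - 1);	/* offset in first page */

	for (start &= PG_MASK; len > 0; start += PG_SIZE) {
		uint64_t nbytes = (PG_SIZE - off < len) ?
		    PG_SIZE - off : len;

		printf("page %llu: copy %llu bytes at offset %llu\n",
		    (unsigned long long)(start / PG_SIZE),
		    (unsigned long long)nbytes, (unsigned long long)off);
		len -= nbytes;
		off = 0;	/* later pages start at their beginning */
	}
}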
+ * + * Timestamps: + * zp - ctime|mtime updated if byte count > 0 + */ +int +zfs_write_simple(znode_t *zp, const void *data, size_t len, + loff_t pos, size_t *residp) +{ + fstrans_cookie_t cookie; + int error; + + struct iovec iov; + iov.iov_base = (void *)data; + iov.iov_len = len; + + uio_t uio; + uio_iovec_init(&uio, &iov, 1, pos, UIO_SYSSPACE, len, 0); + + cookie = spl_fstrans_mark(); + error = zfs_write(zp, &uio, 0, kcred); + spl_fstrans_unmark(cookie); + + if (error == 0) { + if (residp != NULL) + *residp = uio_resid(&uio); + else if (uio_resid(&uio) != 0) + error = SET_ERROR(EIO); + } + + return (error); +} + +void +zfs_zrele_async(znode_t *zp) +{ + struct inode *ip = ZTOI(zp); + objset_t *os = ITOZSB(ip)->z_os; + + ASSERT(atomic_read(&ip->i_count) > 0); + ASSERT(os != NULL); + + if (atomic_read(&ip->i_count) == 1) + VERIFY(taskq_dispatch(dsl_pool_zrele_taskq(dmu_objset_pool(os)), + (task_func_t *)iput, ip, TQ_SLEEP) != TASKQID_INVALID); + else + zrele(zp); +} + + +/* + * Lookup an entry in a directory, or an extended attribute directory. + * If it exists, return a held inode reference for it. + * + * IN: zdp - znode of directory to search. + * nm - name of entry to lookup. + * flags - LOOKUP_XATTR set if looking for an attribute. + * cr - credentials of caller. + * direntflags - directory lookup flags + * realpnp - returned pathname. + * + * OUT: zpp - znode of located entry, NULL if not found. + * + * RETURN: 0 on success, error code on failure. + * + * Timestamps: + * NA + */ +/* ARGSUSED */ +int +zfs_lookup(znode_t *zdp, char *nm, znode_t **zpp, int flags, cred_t *cr, + int *direntflags, pathname_t *realpnp) +{ + zfsvfs_t *zfsvfs = ZTOZSB(zdp); + int error = 0; + + /* + * Fast path lookup, however we must skip DNLC lookup + * for case folding or normalizing lookups because the + * DNLC code only stores the passed in name. This means + * creating 'a' and removing 'A' on a case insensitive + * file system would work, but DNLC still thinks 'a' + * exists and won't let you create it again on the next + * pass through fast path. + */ + if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) { + + if (!S_ISDIR(ZTOI(zdp)->i_mode)) { + return (SET_ERROR(ENOTDIR)); + } else if (zdp->z_sa_hdl == NULL) { + return (SET_ERROR(EIO)); + } + + if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) { + error = zfs_fastaccesschk_execute(zdp, cr); + if (!error) { + *zpp = zdp; + zhold(*zpp); + return (0); + } + return (error); + } + } + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zdp); + + *zpp = NULL; + + if (flags & LOOKUP_XATTR) { + /* + * We don't allow recursive attributes.. + * Maybe someday we will. + */ + if (zdp->z_pflags & ZFS_XATTR) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + if ((error = zfs_get_xattrdir(zdp, zpp, cr, flags))) { + ZFS_EXIT(zfsvfs); + return (error); + } + + /* + * Do we have permission to get into attribute directory? + */ + + if ((error = zfs_zaccess(*zpp, ACE_EXECUTE, 0, + B_FALSE, cr))) { + zrele(*zpp); + *zpp = NULL; + } + + ZFS_EXIT(zfsvfs); + return (error); + } + + if (!S_ISDIR(ZTOI(zdp)->i_mode)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(ENOTDIR)); + } + + /* + * Check accessibility of directory. 
+ */ + + if ((error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr))) { + ZFS_EXIT(zfsvfs); + return (error); + } + + if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm), + NULL, U8_VALIDATE_ENTIRE, &error) < 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EILSEQ)); + } + + error = zfs_dirlook(zdp, nm, zpp, flags, direntflags, realpnp); + if ((error == 0) && (*zpp)) + zfs_inode_update(*zpp); + + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Attempt to create a new entry in a directory. If the entry + * already exists, truncate the file if permissible, else return + * an error. Return the ip of the created or trunc'd file. + * + * IN: dzp - znode of directory to put new file entry in. + * name - name of new file entry. + * vap - attributes of new file. + * excl - flag indicating exclusive or non-exclusive mode. + * mode - mode to open file with. + * cr - credentials of caller. + * flag - file flag. + * vsecp - ACL to be set + * + * OUT: zpp - znode of created or trunc'd entry. + * + * RETURN: 0 on success, error code on failure. + * + * Timestamps: + * dzp - ctime|mtime updated if new entry created + * zp - ctime|mtime always, atime if new + */ + +/* ARGSUSED */ +int +zfs_create(znode_t *dzp, char *name, vattr_t *vap, int excl, + int mode, znode_t **zpp, cred_t *cr, int flag, vsecattr_t *vsecp) +{ + znode_t *zp; + zfsvfs_t *zfsvfs = ZTOZSB(dzp); + zilog_t *zilog; + objset_t *os; + zfs_dirlock_t *dl; + dmu_tx_t *tx; + int error; + uid_t uid; + gid_t gid; + zfs_acl_ids_t acl_ids; + boolean_t fuid_dirtied; + boolean_t have_acl = B_FALSE; + boolean_t waited = B_FALSE; + + /* + * If we have an ephemeral id, ACL, or XVATTR then + * make sure file system is at proper version + */ + + gid = crgetgid(cr); + uid = crgetuid(cr); + + if (zfsvfs->z_use_fuids == B_FALSE && + (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) + return (SET_ERROR(EINVAL)); + + if (name == NULL) + return (SET_ERROR(EINVAL)); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(dzp); + os = zfsvfs->z_os; + zilog = zfsvfs->z_log; + + if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), + NULL, U8_VALIDATE_ENTIRE, &error) < 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EILSEQ)); + } + + if (vap->va_mask & ATTR_XVATTR) { + if ((error = secpolicy_xvattr((xvattr_t *)vap, + crgetuid(cr), cr, vap->va_mode)) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + } + +top: + *zpp = NULL; + if (*name == '\0') { + /* + * Null component name refers to the directory itself. + */ + zhold(dzp); + zp = dzp; + dl = NULL; + error = 0; + } else { + /* possible igrab(zp) */ + int zflg = 0; + + if (flag & FIGNORECASE) + zflg |= ZCILOOK; + + error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, + NULL, NULL); + if (error) { + if (have_acl) + zfs_acl_ids_free(&acl_ids); + if (strcmp(name, "..") == 0) + error = SET_ERROR(EISDIR); + ZFS_EXIT(zfsvfs); + return (error); + } + } + + if (zp == NULL) { + uint64_t txtype; + uint64_t projid = ZFS_DEFAULT_PROJID; + + /* + * Create a new file object and update the directory + * to reference it. + */ + if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) { + if (have_acl) + zfs_acl_ids_free(&acl_ids); + goto out; + } + + /* + * We only support the creation of regular files in + * extended attribute directories. 
+		 */
+
+		if ((dzp->z_pflags & ZFS_XATTR) && !S_ISREG(vap->va_mode)) {
+			if (have_acl)
+				zfs_acl_ids_free(&acl_ids);
+			error = SET_ERROR(EINVAL);
+			goto out;
+		}
+
+		if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
+		    cr, vsecp, &acl_ids)) != 0)
+			goto out;
+		have_acl = B_TRUE;
+
+		if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode))
+			projid = zfs_inherit_projid(dzp);
+		if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) {
+			zfs_acl_ids_free(&acl_ids);
+			error = SET_ERROR(EDQUOT);
+			goto out;
+		}
+
+		tx = dmu_tx_create(os);
+
+		dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
+		    ZFS_SA_BASE_ATTR_SIZE);
+
+		fuid_dirtied = zfsvfs->z_fuid_dirty;
+		if (fuid_dirtied)
+			zfs_fuid_txhold(zfsvfs, tx);
+		dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
+		dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
+		if (!zfsvfs->z_use_sa &&
+		    acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
+			    0, acl_ids.z_aclp->z_acl_bytes);
+		}
+
+		error = dmu_tx_assign(tx,
+		    (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
+		if (error) {
+			zfs_dirent_unlock(dl);
+			if (error == ERESTART) {
+				waited = B_TRUE;
+				dmu_tx_wait(tx);
+				dmu_tx_abort(tx);
+				goto top;
+			}
+			zfs_acl_ids_free(&acl_ids);
+			dmu_tx_abort(tx);
+			ZFS_EXIT(zfsvfs);
+			return (error);
+		}
+		zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
+
+		error = zfs_link_create(dl, zp, tx, ZNEW);
+		if (error != 0) {
+			/*
+			 * Since we failed to add the directory entry for it,
+			 * delete the newly created dnode.
+			 */
+			zfs_znode_delete(zp, tx);
+			remove_inode_hash(ZTOI(zp));
+			zfs_acl_ids_free(&acl_ids);
+			dmu_tx_commit(tx);
+			goto out;
+		}
+
+		if (fuid_dirtied)
+			zfs_fuid_sync(zfsvfs, tx);
+
+		txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
+		if (flag & FIGNORECASE)
+			txtype |= TX_CI;
+		zfs_log_create(zilog, tx, txtype, dzp, zp, name,
+		    vsecp, acl_ids.z_fuidp, vap);
+		zfs_acl_ids_free(&acl_ids);
+		dmu_tx_commit(tx);
+	} else {
+		int aflags = (flag & O_APPEND) ? V_APPEND : 0;
+
+		if (have_acl)
+			zfs_acl_ids_free(&acl_ids);
+		have_acl = B_FALSE;
+
+		/*
+		 * A directory entry already exists for this name.
+		 */
+		/*
+		 * Can't truncate an existing file if in exclusive mode.
+		 */
+		if (excl) {
+			error = SET_ERROR(EEXIST);
+			goto out;
+		}
+		/*
+		 * Can't open a directory for writing.
+		 */
+		if (S_ISDIR(ZTOI(zp)->i_mode)) {
+			error = SET_ERROR(EISDIR);
+			goto out;
+		}
+		/*
+		 * Verify requested access to file.
+		 */
+		if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) {
+			goto out;
+		}
+
+		mutex_enter(&dzp->z_lock);
+		dzp->z_seq++;
+		mutex_exit(&dzp->z_lock);
+
+		/*
+		 * Truncate regular files if requested.
+ */ + if (S_ISREG(ZTOI(zp)->i_mode) && + (vap->va_mask & ATTR_SIZE) && (vap->va_size == 0)) { + /* we can't hold any locks when calling zfs_freesp() */ + if (dl) { + zfs_dirent_unlock(dl); + dl = NULL; + } + error = zfs_freesp(zp, 0, 0, mode, TRUE); + } + } +out: + + if (dl) + zfs_dirent_unlock(dl); + + if (error) { + if (zp) + zrele(zp); + } else { + zfs_inode_update(dzp); + zfs_inode_update(zp); + *zpp = zp; + } + + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + + ZFS_EXIT(zfsvfs); + return (error); +} + +/* ARGSUSED */ +int +zfs_tmpfile(struct inode *dip, vattr_t *vap, int excl, + int mode, struct inode **ipp, cred_t *cr, int flag, vsecattr_t *vsecp) +{ + znode_t *zp = NULL, *dzp = ITOZ(dip); + zfsvfs_t *zfsvfs = ITOZSB(dip); + objset_t *os; + dmu_tx_t *tx; + int error; + uid_t uid; + gid_t gid; + zfs_acl_ids_t acl_ids; + uint64_t projid = ZFS_DEFAULT_PROJID; + boolean_t fuid_dirtied; + boolean_t have_acl = B_FALSE; + boolean_t waited = B_FALSE; + + /* + * If we have an ephemeral id, ACL, or XVATTR then + * make sure file system is at proper version + */ + + gid = crgetgid(cr); + uid = crgetuid(cr); + + if (zfsvfs->z_use_fuids == B_FALSE && + (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) + return (SET_ERROR(EINVAL)); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(dzp); + os = zfsvfs->z_os; + + if (vap->va_mask & ATTR_XVATTR) { + if ((error = secpolicy_xvattr((xvattr_t *)vap, + crgetuid(cr), cr, vap->va_mode)) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + } + +top: + *ipp = NULL; + + /* + * Create a new file object and update the directory + * to reference it. + */ + if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) { + if (have_acl) + zfs_acl_ids_free(&acl_ids); + goto out; + } + + if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap, + cr, vsecp, &acl_ids)) != 0) + goto out; + have_acl = B_TRUE; + + if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode)) + projid = zfs_inherit_projid(dzp); + if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) { + zfs_acl_ids_free(&acl_ids); + error = SET_ERROR(EDQUOT); + goto out; + } + + tx = dmu_tx_create(os); + + dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + + ZFS_SA_BASE_ATTR_SIZE); + dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); + + fuid_dirtied = zfsvfs->z_fuid_dirty; + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); + if (!zfsvfs->z_use_sa && + acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, + 0, acl_ids.z_aclp->z_acl_bytes); + } + error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); + if (error) { + if (error == ERESTART) { + waited = B_TRUE; + dmu_tx_wait(tx); + dmu_tx_abort(tx); + goto top; + } + zfs_acl_ids_free(&acl_ids); + dmu_tx_abort(tx); + ZFS_EXIT(zfsvfs); + return (error); + } + zfs_mknode(dzp, vap, tx, cr, IS_TMPFILE, &zp, &acl_ids); + + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + + /* Add to unlinked set */ + zp->z_unlinked = B_TRUE; + zfs_unlinked_add(zp, tx); + zfs_acl_ids_free(&acl_ids); + dmu_tx_commit(tx); +out: + + if (error) { + if (zp) + zrele(zp); + } else { + zfs_inode_update(dzp); + zfs_inode_update(zp); + *ipp = ZTOI(zp); + } + + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Remove an entry from a directory. + * + * IN: dzp - znode of directory to remove entry from. + * name - name of entry to remove. + * cr - credentials of caller. + * flags - case flags. 
+ * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * dzp - ctime|mtime + * ip - ctime (if nlink > 0) + */ + +uint64_t null_xattr = 0; + +/*ARGSUSED*/ +int +zfs_remove(znode_t *dzp, char *name, cred_t *cr, int flags) +{ + znode_t *zp; + znode_t *xzp; + zfsvfs_t *zfsvfs = ZTOZSB(dzp); + zilog_t *zilog; + uint64_t acl_obj, xattr_obj; + uint64_t xattr_obj_unlinked = 0; + uint64_t obj = 0; + uint64_t links; + zfs_dirlock_t *dl; + dmu_tx_t *tx; + boolean_t may_delete_now, delete_now = FALSE; + boolean_t unlinked, toobig = FALSE; + uint64_t txtype; + pathname_t *realnmp = NULL; + pathname_t realnm; + int error; + int zflg = ZEXISTS; + boolean_t waited = B_FALSE; + + if (name == NULL) + return (SET_ERROR(EINVAL)); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(dzp); + zilog = zfsvfs->z_log; + + if (flags & FIGNORECASE) { + zflg |= ZCILOOK; + pn_alloc(&realnm); + realnmp = &realnm; + } + +top: + xattr_obj = 0; + xzp = NULL; + /* + * Attempt to lock directory; fail if entry doesn't exist. + */ + if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, + NULL, realnmp))) { + if (realnmp) + pn_free(realnmp); + ZFS_EXIT(zfsvfs); + return (error); + } + + if ((error = zfs_zaccess_delete(dzp, zp, cr))) { + goto out; + } + + /* + * Need to use rmdir for removing directories. + */ + if (S_ISDIR(ZTOI(zp)->i_mode)) { + error = SET_ERROR(EPERM); + goto out; + } + + mutex_enter(&zp->z_lock); + may_delete_now = atomic_read(&ZTOI(zp)->i_count) == 1 && + !(zp->z_is_mapped); + mutex_exit(&zp->z_lock); + + /* + * We may delete the znode now, or we may put it in the unlinked set; + * it depends on whether we're the last link, and on whether there are + * other holds on the inode. So we dmu_tx_hold() the right things to + * allow for either case. + */ + obj = zp->z_id; + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, zp); + zfs_sa_upgrade_txholds(tx, dzp); + if (may_delete_now) { + toobig = zp->z_size > zp->z_blksz * zfs_delete_blocks; + /* if the file is too big, only hold_free a token amount */ + dmu_tx_hold_free(tx, zp->z_id, 0, + (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END)); + } + + /* are there any extended attributes? */ + error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), + &xattr_obj, sizeof (xattr_obj)); + if (error == 0 && xattr_obj) { + error = zfs_zget(zfsvfs, xattr_obj, &xzp); + ASSERT0(error); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); + dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); + } + + mutex_enter(&zp->z_lock); + if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now) + dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); + mutex_exit(&zp->z_lock); + + /* charge as an update -- would be nice not to charge at all */ + dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); + + /* + * Mark this transaction as typically resulting in a net free of space + */ + dmu_tx_mark_netfree(tx); + + error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); + if (error) { + zfs_dirent_unlock(dl); + if (error == ERESTART) { + waited = B_TRUE; + dmu_tx_wait(tx); + dmu_tx_abort(tx); + zrele(zp); + if (xzp) + zrele(xzp); + goto top; + } + if (realnmp) + pn_free(realnmp); + dmu_tx_abort(tx); + zrele(zp); + if (xzp) + zrele(xzp); + ZFS_EXIT(zfsvfs); + return (error); + } + + /* + * Remove the directory entry. 
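+	 *
+	 * In outline (conditions abbreviated; see the code below):
+	 *
+	 *	zfs_link_destroy(..., &unlinked);
+	 *	if (unlinked)
+	 *		zfs_unlinked_add(zp, tx);	// park on unlinked set
+	 *	...
+	 *	if (delete_now)		// last hold: destroy via zrele() now
+	 *		zrele(zp);
+	 *	else			// might race: defer to the taskq
+	 *		zfs_zrele_async(zp);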
+ */ + error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked); + + if (error) { + dmu_tx_commit(tx); + goto out; + } + + if (unlinked) { + /* + * Hold z_lock so that we can make sure that the ACL obj + * hasn't changed. Could have been deleted due to + * zfs_sa_upgrade(). + */ + mutex_enter(&zp->z_lock); + (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), + &xattr_obj_unlinked, sizeof (xattr_obj_unlinked)); + delete_now = may_delete_now && !toobig && + atomic_read(&ZTOI(zp)->i_count) == 1 && + !(zp->z_is_mapped) && xattr_obj == xattr_obj_unlinked && + zfs_external_acl(zp) == acl_obj; + } + + if (delete_now) { + if (xattr_obj_unlinked) { + ASSERT3U(ZTOI(xzp)->i_nlink, ==, 2); + mutex_enter(&xzp->z_lock); + xzp->z_unlinked = B_TRUE; + clear_nlink(ZTOI(xzp)); + links = 0; + error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs), + &links, sizeof (links), tx); + ASSERT3U(error, ==, 0); + mutex_exit(&xzp->z_lock); + zfs_unlinked_add(xzp, tx); + + if (zp->z_is_sa) + error = sa_remove(zp->z_sa_hdl, + SA_ZPL_XATTR(zfsvfs), tx); + else + error = sa_update(zp->z_sa_hdl, + SA_ZPL_XATTR(zfsvfs), &null_xattr, + sizeof (uint64_t), tx); + ASSERT0(error); + } + /* + * Add to the unlinked set because a new reference could be + * taken concurrently resulting in a deferred destruction. + */ + zfs_unlinked_add(zp, tx); + mutex_exit(&zp->z_lock); + } else if (unlinked) { + mutex_exit(&zp->z_lock); + zfs_unlinked_add(zp, tx); + } + + txtype = TX_REMOVE; + if (flags & FIGNORECASE) + txtype |= TX_CI; + zfs_log_remove(zilog, tx, txtype, dzp, name, obj, unlinked); + + dmu_tx_commit(tx); +out: + if (realnmp) + pn_free(realnmp); + + zfs_dirent_unlock(dl); + zfs_inode_update(dzp); + zfs_inode_update(zp); + + if (delete_now) + zrele(zp); + else + zfs_zrele_async(zp); + + if (xzp) { + zfs_inode_update(xzp); + zfs_zrele_async(xzp); + } + + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Create a new directory and insert it into dzp using the name + * provided. Return a pointer to the inserted directory. + * + * IN: dzp - znode of directory to add subdir to. + * dirname - name of new directory. + * vap - attributes of new directory. + * cr - credentials of caller. + * flags - case flags. + * vsecp - ACL to be set + * + * OUT: zpp - znode of created directory. 
+ * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * dzp - ctime|mtime updated + * zpp - ctime|mtime|atime updated + */ +/*ARGSUSED*/ +int +zfs_mkdir(znode_t *dzp, char *dirname, vattr_t *vap, znode_t **zpp, + cred_t *cr, int flags, vsecattr_t *vsecp) +{ + znode_t *zp; + zfsvfs_t *zfsvfs = ZTOZSB(dzp); + zilog_t *zilog; + zfs_dirlock_t *dl; + uint64_t txtype; + dmu_tx_t *tx; + int error; + int zf = ZNEW; + uid_t uid; + gid_t gid = crgetgid(cr); + zfs_acl_ids_t acl_ids; + boolean_t fuid_dirtied; + boolean_t waited = B_FALSE; + + ASSERT(S_ISDIR(vap->va_mode)); + + /* + * If we have an ephemeral id, ACL, or XVATTR then + * make sure file system is at proper version + */ + + uid = crgetuid(cr); + if (zfsvfs->z_use_fuids == B_FALSE && + (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) + return (SET_ERROR(EINVAL)); + + if (dirname == NULL) + return (SET_ERROR(EINVAL)); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(dzp); + zilog = zfsvfs->z_log; + + if (dzp->z_pflags & ZFS_XATTR) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + if (zfsvfs->z_utf8 && u8_validate(dirname, + strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EILSEQ)); + } + if (flags & FIGNORECASE) + zf |= ZCILOOK; + + if (vap->va_mask & ATTR_XVATTR) { + if ((error = secpolicy_xvattr((xvattr_t *)vap, + crgetuid(cr), cr, vap->va_mode)) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + } + + if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, + vsecp, &acl_ids)) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + /* + * First make sure the new directory doesn't exist. + * + * Existence is checked first to make sure we don't return + * EACCES instead of EEXIST which can cause some applications + * to fail. + */ +top: + *zpp = NULL; + + if ((error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf, + NULL, NULL))) { + zfs_acl_ids_free(&acl_ids); + ZFS_EXIT(zfsvfs); + return (error); + } + + if ((error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr))) { + zfs_acl_ids_free(&acl_ids); + zfs_dirent_unlock(dl); + ZFS_EXIT(zfsvfs); + return (error); + } + + if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zfs_inherit_projid(dzp))) { + zfs_acl_ids_free(&acl_ids); + zfs_dirent_unlock(dl); + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EDQUOT)); + } + + /* + * Add a new entry to the directory. + */ + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname); + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); + fuid_dirtied = zfsvfs->z_fuid_dirty; + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); + if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, + acl_ids.z_aclp->z_acl_bytes); + } + + dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + + ZFS_SA_BASE_ATTR_SIZE); + + error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); + if (error) { + zfs_dirent_unlock(dl); + if (error == ERESTART) { + waited = B_TRUE; + dmu_tx_wait(tx); + dmu_tx_abort(tx); + goto top; + } + zfs_acl_ids_free(&acl_ids); + dmu_tx_abort(tx); + ZFS_EXIT(zfsvfs); + return (error); + } + + /* + * Create new node. + */ + zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); + + /* + * Now put new name in parent dir. 
+ */ + error = zfs_link_create(dl, zp, tx, ZNEW); + if (error != 0) { + zfs_znode_delete(zp, tx); + remove_inode_hash(ZTOI(zp)); + goto out; + } + + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + + *zpp = zp; + + txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap); + if (flags & FIGNORECASE) + txtype |= TX_CI; + zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp, + acl_ids.z_fuidp, vap); + +out: + zfs_acl_ids_free(&acl_ids); + + dmu_tx_commit(tx); + + zfs_dirent_unlock(dl); + + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + + if (error != 0) { + zrele(zp); + } else { + zfs_inode_update(dzp); + zfs_inode_update(zp); + } + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Remove a directory subdir entry. If the current working + * directory is the same as the subdir to be removed, the + * remove will fail. + * + * IN: dzp - znode of directory to remove from. + * name - name of directory to be removed. + * cwd - inode of current working directory. + * cr - credentials of caller. + * flags - case flags + * + * RETURN: 0 on success, error code on failure. + * + * Timestamps: + * dzp - ctime|mtime updated + */ +/*ARGSUSED*/ +int +zfs_rmdir(znode_t *dzp, char *name, znode_t *cwd, cred_t *cr, + int flags) +{ + znode_t *zp; + zfsvfs_t *zfsvfs = ZTOZSB(dzp); + zilog_t *zilog; + zfs_dirlock_t *dl; + dmu_tx_t *tx; + int error; + int zflg = ZEXISTS; + boolean_t waited = B_FALSE; + + if (name == NULL) + return (SET_ERROR(EINVAL)); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(dzp); + zilog = zfsvfs->z_log; + + if (flags & FIGNORECASE) + zflg |= ZCILOOK; +top: + zp = NULL; + + /* + * Attempt to lock directory; fail if entry doesn't exist. + */ + if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, + NULL, NULL))) { + ZFS_EXIT(zfsvfs); + return (error); + } + + if ((error = zfs_zaccess_delete(dzp, zp, cr))) { + goto out; + } + + if (!S_ISDIR(ZTOI(zp)->i_mode)) { + error = SET_ERROR(ENOTDIR); + goto out; + } + + if (zp == cwd) { + error = SET_ERROR(EINVAL); + goto out; + } + + /* + * Grab a lock on the directory to make sure that no one is + * trying to add (or lookup) entries while we are removing it. + */ + rw_enter(&zp->z_name_lock, RW_WRITER); + + /* + * Grab a lock on the parent pointer to make sure we play well + * with the treewalk and directory rename code. + */ + rw_enter(&zp->z_parent_lock, RW_WRITER); + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); + zfs_sa_upgrade_txholds(tx, zp); + zfs_sa_upgrade_txholds(tx, dzp); + dmu_tx_mark_netfree(tx); + error = dmu_tx_assign(tx, (waited ? 
TXG_NOTHROTTLE : 0) | TXG_NOWAIT); + if (error) { + rw_exit(&zp->z_parent_lock); + rw_exit(&zp->z_name_lock); + zfs_dirent_unlock(dl); + if (error == ERESTART) { + waited = B_TRUE; + dmu_tx_wait(tx); + dmu_tx_abort(tx); + zrele(zp); + goto top; + } + dmu_tx_abort(tx); + zrele(zp); + ZFS_EXIT(zfsvfs); + return (error); + } + + error = zfs_link_destroy(dl, zp, tx, zflg, NULL); + + if (error == 0) { + uint64_t txtype = TX_RMDIR; + if (flags & FIGNORECASE) + txtype |= TX_CI; + zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT, + B_FALSE); + } + + dmu_tx_commit(tx); + + rw_exit(&zp->z_parent_lock); + rw_exit(&zp->z_name_lock); +out: + zfs_dirent_unlock(dl); + + zfs_inode_update(dzp); + zfs_inode_update(zp); + zrele(zp); + + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Read directory entries from the given directory cursor position and emit + * name and position for each entry. + * + * IN: ip - inode of directory to read. + * ctx - directory entry context. + * cr - credentials of caller. + * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * ip - atime updated + * + * Note that the low 4 bits of the cookie returned by zap is always zero. + * This allows us to use the low range for "special" directory entries: + * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem, + * we use the offset 2 for the '.zfs' directory. + */ +/* ARGSUSED */ +int +zfs_readdir(struct inode *ip, zpl_dir_context_t *ctx, cred_t *cr) +{ + znode_t *zp = ITOZ(ip); + zfsvfs_t *zfsvfs = ITOZSB(ip); + objset_t *os; + zap_cursor_t zc; + zap_attribute_t zap; + int error; + uint8_t prefetch; + uint8_t type; + int done = 0; + uint64_t parent; + uint64_t offset; /* must be unsigned; checks for < 1 */ + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), + &parent, sizeof (parent))) != 0) + goto out; + + /* + * Quit if directory has been removed (posix) + */ + if (zp->z_unlinked) + goto out; + + error = 0; + os = zfsvfs->z_os; + offset = ctx->pos; + prefetch = zp->z_zn_prefetch; + + /* + * Initialize the iterator cursor. + */ + if (offset <= 3) { + /* + * Start iteration from the beginning of the directory. + */ + zap_cursor_init(&zc, os, zp->z_id); + } else { + /* + * The offset is a serialized cursor. + */ + zap_cursor_init_serialized(&zc, os, zp->z_id, offset); + } + + /* + * Transform to file-system independent format + */ + while (!done) { + uint64_t objnum; + /* + * Special case `.', `..', and `.zfs'. + */ + if (offset == 0) { + (void) strcpy(zap.za_name, "."); + zap.za_normalization_conflict = 0; + objnum = zp->z_id; + type = DT_DIR; + } else if (offset == 1) { + (void) strcpy(zap.za_name, ".."); + zap.za_normalization_conflict = 0; + objnum = parent; + type = DT_DIR; + } else if (offset == 2 && zfs_show_ctldir(zp)) { + (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME); + zap.za_normalization_conflict = 0; + objnum = ZFSCTL_INO_ROOT; + type = DT_DIR; + } else { + /* + * Grab next entry. + */ + if ((error = zap_cursor_retrieve(&zc, &zap))) { + if (error == ENOENT) + break; + else + goto update; + } + + /* + * Allow multiple entries provided the first entry is + * the object id. Non-zpl consumers may safely make + * use of the additional space. 
+			 *
+			 * XXX: This should be a feature flag for compatibility
+			 */
+			if (zap.za_integer_length != 8 ||
+			    zap.za_num_integers == 0) {
+				cmn_err(CE_WARN, "zap_readdir: bad directory "
+				    "entry, obj = %lld, offset = %lld, "
+				    "length = %d, num = %lld\n",
+				    (u_longlong_t)zp->z_id,
+				    (u_longlong_t)offset,
+				    zap.za_integer_length,
+				    (u_longlong_t)zap.za_num_integers);
+				error = SET_ERROR(ENXIO);
+				goto update;
+			}
+
+			objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
+			type = ZFS_DIRENT_TYPE(zap.za_first_integer);
+		}
+
+		done = !zpl_dir_emit(ctx, zap.za_name, strlen(zap.za_name),
+		    objnum, type);
+		if (done)
+			break;
+
+		/* Prefetch znode */
+		if (prefetch) {
+			dmu_prefetch(os, objnum, 0, 0, 0,
+			    ZIO_PRIORITY_SYNC_READ);
+		}
+
+		/*
+		 * Move to the next entry, fill in the previous offset.
+		 */
+		if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
+			zap_cursor_advance(&zc);
+			offset = zap_cursor_serialize(&zc);
+		} else {
+			offset += 1;
+		}
+		ctx->pos = offset;
+	}
+	zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
+
+update:
+	zap_cursor_fini(&zc);
+	if (error == ENOENT)
+		error = 0;
+out:
+	ZFS_EXIT(zfsvfs);
+
+	return (error);
+}
+
+/*
+ * Get the basic file attributes and place them in the provided kstat
+ * structure.  The inode is assumed to be the authoritative source
+ * for most of the attributes.  However, the znode currently has the
+ * authoritative atime, blksize, and block count.
+ *
+ * IN:	ip - inode of file.
+ *
+ * OUT:	sp - kstat values.
+ *
+ * RETURN:	0 (always succeeds)
+ */
+/* ARGSUSED */
+int
+zfs_getattr_fast(struct inode *ip, struct kstat *sp)
+{
+	znode_t *zp = ITOZ(ip);
+	zfsvfs_t *zfsvfs = ITOZSB(ip);
+	uint32_t blksize;
+	u_longlong_t nblocks;
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(zp);
+
+	mutex_enter(&zp->z_lock);
+
+	generic_fillattr(ip, sp);
+	/*
+	 * +1 link count for root inode with visible '.zfs' directory.
+	 */
+	if ((zp->z_id == zfsvfs->z_root) && zfs_show_ctldir(zp))
+		if (sp->nlink < ZFS_LINK_MAX)
+			sp->nlink++;
+
+	sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
+	sp->blksize = blksize;
+	sp->blocks = nblocks;
+
+	if (unlikely(zp->z_blksz == 0)) {
+		/*
+		 * Block size hasn't been set; suggest maximal I/O transfers.
+		 */
+		sp->blksize = zfsvfs->z_max_blksz;
+	}
+
+	mutex_exit(&zp->z_lock);
+
+	/*
+	 * Required to prevent NFS client from detecting different inode
+	 * numbers of snapshot root dentry before and after snapshot mount.
+	 */
+	if (zfsvfs->z_issnap) {
+		if (ip->i_sb->s_root->d_inode == ip)
+			sp->ino = ZFSCTL_INO_SNAPDIRS -
+			    dmu_objset_id(zfsvfs->z_os);
+	}
+
+	ZFS_EXIT(zfsvfs);
+
+	return (0);
+}
+
+/*
+ * When changing a file's user/group/project IDs we must handle not only
+ * the main object assigned to the file directly, but also the objects
+ * the file uses via its hidden xattr directory.
+ *
+ * Because the xattr directory may contain many EA entries, it may be
+ * impossible to change all of them within the transaction that changes
+ * the main object's user/group/project attributes, so we change them one
+ * by one via multiple independent transactions.  This may not be a good
+ * solution, but we have no better idea yet.
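+ *
+ * Illustrative shape of the resulting update (not atomic as a whole):
+ *
+ *	zfs_setattr(zp, ...);		// one tx for the main object
+ *	zfs_setattr_dir(attrzp);	// then one tx per xattr entry
+ *
+ * so an ill-timed crash can leave the xattr objects' ownership out of
+ * step with the main object until the next such change.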
+ */ +static int +zfs_setattr_dir(znode_t *dzp) +{ + struct inode *dxip = ZTOI(dzp); + struct inode *xip = NULL; + zfsvfs_t *zfsvfs = ZTOZSB(dzp); + objset_t *os = zfsvfs->z_os; + zap_cursor_t zc; + zap_attribute_t zap; + zfs_dirlock_t *dl; + znode_t *zp = NULL; + dmu_tx_t *tx = NULL; + uint64_t uid, gid; + sa_bulk_attr_t bulk[4]; + int count; + int err; + + zap_cursor_init(&zc, os, dzp->z_id); + while ((err = zap_cursor_retrieve(&zc, &zap)) == 0) { + count = 0; + if (zap.za_integer_length != 8 || zap.za_num_integers != 1) { + err = ENXIO; + break; + } + + err = zfs_dirent_lock(&dl, dzp, (char *)zap.za_name, &zp, + ZEXISTS, NULL, NULL); + if (err == ENOENT) + goto next; + if (err) + break; + + xip = ZTOI(zp); + if (KUID_TO_SUID(xip->i_uid) == KUID_TO_SUID(dxip->i_uid) && + KGID_TO_SGID(xip->i_gid) == KGID_TO_SGID(dxip->i_gid) && + zp->z_projid == dzp->z_projid) + goto next; + + tx = dmu_tx_create(os); + if (!(zp->z_pflags & ZFS_PROJID)) + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); + else + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + + err = dmu_tx_assign(tx, TXG_WAIT); + if (err) + break; + + mutex_enter(&dzp->z_lock); + + if (KUID_TO_SUID(xip->i_uid) != KUID_TO_SUID(dxip->i_uid)) { + xip->i_uid = dxip->i_uid; + uid = zfs_uid_read(dxip); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, + &uid, sizeof (uid)); + } + + if (KGID_TO_SGID(xip->i_gid) != KGID_TO_SGID(dxip->i_gid)) { + xip->i_gid = dxip->i_gid; + gid = zfs_gid_read(dxip); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, + &gid, sizeof (gid)); + } + + if (zp->z_projid != dzp->z_projid) { + if (!(zp->z_pflags & ZFS_PROJID)) { + zp->z_pflags |= ZFS_PROJID; + SA_ADD_BULK_ATTR(bulk, count, + SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, + sizeof (zp->z_pflags)); + } + + zp->z_projid = dzp->z_projid; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PROJID(zfsvfs), + NULL, &zp->z_projid, sizeof (zp->z_projid)); + } + + mutex_exit(&dzp->z_lock); + + if (likely(count > 0)) { + err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + dmu_tx_commit(tx); + } else { + dmu_tx_abort(tx); + } + tx = NULL; + if (err != 0 && err != ENOENT) + break; + +next: + if (zp) { + zrele(zp); + zp = NULL; + zfs_dirent_unlock(dl); + } + zap_cursor_advance(&zc); + } + + if (tx) + dmu_tx_abort(tx); + if (zp) { + zrele(zp); + zfs_dirent_unlock(dl); + } + zap_cursor_fini(&zc); + + return (err == ENOENT ? 0 : err); +} + +/* + * Set the file attributes to the values contained in the + * vattr structure. + * + * IN: zp - znode of file to be modified. + * vap - new attribute values. + * If ATTR_XVATTR set, then optional attrs are being set + * flags - ATTR_UTIME set if non-default time values provided. + * - ATTR_NOACLCHECK (CIFS context only). + * cr - credentials of caller. + * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * ip - ctime updated, mtime updated if size changed. 
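+ *
+ * Example (hypothetical caller): a chmod-style request would be
+ *
+ *	vap->va_mask = ATTR_MODE;
+ *	vap->va_mode = 0640;
+ *	err = zfs_setattr(zp, vap, 0, cr);
+ *
+ * Note that ctime cannot be set directly; it is updated as a side
+ * effect of the call.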
+ */
+/* ARGSUSED */
+int
+zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr)
+{
+	struct inode	*ip;
+	zfsvfs_t	*zfsvfs = ZTOZSB(zp);
+	objset_t	*os = zfsvfs->z_os;
+	zilog_t		*zilog;
+	dmu_tx_t	*tx;
+	vattr_t		oldva;
+	xvattr_t	*tmpxvattr;
+	uint_t		mask = vap->va_mask;
+	uint_t		saved_mask = 0;
+	int		trim_mask = 0;
+	uint64_t	new_mode;
+	uint64_t	new_kuid = 0, new_kgid = 0, new_uid, new_gid;
+	uint64_t	xattr_obj;
+	uint64_t	mtime[2], ctime[2], atime[2];
+	uint64_t	projid = ZFS_INVALID_PROJID;
+	znode_t		*attrzp;
+	int		need_policy = FALSE;
+	int		err, err2 = 0;
+	zfs_fuid_info_t *fuidp = NULL;
+	xvattr_t	*xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
+	xoptattr_t	*xoap;
+	zfs_acl_t	*aclp;
+	boolean_t	skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
+	boolean_t	fuid_dirtied = B_FALSE;
+	boolean_t	handle_eadir = B_FALSE;
+	sa_bulk_attr_t	*bulk, *xattr_bulk;
+	int		count = 0, xattr_count = 0, bulks = 8;
+
+	if (mask == 0)
+		return (0);
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(zp);
+	ip = ZTOI(zp);
+
+	/*
+	 * If this is an xvattr_t, then get a pointer to the structure of
+	 * optional attributes.  If this is NULL, then we have a vattr_t.
+	 */
+	xoap = xva_getxoptattr(xvap);
+	if (xoap != NULL && (mask & ATTR_XVATTR)) {
+		if (XVA_ISSET_REQ(xvap, XAT_PROJID)) {
+			if (!dmu_objset_projectquota_enabled(os) ||
+			    (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode))) {
+				ZFS_EXIT(zfsvfs);
+				return (SET_ERROR(ENOTSUP));
+			}
+
+			projid = xoap->xoa_projid;
+			if (unlikely(projid == ZFS_INVALID_PROJID)) {
+				ZFS_EXIT(zfsvfs);
+				return (SET_ERROR(EINVAL));
+			}
+
+			if (projid == zp->z_projid && zp->z_pflags & ZFS_PROJID)
+				projid = ZFS_INVALID_PROJID;
+			else
+				need_policy = TRUE;
+		}
+
+		if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT) &&
+		    (xoap->xoa_projinherit !=
+		    ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) &&
+		    (!dmu_objset_projectquota_enabled(os) ||
+		    (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode)))) {
+			ZFS_EXIT(zfsvfs);
+			return (SET_ERROR(ENOTSUP));
+		}
+	}
+
+	zilog = zfsvfs->z_log;
+
+	/*
+	 * Make sure that if we have an ephemeral uid/gid or xvattr specified,
+	 * the file system is at the proper version level.
+	 */
+
+	if (zfsvfs->z_use_fuids == B_FALSE &&
+	    (((mask & ATTR_UID) && IS_EPHEMERAL(vap->va_uid)) ||
+	    ((mask & ATTR_GID) && IS_EPHEMERAL(vap->va_gid)) ||
+	    (mask & ATTR_XVATTR))) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EINVAL));
+	}
+
+	if (mask & ATTR_SIZE && S_ISDIR(ip->i_mode)) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EISDIR));
+	}
+
+	if (mask & ATTR_SIZE && !S_ISREG(ip->i_mode) && !S_ISFIFO(ip->i_mode)) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EINVAL));
+	}
+
+	tmpxvattr = kmem_alloc(sizeof (xvattr_t), KM_SLEEP);
+	xva_init(tmpxvattr);
+
+	bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP);
+	xattr_bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP);
+
+	/*
+	 * For immutable files, only the immutable bit and atime may be
+	 * altered.
+	 */
+	if ((zp->z_pflags & ZFS_IMMUTABLE) &&
+	    ((mask & (ATTR_SIZE|ATTR_UID|ATTR_GID|ATTR_MTIME|ATTR_MODE)) ||
+	    ((mask & ATTR_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
+		err = SET_ERROR(EPERM);
+		goto out3;
+	}
+
+	if ((mask & ATTR_SIZE) && (zp->z_pflags & ZFS_READONLY)) {
+		err = SET_ERROR(EPERM);
+		goto out3;
+	}
+
+	/*
+	 * Verify the timestamps don't overflow 32 bits.
+	 * ZFS can handle large timestamps, but 32-bit syscalls can't
+	 * handle times greater than 2038.  This check should be removed
+	 * once large timestamps are fully supported.
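+	 *
+	 * For example, a utimensat(2) request with tv_sec = 1LL << 33
+	 * reaches this function (on Linux, via zpl_setattr()) and fails
+	 * with EOVERFLOW instead of being silently truncated.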
+ */ + if (mask & (ATTR_ATIME | ATTR_MTIME)) { + if (((mask & ATTR_ATIME) && + TIMESPEC_OVERFLOW(&vap->va_atime)) || + ((mask & ATTR_MTIME) && + TIMESPEC_OVERFLOW(&vap->va_mtime))) { + err = SET_ERROR(EOVERFLOW); + goto out3; + } + } + +top: + attrzp = NULL; + aclp = NULL; + + /* Can this be moved to before the top label? */ + if (zfs_is_readonly(zfsvfs)) { + err = SET_ERROR(EROFS); + goto out3; + } + + /* + * First validate permissions + */ + + if (mask & ATTR_SIZE) { + err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr); + if (err) + goto out3; + + /* + * XXX - Note, we are not providing any open + * mode flags here (like FNDELAY), so we may + * block if there are locks present... this + * should be addressed in openat(). + */ + /* XXX - would it be OK to generate a log record here? */ + err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE); + if (err) + goto out3; + } + + if (mask & (ATTR_ATIME|ATTR_MTIME) || + ((mask & ATTR_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) || + XVA_ISSET_REQ(xvap, XAT_READONLY) || + XVA_ISSET_REQ(xvap, XAT_ARCHIVE) || + XVA_ISSET_REQ(xvap, XAT_OFFLINE) || + XVA_ISSET_REQ(xvap, XAT_SPARSE) || + XVA_ISSET_REQ(xvap, XAT_CREATETIME) || + XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) { + need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0, + skipaclchk, cr); + } + + if (mask & (ATTR_UID|ATTR_GID)) { + int idmask = (mask & (ATTR_UID|ATTR_GID)); + int take_owner; + int take_group; + + /* + * NOTE: even if a new mode is being set, + * we may clear S_ISUID/S_ISGID bits. + */ + + if (!(mask & ATTR_MODE)) + vap->va_mode = zp->z_mode; + + /* + * Take ownership or chgrp to group we are a member of + */ + + take_owner = (mask & ATTR_UID) && (vap->va_uid == crgetuid(cr)); + take_group = (mask & ATTR_GID) && + zfs_groupmember(zfsvfs, vap->va_gid, cr); + + /* + * If both ATTR_UID and ATTR_GID are set then take_owner and + * take_group must both be set in order to allow taking + * ownership. + * + * Otherwise, send the check through secpolicy_vnode_setattr() + * + */ + + if (((idmask == (ATTR_UID|ATTR_GID)) && + take_owner && take_group) || + ((idmask == ATTR_UID) && take_owner) || + ((idmask == ATTR_GID) && take_group)) { + if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0, + skipaclchk, cr) == 0) { + /* + * Remove setuid/setgid for non-privileged users + */ + (void) secpolicy_setid_clear(vap, cr); + trim_mask = (mask & (ATTR_UID|ATTR_GID)); + } else { + need_policy = TRUE; + } + } else { + need_policy = TRUE; + } + } + + mutex_enter(&zp->z_lock); + oldva.va_mode = zp->z_mode; + zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid); + if (mask & ATTR_XVATTR) { + /* + * Update xvattr mask to include only those attributes + * that are actually changing. + * + * the bits will be restored prior to actually setting + * the attributes so the caller thinks they were set. 
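+		 *
+		 * Sketch: for each optional attribute whose requested
+		 * value already matches the current z_pflags state,
+		 *
+		 *	XVA_CLR_REQ(xvap, XAT_FOO);	 // skip the no-op
+		 *	XVA_SET_REQ(tmpxvattr, XAT_FOO); // remember the bit
+		 *
+		 * (XAT_FOO standing in for any flag handled below); the
+		 * saved bits are put back into xvap before returning.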
+ */ + if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { + if (xoap->xoa_appendonly != + ((zp->z_pflags & ZFS_APPENDONLY) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_APPENDONLY); + XVA_SET_REQ(tmpxvattr, XAT_APPENDONLY); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) { + if (xoap->xoa_projinherit != + ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_PROJINHERIT); + XVA_SET_REQ(tmpxvattr, XAT_PROJINHERIT); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { + if (xoap->xoa_nounlink != + ((zp->z_pflags & ZFS_NOUNLINK) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_NOUNLINK); + XVA_SET_REQ(tmpxvattr, XAT_NOUNLINK); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { + if (xoap->xoa_immutable != + ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_IMMUTABLE); + XVA_SET_REQ(tmpxvattr, XAT_IMMUTABLE); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { + if (xoap->xoa_nodump != + ((zp->z_pflags & ZFS_NODUMP) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_NODUMP); + XVA_SET_REQ(tmpxvattr, XAT_NODUMP); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { + if (xoap->xoa_av_modified != + ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_AV_MODIFIED); + XVA_SET_REQ(tmpxvattr, XAT_AV_MODIFIED); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { + if ((!S_ISREG(ip->i_mode) && + xoap->xoa_av_quarantined) || + xoap->xoa_av_quarantined != + ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED); + XVA_SET_REQ(tmpxvattr, XAT_AV_QUARANTINED); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { + mutex_exit(&zp->z_lock); + err = SET_ERROR(EPERM); + goto out3; + } + + if (need_policy == FALSE && + (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) || + XVA_ISSET_REQ(xvap, XAT_OPAQUE))) { + need_policy = TRUE; + } + } + + mutex_exit(&zp->z_lock); + + if (mask & ATTR_MODE) { + if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) { + err = secpolicy_setid_setsticky_clear(ip, vap, + &oldva, cr); + if (err) + goto out3; + + trim_mask |= ATTR_MODE; + } else { + need_policy = TRUE; + } + } + + if (need_policy) { + /* + * If trim_mask is set then take ownership + * has been granted or write_acl is present and user + * has the ability to modify mode. In that case remove + * UID|GID and or MODE from mask so that + * secpolicy_vnode_setattr() doesn't revoke it. 
+ */ + + if (trim_mask) { + saved_mask = vap->va_mask; + vap->va_mask &= ~trim_mask; + } + err = secpolicy_vnode_setattr(cr, ip, vap, &oldva, flags, + (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp); + if (err) + goto out3; + + if (trim_mask) + vap->va_mask |= saved_mask; + } + + /* + * secpolicy_vnode_setattr, or take ownership may have + * changed va_mask + */ + mask = vap->va_mask; + + if ((mask & (ATTR_UID | ATTR_GID)) || projid != ZFS_INVALID_PROJID) { + handle_eadir = B_TRUE; + err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), + &xattr_obj, sizeof (xattr_obj)); + + if (err == 0 && xattr_obj) { + err = zfs_zget(ZTOZSB(zp), xattr_obj, &attrzp); + if (err) + goto out2; + } + if (mask & ATTR_UID) { + new_kuid = zfs_fuid_create(zfsvfs, + (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp); + if (new_kuid != KUID_TO_SUID(ZTOI(zp)->i_uid) && + zfs_id_overquota(zfsvfs, DMU_USERUSED_OBJECT, + new_kuid)) { + if (attrzp) + zrele(attrzp); + err = SET_ERROR(EDQUOT); + goto out2; + } + } + + if (mask & ATTR_GID) { + new_kgid = zfs_fuid_create(zfsvfs, + (uint64_t)vap->va_gid, cr, ZFS_GROUP, &fuidp); + if (new_kgid != KGID_TO_SGID(ZTOI(zp)->i_gid) && + zfs_id_overquota(zfsvfs, DMU_GROUPUSED_OBJECT, + new_kgid)) { + if (attrzp) + zrele(attrzp); + err = SET_ERROR(EDQUOT); + goto out2; + } + } + + if (projid != ZFS_INVALID_PROJID && + zfs_id_overquota(zfsvfs, DMU_PROJECTUSED_OBJECT, projid)) { + if (attrzp) + zrele(attrzp); + err = EDQUOT; + goto out2; + } + } + tx = dmu_tx_create(os); + + if (mask & ATTR_MODE) { + uint64_t pmode = zp->z_mode; + uint64_t acl_obj; + new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT); + + if (ZTOZSB(zp)->z_acl_mode == ZFS_ACL_RESTRICTED && + !(zp->z_pflags & ZFS_ACL_TRIVIAL)) { + err = EPERM; + goto out; + } + + if ((err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))) + goto out; + + mutex_enter(&zp->z_lock); + if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) { + /* + * Are we upgrading ACL from old V0 format + * to V1 format? + */ + if (zfsvfs->z_version >= ZPL_VERSION_FUID && + zfs_znode_acl_version(zp) == + ZFS_ACL_VERSION_INITIAL) { + dmu_tx_hold_free(tx, acl_obj, 0, + DMU_OBJECT_END); + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, + 0, aclp->z_acl_bytes); + } else { + dmu_tx_hold_write(tx, acl_obj, 0, + aclp->z_acl_bytes); + } + } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) { + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, + 0, aclp->z_acl_bytes); + } + mutex_exit(&zp->z_lock); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); + } else { + if (((mask & ATTR_XVATTR) && + XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) || + (projid != ZFS_INVALID_PROJID && + !(zp->z_pflags & ZFS_PROJID))) + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); + else + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + } + + if (attrzp) { + dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE); + } + + fuid_dirtied = zfsvfs->z_fuid_dirty; + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); + + zfs_sa_upgrade_txholds(tx, zp); + + err = dmu_tx_assign(tx, TXG_WAIT); + if (err) + goto out; + + count = 0; + /* + * Set each attribute requested. + * We group settings according to the locks they need to acquire. + * + * Note: you cannot set ctime directly, although it will be + * updated as a side-effect of calling this function. + */ + + if (projid != ZFS_INVALID_PROJID && !(zp->z_pflags & ZFS_PROJID)) { + /* + * For the existed object that is upgraded from old system, + * its on-disk layout has no slot for the project ID attribute. 
+		 * But the quota accounting logic needs to access that slot
+		 * at a fixed offset, so here we adjust such old objects'
+		 * layouts to give the project ID a unified, fixed offset.
+		 */
+		if (attrzp)
+			err = sa_add_projid(attrzp->z_sa_hdl, tx, projid);
+		if (err == 0)
+			err = sa_add_projid(zp->z_sa_hdl, tx, projid);
+
+		if (unlikely(err == EEXIST))
+			err = 0;
+		else if (err != 0)
+			goto out;
+		else
+			projid = ZFS_INVALID_PROJID;
+	}
+
+	if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
+		mutex_enter(&zp->z_acl_lock);
+	mutex_enter(&zp->z_lock);
+
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+	    &zp->z_pflags, sizeof (zp->z_pflags));
+
+	if (attrzp) {
+		if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
+			mutex_enter(&attrzp->z_acl_lock);
+		mutex_enter(&attrzp->z_lock);
+		SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
+		    SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
+		    sizeof (attrzp->z_pflags));
+		if (projid != ZFS_INVALID_PROJID) {
+			attrzp->z_projid = projid;
+			SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
+			    SA_ZPL_PROJID(zfsvfs), NULL, &attrzp->z_projid,
+			    sizeof (attrzp->z_projid));
+		}
+	}
+
+	if (mask & (ATTR_UID|ATTR_GID)) {
+
+		if (mask & ATTR_UID) {
+			ZTOI(zp)->i_uid = SUID_TO_KUID(new_kuid);
+			new_uid = zfs_uid_read(ZTOI(zp));
+			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
+			    &new_uid, sizeof (new_uid));
+			if (attrzp) {
+				SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
+				    SA_ZPL_UID(zfsvfs), NULL, &new_uid,
+				    sizeof (new_uid));
+				ZTOI(attrzp)->i_uid = SUID_TO_KUID(new_uid);
+			}
+		}
+
+		if (mask & ATTR_GID) {
+			ZTOI(zp)->i_gid = SGID_TO_KGID(new_kgid);
+			new_gid = zfs_gid_read(ZTOI(zp));
+			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
+			    NULL, &new_gid, sizeof (new_gid));
+			if (attrzp) {
+				SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
+				    SA_ZPL_GID(zfsvfs), NULL, &new_gid,
+				    sizeof (new_gid));
+				ZTOI(attrzp)->i_gid = SGID_TO_KGID(new_kgid);
+			}
+		}
+		if (!(mask & ATTR_MODE)) {
+			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs),
+			    NULL, &new_mode, sizeof (new_mode));
+			new_mode = zp->z_mode;
+		}
+		err = zfs_acl_chown_setattr(zp);
+		ASSERT(err == 0);
+		if (attrzp) {
+			err = zfs_acl_chown_setattr(attrzp);
+			ASSERT(err == 0);
+		}
+	}
+
+	if (mask & ATTR_MODE) {
+		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
+		    &new_mode, sizeof (new_mode));
+		zp->z_mode = ZTOI(zp)->i_mode = new_mode;
+		ASSERT3P(aclp, !=, NULL);
+		err = zfs_aclset_common(zp, aclp, cr, tx);
+		ASSERT0(err);
+		if (zp->z_acl_cached)
+			zfs_acl_free(zp->z_acl_cached);
+		zp->z_acl_cached = aclp;
+		aclp = NULL;
+	}
+
+	if ((mask & ATTR_ATIME) || zp->z_atime_dirty) {
+		zp->z_atime_dirty = B_FALSE;
+		ZFS_TIME_ENCODE(&ip->i_atime, atime);
+		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
+		    &atime, sizeof (atime));
+	}
+
+	if (mask & (ATTR_MTIME | ATTR_SIZE)) {
+		ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
+		ZTOI(zp)->i_mtime = zpl_inode_timestamp_truncate(
+		    vap->va_mtime, ZTOI(zp));
+
+		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
+		    mtime, sizeof (mtime));
+	}
+
+	if (mask & (ATTR_CTIME | ATTR_SIZE)) {
+		ZFS_TIME_ENCODE(&vap->va_ctime, ctime);
+		ZTOI(zp)->i_ctime = zpl_inode_timestamp_truncate(vap->va_ctime,
+		    ZTOI(zp));
+		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+		    ctime, sizeof (ctime));
+	}
+
+	if (projid != ZFS_INVALID_PROJID) {
+		zp->z_projid = projid;
+		SA_ADD_BULK_ATTR(bulk, count,
+		    SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid,
+		    sizeof (zp->z_projid));
+	}
+
+	if (attrzp && mask) {
+		SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
+		    SA_ZPL_CTIME(zfsvfs), NULL, &ctime,
+		    sizeof (ctime));
+
} + + /* + * Do this after setting timestamps to prevent timestamp + * update from toggling bit + */ + + if (xoap && (mask & ATTR_XVATTR)) { + + /* + * restore trimmed off masks + * so that return masks can be set for caller. + */ + + if (XVA_ISSET_REQ(tmpxvattr, XAT_APPENDONLY)) { + XVA_SET_REQ(xvap, XAT_APPENDONLY); + } + if (XVA_ISSET_REQ(tmpxvattr, XAT_NOUNLINK)) { + XVA_SET_REQ(xvap, XAT_NOUNLINK); + } + if (XVA_ISSET_REQ(tmpxvattr, XAT_IMMUTABLE)) { + XVA_SET_REQ(xvap, XAT_IMMUTABLE); + } + if (XVA_ISSET_REQ(tmpxvattr, XAT_NODUMP)) { + XVA_SET_REQ(xvap, XAT_NODUMP); + } + if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_MODIFIED)) { + XVA_SET_REQ(xvap, XAT_AV_MODIFIED); + } + if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_QUARANTINED)) { + XVA_SET_REQ(xvap, XAT_AV_QUARANTINED); + } + if (XVA_ISSET_REQ(tmpxvattr, XAT_PROJINHERIT)) { + XVA_SET_REQ(xvap, XAT_PROJINHERIT); + } + + if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) + ASSERT(S_ISREG(ip->i_mode)); + + zfs_xvattr_set(zp, xvap, tx); + } + + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + + if (mask != 0) + zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp); + + mutex_exit(&zp->z_lock); + if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) + mutex_exit(&zp->z_acl_lock); + + if (attrzp) { + if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) + mutex_exit(&attrzp->z_acl_lock); + mutex_exit(&attrzp->z_lock); + } +out: + if (err == 0 && xattr_count > 0) { + err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk, + xattr_count, tx); + ASSERT(err2 == 0); + } + + if (aclp) + zfs_acl_free(aclp); + + if (fuidp) { + zfs_fuid_info_free(fuidp); + fuidp = NULL; + } + + if (err) { + dmu_tx_abort(tx); + if (attrzp) + zrele(attrzp); + if (err == ERESTART) + goto top; + } else { + if (count > 0) + err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + dmu_tx_commit(tx); + if (attrzp) { + if (err2 == 0 && handle_eadir) + err2 = zfs_setattr_dir(attrzp); + zrele(attrzp); + } + zfs_inode_update(zp); + } + +out2: + if (os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + +out3: + kmem_free(xattr_bulk, sizeof (sa_bulk_attr_t) * bulks); + kmem_free(bulk, sizeof (sa_bulk_attr_t) * bulks); + kmem_free(tmpxvattr, sizeof (xvattr_t)); + ZFS_EXIT(zfsvfs); + return (err); +} + +typedef struct zfs_zlock { + krwlock_t *zl_rwlock; /* lock we acquired */ + znode_t *zl_znode; /* znode we held */ + struct zfs_zlock *zl_next; /* next in list */ +} zfs_zlock_t; + +/* + * Drop locks and release vnodes that were held by zfs_rename_lock(). + */ +static void +zfs_rename_unlock(zfs_zlock_t **zlpp) +{ + zfs_zlock_t *zl; + + while ((zl = *zlpp) != NULL) { + if (zl->zl_znode != NULL) + zfs_zrele_async(zl->zl_znode); + rw_exit(zl->zl_rwlock); + *zlpp = zl->zl_next; + kmem_free(zl, sizeof (*zl)); + } +} + +/* + * Search back through the directory tree, using the ".." entries. + * Lock each directory in the chain to prevent concurrent renames. + * Fail any attempt to move a directory into one of its own descendants. + * XXX - z_parent_lock can overlap with map or grow locks + */ +static int +zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp) +{ + zfs_zlock_t *zl; + znode_t *zp = tdzp; + uint64_t rootid = ZTOZSB(zp)->z_root; + uint64_t oidp = zp->z_id; + krwlock_t *rwlp = &szp->z_parent_lock; + krw_t rw = RW_WRITER; + + /* + * First pass write-locks szp and compares to zp->z_id. + * Later passes read-lock zp and compare to zp->z_parent. + */ + do { + if (!rw_tryenter(rwlp, rw)) { + /* + * Another thread is renaming in this path. 
+			 * Note that if we are a WRITER, we don't have any
+			 * parent_locks held yet.
+			 */
+			if (rw == RW_READER && zp->z_id > szp->z_id) {
+				/*
+				 * Drop our locks and restart
+				 */
+				zfs_rename_unlock(&zl);
+				*zlpp = NULL;
+				zp = tdzp;
+				oidp = zp->z_id;
+				rwlp = &szp->z_parent_lock;
+				rw = RW_WRITER;
+				continue;
+			} else {
+				/*
+				 * Wait for other thread to drop its locks
+				 */
+				rw_enter(rwlp, rw);
+			}
+		}
+
+		zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
+		zl->zl_rwlock = rwlp;
+		zl->zl_znode = NULL;
+		zl->zl_next = *zlpp;
+		*zlpp = zl;
+
+		if (oidp == szp->z_id)		/* We're a descendant of szp */
+			return (SET_ERROR(EINVAL));
+
+		if (oidp == rootid)		/* We've hit the top */
+			return (0);
+
+		if (rw == RW_READER) {		/* i.e. not the first pass */
+			int error = zfs_zget(ZTOZSB(zp), oidp, &zp);
+			if (error)
+				return (error);
+			zl->zl_znode = zp;
+		}
+		(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(ZTOZSB(zp)),
+		    &oidp, sizeof (oidp));
+		rwlp = &zp->z_parent_lock;
+		rw = RW_READER;
+
+	} while (zp->z_id != sdzp->z_id);
+
+	return (0);
+}
+
+/*
+ * Move an entry from the provided source directory to the target
+ * directory.  Change the entry name as indicated.
+ *
+ * IN:	sdzp	- Source directory containing the "old entry".
+ *	snm	- Old entry name.
+ *	tdzp	- Target directory to contain the "new entry".
+ *	tnm	- New entry name.
+ *	cr	- credentials of caller.
+ *	flags	- case flags
+ *
+ * RETURN:	0 on success, error code on failure.
+ *
+ * Timestamps:
+ *	sdzp,tdzp - ctime|mtime updated
+ */
+/*ARGSUSED*/
+int
+zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm,
+    cred_t *cr, int flags)
+{
+	znode_t		*szp, *tzp;
+	zfsvfs_t	*zfsvfs = ZTOZSB(sdzp);
+	zilog_t		*zilog;
+	zfs_dirlock_t	*sdl, *tdl;
+	dmu_tx_t	*tx;
+	zfs_zlock_t	*zl;
+	int		cmp, serr, terr;
+	int		error = 0;
+	int		zflg = 0;
+	boolean_t	waited = B_FALSE;
+
+	if (snm == NULL || tnm == NULL)
+		return (SET_ERROR(EINVAL));
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(sdzp);
+	zilog = zfsvfs->z_log;
+
+	ZFS_VERIFY_ZP(tdzp);
+
+	/*
+	 * We check i_sb because snapshots and the ctldir must have different
+	 * super blocks.
+	 */
+	if (ZTOI(tdzp)->i_sb != ZTOI(sdzp)->i_sb ||
+	    zfsctl_is_node(ZTOI(tdzp))) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EXDEV));
+	}
+
+	if (zfsvfs->z_utf8 && u8_validate(tnm,
+	    strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EILSEQ));
+	}
+
+	if (flags & FIGNORECASE)
+		zflg |= ZCILOOK;
+
+top:
+	szp = NULL;
+	tzp = NULL;
+	zl = NULL;
+
+	/*
+	 * This is to prevent the creation of links into attribute space
+	 * by renaming a linked file into/out of an attribute directory.
+	 * See the comment in zfs_link() for why this is considered bad.
+	 */
+	if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EINVAL));
+	}
+
+	/*
+	 * Lock source and target directory entries.  To prevent deadlock,
+	 * a lock ordering must be defined.  We lock the directory with
+	 * the smallest object id first, or if it's a tie, the one with
+	 * the lexically first name.
+	 */
+	if (sdzp->z_id < tdzp->z_id) {
+		cmp = -1;
+	} else if (sdzp->z_id > tdzp->z_id) {
+		cmp = 1;
+	} else {
+		/*
+		 * First compare the two name arguments without
+		 * considering any case folding.
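+		 *
+		 * (Both names live in the same directory here, so the
+		 * tie is broken on the names themselves; e.g. renaming
+		 * "b" to "a" takes the lock on the "a" entry first.)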
+		 */
+		int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER);
+
+		cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error);
+		ASSERT(error == 0 || !zfsvfs->z_utf8);
+		if (cmp == 0) {
+			/*
+			 * POSIX: "If the old argument and the new argument
+			 * both refer to links to the same existing file,
+			 * the rename() function shall return successfully
+			 * and perform no other action."
+			 */
+			ZFS_EXIT(zfsvfs);
+			return (0);
+		}
+		/*
+		 * If the file system is case-folding, then we may have
+		 * some more checking to do.  A case-folding file system
+		 * either supports mixed case-sensitivity access or is
+		 * completely case-insensitive.  Note that the file system
+		 * is always case-preserving.
+		 *
+		 * In mixed-sensitivity mode, case-sensitive behavior is
+		 * the default.  FIGNORECASE must be used to explicitly
+		 * request case-insensitive behavior.
+		 *
+		 * If the source and target names provided differ only
+		 * by case (e.g., a request to rename 'tim' to 'Tim'),
+		 * we will treat this as a special case in the
+		 * case-insensitive mode: as long as the source name
+		 * is an exact match, we will allow this to proceed as
+		 * a name-change request.
+		 */
+		if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
+		    (zfsvfs->z_case == ZFS_CASE_MIXED &&
+		    flags & FIGNORECASE)) &&
+		    u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST,
+		    &error) == 0) {
+			/*
+			 * Case-preserving rename request; require exact
+			 * name matches.
+			 */
+			zflg |= ZCIEXACT;
+			zflg &= ~ZCILOOK;
+		}
+	}
+
+	/*
+	 * If the source and destination directories are the same, we should
+	 * grab the z_name_lock of that directory only once.
+	 */
+	if (sdzp == tdzp) {
+		zflg |= ZHAVELOCK;
+		rw_enter(&sdzp->z_name_lock, RW_READER);
+	}
+
+	if (cmp < 0) {
+		serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp,
+		    ZEXISTS | zflg, NULL, NULL);
+		terr = zfs_dirent_lock(&tdl,
+		    tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL);
+	} else {
+		terr = zfs_dirent_lock(&tdl,
+		    tdzp, tnm, &tzp, zflg, NULL, NULL);
+		serr = zfs_dirent_lock(&sdl,
+		    sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg,
+		    NULL, NULL);
+	}
+
+	if (serr) {
+		/*
+		 * Source entry invalid or not there.
+		 */
+		if (!terr) {
+			zfs_dirent_unlock(tdl);
+			if (tzp)
+				zrele(tzp);
+		}
+
+		if (sdzp == tdzp)
+			rw_exit(&sdzp->z_name_lock);
+
+		if (strcmp(snm, "..") == 0)
+			serr = EINVAL;
+		ZFS_EXIT(zfsvfs);
+		return (serr);
+	}
+	if (terr) {
+		zfs_dirent_unlock(sdl);
+		zrele(szp);
+
+		if (sdzp == tdzp)
+			rw_exit(&sdzp->z_name_lock);
+
+		if (strcmp(tnm, "..") == 0)
+			terr = EINVAL;
+		ZFS_EXIT(zfsvfs);
+		return (terr);
+	}
+
+	/*
+	 * If we are using project inheritance, meaning the directory has
+	 * ZFS_PROJINHERIT set, then its descendant directories will inherit
+	 * not only the project ID but also the ZFS_PROJINHERIT flag.  In
+	 * that case, we only allow renames into our tree when the project
+	 * IDs are the same.
+	 */
+	if (tdzp->z_pflags & ZFS_PROJINHERIT &&
+	    tdzp->z_projid != szp->z_projid) {
+		error = SET_ERROR(EXDEV);
+		goto out;
+	}
+
+	/*
+	 * Must have write access at the source to remove the old entry
+	 * and write access at the target to create the new entry.
+	 * Note that if target and source are the same, this can be
+	 * done in a single check.
+	 */
+
+	if ((error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr)))
+		goto out;
+
+	if (S_ISDIR(ZTOI(szp)->i_mode)) {
+		/*
+		 * Check to make sure rename is valid.
+		 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
+		 */
+		if ((error = zfs_rename_lock(szp, tdzp, sdzp, &zl)))
+			goto out;
+	}
+
+	/*
+	 * Does target exist?
+	 */
+	if (tzp) {
+		/*
+		 * Source and target must be the same type.
+		 */
+		if (S_ISDIR(ZTOI(szp)->i_mode)) {
+			if (!S_ISDIR(ZTOI(tzp)->i_mode)) {
+				error = SET_ERROR(ENOTDIR);
+				goto out;
+			}
+		} else {
+			if (S_ISDIR(ZTOI(tzp)->i_mode)) {
+				error = SET_ERROR(EISDIR);
+				goto out;
+			}
+		}
+		/*
+		 * POSIX dictates that when the source and target
+		 * entries refer to the same file object, rename
+		 * must do nothing and exit without error.
+		 */
+		if (szp->z_id == tzp->z_id) {
+			error = 0;
+			goto out;
+		}
+	}
+
+	tx = dmu_tx_create(zfsvfs->z_os);
+	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
+	dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
+	dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
+	dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
+	if (sdzp != tdzp) {
+		dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
+		zfs_sa_upgrade_txholds(tx, tdzp);
+	}
+	if (tzp) {
+		dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
+		zfs_sa_upgrade_txholds(tx, tzp);
+	}
+
+	zfs_sa_upgrade_txholds(tx, szp);
+	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
+	if (error) {
+		if (zl != NULL)
+			zfs_rename_unlock(&zl);
+		zfs_dirent_unlock(sdl);
+		zfs_dirent_unlock(tdl);
+
+		if (sdzp == tdzp)
+			rw_exit(&sdzp->z_name_lock);
+
+		if (error == ERESTART) {
+			waited = B_TRUE;
+			dmu_tx_wait(tx);
+			dmu_tx_abort(tx);
+			zrele(szp);
+			if (tzp)
+				zrele(tzp);
+			goto top;
+		}
+		dmu_tx_abort(tx);
+		zrele(szp);
+		if (tzp)
+			zrele(tzp);
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	if (tzp)	/* Attempt to remove the existing target */
+		error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL);
+
+	if (error == 0) {
+		error = zfs_link_create(tdl, szp, tx, ZRENAMING);
+		if (error == 0) {
+			szp->z_pflags |= ZFS_AV_MODIFIED;
+			if (tdzp->z_pflags & ZFS_PROJINHERIT)
+				szp->z_pflags |= ZFS_PROJINHERIT;
+
+			error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
+			    (void *)&szp->z_pflags, sizeof (uint64_t), tx);
+			ASSERT0(error);
+
+			error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
+			if (error == 0) {
+				zfs_log_rename(zilog, tx, TX_RENAME |
+				    (flags & FIGNORECASE ? TX_CI : 0), sdzp,
+				    sdl->dl_name, tdzp, tdl->dl_name, szp);
+			} else {
+				/*
+				 * At this point, we have successfully created
+				 * the target name, but have failed to remove
+				 * the source name.  Since the create was done
+				 * with the ZRENAMING flag, there are
+				 * complications; for one, the link count is
+				 * wrong.  The easiest way to deal with this
+				 * is to remove the newly created target, and
+				 * return the original error.  This must
+				 * succeed; fortunately, it is very unlikely to
+				 * fail, since we just created it.
+				 */
+				VERIFY3U(zfs_link_destroy(tdl, szp, tx,
+				    ZRENAMING, NULL), ==, 0);
+			}
+		} else {
+			/*
+			 * If we had removed the existing target, the
+			 * subsequent call to zfs_link_create() to add back
+			 * the same entry, but with a new dnode (szp),
+			 * should not have failed; hence, the target must
+			 * not have existed here (tzp == NULL).
+			 */
+			ASSERT(tzp == NULL);
+		}
+	}
+
+	dmu_tx_commit(tx);
+out:
+	if (zl != NULL)
+		zfs_rename_unlock(&zl);
+
+	zfs_dirent_unlock(sdl);
+	zfs_dirent_unlock(tdl);
+
+	zfs_inode_update(sdzp);
+	if (sdzp == tdzp)
+		rw_exit(&sdzp->z_name_lock);
+
+	if (sdzp != tdzp)
+		zfs_inode_update(tdzp);
+
+	zfs_inode_update(szp);
+	zrele(szp);
+	if (tzp) {
+		zfs_inode_update(tzp);
+		zrele(tzp);
+	}
+
+	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+		zil_commit(zilog, 0);
+
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+
+/*
+ * Insert the indicated symbolic reference entry into the directory.
+ *
+ * IN:	dzp	- Directory to contain new symbolic link.
+ *	name	- Name of directory entry in dzp.
+ *	vap	- Attributes of new entry.
+ *	link	- Name for new symlink entry.
+ *	cr	- credentials of caller.
+ *	flags	- case flags
+ *
+ * OUT:	zpp	- Znode for new symbolic link.
+ *
+ * RETURN:	0 on success, error code on failure.
+ *
+ * Timestamps:
+ *	dzp - ctime|mtime updated
+ */
+/*ARGSUSED*/
+int
+zfs_symlink(znode_t *dzp, char *name, vattr_t *vap, char *link,
+    znode_t **zpp, cred_t *cr, int flags)
+{
+	znode_t		*zp;
+	zfs_dirlock_t	*dl;
+	dmu_tx_t	*tx;
+	zfsvfs_t	*zfsvfs = ZTOZSB(dzp);
+	zilog_t		*zilog;
+	uint64_t	len = strlen(link);
+	int		error;
+	int		zflg = ZNEW;
+	zfs_acl_ids_t	acl_ids;
+	boolean_t	fuid_dirtied;
+	uint64_t	txtype = TX_SYMLINK;
+	boolean_t	waited = B_FALSE;
+
+	ASSERT(S_ISLNK(vap->va_mode));
+
+	if (name == NULL)
+		return (SET_ERROR(EINVAL));
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(dzp);
+	zilog = zfsvfs->z_log;
+
+	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
+	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EILSEQ));
+	}
+	if (flags & FIGNORECASE)
+		zflg |= ZCILOOK;
+
+	if (len > MAXPATHLEN) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(ENAMETOOLONG));
+	}
+
+	if ((error = zfs_acl_ids_create(dzp, 0,
+	    vap, cr, NULL, &acl_ids)) != 0) {
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+top:
+	*zpp = NULL;
+
+	/*
+	 * Attempt to lock directory; fail if entry already exists.
+	 */
+	error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL);
+	if (error) {
+		zfs_acl_ids_free(&acl_ids);
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) {
+		zfs_acl_ids_free(&acl_ids);
+		zfs_dirent_unlock(dl);
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, ZFS_DEFAULT_PROJID)) {
+		zfs_acl_ids_free(&acl_ids);
+		zfs_dirent_unlock(dl);
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EDQUOT));
+	}
+	tx = dmu_tx_create(zfsvfs->z_os);
+	fuid_dirtied = zfsvfs->z_fuid_dirty;
+	dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
+	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
+	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
+	    ZFS_SA_BASE_ATTR_SIZE + len);
+	dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
+	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
+		    acl_ids.z_aclp->z_acl_bytes);
+	}
+	if (fuid_dirtied)
+		zfs_fuid_txhold(zfsvfs, tx);
+	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
+	if (error) {
+		zfs_dirent_unlock(dl);
+		if (error == ERESTART) {
+			waited = B_TRUE;
+			dmu_tx_wait(tx);
+			dmu_tx_abort(tx);
+			goto top;
+		}
+		zfs_acl_ids_free(&acl_ids);
+		dmu_tx_abort(tx);
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	/*
+	 * Create a new object for the symlink.
+	 * For version 4 ZPL datasets the symlink will be an SA attribute.
+	 */
+	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
+
+	if (fuid_dirtied)
+		zfs_fuid_sync(zfsvfs, tx);
+
+	mutex_enter(&zp->z_lock);
+	if (zp->z_is_sa)
+		error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
+		    link, len, tx);
+	else
+		zfs_sa_symlink(zp, link, len, tx);
+	mutex_exit(&zp->z_lock);
+
+	zp->z_size = len;
+	(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
+	    &zp->z_size, sizeof (zp->z_size), tx);
+	/*
+	 * Insert the new object into the directory.
+	 */
+	error = zfs_link_create(dl, zp, tx, ZNEW);
+	if (error != 0) {
+		zfs_znode_delete(zp, tx);
+		remove_inode_hash(ZTOI(zp));
+	} else {
+		if (flags & FIGNORECASE)
+			txtype |= TX_CI;
+		zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
+
+		zfs_inode_update(dzp);
+		zfs_inode_update(zp);
+	}
+
+	zfs_acl_ids_free(&acl_ids);
+
+	dmu_tx_commit(tx);
+
+	zfs_dirent_unlock(dl);
+
+	if (error == 0) {
+		*zpp = zp;
+
+		if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+			zil_commit(zilog, 0);
+	} else {
+		zrele(zp);
+	}
+
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+
+/*
+ * Return, in the buffer contained in the provided uio structure,
+ * the symbolic path referred to by ip.
+ *
+ * IN:	ip	- inode of symbolic link
+ *	uio	- structure to contain the link path.
+ *	cr	- credentials of caller.
+ *
+ * RETURN:	0 if success
+ *		error code if failure
+ *
+ * Timestamps:
+ *	ip - atime updated
+ */
+/* ARGSUSED */
+int
+zfs_readlink(struct inode *ip, uio_t *uio, cred_t *cr)
+{
+	znode_t		*zp = ITOZ(ip);
+	zfsvfs_t	*zfsvfs = ITOZSB(ip);
+	int		error;
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(zp);
+
+	mutex_enter(&zp->z_lock);
+	if (zp->z_is_sa)
+		error = sa_lookup_uio(zp->z_sa_hdl,
+		    SA_ZPL_SYMLINK(zfsvfs), uio);
+	else
+		error = zfs_sa_readlink(zp, uio);
+	mutex_exit(&zp->z_lock);
+
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+
+/*
+ * Insert a new entry into directory tdzp referencing szp.
+ *
+ * IN:	tdzp	- Directory to contain new entry.
+ *	szp	- znode of new entry.
+ *	name	- name of new entry.
+ *	cr	- credentials of caller.
+ *	flags	- case flags.
+ *
+ * RETURN:	0 if success
+ *		error code if failure
+ *
+ * Timestamps:
+ *	tdzp - ctime|mtime updated
+ *	 szp - ctime updated
+ */
+/* ARGSUSED */
+int
+zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr,
+    int flags)
+{
+	struct inode	*sip = ZTOI(szp);
+	znode_t		*tzp;
+	zfsvfs_t	*zfsvfs = ZTOZSB(tdzp);
+	zilog_t		*zilog;
+	zfs_dirlock_t	*dl;
+	dmu_tx_t	*tx;
+	int		error;
+	int		zf = ZNEW;
+	uint64_t	parent;
+	uid_t		owner;
+	boolean_t	waited = B_FALSE;
+	boolean_t	is_tmpfile = 0;
+	uint64_t	txg;
+#ifdef HAVE_TMPFILE
+	is_tmpfile = (sip->i_nlink == 0 && (sip->i_state & I_LINKABLE));
+#endif
+	ASSERT(S_ISDIR(ZTOI(tdzp)->i_mode));
+
+	if (name == NULL)
+		return (SET_ERROR(EINVAL));
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(tdzp);
+	zilog = zfsvfs->z_log;
+
+	/*
+	 * POSIX dictates that we return EPERM here.
+	 * Better choices include ENOTSUP or EISDIR.
+	 */
+	if (S_ISDIR(sip->i_mode)) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EPERM));
+	}
+
+	ZFS_VERIFY_ZP(szp);
+
+	/*
+	 * If we are using project inheritance, meaning the directory has
+	 * ZFS_PROJINHERIT set, then its descendant directories will inherit
+	 * not only the project ID but also the ZFS_PROJINHERIT flag.  In
+	 * that case, we only allow hard link creation in our tree when the
+	 * project IDs are the same.
+	 */
+	if (tdzp->z_pflags & ZFS_PROJINHERIT &&
+	    tdzp->z_projid != szp->z_projid) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EXDEV));
+	}
+
+	/*
+	 * We check i_sb because snapshots and the ctldir must have different
+	 * super blocks.
+	 */
+	if (sip->i_sb != ZTOI(tdzp)->i_sb || zfsctl_is_node(sip)) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EXDEV));
+	}
+
+	/* Prevent links to .zfs/shares files */
+
+	if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
+	    &parent, sizeof (uint64_t))) != 0) {
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+	if (parent == zfsvfs->z_shares_dir) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EPERM));
+	}
+
+	if (zfsvfs->z_utf8 && u8_validate(name,
+	    strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EILSEQ));
+	}
+	if (flags & FIGNORECASE)
+		zf |= ZCILOOK;
+
+	/*
+	 * We do not support links between attributes and non-attributes
+	 * because of the potential security risk of creating links
+	 * into "normal" file space in order to circumvent restrictions
+	 * imposed in attribute space.
+	 */
+	if ((szp->z_pflags & ZFS_XATTR) != (tdzp->z_pflags & ZFS_XATTR)) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EINVAL));
+	}
+
+	owner = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(sip->i_uid),
+	    cr, ZFS_OWNER);
+	if (owner != crgetuid(cr) && secpolicy_basic_link(cr) != 0) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EPERM));
+	}
+
+	if ((error = zfs_zaccess(tdzp, ACE_ADD_FILE, 0, B_FALSE, cr))) {
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+top:
+	/*
+	 * Attempt to lock directory; fail if entry already exists.
+	 */
+	error = zfs_dirent_lock(&dl, tdzp, name, &tzp, zf, NULL, NULL);
+	if (error) {
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	tx = dmu_tx_create(zfsvfs->z_os);
+	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
+	dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, name);
+	if (is_tmpfile)
+		dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+
+	zfs_sa_upgrade_txholds(tx, szp);
+	zfs_sa_upgrade_txholds(tx, tdzp);
+	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
+	if (error) {
+		zfs_dirent_unlock(dl);
+		if (error == ERESTART) {
+			waited = B_TRUE;
+			dmu_tx_wait(tx);
+			dmu_tx_abort(tx);
+			goto top;
+		}
+		dmu_tx_abort(tx);
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+	/* Unmark z_unlinked so zfs_link_create() will not reject it. */
+	if (is_tmpfile)
+		szp->z_unlinked = B_FALSE;
+	error = zfs_link_create(dl, szp, tx, 0);
+
+	if (error == 0) {
+		uint64_t txtype = TX_LINK;
+		/*
+		 * A tmpfile is created to be in z_unlinkedobj, so remove it.
+		 * Also, we don't log in the ZIL, because all previous file
+		 * operations on the tmpfile are ignored by the ZIL.  Instead,
+		 * we always wait for the txg to sync to make sure all
+		 * previous operations are sync safe.
+		 */
+		if (is_tmpfile) {
+			VERIFY(zap_remove_int(zfsvfs->z_os,
+			    zfsvfs->z_unlinkedobj, szp->z_id, tx) == 0);
+		} else {
+			if (flags & FIGNORECASE)
+				txtype |= TX_CI;
+			zfs_log_link(zilog, tx, txtype, tdzp, szp, name);
+		}
+	} else if (is_tmpfile) {
+		/* Restore z_unlinked, since linking failed. */
+		szp->z_unlinked = B_TRUE;
+	}
+	txg = dmu_tx_get_txg(tx);
+	dmu_tx_commit(tx);
+
+	zfs_dirent_unlock(dl);
+
+	if (!is_tmpfile && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+		zil_commit(zilog, 0);
+
+	if (is_tmpfile && zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED)
+		txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), txg);
+
+	zfs_inode_update(tdzp);
+	zfs_inode_update(szp);
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+
+static void
+zfs_putpage_commit_cb(void *arg)
+{
+	struct page *pp = arg;
+
+	ClearPageError(pp);
+	end_page_writeback(pp);
+}
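
zfs_putpage() below is reached from the kernel writeback machinery via the
ZPL's address_space_operations; the glue amounts to the following sketch
(illustrative only -- the in-tree hook, zpl_putpage(), lives in zpl_file.c
and is normally driven through write_cache_pages()):

static int
my_writepage(struct page *pp, struct writeback_control *wbc)
{
	/* The owning inode is reachable through the page's mapping. */
	return (zfs_putpage(pp->mapping->host, pp, wbc));
}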
+
+/*
+ * Push a page out to disk; once the page is on stable storage the
+ * registered commit callback will be run as notification of completion.
+ *
+ * IN:	ip	- page mapped for inode.
+ *	pp	- page to push (page is locked)
+ *	wbc	- writeback control data
+ *
+ * RETURN:	0 if success
+ *		error code if failure
+ *
+ * Timestamps:
+ *	ip - ctime|mtime updated
+ */
+/* ARGSUSED */
+int
+zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
+{
+	znode_t		*zp = ITOZ(ip);
+	zfsvfs_t	*zfsvfs = ITOZSB(ip);
+	loff_t		offset;
+	loff_t		pgoff;
+	unsigned int	pglen;
+	dmu_tx_t	*tx;
+	caddr_t		va;
+	int		err = 0;
+	uint64_t	mtime[2], ctime[2];
+	sa_bulk_attr_t	bulk[3];
+	int		cnt = 0;
+	struct address_space *mapping;
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(zp);
+
+	ASSERT(PageLocked(pp));
+
+	pgoff = page_offset(pp);	/* Page byte-offset in file */
+	offset = i_size_read(ip);	/* File length in bytes */
+	pglen = MIN(PAGE_SIZE,		/* Page length in bytes */
+	    P2ROUNDUP(offset, PAGE_SIZE)-pgoff);
+
+	/* Page is beyond end of file */
+	if (pgoff >= offset) {
+		unlock_page(pp);
+		ZFS_EXIT(zfsvfs);
+		return (0);
+	}
+
+	/* Truncate page length to end of file */
+	if (pgoff + pglen > offset)
+		pglen = offset - pgoff;
+
+#if 0
+	/*
+	 * FIXME: Allow mmap writes past its quota.  The correct fix
+	 * is to register a page_mkwrite() handler to count the page
+	 * against its quota when it is about to be dirtied.
+	 */
+	if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT,
+	    KUID_TO_SUID(ip->i_uid)) ||
+	    zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT,
+	    KGID_TO_SGID(ip->i_gid)) ||
+	    (zp->z_projid != ZFS_DEFAULT_PROJID &&
+	    zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT,
+	    zp->z_projid))) {
+		err = EDQUOT;
+	}
+#endif
+
+	/*
+	 * The ordering here is critical and must adhere to the following
+	 * rules in order to avoid deadlocking in either zfs_read() or
+	 * zfs_free_range() due to a lock inversion.
+	 *
+	 * 1) The page must be unlocked prior to acquiring the range lock.
+	 *    This is critical because zfs_read() calls find_lock_page()
+	 *    which may block on the page lock while holding the range lock.
+	 *
+	 * 2) Before setting or clearing write back on a page the range lock
+	 *    must be held in order to prevent a lock inversion with the
+	 *    zfs_free_range() function.
+	 *
+	 * This presents a problem because upon entering this function the
+	 * page lock is already held.  To safely acquire the range lock the
+	 * page lock must be dropped.  This creates a window where another
+	 * process could truncate, invalidate, dirty, or write out the page.
+	 *
+	 * Therefore, after successfully reacquiring the range and page locks
+	 * the current page state is checked.  In the common case everything
+	 * will be as expected and it can be written out.  However, if the
+	 * page state has changed it must be handled accordingly.
+	 */
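+	/*
+	 * Redirty the page and note its mapping before dropping the page
+	 * lock, then revalidate its state under the range lock below; the
+	 * redirty is undone (pages_skipped is corrected) once the page is
+	 * actually written out.
+	 */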
+	mapping = pp->mapping;
+	redirty_page_for_writepage(wbc, pp);
+	unlock_page(pp);
+
+	zfs_locked_range_t *lr = zfs_rangelock_enter(&zp->z_rangelock,
+	    pgoff, pglen, RL_WRITER);
+	lock_page(pp);
+
+	/* Page mapping changed or it is no longer dirty; we're done. */
+	if (unlikely((mapping != pp->mapping) || !PageDirty(pp))) {
+		unlock_page(pp);
+		zfs_rangelock_exit(lr);
+		ZFS_EXIT(zfsvfs);
+		return (0);
+	}
+
+	/* Another process started writeback; block on it if required. */
+	if (PageWriteback(pp)) {
+		unlock_page(pp);
+		zfs_rangelock_exit(lr);
+
+		if (wbc->sync_mode != WB_SYNC_NONE) {
+			if (PageWriteback(pp))
+				wait_on_page_bit(pp, PG_writeback);
+		}
+
+		ZFS_EXIT(zfsvfs);
+		return (0);
+	}
+
+	/* Clear the dirty flag now that the required locks are held. */
+	if (!clear_page_dirty_for_io(pp)) {
+		unlock_page(pp);
+		zfs_rangelock_exit(lr);
+		ZFS_EXIT(zfsvfs);
+		return (0);
+	}
+
+	/*
+	 * Counterpart for redirty_page_for_writepage() above.  This page
+	 * was in fact not skipped and should not be counted as if it were.
+	 */
+	wbc->pages_skipped--;
+	set_page_writeback(pp);
+	unlock_page(pp);
+
+	tx = dmu_tx_create(zfsvfs->z_os);
+	dmu_tx_hold_write(tx, zp->z_id, pgoff, pglen);
+	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+	zfs_sa_upgrade_txholds(tx, zp);
+
+	err = dmu_tx_assign(tx, TXG_NOWAIT);
+	if (err != 0) {
+		if (err == ERESTART)
+			dmu_tx_wait(tx);
+
+		dmu_tx_abort(tx);
+		__set_page_dirty_nobuffers(pp);
+		ClearPageError(pp);
+		end_page_writeback(pp);
+		zfs_rangelock_exit(lr);
+		ZFS_EXIT(zfsvfs);
+		return (err);
+	}
+
+	va = kmap(pp);
+	ASSERT3U(pglen, <=, PAGE_SIZE);
+	dmu_write(zfsvfs->z_os, zp->z_id, pgoff, pglen, va, tx);
+	kunmap(pp);
+
+	SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
+	SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
+	SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_FLAGS(zfsvfs), NULL,
+	    &zp->z_pflags, 8);
+
+	/* Preserve the mtime and ctime provided by the inode */
+	ZFS_TIME_ENCODE(&ip->i_mtime, mtime);
+	ZFS_TIME_ENCODE(&ip->i_ctime, ctime);
+	zp->z_atime_dirty = B_FALSE;
+	zp->z_seq++;
+
+	err = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx);
+
+	zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, 0,
+	    zfs_putpage_commit_cb, pp);
+	dmu_tx_commit(tx);
+
+	zfs_rangelock_exit(lr);
+
+	if (wbc->sync_mode != WB_SYNC_NONE) {
+		/*
+		 * Note that this is rarely called under writepages(), because
+		 * writepages() normally handles the entire commit for
+		 * performance reasons.
+		 */
+		zil_commit(zfsvfs->z_log, zp->z_id);
+	}
+
+	ZFS_EXIT(zfsvfs);
+	return (err);
+}
+
+/*
+ * Update the system attributes when the inode has been dirtied.  For the
+ * moment we only update the mode, atime, mtime, and ctime.
+ */
+int
+zfs_dirty_inode(struct inode *ip, int flags)
+{
+	znode_t		*zp = ITOZ(ip);
+	zfsvfs_t	*zfsvfs = ITOZSB(ip);
+	dmu_tx_t	*tx;
+	uint64_t	mode, atime[2], mtime[2], ctime[2];
+	sa_bulk_attr_t	bulk[4];
+	int		error = 0;
+	int		cnt = 0;
+
+	if (zfs_is_readonly(zfsvfs) || dmu_objset_is_snapshot(zfsvfs->z_os))
+		return (0);
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(zp);
+
+#ifdef I_DIRTY_TIME
+	/*
+	 * This is the lazytime semantic introduced in Linux 4.0.  This
+	 * flag will only be passed from update_time() when lazytime is
+	 * enabled.  (Note: I_DIRTY_SYNC will also be set if lazytime is
+	 * not enabled.)  Fortunately mtime and ctime are managed within
+	 * ZFS itself, so we only need to dirty atime.
+ */ + if (flags == I_DIRTY_TIME) { + zp->z_atime_dirty = B_TRUE; + goto out; + } +#endif + + tx = dmu_tx_create(zfsvfs->z_os); + + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, zp); + + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + goto out; + } + + mutex_enter(&zp->z_lock); + zp->z_atime_dirty = B_FALSE; + + SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); + SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16); + SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); + SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); + + /* Preserve the mode, mtime and ctime provided by the inode */ + ZFS_TIME_ENCODE(&ip->i_atime, atime); + ZFS_TIME_ENCODE(&ip->i_mtime, mtime); + ZFS_TIME_ENCODE(&ip->i_ctime, ctime); + mode = ip->i_mode; + + zp->z_mode = mode; + + error = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx); + mutex_exit(&zp->z_lock); + + dmu_tx_commit(tx); +out: + ZFS_EXIT(zfsvfs); + return (error); +} + +/*ARGSUSED*/ +void +zfs_inactive(struct inode *ip) +{ + znode_t *zp = ITOZ(ip); + zfsvfs_t *zfsvfs = ITOZSB(ip); + uint64_t atime[2]; + int error; + int need_unlock = 0; + + /* Only read lock if we haven't already write locked, e.g. rollback */ + if (!RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)) { + need_unlock = 1; + rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); + } + if (zp->z_sa_hdl == NULL) { + if (need_unlock) + rw_exit(&zfsvfs->z_teardown_inactive_lock); + return; + } + + if (zp->z_atime_dirty && zp->z_unlinked == B_FALSE) { + dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); + + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, zp); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + } else { + ZFS_TIME_ENCODE(&ip->i_atime, atime); + mutex_enter(&zp->z_lock); + (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs), + (void *)&atime, sizeof (atime), tx); + zp->z_atime_dirty = B_FALSE; + mutex_exit(&zp->z_lock); + dmu_tx_commit(tx); + } + } + + zfs_zinactive(zp); + if (need_unlock) + rw_exit(&zfsvfs->z_teardown_inactive_lock); +} + +/* + * Fill pages with data from the disk. + */ +static int +zfs_fillpage(struct inode *ip, struct page *pl[], int nr_pages) +{ + znode_t *zp = ITOZ(ip); + zfsvfs_t *zfsvfs = ITOZSB(ip); + objset_t *os; + struct page *cur_pp; + u_offset_t io_off, total; + size_t io_len; + loff_t i_size; + unsigned page_idx; + int err; + + os = zfsvfs->z_os; + io_len = nr_pages << PAGE_SHIFT; + i_size = i_size_read(ip); + io_off = page_offset(pl[0]); + + if (io_off + io_len > i_size) + io_len = i_size - io_off; + + /* + * Iterate over list of pages and read each page individually. + */ + page_idx = 0; + for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) { + caddr_t va; + + cur_pp = pl[page_idx++]; + va = kmap(cur_pp); + err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va, + DMU_READ_PREFETCH); + kunmap(cur_pp); + if (err) { + /* convert checksum errors into IO errors */ + if (err == ECKSUM) + err = SET_ERROR(EIO); + return (err); + } + } + + return (0); +} + +/* + * Uses zfs_fillpage to read data from the file and fill the pages. + * + * IN: ip - inode of file to get data from. + * pl - list of pages to read + * nr_pages - number of pages to read + * + * RETURN: 0 on success, error code on failure. 
+ *
+ * Timestamps:
+ *	ip - atime updated
+ */
+/* ARGSUSED */
+int
+zfs_getpage(struct inode *ip, struct page *pl[], int nr_pages)
+{
+	znode_t		*zp = ITOZ(ip);
+	zfsvfs_t	*zfsvfs = ITOZSB(ip);
+	int		err;
+
+	if (pl == NULL)
+		return (0);
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(zp);
+
+	err = zfs_fillpage(ip, pl, nr_pages);
+
+	ZFS_EXIT(zfsvfs);
+	return (err);
+}
+
+/*
+ * Check ZFS specific permissions to memory map a section of a file.
+ *
+ * IN:	ip	- inode of the file to mmap
+ *	off	- file offset
+ *	addrp	- start address in memory region
+ *	len	- length of memory region
+ *	vm_flags - address flags
+ *
+ * RETURN:	0 if success
+ *		error code if failure
+ */
+/*ARGSUSED*/
+int
+zfs_map(struct inode *ip, offset_t off, caddr_t *addrp, size_t len,
+    unsigned long vm_flags)
+{
+	znode_t		*zp = ITOZ(ip);
+	zfsvfs_t	*zfsvfs = ITOZSB(ip);
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(zp);
+
+	if ((vm_flags & VM_WRITE) && (zp->z_pflags &
+	    (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EPERM));
+	}
+
+	if ((vm_flags & (VM_READ | VM_EXEC)) &&
+	    (zp->z_pflags & ZFS_AV_QUARANTINED)) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EACCES));
+	}
+
+	if (off < 0 || len > MAXOFFSET_T - off) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(ENXIO));
+	}
+
+	ZFS_EXIT(zfsvfs);
+	return (0);
+}
+
+/*
+ * Free or allocate space in a file.  Currently, this function only
+ * supports the `F_FREESP' command.  However, this command is somewhat
+ * misnamed, as its functionality includes the ability to allocate as
+ * well as free space.
+ *
+ * IN:	zp	- znode of file to free data in.
+ *	cmd	- action to take (only F_FREESP supported).
+ *	bfp	- section of file to free/alloc.
+ *	flag	- current file open mode flags.
+ *	offset	- current file offset.
+ *	cr	- credentials of caller.
+ *
+ * RETURN:	0 on success, error code on failure.
+ *
+ * Timestamps:
+ *	zp - ctime|mtime updated
+ */
+/* ARGSUSED */
+int
+zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag,
+    offset_t offset, cred_t *cr)
+{
+	zfsvfs_t	*zfsvfs = ZTOZSB(zp);
+	uint64_t	off, len;
+	int		error;
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(zp);
+
+	if (cmd != F_FREESP) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EINVAL));
+	}
+
+	/*
+	 * Callers might not be able to detect properly that we are read-only,
+	 * so check it explicitly here.
+	 */
+	if (zfs_is_readonly(zfsvfs)) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EROFS));
+	}
+
+	if (bfp->l_len < 0) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EINVAL));
+	}
+
+	/*
+	 * Permissions aren't checked on Solaris because on this OS
+	 * zfs_space() can only be called with an opened file handle.
+	 * On Linux we can get here through truncate_range() which
+	 * operates directly on inodes, so we need to check access rights.
+ */ + if ((error = zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr))) { + ZFS_EXIT(zfsvfs); + return (error); + } + + off = bfp->l_start; + len = bfp->l_len; /* 0 means from off to end of file */ + + error = zfs_freesp(zp, off, len, flag, TRUE); + + ZFS_EXIT(zfsvfs); + return (error); +} + +/*ARGSUSED*/ +int +zfs_fid(struct inode *ip, fid_t *fidp) +{ + znode_t *zp = ITOZ(ip); + zfsvfs_t *zfsvfs = ITOZSB(ip); + uint32_t gen; + uint64_t gen64; + uint64_t object = zp->z_id; + zfid_short_t *zfid; + int size, i, error; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), + &gen64, sizeof (uint64_t))) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + + gen = (uint32_t)gen64; + + size = SHORT_FID_LEN; + + zfid = (zfid_short_t *)fidp; + + zfid->zf_len = size; + + for (i = 0; i < sizeof (zfid->zf_object); i++) + zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); + + /* Must have a non-zero generation number to distinguish from .zfs */ + if (gen == 0) + gen = 1; + for (i = 0; i < sizeof (zfid->zf_gen); i++) + zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i)); + + ZFS_EXIT(zfsvfs); + return (0); +} + +#if defined(_KERNEL) +EXPORT_SYMBOL(zfs_open); +EXPORT_SYMBOL(zfs_close); +EXPORT_SYMBOL(zfs_lookup); +EXPORT_SYMBOL(zfs_create); +EXPORT_SYMBOL(zfs_tmpfile); +EXPORT_SYMBOL(zfs_remove); +EXPORT_SYMBOL(zfs_mkdir); +EXPORT_SYMBOL(zfs_rmdir); +EXPORT_SYMBOL(zfs_readdir); +EXPORT_SYMBOL(zfs_getattr_fast); +EXPORT_SYMBOL(zfs_setattr); +EXPORT_SYMBOL(zfs_rename); +EXPORT_SYMBOL(zfs_symlink); +EXPORT_SYMBOL(zfs_readlink); +EXPORT_SYMBOL(zfs_link); +EXPORT_SYMBOL(zfs_inactive); +EXPORT_SYMBOL(zfs_space); +EXPORT_SYMBOL(zfs_fid); +EXPORT_SYMBOL(zfs_getpage); +EXPORT_SYMBOL(zfs_putpage); +EXPORT_SYMBOL(zfs_dirty_inode); +EXPORT_SYMBOL(zfs_map); + +/* BEGIN CSTYLED */ +module_param(zfs_delete_blocks, ulong, 0644); +MODULE_PARM_DESC(zfs_delete_blocks, "Delete files larger than N blocks async"); +/* END CSTYLED */ + +#endif -- cgit v1.2.3
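
For reference, the short FID built by zfs_fid() above packs the object
number and a non-zero generation as little-endian byte arrays.  A minimal
sketch of the inverse decoding (illustrative only -- the in-tree consumer
of these FIDs is zfs_vget()):

static void
zfid_unpack_sketch(const zfid_short_t *zfid, uint64_t *objp, uint32_t *genp)
{
	uint64_t object = 0;
	uint32_t gen = 0;
	int i;

	/* Reassemble the little-endian byte arrays built by zfs_fid(). */
	for (i = 0; i < sizeof (zfid->zf_object); i++)
		object |= (uint64_t)zfid->zf_object[i] << (8 * i);
	for (i = 0; i < sizeof (zfid->zf_gen); i++)
		gen |= (uint32_t)zfid->zf_gen[i] << (8 * i);

	*objp = object;
	*genp = gen;	/* gen == 0 is reserved for .zfs entries */
}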