diff options
Diffstat (limited to 'module')
42 files changed, 852 insertions, 297 deletions
diff --git a/module/os/freebsd/zfs/kmod_core.c b/module/os/freebsd/zfs/kmod_core.c index dce73577eacd..4c696129857a 100644 --- a/module/os/freebsd/zfs/kmod_core.c +++ b/module/os/freebsd/zfs/kmod_core.c @@ -378,4 +378,3 @@ MODULE_DEPEND(zfsctrl, krpc, 1, 1, 1); #endif MODULE_DEPEND(zfsctrl, acl_nfs4, 1, 1, 1); MODULE_DEPEND(zfsctrl, crypto, 1, 1, 1); -MODULE_DEPEND(zfsctrl, cryptodev, 1, 1, 1); diff --git a/module/os/freebsd/zfs/sysctl_os.c b/module/os/freebsd/zfs/sysctl_os.c index 200bbf43d757..b3cb7e7e4374 100644 --- a/module/os/freebsd/zfs/sysctl_os.c +++ b/module/os/freebsd/zfs/sysctl_os.c @@ -121,6 +121,7 @@ SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO"); SYSCTL_NODE(_vfs_zfs_livelist, OID_AUTO, condense, CTLFLAG_RW, 0, "ZFS livelist condense"); SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, cache, CTLFLAG_RW, 0, "ZFS VDEV Cache"); +SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, file, CTLFLAG_RW, 0, "ZFS VDEV file"); SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, mirror, CTLFLAG_RD, 0, "ZFS VDEV mirror"); diff --git a/module/os/freebsd/zfs/vdev_file.c b/module/os/freebsd/zfs/vdev_file.c index 4d27751c8893..cf762c5fd61c 100644 --- a/module/os/freebsd/zfs/vdev_file.c +++ b/module/os/freebsd/zfs/vdev_file.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2016 by Delphix. All rights reserved. + * Copyright (c) 2011, 2020 by Delphix. All rights reserved. */ #include <sys/zfs_context.h> @@ -40,6 +40,9 @@ static taskq_t *vdev_file_taskq; +unsigned long vdev_file_logical_ashift = SPA_MINBLOCKSHIFT; +unsigned long vdev_file_physical_ashift = SPA_MINBLOCKSHIFT; + void vdev_file_init(void) { @@ -167,8 +170,8 @@ skip_open: } *max_psize = *psize = zfa.zfa_size; - *logical_ashift = SPA_MINBLOCKSHIFT; - *physical_ashift = SPA_MINBLOCKSHIFT; + *logical_ashift = vdev_file_logical_ashift; + *physical_ashift = vdev_file_physical_ashift; return (0); } @@ -326,3 +329,8 @@ vdev_ops_t vdev_disk_ops = { }; #endif + +ZFS_MODULE_PARAM(zfs_vdev_file, vdev_file_, logical_ashift, ULONG, ZMOD_RW, + "Logical ashift for file-based devices"); +ZFS_MODULE_PARAM(zfs_vdev_file, vdev_file_, physical_ashift, ULONG, ZMOD_RW, + "Physical ashift for file-based devices"); diff --git a/module/os/freebsd/zfs/zfs_file_os.c b/module/os/freebsd/zfs/zfs_file_os.c index ec7c04717c84..d7786d5136a2 100644 --- a/module/os/freebsd/zfs/zfs_file_os.c +++ b/module/os/freebsd/zfs/zfs_file_os.c @@ -234,13 +234,10 @@ drop: int zfs_file_fsync(zfs_file_t *fp, int flags) { - struct vnode *v; - if (fp->f_type != DTYPE_VNODE) return (EINVAL); - v = fp->f_data; - return (zfs_vop_fsync(v)); + return (zfs_vop_fsync(fp->f_vnode)); } int diff --git a/module/os/freebsd/zfs/zfs_vfsops.c b/module/os/freebsd/zfs/zfs_vfsops.c index b6cf0c92b70b..77812ca8d400 100644 --- a/module/os/freebsd/zfs/zfs_vfsops.c +++ b/module/os/freebsd/zfs/zfs_vfsops.c @@ -975,7 +975,7 @@ zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os) #else rrm_init(&zfsvfs->z_teardown_lock, B_FALSE); #endif - rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL); + ZFS_INIT_TEARDOWN_INACTIVE(zfsvfs); rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL); for (int i = 0; i != ZFS_OBJ_MTX_SZ; i++) mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL); @@ -1126,7 +1126,7 @@ zfsvfs_free(zfsvfs_t *zfsvfs) ASSERT(zfsvfs->z_nr_znodes == 0); list_destroy(&zfsvfs->z_all_znodes); rrm_destroy(&zfsvfs->z_teardown_lock); - rw_destroy(&zfsvfs->z_teardown_inactive_lock); + ZFS_DESTROY_TEARDOWN_INACTIVE(zfsvfs); rw_destroy(&zfsvfs->z_fuid_lock); for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) mutex_destroy(&zfsvfs->z_hold_mtx[i]); @@ -1545,7 +1545,7 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting) zfsvfs->z_log = NULL; } - rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_WRITER); + ZFS_WLOCK_TEARDOWN_INACTIVE(zfsvfs); /* * If we are not unmounting (ie: online recv) and someone already @@ -1553,7 +1553,7 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting) * or a reopen of z_os failed then just bail out now. */ if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) { - rw_exit(&zfsvfs->z_teardown_inactive_lock); + ZFS_WUNLOCK_TEARDOWN_INACTIVE(zfsvfs); rrm_exit(&zfsvfs->z_teardown_lock, FTAG); return (SET_ERROR(EIO)); } @@ -1581,7 +1581,7 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting) */ if (unmounting) { zfsvfs->z_unmounted = B_TRUE; - rw_exit(&zfsvfs->z_teardown_inactive_lock); + ZFS_WUNLOCK_TEARDOWN_INACTIVE(zfsvfs); rrm_exit(&zfsvfs->z_teardown_lock, FTAG); } @@ -1901,7 +1901,7 @@ zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds) znode_t *zp; ASSERT(RRM_WRITE_HELD(&zfsvfs->z_teardown_lock)); - ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)); + ASSERT(ZFS_TEARDOWN_INACTIVE_WLOCKED(zfsvfs)); /* * We already own this, so just update the objset_t, as the one we @@ -1939,7 +1939,7 @@ zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds) bail: /* release the VOPs */ - rw_exit(&zfsvfs->z_teardown_inactive_lock); + ZFS_WUNLOCK_TEARDOWN_INACTIVE(zfsvfs); rrm_exit(&zfsvfs->z_teardown_lock, FTAG); if (err) { @@ -2056,7 +2056,7 @@ int zfs_end_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds) { ASSERT(RRM_WRITE_HELD(&zfsvfs->z_teardown_lock)); - ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)); + ASSERT(ZFS_TEARDOWN_INACTIVE_WLOCKED(zfsvfs)); /* * We already own this, so just hold and rele it to update the @@ -2072,7 +2072,7 @@ zfs_end_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds) zfsvfs->z_os = os; /* release the VOPs */ - rw_exit(&zfsvfs->z_teardown_inactive_lock); + ZFS_WUNLOCK_TEARDOWN_INACTIVE(zfsvfs); rrm_exit(&zfsvfs->z_teardown_lock, FTAG); /* diff --git a/module/os/freebsd/zfs/zfs_vnops.c b/module/os/freebsd/zfs/zfs_vnops.c index 2a4acf21582f..2dde5b1f9d5c 100644 --- a/module/os/freebsd/zfs/zfs_vnops.c +++ b/module/os/freebsd/zfs/zfs_vnops.c @@ -4638,13 +4638,13 @@ zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) zfsvfs_t *zfsvfs = zp->z_zfsvfs; int error; - rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); + ZFS_RLOCK_TEARDOWN_INACTIVE(zfsvfs); if (zp->z_sa_hdl == NULL) { /* * The fs has been unmounted, or we did a * suspend/resume and this file no longer exists. */ - rw_exit(&zfsvfs->z_teardown_inactive_lock); + ZFS_RUNLOCK_TEARDOWN_INACTIVE(zfsvfs); vrecycle(vp); return; } @@ -4653,7 +4653,7 @@ zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) /* * Fast path to recycle a vnode of a removed file. */ - rw_exit(&zfsvfs->z_teardown_inactive_lock); + ZFS_RUNLOCK_TEARDOWN_INACTIVE(zfsvfs); vrecycle(vp); return; } @@ -4673,7 +4673,7 @@ zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) dmu_tx_commit(tx); } } - rw_exit(&zfsvfs->z_teardown_inactive_lock); + ZFS_RUNLOCK_TEARDOWN_INACTIVE(zfsvfs); } @@ -5823,10 +5823,10 @@ zfs_freebsd_need_inactive(struct vop_need_inactive_args *ap) if (vn_need_pageq_flush(vp)) return (1); - if (!rw_tryenter(&zfsvfs->z_teardown_inactive_lock, RW_READER)) + if (!ZFS_TRYRLOCK_TEARDOWN_INACTIVE(zfsvfs)) return (1); need = (zp->z_sa_hdl == NULL || zp->z_unlinked || zp->z_atime_dirty); - rw_exit(&zfsvfs->z_teardown_inactive_lock); + ZFS_RUNLOCK_TEARDOWN_INACTIVE(zfsvfs); return (need); } @@ -5857,12 +5857,12 @@ zfs_freebsd_reclaim(struct vop_reclaim_args *ap) * zfs_znode_dmu_fini in zfsvfs_teardown during * force unmount. */ - rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); + ZFS_RLOCK_TEARDOWN_INACTIVE(zfsvfs); if (zp->z_sa_hdl == NULL) zfs_znode_free(zp); else zfs_zinactive(zp); - rw_exit(&zfsvfs->z_teardown_inactive_lock); + ZFS_RUNLOCK_TEARDOWN_INACTIVE(zfsvfs); vp->v_data = NULL; return (0); diff --git a/module/os/freebsd/zfs/zfs_znode.c b/module/os/freebsd/zfs/zfs_znode.c index 76e24b1bdf51..653f42239df9 100644 --- a/module/os/freebsd/zfs/zfs_znode.c +++ b/module/os/freebsd/zfs/zfs_znode.c @@ -384,7 +384,7 @@ zfs_znode_dmu_fini(znode_t *zp) { ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zp->z_zfsvfs, zp->z_id)) || zp->z_unlinked || - RW_WRITE_HELD(&zp->z_zfsvfs->z_teardown_inactive_lock)); + ZFS_TEARDOWN_INACTIVE_WLOCKED(zp->z_zfsvfs)); sa_handle_destroy(zp->z_sa_hdl); zp->z_sa_hdl = NULL; diff --git a/module/os/linux/spl/spl-condvar.c b/module/os/linux/spl/spl-condvar.c index 9d045e3e8a66..49f48664503a 100644 --- a/module/os/linux/spl/spl-condvar.c +++ b/module/os/linux/spl/spl-condvar.c @@ -198,6 +198,18 @@ __cv_wait_sig(kcondvar_t *cvp, kmutex_t *mp) } EXPORT_SYMBOL(__cv_wait_sig); +void +__cv_wait_idle(kcondvar_t *cvp, kmutex_t *mp) +{ + sigset_t blocked, saved; + + sigfillset(&blocked); + (void) sigprocmask(SIG_BLOCK, &blocked, &saved); + cv_wait_common(cvp, mp, TASK_INTERRUPTIBLE, 0); + (void) sigprocmask(SIG_SETMASK, &saved, NULL); +} +EXPORT_SYMBOL(__cv_wait_idle); + #if defined(HAVE_IO_SCHEDULE_TIMEOUT) #define spl_io_schedule_timeout(t) io_schedule_timeout(t) #else @@ -330,6 +342,21 @@ __cv_timedwait_sig(kcondvar_t *cvp, kmutex_t *mp, clock_t exp_time) } EXPORT_SYMBOL(__cv_timedwait_sig); +int +__cv_timedwait_idle(kcondvar_t *cvp, kmutex_t *mp, clock_t exp_time) +{ + sigset_t blocked, saved; + int rc; + + sigfillset(&blocked); + (void) sigprocmask(SIG_BLOCK, &blocked, &saved); + rc = __cv_timedwait_common(cvp, mp, exp_time, + TASK_INTERRUPTIBLE, 0); + (void) sigprocmask(SIG_SETMASK, &saved, NULL); + + return (rc); +} +EXPORT_SYMBOL(__cv_timedwait_idle); /* * 'expire_time' argument is an absolute clock time in nanoseconds. * Return value is time left (expire_time - now) or -1 if timeout occurred. @@ -427,6 +454,23 @@ cv_timedwait_sig_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim, } EXPORT_SYMBOL(cv_timedwait_sig_hires); +int +cv_timedwait_idle_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim, + hrtime_t res, int flag) +{ + sigset_t blocked, saved; + int rc; + + sigfillset(&blocked); + (void) sigprocmask(SIG_BLOCK, &blocked, &saved); + rc = cv_timedwait_hires_common(cvp, mp, tim, res, flag, + TASK_INTERRUPTIBLE); + (void) sigprocmask(SIG_SETMASK, &saved, NULL); + + return (rc); +} +EXPORT_SYMBOL(cv_timedwait_idle_hires); + void __cv_signal(kcondvar_t *cvp) { diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c index 5a2245436c72..85daef43be40 100644 --- a/module/os/linux/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -34,6 +34,7 @@ #include <sys/abd.h> #include <sys/fs/zfs.h> #include <sys/zio.h> +#include <linux/blkpg.h> #include <linux/msdos_fs.h> #include <linux/vfs_compat.h> @@ -175,7 +176,8 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, /* * Reopen the device if it is currently open. When expanding a - * partition force re-scanning the partition table while closed + * partition force re-scanning the partition table if userland + * did not take care of this already. We need to do this while closed * in order to get an accurate updated block device size. Then * since udev may need to recreate the device links increase the * open retry timeout before reporting the device as unavailable. @@ -192,7 +194,23 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, if (bdev) { if (v->vdev_expanding && bdev != bdev->bd_contains) { bdevname(bdev->bd_contains, disk_name + 5); - reread_part = B_TRUE; + /* + * If userland has BLKPG_RESIZE_PARTITION, + * then it should have updated the partition + * table already. We can detect this by + * comparing our current physical size + * with that of the device. If they are + * the same, then we must not have + * BLKPG_RESIZE_PARTITION or it failed to + * update the partition table online. We + * fallback to rescanning the partition + * table from the kernel below. However, + * if the capacity already reflects the + * updated partition, then we skip + * rescanning the partition table here. + */ + if (v->vdev_psize == bdev_capacity(bdev)) + reread_part = B_TRUE; } blkdev_put(bdev, mode | FMODE_EXCL); diff --git a/module/os/linux/zfs/vdev_file.c b/module/os/linux/zfs/vdev_file.c index a4e71ca40788..423ce858144c 100644 --- a/module/os/linux/zfs/vdev_file.c +++ b/module/os/linux/zfs/vdev_file.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2016 by Delphix. All rights reserved. + * Copyright (c) 2011, 2020 by Delphix. All rights reserved. */ #include <sys/zfs_context.h> @@ -45,6 +45,17 @@ static taskq_t *vdev_file_taskq; +/* + * By default, the logical/physical ashift for file vdevs is set to + * SPA_MINBLOCKSHIFT (9). This allows all file vdevs to use 512B (1 << 9) + * blocksizes. Users may opt to change one or both of these for testing + * or performance reasons. Care should be taken as these values will + * impact the vdev_ashift setting which can only be set at vdev creation + * time. + */ +unsigned long vdev_file_logical_ashift = SPA_MINBLOCKSHIFT; +unsigned long vdev_file_physical_ashift = SPA_MINBLOCKSHIFT; + static void vdev_file_hold(vdev_t *vd) { @@ -159,8 +170,8 @@ skip_open: } *max_psize = *psize = zfa.zfa_size; - *logical_ashift = SPA_MINBLOCKSHIFT; - *physical_ashift = SPA_MINBLOCKSHIFT; + *logical_ashift = vdev_file_logical_ashift; + *physical_ashift = vdev_file_physical_ashift; return (0); } @@ -346,3 +357,8 @@ vdev_ops_t vdev_disk_ops = { }; #endif + +ZFS_MODULE_PARAM(zfs_vdev_file, vdev_file_, logical_ashift, ULONG, ZMOD_RW, + "Logical ashift for file-based devices"); +ZFS_MODULE_PARAM(zfs_vdev_file, vdev_file_, physical_ashift, ULONG, ZMOD_RW, + "Physical ashift for file-based devices"); diff --git a/module/os/linux/zfs/zfs_acl.c b/module/os/linux/zfs/zfs_acl.c index 8d79878c0458..11b5559321ad 100644 --- a/module/os/linux/zfs/zfs_acl.c +++ b/module/os/linux/zfs/zfs_acl.c @@ -1153,7 +1153,7 @@ zfs_acl_chown_setattr(znode_t *zp) int error; zfs_acl_t *aclp; - if (ZTOZSB(zp)->z_acl_type == ZFS_ACLTYPE_POSIXACL) + if (ZTOZSB(zp)->z_acl_type == ZFS_ACLTYPE_POSIX) return (0); ASSERT(MUTEX_HELD(&zp->z_lock)); diff --git a/module/os/linux/zfs/zfs_vfsops.c b/module/os/linux/zfs/zfs_vfsops.c index 389200b52127..15ec7b91b001 100644 --- a/module/os/linux/zfs/zfs_vfsops.c +++ b/module/os/linux/zfs/zfs_vfsops.c @@ -356,9 +356,9 @@ acltype_changed_cb(void *arg, uint64_t newval) zfsvfs->z_acl_type = ZFS_ACLTYPE_OFF; zfsvfs->z_sb->s_flags &= ~SB_POSIXACL; break; - case ZFS_ACLTYPE_POSIXACL: + case ZFS_ACLTYPE_POSIX: #ifdef CONFIG_FS_POSIX_ACL - zfsvfs->z_acl_type = ZFS_ACLTYPE_POSIXACL; + zfsvfs->z_acl_type = ZFS_ACLTYPE_POSIX; zfsvfs->z_sb->s_flags |= SB_POSIXACL; #else zfsvfs->z_acl_type = ZFS_ACLTYPE_OFF; diff --git a/module/os/linux/zfs/zpl_super.c b/module/os/linux/zfs/zpl_super.c index 333c647466cc..9db8bda4cc66 100644 --- a/module/os/linux/zfs/zpl_super.c +++ b/module/os/linux/zfs/zpl_super.c @@ -187,10 +187,12 @@ __zpl_show_devname(struct seq_file *seq, zfsvfs_t *zfsvfs) { char *fsname; + ZFS_ENTER(zfsvfs); fsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); dmu_objset_name(zfsvfs->z_os, fsname); seq_puts(seq, fsname); kmem_free(fsname, ZFS_MAX_DATASET_NAME_LEN); + ZFS_EXIT(zfsvfs); return (0); } @@ -209,7 +211,7 @@ __zpl_show_options(struct seq_file *seq, zfsvfs_t *zfsvfs) #ifdef CONFIG_FS_POSIX_ACL switch (zfsvfs->z_acl_type) { - case ZFS_ACLTYPE_POSIXACL: + case ZFS_ACLTYPE_POSIX: seq_puts(seq, ",posixacl"); break; default: @@ -272,8 +274,12 @@ zpl_mount_impl(struct file_system_type *fs_type, int flags, zfs_mnt_t *zm) * a txg sync. If the dsl_pool lock is held over sget() * this can prevent the pool sync and cause a deadlock. */ + dsl_dataset_long_hold(dmu_objset_ds(os), FTAG); dsl_pool_rele(dmu_objset_pool(os), FTAG); + s = sget(fs_type, zpl_test_super, set_anon_super, flags, os); + + dsl_dataset_long_rele(dmu_objset_ds(os), FTAG); dsl_dataset_rele(dmu_objset_ds(os), FTAG); if (IS_ERR(s)) diff --git a/module/os/linux/zfs/zpl_xattr.c b/module/os/linux/zfs/zpl_xattr.c index fa3c036405b0..9b5fd0fd397b 100644 --- a/module/os/linux/zfs/zpl_xattr.c +++ b/module/os/linux/zfs/zpl_xattr.c @@ -1058,7 +1058,7 @@ zpl_init_acl(struct inode *ip, struct inode *dir) struct posix_acl *acl = NULL; int error = 0; - if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIXACL) + if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIX) return (0); if (!S_ISLNK(ip->i_mode)) { @@ -1103,7 +1103,7 @@ zpl_chmod_acl(struct inode *ip) struct posix_acl *acl; int error; - if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIXACL) + if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIX) return (0); if (S_ISLNK(ip->i_mode)) @@ -1129,7 +1129,7 @@ __zpl_xattr_acl_list_access(struct inode *ip, char *list, size_t list_size, char *xattr_name = XATTR_NAME_POSIX_ACL_ACCESS; size_t xattr_size = sizeof (XATTR_NAME_POSIX_ACL_ACCESS); - if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIXACL) + if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIX) return (0); if (list && xattr_size <= list_size) @@ -1146,7 +1146,7 @@ __zpl_xattr_acl_list_default(struct inode *ip, char *list, size_t list_size, char *xattr_name = XATTR_NAME_POSIX_ACL_DEFAULT; size_t xattr_size = sizeof (XATTR_NAME_POSIX_ACL_DEFAULT); - if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIXACL) + if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIX) return (0); if (list && xattr_size <= list_size) @@ -1168,7 +1168,7 @@ __zpl_xattr_acl_get_access(struct inode *ip, const char *name, if (strcmp(name, "") != 0) return (-EINVAL); #endif - if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIXACL) + if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIX) return (-EOPNOTSUPP); acl = zpl_get_acl(ip, type); @@ -1196,7 +1196,7 @@ __zpl_xattr_acl_get_default(struct inode *ip, const char *name, if (strcmp(name, "") != 0) return (-EINVAL); #endif - if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIXACL) + if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIX) return (-EOPNOTSUPP); acl = zpl_get_acl(ip, type); @@ -1224,7 +1224,7 @@ __zpl_xattr_acl_set_access(struct inode *ip, const char *name, if (strcmp(name, "") != 0) return (-EINVAL); #endif - if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIXACL) + if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIX) return (-EOPNOTSUPP); if (!inode_owner_or_capable(ip)) @@ -1264,7 +1264,7 @@ __zpl_xattr_acl_set_default(struct inode *ip, const char *name, if (strcmp(name, "") != 0) return (-EINVAL); #endif - if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIXACL) + if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIX) return (-EOPNOTSUPP); if (!inode_owner_or_capable(ip)) diff --git a/module/zcommon/zfs_prop.c b/module/zcommon/zfs_prop.c index 837b8ae71b34..f3dbbc15d25e 100644 --- a/module/zcommon/zfs_prop.c +++ b/module/zcommon/zfs_prop.c @@ -253,9 +253,10 @@ zfs_prop_init(void) static zprop_index_t acltype_table[] = { { "off", ZFS_ACLTYPE_OFF }, - { "disabled", ZFS_ACLTYPE_OFF }, - { "noacl", ZFS_ACLTYPE_OFF }, - { "posixacl", ZFS_ACLTYPE_POSIXACL }, + { "posix", ZFS_ACLTYPE_POSIX }, + { "disabled", ZFS_ACLTYPE_OFF }, /* bkwrd compatibility */ + { "noacl", ZFS_ACLTYPE_OFF }, /* bkwrd compatibility */ + { "posixacl", ZFS_ACLTYPE_POSIX }, /* bkwrd compatibility */ { NULL } }; @@ -430,7 +431,7 @@ zfs_prop_init(void) #ifndef __FreeBSD__ zprop_register_index(ZFS_PROP_ACLTYPE, "acltype", ZFS_ACLTYPE_OFF, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, - "noacl | posixacl", "ACLTYPE", acltype_table); + "off | posix", "ACLTYPE", acltype_table); #endif zprop_register_index(ZFS_PROP_ACLINHERIT, "aclinherit", ZFS_ACL_RESTRICTED, PROP_INHERIT, ZFS_TYPE_FILESYSTEM, @@ -705,7 +706,7 @@ zfs_prop_init(void) zprop_register_impl(ZFS_PROP_ACLTYPE, "acltype", PROP_TYPE_INDEX, ZFS_ACLTYPE_OFF, NULL, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, - "noacl | posixacl", "ACLTYPE", B_FALSE, B_FALSE, acltype_table); + "off | posix", "ACLTYPE", B_FALSE, B_FALSE, acltype_table); #endif zprop_register_hidden(ZFS_PROP_REMAPTXG, "remaptxg", PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_DATASET, "REMAPTXG"); diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 904c325f37a1..7a499298f75c 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2018, Joyent, Inc. - * Copyright (c) 2011, 2019, Delphix. All rights reserved. + * Copyright (c) 2011, 2020, Delphix. All rights reserved. * Copyright (c) 2014, Saso Kiselkov. All rights reserved. * Copyright (c) 2017, Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2019, loli10K <ezomori.nozomu@gmail.com>. All rights reserved. @@ -895,6 +895,12 @@ static void l2arc_read_done(zio_t *); static void l2arc_do_free_on_write(void); /* + * l2arc_mfuonly : A ZFS module parameter that controls whether only MFU + * metadata and data are cached from ARC into L2ARC. + */ +int l2arc_mfuonly = 0; + +/* * L2ARC TRIM * l2arc_trim_ahead : A ZFS module parameter that controls how much ahead of * the current write size (l2arc_write_max) we should TRIM if we @@ -2188,7 +2194,7 @@ arc_untransform(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb, ret = SET_ERROR(EIO); spa_log_error(spa, zb); (void) zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION, - spa, NULL, zb, NULL, 0, 0); + spa, NULL, zb, NULL, 0); } return (ret); @@ -5654,7 +5660,7 @@ arc_read_done(zio_t *zio) spa_log_error(zio->io_spa, &acb->acb_zb); (void) zfs_ereport_post( FM_EREPORT_ZFS_AUTHENTICATION, - zio->io_spa, NULL, &acb->acb_zb, zio, 0, 0); + zio->io_spa, NULL, &acb->acb_zb, zio, 0); } } @@ -5931,7 +5937,7 @@ top: spa_log_error(spa, zb); (void) zfs_ereport_post( FM_EREPORT_ZFS_AUTHENTICATION, - spa, NULL, zb, NULL, 0, 0); + spa, NULL, zb, NULL, 0); } } if (rc != 0) { @@ -8909,6 +8915,15 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) * Copy buffers for L2ARC writing. */ for (int try = 0; try < L2ARC_FEED_TYPES; try++) { + /* + * If try == 1 or 3, we cache MRU metadata and data + * respectively. + */ + if (l2arc_mfuonly) { + if (try == 1 || try == 3) + continue; + } + multilist_sublist_t *mls = l2arc_sublist_lock(try); uint64_t passed_sz = 0; @@ -9174,7 +9189,7 @@ l2arc_feed_thread(void *unused) cookie = spl_fstrans_mark(); while (l2arc_thread_exit == 0) { CALLB_CPR_SAFE_BEGIN(&cpr); - (void) cv_timedwait_sig(&l2arc_feed_thr_cv, + (void) cv_timedwait_idle(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock, next); CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock); next = ddi_get_lbolt() + hz; @@ -9291,8 +9306,6 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd) ASSERT(!l2arc_vdev_present(vd)); - vdev_ashift_optimize(vd); - /* * Create a new l2arc device entry. */ @@ -10562,6 +10575,9 @@ ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, rebuild_enabled, INT, ZMOD_RW, ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, rebuild_blocks_min_l2size, ULONG, ZMOD_RW, "Min size in bytes to write rebuild log blocks in L2ARC"); +ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, mfuonly, INT, ZMOD_RW, + "Cache only MFU data from ARC into L2ARC"); + ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, lotsfree_percent, param_set_arc_int, param_get_int, ZMOD_RW, "System free memory I/O throttle in bytes"); diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 2de1f4e4c267..7d817320aae4 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -718,7 +718,7 @@ dbuf_evict_thread(void *unused) while (!dbuf_evict_thread_exit) { while (!dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) { CALLB_CPR_SAFE_BEGIN(&cpr); - (void) cv_timedwait_sig_hires(&dbuf_evict_cv, + (void) cv_timedwait_idle_hires(&dbuf_evict_cv, &dbuf_evict_lock, SEC2NSEC(1), MSEC2NSEC(1), 0); CALLB_CPR_SAFE_END(&cpr, &dbuf_evict_lock); } diff --git a/module/zfs/dmu_redact.c b/module/zfs/dmu_redact.c index df10d8d6faae..c53fba75cc51 100644 --- a/module/zfs/dmu_redact.c +++ b/module/zfs/dmu_redact.c @@ -568,8 +568,7 @@ commit_rl_updates(objset_t *os, struct merge_data *md, uint64_t object, uint64_t txg = dmu_tx_get_txg(tx); if (!md->md_synctask_txg[txg & TXG_MASK]) { dsl_sync_task_nowait(dmu_tx_pool(tx), - redaction_list_update_sync, md, 5, ZFS_SPACE_CHECK_NONE, - tx); + redaction_list_update_sync, md, tx); md->md_synctask_txg[txg & TXG_MASK] = B_TRUE; md->md_latest_synctask_txg = txg; } @@ -1007,10 +1006,14 @@ dmu_redact_snap(const char *snapname, nvlist_t *redactnvl, objset_t *os; struct redact_thread_arg *args = NULL; redaction_list_t *new_rl = NULL; + char *newredactbook; if ((err = dsl_pool_hold(snapname, FTAG, &dp)) != 0) return (err); + newredactbook = kmem_zalloc(sizeof (char) * ZFS_MAX_DATASET_NAME_LEN, + KM_SLEEP); + if ((err = dsl_dataset_hold_flags(dp, snapname, DS_HOLD_FLAG_DECRYPT, FTAG, &ds)) != 0) { goto out; @@ -1064,7 +1067,6 @@ dmu_redact_snap(const char *snapname, nvlist_t *redactnvl, goto out; boolean_t resuming = B_FALSE; - char newredactbook[ZFS_MAX_DATASET_NAME_LEN]; zfs_bookmark_phys_t bookmark; (void) strlcpy(newredactbook, snapname, ZFS_MAX_DATASET_NAME_LEN); @@ -1074,6 +1076,10 @@ dmu_redact_snap(const char *snapname, nvlist_t *redactnvl, "#%s", redactbook); if (n >= ZFS_MAX_DATASET_NAME_LEN - (c - newredactbook)) { dsl_pool_rele(dp, FTAG); + kmem_free(newredactbook, + sizeof (char) * ZFS_MAX_DATASET_NAME_LEN); + if (args != NULL) + kmem_free(args, numsnaps * sizeof (*args)); return (SET_ERROR(ENAMETOOLONG)); } err = dsl_bookmark_lookup(dp, newredactbook, NULL, &bookmark); @@ -1146,16 +1152,23 @@ dmu_redact_snap(const char *snapname, nvlist_t *redactnvl, (void) thread_create(NULL, 0, redact_traverse_thread, rta, 0, curproc, TS_RUN, minclsyspri); } - struct redact_merge_thread_arg rmta = { { {0} } }; - (void) bqueue_init(&rmta.q, zfs_redact_queue_ff, + + struct redact_merge_thread_arg *rmta; + rmta = kmem_zalloc(sizeof (struct redact_merge_thread_arg), KM_SLEEP); + + (void) bqueue_init(&rmta->q, zfs_redact_queue_ff, zfs_redact_queue_length, offsetof(struct redact_record, ln)); - rmta.numsnaps = numsnaps; - rmta.spa = os->os_spa; - rmta.thr_args = args; - (void) thread_create(NULL, 0, redact_merge_thread, &rmta, 0, curproc, + rmta->numsnaps = numsnaps; + rmta->spa = os->os_spa; + rmta->thr_args = args; + (void) thread_create(NULL, 0, redact_merge_thread, rmta, 0, curproc, TS_RUN, minclsyspri); - err = perform_redaction(os, new_rl, &rmta); + err = perform_redaction(os, new_rl, rmta); + kmem_free(rmta, sizeof (struct redact_merge_thread_arg)); + out: + kmem_free(newredactbook, sizeof (char) * ZFS_MAX_DATASET_NAME_LEN); + if (new_rl != NULL) { dsl_redaction_list_long_rele(new_rl, FTAG); dsl_redaction_list_rele(new_rl, FTAG); diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c index 00536f2774e7..30d20bfefa12 100644 --- a/module/zfs/dnode.c +++ b/module/zfs/dnode.c @@ -1197,7 +1197,7 @@ dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object, dnode_t *dn; zrl_init(&dnh->dnh_zrlock); - zrl_tryenter(&dnh->dnh_zrlock); + VERIFY3U(1, ==, zrl_tryenter(&dnh->dnh_zrlock)); dn = dnode_create(os, dnp, NULL, object, dnh); DNODE_VERIFY(dn); @@ -1949,18 +1949,20 @@ static void dnode_dirty_l1range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid, dmu_tx_t *tx) { - dmu_buf_impl_t db_search; + dmu_buf_impl_t *db_search; dmu_buf_impl_t *db; avl_index_t where; + db_search = kmem_zalloc(sizeof (dmu_buf_impl_t), KM_SLEEP); + mutex_enter(&dn->dn_dbufs_mtx); - db_search.db_level = 1; - db_search.db_blkid = start_blkid + 1; - db_search.db_state = DB_SEARCH; + db_search->db_level = 1; + db_search->db_blkid = start_blkid + 1; + db_search->db_state = DB_SEARCH; for (;;) { - db = avl_find(&dn->dn_dbufs, &db_search, &where); + db = avl_find(&dn->dn_dbufs, db_search, &where); if (db == NULL) db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER); @@ -1972,7 +1974,7 @@ dnode_dirty_l1range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid, /* * Setup the next blkid we want to search for. */ - db_search.db_blkid = db->db_blkid + 1; + db_search->db_blkid = db->db_blkid + 1; ASSERT3U(db->db_blkid, >=, start_blkid); /* @@ -1992,10 +1994,10 @@ dnode_dirty_l1range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid, /* * Walk all the in-core level-1 dbufs and verify they have been dirtied. */ - db_search.db_level = 1; - db_search.db_blkid = start_blkid + 1; - db_search.db_state = DB_SEARCH; - db = avl_find(&dn->dn_dbufs, &db_search, &where); + db_search->db_level = 1; + db_search->db_blkid = start_blkid + 1; + db_search->db_state = DB_SEARCH; + db = avl_find(&dn->dn_dbufs, db_search, &where); if (db == NULL) db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER); for (; db != NULL; db = AVL_NEXT(&dn->dn_dbufs, db)) { @@ -2005,6 +2007,7 @@ dnode_dirty_l1range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid, ASSERT(db->db_dirtycnt > 0); } #endif + kmem_free(db_search, sizeof (dmu_buf_impl_t)); mutex_exit(&dn->dn_dbufs_mtx); } diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index 712af664e90f..0ebda2f77074 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -23,6 +23,7 @@ * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright 2016 Gary Mills * Copyright (c) 2017, 2019, Datto Inc. All rights reserved. + * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved. * Copyright 2019 Joyent, Inc. */ diff --git a/module/zfs/dsl_synctask.c b/module/zfs/dsl_synctask.c index 2d6ca8549eb9..148e8fff2437 100644 --- a/module/zfs/dsl_synctask.c +++ b/module/zfs/dsl_synctask.c @@ -170,15 +170,13 @@ dsl_sync_task_sig(const char *pool, dsl_checkfunc_t *checkfunc, static void dsl_sync_task_nowait_common(dsl_pool_t *dp, dsl_syncfunc_t *syncfunc, void *arg, - int blocks_modified, zfs_space_check_t space_check, dmu_tx_t *tx, - boolean_t early) + dmu_tx_t *tx, boolean_t early) { dsl_sync_task_t *dst = kmem_zalloc(sizeof (*dst), KM_SLEEP); dst->dst_pool = dp; dst->dst_txg = dmu_tx_get_txg(tx); - dst->dst_space = blocks_modified << DST_AVG_BLKSHIFT; - dst->dst_space_check = space_check; + dst->dst_space_check = ZFS_SPACE_CHECK_NONE; dst->dst_checkfunc = dsl_null_checkfunc; dst->dst_syncfunc = syncfunc; dst->dst_arg = arg; @@ -192,18 +190,16 @@ dsl_sync_task_nowait_common(dsl_pool_t *dp, dsl_syncfunc_t *syncfunc, void *arg, void dsl_sync_task_nowait(dsl_pool_t *dp, dsl_syncfunc_t *syncfunc, void *arg, - int blocks_modified, zfs_space_check_t space_check, dmu_tx_t *tx) + dmu_tx_t *tx) { - dsl_sync_task_nowait_common(dp, syncfunc, arg, - blocks_modified, space_check, tx, B_FALSE); + dsl_sync_task_nowait_common(dp, syncfunc, arg, tx, B_FALSE); } void dsl_early_sync_task_nowait(dsl_pool_t *dp, dsl_syncfunc_t *syncfunc, void *arg, - int blocks_modified, zfs_space_check_t space_check, dmu_tx_t *tx) + dmu_tx_t *tx) { - dsl_sync_task_nowait_common(dp, syncfunc, arg, - blocks_modified, space_check, tx, B_TRUE); + dsl_sync_task_nowait_common(dp, syncfunc, arg, tx, B_TRUE); } /* diff --git a/module/zfs/fm.c b/module/zfs/fm.c index c00e08b8d02a..a5003f85d621 100644 --- a/module/zfs/fm.c +++ b/module/zfs/fm.c @@ -104,13 +104,15 @@ struct erpt_kstat { kstat_named_t erpt_set_failed; /* num erpt set failures */ kstat_named_t fmri_set_failed; /* num fmri set failures */ kstat_named_t payload_set_failed; /* num payload set failures */ + kstat_named_t erpt_duplicates; /* num duplicate erpts */ }; static struct erpt_kstat erpt_kstat_data = { { "erpt-dropped", KSTAT_DATA_UINT64 }, { "erpt-set-failed", KSTAT_DATA_UINT64 }, { "fmri-set-failed", KSTAT_DATA_UINT64 }, - { "payload-set-failed", KSTAT_DATA_UINT64 } + { "payload-set-failed", KSTAT_DATA_UINT64 }, + { "erpt-duplicates", KSTAT_DATA_UINT64 } }; kstat_t *fm_ksp; @@ -568,6 +570,12 @@ out: return (error); } +void +zfs_zevent_track_duplicate(void) +{ + atomic_inc_64(&erpt_kstat_data.erpt_duplicates.value.ui64); +} + static int zfs_zevent_minor_to_state(minor_t minor, zfs_zevent_t **ze) { @@ -1633,6 +1641,8 @@ fm_init(void) list_create(&zevent_list, sizeof (zevent_t), offsetof(zevent_t, ev_node)); cv_init(&zevent_cv, NULL, CV_DEFAULT, NULL); + + zfs_ereport_init(); } void @@ -1640,6 +1650,8 @@ fm_fini(void) { int count; + zfs_ereport_fini(); + zfs_zevent_drain_all(&count); mutex_enter(&zevent_lock); diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index ccc247d1557a..133005b227e5 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -22,6 +22,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2019 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. + * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2017, Intel Corporation. */ diff --git a/module/zfs/mmp.c b/module/zfs/mmp.c index 4170d7e03ebd..99852521b6d1 100644 --- a/module/zfs/mmp.c +++ b/module/zfs/mmp.c @@ -198,14 +198,6 @@ mmp_init(spa_t *spa) cv_init(&mmp->mmp_thread_cv, NULL, CV_DEFAULT, NULL); mutex_init(&mmp->mmp_io_lock, NULL, MUTEX_DEFAULT, NULL); mmp->mmp_kstat_id = 1; - - /* - * mmp_write_done() calculates mmp_delay based on prior mmp_delay and - * the elapsed time since the last write. For the first mmp write, - * there is no "last write", so we start with fake non-zero values. - */ - mmp->mmp_last_write = gethrtime(); - mmp->mmp_delay = MSEC2NSEC(MMP_INTERVAL_OK(zfs_multihost_interval)); } void @@ -557,6 +549,18 @@ mmp_thread(void *arg) mmp_thread_enter(mmp, &cpr); + /* + * There have been no MMP writes yet. Setting mmp_last_write here gives + * us one mmp_fail_ns period, which is consistent with the activity + * check duration, to try to land an MMP write before MMP suspends the + * pool (if so configured). + */ + + mutex_enter(&mmp->mmp_io_lock); + mmp->mmp_last_write = gethrtime(); + mmp->mmp_delay = MSEC2NSEC(MMP_INTERVAL_OK(zfs_multihost_interval)); + mutex_exit(&mmp->mmp_io_lock); + while (!mmp->mmp_thread_exiting) { hrtime_t next_time = gethrtime() + MSEC2NSEC(MMP_DEFAULT_INTERVAL); @@ -671,7 +675,7 @@ mmp_thread(void *arg) } CALLB_CPR_SAFE_BEGIN(&cpr); - (void) cv_timedwait_sig_hires(&mmp->mmp_thread_cv, + (void) cv_timedwait_idle_hires(&mmp->mmp_thread_cv, &mmp->mmp_thread_lock, next_time, USEC2NSEC(100), CALLOUT_FLAG_ABSOLUTE); CALLB_CPR_SAFE_END(&cpr, &mmp->mmp_thread_lock); diff --git a/module/zfs/range_tree.c b/module/zfs/range_tree.c index 2c0e4b860a04..2ce0139c9137 100644 --- a/module/zfs/range_tree.c +++ b/module/zfs/range_tree.c @@ -24,6 +24,7 @@ */ /* * Copyright (c) 2013, 2019 by Delphix. All rights reserved. + * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved. */ #include <sys/zfs_context.h> diff --git a/module/zfs/spa.c b/module/zfs/spa.c index aac469f44b59..532f04b91ca1 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2019 by Delphix. All rights reserved. + * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright (c) 2018, Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. @@ -1000,13 +1000,25 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q) /* * The write issue taskq can be extremely CPU * intensive. Run it at slightly less important - * priority than the other taskqs. Under Linux this - * means incrementing the priority value on platforms - * like illumos it should be decremented. + * priority than the other taskqs. + * + * Under Linux and FreeBSD this means incrementing + * the priority value as opposed to platforms like + * illumos where it should be decremented. + * + * On FreeBSD, if priorities divided by four (RQ_PPQ) + * are equal then a difference between them is + * insignificant. */ - if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE) + if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE) { +#if defined(__linux__) pri++; - +#elif defined(__FreeBSD__) + pri += 4; +#else +#error "unknown OS" +#endif + } tq = taskq_create_proc(name, value, pri, 50, INT_MAX, spa->spa_proc, flags); } @@ -2485,11 +2497,12 @@ spa_livelist_delete_cb(void *arg, zthr_t *z) VERIFY0(dsl_get_next_livelist_obj(mos, zap_obj, &ll_obj)); VERIFY0(zap_count(mos, ll_obj, &count)); if (count > 0) { - dsl_deadlist_t ll = { 0 }; + dsl_deadlist_t *ll; dsl_deadlist_entry_t *dle; bplist_t to_free; - dsl_deadlist_open(&ll, mos, ll_obj); - dle = dsl_deadlist_first(&ll); + ll = kmem_zalloc(sizeof (dsl_deadlist_t), KM_SLEEP); + dsl_deadlist_open(ll, mos, ll_obj); + dle = dsl_deadlist_first(ll); ASSERT3P(dle, !=, NULL); bplist_create(&to_free); int err = dsl_process_sub_livelist(&dle->dle_bpobj, &to_free, @@ -2497,7 +2510,7 @@ spa_livelist_delete_cb(void *arg, zthr_t *z) if (err == 0) { sublist_delete_arg_t sync_arg = { .spa = spa, - .ll = &ll, + .ll = ll, .key = dle->dle_mintxg, .to_free = &to_free }; @@ -2512,7 +2525,8 @@ spa_livelist_delete_cb(void *arg, zthr_t *z) } bplist_clear(&to_free); bplist_destroy(&to_free); - dsl_deadlist_close(&ll); + dsl_deadlist_close(ll); + kmem_free(ll, sizeof (dsl_deadlist_t)); } else { livelist_delete_arg_t sync_arg = { .spa = spa, @@ -2688,8 +2702,7 @@ spa_livelist_condense_cb(void *arg, zthr_t *t) lca->first_size = first_size; lca->next_size = next_size; dsl_sync_task_nowait(spa_get_dsl(spa), - spa_livelist_condense_sync, lca, 0, - ZFS_SPACE_CHECK_NONE, tx); + spa_livelist_condense_sync, lca, tx); dmu_tx_commit(tx); return; } @@ -2869,7 +2882,7 @@ spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type) } if (error != EBADF) { (void) zfs_ereport_post(ereport, spa, - NULL, NULL, NULL, 0, 0); + NULL, NULL, NULL, 0); } } spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE; @@ -5749,7 +5762,6 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, for (int c = 0; error == 0 && c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; - vdev_ashift_optimize(vd); vdev_metaslab_set_size(vd); vdev_expand(vd, txg); } diff --git a/module/zfs/spa_config.c b/module/zfs/spa_config.c index b98b7badbae1..dacba127dcfa 100644 --- a/module/zfs/spa_config.c +++ b/module/zfs/spa_config.c @@ -22,7 +22,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright 2017 Joyent, Inc. */ @@ -316,7 +316,7 @@ spa_write_cachefile(spa_t *target, boolean_t removing, boolean_t postsysevent) if (target->spa_ccw_fail_time == 0) { (void) zfs_ereport_post( FM_EREPORT_ZFS_CONFIG_CACHE_WRITE, - target, NULL, NULL, NULL, 0, 0); + target, NULL, NULL, NULL, 0); } target->spa_ccw_fail_time = gethrtime(); spa_async_request(target, SPA_ASYNC_CONFIG_UPDATE); @@ -577,10 +577,8 @@ spa_config_update(spa_t *spa, int what) (tvd->vdev_islog && tvd->vdev_removing)) continue; - if (tvd->vdev_ms_array == 0) { - vdev_ashift_optimize(tvd); + if (tvd->vdev_ms_array == 0) vdev_metaslab_set_size(tvd); - } vdev_expand(tvd, txg); } } diff --git a/module/zfs/spa_history.c b/module/zfs/spa_history.c index f47adb94d55b..2ab58815400a 100644 --- a/module/zfs/spa_history.c +++ b/module/zfs/spa_history.c @@ -397,8 +397,7 @@ spa_history_log_nvl(spa_t *spa, nvlist_t *nvl) fnvlist_add_uint64(nvarg, ZPOOL_HIST_WHO, crgetruid(CRED())); /* Kick this off asynchronously; errors are ignored. */ - dsl_sync_task_nowait(spa_get_dsl(spa), spa_history_log_sync, - nvarg, 0, ZFS_SPACE_CHECK_NONE, tx); + dsl_sync_task_nowait(spa_get_dsl(spa), spa_history_log_sync, nvarg, tx); dmu_tx_commit(tx); /* spa_history_log_sync will free nvl */ @@ -532,7 +531,7 @@ log_internal(nvlist_t *nvl, const char *operation, spa_t *spa, spa_history_log_sync(nvl, tx); } else { dsl_sync_task_nowait(spa_get_dsl(spa), - spa_history_log_sync, nvl, 0, ZFS_SPACE_CHECK_NONE, tx); + spa_history_log_sync, nvl, tx); } /* spa_history_log_sync() will free nvl */ } diff --git a/module/zfs/txg.c b/module/zfs/txg.c index a5f2b041737b..65375b579da6 100644 --- a/module/zfs/txg.c +++ b/module/zfs/txg.c @@ -242,16 +242,11 @@ txg_thread_wait(tx_state_t *tx, callb_cpr_t *cpr, kcondvar_t *cv, clock_t time) { CALLB_CPR_SAFE_BEGIN(cpr); - /* - * cv_wait_sig() is used instead of cv_wait() in order to prevent - * this process from incorrectly contributing to the system load - * average when idle. - */ if (time) { - (void) cv_timedwait_sig(cv, &tx->tx_sync_lock, + (void) cv_timedwait_idle(cv, &tx->tx_sync_lock, ddi_get_lbolt() + time); } else { - cv_wait_sig(cv, &tx->tx_sync_lock); + cv_wait_idle(cv, &tx->tx_sync_lock); } CALLB_CPR_SAFE_END(cpr, &tx->tx_sync_lock); @@ -760,7 +755,8 @@ txg_wait_open(dsl_pool_t *dp, uint64_t txg, boolean_t should_quiesce) if (should_quiesce == B_TRUE) { cv_wait_io(&tx->tx_quiesce_done_cv, &tx->tx_sync_lock); } else { - cv_wait_sig(&tx->tx_quiesce_done_cv, &tx->tx_sync_lock); + cv_wait_idle(&tx->tx_quiesce_done_cv, + &tx->tx_sync_lock); } } mutex_exit(&tx->tx_sync_lock); diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 95a2f5947db1..a94101485c94 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -1481,7 +1481,7 @@ vdev_probe_done(zio_t *zio) ASSERT(zio->io_error != 0); vdev_dbgmsg(vd, "failed probe"); (void) zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE, - spa, vd, NULL, NULL, 0, 0); + spa, vd, NULL, NULL, 0); zio->io_error = SET_ERROR(ENXIO); } @@ -1673,6 +1673,38 @@ vdev_set_deflate_ratio(vdev_t *vd) } /* + * Maximize performance by inflating the configured ashift for top level + * vdevs to be as close to the physical ashift as possible while maintaining + * administrator defined limits and ensuring it doesn't go below the + * logical ashift. + */ +static void +vdev_ashift_optimize(vdev_t *vd) +{ + ASSERT(vd == vd->vdev_top); + + if (vd->vdev_ashift < vd->vdev_physical_ashift) { + vd->vdev_ashift = MIN( + MAX(zfs_vdev_max_auto_ashift, vd->vdev_ashift), + MAX(zfs_vdev_min_auto_ashift, + vd->vdev_physical_ashift)); + } else { + /* + * If the logical and physical ashifts are the same, then + * we ensure that the top-level vdev's ashift is not smaller + * than our minimum ashift value. For the unusual case + * where logical ashift > physical ashift, we can't cap + * the calculated ashift based on max ashift as that + * would cause failures. + * We still check if we need to increase it to match + * the min ashift. + */ + vd->vdev_ashift = MAX(zfs_vdev_min_auto_ashift, + vd->vdev_ashift); + } +} + +/* * Prepare a virtual device for access. */ int @@ -1830,16 +1862,17 @@ vdev_open(vdev_t *vd) return (SET_ERROR(EINVAL)); } + /* + * We can always set the logical/physical ashift members since + * their values are only used to calculate the vdev_ashift when + * the device is first added to the config. These values should + * not be used for anything else since they may change whenever + * the device is reopened and we don't store them in the label. + */ vd->vdev_physical_ashift = MAX(physical_ashift, vd->vdev_physical_ashift); - vd->vdev_logical_ashift = MAX(logical_ashift, vd->vdev_logical_ashift); - vd->vdev_ashift = MAX(vd->vdev_logical_ashift, vd->vdev_ashift); - - if (vd->vdev_logical_ashift > ASHIFT_MAX) { - vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_ASHIFT_TOO_BIG); - return (SET_ERROR(EDOM)); - } + vd->vdev_logical_ashift = MAX(logical_ashift, + vd->vdev_logical_ashift); if (vd->vdev_asize == 0) { /* @@ -1848,6 +1881,24 @@ vdev_open(vdev_t *vd) */ vd->vdev_asize = asize; vd->vdev_max_asize = max_asize; + + /* + * If the vdev_ashift was not overriden at creation time, + * then set it the logical ashift and optimize the ashift. + */ + if (vd->vdev_ashift == 0) { + vd->vdev_ashift = vd->vdev_logical_ashift; + + if (vd->vdev_logical_ashift > ASHIFT_MAX) { + vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_ASHIFT_TOO_BIG); + return (SET_ERROR(EDOM)); + } + + if (vd->vdev_top == vd) { + vdev_ashift_optimize(vd); + } + } if (vd->vdev_ashift != 0 && (vd->vdev_ashift < ASHIFT_MIN || vd->vdev_ashift > ASHIFT_MAX)) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, @@ -1862,11 +1913,10 @@ vdev_open(vdev_t *vd) vd->vdev_ops->vdev_op_leaf) { (void) zfs_ereport_post( FM_EREPORT_ZFS_DEVICE_BAD_ASHIFT, - spa, vd, NULL, NULL, 0, 0); + spa, vd, NULL, NULL, 0); vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_BAD_LABEL); return (SET_ERROR(EDOM)); - } vd->vdev_max_asize = max_asize; } @@ -2445,35 +2495,6 @@ vdev_metaslab_set_size(vdev_t *vd) ASSERT3U(vd->vdev_ms_shift, >=, SPA_MAXBLOCKSHIFT); } -/* - * Maximize performance by inflating the configured ashift for top level - * vdevs to be as close to the physical ashift as possible while maintaining - * administrator defined limits and ensuring it doesn't go below the - * logical ashift. - */ -void -vdev_ashift_optimize(vdev_t *vd) -{ - if (vd == vd->vdev_top) { - if (vd->vdev_ashift < vd->vdev_physical_ashift) { - vd->vdev_ashift = MIN( - MAX(zfs_vdev_max_auto_ashift, vd->vdev_ashift), - MAX(zfs_vdev_min_auto_ashift, - vd->vdev_physical_ashift)); - } else { - /* - * Unusual case where logical ashift > physical ashift - * so we can't cap the calculated ashift based on max - * ashift as that would cause failures. - * We still check if we need to increase it to match - * the min ashift. - */ - vd->vdev_ashift = MAX(zfs_vdev_min_auto_ashift, - vd->vdev_ashift); - } - } -} - void vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg) { @@ -4759,7 +4780,7 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) } (void) zfs_ereport_post(class, spa, vd, NULL, NULL, - save_state, 0); + save_state); } /* Erase any notion of persistent removed state */ diff --git a/module/zfs/vdev_indirect.c b/module/zfs/vdev_indirect.c index 6bc2d917d59c..12ee393bd5db 100644 --- a/module/zfs/vdev_indirect.c +++ b/module/zfs/vdev_indirect.c @@ -16,7 +16,7 @@ /* * Copyright (c) 2014, 2017 by Delphix. All rights reserved. * Copyright (c) 2019, loli10K <ezomori.nozomu@gmail.com>. All rights reserved. - * Copyright (c) 2014, 2019 by Delphix. All rights reserved. + * Copyright (c) 2014, 2020 by Delphix. All rights reserved. */ #include <sys/zfs_context.h> @@ -576,8 +576,7 @@ spa_condense_indirect_commit_entry(spa_t *spa, */ if (list_is_empty(&sci->sci_new_mapping_entries[txgoff])) { dsl_sync_task_nowait(dmu_tx_pool(tx), - spa_condense_indirect_commit_sync, sci, - 0, ZFS_SPACE_CHECK_NONE, tx); + spa_condense_indirect_commit_sync, sci, tx); } vdev_indirect_mapping_entry_t *vime = @@ -1474,13 +1473,14 @@ vdev_indirect_all_checksum_errors(zio_t *zio) vdev_t *vd = ic->ic_vdev; - mutex_enter(&vd->vdev_stat_lock); - vd->vdev_stat.vs_checksum_errors++; - mutex_exit(&vd->vdev_stat_lock); - - (void) zfs_ereport_post_checksum(zio->io_spa, vd, + int ret = zfs_ereport_post_checksum(zio->io_spa, vd, NULL, zio, is->is_target_offset, is->is_size, NULL, NULL, NULL); + if (ret != EALREADY) { + mutex_enter(&vd->vdev_stat_lock); + vd->vdev_stat.vs_checksum_errors++; + mutex_exit(&vd->vdev_stat_lock); + } } } } diff --git a/module/zfs/vdev_initialize.c b/module/zfs/vdev_initialize.c index ab711441d9ca..7ff7fffcc80e 100644 --- a/module/zfs/vdev_initialize.c +++ b/module/zfs/vdev_initialize.c @@ -126,7 +126,7 @@ vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state) dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); dsl_sync_task_nowait(spa_get_dsl(spa), vdev_initialize_zap_update_sync, - guid, 2, ZFS_SPACE_CHECK_NONE, tx); + guid, tx); switch (new_state) { case VDEV_INITIALIZE_ACTIVE: @@ -216,8 +216,7 @@ vdev_initialize_write(vdev_t *vd, uint64_t start, uint64_t size, abd_t *data) /* This is the first write of this txg. */ dsl_sync_task_nowait(spa_get_dsl(spa), - vdev_initialize_zap_update_sync, guid, 2, - ZFS_SPACE_CHECK_RESERVED, tx); + vdev_initialize_zap_update_sync, guid, tx); } /* diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index 8c7468255565..7fab7d0d7950 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -149,6 +149,8 @@ #include <sys/dsl_scan.h> #include <sys/abd.h> #include <sys/fs/zfs.h> +#include <sys/byteorder.h> +#include <sys/zfs_bootenv.h> /* * Basic routines to read and write from a vdev label. @@ -1233,13 +1235,9 @@ vdev_label_read_bootenv_impl(zio_t *zio, vdev_t *vd, int flags) * bootloader should have rewritten them all to be the same on boot, * and any changes we made since boot have been the same across all * labels. - * - * While grub supports writing to all four labels, other bootloaders - * don't, so we only use the first two labels to store boot - * information. */ if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) { - for (int l = 0; l < VDEV_LABELS / 2; l++) { + for (int l = 0; l < VDEV_LABELS; l++) { vdev_label_read(zio, vd, l, abd_alloc_linear(VDEV_PAD_SIZE, B_FALSE), offsetof(vdev_label_t, vl_be), VDEV_PAD_SIZE, @@ -1249,14 +1247,15 @@ vdev_label_read_bootenv_impl(zio_t *zio, vdev_t *vd, int flags) } int -vdev_label_read_bootenv(vdev_t *rvd, nvlist_t *command) +vdev_label_read_bootenv(vdev_t *rvd, nvlist_t *bootenv) { + nvlist_t *config; spa_t *spa = rvd->vdev_spa; abd_t *abd = NULL; int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_TRYHARD; - ASSERT(command); + ASSERT(bootenv); ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); zio_t *zio = zio_root(spa, NULL, &abd, flags); @@ -1264,39 +1263,81 @@ vdev_label_read_bootenv(vdev_t *rvd, nvlist_t *command) int err = zio_wait(zio); if (abd != NULL) { + char *buf; vdev_boot_envblock_t *vbe = abd_to_buf(abd); - if (vbe->vbe_version != VB_RAW) { - abd_free(abd); - return (SET_ERROR(ENOTSUP)); + + vbe->vbe_version = ntohll(vbe->vbe_version); + switch (vbe->vbe_version) { + case VB_RAW: + /* + * if we have textual data in vbe_bootenv, create nvlist + * with key "envmap". + */ + fnvlist_add_uint64(bootenv, BOOTENV_VERSION, VB_RAW); + vbe->vbe_bootenv[sizeof (vbe->vbe_bootenv) - 1] = '\0'; + fnvlist_add_string(bootenv, GRUB_ENVMAP, + vbe->vbe_bootenv); + break; + + case VB_NVLIST: + err = nvlist_unpack(vbe->vbe_bootenv, + sizeof (vbe->vbe_bootenv), &config, 0); + if (err == 0) { + fnvlist_merge(bootenv, config); + nvlist_free(config); + break; + } + /* FALLTHROUGH */ + default: + /* Check for FreeBSD zfs bootonce command string */ + buf = abd_to_buf(abd); + if (*buf == '\0') { + fnvlist_add_uint64(bootenv, BOOTENV_VERSION, + VB_NVLIST); + break; + } + fnvlist_add_string(bootenv, FREEBSD_BOOTONCE, buf); } - vbe->vbe_bootenv[sizeof (vbe->vbe_bootenv) - 1] = '\0'; - fnvlist_add_string(command, "envmap", vbe->vbe_bootenv); - /* abd was allocated in vdev_label_read_bootenv_impl() */ + + /* + * abd was allocated in vdev_label_read_bootenv_impl() + */ abd_free(abd); - /* If we managed to read any successfully, return success. */ + /* + * If we managed to read any successfully, + * return success. + */ return (0); } return (err); } int -vdev_label_write_bootenv(vdev_t *vd, char *envmap) +vdev_label_write_bootenv(vdev_t *vd, nvlist_t *env) { zio_t *zio; spa_t *spa = vd->vdev_spa; vdev_boot_envblock_t *bootenv; int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL; - int error = ENXIO; + int error; + size_t nvsize; + char *nvbuf; + + error = nvlist_size(env, &nvsize, NV_ENCODE_XDR); + if (error != 0) + return (SET_ERROR(error)); - if (strlen(envmap) >= sizeof (bootenv->vbe_bootenv)) { + if (nvsize >= sizeof (bootenv->vbe_bootenv)) { return (SET_ERROR(E2BIG)); } ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); + error = ENXIO; for (int c = 0; c < vd->vdev_children; c++) { - int child_err = vdev_label_write_bootenv(vd->vdev_child[c], - envmap); + int child_err; + + child_err = vdev_label_write_bootenv(vd->vdev_child[c], env); /* * As long as any of the disks managed to write all of their * labels successfully, return success. @@ -1312,16 +1353,41 @@ vdev_label_write_bootenv(vdev_t *vd, char *envmap) ASSERT3U(sizeof (*bootenv), ==, VDEV_PAD_SIZE); abd_t *abd = abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE); abd_zero(abd, VDEV_PAD_SIZE); + bootenv = abd_borrow_buf_copy(abd, VDEV_PAD_SIZE); + nvbuf = bootenv->vbe_bootenv; + nvsize = sizeof (bootenv->vbe_bootenv); + + bootenv->vbe_version = fnvlist_lookup_uint64(env, BOOTENV_VERSION); + switch (bootenv->vbe_version) { + case VB_RAW: + if (nvlist_lookup_string(env, GRUB_ENVMAP, &nvbuf) == 0) { + (void) strlcpy(bootenv->vbe_bootenv, nvbuf, nvsize); + } + error = 0; + break; - char *buf = bootenv->vbe_bootenv; - (void) strlcpy(buf, envmap, sizeof (bootenv->vbe_bootenv)); - bootenv->vbe_version = VB_RAW; - abd_return_buf_copy(abd, bootenv, VDEV_PAD_SIZE); + case VB_NVLIST: + error = nvlist_pack(env, &nvbuf, &nvsize, NV_ENCODE_XDR, + KM_SLEEP); + break; + + default: + error = EINVAL; + break; + } + + if (error == 0) { + bootenv->vbe_version = htonll(bootenv->vbe_version); + abd_return_buf_copy(abd, bootenv, VDEV_PAD_SIZE); + } else { + abd_free(abd); + return (SET_ERROR(error)); + } retry: zio = zio_root(spa, NULL, NULL, flags); - for (int l = 0; l < VDEV_LABELS / 2; l++) { + for (int l = 0; l < VDEV_LABELS; l++) { vdev_label_write(zio, vd, l, abd, offsetof(vdev_label_t, vl_be), VDEV_PAD_SIZE, NULL, NULL, flags); diff --git a/module/zfs/vdev_mirror.c b/module/zfs/vdev_mirror.c index 5e1060f127c9..71b5adbbd06a 100644 --- a/module/zfs/vdev_mirror.c +++ b/module/zfs/vdev_mirror.c @@ -391,7 +391,7 @@ vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1; *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift); *physical_ashift = MAX(*physical_ashift, - vd->vdev_physical_ashift); + cvd->vdev_physical_ashift); } if (numerrors == vd->vdev_children) { diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index 4320078b6f7c..47312e02f70a 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2019 by Delphix. All rights reserved. + * Copyright (c) 2012, 2020 by Delphix. All rights reserved. * Copyright (c) 2016 Gvozden Nešković. All rights reserved. */ @@ -1790,16 +1790,17 @@ raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data) zio_bad_cksum_t zbc; raidz_map_t *rm = zio->io_vsd; - mutex_enter(&vd->vdev_stat_lock); - vd->vdev_stat.vs_checksum_errors++; - mutex_exit(&vd->vdev_stat_lock); - zbc.zbc_has_cksum = 0; zbc.zbc_injected = rm->rm_ecksuminjected; - (void) zfs_ereport_post_checksum(zio->io_spa, vd, + int ret = zfs_ereport_post_checksum(zio->io_spa, vd, &zio->io_bookmark, zio, rc->rc_offset, rc->rc_size, rc->rc_abd, bad_data, &zbc); + if (ret != EALREADY) { + mutex_enter(&vd->vdev_stat_lock); + vd->vdev_stat.vs_checksum_errors++; + mutex_exit(&vd->vdev_stat_lock); + } } } @@ -2279,21 +2280,21 @@ vdev_raidz_io_done(zio_t *zio) vdev_t *cvd; rc = &rm->rm_col[c]; cvd = vd->vdev_child[rc->rc_devidx]; - if (rc->rc_error == 0) { - zio_bad_cksum_t zbc; - zbc.zbc_has_cksum = 0; - zbc.zbc_injected = - rm->rm_ecksuminjected; + if (rc->rc_error != 0) + continue; + zio_bad_cksum_t zbc; + zbc.zbc_has_cksum = 0; + zbc.zbc_injected = rm->rm_ecksuminjected; + + int ret = zfs_ereport_start_checksum( + zio->io_spa, cvd, &zio->io_bookmark, zio, + rc->rc_offset, rc->rc_size, + (void *)(uintptr_t)c, &zbc); + if (ret != EALREADY) { mutex_enter(&cvd->vdev_stat_lock); cvd->vdev_stat.vs_checksum_errors++; mutex_exit(&cvd->vdev_stat_lock); - - zfs_ereport_start_checksum( - zio->io_spa, cvd, - &zio->io_bookmark, zio, - rc->rc_offset, rc->rc_size, - (void *)(uintptr_t)c, &zbc); } } } diff --git a/module/zfs/vdev_rebuild.c b/module/zfs/vdev_rebuild.c index 85ed8afe1cf4..3362d608c037 100644 --- a/module/zfs/vdev_rebuild.c +++ b/module/zfs/vdev_rebuild.c @@ -267,7 +267,7 @@ vdev_rebuild_initiate(vdev_t *vd) vd->vdev_rebuilding = B_TRUE; dsl_sync_task_nowait(spa_get_dsl(spa), vdev_rebuild_initiate_sync, - (void *)(uintptr_t)vd->vdev_id, 0, ZFS_SPACE_CHECK_NONE, tx); + (void *)(uintptr_t)vd->vdev_id, tx); dmu_tx_commit(tx); vdev_rebuild_log_notify(spa, vd, ESC_ZFS_RESILVER_START); @@ -553,8 +553,7 @@ vdev_rebuild_range(vdev_rebuild_t *vr, uint64_t start, uint64_t size) vr->vr_scan_offset[txg & TXG_MASK] = start; dsl_sync_task_nowait(spa_get_dsl(spa), vdev_rebuild_update_sync, - (void *)(uintptr_t)vd->vdev_id, 2, - ZFS_SPACE_CHECK_RESERVED, tx); + (void *)(uintptr_t)vd->vdev_id, tx); } /* When exiting write out our progress. */ @@ -875,16 +874,14 @@ vdev_rebuild_thread(void *arg) * by a pool checkpoint. See the dsl_scan_done() comments. */ dsl_sync_task_nowait(dp, vdev_rebuild_complete_sync, - (void *)(uintptr_t)vd->vdev_id, 0, - ZFS_SPACE_CHECK_NONE, tx); + (void *)(uintptr_t)vd->vdev_id, tx); } else if (vd->vdev_rebuild_cancel_wanted) { /* * The rebuild operation was canceled. This will occur when * a device participating in the rebuild is detached. */ dsl_sync_task_nowait(dp, vdev_rebuild_cancel_sync, - (void *)(uintptr_t)vd->vdev_id, 0, - ZFS_SPACE_CHECK_NONE, tx); + (void *)(uintptr_t)vd->vdev_id, tx); } else if (vd->vdev_rebuild_reset_wanted) { /* * Reset the running rebuild without canceling and restarting @@ -892,8 +889,7 @@ vdev_rebuild_thread(void *arg) * participate in the rebuild. */ dsl_sync_task_nowait(dp, vdev_rebuild_reset_sync, - (void *)(uintptr_t)vd->vdev_id, 0, - ZFS_SPACE_CHECK_NONE, tx); + (void *)(uintptr_t)vd->vdev_id, tx); } else { /* * The rebuild operation should be suspended. This may occur diff --git a/module/zfs/vdev_removal.c b/module/zfs/vdev_removal.c index 56e420871f61..fdeca7ab3418 100644 --- a/module/zfs/vdev_removal.c +++ b/module/zfs/vdev_removal.c @@ -1167,8 +1167,8 @@ vdev_remove_replace_with_indirect(vdev_t *vd, uint64_t txg) /* After this, we can not use svr. */ tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); - dsl_sync_task_nowait(spa->spa_dsl_pool, vdev_remove_complete_sync, svr, - 0, ZFS_SPACE_CHECK_NONE, tx); + dsl_sync_task_nowait(spa->spa_dsl_pool, + vdev_remove_complete_sync, svr, tx); dmu_tx_commit(tx); } @@ -1317,7 +1317,7 @@ spa_vdev_copy_impl(vdev_t *vd, spa_vdev_removal_t *svr, vdev_copy_arg_t *vca, if (svr->svr_max_offset_to_sync[txg & TXG_MASK] == 0) { dsl_sync_task_nowait(dmu_tx_pool(tx), vdev_mapping_sync, - svr, 0, ZFS_SPACE_CHECK_NONE, tx); + svr, tx); } svr->svr_max_offset_to_sync[txg & TXG_MASK] = range_tree_max(segs); @@ -2143,8 +2143,7 @@ spa_vdev_remove_top(vdev_t *vd, uint64_t *txg) vdev_config_dirty(vd); dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, *txg); dsl_sync_task_nowait(spa->spa_dsl_pool, - vdev_remove_initiate_sync, - (void *)(uintptr_t)vd->vdev_id, 0, ZFS_SPACE_CHECK_NONE, tx); + vdev_remove_initiate_sync, (void *)(uintptr_t)vd->vdev_id, tx); dmu_tx_commit(tx); return (0); diff --git a/module/zfs/vdev_trim.c b/module/zfs/vdev_trim.c index 3f8c34806020..02b42ddd5a6c 100644 --- a/module/zfs/vdev_trim.c +++ b/module/zfs/vdev_trim.c @@ -317,7 +317,7 @@ vdev_trim_change_state(vdev_t *vd, vdev_trim_state_t new_state, dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); dsl_sync_task_nowait(spa_get_dsl(spa), vdev_trim_zap_update_sync, - guid, 2, ZFS_SPACE_CHECK_NONE, tx); + guid, tx); switch (new_state) { case VDEV_TRIM_ACTIVE: @@ -481,7 +481,7 @@ vdev_trim_range(trim_args_t *ta, uint64_t start, uint64_t size) if (ta->trim_type == TRIM_TYPE_MANUAL) { while (vd->vdev_trim_rate != 0 && !vdev_trim_should_stop(vd) && vdev_trim_calculate_rate(ta) > vd->vdev_trim_rate) { - cv_timedwait_sig(&vd->vdev_trim_io_cv, + cv_timedwait_idle(&vd->vdev_trim_io_cv, &vd->vdev_trim_io_lock, ddi_get_lbolt() + MSEC_TO_TICK(10)); } @@ -510,8 +510,7 @@ vdev_trim_range(trim_args_t *ta, uint64_t start, uint64_t size) /* This is the first write of this txg. */ dsl_sync_task_nowait(spa_get_dsl(spa), - vdev_trim_zap_update_sync, guid, 2, - ZFS_SPACE_CHECK_RESERVED, tx); + vdev_trim_zap_update_sync, guid, tx); } /* diff --git a/module/zfs/zfs_fm.c b/module/zfs/zfs_fm.c index ad13ccedfc06..a8341f50ba09 100644 --- a/module/zfs/zfs_fm.c +++ b/module/zfs/zfs_fm.c @@ -24,7 +24,7 @@ */ /* - * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2012,2020 by Delphix. All rights reserved. */ #include <sys/spa.h> @@ -101,7 +101,251 @@ * good and bad versions of the buffer (if available), and we annotate the * ereport with information about the differences. */ + #ifdef _KERNEL +/* + * Duplicate ereport Detection + * + * Some ereports are retained momentarily for detecting duplicates. These + * are kept in a recent_events_node_t in both a time-ordered list and an AVL + * tree of recent unique ereports. + * + * The lifespan of these recent ereports is bounded (15 mins) and a cleaner + * task is used to purge stale entries. + */ +static list_t recent_events_list; +static avl_tree_t recent_events_tree; +static kmutex_t recent_events_lock; +static taskqid_t recent_events_cleaner_tqid; + +/* + * Each node is about 128 bytes so 2,000 would consume 1/4 MiB. + * + * This setting can be changed dynamically and setting it to zero + * disables duplicate detection. + */ +unsigned int zfs_zevent_retain_max = 2000; + +/* + * The lifespan for a recent ereport entry. The default of 15 minutes is + * intended to outlive the zfs diagnosis engine's threshold of 10 errors + * over a period of 10 minutes. + */ +unsigned int zfs_zevent_retain_expire_secs = 900; + +typedef enum zfs_subclass { + ZSC_IO, + ZSC_DATA, + ZSC_CHECKSUM +} zfs_subclass_t; + +typedef struct { + /* common criteria */ + uint64_t re_pool_guid; + uint64_t re_vdev_guid; + int re_io_error; + uint64_t re_io_size; + uint64_t re_io_offset; + zfs_subclass_t re_subclass; + zio_priority_t re_io_priority; + + /* logical zio criteria (optional) */ + zbookmark_phys_t re_io_bookmark; + + /* internal state */ + avl_node_t re_tree_link; + list_node_t re_list_link; + uint64_t re_timestamp; +} recent_events_node_t; + +static int +recent_events_compare(const void *a, const void *b) +{ + const recent_events_node_t *node1 = a; + const recent_events_node_t *node2 = b; + int cmp; + + /* + * The comparison order here is somewhat arbitrary. + * What's important is that if every criteria matches, then it + * is a duplicate (i.e. compare returns 0) + */ + if ((cmp = TREE_CMP(node1->re_subclass, node2->re_subclass)) != 0) + return (cmp); + if ((cmp = TREE_CMP(node1->re_pool_guid, node2->re_pool_guid)) != 0) + return (cmp); + if ((cmp = TREE_CMP(node1->re_vdev_guid, node2->re_vdev_guid)) != 0) + return (cmp); + if ((cmp = TREE_CMP(node1->re_io_error, node2->re_io_error)) != 0) + return (cmp); + if ((cmp = TREE_CMP(node1->re_io_priority, node2->re_io_priority)) != 0) + return (cmp); + if ((cmp = TREE_CMP(node1->re_io_size, node2->re_io_size)) != 0) + return (cmp); + if ((cmp = TREE_CMP(node1->re_io_offset, node2->re_io_offset)) != 0) + return (cmp); + + const zbookmark_phys_t *zb1 = &node1->re_io_bookmark; + const zbookmark_phys_t *zb2 = &node2->re_io_bookmark; + + if ((cmp = TREE_CMP(zb1->zb_objset, zb2->zb_objset)) != 0) + return (cmp); + if ((cmp = TREE_CMP(zb1->zb_object, zb2->zb_object)) != 0) + return (cmp); + if ((cmp = TREE_CMP(zb1->zb_level, zb2->zb_level)) != 0) + return (cmp); + if ((cmp = TREE_CMP(zb1->zb_blkid, zb2->zb_blkid)) != 0) + return (cmp); + + return (0); +} + +static void zfs_ereport_schedule_cleaner(void); + +/* + * background task to clean stale recent event nodes. + */ +/*ARGSUSED*/ +static void +zfs_ereport_cleaner(void *arg) +{ + recent_events_node_t *entry; + uint64_t now = gethrtime(); + + /* + * purge expired entries + */ + mutex_enter(&recent_events_lock); + while ((entry = list_tail(&recent_events_list)) != NULL) { + uint64_t age = NSEC2SEC(now - entry->re_timestamp); + if (age <= zfs_zevent_retain_expire_secs) + break; + + /* remove expired node */ + avl_remove(&recent_events_tree, entry); + list_remove(&recent_events_list, entry); + kmem_free(entry, sizeof (*entry)); + } + + /* Restart the cleaner if more entries remain */ + recent_events_cleaner_tqid = 0; + if (!list_is_empty(&recent_events_list)) + zfs_ereport_schedule_cleaner(); + + mutex_exit(&recent_events_lock); +} + +static void +zfs_ereport_schedule_cleaner(void) +{ + ASSERT(MUTEX_HELD(&recent_events_lock)); + + uint64_t timeout = SEC2NSEC(zfs_zevent_retain_expire_secs + 1); + + recent_events_cleaner_tqid = taskq_dispatch_delay( + system_delay_taskq, zfs_ereport_cleaner, NULL, TQ_SLEEP, + ddi_get_lbolt() + NSEC_TO_TICK(timeout)); +} + +/* + * Check if an ereport would be a duplicate of one recently posted. + * + * An ereport is considered a duplicate if the set of criteria in + * recent_events_node_t all match. + * + * Only FM_EREPORT_ZFS_IO, FM_EREPORT_ZFS_DATA, and FM_EREPORT_ZFS_CHECKSUM + * are candidates for duplicate checking. + */ +static boolean_t +zfs_ereport_is_duplicate(const char *subclass, spa_t *spa, vdev_t *vd, + const zbookmark_phys_t *zb, zio_t *zio, uint64_t offset, uint64_t size) +{ + recent_events_node_t search = {0}, *entry; + + if (vd == NULL || zio == NULL) + return (B_FALSE); + + if (zfs_zevent_retain_max == 0) + return (B_FALSE); + + if (strcmp(subclass, FM_EREPORT_ZFS_IO) == 0) + search.re_subclass = ZSC_IO; + else if (strcmp(subclass, FM_EREPORT_ZFS_DATA) == 0) + search.re_subclass = ZSC_DATA; + else if (strcmp(subclass, FM_EREPORT_ZFS_CHECKSUM) == 0) + search.re_subclass = ZSC_CHECKSUM; + else + return (B_FALSE); + + search.re_pool_guid = spa_guid(spa); + search.re_vdev_guid = vd->vdev_guid; + search.re_io_error = zio->io_error; + search.re_io_priority = zio->io_priority; + /* if size is supplied use it over what's in zio */ + if (size) { + search.re_io_size = size; + search.re_io_offset = offset; + } else { + search.re_io_size = zio->io_size; + search.re_io_offset = zio->io_offset; + } + + /* grab optional logical zio criteria */ + if (zb != NULL) { + search.re_io_bookmark.zb_objset = zb->zb_objset; + search.re_io_bookmark.zb_object = zb->zb_object; + search.re_io_bookmark.zb_level = zb->zb_level; + search.re_io_bookmark.zb_blkid = zb->zb_blkid; + } + + uint64_t now = gethrtime(); + + mutex_enter(&recent_events_lock); + + /* check if we have seen this one recently */ + entry = avl_find(&recent_events_tree, &search, NULL); + if (entry != NULL) { + uint64_t age = NSEC2SEC(now - entry->re_timestamp); + + /* + * There is still an active cleaner (since we're here). + * Reset the last seen time for this duplicate entry + * so that its lifespand gets extended. + */ + list_remove(&recent_events_list, entry); + list_insert_head(&recent_events_list, entry); + entry->re_timestamp = now; + + zfs_zevent_track_duplicate(); + mutex_exit(&recent_events_lock); + + return (age <= zfs_zevent_retain_expire_secs); + } + + if (avl_numnodes(&recent_events_tree) >= zfs_zevent_retain_max) { + /* recycle oldest node */ + entry = list_tail(&recent_events_list); + ASSERT(entry != NULL); + list_remove(&recent_events_list, entry); + avl_remove(&recent_events_tree, entry); + } else { + entry = kmem_alloc(sizeof (recent_events_node_t), KM_SLEEP); + } + + /* record this as a recent ereport */ + *entry = search; + avl_add(&recent_events_tree, entry); + list_insert_head(&recent_events_list, entry); + entry->re_timestamp = now; + + /* Start a cleaner if not already scheduled */ + if (recent_events_cleaner_tqid == 0) + zfs_ereport_schedule_cleaner(); + + mutex_exit(&recent_events_lock); + return (B_FALSE); +} + void zfs_zevent_post_cb(nvlist_t *nvl, nvlist_t *detector) { @@ -153,9 +397,6 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out, uint64_t ena; char class[64]; - if (!zfs_ereport_is_valid(subclass, spa, vd, zio)) - return (B_FALSE); - if ((ereport = fm_nvlist_create(NULL)) == NULL) return (B_FALSE); @@ -336,6 +577,8 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out, DATA_TYPE_UINT64, zio->io_timestamp, NULL); fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_DELTA, DATA_TYPE_UINT64, zio->io_delta, NULL); + fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_PRIORITY, + DATA_TYPE_UINT32, zio->io_priority, NULL); /* * If the 'size' parameter is non-zero, it indicates this is a @@ -788,24 +1031,34 @@ zfs_ereport_is_valid(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio) } /* - * Return 0 if event was posted, EINVAL if there was a problem posting it or - * EBUSY if the event was rate limited. + * Post an ereport for the given subclass + * + * Returns + * - 0 if an event was posted + * - EINVAL if there was a problem posting event + * - EBUSY if the event was rate limited + * - EALREADY if the event was already posted (duplicate) */ int zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, - const zbookmark_phys_t *zb, zio_t *zio, uint64_t stateoroffset, - uint64_t size) + const zbookmark_phys_t *zb, zio_t *zio, uint64_t state) { int rc = 0; #ifdef _KERNEL nvlist_t *ereport = NULL; nvlist_t *detector = NULL; + if (!zfs_ereport_is_valid(subclass, spa, vd, zio)) + return (EINVAL); + + if (zfs_ereport_is_duplicate(subclass, spa, vd, zb, zio, 0, 0)) + return (SET_ERROR(EALREADY)); + if (zfs_is_ratelimiting_event(subclass, vd)) return (SET_ERROR(EBUSY)); if (!zfs_ereport_start(&ereport, &detector, subclass, spa, vd, - zb, zio, stateoroffset, size)) + zb, zio, state, 0)) return (SET_ERROR(EINVAL)); /* couldn't post event */ if (ereport == NULL) @@ -817,7 +1070,16 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, return (rc); } -void +/* + * Prepare a checksum ereport + * + * Returns + * - 0 if an event was posted + * - EINVAL if there was a problem posting event + * - EBUSY if the event was rate limited + * - EALREADY if the event was already posted (duplicate) + */ +int zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, struct zio *zio, uint64_t offset, uint64_t length, void *arg, zio_bad_cksum_t *info) @@ -825,8 +1087,15 @@ zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, zio_cksum_report_t *report; #ifdef _KERNEL + if (!zfs_ereport_is_valid(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio)) + return (SET_ERROR(EINVAL)); + + if (zfs_ereport_is_duplicate(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zb, zio, + offset, length)) + return (SET_ERROR(EALREADY)); + if (zfs_is_ratelimiting_event(FM_EREPORT_ZFS_CHECKSUM, vd)) - return; + return (SET_ERROR(EBUSY)); #endif report = kmem_zalloc(sizeof (*report), KM_SLEEP); @@ -851,7 +1120,7 @@ zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, if (report->zcr_ereport == NULL) { zfs_ereport_free_checksum(report); - return; + return (0); } #endif @@ -859,6 +1128,7 @@ zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, report->zcr_next = zio->io_logical->io_cksum_report; zio->io_logical->io_cksum_report = report; mutex_exit(&spa->spa_errlist_lock); + return (0); } void @@ -901,7 +1171,15 @@ zfs_ereport_free_checksum(zio_cksum_report_t *rpt) kmem_free(rpt, sizeof (*rpt)); } - +/* + * Post a checksum ereport + * + * Returns + * - 0 if an event was posted + * - EINVAL if there was a problem posting event + * - EBUSY if the event was rate limited + * - EALREADY if the event was already posted (duplicate) + */ int zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, struct zio *zio, uint64_t offset, uint64_t length, @@ -913,8 +1191,15 @@ zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, nvlist_t *detector = NULL; zfs_ecksum_info_t *info; + if (!zfs_ereport_is_valid(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio)) + return (SET_ERROR(EINVAL)); + + if (zfs_ereport_is_duplicate(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zb, zio, + offset, length)) + return (SET_ERROR(EALREADY)); + if (zfs_is_ratelimiting_event(FM_EREPORT_ZFS_CHECKSUM, vd)) - return (EBUSY); + return (SET_ERROR(EBUSY)); if (!zfs_ereport_start(&ereport, &detector, FM_EREPORT_ZFS_CHECKSUM, spa, vd, zb, zio, offset, length) || (ereport == NULL)) { @@ -1073,11 +1358,57 @@ zfs_post_state_change(spa_t *spa, vdev_t *vd, uint64_t laststate) #endif } -#if defined(_KERNEL) +#ifdef _KERNEL +void +zfs_ereport_init(void) +{ + mutex_init(&recent_events_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&recent_events_list, sizeof (recent_events_node_t), + offsetof(recent_events_node_t, re_list_link)); + avl_create(&recent_events_tree, recent_events_compare, + sizeof (recent_events_node_t), offsetof(recent_events_node_t, + re_tree_link)); +} + +/* + * This 'early' fini needs to run before zfs_fini() which on Linux waits + * for the system_delay_taskq to drain. + */ +void +zfs_ereport_taskq_fini(void) +{ + mutex_enter(&recent_events_lock); + if (recent_events_cleaner_tqid != 0) { + taskq_cancel_id(system_delay_taskq, recent_events_cleaner_tqid); + recent_events_cleaner_tqid = 0; + } + mutex_exit(&recent_events_lock); +} + +void +zfs_ereport_fini(void) +{ + recent_events_node_t *entry; + + while ((entry = list_head(&recent_events_list)) != NULL) { + avl_remove(&recent_events_tree, entry); + list_remove(&recent_events_list, entry); + kmem_free(entry, sizeof (*entry)); + } + avl_destroy(&recent_events_tree); + list_destroy(&recent_events_list); + mutex_destroy(&recent_events_lock); +} + EXPORT_SYMBOL(zfs_ereport_post); EXPORT_SYMBOL(zfs_ereport_is_valid); EXPORT_SYMBOL(zfs_ereport_post_checksum); EXPORT_SYMBOL(zfs_post_remove); EXPORT_SYMBOL(zfs_post_autoreplace); EXPORT_SYMBOL(zfs_post_state_change); + +ZFS_MODULE_PARAM(zfs_zevent, zfs_zevent_, retain_max, UINT, ZMOD_RW, + "Maximum recent zevents records to retain for duplicate checking"); +ZFS_MODULE_PARAM(zfs_zevent, zfs_zevent_, retain_expire_secs, UINT, ZMOD_RW, + "Expiration time for recent zevents records"); #endif /* _KERNEL */ diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index 495ff4707d77..eff66b32fcb1 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -3511,30 +3511,29 @@ zfs_ioc_log_history(const char *unused, nvlist_t *innvl, nvlist_t *outnvl) /* * This ioctl is used to set the bootenv configuration on the current * pool. This configuration is stored in the second padding area of the label, - * and it is used by the GRUB bootloader used on Linux to store the contents - * of the grubenv file. The file is stored as raw ASCII, and is protected by - * an embedded checksum. By default, GRUB will check if the boot filesystem - * supports storing the environment data in a special location, and if so, - * will invoke filesystem specific logic to retrieve it. This can be overridden - * by a variable, should the user so desire. + * and it is used by the bootloader(s) to store the bootloader and/or system + * specific data. + * The data is stored as nvlist data stream, and is protected by + * an embedded checksum. + * The version can have two possible values: + * VB_RAW: nvlist should have key GRUB_ENVMAP, value DATA_TYPE_STRING. + * VB_NVLIST: nvlist with arbitrary <key, value> pairs. */ -/* ARGSUSED */ static const zfs_ioc_key_t zfs_keys_set_bootenv[] = { - {"envmap", DATA_TYPE_STRING, 0}, + {"version", DATA_TYPE_UINT64, 0}, + {"<keys>", DATA_TYPE_ANY, ZK_OPTIONAL | ZK_WILDCARDLIST}, }; static int zfs_ioc_set_bootenv(const char *name, nvlist_t *innvl, nvlist_t *outnvl) { - char *envmap; int error; spa_t *spa; - envmap = fnvlist_lookup_string(innvl, "envmap"); if ((error = spa_open(name, &spa, FTAG)) != 0) return (error); spa_vdev_state_enter(spa, SCL_ALL); - error = vdev_label_write_bootenv(spa->spa_root_vdev, envmap); + error = vdev_label_write_bootenv(spa->spa_root_vdev, innvl); (void) spa_vdev_state_exit(spa, NULL, 0); spa_close(spa, FTAG); return (error); @@ -3544,7 +3543,6 @@ static const zfs_ioc_key_t zfs_keys_get_bootenv[] = { /* no nvl keys */ }; -/* ARGSUSED */ static int zfs_ioc_get_bootenv(const char *name, nvlist_t *innvl, nvlist_t *outnvl) { @@ -7615,6 +7613,7 @@ zfs_kmod_fini(void) kmem_free(zs, sizeof (zfsdev_state_t)); } + zfs_ereport_taskq_fini(); /* run before zfs_fini() on Linux */ zfs_fini(); spa_fini(); zvol_fini(); diff --git a/module/zfs/zio.c b/module/zfs/zio.c index f956a9ef7621..8a8fbccd7d63 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2019 by Delphix. All rights reserved. + * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019, Klara Inc. @@ -547,7 +547,7 @@ error: if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) { spa_log_error(spa, &zio->io_bookmark); (void) zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION, - spa, NULL, &zio->io_bookmark, zio, 0, 0); + spa, NULL, &zio->io_bookmark, zio, 0); } } else { zio->io_error = ret; @@ -2004,7 +2004,7 @@ zio_deadman_impl(zio_t *pio, int ziodepth) zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid, pio->io_offset, pio->io_size, pio->io_error); (void) zfs_ereport_post(FM_EREPORT_ZFS_DEADMAN, - pio->io_spa, vd, zb, pio, 0, 0); + pio->io_spa, vd, zb, pio, 0); if (failmode == ZIO_FAILURE_MODE_CONTINUE && taskq_empty_ent(&pio->io_tqent)) { @@ -2331,7 +2331,7 @@ zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t reason) "failure and has been suspended.\n", spa_name(spa)); (void) zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, - NULL, NULL, 0, 0); + NULL, NULL, 0); mutex_enter(&spa->spa_suspend_lock); @@ -4217,13 +4217,15 @@ zio_checksum_verify(zio_t *zio) zio->io_error = error; if (error == ECKSUM && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { - mutex_enter(&zio->io_vd->vdev_stat_lock); - zio->io_vd->vdev_stat.vs_checksum_errors++; - mutex_exit(&zio->io_vd->vdev_stat_lock); - - zfs_ereport_start_checksum(zio->io_spa, + int ret = zfs_ereport_start_checksum(zio->io_spa, zio->io_vd, &zio->io_bookmark, zio, zio->io_offset, zio->io_size, NULL, &info); + + if (ret != EALREADY) { + mutex_enter(&zio->io_vd->vdev_stat_lock); + zio->io_vd->vdev_stat.vs_checksum_errors++; + mutex_exit(&zio->io_vd->vdev_stat_lock); + } } } @@ -4543,7 +4545,7 @@ zio_done(zio_t *zio) (void) zfs_ereport_post(FM_EREPORT_ZFS_DELAY, zio->io_spa, zio->io_vd, &zio->io_bookmark, - zio, 0, 0); + zio, 0); } } } @@ -4557,16 +4559,16 @@ zio_done(zio_t *zio) */ if (zio->io_error != ECKSUM && zio->io_vd != NULL && !vdev_is_dead(zio->io_vd)) { - mutex_enter(&zio->io_vd->vdev_stat_lock); - if (zio->io_type == ZIO_TYPE_READ) { - zio->io_vd->vdev_stat.vs_read_errors++; - } else if (zio->io_type == ZIO_TYPE_WRITE) { - zio->io_vd->vdev_stat.vs_write_errors++; + int ret = zfs_ereport_post(FM_EREPORT_ZFS_IO, + zio->io_spa, zio->io_vd, &zio->io_bookmark, zio, 0); + if (ret != EALREADY) { + mutex_enter(&zio->io_vd->vdev_stat_lock); + if (zio->io_type == ZIO_TYPE_READ) + zio->io_vd->vdev_stat.vs_read_errors++; + else if (zio->io_type == ZIO_TYPE_WRITE) + zio->io_vd->vdev_stat.vs_write_errors++; + mutex_exit(&zio->io_vd->vdev_stat_lock); } - mutex_exit(&zio->io_vd->vdev_stat_lock); - - (void) zfs_ereport_post(FM_EREPORT_ZFS_IO, zio->io_spa, - zio->io_vd, &zio->io_bookmark, zio, 0, 0); } if ((zio->io_error == EIO || !(zio->io_flags & @@ -4578,7 +4580,7 @@ zio_done(zio_t *zio) */ spa_log_error(zio->io_spa, &zio->io_bookmark); (void) zfs_ereport_post(FM_EREPORT_ZFS_DATA, - zio->io_spa, NULL, &zio->io_bookmark, zio, 0, 0); + zio->io_spa, NULL, &zio->io_bookmark, zio, 0); } } diff --git a/module/zfs/zthr.c b/module/zfs/zthr.c index fdc4b863382c..5ac2e30467e3 100644 --- a/module/zfs/zthr.c +++ b/module/zfs/zthr.c @@ -56,7 +56,7 @@ * * == ZTHR creation * - * Every zthr needs three inputs to start running: + * Every zthr needs four inputs to start running: * * 1] A user-defined checker function (checkfunc) that decides whether * the zthr should start working or go to sleep. The function should @@ -72,6 +72,9 @@ * 3] A void args pointer that will be passed to checkfunc and func * implicitly by the infrastructure. * + * 4] A name for the thread. This string must be valid for the lifetime + * of the zthr. + * * The reason why the above API needs two different functions, * instead of one that both checks and does the work, has to do with * the zthr's internal state lock (zthr_state_lock) and the allowed @@ -221,6 +224,7 @@ struct zthr { zthr_checkfunc_t *zthr_checkfunc; zthr_func_t *zthr_func; void *zthr_arg; + const char *zthr_name; }; static void @@ -237,15 +241,10 @@ zthr_procedure(void *arg) t->zthr_func(t->zthr_arg, t); mutex_enter(&t->zthr_state_lock); } else { - /* - * cv_wait_sig() is used instead of cv_wait() in - * order to prevent this process from incorrectly - * contributing to the system load average when idle. - */ if (t->zthr_sleep_timeout == 0) { - cv_wait_sig(&t->zthr_cv, &t->zthr_state_lock); + cv_wait_idle(&t->zthr_cv, &t->zthr_state_lock); } else { - (void) cv_timedwait_sig_hires(&t->zthr_cv, + (void) cv_timedwait_idle_hires(&t->zthr_cv, &t->zthr_state_lock, t->zthr_sleep_timeout, MSEC2NSEC(1), 0); } @@ -296,6 +295,7 @@ zthr_create_timer(const char *zthr_name, zthr_checkfunc_t *checkfunc, t->zthr_func = func; t->zthr_arg = arg; t->zthr_sleep_timeout = max_sleep; + t->zthr_name = zthr_name; t->zthr_thread = thread_create_named(zthr_name, NULL, 0, zthr_procedure, t, 0, &p0, TS_RUN, minclsyspri); @@ -422,8 +422,8 @@ zthr_resume(zthr_t *t) * no-op. */ if (t->zthr_thread == NULL) { - t->zthr_thread = thread_create(NULL, 0, zthr_procedure, t, - 0, &p0, TS_RUN, minclsyspri); + t->zthr_thread = thread_create_named(t->zthr_name, NULL, 0, + zthr_procedure, t, 0, &p0, TS_RUN, minclsyspri); } mutex_exit(&t->zthr_state_lock); |