Diffstat (limited to 'sys/contrib/openzfs/module')
-rw-r--r-- | sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c | 23
-rw-r--r-- | sys/contrib/openzfs/module/zcommon/zfs_prop.c | 2
-rw-r--r-- | sys/contrib/openzfs/module/zfs/dnode.c | 65
-rw-r--r-- | sys/contrib/openzfs/module/zfs/mmp.c | 2
-rw-r--r-- | sys/contrib/openzfs/module/zfs/range_tree.c | 2
-rw-r--r-- | sys/contrib/openzfs/module/zfs/spa_config.c | 2
-rw-r--r-- | sys/contrib/openzfs/module/zfs/spa_misc.c | 23
-rw-r--r-- | sys/contrib/openzfs/module/zfs/vdev.c | 15
-rw-r--r-- | sys/contrib/openzfs/module/zfs/vdev_label.c | 2
-rw-r--r-- | sys/contrib/openzfs/module/zfs/zio.c | 35
-rw-r--r-- | sys/contrib/openzfs/module/zfs/zio_inject.c | 38
11 files changed, 181 insertions, 28 deletions
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c
index bac166fcd89e..967a018640e1 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c
@@ -484,7 +484,28 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,
 	fstrans_cookie_t cookie = spl_fstrans_mark();
 	uint64_t offset = io_offset(bio, rq);
 	uint64_t size = io_size(bio, rq);
-	int rw = io_data_dir(bio, rq);
+	int rw;
+
+	if (rq != NULL) {
+		/*
+		 * Flush & trim requests go down the zvol_write codepath.  Or
+		 * more specifically:
+		 *
+		 * If request is a write, or if it's op_is_sync() and not a
+		 * read, or if it's a flush, or if it's a discard, then send the
+		 * request down the write path.
+		 */
+		if (op_is_write(rq->cmd_flags) ||
+		    (op_is_sync(rq->cmd_flags) && req_op(rq) != REQ_OP_READ) ||
+		    req_op(rq) == REQ_OP_FLUSH ||
+		    op_is_discard(rq->cmd_flags)) {
+			rw = WRITE;
+		} else {
+			rw = READ;
+		}
+	} else {
+		rw = bio_data_dir(bio);
+	}
 
 	if (unlikely(zv->zv_flags & ZVOL_REMOVING)) {
 		zvol_end_io(bio, rq, SET_ERROR(ENXIO));
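
Note on the zvol_os.c hunk: the new classification routes anything that must reach stable storage (writes, synchronous non-reads, flushes, and discards) to the zvol write path. Below is a minimal standalone model of that predicate; the struct and the flag helpers are simplified stand-ins for the Linux block layer's, not the real kernel definitions.

    /*
     * Standalone model of the zvol request classification above.
     * op_is_write() and the request fields are illustrative stand-ins
     * for the Linux block layer's, not the kernel definitions.
     */
    #include <stdio.h>
    #include <stdbool.h>

    enum req_op { REQ_OP_READ, REQ_OP_WRITE, REQ_OP_FLUSH, REQ_OP_DISCARD };

    struct request { enum req_op op; bool sync; };

    static bool op_is_write(const struct request *rq)
    {
    	return (rq->op == REQ_OP_WRITE || rq->op == REQ_OP_DISCARD);
    }

    static int classify(const struct request *rq)
    {
    	/* Writes, sync non-reads, flushes, and discards: write path. */
    	if (op_is_write(rq) ||
    	    (rq->sync && rq->op != REQ_OP_READ) ||
    	    rq->op == REQ_OP_FLUSH ||
    	    rq->op == REQ_OP_DISCARD)
    		return (1);	/* WRITE */
    	return (0);		/* READ */
    }

    int main(void)
    {
    	struct request flush = { REQ_OP_FLUSH, true };
    	struct request read = { REQ_OP_READ, false };
    	printf("flush -> %s\n", classify(&flush) ? "WRITE" : "READ");
    	printf("read  -> %s\n", classify(&read) ? "WRITE" : "READ");
    	return (0);
    }
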
diff --git a/sys/contrib/openzfs/module/zcommon/zfs_prop.c b/sys/contrib/openzfs/module/zcommon/zfs_prop.c
index 864e3898b365..9190ae0362ea 100644
--- a/sys/contrib/openzfs/module/zcommon/zfs_prop.c
+++ b/sys/contrib/openzfs/module/zcommon/zfs_prop.c
@@ -364,8 +364,8 @@ zfs_prop_init(void)
 
 	static const zprop_index_t xattr_table[] = {
 		{ "off",	ZFS_XATTR_OFF },
-		{ "on",		ZFS_XATTR_SA },
 		{ "sa",		ZFS_XATTR_SA },
+		{ "on",		ZFS_XATTR_SA },
 		{ "dir",	ZFS_XATTR_DIR },
 		{ NULL }
 	};
diff --git a/sys/contrib/openzfs/module/zfs/dnode.c b/sys/contrib/openzfs/module/zfs/dnode.c
index 6c150d31c669..e88d394b5229 100644
--- a/sys/contrib/openzfs/module/zfs/dnode.c
+++ b/sys/contrib/openzfs/module/zfs/dnode.c
@@ -2656,6 +2656,32 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
 }
 
 /*
+ * Adjust *offset to the next (or previous) block byte offset at lvl.
+ * Returns FALSE if *offset would overflow or underflow.
+ */
+static boolean_t
+dnode_next_block(dnode_t *dn, int flags, uint64_t *offset, int lvl)
+{
+	int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+	int span = lvl * epbs + dn->dn_datablkshift;
+	uint64_t blkid, maxblkid;
+
+	if (span >= 8 * sizeof (uint64_t))
+		return (B_FALSE);
+
+	blkid = *offset >> span;
+	maxblkid = 1ULL << (8 * sizeof (*offset) - span);
+	if (!(flags & DNODE_FIND_BACKWARDS) && blkid + 1 < maxblkid)
+		*offset = (blkid + 1) << span;
+	else if ((flags & DNODE_FIND_BACKWARDS) && blkid > 0)
+		*offset = (blkid << span) - 1;
+	else
+		return (B_FALSE);
+
+	return (B_TRUE);
+}
+
+/*
  * Find the next hole, data, or sparse region at or after *offset.
  * The value 'blkfill' tells us how many items we expect to find
  * in an L0 data block; this value is 1 for normal objects,
@@ -2682,7 +2708,7 @@ int
 dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset,
     int minlvl, uint64_t blkfill, uint64_t txg)
 {
-	uint64_t initial_offset = *offset;
+	uint64_t matched = *offset;
 	int lvl, maxlvl;
 	int error = 0;
 
@@ -2706,16 +2732,36 @@ dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset,
 
 	maxlvl = dn->dn_phys->dn_nlevels;
 
-	for (lvl = minlvl; lvl <= maxlvl; lvl++) {
+	for (lvl = minlvl; lvl <= maxlvl; ) {
 		error = dnode_next_offset_level(dn,
 		    flags, offset, lvl, blkfill, txg);
-		if (error != ESRCH)
+		if (error == 0 && lvl > minlvl) {
+			--lvl;
+			matched = *offset;
+		} else if (error == ESRCH && lvl < maxlvl &&
+		    dnode_next_block(dn, flags, &matched, lvl)) {
+			/*
+			 * Continue search at next/prev offset in lvl+1 block.
+			 *
+			 * Usually we only search upwards at the start of the
+			 * search as higher level blocks point at a matching
+			 * minlvl block in most cases, but we backtrack if not.
+			 *
+			 * This can happen for txg > 0 searches if the block
+			 * contains only BPs/dnodes freed at that txg. It also
+			 * happens if we are still syncing out the tree, and
+			 * some BP's at higher levels are not updated yet.
+			 *
+			 * We must adjust offset to avoid coming back to the
+			 * same offset and getting stuck looping forever. This
+			 * also deals with the case where offset is already at
+			 * the beginning or end of the object.
+			 */
+			++lvl;
+			*offset = matched;
+		} else {
 			break;
-	}
-
-	while (error == 0 && --lvl >= minlvl) {
-		error = dnode_next_offset_level(dn,
-		    flags, offset, lvl, blkfill, txg);
+		}
 	}
 
 	/*
@@ -2727,9 +2773,6 @@ dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset,
 		error = 0;
 	}
 
-	if (error == 0 && (flags & DNODE_FIND_BACKWARDS ?
-	    initial_offset < *offset : initial_offset > *offset))
-		error = SET_ERROR(ESRCH);
 out:
 	if (!(flags & DNODE_FIND_HAVELOCK))
 		rw_exit(&dn->dn_struct_rwlock);
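
Note on the dnode.c hunks: the rewritten loop backtracks instead of doing a single up-then-down pass. On a match above minlvl it descends (--lvl); on ESRCH below maxlvl it steps the saved offset past the exhausted block with dnode_next_block() and climbs (++lvl). The sketch below is a standalone model of just the offset-stepping arithmetic, where a level-lvl block spans 2^(lvl * epbs + datablkshift) bytes; the epbs and datablkshift values in main() are illustrative, not taken from this diff.

    /*
     * Standalone model of the dnode_next_block() stepping above.
     * Example values are illustrative: 128K data blocks (shift 17)
     * and 1024 block pointers per indirect block (epbs = 10).
     */
    #include <stdio.h>
    #include <stdint.h>
    #include <stdbool.h>

    #define	FIND_BACKWARDS	0x1

    static bool
    next_block(int flags, uint64_t *offset, int lvl, int epbs, int datablkshift)
    {
    	int span = lvl * epbs + datablkshift;
    	uint64_t blkid, maxblkid;

    	if (span >= 64)		/* 2^span would overflow a uint64_t */
    		return (false);

    	blkid = *offset >> span;
    	maxblkid = 1ULL << (64 - span);
    	if (!(flags & FIND_BACKWARDS) && blkid + 1 < maxblkid)
    		*offset = (blkid + 1) << span;	/* start of next block */
    	else if ((flags & FIND_BACKWARDS) && blkid > 0)
    		*offset = (blkid << span) - 1;	/* end of previous block */
    	else
    		return (false);			/* already at object edge */

    	return (true);
    }

    int main(void)
    {
    	uint64_t off = 5 << 17;	/* inside L0 block 5 of a 128K-block object */

    	/* Step to the next L1 region: span = 10 + 17 = 27 bits (128M). */
    	if (next_block(0, &off, 1, 10, 17))
    		printf("next L1 offset: 0x%llx\n", (unsigned long long)off);
    	return (0);
    }
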
diff --git a/sys/contrib/openzfs/module/zfs/mmp.c b/sys/contrib/openzfs/module/zfs/mmp.c
index 7db72b9b04b0..fd46127b6068 100644
--- a/sys/contrib/openzfs/module/zfs/mmp.c
+++ b/sys/contrib/openzfs/module/zfs/mmp.c
@@ -446,7 +446,7 @@ mmp_write_uberblock(spa_t *spa)
 	uint64_t offset;
 
 	hrtime_t lock_acquire_time = gethrtime();
-	spa_config_enter_mmp(spa, SCL_STATE, mmp_tag, RW_READER);
+	spa_config_enter_priority(spa, SCL_STATE, mmp_tag, RW_READER);
 	lock_acquire_time = gethrtime() - lock_acquire_time;
 	if (lock_acquire_time > (MSEC2NSEC(MMP_MIN_INTERVAL) / 10))
 		zfs_dbgmsg("MMP SCL_STATE acquisition pool '%s' took %llu ns "
diff --git a/sys/contrib/openzfs/module/zfs/range_tree.c b/sys/contrib/openzfs/module/zfs/range_tree.c
index ea2d2c7227c8..d73195f1a21f 100644
--- a/sys/contrib/openzfs/module/zfs/range_tree.c
+++ b/sys/contrib/openzfs/module/zfs/range_tree.c
@@ -585,7 +585,7 @@ zfs_range_tree_remove_impl(zfs_range_tree_t *rt, uint64_t start, uint64_t size,
 		 * the size, since we do not support removing partial segments
 		 * of range trees with gaps.
 		 */
-		zfs_zfs_rs_set_fill_raw(rs, rt, zfs_rs_get_end_raw(rs, rt) -
+		zfs_rs_set_fill_raw(rs, rt, zfs_rs_get_end_raw(rs, rt) -
 		    zfs_rs_get_start_raw(rs, rt));
 
 		zfs_range_tree_stat_incr(rt, &rs_tmp);
diff --git a/sys/contrib/openzfs/module/zfs/spa_config.c b/sys/contrib/openzfs/module/zfs/spa_config.c
index cf28955b0c50..f615591e826b 100644
--- a/sys/contrib/openzfs/module/zfs/spa_config.c
+++ b/sys/contrib/openzfs/module/zfs/spa_config.c
@@ -372,6 +372,8 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats)
 	fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, txg);
 	fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, spa_guid(spa));
 	fnvlist_add_uint64(config, ZPOOL_CONFIG_ERRATA, spa->spa_errata);
+	fnvlist_add_uint64(config, ZPOOL_CONFIG_MIN_ALLOC, spa->spa_min_alloc);
+	fnvlist_add_uint64(config, ZPOOL_CONFIG_MAX_ALLOC, spa->spa_max_alloc);
 	if (spa->spa_comment != NULL)
 		fnvlist_add_string(config, ZPOOL_CONFIG_COMMENT,
 		    spa->spa_comment);
diff --git a/sys/contrib/openzfs/module/zfs/spa_misc.c b/sys/contrib/openzfs/module/zfs/spa_misc.c
index 6f7c060f97f8..0bead6d49666 100644
--- a/sys/contrib/openzfs/module/zfs/spa_misc.c
+++ b/sys/contrib/openzfs/module/zfs/spa_misc.c
@@ -510,7 +510,7 @@ spa_config_tryenter(spa_t *spa, int locks, const void *tag, krw_t rw)
 
 static void
 spa_config_enter_impl(spa_t *spa, int locks, const void *tag, krw_t rw,
-    int mmp_flag)
+    int priority_flag)
 {
 	(void) tag;
 	int wlocks_held = 0;
@@ -526,7 +526,7 @@ spa_config_enter_impl(spa_t *spa, int locks, const void *tag, krw_t rw,
 		mutex_enter(&scl->scl_lock);
 		if (rw == RW_READER) {
 			while (scl->scl_writer ||
-			    (!mmp_flag && scl->scl_write_wanted)) {
+			    (!priority_flag && scl->scl_write_wanted)) {
 				cv_wait(&scl->scl_cv, &scl->scl_lock);
 			}
 		} else {
@@ -551,7 +551,7 @@ spa_config_enter(spa_t *spa, int locks, const void *tag, krw_t rw)
 }
 
 /*
- * The spa_config_enter_mmp() allows the mmp thread to cut in front of
+ * The spa_config_enter_priority() allows the mmp thread to cut in front of
  * outstanding write lock requests. This is needed since the mmp updates are
  * time sensitive and failure to service them promptly will result in a
  * suspended pool. This pool suspension has been seen in practice when there is
@@ -560,7 +560,7 @@ spa_config_enter(spa_t *spa, int locks, const void *tag, krw_t rw)
  */
 void
-spa_config_enter_mmp(spa_t *spa, int locks, const void *tag, krw_t rw)
+spa_config_enter_priority(spa_t *spa, int locks, const void *tag, krw_t rw)
 {
 	spa_config_enter_impl(spa, locks, tag, rw, 1);
 }
@@ -806,6 +806,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
 	spa->spa_min_ashift = INT_MAX;
 	spa->spa_max_ashift = 0;
 	spa->spa_min_alloc = INT_MAX;
+	spa->spa_max_alloc = 0;
 	spa->spa_gcd_alloc = INT_MAX;
 
 	/* Reset cached value */
@@ -1865,6 +1866,19 @@ spa_get_worst_case_asize(spa_t *spa, uint64_t lsize)
 }
 
 /*
+ * Return the range of minimum allocation sizes for the normal allocation
+ * class. This can be used by external consumers of the DMU to estimate
+ * potential wasted capacity when setting the recordsize for an object.
+ * This is mainly for dRAID pools which always pad to a full stripe width.
+ */
+void
+spa_get_min_alloc_range(spa_t *spa, uint64_t *min_alloc, uint64_t *max_alloc)
+{
+	*min_alloc = spa->spa_min_alloc;
+	*max_alloc = spa->spa_max_alloc;
+}
+
+/*
  * Return the amount of slop space in bytes.  It is typically 1/32 of the pool
  * (3.2%), minus the embedded log space.  On very small pools, it may be
  * slightly larger than this.  On very large pools, it will be capped to
@@ -3085,6 +3099,7 @@ EXPORT_SYMBOL(spa_version);
 EXPORT_SYMBOL(spa_state);
 EXPORT_SYMBOL(spa_load_state);
 EXPORT_SYMBOL(spa_freeze_txg);
+EXPORT_SYMBOL(spa_get_min_alloc_range);	/* for Lustre */
 EXPORT_SYMBOL(spa_get_dspace);
 EXPORT_SYMBOL(spa_update_dspace);
 EXPORT_SYMBOL(spa_deflate);
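
Note on the spa_misc.c hunks: the new spa_get_min_alloc_range() export hands DMU consumers (the in-tree comment names Lustre) both ends of the minimum-allocation range so they can estimate padding when choosing a recordsize. Below is a standalone model of that estimate; the 4K and 1M figures are illustrative only, roughly a 4K-sector mirror versus a dRAID full stripe width, and are not taken from this diff.

    /*
     * Standalone model of why the [min_alloc, max_alloc] range matters
     * for recordsize choice. All values are illustrative.
     */
    #include <stdio.h>
    #include <stdint.h>

    static uint64_t
    padding(uint64_t size, uint64_t max_alloc)
    {
    	/* Worst case: a block smaller than max_alloc pads up to it. */
    	return (size < max_alloc ? max_alloc - size : 0);
    }

    int main(void)
    {
    	uint64_t min_alloc = 4096;	/* e.g. mirror of 4K-sector disks */
    	uint64_t max_alloc = 1048576;	/* e.g. dRAID full stripe width */
    	uint64_t recordsize = 131072;	/* candidate 128K recordsize */

    	printf("range [%llu, %llu]: up to %llu bytes padding per record\n",
    	    (unsigned long long)min_alloc, (unsigned long long)max_alloc,
    	    (unsigned long long)padding(recordsize, max_alloc));
    	return (0);
    }
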
diff --git a/sys/contrib/openzfs/module/zfs/vdev.c b/sys/contrib/openzfs/module/zfs/vdev.c
index fc6d445f9785..654e034de9e1 100644
--- a/sys/contrib/openzfs/module/zfs/vdev.c
+++ b/sys/contrib/openzfs/module/zfs/vdev.c
@@ -1497,12 +1497,14 @@ vdev_spa_set_alloc(spa_t *spa, uint64_t min_alloc)
 {
 	if (min_alloc < spa->spa_min_alloc)
 		spa->spa_min_alloc = min_alloc;
-	if (spa->spa_gcd_alloc == INT_MAX) {
+
+	if (min_alloc > spa->spa_max_alloc)
+		spa->spa_max_alloc = min_alloc;
+
+	if (spa->spa_gcd_alloc == INT_MAX)
 		spa->spa_gcd_alloc = min_alloc;
-	} else {
-		spa->spa_gcd_alloc = vdev_gcd(min_alloc,
-		    spa->spa_gcd_alloc);
-	}
+	else
+		spa->spa_gcd_alloc = vdev_gcd(min_alloc, spa->spa_gcd_alloc);
 }
 
 void
@@ -1560,8 +1562,7 @@ vdev_metaslab_group_create(vdev_t *vd)
 		if (vd->vdev_ashift < spa->spa_min_ashift)
 			spa->spa_min_ashift = vd->vdev_ashift;
 
-		uint64_t min_alloc = vdev_get_min_alloc(vd);
-		vdev_spa_set_alloc(spa, min_alloc);
+		vdev_spa_set_alloc(spa, vdev_get_min_alloc(vd));
 	}
 }
diff --git a/sys/contrib/openzfs/module/zfs/vdev_label.c b/sys/contrib/openzfs/module/zfs/vdev_label.c
index c44f654b0261..0d4fdaa77ba0 100644
--- a/sys/contrib/openzfs/module/zfs/vdev_label.c
+++ b/sys/contrib/openzfs/module/zfs/vdev_label.c
@@ -511,6 +511,8 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
 		fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASHIFT, vd->vdev_ashift);
 		fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASIZE,
 		    vd->vdev_asize);
+		fnvlist_add_uint64(nv, ZPOOL_CONFIG_MIN_ALLOC,
+		    vdev_get_min_alloc(vd));
 		fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_LOG, vd->vdev_islog);
 		if (vd->vdev_noalloc) {
 			fnvlist_add_uint64(nv, ZPOOL_CONFIG_NONALLOCATING,
diff --git a/sys/contrib/openzfs/module/zfs/zio.c b/sys/contrib/openzfs/module/zfs/zio.c
index 4cf8912d4269..aeea58bedfe4 100644
--- a/sys/contrib/openzfs/module/zfs/zio.c
+++ b/sys/contrib/openzfs/module/zfs/zio.c
@@ -4574,8 +4574,29 @@ zio_vdev_io_start(zio_t *zio)
 	ASSERT0(zio->io_child_error[ZIO_CHILD_VDEV]);
 
 	if (vd == NULL) {
-		if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
-			spa_config_enter(spa, SCL_ZIO, zio, RW_READER);
+		if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) {
+			/*
+			 * A deadlock workaround. The ddt_prune_unique_entries()
+			 * -> prune_candidates_sync() code path takes the
+			 * SCL_ZIO reader lock and may request it again here.
+			 * If there is another thread who wants the SCL_ZIO
+			 * writer lock, then scl_write_wanted will be set.
+			 * Thus, the spa_config_enter_priority() is used to
+			 * ignore pending writer requests.
+			 *
+			 * The locking should be revised to remove the need
+			 * for this workaround.  If that's not workable then
+			 * it should only be applied to the zios involved in
+			 * the pruning process.  This impacts the read/write
+			 * I/O balance while pruning.
+			 */
+			if (spa->spa_active_ddt_prune)
+				spa_config_enter_priority(spa, SCL_ZIO, zio,
+				    RW_READER);
+			else
+				spa_config_enter(spa, SCL_ZIO, zio,
+				    RW_READER);
+		}
 
 		/*
 		 * The mirror_ops handle multiple DVAs in a single BP.
@@ -5305,6 +5326,16 @@ zio_ready(zio_t *zio)
 		return (NULL);
 	}
 
+	if (zio_injection_enabled) {
+		hrtime_t target = zio_handle_ready_delay(zio);
+		if (target != 0 && zio->io_target_timestamp == 0) {
+			zio->io_stage >>= 1;
+			zio->io_target_timestamp = target;
+			zio_delay_interrupt(zio);
+			return (NULL);
+		}
+	}
+
 	if (zio->io_ready) {
 		ASSERT(IO_IS_ALLOCATING(zio));
 		ASSERT(BP_GET_BIRTH(bp) == zio->io_txg ||
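
Note on the zio_ready() hunk: the injected delay does not sleep in place. ZIO pipeline stages are one-hot bits, so backing io_stage up one bit and calling zio_delay_interrupt() causes the READY stage to be re-entered once io_target_timestamp passes. Below is a standalone model of that one-hot retry trick; the stage values and the struct are invented for illustration and do not match the real zio_stage enum.

    /*
     * Standalone model of the delayed-retry pattern above: stages are
     * one-hot bits, and shifting the stage right by one makes the
     * dispatcher run the same stage again on the next advance.
     * Stage values are illustrative, not the real zio pipeline.
     */
    #include <stdio.h>
    #include <stdint.h>

    #define	STAGE_OPEN	0x01
    #define	STAGE_READY	0x02
    #define	STAGE_DONE	0x04

    struct io { uint32_t stage; uint64_t target; };

    /* Advance to the next stage bit above io->stage. */
    static uint32_t
    next_stage(const struct io *io)
    {
    	return (io->stage << 1);
    }

    int main(void)
    {
    	struct io io = { .stage = STAGE_READY, .target = 0 };

    	/* Injection hit: back up one bit so READY runs again later. */
    	io.stage >>= 1;
    	io.target = 1000;	/* illustrative deadline */

    	printf("re-dispatch -> stage 0x%x\n", (unsigned)next_stage(&io));
    	return (0);
    }
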
diff --git a/sys/contrib/openzfs/module/zfs/zio_inject.c b/sys/contrib/openzfs/module/zfs/zio_inject.c
index 981a1be4847c..287577018ed1 100644
--- a/sys/contrib/openzfs/module/zfs/zio_inject.c
+++ b/sys/contrib/openzfs/module/zfs/zio_inject.c
@@ -827,6 +827,44 @@ zio_handle_export_delay(spa_t *spa, hrtime_t elapsed)
 	zio_handle_pool_delay(spa, elapsed, ZINJECT_DELAY_EXPORT);
 }
 
+/*
+ * For testing, inject a delay before ready state.
+ */
+hrtime_t
+zio_handle_ready_delay(zio_t *zio)
+{
+	inject_handler_t *handler;
+	hrtime_t now = gethrtime();
+	hrtime_t target = 0;
+
+	/*
+	 * Ignore I/O not associated with any logical data.
+	 */
+	if (zio->io_logical == NULL)
+		return (0);
+
+	rw_enter(&inject_lock, RW_READER);
+
+	for (handler = list_head(&inject_handlers); handler != NULL;
+	    handler = list_next(&inject_handlers, handler)) {
+		if (zio->io_spa != handler->zi_spa ||
+		    handler->zi_record.zi_cmd != ZINJECT_DELAY_READY)
+			continue;
+
+		/* If this handler matches, inject the delay */
+		if (zio_match_iotype(zio, handler->zi_record.zi_iotype) &&
+		    zio_match_handler(&zio->io_logical->io_bookmark,
+		    zio->io_bp ? BP_GET_TYPE(zio->io_bp) : DMU_OT_NONE,
+		    zio_match_dva(zio), &handler->zi_record, zio->io_error)) {
+			target = now + (hrtime_t)handler->zi_record.zi_timer;
+			break;
+		}
+	}
+
+	rw_exit(&inject_lock);
+	return (target);
+}
+
 static int
 zio_calculate_range(const char *pool, zinject_record_t *record)
 {
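
Note on the zio_inject.c hunk: zio_handle_ready_delay() walks the global handler list, skips records registered for other pools or other commands, and derives the delay target from the first match (the loop breaks on the first hit). Below is a standalone model of that scan with simplified types and illustrative values; only ZINJECT_DELAY_READY itself comes from this diff.

    /*
     * Standalone model of the injection-handler scan above: skip
     * non-matching records, first match wins. Types are simplified
     * stand-ins, not the real inject_handler_t.
     */
    #include <stdio.h>
    #include <stdint.h>
    #include <stddef.h>

    enum zi_cmd { CMD_NONE, CMD_DELAY_READY };

    struct handler {
    	struct handler *next;
    	int pool_id;
    	enum zi_cmd cmd;
    	uint64_t timer;		/* delay in ns */
    };

    static uint64_t
    ready_delay_target(struct handler *list, int pool_id, uint64_t now)
    {
    	for (struct handler *h = list; h != NULL; h = h->next) {
    		if (h->pool_id != pool_id || h->cmd != CMD_DELAY_READY)
    			continue;
    		return (now + h->timer);	/* first match wins */
    	}
    	return (0);	/* no injected delay */
    }

    int main(void)
    {
    	struct handler b = { NULL, 7, CMD_DELAY_READY, 500000 };
    	struct handler a = { &b, 3, CMD_NONE, 0 };

    	printf("target = %llu\n",
    	    (unsigned long long)ready_delay_target(&a, 7, 1000000));
    	return (0);
    }
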