Diffstat (limited to 'sys/contrib/openzfs/module')
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c    23
-rw-r--r--  sys/contrib/openzfs/module/zcommon/zfs_prop.c          2
-rw-r--r--  sys/contrib/openzfs/module/zfs/dnode.c                65
-rw-r--r--  sys/contrib/openzfs/module/zfs/mmp.c                   2
-rw-r--r--  sys/contrib/openzfs/module/zfs/range_tree.c            2
-rw-r--r--  sys/contrib/openzfs/module/zfs/spa_config.c            2
-rw-r--r--  sys/contrib/openzfs/module/zfs/spa_misc.c             23
-rw-r--r--  sys/contrib/openzfs/module/zfs/vdev.c                 15
-rw-r--r--  sys/contrib/openzfs/module/zfs/vdev_label.c            2
-rw-r--r--  sys/contrib/openzfs/module/zfs/zio.c                  35
-rw-r--r--  sys/contrib/openzfs/module/zfs/zio_inject.c           38
11 files changed, 181 insertions, 28 deletions
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c
index bac166fcd89e..967a018640e1 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c
@@ -484,7 +484,28 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,
fstrans_cookie_t cookie = spl_fstrans_mark();
uint64_t offset = io_offset(bio, rq);
uint64_t size = io_size(bio, rq);
- int rw = io_data_dir(bio, rq);
+ int rw;
+
+ if (rq != NULL) {
+ /*
+ * Flush & trim requests go down the zvol_write codepath. Or
+ * more specifically:
+ *
+ * If request is a write, or if it's op_is_sync() and not a
+ * read, or if it's a flush, or if it's a discard, then send the
+ * request down the write path.
+ */
+ if (op_is_write(rq->cmd_flags) ||
+ (op_is_sync(rq->cmd_flags) && req_op(rq) != REQ_OP_READ) ||
+ req_op(rq) == REQ_OP_FLUSH ||
+ op_is_discard(rq->cmd_flags)) {
+ rw = WRITE;
+ } else {
+ rw = READ;
+ }
+ } else {
+ rw = bio_data_dir(bio);
+ }
if (unlikely(zv->zv_flags & ZVOL_REMOVING)) {
zvol_end_io(bio, rq, SET_ERROR(ENXIO));
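
For reference, the request-direction decision in this hunk reduces to a small standalone predicate. In the sketch below the req_op enum, the req struct, and its sync flag are stand-ins for the Linux block-layer types and the op_is_write()/op_is_sync()/req_op()/op_is_discard() macros; only the branching mirrors the change.

#include <stdbool.h>
#include <stdio.h>

enum req_op { OP_READ, OP_WRITE, OP_FLUSH, OP_DISCARD };

struct req {
	enum req_op	op;
	bool		sync;		/* stand-in for op_is_sync() */
};

/*
 * Mirror of the new rw selection: writes, sync non-reads, flushes, and
 * discards all take the write path; everything else takes the read path.
 */
static int
req_data_dir(const struct req *rq)
{
	if (rq->op == OP_WRITE ||
	    (rq->sync && rq->op != OP_READ) ||
	    rq->op == OP_FLUSH ||
	    rq->op == OP_DISCARD)
		return (1);		/* WRITE */
	return (0);			/* READ */
}

int
main(void)
{
	struct req flush = { .op = OP_FLUSH, .sync = false };
	struct req sread = { .op = OP_READ, .sync = true };

	printf("flush     -> %s\n", req_data_dir(&flush) ? "write path" : "read path");
	printf("sync read -> %s\n", req_data_dir(&sread) ? "write path" : "read path");
	return (0);
}
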
diff --git a/sys/contrib/openzfs/module/zcommon/zfs_prop.c b/sys/contrib/openzfs/module/zcommon/zfs_prop.c
index 864e3898b365..9190ae0362ea 100644
--- a/sys/contrib/openzfs/module/zcommon/zfs_prop.c
+++ b/sys/contrib/openzfs/module/zcommon/zfs_prop.c
@@ -364,8 +364,8 @@ zfs_prop_init(void)
static const zprop_index_t xattr_table[] = {
{ "off", ZFS_XATTR_OFF },
- { "on", ZFS_XATTR_SA },
{ "sa", ZFS_XATTR_SA },
+ { "on", ZFS_XATTR_SA },
{ "dir", ZFS_XATTR_DIR },
{ NULL }
};
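
The reorder matters because the index-to-name lookup used when displaying a property value returns the first table entry whose value matches, so the ZFS_XATTR_SA index now reads back as "sa" rather than "on". A minimal sketch of that first-match behavior; prop_index_t and the enum values are local stand-ins modeled on zprop_index_t and zfs_xattr_type_t, with the table contents copied from the hunk.

#include <stdio.h>
#include <stddef.h>

typedef struct {
	const char *name;
	unsigned long long value;
} prop_index_t;

enum { XATTR_OFF = 0, XATTR_DIR = 1, XATTR_SA = 2 };

static const prop_index_t xattr_table[] = {
	{ "off",	XATTR_OFF },
	{ "sa",		XATTR_SA },
	{ "on",		XATTR_SA },	/* alias, deliberately after "sa" */
	{ "dir",	XATTR_DIR },
	{ NULL,		0 }
};

/* First matching value wins, as in zprop_index_to_string(). */
static const char *
index_to_name(unsigned long long value)
{
	for (size_t i = 0; xattr_table[i].name != NULL; i++)
		if (xattr_table[i].value == value)
			return (xattr_table[i].name);
	return (NULL);
}

int
main(void)
{
	printf("%s\n", index_to_name(XATTR_SA));	/* prints "sa" */
	return (0);
}
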
diff --git a/sys/contrib/openzfs/module/zfs/dnode.c b/sys/contrib/openzfs/module/zfs/dnode.c
index 6c150d31c669..e88d394b5229 100644
--- a/sys/contrib/openzfs/module/zfs/dnode.c
+++ b/sys/contrib/openzfs/module/zfs/dnode.c
@@ -2656,6 +2656,32 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
}
/*
+ * Adjust *offset to the next (or previous) block byte offset at lvl.
+ * Returns FALSE if *offset would overflow or underflow.
+ */
+static boolean_t
+dnode_next_block(dnode_t *dn, int flags, uint64_t *offset, int lvl)
+{
+ int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+ int span = lvl * epbs + dn->dn_datablkshift;
+ uint64_t blkid, maxblkid;
+
+ if (span >= 8 * sizeof (uint64_t))
+ return (B_FALSE);
+
+ blkid = *offset >> span;
+ maxblkid = 1ULL << (8 * sizeof (*offset) - span);
+ if (!(flags & DNODE_FIND_BACKWARDS) && blkid + 1 < maxblkid)
+ *offset = (blkid + 1) << span;
+ else if ((flags & DNODE_FIND_BACKWARDS) && blkid > 0)
+ *offset = (blkid << span) - 1;
+ else
+ return (B_FALSE);
+
+ return (B_TRUE);
+}
+
+/*
* Find the next hole, data, or sparse region at or after *offset.
* The value 'blkfill' tells us how many items we expect to find
* in an L0 data block; this value is 1 for normal objects,
@@ -2682,7 +2708,7 @@ int
dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset,
int minlvl, uint64_t blkfill, uint64_t txg)
{
- uint64_t initial_offset = *offset;
+ uint64_t matched = *offset;
int lvl, maxlvl;
int error = 0;
@@ -2706,16 +2732,36 @@ dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset,
maxlvl = dn->dn_phys->dn_nlevels;
- for (lvl = minlvl; lvl <= maxlvl; lvl++) {
+ for (lvl = minlvl; lvl <= maxlvl; ) {
error = dnode_next_offset_level(dn,
flags, offset, lvl, blkfill, txg);
- if (error != ESRCH)
+ if (error == 0 && lvl > minlvl) {
+ --lvl;
+ matched = *offset;
+ } else if (error == ESRCH && lvl < maxlvl &&
+ dnode_next_block(dn, flags, &matched, lvl)) {
+ /*
+ * Continue search at next/prev offset in lvl+1 block.
+ *
+ * Usually we only search upwards at the start of the
+ * search as higher level blocks point at a matching
+ * minlvl block in most cases, but we backtrack if not.
+ *
+ * This can happen for txg > 0 searches if the block
+ * contains only BPs/dnodes freed at that txg. It also
+ * happens if we are still syncing out the tree, and
+ * some BP's at higher levels are not updated yet.
+ *
+ * We must adjust offset to avoid coming back to the
+ * same offset and getting stuck looping forever. This
+ * also deals with the case where offset is already at
+ * the beginning or end of the object.
+ */
+ ++lvl;
+ *offset = matched;
+ } else {
break;
- }
-
- while (error == 0 && --lvl >= minlvl) {
- error = dnode_next_offset_level(dn,
- flags, offset, lvl, blkfill, txg);
+ }
}
/*
@@ -2727,9 +2773,6 @@ dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset,
error = 0;
}
- if (error == 0 && (flags & DNODE_FIND_BACKWARDS ?
- initial_offset < *offset : initial_offset > *offset))
- error = SET_ERROR(ESRCH);
out:
if (!(flags & DNODE_FIND_HAVELOCK))
rw_exit(&dn->dn_struct_rwlock);
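
The offset stepping done by the new dnode_next_block() amounts to rounding to the next (or previous) block boundary at the given level, where a level-lvl block covers 1 << (lvl * epbs + dn_datablkshift) bytes. A standalone sketch of the forward case; the 128K data and indirect block shifts are typical defaults chosen for the example, not values taken from the diff.

#include <stdio.h>
#include <stdint.h>

#define SPA_BLKPTRSHIFT	7		/* sizeof (blkptr_t) == 128 */

/* Return the byte offset of the next level-lvl block after offset. */
static uint64_t
next_block_offset(uint64_t offset, int lvl, int datablkshift, int indblkshift)
{
	int epbs = indblkshift - SPA_BLKPTRSHIFT;	/* block pointers per indirect */
	int span = lvl * epbs + datablkshift;		/* block covers 1 << span bytes */

	return (((offset >> span) + 1) << span);
}

int
main(void)
{
	uint64_t off = 5 << 20;		/* 5 MiB into the object */

	/* 128K (2^17) data blocks, 128K indirects => epbs = 10 */
	printf("L0 next: %llu\n",	/* next 128K boundary: 5373952 */
	    (unsigned long long)next_block_offset(off, 0, 17, 17));
	printf("L1 next: %llu\n",	/* next 128M boundary: 134217728 */
	    (unsigned long long)next_block_offset(off, 1, 17, 17));
	return (0);
}
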
diff --git a/sys/contrib/openzfs/module/zfs/mmp.c b/sys/contrib/openzfs/module/zfs/mmp.c
index 7db72b9b04b0..fd46127b6068 100644
--- a/sys/contrib/openzfs/module/zfs/mmp.c
+++ b/sys/contrib/openzfs/module/zfs/mmp.c
@@ -446,7 +446,7 @@ mmp_write_uberblock(spa_t *spa)
uint64_t offset;
hrtime_t lock_acquire_time = gethrtime();
- spa_config_enter_mmp(spa, SCL_STATE, mmp_tag, RW_READER);
+ spa_config_enter_priority(spa, SCL_STATE, mmp_tag, RW_READER);
lock_acquire_time = gethrtime() - lock_acquire_time;
if (lock_acquire_time > (MSEC2NSEC(MMP_MIN_INTERVAL) / 10))
zfs_dbgmsg("MMP SCL_STATE acquisition pool '%s' took %llu ns "
diff --git a/sys/contrib/openzfs/module/zfs/range_tree.c b/sys/contrib/openzfs/module/zfs/range_tree.c
index ea2d2c7227c8..d73195f1a21f 100644
--- a/sys/contrib/openzfs/module/zfs/range_tree.c
+++ b/sys/contrib/openzfs/module/zfs/range_tree.c
@@ -585,7 +585,7 @@ zfs_range_tree_remove_impl(zfs_range_tree_t *rt, uint64_t start, uint64_t size,
* the size, since we do not support removing partial segments
* of range trees with gaps.
*/
- zfs_zfs_rs_set_fill_raw(rs, rt, zfs_rs_get_end_raw(rs, rt) -
+ zfs_rs_set_fill_raw(rs, rt, zfs_rs_get_end_raw(rs, rt) -
zfs_rs_get_start_raw(rs, rt));
zfs_range_tree_stat_incr(rt, &rs_tmp);
diff --git a/sys/contrib/openzfs/module/zfs/spa_config.c b/sys/contrib/openzfs/module/zfs/spa_config.c
index cf28955b0c50..f615591e826b 100644
--- a/sys/contrib/openzfs/module/zfs/spa_config.c
+++ b/sys/contrib/openzfs/module/zfs/spa_config.c
@@ -372,6 +372,8 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats)
fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, txg);
fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, spa_guid(spa));
fnvlist_add_uint64(config, ZPOOL_CONFIG_ERRATA, spa->spa_errata);
+ fnvlist_add_uint64(config, ZPOOL_CONFIG_MIN_ALLOC, spa->spa_min_alloc);
+ fnvlist_add_uint64(config, ZPOOL_CONFIG_MAX_ALLOC, spa->spa_max_alloc);
if (spa->spa_comment != NULL)
fnvlist_add_string(config, ZPOOL_CONFIG_COMMENT,
spa->spa_comment);
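
A consumer of the generated config can read the new hints back with the usual nvlist lookups. A minimal sketch, assuming ZPOOL_CONFIG_MIN_ALLOC and ZPOOL_CONFIG_MAX_ALLOC are defined alongside the other ZPOOL_CONFIG_* names elsewhere in this change.

#include <libnvpair.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/fs/zfs.h>

/* Print the allocation-size hints if the config carries them. */
void
print_alloc_hints(nvlist_t *config)
{
	uint64_t min_alloc, max_alloc;

	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_MIN_ALLOC,
	    &min_alloc) == 0 &&
	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_MAX_ALLOC,
	    &max_alloc) == 0) {
		(void) printf("min alloc %llu, max alloc %llu\n",
		    (unsigned long long)min_alloc,
		    (unsigned long long)max_alloc);
	}
}
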
diff --git a/sys/contrib/openzfs/module/zfs/spa_misc.c b/sys/contrib/openzfs/module/zfs/spa_misc.c
index 6f7c060f97f8..0bead6d49666 100644
--- a/sys/contrib/openzfs/module/zfs/spa_misc.c
+++ b/sys/contrib/openzfs/module/zfs/spa_misc.c
@@ -510,7 +510,7 @@ spa_config_tryenter(spa_t *spa, int locks, const void *tag, krw_t rw)
static void
spa_config_enter_impl(spa_t *spa, int locks, const void *tag, krw_t rw,
- int mmp_flag)
+ int priority_flag)
{
(void) tag;
int wlocks_held = 0;
@@ -526,7 +526,7 @@ spa_config_enter_impl(spa_t *spa, int locks, const void *tag, krw_t rw,
mutex_enter(&scl->scl_lock);
if (rw == RW_READER) {
while (scl->scl_writer ||
- (!mmp_flag && scl->scl_write_wanted)) {
+ (!priority_flag && scl->scl_write_wanted)) {
cv_wait(&scl->scl_cv, &scl->scl_lock);
}
} else {
@@ -551,7 +551,7 @@ spa_config_enter(spa_t *spa, int locks, const void *tag, krw_t rw)
}
/*
- * The spa_config_enter_mmp() allows the mmp thread to cut in front of
+ * The spa_config_enter_priority() allows the mmp thread to cut in front of
* outstanding write lock requests. This is needed since the mmp updates are
* time sensitive and failure to service them promptly will result in a
* suspended pool. This pool suspension has been seen in practice when there is
@@ -560,7 +560,7 @@ spa_config_enter(spa_t *spa, int locks, const void *tag, krw_t rw)
*/
void
-spa_config_enter_mmp(spa_t *spa, int locks, const void *tag, krw_t rw)
+spa_config_enter_priority(spa_t *spa, int locks, const void *tag, krw_t rw)
{
spa_config_enter_impl(spa, locks, tag, rw, 1);
}
@@ -806,6 +806,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
spa->spa_min_ashift = INT_MAX;
spa->spa_max_ashift = 0;
spa->spa_min_alloc = INT_MAX;
+ spa->spa_max_alloc = 0;
spa->spa_gcd_alloc = INT_MAX;
/* Reset cached value */
@@ -1865,6 +1866,19 @@ spa_get_worst_case_asize(spa_t *spa, uint64_t lsize)
}
/*
+ * Return the range of minimum allocation sizes for the normal allocation
+ * class. This can be used by external consumers of the DMU to estimate
+ * potential wasted capacity when setting the recordsize for an object.
+ * This is mainly for dRAID pools which always pad to a full stripe width.
+ */
+void
+spa_get_min_alloc_range(spa_t *spa, uint64_t *min_alloc, uint64_t *max_alloc)
+{
+ *min_alloc = spa->spa_min_alloc;
+ *max_alloc = spa->spa_max_alloc;
+}
+
+/*
* Return the amount of slop space in bytes. It is typically 1/32 of the pool
* (3.2%), minus the embedded log space. On very small pools, it may be
* slightly larger than this. On very large pools, it will be capped to
@@ -3085,6 +3099,7 @@ EXPORT_SYMBOL(spa_version);
EXPORT_SYMBOL(spa_state);
EXPORT_SYMBOL(spa_load_state);
EXPORT_SYMBOL(spa_freeze_txg);
+EXPORT_SYMBOL(spa_get_min_alloc_range); /* for Lustre */
EXPORT_SYMBOL(spa_get_dspace);
EXPORT_SYMBOL(spa_update_dspace);
EXPORT_SYMBOL(spa_deflate);
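
A sketch of the estimate the new comment describes: with values obtained from spa_get_min_alloc_range(), an external DMU consumer can bound the padding a record size may incur when the allocator pads up to max_alloc (the dRAID full-stripe case). The rounding rule and the example sizes are assumptions for illustration, not code from this change.

#include <stdio.h>
#include <stdint.h>

/* Worst-case bytes of padding for one record of the given size. */
static uint64_t
worst_case_padding(uint64_t recordsize, uint64_t min_alloc, uint64_t max_alloc)
{
	uint64_t alloc;

	if (recordsize <= min_alloc)
		alloc = min_alloc;	/* nothing smaller is ever allocated */
	else
		alloc = ((recordsize + max_alloc - 1) / max_alloc) * max_alloc;

	return (alloc - recordsize);
}

int
main(void)
{
	/* Hypothetical dRAID-ish range: 4K minimum, 48K full stripe. */
	uint64_t pad = worst_case_padding(128ULL << 10, 4096, 49152);

	printf("128K record may waste up to %llu bytes\n",
	    (unsigned long long)pad);
	return (0);
}
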
diff --git a/sys/contrib/openzfs/module/zfs/vdev.c b/sys/contrib/openzfs/module/zfs/vdev.c
index fc6d445f9785..654e034de9e1 100644
--- a/sys/contrib/openzfs/module/zfs/vdev.c
+++ b/sys/contrib/openzfs/module/zfs/vdev.c
@@ -1497,12 +1497,14 @@ vdev_spa_set_alloc(spa_t *spa, uint64_t min_alloc)
{
if (min_alloc < spa->spa_min_alloc)
spa->spa_min_alloc = min_alloc;
- if (spa->spa_gcd_alloc == INT_MAX) {
+
+ if (min_alloc > spa->spa_max_alloc)
+ spa->spa_max_alloc = min_alloc;
+
+ if (spa->spa_gcd_alloc == INT_MAX)
spa->spa_gcd_alloc = min_alloc;
- } else {
- spa->spa_gcd_alloc = vdev_gcd(min_alloc,
- spa->spa_gcd_alloc);
- }
+ else
+ spa->spa_gcd_alloc = vdev_gcd(min_alloc, spa->spa_gcd_alloc);
}
void
@@ -1560,8 +1562,7 @@ vdev_metaslab_group_create(vdev_t *vd)
if (vd->vdev_ashift < spa->spa_min_ashift)
spa->spa_min_ashift = vd->vdev_ashift;
- uint64_t min_alloc = vdev_get_min_alloc(vd);
- vdev_spa_set_alloc(spa, min_alloc);
+ vdev_spa_set_alloc(spa, vdev_get_min_alloc(vd));
}
}
}
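
The restyled block above folds each vdev's minimum allocation size into a running GCD, seeded from INT_MAX. A standalone sketch of that accumulation; the gcd() helper and the example sizes are illustrative (the module uses vdev_gcd()).

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

static uint64_t
gcd(uint64_t a, uint64_t b)
{
	while (b != 0) {
		uint64_t t = b;
		b = a % b;
		a = t;
	}
	return (a);
}

int
main(void)
{
	/* Hypothetical per-vdev minimum allocation sizes. */
	uint64_t min_allocs[] = { 4096, 12288, 8192 };
	uint64_t gcd_alloc = UINT64_MAX;	/* "unset" seed, like INT_MAX */

	for (size_t i = 0; i < sizeof (min_allocs) / sizeof (min_allocs[0]); i++) {
		gcd_alloc = (gcd_alloc == UINT64_MAX) ?
		    min_allocs[i] : gcd(min_allocs[i], gcd_alloc);
	}

	printf("spa_gcd_alloc = %llu\n", (unsigned long long)gcd_alloc);	/* 4096 */
	return (0);
}
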
diff --git a/sys/contrib/openzfs/module/zfs/vdev_label.c b/sys/contrib/openzfs/module/zfs/vdev_label.c
index c44f654b0261..0d4fdaa77ba0 100644
--- a/sys/contrib/openzfs/module/zfs/vdev_label.c
+++ b/sys/contrib/openzfs/module/zfs/vdev_label.c
@@ -511,6 +511,8 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASHIFT, vd->vdev_ashift);
fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASIZE,
vd->vdev_asize);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_MIN_ALLOC,
+ vdev_get_min_alloc(vd));
fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_LOG, vd->vdev_islog);
if (vd->vdev_noalloc) {
fnvlist_add_uint64(nv, ZPOOL_CONFIG_NONALLOCATING,
diff --git a/sys/contrib/openzfs/module/zfs/zio.c b/sys/contrib/openzfs/module/zfs/zio.c
index 4cf8912d4269..aeea58bedfe4 100644
--- a/sys/contrib/openzfs/module/zfs/zio.c
+++ b/sys/contrib/openzfs/module/zfs/zio.c
@@ -4574,8 +4574,29 @@ zio_vdev_io_start(zio_t *zio)
ASSERT0(zio->io_child_error[ZIO_CHILD_VDEV]);
if (vd == NULL) {
- if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
- spa_config_enter(spa, SCL_ZIO, zio, RW_READER);
+ if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) {
+ /*
+ * A deadlock workaround. The ddt_prune_unique_entries()
+ * -> prune_candidates_sync() code path takes the
+ * SCL_ZIO reader lock and may request it again here.
+ * If there is another thread who wants the SCL_ZIO
+ * writer lock, then scl_write_wanted will be set.
+ * Thus, the spa_config_enter_priority() is used to
+ * ignore pending writer requests.
+ *
+ * The locking should be revised to remove the need
+ * for this workaround. If that's not workable then
+ * it should only be applied to the zios involved in
+ * the pruning process. This impacts the read/write
+ * I/O balance while pruning.
+ */
+ if (spa->spa_active_ddt_prune)
+ spa_config_enter_priority(spa, SCL_ZIO, zio,
+ RW_READER);
+ else
+ spa_config_enter(spa, SCL_ZIO, zio,
+ RW_READER);
+ }
/*
* The mirror_ops handle multiple DVAs in a single BP.
@@ -5305,6 +5326,16 @@ zio_ready(zio_t *zio)
return (NULL);
}
+ if (zio_injection_enabled) {
+ hrtime_t target = zio_handle_ready_delay(zio);
+ if (target != 0 && zio->io_target_timestamp == 0) {
+ zio->io_stage >>= 1;
+ zio->io_target_timestamp = target;
+ zio_delay_interrupt(zio);
+ return (NULL);
+ }
+ }
+
if (zio->io_ready) {
ASSERT(IO_IS_ALLOCATING(zio));
ASSERT(BP_GET_BIRTH(bp) == zio->io_txg ||
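
The hook added to zio_ready() follows the existing delay-injection pattern: record the target timestamp, shift io_stage right by one so the pipeline re-executes the ready stage once the delayed interrupt fires, and hand the zio to zio_delay_interrupt(). A toy simulation of the one-hot stage rewind; the stage bit value below is hypothetical (the real stage definitions live in zio_impl.h).

#include <stdio.h>
#include <stdint.h>

#define STAGE_READY	(1u << 5)	/* hypothetical one-hot stage bit */

int
main(void)
{
	uint32_t io_stage = STAGE_READY;	/* currently in the ready stage */

	io_stage >>= 1;				/* rewind one (one-hot) stage */
	printf("rewound stage:    0x%02x\n", io_stage);

	io_stage <<= 1;				/* delayed interrupt re-advances */
	printf("ready re-entered: %s\n",
	    io_stage == STAGE_READY ? "yes" : "no");
	return (0);
}
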
diff --git a/sys/contrib/openzfs/module/zfs/zio_inject.c b/sys/contrib/openzfs/module/zfs/zio_inject.c
index 981a1be4847c..287577018ed1 100644
--- a/sys/contrib/openzfs/module/zfs/zio_inject.c
+++ b/sys/contrib/openzfs/module/zfs/zio_inject.c
@@ -827,6 +827,44 @@ zio_handle_export_delay(spa_t *spa, hrtime_t elapsed)
zio_handle_pool_delay(spa, elapsed, ZINJECT_DELAY_EXPORT);
}
+/*
+ * For testing, inject a delay before ready state.
+ */
+hrtime_t
+zio_handle_ready_delay(zio_t *zio)
+{
+ inject_handler_t *handler;
+ hrtime_t now = gethrtime();
+ hrtime_t target = 0;
+
+ /*
+ * Ignore I/O not associated with any logical data.
+ */
+ if (zio->io_logical == NULL)
+ return (0);
+
+ rw_enter(&inject_lock, RW_READER);
+
+ for (handler = list_head(&inject_handlers); handler != NULL;
+ handler = list_next(&inject_handlers, handler)) {
+ if (zio->io_spa != handler->zi_spa ||
+ handler->zi_record.zi_cmd != ZINJECT_DELAY_READY)
+ continue;
+
+ /* If this handler matches, inject the delay */
+ if (zio_match_iotype(zio, handler->zi_record.zi_iotype) &&
+ zio_match_handler(&zio->io_logical->io_bookmark,
+ zio->io_bp ? BP_GET_TYPE(zio->io_bp) : DMU_OT_NONE,
+ zio_match_dva(zio), &handler->zi_record, zio->io_error)) {
+ target = now + (hrtime_t)handler->zi_record.zi_timer;
+ break;
+ }
+ }
+
+ rw_exit(&inject_lock);
+ return (target);
+}
+
static int
zio_calculate_range(const char *pool, zinject_record_t *record)
{