aboutsummaryrefslogtreecommitdiff
path: root/module/zfs/spa.c
diff options
context:
space:
mode:
authorJohn Gallagher <john.gallagher@delphix.com>2019-09-14 01:09:06 +0000
committerBrian Behlendorf <behlendorf1@llnl.gov>2019-09-14 01:09:06 +0000
commite60e158eff920825311c1e18b3631876eaaacb54 (patch)
tree03b5f6ff4855ae0fdc233d377d3c1939d1223912 /module/zfs/spa.c
parent7238cbd4d3ee7eadb3131c890d0692a49ea844af (diff)
downloadsrc-e60e158eff920825311c1e18b3631876eaaacb54.tar.gz
src-e60e158eff920825311c1e18b3631876eaaacb54.zip
Diffstat (limited to 'module/zfs/spa.c')
-rw-r--r--module/zfs/spa.c277
1 files changed, 277 insertions, 0 deletions
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index 8330ab1ce8d3..0f1a2a9eb38c 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -1541,6 +1541,8 @@ spa_unload(spa_t *spa)
spa_import_progress_remove(spa_guid(spa));
spa_load_note(spa, "UNLOADING");
+ spa_wake_waiters(spa);
+
/*
* If the log space map feature is enabled and the pool is getting
* exported (but not destroyed), we want to spend some time flushing
@@ -2470,6 +2472,7 @@ livelist_delete_sync(void *arg, dmu_tx_t *tx)
DMU_POOL_DELETED_CLONES, tx));
VERIFY0(zap_destroy(mos, zap_obj, tx));
spa->spa_livelists_to_delete = 0;
+ spa_notify_waiters(spa);
}
}
@@ -6947,6 +6950,7 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
vdev_dirty(tvd, VDD_DTL, vd, txg);
spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE);
+ spa_notify_waiters(spa);
/* hang on to the spa before we release the lock */
spa_open_ref(spa, FTAG);
@@ -9228,6 +9232,279 @@ spa_total_metaslabs(spa_t *spa)
return (m);
}
+/*
+ * Notify any waiting threads that some activity has switched from being in-
+ * progress to not-in-progress so that the thread can wake up and determine
+ * whether it is finished waiting.
+ */
+void
+spa_notify_waiters(spa_t *spa)
+{
+ /*
+ * Acquiring spa_activities_lock here prevents the cv_broadcast from
+ * happening between the waiting thread's check and cv_wait.
+ */
+ mutex_enter(&spa->spa_activities_lock);
+ cv_broadcast(&spa->spa_activities_cv);
+ mutex_exit(&spa->spa_activities_lock);
+}
+
+/*
+ * Notify any waiting threads that the pool is exporting, and then block until
+ * they are finished using the spa_t.
+ */
+void
+spa_wake_waiters(spa_t *spa)
+{
+ mutex_enter(&spa->spa_activities_lock);
+ spa->spa_waiters_cancel = B_TRUE;
+ cv_broadcast(&spa->spa_activities_cv);
+ while (spa->spa_waiters != 0)
+ cv_wait(&spa->spa_waiters_cv, &spa->spa_activities_lock);
+ spa->spa_waiters_cancel = B_FALSE;
+ mutex_exit(&spa->spa_activities_lock);
+}
+
+/* Whether the vdev or any of its descendants is initializing. */
+static boolean_t
+spa_vdev_initializing_impl(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+ boolean_t initializing;
+
+ ASSERT(spa_config_held(spa, SCL_CONFIG | SCL_STATE, RW_READER));
+ ASSERT(MUTEX_HELD(&spa->spa_activities_lock));
+
+ mutex_exit(&spa->spa_activities_lock);
+ mutex_enter(&vd->vdev_initialize_lock);
+ mutex_enter(&spa->spa_activities_lock);
+
+ initializing = (vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE);
+ mutex_exit(&vd->vdev_initialize_lock);
+
+ if (initializing)
+ return (B_TRUE);
+
+ for (int i = 0; i < vd->vdev_children; i++) {
+ if (spa_vdev_initializing_impl(vd->vdev_child[i]))
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+}
+
+/*
+ * If use_guid is true, this checks whether the vdev specified by guid is
+ * being initialized. Otherwise, it checks whether any vdev in the pool is being
+ * initialized. The caller must hold the config lock and spa_activities_lock.
+ */
+static int
+spa_vdev_initializing(spa_t *spa, boolean_t use_guid, uint64_t guid,
+ boolean_t *in_progress)
+{
+ mutex_exit(&spa->spa_activities_lock);
+ spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
+ mutex_enter(&spa->spa_activities_lock);
+
+ vdev_t *vd;
+ if (use_guid) {
+ vd = spa_lookup_by_guid(spa, guid, B_FALSE);
+ if (vd == NULL || !vd->vdev_ops->vdev_op_leaf) {
+ spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+ return (EINVAL);
+ }
+ } else {
+ vd = spa->spa_root_vdev;
+ }
+
+ *in_progress = spa_vdev_initializing_impl(vd);
+
+ spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+ return (0);
+}
+
+/*
+ * Locking for waiting threads
+ * ---------------------------
+ *
+ * Waiting threads need a way to check whether a given activity is in progress,
+ * and then, if it is, wait for it to complete. Each activity will have some
+ * in-memory representation of the relevant on-disk state which can be used to
+ * determine whether or not the activity is in progress. The in-memory state and
+ * the locking used to protect it will be different for each activity, and may
+ * not be suitable for use with a cvar (e.g., some state is protected by the
+ * config lock). To allow waiting threads to wait without any races, another
+ * lock, spa_activities_lock, is used.
+ *
+ * When the state is checked, both the activity-specific lock (if there is one)
+ * and spa_activities_lock are held. In some cases, the activity-specific lock
+ * is acquired explicitly (e.g. the config lock). In others, the locking is
+ * internal to some check (e.g. bpobj_is_empty). After checking, the waiting
+ * thread releases the activity-specific lock and, if the activity is in
+ * progress, then cv_waits using spa_activities_lock.
+ *
+ * The waiting thread is woken when another thread, one completing some
+ * activity, updates the state of the activity and then calls
+ * spa_notify_waiters, which will cv_broadcast. This 'completing' thread only
+ * needs to hold its activity-specific lock when updating the state, and this
+ * lock can (but doesn't have to) be dropped before calling spa_notify_waiters.
+ *
+ * Because spa_notify_waiters acquires spa_activities_lock before broadcasting,
+ * and because it is held when the waiting thread checks the state of the
+ * activity, it can never be the case that the completing thread both updates
+ * the activity state and cv_broadcasts in between the waiting thread's check
+ * and cv_wait. Thus, a waiting thread can never miss a wakeup.
+ *
+ * In order to prevent deadlock, when the waiting thread does its check, in some
+ * cases it will temporarily drop spa_activities_lock in order to acquire the
+ * activity-specific lock. The order in which spa_activities_lock and the
+ * activity specific lock are acquired in the waiting thread is determined by
+ * the order in which they are acquired in the completing thread; if the
+ * completing thread calls spa_notify_waiters with the activity-specific lock
+ * held, then the waiting thread must also acquire the activity-specific lock
+ * first.
+ */
+
+static int
+spa_activity_in_progress(spa_t *spa, zpool_wait_activity_t activity,
+ boolean_t use_tag, uint64_t tag, boolean_t *in_progress)
+{
+ int error = 0;
+
+ ASSERT(MUTEX_HELD(&spa->spa_activities_lock));
+
+ switch (activity) {
+ case ZPOOL_WAIT_CKPT_DISCARD:
+ *in_progress =
+ (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT) &&
+ zap_contains(spa_meta_objset(spa),
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT) ==
+ ENOENT);
+ break;
+ case ZPOOL_WAIT_FREE:
+ *in_progress = ((spa_version(spa) >= SPA_VERSION_DEADLISTS &&
+ !bpobj_is_empty(&spa->spa_dsl_pool->dp_free_bpobj)) ||
+ spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY) ||
+ spa_livelist_delete_check(spa));
+ break;
+ case ZPOOL_WAIT_INITIALIZE:
+ error = spa_vdev_initializing(spa, use_tag, tag, in_progress);
+ break;
+ case ZPOOL_WAIT_REPLACE:
+ mutex_exit(&spa->spa_activities_lock);
+ spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
+ mutex_enter(&spa->spa_activities_lock);
+
+ *in_progress = vdev_replace_in_progress(spa->spa_root_vdev);
+ spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+ break;
+ case ZPOOL_WAIT_REMOVE:
+ *in_progress = (spa->spa_removing_phys.sr_state ==
+ DSS_SCANNING);
+ break;
+ case ZPOOL_WAIT_RESILVER:
+ case ZPOOL_WAIT_SCRUB:
+ {
+ boolean_t scanning, paused, is_scrub;
+ dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
+
+ is_scrub = (scn->scn_phys.scn_func == POOL_SCAN_SCRUB);
+ scanning = (scn->scn_phys.scn_state == DSS_SCANNING);
+ paused = dsl_scan_is_paused_scrub(scn);
+ *in_progress = (scanning && !paused &&
+ is_scrub == (activity == ZPOOL_WAIT_SCRUB));
+ break;
+ }
+ default:
+ panic("unrecognized value for activity %d", activity);
+ }
+
+ return (error);
+}
+
+static int
+spa_wait_common(const char *pool, zpool_wait_activity_t activity,
+ boolean_t use_tag, uint64_t tag, boolean_t *waited)
+{
+ /*
+ * The tag is used to distinguish between instances of an activity.
+ * 'initialize' is the only activity that we use this for. The other
+ * activities can only have a single instance in progress in a pool at
+ * one time, making the tag unnecessary.
+ *
+ * There can be multiple devices being replaced at once, but since they
+ * all finish once resilvering finishes, we don't bother keeping track
+ * of them individually, we just wait for them all to finish.
+ */
+ if (use_tag && activity != ZPOOL_WAIT_INITIALIZE)
+ return (EINVAL);
+
+ if (activity < 0 || activity >= ZPOOL_WAIT_NUM_ACTIVITIES)
+ return (EINVAL);
+
+ spa_t *spa;
+ int error = spa_open(pool, &spa, FTAG);
+ if (error != 0)
+ return (error);
+
+ /*
+ * Increment the spa's waiter count so that we can call spa_close and
+ * still ensure that the spa_t doesn't get freed before this thread is
+ * finished with it when the pool is exported. We want to call spa_close
+ * before we start waiting because otherwise the additional ref would
+ * prevent the pool from being exported or destroyed throughout the
+ * potentially long wait.
+ */
+ mutex_enter(&spa->spa_activities_lock);
+ spa->spa_waiters++;
+ spa_close(spa, FTAG);
+
+ *waited = B_FALSE;
+ for (;;) {
+ boolean_t in_progress;
+ error = spa_activity_in_progress(spa, activity, use_tag, tag,
+ &in_progress);
+
+ if (!in_progress || spa->spa_waiters_cancel || error)
+ break;
+
+ *waited = B_TRUE;
+
+ if (cv_wait_sig(&spa->spa_activities_cv,
+ &spa->spa_activities_lock) == 0) {
+ error = EINTR;
+ break;
+ }
+ }
+
+ spa->spa_waiters--;
+ cv_signal(&spa->spa_waiters_cv);
+ mutex_exit(&spa->spa_activities_lock);
+
+ return (error);
+}
+
+/*
+ * Wait for a particular instance of the specified activity to complete, where
+ * the instance is identified by 'tag'
+ */
+int
+spa_wait_tag(const char *pool, zpool_wait_activity_t activity, uint64_t tag,
+ boolean_t *waited)
+{
+ return (spa_wait_common(pool, activity, B_TRUE, tag, waited));
+}
+
+/*
+ * Wait for all instances of the specified activity complete
+ */
+int
+spa_wait(const char *pool, zpool_wait_activity_t activity, boolean_t *waited)
+{
+
+ return (spa_wait_common(pool, activity, B_FALSE, 0, waited));
+}
+
sysevent_t *
spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name)
{