diff options
author | Brian Behlendorf <behlendorf1@llnl.gov> | 2020-07-03 18:05:50 +0000 |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-07-03 18:05:50 +0000 |
commit | 9a49d3f3d3bfa26df4e5e54d574cb490f0ee284b (patch) | |
tree | 715c2fa00e55762764cadef8460da09f919910ad | |
parent | 7ddb753d17f2c12f152647c0e34eb9c42ee5e4af (diff) | |
download | src-9a49d3f3d3bfa26df4e5e54d574cb490f0ee284b.tar.gz src-9a49d3f3d3bfa26df4e5e54d574cb490f0ee284b.zip |
65 files changed, 3281 insertions, 362 deletions
diff --git a/cmd/zed/agents/zfs_mod.c b/cmd/zed/agents/zfs_mod.c index 1094d25dd34e..8d0a3b420086 100644 --- a/cmd/zed/agents/zfs_mod.c +++ b/cmd/zed/agents/zfs_mod.c @@ -437,7 +437,7 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled) return; } - ret = zpool_vdev_attach(zhp, fullpath, path, nvroot, B_TRUE); + ret = zpool_vdev_attach(zhp, fullpath, path, nvroot, B_TRUE, B_FALSE); zed_log_msg(LOG_INFO, " zpool_vdev_replace: %s with %s (%s)", fullpath, path, (ret == 0) ? "no errors" : diff --git a/cmd/zed/agents/zfs_retire.c b/cmd/zed/agents/zfs_retire.c index f3dbb24b84eb..665fb216d507 100644 --- a/cmd/zed/agents/zfs_retire.c +++ b/cmd/zed/agents/zfs_retire.c @@ -237,7 +237,7 @@ replace_with_spare(fmd_hdl_t *hdl, zpool_handle_t *zhp, nvlist_t *vdev) dev_name, basename(spare_name)); if (zpool_vdev_attach(zhp, dev_name, spare_name, - replacement, B_TRUE) == 0) { + replacement, B_TRUE, B_FALSE) == 0) { free(dev_name); nvlist_free(replacement); return (B_TRUE); @@ -319,12 +319,16 @@ zfs_retire_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, fmd_hdl_debug(hdl, "zfs_retire_recv: '%s'", class); + nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE, &state); + /* * If this is a resource notifying us of device removal then simply * check for an available spare and continue unless the device is a * l2arc vdev, in which case we just offline it. */ - if (strcmp(class, "resource.fs.zfs.removed") == 0) { + if (strcmp(class, "resource.fs.zfs.removed") == 0 || + (strcmp(class, "resource.fs.zfs.statechange") == 0 && + state == VDEV_STATE_REMOVED)) { char *devtype; char *devname; @@ -365,8 +369,7 @@ zfs_retire_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, * healthy ones so we need to confirm the actual state value. */ if (strcmp(class, "resource.fs.zfs.statechange") == 0 && - nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE, - &state) == 0 && state == VDEV_STATE_HEALTHY) { + state == VDEV_STATE_HEALTHY) { zfs_vdev_repair(hdl, nvl); return; } diff --git a/cmd/zed/zed.d/resilver_finish-start-scrub.sh b/cmd/zed/zed.d/resilver_finish-start-scrub.sh index 6f9c0b309467..c7cfd1ddba80 100755 --- a/cmd/zed/zed.d/resilver_finish-start-scrub.sh +++ b/cmd/zed/zed.d/resilver_finish-start-scrub.sh @@ -5,10 +5,12 @@ # Exit codes: # 1: Internal error # 2: Script wasn't enabled in zed.rc +# 3: Scrubs are automatically started for sequential resilvers [ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc" . "${ZED_ZEDLET_DIR}/zed-functions.sh" [ "${ZED_SCRUB_AFTER_RESILVER}" = "1" ] || exit 2 +[ "${ZEVENT_RESILVER_TYPE}" != "sequential" ] || exit 3 [ -n "${ZEVENT_POOL}" ] || exit 1 [ -n "${ZEVENT_SUBCLASS}" ] || exit 1 zed_check_cmd "${ZPOOL}" || exit 1 diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index ee6c479eb3bd..cdf5511fe75b 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -337,7 +337,7 @@ get_usage(zpool_help_t idx) return (gettext("\tadd [-fgLnP] [-o property=value] " "<pool> <vdev> ...\n")); case HELP_ATTACH: - return (gettext("\tattach [-fw] [-o property=value] " + return (gettext("\tattach [-fsw] [-o property=value] " "<pool> <device> <new-device>\n")); case HELP_CLEAR: return (gettext("\tclear [-nF] <pool> [device]\n")); @@ -380,7 +380,7 @@ get_usage(zpool_help_t idx) case HELP_ONLINE: return (gettext("\tonline [-e] <pool> <device> ...\n")); case HELP_REPLACE: - return (gettext("\treplace [-fw] [-o property=value] " + return (gettext("\treplace [-fsw] [-o property=value] " "<pool> <device> [new-device]\n")); case HELP_REMOVE: return (gettext("\tremove [-npsw] <pool> <device> ...\n")); @@ -2077,10 +2077,10 @@ health_str_to_color(const char *health) */ static void print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name, - nvlist_t *nv, int depth, boolean_t isspare) + nvlist_t *nv, int depth, boolean_t isspare, vdev_rebuild_stat_t *vrs) { nvlist_t **child, *root; - uint_t c, children; + uint_t c, i, children; pool_scan_stat_t *ps = NULL; vdev_stat_t *vs; char rbuf[6], wbuf[6], cbuf[6]; @@ -2266,6 +2266,14 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name, } } + /* The top-level vdevs have the rebuild stats */ + if (vrs != NULL && vrs->vrs_state == VDEV_REBUILD_ACTIVE && + children == 0) { + if (vs->vs_rebuild_processed != 0) { + (void) printf(gettext(" (resilvering)")); + } + } + if (cb->vcdl != NULL) { if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0) { printf(" "); @@ -2295,11 +2303,17 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name, if (nvlist_exists(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS)) continue; + /* Provide vdev_rebuild_stats to children if available */ + if (vrs == NULL) { + (void) nvlist_lookup_uint64_array(nv, + ZPOOL_CONFIG_REBUILD_STATS, + (uint64_t **)&vrs, &i); + } + vname = zpool_vdev_name(g_zfs, zhp, child[c], cb->cb_name_flags | VDEV_NAME_TYPE_ID); - print_status_config(zhp, cb, vname, child[c], depth + 2, - isspare); + isspare, vrs); free(vname); } } @@ -2468,7 +2482,7 @@ print_class_vdevs(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t *nv, cb->cb_name_flags | VDEV_NAME_TYPE_ID); if (cb->cb_print_status) print_status_config(zhp, cb, name, child[c], 2, - B_FALSE); + B_FALSE, NULL); else print_import_config(cb, name, child[c], 2); free(name); @@ -2622,6 +2636,7 @@ show_import(nvlist_t *config) break; case ZPOOL_STATUS_RESILVERING: + case ZPOOL_STATUS_REBUILDING: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices were " "being resilvered.\n")); @@ -6118,6 +6133,7 @@ static int zpool_do_attach_or_replace(int argc, char **argv, int replacing) { boolean_t force = B_FALSE; + boolean_t rebuild = B_FALSE; boolean_t wait = B_FALSE; int c; nvlist_t *nvroot; @@ -6128,7 +6144,7 @@ zpool_do_attach_or_replace(int argc, char **argv, int replacing) int ret; /* check options */ - while ((c = getopt(argc, argv, "fo:w")) != -1) { + while ((c = getopt(argc, argv, "fo:sw")) != -1) { switch (c) { case 'f': force = B_TRUE; @@ -6146,6 +6162,9 @@ zpool_do_attach_or_replace(int argc, char **argv, int replacing) (add_prop_list(optarg, propval, &props, B_TRUE))) usage(B_FALSE); break; + case 's': + rebuild = B_TRUE; + break; case 'w': wait = B_TRUE; break; @@ -6230,7 +6249,8 @@ zpool_do_attach_or_replace(int argc, char **argv, int replacing) return (1); } - ret = zpool_vdev_attach(zhp, old_disk, new_disk, nvroot, replacing); + ret = zpool_vdev_attach(zhp, old_disk, new_disk, nvroot, replacing, + rebuild); if (ret == 0 && wait) ret = zpool_wait(zhp, @@ -6244,9 +6264,10 @@ zpool_do_attach_or_replace(int argc, char **argv, int replacing) } /* - * zpool replace [-fw] [-o property=value] <pool> <device> <new_device> + * zpool replace [-fsw] [-o property=value] <pool> <device> <new_device> * * -f Force attach, even if <new_device> appears to be in use. + * -s Use sequential instead of healing reconstruction for resilver. * -o Set property=value. * -w Wait for replacing to complete before returning * @@ -6260,9 +6281,10 @@ zpool_do_replace(int argc, char **argv) } /* - * zpool attach [-fw] [-o property=value] <pool> <device> <new_device> + * zpool attach [-fsw] [-o property=value] <pool> <device> <new_device> * * -f Force attach, even if <new_device> appears to be in use. + * -s Use sequential instead of healing reconstruction for resilver. * -o Set property=value. * -w Wait for resilvering to complete before returning * @@ -7132,19 +7154,40 @@ zpool_do_trim(int argc, char **argv) } /* + * Converts a total number of seconds to a human readable string broken + * down in to days/hours/minutes/seconds. + */ +static void +secs_to_dhms(uint64_t total, char *buf) +{ + uint64_t days = total / 60 / 60 / 24; + uint64_t hours = (total / 60 / 60) % 24; + uint64_t mins = (total / 60) % 60; + uint64_t secs = (total % 60); + + if (days > 0) { + (void) sprintf(buf, "%llu days %02llu:%02llu:%02llu", + (u_longlong_t)days, (u_longlong_t)hours, + (u_longlong_t)mins, (u_longlong_t)secs); + } else { + (void) sprintf(buf, "%02llu:%02llu:%02llu", + (u_longlong_t)hours, (u_longlong_t)mins, + (u_longlong_t)secs); + } +} + +/* * Print out detailed scrub status. */ static void -print_scan_status(pool_scan_stat_t *ps) +print_scan_scrub_resilver_status(pool_scan_stat_t *ps) { time_t start, end, pause; - uint64_t total_secs_left; - uint64_t elapsed, secs_left, mins_left, hours_left, days_left; uint64_t pass_scanned, scanned, pass_issued, issued, total; - uint64_t scan_rate, issue_rate; + uint64_t elapsed, scan_rate, issue_rate; double fraction_done; char processed_buf[7], scanned_buf[7], issued_buf[7], total_buf[7]; - char srate_buf[7], irate_buf[7]; + char srate_buf[7], irate_buf[7], time_buf[32]; printf(" "); printf_color(ANSI_BOLD, gettext("scan:")); @@ -7168,26 +7211,18 @@ print_scan_status(pool_scan_stat_t *ps) /* Scan is finished or canceled. */ if (ps->pss_state == DSS_FINISHED) { - total_secs_left = end - start; - days_left = total_secs_left / 60 / 60 / 24; - hours_left = (total_secs_left / 60 / 60) % 24; - mins_left = (total_secs_left / 60) % 60; - secs_left = (total_secs_left % 60); + secs_to_dhms(end - start, time_buf); if (ps->pss_func == POOL_SCAN_SCRUB) { (void) printf(gettext("scrub repaired %s " - "in %llu days %02llu:%02llu:%02llu " - "with %llu errors on %s"), processed_buf, - (u_longlong_t)days_left, (u_longlong_t)hours_left, - (u_longlong_t)mins_left, (u_longlong_t)secs_left, - (u_longlong_t)ps->pss_errors, ctime(&end)); + "in %s with %llu errors on %s"), processed_buf, + time_buf, (u_longlong_t)ps->pss_errors, + ctime(&end)); } else if (ps->pss_func == POOL_SCAN_RESILVER) { (void) printf(gettext("resilvered %s " - "in %llu days %02llu:%02llu:%02llu " - "with %llu errors on %s"), processed_buf, - (u_longlong_t)days_left, (u_longlong_t)hours_left, - (u_longlong_t)mins_left, (u_longlong_t)secs_left, - (u_longlong_t)ps->pss_errors, ctime(&end)); + "in %s with %llu errors on %s"), processed_buf, + time_buf, (u_longlong_t)ps->pss_errors, + ctime(&end)); } return; } else if (ps->pss_state == DSS_CANCELED) { @@ -7235,13 +7270,9 @@ print_scan_status(pool_scan_stat_t *ps) scan_rate = pass_scanned / elapsed; issue_rate = pass_issued / elapsed; - total_secs_left = (issue_rate != 0 && total >= issued) ? + uint64_t total_secs_left = (issue_rate != 0 && total >= issued) ? ((total - issued) / issue_rate) : UINT64_MAX; - - days_left = total_secs_left / 60 / 60 / 24; - hours_left = (total_secs_left / 60 / 60) % 24; - mins_left = (total_secs_left / 60) % 60; - secs_left = (total_secs_left % 60); + secs_to_dhms(total_secs_left, time_buf); /* format all of the numbers we will be reporting */ zfs_nicebytes(scanned, scanned_buf, sizeof (scanned_buf)); @@ -7271,10 +7302,84 @@ print_scan_status(pool_scan_stat_t *ps) if (pause == 0) { if (total_secs_left != UINT64_MAX && issue_rate >= 10 * 1024 * 1024) { - (void) printf(gettext(", %llu days " - "%02llu:%02llu:%02llu to go\n"), - (u_longlong_t)days_left, (u_longlong_t)hours_left, - (u_longlong_t)mins_left, (u_longlong_t)secs_left); + (void) printf(gettext(", %s to go\n"), time_buf); + } else { + (void) printf(gettext(", no estimated " + "completion time\n")); + } + } else { + (void) printf(gettext("\n")); + } +} + +static void +print_rebuild_status_impl(vdev_rebuild_stat_t *vrs, char *vdev_name) +{ + if (vrs == NULL || vrs->vrs_state == VDEV_REBUILD_NONE) + return; + + printf(" "); + printf_color(ANSI_BOLD, gettext("scan:")); + printf(" "); + + uint64_t bytes_scanned = vrs->vrs_bytes_scanned; + uint64_t bytes_issued = vrs->vrs_bytes_issued; + uint64_t bytes_rebuilt = vrs->vrs_bytes_rebuilt; + uint64_t bytes_est = vrs->vrs_bytes_est; + uint64_t scan_rate = (vrs->vrs_pass_bytes_scanned / + (vrs->vrs_pass_time_ms + 1)) * 1000; + uint64_t issue_rate = (vrs->vrs_pass_bytes_issued / + (vrs->vrs_pass_time_ms + 1)) * 1000; + double scan_pct = MIN((double)bytes_scanned * 100 / + (bytes_est + 1), 100); + + /* Format all of the numbers we will be reporting */ + char bytes_scanned_buf[7], bytes_issued_buf[7]; + char bytes_rebuilt_buf[7], bytes_est_buf[7]; + char scan_rate_buf[7], issue_rate_buf[7], time_buf[32]; + zfs_nicebytes(bytes_scanned, bytes_scanned_buf, + sizeof (bytes_scanned_buf)); + zfs_nicebytes(bytes_issued, bytes_issued_buf, + sizeof (bytes_issued_buf)); + zfs_nicebytes(bytes_rebuilt, bytes_rebuilt_buf, + sizeof (bytes_rebuilt_buf)); + zfs_nicebytes(bytes_est, bytes_est_buf, sizeof (bytes_est_buf)); + zfs_nicebytes(scan_rate, scan_rate_buf, sizeof (scan_rate_buf)); + zfs_nicebytes(issue_rate, issue_rate_buf, sizeof (issue_rate_buf)); + + time_t start = vrs->vrs_start_time; + time_t end = vrs->vrs_end_time; + + /* Rebuild is finished or canceled. */ + if (vrs->vrs_state == VDEV_REBUILD_COMPLETE) { + secs_to_dhms(vrs->vrs_scan_time_ms / 1000, time_buf); + (void) printf(gettext("resilvered (%s) %s in %s " + "with %llu errors on %s"), vdev_name, bytes_rebuilt_buf, + time_buf, (u_longlong_t)vrs->vrs_errors, ctime(&end)); + return; + } else if (vrs->vrs_state == VDEV_REBUILD_CANCELED) { + (void) printf(gettext("resilver (%s) canceled on %s"), + vdev_name, ctime(&end)); + return; + } else if (vrs->vrs_state == VDEV_REBUILD_ACTIVE) { + (void) printf(gettext("resilver (%s) in progress since %s"), + vdev_name, ctime(&start)); + } + + assert(vrs->vrs_state == VDEV_REBUILD_ACTIVE); + + secs_to_dhms(MAX((int64_t)bytes_est - (int64_t)bytes_scanned, 0) / + MAX(scan_rate, 1), time_buf); + + (void) printf(gettext("\t%s scanned at %s/s, %s issued %s/s, " + "%s total\n"), bytes_scanned_buf, scan_rate_buf, + bytes_issued_buf, issue_rate_buf, bytes_est_buf); + (void) printf(gettext("\t%s resilvered, %.2f%% done"), + bytes_rebuilt_buf, scan_pct); + + if (vrs->vrs_state == VDEV_REBUILD_ACTIVE) { + if (scan_rate >= 10 * 1024 * 1024) { + (void) printf(gettext(", %s to go\n"), time_buf); } else { (void) printf(gettext(", no estimated " "completion time\n")); @@ -7285,9 +7390,38 @@ print_scan_status(pool_scan_stat_t *ps) } /* - * As we don't scrub checkpointed blocks, we want to warn the - * user that we skipped scanning some blocks if a checkpoint exists - * or existed at any time during the scan. + * Print rebuild status for top-level vdevs. + */ +static void +print_rebuild_status(zpool_handle_t *zhp, nvlist_t *nvroot) +{ + nvlist_t **child; + uint_t children; + + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, + &child, &children) != 0) + children = 0; + + for (uint_t c = 0; c < children; c++) { + vdev_rebuild_stat_t *vrs; + uint_t i; + + if (nvlist_lookup_uint64_array(child[c], + ZPOOL_CONFIG_REBUILD_STATS, (uint64_t **)&vrs, &i) == 0) { + char *name = zpool_vdev_name(g_zfs, zhp, + child[c], VDEV_NAME_TYPE_ID); + print_rebuild_status_impl(vrs, name); + free(name); + } + } +} + +/* + * As we don't scrub checkpointed blocks, we want to warn the user that we + * skipped scanning some blocks if a checkpoint exists or existed at any + * time during the scan. If a sequential instead of healing reconstruction + * was performed then the blocks were reconstructed. However, their checksums + * have not been verified so we still print the warning. */ static void print_checkpoint_scan_warning(pool_scan_stat_t *ps, pool_checkpoint_stat_t *pcs) @@ -7319,6 +7453,95 @@ print_checkpoint_scan_warning(pool_scan_stat_t *ps, pool_checkpoint_stat_t *pcs) } /* + * Returns B_TRUE if there is an active rebuild in progress. Otherwise, + * B_FALSE is returned and 'rebuild_end_time' is set to the end time for + * the last completed (or cancelled) rebuild. + */ +static boolean_t +check_rebuilding(nvlist_t *nvroot, uint64_t *rebuild_end_time) +{ + nvlist_t **child; + uint_t children; + boolean_t rebuilding = B_FALSE; + uint64_t end_time = 0; + + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, + &child, &children) != 0) + children = 0; + + for (uint_t c = 0; c < children; c++) { + vdev_rebuild_stat_t *vrs; + uint_t i; + + if (nvlist_lookup_uint64_array(child[c], + ZPOOL_CONFIG_REBUILD_STATS, (uint64_t **)&vrs, &i) == 0) { + + if (vrs->vrs_end_time > end_time) + end_time = vrs->vrs_end_time; + + if (vrs->vrs_state == VDEV_REBUILD_ACTIVE) { + rebuilding = B_TRUE; + end_time = 0; + break; + } + } + } + + if (rebuild_end_time != NULL) + *rebuild_end_time = end_time; + + return (rebuilding); +} + +/* + * Print the scan status. + */ +static void +print_scan_status(zpool_handle_t *zhp, nvlist_t *nvroot) +{ + uint64_t rebuild_end_time = 0, resilver_end_time = 0; + boolean_t have_resilver = B_FALSE, have_scrub = B_FALSE; + boolean_t active_resilver = B_FALSE; + pool_checkpoint_stat_t *pcs = NULL; + pool_scan_stat_t *ps = NULL; + uint_t c; + + if (nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_SCAN_STATS, + (uint64_t **)&ps, &c) == 0) { + if (ps->pss_func == POOL_SCAN_RESILVER) { + resilver_end_time = ps->pss_end_time; + active_resilver = (ps->pss_state == DSS_SCANNING); + } + + have_resilver = (ps->pss_func == POOL_SCAN_RESILVER); + have_scrub = (ps->pss_func == POOL_SCAN_SCRUB); + } + + boolean_t active_rebuild = check_rebuilding(nvroot, &rebuild_end_time); + boolean_t have_rebuild = (active_rebuild || (rebuild_end_time > 0)); + + /* Always print the scrub status when available. */ + if (have_scrub) + print_scan_scrub_resilver_status(ps); + + /* + * When there is an active resilver or rebuild print its status. + * Otherwise print the status of the last resilver or rebuild. + */ + if (active_resilver || (!active_rebuild && have_resilver && + resilver_end_time && resilver_end_time > rebuild_end_time)) { + print_scan_scrub_resilver_status(ps); + } else if (active_rebuild || (!active_resilver && have_rebuild && + rebuild_end_time && rebuild_end_time > resilver_end_time)) { + print_rebuild_status(zhp, nvroot); + } + + (void) nvlist_lookup_uint64_array(nvroot, + ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t **)&pcs, &c); + print_checkpoint_scan_warning(ps, pcs); +} + +/* * Print out detailed removal status. */ static void @@ -7504,7 +7727,7 @@ print_spares(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t **spares, for (i = 0; i < nspares; i++) { name = zpool_vdev_name(g_zfs, zhp, spares[i], cb->cb_name_flags); - print_status_config(zhp, cb, name, spares[i], 2, B_TRUE); + print_status_config(zhp, cb, name, spares[i], 2, B_TRUE, NULL); free(name); } } @@ -7524,7 +7747,8 @@ print_l2cache(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t **l2cache, for (i = 0; i < nl2cache; i++) { name = zpool_vdev_name(g_zfs, zhp, l2cache[i], cb->cb_name_flags); - print_status_config(zhp, cb, name, l2cache[i], 2, B_FALSE); + print_status_config(zhp, cb, name, l2cache[i], 2, + B_FALSE, NULL); free(name); } } @@ -7718,6 +7942,7 @@ status_callback(zpool_handle_t *zhp, void *data) break; case ZPOOL_STATUS_RESILVERING: + case ZPOOL_STATUS_REBUILDING: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices is " "currently being resilvered. The pool will\n\tcontinue " @@ -7727,6 +7952,16 @@ status_callback(zpool_handle_t *zhp, void *data) "complete.\n")); break; + case ZPOOL_STATUS_REBUILD_SCRUB: + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("One or more devices have " + "been sequentially resilvered, scrubbing\n\tthe pool " + "is recommended.\n")); + printf_color(ANSI_BOLD, gettext("action: ")); + printf_color(ANSI_YELLOW, gettext("Use 'zpool scrub' to " + "verify all data checksums.\n")); + break; + case ZPOOL_STATUS_CORRUPT_DATA: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices has " @@ -7951,18 +8186,16 @@ status_callback(zpool_handle_t *zhp, void *data) nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; pool_checkpoint_stat_t *pcs = NULL; - pool_scan_stat_t *ps = NULL; pool_removal_stat_t *prs = NULL; - (void) nvlist_lookup_uint64_array(nvroot, - ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t **)&pcs, &c); - (void) nvlist_lookup_uint64_array(nvroot, - ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&ps, &c); + print_scan_status(zhp, nvroot); + (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_REMOVAL_STATS, (uint64_t **)&prs, &c); - print_scan_status(ps); - print_checkpoint_scan_warning(ps, pcs); print_removal_status(zhp, prs); + + (void) nvlist_lookup_uint64_array(nvroot, + ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t **)&pcs, &c); print_checkpoint_status(pcs); cbp->cb_namewidth = max_width(zhp, nvroot, 0, 0, @@ -7987,7 +8220,7 @@ status_callback(zpool_handle_t *zhp, void *data) printf("\n"); print_status_config(zhp, cbp, zpool_get_name(zhp), nvroot, 0, - B_FALSE); + B_FALSE, NULL); print_class_vdevs(zhp, cbp, nvroot, VDEV_ALLOC_BIAS_DEDUP); print_class_vdevs(zhp, cbp, nvroot, VDEV_ALLOC_BIAS_SPECIAL); @@ -9543,6 +9776,36 @@ vdev_activity_remaining(nvlist_t *nv, zpool_wait_activity_t activity) return (bytes_remaining); } +/* Add up the total number of bytes left to rebuild across top-level vdevs */ +static uint64_t +vdev_activity_top_remaining(nvlist_t *nv) +{ + uint64_t bytes_remaining = 0; + nvlist_t **child; + uint_t children; + int error; + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + &child, &children) != 0) + children = 0; + + for (uint_t c = 0; c < children; c++) { + vdev_rebuild_stat_t *vrs; + uint_t i; + + error = nvlist_lookup_uint64_array(child[c], + ZPOOL_CONFIG_REBUILD_STATS, (uint64_t **)&vrs, &i); + if (error == 0) { + if (vrs->vrs_state == VDEV_REBUILD_ACTIVE) { + bytes_remaining += (vrs->vrs_bytes_est - + vrs->vrs_bytes_rebuilt); + } + } + } + + return (bytes_remaining); +} + /* Whether any vdevs are 'spare' or 'replacing' vdevs */ static boolean_t vdev_any_spare_replacing(nvlist_t *nv) @@ -9652,6 +9915,9 @@ print_wait_status_row(wait_data_t *wd, zpool_handle_t *zhp, int row) bytes_rem[ZPOOL_WAIT_SCRUB] = rem; else bytes_rem[ZPOOL_WAIT_RESILVER] = rem; + } else if (check_rebuilding(nvroot, NULL)) { + bytes_rem[ZPOOL_WAIT_RESILVER] = + vdev_activity_top_remaining(nvroot); } bytes_rem[ZPOOL_WAIT_INITIALIZE] = diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c index ce748da189a3..ca38271cc4ac 100644 --- a/cmd/ztest/ztest.c +++ b/cmd/ztest/ztest.c @@ -3507,7 +3507,16 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) root = make_vdev_root(newpath, NULL, NULL, newvd == NULL ? newsize : 0, ashift, NULL, 0, 0, 1); - error = spa_vdev_attach(spa, oldguid, root, replacing); + /* + * When supported select either a healing or sequential resilver. + */ + boolean_t rebuilding = B_FALSE; + if (pvd->vdev_ops == &vdev_mirror_ops || + pvd->vdev_ops == &vdev_root_ops) { + rebuilding = !!ztest_random(2); + } + + error = spa_vdev_attach(spa, oldguid, root, replacing, rebuilding); nvlist_free(root); @@ -3527,10 +3536,11 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) expected_error = error; if (error == ZFS_ERR_CHECKPOINT_EXISTS || - error == ZFS_ERR_DISCARDING_CHECKPOINT) + error == ZFS_ERR_DISCARDING_CHECKPOINT || + error == ZFS_ERR_RESILVER_IN_PROGRESS || + error == ZFS_ERR_REBUILD_IN_PROGRESS) expected_error = error; - /* XXX workaround 6690467 */ if (error != expected_error && expected_error != EBUSY) { fatal(0, "attach (%s %llu, %s %llu, %d) " "returned %d, expected %d", diff --git a/configure.ac b/configure.ac index e405ddb57bb6..c7f813d1927c 100644 --- a/configure.ac +++ b/configure.ac @@ -368,7 +368,6 @@ AC_CONFIG_FILES([ tests/zfs-tests/tests/functional/rename_dirs/Makefile tests/zfs-tests/tests/functional/replacement/Makefile tests/zfs-tests/tests/functional/reservation/Makefile - tests/zfs-tests/tests/functional/resilver/Makefile tests/zfs-tests/tests/functional/rootpool/Makefile tests/zfs-tests/tests/functional/rsend/Makefile tests/zfs-tests/tests/functional/scrub_mirror/Makefile diff --git a/contrib/pyzfs/libzfs_core/_constants.py b/contrib/pyzfs/libzfs_core/_constants.py index 5c285164b976..50dca67f3a6f 100644 --- a/contrib/pyzfs/libzfs_core/_constants.py +++ b/contrib/pyzfs/libzfs_core/_constants.py @@ -95,6 +95,8 @@ zfs_errno = enum_with_offset(1024, [ 'ZFS_ERR_EXPORT_IN_PROGRESS', 'ZFS_ERR_BOOKMARK_SOURCE_NOT_ANCESTOR', 'ZFS_ERR_STREAM_TRUNCATED', + 'ZFS_ERR_RESILVER_IN_PROGRESS', + 'ZFS_ERR_REBUILD_IN_PROGRESS', ], {} ) diff --git a/include/libzfs.h b/include/libzfs.h index 64a0a203501a..873e8f3046fb 100644 --- a/include/libzfs.h +++ b/include/libzfs.h @@ -79,7 +79,7 @@ typedef enum zfs_error { EZFS_NODEVICE, /* no such device in pool */ EZFS_BADDEV, /* invalid device to add */ EZFS_NOREPLICAS, /* no valid replicas */ - EZFS_RESILVERING, /* currently resilvering */ + EZFS_RESILVERING, /* resilvering (healing reconstruction) */ EZFS_BADVERSION, /* unsupported version */ EZFS_POOLUNAVAIL, /* pool is currently unavailable */ EZFS_DEVOVERFLOW, /* too many devices in one vdev */ @@ -148,6 +148,7 @@ typedef enum zfs_error { EZFS_TRIM_NOTSUP, /* device does not support trim */ EZFS_NO_RESILVER_DEFER, /* pool doesn't support resilver_defer */ EZFS_EXPORT_IN_PROGRESS, /* currently exporting the pool */ + EZFS_REBUILDING, /* resilvering (sequential reconstrution) */ EZFS_UNKNOWN } zfs_error_t; @@ -297,7 +298,7 @@ extern int zpool_vdev_online(zpool_handle_t *, const char *, int, vdev_state_t *); extern int zpool_vdev_offline(zpool_handle_t *, const char *, boolean_t); extern int zpool_vdev_attach(zpool_handle_t *, const char *, - const char *, nvlist_t *, int); + const char *, nvlist_t *, int, boolean_t); extern int zpool_vdev_detach(zpool_handle_t *, const char *); extern int zpool_vdev_remove(zpool_handle_t *, const char *); extern int zpool_vdev_remove_cancel(zpool_handle_t *); @@ -387,6 +388,8 @@ typedef enum { ZPOOL_STATUS_RESILVERING, /* device being resilvered */ ZPOOL_STATUS_OFFLINE_DEV, /* device offline */ ZPOOL_STATUS_REMOVED_DEV, /* removed device */ + ZPOOL_STATUS_REBUILDING, /* device being rebuilt */ + ZPOOL_STATUS_REBUILD_SCRUB, /* recommend scrubbing the pool */ /* * Finally, the following indicates a healthy pool. diff --git a/include/sys/Makefile.am b/include/sys/Makefile.am index ce781aa4cffe..0659c6419dfc 100644 --- a/include/sys/Makefile.am +++ b/include/sys/Makefile.am @@ -89,6 +89,7 @@ COMMON_H = \ vdev_initialize.h \ vdev_raidz.h \ vdev_raidz_impl.h \ + vdev_rebuild.h \ vdev_removal.h \ vdev_trim.h \ xvattr.h \ diff --git a/include/sys/dsl_scan.h b/include/sys/dsl_scan.h index bcb896da373b..8f929207d2d7 100644 --- a/include/sys/dsl_scan.h +++ b/include/sys/dsl_scan.h @@ -42,6 +42,8 @@ struct dsl_dataset; struct dsl_pool; struct dmu_tx; +extern int zfs_scan_suspend_progress; + /* * All members of this structure must be uint64_t, for byteswap * purposes. diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index 575a4af51439..1bfd7a485abe 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -704,6 +704,7 @@ typedef struct zpool_load_policy { #define ZPOOL_CONFIG_SPLIT_LIST "guid_list" #define ZPOOL_CONFIG_REMOVING "removing" #define ZPOOL_CONFIG_RESILVER_TXG "resilver_txg" +#define ZPOOL_CONFIG_REBUILD_TXG "rebuild_txg" #define ZPOOL_CONFIG_COMMENT "comment" #define ZPOOL_CONFIG_SUSPENDED "suspended" /* not stored on disk */ #define ZPOOL_CONFIG_SUSPENDED_REASON "suspended_reason" /* not stored */ @@ -730,6 +731,7 @@ typedef struct zpool_load_policy { #define ZPOOL_CONFIG_MMP_HOSTID "mmp_hostid" /* not stored on disk */ #define ZPOOL_CONFIG_ALLOCATION_BIAS "alloc_bias" /* not stored on disk */ #define ZPOOL_CONFIG_EXPANSION_TIME "expansion_time" /* not stored */ +#define ZPOOL_CONFIG_REBUILD_STATS "org.openzfs:rebuild_stats" /* * The persistent vdev state is stored as separate values rather than a single @@ -778,6 +780,9 @@ typedef struct zpool_load_policy { #define VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS \ "com.delphix:ms_unflushed_phys_txgs" +#define VDEV_TOP_ZAP_VDEV_REBUILD_PHYS \ + "org.openzfs:vdev_rebuild" + #define VDEV_TOP_ZAP_ALLOCATION_BIAS \ "org.zfsonlinux:allocation_bias" @@ -991,6 +996,21 @@ typedef enum dsl_scan_state { DSS_NUM_STATES } dsl_scan_state_t; +typedef struct vdev_rebuild_stat { + uint64_t vrs_state; /* vdev_rebuild_state_t */ + uint64_t vrs_start_time; /* time_t */ + uint64_t vrs_end_time; /* time_t */ + uint64_t vrs_scan_time_ms; /* total run time (millisecs) */ + uint64_t vrs_bytes_scanned; /* allocated bytes scanned */ + uint64_t vrs_bytes_issued; /* read bytes issued */ + uint64_t vrs_bytes_rebuilt; /* rebuilt bytes */ + uint64_t vrs_bytes_est; /* total bytes to scan */ + uint64_t vrs_errors; /* scanning errors */ + uint64_t vrs_pass_time_ms; /* pass run time (millisecs) */ + uint64_t vrs_pass_bytes_scanned; /* bytes scanned since start/resume */ + uint64_t vrs_pass_bytes_issued; /* bytes rebuilt since start/resume */ +} vdev_rebuild_stat_t; + /* * Errata described by https://zfsonlinux.org/msg/ZFS-8000-ER. The ordering * of this enum must be maintained to ensure the errata identifiers map to @@ -1047,6 +1067,7 @@ typedef struct vdev_stat { uint64_t vs_trim_bytes_est; /* total bytes to trim */ uint64_t vs_trim_state; /* vdev_trim_state_t */ uint64_t vs_trim_action_time; /* time_t */ + uint64_t vs_rebuild_processed; /* bytes rebuilt */ } vdev_stat_t; /* @@ -1178,6 +1199,13 @@ typedef enum { VDEV_TRIM_COMPLETE, } vdev_trim_state_t; +typedef enum { + VDEV_REBUILD_NONE, + VDEV_REBUILD_ACTIVE, + VDEV_REBUILD_CANCELED, + VDEV_REBUILD_COMPLETE, +} vdev_rebuild_state_t; + /* * nvlist name constants. Facilitate restricting snapshot iteration range for * the "list next snapshot" ioctl @@ -1337,6 +1365,8 @@ typedef enum { ZFS_ERR_BOOKMARK_SOURCE_NOT_ANCESTOR, ZFS_ERR_STREAM_TRUNCATED, ZFS_ERR_STREAM_LARGE_BLOCK_MISMATCH, + ZFS_ERR_RESILVER_IN_PROGRESS, + ZFS_ERR_REBUILD_IN_PROGRESS, } zfs_errno_t; /* @@ -1478,7 +1508,12 @@ typedef enum { * given payloads: * * ESC_ZFS_RESILVER_START - * ESC_ZFS_RESILVER_END + * ESC_ZFS_RESILVER_FINISH + * + * ZFS_EV_POOL_NAME DATA_TYPE_STRING + * ZFS_EV_POOL_GUID DATA_TYPE_UINT64 + * ZFS_EV_RESILVER_TYPE DATA_TYPE_STRING + * * ESC_ZFS_POOL_DESTROY * ESC_ZFS_POOL_REGUID * @@ -1532,6 +1567,7 @@ typedef enum { #define ZFS_EV_HIST_IOCTL "history_ioctl" #define ZFS_EV_HIST_DSNAME "history_dsname" #define ZFS_EV_HIST_DSID "history_dsid" +#define ZFS_EV_RESILVER_TYPE "resilver_type" #ifdef __cplusplus } diff --git a/include/sys/spa.h b/include/sys/spa.h index 5806dda418c5..9b96eb1f871f 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -790,17 +790,12 @@ extern int bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx); #define SPA_ASYNC_AUTOTRIM_RESTART 0x400 #define SPA_ASYNC_L2CACHE_REBUILD 0x800 #define SPA_ASYNC_L2CACHE_TRIM 0x1000 - -/* - * Controls the behavior of spa_vdev_remove(). - */ -#define SPA_REMOVE_UNSPARE 0x01 -#define SPA_REMOVE_DONE 0x02 +#define SPA_ASYNC_REBUILD_DONE 0x2000 /* device manipulation */ extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot); extern int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, - int replacing); + int replacing, int rebuild); extern int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done); extern int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare); @@ -988,6 +983,7 @@ extern int spa_config_held(spa_t *spa, int locks, krw_t rw); /* Pool vdev add/remove lock */ extern uint64_t spa_vdev_enter(spa_t *spa); +extern uint64_t spa_vdev_detach_enter(spa_t *spa, uint64_t guid); extern uint64_t spa_vdev_config_enter(spa_t *spa); extern void spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag); diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index 6481d5397792..2c52cb666f73 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -36,6 +36,7 @@ #include <sys/spa_checkpoint.h> #include <sys/spa_log_spacemap.h> #include <sys/vdev.h> +#include <sys/vdev_rebuild.h> #include <sys/vdev_removal.h> #include <sys/metaslab.h> #include <sys/dmu.h> diff --git a/include/sys/vdev.h b/include/sys/vdev.h index d93ef78f164d..a7e88063657e 100644 --- a/include/sys/vdev.h +++ b/include/sys/vdev.h @@ -73,7 +73,7 @@ extern boolean_t vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t d, extern boolean_t vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t d); extern boolean_t vdev_dtl_need_resilver(vdev_t *vd, uint64_t off, size_t size); extern void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, - int scrub_done); + boolean_t scrub_done, boolean_t rebuild_done); extern boolean_t vdev_dtl_required(vdev_t *vd); extern boolean_t vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp); diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index 56407a1914bc..b9298c62d629 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -38,6 +38,7 @@ #include <sys/uberblock_impl.h> #include <sys/vdev_indirect_mapping.h> #include <sys/vdev_indirect_births.h> +#include <sys/vdev_rebuild.h> #include <sys/vdev_removal.h> #include <sys/zfs_ratelimit.h> @@ -295,13 +296,26 @@ struct vdev { uint64_t vdev_trim_secure; /* requested secure TRIM */ uint64_t vdev_trim_action_time; /* start and end time */ - /* for limiting outstanding I/Os (initialize and TRIM) */ + /* Rebuild related */ + boolean_t vdev_rebuilding; + boolean_t vdev_rebuild_exit_wanted; + boolean_t vdev_rebuild_cancel_wanted; + boolean_t vdev_rebuild_reset_wanted; + kmutex_t vdev_rebuild_lock; + kcondvar_t vdev_rebuild_cv; + kthread_t *vdev_rebuild_thread; + vdev_rebuild_t vdev_rebuild_config; + + /* For limiting outstanding I/Os (initialize, TRIM, rebuild) */ kmutex_t vdev_initialize_io_lock; kcondvar_t vdev_initialize_io_cv; uint64_t vdev_initialize_inflight; kmutex_t vdev_trim_io_lock; kcondvar_t vdev_trim_io_cv; uint64_t vdev_trim_inflight[3]; + kmutex_t vdev_rebuild_io_lock; + kcondvar_t vdev_rebuild_io_cv; + uint64_t vdev_rebuild_inflight; /* * Values stored in the config for an indirect or removing vdev. @@ -358,6 +372,7 @@ struct vdev { uint64_t vdev_degraded; /* persistent degraded state */ uint64_t vdev_removed; /* persistent removed state */ uint64_t vdev_resilver_txg; /* persistent resilvering state */ + uint64_t vdev_rebuild_txg; /* persistent rebuilding state */ uint64_t vdev_nparity; /* number of parity devices for raidz */ char *vdev_path; /* vdev path (if any) */ char *vdev_devid; /* vdev devid (if any) */ diff --git a/include/sys/vdev_rebuild.h b/include/sys/vdev_rebuild.h new file mode 100644 index 000000000000..3d4b8cc46836 --- /dev/null +++ b/include/sys/vdev_rebuild.h @@ -0,0 +1,97 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2018, Intel Corporation. + * Copyright (c) 2020 by Lawrence Livermore National Security, LLC. + */ + +#ifndef _SYS_VDEV_REBUILD_H +#define _SYS_VDEV_REBUILD_H + +#include <sys/spa.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Number of entries in the physical vdev_rebuild_phys structure. This + * state is stored per top-level as VDEV_ZAP_TOP_VDEV_REBUILD_PHYS. + */ +#define REBUILD_PHYS_ENTRIES 12 + +/* + * On-disk rebuild configuration and state. When adding new fields they + * must be added to the end of the structure. + */ +typedef struct vdev_rebuild_phys { + uint64_t vrp_rebuild_state; /* vdev_rebuild_state_t */ + uint64_t vrp_last_offset; /* last rebuilt offset */ + uint64_t vrp_min_txg; /* minimum missing txg */ + uint64_t vrp_max_txg; /* maximum missing txg */ + uint64_t vrp_start_time; /* start time */ + uint64_t vrp_end_time; /* end time */ + uint64_t vrp_scan_time_ms; /* total run time in ms */ + uint64_t vrp_bytes_scanned; /* alloc bytes scanned */ + uint64_t vrp_bytes_issued; /* read bytes rebuilt */ + uint64_t vrp_bytes_rebuilt; /* rebuilt bytes */ + uint64_t vrp_bytes_est; /* total bytes to scan */ + uint64_t vrp_errors; /* errors during rebuild */ +} vdev_rebuild_phys_t; + +/* + * The vdev_rebuild_t describes the current state and how a top-level vdev + * should be rebuilt. The core elements are the top-vdev, the metaslab being + * rebuilt, range tree containing the allocted extents and the on-disk state. + */ +typedef struct vdev_rebuild { + vdev_t *vr_top_vdev; /* top-level vdev to rebuild */ + metaslab_t *vr_scan_msp; /* scanning disabled metaslab */ + range_tree_t *vr_scan_tree; /* scan ranges (in metaslab) */ + + /* In-core state and progress */ + uint64_t vr_scan_offset[TXG_SIZE]; + uint64_t vr_prev_scan_time_ms; /* any previous scan time */ + + /* Per-rebuild pass statistics for calculating bandwidth */ + uint64_t vr_pass_start_time; + uint64_t vr_pass_bytes_scanned; + uint64_t vr_pass_bytes_issued; + + /* On-disk state updated by vdev_rebuild_zap_update_sync() */ + vdev_rebuild_phys_t vr_rebuild_phys; +} vdev_rebuild_t; + +boolean_t vdev_rebuild_active(vdev_t *); + +int vdev_rebuild_load(vdev_t *); +void vdev_rebuild(vdev_t *); +void vdev_rebuild_stop_wait(vdev_t *); +void vdev_rebuild_stop_all(spa_t *); +void vdev_rebuild_restart(spa_t *); +void vdev_rebuild_clear_sync(void *, dmu_tx_t *); +int vdev_rebuild_get_stats(vdev_t *, vdev_rebuild_stat_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VDEV_REBUILD_H */ diff --git a/include/sys/zio_priority.h b/include/sys/zio_priority.h index 0b422904ec5a..2d8e7fc36bae 100644 --- a/include/sys/zio_priority.h +++ b/include/sys/zio_priority.h @@ -31,6 +31,7 @@ typedef enum zio_priority { ZIO_PRIORITY_REMOVAL, /* reads/writes for vdev removal */ ZIO_PRIORITY_INITIALIZING, /* initializing I/O */ ZIO_PRIORITY_TRIM, /* trim I/O (discard) */ + ZIO_PRIORITY_REBUILD, /* reads/writes for vdev rebuild */ ZIO_PRIORITY_NUM_QUEUEABLE, ZIO_PRIORITY_NOW, /* non-queued i/os (e.g. free) */ } zio_priority_t; diff --git a/include/zfeature_common.h b/include/zfeature_common.h index 2d8767d5b9ae..7e19a62e24ff 100644 --- a/include/zfeature_common.h +++ b/include/zfeature_common.h @@ -74,6 +74,7 @@ typedef enum spa_feature { SPA_FEATURE_BOOKMARK_WRITTEN, SPA_FEATURE_LOG_SPACEMAP, SPA_FEATURE_LIVELIST, + SPA_FEATURE_DEVICE_REBUILD, SPA_FEATURES } spa_feature_t; diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index 11b3d4cd9d0d..f848cb3cfc24 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -2446,7 +2446,8 @@ zpool_scan(zpool_handle_t *zhp, pool_scan_func_t func, pool_scrub_cmd_t cmd) ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&ps, &psc); - if (ps && ps->pss_func == POOL_SCAN_SCRUB) { + if (ps && ps->pss_func == POOL_SCAN_SCRUB && + ps->pss_state == DSS_SCANNING) { if (cmd == POOL_SCRUB_PAUSE) return (zfs_error(hdl, EZFS_SCRUB_PAUSED, msg)); else @@ -3128,8 +3129,8 @@ is_replacing_spare(nvlist_t *search, nvlist_t *tgt, int which) * If 'replacing' is specified, the new disk will replace the old one. */ int -zpool_vdev_attach(zpool_handle_t *zhp, - const char *old_disk, const char *new_disk, nvlist_t *nvroot, int replacing) +zpool_vdev_attach(zpool_handle_t *zhp, const char *old_disk, + const char *new_disk, nvlist_t *nvroot, int replacing, boolean_t rebuild) { zfs_cmd_t zc = {"\0"}; char msg[1024]; @@ -3164,6 +3165,14 @@ zpool_vdev_attach(zpool_handle_t *zhp, verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, &zc.zc_guid) == 0); zc.zc_cookie = replacing; + zc.zc_simple = rebuild; + + if (rebuild && + zfeature_lookup_guid("org.openzfs:device_rebuild", NULL) != 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "the loaded zfs module doesn't support device rebuilds")); + return (zfs_error(hdl, EZFS_POOL_NOTSUP, msg)); + } if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0 || children != 1) { @@ -3224,16 +3233,21 @@ zpool_vdev_attach(zpool_handle_t *zhp, uint64_t version = zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL); - if (islog) + if (islog) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot replace a log with a spare")); - else if (version >= SPA_VERSION_MULTI_REPLACE) + } else if (rebuild) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "only mirror vdevs support sequential " + "reconstruction")); + } else if (version >= SPA_VERSION_MULTI_REPLACE) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "already in replacing/spare config; wait " "for completion or use 'zpool detach'")); - else + } else { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot replace a replacing device")); + } } else { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "can only attach to mirrors and top-level " diff --git a/lib/libzfs/libzfs_status.c b/lib/libzfs/libzfs_status.c index ebf497db6423..67b8ea33e9d4 100644 --- a/lib/libzfs/libzfs_status.c +++ b/lib/libzfs/libzfs_status.c @@ -84,6 +84,8 @@ static char *zfs_msgid_table[] = { * ZPOOL_STATUS_RESILVERING * ZPOOL_STATUS_OFFLINE_DEV * ZPOOL_STATUS_REMOVED_DEV + * ZPOOL_STATUS_REBUILDING + * ZPOOL_STATUS_REBUILD_SCRUB * ZPOOL_STATUS_OK */ }; @@ -195,7 +197,7 @@ find_vdev_problem(nvlist_t *vdev, int (*func)(uint64_t, uint64_t, uint64_t)) * - Check for any data errors * - Check for any faulted or missing devices in a replicated config * - Look for any devices showing errors - * - Check for any resilvering devices + * - Check for any resilvering or rebuilding devices * * There can obviously be multiple errors within a single pool, so this routine * only picks the most damaging of all the current errors to report. @@ -234,6 +236,49 @@ check_status(nvlist_t *config, boolean_t isimport, zpool_errata_t *erratap) return (ZPOOL_STATUS_RESILVERING); /* + * Currently rebuilding a vdev, check top-level vdevs. + */ + vdev_rebuild_stat_t *vrs = NULL; + nvlist_t **child; + uint_t c, i, children; + uint64_t rebuild_end_time = 0; + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, + &child, &children) == 0) { + for (c = 0; c < children; c++) { + if ((nvlist_lookup_uint64_array(child[c], + ZPOOL_CONFIG_REBUILD_STATS, + (uint64_t **)&vrs, &i) == 0) && (vrs != NULL)) { + uint64_t state = vrs->vrs_state; + + if (state == VDEV_REBUILD_ACTIVE) { + return (ZPOOL_STATUS_REBUILDING); + } else if (state == VDEV_REBUILD_COMPLETE && + vrs->vrs_end_time > rebuild_end_time) { + rebuild_end_time = vrs->vrs_end_time; + } + } + } + + /* + * If we can determine when the last scrub was run, and it + * was before the last rebuild completed, then recommend + * that the pool be scrubbed to verify all checksums. When + * ps is NULL we can infer the pool has never been scrubbed. + */ + if (rebuild_end_time > 0) { + if (ps != NULL) { + if ((ps->pss_state == DSS_FINISHED && + ps->pss_func == POOL_SCAN_SCRUB && + rebuild_end_time > ps->pss_end_time) || + ps->pss_state == DSS_NONE) + return (ZPOOL_STATUS_REBUILD_SCRUB); + } else { + return (ZPOOL_STATUS_REBUILD_SCRUB); + } + } + } + + /* * The multihost property is set and the pool may be active. */ if (vs->vs_state == VDEV_STATE_CANT_OPEN && diff --git a/lib/libzfs/libzfs_util.c b/lib/libzfs/libzfs_util.c index 21bd8289c025..2f4aaed32986 100644 --- a/lib/libzfs/libzfs_util.c +++ b/lib/libzfs/libzfs_util.c @@ -286,6 +286,9 @@ libzfs_error_description(libzfs_handle_t *hdl) "resilver_defer feature")); case EZFS_EXPORT_IN_PROGRESS: return (dgettext(TEXT_DOMAIN, "pool export in progress")); + case EZFS_REBUILDING: + return (dgettext(TEXT_DOMAIN, "currently sequentially " + "resilvering")); case EZFS_UNKNOWN: return (dgettext(TEXT_DOMAIN, "unknown error")); default: @@ -693,6 +696,12 @@ zpool_standard_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...) case ZFS_ERR_EXPORT_IN_PROGRESS: zfs_verror(hdl, EZFS_EXPORT_IN_PROGRESS, fmt, ap); break; + case ZFS_ERR_RESILVER_IN_PROGRESS: + zfs_verror(hdl, EZFS_RESILVERING, fmt, ap); + break; + case ZFS_ERR_REBUILD_IN_PROGRESS: + zfs_verror(hdl, EZFS_REBUILDING, fmt, ap); + break; case ZFS_ERR_IOC_CMD_UNAVAIL: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "the loaded zfs " "module does not support this operation. A reboot may " diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am index 46befa7d43fe..06b89fe0a64f 100644 --- a/lib/libzpool/Makefile.am +++ b/lib/libzpool/Makefile.am @@ -132,6 +132,7 @@ KERNEL_C = \ vdev_raidz_math_sse2.c \ vdev_raidz_math_ssse3.c \ vdev_raidz_math_powerpc_altivec.c \ + vdev_rebuild.c \ vdev_removal.c \ vdev_root.c \ vdev_trim.c \ diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5 index 687b85d0bd22..3fbd3c67f825 100644 --- a/man/man5/zfs-module-parameters.5 +++ b/man/man5/zfs-module-parameters.5 @@ -1865,6 +1865,30 @@ Default value: \fB1,000\fR. .sp .ne 2 .na +\fBzfs_vdev_rebuild_max_active\fR (int) +.ad +.RS 12n +Maximum sequential resilver I/Os active to each device. +See the section "ZFS I/O SCHEDULER". +.sp +Default value: \fB3\fR. +.RE + +.sp +.ne 2 +.na +\fBzfs_vdev_rebuild_min_active\fR (int) +.ad +.RS 12n +Minimum sequential resilver I/Os active to each device. +See the section "ZFS I/O SCHEDULER". +.sp +Default value: \fB1\fR. +.RE + +.sp +.ne 2 +.na \fBzfs_vdev_removal_max_active\fR (int) .ad .RS 12n @@ -2710,6 +2734,18 @@ Use \fB1\fR for yes and \fB0\fR for no (default). .sp .ne 2 .na +\fBzfs_rebuild_max_segment\fR (ulong) +.ad +.RS 12n +Maximum read segment size to issue when sequentially resilvering a +top-level vdev. +.sp +Default value: \fB1,048,576\fR. +.RE + +.sp +.ne 2 +.na \fBzfs_reconstruct_indirect_combinations_max\fR (int) .ad .RS 12na diff --git a/man/man5/zpool-features.5 b/man/man5/zpool-features.5 index e7a61957ff4f..3f690c3340c4 100644 --- a/man/man5/zpool-features.5 +++ b/man/man5/zpool-features.5 @@ -258,6 +258,35 @@ returned to the \fBenabled\fR state when all bookmarks with these fields are des .sp .ne 2 .na +\fBdevice_rebuild\fR +.ad +.RS 4n +.TS +l l . +GUID org.openzfs:device_rebuild +READ\-ONLY COMPATIBLE yes +DEPENDENCIES none +.TE + +This feature enables the ability for the \fBzpool attach\fR and \fBzpool +replace\fR subcommands to perform sequential reconstruction (instead of +healing reconstruction) when resilvering. + +Sequential reconstruction resilvers a device in LBA order without immediately +verifying the checksums. Once complete a scrub is started which then verifies +the checksums. This approach allows full redundancy to be restored to the pool +in the minimum amount of time. This two phase approach will take longer than a +healing resilver when the time to verify the checksums is included. However, +unless there is additional pool damage no checksum errors should be reported +by the scrub. This feature is incompatible with raidz configurations. + +This feature becomes \fBactive\fR while a sequential resilver is in progress, +and returns to \fBenabled\fR when the resilver completes. +.RE + +.sp +.ne 2 +.na \fBdevice_removal\fR .ad .RS 4n diff --git a/man/man8/zpool-attach.8 b/man/man8/zpool-attach.8 index be0be4e076fc..585357b96d68 100644 --- a/man/man8/zpool-attach.8 +++ b/man/man8/zpool-attach.8 @@ -27,7 +27,7 @@ .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. .\" -.Dd August 9, 2019 +.Dd May 15, 2020 .Dt ZPOOL-ATTACH 8 .Os Linux .Sh NAME @@ -36,7 +36,7 @@ .Sh SYNOPSIS .Nm .Cm attach -.Op Fl fw +.Op Fl fsw .Oo Fl o Ar property Ns = Ns Ar value Oc .Ar pool device new_device .Sh DESCRIPTION @@ -44,7 +44,7 @@ .It Xo .Nm .Cm attach -.Op Fl fw +.Op Fl fsw .Oo Fl o Ar property Ns = Ns Ar value Oc .Ar pool device new_device .Xc @@ -68,22 +68,29 @@ is part of a two-way mirror, attaching creates a three-way mirror, and so on. In either case, .Ar new_device -begins to resilver immediately. +begins to resilver immediately and any running scrub is cancelled. .Bl -tag -width Ds .It Fl f Forces use of .Ar new_device , even if it appears to be in use. Not all devices can be overridden in this manner. -.It Fl w -Waits until -.Ar new_device -has finished resilvering before returning. .It Fl o Ar property Ns = Ns Ar value Sets the given pool properties. See the .Xr zpoolprops 8 manual page for a list of valid properties that can be set. The only property supported at the moment is ashift. +.It Fl s +The +.Ar new_device +is reconstructed sequentially to restore redundancy as quickly as possible. +Checksums are not verfied during sequential reconstruction so a scrub is +started when the resilver completes. +Sequential reconstruction is not supported for raidz configurations. +.It Fl w +Waits until +.Ar new_device +has finished resilvering before returning. .El .El .Sh SEE ALSO diff --git a/man/man8/zpool-replace.8 b/man/man8/zpool-replace.8 index 933fb4ae92ab..5e639feaf767 100644 --- a/man/man8/zpool-replace.8 +++ b/man/man8/zpool-replace.8 @@ -27,7 +27,7 @@ .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. .\" -.Dd August 9, 2019 +.Dd May 15, 2020 .Dt ZPOOL-REPLACE 8 .Os Linux .Sh NAME @@ -36,7 +36,7 @@ .Sh SYNOPSIS .Nm .Cm replace -.Op Fl fw +.Op Fl fsw .Oo Fl o Ar property Ns = Ns Ar value Oc .Ar pool Ar device Op Ar new_device .Sh DESCRIPTION @@ -44,7 +44,7 @@ .It Xo .Nm .Cm replace -.Op Fl fw +.Op Fl fsw .Op Fl o Ar property Ns = Ns Ar value .Ar pool Ar device Op Ar new_device .Xc @@ -56,6 +56,7 @@ This is equivalent to attaching .Ar new_device , waiting for it to resilver, and then detaching .Ar old_device . +Any in progress scrub will be cancelled. .Pp The size of .Ar new_device @@ -86,6 +87,13 @@ Sets the given pool properties. See the manual page for a list of valid properties that can be set. The only property supported at the moment is .Sy ashift . +.It Fl s +The +.Ar new_device +is reconstructed sequentially to restore redundancy as quickly as possible. +Checksums are not verfied during sequential reconstruction so a scrub is +started when the resilver completes. +Sequential reconstruction is not supported for raidz configurations. .It Fl w Waits until the replacement has completed before returning. .El diff --git a/man/man8/zpool-status.8 b/man/man8/zpool-status.8 index 7364bf635706..66e335995783 100644 --- a/man/man8/zpool-status.8 +++ b/man/man8/zpool-status.8 @@ -27,7 +27,7 @@ .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. .\" -.Dd August 9, 2019 +.Dd May 15, 2020 .Dt ZPOOL-STATUS 8 .Os Linux .Sh NAME @@ -59,7 +59,7 @@ is specified, then the status of each pool in the system is displayed. For more information on pool and device health, see the .Em Device Failure and Recovery section of -.Xr zpoolconcepts 8. +.Xr zpoolconcepts 8 . .Pp If a scrub or resilver is in progress, this command reports the percentage done and the estimated time to completion. diff --git a/module/Makefile.bsd b/module/Makefile.bsd index 7c83113ac86a..1ac9d00e7bf5 100644 --- a/module/Makefile.bsd +++ b/module/Makefile.bsd @@ -251,6 +251,7 @@ SRCS+= abd.c \ vdev_raidz.c \ vdev_raidz_math.c \ vdev_raidz_math_scalar.c \ + vdev_rebuild.c \ vdev_raidz_math_avx2.c \ vdev_raidz_math_avx512bw.c \ vdev_raidz_math_avx512f.c \ diff --git a/module/zcommon/zfeature_common.c b/module/zcommon/zfeature_common.c index cf3006721df0..302d485703f4 100644 --- a/module/zcommon/zfeature_common.c +++ b/module/zcommon/zfeature_common.c @@ -570,6 +570,11 @@ zpool_feature_init(void) "com.datto:resilver_defer", "resilver_defer", "Support for deferring new resilvers when one is already running.", ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL); + + zfeature_register(SPA_FEATURE_DEVICE_REBUILD, + "org.openzfs:device_rebuild", "device_rebuild", + "Support for sequential device rebuilds", + ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL); } #if defined(_KERNEL) diff --git a/module/zfs/Makefile.in b/module/zfs/Makefile.in index 7ea976d129dd..9ddcd6c339d4 100644 --- a/module/zfs/Makefile.in +++ b/module/zfs/Makefile.in @@ -94,6 +94,7 @@ $(MODULE)-objs += vdev_queue.o $(MODULE)-objs += vdev_raidz.o $(MODULE)-objs += vdev_raidz_math.o $(MODULE)-objs += vdev_raidz_math_scalar.o +$(MODULE)-objs += vdev_rebuild.o $(MODULE)-objs += vdev_removal.o $(MODULE)-objs += vdev_root.o $(MODULE)-objs += vdev_trim.o diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index 895ffbf0a94e..712af664e90f 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -704,8 +704,9 @@ static int dsl_scan_setup_check(void *arg, dmu_tx_t *tx) { dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; + vdev_t *rvd = scn->scn_dp->dp_spa->spa_root_vdev; - if (dsl_scan_is_running(scn)) + if (dsl_scan_is_running(scn) || vdev_rebuild_active(rvd)) return (SET_ERROR(EBUSY)); return (0); @@ -746,8 +747,12 @@ dsl_scan_setup_sync(void *arg, dmu_tx_t *tx) if (vdev_resilver_needed(spa->spa_root_vdev, &scn->scn_phys.scn_min_txg, &scn->scn_phys.scn_max_txg)) { - spa_event_notify(spa, NULL, NULL, + nvlist_t *aux = fnvlist_alloc(); + fnvlist_add_string(aux, ZFS_EV_RESILVER_TYPE, + "healing"); + spa_event_notify(spa, NULL, aux, ESC_ZFS_RESILVER_START); + nvlist_free(aux); } else { spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_START); } @@ -761,6 +766,21 @@ dsl_scan_setup_sync(void *arg, dmu_tx_t *tx) if (scn->scn_phys.scn_min_txg > TXG_INITIAL) scn->scn_phys.scn_ddt_class_max = DDT_CLASS_DITTO; + /* + * When starting a resilver clear any existing rebuild state. + * This is required to prevent stale rebuild status from + * being reported when a rebuild is run, then a resilver and + * finally a scrub. In which case only the scrub status + * should be reported by 'zpool status'. + */ + if (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) { + vdev_t *rvd = spa->spa_root_vdev; + for (uint64_t i = 0; i < rvd->vdev_children; i++) { + vdev_t *vd = rvd->vdev_child[i]; + vdev_rebuild_clear_sync( + (void *)(uintptr_t)vd->vdev_id, tx); + } + } } /* back to the generic stuff */ @@ -918,14 +938,22 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) if (complete && !spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg, - scn->scn_phys.scn_max_txg, B_TRUE); - - spa_event_notify(spa, NULL, NULL, - scn->scn_phys.scn_min_txg ? - ESC_ZFS_RESILVER_FINISH : ESC_ZFS_SCRUB_FINISH); + scn->scn_phys.scn_max_txg, B_TRUE, B_FALSE); + + if (scn->scn_phys.scn_min_txg) { + nvlist_t *aux = fnvlist_alloc(); + fnvlist_add_string(aux, ZFS_EV_RESILVER_TYPE, + "healing"); + spa_event_notify(spa, NULL, aux, + ESC_ZFS_RESILVER_FINISH); + nvlist_free(aux); + } else { + spa_event_notify(spa, NULL, NULL, + ESC_ZFS_SCRUB_FINISH); + } } else { vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg, - 0, B_TRUE); + 0, B_TRUE, B_FALSE); } spa_errlog_rotate(spa); diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 943330886eec..6b60227d244f 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -57,6 +57,7 @@ #include <sys/vdev_indirect_mapping.h> #include <sys/vdev_indirect_births.h> #include <sys/vdev_initialize.h> +#include <sys/vdev_rebuild.h> #include <sys/vdev_trim.h> #include <sys/vdev_disk.h> #include <sys/metaslab.h> @@ -1562,6 +1563,7 @@ spa_unload(spa_t *spa) vdev_initialize_stop_all(root_vdev, VDEV_INITIALIZE_ACTIVE); vdev_trim_stop_all(root_vdev, VDEV_TRIM_ACTIVE); vdev_autotrim_stop_all(spa); + vdev_rebuild_stop_all(spa); } /* @@ -4240,7 +4242,7 @@ spa_ld_load_vdev_metadata(spa_t *spa) * Propagate the leaf DTLs we just loaded all the way up the vdev tree. */ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - vdev_dtl_reassess(rvd, 0, 0, B_FALSE); + vdev_dtl_reassess(rvd, 0, 0, B_FALSE, B_FALSE); spa_config_exit(spa, SCL_ALL, FTAG); return (0); @@ -4829,11 +4831,16 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) update_config_cache); /* - * Check all DTLs to see if anything needs resilvering. + * Check if a rebuild was in progress and if so resume it. + * Then check all DTLs to see if anything needs resilvering. + * The resilver will be deferred if a rebuild was started. */ - if (!dsl_scan_resilvering(spa->spa_dsl_pool) && - vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) + if (vdev_rebuild_active(spa->spa_root_vdev)) { + vdev_rebuild_restart(spa); + } else if (!dsl_scan_resilvering(spa->spa_dsl_pool) && + vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { spa_async_request(spa, SPA_ASYNC_RESILVER); + } /* * Log the fact that we booted up (so that we can detect if @@ -6313,6 +6320,7 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, vdev_initialize_stop_all(rvd, VDEV_INITIALIZE_ACTIVE); vdev_trim_stop_all(rvd, VDEV_TRIM_ACTIVE); vdev_autotrim_stop_all(spa); + vdev_rebuild_stop_all(spa); } /* @@ -6536,12 +6544,17 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) * extra rules: you can't attach to it after it's been created, and upon * completion of resilvering, the first disk (the one being replaced) * is automatically detached. + * + * If 'rebuild' is specified, then sequential reconstruction (a.ka. rebuild) + * should be performed instead of traditional healing reconstruction. From + * an administrators perspective these are both resilver operations. */ int -spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) +spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, + int rebuild) { uint64_t txg, dtl_max_txg; - vdev_t *rvd __maybe_unused = spa->spa_root_vdev; + vdev_t *rvd = spa->spa_root_vdev; vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; vdev_ops_t *pvops; char *oldvdpath, *newvdpath; @@ -6561,6 +6574,19 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) return (spa_vdev_exit(spa, NULL, txg, error)); } + if (rebuild) { + if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD)) + return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); + + if (dsl_scan_resilvering(spa_get_dsl(spa))) + return (spa_vdev_exit(spa, NULL, txg, + ZFS_ERR_RESILVER_IN_PROGRESS)); + } else { + if (vdev_rebuild_active(rvd)) + return (spa_vdev_exit(spa, NULL, txg, + ZFS_ERR_REBUILD_IN_PROGRESS)); + } + if (spa->spa_vdev_removal != NULL) return (spa_vdev_exit(spa, NULL, txg, EBUSY)); @@ -6593,6 +6619,18 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare) return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); + if (rebuild) { + /* + * For rebuilds, the parent vdev must support reconstruction + * using only space maps. This means the only allowable + * parents are the root vdev or a mirror vdev. + */ + if (pvd->vdev_ops != &vdev_mirror_ops && + pvd->vdev_ops != &vdev_root_ops) { + return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); + } + } + if (!replacing) { /* * For attach, the only allowable parent is a mirror or the root @@ -6646,7 +6684,7 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) * than the top-level vdev. */ if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) - return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); + return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); /* * If this is an in-place replacement, update oldvd's path and devid @@ -6664,9 +6702,6 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) } } - /* mark the device being resilvered */ - newvd->vdev_resilver_txg = txg; - /* * If the parent is not a mirror, or if we're replacing, insert the new * mirror/replacing/spare vdev above oldvd. @@ -6704,8 +6739,8 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) */ dtl_max_txg = txg + TXG_CONCURRENT_STATES; - vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL, - dtl_max_txg - TXG_INITIAL); + vdev_dtl_dirty(newvd, DTL_MISSING, + TXG_INITIAL, dtl_max_txg - TXG_INITIAL); if (newvd->vdev_isspare) { spa_spare_activate(newvd); @@ -6722,16 +6757,25 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) vdev_dirty(tvd, VDD_DTL, newvd, txg); /* - * Schedule the resilver to restart in the future. We do this to - * ensure that dmu_sync-ed blocks have been stitched into the - * respective datasets. We do not do this if resilvers have been - * deferred. + * Schedule the resilver or rebuild to restart in the future. We do + * this to ensure that dmu_sync-ed blocks have been stitched into the + * respective datasets. */ - if (dsl_scan_resilvering(spa_get_dsl(spa)) && - spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER)) - vdev_defer_resilver(newvd); - else - dsl_scan_restart_resilver(spa->spa_dsl_pool, dtl_max_txg); + if (rebuild) { + newvd->vdev_rebuild_txg = txg; + + vdev_rebuild(tvd); + } else { + newvd->vdev_resilver_txg = txg; + + if (dsl_scan_resilvering(spa_get_dsl(spa)) && + spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER)) { + vdev_defer_resilver(newvd); + } else { + dsl_scan_restart_resilver(spa->spa_dsl_pool, + dtl_max_txg); + } + } if (spa->spa_bootfs) spa_event_notify(spa, newvd, NULL, ESC_ZFS_BOOTFS_VDEV_ATTACH); @@ -6774,7 +6818,7 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) ASSERT(spa_writeable(spa)); - txg = spa_vdev_enter(spa); + txg = spa_vdev_detach_enter(spa, guid); vd = spa_lookup_by_guid(spa, guid, B_FALSE); @@ -7728,6 +7772,12 @@ spa_vdev_resilver_done(spa_t *spa) } spa_config_exit(spa, SCL_ALL, FTAG); + + /* + * If a detach was not performed above replace waiters will not have + * been notified. In which case we must do so now. + */ + spa_notify_waiters(spa); } /* @@ -7971,9 +8021,21 @@ spa_async_thread(void *arg) spa_vdev_resilver_done(spa); /* + * If any devices are done replacing, detach them. Then if no + * top-level vdevs are rebuilding attempt to kick off a scrub. + */ + if (tasks & SPA_ASYNC_REBUILD_DONE) { + spa_vdev_resilver_done(spa); + + if (!vdev_rebuild_active(spa->spa_root_vdev)) + (void) dsl_scan(spa->spa_dsl_pool, POOL_SCAN_SCRUB); + } + + /* * Kick off a resilver. */ if (tasks & SPA_ASYNC_RESILVER && + !vdev_rebuild_active(spa->spa_root_vdev) && (!dsl_scan_resilvering(dp) || !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_RESILVER_DEFER))) dsl_scan_restart_resilver(dp, 0); @@ -9470,6 +9532,9 @@ spa_activity_in_progress(spa_t *spa, zpool_wait_activity_t activity, DSS_SCANNING); break; case ZPOOL_WAIT_RESILVER: + if ((*in_progress = vdev_rebuild_active(spa->spa_root_vdev))) + break; + /* fall through */ case ZPOOL_WAIT_SCRUB: { boolean_t scanning, paused, is_scrub; diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 61cefa3dda43..4c884409afbd 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -1166,6 +1166,30 @@ spa_vdev_enter(spa_t *spa) } /* + * The same as spa_vdev_enter() above but additionally takes the guid of + * the vdev being detached. When there is a rebuild in process it will be + * suspended while the vdev tree is modified then resumed by spa_vdev_exit(). + * The rebuild is canceled if only a single child remains after the detach. + */ +uint64_t +spa_vdev_detach_enter(spa_t *spa, uint64_t guid) +{ + mutex_enter(&spa->spa_vdev_top_lock); + mutex_enter(&spa_namespace_lock); + + vdev_autotrim_stop_all(spa); + + if (guid != 0) { + vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE); + if (vd) { + vdev_rebuild_stop_wait(vd->vdev_top); + } + } + + return (spa_vdev_config_enter(spa)); +} + +/* * Internal implementation for spa_vdev_enter(). Used when a vdev * operation requires multiple syncs (i.e. removing a device) while * keeping the spa_namespace_lock held. @@ -1198,7 +1222,7 @@ spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag) /* * Reassess the DTLs. */ - vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE); + vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE, B_FALSE); if (error == 0 && !list_is_empty(&spa->spa_config_dirty_list)) { config_changed = B_TRUE; @@ -1271,6 +1295,7 @@ int spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error) { vdev_autotrim_restart(spa); + vdev_rebuild_restart(spa); spa_vdev_config_exit(spa, vd, txg, error, FTAG); mutex_exit(&spa_namespace_lock); @@ -1322,7 +1347,7 @@ spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error) } if (vd != NULL || error == 0) - vdev_dtl_reassess(vdev_top, 0, 0, B_FALSE); + vdev_dtl_reassess(vdev_top, 0, 0, B_FALSE, B_FALSE); if (vd != NULL) { if (vd != spa->spa_root_vdev) diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 03360120ad4c..27ac17fea5eb 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -39,6 +39,7 @@ #include <sys/dmu_tx.h> #include <sys/dsl_dir.h> #include <sys/vdev_impl.h> +#include <sys/vdev_rebuild.h> #include <sys/uberblock_impl.h> #include <sys/metaslab.h> #include <sys/metaslab_impl.h> @@ -551,10 +552,12 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_scan_io_queue_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&vd->vdev_initialize_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_initialize_io_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&vd->vdev_initialize_cv, NULL, CV_DEFAULT, NULL); cv_init(&vd->vdev_initialize_io_cv, NULL, CV_DEFAULT, NULL); + mutex_init(&vd->vdev_trim_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_autotrim_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_trim_io_lock, NULL, MUTEX_DEFAULT, NULL); @@ -562,10 +565,16 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) cv_init(&vd->vdev_autotrim_cv, NULL, CV_DEFAULT, NULL); cv_init(&vd->vdev_trim_io_cv, NULL, CV_DEFAULT, NULL); + mutex_init(&vd->vdev_rebuild_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&vd->vdev_rebuild_io_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&vd->vdev_rebuild_cv, NULL, CV_DEFAULT, NULL); + cv_init(&vd->vdev_rebuild_io_cv, NULL, CV_DEFAULT, NULL); + for (int t = 0; t < DTL_TYPES; t++) { vd->vdev_dtl[t] = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); } + txg_list_create(&vd->vdev_ms_list, spa, offsetof(struct metaslab, ms_txg_node)); txg_list_create(&vd->vdev_dtl_list, spa, @@ -835,6 +844,9 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG, &vd->vdev_resilver_txg); + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REBUILD_TXG, + &vd->vdev_rebuild_txg); + if (nvlist_exists(nv, ZPOOL_CONFIG_RESILVER_DEFER)) vdev_defer_resilver(vd); @@ -890,6 +902,7 @@ vdev_free(vdev_t *vd) ASSERT3P(vd->vdev_initialize_thread, ==, NULL); ASSERT3P(vd->vdev_trim_thread, ==, NULL); ASSERT3P(vd->vdev_autotrim_thread, ==, NULL); + ASSERT3P(vd->vdev_rebuild_thread, ==, NULL); /* * Scan queues are normally destroyed at the end of a scan. If the @@ -998,10 +1011,12 @@ vdev_free(vdev_t *vd) mutex_destroy(&vd->vdev_stat_lock); mutex_destroy(&vd->vdev_probe_lock); mutex_destroy(&vd->vdev_scan_io_queue_lock); + mutex_destroy(&vd->vdev_initialize_lock); mutex_destroy(&vd->vdev_initialize_io_lock); cv_destroy(&vd->vdev_initialize_io_cv); cv_destroy(&vd->vdev_initialize_cv); + mutex_destroy(&vd->vdev_trim_lock); mutex_destroy(&vd->vdev_autotrim_lock); mutex_destroy(&vd->vdev_trim_io_lock); @@ -1009,6 +1024,11 @@ vdev_free(vdev_t *vd) cv_destroy(&vd->vdev_autotrim_cv); cv_destroy(&vd->vdev_trim_io_cv); + mutex_destroy(&vd->vdev_rebuild_lock); + mutex_destroy(&vd->vdev_rebuild_io_lock); + cv_destroy(&vd->vdev_rebuild_cv); + cv_destroy(&vd->vdev_rebuild_io_cv); + zfs_ratelimit_fini(&vd->vdev_delay_rl); zfs_ratelimit_fini(&vd->vdev_checksum_rl); @@ -1078,7 +1098,10 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd) ASSERT3P(tvd->vdev_indirect_births, ==, NULL); ASSERT3P(tvd->vdev_obsolete_sm, ==, NULL); ASSERT0(tvd->vdev_removing); + ASSERT0(tvd->vdev_rebuilding); tvd->vdev_removing = svd->vdev_removing; + tvd->vdev_rebuilding = svd->vdev_rebuilding; + tvd->vdev_rebuild_config = svd->vdev_rebuild_config; tvd->vdev_indirect_config = svd->vdev_indirect_config; tvd->vdev_indirect_mapping = svd->vdev_indirect_mapping; tvd->vdev_indirect_births = svd->vdev_indirect_births; @@ -1092,6 +1115,7 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd) svd->vdev_indirect_births = NULL; svd->vdev_obsolete_sm = NULL; svd->vdev_removing = 0; + svd->vdev_rebuilding = 0; for (t = 0; t < TXG_SIZE; t++) { while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL) @@ -2576,11 +2600,8 @@ vdev_dtl_max(vdev_t *vd) * excise the DTLs. */ static boolean_t -vdev_dtl_should_excise(vdev_t *vd) +vdev_dtl_should_excise(vdev_t *vd, boolean_t rebuild_done) { - spa_t *spa = vd->vdev_spa; - dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; - ASSERT0(vd->vdev_children); if (vd->vdev_state < VDEV_STATE_DEGRADED) @@ -2589,23 +2610,52 @@ vdev_dtl_should_excise(vdev_t *vd) if (vd->vdev_resilver_deferred) return (B_FALSE); - if (vd->vdev_resilver_txg == 0 || - range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) + if (range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) return (B_TRUE); - /* - * When a resilver is initiated the scan will assign the scn_max_txg - * value to the highest txg value that exists in all DTLs. If this - * device's max DTL is not part of this scan (i.e. it is not in - * the range (scn_min_txg, scn_max_txg] then it is not eligible - * for excision. - */ - if (vdev_dtl_max(vd) <= scn->scn_phys.scn_max_txg) { - ASSERT3U(scn->scn_phys.scn_min_txg, <=, vdev_dtl_min(vd)); - ASSERT3U(scn->scn_phys.scn_min_txg, <, vd->vdev_resilver_txg); - ASSERT3U(vd->vdev_resilver_txg, <=, scn->scn_phys.scn_max_txg); - return (B_TRUE); + if (rebuild_done) { + vdev_rebuild_t *vr = &vd->vdev_top->vdev_rebuild_config; + vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; + + /* Rebuild not initiated by attach */ + if (vd->vdev_rebuild_txg == 0) + return (B_TRUE); + + /* + * When a rebuild completes without error then all missing data + * up to the rebuild max txg has been reconstructed and the DTL + * is eligible for excision. + */ + if (vrp->vrp_rebuild_state == VDEV_REBUILD_COMPLETE && + vdev_dtl_max(vd) <= vrp->vrp_max_txg) { + ASSERT3U(vrp->vrp_min_txg, <=, vdev_dtl_min(vd)); + ASSERT3U(vrp->vrp_min_txg, <, vd->vdev_rebuild_txg); + ASSERT3U(vd->vdev_rebuild_txg, <=, vrp->vrp_max_txg); + return (B_TRUE); + } + } else { + dsl_scan_t *scn = vd->vdev_spa->spa_dsl_pool->dp_scan; + dsl_scan_phys_t *scnp __maybe_unused = &scn->scn_phys; + + /* Resilver not initiated by attach */ + if (vd->vdev_resilver_txg == 0) + return (B_TRUE); + + /* + * When a resilver is initiated the scan will assign the + * scn_max_txg value to the highest txg value that exists + * in all DTLs. If this device's max DTL is not part of this + * scan (i.e. it is not in the range (scn_min_txg, scn_max_txg] + * then it is not eligible for excision. + */ + if (vdev_dtl_max(vd) <= scn->scn_phys.scn_max_txg) { + ASSERT3U(scnp->scn_min_txg, <=, vdev_dtl_min(vd)); + ASSERT3U(scnp->scn_min_txg, <, vd->vdev_resilver_txg); + ASSERT3U(vd->vdev_resilver_txg, <=, scnp->scn_max_txg); + return (B_TRUE); + } } + return (B_FALSE); } @@ -2614,7 +2664,8 @@ vdev_dtl_should_excise(vdev_t *vd) * write operations will be issued to the pool. */ void -vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) +vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, + boolean_t scrub_done, boolean_t rebuild_done) { spa_t *spa = vd->vdev_spa; avl_tree_t reftree; @@ -2624,22 +2675,28 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) for (int c = 0; c < vd->vdev_children; c++) vdev_dtl_reassess(vd->vdev_child[c], txg, - scrub_txg, scrub_done); + scrub_txg, scrub_done, rebuild_done); if (vd == spa->spa_root_vdev || !vdev_is_concrete(vd) || vd->vdev_aux) return; if (vd->vdev_ops->vdev_op_leaf) { dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; + vdev_rebuild_t *vr = &vd->vdev_top->vdev_rebuild_config; + boolean_t check_excise = B_FALSE; boolean_t wasempty = B_TRUE; mutex_enter(&vd->vdev_dtl_lock); /* - * If requested, pretend the scan completed cleanly. + * If requested, pretend the scan or rebuild completed cleanly. */ - if (zfs_scan_ignore_errors && scn) - scn->scn_phys.scn_errors = 0; + if (zfs_scan_ignore_errors) { + if (scn != NULL) + scn->scn_phys.scn_errors = 0; + if (vr != NULL) + vr->vr_rebuild_phys.vrp_errors = 0; + } if (scrub_txg != 0 && !range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) { @@ -2654,21 +2711,29 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) } /* - * If we've completed a scan cleanly then determine - * if this vdev should remove any DTLs. We only want to - * excise regions on vdevs that were available during - * the entire duration of this scan. + * If we've completed a scrub/resilver or a rebuild cleanly + * then determine if this vdev should remove any DTLs. We + * only want to excise regions on vdevs that were available + * during the entire duration of this scan. */ - if (scrub_txg != 0 && - (spa->spa_scrub_started || - (scn != NULL && scn->scn_phys.scn_errors == 0)) && - vdev_dtl_should_excise(vd)) { + if (rebuild_done && + vr != NULL && vr->vr_rebuild_phys.vrp_errors == 0) { + check_excise = B_TRUE; + } else { + if (spa->spa_scrub_started || + (scn != NULL && scn->scn_phys.scn_errors == 0)) { + check_excise = B_TRUE; + } + } + + if (scrub_txg && check_excise && + vdev_dtl_should_excise(vd, rebuild_done)) { /* - * We completed a scrub up to scrub_txg. If we - * did it without rebooting, then the scrub dtl - * will be valid, so excise the old region and - * fold in the scrub dtl. Otherwise, leave the - * dtl as-is if there was an error. + * We completed a scrub, resilver or rebuild up to + * scrub_txg. If we did it without rebooting, then + * the scrub dtl will be valid, so excise the old + * region and fold in the scrub dtl. Otherwise, + * leave the dtl as-is if there was an error. * * There's little trick here: to excise the beginning * of the DTL_MISSING map, we put it into a reference @@ -2711,15 +2776,20 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) range_tree_add, vd->vdev_dtl[DTL_OUTAGE]); /* - * If the vdev was resilvering and no longer has any - * DTLs then reset its resilvering flag and dirty + * If the vdev was resilvering or rebuilding and no longer + * has any DTLs then reset the appropriate flag and dirty * the top level so that we persist the change. */ - if (txg != 0 && vd->vdev_resilver_txg != 0 && + if (txg != 0 && range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) && range_tree_is_empty(vd->vdev_dtl[DTL_OUTAGE])) { - vd->vdev_resilver_txg = 0; - vdev_config_dirty(vd->vdev_top); + if (vd->vdev_rebuild_txg != 0) { + vd->vdev_rebuild_txg = 0; + vdev_config_dirty(vd->vdev_top); + } else if (vd->vdev_resilver_txg != 0) { + vd->vdev_resilver_txg = 0; + vdev_config_dirty(vd->vdev_top); + } } mutex_exit(&vd->vdev_dtl_lock); @@ -2955,10 +3025,10 @@ vdev_dtl_required(vdev_t *vd) * If not, we can safely offline/detach/remove the device. */ vd->vdev_cant_read = B_TRUE; - vdev_dtl_reassess(tvd, 0, 0, B_FALSE); + vdev_dtl_reassess(tvd, 0, 0, B_FALSE, B_FALSE); required = !vdev_dtl_empty(tvd, DTL_OUTAGE); vd->vdev_cant_read = cant_read; - vdev_dtl_reassess(tvd, 0, 0, B_FALSE); + vdev_dtl_reassess(tvd, 0, 0, B_FALSE, B_FALSE); if (!required && zio_injection_enabled) { required = !!zio_handle_device_injection(vd, NULL, @@ -3066,6 +3136,20 @@ vdev_load(vdev_t *vd) } /* + * Load any rebuild state from the top-level vdev zap. + */ + if (vd == vd->vdev_top && vd->vdev_top_zap != 0) { + error = vdev_rebuild_load(vd); + if (error && error != ENOTSUP) { + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + vdev_dbgmsg(vd, "vdev_load: vdev_rebuild_load " + "failed [error=%d]", error); + return (error); + } + } + + /* * If this is a top-level vdev, initialize its metaslabs. */ if (vd == vd->vdev_top && vdev_is_concrete(vd)) { @@ -3947,6 +4031,7 @@ vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) vs->vs_timestamp = gethrtime() - vs->vs_timestamp; vs->vs_state = vd->vdev_state; vs->vs_rsize = vdev_get_min_asize(vd); + if (vd->vdev_ops->vdev_op_leaf) { vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; @@ -3973,7 +4058,11 @@ vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) vs->vs_trim_bytes_est = vd->vdev_trim_bytes_est; vs->vs_trim_state = vd->vdev_trim_state; vs->vs_trim_action_time = vd->vdev_trim_action_time; + + /* Set when there is a deferred resilver. */ + vs->vs_resilver_deferred = vd->vdev_resilver_deferred; } + /* * Report expandable space on top-level, non-auxiliary devices * only. The expandable space is reported in terms of metaslab @@ -3985,13 +4074,16 @@ vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) vd->vdev_max_asize - vd->vdev_asize, 1ULL << tvd->vdev_ms_shift); } + + /* + * Report fragmentation and rebuild progress for top-level, + * non-auxiliary, concrete devices. + */ if (vd->vdev_aux == NULL && vd == vd->vdev_top && vdev_is_concrete(vd)) { vs->vs_fragmentation = (vd->vdev_mg != NULL) ? vd->vdev_mg->mg_fragmentation : 0; } - if (vd->vdev_ops->vdev_op_leaf) - vs->vs_resilver_deferred = vd->vdev_resilver_deferred; } vdev_get_stats_ex_impl(vd, vs, vsx); @@ -4072,17 +4164,35 @@ vdev_stat_update(zio_t *zio, uint64_t psize) mutex_enter(&vd->vdev_stat_lock); if (flags & ZIO_FLAG_IO_REPAIR) { + /* + * Repair is the result of a resilver issued by the + * scan thread (spa_sync). + */ if (flags & ZIO_FLAG_SCAN_THREAD) { - dsl_scan_phys_t *scn_phys = - &spa->spa_dsl_pool->dp_scan->scn_phys; + dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; + dsl_scan_phys_t *scn_phys = &scn->scn_phys; uint64_t *processed = &scn_phys->scn_processed; - /* XXX cleanup? */ if (vd->vdev_ops->vdev_op_leaf) atomic_add_64(processed, psize); vs->vs_scan_processed += psize; } + /* + * Repair is the result of a rebuild issued by the + * rebuild thread (vdev_rebuild_thread). + */ + if (zio->io_priority == ZIO_PRIORITY_REBUILD) { + vdev_t *tvd = vd->vdev_top; + vdev_rebuild_t *vr = &tvd->vdev_rebuild_config; + vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; + uint64_t *rebuilt = &vrp->vrp_bytes_rebuilt; + + if (vd->vdev_ops->vdev_op_leaf) + atomic_add_64(rebuilt, psize); + vs->vs_rebuild_processed += psize; + } + if (flags & ZIO_FLAG_SELF_HEAL) vs->vs_self_healed += psize; } @@ -4094,6 +4204,7 @@ vdev_stat_update(zio_t *zio, uint64_t psize) if (vd->vdev_ops->vdev_op_leaf && (zio->io_priority < ZIO_PRIORITY_NUM_QUEUEABLE)) { zio_type_t vs_type = type; + zio_priority_t priority = zio->io_priority; /* * TRIM ops and bytes are reported to user space as @@ -4103,19 +4214,44 @@ vdev_stat_update(zio_t *zio, uint64_t psize) if (type == ZIO_TYPE_TRIM) vs_type = ZIO_TYPE_IOCTL; + /* + * Solely for the purposes of 'zpool iostat -lqrw' + * reporting use the priority to catagorize the IO. + * Only the following are reported to user space: + * + * ZIO_PRIORITY_SYNC_READ, + * ZIO_PRIORITY_SYNC_WRITE, + * ZIO_PRIORITY_ASYNC_READ, + * ZIO_PRIORITY_ASYNC_WRITE, + * ZIO_PRIORITY_SCRUB, + * ZIO_PRIORITY_TRIM. + */ + if (priority == ZIO_PRIORITY_REBUILD) { + priority = ((type == ZIO_TYPE_WRITE) ? + ZIO_PRIORITY_ASYNC_WRITE : + ZIO_PRIORITY_SCRUB); + } else if (priority == ZIO_PRIORITY_INITIALIZING) { + ASSERT3U(type, ==, ZIO_TYPE_WRITE); + priority = ZIO_PRIORITY_ASYNC_WRITE; + } else if (priority == ZIO_PRIORITY_REMOVAL) { + priority = ((type == ZIO_TYPE_WRITE) ? + ZIO_PRIORITY_ASYNC_WRITE : + ZIO_PRIORITY_ASYNC_READ); + } + vs->vs_ops[vs_type]++; vs->vs_bytes[vs_type] += psize; if (flags & ZIO_FLAG_DELEGATED) { - vsx->vsx_agg_histo[zio->io_priority] + vsx->vsx_agg_histo[priority] [RQ_HISTO(zio->io_size)]++; } else { - vsx->vsx_ind_histo[zio->io_priority] + vsx->vsx_ind_histo[priority] [RQ_HISTO(zio->io_size)]++; } if (zio->io_delta && zio->io_delay) { - vsx->vsx_queue_histo[zio->io_priority] + vsx->vsx_queue_histo[priority] [L_HISTO(zio->io_delta - zio->io_delay)]++; vsx->vsx_disk_histo[type] [L_HISTO(zio->io_delay)]++; diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index 81cfd5ccef06..8c7468255565 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -404,6 +404,19 @@ root_vdev_actions_getprogress(vdev_t *vd, nvlist_t *nvl) } } +static void +top_vdev_actions_getprogress(vdev_t *vd, nvlist_t *nvl) +{ + if (vd == vd->vdev_top) { + vdev_rebuild_stat_t vrs; + if (vdev_rebuild_get_stats(vd, &vrs) == 0) { + fnvlist_add_uint64_array(nvl, + ZPOOL_CONFIG_REBUILD_STATS, (uint64_t *)&vrs, + sizeof (vrs) / sizeof (uint64_t)); + } + } +} + /* * Generate the nvlist representing this vdev's config. */ @@ -559,6 +572,7 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, vdev_config_generate_stats(vd, nv); root_vdev_actions_getprogress(vd, nv); + top_vdev_actions_getprogress(vd, nv); /* * Note: this can be called from open context @@ -663,6 +677,9 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, if (vd->vdev_resilver_txg != 0) fnvlist_add_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG, vd->vdev_resilver_txg); + if (vd->vdev_rebuild_txg != 0) + fnvlist_add_uint64(nv, ZPOOL_CONFIG_REBUILD_TXG, + vd->vdev_rebuild_txg); if (vd->vdev_faulted) fnvlist_add_uint64(nv, ZPOOL_CONFIG_FAULTED, B_TRUE); if (vd->vdev_degraded) diff --git a/module/zfs/vdev_mirror.c b/module/zfs/vdev_mirror.c index 3edd65c01965..094530e9badd 100644 --- a/module/zfs/vdev_mirror.c +++ b/module/zfs/vdev_mirror.c @@ -767,8 +767,9 @@ vdev_mirror_io_done(zio_t *zio) zio_nowait(zio_vdev_child_io(zio, zio->io_bp, mc->mc_vd, mc->mc_offset, - zio->io_abd, zio->io_size, - ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, + zio->io_abd, zio->io_size, ZIO_TYPE_WRITE, + zio->io_priority == ZIO_PRIORITY_REBUILD ? + ZIO_PRIORITY_REBUILD : ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_IO_REPAIR | (unexpected_errors ? ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); } diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c index e31271dcb30d..a8ef3d7474c9 100644 --- a/module/zfs/vdev_queue.c +++ b/module/zfs/vdev_queue.c @@ -158,6 +158,8 @@ uint32_t zfs_vdev_initializing_min_active = 1; uint32_t zfs_vdev_initializing_max_active = 1; uint32_t zfs_vdev_trim_min_active = 1; uint32_t zfs_vdev_trim_max_active = 2; +uint32_t zfs_vdev_rebuild_min_active = 1; +uint32_t zfs_vdev_rebuild_max_active = 3; /* * When the pool has less than zfs_vdev_async_write_active_min_dirty_percent @@ -278,6 +280,8 @@ vdev_queue_class_min_active(zio_priority_t p) return (zfs_vdev_initializing_min_active); case ZIO_PRIORITY_TRIM: return (zfs_vdev_trim_min_active); + case ZIO_PRIORITY_REBUILD: + return (zfs_vdev_rebuild_min_active); default: panic("invalid priority %u", p); return (0); @@ -352,6 +356,8 @@ vdev_queue_class_max_active(spa_t *spa, zio_priority_t p) return (zfs_vdev_initializing_max_active); case ZIO_PRIORITY_TRIM: return (zfs_vdev_trim_max_active); + case ZIO_PRIORITY_REBUILD: + return (zfs_vdev_rebuild_max_active); default: panic("invalid priority %u", p); return (0); @@ -845,7 +851,8 @@ vdev_queue_io(zio_t *zio) zio->io_priority != ZIO_PRIORITY_ASYNC_READ && zio->io_priority != ZIO_PRIORITY_SCRUB && zio->io_priority != ZIO_PRIORITY_REMOVAL && - zio->io_priority != ZIO_PRIORITY_INITIALIZING) { + zio->io_priority != ZIO_PRIORITY_INITIALIZING && + zio->io_priority != ZIO_PRIORITY_REBUILD) { zio->io_priority = ZIO_PRIORITY_ASYNC_READ; } } else if (zio->io_type == ZIO_TYPE_WRITE) { @@ -854,7 +861,8 @@ vdev_queue_io(zio_t *zio) if (zio->io_priority != ZIO_PRIORITY_SYNC_WRITE && zio->io_priority != ZIO_PRIORITY_ASYNC_WRITE && zio->io_priority != ZIO_PRIORITY_REMOVAL && - zio->io_priority != ZIO_PRIORITY_INITIALIZING) { + zio->io_priority != ZIO_PRIORITY_INITIALIZING && + zio->io_priority != ZIO_PRIORITY_REBUILD) { zio->io_priority = ZIO_PRIORITY_ASYNC_WRITE; } } else { @@ -1051,6 +1059,12 @@ ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, trim_max_active, INT, ZMOD_RW, ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, trim_min_active, INT, ZMOD_RW, "Min active trim/discard I/Os per vdev"); +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, rebuild_max_active, INT, ZMOD_RW, + "Max active rebuild I/Os per vdev"); + +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, rebuild_min_active, INT, ZMOD_RW, + "Min active rebuild I/Os per vdev"); + ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, queue_depth_pct, INT, ZMOD_RW, "Queue depth percentage for each top-level vdev"); /* END CSTYLED */ diff --git a/module/zfs/vdev_rebuild.c b/module/zfs/vdev_rebuild.c new file mode 100644 index 000000000000..bf1079fd7af0 --- /dev/null +++ b/module/zfs/vdev_rebuild.c @@ -0,0 +1,1106 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * + * Copyright (c) 2018, Intel Corporation. + * Copyright (c) 2020 by Lawrence Livermore National Security, LLC. + */ + +#include <sys/vdev_impl.h> +#include <sys/dsl_scan.h> +#include <sys/spa_impl.h> +#include <sys/metaslab_impl.h> +#include <sys/vdev_rebuild.h> +#include <sys/zio.h> +#include <sys/dmu_tx.h> +#include <sys/arc.h> +#include <sys/zap.h> + +/* + * This file contains the sequential reconstruction implementation for + * resilvering. This form of resilvering is internally referred to as device + * rebuild to avoid conflating it with the traditional healing reconstruction + * performed by the dsl scan code. + * + * When replacing a device, or scrubbing the pool, ZFS has historically used + * a process called resilvering which is a form of healing reconstruction. + * This approach has the advantage that as blocks are read from disk their + * checksums can be immediately verified and the data repaired. Unfortunately, + * it also results in a random IO pattern to the disk even when extra care + * is taken to sequentialize the IO as much as possible. This substantially + * increases the time required to resilver the pool and restore redundancy. + * + * For mirrored devices it's possible to implement an alternate sequential + * reconstruction strategy when resilvering. Sequential reconstruction + * behaves like a traditional RAID rebuild and reconstructs a device in LBA + * order without verifying the checksum. After this phase completes a second + * scrub phase is started to verify all of the checksums. This two phase + * process will take longer than the healing reconstruction described above. + * However, it has that advantage that after the reconstruction first phase + * completes redundancy has been restored. At this point the pool can incur + * another device failure without risking data loss. + * + * There are a few noteworthy limitations and other advantages of resilvering + * using sequential reconstruction vs healing reconstruction. + * + * Limitations: + * + * - Only supported for mirror vdev types. Due to the variable stripe + * width used by raidz sequential reconstruction is not possible. + * + * - Block checksums are not verified during sequential reconstuction. + * Similar to traditional RAID the parity/mirror data is reconstructed + * but cannot be immediately double checked. For this reason when the + * last active resilver completes the pool is automatically scrubbed. + * + * - Deferred resilvers using sequential reconstruction are not currently + * supported. When adding another vdev to an active top-level resilver + * it must be restarted. + * + * Advantages: + * + * - Sequential reconstuction is performed in LBA order which may be faster + * than healing reconstuction particularly when using using HDDs (or + * especially with SMR devices). Only allocated capacity is resilvered. + * + * - Sequential reconstruction is not constrained by ZFS block boundaries. + * This allows it to issue larger IOs to disk which span multiple blocks + * allowing all of these logical blocks to be repaired with a single IO. + * + * - Unlike a healing resilver or scrub which are pool wide operations, + * sequential reconstruction is handled by the top-level mirror vdevs. + * This allows for it to be started or canceled on a top-level vdev + * without impacting any other top-level vdevs in the pool. + * + * - Data only referenced by a pool checkpoint will be repaired because + * that space is reflected in the space maps. This differs for a + * healing resilver or scrub which will not repair that data. + */ + + +/* + * Maximum number of queued rebuild I/Os top-level vdev. The number of + * concurrent rebuild I/Os issued to the device is controlled by the + * zfs_vdev_rebuild_min_active and zfs_vdev_rebuild_max_active module + * options. + */ +unsigned int zfs_rebuild_queue_limit = 20; + +/* + * Size of rebuild reads; defaults to 1MiB and is capped at SPA_MAXBLOCKSIZE. + */ +unsigned long zfs_rebuild_max_segment = 1024 * 1024; + +/* + * For vdev_rebuild_initiate_sync() and vdev_rebuild_reset_sync(). + */ +static void vdev_rebuild_thread(void *arg); + +/* + * Clear the per-vdev rebuild bytes value for a vdev tree. + */ +static void +clear_rebuild_bytes(vdev_t *vd) +{ + vdev_stat_t *vs = &vd->vdev_stat; + + for (uint64_t i = 0; i < vd->vdev_children; i++) + clear_rebuild_bytes(vd->vdev_child[i]); + + mutex_enter(&vd->vdev_stat_lock); + vs->vs_rebuild_processed = 0; + mutex_exit(&vd->vdev_stat_lock); +} + +/* + * Determines whether a vdev_rebuild_thread() should be stopped. + */ +static boolean_t +vdev_rebuild_should_stop(vdev_t *vd) +{ + return (!vdev_writeable(vd) || vd->vdev_removing || + vd->vdev_rebuild_exit_wanted || + vd->vdev_rebuild_cancel_wanted || + vd->vdev_rebuild_reset_wanted); +} + +/* + * Determine if the rebuild should be canceled. This may happen when all + * vdevs with MISSING DTLs are detached. + */ +static boolean_t +vdev_rebuild_should_cancel(vdev_t *vd) +{ + vdev_rebuild_t *vr = &vd->vdev_rebuild_config; + vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; + + if (!vdev_resilver_needed(vd, &vrp->vrp_min_txg, &vrp->vrp_max_txg)) + return (B_TRUE); + + return (B_FALSE); +} + +/* + * The sync task for updating the on-disk state of a rebuild. This is + * scheduled by vdev_rebuild_range(). + */ +static void +vdev_rebuild_update_sync(void *arg, dmu_tx_t *tx) +{ + int vdev_id = (uintptr_t)arg; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + vdev_t *vd = vdev_lookup_top(spa, vdev_id); + vdev_rebuild_t *vr = &vd->vdev_rebuild_config; + vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; + uint64_t txg = dmu_tx_get_txg(tx); + + mutex_enter(&vd->vdev_rebuild_lock); + + if (vr->vr_scan_offset[txg & TXG_MASK] > 0) { + vrp->vrp_last_offset = vr->vr_scan_offset[txg & TXG_MASK]; + vr->vr_scan_offset[txg & TXG_MASK] = 0; + } + + vrp->vrp_scan_time_ms = vr->vr_prev_scan_time_ms + + NSEC2MSEC(gethrtime() - vr->vr_pass_start_time); + + VERIFY0(zap_update(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, + VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t), + REBUILD_PHYS_ENTRIES, vrp, tx)); + + mutex_exit(&vd->vdev_rebuild_lock); +} + +/* + * Initialize the on-disk state for a new rebuild, start the rebuild thread. + */ +static void +vdev_rebuild_initiate_sync(void *arg, dmu_tx_t *tx) +{ + int vdev_id = (uintptr_t)arg; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + vdev_t *vd = vdev_lookup_top(spa, vdev_id); + vdev_rebuild_t *vr = &vd->vdev_rebuild_config; + vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; + + ASSERT(vd->vdev_rebuilding); + + spa_feature_incr(vd->vdev_spa, SPA_FEATURE_DEVICE_REBUILD, tx); + + mutex_enter(&vd->vdev_rebuild_lock); + bzero(vrp, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES); + vrp->vrp_rebuild_state = VDEV_REBUILD_ACTIVE; + vrp->vrp_min_txg = 0; + vrp->vrp_max_txg = dmu_tx_get_txg(tx); + vrp->vrp_start_time = gethrestime_sec(); + vrp->vrp_scan_time_ms = 0; + vr->vr_prev_scan_time_ms = 0; + + /* + * Rebuilds are currently only used when replacing a device, in which + * case there must be DTL_MISSING entries. In the future, we could + * allow rebuilds to be used in a way similar to a scrub. This would + * be useful because it would allow us to rebuild the space used by + * pool checkpoints. + */ + VERIFY(vdev_resilver_needed(vd, &vrp->vrp_min_txg, &vrp->vrp_max_txg)); + + VERIFY0(zap_update(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, + VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t), + REBUILD_PHYS_ENTRIES, vrp, tx)); + + spa_history_log_internal(spa, "rebuild", tx, + "vdev_id=%llu vdev_guid=%llu started", + (u_longlong_t)vd->vdev_id, (u_longlong_t)vd->vdev_guid); + + ASSERT3P(vd->vdev_rebuild_thread, ==, NULL); + vd->vdev_rebuild_thread = thread_create(NULL, 0, + vdev_rebuild_thread, vd, 0, &p0, TS_RUN, maxclsyspri); + + mutex_exit(&vd->vdev_rebuild_lock); +} + +static void +vdev_rebuild_log_notify(spa_t *spa, vdev_t *vd, char *name) +{ + nvlist_t *aux = fnvlist_alloc(); + + fnvlist_add_string(aux, ZFS_EV_RESILVER_TYPE, "sequential"); + spa_event_notify(spa, vd, aux, name); + nvlist_free(aux); +} + +/* + * Called to request that a new rebuild be started. The feature will remain + * active for the duration of the rebuild, then revert to the enabled state. + */ +static void +vdev_rebuild_initiate(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + + ASSERT(vd->vdev_top == vd); + ASSERT(MUTEX_HELD(&vd->vdev_rebuild_lock)); + ASSERT(!vd->vdev_rebuilding); + + dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); + VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + + vd->vdev_rebuilding = B_TRUE; + + dsl_sync_task_nowait(spa_get_dsl(spa), vdev_rebuild_initiate_sync, + (void *)(uintptr_t)vd->vdev_id, 0, ZFS_SPACE_CHECK_NONE, tx); + dmu_tx_commit(tx); + + vdev_rebuild_log_notify(spa, vd, ESC_ZFS_RESILVER_START); +} + +/* + * Update the on-disk state to completed when a rebuild finishes. + */ +static void +vdev_rebuild_complete_sync(void *arg, dmu_tx_t *tx) +{ + int vdev_id = (uintptr_t)arg; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + vdev_t *vd = vdev_lookup_top(spa, vdev_id); + vdev_rebuild_t *vr = &vd->vdev_rebuild_config; + vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; + + mutex_enter(&vd->vdev_rebuild_lock); + vrp->vrp_rebuild_state = VDEV_REBUILD_COMPLETE; + vrp->vrp_end_time = gethrestime_sec(); + + VERIFY0(zap_update(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, + VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t), + REBUILD_PHYS_ENTRIES, vrp, tx)); + + vdev_dtl_reassess(vd, tx->tx_txg, vrp->vrp_max_txg, B_TRUE, B_TRUE); + spa_feature_decr(vd->vdev_spa, SPA_FEATURE_DEVICE_REBUILD, tx); + + spa_history_log_internal(spa, "rebuild", tx, + "vdev_id=%llu vdev_guid=%llu complete", + (u_longlong_t)vd->vdev_id, (u_longlong_t)vd->vdev_guid); + vdev_rebuild_log_notify(spa, vd, ESC_ZFS_RESILVER_FINISH); + + /* Handles detaching of spares */ + spa_async_request(spa, SPA_ASYNC_REBUILD_DONE); + vd->vdev_rebuilding = B_FALSE; + mutex_exit(&vd->vdev_rebuild_lock); + + spa_notify_waiters(spa); + cv_broadcast(&vd->vdev_rebuild_cv); +} + +/* + * Update the on-disk state to canceled when a rebuild finishes. + */ +static void +vdev_rebuild_cancel_sync(void *arg, dmu_tx_t *tx) +{ + int vdev_id = (uintptr_t)arg; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + vdev_t *vd = vdev_lookup_top(spa, vdev_id); + vdev_rebuild_t *vr = &vd->vdev_rebuild_config; + vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; + + mutex_enter(&vd->vdev_rebuild_lock); + vrp->vrp_rebuild_state = VDEV_REBUILD_CANCELED; + vrp->vrp_end_time = gethrestime_sec(); + + VERIFY0(zap_update(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, + VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t), + REBUILD_PHYS_ENTRIES, vrp, tx)); + + spa_feature_decr(vd->vdev_spa, SPA_FEATURE_DEVICE_REBUILD, tx); + + spa_history_log_internal(spa, "rebuild", tx, + "vdev_id=%llu vdev_guid=%llu canceled", + (u_longlong_t)vd->vdev_id, (u_longlong_t)vd->vdev_guid); + vdev_rebuild_log_notify(spa, vd, ESC_ZFS_RESILVER_FINISH); + + vd->vdev_rebuild_cancel_wanted = B_FALSE; + vd->vdev_rebuilding = B_FALSE; + mutex_exit(&vd->vdev_rebuild_lock); + + spa_notify_waiters(spa); + cv_broadcast(&vd->vdev_rebuild_cv); +} + +/* + * Resets the progress of a running rebuild. This will occur when a new + * vdev is added to rebuild. + */ +static void +vdev_rebuild_reset_sync(void *arg, dmu_tx_t *tx) +{ + int vdev_id = (uintptr_t)arg; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + vdev_t *vd = vdev_lookup_top(spa, vdev_id); + vdev_rebuild_t *vr = &vd->vdev_rebuild_config; + vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; + + mutex_enter(&vd->vdev_rebuild_lock); + + ASSERT(vrp->vrp_rebuild_state == VDEV_REBUILD_ACTIVE); + ASSERT3P(vd->vdev_rebuild_thread, ==, NULL); + + vrp->vrp_last_offset = 0; + vrp->vrp_min_txg = 0; + vrp->vrp_max_txg = dmu_tx_get_txg(tx); + vrp->vrp_bytes_scanned = 0; + vrp->vrp_bytes_issued = 0; + vrp->vrp_bytes_rebuilt = 0; + vrp->vrp_bytes_est = 0; + vrp->vrp_scan_time_ms = 0; + vr->vr_prev_scan_time_ms = 0; + + /* See vdev_rebuild_initiate_sync comment */ + VERIFY(vdev_resilver_needed(vd, &vrp->vrp_min_txg, &vrp->vrp_max_txg)); + + VERIFY0(zap_update(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, + VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t), + REBUILD_PHYS_ENTRIES, vrp, tx)); + + spa_history_log_internal(spa, "rebuild", tx, + "vdev_id=%llu vdev_guid=%llu reset", + (u_longlong_t)vd->vdev_id, (u_longlong_t)vd->vdev_guid); + + vd->vdev_rebuild_reset_wanted = B_FALSE; + ASSERT(vd->vdev_rebuilding); + + vd->vdev_rebuild_thread = thread_create(NULL, 0, + vdev_rebuild_thread, vd, 0, &p0, TS_RUN, maxclsyspri); + + mutex_exit(&vd->vdev_rebuild_lock); +} + +/* + * Clear the last rebuild status. + */ +void +vdev_rebuild_clear_sync(void *arg, dmu_tx_t *tx) +{ + int vdev_id = (uintptr_t)arg; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + vdev_t *vd = vdev_lookup_top(spa, vdev_id); + vdev_rebuild_t *vr = &vd->vdev_rebuild_config; + vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; + objset_t *mos = spa_meta_objset(spa); + + mutex_enter(&vd->vdev_rebuild_lock); + + if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD) || + vrp->vrp_rebuild_state == VDEV_REBUILD_ACTIVE) { + mutex_exit(&vd->vdev_rebuild_lock); + return; + } + + clear_rebuild_bytes(vd); + bzero(vrp, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES); + + if (vd->vdev_top_zap != 0 && zap_contains(mos, vd->vdev_top_zap, + VDEV_TOP_ZAP_VDEV_REBUILD_PHYS) == 0) { + VERIFY0(zap_update(mos, vd->vdev_top_zap, + VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t), + REBUILD_PHYS_ENTRIES, vrp, tx)); + } + + mutex_exit(&vd->vdev_rebuild_lock); +} + +/* + * The zio_done_func_t callback for each rebuild I/O issued. It's responsible + * for updating the rebuild stats and limiting the number of in flight I/Os. + */ +static void +vdev_rebuild_cb(zio_t *zio) +{ + vdev_rebuild_t *vr = zio->io_private; + vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; + vdev_t *vd = vr->vr_top_vdev; + + mutex_enter(&vd->vdev_rebuild_io_lock); + if (zio->io_error == ENXIO && !vdev_writeable(vd)) { + /* + * The I/O failed because the top-level vdev was unavailable. + * Attempt to roll back to the last completed offset, in order + * resume from the correct location if the pool is resumed. + * (This works because spa_sync waits on spa_txg_zio before + * it runs sync tasks.) + */ + uint64_t *off = &vr->vr_scan_offset[zio->io_txg & TXG_MASK]; + *off = MIN(*off, zio->io_offset); + } else if (zio->io_error) { + vrp->vrp_errors++; + } + + abd_free(zio->io_abd); + + ASSERT3U(vd->vdev_rebuild_inflight, >, 0); + vd->vdev_rebuild_inflight--; + cv_broadcast(&vd->vdev_rebuild_io_cv); + mutex_exit(&vd->vdev_rebuild_io_lock); + + spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd); +} + +/* + * Rebuild the data in this range by constructing a special dummy block + * pointer for the given range. It has no relation to any existing blocks + * in the pool. But by disabling checksum verification and issuing a scrub + * I/O mirrored vdevs will replicate the block using any available mirror + * leaf vdevs. + */ +static void +vdev_rebuild_rebuild_block(vdev_rebuild_t *vr, uint64_t start, uint64_t asize, + uint64_t txg) +{ + vdev_t *vd = vr->vr_top_vdev; + spa_t *spa = vd->vdev_spa; + uint64_t psize = asize; + + ASSERT(vd->vdev_ops == &vdev_mirror_ops || + vd->vdev_ops == &vdev_replacing_ops || + vd->vdev_ops == &vdev_spare_ops); + + blkptr_t blk, *bp = &blk; + BP_ZERO(bp); + + DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id); + DVA_SET_OFFSET(&bp->blk_dva[0], start); + DVA_SET_GANG(&bp->blk_dva[0], 0); + DVA_SET_ASIZE(&bp->blk_dva[0], asize); + + BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL); + BP_SET_LSIZE(bp, psize); + BP_SET_PSIZE(bp, psize); + BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); + BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF); + BP_SET_TYPE(bp, DMU_OT_NONE); + BP_SET_LEVEL(bp, 0); + BP_SET_DEDUP(bp, 0); + BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); + + /* + * We increment the issued bytes by the asize rather than the psize + * so the scanned and issued bytes may be directly compared. This + * is consistent with the scrub/resilver issued reporting. + */ + vr->vr_pass_bytes_issued += asize; + vr->vr_rebuild_phys.vrp_bytes_issued += asize; + + zio_nowait(zio_read(spa->spa_txg_zio[txg & TXG_MASK], spa, bp, + abd_alloc(psize, B_FALSE), psize, vdev_rebuild_cb, vr, + ZIO_PRIORITY_REBUILD, ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL | + ZIO_FLAG_RESILVER, NULL)); +} + +/* + * Issues a rebuild I/O and takes care of rate limiting the number of queued + * rebuild I/Os. The provided start and size must be properly aligned for the + * top-level vdev type being rebuilt. + */ +static int +vdev_rebuild_range(vdev_rebuild_t *vr, uint64_t start, uint64_t size) +{ + uint64_t ms_id __maybe_unused = vr->vr_scan_msp->ms_id; + vdev_t *vd = vr->vr_top_vdev; + spa_t *spa = vd->vdev_spa; + + ASSERT3U(ms_id, ==, start >> vd->vdev_ms_shift); + ASSERT3U(ms_id, ==, (start + size - 1) >> vd->vdev_ms_shift); + + vr->vr_pass_bytes_scanned += size; + vr->vr_rebuild_phys.vrp_bytes_scanned += size; + + mutex_enter(&vd->vdev_rebuild_io_lock); + + /* Limit in flight rebuild I/Os */ + while (vd->vdev_rebuild_inflight >= zfs_rebuild_queue_limit) + cv_wait(&vd->vdev_rebuild_io_cv, &vd->vdev_rebuild_io_lock); + + vd->vdev_rebuild_inflight++; + mutex_exit(&vd->vdev_rebuild_io_lock); + + dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); + VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + uint64_t txg = dmu_tx_get_txg(tx); + + spa_config_enter(spa, SCL_STATE_ALL, vd, RW_READER); + mutex_enter(&vd->vdev_rebuild_lock); + + /* This is the first I/O for this txg. */ + if (vr->vr_scan_offset[txg & TXG_MASK] == 0) { + vr->vr_scan_offset[txg & TXG_MASK] = start; + dsl_sync_task_nowait(spa_get_dsl(spa), + vdev_rebuild_update_sync, + (void *)(uintptr_t)vd->vdev_id, 2, + ZFS_SPACE_CHECK_RESERVED, tx); + } + + /* When exiting write out our progress. */ + if (vdev_rebuild_should_stop(vd)) { + mutex_enter(&vd->vdev_rebuild_io_lock); + vd->vdev_rebuild_inflight--; + mutex_exit(&vd->vdev_rebuild_io_lock); + spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd); + mutex_exit(&vd->vdev_rebuild_lock); + dmu_tx_commit(tx); + return (SET_ERROR(EINTR)); + } + mutex_exit(&vd->vdev_rebuild_lock); + + vr->vr_scan_offset[txg & TXG_MASK] = start + size; + vdev_rebuild_rebuild_block(vr, start, size, txg); + + dmu_tx_commit(tx); + + return (0); +} + +/* + * Split range into legally-sized logical chunks given the constraints of the + * top-level mirror vdev type. + */ +static uint64_t +vdev_rebuild_chunk_size(vdev_t *vd, uint64_t start, uint64_t size) +{ + uint64_t chunk_size, max_asize, max_segment; + + ASSERT(vd->vdev_ops == &vdev_mirror_ops || + vd->vdev_ops == &vdev_replacing_ops || + vd->vdev_ops == &vdev_spare_ops); + + max_segment = MIN(P2ROUNDUP(zfs_rebuild_max_segment, + 1 << vd->vdev_ashift), SPA_MAXBLOCKSIZE); + max_asize = vdev_psize_to_asize(vd, max_segment); + chunk_size = MIN(size, max_asize); + + return (chunk_size); +} + +/* + * Issues rebuild I/Os for all ranges in the provided vr->vr_tree range tree. + */ +static int +vdev_rebuild_ranges(vdev_rebuild_t *vr) +{ + vdev_t *vd = vr->vr_top_vdev; + zfs_btree_t *t = &vr->vr_scan_tree->rt_root; + zfs_btree_index_t idx; + int error; + + for (range_seg_t *rs = zfs_btree_first(t, &idx); rs != NULL; + rs = zfs_btree_next(t, &idx, &idx)) { + uint64_t start = rs_get_start(rs, vr->vr_scan_tree); + uint64_t size = rs_get_end(rs, vr->vr_scan_tree) - start; + + /* + * zfs_scan_suspend_progress can be set to disable rebuild + * progress for testing. See comment in dsl_scan_sync(). + */ + while (zfs_scan_suspend_progress && + !vdev_rebuild_should_stop(vd)) { + delay(hz); + } + + while (size > 0) { + uint64_t chunk_size; + + chunk_size = vdev_rebuild_chunk_size(vd, start, size); + + error = vdev_rebuild_range(vr, start, chunk_size); + if (error != 0) + return (error); + + size -= chunk_size; + start += chunk_size; + } + } + + return (0); +} + +/* + * Calculates the estimated capacity which remains to be scanned. Since + * we traverse the pool in metaslab order only allocated capacity beyond + * the vrp_last_offset need be considered. All lower offsets must have + * already been rebuilt and are thus already included in vrp_bytes_scanned. + */ +static void +vdev_rebuild_update_bytes_est(vdev_t *vd, uint64_t ms_id) +{ + vdev_rebuild_t *vr = &vd->vdev_rebuild_config; + vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; + uint64_t bytes_est = vrp->vrp_bytes_scanned; + + if (vrp->vrp_last_offset < vd->vdev_ms[ms_id]->ms_start) + return; + + for (uint64_t i = ms_id; i < vd->vdev_ms_count; i++) { + metaslab_t *msp = vd->vdev_ms[i]; + + mutex_enter(&msp->ms_lock); + bytes_est += metaslab_allocated_space(msp); + mutex_exit(&msp->ms_lock); + } + + vrp->vrp_bytes_est = bytes_est; +} + +/* + * Load from disk the top-level vdev's rebuild information. + */ +int +vdev_rebuild_load(vdev_t *vd) +{ + vdev_rebuild_t *vr = &vd->vdev_rebuild_config; + vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; + spa_t *spa = vd->vdev_spa; + int err = 0; + + mutex_enter(&vd->vdev_rebuild_lock); + vd->vdev_rebuilding = B_FALSE; + + if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD)) { + bzero(vrp, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES); + mutex_exit(&vd->vdev_rebuild_lock); + return (SET_ERROR(ENOTSUP)); + } + + ASSERT(vd->vdev_top == vd); + + err = zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap, + VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t), + REBUILD_PHYS_ENTRIES, vrp); + + /* + * A missing or damaged VDEV_TOP_ZAP_VDEV_REBUILD_PHYS should + * not prevent a pool from being imported. Clear the rebuild + * status allowing a new resilver/rebuild to be started. + */ + if (err == ENOENT || err == EOVERFLOW || err == ECKSUM) { + bzero(vrp, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES); + } else if (err) { + mutex_exit(&vd->vdev_rebuild_lock); + return (err); + } + + vr->vr_prev_scan_time_ms = vrp->vrp_scan_time_ms; + vr->vr_top_vdev = vd; + + mutex_exit(&vd->vdev_rebuild_lock); + + return (0); +} + +/* + * Each scan thread is responsible for rebuilding a top-level vdev. The + * rebuild progress in tracked on-disk in VDEV_TOP_ZAP_VDEV_REBUILD_PHYS. + */ +static void +vdev_rebuild_thread(void *arg) +{ + vdev_t *vd = arg; + spa_t *spa = vd->vdev_spa; + int error = 0; + + /* + * If there's a scrub in process request that it be stopped. This + * is not required for a correct rebuild, but we do want rebuilds to + * emulate the resilver behavior as much as possible. + */ + dsl_pool_t *dsl = spa_get_dsl(spa); + if (dsl_scan_scrubbing(dsl)) + dsl_scan_cancel(dsl); + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + mutex_enter(&vd->vdev_rebuild_lock); + + ASSERT3P(vd->vdev_top, ==, vd); + ASSERT3P(vd->vdev_rebuild_thread, !=, NULL); + ASSERT(vd->vdev_rebuilding); + ASSERT(spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REBUILD)); + ASSERT3B(vd->vdev_rebuild_cancel_wanted, ==, B_FALSE); + ASSERT3B(vd->vdev_rebuild_reset_wanted, ==, B_FALSE); + + vdev_rebuild_t *vr = &vd->vdev_rebuild_config; + vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; + vr->vr_top_vdev = vd; + vr->vr_scan_msp = NULL; + vr->vr_scan_tree = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); + vr->vr_pass_start_time = gethrtime(); + vr->vr_pass_bytes_scanned = 0; + vr->vr_pass_bytes_issued = 0; + + uint64_t update_est_time = gethrtime(); + vdev_rebuild_update_bytes_est(vd, 0); + + clear_rebuild_bytes(vr->vr_top_vdev); + + mutex_exit(&vd->vdev_rebuild_lock); + + /* + * Systematically walk the metaslabs and issue rebuild I/Os for + * all ranges in the allocated space map. + */ + for (uint64_t i = 0; i < vd->vdev_ms_count; i++) { + metaslab_t *msp = vd->vdev_ms[i]; + vr->vr_scan_msp = msp; + + /* + * Removal of vdevs from the vdev tree may eliminate the need + * for the rebuild, in which case it should be canceled. The + * vdev_rebuild_cancel_wanted flag is set until the sync task + * completes. This may be after the rebuild thread exits. + */ + if (vdev_rebuild_should_cancel(vd)) { + vd->vdev_rebuild_cancel_wanted = B_TRUE; + error = EINTR; + break; + } + + ASSERT0(range_tree_space(vr->vr_scan_tree)); + + /* + * Disable any new allocations to this metaslab and wait + * for any writes inflight to complete. This is needed to + * ensure all allocated ranges are rebuilt. + */ + metaslab_disable(msp); + spa_config_exit(spa, SCL_CONFIG, FTAG); + txg_wait_synced(dsl, 0); + + mutex_enter(&msp->ms_sync_lock); + mutex_enter(&msp->ms_lock); + + /* + * When a metaslab has been allocated from read its allocated + * ranges from the space map object in to the vr_scan_tree. + * Then add inflight / unflushed ranges and remove inflight / + * unflushed frees. This is the minimum range to be rebuilt. + */ + if (msp->ms_sm != NULL) { + VERIFY0(space_map_load(msp->ms_sm, + vr->vr_scan_tree, SM_ALLOC)); + + for (int i = 0; i < TXG_SIZE; i++) { + ASSERT0(range_tree_space( + msp->ms_allocating[i])); + } + + range_tree_walk(msp->ms_unflushed_allocs, + range_tree_add, vr->vr_scan_tree); + range_tree_walk(msp->ms_unflushed_frees, + range_tree_remove, vr->vr_scan_tree); + + /* + * Remove ranges which have already been rebuilt based + * on the last offset. This can happen when restarting + * a scan after exporting and re-importing the pool. + */ + range_tree_clear(vr->vr_scan_tree, 0, + vrp->vrp_last_offset); + } + + mutex_exit(&msp->ms_lock); + mutex_exit(&msp->ms_sync_lock); + + /* + * To provide an accurate estimate re-calculate the estimated + * size every 5 minutes to account for recent allocations and + * frees made space maps which have not yet been rebuilt. + */ + if (gethrtime() > update_est_time + SEC2NSEC(300)) { + update_est_time = gethrtime(); + vdev_rebuild_update_bytes_est(vd, i); + } + + /* + * Walk the allocated space map and issue the rebuild I/O. + */ + error = vdev_rebuild_ranges(vr); + range_tree_vacate(vr->vr_scan_tree, NULL, NULL); + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + metaslab_enable(msp, B_FALSE, B_FALSE); + + if (error != 0) + break; + } + + range_tree_destroy(vr->vr_scan_tree); + spa_config_exit(spa, SCL_CONFIG, FTAG); + + /* Wait for any remaining rebuild I/O to complete */ + mutex_enter(&vd->vdev_rebuild_io_lock); + while (vd->vdev_rebuild_inflight > 0) + cv_wait(&vd->vdev_rebuild_io_cv, &vd->vdev_rebuild_io_lock); + + mutex_exit(&vd->vdev_rebuild_io_lock); + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + + dsl_pool_t *dp = spa_get_dsl(spa); + dmu_tx_t *tx = dmu_tx_create_dd(dp->dp_mos_dir); + VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + + mutex_enter(&vd->vdev_rebuild_lock); + if (error == 0) { + /* + * After a successful rebuild clear the DTLs of all ranges + * which were missing when the rebuild was started. These + * ranges must have been rebuilt as a consequence of rebuilding + * all allocated space. Note that unlike a scrub or resilver + * the rebuild operation will reconstruct data only referenced + * by a pool checkpoint. See the dsl_scan_done() comments. + */ + dsl_sync_task_nowait(dp, vdev_rebuild_complete_sync, + (void *)(uintptr_t)vd->vdev_id, 0, + ZFS_SPACE_CHECK_NONE, tx); + } else if (vd->vdev_rebuild_cancel_wanted) { + /* + * The rebuild operation was canceled. This will occur when + * a device participating in the rebuild is detached. + */ + dsl_sync_task_nowait(dp, vdev_rebuild_cancel_sync, + (void *)(uintptr_t)vd->vdev_id, 0, + ZFS_SPACE_CHECK_NONE, tx); + } else if (vd->vdev_rebuild_reset_wanted) { + /* + * Reset the running rebuild without canceling and restarting + * it. This will occur when a new device is attached and must + * participate in the rebuild. + */ + dsl_sync_task_nowait(dp, vdev_rebuild_reset_sync, + (void *)(uintptr_t)vd->vdev_id, 0, + ZFS_SPACE_CHECK_NONE, tx); + } else { + /* + * The rebuild operation should be suspended. This may occur + * when detaching a child vdev or when exporting the pool. The + * rebuild is left in the active state so it will be resumed. + */ + ASSERT(vrp->vrp_rebuild_state == VDEV_REBUILD_ACTIVE); + vd->vdev_rebuilding = B_FALSE; + } + + dmu_tx_commit(tx); + + vd->vdev_rebuild_thread = NULL; + mutex_exit(&vd->vdev_rebuild_lock); + spa_config_exit(spa, SCL_CONFIG, FTAG); + + cv_broadcast(&vd->vdev_rebuild_cv); +} + +/* + * Returns B_TRUE if any top-level vdev are rebuilding. + */ +boolean_t +vdev_rebuild_active(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + boolean_t ret = B_FALSE; + + if (vd == spa->spa_root_vdev) { + for (uint64_t i = 0; i < vd->vdev_children; i++) { + ret = vdev_rebuild_active(vd->vdev_child[i]); + if (ret) + return (ret); + } + } else if (vd->vdev_top_zap != 0) { + vdev_rebuild_t *vr = &vd->vdev_rebuild_config; + vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; + + mutex_enter(&vd->vdev_rebuild_lock); + ret = (vrp->vrp_rebuild_state == VDEV_REBUILD_ACTIVE); + mutex_exit(&vd->vdev_rebuild_lock); + } + + return (ret); +} + +/* + * Start a rebuild operation. The rebuild may be restarted when the + * top-level vdev is currently actively rebuilding. + */ +void +vdev_rebuild(vdev_t *vd) +{ + vdev_rebuild_t *vr = &vd->vdev_rebuild_config; + vdev_rebuild_phys_t *vrp __maybe_unused = &vr->vr_rebuild_phys; + + ASSERT(vd->vdev_top == vd); + ASSERT(vdev_is_concrete(vd)); + ASSERT(!vd->vdev_removing); + ASSERT(spa_feature_is_enabled(vd->vdev_spa, + SPA_FEATURE_DEVICE_REBUILD)); + + mutex_enter(&vd->vdev_rebuild_lock); + if (vd->vdev_rebuilding) { + ASSERT3U(vrp->vrp_rebuild_state, ==, VDEV_REBUILD_ACTIVE); + + /* + * Signal a running rebuild operation that it should restart + * from the beginning because a new device was attached. The + * vdev_rebuild_reset_wanted flag is set until the sync task + * completes. This may be after the rebuild thread exits. + */ + if (!vd->vdev_rebuild_reset_wanted) + vd->vdev_rebuild_reset_wanted = B_TRUE; + } else { + vdev_rebuild_initiate(vd); + } + mutex_exit(&vd->vdev_rebuild_lock); +} + +static void +vdev_rebuild_restart_impl(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + + if (vd == spa->spa_root_vdev) { + for (uint64_t i = 0; i < vd->vdev_children; i++) + vdev_rebuild_restart_impl(vd->vdev_child[i]); + + } else if (vd->vdev_top_zap != 0) { + vdev_rebuild_t *vr = &vd->vdev_rebuild_config; + vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; + + mutex_enter(&vd->vdev_rebuild_lock); + if (vrp->vrp_rebuild_state == VDEV_REBUILD_ACTIVE && + vdev_writeable(vd) && !vd->vdev_rebuilding) { + ASSERT(spa_feature_is_active(spa, + SPA_FEATURE_DEVICE_REBUILD)); + vd->vdev_rebuilding = B_TRUE; + vd->vdev_rebuild_thread = thread_create(NULL, 0, + vdev_rebuild_thread, vd, 0, &p0, TS_RUN, + maxclsyspri); + } + mutex_exit(&vd->vdev_rebuild_lock); + } +} + +/* + * Conditionally restart all of the vdev_rebuild_thread's for a pool. The + * feature flag must be active and the rebuild in the active state. This + * cannot be used to start a new rebuild. + */ +void +vdev_rebuild_restart(spa_t *spa) +{ + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + vdev_rebuild_restart_impl(spa->spa_root_vdev); +} + +/* + * Stop and wait for all of the vdev_rebuild_thread's associated with the + * vdev tree provide to be terminated (canceled or stopped). + */ +void +vdev_rebuild_stop_wait(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + if (vd == spa->spa_root_vdev) { + for (uint64_t i = 0; i < vd->vdev_children; i++) + vdev_rebuild_stop_wait(vd->vdev_child[i]); + + } else if (vd->vdev_top_zap != 0) { + ASSERT(vd == vd->vdev_top); + + mutex_enter(&vd->vdev_rebuild_lock); + if (vd->vdev_rebuild_thread != NULL) { + vd->vdev_rebuild_exit_wanted = B_TRUE; + while (vd->vdev_rebuilding) { + cv_wait(&vd->vdev_rebuild_cv, + &vd->vdev_rebuild_lock); + } + vd->vdev_rebuild_exit_wanted = B_FALSE; + } + mutex_exit(&vd->vdev_rebuild_lock); + } +} + +/* + * Stop all rebuild operations but leave them in the active state so they + * will be resumed when importing the pool. + */ +void +vdev_rebuild_stop_all(spa_t *spa) +{ + vdev_rebuild_stop_wait(spa->spa_root_vdev); +} + +/* + * Rebuild statistics reported per top-level vdev. + */ +int +vdev_rebuild_get_stats(vdev_t *tvd, vdev_rebuild_stat_t *vrs) +{ + spa_t *spa = tvd->vdev_spa; + + if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD)) + return (SET_ERROR(ENOTSUP)); + + if (tvd != tvd->vdev_top || tvd->vdev_top_zap == 0) + return (SET_ERROR(EINVAL)); + + int error = zap_contains(spa_meta_objset(spa), + tvd->vdev_top_zap, VDEV_TOP_ZAP_VDEV_REBUILD_PHYS); + + if (error == ENOENT) { + bzero(vrs, sizeof (vdev_rebuild_stat_t)); + vrs->vrs_state = VDEV_REBUILD_NONE; + error = 0; + } else if (error == 0) { + vdev_rebuild_t *vr = &tvd->vdev_rebuild_config; + vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; + + mutex_enter(&tvd->vdev_rebuild_lock); + vrs->vrs_state = vrp->vrp_rebuild_state; + vrs->vrs_start_time = vrp->vrp_start_time; + vrs->vrs_end_time = vrp->vrp_end_time; + vrs->vrs_scan_time_ms = vrp->vrp_scan_time_ms; + vrs->vrs_bytes_scanned = vrp->vrp_bytes_scanned; + vrs->vrs_bytes_issued = vrp->vrp_bytes_issued; + vrs->vrs_bytes_rebuilt = vrp->vrp_bytes_rebuilt; + vrs->vrs_bytes_est = vrp->vrp_bytes_est; + vrs->vrs_errors = vrp->vrp_errors; + vrs->vrs_pass_time_ms = NSEC2MSEC(gethrtime() - + vr->vr_pass_start_time); + vrs->vrs_pass_bytes_scanned = vr->vr_pass_bytes_scanned; + vrs->vrs_pass_bytes_issued = vr->vr_pass_bytes_issued; + mutex_exit(&tvd->vdev_rebuild_lock); + } + + return (error); +} + +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs, zfs_, rebuild_max_segment, ULONG, ZMOD_RW, + "Max segment size in bytes of rebuild reads"); +/* END CSTYLED */ diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index 4122114b5619..1d2ae6270546 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -1938,8 +1938,9 @@ static int zfs_ioc_vdev_attach(zfs_cmd_t *zc) { spa_t *spa; - int replacing = zc->zc_cookie; nvlist_t *config; + int replacing = zc->zc_cookie; + int rebuild = zc->zc_simple; int error; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) @@ -1947,7 +1948,8 @@ zfs_ioc_vdev_attach(zfs_cmd_t *zc) if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &config)) == 0) { - error = spa_vdev_attach(spa, zc->zc_guid, config, replacing); + error = spa_vdev_attach(spa, zc->zc_guid, config, replacing, + rebuild); nvlist_free(config); } diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 765ffea8a302..f6478dd0d292 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -487,7 +487,8 @@ tests = ['zpool_wait_discard', 'zpool_wait_freeing', tags = ['functional', 'cli_root', 'zpool_wait'] [tests/functional/cli_root/zpool_wait/scan] -tests = ['zpool_wait_replace_cancel', 'zpool_wait_resilver', 'zpool_wait_scrub_cancel', +tests = ['zpool_wait_replace_cancel', 'zpool_wait_rebuild', + 'zpool_wait_resilver', 'zpool_wait_scrub_cancel', 'zpool_wait_replace', 'zpool_wait_scrub_basic', 'zpool_wait_scrub_flag'] tags = ['functional', 'cli_root', 'zpool_wait'] @@ -748,7 +749,11 @@ tests = ['rename_dirs_001_pos'] tags = ['functional', 'rename_dirs'] [tests/functional/replacement] -tests = ['replacement_001_pos', 'replacement_002_pos', 'replacement_003_pos'] +tests = ['attach_import', 'attach_multiple', 'attach_rebuild', + 'attach_resilver', 'detach', 'rebuild_disabled_feature', + 'rebuild_multiple', 'rebuild_raidz', 'replace_import', 'replace_rebuild', + 'replace_resilver', 'resilver_restart_001', 'resilver_restart_002', + 'scrub_cancel'] tags = ['functional', 'replacement'] [tests/functional/reservation] @@ -762,10 +767,6 @@ tests = ['reservation_001_pos', 'reservation_002_pos', 'reservation_003_pos', 'reservation_022_pos'] tags = ['functional', 'reservation'] -[tests/functional/resilver] -tests = ['resilver_restart_001', 'resilver_restart_002'] -tags = ['functional', 'resilver'] - [tests/functional/rootpool] tests = ['rootpool_002_neg', 'rootpool_003_neg', 'rootpool_007_pos'] tags = ['functional', 'rootpool'] diff --git a/tests/zfs-tests/include/libtest.shlib b/tests/zfs-tests/include/libtest.shlib index 9fbcc37c610c..5e07cda4d791 100644 --- a/tests/zfs-tests/include/libtest.shlib +++ b/tests/zfs-tests/include/libtest.shlib @@ -2222,26 +2222,27 @@ function check_pool_status # pool token keyword <verbose> if [[ $verbose == true ]]; then log_note $scan fi - echo $scan | grep -i "$keyword" > /dev/null 2>&1 + echo $scan | egrep -i "$keyword" > /dev/null 2>&1 return $? } # # The following functions are instance of check_pool_status() -# is_pool_resilvering - to check if the pool is resilver in progress -# is_pool_resilvered - to check if the pool is resilver completed -# is_pool_scrubbing - to check if the pool is scrub in progress -# is_pool_scrubbed - to check if the pool is scrub completed -# is_pool_scrub_stopped - to check if the pool is scrub stopped -# is_pool_scrub_paused - to check if the pool has scrub paused -# is_pool_removing - to check if the pool is removing a vdev -# is_pool_removed - to check if the pool is remove completed -# is_pool_discarding - to check if the pool has checkpoint being discarded +# is_pool_resilvering - to check if the pool resilver is in progress +# is_pool_resilvered - to check if the pool resilver is completed +# is_pool_scrubbing - to check if the pool scrub is in progress +# is_pool_scrubbed - to check if the pool scrub is completed +# is_pool_scrub_stopped - to check if the pool scrub is stopped +# is_pool_scrub_paused - to check if the pool scrub has paused +# is_pool_removing - to check if the pool removing is a vdev +# is_pool_removed - to check if the pool remove is completed +# is_pool_discarding - to check if the pool checkpoint is being discarded # function is_pool_resilvering #pool <verbose> { - check_pool_status "$1" "scan" "resilver in progress since " $2 + check_pool_status "$1" "scan" \ + "resilver[ ()0-9A-Za-z_-]* in progress since" $2 return $? } @@ -3487,7 +3488,7 @@ function wait_scrubbed typeset pool=${1:-$TESTPOOL} while true ; do is_pool_scrubbed $pool && break - log_must sleep 1 + sleep 1 done } diff --git a/tests/zfs-tests/tests/functional/Makefile.am b/tests/zfs-tests/tests/functional/Makefile.am index 24f3e50bb5c6..c56518c55a03 100644 --- a/tests/zfs-tests/tests/functional/Makefile.am +++ b/tests/zfs-tests/tests/functional/Makefile.am @@ -65,7 +65,6 @@ SUBDIRS = \ rename_dirs \ replacement \ reservation \ - resilver \ rootpool \ rsend \ scrub_mirror \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg index ee5b2b4e1740..4991b76bfa57 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg @@ -79,6 +79,7 @@ typeset -a properties=( "feature@redacted_datasets" "feature@bookmark_written" "feature@log_spacemap" + "feature@device_rebuild" ) if is_linux || is_freebsd; then diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/Makefile.am b/tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/Makefile.am index 6a21cac4fdae..451d83a79aa6 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/Makefile.am +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/Makefile.am @@ -4,6 +4,7 @@ dist_pkgdata_SCRIPTS = \ cleanup.ksh \ zpool_wait_replace.ksh \ zpool_wait_replace_cancel.ksh \ + zpool_wait_rebuild.ksh \ zpool_wait_resilver.ksh \ zpool_wait_scrub_basic.ksh \ zpool_wait_scrub_cancel.ksh \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/zpool_wait_rebuild.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/zpool_wait_rebuild.ksh new file mode 100755 index 000000000000..8cd5864597af --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/zpool_wait_rebuild.ksh @@ -0,0 +1,64 @@ +#!/bin/ksh -p +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2018 by Delphix. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_wait/zpool_wait.kshlib + +# +# DESCRIPTION: +# 'zpool wait' works when waiting for sequential resilvering to complete. +# +# STRATEGY: +# 1. Attach a device to the pool so that sequential resilvering starts. +# 2. Start 'zpool wait'. +# 3. Monitor the waiting process to make sure it returns neither too soon nor +# too late. +# 4. Repeat 1-3, except using the '-w' flag with 'zpool attach' instead of using +# 'zpool wait'. +# + +function cleanup +{ + remove_io_delay + kill_if_running $pid + get_disklist $TESTPOOL | grep $DISK2 >/dev/null && \ + log_must zpool detach $TESTPOOL $DISK2 +} + +typeset -r IN_PROGRESS_CHECK="is_pool_resilvering $TESTPOOL" +typeset pid + +log_onexit cleanup + +add_io_delay $TESTPOOL + +# Test 'zpool wait -t resilver' +log_must zpool attach -s $TESTPOOL $DISK1 $DISK2 +log_bkgrnd zpool wait -t resilver $TESTPOOL +pid=$! +check_while_waiting $pid "$IN_PROGRESS_CHECK" + +log_must zpool detach $TESTPOOL $DISK2 + +# Test 'zpool attach -w' +log_bkgrnd zpool attach -sw $TESTPOOL $DISK1 $DISK2 +pid=$! +while ! is_pool_resilvering $TESTPOOL && proc_exists $pid; do + log_must sleep .5 +done +check_while_waiting $pid "$IN_PROGRESS_CHECK" + +log_pass "'zpool wait -t resilver' and 'zpool attach -w' work." diff --git a/tests/zfs-tests/tests/functional/replacement/Makefile.am b/tests/zfs-tests/tests/functional/replacement/Makefile.am index d47fcd5e1b24..fe6e4912198d 100644 --- a/tests/zfs-tests/tests/functional/replacement/Makefile.am +++ b/tests/zfs-tests/tests/functional/replacement/Makefile.am @@ -2,9 +2,20 @@ pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/replacement dist_pkgdata_SCRIPTS = \ setup.ksh \ cleanup.ksh \ - replacement_001_pos.ksh \ - replacement_002_pos.ksh \ - replacement_003_pos.ksh + attach_import.ksh \ + attach_multiple.ksh \ + attach_rebuild.ksh \ + attach_resilver.ksh \ + detach.ksh \ + rebuild_disabled_feature.ksh \ + rebuild_multiple.ksh \ + rebuild_raidz.ksh \ + replace_import.ksh \ + replace_rebuild.ksh \ + replace_resilver.ksh \ + resilver_restart_001.ksh \ + resilver_restart_002.ksh \ + scrub_cancel.ksh dist_pkgdata_DATA = \ replacement.cfg diff --git a/tests/zfs-tests/tests/functional/replacement/attach_import.ksh b/tests/zfs-tests/tests/functional/replacement/attach_import.ksh new file mode 100755 index 000000000000..e2749b164efe --- /dev/null +++ b/tests/zfs-tests/tests/functional/replacement/attach_import.ksh @@ -0,0 +1,67 @@ +#!/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2019, Datto Inc. All rights reserved. +# Copyright (c) 2020 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/replacement/replacement.cfg + +# +# Description: +# Verify that on import an in progress attach operation is resumed. +# +# Strategy: +# 1. For both healing and sequential resilvering. +# a. Create a pool +# b. Add a vdev with 'zpool attach' and resilver (-s) it. +# c. Export the pool +# d. Import the pool +# e. Verify the 'zpool attach' resumed resilvering +# f. Destroy the pool +# + +function cleanup +{ + log_must set_tunable32 SCAN_SUSPEND_PROGRESS \ + $ORIG_SCAN_SUSPEND_PROGRESS + destroy_pool $TESTPOOL1 + rm -f ${VDEV_FILES[@]} +} + +log_assert "Verify attach is resumed on import" + +ORIG_SCAN_SUSPEND_PROGRESS=$(get_tunable SCAN_SUSPEND_PROGRESS) + +log_onexit cleanup + +log_must truncate -s $VDEV_FILE_SIZE ${VDEV_FILES[@]} + +# Verify healing and sequential resilver resume on import. +for arg in "" "-s"; do + log_must zpool create -f $TESTPOOL1 ${VDEV_FILES[0]} + log_must set_tunable32 SCAN_SUSPEND_PROGRESS 1 + log_must zpool attach $arg $TESTPOOL1 ${VDEV_FILES[0]} ${VDEV_FILES[1]} + log_must is_pool_resilvering $TESTPOOL1 + log_must zpool export $TESTPOOL1 + log_must zpool import -d $TEST_BASE_DIR $TESTPOOL1 + log_must is_pool_resilvering $TESTPOOL1 + log_must set_tunable32 SCAN_SUSPEND_PROGRESS $ORIG_SCAN_SUSPEND_PROGRESS + log_must zpool wait -t resilver $TESTPOOL1 + log_must is_pool_resilvered $TESTPOOL1 + destroy_pool $TESTPOOL1 +done + +log_pass "Verify attach is resumed on import" diff --git a/tests/zfs-tests/tests/functional/replacement/attach_multiple.ksh b/tests/zfs-tests/tests/functional/replacement/attach_multiple.ksh new file mode 100755 index 000000000000..b3192b2bfbda --- /dev/null +++ b/tests/zfs-tests/tests/functional/replacement/attach_multiple.ksh @@ -0,0 +1,111 @@ +#!/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2019, Datto Inc. All rights reserved. +# Copyright (c) 2020 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/replacement/replacement.cfg + +# +# Description: +# Verify that attach/detach work while resilvering and attaching +# multiple vdevs. +# +# Strategy: +# 1. Create a single vdev pool +# 2. While healing or sequential resilvering: +# a. Attach a vdev to convert the pool to a mirror. +# b. Attach a vdev to convert the pool to a 3-way mirror. +# c. Verify the original vdev cannot be removed (no redundant copies) +# d. Detach a vdev. Healing and sequential resilver remain running. +# e. Detach a vdev. Healing resilver remains running, sequential +# resilver is canceled. +# f. Wait for resilver to complete. +# + +function cleanup +{ + log_must set_tunable32 SCAN_SUSPEND_PROGRESS \ + $ORIG_SCAN_SUSPEND_PROGRESS + destroy_pool $TESTPOOL1 + rm -f ${VDEV_FILES[@]} +} + +log_assert "Verify attach/detech with multiple vdevs" + +ORIG_SCAN_SUSPEND_PROGRESS=$(get_tunable SCAN_SUSPEND_PROGRESS) + +log_onexit cleanup + +log_must truncate -s $VDEV_FILE_SIZE ${VDEV_FILES[@]} + +# Verify resilver resumes on import. +log_must zpool create -f $TESTPOOL1 ${VDEV_FILES[0]} + +for replace_mode in "healing" "sequential"; do + # + # Resilvers abort the dsl_scan and reconfigure it for resilvering. + # Rebuilds cancel the dsl_scan and start the vdev_rebuild thread. + # + if [[ "$replace_mode" = "healing" ]]; then + flags="" + else + flags="-s" + fi + + log_mustnot is_pool_resilvering $TESTPOOL1 + log_must set_tunable32 SCAN_SUSPEND_PROGRESS 1 + + # Attach first vdev (stripe -> mirror) + log_must zpool attach $flags $TESTPOOL1 \ + ${VDEV_FILES[0]} ${VDEV_FILES[1]} + log_must is_pool_resilvering $TESTPOOL1 + + # Attach second vdev (2-way -> 3-way mirror) + log_must zpool attach $flags $TESTPOOL1 \ + ${VDEV_FILES[1]} ${VDEV_FILES[2]} + log_must is_pool_resilvering $TESTPOOL1 + + # Original vdev cannot be detached until there is sufficent redundancy. + log_mustnot zpool detach $TESTPOOL1 ${VDEV_FILES[0]} + + # Detach first vdev (resilver keeps running) + log_must zpool detach $TESTPOOL1 ${VDEV_FILES[1]} + log_must is_pool_resilvering $TESTPOOL1 + + # + # Detach second vdev. There's a difference in behavior between + # healing and sequential resilvers. A healing resilver will not be + # cancelled even though there's nothing on the original vdev which + # needs to be rebuilt. A sequential resilver on the otherhand is + # canceled when returning to a non-redundant striped layout. At + # some point the healing resilver behavior should be updated to match + # the sequential resilver behavior. + # + log_must zpool detach $TESTPOOL1 ${VDEV_FILES[2]} + + if [[ "$replace_mode" = "healing" ]]; then + log_must is_pool_resilvering $TESTPOOL1 + else + log_mustnot is_pool_resilvering $TESTPOOL1 + fi + + log_must set_tunable32 SCAN_SUSPEND_PROGRESS \ + $ORIG_SCAN_SUSPEND_PROGRESS + log_must zpool wait $TESTPOOL1 +done + +log_pass "Verify attach/detech with multiple vdevs" diff --git a/tests/zfs-tests/tests/functional/replacement/attach_rebuild.ksh b/tests/zfs-tests/tests/functional/replacement/attach_rebuild.ksh new file mode 100755 index 000000000000..e9427c7adc9d --- /dev/null +++ b/tests/zfs-tests/tests/functional/replacement/attach_rebuild.ksh @@ -0,0 +1,173 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +# +# Copyright (c) 2013, 2016 by Delphix. All rights reserved. +# Copyright (c) 2020 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/replacement/replacement.cfg + +# +# DESCRIPTION: +# Attaching disks during I/O should pass for supported pools. +# +# STRATEGY: +# 1. Create multidisk pools (stripe/mirror/raidz) and +# start some random I/O +# 2. Attach a disk to the pool. +# 3. Verify the integrity of the file system and the resilvering. +# +# NOTE: Raidz does not support the sequential resilver (-s) option. +# + +verify_runnable "global" + +function cleanup +{ + if [[ -n "$child_pids" ]]; then + for wait_pid in $child_pids; do + kill $wait_pid + done + fi + + if poolexists $TESTPOOL1; then + destroy_pool $TESTPOOL1 + fi + + [[ -e $TESTDIR ]] && log_must rm -rf $TESTDIR/* +} + +log_assert "Replacing a disk during I/O completes." + +options="" +options_display="default options" + +log_onexit cleanup + +[[ -n "$HOLES_FILESIZE" ]] && options=" $options -f $HOLES_FILESIZE " + +[[ -n "$HOLES_BLKSIZE" ]] && options="$options -b $HOLES_BLKSIZE " + +[[ -n "$HOLES_COUNT" ]] && options="$options -c $HOLES_COUNT " + +[[ -n "$HOLES_SEED" ]] && options="$options -s $HOLES_SEED " + +[[ -n "$HOLES_FILEOFFSET" ]] && options="$options -o $HOLES_FILEOFFSET " + +options="$options -r " + +[[ -n "$options" ]] && options_display=$options + +child_pids="" + +function attach_test +{ + typeset -i iters=2 + typeset -i index=0 + typeset opt=$1 + typeset disk1=$2 + typeset disk2=$3 + + typeset i=0 + while [[ $i -lt $iters ]]; do + log_note "Invoking file_trunc with: $options_display" + file_trunc $options $TESTDIR/$TESTFILE.$i & + typeset pid=$! + + sleep 1 + + child_pids="$child_pids $pid" + ((i = i + 1)) + done + + log_must zpool attach -sw $opt $TESTPOOL1 $disk1 $disk2 + + for wait_pid in $child_pids; do + kill $wait_pid + done + child_pids="" + + log_must zpool export $TESTPOOL1 + log_must zpool import -d $TESTDIR $TESTPOOL1 + log_must zfs umount $TESTPOOL1/$TESTFS1 + log_must zdb -cdui $TESTPOOL1/$TESTFS1 + log_must zfs mount $TESTPOOL1/$TESTFS1 + verify_pool $TESTPOOL1 +} + +specials_list="" +i=0 +while [[ $i != 3 ]]; do + truncate -s $MINVDEVSIZE $TESTDIR/$TESTFILE1.$i + specials_list="$specials_list $TESTDIR/$TESTFILE1.$i" + + ((i = i + 1)) +done + +# +# Create a replacement disk special file. +# +truncate -s $MINVDEVSIZE $TESTDIR/$REPLACEFILE + +for op in "" "-f"; do + create_pool $TESTPOOL1 mirror $specials_list + log_must zfs create $TESTPOOL1/$TESTFS1 + log_must zfs set mountpoint=$TESTDIR1 $TESTPOOL1/$TESTFS1 + + attach_test "$opt" $TESTDIR/$TESTFILE1.1 $TESTDIR/$REPLACEFILE + + zpool iostat -v $TESTPOOL1 | grep "$REPLACEFILE" + if [[ $? -ne 0 ]]; then + log_fail "$REPLACEFILE is not present." + fi + + destroy_pool $TESTPOOL1 +done + +log_note "Verify 'zpool attach' fails with non-mirrors." + +for type in "" "raidz" "raidz1"; do + for op in "" "-f"; do + create_pool $TESTPOOL1 $type $specials_list + log_must zfs create $TESTPOOL1/$TESTFS1 + log_must zfs set mountpoint=$TESTDIR1 $TESTPOOL1/$TESTFS1 + + log_mustnot zpool attach -s "$opt" $TESTDIR/$TESTFILE1.1 \ + $TESTDIR/$REPLACEFILE + + zpool iostat -v $TESTPOOL1 | grep "$REPLACEFILE" + if [[ $? -eq 0 ]]; then + log_fail "$REPLACEFILE should not be present." + fi + + destroy_pool $TESTPOOL1 + done +done + +log_pass diff --git a/tests/zfs-tests/tests/functional/replacement/replacement_002_pos.ksh b/tests/zfs-tests/tests/functional/replacement/attach_resilver.ksh index 391aa5cf0dfc..4261d4d67cc0 100755 --- a/tests/zfs-tests/tests/functional/replacement/replacement_002_pos.ksh +++ b/tests/zfs-tests/tests/functional/replacement/attach_resilver.ksh @@ -104,9 +104,7 @@ function attach_test ((i = i + 1)) done - log_must zpool attach $opt $TESTPOOL1 $disk1 $disk2 - - sleep 10 + log_must zpool attach -w $opt $TESTPOOL1 $disk1 $disk2 for wait_pid in $child_pids do @@ -119,13 +117,13 @@ function attach_test log_must zfs umount $TESTPOOL1/$TESTFS1 log_must zdb -cdui $TESTPOOL1/$TESTFS1 log_must zfs mount $TESTPOOL1/$TESTFS1 - + verify_pool $TESTPOOL1 } specials_list="" i=0 -while [[ $i != 2 ]]; do - mkfile $MINVDEVSIZE $TESTDIR/$TESTFILE1.$i +while [[ $i != 3 ]]; do + truncate -s $MINVDEVSIZE $TESTDIR/$TESTFILE1.$i specials_list="$specials_list $TESTDIR/$TESTFILE1.$i" ((i = i + 1)) @@ -134,7 +132,7 @@ done # # Create a replacement disk special file. # -mkfile $MINVDEVSIZE $TESTDIR/$REPLACEFILE +truncate -s $MINVDEVSIZE $TESTDIR/$REPLACEFILE for op in "" "-f"; do create_pool $TESTPOOL1 mirror $specials_list @@ -143,7 +141,7 @@ for op in "" "-f"; do attach_test "$opt" $TESTDIR/$TESTFILE1.1 $TESTDIR/$REPLACEFILE - zpool iostat -v $TESTPOOL1 | grep "$TESTDIR/$REPLACEFILE" + zpool iostat -v $TESTPOOL1 | grep "$REPLACEFILE" if [[ $? -ne 0 ]]; then log_fail "$REPLACEFILE is not present." fi @@ -162,7 +160,7 @@ for type in "" "raidz" "raidz1"; do log_mustnot zpool attach "$opt" $TESTDIR/$TESTFILE1.1 \ $TESTDIR/$REPLACEFILE - zpool iostat -v $TESTPOOL1 | grep "$TESTDIR/$REPLACEFILE" + zpool iostat -v $TESTPOOL1 | grep "$REPLACEFILE" if [[ $? -eq 0 ]]; then log_fail "$REPLACEFILE should not be present." fi diff --git a/tests/zfs-tests/tests/functional/replacement/replacement_003_pos.ksh b/tests/zfs-tests/tests/functional/replacement/detach.ksh index 71b9602ee15a..aa3ec4f7a75d 100755 --- a/tests/zfs-tests/tests/functional/replacement/replacement_003_pos.ksh +++ b/tests/zfs-tests/tests/functional/replacement/detach.ksh @@ -121,8 +121,8 @@ function detach_test specials_list="" i=0 -while [[ $i != 2 ]]; do - mkfile $MINVDEVSIZE $TESTDIR/$TESTFILE1.$i +while [[ $i != 3 ]]; do + truncate -s $MINVDEVSIZE $TESTDIR/$TESTFILE1.$i specials_list="$specials_list $TESTDIR/$TESTFILE1.$i" ((i = i + 1)) @@ -134,7 +134,7 @@ log_must zfs set mountpoint=$TESTDIR1 $TESTPOOL1/$TESTFS1 detach_test $TESTDIR/$TESTFILE1.1 -zpool iostat -v $TESTPOOL1 | grep "$TESTDIR/$TESTFILE1.1" +zpool iostat -v $TESTPOOL1 | grep "$TESTFILE1.1" if [[ $? -eq 0 ]]; then log_fail "$TESTFILE1.1 should no longer be present." fi @@ -143,14 +143,14 @@ destroy_pool $TESTPOOL1 log_note "Verify 'zpool detach' fails with non-mirrors." -for type in "" "raidz" "raidz1" ; do +for type in "" "raidz" "raidz1"; do create_pool $TESTPOOL1 $type $specials_list log_must zfs create $TESTPOOL1/$TESTFS1 log_must zfs set mountpoint=$TESTDIR1 $TESTPOOL1/$TESTFS1 log_mustnot zpool detach $TESTDIR/$TESTFILE1.1 - zpool iostat -v $TESTPOOL1 | grep "$TESTDIR/$TESTFILE1.1" + zpool iostat -v $TESTPOOL1 | grep "$TESTFILE1.1" if [[ $? -ne 0 ]]; then log_fail "$TESTFILE1.1 is not present." fi diff --git a/tests/zfs-tests/tests/functional/replacement/rebuild_disabled_feature.ksh b/tests/zfs-tests/tests/functional/replacement/rebuild_disabled_feature.ksh new file mode 100755 index 000000000000..d17d83b78333 --- /dev/null +++ b/tests/zfs-tests/tests/functional/replacement/rebuild_disabled_feature.ksh @@ -0,0 +1,78 @@ +#!/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2019, Datto Inc. All rights reserved. +# Copyright (c) 2020 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/replacement/replacement.cfg + +# +# Description: +# Verify device_rebuild feature flags. +# +# Strategy: +# 1. Create a pool with all features disabled. +# 2. Verify 'zpool replace -s' fails and the feature is disabled. +# 3. Enable the device_rebuild feature. +# 4. Verify 'zpool replace -s' works and the feature is active. +# 5. Wait for the feature to return to enabled. +# + +function cleanup +{ + log_must set_tunable32 SCAN_SUSPEND_PROGRESS \ + $ORIG_SCAN_SUSPEND_PROGRESS + destroy_pool $TESTPOOL1 + rm -f ${VDEV_FILES[@]} $SPARE_VDEV_FILE +} + +function check_feature_flag +{ + feature=$1 + pool=$2 + expected_value=$3 + + value="$(zpool get -H -o property,value all $pool | \ + egrep "$feature" | awk '{print $2}')" + if [ "$value" = "$expected_value" ]; then + log_note "$feature verified to be $value" + else + log_fail "$feature should be $expected_value but is $value" + fi +} + +log_assert "Verify device_rebuild feature flags." + +ORIG_SCAN_SUSPEND_PROGRESS=$(get_tunable SCAN_SUSPEND_PROGRESS) + +log_onexit cleanup + +log_must truncate -s $VDEV_FILE_SIZE ${VDEV_FILES[@]} $SPARE_VDEV_FILE +log_must zpool create -d $TESTPOOL1 ${VDEV_FILES[@]} + +log_mustnot zpool replace -s $TESTPOOL1 ${VDEV_FILES[1]} $SPARE_VDEV_FILE +check_feature_flag "feature@device_rebuild" "$TESTPOOL1" "disabled" + +log_must set_tunable32 SCAN_SUSPEND_PROGRESS 1 +log_must zpool set feature@device_rebuild=enabled $TESTPOOL1 +log_must zpool replace -s $TESTPOOL1 ${VDEV_FILES[1]} $SPARE_VDEV_FILE +check_feature_flag "feature@device_rebuild" "$TESTPOOL1" "active" + +log_must set_tunable32 SCAN_SUSPEND_PROGRESS $ORIG_SCAN_SUSPEND_PROGRESS +log_must zpool wait -t resilver $TESTPOOL1 +check_feature_flag "feature@device_rebuild" "$TESTPOOL1" "enabled" + +log_pass "Verify device_rebuild feature flags." diff --git a/tests/zfs-tests/tests/functional/replacement/rebuild_multiple.ksh b/tests/zfs-tests/tests/functional/replacement/rebuild_multiple.ksh new file mode 100755 index 000000000000..7775cbff4db8 --- /dev/null +++ b/tests/zfs-tests/tests/functional/replacement/rebuild_multiple.ksh @@ -0,0 +1,126 @@ +#!/bin/ksh -p + +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2019, Datto Inc. All rights reserved. +# Copyright (c) 2020 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/replacement/replacement.cfg + +# +# DESCRIPTION: +# Sequential reconstruction (unlike healing reconstruction) operate on the +# top-level vdev. This means that a sequential resilver operation can be +# started/stopped on a different top-level vdev without impacting other +# sequential resilvers. +# +# STRATEGY: +# 1. Create a mirrored pool. +# + +function cleanup +{ + log_must set_tunable32 SCAN_SUSPEND_PROGRESS \ + $ORIG_SCAN_SUSPEND_PROGRESS + destroy_pool $TESTPOOL1 + rm -f ${VDEV_FILES[@]} $SPARE_VDEV_FILE $SPARE_VDEV_FILE2 +} + +function check_history +{ + pool=$1 + msg=$2 + exp=$3 + + count=$(zpool history -i $pool | grep "rebuild" | grep -c "$msg") + if [[ "$count" -ne "$exp" ]]; then + log_fail "Expected $exp rebuild '$msg' messages, found $count" + else + log_note "Found $count/$exp rebuild '$msg' messages" + fi +} + +log_assert "Rebuilds operate on the top-level vdevs" + +ORIG_SCAN_SUSPEND_PROGRESS=$(get_tunable SCAN_SUSPEND_PROGRESS) + +log_onexit cleanup + +log_must truncate -s $VDEV_FILE_SIZE ${VDEV_FILES[@]} \ + $SPARE_VDEV_FILE $SPARE_VDEV_FILE2 + +# Verify two sequential resilvers can run concurrently. +log_must zpool create -f $TESTPOOL1 \ + mirror ${VDEV_FILES[0]} ${VDEV_FILES[1]} \ + mirror ${VDEV_FILES[2]} ${VDEV_FILES[3]} +log_must zfs create $TESTPOOL1/$TESTFS + +mntpnt=$(get_prop mountpoint $TESTPOOL1/$TESTFS) +log_must dd if=/dev/urandom of=$mntpnt/file bs=1M count=32 +log_must zpool sync $TESTPOOL1 + +log_must set_tunable32 SCAN_SUSPEND_PROGRESS 1 + +log_must zpool replace -s $TESTPOOL1 ${VDEV_FILES[1]} $SPARE_VDEV_FILE +log_must zpool replace -s $TESTPOOL1 ${VDEV_FILES[3]} $SPARE_VDEV_FILE2 + +check_history $TESTPOOL1 "started" 2 +check_history $TESTPOOL1 "reset" 0 +check_history $TESTPOOL1 "complete" 0 +check_history $TESTPOOL1 "canceled" 0 + +log_must set_tunable32 SCAN_SUSPEND_PROGRESS $ORIG_SCAN_SUSPEND_PROGRESS +log_must zpool wait -t resilver $TESTPOOL1 + +check_history $TESTPOOL1 "complete" 2 +destroy_pool $TESTPOOL1 + +# Verify canceling one resilver (zpool detach) does not impact others. +log_must zpool create -f $TESTPOOL1 \ + mirror ${VDEV_FILES[0]} ${VDEV_FILES[1]} \ + mirror ${VDEV_FILES[2]} ${VDEV_FILES[3]} +log_must zfs create $TESTPOOL1/$TESTFS + +mntpnt=$(get_prop mountpoint $TESTPOOL1/$TESTFS) +log_must dd if=/dev/urandom of=$mntpnt/file bs=1M count=32 +log_must zpool sync $TESTPOOL1 + +log_must set_tunable32 SCAN_SUSPEND_PROGRESS 1 + +log_must zpool replace -s $TESTPOOL1 ${VDEV_FILES[1]} $SPARE_VDEV_FILE +log_must zpool replace -s $TESTPOOL1 ${VDEV_FILES[3]} $SPARE_VDEV_FILE2 + +check_history $TESTPOOL1 "started" 2 +check_history $TESTPOOL1 "reset" 0 +check_history $TESTPOOL1 "complete" 0 +check_history $TESTPOOL1 "canceled" 0 + +log_must zpool detach $TESTPOOL1 $SPARE_VDEV_FILE2 + +check_history $TESTPOOL1 "complete" 0 +check_history $TESTPOOL1 "canceled" 1 + +log_must set_tunable32 SCAN_SUSPEND_PROGRESS $ORIG_SCAN_SUSPEND_PROGRESS +log_must zpool wait -t resilver $TESTPOOL1 + +check_history $TESTPOOL1 "complete" 1 +check_history $TESTPOOL1 "canceled" 1 +destroy_pool $TESTPOOL1 + +log_pass "Rebuilds operate on the top-level vdevs" diff --git a/tests/zfs-tests/tests/functional/replacement/rebuild_raidz.ksh b/tests/zfs-tests/tests/functional/replacement/rebuild_raidz.ksh new file mode 100755 index 000000000000..c919b44b21cc --- /dev/null +++ b/tests/zfs-tests/tests/functional/replacement/rebuild_raidz.ksh @@ -0,0 +1,70 @@ +#!/bin/ksh -p + +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2019, Datto Inc. All rights reserved. +# Copyright (c) 2020 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/replacement/replacement.cfg + +# +# DESCRIPTION: +# Executing 'zpool replace -s' for raidz vdevs failed. Sequential +# resilvers are only allowed for stripe/mirror pools. +# +# STRATEGY: +# 1. Create a raidz pool, verify 'zpool replace -s' fails +# 2. Create a stripe/mirror pool, verify 'zpool replace -s' passes +# + +function cleanup +{ + log_must set_tunable32 SCAN_SUSPEND_PROGRESS \ + $ORIG_SCAN_SUSPEND_PROGRESS + destroy_pool $TESTPOOL1 + rm -f ${VDEV_FILES[@]} $SPARE_VDEV_FILE +} + +log_assert "Sequential resilver is not allowed for raidz vdevs" + +ORIG_SCAN_SUSPEND_PROGRESS=$(get_tunable SCAN_SUSPEND_PROGRESS) + +log_onexit cleanup + +log_must truncate -s $VDEV_FILE_SIZE ${VDEV_FILES[@]} $SPARE_VDEV_FILE + +# raidz[1-3] +for vdev_type in "raidz" "raidz2" "raidz3"; do + log_must zpool create -f $TESTPOOL1 $vdev_type ${VDEV_FILES[@]} + log_mustnot zpool replace -s $TESTPOOL1 ${VDEV_FILES[1]} \ + $SPARE_VDEV_FILE + destroy_pool $TESTPOOL1 +done + +# stripe +log_must zpool create $TESTPOOL1 ${VDEV_FILES[@]} +log_must zpool replace -s $TESTPOOL1 ${VDEV_FILES[1]} $SPARE_VDEV_FILE +destroy_pool $TESTPOOL1 + +# mirror +log_must zpool create $TESTPOOL1 mirror ${VDEV_FILES[0]} ${VDEV_FILES[1]} +log_must zpool replace -s $TESTPOOL1 ${VDEV_FILES[1]} $SPARE_VDEV_FILE +destroy_pool $TESTPOOL1 + +log_pass "Sequential resilver is not allowed for raidz vdevs" diff --git a/tests/zfs-tests/tests/functional/replacement/replace_import.ksh b/tests/zfs-tests/tests/functional/replacement/replace_import.ksh new file mode 100755 index 000000000000..35d51d93938b --- /dev/null +++ b/tests/zfs-tests/tests/functional/replacement/replace_import.ksh @@ -0,0 +1,67 @@ +#!/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2019, Datto Inc. All rights reserved. +# Copyright (c) 2020 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/replacement/replacement.cfg + +# +# Description: +# Verify that on import an in progress replace operation is resumed. +# +# Strategy: +# 1. For both healing and sequential resilvering replace: +# a. Create a pool +# b. Repalce a vdev with 'zpool replace' to resilver (-s) it. +# c. Export the pool +# d. Import the pool +# e. Verify the 'zpool replace' resumed resilvering. +# f. Destroy the pool +# + +function cleanup +{ + log_must set_tunable32 SCAN_SUSPEND_PROGRESS \ + $ORIG_SCAN_SUSPEND_PROGRESS + destroy_pool $TESTPOOL1 + rm -f ${VDEV_FILES[@]} $SPARE_VDEV_FILE +} + +log_assert "Verify replace is resumed on import" + +ORIG_SCAN_SUSPEND_PROGRESS=$(get_tunable SCAN_SUSPEND_PROGRESS) + +log_onexit cleanup + +log_must truncate -s $VDEV_FILE_SIZE ${VDEV_FILES[@]} $SPARE_VDEV_FILE + +# Verify healing and sequential resilver resume on import. +for arg in "" "-s"; do + log_must zpool create -f $TESTPOOL1 ${VDEV_FILES[@]} + log_must set_tunable32 SCAN_SUSPEND_PROGRESS 1 + log_must zpool replace -s $TESTPOOL1 ${VDEV_FILES[0]} $SPARE_VDEV_FILE + log_must is_pool_resilvering $TESTPOOL1 + log_must zpool export $TESTPOOL1 + log_must zpool import -d $TEST_BASE_DIR $TESTPOOL1 + log_must is_pool_resilvering $TESTPOOL1 + log_must set_tunable32 SCAN_SUSPEND_PROGRESS $ORIG_SCAN_SUSPEND_PROGRESS + log_must zpool wait -t resilver $TESTPOOL1 + log_must is_pool_resilvered $TESTPOOL1 + destroy_pool $TESTPOOL1 +done + +log_pass "Verify replace is resumed on import" diff --git a/tests/zfs-tests/tests/functional/replacement/replace_rebuild.ksh b/tests/zfs-tests/tests/functional/replacement/replace_rebuild.ksh new file mode 100755 index 000000000000..5997352284b4 --- /dev/null +++ b/tests/zfs-tests/tests/functional/replacement/replace_rebuild.ksh @@ -0,0 +1,158 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +# +# Copyright (c) 2013, 2016 by Delphix. All rights reserved. +# Copyright (c) 2020 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/replacement/replacement.cfg + +# +# DESCRIPTION: +# Replacing disks during I/O should pass for supported pools. +# +# STRATEGY: +# 1. Create multidisk pools (stripe/mirror) and +# start some random I/O +# 2. Replace a disk in the pool with another disk. +# 3. Verify the integrity of the file system and the rebuilding. +# +# NOTE: Raidz does not support the sequential resilver (-s) option. +# + +verify_runnable "global" + +function cleanup +{ + if [[ -n "$child_pids" ]]; then + for wait_pid in $child_pids + do + kill $wait_pid + done + fi + + if poolexists $TESTPOOL1; then + destroy_pool $TESTPOOL1 + fi + + [[ -e $TESTDIR ]] && log_must rm -rf $TESTDIR/* +} + +log_assert "Replacing a disk with -r during I/O completes." + +options="" +options_display="default options" + +log_onexit cleanup + +[[ -n "$HOLES_FILESIZE" ]] && options=" $options -f $HOLES_FILESIZE " + +[[ -n "$HOLES_BLKSIZE" ]] && options="$options -b $HOLES_BLKSIZE " + +[[ -n "$HOLES_COUNT" ]] && options="$options -c $HOLES_COUNT " + +[[ -n "$HOLES_SEED" ]] && options="$options -s $HOLES_SEED " + +[[ -n "$HOLES_FILEOFFSET" ]] && options="$options -o $HOLES_FILEOFFSET " + +options="$options -r " + +[[ -n "$options" ]] && options_display=$options + +child_pids="" + +function replace_test +{ + typeset -i iters=2 + typeset -i index=0 + typeset opt=$1 + typeset disk1=$2 + typeset disk2=$3 + + typeset i=0 + while [[ $i -lt $iters ]]; do + log_note "Invoking file_trunc with: $options_display" + file_trunc $options $TESTDIR/$TESTFILE.$i & + typeset pid=$! + + sleep 1 + + child_pids="$child_pids $pid" + ((i = i + 1)) + done + + log_must zpool replace -sw $opt $TESTPOOL1 $disk1 $disk2 + + for wait_pid in $child_pids + do + kill $wait_pid + done + child_pids="" + + log_must zpool export $TESTPOOL1 + log_must zpool import -d $TESTDIR $TESTPOOL1 + log_must zfs umount $TESTPOOL1/$TESTFS1 + log_must zdb -cdui $TESTPOOL1/$TESTFS1 + log_must zfs mount $TESTPOOL1/$TESTFS1 + verify_pool $TESTPOOL1 +} + +specials_list="" +i=0 +while [[ $i != 3 ]]; do + log_must truncate -s $MINVDEVSIZE $TESTDIR/$TESTFILE1.$i + specials_list="$specials_list $TESTDIR/$TESTFILE1.$i" + + ((i = i + 1)) +done + +# +# Create a replacement disk special file. +# +log_must truncate -s $MINVDEVSIZE $TESTDIR/$REPLACEFILE + +for type in "" "mirror"; do + for op in "" "-f"; do + create_pool $TESTPOOL1 $type $specials_list + log_must zfs create $TESTPOOL1/$TESTFS1 + log_must zfs set mountpoint=$TESTDIR1 $TESTPOOL1/$TESTFS1 + + replace_test "$opt" $TESTDIR/$TESTFILE1.1 $TESTDIR/$REPLACEFILE + + zpool iostat -v $TESTPOOL1 | grep "$REPLACEFILE" + if [[ $? -ne 0 ]]; then + log_fail "$REPLACEFILE is not present." + fi + + destroy_pool $TESTPOOL1 + log_must rm -rf /$TESTPOOL1 + done +done + +log_pass diff --git a/tests/zfs-tests/tests/functional/replacement/replacement_001_pos.ksh b/tests/zfs-tests/tests/functional/replacement/replace_resilver.ksh index 8f40436ffb11..253cf65e452b 100755 --- a/tests/zfs-tests/tests/functional/replacement/replacement_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/replacement/replace_resilver.ksh @@ -104,9 +104,7 @@ function replace_test ((i = i + 1)) done - log_must zpool replace $opt $TESTPOOL1 $disk1 $disk2 - - sleep 10 + log_must zpool replace -w $opt $TESTPOOL1 $disk1 $disk2 for wait_pid in $child_pids do @@ -119,11 +117,12 @@ function replace_test log_must zfs umount $TESTPOOL1/$TESTFS1 log_must zdb -cdui $TESTPOOL1/$TESTFS1 log_must zfs mount $TESTPOOL1/$TESTFS1 + verify_pool $TESTPOOL1 } specials_list="" i=0 -while [[ $i != 2 ]]; do +while [[ $i != 3 ]]; do log_must truncate -s $MINVDEVSIZE $TESTDIR/$TESTFILE1.$i specials_list="$specials_list $TESTDIR/$TESTFILE1.$i" @@ -143,7 +142,7 @@ for type in "" "raidz" "mirror"; do replace_test "$opt" $TESTDIR/$TESTFILE1.1 $TESTDIR/$REPLACEFILE - zpool iostat -v $TESTPOOL1 | grep "$TESTDIR/$REPLACEFILE" + zpool iostat -v $TESTPOOL1 | grep "$REPLACEFILE" if [[ $? -ne 0 ]]; then log_fail "$REPLACEFILE is not present." fi diff --git a/tests/zfs-tests/tests/functional/replacement/replacement.cfg b/tests/zfs-tests/tests/functional/replacement/replacement.cfg index b2ba1b885117..271317b1c970 100644 --- a/tests/zfs-tests/tests/functional/replacement/replacement.cfg +++ b/tests/zfs-tests/tests/functional/replacement/replacement.cfg @@ -36,3 +36,8 @@ export HOLES_SEED=${HOLES_SEED-""} export HOLES_FILEOFFSET=${HOLES_FILEOFFSET-""} export HOLES_COUNT=${HOLES_COUNT-"16384"} # FILESIZE/BLKSIZE/8 export REPLACEFILE="sparedisk" + +set -A VDEV_FILES $TEST_BASE_DIR/file-{1..4} +export VDEV_FILE_SIZE=$(( $SPA_MINDEVSIZE * 2 )) +export SPARE_VDEV_FILE=$TEST_BASE_DIR/spare-1 +export SPARE_VDEV_FILE2=$TEST_BASE_DIR/spare-2 diff --git a/tests/zfs-tests/tests/functional/resilver/resilver_restart_001.ksh b/tests/zfs-tests/tests/functional/replacement/resilver_restart_001.ksh index 9af1c972faa9..7896b2dbe1d0 100755 --- a/tests/zfs-tests/tests/functional/resilver/resilver_restart_001.ksh +++ b/tests/zfs-tests/tests/functional/replacement/resilver_restart_001.ksh @@ -20,7 +20,7 @@ # . $STF_SUITE/include/libtest.shlib -. $STF_SUITE/tests/functional/resilver/resilver.cfg +. $STF_SUITE/tests/functional/replacement/replacement.cfg # # DESCRIPTION: @@ -50,7 +50,7 @@ function cleanup $ORIG_SCAN_SUSPEND_PROGRESS log_must set_tunable32 ZEVENT_LEN_MAX $ORIG_ZFS_ZEVENT_LEN_MAX log_must zinject -c all - destroy_pool $TESTPOOL + destroy_pool $TESTPOOL1 rm -f ${VDEV_FILES[@]} $SPARE_VDEV_FILE } @@ -70,7 +70,7 @@ function verify_restarts # <msg> <cnt> <defer> [[ -z "$defer" ]] && return # use zdb to find which vdevs have the resilver defer flag - VDEV_DEFERS=$(zdb -C $TESTPOOL | awk ' + VDEV_DEFERS=$(zdb -C $TESTPOOL1 | awk ' /children/ { gsub(/[^0-9]/, ""); child = $0 } /com\.datto:resilver_defer$/ { print child } ') @@ -106,17 +106,17 @@ log_must set_tunable32 ZEVENT_LEN_MAX 512 log_must truncate -s $VDEV_FILE_SIZE ${VDEV_FILES[@]} $SPARE_VDEV_FILE -log_must zpool create -f -o feature@resilver_defer=disabled $TESTPOOL \ +log_must zpool create -f -o feature@resilver_defer=disabled $TESTPOOL1 \ raidz ${VDEV_FILES[@]} # create 4 filesystems for fs in fs{0..3} do - log_must zfs create -o primarycache=none -o recordsize=1k $TESTPOOL/$fs + log_must zfs create -o primarycache=none -o recordsize=1k $TESTPOOL1/$fs done # simultaneously write 16M to each of them -set -A DATAPATHS /$TESTPOOL/fs{0..3}/dat.0 +set -A DATAPATHS /$TESTPOOL1/fs{0..3}/dat.0 log_note "Writing data files" for path in ${DATAPATHS[@]} do @@ -131,7 +131,7 @@ do if [[ $test == "with" ]] then - log_must zpool set feature@resilver_defer=enabled $TESTPOOL + log_must zpool set feature@resilver_defer=enabled $TESTPOOL1 RESTARTS=( "${DEFER_RESTARTS[@]}" ) VDEVS=( "${DEFER_VDEVS[@]}" ) VDEV_REPLACE="$SPARE_VDEV_FILE ${VDEV_FILES[1]}" @@ -144,7 +144,7 @@ do log_must set_tunable32 RESILVER_MIN_TIME_MS 50 # initiate a resilver and suspend the scan as soon as possible - log_must zpool replace $TESTPOOL $VDEV_REPLACE + log_must zpool replace $TESTPOOL1 $VDEV_REPLACE log_must set_tunable32 SCAN_SUSPEND_PROGRESS 1 # there should only be 1 resilver start @@ -152,16 +152,16 @@ do # offline then online a vdev to introduce a new DTL range after current # scan, which should restart (or defer) the resilver - log_must zpool offline $TESTPOOL ${VDEV_FILES[2]} - log_must zpool sync $TESTPOOL - log_must zpool online $TESTPOOL ${VDEV_FILES[2]} - log_must zpool sync $TESTPOOL + log_must zpool offline $TESTPOOL1 ${VDEV_FILES[2]} + log_must zpool sync $TESTPOOL1 + log_must zpool online $TESTPOOL1 ${VDEV_FILES[2]} + log_must zpool sync $TESTPOOL1 # there should now be 2 resilver starts w/o defer, 1 with defer verify_restarts ' after offline/online' "${RESTARTS[1]}" "${VDEVS[1]}" # inject read io errors on vdev and verify resilver does not restart - log_must zinject -a -d ${VDEV_FILES[2]} -e io -T read -f 0.25 $TESTPOOL + log_must zinject -a -d ${VDEV_FILES[2]} -e io -T read -f 0.25 $TESTPOOL1 log_must cat ${DATAPATHS[1]} > /dev/null log_must zinject -c all @@ -173,17 +173,12 @@ do log_must set_tunable32 RESILVER_MIN_TIME_MS 3000 # wait for resilver to finish - for iter in {0..59} - do - is_pool_resilvered $TESTPOOL && break - sleep 1 - done - is_pool_resilvered $TESTPOOL || - log_fail "resilver timed out" + log_must zpool wait -t resilver $TESTPOOL1 + log_must is_pool_resilvered $TESTPOOL1 # wait for a few txg's to see if a resilver happens - log_must zpool sync $TESTPOOL - log_must zpool sync $TESTPOOL + log_must zpool sync $TESTPOOL1 + log_must zpool sync $TESTPOOL1 # there should now be 2 resilver starts verify_restarts ' after resilver' "${RESTARTS[3]}" "${VDEVS[3]}" diff --git a/tests/zfs-tests/tests/functional/resilver/resilver_restart_002.ksh b/tests/zfs-tests/tests/functional/replacement/resilver_restart_002.ksh index ebe5e693b2cf..48763f9b2dfa 100755 --- a/tests/zfs-tests/tests/functional/resilver/resilver_restart_002.ksh +++ b/tests/zfs-tests/tests/functional/replacement/resilver_restart_002.ksh @@ -20,7 +20,7 @@ # . $STF_SUITE/include/libtest.shlib -. $STF_SUITE/tests/functional/resilver/resilver.cfg +. $STF_SUITE/tests/functional/replacement/replacement.cfg # # DESCRIPTION: @@ -40,7 +40,7 @@ function cleanup { log_must zinject -c all - destroy_pool $TESTPOOL + destroy_pool $TESTPOOL1 rm -f ${VDEV_FILES[@]} $SPARE_VDEV_FILE log_must set_tunable32 SCAN_LEGACY $ORIG_SCAN_LEGACY } @@ -56,25 +56,25 @@ log_must set_tunable32 SCAN_LEGACY 1 # create the pool and a 32M file (32k blocks) log_must truncate -s $VDEV_FILE_SIZE ${VDEV_FILES[0]} $SPARE_VDEV_FILE -log_must zpool create -f -O recordsize=1k $TESTPOOL ${VDEV_FILES[0]} -log_must dd if=/dev/urandom of=/$TESTPOOL/file bs=1M count=32 > /dev/null 2>&1 +log_must zpool create -f -O recordsize=1k $TESTPOOL1 ${VDEV_FILES[0]} +log_must dd if=/dev/urandom of=/$TESTPOOL1/file bs=1M count=32 > /dev/null 2>&1 # determine objset/object -objset=$(zdb -d $TESTPOOL/ | sed -ne 's/.*ID \([0-9]*\).*/\1/p') -object=$(ls -i /$TESTPOOL/file | awk '{print $1}') +objset=$(zdb -d $TESTPOOL1/ | sed -ne 's/.*ID \([0-9]*\).*/\1/p') +object=$(ls -i /$TESTPOOL1/file | awk '{print $1}') # inject event to cause error during resilver -log_must zinject -b `printf "%x:%x:0:3fff" $objset $object` $TESTPOOL +log_must zinject -b `printf "%x:%x:0:3fff" $objset $object` $TESTPOOL1 # clear events and start resilver log_must zpool events -c -log_must zpool attach $TESTPOOL ${VDEV_FILES[0]} $SPARE_VDEV_FILE +log_must zpool attach $TESTPOOL1 ${VDEV_FILES[0]} $SPARE_VDEV_FILE log_note "waiting for read errors to start showing up" for iter in {0..59} do - zpool sync $TESTPOOL - err=$(zpool status $TESTPOOL | grep ${VDEV_FILES[0]} | awk '{print $3}') + zpool sync $TESTPOOL1 + err=$(zpool status $TESTPOOL1 | grep ${VDEV_FILES[0]} | awk '{print $3}') (( $err > 0 )) && break sleep 1 done @@ -92,8 +92,8 @@ done (( $finish == 0 )) && log_fail "resilver took too long to finish" # wait a few syncs to ensure that zfs does not restart the resilver -log_must zpool sync $TESTPOOL -log_must zpool sync $TESTPOOL +log_must zpool sync $TESTPOOL1 +log_must zpool sync $TESTPOOL1 # check if resilver was restarted start=$(zpool events | grep "sysevent.fs.zfs.resilver_start" | wc -l) diff --git a/tests/zfs-tests/tests/functional/replacement/scrub_cancel.ksh b/tests/zfs-tests/tests/functional/replacement/scrub_cancel.ksh new file mode 100755 index 000000000000..da8a0a26e333 --- /dev/null +++ b/tests/zfs-tests/tests/functional/replacement/scrub_cancel.ksh @@ -0,0 +1,112 @@ +#!/bin/ksh -p + +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2019, Datto Inc. All rights reserved. +# Copyright (c) 2020 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/replacement/replacement.cfg + +# +# DESCRIPTION: +# Verify scrub behaves as intended when contending with a healing or +# sequential resilver. +# +# STRATEGY: +# 1. Create a pool +# 2. Add a modest amount of data to the pool. +# 3. For healing and sequential resilver: +# a. Start scrubbing. +# b. Verify a resilver can be started and it cancels the scrub. +# c. Verify a scrub cannot be started when resilvering +# + +function cleanup +{ + log_must set_tunable32 RESILVER_MIN_TIME_MS $ORIG_RESILVER_MIN_TIME + log_must set_tunable32 SCAN_SUSPEND_PROGRESS \ + $ORIG_SCAN_SUSPEND_PROGRESS + destroy_pool $TESTPOOL1 + rm -f ${VDEV_FILES[@]} $SPARE_VDEV_FILE +} + +log_assert "Scrub was cancelled by resilver" + +ORIG_RESILVER_MIN_TIME=$(get_tunable RESILVER_MIN_TIME_MS) +ORIG_SCAN_SUSPEND_PROGRESS=$(get_tunable SCAN_SUSPEND_PROGRESS) + +log_onexit cleanup + +log_must truncate -s $VDEV_FILE_SIZE ${VDEV_FILES[@]} $SPARE_VDEV_FILE + +log_must zpool create -f $TESTPOOL1 ${VDEV_FILES[@]} +log_must zfs create $TESTPOOL1/$TESTFS + +mntpnt=$(get_prop mountpoint $TESTPOOL1/$TESTFS) +log_must dd if=/dev/urandom of=$mntpnt/file bs=1M count=64 +log_must zpool sync $TESTPOOL1 + +# Request a healing or sequential resilver +for replace_mode in "healing" "sequential"; do + + # + # Healing resilvers abort the dsl_scan and reconfigure it for + # resilvering. Sequential resilvers cancel the dsl_scan and start + # the vdev_rebuild thread. + # + if [[ "$replace_mode" = "healing" ]]; then + history_msg="scan aborted, restarting" + flags="" + else + history_msg="scan cancelled" + flags="-s" + fi + + # Limit scanning time and suspend the scan as soon as possible. + log_must set_tunable32 RESILVER_MIN_TIME_MS 50 + log_must set_tunable32 SCAN_SUSPEND_PROGRESS 1 + + # Initiate a scrub. + log_must zpool scrub $TESTPOOL1 + + # Initiate a resilver to cancel the scrub. + log_must zpool replace $flags $TESTPOOL1 ${VDEV_FILES[1]} \ + $SPARE_VDEV_FILE + + # Verify the scrub was canceled, it may take a few seconds to exit. + while is_pool_scrubbing $TESTPOOL1; do + sleep 1 + done + log_mustnot is_pool_scrubbing $TESTPOOL1 + + # Verify a scrub cannot be started while resilvering. + log_must is_pool_resilvering $TESTPOOL1 + log_mustnot zpool scrub $TESTPOOL1 + + # Unsuspend resilver. + log_must set_tunable32 SCAN_SUSPEND_PROGRESS 0 + log_must set_tunable32 RESILVER_MIN_TIME_MS 3000 + + # Wait for resilver to finish then put the original back. + log_must zpool wait $TESTPOOL1 + log_must zpool replace $flags -w $TESTPOOL1 $SPARE_VDEV_FILE \ + ${VDEV_FILES[1]} +done +log_pass "Scrub was cancelled by resilver" + diff --git a/tests/zfs-tests/tests/functional/resilver/Makefile.am b/tests/zfs-tests/tests/functional/resilver/Makefile.am deleted file mode 100644 index 38136a843aac..000000000000 --- a/tests/zfs-tests/tests/functional/resilver/Makefile.am +++ /dev/null @@ -1,9 +0,0 @@ -pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/resilver -dist_pkgdata_SCRIPTS = \ - setup.ksh \ - cleanup.ksh \ - resilver_restart_001.ksh \ - resilver_restart_002.ksh - -dist_pkgdata_DATA = \ - resilver.cfg diff --git a/tests/zfs-tests/tests/functional/resilver/cleanup.ksh b/tests/zfs-tests/tests/functional/resilver/cleanup.ksh deleted file mode 100755 index 4dfa81424513..000000000000 --- a/tests/zfs-tests/tests/functional/resilver/cleanup.ksh +++ /dev/null @@ -1,31 +0,0 @@ -#!/bin/ksh -p -# -# CDDL HEADER START -# -# The contents of this file are subject to the terms of the -# Common Development and Distribution License (the "License"). -# You may not use this file except in compliance with the License. -# -# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE -# or http://www.opensolaris.org/os/licensing. -# See the License for the specific language governing permissions -# and limitations under the License. -# -# When distributing Covered Code, include this CDDL HEADER in each -# file and include the License file at usr/src/OPENSOLARIS.LICENSE. -# If applicable, add the following below this CDDL HEADER, with the -# fields enclosed by brackets "[]" replaced with your own identifying -# information: Portions Copyright [yyyy] [name of copyright owner] -# -# CDDL HEADER END - -# -# Copyright (c) 2019, Datto Inc. All rights reserved. -# - -. $STF_SUITE/include/libtest.shlib -. $STF_SUITE/tests/functional/resilver/resilver.cfg - -verify_runnable "global" - -log_pass diff --git a/tests/zfs-tests/tests/functional/resilver/resilver.cfg b/tests/zfs-tests/tests/functional/resilver/resilver.cfg deleted file mode 100644 index 88dfd24aed20..000000000000 --- a/tests/zfs-tests/tests/functional/resilver/resilver.cfg +++ /dev/null @@ -1,32 +0,0 @@ -# -# CDDL HEADER START -# -# The contents of this file are subject to the terms of the -# Common Development and Distribution License (the "License"). -# You may not use this file except in compliance with the License. -# -# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE -# or http://www.opensolaris.org/os/licensing. -# See the License for the specific language governing permissions -# and limitations under the License. -# -# When distributing Covered Code, include this CDDL HEADER in each -# file and include the License file at usr/src/OPENSOLARIS.LICENSE. -# If applicable, add the following below this CDDL HEADER, with the -# fields enclosed by brackets "[]" replaced with your own identifying -# information: Portions Copyright [yyyy] [name of copyright owner] -# -# CDDL HEADER END - -# -# Copyright (c) 2019, Datto Inc. All rights reserved. -# - -. $STF_SUITE/include/libtest.shlib - -verify_runnable "global" - -set -A VDEV_FILES $TEST_BASE_DIR/file-{1..4} -SPARE_VDEV_FILE=$TEST_BASE_DIR/spare-1 - -VDEV_FILE_SIZE=$(( $SPA_MINDEVSIZE * 2 )) diff --git a/tests/zfs-tests/tests/functional/resilver/setup.ksh b/tests/zfs-tests/tests/functional/resilver/setup.ksh deleted file mode 100755 index 4dfa81424513..000000000000 --- a/tests/zfs-tests/tests/functional/resilver/setup.ksh +++ /dev/null @@ -1,31 +0,0 @@ -#!/bin/ksh -p -# -# CDDL HEADER START -# -# The contents of this file are subject to the terms of the -# Common Development and Distribution License (the "License"). -# You may not use this file except in compliance with the License. -# -# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE -# or http://www.opensolaris.org/os/licensing. -# See the License for the specific language governing permissions -# and limitations under the License. -# -# When distributing Covered Code, include this CDDL HEADER in each -# file and include the License file at usr/src/OPENSOLARIS.LICENSE. -# If applicable, add the following below this CDDL HEADER, with the -# fields enclosed by brackets "[]" replaced with your own identifying -# information: Portions Copyright [yyyy] [name of copyright owner] -# -# CDDL HEADER END - -# -# Copyright (c) 2019, Datto Inc. All rights reserved. -# - -. $STF_SUITE/include/libtest.shlib -. $STF_SUITE/tests/functional/resilver/resilver.cfg - -verify_runnable "global" - -log_pass |