diff options
Diffstat (limited to 'sys/contrib/openzfs')
15 files changed, 440 insertions, 151 deletions
diff --git a/sys/contrib/openzfs/.github/workflows/scripts/qemu-2-start.sh b/sys/contrib/openzfs/.github/workflows/scripts/qemu-2-start.sh index 1c608348ffcd..422b3e9df388 100755 --- a/sys/contrib/openzfs/.github/workflows/scripts/qemu-2-start.sh +++ b/sys/contrib/openzfs/.github/workflows/scripts/qemu-2-start.sh @@ -121,7 +121,14 @@ case "$OS" in KSRC="$FREEBSD_SNAP/../amd64/$FreeBSD/src.txz" ;; freebsd15-0c) - FreeBSD="15.0-ALPHA3" + FreeBSD="15.0-ALPHA4" + OSNAME="FreeBSD $FreeBSD" + OSv="freebsd14.0" + URLxz="$FREEBSD_SNAP/$FreeBSD/amd64/Latest/FreeBSD-$FreeBSD-amd64-BASIC-CI-ufs.raw.xz" + KSRC="$FREEBSD_SNAP/../amd64/$FreeBSD/src.txz" + ;; + freebsd16-0c) + FreeBSD="16.0-CURRENT" OSNAME="FreeBSD $FreeBSD" OSv="freebsd14.0" URLxz="$FREEBSD_SNAP/$FreeBSD/amd64/Latest/FreeBSD-$FreeBSD-amd64-BASIC-CI-ufs.raw.xz" @@ -287,7 +294,7 @@ else while pidof /usr/bin/qemu-system-x86_64 >/dev/null; do ssh 2>/dev/null root@vm0 "uname -a" && break done - ssh root@vm0 "pkg install -y bash ca_root_nss git qemu-guest-agent python3 py311-cloud-init" + ssh root@vm0 "env IGNORE_OSVERSION=yes pkg install -y bash ca_root_nss git qemu-guest-agent python3 py311-cloud-init" ssh root@vm0 "chsh -s $BASH root" ssh root@vm0 'sysrc qemu_guest_agent_enable="YES"' ssh root@vm0 'sysrc cloudinit_enable="YES"' diff --git a/sys/contrib/openzfs/.github/workflows/zfs-qemu.yml b/sys/contrib/openzfs/.github/workflows/zfs-qemu.yml index 69349678d84c..3b164548f9be 100644 --- a/sys/contrib/openzfs/.github/workflows/zfs-qemu.yml +++ b/sys/contrib/openzfs/.github/workflows/zfs-qemu.yml @@ -68,7 +68,7 @@ jobs: # FreeBSD variants of 2025-06: # FreeBSD Release: freebsd13-5r, freebsd14-2r, freebsd14-3r # FreeBSD Stable: freebsd13-5s, freebsd14-3s - # FreeBSD Current: freebsd15-0c + # FreeBSD Current: freebsd15-0c, freebsd16-0c os: ${{ fromJson(needs.test-config.outputs.test_os) }} runs-on: ubuntu-24.04 steps: diff --git a/sys/contrib/openzfs/cmd/zdb/zdb.c b/sys/contrib/openzfs/cmd/zdb/zdb.c index 70a4ed46f263..1ffb8ebc70f2 100644 --- a/sys/contrib/openzfs/cmd/zdb/zdb.c +++ b/sys/contrib/openzfs/cmd/zdb/zdb.c @@ -106,11 +106,15 @@ extern boolean_t spa_mode_readable_spacemaps; extern uint_t zfs_reconstruct_indirect_combinations_max; extern uint_t zfs_btree_verify_intensity; +enum { + ARG_ALLOCATED = 256, + ARG_BLOCK_BIN_MODE, + ARG_BLOCK_CLASSES, +}; + static const char cmdname[] = "zdb"; uint8_t dump_opt[512]; -#define ALLOCATED_OPT 256 - typedef void object_viewer_t(objset_t *, uint64_t, void *data, size_t size); static uint64_t *zopt_metaslab = NULL; @@ -131,6 +135,20 @@ static objset_t *os; static boolean_t kernel_init_done; static boolean_t corruption_found = B_FALSE; +static enum { + BIN_AUTO = 0, + BIN_PSIZE, + BIN_LSIZE, + BIN_ASIZE, +} block_bin_mode = BIN_AUTO; + +static enum { + CLASS_NORMAL = 1 << 1, + CLASS_SPECIAL = 1 << 2, + CLASS_DEDUP = 1 << 3, + CLASS_OTHER = 1 << 4, +} block_classes = 0; + static void snprintf_blkptr_compact(char *, size_t, const blkptr_t *, boolean_t); static void mos_obj_refd(uint64_t); @@ -749,6 +767,12 @@ usage(void) (void) fprintf(stderr, " Options to control amount of output:\n"); (void) fprintf(stderr, " -b --block-stats " "block statistics\n"); + (void) fprintf(stderr, " --bin=(lsize|psize|asize) " + "bin blocks based on this size in all three columns\n"); + (void) fprintf(stderr, + " --class=(normal|special|dedup|other)[,...]\n" + " only consider blocks from " + "these allocation classes\n"); (void) fprintf(stderr, " -B --backup " "backup stream\n"); (void) fprintf(stderr, " -c --checksum " @@ -1694,7 +1718,7 @@ dump_metaslab(metaslab_t *msp) (u_longlong_t)msp->ms_id, (u_longlong_t)msp->ms_start, (u_longlong_t)space_map_object(sm), freebuf); - if (dump_opt[ALLOCATED_OPT] || + if (dump_opt[ARG_ALLOCATED] || (dump_opt['m'] > 2 && !dump_opt['L'])) { mutex_enter(&msp->ms_lock); VERIFY0(metaslab_load(msp)); @@ -1705,7 +1729,7 @@ dump_metaslab(metaslab_t *msp) dump_metaslab_stats(msp); } - if (dump_opt[ALLOCATED_OPT]) { + if (dump_opt[ARG_ALLOCATED]) { uint64_t off = msp->ms_start; zfs_range_tree_walk(msp->ms_allocatable, dump_allocated, &off); @@ -1726,7 +1750,7 @@ dump_metaslab(metaslab_t *msp) SPACE_MAP_HISTOGRAM_SIZE, sm->sm_shift); } - if (dump_opt[ALLOCATED_OPT] || + if (dump_opt[ARG_ALLOCATED] || (dump_opt['m'] > 2 && !dump_opt['L'])) { metaslab_unload(msp); mutex_exit(&msp->ms_lock); @@ -5814,6 +5838,34 @@ dump_size_histograms(zdb_cb_t *zcb) (void) printf("\nBlock Size Histogram\n"); + switch (block_bin_mode) { + case BIN_PSIZE: + printf("(note: all categories are binned by %s)\n", "psize"); + break; + case BIN_LSIZE: + printf("(note: all categories are binned by %s)\n", "lsize"); + break; + case BIN_ASIZE: + printf("(note: all categories are binned by %s)\n", "asize"); + break; + default: + printf("(note: all categories are binned separately)\n"); + break; + } + if (block_classes != 0) { + char buf[256] = ""; + if (block_classes & CLASS_NORMAL) + strlcat(buf, "\"normal\", ", sizeof (buf)); + if (block_classes & CLASS_SPECIAL) + strlcat(buf, "\"special\", ", sizeof (buf)); + if (block_classes & CLASS_DEDUP) + strlcat(buf, "\"dedup\", ", sizeof (buf)); + if (block_classes & CLASS_OTHER) + strlcat(buf, "\"other\", ", sizeof (buf)); + buf[strlen(buf)-2] = '\0'; + printf("(note: only blocks in these classes are counted: %s)\n", + buf); + } /* * Print the first line titles */ @@ -6162,29 +6214,85 @@ skipped: [BPE_GET_PSIZE(bp)]++; return; } + + if (block_classes != 0) { + spa_config_enter(zcb->zcb_spa, SCL_CONFIG, FTAG, RW_READER); + + uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[0]); + uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[0]); + vdev_t *vd = vdev_lookup_top(zcb->zcb_spa, vdev); + ASSERT(vd != NULL); + metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift]; + ASSERT(ms != NULL); + metaslab_group_t *mg = ms->ms_group; + ASSERT(mg != NULL); + metaslab_class_t *mc = mg->mg_class; + ASSERT(mc != NULL); + + spa_config_exit(zcb->zcb_spa, SCL_CONFIG, FTAG); + + int class; + if (mc == spa_normal_class(zcb->zcb_spa)) { + class = CLASS_NORMAL; + } else if (mc == spa_special_class(zcb->zcb_spa)) { + class = CLASS_SPECIAL; + } else if (mc == spa_dedup_class(zcb->zcb_spa)) { + class = CLASS_DEDUP; + } else { + class = CLASS_OTHER; + } + + if (!(block_classes & class)) { + goto hist_skipped; + } + } + /* * The binning histogram bins by powers of two up to * SPA_MAXBLOCKSIZE rather than creating bins for * every possible blocksize found in the pool. */ - int bin = highbit64(BP_GET_PSIZE(bp)) - 1; + int bin; + + /* + * Binning strategy: each bin includes blocks up to and including + * the given size (excluding blocks that fit into the previous bin). + * This way, the "4K" bin includes blocks within the (2K; 4K] range. + */ +#define BIN(size) (highbit64((size) - 1)) + + switch (block_bin_mode) { + case BIN_PSIZE: bin = BIN(BP_GET_PSIZE(bp)); break; + case BIN_LSIZE: bin = BIN(BP_GET_LSIZE(bp)); break; + case BIN_ASIZE: bin = BIN(BP_GET_ASIZE(bp)); break; + case BIN_AUTO: break; + default: PANIC("bad block_bin_mode"); abort(); + } + + if (block_bin_mode == BIN_AUTO) + bin = BIN(BP_GET_PSIZE(bp)); zcb->zcb_psize_count[bin]++; zcb->zcb_psize_len[bin] += BP_GET_PSIZE(bp); zcb->zcb_psize_total += BP_GET_PSIZE(bp); - bin = highbit64(BP_GET_LSIZE(bp)) - 1; + if (block_bin_mode == BIN_AUTO) + bin = BIN(BP_GET_LSIZE(bp)); zcb->zcb_lsize_count[bin]++; zcb->zcb_lsize_len[bin] += BP_GET_LSIZE(bp); zcb->zcb_lsize_total += BP_GET_LSIZE(bp); - bin = highbit64(BP_GET_ASIZE(bp)) - 1; + if (block_bin_mode == BIN_AUTO) + bin = BIN(BP_GET_ASIZE(bp)); zcb->zcb_asize_count[bin]++; zcb->zcb_asize_len[bin] += BP_GET_ASIZE(bp); zcb->zcb_asize_total += BP_GET_ASIZE(bp); +#undef BIN + +hist_skipped: if (!do_claim) return; @@ -9426,7 +9534,11 @@ main(int argc, char **argv) {"livelist", no_argument, NULL, 'y'}, {"zstd-headers", no_argument, NULL, 'Z'}, {"allocated-map", no_argument, NULL, - ALLOCATED_OPT}, + ARG_ALLOCATED}, + {"bin", required_argument, NULL, + ARG_BLOCK_BIN_MODE}, + {"class", required_argument, NULL, + ARG_BLOCK_CLASSES}, {0, 0, 0, 0} }; @@ -9457,7 +9569,7 @@ main(int argc, char **argv) case 'u': case 'y': case 'Z': - case ALLOCATED_OPT: + case ARG_ALLOCATED: dump_opt[c]++; dump_all = 0; break; @@ -9540,6 +9652,59 @@ main(int argc, char **argv) case 'x': vn_dumpdir = optarg; break; + case ARG_BLOCK_BIN_MODE: + if (strcmp(optarg, "lsize") == 0) { + block_bin_mode = BIN_LSIZE; + } else if (strcmp(optarg, "psize") == 0) { + block_bin_mode = BIN_PSIZE; + } else if (strcmp(optarg, "asize") == 0) { + block_bin_mode = BIN_ASIZE; + } else { + (void) fprintf(stderr, + "--bin=\"%s\" must be one of \"lsize\", " + "\"psize\" or \"asize\"\n", optarg); + usage(); + } + break; + + case ARG_BLOCK_CLASSES: { + char *buf = strdup(optarg), *tok = buf, *next, + *save = NULL; + + while ((next = strtok_r(tok, ",", &save)) != NULL) { + tok = NULL; + + if (strcmp(next, "normal") == 0) { + block_classes |= CLASS_NORMAL; + } else if (strcmp(next, "special") == 0) { + block_classes |= CLASS_SPECIAL; + } else if (strcmp(next, "dedup") == 0) { + block_classes |= CLASS_DEDUP; + } else if (strcmp(next, "other") == 0) { + block_classes |= CLASS_OTHER; + } else { + (void) fprintf(stderr, + "--class=\"%s\" must be a " + "comma-separated list of either " + "\"normal\", \"special\", " + "\"asize\" or \"other\"; " + "got \"%s\"\n", + optarg, next); + usage(); + } + } + + if (block_classes == 0) { + (void) fprintf(stderr, + "--class= must be a comma-separated " + "list of either \"normal\", \"special\", " + "\"asize\" or \"other\"; got empty\n"); + usage(); + } + + free(buf); + break; + } default: usage(); break; diff --git a/sys/contrib/openzfs/cmd/zpool/zpool_vdev.c b/sys/contrib/openzfs/cmd/zpool/zpool_vdev.c index 088c0108e911..222b5524669e 100644 --- a/sys/contrib/openzfs/cmd/zpool/zpool_vdev.c +++ b/sys/contrib/openzfs/cmd/zpool/zpool_vdev.c @@ -270,14 +270,13 @@ is_spare(nvlist_t *config, const char *path) * draid* Virtual dRAID spare */ static nvlist_t * -make_leaf_vdev(nvlist_t *props, const char *arg, boolean_t is_primary) +make_leaf_vdev(const char *arg, boolean_t is_primary, uint64_t ashift) { char path[MAXPATHLEN]; struct stat64 statbuf; nvlist_t *vdev = NULL; const char *type = NULL; boolean_t wholedisk = B_FALSE; - uint64_t ashift = 0; int err; /* @@ -382,31 +381,6 @@ make_leaf_vdev(nvlist_t *props, const char *arg, boolean_t is_primary) (uint64_t)wholedisk) == 0); /* - * Override defaults if custom properties are provided. - */ - if (props != NULL) { - const char *value = NULL; - - if (nvlist_lookup_string(props, - zpool_prop_to_name(ZPOOL_PROP_ASHIFT), &value) == 0) { - if (zfs_nicestrtonum(NULL, value, &ashift) != 0) { - (void) fprintf(stderr, - gettext("ashift must be a number.\n")); - return (NULL); - } - if (ashift != 0 && - (ashift < ASHIFT_MIN || ashift > ASHIFT_MAX)) { - (void) fprintf(stderr, - gettext("invalid 'ashift=%" PRIu64 "' " - "property: only values between %" PRId32 " " - "and %" PRId32 " are allowed.\n"), - ashift, ASHIFT_MIN, ASHIFT_MAX); - return (NULL); - } - } - } - - /* * If the device is known to incorrectly report its physical sector * size explicitly provide the known correct value. */ @@ -1513,6 +1487,29 @@ construct_spec(nvlist_t *props, int argc, char **argv) const char *type, *fulltype; boolean_t is_log, is_special, is_dedup, is_spare; boolean_t seen_logs; + uint64_t ashift = 0; + + if (props != NULL) { + const char *value = NULL; + + if (nvlist_lookup_string(props, + zpool_prop_to_name(ZPOOL_PROP_ASHIFT), &value) == 0) { + if (zfs_nicestrtonum(NULL, value, &ashift) != 0) { + (void) fprintf(stderr, + gettext("ashift must be a number.\n")); + return (NULL); + } + if (ashift != 0 && + (ashift < ASHIFT_MIN || ashift > ASHIFT_MAX)) { + (void) fprintf(stderr, + gettext("invalid 'ashift=%" PRIu64 "' " + "property: only values between %" PRId32 " " + "and %" PRId32 " are allowed.\n"), + ashift, ASHIFT_MIN, ASHIFT_MAX); + return (NULL); + } + } + } top = NULL; toplevels = 0; @@ -1618,9 +1615,9 @@ construct_spec(nvlist_t *props, int argc, char **argv) children * sizeof (nvlist_t *)); if (child == NULL) zpool_no_memory(); - if ((nv = make_leaf_vdev(props, argv[c], + if ((nv = make_leaf_vdev(argv[c], !(is_log || is_special || is_dedup || - is_spare))) == NULL) { + is_spare), ashift)) == NULL) { for (c = 0; c < children - 1; c++) nvlist_free(child[c]); free(child); @@ -1684,6 +1681,10 @@ construct_spec(nvlist_t *props, int argc, char **argv) ZPOOL_CONFIG_ALLOCATION_BIAS, VDEV_ALLOC_BIAS_DEDUP) == 0); } + if (ashift > 0) { + fnvlist_add_uint64(nv, + ZPOOL_CONFIG_ASHIFT, ashift); + } if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) { verify(nvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, @@ -1711,8 +1712,9 @@ construct_spec(nvlist_t *props, int argc, char **argv) * We have a device. Pass off to make_leaf_vdev() to * construct the appropriate nvlist describing the vdev. */ - if ((nv = make_leaf_vdev(props, argv[0], !(is_log || - is_special || is_dedup || is_spare))) == NULL) + if ((nv = make_leaf_vdev(argv[0], !(is_log || + is_special || is_dedup || is_spare), + ashift)) == NULL) goto spec_out; verify(nvlist_add_uint64(nv, diff --git a/sys/contrib/openzfs/lib/libspl/include/sys/uio.h b/sys/contrib/openzfs/lib/libspl/include/sys/uio.h index 93aa4984d734..9ada482be000 100644 --- a/sys/contrib/openzfs/lib/libspl/include/sys/uio.h +++ b/sys/contrib/openzfs/lib/libspl/include/sys/uio.h @@ -41,6 +41,7 @@ #ifndef _LIBSPL_SYS_UIO_H #define _LIBSPL_SYS_UIO_H +#include <sys/sysmacros.h> #include <sys/types.h> #include_next <sys/uio.h> diff --git a/sys/contrib/openzfs/lib/libzfs/libzfs_status.c b/sys/contrib/openzfs/lib/libzfs/libzfs_status.c index bdddefb92165..a589ca6896f0 100644 --- a/sys/contrib/openzfs/lib/libzfs/libzfs_status.c +++ b/sys/contrib/openzfs/lib/libzfs/libzfs_status.c @@ -98,57 +98,57 @@ static const char *const zfs_msgid_table[] = { #define NMSGID (sizeof (zfs_msgid_table) / sizeof (zfs_msgid_table[0])) static int -vdev_missing(vdev_stat_t *vs, uint_t vsc) +vdev_missing(vdev_stat_t *vs, uint_t vsc, void *arg) { - (void) vsc; + (void) vsc, (void) arg; return (vs->vs_state == VDEV_STATE_CANT_OPEN && vs->vs_aux == VDEV_AUX_OPEN_FAILED); } static int -vdev_faulted(vdev_stat_t *vs, uint_t vsc) +vdev_faulted(vdev_stat_t *vs, uint_t vsc, void *arg) { - (void) vsc; + (void) vsc, (void) arg; return (vs->vs_state == VDEV_STATE_FAULTED); } static int -vdev_errors(vdev_stat_t *vs, uint_t vsc) +vdev_errors(vdev_stat_t *vs, uint_t vsc, void *arg) { - (void) vsc; + (void) vsc, (void) arg; return (vs->vs_state == VDEV_STATE_DEGRADED || vs->vs_read_errors != 0 || vs->vs_write_errors != 0 || vs->vs_checksum_errors != 0); } static int -vdev_broken(vdev_stat_t *vs, uint_t vsc) +vdev_broken(vdev_stat_t *vs, uint_t vsc, void *arg) { - (void) vsc; + (void) vsc, (void) arg; return (vs->vs_state == VDEV_STATE_CANT_OPEN); } static int -vdev_offlined(vdev_stat_t *vs, uint_t vsc) +vdev_offlined(vdev_stat_t *vs, uint_t vsc, void *arg) { - (void) vsc; + (void) vsc, (void) arg; return (vs->vs_state == VDEV_STATE_OFFLINE); } static int -vdev_removed(vdev_stat_t *vs, uint_t vsc) +vdev_removed(vdev_stat_t *vs, uint_t vsc, void *arg) { - (void) vsc; + (void) vsc, (void) arg; return (vs->vs_state == VDEV_STATE_REMOVED); } static int -vdev_non_native_ashift(vdev_stat_t *vs, uint_t vsc) +vdev_non_native_ashift(vdev_stat_t *vs, uint_t vsc, void *arg) { - if (getenv("ZPOOL_STATUS_NON_NATIVE_ASHIFT_IGNORE") != NULL) - return (0); + uint64_t ashift = *(uint64_t *)arg; return (VDEV_STAT_VALID(vs_physical_ashift, vsc) && + (ashift == 0 || vs->vs_configured_ashift < ashift) && vs->vs_configured_ashift < vs->vs_physical_ashift); } @@ -156,8 +156,8 @@ vdev_non_native_ashift(vdev_stat_t *vs, uint_t vsc) * Detect if any leaf devices that have seen errors or could not be opened. */ static boolean_t -find_vdev_problem(nvlist_t *vdev, int (*func)(vdev_stat_t *, uint_t), - boolean_t ignore_replacing) +find_vdev_problem(nvlist_t *vdev, int (*func)(vdev_stat_t *, uint_t, void *), + void *arg, boolean_t ignore_replacing) { nvlist_t **child; uint_t c, children; @@ -177,14 +177,16 @@ find_vdev_problem(nvlist_t *vdev, int (*func)(vdev_stat_t *, uint_t), if (nvlist_lookup_nvlist_array(vdev, ZPOOL_CONFIG_CHILDREN, &child, &children) == 0) { - for (c = 0; c < children; c++) - if (find_vdev_problem(child[c], func, ignore_replacing)) + for (c = 0; c < children; c++) { + if (find_vdev_problem(child[c], func, arg, + ignore_replacing)) return (B_TRUE); + } } else { uint_t vsc; vdev_stat_t *vs = (vdev_stat_t *)fnvlist_lookup_uint64_array( vdev, ZPOOL_CONFIG_VDEV_STATS, &vsc); - if (func(vs, vsc) != 0) + if (func(vs, vsc, arg) != 0) return (B_TRUE); } @@ -193,9 +195,11 @@ find_vdev_problem(nvlist_t *vdev, int (*func)(vdev_stat_t *, uint_t), */ if (nvlist_lookup_nvlist_array(vdev, ZPOOL_CONFIG_L2CACHE, &child, &children) == 0) { - for (c = 0; c < children; c++) - if (find_vdev_problem(child[c], func, ignore_replacing)) + for (c = 0; c < children; c++) { + if (find_vdev_problem(child[c], func, arg, + ignore_replacing)) return (B_TRUE); + } } return (B_FALSE); @@ -220,7 +224,7 @@ find_vdev_problem(nvlist_t *vdev, int (*func)(vdev_stat_t *, uint_t), */ static zpool_status_t check_status(nvlist_t *config, boolean_t isimport, - zpool_errata_t *erratap, const char *compat) + zpool_errata_t *erratap, const char *compat, uint64_t ashift) { pool_scan_stat_t *ps = NULL; uint_t vsc, psc; @@ -371,15 +375,15 @@ check_status(nvlist_t *config, boolean_t isimport, * Bad devices in non-replicated config. */ if (vs->vs_state == VDEV_STATE_CANT_OPEN && - find_vdev_problem(nvroot, vdev_faulted, B_TRUE)) + find_vdev_problem(nvroot, vdev_faulted, NULL, B_TRUE)) return (ZPOOL_STATUS_FAULTED_DEV_NR); if (vs->vs_state == VDEV_STATE_CANT_OPEN && - find_vdev_problem(nvroot, vdev_missing, B_TRUE)) + find_vdev_problem(nvroot, vdev_missing, NULL, B_TRUE)) return (ZPOOL_STATUS_MISSING_DEV_NR); if (vs->vs_state == VDEV_STATE_CANT_OPEN && - find_vdev_problem(nvroot, vdev_broken, B_TRUE)) + find_vdev_problem(nvroot, vdev_broken, NULL, B_TRUE)) return (ZPOOL_STATUS_CORRUPT_LABEL_NR); /* @@ -402,35 +406,37 @@ check_status(nvlist_t *config, boolean_t isimport, /* * Missing devices in a replicated config. */ - if (find_vdev_problem(nvroot, vdev_faulted, B_TRUE)) + if (find_vdev_problem(nvroot, vdev_faulted, NULL, B_TRUE)) return (ZPOOL_STATUS_FAULTED_DEV_R); - if (find_vdev_problem(nvroot, vdev_missing, B_TRUE)) + if (find_vdev_problem(nvroot, vdev_missing, NULL, B_TRUE)) return (ZPOOL_STATUS_MISSING_DEV_R); - if (find_vdev_problem(nvroot, vdev_broken, B_TRUE)) + if (find_vdev_problem(nvroot, vdev_broken, NULL, B_TRUE)) return (ZPOOL_STATUS_CORRUPT_LABEL_R); /* * Devices with errors */ - if (!isimport && find_vdev_problem(nvroot, vdev_errors, B_TRUE)) + if (!isimport && find_vdev_problem(nvroot, vdev_errors, NULL, B_TRUE)) return (ZPOOL_STATUS_FAILING_DEV); /* * Offlined devices */ - if (find_vdev_problem(nvroot, vdev_offlined, B_TRUE)) + if (find_vdev_problem(nvroot, vdev_offlined, NULL, B_TRUE)) return (ZPOOL_STATUS_OFFLINE_DEV); /* * Removed device */ - if (find_vdev_problem(nvroot, vdev_removed, B_TRUE)) + if (find_vdev_problem(nvroot, vdev_removed, NULL, B_TRUE)) return (ZPOOL_STATUS_REMOVED_DEV); /* * Suboptimal, but usable, ashift configuration. */ - if (find_vdev_problem(nvroot, vdev_non_native_ashift, B_FALSE)) + if (!isimport && + getenv("ZPOOL_STATUS_NON_NATIVE_ASHIFT_IGNORE") == NULL && + find_vdev_problem(nvroot, vdev_non_native_ashift, &ashift, B_FALSE)) return (ZPOOL_STATUS_NON_NATIVE_ASHIFT); /* @@ -510,8 +516,10 @@ zpool_get_status(zpool_handle_t *zhp, const char **msgid, ZFS_MAXPROPLEN, NULL, B_FALSE) != 0) compatibility[0] = '\0'; + uint64_t ashift = zpool_get_prop_int(zhp, ZPOOL_PROP_ASHIFT, NULL); + zpool_status_t ret = check_status(zhp->zpool_config, B_FALSE, errata, - compatibility); + compatibility, ashift); if (msgid != NULL) { if (ret >= NMSGID) @@ -526,7 +534,7 @@ zpool_status_t zpool_import_status(nvlist_t *config, const char **msgid, zpool_errata_t *errata) { - zpool_status_t ret = check_status(config, B_TRUE, errata, NULL); + zpool_status_t ret = check_status(config, B_TRUE, errata, NULL, 0); if (ret >= NMSGID) *msgid = NULL; diff --git a/sys/contrib/openzfs/lib/libzpool/kernel.c b/sys/contrib/openzfs/lib/libzpool/kernel.c index 8ed374627264..70eba5099119 100644 --- a/sys/contrib/openzfs/lib/libzpool/kernel.c +++ b/sys/contrib/openzfs/lib/libzpool/kernel.c @@ -23,6 +23,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2016 Actifio, Inc. All rights reserved. + * Copyright (c) 2025, Klara, Inc. */ #include <assert.h> @@ -645,39 +646,60 @@ __dprintf(boolean_t dprint, const char *file, const char *func, * cmn_err() and panic() * ========================================================================= */ -static char ce_prefix[CE_IGNORE][10] = { "", "NOTICE: ", "WARNING: ", "" }; -static char ce_suffix[CE_IGNORE][2] = { "", "\n", "\n", "" }; -__attribute__((noreturn)) void -vpanic(const char *fmt, va_list adx) +static __attribute__((noreturn)) void +panic_stop_or_abort(void) { - (void) fprintf(stderr, "error: "); - (void) vfprintf(stderr, fmt, adx); - (void) fprintf(stderr, "\n"); + const char *stopenv = getenv("LIBZPOOL_PANIC_STOP"); + if (stopenv != NULL && atoi(stopenv)) { + fputs("libzpool: LIBZPOOL_PANIC_STOP is set, sending " + "SIGSTOP to process group\n", stderr); + fflush(stderr); + + kill(0, SIGSTOP); + + fputs("libzpool: continued after panic stop, " + "aborting\n", stderr); + } abort(); /* think of it as a "user-level crash dump" */ } -__attribute__((noreturn)) void -panic(const char *fmt, ...) +static void +vcmn_msg(int ce, const char *fmt, va_list adx) { - va_list adx; + switch (ce) { + case CE_IGNORE: + return; + case CE_CONT: + break; + case CE_NOTE: + fputs("libzpool: NOTICE: ", stderr); + break; + case CE_WARN: + fputs("libzpool: WARNING: ", stderr); + break; + case CE_PANIC: + fputs("libzpool: PANIC: ", stderr); + break; + default: + fputs("libzpool: [unknown severity %d]: ", stderr); + break; + } - va_start(adx, fmt); - vpanic(fmt, adx); - va_end(adx); + vfprintf(stderr, fmt, adx); + if (ce != CE_CONT) + fputc('\n', stderr); + fflush(stderr); } void vcmn_err(int ce, const char *fmt, va_list adx) { + vcmn_msg(ce, fmt, adx); + if (ce == CE_PANIC) - vpanic(fmt, adx); - if (ce != CE_NOTE) { /* suppress noise in userland stress testing */ - (void) fprintf(stderr, "%s", ce_prefix[ce]); - (void) vfprintf(stderr, fmt, adx); - (void) fprintf(stderr, "%s", ce_suffix[ce]); - } + panic_stop_or_abort(); } void @@ -690,6 +712,25 @@ cmn_err(int ce, const char *fmt, ...) va_end(adx); } +__attribute__((noreturn)) void +panic(const char *fmt, ...) +{ + va_list adx; + + va_start(adx, fmt); + vcmn_msg(CE_PANIC, fmt, adx); + va_end(adx); + + panic_stop_or_abort(); +} + +__attribute__((noreturn)) void +vpanic(const char *fmt, va_list adx) +{ + vcmn_msg(CE_PANIC, fmt, adx); + panic_stop_or_abort(); +} + /* * ========================================================================= * misc routines diff --git a/sys/contrib/openzfs/man/man8/zdb.8 b/sys/contrib/openzfs/man/man8/zdb.8 index c3290ea14769..f51e24fa849c 100644 --- a/sys/contrib/openzfs/man/man8/zdb.8 +++ b/sys/contrib/openzfs/man/man8/zdb.8 @@ -144,6 +144,20 @@ subcommand. Display statistics regarding the number, size .Pq logical, physical and allocated and deduplication of blocks. +.It Fl -bin Ns = Ns ( Li lsize Ns | Ns Li psize Ns | Ns Li asize ) +When used with +.Fl bb , +sort blocks into all three bins according to the given size (instead of binning +a block for each size separately). +.Pp +For instance, with +.Fl -bin Ns = Ns Li lsize , +a block with lsize of 16K and psize of 4K will be added to the 16K bin +in all three columns. +.It Fl -class Ns = Ns ( Li normal Ns | Ns Li special Ns | Ns Li dedup Ns | Ns Li other ) Ns Op , Ns … +When used with +.Fl bb , +only consider blocks from these allocation classes. .It Fl B , -backup Generate a backup stream, similar to .Nm zfs Cm send , diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c index 4de48e013ec4..d0a9c662e6f0 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c @@ -762,8 +762,7 @@ zfsctl_common_pathconf(struct vop_pathconf_args *ap) return (0); case _PC_MIN_HOLE_SIZE: - *ap->a_retval = (int)SPA_MINBLOCKSIZE; - return (0); + return (EINVAL); case _PC_ACL_EXTENDED: *ap->a_retval = 0; diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c index 411225786089..e5344497f1be 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c @@ -4116,6 +4116,7 @@ zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr, { znode_t *zp; zfsvfs_t *zfsvfs; + uint_t blksize, iosize; int error; switch (cmd) { @@ -4127,8 +4128,20 @@ zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr, *valp = 64; return (0); case _PC_MIN_HOLE_SIZE: - *valp = (int)SPA_MINBLOCKSIZE; - return (0); + iosize = vp->v_mount->mnt_stat.f_iosize; + if (vp->v_type == VREG) { + zp = VTOZ(vp); + blksize = zp->z_blksz; + if (zp->z_size <= blksize) + blksize = MAX(blksize, iosize); + *valp = (int)blksize; + return (0); + } + if (vp->v_type == VDIR) { + *valp = (int)iosize; + return (0); + } + return (EINVAL); case _PC_ACL_EXTENDED: #if 0 /* POSIX ACLs are not implemented for ZFS on FreeBSD yet. */ zp = VTOZ(vp); diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c index 967a018640e1..4e66bee7744d 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c @@ -337,16 +337,14 @@ zvol_discard(zv_request_t *zvr) } /* - * Align the request to volume block boundaries when a secure erase is - * not required. This will prevent dnode_free_range() from zeroing out - * the unaligned parts which is slow (read-modify-write) and useless - * since we are not freeing any space by doing so. + * Align the request to volume block boundaries. This will prevent + * dnode_free_range() from zeroing out the unaligned parts which is + * slow (read-modify-write) and useless since we are not freeing any + * space by doing so. */ - if (!io_is_secure_erase(bio, rq)) { - start = P2ROUNDUP(start, zv->zv_volblocksize); - end = P2ALIGN_TYPED(end, zv->zv_volblocksize, uint64_t); - size = end - start; - } + start = P2ROUNDUP(start, zv->zv_volblocksize); + end = P2ALIGN_TYPED(end, zv->zv_volblocksize, uint64_t); + size = end - start; if (start >= end) goto unlock; @@ -467,6 +465,24 @@ zvol_read_task(void *arg) zv_request_task_free(task); } +/* + * Note: + * + * The kernel uses different enum names for the IO opcode, depending on the + * kernel version ('req_opf', 'req_op'). To sidestep this, use macros rather + * than inline functions for these checks. + */ +/* Should this IO go down the zvol write path? */ +#define ZVOL_OP_IS_WRITE(op) \ + (op == REQ_OP_WRITE || \ + op == REQ_OP_FLUSH || \ + op == REQ_OP_DISCARD) + +/* Is this IO type supported by zvols? */ +#define ZVOL_OP_IS_SUPPORTED(op) (op == REQ_OP_READ || ZVOL_OP_IS_WRITE(op)) + +/* Get the IO opcode */ +#define ZVOL_OP(bio, rq) (bio != NULL ? bio_op(bio) : req_op(rq)) /* * Process a BIO or request @@ -486,27 +502,32 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq, uint64_t size = io_size(bio, rq); int rw; - if (rq != NULL) { - /* - * Flush & trim requests go down the zvol_write codepath. Or - * more specifically: - * - * If request is a write, or if it's op_is_sync() and not a - * read, or if it's a flush, or if it's a discard, then send the - * request down the write path. - */ - if (op_is_write(rq->cmd_flags) || - (op_is_sync(rq->cmd_flags) && req_op(rq) != REQ_OP_READ) || - req_op(rq) == REQ_OP_FLUSH || - op_is_discard(rq->cmd_flags)) { - rw = WRITE; - } else { - rw = READ; - } + if (unlikely(!ZVOL_OP_IS_SUPPORTED(ZVOL_OP(bio, rq)))) { + zfs_dbgmsg("Unsupported zvol %s, op=%d, flags=0x%x", + rq != NULL ? "request" : "BIO", + ZVOL_OP(bio, rq), + rq != NULL ? rq->cmd_flags : bio->bi_opf); + ASSERT(ZVOL_OP_IS_SUPPORTED(ZVOL_OP(bio, rq))); + zvol_end_io(bio, rq, SET_ERROR(ENOTSUPP)); + goto out; + } + + if (ZVOL_OP_IS_WRITE(ZVOL_OP(bio, rq))) { + rw = WRITE; } else { - rw = bio_data_dir(bio); + rw = READ; } + /* + * Sanity check + * + * If we're a BIO, check our rw matches the kernel's + * bio_data_dir(bio) rw. We need to check because we support fewer + * IO operations, and want to verify that what we think are reads and + * writes from those operations match what the kernel thinks. + */ + ASSERT(rq != NULL || rw == bio_data_dir(bio)); + if (unlikely(zv->zv_flags & ZVOL_REMOVING)) { zvol_end_io(bio, rq, SET_ERROR(ENXIO)); goto out; @@ -610,7 +631,7 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq, * interfaces lack this functionality (they block waiting for * the i/o to complete). */ - if (io_is_discard(bio, rq) || io_is_secure_erase(bio, rq)) { + if (io_is_discard(bio, rq)) { if (force_sync) { zvol_discard(&zvr); } else { diff --git a/sys/contrib/openzfs/module/zfs/arc.c b/sys/contrib/openzfs/module/zfs/arc.c index b677f90280d7..dbb5e942e2e6 100644 --- a/sys/contrib/openzfs/module/zfs/arc.c +++ b/sys/contrib/openzfs/module/zfs/arc.c @@ -1157,7 +1157,7 @@ buf_fini(void) #if defined(_KERNEL) /* * Large allocations which do not require contiguous pages - * should be using vmem_free() in the linux kernel\ + * should be using vmem_free() in the linux kernel. */ vmem_free(buf_hash_table.ht_table, (buf_hash_table.ht_mask + 1) * sizeof (void *)); @@ -4651,10 +4651,10 @@ arc_flush_task(void *arg) arc_flush_impl(spa_guid, B_FALSE); arc_async_flush_remove(spa_guid, af->af_cache_level); - uint64_t elaspsed = NSEC2MSEC(gethrtime() - start_time); - if (elaspsed > 0) { + uint64_t elapsed = NSEC2MSEC(gethrtime() - start_time); + if (elapsed > 0) { zfs_dbgmsg("spa %llu arc flushed in %llu ms", - (u_longlong_t)spa_guid, (u_longlong_t)elaspsed); + (u_longlong_t)spa_guid, (u_longlong_t)elapsed); } } @@ -9152,7 +9152,7 @@ top: if (dev->l2ad_first) { /* * This is the first sweep through the device. There is - * nothing to evict. We have already trimmmed the + * nothing to evict. We have already trimmed the * whole device. */ goto out; @@ -10086,12 +10086,12 @@ l2arc_device_teardown(void *arg) kmem_free(remdev->l2ad_dev_hdr, remdev->l2ad_dev_hdr_asize); vmem_free(remdev, sizeof (l2arc_dev_t)); - uint64_t elaspsed = NSEC2MSEC(gethrtime() - start_time); - if (elaspsed > 0) { + uint64_t elapsed = NSEC2MSEC(gethrtime() - start_time); + if (elapsed > 0) { zfs_dbgmsg("spa %llu, vdev %llu removed in %llu ms", (u_longlong_t)rva->rva_spa_gid, (u_longlong_t)rva->rva_vdev_gid, - (u_longlong_t)elaspsed); + (u_longlong_t)elapsed); } if (rva->rva_async) diff --git a/sys/contrib/openzfs/tests/zfs-tests/cmd/mmap_seek.c b/sys/contrib/openzfs/tests/zfs-tests/cmd/mmap_seek.c index 45ba17e36c35..f46980cad111 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/cmd/mmap_seek.c +++ b/sys/contrib/openzfs/tests/zfs-tests/cmd/mmap_seek.c @@ -47,25 +47,34 @@ #endif static void +seek_expect(int fd, off_t offset, int whence, off_t expect_offset) +{ + errno = 0; + off_t seek_offset = lseek(fd, offset, whence); + if (seek_offset == expect_offset) + return; + + int err = errno; + fprintf(stderr, "lseek(fd, %ld, SEEK_%s) = %ld (expected %ld)", + offset, (whence == SEEK_DATA ? "DATA" : "HOLE"), + seek_offset, expect_offset); + if (err != 0) + fprintf(stderr, " (errno %d [%s])\n", err, strerror(err)); + else + fputc('\n', stderr); + exit(2); +} + +static inline void seek_data(int fd, off_t offset, off_t expected) { - off_t data_offset = lseek(fd, offset, SEEK_DATA); - if (data_offset != expected) { - fprintf(stderr, "lseek(fd, %d, SEEK_DATA) = %d (expected %d)\n", - (int)offset, (int)data_offset, (int)expected); - exit(2); - } + seek_expect(fd, offset, SEEK_DATA, expected); } -static void +static inline void seek_hole(int fd, off_t offset, off_t expected) { - off_t hole_offset = lseek(fd, offset, SEEK_HOLE); - if (hole_offset != expected) { - fprintf(stderr, "lseek(fd, %d, SEEK_HOLE) = %d (expected %d)\n", - (int)offset, (int)hole_offset, (int)expected); - exit(2); - } + seek_expect(fd, offset, SEEK_HOLE, expected); } int diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_reopen/zpool_reopen_004_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_reopen/zpool_reopen_004_pos.ksh index 2bd9f616170b..1a9774331ba1 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_reopen/zpool_reopen_004_pos.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_reopen/zpool_reopen_004_pos.ksh @@ -47,6 +47,8 @@ function cleanup # bring back removed disk online for further tests insert_disk $REMOVED_DISK $scsi_host poolexists $TESTPOOL && destroy_pool $TESTPOOL + # Since the disk was offline during destroy, remove the label + zpool labelclear $DISK2 -f } log_assert "Testing zpool reopen with pool name as argument" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_trim.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_trim.ksh index bb697b76a9f3..9de308c4a11a 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_trim.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_trim.ksh @@ -41,6 +41,7 @@ # 5. TRIM the first 1MB and last 2MB of the 5MB block of data. # 6. Observe 2MB of used space on the zvol # 7. Verify the trimmed regions are zero'd on the zvol +# 8. Verify Secure Erase does not work on zvols (Linux only) verify_runnable "global" @@ -56,6 +57,7 @@ if is_linux ; then else trimcmd='blkdiscard' fi + secure_trimcmd="$trimcmd --secure" else # By default, FreeBSD 'trim' always does a dry-run. '-f' makes # it perform the actual operation. @@ -114,6 +116,11 @@ function do_test { log_must diff $datafile1 $datafile2 log_must rm $datafile1 $datafile2 + + # Secure erase should not work (Linux check only). + if [ -n "$secure_trimcmd" ] ; then + log_mustnot $secure_trimcmd $zvolpath + fi } log_assert "Verify that a ZFS volume can be TRIMed" |