aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAllan Jude <allan@klarasystems.com>2024-04-24 21:51:21 +0000
committerGitHub <noreply@github.com>2024-04-24 21:51:21 +0000
commit5044c4e3ff0558b726b491a9267fc3db6f855a2d (patch)
tree1f2b8e6e66b4ec71d7e471ad2ee10b2e79bced24
parent67d13998b3e055232a07311c2dc609571eaf1df1 (diff)
downloadsrc-5044c4e3ff0558b726b491a9267fc3db6f855a2d.tar.gz
src-5044c4e3ff0558b726b491a9267fc3db6f855a2d.zip
Fast Dedup: ZAP Shrinking
This allows ZAPs to shrink. When there are two empty sibling leafs, one of them is collapsed and its storage space is reused. This improved performance on directories that at one time contained a large number of files, but many or all of those files have since been deleted. This also applies to all other types of ZAPs as well. Sponsored-by: iXsystems, Inc. Sponsored-by: Klara, Inc. Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: Alexander Motin <mav@FreeBSD.org> Signed-off-by: Alexander Stetsenko <alex.stetsenko@klarasystems.com> Closes #15888
-rw-r--r--man/man4/zfs.47
-rw-r--r--module/zfs/zap.c336
-rw-r--r--tests/runfiles/common.run4
-rw-r--r--tests/zfs-tests/tests/Makefile.am3
-rwxr-xr-xtests/zfs-tests/tests/functional/zap_shrink/cleanup.ksh34
-rwxr-xr-xtests/zfs-tests/tests/functional/zap_shrink/setup.ksh35
-rwxr-xr-xtests/zfs-tests/tests/functional/zap_shrink/zap_shrink_001_pos.ksh81
7 files changed, 488 insertions, 12 deletions
diff --git a/man/man4/zfs.4 b/man/man4/zfs.4
index 22e1106bbf05..ef0385d42b8e 100644
--- a/man/man4/zfs.4
+++ b/man/man4/zfs.4
@@ -16,7 +16,7 @@
.\" own identifying information:
.\" Portions Copyright [yyyy] [name of copyright owner]
.\"
-.Dd January 9, 2024
+.Dd February 14, 2024
.Dt ZFS 4
.Os
.
@@ -564,9 +564,8 @@ However, this is limited by
Maximum micro ZAP size.
A micro ZAP is upgraded to a fat ZAP, once it grows beyond the specified size.
.
-.It Sy zfetch_hole_shift Ns = Ns Sy 2 Pq uint
-Log2 fraction of holes in speculative prefetch stream allowed for it to
-proceed.
+.It Sy zap_shrink_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int
+If set, adjacent empty ZAP blocks will be collapsed, reducing disk space.
.
.It Sy zfetch_min_distance Ns = Ns Sy 4194304 Ns B Po 4 MiB Pc Pq uint
Min bytes to prefetch per stream.
diff --git a/module/zfs/zap.c b/module/zfs/zap.c
index da86defb445c..1b6b16fc6662 100644
--- a/module/zfs/zap.c
+++ b/module/zfs/zap.c
@@ -22,6 +22,8 @@
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2018 by Delphix. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright 2023 Alexander Stetsenko <alex.stetsenko@gmail.com>
+ * Copyright (c) 2023, Klara Inc.
*/
/*
@@ -41,6 +43,7 @@
#include <sys/spa.h>
#include <sys/dmu.h>
+#include <sys/dnode.h>
#include <sys/zfs_context.h>
#include <sys/zfs_znode.h>
#include <sys/fs/zfs.h>
@@ -78,9 +81,16 @@
*/
static int zap_iterate_prefetch = B_TRUE;
+/*
+ * Enable ZAP shrinking. When enabled, empty sibling leaf blocks will be
+ * collapsed into a single block.
+ */
+int zap_shrink_enabled = B_TRUE;
+
int fzap_default_block_shift = 14; /* 16k blocksize */
static uint64_t zap_allocate_blocks(zap_t *zap, int nblocks);
+static int zap_shrink(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx);
void
fzap_byteswap(void *vbuf, size_t size)
@@ -587,6 +597,72 @@ zap_set_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t blk, dmu_tx_t *tx)
}
static int
+zap_set_idx_range_to_blk(zap_t *zap, uint64_t idx, uint64_t nptrs, uint64_t blk,
+ dmu_tx_t *tx)
+{
+ int bs = FZAP_BLOCK_SHIFT(zap);
+ int epb = bs >> 3; /* entries per block */
+ int err = 0;
+
+ ASSERT(tx != NULL);
+ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+
+ /*
+ * Check for i/o errors
+ */
+ for (int i = 0; i < nptrs; i += epb) {
+ uint64_t blk;
+ err = zap_idx_to_blk(zap, idx + i, &blk);
+ if (err != 0) {
+ return (err);
+ }
+ }
+
+ for (int i = 0; i < nptrs; i++) {
+ err = zap_set_idx_to_blk(zap, idx + i, blk, tx);
+ ASSERT0(err); /* we checked for i/o errors above */
+ if (err != 0)
+ break;
+ }
+
+ return (err);
+}
+
+#define ZAP_PREFIX_HASH(pref, pref_len) ((pref) << (64 - (pref_len)))
+
+/*
+ * Each leaf has single range of entries (block pointers) in the ZAP ptrtbl.
+ * If two leaves are siblings, their ranges are adjecent and contain the same
+ * number of entries. In order to find out if a leaf has a sibling, we need to
+ * check the range corresponding to the sibling leaf. There is no need to check
+ * all entries in the range, we only need to check the frist and the last one.
+ */
+static uint64_t
+check_sibling_ptrtbl_range(zap_t *zap, uint64_t prefix, uint64_t prefix_len)
+{
+ ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+
+ uint64_t h = ZAP_PREFIX_HASH(prefix, prefix_len);
+ uint64_t idx = ZAP_HASH_IDX(h, zap_f_phys(zap)->zap_ptrtbl.zt_shift);
+ uint64_t pref_diff = zap_f_phys(zap)->zap_ptrtbl.zt_shift - prefix_len;
+ uint64_t nptrs = (1 << pref_diff);
+ uint64_t first;
+ uint64_t last;
+
+ ASSERT3U(idx+nptrs, <=, (1UL << zap_f_phys(zap)->zap_ptrtbl.zt_shift));
+
+ if (zap_idx_to_blk(zap, idx, &first) != 0)
+ return (0);
+
+ if (zap_idx_to_blk(zap, idx + nptrs - 1, &last) != 0)
+ return (0);
+
+ if (first != last)
+ return (0);
+ return (first);
+}
+
+static int
zap_deref_leaf(zap_t *zap, uint64_t h, dmu_tx_t *tx, krw_t lt, zap_leaf_t **lp)
{
uint64_t blk;
@@ -958,6 +1034,10 @@ fzap_remove(zap_name_t *zn, dmu_tx_t *tx)
if (err == 0) {
zap_entry_remove(&zeh);
zap_increment_num_entries(zn->zn_zap, -1, tx);
+
+ if (zap_leaf_phys(l)->l_hdr.lh_nentries == 0 &&
+ zap_shrink_enabled)
+ return (zap_shrink(zn, l, tx));
}
zap_put_leaf(l);
return (err);
@@ -1222,13 +1302,19 @@ fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za)
ZIO_PRIORITY_ASYNC_READ);
}
- if (zc->zc_leaf &&
- (ZAP_HASH_IDX(zc->zc_hash,
- zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix_len) !=
- zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix)) {
+ if (zc->zc_leaf) {
rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
- zap_put_leaf(zc->zc_leaf);
- zc->zc_leaf = NULL;
+
+ /*
+ * The leaf was either shrunk or split.
+ */
+ if ((zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_block_type == 0) ||
+ (ZAP_HASH_IDX(zc->zc_hash,
+ zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix_len) !=
+ zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix)) {
+ zap_put_leaf(zc->zc_leaf);
+ zc->zc_leaf = NULL;
+ }
}
again:
@@ -1237,8 +1323,6 @@ again:
&zc->zc_leaf);
if (err != 0)
return (err);
- } else {
- rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
}
l = zc->zc_leaf;
@@ -1367,6 +1451,242 @@ fzap_get_stats(zap_t *zap, zap_stats_t *zs)
}
}
+/*
+ * Find last allocated block and update freeblk.
+ */
+static void
+zap_trunc(zap_t *zap)
+{
+ uint64_t nentries;
+ uint64_t lastblk;
+
+ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+
+ if (zap_f_phys(zap)->zap_ptrtbl.zt_blk > 0) {
+ /* External ptrtbl */
+ nentries = (1 << zap_f_phys(zap)->zap_ptrtbl.zt_shift);
+ lastblk = zap_f_phys(zap)->zap_ptrtbl.zt_blk +
+ zap_f_phys(zap)->zap_ptrtbl.zt_numblks - 1;
+ } else {
+ /* Embedded ptrtbl */
+ nentries = (1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap));
+ lastblk = 0;
+ }
+
+ for (uint64_t idx = 0; idx < nentries; idx++) {
+ uint64_t blk;
+ if (zap_idx_to_blk(zap, idx, &blk) != 0)
+ return;
+ if (blk > lastblk)
+ lastblk = blk;
+ }
+
+ ASSERT3U(lastblk, <, zap_f_phys(zap)->zap_freeblk);
+
+ zap_f_phys(zap)->zap_freeblk = lastblk + 1;
+}
+
+/*
+ * ZAP shrinking algorithm.
+ *
+ * We shrink ZAP recuresively removing empty leaves. We can remove an empty leaf
+ * only if it has a sibling. Sibling leaves have the same prefix length and
+ * their prefixes differ only by the least significant (sibling) bit. We require
+ * both siblings to be empty. This eliminates a need to rehash the non-empty
+ * remaining leaf. When we have removed one of two empty sibling, we set ptrtbl
+ * entries of the removed leaf to point out to the remaining leaf. Prefix length
+ * of the remaining leaf is decremented. As a result, it has a new prefix and it
+ * might have a new sibling. So, we repeat the process.
+ *
+ * Steps:
+ * 1. Check if a sibling leaf (sl) exists and it is empty.
+ * 2. Release the leaf (l) if it has the sibling bit (slbit) equal to 1.
+ * 3. Release the sibling (sl) to derefer it again with WRITER lock.
+ * 4. Upgrade zapdir lock to WRITER (once).
+ * 5. Derefer released leaves again.
+ * 6. If it is needed, recheck whether both leaves are still siblings and empty.
+ * 7. Set ptrtbl pointers of the removed leaf (slbit 1) to point out to blkid of
+ * the remaining leaf (slbit 0).
+ * 8. Free disk block of the removed leaf (dmu_free_range).
+ * 9. Decrement prefix_len of the remaining leaf.
+ * 10. Repeat the steps.
+ */
+static int
+zap_shrink(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx)
+{
+ zap_t *zap = zn->zn_zap;
+ int64_t zt_shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift;
+ uint64_t hash = zn->zn_hash;
+ uint64_t prefix = zap_leaf_phys(l)->l_hdr.lh_prefix;
+ uint64_t prefix_len = zap_leaf_phys(l)->l_hdr.lh_prefix_len;
+ boolean_t trunc = B_FALSE;
+ int err = 0;
+
+ ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_nentries, ==, 0);
+ ASSERT3U(prefix_len, <=, zap_f_phys(zap)->zap_ptrtbl.zt_shift);
+ ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+ ASSERT3U(ZAP_HASH_IDX(hash, prefix_len), ==, prefix);
+
+ boolean_t writer = B_FALSE;
+
+ /*
+ * To avoid deadlock always deref leaves in the same order -
+ * sibling 0 first, then sibling 1.
+ */
+ while (prefix_len) {
+ zap_leaf_t *sl;
+ int64_t prefix_diff = zt_shift - prefix_len;
+ uint64_t sl_prefix = prefix ^ 1;
+ uint64_t sl_hash = ZAP_PREFIX_HASH(sl_prefix, prefix_len);
+ int slbit = prefix & 1;
+
+ ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_nentries, ==, 0);
+
+ /*
+ * Check if there is a sibling by reading ptrtbl ptrs.
+ */
+ if (check_sibling_ptrtbl_range(zap, sl_prefix, prefix_len) == 0)
+ break;
+
+ /*
+ * sibling 1, unlock it - we haven't yet dereferenced sibling 0.
+ */
+ if (slbit == 1) {
+ zap_put_leaf(l);
+ l = NULL;
+ }
+
+ /*
+ * Dereference sibling leaf and check if it is empty.
+ */
+ if ((err = zap_deref_leaf(zap, sl_hash, tx, RW_READER,
+ &sl)) != 0)
+ break;
+
+ ASSERT3U(ZAP_HASH_IDX(sl_hash, prefix_len), ==, sl_prefix);
+
+ /*
+ * Check if we have a sibling and it is empty.
+ */
+ if (zap_leaf_phys(sl)->l_hdr.lh_prefix_len != prefix_len ||
+ zap_leaf_phys(sl)->l_hdr.lh_nentries != 0) {
+ zap_put_leaf(sl);
+ break;
+ }
+
+ zap_put_leaf(sl);
+
+ /*
+ * If there two empty sibling, we have work to do, so
+ * we need to lock ZAP ptrtbl as WRITER.
+ */
+ if (!writer && (writer = zap_tryupgradedir(zap, tx)) == 0) {
+ /* We failed to upgrade */
+ if (l != NULL) {
+ zap_put_leaf(l);
+ l = NULL;
+ }
+
+ /*
+ * Usually, the right way to upgrade from a READER lock
+ * to a WRITER lock is to call zap_unlockdir() and
+ * zap_lockdir(), but we do not have a tag. Instead,
+ * we do it in more sophisticated way.
+ */
+ rw_exit(&zap->zap_rwlock);
+ rw_enter(&zap->zap_rwlock, RW_WRITER);
+ dmu_buf_will_dirty(zap->zap_dbuf, tx);
+
+ zt_shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift;
+ writer = B_TRUE;
+ }
+
+ /*
+ * Here we have WRITER lock for ptrtbl.
+ * Now, we need a WRITER lock for both siblings leaves.
+ * Also, we have to recheck if the leaves are still siblings
+ * and still empty.
+ */
+ if (l == NULL) {
+ /* sibling 0 */
+ if ((err = zap_deref_leaf(zap, (slbit ? sl_hash : hash),
+ tx, RW_WRITER, &l)) != 0)
+ break;
+
+ /*
+ * The leaf isn't empty anymore or
+ * it was shrunk/split while our locks were down.
+ */
+ if (zap_leaf_phys(l)->l_hdr.lh_nentries != 0 ||
+ zap_leaf_phys(l)->l_hdr.lh_prefix_len != prefix_len)
+ break;
+ }
+
+ /* sibling 1 */
+ if ((err = zap_deref_leaf(zap, (slbit ? hash : sl_hash), tx,
+ RW_WRITER, &sl)) != 0)
+ break;
+
+ /*
+ * The leaf isn't empty anymore or
+ * it was shrunk/split while our locks were down.
+ */
+ if (zap_leaf_phys(sl)->l_hdr.lh_nentries != 0 ||
+ zap_leaf_phys(sl)->l_hdr.lh_prefix_len != prefix_len) {
+ zap_put_leaf(sl);
+ break;
+ }
+
+ /* If we have gotten here, we have a leaf to collapse */
+ uint64_t idx = (slbit ? prefix : sl_prefix) << prefix_diff;
+ uint64_t nptrs = (1ULL << prefix_diff);
+ uint64_t sl_blkid = sl->l_blkid;
+
+ /*
+ * Set ptrtbl entries to point out to the slibling 0 blkid
+ */
+ if ((err = zap_set_idx_range_to_blk(zap, idx, nptrs, l->l_blkid,
+ tx)) != 0) {
+ zap_put_leaf(sl);
+ break;
+ }
+
+ /*
+ * Free sibling 1 disk block.
+ */
+ int bs = FZAP_BLOCK_SHIFT(zap);
+ if (sl_blkid == zap_f_phys(zap)->zap_freeblk - 1)
+ trunc = B_TRUE;
+
+ (void) dmu_free_range(zap->zap_objset, zap->zap_object,
+ sl_blkid << bs, 1 << bs, tx);
+ zap_put_leaf(sl);
+
+ zap_f_phys(zap)->zap_num_leafs--;
+
+ /*
+ * Update prefix and prefix_len.
+ */
+ zap_leaf_phys(l)->l_hdr.lh_prefix >>= 1;
+ zap_leaf_phys(l)->l_hdr.lh_prefix_len--;
+
+ prefix = zap_leaf_phys(l)->l_hdr.lh_prefix;
+ prefix_len = zap_leaf_phys(l)->l_hdr.lh_prefix_len;
+ }
+
+ if (trunc)
+ zap_trunc(zap);
+
+ if (l != NULL)
+ zap_put_leaf(l);
+
+ return (err);
+}
+
/* CSTYLED */
ZFS_MODULE_PARAM(zfs, , zap_iterate_prefetch, INT, ZMOD_RW,
"When iterating ZAP object, prefetch it");
+
+/* CSTYLED */
+ZFS_MODULE_PARAM(zfs, , zap_shrink_enabled, INT, ZMOD_RW,
+ "Enable ZAP shrinking");
diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run
index 0586d991b802..5e7fdf359a75 100644
--- a/tests/runfiles/common.run
+++ b/tests/runfiles/common.run
@@ -643,6 +643,10 @@ tags = ['functional', 'compression']
tests = ['cp_files_001_pos', 'cp_files_002_pos', 'cp_stress']
tags = ['functional', 'cp_files']
+[tests/functional/zap_shrink]
+tests = ['zap_shrink_001_pos']
+tags = ['functional', 'zap_shrink']
+
[tests/functional/crtime]
tests = ['crtime_001_pos' ]
tags = ['functional', 'crtime']
diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am
index dc447e042225..a6fe030d410c 100644
--- a/tests/zfs-tests/tests/Makefile.am
+++ b/tests/zfs-tests/tests/Makefile.am
@@ -2074,6 +2074,9 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
functional/xattr/xattr_012_pos.ksh \
functional/xattr/xattr_013_pos.ksh \
functional/xattr/xattr_compat.ksh \
+ functional/zap_shrink/cleanup.ksh \
+ functional/zap_shrink/zap_shrink_001_pos.ksh \
+ functional/zap_shrink/setup.ksh \
functional/zpool_influxdb/cleanup.ksh \
functional/zpool_influxdb/setup.ksh \
functional/zpool_influxdb/zpool_influxdb.ksh \
diff --git a/tests/zfs-tests/tests/functional/zap_shrink/cleanup.ksh b/tests/zfs-tests/tests/functional/zap_shrink/cleanup.ksh
new file mode 100755
index 000000000000..42fe70042d6a
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/zap_shrink/cleanup.ksh
@@ -0,0 +1,34 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+
+#
+# Copyright (c) 2013 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+default_cleanup
diff --git a/tests/zfs-tests/tests/functional/zap_shrink/setup.ksh b/tests/zfs-tests/tests/functional/zap_shrink/setup.ksh
new file mode 100755
index 000000000000..b756d4e76c83
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/zap_shrink/setup.ksh
@@ -0,0 +1,35 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+
+#
+# Copyright (c) 2013 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+DISK=${DISKS%% *}
+default_setup $DISK
diff --git a/tests/zfs-tests/tests/functional/zap_shrink/zap_shrink_001_pos.ksh b/tests/zfs-tests/tests/functional/zap_shrink/zap_shrink_001_pos.ksh
new file mode 100755
index 000000000000..4dbf579b8ac7
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/zap_shrink/zap_shrink_001_pos.ksh
@@ -0,0 +1,81 @@
+#! /bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2024, Klara Inc.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+#
+# DESCRIPTION:
+# Create a large number of files in a directory. Then remove all files and
+# check that the directory zap was shrunk. Use zdb to check that the zap object
+# contains only one leaf block using zdb.
+#
+
+verify_runnable "global"
+
+DIR=largedir
+
+NR_FILES=100000
+BATCH=1000
+CWD=$PWD
+
+log_assert "Create a large number of files ($NR_FILES) in a directory. " \
+ "Make sure that the directory ZAP object was shrunk."
+
+log_must mkdir $TESTDIR/$DIR
+
+cd $TESTDIR/$DIR
+# In order to prevent arguments overflowing, create NR_FILES in BATCH at once.
+for i in $(seq $(($NR_FILES/$BATCH))); do
+ touch $(seq $((($i-1)*$BATCH+1)) $(($i*$BATCH)));
+done
+cd $CWD
+
+log_must test $NR_FILES -eq $(ls -U $TESTDIR/$DIR | wc -l)
+
+# remove all files in $DIR directory
+cd $TESTDIR/$DIR
+for i in $(seq $(($NR_FILES/$BATCH))); do
+ rm $(seq $((($i-1)*$BATCH+1)) $(($i*$BATCH)))
+done
+cd $CWD
+sync_pool $TESTPOOL
+
+log_must test 0 -eq $(ls -U $TESTDIR/$DIR | wc -l)
+
+# check whether zap_shrink works
+zapobj=$(zdb -v -O $TESTPOOL/$TESTFS $DIR)
+nleafs=$(echo "$zapobj" | grep "Leaf blocks:" | awk -F\: '{print($2);}')
+log_must test 1 -eq $nleafs
+
+log_must zpool export $TESTPOOL
+log_must zpool import $TESTPOOL
+
+# check whether zap_shrink works
+zapobj=$(zdb -v -O $TESTPOOL/$TESTFS $DIR)
+nleafs=$(echo "$zapobj" | grep "Leaf blocks:" | awk -F\: '{print($2);}')
+log_must test 1 -eq $nleafs
+
+log_pass