From ca960ce56ce1bfe207e4d80ba6e5ab67ea41b32f Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Fri, 30 Jun 2023 13:32:18 -0700 Subject: Update META

Increase the version to 2.2.99 to indicate the master branch is newer than the 2.2.x release. This ensures packages built from the master branch are considered newer than the last release.

Signed-off-by: Brian Behlendorf
---
 META | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/META b/META
index 5f834d5cc7c4..e6488a6fa6f0 100644
--- a/META
+++ b/META
@@ -1,8 +1,8 @@
 Meta: 1
 Name: zfs
 Branch: 1.0
-Version: 2.2.0
-Release: rc1
+Version: 2.2.99
+Release: 1
 Release-Tags: relext
 License: CDDL
 Author: OpenZFS
-- cgit v1.2.3

From 736d5962b42e23c2caaae3de8c5387772239a9a4 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Thu, 13 Jul 2023 11:50:34 -0400 Subject: FreeBSD: Fix build on stable/13 after 1302506.

Starting approximately with version 1302506, vn_lock_pair() grew two additional arguments, following head. There is a one-week hole, but that is the closest reference point we have.

Reviewed-by: Mateusz Guzik
Signed-off-by: Alexander Motin
Sponsored by: iXsystems, Inc.
Closes #15047
---
 module/os/freebsd/zfs/zfs_vnops_os.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/module/os/freebsd/zfs/zfs_vnops_os.c b/module/os/freebsd/zfs/zfs_vnops_os.c
index d29f00a0cbe4..7692200ab250 100644
--- a/module/os/freebsd/zfs/zfs_vnops_os.c
+++ b/module/os/freebsd/zfs/zfs_vnops_os.c
@@ -6263,7 +6263,8 @@ zfs_freebsd_copy_file_range(struct vop_copy_file_range_args *ap)
 			goto bad_write_fallback;
 		}
 	} else {
-#if __FreeBSD_version >= 1400086
+#if (__FreeBSD_version >= 1302506 && __FreeBSD_version < 1400000) || \
+    __FreeBSD_version >= 1400086
 		vn_lock_pair(invp, false, LK_EXCLUSIVE, outvp, false,
 		    LK_EXCLUSIVE);
 #else
-- cgit v1.2.3

From 58f4a094b4efa361aea38e2febfe72c350744e7a Mon Sep 17 00:00:00 2001 From: Umer Saleem Date: Thu, 13 Jul 2023 20:55:12 +0500 Subject: Update changelog for 2.2

Add a new changelog entry for native packages to reflect version 2.2.99.

Reviewed-by: Brian Behlendorf
Reviewed-by: Ameer Hamza
Signed-off-by: Umer Saleem
Closes #15054
---
 contrib/debian/changelog | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/contrib/debian/changelog b/contrib/debian/changelog
index 6273d603834a..0d6a987dc2c4 100644
--- a/contrib/debian/changelog
+++ b/contrib/debian/changelog
@@ -1,3 +1,9 @@
+openzfs-linux (2.2.99-1) unstable; urgency=low
+
+  * OpenZFS 2.2 is tagged.
+
+ -- Umer Saleem Wed, 12 Jul 2022 15:00:00 -0400
+
 openzfs-linux (2.1.99-1) unstable; urgency=low

   * Integrate minimally modified Debian packaging from ZFS on Linux
-- cgit v1.2.3

From 6c9aa1d2a6d39259e4d5bce8f470f0bf8494dc77 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Thu, 13 Jul 2023 18:06:57 +0200 Subject: FreeBSD: catch up to __FreeBSD_version 1400093

Reviewed-by: Alexander Motin
Signed-off-by: Mateusz Guzik
Closes #15036
---
 include/os/freebsd/spl/sys/vnode.h | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/include/os/freebsd/spl/sys/vnode.h b/include/os/freebsd/spl/sys/vnode.h
index ab1727dca0c9..0779e58e4953 100644
--- a/include/os/freebsd/spl/sys/vnode.h
+++ b/include/os/freebsd/spl/sys/vnode.h
@@ -36,7 +36,11 @@ struct xucred;
 typedef struct flock flock64_t;
 typedef struct vnode vnode_t;
 typedef struct vattr vattr_t;
+#if __FreeBSD_version < 1400093
 typedef enum vtype vtype_t;
+#else
+#define vtype_t __enum_uint8(vtype)
+#endif

 #include
 #include
-- cgit v1.2.3

From 6db781d52ca0993af42e9ecb6741263167b991e0 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Thu, 13 Jul 2023 12:12:55 -0400 Subject: Add missed DMU_PROJECTUSED_OBJECT prefetch.

It seems 9c5167d19f "Project Quota on ZFS" failed to add a prefetch for DMU_PROJECTUSED_OBJECT during scan (scrub/resilver). It should not cause visible problems, but may affect scrub/resilver performance.

Reviewed-by: Brian Behlendorf
Signed-off-by: Alexander Motin
Sponsored by: iXsystems, Inc.
Closes #15024
---
 module/zfs/dsl_scan.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c
index 50428bff3ef4..ecdeba80b745 100644
--- a/module/zfs/dsl_scan.c
+++ b/module/zfs/dsl_scan.c
@@ -2015,6 +2015,11 @@ dsl_scan_prefetch_cb(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
 		    zb->zb_objset, DMU_META_DNODE_OBJECT);

 		if (OBJSET_BUF_HAS_USERUSED(buf)) {
+			if (OBJSET_BUF_HAS_PROJECTUSED(buf)) {
+				dsl_scan_prefetch_dnode(scn,
+				    &osp->os_projectused_dnode, zb->zb_objset,
+				    DMU_PROJECTUSED_OBJECT);
+			}
 			dsl_scan_prefetch_dnode(scn,
 			    &osp->os_groupused_dnode, zb->zb_objset,
 			    DMU_GROUPUSED_OBJECT);
-- cgit v1.2.3

From fdba8cbb796cb089c3d6eefa833f5176b0474c29 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Fri, 14 Jul 2023 19:11:46 -0400 Subject: Avoid extra snprintf() in dsl_deadlist_merge().

Since we are already iterating the ZAP, we have the exact string key to remove; we do not need to call zap_remove_int() with the int key we just converted from that string, when we can simply call zap_remove() with the original string. This should make no functional change, only a micro-optimization.
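As a reader's aside, the pattern being adopted reduces to a short sketch: iterate a ZAP with a cursor and remove each entry by the string key the cursor already provides. The following is a minimal hedged illustration only, assuming an open objset `os`, a ZAP object `obj`, and an assigned transaction `tx`; it is not the patch itself, which also inserts and prefetches bpobjs as shown in the diff below.

    zap_cursor_t zc;
    zap_attribute_t za;
    int error;

    for (zap_cursor_init(&zc, os, obj);
        (error = zap_cursor_retrieve(&zc, &za)) == 0;
        zap_cursor_advance(&zc)) {
            /*
             * za.za_name is already the exact string key, so remove
             * by name; zap_remove_int() would only have converted the
             * integer back into this same string with snprintf().
             */
            VERIFY0(zap_remove(os, obj, za.za_name, tx));
    }
    zap_cursor_fini(&zc);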
Reviewed-by: Brian Behlendorf
Signed-off-by: Alexander Motin
Sponsored by: iXsystems, Inc.
Closes #15056
---
 module/zfs/dsl_deadlist.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/module/zfs/dsl_deadlist.c b/module/zfs/dsl_deadlist.c
index 181efd8ed69c..47c234f76c40 100644
--- a/module/zfs/dsl_deadlist.c
+++ b/module/zfs/dsl_deadlist.c
@@ -892,9 +892,9 @@ dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx)
 	for (zap_cursor_init(&zc, dl->dl_os, obj);
 	    (error = zap_cursor_retrieve(&zc, za)) == 0;
 	    zap_cursor_advance(&zc)) {
-		uint64_t mintxg = zfs_strtonum(za->za_name, NULL);
-		dsl_deadlist_insert_bpobj(dl, za->za_first_integer, mintxg, tx);
-		VERIFY0(zap_remove_int(dl->dl_os, obj, mintxg, tx));
+		dsl_deadlist_insert_bpobj(dl, za->za_first_integer,
+		    zfs_strtonum(za->za_name, NULL), tx);
+		VERIFY0(zap_remove(dl->dl_os, obj, za->za_name, tx));
 		if (perror == 0) {
 			dsl_deadlist_prefetch_bpobj(dl, pza->za_first_integer,
 			    zfs_strtonum(pza->za_name, NULL));
-- cgit v1.2.3

From 67c5e1ba4fbb3b1df6b9260498460079eb99edac Mon Sep 17 00:00:00 2001 From: Alan Somers Date: Fri, 14 Jul 2023 17:13:15 -0600 Subject: Fix the ZFS checksum error histograms with larger record sizes

My analysis in PR #14716 was incorrect. Each histogram bucket contains the number of incorrect bits, by position in a 64-bit word, over the entire record. 8-bit buckets can overflow for record sizes above 2k. To forestall that, saturate each bucket at 255. That should still get the point across: either all bits are equally wrong, or just a couple are.

Reviewed-by: Brian Behlendorf
Signed-off-by: Alan Somers
Sponsored-by: Axcient
Closes #15049
---
 module/zfs/zfs_fm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/module/zfs/zfs_fm.c b/module/zfs/zfs_fm.c
index c42ef048dd74..2754ceec83ca 100644
--- a/module/zfs/zfs_fm.c
+++ b/module/zfs/zfs_fm.c
@@ -790,7 +790,7 @@ update_histogram(uint64_t value_arg, uint8_t *hist, uint32_t *count)
 	/* We store the bits in big-endian (largest-first) order */
 	for (i = 0; i < 64; i++) {
 		if (value & (1ull << i)) {
-			hist[63 - i]++;
+			hist[63 - i] = MAX(hist[63 - i], hist[63 - i] + 1);
 			++bits;
 		}
 	}
-- cgit v1.2.3

From c4e8742149e31a77dc074f3baacfefed3ccb800e Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Fri, 14 Jul 2023 19:16:40 -0400 Subject: Fix raw receive with different indirect block size.

Unlike regular receive, raw receive requires the destination to have the same block structure as the source. In the case of dnode reclaim this triggers two special cases, requiring special handling:
- If dn_nlevels == 1, we can change the ibs, but dnode_set_blksz() should not dirty the data buffer if the block size does not change, or during receive dbuf_dirty_lightweight() will trigger an assertion.
- If dn_nlevels > 1, we just can't change the ibs: dnode_set_blksz() would fail and receive_object would trigger an assertion, so we should destroy and recreate the dnode from scratch.

Reviewed-by: Paul Dagnelie
Signed-off-by: Alexander Motin
Sponsored by: iXsystems, Inc.
Closes #15039
---
 module/zfs/dmu_recv.c | 22 ++++++++++++----------
 module/zfs/dnode.c    | 31 ++++++++++++++++---------------
 2 files changed, 28 insertions(+), 25 deletions(-)

diff --git a/module/zfs/dmu_recv.c b/module/zfs/dmu_recv.c
index 2fdd7c1ece73..05ca91717c2f 100644
--- a/module/zfs/dmu_recv.c
+++ b/module/zfs/dmu_recv.c
@@ -1795,17 +1795,19 @@ receive_handle_existing_object(const struct receive_writer_arg *rwa,
 	}

 	/*
-	 * The dmu does not currently support decreasing nlevels
-	 * or changing the number of dnode slots on an object.
For
-	 * non-raw sends, this does not matter and the new object
-	 * can just use the previous one's nlevels. For raw sends,
-	 * however, the structure of the received dnode (including
-	 * nlevels and dnode slots) must match that of the send
-	 * side. Therefore, instead of using dmu_object_reclaim(),
-	 * we must free the object completely and call
-	 * dmu_object_claim_dnsize() instead.
+	 * The dmu does not currently support decreasing nlevels or changing
+	 * indirect block size if there is already one, same as changing the
+	 * number of dnode slots on an object. For non-raw sends this
+	 * does not matter and the new object can just use the previous one's
+	 * parameters. For raw sends, however, the structure of the received
+	 * dnode (including indirects and dnode slots) must match that of the
+	 * send side. Therefore, instead of using dmu_object_reclaim(), we
+	 * must free the object completely and call dmu_object_claim_dnsize()
+	 * instead.
 	 */
-	if ((rwa->raw && drro->drr_nlevels < doi->doi_indirection) ||
+	if ((rwa->raw && ((doi->doi_indirection > 1 &&
+	    indblksz != doi->doi_metadata_block_size) ||
+	    drro->drr_nlevels < doi->doi_indirection)) ||
 	    dn_slots != doi->doi_dnodesize >> DNODE_SHIFT) {
 		err = dmu_free_long_object(rwa->os, drro->drr_object);
 		if (err != 0)
diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c
index d15268cd7bc7..7cf03264dce2 100644
--- a/module/zfs/dnode.c
+++ b/module/zfs/dnode.c
@@ -1882,7 +1882,7 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
 	if (ibs == dn->dn_indblkshift)
 		ibs = 0;

-	if (size >> SPA_MINBLOCKSHIFT == dn->dn_datablkszsec && ibs == 0)
+	if (size == dn->dn_datablksz && ibs == 0)
 		return (0);

 	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
@@ -1905,24 +1905,25 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
 	if (ibs && dn->dn_nlevels != 1)
 		goto fail;

-	/* resize the old block */
-	err = dbuf_hold_impl(dn, 0, 0, TRUE, FALSE, FTAG, &db);
-	if (err == 0) {
-		dbuf_new_size(db, size, tx);
-	} else if (err != ENOENT) {
-		goto fail;
-	}
-
-	dnode_setdblksz(dn, size);
 	dnode_setdirty(dn, tx);
-	dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = size;
+	if (size != dn->dn_datablksz) {
+		/* resize the old block */
+		err = dbuf_hold_impl(dn, 0, 0, TRUE, FALSE, FTAG, &db);
+		if (err == 0) {
+			dbuf_new_size(db, size, tx);
+		} else if (err != ENOENT) {
+			goto fail;
+		}
+
+		dnode_setdblksz(dn, size);
+		dn->dn_next_blksz[tx->tx_txg & TXG_MASK] = size;
+		if (db)
+			dbuf_rele(db, FTAG);
+	}
 	if (ibs) {
 		dn->dn_indblkshift = ibs;
-		dn->dn_next_indblkshift[tx->tx_txg&TXG_MASK] = ibs;
+		dn->dn_next_indblkshift[tx->tx_txg & TXG_MASK] = ibs;
 	}
-	/* release after we have fixed the blocksize in the dnode */
-	if (db)
-		dbuf_rele(db, FTAG);

 	rw_exit(&dn->dn_struct_rwlock);
 	return (0);
-- cgit v1.2.3

From 3a3e0d6fbcd240dae5142d586433edfeaf952fe3 Mon Sep 17 00:00:00 2001 From: Coleman Kane Date: Fri, 14 Jul 2023 19:32:49 -0400 Subject: intptr_t definition is canonically signed

Make the version here match that elsewhere in the kernel and system headers.
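To see why the signedness matters, consider sign-sensitive tests on pointer-sized values, a common kernel idiom. The snippet below is a hedged illustration only; the typedef names are invented for contrast and this is not SPL code:

    /* Illustrative only: IS_ERR()-style encoded errnos. */
    typedef long          intptr_signed_t;    /* canonical definition */
    typedef unsigned long intptr_unsigned_t;  /* the old SPL typedef */

    static int
    is_encoded_errno(const void *p)
    {
        /* Error pointers occupy the top 4095 values of the space. */
        return ((intptr_signed_t)p < 0 && (intptr_signed_t)p >= -4095);
        /*
         * With the unsigned typedef, "(intptr_unsigned_t)p < 0" is
         * always false, so checks like this silently misbehave.
         */
    }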
Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Coleman Kane Closes #15058 --- include/os/linux/spl/sys/types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/os/linux/spl/sys/types.h b/include/os/linux/spl/sys/types.h index a7666187ec23..d89a91c36f92 100644 --- a/include/os/linux/spl/sys/types.h +++ b/include/os/linux/spl/sys/types.h @@ -38,7 +38,7 @@ typedef unsigned long ulong_t; typedef unsigned long long u_longlong_t; typedef long long longlong_t; -typedef unsigned long intptr_t; +typedef long intptr_t; typedef unsigned long long rlim64_t; typedef struct task_struct kthread_t; -- cgit v1.2.3 From d3d63cac4d318da0a7dc23dc5e89366ad940febe Mon Sep 17 00:00:00 2001 From: Coleman Kane Date: Fri, 14 Jul 2023 19:33:51 -0400 Subject: Linux 6.5 compat: BLK_STS_NEXUS renamed to BLK_STS_RESV_CONFLICT This change was introduced in Linux commit 7ba150834b840f6f5cdd07ca69a4ccf39df59a66 Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Coleman Kane Closes #15059 --- config/kernel-blkdev.m4 | 25 +++++++++++++++++++++++++ include/os/linux/kernel/linux/blkdev_compat.h | 8 ++++++++ 2 files changed, 33 insertions(+) diff --git a/config/kernel-blkdev.m4 b/config/kernel-blkdev.m4 index 28e5364581ea..63d719f9c2da 100644 --- a/config/kernel-blkdev.m4 +++ b/config/kernel-blkdev.m4 @@ -443,6 +443,29 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_GET_ERESTARTSYS], [ ]) ]) +dnl # +dnl # 6.5.x API change +dnl # BLK_STS_NEXUS replaced with BLK_STS_RESV_CONFLICT +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_BLK_STS_RESV_CONFLICT], [ + ZFS_LINUX_TEST_SRC([blk_sts_resv_conflict], [ + #include + ],[ + blk_status_t s __attribute__ ((unused)) = BLK_STS_RESV_CONFLICT; + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_BLK_STS_RESV_CONFLICT], [ + AC_MSG_CHECKING([whether BLK_STS_RESV_CONFLICT is defined]) + ZFS_LINUX_TEST_RESULT([blk_sts_resv_conflict], [ + AC_DEFINE(HAVE_BLK_STS_RESV_CONFLICT, 1, [BLK_STS_RESV_CONFLICT is defined]) + AC_MSG_RESULT(yes) + ], [ + AC_MSG_RESULT(no) + ]) + ]) +]) + AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV], [ ZFS_AC_KERNEL_SRC_BLKDEV_GET_BY_PATH ZFS_AC_KERNEL_SRC_BLKDEV_PUT @@ -458,6 +481,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV], [ ZFS_AC_KERNEL_SRC_BLKDEV_ISSUE_SECURE_ERASE ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_KOBJ ZFS_AC_KERNEL_SRC_BLKDEV_PART_TO_DEV + ZFS_AC_KERNEL_SRC_BLKDEV_BLK_STS_RESV_CONFLICT ]) AC_DEFUN([ZFS_AC_KERNEL_BLKDEV], [ @@ -476,4 +500,5 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV], [ ZFS_AC_KERNEL_BLKDEV_ISSUE_SECURE_ERASE ZFS_AC_KERNEL_BLKDEV_BDEV_KOBJ ZFS_AC_KERNEL_BLKDEV_PART_TO_DEV + ZFS_AC_KERNEL_BLKDEV_BLK_STS_RESV_CONFLICT ]) diff --git a/include/os/linux/kernel/linux/blkdev_compat.h b/include/os/linux/kernel/linux/blkdev_compat.h index c5c6385be6ff..f1448587b98c 100644 --- a/include/os/linux/kernel/linux/blkdev_compat.h +++ b/include/os/linux/kernel/linux/blkdev_compat.h @@ -181,7 +181,11 @@ bi_status_to_errno(blk_status_t status) return (ENOLINK); case BLK_STS_TARGET: return (EREMOTEIO); +#ifdef HAVE_BLK_STS_RESV_CONFLICT + case BLK_STS_RESV_CONFLICT: +#else case BLK_STS_NEXUS: +#endif return (EBADE); case BLK_STS_MEDIUM: return (ENODATA); @@ -215,7 +219,11 @@ errno_to_bi_status(int error) case EREMOTEIO: return (BLK_STS_TARGET); case EBADE: +#ifdef HAVE_BLK_STS_RESV_CONFLICT + return (BLK_STS_RESV_CONFLICT); +#else return (BLK_STS_NEXUS); +#endif case ENODATA: return (BLK_STS_MEDIUM); case EILSEQ: -- cgit v1.2.3 From 8beabfd3bfbc09d7c05faa9c2e61361adb71d425 Mon Sep 17 00:00:00 2001 From: Yuri Pankov 
<113725409+yuripv@users.noreply.github.com> Date: Thu, 20 Jul 2023 18:06:55 +0200 Subject: set autotrim default to 'off' everywhere

As it turns out, having autotrim default to 'on' on FreeBSD never really worked, due to a mess with defines where userland and the kernel module were getting different default values (userland was defaulting to 'off', while the module thought it was 'on').

Reviewed-by: Tino Reichardt
Reviewed-by: Brian Behlendorf
Reviewed-by: Alexander Motin
Signed-off-by: Yuri Pankov
Closes #15079
---
 include/sys/spa.h           | 6 ------
 module/zcommon/zpool_prop.c | 2 +-
 2 files changed, 1 insertion(+), 7 deletions(-)

diff --git a/include/sys/spa.h b/include/sys/spa.h
index 1fa2044008dc..b90855687411 100644
--- a/include/sys/spa.h
+++ b/include/sys/spa.h
@@ -723,16 +723,10 @@ typedef enum spa_mode {
  * Send TRIM commands in-line during normal pool operation while deleting.
  *	OFF: no
  *	ON: yes
- * NB: IN_FREEBSD_BASE is defined within the FreeBSD sources.
  */
 typedef enum {
 	SPA_AUTOTRIM_OFF = 0,	/* default */
 	SPA_AUTOTRIM_ON,
-#ifdef IN_FREEBSD_BASE
-	SPA_AUTOTRIM_DEFAULT = SPA_AUTOTRIM_ON,
-#else
-	SPA_AUTOTRIM_DEFAULT = SPA_AUTOTRIM_OFF,
-#endif
 } spa_autotrim_t;

 /*
diff --git a/module/zcommon/zpool_prop.c b/module/zcommon/zpool_prop.c
index 459ff62fc996..c4aca04a96bd 100644
--- a/module/zcommon/zpool_prop.c
+++ b/module/zcommon/zpool_prop.c
@@ -160,7 +160,7 @@ zpool_prop_init(void)
 	    "wait | continue | panic", "FAILMODE", failuremode_table,
 	    sfeatures);
 	zprop_register_index(ZPOOL_PROP_AUTOTRIM, "autotrim",
-	    SPA_AUTOTRIM_DEFAULT, PROP_DEFAULT, ZFS_TYPE_POOL,
+	    SPA_AUTOTRIM_OFF, PROP_DEFAULT, ZFS_TYPE_POOL,
 	    "on | off", "AUTOTRIM", boolean_table, sfeatures);

 	/* hidden properties */
-- cgit v1.2.3

From 74f8ce4ca5c35e4c3210dcade00dcee1dfa8d1b9 Mon Sep 17 00:00:00 2001 From: Coleman Kane Date: Thu, 20 Jul 2023 12:09:25 -0400 Subject: Linux 6.5 compat: disk_check_media_change() was added

The disk_check_media_change() function was added, replacing bdev_check_media_change(). This change was introduced in 6.5rc1 commit 444aa2c58cb3b6cfe3b7cc7db6c294d73393a894; the new function takes a gendisk* as its argument rather than a block_device*, so bdev->bd_disk is now used to pass the expected data.
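In outline, the resulting compat shim can be sketched as below. This is a hedged illustration only: the helper name is invented, and the HAVE_* macros stand in for whatever the configure checks in the following diff actually define (HAVE_BDEV_CHECK_MEDIA_CHANGE in particular is an assumed name for the older-kernel check).

    #include <linux/blkdev.h>

    static inline bool
    zfs_media_changed_sketch(struct block_device *bdev)
    {
    #if defined(HAVE_DISK_CHECK_MEDIA_CHANGE)
        /* Linux 6.5+: operates on the gendisk behind the bdev. */
        return (disk_check_media_change(bdev->bd_disk));
    #elif defined(HAVE_BDEV_CHECK_MEDIA_CHANGE)
        /* Older kernels: operates on the block_device itself. */
        return (bdev_check_media_change(bdev));
    #else
        return (false);
    #endif
    }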
Reviewed-by: Brian Behlendorf
Signed-off-by: Coleman Kane
Closes #15060
---
 config/kernel-blkdev.m4                       | 29 +++++++++++++++++++++++++
 include/os/linux/kernel/linux/blkdev_compat.h |  2 ++
 2 files changed, 31 insertions(+)

diff --git a/config/kernel-blkdev.m4 b/config/kernel-blkdev.m4
index 63d719f9c2da..887acee670ba 100644
--- a/config/kernel-blkdev.m4
+++ b/config/kernel-blkdev.m4
@@ -103,6 +103,33 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_CHECK_DISK_CHANGE], [
 	])
 ])

+dnl #
+dnl # 6.5.x API change
+dnl # disk_check_media_change() was added
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_DISK_CHECK_MEDIA_CHANGE], [
+	ZFS_LINUX_TEST_SRC([disk_check_media_change], [
+		#include
+		#include
+	], [
+		struct block_device *bdev = NULL;
+		bool error;
+
+		error = disk_check_media_change(bdev->bd_disk);
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_DISK_CHECK_MEDIA_CHANGE], [
+	AC_MSG_CHECKING([whether disk_check_media_change() exists])
+	ZFS_LINUX_TEST_RESULT([disk_check_media_change], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_DISK_CHECK_MEDIA_CHANGE, 1,
+		    [disk_check_media_change() exists])
+	], [
+		AC_MSG_RESULT(no)
+	])
+])
+
 dnl #
 dnl # bdev_kobj() is introduced from 5.12
 dnl #
@@ -481,6 +508,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV], [
 	ZFS_AC_KERNEL_SRC_BLKDEV_ISSUE_SECURE_ERASE
 	ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_KOBJ
 	ZFS_AC_KERNEL_SRC_BLKDEV_PART_TO_DEV
+	ZFS_AC_KERNEL_SRC_BLKDEV_DISK_CHECK_MEDIA_CHANGE
 	ZFS_AC_KERNEL_SRC_BLKDEV_BLK_STS_RESV_CONFLICT
 ])

@@ -500,5 +528,6 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV], [
 	ZFS_AC_KERNEL_BLKDEV_ISSUE_SECURE_ERASE
 	ZFS_AC_KERNEL_BLKDEV_BDEV_KOBJ
 	ZFS_AC_KERNEL_BLKDEV_PART_TO_DEV
+	ZFS_AC_KERNEL_BLKDEV_DISK_CHECK_MEDIA_CHANGE
 	ZFS_AC_KERNEL_BLKDEV_BLK_STS_RESV_CONFLICT
 ])
diff --git a/include/os/linux/kernel/linux/blkdev_compat.h b/include/os/linux/kernel/linux/blkdev_compat.h
index f1448587b98c..e0f20ba32008 100644
--- a/include/os/linux/kernel/linux/blkdev_compat.h
+++ b/include/os/linux/kernel/linux/blkdev_compat.h
@@ -345,6 +345,8 @@ zfs_check_media_change(struct block_device *bdev)
 	return (0);
 }
 #define vdev_bdev_reread_part(bdev) zfs_check_media_change(bdev)
+#elif defined(HAVE_DISK_CHECK_MEDIA_CHANGE)
+#define vdev_bdev_reread_part(bdev) disk_check_media_change(bdev->bd_disk)
 #else
 /*
  * This is encountered if check_disk_change() and bdev_check_media_change()
-- cgit v1.2.3

From 7d0df5422c146bf3082b2bb1c632548ff82b002c Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Thu, 20 Jul 2023 12:10:04 -0400 Subject: Do not request data L1 buffers on scan prefetch.

Set ARC_FLAG_NO_BUF when prefetching data L1 buffers for scan. We do not prefetch data L0 buffers, so we do not need the L1 buffers themselves; we only want them to be ready in the ARC. This saves some CPU time on buffer decompression.

Reviewed-by: Brian Behlendorf
Signed-off-by: Alexander Motin
Sponsored by: iXsystems, Inc.
Closes #15029
---
 module/zfs/dsl_scan.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c
index ecdeba80b745..34012db82dee 100644
--- a/module/zfs/dsl_scan.c
+++ b/module/zfs/dsl_scan.c
@@ -2080,10 +2080,16 @@ dsl_scan_prefetch_thread(void *arg)
 			zio_flags |= ZIO_FLAG_RAW;
 		}

+		/* We don't need data L1 buffer since we do not prefetch L0.
*/
+		blkptr_t *bp = &spic->spic_bp;
+		if (BP_GET_LEVEL(bp) == 1 && BP_GET_TYPE(bp) != DMU_OT_DNODE &&
+		    BP_GET_TYPE(bp) != DMU_OT_OBJSET)
+			flags |= ARC_FLAG_NO_BUF;
+
 		/* issue the prefetch asynchronously */
-		(void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa,
-		    &spic->spic_bp, dsl_scan_prefetch_cb, spic->spic_spc,
-		    ZIO_PRIORITY_SCRUB, zio_flags, &flags, &spic->spic_zb);
+		(void) arc_read(scn->scn_zio_root, spa, bp,
+		    dsl_scan_prefetch_cb, spic->spic_spc, ZIO_PRIORITY_SCRUB,
+		    zio_flags, &flags, &spic->spic_zb);

 		kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t));
 	}
-- cgit v1.2.3

From e6ea31de9f153483b6840bdd5f4982a9a3ddc0aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wojciech=20Ma=C5=82ota-W=C3=B3jcik?= <59281144+outofforest@users.noreply.github.com> Date: Thu, 20 Jul 2023 18:55:22 +0200 Subject: Rollback before zfs root is mounted MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit

On my machines I observe random failures caused by rollback happening after the zfs root is mounted. I've observed two types of failures:
- zfs-rollback-bootfs.service fails, saying that rollback must be done just before mounting the dataset
- the boot process fails and the rescue console is entered.

After making this modification and testing it for a couple of days, none of those problems have been observed anymore.

I don't know if `dracut-mount.service` is still needed in the `After` directive. Maybe someone else is able to address this?

Reviewed-by: Gregory Bartholomew
Signed-off-by: Wojciech Małota-Wójcik <59281144+outofforest@users.noreply.github.com>
Closes #15025
---
 contrib/dracut/90zfs/zfs-rollback-bootfs.service.in | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/contrib/dracut/90zfs/zfs-rollback-bootfs.service.in b/contrib/dracut/90zfs/zfs-rollback-bootfs.service.in
index 68fdcb1f323e..12d8ac703e37 100644
--- a/contrib/dracut/90zfs/zfs-rollback-bootfs.service.in
+++ b/contrib/dracut/90zfs/zfs-rollback-bootfs.service.in
@@ -2,7 +2,7 @@
 Description=Rollback bootfs just before it is mounted
 Requisite=zfs-import.target
 After=zfs-import.target dracut-pre-mount.service zfs-snapshot-bootfs.service
-Before=dracut-mount.service
+Before=dracut-mount.service sysroot.mount
 DefaultDependencies=no
 ConditionKernelCommandLine=bootfs.rollback
 ConditionEnvironment=BOOTFS
-- cgit v1.2.3

From 4d2dad04aaa436256ef756701aff07af824690c4 Mon Sep 17 00:00:00 2001 From: Ameer Hamza Date: Thu, 20 Jul 2023 21:57:16 +0500 Subject: Ignore pool ashift property during vdev attachment

Ashift can be set for a vdev only during its creation, and the top-level vdev does not change when a vdev is attached or replaced. The ashift property should not be used during attachment, as it does not allow attaching/replacing a vdev if the pool's ashift property is increased after the existing vdev was created. Instead, we should be able to attach the vdev if the attached vdev can satisfy the ashift requirement with its parent.
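The rule can be condensed into a small self-contained sketch. Everything here is illustrative (the struct, field, and function names are invented, not the actual OpenZFS implementation), but it captures the check that replaces the property lookup:

    #include <stdbool.h>
    #include <stdint.h>

    struct vdev_sketch {
        uint64_t ashift;          /* immutable top-level allocation shift */
        uint64_t physical_ashift; /* device's native sector shift */
    };

    /* Can newvd be attached under the existing top-level vdev tvd? */
    static bool
    attach_ashift_ok(const struct vdev_sketch *tvd,
        const struct vdev_sketch *newvd)
    {
        /*
         * Ignore the pool's ashift property entirely: the top-level
         * vdev's ashift cannot change, so the new child only has to
         * be usable at that existing ashift, i.e. its physical
         * sector size must not exceed the parent's allocation size.
         */
        return (newvd->physical_ashift <= tvd->ashift);
    }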
Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Ameer Hamza Closes #15061 --- include/sys/vdev_impl.h | 1 + module/zfs/vdev.c | 14 +++++++--- .../cli_root/zpool_attach/attach-o_ashift.ksh | 30 ++++++-------------- .../cli_root/zpool_replace/replace-o_ashift.ksh | 32 ++++++++-------------- .../cli_root/zpool_replace/replace_prop_ashift.ksh | 24 ++++------------ 5 files changed, 36 insertions(+), 65 deletions(-) diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index 2b22b973ba49..5f4e82ad8657 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -420,6 +420,7 @@ struct vdev { boolean_t vdev_copy_uberblocks; /* post expand copy uberblocks */ boolean_t vdev_resilver_deferred; /* resilver deferred */ boolean_t vdev_kobj_flag; /* kobj event record */ + boolean_t vdev_attaching; /* vdev attach ashift handling */ vdev_queue_t vdev_queue; /* I/O deadline schedule queue */ spa_aux_vdev_t *vdev_aux; /* for l2cache and spares vdevs */ zio_t *vdev_probe_zio; /* root of current probe */ diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 30551feb6322..1199bf5d32ca 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -889,9 +889,15 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, &vd->vdev_not_present); /* - * Get the alignment requirement. + * Get the alignment requirement. Ignore pool ashift for vdev + * attach case. */ - (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift); + if (alloctype != VDEV_ALLOC_ATTACH) { + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, + &vd->vdev_ashift); + } else { + vd->vdev_attaching = B_TRUE; + } /* * Retrieve the vdev creation time. @@ -2144,9 +2150,9 @@ vdev_open(vdev_t *vd) return (SET_ERROR(EDOM)); } - if (vd->vdev_top == vd) { + if (vd->vdev_top == vd && vd->vdev_attaching == B_FALSE) vdev_ashift_optimize(vd); - } + vd->vdev_attaching = B_FALSE; } if (vd->vdev_ashift != 0 && (vd->vdev_ashift < ASHIFT_MIN || vd->vdev_ashift > ASHIFT_MAX)) { diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_attach/attach-o_ashift.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_attach/attach-o_ashift.ksh index 6ccec6abd66f..574cb7654d10 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_attach/attach-o_ashift.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_attach/attach-o_ashift.ksh @@ -35,7 +35,7 @@ # # STRATEGY: # 1. Create various pools with different ashift values. -# 2. Verify 'attach -o ashift=' works only with allowed values. +# 2. Verify 'attach' works. 
# verify_runnable "global" @@ -66,26 +66,14 @@ log_must set_tunable32 VDEV_FILE_PHYSICAL_ASHIFT 16 typeset ashifts=("9" "10" "11" "12" "13" "14" "15" "16") for ashift in ${ashifts[@]} do - for cmdval in ${ashifts[@]} - do - log_must zpool create -o ashift=$ashift $TESTPOOL1 $disk1 - log_must verify_ashift $disk1 $ashift - - # ashift_of(attached_disk) <= ashift_of(existing_vdev) - if [[ $cmdval -le $ashift ]] - then - log_must zpool attach -o ashift=$cmdval $TESTPOOL1 \ - $disk1 $disk2 - log_must verify_ashift $disk2 $ashift - else - log_mustnot zpool attach -o ashift=$cmdval $TESTPOOL1 \ - $disk1 $disk2 - fi - # clean things for the next run - log_must zpool destroy $TESTPOOL1 - log_must zpool labelclear $disk1 - log_must zpool labelclear $disk2 - done + log_must zpool create -o ashift=$ashift $TESTPOOL1 $disk1 + log_must verify_ashift $disk1 $ashift + log_must zpool attach $TESTPOOL1 $disk1 $disk2 + log_must verify_ashift $disk2 $ashift + # clean things for the next run + log_must zpool destroy $TESTPOOL1 + log_must zpool labelclear $disk1 + log_must zpool labelclear $disk2 done typeset badvals=("off" "on" "1" "8" "17" "1b" "ff" "-") diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_replace/replace-o_ashift.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_replace/replace-o_ashift.ksh index 37ed0062e61c..9595e51241b3 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_replace/replace-o_ashift.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_replace/replace-o_ashift.ksh @@ -35,7 +35,7 @@ # # STRATEGY: # 1. Create various pools with different ashift values. -# 2. Verify 'replace -o ashift=' works only with allowed values. +# 2. Verify 'replace' works. # verify_runnable "global" @@ -66,26 +66,16 @@ log_must set_tunable32 VDEV_FILE_PHYSICAL_ASHIFT 16 typeset ashifts=("9" "10" "11" "12" "13" "14" "15" "16") for ashift in ${ashifts[@]} do - for cmdval in ${ashifts[@]} - do - log_must zpool create -o ashift=$ashift $TESTPOOL1 $disk1 - log_must verify_ashift $disk1 $ashift - # ashift_of(replacing_disk) <= ashift_of(existing_vdev) - if [[ $cmdval -le $ashift ]] - then - log_must zpool replace -o ashift=$cmdval $TESTPOOL1 \ - $disk1 $disk2 - log_must verify_ashift $disk2 $ashift - wait_replacing $TESTPOOL1 - else - log_mustnot zpool replace -o ashift=$cmdval $TESTPOOL1 \ - $disk1 $disk2 - fi - # clean things for the next run - log_must zpool destroy $TESTPOOL1 - log_must zpool labelclear $disk1 - log_must zpool labelclear $disk2 - done + log_must zpool create -o ashift=$ashift $TESTPOOL1 $disk1 + log_must verify_ashift $disk1 $ashift + # ashift_of(replacing_disk) <= ashift_of(existing_vdev) + log_must zpool replace $TESTPOOL1 $disk1 $disk2 + log_must verify_ashift $disk2 $ashift + wait_replacing $TESTPOOL1 + # clean things for the next run + log_must zpool destroy $TESTPOOL1 + log_must zpool labelclear $disk1 + log_must zpool labelclear $disk2 done typeset badvals=("off" "on" "1" "8" "17" "1b" "ff" "-") diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_replace/replace_prop_ashift.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_replace/replace_prop_ashift.ksh index ffdaf91a2841..b4ac18e5ea25 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_replace/replace_prop_ashift.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_replace/replace_prop_ashift.ksh @@ -34,10 +34,8 @@ # # STRATEGY: # 1. Create a pool with default values. -# 2. Verify 'zpool replace' uses the ashift pool property value when -# replacing an existing device. -# 3. 
Verify the default ashift value can still be overridden by manually -# specifying '-o ashift=' from the command line. +# 2. Override the pool ashift property. +# 3. Verify 'zpool replace' works. # verify_runnable "global" @@ -72,21 +70,9 @@ do do log_must zpool create -o ashift=$ashift $TESTPOOL1 $disk1 log_must zpool set ashift=$pprop $TESTPOOL1 - # ashift_of(replacing_disk) <= ashift_of(existing_vdev) - if [[ $pprop -le $ashift ]] - then - log_must zpool replace $TESTPOOL1 $disk1 $disk2 - wait_replacing $TESTPOOL1 - log_must verify_ashift $disk2 $ashift - else - # cannot replace if pool prop ashift > vdev ashift - log_mustnot zpool replace $TESTPOOL1 $disk1 $disk2 - # verify we can override the pool prop value manually - log_must zpool replace -o ashift=$ashift $TESTPOOL1 \ - $disk1 $disk2 - wait_replacing $TESTPOOL1 - log_must verify_ashift $disk2 $ashift - fi + log_must zpool replace $TESTPOOL1 $disk1 $disk2 + wait_replacing $TESTPOOL1 + log_must verify_ashift $disk2 $ashift # clean things for the next run log_must zpool destroy $TESTPOOL1 log_must zpool labelclear $disk1 -- cgit v1.2.3 From 929173ab42fa9482455c8d51ed64326a3d983e41 Mon Sep 17 00:00:00 2001 From: Yuri Pankov <113725409+yuripv@users.noreply.github.com> Date: Thu, 20 Jul 2023 19:21:47 +0200 Subject: Don't panic if setting vdev properties is unsupported for this vdev type Check that vdev has valid zap and bail out early. While here, move objid selection out of the loop, it's not going to change. Reviewed-by: Allan Jude Reviewed-by: Brian Behlendorf Signed-off-by: Yuri Pankov Closes #15063 --- module/zfs/vdev.c | 38 +++++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 1199bf5d32ca..b6f8c0ab302e 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -5694,6 +5694,7 @@ vdev_props_set_sync(void *arg, dmu_tx_t *tx) objset_t *mos = spa->spa_meta_objset; nvpair_t *elem = NULL; uint64_t vdev_guid; + uint64_t objid; nvlist_t *nvprops; vdev_guid = fnvlist_lookup_uint64(nvp, ZPOOL_VDEV_PROPS_SET_VDEV); @@ -5704,31 +5705,28 @@ vdev_props_set_sync(void *arg, dmu_tx_t *tx) if (vd == NULL) return; + /* + * Set vdev property values in the vdev props mos object. + */ + if (vd->vdev_root_zap != 0) { + objid = vd->vdev_root_zap; + } else if (vd->vdev_top_zap != 0) { + objid = vd->vdev_top_zap; + } else if (vd->vdev_leaf_zap != 0) { + objid = vd->vdev_leaf_zap; + } else { + panic("unexpected vdev type"); + } + mutex_enter(&spa->spa_props_lock); while ((elem = nvlist_next_nvpair(nvprops, elem)) != NULL) { - uint64_t intval, objid = 0; + uint64_t intval; const char *strval; vdev_prop_t prop; const char *propname = nvpair_name(elem); zprop_type_t proptype; - /* - * Set vdev property values in the vdev props mos object. 
- */ - if (vd->vdev_root_zap != 0) { - objid = vd->vdev_root_zap; - } else if (vd->vdev_top_zap != 0) { - objid = vd->vdev_top_zap; - } else if (vd->vdev_leaf_zap != 0) { - objid = vd->vdev_leaf_zap; - } else { - /* - * XXX: implement vdev_props_set_check() - */ - panic("vdev not root/top/leaf"); - } - switch (prop = vdev_name_to_prop(propname)) { case VDEV_PROP_USERPROP: if (vdev_prop_user(propname)) { @@ -5797,6 +5795,12 @@ vdev_prop_set(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) ASSERT(vd != NULL); + /* Check that vdev has a zap we can use */ + if (vd->vdev_root_zap == 0 && + vd->vdev_top_zap == 0 && + vd->vdev_leaf_zap == 0) + return (SET_ERROR(EINVAL)); + if (nvlist_lookup_uint64(innvl, ZPOOL_VDEV_PROPS_SET_VDEV, &vdev_guid) != 0) return (SET_ERROR(EINVAL)); -- cgit v1.2.3 From d9bb583c25d833e57c0842a81dca1bd50da5d9b1 Mon Sep 17 00:00:00 2001 From: Ameer Hamza Date: Thu, 20 Jul 2023 22:23:52 +0500 Subject: spa_min_alloc should be GCD, not min Since spa_min_alloc may not be a power of 2, unlike ashifts, in the case of DRAID, we should not select the minimal value among several vdevs. Rounding to a multiple of it is unlikely to work for other vdevs. Instead, using the greatest common divisor produces smaller yet more reasonable results. Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Ameer Hamza Closes #15067 --- include/sys/spa_impl.h | 1 + module/zfs/spa_misc.c | 1 + module/zfs/vdev.c | 36 ++++++++++++++++++++++++++++++++---- module/zfs/zio.c | 22 +++++++++++++++++----- 4 files changed, 51 insertions(+), 9 deletions(-) diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index 44afa763283a..588c72f6e4fa 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -250,6 +250,7 @@ struct spa { uint64_t spa_min_ashift; /* of vdevs in normal class */ uint64_t spa_max_ashift; /* of vdevs in normal class */ uint64_t spa_min_alloc; /* of vdevs in normal class */ + uint64_t spa_gcd_alloc; /* of vdevs in normal class */ uint64_t spa_config_guid; /* config pool guid */ uint64_t spa_load_guid; /* spa_load initialized guid */ uint64_t spa_last_synced_guid; /* last synced guid */ diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 06f640769043..3b355e0debcc 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -772,6 +772,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) spa->spa_min_ashift = INT_MAX; spa->spa_max_ashift = 0; spa->spa_min_alloc = INT_MAX; + spa->spa_gcd_alloc = INT_MAX; /* Reset cached value */ spa->spa_dedup_dspace = ~0ULL; diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index b6f8c0ab302e..f3812b843e95 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -1399,6 +1399,36 @@ vdev_remove_parent(vdev_t *cvd) vdev_free(mvd); } +/* + * Choose GCD for spa_gcd_alloc. + */ +static uint64_t +vdev_gcd(uint64_t a, uint64_t b) +{ + while (b != 0) { + uint64_t t = b; + b = a % b; + a = t; + } + return (a); +} + +/* + * Set spa_min_alloc and spa_gcd_alloc. 
+ */ +static void +vdev_spa_set_alloc(spa_t *spa, uint64_t min_alloc) +{ + if (min_alloc < spa->spa_min_alloc) + spa->spa_min_alloc = min_alloc; + if (spa->spa_gcd_alloc == INT_MAX) { + spa->spa_gcd_alloc = min_alloc; + } else { + spa->spa_gcd_alloc = vdev_gcd(min_alloc, + spa->spa_gcd_alloc); + } +} + void vdev_metaslab_group_create(vdev_t *vd) { @@ -1451,8 +1481,7 @@ vdev_metaslab_group_create(vdev_t *vd) spa->spa_min_ashift = vd->vdev_ashift; uint64_t min_alloc = vdev_get_min_alloc(vd); - if (min_alloc < spa->spa_min_alloc) - spa->spa_min_alloc = min_alloc; + vdev_spa_set_alloc(spa, min_alloc); } } } @@ -2213,8 +2242,7 @@ vdev_open(vdev_t *vd) if (vd->vdev_top == vd && vd->vdev_ashift != 0 && vd->vdev_islog == 0 && vd->vdev_aux == NULL) { uint64_t min_alloc = vdev_get_min_alloc(vd); - if (min_alloc < spa->spa_min_alloc) - spa->spa_min_alloc = min_alloc; + vdev_spa_set_alloc(spa, min_alloc); } /* diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 10279fde89df..3f5e6a08d89c 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -1596,6 +1596,19 @@ zio_shrink(zio_t *zio, uint64_t size) } } +/* + * Round provided allocation size up to a value that can be allocated + * by at least some vdev(s) in the pool with minimum or no additional + * padding and without extra space usage on others + */ +static uint64_t +zio_roundup_alloc_size(spa_t *spa, uint64_t size) +{ + if (size > spa->spa_min_alloc) + return (roundup(size, spa->spa_gcd_alloc)); + return (spa->spa_min_alloc); +} + /* * ========================================================================== * Prepare to read and write logical blocks @@ -1802,9 +1815,8 @@ zio_write_compress(zio_t *zio) * in that we charge for the padding used to fill out * the last sector. */ - ASSERT3U(spa->spa_min_alloc, >=, SPA_MINBLOCKSHIFT); - size_t rounded = (size_t)roundup(psize, - spa->spa_min_alloc); + size_t rounded = (size_t)zio_roundup_alloc_size(spa, + psize); if (rounded >= lsize) { compress = ZIO_COMPRESS_OFF; zio_buf_free(cbuf, lsize); @@ -1847,8 +1859,8 @@ zio_write_compress(zio_t *zio) * take this codepath because it will change the on-disk block * and decryption will fail. */ - size_t rounded = MIN((size_t)roundup(psize, - spa->spa_min_alloc), lsize); + size_t rounded = MIN((size_t)zio_roundup_alloc_size(spa, psize), + lsize); if (rounded != psize) { abd_t *cdata = abd_alloc_linear(rounded, B_TRUE); -- cgit v1.2.3 From 2d8a2b51dcc0066f73819e903609daa02a439f51 Mon Sep 17 00:00:00 2001 From: Chunwei Chen Date: Thu, 20 Jul 2023 10:30:21 -0700 Subject: Fix zpl_test_super race with zfs_umount We cannot call zpl_enter in zpl_test_super, because zpl_test_super is under spinlock so we can't sleep, and also because zpl_test_super is called without sb->s_umount taken, so it's possible we would race with zfs_umount and call zpl_enter on freed zfsvfs. 
Here's a stack trace from when this race was hit:

[ 2379.114837] VERIFY(cvp->cv_magic == CV_MAGIC) failed
[ 2379.114845] PANIC at spl-condvar.c:497:__cv_broadcast()
[ 2379.114854] Kernel panic - not syncing: VERIFY(cvp->cv_magic == CV_MAGIC) failed
[ 2379.115012] Call Trace:
[ 2379.115019]  dump_stack+0x74/0x96
[ 2379.115024]  panic+0x114/0x2f6
[ 2379.115035]  spl_panic+0xcf/0xfc [spl]
[ 2379.115477]  __cv_broadcast+0x68/0xa0 [spl]
[ 2379.115585]  rrw_exit+0xb8/0x310 [zfs]
[ 2379.115696]  rrm_exit+0x4a/0x80 [zfs]
[ 2379.115808]  zpl_test_super+0xa9/0xd0 [zfs]
[ 2379.115920]  sget+0xd1/0x230
[ 2379.116033]  zpl_mount+0xdc/0x230 [zfs]
[ 2379.116037]  legacy_get_tree+0x28/0x50
[ 2379.116039]  vfs_get_tree+0x27/0xc0
[ 2379.116045]  path_mount+0x2fe/0xa70
[ 2379.116048]  do_mount+0x80/0xa0
[ 2379.116050]  __x64_sys_mount+0x8b/0xe0
[ 2379.116052]  do_syscall_64+0x35/0x50
[ 2379.116054]  entry_SYSCALL_64_after_hwframe+0x61/0xc6
[ 2379.116057] RIP: 0033:0x7f9912e8b26a

Reviewed-by: Brian Behlendorf
Signed-off-by: Chunwei Chen
Closes #15077
---
 module/os/linux/zfs/zfs_vfsops.c |  1 +
 module/os/linux/zfs/zpl_super.c  | 39 ++++++++++++++++++++++++---------------
 2 files changed, 25 insertions(+), 15 deletions(-)

diff --git a/module/os/linux/zfs/zfs_vfsops.c b/module/os/linux/zfs/zfs_vfsops.c
index 6b6293b9e482..87c4e6dcaf7d 100644
--- a/module/os/linux/zfs/zfs_vfsops.c
+++ b/module/os/linux/zfs/zfs_vfsops.c
@@ -1662,6 +1662,7 @@ zfs_umount(struct super_block *sb)
 	}

 	zfsvfs_free(zfsvfs);
+	sb->s_fs_info = NULL;
 	return (0);
 }

diff --git a/module/os/linux/zfs/zpl_super.c b/module/os/linux/zfs/zpl_super.c
index c5c230bee144..ad52a11aada0 100644
--- a/module/os/linux/zfs/zpl_super.c
+++ b/module/os/linux/zfs/zpl_super.c
@@ -277,8 +277,6 @@ zpl_test_super(struct super_block *s, void *data)
 {
 	zfsvfs_t *zfsvfs = s->s_fs_info;
 	objset_t *os = data;
-	int match;
-
 	/*
 	 * If the os doesn't match the z_os in the super_block, assume it is
 	 * not a match. Matching would imply a multimount of a dataset. It is
@@ -286,19 +284,7 @@ zpl_test_super(struct super_block *s, void *data)
 	 * that changes the z_os, e.g., rollback, where the match will be
 	 * missed, but in that case the user will get an EBUSY.
 	 */
-	if (zfsvfs == NULL || os != zfsvfs->z_os)
-		return (0);
-
-	/*
-	 * If they do match, recheck with the lock held to prevent mounting the
-	 * wrong dataset since z_os can be stale when the teardown lock is held.
-	 */
-	if (zpl_enter(zfsvfs, FTAG) != 0)
-		return (0);
-	match = (os == zfsvfs->z_os);
-	zpl_exit(zfsvfs, FTAG);
-
-	return (match);
+	return (zfsvfs != NULL && os == zfsvfs->z_os);
 }

 static struct super_block *
@@ -324,12 +310,35 @@ zpl_mount_impl(struct file_system_type *fs_type, int flags, zfs_mnt_t *zm)

 	s = sget(fs_type, zpl_test_super, set_anon_super, flags, os);

+	/*
+	 * Recheck with the lock held to prevent mounting the wrong dataset
+	 * since z_os can be stale when the teardown lock is held.
+	 *
+	 * We can't do this in zpl_test_super, since it's under spinlock and
+	 * also the s_umount lock is not held there, so it would race with
+	 * zfs_umount and zfsvfs could be freed.
+ */ + if (!IS_ERR(s) && s->s_fs_info != NULL) { + zfsvfs_t *zfsvfs = s->s_fs_info; + if (zpl_enter(zfsvfs, FTAG) == 0) { + if (os != zfsvfs->z_os) + err = -SET_ERROR(EBUSY); + zpl_exit(zfsvfs, FTAG); + } else { + err = -SET_ERROR(EBUSY); + } + } dsl_dataset_long_rele(dmu_objset_ds(os), FTAG); dsl_dataset_rele(dmu_objset_ds(os), FTAG); if (IS_ERR(s)) return (ERR_CAST(s)); + if (err) { + deactivate_locked_super(s); + return (ERR_PTR(err)); + } + if (s->s_root == NULL) { err = zpl_fill_super(s, zm, flags & SB_SILENT ? 1 : 0); if (err) { -- cgit v1.2.3 From ab0b0393cbd7f7672df31d633b052216bbcd1b30 Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Fri, 21 Jul 2023 11:46:58 -0700 Subject: zed: Fix zed ASSERT on slot power cycle We would see zed assert on one of our systems if we powered off a slot. Further examination showed zfs_retire_recv() was reporting a GUID of 0, which in turn would return a NULL nvlist. Add in a check for a zero GUID. Reviewed-by: Brian Behlendorf Signed-off-by: Tony Hutter Closes #15084 --- cmd/zed/agents/zfs_retire.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cmd/zed/agents/zfs_retire.c b/cmd/zed/agents/zfs_retire.c index f83ae09259ab..a0e377a4a0c8 100644 --- a/cmd/zed/agents/zfs_retire.c +++ b/cmd/zed/agents/zfs_retire.c @@ -416,6 +416,11 @@ zfs_retire_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, &vdev_guid) != 0) return; + if (vdev_guid == 0) { + fmd_hdl_debug(hdl, "Got a zero GUID"); + return; + } + if (spare) { int nspares = find_and_remove_spares(zhdl, vdev_guid); fmd_hdl_debug(hdl, "%d spares removed", nspares); -- cgit v1.2.3 From cf2a225b2481534e313029da1fec248b6d2ad297 Mon Sep 17 00:00:00 2001 From: Alan Somers Date: Tue, 11 Jul 2023 14:45:06 -0600 Subject: Don't emit checksum histograms in ereport.fs.zfs.checksum events The checksum histograms were intended to be used with ATA and parallel SCSI, which are obsolete. With modern storage hardware, they will almost always look like white noise; all bits will be wrong. They only serve to bloat the event. That's a particular problem on FreeBSD, where events must fit into a 1016 byte buffer. This fixes issue #14717 for RAIDZ pools, but not for mirror pools. Reviewed-by: Brian Behlendorf Reviewed-by: Rich Ercolani Signed-off-by: Alan Somers Sponsored-by: Axcient Closes #15052 --- include/sys/fm/fs/zfs.h | 2 -- man/man8/zpool-events.8 | 19 +------------------ module/zfs/zfs_fm.c | 25 ++++--------------------- 3 files changed, 5 insertions(+), 41 deletions(-) diff --git a/include/sys/fm/fs/zfs.h b/include/sys/fm/fs/zfs.h index b9bac7e252e5..3cf2b1274dd2 100644 --- a/include/sys/fm/fs/zfs.h +++ b/include/sys/fm/fs/zfs.h @@ -112,8 +112,6 @@ extern "C" { #define FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_CLEARS "bad_range_clears" #define FM_EREPORT_PAYLOAD_ZFS_BAD_SET_BITS "bad_set_bits" #define FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_BITS "bad_cleared_bits" -#define FM_EREPORT_PAYLOAD_ZFS_BAD_SET_HISTOGRAM "bad_set_histogram" -#define FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_HISTOGRAM "bad_cleared_histogram" #define FM_EREPORT_PAYLOAD_ZFS_SNAPSHOT_NAME "snapshot_name" #define FM_EREPORT_PAYLOAD_ZFS_DEVICE_NAME "device_name" #define FM_EREPORT_PAYLOAD_ZFS_RAW_DEVICE_NAME "raw_name" diff --git a/man/man8/zpool-events.8 b/man/man8/zpool-events.8 index 341f902fe66e..c3bd3208e63b 100644 --- a/man/man8/zpool-events.8 +++ b/man/man8/zpool-events.8 @@ -26,7 +26,7 @@ .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. 
.\" -.Dd May 27, 2021 +.Dd July 11, 2023 .Dt ZPOOL-EVENTS 8 .Os . @@ -362,23 +362,6 @@ Like but contains .Pq Ar good data No & ~( Ns Ar bad data ) ; that is, the bits set in the good data which are cleared in the bad data. -.It Sy bad_set_histogram -If this field exists, it is an array of counters. -Each entry counts bits set in a particular bit of a big-endian uint64 type. -The first entry counts bits -set in the high-order bit of the first byte, the 9th byte, etc, and the last -entry counts bits set of the low-order bit of the 8th byte, the 16th byte, etc. -This information is useful for observing a stuck bit in a parallel data path, -such as IDE or parallel SCSI. -.It Sy bad_cleared_histogram -If this field exists, it is an array of counters. -Each entry counts bit clears in a particular bit of a big-endian uint64 type. -The first entry counts bits -clears of the high-order bit of the first byte, the 9th byte, etc, and the -last entry counts clears of the low-order bit of the 8th byte, the 16th byte, -etc. -This information is useful for observing a stuck bit in a parallel data -path, such as IDE or parallel SCSI. .El . .Sh I/O STAGES diff --git a/module/zfs/zfs_fm.c b/module/zfs/zfs_fm.c index 2754ceec83ca..9365ca500d7d 100644 --- a/module/zfs/zfs_fm.c +++ b/module/zfs/zfs_fm.c @@ -754,10 +754,6 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out, #define MAX_RANGES 16 typedef struct zfs_ecksum_info { - /* histograms of set and cleared bits by bit number in a 64-bit word */ - uint8_t zei_histogram_set[sizeof (uint64_t) * NBBY]; - uint8_t zei_histogram_cleared[sizeof (uint64_t) * NBBY]; - /* inline arrays of bits set and cleared. */ uint64_t zei_bits_set[ZFM_MAX_INLINE]; uint64_t zei_bits_cleared[ZFM_MAX_INLINE]; @@ -781,7 +777,7 @@ typedef struct zfs_ecksum_info { } zfs_ecksum_info_t; static void -update_histogram(uint64_t value_arg, uint8_t *hist, uint32_t *count) +update_bad_bits(uint64_t value_arg, uint32_t *count) { size_t i; size_t bits = 0; @@ -789,10 +785,8 @@ update_histogram(uint64_t value_arg, uint8_t *hist, uint32_t *count) /* We store the bits in big-endian (largest-first) order */ for (i = 0; i < 64; i++) { - if (value & (1ull << i)) { - hist[63 - i] = MAX(hist[63 - i], hist[63 - i] + 1); + if (value & (1ull << i)) ++bits; - } } /* update the count of bits changed */ *count += bits; @@ -1010,10 +1004,8 @@ annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info, offset++; } - update_histogram(set, eip->zei_histogram_set, - &eip->zei_range_sets[range]); - update_histogram(cleared, eip->zei_histogram_cleared, - &eip->zei_range_clears[range]); + update_bad_bits(set, &eip->zei_range_sets[range]); + update_bad_bits(cleared, &eip->zei_range_clears[range]); } /* convert to byte offsets */ @@ -1049,15 +1041,6 @@ annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info, DATA_TYPE_UINT8_ARRAY, inline_size, (uint8_t *)eip->zei_bits_cleared, NULL); - } else { - fm_payload_set(ereport, - FM_EREPORT_PAYLOAD_ZFS_BAD_SET_HISTOGRAM, - DATA_TYPE_UINT8_ARRAY, - NBBY * sizeof (uint64_t), eip->zei_histogram_set, - FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_HISTOGRAM, - DATA_TYPE_UINT8_ARRAY, - NBBY * sizeof (uint64_t), eip->zei_histogram_cleared, - NULL); } return (eip); } -- cgit v1.2.3 From 6fd87e1d8df1cab8d6087026cbc361a0886a2a97 Mon Sep 17 00:00:00 2001 From: Alan Somers Date: Tue, 11 Jul 2023 15:13:57 -0600 Subject: Don't emit cksum_{actual_expected} in ereport.fs.zfs.checksum events With anything but fletcher-4, even a tiny change in the input will cause the checksum value to 
change completely. So knowing the actual and expected checksums doesn't provide much more information than "they don't match". The harm in sending them is simply that they bloat the event. In particular, on FreeBSD the event must fit into a 1016 byte buffer. Fixes #14717 for mirrored pools. Reviewed-by: Brian Behlendorf Reviewed-by: Rich Ercolani Signed-off-by: Alan Somers Sponsored-by: Axcient Closes #14717 Closes #15052 --- include/sys/fm/fs/zfs.h | 2 -- include/sys/zio_checksum.h | 2 -- man/man8/zpool-events.8 | 4 ---- module/zfs/vdev_indirect.c | 2 +- module/zfs/vdev_raidz.c | 2 +- module/zfs/zfs_fm.c | 8 -------- module/zfs/zio_checksum.c | 2 -- 7 files changed, 2 insertions(+), 20 deletions(-) diff --git a/include/sys/fm/fs/zfs.h b/include/sys/fm/fs/zfs.h index 3cf2b1274dd2..fb9e8649221e 100644 --- a/include/sys/fm/fs/zfs.h +++ b/include/sys/fm/fs/zfs.h @@ -102,8 +102,6 @@ extern "C" { #define FM_EREPORT_PAYLOAD_ZFS_ZIO_TIMESTAMP "zio_timestamp" #define FM_EREPORT_PAYLOAD_ZFS_ZIO_DELTA "zio_delta" #define FM_EREPORT_PAYLOAD_ZFS_PREV_STATE "prev_state" -#define FM_EREPORT_PAYLOAD_ZFS_CKSUM_EXPECTED "cksum_expected" -#define FM_EREPORT_PAYLOAD_ZFS_CKSUM_ACTUAL "cksum_actual" #define FM_EREPORT_PAYLOAD_ZFS_CKSUM_ALGO "cksum_algorithm" #define FM_EREPORT_PAYLOAD_ZFS_CKSUM_BYTESWAP "cksum_byteswap" #define FM_EREPORT_PAYLOAD_ZFS_BAD_OFFSET_RANGES "bad_ranges" diff --git a/include/sys/zio_checksum.h b/include/sys/zio_checksum.h index 9fb79ab4a54b..37fd65b7cb3e 100644 --- a/include/sys/zio_checksum.h +++ b/include/sys/zio_checksum.h @@ -94,8 +94,6 @@ typedef const struct zio_checksum_info { } zio_checksum_info_t; typedef struct zio_bad_cksum { - zio_cksum_t zbc_expected; - zio_cksum_t zbc_actual; const char *zbc_checksum_name; uint8_t zbc_byteswapped; uint8_t zbc_injected; diff --git a/man/man8/zpool-events.8 b/man/man8/zpool-events.8 index c3bd3208e63b..e1436f6ded57 100644 --- a/man/man8/zpool-events.8 +++ b/man/man8/zpool-events.8 @@ -305,10 +305,6 @@ The time when a given I/O request was submitted. The time required to service a given I/O request. .It Sy prev_state The previous state of the vdev. -.It Sy cksum_expected -The expected checksum value for the block. -.It Sy cksum_actual -The actual checksum value for an errant block. .It Sy cksum_algorithm Checksum algorithm used. 
See
diff --git a/module/zfs/vdev_indirect.c b/module/zfs/vdev_indirect.c
index 89667585345d..acb725696674 100644
--- a/module/zfs/vdev_indirect.c
+++ b/module/zfs/vdev_indirect.c
@@ -1398,7 +1398,7 @@ vdev_indirect_checksum_error(zio_t *zio,
 	vd->vdev_stat.vs_checksum_errors++;
 	mutex_exit(&vd->vdev_stat_lock);

-	zio_bad_cksum_t zbc = {{{ 0 }}};
+	zio_bad_cksum_t zbc = { 0 };
 	abd_t *bad_abd = ic->ic_data;
 	abd_t *good_abd = is->is_good_child->ic_data;
 	(void) zfs_ereport_post_checksum(zio->io_spa, vd, NULL, zio,
diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c
index 14b98a76b84f..3445fa9d35d5 100644
--- a/module/zfs/vdev_raidz.c
+++ b/module/zfs/vdev_raidz.c
@@ -1785,7 +1785,7 @@ vdev_raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data)
 static int
 raidz_checksum_verify(zio_t *zio)
 {
-	zio_bad_cksum_t zbc = {{{0}}};
+	zio_bad_cksum_t zbc = {0};

 	raidz_map_t *rm = zio->io_vsd;

 	int ret = zio_checksum_error(zio, &zbc);
diff --git a/module/zfs/zfs_fm.c b/module/zfs/zfs_fm.c
index 9365ca500d7d..c4eb74e873db 100644
--- a/module/zfs/zfs_fm.c
+++ b/module/zfs/zfs_fm.c
@@ -914,14 +914,6 @@ annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info,

 	if (info != NULL && info->zbc_has_cksum) {
 		fm_payload_set(ereport,
-		    FM_EREPORT_PAYLOAD_ZFS_CKSUM_EXPECTED,
-		    DATA_TYPE_UINT64_ARRAY,
-		    sizeof (info->zbc_expected) / sizeof (uint64_t),
-		    (uint64_t *)&info->zbc_expected,
-		    FM_EREPORT_PAYLOAD_ZFS_CKSUM_ACTUAL,
-		    DATA_TYPE_UINT64_ARRAY,
-		    sizeof (info->zbc_actual) / sizeof (uint64_t),
-		    (uint64_t *)&info->zbc_actual,
 		    FM_EREPORT_PAYLOAD_ZFS_CKSUM_ALGO,
 		    DATA_TYPE_STRING,
 		    info->zbc_checksum_name,
diff --git a/module/zfs/zio_checksum.c b/module/zfs/zio_checksum.c
index 6090959c5b8c..9de515e8767a 100644
--- a/module/zfs/zio_checksum.c
+++ b/module/zfs/zio_checksum.c
@@ -515,8 +515,6 @@ zio_checksum_error_impl(spa_t *spa, const blkptr_t *bp,
 	}

 	if (info != NULL) {
-		info->zbc_expected = expected_cksum;
-		info->zbc_actual = actual_cksum;
 		info->zbc_checksum_name = ci->ci_name;
 		info->zbc_byteswapped = byteswap;
 		info->zbc_injected = 0;
-- cgit v1.2.3

From 28430b51e3e2387e6f36d5b4ee5b30ef33095993 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Fri, 21 Jul 2023 14:50:48 -0400 Subject: Add explicit prefetches to bpobj_iterate().

To simplify error handling, bpobj_iterate_blkptrs() iterates through the list of block pointers backwards. Unfortunately, the speculative prefetcher is currently unable to detect such patterns, which makes each block read there synchronous and very slow on HDD pools. According to my tests, the added explicit prefetch reduces the time needed to asynchronously delete 8 snapshots of 4 million blocks each from 20 seconds to less than one, which should free the sync thread for other useful work, such as async writes, scrub, etc.

While there, plug one memory leak in case of bpobj_open() error and harmonize some variable names.

Reviewed-by: Allan Jude
Reviewed-by: Brian Behlendorf
Signed-off-by: Alexander Motin
Sponsored by: iXsystems, Inc.
Closes #15071 --- include/sys/bpobj.h | 2 +- module/zfs/bpobj.c | 49 +++++++++++++++++++++++++++++++++++++------------ 2 files changed, 38 insertions(+), 13 deletions(-) diff --git a/include/sys/bpobj.h b/include/sys/bpobj.h index f3384f526454..81bc0fe21086 100644 --- a/include/sys/bpobj.h +++ b/include/sys/bpobj.h @@ -60,7 +60,7 @@ typedef struct bpobj { kmutex_t bpo_lock; objset_t *bpo_os; uint64_t bpo_object; - int bpo_epb; + uint32_t bpo_epb; uint8_t bpo_havecomp; uint8_t bpo_havesubobj; uint8_t bpo_havefreed; diff --git a/module/zfs/bpobj.c b/module/zfs/bpobj.c index 211bab56519c..e772caead29b 100644 --- a/module/zfs/bpobj.c +++ b/module/zfs/bpobj.c @@ -284,7 +284,17 @@ bpobj_iterate_blkptrs(bpobj_info_t *bpi, bpobj_itor_t func, void *arg, dmu_buf_t *dbuf = NULL; bpobj_t *bpo = bpi->bpi_bpo; - for (int64_t i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= start; i--) { + int64_t i = bpo->bpo_phys->bpo_num_blkptrs - 1; + uint64_t pe = P2ALIGN_TYPED(i, bpo->bpo_epb, uint64_t) * + sizeof (blkptr_t); + uint64_t ps = start * sizeof (blkptr_t); + uint64_t pb = MAX((pe > dmu_prefetch_max) ? pe - dmu_prefetch_max : 0, + ps); + if (pe > pb) { + dmu_prefetch(bpo->bpo_os, bpo->bpo_object, 0, pb, pe - pb, + ZIO_PRIORITY_ASYNC_READ); + } + for (; i >= start; i--) { uint64_t offset = i * sizeof (blkptr_t); uint64_t blkoff = P2PHASE(i, bpo->bpo_epb); @@ -292,9 +302,16 @@ bpobj_iterate_blkptrs(bpobj_info_t *bpi, bpobj_itor_t func, void *arg, if (dbuf) dmu_buf_rele(dbuf, FTAG); err = dmu_buf_hold(bpo->bpo_os, bpo->bpo_object, - offset, FTAG, &dbuf, 0); + offset, FTAG, &dbuf, DMU_READ_NO_PREFETCH); if (err) break; + pe = pb; + pb = MAX((dbuf->db_offset > dmu_prefetch_max) ? + dbuf->db_offset - dmu_prefetch_max : 0, ps); + if (pe > pb) { + dmu_prefetch(bpo->bpo_os, bpo->bpo_object, 0, + pb, pe - pb, ZIO_PRIORITY_ASYNC_READ); + } } ASSERT3U(offset, >=, dbuf->db_offset); @@ -466,22 +483,30 @@ bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg, int64_t i = bpi->bpi_unprocessed_subobjs - 1; uint64_t offset = i * sizeof (uint64_t); - uint64_t obj_from_sublist; + uint64_t subobj; err = dmu_read(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, - offset, sizeof (uint64_t), &obj_from_sublist, - DMU_READ_PREFETCH); + offset, sizeof (uint64_t), &subobj, + DMU_READ_NO_PREFETCH); if (err) break; - bpobj_t *sublist = kmem_alloc(sizeof (bpobj_t), - KM_SLEEP); - err = bpobj_open(sublist, bpo->bpo_os, - obj_from_sublist); - if (err) + bpobj_t *subbpo = kmem_alloc(sizeof (bpobj_t), + KM_SLEEP); + err = bpobj_open(subbpo, bpo->bpo_os, subobj); + if (err) { + kmem_free(subbpo, sizeof (bpobj_t)); break; + } + + if (subbpo->bpo_havesubobj && + subbpo->bpo_phys->bpo_subobjs != 0) { + dmu_prefetch(subbpo->bpo_os, + subbpo->bpo_phys->bpo_subobjs, 0, 0, 0, + ZIO_PRIORITY_ASYNC_READ); + } - list_insert_head(&stack, bpi_alloc(sublist, bpi, i)); - mutex_enter(&sublist->bpo_lock); + list_insert_head(&stack, bpi_alloc(subbpo, bpi, i)); + mutex_enter(&subbpo->bpo_lock); bpi->bpi_unprocessed_subobjs--; } } -- cgit v1.2.3 From 34b3d498a965451f2c21e9652a99952ed6e93c59 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Fri, 21 Jul 2023 14:51:47 -0400 Subject: Adjust prefetch parameters. - Reduce maximum prefetch distance for 32bit platforms to 8MB as it was previously. Those systems didn't grow much probably, so better stay conservative there. - Retire array_rd_sz tunable, blocking prefetch for large requests. We should not penalize applications trying to be more efficient. 
The speculative prefetcher by itself has reasonable distance limits, and 1MB is not much at all these days. Reviewed-by: Allan Jude Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #15072 --- include/sys/dmu_zfetch.h | 2 -- man/man4/zfs.4 | 3 --- module/zfs/dmu.c | 7 +++++-- module/zfs/dmu_zfetch.c | 12 +++++++----- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/include/sys/dmu_zfetch.h b/include/sys/dmu_zfetch.h index 0fbc3bacffb9..f00e13cf03a6 100644 --- a/include/sys/dmu_zfetch.h +++ b/include/sys/dmu_zfetch.h @@ -36,8 +36,6 @@ extern "C" { #endif -extern uint64_t zfetch_array_rd_sz; - struct dnode; /* so we can reference dnode */ typedef struct zfetch { diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 271b02b6ee42..7959bfe33b66 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -519,9 +519,6 @@ However, this is limited by Maximum micro ZAP size. A micro ZAP is upgraded to a fat ZAP, once it grows beyond the specified size. . -.It Sy zfetch_array_rd_sz Ns = Ns Sy 1048576 Ns B Po 1 MiB Pc Pq u64 -If prefetching is enabled, disable prefetching for reads larger than this size. -. .It Sy zfetch_min_distance Ns = Ns Sy 4194304 Ns B Po 4 MiB Pc Pq uint Min bytes to prefetch per stream. Prefetch distance starts from the demand access size and quickly grows to diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index dda869287c78..3a4560cec2c4 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -89,7 +89,11 @@ static int zfs_dmu_offset_next_sync = 1; * helps to limit the amount of memory that can be used by prefetching. * Larger objects should be prefetched a bit at a time. */ +#ifdef _ILP32 +uint_t dmu_prefetch_max = 8 * 1024 * 1024; +#else uint_t dmu_prefetch_max = 8 * SPA_MAXBLOCKSIZE; +#endif const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { {DMU_BSWAP_UINT8, TRUE, FALSE, FALSE, "unallocated" }, @@ -552,8 +556,7 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL); blkid = dbuf_whichblock(dn, 0, offset); - if ((flags & DMU_READ_NO_PREFETCH) == 0 && - length <= zfetch_array_rd_sz) { + if ((flags & DMU_READ_NO_PREFETCH) == 0) { /* * Prepare the zfetch before initiating the demand reads, so * that if multiple threads block on same indirect block, we diff --git a/module/zfs/dmu_zfetch.c b/module/zfs/dmu_zfetch.c index b70459380c24..d0acaf502066 100644 --- a/module/zfs/dmu_zfetch.c +++ b/module/zfs/dmu_zfetch.c @@ -52,14 +52,19 @@ static unsigned int zfetch_max_streams = 8; static unsigned int zfetch_min_sec_reap = 1; /* max time before stream delete */ static unsigned int zfetch_max_sec_reap = 2; +#ifdef _ILP32 +/* min bytes to prefetch per stream (default 2MB) */ +static unsigned int zfetch_min_distance = 2 * 1024 * 1024; +/* max bytes to prefetch per stream (default 8MB) */ +unsigned int zfetch_max_distance = 8 * 1024 * 1024; +#else /* min bytes to prefetch per stream (default 4MB) */ static unsigned int zfetch_min_distance = 4 * 1024 * 1024; /* max bytes to prefetch per stream (default 64MB) */ unsigned int zfetch_max_distance = 64 * 1024 * 1024; +#endif /* max bytes to prefetch indirects for per stream (default 64MB) */ unsigned int zfetch_max_idistance = 64 * 1024 * 1024; -/* max number of bytes in an array_read in which we allow prefetching (1MB) */ -uint64_t zfetch_array_rd_sz = 1024 * 1024; typedef struct zfetch_stats { kstat_named_t zfetchstat_hits; @@ -580,6 +585,3 @@ ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, 
max_distance, UINT, ZMOD_RW, ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_idistance, UINT, ZMOD_RW, "Max bytes to prefetch indirects for per stream"); - -ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, array_rd_sz, U64, ZMOD_RW, - "Number of bytes in a array_read"); -- cgit v1.2.3 From 46adb2820ac361700d54b8378314f8343c064642 Mon Sep 17 00:00:00 2001 From: Rob N Date: Sat, 22 Jul 2023 04:52:32 +1000 Subject: metaslab: tuneable to better control force ganging metaslab_force_ganging isn't enough to actually force ganging, because it still only forces 3% of the time. This adds metaslab_force_ganging_pct so we can configure how often to force ganging. Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Closes #15088 --- man/man4/zfs.4 | 7 ++++++- module/zfs/metaslab.c | 14 ++++++++++++-- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 7959bfe33b66..3843419731b8 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -15,7 +15,7 @@ .\" own identifying information: .\" Portions Copyright [yyyy] [name of copyright owner] .\" -.Dd January 10, 2023 +.Dd July 21, 2023 .Dt ZFS 4 .Os . @@ -239,6 +239,11 @@ relative to the pool. Make some blocks above a certain size be gang blocks. This option is used by the test suite to facilitate testing. . +.It Sy metaslab_force_ganging_pct Ns = Ns Sy 3 Ns % Pq uint +For blocks that could be forced to be a gang block (due to +.Sy metaslab_force_ganging ) , +force this many of them to be gang blocks. +. .It Sy zfs_ddt_zap_default_bs Ns = Ns Sy 15 Po 32 KiB Pc Pq int Default DDT ZAP data block size as a power of 2. Note that changing this after creating a DDT on the pool will not affect existing DDTs, only newly created diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index 176247d63b76..9991e1a22cdf 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -58,6 +58,11 @@ static uint64_t metaslab_aliquot = 1024 * 1024; */ uint64_t metaslab_force_ganging = SPA_MAXBLOCKSIZE + 1; +/* + * Of blocks of size >= metaslab_force_ganging, actually gang them this often. + */ +uint_t metaslab_force_ganging_pct = 3; + /* * In pools where the log space map feature is not enabled we touch * multiple metaslabs (and their respective space maps) with each @@ -5109,7 +5114,9 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, * damage can result in extremely long reconstruction times. This * will also test spilling from special to normal. 
*/ - if (psize >= metaslab_force_ganging && (random_in_range(100) < 3)) { + if (psize >= metaslab_force_ganging && + metaslab_force_ganging_pct > 0 && + (random_in_range(100) < MIN(metaslab_force_ganging_pct, 100))) { metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG, allocator); return (SET_ERROR(ENOSPC)); @@ -6266,7 +6273,10 @@ ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, switch_threshold, INT, ZMOD_RW, "Segment-based metaslab selection maximum buckets before switching"); ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, force_ganging, U64, ZMOD_RW, - "Blocks larger than this size are forced to be gang blocks"); + "Blocks larger than this size are sometimes forced to be gang blocks"); + +ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, force_ganging_pct, UINT, ZMOD_RW, + "Percentage of large blocks that will be forced to be gang blocks"); ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, df_max_search, UINT, ZMOD_RW, "Max distance (bytes) to search forward before using size tree"); -- cgit v1.2.3 From 13ec73a02830365ca1d188f6cc515b9f398fb7f3 Mon Sep 17 00:00:00 2001 From: Rob N Date: Sat, 22 Jul 2023 04:53:06 +1000 Subject: shellcheck: disable "unreachable command" check [SC2317] This new check in 0.9.0 appears to have some issues with various forms of "early return", like trap, exit and return. This is tripping up (at least): cmd/zed/zed.d/history_event-zfs-list-cacher.sh /etc/zfs/zfs-functions It's not obvious what it's complaining about or what the remedy is, so it seems sensible to disable this check for now. See also: https://www.shellcheck.net/wiki/SC2317 https://github.com/koalaman/shellcheck/issues/2542 https://github.com/koalaman/shellcheck/issues/2613 Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Closes #15089 --- config/Shellcheck.am | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/config/Shellcheck.am b/config/Shellcheck.am index 1cff81e04be8..1ab13516066c 100644 --- a/config/Shellcheck.am +++ b/config/Shellcheck.am @@ -4,6 +4,7 @@ # Not following: a was not specified as input (see shellcheck -x). [SC1091] # Prefer putting braces around variable references even when not strictly required. [SC2250] # Consider invoking this command separately to avoid masking its return value (or use '|| true' to ignore). [SC2312] +# Command appears to be unreachable. Check usage (or ignore if invoked indirectly). [SC2317] # In POSIX sh, 'local' is undefined. [SC2039] # older ShellCheck versions # In POSIX sh, 'local' is undefined. [SC3043] # newer ShellCheck versions @@ -18,7 +19,7 @@ PHONY += shellcheck _STGT = $(subst ^,/,$(subst shellcheck-here-,,$@)) shellcheck-here-%: if HAVE_SHELLCHECK - shellcheck --format=gcc --enable=all --exclude=SC1090,SC1091,SC2039,SC2250,SC2312,SC3043 $$([ -n "$(SHELLCHECK_SHELL)" ] && echo "--shell=$(SHELLCHECK_SHELL)") "$$([ -e "$(_STGT)" ] || echo "$(srcdir)/")$(_STGT)" + shellcheck --format=gcc --enable=all --exclude=SC1090,SC1091,SC2039,SC2250,SC2312,SC2317,SC3043 $$([ -n "$(SHELLCHECK_SHELL)" ] && echo "--shell=$(SHELLCHECK_SHELL)") "$$([ -e "$(_STGT)" ] || echo "$(srcdir)/")$(_STGT)" else @echo "skipping shellcheck of" $(_STGT) "because shellcheck is not installed" endif -- cgit v1.2.3 From fb344f5aeb7088238f9bd3c2a8507132b702cfed Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Mon, 24 Jul 2023 11:20:42 -0700 Subject: Linux 6.4 compat: META Update the META file to reflect compatibility with the 6.4 kernel.
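Returning briefly to the metaslab_force_ganging_pct change above: the new tunable only changes the probability gate, not the size threshold. A hedged standalone sketch of the decision (stand-ins for the tunables and PRNG, MIN() expanded by hand; the tunable values are illustrative, not ZFS defaults):

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Stand-ins for the ZFS tunables and PRNG (values illustrative). */
    static uint64_t metaslab_force_ganging = 16384;
    static uint32_t metaslab_force_ganging_pct = 25;

    static uint32_t
    random_in_range(uint32_t range)
    {
            return ((uint32_t)rand() % range);
    }

    /*
     * Gang an allocation of psize bytes iff it reaches the size
     * threshold AND a 0-99 draw falls under the configured percentage
     * (capped at 100, so larger values mean "always"; 0 disables it).
     */
    static int
    should_force_gang(uint64_t psize)
    {
            uint32_t pct = metaslab_force_ganging_pct;

            if (pct > 100)
                    pct = 100;
            return (psize >= metaslab_force_ganging && pct > 0 &&
                random_in_range(100) < pct);
    }

    int
    main(void)
    {
            int forced = 0;
            for (int i = 0; i < 1000; i++)
                    forced += should_force_gang(32768);
            printf("forced %d of 1000 eligible allocations\n", forced);
            return (0);
    }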
Reviewed-by: George Melikov Reviewed-by: Rob Norris Signed-off-by: Brian Behlendorf Closes #15095 --- META | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/META b/META index e6488a6fa6f0..3919b0df4680 100644 --- a/META +++ b/META @@ -6,5 +6,5 @@ Release: 1 Release-Tags: relext License: CDDL Author: OpenZFS -Linux-Maximum: 6.3 +Linux-Maximum: 6.4 Linux-Minimum: 3.10 -- cgit v1.2.3 From 2cb992a99ccadb78d97049b40bd442eb4fdc549d Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Mon, 24 Jul 2023 16:41:11 -0400 Subject: ZIL: Fix config lock deadlock. When we have some LWBs closed and their ZIOs ready to be issued, we cannot afford to sleep on the config lock if somebody else tries to lock it as a writer, or it will cause a deadlock. To solve it, move spa_config_enter() from zil_lwb_write_issue() to zil_lwb_write_close() under zl_issuer_lock to enforce lock ordering with other threads. Now if we can't immediately lock config, issue all previously closed LWBs so that they could drop their config locks after completion, and only then allow sleeping on our lock. Reviewed-by: Mark Maybee Reviewed-by: Prakash Surya Reviewed-by: George Wilson Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #15078 Closes #15080 --- module/zfs/zil.c | 34 +++++++++++++++++++++++++++------- 1 file changed, 27 insertions(+), 7 deletions(-) diff --git a/module/zfs/zil.c b/module/zfs/zil.c index 00d66a2481d7..af7137faaccf 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -151,6 +151,7 @@ static kmem_cache_t *zil_lwb_cache; static kmem_cache_t *zil_zcw_cache; static void zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx); +static void zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb); static itx_t *zil_itx_clone(itx_t *oitx); static int @@ -1768,7 +1769,7 @@ static uint_t zil_maxblocksize = SPA_OLD_MAXBLOCKSIZE; * Has to be called under zl_issuer_lock to chain more lwbs. */ static lwb_t * -zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb) +zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb, list_t *ilwbs) { lwb_t *nlwb = NULL; zil_chain_t *zilc; @@ -1870,6 +1871,27 @@ zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb) dmu_tx_commit(tx); + /* + * We need to acquire the config lock for the lwb to issue it later. + * However, if we already have a queue of closed parent lwbs already + * holding the config lock (but not yet issued), we can't block here + * waiting on the lock or we will deadlock. In that case we must + * first issue to parent IOs before waiting on the lock. + */ + if (ilwbs && !list_is_empty(ilwbs)) { + if (!spa_config_tryenter(spa, SCL_STATE, lwb, RW_READER)) { + lwb_t *tlwb; + while ((tlwb = list_remove_head(ilwbs)) != NULL) + zil_lwb_write_issue(zilog, tlwb); + spa_config_enter(spa, SCL_STATE, lwb, RW_READER); + } + } else { + spa_config_enter(spa, SCL_STATE, lwb, RW_READER); + } + + if (ilwbs) + list_insert_tail(ilwbs, lwb); + /* * If there was an allocation failure then nlwb will be null which * forces a txg_wait_synced().
@@ -1933,7 +1955,7 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) ZIL_STAT_INCR(zilog, zil_itx_metaslab_normal_alloc, BP_GET_LSIZE(&lwb->lwb_blk)); } - spa_config_enter(zilog->zl_spa, SCL_STATE, lwb, RW_READER); + ASSERT(spa_config_held(zilog->zl_spa, SCL_STATE, RW_READER)); zil_lwb_add_block(lwb, &lwb->lwb_blk); lwb->lwb_issued_timestamp = gethrtime(); zio_nowait(lwb->lwb_root_zio); @@ -2037,8 +2059,7 @@ cont: lwb_sp < zil_max_waste_space(zilog) && (dlen % max_log_data == 0 || lwb_sp < reclen + dlen % max_log_data))) { - list_insert_tail(ilwbs, lwb); - lwb = zil_lwb_write_close(zilog, lwb); + lwb = zil_lwb_write_close(zilog, lwb, ilwbs); if (lwb == NULL) return (NULL); zil_lwb_write_open(zilog, lwb); @@ -2937,8 +2958,7 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs) zfs_commit_timeout_pct / 100; if (sleep < zil_min_commit_timeout || lwb->lwb_sz - lwb->lwb_nused < lwb->lwb_sz / 8) { - list_insert_tail(ilwbs, lwb); - lwb = zil_lwb_write_close(zilog, lwb); + lwb = zil_lwb_write_close(zilog, lwb, ilwbs); zilog->zl_cur_used = 0; if (lwb == NULL) { while ((lwb = list_remove_head(ilwbs)) @@ -3096,7 +3116,7 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw) * since we've reached the commit waiter's timeout and it still * hasn't been issued. */ - lwb_t *nlwb = zil_lwb_write_close(zilog, lwb); + lwb_t *nlwb = zil_lwb_write_close(zilog, lwb, NULL); ASSERT3S(lwb->lwb_state, !=, LWB_STATE_OPENED); -- cgit v1.2.3 From 8d21c002c6ae9f0d406903821e832698fa32b711 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Thu, 22 Jun 2023 13:44:00 +1000 Subject: zfs_clone_range: use vmem_malloc for large allocation Just silencing the warning about large allocations. Reviewed-by: Brian Behlendorf Reviewed-by: Kay Pedersen Signed-off-by: Rob Norris Sponsored-By: OpenDrives Inc. Sponsored-By: Klara Inc. Closes #15050 --- module/zfs/zfs_vnops.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index 7bdcc1639384..3ebd2d0ff7c5 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -1212,7 +1212,7 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, gid = KGID_TO_SGID(ZTOGID(outzp)); projid = outzp->z_projid; - bps = kmem_alloc(sizeof (bps[0]) * maxblocks, KM_SLEEP); + bps = vmem_alloc(sizeof (bps[0]) * maxblocks, KM_SLEEP); /* * Clone the file in reasonable size chunks. Each chunk is cloned @@ -1330,7 +1330,7 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, done += size; } - kmem_free(bps, sizeof (bps[0]) * maxblocks); + vmem_free(bps, sizeof (bps[0]) * maxblocks); zfs_znode_update_vfs(outzp); unlock: -- cgit v1.2.3 From 87a6e135c5f5fcf9944adfb7c67cd9b56df72ea2 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Thu, 22 Jun 2023 13:44:00 +1000 Subject: brt_vdev_realloc: use vmem_alloc for large allocation bv_entcount can be a relatively large allocation (see comment for BRT_RANGESIZE), so get it from the big allocator. Reviewed-by: Brian Behlendorf Reviewed-by: Kay Pedersen Signed-off-by: Rob Norris Sponsored-By: OpenDrives Inc. Sponsored-By: Klara Inc. 
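Both this change and the preceding zfs_clone_range() one apply the same rule of thumb: kmem_alloc() for small buffers, vmem_alloc() for anything that can grow large, with the free path matching the alloc path in both allocator and size. A hedged kernel-side sketch of that split as a helper pair; the 32 KiB cutoff is an illustrative assumption, not a value from the ZFS source:

    #include <sys/kmem.h>
    #include <sys/vmem.h>

    /* Illustrative cutoff only -- not from the ZFS source. */
    #define BIG_ALLOC_CUTOFF        (32 * 1024)

    static void *
    big_alloc(size_t size)
    {
            /* Large buffers go to the virtual-memory-backed allocator. */
            if (size > BIG_ALLOC_CUTOFF)
                    return (vmem_alloc(size, KM_SLEEP));
            return (kmem_alloc(size, KM_SLEEP));
    }

    static void
    big_free(void *buf, size_t size)
    {
            /* Must mirror big_alloc(): same allocator, same size. */
            if (size > BIG_ALLOC_CUTOFF)
                    vmem_free(buf, size);
            else
                    kmem_free(buf, size);
    }

Mixing the pairs up (kmem_free() of a vmem_alloc() buffer, or vice versa) is a bug, which is why both patches convert the alloc and every matching free together.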
Closes #15050 --- module/zfs/brt.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/module/zfs/brt.c b/module/zfs/brt.c index 99bd472d6fb4..877b503a1bf2 100644 --- a/module/zfs/brt.c +++ b/module/zfs/brt.c @@ -680,7 +680,7 @@ brt_vdev_realloc(brt_t *brt, brt_vdev_t *brtvd) size = (vdev_get_min_asize(vd) - 1) / brt->brt_rangesize + 1; spa_config_exit(brt->brt_spa, SCL_VDEV, FTAG); - entcount = kmem_zalloc(sizeof (entcount[0]) * size, KM_SLEEP); + entcount = vmem_zalloc(sizeof (entcount[0]) * size, KM_SLEEP); nblocks = BRT_RANGESIZE_TO_NBLOCKS(size); bitmap = kmem_zalloc(BT_SIZEOFMAP(nblocks), KM_SLEEP); @@ -709,7 +709,7 @@ brt_vdev_realloc(brt_t *brt, brt_vdev_t *brtvd) sizeof (entcount[0]) * MIN(size, brtvd->bv_size)); memcpy(bitmap, brtvd->bv_bitmap, MIN(BT_SIZEOFMAP(nblocks), BT_SIZEOFMAP(brtvd->bv_nblocks))); - kmem_free(brtvd->bv_entcount, + vmem_free(brtvd->bv_entcount, sizeof (entcount[0]) * brtvd->bv_size); kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(brtvd->bv_nblocks)); } @@ -792,7 +792,7 @@ brt_vdev_dealloc(brt_t *brt, brt_vdev_t *brtvd) ASSERT(RW_WRITE_HELD(&brt->brt_lock)); ASSERT(brtvd->bv_initiated); - kmem_free(brtvd->bv_entcount, sizeof (uint16_t) * brtvd->bv_size); + vmem_free(brtvd->bv_entcount, sizeof (uint16_t) * brtvd->bv_size); brtvd->bv_entcount = NULL; kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(brtvd->bv_nblocks)); brtvd->bv_bitmap = NULL; -- cgit v1.2.3 From d4edecd1a29f9162811dacf1500e2f3daf74a010 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Mon, 24 Jul 2023 17:54:05 +1000 Subject: dmu_buf_will_clone: only check that current txg is clean dbuf_undirty() will (correctly) only remove dirty records for the given (open) txg. If there is a dirty record for an earlier closed txg that has not been synced out yet, then db_dirty_records will still have entries on it, tripping the assertion. Instead, change the assertion to only consider the current txg. To some extent this is redundant, as it's really just saying "did dbuf_undirty() work?", but it doesn't hurt and accurately expresses our expectations. Reviewed-by: Brian Behlendorf Reviewed-by: Kay Pedersen Signed-off-by: Rob Norris Original-patch-by: Kay Pedersen Sponsored-By: OpenDrives Inc. Sponsored-By: Klara Inc. Closes #15050 --- module/zfs/dbuf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 1ea075217fb1..fbeac866ae91 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -2701,7 +2701,7 @@ dmu_buf_will_clone(dmu_buf_t *db_fake, dmu_tx_t *tx) */ mutex_enter(&db->db_mtx); VERIFY(!dbuf_undirty(db, tx)); - ASSERT(list_head(&db->db_dirty_records) == NULL); + ASSERT0(dbuf_find_dirty_eq(db, tx->tx_txg)); if (db->db_buf != NULL) { arc_buf_destroy(db->db_buf, db); db->db_buf = NULL; -- cgit v1.2.3 From f6facd242937e52ab1ad5a7fd3b6bbbb6ce08050 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Mon, 24 Jul 2023 18:02:21 +1000 Subject: dbuf_sync_leaf: check DB_READ in state assertions Block cloning introduced a new state transition from DB_NOFILL to DB_READ. This occurs when a block is cloned and then read on the current txg. In this case, the clone will move the dbuf to DB_NOFILL, and then the read will be issued for the overridden block pointer. If that read is still outstanding when it comes time to write, the dbuf will be in DB_READ, which is not handled by the checks in dbuf_sync_leaf, thus tripping the assertions.
This updates those checks to allow DB_READ as a valid state iff the dirty record is for a BRT write and there is an override block pointer. This is a safe situation because the block already exists, so there's nothing that could change from underneath the read. Reviewed-by: Brian Behlendorf Reviewed-by: Kay Pedersen Signed-off-by: Rob Norris Original-patch-by: Kay Pedersen Sponsored-By: OpenDrives Inc. Sponsored-By: Klara Inc. Closes #15050 --- module/zfs/dbuf.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index fbeac866ae91..b7453578a76f 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -4457,6 +4457,15 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) } else if (db->db_state == DB_FILL) { /* This buffer was freed and is now being re-filled */ ASSERT(db->db.db_data != dr->dt.dl.dr_data); + } else if (db->db_state == DB_READ) { + /* + * This buffer has a clone we need to write, and an in-flight + * read on the BP we're about to clone. Its safe to issue the + * write here because the read has already been issued and the + * contents won't change. + */ + ASSERT(dr->dt.dl.dr_brtwrite && + dr->dt.dl.dr_override_state == DR_OVERRIDDEN); } else { ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL); } -- cgit v1.2.3 From 5a35c68b67473a7ae0a75c4beb51c928d3e2628d Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Tue, 27 Jun 2023 23:44:53 +1000 Subject: linux: implement filesystem-side copy/clone functions This implements the Linux VFS ops required to service the file copy/clone APIs: .copy_file_range (4.5+) .clone_file_range (4.5-4.19) .dedupe_file_range (4.5-4.19) .remap_file_range (4.20+) Note that dedupe_file_range() and remap_file_range(REMAP_FILE_DEDUP) are hooked up here, but are not implemented yet. Reviewed-by: Brian Behlendorf Reviewed-by: Kay Pedersen Signed-off-by: Rob Norris Sponsored-By: OpenDrives Inc. Sponsored-By: Klara Inc.
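From userspace, the new .copy_file_range handler is reached through the ordinary copy_file_range(2) syscall (exposed by glibc 2.27 and later). A minimal caller, as a sketch: it assumes the whole file can be copied in one call, whereas a robust caller loops, since the syscall is permitted to copy fewer bytes than requested.

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <unistd.h>

    int
    main(int argc, char **argv)
    {
            if (argc != 3) {
                    fprintf(stderr, "usage: %s <src> <dst>\n", argv[0]);
                    return (1);
            }

            int sfd = open(argv[1], O_RDONLY);
            int dfd = open(argv[2], O_WRONLY | O_CREAT | O_TRUNC, 0644);
            if (sfd < 0 || dfd < 0) {
                    perror("open");
                    return (1);
            }

            off_t len = lseek(sfd, 0, SEEK_END);
            (void) lseek(sfd, 0, SEEK_SET);

            /* NULL offsets: use and advance the file offsets. */
            ssize_t n = copy_file_range(sfd, NULL, dfd, NULL, len, 0);
            if (n < 0) {
                    perror("copy_file_range");
                    return (1);
            }
            printf("copied %zd of %jd bytes\n", n, (intmax_t)len);

            close(sfd);
            close(dfd);
            return (0);
    }

On a pool with block cloning enabled, the same program should complete without allocating new data blocks; on kernels or filesystems without clone support it falls back to an ordinary copy, which is exactly the split these VFS ops implement.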
Closes #15050 --- config/kernel-vfs-file_range.m4 | 164 +++++++++++++++++++++++++++++++ config/kernel.m4 | 10 ++ include/os/linux/zfs/sys/zpl.h | 14 +++ module/Kbuild.in | 1 + module/os/linux/zfs/zpl_file.c | 13 ++- module/os/linux/zfs/zpl_file_range.c | 183 +++++++++++++++++++++++++++++++++++ 6 files changed, 384 insertions(+), 1 deletion(-) create mode 100644 config/kernel-vfs-file_range.m4 create mode 100644 module/os/linux/zfs/zpl_file_range.c diff --git a/config/kernel-vfs-file_range.m4 b/config/kernel-vfs-file_range.m4 new file mode 100644 index 000000000000..cc96404d8bbe --- /dev/null +++ b/config/kernel-vfs-file_range.m4 @@ -0,0 +1,164 @@ +dnl # +dnl # The *_file_range APIs have a long history: +dnl # +dnl # 2.6.29: BTRFS_IOC_CLONE and BTRFS_IOC_CLONE_RANGE ioctl introduced +dnl # 3.12: BTRFS_IOC_FILE_EXTENT_SAME ioctl introduced +dnl # +dnl # 4.5: copy_file_range() syscall introduced, added to VFS +dnl # 4.5: BTRFS_IOC_CLONE and BTRFS_IOC_CLONE_RANGE renamed to FICLONE ands +dnl # FICLONERANGE, added to VFS as clone_file_range() +dnl # 4.5: BTRFS_IOC_FILE_EXTENT_SAME renamed to FIDEDUPERANGE, added to VFS +dnl # as dedupe_file_range() +dnl # +dnl # 4.20: VFS clone_file_range() and dedupe_file_range() replaced by +dnl # remap_file_range() +dnl # +dnl # 5.3: VFS copy_file_range() expected to do its own fallback, +dnl # generic_copy_file_range() added to support it +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_COPY_FILE_RANGE], [ + ZFS_LINUX_TEST_SRC([vfs_copy_file_range], [ + #include + + static ssize_t test_copy_file_range(struct file *src_file, + loff_t src_off, struct file *dst_file, loff_t dst_off, + size_t len, unsigned int flags) { + (void) src_file; (void) src_off; + (void) dst_file; (void) dst_off; + (void) len; (void) flags; + return (0); + } + + static const struct file_operations + fops __attribute__ ((unused)) = { + .copy_file_range = test_copy_file_range, + }; + ],[]) +]) +AC_DEFUN([ZFS_AC_KERNEL_VFS_COPY_FILE_RANGE], [ + AC_MSG_CHECKING([whether fops->copy_file_range() is available]) + ZFS_LINUX_TEST_RESULT([vfs_copy_file_range], [ + AC_MSG_RESULT([yes]) + AC_DEFINE(HAVE_VFS_COPY_FILE_RANGE, 1, + [fops->copy_file_range() is available]) + ],[ + AC_MSG_RESULT([no]) + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_GENERIC_COPY_FILE_RANGE], [ + ZFS_LINUX_TEST_SRC([generic_copy_file_range], [ + #include + ], [ + struct file *src_file __attribute__ ((unused)) = NULL; + loff_t src_off __attribute__ ((unused)) = 0; + struct file *dst_file __attribute__ ((unused)) = NULL; + loff_t dst_off __attribute__ ((unused)) = 0; + size_t len __attribute__ ((unused)) = 0; + unsigned int flags __attribute__ ((unused)) = 0; + generic_copy_file_range(src_file, src_off, dst_file, dst_off, + len, flags); + ]) +]) +AC_DEFUN([ZFS_AC_KERNEL_VFS_GENERIC_COPY_FILE_RANGE], [ + AC_MSG_CHECKING([whether generic_copy_file_range() is available]) + ZFS_LINUX_TEST_RESULT_SYMBOL([generic_copy_file_range], + [generic_copy_file_range], [fs/read_write.c], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_VFS_GENERIC_COPY_FILE_RANGE, 1, + [generic_copy_file_range() is available]) + ],[ + AC_MSG_RESULT(no) + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_CLONE_FILE_RANGE], [ + ZFS_LINUX_TEST_SRC([vfs_clone_file_range], [ + #include + + static int test_clone_file_range(struct file *src_file, + loff_t src_off, struct file *dst_file, loff_t dst_off, + u64 len) { + (void) src_file; (void) src_off; + (void) dst_file; (void) dst_off; + (void) len; + return (0); + } + + static const struct file_operations + fops __attribute__ ((unused)) = { + 
.clone_file_range = test_clone_file_range, + }; + ],[]) +]) +AC_DEFUN([ZFS_AC_KERNEL_VFS_CLONE_FILE_RANGE], [ + AC_MSG_CHECKING([whether fops->clone_file_range() is available]) + ZFS_LINUX_TEST_RESULT([vfs_clone_file_range], [ + AC_MSG_RESULT([yes]) + AC_DEFINE(HAVE_VFS_CLONE_FILE_RANGE, 1, + [fops->clone_file_range() is available]) + ],[ + AC_MSG_RESULT([no]) + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_DEDUPE_FILE_RANGE], [ + ZFS_LINUX_TEST_SRC([vfs_dedupe_file_range], [ + #include + + static int test_dedupe_file_range(struct file *src_file, + loff_t src_off, struct file *dst_file, loff_t dst_off, + u64 len) { + (void) src_file; (void) src_off; + (void) dst_file; (void) dst_off; + (void) len; + return (0); + } + + static const struct file_operations + fops __attribute__ ((unused)) = { + .dedupe_file_range = test_dedupe_file_range, + }; + ],[]) +]) +AC_DEFUN([ZFS_AC_KERNEL_VFS_DEDUPE_FILE_RANGE], [ + AC_MSG_CHECKING([whether fops->dedupe_file_range() is available]) + ZFS_LINUX_TEST_RESULT([vfs_dedupe_file_range], [ + AC_MSG_RESULT([yes]) + AC_DEFINE(HAVE_VFS_DEDUPE_FILE_RANGE, 1, + [fops->dedupe_file_range() is available]) + ],[ + AC_MSG_RESULT([no]) + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_REMAP_FILE_RANGE], [ + ZFS_LINUX_TEST_SRC([vfs_remap_file_range], [ + #include + + static loff_t test_remap_file_range(struct file *src_file, + loff_t src_off, struct file *dst_file, loff_t dst_off, + loff_t len, unsigned int flags) { + (void) src_file; (void) src_off; + (void) dst_file; (void) dst_off; + (void) len; (void) flags; + return (0); + } + + static const struct file_operations + fops __attribute__ ((unused)) = { + .remap_file_range = test_remap_file_range, + }; + ],[]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_VFS_REMAP_FILE_RANGE], [ + AC_MSG_CHECKING([whether fops->remap_file_range() is available]) + ZFS_LINUX_TEST_RESULT([vfs_remap_file_range], [ + AC_MSG_RESULT([yes]) + AC_DEFINE(HAVE_VFS_REMAP_FILE_RANGE, 1, + [fops->remap_file_range() is available]) + ],[ + AC_MSG_RESULT([no]) + ]) +]) diff --git a/config/kernel.m4 b/config/kernel.m4 index cb7e736c9a43..b17ccfdeec92 100644 --- a/config/kernel.m4 +++ b/config/kernel.m4 @@ -116,6 +116,11 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ ZFS_AC_KERNEL_SRC_VFS_RW_ITERATE ZFS_AC_KERNEL_SRC_VFS_GENERIC_WRITE_CHECKS ZFS_AC_KERNEL_SRC_VFS_IOV_ITER + ZFS_AC_KERNEL_SRC_VFS_COPY_FILE_RANGE + ZFS_AC_KERNEL_SRC_VFS_GENERIC_COPY_FILE_RANGE + ZFS_AC_KERNEL_SRC_VFS_REMAP_FILE_RANGE + ZFS_AC_KERNEL_SRC_VFS_CLONE_FILE_RANGE + ZFS_AC_KERNEL_SRC_VFS_DEDUPE_FILE_RANGE ZFS_AC_KERNEL_SRC_KMAP_ATOMIC_ARGS ZFS_AC_KERNEL_SRC_FOLLOW_DOWN_ONE ZFS_AC_KERNEL_SRC_MAKE_REQUEST_FN @@ -249,6 +254,11 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ ZFS_AC_KERNEL_VFS_RW_ITERATE ZFS_AC_KERNEL_VFS_GENERIC_WRITE_CHECKS ZFS_AC_KERNEL_VFS_IOV_ITER + ZFS_AC_KERNEL_VFS_COPY_FILE_RANGE + ZFS_AC_KERNEL_VFS_GENERIC_COPY_FILE_RANGE + ZFS_AC_KERNEL_VFS_REMAP_FILE_RANGE + ZFS_AC_KERNEL_VFS_CLONE_FILE_RANGE + ZFS_AC_KERNEL_VFS_DEDUPE_FILE_RANGE ZFS_AC_KERNEL_KMAP_ATOMIC_ARGS ZFS_AC_KERNEL_FOLLOW_DOWN_ONE ZFS_AC_KERNEL_MAKE_REQUEST_FN diff --git a/include/os/linux/zfs/sys/zpl.h b/include/os/linux/zfs/sys/zpl.h index 2b302e9dab07..8b0e79afb0f1 100644 --- a/include/os/linux/zfs/sys/zpl.h +++ b/include/os/linux/zfs/sys/zpl.h @@ -180,6 +180,20 @@ zpl_dir_emit_dots(struct file *file, zpl_dir_context_t *ctx) } #endif /* HAVE_VFS_ITERATE */ + +/* zpl_file_range.c */ + +/* handlers for file_operations of the same name */ +extern ssize_t zpl_copy_file_range(struct file *src_file, loff_t src_off, + struct file 
*dst_file, loff_t dst_off, size_t len, unsigned int flags); +extern loff_t zpl_remap_file_range(struct file *src_file, loff_t src_off, + struct file *dst_file, loff_t dst_off, loff_t len, unsigned int flags); +extern int zpl_clone_file_range(struct file *src_file, loff_t src_off, + struct file *dst_file, loff_t dst_off, uint64_t len); +extern int zpl_dedupe_file_range(struct file *src_file, loff_t src_off, + struct file *dst_file, loff_t dst_off, uint64_t len); + + #if defined(HAVE_INODE_TIMESTAMP_TRUNCATE) #define zpl_inode_timestamp_truncate(ts, ip) timestamp_truncate(ts, ip) #elif defined(HAVE_INODE_TIMESPEC64_TIMES) diff --git a/module/Kbuild.in b/module/Kbuild.in index 485331ac655e..c132171592a8 100644 --- a/module/Kbuild.in +++ b/module/Kbuild.in @@ -461,6 +461,7 @@ ZFS_OBJS_OS := \ zpl_ctldir.o \ zpl_export.o \ zpl_file.o \ + zpl_file_range.o \ zpl_inode.o \ zpl_super.o \ zpl_xattr.o \ diff --git a/module/os/linux/zfs/zpl_file.c b/module/os/linux/zfs/zpl_file.c index e690525d3cd4..92b603e98a23 100644 --- a/module/os/linux/zfs/zpl_file.c +++ b/module/os/linux/zfs/zpl_file.c @@ -1283,7 +1283,6 @@ zpl_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) } #endif /* CONFIG_COMPAT */ - const struct address_space_operations zpl_address_space_operations = { #ifdef HAVE_VFS_READPAGES .readpages = zpl_readpages, @@ -1333,6 +1332,18 @@ const struct file_operations zpl_file_operations = { .aio_fsync = zpl_aio_fsync, #endif .fallocate = zpl_fallocate, +#ifdef HAVE_VFS_COPY_FILE_RANGE + .copy_file_range = zpl_copy_file_range, +#endif +#ifdef HAVE_VFS_REMAP_FILE_RANGE + .remap_file_range = zpl_remap_file_range, +#endif +#ifdef HAVE_VFS_CLONE_FILE_RANGE + .clone_file_range = zpl_clone_file_range, +#endif +#ifdef HAVE_VFS_DEDUPE_FILE_RANGE + .dedupe_file_range = zpl_dedupe_file_range, +#endif #ifdef HAVE_FILE_FADVISE .fadvise = zpl_fadvise, #endif diff --git a/module/os/linux/zfs/zpl_file_range.c b/module/os/linux/zfs/zpl_file_range.c new file mode 100644 index 000000000000..db387a748130 --- /dev/null +++ b/module/os/linux/zfs/zpl_file_range.c @@ -0,0 +1,183 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2023, Klara Inc. + */ + +#ifdef CONFIG_COMPAT +#include +#endif +#include +#include +#include +#include +#include + +/* + * Clone part of a file via block cloning. + * + * Note that we are not required to update file offsets; the kernel will take + * care of that depending on how it was called. 
+ */ +static ssize_t +__zpl_clone_file_range(struct file *src_file, loff_t src_off, + struct file *dst_file, loff_t dst_off, size_t len) +{ + struct inode *src_i = file_inode(src_file); + struct inode *dst_i = file_inode(dst_file); + uint64_t src_off_o = (uint64_t)src_off; + uint64_t dst_off_o = (uint64_t)dst_off; + uint64_t len_o = (uint64_t)len; + cred_t *cr = CRED(); + fstrans_cookie_t cookie; + int err; + + if (!spa_feature_is_enabled( + dmu_objset_spa(ITOZSB(dst_i)->z_os), SPA_FEATURE_BLOCK_CLONING)) + return (-EOPNOTSUPP); + + if (src_i != dst_i) + spl_inode_lock_shared(src_i); + spl_inode_lock(dst_i); + + crhold(cr); + cookie = spl_fstrans_mark(); + + err = -zfs_clone_range(ITOZ(src_i), &src_off_o, ITOZ(dst_i), + &dst_off_o, &len_o, cr); + + spl_fstrans_unmark(cookie); + crfree(cr); + + spl_inode_unlock(dst_i); + if (src_i != dst_i) + spl_inode_unlock_shared(src_i); + + if (err < 0) + return (err); + + return ((ssize_t)len_o); +} + +#ifdef HAVE_VFS_COPY_FILE_RANGE +/* + * Entry point for copy_file_range(). Copy len bytes from src_off in src_file + * to dst_off in dst_file. We are permitted to do this however we like, so we + * try to just clone the blocks, and if we can't support it, fall back to the + * kernel's generic byte copy function. + */ +ssize_t +zpl_copy_file_range(struct file *src_file, loff_t src_off, + struct file *dst_file, loff_t dst_off, size_t len, unsigned int flags) +{ + ssize_t ret; + + if (flags != 0) + return (-EINVAL); + + /* Try to do it via zfs_clone_range() */ + ret =__zpl_clone_file_range(src_file, src_off, + dst_file, dst_off, len); + +#ifdef HAVE_VFS_GENERIC_COPY_FILE_RANGE + /* + * Since Linux 5.3 the filesystem driver is responsible for executing + * an appropriate fallback, and a generic fallback function is provided. + */ + if (ret == -EOPNOTSUPP || ret == -EXDEV) + ret = generic_copy_file_range(src_file, src_off, dst_file, + dst_off, len, flags); +#endif /* HAVE_VFS_GENERIC_COPY_FILE_RANGE */ + + return (ret); +} +#endif /* HAVE_VFS_COPY_FILE_RANGE */ + +#ifdef HAVE_VFS_REMAP_FILE_RANGE +/* + * Entry point for FICLONE/FICLONERANGE/FIDEDUPERANGE. + * + * FICLONE and FICLONERANGE are basically the same as copy_file_range(), except + * that they must clone - they cannot fall back to copying. FICLONE is exactly + * FICLONERANGE, for the entire file. We don't need to try to tell them apart; + * the kernel will sort that out for us. + * + * FIDEDUPERANGE is for turning a non-clone into a clone, that is, compare the + * range in both files and if they're the same, arrange for them to be backed + * by the same storage. + */ +loff_t +zpl_remap_file_range(struct file *src_file, loff_t src_off, + struct file *dst_file, loff_t dst_off, loff_t len, unsigned int flags) +{ + if (flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_CAN_SHORTEN)) + return (-EINVAL); + + /* + * REMAP_FILE_CAN_SHORTEN lets us know we can clone less than the given + * range if we want. Its designed for filesystems that make data past + * EOF available, and don't want it to be visible in both files. ZFS + * doesn't do that, so we just turn the flag off. 
+ */ + flags &= ~REMAP_FILE_CAN_SHORTEN; + + if (flags & REMAP_FILE_DEDUP) + /* No support for dedup yet */ + return (-EOPNOTSUPP); + + /* Zero length means to clone everything to the end of the file */ + if (len == 0) + len = i_size_read(file_inode(src_file)) - src_off; + + return (__zpl_clone_file_range(src_file, src_off, + dst_file, dst_off, len)); +} +#endif /* HAVE_VFS_REMAP_FILE_RANGE */ + +#ifdef HAVE_VFS_CLONE_FILE_RANGE +/* + * Entry point for FICLONE and FICLONERANGE, before Linux 4.20. + */ +int +zpl_clone_file_range(struct file *src_file, loff_t src_off, + struct file *dst_file, loff_t dst_off, uint64_t len) +{ + /* Zero length means to clone everything to the end of the file */ + if (len == 0) + len = i_size_read(file_inode(src_file)) - src_off; + + return (__zpl_clone_file_range(src_file, src_off, + dst_file, dst_off, len)); +} +#endif /* HAVE_VFS_CLONE_FILE_RANGE */ + +#ifdef HAVE_VFS_DEDUPE_FILE_RANGE +/* + * Entry point for FIDEDUPERANGE, before Linux 4.20. + */ +int +zpl_dedupe_file_range(struct file *src_file, loff_t src_off, + struct file *dst_file, loff_t dst_off, uint64_t len) +{ + /* No support for dedup yet */ + return (-EOPNOTSUPP); +} +#endif /* HAVE_VFS_DEDUPE_FILE_RANGE */ -- cgit v1.2.3 From 9927f219f1e9f4ee886d426190500abf5b1d602e Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Tue, 27 Jun 2023 23:45:00 +1000 Subject: linux: implement filesystem-side clone ioctls Prior to Linux 4.5, the FICLONE etc ioctls were specific to BTRFS, and were implemented as regular filesystem-specific ioctls. This implements those ioctls directly in OpenZFS, allowing cloning to work on older kernels. There's no need to gate these behind version checks; on later kernels Linux will simply never deliver these ioctls, instead calling the appropriate VFS op. Reviewed-by: Brian Behlendorf Reviewed-by: Kay Pedersen Signed-off-by: Rob Norris Sponsored-By: OpenDrives Inc. Sponsored-By: Klara Inc.
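The compat path added here is exercised from userspace with a plain ioctl(2). A minimal sketch follows; the fallback FICLONE definition mirrors the ZFS_IOC_COMPAT_FICLONE value used in this patch, for pre-4.5 headers that do not provide it.

    #include <sys/ioctl.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    #ifndef FICLONE         /* pre-4.5 headers; value matches the patch */
    #define FICLONE _IOW(0x94, 9, int)
    #endif

    int
    main(int argc, char **argv)
    {
            if (argc != 3) {
                    fprintf(stderr, "usage: %s <src> <dst>\n", argv[0]);
                    return (1);
            }

            int sfd = open(argv[1], O_RDONLY);
            int dfd = open(argv[2], O_WRONLY | O_CREAT | O_TRUNC, 0644);
            if (sfd < 0 || dfd < 0) {
                    perror("open");
                    return (1);
            }

            /* Clone all of src over dst; both files must be on ZFS. */
            if (ioctl(dfd, FICLONE, sfd) < 0) {
                    perror("ioctl(FICLONE)");
                    return (1);
            }

            close(sfd);
            close(dfd);
            return (0);
    }

Note the f_op comparison in the patch: if the two descriptors belong to different filesystem drivers the ioctl fails with EXDEV, so a caller can fall back to a regular copy on that error.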
Closes #15050 --- include/os/linux/zfs/sys/zpl.h | 35 ++++++++++++++++ module/os/linux/zfs/zpl_file.c | 6 +++ module/os/linux/zfs/zpl_file_range.c | 79 ++++++++++++++++++++++++++++++++++++ 3 files changed, 120 insertions(+) diff --git a/include/os/linux/zfs/sys/zpl.h b/include/os/linux/zfs/sys/zpl.h index 8b0e79afb0f1..b62ab5eec81f 100644 --- a/include/os/linux/zfs/sys/zpl.h +++ b/include/os/linux/zfs/sys/zpl.h @@ -193,6 +193,41 @@ extern int zpl_clone_file_range(struct file *src_file, loff_t src_off, extern int zpl_dedupe_file_range(struct file *src_file, loff_t src_off, struct file *dst_file, loff_t dst_off, uint64_t len); +/* compat for FICLONE/FICLONERANGE/FIDEDUPERANGE ioctls */ +typedef struct { + int64_t fcr_src_fd; + uint64_t fcr_src_offset; + uint64_t fcr_src_length; + uint64_t fcr_dest_offset; +} zfs_ioc_compat_file_clone_range_t; + +typedef struct { + int64_t fdri_dest_fd; + uint64_t fdri_dest_offset; + uint64_t fdri_bytes_deduped; + int32_t fdri_status; + uint32_t fdri_reserved; +} zfs_ioc_compat_dedupe_range_info_t; + +typedef struct { + uint64_t fdr_src_offset; + uint64_t fdr_src_length; + uint16_t fdr_dest_count; + uint16_t fdr_reserved1; + uint32_t fdr_reserved2; + zfs_ioc_compat_dedupe_range_info_t fdr_info[]; +} zfs_ioc_compat_dedupe_range_t; + +#define ZFS_IOC_COMPAT_FICLONE _IOW(0x94, 9, int) +#define ZFS_IOC_COMPAT_FICLONERANGE \ + _IOW(0x94, 13, zfs_ioc_compat_file_clone_range_t) +#define ZFS_IOC_COMPAT_FIDEDUPERANGE \ + _IOWR(0x94, 54, zfs_ioc_compat_dedupe_range_t) + +extern long zpl_ioctl_ficlone(struct file *filp, void *arg); +extern long zpl_ioctl_ficlonerange(struct file *filp, void *arg); +extern long zpl_ioctl_fideduperange(struct file *filp, void *arg); + #if defined(HAVE_INODE_TIMESTAMP_TRUNCATE) #define zpl_inode_timestamp_truncate(ts, ip) timestamp_truncate(ts, ip) diff --git a/module/os/linux/zfs/zpl_file.c b/module/os/linux/zfs/zpl_file.c index 92b603e98a23..87a248af8303 100644 --- a/module/os/linux/zfs/zpl_file.c +++ b/module/os/linux/zfs/zpl_file.c @@ -1257,6 +1257,12 @@ zpl_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return (zpl_ioctl_getdosflags(filp, (void *)arg)); case ZFS_IOC_SETDOSFLAGS: return (zpl_ioctl_setdosflags(filp, (void *)arg)); + case ZFS_IOC_COMPAT_FICLONE: + return (zpl_ioctl_ficlone(filp, (void *)arg)); + case ZFS_IOC_COMPAT_FICLONERANGE: + return (zpl_ioctl_ficlonerange(filp, (void *)arg)); + case ZFS_IOC_COMPAT_FIDEDUPERANGE: + return (zpl_ioctl_fideduperange(filp, (void *)arg)); default: return (-ENOTTY); } diff --git a/module/os/linux/zfs/zpl_file_range.c b/module/os/linux/zfs/zpl_file_range.c index db387a748130..aad502a8092e 100644 --- a/module/os/linux/zfs/zpl_file_range.c +++ b/module/os/linux/zfs/zpl_file_range.c @@ -181,3 +181,82 @@ zpl_dedupe_file_range(struct file *src_file, loff_t src_off, return (-EOPNOTSUPP); } #endif /* HAVE_VFS_DEDUPE_FILE_RANGE */ + +/* Entry point for FICLONE, before Linux 4.5. */ +long +zpl_ioctl_ficlone(struct file *dst_file, void *arg) +{ + unsigned long sfd = (unsigned long)arg; + + struct file *src_file = fget(sfd); + if (src_file == NULL) + return (-EBADF); + + if (dst_file->f_op != src_file->f_op) + return (-EXDEV); + + size_t len = i_size_read(file_inode(src_file)); + + ssize_t ret = + __zpl_clone_file_range(src_file, 0, dst_file, 0, len); + + fput(src_file); + + if (ret < 0) { + if (ret == -EOPNOTSUPP) + return (-ENOTTY); + return (ret); + } + + if (ret != len) + return (-EINVAL); + + return (0); +} + +/* Entry point for FICLONERANGE, before Linux 4.5. 
*/ +long +zpl_ioctl_ficlonerange(struct file *dst_file, void __user *arg) +{ + zfs_ioc_compat_file_clone_range_t fcr; + + if (copy_from_user(&fcr, arg, sizeof (fcr))) + return (-EFAULT); + + struct file *src_file = fget(fcr.fcr_src_fd); + if (src_file == NULL) + return (-EBADF); + + if (dst_file->f_op != src_file->f_op) + return (-EXDEV); + + size_t len = fcr.fcr_src_length; + if (len == 0) + len = i_size_read(file_inode(src_file)) - fcr.fcr_src_offset; + + ssize_t ret = __zpl_clone_file_range(src_file, fcr.fcr_src_offset, + dst_file, fcr.fcr_dest_offset, len); + + fput(src_file); + + if (ret < 0) { + if (ret == -EOPNOTSUPP) + return (-ENOTTY); + return (ret); + } + + if (ret != len) + return (-EINVAL); + + return (0); +} + +/* Entry point for FIDEDUPERANGE, before Linux 4.5. */ +long +zpl_ioctl_fideduperange(struct file *filp, void *arg) +{ + (void) arg; + + /* No support for dedup yet */ + return (-ENOTTY); +} -- cgit v1.2.3 From 6b0a4be5fec60eb774c5393a0093150c608b7496 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Sun, 25 Jun 2023 20:50:19 +1000 Subject: linux: implement filesystem-side copy/clone functions for EL7 Redhat have backported copy_file_range and clone_file_range to the EL7 kernel using an "extended file operations" wrapper structure. This connects all that up to let cloning work there too. Reviewed-by: Brian Behlendorf Reviewed-by: Kay Pedersen Signed-off-by: Rob Norris Sponsored-By: OpenDrives Inc. Sponsored-By: Klara Inc. Closes #15050 --- config/kernel-vfs-extended-file_range.m4 | 50 ++++++++++++++++++++++++++++++++ config/kernel.m4 | 2 ++ include/os/linux/zfs/sys/zpl.h | 4 +++ module/os/linux/zfs/zfs_vfsops.c | 6 ++++ module/os/linux/zfs/zfs_znode.c | 8 +++++ module/os/linux/zfs/zpl_file.c | 16 ++++++++-- module/os/linux/zfs/zpl_file_range.c | 12 ++++---- 7 files changed, 90 insertions(+), 8 deletions(-) create mode 100644 config/kernel-vfs-extended-file_range.m4 diff --git a/config/kernel-vfs-extended-file_range.m4 b/config/kernel-vfs-extended-file_range.m4 new file mode 100644 index 000000000000..a2622313129e --- /dev/null +++ b/config/kernel-vfs-extended-file_range.m4 @@ -0,0 +1,50 @@ +dnl # +dnl # EL7 have backported copy_file_range and clone_file_range and +dnl # added them to an "extended" file_operations struct. +dnl # +dnl # We're testing for both functions in one here, because they will only +dnl # ever appear together and we don't want to match a similar method in +dnl # some future vendor kernel. 
+dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_FILE_OPERATIONS_EXTEND], [ + ZFS_LINUX_TEST_SRC([vfs_file_operations_extend], [ + #include + + static ssize_t test_copy_file_range(struct file *src_file, + loff_t src_off, struct file *dst_file, loff_t dst_off, + size_t len, unsigned int flags) { + (void) src_file; (void) src_off; + (void) dst_file; (void) dst_off; + (void) len; (void) flags; + return (0); + } + + static int test_clone_file_range(struct file *src_file, + loff_t src_off, struct file *dst_file, loff_t dst_off, + u64 len) { + (void) src_file; (void) src_off; + (void) dst_file; (void) dst_off; + (void) len; + return (0); + } + + static const struct file_operations_extend + fops __attribute__ ((unused)) = { + .kabi_fops = {}, + .copy_file_range = test_copy_file_range, + .clone_file_range = test_clone_file_range, + }; + ],[]) +]) +AC_DEFUN([ZFS_AC_KERNEL_VFS_FILE_OPERATIONS_EXTEND], [ + AC_MSG_CHECKING([whether file_operations_extend takes \ +.copy_file_range() and .clone_file_range()]) + ZFS_LINUX_TEST_RESULT([vfs_file_operations_extend], [ + AC_MSG_RESULT([yes]) + AC_DEFINE(HAVE_VFS_FILE_OPERATIONS_EXTEND, 1, + [file_operations_extend takes .copy_file_range() + and .clone_file_range()]) + ],[ + AC_MSG_RESULT([no]) + ]) +]) diff --git a/config/kernel.m4 b/config/kernel.m4 index b17ccfdeec92..1487fa2e7793 100644 --- a/config/kernel.m4 +++ b/config/kernel.m4 @@ -121,6 +121,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ ZFS_AC_KERNEL_SRC_VFS_REMAP_FILE_RANGE ZFS_AC_KERNEL_SRC_VFS_CLONE_FILE_RANGE ZFS_AC_KERNEL_SRC_VFS_DEDUPE_FILE_RANGE + ZFS_AC_KERNEL_SRC_VFS_FILE_OPERATIONS_EXTEND ZFS_AC_KERNEL_SRC_KMAP_ATOMIC_ARGS ZFS_AC_KERNEL_SRC_FOLLOW_DOWN_ONE ZFS_AC_KERNEL_SRC_MAKE_REQUEST_FN @@ -259,6 +260,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ ZFS_AC_KERNEL_VFS_REMAP_FILE_RANGE ZFS_AC_KERNEL_VFS_CLONE_FILE_RANGE ZFS_AC_KERNEL_VFS_DEDUPE_FILE_RANGE + ZFS_AC_KERNEL_VFS_FILE_OPERATIONS_EXTEND ZFS_AC_KERNEL_KMAP_ATOMIC_ARGS ZFS_AC_KERNEL_FOLLOW_DOWN_ONE ZFS_AC_KERNEL_MAKE_REQUEST_FN diff --git a/include/os/linux/zfs/sys/zpl.h b/include/os/linux/zfs/sys/zpl.h index b62ab5eec81f..0bd20f64897d 100644 --- a/include/os/linux/zfs/sys/zpl.h +++ b/include/os/linux/zfs/sys/zpl.h @@ -52,7 +52,11 @@ extern const struct inode_operations zpl_special_inode_operations; /* zpl_file.c */ extern const struct address_space_operations zpl_address_space_operations; +#ifdef HAVE_VFS_FILE_OPERATIONS_EXTEND +extern const struct file_operations_extend zpl_file_operations; +#else extern const struct file_operations zpl_file_operations; +#endif extern const struct file_operations zpl_dir_file_operations; /* zpl_super.c */ diff --git a/module/os/linux/zfs/zfs_vfsops.c b/module/os/linux/zfs/zfs_vfsops.c index 87c4e6dcaf7d..464c12e1108d 100644 --- a/module/os/linux/zfs/zfs_vfsops.c +++ b/module/os/linux/zfs/zfs_vfsops.c @@ -2092,6 +2092,9 @@ zfs_init(void) zfs_znode_init(); dmu_objset_register_type(DMU_OST_ZFS, zpl_get_file_info); register_filesystem(&zpl_fs_type); +#ifdef HAVE_VFS_FILE_OPERATIONS_EXTEND + register_fo_extend(&zpl_file_operations); +#endif } void @@ -2102,6 +2105,9 @@ zfs_fini(void) */ taskq_wait(system_delay_taskq); taskq_wait(system_taskq); +#ifdef HAVE_VFS_FILE_OPERATIONS_EXTEND + unregister_fo_extend(&zpl_file_operations); +#endif unregister_filesystem(&zpl_fs_type); zfs_znode_fini(); zfsctl_fini(); diff --git a/module/os/linux/zfs/zfs_znode.c b/module/os/linux/zfs/zfs_znode.c index 02b1af3edc4f..335ae3460c58 100644 --- a/module/os/linux/zfs/zfs_znode.c +++ b/module/os/linux/zfs/zfs_znode.c @@ -415,7 +415,11 @@ 
zfs_inode_set_ops(zfsvfs_t *zfsvfs, struct inode *ip) switch (ip->i_mode & S_IFMT) { case S_IFREG: ip->i_op = &zpl_inode_operations; +#ifdef HAVE_VFS_FILE_OPERATIONS_EXTEND + ip->i_fop = &zpl_file_operations.kabi_fops; +#else ip->i_fop = &zpl_file_operations; +#endif ip->i_mapping->a_ops = &zpl_address_space_operations; break; @@ -455,7 +459,11 @@ zfs_inode_set_ops(zfsvfs_t *zfsvfs, struct inode *ip) /* Assume the inode is a file and attempt to continue */ ip->i_mode = S_IFREG | 0644; ip->i_op = &zpl_inode_operations; +#ifdef HAVE_VFS_FILE_OPERATIONS_EXTEND + ip->i_fop = &zpl_file_operations.kabi_fops; +#else ip->i_fop = &zpl_file_operations; +#endif ip->i_mapping->a_ops = &zpl_address_space_operations; break; } diff --git a/module/os/linux/zfs/zpl_file.c b/module/os/linux/zfs/zpl_file.c index 87a248af8303..73526db731c4 100644 --- a/module/os/linux/zfs/zpl_file.c +++ b/module/os/linux/zfs/zpl_file.c @@ -1311,7 +1311,12 @@ const struct address_space_operations zpl_address_space_operations = { #endif }; +#ifdef HAVE_VFS_FILE_OPERATIONS_EXTEND +const struct file_operations_extend zpl_file_operations = { + .kabi_fops = { +#else const struct file_operations zpl_file_operations = { +#endif .open = zpl_open, .release = zpl_release, .llseek = zpl_llseek, @@ -1341,12 +1346,12 @@ const struct file_operations zpl_file_operations = { #ifdef HAVE_VFS_COPY_FILE_RANGE .copy_file_range = zpl_copy_file_range, #endif -#ifdef HAVE_VFS_REMAP_FILE_RANGE - .remap_file_range = zpl_remap_file_range, -#endif #ifdef HAVE_VFS_CLONE_FILE_RANGE .clone_file_range = zpl_clone_file_range, #endif +#ifdef HAVE_VFS_REMAP_FILE_RANGE + .remap_file_range = zpl_remap_file_range, +#endif #ifdef HAVE_VFS_DEDUPE_FILE_RANGE .dedupe_file_range = zpl_dedupe_file_range, #endif @@ -1357,6 +1362,11 @@ const struct file_operations zpl_file_operations = { #ifdef CONFIG_COMPAT .compat_ioctl = zpl_compat_ioctl, #endif +#ifdef HAVE_VFS_FILE_OPERATIONS_EXTEND + }, /* kabi_fops */ + .copy_file_range = zpl_copy_file_range, + .clone_file_range = zpl_clone_file_range, +#endif }; const struct file_operations zpl_dir_file_operations = { diff --git a/module/os/linux/zfs/zpl_file_range.c b/module/os/linux/zfs/zpl_file_range.c index aad502a8092e..18efebfc1dec 100644 --- a/module/os/linux/zfs/zpl_file_range.c +++ b/module/os/linux/zfs/zpl_file_range.c @@ -77,7 +77,8 @@ __zpl_clone_file_range(struct file *src_file, loff_t src_off, return ((ssize_t)len_o); } -#ifdef HAVE_VFS_COPY_FILE_RANGE +#if defined(HAVE_VFS_COPY_FILE_RANGE) || \ + defined(HAVE_VFS_FILE_OPERATIONS_EXTEND) /* * Entry point for copy_file_range(). Copy len bytes from src_off in src_file * to dst_off in dst_file. 
We are permitted to do this however we like, so we @@ -94,7 +95,7 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off, return (-EINVAL); /* Try to do it via zfs_clone_range() */ - ret =__zpl_clone_file_range(src_file, src_off, + ret = __zpl_clone_file_range(src_file, src_off, dst_file, dst_off, len); #ifdef HAVE_VFS_GENERIC_COPY_FILE_RANGE @@ -109,7 +110,7 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off, return (ret); } -#endif /* HAVE_VFS_COPY_FILE_RANGE */ +#endif /* HAVE_VFS_COPY_FILE_RANGE || HAVE_VFS_FILE_OPERATIONS_EXTEND */ #ifdef HAVE_VFS_REMAP_FILE_RANGE /* @@ -152,7 +153,8 @@ zpl_remap_file_range(struct file *src_file, loff_t src_off, } #endif /* HAVE_VFS_REMAP_FILE_RANGE */ -#ifdef HAVE_VFS_CLONE_FILE_RANGE +#if defined(HAVE_VFS_CLONE_FILE_RANGE) || \ + defined(HAVE_VFS_FILE_OPERATIONS_EXTEND) /* * Entry point for FICLONE and FICLONERANGE, before Linux 4.20. */ @@ -167,7 +169,7 @@ zpl_clone_file_range(struct file *src_file, loff_t src_off, return (__zpl_clone_file_range(src_file, src_off, dst_file, dst_off, len)); } -#endif /* HAVE_VFS_CLONE_FILE_RANGE */ +#endif /* HAVE_VFS_CLONE_FILE_RANGE || HAVE_VFS_FILE_OPERATIONS_EXTEND */ #ifdef HAVE_VFS_DEDUPE_FILE_RANGE /* -- cgit v1.2.3 From 48d0e9465de9571a5268f65b7446c693410e0220 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Tue, 11 Jul 2023 20:46:33 +1000 Subject: zts: block cloning tests Reviewed-by: Brian Behlendorf Reviewed-by: Kay Pedersen Signed-off-by: Rob Norris Sponsored-By: OpenDrives Inc. Sponsored-By: Klara Inc. Closes #15050 Closes #405 Closes #13349 --- tests/runfiles/linux.run | 9 + tests/test-runner/bin/zts-report.py.in | 14 + tests/zfs-tests/cmd/.gitignore | 1 + tests/zfs-tests/cmd/Makefile.am | 1 + tests/zfs-tests/cmd/clonefile.c | 333 +++++++++++++++++++++ tests/zfs-tests/include/commands.cfg | 1 + tests/zfs-tests/tests/Makefile.am | 12 + .../functional/block_cloning/block_cloning.kshlib | 46 +++ .../block_cloning/block_cloning_copyfilerange.ksh | 60 ++++ .../block_cloning_copyfilerange_cross_dataset.ksh | 65 ++++ .../block_cloning_copyfilerange_partial.ksh | 68 +++++ .../block_cloning_disabled_copyfilerange.ksh | 60 ++++ .../block_cloning_disabled_ficlone.ksh | 50 ++++ .../block_cloning_disabled_ficlonerange.ksh | 50 ++++ .../block_cloning/block_cloning_ficlone.ksh | 56 ++++ .../block_cloning/block_cloning_ficlonerange.ksh | 56 ++++ .../block_cloning_ficlonerange_partial.ksh | 64 ++++ .../tests/functional/block_cloning/cleanup.ksh | 34 +++ .../tests/functional/block_cloning/setup.ksh | 36 +++ 19 files changed, 1016 insertions(+) create mode 100644 tests/zfs-tests/cmd/clonefile.c create mode 100644 tests/zfs-tests/tests/functional/block_cloning/block_cloning.kshlib create mode 100755 tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange.ksh create mode 100755 tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_cross_dataset.ksh create mode 100755 tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_partial.ksh create mode 100755 tests/zfs-tests/tests/functional/block_cloning/block_cloning_disabled_copyfilerange.ksh create mode 100755 tests/zfs-tests/tests/functional/block_cloning/block_cloning_disabled_ficlone.ksh create mode 100755 tests/zfs-tests/tests/functional/block_cloning/block_cloning_disabled_ficlonerange.ksh create mode 100755 tests/zfs-tests/tests/functional/block_cloning/block_cloning_ficlone.ksh create mode 100755 tests/zfs-tests/tests/functional/block_cloning/block_cloning_ficlonerange.ksh create mode 100755 
tests/zfs-tests/tests/functional/block_cloning/block_cloning_ficlonerange_partial.ksh create mode 100755 tests/zfs-tests/tests/functional/block_cloning/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/block_cloning/setup.ksh diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index 618eeb934017..b68202d84924 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -34,6 +34,15 @@ tags = ['functional', 'acl', 'posix-sa'] tests = ['atime_003_pos', 'root_relatime_on'] tags = ['functional', 'atime'] +[tests/functional/block_cloning:Linux] +tests = ['block_cloning_copyfilerange', 'block_cloning_copyfilerange_partial', + 'block_cloning_ficlone', 'block_cloning_ficlonerange', + 'block_cloning_ficlonerange_partial', + 'block_cloning_disabled_copyfilerange', 'block_cloning_disabled_ficlone', + 'block_cloning_disabled_ficlonerange', + 'block_cloning_copyfilerange_cross_dataset'] +tags = ['functional', 'block_cloning'] + [tests/functional/chattr:Linux] tests = ['chattr_001_pos', 'chattr_002_neg'] tags = ['functional', 'chattr'] diff --git a/tests/test-runner/bin/zts-report.py.in b/tests/test-runner/bin/zts-report.py.in index cf438e0e6495..c9a2b4179aec 100755 --- a/tests/test-runner/bin/zts-report.py.in +++ b/tests/test-runner/bin/zts-report.py.in @@ -134,6 +134,12 @@ ci_reason = 'CI runner doesn\'t have all requirements' # idmap_reason = 'Idmapped mount needs kernel 5.12+' +# +# copy_file_range() is not supported by all kernels +# +cfr_reason = 'Kernel copy_file_range support required' +cfr_cross_reason = 'copy_file_range(2) cross-filesystem needs kernel 5.3+' + # # These tests are known to fail, thus we use this list to prevent these # failures from failing the job as a whole; only unexpected failures @@ -288,6 +294,14 @@ elif sys.platform.startswith('linux'): 'idmap_mount/idmap_mount_003': ['SKIP', idmap_reason], 'idmap_mount/idmap_mount_004': ['SKIP', idmap_reason], 'idmap_mount/idmap_mount_005': ['SKIP', idmap_reason], + 'block_cloning/block_cloning_disabled_copyfilerange': + ['SKIP', cfr_reason], + 'block_cloning/block_cloning_copyfilerange': + ['SKIP', cfr_reason], + 'block_cloning/block_cloning_copyfilerange_partial': + ['SKIP', cfr_reason], + 'block_cloning/block_cloning_copyfilerange_cross_dataset': + ['SKIP', cfr_cross_reason], }) diff --git a/tests/zfs-tests/cmd/.gitignore b/tests/zfs-tests/cmd/.gitignore index f68f58072818..5f53b687191a 100644 --- a/tests/zfs-tests/cmd/.gitignore +++ b/tests/zfs-tests/cmd/.gitignore @@ -1,6 +1,7 @@ /badsend /btree_test /chg_usr_exec +/clonefile /devname2devid /dir_rd_update /draid diff --git a/tests/zfs-tests/cmd/Makefile.am b/tests/zfs-tests/cmd/Makefile.am index 066abb6ce3b5..9bdb3c209756 100644 --- a/tests/zfs-tests/cmd/Makefile.am +++ b/tests/zfs-tests/cmd/Makefile.am @@ -119,6 +119,7 @@ scripts_zfs_tests_bin_PROGRAMS += %D%/renameat2 scripts_zfs_tests_bin_PROGRAMS += %D%/xattrtest scripts_zfs_tests_bin_PROGRAMS += %D%/zed_fd_spill-zedlet scripts_zfs_tests_bin_PROGRAMS += %D%/idmap_util +scripts_zfs_tests_bin_PROGRAMS += %D%/clonefile %C%_idmap_util_LDADD = libspl.la diff --git a/tests/zfs-tests/cmd/clonefile.c b/tests/zfs-tests/cmd/clonefile.c new file mode 100644 index 000000000000..a7e7277ae411 --- /dev/null +++ b/tests/zfs-tests/cmd/clonefile.c @@ -0,0 +1,333 @@ +/* + * SPDX-License-Identifier: MIT + * + * Copyright (c) 2023, Rob Norris + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal 
in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +/* + * This program is to test the availability and behaviour of copy_file_range, + * FICLONE, FICLONERANGE and FIDEDUPERANGE in the Linux kernel. It should + * compile and run even if these features aren't exposed through the libc. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef __NR_copy_file_range +#if defined(__x86_64__) +#define __NR_copy_file_range (326) +#elif defined(__i386__) +#define __NR_copy_file_range (377) +#elif defined(__s390__) +#define __NR_copy_file_range (375) +#elif defined(__arm__) +#define __NR_copy_file_range (391) +#elif defined(__aarch64__) +#define __NR_copy_file_range (285) +#elif defined(__powerpc__) +#define __NR_copy_file_range (379) +#else +#error "no definition of __NR_copy_file_range for this platform" +#endif +#endif /* __NR_copy_file_range */ + +ssize_t +copy_file_range(int, loff_t *, int, loff_t *, size_t, unsigned int) + __attribute__((weak)); + +static inline ssize_t +cf_copy_file_range(int sfd, loff_t *soff, int dfd, loff_t *doff, + size_t len, unsigned int flags) +{ + if (copy_file_range) + return (copy_file_range(sfd, soff, dfd, doff, len, flags)); + return ( + syscall(__NR_copy_file_range, sfd, soff, dfd, doff, len, flags)); +} + +/* Define missing FICLONE */ +#ifdef FICLONE +#define CF_FICLONE FICLONE +#else +#define CF_FICLONE _IOW(0x94, 9, int) +#endif + +/* Define missing FICLONERANGE and support structs */ +#ifdef FICLONERANGE +#define CF_FICLONERANGE FICLONERANGE +typedef struct file_clone_range cf_file_clone_range_t; +#else +typedef struct { + int64_t src_fd; + uint64_t src_offset; + uint64_t src_length; + uint64_t dest_offset; +} cf_file_clone_range_t; +#define CF_FICLONERANGE _IOW(0x94, 13, cf_file_clone_range_t) +#endif + +/* Define missing FIDEDUPERANGE and support structs */ +#ifdef FIDEDUPERANGE +#define CF_FIDEDUPERANGE FIDEDUPERANGE +#define CF_FILE_DEDUPE_RANGE_SAME FILE_DEDUPE_RANGE_SAME +#define CF_FILE_DEDUPE_RANGE_DIFFERS FILE_DEDUPE_RANGE_DIFFERS +typedef struct file_dedupe_range_info cf_file_dedupe_range_info_t; +typedef struct file_dedupe_range cf_file_dedupe_range_t; +#else +typedef struct { + int64_t dest_fd; + uint64_t dest_offset; + uint64_t bytes_deduped; + int32_t status; + uint32_t reserved; +} cf_file_dedupe_range_info_t; +typedef struct { + uint64_t src_offset; + uint64_t src_length; + uint16_t dest_count; + uint16_t reserved1; + uint32_t reserved2; + cf_file_dedupe_range_info_t info[0]; +} cf_file_dedupe_range_t; +#define CF_FIDEDUPERANGE _IOWR(0x94, 54, cf_file_dedupe_range_t) +#define CF_FILE_DEDUPE_RANGE_SAME 
(0) +#define CF_FILE_DEDUPE_RANGE_DIFFERS (1) +#endif + +typedef enum { + CF_MODE_NONE, + CF_MODE_CLONE, + CF_MODE_CLONERANGE, + CF_MODE_COPYFILERANGE, + CF_MODE_DEDUPERANGE, +} cf_mode_t; + +static int +usage(void) +{ + printf( + "usage:\n" + " FICLONE:\n" + " clonefile -c \n" + " FICLONERANGE:\n" + " clonefile -r \n" + " copy_file_range:\n" + " clonefile -f \n" + " FIDEDUPERANGE:\n" + " clonefile -d \n"); + return (1); +} + +int do_clone(int sfd, int dfd); +int do_clonerange(int sfd, int dfd, loff_t soff, loff_t doff, size_t len); +int do_copyfilerange(int sfd, int dfd, loff_t soff, loff_t doff, size_t len); +int do_deduperange(int sfd, int dfd, loff_t soff, loff_t doff, size_t len); + +int quiet = 0; + +int +main(int argc, char **argv) +{ + cf_mode_t mode = CF_MODE_NONE; + + char c; + while ((c = getopt(argc, argv, "crfdq")) != -1) { + switch (c) { + case 'c': + mode = CF_MODE_CLONE; + break; + case 'r': + mode = CF_MODE_CLONERANGE; + break; + case 'f': + mode = CF_MODE_COPYFILERANGE; + break; + case 'd': + mode = CF_MODE_DEDUPERANGE; + break; + case 'q': + quiet = 1; + break; + } + } + + if (mode == CF_MODE_NONE || (argc-optind) < 2 || + (mode != CF_MODE_CLONE && (argc-optind) < 5)) + return (usage()); + + loff_t soff = 0, doff = 0; + size_t len = 0; + if (mode != CF_MODE_CLONE) { + soff = strtoull(argv[optind+2], NULL, 10); + if (soff == ULLONG_MAX) { + fprintf(stderr, "invalid source offset"); + return (1); + } + doff = strtoull(argv[optind+3], NULL, 10); + if (doff == ULLONG_MAX) { + fprintf(stderr, "invalid dest offset"); + return (1); + } + len = strtoull(argv[optind+4], NULL, 10); + if (len == ULLONG_MAX) { + fprintf(stderr, "invalid length"); + return (1); + } + } + + int sfd = open(argv[optind], O_RDONLY); + if (sfd < 0) { + fprintf(stderr, "open: %s: %s\n", + argv[optind], strerror(errno)); + return (1); + } + + int dfd = open(argv[optind+1], O_WRONLY|O_CREAT, + S_IRUSR|S_IWUSR|S_IRGRP|S_IROTH); + if (sfd < 0) { + fprintf(stderr, "open: %s: %s\n", + argv[optind+1], strerror(errno)); + close(sfd); + return (1); + } + + int err; + switch (mode) { + case CF_MODE_CLONE: + err = do_clone(sfd, dfd); + break; + case CF_MODE_CLONERANGE: + err = do_clonerange(sfd, dfd, soff, doff, len); + break; + case CF_MODE_COPYFILERANGE: + err = do_copyfilerange(sfd, dfd, soff, doff, len); + break; + case CF_MODE_DEDUPERANGE: + err = do_deduperange(sfd, dfd, soff, doff, len); + break; + default: + abort(); + } + + off_t spos = lseek(sfd, 0, SEEK_CUR); + off_t slen = lseek(sfd, 0, SEEK_END); + off_t dpos = lseek(dfd, 0, SEEK_CUR); + off_t dlen = lseek(dfd, 0, SEEK_END); + + fprintf(stderr, "file offsets: src=%lu/%lu; dst=%lu/%lu\n", spos, slen, + dpos, dlen); + + close(dfd); + close(sfd); + + return (err == 0 ? 
0 : 1); +} + +int +do_clone(int sfd, int dfd) +{ + fprintf(stderr, "using FICLONE\n"); + int err = ioctl(dfd, CF_FICLONE, sfd); + if (err < 0) { + fprintf(stderr, "ioctl(FICLONE): %s\n", strerror(errno)); + return (err); + } + return (0); +} + +int +do_clonerange(int sfd, int dfd, loff_t soff, loff_t doff, size_t len) +{ + fprintf(stderr, "using FICLONERANGE\n"); + cf_file_clone_range_t fcr = { + .src_fd = sfd, + .src_offset = soff, + .src_length = len, + .dest_offset = doff, + }; + int err = ioctl(dfd, CF_FICLONERANGE, &fcr); + if (err < 0) { + fprintf(stderr, "ioctl(FICLONERANGE): %s\n", strerror(errno)); + return (err); + } + return (0); +} + +int +do_copyfilerange(int sfd, int dfd, loff_t soff, loff_t doff, size_t len) +{ + fprintf(stderr, "using copy_file_range\n"); + ssize_t copied = cf_copy_file_range(sfd, &soff, dfd, &doff, len, 0); + if (copied < 0) { + fprintf(stderr, "copy_file_range: %s\n", strerror(errno)); + return (1); + } + if (copied != len) { + fprintf(stderr, "copy_file_range: copied less than requested: " + "requested=%lu; copied=%lu\n", len, copied); + return (1); + } + return (0); +} + +int +do_deduperange(int sfd, int dfd, loff_t soff, loff_t doff, size_t len) +{ + fprintf(stderr, "using FIDEDUPERANGE\n"); + + char buf[sizeof (cf_file_dedupe_range_t)+ + sizeof (cf_file_dedupe_range_info_t)] = {0}; + cf_file_dedupe_range_t *fdr = (cf_file_dedupe_range_t *)&buf[0]; + cf_file_dedupe_range_info_t *fdri = + (cf_file_dedupe_range_info_t *) + &buf[sizeof (cf_file_dedupe_range_t)]; + + fdr->src_offset = soff; + fdr->src_length = len; + fdr->dest_count = 1; + + fdri->dest_fd = dfd; + fdri->dest_offset = doff; + + int err = ioctl(sfd, CF_FIDEDUPERANGE, fdr); + if (err != 0) + fprintf(stderr, "ioctl(FIDEDUPERANGE): %s\n", strerror(errno)); + + if (fdri->status < 0) { + fprintf(stderr, "dedup failed: %s\n", strerror(-fdri->status)); + err = -1; + } else if (fdri->status == CF_FILE_DEDUPE_RANGE_DIFFERS) { + fprintf(stderr, "dedup failed: range differs\n"); + err = -1; + } + + return (err); +} diff --git a/tests/zfs-tests/include/commands.cfg b/tests/zfs-tests/include/commands.cfg index b3cfe149ffa7..fa545e06bbf3 100644 --- a/tests/zfs-tests/include/commands.cfg +++ b/tests/zfs-tests/include/commands.cfg @@ -182,6 +182,7 @@ export ZFS_FILES='zdb export ZFSTEST_FILES='badsend btree_test chg_usr_exec + clonefile devname2devid dir_rd_update draid diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index ff65dc1ac2b0..0819cb6b576e 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -90,6 +90,7 @@ nobase_dist_datadir_zfs_tests_tests_DATA += \ functional/alloc_class/alloc_class.kshlib \ functional/atime/atime.cfg \ functional/atime/atime_common.kshlib \ + functional/block_cloning/block_cloning.kshlib \ functional/cache/cache.cfg \ functional/cache/cache.kshlib \ functional/cachefile/cachefile.cfg \ @@ -437,6 +438,17 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/atime/root_atime_on.ksh \ functional/atime/root_relatime_on.ksh \ functional/atime/setup.ksh \ + functional/block_cloning/cleanup.ksh \ + functional/block_cloning/setup.ksh \ + functional/block_cloning/block_cloning_copyfilerange_cross_dataset.ksh \ + functional/block_cloning/block_cloning_copyfilerange.ksh \ + functional/block_cloning/block_cloning_copyfilerange_partial.ksh \ + functional/block_cloning/block_cloning_disabled_copyfilerange.ksh \ + functional/block_cloning/block_cloning_disabled_ficlone.ksh \ + 
functional/block_cloning/block_cloning_disabled_ficlonerange.ksh \ + functional/block_cloning/block_cloning_ficlone.ksh \ + functional/block_cloning/block_cloning_ficlonerange.ksh \ + functional/block_cloning/block_cloning_ficlonerange_partial.ksh \ functional/bootfs/bootfs_001_pos.ksh \ functional/bootfs/bootfs_002_neg.ksh \ functional/bootfs/bootfs_003_pos.ksh \ diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning.kshlib b/tests/zfs-tests/tests/functional/block_cloning/block_cloning.kshlib new file mode 100644 index 000000000000..9998e5a87bfe --- /dev/null +++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning.kshlib @@ -0,0 +1,46 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2023, Klara Inc. +# + +. $STF_SUITE/include/libtest.shlib + +function have_same_content +{ + typeset hash1=$(cat $1 | md5sum) + typeset hash2=$(cat $2 | md5sum) + + log_must [ "$hash1" = "$hash2" ] +} + +function unique_blocks +{ + typeset zdbout=${TMPDIR:-$TEST_BASE_DIR}/zdbout.$$ + zdb -vvvvv $1 -O $2 | \ + awk '/ L0 / { print ++l " " $3 " " $7 }' > $zdbout.a + zdb -vvvvv $3 -O $4 | \ + awk '/ L0 / { print ++l " " $3 " " $7 }' > $zdbout.b + echo $(sort $zdbout.a $zdbout.b | uniq -d | cut -f1 -d' ') +} + diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange.ksh b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange.ksh new file mode 100755 index 000000000000..9adcbfcd88a1 --- /dev/null +++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange.ksh @@ -0,0 +1,60 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2023, Klara Inc. +# + +. $STF_SUITE/include/libtest.shlib +. 
$STF_SUITE/tests/functional/block_cloning/block_cloning.kshlib + +verify_runnable "global" + +if [[ $(linux_version) -lt $(linux_version "4.5") ]]; then + log_unsupported "copy_file_range not available before Linux 4.5" +fi + +claim="The copy_file_range syscall can clone whole files." + +log_assert $claim + +function cleanup +{ + datasetexists $TESTPOOL && destroy_pool $TESTPOOL +} + +log_onexit cleanup + +log_must zpool create -o feature@block_cloning=enabled $TESTPOOL $DISKS + +log_must dd if=/dev/urandom of=/$TESTPOOL/file1 bs=128K count=4 +log_must sync_pool $TESTPOOL + +log_must clonefile -f /$TESTPOOL/file1 /$TESTPOOL/file2 0 0 524288 +log_must sync_pool $TESTPOOL + +log_must have_same_content /$TESTPOOL/file1 /$TESTPOOL/file2 + +typeset blocks=$(unique_blocks $TESTPOOL file1 $TESTPOOL file2) +log_must [ "$blocks" = "1 2 3 4" ] + +log_pass $claim diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_cross_dataset.ksh b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_cross_dataset.ksh new file mode 100755 index 000000000000..07e089e89ceb --- /dev/null +++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_cross_dataset.ksh @@ -0,0 +1,65 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2023, Klara Inc. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/block_cloning/block_cloning.kshlib + +verify_runnable "global" + +if [[ $(linux_version) -lt $(linux_version "5.3") ]]; then + log_unsupported "copy_file_range can't copy cross-filesystem before Linux 5.3" +fi + +claim="The copy_file_range syscall can clone across datasets." 
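+# (The clone below copies 524288 bytes, i.e. 4 x 128K, the whole of
+# file1, so on success file1 and file2 should share all four L0 blocks;
+# hence the "1 2 3 4" check at the end of the test.)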
+ +log_assert $claim + +function cleanup +{ + datasetexists $TESTPOOL && destroy_pool $TESTPOOL +} + +log_onexit cleanup + +log_must zpool create -o feature@block_cloning=enabled $TESTPOOL $DISKS + +log_must zfs create $TESTPOOL/$TESTFS1 +log_must zfs create $TESTPOOL/$TESTFS2 + +log_must dd if=/dev/urandom of=/$TESTPOOL/$TESTFS1/file1 bs=128K count=4 +log_must sync_pool $TESTPOOL + +log_must \ + clonefile -f /$TESTPOOL/$TESTFS1/file1 /$TESTPOOL/$TESTFS2/file2 0 0 524288 +log_must sync_pool $TESTPOOL + +log_must have_same_content /$TESTPOOL/$TESTFS1/file1 /$TESTPOOL/$TESTFS2/file2 + +typeset blocks=$(unique_blocks \ + $TESTPOOL/$TESTFS1 file1 $TESTPOOL/$TESTFS2 file2) +log_must [ "$blocks" = "1 2 3 4" ] + +log_pass $claim diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_partial.ksh b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_partial.ksh new file mode 100755 index 000000000000..ecac62b20350 --- /dev/null +++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_partial.ksh @@ -0,0 +1,68 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2023, Klara Inc. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/block_cloning/block_cloning.kshlib + +verify_runnable "global" + +if [[ $(linux_version) -lt $(linux_version "4.5") ]]; then + log_unsupported "copy_file_range not available before Linux 4.5" +fi + +claim="The copy_file_range syscall can clone parts of a file." 
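+# (The test first copies file1 to file2 with dd and verifies no blocks
+# are shared, then clones 262144 bytes at offset 131072 in both files,
+# i.e. the second and third 128K blocks; hence the "2 3" check.)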
+ +log_assert $claim + +function cleanup +{ + datasetexists $TESTPOOL && destroy_pool $TESTPOOL +} + +log_onexit cleanup + +log_must zpool create -o feature@block_cloning=enabled $TESTPOOL $DISKS + +log_must dd if=/dev/urandom of=/$TESTPOOL/file1 bs=128K count=4 +log_must sync_pool $TESTPOOL + +log_must dd if=/$TESTPOOL/file1 of=/$TESTPOOL/file2 bs=128K count=4 +log_must sync_pool $TESTPOOL + +log_must have_same_content /$TESTPOOL/file1 /$TESTPOOL/file2 + +typeset blocks=$(unique_blocks $TESTPOOL file1 $TESTPOOL file2) +log_must [ "$blocks" = "" ] + +log_must clonefile -f /$TESTPOOL/file1 /$TESTPOOL/file2 131072 131072 262144 +log_must sync_pool $TESTPOOL + +log_must have_same_content /$TESTPOOL/file1 /$TESTPOOL/file2 + +typeset blocks=$(unique_blocks $TESTPOOL file1 $TESTPOOL file2) +log_must [ "$blocks" = "2 3" ] + +log_pass $claim diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_disabled_copyfilerange.ksh b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_disabled_copyfilerange.ksh new file mode 100755 index 000000000000..30b155a140c4 --- /dev/null +++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_disabled_copyfilerange.ksh @@ -0,0 +1,60 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2023, Klara Inc. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/block_cloning/block_cloning.kshlib + +verify_runnable "global" + +if [[ $(linux_version) -lt $(linux_version "4.5") ]]; then + log_unsupported "copy_file_range not available before Linux 4.5" +fi + +claim="The copy_file_range syscall copies files when block cloning is disabled." + +log_assert $claim + +function cleanup +{ + datasetexists $TESTPOOL && destroy_pool $TESTPOOL +} + +log_onexit cleanup + +log_must zpool create -o feature@block_cloning=disabled $TESTPOOL $DISKS + +log_must dd if=/dev/urandom of=/$TESTPOOL/file1 bs=128K count=4 +log_must sync_pool $TESTPOOL + +log_must clonefile -f /$TESTPOOL/file1 /$TESTPOOL/file2 0 0 524288 +log_must sync_pool $TESTPOOL + +log_must have_same_content /$TESTPOOL/file1 /$TESTPOOL/file2 + +typeset blocks=$(unique_blocks $TESTPOOL file1 $TESTPOOL file2) +log_must [ "$blocks" = "" ] + +log_pass $claim diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_disabled_ficlone.ksh b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_disabled_ficlone.ksh new file mode 100755 index 000000000000..10a2715ea253 --- /dev/null +++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_disabled_ficlone.ksh @@ -0,0 +1,50 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). 
+# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2023, Klara Inc. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/block_cloning/block_cloning.kshlib + +verify_runnable "global" + +claim="The FICLONE ioctl fails when block cloning is disabled." + +log_assert $claim + +function cleanup +{ + datasetexists $TESTPOOL && destroy_pool $TESTPOOL +} + +log_onexit cleanup + +log_must zpool create -o feature@block_cloning=disabled $TESTPOOL $DISKS + +log_must dd if=/dev/urandom of=/$TESTPOOL/file1 bs=128K count=4 +log_must sync_pool $TESTPOOL + +log_mustnot clonefile -c /$TESTPOOL/file1 /$TESTPOOL/file2 + +log_pass $claim diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_disabled_ficlonerange.ksh b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_disabled_ficlonerange.ksh new file mode 100755 index 000000000000..e8461e6d3c38 --- /dev/null +++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_disabled_ficlonerange.ksh @@ -0,0 +1,50 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2023, Klara Inc. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/block_cloning/block_cloning.kshlib + +verify_runnable "global" + +claim="The FICLONERANGE ioctl fails when block cloning is disabled." 
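+# (With feature@block_cloning=disabled the FICLONERANGE ioctl has no
+# copy fallback, so the clonefile -r call below is expected to fail;
+# log_mustnot asserts exactly that.)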
+ +log_assert $claim + +function cleanup +{ + datasetexists $TESTPOOL && destroy_pool $TESTPOOL +} + +log_onexit cleanup + +log_must zpool create -o feature@block_cloning=disabled $TESTPOOL $DISKS + +log_must dd if=/dev/urandom of=/$TESTPOOL/file1 bs=128K count=4 +log_must sync_pool $TESTPOOL + +log_mustnot clonefile -r /$TESTPOOL/file1 /$TESTPOOL/file2 0 0 524288 + +log_pass $claim diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_ficlone.ksh b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_ficlone.ksh new file mode 100755 index 000000000000..d13a39229870 --- /dev/null +++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_ficlone.ksh @@ -0,0 +1,56 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2023, Klara Inc. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/block_cloning/block_cloning.kshlib + +verify_runnable "global" + +claim="The FICLONE ioctl can clone files." + +log_assert $claim + +function cleanup +{ + datasetexists $TESTPOOL && destroy_pool $TESTPOOL +} + +log_onexit cleanup + +log_must zpool create -o feature@block_cloning=enabled $TESTPOOL $DISKS + +log_must dd if=/dev/urandom of=/$TESTPOOL/file1 bs=128K count=4 +log_must sync_pool $TESTPOOL + +log_must clonefile -c /$TESTPOOL/file1 /$TESTPOOL/file2 +log_must sync_pool $TESTPOOL + +log_must have_same_content /$TESTPOOL/file1 /$TESTPOOL/file2 + +typeset blocks=$(unique_blocks $TESTPOOL file1 $TESTPOOL file2) +log_must [ "$blocks" = "1 2 3 4" ] + +log_pass $claim diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_ficlonerange.ksh b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_ficlonerange.ksh new file mode 100755 index 000000000000..6556050c4352 --- /dev/null +++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_ficlonerange.ksh @@ -0,0 +1,56 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2023, Klara Inc. 
+# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/block_cloning/block_cloning.kshlib + +verify_runnable "global" + +claim="The FICLONERANGE ioctl can clone whole files." + +log_assert $claim + +function cleanup +{ + datasetexists $TESTPOOL && destroy_pool $TESTPOOL +} + +log_onexit cleanup + +log_must zpool create -o feature@block_cloning=enabled $TESTPOOL $DISKS + +log_must dd if=/dev/urandom of=/$TESTPOOL/file1 bs=128K count=4 +log_must sync_pool $TESTPOOL + +log_must clonefile -r /$TESTPOOL/file1 /$TESTPOOL/file2 0 0 524288 +log_must sync_pool $TESTPOOL + +log_must have_same_content /$TESTPOOL/file1 /$TESTPOOL/file2 + +typeset blocks=$(unique_blocks $TESTPOOL file1 $TESTPOOL file2) +log_must [ "$blocks" = "1 2 3 4" ] + +log_pass $claim diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_ficlonerange_partial.ksh b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_ficlonerange_partial.ksh new file mode 100755 index 000000000000..37a3511a26d5 --- /dev/null +++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_ficlonerange_partial.ksh @@ -0,0 +1,64 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2023, Klara Inc. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/block_cloning/block_cloning.kshlib + +verify_runnable "global" + +claim="The FICLONERANGE ioctl can clone parts of a file." + +log_assert $claim + +function cleanup +{ + datasetexists $TESTPOOL && destroy_pool $TESTPOOL +} + +log_onexit cleanup + +log_must zpool create -o feature@block_cloning=enabled $TESTPOOL $DISKS + +log_must dd if=/dev/urandom of=/$TESTPOOL/file1 bs=128K count=4 +log_must sync_pool $TESTPOOL + +log_must dd if=/$TESTPOOL/file1 of=/$TESTPOOL/file2 bs=128K count=4 +log_must sync_pool $TESTPOOL + +log_must have_same_content /$TESTPOOL/file1 /$TESTPOOL/file2 + +typeset blocks=$(unique_blocks $TESTPOOL file1 $TESTPOOL file2) +log_must [ "$blocks" = "" ] + +log_must clonefile -r /$TESTPOOL/file1 /$TESTPOOL/file2 131072 131072 262144 +log_must sync_pool $TESTPOOL + +log_must have_same_content /$TESTPOOL/file1 /$TESTPOOL/file2 + +typeset blocks=$(unique_blocks $TESTPOOL file1 $TESTPOOL file2) +log_must [ "$blocks" = "2 3" ] + +log_pass $claim diff --git a/tests/zfs-tests/tests/functional/block_cloning/cleanup.ksh b/tests/zfs-tests/tests/functional/block_cloning/cleanup.ksh new file mode 100755 index 000000000000..7ac13adb6325 --- /dev/null +++ b/tests/zfs-tests/tests/functional/block_cloning/cleanup.ksh @@ -0,0 +1,34 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). 
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2023, Klara Inc.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/block_cloning/block_cloning.kshlib
+
+verify_runnable "global"
+
+default_cleanup_noexit
+
+log_pass
diff --git a/tests/zfs-tests/tests/functional/block_cloning/setup.ksh b/tests/zfs-tests/tests/functional/block_cloning/setup.ksh
new file mode 100755
index 000000000000..512f5a0644df
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/block_cloning/setup.ksh
@@ -0,0 +1,36 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2023, Klara Inc.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/block_cloning/block_cloning.kshlib
+
+if ! command -v clonefile > /dev/null ; then
+	log_unsupported "clonefile program required to test block cloning"
+fi
+
+verify_runnable "global"
+
+log_pass
-- cgit v1.2.3
From 2848de11e516a2ef2e6baa574a60d77a6fb47023 Mon Sep 17 00:00:00 2001
From: Alexander Motin
Date: Tue, 25 Jul 2023 12:08:36 -0400
Subject: Remove zl_issuer_lock from zil_suspend().

This locking was recently added as part of #14979, but it appears to be
illegal to take zl_issuer_lock while holding dp_config_rwlock, taken by
dsl_pool_hold(): it causes a deadlock with the sync thread in
spa_sync_upgrades(). On second thought, we should not need this locking,
since zil_commit_impl(), which we call below, takes zl_issuer_lock, and
that should sufficiently protect zl_suspend reads when combined with the
other logic from #14979.

Reviewed-by: Brian Behlendorf
Signed-off-by: Alexander Motin
Sponsored by: iXsystems, Inc.
Closes #15103 --- module/zfs/zil.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/module/zfs/zil.c b/module/zfs/zil.c index af7137faaccf..be5b9edf6ede 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -3941,13 +3941,11 @@ zil_suspend(const char *osname, void **cookiep) return (error); zilog = dmu_objset_zil(os); - mutex_enter(&zilog->zl_issuer_lock); mutex_enter(&zilog->zl_lock); zh = zilog->zl_header; if (zh->zh_flags & ZIL_REPLAY_NEEDED) { /* unplayed log */ mutex_exit(&zilog->zl_lock); - mutex_exit(&zilog->zl_issuer_lock); dmu_objset_rele(os, suspend_tag); return (SET_ERROR(EBUSY)); } @@ -3961,7 +3959,6 @@ zil_suspend(const char *osname, void **cookiep) if (cookiep == NULL && !zilog->zl_suspending && (zilog->zl_suspend > 0 || BP_IS_HOLE(&zh->zh_log))) { mutex_exit(&zilog->zl_lock); - mutex_exit(&zilog->zl_issuer_lock); dmu_objset_rele(os, suspend_tag); return (0); } @@ -3970,7 +3967,6 @@ zil_suspend(const char *osname, void **cookiep) dsl_pool_rele(dmu_objset_pool(os), suspend_tag); zilog->zl_suspend++; - mutex_exit(&zilog->zl_issuer_lock); if (zilog->zl_suspend > 1) { /* -- cgit v1.2.3 From 782312c612777029ac7955711a67a5f916777bc4 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Tue, 25 Jul 2023 13:55:29 -0700 Subject: zed: Reduce log noise for large JBODs For large JBODs the log message "zfs_iter_vdev: no match" can account for the bulk of the log messages (over 70%). Since this message is purely informational and not that useful we remove it. Reviewed-by: Olaf Faaland Reviewed-by: Brian Atkinson Signed-off-by: Brian Behlendorf Closes #15086 Closes #15094 --- cmd/zed/agents/zfs_mod.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/cmd/zed/agents/zfs_mod.c b/cmd/zed/agents/zfs_mod.c index b07a02712295..a8d084bb4bd3 100644 --- a/cmd/zed/agents/zfs_mod.c +++ b/cmd/zed/agents/zfs_mod.c @@ -607,8 +607,6 @@ zfs_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *data) */ if (nvlist_lookup_string(nvl, dp->dd_prop, &path) != 0 || strcmp(dp->dd_compare, path) != 0) { - zed_log_msg(LOG_INFO, " %s: no match (%s != vdev %s)", - __func__, dp->dd_compare, path); return; } if (dp->dd_new_vdev_guid != 0 && dp->dd_new_vdev_guid != guid) { -- cgit v1.2.3 From 704c80f0487199a00c3ce5eba1b2c1ee7854c900 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Thu, 27 Jul 2023 12:07:09 -0400 Subject: Avoid waiting in dmu_sync_late_arrival(). The transaction there does not produce any dirty data or log blocks, so it should not be throttled. All other cases wait for TXG sync, by which time the log block we are writing will be obsolete, so we can skip waiting and just return error here instead. Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #15096 --- module/zfs/dmu.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 3a4560cec2c4..078811dbf4e3 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -1656,7 +1656,13 @@ dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd, tx = dmu_tx_create(os); dmu_tx_hold_space(tx, zgd->zgd_db->db_size); - if (dmu_tx_assign(tx, TXG_WAIT) != 0) { + /* + * This transaction does not produce any dirty data or log blocks, so + * it should not be throttled. All other cases wait for TXG sync, by + * which time the log block we are writing will be obsolete, so we can + * skip waiting and just return error here instead. 
+	 */
+	if (dmu_tx_assign(tx, TXG_NOWAIT | TXG_NOTHROTTLE) != 0) {
 		dmu_tx_abort(tx);
 		/* Make zl_get_data do txg_waited_synced() */
 		return (SET_ERROR(EIO));
-- cgit v1.2.3
From 5bdfff5cfc8baff48b3b59a577e7ef756a011024 Mon Sep 17 00:00:00 2001
From: oromenahar
Date: Thu, 27 Jul 2023 20:32:34 +0200
Subject: BRT should return EOPNOTSUPP

Return the more descriptive EOPNOTSUPP instead of EXDEV when the
storage pool doesn't support block cloning.

Reviewed-by: Brian Behlendorf
Reviewed-by: Rob Norris
Signed-off-by: Kay Pedersen
Closes #15097
---
 module/os/freebsd/zfs/zfs_vnops_os.c |  2 +-
 module/zfs/zfs_vnops.c               | 16 ++++++++++------
 2 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/module/os/freebsd/zfs/zfs_vnops_os.c b/module/os/freebsd/zfs/zfs_vnops_os.c
index 7692200ab250..45cf6fdfc409 100644
--- a/module/os/freebsd/zfs/zfs_vnops_os.c
+++ b/module/os/freebsd/zfs/zfs_vnops_os.c
@@ -6290,7 +6290,7 @@ zfs_freebsd_copy_file_range(struct vop_copy_file_range_args *ap)
 	error = zfs_clone_range(VTOZ(invp), ap->a_inoffp, VTOZ(outvp),
 	    ap->a_outoffp, &len, ap->a_outcred);
-	if (error == EXDEV)
+	if (error == EXDEV || error == EOPNOTSUPP)
 		goto bad_locked_fallback;
 	*ap->a_lenp = (size_t)len;
 out_locked:
diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c
index 3ebd2d0ff7c5..54ea43363bfc 100644
--- a/module/zfs/zfs_vnops.c
+++ b/module/zfs/zfs_vnops.c
@@ -1078,6 +1078,16 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp,
 		return (SET_ERROR(EXDEV));
 	}

+	/*
+	 * outos and inos belongs to the same storage pool.
+	 * see a few lines above, only one check.
+	 */
+	if (!spa_feature_is_enabled(dmu_objset_spa(outos),
+	    SPA_FEATURE_BLOCK_CLONING)) {
+		zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
+		return (SET_ERROR(EOPNOTSUPP));
+	}
+
 	ASSERT(!outzfsvfs->z_replay);

 	error = zfs_verify_zp(inzp);
@@ -1088,12 +1098,6 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp,
 		return (error);
 	}

-	if (!spa_feature_is_enabled(dmu_objset_spa(outos),
-	    SPA_FEATURE_BLOCK_CLONING)) {
-		zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
-		return (SET_ERROR(EXDEV));
-	}
-
 	/*
 	 * We don't copy source file's flags that's why we don't allow to clone
 	 * files that are in quarantine.
-- cgit v1.2.3
From b22bab2547b74b5df5e4698880e7d027b1d44509 Mon Sep 17 00:00:00 2001
From: Alexander Motin
Date: Fri, 28 Jul 2023 16:30:33 -0400
Subject: Remove fastwrite mechanism.

Fastwrite was introduced many years ago to improve the spread of ZIL
writes between multiple top-level vdevs by tracking the number of
allocated but not yet written blocks and choosing the vdev with the
smaller count. It was supposed to reduce the ZIL's knowledge of
allocation, but it actually made the ZIL report allocations to the
allocation code even more actively, complicating both the ZIL and the
metaslab code. On top of that, it seems the ZIO_FLAG_FASTWRITE setting
in dmu_sync() was lost many years ago, and that was one of the declared
benefits. In addition, the introduction of the embedded log metaslab
class solved another problem, the allocation rotor accounting for both
normal and log allocations, since in most cases those are now in
different metaslab classes. After all that, I'd prefer to simplify the
already too complicated ZIL, ZIO and metaslab code if the benefit of
the complexity is not obvious.

Reviewed-by: Brian Behlendorf
Reviewed-by: George Wilson
Signed-off-by: Alexander Motin
Sponsored by: iXsystems, Inc.
Closes #15107 --- include/sys/metaslab.h | 3 --- include/sys/vdev_impl.h | 1 - include/sys/zil_impl.h | 1 - include/sys/zio.h | 1 - module/zfs/metaslab.c | 67 ++----------------------------------------------- module/zfs/vdev.c | 2 -- module/zfs/zil.c | 42 ++++--------------------------- module/zfs/zio.c | 14 +---------- 8 files changed, 8 insertions(+), 123 deletions(-) diff --git a/include/sys/metaslab.h b/include/sys/metaslab.h index fec080139a2b..0df6e5f81fc1 100644 --- a/include/sys/metaslab.h +++ b/include/sys/metaslab.h @@ -80,7 +80,6 @@ uint64_t metaslab_largest_allocatable(metaslab_t *); #define METASLAB_ASYNC_ALLOC 0x8 #define METASLAB_DONT_THROTTLE 0x10 #define METASLAB_MUST_RESERVE 0x20 -#define METASLAB_FASTWRITE 0x40 #define METASLAB_ZIL 0x80 int metaslab_alloc(spa_t *, metaslab_class_t *, uint64_t, @@ -96,8 +95,6 @@ void metaslab_unalloc_dva(spa_t *, const dva_t *, uint64_t); int metaslab_claim(spa_t *, const blkptr_t *, uint64_t); int metaslab_claim_impl(vdev_t *, uint64_t, uint64_t, uint64_t); void metaslab_check_free(spa_t *, const blkptr_t *); -void metaslab_fastwrite_mark(spa_t *, const blkptr_t *); -void metaslab_fastwrite_unmark(spa_t *, const blkptr_t *); void metaslab_stat_init(void); void metaslab_stat_fini(void); diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index 5f4e82ad8657..ad9dc3aefd8e 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -266,7 +266,6 @@ struct vdev { metaslab_group_t *vdev_mg; /* metaslab group */ metaslab_group_t *vdev_log_mg; /* embedded slog metaslab group */ metaslab_t **vdev_ms; /* metaslab array */ - uint64_t vdev_pending_fastwrite; /* allocated fastwrites */ txg_list_t vdev_ms_list; /* per-txg dirty metaslab lists */ txg_list_t vdev_dtl_list; /* per-txg dirty DTL lists */ txg_node_t vdev_txg_node; /* per-txg dirty vdev linkage */ diff --git a/include/sys/zil_impl.h b/include/sys/zil_impl.h index 03a409c5257c..b58dad9695a6 100644 --- a/include/sys/zil_impl.h +++ b/include/sys/zil_impl.h @@ -91,7 +91,6 @@ typedef enum { typedef struct lwb { zilog_t *lwb_zilog; /* back pointer to log struct */ blkptr_t lwb_blk; /* on disk address of this log blk */ - boolean_t lwb_fastwrite; /* is blk marked for fastwrite? 
*/ boolean_t lwb_slog; /* lwb_blk is on SLOG device */ boolean_t lwb_indirect; /* do not postpone zil_lwb_commit() */ int lwb_nused; /* # used bytes in buffer */ diff --git a/include/sys/zio.h b/include/sys/zio.h index f4da80783e56..e1f4d5c04499 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -222,7 +222,6 @@ typedef uint64_t zio_flag_t; #define ZIO_FLAG_NOPWRITE (1ULL << 28) #define ZIO_FLAG_REEXECUTED (1ULL << 29) #define ZIO_FLAG_DELEGATED (1ULL << 30) -#define ZIO_FLAG_FASTWRITE (1ULL << 31) #define ZIO_FLAG_MUSTSUCCEED 0 #define ZIO_FLAG_RAW (ZIO_FLAG_RAW_COMPRESS | ZIO_FLAG_RAW_ENCRYPT) diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index 9991e1a22cdf..8393e8dd91d5 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -5101,7 +5101,7 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, zio_alloc_list_t *zal, int allocator) { metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator]; - metaslab_group_t *mg, *fast_mg, *rotor; + metaslab_group_t *mg, *rotor; vdev_t *vd; boolean_t try_hard = B_FALSE; @@ -5164,15 +5164,6 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, } else if (d != 0) { vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1])); mg = vd->vdev_mg->mg_next; - } else if (flags & METASLAB_FASTWRITE) { - mg = fast_mg = mca->mca_rotor; - - do { - if (fast_mg->mg_vd->vdev_pending_fastwrite < - mg->mg_vd->vdev_pending_fastwrite) - mg = fast_mg; - } while ((fast_mg = fast_mg->mg_next) != mca->mca_rotor); - } else { ASSERT(mca->mca_rotor != NULL); mg = mca->mca_rotor; @@ -5297,7 +5288,7 @@ top: mg->mg_bias = 0; } - if ((flags & METASLAB_FASTWRITE) || + if ((flags & METASLAB_ZIL) || atomic_add_64_nv(&mca->mca_aliquot, asize) >= mg->mg_aliquot + mg->mg_bias) { mca->mca_rotor = mg->mg_next; @@ -5310,11 +5301,6 @@ top: ((flags & METASLAB_GANG_HEADER) ? 
1 : 0)); DVA_SET_ASIZE(&dva[d], asize); - if (flags & METASLAB_FASTWRITE) { - atomic_add_64(&vd->vdev_pending_fastwrite, - psize); - } - return (0); } next: @@ -5950,55 +5936,6 @@ metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg) return (error); } -void -metaslab_fastwrite_mark(spa_t *spa, const blkptr_t *bp) -{ - const dva_t *dva = bp->blk_dva; - int ndvas = BP_GET_NDVAS(bp); - uint64_t psize = BP_GET_PSIZE(bp); - int d; - vdev_t *vd; - - ASSERT(!BP_IS_HOLE(bp)); - ASSERT(!BP_IS_EMBEDDED(bp)); - ASSERT(psize > 0); - - spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); - - for (d = 0; d < ndvas; d++) { - if ((vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]))) == NULL) - continue; - atomic_add_64(&vd->vdev_pending_fastwrite, psize); - } - - spa_config_exit(spa, SCL_VDEV, FTAG); -} - -void -metaslab_fastwrite_unmark(spa_t *spa, const blkptr_t *bp) -{ - const dva_t *dva = bp->blk_dva; - int ndvas = BP_GET_NDVAS(bp); - uint64_t psize = BP_GET_PSIZE(bp); - int d; - vdev_t *vd; - - ASSERT(!BP_IS_HOLE(bp)); - ASSERT(!BP_IS_EMBEDDED(bp)); - ASSERT(psize > 0); - - spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); - - for (d = 0; d < ndvas; d++) { - if ((vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]))) == NULL) - continue; - ASSERT3U(vd->vdev_pending_fastwrite, >=, psize); - atomic_sub_64(&vd->vdev_pending_fastwrite, psize); - } - - spa_config_exit(spa, SCL_VDEV, FTAG); -} - static void metaslab_check_free_impl_cb(uint64_t inner, vdev_t *vd, uint64_t offset, uint64_t size, void *arg) diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index f3812b843e95..87c145593237 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -1192,7 +1192,6 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd) ASSERT(tvd == tvd->vdev_top); - tvd->vdev_pending_fastwrite = svd->vdev_pending_fastwrite; tvd->vdev_ms_array = svd->vdev_ms_array; tvd->vdev_ms_shift = svd->vdev_ms_shift; tvd->vdev_ms_count = svd->vdev_ms_count; @@ -1655,7 +1654,6 @@ vdev_metaslab_fini(vdev_t *vd) } } ASSERT0(vd->vdev_ms_count); - ASSERT3U(vd->vdev_pending_fastwrite, ==, 0); } typedef struct vdev_probe_stats { diff --git a/module/zfs/zil.c b/module/zfs/zil.c index be5b9edf6ede..6f04a7d4a7df 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -761,15 +761,13 @@ zil_lwb_vdev_compare(const void *x1, const void *x2) } static lwb_t * -zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, boolean_t slog, uint64_t txg, - boolean_t fastwrite) +zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, boolean_t slog, uint64_t txg) { lwb_t *lwb; lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP); lwb->lwb_zilog = zilog; lwb->lwb_blk = *bp; - lwb->lwb_fastwrite = fastwrite; lwb->lwb_slog = slog; lwb->lwb_indirect = B_FALSE; if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) { @@ -916,7 +914,6 @@ zil_create(zilog_t *zilog) dmu_tx_t *tx = NULL; blkptr_t blk; int error = 0; - boolean_t fastwrite = FALSE; boolean_t slog = FALSE; dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os); @@ -949,8 +946,6 @@ zil_create(zilog_t *zilog) error = zio_alloc_zil(zilog->zl_spa, zilog->zl_os, txg, &blk, ZIL_MIN_BLKSZ, &slog); - fastwrite = TRUE; - if (error == 0) zil_init_log_chain(zilog, &blk); } @@ -959,7 +954,7 @@ zil_create(zilog_t *zilog) * Allocate a log write block (lwb) for the first log block. 
*/ if (error == 0) - lwb = zil_alloc_lwb(zilog, &blk, slog, txg, fastwrite); + lwb = zil_alloc_lwb(zilog, &blk, slog, txg); /* * If we just allocated the first log block, commit our transaction @@ -1044,9 +1039,6 @@ zil_destroy(zilog_t *zilog, boolean_t keep_first) ASSERT(zh->zh_claim_txg == 0); VERIFY(!keep_first); while ((lwb = list_remove_head(&zilog->zl_lwb_list)) != NULL) { - if (lwb->lwb_fastwrite) - metaslab_fastwrite_unmark(zilog->zl_spa, - &lwb->lwb_blk); if (lwb->lwb_buf != NULL) zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); zio_free(zilog->zl_spa, txg, &lwb->lwb_blk); @@ -1551,7 +1543,6 @@ zil_lwb_write_done(zio_t *zio) ASSERT3S(lwb->lwb_state, ==, LWB_STATE_ISSUED); lwb->lwb_state = LWB_STATE_WRITE_DONE; lwb->lwb_write_zio = NULL; - lwb->lwb_fastwrite = FALSE; nlwb = list_next(&zilog->zl_lwb_list, lwb); mutex_exit(&zilog->zl_lock); @@ -1718,20 +1709,12 @@ zil_lwb_write_open(zilog_t *zilog, lwb_t *lwb) ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]); - /* Lock so zil_sync() doesn't fastwrite_unmark after zio is created */ - mutex_enter(&zilog->zl_lock); - if (!lwb->lwb_fastwrite) { - metaslab_fastwrite_mark(zilog->zl_spa, &lwb->lwb_blk); - lwb->lwb_fastwrite = 1; - } - lwb->lwb_write_zio = zio_rewrite(lwb->lwb_root_zio, zilog->zl_spa, 0, &lwb->lwb_blk, lwb_abd, BP_GET_LSIZE(&lwb->lwb_blk), - zil_lwb_write_done, lwb, prio, - ZIO_FLAG_CANFAIL | ZIO_FLAG_FASTWRITE, &zb); + zil_lwb_write_done, lwb, prio, ZIO_FLAG_CANFAIL, &zb); + mutex_enter(&zilog->zl_lock); lwb->lwb_state = LWB_STATE_OPENED; - zil_lwb_set_zio_dependency(zilog, lwb); zilog->zl_last_lwb_opened = lwb; mutex_exit(&zilog->zl_lock); @@ -1864,7 +1847,7 @@ zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb, list_t *ilwbs) /* * Allocate a new log write block (lwb). */ - nlwb = zil_alloc_lwb(zilog, bp, slog, txg, TRUE); + nlwb = zil_alloc_lwb(zilog, bp, slog, txg); } lwb->lwb_state = LWB_STATE_ISSUED; @@ -3651,18 +3634,6 @@ zil_sync(zilog_t *zilog, dmu_tx_t *tx) BP_ZERO(&zh->zh_log); } - /* - * Remove fastwrite on any blocks that have been pre-allocated for - * the next commit. This prevents fastwrite counter pollution by - * unused, long-lived LWBs. - */ - for (; lwb != NULL; lwb = list_next(&zilog->zl_lwb_list, lwb)) { - if (lwb->lwb_fastwrite && !lwb->lwb_write_zio) { - metaslab_fastwrite_unmark(zilog->zl_spa, &lwb->lwb_blk); - lwb->lwb_fastwrite = 0; - } - } - mutex_exit(&zilog->zl_lock); } @@ -3895,9 +3866,6 @@ zil_close(zilog_t *zilog) ASSERT(list_is_empty(&zilog->zl_lwb_list)); ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED); - if (lwb->lwb_fastwrite) - metaslab_fastwrite_unmark(zilog->zl_spa, &lwb->lwb_blk); - zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); zil_free_lwb(zilog, lwb); } diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 3f5e6a08d89c..b5627109900c 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -3024,11 +3024,6 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc) */ pio->io_pipeline = ZIO_INTERLOCK_PIPELINE; - /* - * We didn't allocate this bp, so make sure it doesn't get unmarked. - */ - pio->io_flags &= ~ZIO_FLAG_FASTWRITE; - zio_nowait(zio); return (pio); @@ -3616,7 +3611,6 @@ zio_dva_allocate(zio_t *zio) ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa)); ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); - flags |= (zio->io_flags & ZIO_FLAG_FASTWRITE) ? 
METASLAB_FASTWRITE : 0;
 	if (zio->io_flags & ZIO_FLAG_NODATA)
 		flags |= METASLAB_DONT_THROTTLE;
 	if (zio->io_flags & ZIO_FLAG_GANG_CHILD)
@@ -3776,7 +3770,7 @@ zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp,
 	 * of, so we just hash the objset ID to pick the allocator to get
 	 * some parallelism.
 	 */
-	int flags = METASLAB_FASTWRITE | METASLAB_ZIL;
+	int flags = METASLAB_ZIL;
 	int allocator = (uint_t)cityhash4(0, 0, 0,
 	    os->os_dsl_dataset->ds_object) % spa->spa_alloc_count;
 	error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1,
@@ -4931,12 +4925,6 @@ zio_done(zio_t *zio)
 			zfs_ereport_free_checksum(zcr);
 	}

-	if (zio->io_flags & ZIO_FLAG_FASTWRITE && zio->io_bp &&
-	    !BP_IS_HOLE(zio->io_bp) && !BP_IS_EMBEDDED(zio->io_bp) &&
-	    !(zio->io_flags & ZIO_FLAG_NOPWRITE)) {
-		metaslab_fastwrite_unmark(zio->io_spa, zio->io_bp);
-	}
-
 	/*
 	 * It is the responsibility of the done callback to ensure that this
 	 * particular zio is no longer discoverable for adoption, and as
-- cgit v1.2.3
From e9c59310f7da120d5506d11ccebb761f2825ac06 Mon Sep 17 00:00:00 2001
From: oromenahar
Date: Tue, 1 Aug 2023 17:26:12 +0200
Subject: Check the return value in clonefile test

Reviewed-by: Brian Atkinson
Reviewed-by: Brian Behlendorf
Reviewed-by: Rob Norris
Signed-off-by: Kay Pedersen
Closes #15128
---
 tests/zfs-tests/cmd/clonefile.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/zfs-tests/cmd/clonefile.c b/tests/zfs-tests/cmd/clonefile.c
index a7e7277ae411..696dc471d8c3 100644
--- a/tests/zfs-tests/cmd/clonefile.c
+++ b/tests/zfs-tests/cmd/clonefile.c
@@ -212,7 +212,7 @@ main(int argc, char **argv)
 	int dfd = open(argv[optind+1], O_WRONLY|O_CREAT,
 	    S_IRUSR|S_IWUSR|S_IRGRP|S_IROTH);
-	if (sfd < 0) {
+	if (dfd < 0) {
 		fprintf(stderr, "open: %s: %s\n",
 		    argv[optind+1], strerror(errno));
 		close(sfd);
-- cgit v1.2.3
From b35374fd6474603170fd9a3c7503da6eb13ac712 Mon Sep 17 00:00:00 2001
From: Coleman Kane
Date: Tue, 1 Aug 2023 11:27:58 -0400
Subject: Linux 6.5 compat: register_sysctl_table removed

Additionally, the .child element of ctl_table has been removed in 6.5.
This change adds a new configure test for the pre-6.5
register_sysctl_table() function and keeps using the old code when it
is present. When it is not found, the parentage entries in the tables
are removed, and register_sysctl() is passed the paths "kernel/spl",
"kernel/spl/kmem", and "kernel/spl/kstat" directly, populating each
subdirectory over three calls, as the new API expects.
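For reference, a minimal sketch of the two registration styles
(illustrative only: the example_* names, the handler wiring, and the
HAVE_REGISTER_SYSCTL_TABLE guard stand in for the configure test's
result; this is not code from the change):

    #include <linux/errno.h>
    #include <linux/init.h>
    #include <linux/sysctl.h>

    static int example_val;

    static struct ctl_table example_table[] = {
    	{
    		.procname	= "value",
    		.data		= &example_val,
    		.maxlen		= sizeof (int),
    		.mode		= 0644,
    		.proc_handler	= proc_dointvec,
    	},
    	{}
    };

    #ifdef HAVE_REGISTER_SYSCTL_TABLE
    /* Pre-6.5: directories hang off .child, one call registers the tree. */
    static struct ctl_table example_dir[] = {
    	{ .procname = "example", .mode = 0555, .child = example_table },
    	{}
    };
    static struct ctl_table example_root[] = {
    	{ .procname = "kernel", .mode = 0555, .child = example_dir },
    	{}
    };
    #endif

    static struct ctl_table_header *example_header;

    static int __init
    example_sysctl_init(void)
    {
    #ifdef HAVE_REGISTER_SYSCTL_TABLE
    	example_header = register_sysctl_table(example_root);
    #else
    	/* 6.5+: no .child; each directory level is named by path. */
    	example_header = register_sysctl("kernel/example", example_table);
    #endif
    	return (example_header == NULL ? -ENOMEM : 0);
    }

The spl-proc.c change in this patch follows the same pattern,
registering "kernel/spl", "kernel/spl/kmem", and "kernel/spl/kstat"
with three separate calls on 6.5 and newer.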
Reviewed-by: Brian Behlendorf Signed-off-by: Coleman Kane Closes #15098 --- config/kernel-register_sysctl_table.m4 | 27 +++++++++++++++++++++++++++ config/kernel.m4 | 2 ++ module/os/linux/spl/spl-proc.c | 26 +++++++++++++++++++++++--- 3 files changed, 52 insertions(+), 3 deletions(-) create mode 100644 config/kernel-register_sysctl_table.m4 diff --git a/config/kernel-register_sysctl_table.m4 b/config/kernel-register_sysctl_table.m4 new file mode 100644 index 000000000000..f18316b32b6d --- /dev/null +++ b/config/kernel-register_sysctl_table.m4 @@ -0,0 +1,27 @@ +dnl # +dnl # Linux 6.5 removes register_sysctl_table +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_REGISTER_SYSCTL_TABLE], [ + ZFS_LINUX_TEST_SRC([has_register_sysctl_table], [ + #include + + static struct ctl_table dummy_table[] = { + {} + }; + + ],[ + struct ctl_table_header *h + __attribute((unused)) = register_sysctl_table(dummy_table); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_REGISTER_SYSCTL_TABLE], [ + AC_MSG_CHECKING([whether register_sysctl_table exists]) + ZFS_LINUX_TEST_RESULT([has_register_sysctl_table], [ + AC_MSG_RESULT([yes]) + AC_DEFINE(HAVE_SYSCTL_REGISTER_TABLE, 1, + [sysctl_register_table exists]) + ],[ + AC_MSG_RESULT([no]) + ]) +]) diff --git a/config/kernel.m4 b/config/kernel.m4 index 1487fa2e7793..28bd361d33ff 100644 --- a/config/kernel.m4 +++ b/config/kernel.m4 @@ -160,6 +160,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ ZFS_AC_KERNEL_SRC_FILEMAP ZFS_AC_KERNEL_SRC_WRITEPAGE_T ZFS_AC_KERNEL_SRC_RECLAIMED + ZFS_AC_KERNEL_SRC_REGISTER_SYSCTL_TABLE case "$host_cpu" in powerpc*) ZFS_AC_KERNEL_SRC_CPU_HAS_FEATURE @@ -299,6 +300,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ ZFS_AC_KERNEL_FILEMAP ZFS_AC_KERNEL_WRITEPAGE_T ZFS_AC_KERNEL_RECLAIMED + ZFS_AC_KERNEL_REGISTER_SYSCTL_TABLE case "$host_cpu" in powerpc*) ZFS_AC_KERNEL_CPU_HAS_FEATURE diff --git a/module/os/linux/spl/spl-proc.c b/module/os/linux/spl/spl-proc.c index 01f5619e1893..bcc356ae55b6 100644 --- a/module/os/linux/spl/spl-proc.c +++ b/module/os/linux/spl/spl-proc.c @@ -624,6 +624,7 @@ static struct ctl_table spl_table[] = { .mode = 0644, .proc_handler = &proc_dohostid, }, +#ifdef HAVE_REGISTER_SYSCTL_TABLE { .procname = "kmem", .mode = 0555, @@ -634,9 +635,11 @@ static struct ctl_table spl_table[] = { .mode = 0555, .child = spl_kstat_table, }, +#endif {}, }; +#ifdef HAVE_REGISTER_SYSCTL_TABLE static struct ctl_table spl_dir[] = { { .procname = "spl", @@ -648,21 +651,38 @@ static struct ctl_table spl_dir[] = { static struct ctl_table spl_root[] = { { - .procname = "kernel", - .mode = 0555, - .child = spl_dir, + .procname = "kernel", + .mode = 0555, + .child = spl_dir, }, {} }; +#endif int spl_proc_init(void) { int rc = 0; +#ifdef HAVE_REGISTER_SYSCTL_TABLE spl_header = register_sysctl_table(spl_root); if (spl_header == NULL) return (-EUNATCH); +#else + spl_header = register_sysctl("kernel/spl", spl_table); + if (spl_header == NULL) + return (-EUNATCH); + + if (register_sysctl("kernel/spl/kmem", spl_kmem_table) == NULL) { + rc = -EUNATCH; + goto out; + } + + if (register_sysctl("kernel/spl/kstat", spl_kstat_table) == NULL) { + rc = -EUNATCH; + goto out; + } +#endif proc_spl = proc_mkdir("spl", NULL); if (proc_spl == NULL) { -- cgit v1.2.3 From 3b8e318b7737fa40daf6abbc06ba31cd6ae8d572 Mon Sep 17 00:00:00 2001 From: Coleman Kane Date: Tue, 1 Aug 2023 11:32:38 -0400 Subject: Linux 6.5 compat: use disk_check_media_change when it exists When disk_check_media_change() exists, then define zfs_check_media_change() to simply call disk_check_media_change() on the bd_disk member of its 
argument. Since disk_check_media_change() postdates the kernels in
which revalidate_disk was present in bops, we should be able to do this
safely via a macro, instead of recreating a new implementation of the
inline function that forces revalidation.

Reviewed-by: Brian Behlendorf
Reviewed-by: Brian Atkinson
Signed-off-by: Coleman Kane
Closes #15101
---
 include/os/linux/kernel/linux/blkdev_compat.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/os/linux/kernel/linux/blkdev_compat.h b/include/os/linux/kernel/linux/blkdev_compat.h
index e0f20ba32008..1641dd92a918 100644
--- a/include/os/linux/kernel/linux/blkdev_compat.h
+++ b/include/os/linux/kernel/linux/blkdev_compat.h
@@ -347,6 +347,7 @@ zfs_check_media_change(struct block_device *bdev)
 #define vdev_bdev_reread_part(bdev) zfs_check_media_change(bdev)
 #elif defined(HAVE_DISK_CHECK_MEDIA_CHANGE)
 #define vdev_bdev_reread_part(bdev) disk_check_media_change(bdev->bd_disk)
+#define zfs_check_media_change(bdev) disk_check_media_change(bdev->bd_disk)
 #else
 /*
  * This is encountered if check_disk_change() and bdev_check_media_change()
-- cgit v1.2.3
From 43e8f6e37fddc31f23301cb70d466687bd205cd9 Mon Sep 17 00:00:00 2001
From: Coleman Kane
Date: Tue, 1 Aug 2023 11:37:20 -0400
Subject: Linux 6.5 compat: blkdev changes

Multiple changes to the blkdev API were introduced in Linux 6.5. These
include passing (void *holder) to blkdev_put(), adding a new
blk_holder_ops * argument to blkdev_get_by_path(), adding a new
blk_mode_t type that replaces uses of fmode_t, and removing an argument
from the release handler on block_device_operations that we weren't
using. The open function definition has also changed to take gendisk *
and blk_mode_t, so it is updated accordingly, too.

Implement local wrappers vdev_blkdev_get_by_path() and
vdev_blkdev_put() so that the in-line calls are cleaner, and place the
conditionally compiled implementation details inside both of these
local wrappers. Both calls are currently used exclusively within
vdev_disk.c.

Add blk_mode_is_open_write() to test FMODE_WRITE / BLK_OPEN_WRITE. The
wrapper is now used to test, via the method appropriate to the running
kernel, whether the open mode is writable.
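To illustrate how such a wrapper keeps a caller version-agnostic, here
is a minimal sketch (the example_open() callback and its error policy
are hypothetical, not code from this change; the macro matches the one
added to blkdev_compat.h):

    #include <linux/blkdev.h>
    #include <linux/errno.h>
    #include <linux/fs.h>

    #if defined(HAVE_BLK_MODE_T)
    /* 6.5+: bops->open takes a gendisk and the new blk_mode_t. */
    #define blk_mode_is_open_write(flag)	((flag) & BLK_OPEN_WRITE)
    static int
    example_open(struct gendisk *disk, blk_mode_t mode)
    #else
    /* Older kernels: bops->open takes the block_device and an fmode_t. */
    #define blk_mode_is_open_write(flag)	((flag) & FMODE_WRITE)
    static int
    example_open(struct block_device *bdev, fmode_t mode)
    #endif
    {
    	/* One test covers both kernel generations. */
    	if (blk_mode_is_open_write(mode))
    		return (-EROFS);	/* e.g. refuse writers on a read-only volume */
    	return (0);
    }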
Emphasize fmode_t arg in zvol_release is not used Reviewed-by: Brian Behlendorf Signed-off-by: Coleman Kane Closes #15099 --- config/kernel-blkdev.m4 | 84 ++++++++++++++++++++++++++- config/kernel-block-device-operations.m4 | 35 ++++++++++- include/os/linux/kernel/linux/blkdev_compat.h | 6 ++ module/os/linux/zfs/vdev_disk.c | 65 ++++++++++++++++++--- module/os/linux/zfs/zfs_vnops_os.c | 2 +- module/os/linux/zfs/zpl_ctldir.c | 2 +- module/os/linux/zfs/zvol_os.c | 28 +++++++-- 7 files changed, 203 insertions(+), 19 deletions(-) diff --git a/config/kernel-blkdev.m4 b/config/kernel-blkdev.m4 index 887acee670ba..e04a2bd2c3b6 100644 --- a/config/kernel-blkdev.m4 +++ b/config/kernel-blkdev.m4 @@ -16,12 +16,63 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_GET_BY_PATH], [ ]) ]) +dnl # +dnl # 6.5.x API change, +dnl # blkdev_get_by_path() takes 4 args +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_GET_BY_PATH_4ARG], [ + ZFS_LINUX_TEST_SRC([blkdev_get_by_path_4arg], [ + #include + #include + ], [ + struct block_device *bdev __attribute__ ((unused)) = NULL; + const char *path = "path"; + fmode_t mode = 0; + void *holder = NULL; + struct blk_holder_ops h; + + bdev = blkdev_get_by_path(path, mode, holder, &h); + ]) +]) + AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_GET_BY_PATH], [ - AC_MSG_CHECKING([whether blkdev_get_by_path() exists]) + AC_MSG_CHECKING([whether blkdev_get_by_path() exists and takes 3 args]) ZFS_LINUX_TEST_RESULT([blkdev_get_by_path], [ AC_MSG_RESULT(yes) ], [ - ZFS_LINUX_TEST_ERROR([blkdev_get_by_path()]) + AC_MSG_RESULT(no) + AC_MSG_CHECKING([whether blkdev_get_by_path() exists and takes 4 args]) + ZFS_LINUX_TEST_RESULT([blkdev_get_by_path_4arg], [ + AC_DEFINE(HAVE_BLKDEV_GET_BY_PATH_4ARG, 1, + [blkdev_get_by_path() exists and takes 4 args]) + AC_MSG_RESULT(yes) + ], [ + ZFS_LINUX_TEST_ERROR([blkdev_get_by_path()]) + ]) + ]) +]) + +dnl # +dnl # 6.5.x API change +dnl # blk_mode_t was added as a type to supercede some places where fmode_t +dnl # is used +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_BLK_MODE_T], [ + ZFS_LINUX_TEST_SRC([blk_mode_t], [ + #include + #include + ], [ + blk_mode_t m __attribute((unused)) = (blk_mode_t)0; + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_BLK_MODE_T], [ + AC_MSG_CHECKING([whether blk_mode_t is defined]) + ZFS_LINUX_TEST_RESULT([blk_mode_t], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_BLK_MODE_T, 1, [blk_mode_t is defined]) + ], [ + AC_MSG_RESULT(no) ]) ]) @@ -41,12 +92,35 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_PUT], [ ]) ]) +dnl # +dnl # 6.5.x API change. 
+dnl # blkdev_put() takes (void* holder) as arg 2 +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_PUT_HOLDER], [ + ZFS_LINUX_TEST_SRC([blkdev_put_holder], [ + #include + #include + ], [ + struct block_device *bdev = NULL; + void *holder = NULL; + + blkdev_put(bdev, holder); + ]) +]) + AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_PUT], [ AC_MSG_CHECKING([whether blkdev_put() exists]) ZFS_LINUX_TEST_RESULT([blkdev_put], [ AC_MSG_RESULT(yes) ], [ - ZFS_LINUX_TEST_ERROR([blkdev_put()]) + AC_MSG_CHECKING([whether blkdev_put() accepts void* as arg 2]) + ZFS_LINUX_TEST_RESULT([blkdev_put_holder], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_BLKDEV_PUT_HOLDER, 1, + [blkdev_put() accepts void* as arg 2]) + ], [ + ZFS_LINUX_TEST_ERROR([blkdev_put()]) + ]) ]) ]) @@ -495,7 +569,9 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_BLK_STS_RESV_CONFLICT], [ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV], [ ZFS_AC_KERNEL_SRC_BLKDEV_GET_BY_PATH + ZFS_AC_KERNEL_SRC_BLKDEV_GET_BY_PATH_4ARG ZFS_AC_KERNEL_SRC_BLKDEV_PUT + ZFS_AC_KERNEL_SRC_BLKDEV_PUT_HOLDER ZFS_AC_KERNEL_SRC_BLKDEV_REREAD_PART ZFS_AC_KERNEL_SRC_BLKDEV_INVALIDATE_BDEV ZFS_AC_KERNEL_SRC_BLKDEV_LOOKUP_BDEV @@ -510,6 +586,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV], [ ZFS_AC_KERNEL_SRC_BLKDEV_PART_TO_DEV ZFS_AC_KERNEL_SRC_BLKDEV_DISK_CHECK_MEDIA_CHANGE ZFS_AC_KERNEL_SRC_BLKDEV_BLK_STS_RESV_CONFLICT + ZFS_AC_KERNEL_SRC_BLKDEV_BLK_MODE_T ]) AC_DEFUN([ZFS_AC_KERNEL_BLKDEV], [ @@ -530,4 +607,5 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV], [ ZFS_AC_KERNEL_BLKDEV_PART_TO_DEV ZFS_AC_KERNEL_BLKDEV_DISK_CHECK_MEDIA_CHANGE ZFS_AC_KERNEL_BLKDEV_BLK_STS_RESV_CONFLICT + ZFS_AC_KERNEL_BLKDEV_BLK_MODE_T ]) diff --git a/config/kernel-block-device-operations.m4 b/config/kernel-block-device-operations.m4 index 84e39dc8a2f6..d13c1337b1fb 100644 --- a/config/kernel-block-device-operations.m4 +++ b/config/kernel-block-device-operations.m4 @@ -49,12 +49,42 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID], [ ], [], []) ]) +dnl # +dnl # 5.9.x API change +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG], [ + ZFS_LINUX_TEST_SRC([block_device_operations_release_void_1arg], [ + #include + + void blk_release(struct gendisk *g) { + (void) g; + return; + } + + static const struct block_device_operations + bops __attribute__ ((unused)) = { + .open = NULL, + .release = blk_release, + .ioctl = NULL, + .compat_ioctl = NULL, + }; + ], [], []) +]) + AC_DEFUN([ZFS_AC_KERNEL_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID], [ - AC_MSG_CHECKING([whether bops->release() is void]) + AC_MSG_CHECKING([whether bops->release() is void and takes 2 args]) ZFS_LINUX_TEST_RESULT([block_device_operations_release_void], [ AC_MSG_RESULT(yes) ],[ - ZFS_LINUX_TEST_ERROR([bops->release()]) + AC_MSG_RESULT(no) + AC_MSG_CHECKING([whether bops->release() is void and takes 1 arg]) + ZFS_LINUX_TEST_RESULT([block_device_operations_release_void_1arg], [ + AC_MSG_RESULT(yes) + AC_DEFINE([HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG], [1], + [Define if release() in block_device_operations takes 1 arg]) + ],[ + ZFS_LINUX_TEST_ERROR([bops->release()]) + ]) ]) ]) @@ -92,6 +122,7 @@ AC_DEFUN([ZFS_AC_KERNEL_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK], [ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLOCK_DEVICE_OPERATIONS], [ ZFS_AC_KERNEL_SRC_BLOCK_DEVICE_OPERATIONS_CHECK_EVENTS ZFS_AC_KERNEL_SRC_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID + ZFS_AC_KERNEL_SRC_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG ZFS_AC_KERNEL_SRC_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK ]) diff --git a/include/os/linux/kernel/linux/blkdev_compat.h b/include/os/linux/kernel/linux/blkdev_compat.h index 
1641dd92a918..f111e648ccf7 100644 --- a/include/os/linux/kernel/linux/blkdev_compat.h +++ b/include/os/linux/kernel/linux/blkdev_compat.h @@ -398,6 +398,12 @@ vdev_lookup_bdev(const char *path, dev_t *dev) #endif } +#if defined(HAVE_BLK_MODE_T) +#define blk_mode_is_open_write(flag) ((flag) & BLK_OPEN_WRITE) +#else +#define blk_mode_is_open_write(flag) ((flag) & FMODE_WRITE) +#endif + /* * Kernels without bio_set_op_attrs use bi_rw for the bio flags. */ diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c index 925ee9d9fe9c..48ac55f07034 100644 --- a/module/os/linux/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -80,9 +80,22 @@ typedef struct dio_request { static unsigned int zfs_vdev_failfast_mask = 1; +#ifdef HAVE_BLK_MODE_T +static blk_mode_t +#else static fmode_t +#endif vdev_bdev_mode(spa_mode_t spa_mode) { +#ifdef HAVE_BLK_MODE_T + blk_mode_t mode = 0; + + if (spa_mode & SPA_MODE_READ) + mode |= BLK_OPEN_READ; + + if (spa_mode & SPA_MODE_WRITE) + mode |= BLK_OPEN_WRITE; +#else fmode_t mode = 0; if (spa_mode & SPA_MODE_READ) @@ -90,6 +103,7 @@ vdev_bdev_mode(spa_mode_t spa_mode) if (spa_mode & SPA_MODE_WRITE) mode |= FMODE_WRITE; +#endif return (mode); } @@ -197,12 +211,47 @@ vdev_disk_kobj_evt_post(vdev_t *v) } } +#if !defined(HAVE_BLKDEV_GET_BY_PATH_4ARG) +/* + * Define a dummy struct blk_holder_ops for kernel versions + * prior to 6.5. + */ +struct blk_holder_ops {}; +#endif + +static struct block_device * +vdev_blkdev_get_by_path(const char *path, spa_mode_t mode, void *holder, + const struct blk_holder_ops *hops) +{ +#ifdef HAVE_BLKDEV_GET_BY_PATH_4ARG + return (blkdev_get_by_path(path, + vdev_bdev_mode(mode) | BLK_OPEN_EXCL, holder, hops)); +#else + return (blkdev_get_by_path(path, + vdev_bdev_mode(mode) | FMODE_EXCL, holder)); +#endif +} + +static void +vdev_blkdev_put(struct block_device *bdev, spa_mode_t mode, void *holder) +{ +#ifdef HAVE_BLKDEV_PUT_HOLDER + return (blkdev_put(bdev, holder)); +#else + return (blkdev_put(bdev, vdev_bdev_mode(mode) | FMODE_EXCL)); +#endif +} + static int vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, uint64_t *logical_ashift, uint64_t *physical_ashift) { struct block_device *bdev; +#ifdef HAVE_BLK_MODE_T + blk_mode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa)); +#else fmode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa)); +#endif hrtime_t timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms); vdev_disk_t *vd; @@ -252,15 +301,15 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, reread_part = B_TRUE; } - blkdev_put(bdev, mode | FMODE_EXCL); + vdev_blkdev_put(bdev, mode, zfs_vdev_holder); } if (reread_part) { - bdev = blkdev_get_by_path(disk_name, mode | FMODE_EXCL, - zfs_vdev_holder); + bdev = vdev_blkdev_get_by_path(disk_name, mode, + zfs_vdev_holder, NULL); if (!IS_ERR(bdev)) { int error = vdev_bdev_reread_part(bdev); - blkdev_put(bdev, mode | FMODE_EXCL); + vdev_blkdev_put(bdev, mode, zfs_vdev_holder); if (error == 0) { timeout = MSEC2NSEC( zfs_vdev_open_timeout_ms * 2); @@ -305,8 +354,8 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, hrtime_t start = gethrtime(); bdev = ERR_PTR(-ENXIO); while (IS_ERR(bdev) && ((gethrtime() - start) < timeout)) { - bdev = blkdev_get_by_path(v->vdev_path, mode | FMODE_EXCL, - zfs_vdev_holder); + bdev = vdev_blkdev_get_by_path(v->vdev_path, mode, + zfs_vdev_holder, NULL); if (unlikely(PTR_ERR(bdev) == -ENOENT)) { /* * There is no point of waiting since device is removed @@ -382,8 +431,8 @@ vdev_disk_close(vdev_t *v) return; if 
(vd->vd_bdev != NULL) { - blkdev_put(vd->vd_bdev, - vdev_bdev_mode(spa_mode(v->vdev_spa)) | FMODE_EXCL); + vdev_blkdev_put(vd->vd_bdev, spa_mode(v->vdev_spa), + zfs_vdev_holder); } rw_destroy(&vd->vd_lock); diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c index 234c4d5ef0e0..33baac9db06b 100644 --- a/module/os/linux/zfs/zfs_vnops_os.c +++ b/module/os/linux/zfs/zfs_vnops_os.c @@ -186,7 +186,7 @@ zfs_open(struct inode *ip, int mode, int flag, cred_t *cr) return (error); /* Honor ZFS_APPENDONLY file attribute */ - if ((mode & FMODE_WRITE) && (zp->z_pflags & ZFS_APPENDONLY) && + if (blk_mode_is_open_write(mode) && (zp->z_pflags & ZFS_APPENDONLY) && ((flag & O_APPEND) == 0)) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EPERM)); diff --git a/module/os/linux/zfs/zpl_ctldir.c b/module/os/linux/zfs/zpl_ctldir.c index 68a7de78f471..7786444fea35 100644 --- a/module/os/linux/zfs/zpl_ctldir.c +++ b/module/os/linux/zfs/zpl_ctldir.c @@ -42,7 +42,7 @@ static int zpl_common_open(struct inode *ip, struct file *filp) { - if (filp->f_mode & FMODE_WRITE) + if (blk_mode_is_open_write(filp->f_mode)) return (-EACCES); return (generic_file_open(ip, filp)); diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c index 38bc8e2c4eeb..7a95b54bdf0d 100644 --- a/module/os/linux/zfs/zvol_os.c +++ b/module/os/linux/zfs/zvol_os.c @@ -671,7 +671,11 @@ zvol_request(struct request_queue *q, struct bio *bio) } static int +#ifdef HAVE_BLK_MODE_T +zvol_open(struct gendisk *disk, blk_mode_t flag) +#else zvol_open(struct block_device *bdev, fmode_t flag) +#endif { zvol_state_t *zv; int error = 0; @@ -686,10 +690,14 @@ retry: /* * Obtain a copy of private_data under the zvol_state_lock to make * sure that either the result of zvol free code path setting - * bdev->bd_disk->private_data to NULL is observed, or zvol_os_free() + * disk->private_data to NULL is observed, or zvol_os_free() * is not called on this zv because of the positive zv_open_count. */ +#ifdef HAVE_BLK_MODE_T + zv = disk->private_data; +#else zv = bdev->bd_disk->private_data; +#endif if (zv == NULL) { rw_exit(&zvol_state_lock); return (SET_ERROR(-ENXIO)); @@ -769,14 +777,15 @@ retry: } } - error = -zvol_first_open(zv, !(flag & FMODE_WRITE)); + error = -zvol_first_open(zv, !(blk_mode_is_open_write(flag))); if (drop_namespace) mutex_exit(&spa_namespace_lock); } if (error == 0) { - if ((flag & FMODE_WRITE) && (zv->zv_flags & ZVOL_RDONLY)) { + if ((blk_mode_is_open_write(flag)) && + (zv->zv_flags & ZVOL_RDONLY)) { if (zv->zv_open_count == 0) zvol_last_close(zv); @@ -791,14 +800,25 @@ retry: rw_exit(&zv->zv_suspend_lock); if (error == 0) +#ifdef HAVE_BLK_MODE_T + disk_check_media_change(disk); +#else zfs_check_media_change(bdev); +#endif return (error); } static void -zvol_release(struct gendisk *disk, fmode_t mode) +#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG +zvol_release(struct gendisk *disk) +#else +zvol_release(struct gendisk *disk, fmode_t unused) +#endif { +#if !defined(HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG) + (void) unused; +#endif zvol_state_t *zv; boolean_t drop_suspend = B_TRUE; -- cgit v1.2.3 From 325505e5c4e48f32e1a03e42a694509bf4c02670 Mon Sep 17 00:00:00 2001 From: Coleman Kane Date: Sun, 23 Jul 2023 01:34:29 -0400 Subject: Linux 6.4 compat: iter_iov() function now used to get old iov member The iov_iter->iov member is now iov_iter->__iov and must be accessed via the accessor function iter_iov(). 
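A minimal sketch of the conditional access, matching the wrapper that lands in the spl uio.h hunk below:

```
/* Use the 6.4+ accessor when the configure check finds it. */
#if defined(HAVE_ITER_IOV)
#define zfs_uio_iter_iov(iter) iter_iov((iter))
#else
#define zfs_uio_iter_iov(iter) (iter)->iov
#endif
```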
Create a wrapper that is conditionally compiled to use the access method appropriate for the target kernel version. Reviewed-by: Brian Behlendorf Reviewed-by: Brian Atkinson Signed-off-by: Coleman Kane Closes #15100 --- config/kernel-vfs-iov_iter.m4 | 23 +++++++++++++++++++++++ include/os/linux/spl/sys/uio.h | 6 ++++++ module/os/linux/zfs/zpl_file.c | 8 +++----- 3 files changed, 32 insertions(+), 5 deletions(-) diff --git a/config/kernel-vfs-iov_iter.m4 b/config/kernel-vfs-iov_iter.m4 index e0617faab02c..cc5a7ab0c237 100644 --- a/config/kernel-vfs-iov_iter.m4 +++ b/config/kernel-vfs-iov_iter.m4 @@ -93,6 +93,14 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_IOV_ITER], [ struct iov_iter iter = { 0 }; __attribute__((unused)) enum iter_type i = iov_iter_type(&iter); ]) + + ZFS_LINUX_TEST_SRC([iter_iov], [ + #include + #include + ],[ + struct iov_iter iter = { 0 }; + __attribute__((unused)) const struct iovec *iov = iter_iov(&iter); + ]) ]) AC_DEFUN([ZFS_AC_KERNEL_VFS_IOV_ITER], [ @@ -201,4 +209,19 @@ AC_DEFUN([ZFS_AC_KERNEL_VFS_IOV_ITER], [ AC_DEFINE(HAVE_VFS_IOV_ITER, 1, [All required iov_iter interfaces are available]) ]) + + dnl # + dnl # Kernel 6.5 introduces the iter_iov() function that returns the + dnl # __iov member of an iov_iter*. The iov member was renamed to this + dnl # __iov member, and is intended to be accessed via the helper + dnl # function now. + dnl # + AC_MSG_CHECKING([whether iter_iov() is available]) + ZFS_LINUX_TEST_RESULT([iter_iov], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_ITER_IOV, 1, + [iter_iov() is available]) + ],[ + AC_MSG_RESULT(no) + ]) ]) diff --git a/include/os/linux/spl/sys/uio.h b/include/os/linux/spl/sys/uio.h index fe2b5c07a018..082e930e46b3 100644 --- a/include/os/linux/spl/sys/uio.h +++ b/include/os/linux/spl/sys/uio.h @@ -173,4 +173,10 @@ zfs_uio_iov_iter_init(zfs_uio_t *uio, struct iov_iter *iter, offset_t offset, } #endif +#if defined(HAVE_ITER_IOV) +#define zfs_uio_iter_iov(iter) iter_iov((iter)) +#else +#define zfs_uio_iter_iov(iter) (iter)->iov +#endif + #endif /* SPL_UIO_H */ diff --git a/module/os/linux/zfs/zpl_file.c b/module/os/linux/zfs/zpl_file.c index 73526db731c4..aedafd6002b9 100644 --- a/module/os/linux/zfs/zpl_file.c +++ b/module/os/linux/zfs/zpl_file.c @@ -300,17 +300,15 @@ zpl_uio_init(zfs_uio_t *uio, struct kiocb *kiocb, struct iov_iter *to, { #if defined(HAVE_VFS_IOV_ITER) zfs_uio_iov_iter_init(uio, to, pos, count, skip); -#else -#ifdef HAVE_IOV_ITER_TYPE - zfs_uio_iovec_init(uio, to->iov, to->nr_segs, pos, +#elif defined(HAVE_IOV_ITER_TYPE) + zfs_uio_iovec_init(uio, zfs_uio_iter_iov(to), to->nr_segs, pos, iov_iter_type(to) & ITER_KVEC ? UIO_SYSSPACE : UIO_USERSPACE, count, skip); #else - zfs_uio_iovec_init(uio, to->iov, to->nr_segs, pos, + zfs_uio_iovec_init(uio, zfs_uio_iter_iov(to), to->nr_segs, pos, to->type & ITER_KVEC ? UIO_SYSSPACE : UIO_USERSPACE, count, skip); #endif -#endif } static ssize_t -- cgit v1.2.3 From 6751634d77520793ec410b6886febb7e665fc175 Mon Sep 17 00:00:00 2001 From: Coleman Kane Date: Sun, 30 Jul 2023 15:23:47 -0400 Subject: Linux 4.20 compat: wrapper function for iov_iter type access An iov_iter_type() function to access the "type" member of the struct iov_iter was added at one point. Move the conditional logic to decide which method to use for accessing it into a macro and simplify the zpl_uio_init code. 
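The payoff is visible in zpl_uio_init(): with both accessors behind macros, the nested #ifdef chain collapses to a single call, sketched here from the zpl_file.c hunk below:

```
/* One call now covers both old and new kernels. */
zfs_uio_iovec_init(uio, zfs_uio_iter_iov(to), to->nr_segs, pos,
    zfs_uio_iov_iter_type(to) & ITER_KVEC ?
    UIO_SYSSPACE : UIO_USERSPACE,
    count, skip);
```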
Reviewed-by: Brian Behlendorf Reviewed-by: Brian Atkinson Signed-off-by: Coleman Kane Closes #15100 --- include/os/linux/spl/sys/uio.h | 6 ++++++ module/os/linux/zfs/zpl_file.c | 7 ++----- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/include/os/linux/spl/sys/uio.h b/include/os/linux/spl/sys/uio.h index 082e930e46b3..cce097e16fbc 100644 --- a/include/os/linux/spl/sys/uio.h +++ b/include/os/linux/spl/sys/uio.h @@ -179,4 +179,10 @@ zfs_uio_iov_iter_init(zfs_uio_t *uio, struct iov_iter *iter, offset_t offset, #define zfs_uio_iter_iov(iter) (iter)->iov #endif +#if defined(HAVE_IOV_ITER_TYPE) +#define zfs_uio_iov_iter_type(iter) iov_iter_type((iter)) +#else +#define zfs_uio_iov_iter_type(iter) (iter)->type +#endif + #endif /* SPL_UIO_H */ diff --git a/module/os/linux/zfs/zpl_file.c b/module/os/linux/zfs/zpl_file.c index aedafd6002b9..f6af2ebd1163 100644 --- a/module/os/linux/zfs/zpl_file.c +++ b/module/os/linux/zfs/zpl_file.c @@ -300,13 +300,10 @@ zpl_uio_init(zfs_uio_t *uio, struct kiocb *kiocb, struct iov_iter *to, { #if defined(HAVE_VFS_IOV_ITER) zfs_uio_iov_iter_init(uio, to, pos, count, skip); -#elif defined(HAVE_IOV_ITER_TYPE) - zfs_uio_iovec_init(uio, zfs_uio_iter_iov(to), to->nr_segs, pos, - iov_iter_type(to) & ITER_KVEC ? UIO_SYSSPACE : UIO_USERSPACE, - count, skip); #else zfs_uio_iovec_init(uio, zfs_uio_iter_iov(to), to->nr_segs, pos, - to->type & ITER_KVEC ? UIO_SYSSPACE : UIO_USERSPACE, + zfs_uio_iov_iter_type(to) & ITER_KVEC ? + UIO_SYSSPACE : UIO_USERSPACE, count, skip); #endif } -- cgit v1.2.3 From a21ca18d4d2943667aa409a6dc2a9c8ad033e331 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=BD=D0=B0=D0=B1?= Date: Tue, 1 Aug 2023 17:50:17 +0200 Subject: linux: zfs: ctldir: set [amc]time to snapshot's creation property MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If looking up a snapdir inode failed, hold pool config – hold the snapshot – get its creation property – release it – release it, then use that as the [amc]time in the allocated inode. If that fails then fall back to current time. No performance impact since this is only done when allocating a new snapdir inode. 
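A condensed sketch of that sequence, as it lands in zfsctl_inode_lookup() in the hunk below; creation stays 0 on any failure, and zfsctl_inode_alloc() then falls back to current_time():

```
uint64_t creation = 0;
dsl_pool_t *pool = dmu_objset_pool(zfsvfs->z_os);
dsl_dataset_t *snap_ds;

dsl_pool_config_enter(pool, FTAG);
if (!dsl_dataset_hold_obj(pool, ZFSCTL_INO_SNAPDIRS - id,
    FTAG, &snap_ds)) {
	creation = dsl_get_creation(snap_ds);
	dsl_dataset_rele(snap_ds, FTAG);
}
dsl_pool_config_exit(pool, FTAG);
```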
Reviewed-by: Brian Behlendorf Signed-off-by: Ahelenia Ziemiańska Closes #15110 Closes #15117 --- module/os/linux/zfs/zfs_ctldir.c | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/module/os/linux/zfs/zfs_ctldir.c b/module/os/linux/zfs/zfs_ctldir.c index dca48e1e4010..c45a3eb5a4eb 100644 --- a/module/os/linux/zfs/zfs_ctldir.c +++ b/module/os/linux/zfs/zfs_ctldir.c @@ -478,17 +478,19 @@ zfsctl_is_snapdir(struct inode *ip) */ static struct inode * zfsctl_inode_alloc(zfsvfs_t *zfsvfs, uint64_t id, - const struct file_operations *fops, const struct inode_operations *ops) + const struct file_operations *fops, const struct inode_operations *ops, + uint64_t creation) { - inode_timespec_t now; struct inode *ip; znode_t *zp; + inode_timespec_t now = {.tv_sec = creation}; ip = new_inode(zfsvfs->z_sb); if (ip == NULL) return (NULL); - now = current_time(ip); + if (!creation) + now = current_time(ip); zp = ITOZ(ip); ASSERT3P(zp->z_dirlocks, ==, NULL); ASSERT3P(zp->z_acl_cached, ==, NULL); @@ -552,14 +554,28 @@ zfsctl_inode_lookup(zfsvfs_t *zfsvfs, uint64_t id, const struct file_operations *fops, const struct inode_operations *ops) { struct inode *ip = NULL; + uint64_t creation = 0; + dsl_dataset_t *snap_ds; + dsl_pool_t *pool; while (ip == NULL) { ip = ilookup(zfsvfs->z_sb, (unsigned long)id); if (ip) break; + if (id <= ZFSCTL_INO_SNAPDIRS && !creation) { + pool = dmu_objset_pool(zfsvfs->z_os); + dsl_pool_config_enter(pool, FTAG); + if (!dsl_dataset_hold_obj(pool, + ZFSCTL_INO_SNAPDIRS - id, FTAG, &snap_ds)) { + creation = dsl_get_creation(snap_ds); + dsl_dataset_rele(snap_ds, FTAG); + } + dsl_pool_config_exit(pool, FTAG); + } + /* May fail due to concurrent zfsctl_inode_alloc() */ - ip = zfsctl_inode_alloc(zfsvfs, id, fops, ops); + ip = zfsctl_inode_alloc(zfsvfs, id, fops, ops, creation); } return (ip); @@ -581,7 +597,7 @@ zfsctl_create(zfsvfs_t *zfsvfs) ASSERT(zfsvfs->z_ctldir == NULL); zfsvfs->z_ctldir = zfsctl_inode_alloc(zfsvfs, ZFSCTL_INO_ROOT, - &zpl_fops_root, &zpl_ops_root); + &zpl_fops_root, &zpl_ops_root, 0); if (zfsvfs->z_ctldir == NULL) return (SET_ERROR(ENOENT)); -- cgit v1.2.3 From 114a39964f5c8d89558d2823f1211f2258825ec7 Mon Sep 17 00:00:00 2001 From: Rob N Date: Wed, 2 Aug 2023 01:56:30 +1000 Subject: zdb: include cloned blocks in block statistics This gives `zdb -b` support for clone blocks. Previously, it didn't know what clones were, so would count their space allocation multiple times and then report leaked space (or, in debug, would assert trying to claim blocks a second time). This commit fixes those bugs, and reports the number of clones and the space "used" (saved) by them. Reviewed-by: Brian Behlendorf Reviewed-by: Kay Pedersen Signed-off-by: Rob Norris Sponsored-By: OpenDrives Inc. Sponsored-By: Klara Inc. 
Closes #15123 --- cmd/zdb/zdb.c | 77 ++++++++++++++++++++++++++++++++++++++++++++++++++++++- include/sys/brt.h | 1 + module/zfs/brt.c | 31 ++++++++++++++++++++++ 3 files changed, 108 insertions(+), 1 deletion(-) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 9568d2bbfe38..4b9921d47b81 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -79,6 +79,7 @@ #include #include #include +#include #include #include @@ -5342,12 +5343,20 @@ static const char *zdb_ot_extname[] = { #define ZB_TOTAL DN_MAX_LEVELS #define SPA_MAX_FOR_16M (SPA_MAXBLOCKSHIFT+1) +typedef struct zdb_brt_entry { + dva_t zbre_dva; + uint64_t zbre_refcount; + avl_node_t zbre_node; +} zdb_brt_entry_t; + typedef struct zdb_cb { zdb_blkstats_t zcb_type[ZB_TOTAL + 1][ZDB_OT_TOTAL + 1]; uint64_t zcb_removing_size; uint64_t zcb_checkpoint_size; uint64_t zcb_dedup_asize; uint64_t zcb_dedup_blocks; + uint64_t zcb_clone_asize; + uint64_t zcb_clone_blocks; uint64_t zcb_psize_count[SPA_MAX_FOR_16M]; uint64_t zcb_lsize_count[SPA_MAX_FOR_16M]; uint64_t zcb_asize_count[SPA_MAX_FOR_16M]; @@ -5368,6 +5377,8 @@ typedef struct zdb_cb { int zcb_haderrors; spa_t *zcb_spa; uint32_t **zcb_vd_obsolete_counts; + avl_tree_t zcb_brt; + boolean_t zcb_brt_is_active; } zdb_cb_t; /* test if two DVA offsets from same vdev are within the same metaslab */ @@ -5662,6 +5673,45 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp, zcb->zcb_asize_len[bin] += BP_GET_ASIZE(bp); zcb->zcb_asize_total += BP_GET_ASIZE(bp); + if (zcb->zcb_brt_is_active && brt_maybe_exists(zcb->zcb_spa, bp)) { + /* + * Cloned blocks are special. We need to count them, so we can + * later uncount them when reporting leaked space, and we must + * only claim them once. + * + * To do this, we keep our own in-memory BRT. For each block + * we haven't seen before, we look it up in the real BRT and + * if it's there, we note it and its refcount, then proceed as + * normal. If we see the block again, we count it as a clone + * and then give it no further consideration.
+ */ + zdb_brt_entry_t zbre_search, *zbre; + avl_index_t where; + + zbre_search.zbre_dva = bp->blk_dva[0]; + zbre = avl_find(&zcb->zcb_brt, &zbre_search, &where); + if (zbre != NULL) { + zcb->zcb_clone_asize += BP_GET_ASIZE(bp); + zcb->zcb_clone_blocks++; + + zbre->zbre_refcount--; + if (zbre->zbre_refcount == 0) { + avl_remove(&zcb->zcb_brt, zbre); + umem_free(zbre, sizeof (zdb_brt_entry_t)); + } + return; + } + + uint64_t crefcnt = brt_entry_get_refcount(zcb->zcb_spa, bp); + if (crefcnt > 0) { + zbre = umem_zalloc(sizeof (zdb_brt_entry_t), + UMEM_NOFAIL); + zbre->zbre_dva = bp->blk_dva[0]; + zbre->zbre_refcount = crefcnt; + avl_insert(&zcb->zcb_brt, zbre, where); + } + } + if (dump_opt['L']) return; @@ -6664,6 +6714,20 @@ deleted_livelists_dump_mos(spa_t *spa) iterate_deleted_livelists(spa, dump_livelist_cb, NULL); } +static int +zdb_brt_entry_compare(const void *zcn1, const void *zcn2) +{ + const dva_t *dva1 = &((const zdb_brt_entry_t *)zcn1)->zbre_dva; + const dva_t *dva2 = &((const zdb_brt_entry_t *)zcn2)->zbre_dva; + int cmp; + + cmp = TREE_CMP(DVA_GET_VDEV(dva1), DVA_GET_VDEV(dva2)); + if (cmp == 0) + cmp = TREE_CMP(DVA_GET_OFFSET(dva1), DVA_GET_OFFSET(dva2)); + + return (cmp); +} + static int dump_block_stats(spa_t *spa) { @@ -6678,6 +6742,13 @@ dump_block_stats(spa_t *spa) zcb = umem_zalloc(sizeof (zdb_cb_t), UMEM_NOFAIL); + if (spa_feature_is_active(spa, SPA_FEATURE_BLOCK_CLONING)) { + avl_create(&zcb->zcb_brt, zdb_brt_entry_compare, + sizeof (zdb_brt_entry_t), + offsetof(zdb_brt_entry_t, zbre_node)); + zcb->zcb_brt_is_active = B_TRUE; + } + (void) printf("\nTraversing all blocks %s%s%s%s%s...\n\n", (dump_opt['c'] || !dump_opt['L']) ? "to verify " : "", (dump_opt['c'] == 1) ? "metadata " : "", @@ -6779,7 +6850,8 @@ dump_block_stats(spa_t *spa) metaslab_class_get_alloc(spa_special_class(spa)) + metaslab_class_get_alloc(spa_dedup_class(spa)) + get_unflushed_alloc_space(spa); - total_found = tzb->zb_asize - zcb->zcb_dedup_asize + + total_found = + tzb->zb_asize - zcb->zcb_dedup_asize - zcb->zcb_clone_asize + zcb->zcb_removing_size + zcb->zcb_checkpoint_size; if (total_found == total_alloc && !dump_opt['L']) { @@ -6820,6 +6892,9 @@ dump_block_stats(spa_t *spa) "bp deduped:", (u_longlong_t)zcb->zcb_dedup_asize, (u_longlong_t)zcb->zcb_dedup_blocks, (double)zcb->zcb_dedup_asize / tzb->zb_asize + 1.0); + (void) printf("\t%-16s %14llu count: %6llu\n", + "bp cloned:", (u_longlong_t)zcb->zcb_clone_asize, + (u_longlong_t)zcb->zcb_clone_blocks); (void) printf("\t%-16s %14llu used: %5.2f%%\n", "Normal class:", (u_longlong_t)norm_alloc, 100.0 * norm_alloc / norm_space); diff --git a/include/sys/brt.h b/include/sys/brt.h index 0761159e3f5f..f73df95058d9 100644 --- a/include/sys/brt.h +++ b/include/sys/brt.h @@ -36,6 +36,7 @@ extern "C" { #endif extern boolean_t brt_entry_decref(spa_t *spa, const blkptr_t *bp); +extern uint64_t brt_entry_get_refcount(spa_t *spa, const blkptr_t *bp); extern uint64_t brt_get_dspace(spa_t *spa); extern uint64_t brt_get_used(spa_t *spa); diff --git a/module/zfs/brt.c b/module/zfs/brt.c index 877b503a1bf2..e8218fb26888 100644 --- a/module/zfs/brt.c +++ b/module/zfs/brt.c @@ -1544,6 +1544,37 @@ out: return (B_FALSE); } +uint64_t +brt_entry_get_refcount(spa_t *spa, const blkptr_t *bp) +{ + brt_t *brt = spa->spa_brt; + brt_vdev_t *brtvd; + brt_entry_t bre_search, *bre; + uint64_t vdevid, refcnt; + int error; + + brt_entry_fill(bp, &bre_search, &vdevid); + + brt_rlock(brt); + + brtvd = brt_vdev(brt, vdevid); + ASSERT(brtvd != NULL); + + bre = avl_find(&brtvd->bv_tree, 
&bre_search, NULL); + if (bre == NULL) { + error = brt_entry_lookup(brt, brtvd, &bre_search); + ASSERT(error == 0 || error == ENOENT); + if (error == ENOENT) + refcnt = 0; + else + refcnt = bre_search.bre_refcount; + } else + refcnt = bre->bre_refcount; + + brt_unlock(brt); + return (refcnt); +} + static void brt_prefetch(brt_t *brt, const blkptr_t *bp) { -- cgit v1.2.3 From fcd61d937f217baf4ae328781bd07e8bce5a6ce8 Mon Sep 17 00:00:00 2001 From: Zach Dykstra Date: Tue, 1 Aug 2023 11:01:32 -0500 Subject: readmmap.c: fix building with MUSL libc glibc includes sys/types.h from stdlib.h. This is not the case for MUSL, so explicitly include it. Fixes usage of uint_t. Reviewed-by: Brian Behlendorf Signed-off-by: Zach Dykstra Closes #15130 --- tests/zfs-tests/cmd/readmmap.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/zfs-tests/cmd/readmmap.c b/tests/zfs-tests/cmd/readmmap.c index 704ffd55c8a5..a5c8079d0e46 100644 --- a/tests/zfs-tests/cmd/readmmap.c +++ b/tests/zfs-tests/cmd/readmmap.c @@ -44,6 +44,7 @@ #include #include #include +#include #include int -- cgit v1.2.3 From ead3eea3e07cdcfa6ae02b3e565baddb6a0773db Mon Sep 17 00:00:00 2001 From: Rob N Date: Wed, 2 Aug 2023 04:31:11 +1000 Subject: linux/copy_file_range: properly request a fallback copy on Linux <5.3 Before Linux 5.3, the filesystem's copy_file_range handler had to signal back to the kernel that we can't fulfill the request and it should fallback to a content copy. This is done by returning -EOPNOTSUPP. This commit converts the EXDEV return from zfs_clone_range to EOPNOTSUPP, to force the kernel to fallback for all the valid reasons it might be unable to clone. Without it the copy_file_range() syscall will return EXDEV to userspace, breaking its semantics. Add test for copy_file_range fallbacks. copy_file_range should always fallback to a content copy whenever ZFS can't service the request with cloning. Reviewed-by: Brian Atkinson Reviewed-by: Brian Behlendorf Reviewed-by: Kay Pedersen Signed-off-by: Rob Norris Closes #15131 --- module/os/linux/zfs/zpl_file_range.c | 7 ++ tests/runfiles/linux.run | 1 + tests/test-runner/bin/zts-report.py.in | 2 + tests/zfs-tests/tests/Makefile.am | 1 + .../block_cloning_copyfilerange_fallback.ksh | 86 ++++++++++++++++++++++ 5 files changed, 97 insertions(+) create mode 100755 tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback.ksh diff --git a/module/os/linux/zfs/zpl_file_range.c b/module/os/linux/zfs/zpl_file_range.c index 18efebfc1dec..72384b638be5 100644 --- a/module/os/linux/zfs/zpl_file_range.c +++ b/module/os/linux/zfs/zpl_file_range.c @@ -106,6 +106,13 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off, if (ret == -EOPNOTSUPP || ret == -EXDEV) ret = generic_copy_file_range(src_file, src_off, dst_file, dst_off, len, flags); +#else + /* + * Before Linux 5.3 the filesystem has to return -EOPNOTSUPP to signal + * to the kernel that it should fallback to a content copy. 
+ */ + if (ret == -EXDEV) + ret = -EOPNOTSUPP; #endif /* HAVE_VFS_GENERIC_COPY_FILE_RANGE */ return (ret); diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index b68202d84924..4747b9837337 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -36,6 +36,7 @@ tags = ['functional', 'atime'] [tests/functional/block_cloning:Linux] tests = ['block_cloning_copyfilerange', 'block_cloning_copyfilerange_partial', + 'block_cloning_copyfilerange_fallback', 'block_cloning_ficlone', 'block_cloning_ficlonerange', 'block_cloning_ficlonerange_partial', 'block_cloning_disabled_copyfilerange', 'block_cloning_disabled_ficlone', diff --git a/tests/test-runner/bin/zts-report.py.in b/tests/test-runner/bin/zts-report.py.in index c9a2b4179aec..5c4b3a7bcdc1 100755 --- a/tests/test-runner/bin/zts-report.py.in +++ b/tests/test-runner/bin/zts-report.py.in @@ -300,6 +300,8 @@ elif sys.platform.startswith('linux'): ['SKIP', cfr_reason], 'block_cloning/block_cloning_copyfilerange_partial': ['SKIP', cfr_reason], + 'block_cloning/block_cloning_copyfilerange_fallback': + ['SKIP', cfr_reason], 'block_cloning/block_cloning_copyfilerange_cross_dataset': ['SKIP', cfr_cross_reason], }) diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 0819cb6b576e..3b6b2ef734d0 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -443,6 +443,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/block_cloning/block_cloning_copyfilerange_cross_dataset.ksh \ functional/block_cloning/block_cloning_copyfilerange.ksh \ functional/block_cloning/block_cloning_copyfilerange_partial.ksh \ + functional/block_cloning/block_cloning_copyfilerange_fallback.ksh \ functional/block_cloning/block_cloning_disabled_copyfilerange.ksh \ functional/block_cloning/block_cloning_disabled_ficlone.ksh \ functional/block_cloning/block_cloning_disabled_ficlonerange.ksh \ diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback.ksh b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback.ksh new file mode 100755 index 000000000000..87f99eb5c0f0 --- /dev/null +++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback.ksh @@ -0,0 +1,86 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2023, Klara Inc. +# Copyright (c) 2023, Rob Norris +# + +. $STF_SUITE/include/libtest.shlib +. 
$STF_SUITE/tests/functional/block_cloning/block_cloning.kshlib + +verify_runnable "global" + +if [[ $(linux_version) -lt $(linux_version "4.5") ]]; then + log_unsupported "copy_file_range not available before Linux 4.5" +fi + +claim="copy_file_range will fall back to copy when cloning not possible." + +log_assert $claim + +function cleanup +{ + datasetexists $TESTPOOL && destroy_pool $TESTPOOL +} + +log_onexit cleanup + +log_must zpool create -o feature@block_cloning=enabled $TESTPOOL $DISKS + +log_must dd if=/dev/urandom of=/$TESTPOOL/file bs=128K count=4 +log_must sync_pool $TESTPOOL + + +log_note "Copying entire file with copy_file_range" + +log_must clonefile -f /$TESTPOOL/file /$TESTPOOL/clone 0 0 524288 +log_must sync_pool $TESTPOOL + +log_must have_same_content /$TESTPOOL/file /$TESTPOOL/clone + +typeset blocks=$(unique_blocks $TESTPOOL file $TESTPOOL clone) +log_must [ "$blocks" = "1 2 3 4" ] + + +log_note "Copying within a block with copy_file_range" + +log_must clonefile -f /$TESTPOOL/file /$TESTPOOL/clone 32768 32768 65536 +log_must sync_pool $TESTPOOL + +log_must have_same_content /$TESTPOOL/file /$TESTPOOL/clone + +typeset blocks=$(unique_blocks $TESTPOOL file $TESTPOOL clone) +log_must [ "$blocks" = "2 3 4" ] + + +log_note "Copying across a block with copy_file_range" + +log_must clonefile -f /$TESTPOOL/file /$TESTPOOL/clone 327680 327680 131072 +log_must sync_pool $TESTPOOL + +log_must have_same_content /$TESTPOOL/file /$TESTPOOL/clone + +typeset blocks=$(unique_blocks $TESTPOOL file $TESTPOOL clone) +log_must [ "$blocks" = "2" ] + +log_pass $claim -- cgit v1.2.3 From 12373b0cc7c657d054543ff11139f5d15e66e560 Mon Sep 17 00:00:00 2001 From: Serapheim Dimitropoulos Date: Tue, 1 Aug 2023 14:47:00 -0700 Subject: zpool_vdev_remove() should handle EALREADY error return When the vdev properties feature was merged an extra check was added in `spa_vdev_remove_top_check()` which checked whether the vdev that we want to remove is already being removed and if so return an EALREADY error. ``` static int spa_vdev_remove_top_check(vdev_t *vd) { ... ... /* * This device is already being removed */ if (vd->vdev_removing) return (SET_ERROR(EALREADY)); ``` Before that change we'd still fail with an error but it was a more generic one - here is the check that failed later in the same function: ``` /* * There can not be a removal in progress. */ if (spa->spa_removing_phys.sr_state == DSS_SCANNING) return (SET_ERROR(EBUSY)); ``` Changing the error code returned from that function changed the behavior of the removal's library interface exposed to the userland - `spa_vdev_remove()` now returns `EZFS_UNKNOWN` instead of the `EZFS_BUSY` it returned before. This patch adds logic to make `spa_vdev_remove()` mindful of the new EALREADY code, propagating `EZFS_BUSY` and reverting to the previously established semantics of that function.
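The fix is a new case in the errno switch of zpool_vdev_remove(), mapping the kernel's EALREADY back onto the busy error, as the libzfs_pool.c hunk below shows:

```
case EALREADY:
	zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
	    "removal for this vdev is already in progress."));
	(void) zfs_error(hdl, EZFS_BUSY, errbuf);
	break;
```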
Reviewed-by: Mark Maybee Reviewed-by: Matthew Ahrens Signed-off-by: Serapheim Dimitropoulos Closes #15013 Closes #15129 --- lib/libzfs/libzfs_pool.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index d4af31c50cf8..85564edfd862 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -3926,6 +3926,12 @@ zpool_vdev_remove(zpool_handle_t *zhp, const char *path) switch (errno) { + case EALREADY: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "removal for this vdev is already in progress.")); + (void) zfs_error(hdl, EZFS_BUSY, errbuf); + break; + case EINVAL: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid config; all top-level vdevs must " -- cgit v1.2.3 From a5fdba1185eeb1b6596e656a3e6bf3ef39b87ea3 Mon Sep 17 00:00:00 2001 From: Brian Atkinson Date: Tue, 1 Aug 2023 17:48:19 -0400 Subject: Revert "Linux 6.5 compat: register_sysctl_table removed" This reverts commit b35374fd6474603170fd9a3c7503da6eb13ac712 as there are error messages when loading the SPL module. Errors seemed to be tied to a duplicate entry. Reviewed-by: Brian Behlendorf Signed-off-by: Brian Atkinson Closes #15134 --- config/kernel-register_sysctl_table.m4 | 27 --------------------------- config/kernel.m4 | 2 -- module/os/linux/spl/spl-proc.c | 26 +++----------------------- 3 files changed, 3 insertions(+), 52 deletions(-) delete mode 100644 config/kernel-register_sysctl_table.m4 diff --git a/config/kernel-register_sysctl_table.m4 b/config/kernel-register_sysctl_table.m4 deleted file mode 100644 index f18316b32b6d..000000000000 --- a/config/kernel-register_sysctl_table.m4 +++ /dev/null @@ -1,27 +0,0 @@ -dnl # -dnl # Linux 6.5 removes register_sysctl_table -dnl # -AC_DEFUN([ZFS_AC_KERNEL_SRC_REGISTER_SYSCTL_TABLE], [ - ZFS_LINUX_TEST_SRC([has_register_sysctl_table], [ - #include - - static struct ctl_table dummy_table[] = { - {} - }; - - ],[ - struct ctl_table_header *h - __attribute((unused)) = register_sysctl_table(dummy_table); - ]) -]) - -AC_DEFUN([ZFS_AC_KERNEL_REGISTER_SYSCTL_TABLE], [ - AC_MSG_CHECKING([whether register_sysctl_table exists]) - ZFS_LINUX_TEST_RESULT([has_register_sysctl_table], [ - AC_MSG_RESULT([yes]) - AC_DEFINE(HAVE_SYSCTL_REGISTER_TABLE, 1, - [sysctl_register_table exists]) - ],[ - AC_MSG_RESULT([no]) - ]) -]) diff --git a/config/kernel.m4 b/config/kernel.m4 index 28bd361d33ff..1487fa2e7793 100644 --- a/config/kernel.m4 +++ b/config/kernel.m4 @@ -160,7 +160,6 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ ZFS_AC_KERNEL_SRC_FILEMAP ZFS_AC_KERNEL_SRC_WRITEPAGE_T ZFS_AC_KERNEL_SRC_RECLAIMED - ZFS_AC_KERNEL_SRC_REGISTER_SYSCTL_TABLE case "$host_cpu" in powerpc*) ZFS_AC_KERNEL_SRC_CPU_HAS_FEATURE @@ -300,7 +299,6 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ ZFS_AC_KERNEL_FILEMAP ZFS_AC_KERNEL_WRITEPAGE_T ZFS_AC_KERNEL_RECLAIMED - ZFS_AC_KERNEL_REGISTER_SYSCTL_TABLE case "$host_cpu" in powerpc*) ZFS_AC_KERNEL_CPU_HAS_FEATURE diff --git a/module/os/linux/spl/spl-proc.c b/module/os/linux/spl/spl-proc.c index bcc356ae55b6..01f5619e1893 100644 --- a/module/os/linux/spl/spl-proc.c +++ b/module/os/linux/spl/spl-proc.c @@ -624,7 +624,6 @@ static struct ctl_table spl_table[] = { .mode = 0644, .proc_handler = &proc_dohostid, }, -#ifdef HAVE_REGISTER_SYSCTL_TABLE { .procname = "kmem", .mode = 0555, @@ -635,11 +634,9 @@ static struct ctl_table spl_table[] = { .mode = 0555, .child = spl_kstat_table, }, -#endif {}, }; -#ifdef HAVE_REGISTER_SYSCTL_TABLE static struct ctl_table spl_dir[] = { { .procname = "spl", @@ -651,38 +648,21 @@ static
struct ctl_table spl_dir[] = { static struct ctl_table spl_root[] = { { - .procname = "kernel", - .mode = 0555, - .child = spl_dir, + .procname = "kernel", + .mode = 0555, + .child = spl_dir, }, {} }; -#endif int spl_proc_init(void) { int rc = 0; -#ifdef HAVE_REGISTER_SYSCTL_TABLE spl_header = register_sysctl_table(spl_root); if (spl_header == NULL) return (-EUNATCH); -#else - spl_header = register_sysctl("kernel/spl", spl_table); - if (spl_header == NULL) - return (-EUNATCH); - - if (register_sysctl("kernel/spl/kmem", spl_kmem_table) == NULL) { - rc = -EUNATCH; - goto out; - } - - if (register_sysctl("kernel/spl/kstat", spl_kstat_table) == NULL) { - rc = -EUNATCH; - goto out; - } -#endif proc_spl = proc_mkdir("spl", NULL); if (proc_spl == NULL) { -- cgit v1.2.3 From e47e9bbe86f2e8fe5da0fc7c3a9014e1f8c132a9 Mon Sep 17 00:00:00 2001 From: Coleman Kane Date: Wed, 2 Aug 2023 17:05:46 -0400 Subject: Linux 6.5 compat: register_sysctl_table removed Additionally, the .child element of ctl_table has been removed in 6.5. This change adds a new test for the pre-6.5 register_sysctl_table() function, and uses the old code in that case. If it isn't found, then the parentage entries in the tables are removed, and the register_sysctl call is provided the paths of "kernel/spl", "kernel/spl/kmem", and "kernel/spl/kstat" directly, to populate each subdirectory over three calls, as is the new API. Reviewed-by: Brian Atkinson Reviewed-by: Brian Behlendorf Signed-off-by: Coleman Kane Closes #15138 --- config/kernel-register_sysctl_table.m4 | 27 +++++++++++++++++++++++++++ config/kernel.m4 | 2 ++ module/os/linux/spl/spl-proc.c | 26 +++++++++++++++++++++++--- 3 files changed, 52 insertions(+), 3 deletions(-) create mode 100644 config/kernel-register_sysctl_table.m4 diff --git a/config/kernel-register_sysctl_table.m4 b/config/kernel-register_sysctl_table.m4 new file mode 100644 index 000000000000..a5e934f56d29 --- /dev/null +++ b/config/kernel-register_sysctl_table.m4 @@ -0,0 +1,27 @@ +dnl # +dnl # Linux 6.5 removes register_sysctl_table +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_REGISTER_SYSCTL_TABLE], [ + ZFS_LINUX_TEST_SRC([has_register_sysctl_table], [ + #include + + static struct ctl_table dummy_table[] = { + {} + }; + + ],[ + struct ctl_table_header *h + __attribute((unused)) = register_sysctl_table(dummy_table); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_REGISTER_SYSCTL_TABLE], [ + AC_MSG_CHECKING([whether register_sysctl_table exists]) + ZFS_LINUX_TEST_RESULT([has_register_sysctl_table], [ + AC_MSG_RESULT([yes]) + AC_DEFINE(HAVE_REGISTER_SYSCTL_TABLE, 1, + [register_sysctl_table exists]) + ],[ + AC_MSG_RESULT([no]) + ]) +]) diff --git a/config/kernel.m4 b/config/kernel.m4 index 1487fa2e7793..28bd361d33ff 100644 --- a/config/kernel.m4 +++ b/config/kernel.m4 @@ -160,6 +160,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ ZFS_AC_KERNEL_SRC_FILEMAP ZFS_AC_KERNEL_SRC_WRITEPAGE_T ZFS_AC_KERNEL_SRC_RECLAIMED + ZFS_AC_KERNEL_SRC_REGISTER_SYSCTL_TABLE case "$host_cpu" in powerpc*) ZFS_AC_KERNEL_SRC_CPU_HAS_FEATURE @@ -299,6 +300,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ ZFS_AC_KERNEL_FILEMAP ZFS_AC_KERNEL_WRITEPAGE_T ZFS_AC_KERNEL_RECLAIMED + ZFS_AC_KERNEL_REGISTER_SYSCTL_TABLE case "$host_cpu" in powerpc*) ZFS_AC_KERNEL_CPU_HAS_FEATURE diff --git a/module/os/linux/spl/spl-proc.c b/module/os/linux/spl/spl-proc.c index 01f5619e1893..bcc356ae55b6 100644 --- a/module/os/linux/spl/spl-proc.c +++ b/module/os/linux/spl/spl-proc.c @@ -624,6 +624,7 @@ static struct ctl_table spl_table[] = { .mode = 0644, .proc_handler = &proc_dohostid, }, 
+#ifdef HAVE_REGISTER_SYSCTL_TABLE { .procname = "kmem", .mode = 0555, @@ -634,9 +635,11 @@ static struct ctl_table spl_table[] = { .mode = 0555, .child = spl_kstat_table, }, +#endif {}, }; +#ifdef HAVE_REGISTER_SYSCTL_TABLE static struct ctl_table spl_dir[] = { { .procname = "spl", @@ -648,21 +651,38 @@ static struct ctl_table spl_dir[] = { static struct ctl_table spl_root[] = { { - .procname = "kernel", - .mode = 0555, - .child = spl_dir, + .procname = "kernel", + .mode = 0555, + .child = spl_dir, }, {} }; +#endif int spl_proc_init(void) { int rc = 0; +#ifdef HAVE_REGISTER_SYSCTL_TABLE spl_header = register_sysctl_table(spl_root); if (spl_header == NULL) return (-EUNATCH); +#else + spl_header = register_sysctl("kernel/spl", spl_table); + if (spl_header == NULL) + return (-EUNATCH); + + if (register_sysctl("kernel/spl/kmem", spl_kmem_table) == NULL) { + rc = -EUNATCH; + goto out; + } + + if (register_sysctl("kernel/spl/kstat", spl_kstat_table) == NULL) { + rc = -EUNATCH; + goto out; + } +#endif proc_spl = proc_mkdir("spl", NULL); if (proc_spl == NULL) { -- cgit v1.2.3 From a97b8fc2dd9ed85640b81b0514ff7424e26fc32e Mon Sep 17 00:00:00 2001 From: Mateusz Piotrowski <0mp@FreeBSD.org> Date: Mon, 7 Aug 2023 22:53:59 +0200 Subject: Fix some typos Reviewed-by: Brian Behlendorf Signed-off-by: Mateusz Piotrowski <0mp@FreeBSD.org> Closes #15141 --- include/sys/metaslab_impl.h | 6 +++--- module/zfs/metaslab.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/sys/metaslab_impl.h b/include/sys/metaslab_impl.h index 5beb1b737775..d328068890cc 100644 --- a/include/sys/metaslab_impl.h +++ b/include/sys/metaslab_impl.h @@ -313,7 +313,7 @@ struct metaslab_group { * Each metaslab maintains a set of in-core trees to track metaslab * operations. The in-core free tree (ms_allocatable) contains the list of * free segments which are eligible for allocation. As blocks are - * allocated, the allocated segment are removed from the ms_allocatable and + * allocated, the allocated segments are removed from the ms_allocatable and * added to a per txg allocation tree (ms_allocating). As blocks are * freed, they are added to the free tree (ms_freeing). These trees * allow us to process all allocations and frees in syncing context @@ -366,9 +366,9 @@ struct metaslab_group { struct metaslab { /* * This is the main lock of the metaslab and its purpose is to - * coordinate our allocations and frees [e.g metaslab_block_alloc(), + * coordinate our allocations and frees [e.g., metaslab_block_alloc(), * metaslab_free_concrete(), ..etc] with our various syncing - * procedures [e.g. metaslab_sync(), metaslab_sync_done(), ..etc]. + * procedures [e.g., metaslab_sync(), metaslab_sync_done(), ..etc]. * * The lock is also used during some miscellaneous operations like * using the metaslab's histogram for the metaslab group's histogram diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index 8393e8dd91d5..20dc934593f1 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -1292,7 +1292,7 @@ metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor, /* * If this metaslab group is below its qmax or it's - * the only allocatable metasable group, then attempt + * the only allocatable metaslab group, then attempt * to allocate from it. */ if (qdepth < qmax || mc->mc_alloc_groups == 1) -- cgit v1.2.3 From 6c94e64963139e5e598a9880fe73cd326fa5453b Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Mon, 7 Aug 2023 16:54:41 -0400 Subject: Refactor dmu_prefetch(). 
- Split dmu_prefetch_dnode() from dmu_prefetch() into a separate function. It is quite inconvenient to read the code where len = 0 means dnode prefetch instead of indirect/data prefetch. One function doing both has no benefits, since the code paths are independent. - Improve dmu_prefetch() handling of long block ranges. Instead of limiting the L0 data length to prefetch to dmu_prefetch_max, make dmu_prefetch_max limit the actual amount of prefetch at the specified level, and, if there is more, prefetch all the rest at a higher indirection level. It should improve random access times within the prefetched range of any length, reducing the importance of the specific dmu_prefetch_max value. Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #15076 --- include/sys/dmu.h | 1 + module/os/freebsd/zfs/zfs_vnops_os.c | 4 +- module/os/linux/zfs/zfs_vnops_os.c | 7 +-- module/zfs/dmu.c | 103 +++++++++++++++++++++-------------- module/zfs/dsl_deadlist.c | 8 +-- module/zfs/spa_log_spacemap.c | 4 +- module/zfs/zvol.c | 2 +- 7 files changed, 72 insertions(+), 57 deletions(-) diff --git a/include/sys/dmu.h b/include/sys/dmu.h index 7e57d133c2ec..a84175c980d7 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -885,6 +885,7 @@ extern uint_t zfs_max_recordsize; */ void dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset, uint64_t len, enum zio_priority pri); +void dmu_prefetch_dnode(objset_t *os, uint64_t object, enum zio_priority pri); typedef struct dmu_object_info { /* All sizes are in bytes unless otherwise indicated. */ diff --git a/module/os/freebsd/zfs/zfs_vnops_os.c b/module/os/freebsd/zfs/zfs_vnops_os.c index 45cf6fdfc409..e80e29c2da32 100644 --- a/module/os/freebsd/zfs/zfs_vnops_os.c +++ b/module/os/freebsd/zfs/zfs_vnops_os.c @@ -1869,10 +1869,8 @@ zfs_readdir(vnode_t *vp, zfs_uio_t *uio, cred_t *cr, int *eofp, ASSERT3S(outcount, <=, bufsize); - /* Prefetch znode */ if (prefetch) - dmu_prefetch(os, objnum, 0, 0, 0, - ZIO_PRIORITY_SYNC_READ); + dmu_prefetch_dnode(os, objnum, ZIO_PRIORITY_SYNC_READ); /* * Move to the next entry, fill in the previous offset. diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c index 33baac9db06b..b7d44f344daf 100644 --- a/module/os/linux/zfs/zfs_vnops_os.c +++ b/module/os/linux/zfs/zfs_vnops_os.c @@ -1610,11 +1610,8 @@ zfs_readdir(struct inode *ip, zpl_dir_context_t *ctx, cred_t *cr) if (done) break; - /* Prefetch znode */ - if (prefetch) { - dmu_prefetch(os, objnum, 0, 0, 0, - ZIO_PRIORITY_SYNC_READ); - } + if (prefetch) + dmu_prefetch_dnode(os, objnum, ZIO_PRIORITY_SYNC_READ); /* * Move to the next entry, fill in the previous offset. diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 078811dbf4e3..c2565b50b7d0 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -694,74 +694,93 @@ dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, const void *tag) } /* - * Issue prefetch i/os for the given blocks. If level is greater than 0, the + * Issue prefetch I/Os for the given blocks. If level is greater than 0, the * indirect blocks prefetched will be those that point to the blocks containing - * the data starting at offset, and continuing to offset + len. + * the data starting at offset, and continuing to offset + len. If the range + * is too long, prefetch the first dmu_prefetch_max bytes as requested, while + * for the rest only a higher level, also fitting within dmu_prefetch_max.
It + * should primarily help random reads, since for long sequential reads there is + * a speculative prefetcher. * * Note that if the indirect blocks above the blocks being prefetched are not - * in cache, they will be asynchronously read in. + * in cache, they will be asynchronously read in. Dnode read by dnode_hold() + * is currently synchronous. */ void dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset, uint64_t len, zio_priority_t pri) { dnode_t *dn; - uint64_t blkid; - int nblks, err; - - if (len == 0) { /* they're interested in the bonus buffer */ - dn = DMU_META_DNODE(os); + int64_t level2 = level; + uint64_t start, end, start2, end2; - if (object == 0 || object >= DN_MAX_OBJECT) - return; - - rw_enter(&dn->dn_struct_rwlock, RW_READER); - blkid = dbuf_whichblock(dn, level, - object * sizeof (dnode_phys_t)); - dbuf_prefetch(dn, level, blkid, pri, 0); - rw_exit(&dn->dn_struct_rwlock); + if (dmu_prefetch_max == 0 || len == 0) { + dmu_prefetch_dnode(os, object, pri); return; } - /* - * See comment before the definition of dmu_prefetch_max. - */ - len = MIN(len, dmu_prefetch_max); - - /* - * XXX - Note, if the dnode for the requested object is not - * already cached, we will do a *synchronous* read in the - * dnode_hold() call. The same is true for any indirects. - */ - err = dnode_hold(os, object, FTAG, &dn); - if (err != 0) + if (dnode_hold(os, object, FTAG, &dn) != 0) return; /* - * offset + len - 1 is the last byte we want to prefetch for, and offset - * is the first. Then dbuf_whichblk(dn, level, off + len - 1) is the - * last block we want to prefetch, and dbuf_whichblock(dn, level, - * offset) is the first. Then the number we need to prefetch is the - * last - first + 1. + * Depending on len we may do two prefetches: blocks [start, end) at + * level, and following blocks [start2, end2) at higher level2. */ rw_enter(&dn->dn_struct_rwlock, RW_READER); - if (level > 0 || dn->dn_datablkshift != 0) { - nblks = dbuf_whichblock(dn, level, offset + len - 1) - - dbuf_whichblock(dn, level, offset) + 1; + if (dn->dn_datablkshift != 0) { + /* + * The object has multiple blocks. Calculate the full range + * of blocks [start, end2) and then split it into two parts, + * so that the first [start, end) fits into dmu_prefetch_max. + */ + start = dbuf_whichblock(dn, level, offset); + end2 = dbuf_whichblock(dn, level, offset + len - 1) + 1; + uint8_t ibs = dn->dn_indblkshift; + uint8_t bs = (level == 0) ? dn->dn_datablkshift : ibs; + uint_t limit = P2ROUNDUP(dmu_prefetch_max, 1 << bs) >> bs; + start2 = end = MIN(end2, start + limit); + + /* + * Find level2 where [start2, end2) fits into dmu_prefetch_max. + */ + uint8_t ibps = ibs - SPA_BLKPTRSHIFT; + limit = P2ROUNDUP(dmu_prefetch_max, 1 << ibs) >> ibs; + do { + level2++; + start2 = P2ROUNDUP(start2, 1 << ibps) >> ibps; + end2 = P2ROUNDUP(end2, 1 << ibps) >> ibps; + } while (end2 - start2 > limit); } else { - nblks = (offset < dn->dn_datablksz); + /* There is only one block. Prefetch it or nothing. */ + start = start2 = end2 = 0; + end = start + (level == 0 && offset < dn->dn_datablksz); } - if (nblks != 0) { - blkid = dbuf_whichblock(dn, level, offset); - for (int i = 0; i < nblks; i++) - dbuf_prefetch(dn, level, blkid + i, pri, 0); - } + for (uint64_t i = start; i < end; i++) + dbuf_prefetch(dn, level, i, pri, 0); + for (uint64_t i = start2; i < end2; i++) + dbuf_prefetch(dn, level2, i, pri, 0); rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); } +/* + * Issue prefetch I/Os for the given object's dnode. 
+ */ +void +dmu_prefetch_dnode(objset_t *os, uint64_t object, zio_priority_t pri) +{ + if (object == 0 || object >= DN_MAX_OBJECT) + return; + + dnode_t *dn = DMU_META_DNODE(os); + rw_enter(&dn->dn_struct_rwlock, RW_READER); + uint64_t blkid = dbuf_whichblock(dn, 0, object * sizeof (dnode_phys_t)); + dbuf_prefetch(dn, 0, blkid, pri, 0); + rw_exit(&dn->dn_struct_rwlock); +} + /* * Get the next "chunk" of file data to free. We traverse the file from * the end so that the file gets shorter over time (if we crashes in the diff --git a/module/zfs/dsl_deadlist.c b/module/zfs/dsl_deadlist.c index 47c234f76c40..2832294b6974 100644 --- a/module/zfs/dsl_deadlist.c +++ b/module/zfs/dsl_deadlist.c @@ -173,8 +173,8 @@ dsl_deadlist_load_tree(dsl_deadlist_t *dl) * in parallel. Then open them all in a second pass. */ dle->dle_bpobj.bpo_object = za.za_first_integer; - dmu_prefetch(dl->dl_os, dle->dle_bpobj.bpo_object, - 0, 0, 0, ZIO_PRIORITY_SYNC_READ); + dmu_prefetch_dnode(dl->dl_os, dle->dle_bpobj.bpo_object, + ZIO_PRIORITY_SYNC_READ); avl_add(&dl->dl_tree, dle); } @@ -235,8 +235,8 @@ dsl_deadlist_load_cache(dsl_deadlist_t *dl) * in parallel. Then open them all in a second pass. */ dlce->dlce_bpobj = za.za_first_integer; - dmu_prefetch(dl->dl_os, dlce->dlce_bpobj, - 0, 0, 0, ZIO_PRIORITY_SYNC_READ); + dmu_prefetch_dnode(dl->dl_os, dlce->dlce_bpobj, + ZIO_PRIORITY_SYNC_READ); avl_add(&dl->dl_cache, dlce); } VERIFY3U(error, ==, ENOENT); diff --git a/module/zfs/spa_log_spacemap.c b/module/zfs/spa_log_spacemap.c index 2878e68c6e4b..cf05158b63f8 100644 --- a/module/zfs/spa_log_spacemap.c +++ b/module/zfs/spa_log_spacemap.c @@ -1147,8 +1147,8 @@ spa_ld_log_sm_data(spa_t *spa) /* Prefetch log spacemaps dnodes. */ for (sls = avl_first(&spa->spa_sm_logs_by_txg); sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) { - dmu_prefetch(spa_meta_objset(spa), sls->sls_sm_obj, - 0, 0, 0, ZIO_PRIORITY_SYNC_READ); + dmu_prefetch_dnode(spa_meta_objset(spa), sls->sls_sm_obj, + ZIO_PRIORITY_SYNC_READ); } uint_t pn = 0; diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c index cd4e6f0c7558..547687f07ed5 100644 --- a/module/zfs/zvol.c +++ b/module/zfs/zvol.c @@ -981,7 +981,7 @@ zvol_prefetch_minors_impl(void *arg) job->error = dmu_objset_own(dsname, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os); if (job->error == 0) { - dmu_prefetch(os, ZVOL_OBJ, 0, 0, 0, ZIO_PRIORITY_SYNC_READ); + dmu_prefetch_dnode(os, ZVOL_OBJ, ZIO_PRIORITY_SYNC_READ); dmu_objset_disown(os, B_TRUE, FTAG); } } -- cgit v1.2.3 From fdb8fff916f47ffe292313f743a241406be7a1ef Mon Sep 17 00:00:00 2001 From: Ryan Lahfa Date: Mon, 7 Aug 2023 22:55:59 +0200 Subject: linux/spl/kmem_cache: undefine `kmem_cache_alloc` before defining it When compiling a kernel with bcachefs and zfs, the two definitions of kmem_cache_alloc will collide, making it impossible to have both filesystems. It is sufficient to just undefine the macro before redefining it. As for why this should be in ZFS rather than bcachefs: currently, bcachefs is not an in-tree filesystem, but it has a reasonably high chance of getting included soon. This avoids the breakage in ZFS early; this patch may be distributed downstream in NixOS and is already used there.
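The guard itself is small: any prior definition of the name is dropped immediately before the SPL macro is installed, per the kmem_cache.h hunk below:

```
/* Drop a colliding definition (e.g. bcachefs's) before redefining. */
#ifdef kmem_cache_alloc
#undef kmem_cache_alloc
#endif
#define kmem_cache_alloc(skc, flags) spl_kmem_cache_alloc(skc, flags)
```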
Reviewed-by: Brian Atkinson Reviewed-by: Brian Behlendorf Signed-off-by: Ryan Lahfa Closes #15144 --- include/os/linux/spl/sys/kmem_cache.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/include/os/linux/spl/sys/kmem_cache.h b/include/os/linux/spl/sys/kmem_cache.h index cc9cafa84f99..20eeadc46e10 100644 --- a/include/os/linux/spl/sys/kmem_cache.h +++ b/include/os/linux/spl/sys/kmem_cache.h @@ -198,6 +198,14 @@ extern uint64_t spl_kmem_cache_entry_size(kmem_cache_t *cache); spl_kmem_cache_create(name, size, align, ctor, dtor, rclm, priv, vmp, fl) #define kmem_cache_set_move(skc, move) spl_kmem_cache_set_move(skc, move) #define kmem_cache_destroy(skc) spl_kmem_cache_destroy(skc) +/* + * This is necessary to be compatible with other kernel modules + * or in-tree filesystem that may define kmem_cache_alloc, + * like bcachefs does it now. + */ +#ifdef kmem_cache_alloc +#undef kmem_cache_alloc +#endif #define kmem_cache_alloc(skc, flags) spl_kmem_cache_alloc(skc, flags) #define kmem_cache_free(skc, obj) spl_kmem_cache_free(skc, obj) #define kmem_cache_reap_now(skc) spl_kmem_cache_reap_now(skc) -- cgit v1.2.3 From 36261c8238df462b214854ccea1df4f060cf0995 Mon Sep 17 00:00:00 2001 From: Coleman Kane Date: Mon, 7 Aug 2023 18:47:46 -0400 Subject: Linux 6.5 compat: replace generic_file_splice_read with filemap_splice_read The generic_file_splice_read function was removed in Linux 6.5 in favor of filemap_splice_read. Add an autoconf test for filemap_splice_read and use it if it is found as the handler for .splice_read in the file_operations struct. Additionally, ITER_PIPE was removed in 6.5. This change removes the ITER_* macros that OpenZFS doesn't use from being tested in config/kernel-vfs-iov_iter.m4. The removal of ITER_PIPE was causing the test to fail, which also affected the code responsible for setting the .splice_read handler, above. That behavior caused run-time panics on Linux 6.5. Reviewed-by: Brian Atkinson Reviewed-by: Brian Behlendorf Signed-off-by: Coleman Kane Closes #15155 --- config/kernel-filemap-splice-read.m4 | 25 +++++++++++++++++++++++++ config/kernel-vfs-iov_iter.m4 | 3 +-- config/kernel.m4 | 2 ++ module/os/linux/zfs/zpl_file.c | 4 ++++ 4 files changed, 32 insertions(+), 2 deletions(-) create mode 100644 config/kernel-filemap-splice-read.m4 diff --git a/config/kernel-filemap-splice-read.m4 b/config/kernel-filemap-splice-read.m4 new file mode 100644 index 000000000000..5199b7373e4d --- /dev/null +++ b/config/kernel-filemap-splice-read.m4 @@ -0,0 +1,25 @@ +AC_DEFUN([ZFS_AC_KERNEL_SRC_FILEMAP_SPLICE_READ], [ + dnl # + dnl # Kernel 6.5 - generic_file_splice_read was removed in favor + dnl # of filemap_splice_read for the .splice_read member of the + dnl # file_operations struct. 
+ dnl # + ZFS_LINUX_TEST_SRC([has_filemap_splice_read], [ + #include <linux/fs.h> + + struct file_operations fops __attribute__((unused)) = { + .splice_read = filemap_splice_read, + }; + ],[]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_FILEMAP_SPLICE_READ], [ + AC_MSG_CHECKING([whether filemap_splice_read() exists]) + ZFS_LINUX_TEST_RESULT([has_filemap_splice_read], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_FILEMAP_SPLICE_READ, 1, + [filemap_splice_read exists]) + ],[ + AC_MSG_RESULT(no) + ]) +]) diff --git a/config/kernel-vfs-iov_iter.m4 b/config/kernel-vfs-iov_iter.m4 index cc5a7ab0c237..ff560ff3eef0 100644 --- a/config/kernel-vfs-iov_iter.m4 +++ b/config/kernel-vfs-iov_iter.m4 @@ -6,8 +6,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_IOV_ITER], [ #include <linux/fs.h> #include <linux/uio.h> ],[ - int type __attribute__ ((unused)) = - ITER_IOVEC | ITER_KVEC | ITER_BVEC | ITER_PIPE; + int type __attribute__ ((unused)) = ITER_KVEC; ]) ZFS_LINUX_TEST_SRC([iov_iter_advance], [ diff --git a/config/kernel.m4 b/config/kernel.m4 index 28bd361d33ff..309f1819be48 100644 --- a/config/kernel.m4 +++ b/config/kernel.m4 @@ -161,6 +161,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ ZFS_AC_KERNEL_SRC_WRITEPAGE_T ZFS_AC_KERNEL_SRC_RECLAIMED ZFS_AC_KERNEL_SRC_REGISTER_SYSCTL_TABLE + ZFS_AC_KERNEL_SRC_FILEMAP_SPLICE_READ case "$host_cpu" in powerpc*) ZFS_AC_KERNEL_SRC_CPU_HAS_FEATURE @@ -301,6 +302,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ ZFS_AC_KERNEL_WRITEPAGE_T ZFS_AC_KERNEL_RECLAIMED ZFS_AC_KERNEL_REGISTER_SYSCTL_TABLE + ZFS_AC_KERNEL_FILEMAP_SPLICE_READ case "$host_cpu" in powerpc*) ZFS_AC_KERNEL_CPU_HAS_FEATURE diff --git a/module/os/linux/zfs/zpl_file.c b/module/os/linux/zfs/zpl_file.c index f6af2ebd1163..24cc1064a8fc 100644 --- a/module/os/linux/zfs/zpl_file.c +++ b/module/os/linux/zfs/zpl_file.c @@ -1323,7 +1323,11 @@ const struct file_operations zpl_file_operations = { .read_iter = zpl_iter_read, .write_iter = zpl_iter_write, #ifdef HAVE_VFS_IOV_ITER +#ifdef HAVE_FILEMAP_SPLICE_READ + .splice_read = filemap_splice_read, +#else .splice_read = generic_file_splice_read, +#endif .splice_write = iter_file_splice_write, #endif #else -- cgit v1.2.3 From 683edb32b73885d1718a2220fe08c4cd61e63fd1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=BD=D0=B0=D0=B1?= Date: Tue, 8 Aug 2023 18:35:35 +0200 Subject: libzfs: sendrecv: send_progress_thread: handle SIGINFO/SIGUSR1 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit POSIX timers target the process, not a specific thread (as does SIGINFO), so we need to block these signals in the main thread, which would otherwise die if interrupted.
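
To sketch the routing this depends on: a process-directed signal is delivered to one thread that has it unblocked, so once the main thread blocks SIGUSR1, only the progress thread can take the timer ticks. A hypothetical standalone demo, not libzfs code (compile with -pthread, plus -lrt on older glibc):

#include <pthread.h>
#include <signal.h>
#include <time.h>
#include <unistd.h>

static void
tick(int sig, siginfo_t *info, void *ctx)
{
	(void) sig, (void) ctx;
	/* SI_TIMER distinguishes the timer from a user-sent SIGUSR1. */
	if (info->si_code == SI_TIMER)
		(void) write(STDERR_FILENO, "tick\n", 5);
}

static void *
progress(void *arg)
{
	(void) arg;
	struct sigaction sa = {.sa_sigaction = tick, .sa_flags = SA_SIGINFO};
	struct sigevent ev = {.sigev_notify = SIGEV_SIGNAL,
	    .sigev_signo = SIGUSR1};
	const struct itimerspec it =
	    {.it_value = {.tv_sec = 1}, .it_interval = {.tv_sec = 1}};
	timer_t t;

	sigaction(SIGUSR1, &sa, NULL);
	if (timer_create(CLOCK_MONOTONIC, &ev, &t) != 0)
		return (NULL);
	(void) timer_settime(t, 0, &it, NULL);
	for (int i = 0; i < 3; i++)
		pause();	/* woken by SIGUSR1, like the send loop */
	timer_delete(t);
	return (NULL);
}

int
main(void)
{
	pthread_t tid;
	sigset_t mask;

	pthread_create(&tid, NULL, progress, NULL);

	/* Same order as the patch: spawn first, then block in the parent. */
	sigemptyset(&mask);
	sigaddset(&mask, SIGUSR1);
	pthread_sigmask(SIG_BLOCK, &mask, NULL);

	pthread_join(tid, NULL);
	return (0);
}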
Ref: https://101010.pl/@ed1conf@bsd.network/110731819189629373 Reviewed-by: Brian Behlendorf Reviewed-by: Jorgen Lundman Signed-off-by: Ahelenia Ziemiańska Closes #15113 --- lib/libzfs/Makefile.am | 2 +- lib/libzfs/libzfs_sendrecv.c | 95 ++++++++++++++++++++++++++++++++++++-------- man/man8/zfs-send.8 | 18 ++++++++- 3 files changed, 96 insertions(+), 19 deletions(-) diff --git a/lib/libzfs/Makefile.am b/lib/libzfs/Makefile.am index cffe341220c2..5e74d908de3d 100644 --- a/lib/libzfs/Makefile.am +++ b/lib/libzfs/Makefile.am @@ -57,7 +57,7 @@ libzfs_la_LIBADD = \ libzutil.la \ libuutil.la -libzfs_la_LIBADD += -lm $(LIBCRYPTO_LIBS) $(ZLIB_LIBS) $(LIBFETCH_LIBS) $(LTLIBINTL) +libzfs_la_LIBADD += -lrt -lm $(LIBCRYPTO_LIBS) $(ZLIB_LIBS) $(LIBFETCH_LIBS) $(LTLIBINTL) libzfs_la_LDFLAGS = -pthread diff --git a/lib/libzfs/libzfs_sendrecv.c b/lib/libzfs/libzfs_sendrecv.c index 87a30f54fea8..e9bc78aa8d39 100644 --- a/lib/libzfs/libzfs_sendrecv.c +++ b/lib/libzfs/libzfs_sendrecv.c @@ -928,6 +928,39 @@ zfs_send_progress(zfs_handle_t *zhp, int fd, uint64_t *bytes_written, return (0); } +static volatile boolean_t send_progress_thread_signal_duetotimer; +static void +send_progress_thread_act(int sig, siginfo_t *info, void *ucontext) +{ + (void) sig, (void) ucontext; + send_progress_thread_signal_duetotimer = info->si_code == SI_TIMER; +} + +struct timer_desirability { + timer_t timer; + boolean_t desired; +}; +static void +timer_delete_cleanup(void *timer) +{ + struct timer_desirability *td = timer; + if (td->desired) + timer_delete(td->timer); +} + +#ifdef SIGINFO +#define SEND_PROGRESS_THREAD_PARENT_BLOCK_SIGINFO sigaddset(&new, SIGINFO) +#else +#define SEND_PROGRESS_THREAD_PARENT_BLOCK_SIGINFO +#endif +#define SEND_PROGRESS_THREAD_PARENT_BLOCK(old) { \ + sigset_t new; \ + sigemptyset(&new); \ + sigaddset(&new, SIGUSR1); \ + SEND_PROGRESS_THREAD_PARENT_BLOCK_SIGINFO; \ + pthread_sigmask(SIG_BLOCK, &new, old); \ +} + static void * send_progress_thread(void *arg) { @@ -941,6 +974,26 @@ send_progress_thread(void *arg) struct tm tm; int err; + const struct sigaction signal_action = + {.sa_sigaction = send_progress_thread_act, .sa_flags = SA_SIGINFO}; + struct sigevent timer_cfg = + {.sigev_notify = SIGEV_SIGNAL, .sigev_signo = SIGUSR1}; + const struct itimerspec timer_time = + {.it_value = {.tv_sec = 1}, .it_interval = {.tv_sec = 1}}; + struct timer_desirability timer = {}; + + sigaction(SIGUSR1, &signal_action, NULL); +#ifdef SIGINFO + sigaction(SIGINFO, &signal_action, NULL); +#endif + + if ((timer.desired = pa->pa_progress || pa->pa_astitle)) { + if (timer_create(CLOCK_MONOTONIC, &timer_cfg, &timer.timer)) + return ((void *)(uintptr_t)errno); + (void) timer_settime(timer.timer, 0, &timer_time, NULL); + } + pthread_cleanup_push(timer_delete_cleanup, &timer); + if (!pa->pa_parsable && pa->pa_progress) { (void) fprintf(stderr, "TIME %s %sSNAPSHOT %s\n", @@ -953,12 +1006,12 @@ send_progress_thread(void *arg) * Print the progress from ZFS_IOC_SEND_PROGRESS every second. 
*/ for (;;) { - (void) sleep(1); + pause(); if ((err = zfs_send_progress(zhp, pa->pa_fd, &bytes, &blocks)) != 0) { if (err == EINTR || err == ENOENT) - return ((void *)0); - return ((void *)(uintptr_t)err); + err = 0; + pthread_exit(((void *)(uintptr_t)err)); } (void) time(&t); @@ -991,21 +1044,25 @@ send_progress_thread(void *arg) (void) fprintf(stderr, "%02d:%02d:%02d\t%llu\t%s\n", tm.tm_hour, tm.tm_min, tm.tm_sec, (u_longlong_t)bytes, zhp->zfs_name); - } else if (pa->pa_progress) { + } else if (pa->pa_progress || + !send_progress_thread_signal_duetotimer) { zfs_nicebytes(bytes, buf, sizeof (buf)); (void) fprintf(stderr, "%02d:%02d:%02d %5s %s\n", tm.tm_hour, tm.tm_min, tm.tm_sec, buf, zhp->zfs_name); } } + pthread_cleanup_pop(B_TRUE); } static boolean_t -send_progress_thread_exit(libzfs_handle_t *hdl, pthread_t ptid) +send_progress_thread_exit( + libzfs_handle_t *hdl, pthread_t ptid, sigset_t *oldmask) { void *status = NULL; (void) pthread_cancel(ptid); (void) pthread_join(ptid, &status); + pthread_sigmask(SIG_SETMASK, oldmask, NULL); int error = (int)(uintptr_t)status; if (error != 0 && status != PTHREAD_CANCELED) return (zfs_standard_error(hdl, error, @@ -1199,7 +1256,8 @@ dump_snapshot(zfs_handle_t *zhp, void *arg) * If progress reporting is requested, spawn a new thread to * poll ZFS_IOC_SEND_PROGRESS at a regular interval. */ - if (sdd->progress || sdd->progressastitle) { + sigset_t oldmask; + { pa.pa_zhp = zhp; pa.pa_fd = sdd->outfd; pa.pa_parsable = sdd->parsable; @@ -1214,13 +1272,13 @@ dump_snapshot(zfs_handle_t *zhp, void *arg) zfs_close(zhp); return (err); } + SEND_PROGRESS_THREAD_PARENT_BLOCK(&oldmask); } err = dump_ioctl(zhp, sdd->prevsnap, sdd->prevsnap_obj, fromorigin, sdd->outfd, flags, sdd->debugnv); - if ((sdd->progress || sdd->progressastitle) && - send_progress_thread_exit(zhp->zfs_hdl, tid)) + if (send_progress_thread_exit(zhp->zfs_hdl, tid, &oldmask)) return (-1); } @@ -1562,8 +1620,9 @@ estimate_size(zfs_handle_t *zhp, const char *from, int fd, sendflags_t *flags, progress_arg_t pa = { 0 }; int err = 0; pthread_t ptid; + sigset_t oldmask; - if (flags->progress || flags->progressastitle) { + { pa.pa_zhp = zhp; pa.pa_fd = fd; pa.pa_parsable = flags->parsable; @@ -1577,6 +1636,7 @@ estimate_size(zfs_handle_t *zhp, const char *from, int fd, sendflags_t *flags, return (zfs_error(zhp->zfs_hdl, EZFS_THREADCREATEFAILED, errbuf)); } + SEND_PROGRESS_THREAD_PARENT_BLOCK(&oldmask); } err = lzc_send_space_resume_redacted(zhp->zfs_name, from, @@ -1584,8 +1644,7 @@ estimate_size(zfs_handle_t *zhp, const char *from, int fd, sendflags_t *flags, redactbook, fd, &size); *sizep = size; - if ((flags->progress || flags->progressastitle) && - send_progress_thread_exit(zhp->zfs_hdl, ptid)) + if (send_progress_thread_exit(zhp->zfs_hdl, ptid, &oldmask)) return (-1); if (!flags->progress && !flags->parsable) @@ -1876,11 +1935,12 @@ zfs_send_resume_impl_cb_impl(libzfs_handle_t *hdl, sendflags_t *flags, if (!flags->dryrun) { progress_arg_t pa = { 0 }; pthread_t tid; + sigset_t oldmask; /* * If progress reporting is requested, spawn a new thread to * poll ZFS_IOC_SEND_PROGRESS at a regular interval. 
*/ - if (flags->progress || flags->progressastitle) { + { pa.pa_zhp = zhp; pa.pa_fd = outfd; pa.pa_parsable = flags->parsable; @@ -1898,6 +1958,7 @@ zfs_send_resume_impl_cb_impl(libzfs_handle_t *hdl, sendflags_t *flags, zfs_close(zhp); return (error); } + SEND_PROGRESS_THREAD_PARENT_BLOCK(&oldmask); } error = lzc_send_resume_redacted(zhp->zfs_name, fromname, outfd, @@ -1905,8 +1966,7 @@ zfs_send_resume_impl_cb_impl(libzfs_handle_t *hdl, sendflags_t *flags, if (redact_book != NULL) free(redact_book); - if ((flags->progressastitle || flags->progress) && - send_progress_thread_exit(hdl, tid)) { + if (send_progress_thread_exit(hdl, tid, &oldmask)) { zfs_close(zhp); return (-1); } @@ -2691,7 +2751,8 @@ zfs_send_one_cb_impl(zfs_handle_t *zhp, const char *from, int fd, * If progress reporting is requested, spawn a new thread to poll * ZFS_IOC_SEND_PROGRESS at a regular interval. */ - if (flags->progress || flags->progressastitle) { + sigset_t oldmask; + { pa.pa_zhp = zhp; pa.pa_fd = fd; pa.pa_parsable = flags->parsable; @@ -2708,13 +2769,13 @@ zfs_send_one_cb_impl(zfs_handle_t *zhp, const char *from, int fd, return (zfs_error(zhp->zfs_hdl, EZFS_THREADCREATEFAILED, errbuf)); } + SEND_PROGRESS_THREAD_PARENT_BLOCK(&oldmask); } err = lzc_send_redacted(name, from, fd, lzc_flags_from_sendflags(flags), redactbook); - if ((flags->progress || flags->progressastitle) && - send_progress_thread_exit(hdl, ptid) + if (send_progress_thread_exit(hdl, ptid, &oldmask)) return (-1); if (err == 0 && (flags->props || flags->holds || flags->backup)) { diff --git a/man/man8/zfs-send.8 b/man/man8/zfs-send.8 index 8cc6ae6ad59b..ba604bf77855 100644 --- a/man/man8/zfs-send.8 +++ b/man/man8/zfs-send.8 @@ -29,7 +29,7 @@ .\" Copyright 2018 Nexenta Systems, Inc. .\" Copyright 2019 Joyent, Inc. .\" -.Dd January 12, 2023 +.Dd July 27, 2023 .Dt ZFS-SEND 8 .Os . @@ -297,6 +297,12 @@ This flag can only be used in conjunction with .It Fl v , -verbose Print verbose information about the stream package generated. This information includes a per-second report of how much data has been sent. +The same report can be requested by sending +.Dv SIGINFO +or +.Dv SIGUSR1 , +regardless of +.Fl v . .Pp The format of the stream is committed. You will be able to receive your streams on future versions of ZFS. @@ -433,6 +439,12 @@ and the verbose output goes to standard error .It Fl v , -verbose Print verbose information about the stream package generated. This information includes a per-second report of how much data has been sent. +The same report can be requested by sending +.Dv SIGINFO +or +.Dv SIGUSR1 , +regardless of +.Fl v . .El .It Xo .Nm zfs @@ -669,6 +681,10 @@ ones on the source, and are ready to be used, while the parent snapshot on the target contains none of the username and password data present on the source, because it was removed by the redacted send operation. . +.Sh SIGNALS +See +.Fl v . +. .Sh EXAMPLES .\" These are, respectively, examples 12, 13 from zfs.8 .\" Make sure to update them bidirectionally -- cgit v1.2.3 From 019dea0a558517bf327d2c52e84a3a4b01eaaeb8 Mon Sep 17 00:00:00 2001 From: oromenahar Date: Tue, 8 Aug 2023 18:37:06 +0200 Subject: zfs_clone_range should return descriptive error codes Return more descriptive error codes instead of `EXDEV` when the parameters don't match the requirements of the clone function. Updated the comments in `brt.c` accordingly. The first three errors are just invalid parameters, which ZFS cannot handle.
The fourth error indicates that the block to be cloned was created, cloned, or modified within the same transaction group (`txg`). Reviewed-by: Brian Behlendorf Reviewed-by: Rob Norris Signed-off-by: Kay Pedersen Closes #15148 --- module/os/freebsd/zfs/zfs_vnops_os.c | 2 +- module/os/linux/zfs/zpl_file_range.c | 4 ++-- module/zfs/brt.c | 6 +++--- module/zfs/zfs_vnops.c | 13 +++++++------ 4 files changed, 13 insertions(+), 12 deletions(-) diff --git a/module/os/freebsd/zfs/zfs_vnops_os.c b/module/os/freebsd/zfs/zfs_vnops_os.c index e80e29c2da32..0ffed347fdb7 100644 --- a/module/os/freebsd/zfs/zfs_vnops_os.c +++ b/module/os/freebsd/zfs/zfs_vnops_os.c @@ -6288,7 +6288,7 @@ zfs_freebsd_copy_file_range(struct vop_copy_file_range_args *ap) error = zfs_clone_range(VTOZ(invp), ap->a_inoffp, VTOZ(outvp), ap->a_outoffp, &len, ap->a_outcred); - if (error == EXDEV || error == EOPNOTSUPP) + if (error == EXDEV || error == EINVAL || error == EOPNOTSUPP) goto bad_locked_fallback; *ap->a_lenp = (size_t)len; out_locked: diff --git a/module/os/linux/zfs/zpl_file_range.c b/module/os/linux/zfs/zpl_file_range.c index 72384b638be5..43ba9a498202 100644 --- a/module/os/linux/zfs/zpl_file_range.c +++ b/module/os/linux/zfs/zpl_file_range.c @@ -103,7 +103,7 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off, * Since Linux 5.3 the filesystem driver is responsible for executing * an appropriate fallback, and a generic fallback function is provided. */ - if (ret == -EOPNOTSUPP || ret == -EXDEV) + if (ret == -EOPNOTSUPP || ret == -EINVAL || ret == -EXDEV) ret = generic_copy_file_range(src_file, src_off, dst_file, dst_off, len, flags); #else @@ -111,7 +111,7 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off, * Before Linux 5.3 the filesystem has to return -EOPNOTSUPP to signal * to the kernel that it should fallback to a content copy. */ - if (ret == -EXDEV) + if (ret == -EINVAL || ret == -EXDEV) ret = -EOPNOTSUPP; #endif /* HAVE_VFS_GENERIC_COPY_FILE_RANGE */ diff --git a/module/zfs/brt.c b/module/zfs/brt.c index e8218fb26888..ddd8eefe600b 100644 --- a/module/zfs/brt.c +++ b/module/zfs/brt.c @@ -174,7 +174,7 @@ * size_t len, unsigned int flags); * * Even though offsets and length represent bytes, they have to be - * block-aligned or we will return the EXDEV error so the upper layer can + * block-aligned or we will return an error so the upper layer can * fallback to the generic mechanism that will just copy the data. * Using copy_file_range(2) will call OS-independent zfs_clone_range() function. * This function was implemented based on zfs_write(), but instead of writing @@ -192,9 +192,9 @@ * Some special cases to consider and how we address them: * - The block we want to clone may have been created within the same * transaction group that we are trying to clone. Such block has no BP - * allocated yet, so cannot be immediately cloned. We return EXDEV. + * allocated yet, so cannot be immediately cloned. We return EAGAIN. * - The block we want to clone may have been modified within the same - * transaction group. We return EXDEV. + * transaction group. We return EAGAIN.
* - A block may be cloned multiple times during one transaction group (that's * why pending list is actually a tree and not an append-only list - this * way we can figure out faster if this block is cloned for the first time diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index 54ea43363bfc..c6831ff6cd93 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -1028,6 +1028,10 @@ zfs_exit_two(zfsvfs_t *zfsvfs1, zfsvfs_t *zfsvfs2, const char *tag) * * On success, the function return the number of bytes copied in *lenp. * Note, it doesn't return how much bytes are left to be copied. + * On errors caused by any file system or BRT limitation, `EINVAL` is + * returned. In most cases the user passed bad parameters: it may be + * possible to clone the file, but some parameters don't match the + * requirements. */ int zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, @@ -1171,7 +1175,7 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, * We cannot clone into files with different block size. */ if (inblksz != outzp->z_blksz && outzp->z_size > inblksz) { - error = SET_ERROR(EXDEV); + error = SET_ERROR(EINVAL); goto unlock; } /* * Offsets and len must be at block boundries. */ if ((inoff % inblksz) != 0 || (outoff % inblksz) != 0) { - error = SET_ERROR(EXDEV); + error = SET_ERROR(EINVAL); goto unlock; } /* @@ -1187,7 +1191,7 @@ */ if ((len % inblksz) != 0 && (len < inzp->z_size - inoff || len < outzp->z_size - outoff)) { - error = SET_ERROR(EXDEV); + error = SET_ERROR(EINVAL); goto unlock; } @@ -1246,9 +1250,6 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, * in the current transaction group. Return an error, * so the caller can fallback to just copying the data. */ - if (error == EAGAIN) { - error = SET_ERROR(EXDEV); - } break; } /* -- cgit v1.2.3 From b8c9070d09446aca176bd8487cea7c99d6ccf0b6 Mon Sep 17 00:00:00 2001 From: Rafael Kitover Date: Tue, 8 Aug 2023 16:38:34 +0000 Subject: dracut: support mountpoint=legacy for root dataset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Support mountpoint=legacy for the root dataset in the dracut zfs support scripts. mountpoint=/ or mountpoint=/sysroot also works. Change zfs-env-bootfs.service to add zfsutil to BOOTFSFLAGS only for root datasets with mountpoint != legacy.
Reviewed-by: Brian Behlendorf Reviewed-by: Ahelenia Ziemiańska Signed-off-by: Rafael Kitover Closes #15149 --- contrib/dracut/90zfs/zfs-env-bootfs.service.in | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/contrib/dracut/90zfs/zfs-env-bootfs.service.in b/contrib/dracut/90zfs/zfs-env-bootfs.service.in index 7ebab4c1a58d..fe362b930bf5 100644 --- a/contrib/dracut/90zfs/zfs-env-bootfs.service.in +++ b/contrib/dracut/90zfs/zfs-env-bootfs.service.in @@ -12,11 +12,12 @@ ExecStart=/bin/sh -c ' decode_root_args || exit 0; \ [ "$root" = "zfs:AUTO" ] && root="$(@sbindir@/zpool list -H -o bootfs | grep -m1 -vFx -)"; \ rootflags="$(getarg rootflags=)"; \ - case ",$rootflags," in \ - *,zfsutil,*) ;; \ - ,,) rootflags=zfsutil ;; \ - *) rootflags="zfsutil,$rootflags" ;; \ - esac; \ + [ "$(@sbindir@/zfs get -H -o value mountpoint "$root")" = legacy ] || \ + case ",$rootflags," in \ + *,zfsutil,*) ;; \ + ,,) rootflags=zfsutil ;; \ + *) rootflags="zfsutil,$rootflags" ;; \ + esac; \ exec systemctl set-environment BOOTFS="$root" BOOTFSFLAGS="$rootflags"' [Install] -- cgit v1.2.3 From 81500902578da8da7c59347742e9e7aea31fe060 Mon Sep 17 00:00:00 2001 From: Umer Saleem Date: Tue, 8 Aug 2023 21:40:36 +0500 Subject: Move zinject from openzfs-zfs-test to openzfs-zfsutils For native Debian packaging, the zinject binary and man page are packaged in the ZFS test package. zinject is not directly related to ZTS and should be packaged with the other utilities, as it is in the zfs_.rpm/deb packages. This commit moves the zinject binary and man page from the openzfs-zfs-test to the openzfs-zfsutils package. Reviewed-by: Brian Behlendorf Reviewed-by: Ameer Hamza Signed-off-by: Umer Saleem Closes #15160 --- contrib/debian/openzfs-zfs-test.install | 2 -- contrib/debian/openzfs-zfsutils.install | 2 ++ 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/contrib/debian/openzfs-zfs-test.install b/contrib/debian/openzfs-zfs-test.install index cafcfdc0e15b..b3afef50dbd4 100644 --- a/contrib/debian/openzfs-zfs-test.install +++ b/contrib/debian/openzfs-zfs-test.install @@ -1,10 +1,8 @@ -sbin/zinject sbin/ztest usr/bin/raidz_test usr/share/man/man1/raidz_test.1 usr/share/man/man1/test-runner.1 usr/share/man/man1/ztest.1 -usr/share/man/man8/zinject.8 usr/share/zfs/common.sh usr/share/zfs/runfiles/ usr/share/zfs/test-runner diff --git a/contrib/debian/openzfs-zfsutils.install b/contrib/debian/openzfs-zfsutils.install index 49f0ec0d5d92..0f58508f0062 100644 --- a/contrib/debian/openzfs-zfsutils.install +++ b/contrib/debian/openzfs-zfsutils.install @@ -27,6 +27,7 @@ sbin/zfs sbin/zfs_ids_to_path sbin/zgenhostid sbin/zhack +sbin/zinject sbin/zpool sbin/zstream sbin/zstreamdump @@ -92,6 +93,7 @@ usr/share/man/man8/zfs_ids_to_path.8 usr/share/man/man7/zfsconcepts.7 usr/share/man/man7/zfsprops.7 usr/share/man/man8/zgenhostid.8 +usr/share/man/man8/zinject.8 usr/share/man/man8/zpool-add.8 usr/share/man/man8/zpool-attach.8 usr/share/man/man8/zpool-checkpoint.8 -- cgit v1.2.3 From 8ce2eba9e6a384feef93d77c397f37d17dc588ce Mon Sep 17 00:00:00 2001 From: Coleman Kane Date: Tue, 8 Aug 2023 18:42:32 -0400 Subject: Linux 6.5 compat: Use copy_splice_read instead of filemap_splice_read Using the filemap_splice_read function for the splice_read handler was leading to occasional data corruption under certain circumstances. Favor using copy_splice_read instead, which does not demonstrate the same erroneous behavior under the tested failure cases.
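
A cheap user-space probe for whichever handler ends up wired to .splice_read: splice(2) with a regular-file source must go through file_operations->splice_read, so comparing its output against a plain read(2) of the same region will expose corruption of the kind described above. This is a hypothetical test harness, not part of the patch:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(int argc, char **argv)
{
	int fd, p[2];
	ssize_t n;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <file on zfs>\n", argv[0]);
		return (1);
	}
	fd = open(argv[1], O_RDONLY);
	if (fd < 0 || pipe(p) != 0)
		return (1);
	/* A file-to-pipe splice exercises the .splice_read handler. */
	n = splice(fd, NULL, p[1], NULL, 65536, 0);
	printf("spliced %zd bytes\n", n);
	return (n < 0);
}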
Reviewed-by: Brian Atkinson Reviewed-by: Brian Behlendorf Signed-off-by: Coleman Kane Closes #15164 --- config/kernel-filemap-splice-read.m4 | 18 +++++++++--------- config/kernel.m4 | 4 ++-- module/os/linux/zfs/zpl_file.c | 4 ++-- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/config/kernel-filemap-splice-read.m4 b/config/kernel-filemap-splice-read.m4 index 5199b7373e4d..4c83b31d738a 100644 --- a/config/kernel-filemap-splice-read.m4 +++ b/config/kernel-filemap-splice-read.m4 @@ -1,24 +1,24 @@ -AC_DEFUN([ZFS_AC_KERNEL_SRC_FILEMAP_SPLICE_READ], [ +AC_DEFUN([ZFS_AC_KERNEL_SRC_COPY_SPLICE_READ], [ dnl # dnl # Kernel 6.5 - generic_file_splice_read was removed in favor - dnl # of filemap_splice_read for the .splice_read member of the + dnl # of copy_splice_read for the .splice_read member of the dnl # file_operations struct. dnl # - ZFS_LINUX_TEST_SRC([has_filemap_splice_read], [ + ZFS_LINUX_TEST_SRC([has_copy_splice_read], [ #include <linux/fs.h> struct file_operations fops __attribute__((unused)) = { - .splice_read = filemap_splice_read, + .splice_read = copy_splice_read, }; ],[]) ]) -AC_DEFUN([ZFS_AC_KERNEL_FILEMAP_SPLICE_READ], [ - AC_MSG_CHECKING([whether filemap_splice_read() exists]) - ZFS_LINUX_TEST_RESULT([has_filemap_splice_read], [ +AC_DEFUN([ZFS_AC_KERNEL_COPY_SPLICE_READ], [ + AC_MSG_CHECKING([whether copy_splice_read() exists]) + ZFS_LINUX_TEST_RESULT([has_copy_splice_read], [ AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_FILEMAP_SPLICE_READ, 1, - [filemap_splice_read exists]) + AC_DEFINE(HAVE_COPY_SPLICE_READ, 1, + [copy_splice_read exists]) ],[ AC_MSG_RESULT(no) ]) diff --git a/config/kernel.m4 b/config/kernel.m4 index 309f1819be48..df194ec72207 100644 --- a/config/kernel.m4 +++ b/config/kernel.m4 @@ -161,7 +161,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ ZFS_AC_KERNEL_SRC_WRITEPAGE_T ZFS_AC_KERNEL_SRC_RECLAIMED ZFS_AC_KERNEL_SRC_REGISTER_SYSCTL_TABLE - ZFS_AC_KERNEL_SRC_FILEMAP_SPLICE_READ + ZFS_AC_KERNEL_SRC_COPY_SPLICE_READ case "$host_cpu" in powerpc*) ZFS_AC_KERNEL_SRC_CPU_HAS_FEATURE @@ -302,7 +302,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ ZFS_AC_KERNEL_WRITEPAGE_T ZFS_AC_KERNEL_RECLAIMED ZFS_AC_KERNEL_REGISTER_SYSCTL_TABLE - ZFS_AC_KERNEL_FILEMAP_SPLICE_READ + ZFS_AC_KERNEL_COPY_SPLICE_READ case "$host_cpu" in powerpc*) ZFS_AC_KERNEL_CPU_HAS_FEATURE diff --git a/module/os/linux/zfs/zpl_file.c b/module/os/linux/zfs/zpl_file.c index 24cc1064a8fc..3caa0fc6c214 100644 --- a/module/os/linux/zfs/zpl_file.c +++ b/module/os/linux/zfs/zpl_file.c @@ -1323,8 +1323,8 @@ const struct file_operations zpl_file_operations = { .read_iter = zpl_iter_read, .write_iter = zpl_iter_write, #ifdef HAVE_VFS_IOV_ITER -#ifdef HAVE_FILEMAP_SPLICE_READ - .splice_read = filemap_splice_read, +#ifdef HAVE_COPY_SPLICE_READ + .splice_read = copy_splice_read, #else .splice_read = generic_file_splice_read, #endif -- cgit v1.2.3 From bdb7df42451836f629e725de74b4edbc5e16ff49 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Fri, 11 Aug 2023 12:04:08 -0400 Subject: ZIL: Avoid dbuf_read() before dmu_sync(). In most cases dmu_sync() works with dirty records directly and does not need actual data. The only exception is dmu_sync_late_arrival(). To save some CPU time use dmu_buf_hold_noread*() in z*_get_data() and explicitly call dbuf_read() in dmu_sync_late_arrival(). There is also a chance that by that time the TXG will already be synced and we won't have to do it at all. Reviewed-by: Brian Atkinson Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc.
Closes #15153 --- include/sys/dmu.h | 4 ++++ include/sys/dmu_impl.h | 2 -- module/zfs/dmu.c | 9 ++++++++- module/zfs/zfs_vnops.c | 4 ++-- module/zfs/zvol.c | 4 ++-- 5 files changed, 16 insertions(+), 7 deletions(-) diff --git a/include/sys/dmu.h b/include/sys/dmu.h index a84175c980d7..1cc8b8971a2d 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -572,11 +572,15 @@ int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, int dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, int read, const void *tag, int *numbufsp, dmu_buf_t ***dbpp); +int dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset, + const void *tag, dmu_buf_t **dbp); int dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset, const void *tag, dmu_buf_t **dbp, int flags); int dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, boolean_t read, const void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags); +int dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset, const void *tag, + dmu_buf_t **dbp); /* * Add a reference to a dmu buffer that has already been held via * dmu_buf_hold() in the current context. diff --git a/include/sys/dmu_impl.h b/include/sys/dmu_impl.h index ce6ae3c665ac..83ae2b76ba1f 100644 --- a/include/sys/dmu_impl.h +++ b/include/sys/dmu_impl.h @@ -247,8 +247,6 @@ typedef struct dmu_sendstatus { void dmu_object_zapify(objset_t *, uint64_t, dmu_object_type_t, dmu_tx_t *); void dmu_object_free_zapified(objset_t *, uint64_t, dmu_tx_t *); -int dmu_buf_hold_noread(objset_t *, uint64_t, uint64_t, - const void *, dmu_buf_t **); #ifdef __cplusplus } diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index c2565b50b7d0..a63aac51f225 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -165,7 +165,7 @@ dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = { { zfs_acl_byteswap, "acl" } }; -static int +int dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset, const void *tag, dmu_buf_t **dbp) { @@ -185,6 +185,7 @@ dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset, *dbp = &db->db; return (0); } + int dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset, const void *tag, dmu_buf_t **dbp) @@ -1672,6 +1673,12 @@ dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd, { dmu_sync_arg_t *dsa; dmu_tx_t *tx; + int error; + + error = dbuf_read((dmu_buf_impl_t *)zgd->zgd_db, NULL, + DB_RF_CANFAIL | DB_RF_NOPREFETCH); + if (error != 0) + return (error); tx = dmu_tx_create(os); dmu_tx_hold_space(tx, zgd->zgd_db->db_size); diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index c6831ff6cd93..8a3b08139b6b 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -917,8 +917,8 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf, } #endif if (error == 0) - error = dmu_buf_hold(os, object, offset, zgd, &db, - DMU_READ_NO_PREFETCH); + error = dmu_buf_hold_noread(os, object, offset, zgd, + &db); if (error == 0) { blkptr_t *bp = &lr->lr_blkptr; diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c index 547687f07ed5..bbef53e4e479 100644 --- a/module/zfs/zvol.c +++ b/module/zfs/zvol.c @@ -727,8 +727,8 @@ zvol_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf, offset = P2ALIGN_TYPED(offset, size, uint64_t); zgd->zgd_lr = zfs_rangelock_enter(&zv->zv_rangelock, offset, size, RL_READER); - error = dmu_buf_hold_by_dnode(zv->zv_dn, offset, zgd, &db, - DMU_READ_NO_PREFETCH); + error = dmu_buf_hold_noread_by_dnode(zv->zv_dn, offset, zgd, + &db); if (error 
== 0) { blkptr_t *bp = &lr->lr_blkptr; -- cgit v1.2.3 From 8e20e0ff39f0492cdc477f15d9d8c1b2f092d562 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Fri, 11 Aug 2023 12:04:44 -0400 Subject: ZIL: Replay blocks without next block pointer. If we get a next-block allocation error during a log write, we trigger a transaction commit. But the block we have just completed is still written, and the transactions it covers will be acknowledged normally. If we then ignore that block during replay just because it is the last in the chain, we may fail to replay transactions that we have already acknowledged as synced, which is not right. Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #15132 --- module/zfs/zil.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/module/zfs/zil.c b/module/zfs/zil.c index 6f04a7d4a7df..567787a19b66 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -290,7 +290,7 @@ zil_read_log_block(zilog_t *zilog, boolean_t decrypt, const blkptr_t *bp, char *lr = (char *)(zilc + 1); if (memcmp(&cksum, &zilc->zc_next_blk.blk_cksum, - sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk) || + sizeof (cksum)) || zilc->zc_nused < sizeof (*zilc) || zilc->zc_nused > size) { error = SET_ERROR(ECKSUM); @@ -304,7 +304,7 @@ zil_read_log_block(zilog_t *zilog, boolean_t decrypt, const blkptr_t *bp, zil_chain_t *zilc = (zil_chain_t *)(lr + size) - 1; if (memcmp(&cksum, &zilc->zc_next_blk.blk_cksum, - sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk) || + sizeof (cksum)) || (zilc->zc_nused > (size - sizeof (*zilc)))) { error = SET_ERROR(ECKSUM); } else { -- cgit v1.2.3 From cae502c17576694721d290c2506880d0caa872c7 Mon Sep 17 00:00:00 2001 From: Rob N Date: Tue, 15 Aug 2023 10:34:14 +1000 Subject: copy_file_range: fix fallback when source created on same txg In 019dea0a5 we removed the conversion from EAGAIN->EXDEV inside zfs_clone_range(), but forgot to add a test for EAGAIN to the copy_file_range() entry points to trigger fallback to a content copy. This commit fixes that.
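
For context, the EAGAIN itself is consumed inside the kernel by the fallbacks above; what user space can still see (especially on kernels before 5.3, which do no generic fallback) is EXDEV/EINVAL/EOPNOTSUPP. A hedged user-space sketch of the same fall-back-to-content-copy pattern (not from the ZFS tree; copy_file_range(2) needs glibc 2.27+):

#define _GNU_SOURCE
#include <errno.h>
#include <unistd.h>

static ssize_t
copy_with_fallback(int in, int out, size_t len)
{
	ssize_t n = copy_file_range(in, NULL, out, NULL, len, 0);

	/* Errors an old or pre-fix kernel may hand back to the caller. */
	if (n >= 0 || (errno != EXDEV && errno != EOPNOTSUPP &&
	    errno != EINVAL))
		return (n);

	/* Content copy, like the in-kernel generic fallback. */
	char buf[65536];
	ssize_t total = 0;
	while (len > 0) {
		ssize_t r = read(in, buf,
		    len < sizeof (buf) ? len : sizeof (buf));
		if (r <= 0)
			return (total > 0 ? total : r);
		for (ssize_t off = 0; off < r; ) {
			ssize_t w = write(out, buf + off, (size_t)(r - off));
			if (w < 0)
				return (-1);
			off += w;
		}
		total += r;
		len -= (size_t)r;
	}
	return (total);
}

int
main(void)
{
	/* Pipes always take the fallback path; files may clone. */
	return (copy_with_fallback(STDIN_FILENO, STDOUT_FILENO,
	    1024 * 1024) < 0);
}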
Reviewed-by: Brian Behlendorf Reviewed-by: Kay Pedersen Signed-off-by: Rob Norris Closes #15170 Closes #15172 --- module/os/freebsd/zfs/zfs_vnops_os.c | 3 +- module/os/linux/zfs/zpl_file_range.c | 5 +- module/zfs/zfs_vnops.c | 7 ++- tests/runfiles/linux.run | 3 +- tests/test-runner/bin/zts-report.py.in | 2 + tests/zfs-tests/tests/Makefile.am | 3 +- ...ock_cloning_copyfilerange_fallback_same_txg.ksh | 66 ++++++++++++++++++++++ 7 files changed, 81 insertions(+), 8 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh diff --git a/module/os/freebsd/zfs/zfs_vnops_os.c b/module/os/freebsd/zfs/zfs_vnops_os.c index 0ffed347fdb7..ab72e91b4f9a 100644 --- a/module/os/freebsd/zfs/zfs_vnops_os.c +++ b/module/os/freebsd/zfs/zfs_vnops_os.c @@ -6288,7 +6288,8 @@ zfs_freebsd_copy_file_range(struct vop_copy_file_range_args *ap) error = zfs_clone_range(VTOZ(invp), ap->a_inoffp, VTOZ(outvp), ap->a_outoffp, &len, ap->a_outcred); - if (error == EXDEV || error == EINVAL || error == EOPNOTSUPP) + if (error == EXDEV || error == EAGAIN || error == EINVAL || + error == EOPNOTSUPP) goto bad_locked_fallback; *ap->a_lenp = (size_t)len; out_locked: diff --git a/module/os/linux/zfs/zpl_file_range.c b/module/os/linux/zfs/zpl_file_range.c index 43ba9a498202..2abbf44df587 100644 --- a/module/os/linux/zfs/zpl_file_range.c +++ b/module/os/linux/zfs/zpl_file_range.c @@ -103,7 +103,8 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off, * Since Linux 5.3 the filesystem driver is responsible for executing * an appropriate fallback, and a generic fallback function is provided. */ - if (ret == -EOPNOTSUPP || ret == -EINVAL || ret == -EXDEV) + if (ret == -EOPNOTSUPP || ret == -EINVAL || ret == -EXDEV || + ret == -EAGAIN) ret = generic_copy_file_range(src_file, src_off, dst_file, dst_off, len, flags); #else @@ -111,7 +112,7 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off, * Before Linux 5.3 the filesystem has to return -EOPNOTSUPP to signal * to the kernel that it should fallback to a content copy. */ - if (ret == -EINVAL || ret == -EXDEV) + if (ret == -EINVAL || ret == -EXDEV || ret == -EAGAIN) ret = -EOPNOTSUPP; #endif /* HAVE_VFS_GENERIC_COPY_FILE_RANGE */ diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index 8a3b08139b6b..fceb56dfc029 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -1246,9 +1246,10 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, &nbps); if (error != 0) { /* - * If we are tyring to clone a block that was created - * in the current transaction group. Return an error, - * so the caller can fallback to just copying the data. + * If we are trying to clone a block that was created + * in the current transaction group, error will be + * EAGAIN here, which we can just return to the caller + * so it can fallback if it likes. 
*/ break; } diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index 4747b9837337..2c8d5cb0ecbb 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -41,7 +41,8 @@ tests = ['block_cloning_copyfilerange', 'block_cloning_copyfilerange_partial', 'block_cloning_ficlonerange_partial', 'block_cloning_disabled_copyfilerange', 'block_cloning_disabled_ficlone', 'block_cloning_disabled_ficlonerange', - 'block_cloning_copyfilerange_cross_dataset'] + 'block_cloning_copyfilerange_cross_dataset', + 'block_cloning_copyfilerange_fallback_same_txg'] tags = ['functional', 'block_cloning'] [tests/functional/chattr:Linux] diff --git a/tests/test-runner/bin/zts-report.py.in b/tests/test-runner/bin/zts-report.py.in index 5c4b3a7bcdc1..e1bbe063ab4c 100755 --- a/tests/test-runner/bin/zts-report.py.in +++ b/tests/test-runner/bin/zts-report.py.in @@ -304,6 +304,8 @@ elif sys.platform.startswith('linux'): ['SKIP', cfr_reason], 'block_cloning/block_cloning_copyfilerange_cross_dataset': ['SKIP', cfr_cross_reason], + 'block_cloning/block_cloning_copyfilerange_fallback_same_txg': + ['SKIP', cfr_cross_reason], }) diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 3b6b2ef734d0..66aff5026f8f 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -441,9 +441,10 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/block_cloning/cleanup.ksh \ functional/block_cloning/setup.ksh \ functional/block_cloning/block_cloning_copyfilerange_cross_dataset.ksh \ + functional/block_cloning/block_cloning_copyfilerange_fallback.ksh \ + functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh \ functional/block_cloning/block_cloning_copyfilerange.ksh \ functional/block_cloning/block_cloning_copyfilerange_partial.ksh \ - functional/block_cloning/block_cloning_copyfilerange_fallback.ksh \ functional/block_cloning/block_cloning_disabled_copyfilerange.ksh \ functional/block_cloning/block_cloning_disabled_ficlone.ksh \ functional/block_cloning/block_cloning_disabled_ficlonerange.ksh \ diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh new file mode 100755 index 000000000000..3451f887afb4 --- /dev/null +++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh @@ -0,0 +1,66 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2023, Klara Inc. +# Copyright (c) 2023, Rob Norris +# + +. $STF_SUITE/include/libtest.shlib +. 
$STF_SUITE/tests/functional/block_cloning/block_cloning.kshlib + +verify_runnable "global" + +if [[ $(linux_version) -lt $(linux_version "4.5") ]]; then + log_unsupported "copy_file_range not available before Linux 4.5" +fi + +claim="copy_file_range will fall back to copy when cloning on same txg" + +log_assert $claim + +typeset timeout=$(get_tunable TXG_TIMEOUT) + +function cleanup +{ + datasetexists $TESTPOOL && destroy_pool $TESTPOOL + set_tunable64 TXG_TIMEOUT $timeout +} + +log_onexit cleanup + +log_must zpool create -o feature@block_cloning=enabled $TESTPOOL $DISKS + +log_must set_tunable64 TXG_TIMEOUT 5000 + +log_must dd if=/dev/urandom of=/$TESTPOOL/file bs=128K count=4 +log_must clonefile -f /$TESTPOOL/file /$TESTPOOL/clone 0 0 524288 + +log_must sync_pool $TESTPOOL + +log_must have_same_content /$TESTPOOL/file /$TESTPOOL/clone + +typeset blocks=$(unique_blocks $TESTPOOL file $TESTPOOL clone) +log_must [ "$blocks" = "" ] + +log_pass $claim + -- cgit v1.2.3 From 11fbcacf37d1a66c7a40bb8920c70ce9a87270ea Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Thu, 24 Aug 2023 11:59:03 -0700 Subject: zed: Add zedlet to power off slot when drive is faulted If ZED_POWER_OFF_ENCLOUSRE_SLOT_ON_FAULT is enabled in zed.rc, then power off the drive's slot in the enclosure if it becomes FAULTED. This can help silence misbehaving drives. This assumes your drive enclosure fully supports slot power control via sysfs. Reviewed-by: @AllKind Reviewed-by: Brian Behlendorf Signed-off-by: Tony Hutter Closes #15200 --- cmd/zed/zed.d/statechange-slot_off.sh | 61 +++++++++++++++++++++++++++++++++++ cmd/zed/zed.d/zed.rc | 5 +++ 2 files changed, 66 insertions(+) create mode 100755 cmd/zed/zed.d/statechange-slot_off.sh diff --git a/cmd/zed/zed.d/statechange-slot_off.sh b/cmd/zed/zed.d/statechange-slot_off.sh new file mode 100755 index 000000000000..d6f3c94a4197 --- /dev/null +++ b/cmd/zed/zed.d/statechange-slot_off.sh @@ -0,0 +1,61 @@ +#!/bin/sh +# +# Turn off disk's enclosure slot if it becomes FAULTED. +# +# Bad SCSI disks can often "disappear and reappear" causing all sorts of chaos +# as they flip between FAULTED and ONLINE. If +# ZED_POWER_OFF_ENCLOUSRE_SLOT_ON_FAULT is set in zed.rc, and the disk gets +# FAULTED, then power down the slot via sysfs: +# +# /sys/class/enclosure///power_status +# +# We assume the user will be responsible for turning the slot back on again. +# +# Note that this script requires that your enclosure be supported by the +# Linux SCSI Enclosure services (SES) driver. The script will do nothing +# if you have no enclosure, or if your enclosure isn't supported. +# +# Exit codes: +# 0: slot successfully powered off +# 1: enclosure not available +# 2: ZED_POWER_OFF_ENCLOUSRE_SLOT_ON_FAULT disabled +# 3: vdev was not FAULTED +# 4: The enclosure sysfs path passed from ZFS does not exist +# 5: Enclosure slot didn't actually turn off after we told it to + +[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc" +. "${ZED_ZEDLET_DIR}/zed-functions.sh" + +if [ ! -d /sys/class/enclosure ] ; then + # No JBOD enclosure or NVMe slots + exit 1 +fi + +if [ "${ZED_POWER_OFF_ENCLOUSRE_SLOT_ON_FAULT}" != "1" ] ; then + exit 2 +fi + +if [ "$ZEVENT_VDEV_STATE_STR" != "FAULTED" ] ; then + exit 3 +fi + +if [ ! -f "$ZEVENT_VDEV_ENC_SYSFS_PATH/power_status" ] ; then + exit 4 +fi + +echo "off" | tee "$ZEVENT_VDEV_ENC_SYSFS_PATH/power_status" + +# Wait for sysfs for report that the slot is off. It can take ~400ms on some +# enclosures. 
+for i in $(seq 1 20) ; do + if [ "$(cat "$ZEVENT_VDEV_ENC_SYSFS_PATH/power_status")" = "off" ] ; then + break + fi + sleep 0.1 +done + +if [ "$(cat "$ZEVENT_VDEV_ENC_SYSFS_PATH/power_status")" != "off" ] ; then + exit 5 +fi + +zed_log_msg "powered down slot $ZEVENT_VDEV_ENC_SYSFS_PATH for $ZEVENT_VDEV_PATH" diff --git a/cmd/zed/zed.d/zed.rc b/cmd/zed/zed.d/zed.rc index c55a70c79f75..78dc1afc7b15 100644 --- a/cmd/zed/zed.d/zed.rc +++ b/cmd/zed/zed.d/zed.rc @@ -142,3 +142,8 @@ ZED_SYSLOG_SUBCLASS_EXCLUDE="history_event" # Disabled by default, 1 to enable and 0 to disable. #ZED_SYSLOG_DISPLAY_GUIDS=1 +## +# Power off the drive's slot in the enclosure if it becomes FAULTED. This can +# help silence misbehaving drives. This assumes your drive enclosure fully +# supports slot power control via sysfs. +#ZED_POWER_OFF_ENCLOUSRE_SLOT_ON_FAULT=1 -- cgit v1.2.3 From eda3fcd56ff9f46dd606c48331cb786856f37fe3 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Thu, 24 Aug 2023 20:08:49 -0400 Subject: ZIL: Second attempt to reduce scope of zl_issuer_lock. The previous patch #14841 appeared to have a significant flaw, causing deadlocks if the zl_get_data callback got blocked waiting for TXG sync. I already handled some such cases in the original patch, but issue #14982 showed cases that were impossible to solve in that design. This patch fixes the problem by postponing log block allocation till the very end, just before the zios issue, leaving nothing blocking after that point to cause deadlocks. Before that point, though, any sleeps are now allowed, not causing sync thread blockage. This requires a slightly more complicated lwb state machine to allocate blocks and issue zios in the proper order. But with the removal of the special early-issue workarounds the new code is much cleaner now, and should even be more efficient. Since this patch uses null zios between writes, I've found that null zios do not wait for logical children's ready status in zio_ready(), which makes the parent write proceed prematurely, producing incorrect log blocks. Adding ZIO_CHILD_LOGICAL_BIT to zio_wait_for_children() fixes it. Reviewed-by: Rob Norris Reviewed-by: Mark Maybee Reviewed-by: George Wilson Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #15122 --- cmd/ztest.c | 2 +- include/sys/zil_impl.h | 43 +++- module/zfs/zfs_vnops.c | 2 +- module/zfs/zil.c | 683 +++++++++++++++++++++++-------------------------- module/zfs/zio.c | 4 +- module/zfs/zvol.c | 2 +- 6 files changed, 355 insertions(+), 381 deletions(-) diff --git a/cmd/ztest.c b/cmd/ztest.c index b6b99bfff6db..398c519cfc35 100644 --- a/cmd/ztest.c +++ b/cmd/ztest.c @@ -2412,7 +2412,6 @@ ztest_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf, int error; ASSERT3P(lwb, !=, NULL); - ASSERT3P(zio, !=, NULL); ASSERT3U(size, !=, 0); ztest_object_lock(zd, object, RL_READER); @@ -2446,6 +2445,7 @@ ztest_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf, DMU_READ_NO_PREFETCH); ASSERT0(error); } else { + ASSERT3P(zio, !=, NULL); size = doi.doi_data_block_size; if (ISP2(size)) { offset = P2ALIGN(offset, size); diff --git a/include/sys/zil_impl.h b/include/sys/zil_impl.h index b58dad9695a6..f780ad3d61bc 100644 --- a/include/sys/zil_impl.h +++ b/include/sys/zil_impl.h @@ -38,14 +38,22 @@ extern "C" { /* * Possible states for a given lwb structure. * - * An lwb will start out in the "closed" state, and then transition to - * the "opened" state via a call to zil_lwb_write_open().
When - * transitioning from "closed" to "opened" the zilog's "zl_issuer_lock" - * must be held. + * An lwb will start out in the "new" state, and transition to the "opened" + * state via a call to zil_lwb_write_open() on first itx assignment. When + * transitioning from "new" to "opened" the zilog's "zl_issuer_lock" must be + * held. * - * After the lwb is "opened", it can transition into the "issued" state - * via zil_lwb_write_close(). Again, the zilog's "zl_issuer_lock" must - * be held when making this transition. + * After the lwb is "opened", it can be assigned number of itxs and transition + * into the "closed" state via zil_lwb_write_close() when full or on timeout. + * When transitioning from "opened" to "closed" the zilog's "zl_issuer_lock" + * must be held. New lwb allocation also takes "zl_lock" to protect the list. + * + * After the lwb is "closed", it can transition into the "ready" state via + * zil_lwb_write_issue(). "zl_lock" must be held when making this transition. + * Since it is done by the same thread, "zl_issuer_lock" is not needed. + * + * When lwb in "ready" state receives its block pointer, it can transition to + * "issued". "zl_lock" must be held when making this transition. * * After the lwb's write zio completes, it transitions into the "write * done" state via zil_lwb_write_done(); and then into the "flush done" @@ -62,17 +70,20 @@ extern "C" { * * Additionally, correctness when reading an lwb's state is often * achieved by exploiting the fact that these state transitions occur in - * this specific order; i.e. "closed" to "opened" to "issued" to "done". + * this specific order; i.e. "new" to "opened" to "closed" to "ready" to + * "issued" to "write_done" and finally "flush_done". * - * Thus, if an lwb is in the "closed" or "opened" state, holding the + * Thus, if an lwb is in the "new" or "opened" state, holding the * "zl_issuer_lock" will prevent a concurrent thread from transitioning - * that lwb to the "issued" state. Likewise, if an lwb is already in the - * "issued" state, holding the "zl_lock" will prevent a concurrent - * thread from transitioning that lwb to the "write done" state. + * that lwb to the "closed" state. Likewise, if an lwb is already in the + * "ready" state, holding the "zl_lock" will prevent a concurrent thread + * from transitioning that lwb to the "issued" state. */ typedef enum { - LWB_STATE_CLOSED, + LWB_STATE_NEW, LWB_STATE_OPENED, + LWB_STATE_CLOSED, + LWB_STATE_READY, LWB_STATE_ISSUED, LWB_STATE_WRITE_DONE, LWB_STATE_FLUSH_DONE, @@ -91,17 +102,21 @@ typedef enum { typedef struct lwb { zilog_t *lwb_zilog; /* back pointer to log struct */ blkptr_t lwb_blk; /* on disk address of this log blk */ + boolean_t lwb_slim; /* log block has slim format */ boolean_t lwb_slog; /* lwb_blk is on SLOG device */ - boolean_t lwb_indirect; /* do not postpone zil_lwb_commit() */ + int lwb_error; /* log block allocation error */ + int lwb_nmax; /* max bytes in the buffer */ int lwb_nused; /* # used bytes in buffer */ int lwb_nfilled; /* # filled bytes in buffer */ int lwb_sz; /* size of block and buffer */ lwb_state_t lwb_state; /* the state of this lwb */ char *lwb_buf; /* log write buffer */ + zio_t *lwb_child_zio; /* parent zio for children */ zio_t *lwb_write_zio; /* zio for the lwb buffer */ zio_t *lwb_root_zio; /* root zio for lwb write and flushes */ hrtime_t lwb_issued_timestamp; /* when was the lwb issued? 
*/ uint64_t lwb_issued_txg; /* the txg when the write is issued */ + uint64_t lwb_alloc_txg; /* the txg when lwb_blk is allocated */ uint64_t lwb_max_txg; /* highest txg in this lwb */ list_node_t lwb_node; /* zilog->zl_lwb_list linkage */ list_node_t lwb_issue_node; /* linkage of lwbs ready for issue */ diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index fceb56dfc029..f8d13075d5c0 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -839,7 +839,6 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf, uint64_t zp_gen; ASSERT3P(lwb, !=, NULL); - ASSERT3P(zio, !=, NULL); ASSERT3U(size, !=, 0); /* @@ -889,6 +888,7 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf, } ASSERT(error == 0 || error == ENOENT); } else { /* indirect write */ + ASSERT3P(zio, !=, NULL); /* * Have to lock the whole block to ensure when it's * written out and its checksum is being calculated diff --git a/module/zfs/zil.c b/module/zfs/zil.c index 567787a19b66..f2d279e36a96 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -151,7 +151,6 @@ static kmem_cache_t *zil_lwb_cache; static kmem_cache_t *zil_zcw_cache; static void zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx); -static void zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb); static itx_t *zil_itx_clone(itx_t *oitx); static int @@ -760,33 +759,52 @@ zil_lwb_vdev_compare(const void *x1, const void *x2) return (TREE_CMP(v1, v2)); } +/* + * Allocate a new lwb. We may already have a block pointer for it, in which + * case we get size and version from there. Or we may not yet, in which case + * we choose them here and later make the block allocation match. + */ static lwb_t * -zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, boolean_t slog, uint64_t txg) +zil_alloc_lwb(zilog_t *zilog, int sz, blkptr_t *bp, boolean_t slog, + uint64_t txg, lwb_state_t state) { lwb_t *lwb; lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP); lwb->lwb_zilog = zilog; - lwb->lwb_blk = *bp; + if (bp) { + lwb->lwb_blk = *bp; + lwb->lwb_slim = (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2); + sz = BP_GET_LSIZE(bp); + } else { + BP_ZERO(&lwb->lwb_blk); + lwb->lwb_slim = (spa_version(zilog->zl_spa) >= + SPA_VERSION_SLIM_ZIL); + } lwb->lwb_slog = slog; - lwb->lwb_indirect = B_FALSE; - if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) { + lwb->lwb_error = 0; + if (lwb->lwb_slim) { + lwb->lwb_nmax = sz; lwb->lwb_nused = lwb->lwb_nfilled = sizeof (zil_chain_t); - lwb->lwb_sz = BP_GET_LSIZE(bp); } else { + lwb->lwb_nmax = sz - sizeof (zil_chain_t); lwb->lwb_nused = lwb->lwb_nfilled = 0; - lwb->lwb_sz = BP_GET_LSIZE(bp) - sizeof (zil_chain_t); } - lwb->lwb_state = LWB_STATE_CLOSED; - lwb->lwb_buf = zio_buf_alloc(BP_GET_LSIZE(bp)); + lwb->lwb_sz = sz; + lwb->lwb_state = state; + lwb->lwb_buf = zio_buf_alloc(sz); + lwb->lwb_child_zio = NULL; lwb->lwb_write_zio = NULL; lwb->lwb_root_zio = NULL; lwb->lwb_issued_timestamp = 0; lwb->lwb_issued_txg = 0; - lwb->lwb_max_txg = txg; + lwb->lwb_alloc_txg = txg; + lwb->lwb_max_txg = 0; mutex_enter(&zilog->zl_lock); list_insert_tail(&zilog->zl_lwb_list, lwb); + if (state != LWB_STATE_NEW) + zilog->zl_last_lwb_opened = lwb; mutex_exit(&zilog->zl_lock); return (lwb); @@ -800,10 +818,12 @@ zil_free_lwb(zilog_t *zilog, lwb_t *lwb) VERIFY(list_is_empty(&lwb->lwb_waiters)); VERIFY(list_is_empty(&lwb->lwb_itxs)); ASSERT(avl_is_empty(&lwb->lwb_vdev_tree)); + ASSERT3P(lwb->lwb_child_zio, ==, NULL); ASSERT3P(lwb->lwb_write_zio, ==, NULL); ASSERT3P(lwb->lwb_root_zio, ==, NULL); + ASSERT3U(lwb->lwb_alloc_txg, <=, 
spa_syncing_txg(zilog->zl_spa)); ASSERT3U(lwb->lwb_max_txg, <=, spa_syncing_txg(zilog->zl_spa)); - ASSERT(lwb->lwb_state == LWB_STATE_CLOSED || + ASSERT(lwb->lwb_state == LWB_STATE_NEW || lwb->lwb_state == LWB_STATE_FLUSH_DONE); /* @@ -954,7 +974,7 @@ zil_create(zilog_t *zilog) * Allocate a log write block (lwb) for the first log block. */ if (error == 0) - lwb = zil_alloc_lwb(zilog, &blk, slog, txg); + lwb = zil_alloc_lwb(zilog, 0, &blk, slog, txg, LWB_STATE_NEW); /* * If we just allocated the first log block, commit our transaction @@ -1041,7 +1061,8 @@ zil_destroy(zilog_t *zilog, boolean_t keep_first) while ((lwb = list_remove_head(&zilog->zl_lwb_list)) != NULL) { if (lwb->lwb_buf != NULL) zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); - zio_free(zilog->zl_spa, txg, &lwb->lwb_blk); + if (!BP_IS_HOLE(&lwb->lwb_blk)) + zio_free(zilog->zl_spa, txg, &lwb->lwb_blk); zil_free_lwb(zilog, lwb); } } else if (!keep_first) { @@ -1269,21 +1290,21 @@ zil_commit_waiter_link_lwb(zil_commit_waiter_t *zcw, lwb_t *lwb) { /* * The lwb_waiters field of the lwb is protected by the zilog's - * zl_lock, thus it must be held when calling this function. + * zl_issuer_lock while the lwb is open and zl_lock otherwise. + * zl_issuer_lock also protects leaving the open state. + * zcw_lwb setting is protected by zl_issuer_lock and state != + * flush_done, which transition is protected by zl_lock. */ - ASSERT(MUTEX_HELD(&lwb->lwb_zilog->zl_lock)); + ASSERT(MUTEX_HELD(&lwb->lwb_zilog->zl_issuer_lock)); + IMPLY(lwb->lwb_state != LWB_STATE_OPENED, + MUTEX_HELD(&lwb->lwb_zilog->zl_lock)); + ASSERT3S(lwb->lwb_state, !=, LWB_STATE_NEW); + ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE); - mutex_enter(&zcw->zcw_lock); ASSERT(!list_link_active(&zcw->zcw_node)); - ASSERT3P(zcw->zcw_lwb, ==, NULL); - ASSERT3P(lwb, !=, NULL); - ASSERT(lwb->lwb_state == LWB_STATE_OPENED || - lwb->lwb_state == LWB_STATE_ISSUED || - lwb->lwb_state == LWB_STATE_WRITE_DONE); - list_insert_tail(&lwb->lwb_waiters, zcw); + ASSERT3P(zcw->zcw_lwb, ==, NULL); zcw->zcw_lwb = lwb; - mutex_exit(&zcw->zcw_lock); } /* @@ -1294,11 +1315,9 @@ zil_commit_waiter_link_lwb(zil_commit_waiter_t *zcw, lwb_t *lwb) static void zil_commit_waiter_link_nolwb(zil_commit_waiter_t *zcw, list_t *nolwb) { - mutex_enter(&zcw->zcw_lock); ASSERT(!list_link_active(&zcw->zcw_node)); - ASSERT3P(zcw->zcw_lwb, ==, NULL); list_insert_tail(nolwb, zcw); - mutex_exit(&zcw->zcw_lock); + ASSERT3P(zcw->zcw_lwb, ==, NULL); } void @@ -1484,7 +1503,7 @@ zil_lwb_flush_wait_all(zilog_t *zilog, uint64_t txg) mutex_enter(&zilog->zl_lock); mutex_enter(&zilog->zl_lwb_io_lock); lwb_t *lwb = list_head(&zilog->zl_lwb_list); - while (lwb != NULL && lwb->lwb_max_txg <= txg) { + while (lwb != NULL) { if (lwb->lwb_issued_txg <= txg) { ASSERT(lwb->lwb_state != LWB_STATE_ISSUED); ASSERT(lwb->lwb_state != LWB_STATE_WRITE_DONE); @@ -1527,14 +1546,6 @@ zil_lwb_write_done(zio_t *zio) ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), !=, 0); - ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF); - ASSERT(BP_GET_TYPE(zio->io_bp) == DMU_OT_INTENT_LOG); - ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); - ASSERT(BP_GET_BYTEORDER(zio->io_bp) == ZFS_HOST_BYTEORDER); - ASSERT(!BP_IS_GANG(zio->io_bp)); - ASSERT(!BP_IS_HOLE(zio->io_bp)); - ASSERT(BP_GET_FILL(zio->io_bp) == 0); - abd_free(zio->io_abd); zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); lwb->lwb_buf = NULL; @@ -1542,6 +1553,7 @@ zil_lwb_write_done(zio_t *zio) mutex_enter(&zilog->zl_lock); ASSERT3S(lwb->lwb_state, ==, LWB_STATE_ISSUED); lwb->lwb_state = LWB_STATE_WRITE_DONE; 
+ lwb->lwb_child_zio = NULL; lwb->lwb_write_zio = NULL; nlwb = list_next(&zilog->zl_lwb_list, lwb); mutex_exit(&zilog->zl_lock); @@ -1606,116 +1618,75 @@ zil_lwb_write_done(zio_t *zio) } } +/* + * Build the zio dependency chain, which is used to preserve the ordering of + * lwb completions that is required by the semantics of the ZIL. Each new lwb + * zio becomes a parent of the previous lwb zio, such that the new lwb's zio + * cannot complete until the previous lwb's zio completes. + * + * This is required by the semantics of zil_commit(): the commit waiters + * attached to the lwbs will be woken in the lwb zio's completion callback, + * so this zio dependency graph ensures the waiters are woken in the correct + * order (the same order the lwbs were created). + */ static void zil_lwb_set_zio_dependency(zilog_t *zilog, lwb_t *lwb) { - lwb_t *last_lwb_opened = zilog->zl_last_lwb_opened; - - ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); ASSERT(MUTEX_HELD(&zilog->zl_lock)); + lwb_t *prev_lwb = list_prev(&zilog->zl_lwb_list, lwb); + if (prev_lwb == NULL || + prev_lwb->lwb_state == LWB_STATE_FLUSH_DONE) + return; + /* - * The zilog's "zl_last_lwb_opened" field is used to build the - * lwb/zio dependency chain, which is used to preserve the - * ordering of lwb completions that is required by the semantics - * of the ZIL. Each new lwb zio becomes a parent of the - * "previous" lwb zio, such that the new lwb's zio cannot - * complete until the "previous" lwb's zio completes. + * If the previous lwb's write hasn't already completed, we also want + * to order the completion of the lwb write zios (above, we only order + * the completion of the lwb root zios). This is required because of + * how we can defer the DKIOCFLUSHWRITECACHE commands for each lwb. + * + * When the DKIOCFLUSHWRITECACHE commands are deferred, the previous + * lwb will rely on this lwb to flush the vdevs written to by that + * previous lwb. Thus, we need to ensure this lwb doesn't issue the + * flush until after the previous lwb's write completes. We ensure + * this ordering by setting the zio parent/child relationship here. * - * This is required by the semantics of zil_commit(); the commit - * waiters attached to the lwbs will be woken in the lwb zio's - * completion callback, so this zio dependency graph ensures the - * waiters are woken in the correct order (the same order the - * lwbs were created). + * Without this relationship on the lwb's write zio, it's possible + * for this lwb's write to complete prior to the previous lwb's write + * completing; and thus, the vdevs for the previous lwb would be + * flushed prior to that lwb's data being written to those vdevs (the + * vdevs are flushed in the lwb write zio's completion handler, + * zil_lwb_write_done()). */ - if (last_lwb_opened != NULL && - last_lwb_opened->lwb_state != LWB_STATE_FLUSH_DONE) { - ASSERT(last_lwb_opened->lwb_state == LWB_STATE_OPENED || - last_lwb_opened->lwb_state == LWB_STATE_ISSUED || - last_lwb_opened->lwb_state == LWB_STATE_WRITE_DONE); - - ASSERT3P(last_lwb_opened->lwb_root_zio, !=, NULL); - zio_add_child(lwb->lwb_root_zio, - last_lwb_opened->lwb_root_zio); - - /* - * If the previous lwb's write hasn't already completed, - * we also want to order the completion of the lwb write - * zios (above, we only order the completion of the lwb - * root zios). This is required because of how we can - * defer the DKIOCFLUSHWRITECACHE commands for each lwb. 
- * - * When the DKIOCFLUSHWRITECACHE commands are deferred, - * the previous lwb will rely on this lwb to flush the - * vdevs written to by that previous lwb. Thus, we need - * to ensure this lwb doesn't issue the flush until - * after the previous lwb's write completes. We ensure - * this ordering by setting the zio parent/child - * relationship here. - * - * Without this relationship on the lwb's write zio, - * it's possible for this lwb's write to complete prior - * to the previous lwb's write completing; and thus, the - * vdevs for the previous lwb would be flushed prior to - * that lwb's data being written to those vdevs (the - * vdevs are flushed in the lwb write zio's completion - * handler, zil_lwb_write_done()). - */ - if (last_lwb_opened->lwb_state != LWB_STATE_WRITE_DONE) { - ASSERT(last_lwb_opened->lwb_state == LWB_STATE_OPENED || - last_lwb_opened->lwb_state == LWB_STATE_ISSUED); - - ASSERT3P(last_lwb_opened->lwb_write_zio, !=, NULL); - zio_add_child(lwb->lwb_write_zio, - last_lwb_opened->lwb_write_zio); - } + if (prev_lwb->lwb_state == LWB_STATE_ISSUED) { + ASSERT3P(prev_lwb->lwb_write_zio, !=, NULL); + zio_add_child(lwb->lwb_write_zio, prev_lwb->lwb_write_zio); + } else { + ASSERT3S(prev_lwb->lwb_state, ==, LWB_STATE_WRITE_DONE); } + + ASSERT3P(prev_lwb->lwb_root_zio, !=, NULL); + zio_add_child(lwb->lwb_root_zio, prev_lwb->lwb_root_zio); } /* * This function's purpose is to "open" an lwb such that it is ready to - * accept new itxs being committed to it. To do this, the lwb's zio - * structures are created, and linked to the lwb. This function is - * idempotent; if the passed in lwb has already been opened, this - * function is essentially a no-op. + * accept new itxs being committed to it. This function is idempotent; if + * the passed in lwb has already been opened, it is essentially a no-op. */ static void zil_lwb_write_open(zilog_t *zilog, lwb_t *lwb) { - zbookmark_phys_t zb; - zio_priority_t prio; - ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); - ASSERT3P(lwb, !=, NULL); - EQUIV(lwb->lwb_root_zio == NULL, lwb->lwb_state == LWB_STATE_CLOSED); - EQUIV(lwb->lwb_root_zio != NULL, lwb->lwb_state == LWB_STATE_OPENED); - if (lwb->lwb_root_zio != NULL) + if (lwb->lwb_state != LWB_STATE_NEW) { + ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED); return; - - lwb->lwb_root_zio = zio_root(zilog->zl_spa, - zil_lwb_flush_vdevs_done, lwb, ZIO_FLAG_CANFAIL); - - abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf, - BP_GET_LSIZE(&lwb->lwb_blk)); - - if (!lwb->lwb_slog || zilog->zl_cur_used <= zil_slog_bulk) - prio = ZIO_PRIORITY_SYNC_WRITE; - else - prio = ZIO_PRIORITY_ASYNC_WRITE; - - SET_BOOKMARK(&zb, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET], - ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, - lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]); - - lwb->lwb_write_zio = zio_rewrite(lwb->lwb_root_zio, zilog->zl_spa, 0, - &lwb->lwb_blk, lwb_abd, BP_GET_LSIZE(&lwb->lwb_blk), - zil_lwb_write_done, lwb, prio, ZIO_FLAG_CANFAIL, &zb); + } mutex_enter(&zilog->zl_lock); lwb->lwb_state = LWB_STATE_OPENED; - zil_lwb_set_zio_dependency(zilog, lwb); zilog->zl_last_lwb_opened = lwb; mutex_exit(&zilog->zl_lock); } @@ -1752,57 +1723,21 @@ static uint_t zil_maxblocksize = SPA_OLD_MAXBLOCKSIZE; * Has to be called under zl_issuer_lock to chain more lwbs. 
*/ static lwb_t * -zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb, list_t *ilwbs) +zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb, lwb_state_t state) { - lwb_t *nlwb = NULL; - zil_chain_t *zilc; - spa_t *spa = zilog->zl_spa; - blkptr_t *bp; - dmu_tx_t *tx; - uint64_t txg; - uint64_t zil_blksz; - int i, error; - boolean_t slog; + int i; ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); - ASSERT3P(lwb->lwb_root_zio, !=, NULL); - ASSERT3P(lwb->lwb_write_zio, !=, NULL); ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED); + lwb->lwb_state = LWB_STATE_CLOSED; /* - * If this lwb includes indirect writes, we have to commit before - * creating the transaction, otherwise we may end up in dead lock. - */ - if (lwb->lwb_indirect) { - for (itx_t *itx = list_head(&lwb->lwb_itxs); itx; - itx = list_next(&lwb->lwb_itxs, itx)) - zil_lwb_commit(zilog, lwb, itx); - lwb->lwb_nused = lwb->lwb_nfilled; - } - - /* - * Allocate the next block and save its address in this block - * before writing it in order to establish the log chain. + * If there was an allocation failure then returned NULL will trigger + * zil_commit_writer_stall() at the caller. This is inherently racy, + * since allocation may not have happened yet. */ - - tx = dmu_tx_create(zilog->zl_os); - - /* - * Since we are not going to create any new dirty data, and we - * can even help with clearing the existing dirty data, we - * should not be subject to the dirty data based delays. We - * use TXG_NOTHROTTLE to bypass the delay mechanism. - */ - VERIFY0(dmu_tx_assign(tx, TXG_WAIT | TXG_NOTHROTTLE)); - - dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); - txg = dmu_tx_get_txg(tx); - - mutex_enter(&zilog->zl_lwb_io_lock); - lwb->lwb_issued_txg = txg; - zilog->zl_lwb_inflight[txg & TXG_MASK]++; - zilog->zl_lwb_max_issued_txg = MAX(txg, zilog->zl_lwb_max_issued_txg); - mutex_exit(&zilog->zl_lwb_io_lock); + if (lwb->lwb_error != 0) + return (NULL); /* * Log blocks are pre-allocated. Here we select the size of the next @@ -1820,7 +1755,7 @@ zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb, list_t *ilwbs) * the maximum block size because we can exhaust the available * pool log space. */ - zil_blksz = zilog->zl_cur_used + sizeof (zil_chain_t); + uint64_t zil_blksz = zilog->zl_cur_used + sizeof (zil_chain_t); for (i = 0; zil_blksz > zil_block_buckets[i].limit; i++) continue; zil_blksz = MIN(zil_block_buckets[i].blksz, zilog->zl_max_block_size); @@ -1832,94 +1767,149 @@ zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb, list_t *ilwbs) uint64_t, zilog->zl_prev_blks[zilog->zl_prev_rotor]); zilog->zl_prev_rotor = (zilog->zl_prev_rotor + 1) & (ZIL_PREV_BLKS - 1); - if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) - zilc = (zil_chain_t *)lwb->lwb_buf; - else - zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_sz); - bp = &zilc->zc_next_blk; - BP_ZERO(bp); - error = zio_alloc_zil(spa, zilog->zl_os, txg, bp, zil_blksz, &slog); - if (error == 0) { - ASSERT3U(bp->blk_birth, ==, txg); - bp->blk_cksum = lwb->lwb_blk.blk_cksum; - bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++; + return (zil_alloc_lwb(zilog, zil_blksz, NULL, 0, 0, state)); +} - /* - * Allocate a new log write block (lwb). - */ - nlwb = zil_alloc_lwb(zilog, bp, slog, txg); - } +/* + * Finalize previously closed block and issue the write zio. 
+ */ +static void +zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) +{ + spa_t *spa = zilog->zl_spa; + zil_chain_t *zilc; + boolean_t slog; + zbookmark_phys_t zb; + zio_priority_t prio; + int error; - lwb->lwb_state = LWB_STATE_ISSUED; + ASSERT3S(lwb->lwb_state, ==, LWB_STATE_CLOSED); - dmu_tx_commit(tx); + /* Actually fill the lwb with the data. */ + for (itx_t *itx = list_head(&lwb->lwb_itxs); itx; + itx = list_next(&lwb->lwb_itxs, itx)) + zil_lwb_commit(zilog, lwb, itx); + lwb->lwb_nused = lwb->lwb_nfilled; + + lwb->lwb_root_zio = zio_root(spa, zil_lwb_flush_vdevs_done, lwb, + ZIO_FLAG_CANFAIL); /* - * We need to acquire the config lock for the lwb to issue it later. - * However, if we already have a queue of closed parent lwbs already - * holding the config lock (but not yet issued), we can't block here - * waiting on the lock or we will deadlock. In that case we must - * first issue to parent IOs before waiting on the lock. + * The lwb is now ready to be issued, but it can be issued only if it + * already has its block pointer allocated or the allocation has + * failed. Otherwise leave it as-is, relying on some other thread to + * issue it after allocating its block pointer by calling + * zil_lwb_write_issue() for the previous lwb(s) in the chain. */ - if (ilwbs && !list_is_empty(ilwbs)) { - if (!spa_config_tryenter(spa, SCL_STATE, lwb, RW_READER)) { - lwb_t *tlwb; - while ((tlwb = list_remove_head(ilwbs)) != NULL) - zil_lwb_write_issue(zilog, tlwb); - spa_config_enter(spa, SCL_STATE, lwb, RW_READER); + mutex_enter(&zilog->zl_lock); + lwb->lwb_state = LWB_STATE_READY; + if (BP_IS_HOLE(&lwb->lwb_blk) && lwb->lwb_error == 0) { + mutex_exit(&zilog->zl_lock); + return; + } + mutex_exit(&zilog->zl_lock); + +next_lwb: + if (lwb->lwb_slim) + zilc = (zil_chain_t *)lwb->lwb_buf; + else + zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_nmax); + int wsz = lwb->lwb_sz; + if (lwb->lwb_error == 0) { + abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf, lwb->lwb_sz); + if (!lwb->lwb_slog || zilog->zl_cur_used <= zil_slog_bulk) + prio = ZIO_PRIORITY_SYNC_WRITE; + else + prio = ZIO_PRIORITY_ASYNC_WRITE; + SET_BOOKMARK(&zb, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET], + ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, + lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]); + lwb->lwb_write_zio = zio_rewrite(lwb->lwb_root_zio, spa, 0, + &lwb->lwb_blk, lwb_abd, lwb->lwb_sz, zil_lwb_write_done, + lwb, prio, ZIO_FLAG_CANFAIL, &zb); + zil_lwb_add_block(lwb, &lwb->lwb_blk); + + if (lwb->lwb_slim) { + /* For Slim ZIL only write what is used. */ + wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ, + int); + ASSERT3S(wsz, <=, lwb->lwb_sz); + zio_shrink(lwb->lwb_write_zio, wsz); + wsz = lwb->lwb_write_zio->io_size; } + memset(lwb->lwb_buf + lwb->lwb_nused, 0, wsz - lwb->lwb_nused); + zilc->zc_pad = 0; + zilc->zc_nused = lwb->lwb_nused; + zilc->zc_eck.zec_cksum = lwb->lwb_blk.blk_cksum; } else { - spa_config_enter(spa, SCL_STATE, lwb, RW_READER); + /* + * We can't write the lwb if there was an allocation failure, + * so create a null zio instead just to maintain dependencies. + */ + lwb->lwb_write_zio = zio_null(lwb->lwb_root_zio, spa, NULL, + zil_lwb_write_done, lwb, ZIO_FLAG_CANFAIL); + lwb->lwb_write_zio->io_error = lwb->lwb_error; } - - if (ilwbs) - list_insert_tail(ilwbs, lwb); + if (lwb->lwb_child_zio) + zio_add_child(lwb->lwb_write_zio, lwb->lwb_child_zio); /* - * If there was an allocation failure then nlwb will be null which - * forces a txg_wait_synced(). + * Open transaction to allocate the next block pointer.
 */ - return (nlwb); -} + dmu_tx_t *tx = dmu_tx_create(zilog->zl_os); + VERIFY0(dmu_tx_assign(tx, TXG_WAIT | TXG_NOTHROTTLE)); + dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); + uint64_t txg = dmu_tx_get_txg(tx); -/* - * Finalize previously closed block and issue the write zio. - * Does not require locking. - */ -static void -zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) -{ - zil_chain_t *zilc; - int wsz; - - /* Actually fill the lwb with the data if not yet. */ - if (!lwb->lwb_indirect) { - for (itx_t *itx = list_head(&lwb->lwb_itxs); itx; - itx = list_next(&lwb->lwb_itxs, itx)) - zil_lwb_commit(zilog, lwb, itx); - lwb->lwb_nused = lwb->lwb_nfilled; + /* + * Allocate the next block pointer unless we are already in error. + */ + lwb_t *nlwb = list_next(&zilog->zl_lwb_list, lwb); + blkptr_t *bp = &zilc->zc_next_blk; + BP_ZERO(bp); + error = lwb->lwb_error; + if (error == 0) { + error = zio_alloc_zil(spa, zilog->zl_os, txg, bp, nlwb->lwb_sz, + &slog); + } + if (error == 0) { + ASSERT3U(bp->blk_birth, ==, txg); + BP_SET_CHECKSUM(bp, nlwb->lwb_slim ? ZIO_CHECKSUM_ZILOG2 : + ZIO_CHECKSUM_ZILOG); + bp->blk_cksum = lwb->lwb_blk.blk_cksum; + bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++; } - if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) { - /* For Slim ZIL only write what is used. */ - wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ, int); - ASSERT3S(wsz, <=, lwb->lwb_sz); - zio_shrink(lwb->lwb_write_zio, wsz); - wsz = lwb->lwb_write_zio->io_size; + /* + * Reduce TXG open time by incrementing the inflight counter and + * committing the transaction. zil_sync() will wait for it to return + * to zero. + */ + mutex_enter(&zilog->zl_lwb_io_lock); + lwb->lwb_issued_txg = txg; + zilog->zl_lwb_inflight[txg & TXG_MASK]++; + zilog->zl_lwb_max_issued_txg = MAX(txg, zilog->zl_lwb_max_issued_txg); + mutex_exit(&zilog->zl_lwb_io_lock); + dmu_tx_commit(tx); - zilc = (zil_chain_t *)lwb->lwb_buf; - } else { - wsz = lwb->lwb_sz; - zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_sz); - } - zilc->zc_pad = 0; - zilc->zc_nused = lwb->lwb_nused; - zilc->zc_eck.zec_cksum = lwb->lwb_blk.blk_cksum; + spa_config_enter(spa, SCL_STATE, lwb, RW_READER); /* - * clear unused data for security + * We've completed all potentially blocking operations. Update the + * nlwb and allow it to proceed without possible lock order reversals. */ - memset(lwb->lwb_buf + lwb->lwb_nused, 0, wsz - lwb->lwb_nused); + mutex_enter(&zilog->zl_lock); + zil_lwb_set_zio_dependency(zilog, lwb); + lwb->lwb_state = LWB_STATE_ISSUED; + + if (nlwb) { + nlwb->lwb_blk = *bp; + nlwb->lwb_error = error; + nlwb->lwb_slog = slog; + nlwb->lwb_alloc_txg = txg; + if (nlwb->lwb_state != LWB_STATE_READY) + nlwb = NULL; + } + mutex_exit(&zilog->zl_lock); if (lwb->lwb_slog) { ZIL_STAT_BUMP(zilog, zil_itx_metaslab_slog_count); @@ -1938,11 +1928,19 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) ZIL_STAT_INCR(zilog, zil_itx_metaslab_normal_alloc, BP_GET_LSIZE(&lwb->lwb_blk)); } - ASSERT(spa_config_held(zilog->zl_spa, SCL_STATE, RW_READER)); - zil_lwb_add_block(lwb, &lwb->lwb_blk); lwb->lwb_issued_timestamp = gethrtime(); zio_nowait(lwb->lwb_root_zio); zio_nowait(lwb->lwb_write_zio); + if (lwb->lwb_child_zio) + zio_nowait(lwb->lwb_child_zio); + + /* + * If nlwb was ready when we gave it the block pointer, + * it is on us to issue it and possibly the following ones. + */ + lwb = nlwb; + if (lwb) + goto next_lwb; } /* @@ -2014,10 +2012,7 @@ zil_lwb_assign(zilog_t *zilog, lwb_t *lwb, itx_t *itx, list_t *ilwbs) * For more details, see the comment above zil_commit().
 */ if (lr->lrc_txtype == TX_COMMIT) { - mutex_enter(&zilog->zl_lock); zil_commit_waiter_link_lwb(itx->itx_private, lwb); - itx->itx_private = NULL; - mutex_exit(&zilog->zl_lock); list_insert_tail(&lwb->lwb_itxs, itx); return (lwb); } @@ -2036,17 +2031,17 @@ cont: * If this record won't fit in the current log block, start a new one. * For WR_NEED_COPY optimize layout for minimal number of chunks. */ - lwb_sp = lwb->lwb_sz - lwb->lwb_nused; + lwb_sp = lwb->lwb_nmax - lwb->lwb_nused; max_log_data = zil_max_log_data(zilog, sizeof (lr_write_t)); if (reclen > lwb_sp || (reclen + dlen > lwb_sp && lwb_sp < zil_max_waste_space(zilog) && (dlen % max_log_data == 0 || lwb_sp < reclen + dlen % max_log_data))) { - lwb = zil_lwb_write_close(zilog, lwb, ilwbs); + list_insert_tail(ilwbs, lwb); + lwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_OPENED); if (lwb == NULL) return (NULL); - zil_lwb_write_open(zilog, lwb); - lwb_sp = lwb->lwb_sz - lwb->lwb_nused; + lwb_sp = lwb->lwb_nmax - lwb->lwb_nused; /* * There must be enough space in the new, empty log block to @@ -2084,7 +2079,7 @@ cont: clr->lrc_seq = ++zilog->zl_lr_seq; lwb->lwb_nused += reclen + dnow; - ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_sz); + ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_nmax); ASSERT0(P2PHASE(lwb->lwb_nused, sizeof (uint64_t))); zil_lwb_add_txg(lwb, lr->lrc_txg); @@ -2096,22 +2091,9 @@ cont: goto cont; } - /* - * We have to really issue all queued LWBs before we may have to - * wait for a txg sync. Otherwise we may end up in a dead lock. - */ - if (lr->lrc_txtype == TX_WRITE) { - boolean_t frozen = lr->lrc_txg > spa_freeze_txg(zilog->zl_spa); - if (frozen || itx->itx_wr_state == WR_INDIRECT) { - lwb_t *tlwb; - while ((tlwb = list_remove_head(ilwbs)) != NULL) - zil_lwb_write_issue(zilog, tlwb); - } - if (itx->itx_wr_state == WR_INDIRECT) - lwb->lwb_indirect = B_TRUE; - if (frozen) - txg_wait_synced(zilog->zl_dmu_pool, lr->lrc_txg); - } + if (lr->lrc_txtype == TX_WRITE && + lr->lrc_txg > spa_freeze_txg(zilog->zl_spa)) + txg_wait_synced(zilog->zl_dmu_pool, lr->lrc_txg); return (lwb); } @@ -2174,26 +2156,24 @@ zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx) ZIL_STAT_BUMP(zilog, zil_itx_indirect_count); ZIL_STAT_INCR(zilog, zil_itx_indirect_bytes, lrw->lr_length); + if (lwb->lwb_child_zio == NULL) { + lwb->lwb_child_zio = zio_root( + zilog->zl_spa, NULL, NULL, + ZIO_FLAG_CANFAIL); + } } /* - * We pass in the "lwb_write_zio" rather than - * "lwb_root_zio" so that the "lwb_write_zio" - * becomes the parent of any zio's created by - * the "zl_get_data" callback. The vdevs are - * flushed after the "lwb_write_zio" completes, - * so we want to make sure that completion - * callback waits for these additional zio's, - * such that the vdevs used by those zio's will - * be included in the lwb's vdev tree, and those - * vdevs will be properly flushed. If we passed - * in "lwb_root_zio" here, then these additional - * vdevs may not be flushed; e.g. if these zio's - * completed after "lwb_write_zio" completed. + * The "lwb_child_zio" we pass in will become a child of + * "lwb_write_zio" when one is created, so it will be a + * parent of any zios created by the "zl_get_data" + * callback. This way "lwb_write_zio" will first wait for + * the child zios to provide their block pointers before + * its own write, and then for their write completion + * before flushing the vdev cache. */ error = zilog->zl_get_data(itx->itx_private, itx->itx_gen, lrwb, dbuf, lwb, - lwb->lwb_write_zio); + lwb->lwb_child_zio); if (dbuf != NULL && error == 0) { /* Zero any padding bytes in the last block.
*/ memset((char *)dbuf + lrwb->lr_length, 0, @@ -2226,12 +2206,8 @@ zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx) error); zfs_fallthrough; case EIO: - if (lwb->lwb_indirect) { - txg_wait_synced(zilog->zl_dmu_pool, - lr->lrc_txg); - } else { - lwb->lwb_write_zio->io_error = error; - } + txg_wait_synced(zilog->zl_dmu_pool, + lr->lrc_txg); zfs_fallthrough; case ENOENT: zfs_fallthrough; @@ -2675,7 +2651,6 @@ zil_prune_commit_list(zilog_t *zilog) zil_commit_waiter_skip(itx->itx_private); } else { zil_commit_waiter_link_lwb(itx->itx_private, last_lwb); - itx->itx_private = NULL; } mutex_exit(&zilog->zl_lock); @@ -2753,10 +2728,9 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs) * have already been created (zl_lwb_list not empty). */ zil_commit_activate_saxattr_feature(zilog); - ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED); - ASSERT3S(lwb->lwb_state, !=, LWB_STATE_WRITE_DONE); - ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE); - first = (lwb->lwb_state != LWB_STATE_OPENED) && + ASSERT(lwb->lwb_state == LWB_STATE_NEW || + lwb->lwb_state == LWB_STATE_OPENED); + first = (lwb->lwb_state == LWB_STATE_NEW) && ((plwb = list_prev(&zilog->zl_lwb_list, lwb)) == NULL || plwb->lwb_state == LWB_STATE_FLUSH_DONE); } @@ -2880,37 +2854,32 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs) } else { ASSERT(list_is_empty(&nolwb_waiters)); ASSERT3P(lwb, !=, NULL); - ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED); - ASSERT3S(lwb->lwb_state, !=, LWB_STATE_WRITE_DONE); - ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE); + ASSERT(lwb->lwb_state == LWB_STATE_NEW || + lwb->lwb_state == LWB_STATE_OPENED); /* * At this point, the ZIL block pointed at by the "lwb" - * variable is in one of the following states: "closed" - * or "open". + * variable is in "new" or "opened" state. * - * If it's "closed", then no itxs have been committed to - * it, so there's no point in issuing its zio (i.e. it's - * "empty"). + * If it's "new", then no itxs have been committed to it, so + * there's no point in issuing its zio (i.e. it's "empty"). * - * If it's "open", then it contains one or more itxs that + * If it's "opened", then it contains one or more itxs that * eventually need to be committed to stable storage. In * this case we intentionally do not issue the lwb's zio * to disk yet, and instead rely on one of the following * two mechanisms for issuing the zio: * - * 1. Ideally, there will be more ZIL activity occurring - * on the system, such that this function will be - * immediately called again (not necessarily by the same - * thread) and this lwb's zio will be issued via - * zil_lwb_assign(). This way, the lwb is guaranteed to - * be "full" when it is issued to disk, and we'll make - * use of the lwb's size the best we can. + * 1. Ideally, there will be more ZIL activity occurring on + * the system, such that this function will be immediately + * called again by different thread and this lwb will be + * closed by zil_lwb_assign(). This way, the lwb will be + * "full" when it is issued to disk, and we'll make use of + * the lwb's size the best we can. * * 2. If there isn't sufficient ZIL activity occurring on - * the system, such that this lwb's zio isn't issued via - * zil_lwb_assign(), zil_commit_waiter() will issue the - * lwb's zio. If this occurs, the lwb is not guaranteed + * the system, zil_commit_waiter() will close it and issue + * the zio. 
If this occurs, the lwb is not guaranteed * to be "full" by the time its zio is issued, and means * the size of the lwb was "too large" given the amount * of ZIL activity occurring on the system at that time. @@ -2940,8 +2909,11 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs) hrtime_t sleep = zilog->zl_last_lwb_latency * zfs_commit_timeout_pct / 100; if (sleep < zil_min_commit_timeout || - lwb->lwb_sz - lwb->lwb_nused < lwb->lwb_sz / 8) { - lwb = zil_lwb_write_close(zilog, lwb, ilwbs); + lwb->lwb_nmax - lwb->lwb_nused < + lwb->lwb_nmax / 8) { + list_insert_tail(ilwbs, lwb); + lwb = zil_lwb_write_close(zilog, lwb, + LWB_STATE_NEW); zilog->zl_cur_used = 0; if (lwb == NULL) { while ((lwb = list_remove_head(ilwbs)) @@ -3024,7 +2996,7 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw) lwb_t *lwb = zcw->zcw_lwb; ASSERT3P(lwb, !=, NULL); - ASSERT3S(lwb->lwb_state, !=, LWB_STATE_CLOSED); + ASSERT3S(lwb->lwb_state, !=, LWB_STATE_NEW); /* * If the lwb has already been issued by another thread, we can @@ -3033,9 +3005,7 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw) * do this prior to acquiring the zl_issuer_lock, to avoid * acquiring it when it's not necessary to do so. */ - if (lwb->lwb_state == LWB_STATE_ISSUED || - lwb->lwb_state == LWB_STATE_WRITE_DONE || - lwb->lwb_state == LWB_STATE_FLUSH_DONE) + if (lwb->lwb_state != LWB_STATE_OPENED) return; /* @@ -3058,8 +3028,8 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw) * wind up with a use-after-free error below. */ if (zcw->zcw_done) { - lwb = NULL; - goto out; + mutex_exit(&zilog->zl_issuer_lock); + return; } ASSERT3P(lwb, ==, zcw->zcw_lwb); @@ -3070,28 +3040,33 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw) * second time while holding the lock. * * We don't need to hold the zl_lock since the lwb cannot transition - * from OPENED to ISSUED while we hold the zl_issuer_lock. The lwb - * _can_ transition from ISSUED to DONE, but it's OK to race with + * from OPENED to CLOSED while we hold the zl_issuer_lock. The lwb + * _can_ transition from CLOSED to DONE, but it's OK to race with * that transition since we treat the lwb the same, whether it's in - * the ISSUED or DONE states. + * the CLOSED, ISSUED or DONE states. * * The important thing, is we treat the lwb differently depending on - * if it's ISSUED or OPENED, and block any other threads that might - * attempt to issue this lwb. For that reason we hold the + * if it's OPENED or CLOSED, and block any other threads that might + * attempt to close/issue this lwb. For that reason we hold the * zl_issuer_lock when checking the lwb_state; we must not call - * zil_lwb_write_close() if the lwb had already been issued. + * zil_lwb_write_close() if the lwb had already been closed/issued. * * See the comment above the lwb_state_t structure definition for * more details on the lwb states, and locking requirements. */ - if (lwb->lwb_state == LWB_STATE_ISSUED || - lwb->lwb_state == LWB_STATE_WRITE_DONE || - lwb->lwb_state == LWB_STATE_FLUSH_DONE) { - lwb = NULL; - goto out; + if (lwb->lwb_state != LWB_STATE_OPENED) { + mutex_exit(&zilog->zl_issuer_lock); + return; } - ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED); + /* + * We do not need zcw_lock once we hold zl_issuer_lock and know lwb + * is still open. 
But we have to drop it to avoid a deadlock in case a + * callback of a zio issued by zil_lwb_write_issue() tries to get it + * while zil_lwb_write_issue() is blocked attempting to issue the + * next lwb it found in the LWB_STATE_READY state. + */ + mutex_exit(&zcw->zcw_lock); /* * As described in the comments above zil_commit_waiter() and @@ -3099,9 +3074,9 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw) * since we've reached the commit waiter's timeout and it still * hasn't been issued. */ - lwb_t *nlwb = zil_lwb_write_close(zilog, lwb, NULL); + lwb_t *nlwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_NEW); - ASSERT3S(lwb->lwb_state, !=, LWB_STATE_OPENED); + ASSERT3S(lwb->lwb_state, ==, LWB_STATE_CLOSED); /* * Since the lwb's zio hadn't been issued by the time this thread @@ -3124,34 +3099,15 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw) * "next" lwb on-disk. When this occurs, the ZIL write * pipeline must be stalled; see the comment within the * zil_commit_writer_stall() function for more details. - * - * We must drop the commit waiter's lock prior to - * calling zil_commit_writer_stall() or else we can wind - * up with the following deadlock: - * - * - This thread is waiting for the txg to sync while - * holding the waiter's lock; txg_wait_synced() is - * used within txg_commit_writer_stall(). - * - * - The txg can't sync because it is waiting for this - * lwb's zio callback to call dmu_tx_commit(). - * - * - The lwb's zio callback can't call dmu_tx_commit() - * because it's blocked trying to acquire the waiter's - * lock, which occurs prior to calling dmu_tx_commit() */ - mutex_exit(&zcw->zcw_lock); zil_lwb_write_issue(zilog, lwb); - lwb = NULL; zil_commit_writer_stall(zilog); - mutex_enter(&zcw->zcw_lock); - } - -out: - mutex_exit(&zilog->zl_issuer_lock); - if (lwb) + mutex_exit(&zilog->zl_issuer_lock); + } else { + mutex_exit(&zilog->zl_issuer_lock); zil_lwb_write_issue(zilog, lwb); - ASSERT(MUTEX_HELD(&zcw->zcw_lock)); + } + mutex_enter(&zcw->zcw_lock); } /* @@ -3216,7 +3172,7 @@ zil_commit_waiter(zilog_t *zilog, zil_commit_waiter_t *zcw) * where it's "zcw_lwb" field is NULL, and it hasn't yet * been skipped, so it's "zcw_done" field is still B_FALSE.
*/ - IMPLY(lwb != NULL, lwb->lwb_state != LWB_STATE_CLOSED); + IMPLY(lwb != NULL, lwb->lwb_state != LWB_STATE_NEW); if (lwb != NULL && lwb->lwb_state == LWB_STATE_OPENED) { ASSERT3B(timedout, ==, B_FALSE); @@ -3264,6 +3220,8 @@ zil_commit_waiter(zilog_t *zilog, zil_commit_waiter_t *zcw) */ IMPLY(lwb != NULL, + lwb->lwb_state == LWB_STATE_CLOSED || + lwb->lwb_state == LWB_STATE_READY || lwb->lwb_state == LWB_STATE_ISSUED || lwb->lwb_state == LWB_STATE_WRITE_DONE || lwb->lwb_state == LWB_STATE_FLUSH_DONE); @@ -3618,10 +3576,11 @@ zil_sync(zilog_t *zilog, dmu_tx_t *tx) while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) { zh->zh_log = lwb->lwb_blk; if (lwb->lwb_state != LWB_STATE_FLUSH_DONE || - lwb->lwb_max_txg > txg) + lwb->lwb_alloc_txg > txg || lwb->lwb_max_txg > txg) break; list_remove(&zilog->zl_lwb_list, lwb); - zio_free(spa, txg, &lwb->lwb_blk); + if (!BP_IS_HOLE(&lwb->lwb_blk)) + zio_free(spa, txg, &lwb->lwb_blk); zil_free_lwb(zilog, lwb); /* @@ -3825,17 +3784,18 @@ zil_close(zilog_t *zilog) } mutex_enter(&zilog->zl_lock); + txg = zilog->zl_dirty_max_txg; lwb = list_tail(&zilog->zl_lwb_list); - if (lwb == NULL) - txg = zilog->zl_dirty_max_txg; - else - txg = MAX(zilog->zl_dirty_max_txg, lwb->lwb_max_txg); + if (lwb != NULL) { + txg = MAX(txg, lwb->lwb_alloc_txg); + txg = MAX(txg, lwb->lwb_max_txg); + } mutex_exit(&zilog->zl_lock); /* * zl_lwb_max_issued_txg may be larger than lwb_max_txg. It depends * on the time when the dmu_tx transaction is assigned in - * zil_lwb_write_close(). + * zil_lwb_write_issue(). */ mutex_enter(&zilog->zl_lwb_io_lock); txg = MAX(zilog->zl_lwb_max_issued_txg, txg); @@ -3864,8 +3824,7 @@ zil_close(zilog_t *zilog) lwb = list_remove_head(&zilog->zl_lwb_list); if (lwb != NULL) { ASSERT(list_is_empty(&zilog->zl_lwb_list)); - ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED); - + ASSERT3S(lwb->lwb_state, ==, LWB_STATE_NEW); zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); zil_free_lwb(zilog, lwb); } @@ -3986,7 +3945,7 @@ zil_suspend(const char *osname, void **cookiep) /* * We need to use zil_commit_impl to ensure we wait for all - * LWB_STATE_OPENED and LWB_STATE_ISSUED lwbs to be committed + * LWB_STATE_OPENED, _CLOSED and _READY lwbs to be committed * to disk before proceeding. If we used zil_commit instead, it * would just call txg_wait_synced(), because zl_suspend is set. 
* txg_wait_synced() doesn't wait for these lwb's to be diff --git a/module/zfs/zio.c b/module/zfs/zio.c index b5627109900c..7458b416c292 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -4466,8 +4466,8 @@ zio_ready(zio_t *zio) zio_t *pio, *pio_next; zio_link_t *zl = NULL; - if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT | ZIO_CHILD_DDT_BIT, - ZIO_WAIT_READY)) { + if (zio_wait_for_children(zio, ZIO_CHILD_LOGICAL_BIT | + ZIO_CHILD_GANG_BIT | ZIO_CHILD_DDT_BIT, ZIO_WAIT_READY)) { return (NULL); } diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c index bbef53e4e479..cc11fd80618b 100644 --- a/module/zfs/zvol.c +++ b/module/zfs/zvol.c @@ -698,7 +698,6 @@ zvol_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf, int error; ASSERT3P(lwb, !=, NULL); - ASSERT3P(zio, !=, NULL); ASSERT3U(size, !=, 0); zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP); @@ -717,6 +716,7 @@ zvol_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf, error = dmu_read_by_dnode(zv->zv_dn, offset, size, buf, DMU_READ_NO_PREFETCH); } else { /* indirect write */ + ASSERT3P(zio, !=, NULL); /* * Have to lock the whole block to ensure when it's written out * and its checksum is being calculated that no one can change -- cgit v1.2.3 From ed39d668ea63e0bbf4e3e4d761a84c7d42ef8f8b Mon Sep 17 00:00:00 2001 From: Serapheim Dimitropoulos Date: Fri, 25 Aug 2023 10:28:36 -0700 Subject: Update outdated assertion from zio_write_compress As part of some internal gang block testing within Delphix we hit the assertion removed by this patch. The assertion was triggered by a ZIO that had two copies and was a gang block, making the following expression equal to 3: ``` MIN(zp->zp_copies + BP_IS_GANG(bp), spa_max_replication(spa)) ``` and failing when we expected the above to be equal to `BP_GET_NDVAS(bp)`. The assertion is no longer valid since the following commit: ``` commit 14872aaa4f909d72c6b5e4105dadcfa13c7d9d66 Author: Matthew Ahrens Date: Mon Feb 6 09:37:06 2023 -0800 EIO caused by encryption + recursive gang ``` The above commit changed gang block headers so they can't have more than 2 copies, but the assertion in question from this PR was never updated. Reviewed-by: George Wilson Reviewed-by: Matthew Ahrens Signed-off-by: Serapheim Dimitropoulos Closes #15180 --- module/zfs/zio.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 7458b416c292..3b3b40fa73d8 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -1775,8 +1775,9 @@ zio_write_compress(zio_t *zio) compress = ZIO_COMPRESS_OFF; /* Make sure someone doesn't change their mind on overwrites */ - ASSERT(BP_IS_EMBEDDED(bp) || MIN(zp->zp_copies + BP_IS_GANG(bp), - spa_max_replication(spa)) == BP_GET_NDVAS(bp)); + ASSERT(BP_IS_EMBEDDED(bp) || BP_IS_GANG(bp) || + MIN(zp->zp_copies, spa_max_replication(spa)) + == BP_GET_NDVAS(bp)); } /* If it's a compressed write that is not raw, compress the buffer. */ -- cgit v1.2.3 From 804414aad224b432590afe3f9ec114ffb49e0f13 Mon Sep 17 00:00:00 2001 From: Rob N Date: Sat, 26 Aug 2023 03:31:29 +1000 Subject: tests/block_cloning: rename and document get_same_blocks helper `get_same_blocks` is a helper to compare two files and return a list of the blocks that are clones of each other. It's very necessary for block cloning tests. Previously it was incorrectly called `unique_blocks`, which is the _inverse_ of what it does (an early version did list unique blocks; it was changed but the name was not). So if nothing else, it should be called `duplicate_blocks`.
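For example (a usage sketch, not taken from any one test; the pool and file names are illustrative), once file2 is a full clone of a four-block file1, the renamed helper reports every L0 block as shared:

```
typeset blocks=$(get_same_blocks $TESTPOOL file1 $TESTPOOL file2)
log_must [ "$blocks" = "0 1 2 3" ]
```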
But, keeping the details of a clone operation in your head is actually quite difficult, without the additional overhead of wondering how the tools work. So I've renamed it to better describe what it does, added a usage note, and changed it to return block indexes from 0 instead of 1, to match how L0 blocks are normally counted. Reviewed-by: Umer Saleem Reviewed-by: Kay Pedersen Signed-off-by: Rob Norris Closes #15181 --- .../tests/functional/block_cloning/block_cloning.kshlib | 14 +++++++++++--- .../block_cloning/block_cloning_copyfilerange.ksh | 4 ++-- .../block_cloning_copyfilerange_cross_dataset.ksh | 4 ++-- .../block_cloning/block_cloning_copyfilerange_fallback.ksh | 12 ++++++------ .../block_cloning_copyfilerange_fallback_same_txg.ksh | 2 +- .../block_cloning/block_cloning_copyfilerange_partial.ksh | 6 +++--- .../block_cloning/block_cloning_disabled_copyfilerange.ksh | 2 +- .../functional/block_cloning/block_cloning_ficlone.ksh | 4 ++-- .../block_cloning/block_cloning_ficlonerange.ksh | 4 ++-- .../block_cloning/block_cloning_ficlonerange_partial.ksh | 6 +++--- 10 files changed, 33 insertions(+), 25 deletions(-) diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning.kshlib b/tests/zfs-tests/tests/functional/block_cloning/block_cloning.kshlib index 9998e5a87bfe..8e16366b4cd6 100644 --- a/tests/zfs-tests/tests/functional/block_cloning/block_cloning.kshlib +++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning.kshlib @@ -34,13 +34,21 @@ function have_same_content log_must [ "$hash1" = "$hash2" ] } -function unique_blocks +# +# get_same_blocks dataset1 path/to/file1 dataset2 path/to/file2 +# +# Returns a space-separated list of the indexes (starting at 0) of the L0 +# blocks that are shared between both files (by first DVA and checksum). +# Assumes that the two files have the same content; use have_same_content to
+# +function get_same_blocks { typeset zdbout=${TMPDIR:-$TEST_BASE_DIR}/zdbout.$$ zdb -vvvvv $1 -O $2 | \ - awk '/ L0 / { print ++l " " $3 " " $7 }' > $zdbout.a + awk '/ L0 / { print l++ " " $3 " " $7 }' > $zdbout.a zdb -vvvvv $3 -O $4 | \ - awk '/ L0 / { print ++l " " $3 " " $7 }' > $zdbout.b + awk '/ L0 / { print l++ " " $3 " " $7 }' > $zdbout.b echo $(sort $zdbout.a $zdbout.b | uniq -d | cut -f1 -d' ') } diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange.ksh b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange.ksh index 9adcbfcd88a1..43ea47b0ef19 100755 --- a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange.ksh +++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange.ksh @@ -54,7 +54,7 @@ log_must sync_pool $TESTPOOL log_must have_same_content /$TESTPOOL/file1 /$TESTPOOL/file2 -typeset blocks=$(unique_blocks $TESTPOOL file1 $TESTPOOL file2) -log_must [ "$blocks" = "1 2 3 4" ] +typeset blocks=$(get_same_blocks $TESTPOOL file1 $TESTPOOL file2) +log_must [ "$blocks" = "0 1 2 3" ] log_pass $claim diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_cross_dataset.ksh b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_cross_dataset.ksh index 07e089e89ceb..74e6b04903a3 100755 --- a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_cross_dataset.ksh +++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_cross_dataset.ksh @@ -58,8 +58,8 @@ log_must sync_pool $TESTPOOL log_must have_same_content /$TESTPOOL/$TESTFS1/file1 /$TESTPOOL/$TESTFS2/file2 -typeset blocks=$(unique_blocks \ +typeset blocks=$(get_same_blocks \ $TESTPOOL/$TESTFS1 file1 $TESTPOOL/$TESTFS2 file2) -log_must [ "$blocks" = "1 2 3 4" ] +log_must [ "$blocks" = "0 1 2 3" ] log_pass $claim diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback.ksh b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback.ksh index 87f99eb5c0f0..9a96eacd60af 100755 --- a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback.ksh +++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback.ksh @@ -58,8 +58,8 @@ log_must sync_pool $TESTPOOL log_must have_same_content /$TESTPOOL/file /$TESTPOOL/clone -typeset blocks=$(unique_blocks $TESTPOOL file $TESTPOOL clone) -log_must [ "$blocks" = "1 2 3 4" ] +typeset blocks=$(get_same_blocks $TESTPOOL file $TESTPOOL clone) +log_must [ "$blocks" = "0 1 2 3" ] log_note "Copying within a block with copy_file_range" @@ -69,8 +69,8 @@ log_must sync_pool $TESTPOOL log_must have_same_content /$TESTPOOL/file /$TESTPOOL/clone -typeset blocks=$(unique_blocks $TESTPOOL file $TESTPOOL clone) -log_must [ "$blocks" = "2 3 4" ] +typeset blocks=$(get_same_blocks $TESTPOOL file $TESTPOOL clone) +log_must [ "$blocks" = "1 2 3" ] log_note "Copying across a block with copy_file_range" @@ -80,7 +80,7 @@ log_must sync_pool $TESTPOOL log_must have_same_content /$TESTPOOL/file /$TESTPOOL/clone -typeset blocks=$(unique_blocks $TESTPOOL file $TESTPOOL clone) -log_must [ "$blocks" = "2" ] +typeset blocks=$(get_same_blocks $TESTPOOL file $TESTPOOL clone) +log_must [ "$blocks" = "1" ] log_pass $claim diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh 
b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh index 3451f887afb4..a10545bc0769 100755 --- a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh +++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh @@ -59,7 +59,7 @@ log_must sync_pool $TESTPOOL log_must have_same_content /$TESTPOOL/file /$TESTPOOL/clone -typeset blocks=$(unique_blocks $TESTPOOL file $TESTPOOL clone) +typeset blocks=$(get_same_blocks $TESTPOOL file $TESTPOOL clone) log_must [ "$blocks" = "" ] log_pass $claim diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_partial.ksh b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_partial.ksh index ecac62b20350..a5da0a0bd359 100755 --- a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_partial.ksh +++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_partial.ksh @@ -54,7 +54,7 @@ log_must sync_pool $TESTPOOL log_must have_same_content /$TESTPOOL/file1 /$TESTPOOL/file2 -typeset blocks=$(unique_blocks $TESTPOOL file1 $TESTPOOL file2) +typeset blocks=$(get_same_blocks $TESTPOOL file1 $TESTPOOL file2) log_must [ "$blocks" = "" ] log_must clonefile -f /$TESTPOOL/file1 /$TESTPOOL/file2 131072 131072 262144 @@ -62,7 +62,7 @@ log_must sync_pool $TESTPOOL log_must have_same_content /$TESTPOOL/file1 /$TESTPOOL/file2 -typeset blocks=$(unique_blocks $TESTPOOL file1 $TESTPOOL file2) -log_must [ "$blocks" = "2 3" ] +typeset blocks=$(get_same_blocks $TESTPOOL file1 $TESTPOOL file2) +log_must [ "$blocks" = "1 2" ] log_pass $claim diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_disabled_copyfilerange.ksh b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_disabled_copyfilerange.ksh index 30b155a140c4..d21b6251134e 100755 --- a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_disabled_copyfilerange.ksh +++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_disabled_copyfilerange.ksh @@ -54,7 +54,7 @@ log_must sync_pool $TESTPOOL log_must have_same_content /$TESTPOOL/file1 /$TESTPOOL/file2 -typeset blocks=$(unique_blocks $TESTPOOL file1 $TESTPOOL file2) +typeset blocks=$(get_same_blocks $TESTPOOL file1 $TESTPOOL file2) log_must [ "$blocks" = "" ] log_pass $claim diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_ficlone.ksh b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_ficlone.ksh index d13a39229870..3f227fb68ee3 100755 --- a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_ficlone.ksh +++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_ficlone.ksh @@ -50,7 +50,7 @@ log_must sync_pool $TESTPOOL log_must have_same_content /$TESTPOOL/file1 /$TESTPOOL/file2 -typeset blocks=$(unique_blocks $TESTPOOL file1 $TESTPOOL file2) -log_must [ "$blocks" = "1 2 3 4" ] +typeset blocks=$(get_same_blocks $TESTPOOL file1 $TESTPOOL file2) +log_must [ "$blocks" = "0 1 2 3" ] log_pass $claim diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_ficlonerange.ksh b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_ficlonerange.ksh index 6556050c4352..cefc4336aefd 100755 --- a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_ficlonerange.ksh +++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_ficlonerange.ksh @@ -50,7 +50,7 @@ log_must sync_pool $TESTPOOL log_must have_same_content 
/$TESTPOOL/file1 /$TESTPOOL/file2 -typeset blocks=$(unique_blocks $TESTPOOL file1 $TESTPOOL file2) -log_must [ "$blocks" = "1 2 3 4" ] +typeset blocks=$(get_same_blocks $TESTPOOL file1 $TESTPOOL file2) +log_must [ "$blocks" = "0 1 2 3" ] log_pass $claim diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_ficlonerange_partial.ksh b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_ficlonerange_partial.ksh index 37a3511a26d5..067f55aaa65b 100755 --- a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_ficlonerange_partial.ksh +++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_ficlonerange_partial.ksh @@ -50,7 +50,7 @@ log_must sync_pool $TESTPOOL log_must have_same_content /$TESTPOOL/file1 /$TESTPOOL/file2 -typeset blocks=$(unique_blocks $TESTPOOL file1 $TESTPOOL file2) +typeset blocks=$(get_same_blocks $TESTPOOL file1 $TESTPOOL file2) log_must [ "$blocks" = "" ] log_must clonefile -r /$TESTPOOL/file1 /$TESTPOOL/file2 131072 131072 262144 @@ -58,7 +58,7 @@ log_must sync_pool $TESTPOOL log_must have_same_content /$TESTPOOL/file1 /$TESTPOOL/file2 -typeset blocks=$(unique_blocks $TESTPOOL file1 $TESTPOOL file2) -log_must [ "$blocks" = "2 3" ] +typeset blocks=$(get_same_blocks $TESTPOOL file1 $TESTPOOL file2) +log_must [ "$blocks" = "1 2" ] log_pass $claim -- cgit v1.2.3
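As a footnote on the `get_same_blocks` change above, the comparison it performs can be reproduced by hand, which helps when reading the tests. A minimal sketch (the pool name, file names, and scratch paths are illustrative):

```
# Dump each file's block tree; for every L0 entry keep its index
# (counting from 0), first DVA ($3), and checksum ($7), exactly as the
# helper's awk scripts do.
zdb -vvvvv $TESTPOOL -O file1 | awk '/ L0 / { print l++ " " $3 " " $7 }' > /tmp/zdb.a
zdb -vvvvv $TESTPOOL -O file2 | awk '/ L0 / { print l++ " " $3 " " $7 }' > /tmp/zdb.b

# Lines present in both dumps are blocks the files share; print their
# indexes space-separated, matching the helper's output.
echo $(sort /tmp/zdb.a /tmp/zdb.b | uniq -d | cut -f1 -d' ')
```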