From 58211157bf866bbcdd8720e92c27297db3ba75d6 Mon Sep 17 00:00:00 2001 From: Rob N Date: Thu, 21 Mar 2024 10:46:15 +1100 Subject: Linux 6.8 compat: use splice_copy_file_range() for fallback Linux 6.8 removes generic_copy_file_range(), which had been reduced to a simple wrapper around splice_copy_file_range(). Detect that function directly and use it if generic_ is not available. Sponsored-by: https://despairlabs.com/sponsor/ Reviewed-by: Tony Hutter Reviewed by: Brian Behlendorf Signed-off-by: Rob Norris Closes #15930 Closes #15931 (cherry picked from commit ef08a4d4065d21414d7fedccac20da6bfda4dfd0) --- config/kernel-vfs-file_range.m4 | 27 +++++++++++++++++++++++++++ config/kernel.m4 | 2 ++ module/os/linux/zfs/zpl_file_range.c | 16 ++++++++++++++-- 3 files changed, 43 insertions(+), 2 deletions(-) diff --git a/config/kernel-vfs-file_range.m4 b/config/kernel-vfs-file_range.m4 index cc96404d8bbe..8a5cbe2eeeed 100644 --- a/config/kernel-vfs-file_range.m4 +++ b/config/kernel-vfs-file_range.m4 @@ -16,6 +16,9 @@ dnl # dnl # 5.3: VFS copy_file_range() expected to do its own fallback, dnl # generic_copy_file_range() added to support it dnl # +dnl # 6.8: generic_copy_file_range() removed, replaced by +dnl # splice_copy_file_range() +dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_COPY_FILE_RANGE], [ ZFS_LINUX_TEST_SRC([vfs_copy_file_range], [ #include @@ -72,6 +75,30 @@ AC_DEFUN([ZFS_AC_KERNEL_VFS_GENERIC_COPY_FILE_RANGE], [ ]) ]) +AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_SPLICE_COPY_FILE_RANGE], [ + ZFS_LINUX_TEST_SRC([splice_copy_file_range], [ + #include + ], [ + struct file *src_file __attribute__ ((unused)) = NULL; + loff_t src_off __attribute__ ((unused)) = 0; + struct file *dst_file __attribute__ ((unused)) = NULL; + loff_t dst_off __attribute__ ((unused)) = 0; + size_t len __attribute__ ((unused)) = 0; + splice_copy_file_range(src_file, src_off, dst_file, dst_off, + len); + ]) +]) +AC_DEFUN([ZFS_AC_KERNEL_VFS_SPLICE_COPY_FILE_RANGE], [ + AC_MSG_CHECKING([whether splice_copy_file_range() is available]) + ZFS_LINUX_TEST_RESULT([splice_copy_file_range], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_VFS_SPLICE_COPY_FILE_RANGE, 1, + [splice_copy_file_range() is available]) + ],[ + AC_MSG_RESULT(no) + ]) +]) + AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_CLONE_FILE_RANGE], [ ZFS_LINUX_TEST_SRC([vfs_clone_file_range], [ #include diff --git a/config/kernel.m4 b/config/kernel.m4 index e3f8645774c5..1d0c5a27fc7f 100644 --- a/config/kernel.m4 +++ b/config/kernel.m4 @@ -118,6 +118,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ ZFS_AC_KERNEL_SRC_VFS_IOV_ITER ZFS_AC_KERNEL_SRC_VFS_COPY_FILE_RANGE ZFS_AC_KERNEL_SRC_VFS_GENERIC_COPY_FILE_RANGE + ZFS_AC_KERNEL_SRC_VFS_SPLICE_COPY_FILE_RANGE ZFS_AC_KERNEL_SRC_VFS_REMAP_FILE_RANGE ZFS_AC_KERNEL_SRC_VFS_CLONE_FILE_RANGE ZFS_AC_KERNEL_SRC_VFS_DEDUPE_FILE_RANGE @@ -266,6 +267,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ ZFS_AC_KERNEL_VFS_IOV_ITER ZFS_AC_KERNEL_VFS_COPY_FILE_RANGE ZFS_AC_KERNEL_VFS_GENERIC_COPY_FILE_RANGE + ZFS_AC_KERNEL_VFS_SPLICE_COPY_FILE_RANGE ZFS_AC_KERNEL_VFS_REMAP_FILE_RANGE ZFS_AC_KERNEL_VFS_CLONE_FILE_RANGE ZFS_AC_KERNEL_VFS_DEDUPE_FILE_RANGE diff --git a/module/os/linux/zfs/zpl_file_range.c b/module/os/linux/zfs/zpl_file_range.c index 3065d54fa9da..64728fdb1187 100644 --- a/module/os/linux/zfs/zpl_file_range.c +++ b/module/os/linux/zfs/zpl_file_range.c @@ -26,6 +26,9 @@ #include #endif #include +#ifdef HAVE_VFS_SPLICE_COPY_FILE_RANGE +#include +#endif #include #include #include @@ -102,7 +105,7 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off, ret = 
zpl_clone_file_range_impl(src_file, src_off, dst_file, dst_off, len); -#ifdef HAVE_VFS_GENERIC_COPY_FILE_RANGE +#if defined(HAVE_VFS_GENERIC_COPY_FILE_RANGE) /* * Since Linux 5.3 the filesystem driver is responsible for executing * an appropriate fallback, and a generic fallback function is provided. @@ -111,6 +114,15 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off, ret == -EAGAIN) ret = generic_copy_file_range(src_file, src_off, dst_file, dst_off, len, flags); +#elif defined(HAVE_VFS_SPLICE_COPY_FILE_RANGE) + /* + * Since 6.8 the fallback function is called splice_copy_file_range + * and has a slightly different signature. + */ + if (ret == -EOPNOTSUPP || ret == -EINVAL || ret == -EXDEV || + ret == -EAGAIN) + ret = splice_copy_file_range(src_file, src_off, dst_file, + dst_off, len); #else /* * Before Linux 5.3 the filesystem has to return -EOPNOTSUPP to signal @@ -118,7 +130,7 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off, */ if (ret == -EINVAL || ret == -EXDEV || ret == -EAGAIN) ret = -EOPNOTSUPP; -#endif /* HAVE_VFS_GENERIC_COPY_FILE_RANGE */ +#endif /* HAVE_VFS_GENERIC_COPY_FILE_RANGE || HAVE_VFS_SPLICE_COPY_FILE_RANGE */ return (ret); } -- cgit v1.2.3 From 220bb7341eb4b41f017cec8f492e0daf18660d4e Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Mon, 13 Nov 2023 17:55:29 +1100 Subject: linux 5.4 compat: page_size() Before 5.4 we have to do a little math. Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Closes #15533 Closes #15588 (cherry picked from commit df04efe321a49c650f1fbaa6fd701fa2928cbe21) --- config/kernel-mm-page-size.m4 | 17 +++++++++++++++ config/kernel.m4 | 2 ++ include/os/linux/Makefile.am | 1 + include/os/linux/kernel/linux/mm_compat.h | 36 +++++++++++++++++++++++++++++++ 4 files changed, 56 insertions(+) create mode 100644 config/kernel-mm-page-size.m4 create mode 100644 include/os/linux/kernel/linux/mm_compat.h diff --git a/config/kernel-mm-page-size.m4 b/config/kernel-mm-page-size.m4 new file mode 100644 index 000000000000..d5ebd926986a --- /dev/null +++ b/config/kernel-mm-page-size.m4 @@ -0,0 +1,17 @@ +AC_DEFUN([ZFS_AC_KERNEL_SRC_MM_PAGE_SIZE], [ + ZFS_LINUX_TEST_SRC([page_size], [ + #include + ],[ + unsigned long s; + s = page_size(NULL); + ]) +]) +AC_DEFUN([ZFS_AC_KERNEL_MM_PAGE_SIZE], [ + AC_MSG_CHECKING([whether page_size() is available]) + ZFS_LINUX_TEST_RESULT([page_size], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_MM_PAGE_SIZE, 1, [page_size() is available]) + ],[ + AC_MSG_RESULT(no) + ]) +]) diff --git a/config/kernel.m4 b/config/kernel.m4 index 1d0c5a27fc7f..548905ccd04d 100644 --- a/config/kernel.m4 +++ b/config/kernel.m4 @@ -167,6 +167,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ ZFS_AC_KERNEL_SRC_REGISTER_SYSCTL_TABLE ZFS_AC_KERNEL_SRC_COPY_SPLICE_READ ZFS_AC_KERNEL_SRC_SYNC_BDEV + ZFS_AC_KERNEL_SRC_MM_PAGE_SIZE case "$host_cpu" in powerpc*) ZFS_AC_KERNEL_SRC_CPU_HAS_FEATURE @@ -316,6 +317,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ ZFS_AC_KERNEL_REGISTER_SYSCTL_TABLE ZFS_AC_KERNEL_COPY_SPLICE_READ ZFS_AC_KERNEL_SYNC_BDEV + ZFS_AC_KERNEL_MM_PAGE_SIZE case "$host_cpu" in powerpc*) ZFS_AC_KERNEL_CPU_HAS_FEATURE diff --git a/include/os/linux/Makefile.am b/include/os/linux/Makefile.am index 3830d198dfff..51c27132b4ef 100644 --- a/include/os/linux/Makefile.am +++ b/include/os/linux/Makefile.am @@ -5,6 +5,7 @@ kernel_linux_HEADERS = \ %D%/kernel/linux/compiler_compat.h \ %D%/kernel/linux/dcache_compat.h \ %D%/kernel/linux/kmap_compat.h \ + 
%D%/kernel/linux/mm_compat.h \ %D%/kernel/linux/mod_compat.h \ %D%/kernel/linux/page_compat.h \ %D%/kernel/linux/percpu_compat.h \ diff --git a/include/os/linux/kernel/linux/mm_compat.h b/include/os/linux/kernel/linux/mm_compat.h new file mode 100644 index 000000000000..40056c68d6dd --- /dev/null +++ b/include/os/linux/kernel/linux/mm_compat.h @@ -0,0 +1,36 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2023, 2024, Klara Inc. + */ + +#ifndef _ZFS_MM_COMPAT_H +#define _ZFS_MM_COMPAT_H + +#include + +/* 5.4 introduced page_size(). Older kernels can use a trivial macro instead */ +#ifndef HAVE_MM_PAGE_SIZE +#define page_size(p) ((unsigned long)(PAGE_SIZE << compound_order(p))) +#endif + +#endif /* _ZFS_MM_COMPAT_H */ -- cgit v1.2.3 From 52a2af6fd16470dd4ba8f7a1def459b43081ae5b Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Mon, 11 Dec 2023 16:05:54 +1100 Subject: abd: add page iterator The regular ABD iterators yield data buffers, so they have to map and unmap pages into kernel memory. If the caller only wants to count chunks, or can use page pointers directly, then the map/unmap is just unnecessary overhead. This adds adb_iterate_page_func, which yields unmapped struct page instead. Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. 
Closes #15533 Closes #15588 (cherry picked from commit 390b448726c580999dd337be7a40b0e95cf1d50b) --- include/sys/abd.h | 7 +++ include/sys/abd_impl.h | 26 +++++++++-- module/os/freebsd/zfs/abd_os.c | 4 +- module/os/linux/zfs/abd_os.c | 104 +++++++++++++++++++++++++++++++++++++---- module/zfs/abd.c | 42 +++++++++++++++++ 5 files changed, 169 insertions(+), 14 deletions(-) diff --git a/include/sys/abd.h b/include/sys/abd.h index 750f9986c1da..8a2df0bca946 100644 --- a/include/sys/abd.h +++ b/include/sys/abd.h @@ -79,6 +79,9 @@ typedef struct abd { typedef int abd_iter_func_t(void *buf, size_t len, void *priv); typedef int abd_iter_func2_t(void *bufa, void *bufb, size_t len, void *priv); +#if defined(__linux__) && defined(_KERNEL) +typedef int abd_iter_page_func_t(struct page *, size_t, size_t, void *); +#endif extern int zfs_abd_scatter_enabled; @@ -125,6 +128,10 @@ void abd_release_ownership_of_buf(abd_t *); int abd_iterate_func(abd_t *, size_t, size_t, abd_iter_func_t *, void *); int abd_iterate_func2(abd_t *, abd_t *, size_t, size_t, size_t, abd_iter_func2_t *, void *); +#if defined(__linux__) && defined(_KERNEL) +int abd_iterate_page_func(abd_t *, size_t, size_t, abd_iter_page_func_t *, + void *); +#endif void abd_copy_off(abd_t *, abd_t *, size_t, size_t, size_t); void abd_copy_from_buf_off(abd_t *, const void *, size_t, size_t); void abd_copy_to_buf_off(void *, abd_t *, size_t, size_t); diff --git a/include/sys/abd_impl.h b/include/sys/abd_impl.h index 40546d4af137..f88ea25e245d 100644 --- a/include/sys/abd_impl.h +++ b/include/sys/abd_impl.h @@ -21,6 +21,7 @@ /* * Copyright (c) 2014 by Chunwei Chen. All rights reserved. * Copyright (c) 2016, 2019 by Delphix. All rights reserved. + * Copyright (c) 2023, 2024, Klara Inc. */ #ifndef _ABD_IMPL_H @@ -38,12 +39,30 @@ typedef enum abd_stats_op { ABDSTAT_DECR /* Decrease abdstat values */ } abd_stats_op_t; -struct scatterlist; /* forward declaration */ +/* forward declarations */ +struct scatterlist; +struct page; struct abd_iter { /* public interface */ - void *iter_mapaddr; /* addr corresponding to iter_pos */ - size_t iter_mapsize; /* length of data valid at mapaddr */ + union { + /* for abd_iter_map()/abd_iter_unmap() */ + struct { + /* addr corresponding to iter_pos */ + void *iter_mapaddr; + /* length of data valid at mapaddr */ + size_t iter_mapsize; + }; + /* for abd_iter_page() */ + struct { + /* current page */ + struct page *iter_page; + /* offset of data in page */ + size_t iter_page_doff; + /* size of data in page */ + size_t iter_page_dsize; + }; + }; /* private */ abd_t *iter_abd; /* ABD being iterated through */ @@ -78,6 +97,7 @@ boolean_t abd_iter_at_end(struct abd_iter *); void abd_iter_advance(struct abd_iter *, size_t); void abd_iter_map(struct abd_iter *); void abd_iter_unmap(struct abd_iter *); +void abd_iter_page(struct abd_iter *); /* * Helper macros diff --git a/module/os/freebsd/zfs/abd_os.c b/module/os/freebsd/zfs/abd_os.c index 58a37df62b69..3b812271f98b 100644 --- a/module/os/freebsd/zfs/abd_os.c +++ b/module/os/freebsd/zfs/abd_os.c @@ -417,10 +417,8 @@ abd_iter_init(struct abd_iter *aiter, abd_t *abd) { ASSERT(!abd_is_gang(abd)); abd_verify(abd); + memset(aiter, 0, sizeof (struct abd_iter)); aiter->iter_abd = abd; - aiter->iter_pos = 0; - aiter->iter_mapaddr = NULL; - aiter->iter_mapsize = 0; } /* diff --git a/module/os/linux/zfs/abd_os.c b/module/os/linux/zfs/abd_os.c index 24390fbbf125..dae1280121da 100644 --- a/module/os/linux/zfs/abd_os.c +++ b/module/os/linux/zfs/abd_os.c @@ -21,6 +21,7 @@ /* * Copyright (c) 
2014 by Chunwei Chen. All rights reserved. * Copyright (c) 2019 by Delphix. All rights reserved. + * Copyright (c) 2023, 2024, Klara Inc. */ /* @@ -59,6 +60,7 @@ #include #ifdef _KERNEL #include +#include #include #endif @@ -895,14 +897,9 @@ abd_iter_init(struct abd_iter *aiter, abd_t *abd) { ASSERT(!abd_is_gang(abd)); abd_verify(abd); + memset(aiter, 0, sizeof (struct abd_iter)); aiter->iter_abd = abd; - aiter->iter_mapaddr = NULL; - aiter->iter_mapsize = 0; - aiter->iter_pos = 0; - if (abd_is_linear(abd)) { - aiter->iter_offset = 0; - aiter->iter_sg = NULL; - } else { + if (!abd_is_linear(abd)) { aiter->iter_offset = ABD_SCATTER(abd).abd_offset; aiter->iter_sg = ABD_SCATTER(abd).abd_sgl; } @@ -915,6 +912,7 @@ abd_iter_init(struct abd_iter *aiter, abd_t *abd) boolean_t abd_iter_at_end(struct abd_iter *aiter) { + ASSERT3U(aiter->iter_pos, <=, aiter->iter_abd->abd_size); return (aiter->iter_pos == aiter->iter_abd->abd_size); } @@ -926,8 +924,15 @@ abd_iter_at_end(struct abd_iter *aiter) void abd_iter_advance(struct abd_iter *aiter, size_t amount) { + /* + * Ensure that last chunk is not in use. abd_iterate_*() must clear + * this state (directly or abd_iter_unmap()) before advancing. + */ ASSERT3P(aiter->iter_mapaddr, ==, NULL); ASSERT0(aiter->iter_mapsize); + ASSERT3P(aiter->iter_page, ==, NULL); + ASSERT0(aiter->iter_page_doff); + ASSERT0(aiter->iter_page_dsize); /* There's nothing left to advance to, so do nothing */ if (abd_iter_at_end(aiter)) @@ -1009,6 +1014,88 @@ abd_cache_reap_now(void) } #if defined(_KERNEL) +/* + * Yield the next page struct and data offset and size within it, without + * mapping it into the address space. + */ +void +abd_iter_page(struct abd_iter *aiter) +{ + if (abd_iter_at_end(aiter)) { + aiter->iter_page = NULL; + aiter->iter_page_doff = 0; + aiter->iter_page_dsize = 0; + return; + } + + struct page *page; + size_t doff, dsize; + + if (abd_is_linear(aiter->iter_abd)) { + ASSERT3U(aiter->iter_pos, ==, aiter->iter_offset); + + /* memory address at iter_pos */ + void *paddr = ABD_LINEAR_BUF(aiter->iter_abd) + aiter->iter_pos; + + /* struct page for address */ + page = is_vmalloc_addr(paddr) ? + vmalloc_to_page(paddr) : virt_to_page(paddr); + + /* offset of address within the page */ + doff = offset_in_page(paddr); + + /* total data remaining in abd from this position */ + dsize = aiter->iter_abd->abd_size - aiter->iter_offset; + } else { + ASSERT(!abd_is_gang(aiter->iter_abd)); + + /* current scatter page */ + page = sg_page(aiter->iter_sg); + + /* position within page */ + doff = aiter->iter_offset; + + /* remaining data in scatterlist */ + dsize = MIN(aiter->iter_sg->length - aiter->iter_offset, + aiter->iter_abd->abd_size - aiter->iter_pos); + } + ASSERT(page); + + if (PageTail(page)) { + /* + * This page is part of a "compound page", which is a group of + * pages that can be referenced from a single struct page *. + * Its organised as a "head" page, followed by a series of + * "tail" pages. + * + * In OpenZFS, compound pages are allocated using the + * __GFP_COMP flag, which we get from scatter ABDs and SPL + * vmalloc slabs (ie >16K allocations). So a great many of the + * IO buffers we get are going to be of this type. + * + * The tail pages are just regular PAGE_SIZE pages, and can be + * safely used as-is. However, the head page has length + * covering itself and all the tail pages. If this ABD chunk + * spans multiple pages, then we can use the head page and a + * >PAGE_SIZE length, which is far more efficient. 
+ * + * To do this, we need to adjust the offset to be counted from + * the head page. struct page for compound pages are stored + * contiguously, so we can just adjust by a simple offset. + */ + struct page *head = compound_head(page); + doff += ((page - head) * PAGESIZE); + page = head; + } + + /* final page and position within it */ + aiter->iter_page = page; + aiter->iter_page_doff = doff; + + /* amount of data in the chunk, up to the end of the page */ + aiter->iter_page_dsize = MIN(dsize, page_size(page) - doff); +} + /* * bio_nr_pages for ABD. * @off is the offset in @abd @@ -1163,4 +1250,5 @@ MODULE_PARM_DESC(zfs_abd_scatter_min_size, module_param(zfs_abd_scatter_max_order, uint, 0644); MODULE_PARM_DESC(zfs_abd_scatter_max_order, "Maximum order allocation used for a scatter ABD."); -#endif + +#endif /* _KERNEL */ diff --git a/module/zfs/abd.c b/module/zfs/abd.c index d982f201c930..3388e2357305 100644 --- a/module/zfs/abd.c +++ b/module/zfs/abd.c @@ -826,6 +826,48 @@ abd_iterate_func(abd_t *abd, size_t off, size_t size, return (ret); } +#if defined(__linux__) && defined(_KERNEL) +int +abd_iterate_page_func(abd_t *abd, size_t off, size_t size, + abd_iter_page_func_t *func, void *private) +{ + struct abd_iter aiter; + int ret = 0; + + if (size == 0) + return (0); + + abd_verify(abd); + ASSERT3U(off + size, <=, abd->abd_size); + + abd_t *c_abd = abd_init_abd_iter(abd, &aiter, off); + + while (size > 0) { + IMPLY(abd_is_gang(abd), c_abd != NULL); + + abd_iter_page(&aiter); + + size_t len = MIN(aiter.iter_page_dsize, size); + ASSERT3U(len, >, 0); + + ret = func(aiter.iter_page, aiter.iter_page_doff, + len, private); + + aiter.iter_page = NULL; + aiter.iter_page_doff = 0; + aiter.iter_page_dsize = 0; + + if (ret != 0) + break; + + size -= len; + c_abd = abd_advance_abd_iter(abd, c_abd, &aiter, len); + } + + return (ret); +} +#endif + struct buf_arg { void *arg_buf; }; -- cgit v1.2.3 From 4820185031d674cba29e95792f4b0f2c4fa576ff Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Tue, 9 Jan 2024 12:12:56 +1100 Subject: vdev_disk: rename existing functions to vdev_classic_* This is just renaming the existing functions we're about to replace and grouping them together to make the next commits easier to follow. Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Closes #15533 Closes #15588 (cherry picked from commit f3b85d706bae82957d2e3e0ef1d53a1cfab60eb4) --- include/sys/abd.h | 2 + module/os/linux/zfs/abd_os.c | 5 + module/os/linux/zfs/vdev_disk.c | 215 +++++++++++++++++++++------------------- 3 files changed, 120 insertions(+), 102 deletions(-) diff --git a/include/sys/abd.h b/include/sys/abd.h index 8a2df0bca946..bee38b831bc0 100644 --- a/include/sys/abd.h +++ b/include/sys/abd.h @@ -220,6 +220,8 @@ void abd_fini(void); /* * Linux ABD bio functions + * Note: these are only needed to support vdev_classic. See comment in + * vdev_disk.c. */ #if defined(__linux__) && defined(_KERNEL) unsigned int abd_bio_map_off(struct bio *, abd_t *, unsigned int, size_t); diff --git a/module/os/linux/zfs/abd_os.c b/module/os/linux/zfs/abd_os.c index dae1280121da..3fe01c0b7d77 100644 --- a/module/os/linux/zfs/abd_os.c +++ b/module/os/linux/zfs/abd_os.c @@ -1096,6 +1096,11 @@ abd_iter_page(struct abd_iter *aiter) aiter->iter_page_dsize = MIN(dsize, page_size(page) - doff); } +/* + * Note: ABD BIO functions only needed to support vdev_classic. See comments in + * vdev_disk.c. + */ + /* * bio_nr_pages for ABD. 
* @off is the offset in @abd diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c index b0bda5fa2012..957619b87afd 100644 --- a/module/os/linux/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -83,17 +83,6 @@ static uint_t zfs_vdev_open_timeout_ms = 1000; */ #define EFI_MIN_RESV_SIZE (16 * 1024) -/* - * Virtual device vector for disks. - */ -typedef struct dio_request { - zio_t *dr_zio; /* Parent ZIO */ - atomic_t dr_ref; /* References */ - int dr_error; /* Bio error */ - int dr_bio_count; /* Count of bio's */ - struct bio *dr_bio[]; /* Attached bio's */ -} dio_request_t; - /* * BIO request failfast mask. */ @@ -467,85 +456,6 @@ vdev_disk_close(vdev_t *v) v->vdev_tsd = NULL; } -static dio_request_t * -vdev_disk_dio_alloc(int bio_count) -{ - dio_request_t *dr = kmem_zalloc(sizeof (dio_request_t) + - sizeof (struct bio *) * bio_count, KM_SLEEP); - atomic_set(&dr->dr_ref, 0); - dr->dr_bio_count = bio_count; - dr->dr_error = 0; - - for (int i = 0; i < dr->dr_bio_count; i++) - dr->dr_bio[i] = NULL; - - return (dr); -} - -static void -vdev_disk_dio_free(dio_request_t *dr) -{ - int i; - - for (i = 0; i < dr->dr_bio_count; i++) - if (dr->dr_bio[i]) - bio_put(dr->dr_bio[i]); - - kmem_free(dr, sizeof (dio_request_t) + - sizeof (struct bio *) * dr->dr_bio_count); -} - -static void -vdev_disk_dio_get(dio_request_t *dr) -{ - atomic_inc(&dr->dr_ref); -} - -static void -vdev_disk_dio_put(dio_request_t *dr) -{ - int rc = atomic_dec_return(&dr->dr_ref); - - /* - * Free the dio_request when the last reference is dropped and - * ensure zio_interpret is called only once with the correct zio - */ - if (rc == 0) { - zio_t *zio = dr->dr_zio; - int error = dr->dr_error; - - vdev_disk_dio_free(dr); - - if (zio) { - zio->io_error = error; - ASSERT3S(zio->io_error, >=, 0); - if (zio->io_error) - vdev_disk_error(zio); - - zio_delay_interrupt(zio); - } - } -} - -BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, error) -{ - dio_request_t *dr = bio->bi_private; - - if (dr->dr_error == 0) { -#ifdef HAVE_1ARG_BIO_END_IO_T - dr->dr_error = BIO_END_IO_ERROR(bio); -#else - if (error) - dr->dr_error = -(error); - else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) - dr->dr_error = EIO; -#endif - } - - /* Drop reference acquired by __vdev_disk_physio */ - vdev_disk_dio_put(dr); -} - static inline void vdev_submit_bio_impl(struct bio *bio) { @@ -697,8 +607,107 @@ vdev_bio_alloc(struct block_device *bdev, gfp_t gfp_mask, return (bio); } +/* ========== */ + +/* + * This is the classic, battle-tested BIO submission code. + * + * These functions have been renamed to vdev_classic_* to make it clear what + * they belong to, but their implementations are unchanged. + */ + +/* + * Virtual device vector for disks. 
+ */ +typedef struct dio_request { + zio_t *dr_zio; /* Parent ZIO */ + atomic_t dr_ref; /* References */ + int dr_error; /* Bio error */ + int dr_bio_count; /* Count of bio's */ + struct bio *dr_bio[]; /* Attached bio's */ +} dio_request_t; + +static dio_request_t * +vdev_classic_dio_alloc(int bio_count) +{ + dio_request_t *dr = kmem_zalloc(sizeof (dio_request_t) + + sizeof (struct bio *) * bio_count, KM_SLEEP); + atomic_set(&dr->dr_ref, 0); + dr->dr_bio_count = bio_count; + dr->dr_error = 0; + + for (int i = 0; i < dr->dr_bio_count; i++) + dr->dr_bio[i] = NULL; + + return (dr); +} + +static void +vdev_classic_dio_free(dio_request_t *dr) +{ + int i; + + for (i = 0; i < dr->dr_bio_count; i++) + if (dr->dr_bio[i]) + bio_put(dr->dr_bio[i]); + + kmem_free(dr, sizeof (dio_request_t) + + sizeof (struct bio *) * dr->dr_bio_count); +} + +static void +vdev_classic_dio_get(dio_request_t *dr) +{ + atomic_inc(&dr->dr_ref); +} + +static void +vdev_classic_dio_put(dio_request_t *dr) +{ + int rc = atomic_dec_return(&dr->dr_ref); + + /* + * Free the dio_request when the last reference is dropped and + * ensure zio_interpret is called only once with the correct zio + */ + if (rc == 0) { + zio_t *zio = dr->dr_zio; + int error = dr->dr_error; + + vdev_classic_dio_free(dr); + + if (zio) { + zio->io_error = error; + ASSERT3S(zio->io_error, >=, 0); + if (zio->io_error) + vdev_disk_error(zio); + + zio_delay_interrupt(zio); + } + } +} + +BIO_END_IO_PROTO(vdev_classic_physio_completion, bio, error) +{ + dio_request_t *dr = bio->bi_private; + + if (dr->dr_error == 0) { +#ifdef HAVE_1ARG_BIO_END_IO_T + dr->dr_error = BIO_END_IO_ERROR(bio); +#else + if (error) + dr->dr_error = -(error); + else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) + dr->dr_error = EIO; +#endif + } + + /* Drop reference acquired by vdev_classic_physio */ + vdev_classic_dio_put(dr); +} + static inline unsigned int -vdev_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset) +vdev_classic_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset) { unsigned long nr_segs = abd_nr_pages_off(zio->io_abd, bio_size, abd_offset); @@ -711,7 +720,7 @@ vdev_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset) } static int -__vdev_disk_physio(struct block_device *bdev, zio_t *zio, +vdev_classic_physio(struct block_device *bdev, zio_t *zio, size_t io_size, uint64_t io_offset, int rw, int flags) { dio_request_t *dr; @@ -736,7 +745,7 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio, } retry: - dr = vdev_disk_dio_alloc(bio_count); + dr = vdev_classic_dio_alloc(bio_count); if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)) && zio->io_vd->vdev_failfast == B_TRUE) { @@ -771,23 +780,23 @@ retry: * this should be rare - see the comment above. 
*/ if (dr->dr_bio_count == i) { - vdev_disk_dio_free(dr); + vdev_classic_dio_free(dr); bio_count *= 2; goto retry; } - nr_vecs = vdev_bio_max_segs(zio, bio_size, abd_offset); + nr_vecs = vdev_classic_bio_max_segs(zio, bio_size, abd_offset); dr->dr_bio[i] = vdev_bio_alloc(bdev, GFP_NOIO, nr_vecs); if (unlikely(dr->dr_bio[i] == NULL)) { - vdev_disk_dio_free(dr); + vdev_classic_dio_free(dr); return (SET_ERROR(ENOMEM)); } - /* Matching put called by vdev_disk_physio_completion */ - vdev_disk_dio_get(dr); + /* Matching put called by vdev_classic_physio_completion */ + vdev_classic_dio_get(dr); BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9; - dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion; + dr->dr_bio[i]->bi_end_io = vdev_classic_physio_completion; dr->dr_bio[i]->bi_private = dr; bio_set_op_attrs(dr->dr_bio[i], rw, flags); @@ -801,7 +810,7 @@ retry: } /* Extra reference to protect dio_request during vdev_submit_bio */ - vdev_disk_dio_get(dr); + vdev_classic_dio_get(dr); if (dr->dr_bio_count > 1) blk_start_plug(&plug); @@ -815,11 +824,13 @@ retry: if (dr->dr_bio_count > 1) blk_finish_plug(&plug); - vdev_disk_dio_put(dr); + vdev_classic_dio_put(dr); return (error); } +/* ========== */ + BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, error) { zio_t *zio = bio->bi_private; @@ -1023,7 +1034,7 @@ vdev_disk_io_start(zio_t *zio) } zio->io_target_timestamp = zio_handle_io_delay(zio); - error = __vdev_disk_physio(BDH_BDEV(vd->vd_bdh), zio, + error = vdev_classic_physio(BDH_BDEV(vd->vd_bdh), zio, zio->io_size, zio->io_offset, rw, 0); rw_exit(&vd->vd_lock); -- cgit v1.2.3 From 13b5348848b17aedc3c47017394ce3d7148be4b7 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Tue, 9 Jan 2024 12:23:30 +1100 Subject: vdev_disk: reorganise vdev_disk_io_start Light reshuffle to make it a bit more linear to read and get rid of a bunch of args that aren't needed in all cases. Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Closes #15533 Closes #15588 (cherry picked from commit 867178ae1db28e73051c8a7ce662f2f2f81cd8e6) --- module/os/linux/zfs/vdev_disk.c | 51 +++++++++++++++++++++++++---------------- 1 file changed, 31 insertions(+), 20 deletions(-) diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c index 957619b87afd..51e7cef2fc78 100644 --- a/module/os/linux/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -720,9 +720,16 @@ vdev_classic_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset) } static int -vdev_classic_physio(struct block_device *bdev, zio_t *zio, - size_t io_size, uint64_t io_offset, int rw, int flags) +vdev_classic_physio(zio_t *zio) { + vdev_t *v = zio->io_vd; + vdev_disk_t *vd = v->vdev_tsd; + struct block_device *bdev = BDH_BDEV(vd->vd_bdh); + size_t io_size = zio->io_size; + uint64_t io_offset = zio->io_offset; + int rw = zio->io_type == ZIO_TYPE_READ ? READ : WRITE; + int flags = 0; + dio_request_t *dr; uint64_t abd_offset; uint64_t bio_offset; @@ -944,7 +951,7 @@ vdev_disk_io_start(zio_t *zio) { vdev_t *v = zio->io_vd; vdev_disk_t *vd = v->vdev_tsd; - int rw, error; + int error; /* * If the vdev is closed, it's likely in the REMOVED or FAULTED state. 
@@ -1007,13 +1014,6 @@ vdev_disk_io_start(zio_t *zio) rw_exit(&vd->vd_lock); zio_execute(zio); return; - case ZIO_TYPE_WRITE: - rw = WRITE; - break; - - case ZIO_TYPE_READ: - rw = READ; - break; case ZIO_TYPE_TRIM: zio->io_error = vdev_disk_io_trim(zio); @@ -1026,23 +1026,34 @@ vdev_disk_io_start(zio_t *zio) #endif return; - default: + case ZIO_TYPE_READ: + case ZIO_TYPE_WRITE: + zio->io_target_timestamp = zio_handle_io_delay(zio); + error = vdev_classic_physio(zio); rw_exit(&vd->vd_lock); - zio->io_error = SET_ERROR(ENOTSUP); - zio_interrupt(zio); + if (error) { + zio->io_error = error; + zio_interrupt(zio); + } return; - } - zio->io_target_timestamp = zio_handle_io_delay(zio); - error = vdev_classic_physio(BDH_BDEV(vd->vd_bdh), zio, - zio->io_size, zio->io_offset, rw, 0); - rw_exit(&vd->vd_lock); + default: + /* + * Getting here means our parent vdev has made a very strange + * request of us, and shouldn't happen. Assert here to force a + * crash in dev builds, but in production return the IO + * unhandled. The pool will likely suspend anyway but that's + * nicer than crashing the kernel. + */ + ASSERT3S(zio->io_type, ==, -1); - if (error) { - zio->io_error = error; + rw_exit(&vd->vd_lock); + zio->io_error = SET_ERROR(ENOTSUP); zio_interrupt(zio); return; } + + __builtin_unreachable(); } static void -- cgit v1.2.3 From 03ff875e09c6578941607581e9f08658e191bbd9 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Tue, 9 Jan 2024 12:29:19 +1100 Subject: vdev_disk: make read/write IO function configurable This is just setting up for the next couple of commits, which will add a new IO function and a parameter to select it. Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Closes #15533 Closes #15588 (cherry picked from commit c4a13ba483f08a81aa47479d2f763a470d95b2b0) --- module/os/linux/zfs/vdev_disk.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c index 51e7cef2fc78..de4dba72fa3c 100644 --- a/module/os/linux/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -946,6 +946,8 @@ vdev_disk_io_trim(zio_t *zio) #endif } +int (*vdev_disk_io_rw_fn)(zio_t *zio) = NULL; + static void vdev_disk_io_start(zio_t *zio) { @@ -1029,7 +1031,7 @@ vdev_disk_io_start(zio_t *zio) case ZIO_TYPE_READ: case ZIO_TYPE_WRITE: zio->io_target_timestamp = zio_handle_io_delay(zio); - error = vdev_classic_physio(zio); + error = vdev_disk_io_rw_fn(zio); rw_exit(&vd->vd_lock); if (error) { zio->io_error = error; @@ -1102,8 +1104,25 @@ vdev_disk_rele(vdev_t *vd) /* XXX: Implement me as a vnode rele for the device */ } +/* + * At first use vdev use, set the submission function from the default value if + * it hasn't been set already. 
+ */ +static int +vdev_disk_init(spa_t *spa, nvlist_t *nv, void **tsd) +{ + (void) spa; + (void) nv; + (void) tsd; + + if (vdev_disk_io_rw_fn == NULL) + vdev_disk_io_rw_fn = vdev_classic_physio; + + return (0); +} + vdev_ops_t vdev_disk_ops = { - .vdev_op_init = NULL, + .vdev_op_init = vdev_disk_init, .vdev_op_fini = NULL, .vdev_op_open = vdev_disk_open, .vdev_op_close = vdev_disk_close, -- cgit v1.2.3 From 51c2bd0def6462b489f5d4e8ac1b9d6ab854f47b Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Tue, 18 Jul 2023 11:11:29 +1000 Subject: vdev_disk: rewrite BIO filling machinery to avoid split pages This commit tackles a number of issues in the way BIOs (`struct bio`) are constructed for submission to the Linux block layer. The kernel has a hard upper limit on the number of pages/segments that can be added to a BIO, as well as a separate limit for each device (related to its queue depth and other scheduling characteristics). ZFS counts the number of memory pages in the request ABD (`abd_nr_pages_off()`, and then uses that as the number of segments to put into the BIO, up to the hard upper limit. If it requires more than the limit, it will create multiple BIOs. Leaving aside the fact that page count method is wrong (see below), not limiting to the device segment max means that the device driver will need to split the BIO in half. This is alone is not necessarily a problem, but it interacts with another issue to cause a much larger problem. The kernel function to add a segment to a BIO (`bio_add_page()`) takes a `struct page` pointer, and offset+len within it. `struct page` can represent a run of contiguous memory pages (known as a "compound page"). In can be of arbitrary length. The ZFS functions that count ABD pages and load them into the BIO (`abd_nr_pages_off()`, `bio_map()` and `abd_bio_map_off()`) will never consider a page to be more than `PAGE_SIZE` (4K), even if the `struct page` is for multiple pages. In this case, it will load the same `struct page` into the BIO multiple times, with the offset adjusted each time. With a sufficiently large ABD, this can easily lead to the BIO being entirely filled much earlier than it could have been. This is also further contributes to the problem caused by the incorrect segment limit calculation, as its much easier to go past the device limit, and so require a split. Again, this is not a problem on its own. The logic for "never submit more than `PAGE_SIZE`" is actually a little more subtle. It will actually never submit a buffer that crosses a 4K page boundary. In practice, this is fine, as most ABDs are scattered, that is a list of complete 4K pages, and so are loaded in as such. Linear ABDs are typically allocated from slabs, and for small sizes they are frequently not aligned to page boundaries. For example, a 12K allocation can span four pages, eg: -- 4K -- -- 4K -- -- 4K -- -- 4K -- | | | | | :## ######## ######## ######: [1K, 4K, 4K, 3K] Such an allocation would be loaded into a BIO as you see: [1K, 4K, 4K, 3K] This tends not to be a problem in practice, because even if the BIO were filled and needed to be split, each half would still have either a start or end aligned to the logical block size of the device (assuming 4K at least). --- In ideal circumstances, these shortcomings don't cause any particular problems. Its when they start to interact with other ZFS features that things get interesting. Aggregation will create a "gang" ABD, which is simply a list of other ABDs. 
Iterating over a gang ABD is just iterating over each ABD within it in turn. Because the segments are simply loaded in order, we can end up with uneven segments either side of the "gap" between the two ABDs. For example, two 12K ABDs might be aggregated and then loaded as: [1K, 4K, 4K, 3K, 2K, 4K, 4K, 2K] Should a split occur, each individual BIO can end up either having an start or end offset that is not aligned to the logical block size, which some drivers (eg SCSI) will reject. However, this tends not to happen because the default aggregation limit usually keeps the BIO small enough to not require more than one split, and most pages are actually full 4K pages, so hitting an uneven gap is very rare anyway. If the pool is under particular memory pressure, then an IO can be broken down into a "gang block", a 512-byte block composed of a header and up to three block pointers. Each points to a fragment of the original write, or in turn, another gang block, breaking the original data up over and over until space can be found in the pool for each of them. Each gang header is a separate 512-byte memory allocation from a slab, that needs to be written down to disk. When the gang header is added to the BIO, its a single 512-byte segment. Pulling all this together, consider a large aggregated write of gang blocks. This results a BIO containing lots of 512-byte segments. Given our tendency to overfill the BIO, a split is likely, and most possible split points will yield a pair of BIOs that are misaligned. Drivers that care, like the SCSI driver, will reject them. --- This commit is a substantial refactor and rewrite of much of `vdev_disk` to sort all this out. `vdev_bio_max_segs()` now returns the ideal maximum size for the device, if available. There's also a tuneable `zfs_vdev_disk_max_segs` to override this, to assist with testing. We scan the ABD up front to count the number of pages within it, and to confirm that if we submitted all those pages to one or more BIOs, it could be split at any point with creating a misaligned BIO. If the pages in the BIO are not usable (as in any of the above situations), the ABD is linearised, and then checked again. This is the same technique used in `vdev_geom` on FreeBSD, adjusted for Linux's variable page size and allocator quirks. `vbio_t` is a cleanup and enhancement of the old `dio_request_t`. The idea is simply that it can hold all the state needed to create, submit and return multiple BIOs, including all the refcounts, the ABD copy if it was needed, and so on. Apart from what I hope is a clearer interface, the major difference is that because we know how many BIOs we'll need up front, we don't need the old overflow logic that would grow the BIO array, throw away all the old work and restart. We can get it right from the start. Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. 
Closes #15533 Closes #15588 (cherry picked from commit 06a196020e6f70d2fedbd4d0d05bbe0c1ac6e4d8) --- include/os/linux/kernel/linux/mod_compat.h | 1 + man/man4/zfs.4 | 10 +- module/os/linux/zfs/vdev_disk.c | 439 ++++++++++++++++++++++++++++- 3 files changed, 447 insertions(+), 3 deletions(-) diff --git a/include/os/linux/kernel/linux/mod_compat.h b/include/os/linux/kernel/linux/mod_compat.h index 8e20a9613539..039865b703ef 100644 --- a/include/os/linux/kernel/linux/mod_compat.h +++ b/include/os/linux/kernel/linux/mod_compat.h @@ -68,6 +68,7 @@ enum scope_prefix_types { zfs_trim, zfs_txg, zfs_vdev, + zfs_vdev_disk, zfs_vdev_file, zfs_vdev_mirror, zfs_vnops, diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 352990e02daf..b5679f2f0714 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -2,6 +2,7 @@ .\" Copyright (c) 2013 by Turbo Fredriksson . All rights reserved. .\" Copyright (c) 2019, 2021 by Delphix. All rights reserved. .\" Copyright (c) 2019 Datto Inc. +.\" Copyright (c) 2023, 2024 Klara, Inc. .\" The contents of this file are subject to the terms of the Common Development .\" and Distribution License (the "License"). You may not use this file except .\" in compliance with the License. You can obtain a copy of the license at @@ -15,7 +16,7 @@ .\" own identifying information: .\" Portions Copyright [yyyy] [name of copyright owner] .\" -.Dd July 21, 2023 +.Dd January 9, 2024 .Dt ZFS 4 .Os . @@ -1345,6 +1346,13 @@ _ 4 Driver No driver retries on driver errors. .TE . +.It Sy zfs_vdev_disk_max_segs Ns = Ns Sy 0 Pq uint +Maximum number of segments to add to a BIO (min 4). +If this is higher than the maximum allowed by the device queue or the kernel +itself, it will be clamped. +Setting it to zero will cause the kernel's ideal size to be used. +This parameter only applies on Linux. +. .It Sy zfs_expire_snapshot Ns = Ns Sy 300 Ns s Pq int Time before expiring .Pa .zfs/snapshot . diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c index de4dba72fa3c..0ccb9ad96fa5 100644 --- a/module/os/linux/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -24,6 +24,7 @@ * Rewritten for Linux by Brian Behlendorf . * LLNL-CODE-403049. * Copyright (c) 2012, 2019 by Delphix. All rights reserved. + * Copyright (c) 2023, 2024, Klara Inc. */ #include @@ -66,6 +67,13 @@ typedef struct vdev_disk { krwlock_t vd_lock; } vdev_disk_t; +/* + * Maximum number of segments to add to a bio (min 4). If this is higher than + * the maximum allowed by the device queue or the kernel itself, it will be + * clamped. Setting it to zero will cause the kernel's ideal size to be used. + */ +uint_t zfs_vdev_disk_max_segs = 0; + /* * Unique identifier for the exclusive vdev holder. */ @@ -607,10 +615,433 @@ vdev_bio_alloc(struct block_device *bdev, gfp_t gfp_mask, return (bio); } +static inline uint_t +vdev_bio_max_segs(struct block_device *bdev) +{ + /* + * Smallest of the device max segs and the tuneable max segs. Minimum + * 4, so there's room to finish split pages if they come up. + */ + const uint_t dev_max_segs = queue_max_segments(bdev_get_queue(bdev)); + const uint_t tune_max_segs = (zfs_vdev_disk_max_segs > 0) ? 
+ MAX(4, zfs_vdev_disk_max_segs) : dev_max_segs; + const uint_t max_segs = MIN(tune_max_segs, dev_max_segs); + +#ifdef HAVE_BIO_MAX_SEGS + return (bio_max_segs(max_segs)); +#else + return (MIN(max_segs, BIO_MAX_PAGES)); +#endif +} + +static inline uint_t +vdev_bio_max_bytes(struct block_device *bdev) +{ + return (queue_max_sectors(bdev_get_queue(bdev)) << 9); +} + + +/* + * Virtual block IO object (VBIO) + * + * Linux block IO (BIO) objects have a limit on how many data segments (pages) + * they can hold. Depending on how they're allocated and structured, a large + * ZIO can require more than one BIO to be submitted to the kernel, which then + * all have to complete before we can return the completed ZIO back to ZFS. + * + * A VBIO is a wrapper around multiple BIOs, carrying everything needed to + * translate a ZIO down into the kernel block layer and back again. + * + * Note that these are only used for data ZIOs (read/write). Meta-operations + * (flush/trim) don't need multiple BIOs and so can just make the call + * directly. + */ +typedef struct { + zio_t *vbio_zio; /* parent zio */ + + struct block_device *vbio_bdev; /* blockdev to submit bios to */ + + abd_t *vbio_abd; /* abd carrying borrowed linear buf */ + + atomic_t vbio_ref; /* bio refcount */ + int vbio_error; /* error from failed bio */ + + uint_t vbio_max_segs; /* max segs per bio */ + + uint_t vbio_max_bytes; /* max bytes per bio */ + uint_t vbio_lbs_mask; /* logical block size mask */ + + uint64_t vbio_offset; /* start offset of next bio */ + + struct bio *vbio_bio; /* pointer to the current bio */ + struct bio *vbio_bios; /* list of all bios */ +} vbio_t; + +static vbio_t * +vbio_alloc(zio_t *zio, struct block_device *bdev) +{ + vbio_t *vbio = kmem_zalloc(sizeof (vbio_t), KM_SLEEP); + + vbio->vbio_zio = zio; + vbio->vbio_bdev = bdev; + atomic_set(&vbio->vbio_ref, 0); + vbio->vbio_max_segs = vdev_bio_max_segs(bdev); + vbio->vbio_max_bytes = vdev_bio_max_bytes(bdev); + vbio->vbio_lbs_mask = ~(bdev_logical_block_size(bdev)-1); + vbio->vbio_offset = zio->io_offset; + + return (vbio); +} + +static int +vbio_add_page(vbio_t *vbio, struct page *page, uint_t size, uint_t offset) +{ + struct bio *bio; + uint_t ssize; + + while (size > 0) { + bio = vbio->vbio_bio; + if (bio == NULL) { + /* New BIO, allocate and set up */ + bio = vdev_bio_alloc(vbio->vbio_bdev, GFP_NOIO, + vbio->vbio_max_segs); + if (unlikely(bio == NULL)) + return (SET_ERROR(ENOMEM)); + BIO_BI_SECTOR(bio) = vbio->vbio_offset >> 9; + + bio->bi_next = vbio->vbio_bios; + vbio->vbio_bios = vbio->vbio_bio = bio; + } + + /* + * Only load as much of the current page data as will fit in + * the space left in the BIO, respecting lbs alignment. Older + * kernels will error if we try to overfill the BIO, while + * newer ones will accept it and split the BIO. This ensures + * everything works on older kernels, and avoids an additional + * overhead on the new. + */ + ssize = MIN(size, (vbio->vbio_max_bytes - BIO_BI_SIZE(bio)) & + vbio->vbio_lbs_mask); + if (ssize > 0 && + bio_add_page(bio, page, ssize, offset) == ssize) { + /* Accepted, adjust and load any remaining. 
*/ + size -= ssize; + offset += ssize; + continue; + } + + /* No room, set up for a new BIO and loop */ + vbio->vbio_offset += BIO_BI_SIZE(bio); + + /* Signal new BIO allocation wanted */ + vbio->vbio_bio = NULL; + } + + return (0); +} + +BIO_END_IO_PROTO(vdev_disk_io_rw_completion, bio, error); +static void vbio_put(vbio_t *vbio); + +static void +vbio_submit(vbio_t *vbio, int flags) +{ + ASSERT(vbio->vbio_bios); + struct bio *bio = vbio->vbio_bios; + vbio->vbio_bio = vbio->vbio_bios = NULL; + + /* + * We take a reference for each BIO as we submit it, plus one to + * protect us from BIOs completing before we're done submitting them + * all, causing vbio_put() to free vbio out from under us and/or the + * zio to be returned before all its IO has completed. + */ + atomic_set(&vbio->vbio_ref, 1); + + /* + * If we're submitting more than one BIO, inform the block layer so + * it can batch them if it wants. + */ + struct blk_plug plug; + boolean_t do_plug = (bio->bi_next != NULL); + if (do_plug) + blk_start_plug(&plug); + + /* Submit all the BIOs */ + while (bio != NULL) { + atomic_inc(&vbio->vbio_ref); + + struct bio *next = bio->bi_next; + bio->bi_next = NULL; + + bio->bi_end_io = vdev_disk_io_rw_completion; + bio->bi_private = vbio; + bio_set_op_attrs(bio, + vbio->vbio_zio->io_type == ZIO_TYPE_WRITE ? + WRITE : READ, flags); + + vdev_submit_bio(bio); + + bio = next; + } + + /* Finish the batch */ + if (do_plug) + blk_finish_plug(&plug); + + /* Release the extra reference */ + vbio_put(vbio); +} + +static void +vbio_return_abd(vbio_t *vbio) +{ + zio_t *zio = vbio->vbio_zio; + if (vbio->vbio_abd == NULL) + return; + + /* + * If we copied the ABD before issuing it, clean up and return the copy + * to the ADB, with changes if appropriate. + */ + void *buf = abd_to_buf(vbio->vbio_abd); + abd_free(vbio->vbio_abd); + vbio->vbio_abd = NULL; + + if (zio->io_type == ZIO_TYPE_READ) + abd_return_buf_copy(zio->io_abd, buf, zio->io_size); + else + abd_return_buf(zio->io_abd, buf, zio->io_size); +} + +static void +vbio_free(vbio_t *vbio) +{ + VERIFY0(atomic_read(&vbio->vbio_ref)); + + vbio_return_abd(vbio); + + kmem_free(vbio, sizeof (vbio_t)); +} + +static void +vbio_put(vbio_t *vbio) +{ + if (atomic_dec_return(&vbio->vbio_ref) > 0) + return; + + /* + * This was the last reference, so the entire IO is completed. Clean + * up and submit it for processing. + */ + + /* + * Get any data buf back to the original ABD, if necessary. We do this + * now so we can get the ZIO into the pipeline as quickly as possible, + * and then do the remaining cleanup after. + */ + vbio_return_abd(vbio); + + zio_t *zio = vbio->vbio_zio; + + /* + * Set the overall error. If multiple BIOs returned an error, only the + * first will be taken; the others are dropped (see + * vdev_disk_io_rw_completion()). Its pretty much impossible for + * multiple IOs to the same device to fail with different errors, so + * there's no real risk. + */ + zio->io_error = vbio->vbio_error; + if (zio->io_error) + vdev_disk_error(zio); + + /* All done, submit for processing */ + zio_delay_interrupt(zio); + + /* Finish cleanup */ + vbio_free(vbio); +} + +BIO_END_IO_PROTO(vdev_disk_io_rw_completion, bio, error) +{ + vbio_t *vbio = bio->bi_private; + + if (vbio->vbio_error == 0) { +#ifdef HAVE_1ARG_BIO_END_IO_T + vbio->vbio_error = BIO_END_IO_ERROR(bio); +#else + if (error) + vbio->vbio_error = -(error); + else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) + vbio->vbio_error = EIO; +#endif + } + + /* + * Destroy the BIO. 
This is safe to do; the vbio owns its data and the + * kernel won't touch it again after the completion function runs. + */ + bio_put(bio); + + /* Drop this BIOs reference acquired by vbio_submit() */ + vbio_put(vbio); +} + +/* + * Iterator callback to count ABD pages and check their size & alignment. + * + * On Linux, each BIO segment can take a page pointer, and an offset+length of + * the data within that page. A page can be arbitrarily large ("compound" + * pages) but we still have to ensure the data portion is correctly sized and + * aligned to the logical block size, to ensure that if the kernel wants to + * split the BIO, the two halves will still be properly aligned. + */ +typedef struct { + uint_t bmask; + uint_t npages; + uint_t end; +} vdev_disk_check_pages_t; + +static int +vdev_disk_check_pages_cb(struct page *page, size_t off, size_t len, void *priv) +{ + vdev_disk_check_pages_t *s = priv; + + /* + * If we didn't finish on a block size boundary last time, then there + * would be a gap if we tried to use this ABD as-is, so abort. + */ + if (s->end != 0) + return (1); + + /* + * Note if we're taking less than a full block, so we can check it + * above on the next call. + */ + s->end = len & s->bmask; + + /* All blocks after the first must start on a block size boundary. */ + if (s->npages != 0 && (off & s->bmask) != 0) + return (1); + + s->npages++; + return (0); +} + +/* + * Check if we can submit the pages in this ABD to the kernel as-is. Returns + * the number of pages, or 0 if it can't be submitted like this. + */ +static boolean_t +vdev_disk_check_pages(abd_t *abd, uint64_t size, struct block_device *bdev) +{ + vdev_disk_check_pages_t s = { + .bmask = bdev_logical_block_size(bdev)-1, + .npages = 0, + .end = 0, + }; + + if (abd_iterate_page_func(abd, 0, size, vdev_disk_check_pages_cb, &s)) + return (B_FALSE); + + return (B_TRUE); +} + +/* Iterator callback to submit ABD pages to the vbio. */ +static int +vdev_disk_fill_vbio_cb(struct page *page, size_t off, size_t len, void *priv) +{ + vbio_t *vbio = priv; + return (vbio_add_page(vbio, page, len, off)); +} + +static int +vdev_disk_io_rw(zio_t *zio) +{ + vdev_t *v = zio->io_vd; + vdev_disk_t *vd = v->vdev_tsd; + struct block_device *bdev = BDH_BDEV(vd->vd_bdh); + int flags = 0; + + /* + * Accessing outside the block device is never allowed. + */ + if (zio->io_offset + zio->io_size > bdev->bd_inode->i_size) { + vdev_dbgmsg(zio->io_vd, + "Illegal access %llu size %llu, device size %llu", + (u_longlong_t)zio->io_offset, + (u_longlong_t)zio->io_size, + (u_longlong_t)i_size_read(bdev->bd_inode)); + return (SET_ERROR(EIO)); + } + + if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)) && + v->vdev_failfast == B_TRUE) { + bio_set_flags_failfast(bdev, &flags, zfs_vdev_failfast_mask & 1, + zfs_vdev_failfast_mask & 2, zfs_vdev_failfast_mask & 4); + } + + /* + * Check alignment of the incoming ABD. If any part of it would require + * submitting a page that is not aligned to the logical block size, + * then we take a copy into a linear buffer and submit that instead. + * This should be impossible on a 512b LBS, and fairly rare on 4K, + * usually requiring abnormally-small data blocks (eg gang blocks) + * mixed into the same ABD as larger ones (eg aggregated). 
+ */ + abd_t *abd = zio->io_abd; + if (!vdev_disk_check_pages(abd, zio->io_size, bdev)) { + void *buf; + if (zio->io_type == ZIO_TYPE_READ) + buf = abd_borrow_buf(zio->io_abd, zio->io_size); + else + buf = abd_borrow_buf_copy(zio->io_abd, zio->io_size); + + /* + * Wrap the copy in an abd_t, so we can use the same iterators + * to count and fill the vbio later. + */ + abd = abd_get_from_buf(buf, zio->io_size); + + /* + * False here would mean the borrowed copy has an invalid + * alignment too, which would mean we've somehow been passed a + * linear ABD with an interior page that has a non-zero offset + * or a size not a multiple of PAGE_SIZE. This is not possible. + * It would mean either zio_buf_alloc() or its underlying + * allocators have done something extremely strange, or our + * math in vdev_disk_check_pages() is wrong. In either case, + * something in seriously wrong and its not safe to continue. + */ + VERIFY(vdev_disk_check_pages(abd, zio->io_size, bdev)); + } + + /* Allocate vbio, with a pointer to the borrowed ABD if necessary */ + int error = 0; + vbio_t *vbio = vbio_alloc(zio, bdev); + if (abd != zio->io_abd) + vbio->vbio_abd = abd; + + /* Fill it with pages */ + error = abd_iterate_page_func(abd, 0, zio->io_size, + vdev_disk_fill_vbio_cb, vbio); + if (error != 0) { + vbio_free(vbio); + return (error); + } + + vbio_submit(vbio, flags); + return (0); +} + /* ========== */ /* - * This is the classic, battle-tested BIO submission code. + * This is the classic, battle-tested BIO submission code. Until we're totally + * sure that the new code is safe and correct in all cases, this will remain + * available and can be enabled by setting zfs_vdev_disk_classic=1 at module + * load time. * * These functions have been renamed to vdev_classic_* to make it clear what * they belong to, but their implementations are unchanged. @@ -1116,7 +1547,8 @@ vdev_disk_init(spa_t *spa, nvlist_t *nv, void **tsd) (void) tsd; if (vdev_disk_io_rw_fn == NULL) - vdev_disk_io_rw_fn = vdev_classic_physio; + /* XXX make configurable */ + vdev_disk_io_rw_fn = 0 ? vdev_classic_physio : vdev_disk_io_rw; return (0); } @@ -1215,3 +1647,6 @@ ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, open_timeout_ms, UINT, ZMOD_RW, ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, failfast_mask, UINT, ZMOD_RW, "Defines failfast mask: 1 - device, 2 - transport, 4 - driver"); + +ZFS_MODULE_PARAM(zfs_vdev_disk, zfs_vdev_disk_, max_segs, UINT, ZMOD_RW, + "Maximum number of data segments to add to an IO request (min 4)"); -- cgit v1.2.3 From af3a5bb40d89d6339ead886fbef96b24171962bd Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Tue, 9 Jan 2024 13:28:57 +1100 Subject: vdev_disk: add module parameter to select BIO submission method This makes the submission method selectable at module load time via the `zfs_vdev_disk_classic` parameter, allowing this change to be backported to 2.2 safely, and disabled in favour of the "classic" submission method if new problems come up. Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. 
Closes #15533 Closes #15588 (cherry picked from commit df2169d141aadc0c2cc728c5c5261d6f5c2a27f7) --- man/man4/zfs.4 | 16 ++++++++++++++++ module/os/linux/zfs/vdev_disk.c | 31 +++++++++++++++++++++++++++++-- 2 files changed, 45 insertions(+), 2 deletions(-) diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index b5679f2f0714..6a628e7f3e52 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -1352,6 +1352,22 @@ If this is higher than the maximum allowed by the device queue or the kernel itself, it will be clamped. Setting it to zero will cause the kernel's ideal size to be used. This parameter only applies on Linux. +This parameter is ignored if +.Sy zfs_vdev_disk_classic Ns = Ns Sy 1 . +. +.It Sy zfs_vdev_disk_classic Ns = Ns Sy 0 Ns | Ns 1 Pq uint +If set to 1, OpenZFS will submit IO to Linux using the method it used in 2.2 +and earlier. +This "classic" method has known issues with highly fragmented IO requests and +is slower on many workloads, but it has been in use for many years and is known +to be very stable. +If you set this parameter, please also open a bug report why you did so, +including the workload involved and any error messages. +.Pp +This parameter and the classic submission method will be removed once we have +total confidence in the new method. +.Pp +This parameter only applies on Linux, and can only be set at module load time. . .It Sy zfs_expire_snapshot Ns = Ns Sy 300 Ns s Pq int Time before expiring diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c index 0ccb9ad96fa5..a9110623ace0 100644 --- a/module/os/linux/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -1535,6 +1535,29 @@ vdev_disk_rele(vdev_t *vd) /* XXX: Implement me as a vnode rele for the device */ } +/* + * BIO submission method. See comment above about vdev_classic. + * Set zfs_vdev_disk_classic=0 for new, =1 for classic + */ +static uint_t zfs_vdev_disk_classic = 0; /* default new */ + +/* Set submission function from module parameter */ +static int +vdev_disk_param_set_classic(const char *buf, zfs_kernel_param_t *kp) +{ + int err = param_set_uint(buf, kp); + if (err < 0) + return (SET_ERROR(err)); + + vdev_disk_io_rw_fn = + zfs_vdev_disk_classic ? vdev_classic_physio : vdev_disk_io_rw; + + printk(KERN_INFO "ZFS: forcing %s BIO submission\n", + zfs_vdev_disk_classic ? "classic" : "new"); + + return (0); +} + /* * At first use vdev use, set the submission function from the default value if * it hasn't been set already. @@ -1547,8 +1570,8 @@ vdev_disk_init(spa_t *spa, nvlist_t *nv, void **tsd) (void) tsd; if (vdev_disk_io_rw_fn == NULL) - /* XXX make configurable */ - vdev_disk_io_rw_fn = 0 ? vdev_classic_physio : vdev_disk_io_rw; + vdev_disk_io_rw_fn = zfs_vdev_disk_classic ? + vdev_classic_physio : vdev_disk_io_rw; return (0); } @@ -1650,3 +1673,7 @@ ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, failfast_mask, UINT, ZMOD_RW, ZFS_MODULE_PARAM(zfs_vdev_disk, zfs_vdev_disk_, max_segs, UINT, ZMOD_RW, "Maximum number of data segments to add to an IO request (min 4)"); + +ZFS_MODULE_PARAM_CALL(zfs_vdev_disk, zfs_vdev_disk_, classic, + vdev_disk_param_set_classic, param_get_uint, ZMOD_RD, + "Use classic BIO submission method"); -- cgit v1.2.3 From cb599d27edf8788c53507c0a6588438fa0f120b8 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Wed, 21 Feb 2024 11:07:21 +1100 Subject: vdev_disk: use bio_chain() to submit multiple BIOs Simplifies our code a lot, so we don't have to wait for each and reassemble them. 
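For readers unfamiliar with the kernel API this commit moves to, the sketch below shows the chaining pattern in isolation. It is illustrative only and not part of the patch: the helper name submit_chained_pair() is made up, and the real logic lives in vbio_add_page() and vbio_submit() in vdev_disk.c below. bio_chain(child, parent) makes the parent BIO's completion wait until the child has also completed, so only the last BIO in a chain needs a bi_end_io callback.

#include <linux/bio.h>

/*
 * Illustrative sketch (not part of this patch): submit two BIOs so a
 * single completion callback fires only after both have finished.
 */
static void
submit_chained_pair(struct bio *first, struct bio *last,
    bio_end_io_t *done, void *private)
{
	/*
	 * Chain 'first' to 'last': 'last' will not complete until 'first'
	 * has. A chained bio must not set its own bi_end_io/bi_private.
	 */
	bio_chain(first, last);
	submit_bio(first);

	/* Only the final BIO in the chain carries the completion. */
	last->bi_end_io = done;
	last->bi_private = private;
	submit_bio(last);
}

With this pattern there is no per-BIO refcount to maintain and no partial results to reassemble; the block layer accounts for the outstanding children and calls the single completion once the whole chain is done, which is what lets the commit below delete the vbio_ref machinery.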
Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Closes #15533 Closes #15588 (cherry picked from commit 72fd834c47558cb10d847948d1a4615e894c77c3) --- module/os/linux/zfs/vdev_disk.c | 231 ++++++++++++++-------------------------- 1 file changed, 80 insertions(+), 151 deletions(-) diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c index a9110623ace0..36468fc21132 100644 --- a/module/os/linux/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -454,10 +454,9 @@ vdev_disk_close(vdev_t *v) if (v->vdev_reopening || vd == NULL) return; - if (vd->vd_bdh != NULL) { + if (vd->vd_bdh != NULL) vdev_blkdev_put(vd->vd_bdh, spa_mode(v->vdev_spa), zfs_vdev_holder); - } rw_destroy(&vd->vd_lock); kmem_free(vd, sizeof (vdev_disk_t)); @@ -663,9 +662,6 @@ typedef struct { abd_t *vbio_abd; /* abd carrying borrowed linear buf */ - atomic_t vbio_ref; /* bio refcount */ - int vbio_error; /* error from failed bio */ - uint_t vbio_max_segs; /* max segs per bio */ uint_t vbio_max_bytes; /* max bytes per bio */ @@ -674,43 +670,52 @@ typedef struct { uint64_t vbio_offset; /* start offset of next bio */ struct bio *vbio_bio; /* pointer to the current bio */ - struct bio *vbio_bios; /* list of all bios */ + int vbio_flags; /* bio flags */ } vbio_t; static vbio_t * -vbio_alloc(zio_t *zio, struct block_device *bdev) +vbio_alloc(zio_t *zio, struct block_device *bdev, int flags) { vbio_t *vbio = kmem_zalloc(sizeof (vbio_t), KM_SLEEP); vbio->vbio_zio = zio; vbio->vbio_bdev = bdev; - atomic_set(&vbio->vbio_ref, 0); + vbio->vbio_abd = NULL; vbio->vbio_max_segs = vdev_bio_max_segs(bdev); vbio->vbio_max_bytes = vdev_bio_max_bytes(bdev); vbio->vbio_lbs_mask = ~(bdev_logical_block_size(bdev)-1); vbio->vbio_offset = zio->io_offset; + vbio->vbio_bio = NULL; + vbio->vbio_flags = flags; return (vbio); } +BIO_END_IO_PROTO(vbio_completion, bio, error); + static int vbio_add_page(vbio_t *vbio, struct page *page, uint_t size, uint_t offset) { - struct bio *bio; + struct bio *bio = vbio->vbio_bio; uint_t ssize; while (size > 0) { - bio = vbio->vbio_bio; if (bio == NULL) { /* New BIO, allocate and set up */ bio = vdev_bio_alloc(vbio->vbio_bdev, GFP_NOIO, vbio->vbio_max_segs); - if (unlikely(bio == NULL)) - return (SET_ERROR(ENOMEM)); + VERIFY(bio); + BIO_BI_SECTOR(bio) = vbio->vbio_offset >> 9; + bio_set_op_attrs(bio, + vbio->vbio_zio->io_type == ZIO_TYPE_WRITE ? + WRITE : READ, vbio->vbio_flags); - bio->bi_next = vbio->vbio_bios; - vbio->vbio_bios = vbio->vbio_bio = bio; + if (vbio->vbio_bio) { + bio_chain(vbio->vbio_bio, bio); + vdev_submit_bio(vbio->vbio_bio); + } + vbio->vbio_bio = bio; } /* @@ -735,157 +740,97 @@ vbio_add_page(vbio_t *vbio, struct page *page, uint_t size, uint_t offset) vbio->vbio_offset += BIO_BI_SIZE(bio); /* Signal new BIO allocation wanted */ - vbio->vbio_bio = NULL; + bio = NULL; } return (0); } -BIO_END_IO_PROTO(vdev_disk_io_rw_completion, bio, error); -static void vbio_put(vbio_t *vbio); +/* Iterator callback to submit ABD pages to the vbio. 
*/ +static int +vbio_fill_cb(struct page *page, size_t off, size_t len, void *priv) +{ + vbio_t *vbio = priv; + return (vbio_add_page(vbio, page, len, off)); +} +/* Create some BIOs, fill them with data and submit them */ static void -vbio_submit(vbio_t *vbio, int flags) +vbio_submit(vbio_t *vbio, abd_t *abd, uint64_t size) { - ASSERT(vbio->vbio_bios); - struct bio *bio = vbio->vbio_bios; - vbio->vbio_bio = vbio->vbio_bios = NULL; - - /* - * We take a reference for each BIO as we submit it, plus one to - * protect us from BIOs completing before we're done submitting them - * all, causing vbio_put() to free vbio out from under us and/or the - * zio to be returned before all its IO has completed. - */ - atomic_set(&vbio->vbio_ref, 1); + ASSERT(vbio->vbio_bdev); /* - * If we're submitting more than one BIO, inform the block layer so - * it can batch them if it wants. + * We plug so we can submit the BIOs as we go and only unplug them when + * they are fully created and submitted. This is important; if we don't + * plug, then the kernel may start executing earlier BIOs while we're + * still creating and executing later ones, and if the device goes + * away while that's happening, older kernels can get confused and + * trample memory. */ struct blk_plug plug; - boolean_t do_plug = (bio->bi_next != NULL); - if (do_plug) - blk_start_plug(&plug); + blk_start_plug(&plug); - /* Submit all the BIOs */ - while (bio != NULL) { - atomic_inc(&vbio->vbio_ref); + (void) abd_iterate_page_func(abd, 0, size, vbio_fill_cb, vbio); + ASSERT(vbio->vbio_bio); - struct bio *next = bio->bi_next; - bio->bi_next = NULL; + vbio->vbio_bio->bi_end_io = vbio_completion; + vbio->vbio_bio->bi_private = vbio; - bio->bi_end_io = vdev_disk_io_rw_completion; - bio->bi_private = vbio; - bio_set_op_attrs(bio, - vbio->vbio_zio->io_type == ZIO_TYPE_WRITE ? - WRITE : READ, flags); + vdev_submit_bio(vbio->vbio_bio); - vdev_submit_bio(bio); - - bio = next; - } - - /* Finish the batch */ - if (do_plug) - blk_finish_plug(&plug); + blk_finish_plug(&plug); - /* Release the extra reference */ - vbio_put(vbio); + vbio->vbio_bio = NULL; + vbio->vbio_bdev = NULL; } -static void -vbio_return_abd(vbio_t *vbio) +/* IO completion callback */ +BIO_END_IO_PROTO(vbio_completion, bio, error) { + vbio_t *vbio = bio->bi_private; zio_t *zio = vbio->vbio_zio; - if (vbio->vbio_abd == NULL) - return; - - /* - * If we copied the ABD before issuing it, clean up and return the copy - * to the ADB, with changes if appropriate. - */ - void *buf = abd_to_buf(vbio->vbio_abd); - abd_free(vbio->vbio_abd); - vbio->vbio_abd = NULL; - - if (zio->io_type == ZIO_TYPE_READ) - abd_return_buf_copy(zio->io_abd, buf, zio->io_size); - else - abd_return_buf(zio->io_abd, buf, zio->io_size); -} -static void -vbio_free(vbio_t *vbio) -{ - VERIFY0(atomic_read(&vbio->vbio_ref)); - - vbio_return_abd(vbio); + ASSERT(zio); - kmem_free(vbio, sizeof (vbio_t)); -} + /* Capture and log any errors */ +#ifdef HAVE_1ARG_BIO_END_IO_T + zio->io_error = BIO_END_IO_ERROR(bio); +#else + zio->io_error = 0; + if (error) + zio->io_error = -(error); + else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) + zio->io_error = EIO; +#endif + ASSERT3U(zio->io_error, >=, 0); -static void -vbio_put(vbio_t *vbio) -{ - if (atomic_dec_return(&vbio->vbio_ref) > 0) - return; + if (zio->io_error) + vdev_disk_error(zio); - /* - * This was the last reference, so the entire IO is completed. Clean - * up and submit it for processing. 
- */ + /* Return the BIO to the kernel */ + bio_put(bio); /* - * Get any data buf back to the original ABD, if necessary. We do this - * now so we can get the ZIO into the pipeline as quickly as possible, - * and then do the remaining cleanup after. + * If we copied the ABD before issuing it, clean up and return the copy + * to the ADB, with changes if appropriate. */ - vbio_return_abd(vbio); + if (vbio->vbio_abd != NULL) { + void *buf = abd_to_buf(vbio->vbio_abd); + abd_free(vbio->vbio_abd); + vbio->vbio_abd = NULL; - zio_t *zio = vbio->vbio_zio; + if (zio->io_type == ZIO_TYPE_READ) + abd_return_buf_copy(zio->io_abd, buf, zio->io_size); + else + abd_return_buf(zio->io_abd, buf, zio->io_size); + } - /* - * Set the overall error. If multiple BIOs returned an error, only the - * first will be taken; the others are dropped (see - * vdev_disk_io_rw_completion()). Its pretty much impossible for - * multiple IOs to the same device to fail with different errors, so - * there's no real risk. - */ - zio->io_error = vbio->vbio_error; - if (zio->io_error) - vdev_disk_error(zio); + /* Final cleanup */ + kmem_free(vbio, sizeof (vbio_t)); /* All done, submit for processing */ zio_delay_interrupt(zio); - - /* Finish cleanup */ - vbio_free(vbio); -} - -BIO_END_IO_PROTO(vdev_disk_io_rw_completion, bio, error) -{ - vbio_t *vbio = bio->bi_private; - - if (vbio->vbio_error == 0) { -#ifdef HAVE_1ARG_BIO_END_IO_T - vbio->vbio_error = BIO_END_IO_ERROR(bio); -#else - if (error) - vbio->vbio_error = -(error); - else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) - vbio->vbio_error = EIO; -#endif - } - - /* - * Destroy the BIO. This is safe to do; the vbio owns its data and the - * kernel won't touch it again after the completion function runs. - */ - bio_put(bio); - - /* Drop this BIOs reference acquired by vbio_submit() */ - vbio_put(vbio); } /* @@ -948,14 +893,6 @@ vdev_disk_check_pages(abd_t *abd, uint64_t size, struct block_device *bdev) return (B_TRUE); } -/* Iterator callback to submit ABD pages to the vbio. */ -static int -vdev_disk_fill_vbio_cb(struct page *page, size_t off, size_t len, void *priv) -{ - vbio_t *vbio = priv; - return (vbio_add_page(vbio, page, len, off)); -} - static int vdev_disk_io_rw(zio_t *zio) { @@ -1018,20 +955,12 @@ vdev_disk_io_rw(zio_t *zio) } /* Allocate vbio, with a pointer to the borrowed ABD if necessary */ - int error = 0; - vbio_t *vbio = vbio_alloc(zio, bdev); + vbio_t *vbio = vbio_alloc(zio, bdev, flags); if (abd != zio->io_abd) vbio->vbio_abd = abd; - /* Fill it with pages */ - error = abd_iterate_page_func(abd, 0, zio->io_size, - vdev_disk_fill_vbio_cb, vbio); - if (error != 0) { - vbio_free(vbio); - return (error); - } - - vbio_submit(vbio, flags); + /* Fill it with data pages and submit it to the kernel */ + vbio_submit(vbio, abd, zio->io_size); return (0); } -- cgit v1.2.3 From d0b3be763f63bcc4ca05542eb9c2c635d93af03c Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Thu, 14 Mar 2024 10:57:30 +1100 Subject: abd_iter_page: don't use compound heads on Linux <4.5 Before 4.5 (specifically, torvalds/linux@ddc58f2), head and tail pages in a compound page were refcounted separately. This means that using the head page without taking a reference to it could see it cleaned up later before we're finished with it. Specifically, bio_add_page() would take a reference, and drop its reference after the bio completion callback returns. 
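As a preview of where this lands, the guard added below in abd_iter_page() is simply a kernel-version check around the head-page switch. A condensed, illustrative sketch follows; LINUX_VERSION_CODE, KERNEL_VERSION(), PageTail(), compound_head() and PAGE_SIZE are stock kernel interfaces, while the wrapper function itself is invented for the sketch.

    #include <linux/version.h>
    #include <linux/mm.h>

    /* Map an arbitrary page/offset to the page actually handed to the BIO. */
    static struct page *
    pick_bio_page(struct page *page, size_t *doff)
    {
    #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 5, 0)
            if (PageTail(page)) {
                    /*
                     * 4.5+: head and tail pages share a refcount, so switching
                     * to the compound head (with an adjusted offset) is safe
                     * and lets the whole compound page go out as one segment.
                     */
                    struct page *head = compound_head(page);
                    *doff += (page - head) * PAGE_SIZE;
                    page = head;
            }
    #endif
            /* Pre-4.5: keep the tail page as-is; safe, slightly less efficient. */
            return (page);
    }

The failure mode that makes this necessary on older kernels is as follows.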
If the zio is executed immediately from the completion callback, this is usually ok, as any data is referenced through the tail page referenced by the ABD, and so becomes "live" that way. If there's a delay in zio execution (high load, error injection), then the head page can be freed, along with any dirty flags or other indicators that the underlying memory is used. Later, when the zio completes and that memory is accessed, its either unmapped and an unhandled fault takes down the entire system, or it is mapped and we end up messing around in someone else's memory. Both of these are very bad. The solution on these older kernels is to take a reference to the head page when we use it, and release it when we're done. There's not really a sensible way under our current structure to do this; the "best" would be to keep a list of head page references in the ABD, and release them when the ABD is freed. Since this additional overhead is totally unnecessary on 4.5+, where head and tail pages share refcounts, I've opted to simply not use the compound head in ABD page iteration there. This is theoretically less efficient (though cleaning up head page references would add overhead), but its safe, and we still get the other benefits of not mapping pages before adding them to a bio and not mis-splitting pages. There doesn't appear to be an obvious symbol name or config option we can match on to discover this behaviour in configure (and the mm/page APIs have changed a lot since then anyway), so I've gone with a simple version check. Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Closes #15533 Closes #15588 (cherry picked from commit c6be6ce1755a3d9a3cbe70256cd8958ef83d8542) --- module/os/linux/zfs/abd_os.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/module/os/linux/zfs/abd_os.c b/module/os/linux/zfs/abd_os.c index 3fe01c0b7d77..d3255dcbc0f7 100644 --- a/module/os/linux/zfs/abd_os.c +++ b/module/os/linux/zfs/abd_os.c @@ -62,6 +62,7 @@ #include #include #include +#include #endif #ifdef _KERNEL @@ -1061,6 +1062,7 @@ abd_iter_page(struct abd_iter *aiter) } ASSERT(page); +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 5, 0) if (PageTail(page)) { /* * This page is part of a "compound page", which is a group of @@ -1082,11 +1084,23 @@ abd_iter_page(struct abd_iter *aiter) * To do this, we need to adjust the offset to be counted from * the head page. struct page for compound pages are stored * contiguously, so we can just adjust by a simple offset. + * + * Before kernel 4.5, compound page heads were refcounted + * separately, such that moving back to the head page would + * require us to take a reference to it and releasing it once + * we're completely finished with it. In practice, that means + * when our caller is done with the ABD, which we have no + * insight into from here. Rather than contort this API to + * track head page references on such ancient kernels, we just + * compile this block out and use the tail pages directly. This + * is slightly less efficient, but makes everything far + * simpler. 
*/ struct page *head = compound_head(page); doff += ((page - head) * PAGESIZE); page = head; } +#endif /* final page and position within it */ aiter->iter_page = page; -- cgit v1.2.3 From eebf00bee91de2c7b7d03a9c1e5e2f3e5fd66c9e Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Wed, 27 Mar 2024 13:11:12 +1100 Subject: vdev_disk: default to classic submission for 2.2.x We don't want to change to brand-new code in the middle of a stable series, but we want it available to test for people running into page splitting issues. This commits make zfs_vdev_disk_classic=1 the default, and updates the documentation to better explain what's going on. Signed-off-by: Rob Norris Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. --- man/man4/zfs.4 | 31 ++++++++++++++++++++++--------- module/os/linux/zfs/vdev_disk.c | 8 +++++--- 2 files changed, 27 insertions(+), 12 deletions(-) diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 6a628e7f3e52..a98ec519aaa5 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -1355,17 +1355,30 @@ This parameter only applies on Linux. This parameter is ignored if .Sy zfs_vdev_disk_classic Ns = Ns Sy 1 . . -.It Sy zfs_vdev_disk_classic Ns = Ns Sy 0 Ns | Ns 1 Pq uint -If set to 1, OpenZFS will submit IO to Linux using the method it used in 2.2 -and earlier. -This "classic" method has known issues with highly fragmented IO requests and -is slower on many workloads, but it has been in use for many years and is known -to be very stable. -If you set this parameter, please also open a bug report why you did so, +.It Sy zfs_vdev_disk_classic Ns = Ns 0 Ns | Ns Sy 1 Pq uint +Controls the method used to submit IO to the Linux block layer +(default +.Sy 1 "classic" Ns +) +.Pp +If set to 1, the "classic" method is used. +This is the method that has been in use since the earliest versions of +ZFS-on-Linux. +It has known issues with highly fragmented IO requests and is less efficient on +many workloads, but it well known and well understood. +.Pp +If set to 0, the "new" method is used. +This method is available since 2.2.4 and should resolve all known issues and be +far more efficient, but has not had as much testing. +In the 2.2.x series, this parameter defaults to 1, to use the "classic" method. +.Pp +It is not recommended that you change it except on advice from the OpenZFS +developers. +If you do change it, please also open a bug report describing why you did so, including the workload involved and any error messages. .Pp -This parameter and the classic submission method will be removed once we have -total confidence in the new method. +This parameter and the "classic" submission method will be removed in a future +release of OpenZFS once we have total confidence in the new method. .Pp This parameter only applies on Linux, and can only be set at module load time. . diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c index 36468fc21132..e1c19a085b0e 100644 --- a/module/os/linux/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -969,8 +969,10 @@ vdev_disk_io_rw(zio_t *zio) /* * This is the classic, battle-tested BIO submission code. Until we're totally * sure that the new code is safe and correct in all cases, this will remain - * available and can be enabled by setting zfs_vdev_disk_classic=1 at module - * load time. + * available. + * + * It is enabled by setting zfs_vdev_disk_classic=1 at module load time. It is + * enabled (=1) by default since 2.2.4, and disabled by default (=0) on master. 
* * These functions have been renamed to vdev_classic_* to make it clear what * they belong to, but their implementations are unchanged. @@ -1468,7 +1470,7 @@ vdev_disk_rele(vdev_t *vd) * BIO submission method. See comment above about vdev_classic. * Set zfs_vdev_disk_classic=0 for new, =1 for classic */ -static uint_t zfs_vdev_disk_classic = 0; /* default new */ +static uint_t zfs_vdev_disk_classic = 1; /* default classic */ /* Set submission function from module parameter */ static int -- cgit v1.2.3 From deb7a84231aff8d772bb4ce9fa486d1886f1a2b6 Mon Sep 17 00:00:00 2001 From: Robert Evans Date: Mon, 25 Mar 2024 17:56:49 -0400 Subject: Fix corruption caused by mmap flushing problems 1) Make mmap flushes synchronous. Linux may skip flushing dirty pages already in writeback unless data-integrity sync is requested. 2) Change zfs_putpage to use TXG_WAIT. Otherwise dirty pages may be skipped due to DMU pushing back on TX assign. 3) Add missing mmap flush when doing block cloning. 4) While here, pass errors from putpage to writepage/writepages. This change fixes corruption edge cases, but unfortunately adds synchronous ZIL flushes for dirty mmap pages to llseek and bclone operations. It may be possible to avoid these sync writes later but would need more tricky refactoring of the writeback code. Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Robert Evans Closes #15933 Closes #16019 --- module/os/linux/zfs/zfs_vnops_os.c | 5 +---- module/os/linux/zfs/zpl_file.c | 8 ++++---- module/zfs/zfs_vnops.c | 6 +++++- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c index c06a75662bf7..7c473bc7e965 100644 --- a/module/os/linux/zfs/zfs_vnops_os.c +++ b/module/os/linux/zfs/zfs_vnops_os.c @@ -3792,11 +3792,8 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc, dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); - err = dmu_tx_assign(tx, TXG_NOWAIT); + err = dmu_tx_assign(tx, TXG_WAIT); if (err != 0) { - if (err == ERESTART) - dmu_tx_wait(tx); - dmu_tx_abort(tx); #ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO filemap_dirty_folio(page_mapping(pp), page_folio(pp)); diff --git a/module/os/linux/zfs/zpl_file.c b/module/os/linux/zfs/zpl_file.c index 3caa0fc6c214..9dec52215c7c 100644 --- a/module/os/linux/zfs/zpl_file.c +++ b/module/os/linux/zfs/zpl_file.c @@ -720,23 +720,23 @@ zpl_putpage(struct page *pp, struct writeback_control *wbc, void *data) { boolean_t *for_sync = data; fstrans_cookie_t cookie; + int ret; ASSERT(PageLocked(pp)); ASSERT(!PageWriteback(pp)); cookie = spl_fstrans_mark(); - (void) zfs_putpage(pp->mapping->host, pp, wbc, *for_sync); + ret = zfs_putpage(pp->mapping->host, pp, wbc, *for_sync); spl_fstrans_unmark(cookie); - return (0); + return (ret); } #ifdef HAVE_WRITEPAGE_T_FOLIO static int zpl_putfolio(struct folio *pp, struct writeback_control *wbc, void *data) { - (void) zpl_putpage(&pp->page, wbc, data); - return (0); + return (zpl_putpage(&pp->page, wbc, data)); } #endif diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index 2b37834d5c56..7020f88ecf93 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -130,7 +130,7 @@ zfs_holey_common(znode_t *zp, ulong_t cmd, loff_t *off) /* Flush any mmap()'d data to disk */ if (zn_has_cached_data(zp, 0, file_sz - 1)) - zn_flush_cached_data(zp, B_FALSE); + zn_flush_cached_data(zp, B_TRUE); lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_READER); error = 
dmu_offset_next(ZTOZSB(zp)->z_os, zp->z_id, hole, &noff); @@ -1193,6 +1193,10 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, } } + /* Flush any mmap()'d data to disk */ + if (zn_has_cached_data(inzp, inoff, inoff + len - 1)) + zn_flush_cached_data(inzp, B_TRUE); + /* * Maintain predictable lock order. */ -- cgit v1.2.3 From 28520cad2500b60ce8653e431990e33f77ff08f7 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Thu, 4 Apr 2024 09:17:07 +1100 Subject: vdev_disk: don't touch vbio after its handed off to the kernel After IO is unplugged, it may complete immediately and vbio_completion be called on interrupt context. That may interrupt or deschedule our task. If its the last bio, the vbio will be freed. Then, we get rescheduled, and try to write to freed memory through vbio->. This patch just removes the the cleanup, and the corresponding assert. These were leftovers from a previous iteration of vbio_submit() and were always "belt and suspenders" ops anyway, never strictly required. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc Reported-by: Rich Ercolani Signed-off-by: Rob Norris (cherry picked from commit 917ff75e9510d19968ef3cc5c80b1cd0ef48f84d) --- module/os/linux/zfs/vdev_disk.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c index e1c19a085b0e..62c7aa14fd1f 100644 --- a/module/os/linux/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -758,8 +758,6 @@ vbio_fill_cb(struct page *page, size_t off, size_t len, void *priv) static void vbio_submit(vbio_t *vbio, abd_t *abd, uint64_t size) { - ASSERT(vbio->vbio_bdev); - /* * We plug so we can submit the BIOs as we go and only unplug them when * they are fully created and submitted. This is important; if we don't @@ -777,12 +775,15 @@ vbio_submit(vbio_t *vbio, abd_t *abd, uint64_t size) vbio->vbio_bio->bi_end_io = vbio_completion; vbio->vbio_bio->bi_private = vbio; + /* + * Once submitted, vbio_bio now owns vbio (through bi_private) and we + * can't touch it again. The bio may complete and vbio_completion() be + * called and free the vbio before this task is run again, so we must + * consider it invalid from this point. + */ vdev_submit_bio(vbio->vbio_bio); blk_finish_plug(&plug); - - vbio->vbio_bio = NULL; - vbio->vbio_bdev = NULL; } /* IO completion callback */ -- cgit v1.2.3 From d0d9dccc61d2ea9776497d2372a1d43cf8fe3d24 Mon Sep 17 00:00:00 2001 From: Rob N Date: Tue, 9 Apr 2024 04:50:24 +1000 Subject: vdev_disk: ensure trim errors are returned immediately After 08fd5ccc3, the discard issuing code was organised such that if requesting an async discard or secure erase failed before the IO was issued (that is, calling __blkdev_issue_discard() returned an error), the failed zio would never be executed, resulting in txg_sync hanging forever waiting for IO to finish. This commit fixes that by immediately executing a failed zio on error. To handle the successful synchronous op case, we fake an async op by, when not using an asynchronous submission method, queuing the successful result zio as part of the discard handler. Since it was hard to understand the differences between discard and secure erase, and sync and async, across different kernel versions, I've commented and reorganised the code a bit to try and make everything more contained and linear. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. 
Signed-off-by: Rob Norris (cherry picked from commit ba9f587a77e6893390c752491dfacb6ee5d52023) --- config/kernel-blkdev.m4 | 118 ++++++++++++++++++++++++++---------- module/os/linux/zfs/vdev_disk.c | 131 ++++++++++++++++++++++++++-------------- 2 files changed, 171 insertions(+), 78 deletions(-) diff --git a/config/kernel-blkdev.m4 b/config/kernel-blkdev.m4 index c5a353ca9203..dae7bef9ce0d 100644 --- a/config/kernel-blkdev.m4 +++ b/config/kernel-blkdev.m4 @@ -523,12 +523,29 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_BDEVNAME], [ ]) dnl # -dnl # 5.19 API: blkdev_issue_secure_erase() -dnl # 4.7 API: __blkdev_issue_discard(..., BLKDEV_DISCARD_SECURE) -dnl # 3.10 API: blkdev_issue_discard(..., BLKDEV_DISCARD_SECURE) -dnl # -AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_ISSUE_SECURE_ERASE], [ - ZFS_LINUX_TEST_SRC([blkdev_issue_secure_erase], [ +dnl # TRIM support: discard and secure erase. We make use of asynchronous +dnl # functions when available. +dnl # +dnl # 3.10: +dnl # sync discard: blkdev_issue_discard(..., 0) +dnl # sync erase: blkdev_issue_discard(..., BLKDEV_DISCARD_SECURE) +dnl # async discard: [not available] +dnl # async erase: [not available] +dnl # +dnl # 4.7: +dnl # sync discard: blkdev_issue_discard(..., 0) +dnl # sync erase: blkdev_issue_discard(..., BLKDEV_DISCARD_SECURE) +dnl # async discard: __blkdev_issue_discard(..., 0) +dnl # async erase: __blkdev_issue_discard(..., BLKDEV_DISCARD_SECURE) +dnl # +dnl # 5.19: +dnl # sync discard: blkdev_issue_discard(...) +dnl # sync erase: blkdev_issue_secure_erase(...) +dnl # async discard: __blkdev_issue_discard(...) +dnl # async erase: [not available] +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_ISSUE_DISCARD], [ + ZFS_LINUX_TEST_SRC([blkdev_issue_discard_noflags], [ #include ],[ struct block_device *bdev = NULL; @@ -536,10 +553,33 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_ISSUE_SECURE_ERASE], [ sector_t nr_sects = 0; int error __attribute__ ((unused)); - error = blkdev_issue_secure_erase(bdev, + error = blkdev_issue_discard(bdev, sector, nr_sects, GFP_KERNEL); ]) + ZFS_LINUX_TEST_SRC([blkdev_issue_discard_flags], [ + #include + ],[ + struct block_device *bdev = NULL; + sector_t sector = 0; + sector_t nr_sects = 0; + unsigned long flags = 0; + int error __attribute__ ((unused)); + + error = blkdev_issue_discard(bdev, + sector, nr_sects, GFP_KERNEL, flags); + ]) + ZFS_LINUX_TEST_SRC([blkdev_issue_discard_async_noflags], [ + #include + ],[ + struct block_device *bdev = NULL; + sector_t sector = 0; + sector_t nr_sects = 0; + struct bio *biop = NULL; + int error __attribute__ ((unused)); + error = __blkdev_issue_discard(bdev, + sector, nr_sects, GFP_KERNEL, &biop); + ]) ZFS_LINUX_TEST_SRC([blkdev_issue_discard_async_flags], [ #include ],[ @@ -553,22 +593,52 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_ISSUE_SECURE_ERASE], [ error = __blkdev_issue_discard(bdev, sector, nr_sects, GFP_KERNEL, flags, &biop); ]) - - ZFS_LINUX_TEST_SRC([blkdev_issue_discard_flags], [ + ZFS_LINUX_TEST_SRC([blkdev_issue_secure_erase], [ #include ],[ struct block_device *bdev = NULL; sector_t sector = 0; sector_t nr_sects = 0; - unsigned long flags = 0; int error __attribute__ ((unused)); - error = blkdev_issue_discard(bdev, - sector, nr_sects, GFP_KERNEL, flags); + error = blkdev_issue_secure_erase(bdev, + sector, nr_sects, GFP_KERNEL); ]) ]) -AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_ISSUE_SECURE_ERASE], [ +AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_ISSUE_DISCARD], [ + AC_MSG_CHECKING([whether blkdev_issue_discard() is available]) + ZFS_LINUX_TEST_RESULT([blkdev_issue_discard_noflags], [ + AC_MSG_RESULT(yes) + 
AC_DEFINE(HAVE_BLKDEV_ISSUE_DISCARD_NOFLAGS, 1, + [blkdev_issue_discard() is available]) + ],[ + AC_MSG_RESULT(no) + ]) + AC_MSG_CHECKING([whether blkdev_issue_discard(flags) is available]) + ZFS_LINUX_TEST_RESULT([blkdev_issue_discard_flags], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_BLKDEV_ISSUE_DISCARD_FLAGS, 1, + [blkdev_issue_discard(flags) is available]) + ],[ + AC_MSG_RESULT(no) + ]) + AC_MSG_CHECKING([whether __blkdev_issue_discard() is available]) + ZFS_LINUX_TEST_RESULT([blkdev_issue_discard_async_noflags], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC_NOFLAGS, 1, + [__blkdev_issue_discard() is available]) + ],[ + AC_MSG_RESULT(no) + ]) + AC_MSG_CHECKING([whether __blkdev_issue_discard(flags) is available]) + ZFS_LINUX_TEST_RESULT([blkdev_issue_discard_async_flags], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC_FLAGS, 1, + [__blkdev_issue_discard(flags) is available]) + ],[ + AC_MSG_RESULT(no) + ]) AC_MSG_CHECKING([whether blkdev_issue_secure_erase() is available]) ZFS_LINUX_TEST_RESULT([blkdev_issue_secure_erase], [ AC_MSG_RESULT(yes) @@ -576,24 +646,6 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_ISSUE_SECURE_ERASE], [ [blkdev_issue_secure_erase() is available]) ],[ AC_MSG_RESULT(no) - - AC_MSG_CHECKING([whether __blkdev_issue_discard() is available]) - ZFS_LINUX_TEST_RESULT([blkdev_issue_discard_async_flags], [ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC, 1, - [__blkdev_issue_discard() is available]) - ],[ - AC_MSG_RESULT(no) - - AC_MSG_CHECKING([whether blkdev_issue_discard() is available]) - ZFS_LINUX_TEST_RESULT([blkdev_issue_discard_flags], [ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_BLKDEV_ISSUE_DISCARD, 1, - [blkdev_issue_discard() is available]) - ],[ - ZFS_LINUX_TEST_ERROR([blkdev_issue_discard()]) - ]) - ]) ]) ]) @@ -657,7 +709,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV], [ ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_CHECK_MEDIA_CHANGE ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_WHOLE ZFS_AC_KERNEL_SRC_BLKDEV_BDEVNAME - ZFS_AC_KERNEL_SRC_BLKDEV_ISSUE_SECURE_ERASE + ZFS_AC_KERNEL_SRC_BLKDEV_ISSUE_DISCARD ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_KOBJ ZFS_AC_KERNEL_SRC_BLKDEV_PART_TO_DEV ZFS_AC_KERNEL_SRC_BLKDEV_DISK_CHECK_MEDIA_CHANGE @@ -678,7 +730,7 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV], [ ZFS_AC_KERNEL_BLKDEV_BDEV_WHOLE ZFS_AC_KERNEL_BLKDEV_BDEVNAME ZFS_AC_KERNEL_BLKDEV_GET_ERESTARTSYS - ZFS_AC_KERNEL_BLKDEV_ISSUE_SECURE_ERASE + ZFS_AC_KERNEL_BLKDEV_ISSUE_DISCARD ZFS_AC_KERNEL_BLKDEV_BDEV_KOBJ ZFS_AC_KERNEL_BLKDEV_PART_TO_DEV ZFS_AC_KERNEL_BLKDEV_DISK_CHECK_MEDIA_CHANGE diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c index 62c7aa14fd1f..12157d3b66ec 100644 --- a/module/os/linux/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -1243,8 +1243,6 @@ vdev_disk_io_flush(struct block_device *bdev, zio_t *zio) return (0); } -#if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE) || \ - defined(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC) BIO_END_IO_PROTO(vdev_disk_discard_end_io, bio, error) { zio_t *zio = bio->bi_private; @@ -1259,54 +1257,99 @@ BIO_END_IO_PROTO(vdev_disk_discard_end_io, bio, error) zio_interrupt(zio); } +/* + * Wrappers for the different secure erase and discard APIs. We use async + * when available; in this case, *biop is set to the last bio in the chain. 
+ */ static int -vdev_issue_discard_trim(zio_t *zio, unsigned long flags) +vdev_bdev_issue_secure_erase(zfs_bdev_handle_t *bdh, sector_t sector, + sector_t nsect, struct bio **biop) { - int ret; - struct bio *bio = NULL; + *biop = NULL; + int error; -#if defined(BLKDEV_DISCARD_SECURE) - ret = - __blkdev_issue_discard( - BDH_BDEV(((vdev_disk_t *)zio->io_vd->vdev_tsd)->vd_bdh), - zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS, flags, &bio); +#if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE) + error = blkdev_issue_secure_erase(BDH_BDEV(bdh), + sector, nsect, GFP_NOFS); +#elif defined(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC_FLAGS) + error = __blkdev_issue_discard(BDH_BDEV(bdh), + sector, nsect, GFP_NOFS, BLKDEV_DISCARD_SECURE, biop); +#elif defined(HAVE_BLKDEV_ISSUE_DISCARD_FLAGS) + error = blkdev_issue_discard(BDH_BDEV(bdh), + sector, nsect, GFP_NOFS, BLKDEV_DISCARD_SECURE); #else - (void) flags; - ret = - __blkdev_issue_discard( - BDH_BDEV(((vdev_disk_t *)zio->io_vd->vdev_tsd)->vd_bdh), - zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS, &bio); +#error "unsupported kernel" #endif - if (!ret && bio) { - bio->bi_private = zio; - bio->bi_end_io = vdev_disk_discard_end_io; - vdev_submit_bio(bio); - } - return (ret); + + return (error); } + +static int +vdev_bdev_issue_discard(zfs_bdev_handle_t *bdh, sector_t sector, + sector_t nsect, struct bio **biop) +{ + *biop = NULL; + int error; + +#if defined(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC_FLAGS) + error = __blkdev_issue_discard(BDH_BDEV(bdh), + sector, nsect, GFP_NOFS, 0, biop); +#elif defined(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC_NOFLAGS) + error = __blkdev_issue_discard(BDH_BDEV(bdh), + sector, nsect, GFP_NOFS, biop); +#elif defined(HAVE_BLKDEV_ISSUE_DISCARD_FLAGS) + error = blkdev_issue_discard(BDH_BDEV(bdh), + sector, nsect, GFP_NOFS, 0); +#elif defined(HAVE_BLKDEV_ISSUE_DISCARD_NOFLAGS) + error = blkdev_issue_discard(BDH_BDEV(bdh), + sector, nsect, GFP_NOFS); +#else +#error "unsupported kernel" #endif + return (error); +} + +/* + * Entry point for TRIM ops. This calls the right wrapper for secure erase or + * discard, and then does the appropriate finishing work for error vs success + * and async vs sync. + */ static int vdev_disk_io_trim(zio_t *zio) { - unsigned long trim_flags = 0; - if (zio->io_trim_flags & ZIO_TRIM_SECURE) { -#if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE) - return (-blkdev_issue_secure_erase( - BDH_BDEV(((vdev_disk_t *)zio->io_vd->vdev_tsd)->vd_bdh), - zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS)); -#elif defined(BLKDEV_DISCARD_SECURE) - trim_flags |= BLKDEV_DISCARD_SECURE; -#endif + int error; + struct bio *bio; + + zfs_bdev_handle_t *bdh = ((vdev_disk_t *)zio->io_vd->vdev_tsd)->vd_bdh; + sector_t sector = zio->io_offset >> 9; + sector_t nsects = zio->io_size >> 9; + + if (zio->io_trim_flags & ZIO_TRIM_SECURE) + error = vdev_bdev_issue_secure_erase(bdh, sector, nsects, &bio); + else + error = vdev_bdev_issue_discard(bdh, sector, nsects, &bio); + + if (error != 0) + return (SET_ERROR(-error)); + + if (bio == NULL) { + /* + * This was a synchronous op that completed successfully, so + * return it to ZFS immediately. + */ + zio_interrupt(zio); + } else { + /* + * This was an asynchronous op; set up completion callback and + * issue it. 
+ */ + bio->bi_private = zio; + bio->bi_end_io = vdev_disk_discard_end_io; + vdev_submit_bio(bio); } -#if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE) || \ - defined(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC) - return (vdev_issue_discard_trim(zio, trim_flags)); -#elif defined(HAVE_BLKDEV_ISSUE_DISCARD) - return (-blkdev_issue_discard( - BDH_BDEV(((vdev_disk_t *)zio->io_vd->vdev_tsd)->vd_bdh), - zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS, trim_flags)); -#else -#error "Unsupported kernel" -#endif + + return (0); } int (*vdev_disk_io_rw_fn)(zio_t *zio) = NULL; @@ -1381,14 +1424,12 @@ vdev_disk_io_start(zio_t *zio) return; case ZIO_TYPE_TRIM: - zio->io_error = vdev_disk_io_trim(zio); + error = vdev_disk_io_trim(zio); rw_exit(&vd->vd_lock); -#if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE) - if (zio->io_trim_flags & ZIO_TRIM_SECURE) - zio_interrupt(zio); -#elif defined(HAVE_BLKDEV_ISSUE_DISCARD) - zio_interrupt(zio); -#endif + if (error) { + zio->io_error = error; + zio_execute(zio); + } return; case ZIO_TYPE_READ: -- cgit v1.2.3 From 7ad2616d378f31fb9b15507b7038e445b80a4842 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Wed, 10 Apr 2024 13:14:13 +1000 Subject: vdev_disk: fix alignment check when buffer has non-zero starting offset If a linear buffer spans multiple pages, and the first page has a non-zero starting offset, the checker would not include the offset, and so would think there was an alignment gap at the end of the first page, rather than at the start. That is, for a 16K buffer spread across five pages with an initial 512B offset: [.XXXXXXX][XXXXXXXX][XXXXXXXX][XXXXXXXX][XXXXXXX.] It would be interpreted as: [XXXXXXX.][XXXXXXXX]... And be rejected as misaligned. Since it's already a linear ABD, the "linearising" copy would just reuse the buffer as-is, and the second check would failing, tripping the VERIFY in vdev_disk_io_rw(). This commit fixes all this by including the offset in the check for end-of-page alignment. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Signed-off-by: Rob Norris (cherry picked from commit 1bf649cb0a1cc6e48dce848611ba327eb283000e) --- module/os/linux/zfs/vdev_disk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c index 12157d3b66ec..223b41068b83 100644 --- a/module/os/linux/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -865,7 +865,7 @@ vdev_disk_check_pages_cb(struct page *page, size_t off, size_t len, void *priv) * Note if we're taking less than a full block, so we can check it * above on the next call. */ - s->end = len & s->bmask; + s->end = (off+len) & s->bmask; /* All blocks after the first must start on a block size boundary. */ if (s->npages != 0 && (off & s->bmask) != 0) -- cgit v1.2.3 From 5fc134ff2ff55811a34c4653f4435468f58eeca5 Mon Sep 17 00:00:00 2001 From: Ameer Hamza Date: Thu, 4 Apr 2024 06:21:25 +0500 Subject: zvol: use multiple taskq Currently, zvol uses a single taskq, resulting in throughput bottleneck under heavy load due to lock contention on the single taskq. This patch addresses the performance bottleneck under heavy load conditions by utilizing multiple taskqs, thus mitigating lock contention. 
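How requests are spread across those taskqs matters: the dispatch path hashes stable properties of each request (the zvol, a coarse offset bucket and the blk-mq hardware queue number) and uses the hash to pick a taskq, so related I/O keeps hitting the same queue while unrelated I/O stops serialising on a single lock. A standalone sketch of that steering idea is below; pick_queue() and its mixing arithmetic are illustrative inventions, whereas the actual change uses cityhash4() and a 512 MB offset bucket (ZVOL_TASKQ_OFFSET_SHIFT), as shown in the diff.

    #include <stdint.h>

    /*
     * Illustrative only: steer a request to one of "nqueues" worker taskqs
     * by hashing stable request properties. The constants are a generic
     * 64-bit mix; the patch itself uses cityhash4().
     */
    static unsigned int
    pick_queue(uintptr_t zv, uint64_t offset, unsigned int hw_queue,
        unsigned int nqueues)
    {
            uint64_t bucket = offset >> 29;           /* 512 MB buckets */
            uint64_t h = (uint64_t)zv;

            h ^= bucket * 0x9e3779b97f4a7c15ULL;      /* golden-ratio mix */
            h ^= (uint64_t)hw_queue << 32;
            h ^= h >> 33;
            h *= 0xff51afd7ed558ccdULL;
            h ^= h >> 33;

            return ((unsigned int)(h % nqueues));
    }
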
The number of taskqs scale dynamically based on the available CPUs in the system, as illustrated below: taskq total cpus taskqs threads threads ------- ------- ------- ------- 1 1 32 32 2 1 32 32 4 1 32 32 8 2 16 32 16 3 11 33 32 5 7 35 64 8 8 64 128 11 12 132 256 16 16 256 Reviewed-by: Alexander Motin Reviewed-by: Tony Hutter Reviewed-by: Brian Behlendorf Reviewed-by: Tony Nguyen Signed-off-by: Ameer Hamza Closes #15992 --- man/man4/zfs.4 | 7 +++ module/os/linux/zfs/zvol_os.c | 102 +++++++++++++++++++++++++++++++++++++----- 2 files changed, 99 insertions(+), 10 deletions(-) diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index a98ec519aaa5..bab40dd91c72 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -2370,6 +2370,13 @@ The number of requests which can be handled concurrently is controlled by is ignored when running on a kernel that supports block multiqueue .Pq Li blk-mq . . +.It Sy zvol_num_taskqs Ns = Ns Sy 0 Pq uint +Number of zvol taskqs. +If +.Sy 0 +(the default) then scaling is done internally to prefer 6 threads per taskq. +This only applies on Linux. +. .It Sy zvol_threads Ns = Ns Sy 0 Pq uint The number of system wide threads to use for processing zvol block IOs. If diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c index 8562e989738d..f9c64a8a8ed0 100644 --- a/module/os/linux/zfs/zvol_os.c +++ b/module/os/linux/zfs/zvol_os.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -53,6 +54,12 @@ static unsigned int zvol_request_sync = 0; static unsigned int zvol_prefetch_bytes = (128 * 1024); static unsigned long zvol_max_discard_blocks = 16384; +/* + * Switch taskq at multiple of 512 MB offset. This can be set to a lower value + * to utilize more threads for small files but may affect prefetch hits. + */ +#define ZVOL_TASKQ_OFFSET_SHIFT 29 + #ifndef HAVE_BLKDEV_GET_ERESTARTSYS static unsigned int zvol_open_timeout_ms = 1000; #endif @@ -74,6 +81,7 @@ static boolean_t zvol_use_blk_mq = B_FALSE; * read and write tests to a zvol in an NVMe pool (with 16 CPUs). 
*/ static unsigned int zvol_blk_mq_blocks_per_thread = 8; +static unsigned int zvol_num_taskqs = 0; #endif #ifndef BLKDEV_DEFAULT_RQ @@ -114,7 +122,11 @@ struct zvol_state_os { boolean_t use_blk_mq; }; -static taskq_t *zvol_taskq; +typedef struct zv_taskq { + uint_t tqs_cnt; + taskq_t **tqs_taskq; +} zv_taskq_t; +static zv_taskq_t zvol_taskqs; static struct ida zvol_ida; typedef struct zv_request_stack { @@ -532,6 +544,17 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq, } zv_request_task_t *task; + zv_taskq_t *ztqs = &zvol_taskqs; + uint_t blk_mq_hw_queue = 0; + uint_t tq_idx; + uint_t taskq_hash; +#ifdef HAVE_BLK_MQ + if (rq) + blk_mq_hw_queue = rq->mq_hctx->queue_num; +#endif + taskq_hash = cityhash4((uintptr_t)zv, offset >> ZVOL_TASKQ_OFFSET_SHIFT, + blk_mq_hw_queue, 0); + tq_idx = taskq_hash % ztqs->tqs_cnt; if (rw == WRITE) { if (unlikely(zv->zv_flags & ZVOL_RDONLY)) { @@ -601,7 +624,7 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq, zvol_discard(&zvr); } else { task = zv_request_task_create(zvr); - taskq_dispatch_ent(zvol_taskq, + taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx], zvol_discard_task, task, 0, &task->ent); } } else { @@ -609,7 +632,7 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq, zvol_write(&zvr); } else { task = zv_request_task_create(zvr); - taskq_dispatch_ent(zvol_taskq, + taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx], zvol_write_task, task, 0, &task->ent); } } @@ -631,7 +654,7 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq, zvol_read(&zvr); } else { task = zv_request_task_create(zvr); - taskq_dispatch_ent(zvol_taskq, + taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx], zvol_read_task, task, 0, &task->ent); } } @@ -1563,8 +1586,40 @@ zvol_init(void) zvol_actual_threads = MIN(MAX(zvol_threads, 1), 1024); } + /* + * Use atleast 32 zvol_threads but for many core system, + * prefer 6 threads per taskq, but no more taskqs + * than threads in them on large systems. 
+ * + * taskq total + * cpus taskqs threads threads + * ------- ------- ------- ------- + * 1 1 32 32 + * 2 1 32 32 + * 4 1 32 32 + * 8 2 16 32 + * 16 3 11 33 + * 32 5 7 35 + * 64 8 8 64 + * 128 11 12 132 + * 256 16 16 256 + */ + zv_taskq_t *ztqs = &zvol_taskqs; + uint_t num_tqs = MIN(num_online_cpus(), zvol_num_taskqs); + if (num_tqs == 0) { + num_tqs = 1 + num_online_cpus() / 6; + while (num_tqs * num_tqs > zvol_actual_threads) + num_tqs--; + } + uint_t per_tq_thread = zvol_actual_threads / num_tqs; + if (per_tq_thread * num_tqs < zvol_actual_threads) + per_tq_thread++; + ztqs->tqs_cnt = num_tqs; + ztqs->tqs_taskq = kmem_alloc(num_tqs * sizeof (taskq_t *), KM_SLEEP); error = register_blkdev(zvol_major, ZVOL_DRIVER); if (error) { + kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt * sizeof (taskq_t *)); + ztqs->tqs_taskq = NULL; printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error); return (error); } @@ -1584,11 +1639,22 @@ zvol_init(void) 1024); } #endif - zvol_taskq = taskq_create(ZVOL_DRIVER, zvol_actual_threads, maxclsyspri, - zvol_actual_threads, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC); - if (zvol_taskq == NULL) { - unregister_blkdev(zvol_major, ZVOL_DRIVER); - return (-ENOMEM); + for (uint_t i = 0; i < num_tqs; i++) { + char name[32]; + (void) snprintf(name, sizeof (name), "%s_tq-%u", + ZVOL_DRIVER, i); + ztqs->tqs_taskq[i] = taskq_create(name, per_tq_thread, + maxclsyspri, per_tq_thread, INT_MAX, + TASKQ_PREPOPULATE | TASKQ_DYNAMIC); + if (ztqs->tqs_taskq[i] == NULL) { + for (int j = i - 1; j >= 0; j--) + taskq_destroy(ztqs->tqs_taskq[j]); + unregister_blkdev(zvol_major, ZVOL_DRIVER); + kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt * + sizeof (taskq_t *)); + ztqs->tqs_taskq = NULL; + return (-ENOMEM); + } } zvol_init_impl(); @@ -1599,9 +1665,22 @@ zvol_init(void) void zvol_fini(void) { + zv_taskq_t *ztqs = &zvol_taskqs; zvol_fini_impl(); unregister_blkdev(zvol_major, ZVOL_DRIVER); - taskq_destroy(zvol_taskq); + + if (ztqs->tqs_taskq == NULL) { + ASSERT3U(ztqs->tqs_cnt, ==, 0); + } else { + for (uint_t i = 0; i < ztqs->tqs_cnt; i++) { + ASSERT3P(ztqs->tqs_taskq[i], !=, NULL); + taskq_destroy(ztqs->tqs_taskq[i]); + } + kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt * + sizeof (taskq_t *)); + ztqs->tqs_taskq = NULL; + } + ida_destroy(&zvol_ida); } @@ -1622,6 +1701,9 @@ MODULE_PARM_DESC(zvol_request_sync, "Synchronously handle bio requests"); module_param(zvol_max_discard_blocks, ulong, 0444); MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard"); +module_param(zvol_num_taskqs, uint, 0444); +MODULE_PARM_DESC(zvol_num_taskqs, "Number of zvol taskqs"); + module_param(zvol_prefetch_bytes, uint, 0644); MODULE_PARM_DESC(zvol_prefetch_bytes, "Prefetch N bytes at zvol start+end"); -- cgit v1.2.3 From 5c0fe099ec743c979c08964a8ae68322a79ab9bb Mon Sep 17 00:00:00 2001 From: Rob N Date: Tue, 9 Apr 2024 03:13:27 +1000 Subject: zvol_os: fix build on Linux <3.13 99741bde5 introduced zvol_num_taskqs, but put it behind the HAVE_BLK_MQ define, preventing builds on versions of Linux that don't have it (<3.13, incl EL7). Nothing about it seems dependent on blk-mq, so this just moves it out from behind that define and so fixes the build. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. 
Reviewed-by: Ameer Hamza Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Closes #16062 --- module/os/linux/zfs/zvol_os.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c index f9c64a8a8ed0..0121f17bbf70 100644 --- a/module/os/linux/zfs/zvol_os.c +++ b/module/os/linux/zfs/zvol_os.c @@ -81,9 +81,10 @@ static boolean_t zvol_use_blk_mq = B_FALSE; * read and write tests to a zvol in an NVMe pool (with 16 CPUs). */ static unsigned int zvol_blk_mq_blocks_per_thread = 8; -static unsigned int zvol_num_taskqs = 0; #endif +static unsigned int zvol_num_taskqs = 0; + #ifndef BLKDEV_DEFAULT_RQ /* BLKDEV_MAX_RQ was renamed to BLKDEV_DEFAULT_RQ in the 5.16 kernel */ #define BLKDEV_DEFAULT_RQ BLKDEV_MAX_RQ -- cgit v1.2.3 From 3c5f354a8c3af6b1f23dcb6b998d82ac64d0e1fe Mon Sep 17 00:00:00 2001 From: Rob N Date: Tue, 9 Apr 2024 04:38:49 +1000 Subject: zvol_os: fix compile with blk-mq on Linux 4.x 99741bde5 accesses a cached blk-mq hardware context through the mq_hctx field of struct request. However, this field did not exist until 5.0. Before that, the private function blk_mq_map_queue() was used to dig it out of broader queue context. This commit detects this situation, and handles it with a poor-man's simulation of that function. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Reviewed-by: Ameer Hamza Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Closes #16069 --- config/kernel-blk-queue.m4 | 15 +++++++++++++++ module/os/linux/zfs/zvol_os.c | 5 +++++ 2 files changed, 20 insertions(+) diff --git a/config/kernel-blk-queue.m4 b/config/kernel-blk-queue.m4 index bb5903b313eb..15dbe1c7dff0 100644 --- a/config/kernel-blk-queue.m4 +++ b/config/kernel-blk-queue.m4 @@ -377,6 +377,14 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_MQ], [ (void) blk_mq_alloc_tag_set(&tag_set); return BLK_STS_OK; ], []) + ZFS_LINUX_TEST_SRC([blk_mq_rq_hctx], [ + #include + #include + ], [ + struct request rq = {0}; + struct blk_mq_hw_ctx *hctx = NULL; + rq.mq_hctx = hctx; + ], []) ]) AC_DEFUN([ZFS_AC_KERNEL_BLK_MQ], [ @@ -384,6 +392,13 @@ AC_DEFUN([ZFS_AC_KERNEL_BLK_MQ], [ ZFS_LINUX_TEST_RESULT([blk_mq], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_BLK_MQ, 1, [block multiqueue is available]) + AC_MSG_CHECKING([whether block multiqueue hardware context is cached in struct request]) + ZFS_LINUX_TEST_RESULT([blk_mq_rq_hctx], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_BLK_MQ_RQ_HCTX, 1, [block multiqueue hardware context is cached in struct request]) + ], [ + AC_MSG_RESULT(no) + ]) ], [ AC_MSG_RESULT(no) ]) diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c index 0121f17bbf70..e1ede9851a4c 100644 --- a/module/os/linux/zfs/zvol_os.c +++ b/module/os/linux/zfs/zvol_os.c @@ -551,7 +551,12 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq, uint_t taskq_hash; #ifdef HAVE_BLK_MQ if (rq) +#ifdef HAVE_BLK_MQ_RQ_HCTX blk_mq_hw_queue = rq->mq_hctx->queue_num; +#else + blk_mq_hw_queue = + rq->q->queue_hw_ctx[rq->q->mq_map[rq->cpu]]->queue_num; +#endif #endif taskq_hash = cityhash4((uintptr_t)zv, offset >> ZVOL_TASKQ_OFFSET_SHIFT, blk_mq_hw_queue, 0); -- cgit v1.2.3 From 7ea83310096bc3dc0e73fc7d23d4ba981aa23e4b Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Tue, 24 Oct 2023 17:35:25 -0400 Subject: ZIL: Detect single-threaded workloads ... by checking that previous block is fully written and flushed. 
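Roughly, the detection keeps a small per-log budget of "recently saw concurrency" evidence: finding the current log block already opened by another committer is strong evidence, a previous block that has not finished flushing yet is weaker evidence, and the budget decays as quiet bursts complete. A condensed, illustrative sketch follows; ZIL_BURSTS and the block states mirror the diff below, but the helper itself and its simplified state enum are not part of the patch.

    #define ZIL_BURSTS      8

    typedef enum {
            LWB_STATE_NEW,
            LWB_STATE_OPENED,
            LWB_STATE_ISSUED,
            LWB_STATE_FLUSH_DONE
    } lwb_state_t;

    /*
     * Update the "workload looks multi-threaded" budget from the state of
     * the current and previous log blocks.
     */
    static unsigned int
    zil_parallel_update(unsigned int parallel, lwb_state_t cur, lwb_state_t prev)
    {
            if (cur == LWB_STATE_OPENED) {
                    /* Another committer already opened this block. */
                    return (ZIL_BURSTS);
            }
            if (prev != LWB_STATE_FLUSH_DONE) {
                    /* Previous block still in flight: weaker evidence. */
                    return (parallel > ZIL_BURSTS / 2 ? parallel : ZIL_BURSTS / 2);
            }
            /* A quiet burst completed: decay toward "single-threaded". */
            return (parallel > 0 ? parallel - 1 : 0);
    }
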
It allows to skip commit delays since we can give up on aggregation in that case. This removes zil_min_commit_timeout parameter, since for single-threaded workloads it is not needed at all, while on very fast devices even some multi-threaded workloads may get detected as single-threaded and still bypass the wait. To give multi-threaded workloads more aggregation chances increase zfs_commit_timeout_pct from 5 to 10%, as they should suffer less from additional latency. Also single-threaded workloads detection allows in perspective better prediction of the next block size. Reviewed-by: Brian Behlendorf Reviewed-by: Prakash Surya Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #15381 --- include/sys/zil_impl.h | 4 ++- man/man4/zfs.4 | 9 +---- module/zfs/zil.c | 91 ++++++++++++++++++++++---------------------------- 3 files changed, 44 insertions(+), 60 deletions(-) diff --git a/include/sys/zil_impl.h b/include/sys/zil_impl.h index f780ad3d61bc..c9db6d428ea2 100644 --- a/include/sys/zil_impl.h +++ b/include/sys/zil_impl.h @@ -181,6 +181,7 @@ typedef struct zil_vdev_node { avl_node_t zv_node; /* AVL tree linkage */ } zil_vdev_node_t; +#define ZIL_BURSTS 8 #define ZIL_PREV_BLKS 16 /* @@ -222,8 +223,9 @@ struct zilog { clock_t zl_replay_time; /* lbolt of when replay started */ uint64_t zl_replay_blks; /* number of log blocks replayed */ zil_header_t zl_old_header; /* debugging aid */ - uint_t zl_prev_blks[ZIL_PREV_BLKS]; /* size - sector rounded */ + uint_t zl_parallel; /* workload is multi-threaded */ uint_t zl_prev_rotor; /* rotor for zl_prev[] */ + uint_t zl_prev_blks[ZIL_PREV_BLKS]; /* size - sector rounded */ txg_node_t zl_dirty_link; /* protected by dp_dirty_zilogs list */ uint64_t zl_dirty_max_txg; /* highest txg used to dirty zilog */ diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index bab40dd91c72..5307f1f32e93 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -799,7 +799,7 @@ Note that this should not be set below the ZED thresholds (currently 10 checksums over 10 seconds) or else the daemon may not trigger any action. . -.It Sy zfs_commit_timeout_pct Ns = Ns Sy 5 Ns % Pq uint +.It Sy zfs_commit_timeout_pct Ns = Ns Sy 10 Ns % Pq uint This controls the amount of time that a ZIL block (lwb) will remain "open" when it isn't "full", and it has a thread waiting for it to be committed to stable storage. @@ -2206,13 +2206,6 @@ This sets the maximum number of write bytes logged via WR_COPIED. It tunes a tradeoff between additional memory copy and possibly worse log space efficiency vs additional range lock/unlock. . -.It Sy zil_min_commit_timeout Ns = Ns Sy 5000 Pq u64 -This sets the minimum delay in nanoseconds ZIL care to delay block commit, -waiting for more records. -If ZIL writes are too fast, kernel may not be able sleep for so short interval, -increasing log latency above allowed by -.Sy zfs_commit_timeout_pct . -. .It Sy zil_nocacheflush Ns = Ns Sy 0 Ns | Ns 1 Pq int Disable the cache flush commands that are normally sent to disk by the ZIL after an LWB write has completed. diff --git a/module/zfs/zil.c b/module/zfs/zil.c index 5642f082bdb8..8742fc6623a1 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -91,15 +91,7 @@ * committed to stable storage. Please refer to the zil_commit_waiter() * function (and the comments within it) for more details. */ -static uint_t zfs_commit_timeout_pct = 5; - -/* - * Minimal time we care to delay commit waiting for more ZIL records. - * At least FreeBSD kernel can't sleep for less than 2us at its best. 
- * So requests to sleep for less then 5us is a waste of CPU time with - * a risk of significant log latency increase due to oversleep. - */ -static uint64_t zil_min_commit_timeout = 5000; +static uint_t zfs_commit_timeout_pct = 10; /* * See zil.h for more information about these fields. @@ -2732,6 +2724,19 @@ zil_commit_writer_stall(zilog_t *zilog) ASSERT(list_is_empty(&zilog->zl_lwb_list)); } +static void +zil_burst_done(zilog_t *zilog) +{ + if (!list_is_empty(&zilog->zl_itx_commit_list) || + zilog->zl_cur_used == 0) + return; + + if (zilog->zl_parallel) + zilog->zl_parallel--; + + zilog->zl_cur_used = 0; +} + /* * This function will traverse the commit list, creating new lwbs as * needed, and committing the itxs from the commit list to these newly @@ -2746,7 +2751,6 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs) list_t nolwb_waiters; lwb_t *lwb, *plwb; itx_t *itx; - boolean_t first = B_TRUE; ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); @@ -2772,9 +2776,22 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs) zil_commit_activate_saxattr_feature(zilog); ASSERT(lwb->lwb_state == LWB_STATE_NEW || lwb->lwb_state == LWB_STATE_OPENED); - first = (lwb->lwb_state == LWB_STATE_NEW) && - ((plwb = list_prev(&zilog->zl_lwb_list, lwb)) == NULL || - plwb->lwb_state == LWB_STATE_FLUSH_DONE); + + /* + * If the lwb is still opened, it means the workload is really + * multi-threaded and we won the chance of write aggregation. + * If it is not opened yet, but previous lwb is still not + * flushed, it still means the workload is multi-threaded, but + * there was too much time between the commits to aggregate, so + * we try aggregation next times, but without too much hopes. + */ + if (lwb->lwb_state == LWB_STATE_OPENED) { + zilog->zl_parallel = ZIL_BURSTS; + } else if ((plwb = list_prev(&zilog->zl_lwb_list, lwb)) + != NULL && plwb->lwb_state != LWB_STATE_FLUSH_DONE) { + zilog->zl_parallel = MAX(zilog->zl_parallel, + ZIL_BURSTS / 2); + } } while ((itx = list_remove_head(&zilog->zl_itx_commit_list)) != NULL) { @@ -2849,7 +2866,7 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs) * Our lwb is done, leave the rest of * itx list to somebody else who care. */ - first = B_FALSE; + zilog->zl_parallel = ZIL_BURSTS; break; } } else { @@ -2941,28 +2958,15 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs) * try and pack as many itxs into as few lwbs as * possible, without significantly impacting the latency * of each individual itx. - * - * If we had no already running or open LWBs, it can be - * the workload is single-threaded. And if the ZIL write - * latency is very small or if the LWB is almost full, it - * may be cheaper to bypass the delay. 
*/ - if (lwb->lwb_state == LWB_STATE_OPENED && first) { - hrtime_t sleep = zilog->zl_last_lwb_latency * - zfs_commit_timeout_pct / 100; - if (sleep < zil_min_commit_timeout || - lwb->lwb_nmax - lwb->lwb_nused < - lwb->lwb_nmax / 8) { - list_insert_tail(ilwbs, lwb); - lwb = zil_lwb_write_close(zilog, lwb, - LWB_STATE_NEW); - zilog->zl_cur_used = 0; - if (lwb == NULL) { - while ((lwb = list_remove_head(ilwbs)) - != NULL) - zil_lwb_write_issue(zilog, lwb); - zil_commit_writer_stall(zilog); - } + if (lwb->lwb_state == LWB_STATE_OPENED && !zilog->zl_parallel) { + list_insert_tail(ilwbs, lwb); + lwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_NEW); + zil_burst_done(zilog); + if (lwb == NULL) { + while ((lwb = list_remove_head(ilwbs)) != NULL) + zil_lwb_write_issue(zilog, lwb); + zil_commit_writer_stall(zilog); } } } @@ -3120,19 +3124,7 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw) ASSERT3S(lwb->lwb_state, ==, LWB_STATE_CLOSED); - /* - * Since the lwb's zio hadn't been issued by the time this thread - * reached its timeout, we reset the zilog's "zl_cur_used" field - * to influence the zil block size selection algorithm. - * - * By having to issue the lwb's zio here, it means the size of the - * lwb was too large, given the incoming throughput of itxs. By - * setting "zl_cur_used" to zero, we communicate this fact to the - * block size selection algorithm, so it can take this information - * into account, and potentially select a smaller size for the - * next lwb block that is allocated. - */ - zilog->zl_cur_used = 0; + zil_burst_done(zilog); if (nlwb == NULL) { /* @@ -4250,9 +4242,6 @@ EXPORT_SYMBOL(zil_kstat_values_update); ZFS_MODULE_PARAM(zfs, zfs_, commit_timeout_pct, UINT, ZMOD_RW, "ZIL block open timeout percentage"); -ZFS_MODULE_PARAM(zfs_zil, zil_, min_commit_timeout, U64, ZMOD_RW, - "Minimum delay we care for ZIL block commit"); - ZFS_MODULE_PARAM(zfs_zil, zil_, replay_disable, INT, ZMOD_RW, "Disable intent logging replay"); -- cgit v1.2.3 From 8b1a132de74978dda5035f070320feede5d38512 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Fri, 17 Nov 2023 17:00:59 -0500 Subject: ZIO: Optimize zio_flush() - Generalize vdev_nowritecache handling by traversing through the VDEV tree and skipping children ZIOs where not supported. - Remove intermediate zio_null() in case of several VDEV children. - Remove children handling from zio_ioctl(). There are no other use cases for this code beside DKIOCFLUSHWRITECACHED, and would there be, I doubt they would so straightforward apply to all VDEV children. Comparing to removed previous optimization this should improve cases of redundant ZILs/SLOGs. Reviewed-by: Brian Behlendorf Reviewed-by: George Wilson Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #15515 --- module/zfs/zil.c | 2 +- module/zfs/zio.c | 36 +++++++++++++++--------------------- 2 files changed, 16 insertions(+), 22 deletions(-) diff --git a/module/zfs/zil.c b/module/zfs/zil.c index 8742fc6623a1..7ad0fb344b7b 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -1622,7 +1622,7 @@ zil_lwb_write_done(zio_t *zio) while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) { vdev_t *vd = vdev_lookup_top(spa, zv->zv_vdev); - if (vd != NULL && !vd->vdev_nowritecache) { + if (vd != NULL) { /* * The "ZIO_FLAG_DONT_PROPAGATE" is currently * always used within "zio_flush". 
This means, diff --git a/module/zfs/zio.c b/module/zfs/zio.c index d8eb075eef54..d0b4016237b9 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -1435,23 +1435,10 @@ zio_t * zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, zio_done_func_t *done, void *private, zio_flag_t flags) { - zio_t *zio; - int c; - - if (vd->vdev_children == 0) { - zio = zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private, - ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL, - ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE); - - zio->io_cmd = cmd; - } else { - zio = zio_null(pio, spa, NULL, NULL, NULL, flags); - - for (c = 0; c < vd->vdev_children; c++) - zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd, - done, private, flags)); - } - + zio_t *zio = zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private, + ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL, + ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE); + zio->io_cmd = cmd; return (zio); } @@ -1622,11 +1609,18 @@ zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size, } void -zio_flush(zio_t *zio, vdev_t *vd) +zio_flush(zio_t *pio, vdev_t *vd) { - zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE, - NULL, NULL, - ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY)); + if (vd->vdev_nowritecache) + return; + if (vd->vdev_children == 0) { + zio_nowait(zio_ioctl(pio, vd->vdev_spa, vd, + DKIOCFLUSHWRITECACHE, NULL, NULL, ZIO_FLAG_CANFAIL | + ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY)); + } else { + for (uint64_t c = 0; c < vd->vdev_children; c++) + zio_flush(pio, vd->vdev_child[c]); + } } void -- cgit v1.2.3 From 25ea8ce94bf23873f21ea84a7c460253e66366a3 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Thu, 21 Dec 2023 13:54:44 -0500 Subject: ZIL: Improve next log block size prediction Track history in context of bursts, not individual log blocks. It allows to not blow away all the history by single large burst of many block, and same time allows optimizations covering multiple blocks in a burst and even predicted following burst. For each burst account its optimal block size and minimal first block size. Use that statistics from the last 8 bursts to predict first block size of the next burst. Remove predefined set of block sizes. Allocate any size we see fit, multiple of 4KB, as required by ZIL now. With compression enabled by default, ZFS already writes pretty random block sizes, so this should not surprise space allocator any more. Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #15635 --- include/sys/zil_impl.h | 8 +- module/zfs/zil.c | 267 ++++++++++++++++++++++++++++++++++++------------- 2 files changed, 201 insertions(+), 74 deletions(-) diff --git a/include/sys/zil_impl.h b/include/sys/zil_impl.h index c9db6d428ea2..9a34bafc1c77 100644 --- a/include/sys/zil_impl.h +++ b/include/sys/zil_impl.h @@ -182,7 +182,6 @@ typedef struct zil_vdev_node { } zil_vdev_node_t; #define ZIL_BURSTS 8 -#define ZIL_PREV_BLKS 16 /* * Stable storage intent log management structure. One per dataset. 
@@ -217,7 +216,9 @@ struct zilog { uint64_t zl_parse_lr_count; /* number of log records parsed */ itxg_t zl_itxg[TXG_SIZE]; /* intent log txg chains */ list_t zl_itx_commit_list; /* itx list to be committed */ - uint64_t zl_cur_used; /* current commit log size used */ + uint64_t zl_cur_size; /* current burst full size */ + uint64_t zl_cur_left; /* current burst remaining size */ + uint64_t zl_cur_max; /* biggest record in current burst */ list_t zl_lwb_list; /* in-flight log write list */ avl_tree_t zl_bp_tree; /* track bps during log parse */ clock_t zl_replay_time; /* lbolt of when replay started */ @@ -225,7 +226,8 @@ struct zilog { zil_header_t zl_old_header; /* debugging aid */ uint_t zl_parallel; /* workload is multi-threaded */ uint_t zl_prev_rotor; /* rotor for zl_prev[] */ - uint_t zl_prev_blks[ZIL_PREV_BLKS]; /* size - sector rounded */ + uint_t zl_prev_opt[ZIL_BURSTS]; /* optimal block size */ + uint_t zl_prev_min[ZIL_BURSTS]; /* minimal first block size */ txg_node_t zl_dirty_link; /* protected by dp_dirty_zilogs list */ uint64_t zl_dirty_max_txg; /* highest txg used to dirty zilog */ diff --git a/module/zfs/zil.c b/module/zfs/zil.c index 7ad0fb344b7b..9b5d866a8c22 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -144,6 +144,7 @@ static kmem_cache_t *zil_zcw_cache; static void zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx); static itx_t *zil_itx_clone(itx_t *oitx); +static uint64_t zil_max_waste_space(zilog_t *zilog); static int zil_bp_compare(const void *x1, const void *x2) @@ -1710,24 +1711,6 @@ zil_lwb_write_open(zilog_t *zilog, lwb_t *lwb) mutex_exit(&zilog->zl_lock); } -/* - * Define a limited set of intent log block sizes. - * - * These must be a multiple of 4KB. Note only the amount used (again - * aligned to 4KB) actually gets written. However, we can't always just - * allocate SPA_OLD_MAXBLOCKSIZE as the slog space could be exhausted. - */ -static const struct { - uint64_t limit; - uint64_t blksz; -} zil_block_buckets[] = { - { 4096, 4096 }, /* non TX_WRITE */ - { 8192 + 4096, 8192 + 4096 }, /* database */ - { 32768 + 4096, 32768 + 4096 }, /* NFS writes */ - { 65536 + 4096, 65536 + 4096 }, /* 64KB writes */ - { UINT64_MAX, SPA_OLD_MAXBLOCKSIZE}, /* > 128KB writes */ -}; - /* * Maximum block size used by the ZIL. This is picked up when the ZIL is * initialized. Otherwise this should not be used directly; see @@ -1735,6 +1718,91 @@ static const struct { */ static uint_t zil_maxblocksize = SPA_OLD_MAXBLOCKSIZE; +/* + * Plan splitting of the provided burst size between several blocks. + */ +static uint_t +zil_lwb_plan(zilog_t *zilog, uint64_t size, uint_t *minsize) +{ + uint_t md = zilog->zl_max_block_size - sizeof (zil_chain_t); + + if (size <= md) { + /* + * Small bursts are written as-is in one block. + */ + *minsize = size; + return (size); + } else if (size > 8 * md) { + /* + * Big bursts use maximum blocks. The first block size + * is hard to predict, but it does not really matter. + */ + *minsize = 0; + return (md); + } + + /* + * Medium bursts try to divide evenly to better utilize several SLOG + * VDEVs. The first block size we predict assuming the worst case of + * maxing out others. Fall back to using maximum blocks if due to + * large records or wasted space we can not predict anything better. 
+ */ + uint_t s = size; + uint_t n = DIV_ROUND_UP(s, md - sizeof (lr_write_t)); + uint_t chunk = DIV_ROUND_UP(s, n); + uint_t waste = zil_max_waste_space(zilog); + waste = MAX(waste, zilog->zl_cur_max); + if (chunk <= md - waste) { + *minsize = MAX(s - (md - waste) * (n - 1), waste); + return (chunk); + } else { + *minsize = 0; + return (md); + } +} + +/* + * Try to predict next block size based on previous history. Make prediction + * sufficient for 7 of 8 previous bursts. Don't try to save if the saving is + * less then 50%, extra writes may cost more, but we don't want single spike + * to badly affect our predictions. + */ +static uint_t +zil_lwb_predict(zilog_t *zilog) +{ + uint_t m, o; + + /* If we are in the middle of a burst, take it into account also. */ + if (zilog->zl_cur_size > 0) { + o = zil_lwb_plan(zilog, zilog->zl_cur_size, &m); + } else { + o = UINT_MAX; + m = 0; + } + + /* Find minimum optimal size. We don't need to go below that. */ + for (int i = 0; i < ZIL_BURSTS; i++) + o = MIN(o, zilog->zl_prev_opt[i]); + + /* Find two biggest minimal first block sizes above the optimal. */ + uint_t m1 = MAX(m, o), m2 = o; + for (int i = 0; i < ZIL_BURSTS; i++) { + m = zilog->zl_prev_min[i]; + if (m >= m1) { + m2 = m1; + m1 = m; + } else if (m > m2) { + m2 = m; + } + } + + /* + * If second minimum size gives 50% saving -- use it. It may cost us + * one additional write later, but the space saving is just too big. + */ + return ((m1 < m2 * 2) ? m1 : m2); +} + /* * Close the log block for being issued and allocate the next one. * Has to be called under zl_issuer_lock to chain more lwbs. @@ -1742,7 +1810,7 @@ static uint_t zil_maxblocksize = SPA_OLD_MAXBLOCKSIZE; static lwb_t * zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb, lwb_state_t state) { - int i; + uint64_t blksz, plan, plan2; ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED); @@ -1757,34 +1825,40 @@ zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb, lwb_state_t state) return (NULL); /* - * Log blocks are pre-allocated. Here we select the size of the next - * block, based on size used in the last block. - * - first find the smallest bucket that will fit the block from a - * limited set of block sizes. This is because it's faster to write - * blocks allocated from the same metaslab as they are adjacent or - * close. - * - next find the maximum from the new suggested size and an array of - * previous sizes. This lessens a picket fence effect of wrongly - * guessing the size if we have a stream of say 2k, 64k, 2k, 64k - * requests. - * - * Note we only write what is used, but we can't just allocate - * the maximum block size because we can exhaust the available - * pool log space. + * Log blocks are pre-allocated. Here we select the size of the next + * block, based on what's left of this burst and the previous history. + * While we try to only write used part of the block, we can't just + * always allocate the maximum block size because we can exhaust all + * available pool log space, so we try to be reasonable. 
*/ - uint64_t zil_blksz = zilog->zl_cur_used + sizeof (zil_chain_t); - for (i = 0; zil_blksz > zil_block_buckets[i].limit; i++) - continue; - zil_blksz = MIN(zil_block_buckets[i].blksz, zilog->zl_max_block_size); - zilog->zl_prev_blks[zilog->zl_prev_rotor] = zil_blksz; - for (i = 0; i < ZIL_PREV_BLKS; i++) - zil_blksz = MAX(zil_blksz, zilog->zl_prev_blks[i]); - DTRACE_PROBE3(zil__block__size, zilog_t *, zilog, - uint64_t, zil_blksz, - uint64_t, zilog->zl_prev_blks[zilog->zl_prev_rotor]); - zilog->zl_prev_rotor = (zilog->zl_prev_rotor + 1) & (ZIL_PREV_BLKS - 1); - - return (zil_alloc_lwb(zilog, zil_blksz, NULL, 0, 0, state)); + if (zilog->zl_cur_left > 0) { + /* + * We are in the middle of a burst and know how much is left. + * But if workload is multi-threaded there may be more soon. + * Try to predict what can it be and plan for the worst case. + */ + uint_t m; + plan = zil_lwb_plan(zilog, zilog->zl_cur_left, &m); + if (zilog->zl_parallel) { + plan2 = zil_lwb_plan(zilog, zilog->zl_cur_left + + zil_lwb_predict(zilog), &m); + if (plan < plan2) + plan = plan2; + } + } else { + /* + * The previous burst is done and we can only predict what + * will come next. + */ + plan = zil_lwb_predict(zilog); + } + blksz = plan + sizeof (zil_chain_t); + blksz = P2ROUNDUP_TYPED(blksz, ZIL_MIN_BLKSZ, uint64_t); + blksz = MIN(blksz, zilog->zl_max_block_size); + DTRACE_PROBE3(zil__block__size, zilog_t *, zilog, uint64_t, blksz, + uint64_t, plan); + + return (zil_alloc_lwb(zilog, blksz, NULL, 0, 0, state)); } /* @@ -1835,7 +1909,7 @@ next_lwb: int wsz = lwb->lwb_sz; if (lwb->lwb_error == 0) { abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf, lwb->lwb_sz); - if (!lwb->lwb_slog || zilog->zl_cur_used <= zil_slog_bulk) + if (!lwb->lwb_slog || zilog->zl_cur_size <= zil_slog_bulk) prio = ZIO_PRIORITY_SYNC_WRITE; else prio = ZIO_PRIORITY_ASYNC_WRITE; @@ -1996,6 +2070,42 @@ zil_max_copied_data(zilog_t *zilog) return (MIN(max_data, zil_maxcopied)); } +static uint64_t +zil_itx_record_size(itx_t *itx) +{ + lr_t *lr = &itx->itx_lr; + + if (lr->lrc_txtype == TX_COMMIT) + return (0); + ASSERT3U(lr->lrc_reclen, >=, sizeof (lr_t)); + return (lr->lrc_reclen); +} + +static uint64_t +zil_itx_data_size(itx_t *itx) +{ + lr_t *lr = &itx->itx_lr; + lr_write_t *lrw = (lr_write_t *)lr; + + if (lr->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) { + ASSERT3U(lr->lrc_reclen, ==, sizeof (lr_write_t)); + return (P2ROUNDUP_TYPED(lrw->lr_length, sizeof (uint64_t), + uint64_t)); + } + return (0); +} + +static uint64_t +zil_itx_full_size(itx_t *itx) +{ + lr_t *lr = &itx->itx_lr; + + if (lr->lrc_txtype == TX_COMMIT) + return (0); + ASSERT3U(lr->lrc_reclen, >=, sizeof (lr_t)); + return (lr->lrc_reclen + zil_itx_data_size(itx)); +} + /* * Estimate space needed in the lwb for the itx. Allocate more lwbs or * split the itx as needed, but don't touch the actual transaction data. 
@@ -2038,16 +2148,9 @@ zil_lwb_assign(zilog_t *zilog, lwb_t *lwb, itx_t *itx, list_t *ilwbs) } reclen = lr->lrc_reclen; - if (lr->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) { - ASSERT3U(reclen, ==, sizeof (lr_write_t)); - dlen = P2ROUNDUP_TYPED( - lrw->lr_length, sizeof (uint64_t), uint64_t); - } else { - ASSERT3U(reclen, >=, sizeof (lr_t)); - dlen = 0; - } + ASSERT3U(reclen, >=, sizeof (lr_t)); ASSERT3U(reclen, <=, zil_max_log_data(zilog, 0)); - zilog->zl_cur_used += (reclen + dlen); + dlen = zil_itx_data_size(itx); cont: /* @@ -2088,6 +2191,7 @@ cont: clrw->lr_length = dnow; lrw->lr_offset += dnow; lrw->lr_length -= dnow; + zilog->zl_cur_left -= dnow; } else { citx = itx; clr = lr; @@ -2109,10 +2213,8 @@ cont: list_insert_tail(&lwb->lwb_itxs, citx); dlen -= dnow; - if (dlen > 0) { - zilog->zl_cur_used += reclen; + if (dlen > 0) goto cont; - } if (lr->lrc_txtype == TX_WRITE && lr->lrc_txg > spa_freeze_txg(zilog->zl_spa)) @@ -2139,13 +2241,8 @@ zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx) if (lr->lrc_txtype == TX_COMMIT) return; - if (lr->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) { - dlen = P2ROUNDUP_TYPED( - lrw->lr_length, sizeof (uint64_t), uint64_t); - } else { - dlen = 0; - } reclen = lr->lrc_reclen; + dlen = zil_itx_data_size(itx); ASSERT3U(reclen + dlen, <=, lwb->lwb_nused - lwb->lwb_nfilled); lr_buf = lwb->lwb_buf + lwb->lwb_nfilled; @@ -2576,6 +2673,7 @@ zil_get_commit_list(zilog_t *zilog) ASSERT(zilog_is_dirty_in_txg(zilog, txg) || spa_freeze_txg(zilog->zl_spa) != UINT64_MAX); list_t *sync_list = &itxg->itxg_itxs->i_sync_list; + itx_t *itx = NULL; if (unlikely(zilog->zl_suspend > 0)) { /* * ZIL was just suspended, but we lost the race. @@ -2585,10 +2683,20 @@ zil_get_commit_list(zilog_t *zilog) if (!list_is_empty(sync_list)) wtxg = MAX(wtxg, txg); } else { + itx = list_head(sync_list); list_move_tail(commit_list, sync_list); } mutex_exit(&itxg->itxg_lock); + + while (itx != NULL) { + uint64_t s = zil_itx_full_size(itx); + zilog->zl_cur_size += s; + zilog->zl_cur_left += s; + s = zil_itx_record_size(itx); + zilog->zl_cur_max = MAX(zilog->zl_cur_max, s); + itx = list_next(commit_list, itx); + } } return (wtxg); } @@ -2728,13 +2836,20 @@ static void zil_burst_done(zilog_t *zilog) { if (!list_is_empty(&zilog->zl_itx_commit_list) || - zilog->zl_cur_used == 0) + zilog->zl_cur_size == 0) return; if (zilog->zl_parallel) zilog->zl_parallel--; - zilog->zl_cur_used = 0; + uint_t r = (zilog->zl_prev_rotor + 1) & (ZIL_BURSTS - 1); + zilog->zl_prev_rotor = r; + zilog->zl_prev_opt[r] = zil_lwb_plan(zilog, zilog->zl_cur_size, + &zilog->zl_prev_min[r]); + + zilog->zl_cur_size = 0; + zilog->zl_cur_max = 0; + zilog->zl_cur_left = 0; } /* @@ -2867,6 +2982,8 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs) * itx list to somebody else who care. */ zilog->zl_parallel = ZIL_BURSTS; + zilog->zl_cur_left -= + zil_itx_full_size(itx); break; } } else { @@ -2876,8 +2993,10 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs) } list_insert_tail(&nolwb_itxs, itx); } + zilog->zl_cur_left -= zil_itx_full_size(itx); } else { ASSERT3S(lrc->lrc_txtype, !=, TX_COMMIT); + zilog->zl_cur_left -= zil_itx_full_size(itx); zil_itx_destroy(itx); } } @@ -2960,9 +3079,9 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs) * of each individual itx. 
*/ if (lwb->lwb_state == LWB_STATE_OPENED && !zilog->zl_parallel) { + zil_burst_done(zilog); list_insert_tail(ilwbs, lwb); lwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_NEW); - zil_burst_done(zilog); if (lwb == NULL) { while ((lwb = list_remove_head(ilwbs)) != NULL) zil_lwb_write_issue(zilog, lwb); @@ -3120,12 +3239,11 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw) * since we've reached the commit waiter's timeout and it still * hasn't been issued. */ + zil_burst_done(zilog); lwb_t *nlwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_NEW); ASSERT3S(lwb->lwb_state, ==, LWB_STATE_CLOSED); - zil_burst_done(zilog); - if (nlwb == NULL) { /* * When zil_lwb_write_close() returns NULL, this @@ -3720,7 +3838,9 @@ zil_alloc(objset_t *os, zil_header_t *zh_phys) zilog->zl_dirty_max_txg = 0; zilog->zl_last_lwb_opened = NULL; zilog->zl_last_lwb_latency = 0; - zilog->zl_max_block_size = zil_maxblocksize; + zilog->zl_max_block_size = MIN(MAX(P2ALIGN_TYPED(zil_maxblocksize, + ZIL_MIN_BLKSZ, uint64_t), ZIL_MIN_BLKSZ), + spa_maxblocksize(dmu_objset_spa(os))); mutex_init(&zilog->zl_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&zilog->zl_issuer_lock, NULL, MUTEX_DEFAULT, NULL); @@ -3740,6 +3860,11 @@ zil_alloc(objset_t *os, zil_header_t *zh_phys) cv_init(&zilog->zl_cv_suspend, NULL, CV_DEFAULT, NULL); cv_init(&zilog->zl_lwb_io_cv, NULL, CV_DEFAULT, NULL); + for (int i = 0; i < ZIL_BURSTS; i++) { + zilog->zl_prev_opt[i] = zilog->zl_max_block_size - + sizeof (zil_chain_t); + } + return (zilog); } -- cgit v1.2.3 From 3b8817db9607a875428d1a3bceb015de9709ae65 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Mon, 8 Jan 2024 19:49:39 -0500 Subject: ZIL: Update Linux tracing after #15635 While picking parts from #14909 I missed the Linux tracing-specific ones, which went unnoticed in default configurations but break the build in some. Reviewed-by: Ameer Hamza Reviewed-by: Brian Atkinson Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc.
Closes #15730 --- include/os/linux/zfs/sys/trace_zil.h | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/include/os/linux/zfs/sys/trace_zil.h b/include/os/linux/zfs/sys/trace_zil.h index afa1a274e43c..ae1caa3ac473 100644 --- a/include/os/linux/zfs/sys/trace_zil.h +++ b/include/os/linux/zfs/sys/trace_zil.h @@ -51,7 +51,9 @@ __field(uint64_t, zl_parse_lr_seq) \ __field(uint64_t, zl_parse_blk_count) \ __field(uint64_t, zl_parse_lr_count) \ - __field(uint64_t, zl_cur_used) \ + __field(uint64_t, zl_cur_size) \ + __field(uint64_t, zl_cur_left) \ + __field(uint64_t, zl_cur_max) \ __field(clock_t, zl_replay_time) \ __field(uint64_t, zl_replay_blks) @@ -72,7 +74,9 @@ __entry->zl_parse_lr_seq = zilog->zl_parse_lr_seq; \ __entry->zl_parse_blk_count = zilog->zl_parse_blk_count;\ __entry->zl_parse_lr_count = zilog->zl_parse_lr_count; \ - __entry->zl_cur_used = zilog->zl_cur_used; \ + __entry->zl_cur_size = zilog->zl_cur_size; \ + __entry->zl_cur_left = zilog->zl_cur_left; \ + __entry->zl_cur_max = zilog->zl_cur_max; \ __entry->zl_replay_time = zilog->zl_replay_time; \ __entry->zl_replay_blks = zilog->zl_replay_blks; @@ -82,7 +86,8 @@ "replay %u stop_sync %u logbias %u sync %u " \ "parse_error %u parse_blk_seq %llu parse_lr_seq %llu " \ "parse_blk_count %llu parse_lr_count %llu " \ - "cur_used %llu replay_time %lu replay_blks %llu }" + "cur_size %llu cur_left %llu cur_max %llu replay_time %lu " \ + "replay_blks %llu }" #define ZILOG_TP_PRINTK_ARGS \ __entry->zl_lr_seq, __entry->zl_commit_lr_seq, \ @@ -92,7 +97,8 @@ __entry->zl_stop_sync, __entry->zl_logbias, __entry->zl_sync, \ __entry->zl_parse_error, __entry->zl_parse_blk_seq, \ __entry->zl_parse_lr_seq, __entry->zl_parse_blk_count, \ - __entry->zl_parse_lr_count, __entry->zl_cur_used, \ + __entry->zl_parse_lr_count, __entry->zl_cur_size, \ + __entry->zl_cur_left, __entry->zl_cur_max, \ __entry->zl_replay_time, __entry->zl_replay_blks #define ITX_TP_STRUCT_ENTRY \ -- cgit v1.2.3 From fdd97e00934b9a1af7f953333ddf4f7c196907f0 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Mon, 7 Aug 2023 16:54:41 -0400 Subject: Refactor dmu_prefetch(). - Split dmu_prefetch_dnode() from dmu_prefetch() into a separate function. It is quite inconvenient to read the code where len = 0 means dnode prefetch instead of indirect/data prefetch. One function doing both has no benefits, since the code paths are independent. - Improve dmu_prefetch() handling of long block ranges. Instead of limiting the L0 data length to prefetch to dmu_prefetch_max, make dmu_prefetch_max limit the actual amount of prefetch at the specified level and, if there is more, prefetch all the rest at a higher indirection level. It should improve random access times within the prefetched range of any length, reducing the importance of the specific dmu_prefetch_max value. Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc.
Closes #15076 --- include/sys/dmu.h | 1 + module/os/freebsd/zfs/zfs_vnops_os.c | 4 +- module/os/linux/zfs/zfs_vnops_os.c | 7 +-- module/zfs/dmu.c | 103 +++++++++++++++++++++-------------- module/zfs/dsl_deadlist.c | 8 +-- module/zfs/spa_log_spacemap.c | 4 +- module/zfs/zvol.c | 2 +- 7 files changed, 72 insertions(+), 57 deletions(-) diff --git a/include/sys/dmu.h b/include/sys/dmu.h index 06b4dc27dfea..5bdb7c0293b8 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -889,6 +889,7 @@ extern uint_t zfs_max_recordsize; */ void dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset, uint64_t len, enum zio_priority pri); +void dmu_prefetch_dnode(objset_t *os, uint64_t object, enum zio_priority pri); typedef struct dmu_object_info { /* All sizes are in bytes unless otherwise indicated. */ diff --git a/module/os/freebsd/zfs/zfs_vnops_os.c b/module/os/freebsd/zfs/zfs_vnops_os.c index 05f28033be6a..1ba25bce6196 100644 --- a/module/os/freebsd/zfs/zfs_vnops_os.c +++ b/module/os/freebsd/zfs/zfs_vnops_os.c @@ -1869,10 +1869,8 @@ zfs_readdir(vnode_t *vp, zfs_uio_t *uio, cred_t *cr, int *eofp, ASSERT3S(outcount, <=, bufsize); - /* Prefetch znode */ if (prefetch) - dmu_prefetch(os, objnum, 0, 0, 0, - ZIO_PRIORITY_SYNC_READ); + dmu_prefetch_dnode(os, objnum, ZIO_PRIORITY_SYNC_READ); /* * Move to the next entry, fill in the previous offset. diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c index 7c473bc7e965..be528f6e8176 100644 --- a/module/os/linux/zfs/zfs_vnops_os.c +++ b/module/os/linux/zfs/zfs_vnops_os.c @@ -1610,11 +1610,8 @@ zfs_readdir(struct inode *ip, zpl_dir_context_t *ctx, cred_t *cr) if (done) break; - /* Prefetch znode */ - if (prefetch) { - dmu_prefetch(os, objnum, 0, 0, 0, - ZIO_PRIORITY_SYNC_READ); - } + if (prefetch) + dmu_prefetch_dnode(os, objnum, ZIO_PRIORITY_SYNC_READ); /* * Move to the next entry, fill in the previous offset. diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 3215ab1c2a14..d82211e6d4c7 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -695,74 +695,93 @@ dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, const void *tag) } /* - * Issue prefetch i/os for the given blocks. If level is greater than 0, the + * Issue prefetch I/Os for the given blocks. If level is greater than 0, the * indirect blocks prefetched will be those that point to the blocks containing - * the data starting at offset, and continuing to offset + len. + * the data starting at offset, and continuing to offset + len. If the range + * it too long, prefetch the first dmu_prefetch_max bytes as requested, while + * for the rest only a higher level, also fitting within dmu_prefetch_max. It + * should primarily help random reads, since for long sequential reads there is + * a speculative prefetcher. * * Note that if the indirect blocks above the blocks being prefetched are not - * in cache, they will be asynchronously read in. + * in cache, they will be asynchronously read in. Dnode read by dnode_hold() + * is currently synchronous. 
*/ void dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset, uint64_t len, zio_priority_t pri) { dnode_t *dn; - uint64_t blkid; - int nblks, err; - - if (len == 0) { /* they're interested in the bonus buffer */ - dn = DMU_META_DNODE(os); + int64_t level2 = level; + uint64_t start, end, start2, end2; - if (object == 0 || object >= DN_MAX_OBJECT) - return; - - rw_enter(&dn->dn_struct_rwlock, RW_READER); - blkid = dbuf_whichblock(dn, level, - object * sizeof (dnode_phys_t)); - dbuf_prefetch(dn, level, blkid, pri, 0); - rw_exit(&dn->dn_struct_rwlock); + if (dmu_prefetch_max == 0 || len == 0) { + dmu_prefetch_dnode(os, object, pri); return; } - /* - * See comment before the definition of dmu_prefetch_max. - */ - len = MIN(len, dmu_prefetch_max); - - /* - * XXX - Note, if the dnode for the requested object is not - * already cached, we will do a *synchronous* read in the - * dnode_hold() call. The same is true for any indirects. - */ - err = dnode_hold(os, object, FTAG, &dn); - if (err != 0) + if (dnode_hold(os, object, FTAG, &dn) != 0) return; /* - * offset + len - 1 is the last byte we want to prefetch for, and offset - * is the first. Then dbuf_whichblk(dn, level, off + len - 1) is the - * last block we want to prefetch, and dbuf_whichblock(dn, level, - * offset) is the first. Then the number we need to prefetch is the - * last - first + 1. + * Depending on len we may do two prefetches: blocks [start, end) at + * level, and following blocks [start2, end2) at higher level2. */ rw_enter(&dn->dn_struct_rwlock, RW_READER); - if (level > 0 || dn->dn_datablkshift != 0) { - nblks = dbuf_whichblock(dn, level, offset + len - 1) - - dbuf_whichblock(dn, level, offset) + 1; + if (dn->dn_datablkshift != 0) { + /* + * The object has multiple blocks. Calculate the full range + * of blocks [start, end2) and then split it into two parts, + * so that the first [start, end) fits into dmu_prefetch_max. + */ + start = dbuf_whichblock(dn, level, offset); + end2 = dbuf_whichblock(dn, level, offset + len - 1) + 1; + uint8_t ibs = dn->dn_indblkshift; + uint8_t bs = (level == 0) ? dn->dn_datablkshift : ibs; + uint_t limit = P2ROUNDUP(dmu_prefetch_max, 1 << bs) >> bs; + start2 = end = MIN(end2, start + limit); + + /* + * Find level2 where [start2, end2) fits into dmu_prefetch_max. + */ + uint8_t ibps = ibs - SPA_BLKPTRSHIFT; + limit = P2ROUNDUP(dmu_prefetch_max, 1 << ibs) >> ibs; + do { + level2++; + start2 = P2ROUNDUP(start2, 1 << ibps) >> ibps; + end2 = P2ROUNDUP(end2, 1 << ibps) >> ibps; + } while (end2 - start2 > limit); } else { - nblks = (offset < dn->dn_datablksz); + /* There is only one block. Prefetch it or nothing. */ + start = start2 = end2 = 0; + end = start + (level == 0 && offset < dn->dn_datablksz); } - if (nblks != 0) { - blkid = dbuf_whichblock(dn, level, offset); - for (int i = 0; i < nblks; i++) - dbuf_prefetch(dn, level, blkid + i, pri, 0); - } + for (uint64_t i = start; i < end; i++) + dbuf_prefetch(dn, level, i, pri, 0); + for (uint64_t i = start2; i < end2; i++) + dbuf_prefetch(dn, level2, i, pri, 0); rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); } +/* + * Issue prefetch I/Os for the given object's dnode. 
+ */ +void +dmu_prefetch_dnode(objset_t *os, uint64_t object, zio_priority_t pri) +{ + if (object == 0 || object >= DN_MAX_OBJECT) + return; + + dnode_t *dn = DMU_META_DNODE(os); + rw_enter(&dn->dn_struct_rwlock, RW_READER); + uint64_t blkid = dbuf_whichblock(dn, 0, object * sizeof (dnode_phys_t)); + dbuf_prefetch(dn, 0, blkid, pri, 0); + rw_exit(&dn->dn_struct_rwlock); +} + /* * Get the next "chunk" of file data to free. We traverse the file from * the end so that the file gets shorter over time (if we crashes in the diff --git a/module/zfs/dsl_deadlist.c b/module/zfs/dsl_deadlist.c index ac30a370813f..e6c8d4be13b4 100644 --- a/module/zfs/dsl_deadlist.c +++ b/module/zfs/dsl_deadlist.c @@ -173,8 +173,8 @@ dsl_deadlist_load_tree(dsl_deadlist_t *dl) * in parallel. Then open them all in a second pass. */ dle->dle_bpobj.bpo_object = za.za_first_integer; - dmu_prefetch(dl->dl_os, dle->dle_bpobj.bpo_object, - 0, 0, 0, ZIO_PRIORITY_SYNC_READ); + dmu_prefetch_dnode(dl->dl_os, dle->dle_bpobj.bpo_object, + ZIO_PRIORITY_SYNC_READ); avl_add(&dl->dl_tree, dle); } @@ -235,8 +235,8 @@ dsl_deadlist_load_cache(dsl_deadlist_t *dl) * in parallel. Then open them all in a second pass. */ dlce->dlce_bpobj = za.za_first_integer; - dmu_prefetch(dl->dl_os, dlce->dlce_bpobj, - 0, 0, 0, ZIO_PRIORITY_SYNC_READ); + dmu_prefetch_dnode(dl->dl_os, dlce->dlce_bpobj, + ZIO_PRIORITY_SYNC_READ); avl_add(&dl->dl_cache, dlce); } VERIFY3U(error, ==, ENOENT); diff --git a/module/zfs/spa_log_spacemap.c b/module/zfs/spa_log_spacemap.c index 2878e68c6e4b..cf05158b63f8 100644 --- a/module/zfs/spa_log_spacemap.c +++ b/module/zfs/spa_log_spacemap.c @@ -1147,8 +1147,8 @@ spa_ld_log_sm_data(spa_t *spa) /* Prefetch log spacemaps dnodes. */ for (sls = avl_first(&spa->spa_sm_logs_by_txg); sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) { - dmu_prefetch(spa_meta_objset(spa), sls->sls_sm_obj, - 0, 0, 0, ZIO_PRIORITY_SYNC_READ); + dmu_prefetch_dnode(spa_meta_objset(spa), sls->sls_sm_obj, + ZIO_PRIORITY_SYNC_READ); } uint_t pn = 0; diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c index 20ea71f23376..89b523ddd903 100644 --- a/module/zfs/zvol.c +++ b/module/zfs/zvol.c @@ -931,7 +931,7 @@ zvol_prefetch_minors_impl(void *arg) job->error = dmu_objset_own(dsname, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os); if (job->error == 0) { - dmu_prefetch(os, ZVOL_OBJ, 0, 0, 0, ZIO_PRIORITY_SYNC_READ); + dmu_prefetch_dnode(os, ZVOL_OBJ, ZIO_PRIORITY_SYNC_READ); dmu_objset_disown(os, B_TRUE, FTAG); } } -- cgit v1.2.3 From 793a2cff2a0cfa97c409ae22ae2dd7399f0f0bdb Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Tue, 13 Feb 2024 14:15:16 -0500 Subject: Linux: Cleanup taskq threads spawn/exit This changes taskq_thread_should_stop() to limit maximum exit rate for idle threads to one per 5 seconds. I believe the previous one was broken, not allowing any thread exits for tasks arriving more than one at a time and so completing while others are running. Also while there: - Remove taskq_thread_spawn() calls on task allocation errors. - Remove extra taskq_thread_should_stop() call. Reviewed-by: Brian Behlendorf Reviewed-by: Rich Ercolani Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. 
Closes #15873 --- include/os/linux/spl/sys/taskq.h | 2 +- man/man4/spl.4 | 18 ++------- module/os/linux/spl/spl-taskq.c | 85 ++++++++++++++-------------------------- 3 files changed, 34 insertions(+), 71 deletions(-) diff --git a/include/os/linux/spl/sys/taskq.h b/include/os/linux/spl/sys/taskq.h index 6c1b4377a98a..6c0dbbefda7f 100644 --- a/include/os/linux/spl/sys/taskq.h +++ b/include/os/linux/spl/sys/taskq.h @@ -104,7 +104,7 @@ typedef struct taskq { /* list node for the cpu hotplug callback */ struct hlist_node tq_hp_cb_node; boolean_t tq_hp_support; - unsigned long lastshouldstop; /* when to purge dynamic */ + unsigned long lastspawnstop; /* when to purge dynamic */ } taskq_t; typedef struct taskq_ent { diff --git a/man/man4/spl.4 b/man/man4/spl.4 index 414a92394858..5cc12764e18c 100644 --- a/man/man4/spl.4 +++ b/man/man4/spl.4 @@ -186,18 +186,8 @@ reading it could cause a lock-up if the list grow too large without limiting the output. "(truncated)" will be shown if the list is larger than the limit. . -.It Sy spl_taskq_thread_timeout_ms Ns = Ns Sy 10000 Pq uint -(Linux-only) -How long a taskq has to have had no work before we tear it down. -Previously, we would tear down a dynamic taskq worker as soon -as we noticed it had no work, but it was observed that this led -to a lot of churn in tearing down things we then immediately -spawned anew. -In practice, it seems any nonzero value will remove the vast -majority of this churn, while the nontrivially larger value -was chosen to help filter out the little remaining churn on -a mostly idle system. -Setting this value to -.Sy 0 -will revert to the previous behavior. +.It Sy spl_taskq_thread_timeout_ms Ns = Ns Sy 5000 Pq uint +Minimum idle threads exit interval for dynamic taskqs. +Smaller values allow idle threads exit more often and potentially be +respawned again on demand, causing more churn. .El diff --git a/module/os/linux/spl/spl-taskq.c b/module/os/linux/spl/spl-taskq.c index d18f935b167c..0e44aa1fcd38 100644 --- a/module/os/linux/spl/spl-taskq.c +++ b/module/os/linux/spl/spl-taskq.c @@ -36,12 +36,12 @@ static int spl_taskq_thread_bind = 0; module_param(spl_taskq_thread_bind, int, 0644); MODULE_PARM_DESC(spl_taskq_thread_bind, "Bind taskq thread to CPU by default"); -static uint_t spl_taskq_thread_timeout_ms = 10000; +static uint_t spl_taskq_thread_timeout_ms = 5000; /* BEGIN CSTYLED */ module_param(spl_taskq_thread_timeout_ms, uint, 0644); /* END CSTYLED */ MODULE_PARM_DESC(spl_taskq_thread_timeout_ms, - "Time to require a dynamic thread be idle before it gets cleaned up"); + "Minimum idle threads exit interval for dynamic taskqs"); static int spl_taskq_thread_dynamic = 1; module_param(spl_taskq_thread_dynamic, int, 0444); @@ -594,8 +594,7 @@ taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags) ASSERT(tq->tq_nactive <= tq->tq_nthreads); if ((flags & TQ_NOQUEUE) && (tq->tq_nactive == tq->tq_nthreads)) { /* Dynamic taskq may be able to spawn another thread */ - if (!(tq->tq_flags & TASKQ_DYNAMIC) || - taskq_thread_spawn(tq) == 0) + if (taskq_thread_spawn(tq) == 0) goto out; } @@ -629,11 +628,11 @@ taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags) spin_unlock(&t->tqent_lock); wake_up(&tq->tq_work_waitq); -out: + /* Spawn additional taskq threads if required. 
*/ if (!(flags & TQ_NOQUEUE) && tq->tq_nactive == tq->tq_nthreads) (void) taskq_thread_spawn(tq); - +out: spin_unlock_irqrestore(&tq->tq_lock, irqflags); return (rc); } @@ -676,10 +675,11 @@ taskq_dispatch_delay(taskq_t *tq, task_func_t func, void *arg, ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC)); spin_unlock(&t->tqent_lock); -out: + /* Spawn additional taskq threads if required. */ if (tq->tq_nactive == tq->tq_nthreads) (void) taskq_thread_spawn(tq); +out: spin_unlock_irqrestore(&tq->tq_lock, irqflags); return (rc); } @@ -704,9 +704,8 @@ taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags, if ((flags & TQ_NOQUEUE) && (tq->tq_nactive == tq->tq_nthreads)) { /* Dynamic taskq may be able to spawn another thread */ - if (!(tq->tq_flags & TASKQ_DYNAMIC) || - taskq_thread_spawn(tq) == 0) - goto out2; + if (taskq_thread_spawn(tq) == 0) + goto out; flags |= TQ_FRONT; } @@ -742,11 +741,11 @@ taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags, spin_unlock(&t->tqent_lock); wake_up(&tq->tq_work_waitq); -out: + /* Spawn additional taskq threads if required. */ if (tq->tq_nactive == tq->tq_nthreads) (void) taskq_thread_spawn(tq); -out2: +out: spin_unlock_irqrestore(&tq->tq_lock, irqflags); } EXPORT_SYMBOL(taskq_dispatch_ent); @@ -825,6 +824,7 @@ taskq_thread_spawn(taskq_t *tq) if (!(tq->tq_flags & TASKQ_DYNAMIC)) return (0); + tq->lastspawnstop = jiffies; if ((tq->tq_nthreads + tq->tq_nspawn < tq->tq_maxthreads) && (tq->tq_flags & TASKQ_ACTIVE)) { spawning = (++tq->tq_nspawn); @@ -836,9 +836,9 @@ taskq_thread_spawn(taskq_t *tq) } /* - * Threads in a dynamic taskq should only exit once it has been completely - * drained and no other threads are actively servicing tasks. This prevents - * threads from being created and destroyed more than is required. + * Threads in a dynamic taskq may exit once there is no more work to do. + * To prevent threads from being created and destroyed too often limit + * the exit rate to one per spl_taskq_thread_timeout_ms. * * The first thread is the thread list is treated as the primary thread. * There is nothing special about the primary thread but in order to avoid @@ -847,44 +847,22 @@ taskq_thread_spawn(taskq_t *tq) static int taskq_thread_should_stop(taskq_t *tq, taskq_thread_t *tqt) { - if (!(tq->tq_flags & TASKQ_DYNAMIC)) + ASSERT(!taskq_next_ent(tq)); + if (!(tq->tq_flags & TASKQ_DYNAMIC) || !spl_taskq_thread_dynamic) return (0); - + if (!(tq->tq_flags & TASKQ_ACTIVE)) + return (1); if (list_first_entry(&(tq->tq_thread_list), taskq_thread_t, tqt_thread_list) == tqt) return (0); - - int no_work = - ((tq->tq_nspawn == 0) && /* No threads are being spawned */ - (tq->tq_nactive == 0) && /* No threads are handling tasks */ - (tq->tq_nthreads > 1) && /* More than 1 thread is running */ - (!taskq_next_ent(tq)) && /* There are no pending tasks */ - (spl_taskq_thread_dynamic)); /* Dynamic taskqs are allowed */ - - /* - * If we would have said stop before, let's instead wait a bit, maybe - * we'll see more work come our way soon... - */ - if (no_work) { - /* if it's 0, we want the old behavior. */ - /* if the taskq is being torn down, we also want to go away. 
*/ - if (spl_taskq_thread_timeout_ms == 0 || - !(tq->tq_flags & TASKQ_ACTIVE)) - return (1); - unsigned long lasttime = tq->lastshouldstop; - if (lasttime > 0) { - if (time_after(jiffies, lasttime + - msecs_to_jiffies(spl_taskq_thread_timeout_ms))) - return (1); - else - return (0); - } else { - tq->lastshouldstop = jiffies; - } - } else { - tq->lastshouldstop = 0; - } - return (0); + ASSERT3U(tq->tq_nthreads, >, 1); + if (tq->tq_nspawn != 0) + return (0); + if (time_before(jiffies, tq->lastspawnstop + + msecs_to_jiffies(spl_taskq_thread_timeout_ms))) + return (0); + tq->lastspawnstop = jiffies; + return (1); } static int @@ -935,10 +913,8 @@ taskq_thread(void *args) if (list_empty(&tq->tq_pend_list) && list_empty(&tq->tq_prio_list)) { - if (taskq_thread_should_stop(tq, tqt)) { - wake_up_all(&tq->tq_wait_waitq); + if (taskq_thread_should_stop(tq, tqt)) break; - } add_wait_queue_exclusive(&tq->tq_work_waitq, &wait); spin_unlock_irqrestore(&tq->tq_lock, flags); @@ -1013,9 +989,6 @@ taskq_thread(void *args) tqt->tqt_id = TASKQID_INVALID; tqt->tqt_flags = 0; wake_up_all(&tq->tq_wait_waitq); - } else { - if (taskq_thread_should_stop(tq, tqt)) - break; } set_current_state(TASK_INTERRUPTIBLE); @@ -1122,7 +1095,7 @@ taskq_create(const char *name, int threads_arg, pri_t pri, tq->tq_flags = (flags | TASKQ_ACTIVE); tq->tq_next_id = TASKQID_INITIAL; tq->tq_lowest_id = TASKQID_INITIAL; - tq->lastshouldstop = 0; + tq->lastspawnstop = jiffies; INIT_LIST_HEAD(&tq->tq_free_list); INIT_LIST_HEAD(&tq->tq_pend_list); INIT_LIST_HEAD(&tq->tq_prio_list); -- cgit v1.2.3 From fa5de0c5cd4061b3b70d5e9eb2d67a4a7c594a63 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Wed, 20 Mar 2024 20:22:36 -0400 Subject: Update resume token at object receive. Before this change the resume token was updated only on data receive. Usually it is enough to resume replication without much overlap. But we've got a report of a curious case, where the replication source was traversed with a recursive grep, which, with atime enabled, modified every object without modifying any data. It produced several gigabytes of replication traffic without a single data write and so without a single resume point. While the resume token was not designed to resume from an object, I've found that the send implementation always sends the object before any data. So by requesting resume from offset 0 we are effectively resuming from the object, followed (or not) by the data at offset 0, just as we need it. Reviewed-by: Allan Jude Reviewed-by: Paul Dagnelie Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #15927 --- module/zfs/dmu_recv.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/module/zfs/dmu_recv.c b/module/zfs/dmu_recv.c index 54aa60259ea1..2cf10909738b 100644 --- a/module/zfs/dmu_recv.c +++ b/module/zfs/dmu_recv.c @@ -2110,6 +2110,16 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro, dmu_buf_rele(db, FTAG); dnode_rele(dn, FTAG); } + + /* + * If the receive fails, we want the resume stream to start with the + * same record that we last successfully received. There is no way to + * request resume from the object record, but we can benefit from the + * fact that sender always sends object record before anything else, + * after which it will "resend" data at offset 0 and resume normally.
+ */ + save_resume_state(rwa, drro->drr_object, 0, tx); + dmu_tx_commit(tx); return (0); -- cgit v1.2.3 From f7c1db6366947d879d0103a0970d22d9663ef20e Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Thu, 21 Mar 2024 18:42:21 -0400 Subject: BRT: Change brt_pending_tree sorting order It does not look important how exactly brt_pending_tree is sorted. When cloning large file, it is quite likely that all of its blocks have identical physical birth times, so comparing them first does not provide useful entropy, while accesses additional cache line. In most cases combination of vdev and offset provides unique result and physical birth time comparison is not even needed. Meanwhile, when traversing the tree inside brt_pending_apply(), it can be beneficial for dbuf cache and CPU cache hits to group processing by vdev and so by the per-VDEV BRT ZAPs. Reviewed-by: Rob Norris Reviewed-by: Brian Atkinson Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #15954 --- module/zfs/brt.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/module/zfs/brt.c b/module/zfs/brt.c index 225ddaca1e54..3d565cd1397c 100644 --- a/module/zfs/brt.c +++ b/module/zfs/brt.c @@ -1420,13 +1420,14 @@ brt_pending_entry_compare(const void *x1, const void *x2) const blkptr_t *bp1 = &bpe1->bpe_bp, *bp2 = &bpe2->bpe_bp; int cmp; - cmp = TREE_CMP(BP_PHYSICAL_BIRTH(bp1), BP_PHYSICAL_BIRTH(bp2)); + cmp = TREE_CMP(DVA_GET_VDEV(&bp1->blk_dva[0]), + DVA_GET_VDEV(&bp2->blk_dva[0])); if (cmp == 0) { - cmp = TREE_CMP(DVA_GET_VDEV(&bp1->blk_dva[0]), - DVA_GET_VDEV(&bp2->blk_dva[0])); - if (cmp == 0) { - cmp = TREE_CMP(DVA_GET_OFFSET(&bp1->blk_dva[0]), - DVA_GET_OFFSET(&bp2->blk_dva[0])); + cmp = TREE_CMP(DVA_GET_OFFSET(&bp1->blk_dva[0]), + DVA_GET_OFFSET(&bp2->blk_dva[0])); + if (unlikely(cmp == 0)) { + cmp = TREE_CMP(BP_PHYSICAL_BIRTH(bp1), + BP_PHYSICAL_BIRTH(bp2)); } } -- cgit v1.2.3 From dced953b62f255a209c93b32797e8aa1260c4edc Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Thu, 21 Mar 2024 19:43:53 -0400 Subject: ZAP: Some cleanups/micro-optimizations - Remove custom zap_memset(), use regular memset(). - Use PANIC() instead of opaque cmn_err(CE_PANIC). - Provide entry parameter to zap_leaf_rehash_entry(). - Reduce branching in zap_leaf_array_create() inner loop. - Remove signedness where it should not be. Should be no function changes. Reviewed-by: Brian Atkinson Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #15976 --- include/sys/zap_leaf.h | 8 +++--- module/zfs/zap_leaf.c | 77 ++++++++++++++++++++++---------------------------- 2 files changed, 38 insertions(+), 47 deletions(-) diff --git a/include/sys/zap_leaf.h b/include/sys/zap_leaf.h index ebc67c2bf465..d563edd7ba59 100644 --- a/include/sys/zap_leaf.h +++ b/include/sys/zap_leaf.h @@ -47,7 +47,7 @@ struct zap_stats; * entries - header space (2*chunksize) */ #define ZAP_LEAF_NUMCHUNKS_BS(bs) \ - (((1<<(bs)) - 2*ZAP_LEAF_HASH_NUMENTRIES_BS(bs)) / \ + (((1U << (bs)) - 2 * ZAP_LEAF_HASH_NUMENTRIES_BS(bs)) / \ ZAP_LEAF_CHUNKSIZE - 2) #define ZAP_LEAF_NUMCHUNKS(l) (ZAP_LEAF_NUMCHUNKS_BS(((l)->l_bs))) @@ -80,7 +80,7 @@ struct zap_stats; * chunks per entry (3). 
*/ #define ZAP_LEAF_HASH_SHIFT_BS(bs) ((bs) - 5) -#define ZAP_LEAF_HASH_NUMENTRIES_BS(bs) (1 << ZAP_LEAF_HASH_SHIFT_BS(bs)) +#define ZAP_LEAF_HASH_NUMENTRIES_BS(bs) (1U << ZAP_LEAF_HASH_SHIFT_BS(bs)) #define ZAP_LEAF_HASH_SHIFT(l) (ZAP_LEAF_HASH_SHIFT_BS(((l)->l_bs))) #define ZAP_LEAF_HASH_NUMENTRIES(l) (ZAP_LEAF_HASH_NUMENTRIES_BS(((l)->l_bs))) @@ -163,7 +163,7 @@ typedef struct zap_leaf { dmu_buf_user_t l_dbu; krwlock_t l_rwlock; uint64_t l_blkid; /* 1< #include -static uint16_t *zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry); +static uint16_t *zap_leaf_rehash_entry(zap_leaf_t *l, struct zap_leaf_entry *le, + uint16_t entry); #define CHAIN_END 0xffff /* end of the chunk chain */ @@ -52,16 +53,6 @@ static uint16_t *zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry); #define LEAF_HASH_ENTPTR(l, h) (&zap_leaf_phys(l)->l_hash[LEAF_HASH(l, h)]) -static void -zap_memset(void *a, int c, size_t n) -{ - char *cp = a; - char *cpend = cp + n; - - while (cp < cpend) - *cp++ = c; -} - static void stv(int len, void *addr, uint64_t value) { @@ -79,7 +70,7 @@ stv(int len, void *addr, uint64_t value) *(uint64_t *)addr = value; return; default: - cmn_err(CE_PANIC, "bad int len %d", len); + PANIC("bad int len %d", len); } } @@ -96,13 +87,13 @@ ldv(int len, const void *addr) case 8: return (*(uint64_t *)addr); default: - cmn_err(CE_PANIC, "bad int len %d", len); + PANIC("bad int len %d", len); } return (0xFEEDFACEDEADBEEFULL); } void -zap_leaf_byteswap(zap_leaf_phys_t *buf, int size) +zap_leaf_byteswap(zap_leaf_phys_t *buf, size_t size) { zap_leaf_t l; dmu_buf_t l_dbuf; @@ -119,10 +110,10 @@ zap_leaf_byteswap(zap_leaf_phys_t *buf, int size) buf->l_hdr.lh_prefix_len = BSWAP_16(buf->l_hdr.lh_prefix_len); buf->l_hdr.lh_freelist = BSWAP_16(buf->l_hdr.lh_freelist); - for (int i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(&l); i++) + for (uint_t i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(&l); i++) buf->l_hash[i] = BSWAP_16(buf->l_hash[i]); - for (int i = 0; i < ZAP_LEAF_NUMCHUNKS(&l); i++) { + for (uint_t i = 0; i < ZAP_LEAF_NUMCHUNKS(&l); i++) { zap_leaf_chunk_t *lc = &ZAP_LEAF_CHUNK(&l, i); struct zap_leaf_entry *le; @@ -160,11 +151,11 @@ void zap_leaf_init(zap_leaf_t *l, boolean_t sort) { l->l_bs = highbit64(l->l_dbuf->db_size) - 1; - zap_memset(&zap_leaf_phys(l)->l_hdr, 0, + memset(&zap_leaf_phys(l)->l_hdr, 0, sizeof (struct zap_leaf_header)); - zap_memset(zap_leaf_phys(l)->l_hash, CHAIN_END, + memset(zap_leaf_phys(l)->l_hash, CHAIN_END, 2*ZAP_LEAF_HASH_NUMENTRIES(l)); - for (int i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) { + for (uint_t i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) { ZAP_LEAF_CHUNK(l, i).l_free.lf_type = ZAP_CHUNK_FREE; ZAP_LEAF_CHUNK(l, i).l_free.lf_next = i+1; } @@ -185,7 +176,7 @@ zap_leaf_chunk_alloc(zap_leaf_t *l) { ASSERT(zap_leaf_phys(l)->l_hdr.lh_nfree > 0); - int chunk = zap_leaf_phys(l)->l_hdr.lh_freelist; + uint_t chunk = zap_leaf_phys(l)->l_hdr.lh_freelist; ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_free.lf_type, ==, ZAP_CHUNK_FREE); @@ -223,28 +214,29 @@ zap_leaf_array_create(zap_leaf_t *l, const char *buf, { uint16_t chunk_head; uint16_t *chunkp = &chunk_head; - int byten = 0; + int byten = integer_size; uint64_t value = 0; int shift = (integer_size - 1) * 8; int len = num_integers; ASSERT3U(num_integers * integer_size, <=, ZAP_MAXVALUELEN); + if (len > 0) + value = ldv(integer_size, buf); while (len > 0) { uint16_t chunk = zap_leaf_chunk_alloc(l); struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; la->la_type = ZAP_CHUNK_ARRAY; for (int i = 0; i < 
ZAP_LEAF_ARRAY_BYTES; i++) { - if (byten == 0) - value = ldv(integer_size, buf); la->la_array[i] = value >> shift; value <<= 8; - if (++byten == integer_size) { - byten = 0; - buf += integer_size; + if (--byten == 0) { if (--len == 0) break; + byten = integer_size; + buf += integer_size; + value = ldv(integer_size, buf); } } @@ -264,7 +256,7 @@ zap_leaf_array_free(zap_leaf_t *l, uint16_t *chunkp) *chunkp = CHAIN_END; while (chunk != CHAIN_END) { - int nextchunk = ZAP_LEAF_CHUNK(l, chunk).l_array.la_next; + uint_t nextchunk = ZAP_LEAF_CHUNK(l, chunk).l_array.la_next; ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_array.la_type, ==, ZAP_CHUNK_ARRAY); zap_leaf_chunk_free(l, chunk); @@ -333,7 +325,7 @@ zap_leaf_array_read(zap_leaf_t *l, uint16_t chunk, static boolean_t zap_leaf_array_match(zap_leaf_t *l, zap_name_t *zn, - int chunk, int array_numints) + uint_t chunk, int array_numints) { int bseen = 0; @@ -562,7 +554,7 @@ zap_entry_create(zap_leaf_t *l, zap_name_t *zn, uint32_t cd, uint64_t valuelen = integer_size * num_integers; - int numchunks = 1 + ZAP_LEAF_ARRAY_NCHUNKS(zn->zn_key_orig_numints * + uint_t numchunks = 1 + ZAP_LEAF_ARRAY_NCHUNKS(zn->zn_key_orig_numints * zn->zn_key_intlen) + ZAP_LEAF_ARRAY_NCHUNKS(valuelen); if (numchunks > ZAP_LEAF_NUMCHUNKS(l)) return (SET_ERROR(E2BIG)); @@ -624,7 +616,7 @@ zap_entry_create(zap_leaf_t *l, zap_name_t *zn, uint32_t cd, /* link it into the hash chain */ /* XXX if we did the search above, we could just use that */ - uint16_t *chunkp = zap_leaf_rehash_entry(l, chunk); + uint16_t *chunkp = zap_leaf_rehash_entry(l, le, chunk); zap_leaf_phys(l)->l_hdr.lh_nentries++; @@ -687,9 +679,8 @@ zap_entry_normalization_conflict(zap_entry_handle_t *zeh, zap_name_t *zn, */ static uint16_t * -zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry) +zap_leaf_rehash_entry(zap_leaf_t *l, struct zap_leaf_entry *le, uint16_t entry) { - struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry); struct zap_leaf_entry *le2; uint16_t *chunkp; @@ -722,7 +713,7 @@ zap_leaf_transfer_array(zap_leaf_t *l, uint16_t chunk, zap_leaf_t *nl) &ZAP_LEAF_CHUNK(nl, nchunk).l_array; struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; - int nextchunk = la->la_next; + uint_t nextchunk = la->la_next; ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); ASSERT3U(nchunk, <, ZAP_LEAF_NUMCHUNKS(l)); @@ -739,7 +730,7 @@ zap_leaf_transfer_array(zap_leaf_t *l, uint16_t chunk, zap_leaf_t *nl) } static void -zap_leaf_transfer_entry(zap_leaf_t *l, int entry, zap_leaf_t *nl) +zap_leaf_transfer_entry(zap_leaf_t *l, uint_t entry, zap_leaf_t *nl) { struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry); ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); @@ -748,7 +739,7 @@ zap_leaf_transfer_entry(zap_leaf_t *l, int entry, zap_leaf_t *nl) struct zap_leaf_entry *nle = ZAP_LEAF_ENTRY(nl, chunk); *nle = *le; /* structure assignment */ - (void) zap_leaf_rehash_entry(nl, chunk); + (void) zap_leaf_rehash_entry(nl, nle, chunk); nle->le_name_chunk = zap_leaf_transfer_array(l, le->le_name_chunk, nl); nle->le_value_chunk = @@ -766,7 +757,7 @@ zap_leaf_transfer_entry(zap_leaf_t *l, int entry, zap_leaf_t *nl) void zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort) { - int bit = 64 - 1 - zap_leaf_phys(l)->l_hdr.lh_prefix_len; + uint_t bit = 64 - 1 - zap_leaf_phys(l)->l_hdr.lh_prefix_len; /* set new prefix and prefix_len */ zap_leaf_phys(l)->l_hdr.lh_prefix <<= 1; @@ -777,7 +768,7 @@ zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort) zap_leaf_phys(l)->l_hdr.lh_prefix_len; /* break existing hash chains */ - 
zap_memset(zap_leaf_phys(l)->l_hash, CHAIN_END, + memset(zap_leaf_phys(l)->l_hash, CHAIN_END, 2*ZAP_LEAF_HASH_NUMENTRIES(l)); if (sort) @@ -792,7 +783,7 @@ zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort) * but this accesses memory more sequentially, and when we're * called, the block is usually pretty full. */ - for (int i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) { + for (uint_t i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) { struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, i); if (le->le_type != ZAP_CHUNK_ENTRY) continue; @@ -800,14 +791,14 @@ zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort) if (le->le_hash & (1ULL << bit)) zap_leaf_transfer_entry(l, i, nl); else - (void) zap_leaf_rehash_entry(l, i); + (void) zap_leaf_rehash_entry(l, le, i); } } void zap_leaf_stats(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs) { - int n = zap_f_phys(zap)->zap_ptrtbl.zt_shift - + uint_t n = zap_f_phys(zap)->zap_ptrtbl.zt_shift - zap_leaf_phys(l)->l_hdr.lh_prefix_len; n = MIN(n, ZAP_HISTOGRAM_SIZE-1); zs->zs_leafs_with_2n_pointers[n]++; @@ -823,9 +814,9 @@ zap_leaf_stats(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs) n = MIN(n, ZAP_HISTOGRAM_SIZE-1); zs->zs_blocks_n_tenths_full[n]++; - for (int i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(l); i++) { - int nentries = 0; - int chunk = zap_leaf_phys(l)->l_hash[i]; + for (uint_t i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(l); i++) { + uint_t nentries = 0; + uint_t chunk = zap_leaf_phys(l)->l_hash[i]; while (chunk != CHAIN_END) { struct zap_leaf_entry *le = -- cgit v1.2.3 From fdd8c0aea1cf2b924b6c9c505437ffd30492d035 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Mon, 25 Mar 2024 17:58:04 -0400 Subject: BRT: Skip duplicate BRT prefetches If there is a pending entry for this block, then we've already issued BRT prefetch for it within this TXG, so don't do it again. BRT vdev lookup and following zap_prefetch_uint64() call can be pretty expensive and should be avoided when not necessary. Reviewed-by: Pawel Jakub Dawidek Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #15941 --- module/zfs/brt.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/module/zfs/brt.c b/module/zfs/brt.c index 3d565cd1397c..7ddec0b4b9bb 100644 --- a/module/zfs/brt.c +++ b/module/zfs/brt.c @@ -1472,10 +1472,10 @@ brt_pending_add(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx) kmem_cache_free(brt_pending_entry_cache, newbpe); } else { ASSERT(bpe == NULL); - } - /* Prefetch BRT entry, as we will need it in the syncing context. */ - brt_prefetch(brt, bp); + /* Prefetch BRT entry for the syncing context. */ + brt_prefetch(brt, bp); + } } void -- cgit v1.2.3 From 19bf54b76414b70866df28c6a66e521c7fef349f Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Mon, 25 Mar 2024 17:58:50 -0400 Subject: ZAP: Massively switch to _by_dnode() interfaces Before this change ZAP called dnode_hold() for almost every block access, that was clearly visible in profiler under heavy load, such as BRT. This patch makes it always hold the dnode reference between zap_lockdir() and zap_unlockdir(). It allows to avoid most of dnode operations between those. It also adds several new _by_dnode() APIs to ZAP and uses them in BRT code. Also adds dmu_prefetch_by_dnode() variant and uses it in the ZAP code. After this there remains only one call to dmu_buf_dnode_enter(), which seems to be unneeded. So remove the call and the functions. Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. 
Closes #15951 --- include/sys/dmu.h | 4 +- include/sys/zap.h | 8 ++ include/sys/zap_impl.h | 1 + module/zfs/brt.c | 72 ++++------------- module/zfs/dbuf.c | 15 ---- module/zfs/dmu.c | 18 ++++- module/zfs/dmu_recv.c | 7 +- module/zfs/zap.c | 43 ++++------- module/zfs/zap_micro.c | 206 +++++++++++++++++++++++++++++++++++-------------- 9 files changed, 202 insertions(+), 172 deletions(-) diff --git a/include/sys/dmu.h b/include/sys/dmu.h index 5bdb7c0293b8..26b329b53f05 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -739,8 +739,6 @@ void *dmu_buf_remove_user(dmu_buf_t *db, dmu_buf_user_t *user); void *dmu_buf_get_user(dmu_buf_t *db); objset_t *dmu_buf_get_objset(dmu_buf_t *db); -dnode_t *dmu_buf_dnode_enter(dmu_buf_t *db); -void dmu_buf_dnode_exit(dmu_buf_t *db); /* Block until any in-progress dmu buf user evictions complete. */ void dmu_buf_user_evict_wait(void); @@ -889,6 +887,8 @@ extern uint_t zfs_max_recordsize; */ void dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset, uint64_t len, enum zio_priority pri); +void dmu_prefetch_by_dnode(dnode_t *dn, int64_t level, uint64_t offset, + uint64_t len, enum zio_priority pri); void dmu_prefetch_dnode(objset_t *os, uint64_t object, enum zio_priority pri); typedef struct dmu_object_info { diff --git a/include/sys/zap.h b/include/sys/zap.h index 308a7c7284d7..96ddcc324b65 100644 --- a/include/sys/zap.h +++ b/include/sys/zap.h @@ -253,6 +253,9 @@ int zap_add_by_dnode(dnode_t *dn, const char *key, int zap_add_uint64(objset_t *ds, uint64_t zapobj, const uint64_t *key, int key_numints, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx); +int zap_add_uint64_by_dnode(dnode_t *dn, const uint64_t *key, + int key_numints, int integer_size, uint64_t num_integers, + const void *val, dmu_tx_t *tx); /* * Set the attribute with the given name to the given value. 
If an @@ -267,6 +270,9 @@ int zap_update(objset_t *ds, uint64_t zapobj, const char *name, int zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx); +int zap_update_uint64_by_dnode(dnode_t *dn, const uint64_t *key, + int key_numints, + int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx); /* * Get the length (in integers) and the integer size of the specified @@ -292,6 +298,8 @@ int zap_remove_norm(objset_t *ds, uint64_t zapobj, const char *name, int zap_remove_by_dnode(dnode_t *dn, const char *name, dmu_tx_t *tx); int zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, dmu_tx_t *tx); +int zap_remove_uint64_by_dnode(dnode_t *dn, const uint64_t *key, + int key_numints, dmu_tx_t *tx); /* * Returns (in *count) the number of attributes in the specified zap diff --git a/include/sys/zap_impl.h b/include/sys/zap_impl.h index 74853f5faceb..2959aa9b2ca4 100644 --- a/include/sys/zap_impl.h +++ b/include/sys/zap_impl.h @@ -145,6 +145,7 @@ typedef struct zap { dmu_buf_user_t zap_dbu; objset_t *zap_objset; uint64_t zap_object; + dnode_t *zap_dnode; struct dmu_buf *zap_dbuf; krwlock_t zap_rwlock; boolean_t zap_ismicro; diff --git a/module/zfs/brt.c b/module/zfs/brt.c index 7ddec0b4b9bb..5e10df9dfe56 100644 --- a/module/zfs/brt.c +++ b/module/zfs/brt.c @@ -955,52 +955,10 @@ brt_entry_prefetch(brt_t *brt, uint64_t vdevid, brt_entry_t *bre) if (mos_entries == 0) return; - BRT_DEBUG("ZAP prefetch: object=%llu vdev=%llu offset=%llu", - (u_longlong_t)mos_entries, (u_longlong_t)vdevid, - (u_longlong_t)bre->bre_offset); (void) zap_prefetch_uint64(brt->brt_mos, mos_entries, (uint64_t *)&bre->bre_offset, BRT_KEY_WORDS); } -static int -brt_entry_update(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre, dmu_tx_t *tx) -{ - int error; - - ASSERT(RW_LOCK_HELD(&brt->brt_lock)); - ASSERT(brtvd->bv_mos_entries != 0); - ASSERT(bre->bre_refcount > 0); - - error = zap_update_uint64(brt->brt_mos, brtvd->bv_mos_entries, - (uint64_t *)&bre->bre_offset, BRT_KEY_WORDS, 1, - sizeof (bre->bre_refcount), &bre->bre_refcount, tx); - BRT_DEBUG("ZAP update: object=%llu vdev=%llu offset=%llu count=%llu " - "error=%d", (u_longlong_t)brtvd->bv_mos_entries, - (u_longlong_t)brtvd->bv_vdevid, (u_longlong_t)bre->bre_offset, - (u_longlong_t)bre->bre_refcount, error); - - return (error); -} - -static int -brt_entry_remove(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre, dmu_tx_t *tx) -{ - int error; - - ASSERT(RW_LOCK_HELD(&brt->brt_lock)); - ASSERT(brtvd->bv_mos_entries != 0); - ASSERT0(bre->bre_refcount); - - error = zap_remove_uint64(brt->brt_mos, brtvd->bv_mos_entries, - (uint64_t *)&bre->bre_offset, BRT_KEY_WORDS, tx); - BRT_DEBUG("ZAP remove: object=%llu vdev=%llu offset=%llu count=%llu " - "error=%d", (u_longlong_t)brtvd->bv_mos_entries, - (u_longlong_t)brtvd->bv_vdevid, (u_longlong_t)bre->bre_offset, - (u_longlong_t)bre->bre_refcount, error); - - return (error); -} - /* * Return TRUE if we _can_ have BRT entry for this bp. 
It might be false * positive, but gives us quick answer if we should look into BRT, which @@ -1559,24 +1517,16 @@ brt_pending_apply(spa_t *spa, uint64_t txg) } static void -brt_sync_entry(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre, dmu_tx_t *tx) +brt_sync_entry(dnode_t *dn, brt_entry_t *bre, dmu_tx_t *tx) { - - ASSERT(RW_WRITE_HELD(&brt->brt_lock)); - ASSERT(brtvd->bv_mos_entries != 0); - if (bre->bre_refcount == 0) { - int error; - - error = brt_entry_remove(brt, brtvd, bre, tx); - ASSERT(error == 0 || error == ENOENT); - /* - * If error == ENOENT then zfs_clone_range() was done from a - * removed (but opened) file (open(), unlink()). - */ - ASSERT(brt_entry_lookup(brt, brtvd, bre) == ENOENT); + int error = zap_remove_uint64_by_dnode(dn, &bre->bre_offset, + BRT_KEY_WORDS, tx); + VERIFY(error == 0 || error == ENOENT); } else { - VERIFY0(brt_entry_update(brt, brtvd, bre, tx)); + VERIFY0(zap_update_uint64_by_dnode(dn, &bre->bre_offset, + BRT_KEY_WORDS, 1, sizeof (bre->bre_refcount), + &bre->bre_refcount, tx)); } } @@ -1585,6 +1535,7 @@ brt_sync_table(brt_t *brt, dmu_tx_t *tx) { brt_vdev_t *brtvd; brt_entry_t *bre; + dnode_t *dn; uint64_t vdevid; void *c; @@ -1608,14 +1559,19 @@ brt_sync_table(brt_t *brt, dmu_tx_t *tx) if (brtvd->bv_mos_brtvdev == 0) brt_vdev_create(brt, brtvd, tx); + VERIFY0(dnode_hold(brt->brt_mos, brtvd->bv_mos_entries, + FTAG, &dn)); + c = NULL; while ((bre = avl_destroy_nodes(&brtvd->bv_tree, &c)) != NULL) { - brt_sync_entry(brt, brtvd, bre, tx); + brt_sync_entry(dn, bre, tx); brt_entry_free(bre); ASSERT(brt->brt_nentries > 0); brt->brt_nentries--; } + dnode_rele(dn, FTAG); + brt_vdev_sync(brt, brtvd, tx); if (brtvd->bv_totalcount == 0) diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 280001bc34b6..ae5657d762f5 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -4127,21 +4127,6 @@ dmu_buf_get_objset(dmu_buf_t *db) return (dbi->db_objset); } -dnode_t * -dmu_buf_dnode_enter(dmu_buf_t *db) -{ - dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; - DB_DNODE_ENTER(dbi); - return (DB_DNODE(dbi)); -} - -void -dmu_buf_dnode_exit(dmu_buf_t *db) -{ - dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; - DB_DNODE_EXIT(dbi); -} - static void dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db) { diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index d82211e6d4c7..8986f55e792a 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -712,8 +712,6 @@ dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset, uint64_t len, zio_priority_t pri) { dnode_t *dn; - int64_t level2 = level; - uint64_t start, end, start2, end2; if (dmu_prefetch_max == 0 || len == 0) { dmu_prefetch_dnode(os, object, pri); @@ -723,6 +721,18 @@ dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset, if (dnode_hold(os, object, FTAG, &dn) != 0) return; + dmu_prefetch_by_dnode(dn, level, offset, len, pri); + + dnode_rele(dn, FTAG); +} + +void +dmu_prefetch_by_dnode(dnode_t *dn, int64_t level, uint64_t offset, + uint64_t len, zio_priority_t pri) +{ + int64_t level2 = level; + uint64_t start, end, start2, end2; + /* * Depending on len we may do two prefetches: blocks [start, end) at * level, and following blocks [start2, end2) at higher level2. 
@@ -762,8 +772,6 @@ dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset, for (uint64_t i = start2; i < end2; i++) dbuf_prefetch(dn, level2, i, pri, 0); rw_exit(&dn->dn_struct_rwlock); - - dnode_rele(dn, FTAG); } /* @@ -2563,6 +2571,8 @@ EXPORT_SYMBOL(dmu_bonus_hold_by_dnode); EXPORT_SYMBOL(dmu_buf_hold_array_by_bonus); EXPORT_SYMBOL(dmu_buf_rele_array); EXPORT_SYMBOL(dmu_prefetch); +EXPORT_SYMBOL(dmu_prefetch_by_dnode); +EXPORT_SYMBOL(dmu_prefetch_dnode); EXPORT_SYMBOL(dmu_free_range); EXPORT_SYMBOL(dmu_free_long_range); EXPORT_SYMBOL(dmu_free_long_object); diff --git a/module/zfs/dmu_recv.c b/module/zfs/dmu_recv.c index 2cf10909738b..9f1c25f866f7 100644 --- a/module/zfs/dmu_recv.c +++ b/module/zfs/dmu_recv.c @@ -2353,7 +2353,6 @@ receive_process_write_record(struct receive_writer_arg *rwa, if (rwa->heal) { blkptr_t *bp; dmu_buf_t *dbp; - dnode_t *dn; int flags = DB_RF_CANFAIL; if (rwa->raw) @@ -2385,19 +2384,15 @@ receive_process_write_record(struct receive_writer_arg *rwa, dmu_buf_rele(dbp, FTAG); return (err); } - dn = dmu_buf_dnode_enter(dbp); /* Make sure the on-disk block and recv record sizes match */ - if (drrw->drr_logical_size != - dn->dn_datablkszsec << SPA_MINBLOCKSHIFT) { + if (drrw->drr_logical_size != dbp->db_size) { err = ENOTSUP; - dmu_buf_dnode_exit(dbp); dmu_buf_rele(dbp, FTAG); return (err); } /* Get the block pointer for the corrupted block */ bp = dmu_buf_get_blkptr(dbp); err = do_corrective_recv(rwa, drrw, rrd, bp); - dmu_buf_dnode_exit(dbp); dmu_buf_rele(dbp, FTAG); return (err); } diff --git a/module/zfs/zap.c b/module/zfs/zap.c index dde05d7005c2..da86defb445c 100644 --- a/module/zfs/zap.c +++ b/module/zfs/zap.c @@ -133,7 +133,7 @@ fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags) * set up block 1 - the first leaf */ dmu_buf_t *db; - VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object, + VERIFY0(dmu_buf_hold_by_dnode(zap->zap_dnode, 1<zt_numblks * 2); tbl->zt_nextblk = newblk; ASSERT0(tbl->zt_blks_copied); - dmu_prefetch(zap->zap_objset, zap->zap_object, 0, + dmu_prefetch_by_dnode(zap->zap_dnode, 0, tbl->zt_blk << bs, tbl->zt_numblks << bs, ZIO_PRIORITY_SYNC_READ); } @@ -193,21 +193,21 @@ zap_table_grow(zap_t *zap, zap_table_phys_t *tbl, uint64_t b = tbl->zt_blks_copied; dmu_buf_t *db_old; - int err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + int err = dmu_buf_hold_by_dnode(zap->zap_dnode, (tbl->zt_blk + b) << bs, FTAG, &db_old, DMU_READ_NO_PREFETCH); if (err != 0) return (err); /* first half of entries in old[b] go to new[2*b+0] */ dmu_buf_t *db_new; - VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object, + VERIFY0(dmu_buf_hold_by_dnode(zap->zap_dnode, (newblk + 2*b+0) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH)); dmu_buf_will_dirty(db_new, tx); transfer_func(db_old->db_data, db_new->db_data, hepb); dmu_buf_rele(db_new, FTAG); /* second half of entries in old[b] go to new[2*b+1] */ - VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object, + VERIFY0(dmu_buf_hold_by_dnode(zap->zap_dnode, (newblk + 2*b+1) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH)); dmu_buf_will_dirty(db_new, tx); transfer_func((uint64_t *)db_old->db_data + hepb, @@ -255,7 +255,7 @@ zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val, uint64_t off = idx & ((1<<(bs-3))-1); dmu_buf_t *db; - int err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + int err = dmu_buf_hold_by_dnode(zap->zap_dnode, (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH); if (err != 0) return (err); @@ -267,7 +267,7 @@ zap_table_store(zap_t *zap, 
zap_table_phys_t *tbl, uint64_t idx, uint64_t val, uint64_t off2 = idx2 & ((1<<(bs-3))-1); dmu_buf_t *db2; - err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + err = dmu_buf_hold_by_dnode(zap->zap_dnode, (tbl->zt_nextblk + blk2) << bs, FTAG, &db2, DMU_READ_NO_PREFETCH); if (err != 0) { @@ -296,16 +296,9 @@ zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp) uint64_t blk = idx >> (bs-3); uint64_t off = idx & ((1<<(bs-3))-1); - /* - * Note: this is equivalent to dmu_buf_hold(), but we use - * _dnode_enter / _by_dnode because it's faster because we don't - * have to hold the dnode. - */ - dnode_t *dn = dmu_buf_dnode_enter(zap->zap_dbuf); dmu_buf_t *db; - int err = dmu_buf_hold_by_dnode(dn, + int err = dmu_buf_hold_by_dnode(zap->zap_dnode, (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH); - dmu_buf_dnode_exit(zap->zap_dbuf); if (err != 0) return (err); *valp = ((uint64_t *)db->db_data)[off]; @@ -319,11 +312,9 @@ zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp) */ blk = (idx*2) >> (bs-3); - dn = dmu_buf_dnode_enter(zap->zap_dbuf); - err = dmu_buf_hold_by_dnode(dn, + err = dmu_buf_hold_by_dnode(zap->zap_dnode, (tbl->zt_nextblk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH); - dmu_buf_dnode_exit(zap->zap_dbuf); if (err == 0) dmu_buf_rele(db, FTAG); } @@ -368,7 +359,7 @@ zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx) uint64_t newblk = zap_allocate_blocks(zap, 1); dmu_buf_t *db_new; - int err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + int err = dmu_buf_hold_by_dnode(zap->zap_dnode, newblk << FZAP_BLOCK_SHIFT(zap), FTAG, &db_new, DMU_READ_NO_PREFETCH); if (err != 0) @@ -433,7 +424,7 @@ zap_create_leaf(zap_t *zap, dmu_tx_t *tx) l->l_blkid = zap_allocate_blocks(zap, 1); l->l_dbuf = NULL; - VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object, + VERIFY0(dmu_buf_hold_by_dnode(zap->zap_dnode, l->l_blkid << FZAP_BLOCK_SHIFT(zap), NULL, &l->l_dbuf, DMU_READ_NO_PREFETCH)); dmu_buf_init_user(&l->l_dbu, zap_leaf_evict_sync, NULL, &l->l_dbuf); @@ -533,10 +524,8 @@ zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt, return (SET_ERROR(ENOENT)); int bs = FZAP_BLOCK_SHIFT(zap); - dnode_t *dn = dmu_buf_dnode_enter(zap->zap_dbuf); - int err = dmu_buf_hold_by_dnode(dn, + int err = dmu_buf_hold_by_dnode(zap->zap_dnode, blkid << bs, NULL, &db, DMU_READ_NO_PREFETCH); - dmu_buf_dnode_exit(zap->zap_dbuf); if (err != 0) return (err); @@ -985,7 +974,7 @@ fzap_prefetch(zap_name_t *zn) if (zap_idx_to_blk(zap, idx, &blk) != 0) return; int bs = FZAP_BLOCK_SHIFT(zap); - dmu_prefetch(zap->zap_objset, zap->zap_object, 0, blk << bs, 1 << bs, + dmu_prefetch_by_dnode(zap->zap_dnode, 0, blk << bs, 1 << bs, ZIO_PRIORITY_SYNC_READ); } @@ -1228,7 +1217,7 @@ fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za) */ if (zc->zc_hash == 0 && zap_iterate_prefetch && zc->zc_prefetch && zap_f_phys(zap)->zap_freeblk > 2) { - dmu_prefetch(zc->zc_objset, zc->zc_zapobj, 0, 0, + dmu_prefetch_by_dnode(zap->zap_dnode, 0, 0, zap_f_phys(zap)->zap_freeblk << FZAP_BLOCK_SHIFT(zap), ZIO_PRIORITY_ASYNC_READ); } @@ -1356,7 +1345,7 @@ fzap_get_stats(zap_t *zap, zap_stats_t *zs) zap_stats_ptrtbl(zap, &ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap), zs); } else { - dmu_prefetch(zap->zap_objset, zap->zap_object, 0, + dmu_prefetch_by_dnode(zap->zap_dnode, 0, zap_f_phys(zap)->zap_ptrtbl.zt_blk << bs, zap_f_phys(zap)->zap_ptrtbl.zt_numblks << bs, ZIO_PRIORITY_SYNC_READ); @@ -1366,7 +1355,7 @@ fzap_get_stats(zap_t *zap, zap_stats_t 
*zs) dmu_buf_t *db; int err; - err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + err = dmu_buf_hold_by_dnode(zap->zap_dnode, (zap_f_phys(zap)->zap_ptrtbl.zt_blk + b) << bs, FTAG, &db, DMU_READ_NO_PREFETCH); if (err == 0) { diff --git a/module/zfs/zap_micro.c b/module/zfs/zap_micro.c index 085d9cd8b4b6..d806988af96d 100644 --- a/module/zfs/zap_micro.c +++ b/module/zfs/zap_micro.c @@ -415,7 +415,7 @@ mze_destroy(zap_t *zap) } static zap_t * -mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db) +mzap_open(dmu_buf_t *db) { zap_t *winner; uint64_t *zap_hdr = (uint64_t *)db->db_data; @@ -427,8 +427,8 @@ mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db) zap_t *zap = kmem_zalloc(sizeof (zap_t), KM_SLEEP); rw_init(&zap->zap_rwlock, NULL, RW_DEFAULT, NULL); rw_enter(&zap->zap_rwlock, RW_WRITER); - zap->zap_objset = os; - zap->zap_object = obj; + zap->zap_objset = dmu_buf_get_objset(db); + zap->zap_object = db->db_object; zap->zap_dbuf = db; if (zap_block_type != ZBT_MICRO) { @@ -518,7 +518,7 @@ handle_winner: * have the specified tag. */ static int -zap_lockdir_impl(dmu_buf_t *db, const void *tag, dmu_tx_t *tx, +zap_lockdir_impl(dnode_t *dn, dmu_buf_t *db, const void *tag, dmu_tx_t *tx, krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp) { ASSERT0(db->db_offset); @@ -528,13 +528,13 @@ zap_lockdir_impl(dmu_buf_t *db, const void *tag, dmu_tx_t *tx, *zapp = NULL; - dmu_object_info_from_db(db, &doi); + dmu_object_info_from_dnode(dn, &doi); if (DMU_OT_BYTESWAP(doi.doi_type) != DMU_BSWAP_ZAP) return (SET_ERROR(EINVAL)); zap_t *zap = dmu_buf_get_user(db); if (zap == NULL) { - zap = mzap_open(os, obj, db); + zap = mzap_open(db); if (zap == NULL) { /* * mzap_open() didn't like what it saw on-disk. @@ -563,6 +563,7 @@ zap_lockdir_impl(dmu_buf_t *db, const void *tag, dmu_tx_t *tx, } zap->zap_objset = os; + zap->zap_dnode = dn; if (lt == RW_WRITER) dmu_buf_will_dirty(db, tx); @@ -598,23 +599,16 @@ zap_lockdir_by_dnode(dnode_t *dn, dmu_tx_t *tx, zap_t **zapp) { dmu_buf_t *db; + int err; - int err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH); - if (err != 0) { + err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH); + if (err != 0) return (err); - } -#ifdef ZFS_DEBUG - { - dmu_object_info_t doi; - dmu_object_info_from_db(db, &doi); - ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP); - } -#endif - - err = zap_lockdir_impl(db, tag, tx, lti, fatreader, adding, zapp); - if (err != 0) { + err = zap_lockdir_impl(dn, db, tag, tx, lti, fatreader, adding, zapp); + if (err != 0) dmu_buf_rele(db, tag); - } + else + VERIFY(dnode_add_ref(dn, tag)); return (err); } @@ -623,21 +617,23 @@ zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx, krw_t lti, boolean_t fatreader, boolean_t adding, const void *tag, zap_t **zapp) { + dnode_t *dn; dmu_buf_t *db; + int err; - int err = dmu_buf_hold(os, obj, 0, tag, &db, DMU_READ_NO_PREFETCH); + err = dnode_hold(os, obj, tag, &dn); if (err != 0) return (err); -#ifdef ZFS_DEBUG - { - dmu_object_info_t doi; - dmu_object_info_from_db(db, &doi); - ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP); + err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH); + if (err != 0) { + dnode_rele(dn, tag); + return (err); } -#endif - err = zap_lockdir_impl(db, tag, tx, lti, fatreader, adding, zapp); - if (err != 0) + err = zap_lockdir_impl(dn, db, tag, tx, lti, fatreader, adding, zapp); + if (err != 0) { dmu_buf_rele(db, tag); + dnode_rele(dn, tag); + } return (err); } @@ -645,6 +641,7 @@ void zap_unlockdir(zap_t *zap, const 
void *tag) { rw_exit(&zap->zap_rwlock); + dnode_rele(zap->zap_dnode, tag); dmu_buf_rele(zap->zap_dbuf, tag); } @@ -730,7 +727,8 @@ mzap_create_impl(dnode_t *dn, int normflags, zap_flags_t flags, dmu_tx_t *tx) if (flags != 0) { zap_t *zap; /* Only fat zap supports flags; upgrade immediately. */ - VERIFY0(zap_lockdir_impl(db, FTAG, tx, RW_WRITER, + VERIFY(dnode_add_ref(dn, FTAG)); + VERIFY0(zap_lockdir_impl(dn, db, FTAG, tx, RW_WRITER, B_FALSE, B_FALSE, &zap)); VERIFY0(mzap_upgrade(&zap, FTAG, tx, flags)); zap_unlockdir(zap, FTAG); @@ -1325,6 +1323,26 @@ zap_add_by_dnode(dnode_t *dn, const char *key, return (err); } +static int +zap_add_uint64_impl(zap_t *zap, const uint64_t *key, + int key_numints, int integer_size, uint64_t num_integers, + const void *val, dmu_tx_t *tx, const void *tag) +{ + int err; + + zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); + if (zn == NULL) { + zap_unlockdir(zap, tag); + return (SET_ERROR(ENOTSUP)); + } + err = fzap_add(zn, integer_size, num_integers, val, tag, tx); + zap = zn->zn_zap; /* fzap_add() may change zap */ + zap_name_free(zn); + if (zap != NULL) /* may be NULL if fzap_add() failed */ + zap_unlockdir(zap, tag); + return (err); +} + int zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, int integer_size, uint64_t num_integers, @@ -1336,16 +1354,26 @@ zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); if (err != 0) return (err); - zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); - if (zn == NULL) { - zap_unlockdir(zap, FTAG); - return (SET_ERROR(ENOTSUP)); - } - err = fzap_add(zn, integer_size, num_integers, val, FTAG, tx); - zap = zn->zn_zap; /* fzap_add() may change zap */ - zap_name_free(zn); - if (zap != NULL) /* may be NULL if fzap_add() failed */ - zap_unlockdir(zap, FTAG); + err = zap_add_uint64_impl(zap, key, key_numints, + integer_size, num_integers, val, tx, FTAG); + /* zap_add_uint64_impl() calls zap_unlockdir() */ + return (err); +} + +int +zap_add_uint64_by_dnode(dnode_t *dn, const uint64_t *key, + int key_numints, int integer_size, uint64_t num_integers, + const void *val, dmu_tx_t *tx) +{ + zap_t *zap; + + int err = + zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); + if (err != 0) + return (err); + err = zap_add_uint64_impl(zap, key, key_numints, + integer_size, num_integers, val, tx, FTAG); + /* zap_add_uint64_impl() calls zap_unlockdir() */ return (err); } @@ -1396,27 +1424,56 @@ zap_update(objset_t *os, uint64_t zapobj, const char *name, return (err); } -int -zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, - int key_numints, - int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) +static int +zap_update_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints, + int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx, + const void *tag) { - zap_t *zap; + int err; - int err = - zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); - if (err != 0) - return (err); zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); if (zn == NULL) { - zap_unlockdir(zap, FTAG); + zap_unlockdir(zap, tag); return (SET_ERROR(ENOTSUP)); } - err = fzap_update(zn, integer_size, num_integers, val, FTAG, tx); + err = fzap_update(zn, integer_size, num_integers, val, tag, tx); zap = zn->zn_zap; /* fzap_update() may change zap */ zap_name_free(zn); if (zap != NULL) /* may be NULL if fzap_upgrade() failed */ - zap_unlockdir(zap, 
FTAG); + zap_unlockdir(zap, tag); + return (err); +} + +int +zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, + int key_numints, int integer_size, uint64_t num_integers, const void *val, + dmu_tx_t *tx) +{ + zap_t *zap; + + int err = + zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); + if (err != 0) + return (err); + err = zap_update_uint64_impl(zap, key, key_numints, + integer_size, num_integers, val, tx, FTAG); + /* zap_update_uint64_impl() calls zap_unlockdir() */ + return (err); +} + +int +zap_update_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints, + int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) +{ + zap_t *zap; + + int err = + zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); + if (err != 0) + return (err); + err = zap_update_uint64_impl(zap, key, key_numints, + integer_size, num_integers, val, tx, FTAG); + /* zap_update_uint64_impl() calls zap_unlockdir() */ return (err); } @@ -1481,6 +1538,23 @@ zap_remove_by_dnode(dnode_t *dn, const char *name, dmu_tx_t *tx) return (err); } +static int +zap_remove_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints, + dmu_tx_t *tx, const void *tag) +{ + int err; + + zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); + if (zn == NULL) { + zap_unlockdir(zap, tag); + return (SET_ERROR(ENOTSUP)); + } + err = fzap_remove(zn, tx); + zap_name_free(zn); + zap_unlockdir(zap, tag); + return (err); +} + int zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, dmu_tx_t *tx) @@ -1491,14 +1565,23 @@ zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap); if (err != 0) return (err); - zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); - if (zn == NULL) { - zap_unlockdir(zap, FTAG); - return (SET_ERROR(ENOTSUP)); - } - err = fzap_remove(zn, tx); - zap_name_free(zn); - zap_unlockdir(zap, FTAG); + err = zap_remove_uint64_impl(zap, key, key_numints, tx, FTAG); + /* zap_remove_uint64_impl() calls zap_unlockdir() */ + return (err); +} + +int +zap_remove_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints, + dmu_tx_t *tx) +{ + zap_t *zap; + + int err = + zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap); + if (err != 0) + return (err); + err = zap_remove_uint64_impl(zap, key, key_numints, tx, FTAG); + /* zap_remove_uint64_impl() calls zap_unlockdir() */ return (err); } @@ -1704,14 +1787,17 @@ EXPORT_SYMBOL(zap_prefetch_uint64); EXPORT_SYMBOL(zap_add); EXPORT_SYMBOL(zap_add_by_dnode); EXPORT_SYMBOL(zap_add_uint64); +EXPORT_SYMBOL(zap_add_uint64_by_dnode); EXPORT_SYMBOL(zap_update); EXPORT_SYMBOL(zap_update_uint64); +EXPORT_SYMBOL(zap_update_uint64_by_dnode); EXPORT_SYMBOL(zap_length); EXPORT_SYMBOL(zap_length_uint64); EXPORT_SYMBOL(zap_remove); EXPORT_SYMBOL(zap_remove_by_dnode); EXPORT_SYMBOL(zap_remove_norm); EXPORT_SYMBOL(zap_remove_uint64); +EXPORT_SYMBOL(zap_remove_uint64_by_dnode); EXPORT_SYMBOL(zap_count); EXPORT_SYMBOL(zap_value_search); EXPORT_SYMBOL(zap_join); -- cgit v1.2.3 From 457e62d7ca0e809412ca7fba184fd9190530cf1d Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Mon, 25 Mar 2024 17:59:55 -0400 Subject: BRT: Relax brt_pending_apply() locking Since brt_pending_apply() is running in syncing context, no other brt_pending_tree accesses are possible for the TXG. We don't need to acquire brt_pending_lock here. 
Reviewed-by: Pawel Jakub Dawidek Reviewed-by: Brian Behlendorf Reviewed-by: Brian Atkinson Reviewed-by: Rob Norris Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #15955 --- module/zfs/brt.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/module/zfs/brt.c b/module/zfs/brt.c index 5e10df9dfe56..416caeb11c7e 100644 --- a/module/zfs/brt.c +++ b/module/zfs/brt.c @@ -1473,26 +1473,23 @@ brt_pending_remove(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx) void brt_pending_apply(spa_t *spa, uint64_t txg) { - brt_t *brt; + brt_t *brt = spa->spa_brt; brt_pending_entry_t *bpe; avl_tree_t *pending_tree; - kmutex_t *pending_lock; void *c; ASSERT3U(txg, !=, 0); - brt = spa->spa_brt; + /* + * We are in syncing context, so no other brt_pending_tree accesses + * are possible for the TXG. Don't need to acquire brt_pending_lock. + */ pending_tree = &brt->brt_pending_tree[txg & TXG_MASK]; - pending_lock = &brt->brt_pending_lock[txg & TXG_MASK]; - - mutex_enter(pending_lock); c = NULL; while ((bpe = avl_destroy_nodes(pending_tree, &c)) != NULL) { boolean_t added_to_ddt; - mutex_exit(pending_lock); - for (int i = 0; i < bpe->bpe_count; i++) { /* * If the block has DEDUP bit set, it means that it @@ -1510,10 +1507,7 @@ brt_pending_apply(spa_t *spa, uint64_t txg) } kmem_cache_free(brt_pending_entry_cache, bpe); - mutex_enter(pending_lock); } - - mutex_exit(pending_lock); } static void -- cgit v1.2.3 From c94f73007855f8a6b36ad6f0c0c48b93a1577e1d Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Mon, 25 Mar 2024 18:02:38 -0400 Subject: BRT: Make BRT block sizes configurable Similar to DDT make BRT data and indirect block sizes configurable via module parameters. I am not sure what would be the best yet, but similar to DDT 4KB blocks kill all chances of compression on vdev with ashift=12 or more, that on my tests reaches 3x. While here, fix documentation for respective DDT parameters. Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #15967 --- man/man4/zfs.4 | 17 +++++++++++++++-- module/zfs/brt.c | 22 +++++++++++----------- 2 files changed, 26 insertions(+), 13 deletions(-) diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 5307f1f32e93..24ea390d6e9a 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -245,12 +245,25 @@ For blocks that could be forced to be a gang block (due to .Sy metaslab_force_ganging ) , force this many of them to be gang blocks. . -.It Sy zfs_ddt_zap_default_bs Ns = Ns Sy 15 Po 32 KiB Pc Pq int +.It Sy brt_zap_prefetch Ns = Ns Sy 1 Ns | Ns 0 Pq int +Controls prefetching BRT records for blocks which are going to be cloned. +. +.It Sy brt_zap_default_bs Ns = Ns Sy 12 Po 4 KiB Pc Pq int +Default BRT ZAP data block size as a power of 2. Note that changing this after +creating a BRT on the pool will not affect existing BRTs, only newly created +ones. +. +.It Sy brt_zap_default_ibs Ns = Ns Sy 12 Po 4 KiB Pc Pq int +Default BRT ZAP indirect block size as a power of 2. Note that changing this +after creating a BRT on the pool will not affect existing BRTs, only newly +created ones. +. +.It Sy ddt_zap_default_bs Ns = Ns Sy 15 Po 32 KiB Pc Pq int Default DDT ZAP data block size as a power of 2. Note that changing this after creating a DDT on the pool will not affect existing DDTs, only newly created ones. . -.It Sy zfs_ddt_zap_default_ibs Ns = Ns Sy 15 Po 32 KiB Pc Pq int +.It Sy ddt_zap_default_ibs Ns = Ns Sy 15 Po 32 KiB Pc Pq int Default DDT ZAP indirect block size as a power of 2. 
Note that changing this after creating a DDT on the pool will not affect existing DDTs, only newly created ones. diff --git a/module/zfs/brt.c b/module/zfs/brt.c index 416caeb11c7e..014f26517ddd 100644 --- a/module/zfs/brt.c +++ b/module/zfs/brt.c @@ -248,7 +248,7 @@ static kmem_cache_t *brt_pending_entry_cache; /* * Enable/disable prefetching of BRT entries that we are going to modify. */ -int zfs_brt_prefetch = 1; +static int brt_zap_prefetch = 1; #ifdef ZFS_DEBUG #define BRT_DEBUG(...) do { \ @@ -260,8 +260,8 @@ int zfs_brt_prefetch = 1; #define BRT_DEBUG(...) do { } while (0) #endif -int brt_zap_leaf_blockshift = 12; -int brt_zap_indirect_blockshift = 12; +static int brt_zap_default_bs = 12; +static int brt_zap_default_ibs = 12; static kstat_t *brt_ksp; @@ -458,8 +458,7 @@ brt_vdev_create(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx) brtvd->bv_mos_entries = zap_create_flags(brt->brt_mos, 0, ZAP_FLAG_HASH64 | ZAP_FLAG_UINT64_KEY, DMU_OTN_ZAP_METADATA, - brt_zap_leaf_blockshift, brt_zap_indirect_blockshift, DMU_OT_NONE, - 0, tx); + brt_zap_default_bs, brt_zap_default_ibs, DMU_OT_NONE, 0, tx); VERIFY(brtvd->bv_mos_entries != 0); BRT_DEBUG("MOS entries created, object=%llu", (u_longlong_t)brtvd->bv_mos_entries); @@ -1363,7 +1362,7 @@ brt_prefetch(brt_t *brt, const blkptr_t *bp) ASSERT(bp != NULL); - if (!zfs_brt_prefetch) + if (!brt_zap_prefetch) return; brt_entry_fill(bp, &bre, &vdevid); @@ -1680,9 +1679,10 @@ brt_unload(spa_t *spa) } /* BEGIN CSTYLED */ -ZFS_MODULE_PARAM(zfs_brt, zfs_brt_, prefetch, INT, ZMOD_RW, - "Enable prefetching of BRT entries"); -#ifdef ZFS_BRT_DEBUG -ZFS_MODULE_PARAM(zfs_brt, zfs_brt_, debug, INT, ZMOD_RW, "BRT debug"); -#endif +ZFS_MODULE_PARAM(zfs_brt, , brt_zap_prefetch, INT, ZMOD_RW, + "Enable prefetching of BRT ZAP entries"); +ZFS_MODULE_PARAM(zfs_brt, , brt_zap_default_bs, UINT, ZMOD_RW, + "BRT ZAP leaf blockshift"); +ZFS_MODULE_PARAM(zfs_brt, , brt_zap_default_ibs, UINT, ZMOD_RW, + "BRT ZAP indirect blockshift"); /* END CSTYLED */ -- cgit v1.2.3 From 3e91a9c525f57e695a756cae2e6e1a7caa607a64 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Mon, 25 Mar 2024 20:13:45 -0400 Subject: BRT: Skip getting length in brt_entry_lookup() Unlike DDT, where ZAP values may have different lengths due to compression, all BRT entries are identical 8-byte counters. It does not make sense to first fetch the length only to assert it. zap_lookup_uint64() is specifically designed to work with counters of different sizes and returns an error if something odd is found. Calling it directly saves some measurable CPU time. Reviewed-by: Pawel Jakub Dawidek Reviewed-by: Brian Behlendorf Reviewed-by: Rob Norris Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc.
Closes #15950 --- module/zfs/brt.c | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) diff --git a/module/zfs/brt.c b/module/zfs/brt.c index 014f26517ddd..bf8fdf6ea4b4 100644 --- a/module/zfs/brt.c +++ b/module/zfs/brt.c @@ -900,7 +900,6 @@ static int brt_entry_lookup(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre) { uint64_t mos_entries; - uint64_t one, physsize; int error; ASSERT(RW_LOCK_HELD(&brt->brt_lock)); @@ -918,21 +917,8 @@ brt_entry_lookup(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre) brt_unlock(brt); - error = zap_length_uint64(brt->brt_mos, mos_entries, &bre->bre_offset, - BRT_KEY_WORDS, &one, &physsize); - if (error == 0) { - ASSERT3U(one, ==, 1); - ASSERT3U(physsize, ==, sizeof (bre->bre_refcount)); - - error = zap_lookup_uint64(brt->brt_mos, mos_entries, - &bre->bre_offset, BRT_KEY_WORDS, 1, - sizeof (bre->bre_refcount), &bre->bre_refcount); - BRT_DEBUG("ZAP lookup: object=%llu vdev=%llu offset=%llu " - "count=%llu error=%d", (u_longlong_t)mos_entries, - (u_longlong_t)brtvd->bv_vdevid, - (u_longlong_t)bre->bre_offset, - error == 0 ? (u_longlong_t)bre->bre_refcount : 0, error); - } + error = zap_lookup_uint64(brt->brt_mos, mos_entries, &bre->bre_offset, + BRT_KEY_WORDS, 1, sizeof (bre->bre_refcount), &bre->bre_refcount); brt_wlock(brt); -- cgit v1.2.3 From 2ea370a4e3fe55ee9fa7d8cffb14ae73eac6c576 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Mon, 18 Mar 2024 14:19:53 -0400 Subject: BRT: Fix holes cloning. - When reading L0 block pointers, handle buffers without block pointers and without dirty records as holes. Those appear when the dnode size was increased but the end was never written, so there are no new indirection levels to store the pointers. It makes no sense to return EAGAIN here, since sync won't create new indirection levels until there are actual writes. - When cloning blocks, set the destination hole's logical birth time to the current TXG. Otherwise, if we are cloning over existing data, newly created holes may not be properly replicated later. Use BP_SET_BIRTH() when possible to avoid replicating its logic. Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #15994 Closes #16007 --- module/zfs/dmu.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 8986f55e792a..7d07accc7c9e 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -2265,11 +2265,13 @@ dmu_read_l0_bps(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, if (bp == NULL) { /* - * The block was created in this transaction group, - * so it has no BP yet. + * The file size was increased, but the block was never + * written, otherwise we would either have the block + * pointer or the dirty record and would not get here. + * It is effectively a hole, so report it as such. */ - error = SET_ERROR(EAGAIN); - goto out; + BP_ZERO(&bps[i]); + continue; } /* * Make sure we clone only data blocks.
@@ -2361,18 +2363,16 @@ dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, ASSERT3U(dr->dr_txg, ==, tx->tx_txg); dl = &dr->dt.dl; dl->dr_overridden_by = *bp; - dl->dr_brtwrite = B_TRUE; - dl->dr_override_state = DR_OVERRIDDEN; - if (BP_IS_HOLE(bp)) { - dl->dr_overridden_by.blk_birth = 0; - dl->dr_overridden_by.blk_phys_birth = 0; - } else { - dl->dr_overridden_by.blk_birth = dr->dr_txg; + if (!BP_IS_HOLE(bp) || bp->blk_birth != 0) { if (!BP_IS_EMBEDDED(bp)) { - dl->dr_overridden_by.blk_phys_birth = - BP_PHYSICAL_BIRTH(bp); + BP_SET_BIRTH(&dl->dr_overridden_by, dr->dr_txg, + BP_PHYSICAL_BIRTH(bp)); + } else { + dl->dr_overridden_by.blk_birth = dr->dr_txg; } } + dl->dr_brtwrite = B_TRUE; + dl->dr_override_state = DR_OVERRIDDEN; mutex_exit(&db->db_mtx); -- cgit v1.2.3 From e3c1c9153f764db8fa7af778395be9bc403126ff Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Tue, 19 Mar 2024 12:25:14 -0400 Subject: BRT: Fix tests to work on non-empty pools It should not normally happen, but if it does, better to not fail everything for no good reason, or it may be hard to debug. Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #16007 --- .../tests/functional/bclone/bclone_common.kshlib | 47 ++++++++++++---------- 1 file changed, 26 insertions(+), 21 deletions(-) diff --git a/tests/zfs-tests/tests/functional/bclone/bclone_common.kshlib b/tests/zfs-tests/tests/functional/bclone/bclone_common.kshlib index 3b8eaea5bb54..84b92b4dcdc9 100644 --- a/tests/zfs-tests/tests/functional/bclone/bclone_common.kshlib +++ b/tests/zfs-tests/tests/functional/bclone/bclone_common.kshlib @@ -97,20 +97,19 @@ function verify_pool_prop_eq function verify_pool_props { - typeset -r dsize=$1 - typeset -r ratio=$2 + typeset -r oused=$1 + typeset -r osaved=$2 + typeset dsize=$3 + typeset ratio=$4 if [[ $dsize -eq 0 ]]; then - verify_pool_prop_eq bcloneused 0 - verify_pool_prop_eq bclonesaved 0 - verify_pool_prop_eq bcloneratio 1.00 - else - if [[ $ratio -eq 1 ]]; then - verify_pool_prop_eq bcloneused 0 - else - verify_pool_prop_eq bcloneused $dsize - fi - verify_pool_prop_eq bclonesaved $((dsize*(ratio-1))) + ratio=1 + elif [[ $ratio -eq 1 ]]; then + dsize=0 + fi + verify_pool_prop_eq bcloneused $(($oused+$dsize)) + verify_pool_prop_eq bclonesaved $(($osaved+dsize*(ratio-1))) + if [[ $oused -eq 0 ]]; then verify_pool_prop_eq bcloneratio "${ratio}.00" fi } @@ -124,16 +123,22 @@ function bclone_test typeset -r srcdir=$4 typeset -r dstdir=$5 typeset dsize + typeset oused + typeset osaved typeset -r original="${srcdir}/original" typeset -r clone="${dstdir}/clone" log_note "Testing file copy with datatype $datatype, file size $filesize, embedded $embedded" + # Save current block cloning stats for later use. + sync_pool $TESTPOOL + oused=$(get_pool_prop bcloneused $TESTPOOL) + osaved=$(get_pool_prop bclonesaved $TESTPOOL) + # Create a test file with known content. case $datatype in random|text) - sync_pool $TESTPOOL if [[ $datatype = "random" ]]; then dd if=/dev/urandom of=$original bs=$filesize count=1 2>/dev/null else @@ -146,13 +151,13 @@ function bclone_test sync_pool $TESTPOOL # It is hard to predict block sizes that will be used, # so just do one clone and take it from bcloneused. 
- filesize=$(zpool get -Hp -o value bcloneused $TESTPOOL) + dsize=$(get_pool_prop bcloneused $TESTPOOL) + dsize=$(($dsize-$oused)) if [[ $embedded = "false" ]]; then - log_must test $filesize -gt 0 + log_must test $dsize -gt 0 fi rm -f "${clone}-tmp" sync_pool $TESTPOOL - dsize=$filesize ;; hole) log_must truncate_test -s $filesize -f $original @@ -217,7 +222,7 @@ function bclone_test test_file_integrity $original_checksum "${clone}4" $filesize test_file_integrity $original_checksum "${clone}5" $filesize - verify_pool_props $dsize 7 + verify_pool_props $oused $osaved $dsize 7 # Clear cache and test after fresh import. log_must zpool export $TESTPOOL @@ -240,7 +245,7 @@ function bclone_test sync_pool $TESTPOOL - verify_pool_props $dsize 11 + verify_pool_props $oused $osaved $dsize 11 log_must zpool export $TESTPOOL log_must zpool import $TESTPOOL @@ -268,7 +273,7 @@ function bclone_test test_file_integrity $original_checksum "${clone}8" $filesize test_file_integrity $original_checksum "${clone}9" $filesize - verify_pool_props $dsize 6 + verify_pool_props $oused $osaved $dsize 6 rm -f "${clone}0" "${clone}2" "${clone}4" "${clone}8" "${clone}9" @@ -276,11 +281,11 @@ function bclone_test test_file_integrity $original_checksum "${clone}6" $filesize - verify_pool_props $dsize 1 + verify_pool_props $oused $osaved $dsize 1 rm -f "${clone}6" sync_pool $TESTPOOL - verify_pool_props $dsize 1 + verify_pool_props $oused $osaved $dsize 1 } -- cgit v1.2.3 From 39993c3dfee3580d0ef40e0c523a52f76552d870 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Tue, 19 Mar 2024 13:08:05 -0400 Subject: BRT: Check pool clone stats in more tests This should allow to catch some leaks, if those happen. While there fix some cosmetic issues. Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #16007 --- .../functional/bclone/bclone_corner_cases.kshlib | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/tests/zfs-tests/tests/functional/bclone/bclone_corner_cases.kshlib b/tests/zfs-tests/tests/functional/bclone/bclone_corner_cases.kshlib index ddfbfc999c4e..aeb8efe91715 100644 --- a/tests/zfs-tests/tests/functional/bclone/bclone_corner_cases.kshlib +++ b/tests/zfs-tests/tests/functional/bclone/bclone_corner_cases.kshlib @@ -66,7 +66,7 @@ function bclone_corner_cases_init export SECOND_HALF_ORIG0_CHECKSUM=$(second_half_checksum $ORIG0) export SECOND_HALF_ORIG1_CHECKSUM=$(second_half_checksum $ORIG1) export SECOND_HALF_ORIG2_CHECKSUM=$(second_half_checksum $ORIG2) - export ZEROS_CHECKSUM=$(dd if=/dev/zero bs=$HALFRECORDSIZE count=1 | sha256digest) + export ZEROS_CHECKSUM=$(dd if=/dev/zero bs=$HALFRECORDSIZE count=1 2>/dev/null | sha256digest) export FIRST_HALF_CHECKSUM="" export SECOND_HALF_CHECKSUM="" } @@ -210,6 +210,8 @@ function bclone_corner_cases_test typeset -r dstdir=$2 typeset limit=$3 typeset -i count=0 + typeset oused + typeset osaved if [[ $srcdir != "count" ]]; then if [[ -n "$limit" ]]; then @@ -217,6 +219,11 @@ function bclone_corner_cases_test limit=$(random_int_between 1 $total_count $((limit*2)) | sort -nu | head -n $limit | xargs) fi bclone_corner_cases_init $srcdir $dstdir + + # Save current block cloning stats for later use. 
+ sync_pool $TESTPOOL + oused=$(get_pool_prop bcloneused $TESTPOOL) + osaved=$(get_pool_prop bclonesaved $TESTPOOL) fi # @@ -285,21 +292,24 @@ function bclone_corner_cases_test overwrite_clone "$second_overwrite" if checksum_compare $read_after; then - log_note "existing: $existing / cached: $cached / first_clone: $first_clone / first_overwrite: $first_overwrite / read_before: $read_before / second_clone: $second_clone / read_after: $read_after" + log_note "existing: $existing / cached: $cached / first_clone: $first_clone / first_overwrite: $first_overwrite / read_before: $read_before / second_clone: $second_clone / second_overwrite: $second_overwrite / read_after: $read_after" else - log_fail "FAIL: existing: $existing / cached: $cached / first_clone: $first_clone / first_overwrite: $first_overwrite / read_before: $read_before / second_clone: $second_clone / read_after: $read_after" + log_fail "FAIL: existing: $existing / cached: $cached / first_clone: $first_clone / first_overwrite: $first_overwrite / read_before: $read_before / second_clone: $second_clone / second_overwrite: $second_overwrite / read_after: $read_after" fi log_must zpool export $TESTPOOL log_must zpool import $TESTPOOL if checksum_compare "yes"; then - log_note "existing: $existing / cached: $cached / first_clone: $first_clone / first_overwrite: $first_overwrite / read_before: $read_before / second_clone: $second_clone / read_after: $read_after / read_next_txg" + log_note "existing: $existing / cached: $cached / first_clone: $first_clone / first_overwrite: $first_overwrite / read_before: $read_before / second_clone: $second_clone / second_overwrite: $second_overwrite / read_after: $read_after / read_next_txg" else - log_fail "FAIL: existing: $existing / cached: $cached / first_clone: $first_clone / first_overwrite: $first_overwrite / read_before: $read_before / second_clone: $second_clone / read_after: $read_after / read_next_txg" + log_fail "FAIL: existing: $existing / cached: $cached / first_clone: $first_clone / first_overwrite: $first_overwrite / read_before: $read_before / second_clone: $second_clone / second_overwrite: $second_overwrite / read_after: $read_after / read_next_txg" fi rm -f "$CLONE" + sync_pool $TESTPOOL + verify_pool_prop_eq bcloneused $oused + verify_pool_prop_eq bclonesaved $osaved done done done -- cgit v1.2.3 From d5fb6abd36401388e73ba7513697cb152fb11369 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Wed, 3 Apr 2024 18:04:26 -0400 Subject: Improve dbuf_read() error reporting Previous code reported non-ZIO errors only via return value, but not via parent ZIO. It could cause NULL-dereference panics due to dmu_buf_hold_array_by_dnode() ignoring the return value, relying solely on parent ZIO status. Reviewed-by: Brian Behlendorf Reviewed-by: Ameer Hamza Reported by: Ameer Hamza Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #16042 --- module/zfs/dbuf.c | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index ae5657d762f5..46c564c2572b 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -1542,17 +1542,14 @@ dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags) * returning. 
*/ static int -dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags, +dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags, db_lock_type_t dblt, const void *tag) { - dnode_t *dn; zbookmark_phys_t zb; uint32_t aflags = ARC_FLAG_NOWAIT; int err, zio_flags; blkptr_t bp, *bpp; - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); ASSERT(!zfs_refcount_is_zero(&db->db_holds)); ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); @@ -1627,8 +1624,6 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags, if (err != 0) goto early_unlock; - DB_DNODE_EXIT(db); - db->db_state = DB_READ; DTRACE_SET_STATE(db, "read issued"); mutex_exit(&db->db_mtx); @@ -1653,12 +1648,11 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags, * parent's rwlock, which would be a lock ordering violation. */ dmu_buf_unlock_parent(db, dblt, tag); - (void) arc_read(zio, db->db_objset->os_spa, bpp, + return (arc_read(zio, db->db_objset->os_spa, bpp, dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, zio_flags, - &aflags, &zb); - return (err); + &aflags, &zb)); + early_unlock: - DB_DNODE_EXIT(db); mutex_exit(&db->db_mtx); dmu_buf_unlock_parent(db, dblt, tag); return (err); @@ -1743,7 +1737,7 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) } int -dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) +dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags) { int err = 0; boolean_t prefetch; @@ -1759,7 +1753,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) dn = DB_DNODE(db); prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && - (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL; + (flags & DB_RF_NOPREFETCH) == 0; mutex_enter(&db->db_mtx); if (flags & DB_RF_PARTIAL_FIRST) @@ -1806,13 +1800,13 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG); - if (zio == NULL && (db->db_state == DB_NOFILL || + if (pio == NULL && (db->db_state == DB_NOFILL || (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)))) { spa_t *spa = dn->dn_objset->os_spa; - zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); + pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); need_wait = B_TRUE; } - err = dbuf_read_impl(db, zio, flags, dblt, FTAG); + err = dbuf_read_impl(db, dn, pio, flags, dblt, FTAG); /* * dbuf_read_impl has dropped db_mtx and our parent's rwlock * for us @@ -1833,9 +1827,10 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) */ if (need_wait) { if (err == 0) - err = zio_wait(zio); + err = zio_wait(pio); else - VERIFY0(zio_wait(zio)); + (void) zio_wait(pio); + pio = NULL; } } else { /* @@ -1862,7 +1857,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) ASSERT(db->db_state == DB_READ || (flags & DB_RF_HAVESTRUCT) == 0); DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *, - db, zio_t *, zio); + db, zio_t *, pio); cv_wait(&db->db_changed, &db->db_mtx); } if (db->db_state == DB_UNCACHED) @@ -1871,6 +1866,13 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) } } + if (pio && err != 0) { + zio_t *zio = zio_null(pio, pio->io_spa, NULL, NULL, NULL, + ZIO_FLAG_CANFAIL); + zio->io_error = err; + zio_nowait(zio); + } + return (err); } -- cgit v1.2.3 From 602b5dca7b0c9326768dfa38836b62497230be52 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Mon, 8 Apr 2024 15:03:18 -0400 Subject: Fix read errors race after block cloning Investigating read errors triggering panic fixed in #16042 I've found that we have a race in a sync process between the 
moment the dirty record for a cloned block is removed and the moment the dbuf is destroyed. If dmu_buf_hold_array_by_dnode() takes a hold on a cloned dbuf before it is synced/destroyed, then dbuf_read_impl() may see it still in DB_NOFILL state, but without the dirty record. Such a case is not an error, but is equivalent to DB_UNCACHED, since the dbuf block pointer is already updated by dbuf_write_ready(). Unfortunately, it is impossible to safely change the dbuf state to DB_UNCACHED there, since there may already be another cloning in progress that dropped the dbuf lock before creating a new dirty record, protected only by the range lock. Reviewed-by: Rob Norris Reviewed-by: Robert Evans Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #16052 --- module/zfs/dbuf.c | 41 ++++++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 46c564c2572b..42e5811c8597 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -1548,7 +1548,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags, zbookmark_phys_t zb; uint32_t aflags = ARC_FLAG_NOWAIT; int err, zio_flags; - blkptr_t bp, *bpp; + blkptr_t bp, *bpp = NULL; ASSERT(!zfs_refcount_is_zero(&db->db_holds)); ASSERT(MUTEX_HELD(&db->db_mtx)); @@ -1562,29 +1562,28 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags, goto early_unlock; } - if (db->db_state == DB_UNCACHED) { - if (db->db_blkptr == NULL) { - bpp = NULL; - } else { - bp = *db->db_blkptr; + /* + * If we have a pending block clone, we don't want to read the + * underlying block, but the content of the block being cloned, + * pointed by the dirty record, so we have the most recent data. + * If there is no dirty record, then we hit a race in a sync + * process when the dirty record is already removed, while the + * dbuf is not yet destroyed. Such case is equivalent to uncached. + */ + if (db->db_state == DB_NOFILL) { + dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records); + if (dr != NULL) { + if (!dr->dt.dl.dr_brtwrite) { + err = EIO; + goto early_unlock; + } + bp = dr->dt.dl.dr_overridden_by; + bpp = &bp; + } + } - } else { - dbuf_dirty_record_t *dr; - - ASSERT3S(db->db_state, ==, DB_NOFILL); + } - /* - * Block cloning: If we have a pending block clone, - * we don't want to read the underlying block, but the content - * of the block being cloned, so we have the most recent data. - */ - dr = list_head(&db->db_dirty_records); - if (dr == NULL || !dr->dt.dl.dr_brtwrite) { - err = EIO; - goto early_unlock; - } - bp = dr->dt.dl.dr_overridden_by; + if (bpp == NULL && db->db_blkptr != NULL) { + bp = *db->db_blkptr; bpp = &bp; } -- cgit v1.2.3 From 026fe796465e3da7b27d06ef5338634ee6dd30d8 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Mon, 8 Apr 2024 18:13:27 -0400 Subject: Speculative prefetch for reordered requests Before this change, the speculative prefetcher was able to detect a stream only if all of its accesses were perfectly sequential. It was easy to implement and is perfectly fine for single-threaded applications. Unfortunately, multi-threaded network servers, such as iSCSI, SMB or NFS, usually have plenty of threads and may often reorder requests, preventing successful speculation and prefetch. This change allows the speculative prefetcher to detect streams even if requests are reordered, by introducing a list of 9 non-contiguous ranges up to 16MB ahead of the current stream position and filling the gaps as more requests arrive.
It also allows stream to proceed even with holes up to a certain configurable threshold (25%). Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #16022 --- cmd/arc_summary | 11 +- include/sys/dmu_zfetch.h | 16 ++- man/man4/zfs.4 | 11 ++ module/zfs/dmu.c | 8 +- module/zfs/dmu_zfetch.c | 289 ++++++++++++++++++++++++++++++++++++++--------- 5 files changed, 272 insertions(+), 63 deletions(-) diff --git a/cmd/arc_summary b/cmd/arc_summary index 9c69ec4f8ccc..100fb1987a8b 100755 --- a/cmd/arc_summary +++ b/cmd/arc_summary @@ -793,18 +793,27 @@ def section_dmu(kstats_dict): zfetch_stats = isolate_section('zfetchstats', kstats_dict) - zfetch_access_total = int(zfetch_stats['hits'])+int(zfetch_stats['misses']) + zfetch_access_total = int(zfetch_stats['hits']) +\ + int(zfetch_stats['future']) + int(zfetch_stats['stride']) +\ + int(zfetch_stats['past']) + int(zfetch_stats['misses']) prt_1('DMU predictive prefetcher calls:', f_hits(zfetch_access_total)) prt_i2('Stream hits:', f_perc(zfetch_stats['hits'], zfetch_access_total), f_hits(zfetch_stats['hits'])) + future = int(zfetch_stats['future']) + int(zfetch_stats['stride']) + prt_i2('Hits ahead of stream:', f_perc(future, zfetch_access_total), + f_hits(future)) + prt_i2('Hits behind stream:', + f_perc(zfetch_stats['past'], zfetch_access_total), + f_hits(zfetch_stats['past'])) prt_i2('Stream misses:', f_perc(zfetch_stats['misses'], zfetch_access_total), f_hits(zfetch_stats['misses'])) prt_i2('Streams limit reached:', f_perc(zfetch_stats['max_streams'], zfetch_stats['misses']), f_hits(zfetch_stats['max_streams'])) + prt_i1('Stream strides:', f_hits(zfetch_stats['stride'])) prt_i1('Prefetches issued', f_hits(zfetch_stats['io_issued'])) print() diff --git a/include/sys/dmu_zfetch.h b/include/sys/dmu_zfetch.h index f00e13cf03a6..322472fb1ae2 100644 --- a/include/sys/dmu_zfetch.h +++ b/include/sys/dmu_zfetch.h @@ -45,18 +45,24 @@ typedef struct zfetch { int zf_numstreams; /* number of zstream_t's */ } zfetch_t; +typedef struct zsrange { + uint16_t start; + uint16_t end; +} zsrange_t; + +#define ZFETCH_RANGES 9 /* Fits zstream_t into 128 bytes */ + typedef struct zstream { + list_node_t zs_node; /* link for zf_stream */ uint64_t zs_blkid; /* expect next access at this blkid */ + uint_t zs_atime; /* time last prefetch issued */ + zsrange_t zs_ranges[ZFETCH_RANGES]; /* ranges from future */ unsigned int zs_pf_dist; /* data prefetch distance in bytes */ unsigned int zs_ipf_dist; /* L1 prefetch distance in bytes */ uint64_t zs_pf_start; /* first data block to prefetch */ uint64_t zs_pf_end; /* data block to prefetch up to */ uint64_t zs_ipf_start; /* first data block to prefetch L1 */ uint64_t zs_ipf_end; /* data block to prefetch L1 up to */ - - list_node_t zs_node; /* link for zf_stream */ - hrtime_t zs_atime; /* time last prefetch issued */ - zfetch_t *zs_fetch; /* parent fetch */ boolean_t zs_missed; /* stream saw cache misses */ boolean_t zs_more; /* need more distant prefetch */ zfs_refcount_t zs_callers; /* number of pending callers */ @@ -74,7 +80,7 @@ void dmu_zfetch_init(zfetch_t *, struct dnode *); void dmu_zfetch_fini(zfetch_t *); zstream_t *dmu_zfetch_prepare(zfetch_t *, uint64_t, uint64_t, boolean_t, boolean_t); -void dmu_zfetch_run(zstream_t *, boolean_t, boolean_t); +void dmu_zfetch_run(zfetch_t *, zstream_t *, boolean_t, boolean_t); void dmu_zfetch(zfetch_t *, uint64_t, uint64_t, boolean_t, boolean_t, boolean_t); diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 24ea390d6e9a..1191cc962492 100644 
--- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -544,6 +544,10 @@ However, this is limited by Maximum micro ZAP size. A micro ZAP is upgraded to a fat ZAP, once it grows beyond the specified size. . +.It Sy zfetch_hole_shift Ns = Ns Sy 2 Pq uint +Log2 fraction of holes in speculative prefetch stream allowed for it to +proceed. +. .It Sy zfetch_min_distance Ns = Ns Sy 4194304 Ns B Po 4 MiB Pc Pq uint Min bytes to prefetch per stream. Prefetch distance starts from the demand access size and quickly grows to @@ -558,6 +562,13 @@ Max bytes to prefetch per stream. .It Sy zfetch_max_idistance Ns = Ns Sy 67108864 Ns B Po 64 MiB Pc Pq uint Max bytes to prefetch indirects for per stream. . +.It Sy zfetch_max_reorder Ns = Ns Sy 16777216 Ns B Po 16 MiB Pc Pq uint +Requests within this byte distance from the current prefetch stream position +are considered parts of the stream, reordered due to parallel processing. +Such requests do not advance the stream position immediately unless +.Sy zfetch_hole_shift +fill threshold is reached, but saved to fill holes in the stream later. +. .It Sy zfetch_max_streams Ns = Ns Sy 8 Pq uint Max number of streams per zfetch (prefetch streams per file). . diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 7d07accc7c9e..d8d5cfdbd230 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -569,8 +569,10 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, for (i = 0; i < nblks; i++) { dmu_buf_impl_t *db = dbuf_hold(dn, blkid + i, tag); if (db == NULL) { - if (zs) - dmu_zfetch_run(zs, missed, B_TRUE); + if (zs) { + dmu_zfetch_run(&dn->dn_zfetch, zs, missed, + B_TRUE); + } rw_exit(&dn->dn_struct_rwlock); dmu_buf_rele_array(dbp, nblks, tag); if (read) @@ -606,7 +608,7 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, zfs_racct_write(length, nblks); if (zs) - dmu_zfetch_run(zs, missed, B_TRUE); + dmu_zfetch_run(&dn->dn_zfetch, zs, missed, B_TRUE); rw_exit(&dn->dn_struct_rwlock); if (read) { diff --git a/module/zfs/dmu_zfetch.c b/module/zfs/dmu_zfetch.c index d0acaf502066..e26195176036 100644 --- a/module/zfs/dmu_zfetch.c +++ b/module/zfs/dmu_zfetch.c @@ -65,9 +65,16 @@ unsigned int zfetch_max_distance = 64 * 1024 * 1024; #endif /* max bytes to prefetch indirects for per stream (default 64MB) */ unsigned int zfetch_max_idistance = 64 * 1024 * 1024; +/* max request reorder distance within a stream (default 16MB) */ +unsigned int zfetch_max_reorder = 16 * 1024 * 1024; +/* Max log2 fraction of holes in a stream */ +unsigned int zfetch_hole_shift = 2; typedef struct zfetch_stats { kstat_named_t zfetchstat_hits; + kstat_named_t zfetchstat_future; + kstat_named_t zfetchstat_stride; + kstat_named_t zfetchstat_past; kstat_named_t zfetchstat_misses; kstat_named_t zfetchstat_max_streams; kstat_named_t zfetchstat_io_issued; @@ -76,6 +83,9 @@ typedef struct zfetch_stats { static zfetch_stats_t zfetch_stats = { { "hits", KSTAT_DATA_UINT64 }, + { "future", KSTAT_DATA_UINT64 }, + { "stride", KSTAT_DATA_UINT64 }, + { "past", KSTAT_DATA_UINT64 }, { "misses", KSTAT_DATA_UINT64 }, { "max_streams", KSTAT_DATA_UINT64 }, { "io_issued", KSTAT_DATA_UINT64 }, @@ -84,6 +94,9 @@ static zfetch_stats_t zfetch_stats = { struct { wmsum_t zfetchstat_hits; + wmsum_t zfetchstat_future; + wmsum_t zfetchstat_stride; + wmsum_t zfetchstat_past; wmsum_t zfetchstat_misses; wmsum_t zfetchstat_max_streams; wmsum_t zfetchstat_io_issued; @@ -107,6 +120,12 @@ zfetch_kstats_update(kstat_t *ksp, int rw) return (EACCES); zs->zfetchstat_hits.value.ui64 = 
wmsum_value(&zfetch_sums.zfetchstat_hits); + zs->zfetchstat_future.value.ui64 = + wmsum_value(&zfetch_sums.zfetchstat_future); + zs->zfetchstat_stride.value.ui64 = + wmsum_value(&zfetch_sums.zfetchstat_stride); + zs->zfetchstat_past.value.ui64 = + wmsum_value(&zfetch_sums.zfetchstat_past); zs->zfetchstat_misses.value.ui64 = wmsum_value(&zfetch_sums.zfetchstat_misses); zs->zfetchstat_max_streams.value.ui64 = @@ -122,6 +141,9 @@ void zfetch_init(void) { wmsum_init(&zfetch_sums.zfetchstat_hits, 0); + wmsum_init(&zfetch_sums.zfetchstat_future, 0); + wmsum_init(&zfetch_sums.zfetchstat_stride, 0); + wmsum_init(&zfetch_sums.zfetchstat_past, 0); wmsum_init(&zfetch_sums.zfetchstat_misses, 0); wmsum_init(&zfetch_sums.zfetchstat_max_streams, 0); wmsum_init(&zfetch_sums.zfetchstat_io_issued, 0); @@ -147,6 +169,9 @@ zfetch_fini(void) } wmsum_fini(&zfetch_sums.zfetchstat_hits); + wmsum_fini(&zfetch_sums.zfetchstat_future); + wmsum_fini(&zfetch_sums.zfetchstat_stride); + wmsum_fini(&zfetch_sums.zfetchstat_past); wmsum_fini(&zfetch_sums.zfetchstat_misses); wmsum_fini(&zfetch_sums.zfetchstat_max_streams); wmsum_fini(&zfetch_sums.zfetchstat_io_issued); @@ -222,22 +247,22 @@ static void dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid) { zstream_t *zs, *zs_next, *zs_old = NULL; - hrtime_t now = gethrtime(), t; + uint_t now = gethrestime_sec(), t; ASSERT(MUTEX_HELD(&zf->zf_lock)); /* * Delete too old streams, reusing the first found one. */ - t = now - SEC2NSEC(zfetch_max_sec_reap); + t = now - zfetch_max_sec_reap; for (zs = list_head(&zf->zf_stream); zs != NULL; zs = zs_next) { zs_next = list_next(&zf->zf_stream, zs); /* * Skip if still active. 1 -- zf_stream reference. */ - if (zfs_refcount_count(&zs->zs_refs) != 1) + if ((int)(zs->zs_atime - t) >= 0) continue; - if (zs->zs_atime > t) + if (zfs_refcount_count(&zs->zs_refs) != 1) continue; if (zs_old) dmu_zfetch_stream_remove(zf, zs); @@ -246,6 +271,7 @@ dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid) } if (zs_old) { zs = zs_old; + list_remove(&zf->zf_stream, zs); goto reuse; } @@ -255,21 +281,23 @@ dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid) * for all the streams to be non-overlapping. */ uint32_t max_streams = MAX(1, MIN(zfetch_max_streams, - zf->zf_dnode->dn_maxblkid * zf->zf_dnode->dn_datablksz / + (zf->zf_dnode->dn_maxblkid << zf->zf_dnode->dn_datablkshift) / zfetch_max_distance)); if (zf->zf_numstreams >= max_streams) { - t = now - SEC2NSEC(zfetch_min_sec_reap); + t = now - zfetch_min_sec_reap; for (zs = list_head(&zf->zf_stream); zs != NULL; zs = list_next(&zf->zf_stream, zs)) { - if (zfs_refcount_count(&zs->zs_refs) != 1) + if ((int)(zs->zs_atime - t) >= 0) continue; - if (zs->zs_atime > t) + if (zfs_refcount_count(&zs->zs_refs) != 1) continue; - if (zs_old == NULL || zs->zs_atime < zs_old->zs_atime) + if (zs_old == NULL || + (int)(zs_old->zs_atime - zs->zs_atime) >= 0) zs_old = zs; } if (zs_old) { zs = zs_old; + list_remove(&zf->zf_stream, zs); goto reuse; } ZFETCHSTAT_BUMP(zfetchstat_max_streams); @@ -277,24 +305,24 @@ dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid) } zs = kmem_zalloc(sizeof (*zs), KM_SLEEP); - zs->zs_fetch = zf; zfs_refcount_create(&zs->zs_callers); zfs_refcount_create(&zs->zs_refs); /* One reference for zf_stream. */ zfs_refcount_add(&zs->zs_refs, NULL); zf->zf_numstreams++; - list_insert_head(&zf->zf_stream, zs); reuse: + list_insert_head(&zf->zf_stream, zs); zs->zs_blkid = blkid; + /* Allow immediate stream reuse until first hit. 
*/ + zs->zs_atime = now - zfetch_min_sec_reap; + memset(zs->zs_ranges, 0, sizeof (zs->zs_ranges)); zs->zs_pf_dist = 0; + zs->zs_ipf_dist = 0; zs->zs_pf_start = blkid; zs->zs_pf_end = blkid; - zs->zs_ipf_dist = 0; zs->zs_ipf_start = blkid; zs->zs_ipf_end = blkid; - /* Allow immediate stream reuse until first hit. */ - zs->zs_atime = now - SEC2NSEC(zfetch_min_sec_reap); zs->zs_missed = B_FALSE; zs->zs_more = B_FALSE; } @@ -311,6 +339,120 @@ dmu_zfetch_done(void *arg, uint64_t level, uint64_t blkid, boolean_t io_issued) aggsum_add(&zfetch_sums.zfetchstat_io_active, -1); } +/* + * Process stream hit access for nblks blocks starting at zs_blkid. Return + * number of blocks to proceed for after aggregation with future ranges. + */ +static uint64_t +dmu_zfetch_hit(zstream_t *zs, uint64_t nblks) +{ + uint_t i, j; + + /* Optimize sequential accesses (no future ranges). */ + if (zs->zs_ranges[0].start == 0) + goto done; + + /* Look for intersections with further ranges. */ + for (i = 0; i < ZFETCH_RANGES; i++) { + zsrange_t *r = &zs->zs_ranges[i]; + if (r->start == 0 || r->start > nblks) + break; + if (r->end >= nblks) { + nblks = r->end; + i++; + break; + } + } + + /* Delete all found intersecting ranges, updates remaining. */ + for (j = 0; i < ZFETCH_RANGES; i++, j++) { + if (zs->zs_ranges[i].start == 0) + break; + ASSERT3U(zs->zs_ranges[i].start, >, nblks); + ASSERT3U(zs->zs_ranges[i].end, >, nblks); + zs->zs_ranges[j].start = zs->zs_ranges[i].start - nblks; + zs->zs_ranges[j].end = zs->zs_ranges[i].end - nblks; + } + if (j < ZFETCH_RANGES) { + zs->zs_ranges[j].start = 0; + zs->zs_ranges[j].end = 0; + } + +done: + zs->zs_blkid += nblks; + return (nblks); +} + +/* + * Process future stream access for nblks blocks starting at blkid. Return + * number of blocks to proceed for if future ranges reach fill threshold. + */ +static uint64_t +dmu_zfetch_future(zstream_t *zs, uint64_t blkid, uint64_t nblks) +{ + ASSERT3U(blkid, >, zs->zs_blkid); + blkid -= zs->zs_blkid; + ASSERT3U(blkid + nblks, <=, UINT16_MAX); + + /* Search for first and last intersection or insert point. */ + uint_t f = ZFETCH_RANGES, l = 0, i; + for (i = 0; i < ZFETCH_RANGES; i++) { + zsrange_t *r = &zs->zs_ranges[i]; + if (r->start == 0 || r->start > blkid + nblks) + break; + if (r->end < blkid) + continue; + if (f > i) + f = i; + if (l < i) + l = i; + } + if (f <= l) { + /* Got some intersecting range, expand it if needed. */ + if (zs->zs_ranges[f].start > blkid) + zs->zs_ranges[f].start = blkid; + zs->zs_ranges[f].end = MAX(zs->zs_ranges[l].end, blkid + nblks); + if (f < l) { + /* Got more than one intersection, remove others. */ + for (f++, l++; l < ZFETCH_RANGES; f++, l++) { + zs->zs_ranges[f].start = zs->zs_ranges[l].start; + zs->zs_ranges[f].end = zs->zs_ranges[l].end; + } + zs->zs_ranges[ZFETCH_RANGES - 1].start = 0; + zs->zs_ranges[ZFETCH_RANGES - 1].end = 0; + } + } else if (i < ZFETCH_RANGES) { + /* Got no intersecting ranges, insert new one. */ + for (l = ZFETCH_RANGES - 1; l > i; l--) { + zs->zs_ranges[l].start = zs->zs_ranges[l - 1].start; + zs->zs_ranges[l].end = zs->zs_ranges[l - 1].end; + } + zs->zs_ranges[i].start = blkid; + zs->zs_ranges[i].end = blkid + nblks; + } else { + /* No space left to insert. Drop the range. */ + return (0); + } + + /* Check if with the new access addition we reached fill threshold. 
*/ + if (zfetch_hole_shift >= 16) + return (0); + uint_t hole = 0; + for (i = f = l = 0; i < ZFETCH_RANGES; i++) { + zsrange_t *r = &zs->zs_ranges[i]; + if (r->start == 0) + break; + hole += r->start - f; + f = r->end; + if (hole <= r->end >> zfetch_hole_shift) + l = r->end; + } + if (l > 0) + return (dmu_zfetch_hit(zs, l)); + + return (0); +} + /* * This is the predictive prefetch entry point. dmu_zfetch_prepare() * associates dnode access specified with blkid and nblks arguments with @@ -365,53 +507,92 @@ dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks, mutex_enter(&zf->zf_lock); /* - * Find matching prefetch stream. Depending on whether the accesses + * Find perfect prefetch stream. Depending on whether the accesses * are block-aligned, first block of the new access may either follow * the last block of the previous access, or be equal to it. */ + unsigned int dbs = zf->zf_dnode->dn_datablkshift; + uint64_t end_blkid = blkid + nblks; for (zs = list_head(&zf->zf_stream); zs != NULL; zs = list_next(&zf->zf_stream, zs)) { if (blkid == zs->zs_blkid) { - break; + goto hit; } else if (blkid + 1 == zs->zs_blkid) { blkid++; nblks--; - break; + goto hit; } } /* - * If the file is ending, remove the matching stream if found. - * If not found then it is too late to create a new one now. + * Find close enough prefetch stream. Access crossing stream position + * is a hit in its new part. Access ahead of stream position considered + * a hit for metadata prefetch, since we do not care about fill percent, + * or stored for future otherwise. Access behind stream position is + * silently ignored, since we already skipped it reaching fill percent. */ - uint64_t end_of_access_blkid = blkid + nblks; - if (end_of_access_blkid >= maxblkid) { - if (zs != NULL) - dmu_zfetch_stream_remove(zf, zs); - mutex_exit(&zf->zf_lock); - if (!have_lock) - rw_exit(&zf->zf_dnode->dn_struct_rwlock); - return (NULL); + uint_t max_reorder = MIN((zfetch_max_reorder >> dbs) + 1, UINT16_MAX); + uint_t t = gethrestime_sec() - zfetch_max_sec_reap; + for (zs = list_head(&zf->zf_stream); zs != NULL; + zs = list_next(&zf->zf_stream, zs)) { + if (blkid > zs->zs_blkid) { + if (end_blkid <= zs->zs_blkid + max_reorder) { + if (!fetch_data) { + nblks = dmu_zfetch_hit(zs, + end_blkid - zs->zs_blkid); + ZFETCHSTAT_BUMP(zfetchstat_stride); + goto future; + } + nblks = dmu_zfetch_future(zs, blkid, nblks); + if (nblks > 0) + ZFETCHSTAT_BUMP(zfetchstat_stride); + else + ZFETCHSTAT_BUMP(zfetchstat_future); + goto future; + } + } else if (end_blkid >= zs->zs_blkid) { + nblks -= zs->zs_blkid - blkid; + blkid += zs->zs_blkid - blkid; + goto hit; + } else if (end_blkid + max_reorder > zs->zs_blkid && + (int)(zs->zs_atime - t) >= 0) { + ZFETCHSTAT_BUMP(zfetchstat_past); + zs->zs_atime = gethrestime_sec(); + goto out; + } } - /* Exit if we already prefetched this block before. */ - if (nblks == 0) { - mutex_exit(&zf->zf_lock); - if (!have_lock) - rw_exit(&zf->zf_dnode->dn_struct_rwlock); - return (NULL); - } + /* + * This access is not part of any existing stream. Create a new + * stream for it unless we are at the end of file. + */ + if (end_blkid < maxblkid) + dmu_zfetch_stream_create(zf, end_blkid); + mutex_exit(&zf->zf_lock); + if (!have_lock) + rw_exit(&zf->zf_dnode->dn_struct_rwlock); + ZFETCHSTAT_BUMP(zfetchstat_misses); + return (NULL); - if (zs == NULL) { - /* - * This access is not part of any existing stream. Create - * a new stream for it. 
- */ - dmu_zfetch_stream_create(zf, end_of_access_blkid); +hit: + nblks = dmu_zfetch_hit(zs, nblks); + ZFETCHSTAT_BUMP(zfetchstat_hits); + +future: + zs->zs_atime = gethrestime_sec(); + + /* Exit if we already prefetched for this position before. */ + if (nblks == 0) + goto out; + + /* If the file is ending, remove the stream. */ + end_blkid = zs->zs_blkid; + if (end_blkid >= maxblkid) { + dmu_zfetch_stream_remove(zf, zs); +out: mutex_exit(&zf->zf_lock); if (!have_lock) rw_exit(&zf->zf_dnode->dn_struct_rwlock); - ZFETCHSTAT_BUMP(zfetchstat_misses); return (NULL); } @@ -427,7 +608,6 @@ dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks, * than ~6% of ARC held by active prefetches. It should help with * getting out of RAM on some badly mispredicted read patterns. */ - unsigned int dbs = zf->zf_dnode->dn_datablkshift; unsigned int nbytes = nblks << dbs; unsigned int pf_nblks; if (fetch_data) { @@ -447,10 +627,10 @@ dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks, } else { pf_nblks = 0; } - if (zs->zs_pf_start < end_of_access_blkid) - zs->zs_pf_start = end_of_access_blkid; - if (zs->zs_pf_end < end_of_access_blkid + pf_nblks) - zs->zs_pf_end = end_of_access_blkid + pf_nblks; + if (zs->zs_pf_start < end_blkid) + zs->zs_pf_start = end_blkid; + if (zs->zs_pf_end < end_blkid + pf_nblks) + zs->zs_pf_end = end_blkid + pf_nblks; /* * Do the same for indirects, starting where we will stop reading @@ -468,9 +648,6 @@ dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks, if (zs->zs_ipf_end < zs->zs_pf_end + pf_nblks) zs->zs_ipf_end = zs->zs_pf_end + pf_nblks; - zs->zs_blkid = end_of_access_blkid; - /* Protect the stream from reclamation. */ - zs->zs_atime = gethrtime(); zfs_refcount_add(&zs->zs_refs, NULL); /* Count concurrent callers. */ zfs_refcount_add(&zs->zs_callers, NULL); @@ -478,15 +655,13 @@ dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks, if (!have_lock) rw_exit(&zf->zf_dnode->dn_struct_rwlock); - - ZFETCHSTAT_BUMP(zfetchstat_hits); return (zs); } void -dmu_zfetch_run(zstream_t *zs, boolean_t missed, boolean_t have_lock) +dmu_zfetch_run(zfetch_t *zf, zstream_t *zs, boolean_t missed, + boolean_t have_lock) { - zfetch_t *zf = zs->zs_fetch; int64_t pf_start, pf_end, ipf_start, ipf_end; int epbs, issued; @@ -562,7 +737,7 @@ dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data, zs = dmu_zfetch_prepare(zf, blkid, nblks, fetch_data, have_lock); if (zs) - dmu_zfetch_run(zs, missed, have_lock); + dmu_zfetch_run(zf, zs, missed, have_lock); } ZFS_MODULE_PARAM(zfs_prefetch, zfs_prefetch_, disable, INT, ZMOD_RW, @@ -585,3 +760,9 @@ ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_distance, UINT, ZMOD_RW, ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_idistance, UINT, ZMOD_RW, "Max bytes to prefetch indirects for per stream"); + +ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_reorder, UINT, ZMOD_RW, + "Max request reorder distance within a stream"); + +ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, hole_shift, UINT, ZMOD_RW, + "Max log2 fraction of holes in a stream"); -- cgit v1.2.3 From 97d7228f427218cb701b4149c3f8f9d77e8c64e4 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Mon, 8 Apr 2024 18:23:43 -0400 Subject: Remove db_state DB_NOFILL checks from syncing context Syncing context should not depend on current state of dbuf, which could already change several times in later transaction groups, but rely solely on dirty record for the transaction group being synced. 
Some of the checks seem already impossible, while instead of others I think we should better check for absence of data in the specific dirty record rather than DB_NOFILL. Reviewed-by: Robert Evans Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #16057 --- module/zfs/dbuf.c | 44 +++++++++++++++++++------------------------- 1 file changed, 19 insertions(+), 25 deletions(-) diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 42e5811c8597..5d1887175a53 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -4550,11 +4550,10 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) if (os->os_encrypted && dn->dn_object == DMU_META_DNODE_OBJECT) dbuf_prepare_encrypted_dnode_leaf(dr); - if (db->db_state != DB_NOFILL && + if (*datap != NULL && *datap == db->db_buf && dn->dn_object != DMU_META_DNODE_OBJECT && zfs_refcount_count(&db->db_holds) > 1 && - dr->dt.dl.dr_override_state != DR_OVERRIDDEN && - *datap == db->db_buf) { + dr->dt.dl.dr_override_state != DR_OVERRIDDEN) { /* * If this buffer is currently "in use" (i.e., there * are active holds and db_data still references it), @@ -4839,11 +4838,9 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) if (db->db_level == 0) { ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); - if (db->db_state != DB_NOFILL) { - if (dr->dt.dl.dr_data != NULL && - dr->dt.dl.dr_data != db->db_buf) { - arc_buf_destroy(dr->dt.dl.dr_data, db); - } + if (dr->dt.dl.dr_data != NULL && + dr->dt.dl.dr_data != db->db_buf) { + arc_buf_destroy(dr->dt.dl.dr_data, db); } } else { ASSERT(list_head(&dr->dt.di.dr_children) == NULL); @@ -5042,21 +5039,18 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) os = dn->dn_objset; - if (db->db_state != DB_NOFILL) { - if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) { - /* - * Private object buffers are released here rather - * than in dbuf_dirty() since they are only modified - * in the syncing context and we don't want the - * overhead of making multiple copies of the data. - */ - if (BP_IS_HOLE(db->db_blkptr)) { - arc_buf_thaw(data); - } else { - dbuf_release_bp(db); - } - dbuf_remap(dn, db, tx); - } + if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) { + /* + * Private object buffers are released here rather than in + * dbuf_dirty() since they are only modified in the syncing + * context and we don't want the overhead of making multiple + * copies of the data. + */ + if (BP_IS_HOLE(db->db_blkptr)) + arc_buf_thaw(data); + else + dbuf_release_bp(db); + dbuf_remap(dn, db, tx); } if (parent != dn->dn_dbuf) { @@ -5092,7 +5086,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) if (db->db_blkid == DMU_SPILL_BLKID) wp_flag = WP_SPILL; - wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0; + wp_flag |= (data == NULL) ? 
WP_NOFILL : 0; dmu_write_policy(os, dn, db->db_level, wp_flag, &zp); @@ -5124,7 +5118,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite, dr->dt.dl.dr_brtwrite); mutex_exit(&db->db_mtx); - } else if (db->db_state == DB_NOFILL) { + } else if (data == NULL) { ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF || zp.zp_checksum == ZIO_CHECKSUM_NOPARITY); dr->dr_zio = zio_write(pio, os->os_spa, txg, -- cgit v1.2.3 From f4ce02ae42ab084306ebcfb6ec615acb9dc98db2 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Tue, 9 Apr 2024 19:14:04 -0400 Subject: Small fix to prefetch ranges aggregation When after #16022 adding new range we aggregate more than two existing ranges, that should be very rare, only if several streams overlap, we may need to zero not the last range, but some earlier. Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #16072 --- module/zfs/dmu_zfetch.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/module/zfs/dmu_zfetch.c b/module/zfs/dmu_zfetch.c index e26195176036..3439f9bddf4e 100644 --- a/module/zfs/dmu_zfetch.c +++ b/module/zfs/dmu_zfetch.c @@ -418,8 +418,8 @@ dmu_zfetch_future(zstream_t *zs, uint64_t blkid, uint64_t nblks) zs->zs_ranges[f].start = zs->zs_ranges[l].start; zs->zs_ranges[f].end = zs->zs_ranges[l].end; } - zs->zs_ranges[ZFETCH_RANGES - 1].start = 0; - zs->zs_ranges[ZFETCH_RANGES - 1].end = 0; + zs->zs_ranges[f].start = 0; + zs->zs_ranges[f].end = 0; } } else if (i < ZFETCH_RANGES) { /* Got no intersecting ranges, insert new one. */ -- cgit v1.2.3 From 575872cc37a744928223dae2c24964d06f670e2d Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Tue, 9 Apr 2024 19:23:19 -0400 Subject: L2ARC: Relax locking during write Previous code held ARC state sublist lock throughout all L2ARC write process, which included number of allocations and even ZIO issues. Being blocked in any of those places the code could also block ARC eviction, that could cause OOM activation or even dead- lock if system is low on memory or one is too fragmented. Fix it by dropping the lock as soon as we see a block eligible for L2ARC writing and pick it up later using earlier inserted marker. While there, also reduce scope of hash lock, moving ZIO allocation and other operations not requiring header access out of it. All operations requiring header access move under hash lock, since L2_WRITING flag does not prevent header eviction only transition to arc_l2c_only state with L1 header. To be able to manipulate sublist lock and marker as needed add few more multilist functions and modify one. Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. 
Closes #16040 --- include/sys/multilist.h | 5 +- module/zfs/arc.c | 179 +++++++++++++++++++++++++----------------------- module/zfs/dbuf.c | 2 +- module/zfs/dmu_objset.c | 10 +-- module/zfs/metaslab.c | 8 +-- module/zfs/multilist.c | 26 ++++++- 6 files changed, 131 insertions(+), 99 deletions(-) diff --git a/include/sys/multilist.h b/include/sys/multilist.h index 26f37c37ab38..e7de86f2379b 100644 --- a/include/sys/multilist.h +++ b/include/sys/multilist.h @@ -82,12 +82,15 @@ int multilist_is_empty(multilist_t *); unsigned int multilist_get_num_sublists(multilist_t *); unsigned int multilist_get_random_index(multilist_t *); -multilist_sublist_t *multilist_sublist_lock(multilist_t *, unsigned int); +void multilist_sublist_lock(multilist_sublist_t *); +multilist_sublist_t *multilist_sublist_lock_idx(multilist_t *, unsigned int); multilist_sublist_t *multilist_sublist_lock_obj(multilist_t *, void *); void multilist_sublist_unlock(multilist_sublist_t *); void multilist_sublist_insert_head(multilist_sublist_t *, void *); void multilist_sublist_insert_tail(multilist_sublist_t *, void *); +void multilist_sublist_insert_after(multilist_sublist_t *, void *, void *); +void multilist_sublist_insert_before(multilist_sublist_t *, void *, void *); void multilist_sublist_move_forward(multilist_sublist_t *mls, void *obj); void multilist_sublist_remove(multilist_sublist_t *, void *); int multilist_sublist_is_empty(multilist_sublist_t *); diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 4db6c06148b1..1953640139b3 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -3872,7 +3872,7 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, ASSERT3P(marker, !=, NULL); - mls = multilist_sublist_lock(ml, idx); + mls = multilist_sublist_lock_idx(ml, idx); for (hdr = multilist_sublist_prev(mls, marker); likely(hdr != NULL); hdr = multilist_sublist_prev(mls, marker)) { @@ -3984,6 +3984,26 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, return (bytes_evicted); } +static arc_buf_hdr_t * +arc_state_alloc_marker(void) +{ + arc_buf_hdr_t *marker = kmem_cache_alloc(hdr_full_cache, KM_SLEEP); + + /* + * A b_spa of 0 is used to indicate that this header is + * a marker. This fact is used in arc_evict_state_impl(). + */ + marker->b_spa = 0; + + return (marker); +} + +static void +arc_state_free_marker(arc_buf_hdr_t *marker) +{ + kmem_cache_free(hdr_full_cache, marker); +} + /* * Allocate an array of buffer headers used as placeholders during arc state * eviction. @@ -3994,16 +4014,8 @@ arc_state_alloc_markers(int count) arc_buf_hdr_t **markers; markers = kmem_zalloc(sizeof (*markers) * count, KM_SLEEP); - for (int i = 0; i < count; i++) { - markers[i] = kmem_cache_alloc(hdr_full_cache, KM_SLEEP); - - /* - * A b_spa of 0 is used to indicate that this header is - * a marker. This fact is used in arc_evict_state_impl(). 
- */ - markers[i]->b_spa = 0; - - } + for (int i = 0; i < count; i++) + markers[i] = arc_state_alloc_marker(); return (markers); } @@ -4011,7 +4023,7 @@ static void arc_state_free_markers(arc_buf_hdr_t **markers, int count) { for (int i = 0; i < count; i++) - kmem_cache_free(hdr_full_cache, markers[i]); + arc_state_free_marker(markers[i]); kmem_free(markers, sizeof (*markers) * count); } @@ -4055,7 +4067,7 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa, for (int i = 0; i < num_sublists; i++) { multilist_sublist_t *mls; - mls = multilist_sublist_lock(ml, i); + mls = multilist_sublist_lock_idx(ml, i); multilist_sublist_insert_tail(mls, markers[i]); multilist_sublist_unlock(mls); } @@ -4120,7 +4132,7 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa, } for (int i = 0; i < num_sublists; i++) { - multilist_sublist_t *mls = multilist_sublist_lock(ml, i); + multilist_sublist_t *mls = multilist_sublist_lock_idx(ml, i); multilist_sublist_remove(mls, markers[i]); multilist_sublist_unlock(mls); } @@ -8628,7 +8640,7 @@ l2arc_sublist_lock(int list_num) * sublists being selected. */ idx = multilist_get_random_index(ml); - return (multilist_sublist_lock(ml, idx)); + return (multilist_sublist_lock_idx(ml, idx)); } /* @@ -9040,9 +9052,9 @@ l2arc_blk_fetch_done(zio_t *zio) static uint64_t l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) { - arc_buf_hdr_t *hdr, *hdr_prev, *head; - uint64_t write_asize, write_psize, write_lsize, headroom; - boolean_t full; + arc_buf_hdr_t *hdr, *head, *marker; + uint64_t write_asize, write_psize, headroom; + boolean_t full, from_head = !arc_warm; l2arc_write_callback_t *cb = NULL; zio_t *pio, *wzio; uint64_t guid = spa_load_guid(spa); @@ -9051,10 +9063,11 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) ASSERT3P(dev->l2ad_vdev, !=, NULL); pio = NULL; - write_lsize = write_asize = write_psize = 0; + write_asize = write_psize = 0; full = B_FALSE; head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE); arc_hdr_set_flags(head, ARC_FLAG_L2_WRITE_HEAD | ARC_FLAG_HAS_L2HDR); + marker = arc_state_alloc_marker(); /* * Copy buffers for L2ARC writing. @@ -9069,40 +9082,34 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) continue; } - multilist_sublist_t *mls = l2arc_sublist_lock(pass); uint64_t passed_sz = 0; - - VERIFY3P(mls, !=, NULL); + headroom = target_sz * l2arc_headroom; + if (zfs_compressed_arc_enabled) + headroom = (headroom * l2arc_headroom_boost) / 100; /* - * L2ARC fast warmup. - * * Until the ARC is warm and starts to evict, read from the * head of the ARC lists rather than the tail. */ - if (arc_warm == B_FALSE) + multilist_sublist_t *mls = l2arc_sublist_lock(pass); + ASSERT3P(mls, !=, NULL); + if (from_head) hdr = multilist_sublist_head(mls); else hdr = multilist_sublist_tail(mls); - headroom = target_sz * l2arc_headroom; - if (zfs_compressed_arc_enabled) - headroom = (headroom * l2arc_headroom_boost) / 100; - - for (; hdr; hdr = hdr_prev) { + while (hdr != NULL) { kmutex_t *hash_lock; abd_t *to_write = NULL; - if (arc_warm == B_FALSE) - hdr_prev = multilist_sublist_next(mls, hdr); - else - hdr_prev = multilist_sublist_prev(mls, hdr); - hash_lock = HDR_LOCK(hdr); if (!mutex_tryenter(hash_lock)) { - /* - * Skip this buffer rather than waiting. - */ +skip: + /* Skip this buffer rather than waiting. 
*/ + if (from_head) + hdr = multilist_sublist_next(mls, hdr); + else + hdr = multilist_sublist_prev(mls, hdr); continue; } @@ -9117,11 +9124,10 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) if (!l2arc_write_eligible(guid, hdr)) { mutex_exit(hash_lock); - continue; + goto skip; } ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT3U(HDR_GET_PSIZE(hdr), >, 0); ASSERT3U(arc_hdr_size(hdr), >, 0); ASSERT(hdr->b_l1hdr.b_pabd != NULL || @@ -9143,12 +9149,18 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) } /* - * We rely on the L1 portion of the header below, so - * it's invalid for this header to have been evicted out - * of the ghost cache, prior to being written out. The - * ARC_FLAG_L2_WRITING bit ensures this won't happen. + * We should not sleep with sublist lock held or it + * may block ARC eviction. Insert a marker to save + * the position and drop the lock. */ - arc_hdr_set_flags(hdr, ARC_FLAG_L2_WRITING); + if (from_head) { + multilist_sublist_insert_after(mls, hdr, + marker); + } else { + multilist_sublist_insert_before(mls, hdr, + marker); + } + multilist_sublist_unlock(mls); /* * If this header has b_rabd, we can use this since it @@ -9179,32 +9191,45 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) &to_write); if (ret != 0) { arc_hdr_clear_flags(hdr, - ARC_FLAG_L2_WRITING); + ARC_FLAG_L2CACHE); mutex_exit(hash_lock); - continue; + goto next; } l2arc_free_abd_on_write(to_write, asize, type); } + hdr->b_l2hdr.b_dev = dev; + hdr->b_l2hdr.b_daddr = dev->l2ad_hand; + hdr->b_l2hdr.b_hits = 0; + hdr->b_l2hdr.b_arcs_state = + hdr->b_l1hdr.b_state->arcs_state; + mutex_enter(&dev->l2ad_mtx); if (pio == NULL) { /* * Insert a dummy header on the buflist so * l2arc_write_done() can find where the * write buffers begin without searching. */ - mutex_enter(&dev->l2ad_mtx); list_insert_head(&dev->l2ad_buflist, head); - mutex_exit(&dev->l2ad_mtx); + } + list_insert_head(&dev->l2ad_buflist, hdr); + mutex_exit(&dev->l2ad_mtx); + arc_hdr_set_flags(hdr, ARC_FLAG_HAS_L2HDR | + ARC_FLAG_L2_WRITING); + + (void) zfs_refcount_add_many(&dev->l2ad_alloc, + arc_hdr_size(hdr), hdr); + l2arc_hdr_arcstats_increment(hdr); + boolean_t commit = l2arc_log_blk_insert(dev, hdr); + mutex_exit(hash_lock); + + if (pio == NULL) { cb = kmem_alloc( sizeof (l2arc_write_callback_t), KM_SLEEP); cb->l2wcb_dev = dev; cb->l2wcb_head = head; - /* - * Create a list to save allocated abd buffers - * for l2arc_log_blk_commit(). 
- */ list_create(&cb->l2wcb_abd_list, sizeof (l2arc_lb_abd_buf_t), offsetof(l2arc_lb_abd_buf_t, node)); @@ -9212,54 +9237,34 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) ZIO_FLAG_CANFAIL); } - hdr->b_l2hdr.b_dev = dev; - hdr->b_l2hdr.b_hits = 0; - - hdr->b_l2hdr.b_daddr = dev->l2ad_hand; - hdr->b_l2hdr.b_arcs_state = - hdr->b_l1hdr.b_state->arcs_state; - arc_hdr_set_flags(hdr, ARC_FLAG_HAS_L2HDR); - - mutex_enter(&dev->l2ad_mtx); - list_insert_head(&dev->l2ad_buflist, hdr); - mutex_exit(&dev->l2ad_mtx); - - (void) zfs_refcount_add_many(&dev->l2ad_alloc, - arc_hdr_size(hdr), hdr); - wzio = zio_write_phys(pio, dev->l2ad_vdev, - hdr->b_l2hdr.b_daddr, asize, to_write, + dev->l2ad_hand, asize, to_write, ZIO_CHECKSUM_OFF, NULL, hdr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE); - write_lsize += HDR_GET_LSIZE(hdr); DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, zio_t *, wzio); + zio_nowait(wzio); write_psize += psize; write_asize += asize; dev->l2ad_hand += asize; - l2arc_hdr_arcstats_increment(hdr); vdev_space_update(dev->l2ad_vdev, asize, 0, 0); - mutex_exit(hash_lock); - - /* - * Append buf info to current log and commit if full. - * arcstat_l2_{size,asize} kstats are updated - * internally. - */ - if (l2arc_log_blk_insert(dev, hdr)) { - /* - * l2ad_hand will be adjusted in - * l2arc_log_blk_commit(). - */ + if (commit) { + /* l2ad_hand will be adjusted inside. */ write_asize += l2arc_log_blk_commit(dev, pio, cb); } - zio_nowait(wzio); +next: + multilist_sublist_lock(mls); + if (from_head) + hdr = multilist_sublist_next(mls, marker); + else + hdr = multilist_sublist_prev(mls, marker); + multilist_sublist_remove(mls, marker); } multilist_sublist_unlock(mls); @@ -9268,9 +9273,11 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) break; } + arc_state_free_marker(marker); + /* No buffers selected for writing? 
*/ if (pio == NULL) { - ASSERT0(write_lsize); + ASSERT0(write_psize); ASSERT(!HDR_HAS_L1HDR(head)); kmem_cache_free(hdr_l2only_cache, head); @@ -10598,7 +10605,7 @@ l2arc_log_blk_insert(l2arc_dev_t *dev, const arc_buf_hdr_t *hdr) L2BLK_SET_TYPE((le)->le_prop, hdr->b_type); L2BLK_SET_PROTECTED((le)->le_prop, !!(HDR_PROTECTED(hdr))); L2BLK_SET_PREFETCH((le)->le_prop, !!(HDR_PREFETCH(hdr))); - L2BLK_SET_STATE((le)->le_prop, hdr->b_l1hdr.b_state->arcs_state); + L2BLK_SET_STATE((le)->le_prop, hdr->b_l2hdr.b_arcs_state); dev->l2ad_log_blk_payload_asize += vdev_psize_to_asize(dev->l2ad_vdev, HDR_GET_PSIZE(hdr)); diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 5d1887175a53..72aaf7f19822 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -754,7 +754,7 @@ static void dbuf_evict_one(void) { int idx = multilist_get_random_index(&dbuf_caches[DB_DBUF_CACHE].cache); - multilist_sublist_t *mls = multilist_sublist_lock( + multilist_sublist_t *mls = multilist_sublist_lock_idx( &dbuf_caches[DB_DBUF_CACHE].cache, idx); ASSERT(!MUTEX_HELD(&dbuf_evict_lock)); diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c index d134d4958f7c..f8bd7422a5df 100644 --- a/module/zfs/dmu_objset.c +++ b/module/zfs/dmu_objset.c @@ -1633,7 +1633,7 @@ sync_dnodes_task(void *arg) sync_dnodes_arg_t *sda = arg; multilist_sublist_t *ms = - multilist_sublist_lock(sda->sda_list, sda->sda_sublist_idx); + multilist_sublist_lock_idx(sda->sda_list, sda->sda_sublist_idx); dmu_objset_sync_dnodes(ms, sda->sda_tx); @@ -1987,8 +1987,8 @@ userquota_updates_task(void *arg) dnode_t *dn; userquota_cache_t cache = { { 0 } }; - multilist_sublist_t *list = - multilist_sublist_lock(&os->os_synced_dnodes, uua->uua_sublist_idx); + multilist_sublist_t *list = multilist_sublist_lock_idx( + &os->os_synced_dnodes, uua->uua_sublist_idx); ASSERT(multilist_sublist_head(list) == NULL || dmu_objset_userused_enabled(os)); @@ -2070,8 +2070,8 @@ dnode_rele_task(void *arg) userquota_updates_arg_t *uua = arg; objset_t *os = uua->uua_os; - multilist_sublist_t *list = - multilist_sublist_lock(&os->os_synced_dnodes, uua->uua_sublist_idx); + multilist_sublist_t *list = multilist_sublist_lock_idx( + &os->os_synced_dnodes, uua->uua_sublist_idx); dnode_t *dn; while ((dn = multilist_sublist_head(list)) != NULL) { diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index 5809a832bcb0..dbfc00362ff8 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -641,7 +641,7 @@ metaslab_class_evict_old(metaslab_class_t *mc, uint64_t txg) { multilist_t *ml = &mc->mc_metaslab_txg_list; for (int i = 0; i < multilist_get_num_sublists(ml); i++) { - multilist_sublist_t *mls = multilist_sublist_lock(ml, i); + multilist_sublist_t *mls = multilist_sublist_lock_idx(ml, i); metaslab_t *msp = multilist_sublist_head(mls); multilist_sublist_unlock(mls); while (msp != NULL) { @@ -658,7 +658,7 @@ metaslab_class_evict_old(metaslab_class_t *mc, uint64_t txg) i--; break; } - mls = multilist_sublist_lock(ml, i); + mls = multilist_sublist_lock_idx(ml, i); metaslab_t *next_msp = multilist_sublist_next(mls, msp); multilist_sublist_unlock(mls); if (txg > @@ -2190,12 +2190,12 @@ metaslab_potentially_evict(metaslab_class_t *mc) unsigned int idx = multilist_get_random_index( &mc->mc_metaslab_txg_list); multilist_sublist_t *mls = - multilist_sublist_lock(&mc->mc_metaslab_txg_list, idx); + multilist_sublist_lock_idx(&mc->mc_metaslab_txg_list, idx); metaslab_t *msp = multilist_sublist_head(mls); multilist_sublist_unlock(mls); while (msp != NULL && allmem * 
zfs_metaslab_mem_limit / 100 < inuse * size) { - VERIFY3P(mls, ==, multilist_sublist_lock( + VERIFY3P(mls, ==, multilist_sublist_lock_idx( &mc->mc_metaslab_txg_list, idx)); ASSERT3U(idx, ==, metaslab_idx_func(&mc->mc_metaslab_txg_list, msp)); diff --git a/module/zfs/multilist.c b/module/zfs/multilist.c index b1cdf1c5c5f4..3d3ef86e6839 100644 --- a/module/zfs/multilist.c +++ b/module/zfs/multilist.c @@ -277,9 +277,15 @@ multilist_get_random_index(multilist_t *ml) return (random_in_range(ml->ml_num_sublists)); } +void +multilist_sublist_lock(multilist_sublist_t *mls) +{ + mutex_enter(&mls->mls_lock); +} + /* Lock and return the sublist specified at the given index */ multilist_sublist_t * -multilist_sublist_lock(multilist_t *ml, unsigned int sublist_idx) +multilist_sublist_lock_idx(multilist_t *ml, unsigned int sublist_idx) { multilist_sublist_t *mls; @@ -294,7 +300,7 @@ multilist_sublist_lock(multilist_t *ml, unsigned int sublist_idx) multilist_sublist_t * multilist_sublist_lock_obj(multilist_t *ml, void *obj) { - return (multilist_sublist_lock(ml, ml->ml_index_func(ml, obj))); + return (multilist_sublist_lock_idx(ml, ml->ml_index_func(ml, obj))); } void @@ -327,6 +333,22 @@ multilist_sublist_insert_tail(multilist_sublist_t *mls, void *obj) list_insert_tail(&mls->mls_list, obj); } +/* please see comment above multilist_sublist_insert_head */ +void +multilist_sublist_insert_after(multilist_sublist_t *mls, void *prev, void *obj) +{ + ASSERT(MUTEX_HELD(&mls->mls_lock)); + list_insert_after(&mls->mls_list, prev, obj); +} + +/* please see comment above multilist_sublist_insert_head */ +void +multilist_sublist_insert_before(multilist_sublist_t *mls, void *next, void *obj) +{ + ASSERT(MUTEX_HELD(&mls->mls_lock)); + list_insert_before(&mls->mls_list, next, obj); +} + /* * Move the object one element forward in the list. * -- cgit v1.2.3 From 72e4996a54fe54b93bf1e667a5e60099375fe08f Mon Sep 17 00:00:00 2001 From: Rob N Date: Sat, 13 Apr 2024 02:00:20 +1000 Subject: bdev_discard_supported: understand discard_granularity=0 Kernel documentation for the discard_granularity property says: A discard_granularity of 0 means that the device does not support discard functionality. Some older kernels had drivers (notably loop, but also some USB-SATA adapters) that would set the QUEUE_FLAG_DISCARD capability flag, but have discard_granularity=0. Since 5.10 (torvalds/linux@b35fd7422c2f) the discard entry point blkdev_issue_discard() has had a check for this, which would immediately reject the call with EOPNOTSUPP, and throw a scary diagnostic message into the log. See #16068. Since 6.8, the block layer sets a non-zero default for discard_granularity (torvalds/linux@3c407dc723bb), and a future kernel will remove the check entirely[1]. As such, there's no good reason for us to enable discard when discard_granularity=0. The kernel will never let the request go in anyway; better that we just disable it so we can report it properly to the user. 1. https://patchwork.kernel.org/project/linux-block/patch/20240312144826.1045212-2-hch@lst.de/ Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. 
Signed-off-by: Rob Norris (cherry picked from commit b181b2e604de3f36feab1092c702cdec5e78c693) --- include/os/linux/kernel/linux/blkdev_compat.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/include/os/linux/kernel/linux/blkdev_compat.h b/include/os/linux/kernel/linux/blkdev_compat.h index f111e648ccf7..b0f398354e4f 100644 --- a/include/os/linux/kernel/linux/blkdev_compat.h +++ b/include/os/linux/kernel/linux/blkdev_compat.h @@ -563,9 +563,11 @@ static inline boolean_t bdev_discard_supported(struct block_device *bdev) { #if defined(HAVE_BDEV_MAX_DISCARD_SECTORS) - return (!!bdev_max_discard_sectors(bdev)); + return (bdev_max_discard_sectors(bdev) > 0 && + bdev_discard_granularity(bdev) > 0); #elif defined(HAVE_BLK_QUEUE_DISCARD) - return (!!blk_queue_discard(bdev_get_queue(bdev))); + return (blk_queue_discard(bdev_get_queue(bdev)) > 0 && + bdev_get_queue(bdev)->limits.discard_granularity > 0); #else #error "Unsupported kernel" #endif -- cgit v1.2.3 From bb9542a2a027cfe58b3e8a402eed75da530c060f Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Wed, 17 Apr 2024 09:29:21 -0700 Subject: Linux 6.8 compat: META (#16099) Update the META file to reflect compatibility with the 6.8 kernel. Signed-off-by: Tony Hutter Reviewed-by: Brian Behlendorf Reviewed-by: Rob Norris --- META | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/META b/META index d64414e32225..a33c1be8787a 100644 --- a/META +++ b/META @@ -6,5 +6,5 @@ Release: 1 Release-Tags: relext License: CDDL Author: OpenZFS -Linux-Maximum: 6.7 +Linux-Maximum: 6.8 Linux-Minimum: 3.10 -- cgit v1.2.3 From fa2cbd40078f36ac17af6c2eafd0aa15c3f9b37b Mon Sep 17 00:00:00 2001 From: Fabian-Gruenbichler Date: Fri, 29 Mar 2024 22:37:40 +0100 Subject: zvols: prevent overflow of minor device numbers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit currently, the linux kernel allows 2^20 minor devices per major device number. ZFS reserves blocks of 2^4 minors per zvol: 1 for the zvol itself, the other 15 for the first partitions of that zvol. as a result, only 2^16 such blocks are available for use. there are no checks in place to avoid overflowing into the major device number when more than 2^16 zvols are allocated (with volmode=dev or default). instead of ignoring this limit, which comes with all sorts of weird knock-on effects, detect this situation and simply fail allocating the zvol block device early on. without this safeguard, the kernel will reject the attempt to create an already existing block device, but ZFS doesn't handle this error and gets confused about which zvol occupies which minor slot, potentially resulting in kernel NULL derefs and other issues later on. 
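To make the limit concrete, the arithmetic described above can be checked with a small standalone userspace sketch. This is not part of the patch: the constants stand in for the real kernel headers, using the 2^20 minor width and the 2^4 per-zvol block from the message, and mirroring the MINOR()/ZVOL_MINOR_BITS usage visible in the diff below.

    #include <stdio.h>

    #define MINORBITS        20                      /* assumed 20-bit kernel minor field */
    #define MINORMASK        ((1u << MINORBITS) - 1)
    #define ZVOL_MINOR_BITS  4                       /* 2^4 = 16 minors reserved per zvol */

    int
    main(void)
    {
            /* The last of the 2^16 valid index values still fits the minor space... */
            unsigned int last_ok = 65535u << ZVOL_MINOR_BITS;
            /* ...but the next allocation spills into the major number. */
            unsigned int overflow = 65536u << ZVOL_MINOR_BITS;

            printf("idx 65535 -> minor %u, fits: %d\n", last_ok,
                (last_ok & MINORMASK) == last_ok);
            printf("idx 65536 -> minor %u, fits: %d\n", overflow,
                (overflow & MINORMASK) == overflow);
            return (0);
    }

The new check in zvol_os_create_minor() rejects exactly the second case, returning EINVAL instead of handing the kernel a corrupted device number.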
Reviewed-by: Tony Hutter Reviewed by: Brian Behlendorf Signed-off-by: Fabian Grünbichler Closes #16006 --- module/os/linux/zfs/zvol_os.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c index e1ede9851a4c..cfb69dc06904 100644 --- a/module/os/linux/zfs/zvol_os.c +++ b/module/os/linux/zfs/zvol_os.c @@ -1342,6 +1342,13 @@ zvol_os_create_minor(const char *name) if (idx < 0) return (SET_ERROR(-idx)); minor = idx << ZVOL_MINOR_BITS; + if (MINOR(minor) != minor) { + /* too many partitions can cause an overflow */ + zfs_dbgmsg("zvol: create minor overflow: %s, minor %u/%u", + name, minor, MINOR(minor)); + ida_simple_remove(&zvol_ida, idx); + return (SET_ERROR(EINVAL)); + } zv = zvol_find_by_name_hash(name, hash, RW_NONE); if (zv) { -- cgit v1.2.3 From 3fb0942cc5fb216ac733da94930e11a64e589d38 Mon Sep 17 00:00:00 2001 From: Fabian-Gruenbichler Date: Fri, 22 Mar 2024 00:38:24 +0100 Subject: udev: correctly handle partition #16 and later MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If a zvol has more than 15 partitions, the minor device number exhausts the slot count reserved for partitions next to the zvol itself. As a result, the minor number cannot be used to determine the partition number for the higher partition, and doing so results in wrong named symlinks being generated by udev. Since the partition number is encoded in the block device name anyway, let's just extract it from there instead. Reviewed-by: Tony Hutter Reviewed by: Brian Behlendorf Reviewed-by: Tino Reichardt Signed-off-by: Fabian Grünbichler Closes #15904 Closes #15970 --- udev/zvol_id.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/udev/zvol_id.c b/udev/zvol_id.c index 5960b978787a..609349594767 100644 --- a/udev/zvol_id.c +++ b/udev/zvol_id.c @@ -51,7 +51,7 @@ const char *__asan_default_options(void) { int main(int argc, const char *const *argv) { - if (argc != 2) { + if (argc != 2 || strncmp(argv[1], "/dev/zd", 7) != 0) { fprintf(stderr, "usage: %s /dev/zdX\n", argv[0]); return (1); } @@ -72,9 +72,10 @@ main(int argc, const char *const *argv) return (1); } - unsigned int dev_part = minor(sb.st_rdev) % ZVOL_MINORS; - if (dev_part != 0) - sprintf(zvol_name + strlen(zvol_name), "-part%u", dev_part); + const char *dev_part = strrchr(dev_name, 'p'); + if (dev_part != NULL) { + sprintf(zvol_name + strlen(zvol_name), "-part%s", dev_part + 1); + } for (size_t i = 0; i < strlen(zvol_name); ++i) if (isblank(zvol_name[i])) -- cgit v1.2.3 From 5dbed504295ef4c8dbde54ef712812cd485a81e6 Mon Sep 17 00:00:00 2001 From: Robert Evans Date: Fri, 29 Mar 2024 20:11:52 -0400 Subject: Linux 5.18+ compat: Detect filemap_range_has_page In v5.18 `filemap_range_has_page` moved to `pagemap.h` `pagemap.h` has been around since 3.10 so just include both Reviewed-by: Tony Hutter Reviewed-by: Brian Behlendorf Reviewed-by: Rob Norris Signed-off-by: Robert Evans Closes #16034 --- config/kernel-filemap.m4 | 1 + 1 file changed, 1 insertion(+) diff --git a/config/kernel-filemap.m4 b/config/kernel-filemap.m4 index 745928168f92..0b7da828d299 100644 --- a/config/kernel-filemap.m4 +++ b/config/kernel-filemap.m4 @@ -4,6 +4,7 @@ dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_FILEMAP], [ ZFS_LINUX_TEST_SRC([filemap_range_has_page], [ #include + #include ],[ struct address_space *mapping = NULL; loff_t lstart = 0; -- cgit v1.2.3 From b9c3040b10b60350ece42aed8c29972031fb14a0 Mon Sep 17 00:00:00 2001 From: Rob N Date: Sat, 30 Mar 2024 08:51:33 
+1100 Subject: vdev_disk: clean up spa/bdev mode conversion 43e8f6e37 introduced a subtle API misuse, in that it passed the output from vdev_bdev_mode() back into itself. Fortunately, the SPA_MODE_(READ|WRITE) bit values exactly map to the FMODE_(READ|WRITE) & BLK_OPEN_(READ|WRITE) bit values, so it didn't result in a bug, but it was hard to read and understand, so I cleaned it up. In doing so, I noticed that the only call to vdev_bdev_mode() without the "exclusive" flag set was in that misuse, and actually, we never do a non-exclusive blkdev_get_by_path(). So I've just made exclusive be always-on. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Reviewed by: Brian Behlendorf Reviewed-by: Allan Jude Signed-off-by: Rob Norris Closes #15995 --- module/os/linux/zfs/vdev_disk.c | 81 ++++++++++++++++++++--------------------- 1 file changed, 39 insertions(+), 42 deletions(-) diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c index 223b41068b83..35e2a573facd 100644 --- a/module/os/linux/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -97,38 +97,41 @@ static uint_t zfs_vdev_open_timeout_ms = 1000; static unsigned int zfs_vdev_failfast_mask = 1; +/* + * Convert SPA mode flags into bdev open mode flags. + */ #ifdef HAVE_BLK_MODE_T -static blk_mode_t +typedef blk_mode_t vdev_bdev_mode_t; +#define VDEV_BDEV_MODE_READ BLK_OPEN_READ +#define VDEV_BDEV_MODE_WRITE BLK_OPEN_WRITE +#define VDEV_BDEV_MODE_EXCL BLK_OPEN_EXCL +#define VDEV_BDEV_MODE_MASK (BLK_OPEN_READ|BLK_OPEN_WRITE|BLK_OPEN_EXCL) #else -static fmode_t +typedef fmode_t vdev_bdev_mode_t; +#define VDEV_BDEV_MODE_READ FMODE_READ +#define VDEV_BDEV_MODE_WRITE FMODE_WRITE +#define VDEV_BDEV_MODE_EXCL FMODE_EXCL +#define VDEV_BDEV_MODE_MASK (FMODE_READ|FMODE_WRITE|FMODE_EXCL) #endif -vdev_bdev_mode(spa_mode_t spa_mode, boolean_t exclusive) -{ -#ifdef HAVE_BLK_MODE_T - blk_mode_t mode = 0; - - if (spa_mode & SPA_MODE_READ) - mode |= BLK_OPEN_READ; - if (spa_mode & SPA_MODE_WRITE) - mode |= BLK_OPEN_WRITE; +static vdev_bdev_mode_t +vdev_bdev_mode(spa_mode_t smode) +{ + ASSERT3U(smode, !=, SPA_MODE_UNINIT); + ASSERT0(smode & ~(SPA_MODE_READ|SPA_MODE_WRITE)); - if (exclusive) - mode |= BLK_OPEN_EXCL; -#else - fmode_t mode = 0; + vdev_bdev_mode_t bmode = VDEV_BDEV_MODE_EXCL; - if (spa_mode & SPA_MODE_READ) - mode |= FMODE_READ; + if (smode & SPA_MODE_READ) + bmode |= VDEV_BDEV_MODE_READ; - if (spa_mode & SPA_MODE_WRITE) - mode |= FMODE_WRITE; + if (smode & SPA_MODE_WRITE) + bmode |= VDEV_BDEV_MODE_WRITE; - if (exclusive) - mode |= FMODE_EXCL; -#endif + ASSERT(bmode & VDEV_BDEV_MODE_MASK); + ASSERT0(bmode & ~VDEV_BDEV_MODE_MASK); - return (mode); + return (bmode); } /* @@ -235,30 +238,28 @@ vdev_disk_kobj_evt_post(vdev_t *v) } static zfs_bdev_handle_t * -vdev_blkdev_get_by_path(const char *path, spa_mode_t mode, void *holder) +vdev_blkdev_get_by_path(const char *path, spa_mode_t smode, void *holder) { + vdev_bdev_mode_t bmode = vdev_bdev_mode(smode); + #if defined(HAVE_BDEV_OPEN_BY_PATH) - return (bdev_open_by_path(path, - vdev_bdev_mode(mode, B_TRUE), holder, NULL)); + return (bdev_open_by_path(path, bmode, holder, NULL)); #elif defined(HAVE_BLKDEV_GET_BY_PATH_4ARG) - return (blkdev_get_by_path(path, - vdev_bdev_mode(mode, B_TRUE), holder, NULL)); + return (blkdev_get_by_path(path, bmode, holder, NULL)); #else - return (blkdev_get_by_path(path, - vdev_bdev_mode(mode, B_TRUE), holder)); + return (blkdev_get_by_path(path, bmode, holder)); #endif } static void -vdev_blkdev_put(zfs_bdev_handle_t *bdh, 
spa_mode_t mode, void *holder) +vdev_blkdev_put(zfs_bdev_handle_t *bdh, spa_mode_t smode, void *holder) { #if defined(HAVE_BDEV_RELEASE) return (bdev_release(bdh)); #elif defined(HAVE_BLKDEV_PUT_HOLDER) return (blkdev_put(BDH_BDEV(bdh), holder)); #else - return (blkdev_put(BDH_BDEV(bdh), - vdev_bdev_mode(mode, B_TRUE))); + return (blkdev_put(BDH_BDEV(bdh), vdev_bdev_mode(smode))); #endif } @@ -267,11 +268,7 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, uint64_t *logical_ashift, uint64_t *physical_ashift) { zfs_bdev_handle_t *bdh; -#ifdef HAVE_BLK_MODE_T - blk_mode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa), B_FALSE); -#else - fmode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa), B_FALSE); -#endif + spa_mode_t smode = spa_mode(v->vdev_spa); hrtime_t timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms); vdev_disk_t *vd; @@ -322,16 +319,16 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, reread_part = B_TRUE; } - vdev_blkdev_put(bdh, mode, zfs_vdev_holder); + vdev_blkdev_put(bdh, smode, zfs_vdev_holder); } if (reread_part) { - bdh = vdev_blkdev_get_by_path(disk_name, mode, + bdh = vdev_blkdev_get_by_path(disk_name, smode, zfs_vdev_holder); if (!BDH_IS_ERR(bdh)) { int error = vdev_bdev_reread_part(BDH_BDEV(bdh)); - vdev_blkdev_put(bdh, mode, zfs_vdev_holder); + vdev_blkdev_put(bdh, smode, zfs_vdev_holder); if (error == 0) { timeout = MSEC2NSEC( zfs_vdev_open_timeout_ms * 2); @@ -376,7 +373,7 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, hrtime_t start = gethrtime(); bdh = BDH_ERR_PTR(-ENXIO); while (BDH_IS_ERR(bdh) && ((gethrtime() - start) < timeout)) { - bdh = vdev_blkdev_get_by_path(v->vdev_path, mode, + bdh = vdev_blkdev_get_by_path(v->vdev_path, smode, zfs_vdev_holder); if (unlikely(BDH_PTR_ERR(bdh) == -ENOENT)) { /* -- cgit v1.2.3 From 3bd7cd06b711339d170b4caf7a72a46b166fbd8a Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Wed, 27 Mar 2024 10:07:50 +1100 Subject: Linux 6.9 compat: bdev handles are now struct file bdev_open_by_path() is replaced by bdev_file_open_by_path(), which returns a plain old struct file*. Release function is gone entirely; the regular file release function fput() will take care of the bdev specifics. 
Reviewed-by: Tony Hutter Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Sponsored-by: https://despairlabs.com/sponsor/ Closes #16027 Closes #16033 --- config/kernel-blkdev.m4 | 43 +++++++++++++++++++++++++++++++++++++++-- module/os/linux/zfs/vdev_disk.c | 24 ++++++++++++++++++----- 2 files changed, 60 insertions(+), 7 deletions(-) diff --git a/config/kernel-blkdev.m4 b/config/kernel-blkdev.m4 index dae7bef9ce0d..b6ce1e1cf083 100644 --- a/config/kernel-blkdev.m4 +++ b/config/kernel-blkdev.m4 @@ -54,6 +54,26 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_OPEN_BY_PATH], [ ]) ]) +dnl # +dnl # 6.9.x API change +dnl # bdev_file_open_by_path() replaced bdev_open_by_path(), +dnl # and returns struct file* +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BDEV_FILE_OPEN_BY_PATH], [ + ZFS_LINUX_TEST_SRC([bdev_file_open_by_path], [ + #include + #include + ], [ + struct file *file __attribute__ ((unused)) = NULL; + const char *path = "path"; + fmode_t mode = 0; + void *holder = NULL; + struct blk_holder_ops h; + + file = bdev_file_open_by_path(path, mode, holder, &h); + ]) +]) + AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_GET_BY_PATH], [ AC_MSG_CHECKING([whether blkdev_get_by_path() exists and takes 3 args]) ZFS_LINUX_TEST_RESULT([blkdev_get_by_path], [ @@ -73,7 +93,16 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_GET_BY_PATH], [ [bdev_open_by_path() exists]) AC_MSG_RESULT(yes) ], [ - ZFS_LINUX_TEST_ERROR([blkdev_get_by_path()]) + AC_MSG_RESULT(no) + AC_MSG_CHECKING([whether bdev_file_open_by_path() exists]) + ZFS_LINUX_TEST_RESULT([bdev_file_open_by_path], [ + AC_DEFINE(HAVE_BDEV_FILE_OPEN_BY_PATH, 1, + [bdev_file_open_by_path() exists]) + AC_MSG_RESULT(yes) + ], [ + AC_MSG_RESULT(no) + ZFS_LINUX_TEST_ERROR([blkdev_get_by_path()]) + ]) ]) ]) ]) @@ -149,10 +178,19 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_RELEASE], [ ]) ]) +dnl # +dnl # 6.9.x API change +dnl # +dnl # bdev_release() now private, but because bdev_file_open_by_path() returns +dnl # struct file*, we can just use fput(). So the blkdev_put test no longer +dnl # fails if not found. +dnl # + AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_PUT], [ AC_MSG_CHECKING([whether blkdev_put() exists]) ZFS_LINUX_TEST_RESULT([blkdev_put], [ AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_BLKDEV_PUT, 1, [blkdev_put() exists]) ], [ AC_MSG_RESULT(no) AC_MSG_CHECKING([whether blkdev_put() accepts void* as arg 2]) @@ -168,7 +206,7 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_PUT], [ AC_DEFINE(HAVE_BDEV_RELEASE, 1, [bdev_release() exists]) ], [ - ZFS_LINUX_TEST_ERROR([blkdev_put()]) + AC_MSG_RESULT(no) ]) ]) ]) @@ -697,6 +735,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV], [ ZFS_AC_KERNEL_SRC_BLKDEV_GET_BY_PATH ZFS_AC_KERNEL_SRC_BLKDEV_GET_BY_PATH_4ARG ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_OPEN_BY_PATH + ZFS_AC_KERNEL_SRC_BDEV_FILE_OPEN_BY_PATH ZFS_AC_KERNEL_SRC_BLKDEV_PUT ZFS_AC_KERNEL_SRC_BLKDEV_PUT_HOLDER ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_RELEASE diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c index 35e2a573facd..943e534ef5b0 100644 --- a/module/os/linux/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -45,15 +45,25 @@ /* * Linux 6.8.x uses a bdev_handle as an instance/refcount for an underlying * block_device. Since it carries the block_device inside, its convenient to - * just use the handle as a proxy. For pre-6.8, we just emulate this with - * a cast, since we don't need any of the other fields inside the handle. + * just use the handle as a proxy. + * + * Linux 6.9.x uses a file for the same purpose. 
+ * + * For pre-6.8, we just emulate this with a cast, since we don't need any of + * the other fields inside the handle. */ -#ifdef HAVE_BDEV_OPEN_BY_PATH +#if defined(HAVE_BDEV_OPEN_BY_PATH) typedef struct bdev_handle zfs_bdev_handle_t; #define BDH_BDEV(bdh) ((bdh)->bdev) #define BDH_IS_ERR(bdh) (IS_ERR(bdh)) #define BDH_PTR_ERR(bdh) (PTR_ERR(bdh)) #define BDH_ERR_PTR(err) (ERR_PTR(err)) +#elif defined(HAVE_BDEV_FILE_OPEN_BY_PATH) +typedef struct file zfs_bdev_handle_t; +#define BDH_BDEV(bdh) (file_bdev(bdh)) +#define BDH_IS_ERR(bdh) (IS_ERR(bdh)) +#define BDH_PTR_ERR(bdh) (PTR_ERR(bdh)) +#define BDH_ERR_PTR(err) (ERR_PTR(err)) #else typedef void zfs_bdev_handle_t; #define BDH_BDEV(bdh) ((struct block_device *)bdh) @@ -242,7 +252,9 @@ vdev_blkdev_get_by_path(const char *path, spa_mode_t smode, void *holder) { vdev_bdev_mode_t bmode = vdev_bdev_mode(smode); -#if defined(HAVE_BDEV_OPEN_BY_PATH) +#if defined(HAVE_BDEV_FILE_OPEN_BY_PATH) + return (bdev_file_open_by_path(path, bmode, holder, NULL)); +#elif defined(HAVE_BDEV_OPEN_BY_PATH) return (bdev_open_by_path(path, bmode, holder, NULL)); #elif defined(HAVE_BLKDEV_GET_BY_PATH_4ARG) return (blkdev_get_by_path(path, bmode, holder, NULL)); @@ -258,8 +270,10 @@ vdev_blkdev_put(zfs_bdev_handle_t *bdh, spa_mode_t smode, void *holder) return (bdev_release(bdh)); #elif defined(HAVE_BLKDEV_PUT_HOLDER) return (blkdev_put(BDH_BDEV(bdh), holder)); -#else +#elif defined(HAVE_BLKDEV_PUT) return (blkdev_put(BDH_BDEV(bdh), vdev_bdev_mode(smode))); +#else + fput(bdh); #endif } -- cgit v1.2.3 From 9a7ef02f4dcda27cf77283e4ef497603d20bf00d Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Wed, 27 Mar 2024 11:24:57 +1100 Subject: Linux 6.9 compat: blk_alloc_disk() now takes two args There's an extra nullable arg for queue limits. Detect it, and set it to NULL. Similar change for blk_mq_alloc_disk(), now three args, same treatment. Error return now has error encoded in the return, so detect with IS_ERR() and explicitly NULL our own return. Reviewed-by: Tony Hutter Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Sponsored-by: https://despairlabs.com/sponsor/ Closes #16027 Closes #16033 --- config/kernel-make-request-fn.m4 | 33 +++++++++++++++++++++++++++++++++ module/os/linux/zfs/zvol_os.c | 23 ++++++++++++++++++++++- 2 files changed, 55 insertions(+), 1 deletion(-) diff --git a/config/kernel-make-request-fn.m4 b/config/kernel-make-request-fn.m4 index 4d20dd45c4a1..9813ad2fb3f3 100644 --- a/config/kernel-make-request-fn.m4 +++ b/config/kernel-make-request-fn.m4 @@ -50,6 +50,14 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_MAKE_REQUEST_FN], [ disk = blk_alloc_disk(NUMA_NO_NODE); ]) + ZFS_LINUX_TEST_SRC([blk_alloc_disk_2arg], [ + #include + ],[ + struct queue_limits *lim = NULL; + struct gendisk *disk __attribute__ ((unused)); + disk = blk_alloc_disk(lim, NUMA_NO_NODE); + ]) + ZFS_LINUX_TEST_SRC([blk_cleanup_disk], [ #include ],[ @@ -96,6 +104,31 @@ AC_DEFUN([ZFS_AC_KERNEL_MAKE_REQUEST_FN], [ ], [ AC_MSG_RESULT(no) ]) + + dnl # + dnl # Linux 6.9 API Change: + dnl # blk_alloc_queue() takes a nullable queue_limits arg. + dnl # + AC_MSG_CHECKING([whether blk_alloc_disk() exists and takes 2 args]) + ZFS_LINUX_TEST_RESULT([blk_alloc_disk_2arg], [ + AC_MSG_RESULT(yes) + AC_DEFINE([HAVE_BLK_ALLOC_DISK_2ARG], 1, [blk_alloc_disk() exists and takes 2 args]) + + dnl # + dnl # 5.20 API change, + dnl # Removed blk_cleanup_disk(), put_disk() should be used. 
+ dnl # + AC_MSG_CHECKING([whether blk_cleanup_disk() exists]) + ZFS_LINUX_TEST_RESULT([blk_cleanup_disk], [ + AC_MSG_RESULT(yes) + AC_DEFINE([HAVE_BLK_CLEANUP_DISK], 1, + [blk_cleanup_disk() exists]) + ], [ + AC_MSG_RESULT(no) + ]) + ], [ + AC_MSG_RESULT(no) + ]) ],[ AC_MSG_RESULT(no) diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c index cfb69dc06904..c7360293f0e6 100644 --- a/module/os/linux/zfs/zvol_os.c +++ b/module/os/linux/zfs/zvol_os.c @@ -1082,6 +1082,16 @@ zvol_alloc_non_blk_mq(struct zvol_state_os *zso) if (zso->zvo_disk == NULL) return (1); + zso->zvo_disk->minors = ZVOL_MINORS; + zso->zvo_queue = zso->zvo_disk->queue; +#elif defined(HAVE_BLK_ALLOC_DISK_2ARG) + struct gendisk *disk = blk_alloc_disk(NULL, NUMA_NO_NODE); + if (IS_ERR(disk)) { + zso->zvo_disk = NULL; + return (1); + } + + zso->zvo_disk = disk; zso->zvo_disk->minors = ZVOL_MINORS; zso->zvo_queue = zso->zvo_disk->queue; #else @@ -1132,6 +1142,17 @@ zvol_alloc_blk_mq(zvol_state_t *zv) } zso->zvo_queue = zso->zvo_disk->queue; zso->zvo_disk->minors = ZVOL_MINORS; +#elif defined(HAVE_BLK_ALLOC_DISK_2ARG) + struct gendisk *disk = blk_mq_alloc_disk(&zso->tag_set, NULL, zv); + if (IS_ERR(disk)) { + zso->zvo_disk = NULL; + blk_mq_free_tag_set(&zso->tag_set); + return (1); + } + + zso->zvo_disk = disk; + zso->zvo_queue = zso->zvo_disk->queue; + zso->zvo_disk->minors = ZVOL_MINORS; #else zso->zvo_disk = alloc_disk(ZVOL_MINORS); if (zso->zvo_disk == NULL) { @@ -1285,7 +1306,7 @@ zvol_os_free(zvol_state_t *zv) del_gendisk(zv->zv_zso->zvo_disk); #if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \ - defined(HAVE_BLK_ALLOC_DISK) + (defined(HAVE_BLK_ALLOC_DISK) || defined(HAVE_BLK_ALLOC_DISK_2ARG)) #if defined(HAVE_BLK_CLEANUP_DISK) blk_cleanup_disk(zv->zv_zso->zvo_disk); #else -- cgit v1.2.3 From 8a56047135b12d406d81c30051a1a2bbd9d3a983 Mon Sep 17 00:00:00 2001 From: Umer Saleem Date: Fri, 12 Apr 2024 03:10:24 +0500 Subject: Add support for zfs mount -R This commit adds support for mounting a dataset along with all of it's children with '-R' flag for zfs mount. There can be scenarios where we want to mount all datasets under one hierarchy instead of mounting all datasets present on system with '-a' flag. '-R' flag should work on all root and non-root datasets. Usage information and man page has been updated for zfs mount. A test for verifying the behavior for '-R' flag is also added. 
Reviewed-by: Ameer Hamza Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Umer Saleem Closes #16015 --- cmd/zfs/zfs_main.c | 75 +++++++++-- man/man8/zfs-mount.8 | 6 +- tests/runfiles/common.run | 2 +- tests/runfiles/sanity.run | 3 +- tests/zfs-tests/tests/Makefile.am | 1 + .../functional/cli_root/zfs_mount/zfs_mount.cfg | 1 + .../cli_root/zfs_mount/zfs_mount_recursive.ksh | 146 +++++++++++++++++++++ 7 files changed, 216 insertions(+), 18 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_recursive.ksh diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c index 3017de9ee73b..9f81292f06e7 100644 --- a/cmd/zfs/zfs_main.c +++ b/cmd/zfs/zfs_main.c @@ -309,7 +309,8 @@ get_usage(zfs_help_t idx) "[filesystem|volume|snapshot] ...\n")); case HELP_MOUNT: return (gettext("\tmount\n" - "\tmount [-flvO] [-o opts] <-a | filesystem>\n")); + "\tmount [-flvO] [-o opts] <-a|-R filesystem|" + "filesystem>\n")); case HELP_PROMOTE: return (gettext("\tpromote \n")); case HELP_RECEIVE: @@ -6750,6 +6751,8 @@ zfs_do_holds(int argc, char **argv) #define MOUNT_TIME 1 /* seconds */ typedef struct get_all_state { + char **ga_datasets; + int ga_count; boolean_t ga_verbose; get_all_cb_t *ga_cbp; } get_all_state_t; @@ -6796,19 +6799,35 @@ get_one_dataset(zfs_handle_t *zhp, void *data) return (0); } -static void -get_all_datasets(get_all_cb_t *cbp, boolean_t verbose) +static int +get_recursive_datasets(zfs_handle_t *zhp, void *data) { - get_all_state_t state = { - .ga_verbose = verbose, - .ga_cbp = cbp - }; + get_all_state_t *state = data; + int len = strlen(zfs_get_name(zhp)); + for (int i = 0; i < state->ga_count; ++i) { + if (strcmp(state->ga_datasets[i], zfs_get_name(zhp)) == 0) + return (get_one_dataset(zhp, data)); + else if ((strncmp(state->ga_datasets[i], zfs_get_name(zhp), + len) == 0) && state->ga_datasets[i][len] == '/') { + (void) zfs_iter_filesystems_v2(zhp, 0, + get_recursive_datasets, data); + } + } + zfs_close(zhp); + return (0); +} - if (verbose) +static void +get_all_datasets(get_all_state_t *state) +{ + if (state->ga_verbose) set_progress_header(gettext("Reading ZFS config")); - (void) zfs_iter_root(g_zfs, get_one_dataset, &state); + if (state->ga_datasets == NULL) + (void) zfs_iter_root(g_zfs, get_one_dataset, state); + else + (void) zfs_iter_root(g_zfs, get_recursive_datasets, state); - if (verbose) + if (state->ga_verbose) finish_progress(gettext("done.")); } @@ -7154,18 +7173,22 @@ static int share_mount(int op, int argc, char **argv) { int do_all = 0; + int recursive = 0; boolean_t verbose = B_FALSE; int c, ret = 0; char *options = NULL; int flags = 0; /* check options */ - while ((c = getopt(argc, argv, op == OP_MOUNT ? ":alvo:Of" : "al")) + while ((c = getopt(argc, argv, op == OP_MOUNT ? 
":aRlvo:Of" : "al")) != -1) { switch (c) { case 'a': do_all = 1; break; + case 'R': + recursive = 1; + break; case 'v': verbose = B_TRUE; break; @@ -7207,7 +7230,7 @@ share_mount(int op, int argc, char **argv) argv += optind; /* check number of arguments */ - if (do_all) { + if (do_all || recursive) { enum sa_protocol protocol = SA_NO_PROTOCOL; if (op == OP_SHARE && argc > 0) { @@ -7216,14 +7239,38 @@ share_mount(int op, int argc, char **argv) argv++; } - if (argc != 0) { + if (argc != 0 && do_all) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } + if (argc == 0 && recursive) { + (void) fprintf(stderr, + gettext("no dataset provided\n")); + usage(B_FALSE); + } + start_progress_timer(); get_all_cb_t cb = { 0 }; - get_all_datasets(&cb, verbose); + get_all_state_t state = { 0 }; + if (argc == 0) { + state.ga_datasets = NULL; + state.ga_count = -1; + } else { + zfs_handle_t *zhp; + for (int i = 0; i < argc; i++) { + zhp = zfs_open(g_zfs, argv[i], + ZFS_TYPE_FILESYSTEM); + if (zhp == NULL) + usage(B_FALSE); + zfs_close(zhp); + } + state.ga_datasets = argv; + state.ga_count = argc; + } + state.ga_verbose = verbose; + state.ga_cbp = &cb; + get_all_datasets(&state); if (cb.cb_used == 0) { free(options); diff --git a/man/man8/zfs-mount.8 b/man/man8/zfs-mount.8 index 35aa187cf063..20dbe4d0e648 100644 --- a/man/man8/zfs-mount.8 +++ b/man/man8/zfs-mount.8 @@ -43,7 +43,7 @@ .Cm mount .Op Fl Oflv .Op Fl o Ar options -.Fl a Ns | Ns Ar filesystem +.Fl a Ns | Ns Fl R Ar filesystem Ns | Ns Ar filesystem .Nm zfs .Cm unmount .Op Fl fu @@ -61,7 +61,7 @@ Displays all ZFS file systems currently mounted. .Cm mount .Op Fl Oflv .Op Fl o Ar options -.Fl a Ns | Ns Ar filesystem +.Fl a Ns | Ns Fl R Ar filesystem Ns | Ns Ar filesystem .Xc Mount ZFS filesystem on a path described by its .Sy mountpoint @@ -83,6 +83,8 @@ for more information. .It Fl a Mount all available ZFS file systems. Invoked automatically as part of the boot process if configured. +.It Fl R +Mount the specified filesystems along with all their children. .It Ar filesystem Mount the specified filesystem. 
.It Fl o Ar options diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 7331244515f6..c4afde554da5 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -246,7 +246,7 @@ tests = ['zfs_mount_001_pos', 'zfs_mount_002_pos', 'zfs_mount_003_pos', 'zfs_mount_009_neg', 'zfs_mount_010_neg', 'zfs_mount_011_neg', 'zfs_mount_012_pos', 'zfs_mount_all_001_pos', 'zfs_mount_encrypted', 'zfs_mount_remount', 'zfs_mount_all_fail', 'zfs_mount_all_mountpoints', - 'zfs_mount_test_race'] + 'zfs_mount_test_race', 'zfs_mount_recursive'] tags = ['functional', 'cli_root', 'zfs_mount'] [tests/functional/cli_root/zfs_program] diff --git a/tests/runfiles/sanity.run b/tests/runfiles/sanity.run index ab41c05b8473..695d3697a602 100644 --- a/tests/runfiles/sanity.run +++ b/tests/runfiles/sanity.run @@ -155,7 +155,8 @@ tests = ['zfs_mount_001_pos', 'zfs_mount_002_pos', 'zfs_mount_003_pos', 'zfs_mount_004_pos', 'zfs_mount_005_pos', 'zfs_mount_007_pos', 'zfs_mount_009_neg', 'zfs_mount_010_neg', 'zfs_mount_011_neg', 'zfs_mount_012_pos', 'zfs_mount_encrypted', 'zfs_mount_remount', - 'zfs_mount_all_fail', 'zfs_mount_all_mountpoints', 'zfs_mount_test_race'] + 'zfs_mount_all_fail', 'zfs_mount_all_mountpoints', + 'zfs_mount_test_race', 'zfs_mount_recursive'] tags = ['functional', 'cli_root', 'zfs_mount'] [tests/functional/cli_root/zfs_program] diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index e2824ee065e8..f587e265f15e 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -769,6 +769,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zfs_mount/zfs_mount_all_fail.ksh \ functional/cli_root/zfs_mount/zfs_mount_all_mountpoints.ksh \ functional/cli_root/zfs_mount/zfs_mount_encrypted.ksh \ + functional/cli_root/zfs_mount/zfs_mount_recursive.ksh \ functional/cli_root/zfs_mount/zfs_mount_remount.ksh \ functional/cli_root/zfs_mount/zfs_mount_test_race.ksh \ functional/cli_root/zfs_mount/zfs_multi_mount.ksh \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount.cfg b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount.cfg index 06d25faf0356..739baf16086a 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount.cfg +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount.cfg @@ -31,6 +31,7 @@ export mountcmd=mount export mountforce="$mountcmd -f" export mountall="$mountcmd -a" +export mountrecursive="$mountcmd -R" export unmountcmd=unmount export unmountforce="$unmountcmd -f" diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_recursive.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_recursive.ksh new file mode 100755 index 000000000000..0e5cc5d6955e --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_recursive.ksh @@ -0,0 +1,146 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2024, iXsystems Inc. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zfs_mount/zfs_mount.kshlib + +# +# DESCRIPTION: +# Verify zfs mount -R functionality. +# +# STRATEGY: +# 1. Create nested datasets +# 2. Unmount all datasets +# 3. Recusrively mount root datasets, this should mount all datasets +# present in a pool +# 4. Unmount all datasets +# 5. Recusrsively mount child datasets with children. This should mount +# child datasets, but not the root dataset or parent datasets +# 6. Unmount all datasets +# 7. Mount root dataset recursively again and confirm all child +# datasets are mounted. +# + +verify_runnable "both" + +function cleanup +{ + log_must datasetexists $TESTPOOL/$TESTFS1 && \ + destroy_dataset $TESTPOOL/$TESTFS1 -R + log_must datasetexists $TESTPOOL/$TESTFS2 && \ + destroy_dataset $TESTPOOL/$TESTFS2 -R + log_must datasetexists $TESTPOOL/$TESTFS3 && \ + destroy_dataset $TESTPOOL/$TESTFS3 -R +} + +function setup_all +{ + log_must datasetexists $TESTPOOL/$TESTFS || zfs create $TESTPOOL/$TESTFS + log_must zfs create $TESTPOOL/$TESTFS1 + log_must zfs create $TESTPOOL/$TESTFS2 + log_must zfs create $TESTPOOL/$TESTFS3 + log_must zfs create $TESTPOOL/$TESTFS2/child1 + log_must zfs create $TESTPOOL/$TESTFS2/child2 + log_must zfs create $TESTPOOL/$TESTFS2/child3 + log_must zfs create $TESTPOOL/$TESTFS2/child2/subchild + log_must zfs create $TESTPOOL/$TESTFS3/child +} + +log_assert "Verify that 'zfs $mountrecursive' successfully, " \ + "mounts the dataset along with all its children." 
+ +log_onexit cleanup + +log_must setup_all + +log_must zfs $unmountall + +log_must zfs $mountrecursive $TESTPOOL + +log_must mounted $TESTPOOL +log_must mounted $TESTPOOL/$TESTFS +log_must mounted $TESTPOOL/$TESTFS1 +log_must mounted $TESTPOOL/$TESTFS2 +log_must mounted $TESTPOOL/$TESTFS3 +log_must mounted $TESTPOOL/$TESTFS2/child1 +log_must mounted $TESTPOOL/$TESTFS2/child2 +log_must mounted $TESTPOOL/$TESTFS2/child3 +log_must mounted $TESTPOOL/$TESTFS2/child2/subchild +log_must mounted $TESTPOOL/$TESTFS3/child + +log_must zfs $unmountall + +log_mustnot mounted $TESTPOOL +log_mustnot mounted $TESTPOOL/$TESTFS +log_mustnot mounted $TESTPOOL/$TESTFS1 +log_mustnot mounted $TESTPOOL/$TESTFS2 +log_mustnot mounted $TESTPOOL/$TESTFS3 +log_mustnot mounted $TESTPOOL/$TESTFS2/child1 +log_mustnot mounted $TESTPOOL/$TESTFS2/child2 +log_mustnot mounted $TESTPOOL/$TESTFS2/child3 +log_mustnot mounted $TESTPOOL/$TESTFS2/child2/subchild +log_mustnot mounted $TESTPOOL/$TESTFS3/child + +log_must zfs $mountrecursive $TESTPOOL/$TESTFS2 $TESTPOOL/$TESTFS3 + +log_mustnot mounted $TESTPOOL +log_mustnot mounted $TESTPOOL/$TESTFS +log_mustnot mounted $TESTPOOL/$TESTFS1 +log_must mounted $TESTPOOL/$TESTFS2 +log_must mounted $TESTPOOL/$TESTFS3 +log_must mounted $TESTPOOL/$TESTFS2/child1 +log_must mounted $TESTPOOL/$TESTFS2/child2 +log_must mounted $TESTPOOL/$TESTFS2/child3 +log_must mounted $TESTPOOL/$TESTFS2/child2/subchild +log_must mounted $TESTPOOL/$TESTFS3/child + +log_must zfs $unmountall + +log_mustnot mounted $TESTPOOL +log_mustnot mounted $TESTPOOL/$TESTFS +log_mustnot mounted $TESTPOOL/$TESTFS1 +log_mustnot mounted $TESTPOOL/$TESTFS2 +log_mustnot mounted $TESTPOOL/$TESTFS3 +log_mustnot mounted $TESTPOOL/$TESTFS2/child1 +log_mustnot mounted $TESTPOOL/$TESTFS2/child2 +log_mustnot mounted $TESTPOOL/$TESTFS2/child3 +log_mustnot mounted $TESTPOOL/$TESTFS2/child2/subchild +log_mustnot mounted $TESTPOOL/$TESTFS3/child + +log_must zfs $mountrecursive $TESTPOOL/$TESTFS2/child2 + +log_must mounted $TESTPOOL/$TESTFS2/child2 +log_must mounted $TESTPOOL/$TESTFS2/child2/subchild +log_mustnot mounted $TESTPOOL +log_mustnot mounted $TESTPOOL/$TESTFS +log_mustnot mounted $TESTPOOL/$TESTFS1 +log_mustnot mounted $TESTPOOL/$TESTFS2 +log_mustnot mounted $TESTPOOL/$TESTFS3 +log_mustnot mounted $TESTPOOL/$TESTFS2/child1 +log_mustnot mounted $TESTPOOL/$TESTFS2/child3 +log_mustnot mounted $TESTPOOL/$TESTFS3/child + +log_pass "'zfs $mountrecursive' behaves as expected." -- cgit v1.2.3 From b0b0d07b13aebc4c200d635aab53e021b7ac95b3 Mon Sep 17 00:00:00 2001 From: Shengqi Chen Date: Thu, 4 Apr 2024 09:04:15 +0800 Subject: man: move zfs_prepare_disk.8 to nodist_man_MANS The commit b53077a added zfs_prepare_disk.8 to the wrong list dist_man_MANS, in which @zfsexecdir@ will not be properly substituted. This leads to wrong path in the manpage in generated release tarballs. 
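A minimal sketch of why the list matters (illustrative entries, not the real file contents):

    dist_man_MANS   = man8/static_page.8      # shipped verbatim in the release tarball
    nodist_man_MANS = man8/generated_page.8   # left out of the tarball and rebuilt from its
                                              # .in template at build time, so @zfsexecdir@
                                              # is substituted for the installing system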
Reported-by: Benda Xu Reviewed-by: Tony Hutter Reviewed-by: Brian Behlendorf Signed-off-by: Shengqi Chen Closes #15979 --- man/Makefile.am | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/man/Makefile.am b/man/Makefile.am index 45156571eec3..43bb014ddd32 100644 --- a/man/Makefile.am +++ b/man/Makefile.am @@ -62,7 +62,6 @@ dist_man_MANS = \ %D%/man8/zfs-userspace.8 \ %D%/man8/zfs-wait.8 \ %D%/man8/zfs_ids_to_path.8 \ - %D%/man8/zfs_prepare_disk.8 \ %D%/man8/zgenhostid.8 \ %D%/man8/zinject.8 \ %D%/man8/zpool.8 \ @@ -115,7 +114,8 @@ endif nodist_man_MANS = \ %D%/man8/zed.8 \ - %D%/man8/zfs-mount-generator.8 + %D%/man8/zfs-mount-generator.8 \ + %D%/man8/zfs_prepare_disk.8 dist_noinst_DATA += $(dist_noinst_man_MANS) $(dist_man_MANS) -- cgit v1.2.3 From baaac316554273f84c8bec06bddb7c5a09967c09 Mon Sep 17 00:00:00 2001 From: Benda Xu Date: Wed, 10 Apr 2024 07:34:58 +0800 Subject: config/Substfiles.am: restrict to the dedicated list. We recover the scope of $(SUBSTFILES) to explicitly control what files are being generated from the corresponding .in. Reviewed-by: Brian Behlendorf Signed-off-by: Benda Xu Closes #15980 --- config/Substfiles.am | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/Substfiles.am b/config/Substfiles.am index 38e870b2f501..809eaef2330a 100644 --- a/config/Substfiles.am +++ b/config/Substfiles.am @@ -43,4 +43,4 @@ SUBSTFILES = CLEANFILES += $(SUBSTFILES) dist_noinst_DATA += $(SUBSTFILES:=.in) -$(call SUBST,%,) +$(SUBSTFILES): $(call SUBST,%,) -- cgit v1.2.3 From 6732e223bf700bd8bb45d6af6221f5a9418b1493 Mon Sep 17 00:00:00 2001 From: Benda Xu Date: Tue, 9 Apr 2024 07:52:24 +0800 Subject: etc/init.d: decide which variant to use at build time. Let Debian use the sysv-rc variant of the script, even when OpenRC is installed. Unlike on Gentoo, OpenRC on Debian consumes both the sysv-rc scripts and OpenRC ones. ZFS initscripts on Debian should be the sysv-rc version to provide most compatibility and to integrate with the rest of initscripts for dependency tracking. Restrict the substitution in the Makefile to the dedicated list. This construct is inspired by Mo Zhou's detection of the execution shell and follows the strategy of Peter in 6ef28c526ba7. As of 2024, the initscripts are mostly relevant on Debian, Gentoo and their derivatives. 
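Concretely, configure substitutes @IS_SYSV_RC@ with a plain shell command, so the guard in each script collapses to one of two forms (rough sketch, not the full script):

    # Debian and other sysv-rc style distributions (IS_SYSV_RC=true)
    if true
    then
        case "$1" in
            start) do_start ;;
            stop)  do_stop ;;
            # ... the sysv-rc dispatch runs as before
        esac
    fi

    # Gentoo/Alpine with openrc-run (IS_SYSV_RC=false): the same guard becomes
    # "if false", the dispatch is skipped, and OpenRC instead drives the script
    # through its own start/stop handlers.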
Reviewed-by: Brian Behlendorf Signed-off-by: Benda Xu Issue #8063 Issue #8204 Issue #8359 Closes #15977 --- config/Substfiles.am | 1 + config/zfs-build.m4 | 8 +++++--- etc/init.d/README.md | 6 +----- etc/init.d/zfs-import.in | 2 +- etc/init.d/zfs-load-key.in | 2 +- etc/init.d/zfs-mount.in | 2 +- etc/init.d/zfs-share.in | 3 ++- etc/init.d/zfs-zed.in | 3 ++- 8 files changed, 14 insertions(+), 13 deletions(-) diff --git a/config/Substfiles.am b/config/Substfiles.am index 809eaef2330a..2459637abe6e 100644 --- a/config/Substfiles.am +++ b/config/Substfiles.am @@ -18,6 +18,7 @@ subst_sed_cmd = \ -e 's|@ASAN_ENABLED[@]|$(ASAN_ENABLED)|g' \ -e 's|@DEFAULT_INIT_NFS_SERVER[@]|$(DEFAULT_INIT_NFS_SERVER)|g' \ -e 's|@DEFAULT_INIT_SHELL[@]|$(DEFAULT_INIT_SHELL)|g' \ + -e 's|@IS_SYSV_RC[@]|$(IS_SYSV_RC)|g' \ -e 's|@LIBFETCH_DYNAMIC[@]|$(LIBFETCH_DYNAMIC)|g' \ -e 's|@LIBFETCH_SONAME[@]|$(LIBFETCH_SONAME)|g' \ -e 's|@PYTHON[@]|$(PYTHON)|g' \ diff --git a/config/zfs-build.m4 b/config/zfs-build.m4 index 5f36569fe25b..bb5a85d815d1 100644 --- a/config/zfs-build.m4 +++ b/config/zfs-build.m4 @@ -578,13 +578,15 @@ AC_DEFUN([ZFS_AC_DEFAULT_PACKAGE], [ AC_MSG_CHECKING([default shell]) case "$VENDOR" in - gentoo) DEFAULT_INIT_SHELL="/sbin/openrc-run";; - alpine) DEFAULT_INIT_SHELL="/sbin/openrc-run";; - *) DEFAULT_INIT_SHELL="/bin/sh" ;; + gentoo|alpine) DEFAULT_INIT_SHELL=/sbin/openrc-run + IS_SYSV_RC=false ;; + *) DEFAULT_INIT_SHELL=/bin/sh + IS_SYSV_RC=true ;; esac AC_MSG_RESULT([$DEFAULT_INIT_SHELL]) AC_SUBST(DEFAULT_INIT_SHELL) + AC_SUBST(IS_SYSV_RC) AC_MSG_CHECKING([default nfs server init script]) AS_IF([test "$VENDOR" = "debian"], diff --git a/etc/init.d/README.md b/etc/init.d/README.md index 2de05042ce63..da780fdc1222 100644 --- a/etc/init.d/README.md +++ b/etc/init.d/README.md @@ -7,11 +7,7 @@ DESCRIPTION They have been tested successfully on: - * Debian GNU/Linux Wheezy - * Debian GNU/Linux Jessie - * Ubuntu Trusty - * CentOS 6.0 - * CentOS 6.6 + * Debian GNU/Linux Bookworm * Gentoo SUPPORT diff --git a/etc/init.d/zfs-import.in b/etc/init.d/zfs-import.in index a9a0604f81ac..ff169eb96d86 100755 --- a/etc/init.d/zfs-import.in +++ b/etc/init.d/zfs-import.in @@ -307,7 +307,7 @@ do_start() # ---------------------------------------------------- -if [ ! -e /sbin/openrc-run ] +if @IS_SYSV_RC@ then case "$1" in start) diff --git a/etc/init.d/zfs-load-key.in b/etc/init.d/zfs-load-key.in index 53c7766b793a..27dfeeb0bcc5 100755 --- a/etc/init.d/zfs-load-key.in +++ b/etc/init.d/zfs-load-key.in @@ -104,7 +104,7 @@ do_stop() # ---------------------------------------------------- -if [ ! -e /sbin/openrc-run ] +if @IS_SYSV_RC@ then case "$1" in start) diff --git a/etc/init.d/zfs-mount.in b/etc/init.d/zfs-mount.in index a0825f19fcdd..6a3ca5f86908 100755 --- a/etc/init.d/zfs-mount.in +++ b/etc/init.d/zfs-mount.in @@ -114,7 +114,7 @@ do_stop() # ---------------------------------------------------- -if [ ! -e /sbin/openrc-run ] +if @IS_SYSV_RC@ then case "$1" in start) diff --git a/etc/init.d/zfs-share.in b/etc/init.d/zfs-share.in index 88978071cbf6..06c59c620b75 100755 --- a/etc/init.d/zfs-share.in +++ b/etc/init.d/zfs-share.in @@ -57,7 +57,8 @@ do_stop() # ---------------------------------------------------- -if [ ! 
-e /sbin/openrc-run ]; then +if @IS_SYSV_RC@ +then case "$1" in start) do_start diff --git a/etc/init.d/zfs-zed.in b/etc/init.d/zfs-zed.in index e9cf8867403c..3d40600cea5d 100755 --- a/etc/init.d/zfs-zed.in +++ b/etc/init.d/zfs-zed.in @@ -93,7 +93,8 @@ do_reload() # ---------------------------------------------------- -if [ ! -e /sbin/openrc-run ]; then +if @IS_SYSV_RC@ +then case "$1" in start) do_start -- cgit v1.2.3 From d6da6cbd74f3de98de0e734fa2e00094bc0fc487 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dag-Erling=20Sm=C3=B8rgrav?= Date: Wed, 30 Aug 2023 17:13:06 +0200 Subject: Clean up existing VERIFY*() macros. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Chiefly: - Remove unnecessary parentheses around variable names. - Remove spaces between the type and variable in casts. - Make the panic message for VERIFY0() reflect how the macro is used. - Use %p to format pointers, except in Linux kernel code. Reviewed-by: Brian Behlendorf Reviewed-by: Kay Pedersen Reviewed-by: Alexander Motin Signed-off-by: Dag-Erling Smørgrav Closes #15225 --- include/os/freebsd/spl/sys/debug.h | 27 +++++++++++++-------------- include/os/linux/spl/sys/debug.h | 25 ++++++++++++------------- lib/libspl/include/assert.h | 4 ++-- 3 files changed, 27 insertions(+), 29 deletions(-) diff --git a/include/os/freebsd/spl/sys/debug.h b/include/os/freebsd/spl/sys/debug.h index 3e67cf0e9a7d..b29d0daecc4b 100644 --- a/include/os/freebsd/spl/sys/debug.h +++ b/include/os/freebsd/spl/sys/debug.h @@ -89,8 +89,8 @@ spl_assert(const char *buf, const char *file, const char *func, int line) spl_panic(__FILE__, __FUNCTION__, __LINE__, \ "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ "failed (%d " #OP " %d)\n", \ - (boolean_t)(_verify3_left), \ - (boolean_t)(_verify3_right)); \ + (boolean_t)_verify3_left, \ + (boolean_t)_verify3_right); \ } while (0) #define VERIFY3S(LEFT, OP, RIGHT) do { \ @@ -100,8 +100,8 @@ spl_assert(const char *buf, const char *file, const char *func, int line) spl_panic(__FILE__, __FUNCTION__, __LINE__, \ "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ "failed (%lld " #OP " %lld)\n", \ - (long long) (_verify3_left), \ - (long long) (_verify3_right)); \ + (long long)_verify3_left, \ + (long long)_verify3_right); \ } while (0) #define VERIFY3U(LEFT, OP, RIGHT) do { \ @@ -111,8 +111,8 @@ spl_assert(const char *buf, const char *file, const char *func, int line) spl_panic(__FILE__, __FUNCTION__, __LINE__, \ "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ "failed (%llu " #OP " %llu)\n", \ - (unsigned long long) (_verify3_left), \ - (unsigned long long) (_verify3_right)); \ + (unsigned long long)_verify3_left, \ + (unsigned long long)_verify3_right); \ } while (0) #define VERIFY3P(LEFT, OP, RIGHT) do { \ @@ -121,19 +121,18 @@ spl_assert(const char *buf, const char *file, const char *func, int line) if (unlikely(!(_verify3_left OP _verify3_right))) \ spl_panic(__FILE__, __FUNCTION__, __LINE__, \ "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ - "failed (%px " #OP " %px)\n", \ - (void *) (_verify3_left), \ - (void *) (_verify3_right)); \ + "failed (%p " #OP " %p)\n", \ + (void *)_verify3_left, \ + (void *)_verify3_right); \ } while (0) #define VERIFY0(RIGHT) do { \ - const int64_t _verify3_left = (int64_t)(0); \ - const int64_t _verify3_right = (int64_t)(RIGHT); \ - if (unlikely(!(_verify3_left == _verify3_right))) \ + const int64_t _verify0_right = (int64_t)(RIGHT); \ + if (unlikely(!(0 == _verify0_right))) \ spl_panic(__FILE__, __FUNCTION__, __LINE__, \ - "VERIFY0(0 == " #RIGHT ") " \ + 
"VERIFY0(" #RIGHT ") " \ "failed (0 == %lld)\n", \ - (long long) (_verify3_right)); \ + (long long)_verify0_right); \ } while (0) /* diff --git a/include/os/linux/spl/sys/debug.h b/include/os/linux/spl/sys/debug.h index 007238574fe1..9bcc2e1d1923 100644 --- a/include/os/linux/spl/sys/debug.h +++ b/include/os/linux/spl/sys/debug.h @@ -93,8 +93,8 @@ spl_assert(const char *buf, const char *file, const char *func, int line) spl_panic(__FILE__, __FUNCTION__, __LINE__, \ "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ "failed (%d " #OP " %d)\n", \ - (boolean_t)(_verify3_left), \ - (boolean_t)(_verify3_right)); \ + (boolean_t)_verify3_left, \ + (boolean_t)_verify3_right); \ } while (0) #define VERIFY3S(LEFT, OP, RIGHT) do { \ @@ -104,8 +104,8 @@ spl_assert(const char *buf, const char *file, const char *func, int line) spl_panic(__FILE__, __FUNCTION__, __LINE__, \ "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ "failed (%lld " #OP " %lld)\n", \ - (long long)(_verify3_left), \ - (long long)(_verify3_right)); \ + (long long)_verify3_left, \ + (long long)_verify3_right); \ } while (0) #define VERIFY3U(LEFT, OP, RIGHT) do { \ @@ -115,8 +115,8 @@ spl_assert(const char *buf, const char *file, const char *func, int line) spl_panic(__FILE__, __FUNCTION__, __LINE__, \ "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ "failed (%llu " #OP " %llu)\n", \ - (unsigned long long)(_verify3_left), \ - (unsigned long long)(_verify3_right)); \ + (unsigned long long)_verify3_left, \ + (unsigned long long)_verify3_right); \ } while (0) #define VERIFY3P(LEFT, OP, RIGHT) do { \ @@ -126,18 +126,17 @@ spl_assert(const char *buf, const char *file, const char *func, int line) spl_panic(__FILE__, __FUNCTION__, __LINE__, \ "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ "failed (%px " #OP " %px)\n", \ - (void *) (_verify3_left), \ - (void *) (_verify3_right)); \ + (void *)_verify3_left, \ + (void *)_verify3_right); \ } while (0) #define VERIFY0(RIGHT) do { \ - const int64_t _verify3_left = (int64_t)(0); \ - const int64_t _verify3_right = (int64_t)(RIGHT); \ - if (unlikely(!(_verify3_left == _verify3_right))) \ + const int64_t _verify0_right = (int64_t)(RIGHT); \ + if (unlikely(!(0 == _verify0_right))) \ spl_panic(__FILE__, __FUNCTION__, __LINE__, \ - "VERIFY0(0 == " #RIGHT ") " \ + "VERIFY0(" #RIGHT ") " \ "failed (0 == %lld)\n", \ - (long long) (_verify3_right)); \ + (long long)_verify0_right); \ } while (0) #define VERIFY_IMPLY(A, B) \ diff --git a/lib/libspl/include/assert.h b/lib/libspl/include/assert.h index af4957dfbaa6..8704e30da5c9 100644 --- a/lib/libspl/include/assert.h +++ b/lib/libspl/include/assert.h @@ -110,8 +110,8 @@ do { \ const uintptr_t __right = (uintptr_t)(RIGHT); \ if (!(__left OP __right)) \ libspl_assertf(__FILE__, __FUNCTION__, __LINE__, \ - "%s %s %s (0x%llx %s 0x%llx)", #LEFT, #OP, #RIGHT, \ - (u_longlong_t)__left, #OP, (u_longlong_t)__right); \ + "%s %s %s (%p %s %p)", #LEFT, #OP, #RIGHT, \ + (void *)__left, #OP, (void *)__right); \ } while (0) #define VERIFY0(LEFT) \ -- cgit v1.2.3 From 5dda8c09102606449bffc42c3013a0500cf02578 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dag-Erling=20Sm=C3=B8rgrav?= Date: Wed, 30 Aug 2023 17:13:09 +0200 Subject: Add VERIFY0P() and ASSERT0P() macros. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These macros are similar to VERIFY0() and ASSERT0() but are intended for pointers, and therefore use uintptr_t instead of int64_t. 
Reviewed-by: Brian Behlendorf Reviewed-by: Kay Pedersen Reviewed-by: Alexander Motin Signed-off-by: Dag-Erling Smørgrav Closes #15225 --- include/os/freebsd/spl/sys/debug.h | 13 +++++++++++++ include/os/linux/spl/sys/debug.h | 13 +++++++++++++ lib/libspl/include/assert.h | 11 +++++++++++ 3 files changed, 37 insertions(+) diff --git a/include/os/freebsd/spl/sys/debug.h b/include/os/freebsd/spl/sys/debug.h index b29d0daecc4b..785fcf62dd16 100644 --- a/include/os/freebsd/spl/sys/debug.h +++ b/include/os/freebsd/spl/sys/debug.h @@ -39,12 +39,14 @@ * ASSERT3U() - Assert unsigned X OP Y is true, if not panic. * ASSERT3P() - Assert pointer X OP Y is true, if not panic. * ASSERT0() - Assert value is zero, if not panic. + * ASSERT0P() - Assert pointer is null, if not panic. * VERIFY() - Verify X is true, if not panic. * VERIFY3B() - Verify boolean X OP Y is true, if not panic. * VERIFY3S() - Verify signed X OP Y is true, if not panic. * VERIFY3U() - Verify unsigned X OP Y is true, if not panic. * VERIFY3P() - Verify pointer X OP Y is true, if not panic. * VERIFY0() - Verify value is zero, if not panic. + * VERIFY0P() - Verify pointer is null, if not panic. */ #ifndef _SPL_DEBUG_H @@ -135,6 +137,15 @@ spl_assert(const char *buf, const char *file, const char *func, int line) (long long)_verify0_right); \ } while (0) +#define VERIFY0P(RIGHT) do { \ + const uintptr_t _verify0_right = (uintptr_t)(RIGHT); \ + if (unlikely(!(0 == _verify0_right))) \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ + "VERIFY0P(" #RIGHT ") " \ + "failed (NULL == %p)\n", \ + (void *)_verify0_right); \ + } while (0) + /* * Debugging disabled (--disable-debug) */ @@ -150,6 +161,7 @@ spl_assert(const char *buf, const char *file, const char *func, int line) #define ASSERT3P(x, y, z) \ ((void) sizeof ((uintptr_t)(x)), (void) sizeof ((uintptr_t)(z))) #define ASSERT0(x) ((void) sizeof ((uintptr_t)(x))) +#define ASSERT0P(x) ((void) sizeof ((uintptr_t)(x))) #define IMPLY(A, B) \ ((void) sizeof ((uintptr_t)(A)), (void) sizeof ((uintptr_t)(B))) #define EQUIV(A, B) \ @@ -165,6 +177,7 @@ spl_assert(const char *buf, const char *file, const char *func, int line) #define ASSERT3U VERIFY3U #define ASSERT3P VERIFY3P #define ASSERT0 VERIFY0 +#define ASSERT0P VERIFY0P #define ASSERT VERIFY #define IMPLY(A, B) \ ((void)(likely((!(A)) || (B)) || \ diff --git a/include/os/linux/spl/sys/debug.h b/include/os/linux/spl/sys/debug.h index 9bcc2e1d1923..288193ad21c5 100644 --- a/include/os/linux/spl/sys/debug.h +++ b/include/os/linux/spl/sys/debug.h @@ -34,12 +34,14 @@ * ASSERT3U() - Assert unsigned X OP Y is true, if not panic. * ASSERT3P() - Assert pointer X OP Y is true, if not panic. * ASSERT0() - Assert value is zero, if not panic. + * ASSERT0P() - Assert pointer is null, if not panic. * VERIFY() - Verify X is true, if not panic. * VERIFY3B() - Verify boolean X OP Y is true, if not panic. * VERIFY3S() - Verify signed X OP Y is true, if not panic. * VERIFY3U() - Verify unsigned X OP Y is true, if not panic. * VERIFY3P() - Verify pointer X OP Y is true, if not panic. * VERIFY0() - Verify value is zero, if not panic. + * VERIFY0P() - Verify pointer is null, if not panic. 
*/ #ifndef _SPL_DEBUG_H @@ -139,6 +141,15 @@ spl_assert(const char *buf, const char *file, const char *func, int line) (long long)_verify0_right); \ } while (0) +#define VERIFY0P(RIGHT) do { \ + const uintptr_t _verify0_right = (uintptr_t)(RIGHT); \ + if (unlikely(!(0 == _verify0_right))) \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ + "VERIFY0P(" #RIGHT ") " \ + "failed (NULL == %px)\n", \ + (void *)_verify0_right); \ + } while (0) + #define VERIFY_IMPLY(A, B) \ ((void)(likely((!(A)) || (B)) || \ spl_assert("(" #A ") implies (" #B ")", \ @@ -164,6 +175,7 @@ spl_assert(const char *buf, const char *file, const char *func, int line) #define ASSERT3P(x, y, z) \ ((void) sizeof ((uintptr_t)(x)), (void) sizeof ((uintptr_t)(z))) #define ASSERT0(x) ((void) sizeof ((uintptr_t)(x))) +#define ASSERT0P(x) ((void) sizeof ((uintptr_t)(x))) #define IMPLY(A, B) \ ((void) sizeof ((uintptr_t)(A)), (void) sizeof ((uintptr_t)(B))) #define EQUIV(A, B) \ @@ -179,6 +191,7 @@ spl_assert(const char *buf, const char *file, const char *func, int line) #define ASSERT3U VERIFY3U #define ASSERT3P VERIFY3P #define ASSERT0 VERIFY0 +#define ASSERT0P VERIFY0P #define ASSERT VERIFY #define IMPLY VERIFY_IMPLY #define EQUIV VERIFY_EQUIV diff --git a/lib/libspl/include/assert.h b/lib/libspl/include/assert.h index 8704e30da5c9..57f5719c1ac1 100644 --- a/lib/libspl/include/assert.h +++ b/lib/libspl/include/assert.h @@ -123,6 +123,15 @@ do { \ (u_longlong_t)__left); \ } while (0) +#define VERIFY0P(LEFT) \ +do { \ + const uintptr_t __left = (uintptr_t)(LEFT); \ + if (!(__left == 0)) \ + libspl_assertf(__FILE__, __FUNCTION__, __LINE__, \ + "%s == 0 (%p == 0)", #LEFT, \ + (void *)__left); \ +} while (0) + #ifdef assert #undef assert #endif @@ -137,6 +146,7 @@ do { \ #define ASSERT3P(x, y, z) \ ((void) sizeof ((uintptr_t)(x)), (void) sizeof ((uintptr_t)(z))) #define ASSERT0(x) ((void) sizeof ((uintptr_t)(x))) +#define ASSERT0P(x) ((void) sizeof ((uintptr_t)(x))) #define ASSERT(x) ((void) sizeof ((uintptr_t)(x))) #define assert(x) ((void) sizeof ((uintptr_t)(x))) #define IMPLY(A, B) \ @@ -149,6 +159,7 @@ do { \ #define ASSERT3U VERIFY3U #define ASSERT3P VERIFY3P #define ASSERT0 VERIFY0 +#define ASSERT0P VERIFY0P #define ASSERT VERIFY #define assert VERIFY #define IMPLY(A, B) \ -- cgit v1.2.3 From 9f1d3db73035665b51270925d24974d9e34e3cb3 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Fri, 16 Feb 2024 09:07:32 -0800 Subject: Check for minimum partition size On Linux block devices used for vdevs will by partitioned. The block device must be large enough for an 64M partition starting at offset of 2048 sectors (part1), and a second 64M reserved partition at the end of the device (part9). This commit adds a capacity check when creating the GPT label to immediately detect a device which is too small. With the existing code this would be caught slightly latter when attempting to use the partition. Catching it sooner let's us print a more useful error. 
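As a rough standalone illustration of the arithmetic behind the new check (hypothetical sizes; the real code operates on the EFI label fields shown in the diff below):

    /* sketch: a data partition below the 64 MiB minimum is rejected */
    #include <stdint.h>
    #include <stdio.h>

    #define SPA_MINDEVSIZE (64ULL << 20)   /* 64 MiB, matching the "(64M)" in the error text */

    int main(void) {
        uint64_t p_size  = 100000;  /* partition size in sectors (hypothetical) */
        uint64_t lbasize = 512;     /* logical block size in bytes */

        if (p_size * lbasize < SPA_MINDEVSIZE)
            printf("too small: %llu bytes < 64 MiB\n",
                (unsigned long long)(p_size * lbasize));
        return (0);
    }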
Reviewed-by: Tony Hutter Signed-off-by: Brian Behlendorf Closes #15898 --- lib/libzfs/os/linux/libzfs_pool_os.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/lib/libzfs/os/linux/libzfs_pool_os.c b/lib/libzfs/os/linux/libzfs_pool_os.c index 401151b1afb5..86eef3255bc2 100644 --- a/lib/libzfs/os/linux/libzfs_pool_os.c +++ b/lib/libzfs/os/linux/libzfs_pool_os.c @@ -273,6 +273,16 @@ zpool_label_disk(libzfs_handle_t *hdl, zpool_handle_t *zhp, const char *name) vtoc->efi_parts[0].p_start = start_block; vtoc->efi_parts[0].p_size = slice_size; + if (vtoc->efi_parts[0].p_size * vtoc->efi_lbasize < SPA_MINDEVSIZE) { + (void) close(fd); + efi_free(vtoc); + + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot " + "label '%s': partition would be less than the minimum " + "device size (64M)"), path); + return (zfs_error(hdl, EZFS_LABELFAILED, errbuf)); + } + /* * Why we use V_USR: V_BACKUP confuses users, and is considered * disposable by some EFI utilities (since EFI doesn't have a backup -- cgit v1.2.3 From 2ff09e8fed114834185805998fce1a11db93ce4d Mon Sep 17 00:00:00 2001 From: Rob N Date: Thu, 21 Mar 2024 04:08:50 +1100 Subject: freebsd: fix missing headers in distribution tarball arc_os.h and freebsd_event.h aren't included in release tarballs, so the build fails on FreeBSD. This fixes it. Sponsored-by: https://despairlabs.com/sponsor/ Reviewed-by: Tony Hutter Reviewed-by: Alexander Motin Reviewed-by: Tino Reichardt Signed-off-by: Rob Norris Closes #15963 --- include/os/freebsd/Makefile.am | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/os/freebsd/Makefile.am b/include/os/freebsd/Makefile.am index 9819e534b7f6..551f75f42a20 100644 --- a/include/os/freebsd/Makefile.am +++ b/include/os/freebsd/Makefile.am @@ -80,7 +80,9 @@ noinst_HEADERS = \ %D%/spl/sys/zmod.h \ %D%/spl/sys/zone.h \ \ + %D%/zfs/sys/arc_os.h \ %D%/zfs/sys/freebsd_crypto.h \ + %D%/zfs/sys/freebsd_event.h \ %D%/zfs/sys/vdev_os.h \ %D%/zfs/sys/zfs_bootenv_os.h \ %D%/zfs/sys/zfs_context_os.h \ -- cgit v1.2.3 From 67995229a865642f6669dea0d4bd41d34bed2046 Mon Sep 17 00:00:00 2001 From: Cameron Harr Date: Thu, 21 Mar 2024 09:00:29 -0700 Subject: Fix option string, adding -e and fixing order The recently added '-e' option (PR #15769) missed adding the new option in the online `zpool status` help command. This adds the options and reorders a couple of the other options that were not listed alphabetically. Reviewed-by: Brian Atkinson Reviewed-by: Brian Behlendorf Signed-off-by: Cameron Harr Closes #16008 --- cmd/zpool/zpool_main.c | 39 +++++++++++++++++++-------------------- man/man8/zpool-status.8 | 18 +++++++++--------- 2 files changed, 28 insertions(+), 29 deletions(-) diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 69bf9649acf6..9b864219a4bf 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -413,7 +413,7 @@ get_usage(zpool_help_t idx) "[ ...]\n")); case HELP_STATUS: return (gettext("\tstatus [--power] [-c [script1,script2,...]] " - "[-igLpPstvxD] [-T d|u] [pool] ... \n" + "[-DegiLpPstvx] [-T d|u] [pool] ...\n" "\t [interval [count]]\n")); case HELP_UPGRADE: return (gettext("\tupgrade\n" @@ -9064,22 +9064,22 @@ status_callback(zpool_handle_t *zhp, void *data) } /* - * zpool status [-c [script1,script2,...]] [-igLpPstvx] [--power] [-T d|u] ... + * zpool status [-c [script1,script2,...]] [-DegiLpPstvx] [--power] [-T d|u] ... 
* [pool] [interval [count]] * * -c CMD For each vdev, run command CMD + * -D Display dedup status (undocumented) * -e Display only unhealthy vdevs - * -i Display vdev initialization status. * -g Display guid for individual vdev name. + * -i Display vdev initialization status. * -L Follow links when resolving vdev path name. * -p Display values in parsable (exact) format. * -P Display full path for vdev name. * -s Display slow IOs column. - * -v Display complete error logs - * -x Display only pools with potential problems - * -D Display dedup status (undocumented) * -t Display vdev TRIM status. * -T Display a timestamp in date(1) or Unix format + * -v Display complete error logs + * -x Display only pools with potential problems * --power Display vdev enclosure slot power status * * Describes the health status of all pools or some subset. @@ -9100,7 +9100,7 @@ zpool_do_status(int argc, char **argv) }; /* check options */ - while ((c = getopt_long(argc, argv, "c:eigLpPsvxDtT:", long_options, + while ((c = getopt_long(argc, argv, "c:DegiLpPstT:vx", long_options, NULL)) != -1) { switch (c) { case 'c': @@ -9127,15 +9127,18 @@ zpool_do_status(int argc, char **argv) } cmd = optarg; break; + case 'D': + cb.cb_dedup_stats = B_TRUE; + break; case 'e': cb.cb_print_unhealthy = B_TRUE; break; - case 'i': - cb.cb_print_vdev_init = B_TRUE; - break; case 'g': cb.cb_name_flags |= VDEV_NAME_GUID; break; + case 'i': + cb.cb_print_vdev_init = B_TRUE; + break; case 'L': cb.cb_name_flags |= VDEV_NAME_FOLLOW_LINKS; break; @@ -9148,21 +9151,18 @@ zpool_do_status(int argc, char **argv) case 's': cb.cb_print_slow_ios = B_TRUE; break; - case 'v': - cb.cb_verbose = B_TRUE; - break; - case 'x': - cb.cb_explain = B_TRUE; - break; - case 'D': - cb.cb_dedup_stats = B_TRUE; - break; case 't': cb.cb_print_vdev_trim = B_TRUE; break; case 'T': get_timestamp_arg(*optarg); break; + case 'v': + cb.cb_verbose = B_TRUE; + break; + case 'x': + cb.cb_explain = B_TRUE; + break; case POWER_OPT: cb.cb_print_power = B_TRUE; break; @@ -9202,7 +9202,6 @@ zpool_do_status(int argc, char **argv) if (cb.vcdl != NULL) free_vdev_cmd_data_list(cb.vcdl); - if (argc == 0 && cb.cb_count == 0) (void) fprintf(stderr, gettext("no pools available\n")); else if (cb.cb_explain && cb.cb_first && cb.cb_allpools) diff --git a/man/man8/zpool-status.8 b/man/man8/zpool-status.8 index 24ad6e643cae..bbe7a45aa0c6 100644 --- a/man/man8/zpool-status.8 +++ b/man/man8/zpool-status.8 @@ -36,7 +36,7 @@ .Sh SYNOPSIS .Nm zpool .Cm status -.Op Fl DeigLpPstvx +.Op Fl DegiLpPstvx .Op Fl T Sy u Ns | Ns Sy d .Op Fl c Op Ar SCRIPT1 Ns Oo , Ns Ar SCRIPT2 Oc Ns … .Oo Ar pool Oc Ns … @@ -69,14 +69,20 @@ See the option of .Nm zpool Cm iostat for complete details. +.It Fl D +Display a histogram of deduplication statistics, showing the allocated +.Pq physically present on disk +and referenced +.Pq logically referenced in the pool +block counts and sizes by reference count. .It Fl e Only show unhealthy vdevs (not-ONLINE or with errors). -.It Fl i -Display vdev initialization status. .It Fl g Display vdev GUIDs instead of the normal device names These GUIDs can be used in place of device names for the zpool detach/offline/remove/replace commands. +.It Fl i +Display vdev initialization status. .It Fl L Display real paths for vdevs resolving all symbolic links. This can be used to look up the current block device name regardless of the @@ -90,12 +96,6 @@ the path. This can be used in conjunction with the .Fl L flag. 
-.It Fl D -Display a histogram of deduplication statistics, showing the allocated -.Pq physically present on disk -and referenced -.Pq logically referenced in the pool -block counts and sizes by reference count. .It Fl s Display the number of leaf vdev slow I/O operations. This is the number of I/O operations that didn't complete in -- cgit v1.2.3 From d088fb7d24083988f38e4224a6ef9cc71c12a6a3 Mon Sep 17 00:00:00 2001 From: Robert Evans Date: Wed, 27 Mar 2024 17:59:16 -0400 Subject: ZTS: fix flakiness in cp_files_002_pos Fix RANDOM to not return zero. Overwriting with `dd ... count=0` does not test anything. Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Reviewed-by: George Melikov Reviewed-by: Allan Jude Signed-off-by: Robert Evans Closes #16029 --- tests/zfs-tests/tests/functional/cp_files/cp_files_002_pos.ksh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/zfs-tests/tests/functional/cp_files/cp_files_002_pos.ksh b/tests/zfs-tests/tests/functional/cp_files/cp_files_002_pos.ksh index 60817449ab03..4db968ffae05 100755 --- a/tests/zfs-tests/tests/functional/cp_files/cp_files_002_pos.ksh +++ b/tests/zfs-tests/tests/functional/cp_files/cp_files_002_pos.ksh @@ -76,7 +76,7 @@ log_onexit cleanup SRC_FILE=src.data DST_FILE=dst.data -SRC_SIZE=$(($RANDOM % 2048)) +SRC_SIZE=$((1024 + $RANDOM % 1024)) # A smaller recordsize is used merely to speed up the test. RECORDSIZE=4096 @@ -120,7 +120,7 @@ for mode in "never" "auto" "always"; do # Overwrite a random range of an existing file and immediately copy it. sync_pool $TESTPOOL log_must dd if=/dev/urandom of=$SRC_FILE bs=$((RECORDSIZE / 2)) \ - seek=$(($RANDOM % $SRC_SIZE)) count=$(($RANDOM % 16)) conv=notrunc + seek=$(($RANDOM % $SRC_SIZE)) count=$((1 + $RANDOM % 16)) conv=notrunc if [[ "$mode" == "always" ]]; then log_mustnot cp --reflink=$mode $SRC_FILE $DST_FILE log_must ls -l $CP_TESTDIR @@ -152,7 +152,7 @@ for mode in "never" "auto" "always"; do # Overwrite a random range of an existing file and immediately copy it. log_must dd if=/dev/urandom of=$SRC_FILE bs=$((RECORDSIZE / 2)) \ - seek=$(($RANDOM % $SRC_SIZE)) count=$(($RANDOM % 16)) conv=notrunc + seek=$(($RANDOM % $SRC_SIZE)) count=$((1 + $RANDOM % 16)) conv=notrunc log_must cp --reflink=$mode $SRC_FILE $DST_FILE verify_copy $SRC_FILE $DST_FILE log_must rm -f $SRC_FILE $DST_FILE -- cgit v1.2.3 From e0cfa1592da065fb0e0ba7579ea4d152ce099323 Mon Sep 17 00:00:00 2001 From: Robert Evans Date: Fri, 29 Mar 2024 17:59:23 -0400 Subject: Fix buffer underflow if sysfs file is empty Reviewed-by: Alexander Motin Reviewed-by: Tony Hutter Reviewed-by: Jason Lee Signed-off-by: Robert Evans Closes #16028 Closes #16035 --- cmd/zpool/os/linux/zpool_vdev_os.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmd/zpool/os/linux/zpool_vdev_os.c b/cmd/zpool/os/linux/zpool_vdev_os.c index 006a3a7d8e01..80627b58211c 100644 --- a/cmd/zpool/os/linux/zpool_vdev_os.c +++ b/cmd/zpool/os/linux/zpool_vdev_os.c @@ -458,7 +458,7 @@ static char *zpool_sysfs_gets(char *path) } /* Remove trailing newline */ - if (buf[count - 1] == '\n') + if (count > 0 && buf[count - 1] == '\n') buf[count - 1] = 0; close(fd); -- cgit v1.2.3 From 5d859a2e22f8d2ff347803c4c8d025f7c6fd2ea9 Mon Sep 17 00:00:00 2001 From: Rob N Date: Thu, 4 Apr 2024 09:13:27 +1100 Subject: xdr: header cleanup #16047 notes that include/os/freebsd/spl/rpc/xdr.h carried an (apparently) incompatible license. 
While looking into it, it seems that this file is actually unnecessary these days - FreeBSD's kernel XDR has XDR_CONTROL, xdrmem_control and XDR_GET_BYTES_AVAIL, while userspace has XDR_CONTROL and xdrmem_control, and our implementation of XDR_GET_BYTES_AVAIL for libspl works nicely with it. So this removes that file outright. To keep the includes in nvpair.c tidy, I've made a few small adjustments to the Linux headers. By definition, rpc/types.h provides bool_t and is included before rpc/xdr.h, so I've created rpc/types.h for Linux. This isn't necessary for userspace; both FreeBSD native and tirpc on Linux already have these headers set up correctly. Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Sponsored-by: https://despairlabs.com/sponsor/ Closes #16047 Closes #16051 --- include/os/freebsd/Makefile.am | 2 -- include/os/freebsd/spl/rpc/xdr.h | 71 ---------------------------------------- include/os/linux/Makefile.am | 1 + include/os/linux/spl/rpc/types.h | 30 +++++++++++++++++ include/os/linux/spl/rpc/xdr.h | 2 -- module/nvpair/nvpair.c | 1 + module/os/linux/spl/spl-xdr.c | 1 + 7 files changed, 33 insertions(+), 75 deletions(-) delete mode 100644 include/os/freebsd/spl/rpc/xdr.h create mode 100644 include/os/linux/spl/rpc/types.h diff --git a/include/os/freebsd/Makefile.am b/include/os/freebsd/Makefile.am index 551f75f42a20..d4103c2f062a 100644 --- a/include/os/freebsd/Makefile.am +++ b/include/os/freebsd/Makefile.am @@ -4,8 +4,6 @@ noinst_HEADERS = \ \ %D%/spl/acl/acl_common.h \ \ - %D%/spl/rpc/xdr.h \ - \ %D%/spl/sys/ia32/asm_linkage.h \ \ %D%/spl/sys/acl.h \ diff --git a/include/os/freebsd/spl/rpc/xdr.h b/include/os/freebsd/spl/rpc/xdr.h deleted file mode 100644 index c98466e9d16a..000000000000 --- a/include/os/freebsd/spl/rpc/xdr.h +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Sun RPC is a product of Sun Microsystems, Inc. and is provided for - * unrestricted use provided that this legend is included on all tape - * media and as a part of the software program in whole or part. Users - * may copy or modify Sun RPC without charge, but are not authorized - * to license or distribute it to anyone else except as part of a product or - * program developed by the user. - * - * SUN RPC IS PROVIDED AS IS WITH NO WARRANTIES OF ANY KIND INCLUDING THE - * WARRANTIES OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE, OR ARISING FROM A COURSE OF DEALING, USAGE OR TRADE PRACTICE. - * - * Sun RPC is provided with no support and without any obligation on the - * part of Sun Microsystems, Inc. to assist in its use, correction, - * modification or enhancement. - * - * SUN MICROSYSTEMS, INC. SHALL HAVE NO LIABILITY WITH RESPECT TO THE - * INFRINGEMENT OF COPYRIGHTS, TRADE SECRETS OR ANY PATENTS BY SUN RPC - * OR ANY PART THEREOF. - * - * In no event will Sun Microsystems, Inc. be liable for any lost revenue - * or profits or other special, indirect and consequential damages, even if - * Sun has been advised of the possibility of such damages. - * - * Sun Microsystems, Inc. - * 2550 Garcia Avenue - * Mountain View, California 94043 - */ - -#ifndef _OPENSOLARIS_RPC_XDR_H_ -#define _OPENSOLARIS_RPC_XDR_H_ - -#include -#include_next - -#if !defined(_KERNEL) && !defined(_STANDALONE) - -#include - -/* - * Taken from sys/xdr/xdr_mem.c. - * - * FreeBSD's userland XDR doesn't implement control method (only the kernel), - * but OpenSolaris nvpair still depend on it, so we have to implement it here. 
- */ -static __inline bool_t -xdrmem_control(XDR *xdrs, int request, void *info) -{ - xdr_bytesrec *xptr; - - switch (request) { - case XDR_GET_BYTES_AVAIL: - xptr = (xdr_bytesrec *)info; - xptr->xc_is_last_record = TRUE; - xptr->xc_num_avail = xdrs->x_handy; - return (TRUE); - default: - assert(!"unexpected request"); - } - return (FALSE); -} - -#undef XDR_CONTROL -#define XDR_CONTROL(xdrs, req, op) \ - (((xdrs)->x_ops->x_control == NULL) ? \ - xdrmem_control((xdrs), (req), (op)) : \ - (*(xdrs)->x_ops->x_control)(xdrs, req, op)) - -#endif /* !_KERNEL && !_STANDALONE */ - -#endif /* !_OPENSOLARIS_RPC_XDR_H_ */ diff --git a/include/os/linux/Makefile.am b/include/os/linux/Makefile.am index 51c27132b4ef..332569efe361 100644 --- a/include/os/linux/Makefile.am +++ b/include/os/linux/Makefile.am @@ -47,6 +47,7 @@ kernel_sys_HEADERS = \ kernel_spl_rpcdir = $(kerneldir)/spl/rpc kernel_spl_rpc_HEADERS = \ + %D%/spl/rpc/types.h \ %D%/spl/rpc/xdr.h kernel_spl_sysdir = $(kerneldir)/spl/sys diff --git a/include/os/linux/spl/rpc/types.h b/include/os/linux/spl/rpc/types.h new file mode 100644 index 000000000000..5bbb4f2dec46 --- /dev/null +++ b/include/os/linux/spl/rpc/types.h @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2008 Sun Microsystems, Inc. + * Written by Ricardo Correia + * + * This file is part of the SPL, Solaris Porting Layer. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see . + */ + +#ifndef _SPL_RPC_TYPES_H +#define _SPL_RPC_TYPES_H + +#include + +/* Just enough to support rpc/xdr.h */ + +typedef int bool_t; + +#endif /* SPL_RPC_TYPES_H */ diff --git a/include/os/linux/spl/rpc/xdr.h b/include/os/linux/spl/rpc/xdr.h index b00f3542fcdf..5b621fa9c863 100644 --- a/include/os/linux/spl/rpc/xdr.h +++ b/include/os/linux/spl/rpc/xdr.h @@ -23,8 +23,6 @@ #include -typedef int bool_t; - /* * XDR enums and types. 
*/ diff --git a/module/nvpair/nvpair.c b/module/nvpair/nvpair.c index d9449e47e87a..887f7d32df4a 100644 --- a/module/nvpair/nvpair.c +++ b/module/nvpair/nvpair.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include diff --git a/module/os/linux/spl/spl-xdr.c b/module/os/linux/spl/spl-xdr.c index 6b77524181db..e1773da5d173 100644 --- a/module/os/linux/spl/spl-xdr.c +++ b/module/os/linux/spl/spl-xdr.c @@ -25,6 +25,7 @@ #include #include #include +#include #include /* -- cgit v1.2.3 From 889152ce4a8d82e5818384c3f95ef2446bea2621 Mon Sep 17 00:00:00 2001 From: Paul Dagnelie Date: Wed, 3 Apr 2024 16:34:46 -0700 Subject: Give a better message from 'zpool get' with invalid pool name Reviewed-by: Brian Behlendorf Reviewed-by: Don Brady Reviewed-by: Tony Nguyen Signed-off-by: Paul Dagnelie Closes #15942 --- cmd/zpool/zpool_main.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 9b864219a4bf..5fa06b4bf208 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -10637,11 +10637,10 @@ found: } } else { /* - * The first arg isn't a pool name, + * The first arg isn't the name of a valid pool. */ - fprintf(stderr, gettext("missing pool name.\n")); - fprintf(stderr, "\n"); - usage(B_FALSE); + fprintf(stderr, gettext("Cannot get properties of %s: " + "no such pool available.\n"), argv[0]); return (1); } -- cgit v1.2.3 From da88fc4ac9dcc21429d48d3e95d64186bb37d7fb Mon Sep 17 00:00:00 2001 From: Rob N Date: Thu, 4 Apr 2024 10:38:18 +1100 Subject: zap_leaf: make l_hash[] variable length to silence UBSAN When UBSAN is active and OpenZFS is a debug build, the l_hash assert at the bottom of zap_open_leaf() causes UBSAN to complain. This follows the example in 786641dcf to shut it up. Sponsored-by: https://despairlabs.com/sponsor/ Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Closes #15964 --- include/sys/zap_leaf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/sys/zap_leaf.h b/include/sys/zap_leaf.h index d563edd7ba59..e54456d3472b 100644 --- a/include/sys/zap_leaf.h +++ b/include/sys/zap_leaf.h @@ -132,7 +132,7 @@ typedef struct zap_leaf_phys { * with the ZAP_LEAF_CHUNK() macro. */ - uint16_t l_hash[1]; + uint16_t l_hash[]; } zap_leaf_phys_t; typedef union zap_leaf_chunk { -- cgit v1.2.3 From db65272aef3d380d2bd1c94907826f2b9ec9205e Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Mon, 29 Apr 2024 13:20:03 -0700 Subject: [2.2.4-only] Stub RAIDZ enums to prevent conflicts Stub in the RAIDZ expansions enums for now so that the slow IO commit merges cleanly. 
Signed-off-by: Tony Hutter --- include/libzfs.h | 1 + include/sys/fs/zfs.h | 2 ++ lib/libzfs/libzfs.abi | 3 ++- 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/include/libzfs.h b/include/libzfs.h index 770c5e1f201c..4f06b5d3c24c 100644 --- a/include/libzfs.h +++ b/include/libzfs.h @@ -157,6 +157,7 @@ typedef enum zfs_error { EZFS_CKSUM, /* insufficient replicas */ EZFS_RESUME_EXISTS, /* Resume on existing dataset without force */ EZFS_SHAREFAILED, /* filesystem share failed */ + EZFS_RAIDZ_EXPAND_IN_PROGRESS, /* a raidz is currently expanding */ EZFS_UNKNOWN } zfs_error_t; diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index bc940e8a7929..c21ab8f6fb3e 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -363,6 +363,7 @@ typedef enum { VDEV_PROP_CHECKSUM_T, VDEV_PROP_IO_N, VDEV_PROP_IO_T, + VDEV_PROP_RAIDZ_EXPANDING, VDEV_NUM_PROPS } vdev_prop_t; @@ -1569,6 +1570,7 @@ typedef enum { ZFS_ERR_NOT_USER_NAMESPACE, ZFS_ERR_RESUME_EXISTS, ZFS_ERR_CRYPTO_NOTSUP, + ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS, } zfs_errno_t; /* diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi index 9bb8f6a47de1..5cd43889bc0a 100644 --- a/lib/libzfs/libzfs.abi +++ b/lib/libzfs/libzfs.abi @@ -5671,7 +5671,8 @@ - + + -- cgit v1.2.3 From c1c26a77ff38770b80ed1c97aea867d3ad9bf6ee Mon Sep 17 00:00:00 2001 From: Don Brady Date: Thu, 8 Feb 2024 10:19:52 -0700 Subject: Add slow disk diagnosis to ZED Slow disk response times can be indicative of a failing drive. ZFS currently tracks slow I/Os (slower than zio_slow_io_ms) and generates events (ereport.fs.zfs.delay). However, no action is taken by ZED, like is done for checksum or I/O errors. This change adds slow disk diagnosis to ZED which is opt-in using new VDEV properties: VDEV_PROP_SLOW_IO_N VDEV_PROP_SLOW_IO_T If multiple VDEVs in a pool are undergoing slow I/Os, then it skips the zpool_vdev_degrade(). Sponsored-By: OpenDrives Inc. Sponsored-By: Klara Inc. 
Reviewed-by: Tony Hutter Reviewed-by: Allan Jude Reviewed-by: Brian Behlendorf Co-authored-by: Rob Wing Signed-off-by: Don Brady Closes #15469 --- cmd/zed/agents/fmd_api.c | 57 +++--- cmd/zed/agents/fmd_api.h | 3 +- cmd/zed/agents/fmd_serd.c | 3 +- cmd/zed/agents/fmd_serd.h | 2 +- cmd/zed/agents/zfs_diagnosis.c | 143 +++++++++++--- cmd/zed/agents/zfs_retire.c | 3 + cmd/zinject/zinject.c | 16 ++ cmd/zpool/zpool_main.c | 8 +- include/sys/fm/fs/zfs.h | 2 + include/sys/fs/zfs.h | 2 + include/sys/vdev_impl.h | 5 +- lib/libzfs/libzfs.abi | 4 +- lib/libzfs/libzfs_pool.c | 2 + lib/libzfs/libzfs_util.c | 4 +- man/man7/vdevprops.7 | 4 +- man/man7/zpoolconcepts.7 | 4 +- man/man8/zinject.8 | 1 + module/zcommon/zpool_prop.c | 6 + module/zfs/vdev.c | 30 +++ module/zfs/zfs_fm.c | 26 +++ module/zfs/zio_inject.c | 4 + tests/runfiles/linux.run | 3 +- tests/zfs-tests/tests/Makefile.am | 2 + .../functional/cli_root/zpool_get/vdev_get.cfg | 2 + .../zfs-tests/tests/functional/events/cleanup.ksh | 4 +- .../tests/functional/events/zed_slow_io.ksh | 205 +++++++++++++++++++++ .../functional/events/zed_slow_io_many_vdevs.ksh | 177 ++++++++++++++++++ tests/zfs-tests/tests/functional/fault/cleanup.ksh | 1 + tests/zfs-tests/tests/functional/fault/setup.ksh | 1 + 29 files changed, 654 insertions(+), 70 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/events/zed_slow_io.ksh create mode 100755 tests/zfs-tests/tests/functional/events/zed_slow_io_many_vdevs.ksh diff --git a/cmd/zed/agents/fmd_api.c b/cmd/zed/agents/fmd_api.c index 4a6cfbf8c05c..fe43e2ab971e 100644 --- a/cmd/zed/agents/fmd_api.c +++ b/cmd/zed/agents/fmd_api.c @@ -22,6 +22,7 @@ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. * * Copyright (c) 2016, Intel Corporation. + * Copyright (c) 2023, Klara Inc. */ /* @@ -231,28 +232,6 @@ fmd_prop_get_int32(fmd_hdl_t *hdl, const char *name) if (strcmp(name, "spare_on_remove") == 0) return (1); - if (strcmp(name, "io_N") == 0 || strcmp(name, "checksum_N") == 0) - return (10); /* N = 10 events */ - - return (0); -} - -int64_t -fmd_prop_get_int64(fmd_hdl_t *hdl, const char *name) -{ - (void) hdl; - - /* - * These can be looked up in mp->modinfo->fmdi_props - * For now we just hard code for phase 2. In the - * future, there can be a ZED based override. 
- */ - if (strcmp(name, "remove_timeout") == 0) - return (15ULL * 1000ULL * 1000ULL * 1000ULL); /* 15 sec */ - - if (strcmp(name, "io_T") == 0 || strcmp(name, "checksum_T") == 0) - return (1000ULL * 1000ULL * 1000ULL * 600ULL); /* 10 min */ - return (0); } @@ -535,20 +514,31 @@ fmd_serd_exists(fmd_hdl_t *hdl, const char *name) return (fmd_serd_eng_lookup(&mp->mod_serds, name) != NULL); } -void -fmd_serd_reset(fmd_hdl_t *hdl, const char *name) +int +fmd_serd_active(fmd_hdl_t *hdl, const char *name) { fmd_module_t *mp = (fmd_module_t *)hdl; fmd_serd_eng_t *sgp; if ((sgp = fmd_serd_eng_lookup(&mp->mod_serds, name)) == NULL) { zed_log_msg(LOG_ERR, "serd engine '%s' does not exist", name); - return; + return (0); } + return (fmd_serd_eng_fired(sgp) || !fmd_serd_eng_empty(sgp)); +} - fmd_serd_eng_reset(sgp); +void +fmd_serd_reset(fmd_hdl_t *hdl, const char *name) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + fmd_serd_eng_t *sgp; - fmd_hdl_debug(hdl, "serd_reset %s", name); + if ((sgp = fmd_serd_eng_lookup(&mp->mod_serds, name)) == NULL) { + zed_log_msg(LOG_ERR, "serd engine '%s' does not exist", name); + } else { + fmd_serd_eng_reset(sgp); + fmd_hdl_debug(hdl, "serd_reset %s", name); + } } int @@ -556,16 +546,21 @@ fmd_serd_record(fmd_hdl_t *hdl, const char *name, fmd_event_t *ep) { fmd_module_t *mp = (fmd_module_t *)hdl; fmd_serd_eng_t *sgp; - int err; if ((sgp = fmd_serd_eng_lookup(&mp->mod_serds, name)) == NULL) { zed_log_msg(LOG_ERR, "failed to add record to SERD engine '%s'", name); return (0); } - err = fmd_serd_eng_record(sgp, ep->ev_hrt); + return (fmd_serd_eng_record(sgp, ep->ev_hrt)); +} + +void +fmd_serd_gc(fmd_hdl_t *hdl) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; - return (err); + fmd_serd_hash_apply(&mp->mod_serds, fmd_serd_eng_gc, NULL); } /* FMD Timers */ @@ -579,7 +574,7 @@ _timer_notify(union sigval sv) const fmd_hdl_ops_t *ops = mp->mod_info->fmdi_ops; struct itimerspec its; - fmd_hdl_debug(hdl, "timer fired (%p)", ftp->ft_tid); + fmd_hdl_debug(hdl, "%s timer fired (%p)", mp->mod_name, ftp->ft_tid); /* disarm the timer */ memset(&its, 0, sizeof (struct itimerspec)); diff --git a/cmd/zed/agents/fmd_api.h b/cmd/zed/agents/fmd_api.h index b940d0d395ec..8471feecf33f 100644 --- a/cmd/zed/agents/fmd_api.h +++ b/cmd/zed/agents/fmd_api.h @@ -151,7 +151,6 @@ extern void fmd_hdl_vdebug(fmd_hdl_t *, const char *, va_list); extern void fmd_hdl_debug(fmd_hdl_t *, const char *, ...); extern int32_t fmd_prop_get_int32(fmd_hdl_t *, const char *); -extern int64_t fmd_prop_get_int64(fmd_hdl_t *, const char *); #define FMD_STAT_NOALLOC 0x0 /* fmd should use caller's memory */ #define FMD_STAT_ALLOC 0x1 /* fmd should allocate stats memory */ @@ -195,10 +194,12 @@ extern size_t fmd_buf_size(fmd_hdl_t *, fmd_case_t *, const char *); extern void fmd_serd_create(fmd_hdl_t *, const char *, uint_t, hrtime_t); extern void fmd_serd_destroy(fmd_hdl_t *, const char *); extern int fmd_serd_exists(fmd_hdl_t *, const char *); +extern int fmd_serd_active(fmd_hdl_t *, const char *); extern void fmd_serd_reset(fmd_hdl_t *, const char *); extern int fmd_serd_record(fmd_hdl_t *, const char *, fmd_event_t *); extern int fmd_serd_fired(fmd_hdl_t *, const char *); extern int fmd_serd_empty(fmd_hdl_t *, const char *); +extern void fmd_serd_gc(fmd_hdl_t *); extern id_t fmd_timer_install(fmd_hdl_t *, void *, fmd_event_t *, hrtime_t); extern void fmd_timer_remove(fmd_hdl_t *, id_t); diff --git a/cmd/zed/agents/fmd_serd.c b/cmd/zed/agents/fmd_serd.c index 0bb2c535f094..f942e62b3f48 100644 --- 
a/cmd/zed/agents/fmd_serd.c +++ b/cmd/zed/agents/fmd_serd.c @@ -310,8 +310,9 @@ fmd_serd_eng_reset(fmd_serd_eng_t *sgp) } void -fmd_serd_eng_gc(fmd_serd_eng_t *sgp) +fmd_serd_eng_gc(fmd_serd_eng_t *sgp, void *arg) { + (void) arg; fmd_serd_elem_t *sep, *nep; hrtime_t hrt; diff --git a/cmd/zed/agents/fmd_serd.h b/cmd/zed/agents/fmd_serd.h index 25b6888e61f2..80ff9a3b25b8 100644 --- a/cmd/zed/agents/fmd_serd.h +++ b/cmd/zed/agents/fmd_serd.h @@ -77,7 +77,7 @@ extern int fmd_serd_eng_fired(fmd_serd_eng_t *); extern int fmd_serd_eng_empty(fmd_serd_eng_t *); extern void fmd_serd_eng_reset(fmd_serd_eng_t *); -extern void fmd_serd_eng_gc(fmd_serd_eng_t *); +extern void fmd_serd_eng_gc(fmd_serd_eng_t *, void *); #ifdef __cplusplus } diff --git a/cmd/zed/agents/zfs_diagnosis.c b/cmd/zed/agents/zfs_diagnosis.c index f6ba334a3ba3..e0ad00800add 100644 --- a/cmd/zed/agents/zfs_diagnosis.c +++ b/cmd/zed/agents/zfs_diagnosis.c @@ -23,6 +23,7 @@ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2016, Intel Corporation. + * Copyright (c) 2023, Klara Inc. */ #include @@ -47,11 +48,16 @@ #define DEFAULT_CHECKSUM_T 600 /* seconds */ #define DEFAULT_IO_N 10 /* events */ #define DEFAULT_IO_T 600 /* seconds */ +#define DEFAULT_SLOW_IO_N 10 /* events */ +#define DEFAULT_SLOW_IO_T 30 /* seconds */ + +#define CASE_GC_TIMEOUT_SECS 43200 /* 12 hours */ /* - * Our serd engines are named 'zfs___{checksum,io}'. This - * #define reserves enough space for two 64-bit hex values plus the length of - * the longest string. + * Our serd engines are named in the following format: + * 'zfs___{checksum,io,slow_io}' + * This #define reserves enough space for two 64-bit hex values plus the + * length of the longest string. */ #define MAX_SERDLEN (16 * 2 + sizeof ("zfs___checksum")) @@ -68,6 +74,7 @@ typedef struct zfs_case_data { int zc_pool_state; char zc_serd_checksum[MAX_SERDLEN]; char zc_serd_io[MAX_SERDLEN]; + char zc_serd_slow_io[MAX_SERDLEN]; int zc_has_remove_timer; } zfs_case_data_t; @@ -114,7 +121,8 @@ zfs_de_stats_t zfs_stats = { { "resource_drops", FMD_TYPE_UINT64, "resource related ereports" } }; -static hrtime_t zfs_remove_timeout; +/* wait 15 seconds after a removal */ +static hrtime_t zfs_remove_timeout = SEC2NSEC(15); uu_list_pool_t *zfs_case_pool; uu_list_t *zfs_cases; @@ -124,6 +132,8 @@ uu_list_t *zfs_cases; #define ZFS_MAKE_EREPORT(type) \ FM_EREPORT_CLASS "." ZFS_ERROR_CLASS "." type +static void zfs_purge_cases(fmd_hdl_t *hdl); + /* * Write out the persistent representation of an active case. */ @@ -170,6 +180,42 @@ zfs_case_unserialize(fmd_hdl_t *hdl, fmd_case_t *cp) return (zcp); } +/* + * count other unique slow-io cases in a pool + */ +static uint_t +zfs_other_slow_cases(fmd_hdl_t *hdl, const zfs_case_data_t *zfs_case) +{ + zfs_case_t *zcp; + uint_t cases = 0; + static hrtime_t next_check = 0; + + /* + * Note that plumbing in some external GC would require adding locking, + * since most of this module code is not thread safe and assumes there + * is only one thread running against the module. So we perform GC here + * inline periodically so that future delay induced faults will be + * possible once the issue causing multiple vdev delays is resolved. 
+ */ + if (gethrestime_sec() > next_check) { + /* Periodically purge old SERD entries and stale cases */ + fmd_serd_gc(hdl); + zfs_purge_cases(hdl); + next_check = gethrestime_sec() + CASE_GC_TIMEOUT_SECS; + } + + for (zcp = uu_list_first(zfs_cases); zcp != NULL; + zcp = uu_list_next(zfs_cases, zcp)) { + if (zcp->zc_data.zc_pool_guid == zfs_case->zc_pool_guid && + zcp->zc_data.zc_vdev_guid != zfs_case->zc_vdev_guid && + zcp->zc_data.zc_serd_slow_io[0] != '\0' && + fmd_serd_active(hdl, zcp->zc_data.zc_serd_slow_io)) { + cases++; + } + } + return (cases); +} + /* * Iterate over any active cases. If any cases are associated with a pool or * vdev which is no longer present on the system, close the associated case. @@ -376,6 +422,14 @@ zfs_serd_name(char *buf, uint64_t pool_guid, uint64_t vdev_guid, (long long unsigned int)vdev_guid, type); } +static void +zfs_case_retire(fmd_hdl_t *hdl, zfs_case_t *zcp) +{ + fmd_hdl_debug(hdl, "retiring case"); + + fmd_case_close(hdl, zcp->zc_case); +} + /* * Solve a given ZFS case. This first checks to make sure the diagnosis is * still valid, as well as cleaning up any pending timer associated with the @@ -632,9 +686,7 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class) if (strcmp(class, ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_DATA)) == 0 || strcmp(class, - ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CONFIG_CACHE_WRITE)) == 0 || - strcmp(class, - ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_DELAY)) == 0) { + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CONFIG_CACHE_WRITE)) == 0) { zfs_stats.resource_drops.fmds_value.ui64++; return; } @@ -702,6 +754,9 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class) if (zcp->zc_data.zc_serd_checksum[0] != '\0') fmd_serd_reset(hdl, zcp->zc_data.zc_serd_checksum); + if (zcp->zc_data.zc_serd_slow_io[0] != '\0') + fmd_serd_reset(hdl, + zcp->zc_data.zc_serd_slow_io); } else if (fmd_nvl_class_match(hdl, nvl, ZFS_MAKE_RSRC(FM_RESOURCE_STATECHANGE))) { uint64_t state = 0; @@ -730,7 +785,11 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class) if (fmd_case_solved(hdl, zcp->zc_case)) return; - fmd_hdl_debug(hdl, "error event '%s'", class); + if (vdev_guid) + fmd_hdl_debug(hdl, "error event '%s', vdev %llu", class, + vdev_guid); + else + fmd_hdl_debug(hdl, "error event '%s'", class); /* * Determine if we should solve the case and generate a fault. We solve @@ -779,6 +838,8 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class) fmd_nvl_class_match(hdl, nvl, ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO_FAILURE)) || fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_DELAY)) || + fmd_nvl_class_match(hdl, nvl, ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_PROBE_FAILURE))) { const char *failmode = NULL; boolean_t checkremove = B_FALSE; @@ -814,6 +875,51 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class) } if (fmd_serd_record(hdl, zcp->zc_data.zc_serd_io, ep)) checkremove = B_TRUE; + } else if (fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_DELAY))) { + uint64_t slow_io_n, slow_io_t; + + /* + * Create a slow io SERD engine when the VDEV has the + * 'vdev_slow_io_n' and 'vdev_slow_io_n' properties. 
+ */ + if (zcp->zc_data.zc_serd_slow_io[0] == '\0' && + nvlist_lookup_uint64(nvl, + FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_N, + &slow_io_n) == 0 && + nvlist_lookup_uint64(nvl, + FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_T, + &slow_io_t) == 0) { + zfs_serd_name(zcp->zc_data.zc_serd_slow_io, + pool_guid, vdev_guid, "slow_io"); + fmd_serd_create(hdl, + zcp->zc_data.zc_serd_slow_io, + slow_io_n, + SEC2NSEC(slow_io_t)); + zfs_case_serialize(zcp); + } + /* Pass event to SERD engine and see if this triggers */ + if (zcp->zc_data.zc_serd_slow_io[0] != '\0' && + fmd_serd_record(hdl, zcp->zc_data.zc_serd_slow_io, + ep)) { + /* + * Ignore a slow io diagnosis when other + * VDEVs in the pool show signs of being slow. + */ + if (zfs_other_slow_cases(hdl, &zcp->zc_data)) { + zfs_case_retire(hdl, zcp); + fmd_hdl_debug(hdl, "pool %llu has " + "multiple slow io cases -- skip " + "degrading vdev %llu", + (u_longlong_t) + zcp->zc_data.zc_pool_guid, + (u_longlong_t) + zcp->zc_data.zc_vdev_guid); + } else { + zfs_case_solve(hdl, zcp, + "fault.fs.zfs.vdev.slow_io"); + } + } } else if (fmd_nvl_class_match(hdl, nvl, ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM))) { /* @@ -924,6 +1030,8 @@ zfs_fm_close(fmd_hdl_t *hdl, fmd_case_t *cs) fmd_serd_destroy(hdl, zcp->zc_data.zc_serd_checksum); if (zcp->zc_data.zc_serd_io[0] != '\0') fmd_serd_destroy(hdl, zcp->zc_data.zc_serd_io); + if (zcp->zc_data.zc_serd_slow_io[0] != '\0') + fmd_serd_destroy(hdl, zcp->zc_data.zc_serd_slow_io); if (zcp->zc_data.zc_has_remove_timer) fmd_timer_remove(hdl, zcp->zc_remove_timer); @@ -932,30 +1040,15 @@ zfs_fm_close(fmd_hdl_t *hdl, fmd_case_t *cs) fmd_hdl_free(hdl, zcp, sizeof (zfs_case_t)); } -/* - * We use the fmd gc entry point to look for old cases that no longer apply. - * This allows us to keep our set of case data small in a long running system. 
- */ -static void -zfs_fm_gc(fmd_hdl_t *hdl) -{ - zfs_purge_cases(hdl); -} - static const fmd_hdl_ops_t fmd_ops = { zfs_fm_recv, /* fmdo_recv */ zfs_fm_timeout, /* fmdo_timeout */ zfs_fm_close, /* fmdo_close */ NULL, /* fmdo_stats */ - zfs_fm_gc, /* fmdo_gc */ + NULL, /* fmdo_gc */ }; static const fmd_prop_t fmd_props[] = { - { "checksum_N", FMD_TYPE_UINT32, "10" }, - { "checksum_T", FMD_TYPE_TIME, "10min" }, - { "io_N", FMD_TYPE_UINT32, "10" }, - { "io_T", FMD_TYPE_TIME, "10min" }, - { "remove_timeout", FMD_TYPE_TIME, "15sec" }, { NULL, 0, NULL } }; @@ -996,8 +1089,6 @@ _zfs_diagnosis_init(fmd_hdl_t *hdl) (void) fmd_stat_create(hdl, FMD_STAT_NOALLOC, sizeof (zfs_stats) / sizeof (fmd_stat_t), (fmd_stat_t *)&zfs_stats); - - zfs_remove_timeout = fmd_prop_get_int64(hdl, "remove_timeout"); } void diff --git a/cmd/zed/agents/zfs_retire.c b/cmd/zed/agents/zfs_retire.c index a0e377a4a0c8..1ef5c631a438 100644 --- a/cmd/zed/agents/zfs_retire.c +++ b/cmd/zed/agents/zfs_retire.c @@ -523,6 +523,9 @@ zfs_retire_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, } else if (fmd_nvl_class_match(hdl, fault, "fault.fs.zfs.vdev.checksum")) { degrade_device = B_TRUE; + } else if (fmd_nvl_class_match(hdl, fault, + "fault.fs.zfs.vdev.slow_io")) { + degrade_device = B_TRUE; } else if (fmd_nvl_class_match(hdl, fault, "fault.fs.zfs.device")) { fault_device = B_FALSE; diff --git a/cmd/zinject/zinject.c b/cmd/zinject/zinject.c index f1262ed772de..a11b6d0b7fac 100644 --- a/cmd/zinject/zinject.c +++ b/cmd/zinject/zinject.c @@ -1083,6 +1083,22 @@ main(int argc, char **argv) libzfs_fini(g_zfs); return (1); } + + if (record.zi_nlanes) { + switch (io_type) { + case ZIO_TYPE_READ: + case ZIO_TYPE_WRITE: + case ZIO_TYPES: + break; + default: + (void) fprintf(stderr, "I/O type for a delay " + "must be 'read' or 'write'\n"); + usage(); + libzfs_fini(g_zfs); + return (1); + } + } + if (!error) error = ENXIO; diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 5fa06b4bf208..a7062615c390 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -2569,7 +2569,13 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name, break; case VDEV_AUX_ERR_EXCEEDED: - (void) printf(gettext("too many errors")); + if (vs->vs_read_errors + vs->vs_write_errors + + vs->vs_checksum_errors == 0 && children == 0 && + vs->vs_slow_ios > 0) { + (void) printf(gettext("too many slow I/Os")); + } else { + (void) printf(gettext("too many errors")); + } break; case VDEV_AUX_IO_FAILURE: diff --git a/include/sys/fm/fs/zfs.h b/include/sys/fm/fs/zfs.h index fb9e8649221e..c746600cd2d5 100644 --- a/include/sys/fm/fs/zfs.h +++ b/include/sys/fm/fs/zfs.h @@ -82,6 +82,8 @@ extern "C" { #define FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_T "vdev_cksum_t" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_IO_N "vdev_io_n" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_IO_T "vdev_io_t" +#define FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_N "vdev_slow_io_n" +#define FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_T "vdev_slow_io_t" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_DELAYS "vdev_delays" #define FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID "parent_guid" #define FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE "parent_type" diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index c21ab8f6fb3e..2683b774e8c0 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -364,6 +364,8 @@ typedef enum { VDEV_PROP_IO_N, VDEV_PROP_IO_T, VDEV_PROP_RAIDZ_EXPANDING, + VDEV_PROP_SLOW_IO_N, + VDEV_PROP_SLOW_IO_T, VDEV_NUM_PROPS } vdev_prop_t; diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h 
index 3f2312c23438..edc3c6598ebc 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -22,6 +22,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright (c) 2017, Intel Corporation. + * Copyright (c) 2023, Klara Inc. */ #ifndef _SYS_VDEV_IMPL_H @@ -453,12 +454,14 @@ struct vdev { zfs_ratelimit_t vdev_checksum_rl; /* - * Checksum and IO thresholds for tuning ZED + * Vdev properties for tuning ZED */ uint64_t vdev_checksum_n; uint64_t vdev_checksum_t; uint64_t vdev_io_n; uint64_t vdev_io_t; + uint64_t vdev_slow_io_n; + uint64_t vdev_slow_io_t; }; #define VDEV_PAD_SIZE (8 << 10) diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi index 5cd43889bc0a..2b904aecae7a 100644 --- a/lib/libzfs/libzfs.abi +++ b/lib/libzfs/libzfs.abi @@ -5672,7 +5672,9 @@ - + + + diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index 2f9ccbc2ab57..7f01143d5096 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -5224,6 +5224,8 @@ zpool_get_vdev_prop_value(nvlist_t *nvprop, vdev_prop_t prop, char *prop_name, case VDEV_PROP_CHECKSUM_T: case VDEV_PROP_IO_N: case VDEV_PROP_IO_T: + case VDEV_PROP_SLOW_IO_N: + case VDEV_PROP_SLOW_IO_T: if (intval == UINT64_MAX) { (void) strlcpy(buf, "-", len); } else { diff --git a/lib/libzfs/libzfs_util.c b/lib/libzfs/libzfs_util.c index fdd1975fa677..463cec0855c1 100644 --- a/lib/libzfs/libzfs_util.c +++ b/lib/libzfs/libzfs_util.c @@ -1699,7 +1699,9 @@ zprop_parse_value(libzfs_handle_t *hdl, nvpair_t *elem, int prop, (prop == VDEV_PROP_CHECKSUM_N || prop == VDEV_PROP_CHECKSUM_T || prop == VDEV_PROP_IO_N || - prop == VDEV_PROP_IO_T)) { + prop == VDEV_PROP_IO_T || + prop == VDEV_PROP_SLOW_IO_N || + prop == VDEV_PROP_SLOW_IO_T)) { *ivalp = UINT64_MAX; } diff --git a/man/man7/vdevprops.7 b/man/man7/vdevprops.7 index 6eebfa0060de..3d3ebc072915 100644 --- a/man/man7/vdevprops.7 +++ b/man/man7/vdevprops.7 @@ -44,7 +44,7 @@ section, below. Every vdev has a set of properties that export statistics about the vdev as well as control various behaviors. Properties are not inherited from top-level vdevs, with the exception of -checksum_n, checksum_t, io_n, and io_t. +checksum_n, checksum_t, io_n, io_t, slow_io_n, and slow_io_t. .Pp The values of numeric properties can be specified using human-readable suffixes .Po for example, @@ -117,7 +117,7 @@ If this device is currently being removed from the pool .Pp The following native properties can be used to change the behavior of a vdev. .Bl -tag -width "allocating" -.It Sy checksum_n , checksum_t , io_n , io_t +.It Sy checksum_n , checksum_t , io_n , io_t , slow_io_n , slow_io_t Tune the fault management daemon by specifying checksum/io thresholds of errors in seconds, respectively. These properties can be set on leaf and top-level vdevs. diff --git a/man/man7/zpoolconcepts.7 b/man/man7/zpoolconcepts.7 index 98f3ee7cd660..18dfca6dc8ac 100644 --- a/man/man7/zpoolconcepts.7 +++ b/man/man7/zpoolconcepts.7 @@ -260,8 +260,8 @@ sufficient replicas exist to continue functioning. The underlying conditions are as follows: .Bl -bullet -compact .It -The number of checksum errors exceeds acceptable levels and the device is -degraded as an indication that something may be wrong. +The number of checksum errors or slow I/Os exceeds acceptable levels and the +device is degraded as an indication that something may be wrong. ZFS continues to use the device as necessary. 
.It The number of I/O errors exceeds acceptable levels. diff --git a/man/man8/zinject.8 b/man/man8/zinject.8 index 4f0bbae81212..b692f12130a8 100644 --- a/man/man8/zinject.8 +++ b/man/man8/zinject.8 @@ -69,6 +69,7 @@ Force a vdev into the DEGRADED or FAULTED state. .Nm zinject .Fl d Ar vdev .Fl D Ar latency : Ns Ar lanes +.Op Fl T Ar read|write .Ar pool .Xc Add an artificial delay to I/O requests on a particular diff --git a/module/zcommon/zpool_prop.c b/module/zcommon/zpool_prop.c index c4aca04a96bd..ff70c0e3c35b 100644 --- a/module/zcommon/zpool_prop.c +++ b/module/zcommon/zpool_prop.c @@ -431,6 +431,12 @@ vdev_prop_init(void) zprop_register_number(VDEV_PROP_IO_T, "io_t", UINT64_MAX, PROP_DEFAULT, ZFS_TYPE_VDEV, "", "IO_T", B_FALSE, sfeatures); + zprop_register_number(VDEV_PROP_SLOW_IO_N, "slow_io_n", UINT64_MAX, + PROP_DEFAULT, ZFS_TYPE_VDEV, "", "SLOW_IO_N", B_FALSE, + sfeatures); + zprop_register_number(VDEV_PROP_SLOW_IO_T, "slow_io_t", UINT64_MAX, + PROP_DEFAULT, ZFS_TYPE_VDEV, "", "SLOW_IO_T", B_FALSE, + sfeatures); /* default index (boolean) properties */ zprop_register_index(VDEV_PROP_REMOVING, "removing", 0, diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index e1ca1aecc900..6d8eb50a1a6e 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -676,6 +676,8 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) vd->vdev_checksum_t = vdev_prop_default_numeric(VDEV_PROP_CHECKSUM_T); vd->vdev_io_n = vdev_prop_default_numeric(VDEV_PROP_IO_N); vd->vdev_io_t = vdev_prop_default_numeric(VDEV_PROP_IO_T); + vd->vdev_slow_io_n = vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_N); + vd->vdev_slow_io_t = vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_T); list_link_init(&vd->vdev_config_dirty_node); list_link_init(&vd->vdev_state_dirty_node); @@ -3730,6 +3732,18 @@ vdev_load(vdev_t *vd) if (error && error != ENOENT) vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) " "failed [error=%d]", (u_longlong_t)zapobj, error); + + error = vdev_prop_get_int(vd, VDEV_PROP_SLOW_IO_N, + &vd->vdev_slow_io_n); + if (error && error != ENOENT) + vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) " + "failed [error=%d]", (u_longlong_t)zapobj, error); + + error = vdev_prop_get_int(vd, VDEV_PROP_SLOW_IO_T, + &vd->vdev_slow_io_t); + if (error && error != ENOENT) + vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) " + "failed [error=%d]", (u_longlong_t)zapobj, error); } /* @@ -5934,6 +5948,20 @@ vdev_prop_set(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) } vd->vdev_io_t = intval; break; + case VDEV_PROP_SLOW_IO_N: + if (nvpair_value_uint64(elem, &intval) != 0) { + error = EINVAL; + break; + } + vd->vdev_slow_io_n = intval; + break; + case VDEV_PROP_SLOW_IO_T: + if (nvpair_value_uint64(elem, &intval) != 0) { + error = EINVAL; + break; + } + vd->vdev_slow_io_t = intval; + break; default: /* Most processing is done in vdev_props_set_sync */ break; @@ -6269,6 +6297,8 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) case VDEV_PROP_CHECKSUM_T: case VDEV_PROP_IO_N: case VDEV_PROP_IO_T: + case VDEV_PROP_SLOW_IO_N: + case VDEV_PROP_SLOW_IO_T: err = vdev_prop_get_int(vd, prop, &intval); if (err && err != ENOENT) break; diff --git a/module/zfs/zfs_fm.c b/module/zfs/zfs_fm.c index c4eb74e873db..481af2ba826b 100644 --- a/module/zfs/zfs_fm.c +++ b/module/zfs/zfs_fm.c @@ -222,6 +222,12 @@ vdev_prop_get_inherited(vdev_t *vd, vdev_prop_t prop) case VDEV_PROP_IO_T: propval = vd->vdev_io_t; break; + case VDEV_PROP_SLOW_IO_N: + propval = vd->vdev_slow_io_n; + break; + case VDEV_PROP_SLOW_IO_T: + propval = 
vd->vdev_slow_io_t; + break; default: propval = propdef; break; @@ -741,6 +747,26 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out, NULL); } + if (vd != NULL && strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) { + uint64_t slow_io_n, slow_io_t; + + slow_io_n = vdev_prop_get_inherited(vd, VDEV_PROP_SLOW_IO_N); + if (slow_io_n != vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_N)) + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_N, + DATA_TYPE_UINT64, + slow_io_n, + NULL); + + slow_io_t = vdev_prop_get_inherited(vd, VDEV_PROP_SLOW_IO_T); + if (slow_io_t != vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_T)) + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_T, + DATA_TYPE_UINT64, + slow_io_t, + NULL); + } + mutex_exit(&spa->spa_errlist_lock); *ereport_out = ereport; diff --git a/module/zfs/zio_inject.c b/module/zfs/zio_inject.c index 3598351c499d..609182f4a2cd 100644 --- a/module/zfs/zio_inject.c +++ b/module/zfs/zio_inject.c @@ -605,6 +605,10 @@ zio_handle_io_delay(zio_t *zio) if (vd->vdev_guid != handler->zi_record.zi_guid) continue; + if (handler->zi_record.zi_iotype != ZIO_TYPES && + handler->zi_record.zi_iotype != zio->io_type) + continue; + /* * Defensive; should never happen as the array allocation * occurs prior to inserting this handler on the list. diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index 6a4cd3fe691c..a0b74ef4a8c6 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -104,7 +104,8 @@ tags = ['functional', 'devices'] [tests/functional/events:Linux] tests = ['events_001_pos', 'events_002_pos', 'zed_rc_filter', 'zed_fd_spill', - 'zed_cksum_reported', 'zed_cksum_config', 'zed_io_config'] + 'zed_cksum_reported', 'zed_cksum_config', 'zed_io_config', + 'zed_slow_io', 'zed_slow_io_many_vdevs'] tags = ['functional', 'events'] [tests/functional/fadvise:Linux] diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index f587e265f15e..2fc36c4d7380 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1447,6 +1447,8 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/events/zed_fd_spill.ksh \ functional/events/zed_io_config.ksh \ functional/events/zed_rc_filter.ksh \ + functional/events/zed_slow_io.ksh \ + functional/events/zed_slow_io_many_vdevs.ksh \ functional/exec/cleanup.ksh \ functional/exec/exec_001_pos.ksh \ functional/exec/exec_002_neg.ksh \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_get/vdev_get.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_get/vdev_get.cfg index 71a64d4fae7a..c3b9efd6464a 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_get/vdev_get.cfg +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_get/vdev_get.cfg @@ -70,4 +70,6 @@ typeset -a properties=( checksum_t io_n io_t + slow_io_n + slow_io_t ) diff --git a/tests/zfs-tests/tests/functional/events/cleanup.ksh b/tests/zfs-tests/tests/functional/events/cleanup.ksh index ef6e098cf42a..669b8ae99456 100755 --- a/tests/zfs-tests/tests/functional/events/cleanup.ksh +++ b/tests/zfs-tests/tests/functional/events/cleanup.ksh @@ -26,8 +26,10 @@ . 
$STF_SUITE/include/libtest.shlib +zed_stop + zed_cleanup all-debug.sh all-syslog.sh all-dumpfds -zed_stop +zed_events_drain default_cleanup diff --git a/tests/zfs-tests/tests/functional/events/zed_slow_io.ksh b/tests/zfs-tests/tests/functional/events/zed_slow_io.ksh new file mode 100755 index 000000000000..d9fabb2c3bc9 --- /dev/null +++ b/tests/zfs-tests/tests/functional/events/zed_slow_io.ksh @@ -0,0 +1,205 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2023, Klara Inc. +# + +# DESCRIPTION: +# Verify that vdev properties, slow_io_n and slow_io_t, work with ZED. +# +# STRATEGY: +# 1. Create a pool with single vdev +# 2. Set slow_io_n/slow_io_t to non-default values +# 3. Inject slow io errors +# 4. Verify that ZED degrades vdev +# + +. $STF_SUITE/include/libtest.shlib + +TESTDIR="$TEST_BASE_DIR/zed_slow_io" +VDEV="$TEST_BASE_DIR/vdevfile.$$" +TESTPOOL="slow_io_pool" +FILEPATH="$TESTDIR/slow_io.testfile" + +OLD_SLOW_IO=$(get_tunable ZIO_SLOW_IO_MS) +OLD_SLOW_IO_EVENTS=$(get_tunable SLOW_IO_EVENTS_PER_SECOND) + +verify_runnable "both" + +function do_setup +{ + log_must truncate -s 1G $VDEV + default_setup_noexit $VDEV + zed_events_drain + log_must zfs set compression=off $TESTPOOL + log_must zfs set primarycache=none $TESTPOOL + log_must zfs set prefetch=none $TESTPOOL + log_must zfs set recordsize=512 $TESTPOOL + for i in {1..10}; do + dd if=/dev/urandom of=${FILEPATH}$i bs=512 count=1 2>/dev/null + done + zpool sync +} + +# intermediate cleanup +function do_clean +{ + log_must zinject -c all + log_must zpool destroy $TESTPOOL + log_must rm -f $VDEV +} + +# final cleanup +function cleanup +{ + log_must zinject -c all + + # if pool still exists then something failed so log additional info + if poolexists $TESTPOOL ; then + log_note "$(zpool status -s $TESTPOOL)" + echo "=================== zed log search ===================" + grep "Diagnosis Engine" $ZEDLET_DIR/zed.log + destroy_pool $TESTPOOL + fi + log_must zed_stop + + log_must rm -f $VDEV + + log_must set_tunable64 ZIO_SLOW_IO_MS $OLD_SLOW_IO + log_must set_tunable64 SLOW_IO_EVENTS_PER_SECOND $OLD_SLOW_IO_EVENTS +} + +function start_slow_io +{ + zpool sync + log_must set_tunable64 ZIO_SLOW_IO_MS 10 + log_must set_tunable64 SLOW_IO_EVENTS_PER_SECOND 1000 + + log_must zinject -d $VDEV -D10:1 -T read $TESTPOOL + zpool sync +} + +function stop_slow_io +{ + log_must set_tunable64 ZIO_SLOW_IO_MS $OLD_SLOW_IO + log_must set_tunable64 SLOW_IO_EVENTS_PER_SECOND $OLD_SLOW_IO_EVENTS + + log_must zinject -c all +} + +# Test default ZED settings: +# inject 10 events over 2.5 seconds, should not degrade. 
+function default_degrade +{ + do_setup + + start_slow_io + for i in {1..10}; do + dd if=${FILEPATH}$i of=/dev/null count=1 bs=512 2>/dev/null + sleep 0.25 + done + stop_slow_io + log_note "$(zpool status -s $TESTPOOL)" + + # give slow ZED a chance to process the delay events + sleep 18 + log_note "$(zpool status -s $TESTPOOL)" + + degrades=$(grep "zpool_vdev_degrade" $ZEDLET_DIR/zed.log | wc -l) + log_note $degrades vdev degrades in ZED log + [ $degrades -eq "0" ] || \ + log_fail "expecting no degrade events, found $degrades" + + do_clean +} + +# change slow_io_n, slow_io_t to 5 events in 60 seconds +# fire more than 5 events, should degrade +function slow_io_degrade +{ + do_setup + + zpool set slow_io_n=5 $TESTPOOL $VDEV + zpool set slow_io_t=60 $TESTPOOL $VDEV + + start_slow_io + for i in {1..16}; do + dd if=${FILEPATH}$i of=/dev/null count=1 bs=512 2>/dev/null + sleep 0.5 + done + stop_slow_io + zpool sync + + # + # wait up to 60 seconds for kernel to produce at least 5 delay events + # + typeset -i i=0 + typeset -i events=0 + while [[ $i -lt 60 ]]; do + events=$(zpool events | grep "ereport\.fs\.zfs.delay" | wc -l) + [[ $events -ge "5" ]] && break + i=$((i+1)) + sleep 1 + done + log_note "$events delay events found" + + if [[ $events -ge "5" ]]; then + log_must wait_vdev_state $TESTPOOL $VDEV "DEGRADED" 10 + fi + + do_clean +} + +# change slow_io_n, slow_io_t to 10 events in 1 second +# inject events spaced 0.5 seconds apart, should not degrade +function slow_io_no_degrade +{ + do_setup + + zpool set slow_io_n=10 $TESTPOOL $VDEV + zpool set slow_io_t=1 $TESTPOOL $VDEV + + start_slow_io + for i in {1..16}; do + dd if=${FILEPATH}$i of=/dev/null count=1 bs=512 2>/dev/null + sleep 0.5 + done + stop_slow_io + zpool sync + + log_mustnot wait_vdev_state $TESTPOOL $VDEV "DEGRADED" 45 + + do_clean +} + +log_assert "Test ZED slow io configurability" +log_onexit cleanup + +log_must zed_events_drain +log_must zed_start + +default_degrade +slow_io_degrade +slow_io_no_degrade + +log_pass "Test ZED slow io configurability" diff --git a/tests/zfs-tests/tests/functional/events/zed_slow_io_many_vdevs.ksh b/tests/zfs-tests/tests/functional/events/zed_slow_io_many_vdevs.ksh new file mode 100755 index 000000000000..3357ae2e3510 --- /dev/null +++ b/tests/zfs-tests/tests/functional/events/zed_slow_io_many_vdevs.ksh @@ -0,0 +1,177 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2023, Klara Inc. +# + +# DESCRIPTION: +# Verify that delay events from multiple vdevs doesnt degrade +# +# STRATEGY: +# 1. Create a pool with a 3 disk raidz vdev +# 2. Inject slow io errors +# 3. Verify that ZED detects slow I/Os but doesn't degrade any vdevs +# + +. 
$STF_SUITE/include/libtest.shlib + +TESTDIR="$TEST_BASE_DIR/zed_slow_io" +VDEV1="$TEST_BASE_DIR/vdevfile1.$$" +VDEV2="$TEST_BASE_DIR/vdevfile2.$$" +VDEV3="$TEST_BASE_DIR/vdevfile3.$$" +VDEV4="$TEST_BASE_DIR/vdevfile4.$$" +VDEVS="$VDEV1 $VDEV2 $VDEV3 $VDEV4" +TESTPOOL="slow_io_pool" +FILEPATH="$TESTDIR/slow_io.testfile" + +OLD_SLOW_IO=$(get_tunable ZIO_SLOW_IO_MS) +OLD_SLOW_IO_EVENTS=$(get_tunable SLOW_IO_EVENTS_PER_SECOND) + +verify_runnable "both" + +function cleanup +{ + log_must zinject -c all + + # if pool still exists then something failed so log additional info + if poolexists $TESTPOOL ; then + log_note "$(zpool status -s $TESTPOOL)" + echo "=================== zed log search ===================" + grep "Diagnosis Engine" $ZEDLET_DIR/zed.log + destroy_pool $TESTPOOL + fi + log_must zed_stop + + log_must rm -f $VDEVS + log_must set_tunable64 ZIO_SLOW_IO_MS $OLD_SLOW_IO + log_must set_tunable64 SLOW_IO_EVENTS_PER_SECOND $OLD_SLOW_IO_EVENTS +} + +function start_slow_io +{ + for vdev in $VDEVS + do + log_must zpool set slow_io_n=4 $TESTPOOL $vdev + log_must zpool set slow_io_t=60 $TESTPOOL $vdev + done + zpool sync + + log_must set_tunable64 ZIO_SLOW_IO_MS 10 + log_must set_tunable64 SLOW_IO_EVENTS_PER_SECOND 1000 + + for vdev in $VDEVS + do + log_must zinject -d $vdev -D10:1 $TESTPOOL + done + zpool sync +} + +function stop_slow_io +{ + log_must set_tunable64 ZIO_SLOW_IO_MS $OLD_SLOW_IO + log_must set_tunable64 SLOW_IO_EVENTS_PER_SECOND $OLD_SLOW_IO_EVENTS + + log_must zinject -c all +} + +function multiple_slow_vdevs_test +{ + log_must truncate -s 1G $VDEVS + default_raidz_setup_noexit $VDEVS + + log_must zpool events -c + log_must zfs set compression=off $TESTPOOL + log_must zfs set primarycache=none $TESTPOOL + log_must zfs set recordsize=4K $TESTPOOL + + log_must dd if=/dev/urandom of=$FILEPATH bs=1M count=20 + zpool sync + + # + # Read the file with slow io injected on the disks + # This will cause multiple errors on each disk to trip ZED SERD + # + # pool: slow_io_pool + # state: ONLINE + # config: + # + # NAME STATE READ WRITE CKSUM SLOW + # slow_io_pool ONLINE 0 0 0 - + # raidz1-0 ONLINE 0 0 0 - + # /var/tmp/vdevfile1.499278 ONLINE 0 0 0 113 + # /var/tmp/vdevfile2.499278 ONLINE 0 0 0 109 + # /var/tmp/vdevfile3.499278 ONLINE 0 0 0 96 + # /var/tmp/vdevfile4.499278 ONLINE 0 0 0 109 + # + start_slow_io + dd if=$FILEPATH of=/dev/null bs=1M count=20 2>/dev/null + stop_slow_io + + # count events available for processing + typeset -i i=0 + typeset -i events=0 + while [[ $i -lt 60 ]]; do + events=$(zpool events | grep "ereport\.fs\.zfs.delay" | wc -l) + [[ $events -ge "50" ]] && break + i=$((i+1)) + sleep 1 + done + log_note "$events delay events found" + if [[ $events -lt "50" ]]; then + log_note "bailing: not enough events to complete the test" + destroy_pool $TESTPOOL + return + fi + + # + # give slow ZED a chance to process the delay events + # + typeset -i i=0 + typeset -i skips=0 + while [[ $i -lt 75 ]]; do + skips=$(grep "retiring case" \ + $ZEDLET_DIR/zed.log | wc -l) + [[ $skips -gt "0" ]] && break + i=$((i+1)) + sleep 1 + done + + log_note $skips degrade skips in ZED log after $i seconds + [ $skips -gt "0" ] || log_fail "expecting to see skips" + + degrades=$(grep "zpool_vdev_degrade" $ZEDLET_DIR/zed.log | wc -l) + log_note $degrades vdev degrades in ZED log + [ $degrades -eq "0" ] || \ + log_fail "expecting no degrade events, found $degrades" + + destroy_pool $TESTPOOL +} + +log_assert "Test ZED slow io across multiple vdevs" +log_onexit cleanup + +log_must zed_events_drain 
+log_must zed_start +multiple_slow_vdevs_test + +log_pass "Test ZED slow io across multiple vdevs" diff --git a/tests/zfs-tests/tests/functional/fault/cleanup.ksh b/tests/zfs-tests/tests/functional/fault/cleanup.ksh index 654343c0cf00..2959236b59a3 100755 --- a/tests/zfs-tests/tests/functional/fault/cleanup.ksh +++ b/tests/zfs-tests/tests/functional/fault/cleanup.ksh @@ -32,5 +32,6 @@ cleanup_devices $DISKS zed_stop zed_cleanup resilver_finish-start-scrub.sh +zed_events_drain log_pass diff --git a/tests/zfs-tests/tests/functional/fault/setup.ksh b/tests/zfs-tests/tests/functional/fault/setup.ksh index 62f1c8ab56cb..61b9206ec1a6 100755 --- a/tests/zfs-tests/tests/functional/fault/setup.ksh +++ b/tests/zfs-tests/tests/functional/fault/setup.ksh @@ -28,6 +28,7 @@ verify_runnable "global" +zed_events_drain zed_setup resilver_finish-start-scrub.sh zed_start -- cgit v1.2.3 From 74101f7e2ade81d50d79ebec1b8b2a903eab432f Mon Sep 17 00:00:00 2001 From: Alek P Date: Wed, 3 Apr 2024 20:56:34 -0400 Subject: vdev props comment and manpage should include zfsd and FreeBSD mentions Reviewed-by: Tino Reichardt Reviewed-by: Alexander Motin Reviewed-by: Tony Hutter Reviewed-by: Rob Norris Reviewed-by: Allan Jude Signed-off-by: Alek Pinchuk Closes #15968 --- include/sys/vdev_impl.h | 2 +- man/man7/vdevprops.7 | 8 +++++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index edc3c6598ebc..02948894c365 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -454,7 +454,7 @@ struct vdev { zfs_ratelimit_t vdev_checksum_rl; /* - * Vdev properties for tuning ZED + * Vdev properties for tuning ZED or zfsd */ uint64_t vdev_checksum_n; uint64_t vdev_checksum_t; diff --git a/man/man7/vdevprops.7 b/man/man7/vdevprops.7 index 3d3ebc072915..5ec37df179de 100644 --- a/man/man7/vdevprops.7 +++ b/man/man7/vdevprops.7 @@ -127,7 +127,13 @@ If the property is only set on the top-level vdev, this value will be used. The value of these properties do not persist across vdev replacement. For this reason, it is advisable to set the property on the top-level vdev - not on the leaf vdev itself. -The default values are 10 errors in 600 seconds. +The default values for +.Sy OpenZFS on Linux +are 10 errors in 600 seconds. +For +.Sy OpenZFS on FreeBSD +defaults see +.Xr zfsd 8 . .It Sy comment A text comment up to 8192 characters long .It Sy bootsize -- cgit v1.2.3 From 531572b5906a0a4d042b7dcfd09b3ea947bd442e Mon Sep 17 00:00:00 2001 From: Pavel Snajdr Date: Thu, 4 Apr 2024 03:09:19 +0200 Subject: Fix panics when truncating/deleting files There's a union in dbuf_dirty_record_t; dr_brtwrite could evaluate to B_TRUE if the dirty record is of a type other than dl. Adding a more explicit dr type check before trying to access dr_brtwrite.
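As a rough illustration of that hazard, here is a simplified, standalone C sketch (invented names, illustration only, not code from the patch): reading the leaf-only union member is safe only once the record type, here the level, has been checked, which is what the db_level guard in the hunk below provides.

/*
 * Sketch only: simplified stand-in for dbuf_dirty_record_t.
 */
#include <stdbool.h>
#include <stdio.h>

struct dirty_record {
	int level;				/* 0 = leaf, > 0 = indirect */
	union {
		struct { bool brtwrite; } dl;	/* leaf-only state */
		struct { int nchildren; } di;	/* indirect-only state */
	} dt;
};

static bool
record_is_cloned(const struct dirty_record *dr)
{
	/* Reading dt.dl.brtwrite without the level check would read dt.di bytes. */
	return (dr->level == 0 && dr->dt.dl.brtwrite);
}

int
main(void)
{
	struct dirty_record indirect = { .level = 1 };
	printf("cloned: %d\n", record_is_cloned(&indirect));
	return (0);
}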
Fixes two similar panics: [ 1373.806119] VERIFY0(db->db_level) failed (0 == 1) [ 1373.807232] PANIC at dbuf.c:2549:dbuf_undirty() [ 1373.814979] dump_stack_lvl+0x71/0x90 [ 1373.815799] spl_panic+0xd3/0x100 [spl] [ 1373.827709] dbuf_undirty+0x62a/0x970 [zfs] [ 1373.829204] dmu_buf_will_dirty_impl+0x1e9/0x5b0 [zfs] [ 1373.831010] dnode_free_range+0x532/0x1220 [zfs] [ 1373.833922] dmu_free_long_range+0x4e0/0x930 [zfs] [ 1373.835277] zfs_trunc+0x75/0x1e0 [zfs] [ 1373.837958] zfs_freesp+0x9b/0x470 [zfs] [ 1373.847236] zfs_setattr+0x161a/0x3500 [zfs] [ 1373.855267] zpl_setattr+0x125/0x320 [zfs] [ 1373.856725] notify_change+0x1ee/0x4a0 [ 1373.859207] do_truncate+0x7f/0xd0 [ 1373.859968] do_sys_ftruncate+0x28e/0x2e0 [ 1373.860962] do_syscall_64+0x38/0x90 [ 1373.861751] entry_SYSCALL_64_after_hwframe+0x6e/0xd8 [ 1822.381337] VERIFY0(db->db_level) failed (0 == 1) [ 1822.382376] PANIC at dbuf.c:2549:dbuf_undirty() [ 1822.389232] dump_stack_lvl+0x71/0x90 [ 1822.389920] spl_panic+0xd3/0x100 [spl] [ 1822.399567] dbuf_undirty+0x62a/0x970 [zfs] [ 1822.400583] dmu_buf_will_dirty_impl+0x1e9/0x5b0 [zfs] [ 1822.401752] dnode_free_range+0x532/0x1220 [zfs] [ 1822.402841] dmu_object_free+0x74/0x120 [zfs] [ 1822.403869] zfs_znode_delete+0x75/0x120 [zfs] [ 1822.404906] zfs_rmnode+0x3f6/0x7f0 [zfs] [ 1822.405870] zfs_inactive+0xa3/0x610 [zfs] [ 1822.407803] zpl_evict_inode+0x3e/0x90 [zfs] [ 1822.408831] evict+0xc1/0x1c0 [ 1822.409387] do_unlinkat+0x147/0x300 [ 1822.410060] __x64_sys_unlinkat+0x33/0x60 [ 1822.410802] do_syscall_64+0x38/0x90 [ 1822.411458] entry_SYSCALL_64_after_hwframe+0x6e/0xd8 Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Reviewed-by: Brian Atkinson Signed-off-by: Pavel Snajdr Closes #15983 --- module/zfs/dbuf.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 72aaf7f19822..a94ba59567ec 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -2616,26 +2616,24 @@ dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx) ASSERT(!zfs_refcount_is_zero(&db->db_holds)); /* - * Quick check for dirtiness. For already dirty blocks, this - * reduces runtime of this function by >90%, and overall performance - * by 50% for some workloads (e.g. file deletion with indirect blocks - * cached). + * Quick check for dirtiness to improve performance for some workloads + * (e.g. file deletion with indirect blocks cached). */ mutex_enter(&db->db_mtx); - if (db->db_state == DB_CACHED || db->db_state == DB_NOFILL) { - dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, tx->tx_txg); /* - * It's possible that it is already dirty but not cached, + * It's possible that the dbuf is already dirty but not cached, * because there are some calls to dbuf_dirty() that don't * go through dmu_buf_will_dirty(). */ + dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, tx->tx_txg); if (dr != NULL) { - if (dr->dt.dl.dr_brtwrite) { + if (db->db_level == 0 && + dr->dt.dl.dr_brtwrite) { /* * Block cloning: If we are dirtying a cloned - * block, we cannot simply redirty it, because - * this dr has no data associated with it. + * level 0 block, we cannot simply redirty it, + * because this dr has no associated data. * We will go through a full undirtying below, * before dirtying it again. 
*/ -- cgit v1.2.3 From 86b39b41a0311d109f3558b65a203a7aeb568472 Mon Sep 17 00:00:00 2001 From: Maxim Filimonov Date: Tue, 9 Apr 2024 02:37:41 +0400 Subject: Fix locale-specific time In `zpool status -t`, scrub date/time is reported using the C locale, while trim time is reported using the current one. This is inconsistent. This patch fixes that. Reviewed-by: Tino Reichardt Reviewed-by: Tony Hutter Reviewed-by: Brian Behlendorf Signed-off-by: Maxim Filimonov Closes #15878 Closes #15879 --- cmd/zpool/zpool_main.c | 10 ++++------ lib/libzfs/libzfs_pool.c | 6 ++++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index a7062615c390..3bb77e59fc3f 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -2246,7 +2246,6 @@ print_status_initialize(vdev_stat_t *vs, boolean_t verbose) !vs->vs_scan_removing) { char zbuf[1024]; char tbuf[256]; - struct tm zaction_ts; time_t t = vs->vs_initialize_action_time; int initialize_pct = 100; @@ -2256,8 +2255,8 @@ print_status_initialize(vdev_stat_t *vs, boolean_t verbose) 100 / (vs->vs_initialize_bytes_est + 1)); } - (void) localtime_r(&t, &zaction_ts); - (void) strftime(tbuf, sizeof (tbuf), "%c", &zaction_ts); + (void) ctime_r(&t, tbuf); + tbuf[24] = 0; switch (vs->vs_initialize_state) { case VDEV_INITIALIZE_SUSPENDED: @@ -2297,7 +2296,6 @@ print_status_trim(vdev_stat_t *vs, boolean_t verbose) !vs->vs_scan_removing) { char zbuf[1024]; char tbuf[256]; - struct tm zaction_ts; time_t t = vs->vs_trim_action_time; int trim_pct = 100; @@ -2306,8 +2304,8 @@ print_status_trim(vdev_stat_t *vs, boolean_t verbose) 100 / (vs->vs_trim_bytes_est + 1)); } - (void) localtime_r(&t, &zaction_ts); - (void) strftime(tbuf, sizeof (tbuf), "%c", &zaction_ts); + (void) ctime_r(&t, tbuf); + tbuf[24] = 0; switch (vs->vs_trim_state) { case VDEV_TRIM_SUSPENDED: diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index 7f01143d5096..6d5ac9d59ed2 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -1899,7 +1899,8 @@ zpool_rewind_exclaim(libzfs_handle_t *hdl, const char *name, boolean_t dryrun, (void) nvlist_lookup_int64(nv, ZPOOL_CONFIG_REWIND_TIME, &loss); if (localtime_r((time_t *)&rewindto, &t) != NULL && - strftime(timestr, 128, "%c", &t) != 0) { + ctime_r((time_t *)&rewindto, timestr) != NULL) { + timestr[24] = 0; if (dryrun) { (void) printf(dgettext(TEXT_DOMAIN, "Would be able to return %s " @@ -1961,7 +1962,8 @@ zpool_explain_recover(libzfs_handle_t *hdl, const char *name, int reason, "Recovery is possible, but will result in some data loss.\n")); if (localtime_r((time_t *)&rewindto, &t) != NULL && - strftime(timestr, 128, "%c", &t) != 0) { + ctime_r((time_t *)&rewindto, timestr) != NULL) { + timestr[24] = 0; (void) printf(dgettext(TEXT_DOMAIN, "\tReturning the pool to its state as of %s\n" "\tshould correct the problem. 
"), -- cgit v1.2.3 From 97889c037a8aae683f911c60ca81983105e32533 Mon Sep 17 00:00:00 2001 From: Jason Lee Date: Wed, 10 Apr 2024 16:01:39 -0600 Subject: return NULL at end of send_progress_thread Reviewed-by: Rob Norris Reviewed-by: Brian Behlendorf Signed-off-by: Jason Lee Closes #16074 --- lib/libzfs/libzfs_sendrecv.c | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/libzfs/libzfs_sendrecv.c b/lib/libzfs/libzfs_sendrecv.c index e9bc78aa8d39..143aecb9459f 100644 --- a/lib/libzfs/libzfs_sendrecv.c +++ b/lib/libzfs/libzfs_sendrecv.c @@ -1053,6 +1053,7 @@ send_progress_thread(void *arg) } } pthread_cleanup_pop(B_TRUE); + return (NULL); } static boolean_t -- cgit v1.2.3 From 3f817debb431fbc87247cedf0c8d1f51ea552dba Mon Sep 17 00:00:00 2001 From: Rob N Date: Fri, 12 Apr 2024 07:49:57 +1000 Subject: AUTHORS: refresh with recent new contributors Sponsored-by: https://despairlabs.com/sponsor/ Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Closes #16079 --- .mailmap | 18 ++++++++++++++++++ AUTHORS | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+) diff --git a/.mailmap b/.mailmap index 46ef016b93f8..32bdb5209613 100644 --- a/.mailmap +++ b/.mailmap @@ -30,6 +30,7 @@ Andreas Dilger Andrew Walker Benedikt Neuffer Chengfei Zhu +ChenHao Lu <18302010006@fudan.edu.cn> Chris Lindee Colm Buckley Crag Wang @@ -43,6 +44,7 @@ Glenn Washburn Gordan Bobic Gregory Bartholomew hedong zhang +Ilkka Sovanto InsanePrawn Jason Cohen Jason Harmening @@ -57,6 +59,7 @@ KernelOfTruth Liu Hua Liu Qing loli10K +Mart Frauenlob Matthias Blankertz Michael Gmelin Olivier Mazouffre @@ -73,6 +76,9 @@ WHR Yanping Gao Youzhong Yang +# Signed-off-by: overriding Author: +Yuxin Wang + # Commits from strange places, long ago Brian Behlendorf Brian Behlendorf @@ -102,12 +108,15 @@ Brandon Thetford buzzingwires <131118055+buzzingwires@users.noreply.github.com> Cedric Maunoury <38213715+cedricmaunoury@users.noreply.github.com> Charles Suh +Chris Peredun <126915832+chrisperedun@users.noreply.github.com> Dacian Reece-Stremtan <35844628+dacianstremtan@users.noreply.github.com> Damian Szuberski <30863496+szubersk@users.noreply.github.com> Daniel Hiepler <32984777+heeplr@users.noreply.github.com> Daniel Kobras Daniel Reichelt David Quigley +Dennis R. Friedrichsen <31087738+dennisfriedrichsen@users.noreply.github.com> +Dex Wood DHE Dmitri John Ledkov <19779+xnox@users.noreply.github.com> Dries Michiels <32487486+driesmp@users.noreply.github.com> @@ -128,6 +137,7 @@ Harry Mallon <1816667+hjmallon@users.noreply.github.com> Hiếu Lê Jake Howard James Cowgill +Jaron Kent-Dobias Jason King Jeff Dike <52420226+jdike@users.noreply.github.com> Jitendra Patidar <53164267+jsai20@users.noreply.github.com> @@ -137,7 +147,9 @@ John L. Hammond <35266395+jhammond-intel@users.noreply. John-Mark Gurney John Ramsden Jonathon Fernyhough <559369+jonathonf@users.noreply.github.com> +Jose Luis Duran Justin Hibbits +Kevin Greene <104801862+kxgreene@users.noreply.github.com> Kevin Jin <33590050+jxdking@users.noreply.github.com> Kevin P. 
Fleming Krzysztof Piecuch <3964215+pikrzysztof@users.noreply.github.com> @@ -148,9 +160,11 @@ Lorenz Hüdepohl Luís Henriques <73643340+lumigch@users.noreply.github.com> Marcin Skarbek Matt Fiddaman <81489167+matt-fidd@users.noreply.github.com> +Maxim Filimonov Max Zettlmeißl <6818198+maxz@users.noreply.github.com> Michael Niewöhner Michael Zhivich <33133421+mzhivich@users.noreply.github.com> +MigeljanImeri <78048439+MigeljanImeri@users.noreply.github.com> Mo Zhou <5723047+cdluminate@users.noreply.github.com> Nick Mattis omni <79493359+omnivagant@users.noreply.github.com> @@ -164,6 +178,7 @@ Ping Huang <101400146+hpingfs@users.noreply.github.com> Piotr P. Stefaniak Richard Allen <33836503+belperite@users.noreply.github.com> Rich Ercolani <214141+rincebrain@users.noreply.github.com> +Rick Macklem <64620010+rmacklem@users.noreply.github.com> Rob Wing <98866084+rob-wing@users.noreply.github.com> Roman Strashkin Ryan Hirasaki <4690732+RyanHir@users.noreply.github.com> @@ -174,6 +189,8 @@ Scott Colby Sean Eric Fagan Spencer Kinny <30333052+Spencer-Kinny@users.noreply.github.com> Srikanth N S <75025422+nssrikanth@users.noreply.github.com> +Stefan Lendl <1321542+stfl@users.noreply.github.com> +Thomas Bertschinger <101425190+bertschinger@users.noreply.github.com> Thomas Geppert Tim Crawford Tom Matthews @@ -181,6 +198,7 @@ Tony Perkins <62951051+tony-zfs@users.noreply.github.com> Torsten Wörtwein Tulsi Jain Václav Skála <33496485+vaclavskala@users.noreply.github.com> +Vaibhav Bhanawat <88050553+vaibhav-delphix@users.noreply.github.com> Violet Purcell <66446404+vimproved@users.noreply.github.com> Vipin Kumar Verma <75025470+vermavipinkumar@users.noreply.github.com> Wolfgang Bumiller diff --git a/AUTHORS b/AUTHORS index be1efb87b34c..d7d55f42d2e7 100644 --- a/AUTHORS +++ b/AUTHORS @@ -88,9 +88,11 @@ CONTRIBUTORS: Bassu Ben Allen Ben Cordero + Benda Xu Benedikt Neuffer Benjamin Albrecht Benjamin Gentil + Benjamin Sherman Ben McGough Ben Rubson Ben Wolsieffer @@ -111,6 +113,7 @@ CONTRIBUTORS: bzzz77 cable2999 Caleb James DeLisle + Cameron Harr Cao Xuewen Carlo Landmeter Carlos Alberto Lopez Perez @@ -120,12 +123,15 @@ CONTRIBUTORS: Chen Can Chengfei Zhu Chen Haiquan + ChenHao Lu <18302010006@fudan.edu.cn> Chip Parker Chris Burroughs + Chris Davidson Chris Dunlap Chris Dunlop Chris Lindee Chris McDonough + Chris Peredun Chris Siden Chris Siebenmann Christer Ekholm @@ -144,6 +150,7 @@ CONTRIBUTORS: Clint Armstrong Coleman Kane Colin Ian King + Colin Percival Colm Buckley Crag Wang Craig Loomis @@ -156,6 +163,7 @@ CONTRIBUTORS: Damiano Albani Damian Szuberski Damian Wojsław + Daniel Berlin Daniel Hiepler Daniel Hoffman Daniel Kobras @@ -176,8 +184,10 @@ CONTRIBUTORS: David Quigley Debabrata Banerjee D. Ebdrup + Dennis R. Friedrichsen Denys Rtveliashvili Derek Dai + Dex Wood DHE Didier Roche Dimitri John Ledkov @@ -235,9 +245,11 @@ CONTRIBUTORS: Gionatan Danti Giuseppe Di Natale Glenn Washburn + gofaster Gordan Bobic Gordon Bergling Gordon Ross + Gordon Tetlow Graham Christensen Graham Perrin Gregor Kopka @@ -265,6 +277,7 @@ CONTRIBUTORS: Igor Kozhukhov Igor Lvovsky ilbsmart + Ilkka Sovanto illiliti ilovezfs InsanePrawn @@ -280,9 +293,11 @@ CONTRIBUTORS: Jan Engelhardt Jan Kryl Jan Sanislo + Jaron Kent-Dobias Jason Cohen Jason Harmening Jason King + Jason Lee Jason Zaman Javen Wu Jean-Baptiste Lallement @@ -313,6 +328,7 @@ CONTRIBUTORS: Jonathon Fernyhough Jorgen Lundman Josef 'Jeff' Sipek + Jose Luis Duran Josh Soref Joshua M. 
Clulow José Luis Salvador Rufo @@ -336,8 +352,10 @@ Kash Pande Kay Pedersen Keith M Wesolowski + Kent Ross KernelOfTruth Kevin Bowling + Kevin Greene Kevin Jin Kevin P. Fleming Kevin Tanguy @@ -389,6 +407,7 @@ Mark Shellenbaum marku89 Mark Wright + Mart Frauenlob Martin Matuska Martin Rüegg Massimo Maggi @@ -405,6 +424,7 @@ Matus Kral Mauricio Faria de Oliveira Max Grossman + Maxim Filimonov Maximilian Mehnert Max Zettlmeißl Md Islam @@ -417,6 +437,7 @@ Michael Niewöhner Michael Zhivich Michal Vasilek + MigeljanImeri Mike Gerdts Mike Harsch Mike Leddy @@ -448,6 +469,7 @@ Olaf Faaland Oleg Drokin Oleg Stepura + Olivier Certner Olivier Mazouffre omni Orivej Desh @@ -479,6 +501,7 @@ Prasad Joshi privb0x23 P.SCH + Quartz Quentin Zdanis Rafael Kitover RageLtMan @@ -491,11 +514,15 @@ Riccardo Schirone Richard Allen Richard Elling + Richard Kojedzinszky Richard Laager Richard Lowe Richard Sharpe Richard Yao Rich Ercolani + Rick Macklem + rilysh + Robert Evans Robert Novak Roberto Ricci Rob Norris @@ -509,7 +536,9 @@ Ryan Lahfa Ryan Libby Ryan Moeller + Sam Atkinson Sam Hathaway + Sam James Sam Lunt Samuel VERSCHELDE Samuel Wycliffe @@ -530,6 +559,8 @@ Shaan Nobee Shampavman Shaun Tancheff + Shawn Bayern + Shengqi Chen Shen Yan Simon Guest Simon Klinkert @@ -537,6 +568,7 @@ Spencer Kinny Srikanth N S Stanislav Seletskiy + Stefan Lendl Steffen Müthing Stephen Blinick sterlingjensen @@ -557,6 +589,7 @@ Teodor Spæren TerraTech Thijs Cramer + Thomas Bertschinger Thomas Geppert Thomas Lamprecht Till Maas @@ -586,6 +619,7 @@ Turbo Fredriksson Tyler J. Stachecki Umer Saleem + Vaibhav Bhanawat Valmiky Arquissandas Val Packett Vince van Oosten @@ -614,6 +648,7 @@ yuina822 YunQiang Su Yuri Pankov + Yuxin Wang Yuxuan Shui Zachary Bedell Zach Dykstra -- cgit v1.2.3 From 7aaf6ce9d8633a6748648bfbb0c39137500a5751 Mon Sep 17 00:00:00 2001 From: Andrew Turner Date: Mon, 15 Apr 2024 21:53:39 +0100 Subject: Add the BTI elf note to the AArch64 SHA2 assembly On ELF platforms there is a note to specify when an application or library supports BTI. When linking one of these, the linker needs all input object files to have the note. If not, it will not include it in the output file. Normally the compiler would generate it, but for assembly files we need to do it ourselves. Add the note to the aarch64 sha256 and sha512 assembly files. Tested by building with BTI enabled and using the -zbti-report=error flag to lld that makes it an error if the note is missing.
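For reference, the raw .word values in the note added below can be read against the standard GNU property note layout; the following decoding is an interpretation of those constants, not part of the patch.

/* Interpretation of the .word sequence, assuming the usual ELF64 GNU property note layout. */
#include <stdint.h>
#include <stdio.h>

#define NT_GNU_PROPERTY_TYPE_0			5
#define GNU_PROPERTY_AARCH64_FEATURE_1_AND	0xc0000000u	/* 3221225472 */
#define GNU_PROPERTY_AARCH64_FEATURE_1_BTI	0x1u
#define GNU_PROPERTY_AARCH64_FEATURE_1_PAC	0x2u

int
main(void)
{
	uint32_t note[] = {
		4,					/* namesz: "GNU" plus NUL */
		16,					/* descsz: one property, padded */
		NT_GNU_PROPERTY_TYPE_0,			/* type (5) */
		/* the .asciz "GNU" name bytes follow here in the real note */
		GNU_PROPERTY_AARCH64_FEATURE_1_AND,	/* pr_type */
		4,					/* pr_datasz */
		GNU_PROPERTY_AARCH64_FEATURE_1_BTI |
		    GNU_PROPERTY_AARCH64_FEATURE_1_PAC,	/* .word 3: BTI and PAC bits */
		0,					/* padding to 8 bytes */
	};
	for (size_t i = 0; i < sizeof (note) / sizeof (note[0]); i++)
		printf("0x%08x\n", note[i]);
	return (0);
}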
Reviewed-by: Tino Reichardt Reviewed-by: Brian Behlendorf Signed-off-by: Andrew Turner Closes #16086 --- module/icp/asm-aarch64/sha2/sha256-armv8.S | 10 ++++++++++ module/icp/asm-aarch64/sha2/sha512-armv8.S | 10 ++++++++++ 2 files changed, 20 insertions(+) diff --git a/module/icp/asm-aarch64/sha2/sha256-armv8.S b/module/icp/asm-aarch64/sha2/sha256-armv8.S index 7ae486e4e229..4dcdd3b65d0b 100644 --- a/module/icp/asm-aarch64/sha2/sha256-armv8.S +++ b/module/icp/asm-aarch64/sha2/sha256-armv8.S @@ -21,6 +21,16 @@ #if defined(__aarch64__) + .section .note.gnu.property,"a",@note + .p2align 3 + .word 4 + .word 16 + .word 5 + .asciz "GNU" + .word 3221225472 + .word 4 + .word 3 + .word 0 .text .align 6 diff --git a/module/icp/asm-aarch64/sha2/sha512-armv8.S b/module/icp/asm-aarch64/sha2/sha512-armv8.S index 9c61eeee4d7b..f6c8f7742912 100644 --- a/module/icp/asm-aarch64/sha2/sha512-armv8.S +++ b/module/icp/asm-aarch64/sha2/sha512-armv8.S @@ -21,6 +21,16 @@ #if defined(__aarch64__) + .section .note.gnu.property,"a",@note + .p2align 3 + .word 4 + .word 16 + .word 5 + .asciz "GNU" + .word 3221225472 + .word 4 + .word 3 + .word 0 .text .align 6 -- cgit v1.2.3 From 16c223eec9b4cee1c9c5996d9d08be47dbffb855 Mon Sep 17 00:00:00 2001 From: Tino Reichardt Date: Mon, 15 Apr 2024 22:56:10 +0200 Subject: Do no use .cfi_negate_ra_state within the assembly on Arm64 Compiling openzfs on aarch64 with gcc-8 and gcc-9 is failing currently. See issue #14965 for deeper context. On platforms without pointer authentication, .cfi_negate_ra_state can be defined to a no-op: https://sourceware.org/git/?p=binutils-gdb.git;a=blob;f=gdb/aarch64-tdep.c#l1413 I have tested this on Arm64 FreeBSD 13.2 and AlmaLinux-8. Reviewed-by: Andrew Turner Signed-off-by: Tino Reichardt Closes #14965 Closes #15784 --- module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S | 14 +++++++++++--- module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S | 12 ++++++++++-- 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S b/module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S index dc2719d142db..e66bb4bc7f26 100644 --- a/module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S +++ b/module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S @@ -32,6 +32,14 @@ */ #if defined(__aarch64__) + +/* make gcc <= 9 happy */ +#if LD_VERSION >= 233010000 +#define CFI_NEGATE_RA_STATE .cfi_negate_ra_state +#else +#define CFI_NEGATE_RA_STATE +#endif + .text .section .note.gnu.property,"a",@note .p2align 3 @@ -51,7 +59,7 @@ zfs_blake3_compress_in_place_sse2: .cfi_startproc hint #25 - .cfi_negate_ra_state + CFI_NEGATE_RA_STATE sub sp, sp, #96 stp x29, x30, [sp, #64] add x29, sp, #64 @@ -555,7 +563,7 @@ compress_pre: zfs_blake3_compress_xof_sse2: .cfi_startproc hint #25 - .cfi_negate_ra_state + CFI_NEGATE_RA_STATE sub sp, sp, #96 stp x29, x30, [sp, #64] add x29, sp, #64 @@ -608,7 +616,7 @@ zfs_blake3_compress_xof_sse2: zfs_blake3_hash_many_sse2: .cfi_startproc hint #25 - .cfi_negate_ra_state + CFI_NEGATE_RA_STATE stp d15, d14, [sp, #-160]! 
stp d13, d12, [sp, #16] stp d11, d10, [sp, #32] diff --git a/module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S b/module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S index c4c2dfc5bcde..b9fb28dfcf03 100644 --- a/module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S +++ b/module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S @@ -32,6 +32,14 @@ */ #if defined(__aarch64__) + +/* make gcc <= 9 happy */ +#if LD_VERSION >= 233010000 +#define CFI_NEGATE_RA_STATE .cfi_negate_ra_state +#else +#define CFI_NEGATE_RA_STATE +#endif + .text .section .note.gnu.property,"a",@note .p2align 3 @@ -51,7 +59,7 @@ zfs_blake3_compress_in_place_sse41: .cfi_startproc hint #25 - .cfi_negate_ra_state + CFI_NEGATE_RA_STATE sub sp, sp, #96 stp x29, x30, [sp, #64] add x29, sp, #64 @@ -565,7 +573,7 @@ compress_pre: zfs_blake3_compress_xof_sse41: .cfi_startproc hint #25 - .cfi_negate_ra_state + CFI_NEGATE_RA_STATE sub sp, sp, #96 stp x29, x30, [sp, #64] add x29, sp, #64 -- cgit v1.2.3 From 51d3c23150d3eaeb3ef66410b94edff13cffa0ed Mon Sep 17 00:00:00 2001 From: Seth Troisi Date: Mon, 22 Apr 2024 10:45:39 -0700 Subject: Add newline to two zpool messages Reviewed-by: Brian Behlendorf Signed-off-by: Seth Troisi Closes #16113 --- cmd/zpool/zpool_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 3bb77e59fc3f..1e577e712be5 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -3400,10 +3400,10 @@ do_import(nvlist_t *config, const char *newname, const char *mntopts, ms_status = zpool_enable_datasets(zhp, mntopts, 0); if (ms_status == EZFS_SHAREFAILED) { (void) fprintf(stderr, gettext("Import was " - "successful, but unable to share some datasets")); + "successful, but unable to share some datasets\n")); } else if (ms_status == EZFS_MOUNTFAILED) { (void) fprintf(stderr, gettext("Import was " - "successful, but unable to mount some datasets")); + "successful, but unable to mount some datasets\n")); } } -- cgit v1.2.3 From 6581b17842164dc882b00e5a6a0e468e9eadd8ed Mon Sep 17 00:00:00 2001 From: Seth Troisi Date: Mon, 22 Apr 2024 10:47:44 -0700 Subject: ZTS: user_namespace_004.ksh avoid error in cleanup if unsupported Reviewed-by: Brian Behlendorf Signed-off-by: Seth Troisi Closes #16114 --- .../zfs-tests/tests/functional/user_namespace/user_namespace_004.ksh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/zfs-tests/tests/functional/user_namespace/user_namespace_004.ksh b/tests/zfs-tests/tests/functional/user_namespace/user_namespace_004.ksh index 37ef84b72377..e6ad25f23f93 100755 --- a/tests/zfs-tests/tests/functional/user_namespace/user_namespace_004.ksh +++ b/tests/zfs-tests/tests/functional/user_namespace/user_namespace_004.ksh @@ -44,8 +44,6 @@ user_ns_cleanup() { log_must zfs destroy -r "$TESTPOOL/userns" } -log_onexit user_ns_cleanup - log_assert "Check zfs zone command handling of non-namespace files" # Pass if user namespaces are not supported. @@ -54,6 +52,8 @@ if [ "$?" -ne "0" ]; then log_unsupported "Failed to create user namespace" fi +log_onexit user_ns_cleanup + # Create the baseline datasets. log_must zfs create -o zoned=on "$TESTPOOL/userns" -- cgit v1.2.3 From 284489893bdd53004fdbc976106cac54a4069250 Mon Sep 17 00:00:00 2001 From: Todd <18294602+seidelma@users.noreply.github.com> Date: Mon, 22 Apr 2024 17:55:41 -0700 Subject: zfs-kmod: fix empty rpm requires/conflicts Fix an error in zfs-kmod.spec that causes kmod-zfs packages not to include the correct RPM requires/conflicts relationships. 
With this change applied, RPM correctly no longer allows kmod-zfs & zfs-dkms packages to be installed together. Reviewed-by: Brian Behlendorf Signed-off-by: Todd Seidelmann <18294602+seidelma@users.noreply.github.com> Closes #16121 --- rpm/redhat/zfs-kmod.spec.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rpm/redhat/zfs-kmod.spec.in b/rpm/redhat/zfs-kmod.spec.in index 9c836786baea..876c198c64de 100644 --- a/rpm/redhat/zfs-kmod.spec.in +++ b/rpm/redhat/zfs-kmod.spec.in @@ -17,7 +17,7 @@ BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-root-%(%{__id_u} -n) # by generating a preamble text file which kmodtool can append to the spec file. %(/bin/echo -e "\ Requires: @PACKAGE@ = %{version}\n\ -Conflicts: @PACKAGE@-dkms) +Conflicts: @PACKAGE@-dkms" > %{_sourcedir}/kmod-preamble) # LDFLAGS are not sanitized by arch/*/Makefile for these architectures. %ifarch ppc ppc64 ppc64le aarch64 -- cgit v1.2.3 From 71216b91d281e7e58f5e29ca4d4553945e080fe9 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Thu, 25 Apr 2024 13:40:09 -0700 Subject: Python 3.12 deprecated python3-distutils As for python-3.12 the distutils package has been deprecated. The latest ax_python_devel.m4 macro from the autoconf archive has been updated accordingly so let's pull in the new version. We can also drop the changes made to our customized version to continue if the development version is not installed since this functionality has been included upstream. Reviewed-by: Rich Ercolani Signed-off-by: Brian Behlendorf Closes #16126 Closes #16129 --- config/always-pyzfs.m4 | 9 +- config/ax_python_devel.m4 | 341 +++++++++++++++++++++++++++++++--------------- contrib/debian/control | 2 +- 3 files changed, 235 insertions(+), 117 deletions(-) diff --git a/config/always-pyzfs.m4 b/config/always-pyzfs.m4 index 9b123b1b2db1..98c1cc230205 100644 --- a/config/always-pyzfs.m4 +++ b/config/always-pyzfs.m4 @@ -80,10 +80,11 @@ AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_PYZFS], [ [AC_MSG_ERROR("Python $PYTHON_VERSION unknown")] ) - AX_PYTHON_DEVEL([$PYTHON_REQUIRED_VERSION], [ - AS_IF([test "x$enable_pyzfs" = xyes], [ - AC_MSG_ERROR("Python $PYTHON_REQUIRED_VERSION development library is not installed") - ], [test "x$enable_pyzfs" != xno], [ + AS_IF([test "x$enable_pyzfs" = xyes], [ + AX_PYTHON_DEVEL([$PYTHON_REQUIRED_VERSION]) + ], [ + AX_PYTHON_DEVEL([$PYTHON_REQUIRED_VERSION], [true]) + AS_IF([test "x$ax_python_devel_found" = xno], [ enable_pyzfs=no ]) ]) diff --git a/config/ax_python_devel.m4 b/config/ax_python_devel.m4 index f6d4b01444d6..1f480db6d233 100644 --- a/config/ax_python_devel.m4 +++ b/config/ax_python_devel.m4 @@ -4,18 +4,13 @@ # # SYNOPSIS # -# AX_PYTHON_DEVEL([version], [action-if-not-found]) +# AX_PYTHON_DEVEL([version[,optional]]) # # DESCRIPTION # # Note: Defines as a precious variable "PYTHON_VERSION". Don't override it # in your configure.ac. # -# Note: this is a slightly modified version of the original AX_PYTHON_DEVEL -# macro which accepts an additional [action-if-not-found] argument. This -# allow to detect if Python development is available without aborting the -# configure phase with an hard error in case it is not. -# # This macro checks for Python and tries to get the include path to # 'Python.h'. It provides the $(PYTHON_CPPFLAGS) and $(PYTHON_LIBS) output # variables. It also exports $(PYTHON_EXTRA_LIBS) and @@ -28,6 +23,11 @@ # version number. Don't use "PYTHON_VERSION" for this: that environment # variable is declared as precious and thus reserved for the end-user. 
# +# By default this will fail if it does not detect a development version of +# python. If you want it to continue, set optional to true, like +# AX_PYTHON_DEVEL([], [true]). The ax_python_devel_found variable will be +# "no" if it fails. +# # This macro should work for all versions of Python >= 2.1.0. As an end # user, you can disable the check for the python version by setting the # PYTHON_NOVERSIONCHECK environment variable to something else than the @@ -45,7 +45,6 @@ # Copyright (c) 2009 Matteo Settenvini # Copyright (c) 2009 Horst Knorr # Copyright (c) 2013 Daniel Mullner -# Copyright (c) 2018 loli10K # # This program is free software: you can redistribute it and/or modify it # under the terms of the GNU General Public License as published by the @@ -73,10 +72,18 @@ # modified version of the Autoconf Macro, you may extend this special # exception to the GPL to apply to your modified version as well. -#serial 21 +#serial 36 AU_ALIAS([AC_PYTHON_DEVEL], [AX_PYTHON_DEVEL]) AC_DEFUN([AX_PYTHON_DEVEL],[ + # Get whether it's optional + if test -z "$2"; then + ax_python_devel_optional=false + else + ax_python_devel_optional=$2 + fi + ax_python_devel_found=yes + # # Allow the use of a (user set) custom python version # @@ -87,23 +94,26 @@ AC_DEFUN([AX_PYTHON_DEVEL],[ AC_PATH_PROG([PYTHON],[python[$PYTHON_VERSION]]) if test -z "$PYTHON"; then - m4_ifvaln([$2],[$2],[ - AC_MSG_ERROR([Cannot find python$PYTHON_VERSION in your system path]) - PYTHON_VERSION="" - ]) + AC_MSG_WARN([Cannot find python$PYTHON_VERSION in your system path]) + if ! $ax_python_devel_optional; then + AC_MSG_ERROR([Giving up, python development not available]) + fi + ax_python_devel_found=no + PYTHON_VERSION="" fi - # - # Check for a version of Python >= 2.1.0 - # - AC_MSG_CHECKING([for a version of Python >= '2.1.0']) - ac_supports_python_ver=`$PYTHON -c "import sys; \ + if test $ax_python_devel_found = yes; then + # + # Check for a version of Python >= 2.1.0 + # + AC_MSG_CHECKING([for a version of Python >= '2.1.0']) + ac_supports_python_ver=`$PYTHON -c "import sys; \ ver = sys.version.split ()[[0]]; \ print (ver >= '2.1.0')"` - if test "$ac_supports_python_ver" != "True"; then + if test "$ac_supports_python_ver" != "True"; then if test -z "$PYTHON_NOVERSIONCHECK"; then AC_MSG_RESULT([no]) - AC_MSG_FAILURE([ + AC_MSG_WARN([ This version of the AC@&t@_PYTHON_DEVEL macro doesn't work properly with versions of Python before 2.1.0. You may need to re-run configure, setting the @@ -112,20 +122,27 @@ PYTHON_EXTRA_LIBS and PYTHON_EXTRA_LDFLAGS by hand. Moreover, to disable this check, set PYTHON_NOVERSIONCHECK to something else than an empty string. ]) + if ! $ax_python_devel_optional; then + AC_MSG_FAILURE([Giving up]) + fi + ax_python_devel_found=no + PYTHON_VERSION="" else AC_MSG_RESULT([skip at user request]) fi - else + else AC_MSG_RESULT([yes]) + fi fi - # - # If the macro parameter ``version'' is set, honour it. - # A Python shim class, VPy, is used to implement correct version comparisons via - # string expressions, since e.g. a naive textual ">= 2.7.3" won't work for - # Python 2.7.10 (the ".1" being evaluated as less than ".3"). - # - if test -n "$1"; then + if test $ax_python_devel_found = yes; then + # + # If the macro parameter ``version'' is set, honour it. + # A Python shim class, VPy, is used to implement correct version comparisons via + # string expressions, since e.g. a naive textual ">= 2.7.3" won't work for + # Python 2.7.10 (the ".1" being evaluated as less than ".3"). 
+ # + if test -n "$1"; then AC_MSG_CHECKING([for a version of Python $1]) cat << EOF > ax_python_devel_vpy.py class VPy: @@ -133,7 +150,7 @@ class VPy: return tuple(map(int, s.strip().replace("rc", ".").split("."))) def __init__(self): import sys - self.vpy = tuple(sys.version_info) + self.vpy = tuple(sys.version_info)[[:3]] def __eq__(self, s): return self.vpy == self.vtup(s) def __ne__(self, s): @@ -155,25 +172,69 @@ EOF AC_MSG_RESULT([yes]) else AC_MSG_RESULT([no]) - AC_MSG_ERROR([this package requires Python $1. + AC_MSG_WARN([this package requires Python $1. If you have it installed, but it isn't the default Python interpreter in your system path, please pass the PYTHON_VERSION variable to configure. See ``configure --help'' for reference. ]) + if ! $ax_python_devel_optional; then + AC_MSG_ERROR([Giving up]) + fi + ax_python_devel_found=no PYTHON_VERSION="" fi + fi fi - # - # Check for Python include path - # - # - AC_MSG_CHECKING([for Python include path]) - if test -z "$PYTHON_CPPFLAGS"; then - python_path=`$PYTHON -c "import sysconfig; \ - print (sysconfig.get_path('include'));"` - plat_python_path=`$PYTHON -c "import sysconfig; \ - print (sysconfig.get_path('platinclude'));"` + if test $ax_python_devel_found = yes; then + # + # Check if you have distutils, else fail + # + AC_MSG_CHECKING([for the sysconfig Python package]) + ac_sysconfig_result=`$PYTHON -c "import sysconfig" 2>&1` + if test $? -eq 0; then + AC_MSG_RESULT([yes]) + IMPORT_SYSCONFIG="import sysconfig" + else + AC_MSG_RESULT([no]) + + AC_MSG_CHECKING([for the distutils Python package]) + ac_sysconfig_result=`$PYTHON -c "from distutils import sysconfig" 2>&1` + if test $? -eq 0; then + AC_MSG_RESULT([yes]) + IMPORT_SYSCONFIG="from distutils import sysconfig" + else + AC_MSG_WARN([cannot import Python module "distutils". +Please check your Python installation. The error was: +$ac_sysconfig_result]) + if ! $ax_python_devel_optional; then + AC_MSG_ERROR([Giving up]) + fi + ax_python_devel_found=no + PYTHON_VERSION="" + fi + fi + fi + + if test $ax_python_devel_found = yes; then + # + # Check for Python include path + # + AC_MSG_CHECKING([for Python include path]) + if test -z "$PYTHON_CPPFLAGS"; then + if test "$IMPORT_SYSCONFIG" = "import sysconfig"; then + # sysconfig module has different functions + python_path=`$PYTHON -c "$IMPORT_SYSCONFIG; \ + print (sysconfig.get_path ('include'));"` + plat_python_path=`$PYTHON -c "$IMPORT_SYSCONFIG; \ + print (sysconfig.get_path ('platinclude'));"` + else + # old distutils way + python_path=`$PYTHON -c "$IMPORT_SYSCONFIG; \ + print (sysconfig.get_python_inc ());"` + plat_python_path=`$PYTHON -c "$IMPORT_SYSCONFIG; \ + print (sysconfig.get_python_inc (plat_specific=1));"` + fi if test -n "${python_path}"; then if test "${plat_python_path}" != "${python_path}"; then python_path="-I$python_path -I$plat_python_path" @@ -182,15 +243,15 @@ variable to configure. See ``configure --help'' for reference. 
fi fi PYTHON_CPPFLAGS=$python_path - fi - AC_MSG_RESULT([$PYTHON_CPPFLAGS]) - AC_SUBST([PYTHON_CPPFLAGS]) + fi + AC_MSG_RESULT([$PYTHON_CPPFLAGS]) + AC_SUBST([PYTHON_CPPFLAGS]) - # - # Check for Python library path - # - AC_MSG_CHECKING([for Python library path]) - if test -z "$PYTHON_LIBS"; then + # + # Check for Python library path + # + AC_MSG_CHECKING([for Python library path]) + if test -z "$PYTHON_LIBS"; then # (makes two attempts to ensure we've got a version number # from the interpreter) ac_python_version=`cat</dev/null || \ - $PYTHON -c "import sysconfig; \ - print (sysconfig.get_path('purelib'));"` - fi - AC_MSG_RESULT([$PYTHON_SITE_PKG]) - AC_SUBST([PYTHON_SITE_PKG]) + if test $ax_python_devel_found = yes; then + AC_MSG_RESULT([$PYTHON_LIBS]) + AC_SUBST([PYTHON_LIBS]) - # - # libraries which must be linked in when embedding - # - AC_MSG_CHECKING(python extra libraries) - if test -z "$PYTHON_EXTRA_LIBS"; then - PYTHON_EXTRA_LIBS=`$PYTHON -c "import sysconfig; \ + # + # Check for site packages + # + AC_MSG_CHECKING([for Python site-packages path]) + if test -z "$PYTHON_SITE_PKG"; then + if test "$IMPORT_SYSCONFIG" = "import sysconfig"; then + PYTHON_SITE_PKG=`$PYTHON -c " +$IMPORT_SYSCONFIG; +if hasattr(sysconfig, 'get_default_scheme'): + scheme = sysconfig.get_default_scheme() +else: + scheme = sysconfig._get_default_scheme() +if scheme == 'posix_local': + # Debian's default scheme installs to /usr/local/ but we want to find headers in /usr/ + scheme = 'posix_prefix' +prefix = '$prefix' +if prefix == 'NONE': + prefix = '$ac_default_prefix' +sitedir = sysconfig.get_path('purelib', scheme, vars={'base': prefix}) +print(sitedir)"` + else + # distutils.sysconfig way + PYTHON_SITE_PKG=`$PYTHON -c "$IMPORT_SYSCONFIG; \ + print (sysconfig.get_python_lib(0,0));"` + fi + fi + AC_MSG_RESULT([$PYTHON_SITE_PKG]) + AC_SUBST([PYTHON_SITE_PKG]) + + # + # Check for platform-specific site packages + # + AC_MSG_CHECKING([for Python platform specific site-packages path]) + if test -z "$PYTHON_PLATFORM_SITE_PKG"; then + if test "$IMPORT_SYSCONFIG" = "import sysconfig"; then + PYTHON_PLATFORM_SITE_PKG=`$PYTHON -c " +$IMPORT_SYSCONFIG; +if hasattr(sysconfig, 'get_default_scheme'): + scheme = sysconfig.get_default_scheme() +else: + scheme = sysconfig._get_default_scheme() +if scheme == 'posix_local': + # Debian's default scheme installs to /usr/local/ but we want to find headers in /usr/ + scheme = 'posix_prefix' +prefix = '$prefix' +if prefix == 'NONE': + prefix = '$ac_default_prefix' +sitedir = sysconfig.get_path('platlib', scheme, vars={'platbase': prefix}) +print(sitedir)"` + else + # distutils.sysconfig way + PYTHON_PLATFORM_SITE_PKG=`$PYTHON -c "$IMPORT_SYSCONFIG; \ + print (sysconfig.get_python_lib(1,0));"` + fi + fi + AC_MSG_RESULT([$PYTHON_PLATFORM_SITE_PKG]) + AC_SUBST([PYTHON_PLATFORM_SITE_PKG]) + + # + # libraries which must be linked in when embedding + # + AC_MSG_CHECKING(python extra libraries) + if test -z "$PYTHON_EXTRA_LIBS"; then + PYTHON_EXTRA_LIBS=`$PYTHON -c "$IMPORT_SYSCONFIG; \ conf = sysconfig.get_config_var; \ print (conf('LIBS') + ' ' + conf('SYSLIBS'))"` - fi - AC_MSG_RESULT([$PYTHON_EXTRA_LIBS]) - AC_SUBST(PYTHON_EXTRA_LIBS) + fi + AC_MSG_RESULT([$PYTHON_EXTRA_LIBS]) + AC_SUBST(PYTHON_EXTRA_LIBS) - # - # linking flags needed when embedding - # - AC_MSG_CHECKING(python extra linking flags) - if test -z "$PYTHON_EXTRA_LDFLAGS"; then - PYTHON_EXTRA_LDFLAGS=`$PYTHON -c "import sysconfig; \ + # + # linking flags needed when embedding + # + AC_MSG_CHECKING(python extra linking 
flags) + if test -z "$PYTHON_EXTRA_LDFLAGS"; then + PYTHON_EXTRA_LDFLAGS=`$PYTHON -c "$IMPORT_SYSCONFIG; \ conf = sysconfig.get_config_var; \ print (conf('LINKFORSHARED'))"` - fi - AC_MSG_RESULT([$PYTHON_EXTRA_LDFLAGS]) - AC_SUBST(PYTHON_EXTRA_LDFLAGS) + # Hack for macos, it sticks this in here. + PYTHON_EXTRA_LDFLAGS=`echo $PYTHON_EXTRA_LDFLAGS | sed 's/CoreFoundation.*$/CoreFoundation/'` + fi + AC_MSG_RESULT([$PYTHON_EXTRA_LDFLAGS]) + AC_SUBST(PYTHON_EXTRA_LDFLAGS) - # - # final check to see if everything compiles alright - # - AC_MSG_CHECKING([consistency of all components of python development environment]) - # save current global flags - ac_save_LIBS="$LIBS" - ac_save_LDFLAGS="$LDFLAGS" - ac_save_CPPFLAGS="$CPPFLAGS" - LIBS="$ac_save_LIBS $PYTHON_LIBS $PYTHON_EXTRA_LIBS $PYTHON_EXTRA_LIBS" - LDFLAGS="$ac_save_LDFLAGS $PYTHON_EXTRA_LDFLAGS" - CPPFLAGS="$ac_save_CPPFLAGS $PYTHON_CPPFLAGS" - AC_LANG_PUSH([C]) - AC_LINK_IFELSE([ + # + # final check to see if everything compiles alright + # + AC_MSG_CHECKING([consistency of all components of python development environment]) + # save current global flags + ac_save_LIBS="$LIBS" + ac_save_LDFLAGS="$LDFLAGS" + ac_save_CPPFLAGS="$CPPFLAGS" + LIBS="$ac_save_LIBS $PYTHON_LIBS $PYTHON_EXTRA_LIBS" + LDFLAGS="$ac_save_LDFLAGS $PYTHON_EXTRA_LDFLAGS" + CPPFLAGS="$ac_save_CPPFLAGS $PYTHON_CPPFLAGS" + AC_LANG_PUSH([C]) + AC_LINK_IFELSE([ AC_LANG_PROGRAM([[#include ]], [[Py_Initialize();]]) ],[pythonexists=yes],[pythonexists=no]) - AC_LANG_POP([C]) - # turn back to default flags - CPPFLAGS="$ac_save_CPPFLAGS" - LIBS="$ac_save_LIBS" - LDFLAGS="$ac_save_LDFLAGS" + AC_LANG_POP([C]) + # turn back to default flags + CPPFLAGS="$ac_save_CPPFLAGS" + LIBS="$ac_save_LIBS" + LDFLAGS="$ac_save_LDFLAGS" - AC_MSG_RESULT([$pythonexists]) + AC_MSG_RESULT([$pythonexists]) - if test ! "x$pythonexists" = "xyes"; then - m4_ifvaln([$2],[$2],[ - AC_MSG_FAILURE([ + if test ! "x$pythonexists" = "xyes"; then + AC_MSG_WARN([ Could not link test program to Python. Maybe the main Python library has been installed in some non-standard library path. If so, pass it to configure, via the LIBS environment variable. @@ -340,9 +453,13 @@ EOD` You probably have to install the development version of the Python package for your distribution. The exact name of this package varies among them. ============================================================================ - ]) - PYTHON_VERSION="" - ]) + ]) + if ! $ax_python_devel_optional; then + AC_MSG_ERROR([Giving up]) + fi + ax_python_devel_found=no + PYTHON_VERSION="" + fi fi # diff --git a/contrib/debian/control b/contrib/debian/control index 98beb900d0fa..e56fbf0f1c93 100644 --- a/contrib/debian/control +++ b/contrib/debian/control @@ -189,7 +189,7 @@ Depends: dkms (>> 2.1.1.2-5), file, libc6-dev | libc-dev, lsb-release, - python3-distutils | libpython3-stdlib (<< 3.6.4), + python3 (>> 3.12) | python3-distutils | libpython3-stdlib (<< 3.6.4), ${misc:Depends}, ${perl:Depends} Recommends: openzfs-zfs-zed, openzfs-zfsutils (>= ${source:Version}), ${linux:Recommends} -- cgit v1.2.3 From ef3fea63eb22ed07d941625d6893967d59fe179c Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Mon, 29 Apr 2024 11:31:50 -0700 Subject: GCC: Fixes for gcc 14 on Fedora 40 - Workaround dangling pointer in uu_list.c (#16124) - Fix calloc() transposed arguments in zpool_vdev_os.c - Make some temp variables unsigned to prevent triggering a '-Werror=alloc-size-larger-than' error. 
Reviewed-by: Brian Behlendorf Signed-off-by: Tony Hutter Closes #16124 Closes #16125 --- cmd/zpool/os/linux/zpool_vdev_os.c | 2 +- lib/libuutil/uu_list.c | 14 ++++++++++---- module/zfs/vdev_raidz.c | 5 +++-- 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/cmd/zpool/os/linux/zpool_vdev_os.c b/cmd/zpool/os/linux/zpool_vdev_os.c index 80627b58211c..f194d28c55a9 100644 --- a/cmd/zpool/os/linux/zpool_vdev_os.c +++ b/cmd/zpool/os/linux/zpool_vdev_os.c @@ -438,7 +438,7 @@ static char *zpool_sysfs_gets(char *path) return (NULL); } - buf = calloc(sizeof (*buf), statbuf.st_size + 1); + buf = calloc(statbuf.st_size + 1, sizeof (*buf)); if (buf == NULL) { close(fd); return (NULL); diff --git a/lib/libuutil/uu_list.c b/lib/libuutil/uu_list.c index 0ca6f05205e9..aa8b129cc22a 100644 --- a/lib/libuutil/uu_list.c +++ b/lib/libuutil/uu_list.c @@ -505,14 +505,20 @@ uu_list_walk(uu_list_t *lp, uu_walk_fn_t *func, void *private, uint32_t flags) } if (lp->ul_debug || robust) { - uu_list_walk_t my_walk; + uu_list_walk_t *my_walk; void *e; - list_walk_init(&my_walk, lp, flags); + my_walk = uu_zalloc(sizeof (*my_walk)); + if (my_walk == NULL) + return (-1); + + list_walk_init(my_walk, lp, flags); while (status == UU_WALK_NEXT && - (e = uu_list_walk_next(&my_walk)) != NULL) + (e = uu_list_walk_next(my_walk)) != NULL) status = (*func)(e, private); - list_walk_fini(&my_walk); + list_walk_fini(my_walk); + + uu_free(my_walk); } else { if (!reverse) { for (np = lp->ul_null_node.uln_next; diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index 3445fa9d35d5..34ef57a3548d 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -1283,8 +1283,9 @@ vdev_raidz_matrix_reconstruct(raidz_row_t *rr, int n, int nmissing, static void vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts) { - int n, i, c, t, tt; - int nmissing_rows; + int i, c, t, tt; + unsigned int n; + unsigned int nmissing_rows; int missing_rows[VDEV_RAIDZ_MAXPARITY]; int parity_map[VDEV_RAIDZ_MAXPARITY]; uint8_t *p, *pp; -- cgit v1.2.3 From 5972bb856c1e84be582c54dd0ff1559ee4dfa068 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dag-Erling=20Sm=C3=B8rgrav?= Date: Wed, 30 Aug 2023 17:13:10 +0200 Subject: Use ASSERT0P() to check that a pointer is NULL. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewed-by: Brian Behlendorf Reviewed-by: Kay Pedersen Reviewed-by: Alexander Motin Signed-off-by: Dag-Erling Smørgrav Closes #15225 --- module/zfs/dbuf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index a94ba59567ec..8bd9dd9a88c5 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -2701,7 +2701,7 @@ dmu_buf_will_clone(dmu_buf_t *db_fake, dmu_tx_t *tx) mutex_enter(&db->db_mtx); DBUF_VERIFY(db); VERIFY(!dbuf_undirty(db, tx)); - ASSERT3P(dbuf_find_dirty_eq(db, tx->tx_txg), ==, NULL); + ASSERT0P(dbuf_find_dirty_eq(db, tx->tx_txg)); if (db->db_buf != NULL) { arc_buf_destroy(db->db_buf, db); db->db_buf = NULL; -- cgit v1.2.3 From 4d17e200dde13ab747311bc824a53fc8071c77d7 Mon Sep 17 00:00:00 2001 From: Ameer Hamza Date: Fri, 19 Apr 2024 22:19:12 +0500 Subject: Add zfetch stats in arcstats arc_summary also reports zfetch stats but it's inconvenient to monitor contiguously incrementing numbers. Adding them in arcstats allows us to observe streams more conveniently. 
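For a rough sense of how the new view is meant to be used (the flag, the column names, and the kstat path below are the ones introduced or read by this change; the invocation itself is only an illustrative sketch):

    # Sample the zfetch columns (ztotal, zhits, zahead, zpast, zmisses, zmax,
    # zfuture, zstride, zissued, zactive) once per second, five times.
    # -z cannot be combined with -x or with -f field selection.
    arcstat -z 1 5

    # The cumulative counters remain available in the raw kstat the script
    # parses on Linux:
    cat /proc/spl/kstat/zfs/zfetchstats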
Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Ameer Hamza Closes #16094 --- cmd/arcstat.in | 47 ++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 42 insertions(+), 5 deletions(-) diff --git a/cmd/arcstat.in b/cmd/arcstat.in index 8df1c62f7e86..220f343b5b62 100755 --- a/cmd/arcstat.in +++ b/cmd/arcstat.in @@ -157,6 +157,16 @@ cols = { "free": [5, 1024, "ARC free memory"], "avail": [5, 1024, "ARC available memory"], "waste": [5, 1024, "Wasted memory due to round up to pagesize"], + "ztotal": [6, 1000, "zfetch total prefetcher calls per second"], + "zhits": [5, 1000, "zfetch stream hits per second"], + "zahead": [6, 1000, "zfetch hits ahead of streams per second"], + "zpast": [5, 1000, "zfetch hits behind streams per second"], + "zmisses": [7, 1000, "zfetch stream misses per second"], + "zmax": [4, 1000, "zfetch limit reached per second"], + "zfuture": [7, 1000, "zfetch stream future per second"], + "zstride": [7, 1000, "zfetch stream strides per second"], + "zissued": [7, 1000, "zfetch prefetches issued per second"], + "zactive": [7, 1000, "zfetch prefetches active per second"], } v = {} @@ -164,6 +174,8 @@ hdr = ["time", "read", "ddread", "ddh%", "dmread", "dmh%", "pread", "ph%", "size", "c", "avail"] xhdr = ["time", "mfu", "mru", "mfug", "mrug", "unc", "eskip", "mtxmis", "dread", "pread", "read"] +zhdr = ["time", "ztotal", "zhits", "zahead", "zpast", "zmisses", "zmax", + "zfuture", "zstride", "zissued", "zactive"] sint = 1 # Default interval is 1 second count = 1 # Default count is 1 hdr_intr = 20 # Print header every 20 lines of output @@ -206,12 +218,17 @@ elif sys.platform.startswith('linux'): def kstat_update(): global kstat - k = [line.strip() for line in open('/proc/spl/kstat/zfs/arcstats')] + k1 = [line.strip() for line in open('/proc/spl/kstat/zfs/arcstats')] - if not k: + k2 = ["zfetch_" + line.strip() for line in + open('/proc/spl/kstat/zfs/zfetchstats')] + + if k1 is None or k2 is None: sys.exit(1) - del k[0:2] + del k1[0:2] + del k2[0:2] + k = k1 + k2 kstat = {} for s in k: @@ -239,6 +256,7 @@ def usage(): sys.stderr.write("\t -v : List all possible field headers and definitions" "\n") sys.stderr.write("\t -x : Print extended stats\n") + sys.stderr.write("\t -z : Print zfetch stats\n") sys.stderr.write("\t -f : Specify specific fields to print (see -v)\n") sys.stderr.write("\t -o : Redirect output to the specified file\n") sys.stderr.write("\t -s : Override default field separator with custom " @@ -357,6 +375,7 @@ def init(): global count global hdr global xhdr + global zhdr global opfile global sep global out @@ -368,15 +387,17 @@ def init(): xflag = False hflag = False vflag = False + zflag = False i = 1 try: opts, args = getopt.getopt( sys.argv[1:], - "axo:hvs:f:p", + "axzo:hvs:f:p", [ "all", "extended", + "zfetch", "outfile", "help", "verbose", @@ -410,13 +431,15 @@ def init(): i += 1 if opt in ('-p', '--parsable'): pretty_print = False + if opt in ('-z', '--zfetch'): + zflag = True i += 1 argv = sys.argv[i:] sint = int(argv[0]) if argv else sint count = int(argv[1]) if len(argv) > 1 else (0 if len(argv) > 0 else 1) - if hflag or (xflag and desired_cols): + if hflag or (xflag and zflag) or ((zflag or xflag) and desired_cols): usage() if vflag: @@ -425,6 +448,9 @@ def init(): if xflag: hdr = xhdr + if zflag: + hdr = zhdr + update_hdr_intr() # check if L2ARC exists @@ -569,6 +595,17 @@ def calculate(): v["el2mru"] = d["evict_l2_eligible_mru"] // sint v["el2inel"] = d["evict_l2_ineligible"] // sint v["mtxmis"] = d["mutex_miss"] // sint 
+ v["ztotal"] = (d["zfetch_hits"] + d["zfetch_future"] + d["zfetch_stride"] + + d["zfetch_past"] + d["zfetch_misses"]) // sint + v["zhits"] = d["zfetch_hits"] // sint + v["zahead"] = (d["zfetch_future"] + d["zfetch_stride"]) // sint + v["zpast"] = d["zfetch_past"] // sint + v["zmisses"] = d["zfetch_misses"] // sint + v["zmax"] = d["zfetch_max_streams"] // sint + v["zfuture"] = d["zfetch_future"] // sint + v["zstride"] = d["zfetch_stride"] // sint + v["zissued"] = d["zfetch_io_issued"] // sint + v["zactive"] = d["zfetch_io_active"] // sint if l2exist: v["l2hits"] = d["l2_hits"] // sint -- cgit v1.2.3 From b3b37b84e8336b47c4cbbeae2d4f21ba5c266144 Mon Sep 17 00:00:00 2001 From: Ameer Hamza Date: Tue, 30 Apr 2024 01:28:50 +0500 Subject: Fix arcstats for FreeBSD after zfetch support Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Ameer Hamza Closes #16141 --- cmd/arcstat.in | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/cmd/arcstat.in b/cmd/arcstat.in index 220f343b5b62..c4f10a1d6d3b 100755 --- a/cmd/arcstat.in +++ b/cmd/arcstat.in @@ -200,6 +200,8 @@ if sys.platform.startswith('freebsd'): k = [ctl for ctl in sysctl.filter('kstat.zfs.misc.arcstats') if ctl.type != sysctl.CTLTYPE_NODE] + k += [ctl for ctl in sysctl.filter('kstat.zfs.misc.zfetchstats') + if ctl.type != sysctl.CTLTYPE_NODE] if not k: sys.exit(1) @@ -211,8 +213,12 @@ if sys.platform.startswith('freebsd'): continue name, value = s.name, s.value - # Trims 'kstat.zfs.misc.arcstats' from the name - kstat[name[24:]] = int(value) + + if "arcstats" in name: + # Trims 'kstat.zfs.misc.arcstats' from the name + kstat[name[24:]] = int(value) + else: + kstat["zfetch_" + name[27:]] = int(value) elif sys.platform.startswith('linux'): def kstat_update(): -- cgit v1.2.3 From 6f323353d280d5e1e318500d05522acd4ae6f894 Mon Sep 17 00:00:00 2001 From: George Wilson Date: Fri, 29 Mar 2024 15:15:56 -0400 Subject: Add ashift validation when adding devices to a pool Currently, zpool add allows users to add top-level vdevs that have different ashifts but doing so prevents users from being able to perform a top-level vdev removal. Often times consumers may not realize that they have mismatched ashifts until the top-level removal fails. This feature adds ashift validation to the zpool add command and will fail the operation if the sector size of the specified vdev does not match the existing pool. This behavior can be disabled by using the -f flag. In addition, new flags have been added to provide fine-grained control to disable specific checks. These flags are: --allow-in-use --allow-ashift-mismatch --allow-replicaton-mismatch The force flag will disable all of these checks. 
Reviewed by: Brian Behlendorf Reviewed by: Alexander Motin Reviewed-by: Mark Maybee Signed-off-by: George Wilson Closes #15509 --- cmd/zpool/zpool_main.c | 76 +++++++++++++++++----- cmd/ztest.c | 8 +-- include/libzfs.h | 5 +- include/sys/fs/zfs.h | 3 +- include/sys/spa.h | 4 +- lib/libzfs/libzfs.abi | 76 ++++++++++++++++++---- lib/libzfs/libzfs_pool.c | 5 +- lib/libzfs/libzfs_util.c | 8 ++- man/man8/zpool-add.8 | 18 ++++- module/zfs/spa.c | 14 +++- module/zfs/zfs_ioctl.c | 4 +- tests/runfiles/common.run | 3 +- tests/zfs-tests/tests/Makefile.am | 1 + .../functional/cli_root/zpool_add/add-o_ashift.ksh | 17 ++++- .../cli_root/zpool_add/add_prop_ashift.ksh | 16 ++++- .../zpool_add/zpool_add--allow-ashift-mismatch.ksh | 0 .../cli_root/zpool_add/zpool_add_002_pos.ksh | 11 ++++ .../cli_root/zpool_add/zpool_add_004_pos.ksh | 2 +- .../cli_root/zpool_add/zpool_add_005_pos.ksh | 2 + .../cli_root/zpool_add/zpool_add_009_neg.ksh | 2 + .../cli_root/zpool_add/zpool_add_010_pos.ksh | 2 +- 21 files changed, 219 insertions(+), 58 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add--allow-ashift-mismatch.ksh diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 1e577e712be5..20b1c85065b4 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -22,7 +22,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2011, 2020 by Delphix. All rights reserved. + * Copyright (c) 2011, 2024 by Delphix. All rights reserved. * Copyright (c) 2012 by Frederik Wessels. All rights reserved. * Copyright (c) 2012 by Cyril Plisko. All rights reserved. * Copyright (c) 2013 by Prasad Joshi (sTec). All rights reserved. @@ -131,6 +131,13 @@ static int zpool_do_help(int argc, char **argv); static zpool_compat_status_t zpool_do_load_compat( const char *, boolean_t *); +enum zpool_options { + ZPOOL_OPTION_POWER = 1024, + ZPOOL_OPTION_ALLOW_INUSE, + ZPOOL_OPTION_ALLOW_REPLICATION_MISMATCH, + ZPOOL_OPTION_ALLOW_ASHIFT_MISMATCH +}; + /* * These libumem hooks provide a reasonable set of defaults for the allocator's * debugging facilities. @@ -347,7 +354,7 @@ get_usage(zpool_help_t idx) { switch (idx) { case HELP_ADD: - return (gettext("\tadd [-fgLnP] [-o property=value] " + return (gettext("\tadd [-afgLnP] [-o property=value] " " ...\n")); case HELP_ATTACH: return (gettext("\tattach [-fsw] [-o property=value] " @@ -1009,8 +1016,9 @@ add_prop_list_default(const char *propname, const char *propval, } /* - * zpool add [-fgLnP] [-o property=value] ... + * zpool add [-afgLnP] [-o property=value] ... * + * -a Disable the ashift validation checks * -f Force addition of devices, even if they appear in use * -g Display guid for individual vdev name. * -L Follow links when resolving vdev path name. 
@@ -1026,8 +1034,11 @@ add_prop_list_default(const char *propname, const char *propval, int zpool_do_add(int argc, char **argv) { - boolean_t force = B_FALSE; + boolean_t check_replication = B_TRUE; + boolean_t check_inuse = B_TRUE; boolean_t dryrun = B_FALSE; + boolean_t check_ashift = B_TRUE; + boolean_t force = B_FALSE; int name_flags = 0; int c; nvlist_t *nvroot; @@ -1038,8 +1049,18 @@ zpool_do_add(int argc, char **argv) nvlist_t *props = NULL; char *propval; + struct option long_options[] = { + {"allow-in-use", no_argument, NULL, ZPOOL_OPTION_ALLOW_INUSE}, + {"allow-replication-mismatch", no_argument, NULL, + ZPOOL_OPTION_ALLOW_REPLICATION_MISMATCH}, + {"allow-ashift-mismatch", no_argument, NULL, + ZPOOL_OPTION_ALLOW_ASHIFT_MISMATCH}, + {0, 0, 0, 0} + }; + /* check options */ - while ((c = getopt(argc, argv, "fgLno:P")) != -1) { + while ((c = getopt_long(argc, argv, "fgLno:P", long_options, NULL)) + != -1) { switch (c) { case 'f': force = B_TRUE; @@ -1069,6 +1090,15 @@ zpool_do_add(int argc, char **argv) case 'P': name_flags |= VDEV_NAME_PATH; break; + case ZPOOL_OPTION_ALLOW_INUSE: + check_inuse = B_FALSE; + break; + case ZPOOL_OPTION_ALLOW_REPLICATION_MISMATCH: + check_replication = B_FALSE; + break; + case ZPOOL_OPTION_ALLOW_ASHIFT_MISMATCH: + check_ashift = B_FALSE; + break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); @@ -1089,6 +1119,19 @@ zpool_do_add(int argc, char **argv) usage(B_FALSE); } + if (force) { + if (!check_inuse || !check_replication || !check_ashift) { + (void) fprintf(stderr, gettext("'-f' option is not " + "allowed with '--allow-replication-mismatch', " + "'--allow-ashift-mismatch', or " + "'--allow-in-use'\n")); + usage(B_FALSE); + } + check_inuse = B_FALSE; + check_replication = B_FALSE; + check_ashift = B_FALSE; + } + poolname = argv[0]; argc--; @@ -1119,8 +1162,8 @@ zpool_do_add(int argc, char **argv) } /* pass off to make_root_vdev for processing */ - nvroot = make_root_vdev(zhp, props, force, !force, B_FALSE, dryrun, - argc, argv); + nvroot = make_root_vdev(zhp, props, !check_inuse, + check_replication, B_FALSE, dryrun, argc, argv); if (nvroot == NULL) { zpool_close(zhp); return (1); @@ -1224,7 +1267,7 @@ zpool_do_add(int argc, char **argv) ret = 0; } else { - ret = (zpool_add(zhp, nvroot) != 0); + ret = (zpool_add(zhp, nvroot, check_ashift) != 0); } nvlist_free(props); @@ -7068,7 +7111,6 @@ zpool_do_split(int argc, char **argv) return (ret); } -#define POWER_OPT 1024 /* * zpool online [--power] ... 
@@ -7086,7 +7128,7 @@ zpool_do_online(int argc, char **argv) int flags = 0; boolean_t is_power_on = B_FALSE; struct option long_options[] = { - {"power", no_argument, NULL, POWER_OPT}, + {"power", no_argument, NULL, ZPOOL_OPTION_POWER}, {0, 0, 0, 0} }; @@ -7096,7 +7138,7 @@ zpool_do_online(int argc, char **argv) case 'e': flags |= ZFS_ONLINE_EXPAND; break; - case POWER_OPT: + case ZPOOL_OPTION_POWER: is_power_on = B_TRUE; break; case '?': @@ -7209,7 +7251,7 @@ zpool_do_offline(int argc, char **argv) boolean_t is_power_off = B_FALSE; struct option long_options[] = { - {"power", no_argument, NULL, POWER_OPT}, + {"power", no_argument, NULL, ZPOOL_OPTION_POWER}, {0, 0, 0, 0} }; @@ -7222,7 +7264,7 @@ zpool_do_offline(int argc, char **argv) case 't': istmp = B_TRUE; break; - case POWER_OPT: + case ZPOOL_OPTION_POWER: is_power_off = B_TRUE; break; case '?': @@ -7322,7 +7364,7 @@ zpool_do_clear(int argc, char **argv) char *pool, *device; struct option long_options[] = { - {"power", no_argument, NULL, POWER_OPT}, + {"power", no_argument, NULL, ZPOOL_OPTION_POWER}, {0, 0, 0, 0} }; @@ -7339,7 +7381,7 @@ zpool_do_clear(int argc, char **argv) case 'X': xtreme_rewind = B_TRUE; break; - case POWER_OPT: + case ZPOOL_OPTION_POWER: is_power_on = B_TRUE; break; case '?': @@ -9099,7 +9141,7 @@ zpool_do_status(int argc, char **argv) char *cmd = NULL; struct option long_options[] = { - {"power", no_argument, NULL, POWER_OPT}, + {"power", no_argument, NULL, ZPOOL_OPTION_POWER}, {0, 0, 0, 0} }; @@ -9167,7 +9209,7 @@ zpool_do_status(int argc, char **argv) case 'x': cb.cb_explain = B_TRUE; break; - case POWER_OPT: + case ZPOOL_OPTION_POWER: cb.cb_print_power = B_TRUE; break; case '?': diff --git a/cmd/ztest.c b/cmd/ztest.c index 8cfbdfe1c2e2..34744d12b592 100644 --- a/cmd/ztest.c +++ b/cmd/ztest.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + * Copyright (c) 2011, 2024 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. * Copyright (c) 2014 Integros [integros.com] @@ -3270,7 +3270,7 @@ ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id) "log" : NULL, ztest_opts.zo_raid_children, zs->zs_mirrors, 1); - error = spa_vdev_add(spa, nvroot); + error = spa_vdev_add(spa, nvroot, B_FALSE); fnvlist_free(nvroot); switch (error) { @@ -3332,7 +3332,7 @@ ztest_vdev_class_add(ztest_ds_t *zd, uint64_t id) nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, class, ztest_opts.zo_raid_children, zs->zs_mirrors, 1); - error = spa_vdev_add(spa, nvroot); + error = spa_vdev_add(spa, nvroot, B_FALSE); fnvlist_free(nvroot); if (error == ENOSPC) @@ -3439,7 +3439,7 @@ ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id) */ nvlist_t *nvroot = make_vdev_root(NULL, aux, NULL, (ztest_opts.zo_vdev_size * 5) / 4, 0, NULL, 0, 0, 1); - error = spa_vdev_add(spa, nvroot); + error = spa_vdev_add(spa, nvroot, B_FALSE); switch (error) { case 0: diff --git a/include/libzfs.h b/include/libzfs.h index 4f06b5d3c24c..2823b8845827 100644 --- a/include/libzfs.h +++ b/include/libzfs.h @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2022 by Delphix. All rights reserved. + * Copyright (c) 2011, 2024 by Delphix. All rights reserved. * Copyright Joyent, Inc. * Copyright (c) 2013 Steven Hartland. All rights reserved. * Copyright (c) 2016, Intel Corporation. 
@@ -158,6 +158,7 @@ typedef enum zfs_error { EZFS_RESUME_EXISTS, /* Resume on existing dataset without force */ EZFS_SHAREFAILED, /* filesystem share failed */ EZFS_RAIDZ_EXPAND_IN_PROGRESS, /* a raidz is currently expanding */ + EZFS_ASHIFT_MISMATCH, /* can't add vdevs with different ashifts */ EZFS_UNKNOWN } zfs_error_t; @@ -261,7 +262,7 @@ _LIBZFS_H boolean_t zpool_skip_pool(const char *); _LIBZFS_H int zpool_create(libzfs_handle_t *, const char *, nvlist_t *, nvlist_t *, nvlist_t *); _LIBZFS_H int zpool_destroy(zpool_handle_t *, const char *); -_LIBZFS_H int zpool_add(zpool_handle_t *, nvlist_t *); +_LIBZFS_H int zpool_add(zpool_handle_t *, nvlist_t *, boolean_t check_ashift); typedef struct splitflags { /* do not split, but return the config that would be split off */ diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index 2683b774e8c0..f84cb7aade7f 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2020 by Delphix. All rights reserved. + * Copyright (c) 2011, 2024 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2013, 2017 Joyent, Inc. All rights reserved. * Copyright (c) 2014 Integros [integros.com] @@ -1573,6 +1573,7 @@ typedef enum { ZFS_ERR_RESUME_EXISTS, ZFS_ERR_CRYPTO_NOTSUP, ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS, + ZFS_ERR_ASHIFT_MISMATCH, } zfs_errno_t; /* diff --git a/include/sys/spa.h b/include/sys/spa.h index 87ddbd90e170..31eeefb7ff24 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2021 by Delphix. All rights reserved. + * Copyright (c) 2011, 2024 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. @@ -784,7 +784,7 @@ extern int bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx); #define SPA_ASYNC_DETACH_SPARE 0x4000 /* device manipulation */ -extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot); +extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot, boolean_t ashift_check); extern int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, int rebuild); extern int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi index 2b904aecae7a..771c7407e9da 100644 --- a/lib/libzfs/libzfs.abi +++ b/lib/libzfs/libzfs.abi @@ -1112,14 +1112,11 @@ - + - - - - + @@ -2878,6 +2875,9 @@ + + + @@ -2890,6 +2890,10 @@ + + + + @@ -3010,6 +3014,24 @@ + + + + + + + + + + + + + + + + + + @@ -3025,6 +3047,7 @@ + @@ -3260,9 +3283,13 @@ + + + + @@ -3295,6 +3322,7 @@ + @@ -3399,6 +3427,10 @@ + + + + @@ -3482,8 +3514,9 @@ - + + @@ -3840,12 +3873,18 @@ + + + + + + @@ -4278,9 +4317,13 @@ - - - + + + + + + + @@ -4304,9 +4347,13 @@ - - - + + + + + + + @@ -6245,6 +6292,7 @@ + @@ -6696,7 +6744,7 @@ - + @@ -6704,7 +6752,7 @@ - + diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index 6d5ac9d59ed2..e95b361da866 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -22,7 +22,7 @@ /* * Copyright 2015 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2020 by Delphix. All rights reserved. 
+ * Copyright (c) 2011, 2024 by Delphix. All rights reserved. * Copyright 2016 Igor Kozhukhov * Copyright (c) 2018 Datto Inc. * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. @@ -1724,7 +1724,7 @@ zpool_discard_checkpoint(zpool_handle_t *zhp) * necessary verification to ensure that the vdev specification is well-formed. */ int -zpool_add(zpool_handle_t *zhp, nvlist_t *nvroot) +zpool_add(zpool_handle_t *zhp, nvlist_t *nvroot, boolean_t check_ashift) { zfs_cmd_t zc = {"\0"}; int ret; @@ -1756,6 +1756,7 @@ zpool_add(zpool_handle_t *zhp, nvlist_t *nvroot) zcmd_write_conf_nvlist(hdl, &zc, nvroot); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); + zc.zc_flags = check_ashift; if (zfs_ioctl(hdl, ZFS_IOC_VDEV_ADD, &zc) != 0) { switch (errno) { diff --git a/lib/libzfs/libzfs_util.c b/lib/libzfs/libzfs_util.c index 463cec0855c1..60e9262f6b71 100644 --- a/lib/libzfs/libzfs_util.c +++ b/lib/libzfs/libzfs_util.c @@ -22,7 +22,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2020 Joyent, Inc. All rights reserved. - * Copyright (c) 2011, 2020 by Delphix. All rights reserved. + * Copyright (c) 2011, 2024 by Delphix. All rights reserved. * Copyright 2016 Igor Kozhukhov * Copyright (c) 2017 Datto Inc. * Copyright (c) 2020 The FreeBSD Foundation @@ -317,6 +317,9 @@ libzfs_error_description(libzfs_handle_t *hdl) case EZFS_RESUME_EXISTS: return (dgettext(TEXT_DOMAIN, "Resuming recv on existing " "dataset without force")); + case EZFS_ASHIFT_MISMATCH: + return (dgettext(TEXT_DOMAIN, "adding devices with " + "different physical sector sizes is not allowed")); case EZFS_UNKNOWN: return (dgettext(TEXT_DOMAIN, "unknown error")); default: @@ -763,6 +766,9 @@ zpool_standard_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...) case ZFS_ERR_IOC_ARG_BADTYPE: zfs_verror(hdl, EZFS_IOC_NOTSUPPORTED, fmt, ap); break; + case ZFS_ERR_ASHIFT_MISMATCH: + zfs_verror(hdl, EZFS_ASHIFT_MISMATCH, fmt, ap); + break; default: zfs_error_aux(hdl, "%s", strerror(error)); zfs_verror(hdl, EZFS_UNKNOWN, fmt, ap); diff --git a/man/man8/zpool-add.8 b/man/man8/zpool-add.8 index 8ccdcccc7b06..60b35f1a511a 100644 --- a/man/man8/zpool-add.8 +++ b/man/man8/zpool-add.8 @@ -24,8 +24,9 @@ .\" Copyright (c) 2018 George Melikov. All Rights Reserved. .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. +.\" Copyright (c) 2024 by Delphix. All Rights Reserved. .\" -.Dd March 16, 2022 +.Dd March 8, 2024 .Dt ZPOOL-ADD 8 .Os . @@ -36,6 +37,7 @@ .Nm zpool .Cm add .Op Fl fgLnP +.Op Fl -allow-in-use -allow-replication-mismatch -allow-ashift-mismatch .Oo Fl o Ar property Ns = Ns Ar value Oc .Ar pool vdev Ns … . @@ -56,7 +58,8 @@ subcommand. .It Fl f Forces use of .Ar vdev Ns s , -even if they appear in use or specify a conflicting replication level. +even if they appear in use, have conflicting ashift values, or specify +a conflicting replication level. Not all devices can be overridden in this manner. .It Fl g Display @@ -91,6 +94,17 @@ See the manual page for a list of valid properties that can be set. The only property supported at the moment is .Sy ashift . +.It Fl -allow-ashift-mismatch +Disable the ashift validation which allows mismatched ashift values in the +pool. +Adding top-level +.Ar vdev Ns s +with different sector sizes will prohibit future device removal operations, see +.Xr zpool-remove 8 . +.It Fl -allow-in-use +Allow vdevs to be added even if they might be in use in another pool. 
+.It Fl -allow-replication-mismatch +Allow vdevs with conflicting replication levels to be added to the pool. .El . .Sh EXAMPLES diff --git a/module/zfs/spa.c b/module/zfs/spa.c index d7fe96cde6a4..0bd46f57475c 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2020 by Delphix. All rights reserved. + * Copyright (c) 2011, 2024 by Delphix. All rights reserved. * Copyright (c) 2018, Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. @@ -6978,7 +6978,7 @@ spa_draid_feature_incr(void *arg, dmu_tx_t *tx) * Add a device to a storage pool. */ int -spa_vdev_add(spa_t *spa, nvlist_t *nvroot) +spa_vdev_add(spa_t *spa, nvlist_t *nvroot, boolean_t check_ashift) { uint64_t txg, ndraid = 0; int error; @@ -7069,6 +7069,16 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) } } + if (check_ashift && spa->spa_max_ashift == spa->spa_min_ashift) { + for (int c = 0; c < vd->vdev_children; c++) { + tvd = vd->vdev_child[c]; + if (tvd->vdev_ashift != spa->spa_max_ashift) { + return (spa_vdev_exit(spa, vd, txg, + ZFS_ERR_ASHIFT_MISMATCH)); + } + } + } + for (int c = 0; c < vd->vdev_children; c++) { tvd = vd->vdev_child[c]; vdev_remove_child(vd, tvd); diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index 2738385e260b..ac8329185cca 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -27,7 +27,7 @@ * Copyright (c) 2014, 2016 Joyent, Inc. All rights reserved. * Copyright 2016 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014, Joyent, Inc. All rights reserved. - * Copyright (c) 2011, 2020 by Delphix. All rights reserved. + * Copyright (c) 2011, 2024 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. 
* Copyright (c) 2014 Integros [integros.com] @@ -1887,7 +1887,7 @@ zfs_ioc_vdev_add(zfs_cmd_t *zc) error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &config); if (error == 0) { - error = spa_vdev_add(spa, config); + error = spa_vdev_add(spa, config, zc->zc_flags); nvlist_free(config); } spa_close(spa, FTAG); diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index c4afde554da5..04a34a997570 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -372,7 +372,8 @@ tags = ['functional', 'cli_root', 'zpool'] tests = ['zpool_add_001_pos', 'zpool_add_002_pos', 'zpool_add_003_pos', 'zpool_add_004_pos', 'zpool_add_006_pos', 'zpool_add_007_neg', 'zpool_add_008_neg', 'zpool_add_009_neg', 'zpool_add_010_pos', - 'add-o_ashift', 'add_prop_ashift', 'zpool_add_dryrun_output'] + 'add-o_ashift', 'add_prop_ashift', 'zpool_add_dryrun_output', + 'zpool_add--allow-ashift-mismatch'] tags = ['functional', 'cli_root', 'zpool_add'] [tests/functional/cli_root/zpool_attach] diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 2fc36c4d7380..8befed077234 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -989,6 +989,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zpool_add/add_prop_ashift.ksh \ functional/cli_root/zpool_add/cleanup.ksh \ functional/cli_root/zpool_add/setup.ksh \ + functional/cli_root/zpool_add/zpool_add--allow-ashift-mismatch.ksh \ functional/cli_root/zpool_add/zpool_add_001_pos.ksh \ functional/cli_root/zpool_add/zpool_add_002_pos.ksh \ functional/cli_root/zpool_add/zpool_add_003_pos.ksh \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/add-o_ashift.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_add/add-o_ashift.ksh index 7ecaf849e44b..51871934dd22 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_add/add-o_ashift.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_add/add-o_ashift.ksh @@ -22,7 +22,7 @@ # # Copyright 2017, loli10K. All rights reserved. -# Copyright (c) 2020 by Delphix. All rights reserved. +# Copyright (c) 2020, 2024 by Delphix. All rights reserved. # . $STF_SUITE/include/libtest.shlib @@ -60,12 +60,23 @@ log_must mkfile $SIZE $disk2 logical_ashift=$(get_tunable VDEV_FILE_LOGICAL_ASHIFT) orig_ashift=$(get_tunable VDEV_FILE_PHYSICAL_ASHIFT) max_auto_ashift=$(get_tunable VDEV_MAX_AUTO_ASHIFT) +opt="" typeset ashifts=("9" "10" "11" "12" "13" "14" "15" "16") for ashift in ${ashifts[@]} do + # + # Need to add the --allow-ashift-mismatch option to disable the + # ashift mismatch checks in zpool add. + # + if [[ $ashift -eq $orig_ashift ]]; then + opt="" + else + opt="--allow-ashift-mismatch" + fi + log_must zpool create $TESTPOOL $disk1 - log_must zpool add -o ashift=$ashift $TESTPOOL $disk2 + log_must zpool add $opt -o ashift=$ashift $TESTPOOL $disk2 log_must verify_ashift $disk2 $ashift # clean things for the next run @@ -78,7 +89,7 @@ do # log_must zpool create $TESTPOOL $disk1 log_must set_tunable32 VDEV_FILE_PHYSICAL_ASHIFT $ashift - log_must zpool add $TESTPOOL $disk2 + log_must zpool add $opt $TESTPOOL $disk2 exp=$(( (ashift <= max_auto_ashift) ? 
ashift : logical_ashift )) log_must verify_ashift $disk2 $exp diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/add_prop_ashift.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_add/add_prop_ashift.ksh index 228f62232aae..6a3283d0618f 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_add/add_prop_ashift.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_add/add_prop_ashift.ksh @@ -22,7 +22,7 @@ # # Copyright 2017, loli10K. All rights reserved. -# Copyright (c) 2020 by Delphix. All rights reserved. +# Copyright (c) 2020, 2024 by Delphix. All rights reserved. # . $STF_SUITE/include/libtest.shlib @@ -68,8 +68,13 @@ log_must set_tunable32 VDEV_FILE_PHYSICAL_ASHIFT 16 typeset ashifts=("9" "10" "11" "12" "13" "14" "15" "16") for ashift in ${ashifts[@]} do + if [ $ashift -eq $orig_ashift ];then + opt="" + else + opt="--allow-ashift-mismatch" + fi log_must zpool create -o ashift=$ashift $TESTPOOL $disk1 - log_must zpool add $TESTPOOL $disk2 + log_must zpool add $opt $TESTPOOL $disk2 log_must verify_ashift $disk2 $ashift # clean things for the next run @@ -82,8 +87,13 @@ for ashift in ${ashifts[@]} do for cmdval in ${ashifts[@]} do + if [ $ashift -eq $cmdval ];then + opt="" + else + opt="--allow-ashift-mismatch" + fi log_must zpool create -o ashift=$ashift $TESTPOOL $disk1 - log_must zpool add -o ashift=$cmdval $TESTPOOL $disk2 + log_must zpool add $opt -o ashift=$cmdval $TESTPOOL $disk2 log_must verify_ashift $disk2 $cmdval # clean things for the next run diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add--allow-ashift-mismatch.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add--allow-ashift-mismatch.ksh new file mode 100755 index 000000000000..e69de29bb2d1 diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_002_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_002_pos.ksh index c5c06f76340b..afee34a33469 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_002_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_002_pos.ksh @@ -65,4 +65,15 @@ log_mustnot vdevs_in_pool $TESTPOOL $DISK2 log_must zpool add -f $TESTPOOL $DISK2 log_must vdevs_in_pool $TESTPOOL $DISK2 +log_must zpool destroy $TESTPOOL + +create_pool $TESTPOOL mirror $DISK0 $DISK1 +log_must poolexists $TESTPOOL + +log_mustnot zpool add $TESTPOOL $DISK2 +log_mustnot vdevs_in_pool $TESTPOOL $DISK2 + +log_must zpool add --allow-replication-mismatch $TESTPOOL $DISK2 +log_must vdevs_in_pool $TESTPOOL $DISK2 + log_pass "'zpool add -f ...' executes successfully." 
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_004_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_004_pos.ksh index 646edc1a4557..cecda56ab125 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_004_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_004_pos.ksh @@ -70,7 +70,7 @@ if is_freebsd; then recursive=$(get_tunable VOL_RECURSIVE) log_must set_tunable64 VOL_RECURSIVE 1 fi -log_must zpool add $TESTPOOL $ZVOL_DEVDIR/$TESTPOOL1/$TESTVOL +log_must zpool add --allow-ashift-mismatch $TESTPOOL $ZVOL_DEVDIR/$TESTPOOL1/$TESTVOL log_must vdevs_in_pool "$TESTPOOL" "$ZVOL_DEVDIR/$TESTPOOL1/$TESTVOL" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_005_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_005_pos.ksh index 4990ef9d29b0..0e9d9f5f030f 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_005_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_005_pos.ksh @@ -75,7 +75,9 @@ log_must poolexists $TESTPOOL1 unset NOINUSE_CHECK log_mustnot zpool add -f $TESTPOOL $DISK1 +log_mustnot zpool add --allow-in-use $TESTPOOL $DISK1 log_mustnot zpool add -f $TESTPOOL $mnttab_dev +log_mustnot zpool add --allow-in-use $TESTPOOL $mnttab_dev if is_linux; then log_mustnot zpool add $TESTPOOL $vfstab_dev else diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_009_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_009_neg.ksh index d7f3a900e8fd..a13a27160e76 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_009_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_009_neg.ksh @@ -64,7 +64,9 @@ log_mustnot zpool add -f $TESTPOOL $DISK0 for type in "" "mirror" "raidz" "draid" "spare" "log" "dedup" "special" "cache" do log_mustnot zpool add -f $TESTPOOL $type $DISK0 $DISK1 + log_mustnot zpool add --allow-in-use $TESTPOOL $type $DISK0 $DISK1 log_mustnot zpool add -f $TESTPOOL $type $DISK1 $DISK1 + log_mustnot zpool add --allow-in-use $TESTPOOL $type $DISK1 $DISK1 done log_pass "'zpool add' get fail as expected if vdevs are the same or vdev is " \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_010_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_010_pos.ksh index b8b25db1b9f9..22860e9caf1d 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_010_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_010_pos.ksh @@ -138,7 +138,7 @@ function zpool_create_forced_add while ((j < ${#add_args[@]})); do log_must zpool create $TESTPOOL1 ${create_args[$i]} log_mustnot zpool add $TESTPOOL1 ${add_args[$j]} - log_must zpool add -f $TESTPOOL1 ${add_args[$j]} + log_must zpool add --allow-replication-mismatch $TESTPOOL1 ${add_args[$j]} log_must zpool destroy -f $TESTPOOL1 ((j += 1)) -- cgit v1.2.3 From ea3f7c12a9c566c362d0039fb8ac5baa0baa8cbd Mon Sep 17 00:00:00 2001 From: Don Brady Date: Tue, 5 Dec 2023 15:27:56 -0700 Subject: Extend import_progress kstat with a notes field Detail the import progress of log spacemaps as they can take a very long time. Also grab the spa_note() messages to, as they provide insight into what is happening Sponsored-By: OpenDrives Inc. Sponsored-By: Klara Inc. 
Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Don Brady Co-authored-by: Allan Jude Closes #15539 --- include/sys/spa.h | 4 + module/zfs/spa.c | 41 ++++++- module/zfs/spa_log_spacemap.c | 12 +- module/zfs/spa_misc.c | 74 +++++++++++- tests/runfiles/common.run | 3 +- .../cli_root/zpool_import/zpool_import_status.ksh | 132 +++++++++++++++++++++ 6 files changed, 256 insertions(+), 10 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_status.ksh diff --git a/include/sys/spa.h b/include/sys/spa.h index 31eeefb7ff24..3112ba09ceb4 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -966,6 +966,10 @@ extern int spa_import_progress_set_max_txg(uint64_t pool_guid, uint64_t max_txg); extern int spa_import_progress_set_state(uint64_t pool_guid, spa_load_state_t spa_load_state); +extern void spa_import_progress_set_notes(spa_t *spa, + const char *fmt, ...) __printflike(2, 3); +extern void spa_import_progress_set_notes_nolog(spa_t *spa, + const char *fmt, ...) __printflike(2, 3); /* Pool configuration locks */ extern int spa_config_tryenter(spa_t *spa, int locks, const void *tag, diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 0bd46f57475c..fba7846955fd 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -3324,6 +3324,7 @@ spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type) spa->spa_load_state = state; (void) spa_import_progress_set_state(spa_guid(spa), spa_load_state(spa)); + spa_import_progress_set_notes(spa, "spa_load()"); gethrestime(&spa->spa_loaded_ts); error = spa_load_impl(spa, type, &ereport); @@ -3552,7 +3553,7 @@ spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config) uint64_t mmp_config = ub->ub_mmp_config; uint16_t mmp_seq = MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0; uint64_t import_delay; - hrtime_t import_expire; + hrtime_t import_expire, now; nvlist_t *mmp_label = NULL; vdev_t *rvd = spa->spa_root_vdev; kcondvar_t cv; @@ -3590,7 +3591,17 @@ spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config) import_expire = gethrtime() + import_delay; - while (gethrtime() < import_expire) { + spa_import_progress_set_notes(spa, "Checking MMP activity, waiting " + "%llu ms", (u_longlong_t)NSEC2MSEC(import_delay)); + + int interations = 0; + while ((now = gethrtime()) < import_expire) { + if (interations++ % 30 == 0) { + spa_import_progress_set_notes(spa, "Checking MMP " + "activity, %llu ms remaining", + (u_longlong_t)NSEC2MSEC(import_expire - now)); + } + (void) spa_import_progress_set_mmp_check(spa_guid(spa), NSEC2SEC(import_expire - gethrtime())); @@ -5204,6 +5215,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport) /* * Retrieve the checkpoint txg if the pool has a checkpoint. */ + spa_import_progress_set_notes(spa, "Loading checkpoint txg"); error = spa_ld_read_checkpoint_txg(spa); if (error != 0) return (error); @@ -5216,6 +5228,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport) * initiated. Otherwise we could be reading from indirect vdevs before * we have loaded their mappings. */ + spa_import_progress_set_notes(spa, "Loading indirect vdev metadata"); error = spa_ld_open_indirect_vdev_metadata(spa); if (error != 0) return (error); @@ -5224,6 +5237,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport) * Retrieve the full list of active features from the MOS and check if * they are all supported. 
*/ + spa_import_progress_set_notes(spa, "Checking feature flags"); error = spa_ld_check_features(spa, &missing_feat_write); if (error != 0) return (error); @@ -5232,6 +5246,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport) * Load several special directories from the MOS needed by the dsl_pool * layer. */ + spa_import_progress_set_notes(spa, "Loading special MOS directories"); error = spa_ld_load_special_directories(spa); if (error != 0) return (error); @@ -5239,6 +5254,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport) /* * Retrieve pool properties from the MOS. */ + spa_import_progress_set_notes(spa, "Loading properties"); error = spa_ld_get_props(spa); if (error != 0) return (error); @@ -5247,6 +5263,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport) * Retrieve the list of auxiliary devices - cache devices and spares - * and open them. */ + spa_import_progress_set_notes(spa, "Loading AUX vdevs"); error = spa_ld_open_aux_vdevs(spa, type); if (error != 0) return (error); @@ -5255,14 +5272,17 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport) * Load the metadata for all vdevs. Also check if unopenable devices * should be autoreplaced. */ + spa_import_progress_set_notes(spa, "Loading vdev metadata"); error = spa_ld_load_vdev_metadata(spa); if (error != 0) return (error); + spa_import_progress_set_notes(spa, "Loading dedup tables"); error = spa_ld_load_dedup_tables(spa); if (error != 0) return (error); + spa_import_progress_set_notes(spa, "Loading BRT"); error = spa_ld_load_brt(spa); if (error != 0) return (error); @@ -5271,6 +5291,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport) * Verify the logs now to make sure we don't have any unexpected errors * when we claim log blocks later. */ + spa_import_progress_set_notes(spa, "Verifying Log Devices"); error = spa_ld_verify_logs(spa, type, ereport); if (error != 0) return (error); @@ -5292,6 +5313,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport) * state. When performing an extreme rewind, we verify the whole pool, * which can take a very long time. */ + spa_import_progress_set_notes(spa, "Verifying pool data"); error = spa_ld_verify_pool_data(spa); if (error != 0) return (error); @@ -5301,6 +5323,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport) * we write anything to the pool because we'd need to update the space * accounting using the deflated sizes. */ + spa_import_progress_set_notes(spa, "Calculating deflated space"); spa_update_dspace(spa); /* @@ -5308,6 +5331,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport) * pool. If we are importing the pool in read-write mode, a few * additional steps must be performed to finish the import. */ + spa_import_progress_set_notes(spa, "Starting import"); if (spa_writeable(spa) && (spa->spa_load_state == SPA_LOAD_RECOVER || spa->spa_load_max_txg == UINT64_MAX)) { uint64_t config_cache_txg = spa->spa_config_txg; @@ -5324,6 +5348,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport) (u_longlong_t)spa->spa_uberblock.ub_checkpoint_txg); } + spa_import_progress_set_notes(spa, "Claiming ZIL blocks"); /* * Traverse the ZIL and claim all blocks. */ @@ -5343,6 +5368,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport) * will have been set for us by ZIL traversal operations * performed above. 
*/ + spa_import_progress_set_notes(spa, "Syncing ZIL claims"); txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg); /* @@ -5350,6 +5376,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport) * next sync, we would update the config stored in vdev labels * and the cachefile (by default /etc/zfs/zpool.cache). */ + spa_import_progress_set_notes(spa, "Updating configs"); spa_ld_check_for_config_update(spa, config_cache_txg, update_config_cache); @@ -5358,6 +5385,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport) * Then check all DTLs to see if anything needs resilvering. * The resilver will be deferred if a rebuild was started. */ + spa_import_progress_set_notes(spa, "Starting resilvers"); if (vdev_rebuild_active(spa->spa_root_vdev)) { vdev_rebuild_restart(spa); } else if (!dsl_scan_resilvering(spa->spa_dsl_pool) && @@ -5371,6 +5399,8 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport) */ spa_history_log_version(spa, "open", NULL); + spa_import_progress_set_notes(spa, + "Restarting device removals"); spa_restart_removal(spa); spa_spawn_aux_threads(spa); @@ -5383,19 +5413,26 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport) * auxiliary threads above (from which the livelist * deletion zthr is part of). */ + spa_import_progress_set_notes(spa, + "Cleaning up inconsistent objsets"); (void) dmu_objset_find(spa_name(spa), dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN); /* * Clean up any stale temporary dataset userrefs. */ + spa_import_progress_set_notes(spa, + "Cleaning up temporary userrefs"); dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + spa_import_progress_set_notes(spa, "Restarting initialize"); vdev_initialize_restart(spa->spa_root_vdev); + spa_import_progress_set_notes(spa, "Restarting TRIM"); vdev_trim_restart(spa->spa_root_vdev); vdev_autotrim_restart(spa); spa_config_exit(spa, SCL_CONFIG, FTAG); + spa_import_progress_set_notes(spa, "Finished importing"); } spa_import_progress_remove(spa_guid(spa)); diff --git a/module/zfs/spa_log_spacemap.c b/module/zfs/spa_log_spacemap.c index cf05158b63f8..873089a53e34 100644 --- a/module/zfs/spa_log_spacemap.c +++ b/module/zfs/spa_log_spacemap.c @@ -1153,6 +1153,7 @@ spa_ld_log_sm_data(spa_t *spa) uint_t pn = 0; uint64_t ps = 0; + uint64_t nsm = 0; psls = sls = avl_first(&spa->spa_sm_logs_by_txg); while (sls != NULL) { /* Prefetch log spacemaps up to 16 TXGs or MBs ahead. 
*/ @@ -1185,6 +1186,10 @@ spa_ld_log_sm_data(spa_t *spa) summary_add_data(spa, sls->sls_txg, sls->sls_mscount, 0, sls->sls_nblocks); + spa_import_progress_set_notes_nolog(spa, + "Read %llu of %lu log space maps", (u_longlong_t)nsm, + avl_numnodes(&spa->spa_sm_logs_by_txg)); + struct spa_ld_log_sm_arg vla = { .slls_spa = spa, .slls_txg = sls->sls_txg @@ -1200,6 +1205,7 @@ spa_ld_log_sm_data(spa_t *spa) pn--; ps -= space_map_length(sls->sls_sm); + nsm++; space_map_close(sls->sls_sm); sls->sls_sm = NULL; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls); @@ -1210,11 +1216,11 @@ spa_ld_log_sm_data(spa_t *spa) hrtime_t read_logs_endtime = gethrtime(); spa_load_note(spa, - "read %llu log space maps (%llu total blocks - blksz = %llu bytes) " - "in %lld ms", (u_longlong_t)avl_numnodes(&spa->spa_sm_logs_by_txg), + "Read %lu log space maps (%llu total blocks - blksz = %llu bytes) " + "in %lld ms", avl_numnodes(&spa->spa_sm_logs_by_txg), (u_longlong_t)spa_log_sm_nblocks(spa), (u_longlong_t)zfs_log_sm_blksz, - (longlong_t)((read_logs_endtime - read_logs_starttime) / 1000000)); + (longlong_t)NSEC2MSEC(read_logs_endtime - read_logs_starttime)); out: if (error != 0) { diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 24f038ad7f4b..649fe2f634b5 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -417,6 +417,8 @@ spa_load_note(spa_t *spa, const char *fmt, ...) zfs_dbgmsg("spa_load(%s, config %s): %s", spa->spa_name, spa->spa_trust_config ? "trusted" : "untrusted", buf); + + spa_import_progress_set_notes_nolog(spa, "%s", buf); } /* @@ -2172,6 +2174,7 @@ typedef struct spa_import_progress { uint64_t pool_guid; /* unique id for updates */ char *pool_name; spa_load_state_t spa_load_state; + char *spa_load_notes; uint64_t mmp_sec_remaining; /* MMP activity check */ uint64_t spa_load_max_txg; /* rewind txg */ procfs_list_node_t smh_node; @@ -2182,9 +2185,9 @@ spa_history_list_t *spa_import_progress_list = NULL; static int spa_import_progress_show_header(struct seq_file *f) { - seq_printf(f, "%-20s %-14s %-14s %-12s %s\n", "pool_guid", + seq_printf(f, "%-20s %-14s %-14s %-12s %-16s %s\n", "pool_guid", "load_state", "multihost_secs", "max_txg", - "pool_name"); + "pool_name", "notes"); return (0); } @@ -2193,11 +2196,12 @@ spa_import_progress_show(struct seq_file *f, void *data) { spa_import_progress_t *sip = (spa_import_progress_t *)data; - seq_printf(f, "%-20llu %-14llu %-14llu %-12llu %s\n", + seq_printf(f, "%-20llu %-14llu %-14llu %-12llu %-16s %s\n", (u_longlong_t)sip->pool_guid, (u_longlong_t)sip->spa_load_state, (u_longlong_t)sip->mmp_sec_remaining, (u_longlong_t)sip->spa_load_max_txg, - (sip->pool_name ? sip->pool_name : "-")); + (sip->pool_name ? sip->pool_name : "-"), + (sip->spa_load_notes ? 
sip->spa_load_notes : "-")); return (0); } @@ -2211,6 +2215,8 @@ spa_import_progress_truncate(spa_history_list_t *shl, unsigned int size) sip = list_remove_head(&shl->procfs_list.pl_list); if (sip->pool_name) spa_strfree(sip->pool_name); + if (sip->spa_load_notes) + kmem_strfree(sip->spa_load_notes); kmem_free(sip, sizeof (spa_import_progress_t)); shl->size--; } @@ -2266,6 +2272,10 @@ spa_import_progress_set_state(uint64_t pool_guid, sip = list_prev(&shl->procfs_list.pl_list, sip)) { if (sip->pool_guid == pool_guid) { sip->spa_load_state = load_state; + if (sip->spa_load_notes != NULL) { + kmem_strfree(sip->spa_load_notes); + sip->spa_load_notes = NULL; + } error = 0; break; } @@ -2275,6 +2285,59 @@ spa_import_progress_set_state(uint64_t pool_guid, return (error); } +static void +spa_import_progress_set_notes_impl(spa_t *spa, boolean_t log_dbgmsg, + const char *fmt, va_list adx) +{ + spa_history_list_t *shl = spa_import_progress_list; + spa_import_progress_t *sip; + uint64_t pool_guid = spa_guid(spa); + + if (shl->size == 0) + return; + + char *notes = kmem_vasprintf(fmt, adx); + + mutex_enter(&shl->procfs_list.pl_lock); + for (sip = list_tail(&shl->procfs_list.pl_list); sip != NULL; + sip = list_prev(&shl->procfs_list.pl_list, sip)) { + if (sip->pool_guid == pool_guid) { + if (sip->spa_load_notes != NULL) { + kmem_strfree(sip->spa_load_notes); + sip->spa_load_notes = NULL; + } + sip->spa_load_notes = notes; + if (log_dbgmsg) + zfs_dbgmsg("'%s' %s", sip->pool_name, notes); + notes = NULL; + break; + } + } + mutex_exit(&shl->procfs_list.pl_lock); + if (notes != NULL) + kmem_strfree(notes); +} + +void +spa_import_progress_set_notes(spa_t *spa, const char *fmt, ...) +{ + va_list adx; + + va_start(adx, fmt); + spa_import_progress_set_notes_impl(spa, B_TRUE, fmt, adx); + va_end(adx); +} + +void +spa_import_progress_set_notes_nolog(spa_t *spa, const char *fmt, ...) 
+{ + va_list adx; + + va_start(adx, fmt); + spa_import_progress_set_notes_impl(spa, B_FALSE, fmt, adx); + va_end(adx); +} + int spa_import_progress_set_max_txg(uint64_t pool_guid, uint64_t load_max_txg) { @@ -2343,6 +2406,7 @@ spa_import_progress_add(spa_t *spa) poolname = spa_name(spa); sip->pool_name = spa_strdup(poolname); sip->spa_load_state = spa_load_state(spa); + sip->spa_load_notes = NULL; mutex_enter(&shl->procfs_list.pl_lock); procfs_list_add(&shl->procfs_list, sip); @@ -2362,6 +2426,8 @@ spa_import_progress_remove(uint64_t pool_guid) if (sip->pool_guid == pool_guid) { if (sip->pool_name) spa_strfree(sip->pool_name); + if (sip->spa_load_notes) + spa_strfree(sip->spa_load_notes); list_remove(&shl->procfs_list.pl_list, sip); shl->size--; kmem_free(sip, sizeof (spa_import_progress_t)); diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 04a34a997570..3e1a3aeb6cbe 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -459,7 +459,8 @@ tests = ['zpool_import_001_pos', 'zpool_import_002_pos', 'import_devices_missing', 'import_log_missing', 'import_paths_changed', 'import_rewind_config_changed', - 'import_rewind_device_replaced'] + 'import_rewind_device_replaced', + 'zpool_import_status'] tags = ['functional', 'cli_root', 'zpool_import'] timeout = 1200 diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_status.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_status.ksh new file mode 100755 index 000000000000..c96961bf6419 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_status.ksh @@ -0,0 +1,132 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +# +# Copyright (c) 2023 Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.cfg + +# +# DESCRIPTION: +# During a pool import, the 'import_progress' kstat contains details +# on the import progress. +# +# STRATEGY: +# 1. Create test pool with several devices +# 2. Generate some ZIL records and spacemap logs +# 3. Export the pool +# 4. Import the pool in the background and monitor the kstat content +# 5. 
Check the zfs debug messages for import progress +# + +verify_runnable "global" + +function cleanup +{ + log_must set_tunable64 KEEP_LOG_SPACEMAPS_AT_EXPORT 0 + log_must set_tunable64 METASLAB_DEBUG_LOAD 0 + + destroy_pool $TESTPOOL1 +} + +log_assert "During a pool import, the 'import_progress' kstat contains " \ + "notes on the progress" + +log_onexit cleanup + +log_must zpool create $TESTPOOL1 $VDEV0 $VDEV1 $VDEV2 +typeset guid=$(zpool get -H -o value guid $TESTPOOL1) + +log_must zfs create -o recordsize=8k $TESTPOOL1/fs +# +# This dd command works around an issue where ZIL records aren't created +# after freezing the pool unless a ZIL header already exists. Create a file +# synchronously to force ZFS to write one out. +# +log_must dd if=/dev/zero of=/$TESTPOOL1/fs/sync conv=fsync bs=1 count=1 + +# +# Overwrite some blocks to populate spacemap logs +# +log_must dd if=/dev/urandom of=/$TESTPOOL1/fs/00 bs=1M count=200 +sync_all_pools +log_must dd if=/dev/urandom of=/$TESTPOOL1/fs/00 bs=1M count=200 +sync_all_pools + +# +# Freeze the pool to retain intent log records +# +log_must zpool freeze $TESTPOOL1 + +# fill_fs [destdir] [dirnum] [filenum] [bytes] [num_writes] [data] +log_must fill_fs /$TESTPOOL1/fs 1 2000 100 1024 R + +log_must zpool list -v $TESTPOOL1 + +# +# Unmount filesystem and export the pool +# +# At this stage the zfs intent log contains +# a set of records to replay. +# +log_must zfs unmount /$TESTPOOL1/fs + +log_must set_tunable64 KEEP_LOG_SPACEMAPS_AT_EXPORT 1 +log_must zpool export $TESTPOOL1 + +log_must set_tunable64 METASLAB_DEBUG_LOAD 1 +log_note "Starting zpool import in background at" $(date +'%H:%M:%S') +zpool import -d $DEVICE_DIR -f $guid & +pid=$! + +# +# capture progress until import is finished +# +log_note waiting for pid $pid to exit +kstat import_progress +while [[ -d /proc/"$pid" ]]; do + line=$(kstat import_progress | grep -v pool_guid) + if [[ -n $line ]]; then + echo $line + fi + if [[ -f /$TESTPOOL1/fs/00 ]]; then + break; + fi + sleep 0.0001 +done +log_note "zpool import completed at" $(date +'%H:%M:%S') + +entries=$(kstat dbgmsg | grep "spa_import_progress_set_notes_impl(): 'testpool1'" | wc -l) +log_note "found $entries progress notes in dbgmsg" +log_must test $entries -gt 20 + +log_must zpool status $TESTPOOL1 + +log_pass "During a pool import, the 'import_progress' kstat contains " \ + "notes on the progress" -- cgit v1.2.3 From 706307445e66b63f6f4fd828dc331bb0a010c59f Mon Sep 17 00:00:00 2001 From: Don Brady Date: Mon, 29 Apr 2024 15:35:53 -0600 Subject: vdev probe to slow disk can stall mmp write checker Simplify vdev probes in the zio_vdev_io_done context to avoid holding the spa config lock for a long duration. Also allow zpool clear if no evidence of another host is using the pool. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. 
Reviewed-by: Olaf Faaland Reviewed-by: Brian Behlendorf Signed-off-by: Don Brady Closes #15839 --- cmd/zpool/zpool_main.c | 2 +- include/sys/spa.h | 4 +- include/sys/uberblock_impl.h | 16 ++-- include/sys/vdev_impl.h | 2 +- man/man8/zpool-clear.8 | 7 +- module/zfs/mmp.c | 5 +- module/zfs/spa.c | 102 +++++++++++++++++---- module/zfs/txg.c | 9 ++ module/zfs/vdev.c | 22 +++-- module/zfs/vdev_label.c | 4 +- module/zfs/zfs_ioctl.c | 9 +- module/zfs/zio.c | 6 +- module/zfs/zio_inject.c | 6 +- tests/runfiles/linux.run | 2 +- tests/zfs-tests/tests/Makefile.am | 1 + .../tests/functional/mmp/mmp_write_slow_disk.ksh | 97 ++++++++++++++++++++ 16 files changed, 242 insertions(+), 52 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/mmp/mmp_write_slow_disk.ksh diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 20b1c85065b4..ed0b8d7a12d7 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -8910,7 +8910,7 @@ status_callback(zpool_handle_t *zhp, void *data) printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Make sure the pool's devices" " are connected, then reboot your system and\n\timport the " - "pool.\n")); + "pool or run 'zpool clear' to resume the pool.\n")); break; case ZPOOL_STATUS_IO_FAILURE_WAIT: diff --git a/include/sys/spa.h b/include/sys/spa.h index 3112ba09ceb4..6611141b9569 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -769,7 +769,7 @@ extern int bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx); #define SPA_ASYNC_CONFIG_UPDATE 0x01 #define SPA_ASYNC_REMOVE 0x02 -#define SPA_ASYNC_PROBE 0x04 +#define SPA_ASYNC_FAULT_VDEV 0x04 #define SPA_ASYNC_RESILVER_DONE 0x08 #define SPA_ASYNC_RESILVER 0x10 #define SPA_ASYNC_AUTOEXPAND 0x20 @@ -1113,6 +1113,8 @@ extern uint32_t spa_get_hostid(spa_t *spa); extern void spa_activate_allocation_classes(spa_t *, dmu_tx_t *); extern boolean_t spa_livelist_delete_check(spa_t *spa); +extern boolean_t spa_mmp_remote_host_activity(spa_t *spa); + extern spa_mode_t spa_mode(spa_t *spa); extern uint64_t zfs_strtonum(const char *str, char **nptr); diff --git a/include/sys/uberblock_impl.h b/include/sys/uberblock_impl.h index 03bcfa8f4dd1..13fce9c29e2d 100644 --- a/include/sys/uberblock_impl.h +++ b/include/sys/uberblock_impl.h @@ -50,20 +50,20 @@ extern "C" { #define MMP_SEQ_VALID_BIT 0x02 #define MMP_FAIL_INT_VALID_BIT 0x04 -#define MMP_VALID(ubp) (ubp->ub_magic == UBERBLOCK_MAGIC && \ - ubp->ub_mmp_magic == MMP_MAGIC) -#define MMP_INTERVAL_VALID(ubp) (MMP_VALID(ubp) && (ubp->ub_mmp_config & \ +#define MMP_VALID(ubp) ((ubp)->ub_magic == UBERBLOCK_MAGIC && \ + (ubp)->ub_mmp_magic == MMP_MAGIC) +#define MMP_INTERVAL_VALID(ubp) (MMP_VALID(ubp) && ((ubp)->ub_mmp_config & \ MMP_INTERVAL_VALID_BIT)) -#define MMP_SEQ_VALID(ubp) (MMP_VALID(ubp) && (ubp->ub_mmp_config & \ +#define MMP_SEQ_VALID(ubp) (MMP_VALID(ubp) && ((ubp)->ub_mmp_config & \ MMP_SEQ_VALID_BIT)) -#define MMP_FAIL_INT_VALID(ubp) (MMP_VALID(ubp) && (ubp->ub_mmp_config & \ +#define MMP_FAIL_INT_VALID(ubp) (MMP_VALID(ubp) && ((ubp)->ub_mmp_config & \ MMP_FAIL_INT_VALID_BIT)) -#define MMP_INTERVAL(ubp) ((ubp->ub_mmp_config & 0x00000000FFFFFF00) \ +#define MMP_INTERVAL(ubp) (((ubp)->ub_mmp_config & 0x00000000FFFFFF00) \ >> 8) -#define MMP_SEQ(ubp) ((ubp->ub_mmp_config & 0x0000FFFF00000000) \ +#define MMP_SEQ(ubp) (((ubp)->ub_mmp_config & 0x0000FFFF00000000) \ >> 32) -#define MMP_FAIL_INT(ubp) ((ubp->ub_mmp_config & 0xFFFF000000000000) \ +#define MMP_FAIL_INT(ubp) (((ubp)->ub_mmp_config & 0xFFFF000000000000) \ 
>> 48) #define MMP_INTERVAL_SET(write) \ diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index 02948894c365..8c6ab316fa18 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -274,7 +274,7 @@ struct vdev { txg_list_t vdev_dtl_list; /* per-txg dirty DTL lists */ txg_node_t vdev_txg_node; /* per-txg dirty vdev linkage */ boolean_t vdev_remove_wanted; /* async remove wanted? */ - boolean_t vdev_probe_wanted; /* async probe wanted? */ + boolean_t vdev_fault_wanted; /* async faulted wanted? */ list_node_t vdev_config_dirty_node; /* config dirty list */ list_node_t vdev_state_dirty_node; /* state dirty list */ uint64_t vdev_deflate_ratio; /* deflation ratio (x512) */ diff --git a/man/man8/zpool-clear.8 b/man/man8/zpool-clear.8 index c61ecae483ac..3e448be87fc2 100644 --- a/man/man8/zpool-clear.8 +++ b/man/man8/zpool-clear.8 @@ -50,9 +50,10 @@ If the pool was suspended it will be brought back online provided the devices can be accessed. Pools with .Sy multihost -enabled which have been suspended cannot be resumed. -While the pool was suspended, it may have been imported on -another host, and resuming I/O could result in pool damage. +enabled which have been suspended cannot be resumed when there is evidence +that the pool was imported by another host. +The same checks performed during an import will be applied before the clear +proceeds. .Bl -tag -width Ds .It Fl -power Power on the devices's slot in the storage enclosure and wait for the device diff --git a/module/zfs/mmp.c b/module/zfs/mmp.c index 25eea0752941..8144d8965085 100644 --- a/module/zfs/mmp.c +++ b/module/zfs/mmp.c @@ -663,12 +663,13 @@ mmp_thread(void *arg) (gethrtime() - mmp->mmp_last_write) > mmp_fail_ns) { zfs_dbgmsg("MMP suspending pool '%s': gethrtime %llu " "mmp_last_write %llu mmp_interval %llu " - "mmp_fail_intervals %llu mmp_fail_ns %llu", + "mmp_fail_intervals %llu mmp_fail_ns %llu txg %llu", spa_name(spa), (u_longlong_t)gethrtime(), (u_longlong_t)mmp->mmp_last_write, (u_longlong_t)mmp_interval, (u_longlong_t)mmp_fail_intervals, - (u_longlong_t)mmp_fail_ns); + (u_longlong_t)mmp_fail_ns, + (u_longlong_t)spa->spa_uberblock.ub_txg); cmn_err(CE_WARN, "MMP writes to pool '%s' have not " "succeeded in over %llu ms; suspending pool. " "Hrtime %llu", diff --git a/module/zfs/spa.c b/module/zfs/spa.c index fba7846955fd..251dd8a4d1c7 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -3542,11 +3542,16 @@ spa_activity_check_duration(spa_t *spa, uberblock_t *ub) } /* - * Perform the import activity check. If the user canceled the import or - * we detected activity then fail. + * Remote host activity check. 
+ * + * error results: + * 0 - no activity detected + * EREMOTEIO - remote activity detected + * EINTR - user canceled the operation */ static int -spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config) +spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config, + boolean_t importing) { uint64_t txg = ub->ub_txg; uint64_t timestamp = ub->ub_timestamp; @@ -3591,19 +3596,23 @@ spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config) import_expire = gethrtime() + import_delay; - spa_import_progress_set_notes(spa, "Checking MMP activity, waiting " - "%llu ms", (u_longlong_t)NSEC2MSEC(import_delay)); + if (importing) { + spa_import_progress_set_notes(spa, "Checking MMP activity, " + "waiting %llu ms", (u_longlong_t)NSEC2MSEC(import_delay)); + } - int interations = 0; + int iterations = 0; while ((now = gethrtime()) < import_expire) { - if (interations++ % 30 == 0) { + if (importing && iterations++ % 30 == 0) { spa_import_progress_set_notes(spa, "Checking MMP " "activity, %llu ms remaining", (u_longlong_t)NSEC2MSEC(import_expire - now)); } - (void) spa_import_progress_set_mmp_check(spa_guid(spa), - NSEC2SEC(import_expire - gethrtime())); + if (importing) { + (void) spa_import_progress_set_mmp_check(spa_guid(spa), + NSEC2SEC(import_expire - gethrtime())); + } vdev_uberblock_load(rvd, ub, &mmp_label); @@ -3685,6 +3694,61 @@ out: return (error); } +/* + * Called from zfs_ioc_clear for a pool that was suspended + * after failing mmp write checks. + */ +boolean_t +spa_mmp_remote_host_activity(spa_t *spa) +{ + ASSERT(spa_multihost(spa) && spa_suspended(spa)); + + nvlist_t *best_label; + uberblock_t best_ub; + + /* + * Locate the best uberblock on disk + */ + vdev_uberblock_load(spa->spa_root_vdev, &best_ub, &best_label); + if (best_label) { + /* + * confirm that the best hostid matches our hostid + */ + if (nvlist_exists(best_label, ZPOOL_CONFIG_HOSTID) && + spa_get_hostid(spa) != + fnvlist_lookup_uint64(best_label, ZPOOL_CONFIG_HOSTID)) { + nvlist_free(best_label); + return (B_TRUE); + } + nvlist_free(best_label); + } else { + return (B_TRUE); + } + + if (!MMP_VALID(&best_ub) || + !MMP_FAIL_INT_VALID(&best_ub) || + MMP_FAIL_INT(&best_ub) == 0) { + return (B_TRUE); + } + + if (best_ub.ub_txg != spa->spa_uberblock.ub_txg || + best_ub.ub_timestamp != spa->spa_uberblock.ub_timestamp) { + zfs_dbgmsg("txg mismatch detected during pool clear " + "txg %llu ub_txg %llu timestamp %llu ub_timestamp %llu", + (u_longlong_t)spa->spa_uberblock.ub_txg, + (u_longlong_t)best_ub.ub_txg, + (u_longlong_t)spa->spa_uberblock.ub_timestamp, + (u_longlong_t)best_ub.ub_timestamp); + return (B_TRUE); + } + + /* + * Perform an activity check looking for any remote writer + */ + return (spa_activity_check(spa, &spa->spa_uberblock, spa->spa_config, + B_FALSE) != 0); +} + static int spa_verify_host(spa_t *spa, nvlist_t *mos_config) { @@ -4005,7 +4069,8 @@ spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type) return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO)); } - int error = spa_activity_check(spa, ub, spa->spa_config); + int error = + spa_activity_check(spa, ub, spa->spa_config, B_TRUE); if (error) { nvlist_free(label); return (error); @@ -8589,15 +8654,16 @@ spa_async_remove(spa_t *spa, vdev_t *vd) } static void -spa_async_probe(spa_t *spa, vdev_t *vd) +spa_async_fault_vdev(spa_t *spa, vdev_t *vd) { - if (vd->vdev_probe_wanted) { - vd->vdev_probe_wanted = B_FALSE; - vdev_reopen(vd); /* vdev_open() does the actual probe */ + if (vd->vdev_fault_wanted) { + vd->vdev_fault_wanted = B_FALSE; + 
vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, + VDEV_AUX_ERR_EXCEEDED); } for (int c = 0; c < vd->vdev_children; c++) - spa_async_probe(spa, vd->vdev_child[c]); + spa_async_fault_vdev(spa, vd->vdev_child[c]); } static void @@ -8685,11 +8751,11 @@ spa_async_thread(void *arg) } /* - * See if any devices need to be probed. + * See if any devices need to be marked faulted. */ - if (tasks & SPA_ASYNC_PROBE) { + if (tasks & SPA_ASYNC_FAULT_VDEV) { spa_vdev_state_enter(spa, SCL_NONE); - spa_async_probe(spa, spa->spa_root_vdev); + spa_async_fault_vdev(spa, spa->spa_root_vdev); (void) spa_vdev_state_exit(spa, NULL, 0); } diff --git a/module/zfs/txg.c b/module/zfs/txg.c index a67c043446f5..5ce6be69be14 100644 --- a/module/zfs/txg.c +++ b/module/zfs/txg.c @@ -550,6 +550,15 @@ txg_sync_thread(void *arg) timer = (delta > timeout ? 0 : timeout - delta); } + /* + * When we're suspended, nothing should be changing and for + * MMP we don't want to bump anything that would make it + * harder to detect if another host is changing it when + * resuming after a MMP suspend. + */ + if (spa_suspended(spa)) + continue; + /* * Wait until the quiesce thread hands off a txg to us, * prompting it to do so if necessary. diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 6d8eb50a1a6e..981da4e986c4 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -1661,6 +1661,7 @@ vdev_metaslab_fini(vdev_t *vd) typedef struct vdev_probe_stats { boolean_t vps_readable; boolean_t vps_writeable; + boolean_t vps_zio_done_probe; int vps_flags; } vdev_probe_stats_t; @@ -1704,6 +1705,17 @@ vdev_probe_done(zio_t *zio) (void) zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE, spa, vd, NULL, NULL, 0); zio->io_error = SET_ERROR(ENXIO); + + /* + * If this probe was initiated from zio pipeline, then + * change the state in a spa_async_request. Probes that + * were initiated from a vdev_open can change the state + * as part of the open call. + */ + if (vps->vps_zio_done_probe) { + vd->vdev_fault_wanted = B_TRUE; + spa_async_request(spa, SPA_ASYNC_FAULT_VDEV); + } } mutex_enter(&vd->vdev_probe_lock); @@ -1754,6 +1766,7 @@ vdev_probe(vdev_t *vd, zio_t *zio) vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE | ZIO_FLAG_DONT_AGGREGATE | ZIO_FLAG_TRYHARD; + vps->vps_zio_done_probe = (zio != NULL); if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) { /* @@ -1780,15 +1793,6 @@ vdev_probe(vdev_t *vd, zio_t *zio) vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd, vdev_probe_done, vps, vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE); - - /* - * We can't change the vdev state in this context, so we - * kick off an async task to do it on our behalf. - */ - if (zio != NULL) { - vd->vdev_probe_wanted = B_TRUE; - spa_async_request(spa, SPA_ASYNC_PROBE); - } } if (zio != NULL) diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index 737d8b33e188..5c0e750c4614 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -1982,6 +1982,7 @@ retry: /* * If this isn't a resync due to I/O errors, * and nothing changed in this transaction group, + * and multihost protection isn't enabled, * and the vdev configuration hasn't changed, * then there's nothing to do. 
*/ @@ -1989,7 +1990,8 @@ retry: boolean_t changed = uberblock_update(ub, spa->spa_root_vdev, txg, spa->spa_mmp.mmp_delay); - if (!changed && list_is_empty(&spa->spa_config_dirty_list)) + if (!changed && list_is_empty(&spa->spa_config_dirty_list) && + !spa_multihost(spa)) return (0); } diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index ac8329185cca..b07837113293 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -5800,10 +5800,13 @@ zfs_ioc_clear(zfs_cmd_t *zc) /* * If multihost is enabled, resuming I/O is unsafe as another - * host may have imported the pool. + * host may have imported the pool. Check for remote activity. */ - if (spa_multihost(spa) && spa_suspended(spa)) - return (SET_ERROR(EINVAL)); + if (spa_multihost(spa) && spa_suspended(spa) && + spa_mmp_remote_host_activity(spa)) { + spa_close(spa, FTAG); + return (SET_ERROR(EREMOTEIO)); + } spa_vdev_state_enter(spa, SCL_NONE); diff --git a/module/zfs/zio.c b/module/zfs/zio.c index d0b4016237b9..046e6d64c1a9 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -2503,8 +2503,10 @@ zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t reason) "failure and the failure mode property for this pool " "is set to panic.", spa_name(spa)); - cmn_err(CE_WARN, "Pool '%s' has encountered an uncorrectable I/O " - "failure and has been suspended.\n", spa_name(spa)); + if (reason != ZIO_SUSPEND_MMP) { + cmn_err(CE_WARN, "Pool '%s' has encountered an uncorrectable " + "I/O failure and has been suspended.\n", spa_name(spa)); + } (void) zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, NULL, 0); diff --git a/module/zfs/zio_inject.c b/module/zfs/zio_inject.c index 609182f4a2cd..66ad72fb88e9 100644 --- a/module/zfs/zio_inject.c +++ b/module/zfs/zio_inject.c @@ -605,9 +605,11 @@ zio_handle_io_delay(zio_t *zio) if (vd->vdev_guid != handler->zi_record.zi_guid) continue; + /* also match on I/O type (e.g., -T read) */ if (handler->zi_record.zi_iotype != ZIO_TYPES && - handler->zi_record.zi_iotype != zio->io_type) - continue; + handler->zi_record.zi_iotype != zio->io_type) { + continue; + } /* * Defensive; should never happen as the array allocation diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index a0b74ef4a8c6..92ce09ec6fcb 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -146,7 +146,7 @@ tags = ['functional', 'mmap'] tests = ['mmp_on_thread', 'mmp_on_uberblocks', 'mmp_on_off', 'mmp_interval', 'mmp_active_import', 'mmp_inactive_import', 'mmp_exported_import', 'mmp_write_uberblocks', 'mmp_reset_interval', 'multihost_history', - 'mmp_on_zdb', 'mmp_write_distribution', 'mmp_hostid'] + 'mmp_on_zdb', 'mmp_write_distribution', 'mmp_hostid', 'mmp_write_slow_disk'] tags = ['functional', 'mmp'] [tests/functional/mount:Linux] diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 8befed077234..cc66d762f3c2 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1588,6 +1588,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/mmp/mmp_on_zdb.ksh \ functional/mmp/mmp_reset_interval.ksh \ functional/mmp/mmp_write_distribution.ksh \ + functional/mmp/mmp_write_slow_disk.ksh \ functional/mmp/mmp_write_uberblocks.ksh \ functional/mmp/multihost_history.ksh \ functional/mmp/setup.ksh \ diff --git a/tests/zfs-tests/tests/functional/mmp/mmp_write_slow_disk.ksh b/tests/zfs-tests/tests/functional/mmp/mmp_write_slow_disk.ksh new file mode 100755 index 000000000000..8b118684aa7f --- /dev/null +++ 
b/tests/zfs-tests/tests/functional/mmp/mmp_write_slow_disk.ksh @@ -0,0 +1,97 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2024, Klara Inc +# + +# DESCRIPTION: +# Verify that long VDEV probes do not cause MMP checks to suspend pool +# Note: without PR-15839 fix, this test will suspend the pool. +# +# A device that is returning unexpected errors will trigger a vdev_probe. +# When the device additionally has slow response times, the probe can hold +# the spa config lock as a writer for a long period of time such that the +# mmp uberblock updates stall when trying to acquire the spa config lock. +# +# STRATEGY: +# 1. Create a pool with multiple leaf vdevs +# 2. Enable multihost and multihost_history +# 3. Delay for MMP writes to occur +# 4. Verify that a long VDEV probe didn't cause MMP check to suspend pool +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/mmp/mmp.cfg +. $STF_SUITE/tests/functional/mmp/mmp.kshlib + +verify_runnable "both" + +function cleanup +{ + log_must zinject -c all + + if [[ $(zpool list -H -o health $MMP_POOL) == "SUSPENDED" ]]; then + log_must zpool clear $MMP_POOL + zpool get state $MMP_POOL $MMP_DIR/file.3 + zpool events | grep ".fs.zfs." | grep -v "history_event" + fi + + poolexists $MMP_POOL && destroy_pool $MMP_POOL + log_must rm -r $MMP_DIR + log_must mmp_clear_hostid +} + +log_assert "A long VDEV probe doesn't cause a MMP check suspend" +log_onexit cleanup + +MMP_HISTORY_URL=/proc/spl/kstat/zfs/$MMP_POOL/multihost + +# Create a multiple drive pool +log_must zpool events -c +log_must mkdir -p $MMP_DIR +log_must truncate -s 128M $MMP_DIR/file.{0,1,2,3,4,5} +log_must zpool create -f $MMP_POOL \ + mirror $MMP_DIR/file.{0,1,2} \ + mirror $MMP_DIR/file.{3,4,5} + +# Enable MMP +log_must mmp_set_hostid $HOSTID1 +log_must zpool set multihost=on $MMP_POOL +clear_mmp_history + +# Inject vdev write error along with a delay +log_must zinject -f 33 -e io -L pad2 -T write -d $MMP_DIR/file.3 $MMP_POOL +log_must zinject -f 50 -e io -L uber -T write -d $MMP_DIR/file.3 $MMP_POOL +log_must zinject -D 2000:4 -T write -d $MMP_DIR/file.3 $MMP_POOL + +log_must dd if=/dev/urandom of=/$MMP_POOL/data bs=1M count=5 +sleep 10 +sync_pool $MMP_POOL + +# Confirm mmp writes to the non-slow disks have taken place +for x in {0,1,2,4}; do + write_count=$(grep -c file.${x} $MMP_HISTORY_URL) + [[ $write_count -gt 0 ]] || log_fail "expecting mmp writes" +done + +# Expect that the pool was not suspended +log_must check_state $MMP_POOL "" "ONLINE" +health=$(zpool list -H -o health $MMP_POOL) +log_note "$MMP_POOL health is $health" +[[ "$health" == "SUSPENDED" ]] && log_fail "$MMP_POOL $health unexpected" + +log_pass "A long VDEV probe doesn't cause a MMP check suspend" -- cgit v1.2.3 From 61f3638a3463b58168b9c2b0d1f8605fde1349d9 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Tue, 24 Oct 2023 11:00:07 -0700 Subject: Add prefetch property ZFS prefetch is currently governed by the zfs_prefetch_disable tunable. 
However, this is a module-wide setting - if a specific dataset benefits from prefetch while others have issues with it, an optimal solution does not exist. This commit introduces the "prefetch" tri-state property, which enables granular control (at the dataset/volume level) over prefetching. This patch does not remove the zfs_prefetch_disable tunable, which remains a system-wide switch for enabling/disabling prefetch. However, to avoid duplication, it would be preferable to deprecate and then remove the module tunable. Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Reviewed-by: Ameer Hamza Signed-off-by: Gionatan Danti Co-authored-by: Gionatan Danti Closes #15237 Closes #15436 --- include/sys/dmu_objset.h | 1 + include/sys/fs/zfs.h | 7 +++++++ lib/libzfs/libzfs.abi | 3 ++- man/man7/zfsprops.7 | 17 +++++++++++++++++ module/zcommon/zfs_prop.c | 11 +++++++++++ module/zfs/dmu_objset.c | 19 +++++++++++++++++++ module/zfs/dmu_zfetch.c | 7 ++++++- 7 files changed, 63 insertions(+), 2 deletions(-) diff --git a/include/sys/dmu_objset.h b/include/sys/dmu_objset.h index 9f6e0fdd601b..a9123e862af7 100644 --- a/include/sys/dmu_objset.h +++ b/include/sys/dmu_objset.h @@ -132,6 +132,7 @@ struct objset { zfs_logbias_op_t os_logbias; zfs_cache_type_t os_primary_cache; zfs_cache_type_t os_secondary_cache; + zfs_prefetch_type_t os_prefetch; zfs_sync_type_t os_sync; zfs_redundant_metadata_type_t os_redundant_metadata; uint64_t os_recordsize; diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index f84cb7aade7f..4329e4e86f2d 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -191,6 +191,7 @@ typedef enum { ZFS_PROP_REDACTED, ZFS_PROP_REDACT_SNAPS, ZFS_PROP_SNAPSHOTS_CHANGED, + ZFS_PROP_PREFETCH, ZFS_NUM_PROPS } zfs_prop_t; @@ -546,6 +547,12 @@ typedef enum zfs_key_location { ZFS_KEYLOCATION_LOCATIONS } zfs_keylocation_t; +typedef enum { + ZFS_PREFETCH_NONE = 0, + ZFS_PREFETCH_METADATA = 1, + ZFS_PREFETCH_ALL = 2 +} zfs_prefetch_type_t; + #define DEFAULT_PBKDF2_ITERATIONS 350000 #define MIN_PBKDF2_ITERATIONS 100000 diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi index 771c7407e9da..8bedfe72294c 100644 --- a/lib/libzfs/libzfs.abi +++ b/lib/libzfs/libzfs.abi @@ -1867,7 +1867,8 @@ - + + diff --git a/man/man7/zfsprops.7 b/man/man7/zfsprops.7 index e3674b1f8a8d..59f6404379af 100644 --- a/man/man7/zfsprops.7 +++ b/man/man7/zfsprops.7 @@ -1613,6 +1613,23 @@ If this property is set to then only metadata is cached. The default value is .Sy all . +.It Sy prefetch Ns = Ns Sy all Ns | Ns Sy none Ns | Ns Sy metadata +Controls what speculative prefetch does. +If this property is set to +.Sy all , +then both user data and metadata are prefetched. +If this property is set to +.Sy none , +then neither user data nor metadata are prefetched. +If this property is set to +.Sy metadata , +then only metadata are prefetched. +The default value is +.Sy all . +.Pp +Please note that the module parameter zfs_disable_prefetch=1 can +be used to totally disable speculative prefetch, bypassing anything +this property does. .It Sy setuid Ns = Ns Sy on Ns | Ns Sy off Controls whether the setuid bit is respected for the file system.
The default value is diff --git a/module/zcommon/zfs_prop.c b/module/zcommon/zfs_prop.c index 3db6fd13f4ae..29764674a31b 100644 --- a/module/zcommon/zfs_prop.c +++ b/module/zcommon/zfs_prop.c @@ -345,6 +345,13 @@ zfs_prop_init(void) { NULL } }; + static const zprop_index_t prefetch_table[] = { + { "none", ZFS_PREFETCH_NONE }, + { "metadata", ZFS_PREFETCH_METADATA }, + { "all", ZFS_PREFETCH_ALL }, + { NULL } + }; + static const zprop_index_t sync_table[] = { { "standard", ZFS_SYNC_STANDARD }, { "always", ZFS_SYNC_ALWAYS }, @@ -453,6 +460,10 @@ zfs_prop_init(void) ZFS_CACHE_ALL, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT | ZFS_TYPE_VOLUME, "all | none | metadata", "SECONDARYCACHE", cache_table, sfeatures); + zprop_register_index(ZFS_PROP_PREFETCH, "prefetch", + ZFS_PREFETCH_ALL, PROP_INHERIT, + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT | ZFS_TYPE_VOLUME, + "none | metadata | all", "PREFETCH", prefetch_table, sfeatures); zprop_register_index(ZFS_PROP_LOGBIAS, "logbias", ZFS_LOGBIAS_LATENCY, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "latency | throughput", "LOGBIAS", logbias_table, sfeatures); diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c index f8bd7422a5df..ef966d6703c9 100644 --- a/module/zfs/dmu_objset.c +++ b/module/zfs/dmu_objset.c @@ -263,6 +263,19 @@ secondary_cache_changed_cb(void *arg, uint64_t newval) os->os_secondary_cache = newval; } +static void +prefetch_changed_cb(void *arg, uint64_t newval) +{ + objset_t *os = arg; + + /* + * Inheritance should have been done by now. + */ + ASSERT(newval == ZFS_PREFETCH_ALL || newval == ZFS_PREFETCH_NONE || + newval == ZFS_PREFETCH_METADATA); + os->os_prefetch = newval; +} + static void sync_changed_cb(void *arg, uint64_t newval) { @@ -562,6 +575,11 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, zfs_prop_to_name(ZFS_PROP_SECONDARYCACHE), secondary_cache_changed_cb, os); } + if (err == 0) { + err = dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_PREFETCH), + prefetch_changed_cb, os); + } if (!ds->ds_is_snapshot) { if (err == 0) { err = dsl_prop_register(ds, @@ -635,6 +653,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, os->os_primary_cache = ZFS_CACHE_ALL; os->os_secondary_cache = ZFS_CACHE_ALL; os->os_dnodesize = DNODE_MIN_SIZE; + os->os_prefetch = ZFS_PREFETCH_ALL; } if (ds == NULL || !ds->ds_is_snapshot) diff --git a/module/zfs/dmu_zfetch.c b/module/zfs/dmu_zfetch.c index 3439f9bddf4e..ed50f1889b59 100644 --- a/module/zfs/dmu_zfetch.c +++ b/module/zfs/dmu_zfetch.c @@ -471,9 +471,14 @@ dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks, { zstream_t *zs; spa_t *spa = zf->zf_dnode->dn_objset->os_spa; + zfs_prefetch_type_t os_prefetch = zf->zf_dnode->dn_objset->os_prefetch; - if (zfs_prefetch_disable) + if (zfs_prefetch_disable || os_prefetch == ZFS_PREFETCH_NONE) return (NULL); + + if (os_prefetch == ZFS_PREFETCH_METADATA) + fetch_data = B_FALSE; + /* * If we haven't yet loaded the indirect vdevs' mappings, we * can only read from blocks that we carefully ensure are on -- cgit v1.2.3 From 3d4d61988ad20e88d704647f3c86f42176b460e0 Mon Sep 17 00:00:00 2001 From: Alan Somers Date: Thu, 25 Apr 2024 16:24:52 -0500 Subject: Fix updating the zvol_htable when renaming a zvol When renaming a zvol, insert it into zvol_htable using the new name, not the old name. Otherwise some operations won't work. For example, "zfs set volsize" while the zvol is open. 
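
To make the failure described above concrete, here is a minimal reproduction sketch in the style of the suite's ksh tests. The pool name, dataset names, and the $SCRATCH_DEV device are placeholders, not taken from the commit, and the pre-fix behavior is only as reported in the commit message.

# Assumed names throughout; adjust to a disposable pool before running.
zpool create tank $SCRATCH_DEV        # $SCRATCH_DEV: a scratch block device
zfs create -V 1G tank/vol             # create a zvol
exec 3< /dev/zvol/tank/vol            # hold the volume open across the rename
zfs rename tank/vol tank/vol2         # rehash should now use the new name
zfs set volsize=2G tank/vol2          # reportedly failed while open before this fix
exec 3<&-                             # release the open descriptor
zpool destroy tank
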
Sponsored by: Axcient Reviewed-by: Brian Behlendorf Reviewed-by: Alek Pinchuk Signed-off-by: Alan Somers Closes #16127 Closes #16128 --- module/os/freebsd/zfs/zvol_os.c | 2 +- module/os/linux/zfs/zvol_os.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/module/os/freebsd/zfs/zvol_os.c b/module/os/freebsd/zfs/zvol_os.c index b6edac434dea..7c418f26fc14 100644 --- a/module/os/freebsd/zfs/zvol_os.c +++ b/module/os/freebsd/zfs/zvol_os.c @@ -1273,7 +1273,7 @@ zvol_os_rename_minor(zvol_state_t *zv, const char *newname) ASSERT(MUTEX_HELD(&zv->zv_state_lock)); /* Move to a new hashtable entry. */ - zv->zv_hash = zvol_name_hash(zv->zv_name); + zv->zv_hash = zvol_name_hash(newname); hlist_del(&zv->zv_hlink); hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash)); diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c index c7360293f0e6..61c8db5d8178 100644 --- a/module/os/linux/zfs/zvol_os.c +++ b/module/os/linux/zfs/zvol_os.c @@ -1564,7 +1564,7 @@ zvol_os_rename_minor(zvol_state_t *zv, const char *newname) strlcpy(zv->zv_name, newname, sizeof (zv->zv_name)); /* move to new hashtable entry */ - zv->zv_hash = zvol_name_hash(zv->zv_name); + zv->zv_hash = zvol_name_hash(newname); hlist_del(&zv->zv_hlink); hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash)); -- cgit v1.2.3 From 2566592045780e7be7afc899c2496b1ae3af4f4d Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Wed, 17 Apr 2024 13:31:25 -0700 Subject: Tag zfs-2.2.4 META file and changelog updated. Signed-off-by: Tony Hutter --- META | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/META b/META index a33c1be8787a..383fa37fd42a 100644 --- a/META +++ b/META @@ -1,7 +1,7 @@ Meta: 1 Name: zfs Branch: 1.0 -Version: 2.2.3 +Version: 2.2.4 Release: 1 Release-Tags: relext License: CDDL -- cgit v1.2.3
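
As a closing illustration of the per-dataset prefetch property introduced by commit 61f3638a above, a short usage sketch follows. The dataset names are examples only; the property values (all, none, metadata), the default of "all", and inheritability come from the zfs_prop.c and zfsprops.7 changes in that commit.

# Illustrative only: per-dataset control of speculative prefetch.
zfs set prefetch=metadata tank/postgres   # prefetch metadata but not file data
zfs set prefetch=none tank/backups        # no speculative prefetch for this dataset
zfs inherit prefetch tank/backups         # revert to the inherited value (default "all")
zfs get -r prefetch tank                  # the property is inheritable
# Per the commit message, the module-wide zfs_prefetch_disable tunable still
# disables prefetch globally regardless of this property.
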