aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRob N <rob.norris@klarasystems.com>2024-04-19 23:41:31 +0000
committerGitHub <noreply@github.com>2024-04-19 23:41:31 +0000
commitf4f156157de3f61e55db0429b10c63d02226e115 (patch)
tree7953263d50dd6c349b95d335cf37410f6fd27aa2
parent9f83eec03904b18e052fbe2c66542bd47254cf57 (diff)
downloadsrc-f4f156157de3f61e55db0429b10c63d02226e115.tar.gz
src-f4f156157de3f61e55db0429b10c63d02226e115.zip
abd_iter_page: rework to handle multipage scatterlists
Previously, abd_iter_page() would assume that every scatterlist would contain a single page (compound or no), because that's all we ever create in abd_alloc_chunks(). However, scatterlists can contain multiple pages of arbitrary provenance, and if we get one of those, we'd get all the math wrong. This reworks things to handle multiple pages in a scatterlist, by properly finding the right page within it for the given offset, and understanding better where the end of the page is and not crossing it. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Reported-by: Brian Atkinson <batkinson@lanl.gov> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: Brian Atkinson <batkinson@lanl.gov> Signed-off-by: Rob Norris <rob.norris@klarasystems.com> Closes #16108
-rw-r--r--module/os/linux/zfs/abd_os.c120
1 files changed, 74 insertions, 46 deletions
diff --git a/module/os/linux/zfs/abd_os.c b/module/os/linux/zfs/abd_os.c
index d3255dcbc0f7..cee7410c8833 100644
--- a/module/os/linux/zfs/abd_os.c
+++ b/module/os/linux/zfs/abd_os.c
@@ -1015,10 +1015,50 @@ abd_cache_reap_now(void)
}
#if defined(_KERNEL)
+
/*
- * Yield the next page struct and data offset and size within it, without
+ * This is abd_iter_page(), the function underneath abd_iterate_page_func().
+ * It yields the next page struct and data offset and size within it, without
* mapping it into the address space.
*/
+
+/*
+ * "Compound pages" are a group of pages that can be referenced from a single
+ * struct page *. It's organised as a "head" page, followed by a series of
+ * "tail" pages.
+ *
+ * In OpenZFS, compound pages are allocated using the __GFP_COMP flag, which we
+ * get from scatter ABDs and SPL vmalloc slabs (ie >16K allocations). So a
+ * great many of the IO buffers we get are going to be of this type.
+ *
+ * The tail pages are just regular PAGESIZE pages, and can be safely used
+ * as-is. However, the head page has length covering itself and all the tail
+ * pages. If the ABD chunk spans multiple pages, then we can use the head page
+ * and a >PAGESIZE length, which is far more efficient.
+ *
+ * Before kernel 4.5 however, compound page heads were refcounted separately
+ * from tail pages, such that moving back to the head page would require us to
+ * take a reference to it and release it once we're completely finished with
+ * it. In practice, that means when our caller is done with the ABD, which we
+ * have no insight into from here. Rather than contort this API to track head
+ * page references on such ancient kernels, we disable this special compound
+ * page handling before 4.5, instead just treating each page within it as a
+ * regular PAGESIZE page (which it is). This is slightly less efficient, but
+ * makes everything far simpler.
+ *
+ * The below test sets/clears ABD_ITER_COMPOUND_PAGES to enable/disable the
+ * special handling, and also defines the ABD_ITER_PAGE_SIZE(page) macro to
+ * understand compound pages, or not, as required.
+ */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 5, 0)
+#define ABD_ITER_COMPOUND_PAGES 1
+#define ABD_ITER_PAGE_SIZE(page) \
+ (PageCompound(page) ? page_size(page) : PAGESIZE)
+#else
+#undef ABD_ITER_COMPOUND_PAGES
+#define ABD_ITER_PAGE_SIZE(page) (PAGESIZE)
+#endif
+
void
abd_iter_page(struct abd_iter *aiter)
{
@@ -1032,6 +1072,12 @@ abd_iter_page(struct abd_iter *aiter)
struct page *page;
size_t doff, dsize;
+ /*
+ * Find the page, and the start of the data within it. This is computed
+ * differently for linear and scatter ABDs; linear is referenced by
+ * virtual memory location, while scatter is referenced by page
+ * pointer.
+ */
if (abd_is_linear(aiter->iter_abd)) {
ASSERT3U(aiter->iter_pos, ==, aiter->iter_offset);
@@ -1044,57 +1090,24 @@ abd_iter_page(struct abd_iter *aiter)
/* offset of address within the page */
doff = offset_in_page(paddr);
-
- /* total data remaining in abd from this position */
- dsize = aiter->iter_abd->abd_size - aiter->iter_offset;
} else {
ASSERT(!abd_is_gang(aiter->iter_abd));
/* current scatter page */
- page = sg_page(aiter->iter_sg);
+ page = nth_page(sg_page(aiter->iter_sg),
+ aiter->iter_offset >> PAGE_SHIFT);
/* position within page */
- doff = aiter->iter_offset;
-
- /* remaining data in scatterlist */
- dsize = MIN(aiter->iter_sg->length - aiter->iter_offset,
- aiter->iter_abd->abd_size - aiter->iter_pos);
+ doff = aiter->iter_offset & (PAGESIZE - 1);
}
- ASSERT(page);
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 5, 0)
+#ifdef ABD_ITER_COMPOUND_PAGES
if (PageTail(page)) {
/*
- * This page is part of a "compound page", which is a group of
- * pages that can be referenced from a single struct page *.
- * Its organised as a "head" page, followed by a series of
- * "tail" pages.
- *
- * In OpenZFS, compound pages are allocated using the
- * __GFP_COMP flag, which we get from scatter ABDs and SPL
- * vmalloc slabs (ie >16K allocations). So a great many of the
- * IO buffers we get are going to be of this type.
- *
- * The tail pages are just regular PAGE_SIZE pages, and can be
- * safely used as-is. However, the head page has length
- * covering itself and all the tail pages. If this ABD chunk
- * spans multiple pages, then we can use the head page and a
- * >PAGE_SIZE length, which is far more efficient.
- *
- * To do this, we need to adjust the offset to be counted from
- * the head page. struct page for compound pages are stored
- * contiguously, so we can just adjust by a simple offset.
- *
- * Before kernel 4.5, compound page heads were refcounted
- * separately, such that moving back to the head page would
- * require us to take a reference to it and releasing it once
- * we're completely finished with it. In practice, that means
- * when our caller is done with the ABD, which we have no
- * insight into from here. Rather than contort this API to
- * track head page references on such ancient kernels, we just
- * compile this block out and use the tail pages directly. This
- * is slightly less efficient, but makes everything far
- * simpler.
+ * If this is a compound tail page, move back to the head, and
+ * adjust the offset to match. This may let us yield a much
+ * larger amount of data from a single logical page, and so
+ * leave our caller with fewer pages to process.
*/
struct page *head = compound_head(page);
doff += ((page - head) * PAGESIZE);
@@ -1102,12 +1115,27 @@ abd_iter_page(struct abd_iter *aiter)
}
#endif
- /* final page and position within it */
+ ASSERT(page);
+
+ /*
+ * Compute the maximum amount of data we can take from this page. This
+ * is the smallest of:
+ * - the remaining space in the page
+ * - the remaining space in this scatterlist entry (which may not cover
+ * the entire page)
+ * - the remaining space in the abd (which may not cover the entire
+ * scatterlist entry)
+ */
+ dsize = MIN(ABD_ITER_PAGE_SIZE(page) - doff,
+ aiter->iter_abd->abd_size - aiter->iter_pos);
+ if (!abd_is_linear(aiter->iter_abd))
+ dsize = MIN(dsize, aiter->iter_sg->length - aiter->iter_offset);
+ ASSERT3U(dsize, >, 0);
+
+ /* final iterator outputs */
aiter->iter_page = page;
aiter->iter_page_doff = doff;
-
- /* amount of data in the chunk, up to the end of the page */
- aiter->iter_page_dsize = MIN(dsize, page_size(page) - doff);
+ aiter->iter_page_dsize = dsize;
}
/*