Diffstat (limited to 'sys/contrib/openzfs/module/os/linux/zfs/zfs_uio.c')
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/zfs_uio.c  | 673
1 file changed, 673 insertions(+), 0 deletions(-)
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_uio.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_uio.c
new file mode 100644
index 000000000000..d282f6d95ddf
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_uio.c
@@ -0,0 +1,673 @@
+// SPDX-License-Identifier: CDDL-1.0
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or https://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+/*
+ * University Copyright- Copyright (c) 1982, 1986, 1988
+ * The Regents of the University of California
+ * All Rights Reserved
+ *
+ * University Acknowledgment- Portions of this document are derived from
+ * software developed by the University of California, Berkeley, and its
+ * contributors.
+ */
+/*
+ * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
+ */
+
+#ifdef _KERNEL
+
+#include <sys/errno.h>
+#include <sys/vmem.h>
+#include <sys/sysmacros.h>
+#include <sys/types.h>
+#include <sys/uio_impl.h>
+#include <sys/string.h>
+#include <sys/zfs_refcount.h>
+#include <sys/zfs_debug.h>
+#include <linux/kmap_compat.h>
+#include <linux/uaccess.h>
+#include <linux/pagemap.h>
+#include <linux/mman.h>
+
+/*
+ * Move "n" bytes at byte address "p"; "rw" indicates the direction
+ * of the move, and the I/O parameters are provided in "uio", which is
+ * updated to reflect the data that was moved. Returns 0 on success or
+ * a non-zero errno on failure.
+ */
+static int
+zfs_uiomove_iov(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
+{
+ const struct iovec *iov = uio->uio_iov;
+ size_t skip = uio->uio_skip;
+ ulong_t cnt;
+
+ ASSERT3S(uio->uio_segflg, ==, UIO_SYSSPACE);
+ while (n && uio->uio_resid) {
+ cnt = MIN(iov->iov_len - skip, n);
+ if (rw == UIO_READ)
+ memcpy(iov->iov_base + skip, p, cnt);
+ else
+ memcpy(p, iov->iov_base + skip, cnt);
+ skip += cnt;
+ if (skip == iov->iov_len) {
+ skip = 0;
+ uio->uio_iov = (++iov);
+ uio->uio_iovcnt--;
+ }
+ uio->uio_skip = skip;
+ uio->uio_resid -= cnt;
+ uio->uio_loffset += cnt;
+ p = (caddr_t)p + cnt;
+ n -= cnt;
+ }
+ return (0);
+}
+
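+/*
+ * Copy "n" bytes between the buffer "p" and a bvec-backed uio by mapping
+ * each bio_vec page in turn.  Used when the uio does not wrap a
+ * struct request.
+ */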
+static int
+zfs_uiomove_bvec_impl(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
+{
+ const struct bio_vec *bv = uio->uio_bvec;
+ size_t skip = uio->uio_skip;
+ ulong_t cnt;
+
+ while (n && uio->uio_resid) {
+ void *paddr;
+ cnt = MIN(bv->bv_len - skip, n);
+
+ paddr = zfs_kmap_local(bv->bv_page);
+ if (rw == UIO_READ) {
+ /* Copy from buffer 'p' to the bvec data */
+ memcpy(paddr + bv->bv_offset + skip, p, cnt);
+ } else {
+ /* Copy from bvec data to buffer 'p' */
+ memcpy(p, paddr + bv->bv_offset + skip, cnt);
+ }
+ zfs_kunmap_local(paddr);
+
+ skip += cnt;
+ if (skip == bv->bv_len) {
+ skip = 0;
+ uio->uio_bvec = (++bv);
+ uio->uio_iovcnt--;
+ }
+ uio->uio_skip = skip;
+ uio->uio_resid -= cnt;
+ uio->uio_loffset += cnt;
+ p = (caddr_t)p + cnt;
+ n -= cnt;
+ }
+ return (0);
+}
+
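+/*
+ * Map a single bio_vec page and copy "cnt" bytes between it and the
+ * buffer "p", starting "skip" bytes into the bvec data.
+ */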
+static void
+zfs_copy_bvec(void *p, size_t skip, size_t cnt, zfs_uio_rw_t rw,
+ struct bio_vec *bv)
+{
+ void *paddr;
+
+ paddr = zfs_kmap_local(bv->bv_page);
+ if (rw == UIO_READ) {
+ /* Copy from buffer 'p' to the bvec data */
+ memcpy(paddr + bv->bv_offset + skip, p, cnt);
+ } else {
+ /* Copy from bvec data to buffer 'p' */
+ memcpy(p, paddr + bv->bv_offset + skip, cnt);
+ }
+ zfs_kunmap_local(paddr);
+}
+
+/*
+ * Copy 'n' bytes of data between the buffer p[] and the data represented
+ * by the request in the uio.
+ */
+static int
+zfs_uiomove_bvec_rq(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
+{
+ struct request *rq = uio->rq;
+ struct bio_vec bv;
+ struct req_iterator iter;
+ size_t this_seg_start; /* logical offset */
+ size_t this_seg_end; /* logical offset */
+ size_t skip_in_seg;
+ size_t copy_from_seg;
+ size_t orig_loffset;
+ int copied = 0;
+
+ /*
+ * Get the original logical offset of this entire request (because
+ * uio->uio_loffset will be modified over time).
+ */
+ orig_loffset = io_offset(NULL, rq);
+ this_seg_start = orig_loffset;
+
+ rq_for_each_segment(bv, rq, iter) {
+ /*
+ * Lookup what the logical offset of the last byte of this
+ * segment is.
+ */
+ this_seg_end = this_seg_start + bv.bv_len - 1;
+
+ /*
+ * We only need to operate on segments that have data we're
+ * copying.
+ */
+ if (uio->uio_loffset >= this_seg_start &&
+ uio->uio_loffset <= this_seg_end) {
+ /*
+ * Some, or all, of the data in this segment needs to be
+ * copied.
+ */
+
+ /*
+ * We may not be copying from the first byte in the
+ * segment. Figure out how many bytes to skip copying
+ * from the beginning of this segment.
+ */
+ skip_in_seg = uio->uio_loffset - this_seg_start;
+
+ /*
+ * Calculate the total number of bytes from this
+ * segment that we will be copying.
+ */
+ copy_from_seg = MIN(bv.bv_len - skip_in_seg, n);
+
+ /* Copy the bytes */
+ zfs_copy_bvec(p, skip_in_seg, copy_from_seg, rw, &bv);
+ p = ((char *)p) + copy_from_seg;
+
+ n -= copy_from_seg;
+ uio->uio_resid -= copy_from_seg;
+ uio->uio_loffset += copy_from_seg;
+ copied = 1; /* We copied some data */
+ }
+
+ this_seg_start = this_seg_end + 1;
+ }
+
+ if (!copied) {
+ /* Didn't copy anything */
+ uio->uio_resid = 0;
+ }
+ return (0);
+}
+
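+/*
+ * Dispatch a bvec-backed uio to the request-based or plain bvec copy
+ * routine, depending on whether the uio wraps a struct request.
+ */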
+static int
+zfs_uiomove_bvec(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
+{
+ if (uio->rq != NULL)
+ return (zfs_uiomove_bvec_rq(p, n, rw, uio));
+ return (zfs_uiomove_bvec_impl(p, n, rw, uio));
+}
+
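+/*
+ * Copy "n" bytes between the buffer "p" and an iov_iter-backed uio using
+ * copy_to_iter()/copy_from_iter().  When "revert" is set the iov_iter is
+ * rewound afterwards so the caller's iterator is left unmodified.
+ */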
+static int
+zfs_uiomove_iter(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio,
+ boolean_t revert)
+{
+ size_t cnt = MIN(n, uio->uio_resid);
+
+ if (rw == UIO_READ)
+ cnt = copy_to_iter(p, cnt, uio->uio_iter);
+ else
+ cnt = copy_from_iter(p, cnt, uio->uio_iter);
+
+ /*
+ * When operating on a full pipe no bytes are processed.
+ * In this case return EFAULT, which is converted to EAGAIN
+ * by the kernel's generic_file_splice_read() function.
+ */
+ if (cnt == 0)
+ return (EFAULT);
+
+ /*
+ * Revert advancing the uio_iter. The revert flag is set by
+ * zfs_uiocopy() to avoid consuming the uio and its iov_iter
+ * structure.
+ */
+ if (revert)
+ iov_iter_revert(uio->uio_iter, cnt);
+
+ uio->uio_resid -= cnt;
+ uio->uio_loffset += cnt;
+
+ return (0);
+}
+
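+/*
+ * Copy "n" bytes between the buffer "p" and the uio, dispatching on the
+ * uio segment type.
+ */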
+int
+zfs_uiomove(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
+{
+ if (uio->uio_segflg == UIO_BVEC)
+ return (zfs_uiomove_bvec(p, n, rw, uio));
+ else if (uio->uio_segflg == UIO_ITER)
+ return (zfs_uiomove_iter(p, n, rw, uio, B_FALSE));
+ else
+ return (zfs_uiomove_iov(p, n, rw, uio));
+}
+EXPORT_SYMBOL(zfs_uiomove);
+
+/*
+ * Fault in the pages of the first n bytes specified by the uio structure.
+ * One byte in each page is touched and the uio struct is left unmodified.
+ * Any error terminates the prefault attempt, as this is only a best effort
+ * to get the pages resident.
+ */
+int
+zfs_uio_prefaultpages(ssize_t n, zfs_uio_t *uio)
+{
+ if (uio->uio_segflg == UIO_SYSSPACE || uio->uio_segflg == UIO_BVEC ||
+ (uio->uio_extflg & UIO_DIRECT)) {
+ /*
+ * There's never a need to fault in kernel pages or Direct I/O
+ * write pages. Direct I/O write pages have already been
+ * pinned, so a fault can never occur for them.
+ */
+ return (0);
+ } else {
+ ASSERT3S(uio->uio_segflg, ==, UIO_ITER);
+ /*
+ * Since at least the Linux 4.18 kernel,
+ * iov_iter_fault_in_readable() can be relied on to fault in
+ * user pages when referenced.
+ */
+ if (iov_iter_fault_in_readable(uio->uio_iter, n))
+ return (EFAULT);
+ }
+
+ return (0);
+}
+EXPORT_SYMBOL(zfs_uio_prefaultpages);
+
+/*
+ * The same as zfs_uiomove() but doesn't modify the uio structure.
+ * Returns in cbytes how many bytes were copied.
+ */
+int
+zfs_uiocopy(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio, size_t *cbytes)
+{
+ zfs_uio_t uio_copy;
+ int ret;
+
+ memcpy(&uio_copy, uio, sizeof (zfs_uio_t));
+
+ if (uio->uio_segflg == UIO_BVEC)
+ ret = zfs_uiomove_bvec(p, n, rw, &uio_copy);
+ else if (uio->uio_segflg == UIO_ITER)
+ ret = zfs_uiomove_iter(p, n, rw, &uio_copy, B_TRUE);
+ else
+ ret = zfs_uiomove_iov(p, n, rw, &uio_copy);
+
+ *cbytes = uio->uio_resid - uio_copy.uio_resid;
+
+ return (ret);
+}
+EXPORT_SYMBOL(zfs_uiocopy);
+
+/*
+ * Drop the next n chars out of *uio.
+ */
+void
+zfs_uioskip(zfs_uio_t *uio, size_t n)
+{
+ if (n > uio->uio_resid)
+ return;
+ /*
+ * When using a uio with a struct request, we simply
+ * use uio_loffset as a pointer to the next logical byte to
+ * copy in the request. We don't have to do any fancy
+ * accounting with uio_bvec/uio_iovcnt since we don't use
+ * them.
+ */
+ if (uio->uio_segflg == UIO_BVEC && uio->rq == NULL) {
+ uio->uio_skip += n;
+ while (uio->uio_iovcnt &&
+ uio->uio_skip >= uio->uio_bvec->bv_len) {
+ uio->uio_skip -= uio->uio_bvec->bv_len;
+ uio->uio_bvec++;
+ uio->uio_iovcnt--;
+ }
+ } else if (uio->uio_segflg == UIO_ITER) {
+ iov_iter_advance(uio->uio_iter, n);
+ } else {
+ ASSERT3S(uio->uio_segflg, ==, UIO_SYSSPACE);
+ uio->uio_skip += n;
+ while (uio->uio_iovcnt &&
+ uio->uio_skip >= uio->uio_iov->iov_len) {
+ uio->uio_skip -= uio->uio_iov->iov_len;
+ uio->uio_iov++;
+ uio->uio_iovcnt--;
+ }
+ }
+
+ uio->uio_loffset += n;
+ uio->uio_resid -= n;
+}
+EXPORT_SYMBOL(zfs_uioskip);
+
+/*
+ * Check if the uio is page-aligned in memory.
+ */
+boolean_t
+zfs_uio_page_aligned(zfs_uio_t *uio)
+{
+ boolean_t aligned = B_TRUE;
+
+ if (uio->uio_segflg == UIO_SYSSPACE) {
+ const struct iovec *iov = uio->uio_iov;
+ size_t skip = uio->uio_skip;
+
+ for (int i = uio->uio_iovcnt; i > 0; iov++, i--) {
+ uintptr_t addr = (uintptr_t)(iov->iov_base + skip);
+ size_t size = iov->iov_len - skip;
+ if ((addr & (PAGE_SIZE - 1)) ||
+ (size & (PAGE_SIZE - 1))) {
+ aligned = B_FALSE;
+ break;
+ }
+ skip = 0;
+ }
+ } else if (uio->uio_segflg == UIO_ITER) {
+ unsigned long alignment =
+ iov_iter_alignment(uio->uio_iter);
+ aligned = IS_P2ALIGNED(alignment, PAGE_SIZE);
+ } else {
+ /* Currently not supported */
+ aligned = B_FALSE;
+ }
+
+ return (aligned);
+}
+
+#if defined(HAVE_ZERO_PAGE_GPL_ONLY) || !defined(_LP64)
+#define ZFS_MARKED_PAGE 0x0
+#define IS_ZFS_MARKED_PAGE(_p) 0
+#define zfs_mark_page(_p)
+#define zfs_unmark_page(_p)
+#define IS_ZERO_PAGE(_p) 0
+
+#else
+/*
+ * Mark pages to know if they were allocated to replace ZERO_PAGE() for
+ * Direct I/O writes.
+ */
+#define ZFS_MARKED_PAGE 0x5a465350414745 /* ASCII: ZFSPAGE */
+#define IS_ZFS_MARKED_PAGE(_p) \
+ (page_private(_p) == (unsigned long)ZFS_MARKED_PAGE)
+#define IS_ZERO_PAGE(_p) ((_p) == ZERO_PAGE(0))
+
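+/*
+ * Take an extra reference on the page and tag its private field as a
+ * ZFS-allocated replacement for ZERO_PAGE().
+ */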
+static inline void
+zfs_mark_page(struct page *page)
+{
+ ASSERT3P(page, !=, NULL);
+ get_page(page);
+ SetPagePrivate(page);
+ set_page_private(page, ZFS_MARKED_PAGE);
+}
+
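+/*
+ * Clear the private page marking and drop the reference taken in
+ * zfs_mark_page().
+ */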
+static inline void
+zfs_unmark_page(struct page *page)
+{
+ ASSERT3P(page, !=, NULL);
+ set_page_private(page, 0UL);
+ ClearPagePrivate(page);
+ put_page(page);
+}
+#endif /* HAVE_ZERO_PAGE_GPL_ONLY || !_LP64 */
+
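+/*
+ * Scan the pinned Direct I/O pages and replace any reference to the
+ * kernel's shared ZERO_PAGE() with a private zero-filled page, which is
+ * marked so it can be freed later in zfs_uio_free_dio_pages().
+ */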
+static void
+zfs_uio_dio_check_for_zero_page(zfs_uio_t *uio)
+{
+ ASSERT3P(uio->uio_dio.pages, !=, NULL);
+
+ for (long i = 0; i < uio->uio_dio.npages; i++) {
+ struct page *p = uio->uio_dio.pages[i];
+ lock_page(p);
+
+ if (IS_ZERO_PAGE(p)) {
+ /*
+ * If the user page points at the kernel's ZERO_PAGE(),
+ * allocate a new zero-filled page so the contents of the
+ * page cannot be changed by the user while a Direct I/O
+ * write is taking place.
+ */
+ gfp_t gfp_zero_page = __GFP_NOWARN | GFP_NOIO |
+ __GFP_ZERO | GFP_KERNEL;
+
+ ASSERT0(IS_ZFS_MARKED_PAGE(p));
+ unlock_page(p);
+ put_page(p);
+
+ uio->uio_dio.pages[i] =
+ __page_cache_alloc(gfp_zero_page);
+ zfs_mark_page(uio->uio_dio.pages[i]);
+ } else {
+ unlock_page(p);
+ }
+ }
+}
+
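+/*
+ * Release the Direct I/O pages: unpin or put each page (freeing any
+ * marked replacement zero pages) and free the page array itself.
+ */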
+void
+zfs_uio_free_dio_pages(zfs_uio_t *uio, zfs_uio_rw_t rw)
+{
+ ASSERT(uio->uio_extflg & UIO_DIRECT);
+ ASSERT3P(uio->uio_dio.pages, !=, NULL);
+
+ if (uio->uio_dio.pinned) {
+#if defined(HAVE_PIN_USER_PAGES_UNLOCKED)
+ unpin_user_pages(uio->uio_dio.pages, uio->uio_dio.npages);
+#endif
+ } else {
+ for (long i = 0; i < uio->uio_dio.npages; i++) {
+ struct page *p = uio->uio_dio.pages[i];
+
+ if (IS_ZFS_MARKED_PAGE(p)) {
+ zfs_unmark_page(p);
+ __free_page(p);
+ continue;
+ }
+
+ put_page(p);
+ }
+ }
+
+ vmem_free(uio->uio_dio.pages,
+ uio->uio_dio.npages * sizeof (struct page *));
+}
+
+#if defined(HAVE_PIN_USER_PAGES_UNLOCKED)
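+/*
+ * Pin the user pages backing an iov_iter-based uio with
+ * pin_user_pages_unlocked(), recording them in uio->uio_dio.pages.
+ */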
+static int
+zfs_uio_pin_user_pages(zfs_uio_t *uio, zfs_uio_rw_t rw)
+{
+ long res;
+ size_t skip = uio->uio_iter->iov_offset;
+ size_t len = uio->uio_resid - skip;
+ unsigned int gup_flags = 0;
+ unsigned long addr;
+ unsigned long nr_pages;
+
+ ASSERT3U(uio->uio_segflg, ==, UIO_ITER);
+
+ /*
+ * Kernel 6.2 introduced the FOLL_PCI_P2PDMA flag. This flag could
+ * possibly be used here in the future to allow for P2P operations with
+ * user pages.
+ */
+ if (rw == UIO_READ)
+ gup_flags = FOLL_WRITE;
+
+ if (len == 0)
+ return (0);
+
+ uio->uio_dio.pinned = B_TRUE;
+#if defined(HAVE_ITER_IS_UBUF)
+ if (iter_is_ubuf(uio->uio_iter)) {
+ nr_pages = DIV_ROUND_UP(len, PAGE_SIZE);
+ addr = (unsigned long)uio->uio_iter->ubuf + skip;
+ res = pin_user_pages_unlocked(addr, nr_pages,
+ &uio->uio_dio.pages[uio->uio_dio.npages], gup_flags);
+ if (res < 0) {
+ return (SET_ERROR(-res));
+ } else if (len != (res * PAGE_SIZE)) {
+ uio->uio_dio.npages += res;
+ return (SET_ERROR(EFAULT));
+ }
+ uio->uio_dio.npages += res;
+ return (0);
+ }
+#endif
+ const struct iovec *iovp = zfs_uio_iter_iov(uio->uio_iter);
+ for (int i = 0; i < uio->uio_iovcnt; i++) {
+ size_t amt = iovp->iov_len - skip;
+ if (amt == 0) {
+ iovp++;
+ skip = 0;
+ continue;
+ }
+
+ addr = (unsigned long)iovp->iov_base + skip;
+ nr_pages = DIV_ROUND_UP(amt, PAGE_SIZE);
+ res = pin_user_pages_unlocked(addr, nr_pages,
+ &uio->uio_dio.pages[uio->uio_dio.npages], gup_flags);
+ if (res < 0) {
+ return (SET_ERROR(-res));
+ } else if (amt != (res * PAGE_SIZE)) {
+ uio->uio_dio.npages += res;
+ return (SET_ERROR(EFAULT));
+ }
+
+ len -= amt;
+ uio->uio_dio.npages += res;
+ skip = 0;
+ iovp++;
+ }
+
+ ASSERT0(len);
+
+ return (0);
+}
+#endif
+
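+/*
+ * Take references on the pages backing the iov_iter using
+ * iov_iter_get_pages()/iov_iter_get_pages2(), then rewind the iterator
+ * so the uio remains unchanged.
+ */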
+static int
+zfs_uio_get_dio_pages_iov_iter(zfs_uio_t *uio, zfs_uio_rw_t rw)
+{
+ size_t start;
+ size_t wanted = uio->uio_resid;
+ ssize_t rollback = 0;
+ ssize_t cnt;
+ unsigned maxpages = DIV_ROUND_UP(wanted, PAGE_SIZE);
+
+ while (wanted) {
+#if defined(HAVE_IOV_ITER_GET_PAGES2)
+ cnt = iov_iter_get_pages2(uio->uio_iter,
+ &uio->uio_dio.pages[uio->uio_dio.npages],
+ wanted, maxpages, &start);
+#else
+ cnt = iov_iter_get_pages(uio->uio_iter,
+ &uio->uio_dio.pages[uio->uio_dio.npages],
+ wanted, maxpages, &start);
+#endif
+ if (cnt < 0) {
+ iov_iter_revert(uio->uio_iter, rollback);
+ return (SET_ERROR(-cnt));
+ }
+ /*
+ * All Direct I/O operations must be page aligned.
+ */
+ ASSERT(IS_P2ALIGNED(start, PAGE_SIZE));
+ uio->uio_dio.npages += DIV_ROUND_UP(cnt, PAGE_SIZE);
+ rollback += cnt;
+ wanted -= cnt;
+#if !defined(HAVE_IOV_ITER_GET_PAGES2)
+ /*
+ * iov_iter_get_pages2() advances the iov_iter itself, but the
+ * older iov_iter_get_pages() does not, so advance it manually.
+ */
+ iov_iter_advance(uio->uio_iter, cnt);
+#endif
+
+ }
+ ASSERT3U(rollback, ==, uio->uio_resid);
+ iov_iter_revert(uio->uio_iter, rollback);
+
+ return (0);
+}
+
+/*
+ * Allocate the Direct I/O page array and pin (or take references on) the
+ * pages backing the uio. If the pages cannot be pinned, the array is freed
+ * and an error value is returned.
+ *
+ * On success, 0 is returned.
+ */
+int
+zfs_uio_get_dio_pages_alloc(zfs_uio_t *uio, zfs_uio_rw_t rw)
+{
+ int error = 0;
+ long npages = DIV_ROUND_UP(uio->uio_resid, PAGE_SIZE);
+ size_t size = npages * sizeof (struct page *);
+
+ if (uio->uio_segflg == UIO_ITER) {
+ uio->uio_dio.pages = vmem_alloc(size, KM_SLEEP);
+#if defined(HAVE_PIN_USER_PAGES_UNLOCKED)
+ if (zfs_user_backed_iov_iter(uio->uio_iter))
+ error = zfs_uio_pin_user_pages(uio, rw);
+ else
+ error = zfs_uio_get_dio_pages_iov_iter(uio, rw);
+#else
+ error = zfs_uio_get_dio_pages_iov_iter(uio, rw);
+#endif
+ } else {
+ return (SET_ERROR(EOPNOTSUPP));
+ }
+
+ ASSERT3S(uio->uio_dio.npages, >=, 0);
+
+ if (error) {
+ if (uio->uio_dio.pinned) {
+#if defined(HAVE_PIN_USER_PAGES_UNLOCKED)
+ unpin_user_pages(uio->uio_dio.pages,
+ uio->uio_dio.npages);
+#endif
+ } else {
+ for (long i = 0; i < uio->uio_dio.npages; i++)
+ put_page(uio->uio_dio.pages[i]);
+ }
+
+ vmem_free(uio->uio_dio.pages, size);
+ return (error);
+ } else {
+ ASSERT3S(uio->uio_dio.npages, ==, npages);
+ }
+
+ if (rw == UIO_WRITE && !uio->uio_dio.pinned)
+ zfs_uio_dio_check_for_zero_page(uio);
+
+ uio->uio_extflg |= UIO_DIRECT;
+
+ return (0);
+}
+
+#endif /* _KERNEL */