Diffstat (limited to 'sys/contrib/openzfs/module')
| -rw-r--r-- | sys/contrib/openzfs/module/icp/algs/sha2/sha2_generic.c | 41 |
| -rw-r--r-- | sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c | 3 |
| -rw-r--r-- | sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c | 31 |
| -rw-r--r-- | sys/contrib/openzfs/module/os/freebsd/zfs/zio_crypt.c | 2 |
| -rw-r--r-- | sys/contrib/openzfs/module/os/linux/spl/spl-zone.c | 19 |
| -rw-r--r-- | sys/contrib/openzfs/module/os/linux/zfs/abd_os.c | 9 |
| -rw-r--r-- | sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c | 2 |
| -rw-r--r-- | sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c | 5 |
| -rw-r--r-- | sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c | 74 |
| -rw-r--r-- | sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c | 4 |
| -rw-r--r-- | sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c | 109 |
| -rw-r--r-- | sys/contrib/openzfs/module/zfs/arc.c | 16 |
| -rw-r--r-- | sys/contrib/openzfs/module/zfs/vdev_removal.c | 80 |
| -rw-r--r-- | sys/contrib/openzfs/module/zfs/zvol.c | 2 |
14 files changed, 300 insertions, 97 deletions
| diff --git a/sys/contrib/openzfs/module/icp/algs/sha2/sha2_generic.c b/sys/contrib/openzfs/module/icp/algs/sha2/sha2_generic.c index d0fcca798fa9..ad707341eec7 100644 --- a/sys/contrib/openzfs/module/icp/algs/sha2/sha2_generic.c +++ b/sys/contrib/openzfs/module/icp/algs/sha2/sha2_generic.c @@ -77,7 +77,8 @@ static const uint32_t SHA256_K[64] = {  	h = g, g = f, f = e, e = d + T1; \  	d = c, c = b, b = a, a = T1 + T2; -static void sha256_generic(uint32_t state[8], const void *data, size_t num_blks) +static void +icp_sha256_generic(uint32_t state[8], const void *data, size_t num_blks)  {  	uint64_t blk; @@ -173,7 +174,8 @@ static const uint64_t SHA512_K[80] = {  	0x5fcb6fab3ad6faec, 0x6c44198c4a475817  }; -static void sha512_generic(uint64_t state[8], const void *data, size_t num_blks) +static void +icp_sha512_generic(uint64_t state[8], const void *data, size_t num_blks)  {  	uint64_t blk; @@ -226,7 +228,8 @@ static void sha512_generic(uint64_t state[8], const void *data, size_t num_blks)  	}  } -static void sha256_update(sha256_ctx *ctx, const uint8_t *data, size_t len) +static void +icp_sha256_update(sha256_ctx *ctx, const uint8_t *data, size_t len)  {  	uint64_t pos = ctx->count[0];  	uint64_t total = ctx->count[1]; @@ -258,7 +261,8 @@ static void sha256_update(sha256_ctx *ctx, const uint8_t *data, size_t len)  	ctx->count[1] = total;  } -static void sha512_update(sha512_ctx *ctx, const uint8_t *data, size_t len) +static void +icp_sha512_update(sha512_ctx *ctx, const uint8_t *data, size_t len)  {  	uint64_t pos = ctx->count[0];  	uint64_t total = ctx->count[1]; @@ -290,7 +294,8 @@ static void sha512_update(sha512_ctx *ctx, const uint8_t *data, size_t len)  	ctx->count[1] = total;  } -static void sha256_final(sha256_ctx *ctx, uint8_t *result, int bits) +static void +icp_sha256_final(sha256_ctx *ctx, uint8_t *result, int bits)  {  	uint64_t mlen, pos = ctx->count[0];  	uint8_t *m = ctx->wbuf; @@ -334,7 +339,8 @@ static void sha256_final(sha256_ctx *ctx, uint8_t *result, int bits)  	memset(ctx, 0, sizeof (*ctx));  } -static void sha512_final(sha512_ctx *ctx, uint8_t *result, int bits) +static void +icp_sha512_final(sha512_ctx *ctx, uint8_t *result, int bits)  {  	uint64_t mlen, pos = ctx->count[0];  	uint8_t *m = ctx->wbuf, *r; @@ -461,14 +467,14 @@ SHA2Update(SHA2_CTX *ctx, const void *data, size_t len)  	switch (ctx->algotype) {  		case SHA256: -			sha256_update(&ctx->sha256, data, len); +			icp_sha256_update(&ctx->sha256, data, len);  			break;  		case SHA512:  		case SHA512_HMAC_MECH_INFO_TYPE: -			sha512_update(&ctx->sha512, data, len); +			icp_sha512_update(&ctx->sha512, data, len);  			break;  		case SHA512_256: -			sha512_update(&ctx->sha512, data, len); +			icp_sha512_update(&ctx->sha512, data, len);  			break;  	}  } @@ -479,32 +485,33 @@ SHA2Final(void *digest, SHA2_CTX *ctx)  {  	switch (ctx->algotype) {  		case SHA256: -			sha256_final(&ctx->sha256, digest, 256); +			icp_sha256_final(&ctx->sha256, digest, 256);  			break;  		case SHA512:  		case SHA512_HMAC_MECH_INFO_TYPE: -			sha512_final(&ctx->sha512, digest, 512); +			icp_sha512_final(&ctx->sha512, digest, 512);  			break;  		case SHA512_256: -			sha512_final(&ctx->sha512, digest, 256); +			icp_sha512_final(&ctx->sha512, digest, 256);  			break;  	}  }  /* the generic implementation is always okay */ -static boolean_t sha2_is_supported(void) +static boolean_t +icp_sha2_is_supported(void)  {  	return (B_TRUE);  }  const sha256_ops_t sha256_generic_impl = {  	.name = "generic", -	.transform = sha256_generic, -	.is_supported = 
sha2_is_supported +	.transform = icp_sha256_generic, +	.is_supported = icp_sha2_is_supported  };  const sha512_ops_t sha512_generic_impl = {  	.name = "generic", -	.transform = sha512_generic, -	.is_supported = sha2_is_supported +	.transform = icp_sha512_generic, +	.is_supported = icp_sha2_is_supported  }; diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c index 4de48e013ec4..d0a9c662e6f0 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c @@ -762,8 +762,7 @@ zfsctl_common_pathconf(struct vop_pathconf_args *ap)  		return (0);  	case _PC_MIN_HOLE_SIZE: -		*ap->a_retval = (int)SPA_MINBLOCKSIZE; -		return (0); +		return (EINVAL);  	case _PC_ACL_EXTENDED:  		*ap->a_retval = 0; diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c index 411225786089..f34a2fd37a77 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c @@ -4116,6 +4116,7 @@ zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,  {  	znode_t *zp;  	zfsvfs_t *zfsvfs; +	uint_t blksize, iosize;  	int error;  	switch (cmd) { @@ -4127,8 +4128,20 @@ zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,  		*valp = 64;  		return (0);  	case _PC_MIN_HOLE_SIZE: -		*valp = (int)SPA_MINBLOCKSIZE; -		return (0); +		iosize = vp->v_mount->mnt_stat.f_iosize; +		if (vp->v_type == VREG) { +			zp = VTOZ(vp); +			blksize = zp->z_blksz; +			if (zp->z_size <= blksize) +				blksize = MAX(blksize, iosize); +			*valp = (int)blksize; +			return (0); +		} +		if (vp->v_type == VDIR) { +			*valp = (int)iosize; +			return (0); +		} +		return (EINVAL);  	case _PC_ACL_EXTENDED:  #if 0		/* POSIX ACLs are not implemented for ZFS on FreeBSD yet. */  		zp = VTOZ(vp); @@ -4210,8 +4223,20 @@ zfs_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind,  			zfs_vmobject_wlock(object);  			(void) vm_page_grab_pages(object, OFF_TO_IDX(start), -			    VM_ALLOC_NORMAL | VM_ALLOC_WAITOK | VM_ALLOC_ZERO, +			    VM_ALLOC_NORMAL | VM_ALLOC_WAITOK,  			    ma, count); +			if (!vm_page_all_valid(ma[count - 1])) { +				/* +				 * Later in this function, we copy DMU data to +				 * invalid pages only. The last page may not be +				 * entirely filled though, if the file does not +				 * end on a page boundary. Therefore, we zero +				 * that last page here to make sure it does not +				 * contain garbage after the end of file. 
+				 */ +				ASSERT(vm_page_none_valid(ma[count - 1])); +				vm_page_zero_invalid(ma[count - 1], FALSE); +			}  			zfs_vmobject_wunlock(object);  		}  		if (blksz == zp->z_blksz) diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zio_crypt.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zio_crypt.c index 91cf38016e00..8562c42b3220 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zio_crypt.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zio_crypt.c @@ -437,6 +437,7 @@ zio_crypt_key_wrap(crypto_key_t *cwkey, zio_crypt_key_t *key, uint8_t *iv,  	ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS); +	memset(&cuio_s, 0, sizeof (cuio_s));  	zfs_uio_init(&cuio, &cuio_s);  	keydata_len = zio_crypt_table[crypt].ci_keylen; @@ -519,6 +520,7 @@ zio_crypt_key_unwrap(crypto_key_t *cwkey, uint64_t crypt, uint64_t version,  	keydata_len = zio_crypt_table[crypt].ci_keylen;  	rw_init(&key->zk_salt_lock, NULL, RW_DEFAULT, NULL); +	memset(&cuio_s, 0, sizeof (cuio_s));  	zfs_uio_init(&cuio, &cuio_s);  	/* diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-zone.c b/sys/contrib/openzfs/module/os/linux/spl/spl-zone.c index 45c2999a4bb1..b2eae5d00b10 100644 --- a/sys/contrib/openzfs/module/os/linux/spl/spl-zone.c +++ b/sys/contrib/openzfs/module/os/linux/spl/spl-zone.c @@ -25,6 +25,10 @@   * SUCH DAMAGE.   */ +/* + * Copyright (c) 2025, Rob Norris <robn@despairlabs.com> + */ +  #include <sys/types.h>  #include <sys/sysmacros.h>  #include <sys/kmem.h> @@ -56,6 +60,19 @@ typedef struct zone_dataset {  } zone_dataset_t;  #ifdef CONFIG_USER_NS + +/* + * Linux 6.18 moved the generic namespace type away from ns->ops->type onto + * ns_common itself. + */ +#ifdef HAVE_NS_COMMON_TYPE +#define	ns_is_newuser(ns)	\ +	((ns)->ns_type == CLONE_NEWUSER) +#else +#define	ns_is_newuser(ns)	\ +	((ns)->ops != NULL && (ns)->ops->type == CLONE_NEWUSER) +#endif +  /*   * Returns:   * - 0 on success @@ -84,7 +101,7 @@ user_ns_get(int fd, struct user_namespace **userns)  		goto done;  	}  	ns = get_proc_ns(file_inode(nsfile)); -	if (ns->ops->type != CLONE_NEWUSER) { +	if (!ns_is_newuser(ns)) {  		error = ENOTTY;  		goto done;  	} diff --git a/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c b/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c index 8a8316f63c48..18f2426fbbfc 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c @@ -23,6 +23,7 @@   * Copyright (c) 2014 by Chunwei Chen. All rights reserved.   * Copyright (c) 2019 by Delphix. All rights reserved.   * Copyright (c) 2023, 2024, Klara Inc. + * Copyright (c) 2025, Rob Norris <robn@despairlabs.com>   */  /* @@ -1109,6 +1110,14 @@ abd_return_buf_copy(abd_t *abd, void *buf, size_t n)  #define	ABD_ITER_PAGE_SIZE(page)	(PAGESIZE)  #endif +#ifndef nth_page +/* + * Since 6.18 nth_page() no longer exists, and is no longer required to iterate + * within a single SG entry, so we replace it with a simple addition. + */ +#define	nth_page(p, n)	((p)+(n)) +#endif +  void  abd_iter_page(struct abd_iter *aiter)  { diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c index daa4b5776837..934d74a112fd 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c @@ -2524,7 +2524,7 @@ zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode,  	 * Also note: DOS R/O is ignored for directories.  	 
*/  	if ((v4_mode & WRITE_MASK_DATA) && -	    S_ISDIR(ZTOI(zp)->i_mode) && +	    !S_ISDIR(ZTOI(zp)->i_mode) &&  	    (zp->z_pflags & ZFS_READONLY)) {  		return (SET_ERROR(EPERM));  	} diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c index 6106726651a3..e845ad69ad78 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c @@ -2033,10 +2033,7 @@ zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr, zidmap_t *mnt_ns)  		goto out3;  	} -	if ((mask & ATTR_SIZE) && (zp->z_pflags & ZFS_READONLY)) { -		err = SET_ERROR(EPERM); -		goto out3; -	} +	/* ZFS_READONLY will be handled in zfs_zaccess() */  	/*  	 * Verify timestamps doesn't overflow 32 bits. diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c index d07317b0d910..02965ac8cbee 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c @@ -23,6 +23,7 @@   * Copyright (c) 2011, Lawrence Livermore National Security, LLC.   * Copyright (c) 2015 by Chunwei Chen. All rights reserved.   * Copyright (c) 2025, Klara, Inc. + * Copyright (c) 2025, Rob Norris <robn@despairlabs.com>   */ @@ -478,6 +479,7 @@ zpl_putpage(struct page *pp, struct writeback_control *wbc, void *data)  	return (ret);  } +#ifdef HAVE_WRITE_CACHE_PAGES  #ifdef HAVE_WRITEPAGE_T_FOLIO  static int  zpl_putfolio(struct folio *pp, struct writeback_control *wbc, void *data) @@ -499,6 +501,78 @@ zpl_write_cache_pages(struct address_space *mapping,  #endif  	return (result);  } +#else +static inline int +zpl_write_cache_pages(struct address_space *mapping, +    struct writeback_control *wbc, void *data) +{ +	pgoff_t start = wbc->range_start >> PAGE_SHIFT; +	pgoff_t end = wbc->range_end >> PAGE_SHIFT; + +	struct folio_batch fbatch; +	folio_batch_init(&fbatch); + +	/* +	 * This atomically (-ish) tags all DIRTY pages in the range with +	 * TOWRITE, allowing users to continue dirtying or undirtying pages +	 * while we get on with writeback, without us treading on each other. +	 */ +	tag_pages_for_writeback(mapping, start, end); + +	int err = 0; +	unsigned int npages; + +	/* +	 * Grab references to the TOWRITE pages just flagged. This may not get +	 * all of them, so we do it in a loop until there are none left. +	 */ +	while ((npages = filemap_get_folios_tag(mapping, &start, end, +	    PAGECACHE_TAG_TOWRITE, &fbatch)) != 0) { + +		/* Loop over each page and write it out. */ +		struct folio *folio; +		while ((folio = folio_batch_next(&fbatch)) != NULL) { +			folio_lock(folio); + +			/* +			 * If the folio has been remapped, or is no longer +			 * dirty, then there's nothing to do. +			 */ +			if (folio->mapping != mapping || +			    !folio_test_dirty(folio)) { +				folio_unlock(folio); +				continue; +			} + +			/* +			 * If writeback is already in progress, wait for it to +			 * finish. We continue after this even if the page +			 * ends up clean; zfs_putpage() will skip it if no +			 * further work is required. +			 */ +			while (folio_test_writeback(folio)) +				folio_wait_bit(folio, PG_writeback); + +			/* +			 * Write it out and collect any error. zfs_putpage() +			 * will clear the TOWRITE and DIRTY flags, and return +			 * with the page unlocked. +			 */ +			int ferr = zpl_putpage(&folio->page, wbc, data); +			if (err == 0 && ferr != 0) +				err = ferr; + +			/* Housekeeping for the caller. 
*/ +			wbc->nr_to_write -= folio_nr_pages(folio); +		} + +		/* Release any remaining references on the batch. */ +		folio_batch_release(&fbatch); +	} + +	return (err); +} +#endif  static int  zpl_writepages(struct address_space *mapping, struct writeback_control *wbc) diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c index 444948d03cb3..347b352506e5 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c @@ -23,6 +23,7 @@   * Copyright (c) 2011, Lawrence Livermore National Security, LLC.   * Copyright (c) 2023, Datto Inc. All rights reserved.   * Copyright (c) 2025, Klara, Inc. + * Copyright (c) 2025, Rob Norris <robn@despairlabs.com>   */ @@ -33,6 +34,7 @@  #include <sys/zpl.h>  #include <linux/iversion.h>  #include <linux/version.h> +#include <linux/vfs_compat.h>  /*   * What to do when the last reference to an inode is released. If 0, the kernel @@ -104,7 +106,7 @@ zpl_dirty_inode(struct inode *ip, int flags)   * reporting memory pressure and requests OpenZFS release some memory (see   * zfs_prune()).   * - * When set to 1, we call generic_delete_node(), which always returns "destroy + * When set to 1, we call generic_delete_inode(), which always returns "destroy   * immediately", resulting in inodes being destroyed immediately, releasing   * their associated dnodes and dbufs to the dbuf cached and the ARC to be   * evicted as normal. diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c index 967a018640e1..fe939150b641 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c @@ -21,7 +21,7 @@   */  /*   * Copyright (c) 2012, 2020 by Delphix. All rights reserved. - * Copyright (c) 2024, Rob Norris <robn@despairlabs.com> + * Copyright (c) 2024, 2025, Rob Norris <robn@despairlabs.com>   * Copyright (c) 2024, 2025, Klara, Inc.   */ @@ -337,16 +337,14 @@ zvol_discard(zv_request_t *zvr)  	}  	/* -	 * Align the request to volume block boundaries when a secure erase is -	 * not required.  This will prevent dnode_free_range() from zeroing out -	 * the unaligned parts which is slow (read-modify-write) and useless -	 * since we are not freeing any space by doing so. +	 * Align the request to volume block boundaries. This will prevent +	 * dnode_free_range() from zeroing out the unaligned parts which is +	 * slow (read-modify-write) and useless since we are not freeing any +	 * space by doing so.  	 */ -	if (!io_is_secure_erase(bio, rq)) { -		start = P2ROUNDUP(start, zv->zv_volblocksize); -		end = P2ALIGN_TYPED(end, zv->zv_volblocksize, uint64_t); -		size = end - start; -	} +	start = P2ROUNDUP(start, zv->zv_volblocksize); +	end = P2ALIGN_TYPED(end, zv->zv_volblocksize, uint64_t); +	size = end - start;  	if (start >= end)  		goto unlock; @@ -467,6 +465,24 @@ zvol_read_task(void *arg)  	zv_request_task_free(task);  } +/* + * Note: + * + * The kernel uses different enum names for the IO opcode, depending on the + * kernel version ('req_opf', 'req_op').  To sidestep this, use macros rather + * than inline functions for these checks. + */ +/* Should this IO go down the zvol write path? */ +#define	ZVOL_OP_IS_WRITE(op) \ +	(op == REQ_OP_WRITE || \ +	op == REQ_OP_FLUSH || \ +	op == REQ_OP_DISCARD) + +/* Is this IO type supported by zvols? 
*/ +#define	ZVOL_OP_IS_SUPPORTED(op) (op == REQ_OP_READ || ZVOL_OP_IS_WRITE(op)) + +/* Get the IO opcode */ +#define	ZVOL_OP(bio, rq) (bio != NULL ? bio_op(bio) : req_op(rq))  /*   * Process a BIO or request @@ -486,27 +502,32 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,  	uint64_t size = io_size(bio, rq);  	int rw; -	if (rq != NULL) { -		/* -		 * Flush & trim requests go down the zvol_write codepath.  Or -		 * more specifically: -		 * -		 * If request is a write, or if it's op_is_sync() and not a -		 * read, or if it's a flush, or if it's a discard, then send the -		 * request down the write path. -		 */ -		if (op_is_write(rq->cmd_flags) || -		    (op_is_sync(rq->cmd_flags) && req_op(rq) != REQ_OP_READ) || -		    req_op(rq) == REQ_OP_FLUSH || -		    op_is_discard(rq->cmd_flags)) { -			rw = WRITE; -		} else { -			rw = READ; -		} +	if (unlikely(!ZVOL_OP_IS_SUPPORTED(ZVOL_OP(bio, rq)))) { +		zfs_dbgmsg("Unsupported zvol %s, op=%d, flags=0x%x", +		    rq != NULL ? "request" : "BIO", +		    ZVOL_OP(bio, rq), +		    rq != NULL ? rq->cmd_flags : bio->bi_opf); +		ASSERT(ZVOL_OP_IS_SUPPORTED(ZVOL_OP(bio, rq))); +		zvol_end_io(bio, rq, SET_ERROR(ENOTSUPP)); +		goto out; +	} + +	if (ZVOL_OP_IS_WRITE(ZVOL_OP(bio, rq))) { +		rw = WRITE;  	} else { -		rw = bio_data_dir(bio); +		rw = READ;  	} +	/* +	 * Sanity check +	 * +	 * If we're a BIO, check our rw matches the kernel's +	 * bio_data_dir(bio) rw.  We need to check because we support fewer +	 * IO operations, and want to verify that what we think are reads and +	 * writes from those operations match what the kernel thinks. +	 */ +	ASSERT(rq != NULL || rw == bio_data_dir(bio)); +  	if (unlikely(zv->zv_flags & ZVOL_REMOVING)) {  		zvol_end_io(bio, rq, SET_ERROR(ENXIO));  		goto out; @@ -610,7 +631,7 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,  		 * interfaces lack this functionality (they block waiting for  		 * the i/o to complete).  		 */ -		if (io_is_discard(bio, rq) || io_is_secure_erase(bio, rq)) { +		if (io_is_discard(bio, rq)) {  			if (force_sync) {  				zvol_discard(&zvr);  			} else { @@ -1011,12 +1032,12 @@ zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize)   * tiny devices.  For devices over 1 Mib a standard head and sector count   * is used to keep the cylinders count reasonable.   */ -static int -zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo) +static inline int +zvol_getgeo_impl(struct gendisk *disk, struct hd_geometry *geo)  { +	zvol_state_t *zv = atomic_load_ptr(&disk->private_data);  	sector_t sectors; -	zvol_state_t *zv = atomic_load_ptr(&bdev->bd_disk->private_data);  	ASSERT3P(zv, !=, NULL);  	ASSERT3U(zv->zv_open_count, >, 0); @@ -1036,6 +1057,20 @@ zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo)  	return (0);  } +#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_GETGEO_GENDISK +static int +zvol_getgeo(struct gendisk *disk, struct hd_geometry *geo) +{ +	return (zvol_getgeo_impl(disk, geo)); +} +#else +static int +zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo) +{ +	return (zvol_getgeo_impl(bdev->bd_disk, geo)); +} +#endif +  /*   * Why have two separate block_device_operations structs?   
* @@ -1479,7 +1514,7 @@ zvol_os_remove_minor(zvol_state_t *zv)  	if (zso->use_blk_mq)  		blk_mq_free_tag_set(&zso->tag_set); -	ida_simple_remove(&zvol_ida, MINOR(zso->zvo_dev) >> ZVOL_MINOR_BITS); +	ida_free(&zvol_ida, MINOR(zso->zvo_dev) >> ZVOL_MINOR_BITS);  	kmem_free(zso, sizeof (struct zvol_state_os)); @@ -1634,7 +1669,7 @@ zvol_os_create_minor(const char *name)  	if (zvol_inhibit_dev)  		return (0); -	idx = ida_simple_get(&zvol_ida, 0, 0, kmem_flags_convert(KM_SLEEP)); +	idx = ida_alloc(&zvol_ida, kmem_flags_convert(KM_SLEEP));  	if (idx < 0)  		return (SET_ERROR(-idx));  	minor = idx << ZVOL_MINOR_BITS; @@ -1642,7 +1677,7 @@ zvol_os_create_minor(const char *name)  		/* too many partitions can cause an overflow */  		zfs_dbgmsg("zvol: create minor overflow: %s, minor %u/%u",  		    name, minor, MINOR(minor)); -		ida_simple_remove(&zvol_ida, idx); +		ida_free(&zvol_ida, idx);  		return (SET_ERROR(EINVAL));  	} @@ -1650,7 +1685,7 @@ zvol_os_create_minor(const char *name)  	if (zv) {  		ASSERT(MUTEX_HELD(&zv->zv_state_lock));  		mutex_exit(&zv->zv_state_lock); -		ida_simple_remove(&zvol_ida, idx); +		ida_free(&zvol_ida, idx);  		return (SET_ERROR(EEXIST));  	} @@ -1750,7 +1785,7 @@ out_doi:  		rw_exit(&zvol_state_lock);  		error = zvol_os_add_disk(zv->zv_zso->zvo_disk);  	} else { -		ida_simple_remove(&zvol_ida, idx); +		ida_free(&zvol_ida, idx);  	}  	return (error); diff --git a/sys/contrib/openzfs/module/zfs/arc.c b/sys/contrib/openzfs/module/zfs/arc.c index b677f90280d7..dbb5e942e2e6 100644 --- a/sys/contrib/openzfs/module/zfs/arc.c +++ b/sys/contrib/openzfs/module/zfs/arc.c @@ -1157,7 +1157,7 @@ buf_fini(void)  #if defined(_KERNEL)  	/*  	 * Large allocations which do not require contiguous pages -	 * should be using vmem_free() in the linux kernel\ +	 * should be using vmem_free() in the linux kernel.  	 */  	vmem_free(buf_hash_table.ht_table,  	    (buf_hash_table.ht_mask + 1) * sizeof (void *)); @@ -4651,10 +4651,10 @@ arc_flush_task(void *arg)  	arc_flush_impl(spa_guid, B_FALSE);  	arc_async_flush_remove(spa_guid, af->af_cache_level); -	uint64_t elaspsed = NSEC2MSEC(gethrtime() - start_time); -	if (elaspsed > 0) { +	uint64_t elapsed = NSEC2MSEC(gethrtime() - start_time); +	if (elapsed > 0) {  		zfs_dbgmsg("spa %llu arc flushed in %llu ms", -		    (u_longlong_t)spa_guid, (u_longlong_t)elaspsed); +		    (u_longlong_t)spa_guid, (u_longlong_t)elapsed);  	}  } @@ -9152,7 +9152,7 @@ top:  		if (dev->l2ad_first) {  			/*  			 * This is the first sweep through the device. There is -			 * nothing to evict. We have already trimmmed the +			 * nothing to evict. We have already trimmed the  			 * whole device.  			 
*/  			goto out; @@ -10086,12 +10086,12 @@ l2arc_device_teardown(void *arg)  	kmem_free(remdev->l2ad_dev_hdr, remdev->l2ad_dev_hdr_asize);  	vmem_free(remdev, sizeof (l2arc_dev_t)); -	uint64_t elaspsed = NSEC2MSEC(gethrtime() - start_time); -	if (elaspsed > 0) { +	uint64_t elapsed = NSEC2MSEC(gethrtime() - start_time); +	if (elapsed > 0) {  		zfs_dbgmsg("spa %llu, vdev %llu removed in %llu ms",  		    (u_longlong_t)rva->rva_spa_gid,  		    (u_longlong_t)rva->rva_vdev_gid, -		    (u_longlong_t)elaspsed); +		    (u_longlong_t)elapsed);  	}  	if (rva->rva_async) diff --git a/sys/contrib/openzfs/module/zfs/vdev_removal.c b/sys/contrib/openzfs/module/zfs/vdev_removal.c index 2f7a739da241..abb71543e3ab 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_removal.c +++ b/sys/contrib/openzfs/module/zfs/vdev_removal.c @@ -51,34 +51,70 @@  #include <sys/trace_zfs.h>  /* - * This file contains the necessary logic to remove vdevs from a - * storage pool.  Currently, the only devices that can be removed - * are log, cache, and spare devices; and top level vdevs from a pool - * w/o raidz or mirrors.  (Note that members of a mirror can be removed - * by the detach operation.) + * This file contains the necessary logic to remove vdevs from a storage + * pool. Note that members of a mirror can be removed by the detach + * operation. Currently, the only devices that can be removed are:   * - * Log vdevs are removed by evacuating them and then turning the vdev - * into a hole vdev while holding spa config locks. + * 1) Traditional hot spare and cache vdevs. Note that draid distributed + *    spares are fixed at creation time and cannot be removed.   * - * Top level vdevs are removed and converted into an indirect vdev via - * a multi-step process: + * 2) Log vdevs are removed by evacuating them and then turning the vdev + *    into a hole vdev while holding spa config locks.   * - *  - Disable allocations from this device (spa_vdev_remove_top). + * 3) Top-level singleton and mirror vdevs, including dedup and special + *    vdevs, are removed and converted into an indirect vdev via a + *    multi-step process:   * - *  - From a new thread (spa_vdev_remove_thread), copy data from - *    the removing vdev to a different vdev.  The copy happens in open - *    context (spa_vdev_copy_impl) and issues a sync task - *    (vdev_mapping_sync) so the sync thread can update the partial - *    indirect mappings in core and on disk. + *    - Disable allocations from this device (spa_vdev_remove_top).   * - *  - If a free happens during a removal, it is freed from the - *    removing vdev, and if it has already been copied, from the new - *    location as well (free_from_removing_vdev). + *    - From a new thread (spa_vdev_remove_thread), copy data from the + *      removing vdev to a different vdev. The copy happens in open context + *      (spa_vdev_copy_impl) and issues a sync task (vdev_mapping_sync) so + *      the sync thread can update the partial indirect mappings in core + *      and on disk.   * - *  - After the removal is completed, the copy thread converts the vdev - *    into an indirect vdev (vdev_remove_complete) before instructing - *    the sync thread to destroy the space maps and finish the removal - *    (spa_finish_removal). + *    - If a free happens during a removal, it is freed from the removing + *      vdev, and if it has already been copied, from the new location as + *      well (free_from_removing_vdev). 
+ * + *    - After the removal is completed, the copy thread converts the vdev + *      into an indirect vdev (vdev_remove_complete) before instructing + *      the sync thread to destroy the space maps and finish the removal + *      (spa_finish_removal). + * + *   The following constraints currently apply primary device removal: + * + *     - All vdevs must be online, healthy, and not be missing any data + *       according to the DTLs. + * + *     - When removing a singleton or mirror vdev, regardless of it's a + *       special, dedup, or primary device, it must have the same ashift + *       as the devices in the normal allocation class. Furthermore, all + *       vdevs in the normal allocation class must have the same ashift to + *       ensure the new allocations never includes additional padding. + * + *     - The normal allocation class cannot contain any raidz or draid + *       top-level vdevs since segments are copied without regard for block + *       boundaries. This makes it impossible to calculate the required + *       parity columns when using these vdev types as the destination. + * + *     - The encryption keys must be loaded so the ZIL logs can be reset + *       in order to prevent writing to the device being removed. + * + * N.B. ashift and raidz/draid constraints for primary top-level device + * removal could be slightly relaxed if it were possible to request that + * DVAs from a mirror or singleton in the specified allocation class be + * used (metaslab_alloc_dva). + * + * This flexibility would be particularly useful for raidz/draid pools which + * often include a mirrored special device. If a mistakenly added top-level + * singleton were added it could then still be removed at the cost of some + * special device capacity. This may be a worthwhile tradeoff depending on + * the pool capacity and expense (cost, complexity, time) of creating a new + * pool and copying all of the data to correct the configuration. + * + * Furthermore, while not currently supported it should be possible to allow + * vdevs of any type to be removed as long as they've never been written to.   */  typedef struct vdev_copy_arg { diff --git a/sys/contrib/openzfs/module/zfs/zvol.c b/sys/contrib/openzfs/module/zfs/zvol.c index faced0db7e9e..00f98168d3d8 100644 --- a/sys/contrib/openzfs/module/zfs/zvol.c +++ b/sys/contrib/openzfs/module/zfs/zvol.c @@ -410,7 +410,7 @@ zvol_set_volthreading(const char *name, boolean_t value)  {  	zvol_state_t *zv = zvol_find_by_name(name, RW_NONE);  	if (zv == NULL) -		return (SET_ERROR(ENOENT)); +		return (-1);  	zv->zv_threading = value;  	mutex_exit(&zv->zv_state_lock);  	return (0); | 
