diff options
Diffstat (limited to 'sys/contrib/openzfs/module/os/linux/zfs')
6 files changed, 160 insertions, 43 deletions
| diff --git a/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c b/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c index 8a8316f63c48..18f2426fbbfc 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c @@ -23,6 +23,7 @@   * Copyright (c) 2014 by Chunwei Chen. All rights reserved.   * Copyright (c) 2019 by Delphix. All rights reserved.   * Copyright (c) 2023, 2024, Klara Inc. + * Copyright (c) 2025, Rob Norris <robn@despairlabs.com>   */  /* @@ -1109,6 +1110,14 @@ abd_return_buf_copy(abd_t *abd, void *buf, size_t n)  #define	ABD_ITER_PAGE_SIZE(page)	(PAGESIZE)  #endif +#ifndef nth_page +/* + * Since 6.18 nth_page() no longer exists, and is no longer required to iterate + * within a single SG entry, so we replace it with a simple addition. + */ +#define	nth_page(p, n)	((p)+(n)) +#endif +  void  abd_iter_page(struct abd_iter *aiter)  { diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c index daa4b5776837..934d74a112fd 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c @@ -2524,7 +2524,7 @@ zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode,  	 * Also note: DOS R/O is ignored for directories.  	 */  	if ((v4_mode & WRITE_MASK_DATA) && -	    S_ISDIR(ZTOI(zp)->i_mode) && +	    !S_ISDIR(ZTOI(zp)->i_mode) &&  	    (zp->z_pflags & ZFS_READONLY)) {  		return (SET_ERROR(EPERM));  	} diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c index 6106726651a3..e845ad69ad78 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c @@ -2033,10 +2033,7 @@ zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr, zidmap_t *mnt_ns)  		goto out3;  	} -	if ((mask & ATTR_SIZE) && (zp->z_pflags & ZFS_READONLY)) { -		err = SET_ERROR(EPERM); -		goto out3; -	} +	/* ZFS_READONLY will be handled in zfs_zaccess() */  	/*  	 * Verify timestamps doesn't overflow 32 bits. diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c index d07317b0d910..02965ac8cbee 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c @@ -23,6 +23,7 @@   * Copyright (c) 2011, Lawrence Livermore National Security, LLC.   * Copyright (c) 2015 by Chunwei Chen. All rights reserved.   * Copyright (c) 2025, Klara, Inc. + * Copyright (c) 2025, Rob Norris <robn@despairlabs.com>   */ @@ -478,6 +479,7 @@ zpl_putpage(struct page *pp, struct writeback_control *wbc, void *data)  	return (ret);  } +#ifdef HAVE_WRITE_CACHE_PAGES  #ifdef HAVE_WRITEPAGE_T_FOLIO  static int  zpl_putfolio(struct folio *pp, struct writeback_control *wbc, void *data) @@ -499,6 +501,78 @@ zpl_write_cache_pages(struct address_space *mapping,  #endif  	return (result);  } +#else +static inline int +zpl_write_cache_pages(struct address_space *mapping, +    struct writeback_control *wbc, void *data) +{ +	pgoff_t start = wbc->range_start >> PAGE_SHIFT; +	pgoff_t end = wbc->range_end >> PAGE_SHIFT; + +	struct folio_batch fbatch; +	folio_batch_init(&fbatch); + +	/* +	 * This atomically (-ish) tags all DIRTY pages in the range with +	 * TOWRITE, allowing users to continue dirtying or undirtying pages +	 * while we get on with writeback, without us treading on each other. +	 */ +	tag_pages_for_writeback(mapping, start, end); + +	int err = 0; +	unsigned int npages; + +	/* +	 * Grab references to the TOWRITE pages just flagged. This may not get +	 * all of them, so we do it in a loop until there are none left. +	 */ +	while ((npages = filemap_get_folios_tag(mapping, &start, end, +	    PAGECACHE_TAG_TOWRITE, &fbatch)) != 0) { + +		/* Loop over each page and write it out. */ +		struct folio *folio; +		while ((folio = folio_batch_next(&fbatch)) != NULL) { +			folio_lock(folio); + +			/* +			 * If the folio has been remapped, or is no longer +			 * dirty, then there's nothing to do. +			 */ +			if (folio->mapping != mapping || +			    !folio_test_dirty(folio)) { +				folio_unlock(folio); +				continue; +			} + +			/* +			 * If writeback is already in progress, wait for it to +			 * finish. We continue after this even if the page +			 * ends up clean; zfs_putpage() will skip it if no +			 * further work is required. +			 */ +			while (folio_test_writeback(folio)) +				folio_wait_bit(folio, PG_writeback); + +			/* +			 * Write it out and collect any error. zfs_putpage() +			 * will clear the TOWRITE and DIRTY flags, and return +			 * with the page unlocked. +			 */ +			int ferr = zpl_putpage(&folio->page, wbc, data); +			if (err == 0 && ferr != 0) +				err = ferr; + +			/* Housekeeping for the caller. */ +			wbc->nr_to_write -= folio_nr_pages(folio); +		} + +		/* Release any remaining references on the batch. */ +		folio_batch_release(&fbatch); +	} + +	return (err); +} +#endif  static int  zpl_writepages(struct address_space *mapping, struct writeback_control *wbc) diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c index 444948d03cb3..347b352506e5 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c @@ -23,6 +23,7 @@   * Copyright (c) 2011, Lawrence Livermore National Security, LLC.   * Copyright (c) 2023, Datto Inc. All rights reserved.   * Copyright (c) 2025, Klara, Inc. + * Copyright (c) 2025, Rob Norris <robn@despairlabs.com>   */ @@ -33,6 +34,7 @@  #include <sys/zpl.h>  #include <linux/iversion.h>  #include <linux/version.h> +#include <linux/vfs_compat.h>  /*   * What to do when the last reference to an inode is released. If 0, the kernel @@ -104,7 +106,7 @@ zpl_dirty_inode(struct inode *ip, int flags)   * reporting memory pressure and requests OpenZFS release some memory (see   * zfs_prune()).   * - * When set to 1, we call generic_delete_node(), which always returns "destroy + * When set to 1, we call generic_delete_inode(), which always returns "destroy   * immediately", resulting in inodes being destroyed immediately, releasing   * their associated dnodes and dbufs to the dbuf cached and the ARC to be   * evicted as normal. diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c index 967a018640e1..fe939150b641 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c @@ -21,7 +21,7 @@   */  /*   * Copyright (c) 2012, 2020 by Delphix. All rights reserved. - * Copyright (c) 2024, Rob Norris <robn@despairlabs.com> + * Copyright (c) 2024, 2025, Rob Norris <robn@despairlabs.com>   * Copyright (c) 2024, 2025, Klara, Inc.   */ @@ -337,16 +337,14 @@ zvol_discard(zv_request_t *zvr)  	}  	/* -	 * Align the request to volume block boundaries when a secure erase is -	 * not required.  This will prevent dnode_free_range() from zeroing out -	 * the unaligned parts which is slow (read-modify-write) and useless -	 * since we are not freeing any space by doing so. +	 * Align the request to volume block boundaries. This will prevent +	 * dnode_free_range() from zeroing out the unaligned parts which is +	 * slow (read-modify-write) and useless since we are not freeing any +	 * space by doing so.  	 */ -	if (!io_is_secure_erase(bio, rq)) { -		start = P2ROUNDUP(start, zv->zv_volblocksize); -		end = P2ALIGN_TYPED(end, zv->zv_volblocksize, uint64_t); -		size = end - start; -	} +	start = P2ROUNDUP(start, zv->zv_volblocksize); +	end = P2ALIGN_TYPED(end, zv->zv_volblocksize, uint64_t); +	size = end - start;  	if (start >= end)  		goto unlock; @@ -467,6 +465,24 @@ zvol_read_task(void *arg)  	zv_request_task_free(task);  } +/* + * Note: + * + * The kernel uses different enum names for the IO opcode, depending on the + * kernel version ('req_opf', 'req_op').  To sidestep this, use macros rather + * than inline functions for these checks. + */ +/* Should this IO go down the zvol write path? */ +#define	ZVOL_OP_IS_WRITE(op) \ +	(op == REQ_OP_WRITE || \ +	op == REQ_OP_FLUSH || \ +	op == REQ_OP_DISCARD) + +/* Is this IO type supported by zvols? */ +#define	ZVOL_OP_IS_SUPPORTED(op) (op == REQ_OP_READ || ZVOL_OP_IS_WRITE(op)) + +/* Get the IO opcode */ +#define	ZVOL_OP(bio, rq) (bio != NULL ? bio_op(bio) : req_op(rq))  /*   * Process a BIO or request @@ -486,27 +502,32 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,  	uint64_t size = io_size(bio, rq);  	int rw; -	if (rq != NULL) { -		/* -		 * Flush & trim requests go down the zvol_write codepath.  Or -		 * more specifically: -		 * -		 * If request is a write, or if it's op_is_sync() and not a -		 * read, or if it's a flush, or if it's a discard, then send the -		 * request down the write path. -		 */ -		if (op_is_write(rq->cmd_flags) || -		    (op_is_sync(rq->cmd_flags) && req_op(rq) != REQ_OP_READ) || -		    req_op(rq) == REQ_OP_FLUSH || -		    op_is_discard(rq->cmd_flags)) { -			rw = WRITE; -		} else { -			rw = READ; -		} +	if (unlikely(!ZVOL_OP_IS_SUPPORTED(ZVOL_OP(bio, rq)))) { +		zfs_dbgmsg("Unsupported zvol %s, op=%d, flags=0x%x", +		    rq != NULL ? "request" : "BIO", +		    ZVOL_OP(bio, rq), +		    rq != NULL ? rq->cmd_flags : bio->bi_opf); +		ASSERT(ZVOL_OP_IS_SUPPORTED(ZVOL_OP(bio, rq))); +		zvol_end_io(bio, rq, SET_ERROR(ENOTSUPP)); +		goto out; +	} + +	if (ZVOL_OP_IS_WRITE(ZVOL_OP(bio, rq))) { +		rw = WRITE;  	} else { -		rw = bio_data_dir(bio); +		rw = READ;  	} +	/* +	 * Sanity check +	 * +	 * If we're a BIO, check our rw matches the kernel's +	 * bio_data_dir(bio) rw.  We need to check because we support fewer +	 * IO operations, and want to verify that what we think are reads and +	 * writes from those operations match what the kernel thinks. +	 */ +	ASSERT(rq != NULL || rw == bio_data_dir(bio)); +  	if (unlikely(zv->zv_flags & ZVOL_REMOVING)) {  		zvol_end_io(bio, rq, SET_ERROR(ENXIO));  		goto out; @@ -610,7 +631,7 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,  		 * interfaces lack this functionality (they block waiting for  		 * the i/o to complete).  		 */ -		if (io_is_discard(bio, rq) || io_is_secure_erase(bio, rq)) { +		if (io_is_discard(bio, rq)) {  			if (force_sync) {  				zvol_discard(&zvr);  			} else { @@ -1011,12 +1032,12 @@ zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize)   * tiny devices.  For devices over 1 Mib a standard head and sector count   * is used to keep the cylinders count reasonable.   */ -static int -zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo) +static inline int +zvol_getgeo_impl(struct gendisk *disk, struct hd_geometry *geo)  { +	zvol_state_t *zv = atomic_load_ptr(&disk->private_data);  	sector_t sectors; -	zvol_state_t *zv = atomic_load_ptr(&bdev->bd_disk->private_data);  	ASSERT3P(zv, !=, NULL);  	ASSERT3U(zv->zv_open_count, >, 0); @@ -1036,6 +1057,20 @@ zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo)  	return (0);  } +#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_GETGEO_GENDISK +static int +zvol_getgeo(struct gendisk *disk, struct hd_geometry *geo) +{ +	return (zvol_getgeo_impl(disk, geo)); +} +#else +static int +zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo) +{ +	return (zvol_getgeo_impl(bdev->bd_disk, geo)); +} +#endif +  /*   * Why have two separate block_device_operations structs?   * @@ -1479,7 +1514,7 @@ zvol_os_remove_minor(zvol_state_t *zv)  	if (zso->use_blk_mq)  		blk_mq_free_tag_set(&zso->tag_set); -	ida_simple_remove(&zvol_ida, MINOR(zso->zvo_dev) >> ZVOL_MINOR_BITS); +	ida_free(&zvol_ida, MINOR(zso->zvo_dev) >> ZVOL_MINOR_BITS);  	kmem_free(zso, sizeof (struct zvol_state_os)); @@ -1634,7 +1669,7 @@ zvol_os_create_minor(const char *name)  	if (zvol_inhibit_dev)  		return (0); -	idx = ida_simple_get(&zvol_ida, 0, 0, kmem_flags_convert(KM_SLEEP)); +	idx = ida_alloc(&zvol_ida, kmem_flags_convert(KM_SLEEP));  	if (idx < 0)  		return (SET_ERROR(-idx));  	minor = idx << ZVOL_MINOR_BITS; @@ -1642,7 +1677,7 @@ zvol_os_create_minor(const char *name)  		/* too many partitions can cause an overflow */  		zfs_dbgmsg("zvol: create minor overflow: %s, minor %u/%u",  		    name, minor, MINOR(minor)); -		ida_simple_remove(&zvol_ida, idx); +		ida_free(&zvol_ida, idx);  		return (SET_ERROR(EINVAL));  	} @@ -1650,7 +1685,7 @@ zvol_os_create_minor(const char *name)  	if (zv) {  		ASSERT(MUTEX_HELD(&zv->zv_state_lock));  		mutex_exit(&zv->zv_state_lock); -		ida_simple_remove(&zvol_ida, idx); +		ida_free(&zvol_ida, idx);  		return (SET_ERROR(EEXIST));  	} @@ -1750,7 +1785,7 @@ out_doi:  		rw_exit(&zvol_state_lock);  		error = zvol_os_add_disk(zv->zv_zso->zvo_disk);  	} else { -		ida_simple_remove(&zvol_ida, idx); +		ida_free(&zvol_ida, idx);  	}  	return (error); | 
