diff options
Diffstat (limited to 'uts/common/fs/zfs/vdev_disk.c')
| -rw-r--r-- | uts/common/fs/zfs/vdev_disk.c | 652 | 
1 files changed, 652 insertions, 0 deletions
| diff --git a/uts/common/fs/zfs/vdev_disk.c b/uts/common/fs/zfs/vdev_disk.c new file mode 100644 index 000000000000..759f0f84f1c8 --- /dev/null +++ b/uts/common/fs/zfs/vdev_disk.c @@ -0,0 +1,652 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. + */ + +#include <sys/zfs_context.h> +#include <sys/spa_impl.h> +#include <sys/refcount.h> +#include <sys/vdev_disk.h> +#include <sys/vdev_impl.h> +#include <sys/fs/zfs.h> +#include <sys/zio.h> +#include <sys/sunldi.h> +#include <sys/efi_partition.h> +#include <sys/fm/fs/zfs.h> + +/* + * Virtual device vector for disks. + */ + +extern ldi_ident_t zfs_li; + +typedef struct vdev_disk_buf { +	buf_t	vdb_buf; +	zio_t	*vdb_io; +} vdev_disk_buf_t; + +static void +vdev_disk_hold(vdev_t *vd) +{ +	ddi_devid_t devid; +	char *minor; + +	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER)); + +	/* +	 * We must have a pathname, and it must be absolute. +	 */ +	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') +		return; + +	/* +	 * Only prefetch path and devid info if the device has +	 * never been opened. +	 */ +	if (vd->vdev_tsd != NULL) +		return; + +	if (vd->vdev_wholedisk == -1ULL) { +		size_t len = strlen(vd->vdev_path) + 3; +		char *buf = kmem_alloc(len, KM_SLEEP); + +		(void) snprintf(buf, len, "%ss0", vd->vdev_path); + +		(void) ldi_vp_from_name(buf, &vd->vdev_name_vp); +		kmem_free(buf, len); +	} + +	if (vd->vdev_name_vp == NULL) +		(void) ldi_vp_from_name(vd->vdev_path, &vd->vdev_name_vp); + +	if (vd->vdev_devid != NULL && +	    ddi_devid_str_decode(vd->vdev_devid, &devid, &minor) == 0) { +		(void) ldi_vp_from_devid(devid, minor, &vd->vdev_devid_vp); +		ddi_devid_str_free(minor); +		ddi_devid_free(devid); +	} +} + +static void +vdev_disk_rele(vdev_t *vd) +{ +	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER)); + +	if (vd->vdev_name_vp) { +		VN_RELE_ASYNC(vd->vdev_name_vp, +		    dsl_pool_vnrele_taskq(vd->vdev_spa->spa_dsl_pool)); +		vd->vdev_name_vp = NULL; +	} +	if (vd->vdev_devid_vp) { +		VN_RELE_ASYNC(vd->vdev_devid_vp, +		    dsl_pool_vnrele_taskq(vd->vdev_spa->spa_dsl_pool)); +		vd->vdev_devid_vp = NULL; +	} +} + +static uint64_t +vdev_disk_get_space(vdev_t *vd, uint64_t capacity, uint_t blksz) +{ +	ASSERT(vd->vdev_wholedisk); + +	vdev_disk_t *dvd = vd->vdev_tsd; +	dk_efi_t dk_ioc; +	efi_gpt_t *efi; +	uint64_t avail_space = 0; +	int efisize = EFI_LABEL_SIZE * 2; + +	dk_ioc.dki_data = kmem_alloc(efisize, KM_SLEEP); +	dk_ioc.dki_lba = 1; +	dk_ioc.dki_length = efisize; +	dk_ioc.dki_data_64 = (uint64_t)(uintptr_t)dk_ioc.dki_data; +	efi = dk_ioc.dki_data; + +	if (ldi_ioctl(dvd->vd_lh, DKIOCGETEFI, (intptr_t)&dk_ioc, +	    FKIOCTL, kcred, NULL) == 0) { +		uint64_t efi_altern_lba = LE_64(efi->efi_gpt_AlternateLBA); + +		zfs_dbgmsg("vdev %s, capacity %llu, altern lba %llu", +		    vd->vdev_path, capacity, efi_altern_lba); +		if (capacity > efi_altern_lba) +			avail_space = (capacity - efi_altern_lba) * blksz; +	} +	kmem_free(dk_ioc.dki_data, efisize); +	return (avail_space); +} + +static int +vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, +    uint64_t *ashift) +{ +	spa_t *spa = vd->vdev_spa; +	vdev_disk_t *dvd; +	struct dk_minfo_ext dkmext; +	int error; +	dev_t dev; +	int otyp; + +	/* +	 * We must have a pathname, and it must be absolute. +	 */ +	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') { +		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; +		return (EINVAL); +	} + +	/* +	 * Reopen the device if it's not currently open. Otherwise, +	 * just update the physical size of the device. +	 */ +	if (vd->vdev_tsd != NULL) { +		ASSERT(vd->vdev_reopening); +		dvd = vd->vdev_tsd; +		goto skip_open; +	} + +	dvd = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP); + +	/* +	 * When opening a disk device, we want to preserve the user's original +	 * intent.  We always want to open the device by the path the user gave +	 * us, even if it is one of multiple paths to the save device.  But we +	 * also want to be able to survive disks being removed/recabled. +	 * Therefore the sequence of opening devices is: +	 * +	 * 1. Try opening the device by path.  For legacy pools without the +	 *    'whole_disk' property, attempt to fix the path by appending 's0'. +	 * +	 * 2. If the devid of the device matches the stored value, return +	 *    success. +	 * +	 * 3. Otherwise, the device may have moved.  Try opening the device +	 *    by the devid instead. +	 */ +	if (vd->vdev_devid != NULL) { +		if (ddi_devid_str_decode(vd->vdev_devid, &dvd->vd_devid, +		    &dvd->vd_minor) != 0) { +			vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; +			return (EINVAL); +		} +	} + +	error = EINVAL;		/* presume failure */ + +	if (vd->vdev_path != NULL) { +		ddi_devid_t devid; + +		if (vd->vdev_wholedisk == -1ULL) { +			size_t len = strlen(vd->vdev_path) + 3; +			char *buf = kmem_alloc(len, KM_SLEEP); +			ldi_handle_t lh; + +			(void) snprintf(buf, len, "%ss0", vd->vdev_path); + +			if (ldi_open_by_name(buf, spa_mode(spa), kcred, +			    &lh, zfs_li) == 0) { +				spa_strfree(vd->vdev_path); +				vd->vdev_path = buf; +				vd->vdev_wholedisk = 1ULL; +				(void) ldi_close(lh, spa_mode(spa), kcred); +			} else { +				kmem_free(buf, len); +			} +		} + +		error = ldi_open_by_name(vd->vdev_path, spa_mode(spa), kcred, +		    &dvd->vd_lh, zfs_li); + +		/* +		 * Compare the devid to the stored value. +		 */ +		if (error == 0 && vd->vdev_devid != NULL && +		    ldi_get_devid(dvd->vd_lh, &devid) == 0) { +			if (ddi_devid_compare(devid, dvd->vd_devid) != 0) { +				error = EINVAL; +				(void) ldi_close(dvd->vd_lh, spa_mode(spa), +				    kcred); +				dvd->vd_lh = NULL; +			} +			ddi_devid_free(devid); +		} + +		/* +		 * If we succeeded in opening the device, but 'vdev_wholedisk' +		 * is not yet set, then this must be a slice. +		 */ +		if (error == 0 && vd->vdev_wholedisk == -1ULL) +			vd->vdev_wholedisk = 0; +	} + +	/* +	 * If we were unable to open by path, or the devid check fails, open by +	 * devid instead. +	 */ +	if (error != 0 && vd->vdev_devid != NULL) +		error = ldi_open_by_devid(dvd->vd_devid, dvd->vd_minor, +		    spa_mode(spa), kcred, &dvd->vd_lh, zfs_li); + +	/* +	 * If all else fails, then try opening by physical path (if available) +	 * or the logical path (if we failed due to the devid check).  While not +	 * as reliable as the devid, this will give us something, and the higher +	 * level vdev validation will prevent us from opening the wrong device. +	 */ +	if (error) { +		if (vd->vdev_physpath != NULL && +		    (dev = ddi_pathname_to_dev_t(vd->vdev_physpath)) != NODEV) +			error = ldi_open_by_dev(&dev, OTYP_BLK, spa_mode(spa), +			    kcred, &dvd->vd_lh, zfs_li); + +		/* +		 * Note that we don't support the legacy auto-wholedisk support +		 * as above.  This hasn't been used in a very long time and we +		 * don't need to propagate its oddities to this edge condition. +		 */ +		if (error && vd->vdev_path != NULL) +			error = ldi_open_by_name(vd->vdev_path, spa_mode(spa), +			    kcred, &dvd->vd_lh, zfs_li); +	} + +	if (error) { +		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; +		return (error); +	} + +	/* +	 * Once a device is opened, verify that the physical device path (if +	 * available) is up to date. +	 */ +	if (ldi_get_dev(dvd->vd_lh, &dev) == 0 && +	    ldi_get_otyp(dvd->vd_lh, &otyp) == 0) { +		char *physpath, *minorname; + +		physpath = kmem_alloc(MAXPATHLEN, KM_SLEEP); +		minorname = NULL; +		if (ddi_dev_pathname(dev, otyp, physpath) == 0 && +		    ldi_get_minor_name(dvd->vd_lh, &minorname) == 0 && +		    (vd->vdev_physpath == NULL || +		    strcmp(vd->vdev_physpath, physpath) != 0)) { +			if (vd->vdev_physpath) +				spa_strfree(vd->vdev_physpath); +			(void) strlcat(physpath, ":", MAXPATHLEN); +			(void) strlcat(physpath, minorname, MAXPATHLEN); +			vd->vdev_physpath = spa_strdup(physpath); +		} +		if (minorname) +			kmem_free(minorname, strlen(minorname) + 1); +		kmem_free(physpath, MAXPATHLEN); +	} + +skip_open: +	/* +	 * Determine the actual size of the device. +	 */ +	if (ldi_get_size(dvd->vd_lh, psize) != 0) { +		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; +		return (EINVAL); +	} + +	/* +	 * Determine the device's minimum transfer size. +	 * If the ioctl isn't supported, assume DEV_BSIZE. +	 */ +	if (ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFOEXT, (intptr_t)&dkmext, +	    FKIOCTL, kcred, NULL) != 0) +		dkmext.dki_pbsize = DEV_BSIZE; + +	*ashift = highbit(MAX(dkmext.dki_pbsize, SPA_MINBLOCKSIZE)) - 1; + +	if (vd->vdev_wholedisk == 1) { +		uint64_t capacity = dkmext.dki_capacity - 1; +		uint64_t blksz = dkmext.dki_lbsize; +		int wce = 1; + +		/* +		 * If we own the whole disk, try to enable disk write caching. +		 * We ignore errors because it's OK if we can't do it. +		 */ +		(void) ldi_ioctl(dvd->vd_lh, DKIOCSETWCE, (intptr_t)&wce, +		    FKIOCTL, kcred, NULL); + +		*max_psize = *psize + vdev_disk_get_space(vd, capacity, blksz); +		zfs_dbgmsg("capacity change: vdev %s, psize %llu, " +		    "max_psize %llu", vd->vdev_path, *psize, *max_psize); +	} else { +		*max_psize = *psize; +	} + +	/* +	 * Clear the nowritecache bit, so that on a vdev_reopen() we will +	 * try again. +	 */ +	vd->vdev_nowritecache = B_FALSE; + +	return (0); +} + +static void +vdev_disk_close(vdev_t *vd) +{ +	vdev_disk_t *dvd = vd->vdev_tsd; + +	if (vd->vdev_reopening || dvd == NULL) +		return; + +	if (dvd->vd_minor != NULL) +		ddi_devid_str_free(dvd->vd_minor); + +	if (dvd->vd_devid != NULL) +		ddi_devid_free(dvd->vd_devid); + +	if (dvd->vd_lh != NULL) +		(void) ldi_close(dvd->vd_lh, spa_mode(vd->vdev_spa), kcred); + +	vd->vdev_delayed_close = B_FALSE; +	kmem_free(dvd, sizeof (vdev_disk_t)); +	vd->vdev_tsd = NULL; +} + +int +vdev_disk_physio(ldi_handle_t vd_lh, caddr_t data, size_t size, +    uint64_t offset, int flags) +{ +	buf_t *bp; +	int error = 0; + +	if (vd_lh == NULL) +		return (EINVAL); + +	ASSERT(flags & B_READ || flags & B_WRITE); + +	bp = getrbuf(KM_SLEEP); +	bp->b_flags = flags | B_BUSY | B_NOCACHE | B_FAILFAST; +	bp->b_bcount = size; +	bp->b_un.b_addr = (void *)data; +	bp->b_lblkno = lbtodb(offset); +	bp->b_bufsize = size; + +	error = ldi_strategy(vd_lh, bp); +	ASSERT(error == 0); +	if ((error = biowait(bp)) == 0 && bp->b_resid != 0) +		error = EIO; +	freerbuf(bp); + +	return (error); +} + +static void +vdev_disk_io_intr(buf_t *bp) +{ +	vdev_disk_buf_t *vdb = (vdev_disk_buf_t *)bp; +	zio_t *zio = vdb->vdb_io; + +	/* +	 * The rest of the zio stack only deals with EIO, ECKSUM, and ENXIO. +	 * Rather than teach the rest of the stack about other error +	 * possibilities (EFAULT, etc), we normalize the error value here. +	 */ +	zio->io_error = (geterror(bp) != 0 ? EIO : 0); + +	if (zio->io_error == 0 && bp->b_resid != 0) +		zio->io_error = EIO; + +	kmem_free(vdb, sizeof (vdev_disk_buf_t)); + +	zio_interrupt(zio); +} + +static void +vdev_disk_ioctl_free(zio_t *zio) +{ +	kmem_free(zio->io_vsd, sizeof (struct dk_callback)); +} + +static const zio_vsd_ops_t vdev_disk_vsd_ops = { +	vdev_disk_ioctl_free, +	zio_vsd_default_cksum_report +}; + +static void +vdev_disk_ioctl_done(void *zio_arg, int error) +{ +	zio_t *zio = zio_arg; + +	zio->io_error = error; + +	zio_interrupt(zio); +} + +static int +vdev_disk_io_start(zio_t *zio) +{ +	vdev_t *vd = zio->io_vd; +	vdev_disk_t *dvd = vd->vdev_tsd; +	vdev_disk_buf_t *vdb; +	struct dk_callback *dkc; +	buf_t *bp; +	int error; + +	if (zio->io_type == ZIO_TYPE_IOCTL) { +		/* XXPOLICY */ +		if (!vdev_readable(vd)) { +			zio->io_error = ENXIO; +			return (ZIO_PIPELINE_CONTINUE); +		} + +		switch (zio->io_cmd) { + +		case DKIOCFLUSHWRITECACHE: + +			if (zfs_nocacheflush) +				break; + +			if (vd->vdev_nowritecache) { +				zio->io_error = ENOTSUP; +				break; +			} + +			zio->io_vsd = dkc = kmem_alloc(sizeof (*dkc), KM_SLEEP); +			zio->io_vsd_ops = &vdev_disk_vsd_ops; + +			dkc->dkc_callback = vdev_disk_ioctl_done; +			dkc->dkc_flag = FLUSH_VOLATILE; +			dkc->dkc_cookie = zio; + +			error = ldi_ioctl(dvd->vd_lh, zio->io_cmd, +			    (uintptr_t)dkc, FKIOCTL, kcred, NULL); + +			if (error == 0) { +				/* +				 * The ioctl will be done asychronously, +				 * and will call vdev_disk_ioctl_done() +				 * upon completion. +				 */ +				return (ZIO_PIPELINE_STOP); +			} + +			if (error == ENOTSUP || error == ENOTTY) { +				/* +				 * If we get ENOTSUP or ENOTTY, we know that +				 * no future attempts will ever succeed. +				 * In this case we set a persistent bit so +				 * that we don't bother with the ioctl in the +				 * future. +				 */ +				vd->vdev_nowritecache = B_TRUE; +			} +			zio->io_error = error; + +			break; + +		default: +			zio->io_error = ENOTSUP; +		} + +		return (ZIO_PIPELINE_CONTINUE); +	} + +	vdb = kmem_alloc(sizeof (vdev_disk_buf_t), KM_SLEEP); + +	vdb->vdb_io = zio; +	bp = &vdb->vdb_buf; + +	bioinit(bp); +	bp->b_flags = B_BUSY | B_NOCACHE | +	    (zio->io_type == ZIO_TYPE_READ ? B_READ : B_WRITE); +	if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD))) +		bp->b_flags |= B_FAILFAST; +	bp->b_bcount = zio->io_size; +	bp->b_un.b_addr = zio->io_data; +	bp->b_lblkno = lbtodb(zio->io_offset); +	bp->b_bufsize = zio->io_size; +	bp->b_iodone = (int (*)())vdev_disk_io_intr; + +	/* ldi_strategy() will return non-zero only on programming errors */ +	VERIFY(ldi_strategy(dvd->vd_lh, bp) == 0); + +	return (ZIO_PIPELINE_STOP); +} + +static void +vdev_disk_io_done(zio_t *zio) +{ +	vdev_t *vd = zio->io_vd; + +	/* +	 * If the device returned EIO, then attempt a DKIOCSTATE ioctl to see if +	 * the device has been removed.  If this is the case, then we trigger an +	 * asynchronous removal of the device. Otherwise, probe the device and +	 * make sure it's still accessible. +	 */ +	if (zio->io_error == EIO && !vd->vdev_remove_wanted) { +		vdev_disk_t *dvd = vd->vdev_tsd; +		int state = DKIO_NONE; + +		if (ldi_ioctl(dvd->vd_lh, DKIOCSTATE, (intptr_t)&state, +		    FKIOCTL, kcred, NULL) == 0 && state != DKIO_INSERTED) { +			/* +			 * We post the resource as soon as possible, instead of +			 * when the async removal actually happens, because the +			 * DE is using this information to discard previous I/O +			 * errors. +			 */ +			zfs_post_remove(zio->io_spa, vd); +			vd->vdev_remove_wanted = B_TRUE; +			spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE); +		} else if (!vd->vdev_delayed_close) { +			vd->vdev_delayed_close = B_TRUE; +		} +	} +} + +vdev_ops_t vdev_disk_ops = { +	vdev_disk_open, +	vdev_disk_close, +	vdev_default_asize, +	vdev_disk_io_start, +	vdev_disk_io_done, +	NULL, +	vdev_disk_hold, +	vdev_disk_rele, +	VDEV_TYPE_DISK,		/* name of this vdev type */ +	B_TRUE			/* leaf vdev */ +}; + +/* + * Given the root disk device devid or pathname, read the label from + * the device, and construct a configuration nvlist. + */ +int +vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config) +{ +	ldi_handle_t vd_lh; +	vdev_label_t *label; +	uint64_t s, size; +	int l; +	ddi_devid_t tmpdevid; +	int error = -1; +	char *minor_name; + +	/* +	 * Read the device label and build the nvlist. +	 */ +	if (devid != NULL && ddi_devid_str_decode(devid, &tmpdevid, +	    &minor_name) == 0) { +		error = ldi_open_by_devid(tmpdevid, minor_name, +		    FREAD, kcred, &vd_lh, zfs_li); +		ddi_devid_free(tmpdevid); +		ddi_devid_str_free(minor_name); +	} + +	if (error && (error = ldi_open_by_name(devpath, FREAD, kcred, &vd_lh, +	    zfs_li))) +		return (error); + +	if (ldi_get_size(vd_lh, &s)) { +		(void) ldi_close(vd_lh, FREAD, kcred); +		return (EIO); +	} + +	size = P2ALIGN_TYPED(s, sizeof (vdev_label_t), uint64_t); +	label = kmem_alloc(sizeof (vdev_label_t), KM_SLEEP); + +	*config = NULL; +	for (l = 0; l < VDEV_LABELS; l++) { +		uint64_t offset, state, txg = 0; + +		/* read vdev label */ +		offset = vdev_label_offset(size, l, 0); +		if (vdev_disk_physio(vd_lh, (caddr_t)label, +		    VDEV_SKIP_SIZE + VDEV_PHYS_SIZE, offset, B_READ) != 0) +			continue; + +		if (nvlist_unpack(label->vl_vdev_phys.vp_nvlist, +		    sizeof (label->vl_vdev_phys.vp_nvlist), config, 0) != 0) { +			*config = NULL; +			continue; +		} + +		if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE, +		    &state) != 0 || state >= POOL_STATE_DESTROYED) { +			nvlist_free(*config); +			*config = NULL; +			continue; +		} + +		if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG, +		    &txg) != 0 || txg == 0) { +			nvlist_free(*config); +			*config = NULL; +			continue; +		} + +		break; +	} + +	kmem_free(label, sizeof (vdev_label_t)); +	(void) ldi_close(vd_lh, FREAD, kcred); +	if (*config == NULL) +		error = EIDRM; + +	return (error); +} | 
