aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBrian Behlendorf <behlendorf1@llnl.gov>2016-04-19 18:19:12 +0000
committerBrian Behlendorf <behlendorf1@llnl.gov>2016-04-25 18:13:20 +0000
commit2d82ea8b111103b28b8c9ad0f69dd88736248804 (patch)
treec132339412ce0ed9b8f04c6ccb6471317755f59c
parent5b4136bd499a892f65c86af8fd39fa21e05c9148 (diff)
downloadsrc-2d82ea8b111103b28b8c9ad0f69dd88736248804.tar.gz
src-2d82ea8b111103b28b8c9ad0f69dd88736248804.zip
-rw-r--r--cmd/zpool/zpool_vdev.c38
-rw-r--r--lib/libzfs/libzfs_import.c115
-rw-r--r--lib/libzfs/libzfs_pool.c30
-rw-r--r--module/zfs/vdev_disk.c29
4 files changed, 166 insertions, 46 deletions
diff --git a/cmd/zpool/zpool_vdev.c b/cmd/zpool/zpool_vdev.c
index 8bbbf6615695..cf87554d5d8b 100644
--- a/cmd/zpool/zpool_vdev.c
+++ b/cmd/zpool/zpool_vdev.c
@@ -1198,12 +1198,10 @@ make_disks(zpool_handle_t *zhp, nvlist_t *nv)
/*
* Remove any previously existing symlink from a udev path to
- * the device before labeling the disk. This makes
- * zpool_label_disk_wait() truly wait for the new link to show
- * up instead of returning if it finds an old link still in
- * place. Otherwise there is a window between when udev
- * deletes and recreates the link during which access attempts
- * will fail with ENOENT.
+ * the device before labeling the disk. This ensures that
+ * only newly created links are used. Otherwise there is a
+ * window between when udev deletes and recreates the link
+ * during which access attempts will fail with ENOENT.
*/
strncpy(udevpath, path, MAXPATHLEN);
(void) zfs_append_partition(udevpath, MAXPATHLEN);
@@ -1227,6 +1225,8 @@ make_disks(zpool_handle_t *zhp, nvlist_t *nv)
* and then block until udev creates the new link.
*/
if (!is_exclusive || !is_spare(NULL, udevpath)) {
+ char *devnode = strrchr(devpath, '/') + 1;
+
ret = strncmp(udevpath, UDISK_ROOT, strlen(UDISK_ROOT));
if (ret == 0) {
ret = lstat64(udevpath, &statbuf);
@@ -1234,18 +1234,29 @@ make_disks(zpool_handle_t *zhp, nvlist_t *nv)
(void) unlink(udevpath);
}
- if (zpool_label_disk(g_zfs, zhp,
- strrchr(devpath, '/') + 1) == -1)
+ /*
+ * When labeling a pool the raw device node name
+ * is provided as it appears under /dev/.
+ */
+ if (zpool_label_disk(g_zfs, zhp, devnode) == -1)
return (-1);
+ /*
+ * Wait for udev to signal the device is available
+ * by the provided path.
+ */
ret = zpool_label_disk_wait(udevpath, DISK_LABEL_WAIT);
if (ret) {
- (void) fprintf(stderr, gettext("cannot "
- "resolve path '%s': %d\n"), udevpath, ret);
- return (-1);
+ (void) fprintf(stderr,
+ gettext("missing link: %s was "
+ "partitioned but %s is missing\n"),
+ devnode, udevpath);
+ return (ret);
}
- (void) zero_label(udevpath);
+ ret = zero_label(udevpath);
+ if (ret)
+ return (ret);
}
/*
@@ -1259,8 +1270,7 @@ make_disks(zpool_handle_t *zhp, nvlist_t *nv)
/*
* Update device id strings for whole disks (Linux only)
*/
- if (wholedisk)
- update_vdev_config_dev_strs(nv);
+ update_vdev_config_dev_strs(nv);
return (0);
}
diff --git a/lib/libzfs/libzfs_import.c b/lib/libzfs/libzfs_import.c
index 8f27ed58c8df..2776ed29cd00 100644
--- a/lib/libzfs/libzfs_import.c
+++ b/lib/libzfs/libzfs_import.c
@@ -260,6 +260,86 @@ udev_device_is_ready(struct udev_device *dev)
}
/*
+ * Wait up to timeout_ms for udev to set up the device node. The device is
+ * considered ready when libudev determines it has been initialized, all of
+ * the device links have been verified to exist, and it has been allowed to
+ * settle. At this point the device can be accessed reliably.
+ * Depending on the complexity of the udev rules this process could take
+ * several seconds.
+ */
+int
+zpool_label_disk_wait(char *path, int timeout_ms)
+{
+ struct udev *udev;
+ struct udev_device *dev = NULL;
+ char nodepath[MAXPATHLEN];
+ char *sysname = NULL;
+ int ret = ENODEV;
+ int settle_ms = 50;
+ long sleep_ms = 10;
+ hrtime_t start, settle;
+
+ if ((udev = udev_new()) == NULL)
+ return (ENXIO);
+
+ start = gethrtime();
+ settle = 0;
+
+ do {
+ if (sysname == NULL) {
+ if (realpath(path, nodepath) != NULL) {
+ sysname = strrchr(nodepath, '/') + 1;
+ } else {
+ (void) usleep(sleep_ms * MILLISEC);
+ continue;
+ }
+ }
+
+ dev = udev_device_new_from_subsystem_sysname(udev,
+ "block", sysname);
+ if ((dev != NULL) && udev_device_is_ready(dev)) {
+ struct udev_list_entry *links, *link;
+
+ ret = 0;
+ links = udev_device_get_devlinks_list_entry(dev);
+
+ udev_list_entry_foreach(link, links) {
+ struct stat64 statbuf;
+ const char *name;
+
+ name = udev_list_entry_get_name(link);
+ errno = 0;
+ if (stat64(name, &statbuf) == 0 && errno == 0)
+ continue;
+
+ settle = 0;
+ ret = ENODEV;
+ break;
+ }
+
+ if (ret == 0) {
+ if (settle == 0) {
+ settle = gethrtime();
+ } else if (NSEC2MSEC(gethrtime() - settle) >=
+ settle_ms) {
+ udev_device_unref(dev);
+ break;
+ }
+ }
+ }
+
+ udev_device_unref(dev);
+ (void) usleep(sleep_ms * MILLISEC);
+
+ } while (NSEC2MSEC(gethrtime() - start) < timeout_ms);
+
+ udev_unref(udev);
+
+ return (ret);
+}
+
+
+/*
* Encode the persistent devices strings
* used for the vdev disk label
*/
@@ -414,6 +494,41 @@ is_mpath_whole_disk(const char *path)
return (B_FALSE);
}
+/*
+ * Wait up to timeout_ms for udev to set up the device node. The device is
+ * considered ready when the provided path has been verified to exist and
+ * it has been allowed to settle. At this point the device can
+ * be accessed reliably. Depending on the complexity of the udev rules this
+ * process could take several seconds.
+ */
+int
+zpool_label_disk_wait(char *path, int timeout_ms)
+{
+ int settle_ms = 50;
+ long sleep_ms = 10;
+ hrtime_t start, settle;
+ struct stat64 statbuf;
+
+ start = gethrtime();
+ settle = 0;
+
+ do {
+ errno = 0;
+ if ((stat64(path, &statbuf) == 0) && (errno == 0)) {
+ if (settle == 0)
+ settle = gethrtime();
+ else if (NSEC2MSEC(gethrtime() - settle) >= settle_ms)
+ return (0);
+ } else if (errno != ENOENT) {
+ return (errno);
+ }
+
+ usleep(sleep_ms * MILLISEC);
+ } while (NSEC2MSEC(gethrtime() - start) < timeout_ms);
+
+ return (ENODEV);
+}
+
void
update_vdev_config_dev_strs(nvlist_t *nv)
{
diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c
index 214c57ab4b46..c405abe3edf1 100644
--- a/lib/libzfs/libzfs_pool.c
+++ b/lib/libzfs/libzfs_pool.c
@@ -4122,30 +4122,7 @@ find_start_block(nvlist_t *config)
return (MAXOFFSET_T);
}
-int
-zpool_label_disk_wait(char *path, int timeout)
-{
- struct stat64 statbuf;
- int i;
-
- /*
- * Wait timeout miliseconds for a newly created device to be available
- * from the given path. There is a small window when a /dev/ device
- * will exist and the udev link will not, so we must wait for the
- * symlink. Depending on the udev rules this may take a few seconds.
- */
- for (i = 0; i < timeout; i++) {
- usleep(1000);
-
- errno = 0;
- if ((stat64(path, &statbuf) == 0) && (errno == 0))
- return (0);
- }
-
- return (ENOENT);
-}
-
-int
+static int
zpool_label_disk_check(char *path)
{
struct dk_gpt *vtoc;
@@ -4310,12 +4287,11 @@ zpool_label_disk(libzfs_handle_t *hdl, zpool_handle_t *zhp, char *name)
(void) close(fd);
efi_free(vtoc);
- /* Wait for the first expected partition to appear. */
-
(void) snprintf(path, sizeof (path), "%s/%s", DISK_ROOT, name);
(void) zfs_append_partition(path, MAXPATHLEN);
- rval = zpool_label_disk_wait(path, 3000);
+ /* Wait for udev to signal that the device has settled. */
+ rval = zpool_label_disk_wait(path, DISK_LABEL_WAIT);
if (rval) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "failed to "
"detect device partitions on '%s': %d"), path, rval);
diff --git a/module/zfs/vdev_disk.c b/module/zfs/vdev_disk.c
index cdb8f78e2788..9b51ecc1d968 100644
--- a/module/zfs/vdev_disk.c
+++ b/module/zfs/vdev_disk.c
@@ -244,12 +244,12 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
{
struct block_device *bdev = ERR_PTR(-ENXIO);
vdev_disk_t *vd;
- int mode, block_size;
+ int count = 0, mode, block_size;
/* Must have a pathname and it must be absolute. */
if (v->vdev_path == NULL || v->vdev_path[0] != '/') {
v->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
- return (EINVAL);
+ return (SET_ERROR(EINVAL));
}
/*
@@ -264,7 +264,7 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
vd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);
if (vd == NULL)
- return (ENOMEM);
+ return (SET_ERROR(ENOMEM));
/*
* Devices are always opened by the path provided at configuration
@@ -279,16 +279,35 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
* /dev/[hd]d devices which may be reordered due to probing order.
* Devices in the wrong locations will be detected by the higher
* level vdev validation.
+ *
+ * The specified paths may be briefly removed and recreated in
+ * response to udev events. This should be exceptionally unlikely
+ * because the zpool command makes every effort to verify these paths
+ * have already settled prior to reaching this point. Therefore,
+ * an ENOENT failure at this point is highly likely to be transient
+ * and it is reasonable to sleep and retry before giving up. In
+ * practice delays have been observed to be on the order of 100ms.
*/
mode = spa_mode(v->vdev_spa);
if (v->vdev_wholedisk && v->vdev_expanding)
bdev = vdev_disk_rrpart(v->vdev_path, mode, vd);
- if (IS_ERR(bdev))
+
+ while (IS_ERR(bdev) && count < 50) {
bdev = vdev_bdev_open(v->vdev_path,
vdev_bdev_mode(mode), zfs_vdev_holder);
+ if (unlikely(PTR_ERR(bdev) == -ENOENT)) {
+ msleep(10);
+ count++;
+ } else if (IS_ERR(bdev)) {
+ break;
+ }
+ }
+
if (IS_ERR(bdev)) {
+ dprintf("failed open v->vdev_path=%s, error=%d count=%d\n",
+ v->vdev_path, -PTR_ERR(bdev), count);
kmem_free(vd, sizeof (vdev_disk_t));
- return (-PTR_ERR(bdev));
+ return (SET_ERROR(-PTR_ERR(bdev)));
}
v->vdev_tsd = vd;