Diffstat (limited to 'sys/dev/nvmf/host')
-rw-r--r--  sys/dev/nvmf/host/nvmf.c        | 640
-rw-r--r--  sys/dev/nvmf/host/nvmf_aer.c    |   2
-rw-r--r--  sys/dev/nvmf/host/nvmf_ctldev.c |  15
-rw-r--r--  sys/dev/nvmf/host/nvmf_ns.c     |  66
-rw-r--r--  sys/dev/nvmf/host/nvmf_qpair.c  |  88
-rw-r--r--  sys/dev/nvmf/host/nvmf_sim.c    |  29
-rw-r--r--  sys/dev/nvmf/host/nvmf_var.h    |  46
7 files changed, 693 insertions, 193 deletions
diff --git a/sys/dev/nvmf/host/nvmf.c b/sys/dev/nvmf/host/nvmf.c
index 0902bc78a7b5..1ac0d142443b 100644
--- a/sys/dev/nvmf/host/nvmf.c
+++ b/sys/dev/nvmf/host/nvmf.c
@@ -8,13 +8,18 @@
#include <sys/param.h>
#include <sys/bus.h>
#include <sys/conf.h>
+#include <sys/dnv.h>
+#include <sys/eventhandler.h>
#include <sys/lock.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/memdesc.h>
#include <sys/module.h>
#include <sys/mutex.h>
+#include <sys/nv.h>
+#include <sys/reboot.h>
#include <sys/sx.h>
+#include <sys/sysctl.h>
#include <sys/taskqueue.h>
#include <dev/nvme/nvme.h>
#include <dev/nvmf/nvmf.h>
@@ -22,10 +27,20 @@
#include <dev/nvmf/host/nvmf_var.h>
static struct cdevsw nvmf_cdevsw;
+static struct taskqueue *nvmf_tq;
+
+bool nvmf_fail_disconnect = false;
+SYSCTL_BOOL(_kern_nvmf, OID_AUTO, fail_on_disconnection, CTLFLAG_RWTUN,
+ &nvmf_fail_disconnect, 0, "Fail I/O requests on connection failure");
MALLOC_DEFINE(M_NVMF, "nvmf", "NVMe over Fabrics host");
+static void nvmf_controller_loss_task(void *arg, int pending);
static void nvmf_disconnect_task(void *arg, int pending);
+static void nvmf_request_reconnect(struct nvmf_softc *sc);
+static void nvmf_request_reconnect_task(void *arg, int pending);
+static void nvmf_shutdown_pre_sync(void *arg, int howto);
+static void nvmf_shutdown_post_sync(void *arg, int howto);
void
nvmf_complete(void *arg, const struct nvme_completion *cqe)
@@ -187,104 +202,132 @@ nvmf_send_keep_alive(void *arg)
}
int
-nvmf_init_ivars(struct nvmf_ivars *ivars, struct nvmf_handoff_host *hh)
+nvmf_copyin_handoff(const struct nvmf_ioc_nv *nv, nvlist_t **nvlp)
{
- size_t len;
- u_int i;
+ const struct nvme_discovery_log_entry *dle;
+ const struct nvme_controller_data *cdata;
+ const nvlist_t *const *io;
+ const nvlist_t *admin, *rparams;
+ nvlist_t *nvl;
+ size_t i, num_io_queues;
+ uint32_t qsize;
int error;
- memset(ivars, 0, sizeof(*ivars));
-
- if (!hh->admin.admin || hh->num_io_queues < 1)
- return (EINVAL);
-
- ivars->cdata = malloc(sizeof(*ivars->cdata), M_NVMF, M_WAITOK);
- error = copyin(hh->cdata, ivars->cdata, sizeof(*ivars->cdata));
+ error = nvmf_unpack_ioc_nvlist(nv, &nvl);
if (error != 0)
- goto out;
- nvme_controller_data_swapbytes(ivars->cdata);
+ return (error);
- len = hh->num_io_queues * sizeof(*ivars->io_params);
- ivars->io_params = malloc(len, M_NVMF, M_WAITOK);
- error = copyin(hh->io, ivars->io_params, len);
- if (error != 0)
- goto out;
- for (i = 0; i < hh->num_io_queues; i++) {
- if (ivars->io_params[i].admin) {
- error = EINVAL;
- goto out;
- }
+ if (!nvlist_exists_number(nvl, "trtype") ||
+ !nvlist_exists_nvlist(nvl, "admin") ||
+ !nvlist_exists_nvlist_array(nvl, "io") ||
+ !nvlist_exists_binary(nvl, "cdata") ||
+ !nvlist_exists_nvlist(nvl, "rparams"))
+ goto invalid;
+
+ rparams = nvlist_get_nvlist(nvl, "rparams");
+ if (!nvlist_exists_binary(rparams, "dle") ||
+ !nvlist_exists_string(rparams, "hostnqn") ||
+ !nvlist_exists_number(rparams, "num_io_queues") ||
+ !nvlist_exists_number(rparams, "io_qsize"))
+ goto invalid;
+
+ admin = nvlist_get_nvlist(nvl, "admin");
+ if (!nvmf_validate_qpair_nvlist(admin, false))
+ goto invalid;
+ if (!nvlist_get_bool(admin, "admin"))
+ goto invalid;
+
+ io = nvlist_get_nvlist_array(nvl, "io", &num_io_queues);
+ if (num_io_queues < 1 ||
+ num_io_queues != nvlist_get_number(rparams, "num_io_queues"))
+ goto invalid;
+ for (i = 0; i < num_io_queues; i++) {
+ if (!nvmf_validate_qpair_nvlist(io[i], false))
+ goto invalid;
+ }
- /* Require all I/O queues to be the same size. */
- if (ivars->io_params[i].qsize != ivars->io_params[0].qsize) {
- error = EINVAL;
- goto out;
- }
+ /* Require all I/O queues to be the same size. */
+ qsize = nvlist_get_number(rparams, "io_qsize");
+ for (i = 0; i < num_io_queues; i++) {
+ if (nvlist_get_number(io[i], "qsize") != qsize)
+ goto invalid;
}
- ivars->hh = hh;
- return (0);
+ cdata = nvlist_get_binary(nvl, "cdata", &i);
+ if (i != sizeof(*cdata))
+ goto invalid;
+ dle = nvlist_get_binary(rparams, "dle", &i);
+ if (i != sizeof(*dle))
+ goto invalid;
-out:
- free(ivars->io_params, M_NVMF);
- free(ivars->cdata, M_NVMF);
- return (error);
-}
+ if (memcmp(dle->subnqn, cdata->subnqn, sizeof(cdata->subnqn)) != 0)
+ goto invalid;
-void
-nvmf_free_ivars(struct nvmf_ivars *ivars)
-{
- free(ivars->io_params, M_NVMF);
- free(ivars->cdata, M_NVMF);
+ *nvlp = nvl;
+ return (0);
+invalid:
+ nvlist_destroy(nvl);
+ return (EINVAL);
}
static int
nvmf_probe(device_t dev)
{
- struct nvmf_ivars *ivars = device_get_ivars(dev);
- char desc[260];
+ const nvlist_t *nvl = device_get_ivars(dev);
+ const struct nvme_controller_data *cdata;
- if (ivars == NULL)
+ if (nvl == NULL)
return (ENXIO);
- snprintf(desc, sizeof(desc), "Fabrics: %.256s", ivars->cdata->subnqn);
- device_set_desc_copy(dev, desc);
+ cdata = nvlist_get_binary(nvl, "cdata", NULL);
+ device_set_descf(dev, "Fabrics: %.256s", cdata->subnqn);
return (BUS_PROBE_DEFAULT);
}
static int
-nvmf_establish_connection(struct nvmf_softc *sc, struct nvmf_ivars *ivars)
+nvmf_establish_connection(struct nvmf_softc *sc, nvlist_t *nvl)
{
+ const nvlist_t *const *io;
+ const nvlist_t *admin;
+ uint64_t kato;
+ size_t num_io_queues;
+ enum nvmf_trtype trtype;
char name[16];
+ trtype = nvlist_get_number(nvl, "trtype");
+ admin = nvlist_get_nvlist(nvl, "admin");
+ io = nvlist_get_nvlist_array(nvl, "io", &num_io_queues);
+ kato = dnvlist_get_number(nvl, "kato", 0);
+ sc->reconnect_delay = dnvlist_get_number(nvl, "reconnect_delay", 0);
+ sc->controller_loss_timeout = dnvlist_get_number(nvl,
+ "controller_loss_timeout", 0);
+
/* Setup the admin queue. */
- sc->admin = nvmf_init_qp(sc, ivars->hh->trtype, &ivars->hh->admin,
- "admin queue");
+ sc->admin = nvmf_init_qp(sc, trtype, admin, "admin queue", 0);
if (sc->admin == NULL) {
device_printf(sc->dev, "Failed to setup admin queue\n");
return (ENXIO);
}
/* Setup I/O queues. */
- sc->io = malloc(ivars->hh->num_io_queues * sizeof(*sc->io), M_NVMF,
+ sc->io = malloc(num_io_queues * sizeof(*sc->io), M_NVMF,
M_WAITOK | M_ZERO);
- sc->num_io_queues = ivars->hh->num_io_queues;
+ sc->num_io_queues = num_io_queues;
for (u_int i = 0; i < sc->num_io_queues; i++) {
snprintf(name, sizeof(name), "I/O queue %u", i);
- sc->io[i] = nvmf_init_qp(sc, ivars->hh->trtype,
- &ivars->io_params[i], name);
+ sc->io[i] = nvmf_init_qp(sc, trtype, io[i], name, i);
if (sc->io[i] == NULL) {
device_printf(sc->dev, "Failed to setup I/O queue %u\n",
- i + 1);
+ i);
return (ENXIO);
}
}
/* Start KeepAlive timers. */
- if (ivars->hh->kato != 0) {
+ if (kato != 0) {
sc->ka_traffic = NVMEV(NVME_CTRLR_DATA_CTRATT_TBKAS,
sc->cdata->ctratt) != 0;
- sc->ka_rx_sbt = mstosbt(ivars->hh->kato);
+ sc->ka_rx_sbt = mstosbt(kato);
sc->ka_tx_sbt = sc->ka_rx_sbt / 2;
callout_reset_sbt(&sc->ka_rx_timer, sc->ka_rx_sbt, 0,
nvmf_check_keep_alive, sc, C_HARDCLOCK);
@@ -292,12 +335,23 @@ nvmf_establish_connection(struct nvmf_softc *sc, struct nvmf_ivars *ivars)
nvmf_send_keep_alive, sc, C_HARDCLOCK);
}
+ memcpy(sc->cdata, nvlist_get_binary(nvl, "cdata", NULL),
+ sizeof(*sc->cdata));
+
+ /* Save reconnect parameters. */
+ nvlist_destroy(sc->rparams);
+ sc->rparams = nvlist_take_nvlist(nvl, "rparams");
+
return (0);
}
+typedef bool nvmf_scan_active_ns_cb(struct nvmf_softc *, uint32_t,
+ const struct nvme_namespace_data *, void *);
+
static bool
-nvmf_scan_nslist(struct nvmf_softc *sc, struct nvme_ns_list *nslist,
- struct nvme_namespace_data *data, uint32_t *nsidp)
+nvmf_scan_active_nslist(struct nvmf_softc *sc, struct nvme_ns_list *nslist,
+ struct nvme_namespace_data *data, uint32_t *nsidp,
+ nvmf_scan_active_ns_cb *cb, void *cb_arg)
{
struct nvmf_completion_status status;
uint32_t nsid;
@@ -333,13 +387,6 @@ nvmf_scan_nslist(struct nvmf_softc *sc, struct nvme_ns_list *nslist,
return (true);
}
- if (sc->ns[nsid - 1] != NULL) {
- device_printf(sc->dev,
- "duplicate namespace %u in active namespace list\n",
- nsid);
- return (false);
- }
-
nvmf_status_init(&status);
nvmf_status_wait_io(&status);
if (!nvmf_cmd_identify_namespace(sc, nsid, data, nvmf_complete,
@@ -365,49 +412,37 @@ nvmf_scan_nslist(struct nvmf_softc *sc, struct nvme_ns_list *nslist,
return (false);
}
- /*
- * As in nvme_ns_construct, a size of zero indicates an
- * invalid namespace.
- */
nvme_namespace_data_swapbytes(data);
- if (data->nsze == 0) {
- device_printf(sc->dev,
- "ignoring active namespace %u with zero size\n",
- nsid);
- continue;
- }
-
- sc->ns[nsid - 1] = nvmf_init_ns(sc, nsid, data);
-
- nvmf_sim_rescan_ns(sc, nsid);
+ if (!cb(sc, nsid, data, cb_arg))
+ return (false);
}
MPASS(nsid == nslist->ns[nitems(nslist->ns) - 1] && nsid != 0);
- if (nsid >= 0xfffffffd)
+ if (nsid >= NVME_GLOBAL_NAMESPACE_TAG - 1)
*nsidp = 0;
else
- *nsidp = nsid + 1;
+ *nsidp = nsid;
return (true);
}
static bool
-nvmf_add_namespaces(struct nvmf_softc *sc)
+nvmf_scan_active_namespaces(struct nvmf_softc *sc, nvmf_scan_active_ns_cb *cb,
+ void *cb_arg)
{
struct nvme_namespace_data *data;
struct nvme_ns_list *nslist;
uint32_t nsid;
bool retval;
- sc->ns = mallocarray(sc->cdata->nn, sizeof(*sc->ns), M_NVMF,
- M_WAITOK | M_ZERO);
nslist = malloc(sizeof(*nslist), M_NVMF, M_WAITOK);
data = malloc(sizeof(*data), M_NVMF, M_WAITOK);
nsid = 0;
retval = true;
for (;;) {
- if (!nvmf_scan_nslist(sc, nslist, data, &nsid)) {
+ if (!nvmf_scan_active_nslist(sc, nslist, data, &nsid, cb,
+ cb_arg)) {
retval = false;
break;
}
@@ -420,36 +455,77 @@ nvmf_add_namespaces(struct nvmf_softc *sc)
return (retval);
}
+static bool
+nvmf_add_ns(struct nvmf_softc *sc, uint32_t nsid,
+ const struct nvme_namespace_data *data, void *arg __unused)
+{
+ if (sc->ns[nsid - 1] != NULL) {
+ device_printf(sc->dev,
+ "duplicate namespace %u in active namespace list\n",
+ nsid);
+ return (false);
+ }
+
+ /*
+ * As in nvme_ns_construct, a size of zero indicates an
+ * invalid namespace.
+ */
+ if (data->nsze == 0) {
+ device_printf(sc->dev,
+ "ignoring active namespace %u with zero size\n", nsid);
+ return (true);
+ }
+
+ sc->ns[nsid - 1] = nvmf_init_ns(sc, nsid, data);
+
+ nvmf_sim_rescan_ns(sc, nsid);
+ return (true);
+}
+
+static bool
+nvmf_add_namespaces(struct nvmf_softc *sc)
+{
+ sc->ns = mallocarray(sc->cdata->nn, sizeof(*sc->ns), M_NVMF,
+ M_WAITOK | M_ZERO);
+ return (nvmf_scan_active_namespaces(sc, nvmf_add_ns, NULL));
+}
+
static int
nvmf_attach(device_t dev)
{
struct make_dev_args mda;
struct nvmf_softc *sc = device_get_softc(dev);
- struct nvmf_ivars *ivars = device_get_ivars(dev);
+ nvlist_t *nvl = device_get_ivars(dev);
+ const nvlist_t * const *io;
+ struct sysctl_oid *oid;
uint64_t val;
u_int i;
int error;
- if (ivars == NULL)
+ if (nvl == NULL)
return (ENXIO);
sc->dev = dev;
- sc->trtype = ivars->hh->trtype;
+ sc->trtype = nvlist_get_number(nvl, "trtype");
callout_init(&sc->ka_rx_timer, 1);
callout_init(&sc->ka_tx_timer, 1);
sx_init(&sc->connection_lock, "nvmf connection");
TASK_INIT(&sc->disconnect_task, 0, nvmf_disconnect_task, sc);
+ TIMEOUT_TASK_INIT(nvmf_tq, &sc->controller_loss_task, 0,
+ nvmf_controller_loss_task, sc);
+ TIMEOUT_TASK_INIT(nvmf_tq, &sc->request_reconnect_task, 0,
+ nvmf_request_reconnect_task, sc);
- /* Claim the cdata pointer from ivars. */
- sc->cdata = ivars->cdata;
- ivars->cdata = NULL;
+ oid = SYSCTL_ADD_NODE(device_get_sysctl_ctx(dev),
+ SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "ioq",
+ CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "I/O Queues");
+ sc->ioq_oid_list = SYSCTL_CHILDREN(oid);
- nvmf_init_aer(sc);
+ sc->cdata = malloc(sizeof(*sc->cdata), M_NVMF, M_WAITOK);
- /* TODO: Multiqueue support. */
- sc->max_pending_io = ivars->io_params[0].qsize /* * sc->num_io_queues */;
+ nvmf_init_aer(sc);
- error = nvmf_establish_connection(sc, ivars);
+ error = nvmf_establish_connection(sc, nvl);
if (error != 0)
goto out;
@@ -476,6 +552,10 @@ nvmf_attach(device_t dev)
NVME_CAP_HI_MPSMIN(sc->cap >> 32)));
}
+ io = nvlist_get_nvlist_array(nvl, "io", NULL);
+ sc->max_pending_io = nvlist_get_number(io[0], "qsize") *
+ sc->num_io_queues;
+
error = nvmf_init_sim(sc);
if (error != 0)
goto out;
@@ -503,6 +583,11 @@ nvmf_attach(device_t dev)
goto out;
}
+ sc->shutdown_pre_sync_eh = EVENTHANDLER_REGISTER(shutdown_pre_sync,
+ nvmf_shutdown_pre_sync, sc, SHUTDOWN_PRI_FIRST);
+ sc->shutdown_post_sync_eh = EVENTHANDLER_REGISTER(shutdown_post_sync,
+ nvmf_shutdown_post_sync, sc, SHUTDOWN_PRI_LAST);
+
return (0);
out:
if (sc->ns != NULL) {
@@ -529,8 +614,11 @@ out:
nvmf_destroy_aer(sc);
- taskqueue_drain(taskqueue_thread, &sc->disconnect_task);
+ taskqueue_drain_timeout(nvmf_tq, &sc->request_reconnect_task);
+ taskqueue_drain_timeout(nvmf_tq, &sc->controller_loss_task);
+ taskqueue_drain(nvmf_tq, &sc->disconnect_task);
sx_destroy(&sc->connection_lock);
+ nvlist_destroy(sc->rparams);
free(sc->cdata, M_NVMF);
return (error);
}
@@ -538,7 +626,7 @@ out:
void
nvmf_disconnect(struct nvmf_softc *sc)
{
- taskqueue_enqueue(taskqueue_thread, &sc->disconnect_task);
+ taskqueue_enqueue(nvmf_tq, &sc->disconnect_task);
}
static void
@@ -579,6 +667,7 @@ nvmf_disconnect_task(void *arg, int pending __unused)
return;
}
+ nanotime(&sc->last_disconnect);
callout_drain(&sc->ka_tx_timer);
callout_drain(&sc->ka_rx_timer);
sc->ka_traffic = false;
@@ -600,29 +689,98 @@ nvmf_disconnect_task(void *arg, int pending __unused)
nvmf_destroy_qp(sc->admin);
sc->admin = NULL;
+ if (sc->reconnect_delay != 0)
+ nvmf_request_reconnect(sc);
+ if (sc->controller_loss_timeout != 0)
+ taskqueue_enqueue_timeout(nvmf_tq,
+ &sc->controller_loss_task, sc->controller_loss_timeout *
+ hz);
+
+ sx_xunlock(&sc->connection_lock);
+}
+
+static void
+nvmf_controller_loss_task(void *arg, int pending)
+{
+ struct nvmf_softc *sc = arg;
+ device_t dev;
+ int error;
+
+ bus_topo_lock();
+ sx_xlock(&sc->connection_lock);
+ if (sc->admin != NULL || sc->detaching) {
+ /* Reconnected or already detaching. */
+ sx_xunlock(&sc->connection_lock);
+ bus_topo_unlock();
+ return;
+ }
+
+ sc->controller_timedout = true;
+ sx_xunlock(&sc->connection_lock);
+
+ /*
+ * XXX: Doing this from here is a bit ugly. We don't have an
+ * extra reference on `dev` but bus_topo_lock should block any
+ * concurrent device_delete_child invocations.
+ */
+ dev = sc->dev;
+ error = device_delete_child(root_bus, dev);
+ if (error != 0)
+ device_printf(dev,
+ "failed to detach after controller loss: %d\n", error);
+ bus_topo_unlock();
+}
+
+static void
+nvmf_request_reconnect(struct nvmf_softc *sc)
+{
+ char buf[64];
+
+ sx_assert(&sc->connection_lock, SX_LOCKED);
+
+ snprintf(buf, sizeof(buf), "name=\"%s\"", device_get_nameunit(sc->dev));
+ devctl_notify("nvme", "controller", "RECONNECT", buf);
+ taskqueue_enqueue_timeout(nvmf_tq, &sc->request_reconnect_task,
+ sc->reconnect_delay * hz);
+}
+
+static void
+nvmf_request_reconnect_task(void *arg, int pending)
+{
+ struct nvmf_softc *sc = arg;
+
+ sx_xlock(&sc->connection_lock);
+ if (sc->admin != NULL || sc->detaching || sc->controller_timedout) {
+ /* Reconnected or already detaching. */
+ sx_xunlock(&sc->connection_lock);
+ return;
+ }
+
+ nvmf_request_reconnect(sc);
sx_xunlock(&sc->connection_lock);
}
static int
-nvmf_reconnect_host(struct nvmf_softc *sc, struct nvmf_handoff_host *hh)
+nvmf_reconnect_host(struct nvmf_softc *sc, struct nvmf_ioc_nv *nv)
{
- struct nvmf_ivars ivars;
+ const struct nvme_controller_data *cdata;
+ nvlist_t *nvl;
u_int i;
int error;
+ error = nvmf_copyin_handoff(nv, &nvl);
+ if (error != 0)
+ return (error);
+
/* XXX: Should we permit changing the transport type? */
- if (sc->trtype != hh->trtype) {
+ if (sc->trtype != nvlist_get_number(nvl, "trtype")) {
device_printf(sc->dev,
"transport type mismatch on reconnect\n");
return (EINVAL);
}
- error = nvmf_init_ivars(&ivars, hh);
- if (error != 0)
- return (error);
-
sx_xlock(&sc->connection_lock);
- if (sc->admin != NULL || sc->detaching) {
+ if (sc->admin != NULL || sc->detaching || sc->controller_timedout) {
error = EBUSY;
goto out;
}
@@ -634,8 +792,9 @@ nvmf_reconnect_host(struct nvmf_softc *sc, struct nvmf_handoff_host *hh)
* ensures the new association is connected to the same NVMe
* subsystem.
*/
- if (memcmp(sc->cdata->subnqn, ivars.cdata->subnqn,
- sizeof(ivars.cdata->subnqn)) != 0) {
+ cdata = nvlist_get_binary(nvl, "cdata", NULL);
+ if (memcmp(sc->cdata->subnqn, cdata->subnqn,
+ sizeof(cdata->subnqn)) != 0) {
device_printf(sc->dev,
"controller subsystem NQN mismatch on reconnect\n");
error = EINVAL;
@@ -647,7 +806,7 @@ nvmf_reconnect_host(struct nvmf_softc *sc, struct nvmf_handoff_host *hh)
* max_pending_io is still correct?
*/
- error = nvmf_establish_connection(sc, &ivars);
+ error = nvmf_establish_connection(sc, nvl);
if (error != 0)
goto out;
@@ -665,12 +824,85 @@ nvmf_reconnect_host(struct nvmf_softc *sc, struct nvmf_handoff_host *hh)
nvmf_reconnect_ns(sc->ns[i]);
}
nvmf_reconnect_sim(sc);
+
+ nvmf_rescan_all_ns(sc);
+
+ taskqueue_cancel_timeout(nvmf_tq, &sc->request_reconnect_task, NULL);
+ taskqueue_cancel_timeout(nvmf_tq, &sc->controller_loss_task, NULL);
out:
sx_xunlock(&sc->connection_lock);
- nvmf_free_ivars(&ivars);
+ nvlist_destroy(nvl);
return (error);
}
+static void
+nvmf_shutdown_pre_sync(void *arg, int howto)
+{
+ struct nvmf_softc *sc = arg;
+
+ if ((howto & RB_NOSYNC) != 0 || SCHEDULER_STOPPED())
+ return;
+
+ /*
+ * If this association is disconnected, abort any pending
+ * requests with an error to permit filesystems to unmount
+ * without hanging.
+ */
+ sx_xlock(&sc->connection_lock);
+ if (sc->admin != NULL || sc->detaching) {
+ sx_xunlock(&sc->connection_lock);
+ return;
+ }
+
+ for (u_int i = 0; i < sc->cdata->nn; i++) {
+ if (sc->ns[i] != NULL)
+ nvmf_shutdown_ns(sc->ns[i]);
+ }
+ nvmf_shutdown_sim(sc);
+ sx_xunlock(&sc->connection_lock);
+}
+
+static void
+nvmf_shutdown_post_sync(void *arg, int howto)
+{
+ struct nvmf_softc *sc = arg;
+
+ if ((howto & RB_NOSYNC) != 0 || SCHEDULER_STOPPED())
+ return;
+
+ /*
+ * If this association is connected, disconnect gracefully.
+ */
+ sx_xlock(&sc->connection_lock);
+ if (sc->admin == NULL || sc->detaching) {
+ sx_xunlock(&sc->connection_lock);
+ return;
+ }
+
+ callout_drain(&sc->ka_tx_timer);
+ callout_drain(&sc->ka_rx_timer);
+
+ nvmf_shutdown_controller(sc);
+
+ /*
+ * Quiesce consumers so that any commands submitted after this
+ * fail with an error. Notably, nda(4) calls nda_flush() from
+ * a post_sync handler that might be ordered after this one.
+ */
+ for (u_int i = 0; i < sc->cdata->nn; i++) {
+ if (sc->ns[i] != NULL)
+ nvmf_shutdown_ns(sc->ns[i]);
+ }
+ nvmf_shutdown_sim(sc);
+
+ for (u_int i = 0; i < sc->num_io_queues; i++) {
+ nvmf_destroy_qp(sc->io[i]);
+ }
+ nvmf_destroy_qp(sc->admin);
+ sc->admin = NULL;
+ sx_xunlock(&sc->connection_lock);
+}
+
static int
nvmf_detach(device_t dev)
{
@@ -683,6 +915,9 @@ nvmf_detach(device_t dev)
sc->detaching = true;
sx_xunlock(&sc->connection_lock);
+ EVENTHANDLER_DEREGISTER(shutdown_pre_sync, sc->shutdown_pre_sync_eh);
+ EVENTHANDLER_DEREGISTER(shutdown_post_sync, sc->shutdown_post_sync_eh);
+
nvmf_destroy_sim(sc);
for (i = 0; i < sc->cdata->nn; i++) {
if (sc->ns[i] != NULL)
@@ -701,7 +936,21 @@ nvmf_detach(device_t dev)
}
free(sc->io, M_NVMF);
- taskqueue_drain(taskqueue_thread, &sc->disconnect_task);
+ taskqueue_drain(nvmf_tq, &sc->disconnect_task);
+ if (taskqueue_cancel_timeout(nvmf_tq, &sc->request_reconnect_task,
+ NULL) != 0)
+ taskqueue_drain_timeout(nvmf_tq, &sc->request_reconnect_task);
+
+ /*
+ * Don't cancel/drain the controller loss task if that task
+ * has fired and is triggering the detach.
+ */
+ if (!sc->controller_timedout) {
+ if (taskqueue_cancel_timeout(nvmf_tq, &sc->controller_loss_task,
+ NULL) != 0)
+ taskqueue_drain_timeout(nvmf_tq,
+ &sc->controller_loss_task);
+ }
if (sc->admin != NULL)
nvmf_destroy_qp(sc->admin);
@@ -709,16 +958,45 @@ nvmf_detach(device_t dev)
nvmf_destroy_aer(sc);
sx_destroy(&sc->connection_lock);
+ nvlist_destroy(sc->rparams);
free(sc->cdata, M_NVMF);
return (0);
}
+static void
+nvmf_rescan_ns_1(struct nvmf_softc *sc, uint32_t nsid,
+ const struct nvme_namespace_data *data)
+{
+ struct nvmf_namespace *ns;
+
+ /* XXX: Needs locking around sc->ns[]. */
+ ns = sc->ns[nsid - 1];
+ if (data->nsze == 0) {
+ /* XXX: Needs locking */
+ if (ns != NULL) {
+ nvmf_destroy_ns(ns);
+ sc->ns[nsid - 1] = NULL;
+ }
+ } else {
+ /* XXX: Needs locking */
+ if (ns == NULL) {
+ sc->ns[nsid - 1] = nvmf_init_ns(sc, nsid, data);
+ } else {
+ if (!nvmf_update_ns(ns, data)) {
+ nvmf_destroy_ns(ns);
+ sc->ns[nsid - 1] = NULL;
+ }
+ }
+ }
+
+ nvmf_sim_rescan_ns(sc, nsid);
+}
+
void
nvmf_rescan_ns(struct nvmf_softc *sc, uint32_t nsid)
{
struct nvmf_completion_status status;
struct nvme_namespace_data *data;
- struct nvmf_namespace *ns;
data = malloc(sizeof(*data), M_NVMF, M_WAITOK);
@@ -751,29 +1029,58 @@ nvmf_rescan_ns(struct nvmf_softc *sc, uint32_t nsid)
nvme_namespace_data_swapbytes(data);
- /* XXX: Needs locking around sc->ns[]. */
- ns = sc->ns[nsid - 1];
- if (data->nsze == 0) {
- /* XXX: Needs locking */
+ nvmf_rescan_ns_1(sc, nsid, data);
+
+ free(data, M_NVMF);
+}
+
+static void
+nvmf_purge_namespaces(struct nvmf_softc *sc, uint32_t first_nsid,
+ uint32_t next_valid_nsid)
+{
+ struct nvmf_namespace *ns;
+
+ for (uint32_t nsid = first_nsid; nsid < next_valid_nsid; nsid++) {
+ /* XXX: Needs locking around sc->ns[]. */
+ ns = sc->ns[nsid - 1];
if (ns != NULL) {
nvmf_destroy_ns(ns);
sc->ns[nsid - 1] = NULL;
- }
- } else {
- /* XXX: Needs locking */
- if (ns == NULL) {
- sc->ns[nsid - 1] = nvmf_init_ns(sc, nsid, data);
- } else {
- if (!nvmf_update_ns(ns, data)) {
- nvmf_destroy_ns(ns);
- sc->ns[nsid - 1] = NULL;
- }
+
+ nvmf_sim_rescan_ns(sc, nsid);
}
}
+}
- free(data, M_NVMF);
+static bool
+nvmf_rescan_ns_cb(struct nvmf_softc *sc, uint32_t nsid,
+ const struct nvme_namespace_data *data, void *arg)
+{
+ uint32_t *last_nsid = arg;
- nvmf_sim_rescan_ns(sc, nsid);
+ /* Check for any gaps prior to this namespace. */
+ nvmf_purge_namespaces(sc, *last_nsid + 1, nsid);
+ *last_nsid = nsid;
+
+ nvmf_rescan_ns_1(sc, nsid, data);
+ return (true);
+}
+
+void
+nvmf_rescan_all_ns(struct nvmf_softc *sc)
+{
+ uint32_t last_nsid;
+
+ last_nsid = 0;
+ if (!nvmf_scan_active_namespaces(sc, nvmf_rescan_ns_cb, &last_nsid))
+ return;
+
+ /*
+ * Check for any namespace devices after the last active
+ * namespace.
+ */
+ nvmf_purge_namespaces(sc, last_nsid + 1, sc->cdata->nn + 1);
}
int
@@ -822,12 +1129,21 @@ nvmf_passthrough_cmd(struct nvmf_softc *sc, struct nvme_pt_command *pt,
cmd.cdw14 = pt->cmd.cdw14;
cmd.cdw15 = pt->cmd.cdw15;
+ sx_slock(&sc->connection_lock);
+ if (sc->admin == NULL || sc->detaching) {
+ device_printf(sc->dev,
+ "failed to send passthrough command\n");
+ error = ECONNABORTED;
+ sx_sunlock(&sc->connection_lock);
+ goto error;
+ }
if (admin)
qp = sc->admin;
else
qp = nvmf_select_io_queue(sc);
nvmf_status_init(&status);
req = nvmf_allocate_request(qp, &cmd, nvmf_complete, &status, M_WAITOK);
+ sx_sunlock(&sc->connection_lock);
if (req == NULL) {
device_printf(sc->dev, "failed to send passthrough command\n");
error = ECONNABORTED;
@@ -857,14 +1173,46 @@ error:
}
static int
+nvmf_reconnect_params(struct nvmf_softc *sc, struct nvmf_ioc_nv *nv)
+{
+ int error;
+
+ sx_slock(&sc->connection_lock);
+ error = nvmf_pack_ioc_nvlist(sc->rparams, nv);
+ sx_sunlock(&sc->connection_lock);
+
+ return (error);
+}
+
+static int
+nvmf_connection_status(struct nvmf_softc *sc, struct nvmf_ioc_nv *nv)
+{
+ nvlist_t *nvl, *nvl_ts;
+ int error;
+
+ nvl = nvlist_create(0);
+ nvl_ts = nvlist_create(0);
+
+ sx_slock(&sc->connection_lock);
+ nvlist_add_bool(nvl, "connected", sc->admin != NULL);
+ nvlist_add_number(nvl_ts, "tv_sec", sc->last_disconnect.tv_sec);
+ nvlist_add_number(nvl_ts, "tv_nsec", sc->last_disconnect.tv_nsec);
+ sx_sunlock(&sc->connection_lock);
+ nvlist_move_nvlist(nvl, "last_disconnect", nvl_ts);
+
+ error = nvmf_pack_ioc_nvlist(nvl, nv);
+ nvlist_destroy(nvl);
+ return (error);
+}
+
+static int
nvmf_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int flag,
struct thread *td)
{
struct nvmf_softc *sc = cdev->si_drv1;
struct nvme_get_nsid *gnsid;
struct nvme_pt_command *pt;
- struct nvmf_reconnect_params *rp;
- struct nvmf_handoff_host *hh;
+ struct nvmf_ioc_nv *nv;
switch (cmd) {
case NVME_PASSTHROUGH_CMD:
@@ -872,25 +1220,25 @@ nvmf_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int flag,
return (nvmf_passthrough_cmd(sc, pt, true));
case NVME_GET_NSID:
gnsid = (struct nvme_get_nsid *)arg;
- strncpy(gnsid->cdev, device_get_nameunit(sc->dev),
+ strlcpy(gnsid->cdev, device_get_nameunit(sc->dev),
sizeof(gnsid->cdev));
- gnsid->cdev[sizeof(gnsid->cdev) - 1] = '\0';
gnsid->nsid = 0;
return (0);
case NVME_GET_MAX_XFER_SIZE:
*(uint64_t *)arg = sc->max_xfer_size;
return (0);
- case NVMF_RECONNECT_PARAMS:
- rp = (struct nvmf_reconnect_params *)arg;
- if ((sc->cdata->fcatt & 1) == 0)
- rp->cntlid = NVMF_CNTLID_DYNAMIC;
- else
- rp->cntlid = sc->cdata->ctrlr_id;
- memcpy(rp->subnqn, sc->cdata->subnqn, sizeof(rp->subnqn));
+ case NVME_GET_CONTROLLER_DATA:
+ memcpy(arg, sc->cdata, sizeof(*sc->cdata));
return (0);
+ case NVMF_RECONNECT_PARAMS:
+ nv = (struct nvmf_ioc_nv *)arg;
+ return (nvmf_reconnect_params(sc, nv));
case NVMF_RECONNECT_HOST:
- hh = (struct nvmf_handoff_host *)arg;
- return (nvmf_reconnect_host(sc, hh));
+ nv = (struct nvmf_ioc_nv *)arg;
+ return (nvmf_reconnect_host(sc, nv));
+ case NVMF_CONNECTION_STATUS:
+ nv = (struct nvmf_ioc_nv *)arg;
+ return (nvmf_connection_status(sc, nv));
default:
return (ENOTTY);
}
@@ -904,14 +1252,25 @@ static struct cdevsw nvmf_cdevsw = {
static int
nvmf_modevent(module_t mod, int what, void *arg)
{
+ int error;
+
switch (what) {
case MOD_LOAD:
- return (nvmf_ctl_load());
+ error = nvmf_ctl_load();
+ if (error != 0)
+ return (error);
+
+ nvmf_tq = taskqueue_create("nvmf", M_WAITOK | M_ZERO,
+ taskqueue_thread_enqueue, &nvmf_tq);
+ taskqueue_start_threads(&nvmf_tq, 1, PWAIT, "nvmf taskq");
+ return (0);
case MOD_QUIESCE:
return (0);
case MOD_UNLOAD:
nvmf_ctl_unload();
destroy_dev_drain(&nvmf_cdevsw);
+ if (nvmf_tq != NULL)
+ taskqueue_free(nvmf_tq);
return (0);
default:
return (EOPNOTSUPP);
@@ -923,9 +1282,6 @@ static device_method_t nvmf_methods[] = {
DEVMETHOD(device_probe, nvmf_probe),
DEVMETHOD(device_attach, nvmf_attach),
DEVMETHOD(device_detach, nvmf_detach),
-#if 0
- DEVMETHOD(device_shutdown, nvmf_shutdown),
-#endif
DEVMETHOD_END
};
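
[Note: the handoff path above replaces the fixed struct nvmf_handoff_host with a packed nvlist that is validated key by key before any value is extracted. A minimal userland sketch of that validate-then-extract libnv pattern follows; it is illustrative only, not the kernel code — the keys mirror the diff, the values are made up, and it builds on FreeBSD with cc -lnv.]

#include <sys/nv.h>
#include <err.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	nvlist_t *nvl;

	/* Build an nvlist resembling a handoff; keys mirror the diff. */
	nvl = nvlist_create(0);
	nvlist_add_number(nvl, "trtype", 3);
	nvlist_add_number(nvl, "num_io_queues", 4);

	/*
	 * Validate before extracting, as nvmf_copyin_handoff() does:
	 * nvlist_get_*() asserts that the key exists, so the
	 * nvlist_exists_*() checks must come first.
	 */
	if (!nvlist_exists_number(nvl, "trtype") ||
	    !nvlist_exists_number(nvl, "num_io_queues")) {
		nvlist_destroy(nvl);
		errx(1, "invalid handoff nvlist");
	}

	printf("trtype %ju, %ju I/O queues\n",
	    (uintmax_t)nvlist_get_number(nvl, "trtype"),
	    (uintmax_t)nvlist_get_number(nvl, "num_io_queues"));

	nvlist_destroy(nvl);
	return (0);
}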
diff --git a/sys/dev/nvmf/host/nvmf_aer.c b/sys/dev/nvmf/host/nvmf_aer.c
index 4c950f1518d0..2f7f177d0421 100644
--- a/sys/dev/nvmf/host/nvmf_aer.c
+++ b/sys/dev/nvmf/host/nvmf_aer.c
@@ -62,7 +62,7 @@ nvmf_handle_changed_namespaces(struct nvmf_softc *sc,
* probably just rescan the entire set of namespaces.
*/
if (ns_list->ns[0] == 0xffffffff) {
- device_printf(sc->dev, "too many changed namespaces\n");
+ nvmf_rescan_all_ns(sc);
return;
}
diff --git a/sys/dev/nvmf/host/nvmf_ctldev.c b/sys/dev/nvmf/host/nvmf_ctldev.c
index f40005a2a666..275d5e9c932a 100644
--- a/sys/dev/nvmf/host/nvmf_ctldev.c
+++ b/sys/dev/nvmf/host/nvmf_ctldev.c
@@ -9,6 +9,7 @@
#include <sys/bus.h>
#include <sys/conf.h>
#include <sys/malloc.h>
+#include <sys/nv.h>
#include <dev/nvme/nvme.h>
#include <dev/nvmf/nvmf.h>
#include <dev/nvmf/nvmf_transport.h>
@@ -17,25 +18,25 @@
static struct cdev *nvmf_cdev;
static int
-nvmf_handoff_host(struct nvmf_handoff_host *hh)
+nvmf_handoff_host(struct nvmf_ioc_nv *nv)
{
- struct nvmf_ivars ivars;
+ nvlist_t *nvl;
device_t dev;
int error;
- error = nvmf_init_ivars(&ivars, hh);
+ error = nvmf_copyin_handoff(nv, &nvl);
if (error != 0)
return (error);
bus_topo_lock();
- dev = device_add_child(root_bus, "nvme", -1);
+ dev = device_add_child(root_bus, "nvme", DEVICE_UNIT_ANY);
if (dev == NULL) {
bus_topo_unlock();
error = ENXIO;
goto out;
}
- device_set_ivars(dev, &ivars);
+ device_set_ivars(dev, nvl);
error = device_probe_and_attach(dev);
device_set_ivars(dev, NULL);
if (error != 0)
@@ -43,7 +44,7 @@ nvmf_handoff_host(struct nvmf_handoff_host *hh)
bus_topo_unlock();
out:
- nvmf_free_ivars(&ivars);
+ nvlist_destroy(nvl);
return (error);
}
@@ -117,7 +118,7 @@ nvmf_ctl_ioctl(struct cdev *dev, u_long cmd, caddr_t arg, int flag,
{
switch (cmd) {
case NVMF_HANDOFF_HOST:
- return (nvmf_handoff_host((struct nvmf_handoff_host *)arg));
+ return (nvmf_handoff_host((struct nvmf_ioc_nv *)arg));
case NVMF_DISCONNECT_HOST:
return (nvmf_disconnect_host((const char **)arg));
case NVMF_DISCONNECT_ALL:
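
[Note: NVMF_HANDOFF_HOST now carries a packed nvlist across the ioctl boundary, which the kernel unpacks in nvmf_copyin_handoff(). A sketch of the same serialization round trip in userland, assuming only the documented nvlist_pack(3)/nvlist_unpack(3) API; the key and value are invented for illustration.]

#include <sys/nv.h>
#include <err.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	nvlist_t *nvl, *copy;
	void *buf;
	size_t len;

	nvl = nvlist_create(0);
	nvlist_add_string(nvl, "hostnqn", "nqn.2014-08.org.example:host");

	/* Serialize, as a userland handoff would before the ioctl. */
	buf = nvlist_pack(nvl, &len);
	if (buf == NULL)
		err(1, "nvlist_pack");

	/* Deserialize, as the kernel side does after copyin. */
	copy = nvlist_unpack(buf, len, 0);
	if (copy == NULL)
		err(1, "nvlist_unpack");

	printf("hostnqn: %s\n", nvlist_get_string(copy, "hostnqn"));

	free(buf);
	nvlist_destroy(copy);
	nvlist_destroy(nvl);
	return (0);
}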
diff --git a/sys/dev/nvmf/host/nvmf_ns.c b/sys/dev/nvmf/host/nvmf_ns.c
index 3ce434bf7c50..4215c8295d2e 100644
--- a/sys/dev/nvmf/host/nvmf_ns.c
+++ b/sys/dev/nvmf/host/nvmf_ns.c
@@ -18,7 +18,7 @@
#include <sys/proc.h>
#include <sys/refcount.h>
#include <sys/sbuf.h>
-#include <machine/stdarg.h>
+#include <sys/stdarg.h>
#include <dev/nvme/nvme.h>
#include <dev/nvmf/host/nvmf_var.h>
@@ -29,6 +29,7 @@ struct nvmf_namespace {
u_int flags;
uint32_t lba_size;
bool disconnected;
+ bool shutdown;
TAILQ_HEAD(, bio) pending_bios;
struct mtx lock;
@@ -49,7 +50,7 @@ ns_printf(struct nvmf_namespace *ns, const char *fmt, ...)
sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN);
sbuf_set_drain(&sb, sbuf_printf_drain, NULL);
- sbuf_printf(&sb, "%sns%u: ", device_get_nameunit(ns->sc->dev),
+ sbuf_printf(&sb, "%sn%u: ", device_get_nameunit(ns->sc->dev),
ns->id);
va_start(ap, fmt);
@@ -84,13 +85,22 @@ nvmf_ns_biodone(struct bio *bio)
ns = bio->bio_dev->si_drv1;
/* If a request is aborted, resubmit or queue it for resubmission. */
- if (bio->bio_error == ECONNABORTED) {
+ if (bio->bio_error == ECONNABORTED && !nvmf_fail_disconnect) {
bio->bio_error = 0;
bio->bio_driver2 = 0;
mtx_lock(&ns->lock);
if (ns->disconnected) {
- TAILQ_INSERT_TAIL(&ns->pending_bios, bio, bio_queue);
- mtx_unlock(&ns->lock);
+ if (nvmf_fail_disconnect || ns->shutdown) {
+ mtx_unlock(&ns->lock);
+ bio->bio_error = ECONNABORTED;
+ bio->bio_flags |= BIO_ERROR;
+ bio->bio_resid = bio->bio_bcount;
+ biodone(bio);
+ } else {
+ TAILQ_INSERT_TAIL(&ns->pending_bios, bio,
+ bio_queue);
+ mtx_unlock(&ns->lock);
+ }
} else {
mtx_unlock(&ns->lock);
nvmf_ns_strategy(bio);
@@ -163,6 +173,7 @@ nvmf_ns_submit_bio(struct nvmf_namespace *ns, struct bio *bio)
struct nvme_dsm_range *dsm_range;
struct memdesc mem;
uint64_t lba, lba_count;
+ int error;
dsm_range = NULL;
memset(&cmd, 0, sizeof(cmd));
@@ -201,10 +212,15 @@ nvmf_ns_submit_bio(struct nvmf_namespace *ns, struct bio *bio)
mtx_lock(&ns->lock);
if (ns->disconnected) {
- TAILQ_INSERT_TAIL(&ns->pending_bios, bio, bio_queue);
+ if (nvmf_fail_disconnect || ns->shutdown) {
+ error = ECONNABORTED;
+ } else {
+ TAILQ_INSERT_TAIL(&ns->pending_bios, bio, bio_queue);
+ error = 0;
+ }
mtx_unlock(&ns->lock);
free(dsm_range, M_NVMF);
- return (0);
+ return (error);
}
req = nvmf_allocate_request(nvmf_select_io_queue(ns->sc), &cmd,
@@ -258,9 +274,8 @@ nvmf_ns_ioctl(struct cdev *dev, u_long cmd, caddr_t arg, int flag,
return (nvmf_passthrough_cmd(ns->sc, pt, false));
case NVME_GET_NSID:
gnsid = (struct nvme_get_nsid *)arg;
- strncpy(gnsid->cdev, device_get_nameunit(ns->sc->dev),
+ strlcpy(gnsid->cdev, device_get_nameunit(ns->sc->dev),
sizeof(gnsid->cdev));
- gnsid->cdev[sizeof(gnsid->cdev) - 1] = '\0';
gnsid->nsid = ns->id;
return (0);
case DIOCGMEDIASIZE:
@@ -314,7 +329,7 @@ static struct cdevsw nvmf_ns_cdevsw = {
struct nvmf_namespace *
nvmf_init_ns(struct nvmf_softc *sc, uint32_t id,
- struct nvme_namespace_data *data)
+ const struct nvme_namespace_data *data)
{
struct make_dev_args mda;
struct nvmf_namespace *ns;
@@ -372,10 +387,12 @@ nvmf_init_ns(struct nvmf_softc *sc, uint32_t id,
mda.mda_gid = GID_WHEEL;
mda.mda_mode = 0600;
mda.mda_si_drv1 = ns;
- error = make_dev_s(&mda, &ns->cdev, "%sns%u",
+ error = make_dev_s(&mda, &ns->cdev, "%sn%u",
device_get_nameunit(sc->dev), id);
if (error != 0)
goto fail;
+ ns->cdev->si_drv2 = make_dev_alias(ns->cdev, "%sns%u",
+ device_get_nameunit(sc->dev), id);
ns->cdev->si_flags |= SI_UNMAPPED;
@@ -414,11 +431,35 @@ nvmf_reconnect_ns(struct nvmf_namespace *ns)
}
void
+nvmf_shutdown_ns(struct nvmf_namespace *ns)
+{
+ TAILQ_HEAD(, bio) bios;
+ struct bio *bio;
+
+ mtx_lock(&ns->lock);
+ ns->shutdown = true;
+ TAILQ_INIT(&bios);
+ TAILQ_CONCAT(&bios, &ns->pending_bios, bio_queue);
+ mtx_unlock(&ns->lock);
+
+ while (!TAILQ_EMPTY(&bios)) {
+ bio = TAILQ_FIRST(&bios);
+ TAILQ_REMOVE(&bios, bio, bio_queue);
+ bio->bio_error = ECONNABORTED;
+ bio->bio_flags |= BIO_ERROR;
+ bio->bio_resid = bio->bio_bcount;
+ biodone(bio);
+ }
+}
+
+void
nvmf_destroy_ns(struct nvmf_namespace *ns)
{
TAILQ_HEAD(, bio) bios;
struct bio *bio;
+ if (ns->cdev->si_drv2 != NULL)
+ destroy_dev(ns->cdev->si_drv2);
destroy_dev(ns->cdev);
/*
@@ -451,7 +492,8 @@ nvmf_destroy_ns(struct nvmf_namespace *ns)
}
bool
-nvmf_update_ns(struct nvmf_namespace *ns, struct nvme_namespace_data *data)
+nvmf_update_ns(struct nvmf_namespace *ns,
+ const struct nvme_namespace_data *data)
{
uint8_t lbads, lbaf;
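
[Note: nvmf_shutdown_ns() above drains pending bios by splicing the whole queue onto a local list while holding the lock and then completing each entry unlocked. A standalone sketch of that splice-then-drain pattern with sys/queue.h and pthreads; the item type and names are invented for illustration.]

#include <sys/queue.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct item {
	int id;
	TAILQ_ENTRY(item) link;
};
TAILQ_HEAD(itemq, item);

static struct itemq pending = TAILQ_HEAD_INITIALIZER(pending);
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

static void
drain_pending(void)
{
	struct itemq local;
	struct item *it;

	/* Take everything in one O(1) splice while holding the lock. */
	TAILQ_INIT(&local);
	pthread_mutex_lock(&lock);
	TAILQ_CONCAT(&local, &pending, link);
	pthread_mutex_unlock(&lock);

	/* Complete each entry without the lock held. */
	while (!TAILQ_EMPTY(&local)) {
		it = TAILQ_FIRST(&local);
		TAILQ_REMOVE(&local, it, link);
		printf("failing item %d\n", it->id);
		free(it);
	}
}

int
main(void)
{
	for (int i = 0; i < 3; i++) {
		struct item *it = malloc(sizeof(*it));
		it->id = i;
		TAILQ_INSERT_TAIL(&pending, it, link);
	}
	drain_pending();
	return (0);
}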
diff --git a/sys/dev/nvmf/host/nvmf_qpair.c b/sys/dev/nvmf/host/nvmf_qpair.c
index 96cb5a8b0465..2f511cf0406d 100644
--- a/sys/dev/nvmf/host/nvmf_qpair.c
+++ b/sys/dev/nvmf/host/nvmf_qpair.c
@@ -10,6 +10,8 @@
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
+#include <sys/nv.h>
+#include <sys/sysctl.h>
#include <dev/nvme/nvme.h>
#include <dev/nvmf/nvmf.h>
#include <dev/nvmf/nvmf_transport.h>
@@ -31,6 +33,7 @@ struct nvmf_host_qpair {
u_int num_commands;
uint16_t sqhd;
uint16_t sqtail;
+ uint64_t submitted;
struct mtx lock;
@@ -41,6 +44,7 @@ struct nvmf_host_qpair {
struct nvmf_host_command **active_commands;
char name[16];
+ struct sysctl_ctx_list sysctl_ctx;
};
struct nvmf_request *
@@ -112,8 +116,23 @@ nvmf_dispatch_command(struct nvmf_host_qpair *qp, struct nvmf_host_command *cmd)
struct nvmf_softc *sc = qp->sc;
struct nvme_command *sqe;
struct nvmf_capsule *nc;
+ uint16_t new_sqtail;
int error;
+ mtx_assert(&qp->lock, MA_OWNED);
+
+ qp->submitted++;
+
+ /*
+ * Update flow control tracking. This is just a sanity check.
+ * Since num_commands == qsize - 1, there can never be too
+ * many commands in flight.
+ */
+ new_sqtail = (qp->sqtail + 1) % (qp->num_commands + 1);
+ KASSERT(new_sqtail != qp->sqhd, ("%s: qp %p is full", __func__, qp));
+ qp->sqtail = new_sqtail;
+ mtx_unlock(&qp->lock);
+
nc = cmd->req->nc;
sqe = nvmf_capsule_sqe(nc);
@@ -177,11 +196,23 @@ nvmf_receive_capsule(void *arg, struct nvmf_capsule *nc)
return;
}
+ /* Update flow control tracking. */
+ mtx_lock(&qp->lock);
+ if (qp->sq_flow_control) {
+ if (nvmf_sqhd_valid(nc))
+ qp->sqhd = le16toh(cqe->sqhd);
+ } else {
+ /*
+ * If SQ FC is disabled, just advance the head for
+ * each response capsule received.
+ */
+ qp->sqhd = (qp->sqhd + 1) % (qp->num_commands + 1);
+ }
+
/*
* If the queue has been shutdown due to an error, silently
* drop the response.
*/
- mtx_lock(&qp->lock);
if (qp->qp == NULL) {
device_printf(sc->dev,
"received completion for CID %u on shutdown %s\n", cid,
@@ -212,7 +243,6 @@ nvmf_receive_capsule(void *arg, struct nvmf_capsule *nc)
} else {
cmd->req = STAILQ_FIRST(&qp->pending_requests);
STAILQ_REMOVE_HEAD(&qp->pending_requests, link);
- mtx_unlock(&qp->lock);
nvmf_dispatch_command(qp, cmd);
}
@@ -221,28 +251,61 @@ nvmf_receive_capsule(void *arg, struct nvmf_capsule *nc)
nvmf_free_request(req);
}
+static void
+nvmf_sysctls_qp(struct nvmf_softc *sc, struct nvmf_host_qpair *qp,
+ bool admin, u_int qid)
+{
+ struct sysctl_ctx_list *ctx = &qp->sysctl_ctx;
+ struct sysctl_oid *oid;
+ struct sysctl_oid_list *list;
+ char name[8];
+
+ if (admin) {
+ oid = SYSCTL_ADD_NODE(ctx,
+ SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev)), OID_AUTO,
+ "adminq", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Admin Queue");
+ } else {
+ snprintf(name, sizeof(name), "%u", qid);
+ oid = SYSCTL_ADD_NODE(ctx, sc->ioq_oid_list, OID_AUTO, name,
+ CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "I/O Queue");
+ }
+ list = SYSCTL_CHILDREN(oid);
+
+ SYSCTL_ADD_UINT(ctx, list, OID_AUTO, "num_entries", CTLFLAG_RD,
+ NULL, qp->num_commands + 1, "Number of entries in queue");
+ SYSCTL_ADD_U16(ctx, list, OID_AUTO, "sq_head", CTLFLAG_RD, &qp->sqhd,
+ 0, "Current head of submission queue (as observed by driver)");
+ SYSCTL_ADD_U16(ctx, list, OID_AUTO, "sq_tail", CTLFLAG_RD, &qp->sqtail,
+ 0, "Current tail of submission queue (as observed by driver)");
+ SYSCTL_ADD_U64(ctx, list, OID_AUTO, "num_cmds", CTLFLAG_RD,
+ &qp->submitted, 0, "Number of commands submitted");
+}
+
struct nvmf_host_qpair *
nvmf_init_qp(struct nvmf_softc *sc, enum nvmf_trtype trtype,
- struct nvmf_handoff_qpair_params *handoff, const char *name)
+ const nvlist_t *nvl, const char *name, u_int qid)
{
struct nvmf_host_command *cmd, *ncmd;
struct nvmf_host_qpair *qp;
u_int i;
+ bool admin;
+ admin = nvlist_get_bool(nvl, "admin");
qp = malloc(sizeof(*qp), M_NVMF, M_WAITOK | M_ZERO);
qp->sc = sc;
- qp->sq_flow_control = handoff->sq_flow_control;
- qp->sqhd = handoff->sqhd;
- qp->sqtail = handoff->sqtail;
+ qp->sq_flow_control = nvlist_get_bool(nvl, "sq_flow_control");
+ qp->sqhd = nvlist_get_number(nvl, "sqhd");
+ qp->sqtail = nvlist_get_number(nvl, "sqtail");
strlcpy(qp->name, name, sizeof(qp->name));
mtx_init(&qp->lock, "nvmf qp", NULL, MTX_DEF);
+ (void)sysctl_ctx_init(&qp->sysctl_ctx);
/*
* Allocate a spare command slot for each pending AER command
* on the admin queue.
*/
- qp->num_commands = handoff->qsize - 1;
- if (handoff->admin)
+ qp->num_commands = nvlist_get_number(nvl, "qsize") - 1;
+ if (admin)
qp->num_commands += sc->num_aer;
qp->active_commands = malloc(sizeof(*qp->active_commands) *
@@ -255,9 +318,10 @@ nvmf_init_qp(struct nvmf_softc *sc, enum nvmf_trtype trtype,
}
STAILQ_INIT(&qp->pending_requests);
- qp->qp = nvmf_allocate_qpair(trtype, false, handoff, nvmf_qp_error,
- qp, nvmf_receive_capsule, qp);
+ qp->qp = nvmf_allocate_qpair(trtype, false, nvl, nvmf_qp_error, qp,
+ nvmf_receive_capsule, qp);
if (qp->qp == NULL) {
+ (void)sysctl_ctx_free(&qp->sysctl_ctx);
TAILQ_FOREACH_SAFE(cmd, &qp->free_commands, link, ncmd) {
TAILQ_REMOVE(&qp->free_commands, cmd, link);
free(cmd, M_NVMF);
@@ -268,6 +332,8 @@ nvmf_init_qp(struct nvmf_softc *sc, enum nvmf_trtype trtype,
return (NULL);
}
+ nvmf_sysctls_qp(sc, qp, admin, qid);
+
return (qp);
}
@@ -339,6 +405,7 @@ nvmf_destroy_qp(struct nvmf_host_qpair *qp)
struct nvmf_host_command *cmd, *ncmd;
nvmf_shutdown_qp(qp);
+ (void)sysctl_ctx_free(&qp->sysctl_ctx);
TAILQ_FOREACH_SAFE(cmd, &qp->free_commands, link, ncmd) {
TAILQ_REMOVE(&qp->free_commands, cmd, link);
@@ -381,6 +448,5 @@ nvmf_submit_request(struct nvmf_request *req)
("%s: CID already busy", __func__));
qp->active_commands[cmd->cid] = cmd;
cmd->req = req;
- mtx_unlock(&qp->lock);
nvmf_dispatch_command(qp, cmd);
}
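
[Note: the flow-control tracking added above advances sqtail on submission and sqhd on completion, both modulo the queue size (num_commands + 1). A small sketch of the ring arithmetic, assuming the NVMe convention that a queue of qsize slots holds at most qsize - 1 in-flight commands; the constants are illustrative.]

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define	QSIZE	8	/* slots; at most QSIZE - 1 commands in flight */

int
main(void)
{
	uint16_t sqhd = 0, sqtail = 0;
	uint16_t new_sqtail;

	/* Submit QSIZE - 1 commands: the queue can never fill up. */
	for (int i = 0; i < QSIZE - 1; i++) {
		new_sqtail = (sqtail + 1) % QSIZE;
		assert(new_sqtail != sqhd);	/* would mean overflow */
		sqtail = new_sqtail;
	}
	printf("after submits: sqhd %u sqtail %u\n", sqhd, sqtail);

	/* Without SQ flow control, advance the head per completion. */
	for (int i = 0; i < QSIZE - 1; i++)
		sqhd = (sqhd + 1) % QSIZE;
	printf("after completions: sqhd %u sqtail %u\n", sqhd, sqtail);
	return (0);
}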
diff --git a/sys/dev/nvmf/host/nvmf_sim.c b/sys/dev/nvmf/host/nvmf_sim.c
index b097b04d64c3..de9e958d8afd 100644
--- a/sys/dev/nvmf/host/nvmf_sim.c
+++ b/sys/dev/nvmf/host/nvmf_sim.c
@@ -40,7 +40,13 @@ nvmf_ccb_done(union ccb *ccb)
return;
if (nvmf_cqe_aborted(&ccb->nvmeio.cpl)) {
- ccb->ccb_h.status = CAM_REQUEUE_REQ;
+ struct cam_sim *sim = xpt_path_sim(ccb->ccb_h.path);
+ struct nvmf_softc *sc = cam_sim_softc(sim);
+
+ if (nvmf_fail_disconnect || sc->sim_shutdown)
+ ccb->ccb_h.status = CAM_DEV_NOT_THERE;
+ else
+ ccb->ccb_h.status = CAM_REQUEUE_REQ;
xpt_done(ccb);
} else if (ccb->nvmeio.cpl.status != 0) {
ccb->ccb_h.status = CAM_NVME_STATUS_ERROR;
@@ -52,7 +58,7 @@ nvmf_ccb_done(union ccb *ccb)
xpt_done(ccb);
} else {
ccb->ccb_h.status = CAM_REQ_CMP;
- xpt_done_direct(ccb);
+ xpt_done(ccb);
}
}
@@ -106,7 +112,10 @@ nvmf_sim_io(struct nvmf_softc *sc, union ccb *ccb)
mtx_lock(&sc->sim_mtx);
if (sc->sim_disconnected) {
mtx_unlock(&sc->sim_mtx);
- nvmeio->ccb_h.status = CAM_REQUEUE_REQ;
+ if (nvmf_fail_disconnect || sc->sim_shutdown)
+ nvmeio->ccb_h.status = CAM_DEV_NOT_THERE;
+ else
+ nvmeio->ccb_h.status = CAM_REQUEUE_REQ;
xpt_done(ccb);
return;
}
@@ -116,8 +125,8 @@ nvmf_sim_io(struct nvmf_softc *sc, union ccb *ccb)
qp = sc->admin;
req = nvmf_allocate_request(qp, &nvmeio->cmd, nvmf_ccb_complete,
ccb, M_NOWAIT);
+ mtx_unlock(&sc->sim_mtx);
if (req == NULL) {
- mtx_unlock(&sc->sim_mtx);
nvmeio->ccb_h.status = CAM_RESRC_UNAVAIL;
xpt_done(ccb);
return;
@@ -141,7 +150,6 @@ nvmf_sim_io(struct nvmf_softc *sc, union ccb *ccb)
("%s: incoming CCB is not in-progress", __func__));
ccb->ccb_h.status |= CAM_SIM_QUEUED;
nvmf_submit_request(req);
- mtx_unlock(&sc->sim_mtx);
}
static void
@@ -183,7 +191,7 @@ nvmf_sim_action(struct cam_sim *sim, union ccb *ccb)
cpi->xport_specific.nvmf.nsid =
xpt_path_lun_id(ccb->ccb_h.path);
cpi->xport_specific.nvmf.trtype = sc->trtype;
- strncpy(cpi->xport_specific.nvmf.dev_name,
+ strlcpy(cpi->xport_specific.nvmf.dev_name,
device_get_nameunit(sc->dev),
sizeof(cpi->xport_specific.nvmf.dev_name));
cpi->maxio = sc->max_xfer_size;
@@ -320,6 +328,15 @@ nvmf_reconnect_sim(struct nvmf_softc *sc)
}
void
+nvmf_shutdown_sim(struct nvmf_softc *sc)
+{
+ mtx_lock(&sc->sim_mtx);
+ sc->sim_shutdown = true;
+ mtx_unlock(&sc->sim_mtx);
+ xpt_release_simq(sc->sim, 1);
+}
+
+void
nvmf_destroy_sim(struct nvmf_softc *sc)
{
xpt_async(AC_LOST_DEVICE, sc->path, NULL);
diff --git a/sys/dev/nvmf/host/nvmf_var.h b/sys/dev/nvmf/host/nvmf_var.h
index 64525851631e..606245b3969c 100644
--- a/sys/dev/nvmf/host/nvmf_var.h
+++ b/sys/dev/nvmf/host/nvmf_var.h
@@ -9,10 +9,13 @@
#define __NVMF_VAR_H__
#include <sys/_callout.h>
+#include <sys/_eventhandler.h>
#include <sys/_lock.h>
#include <sys/_mutex.h>
+//#include <sys/_nv.h>
#include <sys/_sx.h>
#include <sys/_task.h>
+#include <sys/smp.h>
#include <sys/queue.h>
#include <dev/nvme/nvme.h>
#include <dev/nvmf/nvmf_transport.h>
@@ -21,15 +24,10 @@ struct nvmf_aer;
struct nvmf_capsule;
struct nvmf_host_qpair;
struct nvmf_namespace;
+struct sysctl_oid_list;
typedef void nvmf_request_complete_t(void *, const struct nvme_completion *);
-struct nvmf_ivars {
- struct nvmf_handoff_host *hh;
- struct nvmf_handoff_qpair_params *io_params;
- struct nvme_controller_data *cdata;
-};
-
struct nvmf_softc {
device_t dev;
@@ -42,6 +40,7 @@ struct nvmf_softc {
struct cam_path *path;
struct mtx sim_mtx;
bool sim_disconnected;
+ bool sim_shutdown;
struct nvmf_namespace **ns;
@@ -76,12 +75,27 @@ struct nvmf_softc {
struct callout ka_rx_timer;
sbintime_t ka_rx_sbt;
+ struct timeout_task request_reconnect_task;
+ struct timeout_task controller_loss_task;
+ uint32_t reconnect_delay;
+ uint32_t controller_loss_timeout;
+
struct sx connection_lock;
struct task disconnect_task;
bool detaching;
+ bool controller_timedout;
u_int num_aer;
struct nvmf_aer *aer;
+
+ struct sysctl_oid_list *ioq_oid_list;
+
+ nvlist_t *rparams;
+
+ struct timespec last_disconnect;
+
+ eventhandler_tag shutdown_pre_sync_eh;
+ eventhandler_tag shutdown_post_sync_eh;
};
struct nvmf_request {
@@ -104,8 +118,8 @@ struct nvmf_completion_status {
static __inline struct nvmf_host_qpair *
nvmf_select_io_queue(struct nvmf_softc *sc)
{
- /* TODO: Support multiple queues? */
- return (sc->io[0]);
+ u_int idx = curcpu * sc->num_io_queues / (mp_maxid + 1);
+ return (sc->io[idx]);
}
static __inline bool
@@ -140,14 +154,17 @@ extern driver_t nvme_nvmf_driver;
MALLOC_DECLARE(M_NVMF);
#endif
+/* If true, I/O requests will fail while the host is disconnected. */
+extern bool nvmf_fail_disconnect;
+
/* nvmf.c */
void nvmf_complete(void *arg, const struct nvme_completion *cqe);
void nvmf_io_complete(void *arg, size_t xfered, int error);
void nvmf_wait_for_reply(struct nvmf_completion_status *status);
-int nvmf_init_ivars(struct nvmf_ivars *ivars, struct nvmf_handoff_host *hh);
-void nvmf_free_ivars(struct nvmf_ivars *ivars);
+int nvmf_copyin_handoff(const struct nvmf_ioc_nv *nv, nvlist_t **nvlp);
void nvmf_disconnect(struct nvmf_softc *sc);
void nvmf_rescan_ns(struct nvmf_softc *sc, uint32_t nsid);
+void nvmf_rescan_all_ns(struct nvmf_softc *sc);
int nvmf_passthrough_cmd(struct nvmf_softc *sc, struct nvme_pt_command *pt,
bool admin);
@@ -180,17 +197,17 @@ void nvmf_ctl_unload(void);
/* nvmf_ns.c */
struct nvmf_namespace *nvmf_init_ns(struct nvmf_softc *sc, uint32_t id,
- struct nvme_namespace_data *data);
+ const struct nvme_namespace_data *data);
void nvmf_disconnect_ns(struct nvmf_namespace *ns);
void nvmf_reconnect_ns(struct nvmf_namespace *ns);
+void nvmf_shutdown_ns(struct nvmf_namespace *ns);
void nvmf_destroy_ns(struct nvmf_namespace *ns);
bool nvmf_update_ns(struct nvmf_namespace *ns,
- struct nvme_namespace_data *data);
+ const struct nvme_namespace_data *data);
/* nvmf_qpair.c */
struct nvmf_host_qpair *nvmf_init_qp(struct nvmf_softc *sc,
- enum nvmf_trtype trtype, struct nvmf_handoff_qpair_params *handoff,
- const char *name);
+ enum nvmf_trtype trtype, const nvlist_t *nvl, const char *name, u_int qid);
void nvmf_shutdown_qp(struct nvmf_host_qpair *qp);
void nvmf_destroy_qp(struct nvmf_host_qpair *qp);
struct nvmf_request *nvmf_allocate_request(struct nvmf_host_qpair *qp,
@@ -202,6 +219,7 @@ void nvmf_free_request(struct nvmf_request *req);
int nvmf_init_sim(struct nvmf_softc *sc);
void nvmf_disconnect_sim(struct nvmf_softc *sc);
void nvmf_reconnect_sim(struct nvmf_softc *sc);
+void nvmf_shutdown_sim(struct nvmf_softc *sc);
void nvmf_destroy_sim(struct nvmf_softc *sc);
void nvmf_sim_rescan_ns(struct nvmf_softc *sc, uint32_t id);
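
[Note: nvmf_select_io_queue() above replaces the single-queue TODO with a mapping that scales curcpu into a queue index. A sketch of that mapping with mp_maxid + 1 standing in for the CPU count; the counts are invented for illustration.]

#include <stdio.h>

int
main(void)
{
	const unsigned ncpus = 8;		/* mp_maxid + 1 */
	const unsigned num_io_queues = 3;	/* sc->num_io_queues */

	/* Each CPU maps to a fixed queue; queues get contiguous CPU ranges. */
	for (unsigned cpu = 0; cpu < ncpus; cpu++)
		printf("cpu %u -> I/O queue %u\n", cpu,
		    cpu * num_io_queues / ncpus);
	return (0);
}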