Diffstat (limited to 'sys/dev/nvme')
-rw-r--r--  sys/dev/nvme/nvme.h            |  55
-rw-r--r--  sys/dev/nvme/nvme_ctrlr.c      | 242
-rw-r--r--  sys/dev/nvme/nvme_ctrlr_cmd.c  |  29
-rw-r--r--  sys/dev/nvme/nvme_linux.h      |  58
-rw-r--r--  sys/dev/nvme/nvme_ns.c         |  29
-rw-r--r--  sys/dev/nvme/nvme_ns_cmd.c     |  24
-rw-r--r--  sys/dev/nvme/nvme_private.h    |  39
-rw-r--r--  sys/dev/nvme/nvme_qpair.c      | 597
-rw-r--r--  sys/dev/nvme/nvme_sim.c        |  26
-rw-r--r--  sys/dev/nvme/nvme_sysctl.c     |  82
-rw-r--r--  sys/dev/nvme/nvme_util.c       | 237
11 files changed, 925 insertions, 493 deletions
diff --git a/sys/dev/nvme/nvme.h b/sys/dev/nvme/nvme.h
index a4baade7df5d..17c5cdb4db87 100644
--- a/sys/dev/nvme/nvme.h
+++ b/sys/dev/nvme/nvme.h
@@ -35,11 +35,17 @@
#include <sys/param.h>
#include <sys/endian.h>
+#ifndef _KERNEL
+#include <stdbool.h>
+#endif
+
+struct sbuf;
#define NVME_PASSTHROUGH_CMD _IOWR('n', 0, struct nvme_pt_command)
#define NVME_RESET_CONTROLLER _IO('n', 1)
#define NVME_GET_NSID _IOR('n', 2, struct nvme_get_nsid)
#define NVME_GET_MAX_XFER_SIZE _IOR('n', 3, uint64_t)
+#define NVME_GET_CONTROLLER_DATA _IOR('n', 4, struct nvme_controller_data)
#define NVME_IO_TEST _IOWR('n', 100, struct nvme_io_test)
#define NVME_BIO_TEST _IOWR('n', 101, struct nvme_io_test)
@@ -648,8 +654,16 @@ enum nvme_critical_warning_state {
NVME_CRIT_WARN_ST_PERSISTENT_MEMORY_REGION = 0x20,
};
#define NVME_CRIT_WARN_ST_RESERVED_MASK (0xC0)
-#define NVME_ASYNC_EVENT_NS_ATTRIBUTE (0x100)
-#define NVME_ASYNC_EVENT_FW_ACTIVATE (0x200)
+#define NVME_ASYNC_EVENT_NS_ATTRIBUTE (1U << 8)
+#define NVME_ASYNC_EVENT_FW_ACTIVATE (1U << 9)
+#define NVME_ASYNC_EVENT_TELEMETRY_LOG (1U << 10)
+#define NVME_ASYNC_EVENT_ASYM_NS_ACC (1U << 11)
+#define NVME_ASYNC_EVENT_PRED_LAT_DELTA (1U << 12)
+#define NVME_ASYNC_EVENT_LBA_STATUS (1U << 13)
+#define NVME_ASYNC_EVENT_ENDURANCE_DELTA (1U << 14)
+#define NVME_ASYNC_EVENT_NVM_SHUTDOWN (1U << 15)
+#define NVME_ASYNC_EVENT_ZONE_DELTA (1U << 27)
+#define NVME_ASYNC_EVENT_DISCOVERY_DELTA (1U << 31)
/* slot for current FW */
#define NVME_FIRMWARE_PAGE_AFI_SLOT_SHIFT (0)
@@ -832,7 +846,7 @@ struct nvme_command {
uint32_t cdw13; /* command-specific */
uint32_t cdw14; /* command-specific */
uint32_t cdw15; /* command-specific */
-};
+} __aligned(8);
_Static_assert(sizeof(struct nvme_command) == 16 * 4, "bad size for nvme_command");
@@ -1601,7 +1615,7 @@ struct nvme_health_information_page {
uint32_t ttftmt2;
uint8_t reserved2[280];
-} __packed __aligned(4);
+} __packed __aligned(8);
_Static_assert(sizeof(struct nvme_health_information_page) == 512, "bad size for nvme_health_information_page");
@@ -1652,6 +1666,30 @@ struct nvme_device_self_test_page {
_Static_assert(sizeof(struct nvme_device_self_test_page) == 564,
"bad size for nvme_device_self_test_page");
+/*
+ * Header structure for both host-initiated telemetry (page 7) and
+ * controller-initiated telemetry (page 8).
+ */
+struct nvme_telemetry_log_page {
+ uint8_t identifier;
+ uint8_t rsvd[4];
+ uint8_t oui[3];
+ uint16_t da1_last;
+ uint16_t da2_last;
+ uint16_t da3_last;
+ uint8_t rsvd2[2];
+ uint32_t da4_last;
+ uint8_t rsvd3[361];
+ uint8_t hi_gen;
+ uint8_t ci_avail;
+ uint8_t ci_gen;
+ uint8_t reason[128];
+ /* Blocks of telemetry data follow */
+} __packed __aligned(4);
+
+_Static_assert(sizeof(struct nvme_telemetry_log_page) == 512,
+ "bad size for nvme_telemetry_log");
+
struct nvme_discovery_log_entry {
uint8_t trtype;
uint8_t adrfam;
@@ -1868,6 +1906,9 @@ struct nvme_hmb_desc {
#define nvme_completion_is_error(cpl) \
(NVME_STATUS_GET_SC((cpl)->status) != 0 || NVME_STATUS_GET_SCT((cpl)->status) != 0)
+void nvme_cpl_sbuf(const struct nvme_completion *cpl, struct sbuf *sbuf);
+void nvme_opcode_sbuf(bool admin, uint8_t opc, struct sbuf *sb);
+void nvme_sc_sbuf(const struct nvme_completion *cpl, struct sbuf *sbuf);
void nvme_strvis(uint8_t *dst, const uint8_t *src, int dstlen, int srclen);
#ifdef _KERNEL
@@ -1878,6 +1919,7 @@ struct thread;
struct nvme_namespace;
struct nvme_controller;
struct nvme_consumer;
+struct nvme_passthru_cmd;
typedef void (*nvme_cb_fn_t)(void *, const struct nvme_completion *);
@@ -1897,6 +1939,11 @@ int nvme_ctrlr_passthrough_cmd(struct nvme_controller *ctrlr,
uint32_t nsid, int is_user_buffer,
int is_admin_cmd);
+int nvme_ctrlr_linux_passthru_cmd(struct nvme_controller *ctrlr,
+ struct nvme_passthru_cmd *npc,
+ uint32_t nsid, bool is_user,
+ bool is_admin);
+
/* Admin functions */
void nvme_ctrlr_cmd_set_feature(struct nvme_controller *ctrlr,
uint8_t feature, uint32_t cdw11,
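For reference, a minimal userland sketch of the new NVME_GET_CONTROLLER_DATA ioctl added above; the device path, helper name, and error handling are illustrative only and not part of this change:

	#include <sys/ioctl.h>
	#include <dev/nvme/nvme.h>
	#include <err.h>
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	static void
	print_model(const char *dev)		/* e.g. "/dev/nvme0" */
	{
		struct nvme_controller_data cdata;
		int fd;

		if ((fd = open(dev, O_RDWR)) < 0)
			err(1, "open %s", dev);
		/* Copies the driver's cached identify-controller data to userland. */
		if (ioctl(fd, NVME_GET_CONTROLLER_DATA, &cdata) < 0)
			err(1, "NVME_GET_CONTROLLER_DATA");
		printf("model: %.40s\n", (const char *)cdata.mn);
		close(fd);
	}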
diff --git a/sys/dev/nvme/nvme_ctrlr.c b/sys/dev/nvme/nvme_ctrlr.c
index b7b03082c54e..73a7cee4aad0 100644
--- a/sys/dev/nvme/nvme_ctrlr.c
+++ b/sys/dev/nvme/nvme_ctrlr.c
@@ -39,10 +39,11 @@
#include <sys/uio.h>
#include <sys/sbuf.h>
#include <sys/endian.h>
-#include <machine/stdarg.h>
+#include <sys/stdarg.h>
#include <vm/vm.h>
#include "nvme_private.h"
+#include "nvme_linux.h"
#define B4_CHK_RDY_DELAY_MS 2300 /* work around controller bug */
@@ -231,7 +232,7 @@ nvme_ctrlr_construct_io_qpairs(struct nvme_controller *ctrlr)
}
static void
-nvme_ctrlr_fail(struct nvme_controller *ctrlr)
+nvme_ctrlr_fail(struct nvme_controller *ctrlr, bool admin_also)
{
int i;
@@ -241,7 +242,10 @@ nvme_ctrlr_fail(struct nvme_controller *ctrlr)
* a different error, though when we fail, that hardly matters).
*/
ctrlr->is_failed = true;
- nvme_qpair_fail(&ctrlr->adminq);
+ if (admin_also) {
+ ctrlr->is_failed_admin = true;
+ nvme_qpair_fail(&ctrlr->adminq);
+ }
if (ctrlr->ioq != NULL) {
for (i = 0; i < ctrlr->num_io_queues; i++) {
nvme_qpair_fail(&ctrlr->ioq[i]);
@@ -414,6 +418,7 @@ nvme_ctrlr_hw_reset(struct nvme_controller *ctrlr)
TSENTER();
+ ctrlr->is_failed_admin = true;
nvme_ctrlr_disable_qpairs(ctrlr);
err = nvme_ctrlr_disable(ctrlr);
@@ -422,6 +427,8 @@ nvme_ctrlr_hw_reset(struct nvme_controller *ctrlr)
err = nvme_ctrlr_enable(ctrlr);
out:
+ if (err == 0)
+ ctrlr->is_failed_admin = false;
TSEXIT();
return (err);
@@ -434,11 +441,10 @@ nvme_ctrlr_reset(struct nvme_controller *ctrlr)
cmpset = atomic_cmpset_32(&ctrlr->is_resetting, 0, 1);
- if (cmpset == 0 || ctrlr->is_failed)
+ if (cmpset == 0)
/*
- * Controller is already resetting or has failed. Return
- * immediately since there is no need to kick off another
- * reset in these cases.
+ * Controller is already resetting. Return immediately since
+ * there is no need to kick off another reset.
*/
return;
@@ -814,7 +820,13 @@ nvme_ctrlr_construct_and_submit_aer(struct nvme_controller *ctrlr,
struct nvme_request *req;
aer->ctrlr = ctrlr;
- req = nvme_allocate_request_null(nvme_ctrlr_async_event_cb, aer);
+ /*
+ * XXX-MJ this should be M_WAITOK but we might be in a non-sleepable
+ * callback context. AER completions should be handled on a dedicated
+ * thread.
+ */
+ req = nvme_allocate_request_null(M_NOWAIT, nvme_ctrlr_async_event_cb,
+ aer);
aer->req = req;
/*
@@ -1089,7 +1101,7 @@ nvme_ctrlr_start(void *ctrlr_arg, bool resetting)
return;
if (resetting && nvme_ctrlr_identify(ctrlr) != 0) {
- nvme_ctrlr_fail(ctrlr);
+ nvme_ctrlr_fail(ctrlr, false);
return;
}
@@ -1104,7 +1116,7 @@ nvme_ctrlr_start(void *ctrlr_arg, bool resetting)
if (resetting) {
old_num_io_queues = ctrlr->num_io_queues;
if (nvme_ctrlr_set_num_qpairs(ctrlr) != 0) {
- nvme_ctrlr_fail(ctrlr);
+ nvme_ctrlr_fail(ctrlr, false);
return;
}
@@ -1122,12 +1134,12 @@ nvme_ctrlr_start(void *ctrlr_arg, bool resetting)
nvme_ctrlr_hmb_enable(ctrlr, true, true);
if (nvme_ctrlr_create_qpairs(ctrlr) != 0) {
- nvme_ctrlr_fail(ctrlr);
+ nvme_ctrlr_fail(ctrlr, false);
return;
}
if (nvme_ctrlr_construct_namespaces(ctrlr) != 0) {
- nvme_ctrlr_fail(ctrlr);
+ nvme_ctrlr_fail(ctrlr, false);
return;
}
@@ -1146,9 +1158,8 @@ nvme_ctrlr_start_config_hook(void *arg)
TSENTER();
- if (nvme_ctrlr_hw_reset(ctrlr) != 0) {
-fail:
- nvme_ctrlr_fail(ctrlr);
+ if (nvme_ctrlr_hw_reset(ctrlr) != 0 || ctrlr->fail_on_reset != 0) {
+ nvme_ctrlr_fail(ctrlr, true);
config_intrhook_disestablish(&ctrlr->config_hook);
return;
}
@@ -1161,13 +1172,15 @@ fail:
nvme_ctrlr_construct_io_qpairs(ctrlr) == 0)
nvme_ctrlr_start(ctrlr, false);
else
- goto fail;
+ nvme_ctrlr_fail(ctrlr, false);
nvme_sysctl_initialize_ctrlr(ctrlr);
config_intrhook_disestablish(&ctrlr->config_hook);
- ctrlr->is_initialized = 1;
- nvme_notify_new_controller(ctrlr);
+ if (!ctrlr->is_failed) {
+ ctrlr->is_initialized = true;
+ nvme_notify_new_controller(ctrlr);
+ }
TSEXIT();
}
@@ -1184,7 +1197,7 @@ nvme_ctrlr_reset_task(void *arg, int pending)
nvme_ctrlr_start(ctrlr, true);
} else {
nvme_ctrlr_devctl_log(ctrlr, "RESET", "event=\"timed_out\"");
- nvme_ctrlr_fail(ctrlr);
+ nvme_ctrlr_fail(ctrlr, true);
}
atomic_cmpset_32(&ctrlr->is_resetting, 1, 0);
@@ -1258,24 +1271,19 @@ nvme_ctrlr_passthrough_cmd(struct nvme_controller *ctrlr,
return EIO;
}
if (is_user_buffer) {
- /*
- * Ensure the user buffer is wired for the duration of
- * this pass-through command.
- */
- PHOLD(curproc);
buf = uma_zalloc(pbuf_zone, M_WAITOK);
buf->b_iocmd = pt->is_read ? BIO_READ : BIO_WRITE;
if (vmapbuf(buf, pt->buf, pt->len, 1) < 0) {
ret = EFAULT;
goto err;
}
- req = nvme_allocate_request_vaddr(buf->b_data, pt->len,
- nvme_pt_done, pt);
+ req = nvme_allocate_request_vaddr(buf->b_data, pt->len,
+ M_WAITOK, nvme_pt_done, pt);
} else
req = nvme_allocate_request_vaddr(pt->buf, pt->len,
- nvme_pt_done, pt);
+ M_WAITOK, nvme_pt_done, pt);
} else
- req = nvme_allocate_request_null(nvme_pt_done, pt);
+ req = nvme_allocate_request_null(M_WAITOK, nvme_pt_done, pt);
/* Assume user space already converted to little-endian */
req->cmd.opc = pt->cmd.opc;
@@ -1308,7 +1316,104 @@ nvme_ctrlr_passthrough_cmd(struct nvme_controller *ctrlr,
vunmapbuf(buf);
err:
uma_zfree(pbuf_zone, buf);
- PRELE(curproc);
+ }
+
+ return (ret);
+}
+
+static void
+nvme_npc_done(void *arg, const struct nvme_completion *cpl)
+{
+ struct nvme_passthru_cmd *npc = arg;
+ struct mtx *mtx = (void *)(uintptr_t)npc->metadata;
+
+ npc->result = cpl->cdw0; /* cpl in host order by now */
+ mtx_lock(mtx);
+ npc->metadata = 0;
+ wakeup(npc);
+ mtx_unlock(mtx);
+}
+
+/* XXX refactor? */
+
+int
+nvme_ctrlr_linux_passthru_cmd(struct nvme_controller *ctrlr,
+ struct nvme_passthru_cmd *npc, uint32_t nsid, bool is_user, bool is_admin)
+{
+ struct nvme_request *req;
+ struct mtx *mtx;
+ struct buf *buf = NULL;
+ int ret = 0;
+
+ /*
+ * We don't support metadata.
+ */
+ if (npc->metadata != 0 || npc->metadata_len != 0)
+ return (EIO);
+
+ if (npc->data_len > 0 && npc->addr != 0) {
+ if (npc->data_len > ctrlr->max_xfer_size) {
+ nvme_printf(ctrlr,
+ "npc->data_len (%d) exceeds max_xfer_size (%d)\n",
+ npc->data_len, ctrlr->max_xfer_size);
+ return (EIO);
+ }
+ /*
+ * We only support data out or data in commands, but not both at
+		 * once. However, there are some commands with the lower bit
+		 * cleared that are really read commands, so we should also
+		 * filter out (opcode & 3) == 0, but don't.
+ */
+ if ((npc->opcode & 0x3) == 3)
+ return (EINVAL);
+ if (is_user) {
+ buf = uma_zalloc(pbuf_zone, M_WAITOK);
+ buf->b_iocmd = npc->opcode & 1 ? BIO_WRITE : BIO_READ;
+ if (vmapbuf(buf, (void *)(uintptr_t)npc->addr,
+ npc->data_len, 1) < 0) {
+ ret = EFAULT;
+ goto err;
+ }
+ req = nvme_allocate_request_vaddr(buf->b_data,
+ npc->data_len, M_WAITOK, nvme_npc_done, npc);
+ } else
+ req = nvme_allocate_request_vaddr(
+ (void *)(uintptr_t)npc->addr, npc->data_len,
+ M_WAITOK, nvme_npc_done, npc);
+ } else
+ req = nvme_allocate_request_null(M_WAITOK, nvme_npc_done, npc);
+
+ req->cmd.opc = npc->opcode;
+ req->cmd.fuse = npc->flags;
+ req->cmd.rsvd2 = htole16(npc->cdw2);
+ req->cmd.rsvd3 = htole16(npc->cdw3);
+ req->cmd.cdw10 = htole32(npc->cdw10);
+ req->cmd.cdw11 = htole32(npc->cdw11);
+ req->cmd.cdw12 = htole32(npc->cdw12);
+ req->cmd.cdw13 = htole32(npc->cdw13);
+ req->cmd.cdw14 = htole32(npc->cdw14);
+ req->cmd.cdw15 = htole32(npc->cdw15);
+
+ req->cmd.nsid = htole32(nsid);
+
+ mtx = mtx_pool_find(mtxpool_sleep, npc);
+ npc->metadata = (uintptr_t) mtx;
+
+ /* XXX no timeout passed down */
+ if (is_admin)
+ nvme_ctrlr_submit_admin_request(ctrlr, req);
+ else
+ nvme_ctrlr_submit_io_request(ctrlr, req);
+
+ mtx_lock(mtx);
+ while (npc->metadata != 0)
+ mtx_sleep(npc, mtx, PRIBIO, "nvme_npc", 0);
+ mtx_unlock(mtx);
+
+ if (buf != NULL) {
+ vunmapbuf(buf);
+err:
+ uma_zfree(pbuf_zone, buf);
}
return (ret);
@@ -1324,6 +1429,7 @@ nvme_ctrlr_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int flag,
ctrlr = cdev->si_drv1;
switch (cmd) {
+ case NVME_IOCTL_RESET: /* Linux compat */
case NVME_RESET_CONTROLLER:
nvme_ctrlr_reset(ctrlr);
break;
@@ -1334,15 +1440,30 @@ nvme_ctrlr_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int flag,
case NVME_GET_NSID:
{
struct nvme_get_nsid *gnsid = (struct nvme_get_nsid *)arg;
- strncpy(gnsid->cdev, device_get_nameunit(ctrlr->dev),
+ strlcpy(gnsid->cdev, device_get_nameunit(ctrlr->dev),
sizeof(gnsid->cdev));
- gnsid->cdev[sizeof(gnsid->cdev) - 1] = '\0';
gnsid->nsid = 0;
break;
}
case NVME_GET_MAX_XFER_SIZE:
*(uint64_t *)arg = ctrlr->max_xfer_size;
break;
+ case NVME_GET_CONTROLLER_DATA:
+ memcpy(arg, &ctrlr->cdata, sizeof(ctrlr->cdata));
+ break;
+ /* Linux Compatible (see nvme_linux.h) */
+ case NVME_IOCTL_ID:
+ td->td_retval[0] = 0xfffffffful;
+ return (0);
+
+ case NVME_IOCTL_ADMIN_CMD:
+ case NVME_IOCTL_IO_CMD: {
+ struct nvme_passthru_cmd *npc = (struct nvme_passthru_cmd *)arg;
+
+ return (nvme_ctrlr_linux_passthru_cmd(ctrlr, npc, npc->nsid, true,
+ cmd == NVME_IOCTL_ADMIN_CMD));
+ }
+
default:
return (ENOTTY);
}
@@ -1443,6 +1564,8 @@ nvme_ctrlr_construct(struct nvme_controller *ctrlr, device_t dev)
ctrlr->enable_aborts = 0;
TUNABLE_INT_FETCH("hw.nvme.enable_aborts", &ctrlr->enable_aborts);
+ ctrlr->alignment_splits = counter_u64_alloc(M_WAITOK);
+
/* Cap transfers by the maximum addressable by page-sized PRP (4KB pages -> 2MB). */
ctrlr->max_xfer_size = MIN(maxphys, (ctrlr->page_size / 8 * ctrlr->page_size));
if (nvme_ctrlr_construct_admin_qpair(ctrlr) != 0)
@@ -1464,7 +1587,7 @@ nvme_ctrlr_construct(struct nvme_controller *ctrlr, device_t dev)
taskqueue_start_threads(&ctrlr->taskqueue, 2, PI_DISK, "nvme taskq");
ctrlr->is_resetting = 0;
- ctrlr->is_initialized = 0;
+ ctrlr->is_initialized = false;
ctrlr->notification_sent = 0;
TASK_INIT(&ctrlr->reset_task, 0, nvme_ctrlr_reset_task, ctrlr);
STAILQ_INIT(&ctrlr->fail_req);
@@ -1477,18 +1600,25 @@ nvme_ctrlr_construct(struct nvme_controller *ctrlr, device_t dev)
md_args.mda_mode = 0600;
md_args.mda_unit = device_get_unit(dev);
md_args.mda_si_drv1 = (void *)ctrlr;
- status = make_dev_s(&md_args, &ctrlr->cdev, "nvme%d",
- device_get_unit(dev));
+ status = make_dev_s(&md_args, &ctrlr->cdev, "%s",
+ device_get_nameunit(dev));
if (status != 0)
return (ENXIO);
return (0);
}
+/*
+ * Called on detach, or on error during attach. The nvme_controller won't be used
+ * again once we return, so we have to tear everything down (so nothing
+ * references this, no callbacks, etc), but don't need to reset all the state
+ * since nvme_controller will be freed soon.
+ */
void
nvme_ctrlr_destruct(struct nvme_controller *ctrlr, device_t dev)
{
- int gone, i;
+ int i;
+ bool gone;
ctrlr->is_dying = true;
@@ -1498,12 +1628,18 @@ nvme_ctrlr_destruct(struct nvme_controller *ctrlr, device_t dev)
goto noadminq;
/*
- * Check whether it is a hot unplug or a clean driver detach.
- * If device is not there any more, skip any shutdown commands.
+ * Check whether it is a hot unplug or a clean driver detach. If device
+ * is not there any more, skip any shutdown commands. Some hotplug
+ * bridges will return zeros instead of ff's when the device is
+ * departing, so ask the bridge if the device is gone. Some systems can
+	 * remove the drive w/o the bridge knowing it's gone (they don't really
+	 * do hotplug), so fail safe by also treating a read of all ff's (a value
+	 * the hardware can never return) as the device being gone.
*/
- gone = (nvme_mmio_read_4(ctrlr, csts) == NVME_GONE);
+ gone = bus_child_present(dev) == 0 ||
+ (nvme_mmio_read_4(ctrlr, csts) == NVME_GONE);
if (gone)
- nvme_ctrlr_fail(ctrlr);
+ nvme_ctrlr_fail(ctrlr, true);
else
nvme_notify_fail_consumers(ctrlr);
@@ -1529,17 +1665,17 @@ nvme_ctrlr_destruct(struct nvme_controller *ctrlr, device_t dev)
nvme_admin_qpair_destroy(&ctrlr->adminq);
/*
- * Notify the controller of a shutdown, even though this is due to
- * a driver unload, not a system shutdown (this path is not invoked
- * during shutdown). This ensures the controller receives a
- * shutdown notification in case the system is shutdown before
- * reloading the driver.
+ * Notify the controller of a shutdown, even though this is due to a
+	 * driver unload, not a system shutdown (this path is not invoked during
+	 * shutdown). This ensures the controller receives a shutdown
+	 * notification in case the system is shut down before reloading the
+ * driver. Some NVMe drives need this to flush their cache to stable
+ * media and consider it a safe shutdown in SMART stats.
*/
- if (!gone)
+ if (!gone) {
nvme_ctrlr_shutdown(ctrlr);
-
- if (!gone)
nvme_ctrlr_disable(ctrlr);
+ }
noadminq:
if (ctrlr->taskqueue)
@@ -1561,6 +1697,9 @@ noadminq:
ctrlr->resource_id, ctrlr->resource);
nores:
+ if (ctrlr->alignment_splits)
+ counter_u64_free(ctrlr->alignment_splits);
+
mtx_destroy(&ctrlr->lock);
}
@@ -1630,7 +1769,9 @@ nvme_ctrlr_suspend(struct nvme_controller *ctrlr)
int to = hz;
/*
- * Can't touch failed controllers, so it's already suspended.
+ * Can't touch failed controllers, so it's already suspended. User will
+ * need to do an explicit reset to bring it back, if that's even
+ * possible.
*/
if (ctrlr->is_failed)
return (0);
@@ -1684,7 +1825,8 @@ nvme_ctrlr_resume(struct nvme_controller *ctrlr)
/*
* Now that we've reset the hardware, we can restart the controller. Any
* I/O that was pending is requeued. Any admin commands are aborted with
- * an error. Once we've restarted, take the controller out of reset.
+ * an error. Once we've restarted, stop flagging the controller as being
+ * in the reset phase.
*/
nvme_ctrlr_start(ctrlr, true);
(void)atomic_cmpset_32(&ctrlr->is_resetting, 1, 0);
@@ -1697,7 +1839,7 @@ fail:
* itself, due to questionable APIs.
*/
nvme_printf(ctrlr, "Failed to reset on resume, failing.\n");
- nvme_ctrlr_fail(ctrlr);
+ nvme_ctrlr_fail(ctrlr, true);
(void)atomic_cmpset_32(&ctrlr->is_resetting, 1, 0);
return (0);
}
diff --git a/sys/dev/nvme/nvme_ctrlr_cmd.c b/sys/dev/nvme/nvme_ctrlr_cmd.c
index 68934b9b3947..993a7718356d 100644
--- a/sys/dev/nvme/nvme_ctrlr_cmd.c
+++ b/sys/dev/nvme/nvme_ctrlr_cmd.c
@@ -37,7 +37,7 @@ nvme_ctrlr_cmd_identify_controller(struct nvme_controller *ctrlr, void *payload,
struct nvme_command *cmd;
req = nvme_allocate_request_vaddr(payload,
- sizeof(struct nvme_controller_data), cb_fn, cb_arg);
+ sizeof(struct nvme_controller_data), M_WAITOK, cb_fn, cb_arg);
cmd = &req->cmd;
cmd->opc = NVME_OPC_IDENTIFY;
@@ -59,7 +59,7 @@ nvme_ctrlr_cmd_identify_namespace(struct nvme_controller *ctrlr, uint32_t nsid,
struct nvme_command *cmd;
req = nvme_allocate_request_vaddr(payload,
- sizeof(struct nvme_namespace_data), cb_fn, cb_arg);
+ sizeof(struct nvme_namespace_data), M_WAITOK, cb_fn, cb_arg);
cmd = &req->cmd;
cmd->opc = NVME_OPC_IDENTIFY;
@@ -79,7 +79,7 @@ nvme_ctrlr_cmd_create_io_cq(struct nvme_controller *ctrlr,
struct nvme_request *req;
struct nvme_command *cmd;
- req = nvme_allocate_request_null(cb_fn, cb_arg);
+ req = nvme_allocate_request_null(M_WAITOK, cb_fn, cb_arg);
cmd = &req->cmd;
cmd->opc = NVME_OPC_CREATE_IO_CQ;
@@ -103,7 +103,7 @@ nvme_ctrlr_cmd_create_io_sq(struct nvme_controller *ctrlr,
struct nvme_request *req;
struct nvme_command *cmd;
- req = nvme_allocate_request_null(cb_fn, cb_arg);
+ req = nvme_allocate_request_null(M_WAITOK, cb_fn, cb_arg);
cmd = &req->cmd;
cmd->opc = NVME_OPC_CREATE_IO_SQ;
@@ -127,7 +127,7 @@ nvme_ctrlr_cmd_delete_io_cq(struct nvme_controller *ctrlr,
struct nvme_request *req;
struct nvme_command *cmd;
- req = nvme_allocate_request_null(cb_fn, cb_arg);
+ req = nvme_allocate_request_null(M_WAITOK, cb_fn, cb_arg);
cmd = &req->cmd;
cmd->opc = NVME_OPC_DELETE_IO_CQ;
@@ -148,7 +148,7 @@ nvme_ctrlr_cmd_delete_io_sq(struct nvme_controller *ctrlr,
struct nvme_request *req;
struct nvme_command *cmd;
- req = nvme_allocate_request_null(cb_fn, cb_arg);
+ req = nvme_allocate_request_null(M_WAITOK, cb_fn, cb_arg);
cmd = &req->cmd;
cmd->opc = NVME_OPC_DELETE_IO_SQ;
@@ -171,7 +171,7 @@ nvme_ctrlr_cmd_set_feature(struct nvme_controller *ctrlr, uint8_t feature,
struct nvme_request *req;
struct nvme_command *cmd;
- req = nvme_allocate_request_null(cb_fn, cb_arg);
+ req = nvme_allocate_request_null(M_WAITOK, cb_fn, cb_arg);
cmd = &req->cmd;
cmd->opc = NVME_OPC_SET_FEATURES;
@@ -193,7 +193,7 @@ nvme_ctrlr_cmd_get_feature(struct nvme_controller *ctrlr, uint8_t feature,
struct nvme_request *req;
struct nvme_command *cmd;
- req = nvme_allocate_request_null(cb_fn, cb_arg);
+ req = nvme_allocate_request_null(M_WAITOK, cb_fn, cb_arg);
cmd = &req->cmd;
cmd->opc = NVME_OPC_GET_FEATURES;
@@ -259,7 +259,12 @@ nvme_ctrlr_cmd_get_log_page(struct nvme_controller *ctrlr, uint8_t log_page,
struct nvme_request *req;
struct nvme_command *cmd;
- req = nvme_allocate_request_vaddr(payload, payload_size, cb_fn, cb_arg);
+ /*
+ * XXX-MJ this should be M_WAITOK but we might be called from AER
+ * completion processing, which is a non-sleepable context.
+ */
+ req = nvme_allocate_request_vaddr(payload, payload_size,
+ M_NOWAIT, cb_fn, cb_arg);
cmd = &req->cmd;
cmd->opc = NVME_OPC_GET_LOG_PAGE;
@@ -319,7 +324,11 @@ nvme_ctrlr_cmd_abort(struct nvme_controller *ctrlr, uint16_t cid,
struct nvme_request *req;
struct nvme_command *cmd;
- req = nvme_allocate_request_null(cb_fn, cb_arg);
+ /*
+ * XXX-MJ this should be M_WAITOK, we do reset from non-sleepable
+ * context and abort commands as part of that.
+ */
+ req = nvme_allocate_request_null(M_NOWAIT, cb_fn, cb_arg);
cmd = &req->cmd;
cmd->opc = NVME_OPC_ABORT;
diff --git a/sys/dev/nvme/nvme_linux.h b/sys/dev/nvme/nvme_linux.h
new file mode 100644
index 000000000000..aaa68e1d34f8
--- /dev/null
+++ b/sys/dev/nvme/nvme_linux.h
@@ -0,0 +1,58 @@
+/*-
+ * Copyright (c) 2024, Netflix Inc.
+ * Written by Warner Losh
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+/*
+ * Linux compatible NVME ioctls. So far we just support ID, ADMIN_CMD and
+ * IO_CMD. The rest are not supported.
+ */
+
+
+#include <sys/ioccom.h>
+#include <sys/_types.h>
+
+struct nvme_passthru_cmd {
+ __uint8_t opcode;
+ __uint8_t flags;
+ __uint16_t rsvd1;
+ __uint32_t nsid;
+ __uint32_t cdw2;
+ __uint32_t cdw3;
+ __uint64_t metadata;
+ __uint64_t addr;
+ __uint32_t metadata_len;
+ __uint32_t data_len;
+ __uint32_t cdw10;
+ __uint32_t cdw11;
+ __uint32_t cdw12;
+ __uint32_t cdw13;
+ __uint32_t cdw14;
+ __uint32_t cdw15;
+ __uint32_t timeout_ms;
+ __uint32_t result;
+};
+
+#define nvme_admin_cmd nvme_passthru_cmd
+
+/*
+ * Linux nvme ioctls, commented out ones are not supported
+ */
+#define NVME_IOCTL_ID _IO('N', 0x40)
+#define NVME_IOCTL_ADMIN_CMD _IOWR('N', 0x41, struct nvme_admin_cmd)
+/* #define NVME_IOCTL_SUBMIT_IO _IOW('N', 0x42, struct nvme_user_io) */
+#define NVME_IOCTL_IO_CMD _IOWR('N', 0x43, struct nvme_passthru_cmd)
+#define NVME_IOCTL_RESET _IO('N', 0x44)
+/* #define NVME_IOCTL_SUBSYS_RESET _IO('N', 0x45) */
+/* #define NVME_IOCTL_RESCAN _IO('N', 0x46) */
+/* #define NVME_IOCTL_ADMIN64_CMD _IOWR('N', 0x47, struct nvme_passthru_cmd64) */
+/* #define NVME_IOCTL_IO64_CMD _IOWR('N', 0x48, struct nvme_passthru_cmd64) */
+/* #define NVME_IOCTL_IO64_CMD_VEC _IOWR('N', 0x49, struct nvme_passthru_cmd64) */
+
+/* io_uring async commands: */
+/* #define NVME_URING_CMD_IO _IOWR('N', 0x80, struct nvme_uring_cmd) */
+/* #define NVME_URING_CMD_IO_VEC _IOWR('N', 0x81, struct nvme_uring_cmd) */
+/* #define NVME_URING_CMD_ADMIN _IOWR('N', 0x82, struct nvme_uring_cmd) */
+/* #define NVME_URING_CMD_ADMIN_VEC _IOWR('N', 0x83, struct nvme_uring_cmd) */
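For illustration, a minimal sketch of driving these compat ioctls from userland by issuing an Identify Controller admin command. The opcode and CNS value come from the NVMe spec; the device path, include path, and buffer size are assumptions of this example rather than part of the change:

	#include <sys/ioctl.h>
	#include <dev/nvme/nvme_linux.h>
	#include <err.h>
	#include <fcntl.h>
	#include <stdint.h>
	#include <stdlib.h>

	int
	main(void)
	{
		struct nvme_passthru_cmd npc = { 0 };
		void *buf;
		int fd;

		if ((fd = open("/dev/nvme0", O_RDWR)) < 0)
			err(1, "open");
		if ((buf = malloc(4096)) == NULL)
			err(1, "malloc");
		npc.opcode = 0x06;			/* Identify (admin opcode) */
		npc.addr = (uintptr_t)buf;		/* data-in buffer */
		npc.data_len = 4096;
		npc.cdw10 = 1;				/* CNS=1: identify controller */
		/* metadata and metadata_len must stay zero; the driver rejects them. */
		if (ioctl(fd, NVME_IOCTL_ADMIN_CMD, &npc) < 0)
			err(1, "NVME_IOCTL_ADMIN_CMD");
		return (0);
	}

The completion's cdw0 comes back in npc.result; the ioctl sleeps in the driver until the command completes.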
diff --git a/sys/dev/nvme/nvme_ns.c b/sys/dev/nvme/nvme_ns.c
index 360b9f982c20..3f29382fe42f 100644
--- a/sys/dev/nvme/nvme_ns.c
+++ b/sys/dev/nvme/nvme_ns.c
@@ -43,6 +43,7 @@
#include <geom/geom.h>
#include "nvme_private.h"
+#include "nvme_linux.h"
static void nvme_bio_child_inbed(struct bio *parent, int bio_error);
static void nvme_bio_child_done(void *arg,
@@ -82,9 +83,8 @@ nvme_ns_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int flag,
case NVME_GET_NSID:
{
struct nvme_get_nsid *gnsid = (struct nvme_get_nsid *)arg;
- strncpy(gnsid->cdev, device_get_nameunit(ctrlr->dev),
+ strlcpy(gnsid->cdev, device_get_nameunit(ctrlr->dev),
sizeof(gnsid->cdev));
- gnsid->cdev[sizeof(gnsid->cdev) - 1] = '\0';
gnsid->nsid = ns->id;
break;
}
@@ -94,6 +94,18 @@ nvme_ns_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int flag,
case DIOCGSECTORSIZE:
*(u_int *)arg = nvme_ns_get_sector_size(ns);
break;
+ /* Linux Compatible (see nvme_linux.h) */
+ case NVME_IOCTL_ID:
+ td->td_retval[0] = ns->id;
+ return (0);
+
+ case NVME_IOCTL_ADMIN_CMD:
+ case NVME_IOCTL_IO_CMD: {
+ struct nvme_passthru_cmd *npc = (struct nvme_passthru_cmd *)arg;
+
+ return (nvme_ctrlr_linux_passthru_cmd(ctrlr, npc, ns->id, true,
+ cmd == NVME_IOCTL_ADMIN_CMD));
+ }
default:
return (ENOTTY);
}
@@ -429,6 +441,7 @@ nvme_ns_split_bio(struct nvme_namespace *ns, struct bio *bp,
if (child_bios == NULL)
return (ENOMEM);
+ counter_u64_add(ns->ctrlr->alignment_splits, 1);
for (i = 0; i < num_bios; i++) {
child = child_bios[i];
err = nvme_ns_bio_process(ns, child, nvme_bio_child_done);
@@ -604,11 +617,12 @@ nvme_ns_construct(struct nvme_namespace *ns, uint32_t id,
md_args.mda_unit = unit;
md_args.mda_mode = 0600;
md_args.mda_si_drv1 = ns;
- res = make_dev_s(&md_args, &ns->cdev, "nvme%dns%d",
- device_get_unit(ctrlr->dev), ns->id);
+ res = make_dev_s(&md_args, &ns->cdev, "%sn%d",
+ device_get_nameunit(ctrlr->dev), ns->id);
if (res != 0)
return (ENXIO);
-
+ ns->cdev->si_drv2 = make_dev_alias(ns->cdev, "%sns%d",
+ device_get_nameunit(ctrlr->dev), ns->id);
ns->cdev->si_flags |= SI_UNMAPPED;
return (0);
@@ -618,6 +632,9 @@ void
nvme_ns_destruct(struct nvme_namespace *ns)
{
- if (ns->cdev != NULL)
+ if (ns->cdev != NULL) {
+ if (ns->cdev->si_drv2 != NULL)
+ destroy_dev(ns->cdev->si_drv2);
destroy_dev(ns->cdev);
+ }
}
diff --git a/sys/dev/nvme/nvme_ns_cmd.c b/sys/dev/nvme/nvme_ns_cmd.c
index 8cbeac025307..1bad9929cb09 100644
--- a/sys/dev/nvme/nvme_ns_cmd.c
+++ b/sys/dev/nvme/nvme_ns_cmd.c
@@ -36,8 +36,7 @@ nvme_ns_cmd_read(struct nvme_namespace *ns, void *payload, uint64_t lba,
struct nvme_request *req;
req = nvme_allocate_request_vaddr(payload,
- lba_count*nvme_ns_get_sector_size(ns), cb_fn, cb_arg);
-
+ lba_count * nvme_ns_get_sector_size(ns), M_NOWAIT, cb_fn, cb_arg);
if (req == NULL)
return (ENOMEM);
@@ -56,11 +55,9 @@ nvme_ns_cmd_read_bio(struct nvme_namespace *ns, struct bio *bp,
uint64_t lba;
uint64_t lba_count;
- req = nvme_allocate_request_bio(bp, cb_fn, cb_arg);
-
+ req = nvme_allocate_request_bio(bp, M_NOWAIT, cb_fn, cb_arg);
if (req == NULL)
return (ENOMEM);
-
lba = bp->bio_offset / nvme_ns_get_sector_size(ns);
lba_count = bp->bio_bcount / nvme_ns_get_sector_size(ns);
nvme_ns_read_cmd(&req->cmd, ns->id, lba, lba_count);
@@ -77,8 +74,7 @@ nvme_ns_cmd_write(struct nvme_namespace *ns, void *payload, uint64_t lba,
struct nvme_request *req;
req = nvme_allocate_request_vaddr(payload,
- lba_count*nvme_ns_get_sector_size(ns), cb_fn, cb_arg);
-
+ lba_count * nvme_ns_get_sector_size(ns), M_NOWAIT, cb_fn, cb_arg);
if (req == NULL)
return (ENOMEM);
@@ -97,8 +93,7 @@ nvme_ns_cmd_write_bio(struct nvme_namespace *ns, struct bio *bp,
uint64_t lba;
uint64_t lba_count;
- req = nvme_allocate_request_bio(bp, cb_fn, cb_arg);
-
+ req = nvme_allocate_request_bio(bp, M_NOWAIT, cb_fn, cb_arg);
if (req == NULL)
return (ENOMEM);
lba = bp->bio_offset / nvme_ns_get_sector_size(ns);
@@ -118,8 +113,8 @@ nvme_ns_cmd_deallocate(struct nvme_namespace *ns, void *payload,
struct nvme_command *cmd;
req = nvme_allocate_request_vaddr(payload,
- num_ranges * sizeof(struct nvme_dsm_range), cb_fn, cb_arg);
-
+ num_ranges * sizeof(struct nvme_dsm_range), M_NOWAIT, cb_fn,
+ cb_arg);
if (req == NULL)
return (ENOMEM);
@@ -141,8 +136,7 @@ nvme_ns_cmd_flush(struct nvme_namespace *ns, nvme_cb_fn_t cb_fn, void *cb_arg)
{
struct nvme_request *req;
- req = nvme_allocate_request_null(cb_fn, cb_arg);
-
+ req = nvme_allocate_request_null(M_NOWAIT, cb_fn, cb_arg);
if (req == NULL)
return (ENOMEM);
@@ -165,8 +159,8 @@ nvme_ns_dump(struct nvme_namespace *ns, void *virt, off_t offset, size_t len)
int i;
status.done = FALSE;
- req = nvme_allocate_request_vaddr(virt, len, nvme_completion_poll_cb,
- &status);
+ req = nvme_allocate_request_vaddr(virt, len, M_NOWAIT,
+ nvme_completion_poll_cb, &status);
if (req == NULL)
return (ENOMEM);
diff --git a/sys/dev/nvme/nvme_private.h b/sys/dev/nvme/nvme_private.h
index 69141add4e48..949e69ec9290 100644
--- a/sys/dev/nvme/nvme_private.h
+++ b/sys/dev/nvme/nvme_private.h
@@ -32,6 +32,7 @@
#include <sys/param.h>
#include <sys/bio.h>
#include <sys/bus.h>
+#include <sys/counter.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
@@ -297,11 +298,15 @@ struct nvme_controller {
void *cons_cookie[NVME_MAX_CONSUMERS];
uint32_t is_resetting;
- uint32_t is_initialized;
uint32_t notification_sent;
+ u_int fail_on_reset;
bool is_failed;
+ bool is_failed_admin;
bool is_dying;
+ bool isr_warned;
+ bool is_initialized;
+
STAILQ_HEAD(, nvme_request) fail_req;
/* Host Memory Buffer */
@@ -317,6 +322,9 @@ struct nvme_controller {
bus_dmamap_t hmb_desc_map;
struct nvme_hmb_desc *hmb_desc_vaddr;
uint64_t hmb_desc_paddr;
+
+ /* Statistics */
+ counter_u64_t alignment_splits;
};
#define nvme_mmio_offsetof(reg) \
@@ -413,9 +421,6 @@ void nvme_qpair_submit_request(struct nvme_qpair *qpair,
struct nvme_request *req);
void nvme_qpair_reset(struct nvme_qpair *qpair);
void nvme_qpair_fail(struct nvme_qpair *qpair);
-void nvme_qpair_manual_complete_request(struct nvme_qpair *qpair,
- struct nvme_request *req,
- uint32_t sct, uint32_t sc);
void nvme_admin_qpair_enable(struct nvme_qpair *qpair);
void nvme_admin_qpair_disable(struct nvme_qpair *qpair);
@@ -481,11 +486,14 @@ nvme_single_map(void *arg, bus_dma_segment_t *seg, int nseg, int error)
}
static __inline struct nvme_request *
-_nvme_allocate_request(nvme_cb_fn_t cb_fn, void *cb_arg)
+_nvme_allocate_request(const int how, nvme_cb_fn_t cb_fn, void *cb_arg)
{
struct nvme_request *req;
- req = malloc(sizeof(*req), M_NVME, M_NOWAIT | M_ZERO);
+ KASSERT(how == M_WAITOK || how == M_NOWAIT,
+ ("nvme_allocate_request: invalid how %d", how));
+
+ req = malloc(sizeof(*req), M_NVME, how | M_ZERO);
if (req != NULL) {
req->cb_fn = cb_fn;
req->cb_arg = cb_arg;
@@ -496,11 +504,11 @@ _nvme_allocate_request(nvme_cb_fn_t cb_fn, void *cb_arg)
static __inline struct nvme_request *
nvme_allocate_request_vaddr(void *payload, uint32_t payload_size,
- nvme_cb_fn_t cb_fn, void *cb_arg)
+ const int how, nvme_cb_fn_t cb_fn, void *cb_arg)
{
struct nvme_request *req;
- req = _nvme_allocate_request(cb_fn, cb_arg);
+ req = _nvme_allocate_request(how, cb_fn, cb_arg);
if (req != NULL) {
req->payload = memdesc_vaddr(payload, payload_size);
req->payload_valid = true;
@@ -509,20 +517,21 @@ nvme_allocate_request_vaddr(void *payload, uint32_t payload_size,
}
static __inline struct nvme_request *
-nvme_allocate_request_null(nvme_cb_fn_t cb_fn, void *cb_arg)
+nvme_allocate_request_null(const int how, nvme_cb_fn_t cb_fn, void *cb_arg)
{
struct nvme_request *req;
- req = _nvme_allocate_request(cb_fn, cb_arg);
+ req = _nvme_allocate_request(how, cb_fn, cb_arg);
return (req);
}
static __inline struct nvme_request *
-nvme_allocate_request_bio(struct bio *bio, nvme_cb_fn_t cb_fn, void *cb_arg)
+nvme_allocate_request_bio(struct bio *bio, const int how, nvme_cb_fn_t cb_fn,
+ void *cb_arg)
{
struct nvme_request *req;
- req = _nvme_allocate_request(cb_fn, cb_arg);
+ req = _nvme_allocate_request(how, cb_fn, cb_arg);
if (req != NULL) {
req->payload = memdesc_bio(bio);
req->payload_valid = true;
@@ -531,16 +540,16 @@ nvme_allocate_request_bio(struct bio *bio, nvme_cb_fn_t cb_fn, void *cb_arg)
}
static __inline struct nvme_request *
-nvme_allocate_request_ccb(union ccb *ccb, nvme_cb_fn_t cb_fn, void *cb_arg)
+nvme_allocate_request_ccb(union ccb *ccb, const int how, nvme_cb_fn_t cb_fn,
+ void *cb_arg)
{
struct nvme_request *req;
- req = _nvme_allocate_request(cb_fn, cb_arg);
+ req = _nvme_allocate_request(how, cb_fn, cb_arg);
if (req != NULL) {
req->payload = memdesc_ccb(ccb);
req->payload_valid = true;
}
-
return (req);
}
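With the new `how` argument on the request allocators above, a caller in sleepable context passes M_WAITOK and may skip the NULL check, while a caller that can run from interrupt or timeout context passes M_NOWAIT and must handle allocation failure. A short sketch of both patterns, mirroring the call sites changed elsewhere in this diff:

	/* Sleepable context: an M_WAITOK allocation cannot fail. */
	req = nvme_allocate_request_null(M_WAITOK, cb_fn, cb_arg);
	nvme_ctrlr_submit_admin_request(ctrlr, req);

	/* Possibly non-sleepable context: M_NOWAIT may fail, so check. */
	req = nvme_allocate_request_bio(bp, M_NOWAIT, cb_fn, cb_arg);
	if (req == NULL)
		return (ENOMEM);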
diff --git a/sys/dev/nvme/nvme_qpair.c b/sys/dev/nvme/nvme_qpair.c
index 62d27e439180..bd8626e32209 100644
--- a/sys/dev/nvme/nvme_qpair.c
+++ b/sys/dev/nvme/nvme_qpair.c
@@ -31,6 +31,7 @@
#include <sys/conf.h>
#include <sys/domainset.h>
#include <sys/proc.h>
+#include <sys/sbuf.h>
#include <dev/pci/pcivar.h>
@@ -43,96 +44,36 @@ static void _nvme_qpair_submit_request(struct nvme_qpair *qpair,
struct nvme_request *req);
static void nvme_qpair_destroy(struct nvme_qpair *qpair);
-#define DEFAULT_INDEX 256
-#define DEFAULT_ENTRY(x) [DEFAULT_INDEX] = x
-#define OPC_ENTRY(x) [NVME_OPC_ ## x] = #x
-
-static const char *admin_opcode[DEFAULT_INDEX + 1] = {
- OPC_ENTRY(DELETE_IO_SQ),
- OPC_ENTRY(CREATE_IO_SQ),
- OPC_ENTRY(GET_LOG_PAGE),
- OPC_ENTRY(DELETE_IO_CQ),
- OPC_ENTRY(CREATE_IO_CQ),
- OPC_ENTRY(IDENTIFY),
- OPC_ENTRY(ABORT),
- OPC_ENTRY(SET_FEATURES),
- OPC_ENTRY(GET_FEATURES),
- OPC_ENTRY(ASYNC_EVENT_REQUEST),
- OPC_ENTRY(NAMESPACE_MANAGEMENT),
- OPC_ENTRY(FIRMWARE_ACTIVATE),
- OPC_ENTRY(FIRMWARE_IMAGE_DOWNLOAD),
- OPC_ENTRY(DEVICE_SELF_TEST),
- OPC_ENTRY(NAMESPACE_ATTACHMENT),
- OPC_ENTRY(KEEP_ALIVE),
- OPC_ENTRY(DIRECTIVE_SEND),
- OPC_ENTRY(DIRECTIVE_RECEIVE),
- OPC_ENTRY(VIRTUALIZATION_MANAGEMENT),
- OPC_ENTRY(NVME_MI_SEND),
- OPC_ENTRY(NVME_MI_RECEIVE),
- OPC_ENTRY(CAPACITY_MANAGEMENT),
- OPC_ENTRY(LOCKDOWN),
- OPC_ENTRY(DOORBELL_BUFFER_CONFIG),
- OPC_ENTRY(FABRICS_COMMANDS),
- OPC_ENTRY(FORMAT_NVM),
- OPC_ENTRY(SECURITY_SEND),
- OPC_ENTRY(SECURITY_RECEIVE),
- OPC_ENTRY(SANITIZE),
- OPC_ENTRY(GET_LBA_STATUS),
- DEFAULT_ENTRY("ADMIN COMMAND"),
-};
-
-static const char *io_opcode[DEFAULT_INDEX + 1] = {
- OPC_ENTRY(FLUSH),
- OPC_ENTRY(WRITE),
- OPC_ENTRY(READ),
- OPC_ENTRY(WRITE_UNCORRECTABLE),
- OPC_ENTRY(COMPARE),
- OPC_ENTRY(WRITE_ZEROES),
- OPC_ENTRY(DATASET_MANAGEMENT),
- OPC_ENTRY(VERIFY),
- OPC_ENTRY(RESERVATION_REGISTER),
- OPC_ENTRY(RESERVATION_REPORT),
- OPC_ENTRY(RESERVATION_ACQUIRE),
- OPC_ENTRY(RESERVATION_RELEASE),
- OPC_ENTRY(COPY),
- DEFAULT_ENTRY("IO COMMAND"),
-};
-
-static const char *
-get_opcode_string(const char *op[DEFAULT_INDEX + 1], uint16_t opc)
-{
- const char *nm = opc < DEFAULT_INDEX ? op[opc] : op[DEFAULT_INDEX];
-
- return (nm != NULL ? nm : op[DEFAULT_INDEX]);
-}
-
static const char *
-get_admin_opcode_string(uint16_t opc)
+get_opcode_string(bool admin, uint8_t opc, char *buf, size_t len)
{
- return (get_opcode_string(admin_opcode, opc));
-}
+ struct sbuf sb;
-static const char *
-get_io_opcode_string(uint16_t opc)
-{
- return (get_opcode_string(io_opcode, opc));
+ sbuf_new(&sb, buf, len, SBUF_FIXEDLEN);
+ nvme_opcode_sbuf(admin, opc, &sb);
+ if (sbuf_finish(&sb) != 0)
+ return ("");
+ return (buf);
}
static void
nvme_admin_qpair_print_command(struct nvme_qpair *qpair,
struct nvme_command *cmd)
{
+ char buf[64];
- nvme_printf(qpair->ctrlr, "%s (%02x) sqid:%d cid:%d nsid:%x "
+ nvme_printf(qpair->ctrlr, "%s sqid:%d cid:%d nsid:%x "
"cdw10:%08x cdw11:%08x\n",
- get_admin_opcode_string(cmd->opc), cmd->opc, qpair->id, cmd->cid,
- le32toh(cmd->nsid), le32toh(cmd->cdw10), le32toh(cmd->cdw11));
+ get_opcode_string(true, cmd->opc, buf, sizeof(buf)), qpair->id,
+ cmd->cid, le32toh(cmd->nsid), le32toh(cmd->cdw10),
+ le32toh(cmd->cdw11));
}
static void
nvme_io_qpair_print_command(struct nvme_qpair *qpair,
struct nvme_command *cmd)
{
+ char buf[64];
switch (cmd->opc) {
case NVME_OPC_WRITE:
@@ -143,23 +84,15 @@ nvme_io_qpair_print_command(struct nvme_qpair *qpair,
case NVME_OPC_VERIFY:
nvme_printf(qpair->ctrlr, "%s sqid:%d cid:%d nsid:%d "
"lba:%llu len:%d\n",
- get_io_opcode_string(cmd->opc), qpair->id, cmd->cid, le32toh(cmd->nsid),
+ get_opcode_string(false, cmd->opc, buf, sizeof(buf)),
+ qpair->id, cmd->cid, le32toh(cmd->nsid),
((unsigned long long)le32toh(cmd->cdw11) << 32) + le32toh(cmd->cdw10),
(le32toh(cmd->cdw12) & 0xFFFF) + 1);
break;
- case NVME_OPC_FLUSH:
- case NVME_OPC_DATASET_MANAGEMENT:
- case NVME_OPC_RESERVATION_REGISTER:
- case NVME_OPC_RESERVATION_REPORT:
- case NVME_OPC_RESERVATION_ACQUIRE:
- case NVME_OPC_RESERVATION_RELEASE:
- nvme_printf(qpair->ctrlr, "%s sqid:%d cid:%d nsid:%d\n",
- get_io_opcode_string(cmd->opc), qpair->id, cmd->cid, le32toh(cmd->nsid));
- break;
default:
- nvme_printf(qpair->ctrlr, "%s (%02x) sqid:%d cid:%d nsid:%d\n",
- get_io_opcode_string(cmd->opc), cmd->opc, qpair->id,
- cmd->cid, le32toh(cmd->nsid));
+ nvme_printf(qpair->ctrlr, "%s sqid:%d cid:%d nsid:%d\n",
+ get_opcode_string(false, cmd->opc, buf, sizeof(buf)),
+ qpair->id, cmd->cid, le32toh(cmd->nsid));
break;
}
}
@@ -183,170 +116,33 @@ nvme_qpair_print_command(struct nvme_qpair *qpair, struct nvme_command *cmd)
}
}
-struct nvme_status_string {
- uint16_t sc;
- const char * str;
-};
-
-static struct nvme_status_string generic_status[] = {
- { NVME_SC_SUCCESS, "SUCCESS" },
- { NVME_SC_INVALID_OPCODE, "INVALID OPCODE" },
- { NVME_SC_INVALID_FIELD, "INVALID_FIELD" },
- { NVME_SC_COMMAND_ID_CONFLICT, "COMMAND ID CONFLICT" },
- { NVME_SC_DATA_TRANSFER_ERROR, "DATA TRANSFER ERROR" },
- { NVME_SC_ABORTED_POWER_LOSS, "ABORTED - POWER LOSS" },
- { NVME_SC_INTERNAL_DEVICE_ERROR, "INTERNAL DEVICE ERROR" },
- { NVME_SC_ABORTED_BY_REQUEST, "ABORTED - BY REQUEST" },
- { NVME_SC_ABORTED_SQ_DELETION, "ABORTED - SQ DELETION" },
- { NVME_SC_ABORTED_FAILED_FUSED, "ABORTED - FAILED FUSED" },
- { NVME_SC_ABORTED_MISSING_FUSED, "ABORTED - MISSING FUSED" },
- { NVME_SC_INVALID_NAMESPACE_OR_FORMAT, "INVALID NAMESPACE OR FORMAT" },
- { NVME_SC_COMMAND_SEQUENCE_ERROR, "COMMAND SEQUENCE ERROR" },
- { NVME_SC_INVALID_SGL_SEGMENT_DESCR, "INVALID SGL SEGMENT DESCRIPTOR" },
- { NVME_SC_INVALID_NUMBER_OF_SGL_DESCR, "INVALID NUMBER OF SGL DESCRIPTORS" },
- { NVME_SC_DATA_SGL_LENGTH_INVALID, "DATA SGL LENGTH INVALID" },
- { NVME_SC_METADATA_SGL_LENGTH_INVALID, "METADATA SGL LENGTH INVALID" },
- { NVME_SC_SGL_DESCRIPTOR_TYPE_INVALID, "SGL DESCRIPTOR TYPE INVALID" },
- { NVME_SC_INVALID_USE_OF_CMB, "INVALID USE OF CONTROLLER MEMORY BUFFER" },
- { NVME_SC_PRP_OFFET_INVALID, "PRP OFFET INVALID" },
- { NVME_SC_ATOMIC_WRITE_UNIT_EXCEEDED, "ATOMIC WRITE UNIT EXCEEDED" },
- { NVME_SC_OPERATION_DENIED, "OPERATION DENIED" },
- { NVME_SC_SGL_OFFSET_INVALID, "SGL OFFSET INVALID" },
- { NVME_SC_HOST_ID_INCONSISTENT_FORMAT, "HOST IDENTIFIER INCONSISTENT FORMAT" },
- { NVME_SC_KEEP_ALIVE_TIMEOUT_EXPIRED, "KEEP ALIVE TIMEOUT EXPIRED" },
- { NVME_SC_KEEP_ALIVE_TIMEOUT_INVALID, "KEEP ALIVE TIMEOUT INVALID" },
- { NVME_SC_ABORTED_DUE_TO_PREEMPT, "COMMAND ABORTED DUE TO PREEMPT AND ABORT" },
- { NVME_SC_SANITIZE_FAILED, "SANITIZE FAILED" },
- { NVME_SC_SANITIZE_IN_PROGRESS, "SANITIZE IN PROGRESS" },
- { NVME_SC_SGL_DATA_BLOCK_GRAN_INVALID, "SGL_DATA_BLOCK_GRANULARITY_INVALID" },
- { NVME_SC_NOT_SUPPORTED_IN_CMB, "COMMAND NOT SUPPORTED FOR QUEUE IN CMB" },
- { NVME_SC_NAMESPACE_IS_WRITE_PROTECTED, "NAMESPACE IS WRITE PROTECTED" },
- { NVME_SC_COMMAND_INTERRUPTED, "COMMAND INTERRUPTED" },
- { NVME_SC_TRANSIENT_TRANSPORT_ERROR, "TRANSIENT TRANSPORT ERROR" },
-
- { NVME_SC_LBA_OUT_OF_RANGE, "LBA OUT OF RANGE" },
- { NVME_SC_CAPACITY_EXCEEDED, "CAPACITY EXCEEDED" },
- { NVME_SC_NAMESPACE_NOT_READY, "NAMESPACE NOT READY" },
- { NVME_SC_RESERVATION_CONFLICT, "RESERVATION CONFLICT" },
- { NVME_SC_FORMAT_IN_PROGRESS, "FORMAT IN PROGRESS" },
- { 0xFFFF, "GENERIC" }
-};
-
-static struct nvme_status_string command_specific_status[] = {
- { NVME_SC_COMPLETION_QUEUE_INVALID, "INVALID COMPLETION QUEUE" },
- { NVME_SC_INVALID_QUEUE_IDENTIFIER, "INVALID QUEUE IDENTIFIER" },
- { NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED, "MAX QUEUE SIZE EXCEEDED" },
- { NVME_SC_ABORT_COMMAND_LIMIT_EXCEEDED, "ABORT CMD LIMIT EXCEEDED" },
- { NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED, "ASYNC LIMIT EXCEEDED" },
- { NVME_SC_INVALID_FIRMWARE_SLOT, "INVALID FIRMWARE SLOT" },
- { NVME_SC_INVALID_FIRMWARE_IMAGE, "INVALID FIRMWARE IMAGE" },
- { NVME_SC_INVALID_INTERRUPT_VECTOR, "INVALID INTERRUPT VECTOR" },
- { NVME_SC_INVALID_LOG_PAGE, "INVALID LOG PAGE" },
- { NVME_SC_INVALID_FORMAT, "INVALID FORMAT" },
- { NVME_SC_FIRMWARE_REQUIRES_RESET, "FIRMWARE REQUIRES RESET" },
- { NVME_SC_INVALID_QUEUE_DELETION, "INVALID QUEUE DELETION" },
- { NVME_SC_FEATURE_NOT_SAVEABLE, "FEATURE IDENTIFIER NOT SAVEABLE" },
- { NVME_SC_FEATURE_NOT_CHANGEABLE, "FEATURE NOT CHANGEABLE" },
- { NVME_SC_FEATURE_NOT_NS_SPECIFIC, "FEATURE NOT NAMESPACE SPECIFIC" },
- { NVME_SC_FW_ACT_REQUIRES_NVMS_RESET, "FIRMWARE ACTIVATION REQUIRES NVM SUBSYSTEM RESET" },
- { NVME_SC_FW_ACT_REQUIRES_RESET, "FIRMWARE ACTIVATION REQUIRES RESET" },
- { NVME_SC_FW_ACT_REQUIRES_TIME, "FIRMWARE ACTIVATION REQUIRES MAXIMUM TIME VIOLATION" },
- { NVME_SC_FW_ACT_PROHIBITED, "FIRMWARE ACTIVATION PROHIBITED" },
- { NVME_SC_OVERLAPPING_RANGE, "OVERLAPPING RANGE" },
- { NVME_SC_NS_INSUFFICIENT_CAPACITY, "NAMESPACE INSUFFICIENT CAPACITY" },
- { NVME_SC_NS_ID_UNAVAILABLE, "NAMESPACE IDENTIFIER UNAVAILABLE" },
- { NVME_SC_NS_ALREADY_ATTACHED, "NAMESPACE ALREADY ATTACHED" },
- { NVME_SC_NS_IS_PRIVATE, "NAMESPACE IS PRIVATE" },
- { NVME_SC_NS_NOT_ATTACHED, "NS NOT ATTACHED" },
- { NVME_SC_THIN_PROV_NOT_SUPPORTED, "THIN PROVISIONING NOT SUPPORTED" },
- { NVME_SC_CTRLR_LIST_INVALID, "CONTROLLER LIST INVALID" },
- { NVME_SC_SELF_TEST_IN_PROGRESS, "DEVICE SELF-TEST IN PROGRESS" },
- { NVME_SC_BOOT_PART_WRITE_PROHIB, "BOOT PARTITION WRITE PROHIBITED" },
- { NVME_SC_INVALID_CTRLR_ID, "INVALID CONTROLLER IDENTIFIER" },
- { NVME_SC_INVALID_SEC_CTRLR_STATE, "INVALID SECONDARY CONTROLLER STATE" },
- { NVME_SC_INVALID_NUM_OF_CTRLR_RESRC, "INVALID NUMBER OF CONTROLLER RESOURCES" },
- { NVME_SC_INVALID_RESOURCE_ID, "INVALID RESOURCE IDENTIFIER" },
- { NVME_SC_SANITIZE_PROHIBITED_WPMRE, "SANITIZE PROHIBITED WRITE PERSISTENT MEMORY REGION ENABLED" },
- { NVME_SC_ANA_GROUP_ID_INVALID, "ANA GROUP IDENTIFIED INVALID" },
- { NVME_SC_ANA_ATTACH_FAILED, "ANA ATTACH FAILED" },
-
- { NVME_SC_CONFLICTING_ATTRIBUTES, "CONFLICTING ATTRIBUTES" },
- { NVME_SC_INVALID_PROTECTION_INFO, "INVALID PROTECTION INFO" },
- { NVME_SC_ATTEMPTED_WRITE_TO_RO_PAGE, "WRITE TO RO PAGE" },
- { 0xFFFF, "COMMAND SPECIFIC" }
-};
-
-static struct nvme_status_string media_error_status[] = {
- { NVME_SC_WRITE_FAULTS, "WRITE FAULTS" },
- { NVME_SC_UNRECOVERED_READ_ERROR, "UNRECOVERED READ ERROR" },
- { NVME_SC_GUARD_CHECK_ERROR, "GUARD CHECK ERROR" },
- { NVME_SC_APPLICATION_TAG_CHECK_ERROR, "APPLICATION TAG CHECK ERROR" },
- { NVME_SC_REFERENCE_TAG_CHECK_ERROR, "REFERENCE TAG CHECK ERROR" },
- { NVME_SC_COMPARE_FAILURE, "COMPARE FAILURE" },
- { NVME_SC_ACCESS_DENIED, "ACCESS DENIED" },
- { NVME_SC_DEALLOCATED_OR_UNWRITTEN, "DEALLOCATED OR UNWRITTEN LOGICAL BLOCK" },
- { 0xFFFF, "MEDIA ERROR" }
-};
-
-static struct nvme_status_string path_related_status[] = {
- { NVME_SC_INTERNAL_PATH_ERROR, "INTERNAL PATH ERROR" },
- { NVME_SC_ASYMMETRIC_ACCESS_PERSISTENT_LOSS, "ASYMMETRIC ACCESS PERSISTENT LOSS" },
- { NVME_SC_ASYMMETRIC_ACCESS_INACCESSIBLE, "ASYMMETRIC ACCESS INACCESSIBLE" },
- { NVME_SC_ASYMMETRIC_ACCESS_TRANSITION, "ASYMMETRIC ACCESS TRANSITION" },
- { NVME_SC_CONTROLLER_PATHING_ERROR, "CONTROLLER PATHING ERROR" },
- { NVME_SC_HOST_PATHING_ERROR, "HOST PATHING ERROR" },
- { NVME_SC_COMMAND_ABORTED_BY_HOST, "COMMAND ABORTED BY HOST" },
- { 0xFFFF, "PATH RELATED" },
-};
-
static const char *
-get_status_string(uint16_t sct, uint16_t sc)
+get_status_string(const struct nvme_completion *cpl, char *buf, size_t len)
{
- struct nvme_status_string *entry;
+ struct sbuf sb;
- switch (sct) {
- case NVME_SCT_GENERIC:
- entry = generic_status;
- break;
- case NVME_SCT_COMMAND_SPECIFIC:
- entry = command_specific_status;
- break;
- case NVME_SCT_MEDIA_ERROR:
- entry = media_error_status;
- break;
- case NVME_SCT_PATH_RELATED:
- entry = path_related_status;
- break;
- case NVME_SCT_VENDOR_SPECIFIC:
- return ("VENDOR SPECIFIC");
- default:
- return ("RESERVED");
- }
-
- while (entry->sc != 0xFFFF) {
- if (entry->sc == sc)
- return (entry->str);
- entry++;
- }
- return (entry->str);
+ sbuf_new(&sb, buf, len, SBUF_FIXEDLEN);
+ nvme_sc_sbuf(cpl, &sb);
+ if (sbuf_finish(&sb) != 0)
+ return ("");
+ return (buf);
}
void
nvme_qpair_print_completion(struct nvme_qpair *qpair,
struct nvme_completion *cpl)
{
- uint8_t sct, sc, crd, m, dnr, p;
+ char buf[64];
+ uint8_t crd, m, dnr, p;
- sct = NVME_STATUS_GET_SCT(cpl->status);
- sc = NVME_STATUS_GET_SC(cpl->status);
crd = NVME_STATUS_GET_CRD(cpl->status);
m = NVME_STATUS_GET_M(cpl->status);
dnr = NVME_STATUS_GET_DNR(cpl->status);
p = NVME_STATUS_GET_P(cpl->status);
- nvme_printf(qpair->ctrlr, "%s (%02x/%02x) crd:%x m:%x dnr:%x p:%d "
+ nvme_printf(qpair->ctrlr, "%s crd:%x m:%x dnr:%x p:%d "
"sqid:%d cid:%d cdw0:%x\n",
- get_status_string(sct, sc), sct, sc, crd, m, dnr, p,
+ get_status_string(cpl, buf, sizeof(buf)), crd, m, dnr, p,
cpl->sqid, cpl->cid, cpl->cdw0);
}
@@ -414,10 +210,12 @@ static void
nvme_qpair_complete_tracker(struct nvme_tracker *tr,
struct nvme_completion *cpl, error_print_t print_on_error)
{
- struct nvme_qpair * qpair = tr->qpair;
+ struct nvme_qpair *qpair = tr->qpair;
struct nvme_request *req;
bool retry, error, retriable;
+ mtx_assert(&qpair->lock, MA_NOTOWNED);
+
req = tr->req;
error = nvme_completion_is_error(cpl);
retriable = nvme_completion_is_retry(cpl);
@@ -480,43 +278,52 @@ nvme_qpair_complete_tracker(struct nvme_tracker *tr,
mtx_unlock(&qpair->lock);
}
+static uint32_t
+nvme_qpair_make_status(uint32_t sct, uint32_t sc, uint32_t dnr)
+{
+ uint32_t status = 0;
+
+ status |= NVMEF(NVME_STATUS_SCT, sct);
+ status |= NVMEF(NVME_STATUS_SC, sc);
+ status |= NVMEF(NVME_STATUS_DNR, dnr);
+ /* M=0 : this is artificial so no data in error log page */
+ /* CRD=0 : this is artificial and no delayed retry support anyway */
+ /* P=0 : phase not checked */
+ return (status);
+}
+
static void
nvme_qpair_manual_complete_tracker(
struct nvme_tracker *tr, uint32_t sct, uint32_t sc, uint32_t dnr,
error_print_t print_on_error)
{
struct nvme_completion cpl;
+ struct nvme_qpair * qpair = tr->qpair;
- memset(&cpl, 0, sizeof(cpl));
+ mtx_assert(&qpair->lock, MA_NOTOWNED);
- struct nvme_qpair * qpair = tr->qpair;
+ memset(&cpl, 0, sizeof(cpl));
cpl.sqid = qpair->id;
cpl.cid = tr->cid;
- cpl.status |= NVMEF(NVME_STATUS_SCT, sct);
- cpl.status |= NVMEF(NVME_STATUS_SC, sc);
- cpl.status |= NVMEF(NVME_STATUS_DNR, dnr);
- /* M=0 : this is artificial so no data in error log page */
- /* CRD=0 : this is artificial and no delayed retry support anyway */
- /* P=0 : phase not checked */
+ cpl.status = nvme_qpair_make_status(sct, sc, dnr);
nvme_qpair_complete_tracker(tr, &cpl, print_on_error);
}
-void
+static void
nvme_qpair_manual_complete_request(struct nvme_qpair *qpair,
- struct nvme_request *req, uint32_t sct, uint32_t sc)
+ struct nvme_request *req, uint32_t sct, uint32_t sc, uint32_t dnr,
+ error_print_t print_on_error)
{
struct nvme_completion cpl;
bool error;
memset(&cpl, 0, sizeof(cpl));
cpl.sqid = qpair->id;
- cpl.status |= NVMEF(NVME_STATUS_SCT, sct);
- cpl.status |= NVMEF(NVME_STATUS_SC, sc);
-
+ cpl.status = nvme_qpair_make_status(sct, sc, dnr);
error = nvme_completion_is_error(&cpl);
- if (error) {
+ if (error && print_on_error == ERROR_PRINT_ALL) {
nvme_qpair_print_command(qpair, &req->cmd);
nvme_qpair_print_completion(qpair, &cpl);
}
@@ -679,7 +486,7 @@ _nvme_qpair_process_completions(struct nvme_qpair *qpair)
bool
nvme_qpair_process_completions(struct nvme_qpair *qpair)
{
- bool done;
+ bool done = false;
/*
* Interlock with reset / recovery code. This is an usually uncontended
@@ -687,12 +494,12 @@ nvme_qpair_process_completions(struct nvme_qpair *qpair)
* and to prevent races with the recovery process called from a timeout
* context.
*/
- if (!mtx_trylock(&qpair->recovery)) {
- qpair->num_recovery_nolock++;
- return (false);
- }
+ mtx_lock(&qpair->recovery);
- done = _nvme_qpair_process_completions(qpair);
+ if (__predict_true(qpair->recovery_state == RECOVERY_NONE))
+ done = _nvme_qpair_process_completions(qpair);
+ else
+ qpair->num_recovery_nolock++; // XXX likely need to rename
mtx_unlock(&qpair->recovery);
@@ -950,27 +757,26 @@ nvme_admin_qpair_abort_aers(struct nvme_qpair *qpair)
/*
* nvme_complete_tracker must be called without the qpair lock held. It
* takes the lock to adjust outstanding_tr list, so make sure we don't
- * have it yet (since this is a general purpose routine). We take the
- * lock to make the list traverse safe, but have to drop the lock to
- * complete any AER. We restart the list scan when we do this to make
- * this safe. There's interlock with the ISR so we know this tracker
- * won't be completed twice.
+	 * have it yet. We need the lock to make the list traversal safe, but
+ * have to drop the lock to complete any AER. We restart the list scan
+ * when we do this to make this safe. There's interlock with the ISR so
+ * we know this tracker won't be completed twice.
*/
mtx_assert(&qpair->lock, MA_NOTOWNED);
mtx_lock(&qpair->lock);
tr = TAILQ_FIRST(&qpair->outstanding_tr);
while (tr != NULL) {
- if (tr->req->cmd.opc == NVME_OPC_ASYNC_EVENT_REQUEST) {
- mtx_unlock(&qpair->lock);
- nvme_qpair_manual_complete_tracker(tr,
- NVME_SCT_GENERIC, NVME_SC_ABORTED_SQ_DELETION, 0,
- ERROR_PRINT_NONE);
- mtx_lock(&qpair->lock);
- tr = TAILQ_FIRST(&qpair->outstanding_tr);
- } else {
+ if (tr->req->cmd.opc != NVME_OPC_ASYNC_EVENT_REQUEST) {
tr = TAILQ_NEXT(tr, tailq);
+ continue;
}
+ mtx_unlock(&qpair->lock);
+ nvme_qpair_manual_complete_tracker(tr,
+ NVME_SCT_GENERIC, NVME_SC_ABORTED_SQ_DELETION, 0,
+ ERROR_PRINT_NONE);
+ mtx_lock(&qpair->lock);
+ tr = TAILQ_FIRST(&qpair->outstanding_tr);
}
mtx_unlock(&qpair->lock);
}
@@ -997,22 +803,35 @@ nvme_abort_complete(void *arg, const struct nvme_completion *status)
struct nvme_tracker *tr = arg;
/*
- * If cdw0 == 1, the controller was not able to abort the command
- * we requested. We still need to check the active tracker array,
- * to cover race where I/O timed out at same time controller was
- * completing the I/O.
+ * If cdw0 bit 0 == 1, the controller was not able to abort the command
+ * we requested. We still need to check the active tracker array, to
+	 * cover the race where the I/O timed out at the same time the controller
+	 * was completing it. An abort command is always on the admin queue, but
+	 * affects either an admin or an I/O queue, so take the appropriate qpair lock
+ * for the original command's queue, since we'll need it to avoid races
+ * with the completion code and to complete the command manually.
*/
- if (status->cdw0 == 1 && tr->qpair->act_tr[tr->cid] != NULL) {
+ mtx_lock(&tr->qpair->lock);
+ if ((status->cdw0 & 1) == 1 && tr->qpair->act_tr[tr->cid] != NULL) {
/*
- * An I/O has timed out, and the controller was unable to
- * abort it for some reason. Construct a fake completion
- * status, and then complete the I/O's tracker manually.
+ * An I/O has timed out, and the controller was unable to abort
+ * it for some reason. And we've not processed a completion for
+ * it yet. Construct a fake completion status, and then complete
+ * the I/O's tracker manually.
*/
nvme_printf(tr->qpair->ctrlr,
"abort command failed, aborting command manually\n");
nvme_qpair_manual_complete_tracker(tr,
NVME_SCT_GENERIC, NVME_SC_ABORTED_BY_REQUEST, 0, ERROR_PRINT_ALL);
}
+ /*
+ * XXX We don't check status for the possible 'Could not abort because
+ * excess aborts were submitted to the controller'. We don't prevent
+	 * that, either. Documenting it here for the future, since the standard is
+	 * squishy and only says 'may generate' but implies anything is possible,
+	 * including hangs, if you exceed the ACL.
+ */
+ mtx_unlock(&tr->qpair->lock);
}
static void
@@ -1022,8 +841,9 @@ nvme_qpair_timeout(void *arg)
struct nvme_controller *ctrlr = qpair->ctrlr;
struct nvme_tracker *tr;
sbintime_t now;
- bool idle = false;
- bool needs_reset;
+ bool idle = true;
+ bool is_admin = qpair == &ctrlr->adminq;
+ bool fast;
uint32_t csts;
uint8_t cfs;
@@ -1034,9 +854,10 @@ nvme_qpair_timeout(void *arg)
* failure processing that races with the qpair timeout will fail
* safely.
*/
- if (qpair->ctrlr->is_failed) {
+ if (is_admin ? qpair->ctrlr->is_failed_admin : qpair->ctrlr->is_failed) {
nvme_printf(qpair->ctrlr,
- "Failed controller, stopping watchdog timeout.\n");
+ "%sFailed controller, stopping watchdog timeout.\n",
+ is_admin ? "Complete " : "");
qpair->timer_armed = false;
return;
}
@@ -1069,23 +890,35 @@ nvme_qpair_timeout(void *arg)
*/
csts = nvme_mmio_read_4(ctrlr, csts);
cfs = NVMEV(NVME_CSTS_REG_CFS, csts);
- if (csts == NVME_GONE || cfs == 1)
- goto do_reset;
+ if (csts == NVME_GONE || cfs == 1) {
+ /*
+ * We've had a command timeout that we weren't able to
+			 * abort, or we have aborts disabled and a command
+			 * timed out.
+ *
+ * If we get here due to a possible surprise hot-unplug
+ * event, then we let nvme_ctrlr_reset confirm and fail
+ * the controller.
+ */
+do_reset:
+ nvme_printf(ctrlr, "Resetting controller due to a timeout%s.\n",
+ (csts == 0xffffffff) ? " and possible hot unplug" :
+ (cfs ? " and fatal error status" : ""));
+ qpair->recovery_state = RECOVERY_WAITING;
+ nvme_ctrlr_reset(ctrlr);
+ idle = false;
+ break;
+ }
- /*
- * Process completions. We already have the recovery lock, so
- * call the locked version.
- */
- _nvme_qpair_process_completions(qpair);
/*
- * Check to see if we need to timeout any commands. If we do, then
- * we also enter a recovery phase.
+ * See if there's any recovery needed. First, do a fast check to
+ * see if anything could have timed out. If not, then skip
+ * everything else.
*/
- now = getsbinuptime();
- needs_reset = false;
- idle = true;
+ fast = false;
mtx_lock(&qpair->lock);
+ now = getsbinuptime();
TAILQ_FOREACH(tr, &qpair->outstanding_tr, tailq) {
/*
* Skip async commands, they are posted to the card for
@@ -1093,48 +926,83 @@ nvme_qpair_timeout(void *arg)
*/
if (tr->deadline == SBT_MAX)
continue;
- if (now > tr->deadline) {
- if (tr->req->cb_fn != nvme_abort_complete &&
- ctrlr->enable_aborts) {
- /*
- * This isn't an abort command, ask
- * for a hardware abort.
- */
- nvme_ctrlr_cmd_abort(ctrlr, tr->cid,
- qpair->id, nvme_abort_complete, tr);
- } else {
- /*
- * Otherwise we have a live command in
- * the card (either one we couldn't
- * abort, or aborts weren't enabled).
- * The only safe way to proceed is to do
- * a reset.
- */
- needs_reset = true;
- }
- } else {
- idle = false;
- }
+
+ /*
+ * If the first real transaction is not in timeout, then
+ * we're done. Otherwise, we try recovery.
+ */
+ idle = false;
+ if (now <= tr->deadline)
+ fast = true;
+ break;
}
mtx_unlock(&qpair->lock);
- if (!needs_reset)
+ if (idle || fast)
break;
/*
- * We've had a command timeout that we weren't able to abort
- *
- * If we get here due to a possible surprise hot-unplug event,
- * then we let nvme_ctrlr_reset confirm and fail the
- * controller.
+ * There's a stale transaction at the start of the queue whose
+		 * deadline has passed. Poll the completions as a last-ditch
+		 * effort in case an interrupt has been missed. If completions
+		 * were found, warn the user about possible interrupt issues,
+		 * but only once per controller.
+ */
+ if (_nvme_qpair_process_completions(qpair) && !ctrlr->isr_warned) {
+ nvme_printf(ctrlr, "System interrupt issues?\n");
+ ctrlr->isr_warned = true;
+ }
+
+ /*
+	 * Now that we've run the ISR, re-check to see if there are any
+	 * timed-out commands and abort them or reset the card if so.
*/
- do_reset:
- nvme_printf(ctrlr, "Resetting controller due to a timeout%s.\n",
- (csts == 0xffffffff) ? " and possible hot unplug" :
- (cfs ? " and fatal error status" : ""));
- qpair->recovery_state = RECOVERY_WAITING;
- nvme_ctrlr_reset(ctrlr);
- idle = false; /* We want to keep polling */
+ mtx_lock(&qpair->lock);
+ idle = true;
+ TAILQ_FOREACH(tr, &qpair->outstanding_tr, tailq) {
+ /*
+ * Skip async commands, they are posted to the card for
+ * an indefinite amount of time and have no deadline.
+ */
+ if (tr->deadline == SBT_MAX)
+ continue;
+
+ /*
+ * If we know this tracker hasn't timed out, we also
+ * know all subsequent ones haven't timed out. The tr
+ * queue is in submission order and all normal commands
+			 * in a queue have the same timeout (or the timeout was
+			 * changed by the user, in which case we'll still time
+			 * out eventually).
+ */
+ idle = false;
+ if (now <= tr->deadline)
+ break;
+
+ /*
+ * Timeout expired, abort it or reset controller.
+ */
+ if (ctrlr->enable_aborts &&
+ tr->req->cb_fn != nvme_abort_complete) {
+ /*
+ * This isn't an abort command, ask for a
+ * hardware abort. This goes to the admin
+ * queue which will reset the card if it
+ * times out.
+ */
+ nvme_ctrlr_cmd_abort(ctrlr, tr->cid, qpair->id,
+ nvme_abort_complete, tr);
+ } else {
+ /*
+ * We have a live command in the card (either
+ * one we couldn't abort, or aborts weren't
+ * enabled). We can only reset.
+ */
+ mtx_unlock(&qpair->lock);
+ goto do_reset;
+ }
+ }
+ mtx_unlock(&qpair->lock);
break;
+
case RECOVERY_WAITING:
/*
* These messages aren't interesting while we're suspended. We
@@ -1201,7 +1069,7 @@ nvme_qpair_submit_tracker(struct nvme_qpair *qpair, struct nvme_tracker *tr)
bus_dmamap_sync(qpair->dma_tag, qpair->queuemem_map,
BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
- bus_space_write_4(qpair->ctrlr->bus_tag, qpair->ctrlr->bus_handle,
+ bus_space_write_4(ctrlr->bus_tag, ctrlr->bus_handle,
qpair->sq_tdbl_off, qpair->sq_tail);
qpair->num_cmds++;
}
@@ -1259,47 +1127,41 @@ _nvme_qpair_submit_request(struct nvme_qpair *qpair, struct nvme_request *req)
{
struct nvme_tracker *tr;
int err = 0;
+ bool is_admin = qpair == &qpair->ctrlr->adminq;
mtx_assert(&qpair->lock, MA_OWNED);
tr = TAILQ_FIRST(&qpair->free_tr);
req->qpair = qpair;
- if (tr == NULL || qpair->recovery_state != RECOVERY_NONE) {
- /*
- * No tracker is available, or the qpair is disabled due to an
- * in-progress controller-level reset. If we lose the race with
- * recovery_state, then we may add an extra request to the queue
- * which will be resubmitted later. We only set recovery_state
- * to NONE with qpair->lock also held, so if we observe that the
- * state is not NONE, we know it can't transition to NONE below
- * when we've submitted the request to hardware.
- *
- * Also, as part of the failure process, we set recovery_state
- * to RECOVERY_WAITING, so we check here to see if we've failed
- * the controller. We set it before we call the qpair_fail
- * functions, which take out the lock lock before messing with
- * queued_req. Since we hold that lock, we know it's safe to
- * either fail directly, or queue the failure should is_failed
- * be stale. If we lose the race reading is_failed, then
- * nvme_qpair_fail will fail the queued request.
- */
+ /*
+	 * The controller has failed, so fail the request. Note that this races
+ * the recovery / timeout code. Since we hold the qpair lock, we know
+ * it's safe to fail directly. is_failed is set when we fail the
+ * controller. It is only ever reset in the ioctl reset controller
+ * path, which is safe to race (for failed controllers, we make no
+ * guarantees about bringing it out of failed state relative to other
+ * commands). We try hard to allow admin commands when the entire
+ * controller hasn't failed, only something related to I/O queues.
+ */
+ if (is_admin ? qpair->ctrlr->is_failed_admin : qpair->ctrlr->is_failed) {
+ nvme_qpair_manual_complete_request(qpair, req,
+ NVME_SCT_GENERIC, NVME_SC_ABORTED_BY_REQUEST, 1,
+ ERROR_PRINT_NONE);
+ return;
+ }
- if (qpair->ctrlr->is_failed) {
- /*
- * The controller has failed, so fail the request.
- */
- nvme_qpair_manual_complete_request(qpair, req,
- NVME_SCT_GENERIC, NVME_SC_ABORTED_BY_REQUEST);
- } else {
- /*
- * Put the request on the qpair's request queue to be
- * processed when a tracker frees up via a command
- * completion or when the controller reset is
- * completed.
- */
- STAILQ_INSERT_TAIL(&qpair->queued_req, req, stailq);
- }
+ /*
+ * No tracker is available, or the qpair is disabled due to an
+ * in-progress controller-level reset. If we lose the race with
+ * recovery_state, then we may add an extra request to the queue which
+ * will be resubmitted later. We only set recovery_state to NONE with
+ * qpair->lock also held, so if we observe that the state is not NONE,
+ * we know it won't transition back to NONE without retrying queued
+	 * we know it won't transition back to NONE without retrying the
+	 * queued requests.
+ if (tr == NULL || qpair->recovery_state != RECOVERY_NONE) {
+ STAILQ_INSERT_TAIL(&qpair->queued_req, req, stailq);
return;
}
@@ -1313,6 +1175,11 @@ _nvme_qpair_submit_request(struct nvme_qpair *qpair, struct nvme_request *req)
return;
}
+ /*
+	 * tr->deadline is updated when nvme_payload_map calls
+ * nvme_qpair_submit_tracker (we call it above directly
+ * when there's no map to load).
+ */
err = bus_dmamap_load_mem(tr->qpair->dma_tag_payload,
tr->payload_dma_map, &req->payload, nvme_payload_map, tr, 0);
if (err != 0) {
@@ -1344,11 +1211,13 @@ nvme_qpair_submit_request(struct nvme_qpair *qpair, struct nvme_request *req)
static void
nvme_qpair_enable(struct nvme_qpair *qpair)
{
+ bool is_admin __unused = qpair == &qpair->ctrlr->adminq;
+
if (mtx_initialized(&qpair->recovery))
mtx_assert(&qpair->recovery, MA_OWNED);
if (mtx_initialized(&qpair->lock))
mtx_assert(&qpair->lock, MA_OWNED);
- KASSERT(!qpair->ctrlr->is_failed,
+ KASSERT(!(is_admin ? qpair->ctrlr->is_failed_admin : qpair->ctrlr->is_failed),
("Enabling a failed qpair\n"));
qpair->recovery_state = RECOVERY_NONE;
@@ -1515,7 +1384,7 @@ nvme_qpair_fail(struct nvme_qpair *qpair)
STAILQ_REMOVE_HEAD(&qpair->queued_req, stailq);
mtx_unlock(&qpair->lock);
nvme_qpair_manual_complete_request(qpair, req, NVME_SCT_GENERIC,
- NVME_SC_ABORTED_BY_REQUEST);
+ NVME_SC_ABORTED_BY_REQUEST, 1, ERROR_PRINT_ALL);
mtx_lock(&qpair->lock);
}
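
To illustrate the reworked timeout path above, here is a hedged userspace sketch (not the driver code) of the two-pass scan: a fast check that looks only at the first tracker with a real deadline, and then, only if that head has expired, a full walk in submission order. The types and names (tracker, head_timed_out, scan_and_recover) are hypothetical; the real driver does this under qpair->lock with sbintime_t deadlines and polls completions between the two passes.

#include <stdbool.h>
#include <stdio.h>
#include <time.h>

struct tracker {
	time_t deadline;	/* (time_t)-1 models SBT_MAX: no deadline */
	struct tracker *next;
};

/* Pass 1: look only at the first tracker with a real deadline. */
static bool
head_timed_out(const struct tracker *head, time_t now)
{
	for (const struct tracker *tr = head; tr != NULL; tr = tr->next) {
		if (tr->deadline == (time_t)-1)
			continue;	/* async command, never times out */
		return (now > tr->deadline);
	}
	return (false);		/* queue idle: nothing can have timed out */
}

/* Pass 2: walk in submission order; stop at the first live tracker. */
static void
scan_and_recover(const struct tracker *head, time_t now)
{
	for (const struct tracker *tr = head; tr != NULL; tr = tr->next) {
		if (tr->deadline == (time_t)-1)
			continue;
		if (now <= tr->deadline)
			break;	/* later trackers expire no earlier */
		printf("tracker %p expired: abort or reset\n",
		    (const void *)tr);
	}
}

int
main(void)
{
	struct tracker late = { .deadline = time(NULL) + 30, .next = NULL };
	struct tracker expired = { .deadline = time(NULL) - 1, .next = &late };
	time_t now = time(NULL);

	if (head_timed_out(&expired, now))
		scan_and_recover(&expired, now);  /* driver polls completions first */
	return (0);
}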
diff --git a/sys/dev/nvme/nvme_sim.c b/sys/dev/nvme/nvme_sim.c
index f561756f99b7..4974bb718222 100644
--- a/sys/dev/nvme/nvme_sim.c
+++ b/sys/dev/nvme/nvme_sim.c
@@ -96,15 +96,16 @@ nvme_sim_nvmeio(struct cam_sim *sim, union ccb *ccb)
/* SG LIST ??? */
if ((nvmeio->ccb_h.flags & CAM_DATA_MASK) == CAM_DATA_BIO)
req = nvme_allocate_request_bio((struct bio *)payload,
- nvme_sim_nvmeio_done, ccb);
+ M_NOWAIT, nvme_sim_nvmeio_done, ccb);
else if ((nvmeio->ccb_h.flags & CAM_DATA_SG) == CAM_DATA_SG)
- req = nvme_allocate_request_ccb(ccb, nvme_sim_nvmeio_done, ccb);
+ req = nvme_allocate_request_ccb(ccb, M_NOWAIT,
+ nvme_sim_nvmeio_done, ccb);
else if (payload == NULL)
- req = nvme_allocate_request_null(nvme_sim_nvmeio_done, ccb);
+ req = nvme_allocate_request_null(M_NOWAIT, nvme_sim_nvmeio_done,
+ ccb);
else
- req = nvme_allocate_request_vaddr(payload, size,
+ req = nvme_allocate_request_vaddr(payload, size, M_NOWAIT,
nvme_sim_nvmeio_done, ccb);
-
if (req == NULL) {
nvmeio->ccb_h.status = CAM_RESRC_UNAVAIL;
xpt_done(ccb);
@@ -203,7 +204,7 @@ nvme_sim_action(struct cam_sim *sim, union ccb *ccb)
cpi->xport_specific.nvme.slot = pci_get_slot(dev);
cpi->xport_specific.nvme.function = pci_get_function(dev);
cpi->xport_specific.nvme.extra = 0;
- strncpy(cpi->xport_specific.nvme.dev_name, device_get_nameunit(dev),
+ strlcpy(cpi->xport_specific.nvme.dev_name, device_get_nameunit(dev),
sizeof(cpi->xport_specific.nvme.dev_name));
cpi->hba_vendor = pci_get_vendor(dev);
cpi->hba_device = pci_get_device(dev);
@@ -268,7 +269,6 @@ nvme_sim_action(struct cam_sim *sim, union ccb *ccb)
ccb->ccb_h.status = CAM_REQ_CMP;
break;
case XPT_NVME_IO: /* Execute the requested I/O operation */
- case XPT_NVME_ADMIN: /* or Admin operation */
if (ctrlr->is_failed) {
/*
* I/O came in while we were failing the drive, so drop
@@ -279,6 +279,18 @@ nvme_sim_action(struct cam_sim *sim, union ccb *ccb)
}
nvme_sim_nvmeio(sim, ccb);
return; /* no done */
+ case XPT_NVME_ADMIN: /* or Admin operation */
+ if (ctrlr->is_failed_admin) {
+ /*
+ * Admin request came in when we can't send admin
+			 * commands, so drop it. Once failure is complete, we'll
+ * be destroyed.
+ */
+ ccb->ccb_h.status = CAM_DEV_NOT_THERE;
+ break;
+ }
+ nvme_sim_nvmeio(sim, ccb);
+ return; /* no done */
default:
ccb->ccb_h.status = CAM_REQ_INVALID;
break;
diff --git a/sys/dev/nvme/nvme_sysctl.c b/sys/dev/nvme/nvme_sysctl.c
index d6452a2e5492..a5a44721f9f9 100644
--- a/sys/dev/nvme/nvme_sysctl.c
+++ b/sys/dev/nvme/nvme_sysctl.c
@@ -30,6 +30,7 @@
#include "opt_nvme.h"
#include <sys/param.h>
+#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/sysctl.h>
@@ -175,8 +176,10 @@ nvme_sysctl_num_cmds(SYSCTL_HANDLER_ARGS)
num_cmds = ctrlr->adminq.num_cmds;
- for (i = 0; i < ctrlr->num_io_queues; i++)
- num_cmds += ctrlr->ioq[i].num_cmds;
+ if (ctrlr->ioq != NULL) {
+ for (i = 0; i < ctrlr->num_io_queues; i++)
+ num_cmds += ctrlr->ioq[i].num_cmds;
+ }
return (sysctl_handle_64(oidp, &num_cmds, 0, req));
}
@@ -190,8 +193,10 @@ nvme_sysctl_num_intr_handler_calls(SYSCTL_HANDLER_ARGS)
num_intr_handler_calls = ctrlr->adminq.num_intr_handler_calls;
- for (i = 0; i < ctrlr->num_io_queues; i++)
- num_intr_handler_calls += ctrlr->ioq[i].num_intr_handler_calls;
+ if (ctrlr->ioq != NULL) {
+ for (i = 0; i < ctrlr->num_io_queues; i++)
+ num_intr_handler_calls += ctrlr->ioq[i].num_intr_handler_calls;
+ }
return (sysctl_handle_64(oidp, &num_intr_handler_calls, 0, req));
}
@@ -205,8 +210,10 @@ nvme_sysctl_num_retries(SYSCTL_HANDLER_ARGS)
num_retries = ctrlr->adminq.num_retries;
- for (i = 0; i < ctrlr->num_io_queues; i++)
- num_retries += ctrlr->ioq[i].num_retries;
+ if (ctrlr->ioq != NULL) {
+ for (i = 0; i < ctrlr->num_io_queues; i++)
+ num_retries += ctrlr->ioq[i].num_retries;
+ }
return (sysctl_handle_64(oidp, &num_retries, 0, req));
}
@@ -220,8 +227,10 @@ nvme_sysctl_num_failures(SYSCTL_HANDLER_ARGS)
num_failures = ctrlr->adminq.num_failures;
- for (i = 0; i < ctrlr->num_io_queues; i++)
- num_failures += ctrlr->ioq[i].num_failures;
+ if (ctrlr->ioq != NULL) {
+ for (i = 0; i < ctrlr->num_io_queues; i++)
+ num_failures += ctrlr->ioq[i].num_failures;
+ }
return (sysctl_handle_64(oidp, &num_failures, 0, req));
}
@@ -235,8 +244,10 @@ nvme_sysctl_num_ignored(SYSCTL_HANDLER_ARGS)
num_ignored = ctrlr->adminq.num_ignored;
- for (i = 0; i < ctrlr->num_io_queues; i++)
- num_ignored += ctrlr->ioq[i].num_ignored;
+ if (ctrlr->ioq != NULL) {
+ for (i = 0; i < ctrlr->num_io_queues; i++)
+ num_ignored += ctrlr->ioq[i].num_ignored;
+ }
return (sysctl_handle_64(oidp, &num_ignored, 0, req));
}
@@ -250,8 +261,10 @@ nvme_sysctl_num_recovery_nolock(SYSCTL_HANDLER_ARGS)
num = ctrlr->adminq.num_recovery_nolock;
- for (i = 0; i < ctrlr->num_io_queues; i++)
- num += ctrlr->ioq[i].num_recovery_nolock;
+ if (ctrlr->ioq != NULL) {
+ for (i = 0; i < ctrlr->num_io_queues; i++)
+ num += ctrlr->ioq[i].num_recovery_nolock;
+ }
return (sysctl_handle_64(oidp, &num, 0, req));
}
@@ -270,8 +283,10 @@ nvme_sysctl_reset_stats(SYSCTL_HANDLER_ARGS)
if (val != 0) {
nvme_qpair_reset_stats(&ctrlr->adminq);
- for (i = 0; i < ctrlr->num_io_queues; i++)
- nvme_qpair_reset_stats(&ctrlr->ioq[i]);
+ if (ctrlr->ioq != NULL) {
+ for (i = 0; i < ctrlr->num_io_queues; i++)
+ nvme_qpair_reset_stats(&ctrlr->ioq[i]);
+ }
}
return (0);
@@ -318,6 +333,10 @@ nvme_sysctl_initialize_queue(struct nvme_qpair *qpair,
CTLFLAG_RD, &qpair->num_recovery_nolock,
"Number of times that we failed to lock recovery in the ISR");
+ SYSCTL_ADD_UINT(ctrlr_ctx, que_list, OID_AUTO, "recovery",
+ CTLFLAG_RW, &qpair->recovery_state, 0,
+ "Current recovery state of the queue");
+
SYSCTL_ADD_PROC(ctrlr_ctx, que_list, OID_AUTO,
"dump_debug", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE,
qpair, 0, nvme_sysctl_dump_debug, "IU", "Dump debug data");
@@ -327,8 +346,8 @@ void
nvme_sysctl_initialize_ctrlr(struct nvme_controller *ctrlr)
{
struct sysctl_ctx_list *ctrlr_ctx;
- struct sysctl_oid *ctrlr_tree, *que_tree;
- struct sysctl_oid_list *ctrlr_list;
+ struct sysctl_oid *ctrlr_tree, *que_tree, *ioq_tree;
+ struct sysctl_oid_list *ctrlr_list, *ioq_list;
#define QUEUE_NAME_LENGTH 16
char queue_name[QUEUE_NAME_LENGTH];
int i;
@@ -407,16 +426,35 @@ nvme_sysctl_initialize_ctrlr(struct nvme_controller *ctrlr)
CTLFLAG_RD, &ctrlr->cap_hi, 0,
"Hi 32-bits of capacities for the drive");
+ SYSCTL_ADD_UINT(ctrlr_ctx, ctrlr_list, OID_AUTO, "fail_on_reset",
+ CTLFLAG_RD, &ctrlr->fail_on_reset, 0,
+ "Pretend the next reset fails and fail the controller");
+
que_tree = SYSCTL_ADD_NODE(ctrlr_ctx, ctrlr_list, OID_AUTO, "adminq",
CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Admin Queue");
nvme_sysctl_initialize_queue(&ctrlr->adminq, ctrlr_ctx, que_tree);
- for (i = 0; i < ctrlr->num_io_queues; i++) {
- snprintf(queue_name, QUEUE_NAME_LENGTH, "ioq%d", i);
- que_tree = SYSCTL_ADD_NODE(ctrlr_ctx, ctrlr_list, OID_AUTO,
- queue_name, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "IO Queue");
- nvme_sysctl_initialize_queue(&ctrlr->ioq[i], ctrlr_ctx,
- que_tree);
+ /*
+ * Make sure that we've constructed the I/O queues before setting up the
+	 * sysctls. Failed controllers won't allocate them, but we still want
+	 * the rest of the sysctls to help diagnose things.
+ */
+ if (ctrlr->ioq != NULL) {
+ ioq_tree = SYSCTL_ADD_NODE(ctrlr_ctx, ctrlr_list, OID_AUTO,
+ "ioq", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "I/O Queues");
+ ioq_list = SYSCTL_CHILDREN(ioq_tree);
+
+ for (i = 0; i < ctrlr->num_io_queues; i++) {
+ snprintf(queue_name, QUEUE_NAME_LENGTH, "%d", i);
+ que_tree = SYSCTL_ADD_NODE(ctrlr_ctx, ioq_list, OID_AUTO,
+ queue_name, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "IO Queue");
+ nvme_sysctl_initialize_queue(&ctrlr->ioq[i], ctrlr_ctx,
+ que_tree);
+ }
}
+
+ SYSCTL_ADD_COUNTER_U64(ctrlr_ctx, ctrlr_list, OID_AUTO, "alignment_splits",
+ CTLFLAG_RD, &ctrlr->alignment_splits,
+ "Number of times we split the I/O alignment for drives with preferred alignment");
}
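
A hedged usage sketch for the new per-queue sysctl layout: with the I/O queues grouped under an "ioq" node, a userland consumer could read one counter as shown below. The OID string dev.nvme.0.ioq.0.num_cmds and the unsigned 64-bit width are assumptions based on the tree built above; adjust the controller unit and queue number as needed.

#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t num_cmds;
	size_t len = sizeof(num_cmds);

	/* Assumed OID path: per-queue counters now live under "ioq.<n>". */
	if (sysctlbyname("dev.nvme.0.ioq.0.num_cmds", &num_cmds, &len,
	    NULL, 0) == -1) {
		perror("sysctlbyname");
		return (1);
	}
	printf("nvme0 ioq0 num_cmds: %ju\n", (uintmax_t)num_cmds);
	return (0);
}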
diff --git a/sys/dev/nvme/nvme_util.c b/sys/dev/nvme/nvme_util.c
index 47d84e5b6957..0a07653a7378 100644
--- a/sys/dev/nvme/nvme_util.c
+++ b/sys/dev/nvme/nvme_util.c
@@ -5,6 +5,8 @@
* Copyright (C) 1997 Justin T. Gibbs
* All rights reserved.
*
+ * Copyright (c) 2023-2025 Chelsio Communications, Inc.
+ *
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
@@ -28,8 +30,243 @@
*/
#include <sys/param.h>
+#include <sys/sbuf.h>
#include <dev/nvme/nvme.h>
+#define OPC_ENTRY(x) [NVME_OPC_ ## x] = #x
+
+static const char *admin_opcode[256] = {
+ OPC_ENTRY(DELETE_IO_SQ),
+ OPC_ENTRY(CREATE_IO_SQ),
+ OPC_ENTRY(GET_LOG_PAGE),
+ OPC_ENTRY(DELETE_IO_CQ),
+ OPC_ENTRY(CREATE_IO_CQ),
+ OPC_ENTRY(IDENTIFY),
+ OPC_ENTRY(ABORT),
+ OPC_ENTRY(SET_FEATURES),
+ OPC_ENTRY(GET_FEATURES),
+ OPC_ENTRY(ASYNC_EVENT_REQUEST),
+ OPC_ENTRY(NAMESPACE_MANAGEMENT),
+ OPC_ENTRY(FIRMWARE_ACTIVATE),
+ OPC_ENTRY(FIRMWARE_IMAGE_DOWNLOAD),
+ OPC_ENTRY(DEVICE_SELF_TEST),
+ OPC_ENTRY(NAMESPACE_ATTACHMENT),
+ OPC_ENTRY(KEEP_ALIVE),
+ OPC_ENTRY(DIRECTIVE_SEND),
+ OPC_ENTRY(DIRECTIVE_RECEIVE),
+ OPC_ENTRY(VIRTUALIZATION_MANAGEMENT),
+ OPC_ENTRY(NVME_MI_SEND),
+ OPC_ENTRY(NVME_MI_RECEIVE),
+ OPC_ENTRY(CAPACITY_MANAGEMENT),
+ OPC_ENTRY(LOCKDOWN),
+ OPC_ENTRY(DOORBELL_BUFFER_CONFIG),
+ OPC_ENTRY(FABRICS_COMMANDS),
+ OPC_ENTRY(FORMAT_NVM),
+ OPC_ENTRY(SECURITY_SEND),
+ OPC_ENTRY(SECURITY_RECEIVE),
+ OPC_ENTRY(SANITIZE),
+ OPC_ENTRY(GET_LBA_STATUS),
+};
+
+static const char *nvm_opcode[256] = {
+ OPC_ENTRY(FLUSH),
+ OPC_ENTRY(WRITE),
+ OPC_ENTRY(READ),
+ OPC_ENTRY(WRITE_UNCORRECTABLE),
+ OPC_ENTRY(COMPARE),
+ OPC_ENTRY(WRITE_ZEROES),
+ OPC_ENTRY(DATASET_MANAGEMENT),
+ OPC_ENTRY(VERIFY),
+ OPC_ENTRY(RESERVATION_REGISTER),
+ OPC_ENTRY(RESERVATION_REPORT),
+ OPC_ENTRY(RESERVATION_ACQUIRE),
+ OPC_ENTRY(RESERVATION_RELEASE),
+ OPC_ENTRY(COPY),
+};
+
+#define SC_ENTRY(x) [NVME_SC_ ## x] = #x
+
+static const char *generic_status[256] = {
+ SC_ENTRY(SUCCESS),
+ SC_ENTRY(INVALID_OPCODE),
+ SC_ENTRY(INVALID_FIELD),
+ SC_ENTRY(COMMAND_ID_CONFLICT),
+ SC_ENTRY(DATA_TRANSFER_ERROR),
+ SC_ENTRY(ABORTED_POWER_LOSS),
+ SC_ENTRY(INTERNAL_DEVICE_ERROR),
+ SC_ENTRY(ABORTED_BY_REQUEST),
+ SC_ENTRY(ABORTED_SQ_DELETION),
+ SC_ENTRY(ABORTED_FAILED_FUSED),
+ SC_ENTRY(ABORTED_MISSING_FUSED),
+ SC_ENTRY(INVALID_NAMESPACE_OR_FORMAT),
+ SC_ENTRY(COMMAND_SEQUENCE_ERROR),
+ SC_ENTRY(INVALID_SGL_SEGMENT_DESCR),
+ SC_ENTRY(INVALID_NUMBER_OF_SGL_DESCR),
+ SC_ENTRY(DATA_SGL_LENGTH_INVALID),
+ SC_ENTRY(METADATA_SGL_LENGTH_INVALID),
+ SC_ENTRY(SGL_DESCRIPTOR_TYPE_INVALID),
+ SC_ENTRY(INVALID_USE_OF_CMB),
+ SC_ENTRY(PRP_OFFET_INVALID),
+ SC_ENTRY(ATOMIC_WRITE_UNIT_EXCEEDED),
+ SC_ENTRY(OPERATION_DENIED),
+ SC_ENTRY(SGL_OFFSET_INVALID),
+ SC_ENTRY(HOST_ID_INCONSISTENT_FORMAT),
+ SC_ENTRY(KEEP_ALIVE_TIMEOUT_EXPIRED),
+ SC_ENTRY(KEEP_ALIVE_TIMEOUT_INVALID),
+ SC_ENTRY(ABORTED_DUE_TO_PREEMPT),
+ SC_ENTRY(SANITIZE_FAILED),
+ SC_ENTRY(SANITIZE_IN_PROGRESS),
+ SC_ENTRY(SGL_DATA_BLOCK_GRAN_INVALID),
+ SC_ENTRY(NOT_SUPPORTED_IN_CMB),
+ SC_ENTRY(NAMESPACE_IS_WRITE_PROTECTED),
+ SC_ENTRY(COMMAND_INTERRUPTED),
+ SC_ENTRY(TRANSIENT_TRANSPORT_ERROR),
+
+ SC_ENTRY(LBA_OUT_OF_RANGE),
+ SC_ENTRY(CAPACITY_EXCEEDED),
+ SC_ENTRY(NAMESPACE_NOT_READY),
+ SC_ENTRY(RESERVATION_CONFLICT),
+ SC_ENTRY(FORMAT_IN_PROGRESS),
+};
+
+static const char *command_specific_status[256] = {
+ SC_ENTRY(COMPLETION_QUEUE_INVALID),
+ SC_ENTRY(INVALID_QUEUE_IDENTIFIER),
+ SC_ENTRY(MAXIMUM_QUEUE_SIZE_EXCEEDED),
+ SC_ENTRY(ABORT_COMMAND_LIMIT_EXCEEDED),
+ SC_ENTRY(ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED),
+ SC_ENTRY(INVALID_FIRMWARE_SLOT),
+ SC_ENTRY(INVALID_FIRMWARE_IMAGE),
+ SC_ENTRY(INVALID_INTERRUPT_VECTOR),
+ SC_ENTRY(INVALID_LOG_PAGE),
+ SC_ENTRY(INVALID_FORMAT),
+ SC_ENTRY(FIRMWARE_REQUIRES_RESET),
+ SC_ENTRY(INVALID_QUEUE_DELETION),
+ SC_ENTRY(FEATURE_NOT_SAVEABLE),
+ SC_ENTRY(FEATURE_NOT_CHANGEABLE),
+ SC_ENTRY(FEATURE_NOT_NS_SPECIFIC),
+ SC_ENTRY(FW_ACT_REQUIRES_NVMS_RESET),
+ SC_ENTRY(FW_ACT_REQUIRES_RESET),
+ SC_ENTRY(FW_ACT_REQUIRES_TIME),
+ SC_ENTRY(FW_ACT_PROHIBITED),
+ SC_ENTRY(OVERLAPPING_RANGE),
+ SC_ENTRY(NS_INSUFFICIENT_CAPACITY),
+ SC_ENTRY(NS_ID_UNAVAILABLE),
+ SC_ENTRY(NS_ALREADY_ATTACHED),
+ SC_ENTRY(NS_IS_PRIVATE),
+ SC_ENTRY(NS_NOT_ATTACHED),
+ SC_ENTRY(THIN_PROV_NOT_SUPPORTED),
+ SC_ENTRY(CTRLR_LIST_INVALID),
+ SC_ENTRY(SELF_TEST_IN_PROGRESS),
+ SC_ENTRY(BOOT_PART_WRITE_PROHIB),
+ SC_ENTRY(INVALID_CTRLR_ID),
+ SC_ENTRY(INVALID_SEC_CTRLR_STATE),
+ SC_ENTRY(INVALID_NUM_OF_CTRLR_RESRC),
+ SC_ENTRY(INVALID_RESOURCE_ID),
+ SC_ENTRY(SANITIZE_PROHIBITED_WPMRE),
+ SC_ENTRY(ANA_GROUP_ID_INVALID),
+ SC_ENTRY(ANA_ATTACH_FAILED),
+
+ SC_ENTRY(CONFLICTING_ATTRIBUTES),
+ SC_ENTRY(INVALID_PROTECTION_INFO),
+ SC_ENTRY(ATTEMPTED_WRITE_TO_RO_PAGE),
+};
+
+static const char *media_error_status[256] = {
+ SC_ENTRY(WRITE_FAULTS),
+ SC_ENTRY(UNRECOVERED_READ_ERROR),
+ SC_ENTRY(GUARD_CHECK_ERROR),
+ SC_ENTRY(APPLICATION_TAG_CHECK_ERROR),
+ SC_ENTRY(REFERENCE_TAG_CHECK_ERROR),
+ SC_ENTRY(COMPARE_FAILURE),
+ SC_ENTRY(ACCESS_DENIED),
+ SC_ENTRY(DEALLOCATED_OR_UNWRITTEN),
+};
+
+static const char *path_related_status[256] = {
+ SC_ENTRY(INTERNAL_PATH_ERROR),
+ SC_ENTRY(ASYMMETRIC_ACCESS_PERSISTENT_LOSS),
+ SC_ENTRY(ASYMMETRIC_ACCESS_INACCESSIBLE),
+ SC_ENTRY(ASYMMETRIC_ACCESS_TRANSITION),
+ SC_ENTRY(CONTROLLER_PATHING_ERROR),
+ SC_ENTRY(HOST_PATHING_ERROR),
+ SC_ENTRY(COMMAND_ABORTED_BY_HOST),
+};
+
+void
+nvme_opcode_sbuf(bool admin, uint8_t opc, struct sbuf *sb)
+{
+ const char *s, *type;
+
+ if (admin) {
+ s = admin_opcode[opc];
+ type = "ADMIN";
+ } else {
+ s = nvm_opcode[opc];
+ type = "NVM";
+ }
+ if (s == NULL)
+ sbuf_printf(sb, "%s (%02x)", type, opc);
+ else
+ sbuf_printf(sb, "%s", s);
+}
+
+void
+nvme_sc_sbuf(const struct nvme_completion *cpl, struct sbuf *sb)
+{
+ const char *s, *type;
+ uint16_t status;
+
+ status = le16toh(cpl->status);
+ switch (NVME_STATUS_GET_SCT(status)) {
+ case NVME_SCT_GENERIC:
+ s = generic_status[NVME_STATUS_GET_SC(status)];
+ type = "GENERIC";
+ break;
+ case NVME_SCT_COMMAND_SPECIFIC:
+ s = command_specific_status[NVME_STATUS_GET_SC(status)];
+ type = "COMMAND SPECIFIC";
+ break;
+ case NVME_SCT_MEDIA_ERROR:
+ s = media_error_status[NVME_STATUS_GET_SC(status)];
+ type = "MEDIA ERROR";
+ break;
+ case NVME_SCT_PATH_RELATED:
+ s = path_related_status[NVME_STATUS_GET_SC(status)];
+ type = "PATH RELATED";
+ break;
+ case NVME_SCT_VENDOR_SPECIFIC:
+ s = NULL;
+ type = "VENDOR SPECIFIC";
+ break;
+ default:
+ s = NULL;
+ type = NULL;
+ break;
+ }
+
+ if (type == NULL)
+ sbuf_printf(sb, "RESERVED (%02x/%02x)",
+ NVME_STATUS_GET_SCT(status), NVME_STATUS_GET_SC(status));
+ else if (s == NULL)
+ sbuf_printf(sb, "%s (%02x)", type, NVME_STATUS_GET_SC(status));
+ else
+ sbuf_printf(sb, "%s", s);
+}
+
+void
+nvme_cpl_sbuf(const struct nvme_completion *cpl, struct sbuf *sb)
+{
+ uint16_t status;
+
+ status = le16toh(cpl->status);
+ nvme_sc_sbuf(cpl, sb);
+ if (NVME_STATUS_GET_M(status) != 0)
+ sbuf_printf(sb, " M");
+ if (NVME_STATUS_GET_DNR(status) != 0)
+ sbuf_printf(sb, " DNR");
+}
+
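/*
 * Hedged usage sketch (not part of this change): formatting a completion
 * with the helpers above using a fixed-length sbuf(9) on the stack.  The
 * consumer function name and buffer size are hypothetical.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sbuf.h>
#include <dev/nvme/nvme.h>

static void __unused
example_log_cpl(const struct nvme_completion *cpl)
{
	char buf[80];
	struct sbuf sb;

	sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN);
	nvme_cpl_sbuf(cpl, &sb);	/* status text plus M/DNR flags */
	if (sbuf_finish(&sb) == 0)
		printf("nvme completion: %s\n", sbuf_data(&sb));
	sbuf_delete(&sb);
}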
void
nvme_strvis(uint8_t *dst, const uint8_t *src, int dstlen, int srclen)
{