Diffstat (limited to 'sys/dev/nvme')
-rw-r--r--  sys/dev/nvme/nvme.h            |  55
-rw-r--r--  sys/dev/nvme/nvme_ctrlr.c      | 242
-rw-r--r--  sys/dev/nvme/nvme_ctrlr_cmd.c  |  29
-rw-r--r--  sys/dev/nvme/nvme_linux.h      |  58
-rw-r--r--  sys/dev/nvme/nvme_ns.c         |  29
-rw-r--r--  sys/dev/nvme/nvme_ns_cmd.c     |  24
-rw-r--r--  sys/dev/nvme/nvme_private.h    |  39
-rw-r--r--  sys/dev/nvme/nvme_qpair.c      | 597
-rw-r--r--  sys/dev/nvme/nvme_sim.c        |  26
-rw-r--r--  sys/dev/nvme/nvme_sysctl.c     |  82
-rw-r--r--  sys/dev/nvme/nvme_util.c       | 237
11 files changed, 925 insertions, 493 deletions
diff --git a/sys/dev/nvme/nvme.h b/sys/dev/nvme/nvme.h
index a4baade7df5d..17c5cdb4db87 100644
--- a/sys/dev/nvme/nvme.h
+++ b/sys/dev/nvme/nvme.h
@@ -35,11 +35,17 @@
#include <sys/param.h>
#include <sys/endian.h>
+#ifndef _KERNEL
+#include <stdbool.h>
+#endif
+
+struct sbuf;
#define NVME_PASSTHROUGH_CMD _IOWR('n', 0, struct nvme_pt_command)
#define NVME_RESET_CONTROLLER _IO('n', 1)
#define NVME_GET_NSID _IOR('n', 2, struct nvme_get_nsid)
#define NVME_GET_MAX_XFER_SIZE _IOR('n', 3, uint64_t)
+#define NVME_GET_CONTROLLER_DATA _IOR('n', 4, struct nvme_controller_data)
#define NVME_IO_TEST _IOWR('n', 100, struct nvme_io_test)
#define NVME_BIO_TEST _IOWR('n', 101, struct nvme_io_test)
@@ -648,8 +654,16 @@ enum nvme_critical_warning_state {
NVME_CRIT_WARN_ST_PERSISTENT_MEMORY_REGION = 0x20,
};
#define NVME_CRIT_WARN_ST_RESERVED_MASK (0xC0)
-#define NVME_ASYNC_EVENT_NS_ATTRIBUTE (0x100)
-#define NVME_ASYNC_EVENT_FW_ACTIVATE (0x200)
+#define NVME_ASYNC_EVENT_NS_ATTRIBUTE (1U << 8)
+#define NVME_ASYNC_EVENT_FW_ACTIVATE (1U << 9)
+#define NVME_ASYNC_EVENT_TELEMETRY_LOG (1U << 10)
+#define NVME_ASYNC_EVENT_ASYM_NS_ACC (1U << 11)
+#define NVME_ASYNC_EVENT_PRED_LAT_DELTA (1U << 12)
+#define NVME_ASYNC_EVENT_LBA_STATUS (1U << 13)
+#define NVME_ASYNC_EVENT_ENDURANCE_DELTA (1U << 14)
+#define NVME_ASYNC_EVENT_NVM_SHUTDOWN (1U << 15)
+#define NVME_ASYNC_EVENT_ZONE_DELTA (1U << 27)
+#define NVME_ASYNC_EVENT_DISCOVERY_DELTA (1U << 31)
/* slot for current FW */
#define NVME_FIRMWARE_PAGE_AFI_SLOT_SHIFT (0)
@@ -832,7 +846,7 @@ struct nvme_command {
uint32_t cdw13; /* command-specific */
uint32_t cdw14; /* command-specific */
uint32_t cdw15; /* command-specific */
-};
+} __aligned(8);
_Static_assert(sizeof(struct nvme_command) == 16 * 4, "bad size for nvme_command");
@@ -1601,7 +1615,7 @@ struct nvme_health_information_page {
uint32_t ttftmt2;
uint8_t reserved2[280];
-} __packed __aligned(4);
+} __packed __aligned(8);
_Static_assert(sizeof(struct nvme_health_information_page) == 512, "bad size for nvme_health_information_page");
@@ -1652,6 +1666,30 @@ struct nvme_device_self_test_page {
_Static_assert(sizeof(struct nvme_device_self_test_page) == 564,
"bad size for nvme_device_self_test_page");
+/*
+ * Header structure for both host-initiated telemetry (page 7) and
+ * controller-initiated telemetry (page 8).
+ */
+struct nvme_telemetry_log_page {
+ uint8_t identifier;
+ uint8_t rsvd[4];
+ uint8_t oui[3];
+ uint16_t da1_last;
+ uint16_t da2_last;
+ uint16_t da3_last;
+ uint8_t rsvd2[2];
+ uint32_t da4_last;
+ uint8_t rsvd3[361];
+ uint8_t hi_gen;
+ uint8_t ci_avail;
+ uint8_t ci_gen;
+ uint8_t reason[128];
+ /* Blocks of telemetry data follow */
+} __packed __aligned(4);
+
+_Static_assert(sizeof(struct nvme_telemetry_log_page) == 512,
+ "bad size for nvme_telemetry_log");
+
struct nvme_discovery_log_entry {
uint8_t trtype;
uint8_t adrfam;
@@ -1868,6 +1906,9 @@ struct nvme_hmb_desc {
#define nvme_completion_is_error(cpl) \
(NVME_STATUS_GET_SC((cpl)->status) != 0 || NVME_STATUS_GET_SCT((cpl)->status) != 0)
+void nvme_cpl_sbuf(const struct nvme_completion *cpl, struct sbuf *sbuf);
+void nvme_opcode_sbuf(bool admin, uint8_t opc, struct sbuf *sb);
+void nvme_sc_sbuf(const struct nvme_completion *cpl, struct sbuf *sbuf);
void nvme_strvis(uint8_t *dst, const uint8_t *src, int dstlen, int srclen);
#ifdef _KERNEL
@@ -1878,6 +1919,7 @@ struct thread;
struct nvme_namespace;
struct nvme_controller;
struct nvme_consumer;
+struct nvme_passthru_cmd;
typedef void (*nvme_cb_fn_t)(void *, const struct nvme_completion *);
@@ -1897,6 +1939,11 @@ int nvme_ctrlr_passthrough_cmd(struct nvme_controller *ctrlr,
uint32_t nsid, int is_user_buffer,
int is_admin_cmd);
+int nvme_ctrlr_linux_passthru_cmd(struct nvme_controller *ctrlr,
+ struct nvme_passthru_cmd *npc,
+ uint32_t nsid, bool is_user,
+ bool is_admin);
+
/* Admin functions */
void nvme_ctrlr_cmd_set_feature(struct nvme_controller *ctrlr,
uint8_t feature, uint32_t cdw11,
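For reference, a minimal userland sketch of the new NVME_GET_CONTROLLER_DATA ioctl added above; the device path, helper name, and error handling are illustrative only and not part of this change:

	#include <sys/ioctl.h>
	#include <dev/nvme/nvme.h>
	#include <err.h>
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	static void
	print_model(const char *dev)		/* e.g. "/dev/nvme0" */
	{
		struct nvme_controller_data cdata;
		int fd;

		if ((fd = open(dev, O_RDWR)) < 0)
			err(1, "open %s", dev);
		/* Copies the driver's cached identify-controller data to userland. */
		if (ioctl(fd, NVME_GET_CONTROLLER_DATA, &cdata) < 0)
			err(1, "NVME_GET_CONTROLLER_DATA");
		printf("model: %.40s\n", (const char *)cdata.mn);
		close(fd);
	}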
diff --git a/sys/dev/nvme/nvme_ctrlr.c b/sys/dev/nvme/nvme_ctrlr.c
index b7b03082c54e..73a7cee4aad0 100644
--- a/sys/dev/nvme/nvme_ctrlr.c
+++ b/sys/dev/nvme/nvme_ctrlr.c
@@ -39,10 +39,11 @@
#include <sys/uio.h>
#include <sys/sbuf.h>
#include <sys/endian.h>
-#include <machine/stdarg.h>
+#include <sys/stdarg.h>
#include <vm/vm.h>
#include "nvme_private.h"
+#include "nvme_linux.h"
#define B4_CHK_RDY_DELAY_MS 2300 /* work around controller bug */
@@ -231,7 +232,7 @@ nvme_ctrlr_construct_io_qpairs(struct nvme_controller *ctrlr)
}
static void
-nvme_ctrlr_fail(struct nvme_controller *ctrlr)
+nvme_ctrlr_fail(struct nvme_controller *ctrlr, bool admin_also)
{
int i;
@@ -241,7 +242,10 @@ nvme_ctrlr_fail(struct nvme_controller *ctrlr)
* a different error, though when we fail, that hardly matters).
*/
ctrlr->is_failed = true;
- nvme_qpair_fail(&ctrlr->adminq);
+ if (admin_also) {
+ ctrlr->is_failed_admin = true;
+ nvme_qpair_fail(&ctrlr->adminq);
+ }
if (ctrlr->ioq != NULL) {
for (i = 0; i < ctrlr->num_io_queues; i++) {
nvme_qpair_fail(&ctrlr->ioq[i]);
@@ -414,6 +418,7 @@ nvme_ctrlr_hw_reset(struct nvme_controller *ctrlr)
TSENTER();
+ ctrlr->is_failed_admin = true;
nvme_ctrlr_disable_qpairs(ctrlr);
err = nvme_ctrlr_disable(ctrlr);
@@ -422,6 +427,8 @@ nvme_ctrlr_hw_reset(struct nvme_controller *ctrlr)
err = nvme_ctrlr_enable(ctrlr);
out:
+ if (err == 0)
+ ctrlr->is_failed_admin = false;
TSEXIT();
return (err);
@@ -434,11 +441,10 @@ nvme_ctrlr_reset(struct nvme_controller *ctrlr)
cmpset = atomic_cmpset_32(&ctrlr->is_resetting, 0, 1);
- if (cmpset == 0 || ctrlr->is_failed)
+ if (cmpset == 0)
/*
- * Controller is already resetting or has failed. Return
- * immediately since there is no need to kick off another
- * reset in these cases.
+ * Controller is already resetting. Return immediately since
+ * there is no need to kick off another reset.
*/
return;
@@ -814,7 +820,13 @@ nvme_ctrlr_construct_and_submit_aer(struct nvme_controller *ctrlr,
struct nvme_request *req;
aer->ctrlr = ctrlr;
- req = nvme_allocate_request_null(nvme_ctrlr_async_event_cb, aer);
+ /*
+ * XXX-MJ this should be M_WAITOK but we might be in a non-sleepable
+ * callback context. AER completions should be handled on a dedicated
+ * thread.
+ */
+ req = nvme_allocate_request_null(M_NOWAIT, nvme_ctrlr_async_event_cb,
+ aer);
aer->req = req;
/*
@@ -1089,7 +1101,7 @@ nvme_ctrlr_start(void *ctrlr_arg, bool resetting)
return;
if (resetting && nvme_ctrlr_identify(ctrlr) != 0) {
- nvme_ctrlr_fail(ctrlr);
+ nvme_ctrlr_fail(ctrlr, false);
return;
}
@@ -1104,7 +1116,7 @@ nvme_ctrlr_start(void *ctrlr_arg, bool resetting)
if (resetting) {
old_num_io_queues = ctrlr->num_io_queues;
if (nvme_ctrlr_set_num_qpairs(ctrlr) != 0) {
- nvme_ctrlr_fail(ctrlr);
+ nvme_ctrlr_fail(ctrlr, false);
return;
}
@@ -1122,12 +1134,12 @@ nvme_ctrlr_start(void *ctrlr_arg, bool resetting)
nvme_ctrlr_hmb_enable(ctrlr, true, true);
if (nvme_ctrlr_create_qpairs(ctrlr) != 0) {
- nvme_ctrlr_fail(ctrlr);
+ nvme_ctrlr_fail(ctrlr, false);
return;
}
if (nvme_ctrlr_construct_namespaces(ctrlr) != 0) {
- nvme_ctrlr_fail(ctrlr);
+ nvme_ctrlr_fail(ctrlr, false);
return;
}
@@ -1146,9 +1158,8 @@ nvme_ctrlr_start_config_hook(void *arg)
TSENTER();
- if (nvme_ctrlr_hw_reset(ctrlr) != 0) {
-fail:
- nvme_ctrlr_fail(ctrlr);
+ if (nvme_ctrlr_hw_reset(ctrlr) != 0 || ctrlr->fail_on_reset != 0) {
+ nvme_ctrlr_fail(ctrlr, true);
config_intrhook_disestablish(&ctrlr->config_hook);
return;
}
@@ -1161,13 +1172,15 @@ fail:
nvme_ctrlr_construct_io_qpairs(ctrlr) == 0)
nvme_ctrlr_start(ctrlr, false);
else
- goto fail;
+ nvme_ctrlr_fail(ctrlr, false);
nvme_sysctl_initialize_ctrlr(ctrlr);
config_intrhook_disestablish(&ctrlr->config_hook);
- ctrlr->is_initialized = 1;
- nvme_notify_new_controller(ctrlr);
+ if (!ctrlr->is_failed) {
+ ctrlr->is_initialized = true;
+ nvme_notify_new_controller(ctrlr);
+ }
TSEXIT();
}
@@ -1184,7 +1197,7 @@ nvme_ctrlr_reset_task(void *arg, int pending)
nvme_ctrlr_start(ctrlr, true);
} else {
nvme_ctrlr_devctl_log(ctrlr, "RESET", "event=\"timed_out\"");
- nvme_ctrlr_fail(ctrlr);
+ nvme_ctrlr_fail(ctrlr, true);
}
atomic_cmpset_32(&ctrlr->is_resetting, 1, 0);
@@ -1258,24 +1271,19 @@ nvme_ctrlr_passthrough_cmd(struct nvme_controller *ctrlr,
return EIO;
}
if (is_user_buffer) {
- /*
- * Ensure the user buffer is wired for the duration of
- * this pass-through command.
- */
- PHOLD(curproc);
buf = uma_zalloc(pbuf_zone, M_WAITOK);
buf->b_iocmd = pt->is_read ? BIO_READ : BIO_WRITE;
if (vmapbuf(buf, pt->buf, pt->len, 1) < 0) {
ret = EFAULT;
goto err;
}
- req = nvme_allocate_request_vaddr(buf->b_data, pt->len,
- nvme_pt_done, pt);
+ req = nvme_allocate_request_vaddr(buf->b_data, pt->len,
+ M_WAITOK, nvme_pt_done, pt);
} else
req = nvme_allocate_request_vaddr(pt->buf, pt->len,
- nvme_pt_done, pt);
+ M_WAITOK, nvme_pt_done, pt);
} else
- req = nvme_allocate_request_null(nvme_pt_done, pt);
+ req = nvme_allocate_request_null(M_WAITOK, nvme_pt_done, pt);
/* Assume user space already converted to little-endian */
req->cmd.opc = pt->cmd.opc;
@@ -1308,7 +1316,104 @@ nvme_ctrlr_passthrough_cmd(struct nvme_controller *ctrlr,
vunmapbuf(buf);
err:
uma_zfree(pbuf_zone, buf);
- PRELE(curproc);
+ }
+
+ return (ret);
+}
+
+static void
+nvme_npc_done(void *arg, const struct nvme_completion *cpl)
+{
+ struct nvme_passthru_cmd *npc = arg;
+ struct mtx *mtx = (void *)(uintptr_t)npc->metadata;
+
+ npc->result = cpl->cdw0; /* cpl in host order by now */
+ mtx_lock(mtx);
+ npc->metadata = 0;
+ wakeup(npc);
+ mtx_unlock(mtx);
+}
+
+/* XXX refactor? */
+
+int
+nvme_ctrlr_linux_passthru_cmd(struct nvme_controller *ctrlr,
+ struct nvme_passthru_cmd *npc, uint32_t nsid, bool is_user, bool is_admin)
+{
+ struct nvme_request *req;
+ struct mtx *mtx;
+ struct buf *buf = NULL;
+ int ret = 0;
+
+ /*
+ * We don't support metadata.
+ */
+ if (npc->metadata != 0 || npc->metadata_len != 0)
+ return (EIO);
+
+ if (npc->data_len > 0 && npc->addr != 0) {
+ if (npc->data_len > ctrlr->max_xfer_size) {
+ nvme_printf(ctrlr,
+ "npc->data_len (%d) exceeds max_xfer_size (%d)\n",
+ npc->data_len, ctrlr->max_xfer_size);
+ return (EIO);
+ }
+ /*
+ * We only support data out or data in commands, but not both at
+		 * once. However, there are some commands with the lower bit
+		 * cleared that are really read commands, so we should also
+		 * filter out (opcode & 3) == 0, but don't.
+ */
+ if ((npc->opcode & 0x3) == 3)
+ return (EINVAL);
+ if (is_user) {
+ buf = uma_zalloc(pbuf_zone, M_WAITOK);
+ buf->b_iocmd = npc->opcode & 1 ? BIO_WRITE : BIO_READ;
+ if (vmapbuf(buf, (void *)(uintptr_t)npc->addr,
+ npc->data_len, 1) < 0) {
+ ret = EFAULT;
+ goto err;
+ }
+ req = nvme_allocate_request_vaddr(buf->b_data,
+ npc->data_len, M_WAITOK, nvme_npc_done, npc);
+ } else
+ req = nvme_allocate_request_vaddr(
+ (void *)(uintptr_t)npc->addr, npc->data_len,
+ M_WAITOK, nvme_npc_done, npc);
+ } else
+ req = nvme_allocate_request_null(M_WAITOK, nvme_npc_done, npc);
+
+ req->cmd.opc = npc->opcode;
+ req->cmd.fuse = npc->flags;
+ req->cmd.rsvd2 = htole16(npc->cdw2);
+ req->cmd.rsvd3 = htole16(npc->cdw3);
+ req->cmd.cdw10 = htole32(npc->cdw10);
+ req->cmd.cdw11 = htole32(npc->cdw11);
+ req->cmd.cdw12 = htole32(npc->cdw12);
+ req->cmd.cdw13 = htole32(npc->cdw13);
+ req->cmd.cdw14 = htole32(npc->cdw14);
+ req->cmd.cdw15 = htole32(npc->cdw15);
+
+ req->cmd.nsid = htole32(nsid);
+
+ mtx = mtx_pool_find(mtxpool_sleep, npc);
+ npc->metadata = (uintptr_t) mtx;
+
+ /* XXX no timeout passed down */
+ if (is_admin)
+ nvme_ctrlr_submit_admin_request(ctrlr, req);
+ else
+ nvme_ctrlr_submit_io_request(ctrlr, req);
+
+ mtx_lock(mtx);
+ while (npc->metadata != 0)
+ mtx_sleep(npc, mtx, PRIBIO, "nvme_npc", 0);
+ mtx_unlock(mtx);
+
+ if (buf != NULL) {
+ vunmapbuf(buf);
+err:
+ uma_zfree(pbuf_zone, buf);
}
return (ret);
@@ -1324,6 +1429,7 @@ nvme_ctrlr_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int flag,
ctrlr = cdev->si_drv1;
switch (cmd) {
+ case NVME_IOCTL_RESET: /* Linux compat */
case NVME_RESET_CONTROLLER:
nvme_ctrlr_reset(ctrlr);
break;
@@ -1334,15 +1440,30 @@ nvme_ctrlr_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int flag,
case NVME_GET_NSID:
{
struct nvme_get_nsid *gnsid = (struct nvme_get_nsid *)arg;
- strncpy(gnsid->cdev, device_get_nameunit(ctrlr->dev),
+ strlcpy(gnsid->cdev, device_get_nameunit(ctrlr->dev),
sizeof(gnsid->cdev));
- gnsid->cdev[sizeof(gnsid->cdev) - 1] = '\0';
gnsid->nsid = 0;
break;
}
case NVME_GET_MAX_XFER_SIZE:
*(uint64_t *)arg = ctrlr->max_xfer_size;
break;
+ case NVME_GET_CONTROLLER_DATA:
+ memcpy(arg, &ctrlr->cdata, sizeof(ctrlr->cdata));
+ break;
+ /* Linux Compatible (see nvme_linux.h) */
+ case NVME_IOCTL_ID:
+ td->td_retval[0] = 0xfffffffful;
+ return (0);
+
+ case NVME_IOCTL_ADMIN_CMD:
+ case NVME_IOCTL_IO_CMD: {
+ struct nvme_passthru_cmd *npc = (struct nvme_passthru_cmd *)arg;
+
+ return (nvme_ctrlr_linux_passthru_cmd(ctrlr, npc, npc->nsid, true,
+ cmd == NVME_IOCTL_ADMIN_CMD));
+ }
+
default:
return (ENOTTY);
}
@@ -1443,6 +1564,8 @@ nvme_ctrlr_construct(struct nvme_controller *ctrlr, device_t dev)
ctrlr->enable_aborts = 0;
TUNABLE_INT_FETCH("hw.nvme.enable_aborts", &ctrlr->enable_aborts);
+ ctrlr->alignment_splits = counter_u64_alloc(M_WAITOK);
+
/* Cap transfers by the maximum addressable by page-sized PRP (4KB pages -> 2MB). */
ctrlr->max_xfer_size = MIN(maxphys, (ctrlr->page_size / 8 * ctrlr->page_size));
if (nvme_ctrlr_construct_admin_qpair(ctrlr) != 0)
@@ -1464,7 +1587,7 @@ nvme_ctrlr_construct(struct nvme_controller *ctrlr, device_t dev)
taskqueue_start_threads(&ctrlr->taskqueue, 2, PI_DISK, "nvme taskq");
ctrlr->is_resetting = 0;
- ctrlr->is_initialized = 0;
+ ctrlr->is_initialized = false;
ctrlr->notification_sent = 0;
TASK_INIT(&ctrlr->reset_task, 0, nvme_ctrlr_reset_task, ctrlr);
STAILQ_INIT(&ctrlr->fail_req);
@@ -1477,18 +1600,25 @@ nvme_ctrlr_construct(struct nvme_controller *ctrlr, device_t dev)
md_args.mda_mode = 0600;
md_args.mda_unit = device_get_unit(dev);
md_args.mda_si_drv1 = (void *)ctrlr;
- status = make_dev_s(&md_args, &ctrlr->cdev, "nvme%d",
- device_get_unit(dev));
+ status = make_dev_s(&md_args, &ctrlr->cdev, "%s",
+ device_get_nameunit(dev));
if (status != 0)
return (ENXIO);
return (0);
}
+/*
+ * Called on detach, or on error during attach. The nvme_controller won't be used
+ * again once we return, so we have to tear everything down (so nothing
+ * references this, no callbacks, etc), but don't need to reset all the state
+ * since nvme_controller will be freed soon.
+ */
void
nvme_ctrlr_destruct(struct nvme_controller *ctrlr, device_t dev)
{
- int gone, i;
+ int i;
+ bool gone;
ctrlr->is_dying = true;
@@ -1498,12 +1628,18 @@ nvme_ctrlr_destruct(struct nvme_controller *ctrlr, device_t dev)
goto noadminq;
/*
- * Check whether it is a hot unplug or a clean driver detach.
- * If device is not there any more, skip any shutdown commands.
+ * Check whether it is a hot unplug or a clean driver detach. If device
+ * is not there any more, skip any shutdown commands. Some hotplug
+ * bridges will return zeros instead of ff's when the device is
+ * departing, so ask the bridge if the device is gone. Some systems can
+	 * remove the drive w/o the bridge knowing it's gone (they don't really
+	 * do hotplug), so fail safe by also treating a read of all ff's (a value
+	 * the hardware can never return) as the device being gone.
*/
- gone = (nvme_mmio_read_4(ctrlr, csts) == NVME_GONE);
+ gone = bus_child_present(dev) == 0 ||
+ (nvme_mmio_read_4(ctrlr, csts) == NVME_GONE);
if (gone)
- nvme_ctrlr_fail(ctrlr);
+ nvme_ctrlr_fail(ctrlr, true);
else
nvme_notify_fail_consumers(ctrlr);
@@ -1529,17 +1665,17 @@ nvme_ctrlr_destruct(struct nvme_controller *ctrlr, device_t dev)
nvme_admin_qpair_destroy(&ctrlr->adminq);
/*
- * Notify the controller of a shutdown, even though this is due to
- * a driver unload, not a system shutdown (this path is not invoked
- * during shutdown). This ensures the controller receives a
- * shutdown notification in case the system is shutdown before
- * reloading the driver.
+ * Notify the controller of a shutdown, even though this is due to a
+	 * driver unload, not a system shutdown (this path is not invoked during
+	 * shutdown). This ensures the controller receives a shutdown
+	 * notification in case the system is shut down before reloading the
+ * driver. Some NVMe drives need this to flush their cache to stable
+ * media and consider it a safe shutdown in SMART stats.
*/
- if (!gone)
+ if (!gone) {
nvme_ctrlr_shutdown(ctrlr);
-
- if (!gone)
nvme_ctrlr_disable(ctrlr);
+ }
noadminq:
if (ctrlr->taskqueue)
@@ -1561,6 +1697,9 @@ noadminq:
ctrlr->resource_id, ctrlr->resource);
nores:
+ if (ctrlr->alignment_splits)
+ counter_u64_free(ctrlr->alignment_splits);
+
mtx_destroy(&ctrlr->lock);
}
@@ -1630,7 +1769,9 @@ nvme_ctrlr_suspend(struct nvme_controller *ctrlr)
int to = hz;
/*
- * Can't touch failed controllers, so it's already suspended.
+ * Can't touch failed controllers, so it's already suspended. User will
+ * need to do an explicit reset to bring it back, if that's even
+ * possible.
*/
if (ctrlr->is_failed)
return (0);
@@ -1684,7 +1825,8 @@ nvme_ctrlr_resume(struct nvme_controller *ctrlr)
/*
* Now that we've reset the hardware, we can restart the controller. Any
* I/O that was pending is requeued. Any admin commands are aborted with
- * an error. Once we've restarted, take the controller out of reset.
+ * an error. Once we've restarted, stop flagging the controller as being
+ * in the reset phase.
*/
nvme_ctrlr_start(ctrlr, true);
(void)atomic_cmpset_32(&ctrlr->is_resetting, 1, 0);
@@ -1697,7 +1839,7 @@ fail:
* itself, due to questionable APIs.
*/
nvme_printf(ctrlr, "Failed to reset on resume, failing.\n");
- nvme_ctrlr_fail(ctrlr);
+ nvme_ctrlr_fail(ctrlr, true);
(void)atomic_cmpset_32(&ctrlr->is_resetting, 1, 0);
return (0);
}
diff --git a/sys/dev/nvme/nvme_ctrlr_cmd.c b/sys/dev/nvme/nvme_ctrlr_cmd.c
index 68934b9b3947..993a7718356d 100644
--- a/sys/dev/nvme/nvme_ctrlr_cmd.c
+++ b/sys/dev/nvme/nvme_ctrlr_cmd.c
@@ -37,7 +37,7 @@ nvme_ctrlr_cmd_identify_controller(struct nvme_controller *ctrlr, void *payload,
struct nvme_command *cmd;
req = nvme_allocate_request_vaddr(payload,
- sizeof(struct nvme_controller_data), cb_fn, cb_arg);
+ sizeof(struct nvme_controller_data), M_WAITOK, cb_fn, cb_arg);
cmd = &req->cmd;
cmd->opc = NVME_OPC_IDENTIFY;
@@ -59,7 +59,7 @@ nvme_ctrlr_cmd_identify_namespace(struct nvme_controller *ctrlr, uint32_t nsid,
struct nvme_command *cmd;
req = nvme_allocate_request_vaddr(payload,
- sizeof(struct nvme_namespace_data), cb_fn, cb_arg);
+ sizeof(struct nvme_namespace_data), M_WAITOK, cb_fn, cb_arg);
cmd = &req->cmd;
cmd->opc = NVME_OPC_IDENTIFY;
@@ -79,7 +79,7 @@ nvme_ctrlr_cmd_create_io_cq(struct nvme_controller *ctrlr,
struct nvme_request *req;
struct nvme_command *cmd;
- req = nvme_allocate_request_null(cb_fn, cb_arg);
+ req = nvme_allocate_request_null(M_WAITOK, cb_fn, cb_arg);
cmd = &req->cmd;
cmd->opc = NVME_OPC_CREATE_IO_CQ;
@@ -103,7 +103,7 @@ nvme_ctrlr_cmd_create_io_sq(struct nvme_controller *ctrlr,
struct nvme_request *req;
struct nvme_command *cmd;
- req = nvme_allocate_request_null(cb_fn, cb_arg);
+ req = nvme_allocate_request_null(M_WAITOK, cb_fn, cb_arg);
cmd = &req->cmd;
cmd->opc = NVME_OPC_CREATE_IO_SQ;
@@ -127,7 +127,7 @@ nvme_ctrlr_cmd_delete_io_cq(struct nvme_controller *ctrlr,
struct nvme_request *req;
struct nvme_command *cmd;
- req = nvme_allocate_request_null(cb_fn, cb_arg);
+ req = nvme_allocate_request_null(M_WAITOK, cb_fn, cb_arg);
cmd = &req->cmd;
cmd->opc = NVME_OPC_DELETE_IO_CQ;
@@ -148,7 +148,7 @@ nvme_ctrlr_cmd_delete_io_sq(struct nvme_controller *ctrlr,
struct nvme_request *req;
struct nvme_command *cmd;
- req = nvme_allocate_request_null(cb_fn, cb_arg);
+ req = nvme_allocate_request_null(M_WAITOK, cb_fn, cb_arg);
cmd = &req->cmd;
cmd->opc = NVME_OPC_DELETE_IO_SQ;
@@ -171,7 +171,7 @@ nvme_ctrlr_cmd_set_feature(struct nvme_controller *ctrlr, uint8_t feature,
struct nvme_request *req;
struct nvme_command *cmd;
- req = nvme_allocate_request_null(cb_fn, cb_arg);
+ req = nvme_allocate_request_null(M_WAITOK, cb_fn, cb_arg);
cmd = &req->cmd;
cmd->opc = NVME_OPC_SET_FEATURES;
@@ -193,7 +193,7 @@ nvme_ctrlr_cmd_get_feature(struct nvme_controller *ctrlr, uint8_t feature,
struct nvme_request *req;
struct nvme_command *cmd;
- req = nvme_allocate_request_null(cb_fn, cb_arg);
+ req = nvme_allocate_request_null(M_WAITOK, cb_fn, cb_arg);
cmd = &req->cmd;
cmd->opc = NVME_OPC_GET_FEATURES;
@@ -259,7 +259,12 @@ nvme_ctrlr_cmd_get_log_page(struct nvme_controller *ctrlr, uint8_t log_page,
struct nvme_request *req;
struct nvme_command *cmd;
- req = nvme_allocate_request_vaddr(payload, payload_size, cb_fn, cb_arg);
+ /*
+ * XXX-MJ this should be M_WAITOK but we might be called from AER
+ * completion processing, which is a non-sleepable context.
+ */
+ req = nvme_allocate_request_vaddr(payload, payload_size,
+ M_NOWAIT, cb_fn, cb_arg);
cmd = &req->cmd;
cmd->opc = NVME_OPC_GET_LOG_PAGE;
@@ -319,7 +324,11 @@ nvme_ctrlr_cmd_abort(struct nvme_controller *ctrlr, uint16_t cid,
struct nvme_request *req;
struct nvme_command *cmd;
- req = nvme_allocate_request_null(cb_fn, cb_arg);
+ /*
+ * XXX-MJ this should be M_WAITOK, we do reset from non-sleepable
+ * context and abort commands as part of that.
+ */
+ req = nvme_allocate_request_null(M_NOWAIT, cb_fn, cb_arg);
cmd = &req->cmd;
cmd->opc = NVME_OPC_ABORT;
diff --git a/sys/dev/nvme/nvme_linux.h b/sys/dev/nvme/nvme_linux.h
new file mode 100644
index 000000000000..aaa68e1d34f8
--- /dev/null
+++ b/sys/dev/nvme/nvme_linux.h
@@ -0,0 +1,58 @@
+/*-
+ * Copyright (c) 2024, Netflix Inc.
+ * Written by Warner Losh
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+/*
+ * Linux compatible NVME ioctls. So far we just support ID, ADMIN_CMD and
+ * IO_CMD. The rest are not supported.
+ */
+
+
+#include <sys/ioccom.h>
+#include <sys/_types.h>
+
+struct nvme_passthru_cmd {
+ __uint8_t opcode;
+ __uint8_t flags;
+ __uint16_t rsvd1;
+ __uint32_t nsid;
+ __uint32_t cdw2;
+ __uint32_t cdw3;
+ __uint64_t metadata;
+ __uint64_t addr;
+ __uint32_t metadata_len;
+ __uint32_t data_len;
+ __uint32_t cdw10;
+ __uint32_t cdw11;
+ __uint32_t cdw12;
+ __uint32_t cdw13;
+ __uint32_t cdw14;
+ __uint32_t cdw15;
+ __uint32_t timeout_ms;
+ __uint32_t result;
+};
+
+#define nvme_admin_cmd nvme_passthru_cmd
+
+/*
+ * Linux nvme ioctls, commented out ones are not supported
+ */
+#define NVME_IOCTL_ID _IO('N', 0x40)
+#define NVME_IOCTL_ADMIN_CMD _IOWR('N', 0x41, struct nvme_admin_cmd)
+/* #define NVME_IOCTL_SUBMIT_IO _IOW('N', 0x42, struct nvme_user_io) */
+#define NVME_IOCTL_IO_CMD _IOWR('N', 0x43, struct nvme_passthru_cmd)
+#define NVME_IOCTL_RESET _IO('N', 0x44)
+/* #define NVME_IOCTL_SUBSYS_RESET _IO('N', 0x45) */
+/* #define NVME_IOCTL_RESCAN _IO('N', 0x46) */
+/* #define NVME_IOCTL_ADMIN64_CMD _IOWR('N', 0x47, struct nvme_passthru_cmd64) */
+/* #define NVME_IOCTL_IO64_CMD _IOWR('N', 0x48, struct nvme_passthru_cmd64) */
+/* #define NVME_IOCTL_IO64_CMD_VEC _IOWR('N', 0x49, struct nvme_passthru_cmd64) */
+
+/* io_uring async commands: */
+/* #define NVME_URING_CMD_IO _IOWR('N', 0x80, struct nvme_uring_cmd) */
+/* #define NVME_URING_CMD_IO_VEC _IOWR('N', 0x81, struct nvme_uring_cmd) */
+/* #define NVME_URING_CMD_ADMIN _IOWR('N', 0x82, struct nvme_uring_cmd) */
+/* #define NVME_URING_CMD_ADMIN_VEC _IOWR('N', 0x83, struct nvme_uring_cmd) */
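For illustration, a minimal sketch of driving these compat ioctls from userland by issuing an Identify Controller admin command. The opcode and CNS value come from the NVMe spec; the device path, include path, and buffer size are assumptions of this example rather than part of the change:

	#include <sys/ioctl.h>
	#include <dev/nvme/nvme_linux.h>
	#include <err.h>
	#include <fcntl.h>
	#include <stdint.h>
	#include <stdlib.h>

	int
	main(void)
	{
		struct nvme_passthru_cmd npc = { 0 };
		void *buf;
		int fd;

		if ((fd = open("/dev/nvme0", O_RDWR)) < 0)
			err(1, "open");
		if ((buf = malloc(4096)) == NULL)
			err(1, "malloc");
		npc.opcode = 0x06;			/* Identify (admin opcode) */
		npc.addr = (uintptr_t)buf;		/* data-in buffer */
		npc.data_len = 4096;
		npc.cdw10 = 1;				/* CNS=1: identify controller */
		/* metadata and metadata_len must stay zero; the driver rejects them. */
		if (ioctl(fd, NVME_IOCTL_ADMIN_CMD, &npc) < 0)
			err(1, "NVME_IOCTL_ADMIN_CMD");
		return (0);
	}

The completion's cdw0 comes back in npc.result; the ioctl sleeps in the driver until the command completes.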
diff --git a/sys/dev/nvme/nvme_ns.c b/sys/dev/nvme/nvme_ns.c
index 360b9f982c20..3f29382fe42f 100644
--- a/sys/dev/nvme/nvme_ns.c
+++ b/sys/dev/nvme/nvme_ns.c
@@ -43,6 +43,7 @@
#include <geom/geom.h>
#include "nvme_private.h"
+#include "nvme_linux.h"
static void nvme_bio_child_inbed(struct bio *parent, int bio_error);
static void nvme_bio_child_done(void *arg,
@@ -82,9 +83,8 @@ nvme_ns_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int flag,
case NVME_GET_NSID:
{
struct nvme_get_nsid *gnsid = (struct nvme_get_nsid *)arg;
- strncpy(gnsid->cdev, device_get_nameunit(ctrlr->dev),
+ strlcpy(gnsid->cdev, device_get_nameunit(ctrlr->dev),
sizeof(gnsid->cdev));
- gnsid->cdev[sizeof(gnsid->cdev) - 1] = '\0';
gnsid->nsid = ns->id;
break;
}
@@ -94,6 +94,18 @@ nvme_ns_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int flag,
case DIOCGSECTORSIZE:
*(u_int *)arg = nvme_ns_get_sector_size(ns);
break;
+ /* Linux Compatible (see nvme_linux.h) */
+ case NVME_IOCTL_ID:
+ td->td_retval[0] = ns->id;
+ return (0);
+
+ case NVME_IOCTL_ADMIN_CMD:
+ case NVME_IOCTL_IO_CMD: {
+ struct nvme_passthru_cmd *npc = (struct nvme_passthru_cmd *)arg;
+
+ return (nvme_ctrlr_linux_passthru_cmd(ctrlr, npc, ns->id, true,
+ cmd == NVME_IOCTL_ADMIN_CMD));
+ }
default:
return (ENOTTY);
}
@@ -429,6 +441,7 @@ nvme_ns_split_bio(struct nvme_namespace *ns, struct bio *bp,
if (child_bios == NULL)
return (ENOMEM);
+ counter_u64_add(ns->ctrlr->alignment_splits, 1);
for (i = 0; i < num_bios; i++) {
child = child_bios[i];
err = nvme_ns_bio_process(ns, child, nvme_bio_child_done);
@@ -604,11 +617,12 @@ nvme_ns_construct(struct nvme_namespace *ns, uint32_t id,
md_args.mda_unit = unit;
md_args.mda_mode = 0600;
md_args.mda_si_drv1 = ns;
- res = make_dev_s(&md_args, &ns->cdev, "nvme%dns%d",
- device_get_unit(ctrlr->dev), ns->id);
+ res = make_dev_s(&md_args, &ns->cdev, "%sn%d",
+ device_get_nameunit(ctrlr->dev), ns->id);
if (res != 0)
return (ENXIO);
-
+ ns->cdev->si_drv2 = make_dev_alias(ns->cdev, "%sns%d",
+ device_get_nameunit(ctrlr->dev), ns->id);
ns->cdev->si_flags |= SI_UNMAPPED;
return (0);
@@ -618,6 +632,9 @@ void
nvme_ns_destruct(struct nvme_namespace *ns)
{
- if (ns->cdev != NULL)
+ if (ns->cdev != NULL) {
+ if (ns->cdev->si_drv2 != NULL)
+ destroy_dev(ns->cdev->si_drv2);
destroy_dev(ns->cdev);
+ }
}
diff --git a/sys/dev/nvme/nvme_ns_cmd.c b/sys/dev/nvme/nvme_ns_cmd.c
index 8cbeac025307..1bad9929cb09 100644
--- a/sys/dev/nvme/nvme_ns_cmd.c
+++ b/sys/dev/nvme/nvme_ns_cmd.c
@@ -36,8 +36,7 @@ nvme_ns_cmd_read(struct nvme_namespace *ns, void *payload, uint64_t lba,
struct nvme_request *req;
req = nvme_allocate_request_vaddr(payload,
- lba_count*nvme_ns_get_sector_size(ns), cb_fn, cb_arg);
-
+ lba_count * nvme_ns_get_sector_size(ns), M_NOWAIT, cb_fn, cb_arg);
if (req == NULL)
return (ENOMEM);
@@ -56,11 +55,9 @@ nvme_ns_cmd_read_bio(struct nvme_namespace *ns, struct bio *bp,
uint64_t lba;
uint64_t lba_count;
- req = nvme_allocate_request_bio(bp, cb_fn, cb_arg);
-
+ req = nvme_allocate_request_bio(bp, M_NOWAIT, cb_fn, cb_arg);
if (req == NULL)
return (ENOMEM);
-
lba = bp->bio_offset / nvme_ns_get_sector_size(ns);
lba_count = bp->bio_bcount / nvme_ns_get_sector_size(ns);
nvme_ns_read_cmd(&req->cmd, ns->id, lba, lba_count);
@@ -77,8 +74,7 @@ nvme_ns_cmd_write(struct nvme_namespace *ns, void *payload, uint64_t lba,
struct nvme_request *req;
req = nvme_allocate_request_vaddr(payload,
- lba_count*nvme_ns_get_sector_size(ns), cb_fn, cb_arg);
-
+ lba_count * nvme_ns_get_sector_size(ns), M_NOWAIT, cb_fn, cb_arg);
if (req == NULL)
return (ENOMEM);
@@ -97,8 +93,7 @@ nvme_ns_cmd_write_bio(struct nvme_namespace *ns, struct bio *bp,
uint64_t lba;
uint64_t lba_count;
- req = nvme_allocate_request_bio(bp, cb_fn, cb_arg);
-
+ req = nvme_allocate_request_bio(bp, M_NOWAIT, cb_fn, cb_arg);
if (req == NULL)
return (ENOMEM);
lba = bp->bio_offset / nvme_ns_get_sector_size(ns);
@@ -118,8 +113,8 @@ nvme_ns_cmd_deallocate(struct nvme_namespace *ns, void *payload,
struct nvme_command *cmd;
req = nvme_allocate_request_vaddr(payload,
- num_ranges * sizeof(struct nvme_dsm_range), cb_fn, cb_arg);
-
+ num_ranges * sizeof(struct nvme_dsm_range), M_NOWAIT, cb_fn,
+ cb_arg);
if (req == NULL)
return (ENOMEM);
@@ -141,8 +136,7 @@ nvme_ns_cmd_flush(struct nvme_namespace *ns, nvme_cb_fn_t cb_fn, void *cb_arg)
{
struct nvme_request *req;
- req = nvme_allocate_request_null(cb_fn, cb_arg);
-
+ req = nvme_allocate_request_null(M_NOWAIT, cb_fn, cb_arg);
if (req == NULL)
return (ENOMEM);
@@ -165,8 +159,8 @@ nvme_ns_dump(struct nvme_namespace *ns, void *virt, off_t offset, size_t len)
int i;
status.done = FALSE;
- req = nvme_allocate_request_vaddr(virt, len, nvme_completion_poll_cb,
- &status);
+ req = nvme_allocate_request_vaddr(virt, len, M_NOWAIT,
+ nvme_completion_poll_cb, &status);
if (req == NULL)
return (ENOMEM);
diff --git a/sys/dev/nvme/nvme_private.h b/sys/dev/nvme/nvme_private.h
index 69141add4e48..949e69ec9290 100644
--- a/sys/dev/nvme/nvme_private.h
+++ b/sys/dev/nvme/nvme_private.h
@@ -32,6 +32,7 @@
#include <sys/param.h>
#include <sys/bio.h>
#include <sys/bus.h>
+#include <sys/counter.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
@@ -297,11 +298,15 @@ struct nvme_controller {
void *cons_cookie[NVME_MAX_CONSUMERS];
uint32_t is_resetting;
- uint32_t is_initialized;
uint32_t notification_sent;
+ u_int fail_on_reset;
bool is_failed;
+ bool is_failed_admin;
bool is_dying;
+ bool isr_warned;
+ bool is_initialized;
+
STAILQ_HEAD(, nvme_request) fail_req;
/* Host Memory Buffer */
@@ -317,6 +322,9 @@ struct nvme_controller {
bus_dmamap_t hmb_desc_map;
struct nvme_hmb_desc *hmb_desc_vaddr;
uint64_t hmb_desc_paddr;
+
+ /* Statistics */
+ counter_u64_t alignment_splits;
};
#define nvme_mmio_offsetof(reg) \
@@ -413,9 +421,6 @@ void nvme_qpair_submit_request(struct nvme_qpair *qpair,
struct nvme_request *req);
void nvme_qpair_reset(struct nvme_qpair *qpair);
void nvme_qpair_fail(struct nvme_qpair *qpair);
-void nvme_qpair_manual_complete_request(struct nvme_qpair *qpair,
- struct nvme_request *req,
- uint32_t sct, uint32_t sc);
void nvme_admin_qpair_enable(struct nvme_qpair *qpair);
void nvme_admin_qpair_disable(struct nvme_qpair *qpair);
@@ -481,11 +486,14 @@ nvme_single_map(void *arg, bus_dma_segment_t *seg, int nseg, int error)
}
static __inline struct nvme_request *
-_nvme_allocate_request(nvme_cb_fn_t cb_fn, void *cb_arg)
+_nvme_allocate_request(const int how, nvme_cb_fn_t cb_fn, void *cb_arg)
{
struct nvme_request *req;
- req = malloc(sizeof(*req), M_NVME, M_NOWAIT | M_ZERO);
+ KASSERT(how == M_WAITOK || how == M_NOWAIT,
+ ("nvme_allocate_request: invalid how %d", how));
+
+ req = malloc(sizeof(*req), M_NVME, how | M_ZERO);
if (req != NULL) {
req->cb_fn = cb_fn;
req->cb_arg = cb_arg;
@@ -496,11 +504,11 @@ _nvme_allocate_request(nvme_cb_fn_t cb_fn, void *cb_arg)
static __inline struct nvme_request *
nvme_allocate_request_vaddr(void *payload, uint32_t payload_size,
- nvme_cb_fn_t cb_fn, void *cb_arg)
+ const int how, nvme_cb_fn_t cb_fn, void *cb_arg)
{
struct nvme_request *req;
- req = _nvme_allocate_request(cb_fn, cb_arg);
+ req = _nvme_allocate_request(how, cb_fn, cb_arg);
if (req != NULL) {
req->payload = memdesc_vaddr(payload, payload_size);
req->payload_valid = true;
@@ -509,20 +517,21 @@ nvme_allocate_request_vaddr(void *payload, uint32_t payload_size,
}
static __inline struct nvme_request *
-nvme_allocate_request_null(nvme_cb_fn_t cb_fn, void *cb_arg)
+nvme_allocate_request_null(const int how, nvme_cb_fn_t cb_fn, void *cb_arg)
{
struct nvme_request *req;
- req = _nvme_allocate_request(cb_fn, cb_arg);
+ req = _nvme_allocate_request(how, cb_fn, cb_arg);
return (req);
}
static __inline struct nvme_request *
-nvme_allocate_request_bio(struct bio *bio, nvme_cb_fn_t cb_fn, void *cb_arg)
+nvme_allocate_request_bio(struct bio *bio, const int how, nvme_cb_fn_t cb_fn,
+ void *cb_arg)
{
struct nvme_request *req;
- req = _nvme_allocate_request(cb_fn, cb_arg);
+ req = _nvme_allocate_request(how, cb_fn, cb_arg);
if (req != NULL) {
req->payload = memdesc_bio(bio);
req->payload_valid = true;
@@ -531,16 +540,16 @@ nvme_allocate_request_bio(struct bio *bio, nvme_cb_fn_t cb_fn, void *cb_arg)
}
static __inline struct nvme_request *
-nvme_allocate_request_ccb(union ccb *ccb, nvme_cb_fn_t cb_fn, void *cb_arg)
+nvme_allocate_request_ccb(union ccb *ccb, const int how, nvme_cb_fn_t cb_fn,
+ void *cb_arg)
{
struct nvme_request *req;
- req = _nvme_allocate_request(cb_fn, cb_arg);
+ req = _nvme_allocate_request(how, cb_fn, cb_arg);
if (req != NULL) {
req->payload = memdesc_ccb(ccb);
req->payload_valid = true;
}
-
return (req);
}
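With the new `how` argument on the request allocators above, a caller in sleepable context passes M_WAITOK and may skip the NULL check, while a caller that can run from interrupt or timeout context passes M_NOWAIT and must handle allocation failure. A short sketch of both patterns, mirroring the call sites changed elsewhere in this diff:

	/* Sleepable context: an M_WAITOK allocation cannot fail. */
	req = nvme_allocate_request_null(M_WAITOK, cb_fn, cb_arg);
	nvme_ctrlr_submit_admin_request(ctrlr, req);

	/* Possibly non-sleepable context: M_NOWAIT may fail, so check. */
	req = nvme_allocate_request_bio(bp, M_NOWAIT, cb_fn, cb_arg);
	if (req == NULL)
		return (ENOMEM);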
diff --git a/sys/dev/nvme/nvme_qpair.c b/sys/dev/nvme/nvme_qpair.c
index 62d27e439180..bd8626e32209 100644
--- a/sys/dev/nvme/nvme_qpair.c
+++ b/sys/dev/nvme/nvme_qpair.c
@@ -31,6 +31,7 @@
#include <sys/conf.h>
#include <sys/domainset.h>
#include <sys/proc.h>
+#include <sys/sbuf.h>
#include <dev/pci/pcivar.h>
@@ -43,96 +44,36 @@ static void _nvme_qpair_submit_request(struct nvme_qpair *qpair,
struct nvme_request *req);
static void nvme_qpair_destroy(struct nvme_qpair *qpair);
-#define DEFAULT_INDEX 256
-#define DEFAULT_ENTRY(x) [DEFAULT_INDEX] = x
-#define OPC_ENTRY(x) [NVME_OPC_ ## x] = #x
-
-static const char *admin_opcode[DEFAULT_INDEX + 1] = {
- OPC_ENTRY(DELETE_IO_SQ),
- OPC_ENTRY(CREATE_IO_SQ),
- OPC_ENTRY(GET_LOG_PAGE),
- OPC_ENTRY(DELETE_IO_CQ),
- OPC_ENTRY(CREATE_IO_CQ),
- OPC_ENTRY(IDENTIFY),
- OPC_ENTRY(ABORT),
- OPC_ENTRY(SET_FEATURES),
- OPC_ENTRY(GET_FEATURES),
- OPC_ENTRY(ASYNC_EVENT_REQUEST),
- OPC_ENTRY(NAMESPACE_MANAGEMENT),
- OPC_ENTRY(FIRMWARE_ACTIVATE),
- OPC_ENTRY(FIRMWARE_IMAGE_DOWNLOAD),
- OPC_ENTRY(DEVICE_SELF_TEST),
- OPC_ENTRY(NAMESPACE_ATTACHMENT),
- OPC_ENTRY(KEEP_ALIVE),
- OPC_ENTRY(DIRECTIVE_SEND),
- OPC_ENTRY(DIRECTIVE_RECEIVE),
- OPC_ENTRY(VIRTUALIZATION_MANAGEMENT),
- OPC_ENTRY(NVME_MI_SEND),
- OPC_ENTRY(NVME_MI_RECEIVE),
- OPC_ENTRY(CAPACITY_MANAGEMENT),
- OPC_ENTRY(LOCKDOWN),
- OPC_ENTRY(DOORBELL_BUFFER_CONFIG),
- OPC_ENTRY(FABRICS_COMMANDS),
- OPC_ENTRY(FORMAT_NVM),
- OPC_ENTRY(SECURITY_SEND),
- OPC_ENTRY(SECURITY_RECEIVE),
- OPC_ENTRY(SANITIZE),
- OPC_ENTRY(GET_LBA_STATUS),
- DEFAULT_ENTRY("ADMIN COMMAND"),
-};
-
-static const char *io_opcode[DEFAULT_INDEX + 1] = {
- OPC_ENTRY(FLUSH),
- OPC_ENTRY(WRITE),
- OPC_ENTRY(READ),
- OPC_ENTRY(WRITE_UNCORRECTABLE),
- OPC_ENTRY(COMPARE),
- OPC_ENTRY(WRITE_ZEROES),
- OPC_ENTRY(DATASET_MANAGEMENT),
- OPC_ENTRY(VERIFY),
- OPC_ENTRY(RESERVATION_REGISTER),
- OPC_ENTRY(RESERVATION_REPORT),
- OPC_ENTRY(RESERVATION_ACQUIRE),
- OPC_ENTRY(RESERVATION_RELEASE),
- OPC_ENTRY(COPY),
- DEFAULT_ENTRY("IO COMMAND"),
-};
-
-static const char *
-get_opcode_string(const char *op[DEFAULT_INDEX + 1], uint16_t opc)
-{
- const char *nm = opc < DEFAULT_INDEX ? op[opc] : op[DEFAULT_INDEX];
-
- return (nm != NULL ? nm : op[DEFAULT_INDEX]);
-}
-
static const char *
-get_admin_opcode_string(uint16_t opc)
+get_opcode_string(bool admin, uint8_t opc, char *buf, size_t len)
{
- return (get_opcode_string(admin_opcode, opc));
-}
+ struct sbuf sb;
-static const char *
-get_io_opcode_string(uint16_t opc)
-{
- return (get_opcode_string(io_opcode, opc));
+ sbuf_new(&sb, buf, len, SBUF_FIXEDLEN);
+ nvme_opcode_sbuf(admin, opc, &sb);
+ if (sbuf_finish(&sb) != 0)
+ return ("");
+ return (buf);
}
static void
nvme_admin_qpair_print_command(struct nvme_qpair *qpair,
struct nvme_command *cmd)
{
+ char buf[64];
- nvme_printf(qpair->ctrlr, "%s (%02x) sqid:%d cid:%d nsid:%x "
+ nvme_printf(qpair->ctrlr, "%s sqid:%d cid:%d nsid:%x "
"cdw10:%08x cdw11:%08x\n",
- get_admin_opcode_string(cmd->opc), cmd->opc, qpair->id, cmd->cid,
- le32toh(cmd->nsid), le32toh(cmd->cdw10), le32toh(cmd->cdw11));
+ get_opcode_string(true, cmd->opc, buf, sizeof(buf)), qpair->id,
+ cmd->cid, le32toh(cmd->nsid), le32toh(cmd->cdw10),
+ le32toh(cmd->cdw11));
}
static void
nvme_io_qpair_print_command(struct nvme_qpair *qpair,
struct nvme_command *cmd)
{
+ char buf[64];
switch (cmd->opc) {
case NVME_OPC_WRITE:
@@ -143,23 +84,15 @@ nvme_io_qpair_print_command(struct nvme_qpair *qpair,
case NVME_OPC_VERIFY:
nvme_printf(qpair->ctrlr, "%s sqid:%d cid:%d nsid:%d "
"lba:%llu len:%d\n",
- get_io_opcode_string(cmd->opc), qpair->id, cmd->cid, le32toh(cmd->nsid),
+ get_opcode_string(false, cmd->opc, buf, sizeof(buf)),
+ qpair->id, cmd->cid, le32toh(cmd->nsid),
((unsigned long long)le32toh(cmd->cdw11) << 32) + le32toh(cmd->cdw10),
(le32toh(cmd->cdw12) & 0xFFFF) + 1);
break;
- case NVME_OPC_FLUSH:
- case NVME_OPC_DATASET_MANAGEMENT:
- case NVME_OPC_RESERVATION_REGISTER:
- case NVME_OPC_RESERVATION_REPORT:
- case NVME_OPC_RESERVATION_ACQUIRE:
- case NVME_OPC_RESERVATION_RELEASE:
- nvme_printf(qpair->ctrlr, "%s sqid:%d cid:%d nsid:%d\n",
- get_io_opcode_string(cmd->opc), qpair->id, cmd->cid, le32toh(cmd->nsid));
- break;
default:
- nvme_printf(qpair->ctrlr, "%s (%02x) sqid:%d cid:%d nsid:%d\n",
- get_io_opcode_string(cmd->opc), cmd->opc, qpair->id,
- cmd->cid, le32toh(cmd->nsid));
+ nvme_printf(qpair->ctrlr, "%s sqid:%d cid:%d nsid:%d\n",
+ get_opcode_string(false, cmd->opc, buf, sizeof(buf)),
+ qpair->id, cmd->cid, le32toh(cmd->nsid));
break;
}
}
@@ -183,170 +116,33 @@ nvme_qpair_print_command(struct nvme_qpair *qpair, struct nvme_command *cmd)
}
}
-struct nvme_status_string {
- uint16_t sc;
- const char * str;
-};
-
-static struct nvme_status_string generic_status[] = {
- { NVME_SC_SUCCESS, "SUCCESS" },
- { NVME_SC_INVALID_OPCODE, "INVALID OPCODE" },
- { NVME_SC_INVALID_FIELD, "INVALID_FIELD" },
- { NVME_SC_COMMAND_ID_CONFLICT, "COMMAND ID CONFLICT" },
- { NVME_SC_DATA_TRANSFER_ERROR, "DATA TRANSFER ERROR" },
- { NVME_SC_ABORTED_POWER_LOSS, "ABORTED - POWER LOSS" },
- { NVME_SC_INTERNAL_DEVICE_ERROR, "INTERNAL DEVICE ERROR" },
- { NVME_SC_ABORTED_BY_REQUEST, "ABORTED - BY REQUEST" },
- { NVME_SC_ABORTED_SQ_DELETION, "ABORTED - SQ DELETION" },
- { NVME_SC_ABORTED_FAILED_FUSED, "ABORTED - FAILED FUSED" },
- { NVME_SC_ABORTED_MISSING_FUSED, "ABORTED - MISSING FUSED" },
- { NVME_SC_INVALID_NAMESPACE_OR_FORMAT, "INVALID NAMESPACE OR FORMAT" },
- { NVME_SC_COMMAND_SEQUENCE_ERROR, "COMMAND SEQUENCE ERROR" },
- { NVME_SC_INVALID_SGL_SEGMENT_DESCR, "INVALID SGL SEGMENT DESCRIPTOR" },
- { NVME_SC_INVALID_NUMBER_OF_SGL_DESCR, "INVALID NUMBER OF SGL DESCRIPTORS" },
- { NVME_SC_DATA_SGL_LENGTH_INVALID, "DATA SGL LENGTH INVALID" },
- { NVME_SC_METADATA_SGL_LENGTH_INVALID, "METADATA SGL LENGTH INVALID" },
- { NVME_SC_SGL_DESCRIPTOR_TYPE_INVALID, "SGL DESCRIPTOR TYPE INVALID" },
- { NVME_SC_INVALID_USE_OF_CMB, "INVALID USE OF CONTROLLER MEMORY BUFFER" },
- { NVME_SC_PRP_OFFET_INVALID, "PRP OFFET INVALID" },
- { NVME_SC_ATOMIC_WRITE_UNIT_EXCEEDED, "ATOMIC WRITE UNIT EXCEEDED" },
- { NVME_SC_OPERATION_DENIED, "OPERATION DENIED" },
- { NVME_SC_SGL_OFFSET_INVALID, "SGL OFFSET INVALID" },
- { NVME_SC_HOST_ID_INCONSISTENT_FORMAT, "HOST IDENTIFIER INCONSISTENT FORMAT" },
- { NVME_SC_KEEP_ALIVE_TIMEOUT_EXPIRED, "KEEP ALIVE TIMEOUT EXPIRED" },
- { NVME_SC_KEEP_ALIVE_TIMEOUT_INVALID, "KEEP ALIVE TIMEOUT INVALID" },
- { NVME_SC_ABORTED_DUE_TO_PREEMPT, "COMMAND ABORTED DUE TO PREEMPT AND ABORT" },
- { NVME_SC_SANITIZE_FAILED, "SANITIZE FAILED" },
- { NVME_SC_SANITIZE_IN_PROGRESS, "SANITIZE IN PROGRESS" },
- { NVME_SC_SGL_DATA_BLOCK_GRAN_INVALID, "SGL_DATA_BLOCK_GRANULARITY_INVALID" },
- { NVME_SC_NOT_SUPPORTED_IN_CMB, "COMMAND NOT SUPPORTED FOR QUEUE IN CMB" },
- { NVME_SC_NAMESPACE_IS_WRITE_PROTECTED, "NAMESPACE IS WRITE PROTECTED" },
- { NVME_SC_COMMAND_INTERRUPTED, "COMMAND INTERRUPTED" },
- { NVME_SC_TRANSIENT_TRANSPORT_ERROR, "TRANSIENT TRANSPORT ERROR" },
-
- { NVME_SC_LBA_OUT_OF_RANGE, "LBA OUT OF RANGE" },
- { NVME_SC_CAPACITY_EXCEEDED, "CAPACITY EXCEEDED" },
- { NVME_SC_NAMESPACE_NOT_READY, "NAMESPACE NOT READY" },
- { NVME_SC_RESERVATION_CONFLICT, "RESERVATION CONFLICT" },
- { NVME_SC_FORMAT_IN_PROGRESS, "FORMAT IN PROGRESS" },
- { 0xFFFF, "GENERIC" }
-};
-
-static struct nvme_status_string command_specific_status[] = {
- { NVME_SC_COMPLETION_QUEUE_INVALID, "INVALID COMPLETION QUEUE" },
- { NVME_SC_INVALID_QUEUE_IDENTIFIER, "INVALID QUEUE IDENTIFIER" },
- { NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED, "MAX QUEUE SIZE EXCEEDED" },
- { NVME_SC_ABORT_COMMAND_LIMIT_EXCEEDED, "ABORT CMD LIMIT EXCEEDED" },
- { NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED, "ASYNC LIMIT EXCEEDED" },
- { NVME_SC_INVALID_FIRMWARE_SLOT, "INVALID FIRMWARE SLOT" },
- { NVME_SC_INVALID_FIRMWARE_IMAGE, "INVALID FIRMWARE IMAGE" },
- { NVME_SC_INVALID_INTERRUPT_VECTOR, "INVALID INTERRUPT VECTOR" },
- { NVME_SC_INVALID_LOG_PAGE, "INVALID LOG PAGE" },
- { NVME_SC_INVALID_FORMAT, "INVALID FORMAT" },
- { NVME_SC_FIRMWARE_REQUIRES_RESET, "FIRMWARE REQUIRES RESET" },
- { NVME_SC_INVALID_QUEUE_DELETION, "INVALID QUEUE DELETION" },
- { NVME_SC_FEATURE_NOT_SAVEABLE, "FEATURE IDENTIFIER NOT SAVEABLE" },
- { NVME_SC_FEATURE_NOT_CHANGEABLE, "FEATURE NOT CHANGEABLE" },
- { NVME_SC_FEATURE_NOT_NS_SPECIFIC, "FEATURE NOT NAMESPACE SPECIFIC" },
- { NVME_SC_FW_ACT_REQUIRES_NVMS_RESET, "FIRMWARE ACTIVATION REQUIRES NVM SUBSYSTEM RESET" },
- { NVME_SC_FW_ACT_REQUIRES_RESET, "FIRMWARE ACTIVATION REQUIRES RESET" },
- { NVME_SC_FW_ACT_REQUIRES_TIME, "FIRMWARE ACTIVATION REQUIRES MAXIMUM TIME VIOLATION" },
- { NVME_SC_FW_ACT_PROHIBITED, "FIRMWARE ACTIVATION PROHIBITED" },
- { NVME_SC_OVERLAPPING_RANGE, "OVERLAPPING RANGE" },
- { NVME_SC_NS_INSUFFICIENT_CAPACITY, "NAMESPACE INSUFFICIENT CAPACITY" },
- { NVME_SC_NS_ID_UNAVAILABLE, "NAMESPACE IDENTIFIER UNAVAILABLE" },
- { NVME_SC_NS_ALREADY_ATTACHED, "NAMESPACE ALREADY ATTACHED" },
- { NVME_SC_NS_IS_PRIVATE, "NAMESPACE IS PRIVATE" },
- { NVME_SC_NS_NOT_ATTACHED, "NS NOT ATTACHED" },
- { NVME_SC_THIN_PROV_NOT_SUPPORTED, "THIN PROVISIONING NOT SUPPORTED" },
- { NVME_SC_CTRLR_LIST_INVALID, "CONTROLLER LIST INVALID" },
- { NVME_SC_SELF_TEST_IN_PROGRESS, "DEVICE SELF-TEST IN PROGRESS" },
- { NVME_SC_BOOT_PART_WRITE_PROHIB, "BOOT PARTITION WRITE PROHIBITED" },
- { NVME_SC_INVALID_CTRLR_ID, "INVALID CONTROLLER IDENTIFIER" },
- { NVME_SC_INVALID_SEC_CTRLR_STATE, "INVALID SECONDARY CONTROLLER STATE" },
- { NVME_SC_INVALID_NUM_OF_CTRLR_RESRC, "INVALID NUMBER OF CONTROLLER RESOURCES" },
- { NVME_SC_INVALID_RESOURCE_ID, "INVALID RESOURCE IDENTIFIER" },
- { NVME_SC_SANITIZE_PROHIBITED_WPMRE, "SANITIZE PROHIBITED WRITE PERSISTENT MEMORY REGION ENABLED" },
- { NVME_SC_ANA_GROUP_ID_INVALID, "ANA GROUP IDENTIFIED INVALID" },
- { NVME_SC_ANA_ATTACH_FAILED, "ANA ATTACH FAILED" },
-
- { NVME_SC_CONFLICTING_ATTRIBUTES, "CONFLICTING ATTRIBUTES" },
- { NVME_SC_INVALID_PROTECTION_INFO, "INVALID PROTECTION INFO" },
- { NVME_SC_ATTEMPTED_WRITE_TO_RO_PAGE, "WRITE TO RO PAGE" },
- { 0xFFFF, "COMMAND SPECIFIC" }
-};
-
-static struct nvme_status_string media_error_status[] = {
- { NVME_SC_WRITE_FAULTS, "WRITE FAULTS" },
- { NVME_SC_UNRECOVERED_READ_ERROR, "UNRECOVERED READ ERROR" },
- { NVME_SC_GUARD_CHECK_ERROR, "GUARD CHECK ERROR" },
- { NVME_SC_APPLICATION_TAG_CHECK_ERROR, "APPLICATION TAG CHECK ERROR" },
- { NVME_SC_REFERENCE_TAG_CHECK_ERROR, "REFERENCE TAG CHECK ERROR" },
- { NVME_SC_COMPARE_FAILURE, "COMPARE FAILURE" },
- { NVME_SC_ACCESS_DENIED, "ACCESS DENIED" },
- { NVME_SC_DEALLOCATED_OR_UNWRITTEN, "DEALLOCATED OR UNWRITTEN LOGICAL BLOCK" },
- { 0xFFFF, "MEDIA ERROR" }
-};
-
-static struct nvme_status_string path_related_status[] = {
- { NVME_SC_INTERNAL_PATH_ERROR, "INTERNAL PATH ERROR" },
- { NVME_SC_ASYMMETRIC_ACCESS_PERSISTENT_LOSS, "ASYMMETRIC ACCESS PERSISTENT LOSS" },
- { NVME_SC_ASYMMETRIC_ACCESS_INACCESSIBLE, "ASYMMETRIC ACCESS INACCESSIBLE" },
- { NVME_SC_ASYMMETRIC_ACCESS_TRANSITION, "ASYMMETRIC ACCESS TRANSITION" },
- { NVME_SC_CONTROLLER_PATHING_ERROR, "CONTROLLER PATHING ERROR" },
- { NVME_SC_HOST_PATHING_ERROR, "HOST PATHING ERROR" },
- { NVME_SC_COMMAND_ABORTED_BY_HOST, "COMMAND ABORTED BY HOST" },
- { 0xFFFF, "PATH RELATED" },
-};
-
static const char *
-get_status_string(uint16_t sct, uint16_t sc)
+get_status_string(const struct nvme_completion *cpl, char *buf, size_t len)
{
- struct nvme_status_string *entry;
+ struct sbuf sb;
- switch (sct) {
- case NVME_SCT_GENERIC:
- entry = generic_status;
- break;
- case NVME_SCT_COMMAND_SPECIFIC:
- entry = command_specific_status;
- break;
- case NVME_SCT_MEDIA_ERROR:
- entry = media_error_status;
- break;
- case NVME_SCT_PATH_RELATED:
- entry = path_related_status;
- break;
- case NVME_SCT_VENDOR_SPECIFIC:
- return ("VENDOR SPECIFIC");
- default:
- return ("RESERVED");
- }
-
- while (entry->sc != 0xFFFF) {
- if (entry->sc == sc)
- return (entry->str);
- entry++;
- }
- return (entry->str);
+ sbuf_new(&sb, buf, len, SBUF_FIXEDLEN);
+ nvme_sc_sbuf(cpl, &sb);
+ if (sbuf_finish(&sb) != 0)
+ return ("");
+ return (buf);
}
void
nvme_qpair_print_completion(struct nvme_qpair *qpair,
struct nvme_completion *cpl)
{
- uint8_t sct, sc, crd, m, dnr, p;
+ char buf[64];
+ uint8_t crd, m, dnr, p;
- sct = NVME_STATUS_GET_SCT(cpl->status);
- sc = NVME_STATUS_GET_SC(cpl->status);
crd = NVME_STATUS_GET_CRD(cpl->status);
m = NVME_STATUS_GET_M(cpl->status);
dnr = NVME_STATUS_GET_DNR(cpl->status);
p = NVME_STATUS_GET_P(cpl->status);
- nvme_printf(qpair->ctrlr, "%s (%02x/%02x) crd:%x m:%x dnr:%x p:%d "
+ nvme_printf(qpair->ctrlr, "%s crd:%x m:%x dnr:%x p:%d "
"sqid:%d cid:%d cdw0:%x\n",
- get_status_string(sct, sc), sct, sc, crd, m, dnr, p,
+ get_status_string(cpl, buf, sizeof(buf)), crd, m, dnr, p,
cpl->sqid, cpl->cid, cpl->cdw0);
}
@@ -414,10 +210,12 @@ static void
nvme_qpair_complete_tracker(struct nvme_tracker *tr,
struct nvme_completion *cpl, error_print_t print_on_error)
{
- struct nvme_qpair * qpair = tr->qpair;
+ struct nvme_qpair *qpair = tr->qpair;
struct nvme_request *req;
bool retry, error, retriable;
+ mtx_assert(&qpair->lock, MA_NOTOWNED);
+
req = tr->req;
error = nvme_completion_is_error(cpl);
retriable = nvme_completion_is_retry(cpl);
@@ -480,43 +278,52 @@ nvme_qpair_complete_tracker(struct nvme_tracker *tr,
mtx_unlock(&qpair->lock);
}
+static uint32_t
+nvme_qpair_make_status(uint32_t sct, uint32_t sc, uint32_t dnr)
+{
+ uint32_t status = 0;
+
+ status |= NVMEF(NVME_STATUS_SCT, sct);
+ status |= NVMEF(NVME_STATUS_SC, sc);
+ status |= NVMEF(NVME_STATUS_DNR, dnr);
+ /* M=0 : this is artificial so no data in error log page */
+ /* CRD=0 : this is artificial and no delayed retry support anyway */
+ /* P=0 : phase not checked */
+ return (status);
+}
+
static void
nvme_qpair_manual_complete_tracker(
struct nvme_tracker *tr, uint32_t sct, uint32_t sc, uint32_t dnr,
error_print_t print_on_error)
{
struct nvme_completion cpl;
+ struct nvme_qpair * qpair = tr->qpair;
- memset(&cpl, 0, sizeof(cpl));
+ mtx_assert(&qpair->lock, MA_NOTOWNED);
- struct nvme_qpair * qpair = tr->qpair;
+ memset(&cpl, 0, sizeof(cpl));
cpl.sqid = qpair->id;
cpl.cid = tr->cid;
- cpl.status |= NVMEF(NVME_STATUS_SCT, sct);
- cpl.status |= NVMEF(NVME_STATUS_SC, sc);
- cpl.status |= NVMEF(NVME_STATUS_DNR, dnr);
- /* M=0 : this is artificial so no data in error log page */
- /* CRD=0 : this is artificial and no delayed retry support anyway */
- /* P=0 : phase not checked */
+ cpl.status = nvme_qpair_make_status(sct, sc, dnr);
nvme_qpair_complete_tracker(tr, &cpl, print_on_error);
}
-void
+static void
nvme_qpair_manual_complete_request(struct nvme_qpair *qpair,
- struct nvme_request *req, uint32_t sct, uint32_t sc)
+ struct nvme_request *req, uint32_t sct, uint32_t sc, uint32_t dnr,
+ error_print_t print_on_error)
{
struct nvme_completion cpl;
bool error;
memset(&cpl, 0, sizeof(cpl));
cpl.sqid = qpair->id;
- cpl.status |= NVMEF(NVME_STATUS_SCT, sct);
- cpl.status |= NVMEF(NVME_STATUS_SC, sc);
-
+ cpl.status = nvme_qpair_make_status(sct, sc, dnr);
error = nvme_completion_is_error(&cpl);
- if (error) {
+ if (error && print_on_error == ERROR_PRINT_ALL) {
nvme_qpair_print_command(qpair, &req->cmd);
nvme_qpair_print_completion(qpair, &cpl);
}
@@ -679,7 +486,7 @@ _nvme_qpair_process_completions(struct nvme_qpair *qpair)
bool
nvme_qpair_process_completions(struct nvme_qpair *qpair)
{
- bool done;
+ bool done = false;
/*
* Interlock with reset / recovery code. This is an usually uncontended
@@ -687,12 +494,12 @@ nvme_qpair_process_completions(struct nvme_qpair *qpair)
* and to prevent races with the recovery process called from a timeout
* context.
*/
- if (!mtx_trylock(&qpair->recovery)) {
- qpair->num_recovery_nolock++;
- return (false);
- }
+ mtx_lock(&qpair->recovery);
- done = _nvme_qpair_process_completions(qpair);
+ if (__predict_true(qpair->recovery_state == RECOVERY_NONE))
+ done = _nvme_qpair_process_completions(qpair);
+ else
+ qpair->num_recovery_nolock++; // XXX likely need to rename
mtx_unlock(&qpair->recovery);
@@ -950,27 +757,26 @@ nvme_admin_qpair_abort_aers(struct nvme_qpair *qpair)
/*
* nvme_complete_tracker must be called without the qpair lock held. It
* takes the lock to adjust outstanding_tr list, so make sure we don't
- * have it yet (since this is a general purpose routine). We take the
- * lock to make the list traverse safe, but have to drop the lock to
- * complete any AER. We restart the list scan when we do this to make
- * this safe. There's interlock with the ISR so we know this tracker
- * won't be completed twice.
+	 * have it yet. We need the lock to make the list traversal safe, but
+ * have to drop the lock to complete any AER. We restart the list scan
+ * when we do this to make this safe. There's interlock with the ISR so
+ * we know this tracker won't be completed twice.
*/
mtx_assert(&qpair->lock, MA_NOTOWNED);
mtx_lock(&qpair->lock);
tr = TAILQ_FIRST(&qpair->outstanding_tr);
while (tr != NULL) {
- if (tr->req->cmd.opc == NVME_OPC_ASYNC_EVENT_REQUEST) {
- mtx_unlock(&qpair->lock);
- nvme_qpair_manual_complete_tracker(tr,
- NVME_SCT_GENERIC, NVME_SC_ABORTED_SQ_DELETION, 0,
- ERROR_PRINT_NONE);
- mtx_lock(&qpair->lock);
- tr = TAILQ_FIRST(&qpair->outstanding_tr);
- } else {
+ if (tr->req->cmd.opc != NVME_OPC_ASYNC_EVENT_REQUEST) {
tr = TAILQ_NEXT(tr, tailq);
+ continue;
}
+ mtx_unlock(&qpair->lock);
+ nvme_qpair_manual_complete_tracker(tr,
+ NVME_SCT_GENERIC, NVME_SC_ABORTED_SQ_DELETION, 0,
+ ERROR_PRINT_NONE);
+ mtx_lock(&qpair->lock);
+ tr = TAILQ_FIRST(&qpair->outstanding_tr);
}
mtx_unlock(&qpair->lock);
}
@@ -997,22 +803,35 @@ nvme_abort_complete(void *arg, const struct nvme_completion *status)
struct nvme_tracker *tr = arg;
/*
- * If cdw0 == 1, the controller was not able to abort the command
- * we requested. We still need to check the active tracker array,
- * to cover race where I/O timed out at same time controller was
- * completing the I/O.
+ * If cdw0 bit 0 == 1, the controller was not able to abort the command
+ * we requested. We still need to check the active tracker array, to
+	 * cover the race where the I/O timed out at the same time the controller
+	 * was completing it. An abort command is always on the admin queue, but
+	 * affects either an admin or an I/O queue, so take the appropriate qpair lock
+ * for the original command's queue, since we'll need it to avoid races
+ * with the completion code and to complete the command manually.
*/
- if (status->cdw0 == 1 && tr->qpair->act_tr[tr->cid] != NULL) {
+ mtx_lock(&tr->qpair->lock);
+ if ((status->cdw0 & 1) == 1 && tr->qpair->act_tr[tr->cid] != NULL) {
/*
- * An I/O has timed out, and the controller was unable to
- * abort it for some reason. Construct a fake completion
- * status, and then complete the I/O's tracker manually.
+ * An I/O has timed out, and the controller was unable to abort
+ * it for some reason. And we've not processed a completion for
+ * it yet. Construct a fake completion status, and then complete
+ * the I/O's tracker manually.
*/
nvme_printf(tr->qpair->ctrlr,
"abort command failed, aborting command manually\n");
nvme_qpair_manual_complete_tracker(tr,
NVME_SCT_GENERIC, NVME_SC_ABORTED_BY_REQUEST, 0, ERROR_PRINT_ALL);
}
+ /*
+ * XXX We don't check status for the possible 'Could not abort because
+ * excess aborts were submitted to the controller'. We don't prevent
+	 * that, either. Documenting it here for the future, since the standard is
+	 * squishy and only says 'may generate' but implies anything is possible,
+	 * including hangs, if you exceed the ACL.
+ */
+ mtx_unlock(&tr->qpair->lock);
}
static void
@@ -1022,8 +841,9 @@ nvme_qpair_timeout(void *arg)
struct nvme_controller *ctrlr = qpair->ctrlr;
struct nvme_tracker *tr;
sbintime_t now;
- bool idle = false;
- bool needs_reset;
+ bool idle = true;
+ bool is_admin = qpair == &ctrlr->adminq;
+ bool fast;
uint32_t csts;
uint8_t cfs;
@@ -1034,9 +854,10 @@ nvme_qpair_timeout(void *arg)
* failure processing that races with the qpair timeout will fail
* safely.
*/
- if (qpair->ctrlr->is_failed) {
+ if (is_admin ? qpair->ctrlr->is_failed_admin : qpair->ctrlr->is_failed) {
nvme_printf(qpair->ctrlr,
- "Failed controller, stopping watchdog timeout.\n");
+ "%sFailed controller, stopping watchdog timeout.\n",
+ is_admin ? "Complete " : "");
qpair->timer_armed = false;
return;
}
@@ -1069,23 +890,35 @@ nvme_qpair_timeout(void *arg)
*/
csts = nvme_mmio_read_4(ctrlr, csts);
cfs = NVMEV(NVME_CSTS_REG_CFS, csts);
- if (csts == NVME_GONE || cfs == 1)
- goto do_reset;
+ if (csts == NVME_GONE || cfs == 1) {
+ /*
+ * We've had a command timeout that we weren't able to
+			 * abort, or we have aborts disabled and a command
+			 * timed out.
+ *
+ * If we get here due to a possible surprise hot-unplug
+ * event, then we let nvme_ctrlr_reset confirm and fail
+ * the controller.
+ */
+do_reset:
+ nvme_printf(ctrlr, "Resetting controller due to a timeout%s.\n",
+ (csts == 0xffffffff) ? " and possible hot unplug" :
+ (cfs ? " and fatal error status" : ""));
+ qpair->recovery_state = RECOVERY_WAITING;
+ nvme_ctrlr_reset(ctrlr);
+ idle = false;
+ break;
+ }
- /*
- * Process completions. We already have the recovery lock, so
- * call the locked version.
- */
- _nvme_qpair_process_completions(qpair);
/*
- * Check to see if we need to timeout any commands. If we do, then
- * we also enter a recovery phase.
+ * See if there's any recovery needed. First, do a fast check to
+ * see if anything could have timed out. If not, then skip
+ * everything else.
*/
- now = getsbinuptime();
- needs_reset = false;
- idle = true;
+ fast = false;
mtx_lock(&qpair->lock);
+ now = getsbinuptime();
TAILQ_FOREACH(tr, &qpair->outstanding_tr, tailq) {
/*
* Skip async commands, they are posted to the card for
@@ -1093,48 +926,83 @@ nvme_qpair_timeout(void *arg)
*/
if (tr->deadline == SBT_MAX)
continue;
- if (now > tr->deadline) {
- if (tr->req->cb_fn != nvme_abort_complete &&
- ctrlr->enable_aborts) {
- /*
- * This isn't an abort command, ask
- * for a hardware abort.
- */
- nvme_ctrlr_cmd_abort(ctrlr, tr->cid,
- qpair->id, nvme_abort_complete, tr);
- } else {
- /*
- * Otherwise we have a live command in
- * the card (either one we couldn't
- * abort, or aborts weren't enabled).
- * The only safe way to proceed is to do
- * a reset.
- */
- needs_reset = true;
- }
- } else {
- idle = false;
- }
+
+ /*
+ * If the first real transaction is not in timeout, then
+ * we're done. Otherwise, we try recovery.
+ */
+ idle = false;
+ if (now <= tr->deadline)
+ fast = true;
+ break;
}
mtx_unlock(&qpair->lock);
- if (!needs_reset)
+ if (idle || fast)
break;
/*
- * We've had a command timeout that we weren't able to abort
- *
- * If we get here due to a possible surprise hot-unplug event,
- * then we let nvme_ctrlr_reset confirm and fail the
- * controller.
+ * There's a stale transaction at the start of the queue whose
+		 * deadline has passed. Poll the completions as a last-ditch
+		 * effort in case an interrupt has been missed. If completions
+		 * were found, warn the user about possible interrupt issues,
+		 * but only once per controller.
+ */
+ if (_nvme_qpair_process_completions(qpair) && !ctrlr->isr_warned) {
+ nvme_printf(ctrlr, "System interrupt issues?\n");
+ ctrlr->isr_warned = true;
+ }
+
+ /*
+	 * Now that we've run the ISR, re-check to see if there are any
+	 * timed-out commands and abort them or reset the card if so.
*/
- do_reset:
- nvme_printf(ctrlr, "Resetting controller due to a timeout%s.\n",
- (csts == 0xffffffff) ? " and possible hot unplug" :
- (cfs ? " and fatal error status" : ""));
- qpair->recovery_state = RECOVERY_WAITING;
- nvme_ctrlr_reset(ctrlr);
- idle = false; /* We want to keep polling */
+ mtx_lock(&qpair->lock);
+ idle = true;
+ TAILQ_FOREACH(tr, &qpair->outstanding_tr, tailq) {
+ /*
+ * Skip async commands, they are posted to the card for
+ * an indefinite amount of time and have no deadline.
+ */
+ if (tr->deadline == SBT_MAX)
+ continue;
+
+ /*
+ * If we know this tracker hasn't timed out, we also
+ * know all subsequent ones haven't timed out. The tr
+ * queue is in submission order and all normal commands
+			 * in a queue have the same timeout (or the timeout was
+			 * changed by the user, in which case we'll still time
+			 * out eventually).
+ */
+ idle = false;
+ if (now <= tr->deadline)
+ break;
+
+ /*
+ * Timeout expired, abort it or reset controller.
+ */
+ if (ctrlr->enable_aborts &&
+ tr->req->cb_fn != nvme_abort_complete) {
+ /*
+ * This isn't an abort command, ask for a
+ * hardware abort. This goes to the admin
+ * queue which will reset the card if it
+ * times out.
+ */
+ nvme_ctrlr_cmd_abort(ctrlr, tr->cid, qpair->id,
+ nvme_abort_complete, tr);
+ } else {
+ /*
+ * We have a live command in the card (either
+ * one we couldn't abort, or aborts weren't
+ * enabled). We can only reset.
+ */
+ mtx_unlock(&qpair->lock);
+ goto do_reset;
+ }
+ }
+ mtx_unlock(&qpair->lock);
break;
+
case RECOVERY_WAITING:
/*
* These messages aren't interesting while we're suspended. We
@@ -1201,7 +1069,7 @@ nvme_qpair_submit_tracker(struct nvme_qpair *qpair, struct nvme_tracker *tr)
bus_dmamap_sync(qpair->dma_tag, qpair->queuemem_map,
BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
- bus_space_write_4(qpair->ctrlr->bus_tag, qpair->ctrlr->bus_handle,
+ bus_space_write_4(ctrlr->bus_tag, ctrlr->bus_handle,
qpair->sq_tdbl_off, qpair->sq_tail);
qpair->num_cmds++;
}
@@ -1259,47 +1127,41 @@ _nvme_qpair_submit_request(struct nvme_qpair *qpair, struct nvme_request *req)
{
struct nvme_tracker *tr;
int err = 0;
+ bool is_admin = qpair == &qpair->ctrlr->adminq;
mtx_assert(&qpair->lock, MA_OWNED);
tr = TAILQ_FIRST(&qpair->free_tr);
req->qpair = qpair;
- if (tr == NULL || qpair->recovery_state != RECOVERY_NONE) {
- /*
- * No tracker is available, or the qpair is disabled due to an
- * in-progress controller-level reset. If we lose the race with
- * recovery_state, then we may add an extra request to the queue
- * which will be resubmitted later. We only set recovery_state
- * to NONE with qpair->lock also held, so if we observe that the
- * state is not NONE, we know it can't transition to NONE below
- * when we've submitted the request to hardware.
- *
- * Also, as part of the failure process, we set recovery_state
- * to RECOVERY_WAITING, so we check here to see if we've failed
- * the controller. We set it before we call the qpair_fail
- * functions, which take out the lock lock before messing with
- * queued_req. Since we hold that lock, we know it's safe to
- * either fail directly, or queue the failure should is_failed
- * be stale. If we lose the race reading is_failed, then
- * nvme_qpair_fail will fail the queued request.
- */
+ /*
+	 * The controller has failed, so fail the request. Note that this races
+ * the recovery / timeout code. Since we hold the qpair lock, we know
+ * it's safe to fail directly. is_failed is set when we fail the
+ * controller. It is only ever reset in the ioctl reset controller
+ * path, which is safe to race (for failed controllers, we make no
+ * guarantees about bringing it out of failed state relative to other
+ * commands). We try hard to allow admin commands when the entire
+ * controller hasn't failed, only something related to I/O queues.
+ */
+ if (is_admin ? qpair->ctrlr->is_failed_admin : qpair->ctrlr->is_failed) {
+ nvme_qpair_manual_complete_request(qpair, req,
+ NVME_SCT_GENERIC, NVME_SC_ABORTED_BY_REQUEST, 1,
+ ERROR_PRINT_NONE);
+ return;
+ }
- if (qpair->ctrlr->is_failed) {
- /*
- * The controller has failed, so fail the request.
- */
- nvme_qpair_manual_complete_request(qpair, req,
- NVME_SCT_GENERIC, NVME_SC_ABORTED_BY_REQUEST);
- } else {
- /*
- * Put the request on the qpair's request queue to be
- * processed when a tracker frees up via a command
- * completion or when the controller reset is
- * completed.
- */
- STAILQ_INSERT_TAIL(&qpair->queued_req, req, stailq);
- }
+ /*
+ * No tracker is available, or the qpair is disabled due to an
+ * in-progress controller-level reset. If we lose the race with
+ * recovery_state, then we may add an extra request to the queue which
+ * will be resubmitted later. We only set recovery_state to NONE with
+ * qpair->lock also held, so if we observe that the state is not NONE,
+ * we know it won't transition back to NONE without retrying queued
+	 * we know it won't transition back to NONE without retrying the
+	 * queued requests.
+ if (tr == NULL || qpair->recovery_state != RECOVERY_NONE) {
+ STAILQ_INSERT_TAIL(&qpair->queued_req, req, stailq);
return;
}
@@ -1313,6 +1175,11 @@ _nvme_qpair_submit_request(struct nvme_qpair *qpair, struct nvme_request *req)
return;
}
+ /*
+	 * tr->deadline is updated when nvme_payload_map calls
+ * nvme_qpair_submit_tracker (we call it above directly
+ * when there's no map to load).
+ */
err = bus_dmamap_load_mem(tr->qpair->dma_tag_payload,
tr->payload_dma_map, &req->payload, nvme_payload_map, tr, 0);
if (err != 0) {
@@ -1344,11 +1211,13 @@ nvme_qpair_submit_request(struct nvme_qpair *qpair, struct nvme_request *req)
static void
nvme_qpair_enable(struct nvme_qpair *qpair)
{
+ bool is_admin __unused = qpair == &qpair->ctrlr->adminq;
+
if (mtx_initialized(&qpair->recovery))
mtx_assert(&qpair->recovery, MA_OWNED);
if (mtx_initialized(&qpair->lock))
mtx_assert(&qpair->lock, MA_OWNED);
- KASSERT(!qpair->ctrlr->is_failed,
+ KASSERT(!(is_admin ? qpair->ctrlr->is_failed_admin : qpair->ctrlr->is_failed),
("Enabling a failed qpair\n"));
qpair->recovery_state = RECOVERY_NONE;
@@ -1515,7 +1384,7 @@ nvme_qpair_fail(struct nvme_qpair *qpair)
STAILQ_REMOVE_HEAD(&qpair->queued_req, stailq);
mtx_unlock(&qpair->lock);
nvme_qpair_manual_complete_request(qpair, req, NVME_SCT_GENERIC,
- NVME_SC_ABORTED_BY_REQUEST);
+ NVME_SC_ABORTED_BY_REQUEST, 1, ERROR_PRINT_ALL);
mtx_lock(&qpair->lock);
}
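
To illustrate the reworked timeout path above, here is a hedged userspace sketch (not the driver code) of the two-pass scan: a fast check that looks only at the first tracker with a real deadline, and then, only if that head has expired, a full walk in submission order. The types and names (tracker, head_timed_out, scan_and_recover) are hypothetical; the real driver does this under qpair->lock with sbintime_t deadlines and polls completions between the two passes.

#include <stdbool.h>
#include <stdio.h>
#include <time.h>

struct tracker {
	time_t deadline;	/* (time_t)-1 models SBT_MAX: no deadline */
	struct tracker *next;
};

/* Pass 1: look only at the first tracker with a real deadline. */
static bool
head_timed_out(const struct tracker *head, time_t now)
{
	for (const struct tracker *tr = head; tr != NULL; tr = tr->next) {
		if (tr->deadline == (time_t)-1)
			continue;	/* async command, never times out */
		return (now > tr->deadline);
	}
	return (false);		/* queue idle: nothing can have timed out */
}

/* Pass 2: walk in submission order; stop at the first live tracker. */
static void
scan_and_recover(const struct tracker *head, time_t now)
{
	for (const struct tracker *tr = head; tr != NULL; tr = tr->next) {
		if (tr->deadline == (time_t)-1)
			continue;
		if (now <= tr->deadline)
			break;	/* later trackers expire no earlier */
		printf("tracker %p expired: abort or reset\n",
		    (const void *)tr);
	}
}

int
main(void)
{
	struct tracker late = { .deadline = time(NULL) + 30, .next = NULL };
	struct tracker expired = { .deadline = time(NULL) - 1, .next = &late };
	time_t now = time(NULL);

	if (head_timed_out(&expired, now))
		scan_and_recover(&expired, now);  /* driver polls completions first */
	return (0);
}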
diff --git a/sys/dev/nvme/nvme_sim.c b/sys/dev/nvme/nvme_sim.c
index f561756f99b7..4974bb718222 100644
--- a/sys/dev/nvme/nvme_sim.c
+++ b/sys/dev/nvme/nvme_sim.c
@@ -96,15 +96,16 @@ nvme_sim_nvmeio(struct cam_sim *sim, union ccb *ccb)
/* SG LIST ??? */
if ((nvmeio->ccb_h.flags & CAM_DATA_MASK) == CAM_DATA_BIO)
req = nvme_allocate_request_bio((struct bio *)payload,
- nvme_sim_nvmeio_done, ccb);
+ M_NOWAIT, nvme_sim_nvmeio_done, ccb);
else if ((nvmeio->ccb_h.flags & CAM_DATA_SG) == CAM_DATA_SG)
- req = nvme_allocate_request_ccb(ccb, nvme_sim_nvmeio_done, ccb);
+ req = nvme_allocate_request_ccb(ccb, M_NOWAIT,
+ nvme_sim_nvmeio_done, ccb);
else if (payload == NULL)
- req = nvme_allocate_request_null(nvme_sim_nvmeio_done, ccb);
+ req = nvme_allocate_request_null(M_NOWAIT, nvme_sim_nvmeio_done,
+ ccb);
else
- req = nvme_allocate_request_vaddr(payload, size,
+ req = nvme_allocate_request_vaddr(payload, size, M_NOWAIT,
nvme_sim_nvmeio_done, ccb);
-
if (req == NULL) {
nvmeio->ccb_h.status = CAM_RESRC_UNAVAIL;
xpt_done(ccb);
@@ -203,7 +204,7 @@ nvme_sim_action(struct cam_sim *sim, union ccb *ccb)
cpi->xport_specific.nvme.slot = pci_get_slot(dev);
cpi->xport_specific.nvme.function = pci_get_function(dev);
cpi->xport_specific.nvme.extra = 0;
- strncpy(cpi->xport_specific.nvme.dev_name, device_get_nameunit(dev),
+ strlcpy(cpi->xport_specific.nvme.dev_name, device_get_nameunit(dev),
sizeof(cpi->xport_specific.nvme.dev_name));
cpi->hba_vendor = pci_get_vendor(dev);
cpi->hba_device = pci_get_device(dev);
@@ -268,7 +269,6 @@ nvme_sim_action(struct cam_sim *sim, union ccb *ccb)
ccb->ccb_h.status = CAM_REQ_CMP;
break;
case XPT_NVME_IO: /* Execute the requested I/O operation */
- case XPT_NVME_ADMIN: /* or Admin operation */
if (ctrlr->is_failed) {
/*
* I/O came in while we were failing the drive, so drop
@@ -279,6 +279,18 @@ nvme_sim_action(struct cam_sim *sim, union ccb *ccb)
}
nvme_sim_nvmeio(sim, ccb);
return; /* no done */
+ case XPT_NVME_ADMIN: /* or Admin operation */
+ if (ctrlr->is_failed_admin) {
+ /*
+ * Admin request came in when we can't send admin
+			 * commands, so drop it. Once failure is complete, we'll
+ * be destroyed.
+ */
+ ccb->ccb_h.status = CAM_DEV_NOT_THERE;
+ break;
+ }
+ nvme_sim_nvmeio(sim, ccb);
+ return; /* no done */
default:
ccb->ccb_h.status = CAM_REQ_INVALID;
break;
diff --git a/sys/dev/nvme/nvme_sysctl.c b/sys/dev/nvme/nvme_sysctl.c
index d6452a2e5492..a5a44721f9f9 100644
--- a/sys/dev/nvme/nvme_sysctl.c
+++ b/sys/dev/nvme/nvme_sysctl.c
@@ -30,6 +30,7 @@
#include "opt_nvme.h"
#include <sys/param.h>
+#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/sysctl.h>
@@ -175,8 +176,10 @@ nvme_sysctl_num_cmds(SYSCTL_HANDLER_ARGS)
num_cmds = ctrlr->adminq.num_cmds;
- for (i = 0; i < ctrlr->num_io_queues; i++)
- num_cmds += ctrlr->ioq[i].num_cmds;
+ if (ctrlr->ioq != NULL) {
+ for (i = 0; i < ctrlr->num_io_queues; i++)
+ num_cmds += ctrlr->ioq[i].num_cmds;
+ }
return (sysctl_handle_64(oidp, &num_cmds, 0, req));
}
@@ -190,8 +193,10 @@ nvme_sysctl_num_intr_handler_calls(SYSCTL_HANDLER_ARGS)
num_intr_handler_calls = ctrlr->adminq.num_intr_handler_calls;
- for (i = 0; i < ctrlr->num_io_queues; i++)
- num_intr_handler_calls += ctrlr->ioq[i].num_intr_handler_calls;
+ if (ctrlr->ioq != NULL) {
+ for (i = 0; i < ctrlr->num_io_queues; i++)
+ num_intr_handler_calls += ctrlr->ioq[i].num_intr_handler_calls;
+ }
return (sysctl_handle_64(oidp, &num_intr_handler_calls, 0, req));
}
@@ -205,8 +210,10 @@ nvme_sysctl_num_retries(SYSCTL_HANDLER_ARGS)
num_retries = ctrlr->adminq.num_retries;
- for (i = 0; i < ctrlr->num_io_queues; i++)
- num_retries += ctrlr->ioq[i].num_retries;
+ if (ctrlr->ioq != NULL) {
+ for (i = 0; i < ctrlr->num_io_queues; i++)
+ num_retries += ctrlr->ioq[i].num_retries;
+ }
return (sysctl_handle_64(oidp, &num_retries, 0, req));
}
@@ -220,8 +227,10 @@ nvme_sysctl_num_failures(SYSCTL_HANDLER_ARGS)
num_failures = ctrlr->adminq.num_failures;
- for (i = 0; i < ctrlr->num_io_queues; i++)
- num_failures += ctrlr->ioq[i].num_failures;
+ if (ctrlr->ioq != NULL) {
+ for (i = 0; i < ctrlr->num_io_queues; i++)
+ num_failures += ctrlr->ioq[i].num_failures;
+ }
return (sysctl_handle_64(oidp, &num_failures, 0, req));
}
@@ -235,8 +244,10 @@ nvme_sysctl_num_ignored(SYSCTL_HANDLER_ARGS)
num_ignored = ctrlr->adminq.num_ignored;
- for (i = 0; i < ctrlr->num_io_queues; i++)
- num_ignored += ctrlr->ioq[i].num_ignored;
+ if (ctrlr->ioq != NULL) {
+ for (i = 0; i < ctrlr->num_io_queues; i++)
+ num_ignored += ctrlr->ioq[i].num_ignored;
+ }
return (sysctl_handle_64(oidp, &num_ignored, 0, req));
}
@@ -250,8 +261,10 @@ nvme_sysctl_num_recovery_nolock(SYSCTL_HANDLER_ARGS)
num = ctrlr->adminq.num_recovery_nolock;
- for (i = 0; i < ctrlr->num_io_queues; i++)
- num += ctrlr->ioq[i].num_recovery_nolock;
+ if (ctrlr->ioq != NULL) {
+ for (i = 0; i < ctrlr->num_io_queues; i++)
+ num += ctrlr->ioq[i].num_recovery_nolock;
+ }
return (sysctl_handle_64(oidp, &num, 0, req));
}
@@ -270,8 +283,10 @@ nvme_sysctl_reset_stats(SYSCTL_HANDLER_ARGS)
if (val != 0) {
nvme_qpair_reset_stats(&ctrlr->adminq);
- for (i = 0; i < ctrlr->num_io_queues; i++)
- nvme_qpair_reset_stats(&ctrlr->ioq[i]);
+ if (ctrlr->ioq != NULL) {
+ for (i = 0; i < ctrlr->num_io_queues; i++)
+ nvme_qpair_reset_stats(&ctrlr->ioq[i]);
+ }
}
return (0);
@@ -318,6 +333,10 @@ nvme_sysctl_initialize_queue(struct nvme_qpair *qpair,
CTLFLAG_RD, &qpair->num_recovery_nolock,
"Number of times that we failed to lock recovery in the ISR");
+ SYSCTL_ADD_UINT(ctrlr_ctx, que_list, OID_AUTO, "recovery",
+ CTLFLAG_RW, &qpair->recovery_state, 0,
+ "Current recovery state of the queue");
+
SYSCTL_ADD_PROC(ctrlr_ctx, que_list, OID_AUTO,
"dump_debug", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE,
qpair, 0, nvme_sysctl_dump_debug, "IU", "Dump debug data");
@@ -327,8 +346,8 @@ void
nvme_sysctl_initialize_ctrlr(struct nvme_controller *ctrlr)
{
struct sysctl_ctx_list *ctrlr_ctx;
- struct sysctl_oid *ctrlr_tree, *que_tree;
- struct sysctl_oid_list *ctrlr_list;
+ struct sysctl_oid *ctrlr_tree, *que_tree, *ioq_tree;
+ struct sysctl_oid_list *ctrlr_list, *ioq_list;
#define QUEUE_NAME_LENGTH 16
char queue_name[QUEUE_NAME_LENGTH];
int i;
@@ -407,16 +426,35 @@ nvme_sysctl_initialize_ctrlr(struct nvme_controller *ctrlr)
CTLFLAG_RD, &ctrlr->cap_hi, 0,
"Hi 32-bits of capacities for the drive");
+ SYSCTL_ADD_UINT(ctrlr_ctx, ctrlr_list, OID_AUTO, "fail_on_reset",
+ CTLFLAG_RD, &ctrlr->fail_on_reset, 0,
+ "Pretend the next reset fails and fail the controller");
+
que_tree = SYSCTL_ADD_NODE(ctrlr_ctx, ctrlr_list, OID_AUTO, "adminq",
CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Admin Queue");
nvme_sysctl_initialize_queue(&ctrlr->adminq, ctrlr_ctx, que_tree);
- for (i = 0; i < ctrlr->num_io_queues; i++) {
- snprintf(queue_name, QUEUE_NAME_LENGTH, "ioq%d", i);
- que_tree = SYSCTL_ADD_NODE(ctrlr_ctx, ctrlr_list, OID_AUTO,
- queue_name, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "IO Queue");
- nvme_sysctl_initialize_queue(&ctrlr->ioq[i], ctrlr_ctx,
- que_tree);
+ /*
+ * Make sure that we've constructed the I/O queues before setting up the
+	 * sysctls. Failed controllers won't allocate them, but we still want
+	 * the rest of the sysctls to help diagnose things.
+ */
+ if (ctrlr->ioq != NULL) {
+ ioq_tree = SYSCTL_ADD_NODE(ctrlr_ctx, ctrlr_list, OID_AUTO,
+ "ioq", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "I/O Queues");
+ ioq_list = SYSCTL_CHILDREN(ioq_tree);
+
+ for (i = 0; i < ctrlr->num_io_queues; i++) {
+ snprintf(queue_name, QUEUE_NAME_LENGTH, "%d", i);
+ que_tree = SYSCTL_ADD_NODE(ctrlr_ctx, ioq_list, OID_AUTO,
+ queue_name, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "IO Queue");
+ nvme_sysctl_initialize_queue(&ctrlr->ioq[i], ctrlr_ctx,
+ que_tree);
+ }
}
+
+ SYSCTL_ADD_COUNTER_U64(ctrlr_ctx, ctrlr_list, OID_AUTO, "alignment_splits",
+ CTLFLAG_RD, &ctrlr->alignment_splits,
+ "Number of times we split the I/O alignment for drives with preferred alignment");
}
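
A hedged usage sketch for the new per-queue sysctl layout: with the I/O queues grouped under an "ioq" node, a userland consumer could read one counter as shown below. The OID string dev.nvme.0.ioq.0.num_cmds and the unsigned 64-bit width are assumptions based on the tree built above; adjust the controller unit and queue number as needed.

#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t num_cmds;
	size_t len = sizeof(num_cmds);

	/* Assumed OID path: per-queue counters now live under "ioq.<n>". */
	if (sysctlbyname("dev.nvme.0.ioq.0.num_cmds", &num_cmds, &len,
	    NULL, 0) == -1) {
		perror("sysctlbyname");
		return (1);
	}
	printf("nvme0 ioq0 num_cmds: %ju\n", (uintmax_t)num_cmds);
	return (0);
}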
diff --git a/sys/dev/nvme/nvme_util.c b/sys/dev/nvme/nvme_util.c
index 47d84e5b6957..0a07653a7378 100644
--- a/sys/dev/nvme/nvme_util.c
+++ b/sys/dev/nvme/nvme_util.c
@@ -5,6 +5,8 @@
* Copyright (C) 1997 Justin T. Gibbs
* All rights reserved.
*
+ * Copyright (c) 2023-2025 Chelsio Communications, Inc.
+ *
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
@@ -28,8 +30,243 @@
*/
#include <sys/param.h>
+#include <sys/sbuf.h>
#include <dev/nvme/nvme.h>
+#define OPC_ENTRY(x) [NVME_OPC_ ## x] = #x
+
+static const char *admin_opcode[256] = {
+ OPC_ENTRY(DELETE_IO_SQ),
+ OPC_ENTRY(CREATE_IO_SQ),
+ OPC_ENTRY(GET_LOG_PAGE),
+ OPC_ENTRY(DELETE_IO_CQ),
+ OPC_ENTRY(CREATE_IO_CQ),
+ OPC_ENTRY(IDENTIFY),
+ OPC_ENTRY(ABORT),
+ OPC_ENTRY(SET_FEATURES),
+ OPC_ENTRY(GET_FEATURES),
+ OPC_ENTRY(ASYNC_EVENT_REQUEST),
+ OPC_ENTRY(NAMESPACE_MANAGEMENT),
+ OPC_ENTRY(FIRMWARE_ACTIVATE),
+ OPC_ENTRY(FIRMWARE_IMAGE_DOWNLOAD),
+ OPC_ENTRY(DEVICE_SELF_TEST),
+ OPC_ENTRY(NAMESPACE_ATTACHMENT),
+ OPC_ENTRY(KEEP_ALIVE),
+ OPC_ENTRY(DIRECTIVE_SEND),
+ OPC_ENTRY(DIRECTIVE_RECEIVE),
+ OPC_ENTRY(VIRTUALIZATION_MANAGEMENT),
+ OPC_ENTRY(NVME_MI_SEND),
+ OPC_ENTRY(NVME_MI_RECEIVE),
+ OPC_ENTRY(CAPACITY_MANAGEMENT),
+ OPC_ENTRY(LOCKDOWN),
+ OPC_ENTRY(DOORBELL_BUFFER_CONFIG),
+ OPC_ENTRY(FABRICS_COMMANDS),
+ OPC_ENTRY(FORMAT_NVM),
+ OPC_ENTRY(SECURITY_SEND),
+ OPC_ENTRY(SECURITY_RECEIVE),
+ OPC_ENTRY(SANITIZE),
+ OPC_ENTRY(GET_LBA_STATUS),
+};
+
+static const char *nvm_opcode[256] = {
+ OPC_ENTRY(FLUSH),
+ OPC_ENTRY(WRITE),
+ OPC_ENTRY(READ),
+ OPC_ENTRY(WRITE_UNCORRECTABLE),
+ OPC_ENTRY(COMPARE),
+ OPC_ENTRY(WRITE_ZEROES),
+ OPC_ENTRY(DATASET_MANAGEMENT),
+ OPC_ENTRY(VERIFY),
+ OPC_ENTRY(RESERVATION_REGISTER),
+ OPC_ENTRY(RESERVATION_REPORT),
+ OPC_ENTRY(RESERVATION_ACQUIRE),
+ OPC_ENTRY(RESERVATION_RELEASE),
+ OPC_ENTRY(COPY),
+};
+
+#define SC_ENTRY(x) [NVME_SC_ ## x] = #x
+
+static const char *generic_status[256] = {
+ SC_ENTRY(SUCCESS),
+ SC_ENTRY(INVALID_OPCODE),
+ SC_ENTRY(INVALID_FIELD),
+ SC_ENTRY(COMMAND_ID_CONFLICT),
+ SC_ENTRY(DATA_TRANSFER_ERROR),
+ SC_ENTRY(ABORTED_POWER_LOSS),
+ SC_ENTRY(INTERNAL_DEVICE_ERROR),
+ SC_ENTRY(ABORTED_BY_REQUEST),
+ SC_ENTRY(ABORTED_SQ_DELETION),
+ SC_ENTRY(ABORTED_FAILED_FUSED),
+ SC_ENTRY(ABORTED_MISSING_FUSED),
+ SC_ENTRY(INVALID_NAMESPACE_OR_FORMAT),
+ SC_ENTRY(COMMAND_SEQUENCE_ERROR),
+ SC_ENTRY(INVALID_SGL_SEGMENT_DESCR),
+ SC_ENTRY(INVALID_NUMBER_OF_SGL_DESCR),
+ SC_ENTRY(DATA_SGL_LENGTH_INVALID),
+ SC_ENTRY(METADATA_SGL_LENGTH_INVALID),
+ SC_ENTRY(SGL_DESCRIPTOR_TYPE_INVALID),
+ SC_ENTRY(INVALID_USE_OF_CMB),
+ SC_ENTRY(PRP_OFFET_INVALID),
+ SC_ENTRY(ATOMIC_WRITE_UNIT_EXCEEDED),
+ SC_ENTRY(OPERATION_DENIED),
+ SC_ENTRY(SGL_OFFSET_INVALID),
+ SC_ENTRY(HOST_ID_INCONSISTENT_FORMAT),
+ SC_ENTRY(KEEP_ALIVE_TIMEOUT_EXPIRED),
+ SC_ENTRY(KEEP_ALIVE_TIMEOUT_INVALID),
+ SC_ENTRY(ABORTED_DUE_TO_PREEMPT),
+ SC_ENTRY(SANITIZE_FAILED),
+ SC_ENTRY(SANITIZE_IN_PROGRESS),
+ SC_ENTRY(SGL_DATA_BLOCK_GRAN_INVALID),
+ SC_ENTRY(NOT_SUPPORTED_IN_CMB),
+ SC_ENTRY(NAMESPACE_IS_WRITE_PROTECTED),
+ SC_ENTRY(COMMAND_INTERRUPTED),
+ SC_ENTRY(TRANSIENT_TRANSPORT_ERROR),
+
+ SC_ENTRY(LBA_OUT_OF_RANGE),
+ SC_ENTRY(CAPACITY_EXCEEDED),
+ SC_ENTRY(NAMESPACE_NOT_READY),
+ SC_ENTRY(RESERVATION_CONFLICT),
+ SC_ENTRY(FORMAT_IN_PROGRESS),
+};
+
+static const char *command_specific_status[256] = {
+ SC_ENTRY(COMPLETION_QUEUE_INVALID),
+ SC_ENTRY(INVALID_QUEUE_IDENTIFIER),
+ SC_ENTRY(MAXIMUM_QUEUE_SIZE_EXCEEDED),
+ SC_ENTRY(ABORT_COMMAND_LIMIT_EXCEEDED),
+ SC_ENTRY(ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED),
+ SC_ENTRY(INVALID_FIRMWARE_SLOT),
+ SC_ENTRY(INVALID_FIRMWARE_IMAGE),
+ SC_ENTRY(INVALID_INTERRUPT_VECTOR),
+ SC_ENTRY(INVALID_LOG_PAGE),
+ SC_ENTRY(INVALID_FORMAT),
+ SC_ENTRY(FIRMWARE_REQUIRES_RESET),
+ SC_ENTRY(INVALID_QUEUE_DELETION),
+ SC_ENTRY(FEATURE_NOT_SAVEABLE),
+ SC_ENTRY(FEATURE_NOT_CHANGEABLE),
+ SC_ENTRY(FEATURE_NOT_NS_SPECIFIC),
+ SC_ENTRY(FW_ACT_REQUIRES_NVMS_RESET),
+ SC_ENTRY(FW_ACT_REQUIRES_RESET),
+ SC_ENTRY(FW_ACT_REQUIRES_TIME),
+ SC_ENTRY(FW_ACT_PROHIBITED),
+ SC_ENTRY(OVERLAPPING_RANGE),
+ SC_ENTRY(NS_INSUFFICIENT_CAPACITY),
+ SC_ENTRY(NS_ID_UNAVAILABLE),
+ SC_ENTRY(NS_ALREADY_ATTACHED),
+ SC_ENTRY(NS_IS_PRIVATE),
+ SC_ENTRY(NS_NOT_ATTACHED),
+ SC_ENTRY(THIN_PROV_NOT_SUPPORTED),
+ SC_ENTRY(CTRLR_LIST_INVALID),
+ SC_ENTRY(SELF_TEST_IN_PROGRESS),
+ SC_ENTRY(BOOT_PART_WRITE_PROHIB),
+ SC_ENTRY(INVALID_CTRLR_ID),
+ SC_ENTRY(INVALID_SEC_CTRLR_STATE),
+ SC_ENTRY(INVALID_NUM_OF_CTRLR_RESRC),
+ SC_ENTRY(INVALID_RESOURCE_ID),
+ SC_ENTRY(SANITIZE_PROHIBITED_WPMRE),
+ SC_ENTRY(ANA_GROUP_ID_INVALID),
+ SC_ENTRY(ANA_ATTACH_FAILED),
+
+ SC_ENTRY(CONFLICTING_ATTRIBUTES),
+ SC_ENTRY(INVALID_PROTECTION_INFO),
+ SC_ENTRY(ATTEMPTED_WRITE_TO_RO_PAGE),
+};
+
+static const char *media_error_status[256] = {
+ SC_ENTRY(WRITE_FAULTS),
+ SC_ENTRY(UNRECOVERED_READ_ERROR),
+ SC_ENTRY(GUARD_CHECK_ERROR),
+ SC_ENTRY(APPLICATION_TAG_CHECK_ERROR),
+ SC_ENTRY(REFERENCE_TAG_CHECK_ERROR),
+ SC_ENTRY(COMPARE_FAILURE),
+ SC_ENTRY(ACCESS_DENIED),
+ SC_ENTRY(DEALLOCATED_OR_UNWRITTEN),
+};
+
+static const char *path_related_status[256] = {
+ SC_ENTRY(INTERNAL_PATH_ERROR),
+ SC_ENTRY(ASYMMETRIC_ACCESS_PERSISTENT_LOSS),
+ SC_ENTRY(ASYMMETRIC_ACCESS_INACCESSIBLE),
+ SC_ENTRY(ASYMMETRIC_ACCESS_TRANSITION),
+ SC_ENTRY(CONTROLLER_PATHING_ERROR),
+ SC_ENTRY(HOST_PATHING_ERROR),
+ SC_ENTRY(COMMAND_ABORTED_BY_HOST),
+};
+
+void
+nvme_opcode_sbuf(bool admin, uint8_t opc, struct sbuf *sb)
+{
+ const char *s, *type;
+
+ if (admin) {
+ s = admin_opcode[opc];
+ type = "ADMIN";
+ } else {
+ s = nvm_opcode[opc];
+ type = "NVM";
+ }
+ if (s == NULL)
+ sbuf_printf(sb, "%s (%02x)", type, opc);
+ else
+ sbuf_printf(sb, "%s", s);
+}
+
+void
+nvme_sc_sbuf(const struct nvme_completion *cpl, struct sbuf *sb)
+{
+ const char *s, *type;
+ uint16_t status;
+
+ status = le16toh(cpl->status);
+ switch (NVME_STATUS_GET_SCT(status)) {
+ case NVME_SCT_GENERIC:
+ s = generic_status[NVME_STATUS_GET_SC(status)];
+ type = "GENERIC";
+ break;
+ case NVME_SCT_COMMAND_SPECIFIC:
+ s = command_specific_status[NVME_STATUS_GET_SC(status)];
+ type = "COMMAND SPECIFIC";
+ break;
+ case NVME_SCT_MEDIA_ERROR:
+ s = media_error_status[NVME_STATUS_GET_SC(status)];
+ type = "MEDIA ERROR";
+ break;
+ case NVME_SCT_PATH_RELATED:
+ s = path_related_status[NVME_STATUS_GET_SC(status)];
+ type = "PATH RELATED";
+ break;
+ case NVME_SCT_VENDOR_SPECIFIC:
+ s = NULL;
+ type = "VENDOR SPECIFIC";
+ break;
+ default:
+ s = NULL;
+ type = NULL;
+ break;
+ }
+
+ if (type == NULL)
+ sbuf_printf(sb, "RESERVED (%02x/%02x)",
+ NVME_STATUS_GET_SCT(status), NVME_STATUS_GET_SC(status));
+ else if (s == NULL)
+ sbuf_printf(sb, "%s (%02x)", type, NVME_STATUS_GET_SC(status));
+ else
+ sbuf_printf(sb, "%s", s);
+}
+
+void
+nvme_cpl_sbuf(const struct nvme_completion *cpl, struct sbuf *sb)
+{
+ uint16_t status;
+
+ status = le16toh(cpl->status);
+ nvme_sc_sbuf(cpl, sb);
+ if (NVME_STATUS_GET_M(status) != 0)
+ sbuf_printf(sb, " M");
+ if (NVME_STATUS_GET_DNR(status) != 0)
+ sbuf_printf(sb, " DNR");
+}
+
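/*
 * Hedged usage sketch (not part of this change): formatting a completion
 * with the helpers above using a fixed-length sbuf(9) on the stack.  The
 * consumer function name and buffer size are hypothetical.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sbuf.h>
#include <dev/nvme/nvme.h>

static void __unused
example_log_cpl(const struct nvme_completion *cpl)
{
	char buf[80];
	struct sbuf sb;

	sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN);
	nvme_cpl_sbuf(cpl, &sb);	/* status text plus M/DNR flags */
	if (sbuf_finish(&sb) == 0)
		printf("nvme completion: %s\n", sbuf_data(&sb));
	sbuf_delete(&sb);
}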
void
nvme_strvis(uint8_t *dst, const uint8_t *src, int dstlen, int srclen)
{