Diffstat (limited to 'sys/dev/nvme')
-rw-r--r-- | sys/dev/nvme/nvme.h | 55
-rw-r--r-- | sys/dev/nvme/nvme_ctrlr.c | 242
-rw-r--r-- | sys/dev/nvme/nvme_ctrlr_cmd.c | 29
-rw-r--r-- | sys/dev/nvme/nvme_linux.h | 58
-rw-r--r-- | sys/dev/nvme/nvme_ns.c | 29
-rw-r--r-- | sys/dev/nvme/nvme_ns_cmd.c | 24
-rw-r--r-- | sys/dev/nvme/nvme_private.h | 39
-rw-r--r-- | sys/dev/nvme/nvme_qpair.c | 597
-rw-r--r-- | sys/dev/nvme/nvme_sim.c | 26
-rw-r--r-- | sys/dev/nvme/nvme_sysctl.c | 82
-rw-r--r-- | sys/dev/nvme/nvme_util.c | 237
11 files changed, 925 insertions, 493 deletions
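The headline addition below is a Linux-compatible ioctl surface (nvme_linux.h, wired into nvme_ctrlr.c and nvme_ns.c) plus a native NVME_GET_CONTROLLER_DATA ioctl. A minimal userland sketch of the new interface follows; it issues an Identify Controller admin command (opcode 0x06, CNS=1) through the new NVME_IOCTL_ADMIN_CMD. The device path, buffer size, and error handling are illustrative assumptions, not part of the diff.

/*
 * Sketch only: exercise the Linux-compatible passthru ioctl added below.
 * struct nvme_passthru_cmd and NVME_IOCTL_ADMIN_CMD come from the new
 * <dev/nvme/nvme_linux.h>; everything else is an assumption for the example.
 */
#include <sys/ioctl.h>
#include <dev/nvme/nvme_linux.h>
#include <err.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int
main(void)
{
	struct nvme_passthru_cmd npc;
	char *buf;
	int fd;

	if ((fd = open("/dev/nvme0", O_RDWR)) == -1)
		err(1, "open(/dev/nvme0)");
	if ((buf = calloc(1, 4096)) == NULL)
		err(1, "calloc");

	memset(&npc, 0, sizeof(npc));	/* metadata must stay 0: unsupported */
	npc.opcode = 0x06;		/* admin IDENTIFY */
	npc.addr = (uintptr_t)buf;	/* data-in buffer */
	npc.data_len = 4096;
	npc.cdw10 = 1;			/* CNS=1: identify controller */

	if (ioctl(fd, NVME_IOCTL_ADMIN_CMD, &npc) == -1)
		err(1, "NVME_IOCTL_ADMIN_CMD");
	printf("model: %.40s\n", buf + 24);	/* MN field, bytes 24-63 */
	return (0);
}

The same ioctls also work on the namespace device, where NVME_IOCTL_ID returns the namespace id rather than ~0, matching the Linux behavior the diff implements.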
diff --git a/sys/dev/nvme/nvme.h b/sys/dev/nvme/nvme.h index a4baade7df5d..17c5cdb4db87 100644 --- a/sys/dev/nvme/nvme.h +++ b/sys/dev/nvme/nvme.h @@ -35,11 +35,17 @@ #include <sys/param.h> #include <sys/endian.h> +#ifndef _KERNEL +#include <stdbool.h> +#endif + +struct sbuf; #define NVME_PASSTHROUGH_CMD _IOWR('n', 0, struct nvme_pt_command) #define NVME_RESET_CONTROLLER _IO('n', 1) #define NVME_GET_NSID _IOR('n', 2, struct nvme_get_nsid) #define NVME_GET_MAX_XFER_SIZE _IOR('n', 3, uint64_t) +#define NVME_GET_CONTROLLER_DATA _IOR('n', 4, struct nvme_controller_data) #define NVME_IO_TEST _IOWR('n', 100, struct nvme_io_test) #define NVME_BIO_TEST _IOWR('n', 101, struct nvme_io_test) @@ -648,8 +654,16 @@ enum nvme_critical_warning_state { NVME_CRIT_WARN_ST_PERSISTENT_MEMORY_REGION = 0x20, }; #define NVME_CRIT_WARN_ST_RESERVED_MASK (0xC0) -#define NVME_ASYNC_EVENT_NS_ATTRIBUTE (0x100) -#define NVME_ASYNC_EVENT_FW_ACTIVATE (0x200) +#define NVME_ASYNC_EVENT_NS_ATTRIBUTE (1U << 8) +#define NVME_ASYNC_EVENT_FW_ACTIVATE (1U << 9) +#define NVME_ASYNC_EVENT_TELEMETRY_LOG (1U << 10) +#define NVME_ASYNC_EVENT_ASYM_NS_ACC (1U << 11) +#define NVME_ASYNC_EVENT_PRED_LAT_DELTA (1U << 12) +#define NVME_ASYNC_EVENT_LBA_STATUS (1U << 13) +#define NVME_ASYNC_EVENT_ENDURANCE_DELTA (1U << 14) +#define NVME_ASYNC_EVENT_NVM_SHUTDOWN (1U << 15) +#define NVME_ASYNC_EVENT_ZONE_DELTA (1U << 27) +#define NVME_ASYNC_EVENT_DISCOVERY_DELTA (1U << 31) /* slot for current FW */ #define NVME_FIRMWARE_PAGE_AFI_SLOT_SHIFT (0) @@ -832,7 +846,7 @@ struct nvme_command { uint32_t cdw13; /* command-specific */ uint32_t cdw14; /* command-specific */ uint32_t cdw15; /* command-specific */ -}; +} __aligned(8); _Static_assert(sizeof(struct nvme_command) == 16 * 4, "bad size for nvme_command"); @@ -1601,7 +1615,7 @@ struct nvme_health_information_page { uint32_t ttftmt2; uint8_t reserved2[280]; -} __packed __aligned(4); +} __packed __aligned(8); _Static_assert(sizeof(struct nvme_health_information_page) == 512, "bad size for nvme_health_information_page"); @@ -1652,6 +1666,30 @@ struct nvme_device_self_test_page { _Static_assert(sizeof(struct nvme_device_self_test_page) == 564, "bad size for nvme_device_self_test_page"); +/* + * Header structure for both host initiated telemetry (page 7) and controller + * initiated telemetry (page 8). 
+ */ +struct nvme_telemetry_log_page { + uint8_t identifier; + uint8_t rsvd[4]; + uint8_t oui[3]; + uint16_t da1_last; + uint16_t da2_last; + uint16_t da3_last; + uint8_t rsvd2[2]; + uint32_t da4_last; + uint8_t rsvd3[361]; + uint8_t hi_gen; + uint8_t ci_avail; + uint8_t ci_gen; + uint8_t reason[128]; + /* Blocks of telemetry data follow */ +} __packed __aligned(4); + +_Static_assert(sizeof(struct nvme_telemetry_log_page) == 512, + "bad size for nvme_telemetry_log"); + struct nvme_discovery_log_entry { uint8_t trtype; uint8_t adrfam; @@ -1868,6 +1906,9 @@ struct nvme_hmb_desc { #define nvme_completion_is_error(cpl) \ (NVME_STATUS_GET_SC((cpl)->status) != 0 || NVME_STATUS_GET_SCT((cpl)->status) != 0) +void nvme_cpl_sbuf(const struct nvme_completion *cpl, struct sbuf *sbuf); +void nvme_opcode_sbuf(bool admin, uint8_t opc, struct sbuf *sb); +void nvme_sc_sbuf(const struct nvme_completion *cpl, struct sbuf *sbuf); void nvme_strvis(uint8_t *dst, const uint8_t *src, int dstlen, int srclen); #ifdef _KERNEL @@ -1878,6 +1919,7 @@ struct thread; struct nvme_namespace; struct nvme_controller; struct nvme_consumer; +struct nvme_passthru_cmd; typedef void (*nvme_cb_fn_t)(void *, const struct nvme_completion *); @@ -1897,6 +1939,11 @@ int nvme_ctrlr_passthrough_cmd(struct nvme_controller *ctrlr, uint32_t nsid, int is_user_buffer, int is_admin_cmd); +int nvme_ctrlr_linux_passthru_cmd(struct nvme_controller *ctrlr, + struct nvme_passthru_cmd *npc, + uint32_t nsid, bool is_user, + bool is_admin); + /* Admin functions */ void nvme_ctrlr_cmd_set_feature(struct nvme_controller *ctrlr, uint8_t feature, uint32_t cdw11, diff --git a/sys/dev/nvme/nvme_ctrlr.c b/sys/dev/nvme/nvme_ctrlr.c index b7b03082c54e..73a7cee4aad0 100644 --- a/sys/dev/nvme/nvme_ctrlr.c +++ b/sys/dev/nvme/nvme_ctrlr.c @@ -39,10 +39,11 @@ #include <sys/uio.h> #include <sys/sbuf.h> #include <sys/endian.h> -#include <machine/stdarg.h> +#include <sys/stdarg.h> #include <vm/vm.h> #include "nvme_private.h" +#include "nvme_linux.h" #define B4_CHK_RDY_DELAY_MS 2300 /* work around controller bug */ @@ -231,7 +232,7 @@ nvme_ctrlr_construct_io_qpairs(struct nvme_controller *ctrlr) } static void -nvme_ctrlr_fail(struct nvme_controller *ctrlr) +nvme_ctrlr_fail(struct nvme_controller *ctrlr, bool admin_also) { int i; @@ -241,7 +242,10 @@ nvme_ctrlr_fail(struct nvme_controller *ctrlr) * a different error, though when we fail, that hardly matters). */ ctrlr->is_failed = true; - nvme_qpair_fail(&ctrlr->adminq); + if (admin_also) { + ctrlr->is_failed_admin = true; + nvme_qpair_fail(&ctrlr->adminq); + } if (ctrlr->ioq != NULL) { for (i = 0; i < ctrlr->num_io_queues; i++) { nvme_qpair_fail(&ctrlr->ioq[i]); @@ -414,6 +418,7 @@ nvme_ctrlr_hw_reset(struct nvme_controller *ctrlr) TSENTER(); + ctrlr->is_failed_admin = true; nvme_ctrlr_disable_qpairs(ctrlr); err = nvme_ctrlr_disable(ctrlr); @@ -422,6 +427,8 @@ nvme_ctrlr_hw_reset(struct nvme_controller *ctrlr) err = nvme_ctrlr_enable(ctrlr); out: + if (err == 0) + ctrlr->is_failed_admin = false; TSEXIT(); return (err); @@ -434,11 +441,10 @@ nvme_ctrlr_reset(struct nvme_controller *ctrlr) cmpset = atomic_cmpset_32(&ctrlr->is_resetting, 0, 1); - if (cmpset == 0 || ctrlr->is_failed) + if (cmpset == 0) /* - * Controller is already resetting or has failed. Return - * immediately since there is no need to kick off another - * reset in these cases. + * Controller is already resetting. Return immediately since + * there is no need to kick off another reset. 
*/ return; @@ -814,7 +820,13 @@ nvme_ctrlr_construct_and_submit_aer(struct nvme_controller *ctrlr, struct nvme_request *req; aer->ctrlr = ctrlr; - req = nvme_allocate_request_null(nvme_ctrlr_async_event_cb, aer); + /* + * XXX-MJ this should be M_WAITOK but we might be in a non-sleepable + * callback context. AER completions should be handled on a dedicated + * thread. + */ + req = nvme_allocate_request_null(M_NOWAIT, nvme_ctrlr_async_event_cb, + aer); aer->req = req; /* @@ -1089,7 +1101,7 @@ nvme_ctrlr_start(void *ctrlr_arg, bool resetting) return; if (resetting && nvme_ctrlr_identify(ctrlr) != 0) { - nvme_ctrlr_fail(ctrlr); + nvme_ctrlr_fail(ctrlr, false); return; } @@ -1104,7 +1116,7 @@ nvme_ctrlr_start(void *ctrlr_arg, bool resetting) if (resetting) { old_num_io_queues = ctrlr->num_io_queues; if (nvme_ctrlr_set_num_qpairs(ctrlr) != 0) { - nvme_ctrlr_fail(ctrlr); + nvme_ctrlr_fail(ctrlr, false); return; } @@ -1122,12 +1134,12 @@ nvme_ctrlr_start(void *ctrlr_arg, bool resetting) nvme_ctrlr_hmb_enable(ctrlr, true, true); if (nvme_ctrlr_create_qpairs(ctrlr) != 0) { - nvme_ctrlr_fail(ctrlr); + nvme_ctrlr_fail(ctrlr, false); return; } if (nvme_ctrlr_construct_namespaces(ctrlr) != 0) { - nvme_ctrlr_fail(ctrlr); + nvme_ctrlr_fail(ctrlr, false); return; } @@ -1146,9 +1158,8 @@ nvme_ctrlr_start_config_hook(void *arg) TSENTER(); - if (nvme_ctrlr_hw_reset(ctrlr) != 0) { -fail: - nvme_ctrlr_fail(ctrlr); + if (nvme_ctrlr_hw_reset(ctrlr) != 0 || ctrlr->fail_on_reset != 0) { + nvme_ctrlr_fail(ctrlr, true); config_intrhook_disestablish(&ctrlr->config_hook); return; } @@ -1161,13 +1172,15 @@ fail: nvme_ctrlr_construct_io_qpairs(ctrlr) == 0) nvme_ctrlr_start(ctrlr, false); else - goto fail; + nvme_ctrlr_fail(ctrlr, false); nvme_sysctl_initialize_ctrlr(ctrlr); config_intrhook_disestablish(&ctrlr->config_hook); - ctrlr->is_initialized = 1; - nvme_notify_new_controller(ctrlr); + if (!ctrlr->is_failed) { + ctrlr->is_initialized = true; + nvme_notify_new_controller(ctrlr); + } TSEXIT(); } @@ -1184,7 +1197,7 @@ nvme_ctrlr_reset_task(void *arg, int pending) nvme_ctrlr_start(ctrlr, true); } else { nvme_ctrlr_devctl_log(ctrlr, "RESET", "event=\"timed_out\""); - nvme_ctrlr_fail(ctrlr); + nvme_ctrlr_fail(ctrlr, true); } atomic_cmpset_32(&ctrlr->is_resetting, 1, 0); @@ -1258,24 +1271,19 @@ nvme_ctrlr_passthrough_cmd(struct nvme_controller *ctrlr, return EIO; } if (is_user_buffer) { - /* - * Ensure the user buffer is wired for the duration of - * this pass-through command. - */ - PHOLD(curproc); buf = uma_zalloc(pbuf_zone, M_WAITOK); buf->b_iocmd = pt->is_read ? 
BIO_READ : BIO_WRITE; if (vmapbuf(buf, pt->buf, pt->len, 1) < 0) { ret = EFAULT; goto err; } - req = nvme_allocate_request_vaddr(buf->b_data, pt->len, - nvme_pt_done, pt); + req = nvme_allocate_request_vaddr(buf->b_data, pt->len, + M_WAITOK, nvme_pt_done, pt); } else req = nvme_allocate_request_vaddr(pt->buf, pt->len, - nvme_pt_done, pt); + M_WAITOK, nvme_pt_done, pt); } else - req = nvme_allocate_request_null(nvme_pt_done, pt); + req = nvme_allocate_request_null(M_WAITOK, nvme_pt_done, pt); /* Assume user space already converted to little-endian */ req->cmd.opc = pt->cmd.opc; @@ -1308,7 +1316,104 @@ nvme_ctrlr_passthrough_cmd(struct nvme_controller *ctrlr, vunmapbuf(buf); err: uma_zfree(pbuf_zone, buf); - PRELE(curproc); + } + + return (ret); +} + +static void +nvme_npc_done(void *arg, const struct nvme_completion *cpl) +{ + struct nvme_passthru_cmd *npc = arg; + struct mtx *mtx = (void *)(uintptr_t)npc->metadata; + + npc->result = cpl->cdw0; /* cpl in host order by now */ + mtx_lock(mtx); + npc->metadata = 0; + wakeup(npc); + mtx_unlock(mtx); +} + +/* XXX refactor? */ + +int +nvme_ctrlr_linux_passthru_cmd(struct nvme_controller *ctrlr, + struct nvme_passthru_cmd *npc, uint32_t nsid, bool is_user, bool is_admin) +{ + struct nvme_request *req; + struct mtx *mtx; + struct buf *buf = NULL; + int ret = 0; + + /* + * We don't support metadata. + */ + if (npc->metadata != 0 || npc->metadata_len != 0) + return (EIO); + + if (npc->data_len > 0 && npc->addr != 0) { + if (npc->data_len > ctrlr->max_xfer_size) { + nvme_printf(ctrlr, + "npc->data_len (%d) exceeds max_xfer_size (%d)\n", + npc->data_len, ctrlr->max_xfer_size); + return (EIO); + } + /* + * We only support data out or data in commands, but not both at + * once. However, there are some commands with the lower bit + * cleared that are really read commands, so we should filter + * (opcode & 3) == 0, but don't. + */ + if ((npc->opcode & 0x3) == 3) + return (EINVAL); + if (is_user) { + buf = uma_zalloc(pbuf_zone, M_WAITOK); + buf->b_iocmd = npc->opcode & 1 ?
BIO_WRITE : BIO_READ; + if (vmapbuf(buf, (void *)(uintptr_t)npc->addr, + npc->data_len, 1) < 0) { + ret = EFAULT; + goto err; + } + req = nvme_allocate_request_vaddr(buf->b_data, + npc->data_len, M_WAITOK, nvme_npc_done, npc); + } else + req = nvme_allocate_request_vaddr( + (void *)(uintptr_t)npc->addr, npc->data_len, + M_WAITOK, nvme_npc_done, npc); + } else + req = nvme_allocate_request_null(M_WAITOK, nvme_npc_done, npc); + + req->cmd.opc = npc->opcode; + req->cmd.fuse = npc->flags; + req->cmd.rsvd2 = htole16(npc->cdw2); + req->cmd.rsvd3 = htole16(npc->cdw3); + req->cmd.cdw10 = htole32(npc->cdw10); + req->cmd.cdw11 = htole32(npc->cdw11); + req->cmd.cdw12 = htole32(npc->cdw12); + req->cmd.cdw13 = htole32(npc->cdw13); + req->cmd.cdw14 = htole32(npc->cdw14); + req->cmd.cdw15 = htole32(npc->cdw15); + + req->cmd.nsid = htole32(nsid); + + mtx = mtx_pool_find(mtxpool_sleep, npc); + npc->metadata = (uintptr_t) mtx; + + /* XXX no timeout passed down */ + if (is_admin) + nvme_ctrlr_submit_admin_request(ctrlr, req); + else + nvme_ctrlr_submit_io_request(ctrlr, req); + + mtx_lock(mtx); + while (npc->metadata != 0) + mtx_sleep(npc, mtx, PRIBIO, "nvme_npc", 0); + mtx_unlock(mtx); + + if (buf != NULL) { + vunmapbuf(buf); +err: + uma_zfree(pbuf_zone, buf); } return (ret); @@ -1324,6 +1429,7 @@ nvme_ctrlr_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int flag, ctrlr = cdev->si_drv1; switch (cmd) { + case NVME_IOCTL_RESET: /* Linux compat */ case NVME_RESET_CONTROLLER: nvme_ctrlr_reset(ctrlr); break; @@ -1334,15 +1440,30 @@ nvme_ctrlr_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int flag, case NVME_GET_NSID: { struct nvme_get_nsid *gnsid = (struct nvme_get_nsid *)arg; - strncpy(gnsid->cdev, device_get_nameunit(ctrlr->dev), + strlcpy(gnsid->cdev, device_get_nameunit(ctrlr->dev), sizeof(gnsid->cdev)); - gnsid->cdev[sizeof(gnsid->cdev) - 1] = '\0'; gnsid->nsid = 0; break; } case NVME_GET_MAX_XFER_SIZE: *(uint64_t *)arg = ctrlr->max_xfer_size; break; + case NVME_GET_CONTROLLER_DATA: + memcpy(arg, &ctrlr->cdata, sizeof(ctrlr->cdata)); + break; + /* Linux Compatible (see nvme_linux.h) */ + case NVME_IOCTL_ID: + td->td_retval[0] = 0xfffffffful; + return (0); + + case NVME_IOCTL_ADMIN_CMD: + case NVME_IOCTL_IO_CMD: { + struct nvme_passthru_cmd *npc = (struct nvme_passthru_cmd *)arg; + + return (nvme_ctrlr_linux_passthru_cmd(ctrlr, npc, npc->nsid, true, + cmd == NVME_IOCTL_ADMIN_CMD)); + } + default: return (ENOTTY); } @@ -1443,6 +1564,8 @@ nvme_ctrlr_construct(struct nvme_controller *ctrlr, device_t dev) ctrlr->enable_aborts = 0; TUNABLE_INT_FETCH("hw.nvme.enable_aborts", &ctrlr->enable_aborts); + ctrlr->alignment_splits = counter_u64_alloc(M_WAITOK); + /* Cap transfers by the maximum addressable by page-sized PRP (4KB pages -> 2MB). 
*/ ctrlr->max_xfer_size = MIN(maxphys, (ctrlr->page_size / 8 * ctrlr->page_size)); if (nvme_ctrlr_construct_admin_qpair(ctrlr) != 0) @@ -1464,7 +1587,7 @@ nvme_ctrlr_construct(struct nvme_controller *ctrlr, device_t dev) taskqueue_start_threads(&ctrlr->taskqueue, 2, PI_DISK, "nvme taskq"); ctrlr->is_resetting = 0; - ctrlr->is_initialized = 0; + ctrlr->is_initialized = false; ctrlr->notification_sent = 0; TASK_INIT(&ctrlr->reset_task, 0, nvme_ctrlr_reset_task, ctrlr); STAILQ_INIT(&ctrlr->fail_req); @@ -1477,18 +1600,25 @@ nvme_ctrlr_construct(struct nvme_controller *ctrlr, device_t dev) md_args.mda_mode = 0600; md_args.mda_unit = device_get_unit(dev); md_args.mda_si_drv1 = (void *)ctrlr; - status = make_dev_s(&md_args, &ctrlr->cdev, "nvme%d", - device_get_unit(dev)); + status = make_dev_s(&md_args, &ctrlr->cdev, "%s", + device_get_nameunit(dev)); if (status != 0) return (ENXIO); return (0); } +/* + * Called on detach, or on error on attach. The nvme_controller won't be used + * again once we return, so we have to tear everything down (so nothing + * references this, no callbacks, etc), but don't need to reset all the state + * since nvme_controller will be freed soon. + */ void nvme_ctrlr_destruct(struct nvme_controller *ctrlr, device_t dev) { - int gone, i; + int i; + bool gone; ctrlr->is_dying = true; @@ -1498,12 +1628,18 @@ nvme_ctrlr_destruct(struct nvme_controller *ctrlr, device_t dev) goto noadminq; /* - * Check whether it is a hot unplug or a clean driver detach. - * If device is not there any more, skip any shutdown commands. + * Check whether it is a hot unplug or a clean driver detach. If device + * is not there any more, skip any shutdown commands. Some hotplug + * bridges will return zeros instead of ff's when the device is + * departing, so ask the bridge if the device is gone. Some systems can + * remove the drive w/o the bridge knowing it's gone (they don't really + * do hotplug), so failsafe with detecting all ff's (impossible with + * this hardware) as the device being gone. */ - gone = (nvme_mmio_read_4(ctrlr, csts) == NVME_GONE); + gone = bus_child_present(dev) == 0 || + (nvme_mmio_read_4(ctrlr, csts) == NVME_GONE); if (gone) - nvme_ctrlr_fail(ctrlr); + nvme_ctrlr_fail(ctrlr, true); else nvme_notify_fail_consumers(ctrlr); @@ -1529,17 +1665,17 @@ nvme_ctrlr_destruct(struct nvme_controller *ctrlr, device_t dev) nvme_admin_qpair_destroy(&ctrlr->adminq); /* - * Notify the controller of a shutdown, even though this is due to - * a driver unload, not a system shutdown (this path is not invoked - * during shutdown). This ensures the controller receives a - * shutdown notification in case the system is shutdown before - * reloading the driver. + * Notify the controller of a shutdown, even though this is due to a + * driver unload, not a system shutdown (this path is not invoked during + * shutdown). This ensures the controller receives a shutdown + * notification in case the system is shutdown before reloading the + * driver. Some NVMe drives need this to flush their cache to stable + * media and consider it a safe shutdown in SMART stats.
*/ - if (!gone) + if (!gone) { nvme_ctrlr_shutdown(ctrlr); - - if (!gone) nvme_ctrlr_disable(ctrlr); + } noadminq: if (ctrlr->taskqueue) @@ -1561,6 +1697,9 @@ noadminq: ctrlr->resource_id, ctrlr->resource); nores: + if (ctrlr->alignment_splits) + counter_u64_free(ctrlr->alignment_splits); + mtx_destroy(&ctrlr->lock); } @@ -1630,7 +1769,9 @@ nvme_ctrlr_suspend(struct nvme_controller *ctrlr) int to = hz; /* - * Can't touch failed controllers, so it's already suspended. + * Can't touch failed controllers, so it's already suspended. User will + * need to do an explicit reset to bring it back, if that's even + * possible. */ if (ctrlr->is_failed) return (0); @@ -1684,7 +1825,8 @@ nvme_ctrlr_resume(struct nvme_controller *ctrlr) /* * Now that we've reset the hardware, we can restart the controller. Any * I/O that was pending is requeued. Any admin commands are aborted with - * an error. Once we've restarted, take the controller out of reset. + * an error. Once we've restarted, stop flagging the controller as being + * in the reset phase. */ nvme_ctrlr_start(ctrlr, true); (void)atomic_cmpset_32(&ctrlr->is_resetting, 1, 0); @@ -1697,7 +1839,7 @@ fail: * itself, due to questionable APIs. */ nvme_printf(ctrlr, "Failed to reset on resume, failing.\n"); - nvme_ctrlr_fail(ctrlr); + nvme_ctrlr_fail(ctrlr, true); (void)atomic_cmpset_32(&ctrlr->is_resetting, 1, 0); return (0); } diff --git a/sys/dev/nvme/nvme_ctrlr_cmd.c b/sys/dev/nvme/nvme_ctrlr_cmd.c index 68934b9b3947..993a7718356d 100644 --- a/sys/dev/nvme/nvme_ctrlr_cmd.c +++ b/sys/dev/nvme/nvme_ctrlr_cmd.c @@ -37,7 +37,7 @@ nvme_ctrlr_cmd_identify_controller(struct nvme_controller *ctrlr, void *payload, struct nvme_command *cmd; req = nvme_allocate_request_vaddr(payload, - sizeof(struct nvme_controller_data), cb_fn, cb_arg); + sizeof(struct nvme_controller_data), M_WAITOK, cb_fn, cb_arg); cmd = &req->cmd; cmd->opc = NVME_OPC_IDENTIFY; @@ -59,7 +59,7 @@ nvme_ctrlr_cmd_identify_namespace(struct nvme_controller *ctrlr, uint32_t nsid, struct nvme_command *cmd; req = nvme_allocate_request_vaddr(payload, - sizeof(struct nvme_namespace_data), cb_fn, cb_arg); + sizeof(struct nvme_namespace_data), M_WAITOK, cb_fn, cb_arg); cmd = &req->cmd; cmd->opc = NVME_OPC_IDENTIFY; @@ -79,7 +79,7 @@ nvme_ctrlr_cmd_create_io_cq(struct nvme_controller *ctrlr, struct nvme_request *req; struct nvme_command *cmd; - req = nvme_allocate_request_null(cb_fn, cb_arg); + req = nvme_allocate_request_null(M_WAITOK, cb_fn, cb_arg); cmd = &req->cmd; cmd->opc = NVME_OPC_CREATE_IO_CQ; @@ -103,7 +103,7 @@ nvme_ctrlr_cmd_create_io_sq(struct nvme_controller *ctrlr, struct nvme_request *req; struct nvme_command *cmd; - req = nvme_allocate_request_null(cb_fn, cb_arg); + req = nvme_allocate_request_null(M_WAITOK, cb_fn, cb_arg); cmd = &req->cmd; cmd->opc = NVME_OPC_CREATE_IO_SQ; @@ -127,7 +127,7 @@ nvme_ctrlr_cmd_delete_io_cq(struct nvme_controller *ctrlr, struct nvme_request *req; struct nvme_command *cmd; - req = nvme_allocate_request_null(cb_fn, cb_arg); + req = nvme_allocate_request_null(M_WAITOK, cb_fn, cb_arg); cmd = &req->cmd; cmd->opc = NVME_OPC_DELETE_IO_CQ; @@ -148,7 +148,7 @@ nvme_ctrlr_cmd_delete_io_sq(struct nvme_controller *ctrlr, struct nvme_request *req; struct nvme_command *cmd; - req = nvme_allocate_request_null(cb_fn, cb_arg); + req = nvme_allocate_request_null(M_WAITOK, cb_fn, cb_arg); cmd = &req->cmd; cmd->opc = NVME_OPC_DELETE_IO_SQ; @@ -171,7 +171,7 @@ nvme_ctrlr_cmd_set_feature(struct nvme_controller *ctrlr, uint8_t feature, struct nvme_request *req; struct nvme_command 
*cmd; - req = nvme_allocate_request_null(cb_fn, cb_arg); + req = nvme_allocate_request_null(M_WAITOK, cb_fn, cb_arg); cmd = &req->cmd; cmd->opc = NVME_OPC_SET_FEATURES; @@ -193,7 +193,7 @@ nvme_ctrlr_cmd_get_feature(struct nvme_controller *ctrlr, uint8_t feature, struct nvme_request *req; struct nvme_command *cmd; - req = nvme_allocate_request_null(cb_fn, cb_arg); + req = nvme_allocate_request_null(M_WAITOK, cb_fn, cb_arg); cmd = &req->cmd; cmd->opc = NVME_OPC_GET_FEATURES; @@ -259,7 +259,12 @@ nvme_ctrlr_cmd_get_log_page(struct nvme_controller *ctrlr, uint8_t log_page, struct nvme_request *req; struct nvme_command *cmd; - req = nvme_allocate_request_vaddr(payload, payload_size, cb_fn, cb_arg); + /* + * XXX-MJ this should be M_WAITOK but we might be called from AER + * completion processing, which is a non-sleepable context. + */ + req = nvme_allocate_request_vaddr(payload, payload_size, + M_NOWAIT, cb_fn, cb_arg); cmd = &req->cmd; cmd->opc = NVME_OPC_GET_LOG_PAGE; @@ -319,7 +324,11 @@ nvme_ctrlr_cmd_abort(struct nvme_controller *ctrlr, uint16_t cid, struct nvme_request *req; struct nvme_command *cmd; - req = nvme_allocate_request_null(cb_fn, cb_arg); + /* + * XXX-MJ this should be M_WAITOK, we do reset from non-sleepable + * context and abort commands as part of that. + */ + req = nvme_allocate_request_null(M_NOWAIT, cb_fn, cb_arg); cmd = &req->cmd; cmd->opc = NVME_OPC_ABORT; diff --git a/sys/dev/nvme/nvme_linux.h b/sys/dev/nvme/nvme_linux.h new file mode 100644 index 000000000000..aaa68e1d34f8 --- /dev/null +++ b/sys/dev/nvme/nvme_linux.h @@ -0,0 +1,58 @@ +/*- + * Copyright (c) 2024, Netflix Inc. + * Written by Warner Losh + * + * SPDX-License-Identifier: BSD-2-Clause + */ + +/* + * Linux compatible NVME ioctls. So far we just support ID, ADMIN_CMD and + * IO_CMD. The rest are not supported. 
+ */ + + +#include <sys/ioccom.h> +#include <sys/_types.h> + +struct nvme_passthru_cmd { + __uint8_t opcode; + __uint8_t flags; + __uint16_t rsvd1; + __uint32_t nsid; + __uint32_t cdw2; + __uint32_t cdw3; + __uint64_t metadata; + __uint64_t addr; + __uint32_t metadata_len; + __uint32_t data_len; + __uint32_t cdw10; + __uint32_t cdw11; + __uint32_t cdw12; + __uint32_t cdw13; + __uint32_t cdw14; + __uint32_t cdw15; + __uint32_t timeout_ms; + __uint32_t result; +}; + +#define nvme_admin_cmd nvme_passthru_cmd + +/* + * Linux nvme ioctls, commented out ones are not supported + */ +#define NVME_IOCTL_ID _IO('N', 0x40) +#define NVME_IOCTL_ADMIN_CMD _IOWR('N', 0x41, struct nvme_admin_cmd) +/* #define NVME_IOCTL_SUBMIT_IO _IOW('N', 0x42, struct nvme_user_io) */ +#define NVME_IOCTL_IO_CMD _IOWR('N', 0x43, struct nvme_passthru_cmd) +#define NVME_IOCTL_RESET _IO('N', 0x44) +/* #define NVME_IOCTL_SUBSYS_RESET _IO('N', 0x45) */ +/* #define NVME_IOCTL_RESCAN _IO('N', 0x46) */ +/* #define NVME_IOCTL_ADMIN64_CMD _IOWR('N', 0x47, struct nvme_passthru_cmd64) */ +/* #define NVME_IOCTL_IO64_CMD _IOWR('N', 0x48, struct nvme_passthru_cmd64) */ +/* #define NVME_IOCTL_IO64_CMD_VEC _IOWR('N', 0x49, struct nvme_passthru_cmd64) */ + +/* io_uring async commands: */ +/* #define NVME_URING_CMD_IO _IOWR('N', 0x80, struct nvme_uring_cmd) */ +/* #define NVME_URING_CMD_IO_VEC _IOWR('N', 0x81, struct nvme_uring_cmd) */ +/* #define NVME_URING_CMD_ADMIN _IOWR('N', 0x82, struct nvme_uring_cmd) */ +/* #define NVME_URING_CMD_ADMIN_VEC _IOWR('N', 0x83, struct nvme_uring_cmd) */ diff --git a/sys/dev/nvme/nvme_ns.c b/sys/dev/nvme/nvme_ns.c index 360b9f982c20..3f29382fe42f 100644 --- a/sys/dev/nvme/nvme_ns.c +++ b/sys/dev/nvme/nvme_ns.c @@ -43,6 +43,7 @@ #include <geom/geom.h> #include "nvme_private.h" +#include "nvme_linux.h" static void nvme_bio_child_inbed(struct bio *parent, int bio_error); static void nvme_bio_child_done(void *arg, @@ -82,9 +83,8 @@ nvme_ns_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int flag, case NVME_GET_NSID: { struct nvme_get_nsid *gnsid = (struct nvme_get_nsid *)arg; - strncpy(gnsid->cdev, device_get_nameunit(ctrlr->dev), + strlcpy(gnsid->cdev, device_get_nameunit(ctrlr->dev), sizeof(gnsid->cdev)); - gnsid->cdev[sizeof(gnsid->cdev) - 1] = '\0'; gnsid->nsid = ns->id; break; } @@ -94,6 +94,18 @@ nvme_ns_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int flag, case DIOCGSECTORSIZE: *(u_int *)arg = nvme_ns_get_sector_size(ns); break; + /* Linux Compatible (see nvme_linux.h) */ + case NVME_IOCTL_ID: + td->td_retval[0] = ns->id; + return (0); + + case NVME_IOCTL_ADMIN_CMD: + case NVME_IOCTL_IO_CMD: { + struct nvme_passthru_cmd *npc = (struct nvme_passthru_cmd *)arg; + + return (nvme_ctrlr_linux_passthru_cmd(ctrlr, npc, ns->id, true, + cmd == NVME_IOCTL_ADMIN_CMD)); + } default: return (ENOTTY); } @@ -429,6 +441,7 @@ nvme_ns_split_bio(struct nvme_namespace *ns, struct bio *bp, if (child_bios == NULL) return (ENOMEM); + counter_u64_add(ns->ctrlr->alignment_splits, 1); for (i = 0; i < num_bios; i++) { child = child_bios[i]; err = nvme_ns_bio_process(ns, child, nvme_bio_child_done); @@ -604,11 +617,12 @@ nvme_ns_construct(struct nvme_namespace *ns, uint32_t id, md_args.mda_unit = unit; md_args.mda_mode = 0600; md_args.mda_si_drv1 = ns; - res = make_dev_s(&md_args, &ns->cdev, "nvme%dns%d", - device_get_unit(ctrlr->dev), ns->id); + res = make_dev_s(&md_args, &ns->cdev, "%sn%d", + device_get_nameunit(ctrlr->dev), ns->id); if (res != 0) return (ENXIO); - + ns->cdev->si_drv2 = make_dev_alias(ns->cdev, "%sns%d", 
+ device_get_nameunit(ctrlr->dev), ns->id); ns->cdev->si_flags |= SI_UNMAPPED; return (0); @@ -618,6 +632,9 @@ void nvme_ns_destruct(struct nvme_namespace *ns) { - if (ns->cdev != NULL) + if (ns->cdev != NULL) { + if (ns->cdev->si_drv2 != NULL) + destroy_dev(ns->cdev->si_drv2); destroy_dev(ns->cdev); + } } diff --git a/sys/dev/nvme/nvme_ns_cmd.c b/sys/dev/nvme/nvme_ns_cmd.c index 8cbeac025307..1bad9929cb09 100644 --- a/sys/dev/nvme/nvme_ns_cmd.c +++ b/sys/dev/nvme/nvme_ns_cmd.c @@ -36,8 +36,7 @@ nvme_ns_cmd_read(struct nvme_namespace *ns, void *payload, uint64_t lba, struct nvme_request *req; req = nvme_allocate_request_vaddr(payload, - lba_count*nvme_ns_get_sector_size(ns), cb_fn, cb_arg); - + lba_count * nvme_ns_get_sector_size(ns), M_NOWAIT, cb_fn, cb_arg); if (req == NULL) return (ENOMEM); @@ -56,11 +55,9 @@ nvme_ns_cmd_read_bio(struct nvme_namespace *ns, struct bio *bp, uint64_t lba; uint64_t lba_count; - req = nvme_allocate_request_bio(bp, cb_fn, cb_arg); - + req = nvme_allocate_request_bio(bp, M_NOWAIT, cb_fn, cb_arg); if (req == NULL) return (ENOMEM); - lba = bp->bio_offset / nvme_ns_get_sector_size(ns); lba_count = bp->bio_bcount / nvme_ns_get_sector_size(ns); nvme_ns_read_cmd(&req->cmd, ns->id, lba, lba_count); @@ -77,8 +74,7 @@ nvme_ns_cmd_write(struct nvme_namespace *ns, void *payload, uint64_t lba, struct nvme_request *req; req = nvme_allocate_request_vaddr(payload, - lba_count*nvme_ns_get_sector_size(ns), cb_fn, cb_arg); - + lba_count * nvme_ns_get_sector_size(ns), M_NOWAIT, cb_fn, cb_arg); if (req == NULL) return (ENOMEM); @@ -97,8 +93,7 @@ nvme_ns_cmd_write_bio(struct nvme_namespace *ns, struct bio *bp, uint64_t lba; uint64_t lba_count; - req = nvme_allocate_request_bio(bp, cb_fn, cb_arg); - + req = nvme_allocate_request_bio(bp, M_NOWAIT, cb_fn, cb_arg); if (req == NULL) return (ENOMEM); lba = bp->bio_offset / nvme_ns_get_sector_size(ns); @@ -118,8 +113,8 @@ nvme_ns_cmd_deallocate(struct nvme_namespace *ns, void *payload, struct nvme_command *cmd; req = nvme_allocate_request_vaddr(payload, - num_ranges * sizeof(struct nvme_dsm_range), cb_fn, cb_arg); - + num_ranges * sizeof(struct nvme_dsm_range), M_NOWAIT, cb_fn, + cb_arg); if (req == NULL) return (ENOMEM); @@ -141,8 +136,7 @@ nvme_ns_cmd_flush(struct nvme_namespace *ns, nvme_cb_fn_t cb_fn, void *cb_arg) { struct nvme_request *req; - req = nvme_allocate_request_null(cb_fn, cb_arg); - + req = nvme_allocate_request_null(M_NOWAIT, cb_fn, cb_arg); if (req == NULL) return (ENOMEM); @@ -165,8 +159,8 @@ nvme_ns_dump(struct nvme_namespace *ns, void *virt, off_t offset, size_t len) int i; status.done = FALSE; - req = nvme_allocate_request_vaddr(virt, len, nvme_completion_poll_cb, - &status); + req = nvme_allocate_request_vaddr(virt, len, M_NOWAIT, + nvme_completion_poll_cb, &status); if (req == NULL) return (ENOMEM); diff --git a/sys/dev/nvme/nvme_private.h b/sys/dev/nvme/nvme_private.h index 69141add4e48..949e69ec9290 100644 --- a/sys/dev/nvme/nvme_private.h +++ b/sys/dev/nvme/nvme_private.h @@ -32,6 +32,7 @@ #include <sys/param.h> #include <sys/bio.h> #include <sys/bus.h> +#include <sys/counter.h> #include <sys/kernel.h> #include <sys/lock.h> #include <sys/malloc.h> @@ -297,11 +298,15 @@ struct nvme_controller { void *cons_cookie[NVME_MAX_CONSUMERS]; uint32_t is_resetting; - uint32_t is_initialized; uint32_t notification_sent; + u_int fail_on_reset; bool is_failed; + bool is_failed_admin; bool is_dying; + bool isr_warned; + bool is_initialized; + STAILQ_HEAD(, nvme_request) fail_req; /* Host Memory Buffer */ @@ -317,6 +322,9 @@ 
struct nvme_controller { bus_dmamap_t hmb_desc_map; struct nvme_hmb_desc *hmb_desc_vaddr; uint64_t hmb_desc_paddr; + + /* Statistics */ + counter_u64_t alignment_splits; }; #define nvme_mmio_offsetof(reg) \ @@ -413,9 +421,6 @@ void nvme_qpair_submit_request(struct nvme_qpair *qpair, struct nvme_request *req); void nvme_qpair_reset(struct nvme_qpair *qpair); void nvme_qpair_fail(struct nvme_qpair *qpair); -void nvme_qpair_manual_complete_request(struct nvme_qpair *qpair, - struct nvme_request *req, - uint32_t sct, uint32_t sc); void nvme_admin_qpair_enable(struct nvme_qpair *qpair); void nvme_admin_qpair_disable(struct nvme_qpair *qpair); @@ -481,11 +486,14 @@ nvme_single_map(void *arg, bus_dma_segment_t *seg, int nseg, int error) } static __inline struct nvme_request * -_nvme_allocate_request(nvme_cb_fn_t cb_fn, void *cb_arg) +_nvme_allocate_request(const int how, nvme_cb_fn_t cb_fn, void *cb_arg) { struct nvme_request *req; - req = malloc(sizeof(*req), M_NVME, M_NOWAIT | M_ZERO); + KASSERT(how == M_WAITOK || how == M_NOWAIT, + ("nvme_allocate_request: invalid how %d", how)); + + req = malloc(sizeof(*req), M_NVME, how | M_ZERO); if (req != NULL) { req->cb_fn = cb_fn; req->cb_arg = cb_arg; @@ -496,11 +504,11 @@ _nvme_allocate_request(nvme_cb_fn_t cb_fn, void *cb_arg) static __inline struct nvme_request * nvme_allocate_request_vaddr(void *payload, uint32_t payload_size, - nvme_cb_fn_t cb_fn, void *cb_arg) + const int how, nvme_cb_fn_t cb_fn, void *cb_arg) { struct nvme_request *req; - req = _nvme_allocate_request(cb_fn, cb_arg); + req = _nvme_allocate_request(how, cb_fn, cb_arg); if (req != NULL) { req->payload = memdesc_vaddr(payload, payload_size); req->payload_valid = true; @@ -509,20 +517,21 @@ nvme_allocate_request_vaddr(void *payload, uint32_t payload_size, } static __inline struct nvme_request * -nvme_allocate_request_null(nvme_cb_fn_t cb_fn, void *cb_arg) +nvme_allocate_request_null(const int how, nvme_cb_fn_t cb_fn, void *cb_arg) { struct nvme_request *req; - req = _nvme_allocate_request(cb_fn, cb_arg); + req = _nvme_allocate_request(how, cb_fn, cb_arg); return (req); } static __inline struct nvme_request * -nvme_allocate_request_bio(struct bio *bio, nvme_cb_fn_t cb_fn, void *cb_arg) +nvme_allocate_request_bio(struct bio *bio, const int how, nvme_cb_fn_t cb_fn, + void *cb_arg) { struct nvme_request *req; - req = _nvme_allocate_request(cb_fn, cb_arg); + req = _nvme_allocate_request(how, cb_fn, cb_arg); if (req != NULL) { req->payload = memdesc_bio(bio); req->payload_valid = true; @@ -531,16 +540,16 @@ nvme_allocate_request_bio(struct bio *bio, nvme_cb_fn_t cb_fn, void *cb_arg) } static __inline struct nvme_request * -nvme_allocate_request_ccb(union ccb *ccb, nvme_cb_fn_t cb_fn, void *cb_arg) +nvme_allocate_request_ccb(union ccb *ccb, const int how, nvme_cb_fn_t cb_fn, + void *cb_arg) { struct nvme_request *req; - req = _nvme_allocate_request(cb_fn, cb_arg); + req = _nvme_allocate_request(how, cb_fn, cb_arg); if (req != NULL) { req->payload = memdesc_ccb(ccb); req->payload_valid = true; } - return (req); } diff --git a/sys/dev/nvme/nvme_qpair.c b/sys/dev/nvme/nvme_qpair.c index 62d27e439180..bd8626e32209 100644 --- a/sys/dev/nvme/nvme_qpair.c +++ b/sys/dev/nvme/nvme_qpair.c @@ -31,6 +31,7 @@ #include <sys/conf.h> #include <sys/domainset.h> #include <sys/proc.h> +#include <sys/sbuf.h> #include <dev/pci/pcivar.h> @@ -43,96 +44,36 @@ static void _nvme_qpair_submit_request(struct nvme_qpair *qpair, struct nvme_request *req); static void nvme_qpair_destroy(struct nvme_qpair *qpair); 
-#define DEFAULT_INDEX 256 -#define DEFAULT_ENTRY(x) [DEFAULT_INDEX] = x -#define OPC_ENTRY(x) [NVME_OPC_ ## x] = #x - -static const char *admin_opcode[DEFAULT_INDEX + 1] = { - OPC_ENTRY(DELETE_IO_SQ), - OPC_ENTRY(CREATE_IO_SQ), - OPC_ENTRY(GET_LOG_PAGE), - OPC_ENTRY(DELETE_IO_CQ), - OPC_ENTRY(CREATE_IO_CQ), - OPC_ENTRY(IDENTIFY), - OPC_ENTRY(ABORT), - OPC_ENTRY(SET_FEATURES), - OPC_ENTRY(GET_FEATURES), - OPC_ENTRY(ASYNC_EVENT_REQUEST), - OPC_ENTRY(NAMESPACE_MANAGEMENT), - OPC_ENTRY(FIRMWARE_ACTIVATE), - OPC_ENTRY(FIRMWARE_IMAGE_DOWNLOAD), - OPC_ENTRY(DEVICE_SELF_TEST), - OPC_ENTRY(NAMESPACE_ATTACHMENT), - OPC_ENTRY(KEEP_ALIVE), - OPC_ENTRY(DIRECTIVE_SEND), - OPC_ENTRY(DIRECTIVE_RECEIVE), - OPC_ENTRY(VIRTUALIZATION_MANAGEMENT), - OPC_ENTRY(NVME_MI_SEND), - OPC_ENTRY(NVME_MI_RECEIVE), - OPC_ENTRY(CAPACITY_MANAGEMENT), - OPC_ENTRY(LOCKDOWN), - OPC_ENTRY(DOORBELL_BUFFER_CONFIG), - OPC_ENTRY(FABRICS_COMMANDS), - OPC_ENTRY(FORMAT_NVM), - OPC_ENTRY(SECURITY_SEND), - OPC_ENTRY(SECURITY_RECEIVE), - OPC_ENTRY(SANITIZE), - OPC_ENTRY(GET_LBA_STATUS), - DEFAULT_ENTRY("ADMIN COMMAND"), -}; - -static const char *io_opcode[DEFAULT_INDEX + 1] = { - OPC_ENTRY(FLUSH), - OPC_ENTRY(WRITE), - OPC_ENTRY(READ), - OPC_ENTRY(WRITE_UNCORRECTABLE), - OPC_ENTRY(COMPARE), - OPC_ENTRY(WRITE_ZEROES), - OPC_ENTRY(DATASET_MANAGEMENT), - OPC_ENTRY(VERIFY), - OPC_ENTRY(RESERVATION_REGISTER), - OPC_ENTRY(RESERVATION_REPORT), - OPC_ENTRY(RESERVATION_ACQUIRE), - OPC_ENTRY(RESERVATION_RELEASE), - OPC_ENTRY(COPY), - DEFAULT_ENTRY("IO COMMAND"), -}; - -static const char * -get_opcode_string(const char *op[DEFAULT_INDEX + 1], uint16_t opc) -{ - const char *nm = opc < DEFAULT_INDEX ? op[opc] : op[DEFAULT_INDEX]; - - return (nm != NULL ? nm : op[DEFAULT_INDEX]); -} - static const char * -get_admin_opcode_string(uint16_t opc) +get_opcode_string(bool admin, uint8_t opc, char *buf, size_t len) { - return (get_opcode_string(admin_opcode, opc)); -} + struct sbuf sb; -static const char * -get_io_opcode_string(uint16_t opc) -{ - return (get_opcode_string(io_opcode, opc)); + sbuf_new(&sb, buf, len, SBUF_FIXEDLEN); + nvme_opcode_sbuf(admin, opc, &sb); + if (sbuf_finish(&sb) != 0) + return (""); + return (buf); } static void nvme_admin_qpair_print_command(struct nvme_qpair *qpair, struct nvme_command *cmd) { + char buf[64]; - nvme_printf(qpair->ctrlr, "%s (%02x) sqid:%d cid:%d nsid:%x " + nvme_printf(qpair->ctrlr, "%s sqid:%d cid:%d nsid:%x " "cdw10:%08x cdw11:%08x\n", - get_admin_opcode_string(cmd->opc), cmd->opc, qpair->id, cmd->cid, - le32toh(cmd->nsid), le32toh(cmd->cdw10), le32toh(cmd->cdw11)); + get_opcode_string(true, cmd->opc, buf, sizeof(buf)), qpair->id, + cmd->cid, le32toh(cmd->nsid), le32toh(cmd->cdw10), + le32toh(cmd->cdw11)); } static void nvme_io_qpair_print_command(struct nvme_qpair *qpair, struct nvme_command *cmd) { + char buf[64]; switch (cmd->opc) { case NVME_OPC_WRITE: @@ -143,23 +84,15 @@ nvme_io_qpair_print_command(struct nvme_qpair *qpair, case NVME_OPC_VERIFY: nvme_printf(qpair->ctrlr, "%s sqid:%d cid:%d nsid:%d " "lba:%llu len:%d\n", - get_io_opcode_string(cmd->opc), qpair->id, cmd->cid, le32toh(cmd->nsid), + get_opcode_string(false, cmd->opc, buf, sizeof(buf)), + qpair->id, cmd->cid, le32toh(cmd->nsid), ((unsigned long long)le32toh(cmd->cdw11) << 32) + le32toh(cmd->cdw10), (le32toh(cmd->cdw12) & 0xFFFF) + 1); break; - case NVME_OPC_FLUSH: - case NVME_OPC_DATASET_MANAGEMENT: - case NVME_OPC_RESERVATION_REGISTER: - case NVME_OPC_RESERVATION_REPORT: - case NVME_OPC_RESERVATION_ACQUIRE: - case 
NVME_OPC_RESERVATION_RELEASE: - nvme_printf(qpair->ctrlr, "%s sqid:%d cid:%d nsid:%d\n", - get_io_opcode_string(cmd->opc), qpair->id, cmd->cid, le32toh(cmd->nsid)); - break; default: - nvme_printf(qpair->ctrlr, "%s (%02x) sqid:%d cid:%d nsid:%d\n", - get_io_opcode_string(cmd->opc), cmd->opc, qpair->id, - cmd->cid, le32toh(cmd->nsid)); + nvme_printf(qpair->ctrlr, "%s sqid:%d cid:%d nsid:%d\n", + get_opcode_string(false, cmd->opc, buf, sizeof(buf)), + qpair->id, cmd->cid, le32toh(cmd->nsid)); break; } } @@ -183,170 +116,33 @@ nvme_qpair_print_command(struct nvme_qpair *qpair, struct nvme_command *cmd) } } -struct nvme_status_string { - uint16_t sc; - const char * str; -}; - -static struct nvme_status_string generic_status[] = { - { NVME_SC_SUCCESS, "SUCCESS" }, - { NVME_SC_INVALID_OPCODE, "INVALID OPCODE" }, - { NVME_SC_INVALID_FIELD, "INVALID_FIELD" }, - { NVME_SC_COMMAND_ID_CONFLICT, "COMMAND ID CONFLICT" }, - { NVME_SC_DATA_TRANSFER_ERROR, "DATA TRANSFER ERROR" }, - { NVME_SC_ABORTED_POWER_LOSS, "ABORTED - POWER LOSS" }, - { NVME_SC_INTERNAL_DEVICE_ERROR, "INTERNAL DEVICE ERROR" }, - { NVME_SC_ABORTED_BY_REQUEST, "ABORTED - BY REQUEST" }, - { NVME_SC_ABORTED_SQ_DELETION, "ABORTED - SQ DELETION" }, - { NVME_SC_ABORTED_FAILED_FUSED, "ABORTED - FAILED FUSED" }, - { NVME_SC_ABORTED_MISSING_FUSED, "ABORTED - MISSING FUSED" }, - { NVME_SC_INVALID_NAMESPACE_OR_FORMAT, "INVALID NAMESPACE OR FORMAT" }, - { NVME_SC_COMMAND_SEQUENCE_ERROR, "COMMAND SEQUENCE ERROR" }, - { NVME_SC_INVALID_SGL_SEGMENT_DESCR, "INVALID SGL SEGMENT DESCRIPTOR" }, - { NVME_SC_INVALID_NUMBER_OF_SGL_DESCR, "INVALID NUMBER OF SGL DESCRIPTORS" }, - { NVME_SC_DATA_SGL_LENGTH_INVALID, "DATA SGL LENGTH INVALID" }, - { NVME_SC_METADATA_SGL_LENGTH_INVALID, "METADATA SGL LENGTH INVALID" }, - { NVME_SC_SGL_DESCRIPTOR_TYPE_INVALID, "SGL DESCRIPTOR TYPE INVALID" }, - { NVME_SC_INVALID_USE_OF_CMB, "INVALID USE OF CONTROLLER MEMORY BUFFER" }, - { NVME_SC_PRP_OFFET_INVALID, "PRP OFFET INVALID" }, - { NVME_SC_ATOMIC_WRITE_UNIT_EXCEEDED, "ATOMIC WRITE UNIT EXCEEDED" }, - { NVME_SC_OPERATION_DENIED, "OPERATION DENIED" }, - { NVME_SC_SGL_OFFSET_INVALID, "SGL OFFSET INVALID" }, - { NVME_SC_HOST_ID_INCONSISTENT_FORMAT, "HOST IDENTIFIER INCONSISTENT FORMAT" }, - { NVME_SC_KEEP_ALIVE_TIMEOUT_EXPIRED, "KEEP ALIVE TIMEOUT EXPIRED" }, - { NVME_SC_KEEP_ALIVE_TIMEOUT_INVALID, "KEEP ALIVE TIMEOUT INVALID" }, - { NVME_SC_ABORTED_DUE_TO_PREEMPT, "COMMAND ABORTED DUE TO PREEMPT AND ABORT" }, - { NVME_SC_SANITIZE_FAILED, "SANITIZE FAILED" }, - { NVME_SC_SANITIZE_IN_PROGRESS, "SANITIZE IN PROGRESS" }, - { NVME_SC_SGL_DATA_BLOCK_GRAN_INVALID, "SGL_DATA_BLOCK_GRANULARITY_INVALID" }, - { NVME_SC_NOT_SUPPORTED_IN_CMB, "COMMAND NOT SUPPORTED FOR QUEUE IN CMB" }, - { NVME_SC_NAMESPACE_IS_WRITE_PROTECTED, "NAMESPACE IS WRITE PROTECTED" }, - { NVME_SC_COMMAND_INTERRUPTED, "COMMAND INTERRUPTED" }, - { NVME_SC_TRANSIENT_TRANSPORT_ERROR, "TRANSIENT TRANSPORT ERROR" }, - - { NVME_SC_LBA_OUT_OF_RANGE, "LBA OUT OF RANGE" }, - { NVME_SC_CAPACITY_EXCEEDED, "CAPACITY EXCEEDED" }, - { NVME_SC_NAMESPACE_NOT_READY, "NAMESPACE NOT READY" }, - { NVME_SC_RESERVATION_CONFLICT, "RESERVATION CONFLICT" }, - { NVME_SC_FORMAT_IN_PROGRESS, "FORMAT IN PROGRESS" }, - { 0xFFFF, "GENERIC" } -}; - -static struct nvme_status_string command_specific_status[] = { - { NVME_SC_COMPLETION_QUEUE_INVALID, "INVALID COMPLETION QUEUE" }, - { NVME_SC_INVALID_QUEUE_IDENTIFIER, "INVALID QUEUE IDENTIFIER" }, - { NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED, "MAX QUEUE SIZE EXCEEDED" }, - { 
NVME_SC_ABORT_COMMAND_LIMIT_EXCEEDED, "ABORT CMD LIMIT EXCEEDED" }, - { NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED, "ASYNC LIMIT EXCEEDED" }, - { NVME_SC_INVALID_FIRMWARE_SLOT, "INVALID FIRMWARE SLOT" }, - { NVME_SC_INVALID_FIRMWARE_IMAGE, "INVALID FIRMWARE IMAGE" }, - { NVME_SC_INVALID_INTERRUPT_VECTOR, "INVALID INTERRUPT VECTOR" }, - { NVME_SC_INVALID_LOG_PAGE, "INVALID LOG PAGE" }, - { NVME_SC_INVALID_FORMAT, "INVALID FORMAT" }, - { NVME_SC_FIRMWARE_REQUIRES_RESET, "FIRMWARE REQUIRES RESET" }, - { NVME_SC_INVALID_QUEUE_DELETION, "INVALID QUEUE DELETION" }, - { NVME_SC_FEATURE_NOT_SAVEABLE, "FEATURE IDENTIFIER NOT SAVEABLE" }, - { NVME_SC_FEATURE_NOT_CHANGEABLE, "FEATURE NOT CHANGEABLE" }, - { NVME_SC_FEATURE_NOT_NS_SPECIFIC, "FEATURE NOT NAMESPACE SPECIFIC" }, - { NVME_SC_FW_ACT_REQUIRES_NVMS_RESET, "FIRMWARE ACTIVATION REQUIRES NVM SUBSYSTEM RESET" }, - { NVME_SC_FW_ACT_REQUIRES_RESET, "FIRMWARE ACTIVATION REQUIRES RESET" }, - { NVME_SC_FW_ACT_REQUIRES_TIME, "FIRMWARE ACTIVATION REQUIRES MAXIMUM TIME VIOLATION" }, - { NVME_SC_FW_ACT_PROHIBITED, "FIRMWARE ACTIVATION PROHIBITED" }, - { NVME_SC_OVERLAPPING_RANGE, "OVERLAPPING RANGE" }, - { NVME_SC_NS_INSUFFICIENT_CAPACITY, "NAMESPACE INSUFFICIENT CAPACITY" }, - { NVME_SC_NS_ID_UNAVAILABLE, "NAMESPACE IDENTIFIER UNAVAILABLE" }, - { NVME_SC_NS_ALREADY_ATTACHED, "NAMESPACE ALREADY ATTACHED" }, - { NVME_SC_NS_IS_PRIVATE, "NAMESPACE IS PRIVATE" }, - { NVME_SC_NS_NOT_ATTACHED, "NS NOT ATTACHED" }, - { NVME_SC_THIN_PROV_NOT_SUPPORTED, "THIN PROVISIONING NOT SUPPORTED" }, - { NVME_SC_CTRLR_LIST_INVALID, "CONTROLLER LIST INVALID" }, - { NVME_SC_SELF_TEST_IN_PROGRESS, "DEVICE SELF-TEST IN PROGRESS" }, - { NVME_SC_BOOT_PART_WRITE_PROHIB, "BOOT PARTITION WRITE PROHIBITED" }, - { NVME_SC_INVALID_CTRLR_ID, "INVALID CONTROLLER IDENTIFIER" }, - { NVME_SC_INVALID_SEC_CTRLR_STATE, "INVALID SECONDARY CONTROLLER STATE" }, - { NVME_SC_INVALID_NUM_OF_CTRLR_RESRC, "INVALID NUMBER OF CONTROLLER RESOURCES" }, - { NVME_SC_INVALID_RESOURCE_ID, "INVALID RESOURCE IDENTIFIER" }, - { NVME_SC_SANITIZE_PROHIBITED_WPMRE, "SANITIZE PROHIBITED WRITE PERSISTENT MEMORY REGION ENABLED" }, - { NVME_SC_ANA_GROUP_ID_INVALID, "ANA GROUP IDENTIFIED INVALID" }, - { NVME_SC_ANA_ATTACH_FAILED, "ANA ATTACH FAILED" }, - - { NVME_SC_CONFLICTING_ATTRIBUTES, "CONFLICTING ATTRIBUTES" }, - { NVME_SC_INVALID_PROTECTION_INFO, "INVALID PROTECTION INFO" }, - { NVME_SC_ATTEMPTED_WRITE_TO_RO_PAGE, "WRITE TO RO PAGE" }, - { 0xFFFF, "COMMAND SPECIFIC" } -}; - -static struct nvme_status_string media_error_status[] = { - { NVME_SC_WRITE_FAULTS, "WRITE FAULTS" }, - { NVME_SC_UNRECOVERED_READ_ERROR, "UNRECOVERED READ ERROR" }, - { NVME_SC_GUARD_CHECK_ERROR, "GUARD CHECK ERROR" }, - { NVME_SC_APPLICATION_TAG_CHECK_ERROR, "APPLICATION TAG CHECK ERROR" }, - { NVME_SC_REFERENCE_TAG_CHECK_ERROR, "REFERENCE TAG CHECK ERROR" }, - { NVME_SC_COMPARE_FAILURE, "COMPARE FAILURE" }, - { NVME_SC_ACCESS_DENIED, "ACCESS DENIED" }, - { NVME_SC_DEALLOCATED_OR_UNWRITTEN, "DEALLOCATED OR UNWRITTEN LOGICAL BLOCK" }, - { 0xFFFF, "MEDIA ERROR" } -}; - -static struct nvme_status_string path_related_status[] = { - { NVME_SC_INTERNAL_PATH_ERROR, "INTERNAL PATH ERROR" }, - { NVME_SC_ASYMMETRIC_ACCESS_PERSISTENT_LOSS, "ASYMMETRIC ACCESS PERSISTENT LOSS" }, - { NVME_SC_ASYMMETRIC_ACCESS_INACCESSIBLE, "ASYMMETRIC ACCESS INACCESSIBLE" }, - { NVME_SC_ASYMMETRIC_ACCESS_TRANSITION, "ASYMMETRIC ACCESS TRANSITION" }, - { NVME_SC_CONTROLLER_PATHING_ERROR, "CONTROLLER PATHING ERROR" }, - { NVME_SC_HOST_PATHING_ERROR, "HOST PATHING ERROR" }, 
- { NVME_SC_COMMAND_ABORTED_BY_HOST, "COMMAND ABORTED BY HOST" }, - { 0xFFFF, "PATH RELATED" }, -}; - static const char * -get_status_string(uint16_t sct, uint16_t sc) +get_status_string(const struct nvme_completion *cpl, char *buf, size_t len) { - struct nvme_status_string *entry; + struct sbuf sb; - switch (sct) { - case NVME_SCT_GENERIC: - entry = generic_status; - break; - case NVME_SCT_COMMAND_SPECIFIC: - entry = command_specific_status; - break; - case NVME_SCT_MEDIA_ERROR: - entry = media_error_status; - break; - case NVME_SCT_PATH_RELATED: - entry = path_related_status; - break; - case NVME_SCT_VENDOR_SPECIFIC: - return ("VENDOR SPECIFIC"); - default: - return ("RESERVED"); - } - - while (entry->sc != 0xFFFF) { - if (entry->sc == sc) - return (entry->str); - entry++; - } - return (entry->str); + sbuf_new(&sb, buf, len, SBUF_FIXEDLEN); + nvme_sc_sbuf(cpl, &sb); + if (sbuf_finish(&sb) != 0) + return (""); + return (buf); } void nvme_qpair_print_completion(struct nvme_qpair *qpair, struct nvme_completion *cpl) { - uint8_t sct, sc, crd, m, dnr, p; + char buf[64]; + uint8_t crd, m, dnr, p; - sct = NVME_STATUS_GET_SCT(cpl->status); - sc = NVME_STATUS_GET_SC(cpl->status); crd = NVME_STATUS_GET_CRD(cpl->status); m = NVME_STATUS_GET_M(cpl->status); dnr = NVME_STATUS_GET_DNR(cpl->status); p = NVME_STATUS_GET_P(cpl->status); - nvme_printf(qpair->ctrlr, "%s (%02x/%02x) crd:%x m:%x dnr:%x p:%d " + nvme_printf(qpair->ctrlr, "%s crd:%x m:%x dnr:%x p:%d " "sqid:%d cid:%d cdw0:%x\n", - get_status_string(sct, sc), sct, sc, crd, m, dnr, p, + get_status_string(cpl, buf, sizeof(buf)), crd, m, dnr, p, cpl->sqid, cpl->cid, cpl->cdw0); } @@ -414,10 +210,12 @@ static void nvme_qpair_complete_tracker(struct nvme_tracker *tr, struct nvme_completion *cpl, error_print_t print_on_error) { - struct nvme_qpair * qpair = tr->qpair; + struct nvme_qpair *qpair = tr->qpair; struct nvme_request *req; bool retry, error, retriable; + mtx_assert(&qpair->lock, MA_NOTOWNED); + req = tr->req; error = nvme_completion_is_error(cpl); retriable = nvme_completion_is_retry(cpl); @@ -480,43 +278,52 @@ nvme_qpair_complete_tracker(struct nvme_tracker *tr, mtx_unlock(&qpair->lock); } +static uint32_t +nvme_qpair_make_status(uint32_t sct, uint32_t sc, uint32_t dnr) +{ + uint32_t status = 0; + + status |= NVMEF(NVME_STATUS_SCT, sct); + status |= NVMEF(NVME_STATUS_SC, sc); + status |= NVMEF(NVME_STATUS_DNR, dnr); + /* M=0 : this is artificial so no data in error log page */ + /* CRD=0 : this is artificial and no delayed retry support anyway */ + /* P=0 : phase not checked */ + return (status); +} + static void nvme_qpair_manual_complete_tracker( struct nvme_tracker *tr, uint32_t sct, uint32_t sc, uint32_t dnr, error_print_t print_on_error) { struct nvme_completion cpl; + struct nvme_qpair * qpair = tr->qpair; - memset(&cpl, 0, sizeof(cpl)); + mtx_assert(&qpair->lock, MA_NOTOWNED); - struct nvme_qpair * qpair = tr->qpair; + memset(&cpl, 0, sizeof(cpl)); cpl.sqid = qpair->id; cpl.cid = tr->cid; - cpl.status |= NVMEF(NVME_STATUS_SCT, sct); - cpl.status |= NVMEF(NVME_STATUS_SC, sc); - cpl.status |= NVMEF(NVME_STATUS_DNR, dnr); - /* M=0 : this is artificial so no data in error log page */ - /* CRD=0 : this is artificial and no delayed retry support anyway */ - /* P=0 : phase not checked */ + cpl.status = nvme_qpair_make_status(sct, sc, dnr); nvme_qpair_complete_tracker(tr, &cpl, print_on_error); } -void +static void nvme_qpair_manual_complete_request(struct nvme_qpair *qpair, - struct nvme_request *req, uint32_t sct, uint32_t sc) + struct 
nvme_request *req, uint32_t sct, uint32_t sc, uint32_t dnr, + error_print_t print_on_error) { struct nvme_completion cpl; bool error; memset(&cpl, 0, sizeof(cpl)); cpl.sqid = qpair->id; - cpl.status |= NVMEF(NVME_STATUS_SCT, sct); - cpl.status |= NVMEF(NVME_STATUS_SC, sc); - + cpl.status = nvme_qpair_make_status(sct, sc, dnr); error = nvme_completion_is_error(&cpl); - if (error) { + if (error && print_on_error == ERROR_PRINT_ALL) { nvme_qpair_print_command(qpair, &req->cmd); nvme_qpair_print_completion(qpair, &cpl); } @@ -679,7 +486,7 @@ _nvme_qpair_process_completions(struct nvme_qpair *qpair) bool nvme_qpair_process_completions(struct nvme_qpair *qpair) { - bool done; + bool done = false; /* * Interlock with reset / recovery code. This is an usually uncontended @@ -687,12 +494,12 @@ nvme_qpair_process_completions(struct nvme_qpair *qpair) * and to prevent races with the recovery process called from a timeout * context. */ - if (!mtx_trylock(&qpair->recovery)) { - qpair->num_recovery_nolock++; - return (false); - } + mtx_lock(&qpair->recovery); - done = _nvme_qpair_process_completions(qpair); + if (__predict_true(qpair->recovery_state == RECOVERY_NONE)) + done = _nvme_qpair_process_completions(qpair); + else + qpair->num_recovery_nolock++; // XXX likely need to rename mtx_unlock(&qpair->recovery); @@ -950,27 +757,26 @@ nvme_admin_qpair_abort_aers(struct nvme_qpair *qpair) /* * nvme_complete_tracker must be called without the qpair lock held. It * takes the lock to adjust outstanding_tr list, so make sure we don't - * have it yet (since this is a general purpose routine). We take the - * lock to make the list traverse safe, but have to drop the lock to - * complete any AER. We restart the list scan when we do this to make - * this safe. There's interlock with the ISR so we know this tracker - * won't be completed twice. + * have it yet. We need the lock to make the list traverse safe, but + * have to drop the lock to complete any AER. We restart the list scan + * when we do this to make this safe. There's interlock with the ISR so + * we know this tracker won't be completed twice. */ mtx_assert(&qpair->lock, MA_NOTOWNED); mtx_lock(&qpair->lock); tr = TAILQ_FIRST(&qpair->outstanding_tr); while (tr != NULL) { - if (tr->req->cmd.opc == NVME_OPC_ASYNC_EVENT_REQUEST) { - mtx_unlock(&qpair->lock); - nvme_qpair_manual_complete_tracker(tr, - NVME_SCT_GENERIC, NVME_SC_ABORTED_SQ_DELETION, 0, - ERROR_PRINT_NONE); - mtx_lock(&qpair->lock); - tr = TAILQ_FIRST(&qpair->outstanding_tr); - } else { + if (tr->req->cmd.opc != NVME_OPC_ASYNC_EVENT_REQUEST) { tr = TAILQ_NEXT(tr, tailq); + continue; } + mtx_unlock(&qpair->lock); + nvme_qpair_manual_complete_tracker(tr, + NVME_SCT_GENERIC, NVME_SC_ABORTED_SQ_DELETION, 0, + ERROR_PRINT_NONE); + mtx_lock(&qpair->lock); + tr = TAILQ_FIRST(&qpair->outstanding_tr); } mtx_unlock(&qpair->lock); } @@ -997,22 +803,35 @@ nvme_abort_complete(void *arg, const struct nvme_completion *status) struct nvme_tracker *tr = arg; /* - * If cdw0 == 1, the controller was not able to abort the command - * we requested. We still need to check the active tracker array, - * to cover race where I/O timed out at same time controller was - * completing the I/O. + * If cdw0 bit 0 == 1, the controller was not able to abort the command + * we requested. We still need to check the active tracker array, to + * cover race where I/O timed out at same time controller was completing + * the I/O. 
An abort command is always on the admin queue, but affects + * either an admin or an I/O queue, so take the appropriate qpair lock + * for the original command's queue, since we'll need it to avoid races + * with the completion code and to complete the command manually. */ - if (status->cdw0 == 1 && tr->qpair->act_tr[tr->cid] != NULL) { + mtx_lock(&tr->qpair->lock); + if ((status->cdw0 & 1) == 1 && tr->qpair->act_tr[tr->cid] != NULL) { /* - * An I/O has timed out, and the controller was unable to - * abort it for some reason. Construct a fake completion - * status, and then complete the I/O's tracker manually. + * An I/O has timed out, and the controller was unable to abort + * it for some reason. And we've not processed a completion for + * it yet. Construct a fake completion status, and then complete + * the I/O's tracker manually. */ nvme_printf(tr->qpair->ctrlr, "abort command failed, aborting command manually\n"); nvme_qpair_manual_complete_tracker(tr, NVME_SCT_GENERIC, NVME_SC_ABORTED_BY_REQUEST, 0, ERROR_PRINT_ALL); } + /* + * XXX We don't check status for the possible 'Could not abort because + * excess aborts were submitted to the controller'. We don't prevent + * that, either. Document for the future here, since the standard is + * squishy and only says 'may generate' but implies anything is possible + * including hangs if you exceed the ACL. + */ + mtx_unlock(&tr->qpair->lock); } static void @@ -1022,8 +841,9 @@ nvme_qpair_timeout(void *arg) struct nvme_controller *ctrlr = qpair->ctrlr; struct nvme_tracker *tr; sbintime_t now; - bool idle = false; - bool needs_reset; + bool idle = true; + bool is_admin = qpair == &ctrlr->adminq; + bool fast; uint32_t csts; uint8_t cfs; @@ -1034,9 +854,10 @@ nvme_qpair_timeout(void *arg) * failure processing that races with the qpair timeout will fail * safely. */ - if (qpair->ctrlr->is_failed) { + if (is_admin ? qpair->ctrlr->is_failed_admin : qpair->ctrlr->is_failed) { nvme_printf(qpair->ctrlr, - "Failed controller, stopping watchdog timeout.\n"); + "%sFailed controller, stopping watchdog timeout.\n", + is_admin ? "Complete " : ""); qpair->timer_armed = false; return; } @@ -1069,23 +890,35 @@ nvme_qpair_timeout(void *arg) */ csts = nvme_mmio_read_4(ctrlr, csts); cfs = NVMEV(NVME_CSTS_REG_CFS, csts); - if (csts == NVME_GONE || cfs == 1) - goto do_reset; + if (csts == NVME_GONE || cfs == 1) { + /* + * We've had a command timeout that we weren't able to + * abort or we have aborts disabled and any command + * timed out. + * + * If we get here due to a possible surprise hot-unplug + * event, then we let nvme_ctrlr_reset confirm and fail + * the controller. + */ +do_reset: + nvme_printf(ctrlr, "Resetting controller due to a timeout%s.\n", + (csts == 0xffffffff) ? " and possible hot unplug" : + (cfs ? " and fatal error status" : "")); + qpair->recovery_state = RECOVERY_WAITING; + nvme_ctrlr_reset(ctrlr); + idle = false; + break; + } - /* - * Process completions. We already have the recovery lock, so - * call the locked version. - */ - _nvme_qpair_process_completions(qpair); /* - * Check to see if we need to timeout any commands. If we do, then - * we also enter a recovery phase. + * See if there's any recovery needed. First, do a fast check to + * see if anything could have timed out. If not, then skip + * everything else.
*/ - now = getsbinuptime(); - needs_reset = false; - idle = true; + fast = false; mtx_lock(&qpair->lock); + now = getsbinuptime(); TAILQ_FOREACH(tr, &qpair->outstanding_tr, tailq) { /* * Skip async commands, they are posted to the card for @@ -1093,48 +926,83 @@ nvme_qpair_timeout(void *arg) */ if (tr->deadline == SBT_MAX) continue; - if (now > tr->deadline) { - if (tr->req->cb_fn != nvme_abort_complete && - ctrlr->enable_aborts) { - /* - * This isn't an abort command, ask - * for a hardware abort. - */ - nvme_ctrlr_cmd_abort(ctrlr, tr->cid, - qpair->id, nvme_abort_complete, tr); - } else { - /* - * Otherwise we have a live command in - * the card (either one we couldn't - * abort, or aborts weren't enabled). - * The only safe way to proceed is to do - * a reset. - */ - needs_reset = true; - } - } else { - idle = false; - } + + /* + * If the first real transaction is not in timeout, then + * we're done. Otherwise, we try recovery. + */ + idle = false; + if (now <= tr->deadline) + fast = true; + break; } mtx_unlock(&qpair->lock); - if (!needs_reset) + if (idle || fast) break; /* - * We've had a command timeout that we weren't able to abort - * - * If we get here due to a possible surprise hot-unplug event, - * then we let nvme_ctrlr_reset confirm and fail the - * controller. + * There's a stale transaction at the start of the queue whose + * deadline has passed. Poll the completions as a last-ditch + * effort in case an interrupt has been missed. Warn the user of + * possible interrupt issues if stale transactions were found, + * but just once per controller. + */ + if (_nvme_qpair_process_completions(qpair) && !ctrlr->isr_warned) { + nvme_printf(ctrlr, "System interrupt issues?\n"); + ctrlr->isr_warned = true; + } + + /* + * Now that we've run the ISR, re-check to see if there are any + * timed-out commands and abort them or reset the card if so. */ - do_reset: - nvme_printf(ctrlr, "Resetting controller due to a timeout%s.\n", - (csts == 0xffffffff) ? " and possible hot unplug" : - (cfs ? " and fatal error status" : "")); - qpair->recovery_state = RECOVERY_WAITING; - nvme_ctrlr_reset(ctrlr); - idle = false; /* We want to keep polling */ + mtx_lock(&qpair->lock); + idle = true; + TAILQ_FOREACH(tr, &qpair->outstanding_tr, tailq) { + /* + * Skip async commands, they are posted to the card for + * an indefinite amount of time and have no deadline. + */ + if (tr->deadline == SBT_MAX) + continue; + + /* + * If we know this tracker hasn't timed out, we also + * know all subsequent ones haven't timed out. The tr + * queue is in submission order and all normal commands + * in a queue have the same timeout (or the timeout was + * changed by the user, but we eventually timeout then). + */ + idle = false; + if (now <= tr->deadline) + break; + + /* + * Timeout expired, abort it or reset controller. + */ + if (ctrlr->enable_aborts && + tr->req->cb_fn != nvme_abort_complete) { + /* + * This isn't an abort command, ask for a + * hardware abort. This goes to the admin + * queue which will reset the card if it + * times out. + */ + nvme_ctrlr_cmd_abort(ctrlr, tr->cid, qpair->id, + nvme_abort_complete, tr); + } else { + /* + * We have a live command in the card (either + * one we couldn't abort, or aborts weren't + * enabled). We can only reset. + */ + mtx_unlock(&qpair->lock); + goto do_reset; + } + } + mtx_unlock(&qpair->lock); break; + case RECOVERY_WAITING: /* * These messages aren't interesting while we're suspended.
We
@@ -1201,7 +1069,7 @@ nvme_qpair_submit_tracker(struct nvme_qpair *qpair, struct nvme_tracker *tr)
 
 	bus_dmamap_sync(qpair->dma_tag, qpair->queuemem_map,
 	    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
-	bus_space_write_4(qpair->ctrlr->bus_tag, qpair->ctrlr->bus_handle,
+	bus_space_write_4(ctrlr->bus_tag, ctrlr->bus_handle,
 	    qpair->sq_tdbl_off, qpair->sq_tail);
 	qpair->num_cmds++;
 }
@@ -1259,47 +1127,41 @@ _nvme_qpair_submit_request(struct nvme_qpair *qpair, struct nvme_request *req)
 {
 	struct nvme_tracker *tr;
 	int err = 0;
+	bool is_admin = qpair == &qpair->ctrlr->adminq;
 
 	mtx_assert(&qpair->lock, MA_OWNED);
 
 	tr = TAILQ_FIRST(&qpair->free_tr);
 	req->qpair = qpair;
 
-	if (tr == NULL || qpair->recovery_state != RECOVERY_NONE) {
-		/*
-		 * No tracker is available, or the qpair is disabled due to an
-		 * in-progress controller-level reset. If we lose the race with
-		 * recovery_state, then we may add an extra request to the queue
-		 * which will be resubmitted later. We only set recovery_state
-		 * to NONE with qpair->lock also held, so if we observe that the
-		 * state is not NONE, we know it can't transition to NONE below
-		 * when we've submitted the request to hardware.
-		 *
-		 * Also, as part of the failure process, we set recovery_state
-		 * to RECOVERY_WAITING, so we check here to see if we've failed
-		 * the controller. We set it before we call the qpair_fail
-		 * functions, which take out the lock lock before messing with
-		 * queued_req. Since we hold that lock, we know it's safe to
-		 * either fail directly, or queue the failure should is_failed
-		 * be stale. If we lose the race reading is_failed, then
-		 * nvme_qpair_fail will fail the queued request.
-		 */
+	/*
+	 * The controller has failed, so fail the request. Note that this races
+	 * the recovery / timeout code. Since we hold the qpair lock, we know
+	 * it's safe to fail directly. is_failed is set when we fail the
+	 * controller. It is only ever reset in the ioctl reset controller
+	 * path, which is safe to race (for failed controllers, we make no
+	 * guarantees about bringing it out of failed state relative to other
+	 * commands). We try hard to allow admin commands when the entire
+	 * controller hasn't failed, only something related to I/O queues.
+	 */
+	if (is_admin ? qpair->ctrlr->is_failed_admin : qpair->ctrlr->is_failed) {
+		nvme_qpair_manual_complete_request(qpair, req,
+		    NVME_SCT_GENERIC, NVME_SC_ABORTED_BY_REQUEST, 1,
+		    ERROR_PRINT_NONE);
+		return;
+	}
 
-	if (qpair->ctrlr->is_failed) {
-		/*
-		 * The controller has failed, so fail the request.
-		 */
-		nvme_qpair_manual_complete_request(qpair, req,
-		    NVME_SCT_GENERIC, NVME_SC_ABORTED_BY_REQUEST);
-	} else {
-		/*
-		 * Put the request on the qpair's request queue to be
-		 * processed when a tracker frees up via a command
-		 * completion or when the controller reset is
-		 * completed.
-		 */
-		STAILQ_INSERT_TAIL(&qpair->queued_req, req, stailq);
-	}
+	/*
+	 * No tracker is available, or the qpair is disabled due to an
+	 * in-progress controller-level reset. If we lose the race with
+	 * recovery_state, then we may add an extra request to the queue which
+	 * will be resubmitted later. We only set recovery_state to NONE with
+	 * qpair->lock also held, so if we observe that the state is not NONE,
+	 * we know it won't transition back to NONE without retrying queued
+	 * requests.
+	 */
+	if (tr == NULL || qpair->recovery_state != RECOVERY_NONE) {
+		STAILQ_INSERT_TAIL(&qpair->queued_req, req, stailq);
 		return;
 	}
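The submit path's failure check routes through one of two flags: is_failed_admin covers the admin queue and is only set when the whole controller has failed, while is_failed alone covers the I/O queues, so admin commands (identify, log pages) keep working when only the I/O side is gone. A small compilable sketch of that routing, with an illustrative struct in place of struct nvme_controller:

#include <stdbool.h>
#include <stdio.h>

/* Illustrative stand-in for the two failure flags in the controller. */
struct fake_ctrlr {
	bool is_failed;		/* the I/O queues have failed */
	bool is_failed_admin;	/* the whole controller, admin queue included */
};

/* Fail a request immediately only when the flag covering this queue is set. */
static bool
should_fail_now(const struct fake_ctrlr *c, bool is_admin)
{
	return (is_admin ? c->is_failed_admin : c->is_failed);
}

int
main(void)
{
	struct fake_ctrlr c = { .is_failed = true, .is_failed_admin = false };

	printf("I/O request: %s\n",
	    should_fail_now(&c, false) ? "fail immediately" : "submit");
	printf("admin request: %s\n",
	    should_fail_now(&c, true) ? "fail immediately" : "submit");
	return (0);
}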
@@ -1313,6 +1175,11 @@ _nvme_qpair_submit_request(struct nvme_qpair *qpair, struct nvme_request *req)
 		return;
 	}
 
+	/*
+	 * tr->deadline is updated when nvme_payload_map calls
+	 * nvme_qpair_submit_tracker (we call it directly above when
+	 * there's no map to load).
+	 */
 	err = bus_dmamap_load_mem(tr->qpair->dma_tag_payload,
 	    tr->payload_dma_map, &req->payload, nvme_payload_map, tr, 0);
 	if (err != 0) {
@@ -1344,11 +1211,13 @@ nvme_qpair_submit_request(struct nvme_qpair *qpair, struct nvme_request *req)
 static void
 nvme_qpair_enable(struct nvme_qpair *qpair)
 {
+	bool is_admin __unused = qpair == &qpair->ctrlr->adminq;
+
 	if (mtx_initialized(&qpair->recovery))
 		mtx_assert(&qpair->recovery, MA_OWNED);
 	if (mtx_initialized(&qpair->lock))
 		mtx_assert(&qpair->lock, MA_OWNED);
-	KASSERT(!qpair->ctrlr->is_failed,
+	KASSERT(!(is_admin ? qpair->ctrlr->is_failed_admin : qpair->ctrlr->is_failed),
 	    ("Enabling a failed qpair\n"));
 
 	qpair->recovery_state = RECOVERY_NONE;
@@ -1515,7 +1384,7 @@ nvme_qpair_fail(struct nvme_qpair *qpair)
 		STAILQ_REMOVE_HEAD(&qpair->queued_req, stailq);
 		mtx_unlock(&qpair->lock);
 		nvme_qpair_manual_complete_request(qpair, req, NVME_SCT_GENERIC,
-		    NVME_SC_ABORTED_BY_REQUEST);
+		    NVME_SC_ABORTED_BY_REQUEST, 1, ERROR_PRINT_ALL);
 		mtx_lock(&qpair->lock);
 	}
 
diff --git a/sys/dev/nvme/nvme_sim.c b/sys/dev/nvme/nvme_sim.c
index f561756f99b7..4974bb718222 100644
--- a/sys/dev/nvme/nvme_sim.c
+++ b/sys/dev/nvme/nvme_sim.c
@@ -96,15 +96,16 @@ nvme_sim_nvmeio(struct cam_sim *sim, union ccb *ccb)
 	/* SG LIST ??? */
 	if ((nvmeio->ccb_h.flags & CAM_DATA_MASK) == CAM_DATA_BIO)
 		req = nvme_allocate_request_bio((struct bio *)payload,
-		    nvme_sim_nvmeio_done, ccb);
+		    M_NOWAIT, nvme_sim_nvmeio_done, ccb);
 	else if ((nvmeio->ccb_h.flags & CAM_DATA_SG) == CAM_DATA_SG)
-		req = nvme_allocate_request_ccb(ccb, nvme_sim_nvmeio_done, ccb);
+		req = nvme_allocate_request_ccb(ccb, M_NOWAIT,
+		    nvme_sim_nvmeio_done, ccb);
 	else if (payload == NULL)
-		req = nvme_allocate_request_null(nvme_sim_nvmeio_done, ccb);
+		req = nvme_allocate_request_null(M_NOWAIT, nvme_sim_nvmeio_done,
+		    ccb);
 	else
-		req = nvme_allocate_request_vaddr(payload, size,
+		req = nvme_allocate_request_vaddr(payload, size, M_NOWAIT,
 		    nvme_sim_nvmeio_done, ccb);
-
 	if (req == NULL) {
 		nvmeio->ccb_h.status = CAM_RESRC_UNAVAIL;
 		xpt_done(ccb);
@@ -203,7 +204,7 @@ nvme_sim_action(struct cam_sim *sim, union ccb *ccb)
 		cpi->xport_specific.nvme.slot = pci_get_slot(dev);
 		cpi->xport_specific.nvme.function = pci_get_function(dev);
 		cpi->xport_specific.nvme.extra = 0;
-		strncpy(cpi->xport_specific.nvme.dev_name, device_get_nameunit(dev),
+		strlcpy(cpi->xport_specific.nvme.dev_name, device_get_nameunit(dev),
 		    sizeof(cpi->xport_specific.nvme.dev_name));
 		cpi->hba_vendor = pci_get_vendor(dev);
 		cpi->hba_device = pci_get_device(dev);
@@ -268,7 +269,6 @@ nvme_sim_action(struct cam_sim *sim, union ccb *ccb)
 		ccb->ccb_h.status = CAM_REQ_CMP;
 		break;
 	case XPT_NVME_IO:		/* Execute the requested I/O operation */
-	case XPT_NVME_ADMIN:	/* or Admin operation */
 		if (ctrlr->is_failed) {
 			/*
 			 * I/O came in while we were failing the drive, so drop
 			 * it.
 			 */
 			ccb->ccb_h.status = CAM_DEV_NOT_THERE;
 			break;
 		}
 		nvme_sim_nvmeio(sim, ccb);
 		return;			/* no done */
+	case XPT_NVME_ADMIN:	/* or Admin operation */
+		if (ctrlr->is_failed_admin) {
+			/*
+			 * Admin request came in when we can't send admin
+			 * commands, so drop it. Once failure is complete,
+			 * we'll be destroyed.
+			 */
+			ccb->ccb_h.status = CAM_DEV_NOT_THERE;
+			break;
+		}
+		nvme_sim_nvmeio(sim, ccb);
+		return;			/* no done */
 	default:
 		ccb->ccb_h.status = CAM_REQ_INVALID;
 		break;
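The M_NOWAIT variants used in nvme_sim_nvmeio may return NULL instead of sleeping, which is why the req == NULL path that completes the CCB with CAM_RESRC_UNAVAIL is required. A compilable sketch of that contract, with malloc standing in for the non-sleeping allocator and an illustrative status enum in place of CAM's:

#include <stdio.h>
#include <stdlib.h>

/* Illustrative status codes standing in for CAM's CAM_RESRC_UNAVAIL. */
enum sim_status { SIM_OK, SIM_RESOURCE_UNAVAIL };

/*
 * A non-sleeping allocation may fail, so the caller needs an error
 * path that itself allocates nothing; here that is a plain status
 * return the upper layers can use to retry later.
 */
static enum sim_status
build_request(size_t len)
{
	void *req = malloc(len);	/* stands in for an M_NOWAIT alloc */

	if (req == NULL)
		return (SIM_RESOURCE_UNAVAIL);
	/* ... fill in and submit the request here ... */
	free(req);
	return (SIM_OK);
}

int
main(void)
{
	printf("%s\n", build_request(64) == SIM_OK ?
	    "submitted" : "resource shortage; retry later");
	return (0);
}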
diff --git a/sys/dev/nvme/nvme_sysctl.c b/sys/dev/nvme/nvme_sysctl.c
index d6452a2e5492..a5a44721f9f9 100644
--- a/sys/dev/nvme/nvme_sysctl.c
+++ b/sys/dev/nvme/nvme_sysctl.c
@@ -30,6 +30,7 @@
 #include "opt_nvme.h"
 
 #include <sys/param.h>
+#include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/sysctl.h>
 
@@ -175,8 +176,10 @@ nvme_sysctl_num_cmds(SYSCTL_HANDLER_ARGS)
 
 	num_cmds = ctrlr->adminq.num_cmds;
 
-	for (i = 0; i < ctrlr->num_io_queues; i++)
-		num_cmds += ctrlr->ioq[i].num_cmds;
+	if (ctrlr->ioq != NULL) {
+		for (i = 0; i < ctrlr->num_io_queues; i++)
+			num_cmds += ctrlr->ioq[i].num_cmds;
+	}
 
 	return (sysctl_handle_64(oidp, &num_cmds, 0, req));
 }
@@ -190,8 +193,10 @@ nvme_sysctl_num_intr_handler_calls(SYSCTL_HANDLER_ARGS)
 
 	num_intr_handler_calls = ctrlr->adminq.num_intr_handler_calls;
 
-	for (i = 0; i < ctrlr->num_io_queues; i++)
-		num_intr_handler_calls += ctrlr->ioq[i].num_intr_handler_calls;
+	if (ctrlr->ioq != NULL) {
+		for (i = 0; i < ctrlr->num_io_queues; i++)
+			num_intr_handler_calls += ctrlr->ioq[i].num_intr_handler_calls;
+	}
 
 	return (sysctl_handle_64(oidp, &num_intr_handler_calls, 0, req));
 }
@@ -205,8 +210,10 @@ nvme_sysctl_num_retries(SYSCTL_HANDLER_ARGS)
 
 	num_retries = ctrlr->adminq.num_retries;
 
-	for (i = 0; i < ctrlr->num_io_queues; i++)
-		num_retries += ctrlr->ioq[i].num_retries;
+	if (ctrlr->ioq != NULL) {
+		for (i = 0; i < ctrlr->num_io_queues; i++)
+			num_retries += ctrlr->ioq[i].num_retries;
+	}
 
 	return (sysctl_handle_64(oidp, &num_retries, 0, req));
 }
@@ -220,8 +227,10 @@ nvme_sysctl_num_failures(SYSCTL_HANDLER_ARGS)
 
 	num_failures = ctrlr->adminq.num_failures;
 
-	for (i = 0; i < ctrlr->num_io_queues; i++)
-		num_failures += ctrlr->ioq[i].num_failures;
+	if (ctrlr->ioq != NULL) {
+		for (i = 0; i < ctrlr->num_io_queues; i++)
+			num_failures += ctrlr->ioq[i].num_failures;
+	}
 
 	return (sysctl_handle_64(oidp, &num_failures, 0, req));
 }
@@ -235,8 +244,10 @@ nvme_sysctl_num_ignored(SYSCTL_HANDLER_ARGS)
 
 	num_ignored = ctrlr->adminq.num_ignored;
 
-	for (i = 0; i < ctrlr->num_io_queues; i++)
-		num_ignored += ctrlr->ioq[i].num_ignored;
+	if (ctrlr->ioq != NULL) {
+		for (i = 0; i < ctrlr->num_io_queues; i++)
+			num_ignored += ctrlr->ioq[i].num_ignored;
+	}
 
 	return (sysctl_handle_64(oidp, &num_ignored, 0, req));
 }
@@ -250,8 +261,10 @@ nvme_sysctl_num_recovery_nolock(SYSCTL_HANDLER_ARGS)
 
 	num = ctrlr->adminq.num_recovery_nolock;
 
-	for (i = 0; i < ctrlr->num_io_queues; i++)
-		num += ctrlr->ioq[i].num_recovery_nolock;
+	if (ctrlr->ioq != NULL) {
+		for (i = 0; i < ctrlr->num_io_queues; i++)
+			num += ctrlr->ioq[i].num_recovery_nolock;
+	}
 
 	return (sysctl_handle_64(oidp, &num, 0, req));
 }
@@ -270,8 +283,10 @@ nvme_sysctl_reset_stats(SYSCTL_HANDLER_ARGS)
 
 	if (val != 0) {
 		nvme_qpair_reset_stats(&ctrlr->adminq);
-		for (i = 0; i < ctrlr->num_io_queues; i++)
-			nvme_qpair_reset_stats(&ctrlr->ioq[i]);
+		if (ctrlr->ioq != NULL) {
+			for (i = 0; i < ctrlr->num_io_queues; i++)
+				nvme_qpair_reset_stats(&ctrlr->ioq[i]);
+		}
 	}
 
 	return (0);
@@ -318,6 +333,10 @@ nvme_sysctl_initialize_queue(struct nvme_qpair *qpair,
 	    CTLFLAG_RD, &qpair->num_recovery_nolock,
 	    "Number of times that we failed to lock recovery in the ISR");
 
+	SYSCTL_ADD_UINT(ctrlr_ctx, que_list, OID_AUTO, "recovery",
+	    CTLFLAG_RW, &qpair->recovery_state, 0,
+	    "Current recovery state of the queue");
+
 	SYSCTL_ADD_PROC(ctrlr_ctx, que_list, OID_AUTO, "dump_debug",
 	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, qpair, 0,
 	    nvme_sysctl_dump_debug, "IU", "Dump debug data");
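All of these handlers share one pattern: ctrlr->ioq is NULL on a controller that failed before its I/O queues were constructed, so every per-queue loop must be guarded before dereferencing the array. A compilable sketch of the guarded aggregation, with an illustrative struct in place of struct nvme_controller:

#include <stdint.h>
#include <stdio.h>

/* Illustrative controller with an optional per-queue stats array. */
struct fake_ctrlr {
	uint64_t adminq_cmds;
	uint64_t *ioq_cmds;	/* NULL if construction failed early */
	uint32_t num_io_queues;
};

static uint64_t
total_cmds(const struct fake_ctrlr *c)
{
	uint64_t n = c->adminq_cmds;

	/* A failed controller never allocates ioq; guard every loop. */
	if (c->ioq_cmds != NULL) {
		for (uint32_t i = 0; i < c->num_io_queues; i++)
			n += c->ioq_cmds[i];
	}
	return (n);
}

int
main(void)
{
	struct fake_ctrlr c = { .adminq_cmds = 5, .ioq_cmds = NULL,
	    .num_io_queues = 4 };

	printf("%llu\n", (unsigned long long)total_cmds(&c));
	return (0);
}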
@@ -327,8 +346,8 @@ void
 nvme_sysctl_initialize_ctrlr(struct nvme_controller *ctrlr)
 {
 	struct sysctl_ctx_list *ctrlr_ctx;
-	struct sysctl_oid *ctrlr_tree, *que_tree;
-	struct sysctl_oid_list *ctrlr_list;
+	struct sysctl_oid *ctrlr_tree, *que_tree, *ioq_tree;
+	struct sysctl_oid_list *ctrlr_list, *ioq_list;
 #define QUEUE_NAME_LENGTH	16
 	char queue_name[QUEUE_NAME_LENGTH];
 	int i;
@@ -407,16 +426,35 @@ nvme_sysctl_initialize_ctrlr(struct nvme_controller *ctrlr)
 	    CTLFLAG_RD, &ctrlr->cap_hi, 0,
 	    "Hi 32-bits of capacities for the drive");
 
+	SYSCTL_ADD_UINT(ctrlr_ctx, ctrlr_list, OID_AUTO, "fail_on_reset",
+	    CTLFLAG_RD, &ctrlr->fail_on_reset, 0,
+	    "Pretend the next reset fails and fail the controller");
+
 	que_tree = SYSCTL_ADD_NODE(ctrlr_ctx, ctrlr_list, OID_AUTO, "adminq",
 	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Admin Queue");
 
 	nvme_sysctl_initialize_queue(&ctrlr->adminq, ctrlr_ctx, que_tree);
 
-	for (i = 0; i < ctrlr->num_io_queues; i++) {
-		snprintf(queue_name, QUEUE_NAME_LENGTH, "ioq%d", i);
-		que_tree = SYSCTL_ADD_NODE(ctrlr_ctx, ctrlr_list, OID_AUTO,
-		    queue_name, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "IO Queue");
-		nvme_sysctl_initialize_queue(&ctrlr->ioq[i], ctrlr_ctx,
-		    que_tree);
+	/*
+	 * Make sure that we've constructed the I/O queues before setting up
+	 * the sysctls. Failed controllers won't allocate them, but we still
+	 * want the rest of the sysctls so we can diagnose things.
+	 */
+	if (ctrlr->ioq != NULL) {
+		ioq_tree = SYSCTL_ADD_NODE(ctrlr_ctx, ctrlr_list, OID_AUTO,
+		    "ioq", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "I/O Queues");
+		ioq_list = SYSCTL_CHILDREN(ioq_tree);
+
+		for (i = 0; i < ctrlr->num_io_queues; i++) {
+			snprintf(queue_name, QUEUE_NAME_LENGTH, "%d", i);
+			que_tree = SYSCTL_ADD_NODE(ctrlr_ctx, ioq_list,
+			    OID_AUTO, queue_name, CTLFLAG_RD | CTLFLAG_MPSAFE,
+			    NULL, "IO Queue");
+			nvme_sysctl_initialize_queue(&ctrlr->ioq[i], ctrlr_ctx,
+			    que_tree);
+		}
 	}
+
+	SYSCTL_ADD_COUNTER_U64(ctrlr_ctx, ctrlr_list, OID_AUTO, "alignment_splits",
+	    CTLFLAG_RD, &ctrlr->alignment_splits,
+	    "Number of times we split the I/O alignment for drives with preferred alignment");
 }
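The reorganization moves the per-queue sysctl nodes from flat ioqN children of the controller to numbered children of a single ioq node. A small sketch of the resulting OID names; the dev.nvme.0 prefix here is illustrative:

#include <stdio.h>

int
main(void)
{
	char queue_name[16];

	for (int i = 0; i < 2; i++) {
		/* old layout: a flat ioqN child per queue */
		snprintf(queue_name, sizeof(queue_name), "ioq%d", i);
		printf("dev.nvme.0.%s\n", queue_name);
		/* new layout: numbered children of one ioq node */
		snprintf(queue_name, sizeof(queue_name), "%d", i);
		printf("dev.nvme.0.ioq.%s\n", queue_name);
	}
	return (0);
}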
diff --git a/sys/dev/nvme/nvme_util.c b/sys/dev/nvme/nvme_util.c
index 47d84e5b6957..0a07653a7378 100644
--- a/sys/dev/nvme/nvme_util.c
+++ b/sys/dev/nvme/nvme_util.c
@@ -5,6 +5,8 @@
  * Copyright (C) 1997 Justin T. Gibbs
  * All rights reserved.
  *
+ * Copyright (c) 2023-2025 Chelsio Communications, Inc.
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
@@ -28,8 +30,243 @@
  */
 
 #include <sys/param.h>
+#include <sys/sbuf.h>
 #include <dev/nvme/nvme.h>
 
+#define OPC_ENTRY(x)	[NVME_OPC_ ## x] = #x
+
+static const char *admin_opcode[256] = {
+	OPC_ENTRY(DELETE_IO_SQ),
+	OPC_ENTRY(CREATE_IO_SQ),
+	OPC_ENTRY(GET_LOG_PAGE),
+	OPC_ENTRY(DELETE_IO_CQ),
+	OPC_ENTRY(CREATE_IO_CQ),
+	OPC_ENTRY(IDENTIFY),
+	OPC_ENTRY(ABORT),
+	OPC_ENTRY(SET_FEATURES),
+	OPC_ENTRY(GET_FEATURES),
+	OPC_ENTRY(ASYNC_EVENT_REQUEST),
+	OPC_ENTRY(NAMESPACE_MANAGEMENT),
+	OPC_ENTRY(FIRMWARE_ACTIVATE),
+	OPC_ENTRY(FIRMWARE_IMAGE_DOWNLOAD),
+	OPC_ENTRY(DEVICE_SELF_TEST),
+	OPC_ENTRY(NAMESPACE_ATTACHMENT),
+	OPC_ENTRY(KEEP_ALIVE),
+	OPC_ENTRY(DIRECTIVE_SEND),
+	OPC_ENTRY(DIRECTIVE_RECEIVE),
+	OPC_ENTRY(VIRTUALIZATION_MANAGEMENT),
+	OPC_ENTRY(NVME_MI_SEND),
+	OPC_ENTRY(NVME_MI_RECEIVE),
+	OPC_ENTRY(CAPACITY_MANAGEMENT),
+	OPC_ENTRY(LOCKDOWN),
+	OPC_ENTRY(DOORBELL_BUFFER_CONFIG),
+	OPC_ENTRY(FABRICS_COMMANDS),
+	OPC_ENTRY(FORMAT_NVM),
+	OPC_ENTRY(SECURITY_SEND),
+	OPC_ENTRY(SECURITY_RECEIVE),
+	OPC_ENTRY(SANITIZE),
+	OPC_ENTRY(GET_LBA_STATUS),
+};
+
+static const char *nvm_opcode[256] = {
+	OPC_ENTRY(FLUSH),
+	OPC_ENTRY(WRITE),
+	OPC_ENTRY(READ),
+	OPC_ENTRY(WRITE_UNCORRECTABLE),
+	OPC_ENTRY(COMPARE),
+	OPC_ENTRY(WRITE_ZEROES),
+	OPC_ENTRY(DATASET_MANAGEMENT),
+	OPC_ENTRY(VERIFY),
+	OPC_ENTRY(RESERVATION_REGISTER),
+	OPC_ENTRY(RESERVATION_REPORT),
+	OPC_ENTRY(RESERVATION_ACQUIRE),
+	OPC_ENTRY(RESERVATION_RELEASE),
+	OPC_ENTRY(COPY),
+};
+
+#define SC_ENTRY(x)	[NVME_SC_ ## x] = #x
+
+static const char *generic_status[256] = {
+	SC_ENTRY(SUCCESS),
+	SC_ENTRY(INVALID_OPCODE),
+	SC_ENTRY(INVALID_FIELD),
+	SC_ENTRY(COMMAND_ID_CONFLICT),
+	SC_ENTRY(DATA_TRANSFER_ERROR),
+	SC_ENTRY(ABORTED_POWER_LOSS),
+	SC_ENTRY(INTERNAL_DEVICE_ERROR),
+	SC_ENTRY(ABORTED_BY_REQUEST),
+	SC_ENTRY(ABORTED_SQ_DELETION),
+	SC_ENTRY(ABORTED_FAILED_FUSED),
+	SC_ENTRY(ABORTED_MISSING_FUSED),
+	SC_ENTRY(INVALID_NAMESPACE_OR_FORMAT),
+	SC_ENTRY(COMMAND_SEQUENCE_ERROR),
+	SC_ENTRY(INVALID_SGL_SEGMENT_DESCR),
+	SC_ENTRY(INVALID_NUMBER_OF_SGL_DESCR),
+	SC_ENTRY(DATA_SGL_LENGTH_INVALID),
+	SC_ENTRY(METADATA_SGL_LENGTH_INVALID),
+	SC_ENTRY(SGL_DESCRIPTOR_TYPE_INVALID),
+	SC_ENTRY(INVALID_USE_OF_CMB),
+	SC_ENTRY(PRP_OFFET_INVALID),
+	SC_ENTRY(ATOMIC_WRITE_UNIT_EXCEEDED),
+	SC_ENTRY(OPERATION_DENIED),
+	SC_ENTRY(SGL_OFFSET_INVALID),
+	SC_ENTRY(HOST_ID_INCONSISTENT_FORMAT),
+	SC_ENTRY(KEEP_ALIVE_TIMEOUT_EXPIRED),
+	SC_ENTRY(KEEP_ALIVE_TIMEOUT_INVALID),
+	SC_ENTRY(ABORTED_DUE_TO_PREEMPT),
+	SC_ENTRY(SANITIZE_FAILED),
+	SC_ENTRY(SANITIZE_IN_PROGRESS),
+	SC_ENTRY(SGL_DATA_BLOCK_GRAN_INVALID),
+	SC_ENTRY(NOT_SUPPORTED_IN_CMB),
+	SC_ENTRY(NAMESPACE_IS_WRITE_PROTECTED),
+	SC_ENTRY(COMMAND_INTERRUPTED),
+	SC_ENTRY(TRANSIENT_TRANSPORT_ERROR),
+
+	SC_ENTRY(LBA_OUT_OF_RANGE),
+	SC_ENTRY(CAPACITY_EXCEEDED),
+	SC_ENTRY(NAMESPACE_NOT_READY),
+	SC_ENTRY(RESERVATION_CONFLICT),
+	SC_ENTRY(FORMAT_IN_PROGRESS),
+};
+
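Both OPC_ENTRY and SC_ENTRY lean on C designated initializers: only the named slots of each 256-entry array are filled in, every other slot is implicitly NULL, and the stringizing operator turns the symbolic constant into its own name, so a lookup can tell known codes from unknown ones without a search. A compilable sketch of the same technique with a hypothetical three-entry table:

#include <stdint.h>
#include <stdio.h>

/* Hypothetical opcodes; the driver uses the NVME_OPC_* constants here. */
#define OPC_FLUSH	0x00
#define OPC_WRITE	0x01
#define OPC_READ	0x02

#define NAME_ENTRY(x)	[OPC_ ## x] = #x

static const char *names[256] = {
	NAME_ENTRY(FLUSH),
	NAME_ENTRY(WRITE),
	NAME_ENTRY(READ),
};

int
main(void)
{
	uint8_t opc = 0x7f;		/* not in the table */
	const char *s = names[opc];	/* unnamed slots read as NULL */

	if (s == NULL)
		printf("NVM (%02x)\n", opc);	/* fall back to the raw code */
	else
		printf("%s\n", s);
	return (0);
}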
+static const char *command_specific_status[256] = {
+	SC_ENTRY(COMPLETION_QUEUE_INVALID),
+	SC_ENTRY(INVALID_QUEUE_IDENTIFIER),
+	SC_ENTRY(MAXIMUM_QUEUE_SIZE_EXCEEDED),
+	SC_ENTRY(ABORT_COMMAND_LIMIT_EXCEEDED),
+	SC_ENTRY(ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED),
+	SC_ENTRY(INVALID_FIRMWARE_SLOT),
+	SC_ENTRY(INVALID_FIRMWARE_IMAGE),
+	SC_ENTRY(INVALID_INTERRUPT_VECTOR),
+	SC_ENTRY(INVALID_LOG_PAGE),
+	SC_ENTRY(INVALID_FORMAT),
+	SC_ENTRY(FIRMWARE_REQUIRES_RESET),
+	SC_ENTRY(INVALID_QUEUE_DELETION),
+	SC_ENTRY(FEATURE_NOT_SAVEABLE),
+	SC_ENTRY(FEATURE_NOT_CHANGEABLE),
+	SC_ENTRY(FEATURE_NOT_NS_SPECIFIC),
+	SC_ENTRY(FW_ACT_REQUIRES_NVMS_RESET),
+	SC_ENTRY(FW_ACT_REQUIRES_RESET),
+	SC_ENTRY(FW_ACT_REQUIRES_TIME),
+	SC_ENTRY(FW_ACT_PROHIBITED),
+	SC_ENTRY(OVERLAPPING_RANGE),
+	SC_ENTRY(NS_INSUFFICIENT_CAPACITY),
+	SC_ENTRY(NS_ID_UNAVAILABLE),
+	SC_ENTRY(NS_ALREADY_ATTACHED),
+	SC_ENTRY(NS_IS_PRIVATE),
+	SC_ENTRY(NS_NOT_ATTACHED),
+	SC_ENTRY(THIN_PROV_NOT_SUPPORTED),
+	SC_ENTRY(CTRLR_LIST_INVALID),
+	SC_ENTRY(SELF_TEST_IN_PROGRESS),
+	SC_ENTRY(BOOT_PART_WRITE_PROHIB),
+	SC_ENTRY(INVALID_CTRLR_ID),
+	SC_ENTRY(INVALID_SEC_CTRLR_STATE),
+	SC_ENTRY(INVALID_NUM_OF_CTRLR_RESRC),
+	SC_ENTRY(INVALID_RESOURCE_ID),
+	SC_ENTRY(SANITIZE_PROHIBITED_WPMRE),
+	SC_ENTRY(ANA_GROUP_ID_INVALID),
+	SC_ENTRY(ANA_ATTACH_FAILED),
+
+	SC_ENTRY(CONFLICTING_ATTRIBUTES),
+	SC_ENTRY(INVALID_PROTECTION_INFO),
+	SC_ENTRY(ATTEMPTED_WRITE_TO_RO_PAGE),
+};
+
+static const char *media_error_status[256] = {
+	SC_ENTRY(WRITE_FAULTS),
+	SC_ENTRY(UNRECOVERED_READ_ERROR),
+	SC_ENTRY(GUARD_CHECK_ERROR),
+	SC_ENTRY(APPLICATION_TAG_CHECK_ERROR),
+	SC_ENTRY(REFERENCE_TAG_CHECK_ERROR),
+	SC_ENTRY(COMPARE_FAILURE),
+	SC_ENTRY(ACCESS_DENIED),
+	SC_ENTRY(DEALLOCATED_OR_UNWRITTEN),
+};
+
+static const char *path_related_status[256] = {
+	SC_ENTRY(INTERNAL_PATH_ERROR),
+	SC_ENTRY(ASYMMETRIC_ACCESS_PERSISTENT_LOSS),
+	SC_ENTRY(ASYMMETRIC_ACCESS_INACCESSIBLE),
+	SC_ENTRY(ASYMMETRIC_ACCESS_TRANSITION),
+	SC_ENTRY(CONTROLLER_PATHING_ERROR),
+	SC_ENTRY(HOST_PATHING_ERROR),
+	SC_ENTRY(COMMAND_ABORTED_BY_HOST),
+};
+
+void
+nvme_opcode_sbuf(bool admin, uint8_t opc, struct sbuf *sb)
+{
+	const char *s, *type;
+
+	if (admin) {
+		s = admin_opcode[opc];
+		type = "ADMIN";
+	} else {
+		s = nvm_opcode[opc];
+		type = "NVM";
+	}
+	if (s == NULL)
+		sbuf_printf(sb, "%s (%02x)", type, opc);
+	else
+		sbuf_printf(sb, "%s", s);
+}
+
+void
+nvme_sc_sbuf(const struct nvme_completion *cpl, struct sbuf *sb)
+{
+	const char *s, *type;
+	uint16_t status;
+
+	status = le16toh(cpl->status);
+	switch (NVME_STATUS_GET_SCT(status)) {
+	case NVME_SCT_GENERIC:
+		s = generic_status[NVME_STATUS_GET_SC(status)];
+		type = "GENERIC";
+		break;
+	case NVME_SCT_COMMAND_SPECIFIC:
+		s = command_specific_status[NVME_STATUS_GET_SC(status)];
+		type = "COMMAND SPECIFIC";
+		break;
+	case NVME_SCT_MEDIA_ERROR:
+		s = media_error_status[NVME_STATUS_GET_SC(status)];
+		type = "MEDIA ERROR";
+		break;
+	case NVME_SCT_PATH_RELATED:
+		s = path_related_status[NVME_STATUS_GET_SC(status)];
+		type = "PATH RELATED";
+		break;
+	case NVME_SCT_VENDOR_SPECIFIC:
+		s = NULL;
+		type = "VENDOR SPECIFIC";
+		break;
+	default:
+		s = NULL;
+		type = NULL;
+		break;
+	}
+
+	if (type == NULL)
+		sbuf_printf(sb, "RESERVED (%02x/%02x)",
+		    NVME_STATUS_GET_SCT(status), NVME_STATUS_GET_SC(status));
+	else if (s == NULL)
+		sbuf_printf(sb, "%s (%02x)", type, NVME_STATUS_GET_SC(status));
+	else
+		sbuf_printf(sb, "%s", s);
+}
+
+void
+nvme_cpl_sbuf(const struct nvme_completion *cpl, struct sbuf *sb)
+{
+	uint16_t status;
+
+	status = le16toh(cpl->status);
+	nvme_sc_sbuf(cpl, sb);
+	if (NVME_STATUS_GET_M(status) != 0)
+		sbuf_printf(sb, " M");
+	if (NVME_STATUS_GET_DNR(status) != 0)
+		sbuf_printf(sb, " DNR");
+}
+
 void
 nvme_strvis(uint8_t *dst, const uint8_t *src, int dstlen, int srclen)
 {
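A hedged usage sketch for the new helpers: this assumes a FreeBSD userland program linked against libsbuf and against an object built from nvme_util.c (as nvmecontrol does); the completion value is fabricated for illustration. A zeroed status decodes as GENERIC/SUCCESS, and nvme_cpl_sbuf appends the M and DNR flags when they are set:

#include <sys/sbuf.h>
#include <dev/nvme/nvme.h>
#include <stdio.h>

int
main(void)
{
	struct nvme_completion cpl = { .status = 0 };	/* GENERIC/SUCCESS */
	struct sbuf *sb;

	sb = sbuf_new_auto();
	if (sb == NULL)
		return (1);
	nvme_cpl_sbuf(&cpl, sb);	/* "SUCCESS", plus M/DNR if set */
	if (sbuf_finish(sb) == 0)
		printf("status: %s\n", sbuf_data(sb));
	sbuf_delete(sb);
	return (0);
}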