Diffstat (limited to 'sys/dev/nvmf')
-rw-r--r-- | sys/dev/nvmf/controller/ctl_frontend_nvmf.c | 196
-rw-r--r-- | sys/dev/nvmf/controller/nvmft_controller.c | 78
-rw-r--r-- | sys/dev/nvmf/controller/nvmft_qpair.c | 72
-rw-r--r-- | sys/dev/nvmf/controller/nvmft_subr.c | 40
-rw-r--r-- | sys/dev/nvmf/controller/nvmft_var.h | 19
-rw-r--r-- | sys/dev/nvmf/host/nvmf.c | 640
-rw-r--r-- | sys/dev/nvmf/host/nvmf_aer.c | 2
-rw-r--r-- | sys/dev/nvmf/host/nvmf_ctldev.c | 15
-rw-r--r-- | sys/dev/nvmf/host/nvmf_ns.c | 66
-rw-r--r-- | sys/dev/nvmf/host/nvmf_qpair.c | 88
-rw-r--r-- | sys/dev/nvmf/host/nvmf_sim.c | 29
-rw-r--r-- | sys/dev/nvmf/host/nvmf_var.h | 46
-rw-r--r-- | sys/dev/nvmf/nvmf.h | 131
-rw-r--r-- | sys/dev/nvmf/nvmf_proto.h | 6
-rw-r--r-- | sys/dev/nvmf/nvmf_tcp.c | 93
-rw-r--r-- | sys/dev/nvmf/nvmf_tcp.h | 27
-rw-r--r-- | sys/dev/nvmf/nvmf_transport.c | 102
-rw-r--r-- | sys/dev/nvmf/nvmf_transport.h | 25
-rw-r--r-- | sys/dev/nvmf/nvmf_transport_internal.h | 3
19 files changed, 1257 insertions, 421 deletions
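
The bulk of this change replaces the fixed `struct nvmf_handoff_*` copyin interface with packed nvlists that are validated key by key before any field is used. Below is a minimal standalone sketch of that validate-then-extract pattern, assuming a hypothetical `example_cmd` structure standing in for `nvmf_fabric_connect_cmd`; the real checks live in `nvmft_handoff()` and `nvmf_copyin_handoff()` in the diff that follows.

    /*
     * Sketch of the validate-then-extract nvlist pattern this commit
     * adopts for qpair handoff.  Key names mirror the patch; the
     * struct and surrounding function are illustrative only.
     */
    #include <sys/nv.h>
    #include <errno.h>
    #include <stddef.h>
    #include <stdint.h>

    struct example_cmd {            /* stand-in for the CONNECT SQE */
            uint8_t opc;
            uint8_t rsvd[63];
    };

    static int
    example_unpack_handoff(const nvlist_t *nvl, const struct example_cmd **cmdp)
    {
            const struct example_cmd *cmd;
            size_t len;

            /* Reject handoffs missing required keys up front. */
            if (!nvlist_exists_number(nvl, "trtype") ||
                !nvlist_exists_nvlist(nvl, "params") ||
                !nvlist_exists_binary(nvl, "cmd"))
                    return (EINVAL);

            /*
             * Binary blobs carry an explicit length; checking it guards
             * against a truncated or oversized SQE handed in from
             * userland, replacing the old fixed-size copyin.
             */
            cmd = nvlist_get_binary(nvl, "cmd", &len);
            if (len != sizeof(*cmd))
                    return (EINVAL);

            *cmdp = cmd;
            return (0);
    }
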
diff --git a/sys/dev/nvmf/controller/ctl_frontend_nvmf.c b/sys/dev/nvmf/controller/ctl_frontend_nvmf.c index a203bb1c90a6..658b47699c1d 100644 --- a/sys/dev/nvmf/controller/ctl_frontend_nvmf.c +++ b/sys/dev/nvmf/controller/ctl_frontend_nvmf.c @@ -19,7 +19,9 @@ #include <sys/queue.h> #include <sys/refcount.h> #include <sys/sbuf.h> +#include <sys/smp.h> #include <sys/sx.h> +#include <sys/taskqueue.h> #include <machine/bus.h> #include <machine/bus_dma.h> @@ -31,8 +33,10 @@ #include <cam/ctl/ctl.h> #include <cam/ctl/ctl_error.h> +#include <cam/ctl/ctl_ha.h> #include <cam/ctl/ctl_io.h> #include <cam/ctl/ctl_frontend.h> +#include <cam/ctl/ctl_private.h> /* * Store pointers to the capsule and qpair in the two pointer members @@ -47,6 +51,7 @@ static int nvmft_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int flag, struct thread *td); static int nvmft_shutdown(void); +static struct taskqueue *nvmft_taskq; static TAILQ_HEAD(, nvmft_port) nvmft_ports; static struct sx nvmft_ports_lock; @@ -65,9 +70,9 @@ nvmft_online(void *arg) { struct nvmft_port *np = arg; - sx_xlock(&np->lock); + mtx_lock(&np->lock); np->online = true; - sx_xunlock(&np->lock); + mtx_unlock(&np->lock); } static void @@ -76,7 +81,7 @@ nvmft_offline(void *arg) struct nvmft_port *np = arg; struct nvmft_controller *ctrlr; - sx_xlock(&np->lock); + mtx_lock(&np->lock); np->online = false; TAILQ_FOREACH(ctrlr, &np->controllers, link) { @@ -86,8 +91,32 @@ nvmft_offline(void *arg) } while (!TAILQ_EMPTY(&np->controllers)) - sx_sleep(np, &np->lock, 0, "nvmfoff", 0); - sx_xunlock(&np->lock); + mtx_sleep(np, &np->lock, 0, "nvmfoff", 0); + mtx_unlock(&np->lock); +} + +static int +nvmft_info(void *arg, struct sbuf *sb) +{ + struct nvmft_port *np = arg; + struct nvmft_controller *ctrlr; + int retval; + + mtx_lock(&np->lock); + retval = sbuf_printf(sb, "\t<port>%s,p,%u</port>\n", np->cdata.subnqn, + np->portid); + if (retval != 0) + goto out; + + TAILQ_FOREACH(ctrlr, &np->controllers, link) { + retval = sbuf_printf(sb, "\t<host id=\"%u\">%s</host>\n", + ctrlr->cntlid, ctrlr->hostnqn); + if (retval != 0) + break; + } +out: + mtx_unlock(&np->lock); + return (retval); } static int @@ -97,7 +126,7 @@ nvmft_lun_enable(void *arg, int lun_id) struct nvmft_controller *ctrlr; uint32_t *old_ns, *new_ns; uint32_t nsid; - u_int i; + u_int i, new_count; if (lun_id >= le32toh(np->cdata.nn)) { printf("NVMFT: %s lun %d larger than maximum nsid %u\n", @@ -106,14 +135,22 @@ nvmft_lun_enable(void *arg, int lun_id) } nsid = lun_id + 1; - sx_xlock(&np->lock); - new_ns = mallocarray(np->num_ns + 1, sizeof(*new_ns), M_NVMFT, - M_WAITOK); + mtx_lock(&np->lock); + for (;;) { + new_count = np->num_ns + 1; + mtx_unlock(&np->lock); + new_ns = mallocarray(new_count, sizeof(*new_ns), M_NVMFT, + M_WAITOK); + mtx_lock(&np->lock); + if (np->num_ns + 1 <= new_count) + break; + free(new_ns, M_NVMFT); + } for (i = 0; i < np->num_ns; i++) { if (np->active_ns[i] < nsid) continue; if (np->active_ns[i] == nsid) { - sx_xunlock(&np->lock); + mtx_unlock(&np->lock); free(new_ns, M_NVMFT); printf("NVMFT: %s duplicate lun %d\n", np->cdata.subnqn, lun_id); @@ -140,7 +177,7 @@ nvmft_lun_enable(void *arg, int lun_id) nvmft_controller_lun_changed(ctrlr, lun_id); } - sx_xunlock(&np->lock); + mtx_unlock(&np->lock); free(old_ns, M_NVMFT); return (0); @@ -158,12 +195,12 @@ nvmft_lun_disable(void *arg, int lun_id) return (0); nsid = lun_id + 1; - sx_xlock(&np->lock); + mtx_lock(&np->lock); for (i = 0; i < np->num_ns; i++) { if (np->active_ns[i] == nsid) goto found; } - sx_xunlock(&np->lock); + 
mtx_unlock(&np->lock); printf("NVMFT: %s request to disable nonexistent lun %d\n", np->cdata.subnqn, lun_id); return (EINVAL); @@ -180,7 +217,7 @@ found: nvmft_controller_lun_changed(ctrlr, lun_id); } - sx_xunlock(&np->lock); + mtx_unlock(&np->lock); return (0); } @@ -191,7 +228,7 @@ nvmft_populate_active_nslist(struct nvmft_port *np, uint32_t nsid, { u_int i, count; - sx_slock(&np->lock); + mtx_lock(&np->lock); count = 0; for (i = 0; i < np->num_ns; i++) { if (np->active_ns[i] <= nsid) @@ -201,7 +238,7 @@ nvmft_populate_active_nslist(struct nvmft_port *np, uint32_t nsid, if (count == nitems(nslist->ns)) break; } - sx_sunlock(&np->lock); + mtx_unlock(&np->lock); } void @@ -458,8 +495,8 @@ nvmft_datamove_in(struct ctl_nvmeio *ctnio, struct nvmft_qpair *qp, ctl_datamove_done((union ctl_io *)ctnio, true); } -static void -nvmft_datamove(union ctl_io *io) +void +nvmft_handle_datamove(union ctl_io *io) { struct nvmf_capsule *nc; struct nvmft_qpair *qp; @@ -478,6 +515,35 @@ nvmft_datamove(union ctl_io *io) nvmft_datamove_out(&io->nvmeio, qp, nc); } +void +nvmft_abort_datamove(union ctl_io *io) +{ + io->io_hdr.port_status = 1; + io->io_hdr.flags |= CTL_FLAG_ABORT; + ctl_datamove_done(io, true); +} + +static void +nvmft_datamove(union ctl_io *io) +{ + struct nvmft_qpair *qp; + + qp = NVMFT_QP(io); + nvmft_qpair_datamove(qp, io); +} + +void +nvmft_enqueue_task(struct task *task) +{ + taskqueue_enqueue(nvmft_taskq, task); +} + +void +nvmft_drain_task(struct task *task) +{ + taskqueue_drain(nvmft_taskq, task); +} + static void hip_add(uint64_t pair[2], uint64_t addend) { @@ -561,6 +627,17 @@ end: static int nvmft_init(void) { + int error; + + nvmft_taskq = taskqueue_create("nvmft", M_WAITOK, + taskqueue_thread_enqueue, &nvmft_taskq); + error = taskqueue_start_threads_in_proc(&nvmft_taskq, mp_ncpus, PWAIT, + control_softc->ctl_proc, "nvmft"); + if (error != 0) { + taskqueue_free(nvmft_taskq); + return (error); + } + TAILQ_INIT(&nvmft_ports); sx_init(&nvmft_ports_lock, "nvmft ports"); return (0); @@ -580,7 +657,7 @@ nvmft_port_free(struct nvmft_port *np) free(np->active_ns, M_NVMFT); clean_unrhdr(np->ids); delete_unrhdr(np->ids); - sx_destroy(&np->lock); + mtx_destroy(&np->lock); free(np, M_NVMFT); } @@ -750,9 +827,10 @@ nvmft_port_create(struct ctl_req *req) np = malloc(sizeof(*np), M_NVMFT, M_WAITOK | M_ZERO); refcount_init(&np->refs, 1); + np->portid = portid; np->max_io_qsize = max_io_qsize; np->cap = _nvmf_controller_cap(max_io_qsize, enable_timeout / 500); - sx_init(&np->lock, "nvmft port"); + mtx_init(&np->lock, "nvmft port", NULL, MTX_DEF); np->ids = new_unrhdr(0, MIN(CTL_MAX_INIT_PER_PORT - 1, NVMF_CNTLID_STATIC_MAX), UNR_NO_MTX); TAILQ_INIT(&np->controllers); @@ -781,6 +859,7 @@ nvmft_port_create(struct ctl_req *req) port->virtual_port = 0; port->port_online = nvmft_online; port->port_offline = nvmft_offline; + port->port_info = nvmft_info; port->onoff_arg = np; port->lun_enable = nvmft_lun_enable; port->lun_disable = nvmft_lun_disable; @@ -870,7 +949,13 @@ nvmft_port_remove(struct ctl_req *req) TAILQ_REMOVE(&nvmft_ports, np, link); sx_xunlock(&nvmft_ports_lock); - ctl_port_offline(&np->port); + mtx_lock(&np->lock); + if (np->online) { + mtx_unlock(&np->lock); + ctl_port_offline(&np->port); + } else + mtx_unlock(&np->lock); + nvmft_port_rele(np); req->status = CTL_LUN_OK; } @@ -878,29 +963,55 @@ nvmft_port_remove(struct ctl_req *req) static void nvmft_handoff(struct ctl_nvmf *cn) { - struct nvmf_fabric_connect_cmd cmd; - struct nvmf_handoff_controller_qpair *handoff; - struct 
nvmf_fabric_connect_data *data; + const struct nvmf_fabric_connect_cmd *cmd; + const struct nvmf_fabric_connect_data *data; + const nvlist_t *params; struct nvmft_port *np; + nvlist_t *nvl; + size_t len; + enum nvmf_trtype trtype; int error; np = NULL; - data = NULL; - handoff = &cn->data.handoff; - error = copyin(handoff->cmd, &cmd, sizeof(cmd)); + error = nvmf_unpack_ioc_nvlist(&cn->data.handoff, &nvl); if (error != 0) { cn->status = CTL_NVMF_ERROR; snprintf(cn->error_str, sizeof(cn->error_str), - "Failed to copyin CONNECT SQE"); + "Failed to copyin and unpack handoff arguments"); return; } - data = malloc(sizeof(*data), M_NVMFT, M_WAITOK); - error = copyin(handoff->data, data, sizeof(*data)); - if (error != 0) { + if (!nvlist_exists_number(nvl, "trtype") || + !nvlist_exists_nvlist(nvl, "params") || + !nvlist_exists_binary(nvl, "cmd") || + !nvlist_exists_binary(nvl, "data")) { cn->status = CTL_NVMF_ERROR; snprintf(cn->error_str, sizeof(cn->error_str), - "Failed to copyin CONNECT data"); + "Handoff arguments missing required value"); + goto out; + } + + params = nvlist_get_nvlist(nvl, "params"); + if (!nvmf_validate_qpair_nvlist(params, true)) { + cn->status = CTL_NVMF_ERROR; + snprintf(cn->error_str, sizeof(cn->error_str), + "Invalid queue pair parameters"); + goto out; + } + + cmd = nvlist_get_binary(nvl, "cmd", &len); + if (len != sizeof(*cmd)) { + cn->status = CTL_NVMF_ERROR; + snprintf(cn->error_str, sizeof(cn->error_str), + "Wrong size for CONNECT SQE"); + goto out; + } + + data = nvlist_get_binary(nvl, "data", &len); + if (len != sizeof(*data)) { + cn->status = CTL_NVMF_ERROR; + snprintf(cn->error_str, sizeof(cn->error_str), + "Wrong size for CONNECT data"); goto out; } @@ -931,8 +1042,10 @@ nvmft_handoff(struct ctl_nvmf *cn) nvmft_port_ref(np); sx_sunlock(&nvmft_ports_lock); - if (handoff->params.admin) { - error = nvmft_handoff_admin_queue(np, handoff, &cmd, data); + trtype = nvlist_get_number(nvl, "trtype"); + if (nvlist_get_bool(params, "admin")) { + error = nvmft_handoff_admin_queue(np, trtype, params, cmd, + data); if (error != 0) { cn->status = CTL_NVMF_ERROR; snprintf(cn->error_str, sizeof(cn->error_str), @@ -940,11 +1053,11 @@ nvmft_handoff(struct ctl_nvmf *cn) goto out; } } else { - error = nvmft_handoff_io_queue(np, handoff, &cmd, data); + error = nvmft_handoff_io_queue(np, trtype, params, cmd, data); if (error != 0) { cn->status = CTL_NVMF_ERROR; snprintf(cn->error_str, sizeof(cn->error_str), - "Failed to handoff admin queue: %d", error); + "Failed to handoff I/O queue: %d", error); goto out; } } @@ -953,7 +1066,7 @@ nvmft_handoff(struct ctl_nvmf *cn) out: if (np != NULL) nvmft_port_rele(np); - free(data, M_NVMFT); + nvlist_destroy(nvl); } static void @@ -979,7 +1092,7 @@ nvmft_list(struct ctl_nvmf *cn) sbuf_printf(sb, "<ctlnvmflist>\n"); sx_slock(&nvmft_ports_lock); TAILQ_FOREACH(np, &nvmft_ports, link) { - sx_slock(&np->lock); + mtx_lock(&np->lock); TAILQ_FOREACH(ctrlr, &np->controllers, link) { sbuf_printf(sb, "<connection id=\"%d\">" "<hostnqn>%s</hostnqn>" @@ -991,7 +1104,7 @@ nvmft_list(struct ctl_nvmf *cn) np->cdata.subnqn, ctrlr->trtype); } - sx_sunlock(&np->lock); + mtx_unlock(&np->lock); } sx_sunlock(&nvmft_ports_lock); sbuf_printf(sb, "</ctlnvmflist>\n"); @@ -1029,7 +1142,7 @@ nvmft_terminate(struct ctl_nvmf *cn) found = false; sx_slock(&nvmft_ports_lock); TAILQ_FOREACH(np, &nvmft_ports, link) { - sx_slock(&np->lock); + mtx_lock(&np->lock); TAILQ_FOREACH(ctrlr, &np->controllers, link) { if (tp->all != 0) match = true; @@ -1047,7 +1160,7 @@ 
nvmft_terminate(struct ctl_nvmf *cn) nvmft_controller_error(ctrlr, NULL, ECONNABORTED); found = true; } - sx_sunlock(&np->lock); + mtx_unlock(&np->lock); } sx_sunlock(&nvmft_ports_lock); @@ -1115,6 +1228,7 @@ nvmft_shutdown(void) if (!TAILQ_EMPTY(&nvmft_ports)) return (EBUSY); + taskqueue_free(nvmft_taskq); sx_destroy(&nvmft_ports_lock); return (0); } diff --git a/sys/dev/nvmf/controller/nvmft_controller.c b/sys/dev/nvmf/controller/nvmft_controller.c index f3783eac1275..390467534ca2 100644 --- a/sys/dev/nvmf/controller/nvmft_controller.c +++ b/sys/dev/nvmf/controller/nvmft_controller.c @@ -14,7 +14,6 @@ #include <sys/memdesc.h> #include <sys/mutex.h> #include <sys/sbuf.h> -#include <sys/sx.h> #include <sys/taskqueue.h> #include <dev/nvmf/nvmf_transport.h> @@ -55,8 +54,6 @@ nvmft_controller_alloc(struct nvmft_port *np, uint16_t cntlid, ctrlr = malloc(sizeof(*ctrlr), M_NVMFT, M_WAITOK | M_ZERO); ctrlr->cntlid = cntlid; - nvmft_port_ref(np); - TAILQ_INSERT_TAIL(&np->controllers, ctrlr, link); ctrlr->np = np; mtx_init(&ctrlr->lock, "nvmft controller", NULL, MTX_DEF); callout_init(&ctrlr->ka_timer, 1); @@ -107,9 +104,8 @@ nvmft_keep_alive_timer(void *arg) } int -nvmft_handoff_admin_queue(struct nvmft_port *np, - const struct nvmf_handoff_controller_qpair *handoff, - const struct nvmf_fabric_connect_cmd *cmd, +nvmft_handoff_admin_queue(struct nvmft_port *np, enum nvmf_trtype trtype, + const nvlist_t *params, const struct nvmf_fabric_connect_cmd *cmd, const struct nvmf_fabric_connect_data *data) { struct nvmft_controller *ctrlr; @@ -120,13 +116,17 @@ nvmft_handoff_admin_queue(struct nvmft_port *np, if (cmd->qid != htole16(0)) return (EINVAL); - qp = nvmft_qpair_init(handoff->trtype, &handoff->params, 0, - "admin queue"); + qp = nvmft_qpair_init(trtype, params, 0, "admin queue"); + if (qp == NULL) { + printf("NVMFT: Failed to setup admin queue from %.*s\n", + (int)sizeof(data->hostnqn), data->hostnqn); + return (ENXIO); + } - sx_xlock(&np->lock); + mtx_lock(&np->lock); cntlid = alloc_unr(np->ids); if (cntlid == -1) { - sx_xunlock(&np->lock); + mtx_unlock(&np->lock); printf("NVMFT: Unable to allocate controller for %.*s\n", (int)sizeof(data->hostnqn), data->hostnqn); nvmft_connect_error(qp, cmd, NVME_SCT_COMMAND_SPECIFIC, @@ -141,12 +141,25 @@ nvmft_handoff_admin_queue(struct nvmft_port *np, ("%s: duplicate controllers with id %d", __func__, cntlid)); } #endif + mtx_unlock(&np->lock); ctrlr = nvmft_controller_alloc(np, cntlid, data); + + mtx_lock(&np->lock); + if (!np->online) { + mtx_unlock(&np->lock); + nvmft_controller_free(ctrlr); + free_unr(np->ids, cntlid); + nvmft_qpair_destroy(qp); + return (ENXIO); + } + nvmft_port_ref(np); + TAILQ_INSERT_TAIL(&np->controllers, ctrlr, link); + nvmft_printf(ctrlr, "associated with %.*s\n", (int)sizeof(data->hostnqn), data->hostnqn); ctrlr->admin = qp; - ctrlr->trtype = handoff->trtype; + ctrlr->trtype = trtype; /* * The spec requires a non-zero KeepAlive timer, but allow a @@ -162,17 +175,16 @@ nvmft_handoff_admin_queue(struct nvmft_port *np, callout_reset_sbt(&ctrlr->ka_timer, ctrlr->ka_sbt, 0, nvmft_keep_alive_timer, ctrlr, C_HARDCLOCK); } + mtx_unlock(&np->lock); nvmft_finish_accept(qp, cmd, ctrlr); - sx_xunlock(&np->lock); return (0); } int -nvmft_handoff_io_queue(struct nvmft_port *np, - const struct nvmf_handoff_controller_qpair *handoff, - const struct nvmf_fabric_connect_cmd *cmd, +nvmft_handoff_io_queue(struct nvmft_port *np, enum nvmf_trtype trtype, + const nvlist_t *params, const struct nvmf_fabric_connect_cmd *cmd, const struct 
nvmf_fabric_connect_data *data) { struct nvmft_controller *ctrlr; @@ -186,15 +198,20 @@ nvmft_handoff_io_queue(struct nvmft_port *np, cntlid = le16toh(data->cntlid); snprintf(name, sizeof(name), "I/O queue %u", qid); - qp = nvmft_qpair_init(handoff->trtype, &handoff->params, qid, name); + qp = nvmft_qpair_init(trtype, params, qid, name); + if (qp == NULL) { + printf("NVMFT: Failed to setup I/O queue %u from %.*s\n", qid, + (int)sizeof(data->hostnqn), data->hostnqn); + return (ENXIO); + } - sx_slock(&np->lock); + mtx_lock(&np->lock); TAILQ_FOREACH(ctrlr, &np->controllers, link) { if (ctrlr->cntlid == cntlid) break; } if (ctrlr == NULL) { - sx_sunlock(&np->lock); + mtx_unlock(&np->lock); printf("NVMFT: Nonexistent controller %u for I/O queue %u from %.*s\n", ctrlr->cntlid, qid, (int)sizeof(data->hostnqn), data->hostnqn); @@ -205,7 +222,7 @@ nvmft_handoff_io_queue(struct nvmft_port *np, } if (memcmp(ctrlr->hostid, data->hostid, sizeof(ctrlr->hostid)) != 0) { - sx_sunlock(&np->lock); + mtx_unlock(&np->lock); nvmft_printf(ctrlr, "hostid mismatch for I/O queue %u from %.*s\n", qid, (int)sizeof(data->hostnqn), data->hostnqn); @@ -215,7 +232,7 @@ nvmft_handoff_io_queue(struct nvmft_port *np, return (EINVAL); } if (memcmp(ctrlr->hostnqn, data->hostnqn, sizeof(ctrlr->hostnqn)) != 0) { - sx_sunlock(&np->lock); + mtx_unlock(&np->lock); nvmft_printf(ctrlr, "hostnqn mismatch for I/O queue %u from %.*s\n", qid, (int)sizeof(data->hostnqn), data->hostnqn); @@ -225,12 +242,12 @@ nvmft_handoff_io_queue(struct nvmft_port *np, return (EINVAL); } - /* XXX: Require handoff->trtype == ctrlr->trtype? */ + /* XXX: Require trtype == ctrlr->trtype? */ mtx_lock(&ctrlr->lock); if (ctrlr->shutdown) { mtx_unlock(&ctrlr->lock); - sx_sunlock(&np->lock); + mtx_unlock(&np->lock); nvmft_printf(ctrlr, "attempt to create I/O queue %u on disabled controller from %.*s\n", qid, (int)sizeof(data->hostnqn), data->hostnqn); @@ -241,7 +258,7 @@ nvmft_handoff_io_queue(struct nvmft_port *np, } if (ctrlr->num_io_queues == 0) { mtx_unlock(&ctrlr->lock); - sx_sunlock(&np->lock); + mtx_unlock(&np->lock); nvmft_printf(ctrlr, "attempt to create I/O queue %u without enabled queues from %.*s\n", qid, (int)sizeof(data->hostnqn), data->hostnqn); @@ -252,7 +269,7 @@ nvmft_handoff_io_queue(struct nvmft_port *np, } if (cmd->qid > ctrlr->num_io_queues) { mtx_unlock(&ctrlr->lock); - sx_sunlock(&np->lock); + mtx_unlock(&np->lock); nvmft_printf(ctrlr, "attempt to create invalid I/O queue %u from %.*s\n", qid, (int)sizeof(data->hostnqn), data->hostnqn); @@ -263,7 +280,7 @@ nvmft_handoff_io_queue(struct nvmft_port *np, } if (ctrlr->io_qpairs[qid - 1].qp != NULL) { mtx_unlock(&ctrlr->lock); - sx_sunlock(&np->lock); + mtx_unlock(&np->lock); nvmft_printf(ctrlr, "attempt to re-create I/O queue %u from %.*s\n", qid, (int)sizeof(data->hostnqn), data->hostnqn); @@ -275,8 +292,8 @@ nvmft_handoff_io_queue(struct nvmft_port *np, ctrlr->io_qpairs[qid - 1].qp = qp; mtx_unlock(&ctrlr->lock); + mtx_unlock(&np->lock); nvmft_finish_accept(qp, cmd, ctrlr); - sx_sunlock(&np->lock); return (0); } @@ -375,11 +392,11 @@ nvmft_controller_terminate(void *arg, int pending) /* Remove association (CNTLID). 
*/ np = ctrlr->np; - sx_xlock(&np->lock); + mtx_lock(&np->lock); TAILQ_REMOVE(&np->controllers, ctrlr, link); - free_unr(np->ids, ctrlr->cntlid); wakeup_np = (!np->online && TAILQ_EMPTY(&np->controllers)); - sx_xunlock(&np->lock); + mtx_unlock(&np->lock); + free_unr(np->ids, ctrlr->cntlid); if (wakeup_np) wakeup(np); @@ -770,6 +787,7 @@ handle_set_features(struct nvmft_controller *ctrlr, ctrlr->aer_mask = aer_mask; mtx_unlock(&ctrlr->lock); nvmft_send_success(ctrlr->admin, nc); + nvmf_free_capsule(nc); return; } default: @@ -944,7 +962,7 @@ nvmft_handle_admin_command(struct nvmft_controller *ctrlr, if (NVMEV(NVME_CC_REG_EN, ctrlr->cc) == 0 && cmd->opc != NVME_OPC_FABRICS_COMMANDS) { nvmft_printf(ctrlr, - "Unsupported admin opcode %#x whiled disabled\n", cmd->opc); + "Unsupported admin opcode %#x while disabled\n", cmd->opc); nvmft_send_generic_error(ctrlr->admin, nc, NVME_SC_COMMAND_SEQUENCE_ERROR); nvmf_free_capsule(nc); diff --git a/sys/dev/nvmf/controller/nvmft_qpair.c b/sys/dev/nvmf/controller/nvmft_qpair.c index 6cb3ebd76884..73c7bb280780 100644 --- a/sys/dev/nvmf/controller/nvmft_qpair.c +++ b/sys/dev/nvmf/controller/nvmft_qpair.c @@ -31,9 +31,11 @@ struct nvmft_qpair { uint16_t qid; u_int qsize; uint16_t sqhd; - uint16_t sqtail; volatile u_int qp_refs; /* Internal references on 'qp'. */ + struct task datamove_task; + STAILQ_HEAD(, ctl_io_hdr) datamove_queue; + struct mtx lock; char name[16]; @@ -41,6 +43,7 @@ struct nvmft_qpair { static int _nvmft_send_generic_error(struct nvmft_qpair *qp, struct nvmf_capsule *nc, uint8_t sc_status); +static void nvmft_datamove_task(void *context, int pending); static void nvmft_qpair_error(void *arg, int error) @@ -98,24 +101,24 @@ nvmft_receive_capsule(void *arg, struct nvmf_capsule *nc) } struct nvmft_qpair * -nvmft_qpair_init(enum nvmf_trtype trtype, - const struct nvmf_handoff_qpair_params *handoff, uint16_t qid, +nvmft_qpair_init(enum nvmf_trtype trtype, const nvlist_t *params, uint16_t qid, const char *name) { struct nvmft_qpair *qp; qp = malloc(sizeof(*qp), M_NVMFT, M_WAITOK | M_ZERO); - qp->admin = handoff->admin; - qp->sq_flow_control = handoff->sq_flow_control; - qp->qsize = handoff->qsize; + qp->admin = nvlist_get_bool(params, "admin"); + qp->sq_flow_control = nvlist_get_bool(params, "sq_flow_control"); + qp->qsize = nvlist_get_number(params, "qsize"); qp->qid = qid; - qp->sqhd = handoff->sqhd; - qp->sqtail = handoff->sqtail; + qp->sqhd = nvlist_get_number(params, "sqhd"); strlcpy(qp->name, name, sizeof(qp->name)); mtx_init(&qp->lock, "nvmft qp", NULL, MTX_DEF); qp->cids = BITSET_ALLOC(NUM_CIDS, M_NVMFT, M_WAITOK | M_ZERO); + STAILQ_INIT(&qp->datamove_queue); + TASK_INIT(&qp->datamove_task, 0, nvmft_datamove_task, qp); - qp->qp = nvmf_allocate_qpair(trtype, true, handoff, nvmft_qpair_error, + qp->qp = nvmf_allocate_qpair(trtype, true, params, nvmft_qpair_error, qp, nvmft_receive_capsule, qp); if (qp->qp == NULL) { mtx_destroy(&qp->lock); @@ -131,14 +134,25 @@ nvmft_qpair_init(enum nvmf_trtype trtype, void nvmft_qpair_shutdown(struct nvmft_qpair *qp) { + STAILQ_HEAD(, ctl_io_hdr) datamove_queue; struct nvmf_qpair *nq; + union ctl_io *io; + STAILQ_INIT(&datamove_queue); mtx_lock(&qp->lock); nq = qp->qp; qp->qp = NULL; + STAILQ_CONCAT(&datamove_queue, &qp->datamove_queue); mtx_unlock(&qp->lock); if (nq != NULL && refcount_release(&qp->qp_refs)) nvmf_free_qpair(nq); + + while (!STAILQ_EMPTY(&datamove_queue)) { + io = (union ctl_io *)STAILQ_FIRST(&datamove_queue); + STAILQ_REMOVE_HEAD(&datamove_queue, links); + nvmft_abort_datamove(io); + } + 
nvmft_drain_task(&qp->datamove_task); } void @@ -359,3 +373,43 @@ nvmft_finish_accept(struct nvmft_qpair *qp, rsp.status_code_specific.success.cntlid = htole16(ctrlr->cntlid); return (nvmft_send_connect_response(qp, &rsp)); } + +void +nvmft_qpair_datamove(struct nvmft_qpair *qp, union ctl_io *io) +{ + bool enqueue_task; + + mtx_lock(&qp->lock); + if (qp->qp == NULL) { + mtx_unlock(&qp->lock); + nvmft_abort_datamove(io); + return; + } + enqueue_task = STAILQ_EMPTY(&qp->datamove_queue); + STAILQ_INSERT_TAIL(&qp->datamove_queue, &io->io_hdr, links); + mtx_unlock(&qp->lock); + if (enqueue_task) + nvmft_enqueue_task(&qp->datamove_task); +} + +static void +nvmft_datamove_task(void *context, int pending __unused) +{ + struct nvmft_qpair *qp = context; + union ctl_io *io; + bool abort; + + mtx_lock(&qp->lock); + while (!STAILQ_EMPTY(&qp->datamove_queue)) { + io = (union ctl_io *)STAILQ_FIRST(&qp->datamove_queue); + STAILQ_REMOVE_HEAD(&qp->datamove_queue, links); + abort = (qp->qp == NULL); + mtx_unlock(&qp->lock); + if (abort) + nvmft_abort_datamove(io); + else + nvmft_handle_datamove(io); + mtx_lock(&qp->lock); + } + mtx_unlock(&qp->lock); +} diff --git a/sys/dev/nvmf/controller/nvmft_subr.c b/sys/dev/nvmf/controller/nvmft_subr.c index bb2bc0988e81..245971813854 100644 --- a/sys/dev/nvmf/controller/nvmft_subr.c +++ b/sys/dev/nvmf/controller/nvmft_subr.c @@ -26,46 +26,6 @@ nvmf_nqn_valid(const char *nqn) len = strnlen(nqn, NVME_NQN_FIELD_SIZE); if (len == 0 || len > NVMF_NQN_MAX_LEN) return (false); - -#ifdef STRICT_CHECKS - /* - * Stricter checks from the spec. Linux does not seem to - * require these. - */ - - /* - * NVMF_NQN_MIN_LEN does not include '.', and require at least - * one character of a domain name. - */ - if (len < NVMF_NQN_MIN_LEN + 2) - return (false); - if (memcmp("nqn.", nqn, strlen("nqn.")) != 0) - return (false); - nqn += strlen("nqn."); - - /* Next 4 digits must be a year. */ - for (u_int i = 0; i < 4; i++) { - if (!isdigit(nqn[i])) - return (false); - } - nqn += 4; - - /* '-' between year and month. */ - if (nqn[0] != '-') - return (false); - nqn++; - - /* 2 digit month. */ - for (u_int i = 0; i < 2; i++) { - if (!isdigit(nqn[i])) - return (false); - } - nqn += 2; - - /* '.' between month and reverse domain name. 
*/ - if (nqn[0] != '.') - return (false); -#endif return (true); } diff --git a/sys/dev/nvmf/controller/nvmft_var.h b/sys/dev/nvmf/controller/nvmft_var.h index fc1f86754382..85032b2dc55f 100644 --- a/sys/dev/nvmf/controller/nvmft_var.h +++ b/sys/dev/nvmf/controller/nvmft_var.h @@ -9,6 +9,7 @@ #define __NVMFT_VAR_H__ #include <sys/_callout.h> +#include <sys/_nv.h> #include <sys/refcount.h> #include <sys/taskqueue.h> @@ -32,9 +33,10 @@ struct nvmft_port { struct nvme_firmware_page fp; uint64_t cap; uint32_t max_io_qsize; + uint16_t portid; bool online; - struct sx lock; + struct mtx lock; struct unrhdr *ids; TAILQ_HEAD(, nvmft_controller) controllers; @@ -110,6 +112,10 @@ void nvmft_populate_active_nslist(struct nvmft_port *np, uint32_t nsid, void nvmft_dispatch_command(struct nvmft_qpair *qp, struct nvmf_capsule *nc, bool admin); void nvmft_terminate_commands(struct nvmft_controller *ctrlr); +void nvmft_abort_datamove(union ctl_io *io); +void nvmft_handle_datamove(union ctl_io *io); +void nvmft_drain_task(struct task *task); +void nvmft_enqueue_task(struct task *task); /* nvmft_controller.c */ void nvmft_controller_error(struct nvmft_controller *ctrlr, @@ -121,23 +127,22 @@ void nvmft_handle_admin_command(struct nvmft_controller *ctrlr, void nvmft_handle_io_command(struct nvmft_qpair *qp, uint16_t qid, struct nvmf_capsule *nc); int nvmft_handoff_admin_queue(struct nvmft_port *np, - const struct nvmf_handoff_controller_qpair *handoff, + enum nvmf_trtype trtype, const nvlist_t *params, const struct nvmf_fabric_connect_cmd *cmd, const struct nvmf_fabric_connect_data *data); -int nvmft_handoff_io_queue(struct nvmft_port *np, - const struct nvmf_handoff_controller_qpair *handoff, - const struct nvmf_fabric_connect_cmd *cmd, +int nvmft_handoff_io_queue(struct nvmft_port *np, enum nvmf_trtype trtype, + const nvlist_t *params, const struct nvmf_fabric_connect_cmd *cmd, const struct nvmf_fabric_connect_data *data); int nvmft_printf(struct nvmft_controller *ctrlr, const char *fmt, ...) 
__printflike(2, 3); /* nvmft_qpair.c */ struct nvmft_qpair *nvmft_qpair_init(enum nvmf_trtype trtype, - const struct nvmf_handoff_qpair_params *handoff, uint16_t qid, - const char *name); + const nvlist_t *params, uint16_t qid, const char *name); void nvmft_qpair_shutdown(struct nvmft_qpair *qp); void nvmft_qpair_destroy(struct nvmft_qpair *qp); struct nvmft_controller *nvmft_qpair_ctrlr(struct nvmft_qpair *qp); +void nvmft_qpair_datamove(struct nvmft_qpair *qp, union ctl_io *io); uint16_t nvmft_qpair_id(struct nvmft_qpair *qp); const char *nvmft_qpair_name(struct nvmft_qpair *qp); void nvmft_command_completed(struct nvmft_qpair *qp, diff --git a/sys/dev/nvmf/host/nvmf.c b/sys/dev/nvmf/host/nvmf.c index 0902bc78a7b5..1ac0d142443b 100644 --- a/sys/dev/nvmf/host/nvmf.c +++ b/sys/dev/nvmf/host/nvmf.c @@ -8,13 +8,18 @@ #include <sys/param.h> #include <sys/bus.h> #include <sys/conf.h> +#include <sys/dnv.h> +#include <sys/eventhandler.h> #include <sys/lock.h> #include <sys/kernel.h> #include <sys/malloc.h> #include <sys/memdesc.h> #include <sys/module.h> #include <sys/mutex.h> +#include <sys/nv.h> +#include <sys/reboot.h> #include <sys/sx.h> +#include <sys/sysctl.h> #include <sys/taskqueue.h> #include <dev/nvme/nvme.h> #include <dev/nvmf/nvmf.h> @@ -22,10 +27,20 @@ #include <dev/nvmf/host/nvmf_var.h> static struct cdevsw nvmf_cdevsw; +static struct taskqueue *nvmf_tq; + +bool nvmf_fail_disconnect = false; +SYSCTL_BOOL(_kern_nvmf, OID_AUTO, fail_on_disconnection, CTLFLAG_RWTUN, + &nvmf_fail_disconnect, 0, "Fail I/O requests on connection failure"); MALLOC_DEFINE(M_NVMF, "nvmf", "NVMe over Fabrics host"); +static void nvmf_controller_loss_task(void *arg, int pending); static void nvmf_disconnect_task(void *arg, int pending); +static void nvmf_request_reconnect(struct nvmf_softc *sc); +static void nvmf_request_reconnect_task(void *arg, int pending); +static void nvmf_shutdown_pre_sync(void *arg, int howto); +static void nvmf_shutdown_post_sync(void *arg, int howto); void nvmf_complete(void *arg, const struct nvme_completion *cqe) @@ -187,104 +202,132 @@ nvmf_send_keep_alive(void *arg) } int -nvmf_init_ivars(struct nvmf_ivars *ivars, struct nvmf_handoff_host *hh) +nvmf_copyin_handoff(const struct nvmf_ioc_nv *nv, nvlist_t **nvlp) { - size_t len; - u_int i; + const struct nvme_discovery_log_entry *dle; + const struct nvme_controller_data *cdata; + const nvlist_t *const *io; + const nvlist_t *admin, *rparams; + nvlist_t *nvl; + size_t i, num_io_queues; + uint32_t qsize; int error; - memset(ivars, 0, sizeof(*ivars)); - - if (!hh->admin.admin || hh->num_io_queues < 1) - return (EINVAL); - - ivars->cdata = malloc(sizeof(*ivars->cdata), M_NVMF, M_WAITOK); - error = copyin(hh->cdata, ivars->cdata, sizeof(*ivars->cdata)); + error = nvmf_unpack_ioc_nvlist(nv, &nvl); if (error != 0) - goto out; - nvme_controller_data_swapbytes(ivars->cdata); + return (error); - len = hh->num_io_queues * sizeof(*ivars->io_params); - ivars->io_params = malloc(len, M_NVMF, M_WAITOK); - error = copyin(hh->io, ivars->io_params, len); - if (error != 0) - goto out; - for (i = 0; i < hh->num_io_queues; i++) { - if (ivars->io_params[i].admin) { - error = EINVAL; - goto out; - } + if (!nvlist_exists_number(nvl, "trtype") || + !nvlist_exists_nvlist(nvl, "admin") || + !nvlist_exists_nvlist_array(nvl, "io") || + !nvlist_exists_binary(nvl, "cdata") || + !nvlist_exists_nvlist(nvl, "rparams")) + goto invalid; + + rparams = nvlist_get_nvlist(nvl, "rparams"); + if (!nvlist_exists_binary(rparams, "dle") || + !nvlist_exists_string(rparams, 
"hostnqn") || + !nvlist_exists_number(rparams, "num_io_queues") || + !nvlist_exists_number(rparams, "io_qsize")) + goto invalid; + + admin = nvlist_get_nvlist(nvl, "admin"); + if (!nvmf_validate_qpair_nvlist(admin, false)) + goto invalid; + if (!nvlist_get_bool(admin, "admin")) + goto invalid; + + io = nvlist_get_nvlist_array(nvl, "io", &num_io_queues); + if (num_io_queues < 1 || + num_io_queues != nvlist_get_number(rparams, "num_io_queues")) + goto invalid; + for (i = 0; i < num_io_queues; i++) { + if (!nvmf_validate_qpair_nvlist(io[i], false)) + goto invalid; + } - /* Require all I/O queues to be the same size. */ - if (ivars->io_params[i].qsize != ivars->io_params[0].qsize) { - error = EINVAL; - goto out; - } + /* Require all I/O queues to be the same size. */ + qsize = nvlist_get_number(rparams, "io_qsize"); + for (i = 0; i < num_io_queues; i++) { + if (nvlist_get_number(io[i], "qsize") != qsize) + goto invalid; } - ivars->hh = hh; - return (0); + cdata = nvlist_get_binary(nvl, "cdata", &i); + if (i != sizeof(*cdata)) + goto invalid; + dle = nvlist_get_binary(rparams, "dle", &i); + if (i != sizeof(*dle)) + goto invalid; -out: - free(ivars->io_params, M_NVMF); - free(ivars->cdata, M_NVMF); - return (error); -} + if (memcmp(dle->subnqn, cdata->subnqn, sizeof(cdata->subnqn)) != 0) + goto invalid; -void -nvmf_free_ivars(struct nvmf_ivars *ivars) -{ - free(ivars->io_params, M_NVMF); - free(ivars->cdata, M_NVMF); + *nvlp = nvl; + return (0); +invalid: + nvlist_destroy(nvl); + return (EINVAL); } static int nvmf_probe(device_t dev) { - struct nvmf_ivars *ivars = device_get_ivars(dev); - char desc[260]; + const nvlist_t *nvl = device_get_ivars(dev); + const struct nvme_controller_data *cdata; - if (ivars == NULL) + if (nvl == NULL) return (ENXIO); - snprintf(desc, sizeof(desc), "Fabrics: %.256s", ivars->cdata->subnqn); - device_set_desc_copy(dev, desc); + cdata = nvlist_get_binary(nvl, "cdata", NULL); + device_set_descf(dev, "Fabrics: %.256s", cdata->subnqn); return (BUS_PROBE_DEFAULT); } static int -nvmf_establish_connection(struct nvmf_softc *sc, struct nvmf_ivars *ivars) +nvmf_establish_connection(struct nvmf_softc *sc, nvlist_t *nvl) { + const nvlist_t *const *io; + const nvlist_t *admin; + uint64_t kato; + size_t num_io_queues; + enum nvmf_trtype trtype; char name[16]; + trtype = nvlist_get_number(nvl, "trtype"); + admin = nvlist_get_nvlist(nvl, "admin"); + io = nvlist_get_nvlist_array(nvl, "io", &num_io_queues); + kato = dnvlist_get_number(nvl, "kato", 0); + sc->reconnect_delay = dnvlist_get_number(nvl, "reconnect_delay", 0); + sc->controller_loss_timeout = dnvlist_get_number(nvl, + "controller_loss_timeout", 0); + /* Setup the admin queue. */ - sc->admin = nvmf_init_qp(sc, ivars->hh->trtype, &ivars->hh->admin, - "admin queue"); + sc->admin = nvmf_init_qp(sc, trtype, admin, "admin queue", 0); if (sc->admin == NULL) { device_printf(sc->dev, "Failed to setup admin queue\n"); return (ENXIO); } /* Setup I/O queues. 
*/ - sc->io = malloc(ivars->hh->num_io_queues * sizeof(*sc->io), M_NVMF, + sc->io = malloc(num_io_queues * sizeof(*sc->io), M_NVMF, M_WAITOK | M_ZERO); - sc->num_io_queues = ivars->hh->num_io_queues; + sc->num_io_queues = num_io_queues; for (u_int i = 0; i < sc->num_io_queues; i++) { snprintf(name, sizeof(name), "I/O queue %u", i); - sc->io[i] = nvmf_init_qp(sc, ivars->hh->trtype, - &ivars->io_params[i], name); + sc->io[i] = nvmf_init_qp(sc, trtype, io[i], name, i); if (sc->io[i] == NULL) { device_printf(sc->dev, "Failed to setup I/O queue %u\n", - i + 1); + i); return (ENXIO); } } /* Start KeepAlive timers. */ - if (ivars->hh->kato != 0) { + if (kato != 0) { sc->ka_traffic = NVMEV(NVME_CTRLR_DATA_CTRATT_TBKAS, sc->cdata->ctratt) != 0; - sc->ka_rx_sbt = mstosbt(ivars->hh->kato); + sc->ka_rx_sbt = mstosbt(kato); sc->ka_tx_sbt = sc->ka_rx_sbt / 2; callout_reset_sbt(&sc->ka_rx_timer, sc->ka_rx_sbt, 0, nvmf_check_keep_alive, sc, C_HARDCLOCK); @@ -292,12 +335,23 @@ nvmf_establish_connection(struct nvmf_softc *sc, struct nvmf_ivars *ivars) nvmf_send_keep_alive, sc, C_HARDCLOCK); } + memcpy(sc->cdata, nvlist_get_binary(nvl, "cdata", NULL), + sizeof(*sc->cdata)); + + /* Save reconnect parameters. */ + nvlist_destroy(sc->rparams); + sc->rparams = nvlist_take_nvlist(nvl, "rparams"); + return (0); } +typedef bool nvmf_scan_active_ns_cb(struct nvmf_softc *, uint32_t, + const struct nvme_namespace_data *, void *); + static bool -nvmf_scan_nslist(struct nvmf_softc *sc, struct nvme_ns_list *nslist, - struct nvme_namespace_data *data, uint32_t *nsidp) +nvmf_scan_active_nslist(struct nvmf_softc *sc, struct nvme_ns_list *nslist, + struct nvme_namespace_data *data, uint32_t *nsidp, + nvmf_scan_active_ns_cb *cb, void *cb_arg) { struct nvmf_completion_status status; uint32_t nsid; @@ -333,13 +387,6 @@ nvmf_scan_nslist(struct nvmf_softc *sc, struct nvme_ns_list *nslist, return (true); } - if (sc->ns[nsid - 1] != NULL) { - device_printf(sc->dev, - "duplicate namespace %u in active namespace list\n", - nsid); - return (false); - } - nvmf_status_init(&status); nvmf_status_wait_io(&status); if (!nvmf_cmd_identify_namespace(sc, nsid, data, nvmf_complete, @@ -365,49 +412,37 @@ nvmf_scan_nslist(struct nvmf_softc *sc, struct nvme_ns_list *nslist, return (false); } - /* - * As in nvme_ns_construct, a size of zero indicates an - * invalid namespace. 
- */ nvme_namespace_data_swapbytes(data); - if (data->nsze == 0) { - device_printf(sc->dev, - "ignoring active namespace %u with zero size\n", - nsid); - continue; - } - - sc->ns[nsid - 1] = nvmf_init_ns(sc, nsid, data); - - nvmf_sim_rescan_ns(sc, nsid); + if (!cb(sc, nsid, data, cb_arg)) + return (false); } MPASS(nsid == nslist->ns[nitems(nslist->ns) - 1] && nsid != 0); - if (nsid >= 0xfffffffd) + if (nsid >= NVME_GLOBAL_NAMESPACE_TAG - 1) *nsidp = 0; else - *nsidp = nsid + 1; + *nsidp = nsid; return (true); } static bool -nvmf_add_namespaces(struct nvmf_softc *sc) +nvmf_scan_active_namespaces(struct nvmf_softc *sc, nvmf_scan_active_ns_cb *cb, + void *cb_arg) { struct nvme_namespace_data *data; struct nvme_ns_list *nslist; uint32_t nsid; bool retval; - sc->ns = mallocarray(sc->cdata->nn, sizeof(*sc->ns), M_NVMF, - M_WAITOK | M_ZERO); nslist = malloc(sizeof(*nslist), M_NVMF, M_WAITOK); data = malloc(sizeof(*data), M_NVMF, M_WAITOK); nsid = 0; retval = true; for (;;) { - if (!nvmf_scan_nslist(sc, nslist, data, &nsid)) { + if (!nvmf_scan_active_nslist(sc, nslist, data, &nsid, cb, + cb_arg)) { retval = false; break; } @@ -420,36 +455,77 @@ nvmf_add_namespaces(struct nvmf_softc *sc) return (retval); } +static bool +nvmf_add_ns(struct nvmf_softc *sc, uint32_t nsid, + const struct nvme_namespace_data *data, void *arg __unused) +{ + if (sc->ns[nsid - 1] != NULL) { + device_printf(sc->dev, + "duplicate namespace %u in active namespace list\n", + nsid); + return (false); + } + + /* + * As in nvme_ns_construct, a size of zero indicates an + * invalid namespace. + */ + if (data->nsze == 0) { + device_printf(sc->dev, + "ignoring active namespace %u with zero size\n", nsid); + return (true); + } + + sc->ns[nsid - 1] = nvmf_init_ns(sc, nsid, data); + + nvmf_sim_rescan_ns(sc, nsid); + return (true); +} + +static bool +nvmf_add_namespaces(struct nvmf_softc *sc) +{ + sc->ns = mallocarray(sc->cdata->nn, sizeof(*sc->ns), M_NVMF, + M_WAITOK | M_ZERO); + return (nvmf_scan_active_namespaces(sc, nvmf_add_ns, NULL)); +} + static int nvmf_attach(device_t dev) { struct make_dev_args mda; struct nvmf_softc *sc = device_get_softc(dev); - struct nvmf_ivars *ivars = device_get_ivars(dev); + nvlist_t *nvl = device_get_ivars(dev); + const nvlist_t * const *io; + struct sysctl_oid *oid; uint64_t val; u_int i; int error; - if (ivars == NULL) + if (nvl == NULL) return (ENXIO); sc->dev = dev; - sc->trtype = ivars->hh->trtype; + sc->trtype = nvlist_get_number(nvl, "trtype"); callout_init(&sc->ka_rx_timer, 1); callout_init(&sc->ka_tx_timer, 1); sx_init(&sc->connection_lock, "nvmf connection"); TASK_INIT(&sc->disconnect_task, 0, nvmf_disconnect_task, sc); + TIMEOUT_TASK_INIT(nvmf_tq, &sc->controller_loss_task, 0, + nvmf_controller_loss_task, sc); + TIMEOUT_TASK_INIT(nvmf_tq, &sc->request_reconnect_task, 0, + nvmf_request_reconnect_task, sc); - /* Claim the cdata pointer from ivars. */ - sc->cdata = ivars->cdata; - ivars->cdata = NULL; + oid = SYSCTL_ADD_NODE(device_get_sysctl_ctx(dev), + SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "ioq", + CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "I/O Queues"); + sc->ioq_oid_list = SYSCTL_CHILDREN(oid); - nvmf_init_aer(sc); + sc->cdata = malloc(sizeof(*sc->cdata), M_NVMF, M_WAITOK); - /* TODO: Multiqueue support. 
*/ - sc->max_pending_io = ivars->io_params[0].qsize /* * sc->num_io_queues */; + nvmf_init_aer(sc); - error = nvmf_establish_connection(sc, ivars); + error = nvmf_establish_connection(sc, nvl); if (error != 0) goto out; @@ -476,6 +552,10 @@ nvmf_attach(device_t dev) NVME_CAP_HI_MPSMIN(sc->cap >> 32))); } + io = nvlist_get_nvlist_array(nvl, "io", NULL); + sc->max_pending_io = nvlist_get_number(io[0], "qsize") * + sc->num_io_queues; + error = nvmf_init_sim(sc); if (error != 0) goto out; @@ -503,6 +583,11 @@ nvmf_attach(device_t dev) goto out; } + sc->shutdown_pre_sync_eh = EVENTHANDLER_REGISTER(shutdown_pre_sync, + nvmf_shutdown_pre_sync, sc, SHUTDOWN_PRI_FIRST); + sc->shutdown_post_sync_eh = EVENTHANDLER_REGISTER(shutdown_post_sync, + nvmf_shutdown_post_sync, sc, SHUTDOWN_PRI_LAST); + return (0); out: if (sc->ns != NULL) { @@ -529,8 +614,11 @@ out: nvmf_destroy_aer(sc); - taskqueue_drain(taskqueue_thread, &sc->disconnect_task); + taskqueue_drain_timeout(nvmf_tq, &sc->request_reconnect_task); + taskqueue_drain_timeout(nvmf_tq, &sc->controller_loss_task); + taskqueue_drain(nvmf_tq, &sc->disconnect_task); sx_destroy(&sc->connection_lock); + nvlist_destroy(sc->rparams); free(sc->cdata, M_NVMF); return (error); } @@ -538,7 +626,7 @@ out: void nvmf_disconnect(struct nvmf_softc *sc) { - taskqueue_enqueue(taskqueue_thread, &sc->disconnect_task); + taskqueue_enqueue(nvmf_tq, &sc->disconnect_task); } static void @@ -579,6 +667,7 @@ nvmf_disconnect_task(void *arg, int pending __unused) return; } + nanotime(&sc->last_disconnect); callout_drain(&sc->ka_tx_timer); callout_drain(&sc->ka_rx_timer); sc->ka_traffic = false; @@ -600,29 +689,98 @@ nvmf_disconnect_task(void *arg, int pending __unused) nvmf_destroy_qp(sc->admin); sc->admin = NULL; + if (sc->reconnect_delay != 0) + nvmf_request_reconnect(sc); + if (sc->controller_loss_timeout != 0) + taskqueue_enqueue_timeout(nvmf_tq, + &sc->controller_loss_task, sc->controller_loss_timeout * + hz); + + sx_xunlock(&sc->connection_lock); +} + +static void +nvmf_controller_loss_task(void *arg, int pending) +{ + struct nvmf_softc *sc = arg; + device_t dev; + int error; + + bus_topo_lock(); + sx_xlock(&sc->connection_lock); + if (sc->admin != NULL || sc->detaching) { + /* Reconnected or already detaching. */ + sx_xunlock(&sc->connection_lock); + bus_topo_unlock(); + return; + } + + sc->controller_timedout = true; + sx_xunlock(&sc->connection_lock); + + /* + * XXX: Doing this from here is a bit ugly. We don't have an + * extra reference on `dev` but bus_topo_lock should block any + * concurrent device_delete_child invocations. + */ + dev = sc->dev; + error = device_delete_child(root_bus, dev); + if (error != 0) + device_printf(dev, + "failed to detach after controller loss: %d\n", error); + bus_topo_unlock(); +} + +static void +nvmf_request_reconnect(struct nvmf_softc *sc) +{ + char buf[64]; + + sx_assert(&sc->connection_lock, SX_LOCKED); + + snprintf(buf, sizeof(buf), "name=\"%s\"", device_get_nameunit(sc->dev)); + devctl_notify("nvme", "controller", "RECONNECT", buf); + taskqueue_enqueue_timeout(nvmf_tq, &sc->request_reconnect_task, + sc->reconnect_delay * hz); +} + +static void +nvmf_request_reconnect_task(void *arg, int pending) +{ + struct nvmf_softc *sc = arg; + + sx_xlock(&sc->connection_lock); + if (sc->admin != NULL || sc->detaching || sc->controller_timedout) { + /* Reconnected or already detaching. 
*/ + sx_xunlock(&sc->connection_lock); + return; + } + + nvmf_request_reconnect(sc); sx_xunlock(&sc->connection_lock); } static int -nvmf_reconnect_host(struct nvmf_softc *sc, struct nvmf_handoff_host *hh) +nvmf_reconnect_host(struct nvmf_softc *sc, struct nvmf_ioc_nv *nv) { - struct nvmf_ivars ivars; + const struct nvme_controller_data *cdata; + nvlist_t *nvl; u_int i; int error; + error = nvmf_copyin_handoff(nv, &nvl); + if (error != 0) + return (error); + /* XXX: Should we permit changing the transport type? */ - if (sc->trtype != hh->trtype) { + if (sc->trtype != nvlist_get_number(nvl, "trtype")) { device_printf(sc->dev, "transport type mismatch on reconnect\n"); return (EINVAL); } - error = nvmf_init_ivars(&ivars, hh); - if (error != 0) - return (error); - sx_xlock(&sc->connection_lock); - if (sc->admin != NULL || sc->detaching) { + if (sc->admin != NULL || sc->detaching || sc->controller_timedout) { error = EBUSY; goto out; } @@ -634,8 +792,9 @@ nvmf_reconnect_host(struct nvmf_softc *sc, struct nvmf_handoff_host *hh) * ensures the new association is connected to the same NVMe * subsystem. */ - if (memcmp(sc->cdata->subnqn, ivars.cdata->subnqn, - sizeof(ivars.cdata->subnqn)) != 0) { + cdata = nvlist_get_binary(nvl, "cdata", NULL); + if (memcmp(sc->cdata->subnqn, cdata->subnqn, + sizeof(cdata->subnqn)) != 0) { device_printf(sc->dev, "controller subsystem NQN mismatch on reconnect\n"); error = EINVAL; @@ -647,7 +806,7 @@ nvmf_reconnect_host(struct nvmf_softc *sc, struct nvmf_handoff_host *hh) * max_pending_io is still correct? */ - error = nvmf_establish_connection(sc, &ivars); + error = nvmf_establish_connection(sc, nvl); if (error != 0) goto out; @@ -665,12 +824,85 @@ nvmf_reconnect_host(struct nvmf_softc *sc, struct nvmf_handoff_host *hh) nvmf_reconnect_ns(sc->ns[i]); } nvmf_reconnect_sim(sc); + + nvmf_rescan_all_ns(sc); + + taskqueue_cancel_timeout(nvmf_tq, &sc->request_reconnect_task, NULL); + taskqueue_cancel_timeout(nvmf_tq, &sc->controller_loss_task, NULL); out: sx_xunlock(&sc->connection_lock); - nvmf_free_ivars(&ivars); + nvlist_destroy(nvl); return (error); } +static void +nvmf_shutdown_pre_sync(void *arg, int howto) +{ + struct nvmf_softc *sc = arg; + + if ((howto & RB_NOSYNC) != 0 || SCHEDULER_STOPPED()) + return; + + /* + * If this association is disconnected, abort any pending + * requests with an error to permit filesystems to unmount + * without hanging. + */ + sx_xlock(&sc->connection_lock); + if (sc->admin != NULL || sc->detaching) { + sx_xunlock(&sc->connection_lock); + return; + } + + for (u_int i = 0; i < sc->cdata->nn; i++) { + if (sc->ns[i] != NULL) + nvmf_shutdown_ns(sc->ns[i]); + } + nvmf_shutdown_sim(sc); + sx_xunlock(&sc->connection_lock); +} + +static void +nvmf_shutdown_post_sync(void *arg, int howto) +{ + struct nvmf_softc *sc = arg; + + if ((howto & RB_NOSYNC) != 0 || SCHEDULER_STOPPED()) + return; + + /* + * If this association is connected, disconnect gracefully. + */ + sx_xlock(&sc->connection_lock); + if (sc->admin == NULL || sc->detaching) { + sx_xunlock(&sc->connection_lock); + return; + } + + callout_drain(&sc->ka_tx_timer); + callout_drain(&sc->ka_rx_timer); + + nvmf_shutdown_controller(sc); + + /* + * Quiesce consumers so that any commands submitted after this + * fail with an error. Notably, nda(4) calls nda_flush() from + * a post_sync handler that might be ordered after this one. 
+ */ + for (u_int i = 0; i < sc->cdata->nn; i++) { + if (sc->ns[i] != NULL) + nvmf_shutdown_ns(sc->ns[i]); + } + nvmf_shutdown_sim(sc); + + for (u_int i = 0; i < sc->num_io_queues; i++) { + nvmf_destroy_qp(sc->io[i]); + } + nvmf_destroy_qp(sc->admin); + sc->admin = NULL; + sx_xunlock(&sc->connection_lock); +} + static int nvmf_detach(device_t dev) { @@ -683,6 +915,9 @@ nvmf_detach(device_t dev) sc->detaching = true; sx_xunlock(&sc->connection_lock); + EVENTHANDLER_DEREGISTER(shutdown_pre_sync, sc->shutdown_pre_sync_eh); + EVENTHANDLER_DEREGISTER(shutdown_post_sync, sc->shutdown_post_sync_eh); + nvmf_destroy_sim(sc); for (i = 0; i < sc->cdata->nn; i++) { if (sc->ns[i] != NULL) @@ -701,7 +936,21 @@ nvmf_detach(device_t dev) } free(sc->io, M_NVMF); - taskqueue_drain(taskqueue_thread, &sc->disconnect_task); + taskqueue_drain(nvmf_tq, &sc->disconnect_task); + if (taskqueue_cancel_timeout(nvmf_tq, &sc->request_reconnect_task, + NULL) != 0) + taskqueue_drain_timeout(nvmf_tq, &sc->request_reconnect_task); + + /* + * Don't cancel/drain the controller loss task if that task + * has fired and is triggering the detach. + */ + if (!sc->controller_timedout) { + if (taskqueue_cancel_timeout(nvmf_tq, &sc->controller_loss_task, + NULL) != 0) + taskqueue_drain_timeout(nvmf_tq, + &sc->controller_loss_task); + } if (sc->admin != NULL) nvmf_destroy_qp(sc->admin); @@ -709,16 +958,45 @@ nvmf_detach(device_t dev) nvmf_destroy_aer(sc); sx_destroy(&sc->connection_lock); + nvlist_destroy(sc->rparams); free(sc->cdata, M_NVMF); return (0); } +static void +nvmf_rescan_ns_1(struct nvmf_softc *sc, uint32_t nsid, + const struct nvme_namespace_data *data) +{ + struct nvmf_namespace *ns; + + /* XXX: Needs locking around sc->ns[]. */ + ns = sc->ns[nsid - 1]; + if (data->nsze == 0) { + /* XXX: Needs locking */ + if (ns != NULL) { + nvmf_destroy_ns(ns); + sc->ns[nsid - 1] = NULL; + } + } else { + /* XXX: Needs locking */ + if (ns == NULL) { + sc->ns[nsid - 1] = nvmf_init_ns(sc, nsid, data); + } else { + if (!nvmf_update_ns(ns, data)) { + nvmf_destroy_ns(ns); + sc->ns[nsid - 1] = NULL; + } + } + } + + nvmf_sim_rescan_ns(sc, nsid); +} + void nvmf_rescan_ns(struct nvmf_softc *sc, uint32_t nsid) { struct nvmf_completion_status status; struct nvme_namespace_data *data; - struct nvmf_namespace *ns; data = malloc(sizeof(*data), M_NVMF, M_WAITOK); @@ -751,29 +1029,58 @@ nvmf_rescan_ns(struct nvmf_softc *sc, uint32_t nsid) nvme_namespace_data_swapbytes(data); - /* XXX: Needs locking around sc->ns[]. */ - ns = sc->ns[nsid - 1]; - if (data->nsze == 0) { - /* XXX: Needs locking */ + nvmf_rescan_ns_1(sc, nsid, data); + + free(data, M_NVMF); +} + +static void +nvmf_purge_namespaces(struct nvmf_softc *sc, uint32_t first_nsid, + uint32_t next_valid_nsid) +{ + struct nvmf_namespace *ns; + + for (uint32_t nsid = first_nsid; nsid < next_valid_nsid; nsid++) + { + /* XXX: Needs locking around sc->ns[]. */ + ns = sc->ns[nsid - 1]; if (ns != NULL) { nvmf_destroy_ns(ns); sc->ns[nsid - 1] = NULL; - } - } else { - /* XXX: Needs locking */ - if (ns == NULL) { - sc->ns[nsid - 1] = nvmf_init_ns(sc, nsid, data); - } else { - if (!nvmf_update_ns(ns, data)) { - nvmf_destroy_ns(ns); - sc->ns[nsid - 1] = NULL; - } + + nvmf_sim_rescan_ns(sc, nsid); } } +} - free(data, M_NVMF); +static bool +nvmf_rescan_ns_cb(struct nvmf_softc *sc, uint32_t nsid, + const struct nvme_namespace_data *data, void *arg) +{ + uint32_t *last_nsid = arg; - nvmf_sim_rescan_ns(sc, nsid); + /* Check for any gaps prior to this namespace. 
*/ + nvmf_purge_namespaces(sc, *last_nsid + 1, nsid); + *last_nsid = nsid; + + nvmf_rescan_ns_1(sc, nsid, data); + return (true); +} + +void +nvmf_rescan_all_ns(struct nvmf_softc *sc) +{ + uint32_t last_nsid; + + last_nsid = 0; + if (!nvmf_scan_active_namespaces(sc, nvmf_rescan_ns_cb, &last_nsid)) + return; + + /* + * Check for any namespace devices after the last active + * namespace. + */ + nvmf_purge_namespaces(sc, last_nsid + 1, sc->cdata->nn + 1); } int @@ -822,12 +1129,21 @@ nvmf_passthrough_cmd(struct nvmf_softc *sc, struct nvme_pt_command *pt, cmd.cdw14 = pt->cmd.cdw14; cmd.cdw15 = pt->cmd.cdw15; + sx_slock(&sc->connection_lock); + if (sc->admin == NULL || sc->detaching) { + device_printf(sc->dev, + "failed to send passthrough command\n"); + error = ECONNABORTED; + sx_sunlock(&sc->connection_lock); + goto error; + } if (admin) qp = sc->admin; else qp = nvmf_select_io_queue(sc); nvmf_status_init(&status); req = nvmf_allocate_request(qp, &cmd, nvmf_complete, &status, M_WAITOK); + sx_sunlock(&sc->connection_lock); if (req == NULL) { device_printf(sc->dev, "failed to send passthrough command\n"); error = ECONNABORTED; @@ -857,14 +1173,46 @@ error: } static int +nvmf_reconnect_params(struct nvmf_softc *sc, struct nvmf_ioc_nv *nv) +{ + int error; + + sx_slock(&sc->connection_lock); + error = nvmf_pack_ioc_nvlist(sc->rparams, nv); + sx_sunlock(&sc->connection_lock); + + return (error); +} + +static int +nvmf_connection_status(struct nvmf_softc *sc, struct nvmf_ioc_nv *nv) +{ + nvlist_t *nvl, *nvl_ts; + int error; + + nvl = nvlist_create(0); + nvl_ts = nvlist_create(0); + + sx_slock(&sc->connection_lock); + nvlist_add_bool(nvl, "connected", sc->admin != NULL); + nvlist_add_number(nvl_ts, "tv_sec", sc->last_disconnect.tv_sec); + nvlist_add_number(nvl_ts, "tv_nsec", sc->last_disconnect.tv_nsec); + sx_sunlock(&sc->connection_lock); + nvlist_move_nvlist(nvl, "last_disconnect", nvl_ts); + + error = nvmf_pack_ioc_nvlist(nvl, nv); + nvlist_destroy(nvl); + return (error); +} + +static int nvmf_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int flag, struct thread *td) { struct nvmf_softc *sc = cdev->si_drv1; struct nvme_get_nsid *gnsid; struct nvme_pt_command *pt; - struct nvmf_reconnect_params *rp; - struct nvmf_handoff_host *hh; + struct nvmf_ioc_nv *nv; switch (cmd) { case NVME_PASSTHROUGH_CMD: @@ -872,25 +1220,25 @@ nvmf_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int flag, return (nvmf_passthrough_cmd(sc, pt, true)); case NVME_GET_NSID: gnsid = (struct nvme_get_nsid *)arg; - strncpy(gnsid->cdev, device_get_nameunit(sc->dev), + strlcpy(gnsid->cdev, device_get_nameunit(sc->dev), sizeof(gnsid->cdev)); - gnsid->cdev[sizeof(gnsid->cdev) - 1] = '\0'; gnsid->nsid = 0; return (0); case NVME_GET_MAX_XFER_SIZE: *(uint64_t *)arg = sc->max_xfer_size; return (0); - case NVMF_RECONNECT_PARAMS: - rp = (struct nvmf_reconnect_params *)arg; - if ((sc->cdata->fcatt & 1) == 0) - rp->cntlid = NVMF_CNTLID_DYNAMIC; - else - rp->cntlid = sc->cdata->ctrlr_id; - memcpy(rp->subnqn, sc->cdata->subnqn, sizeof(rp->subnqn)); + case NVME_GET_CONTROLLER_DATA: + memcpy(arg, sc->cdata, sizeof(*sc->cdata)); return (0); + case NVMF_RECONNECT_PARAMS: + nv = (struct nvmf_ioc_nv *)arg; + return (nvmf_reconnect_params(sc, nv)); case NVMF_RECONNECT_HOST: - hh = (struct nvmf_handoff_host *)arg; - return (nvmf_reconnect_host(sc, hh)); + nv = (struct nvmf_ioc_nv *)arg; + return (nvmf_reconnect_host(sc, nv)); + case NVMF_CONNECTION_STATUS: + nv = (struct nvmf_ioc_nv *)arg; + return (nvmf_connection_status(sc, nv)); default: 
return (ENOTTY); } @@ -904,14 +1252,25 @@ static struct cdevsw nvmf_cdevsw = { static int nvmf_modevent(module_t mod, int what, void *arg) { + int error; + switch (what) { case MOD_LOAD: - return (nvmf_ctl_load()); + error = nvmf_ctl_load(); + if (error != 0) + return (error); + + nvmf_tq = taskqueue_create("nvmf", M_WAITOK | M_ZERO, + taskqueue_thread_enqueue, &nvmf_tq); + taskqueue_start_threads(&nvmf_tq, 1, PWAIT, "nvmf taskq"); + return (0); case MOD_QUIESCE: return (0); case MOD_UNLOAD: nvmf_ctl_unload(); destroy_dev_drain(&nvmf_cdevsw); + if (nvmf_tq != NULL) + taskqueue_free(nvmf_tq); return (0); default: return (EOPNOTSUPP); @@ -923,9 +1282,6 @@ static device_method_t nvmf_methods[] = { DEVMETHOD(device_probe, nvmf_probe), DEVMETHOD(device_attach, nvmf_attach), DEVMETHOD(device_detach, nvmf_detach), -#if 0 - DEVMETHOD(device_shutdown, nvmf_shutdown), -#endif DEVMETHOD_END }; diff --git a/sys/dev/nvmf/host/nvmf_aer.c b/sys/dev/nvmf/host/nvmf_aer.c index 4c950f1518d0..2f7f177d0421 100644 --- a/sys/dev/nvmf/host/nvmf_aer.c +++ b/sys/dev/nvmf/host/nvmf_aer.c @@ -62,7 +62,7 @@ nvmf_handle_changed_namespaces(struct nvmf_softc *sc, * probably just rescan the entire set of namespaces. */ if (ns_list->ns[0] == 0xffffffff) { - device_printf(sc->dev, "too many changed namespaces\n"); + nvmf_rescan_all_ns(sc); return; } diff --git a/sys/dev/nvmf/host/nvmf_ctldev.c b/sys/dev/nvmf/host/nvmf_ctldev.c index f40005a2a666..275d5e9c932a 100644 --- a/sys/dev/nvmf/host/nvmf_ctldev.c +++ b/sys/dev/nvmf/host/nvmf_ctldev.c @@ -9,6 +9,7 @@ #include <sys/bus.h> #include <sys/conf.h> #include <sys/malloc.h> +#include <sys/nv.h> #include <dev/nvme/nvme.h> #include <dev/nvmf/nvmf.h> #include <dev/nvmf/nvmf_transport.h> @@ -17,25 +18,25 @@ static struct cdev *nvmf_cdev; static int -nvmf_handoff_host(struct nvmf_handoff_host *hh) +nvmf_handoff_host(struct nvmf_ioc_nv *nv) { - struct nvmf_ivars ivars; + nvlist_t *nvl; device_t dev; int error; - error = nvmf_init_ivars(&ivars, hh); + error = nvmf_copyin_handoff(nv, &nvl); if (error != 0) return (error); bus_topo_lock(); - dev = device_add_child(root_bus, "nvme", -1); + dev = device_add_child(root_bus, "nvme", DEVICE_UNIT_ANY); if (dev == NULL) { bus_topo_unlock(); error = ENXIO; goto out; } - device_set_ivars(dev, &ivars); + device_set_ivars(dev, nvl); error = device_probe_and_attach(dev); device_set_ivars(dev, NULL); if (error != 0) @@ -43,7 +44,7 @@ nvmf_handoff_host(struct nvmf_handoff_host *hh) bus_topo_unlock(); out: - nvmf_free_ivars(&ivars); + nvlist_destroy(nvl); return (error); } @@ -117,7 +118,7 @@ nvmf_ctl_ioctl(struct cdev *dev, u_long cmd, caddr_t arg, int flag, { switch (cmd) { case NVMF_HANDOFF_HOST: - return (nvmf_handoff_host((struct nvmf_handoff_host *)arg)); + return (nvmf_handoff_host((struct nvmf_ioc_nv *)arg)); case NVMF_DISCONNECT_HOST: return (nvmf_disconnect_host((const char **)arg)); case NVMF_DISCONNECT_ALL: diff --git a/sys/dev/nvmf/host/nvmf_ns.c b/sys/dev/nvmf/host/nvmf_ns.c index 3ce434bf7c50..4215c8295d2e 100644 --- a/sys/dev/nvmf/host/nvmf_ns.c +++ b/sys/dev/nvmf/host/nvmf_ns.c @@ -18,7 +18,7 @@ #include <sys/proc.h> #include <sys/refcount.h> #include <sys/sbuf.h> -#include <machine/stdarg.h> +#include <sys/stdarg.h> #include <dev/nvme/nvme.h> #include <dev/nvmf/host/nvmf_var.h> @@ -29,6 +29,7 @@ struct nvmf_namespace { u_int flags; uint32_t lba_size; bool disconnected; + bool shutdown; TAILQ_HEAD(, bio) pending_bios; struct mtx lock; @@ -49,7 +50,7 @@ ns_printf(struct nvmf_namespace *ns, const char *fmt, ...) 
sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN); sbuf_set_drain(&sb, sbuf_printf_drain, NULL); - sbuf_printf(&sb, "%sns%u: ", device_get_nameunit(ns->sc->dev), + sbuf_printf(&sb, "%sn%u: ", device_get_nameunit(ns->sc->dev), ns->id); va_start(ap, fmt); @@ -84,13 +85,22 @@ nvmf_ns_biodone(struct bio *bio) ns = bio->bio_dev->si_drv1; /* If a request is aborted, resubmit or queue it for resubmission. */ - if (bio->bio_error == ECONNABORTED) { + if (bio->bio_error == ECONNABORTED && !nvmf_fail_disconnect) { bio->bio_error = 0; bio->bio_driver2 = 0; mtx_lock(&ns->lock); if (ns->disconnected) { - TAILQ_INSERT_TAIL(&ns->pending_bios, bio, bio_queue); - mtx_unlock(&ns->lock); + if (nvmf_fail_disconnect || ns->shutdown) { + mtx_unlock(&ns->lock); + bio->bio_error = ECONNABORTED; + bio->bio_flags |= BIO_ERROR; + bio->bio_resid = bio->bio_bcount; + biodone(bio); + } else { + TAILQ_INSERT_TAIL(&ns->pending_bios, bio, + bio_queue); + mtx_unlock(&ns->lock); + } } else { mtx_unlock(&ns->lock); nvmf_ns_strategy(bio); @@ -163,6 +173,7 @@ nvmf_ns_submit_bio(struct nvmf_namespace *ns, struct bio *bio) struct nvme_dsm_range *dsm_range; struct memdesc mem; uint64_t lba, lba_count; + int error; dsm_range = NULL; memset(&cmd, 0, sizeof(cmd)); @@ -201,10 +212,15 @@ nvmf_ns_submit_bio(struct nvmf_namespace *ns, struct bio *bio) mtx_lock(&ns->lock); if (ns->disconnected) { - TAILQ_INSERT_TAIL(&ns->pending_bios, bio, bio_queue); + if (nvmf_fail_disconnect || ns->shutdown) { + error = ECONNABORTED; + } else { + TAILQ_INSERT_TAIL(&ns->pending_bios, bio, bio_queue); + error = 0; + } mtx_unlock(&ns->lock); free(dsm_range, M_NVMF); - return (0); + return (error); } req = nvmf_allocate_request(nvmf_select_io_queue(ns->sc), &cmd, @@ -258,9 +274,8 @@ nvmf_ns_ioctl(struct cdev *dev, u_long cmd, caddr_t arg, int flag, return (nvmf_passthrough_cmd(ns->sc, pt, false)); case NVME_GET_NSID: gnsid = (struct nvme_get_nsid *)arg; - strncpy(gnsid->cdev, device_get_nameunit(ns->sc->dev), + strlcpy(gnsid->cdev, device_get_nameunit(ns->sc->dev), sizeof(gnsid->cdev)); - gnsid->cdev[sizeof(gnsid->cdev) - 1] = '\0'; gnsid->nsid = ns->id; return (0); case DIOCGMEDIASIZE: @@ -314,7 +329,7 @@ static struct cdevsw nvmf_ns_cdevsw = { struct nvmf_namespace * nvmf_init_ns(struct nvmf_softc *sc, uint32_t id, - struct nvme_namespace_data *data) + const struct nvme_namespace_data *data) { struct make_dev_args mda; struct nvmf_namespace *ns; @@ -372,10 +387,12 @@ nvmf_init_ns(struct nvmf_softc *sc, uint32_t id, mda.mda_gid = GID_WHEEL; mda.mda_mode = 0600; mda.mda_si_drv1 = ns; - error = make_dev_s(&mda, &ns->cdev, "%sns%u", + error = make_dev_s(&mda, &ns->cdev, "%sn%u", device_get_nameunit(sc->dev), id); if (error != 0) goto fail; + ns->cdev->si_drv2 = make_dev_alias(ns->cdev, "%sns%u", + device_get_nameunit(sc->dev), id); ns->cdev->si_flags |= SI_UNMAPPED; @@ -414,11 +431,35 @@ nvmf_reconnect_ns(struct nvmf_namespace *ns) } void +nvmf_shutdown_ns(struct nvmf_namespace *ns) +{ + TAILQ_HEAD(, bio) bios; + struct bio *bio; + + mtx_lock(&ns->lock); + ns->shutdown = true; + TAILQ_INIT(&bios); + TAILQ_CONCAT(&bios, &ns->pending_bios, bio_queue); + mtx_unlock(&ns->lock); + + while (!TAILQ_EMPTY(&bios)) { + bio = TAILQ_FIRST(&bios); + TAILQ_REMOVE(&bios, bio, bio_queue); + bio->bio_error = ECONNABORTED; + bio->bio_flags |= BIO_ERROR; + bio->bio_resid = bio->bio_bcount; + biodone(bio); + } +} + +void nvmf_destroy_ns(struct nvmf_namespace *ns) { TAILQ_HEAD(, bio) bios; struct bio *bio; + if (ns->cdev->si_drv2 != NULL) + destroy_dev(ns->cdev->si_drv2); 
destroy_dev(ns->cdev); /* @@ -451,7 +492,8 @@ nvmf_destroy_ns(struct nvmf_namespace *ns) } bool -nvmf_update_ns(struct nvmf_namespace *ns, struct nvme_namespace_data *data) +nvmf_update_ns(struct nvmf_namespace *ns, + const struct nvme_namespace_data *data) { uint8_t lbads, lbaf; diff --git a/sys/dev/nvmf/host/nvmf_qpair.c b/sys/dev/nvmf/host/nvmf_qpair.c index 96cb5a8b0465..2f511cf0406d 100644 --- a/sys/dev/nvmf/host/nvmf_qpair.c +++ b/sys/dev/nvmf/host/nvmf_qpair.c @@ -10,6 +10,8 @@ #include <sys/lock.h> #include <sys/malloc.h> #include <sys/mutex.h> +#include <sys/nv.h> +#include <sys/sysctl.h> #include <dev/nvme/nvme.h> #include <dev/nvmf/nvmf.h> #include <dev/nvmf/nvmf_transport.h> @@ -31,6 +33,7 @@ struct nvmf_host_qpair { u_int num_commands; uint16_t sqhd; uint16_t sqtail; + uint64_t submitted; struct mtx lock; @@ -41,6 +44,7 @@ struct nvmf_host_qpair { struct nvmf_host_command **active_commands; char name[16]; + struct sysctl_ctx_list sysctl_ctx; }; struct nvmf_request * @@ -112,8 +116,23 @@ nvmf_dispatch_command(struct nvmf_host_qpair *qp, struct nvmf_host_command *cmd) struct nvmf_softc *sc = qp->sc; struct nvme_command *sqe; struct nvmf_capsule *nc; + uint16_t new_sqtail; int error; + mtx_assert(&qp->lock, MA_OWNED); + + qp->submitted++; + + /* + * Update flow control tracking. This is just a sanity check. + * Since num_commands == qsize - 1, there can never be too + * many commands in flight. + */ + new_sqtail = (qp->sqtail + 1) % (qp->num_commands + 1); + KASSERT(new_sqtail != qp->sqhd, ("%s: qp %p is full", __func__, qp)); + qp->sqtail = new_sqtail; + mtx_unlock(&qp->lock); + nc = cmd->req->nc; sqe = nvmf_capsule_sqe(nc); @@ -177,11 +196,23 @@ nvmf_receive_capsule(void *arg, struct nvmf_capsule *nc) return; } + /* Update flow control tracking. */ + mtx_lock(&qp->lock); + if (qp->sq_flow_control) { + if (nvmf_sqhd_valid(nc)) + qp->sqhd = le16toh(cqe->sqhd); + } else { + /* + * If SQ FC is disabled, just advance the head for + * each response capsule received. + */ + qp->sqhd = (qp->sqhd + 1) % (qp->num_commands + 1); + } + /* * If the queue has been shutdown due to an error, silently * drop the response. 
*/ - mtx_lock(&qp->lock); if (qp->qp == NULL) { device_printf(sc->dev, "received completion for CID %u on shutdown %s\n", cid, @@ -212,7 +243,6 @@ nvmf_receive_capsule(void *arg, struct nvmf_capsule *nc) } else { cmd->req = STAILQ_FIRST(&qp->pending_requests); STAILQ_REMOVE_HEAD(&qp->pending_requests, link); - mtx_unlock(&qp->lock); nvmf_dispatch_command(qp, cmd); } @@ -221,28 +251,61 @@ nvmf_receive_capsule(void *arg, struct nvmf_capsule *nc) nvmf_free_request(req); } +static void +nvmf_sysctls_qp(struct nvmf_softc *sc, struct nvmf_host_qpair *qp, + bool admin, u_int qid) +{ + struct sysctl_ctx_list *ctx = &qp->sysctl_ctx; + struct sysctl_oid *oid; + struct sysctl_oid_list *list; + char name[8]; + + if (admin) { + oid = SYSCTL_ADD_NODE(ctx, + SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev)), OID_AUTO, + "adminq", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Admin Queue"); + } else { + snprintf(name, sizeof(name), "%u", qid); + oid = SYSCTL_ADD_NODE(ctx, sc->ioq_oid_list, OID_AUTO, name, + CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "I/O Queue"); + } + list = SYSCTL_CHILDREN(oid); + + SYSCTL_ADD_UINT(ctx, list, OID_AUTO, "num_entries", CTLFLAG_RD, + NULL, qp->num_commands + 1, "Number of entries in queue"); + SYSCTL_ADD_U16(ctx, list, OID_AUTO, "sq_head", CTLFLAG_RD, &qp->sqhd, + 0, "Current head of submission queue (as observed by driver)"); + SYSCTL_ADD_U16(ctx, list, OID_AUTO, "sq_tail", CTLFLAG_RD, &qp->sqtail, + 0, "Current tail of submission queue (as observed by driver)"); + SYSCTL_ADD_U64(ctx, list, OID_AUTO, "num_cmds", CTLFLAG_RD, + &qp->submitted, 0, "Number of commands submitted"); +} + struct nvmf_host_qpair * nvmf_init_qp(struct nvmf_softc *sc, enum nvmf_trtype trtype, - struct nvmf_handoff_qpair_params *handoff, const char *name) + const nvlist_t *nvl, const char *name, u_int qid) { struct nvmf_host_command *cmd, *ncmd; struct nvmf_host_qpair *qp; u_int i; + bool admin; + admin = nvlist_get_bool(nvl, "admin"); qp = malloc(sizeof(*qp), M_NVMF, M_WAITOK | M_ZERO); qp->sc = sc; - qp->sq_flow_control = handoff->sq_flow_control; - qp->sqhd = handoff->sqhd; - qp->sqtail = handoff->sqtail; + qp->sq_flow_control = nvlist_get_bool(nvl, "sq_flow_control"); + qp->sqhd = nvlist_get_number(nvl, "sqhd"); + qp->sqtail = nvlist_get_number(nvl, "sqtail"); strlcpy(qp->name, name, sizeof(qp->name)); mtx_init(&qp->lock, "nvmf qp", NULL, MTX_DEF); + (void)sysctl_ctx_init(&qp->sysctl_ctx); /* * Allocate a spare command slot for each pending AER command * on the admin queue. 
*/ - qp->num_commands = handoff->qsize - 1; - if (handoff->admin) + qp->num_commands = nvlist_get_number(nvl, "qsize") - 1; + if (admin) qp->num_commands += sc->num_aer; qp->active_commands = malloc(sizeof(*qp->active_commands) * @@ -255,9 +318,10 @@ nvmf_init_qp(struct nvmf_softc *sc, enum nvmf_trtype trtype, } STAILQ_INIT(&qp->pending_requests); - qp->qp = nvmf_allocate_qpair(trtype, false, handoff, nvmf_qp_error, - qp, nvmf_receive_capsule, qp); + qp->qp = nvmf_allocate_qpair(trtype, false, nvl, nvmf_qp_error, qp, + nvmf_receive_capsule, qp); if (qp->qp == NULL) { + (void)sysctl_ctx_free(&qp->sysctl_ctx); TAILQ_FOREACH_SAFE(cmd, &qp->free_commands, link, ncmd) { TAILQ_REMOVE(&qp->free_commands, cmd, link); free(cmd, M_NVMF); @@ -268,6 +332,8 @@ nvmf_init_qp(struct nvmf_softc *sc, enum nvmf_trtype trtype, return (NULL); } + nvmf_sysctls_qp(sc, qp, admin, qid); + return (qp); } @@ -339,6 +405,7 @@ nvmf_destroy_qp(struct nvmf_host_qpair *qp) struct nvmf_host_command *cmd, *ncmd; nvmf_shutdown_qp(qp); + (void)sysctl_ctx_free(&qp->sysctl_ctx); TAILQ_FOREACH_SAFE(cmd, &qp->free_commands, link, ncmd) { TAILQ_REMOVE(&qp->free_commands, cmd, link); @@ -381,6 +448,5 @@ nvmf_submit_request(struct nvmf_request *req) ("%s: CID already busy", __func__)); qp->active_commands[cmd->cid] = cmd; cmd->req = req; - mtx_unlock(&qp->lock); nvmf_dispatch_command(qp, cmd); } diff --git a/sys/dev/nvmf/host/nvmf_sim.c b/sys/dev/nvmf/host/nvmf_sim.c index b097b04d64c3..de9e958d8afd 100644 --- a/sys/dev/nvmf/host/nvmf_sim.c +++ b/sys/dev/nvmf/host/nvmf_sim.c @@ -40,7 +40,13 @@ nvmf_ccb_done(union ccb *ccb) return; if (nvmf_cqe_aborted(&ccb->nvmeio.cpl)) { - ccb->ccb_h.status = CAM_REQUEUE_REQ; + struct cam_sim *sim = xpt_path_sim(ccb->ccb_h.path); + struct nvmf_softc *sc = cam_sim_softc(sim); + + if (nvmf_fail_disconnect || sc->sim_shutdown) + ccb->ccb_h.status = CAM_DEV_NOT_THERE; + else + ccb->ccb_h.status = CAM_REQUEUE_REQ; xpt_done(ccb); } else if (ccb->nvmeio.cpl.status != 0) { ccb->ccb_h.status = CAM_NVME_STATUS_ERROR; @@ -52,7 +58,7 @@ nvmf_ccb_done(union ccb *ccb) xpt_done(ccb); } else { ccb->ccb_h.status = CAM_REQ_CMP; - xpt_done_direct(ccb); + xpt_done(ccb); } } @@ -106,7 +112,10 @@ nvmf_sim_io(struct nvmf_softc *sc, union ccb *ccb) mtx_lock(&sc->sim_mtx); if (sc->sim_disconnected) { mtx_unlock(&sc->sim_mtx); - nvmeio->ccb_h.status = CAM_REQUEUE_REQ; + if (nvmf_fail_disconnect || sc->sim_shutdown) + nvmeio->ccb_h.status = CAM_DEV_NOT_THERE; + else + nvmeio->ccb_h.status = CAM_REQUEUE_REQ; xpt_done(ccb); return; } @@ -116,8 +125,8 @@ nvmf_sim_io(struct nvmf_softc *sc, union ccb *ccb) qp = sc->admin; req = nvmf_allocate_request(qp, &nvmeio->cmd, nvmf_ccb_complete, ccb, M_NOWAIT); + mtx_unlock(&sc->sim_mtx); if (req == NULL) { - mtx_unlock(&sc->sim_mtx); nvmeio->ccb_h.status = CAM_RESRC_UNAVAIL; xpt_done(ccb); return; @@ -141,7 +150,6 @@ nvmf_sim_io(struct nvmf_softc *sc, union ccb *ccb) ("%s: incoming CCB is not in-progress", __func__)); ccb->ccb_h.status |= CAM_SIM_QUEUED; nvmf_submit_request(req); - mtx_unlock(&sc->sim_mtx); } static void @@ -183,7 +191,7 @@ nvmf_sim_action(struct cam_sim *sim, union ccb *ccb) cpi->xport_specific.nvmf.nsid = xpt_path_lun_id(ccb->ccb_h.path); cpi->xport_specific.nvmf.trtype = sc->trtype; - strncpy(cpi->xport_specific.nvmf.dev_name, + strlcpy(cpi->xport_specific.nvmf.dev_name, device_get_nameunit(sc->dev), sizeof(cpi->xport_specific.nvmf.dev_name)); cpi->maxio = sc->max_xfer_size; @@ -320,6 +328,15 @@ nvmf_reconnect_sim(struct nvmf_softc *sc) } void 
+nvmf_shutdown_sim(struct nvmf_softc *sc)
+{
+	mtx_lock(&sc->sim_mtx);
+	sc->sim_shutdown = true;
+	mtx_unlock(&sc->sim_mtx);
+	xpt_release_simq(sc->sim, 1);
+}
+
+void
 nvmf_destroy_sim(struct nvmf_softc *sc)
 {
 	xpt_async(AC_LOST_DEVICE, sc->path, NULL);
diff --git a/sys/dev/nvmf/host/nvmf_var.h b/sys/dev/nvmf/host/nvmf_var.h
index 64525851631e..606245b3969c 100644
--- a/sys/dev/nvmf/host/nvmf_var.h
+++ b/sys/dev/nvmf/host/nvmf_var.h
@@ -9,10 +9,13 @@
 #define __NVMF_VAR_H__

 #include <sys/_callout.h>
+#include <sys/_eventhandler.h>
 #include <sys/_lock.h>
 #include <sys/_mutex.h>
+#include <sys/_nv.h>
 #include <sys/_sx.h>
 #include <sys/_task.h>
+#include <sys/smp.h>
 #include <sys/queue.h>
 #include <dev/nvme/nvme.h>
 #include <dev/nvmf/nvmf_transport.h>
@@ -21,15 +24,10 @@
 struct nvmf_aer;
 struct nvmf_capsule;
 struct nvmf_host_qpair;
 struct nvmf_namespace;
+struct sysctl_oid_list;

 typedef void nvmf_request_complete_t(void *, const struct nvme_completion *);

-struct nvmf_ivars {
-	struct nvmf_handoff_host *hh;
-	struct nvmf_handoff_qpair_params *io_params;
-	struct nvme_controller_data *cdata;
-};
-
 struct nvmf_softc {
 	device_t dev;
@@ -42,6 +40,7 @@ struct nvmf_softc {
 	struct cam_path *path;
 	struct mtx sim_mtx;
 	bool sim_disconnected;
+	bool sim_shutdown;

 	struct nvmf_namespace **ns;
@@ -76,12 +75,27 @@ struct nvmf_softc {
 	struct callout ka_rx_timer;
 	sbintime_t ka_rx_sbt;

+	struct timeout_task request_reconnect_task;
+	struct timeout_task controller_loss_task;
+	uint32_t reconnect_delay;
+	uint32_t controller_loss_timeout;
+
 	struct sx connection_lock;
 	struct task disconnect_task;
 	bool detaching;
+	bool controller_timedout;

 	u_int num_aer;
 	struct nvmf_aer *aer;
+
+	struct sysctl_oid_list *ioq_oid_list;
+
+	nvlist_t *rparams;
+
+	struct timespec last_disconnect;
+
+	eventhandler_tag shutdown_pre_sync_eh;
+	eventhandler_tag shutdown_post_sync_eh;
 };

 struct nvmf_request {
@@ -104,8 +118,8 @@ struct nvmf_completion_status {
 static __inline struct nvmf_host_qpair *
 nvmf_select_io_queue(struct nvmf_softc *sc)
 {
-	/* TODO: Support multiple queues? */
-	return (sc->io[0]);
+	u_int idx = curcpu * sc->num_io_queues / (mp_maxid + 1);
+	return (sc->io[idx]);
 }

 static __inline bool
@@ -140,14 +154,17 @@ extern driver_t nvme_nvmf_driver;
 MALLOC_DECLARE(M_NVMF);
 #endif

+/* If true, I/O requests will fail while the host is disconnected. 
*/ +extern bool nvmf_fail_disconnect; + /* nvmf.c */ void nvmf_complete(void *arg, const struct nvme_completion *cqe); void nvmf_io_complete(void *arg, size_t xfered, int error); void nvmf_wait_for_reply(struct nvmf_completion_status *status); -int nvmf_init_ivars(struct nvmf_ivars *ivars, struct nvmf_handoff_host *hh); -void nvmf_free_ivars(struct nvmf_ivars *ivars); +int nvmf_copyin_handoff(const struct nvmf_ioc_nv *nv, nvlist_t **nvlp); void nvmf_disconnect(struct nvmf_softc *sc); void nvmf_rescan_ns(struct nvmf_softc *sc, uint32_t nsid); +void nvmf_rescan_all_ns(struct nvmf_softc *sc); int nvmf_passthrough_cmd(struct nvmf_softc *sc, struct nvme_pt_command *pt, bool admin); @@ -180,17 +197,17 @@ void nvmf_ctl_unload(void); /* nvmf_ns.c */ struct nvmf_namespace *nvmf_init_ns(struct nvmf_softc *sc, uint32_t id, - struct nvme_namespace_data *data); + const struct nvme_namespace_data *data); void nvmf_disconnect_ns(struct nvmf_namespace *ns); void nvmf_reconnect_ns(struct nvmf_namespace *ns); +void nvmf_shutdown_ns(struct nvmf_namespace *ns); void nvmf_destroy_ns(struct nvmf_namespace *ns); bool nvmf_update_ns(struct nvmf_namespace *ns, - struct nvme_namespace_data *data); + const struct nvme_namespace_data *data); /* nvmf_qpair.c */ struct nvmf_host_qpair *nvmf_init_qp(struct nvmf_softc *sc, - enum nvmf_trtype trtype, struct nvmf_handoff_qpair_params *handoff, - const char *name); + enum nvmf_trtype trtype, const nvlist_t *nvl, const char *name, u_int qid); void nvmf_shutdown_qp(struct nvmf_host_qpair *qp); void nvmf_destroy_qp(struct nvmf_host_qpair *qp); struct nvmf_request *nvmf_allocate_request(struct nvmf_host_qpair *qp, @@ -202,6 +219,7 @@ void nvmf_free_request(struct nvmf_request *req); int nvmf_init_sim(struct nvmf_softc *sc); void nvmf_disconnect_sim(struct nvmf_softc *sc); void nvmf_reconnect_sim(struct nvmf_softc *sc); +void nvmf_shutdown_sim(struct nvmf_softc *sc); void nvmf_destroy_sim(struct nvmf_softc *sc); void nvmf_sim_rescan_ns(struct nvmf_softc *sc, uint32_t id); diff --git a/sys/dev/nvmf/nvmf.h b/sys/dev/nvmf/nvmf.h index 1f1ecd437c7e..9b2b4c1dea40 100644 --- a/sys/dev/nvmf/nvmf.h +++ b/sys/dev/nvmf/nvmf.h @@ -26,54 +26,107 @@ #define NVMF_NN (1024) -struct nvmf_handoff_qpair_params { - bool admin; - bool sq_flow_control; - u_int qsize; - uint16_t sqhd; - uint16_t sqtail; /* host only */ - union { - struct { - int fd; - uint8_t rxpda; - uint8_t txpda; - bool header_digests; - bool data_digests; - uint32_t maxr2t; - uint32_t maxh2cdata; - uint32_t max_icd; - } tcp; - }; -}; +/* + * Default timeouts for Fabrics hosts. These match values used by + * Linux. + */ +#define NVMF_DEFAULT_RECONNECT_DELAY 10 +#define NVMF_DEFAULT_CONTROLLER_LOSS 600 -struct nvmf_handoff_host { - u_int trtype; - u_int num_io_queues; - u_int kato; - struct nvmf_handoff_qpair_params admin; - struct nvmf_handoff_qpair_params *io; - const struct nvme_controller_data *cdata; +/* + * (data, size) is the userspace buffer for a packed nvlist. + * + * For requests that copyout an nvlist, len is the amount of data + * copied out to *data. If size is zero, no data is copied and len is + * set to the required buffer size. 
+ */ +struct nvmf_ioc_nv { + void *data; + size_t len; + size_t size; }; -struct nvmf_reconnect_params { - uint16_t cntlid; - char subnqn[256]; -}; +/* + * The fields in a qpair handoff nvlist are: + * + * Transport independent: + * + * bool admin + * bool sq_flow_control + * number qsize + * number sqhd + * number sqtail host only + * + * TCP transport: + * + * number fd + * number rxpda + * number txpda + * bool header_digests + * bool data_digests + * number maxr2t + * number maxh2cdata + * number max_icd + */ -struct nvmf_handoff_controller_qpair { - u_int trtype; - struct nvmf_handoff_qpair_params params; - const struct nvmf_fabric_connect_cmd *cmd; - const struct nvmf_fabric_connect_data *data; -}; +/* + * The fields in the nvlist for NVMF_HANDOFF_HOST and + * NVMF_RECONNECT_HOST are: + * + * number trtype + * number kato (optional) + * number reconnect_delay (optional) + * number controller_loss_timeout (optional) + * qpair handoff nvlist admin + * qpair handoff nvlist array io + * binary cdata struct nvme_controller_data + * NVMF_RECONNECT_PARAMS nvlist rparams + */ + +/* + * The fields in the nvlist for NVMF_RECONNECT_PARAMS are: + * + * binary dle struct nvme_discovery_log_entry + * string hostnqn + * number num_io_queues + * number kato (optional) + * number reconnect_delay (optional) + * number controller_loss_timeout (optional) + * number io_qsize + * bool sq_flow_control + * + * TCP transport: + * + * bool header_digests + * bool data_digests + */ + +/* + * The fields in the nvlist for NVMF_CONNECTION_STATUS are: + * + * bool connected + * timespec nvlist last_disconnect + * number tv_sec + * number tv_nsec + */ + +/* + * The fields in the nvlist for handing off a controller qpair are: + * + * number trtype + * qpair handoff nvlist params + * binary cmd struct nvmf_fabric_connect_cmd + * binary data struct nvmf_fabric_connect_data + */ /* Operations on /dev/nvmf */ -#define NVMF_HANDOFF_HOST _IOW('n', 200, struct nvmf_handoff_host) +#define NVMF_HANDOFF_HOST _IOW('n', 200, struct nvmf_ioc_nv) #define NVMF_DISCONNECT_HOST _IOW('n', 201, const char *) #define NVMF_DISCONNECT_ALL _IO('n', 202) /* Operations on /dev/nvmeX */ -#define NVMF_RECONNECT_PARAMS _IOR('n', 203, struct nvmf_reconnect_params) -#define NVMF_RECONNECT_HOST _IOW('n', 204, struct nvmf_handoff_host) +#define NVMF_RECONNECT_PARAMS _IOWR('n', 203, struct nvmf_ioc_nv) +#define NVMF_RECONNECT_HOST _IOW('n', 204, struct nvmf_ioc_nv) +#define NVMF_CONNECTION_STATUS _IOWR('n', 205, struct nvmf_ioc_nv) #endif /* !__NVMF_H__ */ diff --git a/sys/dev/nvmf/nvmf_proto.h b/sys/dev/nvmf/nvmf_proto.h index b0be236f77fa..f67c34acbf95 100644 --- a/sys/dev/nvmf/nvmf_proto.h +++ b/sys/dev/nvmf/nvmf_proto.h @@ -22,8 +22,6 @@ * NVMe over Fabrics specification definitions */ -#pragma pack(push, 1) - #define NVME_NQN_FIELD_SIZE 256 struct nvmf_capsule_cmd { @@ -174,7 +172,7 @@ struct nvmf_fabric_cmd { uint16_t cid; uint8_t fctype; uint8_t reserved2[59]; -}; +} __aligned(8); struct nvmf_fabric_auth_recv_cmd { uint8_t opcode; @@ -764,6 +762,4 @@ _Static_assert(offsetof(struct nvme_tcp_r2t_hdr, ttag) == 10, "Incorrect offset" _Static_assert(offsetof(struct nvme_tcp_r2t_hdr, r2to) == 12, "Incorrect offset"); _Static_assert(offsetof(struct nvme_tcp_r2t_hdr, r2tl) == 16, "Incorrect offset"); -#pragma pack(pop) - #endif /* __NVMF_PROTO_H__ */ diff --git a/sys/dev/nvmf/nvmf_tcp.c b/sys/dev/nvmf/nvmf_tcp.c index 57c81eceee02..6ad5229f6043 100644 --- a/sys/dev/nvmf/nvmf_tcp.c +++ b/sys/dev/nvmf/nvmf_tcp.c @@ -18,6 +18,7 @@ #include <sys/mbuf.h> 
#include <sys/module.h> #include <sys/mutex.h> +#include <sys/nv.h> #include <sys/protosw.h> #include <sys/refcount.h> #include <sys/socket.h> @@ -138,7 +139,7 @@ static void tcp_free_qpair(struct nvmf_qpair *nq); SYSCTL_NODE(_kern_nvmf, OID_AUTO, tcp, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "TCP transport"); static u_int tcp_max_transmit_data = 256 * 1024; -SYSCTL_UINT(_kern_nvmf_tcp, OID_AUTO, max_c2hdata, CTLFLAG_RWTUN, +SYSCTL_UINT(_kern_nvmf_tcp, OID_AUTO, max_transmit_data, CTLFLAG_RWTUN, &tcp_max_transmit_data, 0, "Maximum size of data payload in a transmitted PDU"); @@ -442,7 +443,7 @@ nvmf_tcp_construct_pdu(struct nvmf_tcp_qpair *qp, void *hdr, size_t hlen, plen += sizeof(digest); if (data_len != 0) { KASSERT(m_length(data, NULL) == data_len, ("length mismatch")); - pdo = roundup2(plen, qp->txpda); + pdo = roundup(plen, qp->txpda); pad = pdo - plen; plen = pdo + data_len; if (qp->data_digests) @@ -623,10 +624,7 @@ mbuf_copyto_io(struct mbuf *m, u_int skip, u_int len, while (len != 0) { MPASS((m->m_flags & M_EXTPG) == 0); - todo = m->m_len - skip; - if (todo > len) - todo = len; - + todo = min(m->m_len - skip, len); memdesc_copyback(&io->io_mem, io_offset, todo, mtodo(m, skip)); skip = 0; io_offset += todo; @@ -887,7 +885,7 @@ nvmf_tcp_mext_pg(void *arg, int how) struct nvmf_tcp_command_buffer *cb = arg; struct mbuf *m; - m = mb_alloc_ext_pgs(how, nvmf_tcp_free_mext_pg); + m = mb_alloc_ext_pgs(how, nvmf_tcp_free_mext_pg, M_RDONLY); m->m_ext.ext_arg1 = cb; tcp_hold_command_buffer(cb); return (m); @@ -1000,9 +998,7 @@ nvmf_tcp_handle_r2t(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu) struct mbuf *m; uint32_t sent, todo; - todo = data_len; - if (todo > qp->max_tx_data) - todo = qp->max_tx_data; + todo = min(data_len, qp->max_tx_data); m = nvmf_tcp_command_buffer_mbuf(cb, data_offset, todo, &sent, todo < data_len); tcp_send_h2c_pdu(qp, r2t->cccid, r2t->ttag, data_offset, m, @@ -1418,8 +1414,7 @@ nvmf_soupcall_send(struct socket *so, void *arg, int waitflag) } static struct nvmf_qpair * -tcp_allocate_qpair(bool controller, - const struct nvmf_handoff_qpair_params *params) +tcp_allocate_qpair(bool controller, const nvlist_t *nvl) { struct nvmf_tcp_qpair *qp; struct socket *so; @@ -1427,8 +1422,18 @@ tcp_allocate_qpair(bool controller, cap_rights_t rights; int error; - error = fget(curthread, params->tcp.fd, cap_rights_init_one(&rights, - CAP_SOCK_CLIENT), &fp); + if (!nvlist_exists_number(nvl, "fd") || + !nvlist_exists_number(nvl, "rxpda") || + !nvlist_exists_number(nvl, "txpda") || + !nvlist_exists_bool(nvl, "header_digests") || + !nvlist_exists_bool(nvl, "data_digests") || + !nvlist_exists_number(nvl, "maxr2t") || + !nvlist_exists_number(nvl, "maxh2cdata") || + !nvlist_exists_number(nvl, "max_icd")) + return (NULL); + + error = fget(curthread, nvlist_get_number(nvl, "fd"), + cap_rights_init_one(&rights, CAP_SOCK_CLIENT), &fp); if (error != 0) return (NULL); if (fp->f_type != DTYPE_SOCKET) { @@ -1450,26 +1455,28 @@ tcp_allocate_qpair(bool controller, qp = malloc(sizeof(*qp), M_NVMF_TCP, M_WAITOK | M_ZERO); qp->so = so; refcount_init(&qp->refs, 1); - qp->txpda = params->tcp.txpda; - qp->rxpda = params->tcp.rxpda; - qp->header_digests = params->tcp.header_digests; - qp->data_digests = params->tcp.data_digests; - qp->maxr2t = params->tcp.maxr2t; - qp->maxh2cdata = params->tcp.maxh2cdata; + qp->txpda = nvlist_get_number(nvl, "txpda"); + qp->rxpda = nvlist_get_number(nvl, "rxpda"); + qp->header_digests = nvlist_get_bool(nvl, "header_digests"); + qp->data_digests = nvlist_get_bool(nvl, 
"data_digests"); + qp->maxr2t = nvlist_get_number(nvl, "maxr2t"); + if (controller) + qp->maxh2cdata = nvlist_get_number(nvl, "maxh2cdata"); qp->max_tx_data = tcp_max_transmit_data; if (!controller) { - if (qp->max_tx_data > params->tcp.maxh2cdata) - qp->max_tx_data = params->tcp.maxh2cdata; + qp->max_tx_data = min(qp->max_tx_data, + nvlist_get_number(nvl, "maxh2cdata")); + qp->max_icd = nvlist_get_number(nvl, "max_icd"); } - qp->max_icd = params->tcp.max_icd; if (controller) { /* Use the SUCCESS flag if SQ flow control is disabled. */ - qp->send_success = !params->sq_flow_control; + qp->send_success = !nvlist_get_bool(nvl, "sq_flow_control"); /* NB: maxr2t is 0's based. */ qp->num_ttags = MIN((u_int)UINT16_MAX + 1, - (uint64_t)params->qsize * (uint64_t)qp->maxr2t + 1); + nvlist_get_number(nvl, "qsize") * + ((uint64_t)qp->maxr2t + 1)); qp->open_ttags = mallocarray(qp->num_ttags, sizeof(*qp->open_ttags), M_NVMF_TCP, M_WAITOK | M_ZERO); } @@ -1558,6 +1565,7 @@ tcp_free_qpair(struct nvmf_qpair *nq) for (u_int i = 0; i < qp->num_ttags; i++) { cb = qp->open_ttags[i]; if (cb != NULL) { + cb->tc->active_r2ts--; cb->error = ECONNABORTED; tcp_release_command_buffer(cb); } @@ -1569,6 +1577,10 @@ tcp_free_qpair(struct nvmf_qpair *nq) TAILQ_FOREACH_SAFE(cb, &qp->rx_buffers.head, link, ncb) { tcp_remove_command_buffer(&qp->rx_buffers, cb); mtx_unlock(&qp->rx_buffers.lock); +#ifdef INVARIANTS + if (cb->tc != NULL) + cb->tc->pending_r2ts--; +#endif cb->error = ECONNABORTED; tcp_release_command_buffer(cb); mtx_lock(&qp->rx_buffers.lock); @@ -1784,7 +1796,6 @@ tcp_send_controller_data(struct nvmf_capsule *nc, uint32_t data_offset, { struct nvmf_tcp_qpair *qp = TQP(nc->nc_qpair); struct nvme_sgl_descriptor *sgl; - struct mbuf *n, *p; uint32_t data_len; bool last_pdu, last_xfer; @@ -1813,21 +1824,29 @@ tcp_send_controller_data(struct nvmf_capsule *nc, uint32_t data_offset, /* Queue one more C2H_DATA PDUs containing the data from 'm'. */ while (m != NULL) { + struct mbuf *n; uint32_t todo; - todo = m->m_len; - p = m; - n = p->m_next; - while (n != NULL) { - if (todo + n->m_len > qp->max_tx_data) { - p->m_next = NULL; - break; - } - todo += n->m_len; - p = n; + if (m->m_len > qp->max_tx_data) { + n = m_split(m, qp->max_tx_data, M_WAITOK); + todo = m->m_len; + } else { + struct mbuf *p; + + todo = m->m_len; + p = m; n = p->m_next; + while (n != NULL) { + if (todo + n->m_len > qp->max_tx_data) { + p->m_next = NULL; + break; + } + todo += n->m_len; + p = n; + n = p->m_next; + } + MPASS(m_length(m, NULL) == todo); } - MPASS(m_length(m, NULL) == todo); last_pdu = (n == NULL && last_xfer); tcp_send_c2h_pdu(qp, nc->nc_sqe.cid, data_offset, m, todo, diff --git a/sys/dev/nvmf/nvmf_tcp.h b/sys/dev/nvmf/nvmf_tcp.h index 00b0917f75a4..03b5d2445928 100644 --- a/sys/dev/nvmf/nvmf_tcp.h +++ b/sys/dev/nvmf/nvmf_tcp.h @@ -9,7 +9,6 @@ #define __NVMF_TCP_H__ #ifndef _KERNEL -#define __assert_unreachable __unreachable #define MPASS assert #endif @@ -41,6 +40,13 @@ nvmf_tcp_validate_pdu_header(const struct nvme_tcp_common_pdu_hdr *ch, uint8_t digest_flags, valid_flags; plen = le32toh(ch->plen); + full_hlen = ch->hlen; + if ((ch->flags & NVME_TCP_CH_FLAGS_HDGSTF) != 0) + full_hlen += sizeof(uint32_t); + if (plen == full_hlen) + data_len = 0; + else + data_len = plen - ch->pdo; /* * Errors must be reported for the lowest incorrect field @@ -50,7 +56,7 @@ nvmf_tcp_validate_pdu_header(const struct nvme_tcp_common_pdu_hdr *ch, /* Validate pdu_type. */ /* Controllers only receive PDUs with a PDU direction of 0. 
 */
-	if (controller != (ch->pdu_type & 0x01) == 0) {
+	if (controller != ((ch->pdu_type & 0x01) == 0)) {
 		printf("NVMe/TCP: Invalid PDU type %u\n", ch->pdu_type);
 		*fes = NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD;
 		*fei = offsetof(struct nvme_tcp_common_pdu_hdr, pdu_type);
@@ -125,11 +131,15 @@ nvmf_tcp_validate_pdu_header(const struct nvme_tcp_common_pdu_hdr *ch,
 		return (EBADMSG);
 	}

-	/* Verify that digests are present iff enabled. */
+	/*
+	 * Verify that digests are present iff enabled.  Note that the
+	 * data digest will not be present if there is no data
+	 * payload.
+	 */
 	digest_flags = 0;
 	if (header_digests)
 		digest_flags |= NVME_TCP_CH_FLAGS_HDGSTF;
-	if (data_digests)
+	if (data_digests && data_len != 0)
 		digest_flags |= NVME_TCP_CH_FLAGS_DDGSTF;
 	if ((digest_flags & valid_flags) !=
 	    (ch->flags & (NVME_TCP_CH_FLAGS_HDGSTF |
@@ -184,9 +194,6 @@ nvmf_tcp_validate_pdu_header(const struct nvme_tcp_common_pdu_hdr *ch,
 	}

 	/* Validate pdo. */
-	full_hlen = ch->hlen;
-	if ((ch->flags & NVME_TCP_CH_FLAGS_HDGSTF) != 0)
-		full_hlen += sizeof(uint32_t);
 	switch (ch->pdu_type) {
 	default:
 		__assert_unreachable();
@@ -207,7 +214,7 @@ nvmf_tcp_validate_pdu_header(const struct nvme_tcp_common_pdu_hdr *ch,
 	case NVME_TCP_PDU_TYPE_H2C_DATA:
 	case NVME_TCP_PDU_TYPE_C2H_DATA:
 		/* Permit PDO of 0 if there is no data. */
-		if (full_hlen == plen && ch->pdo == 0)
+		if (data_len == 0 && ch->pdo == 0)
 			break;

 		if (ch->pdo < full_hlen || ch->pdo > plen ||
@@ -229,10 +236,6 @@ nvmf_tcp_validate_pdu_header(const struct nvme_tcp_common_pdu_hdr *ch,
 		return (EBADMSG);
 	}

-	if (plen == full_hlen)
-		data_len = 0;
-	else
-		data_len = plen - ch->pdo;
 	switch (ch->pdu_type) {
 	default:
 		__assert_unreachable();
diff --git a/sys/dev/nvmf/nvmf_transport.c b/sys/dev/nvmf/nvmf_transport.c
index 14d526192270..1d3f5ea4cf69 100644
--- a/sys/dev/nvmf/nvmf_transport.c
+++ b/sys/dev/nvmf/nvmf_transport.c
@@ -12,6 +12,7 @@
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/module.h>
+#include <sys/nv.h>
 #include <sys/refcount.h>
 #include <sys/sysctl.h>
 #include <sys/sx.h>
@@ -47,8 +48,7 @@ nvmf_supported_trtype(enum nvmf_trtype trtype)
 struct nvmf_qpair *
 nvmf_allocate_qpair(enum nvmf_trtype trtype, bool controller,
-    const struct nvmf_handoff_qpair_params *params,
-    nvmf_qpair_error_t *error_cb, void *error_cb_arg,
+    const nvlist_t *params, nvmf_qpair_error_t *error_cb, void *error_cb_arg,
     nvmf_capsule_receive_t *receive_cb, void *receive_cb_arg)
 {
 	struct nvmf_transport *nt;
@@ -76,7 +76,7 @@ nvmf_allocate_qpair(enum nvmf_trtype trtype, bool controller,
 	qp->nq_error_arg = error_cb_arg;
 	qp->nq_receive = receive_cb;
 	qp->nq_receive_arg = receive_cb_arg;
-	qp->nq_admin = params->admin;
+	qp->nq_admin = nvlist_get_bool(params, "admin");
 	return (qp);
 }
@@ -180,6 +180,14 @@ nvmf_capsule_cqe(struct nvmf_capsule *nc)
 	return (&nc->nc_cqe);
 }
+bool
+nvmf_sqhd_valid(struct nvmf_capsule *nc)
+{
+	KASSERT(nc->nc_qe_len == sizeof(struct nvme_completion),
+	    ("%s: capsule %p is not a response capsule", __func__, nc));
+	return (nc->nc_sqhd_valid);
+}
+
 uint8_t
 nvmf_validate_command_capsule(struct nvmf_capsule *nc)
 {
@@ -223,6 +231,92 @@ nvmf_send_controller_data(struct nvmf_capsule *nc, uint32_t data_offset,
 }
 int
+nvmf_pack_ioc_nvlist(const nvlist_t *nvl, struct nvmf_ioc_nv *nv)
+{
+	void *packed;
+	int error;
+
+	error = nvlist_error(nvl);
+	if (error != 0)
+		return (error);
+
+	if (nv->size == 0) {
+		nv->len = nvlist_size(nvl);
+	} else {
+		packed = nvlist_pack(nvl, &nv->len);
+		if (packed == NULL)
+			error = ENOMEM;
+		else if (nv->len > nv->size)
+			error = EFBIG;
+		else
+			error = copyout(packed, nv->data, nv->len);
+		free(packed, M_NVLIST);
+	}
+	return (error);
+}
+
+int
+nvmf_unpack_ioc_nvlist(const struct nvmf_ioc_nv *nv, nvlist_t **nvlp)
+{
+	void *packed;
+	nvlist_t *nvl;
+	int error;
+
+	packed = malloc(nv->size, M_NVMF_TRANSPORT, M_WAITOK);
+	error = copyin(nv->data, packed, nv->size);
+	if (error != 0) {
+		free(packed, M_NVMF_TRANSPORT);
+		return (error);
+	}
+
+	nvl = nvlist_unpack(packed, nv->size, 0);
+	free(packed, M_NVMF_TRANSPORT);
+	if (nvl == NULL)
+		return (EINVAL);
+
+	*nvlp = nvl;
+	return (0);
+}
+
+bool
+nvmf_validate_qpair_nvlist(const nvlist_t *nvl, bool controller)
+{
+	uint64_t value, qsize;
+	bool admin, valid;
+
+	valid = true;
+	valid &= nvlist_exists_bool(nvl, "admin");
+	valid &= nvlist_exists_bool(nvl, "sq_flow_control");
+	valid &= nvlist_exists_number(nvl, "qsize");
+	valid &= nvlist_exists_number(nvl, "sqhd");
+	if (!controller)
+		valid &= nvlist_exists_number(nvl, "sqtail");
+	if (!valid)
+		return (false);
+
+	admin = nvlist_get_bool(nvl, "admin");
+	qsize = nvlist_get_number(nvl, "qsize");
+	if (admin) {
+		if (qsize < NVME_MIN_ADMIN_ENTRIES ||
+		    qsize > NVME_MAX_ADMIN_ENTRIES)
+			return (false);
+	} else {
+		if (qsize < NVME_MIN_IO_ENTRIES || qsize > NVME_MAX_IO_ENTRIES)
+			return (false);
+	}
+	value = nvlist_get_number(nvl, "sqhd");
+	if (value > qsize - 1)
+		return (false);
+	if (!controller) {
+		value = nvlist_get_number(nvl, "sqtail");
+		if (value > qsize - 1)
+			return (false);
+	}
+
+	return (true);
+}
+
+int
 nvmf_transport_module_handler(struct module *mod, int what, void *arg)
 {
 	struct nvmf_transport_ops *ops = arg;
@@ -292,8 +386,6 @@ nvmf_transport_module_handler(struct module *mod, int what, void *arg)
 		prev = nt;
 	}
 	if (nt == NULL) {
-		KASSERT(nt->nt_active_qpairs == 0,
-		    ("unregistered transport has connections"));
 		sx_xunlock(&nvmf_transports_lock);
 		return (0);
 	}
diff --git a/sys/dev/nvmf/nvmf_transport.h b/sys/dev/nvmf/nvmf_transport.h
index 549170b25940..b192baeaccc1 100644
--- a/sys/dev/nvmf/nvmf_transport.h
+++ b/sys/dev/nvmf/nvmf_transport.h
@@ -13,6 +13,7 @@
 * (target) to send and receive capsules and associated data.
 */
+#include <sys/_nv.h>
 #include <sys/sysctl.h>
 #include <dev/nvmf/nvmf_proto.h>
@@ -20,8 +21,8 @@ struct mbuf;
 struct memdesc;
 struct nvmf_capsule;
 struct nvmf_connection;
+struct nvmf_ioc_nv;
 struct nvmf_qpair;
-struct nvmf_handoff_qpair_params;
 SYSCTL_DECL(_kern_nvmf);
@@ -54,7 +55,7 @@ typedef void nvmf_io_complete_t(void *, size_t, int);
 * independent.
 */
 struct nvmf_qpair *nvmf_allocate_qpair(enum nvmf_trtype trtype,
-    bool controller, const struct nvmf_handoff_qpair_params *params,
+    bool controller, const nvlist_t *params,
     nvmf_qpair_error_t *error_cb, void *error_cb_arg,
     nvmf_capsule_receive_t *receive_cb, void *receive_cb_arg);
 void nvmf_free_qpair(struct nvmf_qpair *qp);
@@ -78,6 +79,7 @@ int nvmf_transmit_capsule(struct nvmf_capsule *nc);
 void nvmf_abort_capsule_data(struct nvmf_capsule *nc, int error);
 void *nvmf_capsule_sqe(struct nvmf_capsule *nc);
 void *nvmf_capsule_cqe(struct nvmf_capsule *nc);
+bool nvmf_sqhd_valid(struct nvmf_capsule *nc);
 /* Controller-specific APIs. */
@@ -137,4 +139,23 @@ u_int nvmf_send_controller_data(struct nvmf_capsule *nc,
 #define NVMF_SUCCESS_SENT	0x100
 #define NVMF_MORE		0x101
+/* Helper APIs for nvlists used in ioctls. */
+
+/*
+ * Pack the nvlist nvl and copyout to the buffer described by nv.
+ */
+int nvmf_pack_ioc_nvlist(const nvlist_t *nvl, struct nvmf_ioc_nv *nv);
+
+/*
+ * Copyin and unpack an nvlist described by nv. 
The unpacked nvlist
+ * is returned in *nvlp on success.
+ */
+int nvmf_unpack_ioc_nvlist(const struct nvmf_ioc_nv *nv, nvlist_t **nvlp);
+
+/*
+ * Returns true if a qpair handoff nvlist has all the required
+ * transport-independent values.
+ */
+bool nvmf_validate_qpair_nvlist(const nvlist_t *nvl, bool controller);
+
 #endif /* !__NVMF_TRANSPORT_H__ */
diff --git a/sys/dev/nvmf/nvmf_transport_internal.h b/sys/dev/nvmf/nvmf_transport_internal.h
index 0be427ee0690..eb819a5c83b9 100644
--- a/sys/dev/nvmf/nvmf_transport_internal.h
+++ b/sys/dev/nvmf/nvmf_transport_internal.h
@@ -8,6 +8,7 @@
 #ifndef __NVMF_TRANSPORT_INTERNAL_H__
 #define __NVMF_TRANSPORT_INTERNAL_H__
+#include <sys/_nv.h>
 #include <sys/memdesc.h>
 /*
@@ -21,7 +22,7 @@ struct nvmf_io_request;
 struct nvmf_transport_ops {
 	/* Queue pair management. */
 	struct nvmf_qpair *(*allocate_qpair)(bool controller,
-	    const struct nvmf_handoff_qpair_params *params);
+	    const nvlist_t *nvl);
 	void (*free_qpair)(struct nvmf_qpair *qp);
 	/* Capsule operations. */
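
The nvlist layouts documented in nvmf.h above replace the old fixed-layout handoff structs. The following userspace sketch shows the shape of that convention: the key names and the NVMF_HANDOFF_HOST ioctl come from this diff, while the descriptor arguments and every parameter value are hypothetical placeholders, and a complete handoff would also populate the "io", "cdata", and "rparams" keys.

```c
/*
 * Hypothetical sketch only: hand an already-connected admin queue off
 * to the in-kernel Fabrics host.  All parameter values below are
 * placeholders; a real tool derives them from the connect exchange.
 */
#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/nv.h>

#include <dev/nvmf/nvmf.h>
#include <dev/nvmf/nvmf_proto.h>

#include <err.h>
#include <stdlib.h>
#include <string.h>

static void
handoff_admin_queue(int nvmf_fd, int sock_fd)
{
	struct nvmf_ioc_nv nv;
	nvlist_t *nvl, *admin;

	/* Transport-independent keys checked by nvmf_validate_qpair_nvlist(). */
	admin = nvlist_create(0);
	nvlist_add_bool(admin, "admin", true);
	nvlist_add_bool(admin, "sq_flow_control", true);
	nvlist_add_number(admin, "qsize", 32);
	nvlist_add_number(admin, "sqhd", 0);
	nvlist_add_number(admin, "sqtail", 0);

	/* TCP-specific keys checked in tcp_allocate_qpair(). */
	nvlist_add_number(admin, "fd", sock_fd);
	nvlist_add_number(admin, "rxpda", 0);
	nvlist_add_number(admin, "txpda", 0);
	nvlist_add_bool(admin, "header_digests", false);
	nvlist_add_bool(admin, "data_digests", false);
	nvlist_add_number(admin, "maxr2t", 1);
	nvlist_add_number(admin, "maxh2cdata", 256 * 1024);
	nvlist_add_number(admin, "max_icd", 8192);

	nvl = nvlist_create(0);
	nvlist_add_number(nvl, "trtype", NVMF_TRTYPE_TCP);
	nvlist_move_nvlist(nvl, "admin", admin);
	/*
	 * A complete handoff also adds the "io" qpair nvlist array,
	 * the "cdata" controller data blob, and "rparams", as listed
	 * in the nvmf.h comments; they are omitted here for brevity.
	 */

	memset(&nv, 0, sizeof(nv));
	nv.data = nvlist_pack(nvl, &nv.size);
	if (nv.data == NULL)
		err(1, "nvlist_pack");
	if (ioctl(nvmf_fd, NVMF_HANDOFF_HOST, &nv) == -1)
		err(1, "NVMF_HANDOFF_HOST");
	free(nv.data);
	nvlist_destroy(nvl);
}
```

Packing the parameters as an nvlist keeps the ioctl ABI stable: optional keys such as "reconnect_delay" and "controller_loss_timeout" can be added or omitted without redefining the ioctl.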
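For ioctls that copy data out, the struct nvmf_ioc_nv comment above defines a two-call pattern: a zero size makes the kernel (via nvmf_pack_ioc_nvlist()) report the required buffer length in len without copying any data. A minimal sketch, assuming an open /dev/nvmeX descriptor for a Fabrics controller:

```c
/*
 * Sketch of the documented size-query convention for
 * NVMF_CONNECTION_STATUS; the descriptor handling around it is
 * assumed, not part of this diff.
 */
#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/nv.h>

#include <dev/nvmf/nvmf.h>

#include <err.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static void
print_connection_status(int ctrlr_fd)
{
	struct nvmf_ioc_nv nv;
	nvlist_t *nvl;

	/* First call: size == 0, so the kernel stores the needed size in len. */
	memset(&nv, 0, sizeof(nv));
	if (ioctl(ctrlr_fd, NVMF_CONNECTION_STATUS, &nv) == -1)
		err(1, "NVMF_CONNECTION_STATUS (size query)");

	/* Second call: provide a buffer of the reported size. */
	nv.size = nv.len;
	nv.data = malloc(nv.size);
	if (nv.data == NULL)
		err(1, "malloc");
	if (ioctl(ctrlr_fd, NVMF_CONNECTION_STATUS, &nv) == -1)
		err(1, "NVMF_CONNECTION_STATUS");

	nvl = nvlist_unpack(nv.data, nv.len, 0);
	if (nvl == NULL)
		errx(1, "failed to unpack nvlist");
	printf("connected: %s\n",
	    nvlist_get_bool(nvl, "connected") ? "yes" : "no");
	nvlist_destroy(nvl);
	free(nv.data);
}
```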
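On the kernel side, the transport helpers added above compose in the obvious way. This is an illustrative sketch rather than code from the diff: the function name and the ioctl plumbing around it are invented, and nvmf_copyin_handoff() in the host code plays this role for real handoffs.

```c
/*
 * Illustrative sketch: consume a packed qpair handoff nvlist with the
 * helpers declared in nvmf_transport.h.  The function name and the
 * surrounding plumbing are hypothetical.
 */
#include <sys/param.h>
#include <sys/errno.h>
#include <sys/nv.h>

#include <dev/nvmf/nvmf.h>
#include <dev/nvmf/nvmf_transport.h>

static int
example_qpair_handoff(const struct nvmf_ioc_nv *nv, bool controller)
{
	nvlist_t *nvl;
	int error;

	/* Copy in and unpack the packed nvlist from userspace. */
	error = nvmf_unpack_ioc_nvlist(nv, &nvl);
	if (error != 0)
		return (error);

	/* Reject a handoff missing required transport-independent keys. */
	if (!nvmf_validate_qpair_nvlist(nvl, controller)) {
		nvlist_destroy(nvl);
		return (EINVAL);
	}

	/*
	 * A real consumer would pass nvl to nvmf_allocate_qpair()
	 * together with its error and receive callbacks before
	 * destroying it.
	 */
	nvlist_destroy(nvl);
	return (0);
}
```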