4 files changed, 224 insertions, 9 deletions
diff --git a/sys/dev/mlx5/mlx5_en/mlx5_en_hw_tls_rx.c b/sys/dev/mlx5/mlx5_en/mlx5_en_hw_tls_rx.c
index 8b8f2e570245..4de451f1b039 100644
--- a/sys/dev/mlx5/mlx5_en/mlx5_en_hw_tls_rx.c
+++ b/sys/dev/mlx5/mlx5_en/mlx5_en_hw_tls_rx.c
@@ -42,13 +42,30 @@
 
 static if_snd_tag_free_t mlx5e_tls_rx_snd_tag_free;
 static if_snd_tag_modify_t mlx5e_tls_rx_snd_tag_modify;
+static if_snd_tag_status_str_t mlx5e_tls_rx_snd_tag_status_str;
 
 static const struct if_snd_tag_sw mlx5e_tls_rx_snd_tag_sw = {
 	.snd_tag_modify = mlx5e_tls_rx_snd_tag_modify,
 	.snd_tag_free = mlx5e_tls_rx_snd_tag_free,
+	.snd_tag_status_str = mlx5e_tls_rx_snd_tag_status_str,
 	.type = IF_SND_TAG_TYPE_TLS_RX
 };
 
+static const char *mlx5e_tls_rx_progress_params_auth_state_str[] = {
+	[MLX5E_TLS_RX_PROGRESS_PARAMS_AUTH_STATE_NO_OFFLOAD] = "no_offload",
+	[MLX5E_TLS_RX_PROGRESS_PARAMS_AUTH_STATE_OFFLOAD] = "offload",
+	[MLX5E_TLS_RX_PROGRESS_PARAMS_AUTH_STATE_AUTHENTICATION] =
+	    "authentication",
+};	/* indexed by MLX5E_TLS_RX_PROGRESS_PARAMS_AUTH_STATE_* */
+
+static const char *mlx5e_tls_rx_progress_params_record_tracker_state_str[] = {
+	[MLX5E_TLS_RX_PROGRESS_PARAMS_RECORD_TRACKER_STATE_START] = "start",
+	[MLX5E_TLS_RX_PROGRESS_PARAMS_RECORD_TRACKER_STATE_TRACKING] =
+	    "tracking",
+	[MLX5E_TLS_RX_PROGRESS_PARAMS_RECORD_TRACKER_STATE_SEARCHING] =
+	    "searching",
+};	/* indexed by MLX5E_TLS_RX_PROGRESS_PARAMS_RECORD_TRACKER_STATE_* */
+
 MALLOC_DEFINE(M_MLX5E_TLS_RX, "MLX5E_TLS_RX", "MLX5 ethernet HW TLS RX");
 
 /* software TLS RX context */
@@ -250,7 +267,8 @@ mlx5e_tls_rx_send_progress_parameters_sync(struct mlx5e_iq *iq,
 	mtx_unlock(&iq->lock);
 
 	while (1) {
-		if (wait_for_completion_timeout(&ptag->progress_complete, hz) != 0)
+		if (wait_for_completion_timeout(&ptag->progress_complete,
+		    msecs_to_jiffies(1000)) != 0)
 			break;
 		priv = container_of(iq, struct mlx5e_channel, iq)->priv;
 		if (priv->mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR ||
@@ -331,7 +349,8 @@ done:
  * Zero is returned upon success, else some error happened.
  */
 static int
-mlx5e_tls_rx_receive_progress_parameters(struct mlx5e_iq *iq, struct mlx5e_tls_rx_tag *ptag)
+mlx5e_tls_rx_receive_progress_parameters(struct mlx5e_iq *iq,
+    struct mlx5e_tls_rx_tag *ptag, mlx5e_iq_callback_t *cb)
 {
 	struct mlx5e_get_tls_progress_params_wqe *wqe;
 	const u32 ds_cnt = DIV_ROUND_UP(sizeof(*wqe), MLX5_SEND_WQE_DS);
@@ -367,7 +386,7 @@ mlx5e_tls_rx_receive_progress_parameters(struct mlx5e_iq *iq, struct mlx5e_tls_r
 	memcpy(iq->doorbell.d32, &wqe->ctrl, sizeof(iq->doorbell.d32));
 
 	iq->data[pi].num_wqebbs = DIV_ROUND_UP(ds_cnt, MLX5_SEND_WQEBB_NUM_DS);
-	iq->data[pi].callback = &mlx5e_tls_rx_receive_progress_parameters_cb;
+	iq->data[pi].callback = cb;
 	iq->data[pi].arg = ptag;
 
 	m_snd_tag_ref(&ptag->tag);
@@ -819,6 +838,7 @@ mlx5e_tls_rx_snd_tag_alloc(if_t ifp,
 	}
 
 	ptag->flow_rule = flow_rule;
+	init_completion(&ptag->progress_complete);
 
 	return (0);
 
@@ -968,7 +988,8 @@ mlx5e_tls_rx_snd_tag_modify(struct m_snd_tag *pmt, union if_snd_tag_modify_param
 	    params->tls_rx.tls_rec_length,
 	    params->tls_rx.tls_seq_number) &&
 	    ptag->tcp_resync_pending == 0) {
-		err = mlx5e_tls_rx_receive_progress_parameters(iq, ptag);
+		err = mlx5e_tls_rx_receive_progress_parameters(iq, ptag,
+		    &mlx5e_tls_rx_receive_progress_parameters_cb);
 		if (err != 0) {
 			MLX5E_TLS_RX_STAT_INC(ptag, rx_resync_err, 1);
 		} else {
@@ -1001,6 +1022,74 @@ mlx5e_tls_rx_snd_tag_free(struct m_snd_tag *pmt)
 	queue_work(priv->tls_rx.wq, &ptag->work);
 }
 
+/* IQ completion callback: wake the status waiter and drop the tag ref. */
+static void
+mlx5e_tls_rx_str_status_cb(void *arg)
+{
+	struct mlx5e_tls_rx_tag *ptag = arg;
+
+	complete_all(&ptag->progress_complete);
+	m_snd_tag_rele(&ptag->tag);
+}
+
+/*
+ * Fetch the TLS RX progress parameters of "pmt" through an IQ and format
+ * the record tracker and authentication states into "buf".  A NULL "buf"
+ * returns zero immediately.  On success "*sz" is lowered to the formatted
+ * length unless that length exceeds it.  ENXIO is returned if the device
+ * is found failed or offline while waiting, EINVAL for an unknown state.
+ */
+static int
+mlx5e_tls_rx_snd_tag_status_str(struct m_snd_tag *pmt, char *buf, size_t *sz)
+{
+	struct mlx5e_tls_rx_tag *ptag;
+	struct mlx5e_priv *priv;
+	struct mlx5e_iq *iq;
+	uint32_t tracker_val, auth_val;
+	void *progress;
+	int error, len;
+
+	if (buf == NULL)
+		return (0);
+	ptag = container_of(pmt, struct mlx5e_tls_rx_tag, tag);
+
+	/* Post the query under the tag lock; the callback completes the wait. */
+	MLX5E_TLS_RX_TAG_LOCK(ptag);
+	priv = container_of(ptag->tls_rx, struct mlx5e_priv, tls_rx);
+	iq = mlx5e_tls_rx_get_iq(priv, ptag->flowid, ptag->flowtype);
+	reinit_completion(&ptag->progress_complete);
+	error = mlx5e_tls_rx_receive_progress_parameters(iq, ptag,
+	    &mlx5e_tls_rx_str_status_cb);
+	MLX5E_TLS_RX_TAG_UNLOCK(ptag);
+	if (error != 0)
+		return (error);
+
+	/* Re-check device health each time a one second wait expires. */
+	while (wait_for_completion_timeout(&ptag->progress_complete,
+	    msecs_to_jiffies(1000)) == 0) {
+		if (priv->mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR ||
+		    pci_channel_offline(priv->mdev->pdev) != 0)
+			return (ENXIO);
+	}
+
+	progress = mlx5e_tls_rx_get_progress_buffer(ptag);
+	tracker_val = MLX5_GET(tls_progress_params, progress,
+	    record_tracker_state);
+	auth_val = MLX5_GET(tls_progress_params, progress, auth_state);
+	/* Both values index the state name tables; reject larger ones. */
+	if (tracker_val >
+	    MLX5E_TLS_RX_PROGRESS_PARAMS_RECORD_TRACKER_STATE_SEARCHING ||
+	    auth_val > MLX5E_TLS_RX_PROGRESS_PARAMS_AUTH_STATE_AUTHENTICATION)
+		return (EINVAL);
+
+	len = snprintf(buf, *sz, "tracker_state: %s, auth_state: %s",
+	    mlx5e_tls_rx_progress_params_record_tracker_state_str[tracker_val],
+	    mlx5e_tls_rx_progress_params_auth_state_str[auth_val]);
+	if ((size_t)len <= *sz)
+		*sz = len;
+	return (0);
+}
+
 #else
 
 int
diff --git a/sys/dev/nvmf/host/nvmf.c b/sys/dev/nvmf/host/nvmf.c
index dbdd4568bdf1..1ac0d142443b 100644
--- a/sys/dev/nvmf/host/nvmf.c
+++ b/sys/dev/nvmf/host/nvmf.c
@@ -27,6 +27,7 @@
 #include <dev/nvmf/host/nvmf_var.h>
 
 static struct cdevsw nvmf_cdevsw;
+static struct taskqueue *nvmf_tq;
 
 bool nvmf_fail_disconnect = false;
 SYSCTL_BOOL(_kern_nvmf, OID_AUTO, fail_on_disconnection, CTLFLAG_RWTUN,
@@ -34,7 +35,10 @@ SYSCTL_BOOL(_kern_nvmf, OID_AUTO, fail_on_disconnection, CTLFLAG_RWTUN,
 
 MALLOC_DEFINE(M_NVMF, "nvmf", "NVMe over Fabrics host");
 
+static void	nvmf_controller_loss_task(void *arg, int pending);
 static void	nvmf_disconnect_task(void *arg, int pending);
+static void	nvmf_request_reconnect(struct nvmf_softc *sc);
+static void	nvmf_request_reconnect_task(void *arg, int pending);
 static void	nvmf_shutdown_pre_sync(void *arg, int howto);
 static void	nvmf_shutdown_post_sync(void *arg, int howto);
 
@@ -294,6 +298,9 @@ nvmf_establish_connection(struct nvmf_softc *sc, nvlist_t *nvl)
 	admin = nvlist_get_nvlist(nvl, "admin");
 	io = nvlist_get_nvlist_array(nvl, "io", &num_io_queues);
 	kato = dnvlist_get_number(nvl, "kato", 0);
+	sc->reconnect_delay = dnvlist_get_number(nvl, "reconnect_delay", 0);
+	sc->controller_loss_timeout = dnvlist_get_number(nvl,
+	    "controller_loss_timeout", 0);
 
 	/* Setup the admin queue. */
 	sc->admin = nvmf_init_qp(sc, trtype, admin, "admin queue", 0);
@@ -504,6 +511,10 @@ nvmf_attach(device_t dev)
 	callout_init(&sc->ka_tx_timer, 1);
 	sx_init(&sc->connection_lock, "nvmf connection");
 	TASK_INIT(&sc->disconnect_task, 0, nvmf_disconnect_task, sc);
+	TIMEOUT_TASK_INIT(nvmf_tq, &sc->controller_loss_task, 0,
+	    nvmf_controller_loss_task, sc);
+	TIMEOUT_TASK_INIT(nvmf_tq, &sc->request_reconnect_task, 0,
+	    nvmf_request_reconnect_task, sc);
 
 	oid = SYSCTL_ADD_NODE(device_get_sysctl_ctx(dev),
 	    SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "ioq",
@@ -603,7 +614,9 @@ out:
 
 	nvmf_destroy_aer(sc);
 
-	taskqueue_drain(taskqueue_thread, &sc->disconnect_task);
+	taskqueue_drain_timeout(nvmf_tq, &sc->request_reconnect_task);
+	taskqueue_drain_timeout(nvmf_tq, &sc->controller_loss_task);
+	taskqueue_drain(nvmf_tq, &sc->disconnect_task);
 	sx_destroy(&sc->connection_lock);
 	nvlist_destroy(sc->rparams);
 	free(sc->cdata, M_NVMF);
@@ -613,7 +626,7 @@ out:
 void
 nvmf_disconnect(struct nvmf_softc *sc)
 {
-	taskqueue_enqueue(taskqueue_thread, &sc->disconnect_task);
+	taskqueue_enqueue(nvmf_tq, &sc->disconnect_task);
 }
 
 static void
@@ -676,6 +689,74 @@ nvmf_disconnect_task(void *arg, int pending __unused)
 	nvmf_destroy_qp(sc->admin);
 	sc->admin = NULL;
 
+	if (sc->reconnect_delay != 0)
+		nvmf_request_reconnect(sc);
+	if (sc->controller_loss_timeout != 0)
+		taskqueue_enqueue_timeout(nvmf_tq,
+		    &sc->controller_loss_task, sc->controller_loss_timeout *
+		    hz);
+
+	sx_xunlock(&sc->connection_lock);
+}
+
+/* Timeout task: delete the device once the controller loss timeout fires. */
+static void
+nvmf_controller_loss_task(void *arg, int pending __unused)
+{
+	struct nvmf_softc *sc = arg;
+	device_t dev;
+	int error;
+
+	bus_topo_lock();
+	sx_xlock(&sc->connection_lock);
+	if (sc->admin != NULL || sc->detaching) {
+		/* Reconnected or already detaching. */
+		sx_xunlock(&sc->connection_lock);
+		bus_topo_unlock();
+		return;
+	}
+	sc->controller_timedout = true;
+	sx_xunlock(&sc->connection_lock);
+
+	/*
+	 * XXX: Doing this from here is a bit ugly.  We don't have an
+	 * extra reference on `dev` but bus_topo_lock should block any
+	 * concurrent device_delete_child invocations.
+	 */
+	dev = sc->dev;
+	error = device_delete_child(root_bus, dev);
+	if (error != 0)
+		device_printf(dev,
+		    "failed to detach after controller loss: %d\n", error);
+	bus_topo_unlock();
+}
+
+/* Emit a devctl RECONNECT event for this host and arm the next reminder. */
+static void
+nvmf_request_reconnect(struct nvmf_softc *sc)
+{
+	char ev[64];
+
+	sx_assert(&sc->connection_lock, SX_LOCKED);
+	snprintf(ev, sizeof(ev), "name=\"%s\"", device_get_nameunit(sc->dev));
+	devctl_notify("nvme", "controller", "RECONNECT", ev);
+	taskqueue_enqueue_timeout(nvmf_tq, &sc->request_reconnect_task,
+	    sc->reconnect_delay * hz);
+}
+
+/*
+ * Timeout task armed by nvmf_request_reconnect(): repeat the RECONNECT
+ * request unless the host has reconnected, is detaching, or the
+ * controller loss timeout has already fired.
+ */
+static void
+nvmf_request_reconnect_task(void *arg, int pending __unused)
+{
+	struct nvmf_softc *sc = arg;
+
+	sx_xlock(&sc->connection_lock);
+	if (sc->admin == NULL && !sc->detaching && !sc->controller_timedout)
+		nvmf_request_reconnect(sc);
 	sx_xunlock(&sc->connection_lock);
 }
 
@@ -699,7 +780,7 @@ nvmf_reconnect_host(struct nvmf_softc *sc, struct nvmf_ioc_nv *nv)
 	}
 
 	sx_xlock(&sc->connection_lock);
-	if (sc->admin != NULL || sc->detaching) {
+	if (sc->admin != NULL || sc->detaching || sc->controller_timedout) {
 		error = EBUSY;
 		goto out;
 	}
@@ -745,6 +826,9 @@ nvmf_reconnect_host(struct nvmf_softc *sc, struct nvmf_ioc_nv *nv)
 	nvmf_reconnect_sim(sc);
 
 	nvmf_rescan_all_ns(sc);
+
+	taskqueue_cancel_timeout(nvmf_tq, &sc->request_reconnect_task, NULL);
+	taskqueue_cancel_timeout(nvmf_tq, &sc->controller_loss_task, NULL);
 out:
 	sx_xunlock(&sc->connection_lock);
 	nvlist_destroy(nvl);
@@ -852,7 +936,21 @@ nvmf_detach(device_t dev)
 	}
 	free(sc->io, M_NVMF);
 
-	taskqueue_drain(taskqueue_thread, &sc->disconnect_task);
+	taskqueue_drain(nvmf_tq, &sc->disconnect_task);
+	if (taskqueue_cancel_timeout(nvmf_tq, &sc->request_reconnect_task,
+	    NULL) != 0)
+		taskqueue_drain_timeout(nvmf_tq, &sc->request_reconnect_task);
+
+	/*
+	 * Don't cancel/drain the controller loss task if that task
+	 * has fired and is triggering the detach.
+	 */
+	if (!sc->controller_timedout) {
+		if (taskqueue_cancel_timeout(nvmf_tq, &sc->controller_loss_task,
+		    NULL) != 0)
+			taskqueue_drain_timeout(nvmf_tq,
+			    &sc->controller_loss_task);
+	}
 
 	if (sc->admin != NULL)
 		nvmf_destroy_qp(sc->admin);
@@ -1154,14 +1252,25 @@ static struct cdevsw nvmf_cdevsw = {
 static int
 nvmf_modevent(module_t mod, int what, void *arg)
 {
+	int error;
+
 	switch (what) {
 	case MOD_LOAD:
-		return (nvmf_ctl_load());
+		error = nvmf_ctl_load();
+		if (error != 0)
+			return (error);
+
+		nvmf_tq = taskqueue_create("nvmf", M_WAITOK | M_ZERO,
+		    taskqueue_thread_enqueue, &nvmf_tq);
+		taskqueue_start_threads(&nvmf_tq, 1, PWAIT, "nvmf taskq");
+		return (0);
 	case MOD_QUIESCE:
 		return (0);
 	case MOD_UNLOAD:
 		nvmf_ctl_unload();
 		destroy_dev_drain(&nvmf_cdevsw);
+		if (nvmf_tq != NULL)
+			taskqueue_free(nvmf_tq);
 		return (0);
 	default:
 		return (EOPNOTSUPP);
diff --git a/sys/dev/nvmf/host/nvmf_var.h b/sys/dev/nvmf/host/nvmf_var.h
index e45a31f413a4..606245b3969c 100644
--- a/sys/dev/nvmf/host/nvmf_var.h
+++ b/sys/dev/nvmf/host/nvmf_var.h
@@ -75,9 +75,15 @@ struct nvmf_softc {
 	struct callout ka_rx_timer;
 	sbintime_t ka_rx_sbt;
 
+	struct timeout_task request_reconnect_task;
+	struct timeout_task controller_loss_task;
+	uint32_t reconnect_delay;
+	uint32_t controller_loss_timeout;
+
 	struct sx connection_lock;
 	struct task disconnect_task;
 	bool detaching;
+	bool controller_timedout;
 
 	u_int num_aer;
 	struct nvmf_aer *aer;
diff --git a/sys/dev/nvmf/nvmf.h b/sys/dev/nvmf/nvmf.h
index d4e7b1511e9d..9b2b4c1dea40 100644
--- a/sys/dev/nvmf/nvmf.h
+++ b/sys/dev/nvmf/nvmf.h
@@ -27,6 +27,13 @@
 #define	NVMF_NN			(1024)
 
 /*
+ * Default timeouts (in seconds) for Fabrics hosts.  These match values
+ * used by Linux.
+ */
+#define	NVMF_DEFAULT_RECONNECT_DELAY	10
+#define	NVMF_DEFAULT_CONTROLLER_LOSS	600
+
+/*
  * (data, size) is the userspace buffer for a packed nvlist.
  *
  * For requests that copyout an nvlist, len is the amount of data
@@ -68,6 +75,8 @@ struct nvmf_ioc_nv {
  *
  * number			trtype
  * number			kato	(optional)
+ * number			reconnect_delay	(optional)
+ * number			controller_loss_timeout	(optional)
  * qpair handoff nvlist		admin
  * qpair handoff nvlist array	io
  * binary			cdata	struct nvme_controller_data
@@ -81,6 +90,8 @@ struct nvmf_ioc_nv {
  * string			hostnqn
  * number			num_io_queues
  * number			kato	(optional)
+ * number			reconnect_delay	(optional)
+ * number			controller_loss_timeout	(optional)
  * number			io_qsize
  * bool				sq_flow_control
  *