src - FreeBSD source tree

diff options


context:
space:
mode:

author	Ameer Hamza <ahamza@ixsystems.com>	2024-04-04 01:21:25 +0000
committer	GitHub <noreply@github.com>	2024-04-04 01:21:25 +0000
commit	99741bde59d1d1df0963009bb624ddc105f7d8dc (patch)
tree	f6efab10e93f61acb21273c007eec34c4a8a22db
parent	30c4eba4eaaef92cbc1b8d2eed6689600a342386 (diff)
download	src-99741bde59d1d1df0963009bb624ddc105f7d8dc.tar.gz src-99741bde59d1d1df0963009bb624ddc105f7d8dc.zip

zvol: use multiple taskq

Currently, zvol uses a single taskq, resulting in throughput bottleneck under heavy load due to lock contention on the single taskq. This patch addresses the performance bottleneck under heavy load conditions by utilizing multiple taskqs, thus mitigating lock contention. The number of taskqs scale dynamically based on the available CPUs in the system, as illustrated below: taskq total cpus taskqs threads threads ------- ------- ------- ------- 1 1 32 32 2 1 32 32 4 1 32 32 8 2 16 32 16 3 11 33 32 5 7 35 64 8 8 64 128 11 12 132 256 16 16 256 Reviewed-by: Alexander Motin <mav@FreeBSD.org> Reviewed-by: Tony Hutter <hutter2@llnl.gov> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: Tony Nguyen <tony.nguyen@delphix.com> Signed-off-by: Ameer Hamza <ahamza@ixsystems.com> Closes #15992

Diffstat

-rw-r--r--

man/man4/zfs.4

-rw-r--r--

module/os/linux/zfs/zvol_os.c

102

2 files changed, 99 insertions, 10 deletions

diff --git a/man/man4/zfs.4 b/man/man4/zfs.4
index cacb214d1dc1..c8e90968ab34 100644
--- a/man/man4/zfs.4
+++ b/man/man4/zfs.4

@@ -2387,6 +2387,13 @@ The number of requests which can be handled concurrently is controlled by

is ignored when running on a kernel that supports block multiqueue

.Pq Li blk-mq .

+.It Sy zvol_num_taskqs Ns = Ns Sy 0 Pq uint

+Number of zvol taskqs.

+If

+.Sy 0

+(the default) then scaling is done internally to prefer 6 threads per taskq.

+This only applies on Linux.

.It Sy zvol_threads Ns = Ns Sy 0 Pq uint

The number of system wide threads to use for processing zvol block IOs.

diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c
index d815cb2ad2c4..107776932493 100644
--- a/module/os/linux/zfs/zvol_os.c
+++ b/module/os/linux/zfs/zvol_os.c

@@ -37,6 +37,7 @@

#include <sys/spa_impl.h>

#include <sys/zvol.h>

#include <sys/zvol_impl.h>

+#include <cityhash.h>

#include <linux/blkdev_compat.h>

#include <linux/task_io_accounting_ops.h>

@@ -53,6 +54,12 @@ static unsigned int zvol_request_sync = 0;

static unsigned int zvol_prefetch_bytes = (128 * 1024);

static unsigned long zvol_max_discard_blocks = 16384;

+/*

+ * Switch taskq at multiple of 512 MB offset. This can be set to a lower value

+ * to utilize more threads for small files but may affect prefetch hits.

+ */

+#define ZVOL_TASKQ_OFFSET_SHIFT 29

#ifndef HAVE_BLKDEV_GET_ERESTARTSYS

static unsigned int zvol_open_timeout_ms = 1000;

#endif

@@ -74,6 +81,7 @@ static boolean_t zvol_use_blk_mq = B_FALSE;

* read and write tests to a zvol in an NVMe pool (with 16 CPUs).

static unsigned int zvol_blk_mq_blocks_per_thread = 8;

+static unsigned int zvol_num_taskqs = 0;

#endif

#ifndef BLKDEV_DEFAULT_RQ

@@ -114,7 +122,11 @@ struct zvol_state_os {

boolean_t use_blk_mq;

};

-static taskq_t *zvol_taskq;

+typedef struct zv_taskq {

+ uint_t tqs_cnt;

+ taskq_t **tqs_taskq;

+} zv_taskq_t;

+static zv_taskq_t zvol_taskqs;

static struct ida zvol_ida;

typedef struct zv_request_stack {

@@ -532,6 +544,17 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,

}

zv_request_task_t *task;

+ zv_taskq_t *ztqs = &zvol_taskqs;

+ uint_t blk_mq_hw_queue = 0;

+ uint_t tq_idx;

+ uint_t taskq_hash;

+#ifdef HAVE_BLK_MQ

+ if (rq)

+ blk_mq_hw_queue = rq->mq_hctx->queue_num;

+#endif

+ taskq_hash = cityhash4((uintptr_t)zv, offset >> ZVOL_TASKQ_OFFSET_SHIFT,

+ blk_mq_hw_queue, 0);

+ tq_idx = taskq_hash % ztqs->tqs_cnt;

if (rw == WRITE) {

if (unlikely(zv->zv_flags & ZVOL_RDONLY)) {

@@ -601,7 +624,7 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,

zvol_discard(&zvr);

} else {

task = zv_request_task_create(zvr);

- taskq_dispatch_ent(zvol_taskq,

+ taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx],

zvol_discard_task, task, 0, &task->ent);

}

} else {

@@ -609,7 +632,7 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,

zvol_write(&zvr);

} else {

task = zv_request_task_create(zvr);

- taskq_dispatch_ent(zvol_taskq,

+ taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx],

zvol_write_task, task, 0, &task->ent);

}

@@ -631,7 +654,7 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,

zvol_read(&zvr);

} else {

task = zv_request_task_create(zvr);

- taskq_dispatch_ent(zvol_taskq,

+ taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx],

zvol_read_task, task, 0, &task->ent);

}

@@ -1598,8 +1621,40 @@ zvol_init(void)

zvol_actual_threads = MIN(MAX(zvol_threads, 1), 1024);

}

+ /*

+ * Use atleast 32 zvol_threads but for many core system,

+ * prefer 6 threads per taskq, but no more taskqs

+ * than threads in them on large systems.

+ *

+ * taskq total

+ * cpus taskqs threads threads

+ * ------- ------- ------- -------

+ * 1 1 32 32

+ * 2 1 32 32

+ * 4 1 32 32

+ * 8 2 16 32

+ * 16 3 11 33

+ * 32 5 7 35

+ * 64 8 8 64

+ * 128 11 12 132

+ * 256 16 16 256

+ */

+ zv_taskq_t *ztqs = &zvol_taskqs;

+ uint_t num_tqs = MIN(num_online_cpus(), zvol_num_taskqs);

+ if (num_tqs == 0) {

+ num_tqs = 1 + num_online_cpus() / 6;

+ while (num_tqs * num_tqs > zvol_actual_threads)

+ num_tqs--;

+ }

+ uint_t per_tq_thread = zvol_actual_threads / num_tqs;

+ if (per_tq_thread * num_tqs < zvol_actual_threads)

+ per_tq_thread++;

+ ztqs->tqs_cnt = num_tqs;

+ ztqs->tqs_taskq = kmem_alloc(num_tqs * sizeof (taskq_t *), KM_SLEEP);

error = register_blkdev(zvol_major, ZVOL_DRIVER);

if (error) {

+ kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt * sizeof (taskq_t *));

+ ztqs->tqs_taskq = NULL;

printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error);

return (error);

}

@@ -1619,11 +1674,22 @@ zvol_init(void)

1024);

}

#endif

- zvol_taskq = taskq_create(ZVOL_DRIVER, zvol_actual_threads, maxclsyspri,

- zvol_actual_threads, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);

- if (zvol_taskq == NULL) {

- unregister_blkdev(zvol_major, ZVOL_DRIVER);

- return (-ENOMEM);

+ for (uint_t i = 0; i < num_tqs; i++) {

+ char name[32];

+ (void) snprintf(name, sizeof (name), "%s_tq-%u",

+ ZVOL_DRIVER, i);

+ ztqs->tqs_taskq[i] = taskq_create(name, per_tq_thread,

+ maxclsyspri, per_tq_thread, INT_MAX,

+ TASKQ_PREPOPULATE | TASKQ_DYNAMIC);

+ if (ztqs->tqs_taskq[i] == NULL) {

+ for (int j = i - 1; j >= 0; j--)

+ taskq_destroy(ztqs->tqs_taskq[j]);

+ unregister_blkdev(zvol_major, ZVOL_DRIVER);

+ kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt *

+ sizeof (taskq_t *));

+ ztqs->tqs_taskq = NULL;

+ return (-ENOMEM);

+ }

}

zvol_init_impl();

@@ -1634,9 +1700,22 @@ zvol_init(void)

void

zvol_fini(void)

{

+ zv_taskq_t *ztqs = &zvol_taskqs;

zvol_fini_impl();

unregister_blkdev(zvol_major, ZVOL_DRIVER);

- taskq_destroy(zvol_taskq);

+ if (ztqs->tqs_taskq == NULL) {

+ ASSERT3U(ztqs->tqs_cnt, ==, 0);

+ } else {

+ for (uint_t i = 0; i < ztqs->tqs_cnt; i++) {

+ ASSERT3P(ztqs->tqs_taskq[i], !=, NULL);

+ taskq_destroy(ztqs->tqs_taskq[i]);

+ }

+ kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt *

+ sizeof (taskq_t *));

+ ztqs->tqs_taskq = NULL;

+ }

ida_destroy(&zvol_ida);

}

@@ -1657,6 +1736,9 @@ MODULE_PARM_DESC(zvol_request_sync, "Synchronously handle bio requests");

module_param(zvol_max_discard_blocks, ulong, 0444);

MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard");

+module_param(zvol_num_taskqs, uint, 0444);

+MODULE_PARM_DESC(zvol_num_taskqs, "Number of zvol taskqs");

module_param(zvol_prefetch_bytes, uint, 0644);

MODULE_PARM_DESC(zvol_prefetch_bytes, "Prefetch N bytes at zvol start+end");