aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAlexander Motin <mav@FreeBSD.org>2024-04-09 23:23:19 +0000
committerGitHub <noreply@github.com>2024-04-09 23:23:19 +0000
commit997f85b4d3123286a584bbd3aaac3077a8067abb (patch)
treeb3a030b82d45d080230cd3444248b9843137a204
parent9e63631dea553fb81fe10710e626fae26ff5c14f (diff)
downloadsrc-997f85b4d3123286a584bbd3aaac3077a8067abb.tar.gz
src-997f85b4d3123286a584bbd3aaac3077a8067abb.zip
-rw-r--r--include/sys/multilist.h5
-rw-r--r--module/zfs/arc.c179
-rw-r--r--module/zfs/dbuf.c2
-rw-r--r--module/zfs/dmu_objset.c10
-rw-r--r--module/zfs/metaslab.c8
-rw-r--r--module/zfs/multilist.c26
6 files changed, 131 insertions, 99 deletions
diff --git a/include/sys/multilist.h b/include/sys/multilist.h
index 26f37c37ab38..e7de86f2379b 100644
--- a/include/sys/multilist.h
+++ b/include/sys/multilist.h
@@ -82,12 +82,15 @@ int multilist_is_empty(multilist_t *);
unsigned int multilist_get_num_sublists(multilist_t *);
unsigned int multilist_get_random_index(multilist_t *);
-multilist_sublist_t *multilist_sublist_lock(multilist_t *, unsigned int);
+void multilist_sublist_lock(multilist_sublist_t *);
+multilist_sublist_t *multilist_sublist_lock_idx(multilist_t *, unsigned int);
multilist_sublist_t *multilist_sublist_lock_obj(multilist_t *, void *);
void multilist_sublist_unlock(multilist_sublist_t *);
void multilist_sublist_insert_head(multilist_sublist_t *, void *);
void multilist_sublist_insert_tail(multilist_sublist_t *, void *);
+void multilist_sublist_insert_after(multilist_sublist_t *, void *, void *);
+void multilist_sublist_insert_before(multilist_sublist_t *, void *, void *);
void multilist_sublist_move_forward(multilist_sublist_t *mls, void *obj);
void multilist_sublist_remove(multilist_sublist_t *, void *);
int multilist_sublist_is_empty(multilist_sublist_t *);
diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index b1bcac6c44bc..16c95db10f47 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -3872,7 +3872,7 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
ASSERT3P(marker, !=, NULL);
- mls = multilist_sublist_lock(ml, idx);
+ mls = multilist_sublist_lock_idx(ml, idx);
for (hdr = multilist_sublist_prev(mls, marker); likely(hdr != NULL);
hdr = multilist_sublist_prev(mls, marker)) {
@@ -3984,6 +3984,26 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
return (bytes_evicted);
}
+static arc_buf_hdr_t *
+arc_state_alloc_marker(void)
+{
+ arc_buf_hdr_t *marker = kmem_cache_alloc(hdr_full_cache, KM_SLEEP);
+
+ /*
+ * A b_spa of 0 is used to indicate that this header is
+ * a marker. This fact is used in arc_evict_state_impl().
+ */
+ marker->b_spa = 0;
+
+ return (marker);
+}
+
+static void
+arc_state_free_marker(arc_buf_hdr_t *marker)
+{
+ kmem_cache_free(hdr_full_cache, marker);
+}
+
/*
* Allocate an array of buffer headers used as placeholders during arc state
* eviction.
@@ -3994,16 +4014,8 @@ arc_state_alloc_markers(int count)
arc_buf_hdr_t **markers;
markers = kmem_zalloc(sizeof (*markers) * count, KM_SLEEP);
- for (int i = 0; i < count; i++) {
- markers[i] = kmem_cache_alloc(hdr_full_cache, KM_SLEEP);
-
- /*
- * A b_spa of 0 is used to indicate that this header is
- * a marker. This fact is used in arc_evict_state_impl().
- */
- markers[i]->b_spa = 0;
-
- }
+ for (int i = 0; i < count; i++)
+ markers[i] = arc_state_alloc_marker();
return (markers);
}
@@ -4011,7 +4023,7 @@ static void
arc_state_free_markers(arc_buf_hdr_t **markers, int count)
{
for (int i = 0; i < count; i++)
- kmem_cache_free(hdr_full_cache, markers[i]);
+ arc_state_free_marker(markers[i]);
kmem_free(markers, sizeof (*markers) * count);
}
@@ -4055,7 +4067,7 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa,
for (int i = 0; i < num_sublists; i++) {
multilist_sublist_t *mls;
- mls = multilist_sublist_lock(ml, i);
+ mls = multilist_sublist_lock_idx(ml, i);
multilist_sublist_insert_tail(mls, markers[i]);
multilist_sublist_unlock(mls);
}
@@ -4120,7 +4132,7 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa,
}
for (int i = 0; i < num_sublists; i++) {
- multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
+ multilist_sublist_t *mls = multilist_sublist_lock_idx(ml, i);
multilist_sublist_remove(mls, markers[i]);
multilist_sublist_unlock(mls);
}
@@ -8633,7 +8645,7 @@ l2arc_sublist_lock(int list_num)
* sublists being selected.
*/
idx = multilist_get_random_index(ml);
- return (multilist_sublist_lock(ml, idx));
+ return (multilist_sublist_lock_idx(ml, idx));
}
/*
@@ -9046,9 +9058,9 @@ l2arc_blk_fetch_done(zio_t *zio)
static uint64_t
l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
{
- arc_buf_hdr_t *hdr, *hdr_prev, *head;
- uint64_t write_asize, write_psize, write_lsize, headroom;
- boolean_t full;
+ arc_buf_hdr_t *hdr, *head, *marker;
+ uint64_t write_asize, write_psize, headroom;
+ boolean_t full, from_head = !arc_warm;
l2arc_write_callback_t *cb = NULL;
zio_t *pio, *wzio;
uint64_t guid = spa_load_guid(spa);
@@ -9057,10 +9069,11 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
ASSERT3P(dev->l2ad_vdev, !=, NULL);
pio = NULL;
- write_lsize = write_asize = write_psize = 0;
+ write_asize = write_psize = 0;
full = B_FALSE;
head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE);
arc_hdr_set_flags(head, ARC_FLAG_L2_WRITE_HEAD | ARC_FLAG_HAS_L2HDR);
+ marker = arc_state_alloc_marker();
/*
* Copy buffers for L2ARC writing.
@@ -9075,40 +9088,34 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
continue;
}
- multilist_sublist_t *mls = l2arc_sublist_lock(pass);
uint64_t passed_sz = 0;
-
- VERIFY3P(mls, !=, NULL);
+ headroom = target_sz * l2arc_headroom;
+ if (zfs_compressed_arc_enabled)
+ headroom = (headroom * l2arc_headroom_boost) / 100;
/*
- * L2ARC fast warmup.
- *
* Until the ARC is warm and starts to evict, read from the
* head of the ARC lists rather than the tail.
*/
- if (arc_warm == B_FALSE)
+ multilist_sublist_t *mls = l2arc_sublist_lock(pass);
+ ASSERT3P(mls, !=, NULL);
+ if (from_head)
hdr = multilist_sublist_head(mls);
else
hdr = multilist_sublist_tail(mls);
- headroom = target_sz * l2arc_headroom;
- if (zfs_compressed_arc_enabled)
- headroom = (headroom * l2arc_headroom_boost) / 100;
-
- for (; hdr; hdr = hdr_prev) {
+ while (hdr != NULL) {
kmutex_t *hash_lock;
abd_t *to_write = NULL;
- if (arc_warm == B_FALSE)
- hdr_prev = multilist_sublist_next(mls, hdr);
- else
- hdr_prev = multilist_sublist_prev(mls, hdr);
-
hash_lock = HDR_LOCK(hdr);
if (!mutex_tryenter(hash_lock)) {
- /*
- * Skip this buffer rather than waiting.
- */
+skip:
+ /* Skip this buffer rather than waiting. */
+ if (from_head)
+ hdr = multilist_sublist_next(mls, hdr);
+ else
+ hdr = multilist_sublist_prev(mls, hdr);
continue;
}
@@ -9123,11 +9130,10 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
if (!l2arc_write_eligible(guid, hdr)) {
mutex_exit(hash_lock);
- continue;
+ goto skip;
}
ASSERT(HDR_HAS_L1HDR(hdr));
-
ASSERT3U(HDR_GET_PSIZE(hdr), >, 0);
ASSERT3U(arc_hdr_size(hdr), >, 0);
ASSERT(hdr->b_l1hdr.b_pabd != NULL ||
@@ -9149,12 +9155,18 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
}
/*
- * We rely on the L1 portion of the header below, so
- * it's invalid for this header to have been evicted out
- * of the ghost cache, prior to being written out. The
- * ARC_FLAG_L2_WRITING bit ensures this won't happen.
+ * We should not sleep with sublist lock held or it
+ * may block ARC eviction. Insert a marker to save
+ * the position and drop the lock.
*/
- arc_hdr_set_flags(hdr, ARC_FLAG_L2_WRITING);
+ if (from_head) {
+ multilist_sublist_insert_after(mls, hdr,
+ marker);
+ } else {
+ multilist_sublist_insert_before(mls, hdr,
+ marker);
+ }
+ multilist_sublist_unlock(mls);
/*
* If this header has b_rabd, we can use this since it
@@ -9185,32 +9197,45 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
&to_write);
if (ret != 0) {
arc_hdr_clear_flags(hdr,
- ARC_FLAG_L2_WRITING);
+ ARC_FLAG_L2CACHE);
mutex_exit(hash_lock);
- continue;
+ goto next;
}
l2arc_free_abd_on_write(to_write, asize, type);
}
+ hdr->b_l2hdr.b_dev = dev;
+ hdr->b_l2hdr.b_daddr = dev->l2ad_hand;
+ hdr->b_l2hdr.b_hits = 0;
+ hdr->b_l2hdr.b_arcs_state =
+ hdr->b_l1hdr.b_state->arcs_state;
+ mutex_enter(&dev->l2ad_mtx);
if (pio == NULL) {
/*
* Insert a dummy header on the buflist so
* l2arc_write_done() can find where the
* write buffers begin without searching.
*/
- mutex_enter(&dev->l2ad_mtx);
list_insert_head(&dev->l2ad_buflist, head);
- mutex_exit(&dev->l2ad_mtx);
+ }
+ list_insert_head(&dev->l2ad_buflist, hdr);
+ mutex_exit(&dev->l2ad_mtx);
+ arc_hdr_set_flags(hdr, ARC_FLAG_HAS_L2HDR |
+ ARC_FLAG_L2_WRITING);
+
+ (void) zfs_refcount_add_many(&dev->l2ad_alloc,
+ arc_hdr_size(hdr), hdr);
+ l2arc_hdr_arcstats_increment(hdr);
+ boolean_t commit = l2arc_log_blk_insert(dev, hdr);
+ mutex_exit(hash_lock);
+
+ if (pio == NULL) {
cb = kmem_alloc(
sizeof (l2arc_write_callback_t), KM_SLEEP);
cb->l2wcb_dev = dev;
cb->l2wcb_head = head;
- /*
- * Create a list to save allocated abd buffers
- * for l2arc_log_blk_commit().
- */
list_create(&cb->l2wcb_abd_list,
sizeof (l2arc_lb_abd_buf_t),
offsetof(l2arc_lb_abd_buf_t, node));
@@ -9218,54 +9243,34 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
ZIO_FLAG_CANFAIL);
}
- hdr->b_l2hdr.b_dev = dev;
- hdr->b_l2hdr.b_hits = 0;
-
- hdr->b_l2hdr.b_daddr = dev->l2ad_hand;
- hdr->b_l2hdr.b_arcs_state =
- hdr->b_l1hdr.b_state->arcs_state;
- arc_hdr_set_flags(hdr, ARC_FLAG_HAS_L2HDR);
-
- mutex_enter(&dev->l2ad_mtx);
- list_insert_head(&dev->l2ad_buflist, hdr);
- mutex_exit(&dev->l2ad_mtx);
-
- (void) zfs_refcount_add_many(&dev->l2ad_alloc,
- arc_hdr_size(hdr), hdr);
-
wzio = zio_write_phys(pio, dev->l2ad_vdev,
- hdr->b_l2hdr.b_daddr, asize, to_write,
+ dev->l2ad_hand, asize, to_write,
ZIO_CHECKSUM_OFF, NULL, hdr,
ZIO_PRIORITY_ASYNC_WRITE,
ZIO_FLAG_CANFAIL, B_FALSE);
- write_lsize += HDR_GET_LSIZE(hdr);
DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
zio_t *, wzio);
+ zio_nowait(wzio);
write_psize += psize;
write_asize += asize;
dev->l2ad_hand += asize;
- l2arc_hdr_arcstats_increment(hdr);
vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
- mutex_exit(hash_lock);
-
- /*
- * Append buf info to current log and commit if full.
- * arcstat_l2_{size,asize} kstats are updated
- * internally.
- */
- if (l2arc_log_blk_insert(dev, hdr)) {
- /*
- * l2ad_hand will be adjusted in
- * l2arc_log_blk_commit().
- */
+ if (commit) {
+ /* l2ad_hand will be adjusted inside. */
write_asize +=
l2arc_log_blk_commit(dev, pio, cb);
}
- zio_nowait(wzio);
+next:
+ multilist_sublist_lock(mls);
+ if (from_head)
+ hdr = multilist_sublist_next(mls, marker);
+ else
+ hdr = multilist_sublist_prev(mls, marker);
+ multilist_sublist_remove(mls, marker);
}
multilist_sublist_unlock(mls);
@@ -9274,9 +9279,11 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
break;
}
+ arc_state_free_marker(marker);
+
/* No buffers selected for writing? */
if (pio == NULL) {
- ASSERT0(write_lsize);
+ ASSERT0(write_psize);
ASSERT(!HDR_HAS_L1HDR(head));
kmem_cache_free(hdr_l2only_cache, head);
@@ -10604,7 +10611,7 @@ l2arc_log_blk_insert(l2arc_dev_t *dev, const arc_buf_hdr_t *hdr)
L2BLK_SET_TYPE((le)->le_prop, hdr->b_type);
L2BLK_SET_PROTECTED((le)->le_prop, !!(HDR_PROTECTED(hdr)));
L2BLK_SET_PREFETCH((le)->le_prop, !!(HDR_PREFETCH(hdr)));
- L2BLK_SET_STATE((le)->le_prop, hdr->b_l1hdr.b_state->arcs_state);
+ L2BLK_SET_STATE((le)->le_prop, hdr->b_l2hdr.b_arcs_state);
dev->l2ad_log_blk_payload_asize += vdev_psize_to_asize(dev->l2ad_vdev,
HDR_GET_PSIZE(hdr));
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c
index d9fc6cf6af34..5f3643f573f7 100644
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -769,7 +769,7 @@ static void
dbuf_evict_one(void)
{
int idx = multilist_get_random_index(&dbuf_caches[DB_DBUF_CACHE].cache);
- multilist_sublist_t *mls = multilist_sublist_lock(
+ multilist_sublist_t *mls = multilist_sublist_lock_idx(
&dbuf_caches[DB_DBUF_CACHE].cache, idx);
ASSERT(!MUTEX_HELD(&dbuf_evict_lock));
diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c
index f098e1daa44b..2ba26f68e398 100644
--- a/module/zfs/dmu_objset.c
+++ b/module/zfs/dmu_objset.c
@@ -1665,7 +1665,7 @@ sync_dnodes_task(void *arg)
objset_t *os = soa->soa_os;
multilist_sublist_t *ms =
- multilist_sublist_lock(sda->sda_list, sda->sda_sublist_idx);
+ multilist_sublist_lock_idx(sda->sda_list, sda->sda_sublist_idx);
dmu_objset_sync_dnodes(ms, soa->soa_tx);
@@ -2076,8 +2076,8 @@ userquota_updates_task(void *arg)
dnode_t *dn;
userquota_cache_t cache = { { 0 } };
- multilist_sublist_t *list =
- multilist_sublist_lock(&os->os_synced_dnodes, uua->uua_sublist_idx);
+ multilist_sublist_t *list = multilist_sublist_lock_idx(
+ &os->os_synced_dnodes, uua->uua_sublist_idx);
ASSERT(multilist_sublist_head(list) == NULL ||
dmu_objset_userused_enabled(os));
@@ -2159,8 +2159,8 @@ dnode_rele_task(void *arg)
userquota_updates_arg_t *uua = arg;
objset_t *os = uua->uua_os;
- multilist_sublist_t *list =
- multilist_sublist_lock(&os->os_synced_dnodes, uua->uua_sublist_idx);
+ multilist_sublist_t *list = multilist_sublist_lock_idx(
+ &os->os_synced_dnodes, uua->uua_sublist_idx);
dnode_t *dn;
while ((dn = multilist_sublist_head(list)) != NULL) {
diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c
index c4aa98ced433..9e762357b727 100644
--- a/module/zfs/metaslab.c
+++ b/module/zfs/metaslab.c
@@ -639,7 +639,7 @@ metaslab_class_evict_old(metaslab_class_t *mc, uint64_t txg)
{
multilist_t *ml = &mc->mc_metaslab_txg_list;
for (int i = 0; i < multilist_get_num_sublists(ml); i++) {
- multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
+ multilist_sublist_t *mls = multilist_sublist_lock_idx(ml, i);
metaslab_t *msp = multilist_sublist_head(mls);
multilist_sublist_unlock(mls);
while (msp != NULL) {
@@ -656,7 +656,7 @@ metaslab_class_evict_old(metaslab_class_t *mc, uint64_t txg)
i--;
break;
}
- mls = multilist_sublist_lock(ml, i);
+ mls = multilist_sublist_lock_idx(ml, i);
metaslab_t *next_msp = multilist_sublist_next(mls, msp);
multilist_sublist_unlock(mls);
if (txg >
@@ -2232,12 +2232,12 @@ metaslab_potentially_evict(metaslab_class_t *mc)
unsigned int idx = multilist_get_random_index(
&mc->mc_metaslab_txg_list);
multilist_sublist_t *mls =
- multilist_sublist_lock(&mc->mc_metaslab_txg_list, idx);
+ multilist_sublist_lock_idx(&mc->mc_metaslab_txg_list, idx);
metaslab_t *msp = multilist_sublist_head(mls);
multilist_sublist_unlock(mls);
while (msp != NULL && allmem * zfs_metaslab_mem_limit / 100 <
inuse * size) {
- VERIFY3P(mls, ==, multilist_sublist_lock(
+ VERIFY3P(mls, ==, multilist_sublist_lock_idx(
&mc->mc_metaslab_txg_list, idx));
ASSERT3U(idx, ==,
metaslab_idx_func(&mc->mc_metaslab_txg_list, msp));
diff --git a/module/zfs/multilist.c b/module/zfs/multilist.c
index b1cdf1c5c5f4..3d3ef86e6839 100644
--- a/module/zfs/multilist.c
+++ b/module/zfs/multilist.c
@@ -277,9 +277,15 @@ multilist_get_random_index(multilist_t *ml)
return (random_in_range(ml->ml_num_sublists));
}
+void
+multilist_sublist_lock(multilist_sublist_t *mls)
+{
+ mutex_enter(&mls->mls_lock);
+}
+
/* Lock and return the sublist specified at the given index */
multilist_sublist_t *
-multilist_sublist_lock(multilist_t *ml, unsigned int sublist_idx)
+multilist_sublist_lock_idx(multilist_t *ml, unsigned int sublist_idx)
{
multilist_sublist_t *mls;
@@ -294,7 +300,7 @@ multilist_sublist_lock(multilist_t *ml, unsigned int sublist_idx)
multilist_sublist_t *
multilist_sublist_lock_obj(multilist_t *ml, void *obj)
{
- return (multilist_sublist_lock(ml, ml->ml_index_func(ml, obj)));
+ return (multilist_sublist_lock_idx(ml, ml->ml_index_func(ml, obj)));
}
void
@@ -327,6 +333,22 @@ multilist_sublist_insert_tail(multilist_sublist_t *mls, void *obj)
list_insert_tail(&mls->mls_list, obj);
}
+/* please see comment above multilist_sublist_insert_head */
+void
+multilist_sublist_insert_after(multilist_sublist_t *mls, void *prev, void *obj)
+{
+ ASSERT(MUTEX_HELD(&mls->mls_lock));
+ list_insert_after(&mls->mls_list, prev, obj);
+}
+
+/* please see comment above multilist_sublist_insert_head */
+void
+multilist_sublist_insert_before(multilist_sublist_t *mls, void *next, void *obj)
+{
+ ASSERT(MUTEX_HELD(&mls->mls_lock));
+ list_insert_before(&mls->mls_list, next, obj);
+}
+
/*
* Move the object one element forward in the list.
*