diff options
Diffstat (limited to 'sys/contrib/openzfs/module/zfs/zil.c')
-rw-r--r-- | sys/contrib/openzfs/module/zfs/zil.c | 352 |
1 files changed, 233 insertions, 119 deletions
diff --git a/sys/contrib/openzfs/module/zfs/zil.c b/sys/contrib/openzfs/module/zfs/zil.c index 5642f082bdb8..9b5d866a8c22 100644 --- a/sys/contrib/openzfs/module/zfs/zil.c +++ b/sys/contrib/openzfs/module/zfs/zil.c @@ -91,15 +91,7 @@ * committed to stable storage. Please refer to the zil_commit_waiter() * function (and the comments within it) for more details. */ -static uint_t zfs_commit_timeout_pct = 5; - -/* - * Minimal time we care to delay commit waiting for more ZIL records. - * At least FreeBSD kernel can't sleep for less than 2us at its best. - * So requests to sleep for less then 5us is a waste of CPU time with - * a risk of significant log latency increase due to oversleep. - */ -static uint64_t zil_min_commit_timeout = 5000; +static uint_t zfs_commit_timeout_pct = 10; /* * See zil.h for more information about these fields. @@ -152,6 +144,7 @@ static kmem_cache_t *zil_zcw_cache; static void zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx); static itx_t *zil_itx_clone(itx_t *oitx); +static uint64_t zil_max_waste_space(zilog_t *zilog); static int zil_bp_compare(const void *x1, const void *x2) @@ -1630,7 +1623,7 @@ zil_lwb_write_done(zio_t *zio) while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) { vdev_t *vd = vdev_lookup_top(spa, zv->zv_vdev); - if (vd != NULL && !vd->vdev_nowritecache) { + if (vd != NULL) { /* * The "ZIO_FLAG_DONT_PROPAGATE" is currently * always used within "zio_flush". This means, @@ -1719,24 +1712,6 @@ zil_lwb_write_open(zilog_t *zilog, lwb_t *lwb) } /* - * Define a limited set of intent log block sizes. - * - * These must be a multiple of 4KB. Note only the amount used (again - * aligned to 4KB) actually gets written. However, we can't always just - * allocate SPA_OLD_MAXBLOCKSIZE as the slog space could be exhausted. - */ -static const struct { - uint64_t limit; - uint64_t blksz; -} zil_block_buckets[] = { - { 4096, 4096 }, /* non TX_WRITE */ - { 8192 + 4096, 8192 + 4096 }, /* database */ - { 32768 + 4096, 32768 + 4096 }, /* NFS writes */ - { 65536 + 4096, 65536 + 4096 }, /* 64KB writes */ - { UINT64_MAX, SPA_OLD_MAXBLOCKSIZE}, /* > 128KB writes */ -}; - -/* * Maximum block size used by the ZIL. This is picked up when the ZIL is * initialized. Otherwise this should not be used directly; see * zl_max_block_size instead. @@ -1744,13 +1719,98 @@ static const struct { static uint_t zil_maxblocksize = SPA_OLD_MAXBLOCKSIZE; /* + * Plan splitting of the provided burst size between several blocks. + */ +static uint_t +zil_lwb_plan(zilog_t *zilog, uint64_t size, uint_t *minsize) +{ + uint_t md = zilog->zl_max_block_size - sizeof (zil_chain_t); + + if (size <= md) { + /* + * Small bursts are written as-is in one block. + */ + *minsize = size; + return (size); + } else if (size > 8 * md) { + /* + * Big bursts use maximum blocks. The first block size + * is hard to predict, but it does not really matter. + */ + *minsize = 0; + return (md); + } + + /* + * Medium bursts try to divide evenly to better utilize several SLOG + * VDEVs. The first block size we predict assuming the worst case of + * maxing out others. Fall back to using maximum blocks if due to + * large records or wasted space we can not predict anything better. + */ + uint_t s = size; + uint_t n = DIV_ROUND_UP(s, md - sizeof (lr_write_t)); + uint_t chunk = DIV_ROUND_UP(s, n); + uint_t waste = zil_max_waste_space(zilog); + waste = MAX(waste, zilog->zl_cur_max); + if (chunk <= md - waste) { + *minsize = MAX(s - (md - waste) * (n - 1), waste); + return (chunk); + } else { + *minsize = 0; + return (md); + } +} + +/* + * Try to predict next block size based on previous history. Make prediction + * sufficient for 7 of 8 previous bursts. Don't try to save if the saving is + * less then 50%, extra writes may cost more, but we don't want single spike + * to badly affect our predictions. + */ +static uint_t +zil_lwb_predict(zilog_t *zilog) +{ + uint_t m, o; + + /* If we are in the middle of a burst, take it into account also. */ + if (zilog->zl_cur_size > 0) { + o = zil_lwb_plan(zilog, zilog->zl_cur_size, &m); + } else { + o = UINT_MAX; + m = 0; + } + + /* Find minimum optimal size. We don't need to go below that. */ + for (int i = 0; i < ZIL_BURSTS; i++) + o = MIN(o, zilog->zl_prev_opt[i]); + + /* Find two biggest minimal first block sizes above the optimal. */ + uint_t m1 = MAX(m, o), m2 = o; + for (int i = 0; i < ZIL_BURSTS; i++) { + m = zilog->zl_prev_min[i]; + if (m >= m1) { + m2 = m1; + m1 = m; + } else if (m > m2) { + m2 = m; + } + } + + /* + * If second minimum size gives 50% saving -- use it. It may cost us + * one additional write later, but the space saving is just too big. + */ + return ((m1 < m2 * 2) ? m1 : m2); +} + +/* * Close the log block for being issued and allocate the next one. * Has to be called under zl_issuer_lock to chain more lwbs. */ static lwb_t * zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb, lwb_state_t state) { - int i; + uint64_t blksz, plan, plan2; ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED); @@ -1765,34 +1825,40 @@ zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb, lwb_state_t state) return (NULL); /* - * Log blocks are pre-allocated. Here we select the size of the next - * block, based on size used in the last block. - * - first find the smallest bucket that will fit the block from a - * limited set of block sizes. This is because it's faster to write - * blocks allocated from the same metaslab as they are adjacent or - * close. - * - next find the maximum from the new suggested size and an array of - * previous sizes. This lessens a picket fence effect of wrongly - * guessing the size if we have a stream of say 2k, 64k, 2k, 64k - * requests. - * - * Note we only write what is used, but we can't just allocate - * the maximum block size because we can exhaust the available - * pool log space. + * Log blocks are pre-allocated. Here we select the size of the next + * block, based on what's left of this burst and the previous history. + * While we try to only write used part of the block, we can't just + * always allocate the maximum block size because we can exhaust all + * available pool log space, so we try to be reasonable. */ - uint64_t zil_blksz = zilog->zl_cur_used + sizeof (zil_chain_t); - for (i = 0; zil_blksz > zil_block_buckets[i].limit; i++) - continue; - zil_blksz = MIN(zil_block_buckets[i].blksz, zilog->zl_max_block_size); - zilog->zl_prev_blks[zilog->zl_prev_rotor] = zil_blksz; - for (i = 0; i < ZIL_PREV_BLKS; i++) - zil_blksz = MAX(zil_blksz, zilog->zl_prev_blks[i]); - DTRACE_PROBE3(zil__block__size, zilog_t *, zilog, - uint64_t, zil_blksz, - uint64_t, zilog->zl_prev_blks[zilog->zl_prev_rotor]); - zilog->zl_prev_rotor = (zilog->zl_prev_rotor + 1) & (ZIL_PREV_BLKS - 1); - - return (zil_alloc_lwb(zilog, zil_blksz, NULL, 0, 0, state)); + if (zilog->zl_cur_left > 0) { + /* + * We are in the middle of a burst and know how much is left. + * But if workload is multi-threaded there may be more soon. + * Try to predict what can it be and plan for the worst case. + */ + uint_t m; + plan = zil_lwb_plan(zilog, zilog->zl_cur_left, &m); + if (zilog->zl_parallel) { + plan2 = zil_lwb_plan(zilog, zilog->zl_cur_left + + zil_lwb_predict(zilog), &m); + if (plan < plan2) + plan = plan2; + } + } else { + /* + * The previous burst is done and we can only predict what + * will come next. + */ + plan = zil_lwb_predict(zilog); + } + blksz = plan + sizeof (zil_chain_t); + blksz = P2ROUNDUP_TYPED(blksz, ZIL_MIN_BLKSZ, uint64_t); + blksz = MIN(blksz, zilog->zl_max_block_size); + DTRACE_PROBE3(zil__block__size, zilog_t *, zilog, uint64_t, blksz, + uint64_t, plan); + + return (zil_alloc_lwb(zilog, blksz, NULL, 0, 0, state)); } /* @@ -1843,7 +1909,7 @@ next_lwb: int wsz = lwb->lwb_sz; if (lwb->lwb_error == 0) { abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf, lwb->lwb_sz); - if (!lwb->lwb_slog || zilog->zl_cur_used <= zil_slog_bulk) + if (!lwb->lwb_slog || zilog->zl_cur_size <= zil_slog_bulk) prio = ZIO_PRIORITY_SYNC_WRITE; else prio = ZIO_PRIORITY_ASYNC_WRITE; @@ -2004,6 +2070,42 @@ zil_max_copied_data(zilog_t *zilog) return (MIN(max_data, zil_maxcopied)); } +static uint64_t +zil_itx_record_size(itx_t *itx) +{ + lr_t *lr = &itx->itx_lr; + + if (lr->lrc_txtype == TX_COMMIT) + return (0); + ASSERT3U(lr->lrc_reclen, >=, sizeof (lr_t)); + return (lr->lrc_reclen); +} + +static uint64_t +zil_itx_data_size(itx_t *itx) +{ + lr_t *lr = &itx->itx_lr; + lr_write_t *lrw = (lr_write_t *)lr; + + if (lr->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) { + ASSERT3U(lr->lrc_reclen, ==, sizeof (lr_write_t)); + return (P2ROUNDUP_TYPED(lrw->lr_length, sizeof (uint64_t), + uint64_t)); + } + return (0); +} + +static uint64_t +zil_itx_full_size(itx_t *itx) +{ + lr_t *lr = &itx->itx_lr; + + if (lr->lrc_txtype == TX_COMMIT) + return (0); + ASSERT3U(lr->lrc_reclen, >=, sizeof (lr_t)); + return (lr->lrc_reclen + zil_itx_data_size(itx)); +} + /* * Estimate space needed in the lwb for the itx. Allocate more lwbs or * split the itx as needed, but don't touch the actual transaction data. @@ -2046,16 +2148,9 @@ zil_lwb_assign(zilog_t *zilog, lwb_t *lwb, itx_t *itx, list_t *ilwbs) } reclen = lr->lrc_reclen; - if (lr->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) { - ASSERT3U(reclen, ==, sizeof (lr_write_t)); - dlen = P2ROUNDUP_TYPED( - lrw->lr_length, sizeof (uint64_t), uint64_t); - } else { - ASSERT3U(reclen, >=, sizeof (lr_t)); - dlen = 0; - } + ASSERT3U(reclen, >=, sizeof (lr_t)); ASSERT3U(reclen, <=, zil_max_log_data(zilog, 0)); - zilog->zl_cur_used += (reclen + dlen); + dlen = zil_itx_data_size(itx); cont: /* @@ -2096,6 +2191,7 @@ cont: clrw->lr_length = dnow; lrw->lr_offset += dnow; lrw->lr_length -= dnow; + zilog->zl_cur_left -= dnow; } else { citx = itx; clr = lr; @@ -2117,10 +2213,8 @@ cont: list_insert_tail(&lwb->lwb_itxs, citx); dlen -= dnow; - if (dlen > 0) { - zilog->zl_cur_used += reclen; + if (dlen > 0) goto cont; - } if (lr->lrc_txtype == TX_WRITE && lr->lrc_txg > spa_freeze_txg(zilog->zl_spa)) @@ -2147,13 +2241,8 @@ zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx) if (lr->lrc_txtype == TX_COMMIT) return; - if (lr->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) { - dlen = P2ROUNDUP_TYPED( - lrw->lr_length, sizeof (uint64_t), uint64_t); - } else { - dlen = 0; - } reclen = lr->lrc_reclen; + dlen = zil_itx_data_size(itx); ASSERT3U(reclen + dlen, <=, lwb->lwb_nused - lwb->lwb_nfilled); lr_buf = lwb->lwb_buf + lwb->lwb_nfilled; @@ -2584,6 +2673,7 @@ zil_get_commit_list(zilog_t *zilog) ASSERT(zilog_is_dirty_in_txg(zilog, txg) || spa_freeze_txg(zilog->zl_spa) != UINT64_MAX); list_t *sync_list = &itxg->itxg_itxs->i_sync_list; + itx_t *itx = NULL; if (unlikely(zilog->zl_suspend > 0)) { /* * ZIL was just suspended, but we lost the race. @@ -2593,10 +2683,20 @@ zil_get_commit_list(zilog_t *zilog) if (!list_is_empty(sync_list)) wtxg = MAX(wtxg, txg); } else { + itx = list_head(sync_list); list_move_tail(commit_list, sync_list); } mutex_exit(&itxg->itxg_lock); + + while (itx != NULL) { + uint64_t s = zil_itx_full_size(itx); + zilog->zl_cur_size += s; + zilog->zl_cur_left += s; + s = zil_itx_record_size(itx); + zilog->zl_cur_max = MAX(zilog->zl_cur_max, s); + itx = list_next(commit_list, itx); + } } return (wtxg); } @@ -2732,6 +2832,26 @@ zil_commit_writer_stall(zilog_t *zilog) ASSERT(list_is_empty(&zilog->zl_lwb_list)); } +static void +zil_burst_done(zilog_t *zilog) +{ + if (!list_is_empty(&zilog->zl_itx_commit_list) || + zilog->zl_cur_size == 0) + return; + + if (zilog->zl_parallel) + zilog->zl_parallel--; + + uint_t r = (zilog->zl_prev_rotor + 1) & (ZIL_BURSTS - 1); + zilog->zl_prev_rotor = r; + zilog->zl_prev_opt[r] = zil_lwb_plan(zilog, zilog->zl_cur_size, + &zilog->zl_prev_min[r]); + + zilog->zl_cur_size = 0; + zilog->zl_cur_max = 0; + zilog->zl_cur_left = 0; +} + /* * This function will traverse the commit list, creating new lwbs as * needed, and committing the itxs from the commit list to these newly @@ -2746,7 +2866,6 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs) list_t nolwb_waiters; lwb_t *lwb, *plwb; itx_t *itx; - boolean_t first = B_TRUE; ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); @@ -2772,9 +2891,22 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs) zil_commit_activate_saxattr_feature(zilog); ASSERT(lwb->lwb_state == LWB_STATE_NEW || lwb->lwb_state == LWB_STATE_OPENED); - first = (lwb->lwb_state == LWB_STATE_NEW) && - ((plwb = list_prev(&zilog->zl_lwb_list, lwb)) == NULL || - plwb->lwb_state == LWB_STATE_FLUSH_DONE); + + /* + * If the lwb is still opened, it means the workload is really + * multi-threaded and we won the chance of write aggregation. + * If it is not opened yet, but previous lwb is still not + * flushed, it still means the workload is multi-threaded, but + * there was too much time between the commits to aggregate, so + * we try aggregation next times, but without too much hopes. + */ + if (lwb->lwb_state == LWB_STATE_OPENED) { + zilog->zl_parallel = ZIL_BURSTS; + } else if ((plwb = list_prev(&zilog->zl_lwb_list, lwb)) + != NULL && plwb->lwb_state != LWB_STATE_FLUSH_DONE) { + zilog->zl_parallel = MAX(zilog->zl_parallel, + ZIL_BURSTS / 2); + } } while ((itx = list_remove_head(&zilog->zl_itx_commit_list)) != NULL) { @@ -2849,7 +2981,9 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs) * Our lwb is done, leave the rest of * itx list to somebody else who care. */ - first = B_FALSE; + zilog->zl_parallel = ZIL_BURSTS; + zilog->zl_cur_left -= + zil_itx_full_size(itx); break; } } else { @@ -2859,8 +2993,10 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs) } list_insert_tail(&nolwb_itxs, itx); } + zilog->zl_cur_left -= zil_itx_full_size(itx); } else { ASSERT3S(lrc->lrc_txtype, !=, TX_COMMIT); + zilog->zl_cur_left -= zil_itx_full_size(itx); zil_itx_destroy(itx); } } @@ -2941,28 +3077,15 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs) * try and pack as many itxs into as few lwbs as * possible, without significantly impacting the latency * of each individual itx. - * - * If we had no already running or open LWBs, it can be - * the workload is single-threaded. And if the ZIL write - * latency is very small or if the LWB is almost full, it - * may be cheaper to bypass the delay. */ - if (lwb->lwb_state == LWB_STATE_OPENED && first) { - hrtime_t sleep = zilog->zl_last_lwb_latency * - zfs_commit_timeout_pct / 100; - if (sleep < zil_min_commit_timeout || - lwb->lwb_nmax - lwb->lwb_nused < - lwb->lwb_nmax / 8) { - list_insert_tail(ilwbs, lwb); - lwb = zil_lwb_write_close(zilog, lwb, - LWB_STATE_NEW); - zilog->zl_cur_used = 0; - if (lwb == NULL) { - while ((lwb = list_remove_head(ilwbs)) - != NULL) - zil_lwb_write_issue(zilog, lwb); - zil_commit_writer_stall(zilog); - } + if (lwb->lwb_state == LWB_STATE_OPENED && !zilog->zl_parallel) { + zil_burst_done(zilog); + list_insert_tail(ilwbs, lwb); + lwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_NEW); + if (lwb == NULL) { + while ((lwb = list_remove_head(ilwbs)) != NULL) + zil_lwb_write_issue(zilog, lwb); + zil_commit_writer_stall(zilog); } } } @@ -3116,24 +3239,11 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw) * since we've reached the commit waiter's timeout and it still * hasn't been issued. */ + zil_burst_done(zilog); lwb_t *nlwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_NEW); ASSERT3S(lwb->lwb_state, ==, LWB_STATE_CLOSED); - /* - * Since the lwb's zio hadn't been issued by the time this thread - * reached its timeout, we reset the zilog's "zl_cur_used" field - * to influence the zil block size selection algorithm. - * - * By having to issue the lwb's zio here, it means the size of the - * lwb was too large, given the incoming throughput of itxs. By - * setting "zl_cur_used" to zero, we communicate this fact to the - * block size selection algorithm, so it can take this information - * into account, and potentially select a smaller size for the - * next lwb block that is allocated. - */ - zilog->zl_cur_used = 0; - if (nlwb == NULL) { /* * When zil_lwb_write_close() returns NULL, this @@ -3728,7 +3838,9 @@ zil_alloc(objset_t *os, zil_header_t *zh_phys) zilog->zl_dirty_max_txg = 0; zilog->zl_last_lwb_opened = NULL; zilog->zl_last_lwb_latency = 0; - zilog->zl_max_block_size = zil_maxblocksize; + zilog->zl_max_block_size = MIN(MAX(P2ALIGN_TYPED(zil_maxblocksize, + ZIL_MIN_BLKSZ, uint64_t), ZIL_MIN_BLKSZ), + spa_maxblocksize(dmu_objset_spa(os))); mutex_init(&zilog->zl_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&zilog->zl_issuer_lock, NULL, MUTEX_DEFAULT, NULL); @@ -3748,6 +3860,11 @@ zil_alloc(objset_t *os, zil_header_t *zh_phys) cv_init(&zilog->zl_cv_suspend, NULL, CV_DEFAULT, NULL); cv_init(&zilog->zl_lwb_io_cv, NULL, CV_DEFAULT, NULL); + for (int i = 0; i < ZIL_BURSTS; i++) { + zilog->zl_prev_opt[i] = zilog->zl_max_block_size - + sizeof (zil_chain_t); + } + return (zilog); } @@ -4250,9 +4367,6 @@ EXPORT_SYMBOL(zil_kstat_values_update); ZFS_MODULE_PARAM(zfs, zfs_, commit_timeout_pct, UINT, ZMOD_RW, "ZIL block open timeout percentage"); -ZFS_MODULE_PARAM(zfs_zil, zil_, min_commit_timeout, U64, ZMOD_RW, - "Minimum delay we care for ZIL block commit"); - ZFS_MODULE_PARAM(zfs_zil, zil_, replay_disable, INT, ZMOD_RW, "Disable intent logging replay"); |