diff options
Diffstat (limited to 'module/zfs/bptree.c')
-rw-r--r-- | module/zfs/bptree.c | 303 |
1 files changed, 303 insertions, 0 deletions
diff --git a/module/zfs/bptree.c b/module/zfs/bptree.c new file mode 100644 index 0000000000000..1827a3c4e3268 --- /dev/null +++ b/module/zfs/bptree.c @@ -0,0 +1,303 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + */ + +#include <sys/arc.h> +#include <sys/bptree.h> +#include <sys/dmu.h> +#include <sys/dmu_objset.h> +#include <sys/dmu_tx.h> +#include <sys/dmu_traverse.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_dir.h> +#include <sys/dsl_pool.h> +#include <sys/dnode.h> +#include <sys/spa.h> + +/* + * A bptree is a queue of root block pointers from destroyed datasets. When a + * dataset is destroyed its root block pointer is put on the end of the pool's + * bptree queue so the dataset's blocks can be freed asynchronously by + * dsl_scan_sync. This allows the delete operation to finish without traversing + * all the dataset's blocks. + * + * Note that while bt_begin and bt_end are only ever incremented in this code, + * they are effectively reset to 0 every time the entire bptree is freed because + * the bptree's object is destroyed and re-created. + */ + +struct bptree_args { + bptree_phys_t *ba_phys; /* data in bonus buffer, dirtied if freeing */ + boolean_t ba_free; /* true if freeing during traversal */ + + bptree_itor_t *ba_func; /* function to call for each blockpointer */ + void *ba_arg; /* caller supplied argument to ba_func */ + dmu_tx_t *ba_tx; /* caller supplied tx, NULL if not freeing */ +} bptree_args_t; + +uint64_t +bptree_alloc(objset_t *os, dmu_tx_t *tx) +{ + uint64_t obj; + dmu_buf_t *db; + bptree_phys_t *bt; + + obj = dmu_object_alloc(os, DMU_OTN_UINT64_METADATA, + SPA_OLD_MAXBLOCKSIZE, DMU_OTN_UINT64_METADATA, + sizeof (bptree_phys_t), tx); + + /* + * Bonus buffer contents are already initialized to 0, but for + * readability we make it explicit. + */ + VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db)); + dmu_buf_will_dirty(db, tx); + bt = db->db_data; + bt->bt_begin = 0; + bt->bt_end = 0; + bt->bt_bytes = 0; + bt->bt_comp = 0; + bt->bt_uncomp = 0; + dmu_buf_rele(db, FTAG); + + return (obj); +} + +int +bptree_free(objset_t *os, uint64_t obj, dmu_tx_t *tx) +{ + dmu_buf_t *db; + bptree_phys_t *bt; + + VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db)); + bt = db->db_data; + ASSERT3U(bt->bt_begin, ==, bt->bt_end); + ASSERT0(bt->bt_bytes); + ASSERT0(bt->bt_comp); + ASSERT0(bt->bt_uncomp); + dmu_buf_rele(db, FTAG); + + return (dmu_object_free(os, obj, tx)); +} + +boolean_t +bptree_is_empty(objset_t *os, uint64_t obj) +{ + dmu_buf_t *db; + bptree_phys_t *bt; + boolean_t rv; + + VERIFY0(dmu_bonus_hold(os, obj, FTAG, &db)); + bt = db->db_data; + rv = (bt->bt_begin == bt->bt_end); + dmu_buf_rele(db, FTAG); + return (rv); +} + +void +bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg, + uint64_t bytes, uint64_t comp, uint64_t uncomp, dmu_tx_t *tx) +{ + dmu_buf_t *db; + bptree_phys_t *bt; + bptree_entry_phys_t *bte; + + /* + * bptree objects are in the pool mos, therefore they can only be + * modified in syncing context. Furthermore, this is only modified + * by the sync thread, so no locking is necessary. + */ + ASSERT(dmu_tx_is_syncing(tx)); + + VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db)); + bt = db->db_data; + + bte = kmem_zalloc(sizeof (*bte), KM_SLEEP); + bte->be_birth_txg = birth_txg; + bte->be_bp = *bp; + dmu_write(os, obj, bt->bt_end * sizeof (*bte), sizeof (*bte), bte, tx); + kmem_free(bte, sizeof (*bte)); + + dmu_buf_will_dirty(db, tx); + bt->bt_end++; + bt->bt_bytes += bytes; + bt->bt_comp += comp; + bt->bt_uncomp += uncomp; + dmu_buf_rele(db, FTAG); +} + +/* ARGSUSED */ +static int +bptree_visit_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, + const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) +{ + int err; + struct bptree_args *ba = arg; + + if (zb->zb_level == ZB_DNODE_LEVEL || BP_IS_HOLE(bp) || + BP_IS_REDACTED(bp)) + return (0); + + err = ba->ba_func(ba->ba_arg, bp, ba->ba_tx); + if (err == 0 && ba->ba_free) { + ba->ba_phys->bt_bytes -= bp_get_dsize_sync(spa, bp); + ba->ba_phys->bt_comp -= BP_GET_PSIZE(bp); + ba->ba_phys->bt_uncomp -= BP_GET_UCSIZE(bp); + } + return (err); +} + +/* + * If "free" is set: + * - It is assumed that "func" will be freeing the block pointers. + * - If "func" returns nonzero, the bookmark will be remembered and + * iteration will be restarted from this point on next invocation. + * - If an i/o error is encountered (e.g. "func" returns EIO or ECKSUM), + * bptree_iterate will remember the bookmark, continue traversing + * any additional entries, and return 0. + * + * If "free" is not set, traversal will stop and return an error if + * an i/o error is encountered. + * + * In either case, if zfs_free_leak_on_eio is set, i/o errors will be + * ignored and traversal will continue (i.e. TRAVERSE_HARD will be passed to + * traverse_dataset_destroyed()). + */ +int +bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, bptree_itor_t func, + void *arg, dmu_tx_t *tx) +{ + boolean_t ioerr = B_FALSE; + int err; + uint64_t i; + dmu_buf_t *db; + struct bptree_args ba; + + ASSERT(!free || dmu_tx_is_syncing(tx)); + + err = dmu_bonus_hold(os, obj, FTAG, &db); + if (err != 0) + return (err); + + if (free) + dmu_buf_will_dirty(db, tx); + + ba.ba_phys = db->db_data; + ba.ba_free = free; + ba.ba_func = func; + ba.ba_arg = arg; + ba.ba_tx = tx; + + err = 0; + for (i = ba.ba_phys->bt_begin; i < ba.ba_phys->bt_end; i++) { + bptree_entry_phys_t bte; + int flags = TRAVERSE_PREFETCH_METADATA | TRAVERSE_POST | + TRAVERSE_NO_DECRYPT; + + err = dmu_read(os, obj, i * sizeof (bte), sizeof (bte), + &bte, DMU_READ_NO_PREFETCH); + if (err != 0) + break; + + if (zfs_free_leak_on_eio) + flags |= TRAVERSE_HARD; + zfs_dbgmsg("bptree index %lld: traversing from min_txg=%lld " + "bookmark %lld/%lld/%lld/%lld", + (longlong_t)i, + (longlong_t)bte.be_birth_txg, + (longlong_t)bte.be_zb.zb_objset, + (longlong_t)bte.be_zb.zb_object, + (longlong_t)bte.be_zb.zb_level, + (longlong_t)bte.be_zb.zb_blkid); + err = traverse_dataset_destroyed(os->os_spa, &bte.be_bp, + bte.be_birth_txg, &bte.be_zb, flags, + bptree_visit_cb, &ba); + if (free) { + /* + * The callback has freed the visited block pointers. + * Record our traversal progress on disk, either by + * updating this record's bookmark, or by logically + * removing this record by advancing bt_begin. + */ + if (err != 0) { + /* save bookmark for future resume */ + ASSERT3U(bte.be_zb.zb_objset, ==, + ZB_DESTROYED_OBJSET); + ASSERT0(bte.be_zb.zb_level); + dmu_write(os, obj, i * sizeof (bte), + sizeof (bte), &bte, tx); + if (err == EIO || err == ECKSUM || + err == ENXIO) { + /* + * Skip the rest of this tree and + * continue on to the next entry. + */ + err = 0; + ioerr = B_TRUE; + } else { + break; + } + } else if (ioerr) { + /* + * This entry is finished, but there were + * i/o errors on previous entries, so we + * can't adjust bt_begin. Set this entry's + * be_birth_txg such that it will be + * treated as a no-op in future traversals. + */ + bte.be_birth_txg = UINT64_MAX; + dmu_write(os, obj, i * sizeof (bte), + sizeof (bte), &bte, tx); + } + + if (!ioerr) { + ba.ba_phys->bt_begin++; + (void) dmu_free_range(os, obj, + i * sizeof (bte), sizeof (bte), tx); + } + } else if (err != 0) { + break; + } + } + + ASSERT(!free || err != 0 || ioerr || + ba.ba_phys->bt_begin == ba.ba_phys->bt_end); + + /* if all blocks are free there should be no used space */ + if (ba.ba_phys->bt_begin == ba.ba_phys->bt_end) { + if (zfs_free_leak_on_eio) { + ba.ba_phys->bt_bytes = 0; + ba.ba_phys->bt_comp = 0; + ba.ba_phys->bt_uncomp = 0; + } + + ASSERT0(ba.ba_phys->bt_bytes); + ASSERT0(ba.ba_phys->bt_comp); + ASSERT0(ba.ba_phys->bt_uncomp); + } + + dmu_buf_rele(db, FTAG); + + return (err); +} |