aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAlexander Motin <mav@FreeBSD.org>2024-04-22 18:41:03 +0000
committerGitHub <noreply@github.com>2024-04-22 18:41:03 +0000
commit4036b8d027fb7fe1a629b08a0d23cac975ab2eb9 (patch)
tree59854d445473c9951645b7b5bbd01eda3e22394b
parentc346068e5efeafd5676ab1644086877173ca4226 (diff)
downloadsrc-4036b8d027fb7fe1a629b08a0d23cac975ab2eb9.tar.gz
src-4036b8d027fb7fe1a629b08a0d23cac975ab2eb9.zip
Refactor dbuf_read() for safer decryption
In dbuf_read_verify_dnode_crypt(): - We don't need original dbuf locked there. Instead take a lock on a dnode dbuf, that is actually manipulated. - Block decryption for a dnode dbuf if it is currently being written. ARC hash lock does not protect anonymous buffers, so arc_untransform() is unsafe when used on buffers being written, that may happen in case of encrypted dnode buffers, since they are not copied by dbuf_dirty()/dbuf_hold_copy(). In dbuf_read(): - If the buffer is in flight, recheck its compression/encryption status after it is cached, since it may need arc_untransform(). Tested-by: Rich Ercolani <rincebrain@gmail.com> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Alexander Motin <mav@FreeBSD.org> Sponsored by: iXsystems, Inc. Closes #16104
-rw-r--r--module/zfs/dbuf.c214
1 files changed, 104 insertions, 110 deletions
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c
index 5f3643f573f7..bb913f556374 100644
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -161,13 +161,13 @@ struct {
} dbuf_sums;
#define DBUF_STAT_INCR(stat, val) \
- wmsum_add(&dbuf_sums.stat, val);
+ wmsum_add(&dbuf_sums.stat, val)
#define DBUF_STAT_DECR(stat, val) \
- DBUF_STAT_INCR(stat, -(val));
+ DBUF_STAT_INCR(stat, -(val))
#define DBUF_STAT_BUMP(stat) \
- DBUF_STAT_INCR(stat, 1);
+ DBUF_STAT_INCR(stat, 1)
#define DBUF_STAT_BUMPDOWN(stat) \
- DBUF_STAT_INCR(stat, -1);
+ DBUF_STAT_INCR(stat, -1)
#define DBUF_STAT_MAX(stat, v) { \
uint64_t _m; \
while ((v) > (_m = dbuf_stats.stat.value.ui64) && \
@@ -177,7 +177,6 @@ struct {
static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
static void dbuf_sync_leaf_verify_bonus_dnode(dbuf_dirty_record_t *dr);
-static int dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags);
/*
* Global data structures and functions for the dbuf cache.
@@ -1418,13 +1417,9 @@ dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
* a decrypted block. Otherwise success.
*/
static int
-dbuf_read_bonus(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags)
+dbuf_read_bonus(dmu_buf_impl_t *db, dnode_t *dn)
{
- int bonuslen, max_bonuslen, err;
-
- err = dbuf_read_verify_dnode_crypt(db, flags);
- if (err)
- return (err);
+ int bonuslen, max_bonuslen;
bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);
max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
@@ -1509,32 +1504,46 @@ dbuf_read_hole(dmu_buf_impl_t *db, dnode_t *dn, blkptr_t *bp)
* decrypt / authenticate them when we need to read an encrypted bonus buffer.
*/
static int
-dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags)
+dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags)
{
- int err = 0;
objset_t *os = db->db_objset;
- arc_buf_t *dnode_abuf;
- dnode_t *dn;
+ dmu_buf_impl_t *dndb;
+ arc_buf_t *dnbuf;
zbookmark_phys_t zb;
-
- ASSERT(MUTEX_HELD(&db->db_mtx));
+ int err;
if ((flags & DB_RF_NO_DECRYPT) != 0 ||
- !os->os_encrypted || os->os_raw_receive)
+ !os->os_encrypted || os->os_raw_receive ||
+ (dndb = dn->dn_dbuf) == NULL)
return (0);
- DB_DNODE_ENTER(db);
- dn = DB_DNODE(db);
- dnode_abuf = (dn->dn_dbuf != NULL) ? dn->dn_dbuf->db_buf : NULL;
-
- if (dnode_abuf == NULL || !arc_is_encrypted(dnode_abuf)) {
- DB_DNODE_EXIT(db);
+ dnbuf = dndb->db_buf;
+ if (!arc_is_encrypted(dnbuf))
return (0);
- }
+
+ mutex_enter(&dndb->db_mtx);
+
+ /*
+ * Since dnode buffer is modified by sync process, there can be only
+ * one copy of it. It means we can not modify (decrypt) it while it
+ * is being written. I don't see how this may happen now, since
+ * encrypted dnode writes by receive should be completed before any
+ * plain-text reads due to txg wait, but better be safe than sorry.
+ */
+ while (1) {
+ if (!arc_is_encrypted(dnbuf)) {
+ mutex_exit(&dndb->db_mtx);
+ return (0);
+ }
+ dbuf_dirty_record_t *dr = dndb->db_data_pending;
+ if (dr == NULL || dr->dt.dl.dr_data != dnbuf)
+ break;
+ cv_wait(&dndb->db_changed, &dndb->db_mtx);
+ };
SET_BOOKMARK(&zb, dmu_objset_id(os),
- DMU_META_DNODE_OBJECT, 0, dn->dn_dbuf->db_blkid);
- err = arc_untransform(dnode_abuf, os->os_spa, &zb, B_TRUE);
+ DMU_META_DNODE_OBJECT, 0, dndb->db_blkid);
+ err = arc_untransform(dnbuf, os->os_spa, &zb, B_TRUE);
/*
* An error code of EACCES tells us that the key is still not
@@ -1547,7 +1556,7 @@ dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags)
!DMU_OT_IS_ENCRYPTED(dn->dn_bonustype))))
err = 0;
- DB_DNODE_EXIT(db);
+ mutex_exit(&dndb->db_mtx);
return (err);
}
@@ -1573,7 +1582,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags,
RW_LOCK_HELD(&db->db_parent->db_rwlock));
if (db->db_blkid == DMU_BONUS_BLKID) {
- err = dbuf_read_bonus(db, dn, flags);
+ err = dbuf_read_bonus(db, dn);
goto early_unlock;
}
@@ -1635,10 +1644,6 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags,
goto early_unlock;
}
- err = dbuf_read_verify_dnode_crypt(db, flags);
- if (err != 0)
- goto early_unlock;
-
db->db_state = DB_READ;
DTRACE_SET_STATE(db, "read issued");
mutex_exit(&db->db_mtx);
@@ -1754,19 +1759,23 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
int
dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)
{
- int err = 0;
- boolean_t prefetch;
dnode_t *dn;
+ boolean_t miss = B_TRUE, need_wait = B_FALSE, prefetch;
+ int err;
- /*
- * We don't have to hold the mutex to check db_state because it
- * can't be freed while we have a hold on the buffer.
- */
ASSERT(!zfs_refcount_is_zero(&db->db_holds));
DB_DNODE_ENTER(db);
dn = DB_DNODE(db);
+ /*
+ * Ensure that this block's dnode has been decrypted if the caller
+ * has requested decrypted data.
+ */
+ err = dbuf_read_verify_dnode_crypt(db, dn, flags);
+ if (err != 0)
+ goto done;
+
prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
(flags & DB_RF_NOPREFETCH) == 0;
@@ -1775,13 +1784,38 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)
db->db_partial_read = B_TRUE;
else if (!(flags & DB_RF_PARTIAL_MORE))
db->db_partial_read = B_FALSE;
- if (db->db_state == DB_CACHED) {
+ miss = (db->db_state != DB_CACHED);
+
+ if (db->db_state == DB_READ || db->db_state == DB_FILL) {
/*
- * Ensure that this block's dnode has been decrypted if
- * the caller has requested decrypted data.
+ * Another reader came in while the dbuf was in flight between
+ * UNCACHED and CACHED. Either a writer will finish filling
+ * the buffer, sending the dbuf to CACHED, or the first reader's
+ * request will reach the read_done callback and send the dbuf
+ * to CACHED. Otherwise, a failure occurred and the dbuf will
+ * be sent to UNCACHED.
*/
- err = dbuf_read_verify_dnode_crypt(db, flags);
+ if (flags & DB_RF_NEVERWAIT) {
+ mutex_exit(&db->db_mtx);
+ DB_DNODE_EXIT(db);
+ goto done;
+ }
+ do {
+ ASSERT(db->db_state == DB_READ ||
+ (flags & DB_RF_HAVESTRUCT) == 0);
+ DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *, db,
+ zio_t *, pio);
+ cv_wait(&db->db_changed, &db->db_mtx);
+ } while (db->db_state == DB_READ || db->db_state == DB_FILL);
+ if (db->db_state == DB_UNCACHED) {
+ err = SET_ERROR(EIO);
+ mutex_exit(&db->db_mtx);
+ DB_DNODE_EXIT(db);
+ goto done;
+ }
+ }
+ if (db->db_state == DB_CACHED) {
/*
* If the arc buf is compressed or encrypted and the caller
* requested uncompressed data, we need to untransform it
@@ -1789,8 +1823,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)
* unauthenticated blocks, which will verify their MAC if
* the key is now available.
*/
- if (err == 0 && db->db_buf != NULL &&
- (flags & DB_RF_NO_DECRYPT) == 0 &&
+ if ((flags & DB_RF_NO_DECRYPT) == 0 && db->db_buf != NULL &&
(arc_is_encrypted(db->db_buf) ||
arc_is_unauthenticated(db->db_buf) ||
arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF)) {
@@ -1804,17 +1837,10 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)
dbuf_set_data(db, db->db_buf);
}
mutex_exit(&db->db_mtx);
- if (err == 0 && prefetch) {
- dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE,
- B_FALSE, flags & DB_RF_HAVESTRUCT);
- }
- DB_DNODE_EXIT(db);
- DBUF_STAT_BUMP(hash_hits);
- } else if (db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL) {
- boolean_t need_wait = B_FALSE;
-
+ } else {
+ ASSERT(db->db_state == DB_UNCACHED ||
+ db->db_state == DB_NOFILL);
db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG);
-
if (pio == NULL && (db->db_state == DB_NOFILL ||
(db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)))) {
spa_t *spa = dn->dn_objset->os_spa;
@@ -1822,65 +1848,33 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)
need_wait = B_TRUE;
}
err = dbuf_read_impl(db, dn, pio, flags, dblt, FTAG);
- /*
- * dbuf_read_impl has dropped db_mtx and our parent's rwlock
- * for us
- */
- if (!err && prefetch) {
- dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE,
- db->db_state != DB_CACHED,
- flags & DB_RF_HAVESTRUCT);
- }
-
- DB_DNODE_EXIT(db);
- DBUF_STAT_BUMP(hash_misses);
+ /* dbuf_read_impl drops db_mtx and parent's rwlock. */
+ miss = (db->db_state != DB_CACHED);
+ }
- /*
- * If we created a zio_root we must execute it to avoid
- * leaking it, even if it isn't attached to any work due
- * to an error in dbuf_read_impl().
- */
- if (need_wait) {
- if (err == 0)
- err = zio_wait(pio);
- else
- (void) zio_wait(pio);
- pio = NULL;
- }
- } else {
- /*
- * Another reader came in while the dbuf was in flight
- * between UNCACHED and CACHED. Either a writer will finish
- * writing the buffer (sending the dbuf to CACHED) or the
- * first reader's request will reach the read_done callback
- * and send the dbuf to CACHED. Otherwise, a failure
- * occurred and the dbuf went to UNCACHED.
- */
- mutex_exit(&db->db_mtx);
- if (prefetch) {
- dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE,
- B_TRUE, flags & DB_RF_HAVESTRUCT);
- }
- DB_DNODE_EXIT(db);
- DBUF_STAT_BUMP(hash_misses);
+ if (err == 0 && prefetch) {
+ dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE, miss,
+ flags & DB_RF_HAVESTRUCT);
+ }
+ DB_DNODE_EXIT(db);
- /* Skip the wait per the caller's request. */
- if ((flags & DB_RF_NEVERWAIT) == 0) {
- mutex_enter(&db->db_mtx);
- while (db->db_state == DB_READ ||
- db->db_state == DB_FILL) {
- ASSERT(db->db_state == DB_READ ||
- (flags & DB_RF_HAVESTRUCT) == 0);
- DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *,
- db, zio_t *, pio);
- cv_wait(&db->db_changed, &db->db_mtx);
- }
- if (db->db_state == DB_UNCACHED)
- err = SET_ERROR(EIO);
- mutex_exit(&db->db_mtx);
- }
+ /*
+ * If we created a zio we must execute it to avoid leaking it, even if
+ * it isn't attached to any work due to an error in dbuf_read_impl().
+ */
+ if (need_wait) {
+ if (err == 0)
+ err = zio_wait(pio);
+ else
+ (void) zio_wait(pio);
+ pio = NULL;
}
+done:
+ if (miss)
+ DBUF_STAT_BUMP(hash_misses);
+ else
+ DBUF_STAT_BUMP(hash_hits);
if (pio && err != 0) {
zio_t *zio = zio_null(pio, pio->io_spa, NULL, NULL, NULL,
ZIO_FLAG_CANFAIL);