From 30af21b02569ac192f52ce6e6511015f8a8d5729 Mon Sep 17 00:00:00 2001
From: Paul Dagnelie <pcd@delphix.com>
Date: Wed, 19 Jun 2019 09:48:13 -0700
Subject: Implement Redacted Send/Receive

Redacted send/receive allows users to send subsets of their data to
a target system. One possible use case for this feature is to avoid
transmitting sensitive information to a data warehousing, test/dev, or
analytics environment. Another is to save space by not replicating
unimportant data within a given dataset, for example in backup tools
like zrepl.

Redacted send/receive is a three-stage process. First, a clone (or
clones) is made of the snapshot to be sent to the target. In this
clone (or clones), all unnecessary or unwanted data is removed or
modified. This clone is then snapshotted to create the "redaction
snapshot" (or snapshots). Second, the new zfs redact command is used
to create a redaction bookmark. The redaction bookmark stores the
list of blocks in a snapshot that were modified by the redaction
snapshot(s). Finally, the redaction bookmark is passed as a parameter
to zfs send. When sending to the snapshot that was redacted, the
redaction bookmark is used to filter out blocks that contain sensitive
or unwanted information, and those blocks are not included in the send
stream.  When sending from the redaction bookmark, the blocks it
contains are considered as candidate blocks in addition to those
blocks in the destination snapshot that were modified since the
creation_txg of the redaction bookmark.  This step is necessary to
allow the target to rehydrate data in the case where some blocks are
accidentally or unnecessarily modified in the redaction snapshot.

The changes to bookmarks to enable fast space estimation involve
adding deadlists to bookmarks. There is also logic to manage the
life cycles of these deadlists.

The new size estimation process operates in cases where previously
an accurate estimate could not be provided. In those cases, a send
is performed where no data blocks are read, reducing the runtime
significantly and providing a byte-accurate size estimate.

Reviewed-by: Dan Kimmel <dan.kimmel@delphix.com>
Reviewed-by: Matt Ahrens <mahrens@delphix.com>
Reviewed-by: Prashanth Sreenivasa <pks@delphix.com>
Reviewed-by: John Kennedy <john.kennedy@delphix.com>
Reviewed-by: George Wilson <george.wilson@delphix.com>
Reviewed-by: Chris Williamson <chris.williamson@delphix.com>
Reviewed-by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Sebastien Roy <sebastien.roy@delphix.com>
Reviewed-by: Prakash Surya <prakash.surya@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Paul Dagnelie <pcd@delphix.com>
Closes #7958
---
 module/zfs/dsl_dataset.c | 410 ++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 312 insertions(+), 98 deletions(-)

(limited to 'module/zfs/dsl_dataset.c')

diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c
index 966c2cc93d13..0cd458ef4364 100644
--- a/module/zfs/dsl_dataset.c
+++ b/module/zfs/dsl_dataset.c
@@ -57,6 +57,7 @@
 #include <sys/dsl_userhold.h>
 #include <sys/dsl_bookmark.h>
 #include <sys/policy.h>
+#include <sys/dmu_send.h>
 #include <sys/dmu_recv.h>
 #include <sys/zio_compress.h>
 #include <zfs_fletcher.h>
@@ -72,6 +73,7 @@
  * of this setting.
  */
 int zfs_max_recordsize = 1 * 1024 * 1024;
+int zfs_allow_redacted_dataset_mount = 0;
 
 #define	SWITCH64(x, y) \
 	{ \
@@ -131,7 +133,7 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
 
 	ASSERT(dmu_tx_is_syncing(tx));
 	/* It could have been compressed away to nothing */
-	if (BP_IS_HOLE(bp))
+	if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp))
 		return;
 	ASSERT(BP_GET_TYPE(bp) != DMU_OT_NONE);
 	ASSERT(DMU_OT_IS_VALID(BP_GET_TYPE(bp)));
@@ -220,7 +222,7 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
 	int compressed = BP_GET_PSIZE(bp);
 	int uncompressed = BP_GET_UCSIZE(bp);
 
-	if (BP_IS_HOLE(bp))
+	if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp))
 		return (0);
 
 	ASSERT(dmu_tx_is_syncing(tx));
@@ -284,6 +286,9 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
 			    DD_USED_HEAD, DD_USED_SNAP, tx);
 		}
 	}
+
+	dsl_bookmark_block_killed(ds, bp, tx);
+
 	mutex_enter(&ds->ds_lock);
 	ASSERT3U(dsl_dataset_phys(ds)->ds_referenced_bytes, >=, used);
 	dsl_dataset_phys(ds)->ds_referenced_bytes -= used;
@@ -395,6 +400,8 @@ dsl_dataset_evict_async(void *dbu)
 		ds->ds_prev = NULL;
 	}
 
+	dsl_bookmark_fini_ds(ds);
+
 	bplist_destroy(&ds->ds_pending_deadlist);
 	if (dsl_deadlist_is_open(&ds->ds_deadlist))
 		dsl_deadlist_close(&ds->ds_deadlist);
@@ -564,8 +571,8 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag,
 
 		bplist_create(&ds->ds_pending_deadlist);
 
-		list_create(&ds->ds_sendstreams, sizeof (dmu_sendarg_t),
-		    offsetof(dmu_sendarg_t, dsa_link));
+		list_create(&ds->ds_sendstreams, sizeof (dmu_sendstatus_t),
+		    offsetof(dmu_sendstatus_t, dss_link));
 
 		list_create(&ds->ds_prop_cbs, sizeof (dsl_prop_cb_record_t),
 		    offsetof(dsl_prop_cb_record_t, cbr_ds_node));
@@ -588,14 +595,7 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag,
 				    dsl_dataset_phys(ds)->ds_prev_snap_obj,
 				    ds, &ds->ds_prev);
 			}
-			if (doi.doi_type == DMU_OTN_ZAP_METADATA) {
-				int zaperr = zap_lookup(mos, ds->ds_object,
-				    DS_FIELD_BOOKMARK_NAMES,
-				    sizeof (ds->ds_bookmarks), 1,
-				    &ds->ds_bookmarks);
-				if (zaperr != ENOENT)
-					VERIFY0(zaperr);
-			}
+			err = dsl_bookmark_init_ds(ds);
 		} else {
 			if (zfs_flags & ZFS_DEBUG_SNAPNAMES)
 				err = dsl_dataset_get_snapname(ds);
@@ -647,9 +647,15 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag,
 			dsl_deadlist_close(&ds->ds_deadlist);
 			if (dsl_deadlist_is_open(&ds->ds_remap_deadlist))
 				dsl_deadlist_close(&ds->ds_remap_deadlist);
+			dsl_bookmark_fini_ds(ds);
 			if (ds->ds_prev)
 				dsl_dataset_rele(ds->ds_prev, ds);
 			dsl_dir_rele(ds->ds_dir, ds);
+			for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
+				if (dsl_dataset_feature_is_active(ds, f))
+					unload_zfeature(ds, f);
+			}
+
 			list_destroy(&ds->ds_prop_cbs);
 			list_destroy(&ds->ds_sendstreams);
 			mutex_destroy(&ds->ds_lock);
@@ -784,14 +790,14 @@ dsl_dataset_hold(dsl_pool_t *dp, const char *name, void *tag,
 	return (dsl_dataset_hold_flags(dp, name, 0, tag, dsp));
 }
 
-int
-dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj, ds_hold_flags_t flags,
-    void *tag, dsl_dataset_t **dsp)
+static int
+dsl_dataset_own_obj_impl(dsl_pool_t *dp, uint64_t dsobj, ds_hold_flags_t flags,
+    void *tag, boolean_t override, dsl_dataset_t **dsp)
 {
 	int err = dsl_dataset_hold_obj_flags(dp, dsobj, flags, tag, dsp);
 	if (err != 0)
 		return (err);
-	if (!dsl_dataset_tryown(*dsp, tag)) {
+	if (!dsl_dataset_tryown(*dsp, tag, override)) {
 		dsl_dataset_rele_flags(*dsp, flags, tag);
 		*dsp = NULL;
 		return (SET_ERROR(EBUSY));
@@ -799,20 +805,49 @@ dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj, ds_hold_flags_t flags,
 	return (0);
 }
 
+
 int
-dsl_dataset_own(dsl_pool_t *dp, const char *name, ds_hold_flags_t flags,
+dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj, ds_hold_flags_t flags,
     void *tag, dsl_dataset_t **dsp)
+{
+	return (dsl_dataset_own_obj_impl(dp, dsobj, flags, tag, B_FALSE, dsp));
+}
+
+int
+dsl_dataset_own_obj_force(dsl_pool_t *dp, uint64_t dsobj,
+    ds_hold_flags_t flags, void *tag, dsl_dataset_t **dsp)
+{
+	return (dsl_dataset_own_obj_impl(dp, dsobj, flags, tag, B_TRUE, dsp));
+}
+
+static int
+dsl_dataset_own_impl(dsl_pool_t *dp, const char *name, ds_hold_flags_t flags,
+    void *tag, boolean_t override, dsl_dataset_t **dsp)
 {
 	int err = dsl_dataset_hold_flags(dp, name, flags, tag, dsp);
 	if (err != 0)
 		return (err);
-	if (!dsl_dataset_tryown(*dsp, tag)) {
+	if (!dsl_dataset_tryown(*dsp, tag, override)) {
 		dsl_dataset_rele_flags(*dsp, flags, tag);
 		return (SET_ERROR(EBUSY));
 	}
 	return (0);
 }
 
+int
+dsl_dataset_own_force(dsl_pool_t *dp, const char *name, ds_hold_flags_t flags,
+    void *tag, dsl_dataset_t **dsp)
+{
+	return (dsl_dataset_own_impl(dp, name, flags, tag, B_TRUE, dsp));
+}
+
+int
+dsl_dataset_own(dsl_pool_t *dp, const char *name, ds_hold_flags_t flags,
+    void *tag, dsl_dataset_t **dsp)
+{
+	return (dsl_dataset_own_impl(dp, name, flags, tag, B_FALSE, dsp));
+}
+
 /*
  * See the comment above dsl_pool_hold() for details.  In summary, a long
  * hold is used to prevent destruction of a dataset while the pool hold
@@ -927,13 +962,16 @@ dsl_dataset_disown(dsl_dataset_t *ds, ds_hold_flags_t flags, void *tag)
 }
 
 boolean_t
-dsl_dataset_tryown(dsl_dataset_t *ds, void *tag)
+dsl_dataset_tryown(dsl_dataset_t *ds, void *tag, boolean_t override)
 {
 	boolean_t gotit = FALSE;
 
 	ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
 	mutex_enter(&ds->ds_lock);
-	if (ds->ds_owner == NULL && !DS_IS_INCONSISTENT(ds)) {
+	if (ds->ds_owner == NULL && (override || !(DS_IS_INCONSISTENT(ds) ||
+	    (dsl_dataset_feature_is_active(ds,
+	    SPA_FEATURE_REDACTED_DATASETS) &&
+	    !zfs_allow_redacted_dataset_mount)))) {
 		ds->ds_owner = tag;
 		dsl_dataset_long_hold(ds, tag);
 		gotit = TRUE;
@@ -1696,6 +1734,7 @@ dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname,
 	    dsl_dataset_phys(ds)->ds_deadlist_obj);
 	dsl_deadlist_add_key(&ds->ds_deadlist,
 	    dsl_dataset_phys(ds)->ds_prev_snap_txg, tx);
+	dsl_bookmark_snapshotted(ds, tx);
 
 	if (dsl_dataset_remap_deadlist_exists(ds)) {
 		uint64_t remap_deadlist_obj =
@@ -2013,6 +2052,8 @@ dsl_dataset_sync_done(dsl_dataset_t *ds, dmu_tx_t *tx)
 	bplist_iterate(&ds->ds_pending_deadlist,
 	    deadlist_enqueue_cb, &ds->ds_deadlist, tx);
 
+	dsl_bookmark_sync_done(ds, tx);
+
 	if (os->os_synced_dnodes != NULL) {
 		multilist_destroy(os->os_synced_dnodes);
 		os->os_synced_dnodes = NULL;
@@ -2151,6 +2192,34 @@ get_receive_resume_stats_impl(dsl_dataset_t *ds)
 		    DS_FIELD_RESUME_RAWOK) == 0) {
 			fnvlist_add_boolean(token_nv, "rawok");
 		}
+		if (dsl_dataset_feature_is_active(ds,
+		    SPA_FEATURE_REDACTED_DATASETS)) {
+			uint64_t num_redact_snaps;
+			uint64_t *redact_snaps;
+			VERIFY(dsl_dataset_get_uint64_array_feature(ds,
+			    SPA_FEATURE_REDACTED_DATASETS, &num_redact_snaps,
+			    &redact_snaps));
+			fnvlist_add_uint64_array(token_nv, "redact_snaps",
+			    redact_snaps, num_redact_snaps);
+		}
+		if (zap_contains(dp->dp_meta_objset, ds->ds_object,
+		    DS_FIELD_RESUME_REDACT_BOOKMARK_SNAPS) == 0) {
+			uint64_t num_redact_snaps, int_size;
+			uint64_t *redact_snaps;
+			VERIFY0(zap_length(dp->dp_meta_objset, ds->ds_object,
+			    DS_FIELD_RESUME_REDACT_BOOKMARK_SNAPS, &int_size,
+			    &num_redact_snaps));
+			ASSERT3U(int_size, ==, sizeof (uint64_t));
+
+			redact_snaps = kmem_alloc(int_size * num_redact_snaps,
+			    KM_SLEEP);
+			VERIFY0(zap_lookup(dp->dp_meta_objset, ds->ds_object,
+			    DS_FIELD_RESUME_REDACT_BOOKMARK_SNAPS, int_size,
+			    num_redact_snaps, redact_snaps));
+			fnvlist_add_uint64_array(token_nv, "book_redact_snaps",
+			    redact_snaps, num_redact_snaps);
+			kmem_free(redact_snaps, int_size * num_redact_snaps);
+		}
 		packed = fnvlist_pack(token_nv, &packed_size);
 		fnvlist_free(token_nv);
 		compressed = kmem_alloc(packed_size, KM_SLEEP);
@@ -2336,6 +2405,13 @@ dsl_get_inconsistent(dsl_dataset_t *ds)
 	    1 : 0);
 }
 
+uint64_t
+dsl_get_redacted(dsl_dataset_t *ds)
+{
+	return (dsl_dataset_feature_is_active(ds,
+	    SPA_FEATURE_REDACTED_DATASETS));
+}
+
 uint64_t
 dsl_get_available(dsl_dataset_t *ds)
 {
@@ -2391,6 +2467,18 @@ dsl_get_prev_snap(dsl_dataset_t *ds, char *snap)
 	}
 }
 
+void
+dsl_get_redact_snaps(dsl_dataset_t *ds, nvlist_t *propval)
+{
+	uint64_t nsnaps;
+	uint64_t *snaps;
+	if (dsl_dataset_get_uint64_array_feature(ds,
+	    SPA_FEATURE_REDACTED_DATASETS, &nsnaps, &snaps)) {
+		fnvlist_add_uint64_array(propval, ZPROP_VALUE, snaps,
+		    nsnaps);
+	}
+}
+
 /*
  * Returns the mountpoint property and source for the given dataset in the value
  * and source buffers. The value buffer must be at least as large as MAXPATHLEN
@@ -2496,6 +2584,12 @@ dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv)
 		dsl_dir_stats(ds->ds_dir, nv);
 	}
 
+	nvlist_t *propval = fnvlist_alloc();
+	dsl_get_redact_snaps(ds, propval);
+	fnvlist_add_nvlist(nv, zfs_prop_to_name(ZFS_PROP_REDACT_SNAPS),
+	    propval);
+	nvlist_free(propval);
+
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_AVAILABLE,
 	    dsl_get_available(ds));
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFERENCED,
@@ -2564,6 +2658,7 @@ dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat)
 	stat->dds_creation_txg = dsl_get_creationtxg(ds);
 	stat->dds_inconsistent = dsl_get_inconsistent(ds);
 	stat->dds_guid = dsl_get_guid(ds);
+	stat->dds_redacted = dsl_get_redacted(ds);
 	stat->dds_origin[0] = '\0';
 	if (ds->ds_is_snapshot) {
 		stat->dds_is_snapshot = B_TRUE;
@@ -2891,28 +2986,11 @@ dsl_dataset_rollback_check(void *arg, dmu_tx_t *tx)
 	}
 
 	/* must not have any bookmarks after the most recent snapshot */
-	nvlist_t *proprequest = fnvlist_alloc();
-	fnvlist_add_boolean(proprequest, zfs_prop_to_name(ZFS_PROP_CREATETXG));
-	nvlist_t *bookmarks = fnvlist_alloc();
-	error = dsl_get_bookmarks_impl(ds, proprequest, bookmarks);
-	fnvlist_free(proprequest);
-	if (error != 0) {
+	if (dsl_bookmark_latest_txg(ds) >
+	    dsl_dataset_phys(ds)->ds_prev_snap_txg) {
 		dsl_dataset_rele(ds, FTAG);
-		return (error);
-	}
-	for (nvpair_t *pair = nvlist_next_nvpair(bookmarks, NULL);
-	    pair != NULL; pair = nvlist_next_nvpair(bookmarks, pair)) {
-		nvlist_t *valuenv =
-		    fnvlist_lookup_nvlist(fnvpair_value_nvlist(pair),
-		    zfs_prop_to_name(ZFS_PROP_CREATETXG));
-		uint64_t createtxg = fnvlist_lookup_uint64(valuenv, "value");
-		if (createtxg > dsl_dataset_phys(ds)->ds_prev_snap_txg) {
-			fnvlist_free(bookmarks);
-			dsl_dataset_rele(ds, FTAG);
-			return (SET_ERROR(EEXIST));
-		}
+		return (SET_ERROR(EEXIST));
 	}
-	fnvlist_free(bookmarks);
 
 	error = dsl_dataset_handoff_check(ds, ddra->ddra_owner, tx);
 	if (error != 0) {
@@ -3025,7 +3103,7 @@ dsl_dataset_promote_check(void *arg, dmu_tx_t *tx)
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_dataset_t *hds;
 	struct promotenode *snap;
-	dsl_dataset_t *origin_ds;
+	dsl_dataset_t *origin_ds, *origin_head;
 	int err;
 	uint64_t unused;
 	uint64_t ss_mv_cnt;
@@ -3045,6 +3123,7 @@ dsl_dataset_promote_check(void *arg, dmu_tx_t *tx)
 	}
 
 	snap = list_head(&ddpa->shared_snaps);
+	origin_head = (snap != NULL) ? snap->ds : NULL;
 	if (snap == NULL) {
 		err = SET_ERROR(ENOENT);
 		goto out;
@@ -3141,6 +3220,32 @@ dsl_dataset_promote_check(void *arg, dmu_tx_t *tx)
 		ddpa->uncomp += dluncomp;
 	}
 
+	/*
+	 * Check that bookmarks that are being transferred don't have
+	 * name conflicts.
+	 */
+	for (dsl_bookmark_node_t *dbn = avl_first(&origin_head->ds_bookmarks);
+	    dbn != NULL && dbn->dbn_phys.zbm_creation_txg <=
+	    dsl_dataset_phys(origin_ds)->ds_creation_txg;
+	    dbn = AVL_NEXT(&origin_head->ds_bookmarks, dbn)) {
+		if (strlen(dbn->dbn_name) >= max_snap_len) {
+			err = SET_ERROR(ENAMETOOLONG);
+			goto out;
+		}
+		zfs_bookmark_phys_t bm;
+		err = dsl_bookmark_lookup_impl(ddpa->ddpa_clone,
+		    dbn->dbn_name, &bm);
+
+		if (err == 0) {
+			fnvlist_add_boolean(ddpa->err_ds, dbn->dbn_name);
+			conflicting_snaps = B_TRUE;
+		} else if (err == ESRCH) {
+			err = 0;
+		} else if (err != 0) {
+			goto out;
+		}
+	}
+
 	/*
 	 * In order to return the full list of conflicting snapshots, we check
 	 * whether there was a conflict after traversing all of them.
@@ -3298,6 +3403,25 @@ dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx)
 		    dsl_dir_phys(dd)->dd_clones, origin_head->ds_object, tx));
 	}
 
+	/*
+	 * Move bookmarks to this dir.
+	 */
+	dsl_bookmark_node_t *dbn_next;
+	for (dsl_bookmark_node_t *dbn = avl_first(&origin_head->ds_bookmarks);
+	    dbn != NULL && dbn->dbn_phys.zbm_creation_txg <=
+	    dsl_dataset_phys(origin_ds)->ds_creation_txg;
+	    dbn = dbn_next) {
+		dbn_next = AVL_NEXT(&origin_head->ds_bookmarks, dbn);
+
+		avl_remove(&origin_head->ds_bookmarks, dbn);
+		VERIFY0(zap_remove(dp->dp_meta_objset,
+		    origin_head->ds_bookmarks_obj, dbn->dbn_name, tx));
+
+		dsl_bookmark_node_add(hds, dbn, tx);
+	}
+
+	dsl_bookmark_next_changed(hds, origin_ds, tx);
+
 	/* move snapshots to this dir */
 	for (snap = list_head(&ddpa->shared_snaps); snap;
 	    snap = list_next(&ddpa->shared_snaps, snap)) {
@@ -3758,9 +3882,9 @@ dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone,
 	    dsl_dataset_phys(clone)->ds_unique_bytes);
 
 	/*
-	 * Reset origin's unique bytes, if it exists.
+	 * Reset origin's unique bytes.
 	 */
-	if (clone->ds_prev) {
+	{
 		dsl_dataset_t *origin = clone->ds_prev;
 		uint64_t comp, uncomp;
 
@@ -3858,6 +3982,12 @@ dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone,
 	    dsl_dataset_phys(origin_head)->ds_deadlist_obj);
 	dsl_dataset_swap_remap_deadlists(clone, origin_head, tx);
 
+	/*
+	 * If there is a bookmark at the origin, its "next dataset" is
+	 * changing, so we need to reset its FBN.
+	 */
+	dsl_bookmark_next_changed(origin_head, origin_head->ds_prev, tx);
+
 	dsl_scan_ds_clone_swapped(origin_head, clone, tx);
 
 	spa_history_log_internal_ds(clone, "clone swap", tx,
@@ -4148,95 +4278,145 @@ dsl_dataset_set_refreservation(const char *dsname, zprop_source_t source,
 }
 
 /*
- * Return (in *usedp) the amount of space written in new that is not
- * present in oldsnap.  New may be a snapshot or the head.  Old must be
- * a snapshot before new, in new's filesystem (or its origin).  If not then
- * fail and return EINVAL.
+ * Return (in *usedp) the amount of space referenced by "new" that was not
+ * referenced at the time the bookmark corresponds to.  "New" may be a
+ * snapshot or a head.  The bookmark must be before new, in
+ * new's filesystem (or its origin) -- caller verifies this.
  *
  * The written space is calculated by considering two components:  First, we
  * ignore any freed space, and calculate the written as new's used space
  * minus old's used space.  Next, we add in the amount of space that was freed
- * between the two snapshots, thus reducing new's used space relative to old's.
- * Specifically, this is the space that was born before old->ds_creation_txg,
- * and freed before new (ie. on new's deadlist or a previous deadlist).
+ * between the two time points, thus reducing new's used space relative to
+ * old's. Specifically, this is the space that was born before
+ * zbm_creation_txg, and freed before new (ie. on new's deadlist or a
+ * previous deadlist).
  *
  * space freed                         [---------------------]
  * snapshots                       ---O-------O--------O-------O------
- *                                         oldsnap            new
+ *                                         bookmark           new
+ *
+ * Note, the bookmark's zbm_*_bytes_refd must be valid, but if the HAS_FBN
+ * flag is not set, we will calculate the freed_before_next based on the
+ * next snapshot's deadlist, rather than using zbm_*_freed_before_next_snap.
  */
-int
-dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new,
-    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
+static int
+dsl_dataset_space_written_impl(zfs_bookmark_phys_t *bmp,
+    dsl_dataset_t *new, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
 {
 	int err = 0;
-	uint64_t snapobj;
 	dsl_pool_t *dp = new->ds_dir->dd_pool;
 
 	ASSERT(dsl_pool_config_held(dp));
+	if (dsl_dataset_is_snapshot(new)) {
+		ASSERT3U(bmp->zbm_creation_txg, <,
+		    dsl_dataset_phys(new)->ds_creation_txg);
+	}
 
 	*usedp = 0;
 	*usedp += dsl_dataset_phys(new)->ds_referenced_bytes;
-	*usedp -= dsl_dataset_phys(oldsnap)->ds_referenced_bytes;
+	*usedp -= bmp->zbm_referenced_bytes_refd;
 
 	*compp = 0;
 	*compp += dsl_dataset_phys(new)->ds_compressed_bytes;
-	*compp -= dsl_dataset_phys(oldsnap)->ds_compressed_bytes;
+	*compp -= bmp->zbm_compressed_bytes_refd;
 
 	*uncompp = 0;
 	*uncompp += dsl_dataset_phys(new)->ds_uncompressed_bytes;
-	*uncompp -= dsl_dataset_phys(oldsnap)->ds_uncompressed_bytes;
+	*uncompp -= bmp->zbm_uncompressed_bytes_refd;
 
-	snapobj = new->ds_object;
-	while (snapobj != oldsnap->ds_object) {
-		dsl_dataset_t *snap;
-		uint64_t used, comp, uncomp;
+	dsl_dataset_t *snap = new;
 
-		if (snapobj == new->ds_object) {
-			snap = new;
-		} else {
-			err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &snap);
-			if (err != 0)
-				break;
-		}
+	while (dsl_dataset_phys(snap)->ds_prev_snap_txg >
+	    bmp->zbm_creation_txg) {
+		uint64_t used, comp, uncomp;
 
-		if (dsl_dataset_phys(snap)->ds_prev_snap_txg ==
-		    dsl_dataset_phys(oldsnap)->ds_creation_txg) {
-			/*
-			 * The blocks in the deadlist can not be born after
-			 * ds_prev_snap_txg, so get the whole deadlist space,
-			 * which is more efficient (especially for old-format
-			 * deadlists).  Unfortunately the deadlist code
-			 * doesn't have enough information to make this
-			 * optimization itself.
-			 */
-			dsl_deadlist_space(&snap->ds_deadlist,
-			    &used, &comp, &uncomp);
-		} else {
-			dsl_deadlist_space_range(&snap->ds_deadlist,
-			    0, dsl_dataset_phys(oldsnap)->ds_creation_txg,
-			    &used, &comp, &uncomp);
-		}
+		dsl_deadlist_space_range(&snap->ds_deadlist,
+		    0, bmp->zbm_creation_txg,
+		    &used, &comp, &uncomp);
 		*usedp += used;
 		*compp += comp;
 		*uncompp += uncomp;
 
-		/*
-		 * If we get to the beginning of the chain of snapshots
-		 * (ds_prev_snap_obj == 0) before oldsnap, then oldsnap
-		 * was not a snapshot of/before new.
-		 */
-		snapobj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
+		uint64_t snapobj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
 		if (snap != new)
 			dsl_dataset_rele(snap, FTAG);
-		if (snapobj == 0) {
-			err = SET_ERROR(EINVAL);
+		err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &snap);
+		if (err != 0)
-			break;
+			return (err);
-		}
+	}
 
+	/*
+	 * We might not have the FBN if we are calculating written from
+	 * a snapshot (because we didn't know the correct "next" snapshot
+	 * until now).
+	 */
+	if (bmp->zbm_flags & ZBM_FLAG_HAS_FBN) {
+		*usedp += bmp->zbm_referenced_freed_before_next_snap;
+		*compp += bmp->zbm_compressed_freed_before_next_snap;
+		*uncompp += bmp->zbm_uncompressed_freed_before_next_snap;
+	} else {
+		ASSERT3U(dsl_dataset_phys(snap)->ds_prev_snap_txg, ==,
+		    bmp->zbm_creation_txg);
+		uint64_t used, comp, uncomp;
+		dsl_deadlist_space(&snap->ds_deadlist, &used, &comp, &uncomp);
+		*usedp += used;
+		*compp += comp;
+		*uncompp += uncomp;
 	}
+	if (snap != new)
+		dsl_dataset_rele(snap, FTAG);
 	return (err);
 }
 
+/*
+ * Return (in *usedp) the amount of space written in new that was not
+ * present at the time the bookmark corresponds to.  New may be a
+ * snapshot or the head.  The bookmark must be before new, in new's
+ * filesystem (or its origin) -- caller verifies this.
+ */
+int
+dsl_dataset_space_written_bookmark(zfs_bookmark_phys_t *bmp,
+    dsl_dataset_t *new, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
+{
+	if (!(bmp->zbm_flags & ZBM_FLAG_HAS_FBN))
+		return (SET_ERROR(ENOTSUP));
+	return (dsl_dataset_space_written_impl(bmp, new,
+	    usedp, compp, uncompp));
+}
+
+/*
+ * Return (in *usedp) the amount of space written in new that is not
+ * present in oldsnap.  New may be a snapshot or the head.  Old must be
+ * a snapshot before new, in new's filesystem (or its origin).  If not then
+ * fail and return EINVAL.
+ */
+int
+dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new,
+    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
+{
+	if (!dsl_dataset_is_before(new, oldsnap, 0))
+		return (SET_ERROR(EINVAL));
+
+	zfs_bookmark_phys_t zbm = { 0 };
+	dsl_dataset_phys_t *dsp = dsl_dataset_phys(oldsnap);
+	zbm.zbm_guid = dsp->ds_guid;
+	zbm.zbm_creation_txg = dsp->ds_creation_txg;
+	zbm.zbm_creation_time = dsp->ds_creation_time;
+	zbm.zbm_referenced_bytes_refd = dsp->ds_referenced_bytes;
+	zbm.zbm_compressed_bytes_refd = dsp->ds_compressed_bytes;
+	zbm.zbm_uncompressed_bytes_refd = dsp->ds_uncompressed_bytes;
+
+	/*
+	 * If oldsnap is the origin (or origin's origin, ...) of new,
+	 * we can't easily calculate the effective FBN.  Therefore,
+	 * we do not set ZBM_FLAG_HAS_FBN, so that the _impl will calculate
+	 * it relative to the correct "next": the next snapshot towards "new",
+	 * rather than the next snapshot in oldsnap's dsl_dir.
+	 */
+	return (dsl_dataset_space_written_impl(&zbm, new,
+	    usedp, compp, uncompp));
+}
+
 /*
  * Return (in *usedp) the amount of space that will be reclaimed if firstsnap,
  * lastsnap, and all snapshots in between are deleted.
@@ -4327,16 +4507,26 @@ dsl_dataset_is_before(dsl_dataset_t *later, dsl_dataset_t *earlier,
 
 	if (later->ds_dir == earlier->ds_dir)
 		return (B_TRUE);
-	if (!dsl_dir_is_clone(later->ds_dir))
+
+	/*
+	 * We check dd_origin_obj explicitly here rather than using
+	 * dsl_dir_is_clone() so that we will return TRUE if "earlier"
+	 * is $ORIGIN@$ORIGIN.  dsl_dataset_space_written() depends on
+	 * this behavior.
+	 */
+	if (dsl_dir_phys(later->ds_dir)->dd_origin_obj == 0)
 		return (B_FALSE);
 
-	if (dsl_dir_phys(later->ds_dir)->dd_origin_obj == earlier->ds_object)
-		return (B_TRUE);
 	dsl_dataset_t *origin;
 	error = dsl_dataset_hold_obj(dp,
 	    dsl_dir_phys(later->ds_dir)->dd_origin_obj, FTAG, &origin);
 	if (error != 0)
 		return (B_FALSE);
+	if (dsl_dataset_phys(origin)->ds_creation_txg == earlier_txg &&
+	    origin->ds_dir == earlier->ds_dir) {
+		dsl_dataset_rele(origin, FTAG);
+		return (B_TRUE);
+	}
 	ret = dsl_dataset_is_before(origin, earlier, earlier_txg);
 	dsl_dataset_rele(origin, FTAG);
 	return (ret);
@@ -4453,6 +4643,26 @@ dsl_dataset_create_remap_deadlist(dsl_dataset_t *ds, dmu_tx_t *tx)
 	spa_feature_incr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
 }
 
+void
+dsl_dataset_activate_redaction(dsl_dataset_t *ds, uint64_t *redact_snaps,
+    uint64_t num_redact_snaps, dmu_tx_t *tx)
+{
+	uint64_t dsobj = ds->ds_object;
+	struct feature_type_uint64_array_arg *ftuaa =
+	    kmem_zalloc(sizeof (*ftuaa), KM_SLEEP);
+	ftuaa->length = (int64_t)num_redact_snaps;
+	if (num_redact_snaps > 0) {
+		ftuaa->array = kmem_alloc(num_redact_snaps * sizeof (uint64_t),
+		    KM_SLEEP);
+		bcopy(redact_snaps, ftuaa->array, num_redact_snaps *
+		    sizeof (uint64_t));
+	}
+	dsl_dataset_activate_feature(dsobj, SPA_FEATURE_REDACTED_DATASETS,
+	    ftuaa, tx);
+	ds->ds_feature[SPA_FEATURE_REDACTED_DATASETS] = ftuaa;
+}
+
+
 #if defined(_KERNEL)
 #if defined(_LP64)
 module_param(zfs_max_recordsize, int, 0644);
@@ -4463,6 +4673,10 @@ module_param(zfs_max_recordsize, int, 0444);
 MODULE_PARM_DESC(zfs_max_recordsize, "Max allowed record size");
 #endif
 
+module_param(zfs_allow_redacted_dataset_mount, int, 0644);
+MODULE_PARM_DESC(zfs_allow_redacted_dataset_mount,
+	"Allow mounting of redacted datasets");
+
 EXPORT_SYMBOL(dsl_dataset_hold);
 EXPORT_SYMBOL(dsl_dataset_hold_flags);
 EXPORT_SYMBOL(dsl_dataset_hold_obj);
-- 
cgit v1.2.3