summaryrefslogtreecommitdiff
path: root/uts/common/fs/zfs/sys
diff options
context:
space:
mode:
authorMartin Matuska <mm@FreeBSD.org>2012-07-18 08:12:04 +0000
committerMartin Matuska <mm@FreeBSD.org>2012-07-18 08:12:04 +0000
commitaf56e8c4b416d774961b41eee1eb349d657ebb8c (patch)
treee332d1e6089905f45302dedddb9967a87ade136a /uts/common/fs/zfs/sys
parent93a00b0821525e25814cd720fafd04d600811c28 (diff)
Notes
Diffstat (limited to 'uts/common/fs/zfs/sys')
-rw-r--r--uts/common/fs/zfs/sys/arc.h142
-rw-r--r--uts/common/fs/zfs/sys/bplist.h57
-rw-r--r--uts/common/fs/zfs/sys/bpobj.h91
-rw-r--r--uts/common/fs/zfs/sys/dbuf.h375
-rw-r--r--uts/common/fs/zfs/sys/ddt.h246
-rw-r--r--uts/common/fs/zfs/sys/dmu.h740
-rw-r--r--uts/common/fs/zfs/sys/dmu_impl.h272
-rw-r--r--uts/common/fs/zfs/sys/dmu_objset.h183
-rw-r--r--uts/common/fs/zfs/sys/dmu_traverse.h64
-rw-r--r--uts/common/fs/zfs/sys/dmu_tx.h148
-rw-r--r--uts/common/fs/zfs/sys/dmu_zfetch.h76
-rw-r--r--uts/common/fs/zfs/sys/dnode.h329
-rw-r--r--uts/common/fs/zfs/sys/dsl_dataset.h283
-rw-r--r--uts/common/fs/zfs/sys/dsl_deadlist.h87
-rw-r--r--uts/common/fs/zfs/sys/dsl_deleg.h78
-rw-r--r--uts/common/fs/zfs/sys/dsl_dir.h167
-rw-r--r--uts/common/fs/zfs/sys/dsl_pool.h151
-rw-r--r--uts/common/fs/zfs/sys/dsl_prop.h119
-rw-r--r--uts/common/fs/zfs/sys/dsl_scan.h108
-rw-r--r--uts/common/fs/zfs/sys/dsl_synctask.h79
-rw-r--r--uts/common/fs/zfs/sys/metaslab.h80
-rw-r--r--uts/common/fs/zfs/sys/metaslab_impl.h89
-rw-r--r--uts/common/fs/zfs/sys/refcount.h107
-rw-r--r--uts/common/fs/zfs/sys/rrwlock.h80
-rw-r--r--uts/common/fs/zfs/sys/sa.h170
-rw-r--r--uts/common/fs/zfs/sys/sa_impl.h287
-rw-r--r--uts/common/fs/zfs/sys/spa.h706
-rw-r--r--uts/common/fs/zfs/sys/spa_boot.h42
-rw-r--r--uts/common/fs/zfs/sys/spa_impl.h235
-rw-r--r--uts/common/fs/zfs/sys/space_map.h179
-rw-r--r--uts/common/fs/zfs/sys/txg.h131
-rw-r--r--uts/common/fs/zfs/sys/txg_impl.h75
-rw-r--r--uts/common/fs/zfs/sys/uberblock.h46
-rw-r--r--uts/common/fs/zfs/sys/uberblock_impl.h63
-rw-r--r--uts/common/fs/zfs/sys/unique.h59
-rw-r--r--uts/common/fs/zfs/sys/vdev.h161
-rw-r--r--uts/common/fs/zfs/sys/vdev_disk.h56
-rw-r--r--uts/common/fs/zfs/sys/vdev_file.h46
-rw-r--r--uts/common/fs/zfs/sys/vdev_impl.h322
-rw-r--r--uts/common/fs/zfs/sys/zap.h482
-rw-r--r--uts/common/fs/zfs/sys/zap_impl.h228
-rw-r--r--uts/common/fs/zfs/sys/zap_leaf.h245
-rw-r--r--uts/common/fs/zfs/sys/zfs_acl.h245
-rw-r--r--uts/common/fs/zfs/sys/zfs_context.h73
-rw-r--r--uts/common/fs/zfs/sys/zfs_ctldir.h73
-rw-r--r--uts/common/fs/zfs/sys/zfs_debug.h82
-rw-r--r--uts/common/fs/zfs/sys/zfs_dir.h74
-rw-r--r--uts/common/fs/zfs/sys/zfs_fuid.h131
-rw-r--r--uts/common/fs/zfs/sys/zfs_ioctl.h349
-rw-r--r--uts/common/fs/zfs/sys/zfs_onexit.h66
-rw-r--r--uts/common/fs/zfs/sys/zfs_rlock.h89
-rw-r--r--uts/common/fs/zfs/sys/zfs_sa.h143
-rw-r--r--uts/common/fs/zfs/sys/zfs_stat.h56
-rw-r--r--uts/common/fs/zfs/sys/zfs_vfsops.h159
-rw-r--r--uts/common/fs/zfs/sys/zfs_znode.h361
-rw-r--r--uts/common/fs/zfs/sys/zil.h428
-rw-r--r--uts/common/fs/zfs/sys/zil_impl.h147
-rw-r--r--uts/common/fs/zfs/sys/zio.h559
-rw-r--r--uts/common/fs/zfs/sys/zio_checksum.h75
-rw-r--r--uts/common/fs/zfs/sys/zio_compress.h84
-rw-r--r--uts/common/fs/zfs/sys/zio_impl.h175
-rw-r--r--uts/common/fs/zfs/sys/zrlock.h66
-rw-r--r--uts/common/fs/zfs/sys/zvol.h76
63 files changed, 11195 insertions, 0 deletions
diff --git a/uts/common/fs/zfs/sys/arc.h b/uts/common/fs/zfs/sys/arc.h
new file mode 100644
index 000000000000..8f189c62d31d
--- /dev/null
+++ b/uts/common/fs/zfs/sys/arc.h
@@ -0,0 +1,142 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _SYS_ARC_H
+#define _SYS_ARC_H
+
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/zio.h>
+#include <sys/dmu.h>
+#include <sys/spa.h>
+
+typedef struct arc_buf_hdr arc_buf_hdr_t;
+typedef struct arc_buf arc_buf_t;
+typedef void arc_done_func_t(zio_t *zio, arc_buf_t *buf, void *private);
+typedef int arc_evict_func_t(void *private);
+
+/* generic arc_done_func_t's which you can use */
+arc_done_func_t arc_bcopy_func;
+arc_done_func_t arc_getbuf_func;
+
+struct arc_buf {
+ arc_buf_hdr_t *b_hdr;
+ arc_buf_t *b_next;
+ kmutex_t b_evict_lock;
+ krwlock_t b_data_lock;
+ void *b_data;
+ arc_evict_func_t *b_efunc;
+ void *b_private;
+};
+
+typedef enum arc_buf_contents {
+ ARC_BUFC_DATA, /* buffer contains data */
+ ARC_BUFC_METADATA, /* buffer contains metadata */
+ ARC_BUFC_NUMTYPES
+} arc_buf_contents_t;
+/*
+ * These are the flags we pass into calls to the arc
+ */
+#define ARC_WAIT (1 << 1) /* perform I/O synchronously */
+#define ARC_NOWAIT (1 << 2) /* perform I/O asynchronously */
+#define ARC_PREFETCH (1 << 3) /* I/O is a prefetch */
+#define ARC_CACHED (1 << 4) /* I/O was already in cache */
+#define ARC_L2CACHE (1 << 5) /* cache in L2ARC */
+
+/*
+ * The following breakdowns of arc_size exist for kstat only.
+ */
+typedef enum arc_space_type {
+ ARC_SPACE_DATA,
+ ARC_SPACE_HDRS,
+ ARC_SPACE_L2HDRS,
+ ARC_SPACE_OTHER,
+ ARC_SPACE_NUMTYPES
+} arc_space_type_t;
+
+void arc_space_consume(uint64_t space, arc_space_type_t type);
+void arc_space_return(uint64_t space, arc_space_type_t type);
+void *arc_data_buf_alloc(uint64_t space);
+void arc_data_buf_free(void *buf, uint64_t space);
+arc_buf_t *arc_buf_alloc(spa_t *spa, int size, void *tag,
+ arc_buf_contents_t type);
+arc_buf_t *arc_loan_buf(spa_t *spa, int size);
+void arc_return_buf(arc_buf_t *buf, void *tag);
+void arc_loan_inuse_buf(arc_buf_t *buf, void *tag);
+void arc_buf_add_ref(arc_buf_t *buf, void *tag);
+int arc_buf_remove_ref(arc_buf_t *buf, void *tag);
+int arc_buf_size(arc_buf_t *buf);
+void arc_release(arc_buf_t *buf, void *tag);
+int arc_release_bp(arc_buf_t *buf, void *tag, blkptr_t *bp, spa_t *spa,
+ zbookmark_t *zb);
+int arc_released(arc_buf_t *buf);
+int arc_has_callback(arc_buf_t *buf);
+void arc_buf_freeze(arc_buf_t *buf);
+void arc_buf_thaw(arc_buf_t *buf);
+#ifdef ZFS_DEBUG
+int arc_referenced(arc_buf_t *buf);
+#endif
+
+int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_buf_t *pbuf,
+ arc_done_func_t *done, void *private, int priority, int zio_flags,
+ uint32_t *arc_flags, const zbookmark_t *zb);
+int arc_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bp,
+ arc_done_func_t *done, void *private, int priority, int flags,
+ uint32_t *arc_flags, const zbookmark_t *zb);
+zio_t *arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
+ blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp,
+ arc_done_func_t *ready, arc_done_func_t *done, void *private,
+ int priority, int zio_flags, const zbookmark_t *zb);
+
+void arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private);
+int arc_buf_evict(arc_buf_t *buf);
+
+void arc_flush(spa_t *spa);
+void arc_tempreserve_clear(uint64_t reserve);
+int arc_tempreserve_space(uint64_t reserve, uint64_t txg);
+
+void arc_init(void);
+void arc_fini(void);
+
+/*
+ * Level 2 ARC
+ */
+
+void l2arc_add_vdev(spa_t *spa, vdev_t *vd);
+void l2arc_remove_vdev(vdev_t *vd);
+boolean_t l2arc_vdev_present(vdev_t *vd);
+void l2arc_init(void);
+void l2arc_fini(void);
+void l2arc_start(void);
+void l2arc_stop(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ARC_H */
diff --git a/uts/common/fs/zfs/sys/bplist.h b/uts/common/fs/zfs/sys/bplist.h
new file mode 100644
index 000000000000..471be9047ec2
--- /dev/null
+++ b/uts/common/fs/zfs/sys/bplist.h
@@ -0,0 +1,57 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _SYS_BPLIST_H
+#define _SYS_BPLIST_H
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct bplist_entry {
+ blkptr_t bpe_blk;
+ list_node_t bpe_node;
+} bplist_entry_t;
+
+typedef struct bplist {
+ kmutex_t bpl_lock;
+ list_t bpl_list;
+} bplist_t;
+
+typedef int bplist_itor_t(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
+
+void bplist_create(bplist_t *bpl);
+void bplist_destroy(bplist_t *bpl);
+void bplist_append(bplist_t *bpl, const blkptr_t *bp);
+void bplist_iterate(bplist_t *bpl, bplist_itor_t *func,
+ void *arg, dmu_tx_t *tx);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_BPLIST_H */
diff --git a/uts/common/fs/zfs/sys/bpobj.h b/uts/common/fs/zfs/sys/bpobj.h
new file mode 100644
index 000000000000..3771a9541aa7
--- /dev/null
+++ b/uts/common/fs/zfs/sys/bpobj.h
@@ -0,0 +1,91 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _SYS_BPOBJ_H
+#define _SYS_BPOBJ_H
+
+#include <sys/dmu.h>
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/zio.h>
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct bpobj_phys {
+ /*
+ * This is the bonus buffer for the dead lists. The object's
+ * contents are an array of bpo_num_blkptrs blkptr_t's, representing
+ * a total of bpo_bytes physical space.
+ */
+ uint64_t bpo_num_blkptrs;
+ uint64_t bpo_bytes;
+ uint64_t bpo_comp;
+ uint64_t bpo_uncomp;
+ uint64_t bpo_subobjs;
+ uint64_t bpo_num_subobjs;
+} bpobj_phys_t;
+
+#define BPOBJ_SIZE_V0 (2 * sizeof (uint64_t))
+#define BPOBJ_SIZE_V1 (4 * sizeof (uint64_t))
+
+typedef struct bpobj {
+ kmutex_t bpo_lock;
+ objset_t *bpo_os;
+ uint64_t bpo_object;
+ int bpo_epb;
+ uint8_t bpo_havecomp;
+ uint8_t bpo_havesubobj;
+ bpobj_phys_t *bpo_phys;
+ dmu_buf_t *bpo_dbuf;
+ dmu_buf_t *bpo_cached_dbuf;
+} bpobj_t;
+
+typedef int bpobj_itor_t(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
+
+uint64_t bpobj_alloc(objset_t *mos, int blocksize, dmu_tx_t *tx);
+void bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx);
+
+int bpobj_open(bpobj_t *bpo, objset_t *mos, uint64_t object);
+void bpobj_close(bpobj_t *bpo);
+
+int bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx);
+int bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *, dmu_tx_t *);
+int bpobj_iterate_dbg(bpobj_t *bpo, uint64_t *itorp, blkptr_t *bp);
+
+void bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx);
+void bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx);
+
+int bpobj_space(bpobj_t *bpo,
+ uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
+int bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg,
+ uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_BPOBJ_H */
diff --git a/uts/common/fs/zfs/sys/dbuf.h b/uts/common/fs/zfs/sys/dbuf.h
new file mode 100644
index 000000000000..cf1bbc030f45
--- /dev/null
+++ b/uts/common/fs/zfs/sys/dbuf.h
@@ -0,0 +1,375 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _SYS_DBUF_H
+#define _SYS_DBUF_H
+
+#include <sys/dmu.h>
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/zio.h>
+#include <sys/arc.h>
+#include <sys/zfs_context.h>
+#include <sys/refcount.h>
+#include <sys/zrlock.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define IN_DMU_SYNC 2
+
+/*
+ * define flags for dbuf_read
+ */
+
+#define DB_RF_MUST_SUCCEED (1 << 0)
+#define DB_RF_CANFAIL (1 << 1)
+#define DB_RF_HAVESTRUCT (1 << 2)
+#define DB_RF_NOPREFETCH (1 << 3)
+#define DB_RF_NEVERWAIT (1 << 4)
+#define DB_RF_CACHED (1 << 5)
+
+/*
+ * The simplified state transition diagram for dbufs looks like:
+ *
+ *              +----> READ ----+
+ *              |               |
+ *              |               V
+ *  (alloc)-->UNCACHED       CACHED-->EVICTING-->(free)
+ *              |               ^        ^
+ *              |               |        |
+ *              +----> FILL ----+        |
+ *              |                        |
+ *              |                        |
+ *              +--------> NOFILL -------+
+ */
+typedef enum dbuf_states {
+ DB_UNCACHED,
+ DB_FILL,
+ DB_NOFILL,
+ DB_READ,
+ DB_CACHED,
+ DB_EVICTING
+} dbuf_states_t;
+
+struct dnode;
+struct dmu_tx;
+
+/*
+ * level = 0 means the user data
+ * level = 1 means the single indirect block
+ * etc.
+ */
+
+struct dmu_buf_impl;
+
+typedef enum override_states {
+ DR_NOT_OVERRIDDEN,
+ DR_IN_DMU_SYNC,
+ DR_OVERRIDDEN
+} override_states_t;
+
+typedef struct dbuf_dirty_record {
+ /* link on our parent's dirty list */
+ list_node_t dr_dirty_node;
+
+ /* transaction group this data will sync in */
+ uint64_t dr_txg;
+
+ /* zio of outstanding write IO */
+ zio_t *dr_zio;
+
+ /* pointer back to our dbuf */
+ struct dmu_buf_impl *dr_dbuf;
+
+ /* pointer to next dirty record */
+ struct dbuf_dirty_record *dr_next;
+
+ /* pointer to parent dirty record */
+ struct dbuf_dirty_record *dr_parent;
+
+ union dirty_types {
+ struct dirty_indirect {
+
+ /* protect access to list */
+ kmutex_t dr_mtx;
+
+ /* Our list of dirty children */
+ list_t dr_children;
+ } di;
+ struct dirty_leaf {
+
+ /*
+ * dr_data is set when we dirty the buffer
+ * so that we can retain the pointer even if it
+ * gets COW'd in a subsequent transaction group.
+ */
+ arc_buf_t *dr_data;
+ blkptr_t dr_overridden_by;
+ override_states_t dr_override_state;
+ uint8_t dr_copies;
+ } dl;
+ } dt;
+} dbuf_dirty_record_t;
+
+typedef struct dmu_buf_impl {
+ /*
+ * The following members are immutable, with the exception of
+ * db.db_data, which is protected by db_mtx.
+ */
+
+ /* the publicly visible structure */
+ dmu_buf_t db;
+
+ /* the objset we belong to */
+ struct objset *db_objset;
+
+ /*
+ * handle to safely access the dnode we belong to (NULL when evicted)
+ */
+ struct dnode_handle *db_dnode_handle;
+
+ /*
+ * our parent buffer; if the dnode points to us directly,
+ * db_parent == db_dnode_handle->dnh_dnode->dn_dbuf
+ * only accessed by sync thread ???
+ * (NULL when evicted)
+ * May change from NULL to non-NULL under the protection of db_mtx
+ * (see dbuf_check_blkptr())
+ */
+ struct dmu_buf_impl *db_parent;
+
+ /*
+ * link for hash table of all dmu_buf_impl_t's
+ */
+ struct dmu_buf_impl *db_hash_next;
+
+ /* our block number */
+ uint64_t db_blkid;
+
+ /*
+ * Pointer to the blkptr_t which points to us. May be NULL if we
+ * don't have one yet. (NULL when evicted)
+ */
+ blkptr_t *db_blkptr;
+
+ /*
+ * Our indirection level. Data buffers have db_level==0.
+ * Indirect buffers which point to data buffers have
+ * db_level==1. etc. Buffers which contain dnodes have
+ * db_level==0, since the dnodes are stored in a file.
+ */
+ uint8_t db_level;
+
+ /* db_mtx protects the members below */
+ kmutex_t db_mtx;
+
+ /*
+ * Current state of the buffer
+ */
+ dbuf_states_t db_state;
+
+ /*
+ * Refcount accessed by dmu_buf_{hold,rele}.
+ * If nonzero, the buffer can't be destroyed.
+ * Protected by db_mtx.
+ */
+ refcount_t db_holds;
+
+ /* buffer holding our data */
+ arc_buf_t *db_buf;
+
+ kcondvar_t db_changed;
+ dbuf_dirty_record_t *db_data_pending;
+
+ /* pointer to most recent dirty record for this buffer */
+ dbuf_dirty_record_t *db_last_dirty;
+
+ /*
+ * Our link on the owner dnode's dn_dbufs list.
+ * Protected by its dn_dbufs_mtx.
+ */
+ list_node_t db_link;
+
+ /* Data which is unique to data (leaf) blocks: */
+
+ /* stuff we store for the user (see dmu_buf_set_user) */
+ void *db_user_ptr;
+ void **db_user_data_ptr_ptr;
+ dmu_buf_evict_func_t *db_evict_func;
+
+ uint8_t db_immediate_evict;
+ uint8_t db_freed_in_flight;
+
+ uint8_t db_dirtycnt;
+} dmu_buf_impl_t;
+
+/* Note: the dbuf hash table is exposed only for the mdb module */
+#define DBUF_MUTEXES 256 /* power of 2: DBUF_HASH_MUTEX masks with it - 1 */
+#define DBUF_HASH_MUTEX(h, idx) (&(h)->hash_mutexes[(idx) & (DBUF_MUTEXES-1)])
+typedef struct dbuf_hash_table {
+ uint64_t hash_table_mask;
+ dmu_buf_impl_t **hash_table; /* presumably chained by db_hash_next; confirm */
+ kmutex_t hash_mutexes[DBUF_MUTEXES]; /* picked via DBUF_HASH_MUTEX() */
+} dbuf_hash_table_t;
+
+
+uint64_t dbuf_whichblock(struct dnode *di, uint64_t offset);
+
+dmu_buf_impl_t *dbuf_create_tlib(struct dnode *dn, char *data);
+void dbuf_create_bonus(struct dnode *dn);
+int dbuf_spill_set_blksz(dmu_buf_t *db, uint64_t blksz, dmu_tx_t *tx);
+void dbuf_spill_hold(struct dnode *dn, dmu_buf_impl_t **dbp, void *tag);
+
+void dbuf_rm_spill(struct dnode *dn, dmu_tx_t *tx);
+
+dmu_buf_impl_t *dbuf_hold(struct dnode *dn, uint64_t blkid, void *tag);
+dmu_buf_impl_t *dbuf_hold_level(struct dnode *dn, int level, uint64_t blkid,
+ void *tag);
+int dbuf_hold_impl(struct dnode *dn, uint8_t level, uint64_t blkid, int create,
+ void *tag, dmu_buf_impl_t **dbp);
+
+void dbuf_prefetch(struct dnode *dn, uint64_t blkid);
+
+void dbuf_add_ref(dmu_buf_impl_t *db, void *tag);
+uint64_t dbuf_refcount(dmu_buf_impl_t *db);
+
+void dbuf_rele(dmu_buf_impl_t *db, void *tag);
+void dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag);
+
+dmu_buf_impl_t *dbuf_find(struct dnode *dn, uint8_t level, uint64_t blkid);
+
+int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags);
+void dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
+void dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx);
+void dmu_buf_will_not_fill(dmu_buf_t *db, dmu_tx_t *tx);
+void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx);
+void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx);
+void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx);
+dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
+arc_buf_t *dbuf_loan_arcbuf(dmu_buf_impl_t *db);
+
+void dbuf_clear(dmu_buf_impl_t *db);
+void dbuf_evict(dmu_buf_impl_t *db);
+
+void dbuf_setdirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
+void dbuf_unoverride(dbuf_dirty_record_t *dr);
+void dbuf_sync_list(list_t *list, dmu_tx_t *tx);
+void dbuf_release_bp(dmu_buf_impl_t *db);
+
+void dbuf_free_range(struct dnode *dn, uint64_t start, uint64_t end,
+ struct dmu_tx *);
+
+void dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx);
+
+#define DB_DNODE(_db) ((_db)->db_dnode_handle->dnh_dnode)
+#define DB_DNODE_LOCK(_db) ((_db)->db_dnode_handle->dnh_zrlock)
+#define DB_DNODE_ENTER(_db) (zrl_add(&DB_DNODE_LOCK(_db)))
+#define DB_DNODE_EXIT(_db) (zrl_remove(&DB_DNODE_LOCK(_db)))
+#define DB_DNODE_HELD(_db) (!zrl_is_zero(&DB_DNODE_LOCK(_db)))
+#define DB_GET_SPA(_spa_p, _db) { \
+ dnode_t *__dn; \
+ DB_DNODE_ENTER(_db); \
+ __dn = DB_DNODE(_db); \
+ *(_spa_p) = __dn->dn_objset->os_spa; \
+ DB_DNODE_EXIT(_db); \
+}
+#define DB_GET_OBJSET(_os_p, _db) { \
+ dnode_t *__dn; \
+ DB_DNODE_ENTER(_db); \
+ __dn = DB_DNODE(_db); \
+ *(_os_p) = __dn->dn_objset; \
+ DB_DNODE_EXIT(_db); \
+}
+
+void dbuf_init(void);
+void dbuf_fini(void);
+
+boolean_t dbuf_is_metadata(dmu_buf_impl_t *db);
+
+#define DBUF_IS_METADATA(_db) \
+ (dbuf_is_metadata(_db))
+
+#define DBUF_GET_BUFC_TYPE(_db) \
+ (DBUF_IS_METADATA(_db) ? ARC_BUFC_METADATA : ARC_BUFC_DATA)
+
+#define DBUF_IS_CACHEABLE(_db) \
+ ((_db)->db_objset->os_primary_cache == ZFS_CACHE_ALL || \
+ (DBUF_IS_METADATA(_db) && \
+ ((_db)->db_objset->os_primary_cache == ZFS_CACHE_METADATA)))
+
+#define DBUF_IS_L2CACHEABLE(_db) \
+ ((_db)->db_objset->os_secondary_cache == ZFS_CACHE_ALL || \
+ (DBUF_IS_METADATA(_db) && \
+ ((_db)->db_objset->os_secondary_cache == ZFS_CACHE_METADATA)))
+
+#ifdef ZFS_DEBUG
+
+/*
+ * There should be a ## between the string literal and fmt, to make it
+ * clear that we're joining two strings together, but gcc does not
+ * support that preprocessor token.
+ */
+#define dprintf_dbuf(dbuf, fmt, ...) do { \
+ if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
+ char __db_buf[32]; \
+ uint64_t __db_obj = (dbuf)->db.db_object; \
+ if (__db_obj == DMU_META_DNODE_OBJECT) \
+ (void) strcpy(__db_buf, "mdn"); \
+ else \
+ (void) snprintf(__db_buf, sizeof (__db_buf), "%lld", \
+ (u_longlong_t)__db_obj); \
+ dprintf_ds((dbuf)->db_objset->os_dsl_dataset, \
+ "obj=%s lvl=%u blkid=%lld " fmt, \
+ __db_buf, (dbuf)->db_level, \
+ (u_longlong_t)(dbuf)->db_blkid, __VA_ARGS__); \
+ } \
+_NOTE(CONSTCOND) } while (0)
+
+#define dprintf_dbuf_bp(db, bp, fmt, ...) do { \
+ if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
+ char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP); \
+ sprintf_blkptr(__blkbuf, bp); \
+ dprintf_dbuf(db, fmt " %s\n", __VA_ARGS__, __blkbuf); \
+ kmem_free(__blkbuf, BP_SPRINTF_LEN); \
+ } \
+_NOTE(CONSTCOND) } while (0)
+
+#define DBUF_VERIFY(db) dbuf_verify(db)
+
+#else
+
+#define dprintf_dbuf(db, fmt, ...)
+#define dprintf_dbuf_bp(db, bp, fmt, ...)
+#define DBUF_VERIFY(db)
+
+#endif
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DBUF_H */
diff --git a/uts/common/fs/zfs/sys/ddt.h b/uts/common/fs/zfs/sys/ddt.h
new file mode 100644
index 000000000000..9724d6ecebb0
--- /dev/null
+++ b/uts/common/fs/zfs/sys/ddt.h
@@ -0,0 +1,246 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _SYS_DDT_H
+#define _SYS_DDT_H
+
+#include <sys/sysmacros.h>
+#include <sys/types.h>
+#include <sys/fs/zfs.h>
+#include <sys/zio.h>
+#include <sys/dmu.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * On-disk DDT formats, in the desired search order (newest version first).
+ */
+enum ddt_type {
+ DDT_TYPE_ZAP = 0,
+ DDT_TYPES
+};
+
+/*
+ * DDT classes, in the desired search order (highest replication level first).
+ */
+enum ddt_class {
+ DDT_CLASS_DITTO = 0,
+ DDT_CLASS_DUPLICATE,
+ DDT_CLASS_UNIQUE,
+ DDT_CLASSES
+};
+
+#define DDT_TYPE_CURRENT 0
+
+#define DDT_COMPRESS_BYTEORDER_MASK 0x80
+#define DDT_COMPRESS_FUNCTION_MASK 0x7f
+
+/*
+ * On-disk ddt entry: key (name) and physical storage (value).
+ */
+typedef struct ddt_key {
+ zio_cksum_t ddk_cksum; /* 256-bit block checksum */
+ uint64_t ddk_prop; /* LSIZE, PSIZE, compression */
+} ddt_key_t;
+
+/*
+ * ddk_prop layout:
+ *
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * |   0   |   0   |   0   | comp  |     PSIZE     |     LSIZE     |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ */
+#define DDK_GET_LSIZE(ddk) \
+ BF64_GET_SB((ddk)->ddk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1)
+#define DDK_SET_LSIZE(ddk, x) \
+ BF64_SET_SB((ddk)->ddk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1, x)
+
+#define DDK_GET_PSIZE(ddk) \
+ BF64_GET_SB((ddk)->ddk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1)
+#define DDK_SET_PSIZE(ddk, x) \
+ BF64_SET_SB((ddk)->ddk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1, x)
+
+#define DDK_GET_COMPRESS(ddk) BF64_GET((ddk)->ddk_prop, 32, 8)
+#define DDK_SET_COMPRESS(ddk, x) BF64_SET((ddk)->ddk_prop, 32, 8, x)
+
+#define DDT_KEY_WORDS (sizeof (ddt_key_t) / sizeof (uint64_t))
+
+typedef struct ddt_phys {
+ dva_t ddp_dva[SPA_DVAS_PER_BP];
+ uint64_t ddp_refcnt;
+ uint64_t ddp_phys_birth;
+} ddt_phys_t;
+
+enum ddt_phys_type {
+ DDT_PHYS_DITTO = 0,
+ DDT_PHYS_SINGLE = 1,
+ DDT_PHYS_DOUBLE = 2,
+ DDT_PHYS_TRIPLE = 3,
+ DDT_PHYS_TYPES
+};
+
+/*
+ * In-core ddt entry
+ */
+struct ddt_entry {
+ ddt_key_t dde_key;
+ ddt_phys_t dde_phys[DDT_PHYS_TYPES];
+ zio_t *dde_lead_zio[DDT_PHYS_TYPES];
+ void *dde_repair_data;
+ enum ddt_type dde_type;
+ enum ddt_class dde_class;
+ uint8_t dde_loading;
+ uint8_t dde_loaded;
+ kcondvar_t dde_cv;
+ avl_node_t dde_node;
+};
+
+/*
+ * In-core ddt
+ */
+struct ddt {
+ kmutex_t ddt_lock;
+ avl_tree_t ddt_tree;
+ avl_tree_t ddt_repair_tree;
+ enum zio_checksum ddt_checksum;
+ spa_t *ddt_spa;
+ objset_t *ddt_os;
+ uint64_t ddt_stat_object;
+ uint64_t ddt_object[DDT_TYPES][DDT_CLASSES];
+ ddt_histogram_t ddt_histogram[DDT_TYPES][DDT_CLASSES];
+ ddt_histogram_t ddt_histogram_cache[DDT_TYPES][DDT_CLASSES];
+ ddt_object_t ddt_object_stats[DDT_TYPES][DDT_CLASSES];
+ avl_node_t ddt_node;
+};
+
+/*
+ * In-core and on-disk bookmark for DDT walks
+ */
+typedef struct ddt_bookmark {
+ uint64_t ddb_class;
+ uint64_t ddb_type;
+ uint64_t ddb_checksum;
+ uint64_t ddb_cursor;
+} ddt_bookmark_t;
+
+/*
+ * Ops vector to access a specific DDT object type.
+ */
+typedef struct ddt_ops {
+ char ddt_op_name[32];
+ int (*ddt_op_create)(objset_t *os, uint64_t *object, dmu_tx_t *tx,
+ boolean_t prehash);
+ int (*ddt_op_destroy)(objset_t *os, uint64_t object, dmu_tx_t *tx);
+ int (*ddt_op_lookup)(objset_t *os, uint64_t object, ddt_entry_t *dde);
+ void (*ddt_op_prefetch)(objset_t *os, uint64_t object,
+ ddt_entry_t *dde);
+ int (*ddt_op_update)(objset_t *os, uint64_t object, ddt_entry_t *dde,
+ dmu_tx_t *tx);
+ int (*ddt_op_remove)(objset_t *os, uint64_t object, ddt_entry_t *dde,
+ dmu_tx_t *tx);
+ int (*ddt_op_walk)(objset_t *os, uint64_t object, ddt_entry_t *dde,
+ uint64_t *walk);
+ uint64_t (*ddt_op_count)(objset_t *os, uint64_t object);
+} ddt_ops_t;
+
+#define DDT_NAMELEN 80
+
+extern void ddt_object_name(ddt_t *ddt, enum ddt_type type,
+ enum ddt_class class, char *name);
+extern int ddt_object_walk(ddt_t *ddt, enum ddt_type type,
+ enum ddt_class class, uint64_t *walk, ddt_entry_t *dde);
+extern uint64_t ddt_object_count(ddt_t *ddt, enum ddt_type type,
+ enum ddt_class class);
+extern int ddt_object_info(ddt_t *ddt, enum ddt_type type,
+ enum ddt_class class, dmu_object_info_t *);
+extern boolean_t ddt_object_exists(ddt_t *ddt, enum ddt_type type,
+ enum ddt_class class);
+
+extern void ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp,
+ uint64_t txg);
+extern void ddt_bp_create(enum zio_checksum checksum, const ddt_key_t *ddk,
+ const ddt_phys_t *ddp, blkptr_t *bp);
+
+extern void ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp);
+
+extern void ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp);
+extern void ddt_phys_clear(ddt_phys_t *ddp);
+extern void ddt_phys_addref(ddt_phys_t *ddp);
+extern void ddt_phys_decref(ddt_phys_t *ddp);
+extern void ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp,
+ uint64_t txg);
+extern ddt_phys_t *ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp);
+extern uint64_t ddt_phys_total_refcnt(const ddt_entry_t *dde);
+
+extern void ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg);
+
+extern void ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src);
+extern void ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh);
+extern boolean_t ddt_histogram_empty(const ddt_histogram_t *ddh);
+extern void ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo);
+extern void ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh);
+extern void ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total);
+
+extern uint64_t ddt_get_dedup_dspace(spa_t *spa);
+extern uint64_t ddt_get_pool_dedup_ratio(spa_t *spa);
+
+extern int ddt_ditto_copies_needed(ddt_t *ddt, ddt_entry_t *dde,
+ ddt_phys_t *ddp_willref);
+extern int ddt_ditto_copies_present(ddt_entry_t *dde);
+
+extern size_t ddt_compress(void *src, uchar_t *dst, size_t s_len, size_t d_len);
+extern void ddt_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len);
+
+extern ddt_t *ddt_select(spa_t *spa, const blkptr_t *bp);
+extern void ddt_enter(ddt_t *ddt);
+extern void ddt_exit(ddt_t *ddt);
+extern ddt_entry_t *ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add);
+extern void ddt_prefetch(spa_t *spa, const blkptr_t *bp);
+extern void ddt_remove(ddt_t *ddt, ddt_entry_t *dde);
+
+extern boolean_t ddt_class_contains(spa_t *spa, enum ddt_class max_class,
+ const blkptr_t *bp);
+
+extern ddt_entry_t *ddt_repair_start(ddt_t *ddt, const blkptr_t *bp);
+extern void ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde);
+
+extern int ddt_entry_compare(const void *x1, const void *x2);
+
+extern void ddt_create(spa_t *spa);
+extern int ddt_load(spa_t *spa);
+extern void ddt_unload(spa_t *spa);
+extern void ddt_sync(spa_t *spa, uint64_t txg);
+extern int ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde);
+extern int ddt_object_update(ddt_t *ddt, enum ddt_type type,
+ enum ddt_class class, ddt_entry_t *dde, dmu_tx_t *tx);
+
+extern const ddt_ops_t ddt_zap_ops;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DDT_H */
diff --git a/uts/common/fs/zfs/sys/dmu.h b/uts/common/fs/zfs/sys/dmu.h
new file mode 100644
index 000000000000..07f5949ebfea
--- /dev/null
+++ b/uts/common/fs/zfs/sys/dmu.h
@@ -0,0 +1,740 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+/* Portions Copyright 2010 Robert Milkowski */
+
+#ifndef _SYS_DMU_H
+#define _SYS_DMU_H
+
+/*
+ * This file describes the interface that the DMU provides for its
+ * consumers.
+ *
+ * The DMU also interacts with the SPA. That interface is described in
+ * dmu_spa.h.
+ */
+
+#include <sys/inttypes.h>
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/cred.h>
+#include <sys/time.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct uio;
+struct xuio;
+struct page;
+struct vnode;
+struct spa;
+struct zilog;
+struct zio;
+struct blkptr;
+struct zap_cursor;
+struct dsl_dataset;
+struct dsl_pool;
+struct dnode;
+struct drr_begin;
+struct drr_end;
+struct zbookmark;
+struct spa;
+struct nvlist;
+struct arc_buf;
+struct zio_prop;
+struct sa_handle;
+
+typedef struct objset objset_t;
+typedef struct dmu_tx dmu_tx_t;
+typedef struct dsl_dir dsl_dir_t;
+
+typedef enum dmu_object_type {
+ DMU_OT_NONE,
+ /* general: */
+ DMU_OT_OBJECT_DIRECTORY, /* ZAP */
+ DMU_OT_OBJECT_ARRAY, /* UINT64 */
+ DMU_OT_PACKED_NVLIST, /* UINT8 (XDR by nvlist_pack/unpack) */
+ DMU_OT_PACKED_NVLIST_SIZE, /* UINT64 */
+ DMU_OT_BPOBJ, /* UINT64 */
+ DMU_OT_BPOBJ_HDR, /* UINT64 */
+ /* spa: */
+ DMU_OT_SPACE_MAP_HEADER, /* UINT64 */
+ DMU_OT_SPACE_MAP, /* UINT64 */
+ /* zil: */
+ DMU_OT_INTENT_LOG, /* UINT64 */
+ /* dmu: */
+ DMU_OT_DNODE, /* DNODE */
+ DMU_OT_OBJSET, /* OBJSET */
+ /* dsl: */
+ DMU_OT_DSL_DIR, /* UINT64 */
+ DMU_OT_DSL_DIR_CHILD_MAP, /* ZAP */
+ DMU_OT_DSL_DS_SNAP_MAP, /* ZAP */
+ DMU_OT_DSL_PROPS, /* ZAP */
+ DMU_OT_DSL_DATASET, /* UINT64 */
+ /* zpl: */
+ DMU_OT_ZNODE, /* ZNODE */
+ DMU_OT_OLDACL, /* Old ACL */
+ DMU_OT_PLAIN_FILE_CONTENTS, /* UINT8 */
+ DMU_OT_DIRECTORY_CONTENTS, /* ZAP */
+ DMU_OT_MASTER_NODE, /* ZAP */
+ DMU_OT_UNLINKED_SET, /* ZAP */
+ /* zvol: */
+ DMU_OT_ZVOL, /* UINT8 */
+ DMU_OT_ZVOL_PROP, /* ZAP */
+ /* other; for testing only! */
+ DMU_OT_PLAIN_OTHER, /* UINT8 */
+ DMU_OT_UINT64_OTHER, /* UINT64 */
+ DMU_OT_ZAP_OTHER, /* ZAP */
+ /* new object types: */
+ DMU_OT_ERROR_LOG, /* ZAP */
+ DMU_OT_SPA_HISTORY, /* UINT8 */
+ DMU_OT_SPA_HISTORY_OFFSETS, /* spa_his_phys_t */
+ DMU_OT_POOL_PROPS, /* ZAP */
+ DMU_OT_DSL_PERMS, /* ZAP */
+ DMU_OT_ACL, /* ACL */
+ DMU_OT_SYSACL, /* SYSACL */
+ DMU_OT_FUID, /* FUID table (Packed NVLIST UINT8) */
+ DMU_OT_FUID_SIZE, /* FUID table size UINT64 */
+ DMU_OT_NEXT_CLONES, /* ZAP */
+ DMU_OT_SCAN_QUEUE, /* ZAP */
+ DMU_OT_USERGROUP_USED, /* ZAP */
+ DMU_OT_USERGROUP_QUOTA, /* ZAP */
+ DMU_OT_USERREFS, /* ZAP */
+ DMU_OT_DDT_ZAP, /* ZAP */
+ DMU_OT_DDT_STATS, /* ZAP */
+ DMU_OT_SA, /* System attr */
+ DMU_OT_SA_MASTER_NODE, /* ZAP */
+ DMU_OT_SA_ATTR_REGISTRATION, /* ZAP */
+ DMU_OT_SA_ATTR_LAYOUTS, /* ZAP */
+ DMU_OT_SCAN_XLATE, /* ZAP */
+ DMU_OT_DEDUP, /* fake dedup BP from ddt_bp_create() */
+ DMU_OT_DEADLIST, /* ZAP */
+ DMU_OT_DEADLIST_HDR, /* UINT64 */
+ DMU_OT_DSL_CLONES, /* ZAP */
+ DMU_OT_BPOBJ_SUBOBJ, /* UINT64 */
+ DMU_OT_NUMTYPES
+} dmu_object_type_t;
+
+typedef enum dmu_objset_type {
+ DMU_OST_NONE,
+ DMU_OST_META,
+ DMU_OST_ZFS,
+ DMU_OST_ZVOL,
+ DMU_OST_OTHER, /* For testing only! */
+ DMU_OST_ANY, /* Be careful! */
+ DMU_OST_NUMTYPES
+} dmu_objset_type_t;
+
+void byteswap_uint64_array(void *buf, size_t size);
+void byteswap_uint32_array(void *buf, size_t size);
+void byteswap_uint16_array(void *buf, size_t size);
+void byteswap_uint8_array(void *buf, size_t size);
+void zap_byteswap(void *buf, size_t size);
+void zfs_oldacl_byteswap(void *buf, size_t size);
+void zfs_acl_byteswap(void *buf, size_t size);
+void zfs_znode_byteswap(void *buf, size_t size);
+
+#define DS_FIND_SNAPSHOTS (1<<0)
+#define DS_FIND_CHILDREN (1<<1)
+
+/*
+ * The maximum number of bytes that can be accessed as part of one
+ * operation, including metadata.
+ */
+#define DMU_MAX_ACCESS (10<<20) /* 10MB */
+#define DMU_MAX_DELETEBLKCNT (20480) /* ~5MB of indirect blocks */
+
+#define DMU_USERUSED_OBJECT (-1ULL)
+#define DMU_GROUPUSED_OBJECT (-2ULL)
+#define DMU_DEADLIST_OBJECT (-3ULL)
+
+/*
+ * artificial blkids for bonus buffer and spill blocks
+ */
+#define DMU_BONUS_BLKID (-1ULL)
+#define DMU_SPILL_BLKID (-2ULL)
+/*
+ * Public routines to create, destroy, open, and close objsets.
+ */
+int dmu_objset_hold(const char *name, void *tag, objset_t **osp);
+int dmu_objset_own(const char *name, dmu_objset_type_t type,
+ boolean_t readonly, void *tag, objset_t **osp);
+void dmu_objset_rele(objset_t *os, void *tag);
+void dmu_objset_disown(objset_t *os, void *tag);
+int dmu_objset_open_ds(struct dsl_dataset *ds, objset_t **osp);
+
+int dmu_objset_evict_dbufs(objset_t *os);
+int dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags,
+ void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg);
+int dmu_objset_clone(const char *name, struct dsl_dataset *clone_origin,
+ uint64_t flags);
+int dmu_objset_destroy(const char *name, boolean_t defer);
+int dmu_snapshots_destroy(char *fsname, char *snapname, boolean_t defer);
+int dmu_objset_snapshot(char *fsname, char *snapname, char *tag,
+ struct nvlist *props, boolean_t recursive, boolean_t temporary, int fd);
+int dmu_objset_rename(const char *name, const char *newname,
+ boolean_t recursive);
+int dmu_objset_find(char *name, int func(const char *, void *), void *arg,
+ int flags);
+void dmu_objset_byteswap(void *buf, size_t size);
+
+typedef struct dmu_buf {
+ uint64_t db_object; /* object that this buffer is part of */
+ uint64_t db_offset; /* byte offset in this object */
+ uint64_t db_size; /* size of buffer in bytes */
+ void *db_data; /* data in buffer */
+} dmu_buf_t;
+
+typedef void dmu_buf_evict_func_t(struct dmu_buf *db, void *user_ptr);
+
+/*
+ * The names of zap entries in the DIRECTORY_OBJECT of the MOS.
+ */
+#define DMU_POOL_DIRECTORY_OBJECT 1
+#define DMU_POOL_CONFIG "config"
+#define DMU_POOL_ROOT_DATASET "root_dataset"
+#define DMU_POOL_SYNC_BPOBJ "sync_bplist"
+#define DMU_POOL_ERRLOG_SCRUB "errlog_scrub"
+#define DMU_POOL_ERRLOG_LAST "errlog_last"
+#define DMU_POOL_SPARES "spares"
+#define DMU_POOL_DEFLATE "deflate"
+#define DMU_POOL_HISTORY "history"
+#define DMU_POOL_PROPS "pool_props"
+#define DMU_POOL_L2CACHE "l2cache"
+#define DMU_POOL_TMP_USERREFS "tmp_userrefs"
+#define DMU_POOL_DDT "DDT-%s-%s-%s"
+#define DMU_POOL_DDT_STATS "DDT-statistics"
+#define DMU_POOL_CREATION_VERSION "creation_version"
+#define DMU_POOL_SCAN "scan"
+#define DMU_POOL_FREE_BPOBJ "free_bpobj"
+
+/*
+ * Allocate an object from this objset. The range of object numbers
+ * available is (0, DN_MAX_OBJECT). Object 0 is the meta-dnode.
+ *
+ * The transaction must be assigned to a txg. The newly allocated
+ * object will be "held" in the transaction (ie. you can modify the
+ * newly allocated object in this transaction).
+ *
+ * dmu_object_alloc() chooses an object and returns its object number.
+ *
+ * dmu_object_claim() allocates a specific object number. If that
+ * number is already allocated, it fails and returns EEXIST.
+ *
+ * Return 0 on success, or ENOSPC or EEXIST as specified above.
+ */
+uint64_t dmu_object_alloc(objset_t *os, dmu_object_type_t ot,
+ int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx);
+int dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
+ int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx);
+int dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
+ int blocksize, dmu_object_type_t bonustype, int bonuslen);
+
+/*
+ * Free an object from this objset.
+ *
+ * The object's data will be freed as well (ie. you don't need to call
+ * dmu_free(object, 0, -1, tx)).
+ *
+ * The object need not be held in the transaction.
+ *
+ * If there are any holds on this object's buffers (via dmu_buf_hold()),
+ * or tx holds on the object (via dmu_tx_hold_object()), you can not
+ * free it; it fails and returns EBUSY.
+ *
+ * If the object is not allocated, it fails and returns ENOENT.
+ *
+ * Return 0 on success, or EBUSY or ENOENT as specified above.
+ */
+int dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx);
+
+/*
+ * Find the next allocated or free object.
+ *
+ * The objectp parameter is in-out. It will be updated to be the next
+ * object which is allocated. Ignore objects which have not been
+ * modified since txg.
+ *
+ * XXX Can only be called on an objset with no dirty data.
+ *
+ * Returns 0 on success, or ENOENT if there are no more objects.
+ */
+int dmu_object_next(objset_t *os, uint64_t *objectp,
+ boolean_t hole, uint64_t txg);
+
+/*
+ * Set the data blocksize for an object.
+ *
+ * The object cannot have any blocks allocated beyond the first. If
+ * the first block is allocated already, the new size must be greater
+ * than the current block size. If these conditions are not met,
+ * ENOTSUP will be returned.
+ *
+ * Returns 0 on success, or EBUSY if there are any holds on the object
+ * contents, or ENOTSUP as described above.
+ */
+int dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size,
+ int ibs, dmu_tx_t *tx);
+
+/*
+ * Set the checksum property on a dnode. The new checksum algorithm will
+ * apply to all newly written blocks; existing blocks will not be affected.
+ */
+void dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
+ dmu_tx_t *tx);
+
+/*
+ * Set the compress property on a dnode. The new compression algorithm will
+ * apply to all newly written blocks; existing blocks will not be affected.
+ */
+void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
+ dmu_tx_t *tx);
+
+/*
+ * Decide how to write a block: checksum, compression, number of copies, etc.
+ */
+#define WP_NOFILL 0x1
+#define WP_DMU_SYNC 0x2
+#define WP_SPILL 0x4
+
+void dmu_write_policy(objset_t *os, struct dnode *dn, int level, int wp,
+ struct zio_prop *zp);
+/*
+ * The bonus data is accessed more or less like a regular buffer.
+ * You must dmu_bonus_hold() to get the buffer, which will give you a
+ * dmu_buf_t with db_offset==-1ULL, and db_size = the size of the bonus
+ * data. As with any normal buffer, you must call dmu_buf_read() to
+ * read db_data, dmu_buf_will_dirty() before modifying it, and the
+ * object must be held in an assigned transaction before calling
+ * dmu_buf_will_dirty. You may use dmu_buf_set_user() on the bonus
+ * buffer as well. You must release your hold with dmu_buf_rele().
+ */
+int dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **);
+int dmu_bonus_max(void);
+int dmu_set_bonus(dmu_buf_t *, int, dmu_tx_t *);
+int dmu_set_bonustype(dmu_buf_t *, dmu_object_type_t, dmu_tx_t *);
+dmu_object_type_t dmu_get_bonustype(dmu_buf_t *);
+int dmu_rm_spill(objset_t *, uint64_t, dmu_tx_t *);
+
+/*
+ * Special spill buffer support used by "SA" framework
+ */
+
+int dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp);
+int dmu_spill_hold_by_dnode(struct dnode *dn, uint32_t flags,
+ void *tag, dmu_buf_t **dbp);
+int dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp);
+
+/*
+ * Obtain the DMU buffer from the specified object which contains the
+ * specified offset. dmu_buf_hold() puts a "hold" on the buffer, so
+ * that it will remain in memory. You must release the hold with
+ * dmu_buf_rele(). You mustn't access the dmu_buf_t after releasing your
+ * hold. You must have a hold on any dmu_buf_t* you pass to the DMU.
+ *
+ * You must call dmu_buf_read, dmu_buf_will_dirty, or dmu_buf_will_fill
+ * on the returned buffer before reading or writing the buffer's
+ * db_data. The comments for those routines describe what particular
+ * operations are valid after calling them.
+ *
+ * The object number must be a valid, allocated object number.
+ */
+int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
+ void *tag, dmu_buf_t **, int flags);
+void dmu_buf_add_ref(dmu_buf_t *db, void* tag);
+void dmu_buf_rele(dmu_buf_t *db, void *tag);
+uint64_t dmu_buf_refcount(dmu_buf_t *db);
+
+/*
+ * dmu_buf_hold_array holds the DMU buffers which contain all bytes in a
+ * range of an object. A pointer to an array of dmu_buf_t*'s is
+ * returned (in *dbpp).
+ *
+ * dmu_buf_rele_array releases the hold on an array of dmu_buf_t*'s, and
+ * frees the array. The hold on the array of buffers MUST be released
+ * with dmu_buf_rele_array. You can NOT release the hold on each buffer
+ * individually with dmu_buf_rele.
+ */
+int dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset,
+ uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp);
+void dmu_buf_rele_array(dmu_buf_t **, int numbufs, void *tag);
+
+/*
+ * Returns NULL on success, or the existing user ptr if it's already
+ * been set.
+ *
+ * user_ptr is for use by the user and can be obtained via dmu_buf_get_user().
+ *
+ * user_data_ptr_ptr should be NULL, or a pointer to a pointer which
+ * will be set to db->db_data when you are allowed to access it. Note
+ * that db->db_data (the pointer) can change when you do dmu_buf_read(),
+ * dmu_buf_tryupgrade(), dmu_buf_will_dirty(), or dmu_buf_will_fill().
+ * *user_data_ptr_ptr will be set to the new value when it changes.
+ *
+ * If non-NULL, pageout func will be called when this buffer is being
+ * excised from the cache, so that you can clean up the data structure
+ * pointed to by user_ptr.
+ *
+ * dmu_evict_user() will call the pageout func for all buffers in an
+ * objset with a given pageout func.
+ */
+void *dmu_buf_set_user(dmu_buf_t *db, void *user_ptr, void *user_data_ptr_ptr,
+ dmu_buf_evict_func_t *pageout_func);
+/*
+ * set_user_ie is the same as set_user, but request immediate eviction
+ * when hold count goes to zero.
+ */
+void *dmu_buf_set_user_ie(dmu_buf_t *db, void *user_ptr,
+ void *user_data_ptr_ptr, dmu_buf_evict_func_t *pageout_func);
+void *dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr,
+ void *user_ptr, void *user_data_ptr_ptr,
+ dmu_buf_evict_func_t *pageout_func);
+void dmu_evict_user(objset_t *os, dmu_buf_evict_func_t *func);
+
+/*
+ * Returns the user_ptr set with dmu_buf_set_user(), or NULL if not set.
+ */
+void *dmu_buf_get_user(dmu_buf_t *db);
+
+/*
+ * Indicate that you are going to modify the buffer's data (db_data).
+ *
+ * The transaction (tx) must be assigned to a txg (ie. you've called
+ * dmu_tx_assign()). The buffer's object must be held in the tx
+ * (ie. you've called dmu_tx_hold_object(tx, db->db_object)).
+ */
+void dmu_buf_will_dirty(dmu_buf_t *db, dmu_tx_t *tx);
+
+/*
+ * Tells if the given dbuf is freeable.
+ */
+boolean_t dmu_buf_freeable(dmu_buf_t *);
+
+/*
+ * You must create a transaction, then hold the objects which you will
+ * (or might) modify as part of this transaction. Then you must assign
+ * the transaction to a transaction group. Once the transaction has
+ * been assigned, you can modify buffers which belong to held objects as
+ * part of this transaction. You can't modify buffers before the
+ * transaction has been assigned; you can't modify buffers which don't
+ * belong to objects which this transaction holds; you can't hold
+ * objects once the transaction has been assigned. You may hold an
+ * object which you are going to free (with dmu_object_free()), but you
+ * don't have to.
+ *
+ * You can abort the transaction before it has been assigned.
+ *
+ * Note that you may hold buffers (with dmu_buf_hold) at any time,
+ * regardless of transaction state.
+ */
+
+#define DMU_NEW_OBJECT (-1ULL)
+#define DMU_OBJECT_END (-1ULL)
+
+dmu_tx_t *dmu_tx_create(objset_t *os);
+void dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len);
+void dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off,
+ uint64_t len);
+void dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name);
+void dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object);
+void dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object);
+void dmu_tx_hold_sa(dmu_tx_t *tx, struct sa_handle *hdl, boolean_t may_grow);
+void dmu_tx_hold_sa_create(dmu_tx_t *tx, int total_size);
+void dmu_tx_abort(dmu_tx_t *tx);
+int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how);
+void dmu_tx_wait(dmu_tx_t *tx);
+void dmu_tx_commit(dmu_tx_t *tx);
+
+/*
+ * To register a commit callback, dmu_tx_callback_register() must be called.
+ *
+ * dcb_data is a pointer to caller private data that is passed on as a
+ * callback parameter. The caller is responsible for properly allocating and
+ * freeing it.
+ *
+ * When registering a callback, the transaction must be already created, but
+ * it cannot be committed or aborted. It can be assigned to a txg or not.
+ *
+ * The callback will be called after the transaction has been safely written
+ * to stable storage and will also be called if the dmu_tx is aborted.
+ * If there is any error which prevents the transaction from being committed to
+ * disk, the callback will be called with a value of error != 0.
+ */
+typedef void dmu_tx_callback_func_t(void *dcb_data, int error);
+
+void dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *dcb_func,
+ void *dcb_data);
+
+/*
+ * Free up the data blocks for a defined range of a file. If size is
+ * -1, the range from offset to end-of-file is freed.
+ */
+int dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
+ uint64_t size, dmu_tx_t *tx);
+int dmu_free_long_range(objset_t *os, uint64_t object, uint64_t offset,
+ uint64_t size);
+int dmu_free_object(objset_t *os, uint64_t object);
+
+/*
+ * Convenience functions.
+ *
+ * Canfail routines will return 0 on success, or an errno if there is a
+ * nonrecoverable I/O error.
+ */
+#define DMU_READ_PREFETCH 0 /* prefetch */
+#define DMU_READ_NO_PREFETCH 1 /* don't prefetch */
+int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
+ void *buf, uint32_t flags);
+void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
+ const void *buf, dmu_tx_t *tx);
+void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
+ dmu_tx_t *tx);
+int dmu_read_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size);
+int dmu_write_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size,
+ dmu_tx_t *tx);
+int dmu_write_uio_dbuf(dmu_buf_t *zdb, struct uio *uio, uint64_t size,
+ dmu_tx_t *tx);
+int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset,
+ uint64_t size, struct page *pp, dmu_tx_t *tx);
+struct arc_buf *dmu_request_arcbuf(dmu_buf_t *handle, int size);
+void dmu_return_arcbuf(struct arc_buf *buf);
+void dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, struct arc_buf *buf,
+ dmu_tx_t *tx);
+int dmu_xuio_init(struct xuio *uio, int niov);
+void dmu_xuio_fini(struct xuio *uio);
+int dmu_xuio_add(struct xuio *uio, struct arc_buf *abuf, offset_t off,
+ size_t n);
+int dmu_xuio_cnt(struct xuio *uio);
+struct arc_buf *dmu_xuio_arcbuf(struct xuio *uio, int i);
+void dmu_xuio_clear(struct xuio *uio, int i);
+void xuio_stat_wbuf_copied(void);
+void xuio_stat_wbuf_nocopy(void);
+
+extern int zfs_prefetch_disable;
+
+/*
+ * Asynchronously try to read in the data.
+ */
+void dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset,
+ uint64_t len);
+
+typedef struct dmu_object_info {
+ /* All sizes are in bytes unless otherwise indicated. */
+ uint32_t doi_data_block_size;
+ uint32_t doi_metadata_block_size;
+ dmu_object_type_t doi_type;
+ dmu_object_type_t doi_bonus_type;
+ uint64_t doi_bonus_size;
+ uint8_t doi_indirection; /* 2 = dnode->indirect->data */
+ uint8_t doi_checksum;
+ uint8_t doi_compress;
+ uint8_t doi_pad[5];
+ uint64_t doi_physical_blocks_512; /* data + metadata, 512b blks */
+ uint64_t doi_max_offset;
+ uint64_t doi_fill_count; /* number of non-empty blocks */
+} dmu_object_info_t;
+
+typedef void arc_byteswap_func_t(void *buf, size_t size);
+
+typedef struct dmu_object_type_info {
+ arc_byteswap_func_t *ot_byteswap;
+ boolean_t ot_metadata;
+ char *ot_name;
+} dmu_object_type_info_t;
+
+extern const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES];
+
+/*
+ * Get information on a DMU object.
+ *
+ * Return 0 on success or ENOENT if object is not allocated.
+ *
+ * If doi is NULL, just indicates whether the object exists.
+ */
+int dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi);
+void dmu_object_info_from_dnode(struct dnode *dn, dmu_object_info_t *doi);
+void dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi);
+void dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize,
+ u_longlong_t *nblk512);
+
+typedef struct dmu_objset_stats {
+ uint64_t dds_num_clones; /* number of clones of this */
+ uint64_t dds_creation_txg;
+ uint64_t dds_guid;
+ dmu_objset_type_t dds_type;
+ uint8_t dds_is_snapshot;
+ uint8_t dds_inconsistent;
+ char dds_origin[MAXNAMELEN];
+} dmu_objset_stats_t;
+
+/*
+ * Get stats on a dataset.
+ */
+void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat);
+
+/*
+ * Add entries to the nvlist for all the objset's properties. See
+ * zfs_prop_table[] and zfs(1m) for details on the properties.
+ */
+void dmu_objset_stats(objset_t *os, struct nvlist *nv);
+
+/*
+ * Get the space usage statistics for statvfs().
+ *
+ * refdbytes is the amount of space "referenced" by this objset.
+ * availbytes is the amount of space available to this objset, taking
+ * into account quotas & reservations, assuming that no other objsets
+ * use the space first. These values correspond to the 'referenced' and
+ * 'available' properties, described in the zfs(1m) manpage.
+ *
+ * usedobjs and availobjs are the number of objects currently allocated,
+ * and available.
+ */
+void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp,
+ uint64_t *usedobjsp, uint64_t *availobjsp);
+
+/*
+ * The fsid_guid is a 56-bit ID that can change to avoid collisions.
+ * (Contrast with the ds_guid which is a 64-bit ID that will never
+ * change, so there is a small probability that it will collide.)
+ */
+uint64_t dmu_objset_fsid_guid(objset_t *os);
+
+/*
+ * Get the [cm]time for an objset's snapshot dir
+ */
+timestruc_t dmu_objset_snap_cmtime(objset_t *os);
+
+int dmu_objset_is_snapshot(objset_t *os);
+
+extern struct spa *dmu_objset_spa(objset_t *os);
+extern struct zilog *dmu_objset_zil(objset_t *os);
+extern struct dsl_pool *dmu_objset_pool(objset_t *os);
+extern struct dsl_dataset *dmu_objset_ds(objset_t *os);
+extern void dmu_objset_name(objset_t *os, char *buf);
+extern dmu_objset_type_t dmu_objset_type(objset_t *os);
+extern uint64_t dmu_objset_id(objset_t *os);
+extern uint64_t dmu_objset_syncprop(objset_t *os);
+extern uint64_t dmu_objset_logbias(objset_t *os);
+extern int dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
+ uint64_t *id, uint64_t *offp, boolean_t *case_conflict);
+extern int dmu_snapshot_realname(objset_t *os, char *name, char *real,
+ int maxlen, boolean_t *conflict);
+extern int dmu_dir_list_next(objset_t *os, int namelen, char *name,
+ uint64_t *idp, uint64_t *offp);
+
+typedef int objset_used_cb_t(dmu_object_type_t bonustype,
+ void *bonus, uint64_t *userp, uint64_t *groupp);
+extern void dmu_objset_register_type(dmu_objset_type_t ost,
+ objset_used_cb_t *cb);
+extern void dmu_objset_set_user(objset_t *os, void *user_ptr);
+extern void *dmu_objset_get_user(objset_t *os);
+
+/*
+ * Return the txg number for the given assigned transaction.
+ */
+uint64_t dmu_tx_get_txg(dmu_tx_t *tx);
+
+/*
+ * Synchronous write.
+ * If a parent zio is provided this function initiates a write on the
+ * provided buffer as a child of the parent zio.
+ * In the absence of a parent zio, the write is completed synchronously.
+ * At write completion, blk is filled with the bp of the written block.
+ * Note that while the data covered by this function will be on stable
+ * storage when the write completes this new data does not become a
+ * permanent part of the file until the associated transaction commits.
+ */
+
+/*
+ * {zfs,zvol,ztest}_get_done() args
+ */
+typedef struct zgd {
+ struct zilog *zgd_zilog;
+ struct blkptr *zgd_bp;
+ dmu_buf_t *zgd_db;
+ struct rl *zgd_rl;
+ void *zgd_private;
+} zgd_t;
+
+typedef void dmu_sync_cb_t(zgd_t *arg, int error);
+int dmu_sync(struct zio *zio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd);
+
+/*
+ * Find the next hole or data block in file starting at *off
+ * Return found offset in *off. Return ESRCH for end of file.
+ */
+int dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole,
+ uint64_t *off);
+
+/*
+ * Initial setup and final teardown.
+ */
+extern void dmu_init(void);
+extern void dmu_fini(void);
+
+typedef void (*dmu_traverse_cb_t)(objset_t *os, void *arg, struct blkptr *bp,
+ uint64_t object, uint64_t offset, int len);
+void dmu_traverse_objset(objset_t *os, uint64_t txg_start,
+ dmu_traverse_cb_t cb, void *arg);
+
+int dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin,
+ struct vnode *vp, offset_t *off);
+
+typedef struct dmu_recv_cookie {
+ /*
+ * This structure is opaque!
+ *
+ * If logical and real are different, we are receiving the stream
+ * into the "real" temporary clone, and then switching it with
+ * the "logical" target.
+ */
+ struct dsl_dataset *drc_logical_ds;
+ struct dsl_dataset *drc_real_ds;
+ struct drr_begin *drc_drrb;
+ char *drc_tosnap;
+ char *drc_top_ds;
+ boolean_t drc_newfs;
+ boolean_t drc_force;
+} dmu_recv_cookie_t;
+
+int dmu_recv_begin(char *tofs, char *tosnap, char *topds, struct drr_begin *,
+ boolean_t force, objset_t *origin, dmu_recv_cookie_t *);
+int dmu_recv_stream(dmu_recv_cookie_t *drc, struct vnode *vp, offset_t *voffp,
+ int cleanup_fd, uint64_t *action_handlep);
+int dmu_recv_end(dmu_recv_cookie_t *drc);
+
+int dmu_diff(objset_t *tosnap, objset_t *fromsnap, struct vnode *vp,
+ offset_t *off);
+
+/* CRC64 table */
+#define ZFS_CRC64_POLY 0xC96C5795D7870F42ULL /* ECMA-182, reflected form */
+extern uint64_t zfs_crc64_table[256];
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DMU_H */
diff --git a/uts/common/fs/zfs/sys/dmu_impl.h b/uts/common/fs/zfs/sys/dmu_impl.h
new file mode 100644
index 000000000000..22f9f5f8c88c
--- /dev/null
+++ b/uts/common/fs/zfs/sys/dmu_impl.h
@@ -0,0 +1,272 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_DMU_IMPL_H
+#define _SYS_DMU_IMPL_H
+
+#include <sys/txg_impl.h>
+#include <sys/zio.h>
+#include <sys/dnode.h>
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * This is the locking strategy for the DMU. Numbers in parentheses are
+ * cases that use that lock order, referenced below:
+ *
+ * ARC is self-contained
+ * bplist is self-contained
+ * refcount is self-contained
+ * txg is self-contained (hopefully!)
+ * zst_lock
+ * zf_rwlock
+ *
+ * XXX try to improve evicting path?
+ *
+ * dp_config_rwlock > os_obj_lock > dn_struct_rwlock >
+ * dn_dbufs_mtx > hash_mutexes > db_mtx > dd_lock > leafs
+ *
+ * dp_config_rwlock
+ * must be held before: everything
+ * protects dd namespace changes
+ * protects property changes globally
+ * held from:
+ * dsl_dir_open/r:
+ * dsl_dir_create_sync/w:
+ * dsl_dir_sync_destroy/w:
+ * dsl_dir_rename_sync/w:
+ * dsl_prop_changed_notify/r:
+ *
+ * os_obj_lock
+ * must be held before:
+ * everything except dp_config_rwlock
+ * protects os_obj_next
+ * held from:
+ * dmu_object_alloc: dn_dbufs_mtx, db_mtx, hash_mutexes, dn_struct_rwlock
+ *
+ * dn_struct_rwlock
+ * must be held before:
+ * everything except dp_config_rwlock and os_obj_lock
+ * protects structure of dnode (eg. nlevels)
+ * db_blkptr can change when syncing out change to nlevels
+ * dn_maxblkid
+ * dn_nlevels
+ * dn_*blksz*
+ * phys nlevels, maxblkid, physical blkptr_t's (?)
+ * held from:
+ * callers of dbuf_read_impl, dbuf_hold[_impl], dbuf_prefetch
+ * dmu_object_info_from_dnode: dn_dirty_mtx (dn_datablksz)
+ * dmu_tx_count_free:
+ * dbuf_read_impl: db_mtx, dmu_zfetch()
+ * dmu_zfetch: zf_rwlock/r, zst_lock, dbuf_prefetch()
+ * dbuf_new_size: db_mtx
+ * dbuf_dirty: db_mtx
+ * dbuf_findbp: (callers, phys? - the real need)
+ * dbuf_create: dn_dbufs_mtx, hash_mutexes, db_mtx (phys?)
+ * dbuf_prefetch: dn_dirty_mtx, hash_mutexes, db_mtx, dn_dbufs_mtx
+ * dbuf_hold_impl: hash_mutexes, db_mtx, dn_dbufs_mtx, dbuf_findbp()
+ * dnode_sync/w (increase_indirection): db_mtx (phys)
+ * dnode_set_blksz/w: dn_dbufs_mtx (dn_*blksz*)
+ * dnode_new_blkid/w: (dn_maxblkid)
+ * dnode_free_range/w: dn_dirty_mtx (dn_maxblkid)
+ * dnode_next_offset: (phys)
+ *
+ * dn_dbufs_mtx
+ * must be held before:
+ * db_mtx, hash_mutexes
+ * protects:
+ * dn_dbufs
+ * dn_evicted
+ * held from:
+ * dmu_evict_user: db_mtx (dn_dbufs)
+ * dbuf_free_range: db_mtx (dn_dbufs)
+ * dbuf_remove_ref: db_mtx, callees:
+ * dbuf_hash_remove: hash_mutexes, db_mtx
+ * dbuf_create: hash_mutexes, db_mtx (dn_dbufs)
+ * dnode_set_blksz: (dn_dbufs)
+ *
+ * hash_mutexes (global)
+ * must be held before:
+ * db_mtx
+ * protects dbuf_hash_table (global) and db_hash_next
+ * held from:
+ * dbuf_find: db_mtx
+ * dbuf_hash_insert: db_mtx
+ * dbuf_hash_remove: db_mtx
+ *
+ * db_mtx (meta-leaf)
+ * must be held before:
+ * dn_mtx, dn_dirty_mtx, dd_lock (leaf mutexes)
+ * protects:
+ * db_state
+ * db_holds
+ * db_buf
+ * db_changed
+ * db_data_pending
+ * db_dirtied
+ * db_link
+ * db_dirty_node (??)
+ * db_dirtycnt
+ * db_d.*
+ * db.*
+ * held from:
+ * dbuf_dirty: dn_mtx, dn_dirty_mtx
+ * dbuf_dirty->dsl_dir_willuse_space: dd_lock
+ * dbuf_dirty->dbuf_new_block->dsl_dataset_block_freeable: dd_lock
+ * dbuf_undirty: dn_dirty_mtx (db_d)
+ * dbuf_write_done: dn_dirty_mtx (db_state)
+ * dbuf_*
+ * dmu_buf_update_user: none (db_d)
+ * dmu_evict_user: none (db_d) (maybe can eliminate)
+ * dbuf_find: none (db_holds)
+ * dbuf_hash_insert: none (db_holds)
+ * dmu_buf_read_array_impl: none (db_state, db_changed)
+ * dmu_sync: none (db_dirty_node, db_d)
+ * dnode_reallocate: none (db)
+ *
+ * dn_mtx (leaf)
+ * protects:
+ * dn_dirty_dbufs
+ * dn_ranges
+ * phys accounting
+ * dn_allocated_txg
+ * dn_free_txg
+ * dn_assigned_txg
+ * dd_assigned_tx
+ * dn_notxholds
+ * dn_dirtyctx
+ * dn_dirtyctx_firstset
+ * (dn_phys copy fields?)
+ * (dn_phys contents?)
+ * held from:
+ * dnode_*
+ * dbuf_dirty: none
+ * dbuf_sync: none (phys accounting)
+ * dbuf_undirty: none (dn_ranges, dn_dirty_dbufs)
+ * dbuf_write_done: none (phys accounting)
+ * dmu_object_info_from_dnode: none (accounting)
+ * dmu_tx_commit: none
+ * dmu_tx_hold_object_impl: none
+ * dmu_tx_try_assign: dn_notxholds(cv)
+ * dmu_tx_unassign: none
+ *
+ * dd_lock
+ * must be held before:
+ * ds_lock
+ * ancestors' dd_lock
+ * protects:
+ * dd_prop_cbs
+ * dd_sync_*
+ * dd_used_bytes
+ * dd_tempreserved
+ * dd_space_towrite
+ * dd_myname
+ * dd_phys accounting?
+ * held from:
+ * dsl_dir_*
+ * dsl_prop_changed_notify: none (dd_prop_cbs)
+ * dsl_prop_register: none (dd_prop_cbs)
+ * dsl_prop_unregister: none (dd_prop_cbs)
+ * dsl_dataset_block_freeable: none (dd_sync_*)
+ *
+ * os_lock (leaf)
+ * protects:
+ * os_dirty_dnodes
+ * os_free_dnodes
+ * os_dnodes
+ * os_downgraded_dbufs
+ * dn_dirtyblksz
+ * dn_dirty_link
+ * held from:
+ * dnode_create: none (os_dnodes)
+ * dnode_destroy: none (os_dnodes)
+ * dnode_setdirty: none (dn_dirtyblksz, os_*_dnodes)
+ * dnode_free: none (dn_dirtyblksz, os_*_dnodes)
+ *
+ * ds_lock
+ * protects:
+ * ds_objset
+ * ds_open_refcount
+ * ds_snapname
+ * ds_phys accounting
+ * ds_phys userrefs zapobj
+ * ds_reserved
+ * held from:
+ * dsl_dataset_*
+ *
+ * dr_mtx (leaf)
+ * protects:
+ * dr_children
+ * held from:
+ * dbuf_dirty
+ * dbuf_undirty
+ * dbuf_sync_indirect
+ * dnode_new_blkid
+ */
+
+struct objset;
+struct dmu_pool;
+
+typedef struct dmu_xuio {
+ int next;
+ int cnt;
+ struct arc_buf **bufs;
+ iovec_t *iovp;
+} dmu_xuio_t;
+
+typedef struct xuio_stats {
+ /* loaned yet not returned arc_buf */
+ kstat_named_t xuiostat_onloan_rbuf;
+ kstat_named_t xuiostat_onloan_wbuf;
+ /* whether a copy is made when loaning out a read buffer */
+ kstat_named_t xuiostat_rbuf_copied;
+ kstat_named_t xuiostat_rbuf_nocopy;
+ /* whether a copy is made when assigning a write buffer */
+ kstat_named_t xuiostat_wbuf_copied;
+ kstat_named_t xuiostat_wbuf_nocopy;
+} xuio_stats_t;
+
+static xuio_stats_t xuio_stats = {	/* NOTE(review): static in a header => private copy per includer; confirm intended */
+ { "onloan_read_buf", KSTAT_DATA_UINT64 },
+ { "onloan_write_buf", KSTAT_DATA_UINT64 },
+ { "read_buf_copied", KSTAT_DATA_UINT64 },
+ { "read_buf_nocopy", KSTAT_DATA_UINT64 },
+ { "write_buf_copied", KSTAT_DATA_UINT64 },
+ { "write_buf_nocopy", KSTAT_DATA_UINT64 }
+};
+
+#define XUIOSTAT_INCR(stat, val) \
+ atomic_add_64(&xuio_stats.stat.value.ui64, (val))
+#define XUIOSTAT_BUMP(stat) XUIOSTAT_INCR(stat, 1)
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DMU_IMPL_H */
diff --git a/uts/common/fs/zfs/sys/dmu_objset.h b/uts/common/fs/zfs/sys/dmu_objset.h
new file mode 100644
index 000000000000..c6d202e2e81a
--- /dev/null
+++ b/uts/common/fs/zfs/sys/dmu_objset.h
@@ -0,0 +1,183 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+/* Portions Copyright 2010 Robert Milkowski */
+
+#ifndef _SYS_DMU_OBJSET_H
+#define _SYS_DMU_OBJSET_H
+
+#include <sys/spa.h>
+#include <sys/arc.h>
+#include <sys/txg.h>
+#include <sys/zfs_context.h>
+#include <sys/dnode.h>
+#include <sys/zio.h>
+#include <sys/zil.h>
+#include <sys/sa.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * NOTE(review): this global krwlock_t is a different object from the
+ * kmutex_t os_lock member of struct objset declared in this header; confirm
+ * that a definition and users of the global still exist.
+ */
+extern krwlock_t os_lock;
+
+struct dsl_dataset;
+struct dmu_tx;
+
+#define OBJSET_PHYS_SIZE 2048
+#define OBJSET_OLD_PHYS_SIZE 1024
+
+#define OBJSET_BUF_HAS_USERUSED(buf) \
+ (arc_buf_size(buf) > OBJSET_OLD_PHYS_SIZE)
+
+#define OBJSET_FLAG_USERACCOUNTING_COMPLETE (1ULL<<0)
+
+/*
+ * objset_phys_t layout. os_pad is sized as OBJSET_PHYS_SIZE minus the three
+ * dnode_phys_t members, the zil_header_t and the two uint64_t members, so
+ * the structure as a whole (including the trailing os_userused_dnode and
+ * os_groupused_dnode) adds up to OBJSET_PHYS_SIZE bytes, assuming the
+ * compiler inserts no padding of its own.
+ */
+typedef struct objset_phys {
+ dnode_phys_t os_meta_dnode;
+ zil_header_t os_zil_header;
+ uint64_t os_type;
+ uint64_t os_flags;
+ char os_pad[OBJSET_PHYS_SIZE - sizeof (dnode_phys_t)*3 -
+ sizeof (zil_header_t) - sizeof (uint64_t)*2];
+ dnode_phys_t os_userused_dnode;
+ dnode_phys_t os_groupused_dnode;
+} objset_phys_t;
+
+struct objset {
+ /* Immutable: */
+ struct dsl_dataset *os_dsl_dataset;
+ spa_t *os_spa;
+ arc_buf_t *os_phys_buf;
+ objset_phys_t *os_phys;
+ /*
+ * The following "special" dnodes have no parent and are exempt from
+ * dnode_move(), but they root their descendents in this objset using
+ * handles anyway, so that all access to dnodes from dbufs consistently
+ * uses handles.
+ */
+ dnode_handle_t os_meta_dnode;
+ dnode_handle_t os_userused_dnode;
+ dnode_handle_t os_groupused_dnode;
+ zilog_t *os_zil;
+
+ /* can change, under dsl_dir's locks: */
+ uint8_t os_checksum;
+ uint8_t os_compress;
+ uint8_t os_copies;
+ uint8_t os_dedup_checksum;
+ uint8_t os_dedup_verify;
+ uint8_t os_logbias;
+ uint8_t os_primary_cache;
+ uint8_t os_secondary_cache;
+ uint8_t os_sync;
+
+ /* no lock needed: */
+ struct dmu_tx *os_synctx; /* XXX sketchy */
+ blkptr_t *os_rootbp;
+ zil_header_t os_zil_header;
+ list_t os_synced_dnodes;
+ uint64_t os_flags;
+
+ /* Protected by os_obj_lock */
+ kmutex_t os_obj_lock;
+ uint64_t os_obj_next;
+
+ /* Protected by os_lock */
+ kmutex_t os_lock;
+ list_t os_dirty_dnodes[TXG_SIZE];
+ list_t os_free_dnodes[TXG_SIZE];
+ list_t os_dnodes;
+ list_t os_downgraded_dbufs;
+
+ /* stuff we store for the user */
+ kmutex_t os_user_ptr_lock;
+ void *os_user_ptr;
+
+ /* SA layout/attribute registration */
+ sa_os_t *os_sa;
+};
+
+#define DMU_META_OBJSET 0
+#define DMU_META_DNODE_OBJECT 0
+#define DMU_OBJECT_IS_SPECIAL(obj) ((int64_t)(obj) <= 0)
+#define DMU_META_DNODE(os) ((os)->os_meta_dnode.dnh_dnode)
+#define DMU_USERUSED_DNODE(os) ((os)->os_userused_dnode.dnh_dnode)
+#define DMU_GROUPUSED_DNODE(os) ((os)->os_groupused_dnode.dnh_dnode)
+
+#define DMU_OS_IS_L2CACHEABLE(os) \
+ ((os)->os_secondary_cache == ZFS_CACHE_ALL || \
+ (os)->os_secondary_cache == ZFS_CACHE_METADATA)
+
+/* called from zpl */
+int dmu_objset_hold(const char *name, void *tag, objset_t **osp);
+int dmu_objset_own(const char *name, dmu_objset_type_t type,
+ boolean_t readonly, void *tag, objset_t **osp);
+void dmu_objset_rele(objset_t *os, void *tag);
+void dmu_objset_disown(objset_t *os, void *tag);
+int dmu_objset_from_ds(struct dsl_dataset *ds, objset_t **osp);
+
+int dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags,
+ void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg);
+int dmu_objset_clone(const char *name, struct dsl_dataset *clone_origin,
+ uint64_t flags);
+int dmu_objset_destroy(const char *name, boolean_t defer);
+int dmu_objset_snapshot(char *fsname, char *snapname, char *tag,
+ struct nvlist *props, boolean_t recursive, boolean_t temporary, int fd);
+void dmu_objset_stats(objset_t *os, nvlist_t *nv);
+void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat);
+void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp,
+ uint64_t *usedobjsp, uint64_t *availobjsp);
+uint64_t dmu_objset_fsid_guid(objset_t *os);
+int dmu_objset_find(char *name, int func(const char *, void *), void *arg,
+ int flags);
+int dmu_objset_find_spa(spa_t *spa, const char *name,
+ int func(spa_t *, uint64_t, const char *, void *), void *arg, int flags);
+int dmu_objset_prefetch(const char *name, void *arg);
+void dmu_objset_byteswap(void *buf, size_t size);
+int dmu_objset_evict_dbufs(objset_t *os);
+timestruc_t dmu_objset_snap_cmtime(objset_t *os);
+
+/* called from dsl */
+void dmu_objset_sync(objset_t *os, zio_t *zio, dmu_tx_t *tx);
+boolean_t dmu_objset_is_dirty(objset_t *os, uint64_t txg);
+boolean_t dmu_objset_is_dirty_anywhere(objset_t *os);
+objset_t *dmu_objset_create_impl(spa_t *spa, struct dsl_dataset *ds,
+ blkptr_t *bp, dmu_objset_type_t type, dmu_tx_t *tx);
+int dmu_objset_open_impl(spa_t *spa, struct dsl_dataset *ds, blkptr_t *bp,
+ objset_t **osp);
+void dmu_objset_evict(objset_t *os);
+void dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx);
+void dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx);
+boolean_t dmu_objset_userused_enabled(objset_t *os);
+int dmu_objset_userspace_upgrade(objset_t *os);
+boolean_t dmu_objset_userspace_present(objset_t *os);
+
+void dmu_objset_init(void);
+void dmu_objset_fini(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DMU_OBJSET_H */
diff --git a/uts/common/fs/zfs/sys/dmu_traverse.h b/uts/common/fs/zfs/sys/dmu_traverse.h
new file mode 100644
index 000000000000..5b326cd99c09
--- /dev/null
+++ b/uts/common/fs/zfs/sys/dmu_traverse.h
@@ -0,0 +1,64 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _SYS_DMU_TRAVERSE_H
+#define _SYS_DMU_TRAVERSE_H
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/zio.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct dnode_phys;
+struct dsl_dataset;
+struct zilog;
+struct arc_buf;
+
+typedef int (blkptr_cb_t)(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+ struct arc_buf *pbuf, const zbookmark_t *zb, const struct dnode_phys *dnp,
+ void *arg);
+
+#define TRAVERSE_PRE (1<<0)
+#define TRAVERSE_POST (1<<1)
+#define TRAVERSE_PREFETCH_METADATA (1<<2)
+#define TRAVERSE_PREFETCH_DATA (1<<3)
+#define TRAVERSE_PREFETCH (TRAVERSE_PREFETCH_METADATA | TRAVERSE_PREFETCH_DATA)
+#define TRAVERSE_HARD (1<<4)
+
+/* Special traverse error return value to indicate skipping of children */
+#define TRAVERSE_VISIT_NO_CHILDREN (-1)
+
+int traverse_dataset(struct dsl_dataset *ds,
+ uint64_t txg_start, int flags, blkptr_cb_t func, void *arg);
+int traverse_pool(spa_t *spa,
+ uint64_t txg_start, int flags, blkptr_cb_t func, void *arg);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DMU_TRAVERSE_H */
diff --git a/uts/common/fs/zfs/sys/dmu_tx.h b/uts/common/fs/zfs/sys/dmu_tx.h
new file mode 100644
index 000000000000..c5ea50fa8d82
--- /dev/null
+++ b/uts/common/fs/zfs/sys/dmu_tx.h
@@ -0,0 +1,148 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_DMU_TX_H
+#define _SYS_DMU_TX_H
+
+#include <sys/inttypes.h>
+#include <sys/dmu.h>
+#include <sys/txg.h>
+#include <sys/refcount.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct dmu_buf_impl;
+struct dmu_tx_hold;
+struct dnode_link;
+struct dsl_pool;
+struct dnode;
+struct dsl_dir;
+
+struct dmu_tx {
+ /*
+ * No synchronization is needed because a tx can only be handled
+ * by one thread.
+ */
+ list_t tx_holds; /* list of dmu_tx_hold_t */
+ objset_t *tx_objset;
+ struct dsl_dir *tx_dir;
+ struct dsl_pool *tx_pool;
+ uint64_t tx_txg;
+ uint64_t tx_lastsnap_txg;
+ uint64_t tx_lasttried_txg;
+ txg_handle_t tx_txgh;
+ void *tx_tempreserve_cookie;
+ struct dmu_tx_hold *tx_needassign_txh;
+ list_t tx_callbacks; /* list of dmu_tx_callback_t on this dmu_tx */
+ uint8_t tx_anyobj;
+ int tx_err;
+#ifdef ZFS_DEBUG
+ uint64_t tx_space_towrite;
+ uint64_t tx_space_tofree;
+ uint64_t tx_space_tooverwrite;
+ uint64_t tx_space_tounref;
+ refcount_t tx_space_written;
+ refcount_t tx_space_freed;
+#endif
+};
+
+enum dmu_tx_hold_type {
+ THT_NEWOBJECT,
+ THT_WRITE,
+ THT_BONUS,
+ THT_FREE,
+ THT_ZAP,
+ THT_SPACE,
+ THT_SPILL,
+ THT_NUMTYPES
+};
+
+typedef struct dmu_tx_hold {
+ dmu_tx_t *txh_tx;
+ list_node_t txh_node;
+ struct dnode *txh_dnode;
+ uint64_t txh_space_towrite;
+ uint64_t txh_space_tofree;
+ uint64_t txh_space_tooverwrite;
+ uint64_t txh_space_tounref;
+ uint64_t txh_memory_tohold;
+ uint64_t txh_fudge;
+#ifdef ZFS_DEBUG
+ enum dmu_tx_hold_type txh_type;
+ uint64_t txh_arg1;
+ uint64_t txh_arg2;
+#endif
+} dmu_tx_hold_t;
+
+typedef struct dmu_tx_callback {
+ list_node_t dcb_node; /* linked to tx_callbacks list */
+ dmu_tx_callback_func_t *dcb_func; /* caller function pointer */
+ void *dcb_data; /* caller private data */
+} dmu_tx_callback_t;
+
+/*
+ * These routines are defined in dmu.h, and are called by the user.
+ */
+dmu_tx_t *dmu_tx_create(objset_t *dd);
+int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how);
+void dmu_tx_commit(dmu_tx_t *tx);
+void dmu_tx_abort(dmu_tx_t *tx);
+uint64_t dmu_tx_get_txg(dmu_tx_t *tx);
+void dmu_tx_wait(dmu_tx_t *tx);
+
+void dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *dcb_func,
+ void *dcb_data);
+void dmu_tx_do_callbacks(list_t *cb_list, int error);
+
+/*
+ * These routines are defined in dmu_spa.h, and are called by the SPA.
+ */
+extern dmu_tx_t *dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg);
+
+/*
+ * These routines are only called by the DMU.
+ */
+dmu_tx_t *dmu_tx_create_dd(dsl_dir_t *dd);
+int dmu_tx_is_syncing(dmu_tx_t *tx);
+int dmu_tx_private_ok(dmu_tx_t *tx);
+void dmu_tx_add_new_object(dmu_tx_t *tx, objset_t *os, uint64_t object);
+void dmu_tx_willuse_space(dmu_tx_t *tx, int64_t delta);
+void dmu_tx_dirty_buf(dmu_tx_t *tx, struct dmu_buf_impl *db);
+int dmu_tx_holds(dmu_tx_t *tx, uint64_t object);
+void dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space);
+
+#ifdef ZFS_DEBUG
+#define DMU_TX_DIRTY_BUF(tx, db) dmu_tx_dirty_buf(tx, db)
+#else
+#define DMU_TX_DIRTY_BUF(tx, db)
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DMU_TX_H */
diff --git a/uts/common/fs/zfs/sys/dmu_zfetch.h b/uts/common/fs/zfs/sys/dmu_zfetch.h
new file mode 100644
index 000000000000..78cadd2b1ee1
--- /dev/null
+++ b/uts/common/fs/zfs/sys/dmu_zfetch.h
@@ -0,0 +1,76 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _DFETCH_H
+#define _DFETCH_H
+
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern uint64_t zfetch_array_rd_sz;
+
+struct dnode; /* so we can reference dnode */
+
+typedef enum zfetch_dirn {
+ ZFETCH_FORWARD = 1, /* prefetch increasing block numbers */
+ ZFETCH_BACKWARD = -1 /* prefetch decreasing block numbers */
+} zfetch_dirn_t;
+
+typedef struct zstream {
+ uint64_t zst_offset; /* offset of starting block in range */
+ uint64_t zst_len; /* length of range, in blocks */
+ zfetch_dirn_t zst_direction; /* direction of prefetch */
+ uint64_t zst_stride; /* length of stride, in blocks */
+ uint64_t zst_ph_offset; /* prefetch offset, in blocks */
+ uint64_t zst_cap; /* prefetch limit (cap), in blocks */
+ kmutex_t zst_lock; /* protects stream */
+ clock_t zst_last; /* lbolt of last prefetch */
+ avl_node_t zst_node; /* embed avl node here */
+} zstream_t;
+
+typedef struct zfetch {
+ krwlock_t zf_rwlock; /* protects zfetch structure */
+ /*
+ * NOTE(review): declared as a list_t even though zstream_t embeds an
+ * avl_node_t (zst_node) - confirm which container is intended.
+ */
+ list_t zf_stream; /* list of zstream_t's */
+ struct dnode *zf_dnode; /* dnode that owns this zfetch */
+ uint32_t zf_stream_cnt; /* # of active streams */
+ uint64_t zf_alloc_fail; /* # of failed attempts to alloc strm */
+} zfetch_t;
+
+void zfetch_init(void);
+void zfetch_fini(void);
+
+void dmu_zfetch_init(zfetch_t *, struct dnode *);
+void dmu_zfetch_rele(zfetch_t *);
+void dmu_zfetch(zfetch_t *, uint64_t, uint64_t, int);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _DFETCH_H */
diff --git a/uts/common/fs/zfs/sys/dnode.h b/uts/common/fs/zfs/sys/dnode.h
new file mode 100644
index 000000000000..9ad4be36bf85
--- /dev/null
+++ b/uts/common/fs/zfs/sys/dnode.h
@@ -0,0 +1,329 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _SYS_DNODE_H
+#define _SYS_DNODE_H
+
+#include <sys/zfs_context.h>
+#include <sys/avl.h>
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/zio.h>
+#include <sys/refcount.h>
+#include <sys/dmu_zfetch.h>
+#include <sys/zrlock.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * dnode_hold() flags.
+ */
+#define DNODE_MUST_BE_ALLOCATED 1
+#define DNODE_MUST_BE_FREE 2
+
+/*
+ * dnode_next_offset() flags.
+ */
+#define DNODE_FIND_HOLE 1
+#define DNODE_FIND_BACKWARDS 2
+#define DNODE_FIND_HAVELOCK 4
+
+/*
+ * Fixed constants.
+ */
+#define DNODE_SHIFT 9 /* 512 bytes */
+#define DN_MIN_INDBLKSHIFT 10 /* 1k */
+#define DN_MAX_INDBLKSHIFT 14 /* 16k */
+#define DNODE_BLOCK_SHIFT 14 /* 16k */
+#define DNODE_CORE_SIZE 64 /* 64 bytes for dnode sans blkptrs */
+#define DN_MAX_OBJECT_SHIFT 48 /* 256 trillion (zfs_fid_t limit) */
+#define DN_MAX_OFFSET_SHIFT 64 /* 2^64 bytes in a dnode */
+
+/*
+ * dnode id flags
+ *
+ * Note: a file's ids are never moved from the bonus buffer to the
+ * spill block, and only in a crypto environment would they be in
+ * the spill block at all.
+ */
+#define DN_ID_CHKED_BONUS 0x1
+#define DN_ID_CHKED_SPILL 0x2
+#define DN_ID_OLD_EXIST 0x4
+#define DN_ID_NEW_EXIST 0x8
+
+/*
+ * Derived constants.
+ */
+#define DNODE_SIZE (1 << DNODE_SHIFT)
+#define DN_MAX_NBLKPTR ((DNODE_SIZE - DNODE_CORE_SIZE) >> SPA_BLKPTRSHIFT)
+#define DN_MAX_BONUSLEN (DNODE_SIZE - DNODE_CORE_SIZE - (1 << SPA_BLKPTRSHIFT))
+#define DN_MAX_OBJECT (1ULL << DN_MAX_OBJECT_SHIFT)
+#define DN_ZERO_BONUSLEN (DN_MAX_BONUSLEN + 1)
+#define DN_KILL_SPILLBLK (1)
+
+#define DNODES_PER_BLOCK_SHIFT (DNODE_BLOCK_SHIFT - DNODE_SHIFT)
+#define DNODES_PER_BLOCK (1ULL << DNODES_PER_BLOCK_SHIFT)
+#define DNODES_PER_LEVEL_SHIFT (DN_MAX_INDBLKSHIFT - SPA_BLKPTRSHIFT)
+#define DNODES_PER_LEVEL (1ULL << DNODES_PER_LEVEL_SHIFT)
+
+/* The +2 here is a cheesy way to round up */
+#define DN_MAX_LEVELS (2 + ((DN_MAX_OFFSET_SHIFT - SPA_MINBLOCKSHIFT) / \
+ (DN_MIN_INDBLKSHIFT - SPA_BLKPTRSHIFT)))
+
+#define DN_BONUS(dnp) ((void*)((dnp)->dn_bonus + \
+ (((dnp)->dn_nblkptr - 1) * sizeof (blkptr_t))))
+
+#define DN_USED_BYTES(dnp) (((dnp)->dn_flags & DNODE_FLAG_USED_BYTES) ? \
+ (dnp)->dn_used : (dnp)->dn_used << SPA_MINBLOCKSHIFT)
+
+#define EPB(blkshift, typeshift) (1 << ((blkshift) - (typeshift)))
+
+struct dmu_buf_impl;
+struct objset;
+struct zio;
+
+enum dnode_dirtycontext {
+ DN_UNDIRTIED,
+ DN_DIRTY_OPEN,
+ DN_DIRTY_SYNC
+};
+
+/* Is dn_used in bytes? if not, it's in multiples of SPA_MINBLOCKSIZE */
+#define DNODE_FLAG_USED_BYTES (1<<0)
+#define DNODE_FLAG_USERUSED_ACCOUNTED (1<<1)
+
+/* Does dnode have a SA spill blkptr in bonus? */
+#define DNODE_FLAG_SPILL_BLKPTR (1<<2)
+
+/*
+ * dnode_phys_t layout. The members ahead of dn_blkptr add up to
+ * DNODE_CORE_SIZE (64) bytes. DN_BONUS() locates the bonus data at
+ * dn_bonus + (dn_nblkptr - 1) * sizeof (blkptr_t), so when dn_nblkptr is
+ * greater than one the bonus data starts that many block pointers into
+ * the dn_bonus array.
+ */
+typedef struct dnode_phys {
+ uint8_t dn_type; /* dmu_object_type_t */
+ uint8_t dn_indblkshift; /* ln2(indirect block size) */
+ uint8_t dn_nlevels; /* 1=dn_blkptr->data blocks */
+ uint8_t dn_nblkptr; /* length of dn_blkptr */
+ uint8_t dn_bonustype; /* type of data in bonus buffer */
+ uint8_t dn_checksum; /* ZIO_CHECKSUM type */
+ uint8_t dn_compress; /* ZIO_COMPRESS type */
+ uint8_t dn_flags; /* DNODE_FLAG_* */
+ uint16_t dn_datablkszsec; /* data block size in 512b sectors */
+ uint16_t dn_bonuslen; /* length of dn_bonus */
+ uint8_t dn_pad2[4];
+
+ /*
+ * accounting is protected by dn_dirty_mtx
+ * NOTE(review): dnode_t in this header declares no dn_dirty_mtx;
+ * presumably dn_mtx is meant - confirm.
+ */
+ uint64_t dn_maxblkid; /* largest allocated block ID */
+ /*
+ * In bytes when DNODE_FLAG_USED_BYTES is set in dn_flags, otherwise in
+ * units of 1 << SPA_MINBLOCKSHIFT bytes; see DN_USED_BYTES().
+ */
+ uint64_t dn_used; /* bytes (or sectors) of disk space */
+
+ uint64_t dn_pad3[4];
+
+ blkptr_t dn_blkptr[1];
+ uint8_t dn_bonus[DN_MAX_BONUSLEN - sizeof (blkptr_t)];
+ blkptr_t dn_spill;
+} dnode_phys_t;
+
+typedef struct dnode {
+ /*
+ * dn_struct_rwlock protects the structure of the dnode,
+ * including the number of levels of indirection (dn_nlevels),
+ * dn_maxblkid, and dn_next_*
+ */
+ krwlock_t dn_struct_rwlock;
+
+ /* Our link on dn_objset->os_dnodes list; protected by os_lock. */
+ list_node_t dn_link;
+
+ /* immutable: */
+ struct objset *dn_objset;
+ uint64_t dn_object;
+ struct dmu_buf_impl *dn_dbuf;
+ struct dnode_handle *dn_handle;
+ dnode_phys_t *dn_phys; /* pointer into dn->dn_dbuf->db.db_data */
+
+ /*
+ * Copies of stuff in dn_phys. They're valid in the open
+ * context (eg. even before the dnode is first synced).
+ * Where necessary, these are protected by dn_struct_rwlock.
+ */
+ dmu_object_type_t dn_type; /* object type */
+ uint16_t dn_bonuslen; /* bonus length */
+ uint8_t dn_bonustype; /* bonus type */
+ uint8_t dn_nblkptr; /* number of blkptrs (immutable) */
+ uint8_t dn_checksum; /* ZIO_CHECKSUM type */
+ uint8_t dn_compress; /* ZIO_COMPRESS type */
+ uint8_t dn_nlevels;
+ uint8_t dn_indblkshift;
+ uint8_t dn_datablkshift; /* zero if blksz not power of 2! */
+ uint8_t dn_moved; /* Has this dnode been moved? */
+ uint16_t dn_datablkszsec; /* in 512b sectors */
+ uint32_t dn_datablksz; /* in bytes */
+ uint64_t dn_maxblkid;
+ uint8_t dn_next_nblkptr[TXG_SIZE];
+ uint8_t dn_next_nlevels[TXG_SIZE];
+ uint8_t dn_next_indblkshift[TXG_SIZE];
+ uint8_t dn_next_bonustype[TXG_SIZE];
+ uint8_t dn_rm_spillblk[TXG_SIZE]; /* for removing spill blk */
+ uint16_t dn_next_bonuslen[TXG_SIZE];
+ uint32_t dn_next_blksz[TXG_SIZE]; /* next block size in bytes */
+
+ /* protected by dn_dbufs_mtx; declared here to fill 32-bit hole */
+ uint32_t dn_dbufs_count; /* count of dn_dbufs */
+
+ /* protected by os_lock: */
+ list_node_t dn_dirty_link[TXG_SIZE]; /* next on dataset's dirty */
+
+ /* protected by dn_mtx: */
+ kmutex_t dn_mtx;
+ list_t dn_dirty_records[TXG_SIZE];
+ avl_tree_t dn_ranges[TXG_SIZE];
+ uint64_t dn_allocated_txg;
+ uint64_t dn_free_txg;
+ uint64_t dn_assigned_txg;
+ kcondvar_t dn_notxholds;
+ enum dnode_dirtycontext dn_dirtyctx;
+ uint8_t *dn_dirtyctx_firstset; /* dbg: contents meaningless */
+
+ /* protected by own devices */
+ refcount_t dn_tx_holds;
+ refcount_t dn_holds;
+
+ kmutex_t dn_dbufs_mtx;
+ list_t dn_dbufs; /* descendent dbufs */
+
+ /* protected by dn_struct_rwlock */
+ struct dmu_buf_impl *dn_bonus; /* bonus buffer dbuf */
+
+ boolean_t dn_have_spill; /* have spill or are spilling */
+
+ /* parent IO for current sync write */
+ zio_t *dn_zio;
+
+ /* used in syncing context */
+ uint64_t dn_oldused; /* old phys used bytes */
+ uint64_t dn_oldflags; /* old phys dn_flags */
+ uint64_t dn_olduid, dn_oldgid;
+ uint64_t dn_newuid, dn_newgid;
+ int dn_id_flags;
+
+ /* holds prefetch structure */
+ struct zfetch dn_zfetch;
+} dnode_t;
+
+/*
+ * Adds a level of indirection between the dbuf and the dnode to avoid
+ * iterating descendent dbufs in dnode_move(). Handles are not allocated
+ * individually, but as an array of child dnodes in dnode_hold_impl().
+ */
+typedef struct dnode_handle {
+ /* Protects dnh_dnode from modification by dnode_move(). */
+ zrlock_t dnh_zrlock;
+ dnode_t *dnh_dnode;
+} dnode_handle_t;
+
+typedef struct dnode_children {
+ size_t dnc_count; /* number of children */
+ dnode_handle_t dnc_children[1]; /* sized dynamically */
+} dnode_children_t;
+
+typedef struct free_range {
+ avl_node_t fr_node;
+ uint64_t fr_blkid;
+ uint64_t fr_nblks;
+} free_range_t;
+
+dnode_t *dnode_special_open(struct objset *dd, dnode_phys_t *dnp,
+ uint64_t object, dnode_handle_t *dnh);
+void dnode_special_close(dnode_handle_t *dnh);
+
+void dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx);
+void dnode_setbonus_type(dnode_t *dn, dmu_object_type_t, dmu_tx_t *tx);
+void dnode_rm_spill(dnode_t *dn, dmu_tx_t *tx);
+
+int dnode_hold(struct objset *dd, uint64_t object,
+ void *ref, dnode_t **dnp);
+int dnode_hold_impl(struct objset *dd, uint64_t object, int flag,
+ void *ref, dnode_t **dnp);
+boolean_t dnode_add_ref(dnode_t *dn, void *ref);
+void dnode_rele(dnode_t *dn, void *ref);
+void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx);
+void dnode_sync(dnode_t *dn, dmu_tx_t *tx);
+void dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
+ dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
+void dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
+ dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
+void dnode_free(dnode_t *dn, dmu_tx_t *tx);
+void dnode_byteswap(dnode_phys_t *dnp);
+void dnode_buf_byteswap(void *buf, size_t size);
+void dnode_verify(dnode_t *dn);
+int dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx);
+uint64_t dnode_current_max_length(dnode_t *dn);
+void dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx);
+void dnode_clear_range(dnode_t *dn, uint64_t blkid,
+ uint64_t nblks, dmu_tx_t *tx);
+void dnode_diduse_space(dnode_t *dn, int64_t space);
+void dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx);
+void dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t);
+uint64_t dnode_block_freed(dnode_t *dn, uint64_t blkid);
+void dnode_init(void);
+void dnode_fini(void);
+int dnode_next_offset(dnode_t *dn, int flags, uint64_t *off,
+ int minlvl, uint64_t blkfill, uint64_t txg);
+void dnode_evict_dbufs(dnode_t *dn);
+
+#ifdef ZFS_DEBUG
+
+/*
+ * There should be a ## between the string literal and fmt, to make it
+ * clear that we're joining two strings together, but gcc doesn't support
+ * that preprocessor token.
+ */
+#define dprintf_dnode(dn, fmt, ...) do { \
+ if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
+ char __db_buf[32]; \
+ uint64_t __db_obj = (dn)->dn_object; \
+ if (__db_obj == DMU_META_DNODE_OBJECT) \
+ (void) strcpy(__db_buf, "mdn"); \
+ else \
+ (void) snprintf(__db_buf, sizeof (__db_buf), "%lld", \
+ (u_longlong_t)__db_obj);\
+ dprintf_ds((dn)->dn_objset->os_dsl_dataset, "obj=%s " fmt, \
+ __db_buf, __VA_ARGS__); \
+ } \
+_NOTE(CONSTCOND) } while (0)
+
+#define DNODE_VERIFY(dn) dnode_verify(dn)
+#define FREE_VERIFY(db, start, end, tx) free_verify(db, start, end, tx)
+
+#else
+
+#define dprintf_dnode(db, fmt, ...)
+#define DNODE_VERIFY(dn)
+#define FREE_VERIFY(db, start, end, tx)
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DNODE_H */
diff --git a/uts/common/fs/zfs/sys/dsl_dataset.h b/uts/common/fs/zfs/sys/dsl_dataset.h
new file mode 100644
index 000000000000..22733d070e8b
--- /dev/null
+++ b/uts/common/fs/zfs/sys/dsl_dataset.h
@@ -0,0 +1,283 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _SYS_DSL_DATASET_H
+#define _SYS_DSL_DATASET_H
+
+#include <sys/dmu.h>
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/zio.h>
+#include <sys/bplist.h>
+#include <sys/dsl_synctask.h>
+#include <sys/zfs_context.h>
+#include <sys/dsl_deadlist.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct dsl_dataset;
+struct dsl_dir;
+struct dsl_pool;
+
+#define DS_FLAG_INCONSISTENT (1ULL<<0)
+#define DS_IS_INCONSISTENT(ds) \
+ ((ds)->ds_phys->ds_flags & DS_FLAG_INCONSISTENT)
+/*
+ * NB: nopromote can not yet be set, but we want support for it in this
+ * on-disk version, so that we don't need to upgrade for it later. It
+ * will be needed when we implement 'zfs split' (where the split off
+ * clone should not be promoted).
+ */
+#define DS_FLAG_NOPROMOTE (1ULL<<1)
+
+/*
+ * DS_FLAG_UNIQUE_ACCURATE is set if ds_unique_bytes has been correctly
+ * calculated for head datasets (starting with SPA_VERSION_UNIQUE_ACCURATE,
+ * refquota/refreservations).
+ */
+#define DS_FLAG_UNIQUE_ACCURATE (1ULL<<2)
+
+/*
+ * DS_FLAG_DEFER_DESTROY is set after 'zfs destroy -d' has been called
+ * on a dataset. This allows the dataset to be destroyed using 'zfs release'.
+ */
+#define DS_FLAG_DEFER_DESTROY (1ULL<<3)
+#define DS_IS_DEFER_DESTROY(ds) \
+ ((ds)->ds_phys->ds_flags & DS_FLAG_DEFER_DESTROY)
+
+/*
+ * DS_FLAG_CI_DATASET is set if the dataset contains a file system whose
+ * name lookups should be performed case-insensitively.
+ */
+#define DS_FLAG_CI_DATASET (1ULL<<16)
+
+typedef struct dsl_dataset_phys {
+ uint64_t ds_dir_obj; /* DMU_OT_DSL_DIR */
+ uint64_t ds_prev_snap_obj; /* DMU_OT_DSL_DATASET */
+ uint64_t ds_prev_snap_txg;
+ uint64_t ds_next_snap_obj; /* DMU_OT_DSL_DATASET */
+ uint64_t ds_snapnames_zapobj; /* DMU_OT_DSL_DS_SNAP_MAP 0 for snaps */
+ uint64_t ds_num_children; /* clone/snap children; ==0 for head */
+ uint64_t ds_creation_time; /* seconds since 1970 */
+ uint64_t ds_creation_txg;
+ uint64_t ds_deadlist_obj; /* DMU_OT_DEADLIST */
+ uint64_t ds_used_bytes;
+ uint64_t ds_compressed_bytes;
+ uint64_t ds_uncompressed_bytes;
+ uint64_t ds_unique_bytes; /* only relevant to snapshots */
+ /*
+ * The ds_fsid_guid is a 56-bit ID that can change to avoid
+ * collisions. The ds_guid is a 64-bit ID that will never
+ * change, so there is a small probability that it will collide.
+ */
+ uint64_t ds_fsid_guid;
+ uint64_t ds_guid;
+ uint64_t ds_flags; /* DS_FLAG_* */
+ blkptr_t ds_bp;
+ uint64_t ds_next_clones_obj; /* DMU_OT_DSL_CLONES */
+ uint64_t ds_props_obj; /* DMU_OT_DSL_PROPS for snaps */
+ uint64_t ds_userrefs_obj; /* DMU_OT_USERREFS */
+ uint64_t ds_pad[5]; /* pad out to 320 bytes for good measure */
+} dsl_dataset_phys_t;
+
+typedef struct dsl_dataset {
+ /* Immutable: */
+ struct dsl_dir *ds_dir;
+ dsl_dataset_phys_t *ds_phys;
+ dmu_buf_t *ds_dbuf;
+ uint64_t ds_object;
+ uint64_t ds_fsid_guid;
+
+ /* only used in syncing context, only valid for non-snapshots: */
+ struct dsl_dataset *ds_prev;
+
+ /* has internal locking: */
+ dsl_deadlist_t ds_deadlist;
+ bplist_t ds_pending_deadlist;
+
+ /* to protect against multiple concurrent incremental recv */
+ kmutex_t ds_recvlock;
+
+ /* protected by lock on pool's dp_dirty_datasets list */
+ txg_node_t ds_dirty_link;
+ list_node_t ds_synced_link;
+
+ /*
+ * ds_phys->ds_<accounting> is also protected by ds_lock.
+ * Protected by ds_lock:
+ */
+ kmutex_t ds_lock;
+ objset_t *ds_objset;
+ uint64_t ds_userrefs;
+
+ /*
+ * ds_owner is protected by the ds_rwlock and the ds_lock
+ */
+ krwlock_t ds_rwlock;
+ kcondvar_t ds_exclusive_cv;
+ void *ds_owner;
+
+ /* no locking; only for making guesses */
+ uint64_t ds_trysnap_txg;
+
+ /* for objset_open() */
+ kmutex_t ds_opening_lock;
+
+ uint64_t ds_reserved; /* cached refreservation */
+ uint64_t ds_quota; /* cached refquota */
+
+ /* Protected by ds_lock; keep at end of struct for better locality */
+ char ds_snapname[MAXNAMELEN];
+} dsl_dataset_t;
+
+struct dsl_ds_destroyarg {
+ dsl_dataset_t *ds; /* ds to destroy */
+ dsl_dataset_t *rm_origin; /* also remove our origin? */
+ boolean_t is_origin_rm; /* set if removing origin snap */
+ boolean_t defer; /* destroy -d requested? */
+ boolean_t releasing; /* destroying due to release? */
+ boolean_t need_prep; /* do we need to retry due to EBUSY? */
+};
+
+/*
+ * The max length of a temporary tag prefix is the number of hex digits
+ * required to express UINT64_MAX plus one for the hyphen.
+ */
+#define MAX_TAG_PREFIX_LEN 17
+
+struct dsl_ds_holdarg { /* argument bundle for user hold requests */
+ dsl_sync_task_group_t *dstg; /* sync task group the work is attached to */
+ char *htag; /* hold tag, as in dsl_dataset_user_hold() */
+ char *snapname;
+ boolean_t recursive;
+ boolean_t gotone; /* NOTE(review): presumably set once any match is found -- confirm */
+ boolean_t temphold;
+ char failed[MAXPATHLEN]; /* NOTE(review): presumably the name that failed -- confirm */
+};
+
+#define dsl_dataset_is_snapshot(ds) /* true iff ds_phys->ds_num_children != 0 */ \
+ ((ds)->ds_phys->ds_num_children != 0)
+
+#define DS_UNIQUE_IS_ACCURATE(ds) /* tests DS_FLAG_UNIQUE_ACCURATE in ds_phys->ds_flags */ \
+ (((ds)->ds_phys->ds_flags & DS_FLAG_UNIQUE_ACCURATE) != 0)
+
+int dsl_dataset_hold(const char *name, void *tag, dsl_dataset_t **dsp);
+int dsl_dataset_hold_obj(struct dsl_pool *dp, uint64_t dsobj,
+ void *tag, dsl_dataset_t **);
+int dsl_dataset_own(const char *name, boolean_t inconsistentok,
+ void *tag, dsl_dataset_t **dsp);
+int dsl_dataset_own_obj(struct dsl_pool *dp, uint64_t dsobj,
+ boolean_t inconsistentok, void *tag, dsl_dataset_t **dsp);
+void dsl_dataset_name(dsl_dataset_t *ds, char *name); /* dprintf_ds() passes a MAXNAMELEN buffer */
+void dsl_dataset_rele(dsl_dataset_t *ds, void *tag);
+void dsl_dataset_disown(dsl_dataset_t *ds, void *tag);
+void dsl_dataset_drop_ref(dsl_dataset_t *ds, void *tag);
+boolean_t dsl_dataset_tryown(dsl_dataset_t *ds, boolean_t inconsistentok,
+ void *tag);
+void dsl_dataset_make_exclusive(dsl_dataset_t *ds, void *tag);
+void dsl_register_onexit_hold_cleanup(dsl_dataset_t *ds, const char *htag,
+ minor_t minor);
+uint64_t dsl_dataset_create_sync(dsl_dir_t *pds, const char *lastname,
+ dsl_dataset_t *origin, uint64_t flags, cred_t *, dmu_tx_t *);
+uint64_t dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
+ uint64_t flags, dmu_tx_t *tx);
+int dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer);
+int dsl_snapshots_destroy(char *fsname, char *snapname, boolean_t defer);
+dsl_checkfunc_t dsl_dataset_destroy_check; /* function declared via typedef: int (void *, void *, dmu_tx_t *) */
+dsl_syncfunc_t dsl_dataset_destroy_sync; /* function declared via typedef: void (void *, void *, dmu_tx_t *) */
+dsl_checkfunc_t dsl_dataset_snapshot_check;
+dsl_syncfunc_t dsl_dataset_snapshot_sync;
+dsl_syncfunc_t dsl_dataset_user_hold_sync;
+int dsl_dataset_rename(char *name, const char *newname, boolean_t recursive);
+int dsl_dataset_promote(const char *name, char *conflsnap);
+int dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head,
+ boolean_t force);
+int dsl_dataset_user_hold(char *dsname, char *snapname, char *htag,
+ boolean_t recursive, boolean_t temphold, int cleanup_fd);
+int dsl_dataset_user_hold_for_send(dsl_dataset_t *ds, char *htag,
+ boolean_t temphold);
+int dsl_dataset_user_release(char *dsname, char *snapname, char *htag,
+ boolean_t recursive);
+int dsl_dataset_user_release_tmp(struct dsl_pool *dp, uint64_t dsobj,
+ char *htag, boolean_t retry);
+int dsl_dataset_get_holds(const char *dsname, nvlist_t **nvp);
+
+blkptr_t *dsl_dataset_get_blkptr(dsl_dataset_t *ds);
+void dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx);
+
+spa_t *dsl_dataset_get_spa(dsl_dataset_t *ds);
+
+boolean_t dsl_dataset_modified_since_lastsnap(dsl_dataset_t *ds);
+
+/* Operates on the dataset itself (dsl_dataset_t), not on an objset. */
+void dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx);
+
+void dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp,
+ dmu_tx_t *tx);
+int dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp,
+ dmu_tx_t *tx, boolean_t async); /* unlike _born(), returns int */
+boolean_t dsl_dataset_block_freeable(dsl_dataset_t *ds, const blkptr_t *bp,
+ uint64_t blk_birth);
+uint64_t dsl_dataset_prev_snap_txg(dsl_dataset_t *ds);
+
+void dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx);
+/* Takes a dataset (dsl_dataset_t) and an nvlist; returns nothing. */
+void dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv);
+void dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat);
+/* NOTE(review): the four uint64_t pointers look like out-parameters -- confirm. */
+void dsl_dataset_space(dsl_dataset_t *ds,
+ uint64_t *refdbytesp, uint64_t *availbytesp,
+ uint64_t *usedobjsp, uint64_t *availobjsp);
+uint64_t dsl_dataset_fsid_guid(dsl_dataset_t *ds);
+
+int dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf);
+
+int dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota,
+ uint64_t asize, uint64_t inflight, uint64_t *used,
+ uint64_t *ref_rsrv);
+int dsl_dataset_set_quota(const char *dsname, zprop_source_t source,
+ uint64_t quota);
+dsl_syncfunc_t dsl_dataset_set_quota_sync; /* function declared via typedef: void (void *, void *, dmu_tx_t *) */
+int dsl_dataset_set_reservation(const char *dsname, zprop_source_t source,
+ uint64_t reservation);
+
+int dsl_destroy_inconsistent(const char *dsname, void *arg);
+
+#ifdef ZFS_DEBUG
+/*
+ * Debug print prefixed with "ds=<dataset name> ", emitted only when
+ * ZFS_DEBUG_DPRINTF is set in zfs_flags. The name is written into a
+ * temporary MAXNAMELEN buffer that is freed before the macro finishes.
+ * At least one argument must follow fmt, because __VA_ARGS__ is
+ * preceded by an unconditional comma.
+ */
+#define dprintf_ds(ds, fmt, ...) do { \
+ if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
+ char *__ds_name = kmem_alloc(MAXNAMELEN, KM_SLEEP); \
+ dsl_dataset_name(ds, __ds_name); \
+ dprintf("ds=%s " fmt, __ds_name, __VA_ARGS__); \
+ kmem_free(__ds_name, MAXNAMELEN); \
+ } \
+_NOTE(CONSTCOND) } while (0)
+#else
+/* Without ZFS_DEBUG the macro expands to nothing; arguments are not evaluated. */
+#define dprintf_ds(ds, fmt, ...)
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DSL_DATASET_H */
diff --git a/uts/common/fs/zfs/sys/dsl_deadlist.h b/uts/common/fs/zfs/sys/dsl_deadlist.h
new file mode 100644
index 000000000000..d2c16d72c17e
--- /dev/null
+++ b/uts/common/fs/zfs/sys/dsl_deadlist.h
@@ -0,0 +1,87 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _SYS_DSL_DEADLIST_H
+#define _SYS_DSL_DEADLIST_H
+
+#include <sys/bpobj.h>
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct dmu_buf;
+struct dsl_dataset;
+
+typedef struct dsl_deadlist_phys { /* 40 uint64_t words in total */
+ uint64_t dl_used;
+ uint64_t dl_comp;
+ uint64_t dl_uncomp;
+ uint64_t dl_pad[37]; /* pad out to 320 bytes (3 + 37 words) for future expansion */
+} dsl_deadlist_phys_t;
+
+typedef struct dsl_deadlist { /* in-memory handle; embedded in dsl_dataset_t as ds_deadlist */
+ objset_t *dl_os;
+ uint64_t dl_object;
+ avl_tree_t dl_tree; /* NOTE(review): presumably holds dsl_deadlist_entry_t nodes -- confirm */
+ boolean_t dl_havetree; /* NOTE(review): presumably whether dl_tree is populated -- confirm */
+ struct dmu_buf *dl_dbuf;
+ dsl_deadlist_phys_t *dl_phys;
+ kmutex_t dl_lock;
+
+ /* if it's the old on-disk format: */
+ bpobj_t dl_bpobj;
+ boolean_t dl_oldfmt;
+} dsl_deadlist_t;
+
+typedef struct dsl_deadlist_entry { /* one AVL-linked entry: a bpobj plus its mintxg */
+ avl_node_t dle_node; /* AVL linkage */
+ uint64_t dle_mintxg; /* NOTE(review): presumably the AVL sort key -- confirm */
+ bpobj_t dle_bpobj;
+} dsl_deadlist_entry_t;
+
+void dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object);
+void dsl_deadlist_close(dsl_deadlist_t *dl);
+uint64_t dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx);
+void dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx);
+void dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx);
+void dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx);
+void dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx);
+uint64_t dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg,
+ uint64_t mrs_obj, dmu_tx_t *tx);
+void dsl_deadlist_space(dsl_deadlist_t *dl,
+ uint64_t *usedp, uint64_t *compp, uint64_t *uncompp); /* void return; NOTE(review): results presumably via the three pointers -- confirm */
+void dsl_deadlist_space_range(dsl_deadlist_t *dl,
+ uint64_t mintxg, uint64_t maxtxg,
+ uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
+void dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx);
+void dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg,
+ dmu_tx_t *tx);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DSL_DEADLIST_H */
diff --git a/uts/common/fs/zfs/sys/dsl_deleg.h b/uts/common/fs/zfs/sys/dsl_deleg.h
new file mode 100644
index 000000000000..73c43bd23879
--- /dev/null
+++ b/uts/common/fs/zfs/sys/dsl_deleg.h
@@ -0,0 +1,78 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _SYS_DSL_DELEG_H
+#define _SYS_DSL_DELEG_H
+
+#include <sys/dmu.h>
+#include <sys/dsl_pool.h>
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define ZFS_DELEG_PERM_NONE "" /* the empty string */
+#define ZFS_DELEG_PERM_CREATE "create"
+#define ZFS_DELEG_PERM_DESTROY "destroy"
+#define ZFS_DELEG_PERM_SNAPSHOT "snapshot"
+#define ZFS_DELEG_PERM_ROLLBACK "rollback"
+#define ZFS_DELEG_PERM_CLONE "clone"
+#define ZFS_DELEG_PERM_PROMOTE "promote"
+#define ZFS_DELEG_PERM_RENAME "rename"
+#define ZFS_DELEG_PERM_MOUNT "mount"
+#define ZFS_DELEG_PERM_SHARE "share"
+#define ZFS_DELEG_PERM_SEND "send"
+#define ZFS_DELEG_PERM_RECEIVE "receive"
+#define ZFS_DELEG_PERM_ALLOW "allow"
+#define ZFS_DELEG_PERM_USERPROP "userprop"
+#define ZFS_DELEG_PERM_VSCAN "vscan"
+#define ZFS_DELEG_PERM_USERQUOTA "userquota"
+#define ZFS_DELEG_PERM_GROUPQUOTA "groupquota"
+#define ZFS_DELEG_PERM_USERUSED "userused"
+#define ZFS_DELEG_PERM_GROUPUSED "groupused"
+#define ZFS_DELEG_PERM_HOLD "hold"
+#define ZFS_DELEG_PERM_RELEASE "release"
+#define ZFS_DELEG_PERM_DIFF "diff" /* each macro above is the permission's string name */
+
+/*
+ * Note: the names of properties that are marked delegatable are also
+ * valid delegated permissions
+ */
+
+int dsl_deleg_get(const char *ddname, nvlist_t **nvp);
+int dsl_deleg_set(const char *ddname, nvlist_t *nvp, boolean_t unset);
+int dsl_deleg_access(const char *ddname, const char *perm, cred_t *cr); /* perm: a permission name string, e.g. a ZFS_DELEG_PERM_* value */
+int dsl_deleg_access_impl(struct dsl_dataset *ds, const char *perm, cred_t *cr); /* same check, given a dataset instead of a name */
+void dsl_deleg_set_create_perms(dsl_dir_t *dd, dmu_tx_t *tx, cred_t *cr);
+int dsl_deleg_can_allow(char *ddname, nvlist_t *nvp, cred_t *cr);
+int dsl_deleg_can_unallow(char *ddname, nvlist_t *nvp, cred_t *cr);
+int dsl_deleg_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx);
+boolean_t dsl_delegation_on(objset_t *os);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DSL_DELEG_H */
diff --git a/uts/common/fs/zfs/sys/dsl_dir.h b/uts/common/fs/zfs/sys/dsl_dir.h
new file mode 100644
index 000000000000..2191635dd813
--- /dev/null
+++ b/uts/common/fs/zfs/sys/dsl_dir.h
@@ -0,0 +1,167 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _SYS_DSL_DIR_H
+#define _SYS_DSL_DIR_H
+
+#include <sys/dmu.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_synctask.h>
+#include <sys/refcount.h>
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct dsl_dataset;
+
+typedef enum dd_used { /* categories of space accounting; values 0..4 */
+ DD_USED_HEAD,
+ DD_USED_SNAP,
+ DD_USED_CHILD,
+ DD_USED_CHILD_RSRV,
+ DD_USED_REFRSRV,
+ DD_USED_NUM /* count of categories (5); sizes dd_used_breakdown[] */
+} dd_used_t;
+
+#define DD_FLAG_USED_BREAKDOWN (1<<0) /* bit 0; NOTE(review): presumably a dd_flags bit marking dd_used_breakdown[] valid -- confirm */
+
+typedef struct dsl_dir_phys { /* 32 uint64_t words in total */
+ uint64_t dd_creation_time; /* not actually used */
+ uint64_t dd_head_dataset_obj;
+ uint64_t dd_parent_obj;
+ uint64_t dd_origin_obj;
+ uint64_t dd_child_dir_zapobj;
+ /*
+ * how much space our children are accounting for; for leaf
+ * datasets, == physical space used by fs + snaps
+ */
+ uint64_t dd_used_bytes;
+ uint64_t dd_compressed_bytes;
+ uint64_t dd_uncompressed_bytes;
+ /* Administrative quota setting */
+ uint64_t dd_quota;
+ /* Administrative reservation setting */
+ uint64_t dd_reserved;
+ uint64_t dd_props_zapobj;
+ uint64_t dd_deleg_zapobj; /* dataset delegation permissions */
+ uint64_t dd_flags;
+ uint64_t dd_used_breakdown[DD_USED_NUM]; /* one slot per dd_used_t category */
+ uint64_t dd_clones; /* dsl_dir objects */
+ uint64_t dd_pad[13]; /* pad out to 256 bytes (19 + 13 words) for good measure */
+} dsl_dir_phys_t;
+
+struct dsl_dir { /* in-memory directory state; fields grouped by locking rule */
+ /* These are immutable; no lock needed: */
+ uint64_t dd_object;
+ dsl_dir_phys_t *dd_phys; /* NOTE(review): presumably points into dd_dbuf's data -- confirm */
+ dmu_buf_t *dd_dbuf;
+ dsl_pool_t *dd_pool;
+
+ /* protected by lock on pool's dp_dirty_dirs list */
+ txg_node_t dd_dirty_link;
+
+ /* protected by dp_config_rwlock */
+ dsl_dir_t *dd_parent;
+
+ /* Protected by dd_lock */
+ kmutex_t dd_lock;
+ list_t dd_prop_cbs; /* list of dsl_prop_cb_record_t's */
+ timestruc_t dd_snap_cmtime; /* last time snapshot namespace changed */
+ uint64_t dd_origin_txg;
+
+ /* gross estimate of space used by in-flight tx's */
+ uint64_t dd_tempreserved[TXG_SIZE]; /* unsigned, unlike dd_space_towrite */
+ /* amount of space we expect to write; == amount of dirty data */
+ int64_t dd_space_towrite[TXG_SIZE]; /* signed */
+
+ /* protected by dd_lock; keep at end of struct for better locality */
+ char dd_myname[MAXNAMELEN];
+};
+
+void dsl_dir_close(dsl_dir_t *dd, void *tag);
+int dsl_dir_open(const char *name, void *tag, dsl_dir_t **, const char **tail);
+int dsl_dir_open_spa(spa_t *spa, const char *name, void *tag, dsl_dir_t **,
+ const char **tailp);
+int dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj,
+ const char *tail, void *tag, dsl_dir_t **);
+void dsl_dir_name(dsl_dir_t *dd, char *buf); /* dprintf_dd() passes MAXNAMELEN + strlen(MOS_DIR_NAME) + 1 bytes */
+int dsl_dir_namelen(dsl_dir_t *dd);
+uint64_t dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds,
+ const char *name, dmu_tx_t *tx);
+dsl_checkfunc_t dsl_dir_destroy_check; /* function declared via typedef: int (void *, void *, dmu_tx_t *) */
+dsl_syncfunc_t dsl_dir_destroy_sync; /* function declared via typedef: void (void *, void *, dmu_tx_t *) */
+void dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv);
+uint64_t dsl_dir_space_available(dsl_dir_t *dd,
+ dsl_dir_t *ancestor, int64_t delta, int ondiskonly);
+void dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx);
+void dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx);
+int dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t mem,
+ uint64_t asize, uint64_t fsize, uint64_t usize, void **tr_cookiep,
+ dmu_tx_t *tx);
+void dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx); /* NOTE(review): tr_cookie presumably comes from *tr_cookiep above -- confirm */
+void dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx);
+void dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type,
+ int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx); /* deltas are signed */
+void dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta,
+ dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx);
+int dsl_dir_set_quota(const char *ddname, zprop_source_t source,
+ uint64_t quota);
+int dsl_dir_set_reservation(const char *ddname, zprop_source_t source,
+ uint64_t reservation);
+int dsl_dir_rename(dsl_dir_t *dd, const char *newname);
+int dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, uint64_t space);
+int dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx); /* same signature as dsl_checkfunc_t */
+boolean_t dsl_dir_is_clone(dsl_dir_t *dd);
+void dsl_dir_new_refreservation(dsl_dir_t *dd, struct dsl_dataset *ds,
+ uint64_t reservation, cred_t *cr, dmu_tx_t *tx);
+void dsl_dir_snap_cmtime_update(dsl_dir_t *dd);
+timestruc_t dsl_dir_snap_cmtime(dsl_dir_t *dd); /* returns timestruc_t by value, the type of dd_snap_cmtime */
+
+/* internal reserved dir names; each begins with '$' */
+#define MOS_DIR_NAME "$MOS"
+#define ORIGIN_DIR_NAME "$ORIGIN"
+#define XLATION_DIR_NAME "$XLATION"
+#define FREE_DIR_NAME "$FREE"
+
+#ifdef ZFS_DEBUG
+#define dprintf_dd(dd, fmt, ...) do { /* debug print prefixed "dd=<name> " */ \
+ if (zfs_flags & ZFS_DEBUG_DPRINTF) { /* only when dprintf debugging is on */ \
+ char *__ds_name = kmem_alloc(MAXNAMELEN + strlen(MOS_DIR_NAME) + 1, \
+ KM_SLEEP); \
+ dsl_dir_name(dd, __ds_name); \
+ dprintf("dd=%s " fmt, __ds_name, __VA_ARGS__); /* needs >= 1 argument after fmt */ \
+ kmem_free(__ds_name, MAXNAMELEN + strlen(MOS_DIR_NAME) + 1); /* same size as the alloc */ \
+ } \
+_NOTE(CONSTCOND) } while (0)
+#else /* !ZFS_DEBUG: expands to nothing; arguments are not evaluated */
+#define dprintf_dd(dd, fmt, ...)
+#endif /* ZFS_DEBUG */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DSL_DIR_H */
diff --git a/uts/common/fs/zfs/sys/dsl_pool.h b/uts/common/fs/zfs/sys/dsl_pool.h
new file mode 100644
index 000000000000..7d25bd7c020d
--- /dev/null
+++ b/uts/common/fs/zfs/sys/dsl_pool.h
@@ -0,0 +1,151 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _SYS_DSL_POOL_H
+#define _SYS_DSL_POOL_H
+
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/txg_impl.h>
+#include <sys/zfs_context.h>
+#include <sys/zio.h>
+#include <sys/dnode.h>
+#include <sys/ddt.h>
+#include <sys/arc.h>
+#include <sys/bpobj.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct objset;
+struct dsl_dir;
+struct dsl_dataset;
+struct dsl_pool;
+struct dmu_tx;
+struct dsl_scan;
+
+/*
+ * These macros are for indexing into the zfs_all_blkstats_t: they alias
+ * DMU_OT_NONE and DMU_OT_NUMTYPES as the "deferred" and "total" slots. */
+#define DMU_OT_DEFERRED DMU_OT_NONE
+#define DMU_OT_TOTAL DMU_OT_NUMTYPES
+
+typedef struct zfs_blkstat { /* one cell of zfs_all_blkstats_t.zab_type[][]; all fields uint64_t */
+ uint64_t zb_count;
+ uint64_t zb_asize;
+ uint64_t zb_lsize;
+ uint64_t zb_psize;
+ uint64_t zb_gangs;
+ uint64_t zb_ditto_2_of_2_samevdev;
+ uint64_t zb_ditto_2_of_3_samevdev;
+ uint64_t zb_ditto_3_of_3_samevdev;
+} zfs_blkstat_t;
+
+typedef struct zfs_all_blkstats { /* the +1 sizes make DN_MAX_LEVELS and DMU_OT_TOTAL valid indices */
+ zfs_blkstat_t zab_type[DN_MAX_LEVELS + 1][DMU_OT_TOTAL + 1];
+} zfs_all_blkstats_t;
+
+
+typedef struct dsl_pool { /* in-memory pool state; fields grouped by locking rule */
+ /* Immutable */
+ spa_t *dp_spa;
+ struct objset *dp_meta_objset;
+ struct dsl_dir *dp_root_dir;
+ struct dsl_dir *dp_mos_dir; /* NOTE(review): presumably the MOS_DIR_NAME ("$MOS") dir -- confirm */
+ struct dsl_dir *dp_free_dir; /* NOTE(review): presumably the FREE_DIR_NAME ("$FREE") dir -- confirm */
+ struct dsl_dataset *dp_origin_snap;
+ uint64_t dp_root_dir_obj;
+ struct taskq *dp_vnrele_taskq;
+
+ /* No lock needed - sync context only */
+ blkptr_t dp_meta_rootbp;
+ list_t dp_synced_datasets;
+ hrtime_t dp_read_overhead;
+ uint64_t dp_throughput; /* bytes per millisec */
+ uint64_t dp_write_limit;
+ uint64_t dp_tmp_userrefs_obj;
+ bpobj_t dp_free_bpobj;
+
+ struct dsl_scan *dp_scan;
+
+ /* Uses dp_lock */
+ kmutex_t dp_lock;
+ uint64_t dp_space_towrite[TXG_SIZE];
+ uint64_t dp_tempreserved[TXG_SIZE];
+
+ /* Has its own locking */
+ tx_state_t dp_tx;
+ txg_list_t dp_dirty_datasets;
+ txg_list_t dp_dirty_dirs;
+ txg_list_t dp_sync_tasks;
+
+ /*
+ * Protects administrative changes (properties, namespace).
+ * It is only held for write in syncing context. Therefore
+ * syncing context does not need to ever have it for read, since
+ * nobody else could possibly have it for write.
+ */
+ krwlock_t dp_config_rwlock;
+
+ zfs_all_blkstats_t *dp_blkstats;
+} dsl_pool_t;
+
+int dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp);
+void dsl_pool_close(dsl_pool_t *dp);
+dsl_pool_t *dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg);
+void dsl_pool_sync(dsl_pool_t *dp, uint64_t txg);
+void dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg);
+int dsl_pool_sync_context(dsl_pool_t *dp);
+uint64_t dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree);
+uint64_t dsl_pool_adjustedfree(dsl_pool_t *dp, boolean_t netfree);
+int dsl_pool_tempreserve_space(dsl_pool_t *dp, uint64_t space, dmu_tx_t *tx);
+void dsl_pool_tempreserve_clear(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx); /* note: space is int64_t here, uint64_t in _space() */
+void dsl_pool_memory_pressure(dsl_pool_t *dp);
+void dsl_pool_willuse_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx);
+void dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp);
+void dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg,
+ const blkptr_t *bpp);
+int dsl_read(zio_t *pio, spa_t *spa, const blkptr_t *bpp, arc_buf_t *pbuf,
+ arc_done_func_t *done, void *private, int priority, int zio_flags,
+ uint32_t *arc_flags, const zbookmark_t *zb);
+int dsl_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bpp,
+ arc_done_func_t *done, void *private, int priority, int zio_flags,
+ uint32_t *arc_flags, const zbookmark_t *zb); /* same as dsl_read() minus the pbuf parameter */
+void dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx);
+void dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx);
+void dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx);
+
+taskq_t *dsl_pool_vnrele_taskq(dsl_pool_t *dp);
+
+extern int dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj,
+ const char *tag, uint64_t *now, dmu_tx_t *tx);
+extern int dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj,
+ const char *tag, dmu_tx_t *tx);
+extern void dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp);
+int dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DSL_POOL_H */
diff --git a/uts/common/fs/zfs/sys/dsl_prop.h b/uts/common/fs/zfs/sys/dsl_prop.h
new file mode 100644
index 000000000000..a636ad35096b
--- /dev/null
+++ b/uts/common/fs/zfs/sys/dsl_prop.h
@@ -0,0 +1,119 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _SYS_DSL_PROP_H
+#define _SYS_DSL_PROP_H
+
+#include <sys/dmu.h>
+#include <sys/dsl_pool.h>
+#include <sys/zfs_context.h>
+#include <sys/dsl_synctask.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct dsl_dataset;
+struct dsl_dir;
+
+/* The callback func may not call into the DMU or DSL! This is a function type, so users hold a dsl_prop_changed_cb_t *. */
+typedef void (dsl_prop_changed_cb_t)(void *arg, uint64_t newval);
+
+typedef struct dsl_prop_cb_record { /* one registered property-change callback */
+ list_node_t cbr_node; /* link on dd_prop_cbs */
+ struct dsl_dataset *cbr_ds;
+ const char *cbr_propname;
+ dsl_prop_changed_cb_t *cbr_func; /* pointer to the callback function */
+ void *cbr_arg; /* NOTE(review): presumably passed as the callback's arg -- confirm */
+} dsl_prop_cb_record_t;
+
+typedef struct dsl_props_arg { /* an nvlist of properties plus their source */
+ nvlist_t *pa_props;
+ zprop_source_t pa_source;
+} dsl_props_arg_t;
+
+typedef struct dsl_prop_set_arg { /* note: struct tag is ..._set_arg, typedef is ..._setarg_t */
+ const char *psa_name;
+ zprop_source_t psa_source;
+ int psa_intsz; /* NOTE(review): presumably size in bytes of each integer in psa_value -- confirm */
+ int psa_numints; /* NOTE(review): presumably the number of integers in psa_value -- confirm */
+ const void *psa_value;
+
+ /*
+ * Used to handle the special requirements of the quota and reservation
+ * properties.
+ */
+ uint64_t psa_effective_value;
+} dsl_prop_setarg_t;
+
+int dsl_prop_register(struct dsl_dataset *ds, const char *propname,
+ dsl_prop_changed_cb_t *callback, void *cbarg);
+int dsl_prop_unregister(struct dsl_dataset *ds, const char *propname,
+ dsl_prop_changed_cb_t *callback, void *cbarg); /* same parameter list as dsl_prop_register() */
+int dsl_prop_numcb(struct dsl_dataset *ds);
+
+int dsl_prop_get(const char *ddname, const char *propname,
+ int intsz, int numints, void *buf, char *setpoint);
+int dsl_prop_get_integer(const char *ddname, const char *propname,
+ uint64_t *valuep, char *setpoint);
+int dsl_prop_get_all(objset_t *os, nvlist_t **nvp);
+int dsl_prop_get_received(objset_t *os, nvlist_t **nvp);
+int dsl_prop_get_ds(struct dsl_dataset *ds, const char *propname,
+ int intsz, int numints, void *buf, char *setpoint);
+int dsl_prop_get_dd(struct dsl_dir *dd, const char *propname,
+ int intsz, int numints, void *buf, char *setpoint,
+ boolean_t snapshot);
+
+dsl_syncfunc_t dsl_props_set_sync; /* function declared via typedef: void (void *, void *, dmu_tx_t *) */
+int dsl_prop_set(const char *ddname, const char *propname,
+ zprop_source_t source, int intsz, int numints, const void *buf);
+int dsl_props_set(const char *dsname, zprop_source_t source, nvlist_t *nvl);
+void dsl_dir_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val,
+ dmu_tx_t *tx);
+
+void dsl_prop_setarg_init_uint64(dsl_prop_setarg_t *psa, const char *propname,
+ zprop_source_t source, uint64_t *value);
+int dsl_prop_predict_sync(dsl_dir_t *dd, dsl_prop_setarg_t *psa);
+#ifdef ZFS_DEBUG
+void dsl_prop_check_prediction(dsl_dir_t *dd, dsl_prop_setarg_t *psa);
+#define DSL_PROP_CHECK_PREDICTION(dd, psa) /* debug builds: forwards to the function above */ \
+ dsl_prop_check_prediction((dd), (psa))
+#else /* !ZFS_DEBUG */
+#define DSL_PROP_CHECK_PREDICTION(dd, psa) /* nothing; arguments are not evaluated */
+#endif /* ZFS_DEBUG */
+
+/* flag first receive on or after SPA_VERSION_RECVD_PROPS: query, set and clear it */
+boolean_t dsl_prop_get_hasrecvd(objset_t *os);
+void dsl_prop_set_hasrecvd(objset_t *os);
+void dsl_prop_unset_hasrecvd(objset_t *os);
+
+void dsl_prop_nvlist_add_uint64(nvlist_t *nv, zfs_prop_t prop, uint64_t value);
+void dsl_prop_nvlist_add_string(nvlist_t *nv,
+ zfs_prop_t prop, const char *value);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DSL_PROP_H */
diff --git a/uts/common/fs/zfs/sys/dsl_scan.h b/uts/common/fs/zfs/sys/dsl_scan.h
new file mode 100644
index 000000000000..c79666e67de0
--- /dev/null
+++ b/uts/common/fs/zfs/sys/dsl_scan.h
@@ -0,0 +1,108 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _SYS_DSL_SCAN_H
+#define _SYS_DSL_SCAN_H
+
+#include <sys/zfs_context.h>
+#include <sys/zio.h>
+#include <sys/ddt.h>
+#include <sys/bplist.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct objset;
+struct dsl_dir;
+struct dsl_dataset;
+struct dsl_pool;
+struct dmu_tx;
+
+/*
+ * All members of this structure must be uint64_t, for byteswap
+ * purposes. NOTE(review): scn_ddt_bookmark and scn_bookmark are struct-typed; presumably they consist solely of uint64_t fields -- confirm.
+ */
+typedef struct dsl_scan_phys {
+ uint64_t scn_func; /* pool_scan_func_t */
+ uint64_t scn_state; /* dsl_scan_state_t */
+ uint64_t scn_queue_obj;
+ uint64_t scn_min_txg;
+ uint64_t scn_max_txg;
+ uint64_t scn_cur_min_txg;
+ uint64_t scn_cur_max_txg;
+ uint64_t scn_start_time;
+ uint64_t scn_end_time;
+ uint64_t scn_to_examine; /* total bytes to be scanned */
+ uint64_t scn_examined; /* bytes scanned so far */
+ uint64_t scn_to_process;
+ uint64_t scn_processed;
+ uint64_t scn_errors; /* scan I/O error count */
+ uint64_t scn_ddt_class_max;
+ ddt_bookmark_t scn_ddt_bookmark;
+ zbookmark_t scn_bookmark;
+ uint64_t scn_flags; /* dsl_scan_flags_t */
+} dsl_scan_phys_t;
+
+#define SCAN_PHYS_NUMINTS (sizeof (dsl_scan_phys_t) / sizeof (uint64_t)) /* size of dsl_scan_phys_t in uint64_t words */
+
+typedef enum dsl_scan_flags { /* bit values kept in dsl_scan_phys_t.scn_flags */
+ DSF_VISIT_DS_AGAIN = 1<<0, /* bit 0 */
+} dsl_scan_flags_t;
+
+typedef struct dsl_scan { /* in-memory scan state; dsl_pool_t points to it via dp_scan */
+ struct dsl_pool *scn_dp;
+
+ boolean_t scn_pausing;
+ uint64_t scn_restart_txg;
+ uint64_t scn_sync_start_time;
+ zio_t *scn_zio_root;
+
+ /* for debugging / information */
+ uint64_t scn_visited_this_txg;
+
+ dsl_scan_phys_t scn_phys; /* embedded by value, not a pointer */
+} dsl_scan_t;
+
+int dsl_scan_init(struct dsl_pool *dp, uint64_t txg);
+void dsl_scan_fini(struct dsl_pool *dp);
+void dsl_scan_sync(struct dsl_pool *, dmu_tx_t *);
+int dsl_scan_cancel(struct dsl_pool *);
+int dsl_scan(struct dsl_pool *, pool_scan_func_t); /* second argument selects the scan function (see scn_func) */
+void dsl_resilver_restart(struct dsl_pool *, uint64_t txg);
+boolean_t dsl_scan_resilvering(struct dsl_pool *dp);
+boolean_t dsl_dataset_unstable(struct dsl_dataset *ds);
+void dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
+ ddt_entry_t *dde, dmu_tx_t *tx);
+void dsl_scan_ds_destroyed(struct dsl_dataset *ds, struct dmu_tx *tx);
+void dsl_scan_ds_snapshotted(struct dsl_dataset *ds, struct dmu_tx *tx);
+void dsl_scan_ds_clone_swapped(struct dsl_dataset *ds1, struct dsl_dataset *ds2,
+ struct dmu_tx *tx);
+boolean_t dsl_scan_active(dsl_scan_t *scn);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DSL_SCAN_H */
diff --git a/uts/common/fs/zfs/sys/dsl_synctask.h b/uts/common/fs/zfs/sys/dsl_synctask.h
new file mode 100644
index 000000000000..9126290cdb5b
--- /dev/null
+++ b/uts/common/fs/zfs/sys/dsl_synctask.h
@@ -0,0 +1,79 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _SYS_DSL_SYNCTASK_H
+#define _SYS_DSL_SYNCTASK_H
+
+#include <sys/txg.h>
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct dsl_pool;
+
+typedef int (dsl_checkfunc_t)(void *, void *, dmu_tx_t *); /* function type (not pointer): returns int */
+typedef void (dsl_syncfunc_t)(void *, void *, dmu_tx_t *); /* function type (not pointer): returns nothing */
+
+typedef struct dsl_sync_task { /* one check/sync function pair plus its two arguments */
+ list_node_t dst_node; /* NOTE(review): presumably links into dstg_tasks -- confirm */
+ dsl_checkfunc_t *dst_checkfunc;
+ dsl_syncfunc_t *dst_syncfunc;
+ void *dst_arg1; /* NOTE(review): presumably the first void * passed to both functions -- confirm */
+ void *dst_arg2; /* NOTE(review): presumably the second void * passed to both functions -- confirm */
+ int dst_err;
+} dsl_sync_task_t;
+
+typedef struct dsl_sync_task_group { /* a list of sync tasks tied to one pool */
+ txg_node_t dstg_node; /* NOTE(review): presumably links into dp_sync_tasks -- confirm */
+ list_t dstg_tasks;
+ struct dsl_pool *dstg_pool;
+ uint64_t dstg_txg;
+ int dstg_err;
+ int dstg_space;
+ boolean_t dstg_nowaiter;
+} dsl_sync_task_group_t;
+
+dsl_sync_task_group_t *dsl_sync_task_group_create(struct dsl_pool *dp);
+void dsl_sync_task_create(dsl_sync_task_group_t *dstg,
+ dsl_checkfunc_t *, dsl_syncfunc_t *,
+ void *arg1, void *arg2, int blocks_modified);
+int dsl_sync_task_group_wait(dsl_sync_task_group_t *dstg);
+void dsl_sync_task_group_nowait(dsl_sync_task_group_t *dstg, dmu_tx_t *tx);
+void dsl_sync_task_group_destroy(dsl_sync_task_group_t *dstg);
+void dsl_sync_task_group_sync(dsl_sync_task_group_t *dstg, dmu_tx_t *tx);
+
+int dsl_sync_task_do(struct dsl_pool *dp,
+ dsl_checkfunc_t *checkfunc, dsl_syncfunc_t *syncfunc,
+ void *arg1, void *arg2, int blocks_modified);
+void dsl_sync_task_do_nowait(struct dsl_pool *dp,
+ dsl_checkfunc_t *checkfunc, dsl_syncfunc_t *syncfunc,
+ void *arg1, void *arg2, int blocks_modified, dmu_tx_t *tx); /* as dsl_sync_task_do() plus a tx; returns nothing */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DSL_SYNCTASK_H */
diff --git a/uts/common/fs/zfs/sys/metaslab.h b/uts/common/fs/zfs/sys/metaslab.h
new file mode 100644
index 000000000000..583d6303bd5a
--- /dev/null
+++ b/uts/common/fs/zfs/sys/metaslab.h
@@ -0,0 +1,80 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _SYS_METASLAB_H
+#define _SYS_METASLAB_H
+
+#include <sys/spa.h>
+#include <sys/space_map.h>
+#include <sys/txg.h>
+#include <sys/zio.h>
+#include <sys/avl.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Pointer to a space map operations vector; defined elsewhere. */
+extern space_map_ops_t *zfs_metaslab_ops;
+
+/* Per-metaslab setup/teardown and per-txg sync entry points. */
+extern metaslab_t *metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo,
+ uint64_t start, uint64_t size, uint64_t txg);
+extern void metaslab_fini(metaslab_t *msp);
+extern void metaslab_sync(metaslab_t *msp, uint64_t txg);
+extern void metaslab_sync_done(metaslab_t *msp, uint64_t txg);
+extern void metaslab_sync_reassess(metaslab_group_t *mg);
+
+/*
+ * Flag values; presumably OR-ed into the 'flags' argument of
+ * metaslab_alloc() -- confirm in metaslab.c. METASLAB_HINTBP_FAVOR is 0,
+ * so it cannot be tested as a bit; it is the absence of
+ * METASLAB_HINTBP_AVOID.
+ */
+#define METASLAB_HINTBP_FAVOR 0x0
+#define METASLAB_HINTBP_AVOID 0x1
+#define METASLAB_GANG_HEADER 0x2
+
+/* Block allocation, free and claim, all keyed by pool and txg. */
+extern int metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
+ blkptr_t *bp, int ncopies, uint64_t txg, blkptr_t *hintbp, int flags);
+extern void metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg,
+ boolean_t now);
+extern int metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg);
+
+/* Metaslab class creation, destruction and validation. */
+extern metaslab_class_t *metaslab_class_create(spa_t *spa,
+ space_map_ops_t *ops);
+extern void metaslab_class_destroy(metaslab_class_t *mc);
+extern int metaslab_class_validate(metaslab_class_t *mc);
+
+/*
+ * Class-wide space accounting: one update call taking four signed deltas,
+ * and one unsigned getter per total.
+ */
+extern void metaslab_class_space_update(metaslab_class_t *mc,
+ int64_t alloc_delta, int64_t defer_delta,
+ int64_t space_delta, int64_t dspace_delta);
+extern uint64_t metaslab_class_get_alloc(metaslab_class_t *mc);
+extern uint64_t metaslab_class_get_space(metaslab_class_t *mc);
+extern uint64_t metaslab_class_get_dspace(metaslab_class_t *mc);
+extern uint64_t metaslab_class_get_deferred(metaslab_class_t *mc);
+
+/* Metaslab group creation/destruction and activation/passivation. */
+extern metaslab_group_t *metaslab_group_create(metaslab_class_t *mc,
+ vdev_t *vd);
+extern void metaslab_group_destroy(metaslab_group_t *mg);
+extern void metaslab_group_activate(metaslab_group_t *mg);
+extern void metaslab_group_passivate(metaslab_group_t *mg);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_METASLAB_H */
diff --git a/uts/common/fs/zfs/sys/metaslab_impl.h b/uts/common/fs/zfs/sys/metaslab_impl.h
new file mode 100644
index 000000000000..07988dd51a73
--- /dev/null
+++ b/uts/common/fs/zfs/sys/metaslab_impl.h
@@ -0,0 +1,89 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_METASLAB_IMPL_H
+#define _SYS_METASLAB_IMPL_H
+
+#include <sys/metaslab.h>
+#include <sys/space_map.h>
+#include <sys/vdev.h>
+#include <sys/txg.h>
+#include <sys/avl.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Body of the struct behind the metaslab_class_t typedef. The four space
+ * totals carry their own descriptions below; they appear to correspond to
+ * the metaslab_class_get_{alloc,deferred,space,dspace}() accessors
+ * declared in <sys/metaslab.h> -- confirm in metaslab.c.
+ */
+struct metaslab_class {
+ spa_t *mc_spa;
+ metaslab_group_t *mc_rotor;
+ space_map_ops_t *mc_ops;
+ uint64_t mc_aliquot;
+ uint64_t mc_alloc; /* total allocated space */
+ uint64_t mc_deferred; /* total deferred frees */
+ uint64_t mc_space; /* total space (alloc + free) */
+ uint64_t mc_dspace; /* total deflated space */
+};
+
+/*
+ * Body of the struct behind the metaslab_group_t typedef. A group points
+ * back at a class (mg_class) and at a vdev (mg_vd), owns an AVL tree
+ * (mg_metaslab_tree; struct metaslab carries a matching ms_group_node),
+ * and has mg_prev/mg_next pointers to other groups. mg_lock presumably
+ * protects the tree -- confirm in metaslab.c.
+ */
+struct metaslab_group {
+ kmutex_t mg_lock;
+ avl_tree_t mg_metaslab_tree;
+ uint64_t mg_aliquot;
+ uint64_t mg_bonus_area;
+ int64_t mg_bias;
+ int64_t mg_activation_count;
+ metaslab_class_t *mg_class;
+ vdev_t *mg_vd;
+ metaslab_group_t *mg_prev;
+ metaslab_group_t *mg_next;
+};
+
+/*
+ * Each metaslab's free space is tracked in a space map object in the MOS,
+ * which is only updated in syncing context. Each time we sync a txg,
+ * we append the allocs and frees from that txg to the space map object.
+ * When the txg is done syncing, metaslab_sync_done() updates ms_smo
+ * to ms_smo_syncing. Everything in ms_smo is always safe to allocate.
+ */
+struct metaslab {
+ kmutex_t ms_lock; /* metaslab lock */
+ space_map_obj_t ms_smo; /* synced space map object */
+ space_map_obj_t ms_smo_syncing; /* syncing space map object */
+ space_map_t ms_allocmap[TXG_SIZE]; /* allocated this txg */
+ space_map_t ms_freemap[TXG_SIZE]; /* freed this txg */
+ space_map_t ms_defermap[TXG_DEFER_SIZE]; /* deferred frees */
+ space_map_t ms_map; /* in-core free space map */
+ int64_t ms_deferspace; /* sum of ms_defermap[] space */
+ uint64_t ms_weight; /* weight vs. others in group */
+ metaslab_group_t *ms_group; /* metaslab group */
+ avl_node_t ms_group_node; /* node in metaslab group tree */
+ txg_node_t ms_txg_node; /* per-txg dirty metaslab links */
+};
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_METASLAB_IMPL_H */
diff --git a/uts/common/fs/zfs/sys/refcount.h b/uts/common/fs/zfs/sys/refcount.h
new file mode 100644
index 000000000000..1752c64e3e8b
--- /dev/null
+++ b/uts/common/fs/zfs/sys/refcount.h
@@ -0,0 +1,107 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _SYS_REFCOUNT_H
+#define _SYS_REFCOUNT_H
+
+#include <sys/inttypes.h>
+#include <sys/list.h>
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * If the reference is held only by the calling function and not any
+ * particular object, use FTAG (which is a string) for the holder_tag.
+ * Otherwise, use the object that holds the reference.
+ */
+#define FTAG ((char *)__func__)
+
+#ifdef ZFS_DEBUG
+/*
+ * Debug-only (ZFS_DEBUG) record with a list linkage, a holder pointer and
+ * a number. Presumably ref_holder is the holder_tag passed to
+ * refcount_add*() and ref_number the count passed to refcount_add_many();
+ * confirm in refcount.c.
+ */
+typedef struct reference {
+ list_node_t ref_link;
+ void *ref_holder;
+ uint64_t ref_number;
+ uint8_t *ref_removed;
+} reference_t;
+
+/*
+ * Debug-only (ZFS_DEBUG) reference count: a mutex, two lists (rc_list and
+ * rc_removed) and a signed count for each. Presumably the lists hold
+ * reference_t entries for current and removed holds; confirm in
+ * refcount.c. The non-debug definition further down is a bare counter.
+ */
+typedef struct refcount {
+ kmutex_t rc_mtx;
+ list_t rc_list;
+ list_t rc_removed;
+ int64_t rc_count;
+ int64_t rc_removed_count;
+} refcount_t;
+
+/* Note: refcount_t must be initialized with refcount_create() */
+
+void refcount_create(refcount_t *rc);
+void refcount_destroy(refcount_t *rc);
+void refcount_destroy_many(refcount_t *rc, uint64_t number);
+int refcount_is_zero(refcount_t *rc);
+int64_t refcount_count(refcount_t *rc);
+int64_t refcount_add(refcount_t *rc, void *holder_tag);
+int64_t refcount_remove(refcount_t *rc, void *holder_tag);
+int64_t refcount_add_many(refcount_t *rc, uint64_t number, void *holder_tag);
+int64_t refcount_remove_many(refcount_t *rc, uint64_t number, void *holder_tag);
+void refcount_transfer(refcount_t *dst, refcount_t *src);
+
+void refcount_init(void);
+void refcount_fini(void);
+
+#else /* ZFS_DEBUG */
+
+/*
+ * Non-debug build: a reference count is a bare 64-bit counter and the
+ * holder tag arguments are accepted but ignored. The add/remove macros
+ * evaluate to whatever atomic_add_64_nv() returns.
+ *
+ * refcount_create(), refcount_destroy() and refcount_destroy_many() are
+ * plain (non-atomic) stores of zero.
+ */
+typedef struct refcount {
+ uint64_t rc_count;
+} refcount_t;
+
+#define refcount_create(rc) ((rc)->rc_count = 0)
+#define refcount_destroy(rc) ((rc)->rc_count = 0)
+#define refcount_destroy_many(rc, number) ((rc)->rc_count = 0)
+#define refcount_is_zero(rc) ((rc)->rc_count == 0)
+#define refcount_count(rc) ((rc)->rc_count)
+#define refcount_add(rc, holder) atomic_add_64_nv(&(rc)->rc_count, 1)
+#define refcount_remove(rc, holder) atomic_add_64_nv(&(rc)->rc_count, -1)
+/*
+ * 'number' is parenthesized so that expression arguments expand correctly;
+ * without it, refcount_remove_many(rc, a - b, tag) would add (-a - b).
+ */
+#define refcount_add_many(rc, number, holder) \
+ atomic_add_64_nv(&(rc)->rc_count, (number))
+#define refcount_remove_many(rc, number, holder) \
+ atomic_add_64_nv(&(rc)->rc_count, -(number))
+/*
+ * Move src's whole count to dst. The read of src's count and the two
+ * atomic updates are separate steps, so the transfer as a whole is not
+ * atomic. Expands to a braced block, so it is a statement, not an
+ * expression.
+ */
+#define refcount_transfer(dst, src) { \
+ uint64_t __tmp = (src)->rc_count; \
+ atomic_add_64(&(src)->rc_count, -__tmp); \
+ atomic_add_64(&(dst)->rc_count, __tmp); \
+}
+
+/* Global init/fini expand to nothing in the non-debug build. */
+#define refcount_init()
+#define refcount_fini()
+
+#endif /* ZFS_DEBUG */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_REFCOUNT_H */
diff --git a/uts/common/fs/zfs/sys/rrwlock.h b/uts/common/fs/zfs/sys/rrwlock.h
new file mode 100644
index 000000000000..19a43c97fc3c
--- /dev/null
+++ b/uts/common/fs/zfs/sys/rrwlock.h
@@ -0,0 +1,80 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_RR_RW_LOCK_H
+#define _SYS_RR_RW_LOCK_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/inttypes.h>
+#include <sys/zfs_context.h>
+#include <sys/refcount.h>
+
+/*
+ * A reader-writer lock implementation that allows re-entrant reads, but
+ * still gives writers priority on "new" reads.
+ *
+ * See rrwlock.c for more details about the implementation.
+ *
+ * Fields of the rrwlock_t structure:
+ * - rr_lock: protects modification and reading of rrwlock_t fields
+ * - rr_cv: cv for waking up readers or waiting writers
+ * - rr_writer: thread id of the current writer
+ * - rr_anon_rcount: number of active anonymous readers
+ * - rr_linked_rcount: total number of non-anonymous active readers
+ * - rr_writer_wanted: a writer wants the lock
+ */
+typedef struct rrwlock {
+ kmutex_t rr_lock;
+ kcondvar_t rr_cv;
+ kthread_t *rr_writer;
+ refcount_t rr_anon_rcount;
+ refcount_t rr_linked_rcount;
+ boolean_t rr_writer_wanted;
+} rrwlock_t;
+
+/*
+ * 'tag' is used in reference counting tracking. The
+ * 'tag' must be the same in a rrw_enter() as in its
+ * corresponding rrw_exit().
+ */
+void rrw_init(rrwlock_t *rrl);
+void rrw_destroy(rrwlock_t *rrl);
+void rrw_enter(rrwlock_t *rrl, krw_t rw, void *tag);
+void rrw_exit(rrwlock_t *rrl, void *tag);
+boolean_t rrw_held(rrwlock_t *rrl, krw_t rw);
+
+#define RRW_READ_HELD(x) rrw_held(x, RW_READER)
+#define RRW_WRITE_HELD(x) rrw_held(x, RW_WRITER)
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_RR_RW_LOCK_H */
diff --git a/uts/common/fs/zfs/sys/sa.h b/uts/common/fs/zfs/sys/sa.h
new file mode 100644
index 000000000000..bc89fa07d222
--- /dev/null
+++ b/uts/common/fs/zfs/sys/sa.h
@@ -0,0 +1,170 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _SYS_SA_H
+#define _SYS_SA_H
+
+#include <sys/dmu.h>
+
+/*
+ * Currently available byteswap functions.
+ * If at all possible new attributes should use
+ * one of the already defined byteswap functions.
+ * If a new byteswap function is added then the
+ * ZPL/Pool version will need to be bumped.
+ */
+
+typedef enum sa_bswap_type {
+ SA_UINT64_ARRAY,
+ SA_UINT32_ARRAY,
+ SA_UINT16_ARRAY,
+ SA_UINT8_ARRAY,
+ SA_ACL,
+} sa_bswap_type_t;
+
+typedef uint16_t sa_attr_type_t;
+
+/*
+ * Attribute to register support for.
+ */
+typedef struct sa_attr_reg {
+ char *sa_name; /* attribute name */
+ uint16_t sa_length; /* attribute length; presumably 0 = variable, confirm */
+ sa_bswap_type_t sa_byteswap; /* bswap function enum */
+ sa_attr_type_t sa_attr; /* filled in during registration */
+} sa_attr_reg_t;
+
+
+typedef void (sa_data_locator_t)(void **, uint32_t *, uint32_t,
+ boolean_t, void *userptr);
+
+/*
+ * array of attributes to store.
+ *
+ * This array should be treated as opaque/private data.
+ * The SA_ADD_BULK_ATTR() macro should be used for manipulating
+ * the array.
+ *
+ * When sa_replace_all_by_template() is used the attributes
+ * will be stored in the order defined in the array, except that
+ * the attributes may be split between the bonus and the spill buffer
+ *
+ */
+typedef struct sa_bulk_attr {
+ void *sa_data;
+ sa_data_locator_t *sa_data_func;
+ uint16_t sa_length;
+ sa_attr_type_t sa_attr;
+ /* the following are private to the sa framework */
+ void *sa_addr;
+ uint16_t sa_buftype;
+ uint16_t sa_size;
+} sa_bulk_attr_t;
+
+
+/*
+ * special macro for adding entries for bulk attr support
+ * bulk - sa_bulk_attr_t
+ * count - integer that will be incremented during each add
+ * attr - attribute to manipulate
+ * func - function for accessing data.
+ * data - pointer to data.
+ * len - length of data
+ */
+
+/*
+ * Fill in slot 'idx' of the sa_bulk_attr_t array 'b', then increment
+ * 'idx'. 'idx' must be a modifiable lvalue; 'b' and 'idx' are evaluated
+ * several times, so they must be free of side effects. Every argument is
+ * parenthesized so that expression arguments expand as written.
+ * Expands to a braced block, so it is a statement, not an expression.
+ */
+#define SA_ADD_BULK_ATTR(b, idx, attr, func, data, len) \
+{ \
+ (b)[idx].sa_attr = (attr); \
+ (b)[idx].sa_data_func = (func); \
+ (b)[idx].sa_data = (data); \
+ (b)[(idx)++].sa_length = (len); \
+}
+
+typedef struct sa_os sa_os_t;
+
+typedef enum sa_handle_type {
+ SA_HDL_SHARED,
+ SA_HDL_PRIVATE
+} sa_handle_type_t;
+
+struct sa_handle;
+typedef void *sa_lookup_tab_t;
+typedef struct sa_handle sa_handle_t;
+
+typedef void (sa_update_cb_t)(sa_handle_t *, dmu_tx_t *tx);
+
+int sa_handle_get(objset_t *, uint64_t, void *userp,
+ sa_handle_type_t, sa_handle_t **);
+int sa_handle_get_from_db(objset_t *, dmu_buf_t *, void *userp,
+ sa_handle_type_t, sa_handle_t **);
+void sa_handle_destroy(sa_handle_t *);
+int sa_buf_hold(objset_t *, uint64_t, void *, dmu_buf_t **);
+void sa_buf_rele(dmu_buf_t *, void *);
+int sa_lookup(sa_handle_t *, sa_attr_type_t, void *buf, uint32_t buflen);
+int sa_update(sa_handle_t *, sa_attr_type_t, void *buf,
+ uint32_t buflen, dmu_tx_t *);
+int sa_remove(sa_handle_t *, sa_attr_type_t, dmu_tx_t *);
+int sa_bulk_lookup(sa_handle_t *, sa_bulk_attr_t *, int count);
+int sa_bulk_lookup_locked(sa_handle_t *, sa_bulk_attr_t *, int count);
+int sa_bulk_update(sa_handle_t *, sa_bulk_attr_t *, int count, dmu_tx_t *);
+int sa_size(sa_handle_t *, sa_attr_type_t, int *);
+int sa_update_from_cb(sa_handle_t *, sa_attr_type_t,
+ uint32_t buflen, sa_data_locator_t *, void *userdata, dmu_tx_t *);
+void sa_object_info(sa_handle_t *, dmu_object_info_t *);
+void sa_object_size(sa_handle_t *, uint32_t *, u_longlong_t *);
+void sa_update_user(sa_handle_t *, sa_handle_t *);
+void *sa_get_userdata(sa_handle_t *);
+void sa_set_userp(sa_handle_t *, void *);
+dmu_buf_t *sa_get_db(sa_handle_t *);
+uint64_t sa_handle_object(sa_handle_t *);
+boolean_t sa_attr_would_spill(sa_handle_t *, sa_attr_type_t, int size);
+void sa_register_update_callback(objset_t *, sa_update_cb_t *);
+int sa_setup(objset_t *, uint64_t, sa_attr_reg_t *, int, sa_attr_type_t **);
+void sa_tear_down(objset_t *);
+int sa_replace_all_by_template(sa_handle_t *, sa_bulk_attr_t *,
+ int, dmu_tx_t *);
+int sa_replace_all_by_template_locked(sa_handle_t *, sa_bulk_attr_t *,
+ int, dmu_tx_t *);
+boolean_t sa_enabled(objset_t *);
+/* Explicit (void): in C an empty parameter list is not a prototype. */
+void sa_cache_init(void);
+void sa_cache_fini(void);
+int sa_set_sa_object(objset_t *, uint64_t);
+int sa_hdrsize(void *);
+void sa_handle_lock(sa_handle_t *);
+void sa_handle_unlock(sa_handle_t *);
+
+#ifdef _KERNEL
+int sa_lookup_uio(sa_handle_t *, sa_attr_type_t, uio_t *);
+#endif
+
+/*
+ * NOTE(review): this extern "C" block is empty and comes after every
+ * declaration in this header, so none of the prototypes above are covered
+ * by it. If C++ consumers matter, the opening guard presumably belongs
+ * before the first declaration -- confirm before moving it.
+ */
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_SA_H */
diff --git a/uts/common/fs/zfs/sys/sa_impl.h b/uts/common/fs/zfs/sys/sa_impl.h
new file mode 100644
index 000000000000..6661e47cfc83
--- /dev/null
+++ b/uts/common/fs/zfs/sys/sa_impl.h
@@ -0,0 +1,287 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _SYS_SA_IMPL_H
+#define _SYS_SA_IMPL_H
+
+#include <sys/dmu.h>
+#include <sys/refcount.h>
+#include <sys/list.h>
+
+/*
+ * Array of known attributes and their
+ * various characteristics.
+ */
+typedef struct sa_attr_table {
+ sa_attr_type_t sa_attr;
+ uint8_t sa_registered;
+ uint16_t sa_length;
+ sa_bswap_type_t sa_byteswap;
+ char *sa_name;
+} sa_attr_table_t;
+
+/*
+ * Zap attribute format for attribute registration
+ *
+ * 64 56 48 40 32 24 16 8 0
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * | unused | len | bswap | attr num |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ *
+ * Zap attribute format for layout information.
+ *
+ * layout information is stored as an array of attribute numbers
+ * The name of the attribute is the layout number (0, 1, 2, ...)
+ *
+ * 16 0
+ * +---- ---+
+ * | attr # |
+ * +--------+
+ * | attr # |
+ * +--- ----+
+ * ......
+ *
+ */
+
+/*
+ * Accessors for the attribute registration word: attribute number in bits
+ * 0-15, byteswap type in bits 16-23, length in bits 24-39.
+ * NOTE(review): the getters use the BF32_* helpers while ATTR_ENCODE uses
+ * BF64_*, and ATTR_LENGTH reads a field that ends above bit 31; this
+ * presumably relies on x being a 64-bit value -- confirm.
+ * ATTR_ENCODE expands to a braced block (statement, not expression).
+ */
+#define ATTR_BSWAP(x) BF32_GET(x, 16, 8)
+#define ATTR_LENGTH(x) BF32_GET(x, 24, 16)
+#define ATTR_NUM(x) BF32_GET(x, 0, 16)
+#define ATTR_ENCODE(x, attr, length, bswap) \
+{ \
+ BF64_SET(x, 24, 16, length); \
+ BF64_SET(x, 16, 8, bswap); \
+ BF64_SET(x, 0, 16, attr); \
+}
+
+/*
+ * Table-of-contents entry accessors (BF32_* helpers): bit 31 is the
+ * "present" flag, the length index starts at bit 24 and the offset at
+ * bit 0.
+ * NOTE(review): the getters and the encoder disagree on field widths --
+ * TOC_OFF reads 23 bits but the encoder stores 24, and TOC_LEN_IDX reads
+ * 4 bits but the encoder stores 7. Values that fit the narrower width
+ * presumably round-trip; confirm the intended widths before relying on
+ * larger ones.
+ */
+#define TOC_OFF(x) BF32_GET(x, 0, 23)
+#define TOC_ATTR_PRESENT(x) BF32_GET(x, 31, 1)
+#define TOC_LEN_IDX(x) BF32_GET(x, 24, 4)
+#define TOC_ATTR_ENCODE(x, len_idx, offset) \
+{ \
+ BF32_SET(x, 31, 1, 1); \
+ BF32_SET(x, 24, 7, len_idx); \
+ BF32_SET(x, 0, 24, offset); \
+}
+
+#define SA_LAYOUTS "LAYOUTS"
+#define SA_REGISTRY "REGISTRY"
+
+/*
+ * Each unique layout will have their own table
+ * sa_lot (layout_table)
+ */
+typedef struct sa_lot {
+ avl_node_t lot_num_node;
+ avl_node_t lot_hash_node;
+ uint64_t lot_num;
+ uint64_t lot_hash;
+ sa_attr_type_t *lot_attrs; /* array of attr #'s */
+ uint32_t lot_var_sizes; /* how many aren't fixed size */
+ uint32_t lot_attr_count; /* total attr count */
+ list_t lot_idx_tab; /* should be only a couple of entries */
+ int lot_instance; /* used with lot_hash to identify entry */
+} sa_lot_t;
+
+/* index table of offsets */
+typedef struct sa_idx_tab {
+ list_node_t sa_next;
+ sa_lot_t *sa_layout;
+ uint16_t *sa_variable_lengths;
+ refcount_t sa_refcount;
+ uint32_t *sa_idx_tab; /* array of offsets */
+} sa_idx_tab_t;
+
+/*
+ * Since the offset/index information into the actual data
+ * will usually be identical we can share that information with
+ * all handles that have the exact same offsets.
+ *
+ * You would typically only have a large number of different tables of
+ * contents if you had several variable sized attributes.
+ *
+ * Two AVL trees are used to track the attribute layout numbers.
+ * one is keyed by number and will be consulted when a DMU_OT_SA
+ * object is first read. The second tree is keyed by the hash signature
+ * of the attributes and will be consulted when an attribute is added
+ * to determine if we already have an instance of that layout. Both
+ * of these trees are interconnected. The only difference is that
+ * when an entry is found in the "hash" tree the list of attributes will
+ * need to be compared against the list of attributes you have in hand.
+ * The assumption is that typically attributes will just be updated and
+ * adding a completely new attribute is a very rare operation.
+ */
+struct sa_os {
+ kmutex_t sa_lock;
+ boolean_t sa_need_attr_registration;
+ boolean_t sa_force_spill;
+ uint64_t sa_master_obj;
+ uint64_t sa_reg_attr_obj;
+ uint64_t sa_layout_attr_obj;
+ int sa_num_attrs;
+ sa_attr_table_t *sa_attr_table; /* private attr table */
+ sa_update_cb_t *sa_update_cb;
+ avl_tree_t sa_layout_num_tree; /* keyed by layout number */
+ avl_tree_t sa_layout_hash_tree; /* keyed by layout hash value */
+ int sa_user_table_sz;
+ sa_attr_type_t *sa_user_table; /* user name->attr mapping table */
+};
+
+/*
+ * header for all bonus and spill buffers.
+ * The header has a fixed portion with a variable number
+ * of "lengths" depending on the number of variable sized
+ * attributes which are determined by the "layout number"
+ *
+ * sa_lengths is declared with a single element; since the number of
+ * lengths is variable, further entries presumably extend past the
+ * declared array -- confirm in sa.c.
+ */
+
+#define SA_MAGIC 0x2F505A /* ZFS SA */
+typedef struct sa_hdr_phys {
+ uint32_t sa_magic;
+ uint16_t sa_layout_info; /* Encoded with hdrsize and layout number */
+ uint16_t sa_lengths[1]; /* optional sizes for variable length attrs */
+ /* ... Data follows the lengths. */
+} sa_hdr_phys_t;
+
+/*
+ * sa_hdr_phys -> sa_layout_info
+ *
+ * 16 10 0
+ * +--------+-------+
+ * | hdrsz |layout |
+ * +--------+-------+
+ *
+ * Bits 0-9 are the layout number
+ * Bits 10-15 are the size of the header.
+ * The hdrsize is the number * 8
+ *
+ * For example.
+ * hdrsz of 1 ==> 8 byte header
+ * 2 ==> 16 byte header
+ *
+ */
+
+/*
+ * Accessors for sa_hdr_phys_t.sa_layout_info: the layout number occupies
+ * the low 10 bits and the header size the next 6 bits. The size field
+ * goes through the *_SB helpers with a shift of 3 and a bias of 0, i.e.
+ * it is kept in units of 8 bytes. SA_HDR_SIZE reads the same 6-bit width
+ * that SA_HDR_LAYOUT_INFO_ENCODE writes; because sa_layout_info is a
+ * uint16_t, this yields the same value as a wider read starting at bit 10.
+ * SA_HDR_LAYOUT_INFO_ENCODE expands to a braced block (a statement).
+ */
+#define SA_HDR_LAYOUT_NUM(hdr) BF32_GET((hdr)->sa_layout_info, 0, 10)
+#define SA_HDR_SIZE(hdr) BF32_GET_SB((hdr)->sa_layout_info, 10, 6, 3, 0)
+#define SA_HDR_LAYOUT_INFO_ENCODE(x, num, size) \
+{ \
+ BF32_SET_SB(x, 10, 6, 3, 0, size); \
+ BF32_SET(x, 0, 10, num); \
+}
+
+typedef enum sa_buf_type {
+ SA_BONUS = 1,
+ SA_SPILL = 2
+} sa_buf_type_t;
+
+typedef enum sa_data_op {
+ SA_LOOKUP,
+ SA_UPDATE,
+ SA_ADD,
+ SA_REPLACE,
+ SA_REMOVE
+} sa_data_op_t;
+
+/*
+ * Opaque handle used for most sa functions
+ *
+ * This needs to be kept as small as possible.
+ */
+
+struct sa_handle {
+ kmutex_t sa_lock;
+ dmu_buf_t *sa_bonus;
+ dmu_buf_t *sa_spill;
+ objset_t *sa_os;
+ void *sa_userp;
+ sa_idx_tab_t *sa_bonus_tab; /* idx of bonus */
+ sa_idx_tab_t *sa_spill_tab; /* only present if spill activated */
+};
+
+/*
+ * Handle accessors. 'type' selects a side of the handle: SA_BONUS picks
+ * the bonus member, any other value picks the spill member.
+ * SA_GET_DB yields the selected dmu_buf_t pointer cast to
+ * dmu_buf_impl_t *; SA_GET_HDR yields that buffer's db.db_data cast to
+ * sa_hdr_phys_t *; SA_IDX_TAB_GET yields the matching index table.
+ * Arguments and expansions are fully parenthesized so the macros can be
+ * used inside larger expressions. 'type' and 'hdl' may be evaluated more
+ * than once.
+ */
+#define SA_GET_DB(hdl, type) \
+ ((dmu_buf_impl_t *)(((type) == SA_BONUS) ? \
+ (hdl)->sa_bonus : (hdl)->sa_spill))
+
+#define SA_GET_HDR(hdl, type) \
+ ((sa_hdr_phys_t *)(SA_GET_DB(hdl, type)->db.db_data))
+
+#define SA_IDX_TAB_GET(hdl, type) \
+ ((type) == SA_BONUS ? (hdl)->sa_bonus_tab : (hdl)->sa_spill_tab)
+
+#define IS_SA_BONUSTYPE(a) \
+ ((a == DMU_OT_SA) ? B_TRUE : B_FALSE)
+
+#define SA_BONUSTYPE_FROM_DB(db) \
+ (dmu_get_bonustype((dmu_buf_t *)db))
+
+#define SA_BLKPTR_SPACE (DN_MAX_BONUSLEN - sizeof (blkptr_t))
+
+#define SA_LAYOUT_NUM(x, type) \
+ ((!IS_SA_BONUSTYPE(type) ? 0 : (((IS_SA_BONUSTYPE(type)) && \
+ ((SA_HDR_LAYOUT_NUM(x)) == 0)) ? 1 : SA_HDR_LAYOUT_NUM(x))))
+
+
+/*
+ * SA_REGISTERED_LEN: the sa_length recorded for 'attr' in sa's attribute
+ * table. SA_ATTR_LEN: that length if it is non-zero; otherwise the entry
+ * of hdr->sa_lengths[] selected by the length index stored in idx's table
+ * slot for 'attr'. Expansions and pointer arguments are parenthesized so
+ * the macros compose safely; 'sa' and 'attr' may be evaluated more than
+ * once.
+ */
+#define SA_REGISTERED_LEN(sa, attr) ((sa)->sa_attr_table[attr].sa_length)
+
+#define SA_ATTR_LEN(sa, idx, attr, hdr) ((SA_REGISTERED_LEN(sa, attr) == 0) ?\
+ (hdr)->sa_lengths[TOC_LEN_IDX((idx)->sa_idx_tab[attr])] : \
+ SA_REGISTERED_LEN(sa, attr))
+
+#define SA_SET_HDR(hdr, num, size) \
+ { \
+ hdr->sa_magic = SA_MAGIC; \
+ SA_HDR_LAYOUT_INFO_ENCODE(hdr->sa_layout_info, num, size); \
+ }
+
+#define SA_ATTR_INFO(sa, idx, hdr, attr, bulk, type, hdl) \
+ { \
+ bulk.sa_size = SA_ATTR_LEN(sa, idx, attr, hdr); \
+ bulk.sa_buftype = type; \
+ bulk.sa_addr = \
+ (void *)((uintptr_t)TOC_OFF(idx->sa_idx_tab[attr]) + \
+ (uintptr_t)hdr); \
+}
+
+#define SA_HDR_SIZE_MATCH_LAYOUT(hdr, tb) \
+ (SA_HDR_SIZE(hdr) == (sizeof (sa_hdr_phys_t) + \
+ (tb->lot_var_sizes > 1 ? P2ROUNDUP((tb->lot_var_sizes - 1) * \
+ sizeof (uint16_t), 8) : 0)))
+
+int sa_add_impl(sa_handle_t *, sa_attr_type_t,
+ uint32_t, sa_data_locator_t, void *, dmu_tx_t *);
+
+void sa_register_update_callback_locked(objset_t *, sa_update_cb_t *);
+int sa_size_locked(sa_handle_t *, sa_attr_type_t, int *);
+
+void sa_default_locator(void **, uint32_t *, uint32_t, boolean_t, void *);
+int sa_attr_size(sa_os_t *, sa_idx_tab_t *, sa_attr_type_t,
+ uint16_t *, sa_hdr_phys_t *);
+
+/*
+ * NOTE(review): this extern "C" block is empty and comes after every
+ * declaration in this header, so it covers none of them. If C++
+ * consumers matter, the opening guard presumably belongs before the first
+ * declaration -- confirm before moving it.
+ */
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_SA_IMPL_H */
diff --git a/uts/common/fs/zfs/sys/spa.h b/uts/common/fs/zfs/sys/spa.h
new file mode 100644
index 000000000000..456ec06dc456
--- /dev/null
+++ b/uts/common/fs/zfs/sys/spa.h
@@ -0,0 +1,706 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _SYS_SPA_H
+#define _SYS_SPA_H
+
+#include <sys/avl.h>
+#include <sys/zfs_context.h>
+#include <sys/nvpair.h>
+#include <sys/sysmacros.h>
+#include <sys/types.h>
+#include <sys/fs/zfs.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Forward references that lots of things need.
+ */
+typedef struct spa spa_t;
+typedef struct vdev vdev_t;
+typedef struct metaslab metaslab_t;
+typedef struct metaslab_group metaslab_group_t;
+typedef struct metaslab_class metaslab_class_t;
+typedef struct zio zio_t;
+typedef struct zilog zilog_t;
+typedef struct spa_aux_vdev spa_aux_vdev_t;
+typedef struct ddt ddt_t;
+typedef struct ddt_entry ddt_entry_t;
+struct dsl_pool;
+
+/*
+ * General-purpose 32-bit and 64-bit bitfield encodings.
+ */
+#define BF32_DECODE(x, low, len) P2PHASE((x) >> (low), 1U << (len))
+#define BF64_DECODE(x, low, len) P2PHASE((x) >> (low), 1ULL << (len))
+#define BF32_ENCODE(x, low, len) (P2PHASE((x), 1U << (len)) << (low))
+#define BF64_ENCODE(x, low, len) (P2PHASE((x), 1ULL << (len)) << (low))
+
+#define BF32_GET(x, low, len) BF32_DECODE(x, low, len)
+#define BF64_GET(x, low, len) BF64_DECODE(x, low, len)
+
+/*
+ * Store 'val' into the len-bit field of x that starts at bit 'low'.
+ * x is XOR-ed with the encoded difference between the field's current
+ * contents and 'val', so only the bits of that field change.
+ * x is evaluated more than once; do not pass an expression with side
+ * effects. 'x' and 'low' are parenthesized inside the shift so that
+ * expression arguments keep their intended grouping.
+ */
+#define BF32_SET(x, low, len, val) \
+ ((x) ^= BF32_ENCODE(((x) >> (low)) ^ (val), low, len))
+#define BF64_SET(x, low, len, val) \
+ ((x) ^= BF64_ENCODE(((x) >> (low)) ^ (val), low, len))
+
+#define BF32_GET_SB(x, low, len, shift, bias) \
+ ((BF32_GET(x, low, len) + (bias)) << (shift))
+#define BF64_GET_SB(x, low, len, shift, bias) \
+ ((BF64_GET(x, low, len) + (bias)) << (shift))
+
+#define BF32_SET_SB(x, low, len, shift, bias, val) \
+ BF32_SET(x, low, len, ((val) >> (shift)) - (bias))
+#define BF64_SET_SB(x, low, len, shift, bias, val) \
+ BF64_SET(x, low, len, ((val) >> (shift)) - (bias))
+
+/*
+ * We currently support nine block sizes, from 512 bytes to 128K.
+ * We could go higher, but the benefits are near-zero and the cost
+ * of COWing a giant block to modify one byte would become excessive.
+ */
+#define SPA_MINBLOCKSHIFT 9
+#define SPA_MAXBLOCKSHIFT 17
+#define SPA_MINBLOCKSIZE (1ULL << SPA_MINBLOCKSHIFT)
+#define SPA_MAXBLOCKSIZE (1ULL << SPA_MAXBLOCKSHIFT)
+
+#define SPA_BLOCKSIZES (SPA_MAXBLOCKSHIFT - SPA_MINBLOCKSHIFT + 1)
+
+/*
+ * Size of block to hold the configuration data (a packed nvlist)
+ */
+#define SPA_CONFIG_BLOCKSIZE (1 << 14)
+
+/*
+ * The DVA size encodings for LSIZE and PSIZE support blocks up to 32MB.
+ * The ASIZE encoding should be at least 64 times larger (6 more bits)
+ * to support up to 4-way RAID-Z mirror mode with worst-case gang block
+ * overhead, three DVAs per bp, plus one more bit in case we do anything
+ * else that expands the ASIZE.
+ */
+#define SPA_LSIZEBITS 16 /* LSIZE up to 32M (2^16 * 512) */
+#define SPA_PSIZEBITS 16 /* PSIZE up to 32M (2^16 * 512) */
+#define SPA_ASIZEBITS 24 /* ASIZE up to 64 times larger */
+
+/*
+ * All SPA data is represented by 128-bit data virtual addresses (DVAs).
+ * The members of the dva_t should be considered opaque outside the SPA.
+ */
+typedef struct dva {
+ uint64_t dva_word[2];
+} dva_t;
+
+/*
+ * Each block has a 256-bit checksum -- strong enough for cryptographic hashes.
+ */
+typedef struct zio_cksum {
+ uint64_t zc_word[4];
+} zio_cksum_t;
+
+/*
+ * Each block is described by its DVAs, time of birth, checksum, etc.
+ * The word-by-word, bit-by-bit layout of the blkptr is as follows:
+ *
+ * 64 56 48 40 32 24 16 8 0
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 0 | vdev1 | GRID | ASIZE |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 1 |G| offset1 |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 2 | vdev2 | GRID | ASIZE |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 3 |G| offset2 |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 4 | vdev3 | GRID | ASIZE |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 5 |G| offset3 |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 6 |BDX|lvl| type | cksum | comp | PSIZE | LSIZE |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 7 | padding |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 8 | padding |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 9 | physical birth txg |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * a | logical birth txg |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * b | fill count |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * c | checksum[0] |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * d | checksum[1] |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * e | checksum[2] |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * f | checksum[3] |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ *
+ * Legend:
+ *
+ * vdev virtual device ID
+ * offset offset into virtual device
+ * LSIZE logical size
+ * PSIZE physical size (after compression)
+ * ASIZE allocated size (including RAID-Z parity and gang block headers)
+ * GRID RAID-Z layout information (reserved for future use)
+ * cksum checksum function
+ * comp compression function
+ * G gang block indicator
+ * B byteorder (endianness)
+ * D dedup
+ * X unused
+ * lvl level of indirection
+ * type DMU object type
+ * phys birth txg of block allocation; zero if same as logical birth txg
+ * log. birth transaction group in which the block was logically born
+ * fill count number of non-zero blocks under this bp
+ * checksum[4] 256-bit checksum of the data this bp describes
+ */
+#define SPA_BLKPTRSHIFT 7 /* blkptr_t is 128 bytes */
+#define SPA_DVAS_PER_BP 3 /* Number of DVAs in a bp */
+
+typedef struct blkptr {
+ dva_t blk_dva[SPA_DVAS_PER_BP]; /* Data Virtual Addresses */
+ uint64_t blk_prop; /* size, compression, type, etc */
+ uint64_t blk_pad[2]; /* Extra space for the future */
+ uint64_t blk_phys_birth; /* txg when block was allocated */
+ uint64_t blk_birth; /* transaction group at birth */
+ uint64_t blk_fill; /* fill count */
+ zio_cksum_t blk_cksum; /* 256-bit checksum */
+} blkptr_t;
+
+/*
+ * Macros to get and set fields in a bp or DVA.
+ */
+/*
+ * ASIZE occupies the low SPA_ASIZEBITS bits of dva_word[0] and is stored
+ * in units of 1 << SPA_MINBLOCKSHIFT bytes with no bias. The field width
+ * comes from SPA_ASIZEBITS rather than a repeated literal.
+ */
+#define DVA_GET_ASIZE(dva) \
+ BF64_GET_SB((dva)->dva_word[0], 0, SPA_ASIZEBITS, SPA_MINBLOCKSHIFT, 0)
+#define DVA_SET_ASIZE(dva, x) \
+ BF64_SET_SB((dva)->dva_word[0], 0, SPA_ASIZEBITS, SPA_MINBLOCKSHIFT, 0, x)
+
+#define DVA_GET_GRID(dva) BF64_GET((dva)->dva_word[0], 24, 8)
+#define DVA_SET_GRID(dva, x) BF64_SET((dva)->dva_word[0], 24, 8, x)
+
+#define DVA_GET_VDEV(dva) BF64_GET((dva)->dva_word[0], 32, 32)
+#define DVA_SET_VDEV(dva, x) BF64_SET((dva)->dva_word[0], 32, 32, x)
+
+#define DVA_GET_OFFSET(dva) \
+ BF64_GET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0)
+#define DVA_SET_OFFSET(dva, x) \
+ BF64_SET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0, x)
+
+#define DVA_GET_GANG(dva) BF64_GET((dva)->dva_word[1], 63, 1)
+#define DVA_SET_GANG(dva, x) BF64_SET((dva)->dva_word[1], 63, 1, x)
+
+/*
+ * LSIZE and PSIZE live in blk_prop: LSIZE in the low SPA_LSIZEBITS bits,
+ * PSIZE in the SPA_PSIZEBITS bits directly above it. Both are stored in
+ * units of 1 << SPA_MINBLOCKSHIFT bytes with a bias of 1, so a stored 0
+ * decodes to one minimum-size block. Field widths and the PSIZE offset
+ * come from the SPA_*BITS constants rather than repeated literals.
+ */
+#define BP_GET_LSIZE(bp) \
+ BF64_GET_SB((bp)->blk_prop, 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1)
+#define BP_SET_LSIZE(bp, x) \
+ BF64_SET_SB((bp)->blk_prop, 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1, x)
+
+#define BP_GET_PSIZE(bp) \
+ BF64_GET_SB((bp)->blk_prop, SPA_LSIZEBITS, SPA_PSIZEBITS, \
+ SPA_MINBLOCKSHIFT, 1)
+#define BP_SET_PSIZE(bp, x) \
+ BF64_SET_SB((bp)->blk_prop, SPA_LSIZEBITS, SPA_PSIZEBITS, \
+ SPA_MINBLOCKSHIFT, 1, x)
+
+#define BP_GET_COMPRESS(bp) BF64_GET((bp)->blk_prop, 32, 8)
+#define BP_SET_COMPRESS(bp, x) BF64_SET((bp)->blk_prop, 32, 8, x)
+
+#define BP_GET_CHECKSUM(bp) BF64_GET((bp)->blk_prop, 40, 8)
+#define BP_SET_CHECKSUM(bp, x) BF64_SET((bp)->blk_prop, 40, 8, x)
+
+#define BP_GET_TYPE(bp) BF64_GET((bp)->blk_prop, 48, 8)
+#define BP_SET_TYPE(bp, x) BF64_SET((bp)->blk_prop, 48, 8, x)
+
+#define BP_GET_LEVEL(bp) BF64_GET((bp)->blk_prop, 56, 5)
+#define BP_SET_LEVEL(bp, x) BF64_SET((bp)->blk_prop, 56, 5, x)
+
+#define BP_GET_PROP_BIT_61(bp) BF64_GET((bp)->blk_prop, 61, 1)
+#define BP_SET_PROP_BIT_61(bp, x) BF64_SET((bp)->blk_prop, 61, 1, x)
+
+#define BP_GET_DEDUP(bp) BF64_GET((bp)->blk_prop, 62, 1)
+#define BP_SET_DEDUP(bp, x) BF64_SET((bp)->blk_prop, 62, 1, x)
+
+/*
+ * The byteorder flag is bit 63 of blk_prop. The getter subtracts the
+ * single-bit field from 0, so it yields either 0 or -1 (the two values
+ * ZFS_HOST_BYTEORDER can take); the setter stores only the low bit of x.
+ */
+#define BP_GET_BYTEORDER(bp) (0 - BF64_GET((bp)->blk_prop, 63, 1))
+#define BP_SET_BYTEORDER(bp, x) BF64_SET((bp)->blk_prop, 63, 1, x)
+
+#define BP_PHYSICAL_BIRTH(bp) \
+ ((bp)->blk_phys_birth ? (bp)->blk_phys_birth : (bp)->blk_birth)
+
+/*
+ * Record the birth txgs of a bp. blk_birth always receives 'logical'.
+ * blk_phys_birth receives 'physical' only when it differs from
+ * 'logical'; otherwise it is stored as 0, which BP_PHYSICAL_BIRTH()
+ * reads back as blk_birth. Expands to a brace-enclosed block, and
+ * 'logical' is evaluated twice.
+ */
+#define BP_SET_BIRTH(bp, logical, physical) \
+{ \
+ (bp)->blk_birth = (logical); \
+ (bp)->blk_phys_birth = ((logical) == (physical) ? 0 : (physical)); \
+}
+
+#define BP_GET_ASIZE(bp) \
+ (DVA_GET_ASIZE(&(bp)->blk_dva[0]) + DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \
+ DVA_GET_ASIZE(&(bp)->blk_dva[2]))
+
+#define BP_GET_UCSIZE(bp) \
+ ((BP_GET_LEVEL(bp) > 0 || dmu_ot[BP_GET_TYPE(bp)].ot_metadata) ? \
+ BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp))
+
+#define BP_GET_NDVAS(bp) \
+ (!!DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \
+ !!DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \
+ !!DVA_GET_ASIZE(&(bp)->blk_dva[2]))
+
+#define BP_COUNT_GANG(bp) \
+ (DVA_GET_GANG(&(bp)->blk_dva[0]) + \
+ DVA_GET_GANG(&(bp)->blk_dva[1]) + \
+ DVA_GET_GANG(&(bp)->blk_dva[2]))
+
+#define DVA_EQUAL(dva1, dva2) \
+ ((dva1)->dva_word[1] == (dva2)->dva_word[1] && \
+ (dva1)->dva_word[0] == (dva2)->dva_word[0])
+
+#define BP_EQUAL(bp1, bp2) \
+ (BP_PHYSICAL_BIRTH(bp1) == BP_PHYSICAL_BIRTH(bp2) && \
+ DVA_EQUAL(&(bp1)->blk_dva[0], &(bp2)->blk_dva[0]) && \
+ DVA_EQUAL(&(bp1)->blk_dva[1], &(bp2)->blk_dva[1]) && \
+ DVA_EQUAL(&(bp1)->blk_dva[2], &(bp2)->blk_dva[2]))
+
+/*
+ * Compare two zio_cksum_t values (passed as structs, not pointers).
+ * The four word-wise differences are OR-ed together; the result is zero
+ * only when every pair of words is equal, so all four words are always
+ * examined and no short-circuit evaluation takes place.
+ */
+#define ZIO_CHECKSUM_EQUAL(zc1, zc2) \
+ (0 == (((zc1).zc_word[0] - (zc2).zc_word[0]) | \
+ ((zc1).zc_word[1] - (zc2).zc_word[1]) | \
+ ((zc1).zc_word[2] - (zc2).zc_word[2]) | \
+ ((zc1).zc_word[3] - (zc2).zc_word[3])))
+
+#define DVA_IS_VALID(dva) (DVA_GET_ASIZE(dva) != 0)
+
+#define ZIO_SET_CHECKSUM(zcp, w0, w1, w2, w3) \
+{ \
+ (zcp)->zc_word[0] = w0; \
+ (zcp)->zc_word[1] = w1; \
+ (zcp)->zc_word[2] = w2; \
+ (zcp)->zc_word[3] = w3; \
+}
+
+#define BP_IDENTITY(bp) (&(bp)->blk_dva[0])
+#define BP_IS_GANG(bp) DVA_GET_GANG(BP_IDENTITY(bp))
+#define BP_IS_HOLE(bp) ((bp)->blk_birth == 0)
+
+/* BP_IS_RAIDZ(bp) assumes no block compression */
+#define BP_IS_RAIDZ(bp) (DVA_GET_ASIZE(&(bp)->blk_dva[0]) > \
+ BP_GET_PSIZE(bp))
+
+#define BP_ZERO(bp) \
+{ \
+ (bp)->blk_dva[0].dva_word[0] = 0; \
+ (bp)->blk_dva[0].dva_word[1] = 0; \
+ (bp)->blk_dva[1].dva_word[0] = 0; \
+ (bp)->blk_dva[1].dva_word[1] = 0; \
+ (bp)->blk_dva[2].dva_word[0] = 0; \
+ (bp)->blk_dva[2].dva_word[1] = 0; \
+ (bp)->blk_prop = 0; \
+ (bp)->blk_pad[0] = 0; \
+ (bp)->blk_pad[1] = 0; \
+ (bp)->blk_phys_birth = 0; \
+ (bp)->blk_birth = 0; \
+ (bp)->blk_fill = 0; \
+ ZIO_SET_CHECKSUM(&(bp)->blk_cksum, 0, 0, 0, 0); \
+}
+
+/*
+ * Note: the byteorder is either 0 or -1, both of which are palindromes.
+ * This simplifies the endianness handling a bit.
+ */
+#ifdef _BIG_ENDIAN
+#define ZFS_HOST_BYTEORDER (0ULL)
+#else
+#define ZFS_HOST_BYTEORDER (-1ULL)
+#endif
+
+#define BP_SHOULD_BYTESWAP(bp) (BP_GET_BYTEORDER(bp) != ZFS_HOST_BYTEORDER)
+
+#define BP_SPRINTF_LEN 320
+
+/*
+ * This macro allows code sharing between zfs, libzpool, and mdb.
+ * 'func' is either snprintf() or mdb_snprintf().
+ * 'ws' (whitespace) can be ' ' for single-line format, '\n' for multi-line.
+ *
+ * 'buf' is treated as BP_SPRINTF_LEN bytes long. A NULL bp is written as
+ * "<NULL>" and a hole as "<hole>". Otherwise one "DVA[d]=<vdev:offset:asize>"
+ * entry is written for each of the BP_GET_NDVAS(bp) DVAs, followed by the
+ * level, the caller-supplied 'type', 'checksum' and 'compress' strings,
+ * byteorder, gang/contiguous, dedup/unique, copy count name, sizes, birth
+ * txgs, fill count and checksum words.
+ *
+ * 'copies' counts the valid DVAs and is reduced by one for a gang bp whose
+ * third DVA's ASIZE is at most half of its second DVA's ASIZE.
+ *
+ * The expansion is a brace-enclosed block that declares locals named
+ * copyname, size, len, copies, d and dva; argument expressions that use
+ * identifiers with those names would refer to the locals instead. 'bp' is
+ * not parenthesized in the expansion, so pass a plain pointer expression.
+ * The macro ends by asserting that the formatted length fits in the buffer.
+ */
+#define SPRINTF_BLKPTR(func, ws, buf, bp, type, checksum, compress) \
+{ \
+ static const char *copyname[] = \
+ { "zero", "single", "double", "triple" }; \
+ int size = BP_SPRINTF_LEN; \
+ int len = 0; \
+ int copies = 0; \
+ \
+ if (bp == NULL) { \
+ len = func(buf + len, size - len, "<NULL>"); \
+ } else if (BP_IS_HOLE(bp)) { \
+ len = func(buf + len, size - len, "<hole>"); \
+ } else { \
+ for (int d = 0; d < BP_GET_NDVAS(bp); d++) { \
+ const dva_t *dva = &bp->blk_dva[d]; \
+ if (DVA_IS_VALID(dva)) \
+ copies++; \
+ len += func(buf + len, size - len, \
+ "DVA[%d]=<%llu:%llx:%llx>%c", d, \
+ (u_longlong_t)DVA_GET_VDEV(dva), \
+ (u_longlong_t)DVA_GET_OFFSET(dva), \
+ (u_longlong_t)DVA_GET_ASIZE(dva), \
+ ws); \
+ } \
+ if (BP_IS_GANG(bp) && \
+ DVA_GET_ASIZE(&bp->blk_dva[2]) <= \
+ DVA_GET_ASIZE(&bp->blk_dva[1]) / 2) \
+ copies--; \
+ len += func(buf + len, size - len, \
+ "[L%llu %s] %s %s %s %s %s %s%c" \
+ "size=%llxL/%llxP birth=%lluL/%lluP fill=%llu%c" \
+ "cksum=%llx:%llx:%llx:%llx", \
+ (u_longlong_t)BP_GET_LEVEL(bp), \
+ type, \
+ checksum, \
+ compress, \
+ BP_GET_BYTEORDER(bp) == 0 ? "BE" : "LE", \
+ BP_IS_GANG(bp) ? "gang" : "contiguous", \
+ BP_GET_DEDUP(bp) ? "dedup" : "unique", \
+ copyname[copies], \
+ ws, \
+ (u_longlong_t)BP_GET_LSIZE(bp), \
+ (u_longlong_t)BP_GET_PSIZE(bp), \
+ (u_longlong_t)bp->blk_birth, \
+ (u_longlong_t)BP_PHYSICAL_BIRTH(bp), \
+ (u_longlong_t)bp->blk_fill, \
+ ws, \
+ (u_longlong_t)bp->blk_cksum.zc_word[0], \
+ (u_longlong_t)bp->blk_cksum.zc_word[1], \
+ (u_longlong_t)bp->blk_cksum.zc_word[2], \
+ (u_longlong_t)bp->blk_cksum.zc_word[3]); \
+ } \
+ ASSERT(len < size); \
+}
+
+#include <sys/dmu.h>
+
+/*
+ * ARC buffer contents type for a bp: ARC_BUFC_METADATA when the bp is an
+ * indirect block (level > 0) or its object type is flagged ot_metadata
+ * in dmu_ot[], otherwise ARC_BUFC_DATA.
+ *
+ * This is an expression macro, so it must not end in a semicolon; a
+ * trailing ';' would break any use inside a larger expression or as a
+ * function argument.
+ */
+#define BP_GET_BUFC_TYPE(bp) \
+ (((BP_GET_LEVEL(bp) > 0) || (dmu_ot[BP_GET_TYPE(bp)].ot_metadata)) ? \
+ ARC_BUFC_METADATA : ARC_BUFC_DATA)
+
+typedef enum spa_import_type {
+ SPA_IMPORT_EXISTING,
+ SPA_IMPORT_ASSEMBLE
+} spa_import_type_t;
+
+/* state manipulation functions */
+extern int spa_open(const char *pool, spa_t **, void *tag);
+extern int spa_open_rewind(const char *pool, spa_t **, void *tag,
+ nvlist_t *policy, nvlist_t **config);
+extern int spa_get_stats(const char *pool, nvlist_t **config,
+ char *altroot, size_t buflen);
+extern int spa_create(const char *pool, nvlist_t *config, nvlist_t *props,
+ const char *history_str, nvlist_t *zplprops);
+extern int spa_import_rootpool(char *devpath, char *devid);
+extern int spa_import(const char *pool, nvlist_t *config, nvlist_t *props,
+ uint64_t flags);
+extern nvlist_t *spa_tryimport(nvlist_t *tryconfig);
+extern int spa_destroy(char *pool);
+extern int spa_export(char *pool, nvlist_t **oldconfig, boolean_t force,
+ boolean_t hardforce);
+extern int spa_reset(char *pool);
+extern void spa_async_request(spa_t *spa, int flag);
+extern void spa_async_unrequest(spa_t *spa, int flag);
+extern void spa_async_suspend(spa_t *spa);
+extern void spa_async_resume(spa_t *spa);
+extern spa_t *spa_inject_addref(char *pool);
+extern void spa_inject_delref(spa_t *spa);
+extern void spa_scan_stat_init(spa_t *spa);
+extern int spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps);
+
+#define SPA_ASYNC_CONFIG_UPDATE 0x01
+#define SPA_ASYNC_REMOVE 0x02
+#define SPA_ASYNC_PROBE 0x04
+#define SPA_ASYNC_RESILVER_DONE 0x08
+#define SPA_ASYNC_RESILVER 0x10
+#define SPA_ASYNC_AUTOEXPAND 0x20
+#define SPA_ASYNC_REMOVE_DONE 0x40
+#define SPA_ASYNC_REMOVE_STOP 0x80
+
+/*
+ * Controls the behavior of spa_vdev_remove().
+ */
+#define SPA_REMOVE_UNSPARE 0x01
+#define SPA_REMOVE_DONE 0x02
+
+/* device manipulation */
+extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot);
+extern int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot,
+ int replacing);
+extern int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid,
+ int replace_done);
+extern int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare);
+extern boolean_t spa_vdev_remove_active(spa_t *spa);
+extern int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath);
+extern int spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru);
+extern int spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
+ nvlist_t *props, boolean_t exp);
+
+/* spare state (which is global across all pools) */
+extern void spa_spare_add(vdev_t *vd);
+extern void spa_spare_remove(vdev_t *vd);
+extern boolean_t spa_spare_exists(uint64_t guid, uint64_t *pool, int *refcnt);
+extern void spa_spare_activate(vdev_t *vd);
+
+/* L2ARC state (which is global across all pools) */
+extern void spa_l2cache_add(vdev_t *vd);
+extern void spa_l2cache_remove(vdev_t *vd);
+extern boolean_t spa_l2cache_exists(uint64_t guid, uint64_t *pool);
+extern void spa_l2cache_activate(vdev_t *vd);
+extern void spa_l2cache_drop(spa_t *spa);
+
+/* scanning */
+extern int spa_scan(spa_t *spa, pool_scan_func_t func);
+extern int spa_scan_stop(spa_t *spa);
+
+/* spa syncing */
+extern void spa_sync(spa_t *spa, uint64_t txg); /* only for DMU use */
+extern void spa_sync_allpools(void);
+
+/*
+ * DEFERRED_FREE must be large enough that regular blocks are not
+ * deferred. XXX so can't we change it back to 1?
+ */
+#define SYNC_PASS_DEFERRED_FREE 2 /* defer frees after this pass */
+#define SYNC_PASS_DONT_COMPRESS 4 /* don't compress after this pass */
+#define SYNC_PASS_REWRITE 1 /* rewrite new bps after this pass */
+
+/* spa namespace global mutex */
+extern kmutex_t spa_namespace_lock;
+
+/*
+ * SPA configuration functions in spa_config.c
+ */
+
+#define SPA_CONFIG_UPDATE_POOL 0
+#define SPA_CONFIG_UPDATE_VDEVS 1
+
+extern void spa_config_sync(spa_t *, boolean_t, boolean_t);
+extern void spa_config_load(void);
+extern nvlist_t *spa_all_configs(uint64_t *);
+extern void spa_config_set(spa_t *spa, nvlist_t *config);
+extern nvlist_t *spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg,
+ int getstats);
+extern void spa_config_update(spa_t *spa, int what);
+
+/*
+ * Miscellaneous SPA routines in spa_misc.c
+ */
+
+/* Namespace manipulation */
+extern spa_t *spa_lookup(const char *name);
+extern spa_t *spa_add(const char *name, nvlist_t *config, const char *altroot);
+extern void spa_remove(spa_t *spa);
+extern spa_t *spa_next(spa_t *prev);
+
+/* Refcount functions */
+extern void spa_open_ref(spa_t *spa, void *tag);
+extern void spa_close(spa_t *spa, void *tag);
+extern boolean_t spa_refcount_zero(spa_t *spa);
+
+/*
+ * Config lock selectors, one bit each, combined into the 'locks' mask
+ * taken by the spa_config_*() functions below. SCL_LOCKS is the number
+ * of distinct lock bits and SCL_ALL is the mask with all of them set
+ * ((1 << 7) - 1 == 0x7f). SCL_STATE_ALL groups the STATE, L2ARC and ZIO
+ * bits.
+ */
+#define SCL_NONE 0x00
+#define SCL_CONFIG 0x01
+#define SCL_STATE 0x02
+#define SCL_L2ARC 0x04 /* hack until L2ARC 2.0 */
+#define SCL_ALLOC 0x08
+#define SCL_ZIO 0x10
+#define SCL_FREE 0x20
+#define SCL_VDEV 0x40
+#define SCL_LOCKS 7
+#define SCL_ALL ((1 << SCL_LOCKS) - 1)
+#define SCL_STATE_ALL (SCL_STATE | SCL_L2ARC | SCL_ZIO)
+
+/* Pool configuration locks */
+extern int spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw);
+extern void spa_config_enter(spa_t *spa, int locks, void *tag, krw_t rw);
+extern void spa_config_exit(spa_t *spa, int locks, void *tag);
+extern int spa_config_held(spa_t *spa, int locks, krw_t rw);
+
+/* Pool vdev add/remove lock */
+extern uint64_t spa_vdev_enter(spa_t *spa);
+extern uint64_t spa_vdev_config_enter(spa_t *spa);
+extern void spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg,
+ int error, char *tag);
+extern int spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error);
+
+/* Pool vdev state change lock */
+extern void spa_vdev_state_enter(spa_t *spa, int oplock);
+extern int spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error);
+
+/* Log state */
+/*
+ * State of the pool's log(s), read and written through
+ * spa_get_log_state() and spa_set_log_state(). The last enumerator has
+ * no trailing comma, matching the other enums in this header and keeping
+ * the declaration acceptable to C89 and C++03 compilers.
+ */
+typedef enum spa_log_state {
+ SPA_LOG_UNKNOWN = 0, /* unknown log state */
+ SPA_LOG_MISSING, /* missing log(s) */
+ SPA_LOG_CLEAR, /* clear the log(s) */
+ SPA_LOG_GOOD /* log(s) are good */
+} spa_log_state_t;
+
+extern spa_log_state_t spa_get_log_state(spa_t *spa);
+extern void spa_set_log_state(spa_t *spa, spa_log_state_t state);
+extern int spa_offline_log(spa_t *spa);
+
+/* Log claim callback */
+extern void spa_claim_notify(zio_t *zio);
+
+/*
+ * Accessor functions.
+ * Each name is declared once; spa_version() was previously listed twice.
+ */
+extern boolean_t spa_shutting_down(spa_t *spa);
+extern struct dsl_pool *spa_get_dsl(spa_t *spa);
+extern blkptr_t *spa_get_rootblkptr(spa_t *spa);
+extern void spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp);
+extern void spa_altroot(spa_t *, char *, size_t);
+extern int spa_sync_pass(spa_t *spa);
+extern char *spa_name(spa_t *spa);
+extern uint64_t spa_guid(spa_t *spa);
+extern uint64_t spa_last_synced_txg(spa_t *spa);
+extern uint64_t spa_first_txg(spa_t *spa);
+extern uint64_t spa_syncing_txg(spa_t *spa);
+extern uint64_t spa_version(spa_t *spa);
+extern pool_state_t spa_state(spa_t *spa);
+extern spa_load_state_t spa_load_state(spa_t *spa);
+extern uint64_t spa_freeze_txg(spa_t *spa);
+extern uint64_t spa_get_asize(spa_t *spa, uint64_t lsize);
+extern uint64_t spa_get_dspace(spa_t *spa);
+extern void spa_update_dspace(spa_t *spa);
+extern boolean_t spa_deflate(spa_t *spa);
+extern metaslab_class_t *spa_normal_class(spa_t *spa);
+extern metaslab_class_t *spa_log_class(spa_t *spa);
+extern int spa_max_replication(spa_t *spa);
+extern int spa_prev_software_version(spa_t *spa);
+extern int spa_busy(void);
+extern uint8_t spa_get_failmode(spa_t *spa);
+extern boolean_t spa_suspended(spa_t *spa);
+extern uint64_t spa_bootfs(spa_t *spa);
+extern uint64_t spa_delegation(spa_t *spa);
+extern objset_t *spa_meta_objset(spa_t *spa);
+
+/* Miscellaneous support routines */
+extern int spa_rename(const char *oldname, const char *newname);
+extern spa_t *spa_by_guid(uint64_t pool_guid, uint64_t device_guid);
+extern boolean_t spa_guid_exists(uint64_t pool_guid, uint64_t device_guid);
+extern char *spa_strdup(const char *);
+extern void spa_strfree(char *);
+extern uint64_t spa_get_random(uint64_t range);
+extern uint64_t spa_generate_guid(spa_t *spa);
+extern void sprintf_blkptr(char *buf, const blkptr_t *bp);
+extern void spa_freeze(spa_t *spa);
+extern void spa_upgrade(spa_t *spa, uint64_t version);
+extern void spa_evict_all(void);
+extern vdev_t *spa_lookup_by_guid(spa_t *spa, uint64_t guid,
+ boolean_t l2cache);
+extern boolean_t spa_has_spare(spa_t *, uint64_t guid);
+extern uint64_t dva_get_dsize_sync(spa_t *spa, const dva_t *dva);
+extern uint64_t bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp);
+extern uint64_t bp_get_dsize(spa_t *spa, const blkptr_t *bp);
+extern boolean_t spa_has_slogs(spa_t *spa);
+extern boolean_t spa_is_root(spa_t *spa);
+extern boolean_t spa_writeable(spa_t *spa);
+
+extern int spa_mode(spa_t *spa);
+extern uint64_t strtonum(const char *str, char **nptr);
+
+/* history logging */
+typedef enum history_log_type {
+ LOG_CMD_POOL_CREATE,
+ LOG_CMD_NORMAL,
+ LOG_INTERNAL
+} history_log_type_t;
+
+typedef struct history_arg {
+ char *ha_history_str;
+ history_log_type_t ha_log_type;
+ history_internal_events_t ha_event;
+ char *ha_zone;
+ uid_t ha_uid;
+} history_arg_t;
+
+extern char *spa_his_ievent_table[];
+
+extern void spa_history_create_obj(spa_t *spa, dmu_tx_t *tx);
+extern int spa_history_get(spa_t *spa, uint64_t *offset, uint64_t *len_read,
+ char *his_buf);
+extern int spa_history_log(spa_t *spa, const char *his_buf,
+ history_log_type_t what);
+extern void spa_history_log_internal(history_internal_events_t event,
+ spa_t *spa, dmu_tx_t *tx, const char *fmt, ...);
+extern void spa_history_log_version(spa_t *spa, history_internal_events_t evt);
+
+/* error handling */
+struct zbookmark;
+extern void spa_log_error(spa_t *spa, zio_t *zio);
+extern void zfs_ereport_post(const char *class, spa_t *spa, vdev_t *vd,
+ zio_t *zio, uint64_t stateoroffset, uint64_t length);
+extern void zfs_post_remove(spa_t *spa, vdev_t *vd);
+extern void zfs_post_state_change(spa_t *spa, vdev_t *vd);
+extern void zfs_post_autoreplace(spa_t *spa, vdev_t *vd);
+extern uint64_t spa_get_errlog_size(spa_t *spa);
+extern int spa_get_errlog(spa_t *spa, void *uaddr, size_t *count);
+extern void spa_errlog_rotate(spa_t *spa);
+extern void spa_errlog_drain(spa_t *spa);
+extern void spa_errlog_sync(spa_t *spa, uint64_t txg);
+extern void spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub);
+
+/* vdev cache */
+extern void vdev_cache_stat_init(void);
+extern void vdev_cache_stat_fini(void);
+
+/* Initialization and termination */
+extern void spa_init(int flags);
+extern void spa_fini(void);
+/* (void) makes this a true prototype, so calls with arguments are rejected */
+extern void spa_boot_init(void);
+
+/* properties */
+extern int spa_prop_set(spa_t *spa, nvlist_t *nvp);
+extern int spa_prop_get(spa_t *spa, nvlist_t **nvp);
+extern void spa_prop_clear_bootfs(spa_t *spa, uint64_t obj, dmu_tx_t *tx);
+extern void spa_configfile_set(spa_t *, nvlist_t *, boolean_t);
+
+/* asynchronous event notification */
+extern void spa_event_notify(spa_t *spa, vdev_t *vdev, const char *name);
+
+/*
+ * dprintf_bp(bp, fmt, ...): debug print of a message followed by a
+ * formatted block pointer.
+ *
+ * With ZFS_DEBUG defined, and only when zfs_flags has ZFS_DEBUG_DPRINTF
+ * set, it allocates a BP_SPRINTF_LEN-byte buffer, formats 'bp' into it
+ * with sprintf_blkptr(), calls dprintf() with 'fmt' extended by " %s\n"
+ * and the buffer as the final argument, then frees the buffer. Because
+ * __VA_ARGS__ is followed by a comma, at least one argument must be
+ * supplied after 'fmt'.
+ *
+ * Without ZFS_DEBUG the macro expands to nothing, so its arguments are
+ * never evaluated.
+ */
+#ifdef ZFS_DEBUG
+#define dprintf_bp(bp, fmt, ...) do { \
+ if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
+ char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP); \
+ sprintf_blkptr(__blkbuf, (bp)); \
+ dprintf(fmt " %s\n", __VA_ARGS__, __blkbuf); \
+ kmem_free(__blkbuf, BP_SPRINTF_LEN); \
+ } \
+_NOTE(CONSTCOND) } while (0)
+#else
+#define dprintf_bp(bp, fmt, ...)
+#endif
+
+extern int spa_mode_global; /* mode, e.g. FREAD | FWRITE */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_SPA_H */
diff --git a/uts/common/fs/zfs/sys/spa_boot.h b/uts/common/fs/zfs/sys/spa_boot.h
new file mode 100644
index 000000000000..1d3622f5a108
--- /dev/null
+++ b/uts/common/fs/zfs/sys/spa_boot.h
@@ -0,0 +1,42 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_SPA_BOOT_H
+#define _SYS_SPA_BOOT_H
+
+#include <sys/nvpair.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern char *spa_get_bootprop(char *prop);
+extern void spa_free_bootprop(char *prop);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_SPA_BOOT_H */
diff --git a/uts/common/fs/zfs/sys/spa_impl.h b/uts/common/fs/zfs/sys/spa_impl.h
new file mode 100644
index 000000000000..c965ffbbef87
--- /dev/null
+++ b/uts/common/fs/zfs/sys/spa_impl.h
@@ -0,0 +1,235 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _SYS_SPA_IMPL_H
+#define _SYS_SPA_IMPL_H
+
+#include <sys/spa.h>
+#include <sys/vdev.h>
+#include <sys/metaslab.h>
+#include <sys/dmu.h>
+#include <sys/dsl_pool.h>
+#include <sys/uberblock_impl.h>
+#include <sys/zfs_context.h>
+#include <sys/avl.h>
+#include <sys/refcount.h>
+#include <sys/bplist.h>
+#include <sys/bpobj.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct spa_error_entry {
+ zbookmark_t se_bookmark;
+ char *se_name;
+ avl_node_t se_avl;
+} spa_error_entry_t;
+
+typedef struct spa_history_phys {
+ uint64_t sh_pool_create_len; /* ending offset of zpool create */
+ uint64_t sh_phys_max_off; /* physical EOF */
+ uint64_t sh_bof; /* logical BOF */
+ uint64_t sh_eof; /* logical EOF */
+ uint64_t sh_records_lost; /* num of records overwritten */
+} spa_history_phys_t;
+
+struct spa_aux_vdev {
+ uint64_t sav_object; /* MOS object for device list */
+ nvlist_t *sav_config; /* cached device config */
+ vdev_t **sav_vdevs; /* devices */
+ int sav_count; /* number devices */
+ boolean_t sav_sync; /* sync the device list */
+ nvlist_t **sav_pending; /* pending device additions */
+ uint_t sav_npending; /* # pending devices */
+};
+
+/*
+ * Per-lock state for one SPA config lock.
+ * NOTE(review): the field roles below are inferred from their names and
+ * types only; confirm against the spa_config_enter()/spa_config_exit()
+ * implementation.
+ */
+typedef struct spa_config_lock {
+ kmutex_t scl_lock; /* presumably guards the fields below */
+ kthread_t *scl_writer; /* presumably the thread holding as writer */
+ int scl_write_wanted; /* presumably count of waiting writers */
+ kcondvar_t scl_cv; /* condition variable for waiters */
+ refcount_t scl_count; /* presumably number of current holds */
+} spa_config_lock_t;
+
+typedef struct spa_config_dirent {
+ list_node_t scd_link;
+ char *scd_path;
+} spa_config_dirent_t;
+
+/*
+ * Kinds of taskq kept for each zio type. ZIO_TASKQ_TYPES is not a kind
+ * itself: it is the number of kinds and sizes the second dimension of
+ * the spa_zio_taskq[ZIO_TYPES][ZIO_TASKQ_TYPES] array in struct spa.
+ */
+enum zio_taskq_type {
+ ZIO_TASKQ_ISSUE = 0,
+ ZIO_TASKQ_ISSUE_HIGH,
+ ZIO_TASKQ_INTERRUPT,
+ ZIO_TASKQ_INTERRUPT_HIGH,
+ ZIO_TASKQ_TYPES
+};
+
+/*
+ * State machine for the zpool-poolname process. The state transitions
+ * are done as follows:
+ *
+ * From To Routine
+ * PROC_NONE -> PROC_CREATED spa_activate()
+ * PROC_CREATED -> PROC_ACTIVE spa_thread()
+ * PROC_ACTIVE -> PROC_DEACTIVATE spa_deactivate()
+ * PROC_DEACTIVATE -> PROC_GONE spa_thread()
+ * PROC_GONE -> PROC_NONE spa_deactivate()
+ */
+typedef enum spa_proc_state {
+ SPA_PROC_NONE, /* spa_proc = &p0, no process created */
+ SPA_PROC_CREATED, /* spa_activate() has proc, is waiting */
+ SPA_PROC_ACTIVE, /* taskqs created, spa_proc set */
+ SPA_PROC_DEACTIVATE, /* spa_deactivate() requests process exit */
+ SPA_PROC_GONE /* spa_thread() is exiting, spa_proc = &p0 */
+} spa_proc_state_t;
+
+struct spa {
+ /*
+ * Fields protected by spa_namespace_lock.
+ */
+ char spa_name[MAXNAMELEN]; /* pool name */
+ avl_node_t spa_avl; /* node in spa_namespace_avl */
+ nvlist_t *spa_config; /* last synced config */
+ nvlist_t *spa_config_syncing; /* currently syncing config */
+ nvlist_t *spa_config_splitting; /* config for splitting */
+ nvlist_t *spa_load_info; /* info and errors from load */
+ uint64_t spa_config_txg; /* txg of last config change */
+ int spa_sync_pass; /* iterate-to-convergence */
+ pool_state_t spa_state; /* pool state */
+ int spa_inject_ref; /* injection references */
+ uint8_t spa_sync_on; /* sync threads are running */
+ spa_load_state_t spa_load_state; /* current load operation */
+ uint64_t spa_import_flags; /* import specific flags */
+ taskq_t *spa_zio_taskq[ZIO_TYPES][ZIO_TASKQ_TYPES];
+ dsl_pool_t *spa_dsl_pool;
+ metaslab_class_t *spa_normal_class; /* normal data class */
+ metaslab_class_t *spa_log_class; /* intent log data class */
+ uint64_t spa_first_txg; /* first txg after spa_open() */
+ uint64_t spa_final_txg; /* txg of export/destroy */
+ uint64_t spa_freeze_txg; /* freeze pool at this txg */
+ uint64_t spa_load_max_txg; /* best initial ub_txg */
+ uint64_t spa_claim_max_txg; /* highest claimed birth txg */
+ timespec_t spa_loaded_ts; /* 1st successful open time */
+ objset_t *spa_meta_objset; /* copy of dp->dp_meta_objset */
+ txg_list_t spa_vdev_txg_list; /* per-txg dirty vdev list */
+ vdev_t *spa_root_vdev; /* top-level vdev container */
+ uint64_t spa_load_guid; /* initial guid for spa_load */
+ list_t spa_config_dirty_list; /* vdevs with dirty config */
+ list_t spa_state_dirty_list; /* vdevs with dirty state */
+ spa_aux_vdev_t spa_spares; /* hot spares */
+ spa_aux_vdev_t spa_l2cache; /* L2ARC cache devices */
+ uint64_t spa_config_object; /* MOS object for pool config */
+ uint64_t spa_config_generation; /* config generation number */
+ uint64_t spa_syncing_txg; /* txg currently syncing */
+ bpobj_t spa_deferred_bpobj; /* deferred-free bplist */
+ bplist_t spa_free_bplist[TXG_SIZE]; /* bplist of stuff to free */
+ uberblock_t spa_ubsync; /* last synced uberblock */
+ uberblock_t spa_uberblock; /* current uberblock */
+ boolean_t spa_extreme_rewind; /* rewind past deferred frees */
+ uint64_t spa_last_io; /* lbolt of last non-scan I/O */
+ kmutex_t spa_scrub_lock; /* resilver/scrub lock */
+ uint64_t spa_scrub_inflight; /* in-flight scrub I/Os */
+ kcondvar_t spa_scrub_io_cv; /* scrub I/O completion */
+ uint8_t spa_scrub_active; /* active or suspended? */
+ uint8_t spa_scrub_type; /* type of scrub we're doing */
+ uint8_t spa_scrub_finished; /* indicator to rotate logs */
+ uint8_t spa_scrub_started; /* started since last boot */
+ uint8_t spa_scrub_reopen; /* scrub doing vdev_reopen */
+ uint64_t spa_scan_pass_start; /* start time per pass/reboot */
+ uint64_t spa_scan_pass_exam; /* examined bytes per pass */
+ kmutex_t spa_async_lock; /* protect async state */
+ kthread_t *spa_async_thread; /* thread doing async task */
+ int spa_async_suspended; /* async tasks suspended */
+ kcondvar_t spa_async_cv; /* wait for thread_exit() */
+ uint16_t spa_async_tasks; /* async task mask */
+ char *spa_root; /* alternate root directory */
+ uint64_t spa_ena; /* spa-wide ereport ENA */
+ int spa_last_open_failed; /* error if last open failed */
+ uint64_t spa_last_ubsync_txg; /* "best" uberblock txg */
+ uint64_t spa_last_ubsync_txg_ts; /* timestamp from that ub */
+ uint64_t spa_load_txg; /* ub txg that loaded */
+ uint64_t spa_load_txg_ts; /* timestamp from that ub */
+ uint64_t spa_load_meta_errors; /* verify metadata err count */
+ uint64_t spa_load_data_errors; /* verify data err count */
+ uint64_t spa_verify_min_txg; /* start txg of verify scrub */
+ kmutex_t spa_errlog_lock; /* error log lock */
+ uint64_t spa_errlog_last; /* last error log object */
+ uint64_t spa_errlog_scrub; /* scrub error log object */
+ kmutex_t spa_errlist_lock; /* error list/ereport lock */
+ avl_tree_t spa_errlist_last; /* last error list */
+ avl_tree_t spa_errlist_scrub; /* scrub error list */
+ uint64_t spa_deflate; /* should we deflate? */
+ uint64_t spa_history; /* history object */
+ kmutex_t spa_history_lock; /* history lock */
+ vdev_t *spa_pending_vdev; /* pending vdev additions */
+ kmutex_t spa_props_lock; /* property lock */
+ uint64_t spa_pool_props_object; /* object for properties */
+ uint64_t spa_bootfs; /* default boot filesystem */
+ uint64_t spa_failmode; /* failure mode for the pool */
+ uint64_t spa_delegation; /* delegation on/off */
+ list_t spa_config_list; /* previous cache file(s) */
+ zio_t *spa_async_zio_root; /* root of all async I/O */
+ zio_t *spa_suspend_zio_root; /* root of all suspended I/O */
+ kmutex_t spa_suspend_lock; /* protects suspend_zio_root */
+ kcondvar_t spa_suspend_cv; /* notification of resume */
+ uint8_t spa_suspended; /* pool is suspended */
+ uint8_t spa_claiming; /* pool is doing zil_claim() */
+ boolean_t spa_is_root; /* pool is root */
+ int spa_minref; /* num refs when first opened */
+ int spa_mode; /* FREAD | FWRITE */
+ spa_log_state_t spa_log_state; /* log state */
+ uint64_t spa_autoexpand; /* lun expansion on/off */
+ ddt_t *spa_ddt[ZIO_CHECKSUM_FUNCTIONS]; /* in-core DDTs */
+ uint64_t spa_ddt_stat_object; /* DDT statistics */
+ uint64_t spa_dedup_ditto; /* dedup ditto threshold */
+ uint64_t spa_dedup_checksum; /* default dedup checksum */
+ uint64_t spa_dspace; /* dspace in normal class */
+ kmutex_t spa_vdev_top_lock; /* dueling offline/remove */
+ kmutex_t spa_proc_lock; /* protects spa_proc* */
+ kcondvar_t spa_proc_cv; /* spa_proc_state transitions */
+ spa_proc_state_t spa_proc_state; /* see definition */
+ struct proc *spa_proc; /* "zpool-poolname" process */
+ uint64_t spa_did; /* if procp != p0, did of t1 */
+ boolean_t spa_autoreplace; /* autoreplace set in open */
+ int spa_vdev_locks; /* locks grabbed */
+ uint64_t spa_creation_version; /* version at pool creation */
+ uint64_t spa_prev_software_version;
+ /*
+ * spa_refcount & spa_config_lock must be the last elements
+ * because refcount_t changes size based on compilation options.
+ * In order for the MDB module to function correctly, the other
+ * fields must remain in the same location.
+ */
+ spa_config_lock_t spa_config_lock[SCL_LOCKS]; /* config changes */
+ refcount_t spa_refcount; /* number of opens */
+};
+
+extern const char *spa_config_path;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_SPA_IMPL_H */
diff --git a/uts/common/fs/zfs/sys/space_map.h b/uts/common/fs/zfs/sys/space_map.h
new file mode 100644
index 000000000000..6f935c9db27e
--- /dev/null
+++ b/uts/common/fs/zfs/sys/space_map.h
@@ -0,0 +1,179 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_SPACE_MAP_H
+#define _SYS_SPACE_MAP_H
+
+#include <sys/avl.h>
+#include <sys/dmu.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct space_map_ops space_map_ops_t;
+
+typedef struct space_map {
+ avl_tree_t sm_root; /* AVL tree of map segments */
+ uint64_t sm_space; /* sum of all segments in the map */
+ uint64_t sm_start; /* start of map */
+ uint64_t sm_size; /* size of map */
+ uint8_t sm_shift; /* unit shift */
+ uint8_t sm_pad[3]; /* unused */
+ uint8_t sm_loaded; /* map loaded? */
+ uint8_t sm_loading; /* map loading? */
+ kcondvar_t sm_load_cv; /* map load completion */
+ space_map_ops_t *sm_ops; /* space map block picker ops vector */
+ avl_tree_t *sm_pp_root; /* picker-private AVL tree */
+ void *sm_ppd; /* picker-private data */
+ kmutex_t *sm_lock; /* pointer to lock that protects map */
+} space_map_t;
+
+typedef struct space_seg {
+ avl_node_t ss_node; /* AVL node */
+ avl_node_t ss_pp_node; /* AVL picker-private node */
+ uint64_t ss_start; /* starting offset of this segment */
+ uint64_t ss_end; /* ending offset (non-inclusive) */
+} space_seg_t;
+
+typedef struct space_ref {
+ avl_node_t sr_node; /* AVL node */
+ uint64_t sr_offset; /* offset (start or end) */
+ int64_t sr_refcnt; /* associated reference count */
+} space_ref_t;
+
+typedef struct space_map_obj {
+ uint64_t smo_object; /* on-disk space map object */
+ uint64_t smo_objsize; /* size of the object */
+ uint64_t smo_alloc; /* space allocated from the map */
+} space_map_obj_t;
+
+struct space_map_ops { /* block picker ops vector (type of space_map_t.sm_ops) */
+ void (*smop_load)(space_map_t *sm);
+ void (*smop_unload)(space_map_t *sm);
+ uint64_t (*smop_alloc)(space_map_t *sm, uint64_t size); /* presumably returns the chosen offset - confirm */
+ void (*smop_claim)(space_map_t *sm, uint64_t start, uint64_t size);
+ void (*smop_free)(space_map_t *sm, uint64_t start, uint64_t size);
+ uint64_t (*smop_max)(space_map_t *sm);
+ boolean_t (*smop_fragmented)(space_map_t *sm);
+};
+
+/*
+ * debug entry
+ *
+ * 1 3 10 50
+ * ,---+--------+------------+---------------------------------.
+ * | 1 | action | syncpass | txg (lower bits) |
+ * `---+--------+------------+---------------------------------'
+ * 63 62 60 59 50 49 0
+ *
+ *
+ *
+ * non-debug entry
+ *
+ * 1 47 1 15
+ * ,-----------------------------------------------------------.
+ * | 0 | offset (sm_shift units) | type | run |
+ * `-----------------------------------------------------------'
+ * 63 62 16 15 14 0
+ */
+
+/* All this stuff takes and returns bytes */
+#define SM_RUN_DECODE(x) (BF64_DECODE(x, 0, 15) + 1)
+#define SM_RUN_ENCODE(x) BF64_ENCODE((x) - 1, 0, 15)
+#define SM_TYPE_DECODE(x) BF64_DECODE(x, 15, 1)
+#define SM_TYPE_ENCODE(x) BF64_ENCODE(x, 15, 1)
+#define SM_OFFSET_DECODE(x) BF64_DECODE(x, 16, 47)
+#define SM_OFFSET_ENCODE(x) BF64_ENCODE(x, 16, 47)
+#define SM_DEBUG_DECODE(x) BF64_DECODE(x, 63, 1)
+#define SM_DEBUG_ENCODE(x) BF64_ENCODE(x, 63, 1)
+
+#define SM_DEBUG_ACTION_DECODE(x) BF64_DECODE(x, 60, 3)
+#define SM_DEBUG_ACTION_ENCODE(x) BF64_ENCODE(x, 60, 3)
+
+#define SM_DEBUG_SYNCPASS_DECODE(x) BF64_DECODE(x, 50, 10)
+#define SM_DEBUG_SYNCPASS_ENCODE(x) BF64_ENCODE(x, 50, 10)
+
+#define SM_DEBUG_TXG_DECODE(x) BF64_DECODE(x, 0, 50)
+#define SM_DEBUG_TXG_ENCODE(x) BF64_ENCODE(x, 0, 50)
+
+#define SM_RUN_MAX SM_RUN_DECODE(~0ULL)
+
+#define SM_ALLOC 0x0
+#define SM_FREE 0x1
+
+/*
+ * The data for a given space map can be kept on blocks of any size.
+ * Larger blocks entail fewer i/o operations, but they also cause the
+ * DMU to keep more data in-core, and also to waste more i/o bandwidth
+ * when only a few blocks have changed since the last transaction group.
+ * This could use a lot more research, but for now, set the freelist
+ * block size to 4k (2^12).
+ */
+#define SPACE_MAP_BLOCKSHIFT 12
+
+typedef void space_map_func_t(space_map_t *sm, uint64_t start, uint64_t size);
+
+extern void space_map_create(space_map_t *sm, uint64_t start, uint64_t size,
+ uint8_t shift, kmutex_t *lp);
+extern void space_map_destroy(space_map_t *sm);
+extern void space_map_add(space_map_t *sm, uint64_t start, uint64_t size);
+extern void space_map_remove(space_map_t *sm, uint64_t start, uint64_t size);
+extern boolean_t space_map_contains(space_map_t *sm,
+ uint64_t start, uint64_t size);
+extern void space_map_vacate(space_map_t *sm,
+ space_map_func_t *func, space_map_t *mdest);
+extern void space_map_walk(space_map_t *sm,
+ space_map_func_t *func, space_map_t *mdest);
+
+extern void space_map_load_wait(space_map_t *sm);
+extern int space_map_load(space_map_t *sm, space_map_ops_t *ops,
+ uint8_t maptype, space_map_obj_t *smo, objset_t *os);
+extern void space_map_unload(space_map_t *sm);
+
+extern uint64_t space_map_alloc(space_map_t *sm, uint64_t size);
+extern void space_map_claim(space_map_t *sm, uint64_t start, uint64_t size);
+extern void space_map_free(space_map_t *sm, uint64_t start, uint64_t size);
+extern uint64_t space_map_maxsize(space_map_t *sm);
+
+extern void space_map_sync(space_map_t *sm, uint8_t maptype,
+ space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx);
+extern void space_map_truncate(space_map_obj_t *smo,
+ objset_t *os, dmu_tx_t *tx);
+
+extern void space_map_ref_create(avl_tree_t *t);
+extern void space_map_ref_destroy(avl_tree_t *t);
+extern void space_map_ref_add_seg(avl_tree_t *t,
+ uint64_t start, uint64_t end, int64_t refcnt);
+extern void space_map_ref_add_map(avl_tree_t *t,
+ space_map_t *sm, int64_t refcnt);
+extern void space_map_ref_generate_map(avl_tree_t *t,
+ space_map_t *sm, int64_t minref);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_SPACE_MAP_H */
diff --git a/uts/common/fs/zfs/sys/txg.h b/uts/common/fs/zfs/sys/txg.h
new file mode 100644
index 000000000000..e323d5efabb7
--- /dev/null
+++ b/uts/common/fs/zfs/sys/txg.h
@@ -0,0 +1,131 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_TXG_H
+#define _SYS_TXG_H
+
+#include <sys/spa.h>
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define TXG_CONCURRENT_STATES 3 /* open, quiescing, syncing */
+#define TXG_SIZE 4 /* next power of 2 above TXG_CONCURRENT_STATES */
+#define TXG_MASK (TXG_SIZE - 1) /* maps a txg to a slot in [0, TXG_SIZE) */
+#define TXG_INITIAL TXG_SIZE /* initial txg */
+#define TXG_IDX (txg & TXG_MASK) /* slot of the variable `txg' in the expanding scope */
+
+/* Number of txgs worth of frees we defer adding to in-core spacemaps */
+#define TXG_DEFER_SIZE 2
+
+#define TXG_WAIT 1ULL
+#define TXG_NOWAIT 2ULL
+
+typedef struct tx_cpu tx_cpu_t;
+
+typedef struct txg_handle {
+ tx_cpu_t *th_cpu;
+ uint64_t th_txg;
+} txg_handle_t;
+
+typedef struct txg_node {
+ struct txg_node *tn_next[TXG_SIZE];
+ uint8_t tn_member[TXG_SIZE];
+} txg_node_t;
+
+typedef struct txg_list {
+ kmutex_t tl_lock;
+ size_t tl_offset; /* presumably the `offset' given to txg_list_create() - confirm */
+ txg_node_t *tl_head[TXG_SIZE]; /* one head pointer per TXG_SIZE slot */
+} txg_list_t;
+
+struct dsl_pool;
+
+extern void txg_init(struct dsl_pool *dp, uint64_t txg);
+extern void txg_fini(struct dsl_pool *dp);
+extern void txg_sync_start(struct dsl_pool *dp);
+extern void txg_sync_stop(struct dsl_pool *dp);
+extern uint64_t txg_hold_open(struct dsl_pool *dp, txg_handle_t *txghp);
+extern void txg_rele_to_quiesce(txg_handle_t *txghp);
+extern void txg_rele_to_sync(txg_handle_t *txghp);
+extern void txg_register_callbacks(txg_handle_t *txghp, list_t *tx_callbacks);
+
+/*
+ * Delay the caller by the specified number of ticks or until
+ * the txg closes (whichever comes first). This is intended
+ * to be used to throttle writers when the system nears its
+ * capacity.
+ */
+extern void txg_delay(struct dsl_pool *dp, uint64_t txg, int ticks);
+
+/*
+ * Wait until the given transaction group has finished syncing.
+ * Try to make this happen as soon as possible (eg. kick off any
+ * necessary syncs immediately). If txg==0, wait for the currently open
+ * txg to finish syncing.
+ */
+extern void txg_wait_synced(struct dsl_pool *dp, uint64_t txg);
+
+/*
+ * Wait until the given transaction group, or one after it, is
+ * the open transaction group. Try to make this happen as soon
+ * as possible (eg. kick off any necessary syncs immediately).
+ * If txg == 0, wait for the next open txg.
+ */
+extern void txg_wait_open(struct dsl_pool *dp, uint64_t txg);
+
+/*
+ * Returns TRUE if we are "backed up" waiting for the syncing
+ * transaction to complete; otherwise returns FALSE.
+ */
+extern boolean_t txg_stalled(struct dsl_pool *dp);
+
+/* returns TRUE if someone is waiting for the next txg to sync */
+extern boolean_t txg_sync_waiting(struct dsl_pool *dp);
+
+/*
+ * Per-txg object lists.
+ */
+
+#define TXG_CLEAN(txg) ((txg) - 1)
+
+extern void txg_list_create(txg_list_t *tl, size_t offset);
+extern void txg_list_destroy(txg_list_t *tl);
+extern int txg_list_empty(txg_list_t *tl, uint64_t txg);
+extern int txg_list_add(txg_list_t *tl, void *p, uint64_t txg);
+extern int txg_list_add_tail(txg_list_t *tl, void *p, uint64_t txg);
+extern void *txg_list_remove(txg_list_t *tl, uint64_t txg);
+extern void *txg_list_remove_this(txg_list_t *tl, void *p, uint64_t txg);
+extern int txg_list_member(txg_list_t *tl, void *p, uint64_t txg);
+extern void *txg_list_head(txg_list_t *tl, uint64_t txg);
+extern void *txg_list_next(txg_list_t *tl, void *p, uint64_t txg);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_TXG_H */
diff --git a/uts/common/fs/zfs/sys/txg_impl.h b/uts/common/fs/zfs/sys/txg_impl.h
new file mode 100644
index 000000000000..7b356eac1293
--- /dev/null
+++ b/uts/common/fs/zfs/sys/txg_impl.h
@@ -0,0 +1,75 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_TXG_IMPL_H
+#define _SYS_TXG_IMPL_H
+
+#include <sys/spa.h>
+#include <sys/txg.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct tx_cpu { /* completes the tx_cpu_t forward-declared in sys/txg.h */
+ kmutex_t tc_lock; /* presumably protects the fields below - confirm */
+ kcondvar_t tc_cv[TXG_SIZE]; /* one condvar per TXG_SIZE slot */
+ uint64_t tc_count[TXG_SIZE]; /* one counter per TXG_SIZE slot */
+ list_t tc_callbacks[TXG_SIZE]; /* commit cb list */
+ char tc_pad[16]; /* explicit padding; reason not stated here (presumably cache-line sizing) - confirm */
+};
+
+typedef struct tx_state {
+ tx_cpu_t *tx_cpu; /* protects right to enter txg */
+ kmutex_t tx_sync_lock; /* protects tx_state_t */
+ uint64_t tx_open_txg; /* currently open txg id */
+ uint64_t tx_quiesced_txg; /* quiesced txg waiting for sync */
+ uint64_t tx_syncing_txg; /* currently syncing txg id */
+ uint64_t tx_synced_txg; /* last synced txg id */
+
+ uint64_t tx_sync_txg_waiting; /* txg we're waiting to sync */
+ uint64_t tx_quiesce_txg_waiting; /* txg we're waiting to open */
+
+ kcondvar_t tx_sync_more_cv;
+ kcondvar_t tx_sync_done_cv;
+ kcondvar_t tx_quiesce_more_cv;
+ kcondvar_t tx_quiesce_done_cv;
+ kcondvar_t tx_timeout_cv;
+ kcondvar_t tx_exit_cv; /* wait for all threads to exit */
+
+ uint8_t tx_threads; /* number of threads */
+ uint8_t tx_exiting; /* set when we're exiting */
+
+ kthread_t *tx_sync_thread;
+ kthread_t *tx_quiesce_thread;
+
+ taskq_t *tx_commit_cb_taskq; /* commit callback taskq */
+} tx_state_t;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_TXG_IMPL_H */
diff --git a/uts/common/fs/zfs/sys/uberblock.h b/uts/common/fs/zfs/sys/uberblock.h
new file mode 100644
index 000000000000..b5bb91573145
--- /dev/null
+++ b/uts/common/fs/zfs/sys/uberblock.h
@@ -0,0 +1,46 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_UBERBLOCK_H
+#define _SYS_UBERBLOCK_H
+
+#include <sys/spa.h>
+#include <sys/vdev.h>
+#include <sys/zio.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct uberblock uberblock_t;
+
+extern int uberblock_verify(uberblock_t *ub);
+extern int uberblock_update(uberblock_t *ub, vdev_t *rvd, uint64_t txg);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_UBERBLOCK_H */
diff --git a/uts/common/fs/zfs/sys/uberblock_impl.h b/uts/common/fs/zfs/sys/uberblock_impl.h
new file mode 100644
index 000000000000..6ab6aa3135a2
--- /dev/null
+++ b/uts/common/fs/zfs/sys/uberblock_impl.h
@@ -0,0 +1,63 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _SYS_UBERBLOCK_IMPL_H
+#define _SYS_UBERBLOCK_IMPL_H
+
+#include <sys/uberblock.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * The uberblock version is incremented whenever an incompatible on-disk
+ * format change is made to the SPA, DMU, or ZAP.
+ *
+ * Note: the first two fields should never be moved. When a storage pool
+ * is opened, the uberblock must be read off the disk before the version
+ * can be checked. If the ub_version field is moved, we may not detect
+ * version mismatch. If the ub_magic field is moved, applications that
+ * expect the magic number in the first word won't work.
+ */
+#define UBERBLOCK_MAGIC 0x00bab10c /* oo-ba-bloc! */
+#define UBERBLOCK_SHIFT 10 /* up to 1K */
+
+struct uberblock {
+ uint64_t ub_magic; /* UBERBLOCK_MAGIC */
+ uint64_t ub_version; /* SPA_VERSION */
+ uint64_t ub_txg; /* txg of last sync */
+ uint64_t ub_guid_sum; /* sum of all vdev guids */
+ uint64_t ub_timestamp; /* UTC time of last sync */
+ blkptr_t ub_rootbp; /* MOS objset_phys_t */
+
+ /* highest SPA_VERSION supported by software that wrote this txg */
+ uint64_t ub_software_version;
+};
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_UBERBLOCK_IMPL_H */
diff --git a/uts/common/fs/zfs/sys/unique.h b/uts/common/fs/zfs/sys/unique.h
new file mode 100644
index 000000000000..2ef3093edf1c
--- /dev/null
+++ b/uts/common/fs/zfs/sys/unique.h
@@ -0,0 +1,59 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_UNIQUE_H
+#define _SYS_UNIQUE_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* The number of significant bits in each unique value. */
+#define UNIQUE_BITS 56
+
+void unique_init(void);
+void unique_fini(void);
+
+/*
+ * Return a new unique value (which will not be uniquified against until
+ * it is unique_insert()-ed).
+ */
+uint64_t unique_create(void);
+
+/* Return a unique value, which equals the one passed in if possible. */
+uint64_t unique_insert(uint64_t value);
+
+/* Indicate that this value no longer needs to be uniquified against. */
+void unique_remove(uint64_t value);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_UNIQUE_H */
diff --git a/uts/common/fs/zfs/sys/vdev.h b/uts/common/fs/zfs/sys/vdev.h
new file mode 100644
index 000000000000..941f234dc68f
--- /dev/null
+++ b/uts/common/fs/zfs/sys/vdev.h
@@ -0,0 +1,161 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _SYS_VDEV_H
+#define _SYS_VDEV_H
+
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/dmu.h>
+#include <sys/space_map.h>
+#include <sys/fs/zfs.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef enum vdev_dtl_type {
+ DTL_MISSING, /* 0% replication: no copies of the data */
+ DTL_PARTIAL, /* less than 100% replication: some copies missing */
+ DTL_SCRUB, /* unable to fully repair during scrub/resilver */
+ DTL_OUTAGE, /* temporarily missing (used to attempt detach) */
+ DTL_TYPES /* number of DTL types above; must remain the last enumerator */
+} vdev_dtl_type_t;
+
+extern boolean_t zfs_nocacheflush;
+
+extern int vdev_open(vdev_t *);
+extern void vdev_open_children(vdev_t *);
+extern boolean_t vdev_uses_zvols(vdev_t *);
+extern int vdev_validate(vdev_t *);
+extern void vdev_close(vdev_t *);
+extern int vdev_create(vdev_t *, uint64_t txg, boolean_t isreplace);
+extern void vdev_reopen(vdev_t *);
+extern int vdev_validate_aux(vdev_t *vd);
+extern zio_t *vdev_probe(vdev_t *vd, zio_t *pio);
+
+extern boolean_t vdev_is_bootable(vdev_t *vd);
+extern vdev_t *vdev_lookup_top(spa_t *spa, uint64_t vdev);
+extern vdev_t *vdev_lookup_by_guid(vdev_t *vd, uint64_t guid);
+extern void vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t d,
+ uint64_t txg, uint64_t size);
+extern boolean_t vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t d,
+ uint64_t txg, uint64_t size);
+extern boolean_t vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t d);
+extern void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg,
+ int scrub_done);
+extern boolean_t vdev_dtl_required(vdev_t *vd);
+extern boolean_t vdev_resilver_needed(vdev_t *vd,
+ uint64_t *minp, uint64_t *maxp);
+
+extern void vdev_hold(vdev_t *);
+extern void vdev_rele(vdev_t *);
+
+extern int vdev_metaslab_init(vdev_t *vd, uint64_t txg);
+extern void vdev_metaslab_fini(vdev_t *vd);
+extern void vdev_metaslab_set_size(vdev_t *);
+extern void vdev_expand(vdev_t *vd, uint64_t txg);
+extern void vdev_split(vdev_t *vd);
+
+
+extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs);
+extern void vdev_clear_stats(vdev_t *vd);
+extern void vdev_stat_update(zio_t *zio, uint64_t psize);
+extern void vdev_scan_stat_init(vdev_t *vd);
+extern void vdev_propagate_state(vdev_t *vd);
+extern void vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state,
+ vdev_aux_t aux);
+
+extern void vdev_space_update(vdev_t *vd,
+ int64_t alloc_delta, int64_t defer_delta, int64_t space_delta);
+
+extern uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize);
+
+extern int vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux);
+extern int vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux);
+extern int vdev_online(spa_t *spa, uint64_t guid, uint64_t flags,
+ vdev_state_t *);
+extern int vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags);
+extern void vdev_clear(spa_t *spa, vdev_t *vd);
+
+extern boolean_t vdev_is_dead(vdev_t *vd);
+extern boolean_t vdev_readable(vdev_t *vd);
+extern boolean_t vdev_writeable(vdev_t *vd);
+extern boolean_t vdev_allocatable(vdev_t *vd);
+extern boolean_t vdev_accessible(vdev_t *vd, zio_t *zio);
+
+extern void vdev_cache_init(vdev_t *vd);
+extern void vdev_cache_fini(vdev_t *vd);
+extern int vdev_cache_read(zio_t *zio);
+extern void vdev_cache_write(zio_t *zio);
+extern void vdev_cache_purge(vdev_t *vd);
+
+extern void vdev_queue_init(vdev_t *vd);
+extern void vdev_queue_fini(vdev_t *vd);
+extern zio_t *vdev_queue_io(zio_t *zio);
+extern void vdev_queue_io_done(zio_t *zio);
+
+extern void vdev_config_dirty(vdev_t *vd);
+extern void vdev_config_clean(vdev_t *vd);
+extern int vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg,
+ boolean_t);
+
+extern void vdev_state_dirty(vdev_t *vd);
+extern void vdev_state_clean(vdev_t *vd);
+
+typedef enum vdev_config_flag { /* single-bit flags for vdev_config_generate()'s `flags' argument */
+ VDEV_CONFIG_SPARE = 1 << 0,
+ VDEV_CONFIG_L2CACHE = 1 << 1,
+ VDEV_CONFIG_REMOVING = 1 << 2
+} vdev_config_flag_t;
+
+extern void vdev_top_config_generate(spa_t *spa, nvlist_t *config);
+extern nvlist_t *vdev_config_generate(spa_t *spa, vdev_t *vd,
+ boolean_t getstats, vdev_config_flag_t flags);
+
+/*
+ * Label routines
+ */
+struct uberblock;
+extern uint64_t vdev_label_offset(uint64_t psize, int l, uint64_t offset);
+extern int vdev_label_number(uint64_t psize, uint64_t offset); /* same psize/offset naming as vdev_label_offset(); presumably the inverse mapping - confirm */
+extern nvlist_t *vdev_label_read_config(vdev_t *vd);
+extern void vdev_uberblock_load(zio_t *zio, vdev_t *vd, struct uberblock *ub);
+
+typedef enum { /* `reason' passed to vdev_label_init() */
+ VDEV_LABEL_CREATE, /* create/add a new device */
+ VDEV_LABEL_REPLACE, /* replace an existing device */
+ VDEV_LABEL_SPARE, /* add a new hot spare */
+ VDEV_LABEL_REMOVE, /* remove an existing device */
+ VDEV_LABEL_L2CACHE, /* add an L2ARC cache device */
+ VDEV_LABEL_SPLIT /* generating new label for split-off dev */
+} vdev_labeltype_t;
+
+extern int vdev_label_init(vdev_t *vd, uint64_t txg, vdev_labeltype_t reason);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_VDEV_H */
diff --git a/uts/common/fs/zfs/sys/vdev_disk.h b/uts/common/fs/zfs/sys/vdev_disk.h
new file mode 100644
index 000000000000..b748571ea0c3
--- /dev/null
+++ b/uts/common/fs/zfs/sys/vdev_disk.h
@@ -0,0 +1,56 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_VDEV_DISK_H
+#define _SYS_VDEV_DISK_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/vdev.h>
+#ifdef _KERNEL
+#include <sys/buf.h>
+#include <sys/ddi.h>
+#include <sys/sunldi.h>
+#include <sys/sunddi.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct vdev_disk { /* NOTE(review): declared outside #ifdef _KERNEL, but the ddi/ldi types below presumably come from the _KERNEL-only includes above - confirm */
+ ddi_devid_t vd_devid;
+ char *vd_minor;
+ ldi_handle_t vd_lh; /* LDI handle; same type vdev_disk_physio() takes as its first argument */
+} vdev_disk_t;
+
+#ifdef _KERNEL
+extern int vdev_disk_physio(ldi_handle_t, caddr_t, size_t, uint64_t, int);
+#endif
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_VDEV_DISK_H */
diff --git a/uts/common/fs/zfs/sys/vdev_file.h b/uts/common/fs/zfs/sys/vdev_file.h
new file mode 100644
index 000000000000..cd496735778c
--- /dev/null
+++ b/uts/common/fs/zfs/sys/vdev_file.h
@@ -0,0 +1,46 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_VDEV_FILE_H
+#define _SYS_VDEV_FILE_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/vdev.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct vdev_file {
+ vnode_t *vf_vnode; /* presumably the vnode of the backing file -- confirm in vdev_file.c */
+} vdev_file_t;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_VDEV_FILE_H */
diff --git a/uts/common/fs/zfs/sys/vdev_impl.h b/uts/common/fs/zfs/sys/vdev_impl.h
new file mode 100644
index 000000000000..161bd21f05a6
--- /dev/null
+++ b/uts/common/fs/zfs/sys/vdev_impl.h
@@ -0,0 +1,322 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _SYS_VDEV_IMPL_H
+#define _SYS_VDEV_IMPL_H
+
+#include <sys/avl.h>
+#include <sys/dmu.h>
+#include <sys/metaslab.h>
+#include <sys/nvpair.h>
+#include <sys/space_map.h>
+#include <sys/vdev.h>
+#include <sys/dkio.h>
+#include <sys/uberblock_impl.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Virtual device descriptors.
+ *
+ * All storage pool operations go through the virtual device framework,
+ * which provides data replication and I/O scheduling.
+ */
+
+/*
+ * Forward declarations that lots of things need.
+ */
+typedef struct vdev_queue vdev_queue_t;
+typedef struct vdev_cache vdev_cache_t;
+typedef struct vdev_cache_entry vdev_cache_entry_t;
+
+/*
+ * Virtual device operations
+ */
+typedef int vdev_open_func_t(vdev_t *vd, uint64_t *size, uint64_t *ashift);
+typedef void vdev_close_func_t(vdev_t *vd);
+typedef uint64_t vdev_asize_func_t(vdev_t *vd, uint64_t psize);
+typedef int vdev_io_start_func_t(zio_t *zio);
+typedef void vdev_io_done_func_t(zio_t *zio);
+typedef void vdev_state_change_func_t(vdev_t *vd, int, int);
+typedef void vdev_hold_func_t(vdev_t *vd);
+typedef void vdev_rele_func_t(vdev_t *vd);
+
+typedef struct vdev_ops {
+ vdev_open_func_t *vdev_op_open;
+ vdev_close_func_t *vdev_op_close;
+ vdev_asize_func_t *vdev_op_asize;
+ vdev_io_start_func_t *vdev_op_io_start;
+ vdev_io_done_func_t *vdev_op_io_done;
+ vdev_state_change_func_t *vdev_op_state_change;
+ vdev_hold_func_t *vdev_op_hold;
+ vdev_rele_func_t *vdev_op_rele;
+ char vdev_op_type[16];
+ boolean_t vdev_op_leaf;
+} vdev_ops_t;
+
+/*
+ * Virtual device properties
+ */
+struct vdev_cache_entry {
+ char *ve_data;
+ uint64_t ve_offset;
+ uint64_t ve_lastused;
+ avl_node_t ve_offset_node;
+ avl_node_t ve_lastused_node;
+ uint32_t ve_hits;
+ uint16_t ve_missed_update;
+ zio_t *ve_fill_io;
+};
+
+struct vdev_cache {
+ avl_tree_t vc_offset_tree;
+ avl_tree_t vc_lastused_tree;
+ kmutex_t vc_lock;
+};
+
+struct vdev_queue {
+ avl_tree_t vq_deadline_tree;
+ avl_tree_t vq_read_tree;
+ avl_tree_t vq_write_tree;
+ avl_tree_t vq_pending_tree;
+ kmutex_t vq_lock;
+};
+
+/*
+ * Virtual device descriptor
+ */
+struct vdev {
+ /*
+ * Common to all vdev types.
+ */
+ uint64_t vdev_id; /* child number in vdev parent */
+ uint64_t vdev_guid; /* unique ID for this vdev */
+ uint64_t vdev_guid_sum; /* self guid + all child guids */
+ uint64_t vdev_orig_guid; /* orig. guid prior to remove */
+ uint64_t vdev_asize; /* allocatable device capacity */
+ uint64_t vdev_min_asize; /* min acceptable asize */
+ uint64_t vdev_ashift; /* block alignment shift */
+ uint64_t vdev_state; /* see VDEV_STATE_* #defines */
+ uint64_t vdev_prevstate; /* used when reopening a vdev */
+ vdev_ops_t *vdev_ops; /* vdev operations */
+ spa_t *vdev_spa; /* spa for this vdev */
+ void *vdev_tsd; /* type-specific data */
+ vnode_t *vdev_name_vp; /* vnode for pathname */
+ vnode_t *vdev_devid_vp; /* vnode for devid */
+ vdev_t *vdev_top; /* top-level vdev */
+ vdev_t *vdev_parent; /* parent vdev */
+ vdev_t **vdev_child; /* array of children */
+ uint64_t vdev_children; /* number of children */
+ space_map_t vdev_dtl[DTL_TYPES]; /* in-core dirty time logs */
+ vdev_stat_t vdev_stat; /* virtual device statistics */
+ boolean_t vdev_expanding; /* expand the vdev? */
+ boolean_t vdev_reopening; /* reopen in progress? */
+ int vdev_open_error; /* error on last open */
+ kthread_t *vdev_open_thread; /* thread opening children */
+ uint64_t vdev_crtxg; /* txg when top-level was added */
+
+ /*
+ * Top-level vdev state.
+ */
+ uint64_t vdev_ms_array; /* metaslab array object */
+ uint64_t vdev_ms_shift; /* metaslab size shift */
+ uint64_t vdev_ms_count; /* number of metaslabs */
+ metaslab_group_t *vdev_mg; /* metaslab group */
+ metaslab_t **vdev_ms; /* metaslab array */
+ txg_list_t vdev_ms_list; /* per-txg dirty metaslab lists */
+ txg_list_t vdev_dtl_list; /* per-txg dirty DTL lists */
+ txg_node_t vdev_txg_node; /* per-txg dirty vdev linkage */
+ boolean_t vdev_remove_wanted; /* async remove wanted? */
+ boolean_t vdev_probe_wanted; /* async probe wanted? */
+ uint64_t vdev_removing; /* device is being removed? */
+ list_node_t vdev_config_dirty_node; /* config dirty list */
+ list_node_t vdev_state_dirty_node; /* state dirty list */
+ uint64_t vdev_deflate_ratio; /* deflation ratio (x512) */
+ uint64_t vdev_islog; /* is an intent log device */
+ uint64_t vdev_ishole; /* is a hole in the namespace */
+
+ /*
+ * Leaf vdev state.
+ */
+ uint64_t vdev_psize; /* physical device capacity */
+ space_map_obj_t vdev_dtl_smo; /* dirty time log space map obj */
+ txg_node_t vdev_dtl_node; /* per-txg dirty DTL linkage */
+ uint64_t vdev_wholedisk; /* true if this is a whole disk */
+ uint64_t vdev_offline; /* persistent offline state */
+ uint64_t vdev_faulted; /* persistent faulted state */
+ uint64_t vdev_degraded; /* persistent degraded state */
+ uint64_t vdev_removed; /* persistent removed state */
+ uint64_t vdev_resilvering; /* persistent resilvering state */
+ uint64_t vdev_nparity; /* number of parity devices for raidz */
+ char *vdev_path; /* vdev path (if any) */
+ char *vdev_devid; /* vdev devid (if any) */
+ char *vdev_physpath; /* vdev device path (if any) */
+ char *vdev_fru; /* physical FRU location */
+ uint64_t vdev_not_present; /* not present during import */
+ uint64_t vdev_unspare; /* unspare when resilvering done */
+ hrtime_t vdev_last_try; /* last reopen time */
+ boolean_t vdev_nowritecache; /* true if flushwritecache failed */
+ boolean_t vdev_checkremove; /* temporary online test */
+ boolean_t vdev_forcefault; /* force online fault */
+ boolean_t vdev_splitting; /* split or repair in progress */
+ boolean_t vdev_delayed_close; /* delayed device close? */
+ uint8_t vdev_tmpoffline; /* device taken offline temporarily? */
+ uint8_t vdev_detached; /* device detached? */
+ uint8_t vdev_cant_read; /* vdev is failing all reads */
+ uint8_t vdev_cant_write; /* vdev is failing all writes */
+ uint64_t vdev_isspare; /* was a hot spare */
+ uint64_t vdev_isl2cache; /* was a l2cache device */
+ vdev_queue_t vdev_queue; /* I/O deadline schedule queue */
+ vdev_cache_t vdev_cache; /* physical block cache */
+ spa_aux_vdev_t *vdev_aux; /* for l2cache vdevs */
+ zio_t *vdev_probe_zio; /* root of current probe */
+ vdev_aux_t vdev_label_aux; /* on-disk aux state */
+
+ /*
+ * For DTrace to work in userland (libzpool) context, these fields must
+ * remain at the end of the structure. DTrace will use the kernel's
+ * CTF definition for 'struct vdev', and since the size of a kmutex_t is
+ * larger in userland, the offsets of any fields following them would
+ * be incorrect.
+ */
+ kmutex_t vdev_dtl_lock; /* vdev_dtl_{map,resilver} */
+ kmutex_t vdev_stat_lock; /* vdev_stat */
+ kmutex_t vdev_probe_lock; /* protects vdev_probe_zio */
+};
+
+#define VDEV_RAIDZ_MAXPARITY 3
+
+#define VDEV_PAD_SIZE (8 << 10) /* 8K: size of each of vl_pad1 and vl_pad2 */
+/* 2 padding areas (vl_pad1 and vl_pad2) to skip; parenthesized so it is safe inside any expression */
+#define VDEV_SKIP_SIZE (VDEV_PAD_SIZE * 2) /* 16K */
+#define VDEV_PHYS_SIZE (112 << 10) /* 112K: size of vl_vdev_phys */
+#define VDEV_UBERBLOCK_RING (128 << 10) /* 128K: size of vl_uberblock */
+
+#define VDEV_UBERBLOCK_SHIFT(vd) \
+ (MAX((vd)->vdev_top->vdev_ashift, UBERBLOCK_SHIFT)) /* log2 of VDEV_UBERBLOCK_SIZE(vd) */
+#define VDEV_UBERBLOCK_COUNT(vd) \
+ (VDEV_UBERBLOCK_RING >> VDEV_UBERBLOCK_SHIFT(vd)) /* entries of that size in the ring */
+#define VDEV_UBERBLOCK_OFFSET(vd, n) \
+ offsetof(vdev_label_t, vl_uberblock[(n) << VDEV_UBERBLOCK_SHIFT(vd)]) /* byte offset of entry n in the label */
+#define VDEV_UBERBLOCK_SIZE(vd) (1ULL << VDEV_UBERBLOCK_SHIFT(vd)) /* bytes per entry */
+
+typedef struct vdev_phys {
+ char vp_nvlist[VDEV_PHYS_SIZE - sizeof (zio_eck_t)]; /* VDEV_PHYS_SIZE minus the trailer below */
+ zio_eck_t vp_zbt; /* trailer; together with vp_nvlist this totals VDEV_PHYS_SIZE, assuming no padding */
+} vdev_phys_t;
+
+typedef struct vdev_label {
+ char vl_pad1[VDEV_PAD_SIZE]; /* 8K */
+ char vl_pad2[VDEV_PAD_SIZE]; /* 8K */
+ vdev_phys_t vl_vdev_phys; /* 112K */
+ char vl_uberblock[VDEV_UBERBLOCK_RING]; /* 128K */
+} vdev_label_t; /* 256K total */
+
+/*
+ * vdev_dirty() flags
+ */
+#define VDD_METASLAB 0x01
+#define VDD_DTL 0x02
+
+/*
+ * Size and offset of embedded boot loader region on each label.
+ * The total size of the first two labels plus the boot area is 4MB.
+ */
+#define VDEV_BOOT_OFFSET (2 * sizeof (vdev_label_t))
+#define VDEV_BOOT_SIZE (7ULL << 19) /* 3.5M */
+
+/*
+ * Size of label regions at the start and end of each leaf device.
+ */
+#define VDEV_LABEL_START_SIZE (2 * sizeof (vdev_label_t) + VDEV_BOOT_SIZE)
+#define VDEV_LABEL_END_SIZE (2 * sizeof (vdev_label_t))
+#define VDEV_LABELS 4
+
+#define VDEV_ALLOC_LOAD 0
+#define VDEV_ALLOC_ADD 1
+#define VDEV_ALLOC_SPARE 2
+#define VDEV_ALLOC_L2CACHE 3
+#define VDEV_ALLOC_ROOTPOOL 4
+#define VDEV_ALLOC_SPLIT 5
+
+/*
+ * Allocate or free a vdev
+ */
+extern vdev_t *vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid,
+ vdev_ops_t *ops);
+extern int vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *config,
+ vdev_t *parent, uint_t id, int alloctype);
+extern void vdev_free(vdev_t *vd);
+
+/*
+ * Add or remove children and parents
+ */
+extern void vdev_add_child(vdev_t *pvd, vdev_t *cvd);
+extern void vdev_remove_child(vdev_t *pvd, vdev_t *cvd);
+extern void vdev_compact_children(vdev_t *pvd);
+extern vdev_t *vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops);
+extern void vdev_remove_parent(vdev_t *cvd);
+
+/*
+ * vdev sync load and sync
+ */
+extern void vdev_load_log_state(vdev_t *nvd, vdev_t *ovd);
+extern boolean_t vdev_log_state_valid(vdev_t *vd);
+extern void vdev_load(vdev_t *vd);
+extern void vdev_sync(vdev_t *vd, uint64_t txg);
+extern void vdev_sync_done(vdev_t *vd, uint64_t txg);
+extern void vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg);
+
+/*
+ * Available vdev types.
+ */
+extern vdev_ops_t vdev_root_ops;
+extern vdev_ops_t vdev_mirror_ops;
+extern vdev_ops_t vdev_replacing_ops;
+extern vdev_ops_t vdev_raidz_ops;
+extern vdev_ops_t vdev_disk_ops;
+extern vdev_ops_t vdev_file_ops;
+extern vdev_ops_t vdev_missing_ops;
+extern vdev_ops_t vdev_hole_ops;
+extern vdev_ops_t vdev_spare_ops;
+
+/*
+ * Common size functions
+ */
+extern uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize);
+extern uint64_t vdev_get_min_asize(vdev_t *vd);
+extern void vdev_set_min_asize(vdev_t *vd);
+
+/*
+ * zdb uses this tunable, so it must be declared here to make lint happy.
+ */
+extern int zfs_vdev_cache_size;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_VDEV_IMPL_H */
diff --git a/uts/common/fs/zfs/sys/zap.h b/uts/common/fs/zfs/sys/zap.h
new file mode 100644
index 000000000000..a1130bbbaaae
--- /dev/null
+++ b/uts/common/fs/zfs/sys/zap.h
@@ -0,0 +1,482 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _SYS_ZAP_H
+#define _SYS_ZAP_H
+
+/*
+ * ZAP - ZFS Attribute Processor
+ *
+ * The ZAP is a module which sits on top of the DMU (Data Management
+ * Unit) and implements a higher-level storage primitive using DMU
+ * objects. Its primary consumer is the ZPL (ZFS Posix Layer).
+ *
+ * A "zapobj" is a DMU object which the ZAP uses to store attributes.
+ * Users should use only zap routines to access a zapobj - they should
+ * not access the DMU object directly using DMU routines.
+ *
+ * The attributes stored in a zapobj are name-value pairs. The name is
+ * a zero-terminated string of up to ZAP_MAXNAMELEN bytes (including
+ * the terminating NUL). The value is an array of integers, which may be
+ * 1, 2, 4, or 8 bytes long. The total space used by the array (number
+ * of integers * integer length) can be up to ZAP_MAXVALUELEN bytes.
+ * Note that an 8-byte integer value can be used to store the location
+ * (object number) of another dmu object (which may be itself a zapobj).
+ * Note that you can use a zero-length attribute to store a single bit
+ * of information - the attribute is present or not.
+ *
+ * The ZAP routines are thread-safe. However, you must observe the
+ * DMU's restriction that a transaction may not be operated on
+ * concurrently.
+ *
+ * Any of the routines that return an int may return an I/O error (EIO
+ * or ECHECKSUM).
+ *
+ *
+ * Implementation / Performance Notes:
+ *
+ * The ZAP is intended to operate most efficiently on attributes with
+ * short (49 bytes or less) names and single 8-byte values, for which
+ * the microzap will be used. The ZAP should be efficient enough so
+ * that the user does not need to cache these attributes.
+ *
+ * The ZAP's locking scheme makes its routines thread-safe. Operations
+ * on different zapobjs will be processed concurrently. Operations on
+ * the same zapobj which only read data will be processed concurrently.
+ * Operations on the same zapobj which modify data will be processed
+ * concurrently when there are many attributes in the zapobj (because
+ * the ZAP uses per-block locking - more than 128 * (number of cpus)
+ * small attributes will suffice).
+ */
+
+/*
+ * We're using zero-terminated byte strings (ie. ASCII or UTF-8 C
+ * strings) for the names of attributes, rather than a byte string
+ * bounded by an explicit length. If some day we want to support names
+ * in character sets which have embedded zeros (eg. UTF-16, UTF-32),
+ * we'll have to add routines for using length-bounded strings.
+ */
+
+#include <sys/dmu.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * The matchtype specifies which entry will be accessed.
+ * MT_EXACT: only find an exact match (non-normalized)
+ * MT_FIRST: find the "first" normalized (case and Unicode
+ * form) match; the designated "first" match will not change as long
+ * as the set of entries with this normalization doesn't change
+ * MT_BEST: if there is an exact match, find that, otherwise find the
+ * first normalized match
+ */
+typedef enum matchtype
+{
+ MT_EXACT, /* 0 */
+ MT_BEST, /* 1 -- numerically precedes MT_FIRST, unlike the list above */
+ MT_FIRST /* 2 */
+} matchtype_t;
+
+typedef enum zap_flags {
+ /* Use 64-bit hash value (serialized cursors will always use 64-bits) */
+ ZAP_FLAG_HASH64 = 1 << 0,
+ /* Key is binary, not string (zap_add_uint64() can be used) */
+ ZAP_FLAG_UINT64_KEY = 1 << 1,
+ /*
+ * First word of key (which must be an array of uint64) is
+ * already randomly distributed.
+ */
+ ZAP_FLAG_PRE_HASHED_KEY = 1 << 2,
+} zap_flags_t;
+
+/*
+ * Create a new zapobj with no attributes and return its object number.
+ * MT_EXACT will cause the zap object to only support MT_EXACT lookups,
+ * otherwise any matchtype can be used for lookups.
+ *
+ * normflags specifies what normalization will be done. values are:
+ * 0: no normalization (legacy on-disk format, supports MT_EXACT matching
+ * only)
+ * U8_TEXTPREP_TOLOWER: case normalization will be performed.
+ * MT_FIRST/MT_BEST matching will find entries that match without
+ * regard to case (eg. looking for "foo" can find an entry "Foo").
+ * Eventually, other flags will permit unicode normalization as well.
+ */
+uint64_t zap_create(objset_t *ds, dmu_object_type_t ot,
+ dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
+uint64_t zap_create_norm(objset_t *ds, int normflags, dmu_object_type_t ot,
+ dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
+uint64_t zap_create_flags(objset_t *os, int normflags, zap_flags_t flags,
+ dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
+ dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
+
+/*
+ * Create a new zapobj with no attributes from the given (unallocated)
+ * object number.
+ */
+int zap_create_claim(objset_t *ds, uint64_t obj, dmu_object_type_t ot,
+ dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
+int zap_create_claim_norm(objset_t *ds, uint64_t obj,
+ int normflags, dmu_object_type_t ot,
+ dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
+
+/*
+ * The zapobj passed in must be a valid ZAP object for all of the
+ * following routines.
+ */
+
+/*
+ * Destroy this zapobj and all its attributes.
+ *
+ * Frees the object number using dmu_object_free.
+ */
+int zap_destroy(objset_t *ds, uint64_t zapobj, dmu_tx_t *tx);
+
+/*
+ * Manipulate attributes.
+ *
+ * 'integer_size' is in bytes, and must be 1, 2, 4, or 8.
+ */
+
+/*
+ * Retrieve the contents of the attribute with the given name.
+ *
+ * If the requested attribute does not exist, the call will fail and
+ * return ENOENT.
+ *
+ * If 'integer_size' is smaller than the attribute's integer size, the
+ * call will fail and return EINVAL.
+ *
+ * If 'integer_size' is equal to or larger than the attribute's integer
+ * size, the call will succeed and return 0. When converting to a
+ * larger integer size, the integers will be treated as unsigned (ie. no
+ * sign-extension will be performed).
+ *
+ * 'num_integers' is the length (in integers) of 'buf'.
+ *
+ * If the attribute is longer than the buffer, as many integers as will
+ * fit will be transferred to 'buf'. If the entire attribute was not
+ * transferred, the call will return EOVERFLOW.
+ *
+ * If rn_len is nonzero, realname will be set to the name of the found
+ * entry (which may be different from the requested name if matchtype is
+ * not MT_EXACT).
+ *
+ * If normalization_conflictp is not NULL, it will be set if there is
+ * another name with the same case/unicode normalized form.
+ */
+int zap_lookup(objset_t *ds, uint64_t zapobj, const char *name,
+ uint64_t integer_size, uint64_t num_integers, void *buf);
+int zap_lookup_norm(objset_t *ds, uint64_t zapobj, const char *name,
+ uint64_t integer_size, uint64_t num_integers, void *buf,
+ matchtype_t mt, char *realname, int rn_len,
+ boolean_t *normalization_conflictp);
+int zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+ int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf);
+int zap_contains(objset_t *ds, uint64_t zapobj, const char *name);
+int zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+ int key_numints);
+
+int zap_count_write(objset_t *os, uint64_t zapobj, const char *name,
+ int add, uint64_t *towrite, uint64_t *tooverwrite);
+
+/*
+ * Create an attribute with the given name and value.
+ *
+ * If an attribute with the given name already exists, the call will
+ * fail and return EEXIST.
+ */
+int zap_add(objset_t *ds, uint64_t zapobj, const char *key,
+ int integer_size, uint64_t num_integers,
+ const void *val, dmu_tx_t *tx);
+int zap_add_uint64(objset_t *ds, uint64_t zapobj, const uint64_t *key,
+ int key_numints, int integer_size, uint64_t num_integers,
+ const void *val, dmu_tx_t *tx);
+
+/*
+ * Set the attribute with the given name to the given value. If an
+ * attribute with the given name does not exist, it will be created. If
+ * an attribute with the given name already exists, the previous value
+ * will be overwritten. The integer_size may be different from the
+ * existing attribute's integer size, in which case the attribute's
+ * integer size will be updated to the new value.
+ */
+int zap_update(objset_t *ds, uint64_t zapobj, const char *name,
+ int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx);
+int zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+ int key_numints,
+ int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx);
+
+/*
+ * Get the length (in integers) and the integer size of the specified
+ * attribute.
+ *
+ * If the requested attribute does not exist, the call will fail and
+ * return ENOENT.
+ */
+int zap_length(objset_t *ds, uint64_t zapobj, const char *name,
+ uint64_t *integer_size, uint64_t *num_integers);
+int zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+ int key_numints, uint64_t *integer_size, uint64_t *num_integers);
+
+/*
+ * Remove the specified attribute.
+ *
+ * If the specified attribute does not exist, the call will fail and
+ * return ENOENT.
+ */
+int zap_remove(objset_t *ds, uint64_t zapobj, const char *name, dmu_tx_t *tx);
+int zap_remove_norm(objset_t *ds, uint64_t zapobj, const char *name,
+ matchtype_t mt, dmu_tx_t *tx);
+int zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+ int key_numints, dmu_tx_t *tx);
+
+/*
+ * Returns (in *count) the number of attributes in the specified zap
+ * object.
+ */
+int zap_count(objset_t *ds, uint64_t zapobj, uint64_t *count);
+
+/*
+ * Returns (in name) the name of the entry whose (value & mask)
+ * (za_first_integer) is value, or ENOENT if not found. The string
+ * pointed to by name must be at least 256 bytes long. If mask==0, the
+ * match must be exact (ie, same as mask=-1ULL).
+ */
+int zap_value_search(objset_t *os, uint64_t zapobj,
+ uint64_t value, uint64_t mask, char *name);
+
+/*
+ * Transfer all the entries from fromobj into intoobj. Only works on
+ * int_size=8 num_integers=1 values. Fails if there are any duplicated
+ * entries.
+ */
+int zap_join(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx);
+
+/* Same as zap_join, but set the values to 'value'. */
+int zap_join_key(objset_t *os, uint64_t fromobj, uint64_t intoobj,
+ uint64_t value, dmu_tx_t *tx);
+
+/* Same as zap_join, but add together any duplicated entries. */
+int zap_join_increment(objset_t *os, uint64_t fromobj, uint64_t intoobj,
+ dmu_tx_t *tx);
+
+/*
+ * Manipulate entries where the name + value are the "same" (the name is
+ * a stringified version of the value).
+ */
+int zap_add_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx);
+int zap_remove_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx);
+int zap_lookup_int(objset_t *os, uint64_t obj, uint64_t value);
+int zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta,
+ dmu_tx_t *tx);
+
+/* Here the key is an int and the value is a different int. */
+int zap_add_int_key(objset_t *os, uint64_t obj,
+ uint64_t key, uint64_t value, dmu_tx_t *tx);
+int zap_lookup_int_key(objset_t *os, uint64_t obj,
+ uint64_t key, uint64_t *valuep);
+
+/*
+ * The name is a stringified version of key; increment its value by
+ * delta. Zero values will be zap_remove()-ed.
+ */
+int zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta,
+ dmu_tx_t *tx); /* NOTE(review): identical declaration also appears earlier in this header */
+int zap_increment(objset_t *os, uint64_t obj, const char *name, int64_t delta,
+ dmu_tx_t *tx);
+
+struct zap;
+struct zap_leaf;
+typedef struct zap_cursor {
+ /* This structure is opaque! */
+ objset_t *zc_objset;
+ struct zap *zc_zap;
+ struct zap_leaf *zc_leaf;
+ uint64_t zc_zapobj;
+ uint64_t zc_serialized;
+ uint64_t zc_hash;
+ uint32_t zc_cd;
+} zap_cursor_t;
+
+typedef struct {
+ int za_integer_length;
+ /*
+ * za_normalization_conflict will be set if there are additional
+ * entries with this normalized form (eg, "foo" and "Foo").
+ */
+ boolean_t za_normalization_conflict;
+ uint64_t za_num_integers;
+ uint64_t za_first_integer; /* no sign extension for <8byte ints */
+ char za_name[MAXNAMELEN];
+} zap_attribute_t;
+
+/*
+ * The interface for listing all the attributes of a zapobj can be
+ * thought of as a cursor moving down a list of the attributes one by
+ * one. The cookie returned by the zap_cursor_serialize routine is
+ * persistent across system calls (and across reboot, even).
+ */
+
+/*
+ * Initialize a zap cursor, pointing to the "first" attribute of the
+ * zapobj. You must _fini the cursor when you are done with it.
+ */
+void zap_cursor_init(zap_cursor_t *zc, objset_t *ds, uint64_t zapobj);
+void zap_cursor_fini(zap_cursor_t *zc);
+
+/*
+ * Get the attribute currently pointed to by the cursor. Returns
+ * ENOENT if at the end of the attributes.
+ */
+int zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za);
+
+/*
+ * Advance the cursor to the next attribute.
+ */
+void zap_cursor_advance(zap_cursor_t *zc);
+
+/*
+ * Get a persistent cookie pointing to the current position of the zap
+ * cursor. The low 4 bits in the cookie are always zero, and thus can
+ * be used to differentiate a serialized cookie from a different type
+ * of value. The cookie will be less than 2^32 as long as there are
+ * fewer than 2^22 (4.2 million) entries in the zap object.
+ */
+uint64_t zap_cursor_serialize(zap_cursor_t *zc);
+
+/*
+ * Advance the cursor to the attribute having the given key.
+ */
+int zap_cursor_move_to_key(zap_cursor_t *zc, const char *name, matchtype_t mt);
+
+/*
+ * Initialize a zap cursor pointing to the position recorded by
+ * zap_cursor_serialize (in the "serialized" argument). You can also
+ * use a "serialized" argument of 0 to start at the beginning of the
+ * zapobj (ie. zap_cursor_init_serialized(..., 0) is equivalent to
+ * zap_cursor_init(...).)
+ */
+void zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *ds,
+ uint64_t zapobj, uint64_t serialized);
+
+
+#define ZAP_HISTOGRAM_SIZE 10
+
+typedef struct zap_stats {
+ /*
+ * Size of the pointer table (in number of entries).
+ * This is always a power of 2, or zero if it's a microzap.
+ * In general, it should be considerably greater than zs_num_leafs.
+ */
+ uint64_t zs_ptrtbl_len;
+
+ uint64_t zs_blocksize; /* size of zap blocks */
+
+ /*
+ * The number of blocks used. Note that some blocks may be
+ * wasted because old ptrtbl's and large name/value blocks are
+ * not reused. (Although their space is reclaimed, we don't
+ * reuse those offsets in the object.)
+ */
+ uint64_t zs_num_blocks;
+
+ /*
+ * Pointer table values from zap_ptrtbl in the zap_phys_t
+ */
+ uint64_t zs_ptrtbl_nextblk; /* next (larger) copy start block */
+ uint64_t zs_ptrtbl_blks_copied; /* number source blocks copied */
+ uint64_t zs_ptrtbl_zt_blk; /* starting block number */
+ uint64_t zs_ptrtbl_zt_numblks; /* number of blocks */
+ uint64_t zs_ptrtbl_zt_shift; /* bits to index it */
+
+ /*
+ * Values of the other members of the zap_phys_t
+ */
+ uint64_t zs_block_type; /* ZBT_HEADER */
+ uint64_t zs_magic; /* ZAP_MAGIC */
+ uint64_t zs_num_leafs; /* The number of leaf blocks */
+ uint64_t zs_num_entries; /* The number of zap entries */
+ uint64_t zs_salt; /* salt to stir into hash function */
+
+ /*
+ * Histograms. For all histograms, the last index
+ * (ZAP_HISTOGRAM_SIZE-1) includes any values which are greater
+ * than what can be represented. For example
+ * zs_blocks_with_n5_entries[ZAP_HISTOGRAM_SIZE-1] is the number
+ * of leafs with 45 or more entries.
+ */
+
+ /*
+ * zs_leafs_with_2n_pointers[n] is the number of leafs with
+ * 2^n pointers to it.
+ */
+ uint64_t zs_leafs_with_2n_pointers[ZAP_HISTOGRAM_SIZE];
+
+ /*
+ * zs_blocks_with_n5_entries[n] is the number of leafs with
+ * [n*5, (n+1)*5) entries. In the current implementation, there
+ * can be at most 55 entries in any block, but there may be
+ * fewer if the name or value is large, or the block is not
+ * completely full.
+ */
+ uint64_t zs_blocks_with_n5_entries[ZAP_HISTOGRAM_SIZE];
+
+ /*
+ * zs_blocks_n_tenths_full[n] is the number of leafs whose
+ * fullness is in the range [n/10, (n+1)/10).
+ */
+ uint64_t zs_blocks_n_tenths_full[ZAP_HISTOGRAM_SIZE];
+
+ /*
+ * zs_entries_using_n_chunks[n] is the number of entries which
+ * consume n 24-byte chunks. (Note, large names/values only use
+ * one chunk, but contribute to zs_num_blocks_large.)
+ */
+ uint64_t zs_entries_using_n_chunks[ZAP_HISTOGRAM_SIZE];
+
+ /*
+ * zs_buckets_with_n_entries[n] is the number of buckets (each
+ * leaf has 64 buckets) with n entries.
+ * zs_buckets_with_n_entries[1] should be very close to
+ * zs_num_entries.
+ */
+ uint64_t zs_buckets_with_n_entries[ZAP_HISTOGRAM_SIZE];
+} zap_stats_t;
+
+/*
+ * Get statistics about a ZAP object. Note: you need to be aware of the
+ * internal implementation of the ZAP to correctly interpret some of the
+ * statistics. This interface shouldn't be relied on unless you really
+ * know what you're doing.
+ */
+int zap_get_stats(objset_t *ds, uint64_t zapobj, zap_stats_t *zs);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZAP_H */
diff --git a/uts/common/fs/zfs/sys/zap_impl.h b/uts/common/fs/zfs/sys/zap_impl.h
new file mode 100644
index 000000000000..1dc322e02f6f
--- /dev/null
+++ b/uts/common/fs/zfs/sys/zap_impl.h
@@ -0,0 +1,228 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _SYS_ZAP_IMPL_H
+#define _SYS_ZAP_IMPL_H
+
+#include <sys/zap.h>
+#include <sys/zfs_context.h>
+#include <sys/avl.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern int fzap_default_block_shift;
+
+#define ZAP_MAGIC 0x2F52AB2ABULL /* expected value of zap_phys_t zap_magic */
+
+#define FZAP_BLOCK_SHIFT(zap) ((zap)->zap_f.zap_block_shift) /* zap_f is the fat arm of zap_t's union */
+
+#define MZAP_ENT_LEN 64 /* bytes in one mzap_ent_phys_t: 8 + 4 + 2 + MZAP_NAME_LEN */
+#define MZAP_NAME_LEN (MZAP_ENT_LEN - 8 - 4 - 2) /* 50: entry minus mze_value, mze_cd, mze_pad */
+#define MZAP_MAX_BLKSHIFT SPA_MAXBLOCKSHIFT
+#define MZAP_MAX_BLKSZ (1 << MZAP_MAX_BLKSHIFT)
+
+#define ZAP_NEED_CD (-1U) /* NOTE(review): looks like a "no cd chosen yet" sentinel -- confirm against callers */
+
+typedef struct mzap_ent_phys {
+ uint64_t mze_value; /* 8 bytes */
+ uint32_t mze_cd; /* 4 bytes; mzap_ent_t keeps a copy in its own mze_cd */
+ uint16_t mze_pad; /* in case we want to chain them someday */
+ char mze_name[MZAP_NAME_LEN]; /* fills the rest of the MZAP_ENT_LEN-byte entry */
+} mzap_ent_phys_t;
+
+typedef struct mzap_phys {
+ uint64_t mz_block_type; /* ZBT_MICRO */
+ uint64_t mz_salt;
+ uint64_t mz_normflags;
+ uint64_t mz_pad[5]; /* brings the header to 8 uint64_t words (64 bytes) */
+ mzap_ent_phys_t mz_chunk[1]; /* indexed by mzap_ent_t mze_chunkid via MZE_PHYS() */
+ /* actually variable size depending on block size */
+} mzap_phys_t;
+
+typedef struct mzap_ent {
+ avl_node_t mze_node; /* NOTE(review): presumably the linkage for zap_m.zap_avl -- confirm */
+ int mze_chunkid; /* index into mz_chunk[], see MZE_PHYS() */
+ uint64_t mze_hash;
+ uint32_t mze_cd; /* copy from mze_phys->mze_cd */
+} mzap_ent_t;
+
+#define MZE_PHYS(zap, mze) \
+ (&(zap)->zap_m.zap_phys->mz_chunk[(mze)->mze_chunkid]) /* pointer to the mzap_ent_phys_t that mze refers to */
+
+/*
+ * The (fat) zap is stored in one object. It is an array of
+ * 1<<FZAP_BLOCK_SHIFT byte blocks. The layout looks like one of:
+ *
+ * ptrtbl fits in first block:
+ * [zap_phys_t zap_ptrtbl_shift < 6] [zap_leaf_t] ...
+ *
+ * ptrtbl too big for first block:
+ * [zap_phys_t zap_ptrtbl_shift >= 6] [zap_leaf_t] [ptrtbl] ...
+ *
+ */
+
+struct dmu_buf;
+struct zap_leaf;
+
+#define ZBT_LEAF ((1ULL << 63) + 0) /* value of zap_leaf_phys_t lh_block_type */
+#define ZBT_HEADER ((1ULL << 63) + 1) /* value of zap_phys_t zap_block_type */
+#define ZBT_MICRO ((1ULL << 63) + 3) /* value of mzap_phys_t mz_block_type */
+/* any other values are ptrtbl blocks */
+
+/*
+ * the embedded pointer table takes up half a block:
+ * block size / entry size (2^3) / 2
+ */
+#define ZAP_EMBEDDED_PTRTBL_SHIFT(zap) (FZAP_BLOCK_SHIFT(zap) - 3 - 1)
+
+/*
+ * The embedded pointer table starts half-way through the block. Since
+ * the pointer table itself is half the block, it starts at (64-bit)
+ * word number (1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap)).
+ */
+#define ZAP_EMBEDDED_PTRTBL_ENT(zap, idx) \
+ ((uint64_t *)(zap)->zap_f.zap_phys) \
+ [(idx) + (1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap))] /* an lvalue: zap_phys viewed as uint64_t words, offset by the table's start word */
+
+/*
+ * TAKE NOTE:
+ * If zap_phys_t is modified, zap_byteswap() must be modified.
+ */
+typedef struct zap_phys {
+ uint64_t zap_block_type; /* ZBT_HEADER */
+ uint64_t zap_magic; /* ZAP_MAGIC */
+
+ struct zap_table_phys {
+ uint64_t zt_blk; /* starting block number */
+ uint64_t zt_numblks; /* number of blocks */
+ uint64_t zt_shift; /* bits to index it */
+ uint64_t zt_nextblk; /* next (larger) copy start block */
+ uint64_t zt_blks_copied; /* number source blocks copied */
+ } zap_ptrtbl; /* five uint64_t words; also available as zap_table_phys_t */
+
+ uint64_t zap_freeblk; /* the next free block */
+ uint64_t zap_num_leafs; /* number of leafs */
+ uint64_t zap_num_entries; /* number of entries */
+ uint64_t zap_salt; /* salt to stir into hash function */
+ uint64_t zap_normflags; /* flags for u8_textprep_str() */
+ uint64_t zap_flags; /* zap_flags_t */
+ /*
+ * This structure is followed by padding, and then the embedded
+ * pointer table. The embedded pointer table takes up second
+ * half of the block. It is accessed using the
+ * ZAP_EMBEDDED_PTRTBL_ENT() macro.
+ */
+} zap_phys_t; /* 13 uint64_t fields (104 bytes) before the padding */
+
+typedef struct zap_table_phys zap_table_phys_t;
+
+typedef struct zap {
+ objset_t *zap_objset;
+ uint64_t zap_object;
+ struct dmu_buf *zap_dbuf;
+ krwlock_t zap_rwlock;
+ boolean_t zap_ismicro; /* NOTE(review): presumably selects the zap_micro vs. zap_fat arm of zap_u -- confirm */
+ int zap_normflags;
+ uint64_t zap_salt;
+ union {
+ struct {
+ zap_phys_t *zap_phys;
+
+ /*
+ * zap_num_entries_mtx protects
+ * zap_num_entries
+ */
+ kmutex_t zap_num_entries_mtx;
+ int zap_block_shift; /* read via FZAP_BLOCK_SHIFT() */
+ } zap_fat; /* reached as zap->zap_f */
+ struct {
+ mzap_phys_t *zap_phys; /* MZE_PHYS() indexes mz_chunk[] through this */
+ int16_t zap_num_entries;
+ int16_t zap_num_chunks;
+ int16_t zap_alloc_next;
+ avl_tree_t zap_avl;
+ } zap_micro; /* reached as zap->zap_m */
+ } zap_u;
+} zap_t;
+
+typedef struct zap_name {
+ zap_t *zn_zap;
+ int zn_key_intlen;
+ const void *zn_key_orig;
+ int zn_key_orig_numints;
+ const void *zn_key_norm;
+ int zn_key_norm_numints;
+ uint64_t zn_hash;
+ matchtype_t zn_matchtype;
+ char zn_normbuf[ZAP_MAXNAMELEN];
+} zap_name_t;
+
+#define zap_f zap_u.zap_fat /* shorthand: zap->zap_f.x is zap->zap_u.zap_fat.x */
+#define zap_m zap_u.zap_micro /* shorthand: zap->zap_m.x is zap->zap_u.zap_micro.x */
+
+boolean_t zap_match(zap_name_t *zn, const char *matchname);
+int zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
+ krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp);
+void zap_unlockdir(zap_t *zap);
+void zap_evict(dmu_buf_t *db, void *vmzap);
+zap_name_t *zap_name_alloc(zap_t *zap, const char *key, matchtype_t mt);
+void zap_name_free(zap_name_t *zn);
+int zap_hashbits(zap_t *zap);
+uint32_t zap_maxcd(zap_t *zap);
+uint64_t zap_getflags(zap_t *zap);
+
+#define ZAP_HASH_IDX(hash, n) (((n) == 0) ? 0 : ((hash) >> (64 - (n)))) /* top n bits of a 64-bit hash; n == 0 is special-cased because a shift by 64 would be undefined */
+
+void fzap_byteswap(void *buf, size_t size);
+int fzap_count(zap_t *zap, uint64_t *count);
+int fzap_lookup(zap_name_t *zn,
+ uint64_t integer_size, uint64_t num_integers, void *buf,
+ char *realname, int rn_len, boolean_t *normalization_conflictp);
+void fzap_prefetch(zap_name_t *zn);
+int fzap_count_write(zap_name_t *zn, int add, uint64_t *towrite,
+ uint64_t *tooverwrite);
+int fzap_add(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers,
+ const void *val, dmu_tx_t *tx);
+int fzap_update(zap_name_t *zn,
+ int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx); /* NOTE(review): integer_size is int here but uint64_t in fzap_lookup(), fzap_add() and fzap_add_cd() */
+int fzap_length(zap_name_t *zn,
+ uint64_t *integer_size, uint64_t *num_integers);
+int fzap_remove(zap_name_t *zn, dmu_tx_t *tx);
+int fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za);
+void fzap_get_stats(zap_t *zap, zap_stats_t *zs);
+void zap_put_leaf(struct zap_leaf *l);
+
+int fzap_add_cd(zap_name_t *zn,
+ uint64_t integer_size, uint64_t num_integers,
+ const void *val, uint32_t cd, dmu_tx_t *tx); /* same parameters as fzap_add() plus a caller-supplied cd */
+void fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags);
+int fzap_cursor_move_to_key(zap_cursor_t *zc, zap_name_t *zn);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZAP_IMPL_H */
diff --git a/uts/common/fs/zfs/sys/zap_leaf.h b/uts/common/fs/zfs/sys/zap_leaf.h
new file mode 100644
index 000000000000..3a33636741d9
--- /dev/null
+++ b/uts/common/fs/zfs/sys/zap_leaf.h
@@ -0,0 +1,245 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _SYS_ZAP_LEAF_H
+#define _SYS_ZAP_LEAF_H
+
+#include <sys/zap.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct zap;
+struct zap_name;
+struct zap_stats;
+
+#define ZAP_LEAF_MAGIC 0x2AB1EAF
+
+/* chunk size = 24 bytes */
+#define ZAP_LEAF_CHUNKSIZE 24
+
+/*
+ * The amount of space available for chunks is:
+ * block size (1<<l->l_bs) - hash entry size (2) * number of hash
+ * entries - header space (2*chunksize)
+ */
+#define ZAP_LEAF_NUMCHUNKS(l) \
+ (((1<<(l)->l_bs) - 2*ZAP_LEAF_HASH_NUMENTRIES(l)) / \
+ ZAP_LEAF_CHUNKSIZE - 2) /* e.g. l_bs == 14: (16384 - 2*512) / 24 - 2 = 638 chunks */
+
+/*
+ * The amount of space within the chunk available for the array is:
+ * chunk size - space for type (1) - space for next pointer (2)
+ */
+#define ZAP_LEAF_ARRAY_BYTES (ZAP_LEAF_CHUNKSIZE - 3)
+
+#define ZAP_LEAF_ARRAY_NCHUNKS(bytes) \
+ (((bytes)+ZAP_LEAF_ARRAY_BYTES-1)/ZAP_LEAF_ARRAY_BYTES)
+
+/*
+ * Low water mark: when there are only this many chunks free, start
+ * growing the ptrtbl. Ideally, this should be larger than a
+ * "reasonably-sized" entry. 20 chunks is more than enough for the
+ * largest directory entry (MAXNAMELEN (256) byte name, 8-byte value),
+ * while still being only around 3% for 16k blocks.
+ */
+#define ZAP_LEAF_LOW_WATER (20)
+
+/*
+ * The leaf hash table has block size / 2^5 (32) number of entries,
+ * which should be more than enough for the maximum number of entries,
+ * which is less than block size / CHUNKSIZE (24) / minimum number of
+ * chunks per entry (3).
+ */
+#define ZAP_LEAF_HASH_SHIFT(l) ((l)->l_bs - 5)
+#define ZAP_LEAF_HASH_NUMENTRIES(l) (1 << ZAP_LEAF_HASH_SHIFT(l))
+
+/*
+ * The chunks start immediately after the hash table. The end of the
+ * hash table is at l_hash + HASH_NUMENTRIES, which we simply cast to a
+ * chunk_t.
+ */
+#define ZAP_LEAF_CHUNK(l, idx) \
+ ((zap_leaf_chunk_t *) \
+ ((l)->l_phys->l_hash + ZAP_LEAF_HASH_NUMENTRIES(l)))[idx]
+#define ZAP_LEAF_ENTRY(l, idx) (&ZAP_LEAF_CHUNK(l, idx).l_entry)
+
+typedef enum zap_chunk_type {
+ ZAP_CHUNK_FREE = 253, /* stored in zap_leaf_free lf_type */
+ ZAP_CHUNK_ENTRY = 252, /* stored in zap_leaf_entry le_type */
+ ZAP_CHUNK_ARRAY = 251, /* stored in zap_leaf_array la_type */
+ ZAP_CHUNK_TYPE_MAX = 250 /* NOTE(review): named MAX but is the smallest value here -- confirm intent */
+} zap_chunk_type_t;
+
+#define ZLF_ENTRIES_CDSORTED (1<<0) /* bit in zap_leaf_header lh_flags */
+
+/*
+ * TAKE NOTE:
+ * If zap_leaf_phys_t is modified, zap_leaf_byteswap() must be modified.
+ */
+typedef struct zap_leaf_phys {
+ struct zap_leaf_header {
+ uint64_t lh_block_type; /* ZBT_LEAF */
+ uint64_t lh_pad1;
+ uint64_t lh_prefix; /* hash prefix of this leaf */
+ uint32_t lh_magic; /* ZAP_LEAF_MAGIC */
+ uint16_t lh_nfree; /* number free chunks */
+ uint16_t lh_nentries; /* number of entries */
+ uint16_t lh_prefix_len; /* num bits used to id this */
+
+/* above is accessible to zap, below is zap_leaf private */
+
+ uint16_t lh_freelist; /* chunk head of free list */
+ uint8_t lh_flags; /* ZLF_* flags */
+ uint8_t lh_pad2[11];
+ } l_hdr; /* 2 24-byte chunks */
+
+ /*
+ * The header is followed by a hash table with
+ * ZAP_LEAF_HASH_NUMENTRIES(l) entries. The hash table is
+ * followed by an array of ZAP_LEAF_NUMCHUNKS(l)
+ * zap_leaf_chunk structures. These structures are accessed
+ * with the ZAP_LEAF_CHUNK() macro.
+ */
+
+ uint16_t l_hash[1]; /* declared [1]; ZAP_LEAF_CHUNK() treats l_hash + ZAP_LEAF_HASH_NUMENTRIES(l) as the first chunk */
+} zap_leaf_phys_t;
+
+typedef union zap_leaf_chunk {
+ struct zap_leaf_entry {
+ uint8_t le_type; /* always ZAP_CHUNK_ENTRY */
+ uint8_t le_value_intlen; /* size of value's ints */
+ uint16_t le_next; /* next entry in hash chain */
+ uint16_t le_name_chunk; /* first chunk of the name */
+ uint16_t le_name_numints; /* ints in name (incl null) */
+ uint16_t le_value_chunk; /* first chunk of the value */
+ uint16_t le_value_numints; /* value length in ints */
+ uint32_t le_cd; /* collision differentiator */
+ uint64_t le_hash; /* hash value of the name */
+ } l_entry; /* 1+1+2+2+2+2+2+4+8 = 24 bytes */
+ struct zap_leaf_array {
+ uint8_t la_type; /* always ZAP_CHUNK_ARRAY */
+ uint8_t la_array[ZAP_LEAF_ARRAY_BYTES];
+ uint16_t la_next; /* next blk or CHAIN_END */
+ } l_array; /* 1 + 21 + 2 = 24 bytes */
+ struct zap_leaf_free {
+ uint8_t lf_type; /* always ZAP_CHUNK_FREE */
+ uint8_t lf_pad[ZAP_LEAF_ARRAY_BYTES];
+ uint16_t lf_next; /* next in free list, or CHAIN_END */
+ } l_free; /* 1 + 21 + 2 = 24 bytes */
+} zap_leaf_chunk_t; /* every arm is ZAP_LEAF_CHUNKSIZE bytes; the first byte of each is its type tag */
+
+typedef struct zap_leaf {
+ krwlock_t l_rwlock;
+ uint64_t l_blkid; /* 1<<ZAP_BLOCK_SHIFT byte block off */
+ int l_bs; /* block size shift */
+ dmu_buf_t *l_dbuf;
+ zap_leaf_phys_t *l_phys;
+} zap_leaf_t;
+
+
+typedef struct zap_entry_handle {
+ /* below is set by zap_leaf.c and is public to zap.c */
+ uint64_t zeh_num_integers;
+ uint64_t zeh_hash;
+ uint32_t zeh_cd;
+ uint8_t zeh_integer_size;
+
+ /* below is private to zap_leaf.c */
+ uint16_t zeh_fakechunk;
+ uint16_t *zeh_chunkp;
+ zap_leaf_t *zeh_leaf;
+} zap_entry_handle_t;
+
+/*
+ * Return a handle to the named entry, or ENOENT if not found. The hash
+ * value must equal zap_hash(name).
+ */
+extern int zap_leaf_lookup(zap_leaf_t *l,
+ struct zap_name *zn, zap_entry_handle_t *zeh);
+
+/*
+ * Return a handle to the entry with this hash+cd, or the entry with the
+ * next closest hash+cd.
+ */
+extern int zap_leaf_lookup_closest(zap_leaf_t *l,
+ uint64_t hash, uint32_t cd, zap_entry_handle_t *zeh);
+
+/*
+ * Read the first num_integers in the attribute. Integer size
+ * conversion will be done without sign extension. Return EINVAL if
+ * integer_size is too small. Return EOVERFLOW if there are more than
+ * num_integers in the attribute.
+ */
+extern int zap_entry_read(const zap_entry_handle_t *zeh,
+ uint8_t integer_size, uint64_t num_integers, void *buf);
+
+extern int zap_entry_read_name(struct zap *zap, const zap_entry_handle_t *zeh,
+ uint16_t buflen, char *buf);
+
+/*
+ * Replace the value of an existing entry.
+ *
+ * zap_entry_update may fail if it runs out of space (ENOSPC).
+ */
+extern int zap_entry_update(zap_entry_handle_t *zeh,
+ uint8_t integer_size, uint64_t num_integers, const void *buf);
+
+/*
+ * Remove an entry.
+ */
+extern void zap_entry_remove(zap_entry_handle_t *zeh);
+
+/*
+ * Create an entry. An equal entry must not exist, and this entry must
+ * belong in this leaf (according to its hash value). Fills in the
+ * entry handle on success. Returns 0 on success or ENOSPC on failure.
+ */
+extern int zap_entry_create(zap_leaf_t *l, struct zap_name *zn, uint32_t cd,
+ uint8_t integer_size, uint64_t num_integers, const void *buf,
+ zap_entry_handle_t *zeh);
+
+/*
+ * Return true if there are additional entries with the same normalized
+ * form.
+ */
+extern boolean_t zap_entry_normalization_conflict(zap_entry_handle_t *zeh,
+ struct zap_name *zn, const char *name, struct zap *zap);
+
+/*
+ * Other stuff.
+ */
+
+extern void zap_leaf_init(zap_leaf_t *l, boolean_t sort);
+extern void zap_leaf_byteswap(zap_leaf_phys_t *buf, int len);
+extern void zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort);
+extern void zap_leaf_stats(struct zap *zap, zap_leaf_t *l,
+ struct zap_stats *zs);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZAP_LEAF_H */
diff --git a/uts/common/fs/zfs/sys/zfs_acl.h b/uts/common/fs/zfs/sys/zfs_acl.h
new file mode 100644
index 000000000000..c1a0aeebdce4
--- /dev/null
+++ b/uts/common/fs/zfs/sys/zfs_acl.h
@@ -0,0 +1,245 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _SYS_FS_ZFS_ACL_H
+#define _SYS_FS_ZFS_ACL_H
+
+#ifdef _KERNEL
+#include <sys/isa_defs.h>
+#include <sys/types32.h>
+#endif
+#include <sys/acl.h>
+#include <sys/dmu.h>
+#include <sys/zfs_fuid.h>
+#include <sys/sa.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct znode_phys;
+
+#define ACE_SLOT_CNT 6
+#define ZFS_ACL_VERSION_INITIAL 0ULL
+#define ZFS_ACL_VERSION_FUID 1ULL
+#define ZFS_ACL_VERSION ZFS_ACL_VERSION_FUID
+
+/*
+ * ZFS ACLs are stored in various forms.
+ * Files created with ACL version ZFS_ACL_VERSION_INITIAL
+ * will all be created with fixed length ACEs of type
+ * zfs_oldace_t.
+ *
+ * Files with ACL version ZFS_ACL_VERSION_FUID will be created
+ * with various sized ACEs. The abstraction entries will utilize
+ * zfs_ace_hdr_t, normal user/group entries will use zfs_ace_t
+ * and some specialized CIFS ACEs will use zfs_object_ace_t.
+ */
+
+/*
+ * All ACEs have a common hdr. For
+ * owner@, group@, and everyone@ this is all
+ * that's needed.
+ */
+typedef struct zfs_ace_hdr {
+ uint16_t z_type;
+ uint16_t z_flags;
+ uint32_t z_access_mask;
+} zfs_ace_hdr_t; /* 8 bytes: 2 + 2 + 4 */
+
+typedef zfs_ace_hdr_t zfs_ace_abstract_t; /* owner@, group@ and everyone@ entries need only the header */
+
+/*
+ * Standard ACE
+ */
+typedef struct zfs_ace {
+ zfs_ace_hdr_t z_hdr;
+ uint64_t z_fuid;
+} zfs_ace_t;
+
+/*
+ * The following type only applies to ACE_ACCESS_ALLOWED|DENIED_OBJECT_ACE_TYPE
+ * and will only be set/retrieved in a CIFS context.
+ */
+
+typedef struct zfs_object_ace {
+ zfs_ace_t z_ace;
+ uint8_t z_object_type[16]; /* object type */
+ uint8_t z_inherit_type[16]; /* inherited object type */
+} zfs_object_ace_t;
+
+typedef struct zfs_oldace {
+ uint32_t z_fuid; /* "who" */
+ uint32_t z_access_mask; /* access mask */
+ uint16_t z_flags; /* flags, i.e inheritance */
+ uint16_t z_type; /* type of entry allow/deny */
+} zfs_oldace_t; /* 12 bytes; ZFS_ACE_SPACE is ACE_SLOT_CNT of these */
+
+typedef struct zfs_acl_phys_v0 {
+ uint64_t z_acl_extern_obj; /* ext acl pieces */
+ uint32_t z_acl_count; /* Number of ACEs */
+ uint16_t z_acl_version; /* acl version */
+ uint16_t z_acl_pad; /* pad */
+ zfs_oldace_t z_ace_data[ACE_SLOT_CNT]; /* 6 standard ACEs */
+} zfs_acl_phys_v0_t; /* 16-byte header + 6 * 12 bytes of ACEs = 88 bytes */
+
+#define ZFS_ACE_SPACE (sizeof (zfs_oldace_t) * ACE_SLOT_CNT) /* 12 * 6 = 72 bytes */
+
+/*
+ * Size of ACL count is always 2 bytes.
+ * Necessary for dealing with both V0 ACL and V1 ACL layout
+ */
+#define ZFS_ACL_COUNT_SIZE (sizeof (uint16_t))
+
+typedef struct zfs_acl_phys {
+ uint64_t z_acl_extern_obj; /* ext acl pieces */
+ uint32_t z_acl_size; /* Number of bytes in ACL; same offset as v0 z_acl_count */
+ uint16_t z_acl_version; /* acl version */
+ uint16_t z_acl_count; /* ace count; same offset as v0 z_acl_pad */
+ uint8_t z_ace_data[ZFS_ACE_SPACE]; /* space for embedded ACEs */
+} zfs_acl_phys_t; /* same 88-byte footprint as zfs_acl_phys_v0_t */
+
+typedef struct acl_ops {
+ uint32_t (*ace_mask_get) (void *acep); /* get access mask */
+ void (*ace_mask_set) (void *acep,
+ uint32_t mask); /* set access mask */
+ uint16_t (*ace_flags_get) (void *acep); /* get flags */
+ void (*ace_flags_set) (void *acep,
+ uint16_t flags); /* set flags */
+ uint16_t (*ace_type_get)(void *acep); /* get type */
+ void (*ace_type_set)(void *acep,
+ uint16_t type); /* set type */
+ uint64_t (*ace_who_get)(void *acep); /* get who/fuid */
+ void (*ace_who_set)(void *acep,
+ uint64_t who); /* set who/fuid */
+ size_t (*ace_size)(void *acep); /* how big is this ace */
+ size_t (*ace_abstract_size)(void); /* sizeof abstract entry */
+ int (*ace_mask_off)(void); /* off of access mask in ace */
+ int (*ace_data)(void *acep, void **datap);
+ /* ptr to data if any */
+} acl_ops_t;
+
+/*
+ * A zfs_acl_t structure is composed of a list of zfs_acl_node_t's.
+ * Each node will have one or more ACEs associated with it. You will
+ * only have multiple nodes during a chmod operation. Normally only
+ * one node is required.
+ */
+typedef struct zfs_acl_node {
+ list_node_t z_next; /* Next chunk of ACEs */
+ void *z_acldata; /* pointer into actual ACE(s) */
+ void *z_allocdata; /* pointer to kmem allocated memory */
+ size_t z_allocsize; /* Size of blob in bytes */
+ size_t z_size; /* length of ACL data */
+ uint64_t z_ace_count; /* number of ACEs in this acl node */
+ int z_ace_idx; /* ace iterator positioned on */
+} zfs_acl_node_t;
+
+typedef struct zfs_acl {
+ uint64_t z_acl_count; /* Number of ACEs */
+ size_t z_acl_bytes; /* Number of bytes in ACL */
+ uint_t z_version; /* version of ACL */
+ void *z_next_ace; /* pointer to next ACE */
+ uint64_t z_hints; /* ACL hints (ZFS_INHERIT_ACE ...) */
+ zfs_acl_node_t *z_curr_node; /* current node iterator is handling */
+ list_t z_acl; /* chunks of ACE data */
+ acl_ops_t z_ops; /* ACL operations */
+} zfs_acl_t;
+
+typedef struct acl_locator_cb {
+ zfs_acl_t *cb_aclp;
+ zfs_acl_node_t *cb_acl_node;
+} zfs_acl_locator_cb_t;
+
+#define ACL_DATA_ALLOCED 0x1
+#define ZFS_ACL_SIZE(aclcnt) (sizeof (ace_t) * (aclcnt))
+
+struct zfs_fuid_info;
+
+typedef struct zfs_acl_ids {
+ uint64_t z_fuid; /* file owner fuid */
+ uint64_t z_fgid; /* file group owner fuid */
+ uint64_t z_mode; /* mode to set on create */
+ zfs_acl_t *z_aclp; /* ACL to create with file */
+ struct zfs_fuid_info *z_fuidp; /* for tracking fuids for log */
+} zfs_acl_ids_t;
+
+/*
+ * Property values for acl_mode and acl_inherit.
+ *
+ * acl_mode can take discard, noallow, groupmask and passthrough,
+ * whereas acl_inherit has secure instead of groupmask.
+ */
+
+#define ZFS_ACL_DISCARD 0
+#define ZFS_ACL_NOALLOW 1
+#define ZFS_ACL_GROUPMASK 2
+#define ZFS_ACL_PASSTHROUGH 3
+#define ZFS_ACL_RESTRICTED 4
+#define ZFS_ACL_PASSTHROUGH_X 5
+
+struct znode;
+struct zfsvfs;
+
+#ifdef _KERNEL
+int zfs_acl_ids_create(struct znode *, int, vattr_t *,
+ cred_t *, vsecattr_t *, zfs_acl_ids_t *);
+void zfs_acl_ids_free(zfs_acl_ids_t *);
+boolean_t zfs_acl_ids_overquota(struct zfsvfs *, zfs_acl_ids_t *);
+int zfs_getacl(struct znode *, vsecattr_t *, boolean_t, cred_t *);
+int zfs_setacl(struct znode *, vsecattr_t *, boolean_t, cred_t *);
+void zfs_acl_rele(void *);
+void zfs_oldace_byteswap(ace_t *, int);
+void zfs_ace_byteswap(void *, size_t, boolean_t);
+extern boolean_t zfs_has_access(struct znode *zp, cred_t *cr);
+extern int zfs_zaccess(struct znode *, int, int, boolean_t, cred_t *);
+int zfs_fastaccesschk_execute(struct znode *, cred_t *);
+extern int zfs_zaccess_rwx(struct znode *, mode_t, int, cred_t *);
+extern int zfs_zaccess_unix(struct znode *, mode_t, cred_t *);
+extern int zfs_acl_access(struct znode *, int, cred_t *);
+void zfs_acl_chmod_setattr(struct znode *, zfs_acl_t **, uint64_t);
+int zfs_zaccess_delete(struct znode *, struct znode *, cred_t *);
+int zfs_zaccess_rename(struct znode *, struct znode *,
+ struct znode *, struct znode *, cred_t *cr);
+void zfs_acl_free(zfs_acl_t *);
+int zfs_vsec_2_aclp(struct zfsvfs *, vtype_t, vsecattr_t *, cred_t *,
+ struct zfs_fuid_info **, zfs_acl_t **);
+int zfs_aclset_common(struct znode *, zfs_acl_t *, cred_t *, dmu_tx_t *);
+uint64_t zfs_external_acl(struct znode *);
+int zfs_znode_acl_version(struct znode *);
+int zfs_acl_size(struct znode *, int *);
+zfs_acl_t *zfs_acl_alloc(int);
+zfs_acl_node_t *zfs_acl_node_alloc(size_t);
+void zfs_acl_xform(struct znode *, zfs_acl_t *, cred_t *);
+void zfs_acl_data_locator(void **, uint32_t *, uint32_t, boolean_t, void *);
+uint64_t zfs_mode_compute(uint64_t, zfs_acl_t *,
+ uint64_t *, uint64_t, uint64_t);
+int zfs_acl_chown_setattr(struct znode *);
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* _SYS_FS_ZFS_ACL_H */
diff --git a/uts/common/fs/zfs/sys/zfs_context.h b/uts/common/fs/zfs/sys/zfs_context.h
new file mode 100644
index 000000000000..558e9e1884e3
--- /dev/null
+++ b/uts/common/fs/zfs/sys/zfs_context.h
@@ -0,0 +1,73 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_ZFS_CONTEXT_H
+#define _SYS_ZFS_CONTEXT_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/note.h>
+#include <sys/types.h>
+#include <sys/t_lock.h>
+#include <sys/atomic.h>
+#include <sys/sysmacros.h>
+#include <sys/bitmap.h>
+#include <sys/cmn_err.h>
+#include <sys/kmem.h>
+#include <sys/taskq.h>
+#include <sys/buf.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/cpuvar.h>
+#include <sys/kobj.h>
+#include <sys/conf.h>
+#include <sys/disp.h>
+#include <sys/debug.h>
+#include <sys/random.h>
+#include <sys/byteorder.h>
+#include <sys/systm.h> /* NOTE(review): second include of <sys/systm.h> in this list */
+#include <sys/list.h>
+#include <sys/uio.h>
+#include <sys/dirent.h>
+#include <sys/time.h>
+#include <vm/seg_kmem.h>
+#include <sys/zone.h>
+#include <sys/uio.h> /* NOTE(review): second include of <sys/uio.h> in this list */
+#include <sys/zfs_debug.h>
+#include <sys/sysevent.h>
+#include <sys/sysevent/eventdefs.h>
+#include <sys/sysevent/dev.h>
+#include <sys/fm/util.h>
+#include <sys/sunddi.h>
+
+#define CPU_SEQID (CPU->cpu_seqid) /* cpu_seqid of whatever CPU expands to at the point of use */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZFS_CONTEXT_H */
diff --git a/uts/common/fs/zfs/sys/zfs_ctldir.h b/uts/common/fs/zfs/sys/zfs_ctldir.h
new file mode 100644
index 000000000000..f88ef95fdca8
--- /dev/null
+++ b/uts/common/fs/zfs/sys/zfs_ctldir.h
@@ -0,0 +1,73 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _ZFS_CTLDIR_H
+#define _ZFS_CTLDIR_H
+
+#include <sys/pathname.h>
+#include <sys/vnode.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/zfs_znode.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define ZFS_CTLDIR_NAME ".zfs"
+
+#define zfs_has_ctldir(zdp) \
+ ((zdp)->z_id == (zdp)->z_zfsvfs->z_root && \
+ ((zdp)->z_zfsvfs->z_ctldir != NULL)) /* zdp's z_id equals its zfsvfs's z_root and z_ctldir is set; zdp is evaluated three times */
+#define zfs_show_ctldir(zdp) \
+ (zfs_has_ctldir(zdp) && \
+ ((zdp)->z_zfsvfs->z_show_ctldir)) /* zfs_has_ctldir() holds and z_show_ctldir is nonzero */
+
+void zfsctl_create(zfsvfs_t *);
+void zfsctl_destroy(zfsvfs_t *);
+vnode_t *zfsctl_root(znode_t *);
+void zfsctl_init(void);
+void zfsctl_fini(void);
+boolean_t zfsctl_is_node(vnode_t *);
+
+int zfsctl_rename_snapshot(const char *from, const char *to);
+int zfsctl_destroy_snapshot(const char *snapname, int force);
+int zfsctl_umount_snapshots(vfs_t *, int, cred_t *);
+
+int zfsctl_root_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp,
+ int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
+ int *direntflags, pathname_t *realpnp);
+
+int zfsctl_make_fid(zfsvfs_t *zfsvfsp, uint64_t object, uint32_t gen,
+ fid_t *fidp);
+int zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp);
+
+#define ZFSCTL_INO_ROOT 0x1
+#define ZFSCTL_INO_SNAPDIR 0x2
+#define ZFSCTL_INO_SHARES 0x3
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ZFS_CTLDIR_H */
diff --git a/uts/common/fs/zfs/sys/zfs_debug.h b/uts/common/fs/zfs/sys/zfs_debug.h
new file mode 100644
index 000000000000..50ecf9b36249
--- /dev/null
+++ b/uts/common/fs/zfs/sys/zfs_debug.h
@@ -0,0 +1,82 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _SYS_ZFS_DEBUG_H
+#define _SYS_ZFS_DEBUG_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef TRUE
+#define TRUE 1
+#endif
+
+#ifndef FALSE
+#define FALSE 0
+#endif
+
+/*
+ * ZFS debugging
+ */
+
+#if defined(DEBUG) || !defined(_KERNEL) /* on when DEBUG is defined, and for every build without _KERNEL */
+#define ZFS_DEBUG
+#endif
+
+extern int zfs_flags; /* tested bitwise against the ZFS_DEBUG_* values below, e.g. by dprintf() */
+
+#define ZFS_DEBUG_DPRINTF 0x0001 /* dprintf() calls __dprintf() only when this bit is set */
+#define ZFS_DEBUG_DBUF_VERIFY 0x0002
+#define ZFS_DEBUG_DNODE_VERIFY 0x0004
+#define ZFS_DEBUG_SNAPNAMES 0x0008
+#define ZFS_DEBUG_MODIFY 0x0010
+
+#ifdef ZFS_DEBUG /* dprintf() is a void ?: expression, not a bare "if", so it cannot capture a caller's "else" */
+extern void __dprintf(const char *file, const char *func,
+ int line, const char *fmt, ...);
+#define dprintf(...) \
+ ((zfs_flags & ZFS_DEBUG_DPRINTF) ? \
+ __dprintf(__FILE__, __func__, __LINE__, __VA_ARGS__) : (void)0)
+#else /* dprintf() expands to ((void)0); its arguments are never evaluated */
+#define dprintf(...) ((void)0)
+#endif /* ZFS_DEBUG */
+
+extern void zfs_panic_recover(const char *fmt, ...);
+
+typedef struct zfs_dbgmsg {
+ list_node_t zdm_node;
+ time_t zdm_timestamp;
+ char zdm_msg[1]; /* variable length allocation */
+} zfs_dbgmsg_t;
+
+extern void zfs_dbgmsg_init(void);
+extern void zfs_dbgmsg_fini(void);
+extern void zfs_dbgmsg(const char *fmt, ...);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZFS_DEBUG_H */
diff --git a/uts/common/fs/zfs/sys/zfs_dir.h b/uts/common/fs/zfs/sys/zfs_dir.h
new file mode 100644
index 000000000000..349f8ef37321
--- /dev/null
+++ b/uts/common/fs/zfs/sys/zfs_dir.h
@@ -0,0 +1,74 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_FS_ZFS_DIR_H
+#define _SYS_FS_ZFS_DIR_H
+
+#include <sys/pathname.h>
+#include <sys/dmu.h>
+#include <sys/zfs_znode.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* zfs_dirent_lock() flags */
+#define ZNEW 0x0001 /* entry should not exist */
+#define ZEXISTS 0x0002 /* entry should exist */
+#define ZSHARED 0x0004 /* shared access (zfs_dirlook()) */
+#define ZXATTR 0x0008 /* we want the xattr dir */
+#define ZRENAMING 0x0010 /* znode is being renamed */
+#define ZCILOOK 0x0020 /* case-insensitive lookup requested */
+#define ZCIEXACT 0x0040 /* c-i requires c-s match (rename) */
+#define ZHAVELOCK 0x0080 /* z_name_lock is already held */
+
+/* mknode flags */
+#define IS_ROOT_NODE 0x01 /* create a root node */
+#define IS_XATTR 0x02 /* create an extended attribute node */
+
+extern int zfs_dirent_lock(zfs_dirlock_t **, znode_t *, char *, znode_t **,
+ int, int *, pathname_t *);
+extern void zfs_dirent_unlock(zfs_dirlock_t *);
+extern int zfs_link_create(zfs_dirlock_t *, znode_t *, dmu_tx_t *, int);
+extern int zfs_link_destroy(zfs_dirlock_t *, znode_t *, dmu_tx_t *, int,
+ boolean_t *);
+extern int zfs_dirlook(znode_t *, char *, vnode_t **, int, int *,
+ pathname_t *);
+extern void zfs_mknode(znode_t *, vattr_t *, dmu_tx_t *, cred_t *,
+ uint_t, znode_t **, zfs_acl_ids_t *);
+extern void zfs_rmnode(znode_t *);
+extern void zfs_dl_name_switch(zfs_dirlock_t *dl, char *new, char **old);
+extern boolean_t zfs_dirempty(znode_t *);
+extern void zfs_unlinked_add(znode_t *, dmu_tx_t *);
+extern void zfs_unlinked_drain(zfsvfs_t *zfsvfs);
+extern int zfs_sticky_remove_access(znode_t *, znode_t *, cred_t *cr);
+extern int zfs_get_xattrdir(znode_t *, vnode_t **, cred_t *, int);
+extern int zfs_make_xattrdir(znode_t *, vattr_t *, vnode_t **, cred_t *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_FS_ZFS_DIR_H */
diff --git a/uts/common/fs/zfs/sys/zfs_fuid.h b/uts/common/fs/zfs/sys/zfs_fuid.h
new file mode 100644
index 000000000000..0feb3ce4bb7c
--- /dev/null
+++ b/uts/common/fs/zfs/sys/zfs_fuid.h
@@ -0,0 +1,131 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_FS_ZFS_FUID_H
+#define _SYS_FS_ZFS_FUID_H
+
+#ifdef _KERNEL
+#include <sys/kidmap.h>
+#include <sys/sid.h>
+#include <sys/dmu.h>
+#include <sys/zfs_vfsops.h>
+#endif
+#include <sys/avl.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef enum {
+ ZFS_OWNER,
+ ZFS_GROUP,
+ ZFS_ACE_USER,
+ ZFS_ACE_GROUP
+} zfs_fuid_type_t;
+
+/*
+ * Estimate space needed for one more fuid table entry.
+ * For now, assume its current size + 1K.
+ */
+#define FUID_SIZE_ESTIMATE(z) ((z)->z_fuid_size + (SPA_MINBLOCKSIZE << 1))
+
+#define FUID_INDEX(x) ((x) >> 32)
+#define FUID_RID(x) ((x) & 0xffffffff)
+#define FUID_ENCODE(idx, rid) (((uint64_t)(idx) << 32) | (rid))
+/*
+ * FUIDs cause problems for the intent log:
+ * we need to replay the creation of the FUID,
+ * but we can't count on the idmapper to be around,
+ * and during replay the FUID index may be different than
+ * before. Also, if an ACL has 100 ACEs and 12 different
+ * domains we don't want to log 100 domain strings, but rather
+ * just the unique 12.
+ */
+
+/*
+ * The top half of each FUID in the log is an index into the
+ * domain string table and the bottom half is the rid.
+ * Used for mapping ephemeral uid/gid to FUIDs during ACL setting.
+ */
+typedef struct zfs_fuid {
+ list_node_t z_next;
+ uint64_t z_id; /* uid/gid being converted to fuid */
+ uint64_t z_domidx; /* index in AVL domain table */
+ uint64_t z_logfuid; /* index for domain in log */
+} zfs_fuid_t;
+
+/* list of unique domains */
+typedef struct zfs_fuid_domain {
+ list_node_t z_next;
+ uint64_t z_domidx; /* AVL tree idx */
+ const char *z_domain; /* domain string */
+} zfs_fuid_domain_t;
+
+/*
+ * FUID information necessary for logging create, setattr, and setacl.
+ */
+typedef struct zfs_fuid_info {
+ list_t z_fuids;
+ list_t z_domains;
+ uint64_t z_fuid_owner;
+ uint64_t z_fuid_group;
+ char **z_domain_table; /* Used during replay */
+ uint32_t z_fuid_cnt; /* How many fuids in z_fuids */
+ uint32_t z_domain_cnt; /* How many domains */
+ size_t z_domain_str_sz; /* len of domain strings in z_domains list */
+} zfs_fuid_info_t;
+
+#ifdef _KERNEL
+struct znode;
+extern uid_t zfs_fuid_map_id(zfsvfs_t *, uint64_t, cred_t *, zfs_fuid_type_t);
+extern void zfs_fuid_node_add(zfs_fuid_info_t **, const char *, uint32_t,
+ uint64_t, uint64_t, zfs_fuid_type_t);
+extern void zfs_fuid_destroy(zfsvfs_t *);
+extern uint64_t zfs_fuid_create_cred(zfsvfs_t *, zfs_fuid_type_t,
+ cred_t *, zfs_fuid_info_t **);
+extern uint64_t zfs_fuid_create(zfsvfs_t *, uint64_t, cred_t *, zfs_fuid_type_t,
+ zfs_fuid_info_t **);
+extern void zfs_fuid_map_ids(struct znode *zp, cred_t *cr,
+ uid_t *uid, uid_t *gid);
+extern zfs_fuid_info_t *zfs_fuid_info_alloc(void);
+extern void zfs_fuid_info_free(zfs_fuid_info_t *);
+extern boolean_t zfs_groupmember(zfsvfs_t *, uint64_t, cred_t *);
+void zfs_fuid_sync(zfsvfs_t *, dmu_tx_t *);
+extern int zfs_fuid_find_by_domain(zfsvfs_t *, const char *domain,
+ char **retdomain, boolean_t addok);
+extern const char *zfs_fuid_find_by_idx(zfsvfs_t *zfsvfs, uint32_t idx);
+extern void zfs_fuid_txhold(zfsvfs_t *zfsvfs, dmu_tx_t *tx);
+#endif
+
+char *zfs_fuid_idx_domain(avl_tree_t *, uint32_t);
+void zfs_fuid_avl_tree_create(avl_tree_t *, avl_tree_t *);
+uint64_t zfs_fuid_table_load(objset_t *, uint64_t, avl_tree_t *, avl_tree_t *);
+void zfs_fuid_table_destroy(avl_tree_t *, avl_tree_t *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_FS_ZFS_FUID_H */
diff --git a/uts/common/fs/zfs/sys/zfs_ioctl.h b/uts/common/fs/zfs/sys/zfs_ioctl.h
new file mode 100644
index 000000000000..84bf794fe5f0
--- /dev/null
+++ b/uts/common/fs/zfs/sys/zfs_ioctl.h
@@ -0,0 +1,349 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _SYS_ZFS_IOCTL_H
+#define _SYS_ZFS_IOCTL_H
+
+#include <sys/cred.h>
+#include <sys/dmu.h>
+#include <sys/zio.h>
+#include <sys/dsl_deleg.h>
+#include <sys/spa.h>
+#include <sys/zfs_stat.h>
+
+#ifdef _KERNEL
+#include <sys/nvpair.h>
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Property values for snapdir
+ */
+#define ZFS_SNAPDIR_HIDDEN 0
+#define ZFS_SNAPDIR_VISIBLE 1
+
+/*
+ * Field manipulation macros for the drr_versioninfo field of the
+ * send stream header.
+ */
+
+/*
+ * Header types for zfs send streams.
+ */
+typedef enum drr_headertype {
+ DMU_SUBSTREAM = 0x1,
+ DMU_COMPOUNDSTREAM = 0x2
+} drr_headertype_t;
+
+#define DMU_GET_STREAM_HDRTYPE(vi) BF64_GET((vi), 0, 2)
+#define DMU_SET_STREAM_HDRTYPE(vi, x) BF64_SET((vi), 0, 2, x)
+
+#define DMU_GET_FEATUREFLAGS(vi) BF64_GET((vi), 2, 30)
+#define DMU_SET_FEATUREFLAGS(vi, x) BF64_SET((vi), 2, 30, x)
+
+/*
+ * Feature flags for zfs send streams (flags in drr_versioninfo)
+ */
+
+#define DMU_BACKUP_FEATURE_DEDUP (0x1)
+#define DMU_BACKUP_FEATURE_DEDUPPROPS (0x2)
+#define DMU_BACKUP_FEATURE_SA_SPILL (0x4)
+
+/*
+ * Mask of all supported backup features
+ */
+#define DMU_BACKUP_FEATURE_MASK (DMU_BACKUP_FEATURE_DEDUP | \
+ DMU_BACKUP_FEATURE_DEDUPPROPS | DMU_BACKUP_FEATURE_SA_SPILL)
+
+/* Are all features in the given flag word currently supported? */
+#define DMU_STREAM_SUPPORTED(x) (!((x) & ~DMU_BACKUP_FEATURE_MASK))
+
+/*
+ * The drr_versioninfo field of the dmu_replay_record has the
+ * following layout:
+ *
+ * 64 56 48 40 32 24 16 8 0
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * | reserved | feature-flags |C|S|
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ *
+ * The low order two bits indicate the header type: SUBSTREAM (0x1)
+ * or COMPOUNDSTREAM (0x2). Using two bits for this is historical:
+ * this field used to be a version number, where the two version types
+ * were 1 and 2. Using two bits for this allows earlier versions of
+ * the code to be able to recognize send streams that don't use any
+ * of the features indicated by feature flags.
+ */
+
+#define DMU_BACKUP_MAGIC 0x2F5bacbacULL
+
+#define DRR_FLAG_CLONE (1<<0)
+#define DRR_FLAG_CI_DATA (1<<1)
+
+/*
+ * flags in the drr_checksumflags field in the DRR_WRITE and
+ * DRR_WRITE_BYREF blocks
+ */
+#define DRR_CHECKSUM_DEDUP (1<<0)
+
+#define DRR_IS_DEDUP_CAPABLE(flags) ((flags) & DRR_CHECKSUM_DEDUP)
+
+/*
+ * zfs send stream replay record
+ */
+typedef struct dmu_replay_record {
+ enum {
+ DRR_BEGIN, DRR_OBJECT, DRR_FREEOBJECTS,
+ DRR_WRITE, DRR_FREE, DRR_END, DRR_WRITE_BYREF,
+ DRR_SPILL, DRR_NUMTYPES
+ } drr_type;
+ uint32_t drr_payloadlen;
+ union {
+ struct drr_begin {
+ uint64_t drr_magic;
+ uint64_t drr_versioninfo; /* was drr_version */
+ uint64_t drr_creation_time;
+ dmu_objset_type_t drr_type;
+ uint32_t drr_flags;
+ uint64_t drr_toguid;
+ uint64_t drr_fromguid;
+ char drr_toname[MAXNAMELEN];
+ } drr_begin;
+ struct drr_end {
+ zio_cksum_t drr_checksum;
+ uint64_t drr_toguid;
+ } drr_end;
+ struct drr_object {
+ uint64_t drr_object;
+ dmu_object_type_t drr_type;
+ dmu_object_type_t drr_bonustype;
+ uint32_t drr_blksz;
+ uint32_t drr_bonuslen;
+ uint8_t drr_checksumtype;
+ uint8_t drr_compress;
+ uint8_t drr_pad[6];
+ uint64_t drr_toguid;
+ /* bonus content follows */
+ } drr_object;
+ struct drr_freeobjects {
+ uint64_t drr_firstobj;
+ uint64_t drr_numobjs;
+ uint64_t drr_toguid;
+ } drr_freeobjects;
+ struct drr_write {
+ uint64_t drr_object;
+ dmu_object_type_t drr_type;
+ uint32_t drr_pad;
+ uint64_t drr_offset;
+ uint64_t drr_length;
+ uint64_t drr_toguid;
+ uint8_t drr_checksumtype;
+ uint8_t drr_checksumflags;
+ uint8_t drr_pad2[6];
+ ddt_key_t drr_key; /* deduplication key */
+ /* content follows */
+ } drr_write;
+ struct drr_free {
+ uint64_t drr_object;
+ uint64_t drr_offset;
+ uint64_t drr_length;
+ uint64_t drr_toguid;
+ } drr_free;
+ struct drr_write_byref {
+ /* where to put the data */
+ uint64_t drr_object;
+ uint64_t drr_offset;
+ uint64_t drr_length;
+ uint64_t drr_toguid;
+ /* where to find the prior copy of the data */
+ uint64_t drr_refguid;
+ uint64_t drr_refobject;
+ uint64_t drr_refoffset;
+ /* properties of the data */
+ uint8_t drr_checksumtype;
+ uint8_t drr_checksumflags;
+ uint8_t drr_pad2[6];
+ ddt_key_t drr_key; /* deduplication key */
+ } drr_write_byref;
+ struct drr_spill {
+ uint64_t drr_object;
+ uint64_t drr_length;
+ uint64_t drr_toguid;
+ uint64_t drr_pad[4]; /* needed for crypto */
+ /* spill data follows */
+ } drr_spill;
+ } drr_u;
+} dmu_replay_record_t;
+
+/* diff record range types */
+typedef enum diff_type {
+ DDR_NONE = 0x1,
+ DDR_INUSE = 0x2,
+ DDR_FREE = 0x4
+} diff_type_t;
+
+/*
+ * The diff reports back ranges of free or in-use objects.
+ */
+typedef struct dmu_diff_record {
+ uint64_t ddr_type;
+ uint64_t ddr_first;
+ uint64_t ddr_last;
+} dmu_diff_record_t;
+
+typedef struct zinject_record {
+ uint64_t zi_objset;
+ uint64_t zi_object;
+ uint64_t zi_start;
+ uint64_t zi_end;
+ uint64_t zi_guid;
+ uint32_t zi_level;
+ uint32_t zi_error;
+ uint64_t zi_type;
+ uint32_t zi_freq;
+ uint32_t zi_failfast;
+ char zi_func[MAXNAMELEN];
+ uint32_t zi_iotype;
+ int32_t zi_duration;
+ uint64_t zi_timer;
+} zinject_record_t;
+
+#define ZINJECT_NULL 0x1
+#define ZINJECT_FLUSH_ARC 0x2
+#define ZINJECT_UNLOAD_SPA 0x4
+
+typedef struct zfs_share {
+ uint64_t z_exportdata;
+ uint64_t z_sharedata;
+ uint64_t z_sharetype; /* 0 = share, 1 = unshare */
+ uint64_t z_sharemax; /* max length of share string */
+} zfs_share_t;
+
+/*
+ * ZFS file systems may behave the usual, POSIX-compliant way, where
+ * name lookups are case-sensitive. They may also be set up so that
+ * all the name lookups are case-insensitive, or so that only some
+ * lookups, the ones that set an FIGNORECASE flag, are case-insensitive.
+ */
+typedef enum zfs_case {
+ ZFS_CASE_SENSITIVE,
+ ZFS_CASE_INSENSITIVE,
+ ZFS_CASE_MIXED
+} zfs_case_t;
+
+typedef struct zfs_cmd {
+ char zc_name[MAXPATHLEN];
+ char zc_value[MAXPATHLEN * 2];
+ char zc_string[MAXNAMELEN];
+ char zc_top_ds[MAXPATHLEN];
+ uint64_t zc_guid;
+ uint64_t zc_nvlist_conf; /* really (char *) */
+ uint64_t zc_nvlist_conf_size;
+ uint64_t zc_nvlist_src; /* really (char *) */
+ uint64_t zc_nvlist_src_size;
+ uint64_t zc_nvlist_dst; /* really (char *) */
+ uint64_t zc_nvlist_dst_size;
+ uint64_t zc_cookie;
+ uint64_t zc_objset_type;
+ uint64_t zc_perm_action;
+ uint64_t zc_history; /* really (char *) */
+ uint64_t zc_history_len;
+ uint64_t zc_history_offset;
+ uint64_t zc_obj;
+ uint64_t zc_iflags; /* internal to zfs(7fs) */
+ zfs_share_t zc_share;
+ dmu_objset_stats_t zc_objset_stats;
+ struct drr_begin zc_begin_record;
+ zinject_record_t zc_inject_record;
+ boolean_t zc_defer_destroy;
+ boolean_t zc_temphold;
+ uint64_t zc_action_handle;
+ int zc_cleanup_fd;
+ uint8_t zc_pad[4]; /* alignment */
+ uint64_t zc_sendobj;
+ uint64_t zc_fromobj;
+ uint64_t zc_createtxg;
+ zfs_stat_t zc_stat;
+} zfs_cmd_t;
+
+typedef struct zfs_useracct {
+ char zu_domain[256];
+ uid_t zu_rid;
+ uint32_t zu_pad;
+ uint64_t zu_space;
+} zfs_useracct_t;
+
+#define ZFSDEV_MAX_MINOR (1 << 16)
+#define ZFS_MIN_MINOR (ZFSDEV_MAX_MINOR + 1)
+
+#define ZPOOL_EXPORT_AFTER_SPLIT 0x1
+
+#ifdef _KERNEL
+
+typedef struct zfs_creat {
+ nvlist_t *zct_zplprops;
+ nvlist_t *zct_props;
+} zfs_creat_t;
+
+extern dev_info_t *zfs_dip;
+
+extern int zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr);
+extern int zfs_secpolicy_rename_perms(const char *from,
+ const char *to, cred_t *cr);
+extern int zfs_secpolicy_destroy_perms(const char *name, cred_t *cr);
+extern int zfs_busy(void);
+extern int zfs_unmount_snap(const char *, void *);
+
+/*
+ * ZFS minor numbers can refer to either a control device instance or
+ * a zvol. Depending on the value of zss_type, zss_data points to either
+ * a zvol_state_t or a zfs_onexit_t.
+ */
+enum zfs_soft_state_type {
+ ZSST_ZVOL,
+ ZSST_CTLDEV
+};
+
+typedef struct zfs_soft_state {
+ enum zfs_soft_state_type zss_type;
+ void *zss_data;
+} zfs_soft_state_t;
+
+extern void *zfsdev_get_soft_state(minor_t minor,
+ enum zfs_soft_state_type which);
+extern minor_t zfsdev_minor_alloc(void);
+
+extern void *zfsdev_state;
+extern kmutex_t zfsdev_state_lock;
+
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZFS_IOCTL_H */
diff --git a/uts/common/fs/zfs/sys/zfs_onexit.h b/uts/common/fs/zfs/sys/zfs_onexit.h
new file mode 100644
index 000000000000..4982bd4d0afc
--- /dev/null
+++ b/uts/common/fs/zfs/sys/zfs_onexit.h
@@ -0,0 +1,66 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _SYS_ZFS_ONEXIT_H
+#define _SYS_ZFS_ONEXIT_H
+
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef _KERNEL
+
+typedef struct zfs_onexit {
+ kmutex_t zo_lock;
+ list_t zo_actions;
+} zfs_onexit_t;
+
+typedef struct zfs_onexit_action_node {
+ list_node_t za_link;
+ void (*za_func)(void *);
+ void *za_data;
+} zfs_onexit_action_node_t;
+
+extern void zfs_onexit_init(zfs_onexit_t **zo);
+extern void zfs_onexit_destroy(zfs_onexit_t *zo);
+
+#endif
+
+extern int zfs_onexit_fd_hold(int fd, minor_t *minorp);
+extern void zfs_onexit_fd_rele(int fd);
+extern int zfs_onexit_add_cb(minor_t minor, void (*func)(void *), void *data,
+ uint64_t *action_handle);
+extern int zfs_onexit_del_cb(minor_t minor, uint64_t action_handle,
+ boolean_t fire);
+extern int zfs_onexit_cb_data(minor_t minor, uint64_t action_handle,
+ void **data);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZFS_ONEXIT_H */
diff --git a/uts/common/fs/zfs/sys/zfs_rlock.h b/uts/common/fs/zfs/sys/zfs_rlock.h
new file mode 100644
index 000000000000..f302b663e22a
--- /dev/null
+++ b/uts/common/fs/zfs/sys/zfs_rlock.h
@@ -0,0 +1,89 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_FS_ZFS_RLOCK_H
+#define _SYS_FS_ZFS_RLOCK_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef _KERNEL
+
+#include <sys/zfs_znode.h>
+
+typedef enum {
+ RL_READER,
+ RL_WRITER,
+ RL_APPEND
+} rl_type_t;
+
+typedef struct rl {
+ znode_t *r_zp; /* znode this lock applies to */
+ avl_node_t r_node; /* avl node link */
+ uint64_t r_off; /* file range offset */
+ uint64_t r_len; /* file range length */
+ uint_t r_cnt; /* range reference count in tree */
+ rl_type_t r_type; /* range type */
+ kcondvar_t r_wr_cv; /* cv for waiting writers */
+ kcondvar_t r_rd_cv; /* cv for waiting readers */
+ uint8_t r_proxy; /* acting for original range */
+ uint8_t r_write_wanted; /* writer wants to lock this range */
+ uint8_t r_read_wanted; /* reader wants to lock this range */
+} rl_t;
+
+/*
+ * Lock a range (offset, length) as either shared (READER)
+ * or exclusive (WRITER or APPEND). APPEND is a special type that
+ * is converted to WRITER, with the locked range starting at the
+ * end of file. zfs_range_lock() returns the range lock structure.
+ */
+rl_t *zfs_range_lock(znode_t *zp, uint64_t off, uint64_t len, rl_type_t type);
+
+/*
+ * Unlock range and destroy range lock structure.
+ */
+void zfs_range_unlock(rl_t *rl);
+
+/*
+ * Reduce range locked as RL_WRITER from whole file to specified range.
+ * Asserts the whole file was previously locked.
+ */
+void zfs_range_reduce(rl_t *rl, uint64_t off, uint64_t len);
+
+/*
+ * AVL comparison function used to compare range locks
+ */
+int zfs_range_compare(const void *arg1, const void *arg2);
+
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_FS_ZFS_RLOCK_H */
diff --git a/uts/common/fs/zfs/sys/zfs_sa.h b/uts/common/fs/zfs/sys/zfs_sa.h
new file mode 100644
index 000000000000..cd312b27a94d
--- /dev/null
+++ b/uts/common/fs/zfs/sys/zfs_sa.h
@@ -0,0 +1,143 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_ZFS_SA_H
+#define _SYS_ZFS_SA_H
+
+#ifdef _KERNEL
+#include <sys/types32.h>
+#include <sys/list.h>
+#include <sys/dmu.h>
+#include <sys/zfs_acl.h>
+#include <sys/zfs_znode.h>
+#include <sys/sa.h>
+#include <sys/zil.h>
+
+
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * This is the list of attributes known
+ * to the ZPL. The values of the actual
+ * attributes are not defined by the order
+ * of the enums; they are controlled by the attribute
+ * registration mechanism. Two different file systems
+ * could have different numeric values for the same
+ * attributes. This list is only used for indexing
+ * into the table that will hold the actual numeric value.
+ */
+typedef enum zpl_attr {
+ ZPL_ATIME,
+ ZPL_MTIME,
+ ZPL_CTIME,
+ ZPL_CRTIME,
+ ZPL_GEN,
+ ZPL_MODE,
+ ZPL_SIZE,
+ ZPL_PARENT,
+ ZPL_LINKS,
+ ZPL_XATTR,
+ ZPL_RDEV,
+ ZPL_FLAGS,
+ ZPL_UID,
+ ZPL_GID,
+ ZPL_PAD,
+ ZPL_ZNODE_ACL,
+ ZPL_DACL_COUNT,
+ ZPL_SYMLINK,
+ ZPL_SCANSTAMP,
+ ZPL_DACL_ACES,
+ ZPL_END
+} zpl_attr_t;
+
+#define ZFS_OLD_ZNODE_PHYS_SIZE 0x108
+#define ZFS_SA_BASE_ATTR_SIZE (ZFS_OLD_ZNODE_PHYS_SIZE - \
+ sizeof (zfs_acl_phys_t))
+
+#define SA_MODE_OFFSET 0
+#define SA_SIZE_OFFSET 8
+#define SA_GEN_OFFSET 16
+#define SA_UID_OFFSET 24
+#define SA_GID_OFFSET 32
+#define SA_PARENT_OFFSET 40
+
+extern sa_attr_reg_t zfs_attr_table[ZPL_END + 1];
+extern sa_attr_reg_t zfs_legacy_attr_table[ZPL_END + 1];
+
+/*
+ * This is a deprecated data structure that only exists for
+ * dealing with file systems created prior to ZPL version 5.
+ */
+typedef struct znode_phys {
+ uint64_t zp_atime[2]; /* 0 - last file access time */
+ uint64_t zp_mtime[2]; /* 16 - last file modification time */
+ uint64_t zp_ctime[2]; /* 32 - last file change time */
+ uint64_t zp_crtime[2]; /* 48 - creation time */
+ uint64_t zp_gen; /* 64 - generation (txg of creation) */
+ uint64_t zp_mode; /* 72 - file mode bits */
+ uint64_t zp_size; /* 80 - size of file */
+ uint64_t zp_parent; /* 88 - directory parent (`..') */
+ uint64_t zp_links; /* 96 - number of links to file */
+ uint64_t zp_xattr; /* 104 - DMU object for xattrs */
+ uint64_t zp_rdev; /* 112 - dev_t for VBLK & VCHR files */
+ uint64_t zp_flags; /* 120 - persistent flags */
+ uint64_t zp_uid; /* 128 - file owner */
+ uint64_t zp_gid; /* 136 - owning group */
+ uint64_t zp_zap; /* 144 - extra attributes */
+ uint64_t zp_pad[3]; /* 152 - future */
+ zfs_acl_phys_t zp_acl; /* 176 - 263 ACL */
+ /*
+ * Data may pad out any remaining bytes in the znode buffer, eg:
+ *
+ * |<---------------------- dnode_phys (512) ------------------------>|
+ * |<-- dnode (192) --->|<----------- "bonus" buffer (320) ---------->|
+ * |<---- znode (264) ---->|<---- data (56) ---->|
+ *
+ * At present, we use this space for the following:
+ * - symbolic links
+ * - 32-byte anti-virus scanstamp (regular files only)
+ */
+} znode_phys_t;
+
+#ifdef _KERNEL
+int zfs_sa_readlink(struct znode *, uio_t *);
+void zfs_sa_symlink(struct znode *, char *link, int len, dmu_tx_t *);
+void zfs_sa_upgrade(struct sa_handle *, dmu_tx_t *);
+void zfs_sa_get_scanstamp(struct znode *, xvattr_t *);
+void zfs_sa_set_scanstamp(struct znode *, xvattr_t *, dmu_tx_t *);
+void zfs_sa_uprade_pre(struct sa_handle *, void *, dmu_tx_t *); /* sic: "uprade" */
+void zfs_sa_upgrade_post(struct sa_handle *, void *, dmu_tx_t *);
+void zfs_sa_upgrade_txholds(dmu_tx_t *, struct znode *);
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZFS_SA_H */
diff --git a/uts/common/fs/zfs/sys/zfs_stat.h b/uts/common/fs/zfs/sys/zfs_stat.h
new file mode 100644
index 000000000000..465aefaa2063
--- /dev/null
+++ b/uts/common/fs/zfs/sys/zfs_stat.h
@@ -0,0 +1,56 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _SYS_FS_ZFS_STAT_H
+#define _SYS_FS_ZFS_STAT_H
+
+#ifdef _KERNEL
+#include <sys/isa_defs.h>
+#include <sys/types32.h>
+#include <sys/dmu.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * A limited number of zpl level stats are retrievable
+ * with an ioctl. zfs diff is the current consumer.
+ */
+typedef struct zfs_stat {
+ uint64_t zs_gen;
+ uint64_t zs_mode;
+ uint64_t zs_links;
+ uint64_t zs_ctime[2];
+} zfs_stat_t;
+
+extern int zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb,
+ char *buf, int len);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_FS_ZFS_STAT_H */
diff --git a/uts/common/fs/zfs/sys/zfs_vfsops.h b/uts/common/fs/zfs/sys/zfs_vfsops.h
new file mode 100644
index 000000000000..38c87df4300f
--- /dev/null
+++ b/uts/common/fs/zfs/sys/zfs_vfsops.h
@@ -0,0 +1,159 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _SYS_FS_ZFS_VFSOPS_H
+#define _SYS_FS_ZFS_VFSOPS_H
+
+#include <sys/isa_defs.h>
+#include <sys/types32.h>
+#include <sys/list.h>
+#include <sys/vfs.h>
+#include <sys/zil.h>
+#include <sys/sa.h>
+#include <sys/rrwlock.h>
+#include <sys/zfs_ioctl.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct zfsvfs zfsvfs_t;
+struct znode;
+
+struct zfsvfs {
+ vfs_t *z_vfs; /* generic fs struct */
+ zfsvfs_t *z_parent; /* parent fs */
+ objset_t *z_os; /* objset reference */
+ uint64_t z_root; /* id of root znode */
+ uint64_t z_unlinkedobj; /* id of unlinked zapobj */
+ uint64_t z_max_blksz; /* maximum block size for files */
+ uint64_t z_fuid_obj; /* fuid table object number */
+ uint64_t z_fuid_size; /* fuid table size */
+ avl_tree_t z_fuid_idx; /* fuid tree keyed by index */
+ avl_tree_t z_fuid_domain; /* fuid tree keyed by domain */
+ krwlock_t z_fuid_lock; /* fuid lock */
+ boolean_t z_fuid_loaded; /* fuid tables are loaded */
+ boolean_t z_fuid_dirty; /* need to sync fuid table ? */
+ struct zfs_fuid_info *z_fuid_replay; /* fuid info for replay */
+ zilog_t *z_log; /* intent log pointer */
+ uint_t z_acl_inherit; /* acl inheritance behavior */
+ zfs_case_t z_case; /* case-sense */
+ boolean_t z_utf8; /* utf8-only */
+ int z_norm; /* normalization flags */
+ boolean_t z_atime; /* enable atimes mount option */
+ boolean_t z_unmounted; /* unmounted */
+ rrwlock_t z_teardown_lock;
+ krwlock_t z_teardown_inactive_lock;
+ list_t z_all_znodes; /* all vnodes in the fs */
+ kmutex_t z_znodes_lock; /* lock for z_all_znodes */
+ vnode_t *z_ctldir; /* .zfs directory pointer */
+ boolean_t z_show_ctldir; /* expose .zfs in the root dir */
+ boolean_t z_issnap; /* true if this is a snapshot */
+ boolean_t z_vscan; /* virus scan on/off */
+ boolean_t z_use_fuids; /* version allows fuids */
+ boolean_t z_replay; /* set during ZIL replay */
+ boolean_t z_use_sa; /* version allows system attributes */
+ uint64_t z_version; /* ZPL version */
+ uint64_t z_shares_dir; /* hidden shares dir */
+ kmutex_t z_lock;
+ uint64_t z_userquota_obj;
+ uint64_t z_groupquota_obj;
+ uint64_t z_replay_eof; /* New end of file - replay only */
+ sa_attr_type_t *z_attr_table; /* SA attr mapping->id */
+#define ZFS_OBJ_MTX_SZ 64
+ kmutex_t z_hold_mtx[ZFS_OBJ_MTX_SZ]; /* znode hold locks */
+};
+
+/*
+ * Normal filesystems (those not under .zfs/snapshot) have a total
+ * file ID size limited to 12 bytes (including the length field) due to
+ * NFSv2 protocol's limitation of 32 bytes for a filehandle. For historical
+ * reasons, this same limit is being imposed by the Solaris NFSv3 implementation
+ * (although the NFSv3 protocol actually permits a maximum of 64 bytes). It
+ * is not possible to expand beyond 12 bytes without abandoning support
+ * of NFSv2.
+ *
+ * For normal filesystems, we partition up the available space as follows:
+ * 2 bytes fid length (required)
+ * 6 bytes object number (48 bits)
+ * 4 bytes generation number (32 bits)
+ *
+ * We reserve only 48 bits for the object number, as this is the limit
+ * currently defined and imposed by the DMU.
+ */
+typedef struct zfid_short {
+ uint16_t zf_len;
+ uint8_t zf_object[6]; /* obj[i] = obj >> (8 * i) */
+ uint8_t zf_gen[4]; /* gen[i] = gen >> (8 * i) */
+} zfid_short_t;
+
+/*
+ * Filesystems under .zfs/snapshot have a total file ID size of 22 bytes
+ * (including the length field). This makes files under .zfs/snapshot
+ * accessible by NFSv3 and NFSv4, but not NFSv2.
+ *
+ * For files under .zfs/snapshot, we partition up the available space
+ * as follows:
+ * 2 bytes fid length (required)
+ * 6 bytes object number (48 bits)
+ * 4 bytes generation number (32 bits)
+ * 6 bytes objset id (48 bits)
+ * 4 bytes currently just zero (32 bits)
+ *
+ * We reserve only 48 bits for the object number and objset id, as these are
+ * the limits currently defined and imposed by the DMU.
+ */
+typedef struct zfid_long {
+ zfid_short_t z_fid;
+ uint8_t zf_setid[6]; /* objset id: setid[i] = id >> (8 * i) */
+ uint8_t zf_setgen[4]; /* currently just zero */
+} zfid_long_t;
+
+#define SHORT_FID_LEN (sizeof (zfid_short_t) - sizeof (uint16_t))
+#define LONG_FID_LEN (sizeof (zfid_long_t) - sizeof (uint16_t))
+
+extern uint_t zfs_fsyncer_key;
+
+extern int zfs_suspend_fs(zfsvfs_t *zfsvfs);
+extern int zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname);
+extern int zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
+ const char *domain, uint64_t rid, uint64_t *valuep);
+extern int zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
+ uint64_t *cookiep, void *vbuf, uint64_t *bufsizep);
+extern int zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
+ const char *domain, uint64_t rid, uint64_t quota);
+extern boolean_t zfs_owner_overquota(zfsvfs_t *zfsvfs, struct znode *,
+ boolean_t isgroup);
+extern boolean_t zfs_fuid_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup,
+ uint64_t fuid);
+extern int zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers);
+extern int zfsvfs_create(const char *name, zfsvfs_t **zfvp);
+extern void zfsvfs_free(zfsvfs_t *zfsvfs);
+extern int zfs_check_global_label(const char *dsname, const char *hexsl);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_FS_ZFS_VFSOPS_H */
diff --git a/uts/common/fs/zfs/sys/zfs_znode.h b/uts/common/fs/zfs/sys/zfs_znode.h
new file mode 100644
index 000000000000..3e9621a0ee24
--- /dev/null
+++ b/uts/common/fs/zfs/sys/zfs_znode.h
@@ -0,0 +1,361 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _SYS_FS_ZFS_ZNODE_H
+#define _SYS_FS_ZFS_ZNODE_H
+
+#ifdef _KERNEL
+#include <sys/isa_defs.h>
+#include <sys/types32.h>
+#include <sys/attr.h>
+#include <sys/list.h>
+#include <sys/dmu.h>
+#include <sys/sa.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/rrwlock.h>
+#include <sys/zfs_sa.h>
+#include <sys/zfs_stat.h>
+#endif
+#include <sys/zfs_acl.h>
+#include <sys/zil.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Additional file level attributes, that are stored
+ * in the upper half of zp_flags
+ */
+#define ZFS_READONLY 0x0000000100000000
+#define ZFS_HIDDEN 0x0000000200000000
+#define ZFS_SYSTEM 0x0000000400000000
+#define ZFS_ARCHIVE 0x0000000800000000
+#define ZFS_IMMUTABLE 0x0000001000000000
+#define ZFS_NOUNLINK 0x0000002000000000
+#define ZFS_APPENDONLY 0x0000004000000000
+#define ZFS_NODUMP 0x0000008000000000
+#define ZFS_OPAQUE 0x0000010000000000
+#define ZFS_AV_QUARANTINED 0x0000020000000000
+#define ZFS_AV_MODIFIED 0x0000040000000000
+#define ZFS_REPARSE 0x0000080000000000
+#define ZFS_OFFLINE 0x0000100000000000
+#define ZFS_SPARSE 0x0000200000000000
+
+#define ZFS_ATTR_SET(zp, attr, value, pflags, tx) \
+{ /* set or clear attr in pflags, then store pflags via sa_update() */ \
+ if (value) \
+ (pflags) |= (attr); \
+ else \
+ (pflags) &= ~(attr); \
+ VERIFY(0 == sa_update((zp)->z_sa_hdl, \
+ SA_ZPL_FLAGS((zp)->z_zfsvfs), &(pflags), sizeof (pflags), tx)); \
+}
+
+/*
+ * Define special zfs pflags
+ */
+#define ZFS_XATTR 0x1 /* is an extended attribute */
+#define ZFS_INHERIT_ACE 0x2 /* ace has inheritable ACEs */
+#define ZFS_ACL_TRIVIAL 0x4 /* files ACL is trivial */
+#define ZFS_ACL_OBJ_ACE 0x8 /* ACL has CMPLX Object ACE */
+#define ZFS_ACL_PROTECTED 0x10 /* ACL protected */
+#define ZFS_ACL_DEFAULTED 0x20 /* ACL should be defaulted */
+#define ZFS_ACL_AUTO_INHERIT 0x40 /* ACL should be inherited */
+#define ZFS_BONUS_SCANSTAMP 0x80 /* Scanstamp in bonus area */
+#define ZFS_NO_EXECS_DENIED 0x100 /* exec was given to everyone */
+
+#define SA_ZPL_ATIME(z) ((z)->z_attr_table[ZPL_ATIME])
+#define SA_ZPL_MTIME(z) ((z)->z_attr_table[ZPL_MTIME])
+#define SA_ZPL_CTIME(z) ((z)->z_attr_table[ZPL_CTIME])
+#define SA_ZPL_CRTIME(z) ((z)->z_attr_table[ZPL_CRTIME])
+#define SA_ZPL_GEN(z) ((z)->z_attr_table[ZPL_GEN])
+#define SA_ZPL_DACL_ACES(z) ((z)->z_attr_table[ZPL_DACL_ACES])
+#define SA_ZPL_XATTR(z) ((z)->z_attr_table[ZPL_XATTR])
+#define SA_ZPL_SYMLINK(z) ((z)->z_attr_table[ZPL_SYMLINK])
+#define SA_ZPL_RDEV(z) ((z)->z_attr_table[ZPL_RDEV])
+#define SA_ZPL_SCANSTAMP(z) ((z)->z_attr_table[ZPL_SCANSTAMP])
+#define SA_ZPL_UID(z) ((z)->z_attr_table[ZPL_UID])
+#define SA_ZPL_GID(z) ((z)->z_attr_table[ZPL_GID])
+#define SA_ZPL_PARENT(z) ((z)->z_attr_table[ZPL_PARENT])
+#define SA_ZPL_LINKS(z) ((z)->z_attr_table[ZPL_LINKS])
+#define SA_ZPL_MODE(z) ((z)->z_attr_table[ZPL_MODE])
+#define SA_ZPL_DACL_COUNT(z) ((z)->z_attr_table[ZPL_DACL_COUNT])
+#define SA_ZPL_FLAGS(z) ((z)->z_attr_table[ZPL_FLAGS])
+#define SA_ZPL_SIZE(z) ((z)->z_attr_table[ZPL_SIZE])
+#define SA_ZPL_ZNODE_ACL(z) ((z)->z_attr_table[ZPL_ZNODE_ACL])
+#define SA_ZPL_PAD(z) ((z)->z_attr_table[ZPL_PAD])
+
+/*
+ * Is ID ephemeral?
+ */
+#define IS_EPHEMERAL(x) ((x) > MAXUID)
+
+/*
+ * Should we use FUIDs?
+ */
+#define USE_FUIDS(version, os) ((version) >= ZPL_VERSION_FUID && \
+ spa_version(dmu_objset_spa(os)) >= SPA_VERSION_FUID)
+#define USE_SA(version, os) ((version) >= ZPL_VERSION_SA && \
+ spa_version(dmu_objset_spa(os)) >= SPA_VERSION_SA)
+
+#define MASTER_NODE_OBJ 1
+
+/*
+ * Special attributes for master node.
+ * "userquota@" and "groupquota@" are also valid (from
+ * zfs_userquota_prop_prefixes[]).
+ */
+#define ZFS_FSID "FSID"
+#define ZFS_UNLINKED_SET "DELETE_QUEUE"
+#define ZFS_ROOT_OBJ "ROOT"
+#define ZPL_VERSION_STR "VERSION"
+#define ZFS_FUID_TABLES "FUID"
+#define ZFS_SHARES_DIR "SHARES"
+#define ZFS_SA_ATTRS "SA_ATTRS"
+
+#define ZFS_MAX_BLOCKSIZE (SPA_MAXBLOCKSIZE)
+
+/* Path component length */
+/*
+ * The generic fs code uses MAXNAMELEN to represent
+ * what the largest component length is. Unfortunately,
+ * this length includes the terminating NUL. ZFS needs
+ * to tell the users via pathconf() and statvfs() what the
+ * true maximum length of a component is, excluding the NUL.
+ */
+#define ZFS_MAXNAMELEN (MAXNAMELEN - 1)
+
+/*
+ * Convert mode bits (zp_mode) to BSD-style DT_* values for storing in
+ * the directory entries.
+ */
+#define IFTODT(mode) (((mode) & S_IFMT) >> 12)
+
+/*
+ * The directory entry has the type (currently unused on Solaris) in the
+ * top 4 bits, and the object number in the low 48 bits. The "middle"
+ * 12 bits are unused.
+ */
+#define ZFS_DIRENT_TYPE(de) BF64_GET(de, 60, 4)
+#define ZFS_DIRENT_OBJ(de) BF64_GET(de, 0, 48)
+
+/*
+ * Directory entry locks control access to directory entries.
+ * They are used to protect creates, deletes, and renames.
+ * Each directory znode has a mutex and a list of locked names.
+ */
+#ifdef _KERNEL
+typedef struct zfs_dirlock {
+ char *dl_name; /* directory entry being locked */
+ uint32_t dl_sharecnt; /* 0 if exclusive, > 0 if shared */
+ uint8_t dl_namelock; /* 1 if z_name_lock is NOT held */
+ uint16_t dl_namesize; /* set if dl_name was allocated */
+ kcondvar_t dl_cv; /* wait for entry to be unlocked */
+ struct znode *dl_dzp; /* directory znode */
+ struct zfs_dirlock *dl_next; /* next in z_dirlocks list */
+} zfs_dirlock_t;
+
+typedef struct znode {
+ struct zfsvfs *z_zfsvfs;
+ vnode_t *z_vnode;
+ uint64_t z_id; /* object ID for this znode */
+ kmutex_t z_lock; /* znode modification lock */
+ krwlock_t z_parent_lock; /* parent lock for directories */
+ krwlock_t z_name_lock; /* "master" lock for dirent locks */
+ zfs_dirlock_t *z_dirlocks; /* directory entry lock list */
+ kmutex_t z_range_lock; /* protects changes to z_range_avl */
+ avl_tree_t z_range_avl; /* avl tree of file range locks */
+ uint8_t z_unlinked; /* file has been unlinked */
+ uint8_t z_atime_dirty; /* atime needs to be synced */
+ uint8_t z_zn_prefetch; /* Prefetch znodes? */
+ uint8_t z_moved; /* Has this znode been moved? */
+ uint_t z_blksz; /* block size in bytes */
+ uint_t z_seq; /* modification sequence number */
+ uint64_t z_mapcnt; /* number of pages mapped to file */
+ uint64_t z_gen; /* generation (cached) */
+ uint64_t z_size; /* file size (cached) */
+ uint64_t z_atime[2]; /* atime (cached) */
+ uint64_t z_links; /* file links (cached) */
+ uint64_t z_pflags; /* pflags (cached) */
+ uint64_t z_uid; /* uid fuid (cached) */
+ uint64_t z_gid; /* gid fuid (cached) */
+ mode_t z_mode; /* mode (cached) */
+ uint32_t z_sync_cnt; /* synchronous open count */
+ kmutex_t z_acl_lock; /* acl data lock */
+ zfs_acl_t *z_acl_cached; /* cached acl */
+ list_node_t z_link_node; /* all znodes in fs link */
+ sa_handle_t *z_sa_hdl; /* handle to sa data */
+ boolean_t z_is_sa; /* are we native sa? */
+} znode_t;
+
+
+/*
+ * Range locking rules
+ * --------------------
+ * 1. When truncating a file (zfs_create, zfs_setattr, zfs_space) the whole
+ * file range needs to be locked as RL_WRITER. Only then can the pages be
+ * freed etc and zp_size reset. zp_size must be set within range lock.
+ * 2. For writes and punching holes (zfs_write & zfs_space) just the range
+ * being written or freed needs to be locked as RL_WRITER.
+ * Multiple writes at the end of the file must coordinate zp_size updates
+ * to ensure data isn't lost. A compare and swap loop is currently used
+ * to ensure the file size is at least the offset last written.
+ * 3. For reads (zfs_read, zfs_get_data & zfs_putapage) just the range being
+ * read needs to be locked as RL_READER. A check against zp_size can then
+ * be made for reading beyond end of file.
+ */
+
+/*
+ * Convert between znode pointers and vnode pointers
+ */
+#define ZTOV(ZP) ((ZP)->z_vnode)
+#define VTOZ(VP) ((znode_t *)(VP)->v_data)
+
+/*
+ * ZFS_ENTER() is called on entry to each ZFS vnode and vfs operation.
+ * ZFS_EXIT() must be called before exiting the vop.
+ * ZFS_VERIFY_ZP() verifies the znode is valid.
+ */
+#define ZFS_ENTER(zfsvfs) \
+ { \
+ rrw_enter(&(zfsvfs)->z_teardown_lock, RW_READER, FTAG); \
+ if ((zfsvfs)->z_unmounted) { \
+ ZFS_EXIT(zfsvfs); \
+ return (EIO); \
+ } \
+ }
+
+#define ZFS_EXIT(zfsvfs) rrw_exit(&(zfsvfs)->z_teardown_lock, FTAG)
+
+#define ZFS_VERIFY_ZP(zp) \
+ if ((zp)->z_sa_hdl == NULL) { \
+ ZFS_EXIT((zp)->z_zfsvfs); \
+ return (EIO); \
+ }
+
+/*
+ * Macros for dealing with dmu_buf_hold
+ */
+#define ZFS_OBJ_HASH(obj_num) ((obj_num) & (ZFS_OBJ_MTX_SZ - 1))
+#define ZFS_OBJ_MUTEX(zfsvfs, obj_num) \
+ (&(zfsvfs)->z_hold_mtx[ZFS_OBJ_HASH(obj_num)])
+#define ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num) \
+ mutex_enter(ZFS_OBJ_MUTEX((zfsvfs), (obj_num)))
+#define ZFS_OBJ_HOLD_TRYENTER(zfsvfs, obj_num) \
+ mutex_tryenter(ZFS_OBJ_MUTEX((zfsvfs), (obj_num)))
+#define ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num) \
+ mutex_exit(ZFS_OBJ_MUTEX((zfsvfs), (obj_num)))
+
+/*
+ * Macros to encode/decode ZFS stored time values from/to struct timespec
+ */
+#define ZFS_TIME_ENCODE(tp, stmp) \
+{ \
+ (stmp)[0] = (uint64_t)(tp)->tv_sec; \
+ (stmp)[1] = (uint64_t)(tp)->tv_nsec; \
+}
+
+#define ZFS_TIME_DECODE(tp, stmp) \
+{ \
+ (tp)->tv_sec = (time_t)(stmp)[0]; \
+ (tp)->tv_nsec = (long)(stmp)[1]; \
+}
+
+/*
+ * Timestamp defines
+ */
+#define ACCESSED (AT_ATIME)
+#define STATE_CHANGED (AT_CTIME)
+#define CONTENT_MODIFIED (AT_MTIME | AT_CTIME)
+
+#define ZFS_ACCESSTIME_STAMP(zfsvfs, zp) \
+ if ((zfsvfs)->z_atime && !((zfsvfs)->z_vfs->vfs_flag & VFS_RDONLY)) \
+ zfs_tstamp_update_setup(zp, ACCESSED, NULL, NULL, B_FALSE);
+
+extern int zfs_init_fs(zfsvfs_t *, znode_t **);
+extern void zfs_set_dataprop(objset_t *);
+extern void zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *,
+ dmu_tx_t *tx);
+extern void zfs_tstamp_update_setup(znode_t *, uint_t, uint64_t [2],
+ uint64_t [2], boolean_t);
+extern void zfs_grow_blocksize(znode_t *, uint64_t, dmu_tx_t *);
+extern int zfs_freesp(znode_t *, uint64_t, uint64_t, int, boolean_t);
+extern void zfs_znode_init(void);
+extern void zfs_znode_fini(void);
+extern int zfs_zget(zfsvfs_t *, uint64_t, znode_t **);
+extern int zfs_rezget(znode_t *);
+extern void zfs_zinactive(znode_t *);
+extern void zfs_znode_delete(znode_t *, dmu_tx_t *);
+extern void zfs_znode_free(znode_t *);
+extern void zfs_remove_op_tables(void);
+extern int zfs_create_op_tables(void);
+extern int zfs_sync(vfs_t *vfsp, short flag, cred_t *cr);
+extern dev_t zfs_cmpldev(uint64_t);
+extern int zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value);
+extern int zfs_get_stats(objset_t *os, nvlist_t *nv);
+extern void zfs_znode_dmu_fini(znode_t *);
+
+extern void zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
+ znode_t *dzp, znode_t *zp, char *name, vsecattr_t *, zfs_fuid_info_t *,
+ vattr_t *vap);
+extern int zfs_log_create_txtype(zil_create_t, vsecattr_t *vsecp,
+ vattr_t *vap);
+extern void zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
+ znode_t *dzp, char *name, uint64_t foid);
+#define ZFS_NO_OBJECT 0 /* no object id */
+extern void zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
+ znode_t *dzp, znode_t *zp, char *name);
+extern void zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
+ znode_t *dzp, znode_t *zp, char *name, char *link);
+extern void zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
+ znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp);
+extern void zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+ znode_t *zp, offset_t off, ssize_t len, int ioflag);
+extern void zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+ znode_t *zp, uint64_t off, uint64_t len);
+extern void zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+ znode_t *zp, vattr_t *vap, uint_t mask_applied, zfs_fuid_info_t *fuidp);
+extern void zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp,
+ vsecattr_t *vsecp, zfs_fuid_info_t *fuidp);
+extern void zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx);
+extern void zfs_upgrade(zfsvfs_t *zfsvfs, dmu_tx_t *tx);
+extern int zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx);
+
+extern caddr_t zfs_map_page(page_t *, enum seg_rw);
+extern void zfs_unmap_page(page_t *, caddr_t);
+
+extern zil_get_data_t zfs_get_data;
+extern zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE];
+extern int zfsfstype;
+
+#endif /* _KERNEL */
+
+extern int zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_FS_ZFS_ZNODE_H */
diff --git a/uts/common/fs/zfs/sys/zil.h b/uts/common/fs/zfs/sys/zil.h
new file mode 100644
index 000000000000..a4c5575b2dba
--- /dev/null
+++ b/uts/common/fs/zfs/sys/zil.h
@@ -0,0 +1,428 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+/* Portions Copyright 2010 Robert Milkowski */
+
+#ifndef _SYS_ZIL_H
+#define _SYS_ZIL_H
+
+#include <sys/types.h>
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/dmu.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Intent log format:
+ *
+ * Each objset has its own intent log. The log header (zil_header_t)
+ * for objset N's intent log is kept in the Nth object of the SPA's
+ * intent_log objset. The log header points to a chain of log blocks,
+ * each of which contains log records (i.e., transactions) followed by
+ * a log block trailer (zil_trailer_t). The format of a log record
+ * depends on the record (or transaction) type, but all records begin
+ * with a common structure that defines the type, length, and txg.
+ */
+
+/*
+ * Intent log header - this on disk structure holds fields to manage
+ * the log. All fields are 64 bit to easily handle cross architectures.
+ */
+typedef struct zil_header {
+ uint64_t zh_claim_txg; /* txg in which log blocks were claimed */
+ uint64_t zh_replay_seq; /* highest replayed sequence number */
+ blkptr_t zh_log; /* log chain */
+ uint64_t zh_claim_blk_seq; /* highest claimed block sequence number */
+ uint64_t zh_flags; /* header flags */
+ uint64_t zh_claim_lr_seq; /* highest claimed lr sequence number */
+ uint64_t zh_pad[3];
+} zil_header_t;
+
+/*
+ * zh_flags bit settings
+ */
+#define ZIL_REPLAY_NEEDED 0x1 /* replay needed - internal only */
+#define ZIL_CLAIM_LR_SEQ_VALID 0x2 /* zh_claim_lr_seq field is valid */
+
+/*
+ * Log block chaining.
+ *
+ * Log blocks are chained together. Originally they were chained at the
+ * end of the block. For performance reasons the chain was moved to the
+ * beginning of the block which allows writes for only the data being used.
+ * The older position is supported for backwards compatibility.
+ *
+ * The zio_eck_t contains a zec_cksum which for the intent log is
+ * the sequence number of this log block. A seq of 0 is invalid.
+ * The zec_cksum is checked by the SPA against the sequence
+ * number passed in the blk_cksum field of the blkptr_t
+ */
+typedef struct zil_chain {
+ uint64_t zc_pad;
+ blkptr_t zc_next_blk; /* next block in chain */
+ uint64_t zc_nused; /* bytes in log block used */
+ zio_eck_t zc_eck; /* block trailer */
+} zil_chain_t;
+
+#define ZIL_MIN_BLKSZ 4096ULL
+#define ZIL_MAX_BLKSZ SPA_MAXBLOCKSIZE
+
+/*
+ * The words of a log block checksum.
+ */
+#define ZIL_ZC_GUID_0 0
+#define ZIL_ZC_GUID_1 1
+#define ZIL_ZC_OBJSET 2
+#define ZIL_ZC_SEQ 3
+
+typedef enum zil_create {
+ Z_FILE,
+ Z_DIR,
+ Z_XATTRDIR,
+} zil_create_t;
+
+/*
+ * size of xvattr log section.
+ * It is composed of lr_attr_t + xvattr bitmap + 2 64 bit timestamps
+ * for create time and a single 64 bit integer for all of the attributes,
+ * and 4 64 bit integers (32 bytes) for the scanstamp.
+ *
+ */
+
+#define ZIL_XVAT_SIZE(mapsize) \
+ (sizeof (lr_attr_t) + (sizeof (uint32_t) * ((mapsize) - 1)) + \
+ (sizeof (uint64_t) * 7))
+
+/*
+ * Size of ACL in log. The ACE data is padded out to properly align
+ * on 8 byte boundary.
+ */
+
+#define ZIL_ACE_LENGTH(x) (roundup(x, sizeof (uint64_t)))
+
+/*
+ * Intent log transaction types and record structures
+ */
+#define TX_CREATE 1 /* Create file */
+#define TX_MKDIR 2 /* Make directory */
+#define TX_MKXATTR 3 /* Make XATTR directory */
+#define TX_SYMLINK 4 /* Create symbolic link to a file */
+#define TX_REMOVE 5 /* Remove file */
+#define TX_RMDIR 6 /* Remove directory */
+#define TX_LINK 7 /* Create hard link to a file */
+#define TX_RENAME 8 /* Rename a file */
+#define TX_WRITE 9 /* File write */
+#define TX_TRUNCATE 10 /* Truncate a file */
+#define TX_SETATTR 11 /* Set file attributes */
+#define TX_ACL_V0 12 /* Set old formatted ACL */
+#define TX_ACL 13 /* Set ACL */
+#define TX_CREATE_ACL 14 /* create with ACL */
+#define TX_CREATE_ATTR 15 /* create + attrs */
+#define TX_CREATE_ACL_ATTR 16 /* create with ACL + attrs */
+#define TX_MKDIR_ACL 17 /* mkdir with ACL */
+#define TX_MKDIR_ATTR 18 /* mkdir with attr */
+#define TX_MKDIR_ACL_ATTR 19 /* mkdir with ACL + attrs */
+#define TX_WRITE2 20 /* dmu_sync EALREADY write */
+#define TX_MAX_TYPE 21 /* Max transaction type */
+
+/*
+ * The transactions for mkdir, symlink, remove, rmdir, link, and rename
+ * may have the following bit set, indicating the original request
+ * specified case-insensitive handling of names.
+ */
+#define TX_CI ((uint64_t)0x1 << 63) /* case-insensitive behavior requested */
+
+/*
+ * Transactions for write, truncate, setattr, acl_v0, and acl can be logged
+ * out of order. For convenience in the code, all such records must have
+ * lr_foid at the same offset.
+ */
+#define TX_OOO(txtype) \
+ ((txtype) == TX_WRITE || \
+ (txtype) == TX_TRUNCATE || \
+ (txtype) == TX_SETATTR || \
+ (txtype) == TX_ACL_V0 || \
+ (txtype) == TX_ACL || \
+ (txtype) == TX_WRITE2)
+
+/*
+ * Format of log records.
+ * The fields are carefully defined to allow them to be aligned
+ * and sized the same on sparc & intel architectures.
+ * Each log record has a common structure at the beginning.
+ *
+ * The log record on disk (lrc_seq) holds the sequence number of all log
+ * records which is used to ensure we don't replay the same record.
+ */
+typedef struct { /* common log record header */
+ uint64_t lrc_txtype; /* intent log transaction type */
+ uint64_t lrc_reclen; /* transaction record length */
+ uint64_t lrc_txg; /* dmu transaction group number */
+ uint64_t lrc_seq; /* see comment above */
+} lr_t;
+
+/*
+ * Common start of all out-of-order record types (TX_OOO() above).
+ */
+typedef struct {
+ lr_t lr_common; /* common portion of log record */
+ uint64_t lr_foid; /* object id */
+} lr_ooo_t;
+
+/*
+ * Handle optional extended vattr attributes.
+ *
+ * Whenever new attributes are added the version number
+ * will need to be updated as will code in
+ * zfs_log.c and zfs_replay.c
+ */
+typedef struct {
+ uint32_t lr_attr_masksize; /* number of elements in array */
+ uint32_t lr_attr_bitmap; /* First entry of array */
+ /* remainder of array and any additional fields */
+} lr_attr_t;
+
+/*
+ * log record for creates without optional ACL.
+ * This log record does support optional xvattr_t attributes.
+ */
+typedef struct {
+ lr_t lr_common; /* common portion of log record */
+ uint64_t lr_doid; /* object id of directory */
+ uint64_t lr_foid; /* object id of created file object */
+ uint64_t lr_mode; /* mode of object */
+ uint64_t lr_uid; /* uid of object */
+ uint64_t lr_gid; /* gid of object */
+ uint64_t lr_gen; /* generation (txg of creation) */
+ uint64_t lr_crtime[2]; /* creation time */
+ uint64_t lr_rdev; /* rdev of object to create */
+ /* name of object to create follows this */
+ /* for symlinks, link content follows name */
+ /* for creates with xvattr data, the name follows the xvattr info */
+} lr_create_t;
+
+/*
+ * FUID ACL record will be an array of ACEs from the original ACL.
+ * If this array includes ephemeral IDs, the record will also include
+ * an array of log-specific FUIDs to replace the ephemeral IDs.
+ * Only one copy of each unique domain will be present, so the log-specific
+ * FUIDs will use an index into a compressed domain table. On replay this
+ * information will be used to construct real FUIDs (and bypass idmap,
+ * since it may not be available).
+ */
+
+/*
+ * Log record for creates with optional ACL
+ * This log record is also used for recording any FUID
+ * information needed for replaying the create. If the
+ * file doesn't have any actual ACEs then the lr_aclcnt
+ * would be zero.
+ */
+typedef struct {
+ lr_create_t lr_create; /* common create portion */
+ uint64_t lr_aclcnt; /* number of ACEs in ACL */
+ uint64_t lr_domcnt; /* number of unique domains */
+ uint64_t lr_fuidcnt; /* number of real fuids */
+ uint64_t lr_acl_bytes; /* number of bytes in ACL */
+ uint64_t lr_acl_flags; /* ACL flags */
+ /* lr_acl_bytes number of variable sized ace's follows */
+ /* if create is also setting xvattr's, then acl data follows xvattr */
+ /* if ACE FUIDs are needed then they will follow the xvattr_t */
+ /* Following the FUIDs will be the domain table information. */
+ /* The FUIDs for the owner and group will be in the lr_create */
+ /* portion of the record. */
+ /* name follows ACL data */
+} lr_acl_create_t;
+
+typedef struct {
+ lr_t lr_common; /* common portion of log record */
+ uint64_t lr_doid; /* obj id of directory */
+ /* name of object to remove follows this */
+} lr_remove_t;
+
+typedef struct {
+ lr_t lr_common; /* common portion of log record */
+ uint64_t lr_doid; /* obj id of directory */
+ uint64_t lr_link_obj; /* obj id of link */
+ /* name of object to link follows this */
+} lr_link_t;
+
+typedef struct {
+ lr_t lr_common; /* common portion of log record */
+ uint64_t lr_sdoid; /* obj id of source directory */
+ uint64_t lr_tdoid; /* obj id of target directory */
+ /* 2 strings: names of source and destination follow this */
+} lr_rename_t;
+
+typedef struct {
+ lr_t lr_common; /* common portion of log record */
+ uint64_t lr_foid; /* file object to write */
+ uint64_t lr_offset; /* offset to write to */
+ uint64_t lr_length; /* user data length to write */
+ uint64_t lr_blkoff; /* no longer used */
+ blkptr_t lr_blkptr; /* spa block pointer for replay */
+ /* write data will follow for small writes */
+} lr_write_t;
+
+typedef struct {
+ lr_t lr_common; /* common portion of log record */
+ uint64_t lr_foid; /* object id of file to truncate */
+ uint64_t lr_offset; /* offset to truncate from */
+ uint64_t lr_length; /* length to truncate */
+} lr_truncate_t;
+
+typedef struct {
+ lr_t lr_common; /* common portion of log record */
+ uint64_t lr_foid; /* file object to change attributes */
+ uint64_t lr_mask; /* mask of attributes to set */
+ uint64_t lr_mode; /* mode to set */
+ uint64_t lr_uid; /* uid to set */
+ uint64_t lr_gid; /* gid to set */
+ uint64_t lr_size; /* size to set */
+ uint64_t lr_atime[2]; /* access time */
+ uint64_t lr_mtime[2]; /* modification time */
+ /* optional attribute lr_attr_t may be here */
+} lr_setattr_t;
+
+typedef struct {
+ lr_t lr_common; /* common portion of log record */
+ uint64_t lr_foid; /* obj id of file */
+ uint64_t lr_aclcnt; /* number of acl entries */
+ /* lr_aclcnt number of ace_t entries follow this */
+} lr_acl_v0_t;
+
+typedef struct {
+ lr_t lr_common; /* common portion of log record */
+ uint64_t lr_foid; /* obj id of file */
+ uint64_t lr_aclcnt; /* number of ACEs in ACL */
+ uint64_t lr_domcnt; /* number of unique domains */
+ uint64_t lr_fuidcnt; /* number of real fuids */
+ uint64_t lr_acl_bytes; /* number of bytes in ACL */
+ uint64_t lr_acl_flags; /* ACL flags */
+ /* lr_acl_bytes number of variable sized ace's follows */
+} lr_acl_t;
+
+/*
+ * ZIL structure definitions, interface function prototype and globals.
+ */
+
+/*
+ * Writes are handled in three different ways:
+ *
+ * WR_INDIRECT:
+ * In this mode, if we need to commit the write later, then the block
+ * is immediately written into the file system (using dmu_sync),
+ * and a pointer to the block is put into the log record.
+ * When the txg commits the block is linked in.
+ * This saves additionally writing the data into the log record.
+ * There are a few requirements for this to occur:
+ * - write is greater than zfs/zvol_immediate_write_sz
+ * - not using slogs (as slogs are assumed to always be faster
+ * than writing into the main pool)
+ * - the write occupies only one block
+ * WR_COPIED:
+ * If we know we'll immediately be committing the
+ * transaction (FSYNC or FDSYNC), then we allocate a larger
+ * log record here for the data and copy the data in.
+ * WR_NEED_COPY:
+ * Otherwise we don't allocate a buffer, and *if* we need to
+ * flush the write later then a buffer is allocated and
+ * we retrieve the data using the dmu.
+ */
+typedef enum {
+ WR_INDIRECT, /* indirect - a large write (dmu_sync() data */
+ /* and put blkptr in log, rather than actual data) */
+ WR_COPIED, /* immediate - data is copied into lr_write_t */
+ WR_NEED_COPY, /* immediate - data needs to be copied if pushed */
+ WR_NUM_STATES /* number of states */
+} itx_wr_state_t;
+
+typedef struct itx {
+ list_node_t itx_node; /* linkage on zl_itx_list */
+ void *itx_private; /* type-specific opaque data */
+ itx_wr_state_t itx_wr_state; /* write state */
+ uint8_t itx_sync; /* synchronous transaction */
+ uint64_t itx_sod; /* record size on disk */
+ uint64_t itx_oid; /* object id */
+ lr_t itx_lr; /* common part of log record */
+ /* followed by type-specific part of lr_xx_t and its immediate data */
+} itx_t;
+
+typedef int zil_parse_blk_func_t(zilog_t *zilog, blkptr_t *bp, void *arg,
+ uint64_t txg);
+typedef int zil_parse_lr_func_t(zilog_t *zilog, lr_t *lr, void *arg,
+ uint64_t txg);
+typedef int zil_replay_func_t();
+typedef int zil_get_data_t(void *arg, lr_write_t *lr, char *dbuf, zio_t *zio);
+
+extern int zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
+ zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg);
+
+extern void zil_init(void);
+extern void zil_fini(void);
+
+extern zilog_t *zil_alloc(objset_t *os, zil_header_t *zh_phys);
+extern void zil_free(zilog_t *zilog);
+
+extern zilog_t *zil_open(objset_t *os, zil_get_data_t *get_data);
+extern void zil_close(zilog_t *zilog);
+
+extern void zil_replay(objset_t *os, void *arg,
+ zil_replay_func_t *replay_func[TX_MAX_TYPE]);
+extern boolean_t zil_replaying(zilog_t *zilog, dmu_tx_t *tx);
+extern void zil_destroy(zilog_t *zilog, boolean_t keep_first);
+extern void zil_rollback_destroy(zilog_t *zilog, dmu_tx_t *tx);
+
+extern itx_t *zil_itx_create(uint64_t txtype, size_t lrsize);
+extern void zil_itx_destroy(itx_t *itx);
+extern void zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx);
+
+extern void zil_commit(zilog_t *zilog, uint64_t oid);
+
+extern int zil_vdev_offline(const char *osname, void *txarg);
+extern int zil_claim(const char *osname, void *txarg);
+extern int zil_check_log_chain(const char *osname, void *txarg);
+extern void zil_sync(zilog_t *zilog, dmu_tx_t *tx);
+extern void zil_clean(zilog_t *zilog, uint64_t synced_txg);
+
+extern int zil_suspend(zilog_t *zilog);
+extern void zil_resume(zilog_t *zilog);
+
+extern void zil_add_block(zilog_t *zilog, const blkptr_t *bp);
+extern int zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp);
+
+extern void zil_set_sync(zilog_t *zilog, uint64_t syncval);
+
+extern void zil_set_logbias(zilog_t *zilog, uint64_t slogval);
+
+extern int zil_replay_disable;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZIL_H */
diff --git a/uts/common/fs/zfs/sys/zil_impl.h b/uts/common/fs/zfs/sys/zil_impl.h
new file mode 100644
index 000000000000..1d4c0cc6c1de
--- /dev/null
+++ b/uts/common/fs/zfs/sys/zil_impl.h
@@ -0,0 +1,147 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+/* Portions Copyright 2010 Robert Milkowski */
+
+#ifndef _SYS_ZIL_IMPL_H
+#define _SYS_ZIL_IMPL_H
+
+#include <sys/zil.h>
+#include <sys/dmu_objset.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Log write buffer.
+ */
+typedef struct lwb {
+ zilog_t *lwb_zilog; /* back pointer to log struct */
+ blkptr_t lwb_blk; /* on disk address of this log blk */
+ int lwb_nused; /* # used bytes in buffer */
+ int lwb_sz; /* size of block and buffer */
+ char *lwb_buf; /* log write buffer */
+ zio_t *lwb_zio; /* zio for this buffer */
+ dmu_tx_t *lwb_tx; /* tx for log block allocation */
+ uint64_t lwb_max_txg; /* highest txg in this lwb */
+ list_node_t lwb_node; /* zilog->zl_lwb_list linkage */
+} lwb_t;
+
+/*
+ * Intent log transaction lists
+ */
+typedef struct itxs {
+ list_t i_sync_list; /* list of synchronous itxs */
+ avl_tree_t i_async_tree; /* tree of foids for async itxs */
+} itxs_t;
+
+typedef struct itxg {
+ kmutex_t itxg_lock; /* lock for this structure */
+ uint64_t itxg_txg; /* txg for this chain */
+ uint64_t itxg_sod; /* total size on disk for this txg */
+ itxs_t *itxg_itxs; /* sync and async itxs */
+} itxg_t;
+
+/* for async nodes we build up an AVL tree of lists of async itxs per file */
+typedef struct itx_async_node {
+ uint64_t ia_foid; /* file object id */
+ list_t ia_list; /* list of async itxs for this foid */
+ avl_node_t ia_node; /* AVL tree linkage */
+} itx_async_node_t;
+
+/*
+ * Vdev flushing: during a zil_commit(), we build up an AVL tree of the vdevs
+ * we've touched so we know which ones need a write cache flush at the end.
+ */
+typedef struct zil_vdev_node {
+ uint64_t zv_vdev; /* vdev to be flushed */
+ avl_node_t zv_node; /* AVL tree linkage */
+} zil_vdev_node_t;
+
+#define ZIL_PREV_BLKS 16
+
+/*
+ * Stable storage intent log management structure. One per dataset.
+ */
+struct zilog {
+ kmutex_t zl_lock; /* protects most zilog_t fields */
+ struct dsl_pool *zl_dmu_pool; /* DSL pool */
+ spa_t *zl_spa; /* handle for read/write log */
+ const zil_header_t *zl_header; /* log header buffer */
+ objset_t *zl_os; /* object set we're logging */
+ zil_get_data_t *zl_get_data; /* callback to get object content */
+ zio_t *zl_root_zio; /* log writer root zio */
+ uint64_t zl_lr_seq; /* on-disk log record sequence number */
+ uint64_t zl_commit_lr_seq; /* last committed on-disk lr seq */
+ uint64_t zl_destroy_txg; /* txg of last zil_destroy() */
+ uint64_t zl_replayed_seq[TXG_SIZE]; /* last replayed rec seq */
+ uint64_t zl_replaying_seq; /* current replay seq number */
+ uint32_t zl_suspend; /* log suspend count */
+ kcondvar_t zl_cv_writer; /* log writer thread completion */
+ kcondvar_t zl_cv_suspend; /* log suspend completion */
+ uint8_t zl_suspending; /* log is currently suspending */
+ uint8_t zl_keep_first; /* keep first log block in destroy */
+ uint8_t zl_replay; /* replaying records while set */
+ uint8_t zl_stop_sync; /* for debugging */
+ uint8_t zl_writer; /* boolean: write setup in progress */
+ uint8_t zl_logbias; /* latency or throughput */
+ uint8_t zl_sync; /* synchronous or asynchronous */
+ int zl_parse_error; /* last zil_parse() error */
+ uint64_t zl_parse_blk_seq; /* highest blk seq on last parse */
+ uint64_t zl_parse_lr_seq; /* highest lr seq on last parse */
+ uint64_t zl_parse_blk_count; /* number of blocks parsed */
+ uint64_t zl_parse_lr_count; /* number of log records parsed */
+ uint64_t zl_next_batch; /* next batch number */
+ uint64_t zl_com_batch; /* committed batch number */
+ kcondvar_t zl_cv_batch[2]; /* batch condition variables */
+ itxg_t zl_itxg[TXG_SIZE]; /* intent log txg chains */
+ list_t zl_itx_commit_list; /* itx list to be committed */
+ uint64_t zl_itx_list_sz; /* total size of records on list */
+ uint64_t zl_cur_used; /* current commit log size used */
+ list_t zl_lwb_list; /* in-flight log write list */
+ kmutex_t zl_vdev_lock; /* protects zl_vdev_tree */
+ avl_tree_t zl_vdev_tree; /* vdevs to flush in zil_commit() */
+ taskq_t *zl_clean_taskq; /* runs lwb and itx clean tasks */
+ avl_tree_t zl_bp_tree; /* track bps during log parse */
+ clock_t zl_replay_time; /* lbolt of when replay started */
+ uint64_t zl_replay_blks; /* number of log blocks replayed */
+ zil_header_t zl_old_header; /* debugging aid */
+ uint_t zl_prev_blks[ZIL_PREV_BLKS]; /* size - sector rounded */
+ uint_t zl_prev_rotor; /* rotor for zl_prev_blks[] */
+};
+
+typedef struct zil_bp_node {
+ dva_t zn_dva;
+ avl_node_t zn_node;
+} zil_bp_node_t;
+
+#define ZIL_MAX_LOG_DATA (SPA_MAXBLOCKSIZE - sizeof (zil_chain_t) - \
+ sizeof (lr_write_t))
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZIL_IMPL_H */
diff --git a/uts/common/fs/zfs/sys/zio.h b/uts/common/fs/zfs/sys/zio.h
new file mode 100644
index 000000000000..97d8ec74d2e9
--- /dev/null
+++ b/uts/common/fs/zfs/sys/zio.h
@@ -0,0 +1,559 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _ZIO_H
+#define _ZIO_H
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/avl.h>
+#include <sys/fs/zfs.h>
+#include <sys/zio_impl.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Embedded checksum
+ */
+#define ZEC_MAGIC 0x210da7ab10c7a11ULL /* presumably the value expected in zec_magic -- TODO confirm */
+
+typedef struct zio_eck { /* embedded checksum: a magic word followed by the checksum */
+ uint64_t zec_magic; /* for validation, endianness */
+ zio_cksum_t zec_cksum; /* 256-bit checksum */
+} zio_eck_t; /* used as the tail (zg_tail) of zio_gbh_phys_t */
+
+/*
+ * Gang block headers are self-checksumming and contain an array
+ * of block pointers.
+ */
+#define SPA_GANGBLOCKSIZE SPA_MINBLOCKSIZE /* gang header size equals the minimum block size */
+#define SPA_GBH_NBLKPTRS ((SPA_GANGBLOCKSIZE - \
+ sizeof (zio_eck_t)) / sizeof (blkptr_t)) /* whole blkptr_t's that fit once a zio_eck_t is set aside */
+#define SPA_GBH_FILLER ((SPA_GANGBLOCKSIZE - \
+ sizeof (zio_eck_t) - \
+ (SPA_GBH_NBLKPTRS * sizeof (blkptr_t))) /\
+ sizeof (uint64_t)) /* space left over after pointers and tail, counted in uint64_t words */
+
+typedef struct zio_gbh { /* gang block header: block pointers, filler words, then the checksum tail */
+ blkptr_t zg_blkptr[SPA_GBH_NBLKPTRS]; /* the array of block pointers */
+ uint64_t zg_filler[SPA_GBH_FILLER]; /* unused words between the pointers and the tail */
+ zio_eck_t zg_tail; /* embedded checksum (the header is self-checksumming) */
+} zio_gbh_phys_t;
+
+enum zio_checksum { /* checksum selector; enumerators number sequentially from 0 */
+ ZIO_CHECKSUM_INHERIT = 0,
+ ZIO_CHECKSUM_ON,
+ ZIO_CHECKSUM_OFF,
+ ZIO_CHECKSUM_LABEL,
+ ZIO_CHECKSUM_GANG_HEADER,
+ ZIO_CHECKSUM_ZILOG,
+ ZIO_CHECKSUM_FLETCHER_2,
+ ZIO_CHECKSUM_FLETCHER_4,
+ ZIO_CHECKSUM_SHA256,
+ ZIO_CHECKSUM_ZILOG2,
+ ZIO_CHECKSUM_FUNCTIONS /* enumerator count; dimensions zio_checksum_table[] */
+};
+
+#define ZIO_CHECKSUM_ON_VALUE ZIO_CHECKSUM_FLETCHER_4 /* presumably what ZIO_CHECKSUM_ON resolves to -- TODO confirm */
+#define ZIO_CHECKSUM_DEFAULT ZIO_CHECKSUM_ON
+
+#define ZIO_CHECKSUM_MASK 0xffULL /* low 8 bits */
+#define ZIO_CHECKSUM_VERIFY (1 << 8) /* bit 8, the first bit above ZIO_CHECKSUM_MASK */
+
+#define ZIO_DEDUPCHECKSUM ZIO_CHECKSUM_SHA256 /* checksum algorithm named for dedup */
+#define ZIO_DEDUPDITTO_MIN 100
+
+enum zio_compress { /* compression selector; enumerators number sequentially from 0 */
+ ZIO_COMPRESS_INHERIT = 0,
+ ZIO_COMPRESS_ON,
+ ZIO_COMPRESS_OFF,
+ ZIO_COMPRESS_LZJB,
+ ZIO_COMPRESS_EMPTY,
+ ZIO_COMPRESS_GZIP_1,
+ ZIO_COMPRESS_GZIP_2,
+ ZIO_COMPRESS_GZIP_3,
+ ZIO_COMPRESS_GZIP_4,
+ ZIO_COMPRESS_GZIP_5,
+ ZIO_COMPRESS_GZIP_6,
+ ZIO_COMPRESS_GZIP_7,
+ ZIO_COMPRESS_GZIP_8,
+ ZIO_COMPRESS_GZIP_9,
+ ZIO_COMPRESS_ZLE,
+ ZIO_COMPRESS_FUNCTIONS /* enumerator count; dimensions zio_compress_table[] */
+};
+
+#define ZIO_COMPRESS_ON_VALUE ZIO_COMPRESS_LZJB /* presumably what ZIO_COMPRESS_ON resolves to -- TODO confirm */
+#define ZIO_COMPRESS_DEFAULT ZIO_COMPRESS_OFF
+
+#define BOOTFS_COMPRESS_VALID(compress) /* true for LZJB, for OFF, or for ON while ON_VALUE is LZJB */ \
+ ((compress) == ZIO_COMPRESS_LZJB || \
+ ((compress) == ZIO_COMPRESS_ON && \
+ ZIO_COMPRESS_ON_VALUE == ZIO_COMPRESS_LZJB) || \
+ (compress) == ZIO_COMPRESS_OFF) /* note: compress may be evaluated up to three times */
+
+#define ZIO_FAILURE_MODE_WAIT 0
+#define ZIO_FAILURE_MODE_CONTINUE 1
+#define ZIO_FAILURE_MODE_PANIC 2
+
+#define ZIO_PRIORITY_NOW (zio_priority_table[0]) /* each ZIO_PRIORITY_* reads one slot of zio_priority_table[] */
+#define ZIO_PRIORITY_SYNC_READ (zio_priority_table[1])
+#define ZIO_PRIORITY_SYNC_WRITE (zio_priority_table[2])
+#define ZIO_PRIORITY_LOG_WRITE (zio_priority_table[3])
+#define ZIO_PRIORITY_CACHE_FILL (zio_priority_table[4])
+#define ZIO_PRIORITY_AGG (zio_priority_table[5])
+#define ZIO_PRIORITY_FREE (zio_priority_table[6])
+#define ZIO_PRIORITY_ASYNC_WRITE (zio_priority_table[7])
+#define ZIO_PRIORITY_ASYNC_READ (zio_priority_table[8])
+#define ZIO_PRIORITY_RESILVER (zio_priority_table[9])
+#define ZIO_PRIORITY_SCRUB (zio_priority_table[10])
+#define ZIO_PRIORITY_DDT_PREFETCH (zio_priority_table[11])
+#define ZIO_PRIORITY_TABLE_SIZE 12 /* table length: one more than the highest index used above */
+
+#define ZIO_PIPELINE_CONTINUE 0x100 /* presumably a zio_pipe_stage_t return value -- TODO confirm */
+#define ZIO_PIPELINE_STOP 0x101
+
+enum zio_flag { /* one bit per flag; groups are ordered so each *_INHERIT mask is "every lower bit" */
+ /*
+ * Flags inherited by gang, ddt, and vdev children,
+ * and that must be equal for two zios to aggregate
+ */
+ ZIO_FLAG_DONT_AGGREGATE = 1 << 0,
+ ZIO_FLAG_IO_REPAIR = 1 << 1,
+ ZIO_FLAG_SELF_HEAL = 1 << 2,
+ ZIO_FLAG_RESILVER = 1 << 3,
+ ZIO_FLAG_SCRUB = 1 << 4,
+ ZIO_FLAG_SCAN_THREAD = 1 << 5,
+
+#define ZIO_FLAG_AGG_INHERIT (ZIO_FLAG_CANFAIL - 1) /* bits 0-5 */
+
+ /*
+ * Flags inherited by ddt, gang, and vdev children.
+ */
+ ZIO_FLAG_CANFAIL = 1 << 6, /* must be first for INHERIT */
+ ZIO_FLAG_SPECULATIVE = 1 << 7,
+ ZIO_FLAG_CONFIG_WRITER = 1 << 8,
+ ZIO_FLAG_DONT_RETRY = 1 << 9,
+ ZIO_FLAG_DONT_CACHE = 1 << 10,
+ ZIO_FLAG_NODATA = 1 << 11,
+ ZIO_FLAG_INDUCE_DAMAGE = 1 << 12,
+
+#define ZIO_FLAG_DDT_INHERIT (ZIO_FLAG_IO_RETRY - 1) /* bits 0-12 */
+#define ZIO_FLAG_GANG_INHERIT (ZIO_FLAG_IO_RETRY - 1) /* bits 0-12, same mask as DDT */
+
+ /*
+ * Flags inherited by vdev children.
+ */
+ ZIO_FLAG_IO_RETRY = 1 << 13, /* must be first for INHERIT */
+ ZIO_FLAG_PROBE = 1 << 14,
+ ZIO_FLAG_TRYHARD = 1 << 15,
+ ZIO_FLAG_OPTIONAL = 1 << 16,
+
+#define ZIO_FLAG_VDEV_INHERIT (ZIO_FLAG_DONT_QUEUE - 1) /* bits 0-16 */
+
+ /*
+ * Flags not inherited by any children.
+ */
+ ZIO_FLAG_DONT_QUEUE = 1 << 17, /* must be first for INHERIT */
+ ZIO_FLAG_DONT_PROPAGATE = 1 << 18,
+ ZIO_FLAG_IO_BYPASS = 1 << 19,
+ ZIO_FLAG_IO_REWRITE = 1 << 20,
+ ZIO_FLAG_RAW = 1 << 21,
+ ZIO_FLAG_GANG_CHILD = 1 << 22,
+ ZIO_FLAG_DDT_CHILD = 1 << 23,
+ ZIO_FLAG_GODFATHER = 1 << 24
+};
+
+#define ZIO_FLAG_MUSTSUCCEED 0 /* zero: sets no flag bit */
+
+#define ZIO_DDT_CHILD_FLAGS(zio) \
+ (((zio)->io_flags & ZIO_FLAG_DDT_INHERIT) | \
+ ZIO_FLAG_DDT_CHILD | ZIO_FLAG_CANFAIL) /* parent's inheritable bits plus DDT_CHILD and CANFAIL */
+
+#define ZIO_GANG_CHILD_FLAGS(zio) \
+ (((zio)->io_flags & ZIO_FLAG_GANG_INHERIT) | \
+ ZIO_FLAG_GANG_CHILD | ZIO_FLAG_CANFAIL) /* parent's inheritable bits plus GANG_CHILD and CANFAIL */
+
+#define ZIO_VDEV_CHILD_FLAGS(zio) \
+ (((zio)->io_flags & ZIO_FLAG_VDEV_INHERIT) | \
+ ZIO_FLAG_CANFAIL) /* parent's inheritable bits plus CANFAIL; no child marker bit */
+
+enum zio_child { /* kinds of child zio; used as an array index in struct zio */
+ ZIO_CHILD_VDEV = 0,
+ ZIO_CHILD_GANG,
+ ZIO_CHILD_DDT,
+ ZIO_CHILD_LOGICAL,
+ ZIO_CHILD_TYPES /* count; dimensions io_child_error[] and io_children[][] */
+};
+
+enum zio_wait_type { /* the two wait points; used as an array index in struct zio */
+ ZIO_WAIT_READY = 0,
+ ZIO_WAIT_DONE,
+ ZIO_WAIT_TYPES /* count; dimensions io_state[] and io_children[][] */
+};
+
+/*
+ * We'll take the unused errnos, 'EBADE' and 'EBADR' (from the Convergent
+ * graveyard) to indicate checksum errors and fragmentation.
+ */
+#define ECKSUM EBADE /* checksum error */
+#define EFRAGS EBADR /* fragmentation */
+
+typedef void zio_done_func_t(zio_t *zio); /* callback type used for io_ready, io_done and vsd_free */
+
+extern uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE]; /* read through the ZIO_PRIORITY_* macros */
+extern char *zio_type_name[ZIO_TYPES]; /* ZIO_TYPES is not defined in this header */
+
+/*
+ * A bookmark is a four-tuple <objset, object, level, blkid> that uniquely
+ * identifies any block in the pool. By convention, the meta-objset (MOS)
+ * is objset 0, and the meta-dnode is object 0. This covers all blocks
+ * except root blocks and ZIL blocks, which are defined as follows:
+ *
+ * Root blocks (objset_phys_t) are object 0, level -1: <objset, 0, -1, 0>.
+ * ZIL blocks are bookmarked <objset, 0, -2, blkid == ZIL sequence number>.
+ * dmu_sync()ed ZIL data blocks are bookmarked <objset, object, -2, blkid>.
+ *
+ * Note: this structure is called a bookmark because its original purpose
+ * was to remember where to resume a pool-wide traverse.
+ *
+ * Note: this structure is passed between userland and the kernel.
+ * Therefore it must not change size or alignment between 32/64 bit
+ * compilation options.
+ */
+typedef struct zbookmark { /* four 64-bit fields, so size and alignment match in 32- and 64-bit builds */
+ uint64_t zb_objset; /* objset; 0 is the MOS by convention */
+ uint64_t zb_object; /* object; 0 is the meta-dnode by convention */
+ int64_t zb_level; /* signed: root blocks use -1 and ZIL blocks use -2 */
+ uint64_t zb_blkid; /* block id; for ZIL blocks, the ZIL sequence number */
+} zbookmark_t;
+
+#define SET_BOOKMARK(zb, objset, object, level, blkid) /* fill in all four fields of *zb */ \
+do { /* do/while (0): expands to one statement, safe in an unbraced if/else */ \
+ (zb)->zb_objset = (objset); \
+ (zb)->zb_object = (object); \
+ (zb)->zb_level = (level); \
+ (zb)->zb_blkid = (blkid); /* note: zb itself is evaluated four times */ \
+} while (0)
+
+#define ZB_DESTROYED_OBJSET (-1ULL) /* all-ones 64-bit value */
+
+#define ZB_ROOT_OBJECT (0ULL) /* root blocks are bookmarked <objset, 0, -1, 0> */
+#define ZB_ROOT_LEVEL (-1LL)
+#define ZB_ROOT_BLKID (0ULL)
+
+#define ZB_ZIL_OBJECT (0ULL) /* ZIL blocks are bookmarked <objset, 0, -2, sequence number> */
+#define ZB_ZIL_LEVEL (-2LL)
+
+typedef struct zio_prop { /* I/O properties: passed to zio_write() as zp, kept in struct zio as io_prop */
+ enum zio_checksum zp_checksum; /* checksum selector */
+ enum zio_compress zp_compress; /* compression selector */
+ dmu_object_type_t zp_type;
+ uint8_t zp_level; /* NOTE(review): unsigned, whereas zbookmark_t.zb_level is signed */
+ uint8_t zp_copies;
+ uint8_t zp_dedup; /* presumably a boolean -- TODO confirm */
+ uint8_t zp_dedup_verify; /* presumably a boolean -- TODO confirm */
+} zio_prop_t;
+
+typedef struct zio_cksum_report zio_cksum_report_t; /* typedef ahead of the struct so the callback types below can use it */
+
+typedef void zio_cksum_finish_f(zio_cksum_report_t *rep,
+ const void *good_data); /* type of the zcr_finish callback */
+typedef void zio_cksum_free_f(void *cbdata, size_t size); /* type of the zcr_free callback */
+
+struct zio_bad_cksum; /* defined in zio_checksum.h */
+
+struct zio_cksum_report {
+ struct zio_cksum_report *zcr_next; /* pointer to another report (list link) */
+ nvlist_t *zcr_ereport;
+ nvlist_t *zcr_detector;
+ void *zcr_cbdata; /* presumably the cbdata argument of zcr_free() -- TODO confirm */
+ size_t zcr_cbinfo; /* passed to zcr_free() */
+ uint64_t zcr_align;
+ uint64_t zcr_length;
+ zio_cksum_finish_f *zcr_finish;
+ zio_cksum_free_f *zcr_free;
+
+ /* internal use only */
+ struct zio_bad_cksum *zcr_ckinfo; /* information from failure */
+};
+
+typedef void zio_vsd_cksum_report_f(zio_t *zio, zio_cksum_report_t *zcr,
+ void *arg); /* type of the vsd_cksum_report callback */
+
+zio_vsd_cksum_report_f zio_vsd_default_cksum_report; /* declares a function through the typedef above */
+
+typedef struct zio_vsd_ops { /* callback pair; struct zio points at one through io_vsd_ops */
+ zio_done_func_t *vsd_free;
+ zio_vsd_cksum_report_f *vsd_cksum_report;
+} zio_vsd_ops_t;
+
+typedef struct zio_gang_node { /* tree node: one gang header plus one child slot per block pointer */
+ zio_gbh_phys_t *gn_gbh; /* the gang block header for this node */
+ struct zio_gang_node *gn_child[SPA_GBH_NBLKPTRS]; /* same count as gn_gbh->zg_blkptr[] */
+} zio_gang_node_t;
+
+typedef zio_t *zio_gang_issue_func_t(zio_t *zio, blkptr_t *bp,
+ zio_gang_node_t *gn, void *data); /* function type; returns a zio_t pointer */
+
+typedef void zio_transform_func_t(zio_t *zio, void *data, uint64_t size); /* type of zt_transform */
+
+typedef struct zio_transform { /* singly linked entry; struct zio holds the head as io_transform_stack */
+ void *zt_orig_data;
+ uint64_t zt_orig_size;
+ uint64_t zt_bufsize;
+ zio_transform_func_t *zt_transform;
+ struct zio_transform *zt_next; /* next entry in the chain */
+} zio_transform_t;
+
+typedef int zio_pipe_stage_t(zio_t *zio); /* function type taking a zio and returning int */
+
+/*
+ * The io_reexecute flags are distinct from io_flags because the child must
+ * be able to propagate them to the parent. The normal io_flags are local
+ * to the zio, not protected by any lock, and not modifiable by children;
+ * the reexecute flags are protected by io_lock, modifiable by children,
+ * and always propagated -- even when ZIO_FLAG_DONT_PROPAGATE is set.
+ */
+#define ZIO_REEXECUTE_NOW 0x01 /* bit value for io_reexecute */
+#define ZIO_REEXECUTE_SUSPEND 0x02 /* bit value for io_reexecute */
+
+typedef struct zio_link { /* ties one parent zio to one child zio; carries two list linkages */
+ zio_t *zl_parent;
+ zio_t *zl_child;
+ list_node_t zl_parent_node; /* list linkage; NOTE(review): which zio list uses it is not visible here */
+ list_node_t zl_child_node; /* list linkage; NOTE(review): which zio list uses it is not visible here */
+} zio_link_t;
+
+struct zio { /* state of one I/O; the zio_*() constructors declared later in this header return pointers to it */
+ /* Core information about this I/O */
+ zbookmark_t io_bookmark;
+ zio_prop_t io_prop;
+ zio_type_t io_type;
+ enum zio_child io_child_type;
+ int io_cmd;
+ uint8_t io_priority; /* uint8_t, like the zio_priority_table[] entries */
+ uint8_t io_reexecute; /* ZIO_REEXECUTE_* bits; protected by io_lock */
+ uint8_t io_state[ZIO_WAIT_TYPES]; /* one entry per zio_wait_type */
+ uint64_t io_txg;
+ spa_t *io_spa;
+ blkptr_t *io_bp;
+ blkptr_t *io_bp_override;
+ blkptr_t io_bp_copy;
+ list_t io_parent_list;
+ list_t io_child_list;
+ zio_link_t *io_walk_link;
+ zio_t *io_logical;
+ zio_transform_t *io_transform_stack; /* head of a zt_next-linked chain */
+
+ /* Callback info */
+ zio_done_func_t *io_ready;
+ zio_done_func_t *io_done;
+ void *io_private;
+ int64_t io_prev_space_delta; /* DMU private */
+ blkptr_t io_bp_orig;
+
+ /* Data represented by this I/O */
+ void *io_data;
+ void *io_orig_data;
+ uint64_t io_size;
+ uint64_t io_orig_size;
+
+ /* Stuff for the vdev stack */
+ vdev_t *io_vd;
+ void *io_vsd;
+ const zio_vsd_ops_t *io_vsd_ops;
+
+ uint64_t io_offset;
+ uint64_t io_deadline;
+ avl_node_t io_offset_node;
+ avl_node_t io_deadline_node;
+ avl_tree_t *io_vdev_tree;
+
+ /* Internal pipeline state */
+ enum zio_flag io_flags; /* ZIO_FLAG_* bits; local to the zio and not protected by any lock */
+ enum zio_stage io_stage;
+ enum zio_stage io_pipeline; /* presumably a mask of ZIO_STAGE_* bits such as ZIO_READ_PIPELINE -- TODO confirm */
+ enum zio_flag io_orig_flags;
+ enum zio_stage io_orig_stage;
+ enum zio_stage io_orig_pipeline;
+ int io_error;
+ int io_child_error[ZIO_CHILD_TYPES]; /* one entry per zio_child type */
+ uint64_t io_children[ZIO_CHILD_TYPES][ZIO_WAIT_TYPES]; /* indexed [child type][wait type] */
+ uint64_t io_child_count;
+ uint64_t io_parent_count;
+ uint64_t *io_stall;
+ zio_t *io_gang_leader;
+ zio_gang_node_t *io_gang_tree;
+ void *io_executor;
+ void *io_waiter;
+ kmutex_t io_lock; /* protects io_reexecute (see the ZIO_REEXECUTE_* comment) */
+ kcondvar_t io_cv;
+
+ /* FMA state */
+ zio_cksum_report_t *io_cksum_report;
+ uint64_t io_ena;
+};
+
+extern zio_t *zio_null(zio_t *pio, spa_t *spa, vdev_t *vd,
+ zio_done_func_t *done, void *private, enum zio_flag flags); /* NOTE(review): 'private' is a C++ keyword, yet this header has extern "C" guards */
+
+extern zio_t *zio_root(spa_t *spa,
+ zio_done_func_t *done, void *private, enum zio_flag flags); /* same parameters as zio_null() minus pio and vd */
+
+extern zio_t *zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, void *data,
+ uint64_t size, zio_done_func_t *done, void *private,
+ int priority, enum zio_flag flags, const zbookmark_t *zb);
+
+extern zio_t *zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
+ void *data, uint64_t size, const zio_prop_t *zp,
+ zio_done_func_t *ready, zio_done_func_t *done, void *private,
+ int priority, enum zio_flag flags, const zbookmark_t *zb);
+
+extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
+ void *data, uint64_t size, zio_done_func_t *done, void *private,
+ int priority, enum zio_flag flags, zbookmark_t *zb); /* NOTE(review): zb is non-const here, unlike zio_read() and zio_write() */
+
+extern void zio_write_override(zio_t *zio, blkptr_t *bp, int copies);
+
+extern void zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp);
+
+extern zio_t *zio_claim(zio_t *pio, spa_t *spa, uint64_t txg,
+ const blkptr_t *bp,
+ zio_done_func_t *done, void *private, enum zio_flag flags);
+
+extern zio_t *zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
+ zio_done_func_t *done, void *private, int priority, enum zio_flag flags);
+
+extern zio_t *zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset,
+ uint64_t size, void *data, int checksum,
+ zio_done_func_t *done, void *private, int priority, enum zio_flag flags,
+ boolean_t labels);
+
+extern zio_t *zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset,
+ uint64_t size, void *data, int checksum,
+ zio_done_func_t *done, void *private, int priority, enum zio_flag flags,
+ boolean_t labels);
+
+extern zio_t *zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg,
+ const blkptr_t *bp, enum zio_flag flags);
+
+extern int zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp,
+ blkptr_t *old_bp, uint64_t size, boolean_t use_slog);
+extern void zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp);
+extern void zio_flush(zio_t *zio, vdev_t *vd);
+extern void zio_shrink(zio_t *zio, uint64_t size);
+
+extern int zio_wait(zio_t *zio);
+extern void zio_nowait(zio_t *zio);
+extern void zio_execute(zio_t *zio);
+extern void zio_interrupt(zio_t *zio);
+
+extern zio_t *zio_walk_parents(zio_t *cio);
+extern zio_t *zio_walk_children(zio_t *pio);
+extern zio_t *zio_unique_parent(zio_t *cio);
+extern void zio_add_child(zio_t *pio, zio_t *cio);
+
+extern void *zio_buf_alloc(size_t size);
+extern void zio_buf_free(void *buf, size_t size);
+extern void *zio_data_buf_alloc(size_t size);
+extern void zio_data_buf_free(void *buf, size_t size);
+
+extern void zio_resubmit_stage_async(void *);
+
+extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd,
+ uint64_t offset, void *data, uint64_t size, int type, int priority,
+ enum zio_flag flags, zio_done_func_t *done, void *private);
+
+extern zio_t *zio_vdev_delegated_io(vdev_t *vd, uint64_t offset,
+ void *data, uint64_t size, int type, int priority,
+ enum zio_flag flags, zio_done_func_t *done, void *private);
+
+extern void zio_vdev_io_bypass(zio_t *zio);
+extern void zio_vdev_io_reissue(zio_t *zio);
+extern void zio_vdev_io_redone(zio_t *zio);
+
+extern void zio_checksum_verified(zio_t *zio);
+extern int zio_worst_error(int e1, int e2);
+
+extern enum zio_checksum zio_checksum_select(enum zio_checksum child,
+ enum zio_checksum parent);
+extern enum zio_checksum zio_checksum_dedup_select(spa_t *spa,
+ enum zio_checksum child, enum zio_checksum parent);
+extern enum zio_compress zio_compress_select(enum zio_compress child,
+ enum zio_compress parent);
+
+extern void zio_suspend(spa_t *spa, zio_t *zio);
+extern int zio_resume(spa_t *spa);
+extern void zio_resume_wait(spa_t *spa);
+
+/*
+ * Initial setup and teardown.
+ */
+extern void zio_init(void);
+extern void zio_fini(void);
+
+/*
+ * Fault injection
+ */
+struct zinject_record;
+extern uint32_t zio_injection_enabled;
+extern int zio_inject_fault(char *name, int flags, int *id,
+ struct zinject_record *record);
+extern int zio_inject_list_next(int *id, char *name, size_t buflen,
+ struct zinject_record *record);
+extern int zio_clear_fault(int id);
+extern void zio_handle_panic_injection(spa_t *spa, char *tag, uint64_t type);
+extern int zio_handle_fault_injection(zio_t *zio, int error);
+extern int zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error);
+extern int zio_handle_label_injection(zio_t *zio, int error);
+extern void zio_handle_ignored_writes(zio_t *zio);
+
+/*
+ * Checksum ereport functions
+ */
+extern void zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, struct zio *zio,
+ uint64_t offset, uint64_t length, void *arg, struct zio_bad_cksum *info);
+extern void zfs_ereport_finish_checksum(zio_cksum_report_t *report,
+ const void *good_data, const void *bad_data, boolean_t drop_if_identical);
+
+extern void zfs_ereport_send_interim_checksum(zio_cksum_report_t *report);
+extern void zfs_ereport_free_checksum(zio_cksum_report_t *report);
+
+/* If we have the good data in hand, this function can be used */
+extern void zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd,
+ struct zio *zio, uint64_t offset, uint64_t length,
+ const void *good_data, const void *bad_data, struct zio_bad_cksum *info);
+
+/* Called from spa_sync(), but primarily an injection handler */
+extern void spa_handle_ignored_writes(spa_t *spa);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ZIO_H */
diff --git a/uts/common/fs/zfs/sys/zio_checksum.h b/uts/common/fs/zfs/sys/zio_checksum.h
new file mode 100644
index 000000000000..0956c04ab1b4
--- /dev/null
+++ b/uts/common/fs/zfs/sys/zio_checksum.h
@@ -0,0 +1,75 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _SYS_ZIO_CHECKSUM_H
+#define _SYS_ZIO_CHECKSUM_H
+
+#include <sys/zio.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Signature for checksum functions.
+ */
+typedef void zio_checksum_t(const void *data, uint64_t size, zio_cksum_t *zcp); /* returns void; presumably stores the result through zcp -- TODO confirm */
+
+/*
+ * Information about each checksum function.
+ */
+typedef struct zio_checksum_info { /* element type of zio_checksum_table[] */
+ zio_checksum_t *ci_func[2]; /* checksum function for each byteorder */
+ int ci_correctable; /* number of correctable bits */
+ int ci_eck; /* uses zio embedded checksum? */
+ int ci_dedup; /* strong enough for dedup? */
+ char *ci_name; /* descriptive name */
+} zio_checksum_info_t;
+
+typedef struct zio_bad_cksum { /* completes the type that zio.h only forward-declares */
+ zio_cksum_t zbc_expected; /* valid only when zbc_has_cksum is set */
+ zio_cksum_t zbc_actual; /* valid only when zbc_has_cksum is set */
+ const char *zbc_checksum_name;
+ uint8_t zbc_byteswapped;
+ uint8_t zbc_injected;
+ uint8_t zbc_has_cksum; /* expected/actual valid */
+} zio_bad_cksum_t;
+
+extern zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS]; /* one entry per enum zio_checksum value below ZIO_CHECKSUM_FUNCTIONS */
+
+/*
+ * Checksum routines.
+ */
+extern zio_checksum_t zio_checksum_SHA256;
+
+extern void zio_checksum_compute(zio_t *zio, enum zio_checksum checksum,
+ void *data, uint64_t size);
+extern int zio_checksum_error(zio_t *zio, zio_bad_cksum_t *out);
+extern enum zio_checksum spa_dedup_checksum(spa_t *spa);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZIO_CHECKSUM_H */
diff --git a/uts/common/fs/zfs/sys/zio_compress.h b/uts/common/fs/zfs/sys/zio_compress.h
new file mode 100644
index 000000000000..30bed1a676e3
--- /dev/null
+++ b/uts/common/fs/zfs/sys/zio_compress.h
@@ -0,0 +1,84 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_ZIO_COMPRESS_H
+#define _SYS_ZIO_COMPRESS_H
+
+#include <sys/zio.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Common signature for all zio compress/decompress functions.
+ */
+typedef size_t zio_compress_func_t(void *src, void *dst,
+ size_t s_len, size_t d_len, int level); /* level: named as in the *_compress() prototypes in this header */
+typedef int zio_decompress_func_t(void *src, void *dst,
+ size_t s_len, size_t d_len, int level); /* same parameters; returns int instead of size_t */
+
+/*
+ * Information about each compression function.
+ */
+typedef struct zio_compress_info { /* element type of zio_compress_table[] */
+ zio_compress_func_t *ci_compress; /* compression function */
+ zio_decompress_func_t *ci_decompress; /* decompression function */
+ int ci_level; /* level parameter */
+ char *ci_name; /* algorithm name */
+} zio_compress_info_t;
+
+extern zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS]; /* one entry per enum zio_compress value below ZIO_COMPRESS_FUNCTIONS */
+
+/*
+ * Compression routines.
+ */
+extern size_t lzjb_compress(void *src, void *dst, size_t s_len, size_t d_len,
+ int level);
+extern int lzjb_decompress(void *src, void *dst, size_t s_len, size_t d_len,
+ int level);
+extern size_t gzip_compress(void *src, void *dst, size_t s_len, size_t d_len,
+ int level);
+extern int gzip_decompress(void *src, void *dst, size_t s_len, size_t d_len,
+ int level);
+extern size_t zle_compress(void *src, void *dst, size_t s_len, size_t d_len,
+ int level);
+extern int zle_decompress(void *src, void *dst, size_t s_len, size_t d_len,
+ int level);
+
+/*
+ * Compress and decompress data if necessary.
+ */
+extern size_t zio_compress_data(enum zio_compress c, void *src, void *dst,
+ size_t s_len);
+extern int zio_decompress_data(enum zio_compress c, void *src, void *dst,
+ size_t s_len, size_t d_len);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZIO_COMPRESS_H */
diff --git a/uts/common/fs/zfs/sys/zio_impl.h b/uts/common/fs/zfs/sys/zio_impl.h
new file mode 100644
index 000000000000..d90bd8bd5921
--- /dev/null
+++ b/uts/common/fs/zfs/sys/zio_impl.h
@@ -0,0 +1,175 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _ZIO_IMPL_H
+#define _ZIO_IMPL_H
+
+#include <sys/zfs_context.h>
+#include <sys/zio.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * zio pipeline stage definitions
+ */
+enum zio_stage { /* one bit per stage; letter columns presumably mean R=read W=write F=free C=claim I=ioctl -- TODO confirm */
+ ZIO_STAGE_OPEN = 1 << 0, /* RWFCI */
+
+ ZIO_STAGE_READ_BP_INIT = 1 << 1, /* R---- */
+ ZIO_STAGE_FREE_BP_INIT = 1 << 2, /* --F-- */
+ ZIO_STAGE_ISSUE_ASYNC = 1 << 3, /* RWF-- */
+ ZIO_STAGE_WRITE_BP_INIT = 1 << 4, /* -W--- */
+
+ ZIO_STAGE_CHECKSUM_GENERATE = 1 << 5, /* -W--- */
+
+ ZIO_STAGE_DDT_READ_START = 1 << 6, /* R---- */
+ ZIO_STAGE_DDT_READ_DONE = 1 << 7, /* R---- */
+ ZIO_STAGE_DDT_WRITE = 1 << 8, /* -W--- */
+ ZIO_STAGE_DDT_FREE = 1 << 9, /* --F-- */
+
+ ZIO_STAGE_GANG_ASSEMBLE = 1 << 10, /* RWFC- */
+ ZIO_STAGE_GANG_ISSUE = 1 << 11, /* RWFC- */
+
+ ZIO_STAGE_DVA_ALLOCATE = 1 << 12, /* -W--- */
+ ZIO_STAGE_DVA_FREE = 1 << 13, /* --F-- */
+ ZIO_STAGE_DVA_CLAIM = 1 << 14, /* ---C- */
+
+ ZIO_STAGE_READY = 1 << 15, /* RWFCI */
+
+ ZIO_STAGE_VDEV_IO_START = 1 << 16, /* RW--I */
+ ZIO_STAGE_VDEV_IO_DONE = 1 << 17, /* RW--I */
+ ZIO_STAGE_VDEV_IO_ASSESS = 1 << 18, /* RW--I */
+
+ ZIO_STAGE_CHECKSUM_VERIFY = 1 << 19, /* R---- */
+
+ ZIO_STAGE_DONE = 1 << 20 /* RWFCI */
+};
+
+#define ZIO_INTERLOCK_STAGES /* each mask below is an OR of ZIO_STAGE_* bits */ \
+ (ZIO_STAGE_READY | \
+ ZIO_STAGE_DONE)
+
+#define ZIO_INTERLOCK_PIPELINE /* the interlock stages and nothing else */ \
+ ZIO_INTERLOCK_STAGES
+
+#define ZIO_VDEV_IO_STAGES \
+ (ZIO_STAGE_VDEV_IO_START | \
+ ZIO_STAGE_VDEV_IO_DONE | \
+ ZIO_STAGE_VDEV_IO_ASSESS)
+
+#define ZIO_VDEV_CHILD_PIPELINE \
+ (ZIO_VDEV_IO_STAGES | \
+ ZIO_STAGE_DONE) /* note: DONE only, ZIO_STAGE_READY is not included */
+
+#define ZIO_READ_COMMON_STAGES \
+ (ZIO_INTERLOCK_STAGES | \
+ ZIO_VDEV_IO_STAGES | \
+ ZIO_STAGE_CHECKSUM_VERIFY)
+
+#define ZIO_READ_PHYS_PIPELINE /* the common read stages, without READ_BP_INIT */ \
+ ZIO_READ_COMMON_STAGES
+
+#define ZIO_READ_PIPELINE \
+ (ZIO_READ_COMMON_STAGES | \
+ ZIO_STAGE_READ_BP_INIT)
+
+#define ZIO_DDT_CHILD_READ_PIPELINE /* same mask as ZIO_READ_PHYS_PIPELINE */ \
+ ZIO_READ_COMMON_STAGES
+
+#define ZIO_DDT_READ_PIPELINE /* has no vdev I/O or CHECKSUM_VERIFY stage of its own */ \
+ (ZIO_INTERLOCK_STAGES | \
+ ZIO_STAGE_READ_BP_INIT | \
+ ZIO_STAGE_DDT_READ_START | \
+ ZIO_STAGE_DDT_READ_DONE)
+
+#define ZIO_WRITE_COMMON_STAGES \
+ (ZIO_INTERLOCK_STAGES | \
+ ZIO_VDEV_IO_STAGES | \
+ ZIO_STAGE_ISSUE_ASYNC | \
+ ZIO_STAGE_CHECKSUM_GENERATE)
+
+#define ZIO_WRITE_PHYS_PIPELINE /* the common write stages only */ \
+ ZIO_WRITE_COMMON_STAGES
+
+#define ZIO_REWRITE_PIPELINE /* ZIO_WRITE_PIPELINE without DVA_ALLOCATE */ \
+ (ZIO_WRITE_COMMON_STAGES | \
+ ZIO_STAGE_WRITE_BP_INIT)
+
+#define ZIO_WRITE_PIPELINE \
+ (ZIO_WRITE_COMMON_STAGES | \
+ ZIO_STAGE_WRITE_BP_INIT | \
+ ZIO_STAGE_DVA_ALLOCATE)
+
+#define ZIO_DDT_CHILD_WRITE_PIPELINE /* no ISSUE_ASYNC, WRITE_BP_INIT or CHECKSUM_GENERATE */ \
+ (ZIO_INTERLOCK_STAGES | \
+ ZIO_VDEV_IO_STAGES | \
+ ZIO_STAGE_DVA_ALLOCATE)
+
+#define ZIO_DDT_WRITE_PIPELINE /* has no vdev I/O or DVA_ALLOCATE stage of its own */ \
+ (ZIO_INTERLOCK_STAGES | \
+ ZIO_STAGE_ISSUE_ASYNC | \
+ ZIO_STAGE_WRITE_BP_INIT | \
+ ZIO_STAGE_CHECKSUM_GENERATE | \
+ ZIO_STAGE_DDT_WRITE)
+
+#define ZIO_GANG_STAGES /* not OR-ed into any pipeline defined in this header */ \
+ (ZIO_STAGE_GANG_ASSEMBLE | \
+ ZIO_STAGE_GANG_ISSUE)
+
+#define ZIO_FREE_PIPELINE \
+ (ZIO_INTERLOCK_STAGES | \
+ ZIO_STAGE_FREE_BP_INIT | \
+ ZIO_STAGE_DVA_FREE)
+
+#define ZIO_DDT_FREE_PIPELINE /* versus ZIO_FREE_PIPELINE: adds ISSUE_ASYNC, uses DDT_FREE in place of DVA_FREE */ \
+ (ZIO_INTERLOCK_STAGES | \
+ ZIO_STAGE_FREE_BP_INIT | \
+ ZIO_STAGE_ISSUE_ASYNC | \
+ ZIO_STAGE_DDT_FREE)
+
+#define ZIO_CLAIM_PIPELINE \
+ (ZIO_INTERLOCK_STAGES | \
+ ZIO_STAGE_DVA_CLAIM)
+
+#define ZIO_IOCTL_PIPELINE /* note: omits ZIO_STAGE_VDEV_IO_DONE */ \
+ (ZIO_INTERLOCK_STAGES | \
+ ZIO_STAGE_VDEV_IO_START | \
+ ZIO_STAGE_VDEV_IO_ASSESS)
+
+#define ZIO_BLOCKING_STAGES /* presumably the stages that may block -- TODO confirm */ \
+ (ZIO_STAGE_DVA_ALLOCATE | \
+ ZIO_STAGE_DVA_CLAIM | \
+ ZIO_STAGE_VDEV_IO_START)
+
+extern void zio_inject_init(void);
+extern void zio_inject_fini(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ZIO_IMPL_H */
diff --git a/uts/common/fs/zfs/sys/zrlock.h b/uts/common/fs/zfs/sys/zrlock.h
new file mode 100644
index 000000000000..dcd63f7b5b91
--- /dev/null
+++ b/uts/common/fs/zfs/sys/zrlock.h
@@ -0,0 +1,66 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _SYS_ZRLOCK_H
+#define _SYS_ZRLOCK_H
+
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct zrlock { /* mutex, condition variable and a signed reference count */
+ kmutex_t zr_mtx;
+ volatile int32_t zr_refcount; /* signed 32-bit count, declared volatile */
+ kcondvar_t zr_cv;
+ uint16_t zr_pad;
+#ifdef ZFS_DEBUG
+ kthread_t *zr_owner; /* present in ZFS_DEBUG builds only */
+ const char *zr_caller; /* ZFS_DEBUG only; presumably the name passed to zrl_add_debug() -- TODO confirm */
+#endif
+} zrlock_t;
+
+extern void zrl_init(zrlock_t *);
+extern void zrl_destroy(zrlock_t *);
+#ifdef ZFS_DEBUG
+#define zrl_add(_z) zrl_add_debug((_z), __func__) /* debug builds pass the calling function's name */
+extern void zrl_add_debug(zrlock_t *, const char *);
+#else
+extern void zrl_add(zrlock_t *); /* non-debug builds: zrl_add is a plain function */
+#endif
+extern void zrl_remove(zrlock_t *);
+extern int zrl_tryenter(zrlock_t *);
+extern void zrl_exit(zrlock_t *);
+extern int zrl_is_zero(zrlock_t *);
+extern int zrl_is_locked(zrlock_t *);
+#ifdef ZFS_DEBUG
+extern kthread_t *zrl_owner(zrlock_t *); /* declared in ZFS_DEBUG builds only */
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZRLOCK_H */
diff --git a/uts/common/fs/zfs/sys/zvol.h b/uts/common/fs/zfs/sys/zvol.h
new file mode 100644
index 000000000000..0059bf510260
--- /dev/null
+++ b/uts/common/fs/zfs/sys/zvol.h
@@ -0,0 +1,76 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _SYS_ZVOL_H
+#define _SYS_ZVOL_H
+
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define ZVOL_OBJ 1ULL /* presumably the volume data object number - confirm */
+#define ZVOL_ZAP_OBJ 2ULL /* presumably the zvol ZAP object number - confirm */
+
+#ifdef _KERNEL /* everything up to the matching #endif is kernel-only */
+extern int zvol_check_volsize(uint64_t volsize, uint64_t blocksize);
+extern int zvol_check_volblocksize(uint64_t volblocksize);
+extern int zvol_get_stats(objset_t *os, nvlist_t *nv);
+extern void zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx);
+extern int zvol_create_minor(const char *); /* string is presumably a dataset name */
+extern int zvol_remove_minor(const char *);
+extern void zvol_remove_minors(const char *);
+extern int zvol_set_volsize(const char *, major_t, uint64_t);
+
+extern int zvol_open(dev_t *devp, int flag, int otyp, cred_t *cr); /* NOTE(review): open..ioctl look like driver entry points - confirm */
+extern int zvol_dump(dev_t dev, caddr_t addr, daddr_t offset, int nblocks);
+extern int zvol_close(dev_t dev, int flag, int otyp, cred_t *cr);
+extern int zvol_strategy(buf_t *bp);
+extern int zvol_read(dev_t dev, uio_t *uiop, cred_t *cr);
+extern int zvol_write(dev_t dev, uio_t *uiop, cred_t *cr);
+extern int zvol_aread(dev_t dev, struct aio_req *aio, cred_t *cr);
+extern int zvol_awrite(dev_t dev, struct aio_req *aio, cred_t *cr);
+extern int zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr,
+ int *rvalp);
+extern int zvol_busy(void);
+extern void zvol_init(void);
+extern void zvol_fini(void);
+
+extern int zvol_get_volume_params(minor_t minor, uint64_t *blksize, /* pointer args are presumably outputs - confirm */
+ uint64_t *max_xfer_len, void **minor_hdl, void **objset_hdl, void **zil_hdl,
+ void **rl_hdl, void **bonus_hdl);
+extern uint64_t zvol_get_volume_size(void *minor_hdl); /* minor_hdl: presumably from zvol_get_volume_params() */
+extern int zvol_get_volume_wce(void *minor_hdl);
+extern void zvol_log_write_minor(void *minor_hdl, dmu_tx_t *tx, offset_t off,
+ ssize_t resid, boolean_t sync);
+
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZVOL_H */