Diffstat (limited to 'usr.sbin/makefs/zfs/zap.c')
 -rw-r--r--  usr.sbin/makefs/zfs/zap.c | 567
 1 file changed, 567 insertions(+), 0 deletions(-)
diff --git a/usr.sbin/makefs/zfs/zap.c b/usr.sbin/makefs/zfs/zap.c
new file mode 100644
index 000000000000..316d1446cecf
--- /dev/null
+++ b/usr.sbin/makefs/zfs/zap.c
@@ -0,0 +1,567 @@
/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2022 The FreeBSD Foundation
 *
 * This software was developed by Mark Johnston under sponsorship from
 * the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/endian.h>

#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#include <util.h>

#include "makefs.h"
#include "zfs.h"

typedef struct zfs_zap_entry {
        char            *name;          /* entry key, private copy */
        uint64_t        hash;           /* key hash */
        union {
                uint8_t  *valp;
                uint16_t *val16p;
                uint32_t *val32p;
                uint64_t *val64p;
        };                              /* entry value, an integer array */
        uint64_t        val64;          /* embedded value for a common case */
        size_t          intsz;          /* array element size; 1, 2, 4 or 8 */
        size_t          intcnt;         /* array size */
        STAILQ_ENTRY(zfs_zap_entry) next;
} zfs_zap_entry_t;

struct zfs_zap {
        STAILQ_HEAD(, zfs_zap_entry) kvps;
        uint64_t        hashsalt;       /* key hash input */
        unsigned long   kvpcnt;         /* number of key-value pairs */
        unsigned long   chunks;         /* count of chunks needed for fat ZAP */
        bool            micro;          /* can this be a micro ZAP? */

        dnode_phys_t    *dnode;         /* backpointer */
        zfs_objset_t    *os;            /* backpointer */
};

static uint16_t
zap_entry_chunks(zfs_zap_entry_t *ent)
{
        return (1 + howmany(strlen(ent->name) + 1, ZAP_LEAF_ARRAY_BYTES) +
            howmany(ent->intsz * ent->intcnt, ZAP_LEAF_ARRAY_BYTES));
}

static uint64_t
zap_hash(uint64_t salt, const char *name)
{
        static uint64_t crc64_table[256];
        const uint64_t crc64_poly = 0xC96C5795D7870F42UL;
        const uint8_t *cp;
        uint64_t crc;
        uint8_t c;

        assert(salt != 0);
        if (crc64_table[128] == 0) {
                for (int i = 0; i < 256; i++) {
                        uint64_t *t;

                        t = crc64_table + i;
                        *t = i;
                        for (int j = 8; j > 0; j--)
                                *t = (*t >> 1) ^ (-(*t & 1) & crc64_poly);
                }
        }
        assert(crc64_table[128] == crc64_poly);

        for (cp = (const uint8_t *)name, crc = salt; (c = *cp) != '\0'; cp++)
                crc = (crc >> 8) ^ crc64_table[(crc ^ c) & 0xFF];

        /*
         * Only use 28 bits, since we need 4 bits in the cookie for the
         * collision differentiator.  We MUST use the high bits, since
         * those are the ones that we first pay attention to when
         * choosing the bucket.
         */
        crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);

        return (crc);
}
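An illustrative aside, not part of the patch: the fat ZAP code further down selects buckets and leaf blocks from the high bits of this salted CRC-64. A minimal stand-in for the ZAP_HASH_IDX() macro from the ZFS headers (the macro itself is assumed here, since it is not shown in this diff):

#include <stdint.h>

/* Take the top n bits of a ZAP hash; mirrors ZAP_HASH_IDX(hash, n). */
static inline uint64_t
hash_idx(uint64_t hash, unsigned int n)
{
        return (n == 0 ? 0 : hash >> (64 - n));
}

/*
 * With a prefix length of 3, hash 0xe000000000000000 maps to bucket 7 and
 * hash 0x1000000000000000 maps to bucket 0; only the high 28 bits kept by
 * the ZAP_HASHBITS mask above ever matter.
 */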
zfs_zap_t *
zap_alloc(zfs_objset_t *os, dnode_phys_t *dnode)
{
        zfs_zap_t *zap;

        zap = ecalloc(1, sizeof(*zap));
        STAILQ_INIT(&zap->kvps);
        zap->hashsalt = ((uint64_t)random() << 32) | random();
        zap->micro = true;
        zap->kvpcnt = 0;
        zap->chunks = 0;
        zap->dnode = dnode;
        zap->os = os;
        return (zap);
}

void
zap_add(zfs_zap_t *zap, const char *name, size_t intsz, size_t intcnt,
    const uint8_t *val)
{
        zfs_zap_entry_t *ent;

        assert(intsz == 1 || intsz == 2 || intsz == 4 || intsz == 8);
        assert(strlen(name) + 1 <= ZAP_MAXNAMELEN);
        assert(intcnt <= ZAP_MAXVALUELEN && intcnt * intsz <= ZAP_MAXVALUELEN);

        ent = ecalloc(1, sizeof(*ent));
        ent->name = estrdup(name);
        ent->hash = zap_hash(zap->hashsalt, ent->name);
        ent->intsz = intsz;
        ent->intcnt = intcnt;
        if (intsz == sizeof(uint64_t) && intcnt == 1) {
                /*
                 * Micro-optimization to elide a memory allocation in the
                 * most common case, where this is a directory entry.
                 */
                ent->val64p = &ent->val64;
        } else {
                ent->valp = ecalloc(intcnt, intsz);
        }
        memcpy(ent->valp, val, intcnt * intsz);
        zap->kvpcnt++;
        zap->chunks += zap_entry_chunks(ent);
        STAILQ_INSERT_TAIL(&zap->kvps, ent, next);

        if (zap->micro && (intcnt != 1 || intsz != sizeof(uint64_t) ||
            strlen(name) + 1 > MZAP_NAME_LEN || zap->kvpcnt > MZAP_ENT_MAX))
                zap->micro = false;
}

void
zap_add_uint64(zfs_zap_t *zap, const char *name, uint64_t val)
{
        zap_add(zap, name, sizeof(uint64_t), 1, (uint8_t *)&val);
}

void
zap_add_uint64_self(zfs_zap_t *zap, uint64_t val)
{
        char name[32];

        (void)snprintf(name, sizeof(name), "%jx", (uintmax_t)val);
        zap_add(zap, name, sizeof(uint64_t), 1, (uint8_t *)&val);
}

void
zap_add_string(zfs_zap_t *zap, const char *name, const char *val)
{
        zap_add(zap, name, 1, strlen(val) + 1, (const uint8_t *)val);
}

bool
zap_entry_exists(zfs_zap_t *zap, const char *name)
{
        zfs_zap_entry_t *ent;

        STAILQ_FOREACH(ent, &zap->kvps, next) {
                if (strcmp(ent->name, name) == 0)
                        return (true);
        }
        return (false);
}
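A hypothetical caller, for illustration only: building a directory's ZAP with the functions above. The object set and dnode are assumed to come from makefs's object-set code elsewhere; real ZFS directory entries also pack an entry type into the value's upper bits, which this sketch ignores.

static void
example_populate_dir(zfs_opt_t *zfs, zfs_objset_t *os, dnode_phys_t *dnode)
{
        zfs_zap_t *zap;

        zap = zap_alloc(os, dnode);
        /* Directory entries map names to object numbers (single uint64s). */
        zap_add_uint64(zap, "hello.txt", 123);
        zap_add_uint64(zap, "subdir", 124);
        /* Serializes as a micro or fat ZAP and frees the whole structure. */
        zap_write(zfs, zap);
}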
static void
zap_micro_write(zfs_opt_t *zfs, zfs_zap_t *zap)
{
        dnode_phys_t *dnode;
        zfs_zap_entry_t *ent;
        mzap_phys_t *mzap;
        mzap_ent_phys_t *ment;
        off_t bytes, loc;
        uint16_t cd;

        _Static_assert(MZAP_ENT_MAX <= UINT16_MAX,
            "micro ZAP collision differentiator must fit in 16 bits");

        memset(zfs->filebuf, 0, sizeof(zfs->filebuf));
        mzap = (mzap_phys_t *)&zfs->filebuf[0];
        mzap->mz_block_type = ZBT_MICRO;
        mzap->mz_salt = zap->hashsalt;
        mzap->mz_normflags = 0;

        bytes = sizeof(*mzap) + (zap->kvpcnt - 1) * sizeof(*ment);
        assert(bytes <= (off_t)MZAP_MAX_BLKSZ);

        cd = 0;
        ment = &mzap->mz_chunk[0];
        STAILQ_FOREACH(ent, &zap->kvps, next) {
                memcpy(&ment->mze_value, ent->valp, ent->intsz * ent->intcnt);
                ment->mze_cd = cd++;
                (void)strlcpy(ment->mze_name, ent->name,
                    sizeof(ment->mze_name));
                ment++;
        }

        loc = objset_space_alloc(zfs, zap->os, &bytes);

        dnode = zap->dnode;
        dnode->dn_maxblkid = 0;
        dnode->dn_datablkszsec = bytes >> MINBLOCKSHIFT;

        vdev_pwrite_dnode_data(zfs, dnode, zfs->filebuf, bytes, loc);
}
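The size computation above works because mzap_phys_t embeds the first 64-byte chunk. A worked sketch of the capacity arithmetic, assuming the usual on-disk sizes (128-byte mzap_phys_t, 64-byte mzap_ent_phys_t, 128KB MZAP_MAX_BLKSZ; these constants are assumed, not taken from this diff):

#include <stddef.h>

/* Mirrors the "bytes" computation in zap_micro_write() with assumed sizes. */
static size_t
mzap_bytes(unsigned long kvpcnt)
{
        return (128 + (kvpcnt - 1) * 64);
}

/*
 * mzap_bytes(2047) == 131072, exactly one 128KB block, while
 * mzap_bytes(2048) overflows it; hence MZAP_ENT_MAX would be 2047 under
 * these assumptions.
 */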
/*
 * Write some data to the fat ZAP leaf chunk starting at index "li".
 *
 * Note that individual integers in the value may be split among consecutive
 * leaf chunks.
 */
static void
zap_fat_write_array_chunk(zap_leaf_t *l, uint16_t li, size_t sz,
    const uint8_t *val)
{
        struct zap_leaf_array *la;

        assert(sz <= ZAP_MAXVALUELEN);
        assert(sz > 0);

        for (uint16_t n, resid = sz; resid > 0; resid -= n, val += n, li++) {
                n = MIN(resid, ZAP_LEAF_ARRAY_BYTES);

                la = &ZAP_LEAF_CHUNK(l, li).l_array;
                assert(la->la_type == ZAP_CHUNK_FREE);
                la->la_type = ZAP_CHUNK_ARRAY;
                memcpy(la->la_array, val, n);
                la->la_next = li + 1;
        }
        la->la_next = 0xffff;
}

/*
 * Find the shortest hash prefix length which lets us distribute keys without
 * overflowing a leaf block.  This is not (space) optimal, but it is simple,
 * and directories large enough to overflow a single 128KB leaf block are
 * uncommon.
 */
static unsigned int
zap_fat_write_prefixlen(zfs_zap_t *zap, zap_leaf_t *l)
{
        zfs_zap_entry_t *ent;
        unsigned int prefixlen;

        if (zap->chunks <= ZAP_LEAF_NUMCHUNKS(l)) {
                /*
                 * All chunks will fit in a single leaf block.
                 */
                return (0);
        }

        for (prefixlen = 1; prefixlen < (unsigned int)l->l_bs; prefixlen++) {
                uint32_t *leafchunks;

                leafchunks = ecalloc(1u << prefixlen, sizeof(*leafchunks));
                STAILQ_FOREACH(ent, &zap->kvps, next) {
                        uint64_t li;
                        uint16_t chunks;

                        li = ZAP_HASH_IDX(ent->hash, prefixlen);

                        chunks = zap_entry_chunks(ent);
                        if (ZAP_LEAF_NUMCHUNKS(l) - leafchunks[li] < chunks) {
                                /*
                                 * Not enough space; grow the prefix and
                                 * retry.
                                 */
                                break;
                        }
                        leafchunks[li] += chunks;
                }
                free(leafchunks);

                if (ent == NULL) {
                        /*
                         * Everything fits; we're done.
                         */
                        break;
                }
        }

        /*
         * If this fails, then we need to expand the pointer table.  For now
         * this situation is unhandled since it is hard to trigger.
         */
        assert(prefixlen < (unsigned int)l->l_bs);

        return (prefixlen);
}

/*
 * Initialize a fat ZAP leaf block.
 */
static void
zap_fat_write_leaf_init(zap_leaf_t *l, uint64_t prefix, int prefixlen)
{
        zap_leaf_phys_t *leaf;

        leaf = l->l_phys;

        leaf->l_hdr.lh_block_type = ZBT_LEAF;
        leaf->l_hdr.lh_magic = ZAP_LEAF_MAGIC;
        leaf->l_hdr.lh_nfree = ZAP_LEAF_NUMCHUNKS(l);
        leaf->l_hdr.lh_prefix = prefix;
        leaf->l_hdr.lh_prefix_len = prefixlen;

        /* Initialize the leaf hash table. */
        assert(leaf->l_hdr.lh_nfree < 0xffff);
        memset(leaf->l_hash, 0xff,
            ZAP_LEAF_HASH_NUMENTRIES(l) * sizeof(*leaf->l_hash));

        /* Initialize the leaf chunks. */
        for (uint16_t i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) {
                struct zap_leaf_free *lf;

                lf = &ZAP_LEAF_CHUNK(l, i).l_free;
                lf->lf_type = ZAP_CHUNK_FREE;
                if (i + 1 == ZAP_LEAF_NUMCHUNKS(l))
                        lf->lf_next = 0xffff;
                else
                        lf->lf_next = i + 1;
        }
}
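For reference, a toy picture (not from the patch) of what zap_fat_write_leaf_init() produces for a hypothetical four-chunk leaf. zap_fat_write() below carves entry runs off the front of this list via lh_freelist, which starts at 0 because the leaf buffers come from ecalloc():

/*
 *   chunk 0: ZAP_CHUNK_FREE, lf_next = 1
 *   chunk 1: ZAP_CHUNK_FREE, lf_next = 2
 *   chunk 2: ZAP_CHUNK_FREE, lf_next = 3
 *   chunk 3: ZAP_CHUNK_FREE, lf_next = 0xffff   (end of list)
 */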
static void
zap_fat_write(zfs_opt_t *zfs, zfs_zap_t *zap)
{
        struct dnode_cursor *c;
        zap_leaf_t l;
        zap_phys_t *zaphdr;
        struct zap_table_phys *zt;
        zfs_zap_entry_t *ent;
        dnode_phys_t *dnode;
        uint8_t *leafblks;
        uint64_t lblkcnt, *ptrhasht;
        off_t loc, blksz;
        size_t blkshift;
        unsigned int prefixlen;
        int ptrcnt;

        /*
         * For simplicity, always use the largest block size.  This should be
         * ok since most directories will be micro ZAPs, but it's space
         * inefficient for small ZAPs and might need to be revisited.
         */
        blkshift = MAXBLOCKSHIFT;
        blksz = (off_t)1 << blkshift;

        /*
         * Embedded pointer tables give up to 8192 entries.  This ought to be
         * enough for anything except massive directories.
         */
        ptrcnt = (blksz / 2) / sizeof(uint64_t);

        memset(zfs->filebuf, 0, sizeof(zfs->filebuf));
        zaphdr = (zap_phys_t *)&zfs->filebuf[0];
        zaphdr->zap_block_type = ZBT_HEADER;
        zaphdr->zap_magic = ZAP_MAGIC;
        zaphdr->zap_num_entries = zap->kvpcnt;
        zaphdr->zap_salt = zap->hashsalt;

        l.l_bs = blkshift;
        l.l_phys = NULL;

        zt = &zaphdr->zap_ptrtbl;
        zt->zt_blk = 0;
        zt->zt_numblks = 0;
        zt->zt_shift = flsll(ptrcnt) - 1;
        zt->zt_nextblk = 0;
        zt->zt_blks_copied = 0;

        /*
         * How many leaf blocks do we need?  Initialize them and update the
         * header.
         */
        prefixlen = zap_fat_write_prefixlen(zap, &l);
        lblkcnt = (uint64_t)1 << prefixlen;
        leafblks = ecalloc(lblkcnt, blksz);
        for (unsigned int li = 0; li < lblkcnt; li++) {
                l.l_phys = (zap_leaf_phys_t *)(leafblks + li * blksz);
                zap_fat_write_leaf_init(&l, li, prefixlen);
        }
        zaphdr->zap_num_leafs = lblkcnt;
        zaphdr->zap_freeblk = lblkcnt + 1;

        /*
         * For each entry, figure out which leaf block it belongs to based on
         * the upper bits of its hash, allocate chunks from that leaf, and
         * fill them out.
         */
        ptrhasht = (uint64_t *)(&zfs->filebuf[0] + blksz / 2);
        STAILQ_FOREACH(ent, &zap->kvps, next) {
                struct zap_leaf_entry *le;
                uint16_t *lptr;
                uint64_t hi, li;
                uint16_t namelen, nchunks, nnamechunks, nvalchunks;

                hi = ZAP_HASH_IDX(ent->hash, zt->zt_shift);
                li = ZAP_HASH_IDX(ent->hash, prefixlen);
                assert(ptrhasht[hi] == 0 || ptrhasht[hi] == li + 1);
                ptrhasht[hi] = li + 1;
                l.l_phys = (zap_leaf_phys_t *)(leafblks + li * blksz);

                namelen = strlen(ent->name) + 1;

                /*
                 * How many leaf chunks do we need for this entry?
                 */
                nnamechunks = howmany(namelen, ZAP_LEAF_ARRAY_BYTES);
                nvalchunks = howmany(ent->intcnt,
                    ZAP_LEAF_ARRAY_BYTES / ent->intsz);
                nchunks = 1 + nnamechunks + nvalchunks;

                /*
                 * Allocate a run of free leaf chunks for this entry,
                 * potentially extending a hash chain.
                 */
                assert(l.l_phys->l_hdr.lh_nfree >= nchunks);
                l.l_phys->l_hdr.lh_nfree -= nchunks;
                l.l_phys->l_hdr.lh_nentries++;
                lptr = ZAP_LEAF_HASH_ENTPTR(&l, ent->hash);
                while (*lptr != 0xffff) {
                        assert(*lptr < ZAP_LEAF_NUMCHUNKS(&l));
                        le = ZAP_LEAF_ENTRY(&l, *lptr);
                        assert(le->le_type == ZAP_CHUNK_ENTRY);
                        le->le_cd++;
                        lptr = &le->le_next;
                }
                *lptr = l.l_phys->l_hdr.lh_freelist;
                l.l_phys->l_hdr.lh_freelist += nchunks;
                assert(l.l_phys->l_hdr.lh_freelist <=
                    ZAP_LEAF_NUMCHUNKS(&l));
                if (l.l_phys->l_hdr.lh_freelist ==
                    ZAP_LEAF_NUMCHUNKS(&l))
                        l.l_phys->l_hdr.lh_freelist = 0xffff;

                /*
                 * Integer values must be stored in big-endian format.
                 */
                switch (ent->intsz) {
                case 1:
                        break;
                case 2:
                        for (uint16_t *v = ent->val16p;
                            v - ent->val16p < (ptrdiff_t)ent->intcnt;
                            v++)
                                *v = htobe16(*v);
                        break;
                case 4:
                        for (uint32_t *v = ent->val32p;
                            v - ent->val32p < (ptrdiff_t)ent->intcnt;
                            v++)
                                *v = htobe32(*v);
                        break;
                case 8:
                        for (uint64_t *v = ent->val64p;
                            v - ent->val64p < (ptrdiff_t)ent->intcnt;
                            v++)
                                *v = htobe64(*v);
                        break;
                default:
                        assert(0);
                }

                /*
                 * Finally, write out the leaf chunks for this entry.
                 */
                le = ZAP_LEAF_ENTRY(&l, *lptr);
                assert(le->le_type == ZAP_CHUNK_FREE);
                le->le_type = ZAP_CHUNK_ENTRY;
                le->le_next = 0xffff;
                le->le_name_chunk = *lptr + 1;
                le->le_name_numints = namelen;
                le->le_value_chunk = *lptr + 1 + nnamechunks;
                le->le_value_intlen = ent->intsz;
                le->le_value_numints = ent->intcnt;
                le->le_hash = ent->hash;
                zap_fat_write_array_chunk(&l, *lptr + 1, namelen,
                    (uint8_t *)ent->name);
                zap_fat_write_array_chunk(&l, *lptr + 1 + nnamechunks,
                    ent->intcnt * ent->intsz, ent->valp);
        }

        /*
         * Initialize unused slots of the pointer table.
         */
        for (int i = 0; i < ptrcnt; i++)
                if (ptrhasht[i] == 0)
                        ptrhasht[i] = (i >> (zt->zt_shift - prefixlen)) + 1;

        /*
         * Write the whole thing to disk.
         */
        dnode = zap->dnode;
        dnode->dn_datablkszsec = blksz >> MINBLOCKSHIFT;
        dnode->dn_maxblkid = lblkcnt + 1;

        c = dnode_cursor_init(zfs, zap->os, zap->dnode,
            (lblkcnt + 1) * blksz, blksz);

        loc = objset_space_alloc(zfs, zap->os, &blksz);
        vdev_pwrite_dnode_indir(zfs, dnode, 0, 1, zfs->filebuf, blksz, loc,
            dnode_cursor_next(zfs, c, 0));

        for (uint64_t i = 0; i < lblkcnt; i++) {
                loc = objset_space_alloc(zfs, zap->os, &blksz);
                vdev_pwrite_dnode_indir(zfs, dnode, 0, 1, leafblks + i * blksz,
                    blksz, loc, dnode_cursor_next(zfs, c, (i + 1) * blksz));
        }

        dnode_cursor_finish(zfs, c);

        free(leafblks);
}
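Another illustrative aside: the final pointer-table loop keeps the invariant that slot i references the leaf covering that slice of the hash space. A self-contained sketch of the arithmetic (function name hypothetical; zt_shift is 13 for the 8192-slot embedded table used here):

#include <stdint.h>

/* Stored pointer for slot i: leaf index plus 1, as in zap_fat_write(). */
static uint64_t
ptrtbl_slot(int i, int zt_shift, unsigned int prefixlen)
{
        return ((uint64_t)(i >> (zt_shift - prefixlen)) + 1);
}

/*
 * With prefixlen == 2 (four leaves): slots 0-2047 point at leaf 0,
 * 2048-4095 at leaf 1, and so on through slot 8191 at leaf 3.
 */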
void
zap_write(zfs_opt_t *zfs, zfs_zap_t *zap)
{
        zfs_zap_entry_t *ent;

        if (zap->micro) {
                zap_micro_write(zfs, zap);
        } else {
                assert(!STAILQ_EMPTY(&zap->kvps));
                assert(zap->kvpcnt > 0);
                zap_fat_write(zfs, zap);
        }

        while ((ent = STAILQ_FIRST(&zap->kvps)) != NULL) {
                STAILQ_REMOVE_HEAD(&zap->kvps, next);
                if (ent->val64p != &ent->val64)
                        free(ent->valp);
                free(ent->name);
                free(ent);
        }
        free(zap);
}
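Finally, a hypothetical example of input that forces the fat path in zap_write(): any entry whose value is not a single uint64, or whose name exceeds MZAP_NAME_LEN, clears zap->micro in zap_add().

static void
example_force_fat(zfs_zap_t *zap)
{
        uint64_t vals[3] = { 1, 2, 3 };

        /* A multi-integer value cannot be represented in a micro ZAP. */
        zap_add(zap, "array-valued", sizeof(uint64_t), 3,
            (const uint8_t *)vals);
}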
