diff options
| -rw-r--r-- | sys/conf/NOTES | 1 | ||||
| -rw-r--r-- | sys/conf/files | 4 | ||||
| -rw-r--r-- | sys/conf/options | 1 | ||||
| -rw-r--r-- | sys/geom/bde/g_bde.c | 282 | ||||
| -rw-r--r-- | sys/geom/bde/g_bde.h | 150 | ||||
| -rw-r--r-- | sys/geom/bde/g_bde_crypt.c | 356 | ||||
| -rw-r--r-- | sys/geom/bde/g_bde_lock.c | 311 | ||||
| -rw-r--r-- | sys/geom/bde/g_bde_work.c | 731 |
8 files changed, 1836 insertions, 0 deletions
diff --git a/sys/conf/NOTES b/sys/conf/NOTES index 81df5c3b3ad1..02e1d9f1080a 100644 --- a/sys/conf/NOTES +++ b/sys/conf/NOTES @@ -115,6 +115,7 @@ options PQ_CACHESIZE=512 # color for 512k/16k cache options INCLUDE_CONFIG_FILE # Include this file in kernel options GEOM_AES +options GEOM_BDE options GEOM_BSD options GEOM_GPT options GEOM_MBR diff --git a/sys/conf/files b/sys/conf/files index 2bace6d1bf26..c003bc3c66af 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -796,6 +796,10 @@ fs/umapfs/umap_vnops.c optional umapfs fs/unionfs/union_subr.c optional unionfs fs/unionfs/union_vfsops.c optional unionfs fs/unionfs/union_vnops.c optional unionfs +geom/bde/g_bde.c optional geom_bde +geom/bde/g_bde_crypt.c optional geom_bde +geom/bde/g_bde_lock.c optional geom_bde +geom/bde/g_bde_work.c optional geom_bde geom/geom_aes.c optional geom_aes geom/geom_bsd.c optional geom_bsd geom/geom_ctl.c standard diff --git a/sys/conf/options b/sys/conf/options index 8480b1cf0682..031184935fb2 100644 --- a/sys/conf/options +++ b/sys/conf/options @@ -88,6 +88,7 @@ GDB_REMOTE_CHAT opt_ddb.h GDBSPEED opt_ddb.h NO_GEOM opt_geom.h GEOM_AES opt_geom.h +GEOM_BDE opt_geom.h GEOM_BSD opt_geom.h GEOM_GPT opt_geom.h GEOM_MBR opt_geom.h diff --git a/sys/geom/bde/g_bde.c b/sys/geom/bde/g_bde.c new file mode 100644 index 000000000000..51fd77977004 --- /dev/null +++ b/sys/geom/bde/g_bde.c @@ -0,0 +1,282 @@ +/*- + * Copyright (c) 2002 Poul-Henning Kamp + * Copyright (c) 2002 Networks Associates Technology, Inc. + * All rights reserved. + * + * This software was developed for the FreeBSD Project by Poul-Henning Kamp + * and NAI Labs, the Security Research Division of Network Associates, Inc. + * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the + * DARPA CHATS research program. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The names of the authors may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + * + */ + +#include <sys/param.h> +#include <sys/stdint.h> +#include <sys/bio.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/malloc.h> +#include <geom/geom.h> +#include <geom/bde/g_bde.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/kthread.h> + +#define BDE_CLASS_NAME "BDE" + +static void +g_bde_start(struct bio *bp) +{ + struct g_geom *gp; + struct g_consumer *cp; + struct g_bde_softc *sc; + + gp = bp->bio_to->geom; + cp = LIST_FIRST(&gp->consumer); + sc = gp->softc; + switch (bp->bio_cmd) { + case BIO_DELETE: + case BIO_READ: + case BIO_WRITE: + g_bde_start1(bp); + break; + case BIO_GETATTR: + case BIO_SETATTR: + if (g_handleattr_off_t(bp, "GEOM::mediasize", sc->mediasize)) + return; + if (g_handleattr_int(bp, "GEOM::sectorsize", sc->sectorsize)) + return; + g_io_deliver(bp, EOPNOTSUPP); + break; + default: + g_io_deliver(bp, EOPNOTSUPP); + return; + } + return; +} + +static void +g_bde_orphan(struct g_consumer *cp) +{ + struct g_geom *gp; + struct g_provider *pp; + struct g_bde_softc *sc; + int error; + + g_trace(G_T_TOPOLOGY, "g_bde_orphan(%p/%s)", cp, cp->provider->name); + g_topology_assert(); + KASSERT(cp->provider->error != 0, + ("g_bde_orphan with error == 0")); + + gp = cp->geom; + sc = gp->softc; + gp->flags |= G_GEOM_WITHER; + error = cp->provider->error; + LIST_FOREACH(pp, &gp->provider, provider) + g_orphan_provider(pp, error); + bzero(sc, sizeof(struct g_bde_softc)); /* destroy evidence */ + return; +} + +static int +g_bde_access(struct g_provider *pp, int dr, int dw, int de) +{ + struct g_geom *gp; + struct g_consumer *cp; + + gp = pp->geom; + cp = LIST_FIRST(&gp->consumer); + if (cp->acr == 0 && cp->acw == 0 && cp->ace == 0) { + de++; + dr++; + } + /* ... 
and let go of it on last close */ + if ((cp->acr + dr) == 0 && (cp->acw + dw) == 0 && (cp->ace + de) == 1) { + de--; + dr--; + } + return (g_access_rel(cp, dr, dw, de)); +} + +static int +g_bde_create(struct g_createargs *ga) +{ + struct g_geom *gp; + struct g_consumer *cp; + struct g_provider *pp; + struct g_bde_key *kp; + int error; + u_int sectorsize; + off_t mediasize; + struct g_bde_softc *sc; + + g_trace(G_T_TOPOLOGY, "g_bde_create(%d)", ga->flag); + g_topology_assert(); + if (ga->flag == 1) { + /* + * Orderly dettachment. + */ + if (ga->geom != NULL) { + gp = ga->geom; + } else if (ga->provider != NULL) { + if (ga->provider->geom->class == ga->class) { + gp = ga->provider->geom; + } else { + LIST_FOREACH(cp, &ga->provider->consumers, + consumers) { + if (cp->geom->class == ga->class) { + gp = cp->geom; + break; + } + } + } + if (gp == NULL) + return (EINVAL); + } else { + return (EINVAL); + } + KASSERT(gp != NULL, ("NULL geom")); + pp = LIST_FIRST(&gp->provider); + KASSERT(pp != NULL, ("NULL provider")); + if (pp->acr > 0 || pp->acw > 0 || pp->ace > 0) + return (EBUSY); + g_orphan_provider(pp, ENXIO); + sc = gp->softc; + cp = LIST_FIRST(&gp->consumer); + KASSERT(cp != NULL, ("NULL consumer")); + sc->dead = 1; + wakeup(sc); + error = g_access_rel(cp, -1, -1, -1); + KASSERT(error == 0, ("error on close")); + g_detach(cp); + g_destroy_consumer(cp); + g_topology_unlock(); + while (sc->dead != 2 && !LIST_EMPTY(&pp->consumers)) + tsleep(sc, PRIBIO, "g_bdedie", hz); + g_topology_lock(); + g_destroy_provider(pp); + mtx_destroy(&sc->worklist_mutex); + bzero(&sc->key, sizeof sc->key); + g_free(sc); + g_destroy_geom(gp); + return (0); + } + + if (ga->flag != 0) + return (EOPNOTSUPP); + + if (ga->provider == NULL) + return (EINVAL); + /* + * Attach + */ + gp = g_new_geomf(ga->class, "%s.bde", ga->provider->name); + gp->start = g_bde_start; + gp->orphan = g_bde_orphan; + gp->access = g_bde_access; + gp->spoiled = g_std_spoiled; + cp = g_new_consumer(gp); + g_attach(cp, 
ga->provider); + error = g_access_rel(cp, 1, 1, 1); + if (error) { + g_detach(cp); + g_destroy_consumer(cp); + g_destroy_geom(gp); + return (error); + } + g_topology_unlock(); + while (1) { + error = g_getattr("GEOM::sectorsize", cp, &sectorsize); + if (error) + break; + error = g_getattr("GEOM::mediasize", cp, &mediasize); + if (error) + break; + sc = g_malloc(sizeof(struct g_bde_softc), M_WAITOK | M_ZERO); + gp->softc = sc; + sc->geom = gp; + sc->consumer = cp; + + error = g_bde_decrypt_lock(sc, ga->ptr, + (u_char *)ga->ptr + 256, mediasize, sectorsize, NULL); + bzero(sc->arc4_sbox, sizeof sc->arc4_sbox); + if (error) + break; + kp = &sc->key; + + /* Initialize helper-fields */ + kp->keys_per_sector = kp->sectorsize / G_BDE_SKEYLEN; + kp->zone_cont = kp->keys_per_sector * kp->sectorsize; + kp->zone_width = kp->zone_cont + kp->sectorsize; + kp->media_width = kp->sectorN - kp->sector0 - + G_BDE_MAXKEYS * kp->sectorsize; + + /* Our external parameters */ + sc->zone_cont = kp->zone_cont; + sc->mediasize = g_bde_max_sector(kp); + sc->sectorsize = kp->sectorsize; + + TAILQ_INIT(&sc->freelist); + TAILQ_INIT(&sc->worklist); + mtx_init(&sc->worklist_mutex, "g_bde_worklist", NULL, MTX_DEF); + mtx_lock(&Giant); + /* XXX: error check */ + kthread_create(g_bde_worker, gp, &sc->thread, 0, 0, + "g_bde %s", gp->name); + mtx_unlock(&Giant); + g_topology_lock(); + pp = g_new_providerf(gp, gp->name); + pp->mediasize = sc->mediasize; + g_error_provider(pp, 0); + g_topology_unlock(); + break; + } + g_topology_lock(); + if (error == 0) { + ga->geom = gp; + return (0); + } else { + g_access_rel(cp, -1, -1, -1); + } + g_detach(cp); + g_destroy_consumer(cp); + if (gp->softc != NULL) + g_free(gp->softc); + g_destroy_geom(gp); + return (error); +} + +static struct g_class g_bde_class = { + BDE_CLASS_NAME, + NULL, + g_bde_create, + G_CLASS_INITIALIZER +}; + +DECLARE_GEOM_CLASS(g_bde_class, g_bde); diff --git a/sys/geom/bde/g_bde.h b/sys/geom/bde/g_bde.h new file mode 100644 index 
000000000000..df924e420f10 --- /dev/null +++ b/sys/geom/bde/g_bde.h @@ -0,0 +1,150 @@ +/*- + * Copyright (c) 2002 Poul-Henning Kamp + * Copyright (c) 2002 Networks Associates Technology, Inc. + * All rights reserved. + * + * This software was developed for the FreeBSD Project by Poul-Henning Kamp + * and NAI Labs, the Security Research Division of Network Associates, Inc. + * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the + * DARPA CHATS research program. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The names of the authors may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +/* These are quite, but not entirely unlike constants. */ +#define G_BDE_MKEYLEN (2048/8) +#define G_BDE_SKEYBITS 128 +#define G_BDE_SKEYLEN (G_BDE_SKEYBITS/8) +#define G_BDE_KKEYBITS 128 +#define G_BDE_KKEYLEN (G_BDE_KKEYBITS/8) +#define G_BDE_MAXKEYS 4 +#define G_BDE_LOCKSIZE 384 + +/* This just needs to be "large enough" */ +#define G_BDE_KEYBYTES 304 + +struct g_bde_work; +struct g_bde_softc; + +struct g_bde_sector { + struct g_bde_work *owner; + struct g_bde_softc *softc; + off_t offset; + u_int size; + u_int ref; + void *data; + TAILQ_ENTRY(g_bde_sector) list; + u_char valid; + u_char malloc; + enum {JUNK, IO, VALID} state; + int error; +}; + +struct g_bde_work { + struct mtx mutex; + off_t offset; + off_t length; + void *data; + struct bio *bp; + struct g_bde_softc *softc; + off_t so; + off_t kso; + u_int ko; + struct g_bde_sector *sp; + struct g_bde_sector *ksp; + TAILQ_ENTRY(g_bde_work) list; + enum {SETUP, WAIT, FINISH} state; + int error; +}; + +struct g_bde_key { + uint64_t sector0; + /* Physical byte offset of first byte used */ + uint64_t sectorN; + /* Physical byte offset of first byte not used */ + uint64_t keyoffset; + uint64_t lsector[G_BDE_MAXKEYS]; + /* Physical offsets */ + uint32_t sectorsize; + uint32_t flags; + uint8_t hash[16]; + uint8_t spare[48]; + uint8_t key[G_BDE_MKEYLEN]; + /* Non-stored help-fields */ + uint64_t zone_width; /* On-disk width of zone */ + uint64_t zone_cont; /* Payload width of zone */ + uint64_t media_width; /* Non-magic width of zone */ + u_int keys_per_sector; +}; + +struct g_bde_softc { + off_t mediasize; + u_int sectorsize; + uint64_t zone_cont; + struct g_geom *geom; + struct g_consumer *consumer; + TAILQ_HEAD(, g_bde_sector) freelist; + TAILQ_HEAD(, g_bde_work) worklist; + struct mtx worklist_mutex; + struct proc *thread; + struct g_bde_key key; + u_char arc4_sbox[256]; + u_char arc4_i, arc4_j; + int dead; + u_int nwork; + u_int nsect; + u_int ncache; +}; + +/* g_bde_crypt.c */ +void 
g_bde_crypt_delete(struct g_bde_work *wp); +void g_bde_crypt_read(struct g_bde_work *wp); +void g_bde_crypt_write(struct g_bde_work *wp); + +/* g_bde_key.c */ +void g_bde_zap_key(struct g_bde_softc *sc); +int g_bde_get_key(struct g_bde_softc *sc, void *ptr, int len); +int g_bde_init_keybytes(struct g_bde_softc *sc, char *passp, int len); + +/* g_bde_lock .c */ +void g_bde_encode_lock(struct g_bde_key *gl, u_char *ptr); +void g_bde_decode_lock(struct g_bde_key *gl, u_char *ptr); +u_char g_bde_arc4(struct g_bde_softc *sc); +void g_bde_arc4_seq(struct g_bde_softc *sc, void *ptr, u_int len); +void g_bde_arc4_seed(struct g_bde_softc *sc, void *ptr, u_int len); +int g_bde_keyloc_encrypt(struct g_bde_softc *sc, void *input, void *output); +int g_bde_keyloc_decrypt(struct g_bde_softc *sc, void *input, void *output); +int g_bde_decrypt_lock(struct g_bde_softc *sc, u_char *sbox, u_char *meta, off_t mediasize, u_int sectorsize, u_int *nkey); + +/* g_bde_math .c */ +uint64_t g_bde_max_sector(struct g_bde_key *lp); +void g_bde_map_sector(struct g_bde_key *lp, uint64_t isector, uint64_t *osector, uint64_t *ksector, u_int *koffset); + +/* g_bde_work.c */ +void g_bde_start1(struct bio *bp); +void g_bde_worker(void *arg); + diff --git a/sys/geom/bde/g_bde_crypt.c b/sys/geom/bde/g_bde_crypt.c new file mode 100644 index 000000000000..c649e23d3122 --- /dev/null +++ b/sys/geom/bde/g_bde_crypt.c @@ -0,0 +1,356 @@ +/*- + * Copyright (c) 2002 Poul-Henning Kamp + * Copyright (c) 2002 Networks Associates Technology, Inc. + * All rights reserved. + * + * This software was developed for the FreeBSD Project by Poul-Henning Kamp + * and NAI Labs, the Security Research Division of Network Associates, Inc. + * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the + * DARPA CHATS research program. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The names of the authors may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + * + * This source file contains the functions responsible for the crypto, keying + * and mapping operations on the I/O requests. + * + */ + +#include <sys/param.h> +#include <sys/stdint.h> +#include <sys/bio.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/queue.h> +#include <sys/malloc.h> +#include <sys/libkern.h> +#include <sys/md5.h> + +#include <geom/geom.h> +#include <geom/bde/g_bde.h> + +#include <crypto/rijndael/rijndael.h> + +/* + * These four functions wrap the raw Rijndael functions and make sure we + * explode if something fails which shouldn't. 
+ */ + +static void +AES_init(cipherInstance *ci) +{ + int error; + + error = rijndael_cipherInit(ci, MODE_CBC, NULL); + KASSERT(error > 0, ("rijndael_cipherInit %d", error)); +} + +static void +AES_makekey(keyInstance *ki, int dir, u_int len, void *key) +{ + int error; + + error = rijndael_makeKey(ki, dir, len, key); + KASSERT(error > 0, ("rijndael_makeKey %d", error)); +} + +static void +AES_encrypt(cipherInstance *ci, keyInstance *ki, void *in, void *out, u_int len) +{ + int error; + + error = rijndael_blockEncrypt(ci, ki, in, len * 8, out); + KASSERT(error > 0, ("rijndael_blockEncrypt %d", error)); +} + +static void +AES_decrypt(cipherInstance *ci, keyInstance *ki, void *in, void *out, u_int len) +{ + int error; + + error = rijndael_blockDecrypt(ci, ki, in, len * 8, out); + KASSERT(error > 0, ("rijndael_blockDecrypt %d", error)); +} + +/* + * Derive kkey from mkey + sector offset. + * + * Security objective: Derive a potentially very large number of distinct skeys + * from the comparatively small key material in our mkey, in such a way that + * if one, more or even many of the kkeys are compromised, this does not + * significantly help an attack on other kkeys and in particular does not + * weaken or compromised the mkey. + * + * We do this by cherry-picking characters out of the mkey, feeding these to + * MD5 with the sector offset in the middle and using the MD5 hash as kkey. + * + * The MD5 only acts as a "diode" against brute-force reversal, it offsers no + * protection if the input to MD5 is predictable or insufficiently uncorrelated + * from sector to sector. + * + * The amount of entropy in a sector number is very low, and the amount of + * entropy between two sector numbers is even lower, (only slightly higher than + * one bit), so we rely heavily on the mkey to make the cherry picking non- + * linear and irreversible. 
+ * + * This strong dependency on the mkey is very desirable, but the low amount + * of entropy from the sector number means that the algorithm is vulnerable + * to mkeys which has a lumpy histogram of byte values or little entropy. + * + * If you read this comment in order to find a weak spot or the best way to + * attack GBDE, you have probably come to the right place. Good luck. + */ + +static void +g_bde_kkey(struct g_bde_softc *sc, keyInstance *ki, int dir, off_t sector) +{ + u_int u, v, w, t; + MD5_CTX ct; + u_char buf[16], c; + + MD5Init(&ct); + w = sector /= sc->sectorsize; + v = w % 211; /* A prime slightly smaller than G_BDE_MKEYLEN */ + u = w % 19; /* A small prime */ + for (t = 0; t < G_BDE_SKEYLEN; t++) { + u %= G_BDE_MKEYLEN; + v %= G_BDE_MKEYLEN; + c = sc->key.key[u] ^ sc->key.key[v]; + MD5Update(&ct, &c, 1); + v += c + t; + u += sc->key.key[c]; + if (w & 1) + v += 13; /* A small prime */ + else + u += 131; /* A prime roughly G_BDE_MKEYLEN / 2 */ + if (t == G_BDE_SKEYLEN / 2) + MD5Update(&ct, (void *)&sector, sizeof sector); + } + w = v = u - 0; + MD5Update(&ct, (void *)&sector, sizeof sector); + MD5Final(buf, &ct); + bzero(&ct, sizeof ct); + AES_makekey(ki, dir, G_BDE_KKEYBITS, buf); + bzero(buf, sizeof buf); +} + +/* + * Encryption work for read operation. + * + * Security objective: Find the kkey, find the skey, decrypt the sector data. 
+ */ + +void +g_bde_crypt_read(struct g_bde_work *wp) +{ + struct g_bde_softc *sc; + u_char *d; + u_int n; + off_t o; + u_char skey[G_BDE_SKEYLEN]; + keyInstance ki; + cipherInstance ci; + + + AES_init(&ci); + sc = wp->softc; + o = 0; + for (n = 0; o < wp->length; n++, o += sc->sectorsize) { + d = (u_char *)wp->ksp->data + wp->ko + n * G_BDE_SKEYLEN; + g_bde_kkey(sc, &ki, DIR_DECRYPT, wp->offset + o); + AES_decrypt(&ci, &ki, d, skey, sizeof skey); + d = (u_char *)wp->data + o; + AES_makekey(&ki, DIR_DECRYPT, G_BDE_SKEYBITS, skey); + AES_decrypt(&ci, &ki, d, d, sc->sectorsize); + } + bzero(skey, sizeof skey); + bzero(&ci, sizeof ci); + bzero(&ki, sizeof ci); +} + +/* + * Encryption work for write operation. + * + * Security objective: Create random skey, encrypt sector data, + * encrypt skey with the kkey. + */ + +void +g_bde_crypt_write(struct g_bde_work *wp) +{ + u_char *s, *d; + struct g_bde_softc *sc; + u_int n; + off_t o; + u_char skey[G_BDE_SKEYLEN]; + keyInstance ki; + cipherInstance ci; + + sc = wp->softc; + AES_init(&ci); + o = 0; + for (n = 0; o < wp->length; n++, o += sc->sectorsize) { + + s = (u_char *)wp->data + o; + d = (u_char *)wp->sp->data + o; + arc4rand(&skey, sizeof skey, 0); + AES_makekey(&ki, DIR_ENCRYPT, G_BDE_SKEYBITS, skey); + AES_encrypt(&ci, &ki, s, d, sc->sectorsize); + + d = (u_char *)wp->ksp->data + wp->ko + n * G_BDE_SKEYLEN; + g_bde_kkey(sc, &ki, DIR_ENCRYPT, wp->offset + o); + AES_encrypt(&ci, &ki, skey, d, sizeof skey); + bzero(skey, sizeof skey); + } + bzero(skey, sizeof skey); + bzero(&ci, sizeof ci); + bzero(&ki, sizeof ci); +} + +/* + * Encryption work for delete operation. + * + * Security objective: Write random data to the sectors. + * + * XXX: At a hit in performance we would trash the encrypted skey as well. + * XXX: This would add frustration to the cleaning lady attack by making + * XXX: deletes look like writes. 
+ */ + +void +g_bde_crypt_delete(struct g_bde_work *wp) +{ + struct g_bde_softc *sc; + u_char *d; + off_t o; + + sc = wp->softc; + d = wp->sp->data; + /* + * Do not unroll this loop! + * Our zone may be significantly wider than the amount of random + * bytes arc4rand likes to give in one reseeding, whereas our + * sectorsize is far more likely to be in the same range. + */ + for (o = 0; o < wp->length; o += sc->sectorsize) { + arc4rand(d, sc->sectorsize, 0); + d += sc->sectorsize; + } + /* + * Having written a long random sequence to disk here, we want to + * force a reseed, to avoid weakening the next time we use random + * data for something important. + */ + arc4rand(&o, sizeof o, 1); +} + +/* + * Calculate the total payload size of the encrypted device. + * + * Security objectives: none. + * + * This function needs to agree with g_bde_map_sector() about things. + */ + +uint64_t +g_bde_max_sector(struct g_bde_key *kp) +{ + uint64_t maxsect; + + maxsect = kp->media_width; + maxsect /= kp->zone_width; + maxsect *= kp->zone_cont; + return (maxsect); +} + +/* + * Convert an unencrypted side offset to offsets on the encrypted side. + * + * Security objective: Make it harder to identify what sectors contain what + * on a "cold" disk image. + * + * We do this by adding the "keyoffset" from the lock to the physical sector + * number modulus the available number of sectors, since all physical sectors + * presumably look the same cold, this should be enough. + * + * Shuffling things further is an option, but the incremental frustration is + * not currently deemed worth the run-time performance hit resulting from the + * increased number of disk arm movements it would incur. + * + * This function offers nothing but a trivial diversion for an attacker able + * to do "the cleaning lady attack" in its current static mapping form. 
+ */ + +void +g_bde_map_sector(struct g_bde_key *kp, + uint64_t isector, + uint64_t *osector, + uint64_t *ksector, + u_int *koffset) +{ + + u_int zone, zoff, zidx, u; + uint64_t os; + + /* find which zone and the offset and index in it */ + zone = isector / kp->zone_cont; + zoff = isector % kp->zone_cont; + zidx = zoff / kp->sectorsize; + + /* Find physical sector address */ + os = zone * kp->zone_width + zoff; + os += kp->keyoffset; + os %= kp->media_width - (G_BDE_MAXKEYS * kp->sectorsize); + os += kp->sector0; + + /* Compensate for lock sectors */ + for (u = 0; u < G_BDE_MAXKEYS; u++) + if (os >= kp->lsector[u]) + os += kp->sectorsize; + + *osector = os; + + /* The key sector is the last in this zone. */ + os = (1 + zone) * kp->zone_width - kp->sectorsize; + os += kp->keyoffset; + os %= kp->media_width - (G_BDE_MAXKEYS * kp->sectorsize); + os += kp->sector0; + + for (u = 0; u < G_BDE_MAXKEYS; u++) + if (os >= kp->lsector[u]) + os += kp->sectorsize; + *ksector = os; + + *koffset = zidx * G_BDE_SKEYLEN; + +#if 0 + printf("off %jd %jd %jd %u\n", + (intmax_t)isector, + (intmax_t)*osector, + (intmax_t)*ksector, + *koffset); +#endif +} diff --git a/sys/geom/bde/g_bde_lock.c b/sys/geom/bde/g_bde_lock.c new file mode 100644 index 000000000000..e58683f9712a --- /dev/null +++ b/sys/geom/bde/g_bde_lock.c @@ -0,0 +1,311 @@ +/*- + * Copyright (c) 2002 Poul-Henning Kamp + * Copyright (c) 2002 Networks Associates Technology, Inc. + * All rights reserved. + * + * This software was developed for the FreeBSD Project by Poul-Henning Kamp + * and NAI Labs, the Security Research Division of Network Associates, Inc. + * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the + * DARPA CHATS research program. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The names of the authors may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + * + * This souce file contains routines which operates on the lock sectors, both + * for the kernel and the userland program gbde(1). + * + */ + +#include <sys/param.h> +#include <sys/queue.h> +#include <sys/stdint.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/md5.h> + +#ifdef _KERNEL +#include <sys/malloc.h> +#include <sys/systm.h> +#else +#include <errno.h> +#include <string.h> +#include <stdlib.h> +#include <stdio.h> +#define g_free(foo) free(foo) +#endif + +#include <geom/geom.h> +#include <geom/bde/g_bde.h> + +#include <crypto/rijndael/rijndael.h> + +/* + * Encode/Decode the lock structure in byte-sequence format. + * + * Security objectives: none. 
+ * + * C-structure packing and byte-endianess depends on architecture, compiler + * and compiler options. We therefore explicitly encode and decode struct + * g_bde_key using an invariant byte-sequence format. + * + */ + +void +g_bde_encode_lock(struct g_bde_key *gl, u_char *ptr) +{ + + bcopy(gl->hash, ptr + 0, sizeof gl->hash); + g_enc_le8(ptr + 16, gl->sector0); + g_enc_le8(ptr + 24, gl->sectorN); + g_enc_le8(ptr + 32, gl->keyoffset); + g_enc_le4(ptr + 40, gl->sectorsize); + g_enc_le4(ptr + 44, gl->flags); + g_enc_le8(ptr + 48, gl->lsector[0]); + g_enc_le8(ptr + 56, gl->lsector[1]); + g_enc_le8(ptr + 64, gl->lsector[2]); + g_enc_le8(ptr + 72, gl->lsector[3]); + bcopy(gl->spare, ptr + 80, sizeof gl->spare); + bcopy(gl->key, ptr + 128, sizeof gl->key); +} + +void +g_bde_decode_lock(struct g_bde_key *gl, u_char *ptr) +{ + bcopy(ptr + 0, gl->hash, sizeof gl->hash); + gl->sector0 = g_dec_le8(ptr + 16); + gl->sectorN = g_dec_le8(ptr + 24); + gl->keyoffset = g_dec_le8(ptr + 32); + gl->sectorsize = g_dec_le4(ptr + 40); + gl->flags = g_dec_le4(ptr + 44); + gl->lsector[0] = g_dec_le8(ptr + 48); + gl->lsector[1] = g_dec_le8(ptr + 56); + gl->lsector[2] = g_dec_le8(ptr + 64); + gl->lsector[3] = g_dec_le8(ptr + 72); + bcopy(ptr + 80, gl->spare, sizeof gl->spare); + bcopy(ptr + 128, gl->key, sizeof gl->key); +} + +/* + * Generate key-material used for protecting lock sectors. + * + * Security objectives: from the pass-phrase provide by the user, produce a + * reproducible stream of bits/bytes which resemeble pseudo-random bits. + * + * This is the stream-cipher algorithm called ARC4. See for instance the + * description in "Applied Cryptography" by Bruce Scneier. 
+ */ + +u_char +g_bde_arc4(struct g_bde_softc *sc) +{ + u_char c; + + sc->arc4_j += sc->arc4_sbox[++sc->arc4_i]; + c = sc->arc4_sbox[sc->arc4_i]; + sc->arc4_sbox[sc->arc4_i] = sc->arc4_sbox[sc->arc4_j]; + sc->arc4_sbox[sc->arc4_j] = c; + c = sc->arc4_sbox[sc->arc4_i] + sc->arc4_sbox[sc->arc4_j]; + c = sc->arc4_sbox[c]; + return (c); +} + +void +g_bde_arc4_seq(struct g_bde_softc *sc, void *ptr, u_int len) +{ + u_char *p; + + p = ptr; + while (len--) + *p++ = g_bde_arc4(sc); +} + +void +g_bde_arc4_seed(struct g_bde_softc *sc, void *ptr, u_int len) +{ + u_char k[256], *p, c; + u_int i; + + p = ptr; + sc->arc4_i = 0; + bzero(k, sizeof k); + while(len--) + k[sc->arc4_i++] ^= *p++; + + sc->arc4_j = 0; + for (i = 0; i < 256; i++) + sc->arc4_sbox[i] = i; + for (i = 0; i < 256; i++) { + sc->arc4_j += sc->arc4_sbox[i] + k[i]; + c = sc->arc4_sbox[i]; + sc->arc4_sbox[i] = sc->arc4_sbox[sc->arc4_j]; + sc->arc4_sbox[sc->arc4_j] = c; + } + sc->arc4_i = 0; + sc->arc4_j = 0; +} + +/* + * Encrypt/Decrypt the metadata address with key-material. 
+ */ + +int +g_bde_keyloc_encrypt(struct g_bde_softc *sc, void *input, void *output) +{ + u_char *p; + u_char buf[16], buf1[16]; + u_int i; + keyInstance ki; + cipherInstance ci; + + rijndael_cipherInit(&ci, MODE_CBC, NULL); + p = input; + g_bde_arc4_seq(sc, buf, sizeof buf); + for (i = 0; i < sizeof buf; i++) + buf1[i] = p[i] ^ buf[i]; + g_bde_arc4_seq(sc, buf, sizeof buf); + rijndael_makeKey(&ki, DIR_ENCRYPT, G_BDE_KKEYBITS, buf); + rijndael_blockEncrypt(&ci, &ki, buf1, 16 * 8, output); + bzero(&ci, sizeof ci); + return (0); +} + +int +g_bde_keyloc_decrypt(struct g_bde_softc *sc, void *input, void *output) +{ + u_char *p; + u_char buf1[16], buf2[16]; + u_int i; + keyInstance ki; + cipherInstance ci; + + rijndael_cipherInit(&ci, MODE_CBC, NULL); + g_bde_arc4_seq(sc, buf1, sizeof buf1); + g_bde_arc4_seq(sc, buf2, sizeof buf2); + rijndael_makeKey(&ki, DIR_DECRYPT, G_BDE_KKEYBITS, buf2); + rijndael_blockDecrypt(&ci, &ki, input, 16 * 8, output); + p = output; + for (i = 0; i < sizeof buf1; i++) + p[i] ^= buf1[i]; + bzero(&ci, sizeof ci); + return (0); +} + +/* + * Encode/Decode lock sectors. 
 */

/*
 * Find and decrypt the lock sector.
 *
 * "meta" holds the encrypted lock-sector location, "sbox" the arc4 S-box
 * derived from the pass-phrase.  On success the decoded lock data is left
 * in sc->key and, if "nkey" is non-NULL and the location matches one of
 * the recorded lock sectors, *nkey is set to that key index.
 *
 * Returns 0 on success, ESRCH when no plausible lock is found (typically
 * a wrong pass-phrase), ENOENT when the lock decodes but holds an
 * all-zero (destroyed) master key, or the error from g_read_data().
 */
int
g_bde_decrypt_lock(struct g_bde_softc *sc, u_char *sbox, u_char *meta, off_t mediasize, u_int sectorsize, u_int *nkey)
{
	u_char *buf, k1buf[16], k2buf[G_BDE_LOCKSIZE], k3buf[16], *q;
	struct g_bde_key *gl;
	uint64_t off[2];
	int error, m, i;
	MD5_CTX c;
	keyInstance ki;
	cipherInstance ci;

	rijndael_cipherInit(&ci, MODE_CBC, NULL);
	/* (Re)seed the arc4 state from the pass-phrase derived S-box. */
	bcopy(sbox, sc->arc4_sbox, 256);
	sc->arc4_i = 0;
	sc->arc4_j = 0;
	gl = &sc->key;
	/* Recover the byte offset of the lock sector into off[0]. */
	error = g_bde_keyloc_decrypt(sc, meta, off);
	if (error)
		return(error);

	/* A location past the media means the pass-phrase was wrong. */
	if (off[0] + G_BDE_LOCKSIZE > (uint64_t)mediasize) {
		bzero(off, sizeof off);
		return (ESRCH);
	}
	off[1] = 0;
	/* The lock may straddle a sector boundary; read one or two sectors. */
	m = 1;
	if (off[0] % sectorsize > sectorsize - G_BDE_LOCKSIZE)
		m++;
	buf = g_read_data(sc->consumer,
		off[0] - (off[0] % sectorsize),
		m * sectorsize, &error);
	if (buf == NULL) {
		off[0] = 0;
		return(error);
	}

	q = buf + off[0] % sectorsize;

	/* off[1] doubles as a cheap non-zero checksum accumulator here. */
	off[1] = 0;
	for (i = 0; i < (int)sizeof(*gl); i++)
		off[1] += q[i];

	/* An all-zero candidate cannot be a valid lock sector. */
	if (off[1] == 0) {
		off[0] = 0;
		g_free(buf);
		return (ESRCH);
	}

	/*
	 * Pull three keystream chunks in fixed order: k1 = hash seed for
	 * the verification field, k2 = whitening mask, k3 = AES key.
	 * The order must match the encoding side exactly.
	 */
	g_bde_arc4_seq(sc, k1buf, sizeof k1buf);
	g_bde_arc4_seq(sc, k2buf, sizeof k2buf);
	g_bde_arc4_seq(sc, k3buf, sizeof k3buf);

	MD5Init(&c);
	MD5Update(&c, "0000", 4);	/* XXX: for future versioning */
	MD5Update(&c, k1buf, 16);
	MD5Final(k1buf, &c);

	rijndael_makeKey(&ki, DIR_DECRYPT, 128, k3buf);
	bzero(k3buf, sizeof k3buf);
	rijndael_blockDecrypt(&ci, &ki, q, G_BDE_LOCKSIZE * 8, q);

	/* Strip the whitening mask in place. */
	for (i = 0; i < G_BDE_LOCKSIZE; i++)
		q[i] ^= k2buf[i];
	bzero(k2buf, sizeof k2buf);

	/* The first 16 bytes must equal MD5("0000" || k1). */
	if (bcmp(q, k1buf, sizeof k1buf)) {
		bzero(k1buf, sizeof k1buf);
		bzero(buf, sectorsize * m);
		g_free(buf);
		off[0] = 0;
		return (ESRCH);
	}
	bzero(k1buf, sizeof k1buf);

	g_bde_decode_lock(gl, q);
	bzero(buf, sectorsize * m);
	g_free(buf);

	/* A destroyed (all-zero) master key means "lock found but nuked". */
	off[1] = 0;
	for (i = 0; i < (int)sizeof(gl->key); i++)
		off[1] += gl->key[i];

	if (off[1] == 0) {
		off[0] = 0;
		return (ENOENT);
	}
	/* Report which of the G_BDE_MAXKEYS lock copies we found. */
	for (i = 0; i < G_BDE_MAXKEYS; i++)
		if (nkey != NULL && off[0] == gl->lsector[i])
			*nkey = i;

	return (0);
}

/*-
 * Copyright (c) 2002 Poul-Henning Kamp
 * Copyright (c) 2002 Networks Associates Technology, Inc.
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Poul-Henning Kamp
 * and NAI Labs, the Security Research Division of Network Associates, Inc.
 * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the
 * DARPA CHATS research program.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The names of the authors may not be used to endorse or promote
 *    products derived from this software without specific prior written
 *    permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + * + * This source file contains the state-engine which makes things happen in the + * right order. + * + * Outline: + * 1) g_bde_start1() + * Break the struct bio into multiple work packets one per zone. + * 2) g_bde_start2() + * Setup the necessary sector buffers and start those read operations + * which we can start at this time and put the item on the work-list. + * 3) g_bde_worker() + * Scan the work-list for items which are ready for crypto processing + * and call the matching crypto function in g_bde_crypt.c and schedule + * any writes needed. Read operations finish here by releasing the + * sector buffers and delivering the original bio request. + * 4) g_bde_write_done() + * Release sector buffers and deliver the original bio request. + * + * Because of the C-scope rules, the functions are almost perfectly in the + * opposite order in this source file. + * + * XXX: A switch to the hardware assisted crypto in src/sys/opencrypto will add + * XXX: additional states to this state-engine. Since no hardware available + * XXX: at this time has AES support, implementing this has been postponed + * XXX: until such time as it would result in a benefit. 
+ */ + +#include <sys/param.h> +#include <sys/stdint.h> +#include <sys/bio.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/queue.h> +#include <sys/malloc.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/sysctl.h> +#include <sys/proc.h> +#include <sys/kthread.h> + +#include <geom/geom.h> +#include <geom/bde/g_bde.h> + +static void g_bde_delete_sector(struct g_bde_softc *wp, struct g_bde_sector *sp); +static struct g_bde_sector * g_bde_new_sector(struct g_bde_work *wp, u_int len); +static void g_bde_release_sector(struct g_bde_work *wp, struct g_bde_sector *sp); +static struct g_bde_sector *g_bde_get_sector(struct g_bde_work *wp, off_t offset); +static int g_bde_start_read(struct g_bde_sector *sp); + +/* + * Work item allocation. + * + * C++ would call these constructors and destructors. + */ +static u_int g_bde_nwork; +SYSCTL_UINT(_debug, OID_AUTO, gbde_nwork, CTLFLAG_RD, &g_bde_nwork, 0, ""); + +static struct g_bde_work * +g_bde_new_work(struct g_bde_softc *sc) +{ + struct g_bde_work *wp; + + wp = g_malloc(sizeof *wp, M_NOWAIT | M_ZERO); + if (wp == NULL) + return (wp); + wp->state = SETUP; + wp->softc = sc; + g_bde_nwork++; + sc->nwork++; + TAILQ_INSERT_TAIL(&sc->worklist, wp, list); + return (wp); +} + +static void +g_bde_delete_work(struct g_bde_work *wp) +{ + struct g_bde_softc *sc; + + sc = wp->softc; + g_bde_nwork--; + sc->nwork--; + TAILQ_REMOVE(&sc->worklist, wp, list); + g_free(wp); +} + +/* + * Sector buffer allocation + * + * These two functions allocate and free back variable sized sector buffers + */ + +static u_int g_bde_nsect; +SYSCTL_UINT(_debug, OID_AUTO, gbde_nsect, CTLFLAG_RD, &g_bde_nsect, 0, ""); + +void +g_bde_delete_sector(struct g_bde_softc *sc, struct g_bde_sector *sp) +{ + + g_bde_nsect--; + sc->nsect--; + if (sp->malloc) + g_free(sp->data); + g_free(sp); +} + +struct g_bde_sector * +g_bde_new_sector(struct g_bde_work *wp, u_int len) +{ + struct g_bde_sector *sp; + + sp = g_malloc(sizeof *sp, M_NOWAIT | 
M_ZERO); + if (sp == NULL) + return (sp); + if (len > 0) { + sp->data = g_malloc(len, M_NOWAIT | M_ZERO); + if (sp->data == NULL) { + g_free(sp); + return (NULL); + } + sp->malloc = 1; + } + g_bde_nsect++; + wp->softc->nsect++; + sp->size = len; + sp->softc = wp->softc; + sp->ref = 1; + sp->owner = wp; + sp->offset = wp->so; + sp->state = JUNK; + return (sp); +} + +/* + * Skey sector cache. + * + * Nothing prevents two separate I/O requests from addressing the same zone + * and thereby needing the same skey sector. We therefore need to sequence + * I/O operations to the skey sectors. A certain amount of caching is also + * desirable, although the extent of benefit from this is not at this point + * determined. + * + * XXX: GEOM may be able to grow a generic caching facility at some point + * XXX: to support such needs. + */ + +static u_int g_bde_ncache; +SYSCTL_UINT(_debug, OID_AUTO, gbde_ncache, CTLFLAG_RD, &g_bde_ncache, 0, ""); + +static struct g_bde_sector * +g_bde_get_sector(struct g_bde_work *wp, off_t offset) +{ + struct g_bde_sector *sp; + struct g_bde_softc *sc; + + g_trace(G_T_TOPOLOGY, "g_bde_get_sector(%p, %jd)", wp, (intmax_t)offset); + sc = wp->softc; + TAILQ_FOREACH(sp, &sc->freelist, list) { + if (sp->offset == offset) + break; + } + if (sp != NULL) { + sp->ref++; + KASSERT(sp->offset == offset, ("wrong offset")); + KASSERT(sp->softc == wp->softc, ("wrong softc")); + if (sp->ref == 1) + sp->owner = wp; + } else { + if (!TAILQ_EMPTY(&sc->freelist)) + sp = TAILQ_FIRST(&sc->freelist); + if (sp != NULL && sp->ref > 0) + sp = NULL; + if (sp == NULL) { + g_bde_ncache++; + sc->ncache++; + sp = g_bde_new_sector(wp, sc->sectorsize); + if (sp != NULL) { + TAILQ_INSERT_TAIL(&sc->freelist, sp, list); + sp->malloc = 2; + } + } + if (sp != NULL) { + sp->offset = offset; + sp->softc = wp->softc; + sp->ref = 1; + sp->owner = wp; + sp->state = JUNK; + sp->error = 0; + } + } + if (sp != NULL) { + TAILQ_REMOVE(&sc->freelist, sp, list); + 
TAILQ_INSERT_TAIL(&sc->freelist, sp, list); + } + wp->ksp = sp; + KASSERT(sp != NULL, ("get_sector failed")); + return(sp); +} + +static void +g_bde_release_sector(struct g_bde_work *wp, struct g_bde_sector *sp) +{ + struct g_bde_softc *sc; + struct g_bde_work *wp2; + + g_trace(G_T_TOPOLOGY, "g_bde_release_sector(%p)", sp); + KASSERT(sp->malloc == 2, ("Wrong sector released")); + sc = sp->softc; + KASSERT(sc != NULL, ("NULL sp->softc")); + KASSERT(wp == sp->owner, ("Releasing, not owner")); + sp->owner = NULL; + wp->ksp = NULL; + sp->ref--; + if (sp->ref > 0) { + TAILQ_REMOVE(&sc->freelist, sp, list); + TAILQ_INSERT_TAIL(&sc->freelist, sp, list); + TAILQ_FOREACH(wp2, &sc->worklist, list) { + if (wp2->ksp == sp) { + KASSERT(wp2 != wp, ("Self-reowning")); + sp->owner = wp2; + wakeup(sp->softc); + break; + } + } + KASSERT(wp2 != NULL, ("Failed to pick up owner for %p\n", sp)); + } else if (sp->error != 0) { + sp->offset = ~0; + sp->error = 0; + sp->state = JUNK; + } + TAILQ_REMOVE(&sc->freelist, sp, list); + TAILQ_INSERT_HEAD(&sc->freelist, sp, list); +} + +static void +g_bde_purge_sector(struct g_bde_softc *sc, int fraction) +{ + struct g_bde_sector *sp; + int n; + + g_trace(G_T_TOPOLOGY, "g_bde_purge_sector(%p)", sc); + n = sc->ncache / fraction + 1; + while(n--) { + TAILQ_FOREACH(sp, &sc->freelist, list) { + if (sp->ref != 0) + continue; + TAILQ_REMOVE(&sc->freelist, sp, list); + g_bde_ncache--; + sc->ncache--; + bzero(sp->data, sp->size); + g_bde_delete_sector(sc, sp); + break; + } + } +} + +static struct g_bde_sector * +g_bde_read_sector(struct g_bde_softc *sc, struct g_bde_work *wp, off_t offset) +{ + struct g_bde_sector *sp; + + g_trace(G_T_TOPOLOGY, "g_bde_read_sector(%p)", wp); + sp = g_bde_get_sector(wp, offset); + if (sp == NULL) + return (sp); + if (sp->owner != wp) + return (sp); + if (sp->state == VALID) + return (sp); + if (g_bde_start_read(sp) == 0) + return (sp); + g_bde_release_sector(wp, sp); + return (NULL); +} + +/* + * Contribute to the 
completion of the original bio request. + * + * We have no simple way to tell how many bits the original bio request has + * been segmented into, so the easiest way to determine when we can deliver + * it is to keep track of the number of bytes we have completed. We keep + * track of any errors underway and latch onto the first one. + * + * We always report "nothing done" in case of error, because random bits here + * and there may be completed and returning a number of completed bytes does + * not convey any useful information about which bytes they were. If some + * piece of broken code somewhere interprets this to mean that nothing has + * changed on the underlying media they deserve the lossage headed for them. + * + * A single mutex per g_bde instance is used to prevent contention. + */ + +static void +g_bde_contribute(struct bio *bp, off_t bytes, int error) +{ + struct g_bde_softc *sc; + + g_trace(G_T_TOPOLOGY, "g_bde_contribute bp %p bytes %jd error %d", + bp, (intmax_t)bytes, error); + sc = bp->bio_driver1; + if (bp->bio_error == 0) + bp->bio_error = error; + bp->bio_completed += bytes; + KASSERT(bp->bio_completed <= bp->bio_length, ("Too large contribution")); + if (bp->bio_completed == bp->bio_length) { + if (bp->bio_error != 0) + bp->bio_completed = 0; + g_io_deliver(bp, bp->bio_error); + } +} + +/* + * A write operation has finished. When we have all expected cows in the + * barn close the door and call it a day. 
 */

/*
 * Bio completion callback for writes issued by g_bde_start_write().
 * For a BIO_DELETE work item one write (the data sector) finishes it.
 * For a BIO_WRITE two writes are in flight (data sector and skey
 * sector); the work item is finished only when both have completed,
 * which is detected by wp->sp having been deleted and wp->ksp being
 * VALID.  Runs under the softc worklist mutex.
 */
static void
g_bde_write_done(struct bio *bp)
{
	struct g_bde_sector *sp;
	struct g_bde_work *wp;
	struct g_bde_softc *sc;

	sp = bp->bio_caller1;
	sc = bp->bio_caller2;
	mtx_lock(&sc->worklist_mutex);
	KASSERT(sp != NULL, ("NULL sp"));
	KASSERT(sc != NULL, ("NULL sc"));
	KASSERT(sp->owner != NULL, ("NULL sp->owner"));
	g_trace(G_T_TOPOLOGY, "g_bde_write_done(%p)", sp);
	sp->error = bp->bio_error;
	g_destroy_bio(bp);
	wp = sp->owner;
	/* Latch the first error onto the work item. */
	if (wp->error == 0)
		wp->error = sp->error;

	if (wp->bp->bio_cmd == BIO_DELETE) {
		KASSERT(sp == wp->sp, ("trashed delete op"));
		g_bde_contribute(wp->bp, wp->length, wp->error);
		g_bde_delete_sector(sc, sp);
		g_bde_delete_work(wp);
		mtx_unlock(&sc->worklist_mutex);
		return;
	}

	KASSERT(wp->bp->bio_cmd == BIO_WRITE, ("Confused in g_bde_write_done()"));
	KASSERT(sp == wp->sp || sp == wp->ksp, ("trashed write op"));
	if (wp->sp == sp) {
		/* The data sector write finished; drop it. */
		g_bde_delete_sector(sc, wp->sp);
		wp->sp = NULL;
	} else {
		/* The skey sector write finished; it is now up to date. */
		sp->state = VALID;
	}
	/* Both writes done: finish the work item. */
	if (wp->sp == NULL && wp->ksp != NULL && wp->ksp->state == VALID) {
		g_bde_contribute(wp->bp, wp->length, wp->error);
		g_bde_release_sector(wp, wp->ksp);
		g_bde_delete_work(wp);
	}
	mtx_unlock(&sc->worklist_mutex);
	return;
}

/*
 * Send a write request for the given sector down the pipeline.
 */

/*
 * Issue a BIO_WRITE for "sp" to the backing consumer and mark the
 * sector busy (IO).  Returns ENOMEM if no bio could be allocated.
 */
static int
g_bde_start_write(struct g_bde_sector *sp)
{
	struct bio *bp;
	struct g_bde_softc *sc;

	g_trace(G_T_TOPOLOGY, "g_bde_start_write(%p)", sp);
	sc = sp->softc;
	KASSERT(sc != NULL, ("NULL sc in g_bde_start_write"));
	KASSERT(sp->owner != NULL, ("NULL sp->owner in g_bde_start_write"));
	bp = g_new_bio();
	if (bp == NULL)
		return (ENOMEM);
	bp->bio_cmd = BIO_WRITE;
	bp->bio_offset = sp->offset;
	bp->bio_data = sp->data;
	bp->bio_length = sp->size;
	bp->bio_done = g_bde_write_done;
	bp->bio_caller1 = sp;
	bp->bio_caller2 = sc;
	sp->state = IO;
	g_io_request(bp, sc->consumer);
	return(0);
}

/*
 * A read operation has finished.
Mark the sector no longer iobusy and + * wake up the worker thread and let it do its thing. + */ + +static void +g_bde_read_done(struct bio *bp) +{ + struct g_bde_sector *sp; + struct g_bde_softc *sc; + + sp = bp->bio_caller1; + g_trace(G_T_TOPOLOGY, "g_bde_read_done(%p)", sp); + sc = bp->bio_caller2; + mtx_lock(&sc->worklist_mutex); + sp->error = bp->bio_error; + sp->state = VALID; + wakeup(sc); + g_destroy_bio(bp); + mtx_unlock(&sc->worklist_mutex); +} + +/* + * Send a read request for the given sector down the pipeline. + */ + +static int +g_bde_start_read(struct g_bde_sector *sp) +{ + struct bio *bp; + struct g_bde_softc *sc; + + g_trace(G_T_TOPOLOGY, "g_bde_start_read(%p)", sp); + sc = sp->softc; + KASSERT(sc != NULL, ("Null softc in sp %p", sp)); + bp = g_new_bio(); + if (bp == NULL) + return (ENOMEM); + bp->bio_cmd = BIO_READ; + bp->bio_offset = sp->offset; + bp->bio_data = sp->data; + bp->bio_length = sp->size; + bp->bio_done = g_bde_read_done; + bp->bio_caller1 = sp; + bp->bio_caller2 = sc; + sp->state = IO; + g_io_request(bp, sc->consumer); + return(0); +} + +/* + * The worker thread. + * + * The up/down path of GEOM is not allowed to sleep or do any major work + * so we use this thread to do the actual crypto operations and to push + * the state engine onwards. + * + * XXX: if we switch to the src/sys/opencrypt hardware assisted encryption + * XXX: using a thread here is probably not needed. 
 */

/*
 * Main loop of the per-instance worker kthread ("arg" is the g_geom).
 * Repeatedly scans the worklist for items in WAIT state whose sectors
 * are ready, performs the crypto step (with the mutex dropped), and
 * either completes the item (reads) or launches the writes.  Sleeps on
 * the softc when idle; a timeout gradually drains the skey cache.  Exits
 * when sc->dead is set, after asserting all resources are gone.
 */
void
g_bde_worker(void *arg)
{
	struct g_bde_softc *sc;
	struct g_bde_work *wp;
	struct g_geom *gp;
	int busy, error;

	gp = arg;
	sc = gp->softc;

	mtx_lock(&sc->worklist_mutex);
	for (;;) {
		busy = 0;
		g_trace(G_T_TOPOLOGY, "g_bde_worker scan");
		TAILQ_FOREACH(wp, &sc->worklist, list) {
			KASSERT(wp != NULL, ("NULL wp"));
			KASSERT(wp->softc != NULL, ("NULL wp->softc"));
			if (wp->state != WAIT)
				continue;	/* Not interesting here */

			KASSERT(wp->bp != NULL, ("NULL wp->bp"));
			KASSERT(wp->sp != NULL, ("NULL wp->sp"));

			/* The skey sector must be ours and settled. */
			if (wp->ksp != NULL) {
				if (wp->ksp->owner != wp)
					continue;
				if (wp->ksp->state == IO)
					continue;
				KASSERT(wp->ksp->state == VALID,
				    ("Illegal sector state (JUNK ?)"));
			}

			/* Reads also need the data sector in hand. */
			if (wp->bp->bio_cmd == BIO_READ && wp->sp->state != VALID)
				continue;

			/* A failed skey read dooms the whole fragment. */
			if (wp->ksp != NULL && wp->ksp->error != 0) {
				g_bde_contribute(wp->bp, wp->length,
				    wp->ksp->error);
				g_bde_delete_sector(sc, wp->sp);
				g_bde_release_sector(wp, wp->ksp);
				g_bde_delete_work(wp);
				busy++;
				break;
			}
			switch(wp->bp->bio_cmd) {
			case BIO_READ:
				/* Decrypt (mutex dropped), then finish. */
				if (wp->ksp != NULL && wp->sp->error == 0) {
					mtx_unlock(&sc->worklist_mutex);
					g_bde_crypt_read(wp);
					mtx_lock(&sc->worklist_mutex);
				}
				g_bde_contribute(wp->bp, wp->length,
				    wp->sp->error);
				g_bde_delete_sector(sc, wp->sp);
				if (wp->ksp != NULL)
					g_bde_release_sector(wp, wp->ksp);
				g_bde_delete_work(wp);
				break;
			case BIO_WRITE:
				/* Encrypt, then write data + skey sectors. */
				wp->state = FINISH;
				KASSERT(wp->sp->owner == wp, ("Write not owner sp"));
				KASSERT(wp->ksp->owner == wp, ("Write not owner ksp"));
				mtx_unlock(&sc->worklist_mutex);
				g_bde_crypt_write(wp);
				mtx_lock(&sc->worklist_mutex);
				g_bde_start_write(wp->sp);
				g_bde_start_write(wp->ksp);
				break;
			case BIO_DELETE:
				/* Overwrite the range with random junk. */
				wp->state = FINISH;
				mtx_unlock(&sc->worklist_mutex);
				g_bde_crypt_delete(wp);
				mtx_lock(&sc->worklist_mutex);
				g_bde_start_write(wp->sp);
				break;
			}
			/* The list may have changed: restart the scan. */
			busy++;
			break;
		}
		if (!busy) {
			/*
			 * We don't look for our death-warrant until we are
			 * idle.  Shouldn't make a difference in practice.
			 */
			if (sc->dead)
				break;
			g_trace(G_T_TOPOLOGY, "g_bde_worker sleep");
			error = msleep(sc, &sc->worklist_mutex,
			    PRIBIO, "g_bde", hz);
			if (error == EWOULDBLOCK) {
				/*
				 * Lose our skey cache in an orderly fashion.
				 * The exact rate can be tuned to be less
				 * aggressive if this is desirable.  10% per
				 * second means that the cache is gone in a
				 * few minutes.
				 */
				g_bde_purge_sector(sc, 10);
			}
		}
	}
	g_trace(G_T_TOPOLOGY, "g_bde_worker die");
	g_bde_purge_sector(sc, 1);
	KASSERT(sc->nwork == 0, ("Dead but %d work remaining", sc->nwork));
	KASSERT(sc->ncache == 0, ("Dead but %d cache remaining", sc->ncache));
	KASSERT(sc->nsect == 0, ("Dead but %d sect remaining", sc->nsect));
	mtx_unlock(&sc->worklist_mutex);
	/* Signal g_bde destroy path that we are gone, then exit. */
	sc->dead = 2;
	wakeup(sc);
	mtx_lock(&Giant);
	kthread_exit(0);
}

/*
 * g_bde_start1 has chopped the incoming request up so all the requests
 * we see here are inside a single zone.  Map the data and key locations
 * grab the buffers we need and fire off the first volley of read requests.
 */

/*
 * Second stage of request start-up: "wp" covers exactly one zone.
 * Maps the fragment to its on-media data/skey locations, allocates the
 * needed sector buffers per command, kicks off the reads that can start
 * now, and moves the item to WAIT state for the worker.  On allocation
 * failure the fragment's bytes are contributed with ENOMEM and the work
 * item is torn down.  Called with the worklist mutex held.
 */
static void
g_bde_start2(struct g_bde_work *wp)
{
	struct g_bde_softc *sc;

	KASSERT(wp != NULL, ("NULL wp in g_bde_start2"));
	g_trace(G_T_TOPOLOGY, "g_bde_start2(%p)", wp);
	sc = wp->softc;
	KASSERT(wp->softc != NULL, ("NULL wp->softc"));
	g_bde_map_sector(&sc->key, wp->offset, &wp->so, &wp->kso, &wp->ko);
	if (wp->bp->bio_cmd == BIO_READ) {
		/* Read straight into the caller's buffer (len 0 = borrow). */
		wp->sp = g_bde_new_sector(wp, 0);
		if (wp->sp == NULL) {
			g_bde_contribute(wp->bp, wp->length, ENOMEM);
			g_bde_delete_work(wp);
			return;
		}
		wp->sp->size = wp->length;
		wp->sp->data = wp->data;
		if (g_bde_start_read(wp->sp) != 0) {
			g_bde_contribute(wp->bp, wp->length, ENOMEM);
			g_bde_delete_sector(sc, wp->sp);
			g_bde_delete_work(wp);
			return;
		}
		g_bde_read_sector(sc, wp, wp->kso);
		/*
		 * NOTE(review): wp->error is set here, but the BIO_READ
		 * path in g_bde_worker() contributes wp->sp->error, not
		 * wp->error — verify a failed skey-sector read cannot be
		 * reported as success.
		 */
		if (wp->ksp == NULL)
			wp->error = ENOMEM;
	} else if (wp->bp->bio_cmd == BIO_DELETE) {
		/* Deletes need only a scratch data sector. */
		wp->sp = g_bde_new_sector(wp, wp->length);
		if (wp->sp == NULL) {
			g_bde_contribute(wp->bp, wp->length, ENOMEM);
			g_bde_delete_work(wp);
			return;
		}
	} else if (wp->bp->bio_cmd == BIO_WRITE) {
		/* Writes need a scratch data sector and the skey sector. */
		wp->sp = g_bde_new_sector(wp, wp->length);
		if (wp->sp == NULL) {
			g_bde_contribute(wp->bp, wp->length, ENOMEM);
			g_bde_delete_work(wp);
			return;
		}
		g_bde_read_sector(sc, wp, wp->kso);
		if (wp->ksp == NULL) {
			g_bde_contribute(wp->bp, wp->length, ENOMEM);
			g_bde_delete_sector(sc, wp->sp);
			g_bde_delete_work(wp);
			return;
		}
	} else {
		KASSERT(0 == 1,
		    ("Wrong bio_cmd %d in g_bde_start2", wp->bp->bio_cmd));
	}

	wp->state = WAIT;
	wakeup(sc);
}

/*
 * Split the incoming bio on zone boundaries and submit the resulting
 * work structures to g_bde_start2().
+ */ + +void +g_bde_start1(struct bio *bp) +{ + struct g_bde_softc *sc; + struct g_bde_work *wp; + off_t zone_start, left; + caddr_t p; + + sc = bp->bio_to->geom->softc; + bp->bio_driver1 = sc; + + mtx_lock(&sc->worklist_mutex); + zone_start = bp->bio_offset - bp->bio_offset % sc->zone_cont; + wp = g_bde_new_work(sc); + if (wp == NULL) { + g_io_deliver(bp, ENOMEM); + mtx_unlock(&sc->worklist_mutex); + return; + } + left = bp->bio_length; + p = bp->bio_data; + + /* Do the first and possible only fragment */ + wp->bp = bp; + wp->offset = bp->bio_offset; + wp->data = p; + wp->length = zone_start + sc->zone_cont - wp->offset; + if (wp->length >= left) { + /* Only this one fragment needed */ + wp->length = left; + g_bde_start2(wp); + mtx_unlock(&sc->worklist_mutex); + return; + } + + /* Submit the first fragment */ + g_bde_start2(wp); + left -= wp->length; + p += wp->length; + + /* Do the subsequent fragments */ + for(;left > 0;) { + wp = g_bde_new_work(sc); + if (wp == NULL) { + g_bde_contribute(bp, left, ENOMEM); + mtx_unlock(&sc->worklist_mutex); + return; + } + zone_start += sc->zone_cont; + wp->bp = bp; + wp->offset = zone_start; + wp->data = p; + if (left > sc->zone_cont) + wp->length = sc->zone_cont; + else + wp->length = left; + left -= wp->length; + p += wp->length; + g_bde_start2(wp); + } + mtx_unlock(&sc->worklist_mutex); +} |
