diff options
Diffstat (limited to 'contrib/awk/array.c')
-rw-r--r-- | contrib/awk/array.c | 526 |
1 files changed, 526 insertions, 0 deletions
diff --git a/contrib/awk/array.c b/contrib/awk/array.c new file mode 100644 index 0000000000000..b178cd2aafec5 --- /dev/null +++ b/contrib/awk/array.c @@ -0,0 +1,526 @@ +/* + * array.c - routines for associative arrays. + */ + +/* + * Copyright (C) 1986, 1988, 1989, 1991 - 97 the Free Software Foundation, Inc. + * + * This file is part of GAWK, the GNU implementation of the + * AWK Programming Language. + * + * GAWK is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * GAWK is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA + */ + +/* + * Tree walks (``for (iggy in foo)'') and array deletions use expensive + * linear searching. So what we do is start out with small arrays and + * grow them as needed, so that our arrays are hopefully small enough, + * most of the time, that they're pretty full and we're not looking at + * wasted space. + * + * The decision is made to grow the array if the average chain length is + * ``too big''. This is defined as the total number of entries in the table + * divided by the size of the array being greater than some constant. + */ + +#define AVG_CHAIN_MAX 10 /* don't want to linear search more than this */ + +#include "awk.h" + +static NODE *assoc_find P((NODE *symbol, NODE *subs, int hash1)); +static void grow_table P((NODE *symbol)); + +/* concat_exp --- concatenate expression list into a single string */ + +NODE * +concat_exp(tree) +register NODE *tree; +{ + register NODE *r; + char *str; + char *s; + size_t len; + int offset; + size_t subseplen; + char *subsep; + + if (tree->type != Node_expression_list) + return force_string(tree_eval(tree)); + r = force_string(tree_eval(tree->lnode)); + if (tree->rnode == NULL) + return r; + subseplen = SUBSEP_node->lnode->stlen; + subsep = SUBSEP_node->lnode->stptr; + len = r->stlen + subseplen + 2; + emalloc(str, char *, len, "concat_exp"); + memcpy(str, r->stptr, r->stlen+1); + s = str + r->stlen; + free_temp(r); + for (tree = tree->rnode; tree != NULL; tree = tree->rnode) { + if (subseplen == 1) + *s++ = *subsep; + else { + memcpy(s, subsep, subseplen+1); + s += subseplen; + } + r = force_string(tree_eval(tree->lnode)); + len += r->stlen + subseplen; + offset = s - str; + erealloc(str, char *, len, "concat_exp"); + s = str + offset; + memcpy(s, r->stptr, r->stlen+1); + s += r->stlen; + free_temp(r); + } + r = make_str_node(str, s - str, ALREADY_MALLOCED); + r->flags |= TEMP; + return r; +} + +/* assoc_clear --- flush all the values in symbol[] before doing a split() */ + +void +assoc_clear(symbol) +NODE *symbol; +{ + int i; + NODE *bucket, *next; + + if (symbol->var_array == NULL) + return; + for (i = 0; i < symbol->array_size; i++) { + for (bucket = symbol->var_array[i]; bucket != NULL; bucket = next) { + next = bucket->ahnext; + unref(bucket->ahname); + unref(bucket->ahvalue); + freenode(bucket); + } + symbol->var_array[i] = NULL; + } + free(symbol->var_array); + symbol->var_array = NULL; + symbol->array_size = symbol->table_size = 0; + symbol->flags &= ~ARRAYMAXED; +} + +/* hash --- calculate the hash function of the string in subs */ + +unsigned int +hash(s, len, hsize) +register const char *s; +register size_t len; +unsigned long hsize; +{ + register unsigned long h = 0; + + /* + * This is INCREDIBLY ugly, but fast. We break the string up into + * 8 byte units. On the first time through the loop we get the + * "leftover bytes" (strlen % 8). On every other iteration, we + * perform 8 HASHC's so we handle all 8 bytes. Essentially, this + * saves us 7 cmp & branch instructions. If this routine is + * heavily used enough, it's worth the ugly coding. + * + * OZ's original sdbm hash, copied from Margo Seltzers db package. + */ + + /* + * Even more speed: + * #define HASHC h = *s++ + 65599 * h + * Because 65599 = pow(2, 6) + pow(2, 16) - 1 we multiply by shifts + */ +#define HASHC htmp = (h << 6); \ + h = *s++ + htmp + (htmp << 10) - h + + unsigned long htmp; + + h = 0; + +#if defined(VAXC) + /* + * This was an implementation of "Duff's Device", but it has been + * redone, separating the switch for extra iterations from the + * loop. This is necessary because the DEC VAX-C compiler is + * STOOPID. + */ + switch (len & (8 - 1)) { + case 7: HASHC; + case 6: HASHC; + case 5: HASHC; + case 4: HASHC; + case 3: HASHC; + case 2: HASHC; + case 1: HASHC; + default: break; + } + + if (len > (8 - 1)) { + register size_t loop = len >> 3; + do { + HASHC; + HASHC; + HASHC; + HASHC; + HASHC; + HASHC; + HASHC; + HASHC; + } while (--loop); + } +#else /* ! VAXC */ + /* "Duff's Device" for those who can handle it */ + if (len > 0) { + register size_t loop = (len + 8 - 1) >> 3; + + switch (len & (8 - 1)) { + case 0: + do { /* All fall throughs */ + HASHC; + case 7: HASHC; + case 6: HASHC; + case 5: HASHC; + case 4: HASHC; + case 3: HASHC; + case 2: HASHC; + case 1: HASHC; + } while (--loop); + } + } +#endif /* ! VAXC */ + + if (h >= hsize) + h %= hsize; + return h; +} + +/* assoc_find --- locate symbol[subs] */ + +static NODE * /* NULL if not found */ +assoc_find(symbol, subs, hash1) +NODE *symbol; +register NODE *subs; +int hash1; +{ + register NODE *bucket; + + for (bucket = symbol->var_array[hash1]; bucket != NULL; + bucket = bucket->ahnext) { + if (cmp_nodes(bucket->ahname, subs) == 0) + return bucket; + } + return NULL; +} + +/* in_array --- test whether the array element symbol[subs] exists or not */ + +int +in_array(symbol, subs) +NODE *symbol, *subs; +{ + register int hash1; + int ret; + + if (symbol->type == Node_param_list) + symbol = stack_ptr[symbol->param_cnt]; + if ((symbol->flags & SCALAR) != 0) + fatal("attempt to use scalar as array"); + /* + * evaluate subscript first, it could have side effects + */ + subs = concat_exp(subs); /* concat_exp returns a string node */ + if (symbol->var_array == NULL) { + free_temp(subs); + return 0; + } + hash1 = hash(subs->stptr, subs->stlen, (unsigned long) symbol->array_size); + ret = (assoc_find(symbol, subs, hash1) != NULL); + free_temp(subs); + return ret; +} + +/* + * assoc_lookup: + * Find SYMBOL[SUBS] in the assoc array. Install it with value "" if it + * isn't there. Returns a pointer ala get_lhs to where its value is stored. + * + * SYMBOL is the address of the node (or other pointer) being dereferenced. + * SUBS is a number or string used as the subscript. + */ + +NODE ** +assoc_lookup(symbol, subs) +NODE *symbol, *subs; +{ + register int hash1; + register NODE *bucket; + + (void) force_string(subs); + + if ((symbol->flags & SCALAR) != 0) + fatal("attempt to use scalar as array"); + + if (symbol->var_array == NULL) { + symbol->type = Node_var_array; + symbol->array_size = symbol->table_size = 0; /* sanity */ + symbol->flags &= ~ARRAYMAXED; + grow_table(symbol); + hash1 = hash(subs->stptr, subs->stlen, + (unsigned long) symbol->array_size); + } else { + hash1 = hash(subs->stptr, subs->stlen, + (unsigned long) symbol->array_size); + bucket = assoc_find(symbol, subs, hash1); + if (bucket != NULL) { + free_temp(subs); + return &(bucket->ahvalue); + } + } + + /* It's not there, install it. */ + if (do_lint && subs->stlen == 0) + warning("subscript of array `%s' is null string", + symbol->vname); + + /* first see if we would need to grow the array, before installing */ + symbol->table_size++; + if ((symbol->flags & ARRAYMAXED) == 0 + && symbol->table_size/symbol->array_size > AVG_CHAIN_MAX) { + grow_table(symbol); + /* have to recompute hash value for new size */ + hash1 = hash(subs->stptr, subs->stlen, + (unsigned long) symbol->array_size); + } + + getnode(bucket); + bucket->type = Node_ahash; + if (subs->flags & TEMP) + bucket->ahname = dupnode(subs); + else { + unsigned int saveflags = subs->flags; + + subs->flags &= ~MALLOC; + bucket->ahname = dupnode(subs); + subs->flags = saveflags; + } + free_temp(subs); + + /* array subscripts are strings */ + bucket->ahname->flags &= ~NUMBER; + bucket->ahname->flags |= STRING; + bucket->ahvalue = Nnull_string; + bucket->ahnext = symbol->var_array[hash1]; + symbol->var_array[hash1] = bucket; + return &(bucket->ahvalue); +} + +/* do_delete --- perform `delete array[s]' */ + +void +do_delete(symbol, tree) +NODE *symbol, *tree; +{ + register int hash1; + register NODE *bucket, *last; + NODE *subs; + + if (symbol->type == Node_param_list) { + symbol = stack_ptr[symbol->param_cnt]; + if (symbol->type == Node_var) + return; + } + if (symbol->type == Node_var_array) { + if (symbol->var_array == NULL) + return; + } else + fatal("delete: illegal use of variable `%s' as array", + symbol->vname); + + if (tree == NULL) { /* delete array */ + assoc_clear(symbol); + return; + } + + subs = concat_exp(tree); /* concat_exp returns string node */ + hash1 = hash(subs->stptr, subs->stlen, (unsigned long) symbol->array_size); + + last = NULL; + for (bucket = symbol->var_array[hash1]; bucket != NULL; + last = bucket, bucket = bucket->ahnext) + if (cmp_nodes(bucket->ahname, subs) == 0) + break; + free_temp(subs); + if (bucket == NULL) { + if (do_lint) + warning("delete: index `%s' not in array `%s'", + subs->stptr, symbol->vname); + return; + } + if (last != NULL) + last->ahnext = bucket->ahnext; + else + symbol->var_array[hash1] = bucket->ahnext; + unref(bucket->ahname); + unref(bucket->ahvalue); + freenode(bucket); + symbol->table_size--; + if (symbol->table_size <= 0) { + memset(symbol->var_array, '\0', + sizeof(NODE *) * symbol->array_size); + symbol->table_size = symbol->array_size = 0; + symbol->flags &= ~ARRAYMAXED; + free((char *) symbol->var_array); + symbol->var_array = NULL; + } +} + +/* assoc_scan --- start a ``for (iggy in foo)'' loop */ + +void +assoc_scan(symbol, lookat) +NODE *symbol; +struct search *lookat; +{ + lookat->sym = symbol; + lookat->idx = 0; + lookat->bucket = NULL; + lookat->retval = NULL; + if (symbol->var_array != NULL) + assoc_next(lookat); +} + +/* assoc_next --- actually find the next element in array */ + +void +assoc_next(lookat) +struct search *lookat; +{ + register NODE *symbol = lookat->sym; + + if (symbol == NULL) + fatal("null symbol in assoc_next"); + if (symbol->var_array == NULL || lookat->idx > symbol->array_size) { + lookat->retval = NULL; + return; + } + /* + * This is theoretically unsafe. The element bucket might have + * been freed if the body of the scan did a delete on the next + * element of the bucket. The only way to do that is by array + * reference, which is unlikely. Basically, if the user is doing + * anything other than an operation on the current element of an + * assoc array while walking through it sequentially, all bets are + * off. (The safe way is to register all search structs on an + * array with the array, and update all of them on a delete or + * insert) + */ + if (lookat->bucket != NULL) { + lookat->retval = lookat->bucket->ahname; + lookat->bucket = lookat->bucket->ahnext; + return; + } + for (; lookat->idx < symbol->array_size; lookat->idx++) { + NODE *bucket; + + if ((bucket = symbol->var_array[lookat->idx]) != NULL) { + lookat->retval = bucket->ahname; + lookat->bucket = bucket->ahnext; + lookat->idx++; + return; + } + } + lookat->retval = NULL; + lookat->bucket = NULL; + return; +} + +/* grow_table --- grow a hash table */ + +static void +grow_table(symbol) +NODE *symbol; +{ + NODE **old, **new, *chain, *next; + int i, j; + unsigned long hash1; + unsigned long oldsize, newsize; + /* + * This is an array of primes. We grow the table by an order of + * magnitude each time (not just doubling) so that growing is a + * rare operation. We expect, on average, that it won't happen + * more than twice. The final size is also chosen to be small + * enough so that MS-DOG mallocs can handle it. When things are + * very large (> 8K), we just double more or less, instead of + * just jumping from 8K to 64K. + */ + static long sizes[] = { 13, 127, 1021, 8191, 16381, 32749, 65497, +#if ! defined(MSDOS) && ! defined(OS2) && ! defined(atarist) + 131101, 262147, 524309, 1048583, 2097169, + 4194319, 8388617, 16777259, 33554467, + 67108879, 134217757, 268435459, 536870923, + 1073741827 +#endif + }; + + /* find next biggest hash size */ + newsize = oldsize = symbol->array_size; + for (i = 0, j = sizeof(sizes)/sizeof(sizes[0]); i < j; i++) { + if (oldsize < sizes[i]) { + newsize = sizes[i]; + break; + } + } + + if (newsize == oldsize) { /* table already at max (!) */ + symbol->flags |= ARRAYMAXED; + return; + } + + /* allocate new table */ + emalloc(new, NODE **, newsize * sizeof(NODE *), "grow_table"); + memset(new, '\0', newsize * sizeof(NODE *)); + + /* brand new hash table, set things up and return */ + if (symbol->var_array == NULL) { + symbol->table_size = 0; + goto done; + } + + /* old hash table there, move stuff to new, free old */ + old = symbol->var_array; + for (i = 0; i < oldsize; i++) { + if (old[i] == NULL) + continue; + + for (chain = old[i]; chain != NULL; chain = next) { + next = chain->ahnext; + hash1 = hash(chain->ahname->stptr, + chain->ahname->stlen, newsize); + + /* remove from old list, add to new */ + chain->ahnext = new[hash1]; + new[hash1] = chain; + + } + } + free(old); + +done: + /* + * note that symbol->table_size does not change if an old array, + * and is explicitly set to 0 if a new one. + */ + symbol->var_array = new; + symbol->array_size = newsize; +} |