*** src.rgx/config.c Thu Jan 2 23:34:31 1997
--- config.c Thu Jan 2 23:51:21 1997
***************
*** 19,24 ****
--- 19,25 ----
  # include "compile.h"
  # include "csupport.h"
  # include "table.h"
+ # include "rgx.h"
  
  typedef struct {
      char *name; /* name of the option */
***************
*** 810,815 ****
--- 811,819 ----
  
      /* initialize interpreter */
      i_init(conf[CREATE].u.str);
+
+     /* initialize regular expressions */
+     rgx_init();
  
      /* initialize compiler */
      c_init(conf[AUTO_OBJECT].u.str,
*** src.rgx/kfun/extra.c Tue Sep 27 09:28:26 1994
--- kfun/extra.c Thu Feb 2 22:25:18 1995
***************
*** 560,562 ****
--- 560,640 ----
      error("Not yet implemented");
  }
  # endif
+
+
+ # ifdef FUNCDEF
+ FUNCDEF("regexp_compile", kf_regexp_compile, p_regexp_compile)
+ # else
+ char p_regexp_compile[] = { C_TYPECHECKED | C_STATIC | C_VARARGS,
+                             T_STRING | (1 << REFSHIFT), 2, T_STRING, T_INT };
+
+ /*
+  * NAME:        kfun->regexp_compile()
+  * DESCRIPTION: compile a regexp pattern
+  */
+ int kf_regexp_compile(nargs)
+ int nargs;
+ {
+     int case_matters;
+     array *compiled;
+
+     if (nargs < 1)
+         return -1;
+
+     case_matters = (nargs == 2 ? ! (sp++)->u.number : 1);
+
+     compiled = rgx_new(sp->u.string, case_matters);
+
+     str_del(sp->u.string);
+     sp->type = T_ARRAY;
+     arr_ref(sp->u.array = compiled);
+
+     return 0;
+ }
+ # endif
+
+
+ # ifdef FUNCDEF
+ FUNCDEF("regexp_match", kf_regexp_match, p_regexp_match)
+ # else
+ char p_regexp_match[] = { C_TYPECHECKED | C_STATIC | C_VARARGS,
+                           T_INT | (1 << REFSHIFT), 3,
+                           T_STRING | (1 << REFSHIFT), T_STRING, T_INT };
+
+ /*
+  * NAME:        kfun->regexp_match()
+  * DESCRIPTION: perform regexp matching with a previously compiled pattern
+  */
+ int kf_regexp_match(nargs)
+ int nargs;
+ {
+     int reverse;
+     string *subject;
+     array *compiled, *result;
+
+     if (nargs < 2)
+         return -1;
+
+     reverse = (nargs == 3 ? (sp++)->u.number : 0);
+     subject = sp->u.string;
+     compiled = sp[1].u.array;
+
+     if (compiled->size != 3)
+         return 1;
+
+     result = rgx_match(d_get_elts(compiled), subject, reverse);
+
+     str_del((sp++)->u.string);
+     arr_del(sp->u.array);
+
+     if (result == (array *) 0)
+     {
+         sp->type = T_INT;
+         sp->u.number = 0;
+     }
+     else
+         arr_ref(sp->u.array = result);
+
+     return 0;
+ }
+ # endif
*** src.rgx/kfun/kfun.h Sun May 8 08:15:01 1994
--- kfun/kfun.h Thu Feb 2 22:25:18 1995
***************
*** 5,7 ****
--- 5,8 ----
  # include "xfloat.h"
  # include "interpret.h"
  # include "data.h"
+ # include "rgx.h"
*** src.rgx/rgx.c Thu Jan 2 21:41:55 1997
--- rgx.c Thu Jan 2 21:17:46 1997
***************
*** 0 ****
--- 1,261 ----
+ # include "dgd.h"
+ # include "str.h"
+ # include "array.h"
+ # include "interpret.h"
+ /*
+  * NOTE(review): the names of the two system headers below were lost from
+  * this patch; <sys/types.h> and <regex.h> are a reconstruction -- confirm.
+  */
+ # include <sys/types.h>
+ # include "rgx.h"
+ # include <regex.h>
+
+ static char trans_table[256];   /* case-folding table, filled by rgx_init() */
+
+ /*
+  * NAME:        regexp->init()
+  * DESCRIPTION: initialize regexp handling by building the translate table
+  *              that maps 'a'..'z' to 'A'..'Z' and every other byte to itself
+  */
+ void rgx_init()
+ {
+     register int i;
+
+     for (i = 0; i < 256; ++i)
+         trans_table[i] = i;
+     for (i = 'a'; i <= 'z'; ++i)
+         trans_table[i] = i + 'A' - 'a';
+ }
+
+ /*
+  * NAME:        regexp->convert()
+  * DESCRIPTION: rewrite a pattern from the `%' escape syntax of doc/regexps
+  *              into the `\' escape syntax handed to re_compile_pattern():
+  *              "%x" becomes "\x", a plain `\' becomes "\\", and a bracketed
+  *              charset is copied unchanged.  The result is stored in out
+  *              unless out is NULL.  The length of the result is returned, so
+  *              a first call with a NULL buffer can be used to size it.
+  */
+ static long rgx_convert(text, len, out)
+ register char *text;
+ register long len;
+ char *out;
+ {
+     register long i, n;
+     register char c;
+
+     /* store one character, or only count it; ch must have no side effects */
+ # define PUT(ch) do { if (out != (char *) NULL) out[n] = (ch); n++; } while (0)
+
+     i = n = 0;
+     while (i < len) {
+         c = text[i++];
+         switch (c) {
+         case '[':
+             PUT(c);
+             if (i < len && text[i] == '^') {
+                 PUT('^');
+                 i++;
+             }
+             if (i < len) {
+                 /* first member of the set: may be a literal `]' */
+                 c = text[i++];
+                 PUT(c);
+             }
+             while (i < len) {
+                 c = text[i++];
+                 PUT(c);
+                 if (c == ']')
+                     break;
+             }
+             break;
+
+         case '%':
+             /* escape: emit `\' and copy the escaped character untouched */
+             PUT('\\');
+             if (i < len) {
+                 c = text[i++];
+                 PUT(c);
+             }
+             break;
+
+         case '\\':
+             /* not an operator in the `%' syntax: double it to keep it literal */
+             PUT('\\');
+             PUT('\\');
+             break;
+
+         default:
+             PUT(c);
+             break;
+         }
+     }
+
+ # undef PUT
+
+     return n;
+ }
+
+ /*
+  * NAME:        regexp->new()
+  * DESCRIPTION: compile a pattern into a new regexp buffer.  The result is
+  *              an array of three strings: the re_pattern_buffer struct, the
+  *              compiled buffer and the fastmap.  If case_matters is zero,
+  *              trans_table is used to fold case.  The pattern string itself
+  *              is left unmodified; a compile error is raised with error().
+  */
+ array *rgx_new(pattern, case_matters)
+ string *pattern;
+ int case_matters;
+ {
+     char *translate;
+     struct re_pattern_buffer patbuf;
+     char fastmap[256];
+     const char *compile_error;
+     array *result;
+     register value *v;
+     string *s;
+
+     translate = (case_matters ? (char *) 0 : trans_table);
+
+     patbuf.buffer = 0;
+     patbuf.allocated = 0;
+     patbuf.used = 0;
+
+     patbuf.fastmap = fastmap;
+     patbuf.translate = translate;
+
+     patbuf.fastmap_accurate = 0;
+
+     /*
+      * Convert into a private string, so that the caller's pattern is never
+      * edited in place and s is always valid.  It is referenced like the
+      * other str_new() results below, to pair with the str_del().
+      */
+     s = str_new((char *) NULL,
+                 rgx_convert(pattern->text, (long) pattern->len, (char *) NULL));
+     str_ref(s);
+     rgx_convert(pattern->text, (long) pattern->len, s->text);
+
+     compile_error = re_compile_pattern(s->text, s->len, &patbuf);
+     str_del(s);
+     if (compile_error != (char *) 0)
+     {
+         /* don't let regfree() try to free these */
+         patbuf.fastmap = 0;
+         patbuf.translate = 0;
+
+         regfree(&patbuf);
+         error(compile_error);
+     }
+
+     re_compile_fastmap(&patbuf);
+
+     result = arr_new(3L);
+     v = result->elts;
+
+     v->type = T_STRING;
+     str_ref(v->u.string = str_new((char *) &patbuf, (long) sizeof(patbuf)));
+     ++v;
+     v->type = T_STRING;
+     str_ref(v->u.string = str_new((char *) patbuf.buffer,
+                                   (long) patbuf.allocated));
+     ++v;
+     v->type = T_STRING;
+     str_ref(v->u.string = str_new(fastmap, 256L));
+
+     /* don't let regfree() try to free these */
+     patbuf.fastmap = 0;
+     patbuf.translate = 0;
+
+     regfree(&patbuf);
+
+     return result;
+ }
+
+ /*
+  * NAME:        regexp->match()
+  * DESCRIPTION: perform regexp matching, given a pattern and subject string.
+  *              pattern points to the three strings made by rgx_new().  The
+  *              subject is searched backwards if reverse is nonzero.  If
+  *              there is no match, NULL is returned; otherwise the result is
+  *              an array of RGX_NREGS (start, end) pairs of integers, with
+  *              (0, -1) for a subexpression that did not take part.
+  */
+ array *rgx_match(pattern, subject, reverse)
+ value *pattern;
+ string *subject;
+ int reverse;
+ {
+     long sub_len;
+     struct re_pattern_buffer patbuf;
+     struct re_registers regs;
+     regoff_t starts[RGX_NREGS + 1], ends[RGX_NREGS + 1];
+     array *result;
+     register value *v;
+     register int i;
+
+     /* the elements are about to be used as strings: make sure they are */
+     for (i = 0; i < 3; ++i)
+         if (pattern[i].type != T_STRING)
+             error("Invalid compiled pattern");
+
+     if (pattern[0].u.string->len != sizeof(struct re_pattern_buffer))
+         error("Invalid compiled pattern");
+
+     memcpy((char *) &patbuf, pattern[0].u.string->text,
+            sizeof(struct re_pattern_buffer));
+
+     if (patbuf.allocated != (unsigned long) pattern[1].u.string->len ||
+         pattern[2].u.string->len != 256)
+         error("Invalid compiled pattern");
+
+     /* the pointers saved in the struct are stale: install current ones */
+     patbuf.buffer = (unsigned char *) pattern[1].u.string->text;
+     patbuf.fastmap = pattern[2].u.string->text;
+     if (patbuf.translate != 0)
+         patbuf.translate = trans_table;
+
+     regs.num_regs = RGX_NREGS;
+     regs.start = starts;
+     regs.end = ends;
+     patbuf.regs_allocated = REGS_FIXED;
+
+     sub_len = subject->len;
+     i = re_search(&patbuf, subject->text, sub_len, reverse ? sub_len : 0,
+                   reverse ? -(sub_len + 1) : sub_len + 1, &regs);
+     if (i == -1)
+         return (array *) 0;     /* no match */
+     if (i < 0)
+         error("Regular expression matching failed");    /* internal error */
+
+     result = arr_new((long) RGX_NREGS * 2);
+     v = result->elts;
+
+     v->type = T_INT;
+     v->u.number = starts[0];
+     ++v;
+
+     v->type = T_INT;
+     v->u.number = ends[0] - 1;
+     ++v;
+
+     for (i = 1; i < RGX_NREGS; ++i, v += 2)
+     {
+         v[0].type = T_INT;
+         v[1].type = T_INT;
+
+         if (starts[i] == -1)
+         {
+             v[0].u.number = 0;
+             v[1].u.number = -1;
+         }
+         else
+         {
+             v[0].u.number = starts[i];
+             v[1].u.number = ends[i] - 1;
+         }
+     }
+
+     return result;
+ }
*** src.rgx/rgx.h Thu Jan 2 21:42:05 1997
--- rgx.h Fri Feb 3 03:09:54 1995
***************
*** 0 ****
--- 1,5 ----
+ # define RGX_NREGS 10
+
+ extern void rgx_init P((void));
+ extern array *rgx_new P((string*, int));
+ extern array *rgx_match P((value*, string*, int));
*** doc.rgx/rgx_example.c Thu Jan 1 00:00:00 1970
--- ../doc/rgx_example.c Fri Feb 3 03:30:01 1995
***************
*** 0 ****
--- 1,50 ----
+ /*
+  * This file shows how an interface can be built to cache regexp patterns
+  * and ultimately provide a more streamlined interface to the regexp kfuns.
+  *
+  * Note that since regexp_match() severely depends on the return result from
+  * regexp_compile() being unaltered, it is a good idea to provide an
+  * interface like this, and also to mask the regexp_match() kfun from the
+  * auto object.
+  */
+
+ # define CACHE_SIZE 10
+
+ private mapping cache;
+ private string *list;
+ private string last_pattern;
+
+ static
+ void create(void)
+ {
+     cache = ([ ]);
+     list = ({ });
+ }
+
+ int *match(string subject, string pattern)
+ {
+     string *buffer;
+
+     if ((buffer = cache[pattern]) == 0)
+     {
+         buffer = regexp_compile(pattern);
+
+         if (sizeof(list) >= CACHE_SIZE)
+         {
+             cache[list[0]] = 0;
+             list = list[1 ..] + ({ pattern });
+         }
+         else
+             list += ({ pattern });
+
+         cache[pattern] = buffer;
+         last_pattern = pattern;     /* the new entry is the most recent one */
+     }
+     else if (pattern != last_pattern)
+     {
+         list = list - ({ pattern }) + ({ pattern });
+         last_pattern = pattern;
+     }
+
+     return regexp_match(buffer, subject);
+ }
*** doc.rgx/kfun/regexp_compile Thu Jan 1 00:00:00 1970
--- ../doc/kfun/regexp_compile Tue Jul 26 00:02:34 1994
***************
*** 0 ****
--- 1,27 ----
+ NAME
+         regexp_compile - compile a regular expression
+
+ SYNOPSIS
+         varargs string *regexp_compile(string pattern, int case_insensitive)
+
+ DESCRIPTION
+         The argument pattern is compiled as a regular expression. If the
+         argument case_insensitive is nonzero, the pattern is compiled in
+         such a way that subsequent matching will be done without case
+         sensitivity. The default is to be case-sensitive.
+
+         An array of strings is returned; these strings contain binary
+         data and must not be altered in any way before being passed to
+         regexp_match().
+
+         The compiled regexp can be saved and used any number of times with
+         regexp_match().
+
+ ERRORS
+         If the argument pattern contains a syntactically malformed regular
+         expression, an error will result. An error can also occur if the
+         pattern is too complicated, or if there is not enough memory to
+         compile the pattern.
+
+ SEE ALSO
+         kfun/regexp_match
*** doc.rgx/kfun/regexp_match Thu Jan 1 00:00:00 1970
--- ../doc/kfun/regexp_match Mon Jul 25 22:19:42 1994
***************
*** 0 ****
--- 1,34 ----
+ NAME
+         regexp_match - perform regular expression matching
+
+ SYNOPSIS
+         varargs int *regexp_match(string *pattern, string subject, int reverse)
+
+ DESCRIPTION
+         The argument subject is matched against the compiled regular
+         expression pattern. If the argument reverse is nonzero, matching
+         is performed from right-to-left; otherwise, matching is performed
+         left-to-right.
+
+         The pattern argument must be an array of strings exactly as it
+         was received from regexp_compile(); otherwise, the result of
+         calling this function is undefined.
+
+         If the argument subject could not be matched with the regular
+         expression, 0 is returned. Otherwise, an array of 20 integers
+         is returned with this format:
+
+             ({ start0, end0, start1, end1, ..., start9, end9 })
+
+         Each element is a character index into the subject string. The
+         first two elements, start0 and end0, indicate the part of the subject
+         that was matched by the regular expression as a whole. The following
+         elements indicate the starting and ending indices of each
+         subexpression (denoted by "%(" and "%)" pairs in the original
+         pattern) that was matched.
+
+         If any subexpression was not matched, the corresponding start and
+         end elements will be 0 and -1, respectively.
+
+ SEE ALSO
+         kfun/regexp_compile
*** doc.rgx/regexps Thu Jan 1 00:00:00 1970
--- ../doc/regexps Mon Jul 25 22:58:57 1994
***************
*** 0 ****
--- 1,32 ----
+
+ Regular expressions are composed of the following operators:
+
+     .           Match any single character
+     XY          Match X immediately followed by Y
+     X*          Match zero-or-more of X
+     X+          Match one-or-more of X
+     X?          Match zero-or-one of X
+     X%|Y        Match either X or Y
+     [charset]   Match any single character in `charset'
+     [^charset]  Match any single character not in `charset'
+     %(X%)       Match X, but also remember the match as a subexpression
+     %digit      Match the numbered previous subexpression
+     ^X          Match X anchored at the beginning of a line
+     X$          Match X anchored at the end of a line
+     %b          Match the empty string at the beginning or end of a word
+     %B          Match the empty string only within the middle of a word
+     %<          Match the beginning of a word
+     %>          Match the end of a word
+     %w          Match any word-constituent character
+     %W          Match any character that is not word-constituent
+
+ Any other character in a regular expression is matched literally with itself.
+ To match any of the special operator characters .*+?%[^$ literally, precede
+ the character with `%'.
+
+ A `charset' is formed by listing all desired characters within brackets. To
+ include a literal `^' in a charset, do not list it in the first position. To
+ include a literal `]', list it immediately after the opening `[' or `[^'. All
+ characters are non-special (and should not be escaped) within a charset,
+ except `-', which denotes a character range. To include a literal `-', list it
+ either first or last.
*** README.rgx.old Fri Jan 3 03:17:21 1997
--- ../README.rgx Fri Jan 3 03:14:29 1997
***************
*** 0 ****
--- 1,18 ----
+ dgd-rgx was written by Robert Leslie as an LPC interface to
+ GNU regex, adding two kfuns to DGD for regular expression matching:
+
+     regexp_compile()
+     regexp_match()
+
+ For a description of the regular expression language accepted by these kfuns,
+ please read doc/regexps.
+
+ Complete details for the two kfuns can be found in the doc/kfun directory.
+
+ Adapted by Adam David for DGD 1.0.97 and to use the unmodified
+ GNU regexp library.
+
+ This software is a modification of DGD, and is therefore protected by the
+ DGD Copyright.
+
+ There is no warranty for this software.