diff options
Diffstat (limited to 'textproc/amberfish')
-rw-r--r-- | textproc/amberfish/Makefile | 37 | ||||
-rw-r--r-- | textproc/amberfish/distinfo | 3 | ||||
-rw-r--r-- | textproc/amberfish/files/patch-Makefile.in | 11 | ||||
-rw-r--r-- | textproc/amberfish/files/porter.cc | 438 | ||||
-rw-r--r-- | textproc/amberfish/pkg-descr | 21 | ||||
-rw-r--r-- | textproc/amberfish/pkg-plist | 17 |
6 files changed, 527 insertions, 0 deletions
diff --git a/textproc/amberfish/Makefile b/textproc/amberfish/Makefile new file mode 100644 index 000000000000..209489095bce --- /dev/null +++ b/textproc/amberfish/Makefile @@ -0,0 +1,37 @@ +# New ports collection makefile for: amberfish +# Date created: 21 Aug 2008 +# Whom: Pedro Giffuni +# +# $FreeBSD$ +# + +PORTNAME= amberfish +PORTVERSION= 1.6.4 +CATEGORIES= textproc databases +MASTER_SITES= SF \ + http://etymon.com/software/amberfish/stable/ + +MAINTAINER= giffunip@tutopia.com +COMMENT= General purpose text retrieval Software + +LIB_DEPENDS= xerces-c.27:${PORTSDIR}/textproc/xerces-c2 + +GNU_CONFIGURE= yes +USE_GMAKE= yes + +MAN1= af.1 + +post-extract: + ${INSTALL_DATA} ${FILESDIR}/porter.cc ${WRKSRC}/src + +.if !defined(NOPORTDOCS) +post-build: + @(cd ${WRKSRC} && ${SETENV} ${MAKE_ENV} ${GMAKE} html) + +post-install: + ${MKDIR} ${DOCSDIR} + ${INSTALL_DATA} ${WRKSRC}/amberfish.png ${DOCSDIR} + ${INSTALL_MAN} ${WRKSRC}/doc/html/*.html ${DOCSDIR} +.endif + +.include <bsd.port.mk> diff --git a/textproc/amberfish/distinfo b/textproc/amberfish/distinfo new file mode 100644 index 000000000000..c3a92bf33394 --- /dev/null +++ b/textproc/amberfish/distinfo @@ -0,0 +1,3 @@ +MD5 (amberfish-1.6.4.tar.gz) = 8eb3f1e26da9d0317719822539c3b932 +SHA256 (amberfish-1.6.4.tar.gz) = 155ac6e6b9b76fb7cbd94952548f718ab6add72c3b4fd2482d89abb39d96ce76 +SIZE (amberfish-1.6.4.tar.gz) = 127198 diff --git a/textproc/amberfish/files/patch-Makefile.in b/textproc/amberfish/files/patch-Makefile.in new file mode 100644 index 000000000000..138b01a101d7 --- /dev/null +++ b/textproc/amberfish/files/patch-Makefile.in @@ -0,0 +1,11 @@ +--- src/Makefile.in.orig 2008-08-12 11:45:08.000000000 -0500 ++++ src/Makefile.in 2008-08-12 11:46:07.000000000 -0500 +@@ -66,7 +66,7 @@ + strip af + + install: all +- make strip ++ ${MAKE} strip + mkdir -p ${PREFIXBIN} + cp ${BIN} ${PREFIXBIN}/. + diff --git a/textproc/amberfish/files/porter.cc b/textproc/amberfish/files/porter.cc new file mode 100644 index 000000000000..a997d88ef24c --- /dev/null +++ b/textproc/amberfish/files/porter.cc @@ -0,0 +1,438 @@ + +/* This is the Porter stemming algorithm, coded up in ANSI C by the + author. It may be be regarded as cononical, in that it follows the + algorithm presented in + + Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14, + no. 3, pp 130-137, + + only differing from it at the points maked --DEPARTURE-- below. + + See also http://www.tartarus.org/~martin/PorterStemmer + +The algorithm as described in the paper could be exactly replicated +by adjusting the points of DEPARTURE, but this is barely necessary, +because (a) the points of DEPARTURE are definitely improvements, and +(b) no encoding of the Porter stemmer I have seen is anything like +as exact as this version, even with the points of DEPARTURE! + +You can compile it on Unix with 'gcc -O3 -o stem stem.c' after which +'stem' takes a list of inputs and sends the stemmed equivalent to +stdout. + +The algorithm as encoded here is particularly fast. + +Release 1 +*/ + +#include <string.h> /* for memmove */ + +#define TRUE 1 +#define FALSE 0 + +/* The main part of the stemming algorithm starts here. b is a buffer + holding a word to be stemmed. The letters are in b[k0], b[k0+1] ... + ending at b[k]. In fact k0 = 0 in this demo program. k is readjusted + downwards as the stemming progresses. Zero termination is not in fact + used in the algorithm. + + Note that only lower case sequences are stemmed. Forcing to lower case + should be done before stem(...) is called. +*/ + +static char * b; /* buffer for word to be stemmed */ +static int k,k0,j; /* j is a general offset into the string */ + +/* cons(i) is TRUE <=> b[i] is a consonant. */ + +static int cons(int i) +{ + switch (b[i]) + { + case 'a': case 'e': case 'i': case 'o': case 'u': return FALSE; + case 'y': return (i==k0) ? TRUE : !cons(i-1); + default: return TRUE; + } +} + + +/* m() measures the number of consonant sequences between k0 and j. if c is + a consonant sequence and v a vowel sequence, and <..> indicates arbitrary + presence, + + <c><v> gives 0 + <c>vc<v> gives 1 + <c>vcvc<v> gives 2 + <c>vcvcvc<v> gives 3 + .... +*/ + +static int m() +{ + int n = 0; + int i = k0; + while(TRUE) + { + if (i > j) return n; + if (! cons(i)) break; i++; + } + i++; + while(TRUE) + { + while(TRUE) + { + if (i > j) return n; + if (cons(i)) break; + i++; + } + i++; + n++; + while(TRUE) + { + if (i > j) return n; + if (! cons(i)) break; + i++; + } + i++; + } +} + + +/* vowelinstem() is TRUE <=> k0,...j contains a vowel */ + +static int vowelinstem() +{ + int i; for (i = k0; i <= j; i++) if (! cons(i)) return TRUE; + return FALSE; +} + + +/* doublec(j) is TRUE <=> j,(j-1) contain a double consonant. */ + +static int doublec(int j) +{ + if (j < k0+1) return FALSE; + if (b[j] != b[j-1]) return FALSE; + return cons(j); +} + + +/* cvc(i) is TRUE <=> i-2,i-1,i has the form consonant - vowel - consonant + and also if the second c is not w,x or y. this is used when trying to + restore an e at the end of a short word. e.g. + + cav(e), lov(e), hop(e), crim(e), but + snow, box, tray. + +*/ + +static int cvc(int i) +{ + if (i < k0+2 || !cons(i) || cons(i-1) || !cons(i-2)) return FALSE; + { + int ch = b[i]; + if (ch == 'w' || ch == 'x' || ch == 'y') return FALSE; + } + return TRUE; +} + + +/* ends(s) is TRUE <=> k0,...k ends with the string s. */ + +static int ends(char * s) +{ + int length = s[0]; + if (s[length] != b[k]) return FALSE; /* tiny speed-up */ + if (length > k-k0+1) return FALSE; + if (memcmp(b+k-length+1,s+1,length) != 0) return FALSE; + j = k-length; + return TRUE; +} + + +/* setto(s) sets (j+1),...k to the characters in the string s, readjusting + k. */ + +static void setto(char * s) +{ + int length = s[0]; + memmove(b+j+1,s+1,length); + k = j+length; +} + + +/* r(s) is used further down. */ + +static void r(char * s) { if (m() > 0) setto(s); } + +/* step1ab() gets rid of plurals and -ed or -ing. e.g. + + caresses -> caress + ponies -> poni + ties -> ti + caress -> caress + cats -> cat + + feed -> feed + agreed -> agree + disabled -> disable + + matting -> mat + mating -> mate + meeting -> meet + milling -> mill + messing -> mess + + meetings -> meet + +*/ + +static void step1ab() +{ + if (b[k] == 's') + { + if (ends("\04" "sses")) k -= 2; else + if (ends("\03" "ies")) setto("\01" "i"); else + if (b[k-1] != 's') k--; + } + if (ends("\03" "eed")) { if (m() > 0) k--; } + else + if ((ends("\02" "ed") || ends("\03" "ing")) && vowelinstem()) + { + k = j; + if (ends("\02" "at")) setto("\03" "ate"); else + if (ends("\02" "bl")) setto("\03" "ble"); else + if (ends("\02" "iz")) setto("\03" "ize"); else + if (doublec(k)) + { + k--; + { + int ch = b[k]; + if (ch == 'l' || ch == 's' || ch == 'z') k++; + } + } + else if (m() == 1 && cvc(k)) setto("\01" "e"); + } +} + + +/* step1c() turns terminal y to i when there is another vowel in the stem. */ + +static void step1c() { if (ends("\01" "y") && vowelinstem()) b[k] = 'i'; } + +/* step2() maps double suffices to single ones. so -ization ( = -ize plus + -ation) maps to -ize etc. note that the string before the suffix must give + m() > 0. */ + +static void step2() +{ + switch (b[k-1]) + { + case 'a': if (ends("\07" "ational")) { r("\03" "ate"); break; } + if (ends("\06" "tional")) { r("\04" "tion"); break; } + break; + case 'c': if (ends("\04" "enci")) { r("\04" "ence"); break; } + if (ends("\04" "anci")) { r("\04" "ance"); break; } + break; + case 'e': if (ends("\04" "izer")) { r("\03" "ize"); break; } + break; + case 'l': if (ends("\03" "bli")) /*-DEPARTURE-*/ + { + r("\03" "ble"); break; + } + +/* To match the published algorithm, replace this line with + case 'l': if (ends("\04" "abli")) { r("\04" "able"); break; } */ + + if (ends("\04" "alli")) { r("\02" "al"); break; } + if (ends("\05" "entli")) { r("\03" "ent"); break; } + if (ends("\03" "eli")) { r("\01" "e"); break; } + if (ends("\05" "ousli")) { r("\03" "ous"); break; } + break; + case 'o': if (ends("\07" "ization")) { r("\03" "ize"); break; } + if (ends("\05" "ation")) { r("\03" "ate"); break; } + if (ends("\04" "ator")) { r("\03" "ate"); break; } + break; + case 's': if (ends("\05" "alism")) { r("\02" "al"); break; } + if (ends("\07" "iveness")) { r("\03" "ive"); break; } + if (ends("\07" "fulness")) { r("\03" "ful"); break; } + if (ends("\07" "ousness")) { r("\03" "ous"); break; } + break; + case 't': if (ends("\05" "aliti")) { r("\02" "al"); break; } + if (ends("\05" "iviti")) { r("\03" "ive"); break; } + if (ends("\06" "biliti")) { r("\03" "ble"); break; } + break; + case 'g': if (ends("\04" "logi")) /*-DEPARTURE-*/ + { + r("\03" "log"); break; + } + +/* To match the published algorithm, delete this line */ + + } +} + + +/* step3() deals with -ic-, -full, -ness etc. similar strategy to step2. */ + +static void step3() +{ + switch (b[k]) + { + case 'e': if (ends("\05" "icate")) { r("\02" "ic"); break; } + if (ends("\05" "ative")) { r("\00" ""); break; } + if (ends("\05" "alize")) { r("\02" "al"); break; } + break; + case 'i': if (ends("\05" "iciti")) { r("\02" "ic"); break; } + break; + case 'l': if (ends("\04" "ical")) { r("\02" "ic"); break; } + if (ends("\03" "ful")) { r("\00" ""); break; } + break; + case 's': if (ends("\04" "ness")) { r("\00" ""); break; } + break; + } +} + + +/* step4() takes off -ant, -ence etc., in context <c>vcvc<v>. */ + +static void step4() +{ + switch (b[k-1]) + { + case 'a': if (ends("\02" "al")) break; return; + case 'c': if (ends("\04" "ance")) break; + if (ends("\04" "ence")) break; return; + case 'e': if (ends("\02" "er")) break; return; + case 'i': if (ends("\02" "ic")) break; return; + case 'l': if (ends("\04" "able")) break; + if (ends("\04" "ible")) break; return; + case 'n': if (ends("\03" "ant")) break; + if (ends("\05" "ement")) break; + if (ends("\04" "ment")) break; + if (ends("\03" "ent")) break; return; + case 'o': if (ends("\03" "ion") && (b[j] == 's' || b[j] == 't')) break; + if (ends("\02" "ou")) break; return; +/* takes care of -ous */ + case 's': if (ends("\03" "ism")) break; return; + case 't': if (ends("\03" "ate")) break; + if (ends("\03" "iti")) break; return; + case 'u': if (ends("\03" "ous")) break; return; + case 'v': if (ends("\03" "ive")) break; return; + case 'z': if (ends("\03" "ize")) break; return; + default: return; + } + if (m() > 1) k = j; +} + + +/* step5() removes a final -e if m() > 1, and changes -ll to -l if + m() > 1. */ + +static void step5() +{ + j = k; + if (b[k] == 'e') + { + int a = m(); + if (a > 1 || a == 1 && !cvc(k-1)) k--; + } + if (b[k] == 'l' && doublec(k) && m() > 1) k--; +} + + +/* In stem(p,i,j), p is a char pointer, and the string to be stemmed is from + p[i] to p[j] inclusive. Typically i is zero and j is the offset to the last + character of a string, (p[j+1] == '\0'). The stemmer adjusts the + characters p[i] ... p[j] and returns the new end-point of the string, k. + Stemming never increases word length, so i <= k <= j. To turn the stemmer + into a module, declare 'stem' as extern, and delete the remainder of this + file. +*/ + +int stem(char * p, int i, int j) +{ /* copy the parameters into statics */ + b = p; k = j; k0 = i; + if (k <= k0+1) return k; /*-DEPARTURE-*/ + +/* With this line, strings of length 1 or 2 don't go through the + stemming process, although no mention is made of this in the + published algorithm. Remove the line to match the published + algorithm. */ + + step1ab(); step1c(); step2(); step3(); step4(); step5(); + return k; +} + + +/*--------------------stemmer definition ends here------------------------*/ + +#include <stdio.h> +#include <stdlib.h> /* for malloc, free */ +#include <ctype.h> /* for isupper, islower, tolower */ + +static char * s; /* a char * (=string) pointer; passed into b above */ + +#define INC 50 /* size units in which s is increased */ +static int i_max = INC; /* maximum offset in s */ + +void increase_s() +{ + i_max += INC; + { + char * new_s = (char *) malloc(i_max+1); + { /* copy across */ + int i; for (i = 0; i < i_max; i++) new_s[i] = s[i]; + } + free(s); s = new_s; + } +} + + +#define LETTER(ch) (isupper(ch) || islower(ch)) + +static void stemfile(FILE * f) +{ + while(TRUE) + { + int ch = getc(f); + if (ch == EOF) return; + if (LETTER(ch)) + { + int i = 0; + while(TRUE) + { + if (i == i_max) increase_s(); + + ch = tolower(ch); /* forces lower case */ + + s[i] = ch; i++; + ch = getc(f); + if (!LETTER(ch)) { ungetc(ch,f); break; } + } + s[stem(s,0,i-1)+1] = 0; +/* the previous line calls the stemmer and uses its result to + zero-terminate the string in s */ + printf("%s",s); + } + else putchar(ch); + } +} + +/* + * Commented out as required by amberfish's INSTALL file + * + int main(int argc, char * argv[]) + { + int i; + s = (char *) malloc(i_max+1); + for (i = 1; i < argc; i++) + { + FILE * f = fopen(argv[i],"r"); + if (f == 0) { fprintf(stderr,"File %s not found\n",argv[i]); exit(1); } + stemfile(f); + } + free(s); + return 0; + } +*/ diff --git a/textproc/amberfish/pkg-descr b/textproc/amberfish/pkg-descr new file mode 100644 index 000000000000..6f8318210b1a --- /dev/null +++ b/textproc/amberfish/pkg-descr @@ -0,0 +1,21 @@ +Amberfish is general purpose text retrieval software, developed at Etymon +by Nassib Nassar and distributed as open source software under the terms +of version 2 of the GNU General Public License (GPL). Its distinguishing +features are indexing/search of semi-structured text (i.e. both free tex +and multiply nested fields), built-in support for XML documents using the +Xerces library, structured queries allowing generalized field/tag paths, +hierarchical result sets (XML only), automatic searching across multiple +databases (allowing modular indexing), TREC format results, efficient +indexing, and relatively low memory requirements during indexing (and the +ability to index documents larger than available memory). Z39.50 support +is available. Other features include Boolean queries, right truncation, +phrase searching, relevance ranking, support for multiple documents per +file, incremental indexing, and easy integration with other UNIX tools, +The architecture is also designed to permit proximity queries; however, +they are not fully implemented at present. + +WWW: http://www.etymon.com/tr.html + +This port also includes the Porter stemming algorithm for suffix +stripping, available at: + http://www.tartarus.org/~martin/PorterStemmer diff --git a/textproc/amberfish/pkg-plist b/textproc/amberfish/pkg-plist new file mode 100644 index 000000000000..0844e358fbf2 --- /dev/null +++ b/textproc/amberfish/pkg-plist @@ -0,0 +1,17 @@ +bin/af +%%PORTDOCS%%%%DOCSDIR%%/Fields-and-XML.html +%%PORTDOCS%%%%DOCSDIR%%/Fields.html +%%PORTDOCS%%%%DOCSDIR%%/Indexing.html +%%PORTDOCS%%%%DOCSDIR%%/Introduction.html +%%PORTDOCS%%%%DOCSDIR%%/Listing-database-information.html +%%PORTDOCS%%%%DOCSDIR%%/More-about-XML.html +%%PORTDOCS%%%%DOCSDIR%%/Multiple-databases.html +%%PORTDOCS%%%%DOCSDIR%%/Multiple-documents-in-a-file.html +%%PORTDOCS%%%%DOCSDIR%%/Phrases.html +%%PORTDOCS%%%%DOCSDIR%%/Relevance-ranking.html +%%PORTDOCS%%%%DOCSDIR%%/Right-truncation.html +%%PORTDOCS%%%%DOCSDIR%%/Searching.html +%%PORTDOCS%%%%DOCSDIR%%/Tutorial.html +%%PORTDOCS%%%%DOCSDIR%%/amberfish.png +%%PORTDOCS%%%%DOCSDIR%%/index.html +%%PORTDOCS%%@dirrm %%DOCSDIR%% |