author: Martin Wilke <miwi@FreeBSD.org> 2008-09-30 14:19:57 +0000
committer: Martin Wilke <miwi@FreeBSD.org> 2008-09-30 14:19:57 +0000
commit: a022fa80ecb3fac9fb3cc1a379042bff9151a991 (patch)
tree: 9ad192a51bcf9e1baecc41c3b2d5ba1b917dfc66 /textproc/amberfish
parent: 9a067601932ca870b3f92750140071039f062206 (diff)
download: ports-a022fa80ecb3fac9fb3cc1a379042bff9151a991.tar.gz
ports-a022fa80ecb3fac9fb3cc1a379042bff9151a991.zip
6 files changed, 527 insertions, 0 deletions
diff --git a/textproc/amberfish/Makefile b/textproc/amberfish/Makefile
new file mode 100644
index 000000000000..209489095bce
--- /dev/null
+++ b/textproc/amberfish/Makefile
@@ -0,0 +1,37 @@
+# New ports collection makefile for:   amberfish
+# Date created:			21 Aug 2008
+# Whom:				Pedro Giffuni
+#
+# $FreeBSD$
+#
+
+PORTNAME=	amberfish
+PORTVERSION=	1.6.4
+CATEGORIES=	textproc databases
+MASTER_SITES=	SF	\
+		http://etymon.com/software/amberfish/stable/
+
+MAINTAINER=	giffunip@tutopia.com
+COMMENT=	General purpose text retrieval Software
+
+LIB_DEPENDS=	xerces-c.27:${PORTSDIR}/textproc/xerces-c2
+
+GNU_CONFIGURE=	yes
+USE_GMAKE=	yes
+
+MAN1=	af.1
+
+post-extract:
+	${INSTALL_DATA} ${FILESDIR}/porter.cc ${WRKSRC}/src
+
+.if !defined(NOPORTDOCS)
+post-build:
+	@(cd ${WRKSRC} && ${SETENV} ${MAKE_ENV} ${GMAKE} html)
+
+post-install:
+	${MKDIR} ${DOCSDIR}
+	${INSTALL_DATA} ${WRKSRC}/amberfish.png ${DOCSDIR}
+	${INSTALL_MAN} ${WRKSRC}/doc/html/*.html ${DOCSDIR}
+.endif
+
+.include <bsd.port.mk>
diff --git a/textproc/amberfish/distinfo b/textproc/amberfish/distinfo
new file mode 100644
index 000000000000..c3a92bf33394
--- /dev/null
+++ b/textproc/amberfish/distinfo
@@ -0,0 +1,3 @@
+MD5 (amberfish-1.6.4.tar.gz) = 8eb3f1e26da9d0317719822539c3b932
+SHA256 (amberfish-1.6.4.tar.gz) = 155ac6e6b9b76fb7cbd94952548f718ab6add72c3b4fd2482d89abb39d96ce76
+SIZE (amberfish-1.6.4.tar.gz) = 127198
diff --git a/textproc/amberfish/files/patch-Makefile.in b/textproc/amberfish/files/patch-Makefile.in
new file mode 100644
index 000000000000..138b01a101d7
--- /dev/null
+++ b/textproc/amberfish/files/patch-Makefile.in
@@ -0,0 +1,11 @@
+--- src/Makefile.in.orig	2008-08-12 11:45:08.000000000 -0500
++++ src/Makefile.in	2008-08-12 11:46:07.000000000 -0500
+@@ -66,7 +66,7 @@
+ 	strip af
+ 
+ install: all
+-	make strip
++	${MAKE} strip
+ 	mkdir -p ${PREFIXBIN}
+ 	cp ${BIN} ${PREFIXBIN}/.
+ 
diff --git a/textproc/amberfish/files/porter.cc b/textproc/amberfish/files/porter.cc
new file mode 100644
index 000000000000..a997d88ef24c
--- /dev/null
+++ b/textproc/amberfish/files/porter.cc
@@ -0,0 +1,438 @@
+
+/* This is the Porter stemming algorithm, coded up in ANSI C by the
+   author. It may be be regarded as cononical, in that it follows the
+   algorithm presented in
+
+   Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
+   no. 3, pp 130-137,
+
+   only differing from it at the points maked --DEPARTURE-- below.
+
+   See also http://www.tartarus.org/~martin/PorterStemmer
+
+The algorithm as described in the paper could be exactly replicated
+by adjusting the points of DEPARTURE, but this is barely necessary,
+because (a) the points of DEPARTURE are definitely improvements, and
+(b) no encoding of the Porter stemmer I have seen is anything like
+as exact as this version, even with the points of DEPARTURE!
+
+You can compile it on Unix with 'gcc -O3 -o stem stem.c' after which
+'stem' takes a list of inputs and sends the stemmed equivalent to
+stdout.
+
+The algorithm as encoded here is particularly fast.
+
+Release 1
+*/
+
+#include <string.h>                               /* for memmove */
+
+#define TRUE 1
+#define FALSE 0
+
+/* The main part of the stemming algorithm starts here. b is a buffer
+   holding a word to be stemmed. The letters are in b[k0], b[k0+1] ...
+   ending at b[k]. In fact k0 = 0 in this demo program. k is readjusted
+   downwards as the stemming progresses. Zero termination is not in fact
+   used in the algorithm.
+
+   Note that only lower case sequences are stemmed. Forcing to lower case
+   should be done before stem(...) is called.
+*/
+
+static char * b;                                  /* buffer for word to be stemmed */
+static int k,k0,j;                                /* j is a general offset into the string */
+
+/* cons(i) is TRUE <=> b[i] is a consonant. */
+
+static int cons(int i)
+{
+    switch (b[i])
+    {
+        case 'a': case 'e': case 'i': case 'o': case 'u': return FALSE;
+        case 'y': return (i==k0) ? TRUE : !cons(i-1);
+        default: return TRUE;
+    }
+}
+
+
+/* m() measures the number of consonant sequences between k0 and j. if c is
+   a consonant sequence and v a vowel sequence, and <..> indicates arbitrary
+   presence,
+
+      <c><v>       gives 0
+      <c>vc<v>     gives 1
+      <c>vcvc<v>   gives 2
+      <c>vcvcvc<v> gives 3
+      ....
+*/
+
+static int m()
+{
+    int n = 0;
+    int i = k0;
+    while(TRUE)
+    {
+        if (i > j) return n;
+        if (! cons(i)) break; i++;
+    }
+    i++;
+    while(TRUE)
+    {
+        while(TRUE)
+        {
+            if (i > j) return n;
+            if (cons(i)) break;
+            i++;
+        }
+        i++;
+        n++;
+        while(TRUE)
+        {
+            if (i > j) return n;
+            if (! cons(i)) break;
+            i++;
+        }
+        i++;
+    }
+}
+
+
+/* vowelinstem() is TRUE <=> k0,...j contains a vowel */
+
+static int vowelinstem()
+{
+    int i; for (i = k0; i <= j; i++) if (! cons(i)) return TRUE;
+    return FALSE;
+}
+
+
+/* doublec(j) is TRUE <=> j,(j-1) contain a double consonant. */
+
+static int doublec(int j)
+{
+    if (j < k0+1) return FALSE;
+    if (b[j] != b[j-1]) return FALSE;
+    return cons(j);
+}
+
+
+/* cvc(i) is TRUE <=> i-2,i-1,i has the form consonant - vowel - consonant
+   and also if the second c is not w,x or y. this is used when trying to
+   restore an e at the end of a short word. e.g.
+
+      cav(e), lov(e), hop(e), crim(e), but
+      snow, box, tray.
+
+*/
+
+static int cvc(int i)
+{
+    if (i < k0+2 || !cons(i) || cons(i-1) || !cons(i-2)) return FALSE;
+    {
+        int ch = b[i];
+        if (ch == 'w' || ch == 'x' || ch == 'y') return FALSE;
+    }
+    return TRUE;
+}
+
+
+/* ends(s) is TRUE <=> k0,...k ends with the string s. */
+
+static int ends(char * s)
+{
+    int length = s[0];
+    if (s[length] != b[k]) return FALSE;          /* tiny speed-up */
+    if (length > k-k0+1) return FALSE;
+    if (memcmp(b+k-length+1,s+1,length) != 0) return FALSE;
+    j = k-length;
+    return TRUE;
+}
+
+
+/* setto(s) sets (j+1),...k to the characters in the string s, readjusting
+   k. */
+
+static void setto(char * s)
+{
+    int length = s[0];
+    memmove(b+j+1,s+1,length);
+    k = j+length;
+}
+
+
+/* r(s) is used further down. */
+
+static void r(char * s) { if (m() > 0) setto(s); }
+
+/* step1ab() gets rid of plurals and -ed or -ing. e.g.
+
+	caresses  ->  caress
+	ponies    ->  poni
+	ties      ->  ti
+	caress    ->  caress
+	cats      ->  cat
+
+	feed      ->  feed
+	agreed    ->  agree
+	disabled  ->  disable
+
+	matting   ->  mat
+	mating    ->  mate
+	meeting   ->  meet
+	milling   ->  mill
+	messing   ->  mess
+
+	meetings  ->  meet
+
+*/
+
+static void step1ab()
+{
+    if (b[k] == 's')
+    {
+        if (ends("\04" "sses")) k -= 2; else
+            if (ends("\03" "ies")) setto("\01" "i"); else
+                if (b[k-1] != 's') k--;
+    }
+    if (ends("\03" "eed")) { if (m() > 0) k--; }
+    else
+    if ((ends("\02" "ed") || ends("\03" "ing")) && vowelinstem())
+    {
+        k = j;
+        if (ends("\02" "at")) setto("\03" "ate"); else
+            if (ends("\02" "bl")) setto("\03" "ble"); else
+                if (ends("\02" "iz")) setto("\03" "ize"); else
+                    if (doublec(k))
+                    {
+                        k--;
+                        {
+                            int ch = b[k];
+                            if (ch == 'l' || ch == 's' || ch == 'z') k++;
+                        }
+                    }
+        else if (m() == 1 && cvc(k)) setto("\01" "e");
+    }
+}
+
+
+/* step1c() turns terminal y to i when there is another vowel in the stem. */
+
+static void step1c() { if (ends("\01" "y") && vowelinstem()) b[k] = 'i'; }
+
+/* step2() maps double suffices to single ones. so -ization ( = -ize plus
+   -ation) maps to -ize etc. note that the string before the suffix must give
+   m() > 0. */
+
+static void step2()
+{
+    switch (b[k-1])
+    {
+        case 'a': if (ends("\07" "ational")) { r("\03" "ate"); break; }
+        if (ends("\06" "tional")) { r("\04" "tion"); break; }
+        break;
+        case 'c': if (ends("\04" "enci")) { r("\04" "ence"); break; }
+        if (ends("\04" "anci")) { r("\04" "ance"); break; }
+        break;
+        case 'e': if (ends("\04" "izer")) { r("\03" "ize"); break; }
+        break;
+        case 'l': if (ends("\03" "bli"))          /*-DEPARTURE-*/
+        {
+            r("\03" "ble"); break;
+        }
+
+/* To match the published algorithm, replace this line with
+   case 'l': if (ends("\04" "abli")) { r("\04" "able"); break; } */
+
+        if (ends("\04" "alli")) { r("\02" "al"); break; }
+        if (ends("\05" "entli")) { r("\03" "ent"); break; }
+        if (ends("\03" "eli")) { r("\01" "e"); break; }
+        if (ends("\05" "ousli")) { r("\03" "ous"); break; }
+        break;
+        case 'o': if (ends("\07" "ization")) { r("\03" "ize"); break; }
+        if (ends("\05" "ation")) { r("\03" "ate"); break; }
+        if (ends("\04" "ator")) { r("\03" "ate"); break; }
+        break;
+        case 's': if (ends("\05" "alism")) { r("\02" "al"); break; }
+        if (ends("\07" "iveness")) { r("\03" "ive"); break; }
+        if (ends("\07" "fulness")) { r("\03" "ful"); break; }
+        if (ends("\07" "ousness")) { r("\03" "ous"); break; }
+        break;
+        case 't': if (ends("\05" "aliti")) { r("\02" "al"); break; }
+        if (ends("\05" "iviti")) { r("\03" "ive"); break; }
+        if (ends("\06" "biliti")) { r("\03" "ble"); break; }
+        break;
+        case 'g': if (ends("\04" "logi"))         /*-DEPARTURE-*/
+        {
+            r("\03" "log"); break;
+        }
+
+/* To match the published algorithm, delete this line */
+
+    }
+}
+
+
+/* step3() deals with -ic-, -full, -ness etc. similar strategy to step2. */
+
+static void step3()
+{
+    switch (b[k])
+    {
+        case 'e': if (ends("\05" "icate")) { r("\02" "ic"); break; }
+        if (ends("\05" "ative")) { r("\00" ""); break; }
+        if (ends("\05" "alize")) { r("\02" "al"); break; }
+        break;
+        case 'i': if (ends("\05" "iciti")) { r("\02" "ic"); break; }
+        break;
+        case 'l': if (ends("\04" "ical")) { r("\02" "ic"); break; }
+        if (ends("\03" "ful")) { r("\00" ""); break; }
+        break;
+        case 's': if (ends("\04" "ness")) { r("\00" ""); break; }
+        break;
+    }
+}
+
+
+/* step4() takes off -ant, -ence etc., in context <c>vcvc<v>. */
+
+static void step4()
+{
+    switch (b[k-1])
+    {
+        case 'a': if (ends("\02" "al")) break; return;
+        case 'c': if (ends("\04" "ance")) break;
+        if (ends("\04" "ence")) break; return;
+        case 'e': if (ends("\02" "er")) break; return;
+        case 'i': if (ends("\02" "ic")) break; return;
+        case 'l': if (ends("\04" "able")) break;
+        if (ends("\04" "ible")) break; return;
+        case 'n': if (ends("\03" "ant")) break;
+        if (ends("\05" "ement")) break;
+        if (ends("\04" "ment")) break;
+        if (ends("\03" "ent")) break; return;
+        case 'o': if (ends("\03" "ion") && (b[j] == 's' || b[j] == 't')) break;
+        if (ends("\02" "ou")) break; return;
+/* takes care of -ous */
+        case 's': if (ends("\03" "ism")) break; return;
+        case 't': if (ends("\03" "ate")) break;
+        if (ends("\03" "iti")) break; return;
+        case 'u': if (ends("\03" "ous")) break; return;
+        case 'v': if (ends("\03" "ive")) break; return;
+        case 'z': if (ends("\03" "ize")) break; return;
+        default: return;
+    }
+    if (m() > 1) k = j;
+}
+
+
+/* step5() removes a final -e if m() > 1, and changes -ll to -l if
+   m() > 1. */
+
+static void step5()
+{
+    j = k;
+    if (b[k] == 'e')
+    {
+        int a = m();
+        if (a > 1 || a == 1 && !cvc(k-1)) k--;
+    }
+    if (b[k] == 'l' && doublec(k) && m() > 1) k--;
+}
+
+
+/* In stem(p,i,j), p is a char pointer, and the string to be stemmed is from
+   p[i] to p[j] inclusive. Typically i is zero and j is the offset to the last
+   character of a string, (p[j+1] == '\0'). The stemmer adjusts the
+   characters p[i] ... p[j] and returns the new end-point of the string, k.
+   Stemming never increases word length, so i <= k <= j. To turn the stemmer
+   into a module, declare 'stem' as extern, and delete the remainder of this
+   file.
+*/
+
+int stem(char * p, int i, int j)
+{                                                 /* copy the parameters into statics */
+    b = p; k = j; k0 = i;
+    if (k <= k0+1) return k;                      /*-DEPARTURE-*/
+
+/* With this line, strings of length 1 or 2 don't go through the
+   stemming process, although no mention is made of this in the
+   published algorithm. Remove the line to match the published
+   algorithm. */
+
+    step1ab(); step1c(); step2(); step3(); step4(); step5();
+    return k;
+}
+
+
+/*--------------------stemmer definition ends here------------------------*/
+
+#include <stdio.h>
+#include <stdlib.h>                               /* for malloc, free */
+#include <ctype.h>                                /* for isupper, islower, tolower */
+
+static char * s;                                  /* a char * (=string) pointer; passed into b above */
+
+#define INC 50                                    /* size units in which s is increased */
+static int i_max = INC;                           /* maximum offset in s */
+
+void increase_s()
+{
+    i_max += INC;
+    {
+        char * new_s = (char *) malloc(i_max+1);
+        {                                         /* copy across */
+            int i; for (i = 0; i < i_max; i++) new_s[i] = s[i];
+        }
+        free(s); s = new_s;
+    }
+}
+
+
+#define LETTER(ch) (isupper(ch) || islower(ch))
+
+static void stemfile(FILE * f)
+{
+    while(TRUE)
+    {
+        int ch = getc(f);
+        if (ch == EOF) return;
+        if (LETTER(ch))
+        {
+            int i = 0;
+            while(TRUE)
+            {
+                if (i == i_max) increase_s();
+
+                ch = tolower(ch);                 /* forces lower case */
+
+                s[i] = ch; i++;
+                ch = getc(f);
+                if (!LETTER(ch)) { ungetc(ch,f); break; }
+            }
+            s[stem(s,0,i-1)+1] = 0;
+/* the previous line calls the stemmer and uses its result to
+   zero-terminate the string in s */
+            printf("%s",s);
+        }
+        else putchar(ch);
+    }
+}
+
+/*
+ * Commented out as required by amberfish's INSTALL file
+ *
+	int main(int argc, char * argv[])
+	{
+	    int i;
+	    s = (char *) malloc(i_max+1);
+	    for (i = 1; i < argc; i++)
+	    {
+	        FILE * f = fopen(argv[i],"r");
+	        if (f == 0) { fprintf(stderr,"File %s not found\n",argv[i]); exit(1); }
+	        stemfile(f);
+	    }
+	    free(s);
+	    return 0;
+	}
+*/
diff --git a/textproc/amberfish/pkg-descr b/textproc/amberfish/pkg-descr
new file mode 100644
index 000000000000..6f8318210b1a
--- /dev/null
+++ b/textproc/amberfish/pkg-descr
@@ -0,0 +1,21 @@
+Amberfish is general purpose text retrieval software, developed at Etymon
+by Nassib Nassar and distributed as open source software under the terms
+of version 2 of the GNU General Public License (GPL). Its distinguishing
+features are indexing/search of semi-structured text (i.e. both free tex
+and multiply nested fields), built-in support for XML documents using the
+Xerces library, structured queries allowing generalized field/tag paths,
+hierarchical result sets (XML only), automatic searching across multiple
+databases (allowing modular indexing), TREC format results, efficient
+indexing, and relatively low memory requirements during indexing (and the
+ability to index documents larger than available memory). Z39.50 support
+is available. Other features include Boolean queries, right truncation,
+phrase searching, relevance ranking, support for multiple documents per
+file, incremental indexing, and easy integration with other UNIX tools,
+The architecture is also designed to permit proximity queries; however,
+they are not fully implemented at present.
+
+WWW: http://www.etymon.com/tr.html
+
+This port also includes the Porter stemming algorithm for suffix
+stripping, available at:
+     http://www.tartarus.org/~martin/PorterStemmer
diff --git a/textproc/amberfish/pkg-plist b/textproc/amberfish/pkg-plist
new file mode 100644
index 000000000000..0844e358fbf2
--- /dev/null
+++ b/textproc/amberfish/pkg-plist
@@ -0,0 +1,17 @@
+bin/af
+%%PORTDOCS%%%%DOCSDIR%%/Fields-and-XML.html
+%%PORTDOCS%%%%DOCSDIR%%/Fields.html
+%%PORTDOCS%%%%DOCSDIR%%/Indexing.html
+%%PORTDOCS%%%%DOCSDIR%%/Introduction.html
+%%PORTDOCS%%%%DOCSDIR%%/Listing-database-information.html
+%%PORTDOCS%%%%DOCSDIR%%/More-about-XML.html
+%%PORTDOCS%%%%DOCSDIR%%/Multiple-databases.html
+%%PORTDOCS%%%%DOCSDIR%%/Multiple-documents-in-a-file.html
+%%PORTDOCS%%%%DOCSDIR%%/Phrases.html
+%%PORTDOCS%%%%DOCSDIR%%/Relevance-ranking.html
+%%PORTDOCS%%%%DOCSDIR%%/Right-truncation.html
+%%PORTDOCS%%%%DOCSDIR%%/Searching.html
+%%PORTDOCS%%%%DOCSDIR%%/Tutorial.html
+%%PORTDOCS%%%%DOCSDIR%%/amberfish.png
+%%PORTDOCS%%%%DOCSDIR%%/index.html
+%%PORTDOCS%%@dirrm %%DOCSDIR%%
author	Martin Wilke <miwi@FreeBSD.org>	2008-09-30 14:19:57 +0000
committer	Martin Wilke <miwi@FreeBSD.org>	2008-09-30 14:19:57 +0000
commit	a022fa80ecb3fac9fb3cc1a379042bff9151a991 (patch)
tree	9ad192a51bcf9e1baecc41c3b2d5ba1b917dfc66 /textproc/amberfish
parent	9a067601932ca870b3f92750140071039f062206 (diff)
download	ports-a022fa80ecb3fac9fb3cc1a379042bff9151a991.tar.gz ports-a022fa80ecb3fac9fb3cc1a379042bff9151a991.zip