diff options
Diffstat (limited to 'troff/libhnj/hyphen.c')
-rw-r--r-- | troff/libhnj/hyphen.c | 515 |
1 files changed, 515 insertions, 0 deletions
diff --git a/troff/libhnj/hyphen.c b/troff/libhnj/hyphen.c new file mode 100644 index 0000000000000..3d74b3becf8ce --- /dev/null +++ b/troff/libhnj/hyphen.c @@ -0,0 +1,515 @@ +/* LibHnj is dual licensed under LGPL and MPL. Boilerplate for both + * licenses follows. + */ + +/* LibHnj - a library for high quality hyphenation and justification + * Copyright (C) 1998 Raph Levien, + * (C) 2001 ALTLinux, Moscow (http://www.alt-linux.org), + * (C) 2001 Peter Novodvorsky (nidd@cs.msu.su) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public + * License along with this library; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 02111-1307 USA. +*/ + +/* + * The contents of this file are subject to the Mozilla Public License + * Version 1.0 (the "MPL"); you may not use this file except in + * compliance with the MPL. You may obtain a copy of the MPL at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the MPL is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the MPL + * for the specific language governing rights and limitations under the + * MPL. + * + */ + +/* + * Changes by Gunnar Ritter, Freiburg i. Br., Germany, August 2005. + * + * Sccsid @(#)hyphen.c 1.5 (gritter) 4/19/06 + */ + +#include <stdlib.h> /* for NULL, malloc */ +#include <stdio.h> /* for fprintf */ +#include <string.h> /* for strdup */ + +#ifdef UNX +#include <unistd.h> /* for exit */ +#endif + +#define noVERBOSE + +#include "hyphen.h" +#include "hnjalloc.h" + +static char * +hnj_strdup (const char *s, HyphenDict *hp) +{ + char *new; + int l; + + l = strlen (s); + new = hnj_malloc (l + 1, hp); + memcpy (new, s, l); + new[l] = 0; + return new; +} + +/* a little bit of a hash table implementation. This simply maps strings + to state numbers */ + +typedef struct _HashTab HashTab; +typedef struct _HashEntry HashEntry; + +/* A cheap, but effective, hack. */ +#define HASH_SIZE 31627 + +struct _HashTab { + HashEntry *entries[HASH_SIZE]; +}; + +struct _HashEntry { + HashEntry *next; + char *key; + int val; +}; + +/* a char* hash function from ASU - adapted from Gtk+ */ +static unsigned int +hnj_string_hash (const char *s) +{ + const char *p; + unsigned int h=0, g; + + for(p = s; *p != '\0'; p += 1) { + h = ( h << 4 ) + *p; + if ( ( g = h & 0xf0000000 ) ) { + h = h ^ (g >> 24); + h = h ^ g; + } + } + return h /* % M */; +} + +static HashTab * +hnj_hash_new (HyphenDict *hp) +{ + HashTab *hashtab; + int i; + + hashtab = hnj_malloc (sizeof(HashTab), hp); + for (i = 0; i < HASH_SIZE; i++) + hashtab->entries[i] = NULL; + + return hashtab; +} + +static void +hnj_hash_free (HashTab *hashtab, HyphenDict *hp) +{ + int i; + HashEntry *e, *next; + + for (i = 0; i < HASH_SIZE; i++) + for (e = hashtab->entries[i]; e; e = next) + { + next = e->next; + hnj_free (e->key, hp); + hnj_free (e, hp); + } + + hnj_free (hashtab, hp); +} + +/* assumes that key is not already present! */ +static void +hnj_hash_insert (HashTab *hashtab, const char *key, int val, HyphenDict *hp) +{ + int i; + HashEntry *e; + + i = hnj_string_hash (key) % HASH_SIZE; + e = hnj_malloc (sizeof(HashEntry), hp); + e->next = hashtab->entries[i]; + e->key = hnj_strdup (key, hp); + e->val = val; + hashtab->entries[i] = e; +} + +/* return val if found, otherwise -1 */ +static int +hnj_hash_lookup (HashTab *hashtab, const char *key) +{ + int i; + HashEntry *e; + + i = hnj_string_hash (key) % HASH_SIZE; + for (e = hashtab->entries[i]; e; e = e->next) + if (!strcmp (key, e->key)) + return e->val; + return -1; +} + +/* Get the state number, allocating a new state if necessary. */ +static int +hnj_get_state (HyphenDict *dict, HashTab *hashtab, const char *string) +{ + int state_num; + + state_num = hnj_hash_lookup (hashtab, string); + + if (state_num >= 0) + return state_num; + + hnj_hash_insert (hashtab, string, dict->num_states, dict); + /* predicate is true if dict->num_states is a power of two */ + if (dict->num_states >= dict->alc_states) + { + dict->alc_states *= 2; + dict->states = hnj_realloc (dict->states, + dict->alc_states * + sizeof(HyphenState), dict); + } + dict->states[dict->num_states].match = NULL; + dict->states[dict->num_states].fallback_state = -1; + dict->states[dict->num_states].num_trans = 0; + dict->states[dict->num_states].trans = NULL; + return dict->num_states++; +} + +/* add a transition from state1 to state2 through ch - assumes that the + transition does not already exist */ +static void +hnj_add_trans (HyphenDict *dict, int state1, int state2, char ch) +{ + int num_trans; + + num_trans = dict->states[state1].num_trans; + if (num_trans == 0) + { + dict->states[state1].alc_trans = 64; + dict->states[state1].trans = hnj_realloc (NULL, + dict->states[state1].alc_trans * sizeof(HyphenTrans), dict); + } + else if (num_trans >= dict->states[state1].alc_trans) + { + dict->states[state1].alc_trans *= 2; + dict->states[state1].trans = hnj_realloc (dict->states[state1].trans, + dict->states[state1].alc_trans * + sizeof(HyphenTrans), dict); + } + dict->states[state1].trans[num_trans].ch = ch; + dict->states[state1].trans[num_trans].new_state = state2; + dict->states[state1].num_trans++; +} + +#ifdef VERBOSE +HashTab *global; + +static char * +get_state_str (int state) +{ + int i; + HashEntry *e; + + for (i = 0; i < HASH_SIZE; i++) + for (e = global->entries[i]; e; e = e->next) + if (e->val == state) + return e->key; + return NULL; +} +#endif + +HyphenDict * +hnj_hyphen_load (const char *fn) +{ + HyphenDict *dict; + HashTab *hashtab; + FILE *f; + char buf[80]; + char word[80]; + char pattern[80]; + int state_num, last_state; + int i, j; + char ch; + int found; + HashEntry *e; + + f = fopen (fn, "r"); + if (f == NULL) + return NULL; + + hashtab = hnj_hash_new (NULL); +#ifdef VERBOSE + global = hashtab; +#endif + hnj_hash_insert (hashtab, "", 0, NULL); + + dict = hnj_malloc (sizeof(HyphenDict), NULL); + memset(dict, 0, sizeof *dict); + dict->num_states = 1; + dict->alc_states = 12288; + dict->states = hnj_realloc (NULL, dict->alc_states*sizeof(HyphenState), dict); + dict->states[0].match = NULL; + dict->states[0].fallback_state = -1; + dict->states[0].num_trans = 0; + dict->states[0].trans = NULL; + + /* read in character set info */ + for (i=0;i<MAX_NAME;i++) dict->cset[i]= 0; + fgets(dict->cset, sizeof(dict->cset),f); + for (i=0;i<MAX_NAME;i++) + if ((dict->cset[i] == '\r') || (dict->cset[i] == '\n')) + dict->cset[i] = 0; + + while (fgets (buf, sizeof(buf), f) != NULL) + { + if (buf[0] != '%') + { + j = 0; + pattern[j] = '0'; + for (i = 0; ((buf[i] > ' ') || (buf[i] < 0)); i++) + { + if (buf[i] >= '0' && buf[i] <= '9') + pattern[j] = buf[i]; + else + { + word[j] = buf[i]; + pattern[++j] = '0'; + } + } + word[j] = '\0'; + pattern[j + 1] = '\0'; + + /* Optimize away leading zeroes */ + for (i = 0; pattern[i] == '0'; i++); + +#ifdef VERBOSE + printf ("word %s pattern %s, j = %d\n", word, pattern + i, j); +#endif + found = hnj_hash_lookup (hashtab, word); + state_num = hnj_get_state (dict, hashtab, word); + dict->states[state_num].match = hnj_strdup (pattern + i, dict); + + /* now, put in the prefix transitions */ + for (; found < 0 ;j--) + { + last_state = state_num; + ch = word[j - 1]; + word[j - 1] = '\0'; + found = hnj_hash_lookup (hashtab, word); + state_num = hnj_get_state (dict, hashtab, word); + hnj_add_trans (dict, state_num, last_state, ch); + } + } + } + + /* Could do unioning of matches here (instead of the preprocessor script). + If we did, the pseudocode would look something like this: + + foreach state in the hash table + foreach i = [1..length(state) - 1] + state to check is substr (state, i) + look it up + if found, and if there is a match, union the match in. + + It's also possible to avoid the quadratic blowup by doing the + search in order of increasing state string sizes - then you + can break the loop after finding the first match. + + This step should be optional in any case - if there is a + preprocessed rule table, it's always faster to use that. + +*/ + + /* put in the fallback states */ + for (i = 0; i < HASH_SIZE; i++) + for (e = hashtab->entries[i]; e; e = e->next) + { + state_num = -1; + for (j = 1; e->key[j-1]; j++) + { + state_num = hnj_hash_lookup (hashtab, e->key + j); + if (state_num >= 0) + break; + } + /* KBH: FIXME state 0 fallback_state should always be -1? */ + if (e->val && state_num >= 0) + dict->states[e->val].fallback_state = state_num; + } +#ifdef VERBOSE + for (i = 0; i < HASH_SIZE; i++) + for (e = hashtab->entries[i]; e; e = e->next) + { + printf ("%d string %s state %d, fallback=%d\n", i, e->key, e->val, + dict->states[e->val].fallback_state); + for (j = 0; j < dict->states[e->val].num_trans; j++) + printf (" %c->%d\n", dict->states[e->val].trans[j].ch, + dict->states[e->val].trans[j].new_state); + } +#endif + +#ifndef VERBOSE + hnj_hash_free (hashtab, dict); +#endif + + return dict; +} + +void hnj_hyphen_free (HyphenDict *dict) +{ + int state_num; + HyphenState *hstate; + + for (state_num = 0; state_num < dict->num_states; state_num++) + { + hstate = &dict->states[state_num]; + if (hstate->match) + hnj_free (hstate->match, dict); + if (hstate->trans) + hnj_free (hstate->trans, dict); + } + + hnj_free (dict->states, dict); + + hnj_free (dict, NULL); +} + +#define MAX_WORD 256 + +int hnj_hyphen_hyphenate (HyphenDict *dict, + const char *word, int word_size, + char *hyphens) +{ + char prep_word_buf[MAX_WORD]; + char *prep_word; + int i, j, k; + int state; + char ch; + HyphenState *hstate; + char *match; + int offset; + + if (word_size + 3 < MAX_WORD) + prep_word = prep_word_buf; + else + prep_word = hnj_malloc (word_size + 3, dict); + + j = 0; + prep_word[j++] = '.'; + + for (i = 0; i < word_size; i++) + prep_word[j++] = word[i]; + + for (i = 0; i < j; i++) + hyphens[i] = '0'; + + prep_word[j++] = '.'; + + prep_word[j] = '\0'; +#ifdef VERBOSE + printf ("prep_word = %s\n", prep_word); +#endif + + /* now, run the finite state machine */ + state = 0; + for (i = 0; i < j; i++) + { + ch = prep_word[i]; + for (;;) + { + + if (state == -1) { + /* return 1; + * KBH: FIXME shouldn't this be as follows? + */ + state = 0; + goto try_next_letter; + } + +#ifdef VERBOSE + char *state_str; + state_str = get_state_str (state); + + for (k = 0; k < i - strlen (state_str); k++) + putchar (' '); + printf ("%s", state_str); +#endif + + hstate = &dict->states[state]; + for (k = 0; k < hstate->num_trans; k++) + if (hstate->trans[k].ch == ch) + { + state = hstate->trans[k].new_state; + goto found_state; + } + state = hstate->fallback_state; +#ifdef VERBOSE + printf (" falling back, fallback_state %d\n", state); +#endif + } + found_state: +#ifdef VERBOSE + printf ("found state %d\n",state); +#endif + /* Additional optimization is possible here - especially, + elimination of trailing zeroes from the match. Leading zeroes + have already been optimized. */ + match = dict->states[state].match; + if (match) + { + offset = i + 1 - strlen (match); +#ifdef VERBOSE + for (k = 0; k < offset; k++) + putchar (' '); + printf ("%s\n", match); +#endif + /* This is a linear search because I tried a binary search and + found it to be just a teeny bit slower. */ + for (k = 0; match[k]; k++) + if (offset + k < word_size && hyphens[offset + k] < match[k]) + hyphens[offset + k] = match[k]; + } + + /* KBH: we need this to make sure we keep looking in a word + * for patterns even if the current character is not known in state 0 + * since patterns for hyphenation may occur anywhere in the word + */ + try_next_letter: ; + + } +#ifdef VERBOSE + for (i = 0; i < j; i++) + putchar (hyphens[i]); + putchar ('\n'); +#endif + + for (i = 0; i < j - 4; i++) +#if 0 + if (hyphens[i + 1] & 1) + hyphens[i] = '-'; +#else + hyphens[i] = hyphens[i + 1]; +#endif + hyphens[0] = '0'; + for (; i < word_size; i++) + hyphens[i] = '0'; + hyphens[word_size] = '\0'; + + if (prep_word != prep_word_buf) + hnj_free (prep_word, dict); + return 0; +} |