diff options
Diffstat (limited to 'gnu/usr.bin/groff/refer/token.cc')
| -rw-r--r-- | gnu/usr.bin/groff/refer/token.cc | 370 | 
1 files changed, 370 insertions, 0 deletions
diff --git a/gnu/usr.bin/groff/refer/token.cc b/gnu/usr.bin/groff/refer/token.cc new file mode 100644 index 000000000000..8847081bd482 --- /dev/null +++ b/gnu/usr.bin/groff/refer/token.cc @@ -0,0 +1,370 @@ +// -*- C++ -*- +/* Copyright (C) 1989, 1990, 1991, 1992 Free Software Foundation, Inc. +     Written by James Clark (jjc@jclark.com) + +This file is part of groff. + +groff is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 2, or (at your option) any later +version. + +groff is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License along +with groff; see the file COPYING.  If not, write to the Free Software +Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. */ + +#include "refer.h" +#include "token.h" + +#define TOKEN_TABLE_SIZE 1009 +// I believe in Icelandic thorn sorts after z. +#define THORN_SORT_KEY "{" + +struct token_table_entry { +  const char *tok; +  token_info ti; +  token_table_entry(); +}; + +token_table_entry token_table[TOKEN_TABLE_SIZE]; +int ntokens = 0; + +static void skip_name(const char **ptr, const char *end) +{ +  if (*ptr < end) { +    switch (*(*ptr)++) { +    case '(': +      if (*ptr < end) { +	*ptr += 1; +	if (*ptr < end) +	  *ptr += 1; +      } +      break; +    case '[': +      while (*ptr < end) +	if (*(*ptr)++ == ']') +	  break; +      break; +    } +  } +} + +int get_token(const char **ptr, const char *end) +{ +  if (*ptr >= end) +    return 0; +  char c = *(*ptr)++; +  if (c == '\\' && *ptr < end) { +    switch (**ptr) { +    default: +      *ptr += 1; +      break; +    case '(': +    case '[': +      skip_name(ptr, end); +      break; +    case '*': +    case 'f': +      *ptr += 1; +      skip_name(ptr, end); +      break; +    } +  } +  return 1; +} + +token_info::token_info() +: type(TOKEN_OTHER), sort_key(0), other_case(0) +{ +} + +void token_info::set(token_type t, const char *sk, const char *oc) +{ +  assert(oc == 0 || t == TOKEN_UPPER || t == TOKEN_LOWER); +  type = t; +  sort_key = sk; +  other_case = oc; +} + +void token_info::sortify(const char *start, const char *end, string &result) +     const +{ +  if (sort_key) +    result += sort_key; +  else if (type == TOKEN_UPPER || type == TOKEN_LOWER) { +    for (; start < end; start++) +      if (csalpha(*start)) +	result += cmlower(*start); +  } +} + +int token_info::sortify_non_empty(const char *start, const char *end) const +{ +  if (sort_key) +    return *sort_key != '\0'; +  if (type != TOKEN_UPPER && type != TOKEN_LOWER) +    return 0; +  for (; start < end; start++) +    if (csalpha(*start)) +      return 1; +  return 0; +} + + +void token_info::lower_case(const char *start, const char *end, +			    string &result) const +{ +  if (type != TOKEN_UPPER) { +    while (start < end) +      result += *start++; +  } +  else if (other_case) +    result += other_case; +  else { +    while (start < end) +      result += cmlower(*start++); +  } +} + +void token_info::upper_case(const char *start, const char *end, +			    string &result) const +{ +  if (type != TOKEN_LOWER) { +    while (start < end) +      result += *start++; +  } +  else if (other_case) +    result += other_case; +  else { +    while (start < end) +      result += cmupper(*start++); +  } +} + +token_table_entry::token_table_entry() +: tok(0) +{ +} + +static void store_token(const char *tok, token_type typ, +			const char *sk = 0, const char *oc = 0) +{ +  unsigned n = hash_string(tok, strlen(tok)) % TOKEN_TABLE_SIZE; +  while (n >= 0) { +    if (token_table[n].tok == 0) { +      if (++ntokens == TOKEN_TABLE_SIZE) +	assert(0); +      token_table[n].tok = tok; +      break; +    } +    if (strcmp(tok, token_table[n].tok) == 0) +      break; +    if (--n < 0) +      n = TOKEN_TABLE_SIZE - 1; +  } +  token_table[n].ti.set(typ, sk, oc); +} + + +token_info default_token_info; + +const token_info *lookup_token(const char *start, const char *end) +{ +  unsigned n = hash_string(start, end - start) % TOKEN_TABLE_SIZE; +  while (n >= 0) { +    if (token_table[n].tok == 0) +      break; +    if (strlen(token_table[n].tok) == end - start +	&& memcmp(token_table[n].tok, start, end - start) == 0) +      return &(token_table[n].ti); +    if (--n < 0) +      n = TOKEN_TABLE_SIZE - 1; +  } +  return &default_token_info; +} + +static void init_ascii() +{ +  for (const char *p = "abcdefghijklmnopqrstuvwxyz"; *p; p++) { +    char buf[2]; +    buf[0] = *p; +    buf[1] = '\0'; +    store_token(strsave(buf), TOKEN_LOWER); +    buf[0] = cmupper(buf[0]); +    store_token(strsave(buf), TOKEN_UPPER); +  } +  for (p = "0123456789"; *p; p++) { +    char buf[2]; +    buf[0] = *p; +    buf[1] = '\0'; +    const char *s = strsave(buf); +    store_token(s, TOKEN_OTHER, s); +  } +  for (p = ".,:;?!"; *p; p++) { +    char buf[2]; +    buf[0] = *p; +    buf[1] = '\0'; +    store_token(strsave(buf), TOKEN_PUNCT); +  } +  store_token("-", TOKEN_HYPHEN); +} + +static void store_letter(const char *lower, const char *upper, +		  const char *sort_key = 0) +{ +  store_token(lower, TOKEN_LOWER, sort_key, upper); +  store_token(upper, TOKEN_UPPER, sort_key, lower); +} + +static void init_letter(unsigned char uc_code, unsigned char lc_code, +		 const char *sort_key) +{ +  char lbuf[2]; +  lbuf[0] = lc_code; +  lbuf[1] = 0; +  char ubuf[2]; +  ubuf[0] = uc_code; +  ubuf[1] = 0; +  store_letter(strsave(lbuf), strsave(ubuf), sort_key); +} + +static void init_latin1() +{ +  init_letter(0xc0, 0xe0, "a"); +  init_letter(0xc1, 0xe1, "a"); +  init_letter(0xc2, 0xe2, "a"); +  init_letter(0xc3, 0xe3, "a"); +  init_letter(0xc4, 0xe4, "a"); +  init_letter(0xc5, 0xe5, "a"); +  init_letter(0xc6, 0xe6, "ae"); +  init_letter(0xc7, 0xe7, "c"); +  init_letter(0xc8, 0xe8, "e"); +  init_letter(0xc9, 0xe9, "e"); +  init_letter(0xca, 0xea, "e"); +  init_letter(0xcb, 0xeb, "e"); +  init_letter(0xcc, 0xec, "i"); +  init_letter(0xcd, 0xed, "i"); +  init_letter(0xce, 0xee, "i"); +  init_letter(0xcf, 0xef, "i"); + +  init_letter(0xd0, 0xf0, "d"); +  init_letter(0xd1, 0xf1, "n"); +  init_letter(0xd2, 0xf2, "o"); +  init_letter(0xd3, 0xf3, "o"); +  init_letter(0xd4, 0xf4, "o"); +  init_letter(0xd5, 0xf5, "o"); +  init_letter(0xd6, 0xf6, "o"); +  init_letter(0xd8, 0xf8, "o"); +  init_letter(0xd9, 0xf9, "u"); +  init_letter(0xda, 0xfa, "u"); +  init_letter(0xdb, 0xfb, "u"); +  init_letter(0xdc, 0xfc, "u"); +  init_letter(0xdd, 0xfd, "y"); +  init_letter(0xde, 0xfe, THORN_SORT_KEY); + +  store_token("\337", TOKEN_LOWER, "ss", "SS"); +  store_token("\377", TOKEN_LOWER, "y", "Y"); +} + +static void init_two_char_letter(char l1, char l2, char u1, char u2, +				 const char *sk = 0) +{ +  char buf[6]; +  buf[0] = '\\'; +  buf[1] = '('; +  buf[2] = l1; +  buf[3] = l2; +  buf[4] = '\0'; +  const char *p = strsave(buf); +  buf[2] = u1; +  buf[3] = u2; +  store_letter(p, strsave(buf), sk); +  buf[1] = '['; +  buf[4] = ']'; +  buf[5] = '\0'; +  p = strsave(buf); +  buf[2] = l1; +  buf[3] = l2; +  store_letter(strsave(buf), p, sk); +   +} + +static void init_special_chars() +{ +  for (const char *p = "':^`~"; *p; p++) +    for (const char *q = "aeiouy"; *q; q++) { +      // Use a variable to work around bug in gcc 2.0 +      char c = cmupper(*q); +      init_two_char_letter(*p, *q, *p, c); +    } +  for (p = "/l/o~n,coeaeij"; *p; p += 2) { +    // Use variables to work around bug in gcc 2.0 +    char c0 = cmupper(p[0]); +    char c1 = cmupper(p[1]); +    init_two_char_letter(p[0], p[1], c0, c1); +  } +  init_two_char_letter('v', 's', 'v', 'S', "s"); +  init_two_char_letter('v', 'z', 'v', 'Z', "z"); +  init_two_char_letter('o', 'a', 'o', 'A', "a"); +  init_two_char_letter('T', 'p', 'T', 'P', THORN_SORT_KEY); +  init_two_char_letter('-', 'd', '-', 'D'); +   +  store_token("\\(ss", TOKEN_LOWER, 0, "SS"); +  store_token("\\[ss]", TOKEN_LOWER, 0, "SS"); + +  store_token("\\(Sd", TOKEN_LOWER, "d", "\\(-D"); +  store_token("\\[Sd]", TOKEN_LOWER, "d", "\\[-D]"); +  store_token("\\(hy", TOKEN_HYPHEN); +  store_token("\\[hy]", TOKEN_HYPHEN); +} + +static void init_strings() +{ +  char buf[6]; +  buf[0] = '\\'; +  buf[1] = '*'; +  for (const char *p = "'`^^,:~v_o./;"; *p; p++) { +    buf[2] = *p; +    buf[3] = '\0'; +    store_token(strsave(buf), TOKEN_ACCENT); +    buf[2] = '['; +    buf[3] = *p; +    buf[4] = ']'; +    buf[5] = '\0'; +    store_token(strsave(buf), TOKEN_ACCENT); +  } + +  // -ms special letters +  store_letter("\\*(th", "\\*(Th", THORN_SORT_KEY); +  store_letter("\\*[th]", "\\*[Th]", THORN_SORT_KEY); +  store_letter("\\*(d-", "\\*(D-"); +  store_letter("\\*[d-]", "\\*[D-]"); +  store_letter("\\*(ae", "\\*(Ae", "ae"); +  store_letter("\\*[ae]", "\\*[Ae]", "ae"); +  store_letter("\\*(oe", "\\*(Oe", "oe"); +  store_letter("\\*[oe]", "\\*[Oe]", "oe"); + +  store_token("\\*3", TOKEN_LOWER, "y", "Y"); +  store_token("\\*8", TOKEN_LOWER, "ss", "SS"); +  store_token("\\*q", TOKEN_LOWER, "o", "O"); +} + +struct token_initer { +  token_initer(); +}; + +static token_initer the_token_initer; + +token_initer::token_initer() +{ +  init_ascii(); +  init_latin1(); +  init_special_chars(); +  init_strings(); +  default_token_info.set(TOKEN_OTHER); +}  | 
