diff options
Diffstat (limited to 'gnu/usr.bin/groff/refer')
-rw-r--r-- | gnu/usr.bin/groff/refer/Makefile | 15 | ||||
-rw-r--r-- | gnu/usr.bin/groff/refer/command.cc | 807 | ||||
-rw-r--r-- | gnu/usr.bin/groff/refer/command.h | 36 | ||||
-rw-r--r-- | gnu/usr.bin/groff/refer/label.y | 1173 | ||||
-rw-r--r-- | gnu/usr.bin/groff/refer/ref.cc | 1144 | ||||
-rw-r--r-- | gnu/usr.bin/groff/refer/ref.h | 120 | ||||
-rw-r--r-- | gnu/usr.bin/groff/refer/refer.1 | 1282 | ||||
-rw-r--r-- | gnu/usr.bin/groff/refer/refer.cc | 1221 | ||||
-rw-r--r-- | gnu/usr.bin/groff/refer/refer.h | 78 | ||||
-rw-r--r-- | gnu/usr.bin/groff/refer/token.cc | 370 | ||||
-rw-r--r-- | gnu/usr.bin/groff/refer/token.h | 81 |
11 files changed, 6327 insertions, 0 deletions
diff --git a/gnu/usr.bin/groff/refer/Makefile b/gnu/usr.bin/groff/refer/Makefile new file mode 100644 index 000000000000..af1d242e3f45 --- /dev/null +++ b/gnu/usr.bin/groff/refer/Makefile @@ -0,0 +1,15 @@ +# Makefile for refer + +PROG= refer +SRCS= command.cc ref.cc refer.cc token.cc +OBJS= label.o +CFLAGS+= -I. -I$(.CURDIR)/../include +CXXFLAGS+= -I. -I$(.CURDIR)/../include +LDADD+= $(LIBBIB) $(LIBGROFF) -lm +DPADD+= $(LIBBIB) $(LIBGROFF) $(LIBMATH) + +CLEANFILES+= label.cc label.tab.h + +.include "../../../usr.bin/Makefile.inc" +.include "../Makefile.cfg" +.include <bsd.prog.mk> diff --git a/gnu/usr.bin/groff/refer/command.cc b/gnu/usr.bin/groff/refer/command.cc new file mode 100644 index 000000000000..93b6cfe95e4b --- /dev/null +++ b/gnu/usr.bin/groff/refer/command.cc @@ -0,0 +1,807 @@ +// -*- C++ -*- +/* Copyright (C) 1989, 1990, 1991, 1992 Free Software Foundation, Inc. + Written by James Clark (jjc@jclark.com) + +This file is part of groff. + +groff is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 2, or (at your option) any later +version. + +groff is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License along +with groff; see the file COPYING. If not, write to the Free Software +Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. */ + +#include "refer.h" +#include "refid.h" +#include "search.h" +#include "command.h" + +cset cs_field_name = csalpha; + +class input_item { + input_item *next; + char *filename; + int first_lineno; + string buffer; + const char *ptr; + const char *end; +public: + input_item(string &, const char *, int = 1); + ~input_item(); + int get_char(); + int peek_char(); + void skip_char(); + int get_location(const char **, int *); + + friend class input_stack; +}; + +input_item::input_item(string &s, const char *fn, int ln) +: filename(strsave(fn)), first_lineno(ln) +{ + buffer.move(s); + ptr = buffer.contents(); + end = ptr + buffer.length(); +} + +input_item::~input_item() +{ + a_delete filename; +} + +inline int input_item::peek_char() +{ + if (ptr >= end) + return EOF; + else + return (unsigned char)*ptr; +} + +inline int input_item::get_char() +{ + if (ptr >= end) + return EOF; + else + return (unsigned char)*ptr++; +} + +inline void input_item::skip_char() +{ + ptr++; +} + +int input_item::get_location(const char **filenamep, int *linenop) +{ + *filenamep = filename; + if (ptr == buffer.contents()) + *linenop = first_lineno; + else { + int ln = first_lineno; + const char *e = ptr - 1; + for (const char *p = buffer.contents(); p < e; p++) + if (*p == '\n') + ln++; + *linenop = ln; + } + return 1; +} + +class input_stack { + static input_item *top; +public: + static void init(); + static int get_char(); + static int peek_char(); + static void skip_char() { top->skip_char(); } + static void push_file(const char *); + static void push_string(string &, const char *, int); + static void error(const char *format, + const errarg &arg1 = empty_errarg, + const errarg &arg2 = empty_errarg, + const errarg &arg3 = empty_errarg); +}; + +input_item *input_stack::top = 0; + +void input_stack::init() +{ + while (top) { + input_item *tem = top; + top = top->next; + delete tem; + } +} + +int input_stack::get_char() +{ + while (top) { + int c = top->get_char(); + if (c >= 0) + return c; + input_item *tem = top; + top = top->next; + delete tem; + } + return -1; +} + +int input_stack::peek_char() +{ + while (top) { + int c = top->peek_char(); + if (c >= 0) + return c; + input_item *tem = top; + top = top->next; + delete tem; + } + return -1; +} + +void input_stack::push_file(const char *fn) +{ + FILE *fp; + if (strcmp(fn, "-") == 0) { + fp = stdin; + fn = "<standard input>"; + } + else { + errno = 0; + fp = fopen(fn, "r"); + if (fp == 0) { + error("can't open `%1': %2", fn, strerror(errno)); + return; + } + } + string buf; + int bol = 1; + int lineno = 1; + for (;;) { + int c = getc(fp); + if (bol && c == '.') { + // replace lines beginning with .R1 or .R2 with a blank line + c = getc(fp); + if (c == 'R') { + c = getc(fp); + if (c == '1' || c == '2') { + int cc = c; + c = getc(fp); + if (compatible_flag || c == ' ' || c == '\n' || c == EOF) { + while (c != '\n' && c != EOF) + c = getc(fp); + } + else { + buf += '.'; + buf += 'R'; + buf += cc; + } + } + else { + buf += '.'; + buf += 'R'; + } + } + else + buf += '.'; + } + if (c == EOF) + break; + if (illegal_input_char(c)) + error_with_file_and_line(fn, lineno, + "illegal input character code %1", int(c)); + else { + buf += c; + if (c == '\n') { + bol = 1; + lineno++; + } + else + bol = 0; + } + } + if (fp != stdin) + fclose(fp); + if (buf.length() > 0 && buf[buf.length() - 1] != '\n') + buf += '\n'; + input_item *it = new input_item(buf, fn); + it->next = top; + top = it; +} + +void input_stack::push_string(string &s, const char *filename, int lineno) +{ + input_item *it = new input_item(s, filename, lineno); + it->next = top; + top = it; +} + +void input_stack::error(const char *format, const errarg &arg1, + const errarg &arg2, const errarg &arg3) +{ + const char *filename; + int lineno; + for (input_item *it = top; it; it = it->next) + if (it->get_location(&filename, &lineno)) { + error_with_file_and_line(filename, lineno, format, arg1, arg2, arg3); + return; + } + ::error(format, arg1, arg2, arg3); +} + +void command_error(const char *format, const errarg &arg1, + const errarg &arg2, const errarg &arg3) +{ + input_stack::error(format, arg1, arg2, arg3); +} + +// # not recognized in "" +// \<newline> is recognized in "" +// # does not conceal newline +// if missing closing quote, word extends to end of line +// no special treatment of \ other than before newline +// \<newline> not recognized after # +// ; allowed as alternative to newline +// ; not recognized in "" +// don't clear word_buffer; just append on +// return -1 for EOF, 0 for newline, 1 for word + +int get_word(string &word_buffer) +{ + int c = input_stack::get_char(); + for (;;) { + if (c == '#') { + do { + c = input_stack::get_char(); + } while (c != '\n' && c != EOF); + break; + } + if (c == '\\' && input_stack::peek_char() == '\n') + input_stack::skip_char(); + else if (c != ' ' && c != '\t') + break; + c = input_stack::get_char(); + } + if (c == EOF) + return -1; + if (c == '\n' || c == ';') + return 0; + if (c == '"') { + for (;;) { + c = input_stack::peek_char(); + if (c == EOF || c == '\n') + break; + input_stack::skip_char(); + if (c == '"') { + int d = input_stack::peek_char(); + if (d == '"') + input_stack::skip_char(); + else + break; + } + else if (c == '\\') { + int d = input_stack::peek_char(); + if (d == '\n') + input_stack::skip_char(); + else + word_buffer += '\\'; + } + else + word_buffer += c; + } + return 1; + } + word_buffer += c; + for (;;) { + c = input_stack::peek_char(); + if (c == ' ' || c == '\t' || c == '\n' || c == '#' || c == ';') + break; + input_stack::skip_char(); + if (c == '\\') { + int d = input_stack::peek_char(); + if (d == '\n') + input_stack::skip_char(); + else + word_buffer += '\\'; + } + else + word_buffer += c; + } + return 1; +} + +union argument { + const char *s; + int n; +}; + +// This is for debugging. + +static void echo_command(int argc, argument *argv) +{ + for (int i = 0; i < argc; i++) + fprintf(stderr, "%s\n", argv[i].s); +} + +static void include_command(int argc, argument *argv) +{ + assert(argc == 1); + input_stack::push_file(argv[0].s); +} + +static void capitalize_command(int argc, argument *argv) +{ + if (argc > 0) + capitalize_fields = argv[0].s; + else + capitalize_fields.clear(); +} + +static void accumulate_command(int, argument *) +{ + accumulate = 1; +} + +static void no_accumulate_command(int, argument *) +{ + accumulate = 0; +} + +static void move_punctuation_command(int, argument *) +{ + move_punctuation = 1; +} + +static void no_move_punctuation_command(int, argument *) +{ + move_punctuation = 0; +} + +static void sort_command(int argc, argument *argv) +{ + if (argc == 0) + sort_fields = "AD"; + else + sort_fields = argv[0].s; + accumulate = 1; +} + +static void no_sort_command(int, argument *) +{ + sort_fields.clear(); +} + +static void articles_command(int argc, argument *argv) +{ + articles.clear(); + int i; + for (i = 0; i < argc; i++) { + articles += argv[i].s; + articles += '\0'; + } + int len = articles.length(); + for (i = 0; i < len; i++) + articles[i] = cmlower(articles[i]); +} + +static void database_command(int argc, argument *argv) +{ + for (int i = 0; i < argc; i++) + database_list.add_file(argv[i].s); +} + +static void default_database_command(int, argument *) +{ + search_default = 1; +} + +static void no_default_database_command(int, argument *) +{ + search_default = 0; +} + +static void bibliography_command(int argc, argument *argv) +{ + const char *saved_filename = current_filename; + int saved_lineno = current_lineno; + int saved_label_in_text = label_in_text; + label_in_text = 0; + if (!accumulate) + fputs(".]<\n", stdout); + for (int i = 0; i < argc; i++) + do_bib(argv[i].s); + if (accumulate) + output_references(); + else + fputs(".]>\n", stdout); + current_filename = saved_filename; + current_lineno = saved_lineno; + label_in_text = saved_label_in_text; +} + +static void annotate_command(int argc, argument *argv) +{ + if (argc > 0) + annotation_field = argv[0].s[0]; + else + annotation_field = 'X'; + if (argc == 2) + annotation_macro = argv[1].s; + else + annotation_macro = "AP"; +} + +static void no_annotate_command(int, argument *) +{ + annotation_macro.clear(); + annotation_field = -1; +} + +static void reverse_command(int, argument *argv) +{ + reverse_fields = argv[0].s; +} + +static void no_reverse_command(int, argument *) +{ + reverse_fields.clear(); +} + +static void abbreviate_command(int argc, argument *argv) +{ + abbreviate_fields = argv[0].s; + period_before_initial = argc > 1 ? argv[1].s : ". "; + period_before_last_name = argc > 2 ? argv[2].s : ". "; + period_before_other = argc > 3 ? argv[3].s : ". "; + period_before_hyphen = argc > 4 ? argv[4].s : "."; +} + +static void no_abbreviate_command(int, argument *) +{ + abbreviate_fields.clear(); +} + +string search_ignore_fields; + +static void search_ignore_command(int argc, argument *argv) +{ + if (argc > 0) + search_ignore_fields = argv[0].s; + else + search_ignore_fields = "XYZ"; + search_ignore_fields += '\0'; + linear_ignore_fields = search_ignore_fields.contents(); +} + +static void no_search_ignore_command(int, argument *) +{ + linear_ignore_fields = ""; +} + +static void search_truncate_command(int argc, argument *argv) +{ + if (argc > 0) + linear_truncate_len = argv[0].n; + else + linear_truncate_len = 6; +} + +static void no_search_truncate_command(int, argument *) +{ + linear_truncate_len = -1; +} + +static void discard_command(int argc, argument *argv) +{ + if (argc == 0) + discard_fields = "XYZ"; + else + discard_fields = argv[0].s; + accumulate = 1; +} + +static void no_discard_command(int, argument *) +{ + discard_fields.clear(); +} + +static void label_command(int, argument *argv) +{ + set_label_spec(argv[0].s); +} + +static void abbreviate_label_ranges_command(int argc, argument *argv) +{ + abbreviate_label_ranges = 1; + label_range_indicator = argc > 0 ? argv[0].s : "-"; +} + +static void no_abbreviate_label_ranges_command(int, argument *) +{ + abbreviate_label_ranges = 0; +} + +static void label_in_reference_command(int, argument *) +{ + label_in_reference = 1; +} + +static void no_label_in_reference_command(int, argument *) +{ + label_in_reference = 0; +} + +static void label_in_text_command(int, argument *) +{ + label_in_text = 1; +} + +static void no_label_in_text_command(int, argument *) +{ + label_in_text = 0; +} + +static void sort_adjacent_labels_command(int, argument *) +{ + sort_adjacent_labels = 1; +} + +static void no_sort_adjacent_labels_command(int, argument *) +{ + sort_adjacent_labels = 0; +} + +static void date_as_label_command(int argc, argument *argv) +{ + if (set_date_label_spec(argc > 0 ? argv[0].s : "D%a*")) + date_as_label = 1; +} + +static void no_date_as_label_command(int, argument *) +{ + date_as_label = 0; +} + +static void short_label_command(int, argument *argv) +{ + if (set_short_label_spec(argv[0].s)) + short_label_flag = 1; +} + +static void no_short_label_command(int, argument *) +{ + short_label_flag = 0; +} + +static void compatible_command(int, argument *) +{ + compatible_flag = 1; +} + +static void no_compatible_command(int, argument *) +{ + compatible_flag = 0; +} + +static void join_authors_command(int argc, argument *argv) +{ + join_authors_exactly_two = argv[0].s; + join_authors_default = argc > 1 ? argv[1].s : argv[0].s; + join_authors_last_two = argc == 3 ? argv[2].s : argv[0].s; +} + +static void bracket_label_command(int, argument *argv) +{ + pre_label = argv[0].s; + post_label = argv[1].s; + sep_label = argv[2].s; +} + +static void separate_label_second_parts_command(int, argument *argv) +{ + separate_label_second_parts = argv[0].s; +} + +static void et_al_command(int argc, argument *argv) +{ + et_al = argv[0].s; + et_al_min_elide = argv[1].n; + if (et_al_min_elide < 1) + et_al_min_elide = 1; + et_al_min_total = argc >= 3 ? argv[2].n : 0; +} + +static void no_et_al_command(int, argument *) +{ + et_al.clear(); + et_al_min_elide = 0; +} + +typedef void (*command_t)(int, argument *); + +/* arg_types is a string describing the numbers and types of arguments. +s means a string, i means an integer, f is a list of fields, F is +a single field, +? means that the previous argument is optional, * means that the +previous argument can occur any number of times. */ + +struct { + const char *name; + command_t func; + const char *arg_types; +} command_table[] = { + "include", include_command, "s", + "echo", echo_command, "s*", + "capitalize", capitalize_command, "f?", + "accumulate", accumulate_command, "", + "no-accumulate", no_accumulate_command, "", + "move-punctuation", move_punctuation_command, "", + "no-move-punctuation", no_move_punctuation_command, "", + "sort", sort_command, "s?", + "no-sort", no_sort_command, "", + "articles", articles_command, "s*", + "database", database_command, "ss*", + "default-database", default_database_command, "", + "no-default-database", no_default_database_command, "", + "bibliography", bibliography_command, "ss*", + "annotate", annotate_command, "F?s?", + "no-annotate", no_annotate_command, "", + "reverse", reverse_command, "s", + "no-reverse", no_reverse_command, "", + "abbreviate", abbreviate_command, "ss?s?s?s?", + "no-abbreviate", no_abbreviate_command, "", + "search-ignore", search_ignore_command, "f?", + "no-search-ignore", no_search_ignore_command, "", + "search-truncate", search_truncate_command, "i?", + "no-search-truncate", no_search_truncate_command, "", + "discard", discard_command, "f?", + "no-discard", no_discard_command, "", + "label", label_command, "s", + "abbreviate-label-ranges", abbreviate_label_ranges_command, "s?", + "no-abbreviate-label-ranges", no_abbreviate_label_ranges_command, "", + "label-in-reference", label_in_reference_command, "", + "no-label-in-reference", no_label_in_reference_command, "", + "label-in-text", label_in_text_command, "", + "no-label-in-text", no_label_in_text_command, "", + "sort-adjacent-labels", sort_adjacent_labels_command, "", + "no-sort-adjacent-labels", no_sort_adjacent_labels_command, "", + "date-as-label", date_as_label_command, "s?", + "no-date-as-label", no_date_as_label_command, "", + "short-label", short_label_command, "s", + "no-short-label", no_short_label_command, "", + "compatible", compatible_command, "", + "no-compatible", no_compatible_command, "", + "join-authors", join_authors_command, "sss?", + "bracket-label", bracket_label_command, "sss", + "separate-label-second-parts", separate_label_second_parts_command, "s", + "et-al", et_al_command, "sii?", + "no-et-al", no_et_al_command, "", +}; + +static int check_args(const char *types, const char *name, + int argc, argument *argv) +{ + int argno = 0; + while (*types) { + if (argc == 0) { + if (types[1] == '?') + break; + else if (types[1] == '*') { + assert(types[2] == '\0'); + break; + } + else { + input_stack::error("missing argument for command `%1'", name); + return 0; + } + } + switch (*types) { + case 's': + break; + case 'i': + { + char *ptr; + long n = strtol(argv->s, &ptr, 10); + if ((n == 0 && ptr == argv->s) + || *ptr != '\0') { + input_stack::error("argument %1 for command `%2' must be an integer", + argno + 1, name); + return 0; + } + argv->n = (int)n; + break; + } + case 'f': + { + for (const char *ptr = argv->s; *ptr != '\0'; ptr++) + if (!cs_field_name(*ptr)) { + input_stack::error("argument %1 for command `%2' must be a list of fields", + argno + 1, name); + return 0; + } + break; + } + case 'F': + if (argv->s[0] == '\0' || argv->s[1] != '\0' + || !cs_field_name(argv->s[0])) { + input_stack::error("argument %1 for command `%2' must be a field name", + argno + 1, name); + return 0; + } + break; + default: + assert(0); + } + if (types[1] == '?') + types += 2; + else if (types[1] != '*') + types += 1; + --argc; + ++argv; + ++argno; + } + if (argc > 0) { + input_stack::error("too many arguments for command `%1'", name); + return 0; + } + return 1; +} + +static void execute_command(const char *name, int argc, argument *argv) +{ + for (int i = 0; i < sizeof(command_table)/sizeof(command_table[0]); i++) + if (strcmp(name, command_table[i].name) == 0) { + if (check_args(command_table[i].arg_types, name, argc, argv)) + (*command_table[i].func)(argc, argv); + return; + } + input_stack::error("unknown command `%1'", name); +} + +static void command_loop() +{ + string command; + for (;;) { + command.clear(); + int res = get_word(command); + if (res != 1) { + if (res == 0) + continue; + break; + } + int argc = 0; + command += '\0'; + while ((res = get_word(command)) == 1) { + argc++; + command += '\0'; + } + argument *argv = new argument[argc]; + const char *ptr = command.contents(); + for (int i = 0; i < argc; i++) + argv[i].s = ptr = strchr(ptr, '\0') + 1; + execute_command(command.contents(), argc, argv); + a_delete argv; + if (res == -1) + break; + } +} + +void process_commands(const char *file) +{ + input_stack::init(); + input_stack::push_file(file); + command_loop(); +} + +void process_commands(string &s, const char *file, int lineno) +{ + input_stack::init(); + input_stack::push_string(s, file, lineno); + command_loop(); +} diff --git a/gnu/usr.bin/groff/refer/command.h b/gnu/usr.bin/groff/refer/command.h new file mode 100644 index 000000000000..2b977eae9a4c --- /dev/null +++ b/gnu/usr.bin/groff/refer/command.h @@ -0,0 +1,36 @@ +// -*- C++ -*- +/* Copyright (C) 1989, 1990, 1991, 1992 Free Software Foundation, Inc. + Written by James Clark (jjc@jclark.com) + +This file is part of groff. + +groff is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 2, or (at your option) any later +version. + +groff is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License along +with groff; see the file COPYING. If not, write to the Free Software +Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. */ + +void process_commands(const char *file); +void process_commands(string &s, const char *file, int lineno); + +extern int accumulate; +extern int move_punctuation; +extern int search_default; +extern search_list database_list; +extern int label_in_text; +extern int label_in_reference; +extern int sort_adjacent_labels; +extern string pre_label; +extern string post_label; +extern string sep_label; + +extern void do_bib(const char *); +extern void output_references(); diff --git a/gnu/usr.bin/groff/refer/label.y b/gnu/usr.bin/groff/refer/label.y new file mode 100644 index 000000000000..d18eb89a9887 --- /dev/null +++ b/gnu/usr.bin/groff/refer/label.y @@ -0,0 +1,1173 @@ +/* -*- C++ -*- + Copyright (C) 1989, 1990, 1991, 1992 Free Software Foundation, Inc. + Written by James Clark (jjc@jclark.com) + +This file is part of groff. + +groff is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 2, or (at your option) any later +version. + +groff is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License along +with groff; see the file COPYING. If not, write to the Free Software +Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. */ + +%{ + +#include "refer.h" +#include "refid.h" +#include "ref.h" +#include "token.h" + +int yylex(); +void yyerror(const char *); +int yyparse(); + +static const char *format_serial(char c, int n); + +struct label_info { + int start; + int length; + int count; + int total; + label_info(const string &); +}; + +label_info *lookup_label(const string &label); + +struct expression { + enum { + // Does the tentative label depend on the reference? + CONTAINS_VARIABLE = 01, + CONTAINS_STAR = 02, + CONTAINS_FORMAT = 04, + CONTAINS_AT = 010 + }; + virtual ~expression() { } + virtual void evaluate(int, const reference &, string &, + substring_position &) = 0; + virtual unsigned analyze() { return 0; } +}; + +class at_expr : public expression { +public: + at_expr() { } + void evaluate(int, const reference &, string &, substring_position &); + unsigned analyze() { return CONTAINS_VARIABLE|CONTAINS_AT; } +}; + +class format_expr : public expression { + char type; + int width; + int first_number; +public: + format_expr(char c, int w = 0, int f = 1) + : type(c), width(w), first_number(f) { } + void evaluate(int, const reference &, string &, substring_position &); + unsigned analyze() { return CONTAINS_FORMAT; } +}; + +class field_expr : public expression { + int number; + char name; +public: + field_expr(char nm, int num) : name(nm), number(num) { } + void evaluate(int, const reference &, string &, substring_position &); + unsigned analyze() { return CONTAINS_VARIABLE; } +}; + +class literal_expr : public expression { + string s; +public: + literal_expr(const char *ptr, int len) : s(ptr, len) { } + void evaluate(int, const reference &, string &, substring_position &); +}; + +class unary_expr : public expression { +protected: + expression *expr; +public: + unary_expr(expression *e) : expr(e) { } + ~unary_expr() { delete expr; } + void evaluate(int, const reference &, string &, substring_position &) = 0; + unsigned analyze() { return expr ? expr->analyze() : 0; } +}; + +// This caches the analysis of an expression. + +class analyzed_expr : public unary_expr { + unsigned flags; +public: + analyzed_expr(expression *); + void evaluate(int, const reference &, string &, substring_position &); + unsigned analyze() { return flags; } +}; + +class star_expr : public unary_expr { +public: + star_expr(expression *e) : unary_expr(e) { } + void evaluate(int, const reference &, string &, substring_position &); + unsigned analyze() { + return ((expr ? (expr->analyze() & ~CONTAINS_VARIABLE) : 0) + | CONTAINS_STAR); + } +}; + +typedef void map_t(const char *, const char *, string &); + +class map_expr : public unary_expr { + map_t *func; +public: + map_expr(expression *e, map_t *f) : unary_expr(e), func(f) { } + void evaluate(int, const reference &, string &, substring_position &); +}; + +typedef const char *extractor_t(const char *, const char *, const char **); + +class extractor_expr : public unary_expr { + int part; + extractor_t *func; +public: + enum { BEFORE = +1, MATCH = 0, AFTER = -1 }; + extractor_expr(expression *e, extractor_t *f, int pt) + : unary_expr(e), func(f), part(pt) { } + void evaluate(int, const reference &, string &, substring_position &); +}; + +class truncate_expr : public unary_expr { + int n; +public: + truncate_expr(expression *e, int i) : n(i), unary_expr(e) { } + void evaluate(int, const reference &, string &, substring_position &); +}; + +class separator_expr : public unary_expr { +public: + separator_expr(expression *e) : unary_expr(e) { } + void evaluate(int, const reference &, string &, substring_position &); +}; + +class binary_expr : public expression { +protected: + expression *expr1; + expression *expr2; +public: + binary_expr(expression *e1, expression *e2) : expr1(e1), expr2(e2) { } + ~binary_expr() { delete expr1; delete expr2; } + void evaluate(int, const reference &, string &, substring_position &) = 0; + unsigned analyze() { + return (expr1 ? expr1->analyze() : 0) | (expr2 ? expr2->analyze() : 0); + } +}; + +class alternative_expr : public binary_expr { +public: + alternative_expr(expression *e1, expression *e2) : binary_expr(e1, e2) { } + void evaluate(int, const reference &, string &, substring_position &); +}; + +class list_expr : public binary_expr { +public: + list_expr(expression *e1, expression *e2) : binary_expr(e1, e2) { } + void evaluate(int, const reference &, string &, substring_position &); +}; + +class substitute_expr : public binary_expr { +public: + substitute_expr(expression *e1, expression *e2) : binary_expr(e1, e2) { } + void evaluate(int, const reference &, string &, substring_position &); +}; + +class ternary_expr : public expression { +protected: + expression *expr1; + expression *expr2; + expression *expr3; +public: + ternary_expr(expression *e1, expression *e2, expression *e3) + : expr1(e1), expr2(e2), expr3(e3) { } + ~ternary_expr() { delete expr1; delete expr2; delete expr3; } + void evaluate(int, const reference &, string &, substring_position &) = 0; + unsigned analyze() { + return ((expr1 ? expr1->analyze() : 0) + | (expr2 ? expr2->analyze() : 0) + | (expr3 ? expr3->analyze() : 0)); + } +}; + +class conditional_expr : public ternary_expr { +public: + conditional_expr(expression *e1, expression *e2, expression *e3) + : ternary_expr(e1, e2, e3) { } + void evaluate(int, const reference &, string &, substring_position &); +}; + +static expression *parsed_label = 0; +static expression *parsed_date_label = 0; +static expression *parsed_short_label = 0; + +static expression *parse_result; + +string literals; + +%} + +%union { + int num; + expression *expr; + struct { int ndigits; int val; } dig; + struct { int start; int len; } str; +} + +/* uppercase or lowercase letter */ +%token <num> TOKEN_LETTER +/* literal characters */ +%token <str> TOKEN_LITERAL +/* digit */ +%token <num> TOKEN_DIGIT + +%type <expr> conditional +%type <expr> alternative +%type <expr> list +%type <expr> string +%type <expr> substitute +%type <expr> optional_conditional +%type <num> number +%type <dig> digits +%type <num> optional_number +%type <num> flag + +%% + +expr: + optional_conditional + { parse_result = ($1 ? new analyzed_expr($1) : 0); } + ; + +conditional: + alternative + { $$ = $1; } + | alternative '?' optional_conditional ':' conditional + { $$ = new conditional_expr($1, $3, $5); } + ; + +optional_conditional: + /* empty */ + { $$ = 0; } + | conditional + { $$ = $1; } + ; + +alternative: + list + { $$ = $1; } + | alternative '|' list + { $$ = new alternative_expr($1, $3); } + | alternative '&' list + { $$ = new conditional_expr($1, $3, 0); } + ; + +list: + substitute + { $$ = $1; } + | list substitute + { $$ = new list_expr($1, $2); } + ; + +substitute: + string + { $$ = $1; } + | substitute '~' string + { $$ = new substitute_expr($1, $3); } + ; + +string: + '@' + { $$ = new at_expr; } + | TOKEN_LITERAL + { + $$ = new literal_expr(literals.contents() + $1.start, + $1.len); + } + | TOKEN_LETTER + { $$ = new field_expr($1, 0); } + | TOKEN_LETTER number + { $$ = new field_expr($1, $2 - 1); } + | '%' TOKEN_LETTER + { + switch ($2) { + case 'I': + case 'i': + case 'A': + case 'a': + $$ = new format_expr($2); + break; + default: + command_error("unrecognized format `%1'", char($2)); + $$ = new format_expr('a'); + break; + } + } + + | '%' digits + { + $$ = new format_expr('0', $2.ndigits, $2.val); + } + | string '.' flag TOKEN_LETTER optional_number + { + switch ($4) { + case 'l': + $$ = new map_expr($1, lowercase); + break; + case 'u': + $$ = new map_expr($1, uppercase); + break; + case 'c': + $$ = new map_expr($1, capitalize); + break; + case 'r': + $$ = new map_expr($1, reverse_name); + break; + case 'a': + $$ = new map_expr($1, abbreviate_name); + break; + case 'y': + $$ = new extractor_expr($1, find_year, $3); + break; + case 'n': + $$ = new extractor_expr($1, find_last_name, $3); + break; + default: + $$ = $1; + command_error("unknown function `%1'", char($4)); + break; + } + } + + | string '+' number + { $$ = new truncate_expr($1, $3); } + | string '-' number + { $$ = new truncate_expr($1, -$3); } + | string '*' + { $$ = new star_expr($1); } + | '(' optional_conditional ')' + { $$ = $2; } + | '<' optional_conditional '>' + { $$ = new separator_expr($2); } + ; + +optional_number: + /* empty */ + { $$ = -1; } + | number + { $$ = $1; } + ; + +number: + TOKEN_DIGIT + { $$ = $1; } + | number TOKEN_DIGIT + { $$ = $1*10 + $2; } + ; + +digits: + TOKEN_DIGIT + { $$.ndigits = 1; $$.val = $1; } + | digits TOKEN_DIGIT + { $$.ndigits = $1.ndigits + 1; $$.val = $1.val*10 + $2; } + ; + + +flag: + /* empty */ + { $$ = 0; } + | '+' + { $$ = 1; } + | '-' + { $$ = -1; } + ; + +%% + +/* bison defines const to be empty unless __STDC__ is defined, which it +isn't under cfront */ + +#ifdef const +#undef const +#endif + +const char *spec_ptr; +const char *spec_end; +const char *spec_cur; + +int yylex() +{ + while (spec_ptr < spec_end && csspace(*spec_ptr)) + spec_ptr++; + spec_cur = spec_ptr; + if (spec_ptr >= spec_end) + return 0; + unsigned char c = *spec_ptr++; + if (csalpha(c)) { + yylval.num = c; + return TOKEN_LETTER; + } + if (csdigit(c)) { + yylval.num = c - '0'; + return TOKEN_DIGIT; + } + if (c == '\'') { + yylval.str.start = literals.length(); + for (; spec_ptr < spec_end; spec_ptr++) { + if (*spec_ptr == '\'') { + if (++spec_ptr < spec_end && *spec_ptr == '\'') + literals += '\''; + else { + yylval.str.len = literals.length() - yylval.str.start; + return TOKEN_LITERAL; + } + } + else + literals += *spec_ptr; + } + yylval.str.len = literals.length() - yylval.str.start; + return TOKEN_LITERAL; + } + return c; +} + +int set_label_spec(const char *label_spec) +{ + spec_cur = spec_ptr = label_spec; + spec_end = strchr(label_spec, '\0'); + literals.clear(); + if (yyparse()) + return 0; + delete parsed_label; + parsed_label = parse_result; + return 1; +} + +int set_date_label_spec(const char *label_spec) +{ + spec_cur = spec_ptr = label_spec; + spec_end = strchr(label_spec, '\0'); + literals.clear(); + if (yyparse()) + return 0; + delete parsed_date_label; + parsed_date_label = parse_result; + return 1; +} + +int set_short_label_spec(const char *label_spec) +{ + spec_cur = spec_ptr = label_spec; + spec_end = strchr(label_spec, '\0'); + literals.clear(); + if (yyparse()) + return 0; + delete parsed_short_label; + parsed_short_label = parse_result; + return 1; +} + +void yyerror(const char *message) +{ + if (spec_cur < spec_end) + command_error("label specification %1 before `%2'", message, spec_cur); + else + command_error("label specification %1 at end of string", + message, spec_cur); +} + +void at_expr::evaluate(int tentative, const reference &ref, + string &result, substring_position &) +{ + if (tentative) + ref.canonicalize_authors(result); + else { + const char *end, *start = ref.get_authors(&end); + if (start) + result.append(start, end - start); + } +} + +void format_expr::evaluate(int tentative, const reference &ref, + string &result, substring_position &) +{ + if (tentative) + return; + const label_info *lp = ref.get_label_ptr(); + int num = lp == 0 ? ref.get_number() : lp->count; + if (type != '0') + result += format_serial(type, num + 1); + else { + const char *ptr = itoa(num + first_number); + int pad = width - strlen(ptr); + while (--pad >= 0) + result += '0'; + result += ptr; + } +} + +static const char *format_serial(char c, int n) +{ + assert(n > 0); + static char buf[128]; // more than enough. + switch (c) { + case 'i': + case 'I': + { + char *p = buf; + // troff uses z and w to represent 10000 and 5000 in Roman + // numerals; I can find no historical basis for this usage + const char *s = c == 'i' ? "zwmdclxvi" : "ZWMDCLXVI"; + if (n >= 40000) + return itoa(n); + while (n >= 10000) { + *p++ = s[0]; + n -= 10000; + } + for (int i = 1000; i > 0; i /= 10, s += 2) { + int m = n/i; + n -= m*i; + switch (m) { + case 3: + *p++ = s[2]; + /* falls through */ + case 2: + *p++ = s[2]; + /* falls through */ + case 1: + *p++ = s[2]; + break; + case 4: + *p++ = s[2]; + *p++ = s[1]; + break; + case 8: + *p++ = s[1]; + *p++ = s[2]; + *p++ = s[2]; + *p++ = s[2]; + break; + case 7: + *p++ = s[1]; + *p++ = s[2]; + *p++ = s[2]; + break; + case 6: + *p++ = s[1]; + *p++ = s[2]; + break; + case 5: + *p++ = s[1]; + break; + case 9: + *p++ = s[2]; + *p++ = s[0]; + } + } + *p = 0; + break; + } + case 'a': + case 'A': + { + char *p = buf; + // this is derived from troff/reg.c + while (n > 0) { + int d = n % 26; + if (d == 0) + d = 26; + n -= d; + n /= 26; + *p++ = c + d - 1; // ASCII dependent + } + *p-- = 0; + // Reverse it. + char *q = buf; + while (q < p) { + char temp = *q; + *q = *p; + *p = temp; + --p; + ++q; + } + break; + } + default: + assert(0); + } + return buf; +} + +void field_expr::evaluate(int, const reference &ref, + string &result, substring_position &) +{ + const char *end; + const char *start = ref.get_field(name, &end); + if (start) { + start = nth_field(number, start, &end); + if (start) + result.append(start, end - start); + } +} + +void literal_expr::evaluate(int, const reference &, + string &result, substring_position &) +{ + result += s; +} + +analyzed_expr::analyzed_expr(expression *e) +: unary_expr(e), flags(e ? e->analyze() : 0) +{ +} + +void analyzed_expr::evaluate(int tentative, const reference &ref, + string &result, substring_position &pos) +{ + if (expr) + expr->evaluate(tentative, ref, result, pos); +} + +void star_expr::evaluate(int tentative, const reference &ref, + string &result, substring_position &pos) +{ + const label_info *lp = ref.get_label_ptr(); + if (!tentative + && (lp == 0 || lp->total > 1) + && expr) + expr->evaluate(tentative, ref, result, pos); +} + +void separator_expr::evaluate(int tentative, const reference &ref, + string &result, substring_position &pos) +{ + int start_length = result.length(); + int is_first = pos.start < 0; + if (expr) + expr->evaluate(tentative, ref, result, pos); + if (is_first) { + pos.start = start_length; + pos.length = result.length() - start_length; + } +} + +void map_expr::evaluate(int tentative, const reference &ref, + string &result, substring_position &) +{ + if (expr) { + string temp; + substring_position temp_pos; + expr->evaluate(tentative, ref, temp, temp_pos); + (*func)(temp.contents(), temp.contents() + temp.length(), result); + } +} + +void extractor_expr::evaluate(int tentative, const reference &ref, + string &result, substring_position &) +{ + if (expr) { + string temp; + substring_position temp_pos; + expr->evaluate(tentative, ref, temp, temp_pos); + const char *end, *start = (*func)(temp.contents(), + temp.contents() + temp.length(), + &end); + switch (part) { + case BEFORE: + if (start) + result.append(temp.contents(), start - temp.contents()); + else + result += temp; + break; + case MATCH: + if (start) + result.append(start, end - start); + break; + case AFTER: + if (start) + result.append(end, temp.contents() + temp.length() - end); + break; + default: + assert(0); + } + } +} + +static void first_part(int len, const char *ptr, const char *end, + string &result) +{ + for (;;) { + const char *token_start = ptr; + if (!get_token(&ptr, end)) + break; + const token_info *ti = lookup_token(token_start, ptr); + int counts = ti->sortify_non_empty(token_start, ptr); + if (counts && --len < 0) + break; + if (counts || ti->is_accent()) + result.append(token_start, ptr - token_start); + } +} + +static void last_part(int len, const char *ptr, const char *end, + string &result) +{ + const char *start = ptr; + int count = 0; + for (;;) { + const char *token_start = ptr; + if (!get_token(&ptr, end)) + break; + const token_info *ti = lookup_token(token_start, ptr); + if (ti->sortify_non_empty(token_start, ptr)) + count++; + } + ptr = start; + int skip = count - len; + if (skip > 0) { + for (;;) { + const char *token_start = ptr; + if (!get_token(&ptr, end)) + assert(0); + const token_info *ti = lookup_token(token_start, ptr); + if (ti->sortify_non_empty(token_start, ptr) && --skip < 0) { + ptr = token_start; + break; + } + } + } + first_part(len, ptr, end, result); +} + +void truncate_expr::evaluate(int tentative, const reference &ref, + string &result, substring_position &) +{ + if (expr) { + string temp; + substring_position temp_pos; + expr->evaluate(tentative, ref, temp, temp_pos); + const char *start = temp.contents(); + const char *end = start + temp.length(); + if (n > 0) + first_part(n, start, end, result); + else if (n < 0) + last_part(-n, start, end, result); + } +} + +void alternative_expr::evaluate(int tentative, const reference &ref, + string &result, substring_position &pos) +{ + int start_length = result.length(); + if (expr1) + expr1->evaluate(tentative, ref, result, pos); + if (result.length() == start_length && expr2) + expr2->evaluate(tentative, ref, result, pos); +} + +void list_expr::evaluate(int tentative, const reference &ref, + string &result, substring_position &pos) +{ + if (expr1) + expr1->evaluate(tentative, ref, result, pos); + if (expr2) + expr2->evaluate(tentative, ref, result, pos); +} + +void substitute_expr::evaluate(int tentative, const reference &ref, + string &result, substring_position &pos) +{ + int start_length = result.length(); + if (expr1) + expr1->evaluate(tentative, ref, result, pos); + if (result.length() > start_length && result[result.length() - 1] == '-') { + // ought to see if pos covers the - + result.set_length(result.length() - 1); + if (expr2) + expr2->evaluate(tentative, ref, result, pos); + } +} + +void conditional_expr::evaluate(int tentative, const reference &ref, + string &result, substring_position &pos) +{ + string temp; + substring_position temp_pos; + if (expr1) + expr1->evaluate(tentative, ref, temp, temp_pos); + if (temp.length() > 0) { + if (expr2) + expr2->evaluate(tentative, ref, result, pos); + } + else { + if (expr3) + expr3->evaluate(tentative, ref, result, pos); + } +} + +void reference::pre_compute_label() +{ + if (parsed_label != 0 + && (parsed_label->analyze() & expression::CONTAINS_VARIABLE)) { + label.clear(); + substring_position temp_pos; + parsed_label->evaluate(1, *this, label, temp_pos); + label_ptr = lookup_label(label); + } +} + +void reference::compute_label() +{ + label.clear(); + if (parsed_label) + parsed_label->evaluate(0, *this, label, separator_pos); + if (short_label_flag && parsed_short_label) + parsed_short_label->evaluate(0, *this, short_label, short_separator_pos); + if (date_as_label) { + string new_date; + if (parsed_date_label) { + substring_position temp_pos; + parsed_date_label->evaluate(0, *this, new_date, temp_pos); + } + set_date(new_date); + } + if (label_ptr) + label_ptr->count += 1; +} + +void reference::immediate_compute_label() +{ + if (label_ptr) + label_ptr->total = 2; // force use of disambiguator + compute_label(); +} + +int reference::merge_labels(reference **v, int n, label_type type, + string &result) +{ + if (abbreviate_label_ranges) + return merge_labels_by_number(v, n, type, result); + else + return merge_labels_by_parts(v, n, type, result); +} + +int reference::merge_labels_by_number(reference **v, int n, label_type type, + string &result) +{ + if (n <= 1) + return 0; + int num = get_number(); + // Only merge three or more labels. + if (v[0]->get_number() != num + 1 + || v[1]->get_number() != num + 2) + return 0; + for (int i = 2; i < n; i++) + if (v[i]->get_number() != num + i + 1) + break; + result = get_label(type); + result += label_range_indicator; + result += v[i - 1]->get_label(type); + return i; +} + +const substring_position &reference::get_separator_pos(label_type type) const +{ + if (type == SHORT_LABEL && short_label_flag) + return short_separator_pos; + else + return separator_pos; +} + +const string &reference::get_label(label_type type) const +{ + if (type == SHORT_LABEL && short_label_flag) + return short_label; + else + return label; +} + +int reference::merge_labels_by_parts(reference **v, int n, label_type type, + string &result) +{ + if (n <= 0) + return 0; + const string &lb = get_label(type); + const substring_position &sp = get_separator_pos(type); + if (sp.start < 0 + || sp.start != v[0]->get_separator_pos(type).start + || memcmp(lb.contents(), v[0]->get_label(type).contents(), + sp.start) != 0) + return 0; + result = lb; + int i = 0; + do { + result += separate_label_second_parts; + const substring_position &s = v[i]->get_separator_pos(type); + int sep_end_pos = s.start + s.length; + result.append(v[i]->get_label(type).contents() + sep_end_pos, + v[i]->get_label(type).length() - sep_end_pos); + } while (++i < n + && sp.start == v[i]->get_separator_pos(type).start + && memcmp(lb.contents(), v[i]->get_label(type).contents(), + sp.start) == 0); + return i; +} + +string label_pool; + +label_info::label_info(const string &s) +: count(0), total(1), length(s.length()), start(label_pool.length()) +{ + label_pool += s; +} + +static label_info **label_table = 0; +static int label_table_size = 0; +static int label_table_used = 0; + +label_info *lookup_label(const string &label) +{ + if (label_table == 0) { + label_table = new label_info *[17]; + label_table_size = 17; + for (int i = 0; i < 17; i++) + label_table[i] = 0; + } + unsigned h = hash_string(label.contents(), label.length()) % label_table_size; + for (label_info **ptr = label_table + h; + *ptr != 0; + (ptr == label_table) + ? (ptr = label_table + label_table_size - 1) + : ptr--) + if ((*ptr)->length == label.length() + && memcmp(label_pool.contents() + (*ptr)->start, label.contents(), + label.length()) == 0) { + (*ptr)->total += 1; + return *ptr; + } + label_info *result = *ptr = new label_info(label); + if (++label_table_used * 2 > label_table_size) { + // Rehash the table. + label_info **old_table = label_table; + int old_size = label_table_size; + label_table_size = next_size(label_table_size); + label_table = new label_info *[label_table_size]; + int i; + for (i = 0; i < label_table_size; i++) + label_table[i] = 0; + for (i = 0; i < old_size; i++) + if (old_table[i]) { + unsigned h = hash_string(label_pool.contents() + old_table[i]->start, + old_table[i]->length); + for (label_info **p = label_table + (h % label_table_size); + *p != 0; + (p == label_table) + ? (p = label_table + label_table_size - 1) + : --p) + ; + *p = old_table[i]; + } + a_delete old_table; + } + return result; +} + +void clear_labels() +{ + for (int i = 0; i < label_table_size; i++) { + delete label_table[i]; + label_table[i] = 0; + } + label_table_used = 0; + label_pool.clear(); +} + +static void consider_authors(reference **start, reference **end, int i); + +void compute_labels(reference **v, int n) +{ + if (parsed_label + && (parsed_label->analyze() & expression::CONTAINS_AT) + && sort_fields.length() >= 2 + && sort_fields[0] == 'A' + && sort_fields[1] == '+') + consider_authors(v, v + n, 0); + for (int i = 0; i < n; i++) + v[i]->compute_label(); +} + + +/* A reference with a list of authors <A0,A1,...,AN> _needs_ author i +where 0 <= i <= N if there exists a reference with a list of authors +<B0,B1,...,BM> such that <A0,A1,...,AN> != <B0,B1,...,BM> and M >= i +and Aj = Bj for 0 <= j < i. In this case if we can't say ``A0, +A1,...,A(i-1) et al'' because this would match both <A0,A1,...,AN> and +<B0,B1,...,BM>. If a reference needs author i we only have to call +need_author(j) for some j >= i such that the reference also needs +author j. */ + +/* This function handles 2 tasks: +determine which authors are needed (cannot be elided with et al.); +determine which authors can have only last names in the labels. + +References >= start and < end have the same first i author names. +Also they're sorted by A+. */ + +static void consider_authors(reference **start, reference **end, int i) +{ + if (start >= end) + return; + reference **p = start; + if (i >= (*p)->get_nauthors()) { + for (++p; p < end && i >= (*p)->get_nauthors(); p++) + ; + if (p < end && i > 0) { + // If we have an author list <A B C> and an author list <A B C D>, + // then both lists need C. + for (reference **q = start; q < end; q++) + (*q)->need_author(i - 1); + } + start = p; + } + while (p < end) { + reference **last_name_start = p; + reference **name_start = p; + for (++p; + p < end && i < (*p)->get_nauthors() + && same_author_last_name(**last_name_start, **p, i); + p++) { + if (!same_author_name(**name_start, **p, i)) { + consider_authors(name_start, p, i + 1); + name_start = p; + } + } + consider_authors(name_start, p, i + 1); + if (last_name_start == name_start) { + for (reference **q = last_name_start; q < p; q++) + (*q)->set_last_name_unambiguous(i); + } + // If we have an author list <A B C D> and <A B C E>, then the lists + // need author D and E respectively. + if (name_start > start || p < end) { + for (reference **q = last_name_start; q < p; q++) + (*q)->need_author(i); + } + } +} + +int same_author_last_name(const reference &r1, const reference &r2, int n) +{ + const char *ae1; + const char *as1 = r1.get_sort_field(0, n, 0, &ae1); + assert(as1 != 0); + const char *ae2; + const char *as2 = r2.get_sort_field(0, n, 0, &ae2); + assert(as2 != 0); + return ae1 - as1 == ae2 - as2 && memcmp(as1, as2, ae1 - as1) == 0; +} + +int same_author_name(const reference &r1, const reference &r2, int n) +{ + const char *ae1; + const char *as1 = r1.get_sort_field(0, n, -1, &ae1); + assert(as1 != 0); + const char *ae2; + const char *as2 = r2.get_sort_field(0, n, -1, &ae2); + assert(as2 != 0); + return ae1 - as1 == ae2 - as2 && memcmp(as1, as2, ae1 - as1) == 0; +} + + +void int_set::set(int i) +{ + assert(i >= 0); + int bytei = i >> 3; + if (bytei >= v.length()) { + int old_length = v.length(); + v.set_length(bytei + 1); + for (int j = old_length; j <= bytei; j++) + v[j] = 0; + } + v[bytei] |= 1 << (i & 7); +} + +int int_set::get(int i) const +{ + assert(i >= 0); + int bytei = i >> 3; + return bytei >= v.length() ? 0 : (v[bytei] & (1 << (i & 7))) != 0; +} + +void reference::set_last_name_unambiguous(int i) +{ + last_name_unambiguous.set(i); +} + +void reference::need_author(int n) +{ + if (n > last_needed_author) + last_needed_author = n; +} + +const char *reference::get_authors(const char **end) const +{ + if (!computed_authors) { + ((reference *)this)->computed_authors = 1; + string &result = ((reference *)this)->authors; + int na = get_nauthors(); + result.clear(); + for (int i = 0; i < na; i++) { + if (last_name_unambiguous.get(i)) { + const char *e, *start = get_author_last_name(i, &e); + assert(start != 0); + result.append(start, e - start); + } + else { + const char *e, *start = get_author(i, &e); + assert(start != 0); + result.append(start, e - start); + } + if (i == last_needed_author + && et_al.length() > 0 + && et_al_min_elide > 0 + && last_needed_author + et_al_min_elide < na + && na >= et_al_min_total) { + result += et_al; + break; + } + if (i < na - 1) { + if (na == 2) + result += join_authors_exactly_two; + else if (i < na - 2) + result += join_authors_default; + else + result += join_authors_last_two; + } + } + } + const char *start = authors.contents(); + *end = start + authors.length(); + return start; +} + +int reference::get_nauthors() const +{ + if (nauthors < 0) { + const char *dummy; + for (int na = 0; get_author(na, &dummy) != 0; na++) + ; + ((reference *)this)->nauthors = na; + } + return nauthors; +} diff --git a/gnu/usr.bin/groff/refer/ref.cc b/gnu/usr.bin/groff/refer/ref.cc new file mode 100644 index 000000000000..0b78813ec190 --- /dev/null +++ b/gnu/usr.bin/groff/refer/ref.cc @@ -0,0 +1,1144 @@ +// -*- C++ -*- +/* Copyright (C) 1989, 1990, 1991, 1992 Free Software Foundation, Inc. +Written by James Clark (jjc@jclark.com) + +This file is part of groff. + +groff is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 2, or (at your option) any later +version. + +groff is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License along +with groff; see the file COPYING. If not, write to the Free Software +Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. */ + +#include "refer.h" +#include "refid.h" +#include "ref.h" +#include "token.h" + +static const char *find_day(const char *, const char *, const char **); +static int find_month(const char *start, const char *end); +static void abbreviate_names(string &); + +#define DEFAULT_ARTICLES "the\000a\000an" + +string articles(DEFAULT_ARTICLES, sizeof(DEFAULT_ARTICLES)); + +// Multiple occurrences of fields are separated by FIELD_SEPARATOR. +const char FIELD_SEPARATOR = '\0'; + +const char MULTI_FIELD_NAMES[] = "AE"; +const char *AUTHOR_FIELDS = "AQ"; + +enum { OTHER, JOURNAL_ARTICLE, BOOK, ARTICLE_IN_BOOK, TECH_REPORT, BELL_TM }; + +const char *reference_types[] = { + "other", + "journal-article", + "book", + "article-in-book", + "tech-report", + "bell-tm", +}; + +static string temp_fields[256]; + +reference::reference(const char *start, int len, reference_id *ridp) +: no(-1), field(0), nfields(0), h(0), merged(0), label_ptr(0), + computed_authors(0), last_needed_author(-1), nauthors(-1) +{ + for (int i = 0; i < 256; i++) + field_index[i] = NULL_FIELD_INDEX; + if (ridp) + rid = *ridp; + if (start == 0) + return; + if (len <= 0) + return; + const char *end = start + len; + const char *ptr = start; + assert(*ptr == '%'); + while (ptr < end) { + if (ptr + 1 < end && ptr[1] != '\0' + && ((ptr[1] != '%' && ptr[1] == annotation_field) + || (ptr + 2 < end && ptr[1] == '%' && ptr[2] != '\0' + && discard_fields.search(ptr[2]) < 0))) { + if (ptr[1] == '%') + ptr++; + string &f = temp_fields[(unsigned char)ptr[1]]; + ptr += 2; + while (ptr < end && csspace(*ptr)) + ptr++; + for (;;) { + for (;;) { + if (ptr >= end) { + f += '\n'; + break; + } + f += *ptr; + if (*ptr++ == '\n') + break; + } + if (ptr >= end || *ptr == '%') + break; + } + } + else if (ptr + 1 < end && ptr[1] != '\0' && ptr[1] != '%' + && discard_fields.search(ptr[1]) < 0) { + string &f = temp_fields[(unsigned char)ptr[1]]; + if (f.length() > 0) { + if (strchr(MULTI_FIELD_NAMES, ptr[1]) != 0) + f += FIELD_SEPARATOR; + else + f.clear(); + } + ptr += 2; + if (ptr < end) { + if (*ptr == ' ') + ptr++; + for (;;) { + const char *p = ptr; + while (ptr < end && *ptr != '\n') + ptr++; + // strip trailing white space + const char *q = ptr; + while (q > p && q[-1] != '\n' && csspace(q[-1])) + q--; + while (p < q) + f += *p++; + if (ptr >= end) + break; + ptr++; + if (ptr >= end) + break; + if (*ptr == '%') + break; + f += ' '; + } + } + } + else { + // skip this field + for (;;) { + while (ptr < end && *ptr++ != '\n') + ; + if (ptr >= end || *ptr == '%') + break; + } + } + } + for (i = 0; i < 256; i++) + if (temp_fields[i].length() > 0) + nfields++; + field = new string[nfields]; + int j = 0; + for (i = 0; i < 256; i++) + if (temp_fields[i].length() > 0) { + field[j].move(temp_fields[i]); + if (abbreviate_fields.search(i) >= 0) + abbreviate_names(field[j]); + field_index[i] = j; + j++; + } +} + +reference::~reference() +{ + if (nfields > 0) + ad_delete(nfields) field; +} + +// ref is the inline, this is the database ref + +void reference::merge(reference &ref) +{ + int i; + for (i = 0; i < 256; i++) + if (field_index[i] != NULL_FIELD_INDEX) + temp_fields[i].move(field[field_index[i]]); + for (i = 0; i < 256; i++) + if (ref.field_index[i] != NULL_FIELD_INDEX) + temp_fields[i].move(ref.field[ref.field_index[i]]); + for (i = 0; i < 256; i++) + field_index[i] = NULL_FIELD_INDEX; + int old_nfields = nfields; + nfields = 0; + for (i = 0; i < 256; i++) + if (temp_fields[i].length() > 0) + nfields++; + if (nfields != old_nfields) { + if (old_nfields > 0) + ad_delete(old_nfields) field; + field = new string[nfields]; + } + int j = 0; + for (i = 0; i < 256; i++) + if (temp_fields[i].length() > 0) { + field[j].move(temp_fields[i]); + field_index[i] = j; + j++; + } + merged = 1; +} + +void reference::insert_field(unsigned char c, string &s) +{ + assert(s.length() > 0); + if (field_index[c] != NULL_FIELD_INDEX) { + field[field_index[c]].move(s); + return; + } + assert(field_index[c] == NULL_FIELD_INDEX); + string *old_field = field; + field = new string[nfields + 1]; + int pos = 0; + for (int i = 0; i < int(c); i++) + if (field_index[i] != NULL_FIELD_INDEX) + pos++; + for (i = 0; i < pos; i++) + field[i].move(old_field[i]); + field[pos].move(s); + for (i = pos; i < nfields; i++) + field[i + 1].move(old_field[i]); + if (nfields > 0) + ad_delete(nfields) old_field; + nfields++; + field_index[c] = pos; + for (i = c + 1; i < 256; i++) + if (field_index[i] != NULL_FIELD_INDEX) + field_index[i] += 1; +} + +void reference::delete_field(unsigned char c) +{ + if (field_index[c] == NULL_FIELD_INDEX) + return; + string *old_field = field; + field = new string[nfields - 1]; + for (int i = 0; i < int(field_index[c]); i++) + field[i].move(old_field[i]); + for (i = field_index[c]; i < nfields - 1; i++) + field[i].move(old_field[i + 1]); + if (nfields > 0) + ad_delete(nfields) old_field; + nfields--; + field_index[c] = NULL_FIELD_INDEX; + for (i = c + 1; i < 256; i++) + if (field_index[i] != NULL_FIELD_INDEX) + field_index[i] -= 1; +} + +void reference::compute_hash_code() +{ + if (!rid.is_null()) + h = rid.hash(); + else { + h = 0; + for (int i = 0; i < nfields; i++) + if (field[i].length() > 0) { + h <<= 4; + h ^= hash_string(field[i].contents(), field[i].length()); + } + } +} + +void reference::set_number(int n) +{ + no = n; +} + +const char SORT_SEP = '\001'; +const char SORT_SUB_SEP = '\002'; +const char SORT_SUB_SUB_SEP = '\003'; + +// sep specifies additional word separators + +void sortify_words(const char *s, const char *end, const char *sep, + string &result) +{ + int non_empty = 0; + int need_separator = 0; + for (;;) { + const char *token_start = s; + if (!get_token(&s, end)) + break; + if ((s - token_start == 1 + && (*token_start == ' ' + || *token_start == '\n' + || (sep && *token_start != '\0' + && strchr(sep, *token_start) != 0))) + || (s - token_start == 2 + && token_start[0] == '\\' && token_start[1] == ' ')) { + if (non_empty) + need_separator = 1; + } + else { + const token_info *ti = lookup_token(token_start, s); + if (ti->sortify_non_empty(token_start, s)) { + if (need_separator) { + result += ' '; + need_separator = 0; + } + ti->sortify(token_start, s, result); + non_empty = 1; + } + } + } +} + +void sortify_word(const char *s, const char *end, string &result) +{ + for (;;) { + const char *token_start = s; + if (!get_token(&s, end)) + break; + const token_info *ti = lookup_token(token_start, s); + ti->sortify(token_start, s, result); + } +} + +void sortify_other(const char *s, int len, string &key) +{ + sortify_words(s, s + len, 0, key); +} + +void sortify_title(const char *s, int len, string &key) +{ + const char *end = s + len; + for (; s < end && (*s == ' ' || *s == '\n'); s++) + ; + const char *ptr = s; + for (;;) { + const char *token_start = ptr; + if (!get_token(&ptr, end)) + break; + if (ptr - token_start == 1 + && (*token_start == ' ' || *token_start == '\n')) + break; + } + if (ptr < end) { + int first_word_len = ptr - s - 1; + const char *ae = articles.contents() + articles.length(); + for (const char *a = articles.contents(); + a < ae; + a = strchr(a, '\0') + 1) + if (first_word_len == strlen(a)) { + for (int j = 0; j < first_word_len; j++) + if (a[j] != cmlower(s[j])) + break; + if (j >= first_word_len) { + s = ptr; + for (; s < end && (*s == ' ' || *s == '\n'); s++) + ; + break; + } + } + } + sortify_words(s, end, 0, key); +} + +void sortify_name(const char *s, int len, string &key) +{ + const char *last_name_end; + const char *last_name = find_last_name(s, s + len, &last_name_end); + sortify_word(last_name, last_name_end, key); + key += SORT_SUB_SUB_SEP; + if (last_name > s) + sortify_words(s, last_name, ".", key); + key += SORT_SUB_SUB_SEP; + if (last_name_end < s + len) + sortify_words(last_name_end, s + len, ".,", key); +} + +void sortify_date(const char *s, int len, string &key) +{ + const char *year_end; + const char *year_start = find_year(s, s + len, &year_end); + if (!year_start) { + // Things without years are often `forthcoming', so it makes sense + // that they sort after things with explicit years. + key += 'A'; + sortify_words(s, s + len, 0, key); + return; + } + int n = year_end - year_start; + while (n < 4) { + key += '0'; + n++; + } + while (year_start < year_end) + key += *year_start++; + int m = find_month(s, s + len); + if (m < 0) + return; + key += 'A' + m; + const char *day_end; + const char *day_start = find_day(s, s + len, &day_end); + if (!day_start) + return; + if (day_end - day_start == 1) + key += '0'; + while (day_start < day_end) + key += *day_start++; +} + +// SORT_{SUB,SUB_SUB}_SEP can creep in from use of @ in label specification. + +void sortify_label(const char *s, int len, string &key) +{ + const char *end = s + len; + for (;;) { + for (const char *ptr = s; + ptr < end && *ptr != SORT_SUB_SEP && *ptr != SORT_SUB_SUB_SEP; + ptr++) + ; + if (ptr > s) + sortify_words(s, ptr, 0, key); + s = ptr; + if (s >= end) + break; + key += *s++; + } +} + +void reference::compute_sort_key() +{ + if (sort_fields.length() == 0) + return; + sort_fields += '\0'; + const char *sf = sort_fields.contents(); + while (*sf != '\0') { + if (sf > sort_fields) + sort_key += SORT_SEP; + char f = *sf++; + int n = 1; + if (*sf == '+') { + n = INT_MAX; + sf++; + } + else if (csdigit(*sf)) { + char *ptr; + long l = strtol(sf, &ptr, 10); + if (l == 0 && ptr == sf) + ; + else { + sf = ptr; + if (l < 0) { + n = 1; + } + else { + n = int(l); + } + } + } + if (f == '.') + sortify_label(label.contents(), label.length(), sort_key); + else if (f == AUTHOR_FIELDS[0]) + sortify_authors(n, sort_key); + else + sortify_field(f, n, sort_key); + } + sort_fields.set_length(sort_fields.length() - 1); +} + +void reference::sortify_authors(int n, string &result) const +{ + for (const char *p = AUTHOR_FIELDS; *p != '\0'; p++) + if (contains_field(*p)) { + sortify_field(*p, n, result); + return; + } + sortify_field(AUTHOR_FIELDS[0], n, result); +} + +void reference::canonicalize_authors(string &result) const +{ + int len = result.length(); + sortify_authors(INT_MAX, result); + if (result.length() > len) + result += SORT_SUB_SEP; +} + +void reference::sortify_field(unsigned char f, int n, string &result) const +{ + typedef void (*sortify_t)(const char *, int, string &); + sortify_t sortifier = sortify_other; + switch (f) { + case 'A': + case 'E': + sortifier = sortify_name; + break; + case 'D': + sortifier = sortify_date; + break; + case 'B': + case 'J': + case 'T': + sortifier = sortify_title; + break; + } + int fi = field_index[(unsigned char)f]; + if (fi != NULL_FIELD_INDEX) { + string &str = field[fi]; + const char *start = str.contents(); + const char *end = start + str.length(); + for (int i = 0; i < n && start < end; i++) { + const char *p = start; + while (start < end && *start != FIELD_SEPARATOR) + start++; + if (i > 0) + result += SORT_SUB_SEP; + (*sortifier)(p, start - p, result); + if (start < end) + start++; + } + } +} + +int compare_reference(const reference &r1, const reference &r2) +{ + assert(r1.no >= 0); + assert(r2.no >= 0); + const char *s1 = r1.sort_key.contents(); + int n1 = r1.sort_key.length(); + const char *s2 = r2.sort_key.contents(); + int n2 = r2.sort_key.length(); + for (; n1 > 0 && n2 > 0; --n1, --n2, ++s1, ++s2) + if (*s1 != *s2) + return (int)(unsigned char)*s1 - (int)(unsigned char)*s2; + if (n2 > 0) + return -1; + if (n1 > 0) + return 1; + return r1.no - r2.no; +} + +int same_reference(const reference &r1, const reference &r2) +{ + if (!r1.rid.is_null() && r1.rid == r2.rid) + return 1; + if (r1.h != r2.h) + return 0; + if (r1.nfields != r2.nfields) + return 0; + int i = 0; + for (i = 0; i < 256; i++) + if (r1.field_index != r2.field_index) + return 0; + for (i = 0; i < r1.nfields; i++) + if (r1.field[i] != r2.field[i]) + return 0; + return 1; +} + +const char *find_last_name(const char *start, const char *end, + const char **endp) +{ + const char *ptr = start; + const char *last_word = start; + for (;;) { + const char *token_start = ptr; + if (!get_token(&ptr, end)) + break; + if (ptr - token_start == 1) { + if (*token_start == ',') { + *endp = token_start; + return last_word; + } + else if (*token_start == ' ' || *token_start == '\n') { + if (ptr < end && *ptr != ' ' && *ptr != '\n') + last_word = ptr; + } + } + } + *endp = end; + return last_word; +} + +void abbreviate_name(const char *ptr, const char *end, string &result) +{ + const char *last_name_end; + const char *last_name_start = find_last_name(ptr, end, &last_name_end); + int need_period = 0; + for (;;) { + const char *token_start = ptr; + if (!get_token(&ptr, last_name_start)) + break; + const token_info *ti = lookup_token(token_start, ptr); + if (need_period) { + if ((ptr - token_start == 1 && *token_start == ' ') + || (ptr - token_start == 2 && token_start[0] == '\\' + && token_start[1] == ' ')) + continue; + if (ti->is_upper()) + result += period_before_initial; + else + result += period_before_other; + need_period = 0; + } + result.append(token_start, ptr - token_start); + if (ti->is_upper()) { + const char *lower_ptr = ptr; + int first_token = 1; + for (;;) { + token_start = ptr; + if (!get_token(&ptr, last_name_start)) + break; + if ((ptr - token_start == 1 && *token_start == ' ') + || (ptr - token_start == 2 && token_start[0] == '\\' + && token_start[1] == ' ')) + break; + ti = lookup_token(token_start, ptr); + if (ti->is_hyphen()) { + const char *ptr1 = ptr; + if (get_token(&ptr1, last_name_start)) { + ti = lookup_token(ptr, ptr1); + if (ti->is_upper()) { + result += period_before_hyphen; + result.append(token_start, ptr1 - token_start); + ptr = ptr1; + } + } + } + else if (ti->is_upper()) { + // MacDougal -> MacD. + result.append(lower_ptr, ptr - lower_ptr); + lower_ptr = ptr; + first_token = 1; + } + else if (first_token && ti->is_accent()) { + result.append(token_start, ptr - token_start); + lower_ptr = ptr; + } + first_token = 0; + } + need_period = 1; + } + } + if (need_period) + result += period_before_last_name; + result.append(last_name_start, end - last_name_start); +} + +static void abbreviate_names(string &result) +{ + string str; + str.move(result); + const char *ptr = str.contents(); + const char *end = ptr + str.length(); + while (ptr < end) { + const char *name_end = (char *)memchr(ptr, FIELD_SEPARATOR, end - ptr); + if (name_end == 0) + name_end = end; + abbreviate_name(ptr, name_end, result); + if (name_end >= end) + break; + ptr = name_end + 1; + result += FIELD_SEPARATOR; + } +} + +void reverse_name(const char *ptr, const char *name_end, string &result) +{ + const char *last_name_end; + const char *last_name_start = find_last_name(ptr, name_end, &last_name_end); + result.append(last_name_start, last_name_end - last_name_start); + while (last_name_start > ptr + && (last_name_start[-1] == ' ' || last_name_start[-1] == '\n')) + last_name_start--; + if (last_name_start > ptr) { + result += ", "; + result.append(ptr, last_name_start - ptr); + } + if (last_name_end < name_end) + result.append(last_name_end, name_end - last_name_end); +} + +void reverse_names(string &result, int n) +{ + if (n <= 0) + return; + string str; + str.move(result); + const char *ptr = str.contents(); + const char *end = ptr + str.length(); + while (ptr < end) { + if (--n < 0) { + result.append(ptr, end - ptr); + break; + } + const char *name_end = (char *)memchr(ptr, FIELD_SEPARATOR, end - ptr); + if (name_end == 0) + name_end = end; + reverse_name(ptr, name_end, result); + if (name_end >= end) + break; + ptr = name_end + 1; + result += FIELD_SEPARATOR; + } +} + +// Return number of field separators. + +int join_fields(string &f) +{ + const char *ptr = f.contents(); + int len = f.length(); + int nfield_seps = 0; + for (int j = 0; j < len; j++) + if (ptr[j] == FIELD_SEPARATOR) + nfield_seps++; + if (nfield_seps == 0) + return 0; + string temp; + int field_seps_left = nfield_seps; + for (j = 0; j < len; j++) { + if (ptr[j] == FIELD_SEPARATOR) { + if (nfield_seps == 1) + temp += join_authors_exactly_two; + else if (--field_seps_left == 0) + temp += join_authors_last_two; + else + temp += join_authors_default; + } + else + temp += ptr[j]; + } + f = temp; + return nfield_seps; +} + +void uppercase(const char *start, const char *end, string &result) +{ + for (;;) { + const char *token_start = start; + if (!get_token(&start, end)) + break; + const token_info *ti = lookup_token(token_start, start); + ti->upper_case(token_start, start, result); + } +} + +void lowercase(const char *start, const char *end, string &result) +{ + for (;;) { + const char *token_start = start; + if (!get_token(&start, end)) + break; + const token_info *ti = lookup_token(token_start, start); + ti->lower_case(token_start, start, result); + } +} + +void capitalize(const char *ptr, const char *end, string &result) +{ + int in_small_point_size = 0; + for (;;) { + const char *start = ptr; + if (!get_token(&ptr, end)) + break; + const token_info *ti = lookup_token(start, ptr); + const char *char_end = ptr; + int is_lower = ti->is_lower(); + if ((is_lower || ti->is_upper()) && get_token(&ptr, end)) { + const token_info *ti2 = lookup_token(char_end, ptr); + if (!ti2->is_accent()) + ptr = char_end; + } + if (is_lower) { + if (!in_small_point_size) { + result += "\\s-2"; + in_small_point_size = 1; + } + ti->upper_case(start, char_end, result); + result.append(char_end, ptr - char_end); + } + else { + if (in_small_point_size) { + result += "\\s+2"; + in_small_point_size = 0; + } + result.append(start, ptr - start); + } + } + if (in_small_point_size) + result += "\\s+2"; +} + +void capitalize_field(string &str) +{ + string temp; + capitalize(str.contents(), str.contents() + str.length(), temp); + str.move(temp); +} + +int is_terminated(const char *ptr, const char *end) +{ + const char *last_token = end; + for (;;) { + const char *p = ptr; + if (!get_token(&ptr, end)) + break; + last_token = p; + } + return end - last_token == 1 + && (*last_token == '.' || *last_token == '!' || *last_token == '?'); +} + +void reference::output(FILE *fp) +{ + fputs(".]-\n", fp); + for (int i = 0; i < 256; i++) + if (field_index[i] != NULL_FIELD_INDEX && i != annotation_field) { + string &f = field[field_index[i]]; + if (!csdigit(i)) { + int j = reverse_fields.search(i); + if (j >= 0) { + int n; + int len = reverse_fields.length(); + if (++j < len && csdigit(reverse_fields[j])) { + n = reverse_fields[j] - '0'; + for (++j; j < len && csdigit(reverse_fields[j]); j++) + // should check for overflow + n = n*10 + reverse_fields[j] - '0'; + } + else + n = INT_MAX; + reverse_names(f, n); + } + } + int is_multiple = join_fields(f) > 0; + if (capitalize_fields.search(i) >= 0) + capitalize_field(f); + if (memchr(f.contents(), '\n', f.length()) == 0) { + fprintf(fp, ".ds [%c ", i); + if (f[0] == ' ' || f[0] == '\\' || f[0] == '"') + putc('"', fp); + put_string(f, fp); + putc('\n', fp); + } + else { + fprintf(fp, ".de [%c\n", i); + put_string(f, fp); + fputs("..\n", fp); + } + if (i == 'P') { + int multiple_pages = 0; + if (f.length() > 0 && memchr(f.contents(), '-', f.length()) != 0) + multiple_pages = 1; + fprintf(fp, ".nr [P %d\n", multiple_pages); + } + else if (i == 'E') + fprintf(fp, ".nr [E %d\n", is_multiple); + } + for (const char *p = "TAO"; *p; p++) { + int fi = field_index[(unsigned char)*p]; + if (fi != NULL_FIELD_INDEX) { + string &f = field[fi]; + fprintf(fp, ".nr [%c %d\n", *p, + is_terminated(f.contents(), f.contents() + f.length())); + } + } + int t = classify(); + fprintf(fp, ".][ %d %s\n", t, reference_types[t]); + if (annotation_macro.length() > 0 && annotation_field >= 0 + && field_index[annotation_field] != NULL_FIELD_INDEX) { + putc('.', fp); + put_string(annotation_macro, fp); + putc('\n', fp); + put_string(field[field_index[annotation_field]], fp); + } +} + +void reference::print_sort_key_comment(FILE *fp) +{ + fputs(".\\\"", fp); + put_string(sort_key, fp); + putc('\n', fp); +} + +const char *find_year(const char *start, const char *end, const char **endp) +{ + for (;;) { + while (start < end && !csdigit(*start)) + start++; + const char *ptr = start; + if (start == end) + break; + while (ptr < end && csdigit(*ptr)) + ptr++; + if (ptr - start == 4 || ptr - start == 3 + || (ptr - start == 2 + && (start[0] >= '4' || (start[0] == '3' && start[1] >= '2')))) { + *endp = ptr; + return start; + } + start = ptr; + } + return 0; +} + +static const char *find_day(const char *start, const char *end, + const char **endp) +{ + for (;;) { + while (start < end && !csdigit(*start)) + start++; + const char *ptr = start; + if (start == end) + break; + while (ptr < end && csdigit(*ptr)) + ptr++; + if ((ptr - start == 1 && start[0] != '0') + || (ptr - start == 2 && + (start[0] == '1' + || start[0] == '2' + || (start[0] == '3' && start[1] <= '1') + || (start[0] == '0' && start[1] != '0')))) { + *endp = ptr; + return start; + } + start = ptr; + } + return 0; +} + +static int find_month(const char *start, const char *end) +{ + static const char *months[] = { + "january", + "february", + "march", + "april", + "may", + "june", + "july", + "august", + "september", + "october", + "november", + "december", + }; + for (;;) { + while (start < end && !csalpha(*start)) + start++; + const char *ptr = start; + if (start == end) + break; + while (ptr < end && csalpha(*ptr)) + ptr++; + if (ptr - start >= 3) { + for (int i = 0; i < sizeof(months)/sizeof(months[0]); i++) { + const char *q = months[i]; + const char *p = start; + for (; p < ptr; p++, q++) + if (cmlower(*p) != *q) + break; + if (p >= ptr) + return i; + } + } + start = ptr; + } + return -1; +} + +int reference::contains_field(char c) const +{ + return field_index[(unsigned char)c] != NULL_FIELD_INDEX; +} + +int reference::classify() +{ + if (contains_field('J')) + return JOURNAL_ARTICLE; + if (contains_field('B')) + return ARTICLE_IN_BOOK; + if (contains_field('G')) + return TECH_REPORT; + if (contains_field('R')) + return TECH_REPORT; + if (contains_field('I')) + return BOOK; + if (contains_field('M')) + return BELL_TM; + return OTHER; +} + +const char *reference::get_year(const char **endp) const +{ + if (field_index['D'] != NULL_FIELD_INDEX) { + string &date = field[field_index['D']]; + const char *start = date.contents(); + const char *end = start + date.length(); + return find_year(start, end, endp); + } + else + return 0; +} + +const char *reference::get_field(unsigned char c, const char **endp) const +{ + if (field_index[c] != NULL_FIELD_INDEX) { + string &f = field[field_index[c]]; + const char *start = f.contents(); + *endp = start + f.length(); + return start; + } + else + return 0; +} + +const char *reference::get_date(const char **endp) const +{ + return get_field('D', endp); +} + +const char *nth_field(int i, const char *start, const char **endp) +{ + while (--i >= 0) { + start = (char *)memchr(start, FIELD_SEPARATOR, *endp - start); + if (!start) + return 0; + start++; + } + const char *e = (char *)memchr(start, FIELD_SEPARATOR, *endp - start); + if (e) + *endp = e; + return start; +} + +const char *reference::get_author(int i, const char **endp) const +{ + for (const char *f = AUTHOR_FIELDS; *f != '\0'; f++) { + const char *start = get_field(*f, endp); + if (start) { + if (strchr(MULTI_FIELD_NAMES, *f) != 0) + return nth_field(i, start, endp); + else if (i == 0) + return start; + else + return 0; + } + } + return 0; +} + +const char *reference::get_author_last_name(int i, const char **endp) const +{ + for (const char *f = AUTHOR_FIELDS; *f != '\0'; f++) { + const char *start = get_field(*f, endp); + if (start) { + if (strchr(MULTI_FIELD_NAMES, *f) != 0) { + start = nth_field(i, start, endp); + if (!start) + return 0; + } + if (*f == 'A') + return find_last_name(start, *endp, endp); + else + return start; + } + } + return 0; +} + +void reference::set_date(string &d) +{ + if (d.length() == 0) + delete_field('D'); + else + insert_field('D', d); +} + +int same_year(const reference &r1, const reference &r2) +{ + const char *ye1; + const char *ys1 = r1.get_year(&ye1); + const char *ye2; + const char *ys2 = r2.get_year(&ye2); + if (ys1 == 0) { + if (ys2 == 0) + return same_date(r1, r2); + else + return 0; + } + else if (ys2 == 0) + return 0; + else if (ye1 - ys1 != ye2 - ys2) + return 0; + else + return memcmp(ys1, ys2, ye1 - ys1) == 0; +} + +int same_date(const reference &r1, const reference &r2) +{ + const char *e1; + const char *s1 = r1.get_date(&e1); + const char *e2; + const char *s2 = r2.get_date(&e2); + if (s1 == 0) + return s2 == 0; + else if (s2 == 0) + return 0; + else if (e1 - s1 != e2 - s2) + return 0; + else + return memcmp(s1, s2, e1 - s1) == 0; +} + +const char *reference::get_sort_field(int i, int si, int ssi, + const char **endp) const +{ + const char *start = sort_key.contents(); + const char *end = start + sort_key.length(); + if (i < 0) { + *endp = end; + return start; + } + while (--i >= 0) { + start = (char *)memchr(start, SORT_SEP, end - start); + if (!start) + return 0; + start++; + } + const char *e = (char *)memchr(start, SORT_SEP, end - start); + if (e) + end = e; + if (si < 0) { + *endp = end; + return start; + } + while (--si >= 0) { + start = (char *)memchr(start, SORT_SUB_SEP, end - start); + if (!start) + return 0; + start++; + } + e = (char *)memchr(start, SORT_SUB_SEP, end - start); + if (e) + end = e; + if (ssi < 0) { + *endp = end; + return start; + } + while (--ssi >= 0) { + start = (char *)memchr(start, SORT_SUB_SUB_SEP, end - start); + if (!start) + return 0; + start++; + } + e = (char *)memchr(start, SORT_SUB_SUB_SEP, end - start); + if (e) + end = e; + *endp = end; + return start; +} + diff --git a/gnu/usr.bin/groff/refer/ref.h b/gnu/usr.bin/groff/refer/ref.h new file mode 100644 index 000000000000..28e498dc5d83 --- /dev/null +++ b/gnu/usr.bin/groff/refer/ref.h @@ -0,0 +1,120 @@ +// -*- C++ -*- +/* Copyright (C) 1989, 1990, 1991, 1992 Free Software Foundation, Inc. + Written by James Clark (jjc@jclark.com) + +This file is part of groff. + +groff is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 2, or (at your option) any later +version. + +groff is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License along +with groff; see the file COPYING. If not, write to the Free Software +Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. */ + +struct label_info; + +enum label_type { NORMAL_LABEL, SHORT_LABEL }; +const int N_LABEL_TYPES = 2; + +struct substring_position { + int start; + int length; + substring_position() : start(-1) { } +}; + +class int_set { + string v; +public: + int_set() { } + void set(int i); + int get(int i) const; +}; + +class reference { +private: + unsigned h; + reference_id rid; + int merged; + string sort_key; + int no; + string *field; + int nfields; + unsigned char field_index[256]; + enum { NULL_FIELD_INDEX = 255 }; + string label; + substring_position separator_pos; + string short_label; + substring_position short_separator_pos; + label_info *label_ptr; + string authors; + int computed_authors; + int last_needed_author; + int nauthors; + int_set last_name_unambiguous; + + int contains_field(char) const; + void insert_field(unsigned char, string &s); + void delete_field(unsigned char); + void set_date(string &); + const char *get_sort_field(int i, int si, int ssi, const char **endp) const; + int merge_labels_by_parts(reference **, int, label_type, string &); + int merge_labels_by_number(reference **, int, label_type, string &); +public: + reference(const char * = 0, int = -1, reference_id * = 0); + ~reference(); + void output(FILE *); + void print_sort_key_comment(FILE *); + void set_number(int); + int get_number() const { return no; } + unsigned hash() const { return h; } + const string &get_label(label_type type) const; + const substring_position &get_separator_pos(label_type) const; + int is_merged() const { return merged; } + void compute_sort_key(); + void compute_hash_code(); + void pre_compute_label(); + void compute_label(); + void immediate_compute_label(); + int classify(); + void merge(reference &); + int merge_labels(reference **, int, label_type, string &); + int get_nauthors() const; + void need_author(int); + void set_last_name_unambiguous(int); + void sortify_authors(int, string &) const; + void canonicalize_authors(string &) const; + void sortify_field(unsigned char, int, string &) const; + const char *get_author(int, const char **) const; + const char *get_author_last_name(int, const char **) const; + const char *get_date(const char **) const; + const char *get_year(const char **) const; + const char *get_field(unsigned char, const char **) const; + const label_info *get_label_ptr() const { return label_ptr; } + const char *get_authors(const char **) const; + // for sorting + friend int compare_reference(const reference &r1, const reference &r2); + // for merging + friend int same_reference(const reference &, const reference &); + friend int same_year(const reference &, const reference &); + friend int same_date(const reference &, const reference &); + friend int same_author_last_name(const reference &, const reference &, int); + friend int same_author_name(const reference &, const reference &, int); +}; + +const char *find_year(const char *, const char *, const char **); +const char *find_last_name(const char *, const char *, const char **); + +const char *nth_field(int i, const char *start, const char **endp); + +void capitalize(const char *ptr, const char *end, string &result); +void reverse_name(const char *ptr, const char *end, string &result); +void uppercase(const char *ptr, const char *end, string &result); +void lowercase(const char *ptr, const char *end, string &result); +void abbreviate_name(const char *ptr, const char *end, string &result); diff --git a/gnu/usr.bin/groff/refer/refer.1 b/gnu/usr.bin/groff/refer/refer.1 new file mode 100644 index 000000000000..ad367f712408 --- /dev/null +++ b/gnu/usr.bin/groff/refer/refer.1 @@ -0,0 +1,1282 @@ +.\" -*- nroff -*- +.de TQ +.br +.ns +.TP \\$1 +.. +.\" Like TP, but if specified indent is more than half +.\" the current line-length - indent, use the default indent. +.de Tp +.ie \\n(.$=0:((0\\$1)*2u>(\\n(.lu-\\n(.iu)) .TP +.el .TP "\\$1" +.. +.\" The BSD man macros can't handle " in arguments to font change macros, +.\" so use \(ts instead of ". +.tr \(ts" +.TH REFER 1 "19 February 1993" "Groff Version 1.08" +.SH NAME +refer \- preprocess bibliographic references for groff +.SH SYNOPSIS +.nr a \n(.j +.ad l +.nr i \n(.i +.in +\w'\fBrefer 'u +.ti \niu +.B refer +.de OP +.ie \\n(.$-1 .RI "[\ \fB\\$1\fP" "\\$2" "\ ]" +.el .RB "[\ " "\\$1" "\ ]" +.. +.OP \-benvCPRS +.OP \-a n +.OP \-c fields +.OP \-f n +.OP \-i fields +.OP \-k field +.OP \-l m,n +.OP \-p filename +.OP \-s fields +.OP \-t n +.OP \-B field.macro +.RI [\ filename \|.\|.\|.\ ] +.br +.ad \na +.SH DESCRIPTION +This file documents the GNU version of +.BR refer , +which is part of the groff document formatting system. +.B refer +copies the contents of +.IR filename \|.\|.\|. +to the standard output, +except that lines between +.B .[ +and +.B .] +are interpreted as citations, +and lines between +.B .R1 +and +.B .R2 +are interpreted as commands about how citations are to be processed. +.LP +Each citation specifies a reference. +The citation can specify a reference that is contained in +a bibliographic database by giving a set of keywords +that only that reference contains. +Alternatively it can specify a reference by supplying a database +record in the citation. +A combination of these alternatives is also possible. +.LP +For each citation, +.B refer +can produce a mark in the text. +This mark consists of some label which can be separated from +the text and from other labels in various ways. +For each reference it also outputs +.B groff +commands that can be used by a macro package to produce a formatted +reference for each citation. +The output of +.B refer +must therefore be processed using a suitable macro package. +The +.B \-ms +and +.B \-me +macros are both suitable. +The commands to format a citation's reference can be output immediately after +the citation, +or the references may be accumulated, +and the commands output at some later point. +If the references are accumulated, then multiple citations of the same +reference will produce a single formatted reference. +.LP +The interpretation of lines between +.B .R1 +and +.B .R2 +as commands is a new feature of GNU refer. +Documents making use of this feature can still be processed by +Unix refer just by adding the lines +.RS +.LP +.nf +.ft B +\&.de R1 +\&.ig R2 +\&.. +.ft +.fi +.RE +to the beginning of the document. +This will cause +.B troff +to ignore everything between +.B .R1 +and +.BR .R2 . +The effect of some commands can also be achieved by options. +These options are supported mainly for compatibility with Unix refer. +It is usually more convenient to use commands. +.LP +.B refer +generates +.B .lf +lines so that filenames and line numbers in messages produced +by commands that read +.B refer +output will be correct; +it also interprets lines beginning with +.B .lf +so that filenames and line numbers in the messages and +.B .lf +lines that it produces will be accurate even if the input has been +preprocessed by a command such as +.BR soelim (1). +.SH OPTIONS +.LP +Most options are equivalent to commands +(for a description of these commands see the +.B Commands +subsection): +.TP +.B \-b +.B +no-label-in-text; no-label-in-reference +.TP +.B \-e +.B accumulate +.TP +.B \-n +.B no-default-database +.TP +.B \-C +.B compatible +.TP +.B \-P +.B move-puntuation +.TP +.B \-S +.B +label "(A.n|Q) ', ' (D.y|D)"; bracket-label " (" ) "; " +.TP +.BI \-a n +.B reverse +.BI A n +.TP +.BI \-c fields +.B capitalize +.I fields +.TP +.BI \-f n +.B label +.BI % n +.TP +.BI \-i fields +.B search-ignore +.I fields +.TP +.B \-k +.B label +.B L\(ti%a +.TP +.BI \-k field +.B label +.IB field \(ti%a +.TP +.B \-l +.B label +.BI A.nD.y%a +.TP +.BI \-l m +.B label +.BI A.n+ m D.y%a +.TP +.BI \-l, n +.B label +.BI A.nD.y\- n %a +.TP +.BI \-l m , n +.B label +.BI A.n+ m D.y\- n %a +.TP +.BI \-p filename +.B database +.I filename +.TP +.BI \-s spec +.B sort +.I spec +.TP +.BI \-t n +.B search-truncate +.I n +.LP +These options are equivalent to the following commands with the +addition that the filenames specified on the command line are +processed as if they were arguments to the +.B bibliography +command instead of in the normal way: +.TP +.B \-B +.B +annotate X AP; no-label-in-reference +.TP +.BI \-B field . macro +.B annotate +.I field +.IB macro ; +.B no-label-in-reference +.LP +The following options have no equivalent commands: +.TP +.B \-v +Print the version number. +.TP +.B \-R +Don't recognize lines beginning with +.BR .R1 / .R2 . +.SH USAGE +.SS Bibliographic databases +The bibliographic database is a text file consisting of records +separated by one or more blank lines. +Within each record fields start with a +.B % +at the beginning of a line. +Each field has a one character name that immediately follows the +.BR % . +It is best to use only upper and lower case letters for the names +of fields. +The name of the field should be followed by exactly one space, +and then by the contents of the field. +Empty fields are ignored. +The conventional meaning of each field is as follows: +.TP +.B A +The name of an author. +If the name contains a title such as +.B Jr. +at the end, +it should be separated from the last name by a comma. +There can be multiple occurrences of the +.B A +field. +The order is siginificant. +It is a good idea always to supply an +.B A +field or a +.B Q +field. +.TP +.B B +For an article that is part of a book, the title of the book +.TP +.B C +The place (city) of publication. +.TP +.B D +The date of publication. +The year should be specified in full. +If the month is specified, the name rather than the number of the month +should be used, but only the first three letters are required. +It is a good idea always to supply a +.B D +field; +if the date is unknown, a value such as +.B in press +or +.B unknown +can be used. +.TP +.B E +For an article that is part of a book, the name of an editor of the book. +Where the work has editors and no authors, +the names of the editors should be given as +.B A +fields and +.B ,\ (ed) +or +.B ,\ (eds) +should be appended to the last author. +.TP +.B G +US Government ordering number. +.TP +.B I +The publisher (issuer). +.TP +.B J +For an article in a journal, the name of the journal. +.TP +.B K +Keywords to be used for searching. +.TP +.B L +Label. +.TP +.B N +Journal issue number. +.TP +.B O +Other information. +This is usually printed at the end of the reference. +.TP +.B P +Page number. +A range of pages can be specified as +.IB m \- n\fR. +.TP +.B Q +The name of the author, if the author is not a person. +This will only be used if there are no +.B A +fields. +There can only be one +.B Q +field. +.TP +.B R +Technical report number. +.TP +.B S +Series name. +.TP +.B T +Title. +For an article in a book or journal, +this should be the title of the article. +.TP +.B V +Volume number of the journal or book. +.TP +.B X +Annotation. +.LP +For all fields except +.B A +and +.BR E , +if there is more than one occurrence of a particular field in a record, +only the last such field will be used. +.LP +If accent strings are used, they should follow the charater to be accented. +This means that the +.B AM +macro must be used with the +.B \-ms +macros. +Accent strings should not be quoted: +use one +.B \e +rather than two. +.SS Citations +The format of a citation is +.RS +.BI .[ opening-text +.br +.I +flags keywords +.br +.I fields +.br +.BI .] closing-text +.RE +.LP +The +.IR opening-text , +.IR closing-text +and +.I flags +components are optional. +Only one of the +.I keywords +and +.I fields +components need be specified. +.LP +The +.I keywords +component says to search the bibliographic databases for a reference +that contains all the words in +.IR keywords . +It is an error if more than one reference if found. +.LP +The +.I fields +components specifies additional fields to replace or supplement +those specified in the reference. +When references are being accumulated and the +.I keywords +component is non-empty, +then additional fields should be specified only on the first +occasion that a particular reference is cited, +and will apply to all citations of that reference. +.LP +The +.I opening-text +and +.I closing-text +component specifies strings to be used to bracket the label instead +of the strings specified in the +.B bracket-label +command. +If either of these components is non-empty, +the strings specified in the +.B bracket-label +command will not be used; +this behavior can be altered using the +.B [ +and +.B ] +flags. +Note that leading and trailing spaces are significant for these components. +.LP +The +.I flags +component is a list of +non-alphanumeric characters each of which modifies the treatment +of this particular citation. +Unix refer will treat these flags as part of the keywords and +so will ignore them since they are non-alphanumeric. +The following flags are currently recognized: +.TP +.B # +This says to use the label specified by the +.B short-label +command, +instead of that specified by the +.B label +command. +If no short label has been specified, the normal label will be used. +Typically the short label is used with author-date labels +and consists of only the date and possibly a disambiguating letter; +the +.B # +is supposed to be suggestive of a numeric type of label. +.TP +.B [ +Precede +.I opening-text +with the first string specified in the +.B bracket-label +command. +.TP +.B ] +Follow +.I closing-text +with the second string specified in the +.B bracket-label +command. +.LP +One advantages of using the +.B [ +and +.B ] +flags rather than including the brackets in +.I opening-text +and +.I closing-text +is that +you can change the style of bracket used in the document just by changing the +.B bracket-label +command. +Another advantage is that sorting and merging of citations +will not necessarily be inhibited if the flags are used. +.LP +If a label is to be inserted into the text, +it will be attached to the line preceding the +.B .[ +line. +If there is no such line, then an extra line will be inserted before the +.B .[ +line and a warning will be given. +.LP +There is no special notation for making a citation to multiple references. +Just use a sequence of citations, one for each reference. +Don't put anything between the citations. +The labels for all the citations will be attached to the line preceding +the first citation. +The labels may also be sorted or merged. +See the description of the +.B <> +label expression, and of the +.B sort-adjacent-labels +and +.B abbreviate-label-ranges +command. +A label will not be merged if its citation has a non-empty +.I opening-text +or +.IR closing-text . +However, the labels for a citation using the +.B ] +flag and without any +.I closing-text +immediately followed by a citation using the +.B [ +flag and without any +.I opening-text +may be sorted and merged +even though the first citation's +.I opening-text +or the second citation's +.I closing-text +is non-empty. +(If you wish to prevent this just make the first citation's +.I closing-text +.BR \e& .) +.SS Commands +Commands are contained between lines starting with +.B .R1 +and +.BR .R2 . +Recognition of these lines can be prevented by the +.B \-R +option. +When a +.B .R1 +line is recognized any accumulated references are flushed out. +Neither +.B .R1 +nor +.B .R2 +lines, +nor anything between them +is output. +.LP +Commands are separated by newlines or +.BR ; s. +.B # +introduces a comment that extends to the end of the line +(but does not conceal the newline). +Each command is broken up into words. +Words are separated by spaces or tabs. +A word that begins with +.B \(ts +extends to the next +.B \(ts +that is not followed by another +.BR \(ts . +If there is no such +.B \(ts +the word extends to the end of the line. +Pairs of +.B \(ts +in a word beginning with +.B \(ts +collapse to a single +.BR \(ts . +Neither +.B # +nor +.B ; +are recognized inside +.BR \(ts s. +A line can be continued by ending it with +.BR \e ; +this works everywhere except after a +.BR # . +.LP +.ds n \fR* +Each command +.I name +that is marked with \*n has an associated negative command +.BI no- name +that undoes the effect of +.IR name . +For example, the +.B no-sort +command specifies that references should not be sorted. +The negative commands take no arguments. +.LP +In the following description each argument must be a single word; +.I field +is used for a single upper or lower case letter naming a field; +.I fields +is used for a sequence of such letters; +.I m +and +.I n +are used for a non-negative numbers; +.I string +is used for an arbitrary string; +.I filename +is used for the name of a file. +.Tp \w'\fBabbreviate-label-ranges'u+2n +.BI abbreviate\*n\ fields\ string1\ string2\ string3\ string4 +Abbreviate the first names of +.IR fields . +An initial letter will be separated from another initial letter by +.IR string1 , +from the last name by +.IR string2 , +and from anything else +(such as a +.B von +or +.BR de ) +by +.IR string3 . +These default to a period followed by a space. +In a hyphenated first name, +the initial of the first part of the name will be separated from the hyphen by +.IR string4 ; +this defaults to a period. +No attempt is made to handle any ambiguities that might +result from abbreviation. +Names are abbreviated before sorting and before +label construction. +.TP +.BI abbreviate-label-ranges\*n\ string +Three or more adjacent labels that refer to consecutive references +will be abbreviated to a label consisting +of the first label, followed by +.I string +followed by the last label. +This is mainly useful with numeric labels. +If +.I string +is omitted it defaults to +.BR \- . +.TP +.B accumulate\*n +Accumulate references instead of writing out each reference +as it is encountered. +Accumulated references will be written out whenever a reference +of the form +.RS +.IP +.B .[ +.br +.B $LIST$ +.br +.B .] +.LP +is encountered, +after all input files hve been processed, +and whenever +.B .R1 +line is recognized. +.RE +.TP +.BI annotate\*n\ field\ string +.I field +is an annotation; +print it at the end of the reference as a paragraph preceded by the line +.RS +.IP +.BI . string +.LP +If +.I macro +is omitted it will default to +.BR AP ; +if +.I field +is also omitted it will default to +.BR X . +Only one field can be an annotation. +.RE +.TP +.BI articles\ string \fR\|.\|.\|. +.IR string \|.\|.\|. +are definite or indefinite articles, and should be ignored at the beginning of +.B T +fields when sorting. +Initially, +.BR the , +.B a +and +.B an +are recognized as articles. +.TP +.BI bibliography\ filename \fR\|.\|.\|. +Write out all the references contained in the bibliographic databases +.IR filename \|.\|.\|. +.TP +.BI bracket-label\ string1\ string2\ string3 +In the text, bracket each label +with +.I string1 +and +.IR string2 . +An occurrence of +.I string2 +immediately followed by +.I string1 +will be turned into +.IR string3 . +The default behavior is +.RS +.IP +.B +bracket-label \e*([. \e*(.] ", " +.RE +.TP +.BI capitalize\ fields +Convert +.I fields +to caps and small caps. +.TP +.B compatible\*n +Recognize +.B .R1 +and +.B .R2 +even when followed by a character other than space or newline. +.TP +.BI database\ filename \fR\|.\|.\|. +Search the bibligraphic databases +.IR filename \|.\|.\|. +For each +.I filename +if an index +.IB filename .i +created by +.BR indxbib (1) +exists, then it will be searched instead; +each index can cover multiple databases. +.TP +.BI date-as-label\*n\ string +.I string +is a label expression that specifies a string with which to replace the +.B D +field after constructing the label. +See the +.B "Label expressions" +subsection for a description of label expressions. +This command is useful if you do not want explicit labels in the +reference list, but instead want to handle any necessary +disambiguation by qualifying the date in some way. +The label used in the text would typically be some combination of the +author and date. +In most cases you should also use the +.B no-label-in-reference +command. +For example, +.RS +.IP +.B +date-as-label D.+yD.y%a*D.-y +.LP +would attach a disambiguating letter to the year part of the +.B D +field in the reference. +.RE +.TP +.B default-database\*n +The default database should be searched. +This is the default behavior, so the negative version of +this command is more useful. +refer determines whether the default database should be searched +on the first occasion that it needs to do a search. +Thus a +.B no-default-database +command must be given before then, +in order to be effective. +.TP +.BI discard\*n\ fields +When the reference is read, +.I fields +should be discarded; +no string definitions for +.I fields +will be output. +Initially, +.I fields +are +.BR XYZ . +.TP +.BI et-al\*n\ string\ m\ n +Control use of +.B +et al +in the evaluation of +.B @ +expressions in label expressions. +If the number of authors needed to make the author sequence +unambiguous is +.I u +and the total number of authors is +.I t +then the last +.IR t \|\-\| u +authors will be replaced by +.I string +provided that +.IR t \|\-\| u +is not less than +.I m +and +.I t +is not less than +.IR n . +The default behavior is +.RS +.IP +.B +et-al " et al" 2 3 +.RE +.TP +.BI include\ filename +Include +.I filename +and interpret the contents as commands. +.TP +.BI join-authors\ string1\ string2\ string3 +This says how authors should be joined together. +When there are exactly two authors, they will be joined with +.IR string1 . +When there are more than two authors, all but the last two will +be joined with +.IR string2 , +and the last two authors will be joined with +.IR string3 . +If +.I string3 +is omitted, +it will default to +.IR string1 ; +if +.I string2 +is also omitted it will also default to +.IR string1 . +For example, +.RS +.IP +.B +join-authors " and " ", " ", and " +.LP +will restore the default method for joining authors. +.RE +.TP +.B label-in-reference\*n +When outputting the reference, +define the string +.B [F +to be the reference's label. +This is the default behavior; so the negative version +of this command is more useful. +.TP +.B label-in-text\*n +For each reference output a label in the text. +The label will be separated from the surrounding text as described in the +.B bracket-label +command. +This is the default behavior; so the negative version +of this command is more useful. +.TP +.BI label\ string +.I string +is a label expression describing how to label each reference. +.TP +.BI separate-label-second-parts\ string +When merging two-part labels, separate the second part of the second +label from the first label with +.IR string . +See the description of the +.B <> +label expression. +.TP +.B move-punctuation\*n +In the text, move any punctuation at the end of line past the label. +It is usually a good idea to give this command unless you are using +superscripted numbers as labels. +.TP +.BI reverse\*n\ string +Reverse the fields whose names +are in +.IR string . +Each field name can be followed by a number which says +how many such fields should be reversed. +If no number is given for a field, all such fields will be reversed. +.TP +.BI search-ignore\*n\ fields +While searching for keys in databases for which no index exists, +ignore the contents of +.IR fields . +Initially, fields +.B XYZ +are ignored. +.TP +.BI search-truncate\*n\ n +Only require the first +.I n +characters of keys to be given. +In effect when searching for a given key +words in the database are truncated to the maximum of +.I n +and the length of the key. +Initially +.I n +is 6. +.TP +.BI short-label\*n\ string +.I string +is a label expression that specifies an alternative (usually shorter) +style of label. +This is used when the +.B # +flag is given in the citation. +When using author-date style labels, the identity of the author +or authors is sometimes clear from the context, and so it +may be desirable to omit the author or authors from the label. +The +.B short-label +command will typically be used to specify a label containing just +a date and possibly a disambiguating letter. +.TP +.BI sort\*n\ string +Sort references according to +.BR string . +References will automatically be accumulated. +.I string +should be a list of field names, each followed by a number, +indicating how many fields with the name should be used for sorting. +.B + +can be used to indicate that all the fields with the name should be used. +Also +.B . +can be used to indicate the references should be sorted using the +(tentative) label. +(The +.B +Label expressions +subsection describes the concept of a tentative label.) +.TP +.B sort-adjacent-labels\*n +Sort labels that are adjacent in the text according to their +position in the reference list. +This command should usually be given if the +.B abbreviate-label-ranges +command has been given, +or if the label expression contains a +.B <> +expression. +This will have no effect unless references are being accumulated. +.SS Label expressions +.LP +Label expressions can be evaluated both normally and tentatively. +The result of normal evaluation is used for output. +The result of tentative evaluation, called the +.I +tentative label, +is used to gather the information +that normal evaluation needs to disambiguate the label. +Label expressions specified by the +.B date-as-label +and +.B short-label +commands are not evaluated tentatively. +Normal and tentative evaluation are the same for all types +of expression other than +.BR @ , +.BR * , +and +.B % +expressions. +The description below applies to normal evaluation, +except where otherwise specified. +.TP +.I field +.TQ +.I field\ n +The +.IR n -th +part of +.IR field . +If +.I n +is omitted, it defaults to 1. +.TP +.BI ' string ' +The characters in +.I string +literally. +.TP +.B @ +All the authors joined as specified by the +.B join-authors +command. +The whole of each author's name will be used. +However, if the references are sorted by author +(that is the sort specification starts with +.BR A+ ), +then authors' last names will be used instead, provided that this does +not introduce ambiguity, +and also an initial subsequence of the authors may be used +instead of all the authors, again provided that this does not +introduce ambiguity. +The use of only the last name for the +.IR i -th +author of some reference +is considered to be ambiguous if +there is some other reference, +such that the first +.IR i \|-\|1 +authors of the references are the same, +the +.IR i -th +authors are not the same, +but the +.IR i -th +authors' last names are the same. +A proper initial subsequence of the sequence +of authors for some reference is considered to be ambiguous if there is +a reference with some other sequence of authors which also has +that subsequence as a proper initial subsequence. +When an initial subsequence of authors is used, the remaining +authors are replaced by the string specified by the +.B et-al +command; +this command may also specify additional requirements that must be +met before an initial subsequence can be used. +.B @ +tentatively evaluates to a canonical representation of the authors, +such that authors that compare equally for sorting purpose +will have the same representation. +.TP +.BI % n +.TQ +.B %a +.TQ +.B %A +.TQ +.B %i +.TQ +.B %I +The serial number of the reference formatted according to the character +following the +.BR % . +The serial number of a reference is 1 plus the number of earlier references +with same tentative label as this reference. +These expressions tentatively evaluate to an empty string. +.TP +.IB expr * +If there is another reference with the same tentative label as +this reference, then +.IR expr , +otherwise an empty string. +It tentatively evaluates to an empty string. +.TP +.IB expr + n +.TQ +.IB expr \- n +The first +.RB ( + ) +or last +.RB ( \- ) +.I n +upper or lower case letters or digits of +.IR expr . +Troff special characters (such as +.BR \e('a ) +count as a single letter. +Accent strings are retained but do not count towards the total. +.TP +.IB expr .l +.I expr +converted to lowercase. +.TP +.IB expr .u +.I expr +converted to uppercase. +.TP +.IB expr .c +.I expr +converted to caps and small caps. +.TP +.IB expr .r +.I expr +reversed so that the last name is first. +.TP +.IB expr .a +.I expr +with first names abbreviated. +Note that fields specified in the +.B abbreviate +command are abbreviated before any labels are evaluated. +Thus +.B .a +is useful only when you want a field to be abbreviated in a label +but not in a reference. +.TP +.IB expr .y +The year part of +.IR expr . +.TP +.IB expr .+y +The part of +.I expr +before the year, or the whole of +.I expr +if it does not contain a year. +.TP +.IB expr .\-y +The part of +.I expr +after the year, or an empty string if +.I expr +does not contain a year. +.TP +.IB expr .n +The last name part of +.IR expr . +.TP +.IB expr1 \(ti expr2 +.I expr1 +except that if the last character of +.I expr1 +is +.B \- +then it will be replaced by +.IR expr2 . +.TP +.I expr1\ expr2 +The concatenation of +.I expr1 +and +.IR expr2 . +.TP +.IB expr1 | expr2 +If +.I expr1 +is non-empty then +.I expr1 +otherwise +.IR expr2 . +.TP +.IB expr1 & expr2 +If +.I expr1 +is non-empty +then +.I expr2 +otherwise an empty string. +.TP +.IB expr1 ? expr2 : expr3 +If +.I expr1 +is non-empty +then +.I expr2 +otherwise +.IR expr3 . +.TP +.BI < expr > +The label is in two parts, which are separated by +.IR expr . +Two adjacent two-part labels which have the same first part will be +merged by appending the second part of the second label onto the first +label separated by the string specified in the +.B separate-label-second-parts +command (initially, a comma followed by a space); the resulting label +will also be a two-part label with the same first part as before +merging, and so additional labels can be merged into it. +Note that it is permissible for the first part to be empty; +this maybe desirable for expressions used in the +.B short-label +command. +.TP +.BI ( expr ) +The same as +.IR expr . +Used for grouping. +.LP +The above expressions are listed in order of precedence +(highest first); +.B & +and +.B | +have the same precedence. +.SS Macro interface +Each reference starts with a call to the macro +.BR ]- . +The string +.B [F +will be defined to be the label for this reference, +unless the +.B no-label-in-reference +command has been given. +There then follows a series of string definitions, +one for each field: +string +.BI [ X +corresponds to field +.IR X . +The number register +.B [P +is set to 1 if the +.B P +field contains a range of pages. +The +.BR [T , +.B [A +and +.B [O +number registers are set to 1 according as the +.BR T , +.B A +and +.B O +fields end with one of the characters +.BR .?! . +The +.B [E +number register will be set to 1 if the +.B [E +string contains more than one name. +The reference is followed by a call to the +.B ][ +macro. +The first argument to this macro gives a number representing +the type of the reference. +If a reference contains a +.B J +field, it will be classified as type 1, +otherwise if it contains a +.B B +field, it will type 3, +otherwise if it contains a +.B G +or +.B R +field it will be type 4, +otherwise if contains a +.B I +field it will be type 2, +otherwise it will be type 0. +The second argument is a symbolic name for the type: +.BR other , +.BR journal-article , +.BR book , +.B article-in-book +or +.BR tech-report . +Groups of references that have been accumulated +or are produced by the +.B bibliography +command are preceded by a call to the +.B ]< +macro and followed by a call to the +.B ]> +macro. +.SH FILES +.Tp \w'\fB/usr/share/dict/papers/Ind'u+2n +.B /usr/share/dict/papers/Ind +Default database. +.TP +.IB file .i +Index files. +.SH "SEE ALSO" +.BR indxbib (1), +.BR lookbib (1), +.BR lkbib (1) +.br +.SH BUGS +In label expressions, +.B <> +expressions are ignored inside +.BI . char +expressions. diff --git a/gnu/usr.bin/groff/refer/refer.cc b/gnu/usr.bin/groff/refer/refer.cc new file mode 100644 index 000000000000..cb6e218e3ea1 --- /dev/null +++ b/gnu/usr.bin/groff/refer/refer.cc @@ -0,0 +1,1221 @@ +// -*- C++ -*- +/* Copyright (C) 1989, 1990, 1991, 1992 Free Software Foundation, Inc. + Written by James Clark (jjc@jclark.com) + +This file is part of groff. + +groff is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 2, or (at your option) any later +version. + +groff is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License along +with groff; see the file COPYING. If not, write to the Free Software +Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. */ + +#include "refer.h" +#include "refid.h" +#include "ref.h" +#include "token.h" +#include "search.h" +#include "command.h" + +const char PRE_LABEL_MARKER = '\013'; +const char POST_LABEL_MARKER = '\014'; +const char LABEL_MARKER = '\015'; // label_type is added on + +#define FORCE_LEFT_BRACKET 04 +#define FORCE_RIGHT_BRACKET 010 + +static FILE *outfp = stdout; + +string capitalize_fields; +string reverse_fields; +string abbreviate_fields; +string period_before_last_name = ". "; +string period_before_initial = "."; +string period_before_hyphen = ""; +string period_before_other = ". "; +string sort_fields; +int annotation_field = -1; +string annotation_macro; +string discard_fields = "XYZ"; +string pre_label = "\\*([."; +string post_label = "\\*(.]"; +string sep_label = ", "; +int accumulate = 0; +int move_punctuation = 0; +int abbreviate_label_ranges = 0; +string label_range_indicator; +int label_in_text = 1; +int label_in_reference = 1; +int date_as_label = 0; +int sort_adjacent_labels = 0; +// Join exactly two authors with this. +string join_authors_exactly_two = " and "; +// When there are more than two authors join the last two with this. +string join_authors_last_two = ", and "; +// Otherwise join authors with this. +string join_authors_default = ", "; +string separate_label_second_parts = ", "; +// Use this string to represent that there are other authors. +string et_al = " et al"; +// Use et al only if it can replace at least this many authors. +int et_al_min_elide = 2; +// Use et al only if the total number of authors is at least this. +int et_al_min_total = 3; + + +int compatible_flag = 0; + +int short_label_flag = 0; + +static int recognize_R1_R2 = 1; + +search_list database_list; +int search_default = 1; +static int default_database_loaded = 0; + +static reference **citation = 0; +static int ncitations = 0; +static int citation_max = 0; + +static reference **reference_hash_table = 0; +static int hash_table_size; +static int nreferences = 0; + +static int need_syncing = 0; +string pending_line; +string pending_lf_lines; + +static void output_pending_line(); +static unsigned immediately_handle_reference(const string &); +static void immediately_output_references(); +static unsigned store_reference(const string &); +static void divert_to_temporary_file(); +static reference *make_reference(const string &, unsigned *); +static void usage(); +static void do_file(const char *); +static void split_punct(string &line, string &punct); +static void output_citation_group(reference **v, int n, label_type, FILE *fp); +static void possibly_load_default_database(); + +int main(int argc, char **argv) +{ + program_name = argv[0]; + static char stderr_buf[BUFSIZ]; + setbuf(stderr, stderr_buf); + outfp = stdout; + int finished_options = 0; + int bib_flag = 0; + int done_spec = 0; + + for (--argc, ++argv; + !finished_options && argc > 0 && argv[0][0] == '-' + && argv[0][1] != '\0'; + argv++, argc--) { + const char *opt = argv[0] + 1; + while (opt != 0 && *opt != '\0') { + switch (*opt) { + case 'C': + compatible_flag = 1; + opt++; + break; + case 'B': + bib_flag = 1; + label_in_reference = 0; + label_in_text = 0; + ++opt; + if (*opt == '\0') { + annotation_field = 'X'; + annotation_macro = "AP"; + } + else if (csalnum(opt[0]) && opt[1] == '.' && opt[2] != '\0') { + annotation_field = opt[0]; + annotation_macro = opt + 2; + } + opt = 0; + break; + case 'P': + move_punctuation = 1; + opt++; + break; + case 'R': + recognize_R1_R2 = 0; + opt++; + break; + case 'S': + // Not a very useful spec. + set_label_spec("(A.n|Q)', '(D.y|D)"); + done_spec = 1; + pre_label = " ("; + post_label = ")"; + sep_label = "; "; + opt++; + break; + case 'V': + verify_flag = 1; + opt++; + break; + case 'f': + { + const char *num = 0; + if (*++opt == '\0') { + if (argc > 1) { + num = *++argv; + --argc; + } + else { + error("option `f' requires an argument"); + usage(); + } + } + else { + num = opt; + opt = 0; + } + for (const char *ptr = num; *ptr; ptr++) + if (!csdigit(*ptr)) { + error("bad character `%1' in argument to -f option", *ptr); + break; + } + if (*ptr == '\0') { + string spec; + spec = '%'; + spec += num; + spec += '\0'; + set_label_spec(spec.contents()); + done_spec = 1; + } + break; + } + case 'b': + label_in_text = 0; + label_in_reference = 0; + opt++; + break; + case 'e': + accumulate = 1; + opt++; + break; + case 'c': + capitalize_fields = ++opt; + opt = 0; + break; + case 'k': + { + char buf[5]; + if (csalpha(*++opt)) + buf[0] = *opt++; + else { + if (*opt != '\0') + error("bad field name `%1'", *opt++); + buf[0] = 'L'; + } + buf[1] = '~'; + buf[2] = '%'; + buf[3] = 'a'; + buf[4] = '\0'; + set_label_spec(buf); + done_spec = 1; + } + break; + case 'a': + { + for (const char *ptr = ++opt; *ptr; ptr++) + if (!csdigit(*ptr)) { + error("argument to `a' option not a number"); + break; + } + if (*ptr == '\0') { + reverse_fields = 'A'; + reverse_fields += opt; + } + opt = 0; + } + break; + case 'i': + linear_ignore_fields = ++opt; + opt = 0; + break; + case 'l': + { + char buf[INT_DIGITS*2 + 11]; // A.n+2D.y-3%a + strcpy(buf, "A.n"); + if (*++opt != '\0' && *opt != ',') { + char *ptr; + long n = strtol(opt, &ptr, 10); + if (n == 0 && ptr == opt) { + error("bad integer `%1' in `l' option", opt); + opt = 0; + break; + } + if (n < 0) + n = 0; + opt = ptr; + sprintf(strchr(buf, '\0'), "+%d", n); + } + strcat(buf, "D.y"); + if (*opt == ',') + opt++; + if (*opt != '\0') { + char *ptr; + long n = strtol(opt, &ptr, 10); + if (n == 0 && ptr == opt) { + error("bad integer `%1' in `l' option", opt); + opt = 0; + break; + } + if (n < 0) + n = 0; + sprintf(strchr(buf, '\0'), "-%d", n); + opt = ptr; + if (*opt != '\0') + error("argument to `l' option not of form `m,n'"); + } + strcat(buf, "%a"); + if (!set_label_spec(buf)) + assert(0); + done_spec = 1; + } + break; + case 'n': + search_default = 0; + opt++; + break; + case 'p': + { + const char *filename = 0; + if (*++opt == '\0') { + if (argc > 1) { + filename = *++argv; + argc--; + } + else { + error("option `p' requires an argument"); + usage(); + } + } + else { + filename = opt; + opt = 0; + } + database_list.add_file(filename); + } + break; + case 's': + if (*++opt == '\0') + sort_fields = "AD"; + else { + sort_fields = opt; + opt = 0; + } + accumulate = 1; + break; + case 't': + { + char *ptr; + long n = strtol(opt, &ptr, 10); + if (n == 0 && ptr == opt) { + error("bad integer `%1' in `t' option", opt); + opt = 0; + break; + } + if (n < 1) + n = 1; + linear_truncate_len = int(n); + opt = ptr; + break; + } + case 'v': + { + extern const char *version_string; + fprintf(stderr, "GNU refer version %s\n", version_string); + fflush(stderr); + opt++; + break; + } + case '-': + if (opt[1] == '\0') { + finished_options = 1; + opt++; + break; + } + // fall through + default: + error("unrecognized option `%1'", *opt); + usage(); + break; + } + } + } + if (!done_spec) + set_label_spec("%1"); + if (argc <= 0) { + if (bib_flag) + do_bib("-"); + else + do_file("-"); + } + else { + for (int i = 0; i < argc; i++) { + if (bib_flag) + do_bib(argv[i]); + else + do_file(argv[i]); + } + } + if (accumulate) + output_references(); + if (fflush(stdout) < 0) + fatal("output error"); + exit(0); +} + +static void usage() +{ + fprintf(stderr, +"usage: %s [-benvCPRS] [-aN] [-cXYZ] [-fN] [-iXYZ] [-kX] [-lM,N] [-p file]\n" +" [-sXYZ] [-tN] [-BL.M] [files ...]\n", + program_name); + exit(1); +} + +static void possibly_load_default_database() +{ + if (search_default && !default_database_loaded) { + char *filename = getenv("REFER"); + if (filename) + database_list.add_file(filename); + else + database_list.add_file(DEFAULT_INDEX, 1); + default_database_loaded = 1; + } +} + +static int is_list(const string &str) +{ + const char *start = str.contents(); + const char *end = start + str.length(); + while (end > start && csspace(end[-1])) + end--; + while (start < end && csspace(*start)) + start++; + return end - start == 6 && memcmp(start, "$LIST$", 6) == 0; +} + +static void do_file(const char *filename) +{ + FILE *fp; + if (strcmp(filename, "-") == 0) { + fp = stdin; + } + else { + errno = 0; + fp = fopen(filename, "r"); + if (fp == 0) { + error("can't open `%1': %2", filename, strerror(errno)); + return; + } + current_filename = filename; + } + fprintf(outfp, ".lf 1 %s\n", filename); + string line; + current_lineno = 0; + for (;;) { + line.clear(); + for (;;) { + int c = getc(fp); + if (c == EOF) { + if (line.length() > 0) + line += '\n'; + break; + } + if (illegal_input_char(c)) + error("illegal input character code %1", c); + else { + line += c; + if (c == '\n') + break; + } + } + int len = line.length(); + if (len == 0) + break; + current_lineno++; + if (len >= 2 && line[0] == '.' && line[1] == '[') { + int start_lineno = current_lineno; + int start_of_line = 1; + string str; + string post; + string pre(line.contents() + 2, line.length() - 3); + for (;;) { + int c = getc(fp); + if (c == EOF) { + error_with_file_and_line(current_filename, start_lineno, + "missing `.]' line"); + break; + } + if (start_of_line) + current_lineno++; + if (start_of_line && c == '.') { + int d = getc(fp); + if (d == ']') { + while ((d = getc(fp)) != '\n' && d != EOF) { + if (illegal_input_char(d)) + error("illegal input character code %1", d); + else + post += d; + } + break; + } + if (d != EOF) + ungetc(d, fp); + } + if (illegal_input_char(c)) + error("illegal input character code %1", c); + else + str += c; + start_of_line = (c == '\n'); + } + if (is_list(str)) { + output_pending_line(); + if (accumulate) + output_references(); + else + error("found `$LIST$' but not accumulating references"); + } + else { + unsigned flags = (accumulate + ? store_reference(str) + : immediately_handle_reference(str)); + if (label_in_text) { + if (accumulate && outfp == stdout) + divert_to_temporary_file(); + if (pending_line.length() == 0) { + warning("can't attach citation to previous line"); + } + else + pending_line.set_length(pending_line.length() - 1); + string punct; + if (move_punctuation) + split_punct(pending_line, punct); + int have_text = pre.length() > 0 || post.length() > 0; + label_type lt = label_type(flags & ~(FORCE_LEFT_BRACKET + |FORCE_RIGHT_BRACKET)); + if ((flags & FORCE_LEFT_BRACKET) || !have_text) + pending_line += PRE_LABEL_MARKER; + pending_line += pre; + pending_line += LABEL_MARKER + lt; + pending_line += post; + if ((flags & FORCE_RIGHT_BRACKET) || !have_text) + pending_line += POST_LABEL_MARKER; + pending_line += punct; + pending_line += '\n'; + } + } + need_syncing = 1; + } + else if (len >= 4 + && line[0] == '.' && line[1] == 'l' && line[2] == 'f' + && (compatible_flag || line[3] == '\n' || line[3] == ' ')) { + pending_lf_lines += line; + line += '\0'; + if (interpret_lf_args(line.contents() + 3)) + current_lineno--; + } + else if (recognize_R1_R2 + && len >= 4 + && line[0] == '.' && line[1] == 'R' && line[2] == '1' + && (compatible_flag || line[3] == '\n' || line[3] == ' ')) { + line.clear(); + int start_of_line = 1; + int start_lineno = current_lineno; + for (;;) { + int c = getc(fp); + if (c != EOF && start_of_line) + current_lineno++; + if (start_of_line && c == '.') { + c = getc(fp); + if (c == 'R') { + c = getc(fp); + if (c == '2') { + c = getc(fp); + if (compatible_flag || c == ' ' || c == '\n' || c == EOF) { + while (c != EOF && c != '\n') + c = getc(fp); + break; + } + else { + line += '.'; + line += 'R'; + line += '2'; + } + } + else { + line += '.'; + line += 'R'; + } + } + else + line += '.'; + } + if (c == EOF) { + error_with_file_and_line(current_filename, start_lineno, + "missing `.R2' line"); + break; + } + if (illegal_input_char(c)) + error("illegal input character code %1", int(c)); + else { + line += c; + start_of_line = c == '\n'; + } + } + output_pending_line(); + if (accumulate) + output_references(); + else + nreferences = 0; + process_commands(line, current_filename, start_lineno + 1); + need_syncing = 1; + } + else { + output_pending_line(); + pending_line = line; + } + } + need_syncing = 0; + output_pending_line(); + if (fp != stdin) + fclose(fp); +} + +class label_processing_state { + enum { + NORMAL, + PENDING_LABEL, + PENDING_LABEL_POST, + PENDING_LABEL_POST_PRE, + PENDING_POST + } state; + label_type type; // type of pending labels + int count; // number of pending labels + reference **rptr; // pointer to next reference + int rcount; // number of references left + FILE *fp; + int handle_pending(int c); +public: + label_processing_state(reference **, int, FILE *); + ~label_processing_state(); + void process(int c); +}; + +static void output_pending_line() +{ + if (label_in_text && !accumulate && ncitations > 0) { + label_processing_state state(citation, ncitations, outfp); + int len = pending_line.length(); + for (int i = 0; i < len; i++) + state.process((unsigned char)(pending_line[i])); + } + else + put_string(pending_line, outfp); + pending_line.clear(); + if (pending_lf_lines.length() > 0) { + put_string(pending_lf_lines, outfp); + pending_lf_lines.clear(); + } + if (!accumulate) + immediately_output_references(); + if (need_syncing) { + fprintf(outfp, ".lf %d %s\n", current_lineno, current_filename); + need_syncing = 0; + } +} + +static void split_punct(string &line, string &punct) +{ + const char *start = line.contents(); + const char *end = start + line.length(); + const char *ptr = start; + const char *last_token_start = 0; + for (;;) { + if (ptr >= end) + break; + last_token_start = ptr; + if (*ptr == PRE_LABEL_MARKER || *ptr == POST_LABEL_MARKER + || (*ptr >= LABEL_MARKER && *ptr < LABEL_MARKER + N_LABEL_TYPES)) + ptr++; + else if (!get_token(&ptr, end)) + break; + } + if (last_token_start) { + const token_info *ti = lookup_token(last_token_start, end); + if (ti->is_punct()) { + punct.append(last_token_start, end - last_token_start); + line.set_length(last_token_start - start); + } + } +} + +static void divert_to_temporary_file() +{ + outfp = xtmpfile(); +} + +static void store_citation(reference *ref) +{ + if (ncitations >= citation_max) { + if (citation == 0) + citation = new reference*[citation_max = 100]; + else { + reference **old_citation = citation; + citation_max *= 2; + citation = new reference *[citation_max]; + memcpy(citation, old_citation, ncitations*sizeof(reference *)); + a_delete old_citation; + } + } + citation[ncitations++] = ref; +} + +static unsigned store_reference(const string &str) +{ + if (reference_hash_table == 0) { + reference_hash_table = new reference *[17]; + hash_table_size = 17; + for (int i = 0; i < hash_table_size; i++) + reference_hash_table[i] = 0; + } + unsigned flags; + reference *ref = make_reference(str, &flags); + ref->compute_hash_code(); + unsigned h = ref->hash(); + for (reference **ptr = reference_hash_table + (h % hash_table_size); + *ptr != 0; + ((ptr == reference_hash_table) + ? (ptr = reference_hash_table + hash_table_size - 1) + : --ptr)) + if (same_reference(**ptr, *ref)) + break; + if (*ptr != 0) { + if (ref->is_merged()) + warning("fields ignored because reference already used"); + delete ref; + ref = *ptr; + } + else { + *ptr = ref; + ref->set_number(nreferences); + nreferences++; + ref->pre_compute_label(); + ref->compute_sort_key(); + if (nreferences*2 >= hash_table_size) { + // Rehash it. + reference **old_table = reference_hash_table; + int old_size = hash_table_size; + hash_table_size = next_size(hash_table_size); + reference_hash_table = new reference*[hash_table_size]; + int i; + for (i = 0; i < hash_table_size; i++) + reference_hash_table[i] = 0; + for (i = 0; i < old_size; i++) + if (old_table[i]) { + for (reference **p = (reference_hash_table + + (old_table[i]->hash() % hash_table_size)); + *p; + ((p == reference_hash_table) + ? (p = reference_hash_table + hash_table_size - 1) + : --p)) + ; + *p = old_table[i]; + } + a_delete old_table; + } + } + if (label_in_text) + store_citation(ref); + return flags; +} + +unsigned immediately_handle_reference(const string &str) +{ + unsigned flags; + reference *ref = make_reference(str, &flags); + ref->set_number(nreferences); + if (label_in_text || label_in_reference) { + ref->pre_compute_label(); + ref->immediate_compute_label(); + } + nreferences++; + store_citation(ref); + return flags; +} + +static void immediately_output_references() +{ + for (int i = 0; i < ncitations; i++) { + reference *ref = citation[i]; + if (label_in_reference) { + fputs(".ds [F ", outfp); + const string &label = ref->get_label(NORMAL_LABEL); + if (label.length() > 0 + && (label[0] == ' ' || label[0] == '\\' || label[0] == '"')) + putc('"', outfp); + put_string(label, outfp); + putc('\n', outfp); + } + ref->output(outfp); + delete ref; + } + ncitations = 0; +} + +static void output_citation_group(reference **v, int n, label_type type, + FILE *fp) +{ + if (sort_adjacent_labels) { + // Do an insertion sort. Usually n will be very small. + for (int i = 1; i < n; i++) { + int num = v[i]->get_number(); + reference *temp = v[i]; + for (int j = i - 1; j >= 0 && v[j]->get_number() > num; j--) + v[j + 1] = v[j]; + v[j + 1] = temp; + } + } + // This messes up if !accumulate. + if (accumulate && n > 1) { + // remove duplicates + int j = 1; + for (int i = 1; i < n; i++) + if (v[i]->get_label(type) != v[i - 1]->get_label(type)) + v[j++] = v[i]; + n = j; + } + string merged_label; + for (int i = 0; i < n; i++) { + int nmerged = v[i]->merge_labels(v + i + 1, n - i - 1, type, merged_label); + if (nmerged > 0) { + put_string(merged_label, fp); + i += nmerged; + } + else + put_string(v[i]->get_label(type), fp); + if (i < n - 1) + put_string(sep_label, fp); + } +} + + +label_processing_state::label_processing_state(reference **p, int n, FILE *f) +: state(NORMAL), count(0), rptr(p), rcount(n), fp(f) +{ +} + +label_processing_state::~label_processing_state() +{ + int handled = handle_pending(EOF); + assert(!handled); + assert(rcount == 0); +} + +int label_processing_state::handle_pending(int c) +{ + switch (state) { + case NORMAL: + break; + case PENDING_LABEL: + if (c == POST_LABEL_MARKER) { + state = PENDING_LABEL_POST; + return 1; + } + else { + output_citation_group(rptr, count, type, fp); + rptr += count ; + rcount -= count; + state = NORMAL; + } + break; + case PENDING_LABEL_POST: + if (c == PRE_LABEL_MARKER) { + state = PENDING_LABEL_POST_PRE; + return 1; + } + else { + output_citation_group(rptr, count, type, fp); + rptr += count; + rcount -= count; + put_string(post_label, fp); + state = NORMAL; + } + break; + case PENDING_LABEL_POST_PRE: + if (c >= LABEL_MARKER + && c < LABEL_MARKER + N_LABEL_TYPES + && c - LABEL_MARKER == type) { + count += 1; + state = PENDING_LABEL; + return 1; + } + else { + output_citation_group(rptr, count, type, fp); + rptr += count; + rcount -= count; + put_string(sep_label, fp); + state = NORMAL; + } + break; + case PENDING_POST: + if (c == PRE_LABEL_MARKER) { + put_string(sep_label, fp); + state = NORMAL; + return 1; + } + else { + put_string(post_label, fp); + state = NORMAL; + } + break; + } + return 0; +} + +void label_processing_state::process(int c) +{ + if (handle_pending(c)) + return; + assert(state == NORMAL); + switch (c) { + case PRE_LABEL_MARKER: + put_string(pre_label, fp); + state = NORMAL; + break; + case POST_LABEL_MARKER: + state = PENDING_POST; + break; + case LABEL_MARKER: + case LABEL_MARKER + 1: + count = 1; + state = PENDING_LABEL; + type = label_type(c - LABEL_MARKER); + break; + default: + state = NORMAL; + putc(c, fp); + break; + } +} + +extern "C" { + +static int rcompare(const void *p1, const void *p2) +{ + return compare_reference(**(reference **)p1, **(reference **)p2); +} + +} + +void output_references() +{ + assert(accumulate); + if (nreferences > 0) { + int j = 0; + int i; + for (i = 0; i < hash_table_size; i++) + if (reference_hash_table[i] != 0) + reference_hash_table[j++] = reference_hash_table[i]; + assert(j == nreferences); + for (; j < hash_table_size; j++) + reference_hash_table[j] = 0; + qsort(reference_hash_table, nreferences, sizeof(reference*), rcompare); + for (i = 0; i < nreferences; i++) + reference_hash_table[i]->set_number(i); + compute_labels(reference_hash_table, nreferences); + } + if (outfp != stdout) { + rewind(outfp); + { + label_processing_state state(citation, ncitations, stdout); + int c; + while ((c = getc(outfp)) != EOF) + state.process(c); + } + ncitations = 0; + fclose(outfp); + outfp = stdout; + } + if (nreferences > 0) { + fputs(".]<\n", outfp); + for (int i = 0; i < nreferences; i++) { + if (sort_fields.length() > 0) + reference_hash_table[i]->print_sort_key_comment(outfp); + if (label_in_reference) { + fputs(".ds [F ", outfp); + const string &label = reference_hash_table[i]->get_label(NORMAL_LABEL); + if (label.length() > 0 + && (label[0] == ' ' || label[0] == '\\' || label[0] == '"')) + putc('"', outfp); + put_string(label, outfp); + putc('\n', outfp); + } + reference_hash_table[i]->output(outfp); + delete reference_hash_table[i]; + reference_hash_table[i] = 0; + } + fputs(".]>\n", outfp); + nreferences = 0; + } + clear_labels(); +} + +static reference *find_reference(const char *query, int query_len) +{ + // This is so that error messages look better. + while (query_len > 0 && csspace(query[query_len - 1])) + query_len--; + string str; + for (int i = 0; i < query_len; i++) + str += query[i] == '\n' ? ' ' : query[i]; + str += '\0'; + possibly_load_default_database(); + search_list_iterator iter(&database_list, str.contents()); + reference_id rid; + const char *start; + int len; + if (!iter.next(&start, &len, &rid)) { + error("no matches for `%1'", str.contents()); + return 0; + } + const char *end = start + len; + while (start < end) { + if (*start == '%') + break; + while (start < end && *start++ != '\n') + ; + } + if (start >= end) { + error("found a reference for `%1' but it didn't contain any fields", + str.contents()); + return 0; + } + reference *result = new reference(start, end - start, &rid); + if (iter.next(&start, &len, &rid)) + warning("multiple matches for `%1'", str.contents()); + return result; +} + +static reference *make_reference(const string &str, unsigned *flagsp) +{ + const char *start = str.contents(); + const char *end = start + str.length(); + const char *ptr = start; + while (ptr < end) { + if (*ptr == '%') + break; + while (ptr < end && *ptr++ != '\n') + ; + } + *flagsp = 0; + for (; start < ptr; start++) { + if (*start == '#') + *flagsp = (SHORT_LABEL | (*flagsp & (FORCE_RIGHT_BRACKET + | FORCE_LEFT_BRACKET))); + else if (*start == '[') + *flagsp |= FORCE_LEFT_BRACKET; + else if (*start == ']') + *flagsp |= FORCE_RIGHT_BRACKET; + else if (!csspace(*start)) + break; + } + if (start >= end) { + error("empty reference"); + return new reference; + } + reference *database_ref = 0; + if (start < ptr) + database_ref = find_reference(start, ptr - start); + reference *inline_ref = 0; + if (ptr < end) + inline_ref = new reference(ptr, end - ptr); + if (inline_ref) { + if (database_ref) { + database_ref->merge(*inline_ref); + delete inline_ref; + return database_ref; + } + else + return inline_ref; + } + else if (database_ref) + return database_ref; + else + return new reference; +} + +static void do_ref(const string &str) +{ + if (accumulate) + (void)store_reference(str); + else { + (void)immediately_handle_reference(str); + immediately_output_references(); + } +} + +static void trim_blanks(string &str) +{ + const char *start = str.contents(); + const char *end = start + str.length(); + while (end > start && end[-1] != '\n' && csspace(end[-1])) + --end; + str.set_length(end - start); +} + +void do_bib(const char *filename) +{ + FILE *fp; + if (strcmp(filename, "-") == 0) + fp = stdin; + else { + errno = 0; + fp = fopen(filename, "r"); + if (fp == 0) { + error("can't open `%1': %2", filename, strerror(errno)); + return; + } + current_filename = filename; + } + enum { + START, MIDDLE, BODY, BODY_START, BODY_BLANK, BODY_DOT + } state = START; + string body; + for (;;) { + int c = getc(fp); + if (c == EOF) + break; + if (illegal_input_char(c)) { + error("illegal input character code %1", c); + continue; + } + switch (state) { + case START: + if (c == '%') { + body = c; + state = BODY; + } + else if (c != '\n') + state = MIDDLE; + break; + case MIDDLE: + if (c == '\n') + state = START; + break; + case BODY: + body += c; + if (c == '\n') + state = BODY_START; + break; + case BODY_START: + if (c == '\n') { + do_ref(body); + state = START; + } + else if (c == '.') + state = BODY_DOT; + else if (csspace(c)) { + state = BODY_BLANK; + body += c; + } + else { + body += c; + state = BODY; + } + break; + case BODY_BLANK: + if (c == '\n') { + trim_blanks(body); + do_ref(body); + state = START; + } + else if (csspace(c)) + body += c; + else { + body += c; + state = BODY; + } + break; + case BODY_DOT: + if (c == ']') { + do_ref(body); + state = MIDDLE; + } + else { + body += '.'; + body += c; + state = c == '\n' ? BODY_START : BODY; + } + break; + default: + assert(0); + } + if (c == '\n') + current_lineno++; + } + switch (state) { + case START: + case MIDDLE: + break; + case BODY: + body += '\n'; + do_ref(body); + break; + case BODY_DOT: + case BODY_START: + do_ref(body); + break; + case BODY_BLANK: + trim_blanks(body); + do_ref(body); + break; + } + fclose(fp); +} + +// from the Dragon Book + +unsigned hash_string(const char *s, int len) +{ + const char *end = s + len; + unsigned h = 0, g; + while (s < end) { + h <<= 4; + h += *s++; + if ((g = h & 0xf0000000) != 0) { + h ^= g >> 24; + h ^= g; + } + } + return h; +} + +int next_size(int n) +{ + static const int table_sizes[] = { + 101, 503, 1009, 2003, 3001, 4001, 5003, 10007, 20011, 40009, + 80021, 160001, 500009, 1000003, 2000003, 4000037, 8000009, + 16000057, 32000011, 64000031, 128000003, 0 + }; + + for (const int *p = table_sizes; *p <= n && *p != 0; p++) + ; + assert(*p != 0); + return *p; +} + diff --git a/gnu/usr.bin/groff/refer/refer.h b/gnu/usr.bin/groff/refer/refer.h new file mode 100644 index 000000000000..932b85536e62 --- /dev/null +++ b/gnu/usr.bin/groff/refer/refer.h @@ -0,0 +1,78 @@ +// -*- C++ -*- +/* Copyright (C) 1989, 1990, 1991, 1992 Free Software Foundation, Inc. + Written by James Clark (jjc@jclark.com) + +This file is part of groff. + +groff is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 2, or (at your option) any later +version. + +groff is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License along +with groff; see the file COPYING. If not, write to the Free Software +Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. */ + +#include <stdio.h> +#include <stdlib.h> +#include <assert.h> +#include <string.h> +#include <errno.h> + +#include "errarg.h" +#include "error.h" +#include "lib.h" +#include "stringclass.h" +#include "cset.h" +#include "cmap.h" + +#include "defs.h" + +unsigned hash_string(const char *, int); +int next_size(int); + +extern string capitalize_fields; +extern string reverse_fields; +extern string abbreviate_fields; +extern string period_before_last_name; +extern string period_before_initial; +extern string period_before_hyphen; +extern string period_before_other; +extern string sort_fields; +extern int annotation_field; +extern string annotation_macro; +extern string discard_fields; +extern string articles; +extern int abbreviate_label_ranges; +extern string label_range_indicator; +extern int date_as_label; +extern string join_authors_exactly_two; +extern string join_authors_last_two; +extern string join_authors_default; +extern string separate_label_second_parts; +extern string et_al; +extern int et_al_min_elide; +extern int et_al_min_total; + +extern int compatible_flag; + +extern int set_label_spec(const char *); +extern int set_date_label_spec(const char *); +extern int set_short_label_spec(const char *); + +extern int short_label_flag; + +void clear_labels(); +void command_error(const char *, + const errarg &arg1 = empty_errarg, + const errarg &arg2 = empty_errarg, + const errarg &arg3 = empty_errarg); + +struct reference; + +void compute_labels(reference **, int); diff --git a/gnu/usr.bin/groff/refer/token.cc b/gnu/usr.bin/groff/refer/token.cc new file mode 100644 index 000000000000..8847081bd482 --- /dev/null +++ b/gnu/usr.bin/groff/refer/token.cc @@ -0,0 +1,370 @@ +// -*- C++ -*- +/* Copyright (C) 1989, 1990, 1991, 1992 Free Software Foundation, Inc. + Written by James Clark (jjc@jclark.com) + +This file is part of groff. + +groff is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 2, or (at your option) any later +version. + +groff is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License along +with groff; see the file COPYING. If not, write to the Free Software +Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. */ + +#include "refer.h" +#include "token.h" + +#define TOKEN_TABLE_SIZE 1009 +// I believe in Icelandic thorn sorts after z. +#define THORN_SORT_KEY "{" + +struct token_table_entry { + const char *tok; + token_info ti; + token_table_entry(); +}; + +token_table_entry token_table[TOKEN_TABLE_SIZE]; +int ntokens = 0; + +static void skip_name(const char **ptr, const char *end) +{ + if (*ptr < end) { + switch (*(*ptr)++) { + case '(': + if (*ptr < end) { + *ptr += 1; + if (*ptr < end) + *ptr += 1; + } + break; + case '[': + while (*ptr < end) + if (*(*ptr)++ == ']') + break; + break; + } + } +} + +int get_token(const char **ptr, const char *end) +{ + if (*ptr >= end) + return 0; + char c = *(*ptr)++; + if (c == '\\' && *ptr < end) { + switch (**ptr) { + default: + *ptr += 1; + break; + case '(': + case '[': + skip_name(ptr, end); + break; + case '*': + case 'f': + *ptr += 1; + skip_name(ptr, end); + break; + } + } + return 1; +} + +token_info::token_info() +: type(TOKEN_OTHER), sort_key(0), other_case(0) +{ +} + +void token_info::set(token_type t, const char *sk, const char *oc) +{ + assert(oc == 0 || t == TOKEN_UPPER || t == TOKEN_LOWER); + type = t; + sort_key = sk; + other_case = oc; +} + +void token_info::sortify(const char *start, const char *end, string &result) + const +{ + if (sort_key) + result += sort_key; + else if (type == TOKEN_UPPER || type == TOKEN_LOWER) { + for (; start < end; start++) + if (csalpha(*start)) + result += cmlower(*start); + } +} + +int token_info::sortify_non_empty(const char *start, const char *end) const +{ + if (sort_key) + return *sort_key != '\0'; + if (type != TOKEN_UPPER && type != TOKEN_LOWER) + return 0; + for (; start < end; start++) + if (csalpha(*start)) + return 1; + return 0; +} + + +void token_info::lower_case(const char *start, const char *end, + string &result) const +{ + if (type != TOKEN_UPPER) { + while (start < end) + result += *start++; + } + else if (other_case) + result += other_case; + else { + while (start < end) + result += cmlower(*start++); + } +} + +void token_info::upper_case(const char *start, const char *end, + string &result) const +{ + if (type != TOKEN_LOWER) { + while (start < end) + result += *start++; + } + else if (other_case) + result += other_case; + else { + while (start < end) + result += cmupper(*start++); + } +} + +token_table_entry::token_table_entry() +: tok(0) +{ +} + +static void store_token(const char *tok, token_type typ, + const char *sk = 0, const char *oc = 0) +{ + unsigned n = hash_string(tok, strlen(tok)) % TOKEN_TABLE_SIZE; + while (n >= 0) { + if (token_table[n].tok == 0) { + if (++ntokens == TOKEN_TABLE_SIZE) + assert(0); + token_table[n].tok = tok; + break; + } + if (strcmp(tok, token_table[n].tok) == 0) + break; + if (--n < 0) + n = TOKEN_TABLE_SIZE - 1; + } + token_table[n].ti.set(typ, sk, oc); +} + + +token_info default_token_info; + +const token_info *lookup_token(const char *start, const char *end) +{ + unsigned n = hash_string(start, end - start) % TOKEN_TABLE_SIZE; + while (n >= 0) { + if (token_table[n].tok == 0) + break; + if (strlen(token_table[n].tok) == end - start + && memcmp(token_table[n].tok, start, end - start) == 0) + return &(token_table[n].ti); + if (--n < 0) + n = TOKEN_TABLE_SIZE - 1; + } + return &default_token_info; +} + +static void init_ascii() +{ + for (const char *p = "abcdefghijklmnopqrstuvwxyz"; *p; p++) { + char buf[2]; + buf[0] = *p; + buf[1] = '\0'; + store_token(strsave(buf), TOKEN_LOWER); + buf[0] = cmupper(buf[0]); + store_token(strsave(buf), TOKEN_UPPER); + } + for (p = "0123456789"; *p; p++) { + char buf[2]; + buf[0] = *p; + buf[1] = '\0'; + const char *s = strsave(buf); + store_token(s, TOKEN_OTHER, s); + } + for (p = ".,:;?!"; *p; p++) { + char buf[2]; + buf[0] = *p; + buf[1] = '\0'; + store_token(strsave(buf), TOKEN_PUNCT); + } + store_token("-", TOKEN_HYPHEN); +} + +static void store_letter(const char *lower, const char *upper, + const char *sort_key = 0) +{ + store_token(lower, TOKEN_LOWER, sort_key, upper); + store_token(upper, TOKEN_UPPER, sort_key, lower); +} + +static void init_letter(unsigned char uc_code, unsigned char lc_code, + const char *sort_key) +{ + char lbuf[2]; + lbuf[0] = lc_code; + lbuf[1] = 0; + char ubuf[2]; + ubuf[0] = uc_code; + ubuf[1] = 0; + store_letter(strsave(lbuf), strsave(ubuf), sort_key); +} + +static void init_latin1() +{ + init_letter(0xc0, 0xe0, "a"); + init_letter(0xc1, 0xe1, "a"); + init_letter(0xc2, 0xe2, "a"); + init_letter(0xc3, 0xe3, "a"); + init_letter(0xc4, 0xe4, "a"); + init_letter(0xc5, 0xe5, "a"); + init_letter(0xc6, 0xe6, "ae"); + init_letter(0xc7, 0xe7, "c"); + init_letter(0xc8, 0xe8, "e"); + init_letter(0xc9, 0xe9, "e"); + init_letter(0xca, 0xea, "e"); + init_letter(0xcb, 0xeb, "e"); + init_letter(0xcc, 0xec, "i"); + init_letter(0xcd, 0xed, "i"); + init_letter(0xce, 0xee, "i"); + init_letter(0xcf, 0xef, "i"); + + init_letter(0xd0, 0xf0, "d"); + init_letter(0xd1, 0xf1, "n"); + init_letter(0xd2, 0xf2, "o"); + init_letter(0xd3, 0xf3, "o"); + init_letter(0xd4, 0xf4, "o"); + init_letter(0xd5, 0xf5, "o"); + init_letter(0xd6, 0xf6, "o"); + init_letter(0xd8, 0xf8, "o"); + init_letter(0xd9, 0xf9, "u"); + init_letter(0xda, 0xfa, "u"); + init_letter(0xdb, 0xfb, "u"); + init_letter(0xdc, 0xfc, "u"); + init_letter(0xdd, 0xfd, "y"); + init_letter(0xde, 0xfe, THORN_SORT_KEY); + + store_token("\337", TOKEN_LOWER, "ss", "SS"); + store_token("\377", TOKEN_LOWER, "y", "Y"); +} + +static void init_two_char_letter(char l1, char l2, char u1, char u2, + const char *sk = 0) +{ + char buf[6]; + buf[0] = '\\'; + buf[1] = '('; + buf[2] = l1; + buf[3] = l2; + buf[4] = '\0'; + const char *p = strsave(buf); + buf[2] = u1; + buf[3] = u2; + store_letter(p, strsave(buf), sk); + buf[1] = '['; + buf[4] = ']'; + buf[5] = '\0'; + p = strsave(buf); + buf[2] = l1; + buf[3] = l2; + store_letter(strsave(buf), p, sk); + +} + +static void init_special_chars() +{ + for (const char *p = "':^`~"; *p; p++) + for (const char *q = "aeiouy"; *q; q++) { + // Use a variable to work around bug in gcc 2.0 + char c = cmupper(*q); + init_two_char_letter(*p, *q, *p, c); + } + for (p = "/l/o~n,coeaeij"; *p; p += 2) { + // Use variables to work around bug in gcc 2.0 + char c0 = cmupper(p[0]); + char c1 = cmupper(p[1]); + init_two_char_letter(p[0], p[1], c0, c1); + } + init_two_char_letter('v', 's', 'v', 'S', "s"); + init_two_char_letter('v', 'z', 'v', 'Z', "z"); + init_two_char_letter('o', 'a', 'o', 'A', "a"); + init_two_char_letter('T', 'p', 'T', 'P', THORN_SORT_KEY); + init_two_char_letter('-', 'd', '-', 'D'); + + store_token("\\(ss", TOKEN_LOWER, 0, "SS"); + store_token("\\[ss]", TOKEN_LOWER, 0, "SS"); + + store_token("\\(Sd", TOKEN_LOWER, "d", "\\(-D"); + store_token("\\[Sd]", TOKEN_LOWER, "d", "\\[-D]"); + store_token("\\(hy", TOKEN_HYPHEN); + store_token("\\[hy]", TOKEN_HYPHEN); +} + +static void init_strings() +{ + char buf[6]; + buf[0] = '\\'; + buf[1] = '*'; + for (const char *p = "'`^^,:~v_o./;"; *p; p++) { + buf[2] = *p; + buf[3] = '\0'; + store_token(strsave(buf), TOKEN_ACCENT); + buf[2] = '['; + buf[3] = *p; + buf[4] = ']'; + buf[5] = '\0'; + store_token(strsave(buf), TOKEN_ACCENT); + } + + // -ms special letters + store_letter("\\*(th", "\\*(Th", THORN_SORT_KEY); + store_letter("\\*[th]", "\\*[Th]", THORN_SORT_KEY); + store_letter("\\*(d-", "\\*(D-"); + store_letter("\\*[d-]", "\\*[D-]"); + store_letter("\\*(ae", "\\*(Ae", "ae"); + store_letter("\\*[ae]", "\\*[Ae]", "ae"); + store_letter("\\*(oe", "\\*(Oe", "oe"); + store_letter("\\*[oe]", "\\*[Oe]", "oe"); + + store_token("\\*3", TOKEN_LOWER, "y", "Y"); + store_token("\\*8", TOKEN_LOWER, "ss", "SS"); + store_token("\\*q", TOKEN_LOWER, "o", "O"); +} + +struct token_initer { + token_initer(); +}; + +static token_initer the_token_initer; + +token_initer::token_initer() +{ + init_ascii(); + init_latin1(); + init_special_chars(); + init_strings(); + default_token_info.set(TOKEN_OTHER); +} diff --git a/gnu/usr.bin/groff/refer/token.h b/gnu/usr.bin/groff/refer/token.h new file mode 100644 index 000000000000..c6445d230ffc --- /dev/null +++ b/gnu/usr.bin/groff/refer/token.h @@ -0,0 +1,81 @@ +// -*- C++ -*- +/* Copyright (C) 1989, 1990, 1991, 1992 Free Software Foundation, Inc. + Written by James Clark (jjc@jclark.com) + +This file is part of groff. + +groff is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 2, or (at your option) any later +version. + +groff is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License along +with groff; see the file COPYING. If not, write to the Free Software +Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. */ + +enum token_type { + TOKEN_OTHER, + TOKEN_UPPER, + TOKEN_LOWER, + TOKEN_ACCENT, + TOKEN_PUNCT, + TOKEN_HYPHEN +}; + +class token_info { +private: + token_type type; + const char *sort_key; + const char *other_case; +public: + token_info(); + void set(token_type, const char *sk = 0, const char *oc = 0); + void lower_case(const char *start, const char *end, string &result) const; + void upper_case(const char *start, const char *end, string &result) const; + void sortify(const char *start, const char *end, string &result) const; + int sortify_non_empty(const char *start, const char *end) const; + int is_upper() const; + int is_lower() const; + int is_accent() const; + int is_other() const; + int is_punct() const; + int is_hyphen() const; +}; + +inline int token_info::is_upper() const +{ + return type == TOKEN_UPPER; +} + +inline int token_info::is_lower() const +{ + return type == TOKEN_LOWER; +} + +inline int token_info::is_accent() const +{ + return type == TOKEN_ACCENT; +} + +inline int token_info::is_other() const +{ + return type == TOKEN_OTHER; +} + +inline int token_info::is_punct() const +{ + return type == TOKEN_PUNCT; +} + +inline int token_info::is_hyphen() const +{ + return type == TOKEN_HYPHEN; +} + +int get_token(const char **ptr, const char *end); +const token_info *lookup_token(const char *start, const char *end); |