diff options
Diffstat (limited to 'contrib/file/ascmagic.c')
| -rw-r--r-- | contrib/file/ascmagic.c | 342 |
1 files changed, 160 insertions, 182 deletions
diff --git a/contrib/file/ascmagic.c b/contrib/file/ascmagic.c index 43d467cd3e229..2a76a63378ff1 100644 --- a/contrib/file/ascmagic.c +++ b/contrib/file/ascmagic.c @@ -1,34 +1,10 @@ /* - * Copyright (c) Ian F. Darwin 1986-1995. - * Software written by Ian F. Darwin and others; - * maintained 1995-present by Christos Zoulas and others. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice immediately at the beginning of the file, without modification, - * this list of conditions, and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR - * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ -/* * ASCII magic -- file types that we know based on keywords * that can appear anywhere in the file. * + * Copyright (c) Ian F. Darwin, 1987. + * Written by Ian F. Darwin. + * * Extensively modified by Eric Fischer <enf@pobox.com> in July, 2000, * to handle character codes other than ASCII on a unified basis. * @@ -36,8 +12,29 @@ * international characters, now subsumed into this file. */ +/* + * This software is not subject to any license of the American Telephone + * and Telegraph Company or of the Regents of the University of California. + * + * Permission is granted to anyone to use this software for any purpose on + * any computer system, and to alter it and redistribute it freely, subject + * to the following restrictions: + * + * 1. The author is not responsible for the consequences of use of this + * software, no matter how awful, even if they arise from flaws in it. + * + * 2. The origin of this software must not be misrepresented, either by + * explicit claim or by omission. Since few users ever read sources, + * credits must appear in the documentation. + * + * 3. Altered versions must be plainly marked as such, and must not be + * misrepresented as being the original software. Since few users + * ever read sources, credits must appear in the documentation. + * + * 4. This notice may not be removed or altered. + */ + #include "file.h" -#include "magic.h" #include <stdio.h> #include <string.h> #include <memory.h> @@ -49,7 +46,7 @@ #include "names.h" #ifndef lint -FILE_RCSID("@(#)$Id: ascmagic.c,v 1.45 2006/03/12 22:09:33 christos Exp $") +FILE_RCSID("@(#)$Id: ascmagic.c,v 1.28 2000/08/05 17:36:47 christos Exp $") #endif /* lint */ typedef unsigned long unichar; @@ -58,34 +55,35 @@ typedef unsigned long unichar; #define ISSPC(x) ((x) == ' ' || (x) == '\t' || (x) == '\r' || (x) == '\n' \ || (x) == 0x85 || (x) == '\f') -private int looks_ascii(const unsigned char *, size_t, unichar *, size_t *); -private int looks_utf8(const unsigned char *, size_t, unichar *, size_t *); -private int looks_unicode(const unsigned char *, size_t, unichar *, size_t *); -private int looks_latin1(const unsigned char *, size_t, unichar *, size_t *); -private int looks_extended(const unsigned char *, size_t, unichar *, size_t *); -private void from_ebcdic(const unsigned char *, size_t, unsigned char *); -private int ascmatch(const unsigned char *, const unichar *, size_t); - - -protected int -file_ascmagic(struct magic_set *ms, const unsigned char *buf, size_t nbytes) +static int looks_ascii __P((const unsigned char *, int, unichar *, int *)); +static int looks_utf8 __P((const unsigned char *, int, unichar *, int *)); +static int looks_unicode __P((const unsigned char *, int, unichar *, int *)); +static int looks_latin1 __P((const unsigned char *, int, unichar *, int *)); +static int looks_extended __P((const unsigned char *, int, unichar *, int *)); +static void from_ebcdic __P((const unsigned char *, int, unsigned char *)); +static int ascmatch __P((const unsigned char *, const unichar *, int)); + +int +ascmagic(buf, nbytes) + unsigned char *buf; + int nbytes; /* size actually read */ { - size_t i; - unsigned char *nbuf = NULL; - unichar *ubuf = NULL; - size_t ulen; + int i; + unsigned char *s; + char nbuf[HOWMANY+1]; /* one extra for terminating '\0' */ + unichar ubuf[HOWMANY+1]; /* one extra for terminating '\0' */ + int ulen; + char *token; struct names *p; - int rv = -1; - const char *code = NULL; - const char *code_mime = NULL; - const char *type = NULL; - const char *subtype = NULL; - const char *subtype_mime = NULL; + char *code = NULL; + char *code_mime = NULL; + char *type = NULL; + char *subtype = NULL; + char *subtype_mime = NULL; int has_escapes = 0; int has_backspace = 0; - int seen_cr = 0; int n_crlf = 0; int n_lf = 0; @@ -96,16 +94,23 @@ file_ascmagic(struct magic_set *ms, const unsigned char *buf, size_t nbytes) int has_long_lines = 0; /* - * Undo the NUL-termination kindly provided by process() - * but leave at least one byte to look at + * Do the tar test first, because if the first file in the tar + * archive starts with a dot, we can confuse it with an nroff file. */ - while (nbytes > 1 && buf[nbytes - 1] == '\0') - nbytes--; + switch (is_tar(buf, nbytes)) { + case 1: + ckfputs(iflag ? "application/x-tar" : "tar archive", stdout); + return 1; + case 2: + ckfputs(iflag ? "application/x-tar, POSIX" + : "POSIX tar archive", stdout); + return 1; + } + + /* Undo the NUL-termination kindly provided by process() */ - if ((nbuf = malloc((nbytes + 1) * sizeof(nbuf[0]))) == NULL) - goto done; - if ((ubuf = malloc((nbytes + 1) * sizeof(ubuf[0]))) == NULL) - goto done; + while (nbytes > 0 && buf[nbytes - 1] == '\0') + nbytes--; /* * Then try to determine whether it's any character code we can @@ -121,7 +126,7 @@ file_ascmagic(struct magic_set *ms, const unsigned char *buf, size_t nbytes) code = "UTF-8 Unicode"; code_mime = "utf-8"; type = "text"; - } else if ((i = looks_unicode(buf, nbytes, ubuf, &ulen)) != 0) { + } else if ((i = looks_unicode(buf, nbytes, ubuf, &ulen))) { if (i == 1) code = "Little-endian UTF-16 Unicode"; else @@ -149,16 +154,10 @@ file_ascmagic(struct magic_set *ms, const unsigned char *buf, size_t nbytes) type = "character data"; code_mime = "ebcdic"; } else { - rv = 0; - goto done; /* doesn't look like text at all */ + return 0; /* doesn't look like text at all */ } } - if (nbytes <= 1) { - rv = 0; - goto done; - } - /* * for troff, look for . + letter + letter or .\"; * this must be done to disambiguate tar archives' ./file @@ -173,10 +172,8 @@ file_ascmagic(struct magic_set *ms, const unsigned char *buf, size_t nbytes) while (ISSPC(*tp)) ++tp; /* skip leading whitespace */ if ((tp[0] == '\\' && tp[1] == '\"') || - (isascii((unsigned char)tp[0]) && - isalnum((unsigned char)tp[0]) && - isascii((unsigned char)tp[1]) && - isalnum((unsigned char)tp[1]) && + (isascii(tp[0]) && isalnum(tp[0]) && + isascii(tp[1]) && isalnum(tp[1]) && ISSPC(tp[2]))) { subtype_mime = "text/troff"; subtype = "troff or preprocessor input"; @@ -194,7 +191,7 @@ file_ascmagic(struct magic_set *ms, const unsigned char *buf, size_t nbytes) i = 0; while (i < ulen) { - size_t end; + int end; /* * skip past any leading space @@ -215,8 +212,7 @@ file_ascmagic(struct magic_set *ms, const unsigned char *buf, size_t nbytes) * compare the word thus isolated against the token list */ for (p = names; p < names + NNAMES; p++) { - if (ascmatch((const unsigned char *)p->name, ubuf + i, - end - i)) { + if (ascmatch(p->name, ubuf + i, end - i)) { subtype = types[p->type].human; subtype_mime = types[p->type].mime; goto subtype_identified; @@ -232,25 +228,6 @@ subtype_identified: * Now try to discover other details about the file. */ for (i = 0; i < ulen; i++) { - if (ubuf[i] == '\n') { - if (seen_cr) - n_crlf++; - else - n_lf++; - last_line_end = i; - } else if (seen_cr) - n_cr++; - - seen_cr = (ubuf[i] == '\r'); - if (seen_cr) - last_line_end = i; - - if (ubuf[i] == 0x85) { /* X3.64/ECMA-43 "next line" character */ - n_nel++; - last_line_end = i; - } - - /* If this line is _longer_ than MAXLINELEN, remember it. */ if (i > last_line_end + MAXLINELEN) has_long_lines = 1; @@ -258,49 +235,48 @@ subtype_identified: has_escapes = 1; if (ubuf[i] == '\b') has_backspace = 1; - } - /* Beware, if the data has been truncated, the final CR could have - been followed by a LF. If we have HOWMANY bytes, it indicates - that the data might have been truncated, probably even before - this function was called. */ - if (seen_cr && nbytes < HOWMANY) - n_cr++; - - if ((ms->flags & MAGIC_MIME)) { - if (subtype_mime) { - if (file_printf(ms, subtype_mime) == -1) - goto done; - } else { - if (file_printf(ms, "text/plain") == -1) - goto done; + if (ubuf[i] == '\r' && (i + 1 < ulen && ubuf[i + 1] == '\n')) { + n_crlf++; + last_line_end = i; + } + if (ubuf[i] == '\r' && (i + 1 >= ulen || ubuf[i + 1] != '\n')) { + n_cr++; + last_line_end = i; + } + if (ubuf[i] == '\n' && (i - 1 < 0 || ubuf[i - 1] != '\r')) { + n_lf++; + last_line_end = i; } + if (ubuf[i] == 0x85) { /* X3.64/ECMA-43 "next line" character */ + n_nel++; + last_line_end = i; + } + } + + if (iflag) { + if (subtype_mime) + ckfputs(subtype_mime, stdout); + else + ckfputs("text/plain", stdout); if (code_mime) { - if (file_printf(ms, "; charset=") == -1) - goto done; - if (file_printf(ms, code_mime) == -1) - goto done; + ckfputs("; charset=", stdout); + ckfputs(code_mime, stdout); } } else { - if (file_printf(ms, code) == -1) - goto done; + ckfputs(code, stdout); if (subtype) { - if (file_printf(ms, " ") == -1) - goto done; - if (file_printf(ms, subtype) == -1) - goto done; + ckfputs(" ", stdout); + ckfputs(subtype, stdout); } - if (file_printf(ms, " ") == -1) - goto done; - if (file_printf(ms, type) == -1) - goto done; + ckfputs(" ", stdout); + ckfputs(type, stdout); if (has_long_lines) - if (file_printf(ms, ", with very long lines") == -1) - goto done; + ckfputs(", with very long lines", stdout); /* * Only report line terminators if we find one other than LF, @@ -308,62 +284,47 @@ subtype_identified: */ if ((n_crlf == 0 && n_cr == 0 && n_nel == 0 && n_lf == 0) || (n_crlf != 0 || n_cr != 0 || n_nel != 0)) { - if (file_printf(ms, ", with") == -1) - goto done; + ckfputs(", with", stdout); - if (n_crlf == 0 && n_cr == 0 && n_nel == 0 && n_lf == 0) { - if (file_printf(ms, " no") == -1) - goto done; - } else { + if (n_crlf == 0 && n_cr == 0 && n_nel == 0 && n_lf == 0) + ckfputs(" no", stdout); + else { if (n_crlf) { - if (file_printf(ms, " CRLF") == -1) - goto done; + ckfputs(" CRLF", stdout); if (n_cr || n_lf || n_nel) - if (file_printf(ms, ",") == -1) - goto done; + ckfputs(",", stdout); } if (n_cr) { - if (file_printf(ms, " CR") == -1) - goto done; + ckfputs(" CR", stdout); if (n_lf || n_nel) - if (file_printf(ms, ",") == -1) - goto done; + ckfputs(",", stdout); } if (n_lf) { - if (file_printf(ms, " LF") == -1) - goto done; + ckfputs(" LF", stdout); if (n_nel) - if (file_printf(ms, ",") == -1) - goto done; + ckfputs(",", stdout); } if (n_nel) - if (file_printf(ms, " NEL") == -1) - goto done; + ckfputs(" NEL", stdout); } - if (file_printf(ms, " line terminators") == -1) - goto done; + ckfputs(" line terminators", stdout); } if (has_escapes) - if (file_printf(ms, ", with escape sequences") == -1) - goto done; + ckfputs(", with escape sequences", stdout); if (has_backspace) - if (file_printf(ms, ", with overstriking") == -1) - goto done; + ckfputs(", with overstriking", stdout); } - rv = 1; -done: - if (nbuf) - free(nbuf); - if (ubuf) - free(ubuf); - return rv; + return 1; } -private int -ascmatch(const unsigned char *s, const unichar *us, size_t ulen) +static int +ascmatch(s, us, ulen) + const unsigned char *s; + const unichar *us; + int ulen; { size_t i; @@ -435,7 +396,7 @@ ascmatch(const unsigned char *s, const unichar *us, size_t ulen) #define I 2 /* character appears in ISO-8859 text */ #define X 3 /* character appears in non-ISO extended ASCII (Mac, IBM PC) */ -private char text_chars[256] = { +static char text_chars[256] = { /* BEL BS HT LF FF CR */ F, F, F, F, F, F, F, T, T, T, T, F, T, T, F, F, /* 0x0X */ /* ESC */ @@ -457,9 +418,12 @@ private char text_chars[256] = { I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I /* 0xfX */ }; -private int -looks_ascii(const unsigned char *buf, size_t nbytes, unichar *ubuf, - size_t *ulen) +static int +looks_ascii(buf, nbytes, ubuf, ulen) + const unsigned char *buf; + int nbytes; + unichar *ubuf; + int *ulen; { int i; @@ -477,8 +441,12 @@ looks_ascii(const unsigned char *buf, size_t nbytes, unichar *ubuf, return 1; } -private int -looks_latin1(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen) +static int +looks_latin1(buf, nbytes, ubuf, ulen) + const unsigned char *buf; + int nbytes; + unichar *ubuf; + int *ulen; { int i; @@ -496,9 +464,12 @@ looks_latin1(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ule return 1; } -private int -looks_extended(const unsigned char *buf, size_t nbytes, unichar *ubuf, - size_t *ulen) +static int +looks_extended(buf, nbytes, ubuf, ulen) + const unsigned char *buf; + int nbytes; + unichar *ubuf; + int *ulen; { int i; @@ -516,8 +487,12 @@ looks_extended(const unsigned char *buf, size_t nbytes, unichar *ubuf, return 1; } -private int -looks_utf8(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen) +int +looks_utf8(buf, nbytes, ubuf, ulen) + const unsigned char *buf; + int nbytes; + unichar *ubuf; + int *ulen; { int i, n; unichar c; @@ -578,9 +553,12 @@ done: return gotone; /* don't claim it's UTF-8 if it's all 7-bit */ } -private int -looks_unicode(const unsigned char *buf, size_t nbytes, unichar *ubuf, - size_t *ulen) +static int +looks_unicode(buf, nbytes, ubuf, ulen) + const unsigned char *buf; + int nbytes; + unichar *ubuf; + int *ulen; { int bigend; int i; @@ -607,12 +585,11 @@ looks_unicode(const unsigned char *buf, size_t nbytes, unichar *ubuf, if (ubuf[*ulen - 1] == 0xfffe) return 0; - if (ubuf[*ulen - 1] < 128 && - text_chars[(size_t)ubuf[*ulen - 1]] != T) + if (ubuf[*ulen - 1] < 128 && text_chars[ubuf[*ulen - 1]] != T) return 0; } - return 1 + bigend; + return 1; } #undef F @@ -642,7 +619,7 @@ looks_unicode(const unsigned char *buf, size_t nbytes, unichar *ubuf, * between old-style and internationalized examples of text. */ -private unsigned char ebcdic_to_ascii[] = { +unsigned char ebcdic_to_ascii[] = { 0, 1, 2, 3, 156, 9, 134, 127, 151, 141, 142, 11, 12, 13, 14, 15, 16, 17, 18, 19, 157, 133, 8, 135, 24, 25, 146, 143, 28, 29, 30, 31, 128, 129, 130, 131, 132, 10, 23, 27, 136, 137, 138, 139, 140, 5, 6, 7, @@ -661,7 +638,6 @@ private unsigned char ebcdic_to_ascii[] = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 250, 251, 252, 253, 254, 255 }; -#ifdef notdef /* * The following EBCDIC-to-ASCII table may relate more closely to reality, * or at least to modern reality. It comes from @@ -676,7 +652,7 @@ private unsigned char ebcdic_to_ascii[] = { * cases for the NEL character can be taken out of the code. */ -private unsigned char ebcdic_1047_to_8859[] = { +unsigned char ebcdic_1047_to_8859[] = { 0x00,0x01,0x02,0x03,0x9C,0x09,0x86,0x7F,0x97,0x8D,0x8E,0x0B,0x0C,0x0D,0x0E,0x0F, 0x10,0x11,0x12,0x13,0x9D,0x0A,0x08,0x87,0x18,0x19,0x92,0x8F,0x1C,0x1D,0x1E,0x1F, 0x80,0x81,0x82,0x83,0x84,0x85,0x17,0x1B,0x88,0x89,0x8A,0x8B,0x8C,0x05,0x06,0x07, @@ -694,13 +670,15 @@ private unsigned char ebcdic_1047_to_8859[] = { 0x5C,0xF7,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0xB2,0xD4,0xD6,0xD2,0xD3,0xD5, 0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0xB3,0xDB,0xDC,0xD9,0xDA,0x9F }; -#endif /* * Copy buf[0 ... nbytes-1] into out[], translating EBCDIC to ASCII. */ -private void -from_ebcdic(const unsigned char *buf, size_t nbytes, unsigned char *out) +static void +from_ebcdic(buf, nbytes, out) + const unsigned char *buf; + int nbytes; + unsigned char *out; { int i; |
