diff options
Diffstat (limited to 'src/encoding.c')
| -rw-r--r-- | src/encoding.c | 79 |
1 files changed, 67 insertions, 12 deletions
diff --git a/src/encoding.c b/src/encoding.c index 3e7b9e584b08..76244f87f95b 100644 --- a/src/encoding.c +++ b/src/encoding.c @@ -35,7 +35,7 @@ #include "file.h" #ifndef lint -FILE_RCSID("@(#)$File: encoding.c,v 1.14 2017/11/02 20:25:39 christos Exp $") +FILE_RCSID("@(#)$File: encoding.c,v 1.20 2019/04/15 16:48:41 christos Exp $") #endif /* lint */ #include "magic.h" @@ -49,6 +49,7 @@ private int looks_utf8_with_BOM(const unsigned char *, size_t, unichar *, size_t *); private int looks_utf7(const unsigned char *, size_t, unichar *, size_t *); private int looks_ucs16(const unsigned char *, size_t, unichar *, size_t *); +private int looks_ucs32(const unsigned char *, size_t, unichar *, size_t *); private int looks_latin1(const unsigned char *, size_t, unichar *, size_t *); private int looks_extended(const unsigned char *, size_t, unichar *, size_t *); private void from_ebcdic(const unsigned char *, size_t, unsigned char *); @@ -69,7 +70,7 @@ protected int file_encoding(struct magic_set *ms, const struct buffer *b, unichar **ubuf, size_t *ulen, const char **code, const char **code_mime, const char **type) { - const unsigned char *buf = b->fbuf; + const unsigned char *buf = CAST(const unsigned char *, b->fbuf); size_t nbytes = b->flen; size_t mlen; int rv = 1, ucs_type; @@ -88,12 +89,13 @@ file_encoding(struct magic_set *ms, const struct buffer *b, unichar **ubuf, *code_mime = "binary"; mlen = (nbytes + 1) * sizeof((*ubuf)[0]); - if ((*ubuf = CAST(unichar *, calloc((size_t)1, mlen))) == NULL) { + if ((*ubuf = CAST(unichar *, calloc(CAST(size_t, 1), mlen))) == NULL) { file_oomem(ms, mlen); goto done; } mlen = (nbytes + 1) * sizeof(nbuf[0]); - if ((nbuf = CAST(unsigned char *, calloc((size_t)1, mlen))) == NULL) { + if ((nbuf = CAST(unsigned char *, + calloc(CAST(size_t, 1), mlen))) == NULL) { file_oomem(ms, mlen); goto done; } @@ -116,6 +118,15 @@ file_encoding(struct magic_set *ms, const struct buffer *b, unichar **ubuf, DPRINTF(("utf8 %" SIZE_T_FORMAT "u\n", *ulen)); *code = "UTF-8 Unicode"; *code_mime = "utf-8"; + } else if ((ucs_type = looks_ucs32(buf, nbytes, *ubuf, ulen)) != 0) { + if (ucs_type == 1) { + *code = "Little-endian UTF-32 Unicode"; + *code_mime = "utf-32le"; + } else { + *code = "Big-endian UTF-32 Unicode"; + *code_mime = "utf-32be"; + } + DPRINTF(("ucs32 %" SIZE_T_FORMAT "u\n", *ulen)); } else if ((ucs_type = looks_ucs16(buf, nbytes, *ubuf, ulen)) != 0) { if (ucs_type == 1) { *code = "Little-endian UTF-16 Unicode"; @@ -410,7 +421,7 @@ looks_utf7(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen) } private int -looks_ucs16(const unsigned char *buf, size_t nbytes, unichar *ubuf, +looks_ucs16(const unsigned char *bf, size_t nbytes, unichar *ubf, size_t *ulen) { int bigend; @@ -419,9 +430,9 @@ looks_ucs16(const unsigned char *buf, size_t nbytes, unichar *ubuf, if (nbytes < 2) return 0; - if (buf[0] == 0xff && buf[1] == 0xfe) + if (bf[0] == 0xff && bf[1] == 0xfe) bigend = 0; - else if (buf[0] == 0xfe && buf[1] == 0xff) + else if (bf[0] == 0xfe && bf[1] == 0xff) bigend = 1; else return 0; @@ -432,20 +443,64 @@ looks_ucs16(const unsigned char *buf, size_t nbytes, unichar *ubuf, /* XXX fix to properly handle chars > 65536 */ if (bigend) - ubuf[(*ulen)++] = buf[i + 1] + 256 * buf[i]; + ubf[(*ulen)++] = bf[i + 1] + | (CAST(unichar, bf[i]) << 8); else - ubuf[(*ulen)++] = buf[i] + 256 * buf[i + 1]; + ubf[(*ulen)++] = bf[i] + | (CAST(unichar, bf[i + 1]) << 8); - if (ubuf[*ulen - 1] == 0xfffe) + if (ubf[*ulen - 1] == 0xfffe) return 0; - if (ubuf[*ulen - 1] < 128 && - text_chars[(size_t)ubuf[*ulen - 1]] != T) + if (ubf[*ulen - 1] < 128 && + text_chars[CAST(size_t, ubf[*ulen - 1])] != T) return 0; } return 1 + bigend; } +private int +looks_ucs32(const unsigned char *bf, size_t nbytes, unichar *ubf, + size_t *ulen) +{ + int bigend; + size_t i; + + if (nbytes < 4) + return 0; + + if (bf[0] == 0xff && bf[1] == 0xfe && bf[2] == 0 && bf[3] == 0) + bigend = 0; + else if (bf[0] == 0 && bf[1] == 0 && bf[2] == 0xfe && bf[3] == 0xff) + bigend = 1; + else + return 0; + + *ulen = 0; + + for (i = 4; i + 3 < nbytes; i += 4) { + /* XXX fix to properly handle chars > 65536 */ + + if (bigend) + ubf[(*ulen)++] = CAST(unichar, bf[i + 3]) + | (CAST(unichar, bf[i + 2]) << 8) + | (CAST(unichar, bf[i + 1]) << 16) + | (CAST(unichar, bf[i]) << 24); + else + ubf[(*ulen)++] = CAST(unichar, bf[i + 0]) + | (CAST(unichar, bf[i + 1]) << 8) + | (CAST(unichar, bf[i + 2]) << 16) + | (CAST(unichar, bf[i + 3]) << 24); + + if (ubf[*ulen - 1] == 0xfffe) + return 0; + if (ubf[*ulen - 1] < 128 && + text_chars[CAST(size_t, ubf[*ulen - 1])] != T) + return 0; + } + + return 1 + bigend; +} #undef F #undef T #undef I |
