summaryrefslogtreecommitdiff
path: root/src/encoding.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/encoding.c')
-rw-r--r--src/encoding.c79
1 files changed, 67 insertions, 12 deletions
diff --git a/src/encoding.c b/src/encoding.c
index 3e7b9e584b08..76244f87f95b 100644
--- a/src/encoding.c
+++ b/src/encoding.c
@@ -35,7 +35,7 @@
#include "file.h"
#ifndef lint
-FILE_RCSID("@(#)$File: encoding.c,v 1.14 2017/11/02 20:25:39 christos Exp $")
+FILE_RCSID("@(#)$File: encoding.c,v 1.20 2019/04/15 16:48:41 christos Exp $")
#endif /* lint */
#include "magic.h"
@@ -49,6 +49,7 @@ private int looks_utf8_with_BOM(const unsigned char *, size_t, unichar *,
size_t *);
private int looks_utf7(const unsigned char *, size_t, unichar *, size_t *);
private int looks_ucs16(const unsigned char *, size_t, unichar *, size_t *);
+private int looks_ucs32(const unsigned char *, size_t, unichar *, size_t *);
private int looks_latin1(const unsigned char *, size_t, unichar *, size_t *);
private int looks_extended(const unsigned char *, size_t, unichar *, size_t *);
private void from_ebcdic(const unsigned char *, size_t, unsigned char *);
@@ -69,7 +70,7 @@ protected int
file_encoding(struct magic_set *ms, const struct buffer *b, unichar **ubuf,
size_t *ulen, const char **code, const char **code_mime, const char **type)
{
- const unsigned char *buf = b->fbuf;
+ const unsigned char *buf = CAST(const unsigned char *, b->fbuf);
size_t nbytes = b->flen;
size_t mlen;
int rv = 1, ucs_type;
@@ -88,12 +89,13 @@ file_encoding(struct magic_set *ms, const struct buffer *b, unichar **ubuf,
*code_mime = "binary";
mlen = (nbytes + 1) * sizeof((*ubuf)[0]);
- if ((*ubuf = CAST(unichar *, calloc((size_t)1, mlen))) == NULL) {
+ if ((*ubuf = CAST(unichar *, calloc(CAST(size_t, 1), mlen))) == NULL) {
file_oomem(ms, mlen);
goto done;
}
mlen = (nbytes + 1) * sizeof(nbuf[0]);
- if ((nbuf = CAST(unsigned char *, calloc((size_t)1, mlen))) == NULL) {
+ if ((nbuf = CAST(unsigned char *,
+ calloc(CAST(size_t, 1), mlen))) == NULL) {
file_oomem(ms, mlen);
goto done;
}
@@ -116,6 +118,15 @@ file_encoding(struct magic_set *ms, const struct buffer *b, unichar **ubuf,
DPRINTF(("utf8 %" SIZE_T_FORMAT "u\n", *ulen));
*code = "UTF-8 Unicode";
*code_mime = "utf-8";
+ } else if ((ucs_type = looks_ucs32(buf, nbytes, *ubuf, ulen)) != 0) {
+ if (ucs_type == 1) {
+ *code = "Little-endian UTF-32 Unicode";
+ *code_mime = "utf-32le";
+ } else {
+ *code = "Big-endian UTF-32 Unicode";
+ *code_mime = "utf-32be";
+ }
+ DPRINTF(("ucs32 %" SIZE_T_FORMAT "u\n", *ulen));
} else if ((ucs_type = looks_ucs16(buf, nbytes, *ubuf, ulen)) != 0) {
if (ucs_type == 1) {
*code = "Little-endian UTF-16 Unicode";
@@ -410,7 +421,7 @@ looks_utf7(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen)
}
private int
-looks_ucs16(const unsigned char *buf, size_t nbytes, unichar *ubuf,
+looks_ucs16(const unsigned char *bf, size_t nbytes, unichar *ubf,
size_t *ulen)
{
int bigend;
@@ -419,9 +430,9 @@ looks_ucs16(const unsigned char *buf, size_t nbytes, unichar *ubuf,
if (nbytes < 2)
return 0;
- if (buf[0] == 0xff && buf[1] == 0xfe)
+ if (bf[0] == 0xff && bf[1] == 0xfe)
bigend = 0;
- else if (buf[0] == 0xfe && buf[1] == 0xff)
+ else if (bf[0] == 0xfe && bf[1] == 0xff)
bigend = 1;
else
return 0;
@@ -432,20 +443,64 @@ looks_ucs16(const unsigned char *buf, size_t nbytes, unichar *ubuf,
/* XXX fix to properly handle chars > 65536 */
if (bigend)
- ubuf[(*ulen)++] = buf[i + 1] + 256 * buf[i];
+ ubf[(*ulen)++] = bf[i + 1]
+ | (CAST(unichar, bf[i]) << 8);
else
- ubuf[(*ulen)++] = buf[i] + 256 * buf[i + 1];
+ ubf[(*ulen)++] = bf[i]
+ | (CAST(unichar, bf[i + 1]) << 8);
- if (ubuf[*ulen - 1] == 0xfffe)
+ if (ubf[*ulen - 1] == 0xfffe)
return 0;
- if (ubuf[*ulen - 1] < 128 &&
- text_chars[(size_t)ubuf[*ulen - 1]] != T)
+ if (ubf[*ulen - 1] < 128 &&
+ text_chars[CAST(size_t, ubf[*ulen - 1])] != T)
return 0;
}
return 1 + bigend;
}
+private int
+looks_ucs32(const unsigned char *bf, size_t nbytes, unichar *ubf,
+ size_t *ulen)
+{
+ int bigend;
+ size_t i;
+
+ if (nbytes < 4)
+ return 0;
+
+ if (bf[0] == 0xff && bf[1] == 0xfe && bf[2] == 0 && bf[3] == 0)
+ bigend = 0;
+ else if (bf[0] == 0 && bf[1] == 0 && bf[2] == 0xfe && bf[3] == 0xff)
+ bigend = 1;
+ else
+ return 0;
+
+ *ulen = 0;
+
+ for (i = 4; i + 3 < nbytes; i += 4) {
+ /* XXX fix to properly handle chars > 65536 */
+
+ if (bigend)
+ ubf[(*ulen)++] = CAST(unichar, bf[i + 3])
+ | (CAST(unichar, bf[i + 2]) << 8)
+ | (CAST(unichar, bf[i + 1]) << 16)
+ | (CAST(unichar, bf[i]) << 24);
+ else
+ ubf[(*ulen)++] = CAST(unichar, bf[i + 0])
+ | (CAST(unichar, bf[i + 1]) << 8)
+ | (CAST(unichar, bf[i + 2]) << 16)
+ | (CAST(unichar, bf[i + 3]) << 24);
+
+ if (ubf[*ulen - 1] == 0xfffe)
+ return 0;
+ if (ubf[*ulen - 1] < 128 &&
+ text_chars[CAST(size_t, ubf[*ulen - 1])] != T)
+ return 0;
+ }
+
+ return 1 + bigend;
+}
#undef F
#undef T
#undef I