1 files changed, 67 insertions, 12 deletions
diff --git a/src/encoding.c b/src/encoding.c
index 3e7b9e584b08..76244f87f95b 100644
--- a/src/encoding.c
+++ b/src/encoding.c
@@ -35,7 +35,7 @@
 #include "file.h"
 
 #ifndef	lint
-FILE_RCSID("@(#)$File: encoding.c,v 1.14 2017/11/02 20:25:39 christos Exp $")
+FILE_RCSID("@(#)$File: encoding.c,v 1.20 2019/04/15 16:48:41 christos Exp $")
 #endif	/* lint */
 
 #include "magic.h"
@@ -49,6 +49,7 @@ private int looks_utf8_with_BOM(const unsigned char *, size_t, unichar *,
     size_t *);
 private int looks_utf7(const unsigned char *, size_t, unichar *, size_t *);
 private int looks_ucs16(const unsigned char *, size_t, unichar *, size_t *);
+private int looks_ucs32(const unsigned char *, size_t, unichar *, size_t *);
 private int looks_latin1(const unsigned char *, size_t, unichar *, size_t *);
 private int looks_extended(const unsigned char *, size_t, unichar *, size_t *);
 private void from_ebcdic(const unsigned char *, size_t, unsigned char *);
@@ -69,7 +70,7 @@ protected int
 file_encoding(struct magic_set *ms, const struct buffer *b, unichar **ubuf,
     size_t *ulen, const char **code, const char **code_mime, const char **type)
 {
-	const unsigned char *buf = b->fbuf;
+	const unsigned char *buf = CAST(const unsigned char *, b->fbuf);
 	size_t nbytes = b->flen;
 	size_t mlen;
 	int rv = 1, ucs_type;
@@ -88,12 +89,13 @@ file_encoding(struct magic_set *ms, const struct buffer *b, unichar **ubuf,
 	*code_mime = "binary";
 
 	mlen = (nbytes + 1) * sizeof((*ubuf)[0]);
-	if ((*ubuf = CAST(unichar *, calloc((size_t)1, mlen))) == NULL) {
+	if ((*ubuf = CAST(unichar *, calloc(CAST(size_t, 1), mlen))) == NULL) {
 		file_oomem(ms, mlen);
 		goto done;
 	}
 	mlen = (nbytes + 1) * sizeof(nbuf[0]);
-	if ((nbuf = CAST(unsigned char *, calloc((size_t)1, mlen))) == NULL) {
+	if ((nbuf = CAST(unsigned char *,
+	    calloc(CAST(size_t, 1), mlen))) == NULL) {
 		file_oomem(ms, mlen);
 		goto done;
 	}
@@ -116,6 +118,15 @@ file_encoding(struct magic_set *ms, const struct buffer *b, unichar **ubuf,
 		DPRINTF(("utf8 %" SIZE_T_FORMAT "u\n", *ulen));
 		*code = "UTF-8 Unicode";
 		*code_mime = "utf-8";
+	} else if ((ucs_type = looks_ucs32(buf, nbytes, *ubuf, ulen)) != 0) {
+		if (ucs_type == 1) {
+			*code = "Little-endian UTF-32 Unicode";
+			*code_mime = "utf-32le";
+		} else {
+			*code = "Big-endian UTF-32 Unicode";
+			*code_mime = "utf-32be";
+		}
+		DPRINTF(("ucs32 %" SIZE_T_FORMAT "u\n", *ulen));
 	} else if ((ucs_type = looks_ucs16(buf, nbytes, *ubuf, ulen)) != 0) {
 		if (ucs_type == 1) {
 			*code = "Little-endian UTF-16 Unicode";
@@ -410,7 +421,7 @@ looks_utf7(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen)
 }
 
 private int
-looks_ucs16(const unsigned char *buf, size_t nbytes, unichar *ubuf,
+looks_ucs16(const unsigned char *bf, size_t nbytes, unichar *ubf,
     size_t *ulen)
 {
 	int bigend;
@@ -419,9 +430,9 @@ looks_ucs16(const unsigned char *buf, size_t nbytes, unichar *ubuf,
 	if (nbytes < 2)
 		return 0;
 
-	if (buf[0] == 0xff && buf[1] == 0xfe)
+	if (bf[0] == 0xff && bf[1] == 0xfe)
 		bigend = 0;
-	else if (buf[0] == 0xfe && buf[1] == 0xff)
+	else if (bf[0] == 0xfe && bf[1] == 0xff)
 		bigend = 1;
 	else
 		return 0;
@@ -432,20 +443,64 @@ looks_ucs16(const unsigned char *buf, size_t nbytes, unichar *ubuf,
 		/* XXX fix to properly handle chars > 65536 */
 
 		if (bigend)
-			ubuf[(*ulen)++] = buf[i + 1] + 256 * buf[i];
+			ubf[(*ulen)++] = bf[i + 1]
+			    | (CAST(unichar, bf[i]) << 8);
 		else
-			ubuf[(*ulen)++] = buf[i] + 256 * buf[i + 1];
+			ubf[(*ulen)++] = bf[i]
+			    | (CAST(unichar, bf[i + 1]) << 8);
 
-		if (ubuf[*ulen - 1] == 0xfffe)
+		if (ubf[*ulen - 1] == 0xfffe)
 			return 0;
-		if (ubuf[*ulen - 1] < 128 &&
-		    text_chars[(size_t)ubuf[*ulen - 1]] != T)
+		if (ubf[*ulen - 1] < 128 &&
+		    text_chars[CAST(size_t, ubf[*ulen - 1])] != T)
 			return 0;
 	}
 
 	return 1 + bigend;
 }
 
+private int
+looks_ucs32(const unsigned char *bf, size_t nbytes, unichar *ubf,
+    size_t *ulen)
+{
+	int bigend;
+	size_t i;
+
+	if (nbytes < 4)
+		return 0;
+
+	if (bf[0] == 0xff && bf[1] == 0xfe && bf[2] == 0 && bf[3] == 0)
+		bigend = 0;
+	else if (bf[0] == 0 && bf[1] == 0 && bf[2] == 0xfe && bf[3] == 0xff)
+		bigend = 1;
+	else
+		return 0;
+
+	*ulen = 0;
+
+	for (i = 4; i + 3 < nbytes; i += 4) {
+		/* XXX fix to properly handle chars > 65536 */
+
+		if (bigend)
+			ubf[(*ulen)++] = CAST(unichar, bf[i + 3])
+			    | (CAST(unichar, bf[i + 2]) << 8)
+			    | (CAST(unichar, bf[i + 1]) << 16)
+			    | (CAST(unichar, bf[i]) << 24);
+		else
+			ubf[(*ulen)++] = CAST(unichar, bf[i + 0])
+			    | (CAST(unichar, bf[i + 1]) << 8) 
+			    | (CAST(unichar, bf[i + 2]) << 16)
+			    | (CAST(unichar, bf[i + 3]) << 24);
+
+		if (ubf[*ulen - 1] == 0xfffe)
+			return 0;
+		if (ubf[*ulen - 1] < 128 &&
+		    text_chars[CAST(size_t, ubf[*ulen - 1])] != T)
+			return 0;
+	}
+
+	return 1 + bigend;
+}
 #undef F
 #undef T
 #undef I