1 files changed, 110 insertions, 365 deletions
diff --git a/src/util/support/utf8_conv.c b/src/util/support/utf8_conv.c
index 80ca90b139e7..5cfc2c512b86 100644
--- a/src/util/support/utf8_conv.c
+++ b/src/util/support/utf8_conv.c
@@ -1,7 +1,7 @@
 /* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil -*- */
 /* util/support/utf8_conv.c */
 /*
- * Copyright 2008 by the Massachusetts Institute of Technology.
+ * Copyright 2008, 2017 by the Massachusetts Institute of Technology.
  * All Rights Reserved.
  *
  * Export of this software from the United States of America may
@@ -47,411 +47,156 @@
  * THE PERPETRATOR TO CRIMINAL AND CIVIL LIABILITY.
  */
 
-/* This work is part of OpenLDAP Software <http://www.openldap.org/>. */
+/* This work is based on OpenLDAP Software <http://www.openldap.org/>. */
 
 /*
- * UTF-8 Conversion Routines
- *
- * These routines convert between Wide Character and UTF-8,
- * or between MultiByte and UTF-8 encodings.
- *
- * Both single character and string versions of the functions are provided.
- * All functions return -1 if the character or string cannot be converted.
+ * These routines convert between UTF-16 and UTF-8.  UTF-16 encodes a Unicode
+ * character in either two or four bytes.  Characters in the Basic Multilingual
+ * Plane (hex 0..D7FF and E000..FFFF) are encoded as-is in two bytes.
+ * Characters in the Supplementary Planes (10000..10FFFF) are split into a high
+ * surrogate and a low surrogate, each containing ten bits of the character
+ * value, and encoded in four bytes.
  */
 
 #include "k5-platform.h"
 #include "k5-utf8.h"
+#include "k5-buf.h"
+#include "k5-input.h"
 #include "supp-int.h"
 
 static unsigned char mask[] = { 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
 
-static ssize_t
-k5_utf8s_to_ucs2s(krb5_ucs2 *ucs2str,
-                  const char *utf8str,
-                  size_t count,
-                  int little_endian)
-{
-    size_t ucs2len = 0;
-    size_t utflen, i;
-    krb5_ucs2 ch;
-
-    /* If input ptr is NULL or empty... */
-    if (utf8str == NULL || *utf8str == '\0') {
-        if (ucs2str != NULL)
-            *ucs2str = 0;
-
-        return 0;
-    }
-
-    /* Examine next UTF-8 character.  */
-    while (ucs2len < count && *utf8str != '\0') {
-        /* Get UTF-8 sequence length from 1st byte */
-        utflen = KRB5_UTF8_CHARLEN2(utf8str, utflen);
-
-        if (utflen == 0 || utflen > KRB5_MAX_UTF8_LEN)
-            return -1;
-
-        /* First byte minus length tag */
-        ch = (krb5_ucs2)(utf8str[0] & mask[utflen]);
-
-        for (i = 1; i < utflen; i++) {
-            /* Subsequent bytes must start with 10 */
-            if ((utf8str[i] & 0xc0) != 0x80)
-                return -1;
-
-            ch <<= 6;                   /* 6 bits of data in each subsequent byte */
-            ch |= (krb5_ucs2)(utf8str[i] & 0x3f);
-        }
-
-        if (ucs2str != NULL) {
-#ifdef K5_BE
-#ifndef SWAP16
-#define SWAP16(X)       ((((X) << 8) | ((X) >> 8)) & 0xFFFF)
-#endif
-            if (little_endian)
-                ucs2str[ucs2len] = SWAP16(ch);
-            else
-#endif
-                ucs2str[ucs2len] = ch;
-        }
-
-        utf8str += utflen;      /* Move to next UTF-8 character */
-        ucs2len++;              /* Count number of wide chars stored/required */
-    }
-
-    if (ucs2str != NULL && ucs2len < count) {
-        /* Add null terminator if there's room in the buffer. */
-        ucs2str[ucs2len] = 0;
-    }
-
-    return ucs2len;
-}
-
-int
-krb5int_utf8s_to_ucs2s(const char *utf8s,
-                       krb5_ucs2 **ucs2s,
-                       size_t *ucs2chars)
-{
-    ssize_t len;
-    size_t chars;
+/* A high surrogate is a ten-bit value ORed with 0xD800 (D800..DBFF). */
+#define IS_HIGH_SURROGATE(c) ((c) >= 0xD800 && (c) <= 0xDBFF)
 
-    chars = krb5int_utf8_chars(utf8s);
-    *ucs2s = (krb5_ucs2 *)malloc((chars + 1) * sizeof(krb5_ucs2));
-    if (*ucs2s == NULL) {
-        return ENOMEM;
-    }
+/* A low surrogate is a ten-bit value ORed with 0xDC00 (DC00..DFFF). */
+#define IS_LOW_SURROGATE(c) ((c) >= 0xDC00 && (c) <= 0xDFFF)
 
-    len = k5_utf8s_to_ucs2s(*ucs2s, utf8s, chars + 1, 0);
-    if (len < 0) {
-        free(*ucs2s);
-        *ucs2s = NULL;
-        return EINVAL;
-    }
+/* A valid Unicode code point is in the range 0..10FFFF and is not a surrogate
+ * value. */
+#define IS_SURROGATE(c) ((c) >= 0xD800 && (c) <= 0xDFFF)
+#define IS_VALID_UNICODE(c) ((c) <= 0x10FFFF && !IS_SURROGATE(c))
 
-    if (ucs2chars != NULL) {
-        *ucs2chars = chars;
-    }
+/* A Basic Multilingual Plane character is in the range 0..FFFF and is not a
+ * surrogate value. */
+#define IS_BMP(c) ((c) <= 0xFFFF && !IS_SURROGATE(c))
 
-    return 0;
-}
+/* Characters in the Supplementary Planes have a base value subtracted from
+ * their code points to form a 20-bit value; ten bits go in each surrogate. */
+#define BASE 0x10000
+#define HIGH_SURROGATE(c) (0xD800 | (((c) - BASE) >> 10))
+#define LOW_SURROGATE(c) (0xDC00 | (((c) - BASE) & 0x3FF))
+#define COMPOSE(c1, c2) (BASE + ((((c1) & 0x3FF) << 10) | ((c2) & 0x3FF)))
 
 int
-krb5int_utf8cs_to_ucs2s(const char *utf8s,
-                        size_t utf8slen,
-                        krb5_ucs2 **ucs2s,
-                        size_t *ucs2chars)
+k5_utf8_to_utf16le(const char *utf8, uint8_t **utf16_out, size_t *nbytes_out)
 {
-    ssize_t len;
-    size_t chars;
-
-    chars = krb5int_utf8c_chars(utf8s, utf8slen);
-    *ucs2s = (krb5_ucs2 *)malloc((chars + 1) * sizeof(krb5_ucs2));
-    if (*ucs2s == NULL) {
-        return ENOMEM;
-    }
-
-    len = k5_utf8s_to_ucs2s(*ucs2s, utf8s, chars, 0);
-    if (len < 0) {
-        free(*ucs2s);
-        *ucs2s = NULL;
-        return EINVAL;
-    }
-    (*ucs2s)[chars] = 0;
-
-    if (ucs2chars != NULL) {
-        *ucs2chars = chars;
-    }
-
-    return 0;
-}
-
-int
-krb5int_utf8s_to_ucs2les(const char *utf8s,
-                         unsigned char **ucs2les,
-                         size_t *ucs2leslen)
-{
-    ssize_t len;
-    size_t chars;
-
-    chars = krb5int_utf8_chars(utf8s);
-
-    *ucs2les = (unsigned char *)malloc((chars + 1) * sizeof(krb5_ucs2));
-    if (*ucs2les == NULL) {
-        return ENOMEM;
-    }
-
-    len = k5_utf8s_to_ucs2s((krb5_ucs2 *)*ucs2les, utf8s, chars + 1, 1);
-    if (len < 0) {
-        free(*ucs2les);
-        *ucs2les = NULL;
-        return EINVAL;
-    }
-
-    if (ucs2leslen != NULL) {
-        *ucs2leslen = chars * sizeof(krb5_ucs2);
-    }
-
-    return 0;
-}
-
-int
-krb5int_utf8cs_to_ucs2les(const char *utf8s,
-                          size_t utf8slen,
-                          unsigned char **ucs2les,
-                          size_t *ucs2leslen)
-{
-    ssize_t len;
-    size_t chars;
-    krb5_ucs2 *ucs2s;
-
-    *ucs2les = NULL;
-
-    chars = krb5int_utf8c_chars(utf8s, utf8slen);
-    ucs2s = malloc((chars + 1) * sizeof(krb5_ucs2));
-    if (ucs2s == NULL)
-        return ENOMEM;
-
-    len = k5_utf8s_to_ucs2s(ucs2s, utf8s, chars, 1);
-    if (len < 0) {
-        free(ucs2s);
-        return EINVAL;
-    }
-    ucs2s[chars] = 0;
-
-    *ucs2les = (unsigned char *)ucs2s;
-    if (ucs2leslen != NULL) {
-        *ucs2leslen = chars * sizeof(krb5_ucs2);
-    }
+    struct k5buf buf;
+    krb5_ucs4 ch;
+    size_t chlen, i;
+    uint8_t *p;
 
-    return 0;
-}
+    *utf16_out = NULL;
+    *nbytes_out = 0;
 
-/*-----------------------------------------------------------------------------
-  Convert a wide char string to a UTF-8 string.
-  No more than 'count' bytes will be written to the output buffer.
-  Return the # of bytes written to the output buffer, excl null terminator.
+    k5_buf_init_dynamic(&buf);
 
-  ucs2len is -1 if the UCS-2 string is NUL terminated, otherwise it is the
-  length of the UCS-2 string in characters
-*/
-static ssize_t
-k5_ucs2s_to_utf8s(char *utf8str, const krb5_ucs2 *ucs2str,
-                  size_t count, ssize_t ucs2len, int little_endian)
-{
-    int len = 0;
-    int n;
-    char *p = utf8str;
-    krb5_ucs2 empty = 0, ch;
+    /* Examine next UTF-8 character. */
+    while (*utf8 != '\0') {
+        /* Get UTF-8 sequence length from first byte. */
+        chlen = KRB5_UTF8_CHARLEN2(utf8, chlen);
+        if (chlen == 0)
+            goto invalid;
 
-    if (ucs2str == NULL)        /* Treat input ptr NULL as an empty string */
-        ucs2str = &empty;
+        /* First byte minus length tag */
+        ch = (krb5_ucs4)(utf8[0] & mask[chlen]);
 
-    if (utf8str == NULL)        /* Just compute size of output, excl null */
-    {
-        while (ucs2len == -1 ? *ucs2str : --ucs2len >= 0) {
-            /* Get UTF-8 size of next wide char */
-            ch = *ucs2str++;
-#ifdef K5_BE
-            if (little_endian)
-                ch = SWAP16(ch);
-#endif
+        for (i = 1; i < chlen; i++) {
+            /* Subsequent bytes must start with 10. */
+            if ((utf8[i] & 0xc0) != 0x80)
+                goto invalid;
 
-            n = krb5int_ucs2_to_utf8(ch, NULL);
-            if (n < 1 || n > INT_MAX - len)
-                return -1;
-            len += n;
+            /* 6 bits of data in each subsequent byte */
+            ch <<= 6;
+            ch |= (krb5_ucs4)(utf8[i] & 0x3f);
+        }
+        if (!IS_VALID_UNICODE(ch))
+            goto invalid;
+
+        /* Characters in the basic multilingual plane are encoded using two
+         * bytes; other characters are encoded using four bytes. */
+        p = k5_buf_get_space(&buf, IS_BMP(ch) ? 2 : 4);
+        if (p == NULL)
+            return ENOMEM;
+        if (IS_BMP(ch)) {
+            store_16_le(ch, p);
+        } else {
+            /* 0x10000 is subtracted from ch; then the high ten bits plus
+             * 0xD800 and the low ten bits plus 0xDC00 are the surrogates. */
+            store_16_le(HIGH_SURROGATE(ch), p);
+            store_16_le(LOW_SURROGATE(ch), p + 2);
         }
 
-        return len;
-    }
-
-    /* Do the actual conversion. */
-
-    n = 1;                                      /* In case of empty ucs2str */
-    while (ucs2len == -1 ? *ucs2str != 0 : --ucs2len >= 0) {
-        ch = *ucs2str++;
-#ifdef K5_BE
-        if (little_endian)
-            ch = SWAP16(ch);
-#endif
-
-        n = krb5int_ucs2_to_utf8(ch, p);
-
-        if (n < 1)
-            break;
-
-        p += n;
-        count -= n;                     /* Space left in output buffer */
-    }
-
-    /* If not enough room for last character, pad remainder with null
-       so that return value = original count, indicating buffer full. */
-    if (n == 0) {
-        while (count--)
-            *p++ = 0;
-    }
-    /* Add a null terminator if there's room. */
-    else if (count)
-        *p = 0;
-
-    if (n == -1)                        /* Conversion encountered invalid wide char. */
-        return -1;
-
-    /* Return the number of bytes written to output buffer, excl null. */
-    return (p - utf8str);
-}
-
-int
-krb5int_ucs2s_to_utf8s(const krb5_ucs2 *ucs2s,
-                       char **utf8s,
-                       size_t *utf8slen)
-{
-    ssize_t len;
-
-    len = k5_ucs2s_to_utf8s(NULL, ucs2s, 0, -1, 0);
-    if (len < 0) {
-        return EINVAL;
-    }
-
-    *utf8s = (char *)malloc((size_t)len + 1);
-    if (*utf8s == NULL) {
-        return ENOMEM;
-    }
-
-    len = k5_ucs2s_to_utf8s(*utf8s, ucs2s, (size_t)len + 1, -1, 0);
-    if (len < 0) {
-        free(*utf8s);
-        *utf8s = NULL;
-        return EINVAL;
-    }
-
-    if (utf8slen != NULL) {
-        *utf8slen = len;
+        /* Move to next UTF-8 character. */
+        utf8 += chlen;
     }
 
+    *utf16_out = buf.data;
+    *nbytes_out = buf.len;
     return 0;
-}
 
-int
-krb5int_ucs2les_to_utf8s(const unsigned char *ucs2les,
-                         char **utf8s,
-                         size_t *utf8slen)
-{
-    ssize_t len;
-
-    len = k5_ucs2s_to_utf8s(NULL, (krb5_ucs2 *)ucs2les, 0, -1, 1);
-    if (len < 0)
-        return EINVAL;
-
-    *utf8s = (char *)malloc((size_t)len + 1);
-    if (*utf8s == NULL) {
-        return ENOMEM;
-    }
-
-    len = k5_ucs2s_to_utf8s(*utf8s, (krb5_ucs2 *)ucs2les, (size_t)len + 1, -1, 1);
-    if (len < 0) {
-        free(*utf8s);
-        *utf8s = NULL;
-        return EINVAL;
-    }
-
-    if (utf8slen != NULL) {
-        *utf8slen = len;
-    }
-
-    return 0;
+invalid:
+    k5_buf_free(&buf);
+    return EINVAL;
 }
 
 int
-krb5int_ucs2cs_to_utf8s(const krb5_ucs2 *ucs2s,
-                        size_t ucs2slen,
-                        char **utf8s,
-                        size_t *utf8slen)
+k5_utf16le_to_utf8(const uint8_t *utf16bytes, size_t nbytes, char **utf8_out)
 {
-    ssize_t len;
+    struct k5buf buf;
+    struct k5input in;
+    uint16_t ch1, ch2;
+    krb5_ucs4 ch;
+    size_t chlen;
+    void *p;
 
-    if (ucs2slen > SSIZE_MAX)
-        return ERANGE;
+    *utf8_out = NULL;
 
-    len = k5_ucs2s_to_utf8s(NULL, (krb5_ucs2 *)ucs2s, 0,
-                            (ssize_t)ucs2slen, 0);
-    if (len < 0)
+    if (nbytes % 2 != 0)
         return EINVAL;
 
-    *utf8s = (char *)malloc((size_t)len + 1);
-    if (*utf8s == NULL) {
-        return ENOMEM;
-    }
+    k5_buf_init_dynamic(&buf);
+    k5_input_init(&in, utf16bytes, nbytes);
+    while (!in.status && in.len > 0) {
+        /* Get the next character or high surrogate.  A low surrogate without a
+         * preceding high surrogate is invalid. */
+        ch1 = k5_input_get_uint16_le(&in);
+        if (IS_LOW_SURROGATE(ch1))
+            goto invalid;
+        if (IS_HIGH_SURROGATE(ch1)) {
+            /* Get the low surrogate and combine the pair. */
+            ch2 = k5_input_get_uint16_le(&in);
+            if (!IS_LOW_SURROGATE(ch2))
+                goto invalid;
+            ch = COMPOSE(ch1, ch2);
+        } else {
+            ch = ch1;
+        }
 
-    len = k5_ucs2s_to_utf8s(*utf8s, (krb5_ucs2 *)ucs2s, (size_t)len,
-                            (ssize_t)ucs2slen, 0);
-    if (len < 0) {
-        free(*utf8s);
-        *utf8s = NULL;
-        return EINVAL;
+        chlen = krb5int_ucs4_to_utf8(ch, NULL);
+        p = k5_buf_get_space(&buf, chlen);
+        if (p == NULL)
+            return ENOMEM;
+        (void)krb5int_ucs4_to_utf8(ch, p);
     }
-    (*utf8s)[len] = '\0';
 
-    if (utf8slen != NULL) {
-        *utf8slen = len;
-    }
+    if (in.status)
+        goto invalid;
 
+    *utf8_out = buf.data;
     return 0;
-}
-
-int
-krb5int_ucs2lecs_to_utf8s(const unsigned char *ucs2les,
-                          size_t ucs2leslen,
-                          char **utf8s,
-                          size_t *utf8slen)
-{
-    ssize_t len;
 
-    if (ucs2leslen > SSIZE_MAX)
-        return ERANGE;
-
-    len = k5_ucs2s_to_utf8s(NULL, (krb5_ucs2 *)ucs2les, 0,
-                            (ssize_t)ucs2leslen, 1);
-    if (len < 0)
-        return EINVAL;
-
-    *utf8s = (char *)malloc((size_t)len + 1);
-    if (*utf8s == NULL) {
-        return ENOMEM;
-    }
-
-    len = k5_ucs2s_to_utf8s(*utf8s, (krb5_ucs2 *)ucs2les, (size_t)len,
-                            (ssize_t)ucs2leslen, 1);
-    if (len < 0) {
-        free(*utf8s);
-        *utf8s = NULL;
-        return EINVAL;
-    }
-    (*utf8s)[len] = '\0';
-
-    if (utf8slen != NULL) {
-        *utf8slen = len;
-    }
-
-    return 0;
+invalid:
+    k5_buf_free(&buf);
+    return EINVAL;
 }