summary | refs | log | tree | commit | diff
path: root/src/util/support/utf8_conv.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/util/support/utf8_conv.c')
-rw-r--r-- src/util/support/utf8_conv.c 475
1 files changed, 110 insertions, 365 deletions
diff --git a/src/util/support/utf8_conv.c b/src/util/support/utf8_conv.c
index 80ca90b139e7..5cfc2c512b86 100644
--- a/src/util/support/utf8_conv.c
+++ b/src/util/support/utf8_conv.c
@@ -1,7 +1,7 @@
/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil -*- */
/* util/support/utf8_conv.c */
/*
- * Copyright 2008 by the Massachusetts Institute of Technology.
+ * Copyright 2008, 2017 by the Massachusetts Institute of Technology.
* All Rights Reserved.
*
* Export of this software from the United States of America may
@@ -47,411 +47,156 @@
* THE PERPETRATOR TO CRIMINAL AND CIVIL LIABILITY.
*/
-/* This work is part of OpenLDAP Software <http://www.openldap.org/>. */
+/* This work is based on OpenLDAP Software <http://www.openldap.org/>. */
/*
- * UTF-8 Conversion Routines
- *
- * These routines convert between Wide Character and UTF-8,
- * or between MultiByte and UTF-8 encodings.
- *
- * Both single character and string versions of the functions are provided.
- * All functions return -1 if the character or string cannot be converted.
+ * These routines convert between UTF-16 and UTF-8. UTF-16 encodes a Unicode
+ * character in either two or four bytes. Characters in the Basic Multilingual
+ * Plane (hex 0..D7FF and E000..FFFF) are encoded as-is in two bytes.
+ * Characters in the Supplementary Planes (10000..10FFFF) are split into a high
+ * surrogate and a low surrogate, each containing ten bits of the character
+ * value, and encoded in four bytes.
*/
#include "k5-platform.h"
#include "k5-utf8.h"
+#include "k5-buf.h"
+#include "k5-input.h"
#include "supp-int.h"
static unsigned char mask[] = { 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
-static ssize_t
-k5_utf8s_to_ucs2s(krb5_ucs2 *ucs2str,
- const char *utf8str,
- size_t count,
- int little_endian)
-{
- size_t ucs2len = 0;
- size_t utflen, i;
- krb5_ucs2 ch;
-
- /* If input ptr is NULL or empty... */
- if (utf8str == NULL || *utf8str == '\0') {
- if (ucs2str != NULL)
- *ucs2str = 0;
-
- return 0;
- }
-
- /* Examine next UTF-8 character. */
- while (ucs2len < count && *utf8str != '\0') {
- /* Get UTF-8 sequence length from 1st byte */
- utflen = KRB5_UTF8_CHARLEN2(utf8str, utflen);
-
- if (utflen == 0 || utflen > KRB5_MAX_UTF8_LEN)
- return -1;
-
- /* First byte minus length tag */
- ch = (krb5_ucs2)(utf8str[0] & mask[utflen]);
-
- for (i = 1; i < utflen; i++) {
- /* Subsequent bytes must start with 10 */
- if ((utf8str[i] & 0xc0) != 0x80)
- return -1;
-
- ch <<= 6; /* 6 bits of data in each subsequent byte */
- ch |= (krb5_ucs2)(utf8str[i] & 0x3f);
- }
-
- if (ucs2str != NULL) {
-#ifdef K5_BE
-#ifndef SWAP16
-#define SWAP16(X) ((((X) << 8) | ((X) >> 8)) & 0xFFFF)
-#endif
- if (little_endian)
- ucs2str[ucs2len] = SWAP16(ch);
- else
-#endif
- ucs2str[ucs2len] = ch;
- }
-
- utf8str += utflen; /* Move to next UTF-8 character */
- ucs2len++; /* Count number of wide chars stored/required */
- }
-
- if (ucs2str != NULL && ucs2len < count) {
- /* Add null terminator if there's room in the buffer. */
- ucs2str[ucs2len] = 0;
- }
-
- return ucs2len;
-}
-
-int
-krb5int_utf8s_to_ucs2s(const char *utf8s,
- krb5_ucs2 **ucs2s,
- size_t *ucs2chars)
-{
- ssize_t len;
- size_t chars;
+/* A high surrogate is 0xD800 ORed with ten bits of the character value. */
+#define IS_HIGH_SURROGATE(c) ((c) >= 0xD800 && (c) <= 0xDBFF)
- chars = krb5int_utf8_chars(utf8s);
- *ucs2s = (krb5_ucs2 *)malloc((chars + 1) * sizeof(krb5_ucs2));
- if (*ucs2s == NULL) {
- return ENOMEM;
- }
+/* A low surrogate is 0xDC00 ORed with ten bits of the character value. */
+#define IS_LOW_SURROGATE(c) ((c) >= 0xDC00 && (c) <= 0xDFFF)
- len = k5_utf8s_to_ucs2s(*ucs2s, utf8s, chars + 1, 0);
- if (len < 0) {
- free(*ucs2s);
- *ucs2s = NULL;
- return EINVAL;
- }
+/* A valid Unicode code point is in the range 0..10FFFF and is not a surrogate
+ * value. */
+#define IS_SURROGATE(c) ((c) >= 0xD800 && (c) <= 0xDFFF)
+#define IS_VALID_UNICODE(c) ((c) <= 0x10FFFF && !IS_SURROGATE(c))
- if (ucs2chars != NULL) {
- *ucs2chars = chars;
- }
+/* A Basic Multilingual Plane character is in the range 0..FFFF and is not a
+ * surrogate value. */
+#define IS_BMP(c) ((c) <= 0xFFFF && !IS_SURROGATE(c))
- return 0;
-}
+/* Characters in the Supplementary Planes have a base value subtracted from
+ * their code points to form a 20-bit value; ten bits go in each surrogate. */
+#define BASE 0x10000
+#define HIGH_SURROGATE(c) (0xD800 | (((c) - BASE) >> 10))
+#define LOW_SURROGATE(c) (0xDC00 | (((c) - BASE) & 0x3FF))
+#define COMPOSE(c1, c2) (BASE + ((((c1) & 0x3FF) << 10) | ((c2) & 0x3FF)))
int
-krb5int_utf8cs_to_ucs2s(const char *utf8s,
- size_t utf8slen,
- krb5_ucs2 **ucs2s,
- size_t *ucs2chars)
+k5_utf8_to_utf16le(const char *utf8, uint8_t **utf16_out, size_t *nbytes_out)
{
- ssize_t len;
- size_t chars;
-
- chars = krb5int_utf8c_chars(utf8s, utf8slen);
- *ucs2s = (krb5_ucs2 *)malloc((chars + 1) * sizeof(krb5_ucs2));
- if (*ucs2s == NULL) {
- return ENOMEM;
- }
-
- len = k5_utf8s_to_ucs2s(*ucs2s, utf8s, chars, 0);
- if (len < 0) {
- free(*ucs2s);
- *ucs2s = NULL;
- return EINVAL;
- }
- (*ucs2s)[chars] = 0;
-
- if (ucs2chars != NULL) {
- *ucs2chars = chars;
- }
-
- return 0;
-}
-
-int
-krb5int_utf8s_to_ucs2les(const char *utf8s,
- unsigned char **ucs2les,
- size_t *ucs2leslen)
-{
- ssize_t len;
- size_t chars;
-
- chars = krb5int_utf8_chars(utf8s);
-
- *ucs2les = (unsigned char *)malloc((chars + 1) * sizeof(krb5_ucs2));
- if (*ucs2les == NULL) {
- return ENOMEM;
- }
-
- len = k5_utf8s_to_ucs2s((krb5_ucs2 *)*ucs2les, utf8s, chars + 1, 1);
- if (len < 0) {
- free(*ucs2les);
- *ucs2les = NULL;
- return EINVAL;
- }
-
- if (ucs2leslen != NULL) {
- *ucs2leslen = chars * sizeof(krb5_ucs2);
- }
-
- return 0;
-}
-
-int
-krb5int_utf8cs_to_ucs2les(const char *utf8s,
- size_t utf8slen,
- unsigned char **ucs2les,
- size_t *ucs2leslen)
-{
- ssize_t len;
- size_t chars;
- krb5_ucs2 *ucs2s;
-
- *ucs2les = NULL;
-
- chars = krb5int_utf8c_chars(utf8s, utf8slen);
- ucs2s = malloc((chars + 1) * sizeof(krb5_ucs2));
- if (ucs2s == NULL)
- return ENOMEM;
-
- len = k5_utf8s_to_ucs2s(ucs2s, utf8s, chars, 1);
- if (len < 0) {
- free(ucs2s);
- return EINVAL;
- }
- ucs2s[chars] = 0;
-
- *ucs2les = (unsigned char *)ucs2s;
- if (ucs2leslen != NULL) {
- *ucs2leslen = chars * sizeof(krb5_ucs2);
- }
+ struct k5buf buf;
+ krb5_ucs4 ch;
+ size_t chlen, i;
+ uint8_t *p;
- return 0;
-}
+ *utf16_out = NULL;
+ *nbytes_out = 0;
-/*-----------------------------------------------------------------------------
- Convert a wide char string to a UTF-8 string.
- No more than 'count' bytes will be written to the output buffer.
- Return the # of bytes written to the output buffer, excl null terminator.
+ k5_buf_init_dynamic(&buf);
- ucs2len is -1 if the UCS-2 string is NUL terminated, otherwise it is the
- length of the UCS-2 string in characters
-*/
-static ssize_t
-k5_ucs2s_to_utf8s(char *utf8str, const krb5_ucs2 *ucs2str,
- size_t count, ssize_t ucs2len, int little_endian)
-{
- int len = 0;
- int n;
- char *p = utf8str;
- krb5_ucs2 empty = 0, ch;
+ /* Examine next UTF-8 character. */
+ while (*utf8 != '\0') {
+ /* Get UTF-8 sequence length from first byte. */
+ chlen = KRB5_UTF8_CHARLEN2(utf8, chlen);
+ if (chlen == 0)
+ goto invalid;
- if (ucs2str == NULL) /* Treat input ptr NULL as an empty string */
- ucs2str = &empty;
+ /* First byte minus length tag */
+ ch = (krb5_ucs4)(utf8[0] & mask[chlen]);
- if (utf8str == NULL) /* Just compute size of output, excl null */
- {
- while (ucs2len == -1 ? *ucs2str : --ucs2len >= 0) {
- /* Get UTF-8 size of next wide char */
- ch = *ucs2str++;
-#ifdef K5_BE
- if (little_endian)
- ch = SWAP16(ch);
-#endif
+ for (i = 1; i < chlen; i++) {
+ /* Subsequent bytes must start with 10. */
+ if ((utf8[i] & 0xc0) != 0x80)
+ goto invalid;
- n = krb5int_ucs2_to_utf8(ch, NULL);
- if (n < 1 || n > INT_MAX - len)
- return -1;
- len += n;
+ /* 6 bits of data in each subsequent byte */
+ ch <<= 6;
+ ch |= (krb5_ucs4)(utf8[i] & 0x3f);
+ }
+ if (!IS_VALID_UNICODE(ch))
+ goto invalid;
+
+ /* Characters in the basic multilingual plane are encoded using two
+ * bytes; other characters are encoded using four bytes. */
+ p = k5_buf_get_space(&buf, IS_BMP(ch) ? 2 : 4);
+ if (p == NULL)
+ return ENOMEM;
+ if (IS_BMP(ch)) {
+ store_16_le(ch, p);
+ } else {
+ /* 0x10000 is subtracted from ch; then the high ten bits plus
+ * 0xD800 and the low ten bits plus 0xDC00 are the surrogates. */
+ store_16_le(HIGH_SURROGATE(ch), p);
+ store_16_le(LOW_SURROGATE(ch), p + 2);
}
- return len;
- }
-
- /* Do the actual conversion. */
-
- n = 1; /* In case of empty ucs2str */
- while (ucs2len == -1 ? *ucs2str != 0 : --ucs2len >= 0) {
- ch = *ucs2str++;
-#ifdef K5_BE
- if (little_endian)
- ch = SWAP16(ch);
-#endif
-
- n = krb5int_ucs2_to_utf8(ch, p);
-
- if (n < 1)
- break;
-
- p += n;
- count -= n; /* Space left in output buffer */
- }
-
- /* If not enough room for last character, pad remainder with null
- so that return value = original count, indicating buffer full. */
- if (n == 0) {
- while (count--)
- *p++ = 0;
- }
- /* Add a null terminator if there's room. */
- else if (count)
- *p = 0;
-
- if (n == -1) /* Conversion encountered invalid wide char. */
- return -1;
-
- /* Return the number of bytes written to output buffer, excl null. */
- return (p - utf8str);
-}
-
-int
-krb5int_ucs2s_to_utf8s(const krb5_ucs2 *ucs2s,
- char **utf8s,
- size_t *utf8slen)
-{
- ssize_t len;
-
- len = k5_ucs2s_to_utf8s(NULL, ucs2s, 0, -1, 0);
- if (len < 0) {
- return EINVAL;
- }
-
- *utf8s = (char *)malloc((size_t)len + 1);
- if (*utf8s == NULL) {
- return ENOMEM;
- }
-
- len = k5_ucs2s_to_utf8s(*utf8s, ucs2s, (size_t)len + 1, -1, 0);
- if (len < 0) {
- free(*utf8s);
- *utf8s = NULL;
- return EINVAL;
- }
-
- if (utf8slen != NULL) {
- *utf8slen = len;
+ /* Move to next UTF-8 character. */
+ utf8 += chlen;
}
+ *utf16_out = buf.data;
+ *nbytes_out = buf.len;
return 0;
-}
-int
-krb5int_ucs2les_to_utf8s(const unsigned char *ucs2les,
- char **utf8s,
- size_t *utf8slen)
-{
- ssize_t len;
-
- len = k5_ucs2s_to_utf8s(NULL, (krb5_ucs2 *)ucs2les, 0, -1, 1);
- if (len < 0)
- return EINVAL;
-
- *utf8s = (char *)malloc((size_t)len + 1);
- if (*utf8s == NULL) {
- return ENOMEM;
- }
-
- len = k5_ucs2s_to_utf8s(*utf8s, (krb5_ucs2 *)ucs2les, (size_t)len + 1, -1, 1);
- if (len < 0) {
- free(*utf8s);
- *utf8s = NULL;
- return EINVAL;
- }
-
- if (utf8slen != NULL) {
- *utf8slen = len;
- }
-
- return 0;
+invalid:
+ k5_buf_free(&buf);
+ return EINVAL;
}
int
-krb5int_ucs2cs_to_utf8s(const krb5_ucs2 *ucs2s,
- size_t ucs2slen,
- char **utf8s,
- size_t *utf8slen)
+k5_utf16le_to_utf8(const uint8_t *utf16bytes, size_t nbytes, char **utf8_out)
{
- ssize_t len;
+ struct k5buf buf;
+ struct k5input in;
+ uint16_t ch1, ch2;
+ krb5_ucs4 ch;
+ size_t chlen;
+ void *p;
- if (ucs2slen > SSIZE_MAX)
- return ERANGE;
+ *utf8_out = NULL;
- len = k5_ucs2s_to_utf8s(NULL, (krb5_ucs2 *)ucs2s, 0,
- (ssize_t)ucs2slen, 0);
- if (len < 0)
+ if (nbytes % 2 != 0)
return EINVAL;
- *utf8s = (char *)malloc((size_t)len + 1);
- if (*utf8s == NULL) {
- return ENOMEM;
- }
+ k5_buf_init_dynamic(&buf);
+ k5_input_init(&in, utf16bytes, nbytes);
+ while (!in.status && in.len > 0) {
+ /* Get the next character or high surrogate. A low surrogate without a
+ * preceding high surrogate is invalid. */
+ ch1 = k5_input_get_uint16_le(&in);
+ if (IS_LOW_SURROGATE(ch1))
+ goto invalid;
+ if (IS_HIGH_SURROGATE(ch1)) {
+ /* Get the low surrogate and combine the pair. */
+ ch2 = k5_input_get_uint16_le(&in);
+ if (!IS_LOW_SURROGATE(ch2))
+ goto invalid;
+ ch = COMPOSE(ch1, ch2);
+ } else {
+ ch = ch1;
+ }
- len = k5_ucs2s_to_utf8s(*utf8s, (krb5_ucs2 *)ucs2s, (size_t)len,
- (ssize_t)ucs2slen, 0);
- if (len < 0) {
- free(*utf8s);
- *utf8s = NULL;
- return EINVAL;
+ chlen = krb5int_ucs4_to_utf8(ch, NULL);
+ p = k5_buf_get_space(&buf, chlen);
+ if (p == NULL)
+ return ENOMEM;
+ (void)krb5int_ucs4_to_utf8(ch, p);
}
- (*utf8s)[len] = '\0';
- if (utf8slen != NULL) {
- *utf8slen = len;
- }
+ if (in.status)
+ goto invalid;
+ *utf8_out = buf.data;
return 0;
-}
-
-int
-krb5int_ucs2lecs_to_utf8s(const unsigned char *ucs2les,
- size_t ucs2leslen,
- char **utf8s,
- size_t *utf8slen)
-{
- ssize_t len;
- if (ucs2leslen > SSIZE_MAX)
- return ERANGE;
-
- len = k5_ucs2s_to_utf8s(NULL, (krb5_ucs2 *)ucs2les, 0,
- (ssize_t)ucs2leslen, 1);
- if (len < 0)
- return EINVAL;
-
- *utf8s = (char *)malloc((size_t)len + 1);
- if (*utf8s == NULL) {
- return ENOMEM;
- }
-
- len = k5_ucs2s_to_utf8s(*utf8s, (krb5_ucs2 *)ucs2les, (size_t)len,
- (ssize_t)ucs2leslen, 1);
- if (len < 0) {
- free(*utf8s);
- *utf8s = NULL;
- return EINVAL;
- }
- (*utf8s)[len] = '\0';
-
- if (utf8slen != NULL) {
- *utf8slen = len;
- }
-
- return 0;
+invalid:
+ k5_buf_free(&buf);
+ return EINVAL;
}