diff options
Diffstat (limited to 'src/util/support/utf8_conv.c')
-rw-r--r-- | src/util/support/utf8_conv.c | 475 |
1 files changed, 110 insertions, 365 deletions
diff --git a/src/util/support/utf8_conv.c b/src/util/support/utf8_conv.c index 80ca90b139e7..5cfc2c512b86 100644 --- a/src/util/support/utf8_conv.c +++ b/src/util/support/utf8_conv.c @@ -1,7 +1,7 @@ /* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil -*- */ /* util/support/utf8_conv.c */ /* - * Copyright 2008 by the Massachusetts Institute of Technology. + * Copyright 2008, 2017 by the Massachusetts Institute of Technology. * All Rights Reserved. * * Export of this software from the United States of America may @@ -47,411 +47,156 @@ * THE PERPETRATOR TO CRIMINAL AND CIVIL LIABILITY. */ -/* This work is part of OpenLDAP Software <http://www.openldap.org/>. */ +/* This work is based on OpenLDAP Software <http://www.openldap.org/>. */ /* - * UTF-8 Conversion Routines - * - * These routines convert between Wide Character and UTF-8, - * or between MultiByte and UTF-8 encodings. - * - * Both single character and string versions of the functions are provided. - * All functions return -1 if the character or string cannot be converted. + * These routines convert between UTF-16 and UTF-8. UTF-16 encodes a Unicode + * character in either two or four bytes. Characters in the Basic Multilingual + * Plane (hex 0..D7FF and E000..FFFF) are encoded as-is in two bytes. + * Characters in the Supplementary Planes (10000..10FFFF) are split into a high + * surrogate and a low surrogate, each containing ten bits of the character + * value, and encoded in four bytes. */ #include "k5-platform.h" #include "k5-utf8.h" +#include "k5-buf.h" +#include "k5-input.h" #include "supp-int.h" static unsigned char mask[] = { 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 }; -static ssize_t -k5_utf8s_to_ucs2s(krb5_ucs2 *ucs2str, - const char *utf8str, - size_t count, - int little_endian) -{ - size_t ucs2len = 0; - size_t utflen, i; - krb5_ucs2 ch; - - /* If input ptr is NULL or empty... 
*/ - if (utf8str == NULL || *utf8str == '\0') { - if (ucs2str != NULL) - *ucs2str = 0; - - return 0; - } - - /* Examine next UTF-8 character. */ - while (ucs2len < count && *utf8str != '\0') { - /* Get UTF-8 sequence length from 1st byte */ - utflen = KRB5_UTF8_CHARLEN2(utf8str, utflen); - - if (utflen == 0 || utflen > KRB5_MAX_UTF8_LEN) - return -1; - - /* First byte minus length tag */ - ch = (krb5_ucs2)(utf8str[0] & mask[utflen]); - - for (i = 1; i < utflen; i++) { - /* Subsequent bytes must start with 10 */ - if ((utf8str[i] & 0xc0) != 0x80) - return -1; - - ch <<= 6; /* 6 bits of data in each subsequent byte */ - ch |= (krb5_ucs2)(utf8str[i] & 0x3f); - } - - if (ucs2str != NULL) { -#ifdef K5_BE -#ifndef SWAP16 -#define SWAP16(X) ((((X) << 8) | ((X) >> 8)) & 0xFFFF) -#endif - if (little_endian) - ucs2str[ucs2len] = SWAP16(ch); - else -#endif - ucs2str[ucs2len] = ch; - } - - utf8str += utflen; /* Move to next UTF-8 character */ - ucs2len++; /* Count number of wide chars stored/required */ - } - - if (ucs2str != NULL && ucs2len < count) { - /* Add null terminator if there's room in the buffer. */ - ucs2str[ucs2len] = 0; - } - - return ucs2len; -} - -int -krb5int_utf8s_to_ucs2s(const char *utf8s, - krb5_ucs2 **ucs2s, - size_t *ucs2chars) -{ - ssize_t len; - size_t chars; +/* A high surrogate is ten bits masked with 0xD800. */ +#define IS_HIGH_SURROGATE(c) ((c) >= 0xD800 && (c) <= 0xDBFF) - chars = krb5int_utf8_chars(utf8s); - *ucs2s = (krb5_ucs2 *)malloc((chars + 1) * sizeof(krb5_ucs2)); - if (*ucs2s == NULL) { - return ENOMEM; - } +/* A low surrogate is ten bits masked with 0xDC00. */ +#define IS_LOW_SURROGATE(c) ((c) >= 0xDC00 && (c) <= 0xDFFF) - len = k5_utf8s_to_ucs2s(*ucs2s, utf8s, chars + 1, 0); - if (len < 0) { - free(*ucs2s); - *ucs2s = NULL; - return EINVAL; - } +/* A valid Unicode code point is in the range 0..10FFFF and is not a surrogate + * value. 
*/ +#define IS_SURROGATE(c) ((c) >= 0xD800 && (c) <= 0xDFFF) +#define IS_VALID_UNICODE(c) ((c) <= 0x10FFFF && !IS_SURROGATE(c)) - if (ucs2chars != NULL) { - *ucs2chars = chars; - } +/* A Basic Multilingual Plane character is in the range 0..FFFF and is not a + * surrogate value. */ +#define IS_BMP(c) ((c) <= 0xFFFF && !IS_SURROGATE(c)) - return 0; -} +/* Characters in the Supplementary Planes have a base value subtracted from + * their code points to form a 20-bit value; ten bits go in each surrogate. */ +#define BASE 0x10000 +#define HIGH_SURROGATE(c) (0xD800 | (((c) - BASE) >> 10)) +#define LOW_SURROGATE(c) (0xDC00 | (((c) - BASE) & 0x3FF)) +#define COMPOSE(c1, c2) (BASE + ((((c1) & 0x3FF) << 10) | ((c2) & 0x3FF))) int -krb5int_utf8cs_to_ucs2s(const char *utf8s, - size_t utf8slen, - krb5_ucs2 **ucs2s, - size_t *ucs2chars) +k5_utf8_to_utf16le(const char *utf8, uint8_t **utf16_out, size_t *nbytes_out) { - ssize_t len; - size_t chars; - - chars = krb5int_utf8c_chars(utf8s, utf8slen); - *ucs2s = (krb5_ucs2 *)malloc((chars + 1) * sizeof(krb5_ucs2)); - if (*ucs2s == NULL) { - return ENOMEM; - } - - len = k5_utf8s_to_ucs2s(*ucs2s, utf8s, chars, 0); - if (len < 0) { - free(*ucs2s); - *ucs2s = NULL; - return EINVAL; - } - (*ucs2s)[chars] = 0; - - if (ucs2chars != NULL) { - *ucs2chars = chars; - } - - return 0; -} - -int -krb5int_utf8s_to_ucs2les(const char *utf8s, - unsigned char **ucs2les, - size_t *ucs2leslen) -{ - ssize_t len; - size_t chars; - - chars = krb5int_utf8_chars(utf8s); - - *ucs2les = (unsigned char *)malloc((chars + 1) * sizeof(krb5_ucs2)); - if (*ucs2les == NULL) { - return ENOMEM; - } - - len = k5_utf8s_to_ucs2s((krb5_ucs2 *)*ucs2les, utf8s, chars + 1, 1); - if (len < 0) { - free(*ucs2les); - *ucs2les = NULL; - return EINVAL; - } - - if (ucs2leslen != NULL) { - *ucs2leslen = chars * sizeof(krb5_ucs2); - } - - return 0; -} - -int -krb5int_utf8cs_to_ucs2les(const char *utf8s, - size_t utf8slen, - unsigned char **ucs2les, - size_t *ucs2leslen) -{ - ssize_t 
len; - size_t chars; - krb5_ucs2 *ucs2s; - - *ucs2les = NULL; - - chars = krb5int_utf8c_chars(utf8s, utf8slen); - ucs2s = malloc((chars + 1) * sizeof(krb5_ucs2)); - if (ucs2s == NULL) - return ENOMEM; - - len = k5_utf8s_to_ucs2s(ucs2s, utf8s, chars, 1); - if (len < 0) { - free(ucs2s); - return EINVAL; - } - ucs2s[chars] = 0; - - *ucs2les = (unsigned char *)ucs2s; - if (ucs2leslen != NULL) { - *ucs2leslen = chars * sizeof(krb5_ucs2); - } + struct k5buf buf; + krb5_ucs4 ch; + size_t chlen, i; + uint8_t *p; - return 0; -} + *utf16_out = NULL; + *nbytes_out = 0; -/*----------------------------------------------------------------------------- - Convert a wide char string to a UTF-8 string. - No more than 'count' bytes will be written to the output buffer. - Return the # of bytes written to the output buffer, excl null terminator. + k5_buf_init_dynamic(&buf); - ucs2len is -1 if the UCS-2 string is NUL terminated, otherwise it is the - length of the UCS-2 string in characters -*/ -static ssize_t -k5_ucs2s_to_utf8s(char *utf8str, const krb5_ucs2 *ucs2str, - size_t count, ssize_t ucs2len, int little_endian) -{ - int len = 0; - int n; - char *p = utf8str; - krb5_ucs2 empty = 0, ch; + /* Examine next UTF-8 character. */ + while (*utf8 != '\0') { + /* Get UTF-8 sequence length from first byte. */ + chlen = KRB5_UTF8_CHARLEN2(utf8, chlen); + if (chlen == 0) + goto invalid; - if (ucs2str == NULL) /* Treat input ptr NULL as an empty string */ - ucs2str = &empty; + /* First byte minus length tag */ + ch = (krb5_ucs4)(utf8[0] & mask[chlen]); - if (utf8str == NULL) /* Just compute size of output, excl null */ - { - while (ucs2len == -1 ? *ucs2str : --ucs2len >= 0) { - /* Get UTF-8 size of next wide char */ - ch = *ucs2str++; -#ifdef K5_BE - if (little_endian) - ch = SWAP16(ch); -#endif + for (i = 1; i < chlen; i++) { + /* Subsequent bytes must start with 10. 
*/ + if ((utf8[i] & 0xc0) != 0x80) + goto invalid; - n = krb5int_ucs2_to_utf8(ch, NULL); - if (n < 1 || n > INT_MAX - len) - return -1; - len += n; + /* 6 bits of data in each subsequent byte */ + ch <<= 6; + ch |= (krb5_ucs4)(utf8[i] & 0x3f); + } + if (!IS_VALID_UNICODE(ch)) + goto invalid; + + /* Characters in the basic multilingual plane are encoded using two + * bytes; other characters are encoded using four bytes. */ + p = k5_buf_get_space(&buf, IS_BMP(ch) ? 2 : 4); + if (p == NULL) + return ENOMEM; + if (IS_BMP(ch)) { + store_16_le(ch, p); + } else { + /* 0x10000 is subtracted from ch; then the high ten bits plus + * 0xD800 and the low ten bits plus 0xDC00 are the surrogates. */ + store_16_le(HIGH_SURROGATE(ch), p); + store_16_le(LOW_SURROGATE(ch), p + 2); } - return len; - } - - /* Do the actual conversion. */ - - n = 1; /* In case of empty ucs2str */ - while (ucs2len == -1 ? *ucs2str != 0 : --ucs2len >= 0) { - ch = *ucs2str++; -#ifdef K5_BE - if (little_endian) - ch = SWAP16(ch); -#endif - - n = krb5int_ucs2_to_utf8(ch, p); - - if (n < 1) - break; - - p += n; - count -= n; /* Space left in output buffer */ - } - - /* If not enough room for last character, pad remainder with null - so that return value = original count, indicating buffer full. */ - if (n == 0) { - while (count--) - *p++ = 0; - } - /* Add a null terminator if there's room. */ - else if (count) - *p = 0; - - if (n == -1) /* Conversion encountered invalid wide char. */ - return -1; - - /* Return the number of bytes written to output buffer, excl null. 
*/ - return (p - utf8str); -} - -int -krb5int_ucs2s_to_utf8s(const krb5_ucs2 *ucs2s, - char **utf8s, - size_t *utf8slen) -{ - ssize_t len; - - len = k5_ucs2s_to_utf8s(NULL, ucs2s, 0, -1, 0); - if (len < 0) { - return EINVAL; - } - - *utf8s = (char *)malloc((size_t)len + 1); - if (*utf8s == NULL) { - return ENOMEM; - } - - len = k5_ucs2s_to_utf8s(*utf8s, ucs2s, (size_t)len + 1, -1, 0); - if (len < 0) { - free(*utf8s); - *utf8s = NULL; - return EINVAL; - } - - if (utf8slen != NULL) { - *utf8slen = len; + /* Move to next UTF-8 character. */ + utf8 += chlen; } + *utf16_out = buf.data; + *nbytes_out = buf.len; return 0; -} -int -krb5int_ucs2les_to_utf8s(const unsigned char *ucs2les, - char **utf8s, - size_t *utf8slen) -{ - ssize_t len; - - len = k5_ucs2s_to_utf8s(NULL, (krb5_ucs2 *)ucs2les, 0, -1, 1); - if (len < 0) - return EINVAL; - - *utf8s = (char *)malloc((size_t)len + 1); - if (*utf8s == NULL) { - return ENOMEM; - } - - len = k5_ucs2s_to_utf8s(*utf8s, (krb5_ucs2 *)ucs2les, (size_t)len + 1, -1, 1); - if (len < 0) { - free(*utf8s); - *utf8s = NULL; - return EINVAL; - } - - if (utf8slen != NULL) { - *utf8slen = len; - } - - return 0; +invalid: + k5_buf_free(&buf); + return EINVAL; } int -krb5int_ucs2cs_to_utf8s(const krb5_ucs2 *ucs2s, - size_t ucs2slen, - char **utf8s, - size_t *utf8slen) +k5_utf16le_to_utf8(const uint8_t *utf16bytes, size_t nbytes, char **utf8_out) { - ssize_t len; + struct k5buf buf; + struct k5input in; + uint16_t ch1, ch2; + krb5_ucs4 ch; + size_t chlen; + void *p; - if (ucs2slen > SSIZE_MAX) - return ERANGE; + *utf8_out = NULL; - len = k5_ucs2s_to_utf8s(NULL, (krb5_ucs2 *)ucs2s, 0, - (ssize_t)ucs2slen, 0); - if (len < 0) + if (nbytes % 2 != 0) return EINVAL; - *utf8s = (char *)malloc((size_t)len + 1); - if (*utf8s == NULL) { - return ENOMEM; - } + k5_buf_init_dynamic(&buf); + k5_input_init(&in, utf16bytes, nbytes); + while (!in.status && in.len > 0) { + /* Get the next character or high surrogate. 
A low surrogate without a + * preceding high surrogate is invalid. */ + ch1 = k5_input_get_uint16_le(&in); + if (IS_LOW_SURROGATE(ch1)) + goto invalid; + if (IS_HIGH_SURROGATE(ch1)) { + /* Get the low surrogate and combine the pair. */ + ch2 = k5_input_get_uint16_le(&in); + if (!IS_LOW_SURROGATE(ch2)) + goto invalid; + ch = COMPOSE(ch1, ch2); + } else { + ch = ch1; + } - len = k5_ucs2s_to_utf8s(*utf8s, (krb5_ucs2 *)ucs2s, (size_t)len, - (ssize_t)ucs2slen, 0); - if (len < 0) { - free(*utf8s); - *utf8s = NULL; - return EINVAL; + chlen = krb5int_ucs4_to_utf8(ch, NULL); + p = k5_buf_get_space(&buf, chlen); + if (p == NULL) + return ENOMEM; + (void)krb5int_ucs4_to_utf8(ch, p); } - (*utf8s)[len] = '\0'; - if (utf8slen != NULL) { - *utf8slen = len; - } + if (in.status) + goto invalid; + *utf8_out = buf.data; return 0; -} - -int -krb5int_ucs2lecs_to_utf8s(const unsigned char *ucs2les, - size_t ucs2leslen, - char **utf8s, - size_t *utf8slen) -{ - ssize_t len; - if (ucs2leslen > SSIZE_MAX) - return ERANGE; - - len = k5_ucs2s_to_utf8s(NULL, (krb5_ucs2 *)ucs2les, 0, - (ssize_t)ucs2leslen, 1); - if (len < 0) - return EINVAL; - - *utf8s = (char *)malloc((size_t)len + 1); - if (*utf8s == NULL) { - return ENOMEM; - } - - len = k5_ucs2s_to_utf8s(*utf8s, (krb5_ucs2 *)ucs2les, (size_t)len, - (ssize_t)ucs2leslen, 1); - if (len < 0) { - free(*utf8s); - *utf8s = NULL; - return EINVAL; - } - (*utf8s)[len] = '\0'; - - if (utf8slen != NULL) { - *utf8slen = len; - } - - return 0; +invalid: + k5_buf_free(&buf); + return EINVAL; } |