diff options
author | Cy Schubert <cy@FreeBSD.org> | 2018-04-03 19:36:00 +0000 |
---|---|---|
committer | Cy Schubert <cy@FreeBSD.org> | 2018-04-03 19:36:00 +0000 |
commit | b0e4d68d5124581ae353493d69bea352de4cff8a (patch) | |
tree | 43300ec43e83eccd367fd76fdfdefba2dcd7d8f4 /src/util/support | |
parent | 33a9b234e7087f573ef08cd7318c6497ba08b439 (diff) |
Notes
Diffstat (limited to 'src/util/support')
-rw-r--r-- | src/util/support/Makefile.in | 10 | ||||
-rw-r--r-- | src/util/support/cache-addrinfo.h | 12 | ||||
-rw-r--r-- | src/util/support/deps | 6 | ||||
-rw-r--r-- | src/util/support/fake-addrinfo.c | 16 | ||||
-rw-r--r-- | src/util/support/gmt_mktime.c | 17 | ||||
-rw-r--r-- | src/util/support/libkrb5support-fixed.exports | 5 | ||||
-rw-r--r-- | src/util/support/plugins.c | 5 | ||||
-rw-r--r-- | src/util/support/t_utf16.c | 117 | ||||
-rw-r--r-- | src/util/support/threads.c | 6 | ||||
-rw-r--r-- | src/util/support/utf8.c | 2 | ||||
-rw-r--r-- | src/util/support/utf8_conv.c | 475 |
11 files changed, 263 insertions, 408 deletions
diff --git a/src/util/support/Makefile.in b/src/util/support/Makefile.in index 6239e41761ee..0bf0b7a87277 100644 --- a/src/util/support/Makefile.in +++ b/src/util/support/Makefile.in @@ -143,6 +143,7 @@ SRCS=\ $(srcdir)/bcmp.c \ $(srcdir)/strerror_r.c \ $(srcdir)/t_utf8.c \ + $(srcdir)/t_utf16.c \ $(srcdir)/getopt.c \ $(srcdir)/getopt_long.c @@ -220,7 +221,12 @@ t_unal: t_unal.o t_utf8: t_utf8.o utf8.o $(CC_LINK) -o t_utf8 t_utf8.o utf8.o -TEST_PROGS= t_k5buf t_path t_path_win t_base64 t_json t_unal t_utf8 +T_UTF16_OBJS= t_utf16.o utf8_conv.o utf8.o k5buf.o $(PRINTF_ST_OBJ) + +t_utf16: $(T_UTF16_OBJS) + $(CC_LINK) -o $@ $(T_UTF16_OBJS) + +TEST_PROGS= t_k5buf t_path t_path_win t_base64 t_json t_unal t_utf8 t_utf16 check-unix: $(TEST_PROGS) ./t_k5buf @@ -230,11 +236,13 @@ check-unix: $(TEST_PROGS) ./t_json ./t_unal ./t_utf8 + ./t_utf16 clean: $(RM) t_k5buf.o t_k5buf t_unal.o t_unal path_win.o path_win $(RM) t_path_win.o t_path_win t_path.o t_path t_base64.o t_base64 $(RM) t_json.o t_json libkrb5support.exports t_utf8.o t_utf8 + $(RM) t_utf16.o t_utf16 @lib_frag@ @libobj_frag@ diff --git a/src/util/support/cache-addrinfo.h b/src/util/support/cache-addrinfo.h index a1b7fb28becb..40752ab5f4a7 100644 --- a/src/util/support/cache-addrinfo.h +++ b/src/util/support/cache-addrinfo.h @@ -52,12 +52,12 @@ * the data structures and flag values locally. * * - * On Mac OS X, getaddrinfo results aren't cached (though - * gethostbyname results are), so we need to build a cache here. Now - * things are getting really messy. Because the cache is in use, we - * use getservbyname, and throw away thread safety. (Not that the - * cache is thread safe, but when we get locking support, that'll be - * dealt with.) This code needs tearing down and rebuilding, soon. + * On macOS, getaddrinfo results aren't cached (though gethostbyname + * results are), so we need to build a cache here. Now things are + * getting really messy. 
Because the cache is in use, we use + * getservbyname, and throw away thread safety. (Not that the cache + * is thread safe, but when we get locking support, that'll be dealt + * with.) This code needs tearing down and rebuilding, soon. * * * Note that recent Windows developers' code has an interesting hack: diff --git a/src/util/support/deps b/src/util/support/deps index 4dff014f463b..34d8a884b330 100644 --- a/src/util/support/deps +++ b/src/util/support/deps @@ -33,7 +33,8 @@ utf8.so utf8.po $(OUTPRE)utf8.$(OBJEXT): $(BUILDTOP)/include/autoconf.h \ $(top_srcdir)/include/k5-platform.h $(top_srcdir)/include/k5-thread.h \ $(top_srcdir)/include/k5-utf8.h supp-int.h utf8.c utf8_conv.so utf8_conv.po $(OUTPRE)utf8_conv.$(OBJEXT): \ - $(BUILDTOP)/include/autoconf.h $(top_srcdir)/include/k5-platform.h \ + $(BUILDTOP)/include/autoconf.h $(top_srcdir)/include/k5-buf.h \ + $(top_srcdir)/include/k5-input.h $(top_srcdir)/include/k5-platform.h \ $(top_srcdir)/include/k5-thread.h $(top_srcdir)/include/k5-utf8.h \ supp-int.h utf8_conv.c gettimeofday.so gettimeofday.po $(OUTPRE)gettimeofday.$(OBJEXT): \ @@ -84,6 +85,9 @@ strerror_r.so strerror_r.po $(OUTPRE)strerror_r.$(OBJEXT): \ t_utf8.so t_utf8.po $(OUTPRE)t_utf8.$(OBJEXT): $(BUILDTOP)/include/autoconf.h \ $(top_srcdir)/include/k5-platform.h $(top_srcdir)/include/k5-thread.h \ $(top_srcdir)/include/k5-utf8.h t_utf8.c +t_utf16.so t_utf16.po $(OUTPRE)t_utf16.$(OBJEXT): $(BUILDTOP)/include/autoconf.h \ + $(top_srcdir)/include/k5-platform.h $(top_srcdir)/include/k5-thread.h \ + $(top_srcdir)/include/k5-utf8.h t_utf16.c getopt.so getopt.po $(OUTPRE)getopt.$(OBJEXT): $(BUILDTOP)/include/autoconf.h \ $(top_srcdir)/include/k5-platform.h $(top_srcdir)/include/k5-thread.h \ getopt.c diff --git a/src/util/support/fake-addrinfo.c b/src/util/support/fake-addrinfo.c index df1cc1dec558..3ee162e0d28b 100644 --- a/src/util/support/fake-addrinfo.c +++ b/src/util/support/fake-addrinfo.c @@ -52,7 +52,7 @@ * the data structures and flag values 
locally. * * - * On Mac OS X, getaddrinfo results aren't cached (though + * On macOS, getaddrinfo results aren't cached (though * gethostbyname results are), so we need to build a cache here. Now * things are getting really messy. Because the cache is in use, we * use getservbyname, and throw away thread safety. (Not that the @@ -331,18 +331,6 @@ system_freeaddrinfo (struct addrinfo *ai) freeaddrinfo(ai); } -/* Note: Implementations written to RFC 2133 use size_t, while RFC - 2553 implementations use socklen_t, for the second parameter. - - Mac OS X (10.2) and AIX 4.3.3 appear to be in the RFC 2133 camp, - but we don't have an autoconf test for that right now. */ -static inline int -system_getnameinfo (const struct sockaddr *sa, socklen_t salen, - char *host, size_t hostlen, char *serv, size_t servlen, - int flags) -{ - return getnameinfo(sa, salen, host, hostlen, serv, servlen, flags); -} #endif #if !defined (HAVE_GETADDRINFO) || defined(WRAP_GETADDRINFO) || defined(FAI_CACHE) @@ -697,7 +685,7 @@ static inline int fai_add_hosts_by_name (const char *name, sometimes associates it with the specified service, sometimes not. - But on Mac OS X (10.3, 10.4) they've "extended" getaddrinfo + But on macOS (10.3, 10.4) they've "extended" getaddrinfo to make SRV RR queries. (Please, somebody, show me something in the specs that actually supports this? 
RFC 3493 says nothing about it, but it does say getaddrinfo is diff --git a/src/util/support/gmt_mktime.c b/src/util/support/gmt_mktime.c index 32fef4386cd4..ac7752fefed0 100644 --- a/src/util/support/gmt_mktime.c +++ b/src/util/support/gmt_mktime.c @@ -78,21 +78,20 @@ static const int days_in_month[12] = { static time_t gmt_mktime(struct tm *t) { - time_t accum; + uint32_t accum; #define assert_time(cnd) if(!(cnd)) return (time_t) -1 /* - * For 32-bit signed time_t centered on 1/1/1970, the range is: - * time 0x80000000 -> Fri Dec 13 16:45:52 1901 - * time 0x7fffffff -> Mon Jan 18 22:14:07 2038 + * For 32-bit unsigned time values starting on 1/1/1970, the range is: + * time 0x00000000 -> Thu Jan 1 00:00:00 1970 + * time 0xffffffff -> Sun Feb 7 06:28:15 2106 * - * So years 1901 and 2038 are allowable, but we can't encode all - * dates in those years, and we're not doing overflow/underflow - * checking for such cases. + * We can't encode all dates in 2106, and we're not doing overflow checking + * for such cases. 
*/ - assert_time(t->tm_year>=1); - assert_time(t->tm_year<=138); + assert_time(t->tm_year>=70); + assert_time(t->tm_year<=206); assert_time(t->tm_mon>=0); assert_time(t->tm_mon<=11); diff --git a/src/util/support/libkrb5support-fixed.exports b/src/util/support/libkrb5support-fixed.exports index d5d4177b72dc..fd74a1897ebb 100644 --- a/src/util/support/libkrb5support-fixed.exports +++ b/src/util/support/libkrb5support-fixed.exports @@ -52,6 +52,8 @@ k5_path_isabs k5_path_join k5_path_split k5_strerror_r +k5_utf8_to_utf16le +k5_utf16le_to_utf8 krb5int_key_register krb5int_key_delete krb5int_getspecific @@ -77,9 +79,6 @@ krb5int_mutex_free krb5int_mutex_lock krb5int_mutex_unlock krb5int_gmt_mktime -krb5int_utf8cs_to_ucs2les -krb5int_utf8s_to_ucs2les -krb5int_ucs2lecs_to_utf8s krb5int_ucs4_to_utf8 krb5int_utf8_to_ucs4 krb5int_utf8_lentab diff --git a/src/util/support/plugins.c b/src/util/support/plugins.c index b0bb2ada8755..47368be9d49b 100644 --- a/src/util/support/plugins.c +++ b/src/util/support/plugins.c @@ -592,9 +592,10 @@ krb5int_open_plugin_dirs (const char * const *dirnames, } } - if (krb5int_open_plugin (filepath, &handle, ep) == 0) { + if (!err && krb5int_open_plugin(filepath, &handle, ep) == 0) { err = krb5int_plugin_file_handle_array_add (&h, &count, handle); - if (!err) { handle = NULL; } /* h takes ownership */ + if (!err) + handle = NULL; /* h takes ownership */ } free(filepath); diff --git a/src/util/support/t_utf16.c b/src/util/support/t_utf16.c new file mode 100644 index 000000000000..bc3390a415cd --- /dev/null +++ b/src/util/support/t_utf16.c @@ -0,0 +1,117 @@ +/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +/* util/support/t_utf16.c - test UTF-16 conversion functions */ +/* + * Copyright (C) 2017 by the Massachusetts Institute of Technology. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * This program tests conversions between UTF-8 and little-endian UTF-16, with + * an eye mainly towards covering UTF-16 edge cases and UTF-8 decoding results + * which we detect as invalid in utf8_conv.c. t_utf8.c covers more UTF-8 edge + * cases. 
+ */ + +#include <stdio.h> +#include <string.h> + +#include "k5-platform.h" +#include "k5-utf8.h" + +struct test { + const char *utf8; + const char *utf16; + size_t utf16len; +} tests[] = { + { "", "", 0 }, + { "abcd", "a\0b\0c\0d\0", 8 }, + /* From RFC 2781 (tests code point 0x12345 and some ASCII) */ + { "\xF0\x92\x8D\x85=Ra", "\x08\xD8\x45\xDF=\0R\0a\0", 10 }, + /* Lowest and highest Supplementary Plane code points */ + { "\xF0\x90\x80\x80 \xF4\x8F\xBF\xBF", + "\x00\xD8\x00\xDC \0\xFF\xDB\xFF\xDF", 10 }, + /* Basic Multilingual Plane code points near and above surrogate range */ + { "\xED\x9F\xBF", "\xFF\xD7", 2 }, + { "\xEE\x80\x80 \xEE\xBF\xBF", "\x00\xE0 \0\xFF\xEF", 6 }, + /* Invalid UTF-8: decodes to value in surrogate pair range */ + { "\xED\xA0\x80", NULL, 0 }, /* 0xD800 */ + { "\xED\xAF\xBF", NULL, 0 }, /* 0xDBFF */ + { "\xED\xB0\x80", NULL, 0 }, /* 0xDC00 */ + { "\xED\xBF\xBF", NULL, 0 }, /* 0xDFFF */ + /* Invalid UTF-8: decodes to value above Unicode range */ + { "\xF4\x90\x80\x80", NULL, 0 }, + { "\xF4\xBF\xBF\xBF", NULL, 0 }, + { "\xF5\x80\x80\x80", NULL, 0 }, /* thrown out early due to first byte */ + /* Invalid UTF-16: odd numbers of UTF-16 bytes */ + { NULL, "\x00", 1 }, + { NULL, "\x01\x00\x02", 3 }, + /* Invalid UTF-16: high surrogate without a following low surrogate */ + { NULL, "\x00\xD8\x00\x00", 4 }, + { NULL, "\x00\xD8\xFF\xDB", 4 }, + { NULL, "\xFF\xDB", 2 }, + /* Invalid UTF-16: low surrogate without a preceding high surrogate */ + { NULL, "\x61\x00\x00\xDC", 4 }, + { NULL, "\xFF\xDF\xFF\xDB", 4 }, +}; + +int +main(int argc, char **argv) +{ + int ret; + struct test *t; + size_t i, utf16len; + uint8_t *utf16; + char *utf8; + + for (i = 0; i < sizeof(tests) / sizeof(*tests); i++) { + t = &tests[i]; + if (t->utf8 != NULL) { + ret = k5_utf8_to_utf16le(t->utf8, &utf16, &utf16len); + if (t->utf16 == NULL) { + assert(ret == EINVAL); + } else { + assert(ret == 0); + assert(t->utf16len == utf16len); + assert(memcmp(t->utf16, utf16, utf16len) == 
0); + free(utf16); + } + } + + if (t->utf16 != NULL) { + ret = k5_utf16le_to_utf8((uint8_t *)t->utf16, t->utf16len, &utf8); + if (t->utf8 == NULL) { + assert(ret == EINVAL); + } else { + assert(ret == 0); + assert(strcmp(t->utf8, utf8) == 0); + free(utf8); + } + } + } + return 0; +} diff --git a/src/util/support/threads.c b/src/util/support/threads.c index bb8e287ecf75..be7e4c2e3f92 100644 --- a/src/util/support/threads.c +++ b/src/util/support/threads.c @@ -237,7 +237,6 @@ void *k5_getspecific (k5_key_t keynum) if (err) return NULL; - assert(keynum >= 0 && keynum < K5_KEY_MAX); assert(destructors_set[keynum] == 1); #ifndef ENABLE_THREADS @@ -271,7 +270,6 @@ int k5_setspecific (k5_key_t keynum, void *value) if (err) return err; - assert(keynum >= 0 && keynum < K5_KEY_MAX); assert(destructors_set[keynum] == 1); #ifndef ENABLE_THREADS @@ -334,8 +332,6 @@ int k5_key_register (k5_key_t keynum, void (*destructor)(void *)) if (err) return err; - assert(keynum >= 0 && keynum < K5_KEY_MAX); - #ifndef ENABLE_THREADS assert(destructors_set[keynum] == 0); @@ -365,8 +361,6 @@ int k5_key_register (k5_key_t keynum, void (*destructor)(void *)) int k5_key_delete (k5_key_t keynum) { - assert(keynum >= 0 && keynum < K5_KEY_MAX); - #ifndef ENABLE_THREADS assert(destructors_set[keynum] == 1); diff --git a/src/util/support/utf8.c b/src/util/support/utf8.c index e42c0c7dc82b..34e2b6adb059 100644 --- a/src/util/support/utf8.c +++ b/src/util/support/utf8.c @@ -205,7 +205,7 @@ int krb5int_utf8_to_ucs2(const char *p, krb5_ucs2 *out) return 0; } -/* conv UCS-2 to UTF-8, not used */ +/* conv UCS-4 to UTF-8 */ size_t krb5int_ucs4_to_utf8(krb5_ucs4 c, char *buf) { size_t len = 0; diff --git a/src/util/support/utf8_conv.c b/src/util/support/utf8_conv.c index 80ca90b139e7..5cfc2c512b86 100644 --- a/src/util/support/utf8_conv.c +++ b/src/util/support/utf8_conv.c @@ -1,7 +1,7 @@ /* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil -*- */ /* util/support/utf8_conv.c */ /* - * Copyright 2008 by 
the Massachusetts Institute of Technology. + * Copyright 2008, 2017 by the Massachusetts Institute of Technology. * All Rights Reserved. * * Export of this software from the United States of America may @@ -47,411 +47,156 @@ * THE PERPETRATOR TO CRIMINAL AND CIVIL LIABILITY. */ -/* This work is part of OpenLDAP Software <http://www.openldap.org/>. */ +/* This work is based on OpenLDAP Software <http://www.openldap.org/>. */ /* - * UTF-8 Conversion Routines - * - * These routines convert between Wide Character and UTF-8, - * or between MultiByte and UTF-8 encodings. - * - * Both single character and string versions of the functions are provided. - * All functions return -1 if the character or string cannot be converted. + * These routines convert between UTF-16 and UTF-8. UTF-16 encodes a Unicode + * character in either two or four bytes. Characters in the Basic Multilingual + * Plane (hex 0..D7FF and E000..FFFF) are encoded as-is in two bytes. + * Characters in the Supplementary Planes (10000..10FFFF) are split into a high + * surrogate and a low surrogate, each containing ten bits of the character + * value, and encoded in four bytes. */ #include "k5-platform.h" #include "k5-utf8.h" +#include "k5-buf.h" +#include "k5-input.h" #include "supp-int.h" static unsigned char mask[] = { 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 }; -static ssize_t -k5_utf8s_to_ucs2s(krb5_ucs2 *ucs2str, - const char *utf8str, - size_t count, - int little_endian) -{ - size_t ucs2len = 0; - size_t utflen, i; - krb5_ucs2 ch; - - /* If input ptr is NULL or empty... */ - if (utf8str == NULL || *utf8str == '\0') { - if (ucs2str != NULL) - *ucs2str = 0; - - return 0; - } - - /* Examine next UTF-8 character. 
*/ - while (ucs2len < count && *utf8str != '\0') { - /* Get UTF-8 sequence length from 1st byte */ - utflen = KRB5_UTF8_CHARLEN2(utf8str, utflen); - - if (utflen == 0 || utflen > KRB5_MAX_UTF8_LEN) - return -1; - - /* First byte minus length tag */ - ch = (krb5_ucs2)(utf8str[0] & mask[utflen]); - - for (i = 1; i < utflen; i++) { - /* Subsequent bytes must start with 10 */ - if ((utf8str[i] & 0xc0) != 0x80) - return -1; - - ch <<= 6; /* 6 bits of data in each subsequent byte */ - ch |= (krb5_ucs2)(utf8str[i] & 0x3f); - } - - if (ucs2str != NULL) { -#ifdef K5_BE -#ifndef SWAP16 -#define SWAP16(X) ((((X) << 8) | ((X) >> 8)) & 0xFFFF) -#endif - if (little_endian) - ucs2str[ucs2len] = SWAP16(ch); - else -#endif - ucs2str[ucs2len] = ch; - } - - utf8str += utflen; /* Move to next UTF-8 character */ - ucs2len++; /* Count number of wide chars stored/required */ - } - - if (ucs2str != NULL && ucs2len < count) { - /* Add null terminator if there's room in the buffer. */ - ucs2str[ucs2len] = 0; - } - - return ucs2len; -} - -int -krb5int_utf8s_to_ucs2s(const char *utf8s, - krb5_ucs2 **ucs2s, - size_t *ucs2chars) -{ - ssize_t len; - size_t chars; +/* A high surrogate is ten bits masked with 0xD800. */ +#define IS_HIGH_SURROGATE(c) ((c) >= 0xD800 && (c) <= 0xDBFF) - chars = krb5int_utf8_chars(utf8s); - *ucs2s = (krb5_ucs2 *)malloc((chars + 1) * sizeof(krb5_ucs2)); - if (*ucs2s == NULL) { - return ENOMEM; - } +/* A low surrogate is ten bits masked with 0xDC00. */ +#define IS_LOW_SURROGATE(c) ((c) >= 0xDC00 && (c) <= 0xDFFF) - len = k5_utf8s_to_ucs2s(*ucs2s, utf8s, chars + 1, 0); - if (len < 0) { - free(*ucs2s); - *ucs2s = NULL; - return EINVAL; - } +/* A valid Unicode code point is in the range 0..10FFFF and is not a surrogate + * value. 
*/ +#define IS_SURROGATE(c) ((c) >= 0xD800 && (c) <= 0xDFFF) +#define IS_VALID_UNICODE(c) ((c) <= 0x10FFFF && !IS_SURROGATE(c)) - if (ucs2chars != NULL) { - *ucs2chars = chars; - } +/* A Basic Multilingual Plane character is in the range 0..FFFF and is not a + * surrogate value. */ +#define IS_BMP(c) ((c) <= 0xFFFF && !IS_SURROGATE(c)) - return 0; -} +/* Characters in the Supplementary Planes have a base value subtracted from + * their code points to form a 20-bit value; ten bits go in each surrogate. */ +#define BASE 0x10000 +#define HIGH_SURROGATE(c) (0xD800 | (((c) - BASE) >> 10)) +#define LOW_SURROGATE(c) (0xDC00 | (((c) - BASE) & 0x3FF)) +#define COMPOSE(c1, c2) (BASE + ((((c1) & 0x3FF) << 10) | ((c2) & 0x3FF))) int -krb5int_utf8cs_to_ucs2s(const char *utf8s, - size_t utf8slen, - krb5_ucs2 **ucs2s, - size_t *ucs2chars) +k5_utf8_to_utf16le(const char *utf8, uint8_t **utf16_out, size_t *nbytes_out) { - ssize_t len; - size_t chars; - - chars = krb5int_utf8c_chars(utf8s, utf8slen); - *ucs2s = (krb5_ucs2 *)malloc((chars + 1) * sizeof(krb5_ucs2)); - if (*ucs2s == NULL) { - return ENOMEM; - } - - len = k5_utf8s_to_ucs2s(*ucs2s, utf8s, chars, 0); - if (len < 0) { - free(*ucs2s); - *ucs2s = NULL; - return EINVAL; - } - (*ucs2s)[chars] = 0; - - if (ucs2chars != NULL) { - *ucs2chars = chars; - } - - return 0; -} - -int -krb5int_utf8s_to_ucs2les(const char *utf8s, - unsigned char **ucs2les, - size_t *ucs2leslen) -{ - ssize_t len; - size_t chars; - - chars = krb5int_utf8_chars(utf8s); - - *ucs2les = (unsigned char *)malloc((chars + 1) * sizeof(krb5_ucs2)); - if (*ucs2les == NULL) { - return ENOMEM; - } - - len = k5_utf8s_to_ucs2s((krb5_ucs2 *)*ucs2les, utf8s, chars + 1, 1); - if (len < 0) { - free(*ucs2les); - *ucs2les = NULL; - return EINVAL; - } - - if (ucs2leslen != NULL) { - *ucs2leslen = chars * sizeof(krb5_ucs2); - } - - return 0; -} - -int -krb5int_utf8cs_to_ucs2les(const char *utf8s, - size_t utf8slen, - unsigned char **ucs2les, - size_t *ucs2leslen) -{ - ssize_t 
len; - size_t chars; - krb5_ucs2 *ucs2s; - - *ucs2les = NULL; - - chars = krb5int_utf8c_chars(utf8s, utf8slen); - ucs2s = malloc((chars + 1) * sizeof(krb5_ucs2)); - if (ucs2s == NULL) - return ENOMEM; - - len = k5_utf8s_to_ucs2s(ucs2s, utf8s, chars, 1); - if (len < 0) { - free(ucs2s); - return EINVAL; - } - ucs2s[chars] = 0; - - *ucs2les = (unsigned char *)ucs2s; - if (ucs2leslen != NULL) { - *ucs2leslen = chars * sizeof(krb5_ucs2); - } + struct k5buf buf; + krb5_ucs4 ch; + size_t chlen, i; + uint8_t *p; - return 0; -} + *utf16_out = NULL; + *nbytes_out = 0; -/*----------------------------------------------------------------------------- - Convert a wide char string to a UTF-8 string. - No more than 'count' bytes will be written to the output buffer. - Return the # of bytes written to the output buffer, excl null terminator. + k5_buf_init_dynamic(&buf); - ucs2len is -1 if the UCS-2 string is NUL terminated, otherwise it is the - length of the UCS-2 string in characters -*/ -static ssize_t -k5_ucs2s_to_utf8s(char *utf8str, const krb5_ucs2 *ucs2str, - size_t count, ssize_t ucs2len, int little_endian) -{ - int len = 0; - int n; - char *p = utf8str; - krb5_ucs2 empty = 0, ch; + /* Examine next UTF-8 character. */ + while (*utf8 != '\0') { + /* Get UTF-8 sequence length from first byte. */ + chlen = KRB5_UTF8_CHARLEN2(utf8, chlen); + if (chlen == 0) + goto invalid; - if (ucs2str == NULL) /* Treat input ptr NULL as an empty string */ - ucs2str = &empty; + /* First byte minus length tag */ + ch = (krb5_ucs4)(utf8[0] & mask[chlen]); - if (utf8str == NULL) /* Just compute size of output, excl null */ - { - while (ucs2len == -1 ? *ucs2str : --ucs2len >= 0) { - /* Get UTF-8 size of next wide char */ - ch = *ucs2str++; -#ifdef K5_BE - if (little_endian) - ch = SWAP16(ch); -#endif + for (i = 1; i < chlen; i++) { + /* Subsequent bytes must start with 10. 
*/ + if ((utf8[i] & 0xc0) != 0x80) + goto invalid; - n = krb5int_ucs2_to_utf8(ch, NULL); - if (n < 1 || n > INT_MAX - len) - return -1; - len += n; + /* 6 bits of data in each subsequent byte */ + ch <<= 6; + ch |= (krb5_ucs4)(utf8[i] & 0x3f); + } + if (!IS_VALID_UNICODE(ch)) + goto invalid; + + /* Characters in the basic multilingual plane are encoded using two + * bytes; other characters are encoded using four bytes. */ + p = k5_buf_get_space(&buf, IS_BMP(ch) ? 2 : 4); + if (p == NULL) + return ENOMEM; + if (IS_BMP(ch)) { + store_16_le(ch, p); + } else { + /* 0x10000 is subtracted from ch; then the high ten bits plus + * 0xD800 and the low ten bits plus 0xDC00 are the surrogates. */ + store_16_le(HIGH_SURROGATE(ch), p); + store_16_le(LOW_SURROGATE(ch), p + 2); } - return len; - } - - /* Do the actual conversion. */ - - n = 1; /* In case of empty ucs2str */ - while (ucs2len == -1 ? *ucs2str != 0 : --ucs2len >= 0) { - ch = *ucs2str++; -#ifdef K5_BE - if (little_endian) - ch = SWAP16(ch); -#endif - - n = krb5int_ucs2_to_utf8(ch, p); - - if (n < 1) - break; - - p += n; - count -= n; /* Space left in output buffer */ - } - - /* If not enough room for last character, pad remainder with null - so that return value = original count, indicating buffer full. */ - if (n == 0) { - while (count--) - *p++ = 0; - } - /* Add a null terminator if there's room. */ - else if (count) - *p = 0; - - if (n == -1) /* Conversion encountered invalid wide char. */ - return -1; - - /* Return the number of bytes written to output buffer, excl null. 
*/ - return (p - utf8str); -} - -int -krb5int_ucs2s_to_utf8s(const krb5_ucs2 *ucs2s, - char **utf8s, - size_t *utf8slen) -{ - ssize_t len; - - len = k5_ucs2s_to_utf8s(NULL, ucs2s, 0, -1, 0); - if (len < 0) { - return EINVAL; - } - - *utf8s = (char *)malloc((size_t)len + 1); - if (*utf8s == NULL) { - return ENOMEM; - } - - len = k5_ucs2s_to_utf8s(*utf8s, ucs2s, (size_t)len + 1, -1, 0); - if (len < 0) { - free(*utf8s); - *utf8s = NULL; - return EINVAL; - } - - if (utf8slen != NULL) { - *utf8slen = len; + /* Move to next UTF-8 character. */ + utf8 += chlen; } + *utf16_out = buf.data; + *nbytes_out = buf.len; return 0; -} -int -krb5int_ucs2les_to_utf8s(const unsigned char *ucs2les, - char **utf8s, - size_t *utf8slen) -{ - ssize_t len; - - len = k5_ucs2s_to_utf8s(NULL, (krb5_ucs2 *)ucs2les, 0, -1, 1); - if (len < 0) - return EINVAL; - - *utf8s = (char *)malloc((size_t)len + 1); - if (*utf8s == NULL) { - return ENOMEM; - } - - len = k5_ucs2s_to_utf8s(*utf8s, (krb5_ucs2 *)ucs2les, (size_t)len + 1, -1, 1); - if (len < 0) { - free(*utf8s); - *utf8s = NULL; - return EINVAL; - } - - if (utf8slen != NULL) { - *utf8slen = len; - } - - return 0; +invalid: + k5_buf_free(&buf); + return EINVAL; } int -krb5int_ucs2cs_to_utf8s(const krb5_ucs2 *ucs2s, - size_t ucs2slen, - char **utf8s, - size_t *utf8slen) +k5_utf16le_to_utf8(const uint8_t *utf16bytes, size_t nbytes, char **utf8_out) { - ssize_t len; + struct k5buf buf; + struct k5input in; + uint16_t ch1, ch2; + krb5_ucs4 ch; + size_t chlen; + void *p; - if (ucs2slen > SSIZE_MAX) - return ERANGE; + *utf8_out = NULL; - len = k5_ucs2s_to_utf8s(NULL, (krb5_ucs2 *)ucs2s, 0, - (ssize_t)ucs2slen, 0); - if (len < 0) + if (nbytes % 2 != 0) return EINVAL; - *utf8s = (char *)malloc((size_t)len + 1); - if (*utf8s == NULL) { - return ENOMEM; - } + k5_buf_init_dynamic(&buf); + k5_input_init(&in, utf16bytes, nbytes); + while (!in.status && in.len > 0) { + /* Get the next character or high surrogate. 
A low surrogate without a + * preceding high surrogate is invalid. */ + ch1 = k5_input_get_uint16_le(&in); + if (IS_LOW_SURROGATE(ch1)) + goto invalid; + if (IS_HIGH_SURROGATE(ch1)) { + /* Get the low surrogate and combine the pair. */ + ch2 = k5_input_get_uint16_le(&in); + if (!IS_LOW_SURROGATE(ch2)) + goto invalid; + ch = COMPOSE(ch1, ch2); + } else { + ch = ch1; + } - len = k5_ucs2s_to_utf8s(*utf8s, (krb5_ucs2 *)ucs2s, (size_t)len, - (ssize_t)ucs2slen, 0); - if (len < 0) { - free(*utf8s); - *utf8s = NULL; - return EINVAL; + chlen = krb5int_ucs4_to_utf8(ch, NULL); + p = k5_buf_get_space(&buf, chlen); + if (p == NULL) + return ENOMEM; + (void)krb5int_ucs4_to_utf8(ch, p); } - (*utf8s)[len] = '\0'; - if (utf8slen != NULL) { - *utf8slen = len; - } + if (in.status) + goto invalid; + *utf8_out = buf.data; return 0; -} - -int -krb5int_ucs2lecs_to_utf8s(const unsigned char *ucs2les, - size_t ucs2leslen, - char **utf8s, - size_t *utf8slen) -{ - ssize_t len; - if (ucs2leslen > SSIZE_MAX) - return ERANGE; - - len = k5_ucs2s_to_utf8s(NULL, (krb5_ucs2 *)ucs2les, 0, - (ssize_t)ucs2leslen, 1); - if (len < 0) - return EINVAL; - - *utf8s = (char *)malloc((size_t)len + 1); - if (*utf8s == NULL) { - return ENOMEM; - } - - len = k5_ucs2s_to_utf8s(*utf8s, (krb5_ucs2 *)ucs2les, (size_t)len, - (ssize_t)ucs2leslen, 1); - if (len < 0) { - free(*utf8s); - *utf8s = NULL; - return EINVAL; - } - (*utf8s)[len] = '\0'; - - if (utf8slen != NULL) { - *utf8slen = len; - } - - return 0; +invalid: + k5_buf_free(&buf); + return EINVAL; } |