diff options
Diffstat (limited to 'lib/wind/normalize.c')
| -rw-r--r-- | lib/wind/normalize.c | 325 | 
1 files changed, 325 insertions, 0 deletions
| diff --git a/lib/wind/normalize.c b/lib/wind/normalize.c new file mode 100644 index 000000000000..15274f6855e1 --- /dev/null +++ b/lib/wind/normalize.c @@ -0,0 +1,325 @@ +/* + * Copyright (c) 2004 Kungliga Tekniska Högskolan + * (Royal Institute of Technology, Stockholm, Sweden). + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + *    notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + *    notice, this list of conditions and the following disclaimer in the + *    documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the Institute nor the names of its contributors + *    may be used to endorse or promote products derived from this software + *    without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE INSTITUTE AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED.  IN NO EVENT SHALL THE INSTITUTE OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifdef HAVE_CONFIG_H +#include <config.h> +#endif +#include "windlocl.h" + +#include <assert.h> +#include <stdlib.h> +#include <errno.h> +#include <stdio.h> + +#include "roken.h" + +#include "normalize_table.h" + +static int +translation_cmp(const void *key, const void *data) +{ +    const struct translation *t1 = (const struct translation *)key; +    const struct translation *t2 = (const struct translation *)data; + +    return t1->key - t2->key; +} + +enum { s_base  = 0xAC00}; +enum { s_count = 11172}; +enum { l_base  = 0x1100}; +enum { l_count = 19}; +enum { v_base  = 0x1161}; +enum { v_count = 21}; +enum { t_base  = 0x11A7}; +enum { t_count = 28}; +enum { n_count = v_count * t_count}; + +static int +hangul_decomp(const uint32_t *in, size_t in_len, +	      uint32_t *out, size_t *out_len) +{ +    uint32_t u = *in; +    unsigned s_index; +    unsigned l, v, t; +    unsigned o; + +    if (u < s_base || u >= s_base + s_count) +	return 0; +    s_index = u - s_base; +    l = l_base + s_index / n_count; +    v = v_base + (s_index % n_count) / t_count; +    t = t_base + s_index % t_count; +    o = 2; +    if (t != t_base) +	++o; +    if (*out_len < o) +	return WIND_ERR_OVERRUN; +    out[0] = l; +    out[1] = v; +    if (t != t_base) +	out[2] = t; +    *out_len = o; +    return 1; +} + +static uint32_t +hangul_composition(const uint32_t *in, size_t in_len) +{ +    if (in_len < 2) +	return 0; +    if (in[0] >= l_base && in[0] < l_base + l_count) { +	unsigned l_index = in[0] - l_base; +	unsigned v_index; + +	if (in[1] < v_base || in[1] >= v_base + v_count) +	    return 0; +	v_index = in[1] - v_base; +	return (l_index * v_count + v_index) * t_count + s_base; +    } else if (in[0] >= s_base && in[0] < s_base + s_count) { +	unsigned s_index = in[0] - s_base; +	unsigned t_index; + +	if (s_index % t_count != 0) +	    return 0; +	if (in[1] < t_base || in[1] >= t_base + t_count) +	    return 0; +	t_index = in[1] - t_base; +	return in[0] + t_index; +    } +    return 0; +} + +static int +compat_decomp(const uint32_t *in, size_t in_len, +	      uint32_t *out, size_t *out_len) +{ +    unsigned i; +    unsigned o = 0; + +    for (i = 0; i < in_len; ++i) { +	struct translation ts = {in[i]}; +	size_t sub_len = *out_len - o; +	int ret; + +	ret = hangul_decomp(in + i, in_len - i, +			    out + o, &sub_len); +	if (ret) { +	    if (ret == WIND_ERR_OVERRUN) +		return ret; +	    o += sub_len; +	} else { +	    void *s = bsearch(&ts, +			      _wind_normalize_table, +			      _wind_normalize_table_size, +			      sizeof(_wind_normalize_table[0]), +			      translation_cmp); +	    if (s != NULL) { +		const struct translation *t = (const struct translation *)s; + +		ret = compat_decomp(_wind_normalize_val_table + t->val_offset, +				    t->val_len, +				    out + o, &sub_len); +		if (ret) +		    return ret; +		o += sub_len; +	    } else { +		if (o >= *out_len) +		    return WIND_ERR_OVERRUN; +		out[o++] = in[i]; + +	    } +	} +    } +    *out_len = o; +    return 0; +} + +static void +swap_char(uint32_t * a, uint32_t * b) +{ +    uint32_t t; +    t = *a; +    *a = *b; +    *b = t; +} + +/* Unicode 5.2.0 D109 Canonical Ordering for a sequence of code points + * that all have Canonical_Combining_Class > 0 */ +static void +canonical_reorder_sequence(uint32_t * a, size_t len) +{ +    size_t i, j; + +    if (len <= 1) +	return; + +    for (i = 1; i < len; i++) { +	for (j = i; +	     j > 0 && +		 _wind_combining_class(a[j]) < _wind_combining_class(a[j-1]); +	     j--) +	    swap_char(&a[j], &a[j-1]); +    } +} + +static void +canonical_reorder(uint32_t *tmp, size_t tmp_len) +{ +    size_t i; + +    for (i = 0; i < tmp_len; ++i) { +	int cc = _wind_combining_class(tmp[i]); +	if (cc) { +	    size_t j; +	    for (j = i + 1; +		 j < tmp_len && _wind_combining_class(tmp[j]); +		 ++j) +		; +	    canonical_reorder_sequence(&tmp[i], j - i); +	    i = j; +	} +    } +} + +static uint32_t +find_composition(const uint32_t *in, unsigned in_len) +{ +    unsigned short canon_index = 0; +    uint32_t cur; +    unsigned n = 0; + +    cur = hangul_composition(in, in_len); +    if (cur) +	return cur; + +    do { +	const struct canon_node *c = &_wind_canon_table[canon_index]; +	unsigned i; + +	if (n % 5 == 0) { +	    cur = *in++; +	    if (in_len-- == 0) +		return c->val; +	} + +	i = cur >> 16; +	if (i < c->next_start || i >= c->next_end) +	    canon_index = 0; +	else +	    canon_index = +		_wind_canon_next_table[c->next_offset + i - c->next_start]; +	if (canon_index != 0) { +	    cur = (cur << 4) & 0xFFFFF; +	    ++n; +	} +    } while (canon_index != 0); +    return 0; +} + +static int +combine(const uint32_t *in, size_t in_len, +	uint32_t *out, size_t *out_len) +{ +    unsigned i; +    int ostarter; +    unsigned o = 0; +    int old_cc; + +    for (i = 0; i < in_len;) { +	while (i < in_len && _wind_combining_class(in[i]) != 0) { +	    out[o++] = in[i++]; +	} +	if (i < in_len) { +	    if (o >= *out_len) +		return WIND_ERR_OVERRUN; +	    ostarter = o; +	    out[o++] = in[i++]; +	    old_cc   = -1; + +	    while (i < in_len) { +		uint32_t comb; +		uint32_t v[2]; +		int cc; + +		v[0] = out[ostarter]; +		v[1] = in[i]; + +		cc = _wind_combining_class(in[i]); +		if (old_cc != cc && (comb = find_composition(v, 2))) { +		    out[ostarter] = comb; +		} else if (cc == 0) { +		    break; +		} else { +		    if (o >= *out_len) +			return WIND_ERR_OVERRUN; +		    out[o++] = in[i]; +		    old_cc   = cc; +		} +		++i; +	    } +	} +    } +    *out_len = o; +    return 0; +} + +int +_wind_stringprep_normalize(const uint32_t *in, size_t in_len, +			   uint32_t *out, size_t *out_len) +{ +    size_t tmp_len; +    uint32_t *tmp; +    int ret; + +    if (in_len == 0) { +	*out_len = 0; +	return 0; +    } + +    tmp_len = in_len * 4; +    if (tmp_len < MAX_LENGTH_CANON) +	tmp_len = MAX_LENGTH_CANON; +    tmp = malloc(tmp_len * sizeof(uint32_t)); +    if (tmp == NULL) +	return ENOMEM; + +    ret = compat_decomp(in, in_len, tmp, &tmp_len); +    if (ret) { +	free(tmp); +	return ret; +    } +    canonical_reorder(tmp, tmp_len); +    ret = combine(tmp, tmp_len, out, out_len); +    free(tmp); +    return ret; +} | 
