diff options
Diffstat (limited to 'lib/Basic/ConvertUTF.c')
| -rw-r--r-- | lib/Basic/ConvertUTF.c | 143 | 
1 file changed, 80 insertions, 63 deletions
| diff --git a/lib/Basic/ConvertUTF.c b/lib/Basic/ConvertUTF.c index 124e386c5526..e1970039e164 100644 --- a/lib/Basic/ConvertUTF.c +++ b/lib/Basic/ConvertUTF.c @@ -339,67 +339,6 @@ ConversionResult ConvertUTF32toUTF8 (      return result;  } -/* --------------------------------------------------------------------- */ - -ConversionResult ConvertUTF8toUTF32 ( -        const UTF8** sourceStart, const UTF8* sourceEnd,  -        UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) { -    ConversionResult result = conversionOK; -    const UTF8* source = *sourceStart; -    UTF32* target = *targetStart; -    while (source < sourceEnd) { -        UTF32 ch = 0; -        unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; -        if (source + extraBytesToRead >= sourceEnd) { -            result = sourceExhausted; break; -        } -        /* Do this check whether lenient or strict */ -        if (!isLegalUTF8(source, extraBytesToRead+1)) { -            result = sourceIllegal; -            break; -        } -        /* -         * The cases all fall through. See "Note A" below. -         */ -        switch (extraBytesToRead) { -            case 5: ch += *source++; ch <<= 6; -            case 4: ch += *source++; ch <<= 6; -            case 3: ch += *source++; ch <<= 6; -            case 2: ch += *source++; ch <<= 6; -            case 1: ch += *source++; ch <<= 6; -            case 0: ch += *source++; -        } -        ch -= offsetsFromUTF8[extraBytesToRead]; - -        if (target >= targetEnd) { -            source -= (extraBytesToRead+1); /* Back up the source pointer! */ -            result = targetExhausted; break; -        } -        if (ch <= UNI_MAX_LEGAL_UTF32) { -            /* -             * UTF-16 surrogate values are illegal in UTF-32, and anything -             * over Plane 17 (> 0x10FFFF) is illegal. 
-             */ -            if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { -                if (flags == strictConversion) { -                    source -= (extraBytesToRead+1); /* return to the illegal value itself */ -                    result = sourceIllegal; -                    break; -                } else { -                    *target++ = UNI_REPLACEMENT_CHAR; -                } -            } else { -                *target++ = ch; -            } -        } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */ -            result = sourceIllegal; -            *target++ = UNI_REPLACEMENT_CHAR; -        } -    } -    *sourceStart = source; -    *targetStart = target; -    return result; -}  #endif  /* --------------------------------------------------------------------- */ @@ -448,7 +387,7 @@ static Boolean isLegalUTF8(const UTF8 *source, int length) {   */  Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {      int length = trailingBytesForUTF8[*source]+1; -    if (source+length > sourceEnd) { +    if (length > sourceEnd - source) {          return false;      }      return isLegalUTF8(source, length); @@ -456,6 +395,22 @@ Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {  /* --------------------------------------------------------------------- */ +/* + * Exported function to return whether a UTF-8 string is legal or not. + * This is not used here; it's just exported. 
+ */ +Boolean isLegalUTF8String(const UTF8 *source, const UTF8 *sourceEnd) { +    while (source != sourceEnd) { +        int length = trailingBytesForUTF8[*source] + 1; +        if (length > sourceEnd - source || !isLegalUTF8(source, length)) +            return false; +        source += length; +    } +    return true; +} + +/* --------------------------------------------------------------------- */ +  ConversionResult ConvertUTF8toUTF16 (          const UTF8** sourceStart, const UTF8* sourceEnd,           UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) { @@ -465,7 +420,7 @@ ConversionResult ConvertUTF8toUTF16 (      while (source < sourceEnd) {          UTF32 ch = 0;          unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; -        if (source + extraBytesToRead >= sourceEnd) { +        if (extraBytesToRead >= sourceEnd - source) {              result = sourceExhausted; break;          }          /* Do this check whether lenient or strict */ @@ -527,6 +482,68 @@ ConversionResult ConvertUTF8toUTF16 (      return result;  } +/* --------------------------------------------------------------------- */ + +ConversionResult ConvertUTF8toUTF32 ( +        const UTF8** sourceStart, const UTF8* sourceEnd,  +        UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) { +    ConversionResult result = conversionOK; +    const UTF8* source = *sourceStart; +    UTF32* target = *targetStart; +    while (source < sourceEnd) { +        UTF32 ch = 0; +        unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; +        if (extraBytesToRead >= sourceEnd - source) { +            result = sourceExhausted; break; +        } +        /* Do this check whether lenient or strict */ +        if (!isLegalUTF8(source, extraBytesToRead+1)) { +            result = sourceIllegal; +            break; +        } +        /* +         * The cases all fall through. See "Note A" below. 
+         */ +        switch (extraBytesToRead) { +            case 5: ch += *source++; ch <<= 6; +            case 4: ch += *source++; ch <<= 6; +            case 3: ch += *source++; ch <<= 6; +            case 2: ch += *source++; ch <<= 6; +            case 1: ch += *source++; ch <<= 6; +            case 0: ch += *source++; +        } +        ch -= offsetsFromUTF8[extraBytesToRead]; + +        if (target >= targetEnd) { +            source -= (extraBytesToRead+1); /* Back up the source pointer! */ +            result = targetExhausted; break; +        } +        if (ch <= UNI_MAX_LEGAL_UTF32) { +            /* +             * UTF-16 surrogate values are illegal in UTF-32, and anything +             * over Plane 17 (> 0x10FFFF) is illegal. +             */ +            if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { +                if (flags == strictConversion) { +                    source -= (extraBytesToRead+1); /* return to the illegal value itself */ +                    result = sourceIllegal; +                    break; +                } else { +                    *target++ = UNI_REPLACEMENT_CHAR; +                } +            } else { +                *target++ = ch; +            } +        } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */ +            result = sourceIllegal; +            *target++ = UNI_REPLACEMENT_CHAR; +        } +    } +    *sourceStart = source; +    *targetStart = target; +    return result; +} +  /* ---------------------------------------------------------------------      Note A. | 
