aboutsummaryrefslogtreecommitdiff
path: root/llvm/lib/Support/ConvertUTFWrapper.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/lib/Support/ConvertUTFWrapper.cpp')
-rw-r--r--llvm/lib/Support/ConvertUTFWrapper.cpp102
1 files changed, 79 insertions, 23 deletions
diff --git a/llvm/lib/Support/ConvertUTFWrapper.cpp b/llvm/lib/Support/ConvertUTFWrapper.cpp
index 392c4c4890e1..9bf3f8f8b897 100644
--- a/llvm/lib/Support/ConvertUTFWrapper.cpp
+++ b/llvm/lib/Support/ConvertUTFWrapper.cpp
@@ -34,31 +34,31 @@ bool ConvertUTF8toWide(unsigned WideCharWidth, llvm::StringRef Source,
const UTF8 *sourceStart = (const UTF8*)Source.data();
// FIXME: Make the type of the result buffer correct instead of
// using reinterpret_cast.
- UTF16 *targetStart = reinterpret_cast<UTF16*>(ResultPtr);
+ UTF16 *targetStart = reinterpret_cast<UTF16 *>(ResultPtr);
ConversionFlags flags = strictConversion;
- result = ConvertUTF8toUTF16(
- &sourceStart, sourceStart + Source.size(),
- &targetStart, targetStart + Source.size(), flags);
+ result =
+ ConvertUTF8toUTF16(&sourceStart, sourceStart + Source.size(),
+ &targetStart, targetStart + Source.size(), flags);
if (result == conversionOK)
- ResultPtr = reinterpret_cast<char*>(targetStart);
+ ResultPtr = reinterpret_cast<char *>(targetStart);
else
ErrorPtr = sourceStart;
} else if (WideCharWidth == 4) {
- const UTF8 *sourceStart = (const UTF8*)Source.data();
+ const UTF8 *sourceStart = (const UTF8 *)Source.data();
// FIXME: Make the type of the result buffer correct instead of
// using reinterpret_cast.
- UTF32 *targetStart = reinterpret_cast<UTF32*>(ResultPtr);
+ UTF32 *targetStart = reinterpret_cast<UTF32 *>(ResultPtr);
ConversionFlags flags = strictConversion;
- result = ConvertUTF8toUTF32(
- &sourceStart, sourceStart + Source.size(),
- &targetStart, targetStart + Source.size(), flags);
+ result =
+ ConvertUTF8toUTF32(&sourceStart, sourceStart + Source.size(),
+ &targetStart, targetStart + Source.size(), flags);
if (result == conversionOK)
- ResultPtr = reinterpret_cast<char*>(targetStart);
+ ResultPtr = reinterpret_cast<char *>(targetStart);
else
ErrorPtr = sourceStart;
}
- assert((result != targetExhausted)
- && "ConvertUTF8toUTFXX exhausted target buffer");
+ assert((result != targetExhausted) &&
+ "ConvertUTF8toUTFXX exhausted target buffer");
return result == conversionOK;
}
@@ -67,20 +67,18 @@ bool ConvertCodePointToUTF8(unsigned Source, char *&ResultPtr) {
const UTF32 *SourceEnd = SourceStart + 1;
UTF8 *TargetStart = reinterpret_cast<UTF8 *>(ResultPtr);
UTF8 *TargetEnd = TargetStart + 4;
- ConversionResult CR = ConvertUTF32toUTF8(&SourceStart, SourceEnd,
- &TargetStart, TargetEnd,
- strictConversion);
+ ConversionResult CR = ConvertUTF32toUTF8(
+ &SourceStart, SourceEnd, &TargetStart, TargetEnd, strictConversion);
if (CR != conversionOK)
return false;
- ResultPtr = reinterpret_cast<char*>(TargetStart);
+ ResultPtr = reinterpret_cast<char *>(TargetStart);
return true;
}
bool hasUTF16ByteOrderMark(ArrayRef<char> S) {
- return (S.size() >= 2 &&
- ((S[0] == '\xff' && S[1] == '\xfe') ||
- (S[0] == '\xfe' && S[1] == '\xff')));
+ return (S.size() >= 2 && ((S[0] == '\xff' && S[1] == '\xfe') ||
+ (S[0] == '\xfe' && S[1] == '\xff')));
}
bool convertUTF16ToUTF8String(ArrayRef<char> SrcBytes, std::string &Out) {
@@ -134,11 +132,69 @@ bool convertUTF16ToUTF8String(ArrayRef<char> SrcBytes, std::string &Out) {
return true;
}
-bool convertUTF16ToUTF8String(ArrayRef<UTF16> Src, std::string &Out)
-{
+bool convertUTF16ToUTF8String(ArrayRef<UTF16> Src, std::string &Out) {
return convertUTF16ToUTF8String(
llvm::ArrayRef<char>(reinterpret_cast<const char *>(Src.data()),
- Src.size() * sizeof(UTF16)), Out);
+ Src.size() * sizeof(UTF16)),
+ Out);
+}
+
+bool convertUTF32ToUTF8String(ArrayRef<char> SrcBytes, std::string &Out) {
+ assert(Out.empty());
+
+ // Error out on an uneven byte count.
+ if (SrcBytes.size() % 4)
+ return false;
+
+ // Avoid OOB by returning early on empty input.
+ if (SrcBytes.empty())
+ return true;
+
+ const UTF32 *Src = reinterpret_cast<const UTF32 *>(SrcBytes.begin());
+ const UTF32 *SrcEnd = reinterpret_cast<const UTF32 *>(SrcBytes.end());
+
+ assert((uintptr_t)Src % sizeof(UTF32) == 0);
+
+ // Byteswap if necessary.
+ std::vector<UTF32> ByteSwapped;
+ if (Src[0] == UNI_UTF32_BYTE_ORDER_MARK_SWAPPED) {
+ ByteSwapped.insert(ByteSwapped.end(), Src, SrcEnd);
+ for (UTF32 &I : ByteSwapped)
+ I = llvm::ByteSwap_32(I);
+ Src = &ByteSwapped[0];
+ SrcEnd = &ByteSwapped[ByteSwapped.size() - 1] + 1;
+ }
+
+ // Skip the BOM for conversion.
+ if (Src[0] == UNI_UTF32_BYTE_ORDER_MARK_NATIVE)
+ Src++;
+
+ // Just allocate enough space up front. We'll shrink it later. Allocate
+ // enough that we can fit a null terminator without reallocating.
+ Out.resize(SrcBytes.size() * UNI_MAX_UTF8_BYTES_PER_CODE_POINT + 1);
+ UTF8 *Dst = reinterpret_cast<UTF8 *>(&Out[0]);
+ UTF8 *DstEnd = Dst + Out.size();
+
+ ConversionResult CR =
+ ConvertUTF32toUTF8(&Src, SrcEnd, &Dst, DstEnd, strictConversion);
+ assert(CR != targetExhausted);
+
+ if (CR != conversionOK) {
+ Out.clear();
+ return false;
+ }
+
+ Out.resize(reinterpret_cast<char *>(Dst) - &Out[0]);
+ Out.push_back(0);
+ Out.pop_back();
+ return true;
+}
+
+bool convertUTF32ToUTF8String(ArrayRef<UTF32> Src, std::string &Out) {
+ return convertUTF32ToUTF8String(
+ llvm::ArrayRef<char>(reinterpret_cast<const char *>(Src.data()),
+ Src.size() * sizeof(UTF32)),
+ Out);
}
bool convertUTF8ToUTF16String(StringRef SrcUTF8,