aboutsummaryrefslogtreecommitdiff
path: root/llvm/lib/Support/FormattedStream.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/lib/Support/FormattedStream.cpp')
-rw-r--r-- llvm/lib/Support/FormattedStream.cpp 66
1 files changed, 57 insertions, 9 deletions
diff --git a/llvm/lib/Support/FormattedStream.cpp b/llvm/lib/Support/FormattedStream.cpp
index 4eb747038bb9..5716afc187e4 100644
--- a/llvm/lib/Support/FormattedStream.cpp
+++ b/llvm/lib/Support/FormattedStream.cpp
@@ -11,7 +11,9 @@
//===----------------------------------------------------------------------===//
#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/ConvertUTF.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/Unicode.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
@@ -19,16 +21,22 @@ using namespace llvm;
/// UpdatePosition - Examine the given char sequence and figure out which
/// column we end up in after output, and how many line breaks are contained.
-///
-static void UpdatePosition(std::pair<unsigned, unsigned> &Position, const char *Ptr, size_t Size) {
+/// This assumes that the input string is well-formed UTF-8, and takes into
+/// account Unicode characters which render as multiple columns wide.
+void formatted_raw_ostream::UpdatePosition(const char *Ptr, size_t Size) {
unsigned &Column = Position.first;
unsigned &Line = Position.second;
- // Keep track of the current column and line by scanning the string for
- // special characters
- for (const char *End = Ptr + Size; Ptr != End; ++Ptr) {
- ++Column;
- switch (*Ptr) {
+ auto ProcessUTF8CodePoint = [&Line, &Column](StringRef CP) {
+ int Width = sys::unicode::columnWidthUTF8(CP);
+ if (Width != sys::unicode::ErrorNonPrintableCharacter)
+ Column += Width;
+
+ // The only special whitespace characters we care about are single-byte.
+ if (CP.size() > 1)
+ return;
+
+ switch (CP[0]) {
case '\n':
Line += 1;
LLVM_FALLTHROUGH;
@@ -40,6 +48,46 @@ static void UpdatePosition(std::pair<unsigned, unsigned> &Position, const char *
Column += (8 - (Column & 0x7)) & 0x7;
break;
}
+ };
+
+ // If we have a partial UTF-8 sequence from the previous buffer, check that
+ // first.
+ if (PartialUTF8Char.size()) {
+ size_t BytesFromBuffer =
+ getNumBytesForUTF8(PartialUTF8Char[0]) - PartialUTF8Char.size();
+ if (Size < BytesFromBuffer) {
+ // If we still don't have enough bytes for a complete code point, just
+ // append what we have.
+ PartialUTF8Char.append(StringRef(Ptr, Size));
+ return;
+ } else {
+ // The first few bytes from the buffer will complete the code point.
+ // Concatenate them and process their effect on the line and column
+ // numbers.
+ PartialUTF8Char.append(StringRef(Ptr, BytesFromBuffer));
+ ProcessUTF8CodePoint(PartialUTF8Char);
+ PartialUTF8Char.clear();
+ Ptr += BytesFromBuffer;
+ Size -= BytesFromBuffer;
+ }
+ }
+
+ // Now scan the rest of the buffer.
+ unsigned NumBytes;
+ for (const char *End = Ptr + Size; Ptr < End; Ptr += NumBytes) {
+ NumBytes = getNumBytesForUTF8(*Ptr);
+
+ // The buffer might end part way through a UTF-8 code unit sequence for a
+ // Unicode scalar value if it got flushed. If this happens, we can't know
+ // the display width until we see the rest of the code point. Stash the
+ // bytes we do have, so that we can reconstruct the whole code point later,
+ // even if the buffer is being flushed.
+ if ((unsigned)(End - Ptr) < NumBytes) {
+ PartialUTF8Char = StringRef(Ptr, End - Ptr);
+ return;
+ }
+
+ ProcessUTF8CodePoint(StringRef(Ptr, NumBytes));
}
}
@@ -52,9 +100,9 @@ void formatted_raw_ostream::ComputePosition(const char *Ptr, size_t Size) {
if (Ptr <= Scanned && Scanned <= Ptr + Size)
// Scan all characters added since our last scan to determine the new
// column.
- UpdatePosition(Position, Scanned, Size - (Scanned - Ptr));
+ UpdatePosition(Scanned, Size - (Scanned - Ptr));
else
- UpdatePosition(Position, Ptr, Size);
+ UpdatePosition(Ptr, Size);
// Update the scanning pointer.
Scanned = Ptr + Size;