aboutsummaryrefslogtreecommitdiff
path: root/clang/lib/Lex/LiteralSupport.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'clang/lib/Lex/LiteralSupport.cpp')
-rw-r--r--clang/lib/Lex/LiteralSupport.cpp241
1 files changed, 207 insertions, 34 deletions
diff --git a/clang/lib/Lex/LiteralSupport.cpp b/clang/lib/Lex/LiteralSupport.cpp
index 76c8b324671d..ebf30c9f01a9 100644
--- a/clang/lib/Lex/LiteralSupport.cpp
+++ b/clang/lib/Lex/LiteralSupport.cpp
@@ -27,6 +27,7 @@
#include "llvm/Support/ConvertUTF.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Unicode.h"
#include <algorithm>
#include <cassert>
#include <cstddef>
@@ -233,7 +234,8 @@ static unsigned ProcessCharEscape(const char *ThisTokBegin,
HadError = true;
if (Diags)
Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
- diag::err_delimited_escape_missing_brace);
+ diag::err_delimited_escape_missing_brace)
+ << "o";
break;
}
@@ -309,7 +311,8 @@ static unsigned ProcessCharEscape(const char *ThisTokBegin,
<< tok::r_brace;
else if (!HadError) {
Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
- diag::ext_delimited_escape_sequence);
+ diag::ext_delimited_escape_sequence)
+ << /*delimited*/ 0;
}
}
@@ -320,10 +323,8 @@ static void appendCodePoint(unsigned Codepoint,
llvm::SmallVectorImpl<char> &Str) {
char ResultBuf[4];
char *ResultPtr = ResultBuf;
- bool Res = llvm::ConvertCodePointToUTF8(Codepoint, ResultPtr);
- (void)Res;
- assert(Res && "Unexpected conversion failure");
- Str.append(ResultBuf, ResultPtr);
+ if (llvm::ConvertCodePointToUTF8(Codepoint, ResultPtr))
+ Str.append(ResultBuf, ResultPtr);
}
void clang::expandUCNs(SmallVectorImpl<char> &Buf, StringRef Input) {
@@ -337,7 +338,7 @@ void clang::expandUCNs(SmallVectorImpl<char> &Buf, StringRef Input) {
char Kind = *I;
++I;
- assert(Kind == 'u' || Kind == 'U');
+ assert(Kind == 'u' || Kind == 'U' || Kind == 'N');
uint32_t CodePoint = 0;
if (Kind == 'u' && *I == '{') {
@@ -351,6 +352,22 @@ void clang::expandUCNs(SmallVectorImpl<char> &Buf, StringRef Input) {
continue;
}
+ if (Kind == 'N') {
+ assert(*I == '{');
+ ++I;
+ auto Delim = std::find(I, Input.end(), '}');
+ assert(Delim != Input.end());
+ llvm::Optional<llvm::sys::unicode::LooseMatchingResult> Res =
+ llvm::sys::unicode::nameToCodepointLooseMatching(
+ StringRef(I, std::distance(I, Delim)));
+ assert(Res);
+ CodePoint = Res->CodePoint;
+ assert(CodePoint != 0xFFFFFFFF);
+ appendCodePoint(CodePoint, Buf);
+ I = Delim;
+ continue;
+ }
+
unsigned NumHexDigits;
if (Kind == 'u')
NumHexDigits = 4;
@@ -372,23 +389,20 @@ void clang::expandUCNs(SmallVectorImpl<char> &Buf, StringRef Input) {
}
}
-/// ProcessUCNEscape - Read the Universal Character Name, check constraints and
-/// return the UTF32.
-static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
- const char *ThisTokEnd,
- uint32_t &UcnVal, unsigned short &UcnLen,
- FullSourceLoc Loc, DiagnosticsEngine *Diags,
- const LangOptions &Features,
- bool in_char_string_literal = false) {
+static bool ProcessNumericUCNEscape(const char *ThisTokBegin,
+ const char *&ThisTokBuf,
+ const char *ThisTokEnd, uint32_t &UcnVal,
+ unsigned short &UcnLen, bool &Delimited,
+ FullSourceLoc Loc, DiagnosticsEngine *Diags,
+ const LangOptions &Features,
+ bool in_char_string_literal = false) {
const char *UcnBegin = ThisTokBuf;
+ bool HasError = false;
+ bool EndDelimiterFound = false;
// Skip the '\u' char's.
ThisTokBuf += 2;
-
- bool Delimited = false;
- bool EndDelimiterFound = false;
- bool HasError = false;
-
+ Delimited = false;
if (UcnBegin[1] == 'u' && in_char_string_literal &&
ThisTokBuf != ThisTokEnd && *ThisTokBuf == '{') {
Delimited = true;
@@ -396,7 +410,8 @@ static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
} else if (ThisTokBuf == ThisTokEnd || !isHexDigit(*ThisTokBuf)) {
if (Diags)
Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
- diag::err_hex_escape_no_digits) << StringRef(&ThisTokBuf[-1], 1);
+ diag::err_hex_escape_no_digits)
+ << StringRef(&ThisTokBuf[-1], 1);
return false;
}
UcnLen = (ThisTokBuf[-1] == 'u' ? 4 : 8);
@@ -457,7 +472,136 @@ static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
: diag::err_ucn_escape_incomplete);
return false;
}
+ return !HasError;
+}
+
+static void DiagnoseInvalidUnicodeCharacterName(
+ DiagnosticsEngine *Diags, const LangOptions &Features, FullSourceLoc Loc,
+ const char *TokBegin, const char *TokRangeBegin, const char *TokRangeEnd,
+ llvm::StringRef Name) {
+
+ Diag(Diags, Features, Loc, TokBegin, TokRangeBegin, TokRangeEnd,
+ diag::err_invalid_ucn_name)
+ << Name;
+
+ namespace u = llvm::sys::unicode;
+
+ llvm::Optional<u::LooseMatchingResult> Res =
+ u::nameToCodepointLooseMatching(Name);
+ if (Res) {
+ Diag(Diags, Features, Loc, TokBegin, TokRangeBegin, TokRangeEnd,
+ diag::note_invalid_ucn_name_loose_matching)
+ << FixItHint::CreateReplacement(
+ MakeCharSourceRange(Features, Loc, TokBegin, TokRangeBegin,
+ TokRangeEnd),
+ Res->Name);
+ return;
+ }
+
+ unsigned Distance = 0;
+ SmallVector<u::MatchForCodepointName> Matches =
+ u::nearestMatchesForCodepointName(Name, 5);
+ assert(!Matches.empty() && "No unicode characters found");
+
+ for (const auto &Match : Matches) {
+ if (Distance == 0)
+ Distance = Match.Distance;
+ if (std::max(Distance, Match.Distance) -
+ std::min(Distance, Match.Distance) >
+ 3)
+ break;
+ Distance = Match.Distance;
+ std::string Str;
+ llvm::UTF32 V = Match.Value;
+ LLVM_ATTRIBUTE_UNUSED bool Converted =
+ llvm::convertUTF32ToUTF8String(llvm::ArrayRef<llvm::UTF32>(&V, 1), Str);
+ assert(Converted && "Found a match wich is not a unicode character");
+
+ Diag(Diags, Features, Loc, TokBegin, TokRangeBegin, TokRangeEnd,
+ diag::note_invalid_ucn_name_candidate)
+ << Match.Name << llvm::utohexstr(Match.Value)
+ << Str // FIXME: Fix the rendering of non printable characters
+ << FixItHint::CreateReplacement(
+ MakeCharSourceRange(Features, Loc, TokBegin, TokRangeBegin,
+ TokRangeEnd),
+ Match.Name);
+ }
+}
+
+static bool ProcessNamedUCNEscape(const char *ThisTokBegin,
+ const char *&ThisTokBuf,
+ const char *ThisTokEnd, uint32_t &UcnVal,
+ unsigned short &UcnLen, FullSourceLoc Loc,
+ DiagnosticsEngine *Diags,
+ const LangOptions &Features) {
+ const char *UcnBegin = ThisTokBuf;
+ assert(UcnBegin[0] == '\\' && UcnBegin[1] == 'N');
+ ThisTokBuf += 2;
+ if (ThisTokBuf == ThisTokEnd || *ThisTokBuf != '{') {
+ if (Diags) {
+ Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
+ diag::err_delimited_escape_missing_brace)
+ << StringRef(&ThisTokBuf[-1], 1);
+ }
+ ThisTokBuf++;
+ return false;
+ }
+ ThisTokBuf++;
+ const char *ClosingBrace =
+ std::find_if_not(ThisTokBuf, ThisTokEnd, [](char C) {
+ return llvm::isAlnum(C) || llvm::isSpace(C) || C == '_' || C == '-';
+ });
+ bool Incomplete = ClosingBrace == ThisTokEnd || *ClosingBrace != '}';
+ bool Empty = ClosingBrace == ThisTokBuf;
+ if (Incomplete || Empty) {
+ if (Diags) {
+ Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
+ Incomplete ? diag::err_ucn_escape_incomplete
+ : diag::err_delimited_escape_empty)
+ << StringRef(&UcnBegin[1], 1);
+ }
+ ThisTokBuf = ClosingBrace == ThisTokEnd ? ClosingBrace : ClosingBrace + 1;
+ return false;
+ }
+ StringRef Name(ThisTokBuf, ClosingBrace - ThisTokBuf);
+ ThisTokBuf = ClosingBrace + 1;
+ llvm::Optional<char32_t> Res =
+ llvm::sys::unicode::nameToCodepointStrict(Name);
+ if (!Res) {
+ if (Diags)
+ DiagnoseInvalidUnicodeCharacterName(Diags, Features, Loc, ThisTokBegin,
+ &UcnBegin[3], ClosingBrace, Name);
+ return false;
+ }
+ UcnVal = *Res;
+ UcnLen = UcnVal > 0xFFFF ? 8 : 4;
+ return true;
+}
+
+/// ProcessUCNEscape - Read the Universal Character Name, check constraints and
+/// return the UTF32.
+static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
+ const char *ThisTokEnd, uint32_t &UcnVal,
+ unsigned short &UcnLen, FullSourceLoc Loc,
+ DiagnosticsEngine *Diags,
+ const LangOptions &Features,
+ bool in_char_string_literal = false) {
+
+ bool HasError;
+ const char *UcnBegin = ThisTokBuf;
+ bool IsDelimitedEscapeSequence = false;
+ bool IsNamedEscapeSequence = false;
+ if (ThisTokBuf[1] == 'N') {
+ IsNamedEscapeSequence = true;
+ HasError = !ProcessNamedUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd,
+ UcnVal, UcnLen, Loc, Diags, Features);
+ } else {
+ HasError =
+ !ProcessNumericUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal,
+ UcnLen, IsDelimitedEscapeSequence, Loc, Diags,
+ Features, in_char_string_literal);
+ }
if (HasError)
return false;
@@ -495,9 +639,10 @@ static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
diag::warn_ucn_not_valid_in_c89_literal);
- if (Delimited && Diags)
+ if ((IsDelimitedEscapeSequence || IsNamedEscapeSequence) && Diags)
Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
- diag::ext_delimited_escape_sequence);
+ diag::ext_delimited_escape_sequence)
+ << (IsNamedEscapeSequence ? 1 : 0);
return true;
}
@@ -711,6 +856,7 @@ NumericLiteralParser::NumericLiteralParser(StringRef TokSpelling,
isFract = false;
isAccum = false;
hadError = false;
+ isBitInt = false;
// This routine assumes that the range begin/end matches the regex for integer
// and FP constants (specifically, the 'pp-number' regex), and assumes that
@@ -895,6 +1041,24 @@ NumericLiteralParser::NumericLiteralParser(StringRef TokSpelling,
if (isImaginary) break; // Cannot be repeated.
isImaginary = true;
continue; // Success.
+ case 'w':
+ case 'W':
+ if (isFPConstant)
+ break; // Invalid for floats.
+ if (HasSize)
+ break; // Invalid if we already have a size for the literal.
+
+ // wb and WB are allowed, but a mixture of cases like Wb or wB is not. We
+ // explicitly do not support the suffix in C++ as an extension because a
+ // library-based UDL that resolves to a library type may be more
+ // appropriate there.
+ if (!LangOpts.CPlusPlus && ((s[0] == 'w' && s[1] == 'b') ||
+ (s[0] == 'W' && s[1] == 'B'))) {
+ isBitInt = true;
+ HasSize = true;
+ ++s; // Skip both characters (2nd char skipped on continue).
+ continue; // Success.
+ }
}
// If we reached here, there was an error or a ud-suffix.
break;
@@ -916,6 +1080,7 @@ NumericLiteralParser::NumericLiteralParser(StringRef TokSpelling,
isFloat16 = false;
isHalf = false;
isImaginary = false;
+ isBitInt = false;
MicrosoftInteger = 0;
saw_fixed_point_suffix = false;
isFract = false;
@@ -1145,8 +1310,14 @@ void NumericLiteralParser::ParseNumberStartingWithZero(SourceLocation TokLoc) {
// floating point constant, the radix will change to 10. Octal floating
// point constants are not permitted (only decimal and hexadecimal).
radix = 8;
- DigitsBegin = s;
+ const char *PossibleNewDigitStart = s;
s = SkipOctalDigits(s);
+ // When the value is 0 followed by a suffix (like 0wb), we want to leave 0
+ // as the start of the digits. So if skipping octal digits does not skip
+ // anything, we leave the digit start where it was.
+ if (s != PossibleNewDigitStart)
+ DigitsBegin = PossibleNewDigitStart;
+
if (s == ThisTokEnd)
return; // Done, simple octal number like 01234
@@ -1510,7 +1681,7 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
// If we see bad encoding for unprefixed character literals, warn and
// simply copy the byte values, for compatibility with gcc and
// older versions of clang.
- bool NoErrorOnBadEncoding = isAscii();
+ bool NoErrorOnBadEncoding = isOrdinary();
unsigned Msg = diag::err_bad_character_encoding;
if (NoErrorOnBadEncoding)
Msg = diag::warn_bad_character_encoding;
@@ -1535,7 +1706,7 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
continue;
}
// Is this a Universal Character Name escape?
- if (begin[1] == 'u' || begin[1] == 'U') {
+ if (begin[1] == 'u' || begin[1] == 'U' || begin[1] == 'N') {
unsigned short UcnLen = 0;
if (!ProcessUCNEscape(TokBegin, begin, end, *buffer_begin, UcnLen,
FullSourceLoc(Loc, PP.getSourceManager()),
@@ -1560,9 +1731,9 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
unsigned NumCharsSoFar = buffer_begin - &codepoint_buffer.front();
if (NumCharsSoFar > 1) {
- if (isAscii() && NumCharsSoFar == 4)
+ if (isOrdinary() && NumCharsSoFar == 4)
PP.Diag(Loc, diag::warn_four_char_character_literal);
- else if (isAscii())
+ else if (isOrdinary())
PP.Diag(Loc, diag::warn_multichar_character_literal);
else {
PP.Diag(Loc, diag::err_multichar_character_literal) << (isWide() ? 0 : 1);
@@ -1578,7 +1749,7 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
// Narrow character literals act as though their value is concatenated
// in this implementation, but warn on overflow.
bool multi_char_too_long = false;
- if (isAscii() && isMultiChar()) {
+ if (isOrdinary() && isMultiChar()) {
LitVal = 0;
for (size_t i = 0; i < NumCharsSoFar; ++i) {
// check for enough leading zeros to shift into
@@ -1602,7 +1773,7 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
// if 'char' is signed for this target (C99 6.4.4.4p10). Note that multiple
// character constants are not sign extended in the this implementation:
// '\xFF\xFF' = 65536 and '\x0\xFF' = 255, which matches GCC.
- if (isAscii() && NumCharsSoFar == 1 && (Value & 128) &&
+ if (isOrdinary() && NumCharsSoFar == 1 && (Value & 128) &&
PP.getLangOpts().CharIsSigned)
Value = (signed char)Value;
}
@@ -1707,7 +1878,7 @@ void StringLiteralParser::init(ArrayRef<Token> StringToks){
// Remember if we see any wide or utf-8/16/32 strings.
// Also check for illegal concatenations.
if (StringToks[i].isNot(Kind) && StringToks[i].isNot(tok::string_literal)) {
- if (isAscii()) {
+ if (isOrdinary()) {
Kind = StringToks[i].getKind();
} else {
if (Diags)
@@ -1895,7 +2066,8 @@ void StringLiteralParser::init(ArrayRef<Token> StringToks){
continue;
}
// Is this a Universal Character Name escape?
- if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U') {
+ if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U' ||
+ ThisTokBuf[1] == 'N') {
EncodeUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd,
ResultPtr, hadError,
FullSourceLoc(StringToks[i].getLocation(), SM),
@@ -1990,7 +2162,7 @@ bool StringLiteralParser::CopyStringFragment(const Token &Tok,
// If we see bad encoding for unprefixed string literals, warn and
// simply copy the byte values, for compatibility with gcc and older
// versions of clang.
- bool NoErrorOnBadEncoding = isAscii();
+ bool NoErrorOnBadEncoding = isOrdinary();
if (NoErrorOnBadEncoding) {
memcpy(ResultPtr, Fragment.data(), Fragment.size());
ResultPtr += Fragment.size();
@@ -2088,7 +2260,8 @@ unsigned StringLiteralParser::getOffsetOfStringByte(const Token &Tok,
// Otherwise, this is an escape character. Advance over it.
bool HadError = false;
- if (SpellingPtr[1] == 'u' || SpellingPtr[1] == 'U') {
+ if (SpellingPtr[1] == 'u' || SpellingPtr[1] == 'U' ||
+ SpellingPtr[1] == 'N') {
const char *EscapePtr = SpellingPtr;
unsigned Len = MeasureUCNEscape(SpellingStart, SpellingPtr, SpellingEnd,
1, Features, HadError);