diff options
Diffstat (limited to 'clang/lib/Lex/LiteralSupport.cpp')
| -rw-r--r-- | clang/lib/Lex/LiteralSupport.cpp | 241 |
1 files changed, 207 insertions, 34 deletions
diff --git a/clang/lib/Lex/LiteralSupport.cpp b/clang/lib/Lex/LiteralSupport.cpp index 76c8b324671d..ebf30c9f01a9 100644 --- a/clang/lib/Lex/LiteralSupport.cpp +++ b/clang/lib/Lex/LiteralSupport.cpp @@ -27,6 +27,7 @@ #include "llvm/Support/ConvertUTF.h" #include "llvm/Support/Error.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/Unicode.h" #include <algorithm> #include <cassert> #include <cstddef> @@ -233,7 +234,8 @@ static unsigned ProcessCharEscape(const char *ThisTokBegin, HadError = true; if (Diags) Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf, - diag::err_delimited_escape_missing_brace); + diag::err_delimited_escape_missing_brace) + << "o"; break; } @@ -309,7 +311,8 @@ static unsigned ProcessCharEscape(const char *ThisTokBegin, << tok::r_brace; else if (!HadError) { Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf, - diag::ext_delimited_escape_sequence); + diag::ext_delimited_escape_sequence) + << /*delimited*/ 0; } } @@ -320,10 +323,8 @@ static void appendCodePoint(unsigned Codepoint, llvm::SmallVectorImpl<char> &Str) { char ResultBuf[4]; char *ResultPtr = ResultBuf; - bool Res = llvm::ConvertCodePointToUTF8(Codepoint, ResultPtr); - (void)Res; - assert(Res && "Unexpected conversion failure"); - Str.append(ResultBuf, ResultPtr); + if (llvm::ConvertCodePointToUTF8(Codepoint, ResultPtr)) + Str.append(ResultBuf, ResultPtr); } void clang::expandUCNs(SmallVectorImpl<char> &Buf, StringRef Input) { @@ -337,7 +338,7 @@ void clang::expandUCNs(SmallVectorImpl<char> &Buf, StringRef Input) { char Kind = *I; ++I; - assert(Kind == 'u' || Kind == 'U'); + assert(Kind == 'u' || Kind == 'U' || Kind == 'N'); uint32_t CodePoint = 0; if (Kind == 'u' && *I == '{') { @@ -351,6 +352,22 @@ void clang::expandUCNs(SmallVectorImpl<char> &Buf, StringRef Input) { continue; } + if (Kind == 'N') { + assert(*I == '{'); + ++I; + auto Delim = std::find(I, Input.end(), '}'); + assert(Delim != Input.end()); + llvm::Optional<llvm::sys::unicode::LooseMatchingResult> Res = + llvm::sys::unicode::nameToCodepointLooseMatching( + StringRef(I, std::distance(I, Delim))); + assert(Res); + CodePoint = Res->CodePoint; + assert(CodePoint != 0xFFFFFFFF); + appendCodePoint(CodePoint, Buf); + I = Delim; + continue; + } + unsigned NumHexDigits; if (Kind == 'u') NumHexDigits = 4; @@ -372,23 +389,20 @@ void clang::expandUCNs(SmallVectorImpl<char> &Buf, StringRef Input) { } } -/// ProcessUCNEscape - Read the Universal Character Name, check constraints and -/// return the UTF32. -static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf, - const char *ThisTokEnd, - uint32_t &UcnVal, unsigned short &UcnLen, - FullSourceLoc Loc, DiagnosticsEngine *Diags, - const LangOptions &Features, - bool in_char_string_literal = false) { +static bool ProcessNumericUCNEscape(const char *ThisTokBegin, + const char *&ThisTokBuf, + const char *ThisTokEnd, uint32_t &UcnVal, + unsigned short &UcnLen, bool &Delimited, + FullSourceLoc Loc, DiagnosticsEngine *Diags, + const LangOptions &Features, + bool in_char_string_literal = false) { const char *UcnBegin = ThisTokBuf; + bool HasError = false; + bool EndDelimiterFound = false; // Skip the '\u' char's. ThisTokBuf += 2; - - bool Delimited = false; - bool EndDelimiterFound = false; - bool HasError = false; - + Delimited = false; if (UcnBegin[1] == 'u' && in_char_string_literal && ThisTokBuf != ThisTokEnd && *ThisTokBuf == '{') { Delimited = true; @@ -396,7 +410,8 @@ static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf, } else if (ThisTokBuf == ThisTokEnd || !isHexDigit(*ThisTokBuf)) { if (Diags) Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf, - diag::err_hex_escape_no_digits) << StringRef(&ThisTokBuf[-1], 1); + diag::err_hex_escape_no_digits) + << StringRef(&ThisTokBuf[-1], 1); return false; } UcnLen = (ThisTokBuf[-1] == 'u' ? 4 : 8); @@ -457,7 +472,136 @@ static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf, : diag::err_ucn_escape_incomplete); return false; } + return !HasError; +} + +static void DiagnoseInvalidUnicodeCharacterName( + DiagnosticsEngine *Diags, const LangOptions &Features, FullSourceLoc Loc, + const char *TokBegin, const char *TokRangeBegin, const char *TokRangeEnd, + llvm::StringRef Name) { + + Diag(Diags, Features, Loc, TokBegin, TokRangeBegin, TokRangeEnd, + diag::err_invalid_ucn_name) + << Name; + + namespace u = llvm::sys::unicode; + + llvm::Optional<u::LooseMatchingResult> Res = + u::nameToCodepointLooseMatching(Name); + if (Res) { + Diag(Diags, Features, Loc, TokBegin, TokRangeBegin, TokRangeEnd, + diag::note_invalid_ucn_name_loose_matching) + << FixItHint::CreateReplacement( + MakeCharSourceRange(Features, Loc, TokBegin, TokRangeBegin, + TokRangeEnd), + Res->Name); + return; + } + + unsigned Distance = 0; + SmallVector<u::MatchForCodepointName> Matches = + u::nearestMatchesForCodepointName(Name, 5); + assert(!Matches.empty() && "No unicode characters found"); + + for (const auto &Match : Matches) { + if (Distance == 0) + Distance = Match.Distance; + if (std::max(Distance, Match.Distance) - + std::min(Distance, Match.Distance) > + 3) + break; + Distance = Match.Distance; + std::string Str; + llvm::UTF32 V = Match.Value; + LLVM_ATTRIBUTE_UNUSED bool Converted = + llvm::convertUTF32ToUTF8String(llvm::ArrayRef<llvm::UTF32>(&V, 1), Str); + assert(Converted && "Found a match wich is not a unicode character"); + + Diag(Diags, Features, Loc, TokBegin, TokRangeBegin, TokRangeEnd, + diag::note_invalid_ucn_name_candidate) + << Match.Name << llvm::utohexstr(Match.Value) + << Str // FIXME: Fix the rendering of non printable characters + << FixItHint::CreateReplacement( + MakeCharSourceRange(Features, Loc, TokBegin, TokRangeBegin, + TokRangeEnd), + Match.Name); + } +} + +static bool ProcessNamedUCNEscape(const char *ThisTokBegin, + const char *&ThisTokBuf, + const char *ThisTokEnd, uint32_t &UcnVal, + unsigned short &UcnLen, FullSourceLoc Loc, + DiagnosticsEngine *Diags, + const LangOptions &Features) { + const char *UcnBegin = ThisTokBuf; + assert(UcnBegin[0] == '\\' && UcnBegin[1] == 'N'); + ThisTokBuf += 2; + if (ThisTokBuf == ThisTokEnd || *ThisTokBuf != '{') { + if (Diags) { + Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf, + diag::err_delimited_escape_missing_brace) + << StringRef(&ThisTokBuf[-1], 1); + } + ThisTokBuf++; + return false; + } + ThisTokBuf++; + const char *ClosingBrace = + std::find_if_not(ThisTokBuf, ThisTokEnd, [](char C) { + return llvm::isAlnum(C) || llvm::isSpace(C) || C == '_' || C == '-'; + }); + bool Incomplete = ClosingBrace == ThisTokEnd || *ClosingBrace != '}'; + bool Empty = ClosingBrace == ThisTokBuf; + if (Incomplete || Empty) { + if (Diags) { + Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf, + Incomplete ? diag::err_ucn_escape_incomplete + : diag::err_delimited_escape_empty) + << StringRef(&UcnBegin[1], 1); + } + ThisTokBuf = ClosingBrace == ThisTokEnd ? ClosingBrace : ClosingBrace + 1; + return false; + } + StringRef Name(ThisTokBuf, ClosingBrace - ThisTokBuf); + ThisTokBuf = ClosingBrace + 1; + llvm::Optional<char32_t> Res = + llvm::sys::unicode::nameToCodepointStrict(Name); + if (!Res) { + if (Diags) + DiagnoseInvalidUnicodeCharacterName(Diags, Features, Loc, ThisTokBegin, + &UcnBegin[3], ClosingBrace, Name); + return false; + } + UcnVal = *Res; + UcnLen = UcnVal > 0xFFFF ? 8 : 4; + return true; +} + +/// ProcessUCNEscape - Read the Universal Character Name, check constraints and +/// return the UTF32. +static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf, + const char *ThisTokEnd, uint32_t &UcnVal, + unsigned short &UcnLen, FullSourceLoc Loc, + DiagnosticsEngine *Diags, + const LangOptions &Features, + bool in_char_string_literal = false) { + + bool HasError; + const char *UcnBegin = ThisTokBuf; + bool IsDelimitedEscapeSequence = false; + bool IsNamedEscapeSequence = false; + if (ThisTokBuf[1] == 'N') { + IsNamedEscapeSequence = true; + HasError = !ProcessNamedUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, + UcnVal, UcnLen, Loc, Diags, Features); + } else { + HasError = + !ProcessNumericUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal, + UcnLen, IsDelimitedEscapeSequence, Loc, Diags, + Features, in_char_string_literal); + } if (HasError) return false; @@ -495,9 +639,10 @@ static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf, Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf, diag::warn_ucn_not_valid_in_c89_literal); - if (Delimited && Diags) + if ((IsDelimitedEscapeSequence || IsNamedEscapeSequence) && Diags) Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf, - diag::ext_delimited_escape_sequence); + diag::ext_delimited_escape_sequence) + << (IsNamedEscapeSequence ? 1 : 0); return true; } @@ -711,6 +856,7 @@ NumericLiteralParser::NumericLiteralParser(StringRef TokSpelling, isFract = false; isAccum = false; hadError = false; + isBitInt = false; // This routine assumes that the range begin/end matches the regex for integer // and FP constants (specifically, the 'pp-number' regex), and assumes that @@ -895,6 +1041,24 @@ NumericLiteralParser::NumericLiteralParser(StringRef TokSpelling, if (isImaginary) break; // Cannot be repeated. isImaginary = true; continue; // Success. + case 'w': + case 'W': + if (isFPConstant) + break; // Invalid for floats. + if (HasSize) + break; // Invalid if we already have a size for the literal. + + // wb and WB are allowed, but a mixture of cases like Wb or wB is not. We + // explicitly do not support the suffix in C++ as an extension because a + // library-based UDL that resolves to a library type may be more + // appropriate there. + if (!LangOpts.CPlusPlus && ((s[0] == 'w' && s[1] == 'b') || + (s[0] == 'W' && s[1] == 'B'))) { + isBitInt = true; + HasSize = true; + ++s; // Skip both characters (2nd char skipped on continue). + continue; // Success. + } } // If we reached here, there was an error or a ud-suffix. break; @@ -916,6 +1080,7 @@ NumericLiteralParser::NumericLiteralParser(StringRef TokSpelling, isFloat16 = false; isHalf = false; isImaginary = false; + isBitInt = false; MicrosoftInteger = 0; saw_fixed_point_suffix = false; isFract = false; @@ -1145,8 +1310,14 @@ void NumericLiteralParser::ParseNumberStartingWithZero(SourceLocation TokLoc) { // floating point constant, the radix will change to 10. Octal floating // point constants are not permitted (only decimal and hexadecimal). radix = 8; - DigitsBegin = s; + const char *PossibleNewDigitStart = s; s = SkipOctalDigits(s); + // When the value is 0 followed by a suffix (like 0wb), we want to leave 0 + // as the start of the digits. So if skipping octal digits does not skip + // anything, we leave the digit start where it was. + if (s != PossibleNewDigitStart) + DigitsBegin = PossibleNewDigitStart; + if (s == ThisTokEnd) return; // Done, simple octal number like 01234 @@ -1510,7 +1681,7 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end, // If we see bad encoding for unprefixed character literals, warn and // simply copy the byte values, for compatibility with gcc and // older versions of clang. - bool NoErrorOnBadEncoding = isAscii(); + bool NoErrorOnBadEncoding = isOrdinary(); unsigned Msg = diag::err_bad_character_encoding; if (NoErrorOnBadEncoding) Msg = diag::warn_bad_character_encoding; @@ -1535,7 +1706,7 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end, continue; } // Is this a Universal Character Name escape? - if (begin[1] == 'u' || begin[1] == 'U') { + if (begin[1] == 'u' || begin[1] == 'U' || begin[1] == 'N') { unsigned short UcnLen = 0; if (!ProcessUCNEscape(TokBegin, begin, end, *buffer_begin, UcnLen, FullSourceLoc(Loc, PP.getSourceManager()), @@ -1560,9 +1731,9 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end, unsigned NumCharsSoFar = buffer_begin - &codepoint_buffer.front(); if (NumCharsSoFar > 1) { - if (isAscii() && NumCharsSoFar == 4) + if (isOrdinary() && NumCharsSoFar == 4) PP.Diag(Loc, diag::warn_four_char_character_literal); - else if (isAscii()) + else if (isOrdinary()) PP.Diag(Loc, diag::warn_multichar_character_literal); else { PP.Diag(Loc, diag::err_multichar_character_literal) << (isWide() ? 0 : 1); @@ -1578,7 +1749,7 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end, // Narrow character literals act as though their value is concatenated // in this implementation, but warn on overflow. bool multi_char_too_long = false; - if (isAscii() && isMultiChar()) { + if (isOrdinary() && isMultiChar()) { LitVal = 0; for (size_t i = 0; i < NumCharsSoFar; ++i) { // check for enough leading zeros to shift into @@ -1602,7 +1773,7 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end, // if 'char' is signed for this target (C99 6.4.4.4p10). Note that multiple // character constants are not sign extended in the this implementation: // '\xFF\xFF' = 65536 and '\x0\xFF' = 255, which matches GCC. - if (isAscii() && NumCharsSoFar == 1 && (Value & 128) && + if (isOrdinary() && NumCharsSoFar == 1 && (Value & 128) && PP.getLangOpts().CharIsSigned) Value = (signed char)Value; } @@ -1707,7 +1878,7 @@ void StringLiteralParser::init(ArrayRef<Token> StringToks){ // Remember if we see any wide or utf-8/16/32 strings. // Also check for illegal concatenations. if (StringToks[i].isNot(Kind) && StringToks[i].isNot(tok::string_literal)) { - if (isAscii()) { + if (isOrdinary()) { Kind = StringToks[i].getKind(); } else { if (Diags) @@ -1895,7 +2066,8 @@ void StringLiteralParser::init(ArrayRef<Token> StringToks){ continue; } // Is this a Universal Character Name escape? - if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U') { + if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U' || + ThisTokBuf[1] == 'N') { EncodeUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, ResultPtr, hadError, FullSourceLoc(StringToks[i].getLocation(), SM), @@ -1990,7 +2162,7 @@ bool StringLiteralParser::CopyStringFragment(const Token &Tok, // If we see bad encoding for unprefixed string literals, warn and // simply copy the byte values, for compatibility with gcc and older // versions of clang. - bool NoErrorOnBadEncoding = isAscii(); + bool NoErrorOnBadEncoding = isOrdinary(); if (NoErrorOnBadEncoding) { memcpy(ResultPtr, Fragment.data(), Fragment.size()); ResultPtr += Fragment.size(); @@ -2088,7 +2260,8 @@ unsigned StringLiteralParser::getOffsetOfStringByte(const Token &Tok, // Otherwise, this is an escape character. Advance over it. bool HadError = false; - if (SpellingPtr[1] == 'u' || SpellingPtr[1] == 'U') { + if (SpellingPtr[1] == 'u' || SpellingPtr[1] == 'U' || + SpellingPtr[1] == 'N') { const char *EscapePtr = SpellingPtr; unsigned Len = MeasureUCNEscape(SpellingStart, SpellingPtr, SpellingEnd, 1, Features, HadError); |
