diff options
Diffstat (limited to 'clang/lib/Format/FormatTokenLexer.cpp')
-rw-r--r-- | clang/lib/Format/FormatTokenLexer.cpp | 291 |
1 files changed, 241 insertions, 50 deletions
diff --git a/clang/lib/Format/FormatTokenLexer.cpp b/clang/lib/Format/FormatTokenLexer.cpp index ef20ba884fb3..1fd153d1112e 100644 --- a/clang/lib/Format/FormatTokenLexer.cpp +++ b/clang/lib/Format/FormatTokenLexer.cpp @@ -22,13 +22,15 @@ namespace clang { namespace format { -FormatTokenLexer::FormatTokenLexer(const SourceManager &SourceMgr, FileID ID, - unsigned Column, const FormatStyle &Style, - encoding::Encoding Encoding) +FormatTokenLexer::FormatTokenLexer( + const SourceManager &SourceMgr, FileID ID, unsigned Column, + const FormatStyle &Style, encoding::Encoding Encoding, + llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator, + IdentifierTable &IdentTable) : FormatTok(nullptr), IsFirstToken(true), StateStack({LexerState::NORMAL}), Column(Column), TrailingWhitespace(0), SourceMgr(SourceMgr), ID(ID), - Style(Style), IdentTable(getFormattingLangOpts(Style)), - Keywords(IdentTable), Encoding(Encoding), FirstInLineIndex(0), + Style(Style), IdentTable(IdentTable), Keywords(IdentTable), + Encoding(Encoding), Allocator(Allocator), FirstInLineIndex(0), FormattingDisabled(false), MacroBlockBeginRegex(Style.MacroBlockBegin), MacroBlockEndRegex(Style.MacroBlockEnd) { Lex.reset(new Lexer(ID, SourceMgr.getBuffer(ID), SourceMgr, @@ -43,6 +45,11 @@ FormatTokenLexer::FormatTokenLexer(const SourceManager &SourceMgr, FileID ID, Macros.insert({&IdentTable.get(TypenameMacro), TT_TypenameMacro}); for (const std::string &NamespaceMacro : Style.NamespaceMacros) Macros.insert({&IdentTable.get(NamespaceMacro), TT_NamespaceMacro}); + for (const std::string &WhitespaceSensitiveMacro : + Style.WhitespaceSensitiveMacros) { + Macros.insert( + {&IdentTable.get(WhitespaceSensitiveMacro), TT_UntouchableMacroFunc}); + } } ArrayRef<FormatToken *> FormatTokenLexer::lex() { @@ -57,6 +64,10 @@ ArrayRef<FormatToken *> FormatTokenLexer::lex() { if (Style.Language == FormatStyle::LK_TextProto) tryParsePythonComment(); tryMergePreviousTokens(); + if (Style.isCSharp()) + // This needs to come after tokens have been merged so that C# + // string literals are correctly identified. + handleCSharpVerbatimAndInterpolatedStrings(); if (Tokens.back()->NewlinesBefore > 0 || Tokens.back()->IsMultiline) FirstInLineIndex = Tokens.size() - 1; } while (Tokens.back()->Tok.isNot(tok::eof)); @@ -70,15 +81,19 @@ void FormatTokenLexer::tryMergePreviousTokens() { return; if (tryMergeLessLess()) return; + if (tryMergeForEach()) + return; + if (Style.isCpp() && tryTransformTryUsageForC()) + return; if (Style.isCSharp()) { if (tryMergeCSharpKeywordVariables()) return; - if (tryMergeCSharpVerbatimStringLiteral()) + if (tryMergeCSharpStringLiteral()) return; if (tryMergeCSharpDoubleQuestion()) return; - if (tryMergeCSharpNullConditionals()) + if (tryMergeCSharpNullConditional()) return; if (tryTransformCSharpForEach()) return; @@ -120,8 +135,11 @@ void FormatTokenLexer::tryMergePreviousTokens() { Tokens.back()->Tok.setKind(tok::starequal); return; } - if (tryMergeTokens(JSNullishOperator, TT_JsNullishCoalescingOperator)) + if (tryMergeTokens(JSNullishOperator, TT_JsNullishCoalescingOperator)) { + // Treat like the "||" operator (as opposed to the ternary ?). + Tokens.back()->Tok.setKind(tok::pipepipe); return; + } if (tryMergeTokens(JSNullPropagatingOperator, TT_JsNullPropagatingOperator)) { // Treat like a regular "." access. @@ -151,7 +169,7 @@ bool FormatTokenLexer::tryMergeNSStringLiteral() { At->TokenText = StringRef(At->TokenText.begin(), String->TokenText.end() - At->TokenText.begin()); At->ColumnWidth += String->ColumnWidth; - At->Type = TT_ObjCStringLiteral; + At->setType(TT_ObjCStringLiteral); Tokens.erase(Tokens.end() - 1); return true; } @@ -170,7 +188,7 @@ bool FormatTokenLexer::tryMergeJSPrivateIdentifier() { StringRef(Hash->TokenText.begin(), Identifier->TokenText.end() - Hash->TokenText.begin()); Hash->ColumnWidth += Identifier->ColumnWidth; - Hash->Type = TT_JsPrivateIdentifier; + Hash->setType(TT_JsPrivateIdentifier); Tokens.erase(Tokens.end() - 1); return true; } @@ -178,18 +196,71 @@ bool FormatTokenLexer::tryMergeJSPrivateIdentifier() { // Search for verbatim or interpolated string literals @"ABC" or // $"aaaaa{abc}aaaaa" i and mark the token as TT_CSharpStringLiteral, and to // prevent splitting of @, $ and ". -bool FormatTokenLexer::tryMergeCSharpVerbatimStringLiteral() { +// Merging of multiline verbatim strings with embedded '"' is handled in +// handleCSharpVerbatimAndInterpolatedStrings with lower-level lexing. +bool FormatTokenLexer::tryMergeCSharpStringLiteral() { if (Tokens.size() < 2) return false; - auto &At = *(Tokens.end() - 2); + + // Interpolated strings could contain { } with " characters inside. + // $"{x ?? "null"}" + // should not be split into $"{x ?? ", null, "}" but should treated as a + // single string-literal. + // + // We opt not to try and format expressions inside {} within a C# + // interpolated string. Formatting expressions within an interpolated string + // would require similar work as that done for JavaScript template strings + // in `handleTemplateStrings()`. + auto &CSharpInterpolatedString = *(Tokens.end() - 2); + if (CSharpInterpolatedString->getType() == TT_CSharpStringLiteral && + (CSharpInterpolatedString->TokenText.startswith(R"($")") || + CSharpInterpolatedString->TokenText.startswith(R"($@")"))) { + int UnmatchedOpeningBraceCount = 0; + + auto TokenTextSize = CSharpInterpolatedString->TokenText.size(); + for (size_t Index = 0; Index < TokenTextSize; ++Index) { + char C = CSharpInterpolatedString->TokenText[Index]; + if (C == '{') { + // "{{" inside an interpolated string is an escaped '{' so skip it. + if (Index + 1 < TokenTextSize && + CSharpInterpolatedString->TokenText[Index + 1] == '{') { + ++Index; + continue; + } + ++UnmatchedOpeningBraceCount; + } else if (C == '}') { + // "}}" inside an interpolated string is an escaped '}' so skip it. + if (Index + 1 < TokenTextSize && + CSharpInterpolatedString->TokenText[Index + 1] == '}') { + ++Index; + continue; + } + --UnmatchedOpeningBraceCount; + } + } + + if (UnmatchedOpeningBraceCount > 0) { + auto &NextToken = *(Tokens.end() - 1); + CSharpInterpolatedString->TokenText = + StringRef(CSharpInterpolatedString->TokenText.begin(), + NextToken->TokenText.end() - + CSharpInterpolatedString->TokenText.begin()); + CSharpInterpolatedString->ColumnWidth += NextToken->ColumnWidth; + Tokens.erase(Tokens.end() - 1); + return true; + } + } + + // Look for @"aaaaaa" or $"aaaaaa". auto &String = *(Tokens.end() - 1); + if (!String->is(tok::string_literal)) + return false; - // Look for $"aaaaaa" @"aaaaaa". - if (!(At->is(tok::at) || At->TokenText == "$") || - !String->is(tok::string_literal)) + auto &At = *(Tokens.end() - 2); + if (!(At->is(tok::at) || At->TokenText == "$")) return false; - if (Tokens.size() >= 2 && At->is(tok::at)) { + if (Tokens.size() > 2 && At->is(tok::at)) { auto &Dollar = *(Tokens.end() - 3); if (Dollar->TokenText == "$") { // This looks like $@"aaaaa" so we need to combine all 3 tokens. @@ -198,7 +269,7 @@ bool FormatTokenLexer::tryMergeCSharpVerbatimStringLiteral() { StringRef(Dollar->TokenText.begin(), String->TokenText.end() - Dollar->TokenText.begin()); Dollar->ColumnWidth += (At->ColumnWidth + String->ColumnWidth); - Dollar->Type = TT_CSharpStringLiteral; + Dollar->setType(TT_CSharpStringLiteral); Tokens.erase(Tokens.end() - 2); Tokens.erase(Tokens.end() - 1); return true; @@ -210,11 +281,18 @@ bool FormatTokenLexer::tryMergeCSharpVerbatimStringLiteral() { At->TokenText = StringRef(At->TokenText.begin(), String->TokenText.end() - At->TokenText.begin()); At->ColumnWidth += String->ColumnWidth; - At->Type = TT_CSharpStringLiteral; + At->setType(TT_CSharpStringLiteral); Tokens.erase(Tokens.end() - 1); return true; } +// Valid C# attribute targets: +// https://docs.microsoft.com/en-us/dotnet/csharp/programming-guide/concepts/attributes/#attribute-targets +const llvm::StringSet<> FormatTokenLexer::CSharpAttributeTargets = { + "assembly", "module", "field", "event", "method", + "param", "property", "return", "type", +}; + bool FormatTokenLexer::tryMergeCSharpDoubleQuestion() { if (Tokens.size() < 2) return false; @@ -222,12 +300,38 @@ bool FormatTokenLexer::tryMergeCSharpDoubleQuestion() { auto &SecondQuestion = *(Tokens.end() - 1); if (!FirstQuestion->is(tok::question) || !SecondQuestion->is(tok::question)) return false; - FirstQuestion->Tok.setKind(tok::question); + FirstQuestion->Tok.setKind(tok::question); // no '??' in clang tokens. FirstQuestion->TokenText = StringRef(FirstQuestion->TokenText.begin(), SecondQuestion->TokenText.end() - FirstQuestion->TokenText.begin()); FirstQuestion->ColumnWidth += SecondQuestion->ColumnWidth; - FirstQuestion->Type = TT_CSharpNullCoalescing; + FirstQuestion->setType(TT_CSharpNullCoalescing); + Tokens.erase(Tokens.end() - 1); + return true; +} + +// Merge '?[' and '?.' pairs into single tokens. +bool FormatTokenLexer::tryMergeCSharpNullConditional() { + if (Tokens.size() < 2) + return false; + auto &Question = *(Tokens.end() - 2); + auto &PeriodOrLSquare = *(Tokens.end() - 1); + if (!Question->is(tok::question) || + !PeriodOrLSquare->isOneOf(tok::l_square, tok::period)) + return false; + Question->TokenText = + StringRef(Question->TokenText.begin(), + PeriodOrLSquare->TokenText.end() - Question->TokenText.begin()); + Question->ColumnWidth += PeriodOrLSquare->ColumnWidth; + + if (PeriodOrLSquare->is(tok::l_square)) { + Question->Tok.setKind(tok::question); // no '?[' in clang tokens. + Question->setType(TT_CSharpNullConditionalLSquare); + } else { + Question->Tok.setKind(tok::question); // no '?.' in clang tokens. + Question->setType(TT_CSharpNullConditional); + } + Tokens.erase(Tokens.end() - 1); return true; } @@ -246,24 +350,7 @@ bool FormatTokenLexer::tryMergeCSharpKeywordVariables() { At->TokenText = StringRef(At->TokenText.begin(), Keyword->TokenText.end() - At->TokenText.begin()); At->ColumnWidth += Keyword->ColumnWidth; - At->Type = Keyword->Type; - Tokens.erase(Tokens.end() - 1); - return true; -} - -// In C# merge the Identifier and the ? together e.g. arg?. -bool FormatTokenLexer::tryMergeCSharpNullConditionals() { - if (Tokens.size() < 2) - return false; - auto &Identifier = *(Tokens.end() - 2); - auto &Question = *(Tokens.end() - 1); - if (!Identifier->isOneOf(tok::r_square, tok::identifier) || - !Question->is(tok::question)) - return false; - Identifier->TokenText = - StringRef(Identifier->TokenText.begin(), - Question->TokenText.end() - Identifier->TokenText.begin()); - Identifier->ColumnWidth += Question->ColumnWidth; + At->setType(Keyword->getType()); Tokens.erase(Tokens.end() - 1); return true; } @@ -278,11 +365,53 @@ bool FormatTokenLexer::tryTransformCSharpForEach() { if (Identifier->TokenText != "foreach") return false; - Identifier->Type = TT_ForEachMacro; + Identifier->setType(TT_ForEachMacro); Identifier->Tok.setKind(tok::kw_for); return true; } +bool FormatTokenLexer::tryMergeForEach() { + if (Tokens.size() < 2) + return false; + auto &For = *(Tokens.end() - 2); + auto &Each = *(Tokens.end() - 1); + if (!For->is(tok::kw_for)) + return false; + if (!Each->is(tok::identifier)) + return false; + if (Each->TokenText != "each") + return false; + + For->setType(TT_ForEachMacro); + For->Tok.setKind(tok::kw_for); + + For->TokenText = StringRef(For->TokenText.begin(), + Each->TokenText.end() - For->TokenText.begin()); + For->ColumnWidth += Each->ColumnWidth; + Tokens.erase(Tokens.end() - 1); + return true; +} + +bool FormatTokenLexer::tryTransformTryUsageForC() { + if (Tokens.size() < 2) + return false; + auto &Try = *(Tokens.end() - 2); + if (!Try->is(tok::kw_try)) + return false; + auto &Next = *(Tokens.end() - 1); + if (Next->isOneOf(tok::l_brace, tok::colon)) + return false; + + if (Tokens.size() > 2) { + auto &At = *(Tokens.end() - 3); + if (At->is(tok::at)) + return false; + } + + Try->Tok.setKind(tok::identifier); + return true; +} + bool FormatTokenLexer::tryMergeLessLess() { // Merge X,less,less,Y into X,lessless,Y unless X or Y is less. if (Tokens.size() < 3) @@ -329,7 +458,7 @@ bool FormatTokenLexer::tryMergeTokens(ArrayRef<tok::TokenKind> Kinds, First[0]->TokenText = StringRef(First[0]->TokenText.data(), First[0]->TokenText.size() + AddLength); First[0]->ColumnWidth += AddLength; - First[0]->Type = NewType; + First[0]->setType(NewType); return true; } @@ -418,7 +547,7 @@ void FormatTokenLexer::tryParseJSRegexLiteral() { } } - RegexToken->Type = TT_RegexLiteral; + RegexToken->setType(TT_RegexLiteral); // Treat regex literals like other string_literals. RegexToken->Tok.setKind(tok::string_literal); RegexToken->TokenText = StringRef(RegexBegin, Offset - RegexBegin); @@ -427,6 +556,68 @@ void FormatTokenLexer::tryParseJSRegexLiteral() { resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset))); } +void FormatTokenLexer::handleCSharpVerbatimAndInterpolatedStrings() { + FormatToken *CSharpStringLiteral = Tokens.back(); + + if (CSharpStringLiteral->getType() != TT_CSharpStringLiteral) + return; + + // Deal with multiline strings. + if (!(CSharpStringLiteral->TokenText.startswith(R"(@")") || + CSharpStringLiteral->TokenText.startswith(R"($@")"))) + return; + + const char *StrBegin = + Lex->getBufferLocation() - CSharpStringLiteral->TokenText.size(); + const char *Offset = StrBegin; + if (CSharpStringLiteral->TokenText.startswith(R"(@")")) + Offset += 2; + else // CSharpStringLiteral->TokenText.startswith(R"($@")") + Offset += 3; + + // Look for a terminating '"' in the current file buffer. + // Make no effort to format code within an interpolated or verbatim string. + for (; Offset != Lex->getBuffer().end(); ++Offset) { + if (Offset[0] == '"') { + // "" within a verbatim string is an escaped double quote: skip it. + if (Offset + 1 < Lex->getBuffer().end() && Offset[1] == '"') + ++Offset; + else + break; + } + } + + // Make no attempt to format code properly if a verbatim string is + // unterminated. + if (Offset == Lex->getBuffer().end()) + return; + + StringRef LiteralText(StrBegin, Offset - StrBegin + 1); + CSharpStringLiteral->TokenText = LiteralText; + + // Adjust width for potentially multiline string literals. + size_t FirstBreak = LiteralText.find('\n'); + StringRef FirstLineText = FirstBreak == StringRef::npos + ? LiteralText + : LiteralText.substr(0, FirstBreak); + CSharpStringLiteral->ColumnWidth = encoding::columnWidthWithTabs( + FirstLineText, CSharpStringLiteral->OriginalColumn, Style.TabWidth, + Encoding); + size_t LastBreak = LiteralText.rfind('\n'); + if (LastBreak != StringRef::npos) { + CSharpStringLiteral->IsMultiline = true; + unsigned StartColumn = 0; + CSharpStringLiteral->LastLineColumnWidth = encoding::columnWidthWithTabs( + LiteralText.substr(LastBreak + 1, LiteralText.size()), StartColumn, + Style.TabWidth, Encoding); + } + + SourceLocation loc = Offset < Lex->getBuffer().end() + ? Lex->getSourceLocation(Offset + 1) + : SourceMgr.getLocForEndOfFile(ID); + resetLexer(SourceMgr.getFileOffset(loc)); +} + void FormatTokenLexer::handleTemplateStrings() { FormatToken *BacktickToken = Tokens.back(); @@ -468,7 +659,7 @@ void FormatTokenLexer::handleTemplateStrings() { } StringRef LiteralText(TmplBegin, Offset - TmplBegin + 1); - BacktickToken->Type = TT_TemplateString; + BacktickToken->setType(TT_TemplateString); BacktickToken->Tok.setKind(tok::string_literal); BacktickToken->TokenText = LiteralText; @@ -506,7 +697,7 @@ void FormatTokenLexer::tryParsePythonComment() { if (To == StringRef::npos) To = Lex->getBuffer().size(); size_t Len = To - From; - HashToken->Type = TT_LineComment; + HashToken->setType(TT_LineComment); HashToken->Tok.setKind(tok::comment); HashToken->TokenText = Lex->getBuffer().substr(From, Len); SourceLocation Loc = To < Lex->getBuffer().size() @@ -604,7 +795,7 @@ bool FormatTokenLexer::tryMergeConflictMarkers() { // We do not need to build a complete token here, as we will skip it // during parsing anyway (as we must not touch whitespace around conflict // markers). - Tokens.back()->Type = Type; + Tokens.back()->setType(Type); Tokens.back()->Tok.setKind(tok::kw___unknown_anytype); Tokens.push_back(Next); @@ -691,13 +882,13 @@ FormatToken *FormatTokenLexer::getNextToken() { break; case '\\': if (i + 1 == e || (Text[i + 1] != '\r' && Text[i + 1] != '\n')) - FormatTok->Type = TT_ImplicitStringLiteral; + FormatTok->setType(TT_ImplicitStringLiteral); break; default: - FormatTok->Type = TT_ImplicitStringLiteral; + FormatTok->setType(TT_ImplicitStringLiteral); break; } - if (FormatTok->Type == TT_ImplicitStringLiteral) + if (FormatTok->getType() == TT_ImplicitStringLiteral) break; } @@ -825,12 +1016,12 @@ FormatToken *FormatTokenLexer::getNextToken() { Tokens.back()->Tok.getIdentifierInfo()->getPPKeywordID() == tok::pp_define) && it != Macros.end()) { - FormatTok->Type = it->second; + FormatTok->setType(it->second); } else if (FormatTok->is(tok::identifier)) { if (MacroBlockBeginRegex.match(Text)) { - FormatTok->Type = TT_MacroBlockBegin; + FormatTok->setType(TT_MacroBlockBegin); } else if (MacroBlockEndRegex.match(Text)) { - FormatTok->Type = TT_MacroBlockEnd; + FormatTok->setType(TT_MacroBlockEnd); } } } |