summary refs log tree commit diff
path: root/clang/lib/Format/FormatTokenLexer.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'clang/lib/Format/FormatTokenLexer.cpp')
-rw-r--r-- clang/lib/Format/FormatTokenLexer.cpp 291
1 files changed, 241 insertions, 50 deletions
diff --git a/clang/lib/Format/FormatTokenLexer.cpp b/clang/lib/Format/FormatTokenLexer.cpp
index ef20ba884fb3..1fd153d1112e 100644
--- a/clang/lib/Format/FormatTokenLexer.cpp
+++ b/clang/lib/Format/FormatTokenLexer.cpp
@@ -22,13 +22,15 @@
namespace clang {
namespace format {
-FormatTokenLexer::FormatTokenLexer(const SourceManager &SourceMgr, FileID ID,
- unsigned Column, const FormatStyle &Style,
- encoding::Encoding Encoding)
+FormatTokenLexer::FormatTokenLexer(
+ const SourceManager &SourceMgr, FileID ID, unsigned Column,
+ const FormatStyle &Style, encoding::Encoding Encoding,
+ llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator,
+ IdentifierTable &IdentTable)
: FormatTok(nullptr), IsFirstToken(true), StateStack({LexerState::NORMAL}),
Column(Column), TrailingWhitespace(0), SourceMgr(SourceMgr), ID(ID),
- Style(Style), IdentTable(getFormattingLangOpts(Style)),
- Keywords(IdentTable), Encoding(Encoding), FirstInLineIndex(0),
+ Style(Style), IdentTable(IdentTable), Keywords(IdentTable),
+ Encoding(Encoding), Allocator(Allocator), FirstInLineIndex(0),
FormattingDisabled(false), MacroBlockBeginRegex(Style.MacroBlockBegin),
MacroBlockEndRegex(Style.MacroBlockEnd) {
Lex.reset(new Lexer(ID, SourceMgr.getBuffer(ID), SourceMgr,
@@ -43,6 +45,11 @@ FormatTokenLexer::FormatTokenLexer(const SourceManager &SourceMgr, FileID ID,
Macros.insert({&IdentTable.get(TypenameMacro), TT_TypenameMacro});
for (const std::string &NamespaceMacro : Style.NamespaceMacros)
Macros.insert({&IdentTable.get(NamespaceMacro), TT_NamespaceMacro});
+ for (const std::string &WhitespaceSensitiveMacro :
+ Style.WhitespaceSensitiveMacros) {
+ Macros.insert(
+ {&IdentTable.get(WhitespaceSensitiveMacro), TT_UntouchableMacroFunc});
+ }
}
ArrayRef<FormatToken *> FormatTokenLexer::lex() {
@@ -57,6 +64,10 @@ ArrayRef<FormatToken *> FormatTokenLexer::lex() {
if (Style.Language == FormatStyle::LK_TextProto)
tryParsePythonComment();
tryMergePreviousTokens();
+ if (Style.isCSharp())
+ // This needs to come after tokens have been merged so that C#
+ // string literals are correctly identified.
+ handleCSharpVerbatimAndInterpolatedStrings();
if (Tokens.back()->NewlinesBefore > 0 || Tokens.back()->IsMultiline)
FirstInLineIndex = Tokens.size() - 1;
} while (Tokens.back()->Tok.isNot(tok::eof));
@@ -70,15 +81,19 @@ void FormatTokenLexer::tryMergePreviousTokens() {
return;
if (tryMergeLessLess())
return;
+ if (tryMergeForEach())
+ return;
+ if (Style.isCpp() && tryTransformTryUsageForC())
+ return;
if (Style.isCSharp()) {
if (tryMergeCSharpKeywordVariables())
return;
- if (tryMergeCSharpVerbatimStringLiteral())
+ if (tryMergeCSharpStringLiteral())
return;
if (tryMergeCSharpDoubleQuestion())
return;
- if (tryMergeCSharpNullConditionals())
+ if (tryMergeCSharpNullConditional())
return;
if (tryTransformCSharpForEach())
return;
@@ -120,8 +135,11 @@ void FormatTokenLexer::tryMergePreviousTokens() {
Tokens.back()->Tok.setKind(tok::starequal);
return;
}
- if (tryMergeTokens(JSNullishOperator, TT_JsNullishCoalescingOperator))
+ if (tryMergeTokens(JSNullishOperator, TT_JsNullishCoalescingOperator)) {
+ // Treat like the "||" operator (as opposed to the ternary ?).
+ Tokens.back()->Tok.setKind(tok::pipepipe);
return;
+ }
if (tryMergeTokens(JSNullPropagatingOperator,
TT_JsNullPropagatingOperator)) {
// Treat like a regular "." access.
@@ -151,7 +169,7 @@ bool FormatTokenLexer::tryMergeNSStringLiteral() {
At->TokenText = StringRef(At->TokenText.begin(),
String->TokenText.end() - At->TokenText.begin());
At->ColumnWidth += String->ColumnWidth;
- At->Type = TT_ObjCStringLiteral;
+ At->setType(TT_ObjCStringLiteral);
Tokens.erase(Tokens.end() - 1);
return true;
}
@@ -170,7 +188,7 @@ bool FormatTokenLexer::tryMergeJSPrivateIdentifier() {
StringRef(Hash->TokenText.begin(),
Identifier->TokenText.end() - Hash->TokenText.begin());
Hash->ColumnWidth += Identifier->ColumnWidth;
- Hash->Type = TT_JsPrivateIdentifier;
+ Hash->setType(TT_JsPrivateIdentifier);
Tokens.erase(Tokens.end() - 1);
return true;
}
@@ -178,18 +196,71 @@ bool FormatTokenLexer::tryMergeJSPrivateIdentifier() {
// Search for verbatim or interpolated string literals @"ABC" or
// $"aaaaa{abc}aaaaa" and mark the token as TT_CSharpStringLiteral, to
// prevent splitting of @, $ and ".
-bool FormatTokenLexer::tryMergeCSharpVerbatimStringLiteral() {
+// Merging of multiline verbatim strings with embedded '"' is handled in
+// handleCSharpVerbatimAndInterpolatedStrings with lower-level lexing.
+bool FormatTokenLexer::tryMergeCSharpStringLiteral() {
if (Tokens.size() < 2)
return false;
- auto &At = *(Tokens.end() - 2);
+
+ // Interpolated strings could contain { } with " characters inside.
+ // $"{x ?? "null"}"
+ // should not be split into $"{x ?? ", null, "}" but should be treated as a
+ // single string-literal.
+ //
+ // We opt not to try and format expressions inside {} within a C#
+ // interpolated string. Formatting expressions within an interpolated string
+ // would require similar work as that done for JavaScript template strings
+ // in `handleTemplateStrings()`.
+ auto &CSharpInterpolatedString = *(Tokens.end() - 2);
+ if (CSharpInterpolatedString->getType() == TT_CSharpStringLiteral &&
+ (CSharpInterpolatedString->TokenText.startswith(R"($")") ||
+ CSharpInterpolatedString->TokenText.startswith(R"($@")"))) {
+ int UnmatchedOpeningBraceCount = 0;
+
+ auto TokenTextSize = CSharpInterpolatedString->TokenText.size();
+ for (size_t Index = 0; Index < TokenTextSize; ++Index) {
+ char C = CSharpInterpolatedString->TokenText[Index];
+ if (C == '{') {
+ // "{{" inside an interpolated string is an escaped '{' so skip it.
+ if (Index + 1 < TokenTextSize &&
+ CSharpInterpolatedString->TokenText[Index + 1] == '{') {
+ ++Index;
+ continue;
+ }
+ ++UnmatchedOpeningBraceCount;
+ } else if (C == '}') {
+ // "}}" inside an interpolated string is an escaped '}' so skip it.
+ if (Index + 1 < TokenTextSize &&
+ CSharpInterpolatedString->TokenText[Index + 1] == '}') {
+ ++Index;
+ continue;
+ }
+ --UnmatchedOpeningBraceCount;
+ }
+ }
+
+ if (UnmatchedOpeningBraceCount > 0) {
+ auto &NextToken = *(Tokens.end() - 1);
+ CSharpInterpolatedString->TokenText =
+ StringRef(CSharpInterpolatedString->TokenText.begin(),
+ NextToken->TokenText.end() -
+ CSharpInterpolatedString->TokenText.begin());
+ CSharpInterpolatedString->ColumnWidth += NextToken->ColumnWidth;
+ Tokens.erase(Tokens.end() - 1);
+ return true;
+ }
+ }
+
+ // Look for @"aaaaaa" or $"aaaaaa".
auto &String = *(Tokens.end() - 1);
+ if (!String->is(tok::string_literal))
+ return false;
- // Look for $"aaaaaa" @"aaaaaa".
- if (!(At->is(tok::at) || At->TokenText == "$") ||
- !String->is(tok::string_literal))
+ auto &At = *(Tokens.end() - 2);
+ if (!(At->is(tok::at) || At->TokenText == "$"))
return false;
- if (Tokens.size() >= 2 && At->is(tok::at)) {
+ if (Tokens.size() > 2 && At->is(tok::at)) {
auto &Dollar = *(Tokens.end() - 3);
if (Dollar->TokenText == "$") {
// This looks like $@"aaaaa" so we need to combine all 3 tokens.
@@ -198,7 +269,7 @@ bool FormatTokenLexer::tryMergeCSharpVerbatimStringLiteral() {
StringRef(Dollar->TokenText.begin(),
String->TokenText.end() - Dollar->TokenText.begin());
Dollar->ColumnWidth += (At->ColumnWidth + String->ColumnWidth);
- Dollar->Type = TT_CSharpStringLiteral;
+ Dollar->setType(TT_CSharpStringLiteral);
Tokens.erase(Tokens.end() - 2);
Tokens.erase(Tokens.end() - 1);
return true;
@@ -210,11 +281,18 @@ bool FormatTokenLexer::tryMergeCSharpVerbatimStringLiteral() {
At->TokenText = StringRef(At->TokenText.begin(),
String->TokenText.end() - At->TokenText.begin());
At->ColumnWidth += String->ColumnWidth;
- At->Type = TT_CSharpStringLiteral;
+ At->setType(TT_CSharpStringLiteral);
Tokens.erase(Tokens.end() - 1);
return true;
}
+// Set of the names that are valid C# attribute targets, as listed at:
+// https://docs.microsoft.com/en-us/dotnet/csharp/programming-guide/concepts/attributes/#attribute-targets
+const llvm::StringSet<> FormatTokenLexer::CSharpAttributeTargets = {
+ "assembly", "module", "field", "event", "method",
+ "param", "property", "return", "type",
+};
+
bool FormatTokenLexer::tryMergeCSharpDoubleQuestion() {
if (Tokens.size() < 2)
return false;
@@ -222,12 +300,38 @@ bool FormatTokenLexer::tryMergeCSharpDoubleQuestion() {
auto &SecondQuestion = *(Tokens.end() - 1);
if (!FirstQuestion->is(tok::question) || !SecondQuestion->is(tok::question))
return false;
- FirstQuestion->Tok.setKind(tok::question);
+ FirstQuestion->Tok.setKind(tok::question); // no '??' in clang tokens.
FirstQuestion->TokenText = StringRef(FirstQuestion->TokenText.begin(),
SecondQuestion->TokenText.end() -
FirstQuestion->TokenText.begin());
FirstQuestion->ColumnWidth += SecondQuestion->ColumnWidth;
- FirstQuestion->Type = TT_CSharpNullCoalescing;
+ FirstQuestion->setType(TT_CSharpNullCoalescing);
+ Tokens.erase(Tokens.end() - 1);
+ return true;
+}
+
+// Merge '?[' and '?.' pairs (a '?' token followed by a '[' or '.' token) into a single token; returns true if a merge happened.
+bool FormatTokenLexer::tryMergeCSharpNullConditional() {
+ if (Tokens.size() < 2) // Need both the '?' and the token after it.
+ return false;
+ auto &Question = *(Tokens.end() - 2);
+ auto &PeriodOrLSquare = *(Tokens.end() - 1);
+ if (!Question->is(tok::question) ||
+ !PeriodOrLSquare->isOneOf(tok::l_square, tok::period))
+ return false;
+ Question->TokenText = // Widen the '?' text so it runs through the end of the second token.
+ StringRef(Question->TokenText.begin(),
+ PeriodOrLSquare->TokenText.end() - Question->TokenText.begin());
+ Question->ColumnWidth += PeriodOrLSquare->ColumnWidth;
+
+ if (PeriodOrLSquare->is(tok::l_square)) {
+ Question->Tok.setKind(tok::question); // no '?[' in clang tokens.
+ Question->setType(TT_CSharpNullConditionalLSquare);
+ } else {
+ Question->Tok.setKind(tok::question); // no '?.' in clang tokens.
+ Question->setType(TT_CSharpNullConditional);
+ }
+

Tokens.erase(Tokens.end() - 1); // Drop the second token now that the first one covers it.
return true;
}
@@ -246,24 +350,7 @@ bool FormatTokenLexer::tryMergeCSharpKeywordVariables() {
At->TokenText = StringRef(At->TokenText.begin(),
Keyword->TokenText.end() - At->TokenText.begin());
At->ColumnWidth += Keyword->ColumnWidth;
- At->Type = Keyword->Type;
- Tokens.erase(Tokens.end() - 1);
- return true;
-}
-
-// In C# merge the Identifier and the ? together e.g. arg?.
-bool FormatTokenLexer::tryMergeCSharpNullConditionals() {
- if (Tokens.size() < 2)
- return false;
- auto &Identifier = *(Tokens.end() - 2);
- auto &Question = *(Tokens.end() - 1);
- if (!Identifier->isOneOf(tok::r_square, tok::identifier) ||
- !Question->is(tok::question))
- return false;
- Identifier->TokenText =
- StringRef(Identifier->TokenText.begin(),
- Question->TokenText.end() - Identifier->TokenText.begin());
- Identifier->ColumnWidth += Question->ColumnWidth;
+ At->setType(Keyword->getType());
Tokens.erase(Tokens.end() - 1);
return true;
}
@@ -278,11 +365,53 @@ bool FormatTokenLexer::tryTransformCSharpForEach() {
if (Identifier->TokenText != "foreach")
return false;
- Identifier->Type = TT_ForEachMacro;
+ Identifier->setType(TT_ForEachMacro);
Identifier->Tok.setKind(tok::kw_for);
return true;
}
+bool FormatTokenLexer::tryMergeForEach() { // Merge a 'for' keyword token followed by an identifier spelled each into one TT_ForEachMacro token; returns true if merged.
+ if (Tokens.size() < 2) // Need both tokens of the pair.
+ return false;
+ auto &For = *(Tokens.end() - 2);
+ auto &Each = *(Tokens.end() - 1);
+ if (!For->is(tok::kw_for))
+ return false;
+ if (!Each->is(tok::identifier))
+ return false;
+ if (Each->TokenText != "each")
+ return false;
+
+ For->setType(TT_ForEachMacro);
+ For->Tok.setKind(tok::kw_for); // NOTE(review): the kind was checked to be kw_for above, so this looks redundant -- confirm.
+
+ For->TokenText = StringRef(For->TokenText.begin(), // Widen the text so it runs from the start of 'for' through the end of the second token.
+ Each->TokenText.end() - For->TokenText.begin());
+ For->ColumnWidth += Each->ColumnWidth;
+ Tokens.erase(Tokens.end() - 1); // Drop the second token now that the first one covers it.
+ return true;
+}
+
+bool FormatTokenLexer::tryTransformTryUsageForC() { // Retag a 'try' keyword token as a plain identifier when it is not followed by '{' or ':' and not preceded by '@'; returns true if retagged.
+ if (Tokens.size() < 2) // Need 'try' plus the token after it.
+ return false;
+ auto &Try = *(Tokens.end() - 2);
+ if (!Try->is(tok::kw_try))
+ return false;
+ auto &Next = *(Tokens.end() - 1);
+ if (Next->isOneOf(tok::l_brace, tok::colon)) // Presumably a genuine try block or function-try-block; leave the keyword alone.
+ return false;
+
+ if (Tokens.size() > 2) {
+ auto &At = *(Tokens.end() - 3);
+ if (At->is(tok::at)) // Presumably Objective-C '@try'; leave the keyword alone.
+ return false;
+ }
+
+ Try->Tok.setKind(tok::identifier); // Only the token kind changes; no tokens are merged or erased.
+ return true;
+}
+
bool FormatTokenLexer::tryMergeLessLess() {
// Merge X,less,less,Y into X,lessless,Y unless X or Y is less.
if (Tokens.size() < 3)
@@ -329,7 +458,7 @@ bool FormatTokenLexer::tryMergeTokens(ArrayRef<tok::TokenKind> Kinds,
First[0]->TokenText = StringRef(First[0]->TokenText.data(),
First[0]->TokenText.size() + AddLength);
First[0]->ColumnWidth += AddLength;
- First[0]->Type = NewType;
+ First[0]->setType(NewType);
return true;
}
@@ -418,7 +547,7 @@ void FormatTokenLexer::tryParseJSRegexLiteral() {
}
}
- RegexToken->Type = TT_RegexLiteral;
+ RegexToken->setType(TT_RegexLiteral);
// Treat regex literals like other string_literals.
RegexToken->Tok.setKind(tok::string_literal);
RegexToken->TokenText = StringRef(RegexBegin, Offset - RegexBegin);
@@ -427,6 +556,68 @@ void FormatTokenLexer::tryParseJSRegexLiteral() {
resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset)));
}
+void FormatTokenLexer::handleCSharpVerbatimAndInterpolatedStrings() { // Scans the raw buffer for the real end of a C# verbatim string so the whole literal, including newlines and doubled quotes, becomes one token.
+ FormatToken *CSharpStringLiteral = Tokens.back();
+
+ if (CSharpStringLiteral->getType() != TT_CSharpStringLiteral)
+ return;
+
+ // Deal with multiline strings: only literals with a verbatim prefix (@ or $@) are processed below.
+ if (!(CSharpStringLiteral->TokenText.startswith(R"(@")") ||
+ CSharpStringLiteral->TokenText.startswith(R"($@")")))
+ return;
+
+ const char *StrBegin = // NOTE(review): assumes the lexer is positioned immediately after this token's text -- confirm.
+ Lex->getBufferLocation() - CSharpStringLiteral->TokenText.size();
+ const char *Offset = StrBegin;
+ if (CSharpStringLiteral->TokenText.startswith(R"(@")"))
+ Offset += 2; // Skip past the @ and the opening quote.
+ else // CSharpStringLiteral->TokenText.startswith(R"($@")")
+ Offset += 3; // Skip past the $, the @ and the opening quote.
+
+ // Look for a terminating '"' in the current file buffer.
+ // Make no effort to format code within an interpolated or verbatim string.
+ for (; Offset != Lex->getBuffer().end(); ++Offset) {
+ if (Offset[0] == '"') {
+ // "" within a verbatim string is an escaped double quote: skip it.
+ if (Offset + 1 < Lex->getBuffer().end() && Offset[1] == '"')
+ ++Offset;
+ else
+ break;
+ }
+ }
+
+ // Make no attempt to format code properly if a verbatim string is
+ // unterminated.
+ if (Offset == Lex->getBuffer().end())
+ return;
+
+ StringRef LiteralText(StrBegin, Offset - StrBegin + 1); // +1 so the closing quote is included.
+ CSharpStringLiteral->TokenText = LiteralText;
+
+ // Adjust width for potentially multiline string literals.
+ size_t FirstBreak = LiteralText.find('\n');
+ StringRef FirstLineText = FirstBreak == StringRef::npos
+ ? LiteralText
+ : LiteralText.substr(0, FirstBreak);
+ CSharpStringLiteral->ColumnWidth = encoding::columnWidthWithTabs( // Width of the first line of the literal only.
+ FirstLineText, CSharpStringLiteral->OriginalColumn, Style.TabWidth,
+ Encoding);
+ size_t LastBreak = LiteralText.rfind('\n');
+ if (LastBreak != StringRef::npos) {
+ CSharpStringLiteral->IsMultiline = true;
+ unsigned StartColumn = 0; // The text after the last newline is measured from column 0.
+ CSharpStringLiteral->LastLineColumnWidth = encoding::columnWidthWithTabs(
+ LiteralText.substr(LastBreak + 1, LiteralText.size()), StartColumn,
+ Style.TabWidth, Encoding);
+ }
+
+ SourceLocation loc = Offset < Lex->getBuffer().end() // NOTE(review): the unterminated case returned above, so this condition looks always true here -- confirm.
+ ? Lex->getSourceLocation(Offset + 1)
+ : SourceMgr.getLocForEndOfFile(ID);
+ resetLexer(SourceMgr.getFileOffset(loc)); // Restart lexing from the location computed above (just past the closing quote).
+}
+
void FormatTokenLexer::handleTemplateStrings() {
FormatToken *BacktickToken = Tokens.back();
@@ -468,7 +659,7 @@ void FormatTokenLexer::handleTemplateStrings() {
}
StringRef LiteralText(TmplBegin, Offset - TmplBegin + 1);
- BacktickToken->Type = TT_TemplateString;
+ BacktickToken->setType(TT_TemplateString);
BacktickToken->Tok.setKind(tok::string_literal);
BacktickToken->TokenText = LiteralText;
@@ -506,7 +697,7 @@ void FormatTokenLexer::tryParsePythonComment() {
if (To == StringRef::npos)
To = Lex->getBuffer().size();
size_t Len = To - From;
- HashToken->Type = TT_LineComment;
+ HashToken->setType(TT_LineComment);
HashToken->Tok.setKind(tok::comment);
HashToken->TokenText = Lex->getBuffer().substr(From, Len);
SourceLocation Loc = To < Lex->getBuffer().size()
@@ -604,7 +795,7 @@ bool FormatTokenLexer::tryMergeConflictMarkers() {
// We do not need to build a complete token here, as we will skip it
// during parsing anyway (as we must not touch whitespace around conflict
// markers).
- Tokens.back()->Type = Type;
+ Tokens.back()->setType(Type);
Tokens.back()->Tok.setKind(tok::kw___unknown_anytype);
Tokens.push_back(Next);
@@ -691,13 +882,13 @@ FormatToken *FormatTokenLexer::getNextToken() {
break;
case '\\':
if (i + 1 == e || (Text[i + 1] != '\r' && Text[i + 1] != '\n'))
- FormatTok->Type = TT_ImplicitStringLiteral;
+ FormatTok->setType(TT_ImplicitStringLiteral);
break;
default:
- FormatTok->Type = TT_ImplicitStringLiteral;
+ FormatTok->setType(TT_ImplicitStringLiteral);
break;
}
- if (FormatTok->Type == TT_ImplicitStringLiteral)
+ if (FormatTok->getType() == TT_ImplicitStringLiteral)
break;
}
@@ -825,12 +1016,12 @@ FormatToken *FormatTokenLexer::getNextToken() {
Tokens.back()->Tok.getIdentifierInfo()->getPPKeywordID() ==
tok::pp_define) &&
it != Macros.end()) {
- FormatTok->Type = it->second;
+ FormatTok->setType(it->second);
} else if (FormatTok->is(tok::identifier)) {
if (MacroBlockBeginRegex.match(Text)) {
- FormatTok->Type = TT_MacroBlockBegin;
+ FormatTok->setType(TT_MacroBlockBegin);
} else if (MacroBlockEndRegex.match(Text)) {
- FormatTok->Type = TT_MacroBlockEnd;
+ FormatTok->setType(TT_MacroBlockEnd);
}
}
}