1 files changed, 148 insertions, 36 deletions
diff --git a/clang/lib/Format/FormatTokenLexer.cpp b/clang/lib/Format/FormatTokenLexer.cpp
index 3f9b68ccbb39..f8f5f7112188 100644
--- a/clang/lib/Format/FormatTokenLexer.cpp
+++ b/clang/lib/Format/FormatTokenLexer.cpp
@@ -193,6 +193,78 @@ void FormatTokenLexer::tryMergePreviousTokens() {
     if (tryMergeTokens(JavaRightLogicalShiftAssign, TT_BinaryOperator))
       return;
   }
+
+  if (Style.isVerilog()) {
+    // Merge the number following a base like `'h?a0`.
+    if (Tokens.size() >= 3 && Tokens.end()[-3]->is(TT_VerilogNumberBase) &&
+        Tokens.end()[-2]->is(tok::numeric_constant) &&
+        Tokens.back()->isOneOf(tok::numeric_constant, tok::identifier,
+                               tok::question) &&
+        tryMergeTokens(2, TT_Unknown)) {
+      return;
+    }
+    // Part select.
+    if (tryMergeTokensAny({{tok::minus, tok::colon}, {tok::plus, tok::colon}},
+                          TT_BitFieldColon)) {
+      return;
+    }
+    // Xnor. The combined token is treated as a caret which can also be either a
+    // unary or binary operator. The actual type is determined in
+    // TokenAnnotator. We also check the token length so we know it is not
+    // already a merged token.
+    if (Tokens.back()->TokenText.size() == 1 &&
+        tryMergeTokensAny({{tok::caret, tok::tilde}, {tok::tilde, tok::caret}},
+                          TT_BinaryOperator)) {
+      Tokens.back()->Tok.setKind(tok::caret);
+      return;
+    }
+    // Signed shift and distribution weight.
+    if (tryMergeTokens({tok::less, tok::less}, TT_BinaryOperator)) {
+      Tokens.back()->Tok.setKind(tok::lessless);
+      return;
+    }
+    if (tryMergeTokens({tok::greater, tok::greater}, TT_BinaryOperator)) {
+      Tokens.back()->Tok.setKind(tok::greatergreater);
+      return;
+    }
+    if (tryMergeTokensAny({{tok::lessless, tok::equal},
+                           {tok::lessless, tok::lessequal},
+                           {tok::greatergreater, tok::equal},
+                           {tok::greatergreater, tok::greaterequal},
+                           {tok::colon, tok::equal},
+                           {tok::colon, tok::slash}},
+                          TT_BinaryOperator)) {
+      Tokens.back()->ForcedPrecedence = prec::Assignment;
+      return;
+    }
+    // Exponentiation, signed shift, case equality, and wildcard equality.
+    if (tryMergeTokensAny({{tok::star, tok::star},
+                           {tok::lessless, tok::less},
+                           {tok::greatergreater, tok::greater},
+                           {tok::exclaimequal, tok::equal},
+                           {tok::exclaimequal, tok::question},
+                           {tok::equalequal, tok::equal},
+                           {tok::equalequal, tok::question}},
+                          TT_BinaryOperator)) {
+      return;
+    }
+    // Module paths in specify blocks and implications in properties.
+    if (tryMergeTokensAny({{tok::plusequal, tok::greater},
+                           {tok::plus, tok::star, tok::greater},
+                           {tok::minusequal, tok::greater},
+                           {tok::minus, tok::star, tok::greater},
+                           {tok::less, tok::arrow},
+                           {tok::equal, tok::greater},
+                           {tok::star, tok::greater},
+                           {tok::pipeequal, tok::greater},
+                           {tok::pipe, tok::arrow},
+                           {tok::hash, tok::minus, tok::hash},
+                           {tok::hash, tok::equal, tok::hash}},
+                          TT_BinaryOperator)) {
+      Tokens.back()->ForcedPrecedence = prec::Comma;
+      return;
+    }
+  }
 }
 
 bool FormatTokenLexer::tryMergeNSStringLiteral() {
@@ -240,36 +312,32 @@ bool FormatTokenLexer::tryMergeCSharpStringLiteral() {
     return false;
 
   // Look for @"aaaaaa" or $"aaaaaa".
-  auto &String = *(Tokens.end() - 1);
-  if (!String->is(tok::string_literal))
+  const auto String = *(Tokens.end() - 1);
+  if (String->isNot(tok::string_literal))
     return false;
 
-  auto &At = *(Tokens.end() - 2);
-  if (!(At->is(tok::at) || At->TokenText == "$"))
+  auto Prefix = *(Tokens.end() - 2);
+  if (Prefix->isNot(tok::at) && Prefix->TokenText != "$")
     return false;
 
-  if (Tokens.size() > 2 && At->is(tok::at)) {
-    auto &Dollar = *(Tokens.end() - 3);
-    if (Dollar->TokenText == "$") {
-      // This looks like $@"aaaaa" so we need to combine all 3 tokens.
-      Dollar->Tok.setKind(tok::string_literal);
-      Dollar->TokenText =
-          StringRef(Dollar->TokenText.begin(),
-                    String->TokenText.end() - Dollar->TokenText.begin());
-      Dollar->ColumnWidth += (At->ColumnWidth + String->ColumnWidth);
-      Dollar->setType(TT_CSharpStringLiteral);
+  if (Tokens.size() > 2) {
+    const auto Tok = *(Tokens.end() - 3);
+    if ((Tok->TokenText == "$" && Prefix->is(tok::at)) ||
+        (Tok->is(tok::at) && Prefix->TokenText == "$")) {
+      // This looks like $@"aaa" or @$"aaa" so we need to combine all 3 tokens.
+      Tok->ColumnWidth += Prefix->ColumnWidth;
       Tokens.erase(Tokens.end() - 2);
-      Tokens.erase(Tokens.end() - 1);
-      return true;
+      Prefix = Tok;
     }
   }
 
   // Convert back into just a string_literal.
-  At->Tok.setKind(tok::string_literal);
-  At->TokenText = StringRef(At->TokenText.begin(),
-                            String->TokenText.end() - At->TokenText.begin());
-  At->ColumnWidth += String->ColumnWidth;
-  At->setType(TT_CSharpStringLiteral);
+  Prefix->Tok.setKind(tok::string_literal);
+  Prefix->TokenText =
+      StringRef(Prefix->TokenText.begin(),
+                String->TokenText.end() - Prefix->TokenText.begin());
+  Prefix->ColumnWidth += String->ColumnWidth;
+  Prefix->setType(TT_CSharpStringLiteral);
   Tokens.erase(Tokens.end() - 1);
   return true;
 }
@@ -303,9 +371,11 @@ bool FormatTokenLexer::tryMergeNullishCoalescingEqual() {
 bool FormatTokenLexer::tryMergeCSharpKeywordVariables() {
   if (Tokens.size() < 2)
     return false;
-  auto &At = *(Tokens.end() - 2);
-  auto &Keyword = *(Tokens.end() - 1);
-  if (!At->is(tok::at))
+  const auto At = *(Tokens.end() - 2);
+  if (At->isNot(tok::at))
+    return false;
+  const auto Keyword = *(Tokens.end() - 1);
+  if (Keyword->TokenText == "$")
     return false;
   if (!Keywords.isCSharpKeyword(*Keyword))
     return false;
@@ -412,15 +482,28 @@ bool FormatTokenLexer::tryMergeTokens(ArrayRef<tok::TokenKind> Kinds,
 
   SmallVectorImpl<FormatToken *>::const_iterator First =
       Tokens.end() - Kinds.size();
-  if (!First[0]->is(Kinds[0]))
+  for (unsigned i = 0; i < Kinds.size(); ++i)
+    if (!First[i]->is(Kinds[i]))
+      return false;
+
+  return tryMergeTokens(Kinds.size(), NewType);
+}
+
+bool FormatTokenLexer::tryMergeTokens(size_t Count, TokenType NewType) {
+  if (Tokens.size() < Count)
     return false;
+
+  SmallVectorImpl<FormatToken *>::const_iterator First = Tokens.end() - Count;
   unsigned AddLength = 0;
-  for (unsigned i = 1; i < Kinds.size(); ++i) {
-    if (!First[i]->is(Kinds[i]) || First[i]->hasWhitespaceBefore())
+  for (size_t i = 1; i < Count; ++i) {
+    // If there is whitespace separating the token and the previous one,
+    // they should not be merged.
+    if (First[i]->hasWhitespaceBefore())
       return false;
     AddLength += First[i]->TokenText.size();
   }
-  Tokens.resize(Tokens.size() - Kinds.size() + 1);
+
+  Tokens.resize(Tokens.size() - Count + 1);
   First[0]->TokenText = StringRef(First[0]->TokenText.data(),
                                   First[0]->TokenText.size() + AddLength);
   First[0]->ColumnWidth += AddLength;
@@ -428,6 +511,13 @@ bool FormatTokenLexer::tryMergeTokens(ArrayRef<tok::TokenKind> Kinds,
   return true;
 }
 
+bool FormatTokenLexer::tryMergeTokensAny(
+    ArrayRef<ArrayRef<tok::TokenKind>> Kinds, TokenType NewType) {
+  return llvm::any_of(Kinds, [this, NewType](ArrayRef<tok::TokenKind> Kinds) {
+    return tryMergeTokens(Kinds, NewType);
+  });
+}
+
 // Returns \c true if \p Tok can only be followed by an operand in JavaScript.
 bool FormatTokenLexer::precedesOperand(FormatToken *Tok) {
   // NB: This is not entirely correct, as an r_paren can introduce an operand
@@ -591,7 +681,7 @@ void FormatTokenLexer::handleCSharpVerbatimAndInterpolatedStrings() {
 
   bool Verbatim = false;
   bool Interpolated = false;
-  if (TokenText.startswith(R"($@")")) {
+  if (TokenText.startswith(R"($@")") || TokenText.startswith(R"(@$")")) {
     Verbatim = true;
     Interpolated = true;
   } else if (TokenText.startswith(R"(@")")) {
@@ -670,6 +760,7 @@ void FormatTokenLexer::handleTemplateStrings() {
   for (; Offset != Lex->getBuffer().end(); ++Offset) {
     if (Offset[0] == '`') {
       StateStack.pop();
+      ++Offset;
       break;
     }
     if (Offset[0] == '\\') {
@@ -678,12 +769,12 @@ void FormatTokenLexer::handleTemplateStrings() {
                Offset[1] == '{') {
       // '${' introduces an expression interpolation in the template string.
       StateStack.push(LexerState::NORMAL);
-      ++Offset;
+      Offset += 2;
       break;
     }
   }
 
-  StringRef LiteralText(TmplBegin, Offset - TmplBegin + 1);
+  StringRef LiteralText(TmplBegin, Offset - TmplBegin);
   BacktickToken->setType(TT_TemplateString);
   BacktickToken->Tok.setKind(tok::string_literal);
   BacktickToken->TokenText = LiteralText;
@@ -704,9 +795,7 @@ void FormatTokenLexer::handleTemplateStrings() {
                                       StartColumn, Style.TabWidth, Encoding);
   }
 
-  SourceLocation loc = Offset < Lex->getBuffer().end()
-                           ? Lex->getSourceLocation(Offset + 1)
-                           : SourceMgr.getLocForEndOfFile(ID);
+  SourceLocation loc = Lex->getSourceLocation(Offset);
   resetLexer(SourceMgr.getFileOffset(loc));
 }
 
@@ -935,7 +1024,7 @@ FormatToken *FormatTokenLexer::getNextToken() {
         // the same as a single LF.
         if (i + 1 < e && Text[i + 1] == '\n')
           break;
-        LLVM_FALLTHROUGH;
+        [[fallthrough]];
       case '\n':
         ++FormatTok->NewlinesBefore;
         if (!InEscape)
@@ -1004,12 +1093,19 @@ FormatToken *FormatTokenLexer::getNextToken() {
   }
 
   if (Style.isVerilog()) {
+    static const llvm::Regex NumberBase("^s?[bdho]", llvm::Regex::IgnoreCase);
+    SmallVector<StringRef, 1> Matches;
     // Verilog uses the backtick instead of the hash for preprocessor stuff.
     // And it uses the hash for delays and parameter lists. In order to continue
     // using `tok::hash` in other places, the backtick gets marked as the hash
     // here.  And in order to tell the backtick and hash apart for
     // Verilog-specific stuff, the hash becomes an identifier.
-    if (FormatTok->isOneOf(tok::hash, tok::hashhash)) {
+    if (FormatTok->is(tok::numeric_constant)) {
+      // In Verilog the quote is not part of a number.
+      auto Quote = FormatTok->TokenText.find('\'');
+      if (Quote != StringRef::npos)
+        truncateToken(Quote);
+    } else if (FormatTok->isOneOf(tok::hash, tok::hashhash)) {
       FormatTok->Tok.setKind(tok::raw_identifier);
     } else if (FormatTok->is(tok::raw_identifier)) {
       if (FormatTok->TokenText == "`") {
@@ -1018,6 +1114,15 @@ FormatToken *FormatTokenLexer::getNextToken() {
       } else if (FormatTok->TokenText == "``") {
         FormatTok->Tok.setIdentifierInfo(nullptr);
         FormatTok->Tok.setKind(tok::hashhash);
+      } else if (Tokens.size() > 0 &&
+                 Tokens.back()->is(Keywords.kw_apostrophe) &&
+                 NumberBase.match(FormatTok->TokenText, &Matches)) {
+        // In Verilog in a based number literal like `'b10`, there may be
+        // whitespace between `'b` and `10`. Therefore we handle the base and
+        // the rest of the number literal as two tokens. But if there is no
+        // space in the input code, we need to manually separate the two parts.
+        truncateToken(Matches[0].size());
+        FormatTok->setFinalizedType(TT_VerilogNumberBase);
       }
     }
   }
@@ -1060,6 +1165,13 @@ FormatToken *FormatTokenLexer::getNextToken() {
     StateStack.push(LexerState::TOKEN_STASHED);
   }
 
+  if (Style.isVerilog() && Tokens.size() > 0 &&
+      Tokens.back()->is(TT_VerilogNumberBase) &&
+      FormatTok->Tok.isOneOf(tok::identifier, tok::question)) {
+    // Mark the number following a base like `'h?a0` as a number.
+    FormatTok->Tok.setKind(tok::numeric_constant);
+  }
+
   // Now FormatTok is the next non-whitespace token.
 
   StringRef Text = FormatTok->TokenText;