diff options
Diffstat (limited to 'clang/lib/ASTMatchers/Dynamic/Parser.cpp')
| -rw-r--r-- | clang/lib/ASTMatchers/Dynamic/Parser.cpp | 683 | 
1 files changed, 683 insertions, 0 deletions
| diff --git a/clang/lib/ASTMatchers/Dynamic/Parser.cpp b/clang/lib/ASTMatchers/Dynamic/Parser.cpp new file mode 100644 index 000000000000..e3b00b46832c --- /dev/null +++ b/clang/lib/ASTMatchers/Dynamic/Parser.cpp @@ -0,0 +1,683 @@ +//===- Parser.cpp - Matcher expression parser -----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// Recursive parser implementation for the matcher expression grammar. +/// +//===----------------------------------------------------------------------===// + +#include "clang/ASTMatchers/Dynamic/Parser.h" +#include "clang/ASTMatchers/ASTMatchersInternal.h" +#include "clang/ASTMatchers/Dynamic/Diagnostics.h" +#include "clang/ASTMatchers/Dynamic/Registry.h" +#include "clang/Basic/CharInfo.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/ManagedStatic.h" +#include <algorithm> +#include <cassert> +#include <cerrno> +#include <cstddef> +#include <cstdlib> +#include <string> +#include <utility> +#include <vector> + +namespace clang { +namespace ast_matchers { +namespace dynamic { + +/// Simple structure to hold information for one token from the parser. +struct Parser::TokenInfo { +  /// Different possible tokens. +  enum TokenKind { +    TK_Eof, +    TK_OpenParen, +    TK_CloseParen, +    TK_Comma, +    TK_Period, +    TK_Literal, +    TK_Ident, +    TK_InvalidChar, +    TK_Error, +    TK_CodeCompletion +  }; + +  /// Some known identifiers. +  static const char* const ID_Bind; + +  TokenInfo() = default; + +  StringRef Text; +  TokenKind Kind = TK_Eof; +  SourceRange Range; +  VariantValue Value; +}; + +const char* const Parser::TokenInfo::ID_Bind = "bind"; + +/// Simple tokenizer for the parser. +class Parser::CodeTokenizer { +public: +  explicit CodeTokenizer(StringRef MatcherCode, Diagnostics *Error) +      : Code(MatcherCode), StartOfLine(MatcherCode), Error(Error) { +    NextToken = getNextToken(); +  } + +  CodeTokenizer(StringRef MatcherCode, Diagnostics *Error, +                unsigned CodeCompletionOffset) +      : Code(MatcherCode), StartOfLine(MatcherCode), Error(Error), +        CodeCompletionLocation(MatcherCode.data() + CodeCompletionOffset) { +    NextToken = getNextToken(); +  } + +  /// Returns but doesn't consume the next token. +  const TokenInfo &peekNextToken() const { return NextToken; } + +  /// Consumes and returns the next token. +  TokenInfo consumeNextToken() { +    TokenInfo ThisToken = NextToken; +    NextToken = getNextToken(); +    return ThisToken; +  } + +  TokenInfo::TokenKind nextTokenKind() const { return NextToken.Kind; } + +private: +  TokenInfo getNextToken() { +    consumeWhitespace(); +    TokenInfo Result; +    Result.Range.Start = currentLocation(); + +    if (CodeCompletionLocation && CodeCompletionLocation <= Code.data()) { +      Result.Kind = TokenInfo::TK_CodeCompletion; +      Result.Text = StringRef(CodeCompletionLocation, 0); +      CodeCompletionLocation = nullptr; +      return Result; +    } + +    if (Code.empty()) { +      Result.Kind = TokenInfo::TK_Eof; +      Result.Text = ""; +      return Result; +    } + +    switch (Code[0]) { +    case '#': +      Result.Kind = TokenInfo::TK_Eof; +      Result.Text = ""; +      return Result; +    case ',': +      Result.Kind = TokenInfo::TK_Comma; +      Result.Text = Code.substr(0, 1); +      Code = Code.drop_front(); +      break; +    case '.': +      Result.Kind = TokenInfo::TK_Period; +      Result.Text = Code.substr(0, 1); +      Code = Code.drop_front(); +      break; +    case '(': +      Result.Kind = TokenInfo::TK_OpenParen; +      Result.Text = Code.substr(0, 1); +      Code = Code.drop_front(); +      break; +    case ')': +      Result.Kind = TokenInfo::TK_CloseParen; +      Result.Text = Code.substr(0, 1); +      Code = Code.drop_front(); +      break; + +    case '"': +    case '\'': +      // Parse a string literal. +      consumeStringLiteral(&Result); +      break; + +    case '0': case '1': case '2': case '3': case '4': +    case '5': case '6': case '7': case '8': case '9': +      // Parse an unsigned and float literal. +      consumeNumberLiteral(&Result); +      break; + +    default: +      if (isAlphanumeric(Code[0])) { +        // Parse an identifier +        size_t TokenLength = 1; +        while (true) { +          // A code completion location in/immediately after an identifier will +          // cause the portion of the identifier before the code completion +          // location to become a code completion token. +          if (CodeCompletionLocation == Code.data() + TokenLength) { +            CodeCompletionLocation = nullptr; +            Result.Kind = TokenInfo::TK_CodeCompletion; +            Result.Text = Code.substr(0, TokenLength); +            Code = Code.drop_front(TokenLength); +            return Result; +          } +          if (TokenLength == Code.size() || !isAlphanumeric(Code[TokenLength])) +            break; +          ++TokenLength; +        } +        if (TokenLength == 4 && Code.startswith("true")) { +          Result.Kind = TokenInfo::TK_Literal; +          Result.Value = true; +        } else if (TokenLength == 5 && Code.startswith("false")) { +          Result.Kind = TokenInfo::TK_Literal; +          Result.Value = false; +        } else { +          Result.Kind = TokenInfo::TK_Ident; +          Result.Text = Code.substr(0, TokenLength); +        } +        Code = Code.drop_front(TokenLength); +      } else { +        Result.Kind = TokenInfo::TK_InvalidChar; +        Result.Text = Code.substr(0, 1); +        Code = Code.drop_front(1); +      } +      break; +    } + +    Result.Range.End = currentLocation(); +    return Result; +  } + +  /// Consume an unsigned and float literal. +  void consumeNumberLiteral(TokenInfo *Result) { +    bool isFloatingLiteral = false; +    unsigned Length = 1; +    if (Code.size() > 1) { +      // Consume the 'x' or 'b' radix modifier, if present. +      switch (toLowercase(Code[1])) { +      case 'x': case 'b': Length = 2; +      } +    } +    while (Length < Code.size() && isHexDigit(Code[Length])) +      ++Length; + +    // Try to recognize a floating point literal. +    while (Length < Code.size()) { +      char c = Code[Length]; +      if (c == '-' || c == '+' || c == '.' || isHexDigit(c)) { +        isFloatingLiteral = true; +        Length++; +      } else { +        break; +      } +    } + +    Result->Text = Code.substr(0, Length); +    Code = Code.drop_front(Length); + +    if (isFloatingLiteral) { +      char *end; +      errno = 0; +      std::string Text = Result->Text.str(); +      double doubleValue = strtod(Text.c_str(), &end); +      if (*end == 0 && errno == 0) { +        Result->Kind = TokenInfo::TK_Literal; +        Result->Value = doubleValue; +        return; +      } +    } else { +      unsigned Value; +      if (!Result->Text.getAsInteger(0, Value)) { +        Result->Kind = TokenInfo::TK_Literal; +        Result->Value = Value; +        return; +      } +    } + +    SourceRange Range; +    Range.Start = Result->Range.Start; +    Range.End = currentLocation(); +    Error->addError(Range, Error->ET_ParserNumberError) << Result->Text; +    Result->Kind = TokenInfo::TK_Error; +  } + +  /// Consume a string literal. +  /// +  /// \c Code must be positioned at the start of the literal (the opening +  /// quote). Consumed until it finds the same closing quote character. +  void consumeStringLiteral(TokenInfo *Result) { +    bool InEscape = false; +    const char Marker = Code[0]; +    for (size_t Length = 1, Size = Code.size(); Length != Size; ++Length) { +      if (InEscape) { +        InEscape = false; +        continue; +      } +      if (Code[Length] == '\\') { +        InEscape = true; +        continue; +      } +      if (Code[Length] == Marker) { +        Result->Kind = TokenInfo::TK_Literal; +        Result->Text = Code.substr(0, Length + 1); +        Result->Value = Code.substr(1, Length - 1); +        Code = Code.drop_front(Length + 1); +        return; +      } +    } + +    StringRef ErrorText = Code; +    Code = Code.drop_front(Code.size()); +    SourceRange Range; +    Range.Start = Result->Range.Start; +    Range.End = currentLocation(); +    Error->addError(Range, Error->ET_ParserStringError) << ErrorText; +    Result->Kind = TokenInfo::TK_Error; +  } + +  /// Consume all leading whitespace from \c Code. +  void consumeWhitespace() { +    while (!Code.empty() && isWhitespace(Code[0])) { +      if (Code[0] == '\n') { +        ++Line; +        StartOfLine = Code.drop_front(); +      } +      Code = Code.drop_front(); +    } +  } + +  SourceLocation currentLocation() { +    SourceLocation Location; +    Location.Line = Line; +    Location.Column = Code.data() - StartOfLine.data() + 1; +    return Location; +  } + +  StringRef Code; +  StringRef StartOfLine; +  unsigned Line = 1; +  Diagnostics *Error; +  TokenInfo NextToken; +  const char *CodeCompletionLocation = nullptr; +}; + +Parser::Sema::~Sema() = default; + +std::vector<ArgKind> Parser::Sema::getAcceptedCompletionTypes( +    llvm::ArrayRef<std::pair<MatcherCtor, unsigned>> Context) { +  return {}; +} + +std::vector<MatcherCompletion> +Parser::Sema::getMatcherCompletions(llvm::ArrayRef<ArgKind> AcceptedTypes) { +  return {}; +} + +struct Parser::ScopedContextEntry { +  Parser *P; + +  ScopedContextEntry(Parser *P, MatcherCtor C) : P(P) { +    P->ContextStack.push_back(std::make_pair(C, 0u)); +  } + +  ~ScopedContextEntry() { +    P->ContextStack.pop_back(); +  } + +  void nextArg() { +    ++P->ContextStack.back().second; +  } +}; + +/// Parse expressions that start with an identifier. +/// +/// This function can parse named values and matchers. +/// In case of failure it will try to determine the user's intent to give +/// an appropriate error message. +bool Parser::parseIdentifierPrefixImpl(VariantValue *Value) { +  const TokenInfo NameToken = Tokenizer->consumeNextToken(); + +  if (Tokenizer->nextTokenKind() != TokenInfo::TK_OpenParen) { +    // Parse as a named value. +    if (const VariantValue NamedValue = +            NamedValues ? NamedValues->lookup(NameToken.Text) +                        : VariantValue()) { + +      if (Tokenizer->nextTokenKind() != TokenInfo::TK_Period) { +        *Value = NamedValue; +        return true; +      } + +      std::string BindID; +      if (!parseBindID(BindID)) +        return false; + +      assert(NamedValue.isMatcher()); +      llvm::Optional<DynTypedMatcher> Result = +          NamedValue.getMatcher().getSingleMatcher(); +      if (Result.hasValue()) { +        llvm::Optional<DynTypedMatcher> Bound = Result->tryBind(BindID); +        if (Bound.hasValue()) { +          *Value = VariantMatcher::SingleMatcher(*Bound); +          return true; +        } +      } +      return false; +    } +    // If the syntax is correct and the name is not a matcher either, report +    // unknown named value. +    if ((Tokenizer->nextTokenKind() == TokenInfo::TK_Comma || +         Tokenizer->nextTokenKind() == TokenInfo::TK_CloseParen || +         Tokenizer->nextTokenKind() == TokenInfo::TK_Eof) && +        !S->lookupMatcherCtor(NameToken.Text)) { +      Error->addError(NameToken.Range, Error->ET_RegistryValueNotFound) +          << NameToken.Text; +      return false; +    } +    // Otherwise, fallback to the matcher parser. +  } + +  // Parse as a matcher expression. +  return parseMatcherExpressionImpl(NameToken, Value); +} + +bool Parser::parseBindID(std::string &BindID) { +  // Parse .bind("foo") +  assert(Tokenizer->peekNextToken().Kind == TokenInfo::TK_Period); +  Tokenizer->consumeNextToken(); // consume the period. +  const TokenInfo BindToken = Tokenizer->consumeNextToken(); +  if (BindToken.Kind == TokenInfo::TK_CodeCompletion) { +    addCompletion(BindToken, MatcherCompletion("bind(\"", "bind", 1)); +    return false; +  } + +  const TokenInfo OpenToken = Tokenizer->consumeNextToken(); +  const TokenInfo IDToken = Tokenizer->consumeNextToken(); +  const TokenInfo CloseToken = Tokenizer->consumeNextToken(); + +  // TODO: We could use different error codes for each/some to be more +  //       explicit about the syntax error. +  if (BindToken.Kind != TokenInfo::TK_Ident || +      BindToken.Text != TokenInfo::ID_Bind) { +    Error->addError(BindToken.Range, Error->ET_ParserMalformedBindExpr); +    return false; +  } +  if (OpenToken.Kind != TokenInfo::TK_OpenParen) { +    Error->addError(OpenToken.Range, Error->ET_ParserMalformedBindExpr); +    return false; +  } +  if (IDToken.Kind != TokenInfo::TK_Literal || !IDToken.Value.isString()) { +    Error->addError(IDToken.Range, Error->ET_ParserMalformedBindExpr); +    return false; +  } +  if (CloseToken.Kind != TokenInfo::TK_CloseParen) { +    Error->addError(CloseToken.Range, Error->ET_ParserMalformedBindExpr); +    return false; +  } +  BindID = IDToken.Value.getString(); +  return true; +} + +/// Parse and validate a matcher expression. +/// \return \c true on success, in which case \c Value has the matcher parsed. +///   If the input is malformed, or some argument has an error, it +///   returns \c false. +bool Parser::parseMatcherExpressionImpl(const TokenInfo &NameToken, +                                        VariantValue *Value) { +  assert(NameToken.Kind == TokenInfo::TK_Ident); +  const TokenInfo OpenToken = Tokenizer->consumeNextToken(); +  if (OpenToken.Kind != TokenInfo::TK_OpenParen) { +    Error->addError(OpenToken.Range, Error->ET_ParserNoOpenParen) +        << OpenToken.Text; +    return false; +  } + +  llvm::Optional<MatcherCtor> Ctor = S->lookupMatcherCtor(NameToken.Text); + +  if (!Ctor) { +    Error->addError(NameToken.Range, Error->ET_RegistryMatcherNotFound) +        << NameToken.Text; +    // Do not return here. We need to continue to give completion suggestions. +  } + +  std::vector<ParserValue> Args; +  TokenInfo EndToken; + +  { +    ScopedContextEntry SCE(this, Ctor ? *Ctor : nullptr); + +    while (Tokenizer->nextTokenKind() != TokenInfo::TK_Eof) { +      if (Tokenizer->nextTokenKind() == TokenInfo::TK_CloseParen) { +        // End of args. +        EndToken = Tokenizer->consumeNextToken(); +        break; +      } +      if (!Args.empty()) { +        // We must find a , token to continue. +        const TokenInfo CommaToken = Tokenizer->consumeNextToken(); +        if (CommaToken.Kind != TokenInfo::TK_Comma) { +          Error->addError(CommaToken.Range, Error->ET_ParserNoComma) +              << CommaToken.Text; +          return false; +        } +      } + +      Diagnostics::Context Ctx(Diagnostics::Context::MatcherArg, Error, +                               NameToken.Text, NameToken.Range, +                               Args.size() + 1); +      ParserValue ArgValue; +      ArgValue.Text = Tokenizer->peekNextToken().Text; +      ArgValue.Range = Tokenizer->peekNextToken().Range; +      if (!parseExpressionImpl(&ArgValue.Value)) { +        return false; +      } + +      Args.push_back(ArgValue); +      SCE.nextArg(); +    } +  } + +  if (EndToken.Kind == TokenInfo::TK_Eof) { +    Error->addError(OpenToken.Range, Error->ET_ParserNoCloseParen); +    return false; +  } + +  std::string BindID; +  if (Tokenizer->peekNextToken().Kind == TokenInfo::TK_Period) { +    if (!parseBindID(BindID)) +      return false; +  } + +  if (!Ctor) +    return false; + +  // Merge the start and end infos. +  Diagnostics::Context Ctx(Diagnostics::Context::ConstructMatcher, Error, +                           NameToken.Text, NameToken.Range); +  SourceRange MatcherRange = NameToken.Range; +  MatcherRange.End = EndToken.Range.End; +  VariantMatcher Result = S->actOnMatcherExpression( +      *Ctor, MatcherRange, BindID, Args, Error); +  if (Result.isNull()) return false; + +  *Value = Result; +  return true; +} + +// If the prefix of this completion matches the completion token, add it to +// Completions minus the prefix. +void Parser::addCompletion(const TokenInfo &CompToken, +                           const MatcherCompletion& Completion) { +  if (StringRef(Completion.TypedText).startswith(CompToken.Text) && +      Completion.Specificity > 0) { +    Completions.emplace_back(Completion.TypedText.substr(CompToken.Text.size()), +                             Completion.MatcherDecl, Completion.Specificity); +  } +} + +std::vector<MatcherCompletion> Parser::getNamedValueCompletions( +    ArrayRef<ArgKind> AcceptedTypes) { +  if (!NamedValues) return std::vector<MatcherCompletion>(); +  std::vector<MatcherCompletion> Result; +  for (const auto &Entry : *NamedValues) { +    unsigned Specificity; +    if (Entry.getValue().isConvertibleTo(AcceptedTypes, &Specificity)) { +      std::string Decl = +          (Entry.getValue().getTypeAsString() + " " + Entry.getKey()).str(); +      Result.emplace_back(Entry.getKey(), Decl, Specificity); +    } +  } +  return Result; +} + +void Parser::addExpressionCompletions() { +  const TokenInfo CompToken = Tokenizer->consumeNextToken(); +  assert(CompToken.Kind == TokenInfo::TK_CodeCompletion); + +  // We cannot complete code if there is an invalid element on the context +  // stack. +  for (ContextStackTy::iterator I = ContextStack.begin(), +                                E = ContextStack.end(); +       I != E; ++I) { +    if (!I->first) +      return; +  } + +  auto AcceptedTypes = S->getAcceptedCompletionTypes(ContextStack); +  for (const auto &Completion : S->getMatcherCompletions(AcceptedTypes)) { +    addCompletion(CompToken, Completion); +  } + +  for (const auto &Completion : getNamedValueCompletions(AcceptedTypes)) { +    addCompletion(CompToken, Completion); +  } +} + +/// Parse an <Expression> +bool Parser::parseExpressionImpl(VariantValue *Value) { +  switch (Tokenizer->nextTokenKind()) { +  case TokenInfo::TK_Literal: +    *Value = Tokenizer->consumeNextToken().Value; +    return true; + +  case TokenInfo::TK_Ident: +    return parseIdentifierPrefixImpl(Value); + +  case TokenInfo::TK_CodeCompletion: +    addExpressionCompletions(); +    return false; + +  case TokenInfo::TK_Eof: +    Error->addError(Tokenizer->consumeNextToken().Range, +                    Error->ET_ParserNoCode); +    return false; + +  case TokenInfo::TK_Error: +    // This error was already reported by the tokenizer. +    return false; + +  case TokenInfo::TK_OpenParen: +  case TokenInfo::TK_CloseParen: +  case TokenInfo::TK_Comma: +  case TokenInfo::TK_Period: +  case TokenInfo::TK_InvalidChar: +    const TokenInfo Token = Tokenizer->consumeNextToken(); +    Error->addError(Token.Range, Error->ET_ParserInvalidToken) << Token.Text; +    return false; +  } + +  llvm_unreachable("Unknown token kind."); +} + +static llvm::ManagedStatic<Parser::RegistrySema> DefaultRegistrySema; + +Parser::Parser(CodeTokenizer *Tokenizer, Sema *S, +               const NamedValueMap *NamedValues, Diagnostics *Error) +    : Tokenizer(Tokenizer), S(S ? S : &*DefaultRegistrySema), +      NamedValues(NamedValues), Error(Error) {} + +Parser::RegistrySema::~RegistrySema() = default; + +llvm::Optional<MatcherCtor> +Parser::RegistrySema::lookupMatcherCtor(StringRef MatcherName) { +  return Registry::lookupMatcherCtor(MatcherName); +} + +VariantMatcher Parser::RegistrySema::actOnMatcherExpression( +    MatcherCtor Ctor, SourceRange NameRange, StringRef BindID, +    ArrayRef<ParserValue> Args, Diagnostics *Error) { +  if (BindID.empty()) { +    return Registry::constructMatcher(Ctor, NameRange, Args, Error); +  } else { +    return Registry::constructBoundMatcher(Ctor, NameRange, BindID, Args, +                                           Error); +  } +} + +std::vector<ArgKind> Parser::RegistrySema::getAcceptedCompletionTypes( +    ArrayRef<std::pair<MatcherCtor, unsigned>> Context) { +  return Registry::getAcceptedCompletionTypes(Context); +} + +std::vector<MatcherCompletion> Parser::RegistrySema::getMatcherCompletions( +    ArrayRef<ArgKind> AcceptedTypes) { +  return Registry::getMatcherCompletions(AcceptedTypes); +} + +bool Parser::parseExpression(StringRef Code, Sema *S, +                             const NamedValueMap *NamedValues, +                             VariantValue *Value, Diagnostics *Error) { +  CodeTokenizer Tokenizer(Code, Error); +  if (!Parser(&Tokenizer, S, NamedValues, Error).parseExpressionImpl(Value)) +    return false; +  if (Tokenizer.peekNextToken().Kind != TokenInfo::TK_Eof) { +    Error->addError(Tokenizer.peekNextToken().Range, +                    Error->ET_ParserTrailingCode); +    return false; +  } +  return true; +} + +std::vector<MatcherCompletion> +Parser::completeExpression(StringRef Code, unsigned CompletionOffset, Sema *S, +                           const NamedValueMap *NamedValues) { +  Diagnostics Error; +  CodeTokenizer Tokenizer(Code, &Error, CompletionOffset); +  Parser P(&Tokenizer, S, NamedValues, &Error); +  VariantValue Dummy; +  P.parseExpressionImpl(&Dummy); + +  // Sort by specificity, then by name. +  llvm::sort(P.Completions, +             [](const MatcherCompletion &A, const MatcherCompletion &B) { +               if (A.Specificity != B.Specificity) +                 return A.Specificity > B.Specificity; +               return A.TypedText < B.TypedText; +             }); + +  return P.Completions; +} + +llvm::Optional<DynTypedMatcher> +Parser::parseMatcherExpression(StringRef Code, Sema *S, +                               const NamedValueMap *NamedValues, +                               Diagnostics *Error) { +  VariantValue Value; +  if (!parseExpression(Code, S, NamedValues, &Value, Error)) +    return llvm::Optional<DynTypedMatcher>(); +  if (!Value.isMatcher()) { +    Error->addError(SourceRange(), Error->ET_ParserNotAMatcher); +    return llvm::Optional<DynTypedMatcher>(); +  } +  llvm::Optional<DynTypedMatcher> Result = +      Value.getMatcher().getSingleMatcher(); +  if (!Result.hasValue()) { +    Error->addError(SourceRange(), Error->ET_ParserOverloadedType) +        << Value.getTypeAsString(); +  } +  return Result; +} + +} // namespace dynamic +} // namespace ast_matchers +} // namespace clang | 
