Diffstat (limited to 'include/clang/Tooling/Syntax')
-rw-r--r--  include/clang/Tooling/Syntax/BuildTree.h |  24
-rw-r--r--  include/clang/Tooling/Syntax/Nodes.h     |  92
-rw-r--r--  include/clang/Tooling/Syntax/Tokens.h    | 355
-rw-r--r--  include/clang/Tooling/Syntax/Tree.h      | 142
4 files changed, 613 insertions, 0 deletions
diff --git a/include/clang/Tooling/Syntax/BuildTree.h b/include/clang/Tooling/Syntax/BuildTree.h
new file mode 100644
index 000000000000..055d6462eabd
--- /dev/null
+++ b/include/clang/Tooling/Syntax/BuildTree.h
@@ -0,0 +1,24 @@
+//===- BuildTree.h - build syntax trees -----------------------*- C++ -*-=====//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// Functions to construct a syntax tree from an AST.
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_CLANG_TOOLING_SYNTAX_TREE_H
+#define LLVM_CLANG_TOOLING_SYNTAX_TREE_H
+
+#include "clang/AST/Decl.h"
+#include "clang/Tooling/Syntax/Nodes.h"
+
+namespace clang {
+namespace syntax {
+
+/// Build a syntax tree for the main file.
+syntax::TranslationUnit *buildSyntaxTree(Arena &A,
+                                         const clang::TranslationUnitDecl &TU);
+} // namespace syntax
+} // namespace clang
+#endif
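For illustration, a minimal sketch of how this entry point might be driven once a parse has finished. The dumpMainFileSyntaxTree helper is hypothetical, and it assumes a syntax::TokenBuffer was already collected for the same preprocessor run (see Tokens.h and Tree.h below):

#include "clang/AST/ASTContext.h"
#include "clang/Tooling/Syntax/BuildTree.h"
#include "clang/Tooling/Syntax/Tree.h"
#include "llvm/Support/raw_ostream.h"

// Hypothetical helper: build the syntax tree for the main file and dump it.
void dumpMainFileSyntaxTree(clang::ASTContext &Ctx,
                            clang::syntax::TokenBuffer Tokens) {
  clang::syntax::Arena A(Ctx.getSourceManager(), Ctx.getLangOpts(),
                         std::move(Tokens));
  clang::syntax::TranslationUnit *TU =
      clang::syntax::buildSyntaxTree(A, *Ctx.getTranslationUnitDecl());
  llvm::errs() << TU->dump(A); // structure of the subtree, for debugging
}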
diff --git a/include/clang/Tooling/Syntax/Nodes.h b/include/clang/Tooling/Syntax/Nodes.h
new file mode 100644
index 000000000000..d20c7cb7b171
--- /dev/null
+++ b/include/clang/Tooling/Syntax/Nodes.h
@@ -0,0 +1,92 @@
+//===- Nodes.h - syntax nodes for C/C++ grammar constructs ----*- C++ -*-=====//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// Syntax tree nodes for C, C++ and Objective-C grammar constructs.
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_CLANG_TOOLING_SYNTAX_NODES_H
+#define LLVM_CLANG_TOOLING_SYNTAX_NODES_H
+
+#include "clang/Basic/TokenKinds.h"
+#include "clang/Lex/Token.h"
+#include "clang/Tooling/Syntax/Tokens.h"
+#include "clang/Tooling/Syntax/Tree.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace clang {
+namespace syntax {
+
+/// A kind of a syntax node, used for implementing casts.
+enum class NodeKind : uint16_t {
+  Leaf,
+  TranslationUnit,
+  TopLevelDeclaration,
+  CompoundStatement
+};
+/// For debugging purposes.
+llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, NodeKind K);
+
+/// A relation between a parent and child node. Used for implementing
+/// accessors.
+enum class NodeRole : uint8_t {
+  // A node without a parent.
+  Detached,
+  // Children of an unknown semantic nature, e.g. skipped tokens, comments.
+  Unknown,
+  // FIXME: should this be shared for all other nodes with braces, e.g. init
+  //        lists?
+  CompoundStatement_lbrace,
+  CompoundStatement_rbrace
+};
+
+/// A root node for a translation unit. Parent is always null.
+class TranslationUnit final : public Tree {
+public:
+  TranslationUnit() : Tree(NodeKind::TranslationUnit) {}
+  static bool classof(const Node *N) {
+    return N->kind() == NodeKind::TranslationUnit;
+  }
+};
+
+/// FIXME: this node is temporary and will be replaced with nodes for various
+///        'declarations' and 'declarators' from the C/C++ grammar
+///
+/// Represents any top-level declaration. Only there to give the syntax tree a
+/// bit of structure until we implement syntax nodes for declarations and
+/// declarators.
+class TopLevelDeclaration final : public Tree {
+public:
+  TopLevelDeclaration() : Tree(NodeKind::TopLevelDeclaration) {}
+  static bool classof(const Node *N) {
+    return N->kind() == NodeKind::TopLevelDeclaration;
+  }
+};
+
+/// An abstract node for C++ statements, e.g. 'while', 'if', etc.
+class Statement : public Tree {
+public:
+  Statement(NodeKind K) : Tree(K) {}
+  static bool classof(const Node *N) {
+    return NodeKind::CompoundStatement <= N->kind() &&
+           N->kind() <= NodeKind::CompoundStatement;
+  }
+};
+
+/// { statement1; statement2; … }
+class CompoundStatement final : public Statement {
+public:
+  CompoundStatement() : Statement(NodeKind::CompoundStatement) {}
+  static bool classof(const Node *N) {
+    return N->kind() == NodeKind::CompoundStatement;
+  }
+  syntax::Leaf *lbrace();
+  syntax::Leaf *rbrace();
+};
+
+} // namespace syntax
+} // namespace clang
+#endif
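The node classes are meant to be used with the usual LLVM casting machinery. A small illustrative walker, assuming nothing beyond the API above; the findCompounds helper itself is hypothetical:

#include "clang/Tooling/Syntax/Nodes.h"
#include "llvm/Support/Casting.h"
#include <vector>

// Hypothetical helper: collect all compound statements in a subtree.
void findCompounds(clang::syntax::Node *N,
                   std::vector<clang::syntax::CompoundStatement *> &Out) {
  if (auto *CS = llvm::dyn_cast<clang::syntax::CompoundStatement>(N)) {
    Out.push_back(CS);
    if (clang::syntax::Leaf *L = CS->lbrace())
      (void)L->token(); // the '{' leaf in the expanded token stream
  }
  // CompoundStatement is itself a Tree, so the recursion below also visits
  // the children of the nodes collected above.
  if (auto *T = llvm::dyn_cast<clang::syntax::Tree>(N))
    for (clang::syntax::Node *C = T->firstChild(); C; C = C->nextSibling())
      findCompounds(C, Out);
}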
diff --git a/include/clang/Tooling/Syntax/Tokens.h b/include/clang/Tooling/Syntax/Tokens.h
new file mode 100644
index 000000000000..4640ccb2d30a
--- /dev/null
+++ b/include/clang/Tooling/Syntax/Tokens.h
@@ -0,0 +1,355 @@
+//===- Tokens.h - collect tokens from preprocessing --------------*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// Record tokens that a preprocessor emits and define operations to map between
+// the tokens written in a file and tokens produced by the preprocessor.
+//
+// When running the compiler, there are two token streams we are interested in:
+//   - "spelled" tokens directly correspond to a substring written in some
+//     source file.
+//   - "expanded" tokens represent the result of preprocessing; the parser
+//     consumes this token stream to produce the AST.
+//
+// Expanded tokens correspond directly to locations found in the AST, allowing
+// us to find subranges of the token stream covered by various AST nodes.
+// Spelled tokens correspond directly to the source code written by the user.
+//
+// To allow composing these two use-cases, we also define operations that map
+// between expanded tokens and the spelled tokens that produced them (macro
+// calls, directives, etc).
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_TOOLING_SYNTAX_TOKENS_H
+#define LLVM_CLANG_TOOLING_SYNTAX_TOKENS_H
+
+#include "clang/Basic/FileManager.h"
+#include "clang/Basic/LangOptions.h"
+#include "clang/Basic/SourceLocation.h"
+#include "clang/Basic/SourceManager.h"
+#include "clang/Basic/TokenKinds.h"
+#include "clang/Lex/Token.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cstdint>
+#include <tuple>
+
+namespace clang {
+class Preprocessor;
+
+namespace syntax {
+
+/// A half-open character range inside a particular file; the start offset is
+/// included and the end offset is excluded from the range.
+struct FileRange {
+  /// EXPECTS: File.isValid() && Begin <= End.
+  FileRange(FileID File, unsigned BeginOffset, unsigned EndOffset);
+  /// EXPECTS: BeginLoc.isValid() && BeginLoc.isFileID().
+  FileRange(const SourceManager &SM, SourceLocation BeginLoc, unsigned Length);
+  /// EXPECTS: BeginLoc.isValid() && BeginLoc.isFileID(), Begin <= End and
+  ///          files are the same.
+  FileRange(const SourceManager &SM, SourceLocation BeginLoc,
+            SourceLocation EndLoc);
+
+  FileID file() const { return File; }
+  /// Start offset (inclusive) in the corresponding file.
+  unsigned beginOffset() const { return Begin; }
+  /// End offset (exclusive) in the corresponding file.
+  unsigned endOffset() const { return End; }
+
+  unsigned length() const { return End - Begin; }
+
+  /// Check if \p Offset is inside the range.
+  bool contains(unsigned Offset) const {
+    return Begin <= Offset && Offset < End;
+  }
+  /// Check if \p Offset is inside the range or equal to its endpoint.
+  bool touches(unsigned Offset) const {
+    return Begin <= Offset && Offset <= End;
+  }
+
+  /// Gets the substring that this FileRange refers to.
+  llvm::StringRef text(const SourceManager &SM) const;
+
+  friend bool operator==(const FileRange &L, const FileRange &R) {
+    return std::tie(L.File, L.Begin, L.End) ==
+           std::tie(R.File, R.Begin, R.End);
+  }
+  friend bool operator!=(const FileRange &L, const FileRange &R) {
+    return !(L == R);
+  }
+
+private:
+  FileID File;
+  unsigned Begin;
+  unsigned End;
+};
+
+/// For debugging purposes.
+llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const FileRange &R);
+
+/// A token coming directly from a file or from a macro invocation. Has just
+/// enough information to locate the token in the source code.
+/// Can represent both expanded and spelled tokens.
+class Token {
+public:
+  Token(SourceLocation Location, unsigned Length, tok::TokenKind Kind);
+  /// EXPECTS: clang::Token is not an annotation token.
+  explicit Token(const clang::Token &T);
+
+  tok::TokenKind kind() const { return Kind; }
+  /// Location of the first character of a token.
+  SourceLocation location() const { return Location; }
+  /// Location right after the last character of a token.
+  SourceLocation endLocation() const {
+    return Location.getLocWithOffset(Length);
+  }
+  unsigned length() const { return Length; }
+
+  /// Get the substring covered by the token. Note that this will include all
+  /// digraphs, newline continuations, etc. E.g. tokens for 'int' and
+  ///    in\
+  ///    t
+  /// both have the same kind tok::kw_int, but results of text() are different.
+  llvm::StringRef text(const SourceManager &SM) const;
+
+  /// Gets the range of this token.
+  /// EXPECTS: token comes from a file, not from a macro expansion.
+  FileRange range(const SourceManager &SM) const;
+
+  /// Given two tokens inside the same file, returns a file range that starts
+  /// at \p First and ends at \p Last.
+  /// EXPECTS: First and Last are file tokens from the same file, Last starts
+  ///          after First.
+  static FileRange range(const SourceManager &SM, const syntax::Token &First,
+                         const syntax::Token &Last);
+
+  std::string dumpForTests(const SourceManager &SM) const;
+  /// For debugging purposes.
+  std::string str() const;
+
+private:
+  SourceLocation Location;
+  unsigned Length;
+  tok::TokenKind Kind;
+};
+/// For debugging purposes. Equivalent to a call to Token::str().
+llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const Token &T);
+
+/// A list of tokens obtained by preprocessing a text buffer and operations to
+/// map between the expanded and spelled tokens, i.e. TokenBuffer has
+/// information about two token streams:
+///    1. Expanded tokens: tokens produced by the preprocessor after all macro
+///       replacements,
+///    2. Spelled tokens: corresponding directly to the source code of a file
+///       before any macro replacements occurred.
+/// Here's an example to illustrate the difference between the two:
+///     #define FOO 10
+///     int a = FOO;
+///
+/// Spelled tokens are {'#','define','FOO','10','int','a','=','FOO',';'}.
+/// Expanded tokens are {'int','a','=','10',';','eof'}.
+///
+/// Note that the expanded token stream has a tok::eof token at the end; the
+/// spelled tokens never store an 'eof' token.
+///
+/// The full list of expanded tokens can be obtained with expandedTokens().
+/// Spelled tokens for each of the files can be obtained via
+/// spelledTokens(FileID).
+///
+/// To map between the expanded and spelled tokens use spelledForExpanded().
+///
+/// To build a token buffer use the TokenCollector class. You can also compute
+/// the spelled tokens of a file using the tokenize() helper.
+///
+/// FIXME: allow mapping from spelled to expanded tokens when a use-case shows
+///        up.
+/// FIXME: allow mappings into macro arguments.
+class TokenBuffer {
+public:
+  TokenBuffer(const SourceManager &SourceMgr) : SourceMgr(&SourceMgr) {}
+  /// All tokens produced by the preprocessor after all macro replacements,
+  /// directives, etc. Source locations found in the clang AST will always
+  /// point to one of these tokens.
+  /// FIXME: figure out how to handle token splitting, e.g. '>>' can be split
+  ///        into two '>' tokens by the parser. However, TokenBuffer currently
+  ///        keeps it as a single '>>' token.
+  llvm::ArrayRef<syntax::Token> expandedTokens() const {
+    return ExpandedTokens;
+  }
+
+  /// Find the subrange of spelled tokens that produced the corresponding \p
+  /// Expanded tokens.
+  ///
+  /// EXPECTS: \p Expanded is a subrange of expandedTokens().
+  ///
+  /// Will fail if the expanded tokens do not correspond to a sequence of
+  /// spelled tokens. E.g. for the following example:
+  ///
+  ///   #define FIRST f1 f2 f3
+  ///   #define SECOND s1 s2 s3
+  ///
+  ///   a FIRST b SECOND c // expanded tokens are: a f1 f2 f3 b s1 s2 s3 c
+  ///
+  /// the results would be:
+  ///     expanded   =>  spelled
+  ///   ------------------------
+  ///            a   =>  a
+  ///     s1 s2 s3   =>  SECOND
+  ///   a f1 f2 f3   =>  a FIRST
+  ///         a f1   =>  can't map
+  ///        s1 s2   =>  can't map
+  ///
+  /// If \p Expanded is empty, the returned value is llvm::None.
+  /// Complexity is logarithmic.
+  llvm::Optional<llvm::ArrayRef<syntax::Token>>
+  spelledForExpanded(llvm::ArrayRef<syntax::Token> Expanded) const;
+
+  /// An expansion produced by the preprocessor; includes macro expansions and
+  /// preprocessor directives. The preprocessor always maps a non-empty range
+  /// of spelled tokens to a (possibly empty) range of expanded tokens. Here
+  /// are a few examples of expansions:
+  ///    #pragma once      // Expands to an empty range.
+  ///    #define FOO 1 2 3 // Expands to an empty range.
+  ///    FOO               // Expands to "1 2 3".
+  /// FIXME(ibiryukov): implement this, currently #include expansions are
+  ///                   empty.
+  ///    #include <vector> // Expands to tokens produced by the include.
+  struct Expansion {
+    llvm::ArrayRef<syntax::Token> Spelled;
+    llvm::ArrayRef<syntax::Token> Expanded;
+  };
+  /// If \p Spelled starts a mapping (e.g. if it's a macro name or '#' starting
+  /// a preprocessor directive) return the subrange of expanded tokens that the
+  /// macro expands to.
+  llvm::Optional<Expansion>
+  expansionStartingAt(const syntax::Token *Spelled) const;
+
+  /// Lexed tokens of a file before preprocessing. E.g. for the following input
+  ///     #define DECL(name) int name = 10
+  ///     DECL(a);
+  /// spelledTokens() returns
+  ///     {"#", "define", "DECL", "(", "name", ")", "int", "name", "=", "10",
+  ///      "DECL", "(", "a", ")", ";"}.
+  /// FIXME: we do not yet store tokens of directives, like #include, #define,
+  ///        #pragma, etc.
+  llvm::ArrayRef<syntax::Token> spelledTokens(FileID FID) const;
+
+  const SourceManager &sourceManager() const { return *SourceMgr; }
+
+  std::string dumpForTests() const;
+
+private:
+  /// Describes a mapping between a continuous subrange of spelled tokens and
+  /// expanded tokens. Represents macro expansions, preprocessor directives,
+  /// conditionally disabled pp regions, etc.
+  ///   #define FOO 1+2
+  ///   #define BAR(a) a + 1
+  ///   FOO    // invocation #1, tokens = {'1','+','2'},
+  ///          //                macroTokens = {'FOO'}.
+  ///   BAR(1) // invocation #2, tokens = {'1', '+', '1'},
+  ///          //                macroTokens = {'BAR', '(', '1', ')'}.
+  struct Mapping {
+    // Positions in the corresponding spelled token stream. The corresponding
+    // range is never empty.
+    unsigned BeginSpelled = 0;
+    unsigned EndSpelled = 0;
+    // Positions in the expanded token stream. The corresponding range can be
+    // empty.
+    unsigned BeginExpanded = 0;
+    unsigned EndExpanded = 0;
+
+    /// For debugging purposes.
+    std::string str() const;
+  };
+  /// Spelled tokens of the file with information about the subranges.
+  struct MarkedFile {
+    /// Lexed, but not preprocessed, tokens of the file. These map directly to
+    /// text in the corresponding files and include tokens of all preprocessor
+    /// directives.
+    /// FIXME: spelled tokens don't change across FileIDs that map to the same
+    ///        FileEntry. We could consider deduplicating them to save memory.
+    std::vector<syntax::Token> SpelledTokens;
+    /// A sorted list to convert between the spelled and expanded token
+    /// streams.
+    std::vector<Mapping> Mappings;
+    /// The first expanded token produced for this FileID.
+    unsigned BeginExpanded = 0;
+    unsigned EndExpanded = 0;
+  };
+
+  friend class TokenCollector;
+
+  /// Maps a single expanded token to its spelled counterpart or a mapping that
+  /// produced it.
+  std::pair<const syntax::Token *, const Mapping *>
+  spelledForExpandedToken(const syntax::Token *Expanded) const;
+
+  /// Token stream produced after preprocessing; conceptually this captures the
+  /// same stream as 'clang -E' (excluding the preprocessor directives like
+  /// #file, etc.).
+  std::vector<syntax::Token> ExpandedTokens;
+  llvm::DenseMap<FileID, MarkedFile> Files;
+  // The value is never null, pointer instead of reference to avoid disabling
+  // implicit assignment operator.
+  const SourceManager *SourceMgr;
+};
+
+/// Lex the text buffer corresponding to \p FID in raw mode and record the
+/// resulting spelled tokens. Does minimal post-processing on raw identifiers,
+/// setting the appropriate token kind (instead of the raw_identifier reported
+/// by the lexer in raw mode). This is a very low-level function, most users
+/// should prefer to use TokenCollector. Lexing in raw mode produces wildly
+/// different results from what one might expect when running a C++ frontend,
+/// e.g. the preprocessor does not run at all.
+/// The result will *not* have an 'eof' token at the end.
+std::vector<syntax::Token> tokenize(FileID FID, const SourceManager &SM,
+                                    const LangOptions &LO);
+
+/// Collects tokens for the main file while running the frontend action. An
+/// instance of this object should be created on
+/// FrontendAction::BeginSourceFile() and the results should be consumed after
+/// FrontendAction::Execute() finishes.
+class TokenCollector {
+public:
+  /// Adds the hooks to collect the tokens. Should be called before the
+  /// preprocessing starts, i.e. as a part of BeginSourceFile() or
+  /// CreateASTConsumer().
+  TokenCollector(Preprocessor &P);
+
+  /// Finalizes token collection. Should be called after preprocessing is
+  /// finished, i.e. after running Execute().
+  LLVM_NODISCARD TokenBuffer consume() &&;
+
+private:
+  /// Maps from a start to an end spelling location of transformations
+  /// performed by the preprocessor. These include:
+  ///   1. range from '#' to the last token in the line for PP directives,
+  ///   2. macro name and arguments for macro expansions.
+  /// Note that we record only top-level macro expansions, intermediate
+  /// expansions (e.g. inside macro arguments) are ignored.
+  ///
+  /// Used to find correct boundaries of macro calls and directives when
+  /// building mappings from spelled to expanded tokens.
+  ///
+  /// Logically, at each point of the preprocessor execution there is a stack
+  /// of macro expansions being processed and we could use it to recover the
+  /// location information we need. However, the public preprocessor API only
+  /// exposes the points when macro expansions start (when we push a macro onto
+  /// the stack) and not when they end (when we pop a macro from the stack).
+  /// To work around this limitation, we rely on source location information
+  /// stored in this map.
+  using PPExpansions = llvm::DenseMap</*SourceLocation*/ int, SourceLocation>;
+  class Builder;
+  class CollectPPExpansions;
+
+  std::vector<syntax::Token> Expanded;
+  // FIXME: we only store macro expansions, also add directives (#pragma, etc.)
+  PPExpansions Expansions;
+  Preprocessor &PP;
+  CollectPPExpansions *Collector;
+};
+
+} // namespace syntax
+} // namespace clang
+
+#endif
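A sketch of the intended workflow, under the assumption that a TokenCollector was installed around BeginSourceFile() and consumed after Execute() as the comments above describe; the spelledRange helper below is hypothetical and only uses the API declared in this header:

#include "clang/Tooling/Syntax/Tokens.h"

// Hypothetical helper: given expanded tokens covering some AST node (a
// subrange of TokenBuffer::expandedTokens()), return the file range the
// user actually wrote, or None if the range cannot be mapped (e.g. it
// covers only part of a macro expansion).
llvm::Optional<clang::syntax::FileRange>
spelledRange(const clang::syntax::TokenBuffer &Buffer,
             llvm::ArrayRef<clang::syntax::Token> Expanded) {
  auto Spelled = Buffer.spelledForExpanded(Expanded);
  if (!Spelled || Spelled->empty())
    return llvm::None;
  // Spelled tokens come from a single file, so Token::range() applies.
  return clang::syntax::Token::range(Buffer.sourceManager(),
                                     Spelled->front(), Spelled->back());
}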
diff --git a/include/clang/Tooling/Syntax/Tree.h b/include/clang/Tooling/Syntax/Tree.h
new file mode 100644
index 000000000000..b66777561a6a
--- /dev/null
+++ b/include/clang/Tooling/Syntax/Tree.h
@@ -0,0 +1,142 @@
+//===- Tree.h - structure of the syntax tree ------------------*- C++ -*-=====//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// Defines the basic structure of the syntax tree. There are two kinds of
+// nodes:
+//   - leaf nodes correspond to a token in the expanded token stream,
+//   - tree nodes correspond to language grammar constructs.
+//
+// The tree is initially built from an AST. Each node of a newly built tree
+// covers a continuous subrange of expanded tokens (i.e. tokens after
+// preprocessing); the specific tokens covered are stored in the leaf nodes of
+// a tree. A post-order traversal of a tree will visit leaf nodes in an order
+// corresponding to the original order of expanded tokens.
+//
+// This is still a work in progress and highly experimental; we leave room for
+// ourselves to completely change the design and/or implementation.
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_CLANG_TOOLING_SYNTAX_TREE_CASCADE_H
+#define LLVM_CLANG_TOOLING_SYNTAX_TREE_CASCADE_H
+
+#include "clang/Basic/LangOptions.h"
+#include "clang/Basic/SourceLocation.h"
+#include "clang/Basic/SourceManager.h"
+#include "clang/Basic/TokenKinds.h"
+#include "clang/Tooling/Syntax/Tokens.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/Support/Allocator.h"
+#include <cstdint>
+
+namespace clang {
+namespace syntax {
+
+/// A memory arena for syntax trees. Also tracks the underlying token buffers,
+/// source manager, etc.
+class Arena {
+public:
+  Arena(SourceManager &SourceMgr, const LangOptions &LangOpts,
+        TokenBuffer Tokens);
+
+  const SourceManager &sourceManager() const { return SourceMgr; }
+  const LangOptions &langOptions() const { return LangOpts; }
+
+  const TokenBuffer &tokenBuffer() const;
+  llvm::BumpPtrAllocator &allocator() { return Allocator; }
+
+  /// Add \p Buffer to the underlying source manager, tokenize it and store the
+  /// resulting tokens. Useful when there is a need to materialize tokens that
+  /// were not written in user code.
+  std::pair<FileID, llvm::ArrayRef<syntax::Token>>
+  lexBuffer(std::unique_ptr<llvm::MemoryBuffer> Buffer);
+
+private:
+  SourceManager &SourceMgr;
+  const LangOptions &LangOpts;
+  TokenBuffer Tokens;
+  /// IDs and storage for additional tokenized files.
+  llvm::DenseMap<FileID, std::vector<syntax::Token>> ExtraTokens;
+  /// Keeps all the allocated nodes and their intermediate data structures.
+  llvm::BumpPtrAllocator Allocator;
+};
+
+class Tree;
+class TreeBuilder;
+enum class NodeKind : uint16_t;
+enum class NodeRole : uint8_t;
+
+/// A node in a syntax tree. Each node is either a Leaf (representing tokens)
+/// or a Tree (representing language constructs).
+class Node {
+public:
+  /// Newly created nodes are detached from a tree; parent and sibling links
+  /// are set when the node is added as a child to another one.
+  Node(NodeKind Kind);
+
+  NodeKind kind() const { return static_cast<NodeKind>(Kind); }
+  NodeRole role() const { return static_cast<NodeRole>(Role); }
+
+  const Tree *parent() const { return Parent; }
+  Tree *parent() { return Parent; }
+
+  const Node *nextSibling() const { return NextSibling; }
+  Node *nextSibling() { return NextSibling; }
+
+  /// Dumps the structure of a subtree. For debugging and testing purposes.
+  std::string dump(const Arena &A) const;
+  /// Dumps the tokens forming this subtree.
+  std::string dumpTokens(const Arena &A) const;
+
+private:
+  // Tree is allowed to change the Parent link and Role.
+  friend class Tree;
+
+  Tree *Parent;
+  Node *NextSibling;
+  unsigned Kind : 16;
+  unsigned Role : 8;
+};
+
+/// A leaf node points to a single token inside the expanded token stream.
+class Leaf final : public Node {
+public:
+  Leaf(const syntax::Token *T);
+  static bool classof(const Node *N);
+
+  const syntax::Token *token() const { return Tok; }
+
+private:
+  const syntax::Token *Tok;
+};
+
+/// A node that has children and represents a syntactic language construct.
+class Tree : public Node {
+public:
+  using Node::Node;
+  static bool classof(const Node *N);
+
+  Node *firstChild() { return FirstChild; }
+  const Node *firstChild() const { return FirstChild; }
+
+protected:
+  /// Find the first node with a corresponding role.
+  syntax::Node *findChild(NodeRole R);
+
+private:
+  /// Prepend \p Child to the list of children and set the parent pointer.
+  /// A very low-level operation that does not check any invariants, only used
+  /// by TreeBuilder.
+  /// EXPECTS: Role != NodeRole::Detached.
+  void prependChildLowLevel(Node *Child, NodeRole Role);
+  friend class TreeBuilder;
+
+  Node *FirstChild = nullptr;
+};
+
+} // namespace syntax
+} // namespace clang
+
+#endif
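Arena::lexBuffer is the hook for materializing tokens that have no spelling in the original sources, e.g. when synthesizing tree nodes later. A minimal sketch; the lexSemicolon helper and the choice of ";" as input are illustrative assumptions:

#include "clang/Tooling/Syntax/Tree.h"
#include "llvm/Support/MemoryBuffer.h"

// Hypothetical helper: lex a standalone ";" into the arena and return the
// resulting token(s); their storage is owned by the Arena.
llvm::ArrayRef<clang::syntax::Token>
lexSemicolon(clang::syntax::Arena &A) {
  return A.lexBuffer(llvm::MemoryBuffer::getMemBufferCopy(";")).second;
}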