diff options
Diffstat (limited to 'clang/lib/Lex')
23 files changed, 25116 insertions, 0 deletions
diff --git a/clang/lib/Lex/DependencyDirectivesSourceMinimizer.cpp b/clang/lib/Lex/DependencyDirectivesSourceMinimizer.cpp new file mode 100644 index 000000000000..f063ed711c44 --- /dev/null +++ b/clang/lib/Lex/DependencyDirectivesSourceMinimizer.cpp @@ -0,0 +1,955 @@ +//===- DependencyDirectivesSourceMinimizer.cpp - -------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This is the implementation for minimizing header and source files to the +/// minimum necessary preprocessor directives for evaluating includes. It +/// reduces the source down to #define, #include, #import, @import, and any +/// conditional preprocessor logic that contains one of those. +/// +//===----------------------------------------------------------------------===// + +#include "clang/Lex/DependencyDirectivesSourceMinimizer.h" +#include "clang/Basic/CharInfo.h" +#include "clang/Basic/Diagnostic.h" +#include "clang/Lex/LexDiagnostic.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/Support/MemoryBuffer.h" + +using namespace llvm; +using namespace clang; +using namespace clang::minimize_source_to_dependency_directives; + +namespace { + +struct Minimizer { + /// Minimized output. + SmallVectorImpl<char> &Out; + /// The known tokens encountered during the minimization. + SmallVectorImpl<Token> &Tokens; + + Minimizer(SmallVectorImpl<char> &Out, SmallVectorImpl<Token> &Tokens, + StringRef Input, DiagnosticsEngine *Diags, + SourceLocation InputSourceLoc) + : Out(Out), Tokens(Tokens), Input(Input), Diags(Diags), + InputSourceLoc(InputSourceLoc) {} + + /// Lex the provided source and emit the minimized output. + /// + /// \returns True on error. 
+ bool minimize(); + +private: + struct IdInfo { + const char *Last; + StringRef Name; + }; + + /// Lex an identifier. + /// + /// \pre First points at a valid identifier head. + LLVM_NODISCARD IdInfo lexIdentifier(const char *First, const char *const End); + LLVM_NODISCARD bool isNextIdentifier(StringRef Id, const char *&First, + const char *const End); + LLVM_NODISCARD bool minimizeImpl(const char *First, const char *const End); + LLVM_NODISCARD bool lexPPLine(const char *&First, const char *const End); + LLVM_NODISCARD bool lexAt(const char *&First, const char *const End); + LLVM_NODISCARD bool lexModule(const char *&First, const char *const End); + LLVM_NODISCARD bool lexDefine(const char *&First, const char *const End); + LLVM_NODISCARD bool lexPragma(const char *&First, const char *const End); + LLVM_NODISCARD bool lexEndif(const char *&First, const char *const End); + LLVM_NODISCARD bool lexDefault(TokenKind Kind, StringRef Directive, + const char *&First, const char *const End); + Token &makeToken(TokenKind K) { + Tokens.emplace_back(K, Out.size()); + return Tokens.back(); + } + void popToken() { + Out.resize(Tokens.back().Offset); + Tokens.pop_back(); + } + TokenKind top() const { return Tokens.empty() ? 
pp_none : Tokens.back().K; } + + Minimizer &put(char Byte) { + Out.push_back(Byte); + return *this; + } + Minimizer &append(StringRef S) { return append(S.begin(), S.end()); } + Minimizer &append(const char *First, const char *Last) { + Out.append(First, Last); + return *this; + } + + void printToNewline(const char *&First, const char *const End); + void printAdjacentModuleNameParts(const char *&First, const char *const End); + LLVM_NODISCARD bool printAtImportBody(const char *&First, + const char *const End); + void printDirectiveBody(const char *&First, const char *const End); + void printAdjacentMacroArgs(const char *&First, const char *const End); + LLVM_NODISCARD bool printMacroArgs(const char *&First, const char *const End); + + /// Reports a diagnostic if the diagnostic engine is provided. Always returns + /// true at the end. + bool reportError(const char *CurPtr, unsigned Err); + + StringMap<char> SplitIds; + StringRef Input; + DiagnosticsEngine *Diags; + SourceLocation InputSourceLoc; +}; + +} // end anonymous namespace + +bool Minimizer::reportError(const char *CurPtr, unsigned Err) { + if (!Diags) + return true; + assert(CurPtr >= Input.data() && "invalid buffer ptr"); + Diags->Report(InputSourceLoc.getLocWithOffset(CurPtr - Input.data()), Err); + return true; +} + +static void skipOverSpaces(const char *&First, const char *const End) { + while (First != End && isHorizontalWhitespace(*First)) + ++First; +} + +LLVM_NODISCARD static bool isRawStringLiteral(const char *First, + const char *Current) { + assert(First <= Current); + + // Check if we can even back up. + if (*Current != '"' || First == Current) + return false; + + // Check for an "R". + --Current; + if (*Current != 'R') + return false; + if (First == Current || !isIdentifierBody(*--Current)) + return true; + + // Check for a prefix of "u", "U", or "L". 
+ if (*Current == 'u' || *Current == 'U' || *Current == 'L') + return First == Current || !isIdentifierBody(*--Current); + + // Check for a prefix of "u8". + if (*Current != '8' || First == Current || *Current-- != 'u') + return false; + return First == Current || !isIdentifierBody(*--Current); +} + +static void skipRawString(const char *&First, const char *const End) { + assert(First[0] == '"'); + assert(First[-1] == 'R'); + + const char *Last = ++First; + while (Last != End && *Last != '(') + ++Last; + if (Last == End) { + First = Last; // Hit the end... just give up. + return; + } + + StringRef Terminator(First, Last - First); + for (;;) { + // Move First to just past the next ")". + First = Last; + while (First != End && *First != ')') + ++First; + if (First == End) + return; + ++First; + + // Look ahead for the terminator sequence. + Last = First; + while (Last != End && size_t(Last - First) < Terminator.size() && + Terminator[Last - First] == *Last) + ++Last; + + // Check if we hit it (or the end of the file). + if (Last == End) { + First = Last; + return; + } + if (size_t(Last - First) < Terminator.size()) + continue; + if (*Last != '"') + continue; + First = Last + 1; + return; + } +} + +// Returns the length of EOL, either 0 (no end-of-line), 1 (\n) or 2 (\r\n) +static unsigned isEOL(const char *First, const char *const End) { + if (First == End) + return 0; + if (End - First > 1 && isVerticalWhitespace(First[0]) && + isVerticalWhitespace(First[1]) && First[0] != First[1]) + return 2; + return !!isVerticalWhitespace(First[0]); +} + +static void skipString(const char *&First, const char *const End) { + assert(*First == '\'' || *First == '"' || *First == '<'); + const char Terminator = *First == '<' ? '>' : *First; + for (++First; First != End && *First != Terminator; ++First) { + // String and character literals don't extend past the end of the line. 
+ if (isVerticalWhitespace(*First)) + return; + if (*First != '\\') + continue; + // Skip past backslash to the next character. This ensures that the + // character right after it is skipped as well, which matters if it's + // the terminator. + if (++First == End) + return; + if (!isWhitespace(*First)) + continue; + // Whitespace after the backslash might indicate a line continuation. + const char *FirstAfterBackslashPastSpace = First; + skipOverSpaces(FirstAfterBackslashPastSpace, End); + if (unsigned NLSize = isEOL(FirstAfterBackslashPastSpace, End)) { + // Advance the character pointer to the next line for the next + // iteration. + First = FirstAfterBackslashPastSpace + NLSize - 1; + } + } + if (First != End) + ++First; // Finish off the string. +} + +// Returns the length of the skipped newline +static unsigned skipNewline(const char *&First, const char *End) { + if (First == End) + return 0; + assert(isVerticalWhitespace(*First)); + unsigned Len = isEOL(First, End); + assert(Len && "expected newline"); + First += Len; + return Len; +} + +static bool wasLineContinuation(const char *First, unsigned EOLLen) { + return *(First - (int)EOLLen - 1) == '\\'; +} + +static void skipToNewlineRaw(const char *&First, const char *const End) { + for (;;) { + if (First == End) + return; + + unsigned Len = isEOL(First, End); + if (Len) + return; + + do { + if (++First == End) + return; + Len = isEOL(First, End); + } while (!Len); + + if (First[-1] != '\\') + return; + + First += Len; + // Keep skipping lines... 
+ } +} + +static const char *findLastNonSpace(const char *First, const char *Last) { + assert(First <= Last); + while (First != Last && isHorizontalWhitespace(Last[-1])) + --Last; + return Last; +} + +static const char *findFirstTrailingSpace(const char *First, + const char *Last) { + const char *LastNonSpace = findLastNonSpace(First, Last); + if (Last == LastNonSpace) + return Last; + assert(isHorizontalWhitespace(LastNonSpace[0])); + return LastNonSpace + 1; +} + +static void skipLineComment(const char *&First, const char *const End) { + assert(First[0] == '/' && First[1] == '/'); + First += 2; + skipToNewlineRaw(First, End); +} + +static void skipBlockComment(const char *&First, const char *const End) { + assert(First[0] == '/' && First[1] == '*'); + if (End - First < 4) { + First = End; + return; + } + for (First += 3; First != End; ++First) + if (First[-1] == '*' && First[0] == '/') { + ++First; + return; + } +} + +/// \returns True if the current single quotation mark character is a C++ 14 +/// digit separator. +static bool isQuoteCppDigitSeparator(const char *const Start, + const char *const Cur, + const char *const End) { + assert(*Cur == '\'' && "expected quotation character"); + // skipLine called in places where we don't expect a valid number + // body before `start` on the same line, so always return false at the start. + if (Start == Cur) + return false; + // The previous character must be a valid PP number character. + // Make sure that the L, u, U, u8 prefixes don't get marked as a + // separator though. + char Prev = *(Cur - 1); + if (Prev == 'L' || Prev == 'U' || Prev == 'u') + return false; + if (Prev == '8' && (Cur - 1 != Start) && *(Cur - 2) == 'u') + return false; + if (!isPreprocessingNumberBody(Prev)) + return false; + // The next character should be a valid identifier body character. 
+ return (Cur + 1) < End && isIdentifierBody(*(Cur + 1)); +} + +static void skipLine(const char *&First, const char *const End) { + for (;;) { + assert(First <= End); + if (First == End) + return; + + if (isVerticalWhitespace(*First)) { + skipNewline(First, End); + return; + } + const char *Start = First; + while (First != End && !isVerticalWhitespace(*First)) { + // Iterate over strings correctly to avoid comments and newlines. + if (*First == '"' || + (*First == '\'' && !isQuoteCppDigitSeparator(Start, First, End))) { + if (isRawStringLiteral(Start, First)) + skipRawString(First, End); + else + skipString(First, End); + continue; + } + + // Iterate over comments correctly. + if (*First != '/' || End - First < 2) { + ++First; + continue; + } + + if (First[1] == '/') { + // "//...". + skipLineComment(First, End); + continue; + } + + if (First[1] != '*') { + ++First; + continue; + } + + // "/*...*/". + skipBlockComment(First, End); + } + if (First == End) + return; + + // Skip over the newline. + unsigned Len = skipNewline(First, End); + if (!wasLineContinuation(First, Len)) // Continue past line-continuations. + break; + } +} + +static void skipDirective(StringRef Name, const char *&First, + const char *const End) { + if (llvm::StringSwitch<bool>(Name) + .Case("warning", true) + .Case("error", true) + .Default(false)) + // Do not process quotes or comments. + skipToNewlineRaw(First, End); + else + skipLine(First, End); +} + +void Minimizer::printToNewline(const char *&First, const char *const End) { + while (First != End && !isVerticalWhitespace(*First)) { + const char *Last = First; + do { + // Iterate over strings correctly to avoid comments and newlines. 
+ if (*Last == '"' || *Last == '\'' || + (*Last == '<' && top() == pp_include)) { + if (LLVM_UNLIKELY(isRawStringLiteral(First, Last))) + skipRawString(Last, End); + else + skipString(Last, End); + continue; + } + if (*Last != '/' || End - Last < 2) { + ++Last; + continue; // Gather the rest up to print verbatim. + } + + if (Last[1] != '/' && Last[1] != '*') { + ++Last; + continue; + } + + // Deal with "//..." and "/*...*/". + append(First, findFirstTrailingSpace(First, Last)); + First = Last; + + if (Last[1] == '/') { + skipLineComment(First, End); + return; + } + + put(' '); + skipBlockComment(First, End); + skipOverSpaces(First, End); + Last = First; + } while (Last != End && !isVerticalWhitespace(*Last)); + + // Print out the string. + const char *LastBeforeTrailingSpace = findLastNonSpace(First, Last); + if (Last == End || LastBeforeTrailingSpace == First || + LastBeforeTrailingSpace[-1] != '\\') { + append(First, LastBeforeTrailingSpace); + First = Last; + skipNewline(First, End); + return; + } + + // Print up to the backslash, backing up over spaces. Preserve at least one + // space, as the space matters when tokens are separated by a line + // continuation. + append(First, findFirstTrailingSpace( + First, LastBeforeTrailingSpace - 1)); + + First = Last; + skipNewline(First, End); + skipOverSpaces(First, End); + } +} + +static void skipWhitespace(const char *&First, const char *const End) { + for (;;) { + assert(First <= End); + skipOverSpaces(First, End); + + if (End - First < 2) + return; + + if (First[0] == '\\' && isVerticalWhitespace(First[1])) { + skipNewline(++First, End); + continue; + } + + // Check for a non-comment character. + if (First[0] != '/') + return; + + // "// ...". + if (First[1] == '/') { + skipLineComment(First, End); + return; + } + + // Cannot be a comment. + if (First[1] != '*') + return; + + // "/*...*/". 
+ skipBlockComment(First, End); + } +} + +void Minimizer::printAdjacentModuleNameParts(const char *&First, + const char *const End) { + // Skip over parts of the body. + const char *Last = First; + do + ++Last; + while (Last != End && (isIdentifierBody(*Last) || *Last == '.')); + append(First, Last); + First = Last; +} + +bool Minimizer::printAtImportBody(const char *&First, const char *const End) { + for (;;) { + skipWhitespace(First, End); + if (First == End) + return true; + + if (isVerticalWhitespace(*First)) { + skipNewline(First, End); + continue; + } + + // Found a semicolon. + if (*First == ';') { + put(*First++).put('\n'); + return false; + } + + // Don't handle macro expansions inside @import for now. + if (!isIdentifierBody(*First) && *First != '.') + return true; + + printAdjacentModuleNameParts(First, End); + } +} + +void Minimizer::printDirectiveBody(const char *&First, const char *const End) { + skipWhitespace(First, End); // Skip initial whitespace. + printToNewline(First, End); + while (Out.back() == ' ') + Out.pop_back(); + put('\n'); +} + +LLVM_NODISCARD static const char *lexRawIdentifier(const char *First, + const char *const End) { + assert(isIdentifierBody(*First) && "invalid identifer"); + const char *Last = First + 1; + while (Last != End && isIdentifierBody(*Last)) + ++Last; + return Last; +} + +LLVM_NODISCARD static const char * +getIdentifierContinuation(const char *First, const char *const End) { + if (End - First < 3 || First[0] != '\\' || !isVerticalWhitespace(First[1])) + return nullptr; + + ++First; + skipNewline(First, End); + if (First == End) + return nullptr; + return isIdentifierBody(First[0]) ? 
First : nullptr; +} + +Minimizer::IdInfo Minimizer::lexIdentifier(const char *First, + const char *const End) { + const char *Last = lexRawIdentifier(First, End); + const char *Next = getIdentifierContinuation(Last, End); + if (LLVM_LIKELY(!Next)) + return IdInfo{Last, StringRef(First, Last - First)}; + + // Slow path, where identifiers are split over lines. + SmallVector<char, 64> Id(First, Last); + while (Next) { + Last = lexRawIdentifier(Next, End); + Id.append(Next, Last); + Next = getIdentifierContinuation(Last, End); + } + return IdInfo{ + Last, + SplitIds.try_emplace(StringRef(Id.begin(), Id.size()), 0).first->first()}; +} + +void Minimizer::printAdjacentMacroArgs(const char *&First, + const char *const End) { + // Skip over parts of the body. + const char *Last = First; + do + ++Last; + while (Last != End && + (isIdentifierBody(*Last) || *Last == '.' || *Last == ',')); + append(First, Last); + First = Last; +} + +bool Minimizer::printMacroArgs(const char *&First, const char *const End) { + assert(*First == '('); + put(*First++); + for (;;) { + skipWhitespace(First, End); + if (First == End) + return true; + + if (*First == ')') { + put(*First++); + return false; + } + + // This is intentionally fairly liberal. + if (!(isIdentifierBody(*First) || *First == '.' || *First == ',')) + return true; + + printAdjacentMacroArgs(First, End); + } +} + +/// Looks for an identifier starting from Last. +/// +/// Updates "First" to just past the next identifier, if any. Returns true iff +/// the identifier matches "Id". +bool Minimizer::isNextIdentifier(StringRef Id, const char *&First, + const char *const End) { + skipWhitespace(First, End); + if (First == End || !isIdentifierHead(*First)) + return false; + + IdInfo FoundId = lexIdentifier(First, End); + First = FoundId.Last; + return FoundId.Name == Id; +} + +bool Minimizer::lexAt(const char *&First, const char *const End) { + // Handle "@import". 
+ const char *ImportLoc = First++; + if (!isNextIdentifier("import", First, End)) { + skipLine(First, End); + return false; + } + makeToken(decl_at_import); + append("@import "); + if (printAtImportBody(First, End)) + return reportError( + ImportLoc, diag::err_dep_source_minimizer_missing_sema_after_at_import); + skipWhitespace(First, End); + if (First == End) + return false; + if (!isVerticalWhitespace(*First)) + return reportError( + ImportLoc, diag::err_dep_source_minimizer_unexpected_tokens_at_import); + skipNewline(First, End); + return false; +} + +bool Minimizer::lexModule(const char *&First, const char *const End) { + IdInfo Id = lexIdentifier(First, End); + First = Id.Last; + bool Export = false; + if (Id.Name == "export") { + Export = true; + skipWhitespace(First, End); + if (!isIdentifierBody(*First)) { + skipLine(First, End); + return false; + } + Id = lexIdentifier(First, End); + First = Id.Last; + } + + if (Id.Name != "module" && Id.Name != "import") { + skipLine(First, End); + return false; + } + + skipWhitespace(First, End); + + // Ignore this as a module directive if the next character can't be part of + // an import. 
+ + switch (*First) { + case ':': + case '<': + case '"': + break; + default: + if (!isIdentifierBody(*First)) { + skipLine(First, End); + return false; + } + } + + if (Export) { + makeToken(cxx_export_decl); + append("export "); + } + + if (Id.Name == "module") + makeToken(cxx_module_decl); + else + makeToken(cxx_import_decl); + append(Id.Name); + append(" "); + printToNewline(First, End); + append("\n"); + return false; +} + +bool Minimizer::lexDefine(const char *&First, const char *const End) { + makeToken(pp_define); + append("#define "); + skipWhitespace(First, End); + + if (!isIdentifierHead(*First)) + return reportError(First, diag::err_pp_macro_not_identifier); + + IdInfo Id = lexIdentifier(First, End); + const char *Last = Id.Last; + append(Id.Name); + if (Last == End) + return false; + if (*Last == '(') { + size_t Size = Out.size(); + if (printMacroArgs(Last, End)) { + // Be robust to bad macro arguments, since they can show up in disabled + // code. + Out.resize(Size); + append("(/* invalid */\n"); + skipLine(Last, End); + return false; + } + } + skipWhitespace(Last, End); + if (Last == End) + return false; + if (!isVerticalWhitespace(*Last)) + put(' '); + printDirectiveBody(Last, End); + First = Last; + return false; +} + +bool Minimizer::lexPragma(const char *&First, const char *const End) { + // #pragma. + skipWhitespace(First, End); + if (First == End || !isIdentifierHead(*First)) + return false; + + IdInfo FoundId = lexIdentifier(First, End); + First = FoundId.Last; + if (FoundId.Name == "once") { + // #pragma once + skipLine(First, End); + makeToken(pp_pragma_once); + append("#pragma once\n"); + return false; + } + + if (FoundId.Name != "clang") { + skipLine(First, End); + return false; + } + + // #pragma clang. + if (!isNextIdentifier("module", First, End)) { + skipLine(First, End); + return false; + } + + // #pragma clang module. 
+ if (!isNextIdentifier("import", First, End)) { + skipLine(First, End); + return false; + } + + // #pragma clang module import. + makeToken(pp_pragma_import); + append("#pragma clang module import "); + printDirectiveBody(First, End); + return false; +} + +bool Minimizer::lexEndif(const char *&First, const char *const End) { + // Strip out "#else" if it's empty. + if (top() == pp_else) + popToken(); + + // Strip out "#elif" if they're empty. + while (top() == pp_elif) + popToken(); + + // If "#if" is empty, strip it and skip the "#endif". + if (top() == pp_if || top() == pp_ifdef || top() == pp_ifndef) { + popToken(); + skipLine(First, End); + return false; + } + + return lexDefault(pp_endif, "endif", First, End); +} + +bool Minimizer::lexDefault(TokenKind Kind, StringRef Directive, + const char *&First, const char *const End) { + makeToken(Kind); + put('#').append(Directive).put(' '); + printDirectiveBody(First, End); + return false; +} + +static bool isStartOfRelevantLine(char First) { + switch (First) { + case '#': + case '@': + case 'i': + case 'e': + case 'm': + return true; + } + return false; +} + +bool Minimizer::lexPPLine(const char *&First, const char *const End) { + assert(First != End); + + skipWhitespace(First, End); + assert(First <= End); + if (First == End) + return false; + + if (!isStartOfRelevantLine(*First)) { + skipLine(First, End); + assert(First <= End); + return false; + } + + // Handle "@import". + if (*First == '@') + return lexAt(First, End); + + if (*First == 'i' || *First == 'e' || *First == 'm') + return lexModule(First, End); + + // Handle preprocessing directives. + ++First; // Skip over '#'. + skipWhitespace(First, End); + + if (First == End) + return reportError(First, diag::err_pp_expected_eol); + + if (!isIdentifierHead(*First)) { + skipLine(First, End); + return false; + } + + // Figure out the token. 
+ IdInfo Id = lexIdentifier(First, End); + First = Id.Last; + auto Kind = llvm::StringSwitch<TokenKind>(Id.Name) + .Case("include", pp_include) + .Case("__include_macros", pp___include_macros) + .Case("define", pp_define) + .Case("undef", pp_undef) + .Case("import", pp_import) + .Case("include_next", pp_include_next) + .Case("if", pp_if) + .Case("ifdef", pp_ifdef) + .Case("ifndef", pp_ifndef) + .Case("elif", pp_elif) + .Case("else", pp_else) + .Case("endif", pp_endif) + .Case("pragma", pp_pragma_import) + .Default(pp_none); + if (Kind == pp_none) { + skipDirective(Id.Name, First, End); + return false; + } + + if (Kind == pp_endif) + return lexEndif(First, End); + + if (Kind == pp_define) + return lexDefine(First, End); + + if (Kind == pp_pragma_import) + return lexPragma(First, End); + + // Everything else. + return lexDefault(Kind, Id.Name, First, End); +} + +static void skipUTF8ByteOrderMark(const char *&First, const char *const End) { + if ((End - First) >= 3 && First[0] == '\xef' && First[1] == '\xbb' && + First[2] == '\xbf') + First += 3; +} + +bool Minimizer::minimizeImpl(const char *First, const char *const End) { + skipUTF8ByteOrderMark(First, End); + while (First != End) + if (lexPPLine(First, End)) + return true; + return false; +} + +bool Minimizer::minimize() { + bool Error = minimizeImpl(Input.begin(), Input.end()); + + if (!Error) { + // Add a trailing newline and an EOF on success. + if (!Out.empty() && Out.back() != '\n') + Out.push_back('\n'); + makeToken(pp_eof); + } + + // Null-terminate the output. This way the memory buffer that's passed to + // Clang will not have to worry about the terminating '\0'. 
+ Out.push_back(0); + Out.pop_back(); + return Error; +} + +bool clang::minimize_source_to_dependency_directives::computeSkippedRanges( + ArrayRef<Token> Input, llvm::SmallVectorImpl<SkippedRange> &Range) { + struct Directive { + enum DirectiveKind { + If, // if/ifdef/ifndef + Else // elif,else + }; + int Offset; + DirectiveKind Kind; + }; + llvm::SmallVector<Directive, 32> Offsets; + for (const Token &T : Input) { + switch (T.K) { + case pp_if: + case pp_ifdef: + case pp_ifndef: + Offsets.push_back({T.Offset, Directive::If}); + break; + + case pp_elif: + case pp_else: { + if (Offsets.empty()) + return true; + int PreviousOffset = Offsets.back().Offset; + Range.push_back({PreviousOffset, T.Offset - PreviousOffset}); + Offsets.push_back({T.Offset, Directive::Else}); + break; + } + + case pp_endif: { + if (Offsets.empty()) + return true; + int PreviousOffset = Offsets.back().Offset; + Range.push_back({PreviousOffset, T.Offset - PreviousOffset}); + do { + Directive::DirectiveKind Kind = Offsets.pop_back_val().Kind; + if (Kind == Directive::If) + break; + } while (!Offsets.empty()); + break; + } + default: + break; + } + } + return false; +} + +bool clang::minimizeSourceToDependencyDirectives( + StringRef Input, SmallVectorImpl<char> &Output, + SmallVectorImpl<Token> &Tokens, DiagnosticsEngine *Diags, + SourceLocation InputSourceLoc) { + Output.clear(); + Tokens.clear(); + return Minimizer(Output, Tokens, Input, Diags, InputSourceLoc).minimize(); +} diff --git a/clang/lib/Lex/HeaderMap.cpp b/clang/lib/Lex/HeaderMap.cpp new file mode 100644 index 000000000000..d44ef29c05d1 --- /dev/null +++ b/clang/lib/Lex/HeaderMap.cpp @@ -0,0 +1,242 @@ +//===--- HeaderMap.cpp - A file that acts like dir of symlinks ------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the HeaderMap interface. +// +//===----------------------------------------------------------------------===// + +#include "clang/Lex/HeaderMap.h" +#include "clang/Lex/HeaderMapTypes.h" +#include "clang/Basic/CharInfo.h" +#include "clang/Basic/FileManager.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/DataTypes.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/SwapByteOrder.h" +#include "llvm/Support/Debug.h" +#include <cstring> +#include <memory> +using namespace clang; + +/// HashHMapKey - This is the 'well known' hash function required by the file +/// format, used to look up keys in the hash table. The hash table uses simple +/// linear probing based on this function. +static inline unsigned HashHMapKey(StringRef Str) { + unsigned Result = 0; + const char *S = Str.begin(), *End = Str.end(); + + for (; S != End; S++) + Result += toLowercase(*S) * 13; + return Result; +} + + + +//===----------------------------------------------------------------------===// +// Verification and Construction +//===----------------------------------------------------------------------===// + +/// HeaderMap::Create - This attempts to load the specified file as a header +/// map. If it doesn't look like a HeaderMap, it gives up and returns null. +/// If it looks like a HeaderMap but is obviously corrupted, it puts a reason +/// into the string error argument and returns null. +std::unique_ptr<HeaderMap> HeaderMap::Create(const FileEntry *FE, + FileManager &FM) { + // If the file is too small to be a header map, ignore it. 
+ unsigned FileSize = FE->getSize(); + if (FileSize <= sizeof(HMapHeader)) return nullptr; + + auto FileBuffer = FM.getBufferForFile(FE); + if (!FileBuffer || !*FileBuffer) + return nullptr; + bool NeedsByteSwap; + if (!checkHeader(**FileBuffer, NeedsByteSwap)) + return nullptr; + return std::unique_ptr<HeaderMap>(new HeaderMap(std::move(*FileBuffer), NeedsByteSwap)); +} + +bool HeaderMapImpl::checkHeader(const llvm::MemoryBuffer &File, + bool &NeedsByteSwap) { + if (File.getBufferSize() <= sizeof(HMapHeader)) + return false; + const char *FileStart = File.getBufferStart(); + + // We know the file is at least as big as the header, check it now. + const HMapHeader *Header = reinterpret_cast<const HMapHeader*>(FileStart); + + // Sniff it to see if it's a headermap by checking the magic number and + // version. + if (Header->Magic == HMAP_HeaderMagicNumber && + Header->Version == HMAP_HeaderVersion) + NeedsByteSwap = false; + else if (Header->Magic == llvm::ByteSwap_32(HMAP_HeaderMagicNumber) && + Header->Version == llvm::ByteSwap_16(HMAP_HeaderVersion)) + NeedsByteSwap = true; // Mixed endianness headermap. + else + return false; // Not a header map. + + if (Header->Reserved != 0) + return false; + + // Check the number of buckets. It should be a power of two, and there + // should be enough space in the file for all of them. + uint32_t NumBuckets = NeedsByteSwap + ? llvm::sys::getSwappedBytes(Header->NumBuckets) + : Header->NumBuckets; + if (!llvm::isPowerOf2_32(NumBuckets)) + return false; + if (File.getBufferSize() < + sizeof(HMapHeader) + sizeof(HMapBucket) * NumBuckets) + return false; + + // Okay, everything looks good. + return true; +} + +//===----------------------------------------------------------------------===// +// Utility Methods +//===----------------------------------------------------------------------===// + + +/// getFileName - Return the filename of the headermap. 
+// Returns the identifier of the backing MemoryBuffer (typically the
+// headermap's on-disk path); used for diagnostics and dump().
+StringRef HeaderMapImpl::getFileName() const {
+  return FileBuffer->getBufferIdentifier();
+}
+
+/// Convert a 32-bit word from on-disk byte order to host byte order; a no-op
+/// when NeedsBSwap is false.
+unsigned HeaderMapImpl::getEndianAdjustedWord(unsigned X) const {
+  if (!NeedsBSwap) return X;
+  return llvm::ByteSwap_32(X);
+}
+
+/// getHeader - Return a reference to the file header, in unbyte-swapped form.
+/// This method cannot fail.
+const HMapHeader &HeaderMapImpl::getHeader() const {
+  // We know the file is at least as big as the header. Return it.
+  return *reinterpret_cast<const HMapHeader*>(FileBuffer->getBufferStart());
+}
+
+/// getBucket - Return the specified hash table bucket from the header map,
+/// bswap'ing its fields as appropriate. If the bucket number is not valid,
+/// this returns a bucket with an empty key (0).
+HMapBucket HeaderMapImpl::getBucket(unsigned BucketNo) const {
+  assert(FileBuffer->getBufferSize() >=
+         sizeof(HMapHeader) + sizeof(HMapBucket) * BucketNo &&
+         "Expected bucket to be in range");
+
+  HMapBucket Result;
+  Result.Key = HMAP_EmptyBucketKey;
+
+  // Buckets are laid out contiguously immediately after the header.
+  const HMapBucket *BucketArray =
+    reinterpret_cast<const HMapBucket*>(FileBuffer->getBufferStart() +
+                                        sizeof(HMapHeader));
+  const HMapBucket *BucketPtr = BucketArray+BucketNo;
+
+  // Load the values, bswapping as needed.
+  Result.Key = getEndianAdjustedWord(BucketPtr->Key);
+  Result.Prefix = getEndianAdjustedWord(BucketPtr->Prefix);
+  Result.Suffix = getEndianAdjustedWord(BucketPtr->Suffix);
+  return Result;
+}
+
+// Return the string at the given string-table offset, or None if the offset
+// is out of bounds or the string is not NUL-terminated within the buffer.
+Optional<StringRef> HeaderMapImpl::getString(unsigned StrTabIdx) const {
+  // Add the start of the string table to the idx.
+  StrTabIdx += getEndianAdjustedWord(getHeader().StringsOffset);
+
+  // Check for invalid index.
+  if (StrTabIdx >= FileBuffer->getBufferSize())
+    return None;
+
+  const char *Data = FileBuffer->getBufferStart() + StrTabIdx;
+  unsigned MaxLen = FileBuffer->getBufferSize() - StrTabIdx;
+  unsigned Len = strnlen(Data, MaxLen);
+
+  // Check whether the buffer is null-terminated.
+  // Len == MaxLen means strnlen ran to the end of the buffer; the string is
+  // only valid if the final byte is the terminating NUL.
+  if (Len == MaxLen && Data[Len - 1])
+    return None;
+
+  return StringRef(Data, Len);
+}
+
+//===----------------------------------------------------------------------===//
+// The Main Drivers
+//===----------------------------------------------------------------------===//
+
+/// dump - Print the contents of this headermap to stderr.
+LLVM_DUMP_METHOD void HeaderMapImpl::dump() const {
+  const HMapHeader &Hdr = getHeader();
+  unsigned NumBuckets = getEndianAdjustedWord(Hdr.NumBuckets);
+
+  llvm::dbgs() << "Header Map " << getFileName() << ":\n " << NumBuckets
+               << ", " << getEndianAdjustedWord(Hdr.NumEntries) << "\n";
+
+  // Render a string-table entry, falling back to a placeholder for offsets
+  // that getString() rejects.
+  auto getStringOrInvalid = [this](unsigned Id) -> StringRef {
+    if (Optional<StringRef> S = getString(Id))
+      return *S;
+    return "<invalid>";
+  };
+
+  for (unsigned i = 0; i != NumBuckets; ++i) {
+    HMapBucket B = getBucket(i);
+    if (B.Key == HMAP_EmptyBucketKey) continue;
+
+    StringRef Key = getStringOrInvalid(B.Key);
+    StringRef Prefix = getStringOrInvalid(B.Prefix);
+    StringRef Suffix = getStringOrInvalid(B.Suffix);
+    llvm::dbgs() << " " << i << ". " << Key << " -> '" << Prefix << "' '"
+                 << Suffix << "'\n";
+  }
+}
+
+/// LookupFile - Check to see if the specified relative filename is located in
+/// this HeaderMap. If so, open it and return its FileEntry.
+Optional<FileEntryRef> HeaderMap::LookupFile(StringRef Filename,
+                                             FileManager &FM) const {
+
+  SmallString<1024> Path;
+  StringRef Dest = HeaderMapImpl::lookupFilename(Filename, Path);
+  if (Dest.empty())
+    return None;
+
+  return FM.getOptionalFileRef(Dest);
+}
+
+// Map Filename through the headermap's hash table, writing the concatenated
+// prefix+suffix destination into DestPath and returning a StringRef into it
+// (empty on a miss).
+StringRef HeaderMapImpl::lookupFilename(StringRef Filename,
+                                        SmallVectorImpl<char> &DestPath) const {
+  const HMapHeader &Hdr = getHeader();
+  unsigned NumBuckets = getEndianAdjustedWord(Hdr.NumBuckets);
+
+  // Don't probe infinitely. This should be checked before constructing.
+  assert(llvm::isPowerOf2_32(NumBuckets) && "Expected power of 2");
+
+  // Linearly probe the hash table.
+  // NOTE(review): termination relies on eventually hitting an empty bucket; a
+  // corrupt map with no empty buckets and no match would spin -- presumed
+  // validated at construction time.
+  for (unsigned Bucket = HashHMapKey(Filename);; ++Bucket) {
+    HMapBucket B = getBucket(Bucket & (NumBuckets-1));
+    if (B.Key == HMAP_EmptyBucketKey) return StringRef(); // Hash miss.
+
+    // See if the key matches. If not, probe on.
+    Optional<StringRef> Key = getString(B.Key);
+    if (LLVM_UNLIKELY(!Key))
+      continue;
+    if (!Filename.equals_lower(*Key))
+      continue;
+
+    // If so, we have a match in the hash table. Construct the destination
+    // path.
+    Optional<StringRef> Prefix = getString(B.Prefix);
+    Optional<StringRef> Suffix = getString(B.Suffix);
+
+    DestPath.clear();
+    if (LLVM_LIKELY(Prefix && Suffix)) {
+      DestPath.append(Prefix->begin(), Prefix->end());
+      DestPath.append(Suffix->begin(), Suffix->end());
+    }
+    return StringRef(DestPath.begin(), DestPath.size());
+  }
+}
diff --git a/clang/lib/Lex/HeaderSearch.cpp b/clang/lib/Lex/HeaderSearch.cpp
new file mode 100644
index 000000000000..f0c5900c8ce4
--- /dev/null
+++ b/clang/lib/Lex/HeaderSearch.cpp
@@ -0,0 +1,1801 @@
+//===- HeaderSearch.cpp - Resolve Header File Locations -------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the DirectoryLookup and HeaderSearch interfaces.
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Lex/HeaderSearch.h"
+#include "clang/Basic/Diagnostic.h"
+#include "clang/Basic/FileManager.h"
+#include "clang/Basic/IdentifierTable.h"
+#include "clang/Basic/Module.h"
+#include "clang/Basic/SourceManager.h"
+#include "clang/Lex/DirectoryLookup.h"
+#include "clang/Lex/ExternalPreprocessorSource.h"
+#include "clang/Lex/HeaderMap.h"
+#include "clang/Lex/HeaderSearchOptions.h"
+#include "clang/Lex/LexDiagnostic.h"
+#include "clang/Lex/ModuleMap.h"
+#include "clang/Lex/Preprocessor.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/Capacity.h"
+#include "llvm/Support/Errc.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/VirtualFileSystem.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdio>
+#include <cstring>
+#include <string>
+#include <system_error>
+#include <utility>
+
+using namespace clang;
+
+#define DEBUG_TYPE "file-search"
+
+ALWAYS_ENABLED_STATISTIC(NumIncluded, "Number of attempted #includes.");
+ALWAYS_ENABLED_STATISTIC(
+    NumMultiIncludeFileOptzn,
+    "Number of #includes skipped due to the multi-include optimization.");
+ALWAYS_ENABLED_STATISTIC(NumFrameworkLookups, "Number of framework lookups.");
+ALWAYS_ENABLED_STATISTIC(NumSubFrameworkLookups,
+                         "Number of subframework lookups.");
+
+/// Lazily resolve the controlling (include-guard) macro for this header,
+/// deserializing it from the external preprocessor source when only its ID is
+/// known, and refreshing it when the external source has marked it out of
+/// date.
+const IdentifierInfo *
+HeaderFileInfo::getControllingMacro(ExternalPreprocessorSource *External) {
+  if (ControllingMacro) {
+    if (ControllingMacro->isOutOfDate()) {
+      assert(External && "We must have an external source if we have a "
+                         "controlling macro that is out of date.");
+      External->updateOutOfDateIdentifier(
+          *const_cast<IdentifierInfo *>(ControllingMacro));
+    }
+    return ControllingMacro;
+  }
+
+  if (!ControllingMacroID || !External)
+    return nullptr;
+
+  // Only the serialized ID is available; ask the external source to
+  // materialize the IdentifierInfo and cache it.
+  ControllingMacro = External->GetIdentifier(ControllingMacroID);
+  return ControllingMacro;
+}
+
+ExternalHeaderFileInfoSource::~ExternalHeaderFileInfoSource() = default;
+
+HeaderSearch::HeaderSearch(std::shared_ptr<HeaderSearchOptions> HSOpts,
+                           SourceManager &SourceMgr, DiagnosticsEngine &Diags,
+                           const LangOptions &LangOpts,
+                           const TargetInfo *Target)
+    : HSOpts(std::move(HSOpts)), Diags(Diags),
+      FileMgr(SourceMgr.getFileManager()), FrameworkMap(64),
+      ModMap(SourceMgr, Diags, LangOpts, Target, *this) {}
+
+void HeaderSearch::PrintStats() {
+  llvm::errs() << "\n*** HeaderSearch Stats:\n"
+               << FileInfo.size() << " files tracked.\n";
+  unsigned NumOnceOnlyFiles = 0, MaxNumIncludes = 0, NumSingleIncludedFiles = 0;
+  for (unsigned i = 0, e = FileInfo.size(); i != e; ++i) {
+    NumOnceOnlyFiles += FileInfo[i].isImport;
+    if (MaxNumIncludes < FileInfo[i].NumIncludes)
+      MaxNumIncludes = FileInfo[i].NumIncludes;
+    NumSingleIncludedFiles += FileInfo[i].NumIncludes == 1;
+  }
+  llvm::errs() << " " << NumOnceOnlyFiles << " #import/#pragma once files.\n"
+               << " " << NumSingleIncludedFiles << " included exactly once.\n"
+               << " " << MaxNumIncludes << " max times a file is included.\n";
+
+  llvm::errs() << " " << NumIncluded << " #include/#include_next/#import.\n"
+               << " " << NumMultiIncludeFileOptzn
+               << " #includes skipped due to the multi-include optimization.\n";
+
+  llvm::errs() << NumFrameworkLookups << " framework lookups.\n"
+               << NumSubFrameworkLookups << " subframework lookups.\n";
+}
+
+/// CreateHeaderMap - This method returns a HeaderMap for the specified
+/// FileEntry, uniquing them through the 'HeaderMaps' datastructure.
+const HeaderMap *HeaderSearch::CreateHeaderMap(const FileEntry *FE) {
+  // We expect the number of headermaps to be small, and almost always empty.
+  // If it ever grows, use of a linear search should be re-evaluated.
+  if (!HeaderMaps.empty()) {
+    for (unsigned i = 0, e = HeaderMaps.size(); i != e; ++i)
+      // Pointer equality comparison of FileEntries works because they are
+      // already uniqued by inode.
+      if (HeaderMaps[i].first == FE)
+        return HeaderMaps[i].second.get();
+  }
+
+  if (std::unique_ptr<HeaderMap> HM = HeaderMap::Create(FE, FileMgr)) {
+    HeaderMaps.emplace_back(FE, std::move(HM));
+    return HeaderMaps.back().second.get();
+  }
+
+  // HeaderMap::Create failed (e.g. not a valid headermap file).
+  return nullptr;
+}
+
+/// Get filenames for all registered header maps.
+void HeaderSearch::getHeaderMapFileNames(
+    SmallVectorImpl<std::string> &Names) const {
+  for (auto &HM : HeaderMaps)
+    Names.push_back(HM.first->getName());
+}
+
+std::string HeaderSearch::getCachedModuleFileName(Module *Module) {
+  const FileEntry *ModuleMap =
+      getModuleMap().getModuleMapFileForUniquing(Module);
+  return getCachedModuleFileName(Module->Name, ModuleMap->getName());
+}
+
+// Resolve ModuleName to a prebuilt .pcm path, first via the explicit
+// name->file map, then (unless FileMapOnly) by probing each prebuilt module
+// directory. Returns an empty string if nothing is found.
+std::string HeaderSearch::getPrebuiltModuleFileName(StringRef ModuleName,
+                                                    bool FileMapOnly) {
+  // First check the module name to pcm file map.
+  auto i (HSOpts->PrebuiltModuleFiles.find(ModuleName));
+  if (i != HSOpts->PrebuiltModuleFiles.end())
+    return i->second;
+
+  if (FileMapOnly || HSOpts->PrebuiltModulePaths.empty())
+    return {};
+
+  // Then go through each prebuilt module directory and try to find the pcm
+  // file.
+  for (const std::string &Dir : HSOpts->PrebuiltModulePaths) {
+    SmallString<256> Result(Dir);
+    llvm::sys::fs::make_absolute(Result);
+    llvm::sys::path::append(Result, ModuleName + ".pcm");
+    if (getFileMgr().getFile(Result.str()))
+      return Result.str().str();
+  }
+  return {};
+}
+
+std::string HeaderSearch::getCachedModuleFileName(StringRef ModuleName,
+                                                  StringRef ModuleMapPath) {
+  // If we don't have a module cache path or aren't supposed to use one, we
+  // can't do anything.
+  if (getModuleCachePath().empty())
+    return {};
+
+  SmallString<256> Result(getModuleCachePath());
+  llvm::sys::fs::make_absolute(Result);
+
+  if (HSOpts->DisableModuleHash) {
+    llvm::sys::path::append(Result, ModuleName + ".pcm");
+  } else {
+    // Construct the name <ModuleName>-<hash of ModuleMapPath>.pcm which should
+    // ideally be globally unique to this particular module. Name collisions
+    // in the hash are safe (because any translation unit can only import one
+    // module with each name), but result in a loss of caching.
+    //
+    // To avoid false-negatives, we form as canonical a path as we can, and map
+    // to lower-case in case we're on a case-insensitive file system.
+    std::string Parent = llvm::sys::path::parent_path(ModuleMapPath);
+    if (Parent.empty())
+      Parent = ".";
+    auto Dir = FileMgr.getDirectory(Parent);
+    if (!Dir)
+      return {};
+    auto DirName = FileMgr.getCanonicalName(*Dir);
+    auto FileName = llvm::sys::path::filename(ModuleMapPath);
+
+    llvm::hash_code Hash =
+        llvm::hash_combine(DirName.lower(), FileName.lower());
+
+    SmallString<128> HashStr;
+    // Base-36 keeps the hash suffix short and filesystem-safe.
+    llvm::APInt(64, size_t(Hash)).toStringUnsigned(HashStr, /*Radix*/36);
+    llvm::sys::path::append(Result, ModuleName + "-" + HashStr + ".pcm");
+  }
+  return Result.str().str();
+}
+
+Module *HeaderSearch::lookupModule(StringRef ModuleName, bool AllowSearch,
+                                   bool AllowExtraModuleMapSearch) {
+  // Look in the module map to determine if there is a module by this name.
+  Module *Module = ModMap.findModule(ModuleName);
+  if (Module || !AllowSearch || !HSOpts->ImplicitModuleMaps)
+    return Module;
+
+  StringRef SearchName = ModuleName;
+  Module = lookupModule(ModuleName, SearchName, AllowExtraModuleMapSearch);
+
+  // The facility for "private modules" -- adjacent, optional module maps named
+  // module.private.modulemap that are supposed to define private submodules --
+  // may have different flavors of names: FooPrivate, Foo_Private and Foo.Private.
+  //
+  // Foo.Private is now deprecated in favor of Foo_Private. Users of FooPrivate
+  // should also rename to Foo_Private. Representing private as submodules
+  // could force building unwanted dependencies into the parent module and cause
+  // dependency cycles.
+  if (!Module && SearchName.consume_back("_Private"))
+    Module = lookupModule(ModuleName, SearchName, AllowExtraModuleMapSearch);
+  if (!Module && SearchName.consume_back("Private"))
+    Module = lookupModule(ModuleName, SearchName, AllowExtraModuleMapSearch);
+  return Module;
+}
+
+Module *HeaderSearch::lookupModule(StringRef ModuleName, StringRef SearchName,
+                                   bool AllowExtraModuleMapSearch) {
+  Module *Module = nullptr;
+
+  // Look through the various header search paths to load any available module
+  // maps, searching for a module map that describes this module.
+  for (unsigned Idx = 0, N = SearchDirs.size(); Idx != N; ++Idx) {
+    if (SearchDirs[Idx].isFramework()) {
+      // Search for or infer a module map for a framework. Here we use
+      // SearchName rather than ModuleName, to permit finding private modules
+      // named FooPrivate in buggy frameworks named Foo.
+      SmallString<128> FrameworkDirName;
+      FrameworkDirName += SearchDirs[Idx].getFrameworkDir()->getName();
+      llvm::sys::path::append(FrameworkDirName, SearchName + ".framework");
+      if (auto FrameworkDir = FileMgr.getDirectory(FrameworkDirName)) {
+        bool IsSystem
+          = SearchDirs[Idx].getDirCharacteristic() != SrcMgr::C_User;
+        Module = loadFrameworkModule(ModuleName, *FrameworkDir, IsSystem);
+        if (Module)
+          break;
+      }
+    }
+
+    // FIXME: Figure out how header maps and module maps will work together.
+
+    // Only deal with normal search directories.
+    if (!SearchDirs[Idx].isNormalDir())
+      continue;
+
+    bool IsSystem = SearchDirs[Idx].isSystemHeaderDirectory();
+    // Search for a module map file in this directory.
+    if (loadModuleMapFile(SearchDirs[Idx].getDir(), IsSystem,
+                          /*IsFramework*/false) == LMM_NewlyLoaded) {
+      // We just loaded a module map file; check whether the module is
+      // available now.
+      Module = ModMap.findModule(ModuleName);
+      if (Module)
+        break;
+    }
+
+    // Search for a module map in a subdirectory with the same name as the
+    // module.
+    SmallString<128> NestedModuleMapDirName;
+    NestedModuleMapDirName = SearchDirs[Idx].getDir()->getName();
+    llvm::sys::path::append(NestedModuleMapDirName, ModuleName);
+    if (loadModuleMapFile(NestedModuleMapDirName, IsSystem,
+                          /*IsFramework*/false) == LMM_NewlyLoaded){
+      // If we just loaded a module map file, look for the module again.
+      Module = ModMap.findModule(ModuleName);
+      if (Module)
+        break;
+    }
+
+    // If we've already performed the exhaustive search for module maps in this
+    // search directory, don't do it again.
+    if (SearchDirs[Idx].haveSearchedAllModuleMaps())
+      continue;
+
+    // Load all module maps in the immediate subdirectories of this search
+    // directory if ModuleName was from @import.
+    if (AllowExtraModuleMapSearch)
+      loadSubdirectoryModuleMaps(SearchDirs[Idx]);
+
+    // Look again for the module.
+    Module = ModMap.findModule(ModuleName);
+    if (Module)
+      break;
+  }
+
+  return Module;
+}
+
+//===----------------------------------------------------------------------===//
+// File lookup within a DirectoryLookup scope
+//===----------------------------------------------------------------------===//
+
+/// getName - Return the directory or filename corresponding to this lookup
+/// object.
+StringRef DirectoryLookup::getName() const {
+  // FIXME: Use the name from \c DirectoryEntryRef.
+  if (isNormalDir())
+    return getDir()->getName();
+  if (isFramework())
+    return getFrameworkDir()->getName();
+  assert(isHeaderMap() && "Unknown DirectoryLookup");
+  return getHeaderMap()->getFileName();
+}
+
+Optional<FileEntryRef> HeaderSearch::getFileAndSuggestModule(
+    StringRef FileName, SourceLocation IncludeLoc, const DirectoryEntry *Dir,
+    bool IsSystemHeaderDir, Module *RequestingModule,
+    ModuleMap::KnownHeader *SuggestedModule) {
+  // If we have a module map that might map this header, load it and
+  // check whether we'll have a suggestion for a module.
+  auto File = getFileMgr().getFileRef(FileName, /*OpenFile=*/true);
+  if (!File) {
+    // For rare, surprising errors (e.g. "out of file handles"), diag the EC
+    // message. Expected "not found"-style errors stay silent so ordinary
+    // search misses don't spam diagnostics.
+    std::error_code EC = llvm::errorToErrorCode(File.takeError());
+    if (EC != llvm::errc::no_such_file_or_directory &&
+        EC != llvm::errc::invalid_argument &&
+        EC != llvm::errc::is_a_directory && EC != llvm::errc::not_a_directory) {
+      Diags.Report(IncludeLoc, diag::err_cannot_open_file)
+          << FileName << EC.message();
+    }
+    return None;
+  }
+
+  // If there is a module that corresponds to this header, suggest it.
+  if (!findUsableModuleForHeader(
+          &File->getFileEntry(), Dir ? Dir : File->getFileEntry().getDir(),
+          RequestingModule, SuggestedModule, IsSystemHeaderDir))
+    return None;
+
+  return *File;
+}
+
+/// LookupFile - Lookup the specified file in this search path, returning it
+/// if it exists or returning null if not.
+Optional<FileEntryRef> DirectoryLookup::LookupFile(
+    StringRef &Filename, HeaderSearch &HS, SourceLocation IncludeLoc,
+    SmallVectorImpl<char> *SearchPath, SmallVectorImpl<char> *RelativePath,
+    Module *RequestingModule, ModuleMap::KnownHeader *SuggestedModule,
+    bool &InUserSpecifiedSystemFramework, bool &IsFrameworkFound,
+    bool &IsInHeaderMap, SmallVectorImpl<char> &MappedName) const {
+  // Reset these out-parameters up front so stale values from a previous
+  // directory's lookup never leak through to the caller.
+  InUserSpecifiedSystemFramework = false;
+  IsInHeaderMap = false;
+  MappedName.clear();
+
+  SmallString<1024> TmpDir;
+  if (isNormalDir()) {
+    // Concatenate the requested file onto the directory.
+    TmpDir = getDir()->getName();
+    llvm::sys::path::append(TmpDir, Filename);
+    if (SearchPath) {
+      StringRef SearchPathRef(getDir()->getName());
+      SearchPath->clear();
+      SearchPath->append(SearchPathRef.begin(), SearchPathRef.end());
+    }
+    if (RelativePath) {
+      RelativePath->clear();
+      RelativePath->append(Filename.begin(), Filename.end());
+    }
+
+    return HS.getFileAndSuggestModule(TmpDir, IncludeLoc, getDir(),
+                                      isSystemHeaderDirectory(),
+                                      RequestingModule, SuggestedModule);
+  }
+
+  if (isFramework())
+    return DoFrameworkLookup(Filename, HS, SearchPath, RelativePath,
+                             RequestingModule, SuggestedModule,
+                             InUserSpecifiedSystemFramework, IsFrameworkFound);
+
+  assert(isHeaderMap() && "Unknown directory lookup");
+  const HeaderMap *HM = getHeaderMap();
+  SmallString<1024> Path;
+  StringRef Dest = HM->lookupFilename(Filename, Path);
+  if (Dest.empty())
+    return None;
+
+  IsInHeaderMap = true;
+
+  auto FixupSearchPath = [&]() {
+    if (SearchPath) {
+      StringRef SearchPathRef(getName());
+      SearchPath->clear();
+      SearchPath->append(SearchPathRef.begin(), SearchPathRef.end());
+    }
+    if (RelativePath) {
+      RelativePath->clear();
+      RelativePath->append(Filename.begin(), Filename.end());
+    }
+  };
+
+  // Check if the headermap maps the filename to a framework include
+  // ("Foo.h" -> "Foo/Foo.h"), in which case continue header lookup using the
+  // framework include.
+  if (llvm::sys::path::is_relative(Dest)) {
+    // Rewrite Filename in place (via MappedName, which outlives this call) so
+    // subsequent search directories see the remapped name.
+    MappedName.append(Dest.begin(), Dest.end());
+    Filename = StringRef(MappedName.begin(), MappedName.size());
+    Optional<FileEntryRef> Result = HM->LookupFile(Filename, HS.getFileMgr());
+    if (Result) {
+      FixupSearchPath();
+      return *Result;
+    }
+  } else if (auto Res = HS.getFileMgr().getOptionalFileRef(Dest)) {
+    FixupSearchPath();
+    return *Res;
+  }
+
+  return None;
+}
+
+/// Given a framework directory, find the top-most framework directory.
+///
+/// \param FileMgr The file manager to use for directory lookups.
+/// \param DirName The name of the framework directory.
+/// \param SubmodulePath Will be populated with the submodule path from the
+/// returned top-level module to the originally named framework.
+static const DirectoryEntry *
+getTopFrameworkDir(FileManager &FileMgr, StringRef DirName,
+                   SmallVectorImpl<std::string> &SubmodulePath) {
+  assert(llvm::sys::path::extension(DirName) == ".framework" &&
+         "Not a framework directory");
+
+  // Note: as an egregious but useful hack we use the real path here, because
+  // frameworks moving between top-level frameworks to embedded frameworks tend
+  // to be symlinked, and we base the logical structure of modules on the
+  // physical layout. In particular, we need to deal with crazy includes like
+  //
+  //   #include <Foo/Frameworks/Bar.framework/Headers/Wibble.h>
+  //
+  // where 'Bar' used to be embedded in 'Foo', is now a top-level framework
+  // which one should access with, e.g.,
+  //
+  //   #include <Bar/Wibble.h>
+  //
+  // Similar issues occur when a top-level framework has moved into an
+  // embedded framework.
+  const DirectoryEntry *TopFrameworkDir = nullptr;
+  if (auto TopFrameworkDirOrErr = FileMgr.getDirectory(DirName))
+    TopFrameworkDir = *TopFrameworkDirOrErr;
+
+  if (TopFrameworkDir)
+    DirName = FileMgr.getCanonicalName(TopFrameworkDir);
+  // Walk up the path; each enclosing ".framework" ancestor becomes the new
+  // top and contributes a submodule-path component.
+  do {
+    // Get the parent directory name.
+    DirName = llvm::sys::path::parent_path(DirName);
+    if (DirName.empty())
+      break;
+
+    // Determine whether this directory exists.
+    auto Dir = FileMgr.getDirectory(DirName);
+    if (!Dir)
+      break;
+
+    // If this is a framework directory, then we're a subframework of this
+    // framework.
+    if (llvm::sys::path::extension(DirName) == ".framework") {
+      SubmodulePath.push_back(llvm::sys::path::stem(DirName));
+      TopFrameworkDir = *Dir;
+    }
+  } while (true);
+
+  return TopFrameworkDir;
+}
+
+// Module lookup is needed either when a suggestion slot was provided, or when
+// the requesting module forbids undeclared includes.
+static bool needModuleLookup(Module *RequestingModule,
+                             bool HasSuggestedModule) {
+  return HasSuggestedModule ||
+         (RequestingModule && RequestingModule->NoUndeclaredIncludes);
+}
+
+/// DoFrameworkLookup - Do a lookup of the specified file in the current
+/// DirectoryLookup, which is a framework directory.
+Optional<FileEntryRef> DirectoryLookup::DoFrameworkLookup(
+    StringRef Filename, HeaderSearch &HS, SmallVectorImpl<char> *SearchPath,
+    SmallVectorImpl<char> *RelativePath, Module *RequestingModule,
+    ModuleMap::KnownHeader *SuggestedModule,
+    bool &InUserSpecifiedSystemFramework, bool &IsFrameworkFound) const {
+  FileManager &FileMgr = HS.getFileMgr();
+
+  // Framework names must have a '/' in the filename.
+  size_t SlashPos = Filename.find('/');
+  if (SlashPos == StringRef::npos)
+    return None;
+
+  // Find out if this is the home for the specified framework, by checking
+  // HeaderSearch. Possible answers are yes/no and unknown.
+  FrameworkCacheEntry &CacheEntry =
+    HS.LookupFrameworkCache(Filename.substr(0, SlashPos));
+
+  // If it is known and in some other directory, fail.
+  if (CacheEntry.Directory && CacheEntry.Directory != getFrameworkDir())
+    return None;
+
+  // Otherwise, construct the path to this framework dir.
+
+  // FrameworkName = "/System/Library/Frameworks/"
+  SmallString<1024> FrameworkName;
+  FrameworkName += getFrameworkDirRef()->getName();
+  if (FrameworkName.empty() || FrameworkName.back() != '/')
+    FrameworkName.push_back('/');
+
+  // FrameworkName = "/System/Library/Frameworks/Cocoa"
+  StringRef ModuleName(Filename.begin(), SlashPos);
+  FrameworkName += ModuleName;
+
+  // FrameworkName = "/System/Library/Frameworks/Cocoa.framework/"
+  FrameworkName += ".framework/";
+
+  // If the cache entry was unresolved, populate it now.
+  if (!CacheEntry.Directory) {
+    ++NumFrameworkLookups;
+
+    // If the framework dir doesn't exist, we fail.
+    auto Dir = FileMgr.getDirectory(FrameworkName);
+    if (!Dir)
+      return None;
+
+    // Otherwise, if it does, remember that this is the right direntry for this
+    // framework.
+    CacheEntry.Directory = getFrameworkDir();
+
+    // If this is a user search directory, check if the framework has been
+    // user-specified as a system framework.
+    if (getDirCharacteristic() == SrcMgr::C_User) {
+      SmallString<1024> SystemFrameworkMarker(FrameworkName);
+      SystemFrameworkMarker += ".system_framework";
+      if (llvm::sys::fs::exists(SystemFrameworkMarker)) {
+        CacheEntry.IsUserSpecifiedSystemFramework = true;
+      }
+    }
+  }
+
+  // Set out flags.
+  InUserSpecifiedSystemFramework = CacheEntry.IsUserSpecifiedSystemFramework;
+  IsFrameworkFound = CacheEntry.Directory;
+
+  if (RelativePath) {
+    RelativePath->clear();
+    RelativePath->append(Filename.begin()+SlashPos+1, Filename.end());
+  }
+
+  // Check "/System/Library/Frameworks/Cocoa.framework/Headers/file.h"
+  // OrigSize marks the position just past ".framework/" so that "Private" can
+  // later be spliced in to turn "Headers/" into "PrivateHeaders/".
+  unsigned OrigSize = FrameworkName.size();
+
+  FrameworkName += "Headers/";
+
+  if (SearchPath) {
+    SearchPath->clear();
+    // Without trailing '/'.
+    SearchPath->append(FrameworkName.begin(), FrameworkName.end()-1);
+  }
+
+  FrameworkName.append(Filename.begin()+SlashPos+1, Filename.end());
+
+  auto File =
+      FileMgr.getOptionalFileRef(FrameworkName, /*OpenFile=*/!SuggestedModule);
+  if (!File) {
+    // Check "/System/Library/Frameworks/Cocoa.framework/PrivateHeaders/file.h"
+    const char *Private = "Private";
+    FrameworkName.insert(FrameworkName.begin()+OrigSize, Private,
+                         Private+strlen(Private));
+    if (SearchPath)
+      SearchPath->insert(SearchPath->begin()+OrigSize, Private,
+                         Private+strlen(Private));
+
+    File = FileMgr.getOptionalFileRef(FrameworkName,
+                                      /*OpenFile=*/!SuggestedModule);
+  }
+
+  // If we found the header and are allowed to suggest a module, do so now.
+  if (File && needModuleLookup(RequestingModule, SuggestedModule)) {
+    // Find the framework in which this header occurs.
+    StringRef FrameworkPath = File->getFileEntry().getDir()->getName();
+    bool FoundFramework = false;
+    do {
+      // Determine whether this directory exists.
+      auto Dir = FileMgr.getDirectory(FrameworkPath);
+      if (!Dir)
+        break;
+
+      // If this is a framework directory, then we're a subframework of this
+      // framework.
+      if (llvm::sys::path::extension(FrameworkPath) == ".framework") {
+        FoundFramework = true;
+        break;
+      }
+
+      // Get the parent directory name.
+      FrameworkPath = llvm::sys::path::parent_path(FrameworkPath);
+      if (FrameworkPath.empty())
+        break;
+    } while (true);
+
+    bool IsSystem = getDirCharacteristic() != SrcMgr::C_User;
+    if (FoundFramework) {
+      if (!HS.findUsableModuleForFrameworkHeader(
+              &File->getFileEntry(), FrameworkPath, RequestingModule,
+              SuggestedModule, IsSystem))
+        return None;
+    } else {
+      if (!HS.findUsableModuleForHeader(&File->getFileEntry(), getDir(),
+                                        RequestingModule, SuggestedModule,
+                                        IsSystem))
+        return None;
+    }
+  }
+  if (File)
+    return *File;
+  return None;
+}
+
+void HeaderSearch::setTarget(const TargetInfo &Target) {
+  ModMap.setTarget(Target);
+}
+
+//===----------------------------------------------------------------------===//
+// Header File Location.
+//===----------------------------------------------------------------------===//
+
+/// Return true with a diagnostic if the file that MSVC would have found
+/// fails to match the one that Clang would have found with MSVC header search
+/// disabled.
+static bool checkMSVCHeaderSearch(DiagnosticsEngine &Diags,
+                                  const FileEntry *MSFE, const FileEntry *FE,
+                                  SourceLocation IncludeLoc) {
+  if (MSFE && FE != MSFE) {
+    Diags.Report(IncludeLoc, diag::ext_pp_include_search_ms) << MSFE->getName();
+    return true;
+  }
+  return false;
+}
+
+// Copy Str into the given bump allocator, NUL-terminated; the returned
+// pointer stays valid for the lifetime of the allocator.
+static const char *copyString(StringRef Str, llvm::BumpPtrAllocator &Alloc) {
+  assert(!Str.empty());
+  char *CopyStr = Alloc.Allocate<char>(Str.size()+1);
+  std::copy(Str.begin(), Str.end(), CopyStr);
+  CopyStr[Str.size()] = '\0';
+  return CopyStr;
+}
+
+// Returns true if Path looks like a framework-style header path, filling in
+// FrameworkName (the "Foo.framework" component) and IsPrivateHeader.
+static bool isFrameworkStylePath(StringRef Path, bool &IsPrivateHeader,
+                                 SmallVectorImpl<char> &FrameworkName) {
+  using namespace llvm::sys;
+  path::const_iterator I = path::begin(Path);
+  path::const_iterator E = path::end(Path);
+  IsPrivateHeader = false;
+
+  // Detect different types of framework style paths:
+  //
+  //   ...Foo.framework/{Headers,PrivateHeaders}
+  //   ...Foo.framework/Versions/{A,Current}/{Headers,PrivateHeaders}
+  //   ...Foo.framework/Frameworks/Nested.framework/{Headers,PrivateHeaders}
+  //   ...<other variations with 'Versions' like in the above path>
+  //
+  // and some other variations among these lines.
+  // FoundComp counts recognized framework path components ("Headers",
+  // "PrivateHeaders", "*.framework"); two or more means the path is
+  // framework-shaped.
+  int FoundComp = 0;
+  while (I != E) {
+    if (*I == "Headers")
+      ++FoundComp;
+    if (I->endswith(".framework")) {
+      FrameworkName.append(I->begin(), I->end());
+      ++FoundComp;
+    }
+    if (*I == "PrivateHeaders") {
+      ++FoundComp;
+      IsPrivateHeader = true;
+    }
+    ++I;
+  }
+
+  return !FrameworkName.empty() && FoundComp >= 2;
+}
+
+// Emit framework-hygiene warnings for an include resolved inside a framework:
+// quoted includes of framework headers (with a fix-it to the angled form) and
+// public headers including their own framework's private headers.
+static void
+diagnoseFrameworkInclude(DiagnosticsEngine &Diags, SourceLocation IncludeLoc,
+                         StringRef Includer, StringRef IncludeFilename,
+                         const FileEntry *IncludeFE, bool isAngled = false,
+                         bool FoundByHeaderMap = false) {
+  bool IsIncluderPrivateHeader = false;
+  SmallString<128> FromFramework, ToFramework;
+  if (!isFrameworkStylePath(Includer, IsIncluderPrivateHeader, FromFramework))
+    return;
+  bool IsIncludeePrivateHeader = false;
+  bool IsIncludeeInFramework = isFrameworkStylePath(
+      IncludeFE->getName(), IsIncludeePrivateHeader, ToFramework);
+
+  if (!isAngled && !FoundByHeaderMap) {
+    SmallString<128> NewInclude("<");
+    if (IsIncludeeInFramework) {
+      NewInclude += StringRef(ToFramework).drop_back(10); // drop .framework
+      NewInclude += "/";
+    }
+    NewInclude += IncludeFilename;
+    NewInclude += ">";
+    Diags.Report(IncludeLoc, diag::warn_quoted_include_in_framework_header)
+        << IncludeFilename
+        << FixItHint::CreateReplacement(IncludeLoc, NewInclude);
+  }
+
+  // Headers in Foo.framework/Headers should not include headers
+  // from Foo.framework/PrivateHeaders, since this violates public/private
+  // API boundaries and can cause modular dependency cycles.
+  if (!IsIncluderPrivateHeader && IsIncludeeInFramework &&
+      IsIncludeePrivateHeader && FromFramework == ToFramework)
+    Diags.Report(IncludeLoc, diag::warn_framework_include_private_from_public)
+        << IncludeFilename;
+}
+
+/// LookupFile - Given a "foo" or \<foo> reference, look up the indicated file,
+/// return null on failure.  isAngled indicates whether the file reference is
+/// for system \#include's or not (i.e. using <> instead of "").
Includers, if +/// non-empty, indicates where the \#including file(s) are, in case a relative +/// search is needed. Microsoft mode will pass all \#including files. +Optional<FileEntryRef> HeaderSearch::LookupFile( + StringRef Filename, SourceLocation IncludeLoc, bool isAngled, + const DirectoryLookup *FromDir, const DirectoryLookup *&CurDir, + ArrayRef<std::pair<const FileEntry *, const DirectoryEntry *>> Includers, + SmallVectorImpl<char> *SearchPath, SmallVectorImpl<char> *RelativePath, + Module *RequestingModule, ModuleMap::KnownHeader *SuggestedModule, + bool *IsMapped, bool *IsFrameworkFound, bool SkipCache, + bool BuildSystemModule) { + if (IsMapped) + *IsMapped = false; + + if (IsFrameworkFound) + *IsFrameworkFound = false; + + if (SuggestedModule) + *SuggestedModule = ModuleMap::KnownHeader(); + + // If 'Filename' is absolute, check to see if it exists and no searching. + if (llvm::sys::path::is_absolute(Filename)) { + CurDir = nullptr; + + // If this was an #include_next "/absolute/file", fail. + if (FromDir) + return None; + + if (SearchPath) + SearchPath->clear(); + if (RelativePath) { + RelativePath->clear(); + RelativePath->append(Filename.begin(), Filename.end()); + } + // Otherwise, just return the file. + return getFileAndSuggestModule(Filename, IncludeLoc, nullptr, + /*IsSystemHeaderDir*/false, + RequestingModule, SuggestedModule); + } + + // This is the header that MSVC's header search would have found. + ModuleMap::KnownHeader MSSuggestedModule; + const FileEntry *MSFE_FE = nullptr; + StringRef MSFE_Name; + + // Unless disabled, check to see if the file is in the #includer's + // directory. This cannot be based on CurDir, because each includer could be + // a #include of a subdirectory (#include "foo/bar.h") and a subsequent + // include of "baz.h" should resolve to "whatever/foo/baz.h". + // This search is not done for <> headers. 
+ if (!Includers.empty() && !isAngled && !NoCurDirSearch) { + SmallString<1024> TmpDir; + bool First = true; + for (const auto &IncluderAndDir : Includers) { + const FileEntry *Includer = IncluderAndDir.first; + + // Concatenate the requested file onto the directory. + // FIXME: Portability. Filename concatenation should be in sys::Path. + TmpDir = IncluderAndDir.second->getName(); + TmpDir.push_back('/'); + TmpDir.append(Filename.begin(), Filename.end()); + + // FIXME: We don't cache the result of getFileInfo across the call to + // getFileAndSuggestModule, because it's a reference to an element of + // a container that could be reallocated across this call. + // + // If we have no includer, that means we're processing a #include + // from a module build. We should treat this as a system header if we're + // building a [system] module. + bool IncluderIsSystemHeader = + Includer ? getFileInfo(Includer).DirInfo != SrcMgr::C_User : + BuildSystemModule; + if (Optional<FileEntryRef> FE = getFileAndSuggestModule( + TmpDir, IncludeLoc, IncluderAndDir.second, IncluderIsSystemHeader, + RequestingModule, SuggestedModule)) { + if (!Includer) { + assert(First && "only first includer can have no file"); + return FE; + } + + // Leave CurDir unset. + // This file is a system header or C++ unfriendly if the old file is. + // + // Note that we only use one of FromHFI/ToHFI at once, due to potential + // reallocation of the underlying vector potentially making the first + // reference binding dangling. 
+ HeaderFileInfo &FromHFI = getFileInfo(Includer); + unsigned DirInfo = FromHFI.DirInfo; + bool IndexHeaderMapHeader = FromHFI.IndexHeaderMapHeader; + StringRef Framework = FromHFI.Framework; + + HeaderFileInfo &ToHFI = getFileInfo(&FE->getFileEntry()); + ToHFI.DirInfo = DirInfo; + ToHFI.IndexHeaderMapHeader = IndexHeaderMapHeader; + ToHFI.Framework = Framework; + + if (SearchPath) { + StringRef SearchPathRef(IncluderAndDir.second->getName()); + SearchPath->clear(); + SearchPath->append(SearchPathRef.begin(), SearchPathRef.end()); + } + if (RelativePath) { + RelativePath->clear(); + RelativePath->append(Filename.begin(), Filename.end()); + } + if (First) { + diagnoseFrameworkInclude(Diags, IncludeLoc, + IncluderAndDir.second->getName(), Filename, + &FE->getFileEntry()); + return FE; + } + + // Otherwise, we found the path via MSVC header search rules. If + // -Wmsvc-include is enabled, we have to keep searching to see if we + // would've found this header in -I or -isystem directories. + if (Diags.isIgnored(diag::ext_pp_include_search_ms, IncludeLoc)) { + return FE; + } else { + MSFE_FE = &FE->getFileEntry(); + MSFE_Name = FE->getName(); + if (SuggestedModule) { + MSSuggestedModule = *SuggestedModule; + *SuggestedModule = ModuleMap::KnownHeader(); + } + break; + } + } + First = false; + } + } + + Optional<FileEntryRef> MSFE(MSFE_FE ? FileEntryRef(MSFE_Name, *MSFE_FE) + : Optional<FileEntryRef>()); + + CurDir = nullptr; + + // If this is a system #include, ignore the user #include locs. + unsigned i = isAngled ? AngledDirIdx : 0; + + // If this is a #include_next request, start searching after the directory the + // file was found in. + if (FromDir) + i = FromDir-&SearchDirs[0]; + + // Cache all of the lookups performed by this method. Many headers are + // multiply included, and the "pragma once" optimization prevents them from + // being relex/pp'd, but they would still have to search through a + // (potentially huge) series of SearchDirs to find it. 
+ LookupFileCacheInfo &CacheLookup = LookupFileCache[Filename]; + + // If the entry has been previously looked up, the first value will be + // non-zero. If the value is equal to i (the start point of our search), then + // this is a matching hit. + if (!SkipCache && CacheLookup.StartIdx == i+1) { + // Skip querying potentially lots of directories for this lookup. + i = CacheLookup.HitIdx; + if (CacheLookup.MappedName) { + Filename = CacheLookup.MappedName; + if (IsMapped) + *IsMapped = true; + } + } else { + // Otherwise, this is the first query, or the previous query didn't match + // our search start. We will fill in our found location below, so prime the + // start point value. + CacheLookup.reset(/*StartIdx=*/i+1); + } + + SmallString<64> MappedName; + + // Check each directory in sequence to see if it contains this file. + for (; i != SearchDirs.size(); ++i) { + bool InUserSpecifiedSystemFramework = false; + bool IsInHeaderMap = false; + bool IsFrameworkFoundInDir = false; + Optional<FileEntryRef> File = SearchDirs[i].LookupFile( + Filename, *this, IncludeLoc, SearchPath, RelativePath, RequestingModule, + SuggestedModule, InUserSpecifiedSystemFramework, IsFrameworkFoundInDir, + IsInHeaderMap, MappedName); + if (!MappedName.empty()) { + assert(IsInHeaderMap && "MappedName should come from a header map"); + CacheLookup.MappedName = + copyString(MappedName, LookupFileCache.getAllocator()); + } + if (IsMapped) + // A filename is mapped when a header map remapped it to a relative path + // used in subsequent header search or to an absolute path pointing to an + // existing file. + *IsMapped |= (!MappedName.empty() || (IsInHeaderMap && File)); + if (IsFrameworkFound) + // Because we keep a filename remapped for subsequent search directory + // lookups, ignore IsFrameworkFoundInDir after the first remapping and not + // just for remapping in a current search directory. 
+ *IsFrameworkFound |= (IsFrameworkFoundInDir && !CacheLookup.MappedName); + if (!File) + continue; + + CurDir = &SearchDirs[i]; + + // This file is a system header or C++ unfriendly if the dir is. + HeaderFileInfo &HFI = getFileInfo(&File->getFileEntry()); + HFI.DirInfo = CurDir->getDirCharacteristic(); + + // If the directory characteristic is User but this framework was + // user-specified to be treated as a system framework, promote the + // characteristic. + if (HFI.DirInfo == SrcMgr::C_User && InUserSpecifiedSystemFramework) + HFI.DirInfo = SrcMgr::C_System; + + // If the filename matches a known system header prefix, override + // whether the file is a system header. + for (unsigned j = SystemHeaderPrefixes.size(); j; --j) { + if (Filename.startswith(SystemHeaderPrefixes[j-1].first)) { + HFI.DirInfo = SystemHeaderPrefixes[j-1].second ? SrcMgr::C_System + : SrcMgr::C_User; + break; + } + } + + // If this file is found in a header map and uses the framework style of + // includes, then this header is part of a framework we're building. + if (CurDir->isIndexHeaderMap()) { + size_t SlashPos = Filename.find('/'); + if (SlashPos != StringRef::npos) { + HFI.IndexHeaderMapHeader = 1; + HFI.Framework = getUniqueFrameworkName(StringRef(Filename.begin(), + SlashPos)); + } + } + + if (checkMSVCHeaderSearch(Diags, MSFE ? &MSFE->getFileEntry() : nullptr, + &File->getFileEntry(), IncludeLoc)) { + if (SuggestedModule) + *SuggestedModule = MSSuggestedModule; + return MSFE; + } + + bool FoundByHeaderMap = !IsMapped ? false : *IsMapped; + if (!Includers.empty()) + diagnoseFrameworkInclude( + Diags, IncludeLoc, Includers.front().second->getName(), Filename, + &File->getFileEntry(), isAngled, FoundByHeaderMap); + + // Remember this location for the next lookup we do. 
+ CacheLookup.HitIdx = i; + return File; + } + + // If we are including a file with a quoted include "foo.h" from inside + // a header in a framework that is currently being built, and we couldn't + // resolve "foo.h" any other way, change the include to <Foo/foo.h>, where + // "Foo" is the name of the framework in which the including header was found. + if (!Includers.empty() && Includers.front().first && !isAngled && + Filename.find('/') == StringRef::npos) { + HeaderFileInfo &IncludingHFI = getFileInfo(Includers.front().first); + if (IncludingHFI.IndexHeaderMapHeader) { + SmallString<128> ScratchFilename; + ScratchFilename += IncludingHFI.Framework; + ScratchFilename += '/'; + ScratchFilename += Filename; + + Optional<FileEntryRef> File = LookupFile( + ScratchFilename, IncludeLoc, /*isAngled=*/true, FromDir, CurDir, + Includers.front(), SearchPath, RelativePath, RequestingModule, + SuggestedModule, IsMapped, /*IsFrameworkFound=*/nullptr); + + if (checkMSVCHeaderSearch(Diags, MSFE ? &MSFE->getFileEntry() : nullptr, + File ? &File->getFileEntry() : nullptr, + IncludeLoc)) { + if (SuggestedModule) + *SuggestedModule = MSSuggestedModule; + return MSFE; + } + + LookupFileCacheInfo &CacheLookup = LookupFileCache[Filename]; + CacheLookup.HitIdx = LookupFileCache[ScratchFilename].HitIdx; + // FIXME: SuggestedModule. + return File; + } + } + + if (checkMSVCHeaderSearch(Diags, MSFE ? &MSFE->getFileEntry() : nullptr, + nullptr, IncludeLoc)) { + if (SuggestedModule) + *SuggestedModule = MSSuggestedModule; + return MSFE; + } + + // Otherwise, didn't find it. Remember we didn't find this. + CacheLookup.HitIdx = SearchDirs.size(); + return None; +} + +/// LookupSubframeworkHeader - Look up a subframework for the specified +/// \#include file. For example, if \#include'ing <HIToolbox/HIToolbox.h> from +/// within ".../Carbon.framework/Headers/Carbon.h", check to see if HIToolbox +/// is a subframework within Carbon.framework. 
If so, return the FileEntry
/// for the designated file, otherwise return null.
Optional<FileEntryRef> HeaderSearch::LookupSubframeworkHeader(
    StringRef Filename, const FileEntry *ContextFileEnt,
    SmallVectorImpl<char> *SearchPath, SmallVectorImpl<char> *RelativePath,
    Module *RequestingModule, ModuleMap::KnownHeader *SuggestedModule) {
  assert(ContextFileEnt && "No context file?");

  // Framework names must have a '/' in the filename.  Find it.
  // FIXME: Should we permit '\' on Windows?
  size_t SlashPos = Filename.find('/');
  if (SlashPos == StringRef::npos)
    return None;

  // Look up the base framework name of the ContextFileEnt.
  StringRef ContextName = ContextFileEnt->getName();

  // If the context info wasn't a framework, couldn't be a subframework.
  const unsigned DotFrameworkLen = 10; // strlen(".framework")
  auto FrameworkPos = ContextName.find(".framework");
  // Bounds-check before indexing: if ".framework" is the final component of
  // ContextName, ContextName[FrameworkPos + DotFrameworkLen] would read one
  // past the end of the StringRef (assertion failure in +Asserts builds); a
  // path ending in bare ".framework" is not a header *inside* a framework
  // anyway, so reject it.
  if (FrameworkPos == StringRef::npos ||
      FrameworkPos + DotFrameworkLen >= ContextName.size() ||
      (ContextName[FrameworkPos + DotFrameworkLen] != '/' &&
       ContextName[FrameworkPos + DotFrameworkLen] != '\\'))
    return None;

  // Everything up to and including the '/' after ".framework" is the
  // enclosing framework directory, e.g. ".../Carbon.framework/".
  SmallString<1024> FrameworkName(ContextName.data(), ContextName.data() +
                                                          FrameworkPos +
                                                          DotFrameworkLen + 1);

  // Append Frameworks/HIToolbox.framework/
  FrameworkName += "Frameworks/";
  FrameworkName.append(Filename.begin(), Filename.begin()+SlashPos);
  FrameworkName += ".framework/";

  // Cache keyed by the subframework name ("HIToolbox" in the example above).
  auto &CacheLookup =
      *FrameworkMap.insert(std::make_pair(Filename.substr(0, SlashPos),
                                          FrameworkCacheEntry())).first;

  // Some other location?  The cache entry was populated for a framework of
  // the same name found elsewhere; don't resolve to a different directory.
  if (CacheLookup.second.Directory &&
      CacheLookup.first().size() == FrameworkName.size() &&
      memcmp(CacheLookup.first().data(), &FrameworkName[0],
             CacheLookup.first().size()) != 0)
    return None;

  // Cache subframework.
  if (!CacheLookup.second.Directory) {
    ++NumSubFrameworkLookups;

    // If the framework dir doesn't exist, we fail.
    auto Dir = FileMgr.getDirectory(FrameworkName);
    if (!Dir)
      return None;

    // Otherwise, if it does, remember that this is the right direntry for this
    // framework.
    CacheLookup.second.Directory = *Dir;
  }

  if (RelativePath) {
    RelativePath->clear();
    RelativePath->append(Filename.begin()+SlashPos+1, Filename.end());
  }

  // Check ".../Frameworks/HIToolbox.framework/Headers/HIToolbox.h"
  SmallString<1024> HeadersFilename(FrameworkName);
  HeadersFilename += "Headers/";
  if (SearchPath) {
    SearchPath->clear();
    // Without trailing '/'.
    SearchPath->append(HeadersFilename.begin(), HeadersFilename.end()-1);
  }

  HeadersFilename.append(Filename.begin()+SlashPos+1, Filename.end());
  auto File = FileMgr.getOptionalFileRef(HeadersFilename, /*OpenFile=*/true);
  if (!File) {
    // Check ".../Frameworks/HIToolbox.framework/PrivateHeaders/HIToolbox.h"
    HeadersFilename = FrameworkName;
    HeadersFilename += "PrivateHeaders/";
    if (SearchPath) {
      SearchPath->clear();
      // Without trailing '/'.
      SearchPath->append(HeadersFilename.begin(), HeadersFilename.end()-1);
    }

    HeadersFilename.append(Filename.begin()+SlashPos+1, Filename.end());
    File = FileMgr.getOptionalFileRef(HeadersFilename, /*OpenFile=*/true);

    if (!File)
      return None;
  }

  // This file is a system header or C++ unfriendly if the old file is.
  //
  // Note that the temporary 'DirInfo' is required here, as either call to
  // getFileInfo could resize the vector and we don't want to rely on order
  // of evaluation.
  unsigned DirInfo = getFileInfo(ContextFileEnt).DirInfo;
  getFileInfo(&File->getFileEntry()).DirInfo = DirInfo;

  FrameworkName.pop_back(); // remove the trailing '/'
  if (!findUsableModuleForFrameworkHeader(&File->getFileEntry(), FrameworkName,
                                          RequestingModule, SuggestedModule,
                                          /*IsSystem*/ false))
    return None;

  return *File;
}

//===----------------------------------------------------------------------===//
// File Info Management.
+//===----------------------------------------------------------------------===// + +/// Merge the header file info provided by \p OtherHFI into the current +/// header file info (\p HFI) +static void mergeHeaderFileInfo(HeaderFileInfo &HFI, + const HeaderFileInfo &OtherHFI) { + assert(OtherHFI.External && "expected to merge external HFI"); + + HFI.isImport |= OtherHFI.isImport; + HFI.isPragmaOnce |= OtherHFI.isPragmaOnce; + HFI.isModuleHeader |= OtherHFI.isModuleHeader; + HFI.NumIncludes += OtherHFI.NumIncludes; + + if (!HFI.ControllingMacro && !HFI.ControllingMacroID) { + HFI.ControllingMacro = OtherHFI.ControllingMacro; + HFI.ControllingMacroID = OtherHFI.ControllingMacroID; + } + + HFI.DirInfo = OtherHFI.DirInfo; + HFI.External = (!HFI.IsValid || HFI.External); + HFI.IsValid = true; + HFI.IndexHeaderMapHeader = OtherHFI.IndexHeaderMapHeader; + + if (HFI.Framework.empty()) + HFI.Framework = OtherHFI.Framework; +} + +/// getFileInfo - Return the HeaderFileInfo structure for the specified +/// FileEntry. +HeaderFileInfo &HeaderSearch::getFileInfo(const FileEntry *FE) { + if (FE->getUID() >= FileInfo.size()) + FileInfo.resize(FE->getUID() + 1); + + HeaderFileInfo *HFI = &FileInfo[FE->getUID()]; + // FIXME: Use a generation count to check whether this is really up to date. + if (ExternalSource && !HFI->Resolved) { + HFI->Resolved = true; + auto ExternalHFI = ExternalSource->GetHeaderFileInfo(FE); + + HFI = &FileInfo[FE->getUID()]; + if (ExternalHFI.External) + mergeHeaderFileInfo(*HFI, ExternalHFI); + } + + HFI->IsValid = true; + // We have local information about this header file, so it's no longer + // strictly external. + HFI->External = false; + return *HFI; +} + +const HeaderFileInfo * +HeaderSearch::getExistingFileInfo(const FileEntry *FE, + bool WantExternal) const { + // If we have an external source, ensure we have the latest information. + // FIXME: Use a generation count to check whether this is really up to date. 
+ HeaderFileInfo *HFI; + if (ExternalSource) { + if (FE->getUID() >= FileInfo.size()) { + if (!WantExternal) + return nullptr; + FileInfo.resize(FE->getUID() + 1); + } + + HFI = &FileInfo[FE->getUID()]; + if (!WantExternal && (!HFI->IsValid || HFI->External)) + return nullptr; + if (!HFI->Resolved) { + HFI->Resolved = true; + auto ExternalHFI = ExternalSource->GetHeaderFileInfo(FE); + + HFI = &FileInfo[FE->getUID()]; + if (ExternalHFI.External) + mergeHeaderFileInfo(*HFI, ExternalHFI); + } + } else if (FE->getUID() >= FileInfo.size()) { + return nullptr; + } else { + HFI = &FileInfo[FE->getUID()]; + } + + if (!HFI->IsValid || (HFI->External && !WantExternal)) + return nullptr; + + return HFI; +} + +bool HeaderSearch::isFileMultipleIncludeGuarded(const FileEntry *File) { + // Check if we've ever seen this file as a header. + if (auto *HFI = getExistingFileInfo(File)) + return HFI->isPragmaOnce || HFI->isImport || HFI->ControllingMacro || + HFI->ControllingMacroID; + return false; +} + +void HeaderSearch::MarkFileModuleHeader(const FileEntry *FE, + ModuleMap::ModuleHeaderRole Role, + bool isCompilingModuleHeader) { + bool isModularHeader = !(Role & ModuleMap::TextualHeader); + + // Don't mark the file info as non-external if there's nothing to change. + if (!isCompilingModuleHeader) { + if (!isModularHeader) + return; + auto *HFI = getExistingFileInfo(FE); + if (HFI && HFI->isModuleHeader) + return; + } + + auto &HFI = getFileInfo(FE); + HFI.isModuleHeader |= isModularHeader; + HFI.isCompilingModuleHeader |= isCompilingModuleHeader; +} + +bool HeaderSearch::ShouldEnterIncludeFile(Preprocessor &PP, + const FileEntry *File, bool isImport, + bool ModulesEnabled, Module *M) { + ++NumIncluded; // Count # of attempted #includes. + + // Get information about this file. 
  HeaderFileInfo &FileInfo = getFileInfo(File);

  // FIXME: this is a workaround for the lack of proper modules-aware support
  // for #import / #pragma once
  //
  // Returns true when a header that would normally be skipped (already
  // #import'ed / included) should nevertheless be re-entered because module
  // semantics require it.
  auto TryEnterImported = [&]() -> bool {
    if (!ModulesEnabled)
      return false;
    // Ensure FileInfo bits are up to date.
    ModMap.resolveHeaderDirectives(File);
    // Modules with builtins are special; multiple modules use builtins as
    // modular headers, example:
    //
    //    module stddef { header "stddef.h" export * }
    //
    // After module map parsing, this expands to:
    //
    //    module stddef {
    //      header "/path_to_builtin_dirs/stddef.h"
    //      textual "stddef.h"
    //    }
    //
    // It's common that libc++ and system modules will both define such
    // submodules. Make sure cached results for a builtin header won't
    // prevent other builtin modules to potentially enter the builtin header.
    // Note that builtins are header guarded and the decision to actually
    // enter them is postponed to the controlling macros logic below.
    bool TryEnterHdr = false;
    if (FileInfo.isCompilingModuleHeader && FileInfo.isModuleHeader)
      TryEnterHdr = File->getDir() == ModMap.getBuiltinDir() &&
                    ModuleMap::isBuiltinHeader(
                        llvm::sys::path::filename(File->getName()));

    // Textual headers can be #imported from different modules. Since ObjC
    // headers find in the wild might rely only on #import and do not contain
    // controlling macros, be conservative and only try to enter textual headers
    // if such macro is present.
    if (!FileInfo.isModuleHeader &&
        FileInfo.getControllingMacro(ExternalLookup))
      TryEnterHdr = true;
    return TryEnterHdr;
  };

  // If this is a #import directive, check that we have not already imported
  // this header.
  if (isImport) {
    // If this has already been imported, don't import it again.
    FileInfo.isImport = true;

    // Has this already been #import'ed or #include'd?
    if (FileInfo.NumIncludes && !TryEnterImported())
      return false;
  } else {
    // Otherwise, if this is a #include of a file that was previously #import'd
    // or if this is the second #include of a #pragma once file, ignore it.
    if (FileInfo.isImport && !TryEnterImported())
      return false;
  }

  // Next, check to see if the file is wrapped with #ifndef guards.  If so, and
  // if the macro that guards it is defined, we know the #include has no effect.
  if (const IdentifierInfo *ControllingMacro
      = FileInfo.getControllingMacro(ExternalLookup)) {
    // If the header corresponds to a module, check whether the macro is already
    // defined in that module rather than checking in the current set of visible
    // modules.
    if (M ? PP.isMacroDefinedInLocalModule(ControllingMacro, M)
          : PP.isMacroDefined(ControllingMacro)) {
      ++NumMultiIncludeFileOptzn;
      return false;
    }
  }

  // Increment the number of times this file has been included.
  ++FileInfo.NumIncludes;

  return true;
}

/// Approximate the memory used by header-search data structures, in bytes.
/// Counts container capacities and string/bump allocators, not the pointed-to
/// FileEntry objects themselves.
size_t HeaderSearch::getTotalMemory() const {
  return SearchDirs.capacity()
    + llvm::capacity_in_bytes(FileInfo)
    + llvm::capacity_in_bytes(HeaderMaps)
    + LookupFileCache.getAllocator().getTotalMemory()
    + FrameworkMap.getAllocator().getTotalMemory();
}

/// Return a uniqued, permanently-owned copy of \p Framework so callers can
/// hold a stable StringRef to the framework name.
StringRef HeaderSearch::getUniqueFrameworkName(StringRef Framework) {
  return FrameworkNames.insert(Framework).first->first();
}

/// Walk from \p FileName's directory up to \p Root, trying to load a module
/// map in each directory.  Returns true if one was found; every intermediate
/// directory is then cached as covered by that module map.
bool HeaderSearch::hasModuleMap(StringRef FileName,
                                const DirectoryEntry *Root,
                                bool IsSystem) {
  if (!HSOpts->ImplicitModuleMaps)
    return false;

  SmallVector<const DirectoryEntry *, 2> FixUpDirectories;

  StringRef DirName = FileName;
  do {
    // Get the parent directory name.
    DirName = llvm::sys::path::parent_path(DirName);
    if (DirName.empty())
      return false;

    // Determine whether this directory exists.
    auto Dir = FileMgr.getDirectory(DirName);
    if (!Dir)
      return false;

    // Try to load the module map file in this directory.
    switch (loadModuleMapFile(*Dir, IsSystem,
                              llvm::sys::path::extension((*Dir)->getName()) ==
                                  ".framework")) {
    case LMM_NewlyLoaded:
    case LMM_AlreadyLoaded:
      // Success. All of the directories we stepped through inherit this module
      // map file.
      for (unsigned I = 0, N = FixUpDirectories.size(); I != N; ++I)
        DirectoryHasModuleMap[FixUpDirectories[I]] = true;
      return true;

    case LMM_NoDirectory:
    case LMM_InvalidModuleMap:
      break;
    }

    // If we hit the top of our search, we're done.
    if (*Dir == Root)
      return false;

    // Keep track of all of the directories we checked, so we can mark them as
    // having module maps if we eventually do find a module map.
    FixUpDirectories.push_back(*Dir);
  } while (true);
}

/// Find the module that owns \p File, if any, first giving the external
/// source a chance to populate header info for it.
ModuleMap::KnownHeader
HeaderSearch::findModuleForHeader(const FileEntry *File,
                                  bool AllowTextual) const {
  if (ExternalSource) {
    // Make sure the external source has handled header info about this file,
    // which includes whether the file is part of a module.
    (void)getExistingFileInfo(File);
  }
  return ModMap.findModuleForHeader(File, AllowTextual);
}

/// Look up the module owning \p File and store it in *SuggestedModule
/// (cleared to "no module" when the header is only textual).  Returns false
/// when [no_undeclared_includes] on \p RequestingModule forbids using a
/// header from a module it does not directly use.
static bool suggestModule(HeaderSearch &HS, const FileEntry *File,
                          Module *RequestingModule,
                          ModuleMap::KnownHeader *SuggestedModule) {
  ModuleMap::KnownHeader Module =
      HS.findModuleForHeader(File, /*AllowTextual*/true);
  if (SuggestedModule)
    *SuggestedModule = (Module.getRole() & ModuleMap::TextualHeader)
                           ? ModuleMap::KnownHeader()
                           : Module;

  // If this module specifies [no_undeclared_includes], we cannot find any
  // file that's in a non-dependency module.
  if (RequestingModule && Module && RequestingModule->NoUndeclaredIncludes) {
    HS.getModuleMap().resolveUses(RequestingModule, /*Complain*/false);
    if (!RequestingModule->directlyUses(Module.getModule())) {
      return false;
    }
  }

  return true;
}

/// If module lookup is needed, make sure any module map covering \p File is
/// loaded (searching up to \p Root) and then suggest the owning module.
bool HeaderSearch::findUsableModuleForHeader(
    const FileEntry *File, const DirectoryEntry *Root, Module *RequestingModule,
    ModuleMap::KnownHeader *SuggestedModule, bool IsSystemHeaderDir) {
  if (File && needModuleLookup(RequestingModule, SuggestedModule)) {
    // If there is a module that corresponds to this header, suggest it.
    // The result of hasModuleMap is deliberately ignored: it is called for
    // its side effect of loading any module map covering File.
    hasModuleMap(File->getName(), Root, IsSystemHeaderDir);
    return suggestModule(*this, File, RequestingModule, SuggestedModule);
  }
  return true;
}

/// As findUsableModuleForHeader, but for a header found inside a framework:
/// loads the top-level framework's module before suggesting.
bool HeaderSearch::findUsableModuleForFrameworkHeader(
    const FileEntry *File, StringRef FrameworkName, Module *RequestingModule,
    ModuleMap::KnownHeader *SuggestedModule, bool IsSystemFramework) {
  // If we're supposed to suggest a module, look for one now.
  if (needModuleLookup(RequestingModule, SuggestedModule)) {
    // Find the top-level framework based on this framework.
    SmallVector<std::string, 4> SubmodulePath;
    const DirectoryEntry *TopFrameworkDir
      = ::getTopFrameworkDir(FileMgr, FrameworkName, SubmodulePath);

    // Determine the name of the top-level framework.
    StringRef ModuleName = llvm::sys::path::stem(TopFrameworkDir->getName());

    // Load this framework module. If that succeeds, find the suggested module
    // for this header, if any.
    loadFrameworkModule(ModuleName, TopFrameworkDir, IsSystemFramework);

    // FIXME: This can find a module not part of ModuleName, which is
    // important so that we're consistent about whether this header
    // corresponds to a module. Possibly we should lock down framework modules
    // so that this is not possible.
    return suggestModule(*this, File, RequestingModule, SuggestedModule);
  }
  return true;
}

/// If \p File is a "module.map" or "module.modulemap", return the adjacent
/// private module map file ("module_private.map" /
/// "module.private.modulemap") when it exists, otherwise null.
static const FileEntry *getPrivateModuleMap(const FileEntry *File,
                                            FileManager &FileMgr) {
  StringRef Filename = llvm::sys::path::filename(File->getName());
  SmallString<128> PrivateFilename(File->getDir()->getName());
  if (Filename == "module.map")
    llvm::sys::path::append(PrivateFilename, "module_private.map");
  else if (Filename == "module.modulemap")
    llvm::sys::path::append(PrivateFilename, "module.private.modulemap");
  else
    return nullptr;
  if (auto File = FileMgr.getFile(PrivateFilename))
    return *File;
  return nullptr;
}

/// Parse the module map in \p File.  Returns true on error.  The "home"
/// directory relative paths are resolved against depends on
/// ModuleMapFileHomeIsCwd and, for preprocessed module maps, on
/// \p OriginalModuleMapFile.
bool HeaderSearch::loadModuleMapFile(const FileEntry *File, bool IsSystem,
                                     FileID ID, unsigned *Offset,
                                     StringRef OriginalModuleMapFile) {
  // Find the directory for the module. For frameworks, that may require going
  // up from the 'Modules' directory.
  const DirectoryEntry *Dir = nullptr;
  if (getHeaderSearchOpts().ModuleMapFileHomeIsCwd) {
    if (auto DirOrErr = FileMgr.getDirectory("."))
      Dir = *DirOrErr;
  } else {
    if (!OriginalModuleMapFile.empty()) {
      // We're building a preprocessed module map. Find or invent the directory
      // that it originally occupied.
      auto DirOrErr = FileMgr.getDirectory(
          llvm::sys::path::parent_path(OriginalModuleMapFile));
      if (DirOrErr) {
        Dir = *DirOrErr;
      } else {
        // Original directory no longer exists: register a virtual file so we
        // still get a DirectoryEntry for it.
        auto *FakeFile = FileMgr.getVirtualFile(OriginalModuleMapFile, 0, 0);
        Dir = FakeFile->getDir();
      }
    } else {
      Dir = File->getDir();
    }

    // For <Name>.framework/Modules/module.modulemap, the framework root (not
    // the Modules subdirectory) is the home directory.
    StringRef DirName(Dir->getName());
    if (llvm::sys::path::filename(DirName) == "Modules") {
      DirName = llvm::sys::path::parent_path(DirName);
      if (DirName.endswith(".framework"))
        if (auto DirOrErr = FileMgr.getDirectory(DirName))
          Dir = *DirOrErr;
      // FIXME: This assert can fail if there's a race between the above check
      // and the removal of the directory.
      assert(Dir && "parent must exist");
    }
  }

  switch (loadModuleMapFileImpl(File, IsSystem, Dir, ID, Offset)) {
  case LMM_AlreadyLoaded:
  case LMM_NewlyLoaded:
    return false;
  case LMM_NoDirectory:
  case LMM_InvalidModuleMap:
    return true;
  }
  llvm_unreachable("Unknown load module map result");
}

/// Parse \p File (and any adjacent private module map), caching the result so
/// each module map file is parsed at most once.
HeaderSearch::LoadModuleMapResult
HeaderSearch::loadModuleMapFileImpl(const FileEntry *File, bool IsSystem,
                                    const DirectoryEntry *Dir, FileID ID,
                                    unsigned *Offset) {
  assert(File && "expected FileEntry");

  // Check whether we've already loaded this module map, and mark it as being
  // loaded in case we recursively try to load it from itself.
  auto AddResult = LoadedModuleMaps.insert(std::make_pair(File, true));
  if (!AddResult.second)
    return AddResult.first->second ? LMM_AlreadyLoaded : LMM_InvalidModuleMap;

  if (ModMap.parseModuleMapFile(File, IsSystem, Dir, ID, Offset)) {
    LoadedModuleMaps[File] = false;
    return LMM_InvalidModuleMap;
  }

  // Try to load a corresponding private module map.
  if (const FileEntry *PMMFile = getPrivateModuleMap(File, FileMgr)) {
    if (ModMap.parseModuleMapFile(PMMFile, IsSystem, Dir)) {
      LoadedModuleMaps[File] = false;
      return LMM_InvalidModuleMap;
    }
  }

  // This directory has a module map.
  return LMM_NewlyLoaded;
}

/// Find the module map file for \p Dir, if any.
const FileEntry *
HeaderSearch::lookupModuleMapFile(const DirectoryEntry *Dir, bool IsFramework) {
  if (!HSOpts->ImplicitModuleMaps)
    return nullptr;
  // For frameworks, the preferred spelling is Modules/module.modulemap, but
  // module.map at the framework root is also accepted.
  SmallString<128> ModuleMapFileName(Dir->getName());
  if (IsFramework)
    llvm::sys::path::append(ModuleMapFileName, "Modules");
  llvm::sys::path::append(ModuleMapFileName, "module.modulemap");
  if (auto F = FileMgr.getFile(ModuleMapFileName))
    return *F;

  // Continue to allow module.map (the legacy spelling, at the directory /
  // framework root rather than under Modules/).
  ModuleMapFileName = Dir->getName();
  llvm::sys::path::append(ModuleMapFileName, "module.map");
  if (auto F = FileMgr.getFile(ModuleMapFileName))
    return *F;
  return nullptr;
}

/// Return the module named \p Name for the framework directory \p Dir,
/// loading its module map — or inferring one from the framework layout when
/// the map is missing/invalid and implicit module maps are enabled.  Returns
/// null if no module can be found or loaded.
Module *HeaderSearch::loadFrameworkModule(StringRef Name,
                                          const DirectoryEntry *Dir,
                                          bool IsSystem) {
  if (Module *Module = ModMap.findModule(Name))
    return Module;

  // Try to load a module map file.
  switch (loadModuleMapFile(Dir, IsSystem, /*IsFramework*/true)) {
  case LMM_InvalidModuleMap:
    // Try to infer a module map from the framework directory.
    if (HSOpts->ImplicitModuleMaps)
      ModMap.inferFrameworkModule(Dir, IsSystem, /*Parent=*/nullptr);
    break;

  case LMM_AlreadyLoaded:
  case LMM_NoDirectory:
    return nullptr;

  case LMM_NewlyLoaded:
    break;
  }

  return ModMap.findModule(Name);
}

/// Convenience overload: resolve \p DirName to a DirectoryEntry and load its
/// module map.
HeaderSearch::LoadModuleMapResult
HeaderSearch::loadModuleMapFile(StringRef DirName, bool IsSystem,
                                bool IsFramework) {
  if (auto Dir = FileMgr.getDirectory(DirName))
    return loadModuleMapFile(*Dir, IsSystem, IsFramework);

  return LMM_NoDirectory;
}

/// Load the module map for directory \p Dir, consulting and updating the
/// per-directory cache so each directory is probed at most once.
HeaderSearch::LoadModuleMapResult
HeaderSearch::loadModuleMapFile(const DirectoryEntry *Dir, bool IsSystem,
                                bool IsFramework) {
  auto KnownDir = DirectoryHasModuleMap.find(Dir);
  if (KnownDir != DirectoryHasModuleMap.end())
    return KnownDir->second ? LMM_AlreadyLoaded : LMM_InvalidModuleMap;

  if (const FileEntry *ModuleMapFile = lookupModuleMapFile(Dir, IsFramework)) {
    LoadModuleMapResult Result =
        loadModuleMapFileImpl(ModuleMapFile, IsSystem, Dir);
    // Add Dir explicitly in case ModuleMapFile is in a subdirectory.
    // E.g. Foo.framework/Modules/module.modulemap
    //      ^Dir                  ^ModuleMapFile
    if (Result == LMM_NewlyLoaded)
      DirectoryHasModuleMap[Dir] = true;
    else if (Result == LMM_InvalidModuleMap)
      DirectoryHasModuleMap[Dir] = false;
    return Result;
  }
  return LMM_InvalidModuleMap;
}

/// Populate \p Modules with every module known to the module map, after
/// eagerly loading module maps from all header search directories (when
/// implicit module maps are enabled).
void HeaderSearch::collectAllModules(SmallVectorImpl<Module *> &Modules) {
  Modules.clear();

  if (HSOpts->ImplicitModuleMaps) {
    // Load module maps for each of the header search directories.
    for (unsigned Idx = 0, N = SearchDirs.size(); Idx != N; ++Idx) {
      bool IsSystem = SearchDirs[Idx].isSystemHeaderDirectory();
      if (SearchDirs[Idx].isFramework()) {
        std::error_code EC;
        SmallString<128> DirNative;
        llvm::sys::path::native(SearchDirs[Idx].getFrameworkDir()->getName(),
                                DirNative);

        // Search each of the ".framework" directories to load them as modules.
        llvm::vfs::FileSystem &FS = FileMgr.getVirtualFileSystem();
        for (llvm::vfs::directory_iterator Dir = FS.dir_begin(DirNative, EC),
                                           DirEnd;
             Dir != DirEnd && !EC; Dir.increment(EC)) {
          if (llvm::sys::path::extension(Dir->path()) != ".framework")
            continue;

          auto FrameworkDir =
              FileMgr.getDirectory(Dir->path());
          if (!FrameworkDir)
            continue;

          // Load this framework module.
          loadFrameworkModule(llvm::sys::path::stem(Dir->path()), *FrameworkDir,
                              IsSystem);
        }
        continue;
      }

      // FIXME: Deal with header maps.
      if (SearchDirs[Idx].isHeaderMap())
        continue;

      // Try to load a module map file for the search directory.
      loadModuleMapFile(SearchDirs[Idx].getDir(), IsSystem,
                        /*IsFramework*/ false);

      // Try to load module map files for immediate subdirectories of this
      // search directory.
      loadSubdirectoryModuleMaps(SearchDirs[Idx]);
    }
  }

  // Populate the list of modules.
  for (ModuleMap::module_iterator M = ModMap.module_begin(),
                                  MEnd = ModMap.module_end();
       M != MEnd; ++M) {
    Modules.push_back(M->getValue());
  }
}

/// Eagerly load module maps for every normal (non-framework, non-headermap)
/// header search directory.
void HeaderSearch::loadTopLevelSystemModules() {
  if (!HSOpts->ImplicitModuleMaps)
    return;

  // Load module maps for each of the header search directories.
  for (unsigned Idx = 0, N = SearchDirs.size(); Idx != N; ++Idx) {
    // We only care about normal header directories.
    if (!SearchDirs[Idx].isNormalDir()) {
      continue;
    }

    // Try to load a module map file for the search directory.
    loadModuleMapFile(SearchDirs[Idx].getDir(),
                      SearchDirs[Idx].isSystemHeaderDirectory(),
                      SearchDirs[Idx].isFramework());
  }
}

/// Load module maps from the immediate subdirectories of \p SearchDir,
/// doing the directory walk at most once per search directory.
void HeaderSearch::loadSubdirectoryModuleMaps(DirectoryLookup &SearchDir) {
  assert(HSOpts->ImplicitModuleMaps &&
         "Should not be loading subdirectory module maps");

  if (SearchDir.haveSearchedAllModuleMaps())
    return;

  std::error_code EC;
  SmallString<128> Dir = SearchDir.getDir()->getName();
  FileMgr.makeAbsolutePath(Dir);
  SmallString<128> DirNative;
  llvm::sys::path::native(Dir, DirNative);
  llvm::vfs::FileSystem &FS = FileMgr.getVirtualFileSystem();
  // Note: the inner 'Dir' iterator deliberately shadows the SmallString above.
  for (llvm::vfs::directory_iterator Dir = FS.dir_begin(DirNative, EC), DirEnd;
       Dir != DirEnd && !EC; Dir.increment(EC)) {
    // Only recurse into subdirectories of the same flavor as the search dir:
    // frameworks under a framework dir, plain dirs under a plain dir.
    bool IsFramework = llvm::sys::path::extension(Dir->path()) == ".framework";
    if (IsFramework == SearchDir.isFramework())
      loadModuleMapFile(Dir->path(), SearchDir.isSystemHeaderDirectory(),
                        SearchDir.isFramework());
  }

  SearchDir.setSearchedAllModuleMaps(true);
}

/// Suggest a short, include-style spelling of \p File for diagnostics, using
/// the path cached in the FileEntry.
std::string HeaderSearch::suggestPathToFileForDiagnostics(
    const FileEntry *File, llvm::StringRef MainFile, bool *IsSystem) {
  // FIXME: We assume that the path name currently cached in the FileEntry is
  // the most appropriate one for this analysis (and that it's spelled the
  // same way as the corresponding header search path).
  return suggestPathToFileForDiagnostics(File->getName(), /*WorkingDir=*/"",
                                         MainFile, IsSystem);
}

/// Suggest a short spelling of \p File for diagnostics by stripping the
/// longest header-search-directory prefix (falling back to the main file's
/// directory), and report via *IsSystem whether that prefix was a system
/// search directory.
std::string HeaderSearch::suggestPathToFileForDiagnostics(
    llvm::StringRef File, llvm::StringRef WorkingDir, llvm::StringRef MainFile,
    bool *IsSystem) {
  using namespace llvm::sys;

  unsigned BestPrefixLength = 0;
  // Checks whether Dir and File shares a common prefix, if they do and that's
  // the longest prefix we've seen so for it returns true and updates the
  // BestPrefixLength accordingly.
  auto CheckDir = [&](llvm::StringRef Dir) -> bool {
    llvm::SmallString<32> DirPath(Dir.begin(), Dir.end());
    if (!WorkingDir.empty() && !path::is_absolute(Dir))
      fs::make_absolute(WorkingDir, DirPath);
    path::remove_dots(DirPath, /*remove_dot_dot=*/true);
    Dir = DirPath;
    // Component-wise comparison, skipping '.' components and treating all
    // path separators as equal.
    for (auto NI = path::begin(File), NE = path::end(File),
              DI = path::begin(Dir), DE = path::end(Dir);
         /*termination condition in loop*/; ++NI, ++DI) {
      // '.' components in File are ignored.
      while (NI != NE && *NI == ".")
        ++NI;
      if (NI == NE)
        break;

      // '.' components in Dir are ignored.
      while (DI != DE && *DI == ".")
        ++DI;
      if (DI == DE) {
        // Dir is a prefix of File, up to '.' components and choice of path
        // separators.
        unsigned PrefixLength = NI - path::begin(File);
        if (PrefixLength > BestPrefixLength) {
          BestPrefixLength = PrefixLength;
          return true;
        }
        break;
      }

      // Consider all path separators equal.
      if (NI->size() == 1 && DI->size() == 1 &&
          path::is_separator(NI->front()) && path::is_separator(DI->front()))
        continue;

      if (*NI != *DI)
        break;
    }
    return false;
  };

  for (unsigned I = 0; I != SearchDirs.size(); ++I) {
    // FIXME: Support this search within frameworks and header maps.
    if (!SearchDirs[I].isNormalDir())
      continue;

    StringRef Dir = SearchDirs[I].getDir()->getName();
    if (CheckDir(Dir) && IsSystem)
      *IsSystem = BestPrefixLength ?
I >= SystemDirIdx : false; + } + + // Try to shorten include path using TUs directory, if we couldn't find any + // suitable prefix in include search paths. + if (!BestPrefixLength && CheckDir(path::parent_path(MainFile)) && IsSystem) + *IsSystem = false; + + + return path::convert_to_slash(File.drop_front(BestPrefixLength)); +} diff --git a/clang/lib/Lex/Lexer.cpp b/clang/lib/Lex/Lexer.cpp new file mode 100644 index 000000000000..17f5ab1e035d --- /dev/null +++ b/clang/lib/Lex/Lexer.cpp @@ -0,0 +1,3951 @@ +//===- Lexer.cpp - C Language Family Lexer --------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the Lexer and Token interfaces. +// +//===----------------------------------------------------------------------===// + +#include "clang/Lex/Lexer.h" +#include "UnicodeCharSets.h" +#include "clang/Basic/CharInfo.h" +#include "clang/Basic/IdentifierTable.h" +#include "clang/Basic/LangOptions.h" +#include "clang/Basic/SourceLocation.h" +#include "clang/Basic/SourceManager.h" +#include "clang/Basic/TokenKinds.h" +#include "clang/Lex/LexDiagnostic.h" +#include "clang/Lex/LiteralSupport.h" +#include "clang/Lex/MultipleIncludeOpt.h" +#include "clang/Lex/Preprocessor.h" +#include "clang/Lex/PreprocessorOptions.h" +#include "clang/Lex/Token.h" +#include "clang/Basic/Diagnostic.h" +#include "clang/Basic/LLVM.h" +#include "clang/Basic/TokenKinds.h" +#include "llvm/ADT/None.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/ConvertUTF.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/MemoryBuffer.h" +#include 
"llvm/Support/NativeFormatting.h" +#include "llvm/Support/UnicodeCharRanges.h" +#include <algorithm> +#include <cassert> +#include <cstddef> +#include <cstdint> +#include <cstring> +#include <string> +#include <tuple> +#include <utility> + +using namespace clang; + +//===----------------------------------------------------------------------===// +// Token Class Implementation +//===----------------------------------------------------------------------===// + +/// isObjCAtKeyword - Return true if we have an ObjC keyword identifier. +bool Token::isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const { + if (isAnnotation()) + return false; + if (IdentifierInfo *II = getIdentifierInfo()) + return II->getObjCKeywordID() == objcKey; + return false; +} + +/// getObjCKeywordID - Return the ObjC keyword kind. +tok::ObjCKeywordKind Token::getObjCKeywordID() const { + if (isAnnotation()) + return tok::objc_not_keyword; + IdentifierInfo *specId = getIdentifierInfo(); + return specId ? specId->getObjCKeywordID() : tok::objc_not_keyword; +} + +//===----------------------------------------------------------------------===// +// Lexer Class Implementation +//===----------------------------------------------------------------------===// + +void Lexer::anchor() {} + +void Lexer::InitLexer(const char *BufStart, const char *BufPtr, + const char *BufEnd) { + BufferStart = BufStart; + BufferPtr = BufPtr; + BufferEnd = BufEnd; + + assert(BufEnd[0] == 0 && + "We assume that the input buffer has a null character at the end" + " to simplify lexing!"); + + // Check whether we have a BOM in the beginning of the buffer. If yes - act + // accordingly. Right now we support only UTF-8 with and without BOM, so, just + // skip the UTF-8 BOM if it's present. + if (BufferStart == BufferPtr) { + // Determine the size of the BOM. 
+ StringRef Buf(BufferStart, BufferEnd - BufferStart); + size_t BOMLength = llvm::StringSwitch<size_t>(Buf) + .StartsWith("\xEF\xBB\xBF", 3) // UTF-8 BOM + .Default(0); + + // Skip the BOM. + BufferPtr += BOMLength; + } + + Is_PragmaLexer = false; + CurrentConflictMarkerState = CMK_None; + + // Start of the file is a start of line. + IsAtStartOfLine = true; + IsAtPhysicalStartOfLine = true; + + HasLeadingSpace = false; + HasLeadingEmptyMacro = false; + + // We are not after parsing a #. + ParsingPreprocessorDirective = false; + + // We are not after parsing #include. + ParsingFilename = false; + + // We are not in raw mode. Raw mode disables diagnostics and interpretation + // of tokens (e.g. identifiers, thus disabling macro expansion). It is used + // to quickly lex the tokens of the buffer, e.g. when handling a "#if 0" block + // or otherwise skipping over tokens. + LexingRawMode = false; + + // Default to not keeping comments. + ExtendedTokenMode = 0; +} + +/// Lexer constructor - Create a new lexer object for the specified buffer +/// with the specified preprocessor managing the lexing process. This lexer +/// assumes that the associated file buffer and Preprocessor objects will +/// outlive it, so it doesn't take ownership of either of them. +Lexer::Lexer(FileID FID, const llvm::MemoryBuffer *InputFile, Preprocessor &PP) + : PreprocessorLexer(&PP, FID), + FileLoc(PP.getSourceManager().getLocForStartOfFile(FID)), + LangOpts(PP.getLangOpts()) { + InitLexer(InputFile->getBufferStart(), InputFile->getBufferStart(), + InputFile->getBufferEnd()); + + resetExtendedTokenMode(); +} + +/// Lexer constructor - Create a new raw lexer object. This object is only +/// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the text +/// range will outlive it, so it doesn't take ownership of it. 
+Lexer::Lexer(SourceLocation fileloc, const LangOptions &langOpts,
+             const char *BufStart, const char *BufPtr, const char *BufEnd)
+    : FileLoc(fileloc), LangOpts(langOpts) {
+  InitLexer(BufStart, BufPtr, BufEnd);
+
+  // We *are* in raw mode.
+  // (Raw mode disables diagnostics and interpretation of tokens; see the
+  // LexingRawMode notes in InitLexer.)
+  LexingRawMode = true;
+}
+
+/// Lexer constructor - Create a new raw lexer object. This object is only
+/// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the text
+/// range will outlive it, so it doesn't take ownership of it.
+/// Convenience overload: derives the buffer bounds and the start-of-file
+/// location for \p FID, then delegates to the pointer-based raw constructor
+/// above.
+Lexer::Lexer(FileID FID, const llvm::MemoryBuffer *FromFile,
+             const SourceManager &SM, const LangOptions &langOpts)
+    : Lexer(SM.getLocForStartOfFile(FID), langOpts, FromFile->getBufferStart(),
+            FromFile->getBufferStart(), FromFile->getBufferEnd()) {}
+
+/// Re-derive whitespace/comment retention from the language options and the
+/// owning preprocessor. Must only be called on lexers that have a
+/// preprocessor (PP non-null).
+void Lexer::resetExtendedTokenMode() {
+  assert(PP && "Cannot reset token mode without a preprocessor");
+  // Traditional CPP mode keeps all whitespace; otherwise mirror the
+  // preprocessor's current comment-retention setting.
+  if (LangOpts.TraditionalCPP)
+    SetKeepWhitespaceMode(true);
+  else
+    SetCommentRetentionState(PP->getCommentRetentionState());
+}
+
+/// Create_PragmaLexer: Lexer constructor - Create a new lexer object for
+/// _Pragma expansion. This has a variety of magic semantics that this method
+/// sets up. It returns a new'd Lexer that must be delete'd when done.
+///
+/// On entrance to this routine, TokStartLoc is a macro location which has a
+/// spelling loc that indicates the bytes to be lexed for the token and an
+/// expansion location that indicates where all lexed tokens should be
+/// "expanded from".
+///
+/// TODO: It would really be nice to make _Pragma just be a wrapper around a
+/// normal lexer that remaps tokens as they fly by. This would require making
+/// Preprocessor::Lex virtual. Given that, we could just dump in a magic lexer
+/// interface that could handle this stuff. This would pull GetMappedTokenLoc
+/// out of the critical path of the lexer!
+/// +Lexer *Lexer::Create_PragmaLexer(SourceLocation SpellingLoc, + SourceLocation ExpansionLocStart, + SourceLocation ExpansionLocEnd, + unsigned TokLen, Preprocessor &PP) { + SourceManager &SM = PP.getSourceManager(); + + // Create the lexer as if we were going to lex the file normally. + FileID SpellingFID = SM.getFileID(SpellingLoc); + const llvm::MemoryBuffer *InputFile = SM.getBuffer(SpellingFID); + Lexer *L = new Lexer(SpellingFID, InputFile, PP); + + // Now that the lexer is created, change the start/end locations so that we + // just lex the subsection of the file that we want. This is lexing from a + // scratch buffer. + const char *StrData = SM.getCharacterData(SpellingLoc); + + L->BufferPtr = StrData; + L->BufferEnd = StrData+TokLen; + assert(L->BufferEnd[0] == 0 && "Buffer is not nul terminated!"); + + // Set the SourceLocation with the remapping information. This ensures that + // GetMappedTokenLoc will remap the tokens as they are lexed. + L->FileLoc = SM.createExpansionLoc(SM.getLocForStartOfFile(SpellingFID), + ExpansionLocStart, + ExpansionLocEnd, TokLen); + + // Ensure that the lexer thinks it is inside a directive, so that end \n will + // return an EOD token. + L->ParsingPreprocessorDirective = true; + + // This lexer really is for _Pragma. + L->Is_PragmaLexer = true; + return L; +} + +bool Lexer::skipOver(unsigned NumBytes) { + IsAtPhysicalStartOfLine = true; + IsAtStartOfLine = true; + if ((BufferPtr + NumBytes) > BufferEnd) + return true; + BufferPtr += NumBytes; + return false; +} + +template <typename T> static void StringifyImpl(T &Str, char Quote) { + typename T::size_type i = 0, e = Str.size(); + while (i < e) { + if (Str[i] == '\\' || Str[i] == Quote) { + Str.insert(Str.begin() + i, '\\'); + i += 2; + ++e; + } else if (Str[i] == '\n' || Str[i] == '\r') { + // Replace '\r\n' and '\n\r' to '\\' followed by 'n'. 
+ if ((i < e - 1) && (Str[i + 1] == '\n' || Str[i + 1] == '\r') && + Str[i] != Str[i + 1]) { + Str[i] = '\\'; + Str[i + 1] = 'n'; + } else { + // Replace '\n' and '\r' to '\\' followed by 'n'. + Str[i] = '\\'; + Str.insert(Str.begin() + i + 1, 'n'); + ++e; + } + i += 2; + } else + ++i; + } +} + +std::string Lexer::Stringify(StringRef Str, bool Charify) { + std::string Result = Str; + char Quote = Charify ? '\'' : '"'; + StringifyImpl(Result, Quote); + return Result; +} + +void Lexer::Stringify(SmallVectorImpl<char> &Str) { StringifyImpl(Str, '"'); } + +//===----------------------------------------------------------------------===// +// Token Spelling +//===----------------------------------------------------------------------===// + +/// Slow case of getSpelling. Extract the characters comprising the +/// spelling of this token from the provided input buffer. +static size_t getSpellingSlow(const Token &Tok, const char *BufPtr, + const LangOptions &LangOpts, char *Spelling) { + assert(Tok.needsCleaning() && "getSpellingSlow called on simple token"); + + size_t Length = 0; + const char *BufEnd = BufPtr + Tok.getLength(); + + if (tok::isStringLiteral(Tok.getKind())) { + // Munch the encoding-prefix and opening double-quote. + while (BufPtr < BufEnd) { + unsigned Size; + Spelling[Length++] = Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts); + BufPtr += Size; + + if (Spelling[Length - 1] == '"') + break; + } + + // Raw string literals need special handling; trigraph expansion and line + // splicing do not occur within their d-char-sequence nor within their + // r-char-sequence. + if (Length >= 2 && + Spelling[Length - 2] == 'R' && Spelling[Length - 1] == '"') { + // Search backwards from the end of the token to find the matching closing + // quote. + const char *RawEnd = BufEnd; + do --RawEnd; while (*RawEnd != '"'); + size_t RawLength = RawEnd - BufPtr + 1; + + // Everything between the quotes is included verbatim in the spelling. 
+ memcpy(Spelling + Length, BufPtr, RawLength); + Length += RawLength; + BufPtr += RawLength; + + // The rest of the token is lexed normally. + } + } + + while (BufPtr < BufEnd) { + unsigned Size; + Spelling[Length++] = Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts); + BufPtr += Size; + } + + assert(Length < Tok.getLength() && + "NeedsCleaning flag set on token that didn't need cleaning!"); + return Length; +} + +/// getSpelling() - Return the 'spelling' of this token. The spelling of a +/// token are the characters used to represent the token in the source file +/// after trigraph expansion and escaped-newline folding. In particular, this +/// wants to get the true, uncanonicalized, spelling of things like digraphs +/// UCNs, etc. +StringRef Lexer::getSpelling(SourceLocation loc, + SmallVectorImpl<char> &buffer, + const SourceManager &SM, + const LangOptions &options, + bool *invalid) { + // Break down the source location. + std::pair<FileID, unsigned> locInfo = SM.getDecomposedLoc(loc); + + // Try to the load the file buffer. + bool invalidTemp = false; + StringRef file = SM.getBufferData(locInfo.first, &invalidTemp); + if (invalidTemp) { + if (invalid) *invalid = true; + return {}; + } + + const char *tokenBegin = file.data() + locInfo.second; + + // Lex from the start of the given location. + Lexer lexer(SM.getLocForStartOfFile(locInfo.first), options, + file.begin(), tokenBegin, file.end()); + Token token; + lexer.LexFromRawLexer(token); + + unsigned length = token.getLength(); + + // Common case: no need for cleaning. + if (!token.needsCleaning()) + return StringRef(tokenBegin, length); + + // Hard case, we need to relex the characters into the string. + buffer.resize(length); + buffer.resize(getSpellingSlow(token, tokenBegin, options, buffer.data())); + return StringRef(buffer.data(), buffer.size()); +} + +/// getSpelling() - Return the 'spelling' of this token. 
The spelling of a +/// token are the characters used to represent the token in the source file +/// after trigraph expansion and escaped-newline folding. In particular, this +/// wants to get the true, uncanonicalized, spelling of things like digraphs +/// UCNs, etc. +std::string Lexer::getSpelling(const Token &Tok, const SourceManager &SourceMgr, + const LangOptions &LangOpts, bool *Invalid) { + assert((int)Tok.getLength() >= 0 && "Token character range is bogus!"); + + bool CharDataInvalid = false; + const char *TokStart = SourceMgr.getCharacterData(Tok.getLocation(), + &CharDataInvalid); + if (Invalid) + *Invalid = CharDataInvalid; + if (CharDataInvalid) + return {}; + + // If this token contains nothing interesting, return it directly. + if (!Tok.needsCleaning()) + return std::string(TokStart, TokStart + Tok.getLength()); + + std::string Result; + Result.resize(Tok.getLength()); + Result.resize(getSpellingSlow(Tok, TokStart, LangOpts, &*Result.begin())); + return Result; +} + +/// getSpelling - This method is used to get the spelling of a token into a +/// preallocated buffer, instead of as an std::string. The caller is required +/// to allocate enough space for the token, which is guaranteed to be at least +/// Tok.getLength() bytes long. The actual length of the token is returned. +/// +/// Note that this method may do two possible things: it may either fill in +/// the buffer specified with characters, or it may *change the input pointer* +/// to point to a constant buffer with the data already in it (avoiding a +/// copy). The caller is not allowed to modify the returned buffer pointer +/// if an internal buffer is returned. +unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer, + const SourceManager &SourceMgr, + const LangOptions &LangOpts, bool *Invalid) { + assert((int)Tok.getLength() >= 0 && "Token character range is bogus!"); + + const char *TokStart = nullptr; + // NOTE: this has to be checked *before* testing for an IdentifierInfo. 
+ if (Tok.is(tok::raw_identifier)) + TokStart = Tok.getRawIdentifier().data(); + else if (!Tok.hasUCN()) { + if (const IdentifierInfo *II = Tok.getIdentifierInfo()) { + // Just return the string from the identifier table, which is very quick. + Buffer = II->getNameStart(); + return II->getLength(); + } + } + + // NOTE: this can be checked even after testing for an IdentifierInfo. + if (Tok.isLiteral()) + TokStart = Tok.getLiteralData(); + + if (!TokStart) { + // Compute the start of the token in the input lexer buffer. + bool CharDataInvalid = false; + TokStart = SourceMgr.getCharacterData(Tok.getLocation(), &CharDataInvalid); + if (Invalid) + *Invalid = CharDataInvalid; + if (CharDataInvalid) { + Buffer = ""; + return 0; + } + } + + // If this token contains nothing interesting, return it directly. + if (!Tok.needsCleaning()) { + Buffer = TokStart; + return Tok.getLength(); + } + + // Otherwise, hard case, relex the characters into the string. + return getSpellingSlow(Tok, TokStart, LangOpts, const_cast<char*>(Buffer)); +} + +/// MeasureTokenLength - Relex the token at the specified location and return +/// its length in bytes in the input file. If the token needs cleaning (e.g. +/// includes a trigraph or an escaped newline) then this count includes bytes +/// that are part of that. +unsigned Lexer::MeasureTokenLength(SourceLocation Loc, + const SourceManager &SM, + const LangOptions &LangOpts) { + Token TheTok; + if (getRawToken(Loc, TheTok, SM, LangOpts)) + return 0; + return TheTok.getLength(); +} + +/// Relex the token at the specified location. +/// \returns true if there was a failure, false on success. +bool Lexer::getRawToken(SourceLocation Loc, Token &Result, + const SourceManager &SM, + const LangOptions &LangOpts, + bool IgnoreWhiteSpace) { + // TODO: this could be special cased for common tokens like identifiers, ')', + // etc to make this faster, if it mattered. Just look at StrData[0] to handle + // all obviously single-char tokens. 
This could use + // Lexer::isObviouslySimpleCharacter for example to handle identifiers or + // something. + + // If this comes from a macro expansion, we really do want the macro name, not + // the token this macro expanded to. + Loc = SM.getExpansionLoc(Loc); + std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc); + bool Invalid = false; + StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid); + if (Invalid) + return true; + + const char *StrData = Buffer.data()+LocInfo.second; + + if (!IgnoreWhiteSpace && isWhitespace(StrData[0])) + return true; + + // Create a lexer starting at the beginning of this token. + Lexer TheLexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, + Buffer.begin(), StrData, Buffer.end()); + TheLexer.SetCommentRetentionState(true); + TheLexer.LexFromRawLexer(Result); + return false; +} + +/// Returns the pointer that points to the beginning of line that contains +/// the given offset, or null if the offset if invalid. +static const char *findBeginningOfLine(StringRef Buffer, unsigned Offset) { + const char *BufStart = Buffer.data(); + if (Offset >= Buffer.size()) + return nullptr; + + const char *LexStart = BufStart + Offset; + for (; LexStart != BufStart; --LexStart) { + if (isVerticalWhitespace(LexStart[0]) && + !Lexer::isNewLineEscaped(BufStart, LexStart)) { + // LexStart should point at first character of logical line. + ++LexStart; + break; + } + } + return LexStart; +} + +static SourceLocation getBeginningOfFileToken(SourceLocation Loc, + const SourceManager &SM, + const LangOptions &LangOpts) { + assert(Loc.isFileID()); + std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc); + if (LocInfo.first.isInvalid()) + return Loc; + + bool Invalid = false; + StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid); + if (Invalid) + return Loc; + + // Back up from the current location until we hit the beginning of a line + // (or the buffer). We'll relex from that point. 
+ const char *StrData = Buffer.data() + LocInfo.second; + const char *LexStart = findBeginningOfLine(Buffer, LocInfo.second); + if (!LexStart || LexStart == StrData) + return Loc; + + // Create a lexer starting at the beginning of this token. + SourceLocation LexerStartLoc = Loc.getLocWithOffset(-LocInfo.second); + Lexer TheLexer(LexerStartLoc, LangOpts, Buffer.data(), LexStart, + Buffer.end()); + TheLexer.SetCommentRetentionState(true); + + // Lex tokens until we find the token that contains the source location. + Token TheTok; + do { + TheLexer.LexFromRawLexer(TheTok); + + if (TheLexer.getBufferLocation() > StrData) { + // Lexing this token has taken the lexer past the source location we're + // looking for. If the current token encompasses our source location, + // return the beginning of that token. + if (TheLexer.getBufferLocation() - TheTok.getLength() <= StrData) + return TheTok.getLocation(); + + // We ended up skipping over the source location entirely, which means + // that it points into whitespace. We're done here. + break; + } + } while (TheTok.getKind() != tok::eof); + + // We've passed our source location; just return the original source location. 
+ return Loc; +} + +SourceLocation Lexer::GetBeginningOfToken(SourceLocation Loc, + const SourceManager &SM, + const LangOptions &LangOpts) { + if (Loc.isFileID()) + return getBeginningOfFileToken(Loc, SM, LangOpts); + + if (!SM.isMacroArgExpansion(Loc)) + return Loc; + + SourceLocation FileLoc = SM.getSpellingLoc(Loc); + SourceLocation BeginFileLoc = getBeginningOfFileToken(FileLoc, SM, LangOpts); + std::pair<FileID, unsigned> FileLocInfo = SM.getDecomposedLoc(FileLoc); + std::pair<FileID, unsigned> BeginFileLocInfo = + SM.getDecomposedLoc(BeginFileLoc); + assert(FileLocInfo.first == BeginFileLocInfo.first && + FileLocInfo.second >= BeginFileLocInfo.second); + return Loc.getLocWithOffset(BeginFileLocInfo.second - FileLocInfo.second); +} + +namespace { + +enum PreambleDirectiveKind { + PDK_Skipped, + PDK_Unknown +}; + +} // namespace + +PreambleBounds Lexer::ComputePreamble(StringRef Buffer, + const LangOptions &LangOpts, + unsigned MaxLines) { + // Create a lexer starting at the beginning of the file. Note that we use a + // "fake" file source location at offset 1 so that the lexer will track our + // position within the file. + const unsigned StartOffset = 1; + SourceLocation FileLoc = SourceLocation::getFromRawEncoding(StartOffset); + Lexer TheLexer(FileLoc, LangOpts, Buffer.begin(), Buffer.begin(), + Buffer.end()); + TheLexer.SetCommentRetentionState(true); + + bool InPreprocessorDirective = false; + Token TheTok; + SourceLocation ActiveCommentLoc; + + unsigned MaxLineOffset = 0; + if (MaxLines) { + const char *CurPtr = Buffer.begin(); + unsigned CurLine = 0; + while (CurPtr != Buffer.end()) { + char ch = *CurPtr++; + if (ch == '\n') { + ++CurLine; + if (CurLine == MaxLines) + break; + } + } + if (CurPtr != Buffer.end()) + MaxLineOffset = CurPtr - Buffer.begin(); + } + + do { + TheLexer.LexFromRawLexer(TheTok); + + if (InPreprocessorDirective) { + // If we've hit the end of the file, we're done. 
+ if (TheTok.getKind() == tok::eof) { + break; + } + + // If we haven't hit the end of the preprocessor directive, skip this + // token. + if (!TheTok.isAtStartOfLine()) + continue; + + // We've passed the end of the preprocessor directive, and will look + // at this token again below. + InPreprocessorDirective = false; + } + + // Keep track of the # of lines in the preamble. + if (TheTok.isAtStartOfLine()) { + unsigned TokOffset = TheTok.getLocation().getRawEncoding() - StartOffset; + + // If we were asked to limit the number of lines in the preamble, + // and we're about to exceed that limit, we're done. + if (MaxLineOffset && TokOffset >= MaxLineOffset) + break; + } + + // Comments are okay; skip over them. + if (TheTok.getKind() == tok::comment) { + if (ActiveCommentLoc.isInvalid()) + ActiveCommentLoc = TheTok.getLocation(); + continue; + } + + if (TheTok.isAtStartOfLine() && TheTok.getKind() == tok::hash) { + // This is the start of a preprocessor directive. + Token HashTok = TheTok; + InPreprocessorDirective = true; + ActiveCommentLoc = SourceLocation(); + + // Figure out which directive this is. Since we're lexing raw tokens, + // we don't have an identifier table available. Instead, just look at + // the raw identifier to recognize and categorize preprocessor directives. 
+ TheLexer.LexFromRawLexer(TheTok); + if (TheTok.getKind() == tok::raw_identifier && !TheTok.needsCleaning()) { + StringRef Keyword = TheTok.getRawIdentifier(); + PreambleDirectiveKind PDK + = llvm::StringSwitch<PreambleDirectiveKind>(Keyword) + .Case("include", PDK_Skipped) + .Case("__include_macros", PDK_Skipped) + .Case("define", PDK_Skipped) + .Case("undef", PDK_Skipped) + .Case("line", PDK_Skipped) + .Case("error", PDK_Skipped) + .Case("pragma", PDK_Skipped) + .Case("import", PDK_Skipped) + .Case("include_next", PDK_Skipped) + .Case("warning", PDK_Skipped) + .Case("ident", PDK_Skipped) + .Case("sccs", PDK_Skipped) + .Case("assert", PDK_Skipped) + .Case("unassert", PDK_Skipped) + .Case("if", PDK_Skipped) + .Case("ifdef", PDK_Skipped) + .Case("ifndef", PDK_Skipped) + .Case("elif", PDK_Skipped) + .Case("else", PDK_Skipped) + .Case("endif", PDK_Skipped) + .Default(PDK_Unknown); + + switch (PDK) { + case PDK_Skipped: + continue; + + case PDK_Unknown: + // We don't know what this directive is; stop at the '#'. + break; + } + } + + // We only end up here if we didn't recognize the preprocessor + // directive or it was one that can't occur in the preamble at this + // point. Roll back the current token to the location of the '#'. + TheTok = HashTok; + } + + // We hit a token that we don't recognize as being in the + // "preprocessing only" part of the file, so we're no longer in + // the preamble. + break; + } while (true); + + SourceLocation End; + if (ActiveCommentLoc.isValid()) + End = ActiveCommentLoc; // don't truncate a decl comment. + else + End = TheTok.getLocation(); + + return PreambleBounds(End.getRawEncoding() - FileLoc.getRawEncoding(), + TheTok.isAtStartOfLine()); +} + +unsigned Lexer::getTokenPrefixLength(SourceLocation TokStart, unsigned CharNo, + const SourceManager &SM, + const LangOptions &LangOpts) { + // Figure out how many physical characters away the specified expansion + // character is. 
This needs to take into consideration newlines and + // trigraphs. + bool Invalid = false; + const char *TokPtr = SM.getCharacterData(TokStart, &Invalid); + + // If they request the first char of the token, we're trivially done. + if (Invalid || (CharNo == 0 && Lexer::isObviouslySimpleCharacter(*TokPtr))) + return 0; + + unsigned PhysOffset = 0; + + // The usual case is that tokens don't contain anything interesting. Skip + // over the uninteresting characters. If a token only consists of simple + // chars, this method is extremely fast. + while (Lexer::isObviouslySimpleCharacter(*TokPtr)) { + if (CharNo == 0) + return PhysOffset; + ++TokPtr; + --CharNo; + ++PhysOffset; + } + + // If we have a character that may be a trigraph or escaped newline, use a + // lexer to parse it correctly. + for (; CharNo; --CharNo) { + unsigned Size; + Lexer::getCharAndSizeNoWarn(TokPtr, Size, LangOpts); + TokPtr += Size; + PhysOffset += Size; + } + + // Final detail: if we end up on an escaped newline, we want to return the + // location of the actual byte of the token. For example foo\<newline>bar + // advanced by 3 should return the location of b, not of \\. One compounding + // detail of this is that the escape may be made by a trigraph. + if (!Lexer::isObviouslySimpleCharacter(*TokPtr)) + PhysOffset += Lexer::SkipEscapedNewLines(TokPtr)-TokPtr; + + return PhysOffset; +} + +/// Computes the source location just past the end of the +/// token at this source location. +/// +/// This routine can be used to produce a source location that +/// points just past the end of the token referenced by \p Loc, and +/// is generally used when a diagnostic needs to point just after a +/// token where it expected something different that it received. If +/// the returned source location would not be meaningful (e.g., if +/// it points into a macro), this routine returns an invalid +/// source location. 
+/// +/// \param Offset an offset from the end of the token, where the source +/// location should refer to. The default offset (0) produces a source +/// location pointing just past the end of the token; an offset of 1 produces +/// a source location pointing to the last character in the token, etc. +SourceLocation Lexer::getLocForEndOfToken(SourceLocation Loc, unsigned Offset, + const SourceManager &SM, + const LangOptions &LangOpts) { + if (Loc.isInvalid()) + return {}; + + if (Loc.isMacroID()) { + if (Offset > 0 || !isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc)) + return {}; // Points inside the macro expansion. + } + + unsigned Len = Lexer::MeasureTokenLength(Loc, SM, LangOpts); + if (Len > Offset) + Len = Len - Offset; + else + return Loc; + + return Loc.getLocWithOffset(Len); +} + +/// Returns true if the given MacroID location points at the first +/// token of the macro expansion. +bool Lexer::isAtStartOfMacroExpansion(SourceLocation loc, + const SourceManager &SM, + const LangOptions &LangOpts, + SourceLocation *MacroBegin) { + assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc"); + + SourceLocation expansionLoc; + if (!SM.isAtStartOfImmediateMacroExpansion(loc, &expansionLoc)) + return false; + + if (expansionLoc.isFileID()) { + // No other macro expansions, this is the first. + if (MacroBegin) + *MacroBegin = expansionLoc; + return true; + } + + return isAtStartOfMacroExpansion(expansionLoc, SM, LangOpts, MacroBegin); +} + +/// Returns true if the given MacroID location points at the last +/// token of the macro expansion. 
+bool Lexer::isAtEndOfMacroExpansion(SourceLocation loc, + const SourceManager &SM, + const LangOptions &LangOpts, + SourceLocation *MacroEnd) { + assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc"); + + SourceLocation spellLoc = SM.getSpellingLoc(loc); + unsigned tokLen = MeasureTokenLength(spellLoc, SM, LangOpts); + if (tokLen == 0) + return false; + + SourceLocation afterLoc = loc.getLocWithOffset(tokLen); + SourceLocation expansionLoc; + if (!SM.isAtEndOfImmediateMacroExpansion(afterLoc, &expansionLoc)) + return false; + + if (expansionLoc.isFileID()) { + // No other macro expansions. + if (MacroEnd) + *MacroEnd = expansionLoc; + return true; + } + + return isAtEndOfMacroExpansion(expansionLoc, SM, LangOpts, MacroEnd); +} + +static CharSourceRange makeRangeFromFileLocs(CharSourceRange Range, + const SourceManager &SM, + const LangOptions &LangOpts) { + SourceLocation Begin = Range.getBegin(); + SourceLocation End = Range.getEnd(); + assert(Begin.isFileID() && End.isFileID()); + if (Range.isTokenRange()) { + End = Lexer::getLocForEndOfToken(End, 0, SM,LangOpts); + if (End.isInvalid()) + return {}; + } + + // Break down the source locations. 
+ FileID FID; + unsigned BeginOffs; + std::tie(FID, BeginOffs) = SM.getDecomposedLoc(Begin); + if (FID.isInvalid()) + return {}; + + unsigned EndOffs; + if (!SM.isInFileID(End, FID, &EndOffs) || + BeginOffs > EndOffs) + return {}; + + return CharSourceRange::getCharRange(Begin, End); +} + +CharSourceRange Lexer::makeFileCharRange(CharSourceRange Range, + const SourceManager &SM, + const LangOptions &LangOpts) { + SourceLocation Begin = Range.getBegin(); + SourceLocation End = Range.getEnd(); + if (Begin.isInvalid() || End.isInvalid()) + return {}; + + if (Begin.isFileID() && End.isFileID()) + return makeRangeFromFileLocs(Range, SM, LangOpts); + + if (Begin.isMacroID() && End.isFileID()) { + if (!isAtStartOfMacroExpansion(Begin, SM, LangOpts, &Begin)) + return {}; + Range.setBegin(Begin); + return makeRangeFromFileLocs(Range, SM, LangOpts); + } + + if (Begin.isFileID() && End.isMacroID()) { + if ((Range.isTokenRange() && !isAtEndOfMacroExpansion(End, SM, LangOpts, + &End)) || + (Range.isCharRange() && !isAtStartOfMacroExpansion(End, SM, LangOpts, + &End))) + return {}; + Range.setEnd(End); + return makeRangeFromFileLocs(Range, SM, LangOpts); + } + + assert(Begin.isMacroID() && End.isMacroID()); + SourceLocation MacroBegin, MacroEnd; + if (isAtStartOfMacroExpansion(Begin, SM, LangOpts, &MacroBegin) && + ((Range.isTokenRange() && isAtEndOfMacroExpansion(End, SM, LangOpts, + &MacroEnd)) || + (Range.isCharRange() && isAtStartOfMacroExpansion(End, SM, LangOpts, + &MacroEnd)))) { + Range.setBegin(MacroBegin); + Range.setEnd(MacroEnd); + return makeRangeFromFileLocs(Range, SM, LangOpts); + } + + bool Invalid = false; + const SrcMgr::SLocEntry &BeginEntry = SM.getSLocEntry(SM.getFileID(Begin), + &Invalid); + if (Invalid) + return {}; + + if (BeginEntry.getExpansion().isMacroArgExpansion()) { + const SrcMgr::SLocEntry &EndEntry = SM.getSLocEntry(SM.getFileID(End), + &Invalid); + if (Invalid) + return {}; + + if (EndEntry.getExpansion().isMacroArgExpansion() && + 
BeginEntry.getExpansion().getExpansionLocStart() == + EndEntry.getExpansion().getExpansionLocStart()) { + Range.setBegin(SM.getImmediateSpellingLoc(Begin)); + Range.setEnd(SM.getImmediateSpellingLoc(End)); + return makeFileCharRange(Range, SM, LangOpts); + } + } + + return {}; +} + +StringRef Lexer::getSourceText(CharSourceRange Range, + const SourceManager &SM, + const LangOptions &LangOpts, + bool *Invalid) { + Range = makeFileCharRange(Range, SM, LangOpts); + if (Range.isInvalid()) { + if (Invalid) *Invalid = true; + return {}; + } + + // Break down the source location. + std::pair<FileID, unsigned> beginInfo = SM.getDecomposedLoc(Range.getBegin()); + if (beginInfo.first.isInvalid()) { + if (Invalid) *Invalid = true; + return {}; + } + + unsigned EndOffs; + if (!SM.isInFileID(Range.getEnd(), beginInfo.first, &EndOffs) || + beginInfo.second > EndOffs) { + if (Invalid) *Invalid = true; + return {}; + } + + // Try to the load the file buffer. + bool invalidTemp = false; + StringRef file = SM.getBufferData(beginInfo.first, &invalidTemp); + if (invalidTemp) { + if (Invalid) *Invalid = true; + return {}; + } + + if (Invalid) *Invalid = false; + return file.substr(beginInfo.second, EndOffs - beginInfo.second); +} + +StringRef Lexer::getImmediateMacroName(SourceLocation Loc, + const SourceManager &SM, + const LangOptions &LangOpts) { + assert(Loc.isMacroID() && "Only reasonable to call this on macros"); + + // Find the location of the immediate macro expansion. + while (true) { + FileID FID = SM.getFileID(Loc); + const SrcMgr::SLocEntry *E = &SM.getSLocEntry(FID); + const SrcMgr::ExpansionInfo &Expansion = E->getExpansion(); + Loc = Expansion.getExpansionLocStart(); + if (!Expansion.isMacroArgExpansion()) + break; + + // For macro arguments we need to check that the argument did not come + // from an inner macro, e.g: "MAC1( MAC2(foo) )" + + // Loc points to the argument id of the macro definition, move to the + // macro expansion. 
+ Loc = SM.getImmediateExpansionRange(Loc).getBegin(); + SourceLocation SpellLoc = Expansion.getSpellingLoc(); + if (SpellLoc.isFileID()) + break; // No inner macro. + + // If spelling location resides in the same FileID as macro expansion + // location, it means there is no inner macro. + FileID MacroFID = SM.getFileID(Loc); + if (SM.isInFileID(SpellLoc, MacroFID)) + break; + + // Argument came from inner macro. + Loc = SpellLoc; + } + + // Find the spelling location of the start of the non-argument expansion + // range. This is where the macro name was spelled in order to begin + // expanding this macro. + Loc = SM.getSpellingLoc(Loc); + + // Dig out the buffer where the macro name was spelled and the extents of the + // name so that we can render it into the expansion note. + std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc); + unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts); + StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first); + return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength); +} + +StringRef Lexer::getImmediateMacroNameForDiagnostics( + SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts) { + assert(Loc.isMacroID() && "Only reasonable to call this on macros"); + // Walk past macro argument expansions. + while (SM.isMacroArgExpansion(Loc)) + Loc = SM.getImmediateExpansionRange(Loc).getBegin(); + + // If the macro's spelling has no FileID, then it's actually a token paste + // or stringization (or similar) and not a macro at all. + if (!SM.getFileEntryForID(SM.getFileID(SM.getSpellingLoc(Loc)))) + return {}; + + // Find the spelling location of the start of the non-argument expansion + // range. This is where the macro name was spelled in order to begin + // expanding this macro. 
+ Loc = SM.getSpellingLoc(SM.getImmediateExpansionRange(Loc).getBegin()); + + // Dig out the buffer where the macro name was spelled and the extents of the + // name so that we can render it into the expansion note. + std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc); + unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts); + StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first); + return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength); +} + +bool Lexer::isIdentifierBodyChar(char c, const LangOptions &LangOpts) { + return isIdentifierBody(c, LangOpts.DollarIdents); +} + +bool Lexer::isNewLineEscaped(const char *BufferStart, const char *Str) { + assert(isVerticalWhitespace(Str[0])); + if (Str - 1 < BufferStart) + return false; + + if ((Str[0] == '\n' && Str[-1] == '\r') || + (Str[0] == '\r' && Str[-1] == '\n')) { + if (Str - 2 < BufferStart) + return false; + --Str; + } + --Str; + + // Rewind to first non-space character: + while (Str > BufferStart && isHorizontalWhitespace(*Str)) + --Str; + + return *Str == '\\'; +} + +StringRef Lexer::getIndentationForLine(SourceLocation Loc, + const SourceManager &SM) { + if (Loc.isInvalid() || Loc.isMacroID()) + return {}; + std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc); + if (LocInfo.first.isInvalid()) + return {}; + bool Invalid = false; + StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid); + if (Invalid) + return {}; + const char *Line = findBeginningOfLine(Buffer, LocInfo.second); + if (!Line) + return {}; + StringRef Rest = Buffer.substr(Line - Buffer.data()); + size_t NumWhitespaceChars = Rest.find_first_not_of(" \t"); + return NumWhitespaceChars == StringRef::npos + ? "" + : Rest.take_front(NumWhitespaceChars); +} + +//===----------------------------------------------------------------------===// +// Diagnostics forwarding code. 
+//===----------------------------------------------------------------------===// + +/// GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend the +/// lexer buffer was all expanded at a single point, perform the mapping. +/// This is currently only used for _Pragma implementation, so it is the slow +/// path of the hot getSourceLocation method. Do not allow it to be inlined. +static LLVM_ATTRIBUTE_NOINLINE SourceLocation GetMappedTokenLoc( + Preprocessor &PP, SourceLocation FileLoc, unsigned CharNo, unsigned TokLen); +static SourceLocation GetMappedTokenLoc(Preprocessor &PP, + SourceLocation FileLoc, + unsigned CharNo, unsigned TokLen) { + assert(FileLoc.isMacroID() && "Must be a macro expansion"); + + // Otherwise, we're lexing "mapped tokens". This is used for things like + // _Pragma handling. Combine the expansion location of FileLoc with the + // spelling location. + SourceManager &SM = PP.getSourceManager(); + + // Create a new SLoc which is expanded from Expansion(FileLoc) but whose + // characters come from spelling(FileLoc)+Offset. + SourceLocation SpellingLoc = SM.getSpellingLoc(FileLoc); + SpellingLoc = SpellingLoc.getLocWithOffset(CharNo); + + // Figure out the expansion loc range, which is the range covered by the + // original _Pragma(...) sequence. + CharSourceRange II = SM.getImmediateExpansionRange(FileLoc); + + return SM.createExpansionLoc(SpellingLoc, II.getBegin(), II.getEnd(), TokLen); +} + +/// getSourceLocation - Return a source location identifier for the specified +/// offset in the current file. +SourceLocation Lexer::getSourceLocation(const char *Loc, + unsigned TokLen) const { + assert(Loc >= BufferStart && Loc <= BufferEnd && + "Location out of range for this buffer!"); + + // In the normal case, we're just lexing from a simple file buffer, return + // the file id from FileLoc with the offset specified. 
  // (Tail of Lexer::getSourceLocation, whose header appears above this chunk.)
  // Fast path: lexing straight from a file buffer — the location is just
  // FileLoc plus the byte offset into the buffer.
  unsigned CharNo = Loc-BufferStart;
  if (FileLoc.isFileID())
    return FileLoc.getLocWithOffset(CharNo);

  // Otherwise, this is the _Pragma lexer case, which pretends that all of the
  // tokens are lexed from where the _Pragma was defined.
  assert(PP && "This doesn't work on raw lexers");
  return GetMappedTokenLoc(*PP, FileLoc, CharNo, TokLen);
}

/// Diag - Forwarding function for diagnostics.  This translate a source
/// position in the current buffer into a SourceLocation object for rendering.
DiagnosticBuilder Lexer::Diag(const char *Loc, unsigned DiagID) const {
  return PP->Diag(getSourceLocation(Loc), DiagID);
}

//===----------------------------------------------------------------------===//
// Trigraph and Escaped Newline Handling Code.
//===----------------------------------------------------------------------===//

/// GetTrigraphCharForLetter - Given a character that occurs after a ?? pair,
/// return the decoded trigraph letter it corresponds to, or '\0' if nothing.
static char GetTrigraphCharForLetter(char Letter) {
  switch (Letter) {
  default:   return 0;
  case '=':  return '#';
  case ')':  return ']';
  case '(':  return '[';
  case '!':  return '|';
  case '\'': return '^';
  case '>':  return '}';
  case '/':  return '\\';
  case '<':  return '{';
  case '-':  return '~';
  }
}

/// DecodeTrigraphChar - If the specified character is a legal trigraph when
/// prefixed with ??, emit a trigraph warning.  If trigraphs are enabled,
/// return the result character.  Finally, emit a warning about trigraph use
/// whether trigraphs are enabled or not.
///
/// \param CP points at the third character of the trigraph (the letter after
///        "??"), so CP-2 is the first '?' — the location used for diagnostics.
/// \param L  the lexer for LangOpts/diagnostics, or null to decode silently.
static char DecodeTrigraphChar(const char *CP, Lexer *L) {
  char Res = GetTrigraphCharForLetter(*CP);
  // Not a trigraph letter, or no lexer to consult: just report the decode.
  if (!Res || !L) return Res;

  // Trigraphs disabled: warn (unless in raw mode) and return 0 so the caller
  // treats the '?' literally.
  if (!L->getLangOpts().Trigraphs) {
    if (!L->isLexingRawMode())
      L->Diag(CP-2, diag::trigraph_ignored);
    return 0;
  }

  if (!L->isLexingRawMode())
    L->Diag(CP-2, diag::trigraph_converted) << StringRef(&Res, 1);
  return Res;
}

/// getEscapedNewLineSize - Return the size of the specified escaped newline,
/// or 0 if it is not an escaped newline. P[-1] is known to be a "\" or a
/// trigraph equivalent on entry to this function.
///
/// Note: horizontal whitespace between the backslash and the newline is
/// tolerated and counted in the returned size.
unsigned Lexer::getEscapedNewLineSize(const char *Ptr) {
  unsigned Size = 0;
  while (isWhitespace(Ptr[Size])) {
    ++Size;

    // Keep scanning until the whitespace we consumed is an actual newline.
    if (Ptr[Size-1] != '\n' && Ptr[Size-1] != '\r')
      continue;

    // If this is a \r\n or \n\r, skip the other half.
    if ((Ptr[Size] == '\r' || Ptr[Size] == '\n') &&
        Ptr[Size-1] != Ptr[Size])
      ++Size;

    return Size;
  }

  // Not an escaped newline, must be a \t or something else.
  return 0;
}

/// SkipEscapedNewLines - If P points to an escaped newline (or a series of
/// them), skip over them and return the first non-escaped-newline found,
/// otherwise return P.
const char *Lexer::SkipEscapedNewLines(const char *P) {
  while (true) {
    const char *AfterEscape;
    if (*P == '\\') {
      AfterEscape = P+1;
    } else if (*P == '?') {
      // If not a trigraph for escape, bail out.
      if (P[1] != '?' || P[2] != '/')
        return P;
      // FIXME: Take LangOpts into account; the language might not
      // support trigraphs.
+ AfterEscape = P+3; + } else { + return P; + } + + unsigned NewLineSize = Lexer::getEscapedNewLineSize(AfterEscape); + if (NewLineSize == 0) return P; + P = AfterEscape+NewLineSize; + } +} + +Optional<Token> Lexer::findNextToken(SourceLocation Loc, + const SourceManager &SM, + const LangOptions &LangOpts) { + if (Loc.isMacroID()) { + if (!Lexer::isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc)) + return None; + } + Loc = Lexer::getLocForEndOfToken(Loc, 0, SM, LangOpts); + + // Break down the source location. + std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc); + + // Try to load the file buffer. + bool InvalidTemp = false; + StringRef File = SM.getBufferData(LocInfo.first, &InvalidTemp); + if (InvalidTemp) + return None; + + const char *TokenBegin = File.data() + LocInfo.second; + + // Lex from the start of the given location. + Lexer lexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, File.begin(), + TokenBegin, File.end()); + // Find the token. + Token Tok; + lexer.LexFromRawLexer(Tok); + return Tok; +} + +/// Checks that the given token is the first token that occurs after the +/// given location (this excludes comments and whitespace). Returns the location +/// immediately after the specified token. If the token is not found or the +/// location is inside a macro, the returned source location will be invalid. +SourceLocation Lexer::findLocationAfterToken( + SourceLocation Loc, tok::TokenKind TKind, const SourceManager &SM, + const LangOptions &LangOpts, bool SkipTrailingWhitespaceAndNewLine) { + Optional<Token> Tok = findNextToken(Loc, SM, LangOpts); + if (!Tok || Tok->isNot(TKind)) + return {}; + SourceLocation TokenLoc = Tok->getLocation(); + + // Calculate how much whitespace needs to be skipped if any. 
+ unsigned NumWhitespaceChars = 0; + if (SkipTrailingWhitespaceAndNewLine) { + const char *TokenEnd = SM.getCharacterData(TokenLoc) + Tok->getLength(); + unsigned char C = *TokenEnd; + while (isHorizontalWhitespace(C)) { + C = *(++TokenEnd); + NumWhitespaceChars++; + } + + // Skip \r, \n, \r\n, or \n\r + if (C == '\n' || C == '\r') { + char PrevC = C; + C = *(++TokenEnd); + NumWhitespaceChars++; + if ((C == '\n' || C == '\r') && C != PrevC) + NumWhitespaceChars++; + } + } + + return TokenLoc.getLocWithOffset(Tok->getLength() + NumWhitespaceChars); +} + +/// getCharAndSizeSlow - Peek a single 'character' from the specified buffer, +/// get its size, and return it. This is tricky in several cases: +/// 1. If currently at the start of a trigraph, we warn about the trigraph, +/// then either return the trigraph (skipping 3 chars) or the '?', +/// depending on whether trigraphs are enabled or not. +/// 2. If this is an escaped newline (potentially with whitespace between +/// the backslash and newline), implicitly skip the newline and return +/// the char after it. +/// +/// This handles the slow/uncommon case of the getCharAndSize method. Here we +/// know that we can accumulate into Size, and that we have already incremented +/// Ptr by Size bytes. +/// +/// NOTE: When this method is updated, getCharAndSizeSlowNoWarn (below) should +/// be updated to match. +char Lexer::getCharAndSizeSlow(const char *Ptr, unsigned &Size, + Token *Tok) { + // If we have a slash, look for an escaped newline. + if (Ptr[0] == '\\') { + ++Size; + ++Ptr; +Slash: + // Common case, backslash-char where the char is not whitespace. + if (!isWhitespace(Ptr[0])) return '\\'; + + // See if we have optional whitespace characters between the slash and + // newline. + if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) { + // Remember that this token needs to be cleaned. + if (Tok) Tok->setFlag(Token::NeedsCleaning); + + // Warn if there was whitespace between the backslash and newline. 
+ if (Ptr[0] != '\n' && Ptr[0] != '\r' && Tok && !isLexingRawMode()) + Diag(Ptr, diag::backslash_newline_space); + + // Found backslash<whitespace><newline>. Parse the char after it. + Size += EscapedNewLineSize; + Ptr += EscapedNewLineSize; + + // Use slow version to accumulate a correct size field. + return getCharAndSizeSlow(Ptr, Size, Tok); + } + + // Otherwise, this is not an escaped newline, just return the slash. + return '\\'; + } + + // If this is a trigraph, process it. + if (Ptr[0] == '?' && Ptr[1] == '?') { + // If this is actually a legal trigraph (not something like "??x"), emit + // a trigraph warning. If so, and if trigraphs are enabled, return it. + if (char C = DecodeTrigraphChar(Ptr+2, Tok ? this : nullptr)) { + // Remember that this token needs to be cleaned. + if (Tok) Tok->setFlag(Token::NeedsCleaning); + + Ptr += 3; + Size += 3; + if (C == '\\') goto Slash; + return C; + } + } + + // If this is neither, return a single character. + ++Size; + return *Ptr; +} + +/// getCharAndSizeSlowNoWarn - Handle the slow/uncommon case of the +/// getCharAndSizeNoWarn method. Here we know that we can accumulate into Size, +/// and that we have already incremented Ptr by Size bytes. +/// +/// NOTE: When this method is updated, getCharAndSizeSlow (above) should +/// be updated to match. +char Lexer::getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size, + const LangOptions &LangOpts) { + // If we have a slash, look for an escaped newline. + if (Ptr[0] == '\\') { + ++Size; + ++Ptr; +Slash: + // Common case, backslash-char where the char is not whitespace. + if (!isWhitespace(Ptr[0])) return '\\'; + + // See if we have optional whitespace characters followed by a newline. + if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) { + // Found backslash<whitespace><newline>. Parse the char after it. + Size += EscapedNewLineSize; + Ptr += EscapedNewLineSize; + + // Use slow version to accumulate a correct size field. 
+ return getCharAndSizeSlowNoWarn(Ptr, Size, LangOpts); + } + + // Otherwise, this is not an escaped newline, just return the slash. + return '\\'; + } + + // If this is a trigraph, process it. + if (LangOpts.Trigraphs && Ptr[0] == '?' && Ptr[1] == '?') { + // If this is actually a legal trigraph (not something like "??x"), return + // it. + if (char C = GetTrigraphCharForLetter(Ptr[2])) { + Ptr += 3; + Size += 3; + if (C == '\\') goto Slash; + return C; + } + } + + // If this is neither, return a single character. + ++Size; + return *Ptr; +} + +//===----------------------------------------------------------------------===// +// Helper methods for lexing. +//===----------------------------------------------------------------------===// + +/// Routine that indiscriminately sets the offset into the source file. +void Lexer::SetByteOffset(unsigned Offset, bool StartOfLine) { + BufferPtr = BufferStart + Offset; + if (BufferPtr > BufferEnd) + BufferPtr = BufferEnd; + // FIXME: What exactly does the StartOfLine bit mean? There are two + // possible meanings for the "start" of the line: the first token on the + // unexpanded line, or the first token on the expanded line. 
+ IsAtStartOfLine = StartOfLine; + IsAtPhysicalStartOfLine = StartOfLine; +} + +static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts) { + if (LangOpts.AsmPreprocessor) { + return false; + } else if (LangOpts.CPlusPlus11 || LangOpts.C11) { + static const llvm::sys::UnicodeCharSet C11AllowedIDChars( + C11AllowedIDCharRanges); + return C11AllowedIDChars.contains(C); + } else if (LangOpts.CPlusPlus) { + static const llvm::sys::UnicodeCharSet CXX03AllowedIDChars( + CXX03AllowedIDCharRanges); + return CXX03AllowedIDChars.contains(C); + } else { + static const llvm::sys::UnicodeCharSet C99AllowedIDChars( + C99AllowedIDCharRanges); + return C99AllowedIDChars.contains(C); + } +} + +static bool isAllowedInitiallyIDChar(uint32_t C, const LangOptions &LangOpts) { + assert(isAllowedIDChar(C, LangOpts)); + if (LangOpts.AsmPreprocessor) { + return false; + } else if (LangOpts.CPlusPlus11 || LangOpts.C11) { + static const llvm::sys::UnicodeCharSet C11DisallowedInitialIDChars( + C11DisallowedInitialIDCharRanges); + return !C11DisallowedInitialIDChars.contains(C); + } else if (LangOpts.CPlusPlus) { + return true; + } else { + static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars( + C99DisallowedInitialIDCharRanges); + return !C99DisallowedInitialIDChars.contains(C); + } +} + +static inline CharSourceRange makeCharRange(Lexer &L, const char *Begin, + const char *End) { + return CharSourceRange::getCharRange(L.getSourceLocation(Begin), + L.getSourceLocation(End)); +} + +static void maybeDiagnoseIDCharCompat(DiagnosticsEngine &Diags, uint32_t C, + CharSourceRange Range, bool IsFirst) { + // Check C99 compatibility. 
+ if (!Diags.isIgnored(diag::warn_c99_compat_unicode_id, Range.getBegin())) { + enum { + CannotAppearInIdentifier = 0, + CannotStartIdentifier + }; + + static const llvm::sys::UnicodeCharSet C99AllowedIDChars( + C99AllowedIDCharRanges); + static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars( + C99DisallowedInitialIDCharRanges); + if (!C99AllowedIDChars.contains(C)) { + Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id) + << Range + << CannotAppearInIdentifier; + } else if (IsFirst && C99DisallowedInitialIDChars.contains(C)) { + Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id) + << Range + << CannotStartIdentifier; + } + } + + // Check C++98 compatibility. + if (!Diags.isIgnored(diag::warn_cxx98_compat_unicode_id, Range.getBegin())) { + static const llvm::sys::UnicodeCharSet CXX03AllowedIDChars( + CXX03AllowedIDCharRanges); + if (!CXX03AllowedIDChars.contains(C)) { + Diags.Report(Range.getBegin(), diag::warn_cxx98_compat_unicode_id) + << Range; + } + } +} + +/// After encountering UTF-8 character C and interpreting it as an identifier +/// character, check whether it's a homoglyph for a common non-identifier +/// source character that is unlikely to be an intentional identifier +/// character and warn if so. +static void maybeDiagnoseUTF8Homoglyph(DiagnosticsEngine &Diags, uint32_t C, + CharSourceRange Range) { + // FIXME: Handle Unicode quotation marks (smart quotes, fullwidth quotes). 
  // (Body of maybeDiagnoseUTF8Homoglyph, whose header appears above this
  // chunk.) Maps a code point to the ASCII character it visually resembles,
  // or to 0 for invisible/zero-width characters.
  struct HomoglyphPair {
    uint32_t Character;
    char LooksLike;
    bool operator<(HomoglyphPair R) const { return Character < R.Character; }
  };
  // NOTE: must remain sorted by code point — it is searched with
  // std::lower_bound below. The trailing {0, 0} sentinel is excluded from the
  // search range via std::end(...) - 1.
  static constexpr HomoglyphPair SortedHomoglyphs[] = {
    {U'\u00ad', 0},   // SOFT HYPHEN
    {U'\u01c3', '!'}, // LATIN LETTER RETROFLEX CLICK
    {U'\u037e', ';'}, // GREEK QUESTION MARK
    {U'\u200b', 0},   // ZERO WIDTH SPACE
    {U'\u200c', 0},   // ZERO WIDTH NON-JOINER
    {U'\u200d', 0},   // ZERO WIDTH JOINER
    {U'\u2060', 0},   // WORD JOINER
    {U'\u2061', 0},   // FUNCTION APPLICATION
    {U'\u2062', 0},   // INVISIBLE TIMES
    {U'\u2063', 0},   // INVISIBLE SEPARATOR
    {U'\u2064', 0},   // INVISIBLE PLUS
    {U'\u2212', '-'}, // MINUS SIGN
    {U'\u2215', '/'}, // DIVISION SLASH
    {U'\u2216', '\\'}, // SET MINUS
    {U'\u2217', '*'}, // ASTERISK OPERATOR
    {U'\u2223', '|'}, // DIVIDES
    {U'\u2227', '^'}, // LOGICAL AND
    {U'\u2236', ':'}, // RATIO
    {U'\u223c', '~'}, // TILDE OPERATOR
    {U'\ua789', ':'}, // MODIFIER LETTER COLON
    {U'\ufeff', 0},   // ZERO WIDTH NO-BREAK SPACE
    {U'\uff01', '!'}, // FULLWIDTH EXCLAMATION MARK
    {U'\uff03', '#'}, // FULLWIDTH NUMBER SIGN
    {U'\uff04', '$'}, // FULLWIDTH DOLLAR SIGN
    {U'\uff05', '%'}, // FULLWIDTH PERCENT SIGN
    {U'\uff06', '&'}, // FULLWIDTH AMPERSAND
    {U'\uff08', '('}, // FULLWIDTH LEFT PARENTHESIS
    {U'\uff09', ')'}, // FULLWIDTH RIGHT PARENTHESIS
    {U'\uff0a', '*'}, // FULLWIDTH ASTERISK
    {U'\uff0b', '+'}, // FULLWIDTH PLUS SIGN
    {U'\uff0c', ','}, // FULLWIDTH COMMA
    {U'\uff0d', '-'}, // FULLWIDTH HYPHEN-MINUS
    {U'\uff0e', '.'}, // FULLWIDTH FULL STOP
    {U'\uff0f', '/'}, // FULLWIDTH SOLIDUS
    {U'\uff1a', ':'}, // FULLWIDTH COLON
    {U'\uff1b', ';'}, // FULLWIDTH SEMICOLON
    {U'\uff1c', '<'}, // FULLWIDTH LESS-THAN SIGN
    {U'\uff1d', '='}, // FULLWIDTH EQUALS SIGN
    {U'\uff1e', '>'}, // FULLWIDTH GREATER-THAN SIGN
    {U'\uff1f', '?'}, // FULLWIDTH QUESTION MARK
    {U'\uff20', '@'}, // FULLWIDTH COMMERCIAL AT
    {U'\uff3b', '['}, // FULLWIDTH LEFT SQUARE BRACKET
    {U'\uff3c', '\\'}, // FULLWIDTH REVERSE SOLIDUS
    {U'\uff3d', ']'}, // FULLWIDTH RIGHT SQUARE BRACKET
    {U'\uff3e', '^'}, // FULLWIDTH CIRCUMFLEX ACCENT
    {U'\uff5b', '{'}, // FULLWIDTH LEFT CURLY BRACKET
    {U'\uff5c', '|'}, // FULLWIDTH VERTICAL LINE
    {U'\uff5d', '}'}, // FULLWIDTH RIGHT CURLY BRACKET
    {U'\uff5e', '~'}, // FULLWIDTH TILDE
    {0, 0}
  };
  auto Homoglyph =
      std::lower_bound(std::begin(SortedHomoglyphs),
                       std::end(SortedHomoglyphs) - 1, HomoglyphPair{C, '\0'});
  if (Homoglyph->Character == C) {
    // Render the code point as 4+ uppercase hex digits for the diagnostic.
    llvm::SmallString<5> CharBuf;
    {
      llvm::raw_svector_ostream CharOS(CharBuf);
      llvm::write_hex(CharOS, C, llvm::HexPrintStyle::Upper, 4);
    }
    if (Homoglyph->LooksLike) {
      const char LooksLikeStr[] = {Homoglyph->LooksLike, 0};
      Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_homoglyph)
          << Range << CharBuf << LooksLikeStr;
    } else {
      // LooksLike == 0 marks an invisible / zero-width character.
      Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_zero_width)
          << Range << CharBuf;
    }
  }
}

/// Tries to consume a universal-character-name (\uXXXX or \UXXXXXXXX) that
/// continues an identifier. On success advances CurPtr past the UCN, marks
/// the token as containing a UCN, and returns true; otherwise returns false
/// and leaves CurPtr untouched. \p Size is the consumed size of the current
/// '\\' character (may exceed 1 when spelled via trigraph/escaped newline).
bool Lexer::tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size,
                                    Token &Result) {
  const char *UCNPtr = CurPtr + Size;
  uint32_t CodePoint = tryReadUCN(UCNPtr, CurPtr, /*Token=*/nullptr);
  if (CodePoint == 0 || !isAllowedIDChar(CodePoint, LangOpts))
    return false;

  if (!isLexingRawMode())
    maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
                              makeCharRange(*this, CurPtr, UCNPtr),
                              /*IsFirst=*/false);

  Result.setFlag(Token::HasUCN);
  // Fast path: a cleanly spelled \uXXXX (6 chars) or \UXXXXXXXX (10 chars)
  // can be skipped in one step; otherwise re-lex char by char so escaped
  // newlines/trigraphs inside the UCN are accounted for.
  if ((UCNPtr - CurPtr == 6 && CurPtr[1] == 'u') ||
      (UCNPtr - CurPtr == 10 && CurPtr[1] == 'U'))
    CurPtr = UCNPtr;
  else
    while (CurPtr != UCNPtr)
      (void)getAndAdvanceChar(CurPtr, Result);
  return true;
}

bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr) {
  const char *UnicodePtr = CurPtr;
  llvm::UTF32 CodePoint;
  llvm::ConversionResult Result =
      llvm::convertUTF8Sequence((const llvm::UTF8 **)&UnicodePtr,
                                (const llvm::UTF8 *)BufferEnd,
                                &CodePoint,
                                llvm::strictConversion);
  if (Result != llvm::conversionOK ||
!isAllowedIDChar(static_cast<uint32_t>(CodePoint), LangOpts)) + return false; + + if (!isLexingRawMode()) { + maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint, + makeCharRange(*this, CurPtr, UnicodePtr), + /*IsFirst=*/false); + maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), CodePoint, + makeCharRange(*this, CurPtr, UnicodePtr)); + } + + CurPtr = UnicodePtr; + return true; +} + +bool Lexer::LexIdentifier(Token &Result, const char *CurPtr) { + // Match [_A-Za-z0-9]*, we have already matched [_A-Za-z$] + unsigned Size; + unsigned char C = *CurPtr++; + while (isIdentifierBody(C)) + C = *CurPtr++; + + --CurPtr; // Back up over the skipped character. + + // Fast path, no $,\,? in identifier found. '\' might be an escaped newline + // or UCN, and ? might be a trigraph for '\', an escaped newline or UCN. + // + // TODO: Could merge these checks into an InfoTable flag to make the + // comparison cheaper + if (isASCII(C) && C != '\\' && C != '?' && + (C != '$' || !LangOpts.DollarIdents)) { +FinishIdentifier: + const char *IdStart = BufferPtr; + FormTokenWithChars(Result, CurPtr, tok::raw_identifier); + Result.setRawIdentifierData(IdStart); + + // If we are in raw mode, return this identifier raw. There is no need to + // look up identifier information or attempt to macro expand it. + if (LexingRawMode) + return true; + + // Fill in Result.IdentifierInfo and update the token kind, + // looking up the identifier in the identifier table. + IdentifierInfo *II = PP->LookUpIdentifierInfo(Result); + // Note that we have to call PP->LookUpIdentifierInfo() even for code + // completion, it writes IdentifierInfo into Result, and callers rely on it. + + // If the completion point is at the end of an identifier, we want to treat + // the identifier as incomplete even if it resolves to a macro or a keyword. + // This allows e.g. 'class^' to complete to 'classifier'. + if (isCodeCompletionPoint(CurPtr)) { + // Return the code-completion token. 
+ Result.setKind(tok::code_completion); + // Skip the code-completion char and all immediate identifier characters. + // This ensures we get consistent behavior when completing at any point in + // an identifier (i.e. at the start, in the middle, at the end). Note that + // only simple cases (i.e. [a-zA-Z0-9_]) are supported to keep the code + // simpler. + assert(*CurPtr == 0 && "Completion character must be 0"); + ++CurPtr; + // Note that code completion token is not added as a separate character + // when the completion point is at the end of the buffer. Therefore, we need + // to check if the buffer has ended. + if (CurPtr < BufferEnd) { + while (isIdentifierBody(*CurPtr)) + ++CurPtr; + } + BufferPtr = CurPtr; + return true; + } + + // Finally, now that we know we have an identifier, pass this off to the + // preprocessor, which may macro expand it or something. + if (II->isHandleIdentifierCase()) + return PP->HandleIdentifier(Result); + + return true; + } + + // Otherwise, $,\,? in identifier found. Enter slower path. + + C = getCharAndSize(CurPtr, Size); + while (true) { + if (C == '$') { + // If we hit a $ and they are not supported in identifiers, we are done. + if (!LangOpts.DollarIdents) goto FinishIdentifier; + + // Otherwise, emit a diagnostic and continue. + if (!isLexingRawMode()) + Diag(CurPtr, diag::ext_dollar_in_identifier); + CurPtr = ConsumeChar(CurPtr, Size, Result); + C = getCharAndSize(CurPtr, Size); + continue; + } else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) { + C = getCharAndSize(CurPtr, Size); + continue; + } else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) { + C = getCharAndSize(CurPtr, Size); + continue; + } else if (!isIdentifierBody(C)) { + goto FinishIdentifier; + } + + // Otherwise, this character is good, consume it. 
+ CurPtr = ConsumeChar(CurPtr, Size, Result); + + C = getCharAndSize(CurPtr, Size); + while (isIdentifierBody(C)) { + CurPtr = ConsumeChar(CurPtr, Size, Result); + C = getCharAndSize(CurPtr, Size); + } + } +} + +/// isHexaLiteral - Return true if Start points to a hex constant. +/// in microsoft mode (where this is supposed to be several different tokens). +bool Lexer::isHexaLiteral(const char *Start, const LangOptions &LangOpts) { + unsigned Size; + char C1 = Lexer::getCharAndSizeNoWarn(Start, Size, LangOpts); + if (C1 != '0') + return false; + char C2 = Lexer::getCharAndSizeNoWarn(Start + Size, Size, LangOpts); + return (C2 == 'x' || C2 == 'X'); +} + +/// LexNumericConstant - Lex the remainder of a integer or floating point +/// constant. From[-1] is the first character lexed. Return the end of the +/// constant. +bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) { + unsigned Size; + char C = getCharAndSize(CurPtr, Size); + char PrevCh = 0; + while (isPreprocessingNumberBody(C)) { + CurPtr = ConsumeChar(CurPtr, Size, Result); + PrevCh = C; + C = getCharAndSize(CurPtr, Size); + } + + // If we fell out, check for a sign, due to 1e+12. If we have one, continue. + if ((C == '-' || C == '+') && (PrevCh == 'E' || PrevCh == 'e')) { + // If we are in Microsoft mode, don't continue if the constant is hex. + // For example, MSVC will accept the following as 3 tokens: 0x1234567e+1 + if (!LangOpts.MicrosoftExt || !isHexaLiteral(BufferPtr, LangOpts)) + return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result)); + } + + // If we have a hex FP constant, continue. + if ((C == '-' || C == '+') && (PrevCh == 'P' || PrevCh == 'p')) { + // Outside C99 and C++17, we accept hexadecimal floating point numbers as a + // not-quite-conforming extension. Only do so if this looks like it's + // actually meant to be a hexfloat, and not if it has a ud-suffix. 
    // Continuation of LexNumericConstant: decide whether to keep lexing a
    // hexadecimal floating-point constant. Outside C99 we only accept it as an
    // extension when the literal really is hexadecimal and (before C++17)
    // contains no '_' (which would instead indicate a ud-suffix).
    bool IsHexFloat = true;
    if (!LangOpts.C99) {
      if (!isHexaLiteral(BufferPtr, LangOpts))
        IsHexFloat = false;
      else if (!getLangOpts().CPlusPlus17 &&
               std::find(BufferPtr, CurPtr, '_') != CurPtr)
        IsHexFloat = false;
    }
    if (IsHexFloat)
      return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
  }

  // If we have a digit separator, continue.
  if (C == '\'' && getLangOpts().CPlusPlus14) {
    unsigned NextSize;
    char Next = getCharAndSizeNoWarn(CurPtr + Size, NextSize, getLangOpts());
    if (isIdentifierBody(Next)) {
      if (!isLexingRawMode())
        Diag(CurPtr, diag::warn_cxx11_compat_digit_separator);
      CurPtr = ConsumeChar(CurPtr, Size, Result);
      CurPtr = ConsumeChar(CurPtr, NextSize, Result);
      return LexNumericConstant(Result, CurPtr);
    }
  }

  // If we have a UCN or UTF-8 character (perhaps in a ud-suffix), continue.
  if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
    return LexNumericConstant(Result, CurPtr);
  if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr))
    return LexNumericConstant(Result, CurPtr);

  // Update the location of token as well as BufferPtr.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, tok::numeric_constant);
  Result.setLiteralData(TokStart);
  return true;
}

/// LexUDSuffix - Lex the ud-suffix production for user-defined literal suffixes
/// in C++11, or warn on a ud-suffix in C++98.
///
/// \param Result the literal token the suffix attaches to; gets the
///        Token::HasUDSuffix flag when a suffix is consumed.
/// \param CurPtr points at the first character after the closing quote.
/// \param IsStringLiteral enables the C++14 lookahead that accepts standard
///        (non-underscore) suffixes on string literals.
/// \returns a pointer past the consumed ud-suffix, or \p CurPtr unchanged if
///          no suffix was consumed.
const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr,
                               bool IsStringLiteral) {
  assert(getLangOpts().CPlusPlus);

  // Maximally munch an identifier.
  unsigned Size;
  char C = getCharAndSize(CurPtr, Size);
  bool Consumed = false;

  if (!isIdentifierHead(C)) {
    if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
      Consumed = true;
    else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr))
      Consumed = true;
    else
      return CurPtr;
  }

  if (!getLangOpts().CPlusPlus11) {
    // Pre-C++11 this is just an identifier following the literal; warn about
    // the C++11 incompatibility and suggest inserting a separating space.
    if (!isLexingRawMode())
      Diag(CurPtr,
           C == '_' ? diag::warn_cxx11_compat_user_defined_literal
                    : diag::warn_cxx11_compat_reserved_user_defined_literal)
          << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " ");
    return CurPtr;
  }

  // C++11 [lex.ext]p10, [usrlit.suffix]p1: A program containing a ud-suffix
  // that does not start with an underscore is ill-formed. As a conforming
  // extension, we treat all such suffixes as if they had whitespace before
  // them. We assume a suffix beginning with a UCN or UTF-8 character is more
  // likely to be a ud-suffix than a macro, however, and accept that.
  if (!Consumed) {
    bool IsUDSuffix = false;
    if (C == '_')
      IsUDSuffix = true;
    else if (IsStringLiteral && getLangOpts().CPlusPlus14) {
      // In C++1y, we need to look ahead a few characters to see if this is a
      // valid suffix for a string literal or a numeric literal (this could be
      // the 'operator""if' defining a numeric literal operator).
      const unsigned MaxStandardSuffixLength = 3;
      char Buffer[MaxStandardSuffixLength] = { C };
      unsigned Consumed = Size;
      unsigned Chars = 1;
      while (true) {
        unsigned NextSize;
        char Next = getCharAndSizeNoWarn(CurPtr + Consumed, NextSize,
                                         getLangOpts());
        if (!isIdentifierBody(Next)) {
          // End of suffix. Check whether this is on the whitelist.
          const StringRef CompleteSuffix(Buffer, Chars);
          IsUDSuffix = StringLiteralParser::isValidUDSuffix(getLangOpts(),
                                                            CompleteSuffix);
          break;
        }

        if (Chars == MaxStandardSuffixLength)
          // Too long: can't be a standard suffix.
          break;

        Buffer[Chars++] = Next;
        Consumed += NextSize;
      }
    }

    if (!IsUDSuffix) {
      if (!isLexingRawMode())
        Diag(CurPtr, getLangOpts().MSVCCompat
                         ? diag::ext_ms_reserved_user_defined_literal
                         : diag::ext_reserved_user_defined_literal)
            << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " ");
      return CurPtr;
    }

    CurPtr = ConsumeChar(CurPtr, Size, Result);
  }

  Result.setFlag(Token::HasUDSuffix);
  // Consume the remainder of the suffix: identifier characters, UCNs, and
  // UTF-8 identifier characters (the latter two advance CurPtr themselves).
  while (true) {
    C = getCharAndSize(CurPtr, Size);
    if (isIdentifierBody(C)) { CurPtr = ConsumeChar(CurPtr, Size, Result); }
    else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) {}
    else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) {}
    else break;
  }

  return CurPtr;
}

/// LexStringLiteral - Lex the remainder of a string literal, after having lexed
/// either " or L" or u8" or u" or U".
bool Lexer::LexStringLiteral(Token &Result, const char *CurPtr,
                             tok::TokenKind Kind) {
  const char *AfterQuote = CurPtr;
  // Does this string contain the \0 character?
  const char *NulCharacter = nullptr;

  if (!isLexingRawMode() &&
      (Kind == tok::utf8_string_literal ||
       Kind == tok::utf16_string_literal ||
       Kind == tok::utf32_string_literal))
    Diag(BufferPtr, getLangOpts().CPlusPlus
                        ? diag::warn_cxx98_compat_unicode_literal
                        : diag::warn_c99_compat_unicode_literal);

  char C = getAndAdvanceChar(CurPtr, Result);
  while (C != '"') {
    // Skip escaped characters. Escaped newlines will already be processed by
    // getAndAdvanceChar.
    if (C == '\\')
      C = getAndAdvanceChar(CurPtr, Result);

    if (C == '\n' || C == '\r' ||             // Newline.
        (C == 0 && CurPtr-1 == BufferEnd)) {  // End of file.
      // Unterminated string literal: diagnose (unless this is the
      // assembler-with-cpp dialect, where it is accepted) and return the
      // partial text as an unknown token.
      if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
        Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 1;
      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
      return true;
    }

    if (C == 0) {
      if (isCodeCompletionPoint(CurPtr-1)) {
        // Completion inside a string: either an #include filename or plain
        // natural-language text.
        if (ParsingFilename)
          codeCompleteIncludedFile(AfterQuote, CurPtr - 1, /*IsAngled=*/false);
        else
          PP->CodeCompleteNaturalLanguage();
        FormTokenWithChars(Result, CurPtr - 1, tok::unknown);
        cutOffLexing();
        return true;
      }

      NulCharacter = CurPtr-1;
    }
    C = getAndAdvanceChar(CurPtr, Result);
  }

  // If we are in C++11, lex the optional ud-suffix.
  if (getLangOpts().CPlusPlus)
    CurPtr = LexUDSuffix(Result, CurPtr, true);

  // If a nul character existed in the string, warn about it.
  if (NulCharacter && !isLexingRawMode())
    Diag(NulCharacter, diag::null_in_char_or_string) << 1;

  // Update the location of the token as well as the BufferPtr instance var.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, Kind);
  Result.setLiteralData(TokStart);
  return true;
}

/// LexRawStringLiteral - Lex the remainder of a raw string literal, after
/// having lexed R", LR", u8R", uR", or UR".
bool Lexer::LexRawStringLiteral(Token &Result, const char *CurPtr,
                                tok::TokenKind Kind) {
  // This function doesn't use getAndAdvanceChar because C++0x [lex.pptoken]p3:
  // Between the initial and final double quote characters of the raw string,
  // any transformations performed in phases 1 and 2 (trigraphs,
  // universal-character-names, and line splicing) are reverted.

  if (!isLexingRawMode())
    Diag(BufferPtr, diag::warn_cxx98_compat_raw_string_literal);

  unsigned PrefixLen = 0;

  // Scan the d-char-sequence; 16 is the maximum delimiter length allowed.
  while (PrefixLen != 16 && isRawStringDelimBody(CurPtr[PrefixLen]))
    ++PrefixLen;

  // If the last character was not a '(', then we didn't lex a valid delimiter.
  if (CurPtr[PrefixLen] != '(') {
    if (!isLexingRawMode()) {
      const char *PrefixEnd = &CurPtr[PrefixLen];
      if (PrefixLen == 16) {
        Diag(PrefixEnd, diag::err_raw_delim_too_long);
      } else {
        Diag(PrefixEnd, diag::err_invalid_char_raw_delim)
          << StringRef(PrefixEnd, 1);
      }
    }

    // Search for the next '"' in hopes of salvaging the lexer. Unfortunately,
    // it's possible the '"' was intended to be part of the raw string, but
    // there's not much we can do about that.
    while (true) {
      char C = *CurPtr++;

      if (C == '"')
        break;
      if (C == 0 && CurPtr-1 == BufferEnd) {
        --CurPtr;
        break;
      }
    }

    FormTokenWithChars(Result, CurPtr, tok::unknown);
    return true;
  }

  // Save prefix and move CurPtr past it
  const char *Prefix = CurPtr;
  CurPtr += PrefixLen + 1; // skip over prefix and '('

  while (true) {
    char C = *CurPtr++;

    if (C == ')') {
      // Check for prefix match and closing quote.
      if (strncmp(CurPtr, Prefix, PrefixLen) == 0 && CurPtr[PrefixLen] == '"') {
        CurPtr += PrefixLen + 1; // skip over prefix and '"'
        break;
      }
    } else if (C == 0 && CurPtr-1 == BufferEnd) { // End of file.
      if (!isLexingRawMode())
        Diag(BufferPtr, diag::err_unterminated_raw_string)
          << StringRef(Prefix, PrefixLen);
      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
      return true;
    }
  }

  // If we are in C++11, lex the optional ud-suffix.
  if (getLangOpts().CPlusPlus)
    CurPtr = LexUDSuffix(Result, CurPtr, true);

  // Update the location of token as well as BufferPtr.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, Kind);
  Result.setLiteralData(TokStart);
  return true;
}

/// LexAngledStringLiteral - Lex the remainder of an angled string literal,
/// after having lexed the '<' character. This is used for #include filenames.
bool Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) {
  // Does this string contain the \0 character?
  const char *NulCharacter = nullptr;
  const char *AfterLessPos = CurPtr;
  char C = getAndAdvanceChar(CurPtr, Result);
  while (C != '>') {
    // Skip escaped characters. Escaped newlines will already be processed by
    // getAndAdvanceChar.
    if (C == '\\')
      C = getAndAdvanceChar(CurPtr, Result);

    if (C == '\n' || C == '\r' ||                // Newline.
        (C == 0 && (CurPtr - 1 == BufferEnd))) { // End of file.
      // If the filename is unterminated, then it must just be a lone <
      // character. Return this as such.
      FormTokenWithChars(Result, AfterLessPos, tok::less);
      return true;
    }

    if (C == 0) {
      if (isCodeCompletionPoint(CurPtr - 1)) {
        codeCompleteIncludedFile(AfterLessPos, CurPtr - 1, /*IsAngled=*/true);
        cutOffLexing();
        FormTokenWithChars(Result, CurPtr - 1, tok::unknown);
        return true;
      }
      NulCharacter = CurPtr-1;
    }
    C = getAndAdvanceChar(CurPtr, Result);
  }

  // If a nul character existed in the string, warn about it.
  if (NulCharacter && !isLexingRawMode())
    Diag(NulCharacter, diag::null_in_char_or_string) << 1;

  // Update the location of token as well as BufferPtr.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, tok::header_name);
  Result.setLiteralData(TokStart);
  return true;
}

/// Code-complete an #include filename: split the partial path into directory
/// and filename, compute the source range to be replaced (up to the closing
/// quote/angle bracket, if present), and hand everything to the
/// preprocessor's completion hooks.
void Lexer::codeCompleteIncludedFile(const char *PathStart,
                                     const char *CompletionPoint,
                                     bool IsAngled) {
  // Completion only applies to the filename, after the last slash.
  StringRef PartialPath(PathStart, CompletionPoint - PathStart);
  // MSVC compatibility also treats '\' as a path separator.
  auto Slash = PartialPath.find_last_of(LangOpts.MSVCCompat ? "/\\" : "/");
  StringRef Dir =
      (Slash == StringRef::npos) ? "" : PartialPath.take_front(Slash);
  const char *StartOfFilename =
      (Slash == StringRef::npos) ? PathStart : PathStart + Slash + 1;
  // Code completion filter range is the filename only, up to completion point.
  PP->setCodeCompletionIdentifierInfo(&PP->getIdentifierTable().get(
      StringRef(StartOfFilename, CompletionPoint - StartOfFilename)));
  // We should replace the characters up to the closing quote, if any.
  while (CompletionPoint < BufferEnd) {
    char Next = *(CompletionPoint + 1);
    if (Next == 0 || Next == '\r' || Next == '\n')
      break;
    ++CompletionPoint;
    if (Next == (IsAngled ? '>' : '"'))
      break;
  }
  PP->setCodeCompletionTokenRange(
      FileLoc.getLocWithOffset(StartOfFilename - BufferStart),
      FileLoc.getLocWithOffset(CompletionPoint - BufferStart));
  PP->CodeCompleteIncludedFile(Dir, IsAngled);
}

/// LexCharConstant - Lex the remainder of a character constant, after having
/// lexed either ' or L' or u8' or u' or U'.
bool Lexer::LexCharConstant(Token &Result, const char *CurPtr,
                            tok::TokenKind Kind) {
  // Does this character contain the \0 character?
  const char *NulCharacter = nullptr;

  if (!isLexingRawMode()) {
    if (Kind == tok::utf16_char_constant || Kind == tok::utf32_char_constant)
      Diag(BufferPtr, getLangOpts().CPlusPlus
                          ? diag::warn_cxx98_compat_unicode_literal
                          : diag::warn_c99_compat_unicode_literal);
    else if (Kind == tok::utf8_char_constant)
      Diag(BufferPtr, diag::warn_cxx14_compat_u8_character_literal);
  }

  char C = getAndAdvanceChar(CurPtr, Result);
  if (C == '\'') {
    // Empty character constant '' — diagnose (except in assembler-with-cpp)
    // and return it as an unknown token.
    if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
      Diag(BufferPtr, diag::ext_empty_character);
    FormTokenWithChars(Result, CurPtr, tok::unknown);
    return true;
  }

  while (C != '\'') {
    // Skip escaped characters.
    if (C == '\\')
      C = getAndAdvanceChar(CurPtr, Result);

    if (C == '\n' || C == '\r' ||             // Newline.
        (C == 0 && CurPtr-1 == BufferEnd)) {  // End of file.
      // Unterminated character constant: diagnose (except in the
      // assembler-with-cpp dialect) and return an unknown token.
      if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
        Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 0;
      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
      return true;
    }

    if (C == 0) {
      if (isCodeCompletionPoint(CurPtr-1)) {
        PP->CodeCompleteNaturalLanguage();
        FormTokenWithChars(Result, CurPtr-1, tok::unknown);
        cutOffLexing();
        return true;
      }

      NulCharacter = CurPtr-1;
    }
    C = getAndAdvanceChar(CurPtr, Result);
  }

  // If we are in C++11, lex the optional ud-suffix.
  if (getLangOpts().CPlusPlus)
    CurPtr = LexUDSuffix(Result, CurPtr, false);

  // If a nul character existed in the character, warn about it.
  if (NulCharacter && !isLexingRawMode())
    Diag(NulCharacter, diag::null_in_char_or_string) << 0;

  // Update the location of token as well as BufferPtr.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, Kind);
  Result.setLiteralData(TokStart);
  return true;
}

/// SkipWhitespace - Efficiently skip over a series of whitespace characters.
/// Update BufferPtr to point to the next non-whitespace character and return.
///
/// This method forms a token and returns true if KeepWhitespaceMode is enabled.
bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr,
                           bool &TokAtPhysicalStartOfLine) {
  // Whitespace - Skip it, then return the token after the whitespace.
  bool SawNewline = isVerticalWhitespace(CurPtr[-1]);

  unsigned char Char = *CurPtr;

  // Skip consecutive spaces efficiently.
  while (true) {
    // Skip horizontal whitespace very aggressively.
    while (isHorizontalWhitespace(Char))
      Char = *++CurPtr;

    // Otherwise if we have something other than whitespace, we're done.
    if (!isVerticalWhitespace(Char))
      break;

    if (ParsingPreprocessorDirective) {
      // End of preprocessor directive line, let LexTokenInternal handle this.
      BufferPtr = CurPtr;
      return false;
    }

    // OK, but handle newline.
    SawNewline = true;
    Char = *++CurPtr;
  }

  // If the client wants us to return whitespace, return it now.
  if (isKeepWhitespaceMode()) {
    FormTokenWithChars(Result, CurPtr, tok::unknown);
    if (SawNewline) {
      IsAtStartOfLine = true;
      IsAtPhysicalStartOfLine = true;
    }
    // FIXME: The next token will not have LeadingSpace set.
    return true;
  }

  // If this isn't immediately after a newline, there is leading space.
  char PrevChar = CurPtr[-1];
  bool HasLeadingSpace = !isVerticalWhitespace(PrevChar);

  Result.setFlagValue(Token::LeadingSpace, HasLeadingSpace);
  if (SawNewline) {
    Result.setFlag(Token::StartOfLine);
    TokAtPhysicalStartOfLine = true;
  }

  BufferPtr = CurPtr;
  return false;
}

/// We have just read the // characters from input. Skip until we find the
/// newline character that terminates the comment. Then update BufferPtr and
/// return.
///
/// If we're in KeepCommentMode or any CommentHandler has inserted
/// some tokens, this will store the first token and return true.
bool Lexer::SkipLineComment(Token &Result, const char *CurPtr,
                            bool &TokAtPhysicalStartOfLine) {
  // If Line comments aren't explicitly enabled for this language, emit an
  // extension warning.
  if (!LangOpts.LineComment && !isLexingRawMode()) {
    Diag(BufferPtr, diag::ext_line_comment);

    // Mark them enabled so we only emit one warning for this translation
    // unit.
    LangOpts.LineComment = true;
  }

  // Scan over the body of the comment. The common case, when scanning, is that
  // the comment contains normal ascii characters with nothing interesting in
  // them. As such, optimize for this case with the inner loop.
  //
  // This loop terminates with CurPtr pointing at the newline (or end of buffer)
  // character that ends the line comment.
  char C;
  while (true) {
    C = *CurPtr;
    // Skip over characters in the fast loop.
    while (C != 0 &&                // Potentially EOF.
           C != '\n' && C != '\r')  // Newline or DOS-style newline.
      C = *++CurPtr;

    const char *NextLine = CurPtr;
    if (C != 0) {
      // We found a newline, see if it's escaped.
      const char *EscapePtr = CurPtr-1;
      bool HasSpace = false;
      while (isHorizontalWhitespace(*EscapePtr)) { // Skip whitespace.
        --EscapePtr;
        HasSpace = true;
      }

      if (*EscapePtr == '\\')
        // Escaped newline.
        CurPtr = EscapePtr;
      else if (EscapePtr[0] == '/' && EscapePtr[-1] == '?' &&
               EscapePtr[-2] == '?' && LangOpts.Trigraphs)
        // Trigraph-escaped newline.
        CurPtr = EscapePtr-2;
      else
        break; // This is a newline, we're done.

      // If there was space between the backslash and newline, warn about it.
      if (HasSpace && !isLexingRawMode())
        Diag(EscapePtr, diag::backslash_newline_space);
    }

    // Otherwise, this is a hard case. Fall back on getAndAdvanceChar to
    // properly decode the character. Read it in raw mode to avoid emitting
    // diagnostics about things like trigraphs. If we see an escaped newline,
    // we'll handle it below.
    const char *OldPtr = CurPtr;
    bool OldRawMode = isLexingRawMode();
    LexingRawMode = true;
    C = getAndAdvanceChar(CurPtr, Result);
    LexingRawMode = OldRawMode;

    // If we only read only one character, then no special handling is needed.
    // We're done and can skip forward to the newline.
    if (C != 0 && CurPtr == OldPtr+1) {
      CurPtr = NextLine;
      break;
    }

    // If we read multiple characters, and one of those characters was a \r or
    // \n, then we had an escaped newline within the comment. Emit diagnostic
    // unless the next line is also a // comment.
    if (CurPtr != OldPtr + 1 && C != '/' &&
        (CurPtr == BufferEnd + 1 || CurPtr[0] != '/')) {
      for (; OldPtr != CurPtr; ++OldPtr)
        if (OldPtr[0] == '\n' || OldPtr[0] == '\r') {
          // Okay, we found a // comment that ends in a newline, if the next
          // line is also a // comment, but has spaces, don't emit a diagnostic.
          if (isWhitespace(C)) {
            const char *ForwardPtr = CurPtr;
            while (isWhitespace(*ForwardPtr)) // Skip whitespace.
              ++ForwardPtr;
            if (ForwardPtr[0] == '/' && ForwardPtr[1] == '/')
              break;
          }

          if (!isLexingRawMode())
            Diag(OldPtr-1, diag::ext_multi_line_line_comment);
          break;
        }
    }

    if (C == '\r' || C == '\n' || CurPtr == BufferEnd + 1) {
      --CurPtr;
      break;
    }

    if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
      PP->CodeCompleteNaturalLanguage();
      cutOffLexing();
      return false;
    }
  }

  // Found but did not consume the newline. Notify comment handlers about the
  // comment unless we're in a #if 0 block.
  if (PP && !isLexingRawMode() &&
      PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr),
                                            getSourceLocation(CurPtr)))) {
    BufferPtr = CurPtr;
    return true; // A token has to be returned.
  }

  // If we are returning comments as tokens, return this comment as a token.
  if (inKeepCommentMode())
    return SaveLineComment(Result, CurPtr);

  // If we are inside a preprocessor directive and we see the end of line,
  // return immediately, so that the lexer can return this as an EOD token.
  if (ParsingPreprocessorDirective || CurPtr == BufferEnd) {
    BufferPtr = CurPtr;
    return false;
  }

  // Otherwise, eat the \n character. We don't care if this is a \n\r or
  // \r\n sequence. This is an efficiency hack (because we know the \n can't
  // contribute to another token), it isn't needed for correctness. Note that
  // this is ok even in KeepWhitespaceMode, because we would have returned the
  // comment above in that mode.
  ++CurPtr;

  // The next returned token is at the start of the line.
  Result.setFlag(Token::StartOfLine);
  TokAtPhysicalStartOfLine = true;
  // No leading whitespace seen so far.
  Result.clearFlag(Token::LeadingSpace);
  BufferPtr = CurPtr;
  return false;
}

/// If in save-comment mode, package up this Line comment in an appropriate
/// way and return it.
bool Lexer::SaveLineComment(Token &Result, const char *CurPtr) {
  // If we're not in a preprocessor directive, just return the // comment
  // directly.
  FormTokenWithChars(Result, CurPtr, tok::comment);

  if (!ParsingPreprocessorDirective || LexingRawMode)
    return true;

  // If this Line-style comment is in a macro definition, transmogrify it into
  // a C-style block comment.
  bool Invalid = false;
  std::string Spelling = PP->getSpelling(Result, &Invalid);
  if (Invalid)
    return true;

  assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not line comment?");
  Spelling[1] = '*';   // Change prefix to "/*".
  Spelling += "*/";    // add suffix.

  Result.setKind(tok::comment);
  PP->CreateString(Spelling, Result,
                   Result.getLocation(), Result.getLocation());
  return true;
}

/// isBlockCommentEndOfEscapedNewLine - Return true if the specified newline
/// character (either \\n or \\r) is part of an escaped newline sequence. Issue
/// a diagnostic if so. We know that the newline is inside of a block comment.
static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr,
                                                  Lexer *L) {
  assert(CurPtr[0] == '\n' || CurPtr[0] == '\r');

  // Back up off the newline.
  --CurPtr;

  // If this is a two-character newline sequence, skip the other character.
  if (CurPtr[0] == '\n' || CurPtr[0] == '\r') {
    // \n\n or \r\r -> not escaped newline.
    if (CurPtr[0] == CurPtr[1])
      return false;
    // \n\r or \r\n -> skip the newline.
    --CurPtr;
  }

  // If we have horizontal whitespace, skip over it. We allow whitespace
  // between the slash and newline.
  bool HasSpace = false;
  while (isHorizontalWhitespace(*CurPtr) || *CurPtr == 0) {
    --CurPtr;
    HasSpace = true;
  }

  // If we have a slash, we know this is an escaped newline.
  if (*CurPtr == '\\') {
    // Only an escaped newline after a '*' can end the comment.
    if (CurPtr[-1] != '*') return false;
  } else {
    // It isn't a slash, is it the ?? / trigraph?
    if (CurPtr[0] != '/' || CurPtr[-1] != '?' || CurPtr[-2] != '?' ||
        CurPtr[-3] != '*')
      return false;

    // This is the trigraph ending the comment. Emit a stern warning!
    CurPtr -= 2;

    // If no trigraphs are enabled, warn that we ignored this trigraph and
    // ignore this * character.
    if (!L->getLangOpts().Trigraphs) {
      if (!L->isLexingRawMode())
        L->Diag(CurPtr, diag::trigraph_ignored_block_comment);
      return false;
    }
    if (!L->isLexingRawMode())
      L->Diag(CurPtr, diag::trigraph_ends_block_comment);
  }

  // Warn about having an escaped newline between the */ characters.
  if (!L->isLexingRawMode())
    L->Diag(CurPtr, diag::escaped_newline_block_comment_end);

  // If there was space between the backslash and newline, warn about it.
  if (HasSpace && !L->isLexingRawMode())
    L->Diag(CurPtr, diag::backslash_newline_space);

  return true;
}

#ifdef __SSE2__
#include <emmintrin.h>
#elif __ALTIVEC__
#include <altivec.h>
#undef bool
#endif

/// We have just read from input the / and * characters that started a comment.
/// Read until we find the * and / characters that terminate the comment.
/// Note that we don't bother decoding trigraphs or escaped newlines in block
/// comments, because they cannot cause the comment to end. The only thing
/// that can happen is the comment could end with an escaped newline between
/// the terminating * and /.
///
/// If we're in KeepCommentMode or any CommentHandler has inserted
/// some tokens, this will store the first token and return true.
bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr,
                             bool &TokAtPhysicalStartOfLine) {
  // Scan one character past where we should, looking for a '/' character. Once
  // we find it, check to see if it was preceded by a *. This common
  // optimization helps people who like to put a lot of * characters in their
  // comments.

  // The first character we get with newlines and trigraphs skipped to handle
  // the degenerate /*/ case below correctly if the * has an escaped newline
  // after it.
  unsigned CharSize;
  unsigned char C = getCharAndSize(CurPtr, CharSize);
  CurPtr += CharSize;
  if (C == 0 && CurPtr == BufferEnd+1) {
    if (!isLexingRawMode())
      Diag(BufferPtr, diag::err_unterminated_block_comment);
    --CurPtr;

    // KeepWhitespaceMode should return this broken comment as a token. Since
    // it isn't a well formed comment, just return it as an 'unknown' token.
    if (isKeepWhitespaceMode()) {
      FormTokenWithChars(Result, CurPtr, tok::unknown);
      return true;
    }

    BufferPtr = CurPtr;
    return false;
  }

  // Check to see if the first character after the '/*' is another /. If so,
  // then this slash does not end the block comment, it is part of it.
  if (C == '/')
    C = *CurPtr++;

  while (true) {
    // Skip over all non-interesting characters until we find end of buffer or a
    // (probably ending) '/' character.
    if (CurPtr + 24 < BufferEnd &&
        // If there is a code-completion point avoid the fast scan because it
        // doesn't check for '\0'.
        !(PP && PP->getCodeCompletionFileLoc() == FileLoc)) {
      // While not aligned to a 16-byte boundary.
      while (C != '/' && ((intptr_t)CurPtr & 0x0F) != 0)
        C = *CurPtr++;

      if (C == '/') goto FoundSlash;

#ifdef __SSE2__
      // Compare 16 bytes at a time against '/'; the movemask gives one bit
      // per byte, so countTrailingZeros finds the first match.
      __m128i Slashes = _mm_set1_epi8('/');
      while (CurPtr+16 <= BufferEnd) {
        int cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(*(const __m128i*)CurPtr,
                                                   Slashes));
        if (cmp != 0) {
          // Adjust the pointer to point directly after the first slash. It's
          // not necessary to set C here, it will be overwritten at the end of
          // the outer loop.
          CurPtr += llvm::countTrailingZeros<unsigned>(cmp) + 1;
          goto FoundSlash;
        }
        CurPtr += 16;
      }
#elif __ALTIVEC__
      __vector unsigned char Slashes = {
        '/', '/', '/', '/',  '/', '/', '/', '/',
        '/', '/', '/', '/',  '/', '/', '/', '/'
      };
      while (CurPtr+16 <= BufferEnd &&
             !vec_any_eq(*(const vector unsigned char*)CurPtr, Slashes))
        CurPtr += 16;
#else
      // Scan for '/' quickly. Many block comments are very large.
      while (CurPtr[0] != '/' &&
             CurPtr[1] != '/' &&
             CurPtr[2] != '/' &&
             CurPtr[3] != '/' &&
             CurPtr+4 < BufferEnd) {
        CurPtr += 4;
      }
#endif

      // It has to be one of the bytes scanned, increment to it and read one.
      C = *CurPtr++;
    }

    // Loop to scan the remainder.
    while (C != '/' && C != '\0')
      C = *CurPtr++;

    if (C == '/') {
  FoundSlash:
      if (CurPtr[-2] == '*')  // We found the final */. We're done!
        break;

      if ((CurPtr[-2] == '\n' || CurPtr[-2] == '\r')) {
        if (isEndOfBlockCommentWithEscapedNewLine(CurPtr-2, this)) {
          // We found the final */, though it had an escaped newline between the
          // * and /. We're done!
          break;
        }
      }
      if (CurPtr[0] == '*' && CurPtr[1] != '/') {
        // If this is a /* inside of the comment, emit a warning. Don't do this
        // if this is a /*/, which will end the comment. This misses cases with
        // embedded escaped newlines, but oh well.
        if (!isLexingRawMode())
          Diag(CurPtr-1, diag::warn_nested_block_comment);
      }
    } else if (C == 0 && CurPtr == BufferEnd+1) {
      if (!isLexingRawMode())
        Diag(BufferPtr, diag::err_unterminated_block_comment);
      // Note: the user probably forgot a */. We could continue immediately
      // after the /*, but this would involve lexing a lot of what really is the
      // comment, which surely would confuse the parser.
      --CurPtr;

      // KeepWhitespaceMode should return this broken comment as a token. Since
      // it isn't a well formed comment, just return it as an 'unknown' token.
      if (isKeepWhitespaceMode()) {
        FormTokenWithChars(Result, CurPtr, tok::unknown);
        return true;
      }

      BufferPtr = CurPtr;
      return false;
    } else if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
      PP->CodeCompleteNaturalLanguage();
      cutOffLexing();
      return false;
    }

    C = *CurPtr++;
  }

  // Notify comment handlers about the comment unless we're in a #if 0 block.
  if (PP && !isLexingRawMode() &&
      PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr),
                                            getSourceLocation(CurPtr)))) {
    BufferPtr = CurPtr;
    return true; // A token has to be returned.
  }

  // If we are returning comments as tokens, return this comment as a token.
  if (inKeepCommentMode()) {
    FormTokenWithChars(Result, CurPtr, tok::comment);
    return true;
  }

  // It is common for the tokens immediately after a /**/ comment to be
  // whitespace. Instead of going through the big switch, handle it
  // efficiently now. This is safe even in KeepWhitespaceMode because we would
  // have already returned above with the comment as a token.
  if (isHorizontalWhitespace(*CurPtr)) {
    SkipWhitespace(Result, CurPtr+1, TokAtPhysicalStartOfLine);
    return false;
  }

  // Otherwise, just return so that the next character will be lexed as a token.
  BufferPtr = CurPtr;
  Result.setFlag(Token::LeadingSpace);
  return false;
}

//===----------------------------------------------------------------------===//
// Primary Lexing Entry Points
//===----------------------------------------------------------------------===//

/// ReadToEndOfLine - Read the rest of the current preprocessor line as an
/// uninterpreted string. This switches the lexer out of directive mode.
///
/// \param Result if non-null, receives the raw characters of the line
///        (excluding the terminating newline).
void Lexer::ReadToEndOfLine(SmallVectorImpl<char> *Result) {
  assert(ParsingPreprocessorDirective && ParsingFilename == false &&
         "Must be in a preprocessing directive!");
  Token Tmp;

  // CurPtr - Cache BufferPtr in an automatic variable.
  const char *CurPtr = BufferPtr;
  while (true) {
    char Char = getAndAdvanceChar(CurPtr, Tmp);
    switch (Char) {
    default:
      if (Result)
        Result->push_back(Char);
      break;
    case 0: // Null.
      // Found end of file?
      if (CurPtr-1 != BufferEnd) {
        if (isCodeCompletionPoint(CurPtr-1)) {
          PP->CodeCompleteNaturalLanguage();
          cutOffLexing();
          return;
        }

        // Nope, normal character, continue.
        if (Result)
          Result->push_back(Char);
        break;
      }
      // FALL THROUGH.
      LLVM_FALLTHROUGH;
    case '\r':
    case '\n':
      // Okay, we found the end of the line. First, back up past the \0, \r, \n.
      assert(CurPtr[-1] == Char && "Trigraphs for newline?");
      BufferPtr = CurPtr-1;

      // Next, lex the character, which should handle the EOD transition.
      Lex(Tmp);
      if (Tmp.is(tok::code_completion)) {
        if (PP)
          PP->CodeCompleteNaturalLanguage();
        Lex(Tmp);
      }
      assert(Tmp.is(tok::eod) && "Unexpected token!");

      // Finally, we're done;
      return;
    }
  }
}

/// LexEndOfFile - CurPtr points to the end of this file. Handle this
/// condition, reporting diagnostics and handling other edge cases as required.
/// This returns true if Result contains a token, false if PP.Lex should be
/// called again.
bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) {
  // If we hit the end of the file while parsing a preprocessor directive,
  // end the preprocessor directive first. The next token returned will
  // then be the end of file.
  if (ParsingPreprocessorDirective) {
    // Done parsing the "line".
    ParsingPreprocessorDirective = false;
    // Update the location of token as well as BufferPtr.
    FormTokenWithChars(Result, CurPtr, tok::eod);

    // Restore comment saving mode, in case it was disabled for directive.
    if (PP)
      resetExtendedTokenMode();
    return true; // Have a token.
  }

  // If we are in raw mode, return this event as an EOF token. Let the caller
  // that put us in raw mode handle the event.
  if (isLexingRawMode()) {
    Result.startToken();
    BufferPtr = BufferEnd;
    FormTokenWithChars(Result, BufferEnd, tok::eof);
    return true;
  }

  if (PP->isRecordingPreamble() && PP->isInPrimaryFile()) {
    PP->setRecordedPreambleConditionalStack(ConditionalStack);
    ConditionalStack.clear();
  }

  // Issue diagnostics for unterminated #if and missing newline.

  // If we are in a #if directive, emit an error.
  while (!ConditionalStack.empty()) {
    if (PP->getCodeCompletionFileLoc() != FileLoc)
      PP->Diag(ConditionalStack.back().IfLoc,
               diag::err_pp_unterminated_conditional);
    ConditionalStack.pop_back();
  }

  // C99 5.1.1.2p2: If the file is non-empty and didn't end in a newline, issue
  // a pedwarn.
  if (CurPtr != BufferStart && (CurPtr[-1] != '\n' && CurPtr[-1] != '\r')) {
    DiagnosticsEngine &Diags = PP->getDiagnostics();
    SourceLocation EndLoc = getSourceLocation(BufferEnd);
    unsigned DiagID;

    if (LangOpts.CPlusPlus11) {
      // C++11 [lex.phases] 2.2 p2
      // Prefer the C++98 pedantic compatibility warning over the generic,
      // non-extension, user-requested "missing newline at EOF" warning.
      if (!Diags.isIgnored(diag::warn_cxx98_compat_no_newline_eof, EndLoc)) {
        DiagID = diag::warn_cxx98_compat_no_newline_eof;
      } else {
        DiagID = diag::warn_no_newline_eof;
      }
    } else {
      DiagID = diag::ext_no_newline_eof;
    }

    Diag(BufferEnd, DiagID)
      << FixItHint::CreateInsertion(EndLoc, "\n");
  }

  BufferPtr = CurPtr;

  // Finally, let the preprocessor handle this.
  return PP->HandleEndOfFile(Result, isPragmaLexer());
}

/// isNextPPTokenLParen - Return 1 if the next unexpanded token lexed from
/// the specified lexer will return a tok::l_paren token, 0 if it is something
/// else and 2 if there are no more tokens in the buffer controlled by the
/// lexer.
unsigned Lexer::isNextPPTokenLParen() {
  assert(!LexingRawMode && "How can we expand a macro from a skipping buffer?");

  // Switch to 'skipping' mode. This will ensure that we can lex a token
  // without emitting diagnostics, disables macro expansion, and will cause EOF
  // to return an EOF token instead of popping the include stack.
  LexingRawMode = true;

  // Save state that can be changed while lexing so that we can restore it.
  const char *TmpBufferPtr = BufferPtr;
  bool inPPDirectiveMode = ParsingPreprocessorDirective;
  bool atStartOfLine = IsAtStartOfLine;
  bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;
  bool leadingSpace = HasLeadingSpace;

  Token Tok;
  Lex(Tok);

  // Restore state that may have changed.
  BufferPtr = TmpBufferPtr;
  ParsingPreprocessorDirective = inPPDirectiveMode;
  HasLeadingSpace = leadingSpace;
  IsAtStartOfLine = atStartOfLine;
  IsAtPhysicalStartOfLine = atPhysicalStartOfLine;

  // Restore the lexer back to non-skipping mode.
  LexingRawMode = false;

  if (Tok.is(tok::eof))
    return 2;
  return Tok.is(tok::l_paren);
}

/// Find the end of a version control conflict marker.
///
/// For Perforce markers the terminator is ">>>>" followed by a newline
/// (searched as "<<<<\n" is the *start*, so the end we look for here is
/// spelled "<<<<\n" for CMK_Perforce and ">>>>>>>" for normal markers); the
/// match must begin at the start of a line.
static const char *FindConflictEnd(const char *CurPtr, const char *BufferEnd,
                                   ConflictMarkerKind CMK) {
  const char *Terminator = CMK == CMK_Perforce ? "<<<<\n" : ">>>>>>>";
  size_t TermLen = CMK == CMK_Perforce ? 5 : 7;
  // Skip the marker we are currently sitting on before searching.
  auto RestOfBuffer = StringRef(CurPtr, BufferEnd - CurPtr).substr(TermLen);
  size_t Pos = RestOfBuffer.find(Terminator);
  while (Pos != StringRef::npos) {
    // Must occur at start of line.
    if (Pos == 0 ||
        (RestOfBuffer[Pos - 1] != '\r' && RestOfBuffer[Pos - 1] != '\n')) {
      RestOfBuffer = RestOfBuffer.substr(Pos+TermLen);
      Pos = RestOfBuffer.find(Terminator);
      continue;
    }
    return RestOfBuffer.data()+Pos;
  }
  return nullptr;
}

/// IsStartOfConflictMarker - If the specified pointer is the start of a version
/// control conflict marker like '<<<<<<<', recognize it as such, emit an error
/// and recover nicely. This returns true if it is a conflict marker and false
/// if not.
bool Lexer::IsStartOfConflictMarker(const char *CurPtr) {
  // Only a conflict marker if it starts at the beginning of a line.
  if (CurPtr != BufferStart &&
      CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
    return false;

  // Check to see if we have <<<<<<< or >>>>.
  if (!StringRef(CurPtr, BufferEnd - CurPtr).startswith("<<<<<<<") &&
      !StringRef(CurPtr, BufferEnd - CurPtr).startswith(">>>> "))
    return false;

  // If we have a situation where we don't care about conflict markers, ignore
  // it.
  if (CurrentConflictMarkerState || isLexingRawMode())
    return false;

  // '<' introduces a normal (git/svn style) marker; '>' a Perforce one.
  ConflictMarkerKind Kind = *CurPtr == '<' ? CMK_Normal : CMK_Perforce;

  // Check to see if there is an ending marker somewhere in the buffer at the
  // start of a line to terminate this conflict marker.
  if (FindConflictEnd(CurPtr, BufferEnd, Kind)) {
    // We found a match.  We are really in a conflict marker.
    // Diagnose this, and ignore to the end of line.
    Diag(CurPtr, diag::err_conflict_marker);
    CurrentConflictMarkerState = Kind;

    // Skip ahead to the end of line.  We know this exists because the
    // end-of-conflict marker starts with \r or \n.
    while (*CurPtr != '\r' && *CurPtr != '\n') {
      assert(CurPtr != BufferEnd && "Didn't find end of line");
      ++CurPtr;
    }
    BufferPtr = CurPtr;
    return true;
  }

  // No end of conflict marker found.
  return false;
}

/// HandleEndOfConflictMarker - If this is a '====' or '||||' or '>>>>', or if
/// it is '<<<<' and the conflict marker started with a '>>>>' marker, then it
/// is the end of a conflict marker.  Handle it by ignoring up until the end of
/// the line.  This returns true if it is a conflict marker and false if not.
bool Lexer::HandleEndOfConflictMarker(const char *CurPtr) {
  // Only a conflict marker if it starts at the beginning of a line.
  if (CurPtr != BufferStart &&
      CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
    return false;

  // If we have a situation where we don't care about conflict markers, ignore
  // it.
  if (!CurrentConflictMarkerState || isLexingRawMode())
    return false;

  // Check to see if we have the marker (4 characters in a row).
  for (unsigned i = 1; i != 4; ++i)
    if (CurPtr[i] != CurPtr[0])
      return false;

  // If we do have it, search for the end of the conflict marker.  This could
  // fail if it got skipped with a '#if 0' or something.  Note that CurPtr might
  // be the end of conflict marker.
  if (const char *End = FindConflictEnd(CurPtr, BufferEnd,
                                        CurrentConflictMarkerState)) {
    CurPtr = End;

    // Skip ahead to the end of line.
    while (CurPtr != BufferEnd && *CurPtr != '\r' && *CurPtr != '\n')
      ++CurPtr;

    BufferPtr = CurPtr;

    // No longer in the conflict marker.
    CurrentConflictMarkerState = CMK_None;
    return true;
  }

  return false;
}

/// Scan forward for the "#>" that closes an editor placeholder ("<#...#>").
/// Returns a pointer just past the "#>", or nullptr if not found before the
/// end of the buffer.
static const char *findPlaceholderEnd(const char *CurPtr,
                                      const char *BufferEnd) {
  if (CurPtr == BufferEnd)
    return nullptr;
  BufferEnd -= 1; // Scan until the second last character.
  for (; CurPtr != BufferEnd; ++CurPtr) {
    if (CurPtr[0] == '#' && CurPtr[1] == '>')
      return CurPtr + 2;
  }
  return nullptr;
}

/// Lex an editor placeholder token ("<#name#>") as a raw identifier, when
/// placeholder lexing is enabled.  Returns false (not handled) in raw mode,
/// without a preprocessor, or when no closing "#>" exists.
bool Lexer::lexEditorPlaceholder(Token &Result, const char *CurPtr) {
  assert(CurPtr[-1] == '<' && CurPtr[0] == '#' && "Not a placeholder!");
  if (!PP || !PP->getPreprocessorOpts().LexEditorPlaceholders || LexingRawMode)
    return false;
  const char *End = findPlaceholderEnd(CurPtr + 1, BufferEnd);
  if (!End)
    return false;
  const char *Start = CurPtr - 1;
  // Placeholders are only accepted silently when the language options allow
  // them (e.g. for IDE use); otherwise they are a hard error.
  if (!LangOpts.AllowEditorPlaceholders)
    Diag(Start, diag::err_placeholder_in_source);
  Result.startToken();
  FormTokenWithChars(Result, End, tok::raw_identifier);
  Result.setRawIdentifierData(Start);
  PP->LookUpIdentifierInfo(Result);
  Result.setFlag(Token::IsEditorPlaceholder);
  BufferPtr = End;
  return true;
}

/// Return true if the given buffer position is the configured
/// code-completion point of the preprocessor.
bool Lexer::isCodeCompletionPoint(const char *CurPtr) const {
  if (PP && PP->isCodeCompletionEnabled()) {
    SourceLocation Loc = FileLoc.getLocWithOffset(CurPtr-BufferStart);
    return Loc == PP->getCodeCompletionLoc();
  }

  return false;
}

/// Try to read a universal character name (\uXXXX or \UXXXXXXXX) starting at
/// StartPtr (which points just past the backslash).  On success returns the
/// code point and advances StartPtr past the UCN; returns 0 on failure or for
/// code points that are not valid UCNs in the current language mode.
uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
                           Token
                           *Result) {
  unsigned CharSize;
  char Kind = getCharAndSize(StartPtr, CharSize);

  // 'u' introduces a 4-hex-digit UCN, 'U' an 8-hex-digit one.
  unsigned NumHexDigits;
  if (Kind == 'u')
    NumHexDigits = 4;
  else if (Kind == 'U')
    NumHexDigits = 8;
  else
    return 0;

  // UCNs are only valid in C99 and later (and C++).
  if (!LangOpts.CPlusPlus && !LangOpts.C99) {
    if (Result && !isLexingRawMode())
      Diag(SlashLoc, diag::warn_ucn_not_valid_in_c89);
    return 0;
  }

  const char *CurPtr = StartPtr + CharSize;
  const char *KindLoc = &CurPtr[-1];

  // Accumulate the hex digits into the code point value.
  uint32_t CodePoint = 0;
  for (unsigned i = 0; i < NumHexDigits; ++i) {
    char C = getCharAndSize(CurPtr, CharSize);

    unsigned Value = llvm::hexDigitValue(C);
    if (Value == -1U) {
      if (Result && !isLexingRawMode()) {
        if (i == 0) {
          Diag(BufferPtr, diag::warn_ucn_escape_no_digits)
            << StringRef(KindLoc, 1);
        } else {
          Diag(BufferPtr, diag::warn_ucn_escape_incomplete);

          // If the user wrote \U1234, suggest a fixit to \u.
          if (i == 4 && NumHexDigits == 8) {
            CharSourceRange URange = makeCharRange(*this, KindLoc, KindLoc + 1);
            Diag(KindLoc, diag::note_ucn_four_not_eight)
              << FixItHint::CreateReplacement(URange, "u");
          }
        }
      }

      return 0;
    }

    CodePoint <<= 4;
    CodePoint += Value;

    CurPtr += CharSize;
  }

  if (Result) {
    Result->setFlag(Token::HasUCN);
    // If the UCN was written without trigraphs/escaped newlines, the simple
    // pointer bump suffices; otherwise re-walk it so escapes are consumed.
    if (CurPtr - StartPtr == (ptrdiff_t)NumHexDigits + 2)
      StartPtr = CurPtr;
    else
      while (StartPtr != CurPtr)
        (void)getAndAdvanceChar(StartPtr, *Result);
  } else {
    StartPtr = CurPtr;
  }

  // Don't apply C family restrictions to UCNs in assembly mode
  if (LangOpts.AsmPreprocessor)
    return CodePoint;

  // C99 6.4.3p2: A universal character name shall not specify a character whose
  //   short identifier is less than 00A0 other than 0024 ($), 0040 (@), or
  //   0060 (`), nor one in the range D800 through DFFF inclusive.)
  // C++11 [lex.charset]p2: If the hexadecimal value for a
  //   universal-character-name corresponds to a surrogate code point (in the
  //   range 0xD800-0xDFFF, inclusive), the program is ill-formed. Additionally,
  //   if the hexadecimal value for a universal-character-name outside the
  //   c-char-sequence, s-char-sequence, or r-char-sequence of a character or
  //   string literal corresponds to a control character (in either of the
  //   ranges 0x00-0x1F or 0x7F-0x9F, both inclusive) or to a character in the
  //   basic source character set, the program is ill-formed.
  if (CodePoint < 0xA0) {
    // $, @ and ` are the only sub-0xA0 code points a UCN may name.
    if (CodePoint == 0x24 || CodePoint == 0x40 || CodePoint == 0x60)
      return CodePoint;

    // We don't use isLexingRawMode() here because we need to warn about bad
    // UCNs even when skipping preprocessing tokens in a #if block.
    if (Result && PP) {
      if (CodePoint < 0x20 || CodePoint >= 0x7F)
        Diag(BufferPtr, diag::err_ucn_control_character);
      else {
        char C = static_cast<char>(CodePoint);
        Diag(BufferPtr, diag::err_ucn_escape_basic_scs) << StringRef(&C, 1);
      }
    }

    return 0;
  } else if (CodePoint >= 0xD800 && CodePoint <= 0xDFFF) {
    // C++03 allows UCNs representing surrogate characters. C99 and C++11 don't.
    // We don't use isLexingRawMode() here because we need to diagnose bad
    // UCNs even when skipping preprocessing tokens in a #if block.
    if (Result && PP) {
      if (LangOpts.CPlusPlus && !LangOpts.CPlusPlus11)
        Diag(BufferPtr, diag::warn_ucn_escape_surrogate);
      else
        Diag(BufferPtr, diag::err_ucn_escape_invalid);
    }
    return 0;
  }

  return CodePoint;
}

/// If C is a Unicode whitespace character (per UnicodeWhitespaceCharRanges),
/// diagnose it as an extension, mark the token as having leading space, and
/// return true.  Otherwise return false.
bool Lexer::CheckUnicodeWhitespace(Token &Result, uint32_t C,
                                   const char *CurPtr) {
  static const llvm::sys::UnicodeCharSet UnicodeWhitespaceChars(
      UnicodeWhitespaceCharRanges);
  if (!isLexingRawMode() && !PP->isPreprocessedOutput() &&
      UnicodeWhitespaceChars.contains(C)) {
    Diag(BufferPtr, diag::ext_unicode_whitespace)
      << makeCharRange(*this, BufferPtr, CurPtr);

    Result.setFlag(Token::LeadingSpace);
    return true;
  }
  return false;
}

/// Lex a token that begins with the non-ASCII code point C.  Identifier-start
/// characters continue as an identifier; disallowed raw (non-UCN) characters
/// are diagnosed and dropped; anything else becomes tok::unknown.
bool Lexer::LexUnicode(Token &Result, uint32_t C, const char *CurPtr) {
  if (isAllowedIDChar(C, LangOpts) && isAllowedInitiallyIDChar(C, LangOpts)) {
    if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
        !PP->isPreprocessedOutput()) {
      maybeDiagnoseIDCharCompat(PP->getDiagnostics(), C,
                                makeCharRange(*this, BufferPtr, CurPtr),
                                /*IsFirst=*/true);
      maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), C,
                                 makeCharRange(*this, BufferPtr, CurPtr));
    }

    MIOpt.ReadToken();
    return LexIdentifier(Result, CurPtr);
  }

  if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
      !PP->isPreprocessedOutput() &&
      !isASCII(*BufferPtr) && !isAllowedIDChar(C, LangOpts)) {
    // Non-ASCII characters tend to creep into source code unintentionally.
    // Instead of letting the parser complain about the unknown token,
    // just drop the character.
    // Note that we can /only/ do this when the non-ASCII character is actually
    // spelled as Unicode, not written as a UCN. The standard requires that
    // we not throw away any possible preprocessor tokens, but there's a
    // loophole in the mapping of Unicode characters to basic character set
    // characters that allows us to map these particular characters to, say,
    // whitespace.
    Diag(BufferPtr, diag::err_non_ascii)
      << FixItHint::CreateRemoval(makeCharRange(*this, BufferPtr, CurPtr));

    BufferPtr = CurPtr;
    return false;
  }

  // Otherwise, we have an explicit UCN or a character that's unlikely to show
  // up by accident.
  MIOpt.ReadToken();
  FormTokenWithChars(Result, CurPtr, tok::unknown);
  return true;
}

/// Copy the start-of-line / leading-space / leading-empty-macro flags from a
/// previously lexed token back into the lexer's pending-token state.
void Lexer::PropagateLineStartLeadingSpaceInfo(Token &Result) {
  IsAtStartOfLine = Result.isAtStartOfLine();
  HasLeadingSpace = Result.hasLeadingSpace();
  HasLeadingEmptyMacro = Result.hasLeadingEmptyMacro();
  // Note that this doesn't affect IsAtPhysicalStartOfLine.
}

/// Lex - Public entry point: initialize the token's whitespace flags from the
/// lexer's pending state, then delegate to LexTokenInternal.
bool Lexer::Lex(Token &Result) {
  // Start a new token.
  Result.startToken();

  // Set up misc whitespace flags for LexTokenInternal.
  if (IsAtStartOfLine) {
    Result.setFlag(Token::StartOfLine);
    IsAtStartOfLine = false;
  }

  if (HasLeadingSpace) {
    Result.setFlag(Token::LeadingSpace);
    HasLeadingSpace = false;
  }

  if (HasLeadingEmptyMacro) {
    Result.setFlag(Token::LeadingEmptyMacro);
    HasLeadingEmptyMacro = false;
  }

  bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;
  IsAtPhysicalStartOfLine = false;
  bool isRawLex = isLexingRawMode();
  (void) isRawLex;
  bool returnedToken = LexTokenInternal(Result, atPhysicalStartOfLine);
  // (After the LexTokenInternal call, the lexer might be destroyed.)
  assert((returnedToken || !isRawLex) && "Raw lex must succeed");
  return returnedToken;
}

/// LexTokenInternal - This implements a simple C family lexer.  It is an
/// extremely performance critical piece of code.  This assumes that the buffer
/// has a null character at the end of the file.  This returns a preprocessing
/// token, not a normal token, as such, it is an internal interface.  It assumes
/// that the Flags of result have been cleared before calling this.
bool Lexer::LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine) {
LexNextToken:
  // New token, can't need cleaning yet.
  Result.clearFlag(Token::NeedsCleaning);
  Result.setIdentifierInfo(nullptr);

  // CurPtr - Cache BufferPtr in an automatic variable.
  const char *CurPtr = BufferPtr;

  // Small amounts of horizontal whitespace is very common between tokens.
  if ((*CurPtr == ' ') || (*CurPtr == '\t')) {
    ++CurPtr;
    while ((*CurPtr == ' ') || (*CurPtr == '\t'))
      ++CurPtr;

    // If we are keeping whitespace and other tokens, just return what we just
    // skipped.  The next lexer invocation will return the token after the
    // whitespace.
    if (isKeepWhitespaceMode()) {
      FormTokenWithChars(Result, CurPtr, tok::unknown);
      // FIXME: The next token will not have LeadingSpace set.
      return true;
    }

    BufferPtr = CurPtr;
    Result.setFlag(Token::LeadingSpace);
  }

  unsigned SizeTmp, SizeTmp2;   // Temporaries for use in cases below.

  // Read a character, advancing over it.
  char Char = getAndAdvanceChar(CurPtr, Result);
  tok::TokenKind Kind;

  // Dispatch on the first character.  Cases either return a lexed token
  // directly, set Kind and 'break' to the common token-forming code at the
  // bottom, or 'goto LexNextToken' after consuming ignorable input.
  switch (Char) {
  case 0:  // Null.
    // Found end of file?
    if (CurPtr-1 == BufferEnd)
      return LexEndOfFile(Result, CurPtr-1);

    // Check if we are performing code completion.
    if (isCodeCompletionPoint(CurPtr-1)) {
      // Return the code-completion token.
      Result.startToken();
      FormTokenWithChars(Result, CurPtr, tok::code_completion);
      return true;
    }

    if (!isLexingRawMode())
      Diag(CurPtr-1, diag::null_in_file);
    Result.setFlag(Token::LeadingSpace);
    if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
      return true; // KeepWhitespaceMode

    // We know the lexer hasn't changed, so just try again with this lexer.
    // (We manually eliminate the tail call to avoid recursion.)
    goto LexNextToken;

  case 26:  // DOS & CP/M EOF: "^Z".
    // If we're in Microsoft extensions mode, treat this as end of file.
    if (LangOpts.MicrosoftExt) {
      if (!isLexingRawMode())
        Diag(CurPtr-1, diag::ext_ctrl_z_eof_microsoft);
      return LexEndOfFile(Result, CurPtr-1);
    }

    // If Microsoft extensions are disabled, this is just random garbage.
    Kind = tok::unknown;
    break;

  case '\r':
    if (CurPtr[0] == '\n')
      (void)getAndAdvanceChar(CurPtr, Result);
    LLVM_FALLTHROUGH;
  case '\n':
    // If we are inside a preprocessor directive and we see the end of line,
    // we know we are done with the directive, so return an EOD token.
    if (ParsingPreprocessorDirective) {
      // Done parsing the "line".
      ParsingPreprocessorDirective = false;

      // Restore comment saving mode, in case it was disabled for directive.
      if (PP)
        resetExtendedTokenMode();

      // Since we consumed a newline, we are back at the start of a line.
      IsAtStartOfLine = true;
      IsAtPhysicalStartOfLine = true;

      Kind = tok::eod;
      break;
    }

    // No leading whitespace seen so far.
    Result.clearFlag(Token::LeadingSpace);

    if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
      return true; // KeepWhitespaceMode

    // We only saw whitespace, so just try again with this lexer.
    // (We manually eliminate the tail call to avoid recursion.)
    goto LexNextToken;
  case ' ':
  case '\t':
  case '\f':
  case '\v':
  SkipHorizontalWhitespace:
    Result.setFlag(Token::LeadingSpace);
    if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
      return true; // KeepWhitespaceMode

  SkipIgnoredUnits:
    CurPtr = BufferPtr;

    // If the next token is obviously a // or /* */ comment, skip it efficiently
    // too (without going through the big switch stmt).
    if (CurPtr[0] == '/' && CurPtr[1] == '/' && !inKeepCommentMode() &&
        LangOpts.LineComment &&
        (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP)) {
      if (SkipLineComment(Result, CurPtr+2, TokAtPhysicalStartOfLine))
        return true; // There is a token to return.
      goto SkipIgnoredUnits;
    } else if (CurPtr[0] == '/' && CurPtr[1] == '*' && !inKeepCommentMode()) {
      if (SkipBlockComment(Result, CurPtr+2, TokAtPhysicalStartOfLine))
        return true; // There is a token to return.
      goto SkipIgnoredUnits;
    } else if (isHorizontalWhitespace(*CurPtr)) {
      goto SkipHorizontalWhitespace;
    }
    // We only saw whitespace, so just try again with this lexer.
    // (We manually eliminate the tail call to avoid recursion.)
    goto LexNextToken;

  // C99 6.4.4.1: Integer Constants.
  // C99 6.4.4.2: Floating Constants.
  case '0': case '1': case '2': case '3': case '4':
  case '5': case '6': case '7': case '8': case '9':
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    return LexNumericConstant(Result, CurPtr);

  case 'u':   // Identifier (uber) or C11/C++11 UTF-8 or UTF-16 string literal
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();

    if (LangOpts.CPlusPlus11 || LangOpts.C11) {
      Char = getCharAndSize(CurPtr, SizeTmp);

      // UTF-16 string literal
      if (Char == '"')
        return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                                tok::utf16_string_literal);

      // UTF-16 character constant
      if (Char == '\'')
        return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                               tok::utf16_char_constant);

      // UTF-16 raw string literal
      if (Char == 'R' && LangOpts.CPlusPlus11 &&
          getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
        return LexRawStringLiteral(Result,
                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                           SizeTmp2, Result),
                               tok::utf16_string_literal);

      if (Char == '8') {
        char Char2 = getCharAndSize(CurPtr + SizeTmp, SizeTmp2);

        // UTF-8 string literal
        if (Char2 == '"')
          return LexStringLiteral(Result,
                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                           SizeTmp2, Result),
                               tok::utf8_string_literal);
        // u8 character literals are a C++17 feature.
        if (Char2 == '\'' && LangOpts.CPlusPlus17)
          return LexCharConstant(
              Result, ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                  SizeTmp2, Result),
              tok::utf8_char_constant);

        if (Char2 == 'R' && LangOpts.CPlusPlus11) {
          unsigned SizeTmp3;
          char Char3 = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
          // UTF-8 raw string literal
          if (Char3 == '"') {
            return LexRawStringLiteral(Result,
                   ConsumeChar(ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                           SizeTmp2, Result),
                               SizeTmp3, Result),
                   tok::utf8_string_literal);
          }
        }
      }
    }

    // treat u like the start of an identifier.
    return LexIdentifier(Result, CurPtr);

  case 'U':   // Identifier (Uber) or C11/C++11 UTF-32 string literal
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();

    if (LangOpts.CPlusPlus11 || LangOpts.C11) {
      Char = getCharAndSize(CurPtr, SizeTmp);

      // UTF-32 string literal
      if (Char == '"')
        return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                                tok::utf32_string_literal);

      // UTF-32 character constant
      if (Char == '\'')
        return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                               tok::utf32_char_constant);

      // UTF-32 raw string literal
      if (Char == 'R' && LangOpts.CPlusPlus11 &&
          getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
        return LexRawStringLiteral(Result,
                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                           SizeTmp2, Result),
                               tok::utf32_string_literal);
    }

    // treat U like the start of an identifier.
    return LexIdentifier(Result, CurPtr);

  case 'R': // Identifier or C++0x raw string literal
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();

    if (LangOpts.CPlusPlus11) {
      Char = getCharAndSize(CurPtr, SizeTmp);

      if (Char == '"')
        return LexRawStringLiteral(Result,
                                   ConsumeChar(CurPtr, SizeTmp, Result),
                                   tok::string_literal);
    }

    // treat R like the start of an identifier.
    return LexIdentifier(Result, CurPtr);

  case 'L':   // Identifier (Loony) or wide literal (L'x' or L"xyz").
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    Char = getCharAndSize(CurPtr, SizeTmp);

    // Wide string literal.
    if (Char == '"')
      return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                              tok::wide_string_literal);

    // Wide raw string literal.
    if (LangOpts.CPlusPlus11 && Char == 'R' &&
        getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
      return LexRawStringLiteral(Result,
                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                           SizeTmp2, Result),
                               tok::wide_string_literal);

    // Wide character constant.
    if (Char == '\'')
      return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                             tok::wide_char_constant);
    // FALL THROUGH, treating L like the start of an identifier.
    LLVM_FALLTHROUGH;

  // C99 6.4.2: Identifiers.
  case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
  case 'H': case 'I': case 'J': case 'K':    /*'L'*/case 'M': case 'N':
  case 'O': case 'P': case 'Q':    /*'R'*/case 'S': case 'T':    /*'U'*/
  case 'V': case 'W': case 'X': case 'Y': case 'Z':
  case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
  case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
  case 'o': case 'p': case 'q': case 'r': case 's': case 't':    /*'u'*/
  case 'v': case 'w': case 'x': case 'y': case 'z':
  case '_':
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    return LexIdentifier(Result, CurPtr);

  case '$':   // $ in identifiers.
    if (LangOpts.DollarIdents) {
      if (!isLexingRawMode())
        Diag(CurPtr-1, diag::ext_dollar_in_identifier);
      // Notify MIOpt that we read a non-whitespace/non-comment token.
      MIOpt.ReadToken();
      return LexIdentifier(Result, CurPtr);
    }

    Kind = tok::unknown;
    break;

  // C99 6.4.4: Character Constants.
  case '\'':
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    return LexCharConstant(Result, CurPtr, tok::char_constant);

  // C99 6.4.5: String Literals.
  case '"':
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    return LexStringLiteral(Result, CurPtr,
                            ParsingFilename ? tok::header_name
                                            : tok::string_literal);

  // C99 6.4.6: Punctuators.
  case '?':
    Kind = tok::question;
    break;
  case '[':
    Kind = tok::l_square;
    break;
  case ']':
    Kind = tok::r_square;
    break;
  case '(':
    Kind = tok::l_paren;
    break;
  case ')':
    Kind = tok::r_paren;
    break;
  case '{':
    Kind = tok::l_brace;
    break;
  case '}':
    Kind = tok::r_brace;
    break;
  case '.':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char >= '0' && Char <= '9') {
      // Notify MIOpt that we read a non-whitespace/non-comment token.
      MIOpt.ReadToken();

      return LexNumericConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result));
    } else if (LangOpts.CPlusPlus && Char == '*') {
      Kind = tok::periodstar;
      CurPtr += SizeTmp;
    } else if (Char == '.' &&
               getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '.') {
      Kind = tok::ellipsis;
      CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                           SizeTmp2, Result);
    } else {
      Kind = tok::period;
    }
    break;
  case '&':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '&') {
      Kind = tok::ampamp;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (Char == '=') {
      Kind = tok::ampequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::amp;
    }
    break;
  case '*':
    if (getCharAndSize(CurPtr, SizeTmp) == '=') {
      Kind = tok::starequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::star;
    }
    break;
  case '+':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '+') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::plusplus;
    } else if (Char == '=') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::plusequal;
    } else {
      Kind = tok::plus;
    }
    break;
  case '-':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '-') {      // --
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::minusminus;
    } else if (Char == '>' && LangOpts.CPlusPlus &&
               getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '*') {  // C++ ->*
      CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                           SizeTmp2, Result);
      Kind = tok::arrowstar;
    } else if (Char == '>') {   // ->
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::arrow;
    } else if (Char == '=') {   // -=
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::minusequal;
    } else {
      Kind = tok::minus;
    }
    break;
  case '~':
    Kind = tok::tilde;
    break;
  case '!':
    if (getCharAndSize(CurPtr, SizeTmp) == '=') {
      Kind = tok::exclaimequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::exclaim;
    }
    break;
  case '/':
    // 6.4.9: Comments
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '/') {         // Line comment.
      // Even if Line comments are disabled (e.g. in C89 mode), we generally
      // want to lex this as a comment.  There is one problem with this though,
      // that in one particular corner case, this can change the behavior of the
      // resultant program.  For example, In  "foo //**/ bar", C89 would lex
      // this as "foo / bar" and languages with Line comments would lex it as
      // "foo".  Check to see if the character after the second slash is a '*'.
      // If so, we will lex that as a "/" instead of the start of a comment.
      // However, we never do this if we are just preprocessing.
      bool TreatAsComment = LangOpts.LineComment &&
         (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP);
      if (!TreatAsComment)
        if (!(PP && PP->isPreprocessedOutput()))
          TreatAsComment = getCharAndSize(CurPtr+SizeTmp, SizeTmp2) != '*';

      if (TreatAsComment) {
        if (SkipLineComment(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                            TokAtPhysicalStartOfLine))
          return true; // There is a token to return.

        // It is common for the tokens immediately after a // comment to be
        // whitespace (indentation for the next line).  Instead of going through
        // the big switch, handle it efficiently now.
        goto SkipIgnoredUnits;
      }
    }

    if (Char == '*') {  // /**/ comment.
      if (SkipBlockComment(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                           TokAtPhysicalStartOfLine))
        return true; // There is a token to return.

      // We only saw whitespace, so just try again with this lexer.
      // (We manually eliminate the tail call to avoid recursion.)
      goto LexNextToken;
    }

    if (Char == '=') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::slashequal;
    } else {
      Kind = tok::slash;
    }
    break;
  case '%':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      Kind = tok::percentequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (LangOpts.Digraphs && Char == '>') {
      Kind = tok::r_brace;                             // '%>' -> '}'
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (LangOpts.Digraphs && Char == ':') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Char = getCharAndSize(CurPtr, SizeTmp);
      if (Char == '%' && getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == ':') {
        Kind = tok::hashhash;                          // '%:%:' -> '##'
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                             SizeTmp2, Result);
      } else if (Char == '@' && LangOpts.MicrosoftExt) {// %:@ -> #@ -> Charize
        CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
        if (!isLexingRawMode())
          Diag(BufferPtr, diag::ext_charize_microsoft);
        Kind = tok::hashat;
      } else {                                         // '%:' -> '#'
        // We parsed a # character.  If this occurs at the start of the line,
        // it's actually the start of a preprocessing directive.  Callback to
        // the preprocessor to handle it.
        // TODO: -fpreprocessed mode??
        if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer)
          goto HandleDirective;

        Kind = tok::hash;
      }
    } else {
      Kind = tok::percent;
    }
    break;
  case '<':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (ParsingFilename) {
      return LexAngledStringLiteral(Result, CurPtr);
    } else if (Char == '<') {
      char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
      if (After == '=') {
        Kind = tok::lesslessequal;
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                             SizeTmp2, Result);
      } else if (After == '<' && IsStartOfConflictMarker(CurPtr-1)) {
        // If this is actually a '<<<<<<<' version control conflict marker,
        // recognize it as such and recover nicely.
        goto LexNextToken;
      } else if (After == '<' && HandleEndOfConflictMarker(CurPtr-1)) {
        // If this is '<<<<' and we're in a Perforce-style conflict marker,
        // ignore it.
        goto LexNextToken;
      } else if (LangOpts.CUDA && After == '<') {
        Kind = tok::lesslessless;
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                             SizeTmp2, Result);
      } else {
        CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
        Kind = tok::lessless;
      }
    } else if (Char == '=') {
      char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
      if (After == '>') {
        if (getLangOpts().CPlusPlus2a) {
          if (!isLexingRawMode())
            Diag(BufferPtr, diag::warn_cxx17_compat_spaceship);
          CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                               SizeTmp2, Result);
          Kind = tok::spaceship;
          break;
        }
        // Suggest adding a space between the '<=' and the '>' to avoid a
        // change in semantics if this turns up in C++ <=17 mode.
        if (getLangOpts().CPlusPlus && !isLexingRawMode()) {
          Diag(BufferPtr, diag::warn_cxx2a_compat_spaceship)
            << FixItHint::CreateInsertion(
                   getSourceLocation(CurPtr + SizeTmp, SizeTmp2), " ");
        }
      }
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::lessequal;
    } else if (LangOpts.Digraphs && Char == ':') {     // '<:' -> '['
      if (LangOpts.CPlusPlus11 &&
          getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == ':') {
        // C++0x [lex.pptoken]p3:
        //  Otherwise, if the next three characters are <:: and the subsequent
        //  character is neither : nor >, the < is treated as a preprocessor
        //  token by itself and not as the first character of the alternative
        //  token <:.
        unsigned SizeTmp3;
        char After = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
        if (After != ':' && After != '>') {
          Kind = tok::less;
          if (!isLexingRawMode())
            Diag(BufferPtr, diag::warn_cxx98_compat_less_colon_colon);
          break;
        }
      }

      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::l_square;
    } else if (LangOpts.Digraphs && Char == '%') {     // '<%' -> '{'
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::l_brace;
    } else if (Char == '#' && /*Not a trigraph*/ SizeTmp == 1 &&
               lexEditorPlaceholder(Result, CurPtr)) {
      return true;
    } else {
      Kind = tok::less;
    }
    break;
  case '>':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::greaterequal;
    } else if (Char == '>') {
      char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
      if (After == '=') {
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                             SizeTmp2, Result);
        Kind = tok::greatergreaterequal;
      } else if (After == '>' && IsStartOfConflictMarker(CurPtr-1)) {
        // If this is actually a '>>>>' conflict marker, recognize it as such
        // and recover nicely.
        goto LexNextToken;
      } else if (After == '>' && HandleEndOfConflictMarker(CurPtr-1)) {
        // If this is '>>>>>>>' and we're in a conflict marker, ignore it.
        goto LexNextToken;
      } else if (LangOpts.CUDA && After == '>') {
        Kind = tok::greatergreatergreater;
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                             SizeTmp2, Result);
      } else {
        CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
        Kind = tok::greatergreater;
      }
    } else {
      Kind = tok::greater;
    }
    break;
  case '^':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::caretequal;
    } else if (LangOpts.OpenCL && Char == '^') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::caretcaret;
    } else {
      Kind = tok::caret;
    }
    break;
  case '|':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      Kind = tok::pipeequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (Char == '|') {
      // If this is '|||||||' and we're in a conflict marker, ignore it.
      if (CurPtr[1] == '|' && HandleEndOfConflictMarker(CurPtr-1))
        goto LexNextToken;
      Kind = tok::pipepipe;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::pipe;
    }
    break;
  case ':':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (LangOpts.Digraphs && Char == '>') {
      Kind = tok::r_square; // ':>' -> ']'
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if ((LangOpts.CPlusPlus ||
                LangOpts.DoubleSquareBracketAttributes) &&
               Char == ':') {
      Kind = tok::coloncolon;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::colon;
    }
    break;
  case ';':
    Kind = tok::semi;
    break;
  case '=':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      // If this is '====' and we're in a conflict marker, ignore it.
      if (CurPtr[1] == '=' && HandleEndOfConflictMarker(CurPtr-1))
        goto LexNextToken;

      Kind = tok::equalequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::equal;
    }
    break;
  case ',':
    Kind = tok::comma;
    break;
  case '#':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '#') {
      Kind = tok::hashhash;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (Char == '@' && LangOpts.MicrosoftExt) {  // #@ -> Charize
      Kind = tok::hashat;
      if (!isLexingRawMode())
        Diag(BufferPtr, diag::ext_charize_microsoft);
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      // We parsed a # character.  If this occurs at the start of the line,
      // it's actually the start of a preprocessing directive.  Callback to
      // the preprocessor to handle it.
      // TODO: -fpreprocessed mode??
      if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer)
        goto HandleDirective;

      Kind = tok::hash;
    }
    break;

  case '@':
    // Objective C support.
    if (CurPtr[-1] == '@' && LangOpts.ObjC)
      Kind = tok::at;
    else
      Kind = tok::unknown;
    break;

  // UCNs (C99 6.4.3, C++11 [lex.charset]p2)
  case '\\':
    if (!LangOpts.AsmPreprocessor) {
      if (uint32_t CodePoint = tryReadUCN(CurPtr, BufferPtr, &Result)) {
        if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) {
          if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
            return true; // KeepWhitespaceMode

          // We only saw whitespace, so just try again with this lexer.
          // (We manually eliminate the tail call to avoid recursion.)
          goto LexNextToken;
        }

        return LexUnicode(Result, CodePoint, CurPtr);
      }
    }

    Kind = tok::unknown;
    break;

  default: {
    if (isASCII(Char)) {
      Kind = tok::unknown;
      break;
    }

    llvm::UTF32 CodePoint;

    // We can't just reset CurPtr to BufferPtr because BufferPtr may point to
    // an escaped newline.
    --CurPtr;
    llvm::ConversionResult Status =
        llvm::convertUTF8Sequence((const llvm::UTF8 **)&CurPtr,
                                  (const llvm::UTF8 *)BufferEnd,
                                  &CodePoint,
                                  llvm::strictConversion);
    if (Status == llvm::conversionOK) {
      if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) {
        if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
          return true; // KeepWhitespaceMode

        // We only saw whitespace, so just try again with this lexer.
        // (We manually eliminate the tail call to avoid recursion.)
        goto LexNextToken;
      }
      return LexUnicode(Result, CodePoint, CurPtr);
    }

    if (isLexingRawMode() || ParsingPreprocessorDirective ||
        PP->isPreprocessedOutput()) {
      ++CurPtr;
      Kind = tok::unknown;
      break;
    }

    // Non-ASCII characters tend to creep into source code unintentionally.
    // Instead of letting the parser complain about the unknown token,
    // just diagnose the invalid UTF-8, then drop the character.
    Diag(CurPtr, diag::err_invalid_utf8);

    BufferPtr = CurPtr+1;
    // We're pretending the character didn't exist, so just try again with
    // this lexer.
    // (We manually eliminate the tail call to avoid recursion.)
    goto LexNextToken;
  }
  }

  // Notify MIOpt that we read a non-whitespace/non-comment token.
  MIOpt.ReadToken();

  // Update the location of token as well as BufferPtr.
  FormTokenWithChars(Result, CurPtr, Kind);
  return true;

HandleDirective:
  // We parsed a # character and it's the start of a preprocessing directive.

  FormTokenWithChars(Result, CurPtr, tok::hash);
  PP->HandleDirective(Result);

  if (PP->hadModuleLoaderFatalFailure()) {
    // With a fatal failure in the module loader, we abort parsing.
    assert(Result.is(tok::eof) && "Preprocessor did not set tok:eof");
    return true;
  }

  // We parsed the directive; lex a token with the new state.
+ return false; +} diff --git a/clang/lib/Lex/LiteralSupport.cpp b/clang/lib/Lex/LiteralSupport.cpp new file mode 100644 index 000000000000..2108408377fb --- /dev/null +++ b/clang/lib/Lex/LiteralSupport.cpp @@ -0,0 +1,1896 @@ +//===--- LiteralSupport.cpp - Code to parse and process literals ----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the NumericLiteralParser, CharLiteralParser, and +// StringLiteralParser interfaces. +// +//===----------------------------------------------------------------------===// + +#include "clang/Lex/LiteralSupport.h" +#include "clang/Basic/CharInfo.h" +#include "clang/Basic/LangOptions.h" +#include "clang/Basic/SourceLocation.h" +#include "clang/Basic/TargetInfo.h" +#include "clang/Lex/LexDiagnostic.h" +#include "clang/Lex/Lexer.h" +#include "clang/Lex/Preprocessor.h" +#include "clang/Lex/Token.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/Support/ConvertUTF.h" +#include "llvm/Support/ErrorHandling.h" +#include <algorithm> +#include <cassert> +#include <cstddef> +#include <cstdint> +#include <cstring> +#include <string> + +using namespace clang; + +static unsigned getCharWidth(tok::TokenKind kind, const TargetInfo &Target) { + switch (kind) { + default: llvm_unreachable("Unknown token type!"); + case tok::char_constant: + case tok::string_literal: + case tok::utf8_char_constant: + case tok::utf8_string_literal: + return Target.getCharWidth(); + case tok::wide_char_constant: + case tok::wide_string_literal: + return Target.getWCharWidth(); + case tok::utf16_char_constant: + case tok::utf16_string_literal: + return Target.getChar16Width(); + case 
tok::utf32_char_constant: + case tok::utf32_string_literal: + return Target.getChar32Width(); + } +} + +static CharSourceRange MakeCharSourceRange(const LangOptions &Features, + FullSourceLoc TokLoc, + const char *TokBegin, + const char *TokRangeBegin, + const char *TokRangeEnd) { + SourceLocation Begin = + Lexer::AdvanceToTokenCharacter(TokLoc, TokRangeBegin - TokBegin, + TokLoc.getManager(), Features); + SourceLocation End = + Lexer::AdvanceToTokenCharacter(Begin, TokRangeEnd - TokRangeBegin, + TokLoc.getManager(), Features); + return CharSourceRange::getCharRange(Begin, End); +} + +/// Produce a diagnostic highlighting some portion of a literal. +/// +/// Emits the diagnostic \p DiagID, highlighting the range of characters from +/// \p TokRangeBegin (inclusive) to \p TokRangeEnd (exclusive), which must be +/// a substring of a spelling buffer for the token beginning at \p TokBegin. +static DiagnosticBuilder Diag(DiagnosticsEngine *Diags, + const LangOptions &Features, FullSourceLoc TokLoc, + const char *TokBegin, const char *TokRangeBegin, + const char *TokRangeEnd, unsigned DiagID) { + SourceLocation Begin = + Lexer::AdvanceToTokenCharacter(TokLoc, TokRangeBegin - TokBegin, + TokLoc.getManager(), Features); + return Diags->Report(Begin, DiagID) << + MakeCharSourceRange(Features, TokLoc, TokBegin, TokRangeBegin, TokRangeEnd); +} + +/// ProcessCharEscape - Parse a standard C escape sequence, which can occur in +/// either a character or a string literal. +static unsigned ProcessCharEscape(const char *ThisTokBegin, + const char *&ThisTokBuf, + const char *ThisTokEnd, bool &HadError, + FullSourceLoc Loc, unsigned CharWidth, + DiagnosticsEngine *Diags, + const LangOptions &Features) { + const char *EscapeBegin = ThisTokBuf; + + // Skip the '\' char. + ++ThisTokBuf; + + // We know that this character can't be off the end of the buffer, because + // that would have been \", which would not have been the end of string. 
+ unsigned ResultChar = *ThisTokBuf++; + switch (ResultChar) { + // These map to themselves. + case '\\': case '\'': case '"': case '?': break; + + // These have fixed mappings. + case 'a': + // TODO: K&R: the meaning of '\\a' is different in traditional C + ResultChar = 7; + break; + case 'b': + ResultChar = 8; + break; + case 'e': + if (Diags) + Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf, + diag::ext_nonstandard_escape) << "e"; + ResultChar = 27; + break; + case 'E': + if (Diags) + Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf, + diag::ext_nonstandard_escape) << "E"; + ResultChar = 27; + break; + case 'f': + ResultChar = 12; + break; + case 'n': + ResultChar = 10; + break; + case 'r': + ResultChar = 13; + break; + case 't': + ResultChar = 9; + break; + case 'v': + ResultChar = 11; + break; + case 'x': { // Hex escape. + ResultChar = 0; + if (ThisTokBuf == ThisTokEnd || !isHexDigit(*ThisTokBuf)) { + if (Diags) + Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf, + diag::err_hex_escape_no_digits) << "x"; + HadError = true; + break; + } + + // Hex escapes are a maximal series of hex digits. + bool Overflow = false; + for (; ThisTokBuf != ThisTokEnd; ++ThisTokBuf) { + int CharVal = llvm::hexDigitValue(ThisTokBuf[0]); + if (CharVal == -1) break; + // About to shift out a digit? + if (ResultChar & 0xF0000000) + Overflow = true; + ResultChar <<= 4; + ResultChar |= CharVal; + } + + // See if any bits will be truncated when evaluated as a character. + if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) { + Overflow = true; + ResultChar &= ~0U >> (32-CharWidth); + } + + // Check for overflow. + if (Overflow && Diags) // Too many digits to fit in + Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf, + diag::err_escape_too_large) << 0; + break; + } + case '0': case '1': case '2': case '3': + case '4': case '5': case '6': case '7': { + // Octal escapes. 
+ --ThisTokBuf; + ResultChar = 0; + + // Octal escapes are a series of octal digits with maximum length 3. + // "\0123" is a two digit sequence equal to "\012" "3". + unsigned NumDigits = 0; + do { + ResultChar <<= 3; + ResultChar |= *ThisTokBuf++ - '0'; + ++NumDigits; + } while (ThisTokBuf != ThisTokEnd && NumDigits < 3 && + ThisTokBuf[0] >= '0' && ThisTokBuf[0] <= '7'); + + // Check for overflow. Reject '\777', but not L'\777'. + if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) { + if (Diags) + Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf, + diag::err_escape_too_large) << 1; + ResultChar &= ~0U >> (32-CharWidth); + } + break; + } + + // Otherwise, these are not valid escapes. + case '(': case '{': case '[': case '%': + // GCC accepts these as extensions. We warn about them as such though. + if (Diags) + Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf, + diag::ext_nonstandard_escape) + << std::string(1, ResultChar); + break; + default: + if (!Diags) + break; + + if (isPrintable(ResultChar)) + Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf, + diag::ext_unknown_escape) + << std::string(1, ResultChar); + else + Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf, + diag::ext_unknown_escape) + << "x" + llvm::utohexstr(ResultChar); + break; + } + + return ResultChar; +} + +static void appendCodePoint(unsigned Codepoint, + llvm::SmallVectorImpl<char> &Str) { + char ResultBuf[4]; + char *ResultPtr = ResultBuf; + bool Res = llvm::ConvertCodePointToUTF8(Codepoint, ResultPtr); + (void)Res; + assert(Res && "Unexpected conversion failure"); + Str.append(ResultBuf, ResultPtr); +} + +void clang::expandUCNs(SmallVectorImpl<char> &Buf, StringRef Input) { + for (StringRef::iterator I = Input.begin(), E = Input.end(); I != E; ++I) { + if (*I != '\\') { + Buf.push_back(*I); + continue; + } + + ++I; + assert(*I == 'u' || *I == 'U'); + + unsigned NumHexDigits; + if (*I == 'u') + NumHexDigits = 4; + else + 
NumHexDigits = 8; + + assert(I + NumHexDigits <= E); + + uint32_t CodePoint = 0; + for (++I; NumHexDigits != 0; ++I, --NumHexDigits) { + unsigned Value = llvm::hexDigitValue(*I); + assert(Value != -1U); + + CodePoint <<= 4; + CodePoint += Value; + } + + appendCodePoint(CodePoint, Buf); + --I; + } +} + +/// ProcessUCNEscape - Read the Universal Character Name, check constraints and +/// return the UTF32. +static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf, + const char *ThisTokEnd, + uint32_t &UcnVal, unsigned short &UcnLen, + FullSourceLoc Loc, DiagnosticsEngine *Diags, + const LangOptions &Features, + bool in_char_string_literal = false) { + const char *UcnBegin = ThisTokBuf; + + // Skip the '\u' char's. + ThisTokBuf += 2; + + if (ThisTokBuf == ThisTokEnd || !isHexDigit(*ThisTokBuf)) { + if (Diags) + Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf, + diag::err_hex_escape_no_digits) << StringRef(&ThisTokBuf[-1], 1); + return false; + } + UcnLen = (ThisTokBuf[-1] == 'u' ? 4 : 8); + unsigned short UcnLenSave = UcnLen; + for (; ThisTokBuf != ThisTokEnd && UcnLenSave; ++ThisTokBuf, UcnLenSave--) { + int CharVal = llvm::hexDigitValue(ThisTokBuf[0]); + if (CharVal == -1) break; + UcnVal <<= 4; + UcnVal |= CharVal; + } + // If we didn't consume the proper number of digits, there is a problem. 
+ if (UcnLenSave) { + if (Diags) + Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf, + diag::err_ucn_escape_incomplete); + return false; + } + + // Check UCN constraints (C99 6.4.3p2) [C++11 lex.charset p2] + if ((0xD800 <= UcnVal && UcnVal <= 0xDFFF) || // surrogate codepoints + UcnVal > 0x10FFFF) { // maximum legal UTF32 value + if (Diags) + Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf, + diag::err_ucn_escape_invalid); + return false; + } + + // C++11 allows UCNs that refer to control characters and basic source + // characters inside character and string literals + if (UcnVal < 0xa0 && + (UcnVal != 0x24 && UcnVal != 0x40 && UcnVal != 0x60)) { // $, @, ` + bool IsError = (!Features.CPlusPlus11 || !in_char_string_literal); + if (Diags) { + char BasicSCSChar = UcnVal; + if (UcnVal >= 0x20 && UcnVal < 0x7f) + Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf, + IsError ? diag::err_ucn_escape_basic_scs : + diag::warn_cxx98_compat_literal_ucn_escape_basic_scs) + << StringRef(&BasicSCSChar, 1); + else + Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf, + IsError ? diag::err_ucn_control_character : + diag::warn_cxx98_compat_literal_ucn_control_character); + } + if (IsError) + return false; + } + + if (!Features.CPlusPlus && !Features.C99 && Diags) + Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf, + diag::warn_ucn_not_valid_in_c89_literal); + + return true; +} + +/// MeasureUCNEscape - Determine the number of bytes within the resulting string +/// which this UCN will occupy. +static int MeasureUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf, + const char *ThisTokEnd, unsigned CharByteWidth, + const LangOptions &Features, bool &HadError) { + // UTF-32: 4 bytes per escape. 
+ if (CharByteWidth == 4) + return 4; + + uint32_t UcnVal = 0; + unsigned short UcnLen = 0; + FullSourceLoc Loc; + + if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal, + UcnLen, Loc, nullptr, Features, true)) { + HadError = true; + return 0; + } + + // UTF-16: 2 bytes for BMP, 4 bytes otherwise. + if (CharByteWidth == 2) + return UcnVal <= 0xFFFF ? 2 : 4; + + // UTF-8. + if (UcnVal < 0x80) + return 1; + if (UcnVal < 0x800) + return 2; + if (UcnVal < 0x10000) + return 3; + return 4; +} + +/// EncodeUCNEscape - Read the Universal Character Name, check constraints and +/// convert the UTF32 to UTF8 or UTF16. This is a subroutine of +/// StringLiteralParser. When we decide to implement UCN's for identifiers, +/// we will likely rework our support for UCN's. +static void EncodeUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf, + const char *ThisTokEnd, + char *&ResultBuf, bool &HadError, + FullSourceLoc Loc, unsigned CharByteWidth, + DiagnosticsEngine *Diags, + const LangOptions &Features) { + typedef uint32_t UTF32; + UTF32 UcnVal = 0; + unsigned short UcnLen = 0; + if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal, UcnLen, + Loc, Diags, Features, true)) { + HadError = true; + return; + } + + assert((CharByteWidth == 1 || CharByteWidth == 2 || CharByteWidth == 4) && + "only character widths of 1, 2, or 4 bytes supported"); + + (void)UcnLen; + assert((UcnLen== 4 || UcnLen== 8) && "only ucn length of 4 or 8 supported"); + + if (CharByteWidth == 4) { + // FIXME: Make the type of the result buffer correct instead of + // using reinterpret_cast. + llvm::UTF32 *ResultPtr = reinterpret_cast<llvm::UTF32*>(ResultBuf); + *ResultPtr = UcnVal; + ResultBuf += 4; + return; + } + + if (CharByteWidth == 2) { + // FIXME: Make the type of the result buffer correct instead of + // using reinterpret_cast. 
+ llvm::UTF16 *ResultPtr = reinterpret_cast<llvm::UTF16*>(ResultBuf); + + if (UcnVal <= (UTF32)0xFFFF) { + *ResultPtr = UcnVal; + ResultBuf += 2; + return; + } + + // Convert to UTF16. + UcnVal -= 0x10000; + *ResultPtr = 0xD800 + (UcnVal >> 10); + *(ResultPtr+1) = 0xDC00 + (UcnVal & 0x3FF); + ResultBuf += 4; + return; + } + + assert(CharByteWidth == 1 && "UTF-8 encoding is only for 1 byte characters"); + + // Now that we've parsed/checked the UCN, we convert from UTF32->UTF8. + // The conversion below was inspired by: + // http://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c + // First, we determine how many bytes the result will require. + typedef uint8_t UTF8; + + unsigned short bytesToWrite = 0; + if (UcnVal < (UTF32)0x80) + bytesToWrite = 1; + else if (UcnVal < (UTF32)0x800) + bytesToWrite = 2; + else if (UcnVal < (UTF32)0x10000) + bytesToWrite = 3; + else + bytesToWrite = 4; + + const unsigned byteMask = 0xBF; + const unsigned byteMark = 0x80; + + // Once the bits are split out into bytes of UTF8, this is a mask OR-ed + // into the first byte, depending on how many bytes follow. + static const UTF8 firstByteMark[5] = { + 0x00, 0x00, 0xC0, 0xE0, 0xF0 + }; + // Finally, we write the bytes into ResultBuf. + ResultBuf += bytesToWrite; + switch (bytesToWrite) { // note: everything falls through. + case 4: + *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6; + LLVM_FALLTHROUGH; + case 3: + *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6; + LLVM_FALLTHROUGH; + case 2: + *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6; + LLVM_FALLTHROUGH; + case 1: + *--ResultBuf = (UTF8) (UcnVal | firstByteMark[bytesToWrite]); + } + // Update the buffer. 
+ ResultBuf += bytesToWrite; +} + +/// integer-constant: [C99 6.4.4.1] +/// decimal-constant integer-suffix +/// octal-constant integer-suffix +/// hexadecimal-constant integer-suffix +/// binary-literal integer-suffix [GNU, C++1y] +/// user-defined-integer-literal: [C++11 lex.ext] +/// decimal-literal ud-suffix +/// octal-literal ud-suffix +/// hexadecimal-literal ud-suffix +/// binary-literal ud-suffix [GNU, C++1y] +/// decimal-constant: +/// nonzero-digit +/// decimal-constant digit +/// octal-constant: +/// 0 +/// octal-constant octal-digit +/// hexadecimal-constant: +/// hexadecimal-prefix hexadecimal-digit +/// hexadecimal-constant hexadecimal-digit +/// hexadecimal-prefix: one of +/// 0x 0X +/// binary-literal: +/// 0b binary-digit +/// 0B binary-digit +/// binary-literal binary-digit +/// integer-suffix: +/// unsigned-suffix [long-suffix] +/// unsigned-suffix [long-long-suffix] +/// long-suffix [unsigned-suffix] +/// long-long-suffix [unsigned-sufix] +/// nonzero-digit: +/// 1 2 3 4 5 6 7 8 9 +/// octal-digit: +/// 0 1 2 3 4 5 6 7 +/// hexadecimal-digit: +/// 0 1 2 3 4 5 6 7 8 9 +/// a b c d e f +/// A B C D E F +/// binary-digit: +/// 0 +/// 1 +/// unsigned-suffix: one of +/// u U +/// long-suffix: one of +/// l L +/// long-long-suffix: one of +/// ll LL +/// +/// floating-constant: [C99 6.4.4.2] +/// TODO: add rules... +/// +NumericLiteralParser::NumericLiteralParser(StringRef TokSpelling, + SourceLocation TokLoc, + Preprocessor &PP) + : PP(PP), ThisTokBegin(TokSpelling.begin()), ThisTokEnd(TokSpelling.end()) { + + // This routine assumes that the range begin/end matches the regex for integer + // and FP constants (specifically, the 'pp-number' regex), and assumes that + // the byte at "*end" is both valid and not part of the regex. Because of + // this, it doesn't have to check for 'overscan' in various places. 
+ assert(!isPreprocessingNumberBody(*ThisTokEnd) && "didn't maximally munch?"); + + s = DigitsBegin = ThisTokBegin; + saw_exponent = false; + saw_period = false; + saw_ud_suffix = false; + saw_fixed_point_suffix = false; + isLong = false; + isUnsigned = false; + isLongLong = false; + isHalf = false; + isFloat = false; + isImaginary = false; + isFloat16 = false; + isFloat128 = false; + MicrosoftInteger = 0; + isFract = false; + isAccum = false; + hadError = false; + + if (*s == '0') { // parse radix + ParseNumberStartingWithZero(TokLoc); + if (hadError) + return; + } else { // the first digit is non-zero + radix = 10; + s = SkipDigits(s); + if (s == ThisTokEnd) { + // Done. + } else { + ParseDecimalOrOctalCommon(TokLoc); + if (hadError) + return; + } + } + + SuffixBegin = s; + checkSeparator(TokLoc, s, CSK_AfterDigits); + + // Initial scan to lookahead for fixed point suffix. + if (PP.getLangOpts().FixedPoint) { + for (const char *c = s; c != ThisTokEnd; ++c) { + if (*c == 'r' || *c == 'k' || *c == 'R' || *c == 'K') { + saw_fixed_point_suffix = true; + break; + } + } + } + + // Parse the suffix. At this point we can classify whether we have an FP or + // integer constant. + bool isFPConstant = isFloatingLiteral(); + + // Loop over all of the characters of the suffix. If we see something bad, + // we break out of the loop. + for (; s != ThisTokEnd; ++s) { + switch (*s) { + case 'R': + case 'r': + if (!PP.getLangOpts().FixedPoint) break; + if (isFract || isAccum) break; + if (!(saw_period || saw_exponent)) break; + isFract = true; + continue; + case 'K': + case 'k': + if (!PP.getLangOpts().FixedPoint) break; + if (isFract || isAccum) break; + if (!(saw_period || saw_exponent)) break; + isAccum = true; + continue; + case 'h': // FP Suffix for "half". + case 'H': + // OpenCL Extension v1.2 s9.5 - h or H suffix for half type. + if (!(PP.getLangOpts().Half || PP.getLangOpts().FixedPoint)) break; + if (isIntegerLiteral()) break; // Error for integer constant. 
+ if (isHalf || isFloat || isLong) break; // HH, FH, LH invalid. + isHalf = true; + continue; // Success. + case 'f': // FP Suffix for "float" + case 'F': + if (!isFPConstant) break; // Error for integer constant. + if (isHalf || isFloat || isLong || isFloat128) + break; // HF, FF, LF, QF invalid. + + // CUDA host and device may have different _Float16 support, therefore + // allows f16 literals to avoid false alarm. + // ToDo: more precise check for CUDA. + if ((PP.getTargetInfo().hasFloat16Type() || PP.getLangOpts().CUDA) && + s + 2 < ThisTokEnd && s[1] == '1' && s[2] == '6') { + s += 2; // success, eat up 2 characters. + isFloat16 = true; + continue; + } + + isFloat = true; + continue; // Success. + case 'q': // FP Suffix for "__float128" + case 'Q': + if (!isFPConstant) break; // Error for integer constant. + if (isHalf || isFloat || isLong || isFloat128) + break; // HQ, FQ, LQ, QQ invalid. + isFloat128 = true; + continue; // Success. + case 'u': + case 'U': + if (isFPConstant) break; // Error for floating constant. + if (isUnsigned) break; // Cannot be repeated. + isUnsigned = true; + continue; // Success. + case 'l': + case 'L': + if (isLong || isLongLong) break; // Cannot be repeated. + if (isHalf || isFloat || isFloat128) break; // LH, LF, LQ invalid. + + // Check for long long. The L's need to be adjacent and the same case. + if (s[1] == s[0]) { + assert(s + 1 < ThisTokEnd && "didn't maximally munch?"); + if (isFPConstant) break; // long long invalid for floats. + isLongLong = true; + ++s; // Eat both of them. + } else { + isLong = true; + } + continue; // Success. + case 'i': + case 'I': + if (PP.getLangOpts().MicrosoftExt) { + if (isLong || isLongLong || MicrosoftInteger) + break; + + if (!isFPConstant) { + // Allow i8, i16, i32, and i64. 
+ switch (s[1]) { + case '8': + s += 2; // i8 suffix + MicrosoftInteger = 8; + break; + case '1': + if (s[2] == '6') { + s += 3; // i16 suffix + MicrosoftInteger = 16; + } + break; + case '3': + if (s[2] == '2') { + s += 3; // i32 suffix + MicrosoftInteger = 32; + } + break; + case '6': + if (s[2] == '4') { + s += 3; // i64 suffix + MicrosoftInteger = 64; + } + break; + default: + break; + } + } + if (MicrosoftInteger) { + assert(s <= ThisTokEnd && "didn't maximally munch?"); + break; + } + } + LLVM_FALLTHROUGH; + case 'j': + case 'J': + if (isImaginary) break; // Cannot be repeated. + isImaginary = true; + continue; // Success. + } + // If we reached here, there was an error or a ud-suffix. + break; + } + + // "i", "if", and "il" are user-defined suffixes in C++1y. + if (s != ThisTokEnd || isImaginary) { + // FIXME: Don't bother expanding UCNs if !tok.hasUCN(). + expandUCNs(UDSuffixBuf, StringRef(SuffixBegin, ThisTokEnd - SuffixBegin)); + if (isValidUDSuffix(PP.getLangOpts(), UDSuffixBuf)) { + if (!isImaginary) { + // Any suffix pieces we might have parsed are actually part of the + // ud-suffix. + isLong = false; + isUnsigned = false; + isLongLong = false; + isFloat = false; + isFloat16 = false; + isHalf = false; + isImaginary = false; + MicrosoftInteger = 0; + saw_fixed_point_suffix = false; + isFract = false; + isAccum = false; + } + + saw_ud_suffix = true; + return; + } + + if (s != ThisTokEnd) { + // Report an error if there are any. + PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, SuffixBegin - ThisTokBegin), + diag::err_invalid_suffix_constant) + << StringRef(SuffixBegin, ThisTokEnd - SuffixBegin) << isFPConstant; + hadError = true; + } + } + + if (!hadError && saw_fixed_point_suffix) { + assert(isFract || isAccum); + } +} + +/// ParseDecimalOrOctalCommon - This method is called for decimal or octal +/// numbers. It issues an error for illegal digits, and handles floating point +/// parsing. If it detects a floating point number, the radix is set to 10. 
+void NumericLiteralParser::ParseDecimalOrOctalCommon(SourceLocation TokLoc){
+  assert((radix == 8 || radix == 10) && "Unexpected radix");
+
+  // If we have a hex digit other than 'e' (which denotes a FP exponent) then
+  // the code is using an incorrect base. A valid ud-suffix is exempted here
+  // so it is not misreported as a bad digit; the caller diagnoses suffixes.
+  if (isHexDigit(*s) && *s != 'e' && *s != 'E' &&
+      !isValidUDSuffix(PP.getLangOpts(), StringRef(s, ThisTokEnd - s))) {
+    PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin),
+            diag::err_invalid_digit) << StringRef(s, 1) << (radix == 8 ? 1 : 0);
+    hadError = true;
+    return;
+  }
+
+  if (*s == '.') {
+    checkSeparator(TokLoc, s, CSK_AfterDigits);
+    s++;
+    radix = 10;
+    saw_period = true;
+    checkSeparator(TokLoc, s, CSK_BeforeDigits);
+    s = SkipDigits(s); // Skip suffix.
+  }
+  if (*s == 'e' || *s == 'E') { // exponent
+    checkSeparator(TokLoc, s, CSK_AfterDigits);
+    const char *Exponent = s;
+    s++;
+    radix = 10;
+    saw_exponent = true;
+    if (s != ThisTokEnd && (*s == '+' || *s == '-')) s++; // sign
+    const char *first_non_digit = SkipDigits(s);
+    if (containsDigits(s, first_non_digit)) {
+      checkSeparator(TokLoc, s, CSK_BeforeDigits);
+      s = first_non_digit;
+    } else {
+      if (!hadError) {
+        PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent-ThisTokBegin),
+                diag::err_exponent_has_no_digits);
+        hadError = true;
+      }
+      return;
+    }
+  }
+}
+
+/// Determine whether a suffix is a valid ud-suffix. We avoid treating reserved
+/// suffixes as ud-suffixes, because the diagnostic experience is better if we
+/// treat it as an invalid suffix.
+bool NumericLiteralParser::isValidUDSuffix(const LangOptions &LangOpts,
+                                           StringRef Suffix) {
+  if (!LangOpts.CPlusPlus11 || Suffix.empty())
+    return false;
+
+  // By C++11 [lex.ext]p10, ud-suffixes starting with an '_' are always valid.
+  if (Suffix[0] == '_')
+    return true;
+
+  // In C++11, there are no library suffixes.
+  if (!LangOpts.CPlusPlus14)
+    return false;
+
+  // In C++14, "s", "h", "min", "ms", "us", and "ns" are used in the library.
+  // Per tweaked N3660, "il", "i", and "if" are also used in the library.
+  // In C++2a "d" and "y" are used in the library.
+  return llvm::StringSwitch<bool>(Suffix)
+      .Cases("h", "min", "s", true)
+      .Cases("ms", "us", "ns", true)
+      .Cases("il", "i", "if", true)
+      .Cases("d", "y", LangOpts.CPlusPlus2a)
+      .Default(false);
+}
+
+/// Diagnose a digit separator that is not between two digits: when
+/// \p IsAfterDigits is CSK_AfterDigits the character *before* \p Pos is
+/// checked, otherwise the character *at* \p Pos is checked. Sets hadError
+/// on a misplaced separator.
+void NumericLiteralParser::checkSeparator(SourceLocation TokLoc,
+                                          const char *Pos,
+                                          CheckSeparatorKind IsAfterDigits) {
+  if (IsAfterDigits == CSK_AfterDigits) {
+    if (Pos == ThisTokBegin)
+      return;
+    --Pos;
+  } else if (Pos == ThisTokEnd)
+    return;
+
+  if (isDigitSeparator(*Pos)) {
+    PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Pos - ThisTokBegin),
+            diag::err_digit_separator_not_between_digits)
+        << IsAfterDigits;
+    hadError = true;
+  }
+}
+
+/// ParseNumberStartingWithZero - This method is called when the first character
+/// of the number is found to be a zero. This means it is either an octal
+/// number (like '04'), a hex number ('0x123a'), a binary number ('0b1010'), or
+/// a floating point number (01239.123e4). Eat the prefix, determining the
+/// radix etc.
+void NumericLiteralParser::ParseNumberStartingWithZero(SourceLocation TokLoc) {
+  assert(s[0] == '0' && "Invalid method call");
+  s++;
+
+  int c1 = s[0];
+
+  // Handle a hex number like 0x1234.
+  if ((c1 == 'x' || c1 == 'X') && (isHexDigit(s[1]) || s[1] == '.')) {
+    s++;
+    assert(s < ThisTokEnd && "didn't maximally munch?");
+    radix = 16;
+    DigitsBegin = s;
+    s = SkipHexDigits(s);
+    bool HasSignificandDigits = containsDigits(DigitsBegin, s);
+    if (s == ThisTokEnd) {
+      // Done. Simple hex integer like 0x1F.
+    } else if (*s == '.') {
+      s++;
+      saw_period = true;
+      const char *floatDigitsBegin = s;
+      s = SkipHexDigits(s);
+      if (containsDigits(floatDigitsBegin, s))
+        HasSignificandDigits = true;
+      if (HasSignificandDigits)
+        checkSeparator(TokLoc, floatDigitsBegin, CSK_BeforeDigits);
+    }
+
+    if (!HasSignificandDigits) {
+      PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin),
+              diag::err_hex_constant_requires)
+          << PP.getLangOpts().CPlusPlus << 1;
+      hadError = true;
+      return;
+    }
+
+    // A binary exponent can appear with or without a '.'. If dotted, the
+    // binary exponent is required.
+    if (*s == 'p' || *s == 'P') {
+      checkSeparator(TokLoc, s, CSK_AfterDigits);
+      const char *Exponent = s;
+      s++;
+      saw_exponent = true;
+      if (s != ThisTokEnd && (*s == '+' || *s == '-')) s++; // sign
+      const char *first_non_digit = SkipDigits(s);
+      if (!containsDigits(s, first_non_digit)) {
+        if (!hadError) {
+          PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent-ThisTokBegin),
+                  diag::err_exponent_has_no_digits);
+          hadError = true;
+        }
+        return;
+      }
+      checkSeparator(TokLoc, s, CSK_BeforeDigits);
+      s = first_non_digit;
+
+      if (!PP.getLangOpts().HexFloats)
+        PP.Diag(TokLoc, PP.getLangOpts().CPlusPlus
+                            ? diag::ext_hex_literal_invalid
+                            : diag::ext_hex_constant_invalid);
+      else if (PP.getLangOpts().CPlusPlus17)
+        PP.Diag(TokLoc, diag::warn_cxx17_hex_literal);
+    } else if (saw_period) {
+      // A '.' with no binary exponent: hex floats require the exponent.
+      PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin),
+              diag::err_hex_constant_requires)
+          << PP.getLangOpts().CPlusPlus << 0;
+      hadError = true;
+    }
+    return;
+  }
+
+  // Handle simple binary numbers 0b01010
+  if ((c1 == 'b' || c1 == 'B') && (s[1] == '0' || s[1] == '1')) {
+    // 0b101010 is a C++1y / GCC extension.
+    PP.Diag(TokLoc,
+            PP.getLangOpts().CPlusPlus14
+                ? diag::warn_cxx11_compat_binary_literal
+                : PP.getLangOpts().CPlusPlus
+                      ? diag::ext_binary_literal_cxx14
+                      : diag::ext_binary_literal);
+    ++s;
+    assert(s < ThisTokEnd && "didn't maximally munch?");
+    radix = 2;
+    DigitsBegin = s;
+    s = SkipBinaryDigits(s);
+    if (s == ThisTokEnd) {
+      // Done. Simple binary integer like 0b1010.
+    } else if (isHexDigit(*s) &&
+               !isValidUDSuffix(PP.getLangOpts(),
+                                StringRef(s, ThisTokEnd - s))) {
+      PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin),
+              diag::err_invalid_digit) << StringRef(s, 1) << 2;
+      hadError = true;
+    }
+    // Other suffixes will be diagnosed by the caller.
+    return;
+  }
+
+  // For now, the radix is set to 8. If we discover that we have a
+  // floating point constant, the radix will change to 10. Octal floating
+  // point constants are not permitted (only decimal and hexadecimal).
+  radix = 8;
+  DigitsBegin = s;
+  s = SkipOctalDigits(s);
+  if (s == ThisTokEnd)
+    return; // Done, simple octal number like 01234
+
+  // If we have some other non-octal digit that *is* a decimal digit, see if
+  // this is part of a floating point number like 094.123 or 09e1.
+  if (isDigit(*s)) {
+    const char *EndDecimal = SkipDigits(s);
+    if (EndDecimal[0] == '.' || EndDecimal[0] == 'e' || EndDecimal[0] == 'E') {
+      s = EndDecimal;
+      radix = 10;
+    }
+  }
+
+  ParseDecimalOrOctalCommon(TokLoc);
+}
+
+/// Conservative check: a literal of \p NumDigits digits in base \p Radix can
+/// never exceed 64 bits, so the fast uint64_t accumulation path is safe.
+static bool alwaysFitsInto64Bits(unsigned Radix, unsigned NumDigits) {
+  switch (Radix) {
+  case 2:
+    return NumDigits <= 64;
+  case 8:
+    return NumDigits <= 64 / 3; // Digits are groups of 3 bits.
+  case 10:
+    return NumDigits <= 19; // floor(log10(2^64))
+  case 16:
+    return NumDigits <= 64 / 4; // Digits are groups of 4 bits.
+  default:
+    llvm_unreachable("impossible Radix");
+  }
+}
+
+/// GetIntegerValue - Convert this numeric literal value to an APInt that
+/// matches Val's input width. If there is an overflow, set Val to the low bits
+/// of the result and return true. Otherwise, return false.
+bool NumericLiteralParser::GetIntegerValue(llvm::APInt &Val) { + // Fast path: Compute a conservative bound on the maximum number of + // bits per digit in this radix. If we can't possibly overflow a + // uint64 based on that bound then do the simple conversion to + // integer. This avoids the expensive overflow checking below, and + // handles the common cases that matter (small decimal integers and + // hex/octal values which don't overflow). + const unsigned NumDigits = SuffixBegin - DigitsBegin; + if (alwaysFitsInto64Bits(radix, NumDigits)) { + uint64_t N = 0; + for (const char *Ptr = DigitsBegin; Ptr != SuffixBegin; ++Ptr) + if (!isDigitSeparator(*Ptr)) + N = N * radix + llvm::hexDigitValue(*Ptr); + + // This will truncate the value to Val's input width. Simply check + // for overflow by comparing. + Val = N; + return Val.getZExtValue() != N; + } + + Val = 0; + const char *Ptr = DigitsBegin; + + llvm::APInt RadixVal(Val.getBitWidth(), radix); + llvm::APInt CharVal(Val.getBitWidth(), 0); + llvm::APInt OldVal = Val; + + bool OverflowOccurred = false; + while (Ptr < SuffixBegin) { + if (isDigitSeparator(*Ptr)) { + ++Ptr; + continue; + } + + unsigned C = llvm::hexDigitValue(*Ptr++); + + // If this letter is out of bound for this radix, reject it. + assert(C < radix && "NumericLiteralParser ctor should have rejected this"); + + CharVal = C; + + // Add the digit to the value in the appropriate radix. If adding in digits + // made the value smaller, then this overflowed. + OldVal = Val; + + // Multiply by radix, did overflow occur on the multiply? + Val *= RadixVal; + OverflowOccurred |= Val.udiv(RadixVal) != OldVal; + + // Add value, did overflow occur on the value? 
+ // (a + b) ult b <=> overflow + Val += CharVal; + OverflowOccurred |= Val.ult(CharVal); + } + return OverflowOccurred; +} + +llvm::APFloat::opStatus +NumericLiteralParser::GetFloatValue(llvm::APFloat &Result) { + using llvm::APFloat; + + unsigned n = std::min(SuffixBegin - ThisTokBegin, ThisTokEnd - ThisTokBegin); + + llvm::SmallString<16> Buffer; + StringRef Str(ThisTokBegin, n); + if (Str.find('\'') != StringRef::npos) { + Buffer.reserve(n); + std::remove_copy_if(Str.begin(), Str.end(), std::back_inserter(Buffer), + &isDigitSeparator); + Str = Buffer; + } + + return Result.convertFromString(Str, APFloat::rmNearestTiesToEven); +} + +static inline bool IsExponentPart(char c) { + return c == 'p' || c == 'P' || c == 'e' || c == 'E'; +} + +bool NumericLiteralParser::GetFixedPointValue(llvm::APInt &StoreVal, unsigned Scale) { + assert(radix == 16 || radix == 10); + + // Find how many digits are needed to store the whole literal. + unsigned NumDigits = SuffixBegin - DigitsBegin; + if (saw_period) --NumDigits; + + // Initial scan of the exponent if it exists + bool ExpOverflowOccurred = false; + bool NegativeExponent = false; + const char *ExponentBegin; + uint64_t Exponent = 0; + int64_t BaseShift = 0; + if (saw_exponent) { + const char *Ptr = DigitsBegin; + + while (!IsExponentPart(*Ptr)) ++Ptr; + ExponentBegin = Ptr; + ++Ptr; + NegativeExponent = *Ptr == '-'; + if (NegativeExponent) ++Ptr; + + unsigned NumExpDigits = SuffixBegin - Ptr; + if (alwaysFitsInto64Bits(radix, NumExpDigits)) { + llvm::StringRef ExpStr(Ptr, NumExpDigits); + llvm::APInt ExpInt(/*numBits=*/64, ExpStr, /*radix=*/10); + Exponent = ExpInt.getZExtValue(); + } else { + ExpOverflowOccurred = true; + } + + if (NegativeExponent) BaseShift -= Exponent; + else BaseShift += Exponent; + } + + // Number of bits needed for decimal literal is + // ceil(NumDigits * log2(10)) Integral part + // + Scale Fractional part + // + ceil(Exponent * log2(10)) Exponent + // 
-------------------------------------------------- + // ceil((NumDigits + Exponent) * log2(10)) + Scale + // + // But for simplicity in handling integers, we can round up log2(10) to 4, + // making: + // 4 * (NumDigits + Exponent) + Scale + // + // Number of digits needed for hexadecimal literal is + // 4 * NumDigits Integral part + // + Scale Fractional part + // + Exponent Exponent + // -------------------------------------------------- + // (4 * NumDigits) + Scale + Exponent + uint64_t NumBitsNeeded; + if (radix == 10) + NumBitsNeeded = 4 * (NumDigits + Exponent) + Scale; + else + NumBitsNeeded = 4 * NumDigits + Exponent + Scale; + + if (NumBitsNeeded > std::numeric_limits<unsigned>::max()) + ExpOverflowOccurred = true; + llvm::APInt Val(static_cast<unsigned>(NumBitsNeeded), 0, /*isSigned=*/false); + + bool FoundDecimal = false; + + int64_t FractBaseShift = 0; + const char *End = saw_exponent ? ExponentBegin : SuffixBegin; + for (const char *Ptr = DigitsBegin; Ptr < End; ++Ptr) { + if (*Ptr == '.') { + FoundDecimal = true; + continue; + } + + // Normal reading of an integer + unsigned C = llvm::hexDigitValue(*Ptr); + assert(C < radix && "NumericLiteralParser ctor should have rejected this"); + + Val *= radix; + Val += C; + + if (FoundDecimal) + // Keep track of how much we will need to adjust this value by from the + // number of digits past the radix point. + --FractBaseShift; + } + + // For a radix of 16, we will be multiplying by 2 instead of 16. + if (radix == 16) FractBaseShift *= 4; + BaseShift += FractBaseShift; + + Val <<= Scale; + + uint64_t Base = (radix == 16) ? 
2 : 10; + if (BaseShift > 0) { + for (int64_t i = 0; i < BaseShift; ++i) { + Val *= Base; + } + } else if (BaseShift < 0) { + for (int64_t i = BaseShift; i < 0 && !Val.isNullValue(); ++i) + Val = Val.udiv(Base); + } + + bool IntOverflowOccurred = false; + auto MaxVal = llvm::APInt::getMaxValue(StoreVal.getBitWidth()); + if (Val.getBitWidth() > StoreVal.getBitWidth()) { + IntOverflowOccurred |= Val.ugt(MaxVal.zext(Val.getBitWidth())); + StoreVal = Val.trunc(StoreVal.getBitWidth()); + } else if (Val.getBitWidth() < StoreVal.getBitWidth()) { + IntOverflowOccurred |= Val.zext(MaxVal.getBitWidth()).ugt(MaxVal); + StoreVal = Val.zext(StoreVal.getBitWidth()); + } else { + StoreVal = Val; + } + + return IntOverflowOccurred || ExpOverflowOccurred; +} + +/// \verbatim +/// user-defined-character-literal: [C++11 lex.ext] +/// character-literal ud-suffix +/// ud-suffix: +/// identifier +/// character-literal: [C++11 lex.ccon] +/// ' c-char-sequence ' +/// u' c-char-sequence ' +/// U' c-char-sequence ' +/// L' c-char-sequence ' +/// u8' c-char-sequence ' [C++1z lex.ccon] +/// c-char-sequence: +/// c-char +/// c-char-sequence c-char +/// c-char: +/// any member of the source character set except the single-quote ', +/// backslash \, or new-line character +/// escape-sequence +/// universal-character-name +/// escape-sequence: +/// simple-escape-sequence +/// octal-escape-sequence +/// hexadecimal-escape-sequence +/// simple-escape-sequence: +/// one of \' \" \? 
///   \\ \a \b \f \n \r \t \v
/// octal-escape-sequence:
///   \ octal-digit
///   \ octal-digit octal-digit
///   \ octal-digit octal-digit octal-digit
/// hexadecimal-escape-sequence:
///   \x hexadecimal-digit
///   hexadecimal-escape-sequence hexadecimal-digit
/// universal-character-name: [C++11 lex.charset]
///   \u hex-quad
///   \U hex-quad hex-quad
/// hex-quad:
///   hex-digit hex-digit hex-digit hex-digit
/// \endverbatim
///
CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
                                     SourceLocation Loc, Preprocessor &PP,
                                     tok::TokenKind kind) {
  // At this point we know that the character matches the regex "(L|u|U)?'.*'".
  HadError = false;

  Kind = kind;

  const char *TokBegin = begin;

  // Skip over wide character determinant.
  if (Kind != tok::char_constant)
    ++begin;
  if (Kind == tok::utf8_char_constant)
    ++begin;

  // Skip over the entry quote.
  assert(begin[0] == '\'' && "Invalid token lexed");
  ++begin;

  // Remove an optional ud-suffix.
  if (end[-1] != '\'') {
    const char *UDSuffixEnd = end;
    do {
      --end;
    } while (end[-1] != '\'');
    // FIXME: Don't bother with this if !tok.hasUCN().
    expandUCNs(UDSuffixBuf, StringRef(end, UDSuffixEnd - end));
    UDSuffixOffset = end - TokBegin;
  }

  // Trim the ending quote.
  assert(end != begin && "Invalid token lexed");
  --end;

  // FIXME: The "Value" is an uint64_t so we can handle char literals of
  // up to 64-bits.
  // FIXME: This extensively assumes that 'char' is 8-bits.
  assert(PP.getTargetInfo().getCharWidth() == 8 &&
         "Assumes char is 8 bits");
  assert(PP.getTargetInfo().getIntWidth() <= 64 &&
         (PP.getTargetInfo().getIntWidth() & 7) == 0 &&
         "Assumes sizeof(int) on target is <= 64 and a multiple of char");
  assert(PP.getTargetInfo().getWCharWidth() <= 64 &&
         "Assumes sizeof(wchar) on target is <= 64");

  // Worst case: every byte of the literal becomes its own code point.
  SmallVector<uint32_t, 4> codepoint_buffer;
  codepoint_buffer.resize(end - begin);
  uint32_t *buffer_begin = &codepoint_buffer.front();
  uint32_t *buffer_end = buffer_begin + codepoint_buffer.size();

  // Unicode escapes representing characters that cannot be correctly
  // represented in a single code unit are disallowed in character literals
  // by this implementation.
  uint32_t largest_character_for_kind;
  if (tok::wide_char_constant == Kind) {
    largest_character_for_kind =
        0xFFFFFFFFu >> (32-PP.getTargetInfo().getWCharWidth());
  } else if (tok::utf8_char_constant == Kind) {
    largest_character_for_kind = 0x7F;
  } else if (tok::utf16_char_constant == Kind) {
    largest_character_for_kind = 0xFFFF;
  } else if (tok::utf32_char_constant == Kind) {
    largest_character_for_kind = 0x10FFFF;
  } else {
    // Unprefixed narrow character literal.
    largest_character_for_kind = 0x7Fu;
  }

  while (begin != end) {
    // Is this a span of non-escape characters?
    if (begin[0] != '\\') {
      char const *start = begin;
      do {
        ++begin;
      } while (begin != end && *begin != '\\');

      char const *tmp_in_start = start;
      uint32_t *tmp_out_start = buffer_begin;
      llvm::ConversionResult res =
          llvm::ConvertUTF8toUTF32(reinterpret_cast<llvm::UTF8 const **>(&start),
                                   reinterpret_cast<llvm::UTF8 const *>(begin),
                                   &buffer_begin, buffer_end, llvm::strictConversion);
      if (res != llvm::conversionOK) {
        // If we see bad encoding for unprefixed character literals, warn and
        // simply copy the byte values, for compatibility with gcc and
        // older versions of clang.
        bool NoErrorOnBadEncoding = isAscii();
        unsigned Msg = diag::err_bad_character_encoding;
        if (NoErrorOnBadEncoding)
          Msg = diag::warn_bad_character_encoding;
        PP.Diag(Loc, Msg);
        if (NoErrorOnBadEncoding) {
          // Rewind both cursors and copy raw bytes through unchanged.
          start = tmp_in_start;
          buffer_begin = tmp_out_start;
          for (; start != begin; ++start, ++buffer_begin)
            *buffer_begin = static_cast<uint8_t>(*start);
        } else {
          HadError = true;
        }
      } else {
        // Conversion succeeded: reject code points too wide for this kind.
        for (; tmp_out_start < buffer_begin; ++tmp_out_start) {
          if (*tmp_out_start > largest_character_for_kind) {
            HadError = true;
            PP.Diag(Loc, diag::err_character_too_large);
          }
        }
      }

      continue;
    }
    // Is this a Universal Character Name escape?
    if (begin[1] == 'u' || begin[1] == 'U') {
      unsigned short UcnLen = 0;
      if (!ProcessUCNEscape(TokBegin, begin, end, *buffer_begin, UcnLen,
                            FullSourceLoc(Loc, PP.getSourceManager()),
                            &PP.getDiagnostics(), PP.getLangOpts(), true)) {
        HadError = true;
      } else if (*buffer_begin > largest_character_for_kind) {
        HadError = true;
        PP.Diag(Loc, diag::err_character_too_large);
      }

      ++buffer_begin;
      continue;
    }
    // Otherwise this is a simple/octal/hex escape; process one escape.
    unsigned CharWidth = getCharWidth(Kind, PP.getTargetInfo());
    uint64_t result =
        ProcessCharEscape(TokBegin, begin, end, HadError,
                          FullSourceLoc(Loc,PP.getSourceManager()),
                          CharWidth, &PP.getDiagnostics(), PP.getLangOpts());
    *buffer_begin++ = result;
  }

  unsigned NumCharsSoFar = buffer_begin - &codepoint_buffer.front();

  if (NumCharsSoFar > 1) {
    if (isWide())
      PP.Diag(Loc, diag::warn_extraneous_char_constant);
    else if (isAscii() && NumCharsSoFar == 4)
      PP.Diag(Loc, diag::ext_four_char_character_literal);
    else if (isAscii())
      PP.Diag(Loc, diag::ext_multichar_character_literal);
    else
      PP.Diag(Loc, diag::err_multichar_utf_character_literal);
    IsMultiChar = true;
  } else {
    IsMultiChar = false;
  }

  llvm::APInt LitVal(PP.getTargetInfo().getIntWidth(), 0);

  // Narrow character literals act as though their value is concatenated
  // in this implementation, but warn on overflow.
  bool multi_char_too_long = false;
  if (isAscii() && isMultiChar()) {
    LitVal = 0;
    for (size_t i = 0; i < NumCharsSoFar; ++i) {
      // check for enough leading zeros to shift into
      multi_char_too_long |= (LitVal.countLeadingZeros() < 8);
      LitVal <<= 8;
      LitVal = LitVal + (codepoint_buffer[i] & 0xFF);
    }
  } else if (NumCharsSoFar > 0) {
    // otherwise just take the last character
    LitVal = buffer_begin[-1];
  }

  if (!HadError && multi_char_too_long) {
    PP.Diag(Loc, diag::warn_char_constant_too_large);
  }

  // Transfer the value from APInt to uint64_t
  Value = LitVal.getZExtValue();

  // If this is a single narrow character, sign extend it (e.g. '\xFF' is "-1")
  // if 'char' is signed for this target (C99 6.4.4.4p10). Note that multiple
  // character constants are not sign extended in the this implementation:
  // '\xFF\xFF' = 65536 and '\x0\xFF' = 255, which matches GCC.
  if (isAscii() && NumCharsSoFar == 1 && (Value & 128) &&
      PP.getLangOpts().CharIsSigned)
    Value = (signed char)Value;
}

/// \verbatim
/// string-literal: [C++0x lex.string]
///   encoding-prefix " [s-char-sequence] "
///   encoding-prefix R raw-string
/// encoding-prefix:
///   u8
///   u
///   U
///   L
/// s-char-sequence:
///   s-char
///   s-char-sequence s-char
/// s-char:
///   any member of the source character set except the double-quote ",
///     backslash \, or new-line character
///   escape-sequence
///   universal-character-name
/// raw-string:
///   " d-char-sequence ( r-char-sequence ) d-char-sequence "
/// r-char-sequence:
///   r-char
///   r-char-sequence r-char
/// r-char:
///   any member of the source character set, except a right parenthesis )
///     followed by the initial d-char-sequence (which may be empty)
///     followed by a double quote ".
/// d-char-sequence:
///   d-char
///   d-char-sequence d-char
/// d-char:
///   any member of the basic source character set except:
///     space, the left parenthesis (, the right parenthesis ),
///     the backslash \, and the control characters representing horizontal
///     tab, vertical tab, form feed, and newline.
/// escape-sequence: [C++0x lex.ccon]
///   simple-escape-sequence
///   octal-escape-sequence
///   hexadecimal-escape-sequence
/// simple-escape-sequence:
///   one of \' \" \? \\ \a \b \f \n \r \t \v
/// octal-escape-sequence:
///   \ octal-digit
///   \ octal-digit octal-digit
///   \ octal-digit octal-digit octal-digit
/// hexadecimal-escape-sequence:
///   \x hexadecimal-digit
///   hexadecimal-escape-sequence hexadecimal-digit
/// universal-character-name:
///   \u hex-quad
///   \U hex-quad hex-quad
/// hex-quad:
///   hex-digit hex-digit hex-digit hex-digit
/// \endverbatim
///
StringLiteralParser::
StringLiteralParser(ArrayRef<Token> StringToks,
                    Preprocessor &PP, bool Complain)
  : SM(PP.getSourceManager()), Features(PP.getLangOpts()),
    Target(PP.getTargetInfo()), Diags(Complain ? &PP.getDiagnostics() :nullptr),
    MaxTokenLength(0), SizeBound(0), CharByteWidth(0), Kind(tok::unknown),
    ResultPtr(ResultBuf.data()), hadError(false), Pascal(false) {
  init(StringToks);
}

void StringLiteralParser::init(ArrayRef<Token> StringToks){
  // The literal token may have come from an invalid source location (e.g. due
  // to a PCH error), in which case the token length will be 0.
  if (StringToks.empty() || StringToks[0].getLength() < 2)
    return DiagnoseLexingError(SourceLocation());

  // Scan all of the string portions, remember the max individual token length,
  // computing a bound on the concatenated string length, and see whether any
  // piece is a wide-string.  If any of the string portions is a wide-string
  // literal, the result is a wide-string literal [C99 6.4.5p4].
  assert(!StringToks.empty() && "expected at least one token");
  MaxTokenLength = StringToks[0].getLength();
  assert(StringToks[0].getLength() >= 2 && "literal token is invalid!");
  SizeBound = StringToks[0].getLength()-2;  // -2 for "".
  Kind = StringToks[0].getKind();

  hadError = false;

  // Implement Translation Phase #6: concatenation of string literals
  /// (C99 5.1.1.2p1).  The common case is only one string fragment.
  for (unsigned i = 1; i != StringToks.size(); ++i) {
    if (StringToks[i].getLength() < 2)
      return DiagnoseLexingError(StringToks[i].getLocation());

    // The string could be shorter than this if it needs cleaning, but this is a
    // reasonable bound, which is all we need.
    assert(StringToks[i].getLength() >= 2 && "literal token is invalid!");
    SizeBound += StringToks[i].getLength()-2;  // -2 for "".

    // Remember maximum string piece length.
    if (StringToks[i].getLength() > MaxTokenLength)
      MaxTokenLength = StringToks[i].getLength();

    // Remember if we see any wide or utf-8/16/32 strings.
    // Also check for illegal concatenations.
    if (StringToks[i].isNot(Kind) && StringToks[i].isNot(tok::string_literal)) {
      if (isAscii()) {
        // A prefixed piece promotes the whole concatenation to its kind.
        Kind = StringToks[i].getKind();
      } else {
        // Two different non-narrow prefixes cannot be concatenated.
        if (Diags)
          Diags->Report(StringToks[i].getLocation(),
                        diag::err_unsupported_string_concat);
        hadError = true;
      }
    }
  }

  // Include space for the null terminator.
  ++SizeBound;

  // TODO: K&R warning: "traditional C rejects string constant concatenation"

  // Get the width in bytes of char/wchar_t/char16_t/char32_t
  CharByteWidth = getCharWidth(Kind, Target);
  assert((CharByteWidth & 7) == 0 && "Assumes character size is byte multiple");
  CharByteWidth /= 8;

  // The output buffer size needs to be large enough to hold wide characters.
  // This is a worst-case assumption which basically corresponds to L"" "long".
  SizeBound *= CharByteWidth;

  // Size the temporary buffer to hold the result string data.
  ResultBuf.resize(SizeBound);

  // Likewise, but for each string piece.
  SmallString<512> TokenBuf;
  TokenBuf.resize(MaxTokenLength);

  // Loop over all the strings, getting their spelling, and expanding them to
  // wide strings as appropriate.
  ResultPtr = &ResultBuf[0];   // Next byte to fill in.

  Pascal = false;

  SourceLocation UDSuffixTokLoc;

  for (unsigned i = 0, e = StringToks.size(); i != e; ++i) {
    const char *ThisTokBuf = &TokenBuf[0];
    // Get the spelling of the token, which eliminates trigraphs, etc.  We know
    // that ThisTokBuf points to a buffer that is big enough for the whole token
    // and 'spelled' tokens can only shrink.
    bool StringInvalid = false;
    unsigned ThisTokLen =
      Lexer::getSpelling(StringToks[i], ThisTokBuf, SM, Features,
                         &StringInvalid);
    if (StringInvalid)
      return DiagnoseLexingError(StringToks[i].getLocation());

    const char *ThisTokBegin = ThisTokBuf;
    const char *ThisTokEnd = ThisTokBuf+ThisTokLen;

    // Remove an optional ud-suffix.
    if (ThisTokEnd[-1] != '"') {
      const char *UDSuffixEnd = ThisTokEnd;
      do {
        --ThisTokEnd;
      } while (ThisTokEnd[-1] != '"');

      StringRef UDSuffix(ThisTokEnd, UDSuffixEnd - ThisTokEnd);

      if (UDSuffixBuf.empty()) {
        // First suffix seen: record it (expanding UCNs if present).
        if (StringToks[i].hasUCN())
          expandUCNs(UDSuffixBuf, UDSuffix);
        else
          UDSuffixBuf.assign(UDSuffix);
        UDSuffixToken = i;
        UDSuffixOffset = ThisTokEnd - ThisTokBuf;
        UDSuffixTokLoc = StringToks[i].getLocation();
      } else {
        SmallString<32> ExpandedUDSuffix;
        if (StringToks[i].hasUCN()) {
          expandUCNs(ExpandedUDSuffix, UDSuffix);
          UDSuffix = ExpandedUDSuffix;
        }

        // C++11 [lex.ext]p8: At the end of phase 6, if a string literal is the
        // result of a concatenation involving at least one user-defined-string-
        // literal, all the participating user-defined-string-literals shall
        // have the same ud-suffix.
        if (UDSuffixBuf != UDSuffix) {
          if (Diags) {
            SourceLocation TokLoc = StringToks[i].getLocation();
            Diags->Report(TokLoc, diag::err_string_concat_mixed_suffix)
              << UDSuffixBuf << UDSuffix
              << SourceRange(UDSuffixTokLoc, UDSuffixTokLoc)
              << SourceRange(TokLoc, TokLoc);
          }
          hadError = true;
        }
      }
    }

    // Strip the end quote.
    --ThisTokEnd;

    // TODO: Input character set mapping support.

    // Skip marker for wide or unicode strings.
    if (ThisTokBuf[0] == 'L' || ThisTokBuf[0] == 'u' || ThisTokBuf[0] == 'U') {
      ++ThisTokBuf;
      // Skip 8 of u8 marker for utf8 strings.
      if (ThisTokBuf[0] == '8')
        ++ThisTokBuf;
    }

    // Check for raw string
    if (ThisTokBuf[0] == 'R') {
      ThisTokBuf += 2; // skip R"

      // Measure the d-char-sequence prefix so the matching suffix (plus
      // closing quote) can be trimmed from the end.
      const char *Prefix = ThisTokBuf;
      while (ThisTokBuf[0] != '(')
        ++ThisTokBuf;
      ++ThisTokBuf; // skip '('

      // Remove same number of characters from the end
      ThisTokEnd -= ThisTokBuf - Prefix;
      assert(ThisTokEnd >= ThisTokBuf && "malformed raw string literal");

      // C++14 [lex.string]p4: A source-file new-line in a raw string literal
      // results in a new-line in the resulting execution string-literal.
      StringRef RemainingTokenSpan(ThisTokBuf, ThisTokEnd - ThisTokBuf);
      while (!RemainingTokenSpan.empty()) {
        // Split the string literal on \r\n boundaries.
        size_t CRLFPos = RemainingTokenSpan.find("\r\n");
        StringRef BeforeCRLF = RemainingTokenSpan.substr(0, CRLFPos);
        StringRef AfterCRLF = RemainingTokenSpan.substr(CRLFPos);

        // Copy everything before the \r\n sequence into the string literal.
        if (CopyStringFragment(StringToks[i], ThisTokBegin, BeforeCRLF))
          hadError = true;

        // Point into the \n inside the \r\n sequence and operate on the
        // remaining portion of the literal.
        RemainingTokenSpan = AfterCRLF.substr(1);
      }
    } else {
      if (ThisTokBuf[0] != '"') {
        // The file may have come from PCH and then changed after loading the
        // PCH; Fail gracefully.
        return DiagnoseLexingError(StringToks[i].getLocation());
      }
      ++ThisTokBuf; // skip "

      // Check if this is a pascal string
      if (Features.PascalStrings && ThisTokBuf + 1 != ThisTokEnd &&
          ThisTokBuf[0] == '\\' && ThisTokBuf[1] == 'p') {

        // If the \p sequence is found in the first token, we have a pascal string
        // Otherwise, if we already have a pascal string, ignore the first \p
        if (i == 0) {
          ++ThisTokBuf;
          Pascal = true;
        } else if (Pascal)
          ThisTokBuf += 2;
      }

      while (ThisTokBuf != ThisTokEnd) {
        // Is this a span of non-escape characters?
        if (ThisTokBuf[0] != '\\') {
          const char *InStart = ThisTokBuf;
          do {
            ++ThisTokBuf;
          } while (ThisTokBuf != ThisTokEnd && ThisTokBuf[0] != '\\');

          // Copy the character span over.
          if (CopyStringFragment(StringToks[i], ThisTokBegin,
                                 StringRef(InStart, ThisTokBuf - InStart)))
            hadError = true;
          continue;
        }
        // Is this a Universal Character Name escape?
        if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U') {
          EncodeUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd,
                          ResultPtr, hadError,
                          FullSourceLoc(StringToks[i].getLocation(), SM),
                          CharByteWidth, Diags, Features);
          continue;
        }
        // Otherwise, this is a non-UCN escape character.  Process it.
        unsigned ResultChar =
          ProcessCharEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, hadError,
                            FullSourceLoc(StringToks[i].getLocation(), SM),
                            CharByteWidth*8, Diags, Features);

        // Emit the escape's value at the output's code-unit width.
        if (CharByteWidth == 4) {
          // FIXME: Make the type of the result buffer correct instead of
          // using reinterpret_cast.
          llvm::UTF32 *ResultWidePtr = reinterpret_cast<llvm::UTF32*>(ResultPtr);
          *ResultWidePtr = ResultChar;
          ResultPtr += 4;
        } else if (CharByteWidth == 2) {
          // FIXME: Make the type of the result buffer correct instead of
          // using reinterpret_cast.
          llvm::UTF16 *ResultWidePtr = reinterpret_cast<llvm::UTF16*>(ResultPtr);
          *ResultWidePtr = ResultChar & 0xFFFF;
          ResultPtr += 2;
        } else {
          assert(CharByteWidth == 1 && "Unexpected char width");
          *ResultPtr++ = ResultChar & 0xFF;
        }
      }
    }
  }

  if (Pascal) {
    // Overwrite the first code unit with the character count (Pascal layout).
    if (CharByteWidth == 4) {
      // FIXME: Make the type of the result buffer correct instead of
      // using reinterpret_cast.
      llvm::UTF32 *ResultWidePtr = reinterpret_cast<llvm::UTF32*>(ResultBuf.data());
      ResultWidePtr[0] = GetNumStringChars() - 1;
    } else if (CharByteWidth == 2) {
      // FIXME: Make the type of the result buffer correct instead of
      // using reinterpret_cast.
      llvm::UTF16 *ResultWidePtr = reinterpret_cast<llvm::UTF16*>(ResultBuf.data());
      ResultWidePtr[0] = GetNumStringChars() - 1;
    } else {
      assert(CharByteWidth == 1 && "Unexpected char width");
      ResultBuf[0] = GetNumStringChars() - 1;
    }

    // Verify that pascal strings aren't too large.
    if (GetStringLength() > 256) {
      if (Diags)
        Diags->Report(StringToks.front().getLocation(),
                      diag::err_pascal_string_too_long)
          << SourceRange(StringToks.front().getLocation(),
                         StringToks.back().getLocation());
      hadError = true;
      return;
    }
  } else if (Diags) {
    // Complain if this string literal has too many characters.
    unsigned MaxChars = Features.CPlusPlus? 65536 : Features.C99 ? 4095 : 509;

    if (GetNumStringChars() > MaxChars)
      Diags->Report(StringToks.front().getLocation(),
                    diag::ext_string_too_long)
        << GetNumStringChars() << MaxChars
        << (Features.CPlusPlus ? 2 : Features.C99 ?
            1 : 0)
        << SourceRange(StringToks.front().getLocation(),
                       StringToks.back().getLocation());
  }
}

/// Advance past one (possibly truncated) ill-formed UTF-8 code unit sequence
/// starting at \p Err, so diagnosis can resume at the next sequence boundary.
static const char *resyncUTF8(const char *Err, const char *End) {
  if (Err == End)
    return End;
  End = Err + std::min<unsigned>(llvm::getNumBytesForUTF8(*Err), End-Err);
  while (++Err != End && (*Err & 0xC0) == 0x80)
    ;
  return Err;
}

/// This function copies from Fragment, which is a sequence of bytes
/// within Tok's contents (which begin at TokBegin) into ResultPtr.
/// Performs widening for multi-byte characters.
bool StringLiteralParser::CopyStringFragment(const Token &Tok,
                                             const char *TokBegin,
                                             StringRef Fragment) {
  const llvm::UTF8 *ErrorPtrTmp;
  if (ConvertUTF8toWide(CharByteWidth, Fragment, ResultPtr, ErrorPtrTmp))
    return false;

  // If we see bad encoding for unprefixed string literals, warn and
  // simply copy the byte values, for compatibility with gcc and older
  // versions of clang.
  bool NoErrorOnBadEncoding = isAscii();
  if (NoErrorOnBadEncoding) {
    memcpy(ResultPtr, Fragment.data(), Fragment.size());
    ResultPtr += Fragment.size();
  }

  if (Diags) {
    const char *ErrorPtr = reinterpret_cast<const char *>(ErrorPtrTmp);

    FullSourceLoc SourceLoc(Tok.getLocation(), SM);
    const DiagnosticBuilder &Builder =
      Diag(Diags, Features, SourceLoc, TokBegin,
           ErrorPtr, resyncUTF8(ErrorPtr, Fragment.end()),
           NoErrorOnBadEncoding ? diag::warn_bad_string_encoding
                                : diag::err_bad_string_encoding);

    const char *NextStart = resyncUTF8(ErrorPtr, Fragment.end());
    StringRef NextFragment(NextStart, Fragment.end()-NextStart);

    // Decode into a dummy buffer, attaching a source range to the diagnostic
    // for each additional ill-formed sequence found after the first.
    SmallString<512> Dummy;
    Dummy.reserve(Fragment.size() * CharByteWidth);
    char *Ptr = Dummy.data();

    while (!ConvertUTF8toWide(CharByteWidth, NextFragment, Ptr, ErrorPtrTmp)) {
      const char *ErrorPtr = reinterpret_cast<const char *>(ErrorPtrTmp);
      NextStart = resyncUTF8(ErrorPtr, Fragment.end());
      Builder << MakeCharSourceRange(Features, SourceLoc, TokBegin,
                                     ErrorPtr, NextStart);
      NextFragment = StringRef(NextStart, Fragment.end()-NextStart);
    }
  }
  return !NoErrorOnBadEncoding;
}

void StringLiteralParser::DiagnoseLexingError(SourceLocation Loc) {
  hadError = true;
  if (Diags)
    Diags->Report(Loc, diag::err_lexing_string);
}

/// getOffsetOfStringByte - This function returns the offset of the
/// specified byte of the string data represented by Token.  This handles
/// advancing over escape sequences in the string.
unsigned StringLiteralParser::getOffsetOfStringByte(const Token &Tok,
                                                    unsigned ByteNo) const {
  // Get the spelling of the token.
  SmallString<32> SpellingBuffer;
  SpellingBuffer.resize(Tok.getLength());

  bool StringInvalid = false;
  const char *SpellingPtr = &SpellingBuffer[0];
  unsigned TokLen = Lexer::getSpelling(Tok, SpellingPtr, SM, Features,
                                       &StringInvalid);
  if (StringInvalid)
    return 0;

  const char *SpellingStart = SpellingPtr;
  const char *SpellingEnd = SpellingPtr+TokLen;

  // Handle UTF-8 strings just like narrow strings.
  if (SpellingPtr[0] == 'u' && SpellingPtr[1] == '8')
    SpellingPtr += 2;

  assert(SpellingPtr[0] != 'L' && SpellingPtr[0] != 'u' &&
         SpellingPtr[0] != 'U' && "Doesn't handle wide or utf strings yet");

  // For raw string literals, this is easy.
  if (SpellingPtr[0] == 'R') {
    assert(SpellingPtr[1] == '"' && "Should be a raw string literal!");
    // Skip 'R"'.
    SpellingPtr += 2;
    while (*SpellingPtr != '(') {
      ++SpellingPtr;
      assert(SpellingPtr < SpellingEnd && "Missing ( for raw string literal");
    }
    // Skip '('.
    ++SpellingPtr;
    // Raw strings contain no escapes, so the offset is a direct add.
    return SpellingPtr - SpellingStart + ByteNo;
  }

  // Skip over the leading quote
  assert(SpellingPtr[0] == '"' && "Should be a string literal!");
  ++SpellingPtr;

  // Skip over bytes until we find the offset we're looking for.
  while (ByteNo) {
    assert(SpellingPtr < SpellingEnd && "Didn't find byte offset!");

    // Step over non-escapes simply.
    if (*SpellingPtr != '\\') {
      ++SpellingPtr;
      --ByteNo;
      continue;
    }

    // Otherwise, this is an escape character.  Advance over it.
    bool HadError = false;
    if (SpellingPtr[1] == 'u' || SpellingPtr[1] == 'U') {
      const char *EscapePtr = SpellingPtr;
      unsigned Len = MeasureUCNEscape(SpellingStart, SpellingPtr, SpellingEnd,
                                      1, Features, HadError);
      if (Len > ByteNo) {
        // ByteNo is somewhere within the escape sequence.
        SpellingPtr = EscapePtr;
        break;
      }
      ByteNo -= Len;
    } else {
      ProcessCharEscape(SpellingStart, SpellingPtr, SpellingEnd, HadError,
                        FullSourceLoc(Tok.getLocation(), SM),
                        CharByteWidth*8, Diags, Features);
      --ByteNo;
    }
    assert(!HadError && "This method isn't valid on erroneous strings");
  }

  return SpellingPtr-SpellingStart;
}

/// Determine whether a suffix is a valid ud-suffix. We avoid treating reserved
/// suffixes as ud-suffixes, because the diagnostic experience is better if we
/// treat it as an invalid suffix.
bool StringLiteralParser::isValidUDSuffix(const LangOptions &LangOpts,
                                          StringRef Suffix) {
  return NumericLiteralParser::isValidUDSuffix(LangOpts, Suffix) ||
         Suffix == "sv";
}
diff --git a/clang/lib/Lex/MacroArgs.cpp b/clang/lib/Lex/MacroArgs.cpp
new file mode 100644
index 000000000000..7ede00b4aa64
--- /dev/null
+++ b/clang/lib/Lex/MacroArgs.cpp
@@ -0,0 +1,307 @@
//===--- MacroArgs.cpp - Formal argument info for Macros ------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the MacroArgs interface.
//
//===----------------------------------------------------------------------===//

#include "clang/Lex/MacroArgs.h"
#include "clang/Lex/LexDiagnostic.h"
#include "clang/Lex/MacroInfo.h"
#include "clang/Lex/Preprocessor.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/Support/SaveAndRestore.h"
#include <algorithm>

using namespace clang;

/// MacroArgs ctor function - This destroys the vector passed in.
MacroArgs *MacroArgs::create(const MacroInfo *MI,
                             ArrayRef<Token> UnexpArgTokens,
                             bool VarargsElided, Preprocessor &PP) {
  assert(MI->isFunctionLike() &&
         "Can't have args for an object-like macro!");
  MacroArgs **ResultEnt = nullptr;
  unsigned ClosestMatch = ~0U;

  // See if we have an entry with a big enough argument list to reuse on the
  // free list.  If so, reuse it.
  for (MacroArgs **Entry = &PP.MacroArgCache; *Entry;
       Entry = &(*Entry)->ArgCache) {
    if ((*Entry)->NumUnexpArgTokens >= UnexpArgTokens.size() &&
        (*Entry)->NumUnexpArgTokens < ClosestMatch) {
      ResultEnt = Entry;

      // If we have an exact match, use it.
      if ((*Entry)->NumUnexpArgTokens == UnexpArgTokens.size())
        break;
      // Otherwise, use the best fit.
      ClosestMatch = (*Entry)->NumUnexpArgTokens;
    }
  }
  MacroArgs *Result;
  if (!ResultEnt) {
    // Allocate memory for a MacroArgs object with the lexer tokens at the end,
    // and construct the MacroArgs object.
    Result = new (
        llvm::safe_malloc(totalSizeToAlloc<Token>(UnexpArgTokens.size())))
        MacroArgs(UnexpArgTokens.size(), VarargsElided, MI->getNumParams());
  } else {
    Result = *ResultEnt;
    // Unlink this node from the preprocessors singly linked list.
    *ResultEnt = Result->ArgCache;
    Result->NumUnexpArgTokens = UnexpArgTokens.size();
    Result->VarargsElided = VarargsElided;
    Result->NumMacroArgs = MI->getNumParams();
  }

  // Copy the actual unexpanded tokens to immediately after the result ptr.
  if (!UnexpArgTokens.empty()) {
    static_assert(std::is_trivial<Token>::value,
                  "assume trivial copyability if copying into the "
                  "uninitialized array (as opposed to reusing a cached "
                  "MacroArgs)");
    std::copy(UnexpArgTokens.begin(), UnexpArgTokens.end(),
              Result->getTrailingObjects<Token>());
  }

  return Result;
}

/// destroy - Destroy and deallocate the memory for this object.
///
void MacroArgs::destroy(Preprocessor &PP) {
  // Don't clear PreExpArgTokens, just clear the entries.  Clearing the entries
  // would deallocate the element vectors.
  for (unsigned i = 0, e = PreExpArgTokens.size(); i != e; ++i)
    PreExpArgTokens[i].clear();

  // Add this to the preprocessor's free list.
  ArgCache = PP.MacroArgCache;
  PP.MacroArgCache = this;
}

/// deallocate - This should only be called by the Preprocessor when managing
/// its freelist.
MacroArgs *MacroArgs::deallocate() {
  MacroArgs *Next = ArgCache;

  // Run the dtor to deallocate the vectors.
  this->~MacroArgs();
  // Release the memory for the object.
  static_assert(std::is_trivially_destructible<Token>::value,
                "assume trivially destructible and forego destructors");
  free(this);

  return Next;
}


/// getArgLength - Given a pointer to an expanded or unexpanded argument,
/// return the number of tokens, not counting the EOF, that make up the
/// argument.
unsigned MacroArgs::getArgLength(const Token *ArgPtr) {
  unsigned NumArgTokens = 0;
  for (; ArgPtr->isNot(tok::eof); ++ArgPtr)
    ++NumArgTokens;
  return NumArgTokens;
}


/// getUnexpArgument - Return the unexpanded tokens for the specified formal.
+///
+const Token *MacroArgs::getUnexpArgument(unsigned Arg) const {
+
+  assert(Arg < getNumMacroArguments() && "Invalid arg #");
+  // The unexpanded argument tokens start immediately after the MacroArgs object
+  // in memory.  Each argument is a run of tokens terminated by an eof token,
+  // so argument #N is found by skipping past N eof tokens.
+  const Token *Start = getTrailingObjects<Token>();
+  const Token *Result = Start;
+
+  // Scan to find Arg.
+  for (; Arg; ++Result) {
+    assert(Result < Start+NumUnexpArgTokens && "Invalid arg #");
+    if (Result->is(tok::eof))
+      --Arg;
+  }
+  assert(Result < Start+NumUnexpArgTokens && "Invalid arg #");
+  return Result;
+}
+
+/// Returns true if the variadic macro \p MI was invoked with at least one
+/// variadic argument token, i.e. the pre-expanded form of the last argument
+/// contains something before its terminating eof token.
+bool MacroArgs::invokedWithVariadicArgument(const MacroInfo *const MI,
+                                            Preprocessor &PP) {
+  if (!MI->isVariadic())
+    return false;
+  // The variadic formal is always the last macro argument.
+  const int VariadicArgIndex = getNumMacroArguments() - 1;
+  return getPreExpArgument(VariadicArgIndex, PP).front().isNot(tok::eof);
+}
+
+/// ArgNeedsPreexpansion - If we can prove that the argument won't be affected
+/// by pre-expansion, return false.  Otherwise, conservatively return true.
+bool MacroArgs::ArgNeedsPreexpansion(const Token *ArgTok,
+                                     Preprocessor &PP) const {
+  // If there are no identifiers in the argument list, or if the identifiers are
+  // known to not be macros, pre-expansion won't modify it.
+  for (; ArgTok->isNot(tok::eof); ++ArgTok)
+    if (IdentifierInfo *II = ArgTok->getIdentifierInfo())
+      if (II->hasMacroDefinition())
+        // Return true even though the macro could be a function-like macro
+        // without a following '(' token, or could be disabled, or not visible.
+        return true;
+  return false;
+}
+
+/// getPreExpArgument - Return the pre-expanded form of the specified
+/// argument.  The result is computed lazily on first request and cached in
+/// PreExpArgTokens for subsequent calls.
+const std::vector<Token> &MacroArgs::getPreExpArgument(unsigned Arg,
+                                                       Preprocessor &PP) {
+  assert(Arg < getNumMacroArguments() && "Invalid argument number!");
+
+  // If we have already computed this, return it.
+  // Lazily grow the cache so there is one slot per formal argument.
+  if (PreExpArgTokens.size() < getNumMacroArguments())
+    PreExpArgTokens.resize(getNumMacroArguments());
+
+  std::vector<Token> &Result = PreExpArgTokens[Arg];
+  if (!Result.empty()) return Result;
+
+  SaveAndRestore<bool> PreExpandingMacroArgs(PP.InMacroArgPreExpansion, true);
+
+  const Token *AT = getUnexpArgument(Arg);
+  unsigned NumToks = getArgLength(AT)+1;  // Include the EOF.
+
+  // Otherwise, we have to pre-expand this argument, populating Result.  To do
+  // this, we set up a fake TokenLexer to lex from the unexpanded argument
+  // list.  With this installed, we lex expanded tokens until we hit the EOF
+  // token at the end of the unexp list.
+  PP.EnterTokenStream(AT, NumToks, false /*disable expand*/,
+                      false /*owns tokens*/, false /*is reinject*/);
+
+  // Lex all of the macro-expanded tokens into Result.
+  do {
+    Result.push_back(Token());
+    Token &Tok = Result.back();
+    PP.Lex(Tok);
+  } while (Result.back().isNot(tok::eof));
+
+  // Pop the token stream off the top of the stack.  We know that the internal
+  // pointer inside of it is to the "end" of the token stream, but the stack
+  // will not otherwise be popped until the next token is lexed.  The problem is
+  // that the token may be lexed sometime after the vector of tokens itself is
+  // destroyed, which would be badness.
+  if (PP.InCachingLexMode())
+    PP.ExitCachingLexMode();
+  PP.RemoveTopOfLexerStack();
+  return Result;
+}
+
+
+/// StringifyArgument - Implement C99 6.10.3.2p2, converting a sequence of
+/// tokens into the literal string token that should be produced by the C #
+/// preprocessor operator.  If Charify is true, then it should be turned into
+/// a character literal for the Microsoft charize (#@) extension.
+///
+Token MacroArgs::StringifyArgument(const Token *ArgToks,
+                                   Preprocessor &PP, bool Charify,
+                                   SourceLocation ExpansionLocStart,
+                                   SourceLocation ExpansionLocEnd) {
+  Token Tok;
+  Tok.startToken();
+  Tok.setKind(Charify ? tok::char_constant : tok::string_literal);
+
+  const Token *ArgTokStart = ArgToks;
+
+  // Stringify all the tokens.
+  SmallString<128> Result;
+  Result += "\"";
+
+  bool isFirst = true;
+  for (; ArgToks->isNot(tok::eof); ++ArgToks) {
+    const Token &Tok = *ArgToks;
+    if (!isFirst && (Tok.hasLeadingSpace() || Tok.isAtStartOfLine()))
+      Result += ' ';
+    isFirst = false;
+
+    // If this is a string or character constant, escape the token as specified
+    // by 6.10.3.2p2.
+    if (tok::isStringLiteral(Tok.getKind()) || // "foo", u8R"x(foo)x"_bar, etc.
+        Tok.is(tok::char_constant) ||          // 'x'
+        Tok.is(tok::wide_char_constant) ||     // L'x'.
+        Tok.is(tok::utf8_char_constant) ||     // u8'x'.
+        Tok.is(tok::utf16_char_constant) ||    // u'x'.
+        Tok.is(tok::utf32_char_constant)) {    // U'x'.
+      bool Invalid = false;
+      std::string TokStr = PP.getSpelling(Tok, &Invalid);
+      if (!Invalid) {
+        std::string Str = Lexer::Stringify(TokStr);
+        Result.append(Str.begin(), Str.end());
+      }
+    } else if (Tok.is(tok::code_completion)) {
+      PP.CodeCompleteNaturalLanguage();
+    } else {
+      // Otherwise, just append the token.  Do some gymnastics to get the token
+      // in place and avoid copies where possible.
+      unsigned CurStrLen = Result.size();
+      Result.resize(CurStrLen+Tok.getLength());
+      const char *BufPtr = Result.data() + CurStrLen;
+      bool Invalid = false;
+      unsigned ActualTokLen = PP.getSpelling(Tok, BufPtr, &Invalid);
+
+      if (!Invalid) {
+        // If getSpelling returned a pointer to an already uniqued version of
+        // the string instead of filling in BufPtr, memcpy it onto our string.
+        if (ActualTokLen && BufPtr != &Result[CurStrLen])
+          memcpy(&Result[CurStrLen], BufPtr, ActualTokLen);
+
+        // If the token was dirty, the spelling may be shorter than the token.
+        if (ActualTokLen != Tok.getLength())
+          Result.resize(CurStrLen+ActualTokLen);
+      }
+    }
+  }
+
+  // If the last character of the string is a \, and if it isn't escaped, this
+  // is an invalid string literal, diagnose it as specified in C99.
+  // (For an empty argument, Result is just the opening '"', so back() is safe.)
+  if (Result.back() == '\\') {
+    // Count the number of consecutive \ characters.  If even, then they are
+    // just escaped backslashes, otherwise it's an error.
+    unsigned FirstNonSlash = Result.size()-2;
+    // Guaranteed to find the starting " if nothing else.
+    while (Result[FirstNonSlash] == '\\')
+      --FirstNonSlash;
+    if ((Result.size()-1-FirstNonSlash) & 1) {
+      // Diagnose errors for things like: #define F(X) #X   /   F(\)
+      PP.Diag(ArgToks[-1], diag::pp_invalid_string_literal);
+      Result.pop_back();  // remove one of the \'s.
+    }
+  }
+  Result += '"';
+
+  // If this is the charify operation and the result is not a legal character
+  // constant, diagnose it.
+  if (Charify) {
+    // First step, turn double quotes into single quotes:
+    Result[0] = '\'';
+    Result[Result.size()-1] = '\'';
+
+    // Check for bogus character.
+    bool isBad = false;
+    if (Result.size() == 3)
+      isBad = Result[1] == '\'';   // ''' is not legal. '\' already fixed above.
+    else
+      isBad = (Result.size() != 4 || Result[1] != '\\');  // Not '\x'
+
+    if (isBad) {
+      PP.Diag(ArgTokStart[0], diag::err_invalid_character_to_charify);
+      Result = "' '";  // Use something arbitrary, but legal.
+    }
+  }
+
+  PP.CreateString(Result, Tok,
+                  ExpansionLocStart, ExpansionLocEnd);
+  return Tok;
+}
diff --git a/clang/lib/Lex/MacroInfo.cpp b/clang/lib/Lex/MacroInfo.cpp
new file mode 100644
index 000000000000..1ccd140364ae
--- /dev/null
+++ b/clang/lib/Lex/MacroInfo.cpp
@@ -0,0 +1,248 @@
+//===- MacroInfo.cpp - Information about #defined identifiers -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the MacroInfo interface.
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Lex/MacroInfo.h"
+#include "clang/Basic/IdentifierTable.h"
+#include "clang/Basic/LLVM.h"
+#include "clang/Basic/SourceLocation.h"
+#include "clang/Basic/SourceManager.h"
+#include "clang/Basic/TokenKinds.h"
+#include "clang/Lex/Preprocessor.h"
+#include "clang/Lex/Token.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <utility>
+
+using namespace clang;
+
+// All state flags start out cleared; they are set individually as the
+// definition is processed.
+MacroInfo::MacroInfo(SourceLocation DefLoc)
+    : Location(DefLoc), IsDefinitionLengthCached(false), IsFunctionLike(false),
+      IsC99Varargs(false), IsGNUVarargs(false), IsBuiltinMacro(false),
+      HasCommaPasting(false), IsDisabled(false), IsUsed(false),
+      IsAllowRedefinitionsWithoutWarning(false), IsWarnIfUnused(false),
+      UsedForHeaderGuard(false) {}
+
+/// Compute the length, in characters, of the macro definition body (from the
+/// first replacement token to the end of the last one), caching the result in
+/// DefinitionLength.  Only called on the first request (the "slow" path).
+unsigned MacroInfo::getDefinitionLengthSlow(const SourceManager &SM) const {
+  assert(!IsDefinitionLengthCached);
+  IsDefinitionLengthCached = true;
+
+  if (ReplacementTokens.empty())
+    return (DefinitionLength = 0);
+
+  const Token &firstToken = ReplacementTokens.front();
+  const Token &lastToken = ReplacementTokens.back();
+  SourceLocation macroStart = firstToken.getLocation();
+  SourceLocation macroEnd = lastToken.getLocation();
+  assert(macroStart.isValid() && macroEnd.isValid());
+  assert((macroStart.isFileID() || firstToken.is(tok::comment)) &&
+         "Macro defined in macro?");
+  assert((macroEnd.isFileID() || lastToken.is(tok::comment)) &&
+         "Macro defined in macro?");
+  std::pair<FileID, unsigned>
+      startInfo = SM.getDecomposedExpansionLoc(macroStart);
+  std::pair<FileID, unsigned>
+      endInfo = SM.getDecomposedExpansionLoc(macroEnd);
+  assert(startInfo.first == endInfo.first &&
+         "Macro definition spanning multiple FileIDs ?");
+  assert(startInfo.second <= endInfo.second);
+  DefinitionLength = endInfo.second - startInfo.second;
+  DefinitionLength += lastToken.getLength();
+
+  return DefinitionLength;
+}
+
+/// Return true if the specified macro definition is equal to
+/// this macro in spelling, arguments, and whitespace.
+///
+/// \param Syntactically if true, the macro definitions can be identical even
+/// if they use different identifiers for the function macro parameters.
+/// Otherwise the comparison is lexical and this implements the rules in
+/// C99 6.10.3.
+bool MacroInfo::isIdenticalTo(const MacroInfo &Other, Preprocessor &PP,
+                              bool Syntactically) const {
+  bool Lexically = !Syntactically;
+
+  // Check # tokens in replacement, number of args, and various flags all match.
+  if (ReplacementTokens.size() != Other.ReplacementTokens.size() ||
+      getNumParams() != Other.getNumParams() ||
+      isFunctionLike() != Other.isFunctionLike() ||
+      isC99Varargs() != Other.isC99Varargs() ||
+      isGNUVarargs() != Other.isGNUVarargs())
+    return false;
+
+  if (Lexically) {
+    // Check arguments.
+    for (param_iterator I = param_begin(), OI = Other.param_begin(),
+                        E = param_end();
+         I != E; ++I, ++OI)
+      if (*I != *OI) return false;
+  }
+
+  // Check all the tokens.
+  for (unsigned i = 0, e = ReplacementTokens.size(); i != e; ++i) {
+    const Token &A = ReplacementTokens[i];
+    const Token &B = Other.ReplacementTokens[i];
+    if (A.getKind() != B.getKind())
+      return false;
+
+    // If this isn't the first token, check that the whitespace and
+    // start-of-line characteristics match.
+    if (i != 0 &&
+        (A.isAtStartOfLine() != B.isAtStartOfLine() ||
+         A.hasLeadingSpace() != B.hasLeadingSpace()))
+      return false;
+
+    // If this is an identifier, it is easy.
+    if (A.getIdentifierInfo() || B.getIdentifierInfo()) {
+      if (A.getIdentifierInfo() == B.getIdentifierInfo())
+        continue;
+      if (Lexically)
+        return false;
+      // With syntactic equivalence the parameter names can be different as long
+      // as they are used in the same place.
+      int AArgNum = getParameterNum(A.getIdentifierInfo());
+      if (AArgNum == -1)
+        return false;
+      if (AArgNum != Other.getParameterNum(B.getIdentifierInfo()))
+        return false;
+      continue;
+    }
+
+    // Otherwise, check the spelling.
+    if (PP.getSpelling(A) != PP.getSpelling(B))
+      return false;
+  }
+
+  return true;
+}
+
+// Debug dump of this macro's flags, parameter list, and replacement tokens.
+LLVM_DUMP_METHOD void MacroInfo::dump() const {
+  llvm::raw_ostream &Out = llvm::errs();
+
+  // FIXME: Dump locations.
+  Out << "MacroInfo " << this;
+  if (IsBuiltinMacro) Out << " builtin";
+  if (IsDisabled) Out << " disabled";
+  if (IsUsed) Out << " used";
+  if (IsAllowRedefinitionsWithoutWarning)
+    Out << " allow_redefinitions_without_warning";
+  if (IsWarnIfUnused) Out << " warn_if_unused";
+  if (UsedForHeaderGuard) Out << " header_guard";
+
+  Out << "\n #define <macro>";
+  if (IsFunctionLike) {
+    Out << "(";
+    for (unsigned I = 0; I != NumParameters; ++I) {
+      if (I) Out << ", ";
+      Out << ParameterList[I]->getName();
+    }
+    if (IsC99Varargs || IsGNUVarargs) {
+      if (NumParameters && IsC99Varargs) Out << ", ";
+      Out << "...";
+    }
+    Out << ")";
+  }
+
+  bool First = true;
+  for (const Token &Tok : ReplacementTokens) {
+    // Leading space is semantically meaningful in a macro definition,
+    // so preserve it in the dump output.
+    if (First || Tok.hasLeadingSpace())
+      Out << " ";
+    First = false;
+
+    if (const char *Punc = tok::getPunctuatorSpelling(Tok.getKind()))
+      Out << Punc;
+    else if (Tok.isLiteral() && Tok.getLiteralData())
+      Out << StringRef(Tok.getLiteralData(), Tok.getLength());
+    else if (auto *II = Tok.getIdentifierInfo())
+      Out << II->getName();
+    else
+      Out << Tok.getName();
+  }
+}
+
+// Walk the directive chain backwards to the most recent #define, tracking any
+// intervening #undef location and module-visibility directives along the way.
+MacroDirective::DefInfo MacroDirective::getDefinition() {
+  MacroDirective *MD = this;
+  SourceLocation UndefLoc;
+  Optional<bool> isPublic;
+  for (; MD; MD = MD->getPrevious()) {
+    if (DefMacroDirective *DefMD = dyn_cast<DefMacroDirective>(MD))
+      return DefInfo(DefMD, UndefLoc,
+                     !isPublic.hasValue() || isPublic.getValue());
+
+    if (UndefMacroDirective *UndefMD = dyn_cast<UndefMacroDirective>(MD)) {
+      UndefLoc = UndefMD->getLocation();
+      continue;
+    }
+
+    // Only the visibility directive closest to the definition matters.
+    VisibilityMacroDirective *VisMD = cast<VisibilityMacroDirective>(MD);
+    if (!isPublic.hasValue())
+      isPublic = VisMD->isPublic();
+  }
+
+  // No #define found; report as public unless a visibility directive said
+  // otherwise.
+  return DefInfo(nullptr, UndefLoc,
+                 !isPublic.hasValue() || isPublic.getValue());
+}
+
+const MacroDirective::DefInfo
+MacroDirective::findDirectiveAtLoc(SourceLocation L,
+                                   const SourceManager &SM) const {
+  assert(L.isValid() && "SourceLocation is invalid.");
+  for (DefInfo Def = getDefinition(); Def; Def = Def.getPreviousDefinition()) {
+    if (Def.getLocation().isInvalid() ||  // For macros defined on the command line.
+        SM.isBeforeInTranslationUnit(Def.getLocation(), L))
+      return (!Def.isUndefined() ||
+              SM.isBeforeInTranslationUnit(L, Def.getUndefLocation()))
+                 ? Def : DefInfo();
+  }
+  return DefInfo();
+}
+
+LLVM_DUMP_METHOD void MacroDirective::dump() const {
+  llvm::raw_ostream &Out = llvm::errs();
+
+  switch (getKind()) {
+  case MD_Define: Out << "DefMacroDirective"; break;
+  case MD_Undefine: Out << "UndefMacroDirective"; break;
+  case MD_Visibility: Out << "VisibilityMacroDirective"; break;
+  }
+  Out << " " << this;
+  // FIXME: Dump SourceLocation.
+  if (auto *Prev = getPrevious())
+    Out << " prev " << Prev;
+  if (IsFromPCH) Out << " from_pch";
+
+  if (isa<VisibilityMacroDirective>(this))
+    Out << (IsPublic ? " public" : " private");
+
+  if (auto *DMD = dyn_cast<DefMacroDirective>(this)) {
+    if (auto *Info = DMD->getInfo()) {
+      Out << "\n  ";
+      Info->dump();
+    }
+  }
+  Out << "\n";
+}
+
+// Allocates the ModuleMacro from the preprocessor's arena allocator, with
+// space for the override pointer list placed immediately after the object.
+ModuleMacro *ModuleMacro::create(Preprocessor &PP, Module *OwningModule,
+                                 IdentifierInfo *II, MacroInfo *Macro,
+                                 ArrayRef<ModuleMacro *> Overrides) {
+  void *Mem = PP.getPreprocessorAllocator().Allocate(
+      sizeof(ModuleMacro) + sizeof(ModuleMacro *) * Overrides.size(),
+      alignof(ModuleMacro));
+  return new (Mem) ModuleMacro(OwningModule, II, Macro, Overrides);
+}
diff --git a/clang/lib/Lex/ModuleMap.cpp b/clang/lib/Lex/ModuleMap.cpp
new file mode 100644
index 000000000000..db59629997ee
--- /dev/null
+++ b/clang/lib/Lex/ModuleMap.cpp
@@ -0,0 +1,3010 @@
+//===- ModuleMap.cpp - Describe the layout of modules ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the ModuleMap implementation, which describes the layout
+// of a module as it relates to headers.
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Lex/ModuleMap.h"
+#include "clang/Basic/CharInfo.h"
+#include "clang/Basic/Diagnostic.h"
+#include "clang/Basic/FileManager.h"
+#include "clang/Basic/LLVM.h"
+#include "clang/Basic/LangOptions.h"
+#include "clang/Basic/Module.h"
+#include "clang/Basic/SourceLocation.h"
+#include "clang/Basic/SourceManager.h"
+#include "clang/Basic/TargetInfo.h"
+#include "clang/Lex/HeaderSearch.h"
+#include "clang/Lex/HeaderSearchOptions.h"
+#include "clang/Lex/LexDiagnostic.h"
+#include "clang/Lex/Lexer.h"
+#include "clang/Lex/LiteralSupport.h"
+#include "clang/Lex/Token.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/VirtualFileSystem.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+#include <string>
+#include <system_error>
+#include <utility>
+
+using namespace clang;
+
+void ModuleMapCallbacks::anchor() {}
+
+/// Now that module \p Mod exists, mark every module that was recorded as
+/// pending on \p Mod's name so that it uses its export_as name when linking.
+void ModuleMap::resolveLinkAsDependencies(Module *Mod) {
+  auto PendingLinkAs = PendingLinkAsModule.find(Mod->Name);
+  if (PendingLinkAs != PendingLinkAsModule.end()) {
+    for (auto &Name : PendingLinkAs->second) {
+      auto *M = findModule(Name.getKey());
+      if (M)
+        M->UseExportAsModuleLinkName = true;
+    }
+  }
+}
+
+/// If \p Mod's export_as target already exists, flag \p Mod immediately;
+/// otherwise queue it until the target module is parsed.
+void ModuleMap::addLinkAsDependency(Module *Mod) {
+  if (findModule(Mod->ExportAsModule))
+    Mod->UseExportAsModuleLinkName = true;
+  else
+    PendingLinkAsModule[Mod->ExportAsModule].insert(Mod->Name);
+}
+
+/// Map a module-map header role to the corresponding Module::HeaderKind.
+Module::HeaderKind
+ModuleMap::headerRoleToKind(ModuleHeaderRole Role) {
+  switch ((int)Role) {
+  default: llvm_unreachable("unknown header role");
+  case NormalHeader:
+    return Module::HK_Normal;
+  case PrivateHeader:
+    return Module::HK_Private;
+  case TextualHeader:
+    return Module::HK_Textual;
+  case PrivateHeader | TextualHeader:
+    return Module::HK_PrivateTextual;
+  }
+}
+
+/// Inverse of headerRoleToKind; HK_Excluded has no role and is unreachable.
+ModuleMap::ModuleHeaderRole
+ModuleMap::headerKindToRole(Module::HeaderKind Kind) {
+  switch ((int)Kind) {
+  case Module::HK_Normal:
+    return NormalHeader;
+  case Module::HK_Private:
+    return PrivateHeader;
+  case Module::HK_Textual:
+    return TextualHeader;
+  case Module::HK_PrivateTextual:
+    return ModuleHeaderRole(PrivateHeader | TextualHeader);
+  case Module::HK_Excluded:
+    llvm_unreachable("unexpected header kind");
+  }
+  llvm_unreachable("unknown header kind");
+}
+
+Module::ExportDecl
+ModuleMap::resolveExport(Module *Mod,
+                         const Module::UnresolvedExportDecl &Unresolved,
+                         bool Complain) const {
+  // We may have just a wildcard.
+  if (Unresolved.Id.empty()) {
+    assert(Unresolved.Wildcard && "Invalid unresolved export");
+    return Module::ExportDecl(nullptr, true);
+  }
+
+  // Resolve the module-id.
+  Module *Context = resolveModuleId(Unresolved.Id, Mod, Complain);
+  if (!Context)
+    return {};
+
+  return Module::ExportDecl(Context, Unresolved.Wildcard);
+}
+
+/// Resolve a dotted module-id (e.g. A.B.C) to a Module, starting with an
+/// unqualified lookup of the first component relative to \p Mod.  Returns
+/// null (optionally emitting a diagnostic) if any component is missing.
+Module *ModuleMap::resolveModuleId(const ModuleId &Id, Module *Mod,
+                                   bool Complain) const {
+  // Find the starting module.
+  Module *Context = lookupModuleUnqualified(Id[0].first, Mod);
+  if (!Context) {
+    if (Complain)
+      Diags.Report(Id[0].second, diag::err_mmap_missing_module_unqualified)
+          << Id[0].first << Mod->getFullModuleName();
+
+    return nullptr;
+  }
+
+  // Dig into the module path.
+  for (unsigned I = 1, N = Id.size(); I != N; ++I) {
+    Module *Sub = lookupModuleQualified(Id[I].first, Context);
+    if (!Sub) {
+      if (Complain)
+        Diags.Report(Id[I].second, diag::err_mmap_missing_module_qualified)
+            << Id[I].first << Context->getFullModuleName()
+            << SourceRange(Id[0].second, Id[I-1].second);
+
+      return nullptr;
+    }
+
+    Context = Sub;
+  }
+
+  return Context;
+}
+
+/// Append to \p Path the set of paths needed to get to the
+/// subframework in which the given module lives.
+static void appendSubframeworkPaths(Module *Mod,
+                                    SmallVectorImpl<char> &Path) {
+  // Collect the framework names from the given module to the top-level module.
+  SmallVector<StringRef, 2> Paths;
+  for (; Mod; Mod = Mod->Parent) {
+    if (Mod->IsFramework)
+      Paths.push_back(Mod->Name);
+  }
+
+  if (Paths.empty())
+    return;
+
+  // Add Frameworks/Name.framework for each subframework, outermost first.
+  // The top-level framework itself (last element) is intentionally skipped.
+  for (unsigned I = Paths.size() - 1; I != 0; --I)
+    llvm::sys::path::append(Path, "Frameworks", Paths[I-1] + ".framework");
+}
+
+/// Search the filesystem for the header named by \p Header within module
+/// \p M's home directory, filling \p RelativePathName with the path relative
+/// to that directory.  \p NeedsFramework is set when the header is only found
+/// at a framework-style path even though the module was not declared as a
+/// framework.  Returns null if no (stat-matching) file is found.
+const FileEntry *ModuleMap::findHeader(
+    Module *M, const Module::UnresolvedHeaderDirective &Header,
+    SmallVectorImpl<char> &RelativePathName, bool &NeedsFramework) {
+  // Search for the header file within the module's home directory.
+  auto *Directory = M->Directory;
+  SmallString<128> FullPathName(Directory->getName());
+
+  // Reject candidates whose size/mtime contradict the directive's stat info.
+  auto GetFile = [&](StringRef Filename) -> const FileEntry * {
+    auto File = SourceMgr.getFileManager().getFile(Filename);
+    if (!File ||
+        (Header.Size && (*File)->getSize() != *Header.Size) ||
+        (Header.ModTime && (*File)->getModificationTime() != *Header.ModTime))
+      return nullptr;
+    return *File;
+  };
+
+  auto GetFrameworkFile = [&]() -> const FileEntry * {
+    unsigned FullPathLength = FullPathName.size();
+    appendSubframeworkPaths(M, RelativePathName);
+    unsigned RelativePathLength = RelativePathName.size();
+
+    // Check whether this file is in the public headers.
+    llvm::sys::path::append(RelativePathName, "Headers", Header.FileName);
+    llvm::sys::path::append(FullPathName, RelativePathName);
+    if (auto *File = GetFile(FullPathName))
+      return File;
+
+    // Check whether this file is in the private headers.
+    // Ideally, private modules in the form 'FrameworkName.Private' should
+    // be defined as 'module FrameworkName.Private', and not as
+    // 'framework module FrameworkName.Private', since a 'Private.Framework'
+    // does not usually exist. However, since both are currently widely used
+    // for private modules, make sure we find the right path in both cases.
+    if (M->IsFramework && M->Name == "Private")
+      RelativePathName.clear();
+    else
+      RelativePathName.resize(RelativePathLength);
+    FullPathName.resize(FullPathLength);
+    llvm::sys::path::append(RelativePathName, "PrivateHeaders",
+                            Header.FileName);
+    llvm::sys::path::append(FullPathName, RelativePathName);
+    return GetFile(FullPathName);
+  };
+
+  if (llvm::sys::path::is_absolute(Header.FileName)) {
+    RelativePathName.clear();
+    RelativePathName.append(Header.FileName.begin(), Header.FileName.end());
+    return GetFile(Header.FileName);
+  }
+
+  if (M->isPartOfFramework())
+    return GetFrameworkFile();
+
+  // Lookup for normal headers.
+  llvm::sys::path::append(RelativePathName, Header.FileName);
+  llvm::sys::path::append(FullPathName, RelativePathName);
+  auto *NormalHdrFile = GetFile(FullPathName);
+
+  if (M && !NormalHdrFile && Directory->getName().endswith(".framework")) {
+    // The lack of a 'framework' keyword in a module declaration is a simple
+    // mistake we can diagnose when the header exists within the proper
+    // framework style path.
+    FullPathName.assign(Directory->getName());
+    RelativePathName.clear();
+    if (GetFrameworkFile()) {
+      Diags.Report(Header.FileNameLoc,
+                   diag::warn_mmap_incomplete_framework_module_declaration)
+          << Header.FileName << M->getFullModuleName();
+      NeedsFramework = true;
+    }
+    return nullptr;
+  }
+
+  return NormalHdrFile;
+}
+
+/// Resolve an unresolved header directive against the filesystem, recording
+/// the header (umbrella, excluded, or normal) on \p Mod, or recording it as
+/// missing when no file can be found.
+void ModuleMap::resolveHeader(Module *Mod,
+                              const Module::UnresolvedHeaderDirective &Header,
+                              bool &NeedsFramework) {
+  SmallString<128> RelativePathName;
+  if (const FileEntry *File =
+          findHeader(Mod, Header, RelativePathName, NeedsFramework)) {
+    if (Header.IsUmbrella) {
+      const DirectoryEntry *UmbrellaDir = File->getDir();
+      if (Module *UmbrellaMod = UmbrellaDirs[UmbrellaDir])
+        Diags.Report(Header.FileNameLoc, diag::err_mmap_umbrella_clash)
+            << UmbrellaMod->getFullModuleName();
+      else
+        // Record this umbrella header.
+        setUmbrellaHeader(Mod, File, RelativePathName.str());
+    } else {
+      Module::Header H = {RelativePathName.str(), File};
+      if (Header.Kind == Module::HK_Excluded)
+        excludeHeader(Mod, H);
+      else
+        addHeader(Mod, H, headerKindToRole(Header.Kind));
+    }
+  } else if (Header.HasBuiltinHeader && !Header.Size && !Header.ModTime) {
+    // There's a builtin header but no corresponding on-disk header. Assume
+    // this was supposed to modularize the builtin header alone.
+  } else if (Header.Kind == Module::HK_Excluded) {
+    // Ignore missing excluded header files. They're optional anyway.
+  } else {
+    // If we find a module that has a missing header, we mark this module as
+    // unavailable and store the header directive for displaying diagnostics.
+    Mod->MissingHeaders.push_back(Header);
+    // A missing header with stat information doesn't make the module
+    // unavailable; this keeps our behavior consistent as headers are lazily
+    // resolved. (Such a module still can't be built though, except from
+    // preprocessed source.)
+    if (!Header.Size && !Header.ModTime)
+      Mod->markUnavailable();
+  }
+}
+
+/// Attempt to resolve \p Header as one of the builtin headers Clang supplies
+/// (see isBuiltinHeader).  Only applies to non-framework, non-umbrella,
+/// relative headers of system modules.  Returns true on success.
+bool ModuleMap::resolveAsBuiltinHeader(
+    Module *Mod, const Module::UnresolvedHeaderDirective &Header) {
+  if (Header.Kind == Module::HK_Excluded ||
+      llvm::sys::path::is_absolute(Header.FileName) ||
+      Mod->isPartOfFramework() || !Mod->IsSystem || Header.IsUmbrella ||
+      !BuiltinIncludeDir || BuiltinIncludeDir == Mod->Directory ||
+      !isBuiltinHeader(Header.FileName))
+    return false;
+
+  // This is a system module with a top-level header. This header
+  // may have a counterpart (or replacement) in the set of headers
+  // supplied by Clang. Find that builtin header.
+  SmallString<128> Path;
+  llvm::sys::path::append(Path, BuiltinIncludeDir->getName(), Header.FileName);
+  auto File = SourceMgr.getFileManager().getFile(Path);
+  if (!File)
+    return false;
+
+  auto Role = headerKindToRole(Header.Kind);
+  Module::Header H = {Path.str(), *File};
+  addHeader(Mod, H, Role);
+  return true;
+}
+
+ModuleMap::ModuleMap(SourceManager &SourceMgr, DiagnosticsEngine &Diags,
+                     const LangOptions &LangOpts, const TargetInfo *Target,
+                     HeaderSearch &HeaderInfo)
+    : SourceMgr(SourceMgr), Diags(Diags), LangOpts(LangOpts), Target(Target),
+      HeaderInfo(HeaderInfo) {
+  // Module map files may use '//' comments regardless of the language mode.
+  MMapLangOpts.LineComment = true;
+}
+
+ModuleMap::~ModuleMap() {
+  for (auto &M : Modules)
+    delete M.getValue();
+  for (auto *M : ShadowModules)
+    delete M;
+}
+
+void ModuleMap::setTarget(const TargetInfo &Target) {
+  assert((!this->Target || this->Target == &Target) &&
+         "Improper target override");
+  this->Target = &Target;
+}
+
+/// "Sanitize" a filename so that it can be used as an identifier.
+static StringRef sanitizeFilenameAsIdentifier(StringRef Name,
+                                              SmallVectorImpl<char> &Buffer) {
+  if (Name.empty())
+    return Name;
+
+  if (!isValidIdentifier(Name)) {
+    // If we don't already have something with the form of an identifier,
+    // create a buffer with the sanitized name.
+    Buffer.clear();
+    if (isDigit(Name[0]))
+      Buffer.push_back('_');
+    Buffer.reserve(Buffer.size() + Name.size());
+    for (unsigned I = 0, N = Name.size(); I != N; ++I) {
+      if (isIdentifierBody(Name[I]))
+        Buffer.push_back(Name[I]);
+      else
+        Buffer.push_back('_');
+    }
+
+    Name = StringRef(Buffer.data(), Buffer.size());
+  }
+
+  // Keep appending '_' while the name collides with a C/C++ keyword.
+  while (llvm::StringSwitch<bool>(Name)
+#define KEYWORD(Keyword,Conditions) .Case(#Keyword, true)
+#define ALIAS(Keyword, AliasOf, Conditions) .Case(Keyword, true)
+#include "clang/Basic/TokenKinds.def"
+             .Default(false)) {
+    if (Name.data() != Buffer.data())
+      Buffer.append(Name.begin(), Name.end());
+    Buffer.push_back('_');
+    Name = StringRef(Buffer.data(), Buffer.size());
+  }
+
+  return Name;
+}
+
+/// Determine whether the given file name is the name of a builtin
+/// header, supplied by Clang to replace, override, or augment existing system
+/// headers.
+bool ModuleMap::isBuiltinHeader(StringRef FileName) {
+  return llvm::StringSwitch<bool>(FileName)
+      .Case("float.h", true)
+      .Case("iso646.h", true)
+      .Case("limits.h", true)
+      .Case("stdalign.h", true)
+      .Case("stdarg.h", true)
+      .Case("stdatomic.h", true)
+      .Case("stdbool.h", true)
+      .Case("stddef.h", true)
+      .Case("stdint.h", true)
+      .Case("tgmath.h", true)
+      .Case("unwind.h", true)
+      .Default(false);
+}
+
+/// Look up the known-header table entry for \p File.  If the file is one of
+/// Clang's builtin headers and implicit module maps are enabled, lazily load
+/// the top-level system modules first and retry the lookup.
+ModuleMap::HeadersMap::iterator
+ModuleMap::findKnownHeader(const FileEntry *File) {
+  resolveHeaderDirectives(File);
+  HeadersMap::iterator Known = Headers.find(File);
+  if (HeaderInfo.getHeaderSearchOpts().ImplicitModuleMaps &&
+      Known == Headers.end() && File->getDir() == BuiltinIncludeDir &&
+      ModuleMap::isBuiltinHeader(llvm::sys::path::filename(File->getName()))) {
+    HeaderInfo.loadTopLevelSystemModules();
+    return Headers.find(File);
+  }
+  return Known;
+}
+
+/// Walk up from \p File's directory looking for a registered umbrella
+/// directory, collecting each directory skipped on the way into
+/// \p IntermediateDirs.
+ModuleMap::KnownHeader
+ModuleMap::findHeaderInUmbrellaDirs(const FileEntry *File,
+    SmallVectorImpl<const DirectoryEntry *> &IntermediateDirs) {
+  if (UmbrellaDirs.empty())
+    return {};
+
+  const DirectoryEntry *Dir = File->getDir();
+  assert(Dir && "file in no directory");
+
+  // Note: as an egregious but useful hack we use the real path here, because
+  // frameworks moving from top-level frameworks to embedded frameworks tend
+  // to be symlinked from the top-level location to the embedded location,
+  // and we need to resolve lookups as if we had found the embedded location.
+  StringRef DirName = SourceMgr.getFileManager().getCanonicalName(Dir);
+
+  // Keep walking up the directory hierarchy, looking for a directory with
+  // an umbrella header.
+  do {
+    auto KnownDir = UmbrellaDirs.find(Dir);
+    if (KnownDir != UmbrellaDirs.end())
+      return KnownHeader(KnownDir->second, NormalHeader);
+
+    IntermediateDirs.push_back(Dir);
+
+    // Retrieve our parent path.
+    DirName = llvm::sys::path::parent_path(DirName);
+    if (DirName.empty())
+      break;
+
+    // Resolve the parent path to a directory entry.
+    if (auto DirEntry = SourceMgr.getFileManager().getDirectory(DirName))
+      Dir = *DirEntry;
+    else
+      Dir = nullptr;
+  } while (Dir);
+  return {};
+}
+
+/// Returns true if including \p IncFileEnt via \p Header would violate the
+/// header's private role with respect to \p RequestingModule.
+static bool violatesPrivateInclude(Module *RequestingModule,
+                                   const FileEntry *IncFileEnt,
+                                   ModuleMap::KnownHeader Header) {
+#ifndef NDEBUG
+  if (Header.getRole() & ModuleMap::PrivateHeader) {
+    // Check for consistency between the module header role
+    // as obtained from the lookup and as obtained from the module.
+    // This check is not cheap, so enable it only for debugging.
+    bool IsPrivate = false;
+    SmallVectorImpl<Module::Header> *HeaderList[] = {
+        &Header.getModule()->Headers[Module::HK_Private],
+        &Header.getModule()->Headers[Module::HK_PrivateTextual]};
+    for (auto *Hs : HeaderList)
+      IsPrivate |=
+          std::find_if(Hs->begin(), Hs->end(), [&](const Module::Header &H) {
+            return H.Entry == IncFileEnt;
+          }) != Hs->end();
+    assert(IsPrivate && "inconsistent headers and roles");
+  }
+#endif
+  return !Header.isAccessibleFrom(RequestingModule);
+}
+
+static Module *getTopLevelOrNull(Module *M) {
+  return M ?
M->getTopLevelModule() : nullptr; +} + +void ModuleMap::diagnoseHeaderInclusion(Module *RequestingModule, + bool RequestingModuleIsModuleInterface, + SourceLocation FilenameLoc, + StringRef Filename, + const FileEntry *File) { + // No errors for indirect modules. This may be a bit of a problem for modules + // with no source files. + if (getTopLevelOrNull(RequestingModule) != getTopLevelOrNull(SourceModule)) + return; + + if (RequestingModule) { + resolveUses(RequestingModule, /*Complain=*/false); + resolveHeaderDirectives(RequestingModule); + } + + bool Excluded = false; + Module *Private = nullptr; + Module *NotUsed = nullptr; + + HeadersMap::iterator Known = findKnownHeader(File); + if (Known != Headers.end()) { + for (const KnownHeader &Header : Known->second) { + // Remember private headers for later printing of a diagnostic. + if (violatesPrivateInclude(RequestingModule, File, Header)) { + Private = Header.getModule(); + continue; + } + + // If uses need to be specified explicitly, we are only allowed to return + // modules that are explicitly used by the requesting module. + if (RequestingModule && LangOpts.ModulesDeclUse && + !RequestingModule->directlyUses(Header.getModule())) { + NotUsed = Header.getModule(); + continue; + } + + // We have found a module that we can happily use. + return; + } + + Excluded = true; + } + + // We have found a header, but it is private. + if (Private) { + Diags.Report(FilenameLoc, diag::warn_use_of_private_header_outside_module) + << Filename; + return; + } + + // We have found a module, but we don't use it. + if (NotUsed) { + Diags.Report(FilenameLoc, diag::err_undeclared_use_of_module) + << RequestingModule->getTopLevelModule()->Name << Filename; + return; + } + + if (Excluded || isHeaderInUmbrellaDirs(File)) + return; + + // At this point, only non-modular includes remain. 
  // Non-modular include from a module build: how loudly we complain depends
  // on the language options.
  if (RequestingModule && LangOpts.ModulesStrictDeclUse) {
    // Strict mode: every include must be covered by a used module.
    Diags.Report(FilenameLoc, diag::err_undeclared_use_of_module)
        << RequestingModule->getTopLevelModule()->Name << Filename;
  } else if (RequestingModule && RequestingModuleIsModuleInterface &&
             LangOpts.isCompilingModule()) {
    // Do not diagnose when we are not compiling a module.
    diag::kind DiagID = RequestingModule->getTopLevelModule()->IsFramework ?
        diag::warn_non_modular_include_in_framework_module :
        diag::warn_non_modular_include_in_module;
    Diags.Report(FilenameLoc, DiagID) << RequestingModule->getFullModuleName()
        << File->getName();
  }
}

/// Ordering predicate for header-to-module mappings: returns true if \p New
/// is a strictly better choice than \p Old for owning a header.
static bool isBetterKnownHeader(const ModuleMap::KnownHeader &New,
                                const ModuleMap::KnownHeader &Old) {
  // Prefer available modules.
  if (New.getModule()->isAvailable() && !Old.getModule()->isAvailable())
    return true;

  // Prefer a public header over a private header.
  if ((New.getRole() & ModuleMap::PrivateHeader) !=
      (Old.getRole() & ModuleMap::PrivateHeader))
    return !(New.getRole() & ModuleMap::PrivateHeader);

  // Prefer a non-textual header over a textual header.
  if ((New.getRole() & ModuleMap::TextualHeader) !=
      (Old.getRole() & ModuleMap::TextualHeader))
    return !(New.getRole() & ModuleMap::TextualHeader);

  // Don't have a reason to choose between these. Just keep the first one.
  return false;
}

ModuleMap::KnownHeader ModuleMap::findModuleForHeader(const FileEntry *File,
                                                      bool AllowTextual) {
  // Filter: a textual-header mapping only counts as a result when the caller
  // explicitly asked for textual headers.
  auto MakeResult = [&](ModuleMap::KnownHeader R) -> ModuleMap::KnownHeader {
    if (!AllowTextual && R.getRole() & ModuleMap::TextualHeader)
      return {};
    return R;
  };

  HeadersMap::iterator Known = findKnownHeader(File);
  if (Known != Headers.end()) {
    ModuleMap::KnownHeader Result;
    // Iterate over all modules that 'File' is part of to find the best fit.
    for (KnownHeader &H : Known->second) {
      // Prefer a header from the source module over all others.
+ if (H.getModule()->getTopLevelModule() == SourceModule) + return MakeResult(H); + if (!Result || isBetterKnownHeader(H, Result)) + Result = H; + } + return MakeResult(Result); + } + + return MakeResult(findOrCreateModuleForHeaderInUmbrellaDir(File)); +} + +ModuleMap::KnownHeader +ModuleMap::findOrCreateModuleForHeaderInUmbrellaDir(const FileEntry *File) { + assert(!Headers.count(File) && "already have a module for this header"); + + SmallVector<const DirectoryEntry *, 2> SkippedDirs; + KnownHeader H = findHeaderInUmbrellaDirs(File, SkippedDirs); + if (H) { + Module *Result = H.getModule(); + + // Search up the module stack until we find a module with an umbrella + // directory. + Module *UmbrellaModule = Result; + while (!UmbrellaModule->getUmbrellaDir() && UmbrellaModule->Parent) + UmbrellaModule = UmbrellaModule->Parent; + + if (UmbrellaModule->InferSubmodules) { + const FileEntry *UmbrellaModuleMap = + getModuleMapFileForUniquing(UmbrellaModule); + + // Infer submodules for each of the directories we found between + // the directory of the umbrella header and the directory where + // the actual header is located. + bool Explicit = UmbrellaModule->InferExplicitSubmodules; + + for (unsigned I = SkippedDirs.size(); I != 0; --I) { + // Find or create the module that corresponds to this directory name. + SmallString<32> NameBuf; + StringRef Name = sanitizeFilenameAsIdentifier( + llvm::sys::path::stem(SkippedDirs[I-1]->getName()), NameBuf); + Result = findOrCreateModule(Name, Result, /*IsFramework=*/false, + Explicit).first; + InferredModuleAllowedBy[Result] = UmbrellaModuleMap; + Result->IsInferred = true; + + // Associate the module and the directory. + UmbrellaDirs[SkippedDirs[I-1]] = Result; + + // If inferred submodules export everything they import, add a + // wildcard to the set of exports. 
+ if (UmbrellaModule->InferExportWildcard && Result->Exports.empty()) + Result->Exports.push_back(Module::ExportDecl(nullptr, true)); + } + + // Infer a submodule with the same name as this header file. + SmallString<32> NameBuf; + StringRef Name = sanitizeFilenameAsIdentifier( + llvm::sys::path::stem(File->getName()), NameBuf); + Result = findOrCreateModule(Name, Result, /*IsFramework=*/false, + Explicit).first; + InferredModuleAllowedBy[Result] = UmbrellaModuleMap; + Result->IsInferred = true; + Result->addTopHeader(File); + + // If inferred submodules export everything they import, add a + // wildcard to the set of exports. + if (UmbrellaModule->InferExportWildcard && Result->Exports.empty()) + Result->Exports.push_back(Module::ExportDecl(nullptr, true)); + } else { + // Record each of the directories we stepped through as being part of + // the module we found, since the umbrella header covers them all. + for (unsigned I = 0, N = SkippedDirs.size(); I != N; ++I) + UmbrellaDirs[SkippedDirs[I]] = Result; + } + + KnownHeader Header(Result, NormalHeader); + Headers[File].push_back(Header); + return Header; + } + + return {}; +} + +ArrayRef<ModuleMap::KnownHeader> +ModuleMap::findAllModulesForHeader(const FileEntry *File) const { + resolveHeaderDirectives(File); + auto It = Headers.find(File); + if (It == Headers.end()) + return None; + return It->second; +} + +bool ModuleMap::isHeaderInUnavailableModule(const FileEntry *Header) const { + return isHeaderUnavailableInModule(Header, nullptr); +} + +bool +ModuleMap::isHeaderUnavailableInModule(const FileEntry *Header, + const Module *RequestingModule) const { + resolveHeaderDirectives(Header); + HeadersMap::const_iterator Known = Headers.find(Header); + if (Known != Headers.end()) { + for (SmallVectorImpl<KnownHeader>::const_iterator + I = Known->second.begin(), + E = Known->second.end(); + I != E; ++I) { + + if (I->isAvailable() && + (!RequestingModule || + I->getModule()->isSubModuleOf(RequestingModule))) { + // When 
no requesting module is available, the caller is looking if a + // header is part a module by only looking into the module map. This is + // done by warn_uncovered_module_header checks; don't consider textual + // headers part of it in this mode, otherwise we get misleading warnings + // that a umbrella header is not including a textual header. + if (!RequestingModule && I->getRole() == ModuleMap::TextualHeader) + continue; + return false; + } + } + return true; + } + + const DirectoryEntry *Dir = Header->getDir(); + SmallVector<const DirectoryEntry *, 2> SkippedDirs; + StringRef DirName = Dir->getName(); + + auto IsUnavailable = [&](const Module *M) { + return !M->isAvailable() && (!RequestingModule || + M->isSubModuleOf(RequestingModule)); + }; + + // Keep walking up the directory hierarchy, looking for a directory with + // an umbrella header. + do { + llvm::DenseMap<const DirectoryEntry *, Module *>::const_iterator KnownDir + = UmbrellaDirs.find(Dir); + if (KnownDir != UmbrellaDirs.end()) { + Module *Found = KnownDir->second; + if (IsUnavailable(Found)) + return true; + + // Search up the module stack until we find a module with an umbrella + // directory. + Module *UmbrellaModule = Found; + while (!UmbrellaModule->getUmbrellaDir() && UmbrellaModule->Parent) + UmbrellaModule = UmbrellaModule->Parent; + + if (UmbrellaModule->InferSubmodules) { + for (unsigned I = SkippedDirs.size(); I != 0; --I) { + // Find or create the module that corresponds to this directory name. + SmallString<32> NameBuf; + StringRef Name = sanitizeFilenameAsIdentifier( + llvm::sys::path::stem(SkippedDirs[I-1]->getName()), + NameBuf); + Found = lookupModuleQualified(Name, Found); + if (!Found) + return false; + if (IsUnavailable(Found)) + return true; + } + + // Infer a submodule with the same name as this header file. 
        // (continued) Infer a submodule named after this header file.
        SmallString<32> NameBuf;
        StringRef Name = sanitizeFilenameAsIdentifier(
                           llvm::sys::path::stem(Header->getName()),
                           NameBuf);
        Found = lookupModuleQualified(Name, Found);
        if (!Found)
          return false;
      }

      return IsUnavailable(Found);
    }

    SkippedDirs.push_back(Dir);

    // Retrieve our parent path.
    DirName = llvm::sys::path::parent_path(DirName);
    if (DirName.empty())
      break;

    // Resolve the parent path to a directory entry.
    if (auto DirEntry = SourceMgr.getFileManager().getDirectory(DirName))
      Dir = *DirEntry;
    else
      Dir = nullptr;
  } while (Dir);

  // No umbrella directory covered this header.
  return false;
}

// Look up a top-level module by name; returns null if unknown.
Module *ModuleMap::findModule(StringRef Name) const {
  llvm::StringMap<Module *>::const_iterator Known = Modules.find(Name);
  if (Known != Modules.end())
    return Known->getValue();

  return nullptr;
}

// Resolve a name by walking outward through the enclosing module contexts,
// falling back to a top-level lookup.
Module *ModuleMap::lookupModuleUnqualified(StringRef Name,
                                           Module *Context) const {
  for(; Context; Context = Context->Parent) {
    if (Module *Sub = lookupModuleQualified(Name, Context))
      return Sub;
  }

  return findModule(Name);
}

// Resolve a name within exactly one context (or top level when Context is
// null); does not walk parent contexts.
Module *ModuleMap::lookupModuleQualified(StringRef Name, Module *Context) const{
  if (!Context)
    return findModule(Name);

  return Context->findSubmodule(Name);
}

// Returns the module plus a flag that is true iff the module was newly
// created by this call.
std::pair<Module *, bool> ModuleMap::findOrCreateModule(StringRef Name,
                                                        Module *Parent,
                                                        bool IsFramework,
                                                        bool IsExplicit) {
  // Try to find an existing module with this name.
  if (Module *Sub = lookupModuleQualified(Name, Parent))
    return std::make_pair(Sub, false);

  // Create a new module with this name.
+ Module *Result = new Module(Name, SourceLocation(), Parent, IsFramework, + IsExplicit, NumCreatedModules++); + if (!Parent) { + if (LangOpts.CurrentModule == Name) + SourceModule = Result; + Modules[Name] = Result; + ModuleScopeIDs[Result] = CurrentModuleScopeID; + } + return std::make_pair(Result, true); +} + +Module *ModuleMap::createGlobalModuleFragmentForModuleUnit(SourceLocation Loc) { + PendingSubmodules.emplace_back( + new Module("<global>", Loc, nullptr, /*IsFramework*/ false, + /*IsExplicit*/ true, NumCreatedModules++)); + PendingSubmodules.back()->Kind = Module::GlobalModuleFragment; + return PendingSubmodules.back().get(); +} + +Module * +ModuleMap::createPrivateModuleFragmentForInterfaceUnit(Module *Parent, + SourceLocation Loc) { + auto *Result = + new Module("<private>", Loc, Parent, /*IsFramework*/ false, + /*IsExplicit*/ true, NumCreatedModules++); + Result->Kind = Module::PrivateModuleFragment; + return Result; +} + +Module *ModuleMap::createModuleForInterfaceUnit(SourceLocation Loc, + StringRef Name, + Module *GlobalModule) { + assert(LangOpts.CurrentModule == Name && "module name mismatch"); + assert(!Modules[Name] && "redefining existing module"); + + auto *Result = + new Module(Name, Loc, nullptr, /*IsFramework*/ false, + /*IsExplicit*/ false, NumCreatedModules++); + Result->Kind = Module::ModuleInterfaceUnit; + Modules[Name] = SourceModule = Result; + + // Reparent the current global module fragment as a submodule of this module. + for (auto &Submodule : PendingSubmodules) { + Submodule->setParent(Result); + Submodule.release(); // now owned by parent + } + PendingSubmodules.clear(); + + // Mark the main source file as being within the newly-created module so that + // declarations and macros are properly visibility-restricted to it. 
+ auto *MainFile = SourceMgr.getFileEntryForID(SourceMgr.getMainFileID()); + assert(MainFile && "no input file for module interface"); + Headers[MainFile].push_back(KnownHeader(Result, PrivateHeader)); + + return Result; +} + +Module *ModuleMap::createHeaderModule(StringRef Name, + ArrayRef<Module::Header> Headers) { + assert(LangOpts.CurrentModule == Name && "module name mismatch"); + assert(!Modules[Name] && "redefining existing module"); + + auto *Result = + new Module(Name, SourceLocation(), nullptr, /*IsFramework*/ false, + /*IsExplicit*/ false, NumCreatedModules++); + Result->Kind = Module::ModuleInterfaceUnit; + Modules[Name] = SourceModule = Result; + + for (const Module::Header &H : Headers) { + auto *M = new Module(H.NameAsWritten, SourceLocation(), Result, + /*IsFramework*/ false, + /*IsExplicit*/ true, NumCreatedModules++); + // Header modules are implicitly 'export *'. + M->Exports.push_back(Module::ExportDecl(nullptr, true)); + addHeader(M, H, NormalHeader); + } + + return Result; +} + +/// For a framework module, infer the framework against which we +/// should link. +static void inferFrameworkLink(Module *Mod, const DirectoryEntry *FrameworkDir, + FileManager &FileMgr) { + assert(Mod->IsFramework && "Can only infer linking for framework modules"); + assert(!Mod->isSubFramework() && + "Can only infer linking for top-level frameworks"); + + SmallString<128> LibName; + LibName += FrameworkDir->getName(); + llvm::sys::path::append(LibName, Mod->Name); + + // The library name of a framework has more than one possible extension since + // the introduction of the text-based dynamic library format. We need to check + // for both before we give up. 
  // (continued) Try the binary framework name first, then the text-based
  // dynamic library (.tbd) form.
  for (const char *extension : {"", ".tbd"}) {
    llvm::sys::path::replace_extension(LibName, extension);
    if (FileMgr.getFile(LibName)) {
      Mod->LinkLibraries.push_back(Module::LinkLibrary(Mod->Name,
                                                       /*IsFramework=*/true));
      return;
    }
  }
}

// Convenience overload: wraps the IsSystem flag in an Attributes struct and
// forwards to the main implementation below.
Module *ModuleMap::inferFrameworkModule(const DirectoryEntry *FrameworkDir,
                                        bool IsSystem, Module *Parent) {
  Attributes Attrs;
  Attrs.IsSystem = IsSystem;
  return inferFrameworkModule(FrameworkDir, Attrs, Parent);
}

Module *ModuleMap::inferFrameworkModule(const DirectoryEntry *FrameworkDir,
                                        Attributes Attrs, Module *Parent) {
  // Note: as an egregious but useful hack we use the real path here, because
  // we might be looking at an embedded framework that symlinks out to a
  // top-level framework, and we need to infer as if we were naming the
  // top-level framework.
  StringRef FrameworkDirName =
      SourceMgr.getFileManager().getCanonicalName(FrameworkDir);

  // In case this is a case-insensitive filesystem, use the canonical
  // directory name as the ModuleName, since modules are case-sensitive.
  // FIXME: we should be able to give a fix-it hint for the correct spelling.
  SmallString<32> ModuleNameStorage;
  StringRef ModuleName = sanitizeFilenameAsIdentifier(
      llvm::sys::path::stem(FrameworkDirName), ModuleNameStorage);

  // Check whether we've already found this module.
  if (Module *Mod = lookupModuleQualified(ModuleName, Parent))
    return Mod;

  FileManager &FileMgr = SourceMgr.getFileManager();

  // If the framework has a parent path from which we're allowed to infer
  // a framework module, do so.
  const FileEntry *ModuleMapFile = nullptr;
  if (!Parent) {
    // Determine whether we're allowed to infer a module map.
    bool canInfer = false;
    if (llvm::sys::path::has_parent_path(FrameworkDirName)) {
      // Figure out the parent path.
+ StringRef Parent = llvm::sys::path::parent_path(FrameworkDirName); + if (auto ParentDir = FileMgr.getDirectory(Parent)) { + // Check whether we have already looked into the parent directory + // for a module map. + llvm::DenseMap<const DirectoryEntry *, InferredDirectory>::const_iterator + inferred = InferredDirectories.find(*ParentDir); + if (inferred == InferredDirectories.end()) { + // We haven't looked here before. Load a module map, if there is + // one. + bool IsFrameworkDir = Parent.endswith(".framework"); + if (const FileEntry *ModMapFile = + HeaderInfo.lookupModuleMapFile(*ParentDir, IsFrameworkDir)) { + parseModuleMapFile(ModMapFile, Attrs.IsSystem, *ParentDir); + inferred = InferredDirectories.find(*ParentDir); + } + + if (inferred == InferredDirectories.end()) + inferred = InferredDirectories.insert( + std::make_pair(*ParentDir, InferredDirectory())).first; + } + + if (inferred->second.InferModules) { + // We're allowed to infer for this directory, but make sure it's okay + // to infer this particular module. + StringRef Name = llvm::sys::path::stem(FrameworkDirName); + canInfer = std::find(inferred->second.ExcludedModules.begin(), + inferred->second.ExcludedModules.end(), + Name) == inferred->second.ExcludedModules.end(); + + Attrs.IsSystem |= inferred->second.Attrs.IsSystem; + Attrs.IsExternC |= inferred->second.Attrs.IsExternC; + Attrs.IsExhaustive |= inferred->second.Attrs.IsExhaustive; + Attrs.NoUndeclaredIncludes |= + inferred->second.Attrs.NoUndeclaredIncludes; + ModuleMapFile = inferred->second.ModuleMapFile; + } + } + } + + // If we're not allowed to infer a framework module, don't. + if (!canInfer) + return nullptr; + } else + ModuleMapFile = getModuleMapFileForUniquing(Parent); + + + // Look for an umbrella header. 
+ SmallString<128> UmbrellaName = StringRef(FrameworkDir->getName()); + llvm::sys::path::append(UmbrellaName, "Headers", ModuleName + ".h"); + auto UmbrellaHeader = FileMgr.getFile(UmbrellaName); + + // FIXME: If there's no umbrella header, we could probably scan the + // framework to load *everything*. But, it's not clear that this is a good + // idea. + if (!UmbrellaHeader) + return nullptr; + + Module *Result = new Module(ModuleName, SourceLocation(), Parent, + /*IsFramework=*/true, /*IsExplicit=*/false, + NumCreatedModules++); + InferredModuleAllowedBy[Result] = ModuleMapFile; + Result->IsInferred = true; + if (!Parent) { + if (LangOpts.CurrentModule == ModuleName) + SourceModule = Result; + Modules[ModuleName] = Result; + ModuleScopeIDs[Result] = CurrentModuleScopeID; + } + + Result->IsSystem |= Attrs.IsSystem; + Result->IsExternC |= Attrs.IsExternC; + Result->ConfigMacrosExhaustive |= Attrs.IsExhaustive; + Result->NoUndeclaredIncludes |= Attrs.NoUndeclaredIncludes; + Result->Directory = FrameworkDir; + + // umbrella header "umbrella-header-name" + // + // The "Headers/" component of the name is implied because this is + // a framework module. + setUmbrellaHeader(Result, *UmbrellaHeader, ModuleName + ".h"); + + // export * + Result->Exports.push_back(Module::ExportDecl(nullptr, true)); + + // module * { export * } + Result->InferSubmodules = true; + Result->InferExportWildcard = true; + + // Look for subframeworks. 
+ std::error_code EC; + SmallString<128> SubframeworksDirName + = StringRef(FrameworkDir->getName()); + llvm::sys::path::append(SubframeworksDirName, "Frameworks"); + llvm::sys::path::native(SubframeworksDirName); + llvm::vfs::FileSystem &FS = FileMgr.getVirtualFileSystem(); + for (llvm::vfs::directory_iterator + Dir = FS.dir_begin(SubframeworksDirName, EC), + DirEnd; + Dir != DirEnd && !EC; Dir.increment(EC)) { + if (!StringRef(Dir->path()).endswith(".framework")) + continue; + + if (auto SubframeworkDir = + FileMgr.getDirectory(Dir->path())) { + // Note: as an egregious but useful hack, we use the real path here and + // check whether it is actually a subdirectory of the parent directory. + // This will not be the case if the 'subframework' is actually a symlink + // out to a top-level framework. + StringRef SubframeworkDirName = + FileMgr.getCanonicalName(*SubframeworkDir); + bool FoundParent = false; + do { + // Get the parent directory name. + SubframeworkDirName + = llvm::sys::path::parent_path(SubframeworkDirName); + if (SubframeworkDirName.empty()) + break; + + if (auto SubDir = FileMgr.getDirectory(SubframeworkDirName)) { + if (*SubDir == FrameworkDir) { + FoundParent = true; + break; + } + } + } while (true); + + if (!FoundParent) + continue; + + // FIXME: Do we want to warn about subframeworks without umbrella headers? + inferFrameworkModule(*SubframeworkDir, Attrs, Result); + } + } + + // If the module is a top-level framework, automatically link against the + // framework. + if (!Result->isSubFramework()) { + inferFrameworkLink(Result, FrameworkDir, FileMgr); + } + + return Result; +} + +Module *ModuleMap::createShadowedModule(StringRef Name, bool IsFramework, + Module *ShadowingModule) { + + // Create a new module with this name. 
+ Module *Result = + new Module(Name, SourceLocation(), /*Parent=*/nullptr, IsFramework, + /*IsExplicit=*/false, NumCreatedModules++); + Result->ShadowingModule = ShadowingModule; + Result->IsAvailable = false; + ModuleScopeIDs[Result] = CurrentModuleScopeID; + ShadowModules.push_back(Result); + + return Result; +} + +void ModuleMap::setUmbrellaHeader(Module *Mod, const FileEntry *UmbrellaHeader, + Twine NameAsWritten) { + Headers[UmbrellaHeader].push_back(KnownHeader(Mod, NormalHeader)); + Mod->Umbrella = UmbrellaHeader; + Mod->UmbrellaAsWritten = NameAsWritten.str(); + UmbrellaDirs[UmbrellaHeader->getDir()] = Mod; + + // Notify callbacks that we just added a new header. + for (const auto &Cb : Callbacks) + Cb->moduleMapAddUmbrellaHeader(&SourceMgr.getFileManager(), UmbrellaHeader); +} + +void ModuleMap::setUmbrellaDir(Module *Mod, const DirectoryEntry *UmbrellaDir, + Twine NameAsWritten) { + Mod->Umbrella = UmbrellaDir; + Mod->UmbrellaAsWritten = NameAsWritten.str(); + UmbrellaDirs[UmbrellaDir] = Mod; +} + +void ModuleMap::addUnresolvedHeader(Module *Mod, + Module::UnresolvedHeaderDirective Header, + bool &NeedsFramework) { + // If there is a builtin counterpart to this file, add it now so it can + // wrap the system header. + if (resolveAsBuiltinHeader(Mod, Header)) { + // If we have both a builtin and system version of the file, the + // builtin version may want to inject macros into the system header, so + // force the system header to be treated as a textual header in this + // case. + Header.Kind = headerRoleToKind(ModuleMap::ModuleHeaderRole( + headerKindToRole(Header.Kind) | ModuleMap::TextualHeader)); + Header.HasBuiltinHeader = true; + } + + // If possible, don't stat the header until we need to. This requires the + // user to have provided us with some stat information about the file. + // FIXME: Add support for lazily stat'ing umbrella headers and excluded + // headers. 
  // (continued) Defer the stat if the module map supplied size/mtime data for
  // a normal, non-umbrella, non-excluded header.
  if ((Header.Size || Header.ModTime) && !Header.IsUmbrella &&
      Header.Kind != Module::HK_Excluded) {
    // We expect more variation in mtime than size, so if we're given both,
    // use the mtime as the key.
    if (Header.ModTime)
      LazyHeadersByModTime[*Header.ModTime].push_back(Mod);
    else
      LazyHeadersBySize[*Header.Size].push_back(Mod);
    Mod->UnresolvedHeaders.push_back(Header);
    return;
  }

  // We don't have stat information or can't defer looking this file up.
  // Perform the lookup now.
  resolveHeader(Mod, Header, NeedsFramework);
}

// Resolve any lazily-deferred header directives that could match this file,
// keyed by its size and its modification time; matching entries are removed
// from the lazy maps once resolved.
void ModuleMap::resolveHeaderDirectives(const FileEntry *File) const {
  auto BySize = LazyHeadersBySize.find(File->getSize());
  if (BySize != LazyHeadersBySize.end()) {
    for (auto *M : BySize->second)
      resolveHeaderDirectives(M);
    LazyHeadersBySize.erase(BySize);
  }

  auto ByModTime = LazyHeadersByModTime.find(File->getModificationTime());
  if (ByModTime != LazyHeadersByModTime.end()) {
    for (auto *M : ByModTime->second)
      resolveHeaderDirectives(M);
    LazyHeadersByModTime.erase(ByModTime);
  }
}

// Eagerly resolve every unresolved header directive in the given module.
void ModuleMap::resolveHeaderDirectives(Module *Mod) const {
  bool NeedsFramework = false;
  for (auto &Header : Mod->UnresolvedHeaders)
    // This operation is logically const; we're just changing how we represent
    // the header information for this file.
    const_cast<ModuleMap*>(this)->resolveHeader(Mod, Header, NeedsFramework);
  Mod->UnresolvedHeaders.clear();
}

void ModuleMap::addHeader(Module *Mod, Module::Header Header,
                          ModuleHeaderRole Role, bool Imported) {
  KnownHeader KH(Mod, Role);

  // Only add each header to the headers list once.
  // FIXME: Should we diagnose if a header is listed twice in the
  // same module definition?
+ auto &HeaderList = Headers[Header.Entry]; + for (auto H : HeaderList) + if (H == KH) + return; + + HeaderList.push_back(KH); + Mod->Headers[headerRoleToKind(Role)].push_back(Header); + + bool isCompilingModuleHeader = + LangOpts.isCompilingModule() && Mod->getTopLevelModule() == SourceModule; + if (!Imported || isCompilingModuleHeader) { + // When we import HeaderFileInfo, the external source is expected to + // set the isModuleHeader flag itself. + HeaderInfo.MarkFileModuleHeader(Header.Entry, Role, + isCompilingModuleHeader); + } + + // Notify callbacks that we just added a new header. + for (const auto &Cb : Callbacks) + Cb->moduleMapAddHeader(Header.Entry->getName()); +} + +void ModuleMap::excludeHeader(Module *Mod, Module::Header Header) { + // Add this as a known header so we won't implicitly add it to any + // umbrella directory module. + // FIXME: Should we only exclude it from umbrella modules within the + // specified module? + (void) Headers[Header.Entry]; + + Mod->Headers[Module::HK_Excluded].push_back(std::move(Header)); +} + +const FileEntry * +ModuleMap::getContainingModuleMapFile(const Module *Module) const { + if (Module->DefinitionLoc.isInvalid()) + return nullptr; + + return SourceMgr.getFileEntryForID( + SourceMgr.getFileID(Module->DefinitionLoc)); +} + +const FileEntry *ModuleMap::getModuleMapFileForUniquing(const Module *M) const { + if (M->IsInferred) { + assert(InferredModuleAllowedBy.count(M) && "missing inferred module map"); + return InferredModuleAllowedBy.find(M)->second; + } + return getContainingModuleMapFile(M); +} + +void ModuleMap::setInferredModuleAllowedBy(Module *M, const FileEntry *ModMap) { + assert(M->IsInferred && "module not inferred"); + InferredModuleAllowedBy[M] = ModMap; +} + +LLVM_DUMP_METHOD void ModuleMap::dump() { + llvm::errs() << "Modules:"; + for (llvm::StringMap<Module *>::iterator M = Modules.begin(), + MEnd = Modules.end(); + M != MEnd; ++M) + M->getValue()->print(llvm::errs(), 2); + + llvm::errs() << 
               "Headers:";
  // Dump the header -> owning-module(s) mapping.
  for (HeadersMap::iterator H = Headers.begin(), HEnd = Headers.end();
       H != HEnd; ++H) {
    llvm::errs() << " \"" << H->first->getName() << "\" -> ";
    for (SmallVectorImpl<KnownHeader>::const_iterator I = H->second.begin(),
                                                      E = H->second.end();
         I != E; ++I) {
      if (I != H->second.begin())
        llvm::errs() << ",";
      llvm::errs() << I->getModule()->getFullModuleName();
    }
    llvm::errs() << "\n";
  }
}

// Attempt to resolve this module's unresolved export declarations; anything
// that still fails to resolve is kept for a later attempt. Returns true if
// any exports remain unresolved.
bool ModuleMap::resolveExports(Module *Mod, bool Complain) {
  auto Unresolved = std::move(Mod->UnresolvedExports);
  Mod->UnresolvedExports.clear();
  for (auto &UE : Unresolved) {
    Module::ExportDecl Export = resolveExport(Mod, UE, Complain);
    if (Export.getPointer() || Export.getInt())
      Mod->Exports.push_back(Export);
    else
      Mod->UnresolvedExports.push_back(UE);
  }
  return !Mod->UnresolvedExports.empty();
}

// Attempt to resolve this module's unresolved 'use' declarations; returns
// true if any remain unresolved.
bool ModuleMap::resolveUses(Module *Mod, bool Complain) {
  auto Unresolved = std::move(Mod->UnresolvedDirectUses);
  Mod->UnresolvedDirectUses.clear();
  for (auto &UDU : Unresolved) {
    Module *DirectUse = resolveModuleId(UDU, Mod, Complain);
    if (DirectUse)
      Mod->DirectUses.push_back(DirectUse);
    else
      Mod->UnresolvedDirectUses.push_back(UDU);
  }
  return !Mod->UnresolvedDirectUses.empty();
}

// Attempt to resolve this module's unresolved conflict declarations; returns
// true if any remain unresolved.
bool ModuleMap::resolveConflicts(Module *Mod, bool Complain) {
  auto Unresolved = std::move(Mod->UnresolvedConflicts);
  Mod->UnresolvedConflicts.clear();
  for (auto &UC : Unresolved) {
    if (Module *OtherMod = resolveModuleId(UC.Id, Mod, Complain)) {
      Module::Conflict Conflict;
      Conflict.Other = OtherMod;
      Conflict.Message = UC.Message;
      Mod->Conflicts.push_back(Conflict);
    } else
      Mod->UnresolvedConflicts.push_back(UC);
  }
  return !Mod->UnresolvedConflicts.empty();
}

//----------------------------------------------------------------------------//
// Module map file parser
//----------------------------------------------------------------------------//

namespace clang {

  /// A token in a module map file.
+ struct MMToken { + enum TokenKind { + Comma, + ConfigMacros, + Conflict, + EndOfFile, + HeaderKeyword, + Identifier, + Exclaim, + ExcludeKeyword, + ExplicitKeyword, + ExportKeyword, + ExportAsKeyword, + ExternKeyword, + FrameworkKeyword, + LinkKeyword, + ModuleKeyword, + Period, + PrivateKeyword, + UmbrellaKeyword, + UseKeyword, + RequiresKeyword, + Star, + StringLiteral, + IntegerLiteral, + TextualKeyword, + LBrace, + RBrace, + LSquare, + RSquare + } Kind; + + unsigned Location; + unsigned StringLength; + union { + // If Kind != IntegerLiteral. + const char *StringData; + + // If Kind == IntegerLiteral. + uint64_t IntegerValue; + }; + + void clear() { + Kind = EndOfFile; + Location = 0; + StringLength = 0; + StringData = nullptr; + } + + bool is(TokenKind K) const { return Kind == K; } + + SourceLocation getLocation() const { + return SourceLocation::getFromRawEncoding(Location); + } + + uint64_t getInteger() const { + return Kind == IntegerLiteral ? IntegerValue : 0; + } + + StringRef getString() const { + return Kind == IntegerLiteral ? StringRef() + : StringRef(StringData, StringLength); + } + }; + + class ModuleMapParser { + Lexer &L; + SourceManager &SourceMgr; + + /// Default target information, used only for string literal + /// parsing. + const TargetInfo *Target; + + DiagnosticsEngine &Diags; + ModuleMap ⤅ + + /// The current module map file. + const FileEntry *ModuleMapFile; + + /// Source location of most recent parsed module declaration + SourceLocation CurrModuleDeclLoc; + + /// The directory that file names in this module map file should + /// be resolved relative to. + const DirectoryEntry *Directory; + + /// Whether this module map is in a system header directory. + bool IsSystem; + + /// Whether an error occurred. + bool HadError = false; + + /// Stores string data for the various string literals referenced + /// during parsing. + llvm::BumpPtrAllocator StringData; + + /// The current token. + MMToken Tok; + + /// The active module. 
+ Module *ActiveModule = nullptr; + + /// Whether a module uses the 'requires excluded' hack to mark its + /// contents as 'textual'. + /// + /// On older Darwin SDK versions, 'requires excluded' is used to mark the + /// contents of the Darwin.C.excluded (assert.h) and Tcl.Private modules as + /// non-modular headers. For backwards compatibility, we continue to + /// support this idiom for just these modules, and map the headers to + /// 'textual' to match the original intent. + llvm::SmallPtrSet<Module *, 2> UsesRequiresExcludedHack; + + /// Consume the current token and return its location. + SourceLocation consumeToken(); + + /// Skip tokens until we reach the a token with the given kind + /// (or the end of the file). + void skipUntil(MMToken::TokenKind K); + + using ModuleId = SmallVector<std::pair<std::string, SourceLocation>, 2>; + + bool parseModuleId(ModuleId &Id); + void parseModuleDecl(); + void parseExternModuleDecl(); + void parseRequiresDecl(); + void parseHeaderDecl(MMToken::TokenKind, SourceLocation LeadingLoc); + void parseUmbrellaDirDecl(SourceLocation UmbrellaLoc); + void parseExportDecl(); + void parseExportAsDecl(); + void parseUseDecl(); + void parseLinkDecl(); + void parseConfigMacros(); + void parseConflict(); + void parseInferredModuleDecl(bool Framework, bool Explicit); + + /// Private modules are canonicalized as Foo_Private. Clang provides extra + /// module map search logic to find the appropriate private module when PCH + /// is used with implicit module maps. Warn when private modules are written + /// in other ways (FooPrivate and Foo.Private), providing notes and fixits. 
+ void diagnosePrivateModules(SourceLocation ExplicitLoc, + SourceLocation FrameworkLoc); + + using Attributes = ModuleMap::Attributes; + + bool parseOptionalAttributes(Attributes &Attrs); + + public: + explicit ModuleMapParser(Lexer &L, SourceManager &SourceMgr, + const TargetInfo *Target, DiagnosticsEngine &Diags, + ModuleMap &Map, const FileEntry *ModuleMapFile, + const DirectoryEntry *Directory, bool IsSystem) + : L(L), SourceMgr(SourceMgr), Target(Target), Diags(Diags), Map(Map), + ModuleMapFile(ModuleMapFile), Directory(Directory), + IsSystem(IsSystem) { + Tok.clear(); + consumeToken(); + } + + bool parseModuleMapFile(); + + bool terminatedByDirective() { return false; } + SourceLocation getLocation() { return Tok.getLocation(); } + }; + +} // namespace clang + +SourceLocation ModuleMapParser::consumeToken() { + SourceLocation Result = Tok.getLocation(); + +retry: + Tok.clear(); + Token LToken; + L.LexFromRawLexer(LToken); + Tok.Location = LToken.getLocation().getRawEncoding(); + switch (LToken.getKind()) { + case tok::raw_identifier: { + StringRef RI = LToken.getRawIdentifier(); + Tok.StringData = RI.data(); + Tok.StringLength = RI.size(); + Tok.Kind = llvm::StringSwitch<MMToken::TokenKind>(RI) + .Case("config_macros", MMToken::ConfigMacros) + .Case("conflict", MMToken::Conflict) + .Case("exclude", MMToken::ExcludeKeyword) + .Case("explicit", MMToken::ExplicitKeyword) + .Case("export", MMToken::ExportKeyword) + .Case("export_as", MMToken::ExportAsKeyword) + .Case("extern", MMToken::ExternKeyword) + .Case("framework", MMToken::FrameworkKeyword) + .Case("header", MMToken::HeaderKeyword) + .Case("link", MMToken::LinkKeyword) + .Case("module", MMToken::ModuleKeyword) + .Case("private", MMToken::PrivateKeyword) + .Case("requires", MMToken::RequiresKeyword) + .Case("textual", MMToken::TextualKeyword) + .Case("umbrella", MMToken::UmbrellaKeyword) + .Case("use", MMToken::UseKeyword) + .Default(MMToken::Identifier); + break; + } + + case tok::comma: + Tok.Kind = 
MMToken::Comma; + break; + + case tok::eof: + Tok.Kind = MMToken::EndOfFile; + break; + + case tok::l_brace: + Tok.Kind = MMToken::LBrace; + break; + + case tok::l_square: + Tok.Kind = MMToken::LSquare; + break; + + case tok::period: + Tok.Kind = MMToken::Period; + break; + + case tok::r_brace: + Tok.Kind = MMToken::RBrace; + break; + + case tok::r_square: + Tok.Kind = MMToken::RSquare; + break; + + case tok::star: + Tok.Kind = MMToken::Star; + break; + + case tok::exclaim: + Tok.Kind = MMToken::Exclaim; + break; + + case tok::string_literal: { + if (LToken.hasUDSuffix()) { + Diags.Report(LToken.getLocation(), diag::err_invalid_string_udl); + HadError = true; + goto retry; + } + + // Parse the string literal. + LangOptions LangOpts; + StringLiteralParser StringLiteral(LToken, SourceMgr, LangOpts, *Target); + if (StringLiteral.hadError) + goto retry; + + // Copy the string literal into our string data allocator. + unsigned Length = StringLiteral.GetStringLength(); + char *Saved = StringData.Allocate<char>(Length + 1); + memcpy(Saved, StringLiteral.GetString().data(), Length); + Saved[Length] = 0; + + // Form the token. + Tok.Kind = MMToken::StringLiteral; + Tok.StringData = Saved; + Tok.StringLength = Length; + break; + } + + case tok::numeric_constant: { + // We don't support any suffixes or other complications. 
+ SmallString<32> SpellingBuffer; + SpellingBuffer.resize(LToken.getLength() + 1); + const char *Start = SpellingBuffer.data(); + unsigned Length = + Lexer::getSpelling(LToken, Start, SourceMgr, L.getLangOpts()); + uint64_t Value; + if (StringRef(Start, Length).getAsInteger(0, Value)) { + Diags.Report(Tok.getLocation(), diag::err_mmap_unknown_token); + HadError = true; + goto retry; + } + + Tok.Kind = MMToken::IntegerLiteral; + Tok.IntegerValue = Value; + break; + } + + case tok::comment: + goto retry; + + case tok::hash: + // A module map can be terminated prematurely by + // #pragma clang module contents + // When building the module, we'll treat the rest of the file as the + // contents of the module. + { + auto NextIsIdent = [&](StringRef Str) -> bool { + L.LexFromRawLexer(LToken); + return !LToken.isAtStartOfLine() && LToken.is(tok::raw_identifier) && + LToken.getRawIdentifier() == Str; + }; + if (NextIsIdent("pragma") && NextIsIdent("clang") && + NextIsIdent("module") && NextIsIdent("contents")) { + Tok.Kind = MMToken::EndOfFile; + break; + } + } + LLVM_FALLTHROUGH; + + default: + Diags.Report(Tok.getLocation(), diag::err_mmap_unknown_token); + HadError = true; + goto retry; + } + + return Result; +} + +void ModuleMapParser::skipUntil(MMToken::TokenKind K) { + unsigned braceDepth = 0; + unsigned squareDepth = 0; + do { + switch (Tok.Kind) { + case MMToken::EndOfFile: + return; + + case MMToken::LBrace: + if (Tok.is(K) && braceDepth == 0 && squareDepth == 0) + return; + + ++braceDepth; + break; + + case MMToken::LSquare: + if (Tok.is(K) && braceDepth == 0 && squareDepth == 0) + return; + + ++squareDepth; + break; + + case MMToken::RBrace: + if (braceDepth > 0) + --braceDepth; + else if (Tok.is(K)) + return; + break; + + case MMToken::RSquare: + if (squareDepth > 0) + --squareDepth; + else if (Tok.is(K)) + return; + break; + + default: + if (braceDepth == 0 && squareDepth == 0 && Tok.is(K)) + return; + break; + } + + consumeToken(); + } while (true); +} + +/// 
Parse a module-id.
///
/// module-id:
///   identifier
///   identifier '.' module-id
///
/// \returns true if an error occurred, false otherwise.
bool ModuleMapParser::parseModuleId(ModuleId &Id) {
  Id.clear();
  do {
    // Each dotted component may be a plain identifier or a string literal
    // (the latter lets module names collide with map-language keywords).
    if (Tok.is(MMToken::Identifier) || Tok.is(MMToken::StringLiteral)) {
      Id.push_back(std::make_pair(Tok.getString(), Tok.getLocation()));
      consumeToken();
    } else {
      Diags.Report(Tok.getLocation(), diag::err_mmap_expected_module_name);
      return true;
    }

    // A '.' continues the dotted path; anything else ends the module-id.
    if (!Tok.is(MMToken::Period))
      break;

    consumeToken();
  } while (true);

  return false;
}

namespace {

  /// Enumerates the known attributes.
  enum AttributeKind {
    /// An unknown attribute.
    AT_unknown,

    /// The 'system' attribute.
    AT_system,

    /// The 'extern_c' attribute.
    AT_extern_c,

    /// The 'exhaustive' attribute.
    AT_exhaustive,

    /// The 'no_undeclared_includes' attribute.
    AT_no_undeclared_includes
  };

} // namespace

/// Private modules are canonicalized as Foo_Private. Clang provides extra
/// module map search logic to find the appropriate private module when PCH
/// is used with implicit module maps. Warn when private modules are written
/// in other ways (FooPrivate and Foo.Private), providing notes and fixits.
void ModuleMapParser::diagnosePrivateModules(SourceLocation ExplicitLoc,
                                             SourceLocation FrameworkLoc) {
  // Emit the "rename to Foo_Private" note plus a fix-it replacing ReplLoc
  // with the canonical spelling.
  auto GenNoteAndFixIt = [&](StringRef BadName, StringRef Canonical,
                             const Module *M, SourceRange ReplLoc) {
    auto D = Diags.Report(ActiveModule->DefinitionLoc,
                          diag::note_mmap_rename_top_level_private_module);
    D << BadName << M->Name;
    D << FixItHint::CreateReplacement(ReplLoc, Canonical);
  };

  // Scan all known modules for a "public" sibling living in the same
  // directory whose name suggests this module is its private counterpart.
  for (auto E = Map.module_begin(); E != Map.module_end(); ++E) {
    auto const *M = E->getValue();
    if (M->Directory != ActiveModule->Directory)
      continue;

    SmallString<128> FullName(ActiveModule->getFullModuleName());
    if (!FullName.startswith(M->Name) && !FullName.endswith("Private"))
      continue;
    SmallString<128> FixedPrivModDecl;
    SmallString<128> Canonical(M->Name);
    Canonical.append("_Private");

    // Foo.Private -> Foo_Private
    if (ActiveModule->Parent && ActiveModule->Name == "Private" && !M->Parent &&
        M->Name == ActiveModule->Parent->Name) {
      Diags.Report(ActiveModule->DefinitionLoc,
                   diag::warn_mmap_mismatched_private_submodule)
          << FullName;

      // The fix-it must cover any 'framework'/'explicit' keywords that
      // precede the 'module' keyword, so start at the earliest one present.
      SourceLocation FixItInitBegin = CurrModuleDeclLoc;
      if (FrameworkLoc.isValid())
        FixItInitBegin = FrameworkLoc;
      if (ExplicitLoc.isValid())
        FixItInitBegin = ExplicitLoc;

      if (FrameworkLoc.isValid() || ActiveModule->Parent->IsFramework)
        FixedPrivModDecl.append("framework ");
      FixedPrivModDecl.append("module ");
      FixedPrivModDecl.append(Canonical);

      GenNoteAndFixIt(FullName, FixedPrivModDecl, M,
                      SourceRange(FixItInitBegin, ActiveModule->DefinitionLoc));
      continue;
    }

    // FooPrivate and whatnots -> Foo_Private
    if (!ActiveModule->Parent && !M->Parent && M->Name != ActiveModule->Name &&
        ActiveModule->Name != Canonical) {
      Diags.Report(ActiveModule->DefinitionLoc,
                   diag::warn_mmap_mismatched_private_module_name)
          << ActiveModule->Name;
      GenNoteAndFixIt(ActiveModule->Name, Canonical, M,
                      SourceRange(ActiveModule->DefinitionLoc));
    }
  }
}

/// Parse a module
declaration.
///
/// module-declaration:
///   'extern' 'module' module-id string-literal
///   'explicit'[opt] 'framework'[opt] 'module' module-id attributes[opt]
///     { module-member* }
///
/// module-member:
///   requires-declaration
///   header-declaration
///   submodule-declaration
///   export-declaration
///   export-as-declaration
///   link-declaration
///
/// submodule-declaration:
///   module-declaration
///   inferred-submodule-declaration
void ModuleMapParser::parseModuleDecl() {
  assert(Tok.is(MMToken::ExplicitKeyword) || Tok.is(MMToken::ModuleKeyword) ||
         Tok.is(MMToken::FrameworkKeyword) || Tok.is(MMToken::ExternKeyword));
  if (Tok.is(MMToken::ExternKeyword)) {
    parseExternModuleDecl();
    return;
  }

  // Parse 'explicit' or 'framework' keyword, if present.
  SourceLocation ExplicitLoc;
  SourceLocation FrameworkLoc;
  bool Explicit = false;
  bool Framework = false;

  // Parse 'explicit' keyword, if present.
  if (Tok.is(MMToken::ExplicitKeyword)) {
    ExplicitLoc = consumeToken();
    Explicit = true;
  }

  // Parse 'framework' keyword, if present.
  if (Tok.is(MMToken::FrameworkKeyword)) {
    FrameworkLoc = consumeToken();
    Framework = true;
  }

  // Parse 'module' keyword.
  if (!Tok.is(MMToken::ModuleKeyword)) {
    Diags.Report(Tok.getLocation(), diag::err_mmap_expected_module);
    consumeToken();
    HadError = true;
    return;
  }
  CurrModuleDeclLoc = consumeToken(); // 'module' keyword

  // If we have a wildcard for the module name, this is an inferred submodule.
  // Parse it.
  if (Tok.is(MMToken::Star))
    return parseInferredModuleDecl(Framework, Explicit);

  // Parse the module name.
  ModuleId Id;
  if (parseModuleId(Id)) {
    HadError = true;
    return;
  }

  if (ActiveModule) {
    // A dotted name is only legal at the top level; submodules must use a
    // single identifier.
    if (Id.size() > 1) {
      Diags.Report(Id.front().second, diag::err_mmap_nested_submodule_id)
          << SourceRange(Id.front().second, Id.back().second);

      HadError = true;
      return;
    }
  } else if (Id.size() == 1 && Explicit) {
    // Top-level modules can't be explicit.
    Diags.Report(ExplicitLoc, diag::err_mmap_explicit_top_level);
    Explicit = false;
    ExplicitLoc = SourceLocation();
    HadError = true;
  }

  Module *PreviousActiveModule = ActiveModule;
  if (Id.size() > 1) {
    // This module map defines a submodule. Go find the module of which it
    // is a submodule, walking each qualifier of the dotted name in turn.
    ActiveModule = nullptr;
    const Module *TopLevelModule = nullptr;
    for (unsigned I = 0, N = Id.size() - 1; I != N; ++I) {
      if (Module *Next = Map.lookupModuleQualified(Id[I].first, ActiveModule)) {
        if (I == 0)
          TopLevelModule = Next;
        ActiveModule = Next;
        continue;
      }

      if (ActiveModule) {
        Diags.Report(Id[I].second, diag::err_mmap_missing_module_qualified)
            << Id[I].first
            << ActiveModule->getTopLevelModule()->getFullModuleName();
      } else {
        Diags.Report(Id[I].second, diag::err_mmap_expected_module_name);
      }
      HadError = true;
      return;
    }

    // The submodule is declared in a different module map file than its
    // top-level module; record the extra file as a dependency of the module.
    if (ModuleMapFile != Map.getContainingModuleMapFile(TopLevelModule)) {
      assert(ModuleMapFile != Map.getModuleMapFileForUniquing(TopLevelModule) &&
             "submodule defined in same file as 'module *' that allowed its "
             "top-level module");
      Map.addAdditionalModuleMapFile(TopLevelModule, ModuleMapFile);
    }
  }

  StringRef ModuleName = Id.back().first;
  SourceLocation ModuleNameLoc = Id.back().second;

  // Parse the optional attribute list.
  Attributes Attrs;
  if (parseOptionalAttributes(Attrs))
    return;

  // Parse the opening brace.
  if (!Tok.is(MMToken::LBrace)) {
    Diags.Report(Tok.getLocation(), diag::err_mmap_expected_lbrace)
        << ModuleName;
    HadError = true;
    return;
  }
  SourceLocation LBraceLoc = consumeToken();

  // Determine whether this (sub)module has already been defined.
  Module *ShadowingModule = nullptr;
  if (Module *Existing = Map.lookupModuleQualified(ModuleName, ActiveModule)) {
    // We might see a (re)definition of a module that we already have a
    // definition for in two cases:
    //  - If we loaded one definition from an AST file and we've just found a
    //    corresponding definition in a module map file, or
    bool LoadedFromASTFile = Existing->DefinitionLoc.isInvalid();
    //  - If we're building a (preprocessed) module and we've just loaded the
    //    module map file from which it was created.
    bool ParsedAsMainInput =
        Map.LangOpts.getCompilingModule() == LangOptions::CMK_ModuleMap &&
        Map.LangOpts.CurrentModule == ModuleName &&
        SourceMgr.getDecomposedLoc(ModuleNameLoc).first !=
            SourceMgr.getDecomposedLoc(Existing->DefinitionLoc).first;
    if (!ActiveModule && (LoadedFromASTFile || ParsedAsMainInput)) {
      // Skip the module definition.
      skipUntil(MMToken::RBrace);
      if (Tok.is(MMToken::RBrace))
        consumeToken();
      else {
        Diags.Report(Tok.getLocation(), diag::err_mmap_expected_rbrace);
        Diags.Report(LBraceLoc, diag::note_mmap_lbrace_match);
        HadError = true;
      }
      return;
    }

    if (!Existing->Parent && Map.mayShadowNewModule(Existing)) {
      ShadowingModule = Existing;
    } else {
      // This is not a shadowed module decl, it is an illegal redefinition.
      Diags.Report(ModuleNameLoc, diag::err_mmap_module_redefinition)
          << ModuleName;
      Diags.Report(Existing->DefinitionLoc, diag::note_mmap_prev_definition);

      // Skip the module definition.
      skipUntil(MMToken::RBrace);
      if (Tok.is(MMToken::RBrace))
        consumeToken();

      HadError = true;
      return;
    }
  }

  // Start defining this module.
  if (ShadowingModule) {
    ActiveModule =
        Map.createShadowedModule(ModuleName, Framework, ShadowingModule);
  } else {
    ActiveModule =
        Map.findOrCreateModule(ModuleName, ActiveModule, Framework, Explicit)
            .first;
  }

  ActiveModule->DefinitionLoc = ModuleNameLoc;
  if (Attrs.IsSystem || IsSystem)
    ActiveModule->IsSystem = true;
  if (Attrs.IsExternC)
    ActiveModule->IsExternC = true;
  // The top-level Darwin module is implicitly [no_undeclared_includes] for
  // backwards compatibility with the Darwin SDK's module maps.
  if (Attrs.NoUndeclaredIncludes ||
      (!ActiveModule->Parent && ModuleName == "Darwin"))
    ActiveModule->NoUndeclaredIncludes = true;
  ActiveModule->Directory = Directory;

  StringRef MapFileName(ModuleMapFile->getName());
  if (MapFileName.endswith("module.private.modulemap") ||
      MapFileName.endswith("module_private.map")) {
    ActiveModule->ModuleMapIsPrivate = true;
  }

  // Private modules named as FooPrivate, Foo.Private or similar are likely a
  // user error; provide warnings, notes and fixits to direct users to use
  // Foo_Private instead.
  SourceLocation StartLoc =
      SourceMgr.getLocForStartOfFile(SourceMgr.getMainFileID());
  if (Map.HeaderInfo.getHeaderSearchOpts().ImplicitModuleMaps &&
      !Diags.isIgnored(diag::warn_mmap_mismatched_private_submodule,
                       StartLoc) &&
      !Diags.isIgnored(diag::warn_mmap_mismatched_private_module_name,
                       StartLoc) &&
      ActiveModule->ModuleMapIsPrivate)
    diagnosePrivateModules(ExplicitLoc, FrameworkLoc);

  // Parse the body: dispatch on each member keyword until '}' or EOF.
  bool Done = false;
  do {
    switch (Tok.Kind) {
    case MMToken::EndOfFile:
    case MMToken::RBrace:
      Done = true;
      break;

    case MMToken::ConfigMacros:
      parseConfigMacros();
      break;

    case MMToken::Conflict:
      parseConflict();
      break;

    case MMToken::ExplicitKeyword:
    case MMToken::ExternKeyword:
    case MMToken::FrameworkKeyword:
    case MMToken::ModuleKeyword:
      parseModuleDecl();
      break;

    case MMToken::ExportKeyword:
      parseExportDecl();
      break;

    case MMToken::ExportAsKeyword:
      parseExportAsDecl();
      break;

    case MMToken::UseKeyword:
      parseUseDecl();
      break;

    case MMToken::RequiresKeyword:
      parseRequiresDecl();
      break;

    case MMToken::TextualKeyword:
      parseHeaderDecl(MMToken::TextualKeyword, consumeToken());
      break;

    case MMToken::UmbrellaKeyword: {
      SourceLocation UmbrellaLoc = consumeToken();
      if (Tok.is(MMToken::HeaderKeyword))
        parseHeaderDecl(MMToken::UmbrellaKeyword, UmbrellaLoc);
      else
        parseUmbrellaDirDecl(UmbrellaLoc);
      break;
    }

    case MMToken::ExcludeKeyword:
      parseHeaderDecl(MMToken::ExcludeKeyword, consumeToken());
      break;

    case MMToken::PrivateKeyword:
      parseHeaderDecl(MMToken::PrivateKeyword, consumeToken());
      break;

    case MMToken::HeaderKeyword:
      parseHeaderDecl(MMToken::HeaderKeyword, consumeToken());
      break;

    case MMToken::LinkKeyword:
      parseLinkDecl();
      break;

    default:
      Diags.Report(Tok.getLocation(), diag::err_mmap_expected_member);
      consumeToken();
      break;
    }
  } while (!Done);

  if (Tok.is(MMToken::RBrace))
    consumeToken();
  else {
    Diags.Report(Tok.getLocation(), diag::err_mmap_expected_rbrace);
    Diags.Report(LBraceLoc, diag::note_mmap_lbrace_match);
    HadError = true;
  }

  // If the active module is a top-level framework, and there are no link
  // libraries, automatically link against the framework.
  if (ActiveModule->IsFramework && !ActiveModule->isSubFramework() &&
      ActiveModule->LinkLibraries.empty()) {
    inferFrameworkLink(ActiveModule, Directory, SourceMgr.getFileManager());
  }

  // If the module meets all requirements but is still unavailable, mark the
  // whole tree as unavailable to prevent it from building.
  if (!ActiveModule->IsAvailable && !ActiveModule->IsMissingRequirement &&
      ActiveModule->Parent) {
    ActiveModule->getTopLevelModule()->markUnavailable();
    ActiveModule->getTopLevelModule()->MissingHeaders.append(
        ActiveModule->MissingHeaders.begin(), ActiveModule->MissingHeaders.end());
  }

  // We're done parsing this module. Pop back to the previous module.
  ActiveModule = PreviousActiveModule;
}

/// Parse an extern module declaration.
+/// +/// extern module-declaration: +/// 'extern' 'module' module-id string-literal +void ModuleMapParser::parseExternModuleDecl() { + assert(Tok.is(MMToken::ExternKeyword)); + SourceLocation ExternLoc = consumeToken(); // 'extern' keyword + + // Parse 'module' keyword. + if (!Tok.is(MMToken::ModuleKeyword)) { + Diags.Report(Tok.getLocation(), diag::err_mmap_expected_module); + consumeToken(); + HadError = true; + return; + } + consumeToken(); // 'module' keyword + + // Parse the module name. + ModuleId Id; + if (parseModuleId(Id)) { + HadError = true; + return; + } + + // Parse the referenced module map file name. + if (!Tok.is(MMToken::StringLiteral)) { + Diags.Report(Tok.getLocation(), diag::err_mmap_expected_mmap_file); + HadError = true; + return; + } + std::string FileName = Tok.getString(); + consumeToken(); // filename + + StringRef FileNameRef = FileName; + SmallString<128> ModuleMapFileName; + if (llvm::sys::path::is_relative(FileNameRef)) { + ModuleMapFileName += Directory->getName(); + llvm::sys::path::append(ModuleMapFileName, FileName); + FileNameRef = ModuleMapFileName; + } + if (auto File = SourceMgr.getFileManager().getFile(FileNameRef)) + Map.parseModuleMapFile( + *File, /*IsSystem=*/false, + Map.HeaderInfo.getHeaderSearchOpts().ModuleMapFileHomeIsCwd + ? Directory + : (*File)->getDir(), + FileID(), nullptr, ExternLoc); +} + +/// Whether to add the requirement \p Feature to the module \p M. +/// +/// This preserves backwards compatibility for two hacks in the Darwin system +/// module map files: +/// +/// 1. The use of 'requires excluded' to make headers non-modular, which +/// should really be mapped to 'textual' now that we have this feature. We +/// drop the 'excluded' requirement, and set \p IsRequiresExcludedHack to +/// true. Later, this bit will be used to map all the headers inside this +/// module to 'textual'. +/// +/// This affects Darwin.C.excluded (for assert.h) and Tcl.Private. +/// +/// 2. 
Removes a bogus cplusplus requirement from IOKit.avc. This requirement +/// was never correct and causes issues now that we check it, so drop it. +static bool shouldAddRequirement(Module *M, StringRef Feature, + bool &IsRequiresExcludedHack) { + if (Feature == "excluded" && + (M->fullModuleNameIs({"Darwin", "C", "excluded"}) || + M->fullModuleNameIs({"Tcl", "Private"}))) { + IsRequiresExcludedHack = true; + return false; + } else if (Feature == "cplusplus" && M->fullModuleNameIs({"IOKit", "avc"})) { + return false; + } + + return true; +} + +/// Parse a requires declaration. +/// +/// requires-declaration: +/// 'requires' feature-list +/// +/// feature-list: +/// feature ',' feature-list +/// feature +/// +/// feature: +/// '!'[opt] identifier +void ModuleMapParser::parseRequiresDecl() { + assert(Tok.is(MMToken::RequiresKeyword)); + + // Parse 'requires' keyword. + consumeToken(); + + // Parse the feature-list. + do { + bool RequiredState = true; + if (Tok.is(MMToken::Exclaim)) { + RequiredState = false; + consumeToken(); + } + + if (!Tok.is(MMToken::Identifier)) { + Diags.Report(Tok.getLocation(), diag::err_mmap_expected_feature); + HadError = true; + return; + } + + // Consume the feature name. + std::string Feature = Tok.getString(); + consumeToken(); + + bool IsRequiresExcludedHack = false; + bool ShouldAddRequirement = + shouldAddRequirement(ActiveModule, Feature, IsRequiresExcludedHack); + + if (IsRequiresExcludedHack) + UsesRequiresExcludedHack.insert(ActiveModule); + + if (ShouldAddRequirement) { + // Add this feature. + ActiveModule->addRequirement(Feature, RequiredState, Map.LangOpts, + *Map.Target); + } + + if (!Tok.is(MMToken::Comma)) + break; + + // Consume the comma. + consumeToken(); + } while (true); +} + +/// Parse a header declaration. 
///
/// header-declaration:
///   'textual'[opt] 'header' string-literal
///   'private' 'textual'[opt] 'header' string-literal
///   'exclude' 'header' string-literal
///   'umbrella' 'header' string-literal
///
/// FIXME: Support 'private textual header'.
void ModuleMapParser::parseHeaderDecl(MMToken::TokenKind LeadingToken,
                                      SourceLocation LeadingLoc) {
  // We've already consumed the first token.
  ModuleMap::ModuleHeaderRole Role = ModuleMap::NormalHeader;
  if (LeadingToken == MMToken::PrivateKeyword) {
    Role = ModuleMap::PrivateHeader;
    // 'private' may optionally be followed by 'textual'.
    if (Tok.is(MMToken::TextualKeyword)) {
      LeadingToken = Tok.Kind;
      consumeToken();
    }
  }

  if (LeadingToken == MMToken::TextualKeyword)
    Role = ModuleMap::ModuleHeaderRole(Role | ModuleMap::TextualHeader);

  if (UsesRequiresExcludedHack.count(ActiveModule)) {
    // Mark this header 'textual' (see doc comment for
    // Module::UsesRequiresExcludedHack).
    Role = ModuleMap::ModuleHeaderRole(Role | ModuleMap::TextualHeader);
  }

  // Unless the leading token already was 'header', the 'header' keyword must
  // follow the modifier we consumed.
  if (LeadingToken != MMToken::HeaderKeyword) {
    if (!Tok.is(MMToken::HeaderKeyword)) {
      Diags.Report(Tok.getLocation(), diag::err_mmap_expected_header)
          << (LeadingToken == MMToken::PrivateKeyword ? "private" :
              LeadingToken == MMToken::ExcludeKeyword ? "exclude" :
              LeadingToken == MMToken::TextualKeyword ? "textual" : "umbrella");
      return;
    }
    consumeToken();
  }

  // Parse the header name.
  if (!Tok.is(MMToken::StringLiteral)) {
    Diags.Report(Tok.getLocation(), diag::err_mmap_expected_header)
        << "header";
    HadError = true;
    return;
  }
  Module::UnresolvedHeaderDirective Header;
  Header.FileName = Tok.getString();
  Header.FileNameLoc = consumeToken();
  Header.IsUmbrella = LeadingToken == MMToken::UmbrellaKeyword;
  Header.Kind =
      (LeadingToken == MMToken::ExcludeKeyword ? Module::HK_Excluded
                                               : Map.headerRoleToKind(Role));

  // Check whether we already have an umbrella.
  if (Header.IsUmbrella && ActiveModule->Umbrella) {
    Diags.Report(Header.FileNameLoc, diag::err_mmap_umbrella_clash)
        << ActiveModule->getFullModuleName();
    HadError = true;
    return;
  }

  // If we were given stat information, parse it so we can skip looking for
  // the file.
  if (Tok.is(MMToken::LBrace)) {
    SourceLocation LBraceLoc = consumeToken();

    // Parse '{ size N mtime N }' attribute pairs.
    while (!Tok.is(MMToken::RBrace) && !Tok.is(MMToken::EndOfFile)) {
      enum Attribute { Size, ModTime, Unknown };
      StringRef Str = Tok.getString();
      SourceLocation Loc = consumeToken();
      switch (llvm::StringSwitch<Attribute>(Str)
                  .Case("size", Size)
                  .Case("mtime", ModTime)
                  .Default(Unknown)) {
      case Size:
        if (Header.Size)
          Diags.Report(Loc, diag::err_mmap_duplicate_header_attribute) << Str;
        if (!Tok.is(MMToken::IntegerLiteral)) {
          Diags.Report(Tok.getLocation(),
                       diag::err_mmap_invalid_header_attribute_value) << Str;
          skipUntil(MMToken::RBrace);
          break;
        }
        Header.Size = Tok.getInteger();
        consumeToken();
        break;

      case ModTime:
        if (Header.ModTime)
          Diags.Report(Loc, diag::err_mmap_duplicate_header_attribute) << Str;
        if (!Tok.is(MMToken::IntegerLiteral)) {
          Diags.Report(Tok.getLocation(),
                       diag::err_mmap_invalid_header_attribute_value) << Str;
          skipUntil(MMToken::RBrace);
          break;
        }
        Header.ModTime = Tok.getInteger();
        consumeToken();
        break;

      case Unknown:
        Diags.Report(Loc, diag::err_mmap_expected_header_attribute);
        skipUntil(MMToken::RBrace);
        break;
      }
    }

    if (Tok.is(MMToken::RBrace))
      consumeToken();
    else {
      Diags.Report(Tok.getLocation(), diag::err_mmap_expected_rbrace);
      Diags.Report(LBraceLoc, diag::note_mmap_lbrace_match);
      HadError = true;
    }
  }

  bool NeedsFramework = false;
  Map.addUnresolvedHeader(ActiveModule, std::move(Header), NeedsFramework);

  // Suggest adding the 'framework' keyword if the header was only found via
  // framework-style lookup.
  if (NeedsFramework && ActiveModule)
    Diags.Report(CurrModuleDeclLoc, diag::note_mmap_add_framework_keyword)
        << ActiveModule->getFullModuleName()
        << FixItHint::CreateReplacement(CurrModuleDeclLoc, "framework module");
}

/// Ordering comparator for umbrella-directory headers; sorts by the path as
/// written so the result is deterministic.
static int compareModuleHeaders(const Module::Header *A,
                                const Module::Header *B) {
  return A->NameAsWritten.compare(B->NameAsWritten);
}

/// Parse an umbrella directory declaration.
///
/// umbrella-dir-declaration:
///   umbrella string-literal
void ModuleMapParser::parseUmbrellaDirDecl(SourceLocation UmbrellaLoc) {
  // Parse the directory name.
  if (!Tok.is(MMToken::StringLiteral)) {
    Diags.Report(Tok.getLocation(), diag::err_mmap_expected_header)
        << "umbrella";
    HadError = true;
    return;
  }

  std::string DirName = Tok.getString();
  SourceLocation DirNameLoc = consumeToken();

  // Check whether we already have an umbrella.
  if (ActiveModule->Umbrella) {
    Diags.Report(DirNameLoc, diag::err_mmap_umbrella_clash)
        << ActiveModule->getFullModuleName();
    HadError = true;
    return;
  }

  // Look for this file.
  const DirectoryEntry *Dir = nullptr;
  if (llvm::sys::path::is_absolute(DirName)) {
    if (auto D = SourceMgr.getFileManager().getDirectory(DirName))
      Dir = *D;
  } else {
    // Relative paths are resolved against this module map's directory.
    SmallString<128> PathName;
    PathName = Directory->getName();
    llvm::sys::path::append(PathName, DirName);
    if (auto D = SourceMgr.getFileManager().getDirectory(PathName))
      Dir = *D;
  }

  if (!Dir) {
    Diags.Report(DirNameLoc, diag::warn_mmap_umbrella_dir_not_found)
        << DirName;
    return;
  }

  if (UsesRequiresExcludedHack.count(ActiveModule)) {
    // Mark this header 'textual' (see doc comment for
    // ModuleMapParser::UsesRequiresExcludedHack). Although iterating over the
    // directory is relatively expensive, in practice this only applies to the
    // uncommonly used Tcl module on Darwin platforms.
    std::error_code EC;
    SmallVector<Module::Header, 6> Headers;
    llvm::vfs::FileSystem &FS =
        SourceMgr.getFileManager().getVirtualFileSystem();
    for (llvm::vfs::recursive_directory_iterator I(FS, Dir->getName(), EC), E;
         I != E && !EC; I.increment(EC)) {
      if (auto FE = SourceMgr.getFileManager().getFile(I->path())) {

        Module::Header Header = {I->path(), *FE};
        Headers.push_back(std::move(Header));
      }
    }

    // Sort header paths so that the pcm doesn't depend on iteration order.
    llvm::array_pod_sort(Headers.begin(), Headers.end(), compareModuleHeaders);

    for (auto &Header : Headers)
      Map.addHeader(ActiveModule, std::move(Header), ModuleMap::TextualHeader);
    return;
  }

  // An umbrella directory may only belong to one module.
  if (Module *OwningModule = Map.UmbrellaDirs[Dir]) {
    Diags.Report(UmbrellaLoc, diag::err_mmap_umbrella_clash)
        << OwningModule->getFullModuleName();
    HadError = true;
    return;
  }

  // Record this umbrella directory.
  Map.setUmbrellaDir(ActiveModule, Dir, DirName);
}

/// Parse a module export declaration.
///
/// export-declaration:
///   'export' wildcard-module-id
///
/// wildcard-module-id:
///   identifier
///   '*'
///   identifier '.' wildcard-module-id
void ModuleMapParser::parseExportDecl() {
  assert(Tok.is(MMToken::ExportKeyword));
  SourceLocation ExportLoc = consumeToken();

  // Parse the module-id with an optional wildcard at the end.
  ModuleId ParsedModuleId;
  bool Wildcard = false;
  do {
    // FIXME: Support string-literal module names here.
    if (Tok.is(MMToken::Identifier)) {
      ParsedModuleId.push_back(std::make_pair(Tok.getString(),
                                              Tok.getLocation()));
      consumeToken();

      if (Tok.is(MMToken::Period)) {
        consumeToken();
        continue;
      }

      break;
    }

    // A '*' ends the module-id; nothing may follow it.
    if(Tok.is(MMToken::Star)) {
      Wildcard = true;
      consumeToken();
      break;
    }

    Diags.Report(Tok.getLocation(), diag::err_mmap_module_id);
    HadError = true;
    return;
  } while (true);

  // Exports are resolved later, once all modules are known.
  Module::UnresolvedExportDecl Unresolved = {
    ExportLoc, ParsedModuleId, Wildcard
  };
  ActiveModule->UnresolvedExports.push_back(Unresolved);
}

/// Parse a module export_as declaration.
///
/// export-as-declaration:
///   'export_as' identifier
void ModuleMapParser::parseExportAsDecl() {
  assert(Tok.is(MMToken::ExportAsKeyword));
  consumeToken();

  if (!Tok.is(MMToken::Identifier)) {
    Diags.Report(Tok.getLocation(), diag::err_mmap_module_id);
    HadError = true;
    return;
  }

  // 'export_as' is only meaningful on top-level modules.
  if (ActiveModule->Parent) {
    Diags.Report(Tok.getLocation(), diag::err_mmap_submodule_export_as);
    consumeToken();
    return;
  }

  if (!ActiveModule->ExportAsModule.empty()) {
    if (ActiveModule->ExportAsModule == Tok.getString()) {
      Diags.Report(Tok.getLocation(), diag::warn_mmap_redundant_export_as)
          << ActiveModule->Name << Tok.getString();
    } else {
      Diags.Report(Tok.getLocation(), diag::err_mmap_conflicting_export_as)
          << ActiveModule->Name << ActiveModule->ExportAsModule
          << Tok.getString();
    }
  }

  ActiveModule->ExportAsModule = Tok.getString();
  Map.addLinkAsDependency(ActiveModule);

  consumeToken();
}

/// Parse a module use declaration.
///
/// use-declaration:
///   'use' wildcard-module-id
void ModuleMapParser::parseUseDecl() {
  assert(Tok.is(MMToken::UseKeyword));
  auto KWLoc = consumeToken();
  // Parse the module-id.
  ModuleId ParsedModuleId;
  parseModuleId(ParsedModuleId);

  // 'use' is only valid on top-level modules.
  if (ActiveModule->Parent)
    Diags.Report(KWLoc, diag::err_mmap_use_decl_submodule);
  else
    ActiveModule->UnresolvedDirectUses.push_back(ParsedModuleId);
}

/// Parse a link declaration.
///
/// module-declaration:
///   'link' 'framework'[opt] string-literal
void ModuleMapParser::parseLinkDecl() {
  assert(Tok.is(MMToken::LinkKeyword));
  SourceLocation LinkLoc = consumeToken();

  // Parse the optional 'framework' keyword.
  bool IsFramework = false;
  if (Tok.is(MMToken::FrameworkKeyword)) {
    consumeToken();
    IsFramework = true;
  }

  // Parse the library name
  if (!Tok.is(MMToken::StringLiteral)) {
    Diags.Report(Tok.getLocation(), diag::err_mmap_expected_library_name)
        << IsFramework << SourceRange(LinkLoc);
    HadError = true;
    return;
  }

  std::string LibraryName = Tok.getString();
  consumeToken();
  ActiveModule->LinkLibraries.push_back(Module::LinkLibrary(LibraryName,
                                                            IsFramework));
}

/// Parse a configuration macro declaration.
///
/// module-declaration:
///   'config_macros' attributes[opt] config-macro-list?
///
/// config-macro-list:
///   identifier (',' identifier)?
void ModuleMapParser::parseConfigMacros() {
  assert(Tok.is(MMToken::ConfigMacros));
  SourceLocation ConfigMacrosLoc = consumeToken();

  // Only top-level modules can have configuration macros.
  if (ActiveModule->Parent) {
    Diags.Report(ConfigMacrosLoc, diag::err_mmap_config_macro_submodule);
  }

  // Parse the optional attributes.
  Attributes Attrs;
  if (parseOptionalAttributes(Attrs))
    return;

  if (Attrs.IsExhaustive && !ActiveModule->Parent) {
    ActiveModule->ConfigMacrosExhaustive = true;
  }

  // If we don't have an identifier, we're done.
  // FIXME: Support macros with the same name as a keyword here.
  if (!Tok.is(MMToken::Identifier))
    return;

  // Consume the first identifier.
  // Macros are recorded only on the top-level module; for a submodule the
  // list is still parsed (to recover) but discarded.
  if (!ActiveModule->Parent) {
    ActiveModule->ConfigMacros.push_back(Tok.getString().str());
  }
  consumeToken();

  do {
    // If there's a comma, consume it.
    if (!Tok.is(MMToken::Comma))
      break;
    consumeToken();

    // We expect to see a macro name here.
    // FIXME: Support macros with the same name as a keyword here.
    if (!Tok.is(MMToken::Identifier)) {
      Diags.Report(Tok.getLocation(), diag::err_mmap_expected_config_macro);
      break;
    }

    // Consume the macro name.
    if (!ActiveModule->Parent) {
      ActiveModule->ConfigMacros.push_back(Tok.getString().str());
    }
    consumeToken();
  } while (true);
}

/// Format a module-id into a string.
static std::string formatModuleId(const ModuleId &Id) {
  std::string result;
  {
    llvm::raw_string_ostream OS(result);

    // Join the components with '.' separators.
    for (unsigned I = 0, N = Id.size(); I != N; ++I) {
      if (I)
        OS << ".";
      OS << Id[I].first;
    }
  }

  return result;
}

/// Parse a conflict declaration.
///
/// module-declaration:
///   'conflict' module-id ',' string-literal
void ModuleMapParser::parseConflict() {
  assert(Tok.is(MMToken::Conflict));
  SourceLocation ConflictLoc = consumeToken();
  Module::UnresolvedConflict Conflict;

  // Parse the module-id.
  if (parseModuleId(Conflict.Id))
    return;

  // Parse the ','.
  if (!Tok.is(MMToken::Comma)) {
    Diags.Report(Tok.getLocation(), diag::err_mmap_expected_conflicts_comma)
        << SourceRange(ConflictLoc);
    return;
  }
  consumeToken();

  // Parse the message.
  if (!Tok.is(MMToken::StringLiteral)) {
    Diags.Report(Tok.getLocation(), diag::err_mmap_expected_conflicts_message)
        << formatModuleId(Conflict.Id);
    return;
  }
  Conflict.Message = Tok.getString().str();
  consumeToken();

  // Add this unresolved conflict.
  ActiveModule->UnresolvedConflicts.push_back(Conflict);
}

/// Parse an inferred module declaration (wildcard modules).
/// Parse an inferred module declaration (wildcard modules).
///
/// module-declaration:
///   'explicit'[opt] 'framework'[opt] 'module' * attributes[opt]
///     { inferred-module-member* }
///
/// inferred-module-member:
///   'export' '*'
///   'exclude' identifier
///
/// \param Framework Whether the 'framework' keyword preceded 'module'.
/// \param Explicit Whether the 'explicit' keyword preceded 'module'.
void ModuleMapParser::parseInferredModuleDecl(bool Framework, bool Explicit) {
  assert(Tok.is(MMToken::Star));
  SourceLocation StarLoc = consumeToken();
  bool Failed = false;

  // Inferred modules must be submodules.
  if (!ActiveModule && !Framework) {
    Diags.Report(StarLoc, diag::err_mmap_top_level_inferred_submodule);
    Failed = true;
  }

  if (ActiveModule) {
    // Inferred modules must have umbrella directories.
    if (!Failed && ActiveModule->IsAvailable &&
        !ActiveModule->getUmbrellaDir()) {
      Diags.Report(StarLoc, diag::err_mmap_inferred_no_umbrella);
      Failed = true;
    }

    // Check for redefinition of an inferred module.
    if (!Failed && ActiveModule->InferSubmodules) {
      Diags.Report(StarLoc, diag::err_mmap_inferred_redef);
      if (ActiveModule->InferredSubmoduleLoc.isValid())
        Diags.Report(ActiveModule->InferredSubmoduleLoc,
                     diag::note_mmap_prev_definition);
      Failed = true;
    }

    // Check for the 'framework' keyword, which is not permitted here.
    // Recover by dropping the keyword rather than failing outright.
    if (Framework) {
      Diags.Report(StarLoc, diag::err_mmap_inferred_framework_submodule);
      Framework = false;
    }
  } else if (Explicit) {
    // 'explicit' only makes sense on submodules; drop it for recovery.
    Diags.Report(StarLoc, diag::err_mmap_explicit_inferred_framework);
    Explicit = false;
  }

  // If there were any problems with this inferred submodule, skip its body.
  if (Failed) {
    if (Tok.is(MMToken::LBrace)) {
      consumeToken();
      skipUntil(MMToken::RBrace);
      if (Tok.is(MMToken::RBrace))
        consumeToken();
    }
    HadError = true;
    return;
  }

  // Parse optional attributes.
  Attributes Attrs;
  if (parseOptionalAttributes(Attrs))
    return;

  if (ActiveModule) {
    // Note that we have an inferred submodule.
    ActiveModule->InferSubmodules = true;
    ActiveModule->InferredSubmoduleLoc = StarLoc;
    ActiveModule->InferExplicitSubmodules = Explicit;
  } else {
    // We'll be inferring framework modules for this directory.
    Map.InferredDirectories[Directory].InferModules = true;
    Map.InferredDirectories[Directory].Attrs = Attrs;
    Map.InferredDirectories[Directory].ModuleMapFile = ModuleMapFile;
    // FIXME: Handle the 'framework' keyword.
  }

  // Parse the opening brace.
  if (!Tok.is(MMToken::LBrace)) {
    Diags.Report(Tok.getLocation(), diag::err_mmap_expected_lbrace_wildcard);
    HadError = true;
    return;
  }
  SourceLocation LBraceLoc = consumeToken();

  // Parse the body of the inferred submodule. 'exclude' is only valid for
  // inferred framework directories (no ActiveModule); 'export' only for
  // inferred submodules (ActiveModule set).
  bool Done = false;
  do {
    switch (Tok.Kind) {
    case MMToken::EndOfFile:
    case MMToken::RBrace:
      Done = true;
      break;

    case MMToken::ExcludeKeyword:
      if (ActiveModule) {
        // The diag argument selects the submodule vs. directory wording.
        Diags.Report(Tok.getLocation(), diag::err_mmap_expected_inferred_member)
          << (ActiveModule != nullptr);
        consumeToken();
        break;
      }

      consumeToken();
      // FIXME: Support string-literal module names here.
      if (!Tok.is(MMToken::Identifier)) {
        Diags.Report(Tok.getLocation(), diag::err_mmap_missing_exclude_name);
        break;
      }

      Map.InferredDirectories[Directory].ExcludedModules
        .push_back(Tok.getString());
      consumeToken();
      break;

    case MMToken::ExportKeyword:
      if (!ActiveModule) {
        Diags.Report(Tok.getLocation(), diag::err_mmap_expected_inferred_member)
          << (ActiveModule != nullptr);
        consumeToken();
        break;
      }

      consumeToken();
      if (Tok.is(MMToken::Star))
        ActiveModule->InferExportWildcard = true;
      else
        Diags.Report(Tok.getLocation(),
                     diag::err_mmap_expected_export_wildcard);
      consumeToken();
      break;

    case MMToken::ExplicitKeyword:
    case MMToken::ModuleKeyword:
    case MMToken::HeaderKeyword:
    case MMToken::PrivateKeyword:
    case MMToken::UmbrellaKeyword:
    default:
      Diags.Report(Tok.getLocation(), diag::err_mmap_expected_inferred_member)
        << (ActiveModule != nullptr);
      consumeToken();
      break;
    }
  } while (!Done);

  if (Tok.is(MMToken::RBrace))
    consumeToken();
  else {
    Diags.Report(Tok.getLocation(), diag::err_mmap_expected_rbrace);
    Diags.Report(LBraceLoc, diag::note_mmap_lbrace_match);
    HadError = true;
  }
}
/// Parse optional attributes.
///
/// attributes:
///   attribute attributes
///   attribute
///
/// attribute:
///   [ identifier ]
///
/// \param Attrs Will be filled in with the parsed attributes.
///
/// \returns true if an error occurred, false otherwise.
bool ModuleMapParser::parseOptionalAttributes(Attributes &Attrs) {
  bool HadError = false;

  while (Tok.is(MMToken::LSquare)) {
    // Consume the '['.
    SourceLocation LSquareLoc = consumeToken();

    // Check whether we have an attribute name here.
    if (!Tok.is(MMToken::Identifier)) {
      Diags.Report(Tok.getLocation(), diag::err_mmap_expected_attribute);
      skipUntil(MMToken::RSquare);
      if (Tok.is(MMToken::RSquare))
        consumeToken();
      HadError = true;
    }
    // NOTE(review): on the error path above we do not restart the loop, so
    // the StringSwitch below decodes whatever token follows the skipped
    // ']' — confirm this recovery behavior is intended.

    // Decode the attribute name.
    AttributeKind Attribute
      = llvm::StringSwitch<AttributeKind>(Tok.getString())
          .Case("exhaustive", AT_exhaustive)
          .Case("extern_c", AT_extern_c)
          .Case("no_undeclared_includes", AT_no_undeclared_includes)
          .Case("system", AT_system)
          .Default(AT_unknown);
    switch (Attribute) {
    case AT_unknown:
      // Unknown attributes are warned about but otherwise ignored.
      Diags.Report(Tok.getLocation(), diag::warn_mmap_unknown_attribute)
        << Tok.getString();
      break;

    case AT_system:
      Attrs.IsSystem = true;
      break;

    case AT_extern_c:
      Attrs.IsExternC = true;
      break;

    case AT_exhaustive:
      Attrs.IsExhaustive = true;
      break;

    case AT_no_undeclared_includes:
      Attrs.NoUndeclaredIncludes = true;
      break;
    }
    consumeToken();

    // Consume the ']'.
    if (!Tok.is(MMToken::RSquare)) {
      Diags.Report(Tok.getLocation(), diag::err_mmap_expected_rsquare);
      Diags.Report(LSquareLoc, diag::note_mmap_lsquare_match);
      skipUntil(MMToken::RSquare);
      HadError = true;
    }

    if (Tok.is(MMToken::RSquare))
      consumeToken();
  }

  return HadError;
}
/// Parse a module map file.
///
/// module-map-file:
///   module-declaration*
///
/// Loops until end-of-file; any token that cannot begin a module declaration
/// is diagnosed and consumed so parsing can continue.
///
/// \returns true if any error occurred while parsing.
bool ModuleMapParser::parseModuleMapFile() {
  do {
    switch (Tok.Kind) {
    case MMToken::EndOfFile:
      return HadError;

    // These tokens can legitimately start a module declaration.
    case MMToken::ExplicitKeyword:
    case MMToken::ExternKeyword:
    case MMToken::ModuleKeyword:
    case MMToken::FrameworkKeyword:
      parseModuleDecl();
      break;

    // Everything else is an error at the top level.
    case MMToken::Comma:
    case MMToken::ConfigMacros:
    case MMToken::Conflict:
    case MMToken::Exclaim:
    case MMToken::ExcludeKeyword:
    case MMToken::ExportKeyword:
    case MMToken::ExportAsKeyword:
    case MMToken::HeaderKeyword:
    case MMToken::Identifier:
    case MMToken::LBrace:
    case MMToken::LinkKeyword:
    case MMToken::LSquare:
    case MMToken::Period:
    case MMToken::PrivateKeyword:
    case MMToken::RBrace:
    case MMToken::RSquare:
    case MMToken::RequiresKeyword:
    case MMToken::Star:
    case MMToken::StringLiteral:
    case MMToken::IntegerLiteral:
    case MMToken::TextualKeyword:
    case MMToken::UmbrellaKeyword:
    case MMToken::UseKeyword:
      Diags.Report(Tok.getLocation(), diag::err_mmap_expected_module);
      HadError = true;
      consumeToken();
      break;
    }
  } while (true);
}
SrcMgr::C_System_ModuleMap : SrcMgr::C_User_ModuleMap; + ID = SourceMgr.createFileID(File, ExternModuleLoc, FileCharacter); + } + + assert(Target && "Missing target information"); + const llvm::MemoryBuffer *Buffer = SourceMgr.getBuffer(ID); + if (!Buffer) + return ParsedModuleMap[File] = true; + assert((!Offset || *Offset <= Buffer->getBufferSize()) && + "invalid buffer offset"); + + // Parse this module map file. + Lexer L(SourceMgr.getLocForStartOfFile(ID), MMapLangOpts, + Buffer->getBufferStart(), + Buffer->getBufferStart() + (Offset ? *Offset : 0), + Buffer->getBufferEnd()); + SourceLocation Start = L.getSourceLocation(); + ModuleMapParser Parser(L, SourceMgr, Target, Diags, *this, File, Dir, + IsSystem); + bool Result = Parser.parseModuleMapFile(); + ParsedModuleMap[File] = Result; + + if (Offset) { + auto Loc = SourceMgr.getDecomposedLoc(Parser.getLocation()); + assert(Loc.first == ID && "stopped in a different file?"); + *Offset = Loc.second; + } + + // Notify callbacks that we parsed it. + for (const auto &Cb : Callbacks) + Cb->moduleMapFileRead(Start, *File, IsSystem); + + return Result; +} diff --git a/clang/lib/Lex/PPCaching.cpp b/clang/lib/Lex/PPCaching.cpp new file mode 100644 index 000000000000..31548d246d5a --- /dev/null +++ b/clang/lib/Lex/PPCaching.cpp @@ -0,0 +1,163 @@ +//===--- PPCaching.cpp - Handle caching lexed tokens ----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements pieces of the Preprocessor interface that manage the +// caching of lexed tokens. 
// EnableBacktrackAtThisPos - From the point that this method is called, and
// until CommitBacktrackedTokens() or Backtrack() is called, the Preprocessor
// keeps track of the lexed tokens so that a subsequent Backtrack() call will
// make the Preprocessor re-lex the same tokens.
//
// Nested backtracks are allowed, meaning that EnableBacktrackAtThisPos can
// be called multiple times and CommitBacktrackedTokens/Backtrack calls will
// be combined with the EnableBacktrackAtThisPos calls in reverse order.
void Preprocessor::EnableBacktrackAtThisPos() {
  assert(LexLevel == 0 && "cannot use lookahead while lexing");
  BacktrackPositions.push_back(CachedLexPos);
  EnterCachingLexMode();
}

// Disable the last EnableBacktrackAtThisPos call: the tokens lexed since then
// are kept as consumed and will not be re-lexed.
void Preprocessor::CommitBacktrackedTokens() {
  assert(!BacktrackPositions.empty()
         && "EnableBacktrackAtThisPos was not called!");
  BacktrackPositions.pop_back();
}

// Make Preprocessor re-lex the tokens that were lexed since
// EnableBacktrackAtThisPos() was previously called, by rewinding the cached
// token cursor to the recorded position.
void Preprocessor::Backtrack() {
  assert(!BacktrackPositions.empty()
         && "EnableBacktrackAtThisPos was not called!");
  CachedLexPos = BacktrackPositions.back();
  BacktrackPositions.pop_back();
  recomputeCurLexerKind();
}

// Produce the next token while in caching mode: replay from the cache if a
// cached token is available, otherwise lex a fresh token and (if backtracking
// is active) append it to the cache.
void Preprocessor::CachingLex(Token &Result) {
  if (!InCachingLexMode())
    return;

  // The assert in EnterCachingLexMode should prevent this from happening.
  assert(LexLevel == 1 &&
         "should not use token caching within the preprocessor");

  if (CachedLexPos < CachedTokens.size()) {
    // Replay a previously-lexed token, marking it as re-injected.
    Result = CachedTokens[CachedLexPos++];
    Result.setFlag(Token::IsReinjected);
    return;
  }

  // Cache exhausted: leave caching mode to lex a fresh token.
  ExitCachingLexMode();
  Lex(Result);

  if (isBacktrackEnabled()) {
    // Cache the lexed token so Backtrack() can replay it.
    EnterCachingLexModeUnchecked();
    CachedTokens.push_back(Result);
    ++CachedLexPos;
    return;
  }

  if (CachedLexPos < CachedTokens.size()) {
    EnterCachingLexModeUnchecked();
  } else {
    // All cached tokens were consumed.
    CachedTokens.clear();
    CachedLexPos = 0;
  }
}

void Preprocessor::EnterCachingLexMode() {
  // The caching layer sits on top of all the other lexers, so it's incorrect
  // to cache tokens while inside a nested lex action. The cached tokens would
  // be retained after returning to the enclosing lex action and, at best,
  // would appear at the wrong position in the token stream.
  assert(LexLevel == 0 &&
         "entered caching lex mode while lexing something else");

  if (InCachingLexMode()) {
    assert(CurLexerKind == CLK_CachingLexer && "Unexpected lexer kind");
    return;
  }

  EnterCachingLexModeUnchecked();
}

// Unconditionally switch to the caching lexer; callers must know we are not
// already in caching mode.
void Preprocessor::EnterCachingLexModeUnchecked() {
  assert(CurLexerKind != CLK_CachingLexer && "already in caching lex mode");
  PushIncludeMacroStack();
  CurLexerKind = CLK_CachingLexer;
}


// Lex enough additional tokens to look ahead N tokens past the current cached
// position, appending them to the cache, and return the last one.
const Token &Preprocessor::PeekAhead(unsigned N) {
  assert(CachedLexPos + N > CachedTokens.size() && "Confused caching.");
  ExitCachingLexMode();
  for (size_t C = CachedLexPos + N - CachedTokens.size(); C > 0; --C) {
    CachedTokens.push_back(Token());
    Lex(CachedTokens.back());
  }
  EnterCachingLexMode();
  return CachedTokens.back();
}
// Replace the run of cached tokens that make up the given annotation token
// with the single annotation token itself.
void Preprocessor::AnnotatePreviousCachedTokens(const Token &Tok) {
  assert(Tok.isAnnotation() && "Expected annotation token");
  assert(CachedLexPos != 0 && "Expected to have some cached tokens");
  assert(CachedTokens[CachedLexPos-1].getLastLoc() == Tok.getAnnotationEndLoc()
         && "The annotation should be until the most recent cached token");

  // Start from the end of the cached tokens list and look for the token
  // that is the beginning of the annotation token.
  for (CachedTokensTy::size_type i = CachedLexPos; i != 0; --i) {
    CachedTokensTy::iterator AnnotBegin = CachedTokens.begin() + i-1;
    if (AnnotBegin->getLocation() == Tok.getLocation()) {
      assert((BacktrackPositions.empty() || BacktrackPositions.back() <= i) &&
             "The backtrack pos points inside the annotated tokens!");
      // Replace the cached tokens with the single annotation token.
      if (i < CachedLexPos)
        CachedTokens.erase(AnnotBegin + 1, CachedTokens.begin() + CachedLexPos);
      *AnnotBegin = Tok;
      CachedLexPos = i;
      return;
    }
  }
}

// Return true if the most recently consumed cached token has the same kind
// and source position as \p Tok.
bool Preprocessor::IsPreviousCachedToken(const Token &Tok) const {
  // There's currently no cached token...
  if (!CachedLexPos)
    return false;

  const Token LastCachedTok = CachedTokens[CachedLexPos - 1];
  if (LastCachedTok.getKind() != Tok.getKind())
    return false;

  // Require the locations to be in the same SLoc address space with a zero
  // relative offset, i.e. exactly the same position.
  int RelOffset = 0;
  if ((!getSourceManager().isInSameSLocAddrSpace(
          Tok.getLocation(), getLastCachedTokenLocation(), &RelOffset)) ||
      RelOffset)
    return false;

  return true;
}

// Replace the most recently consumed cached token with the given token
// sequence, adjusting the cached-token cursor past the new tokens.
void Preprocessor::ReplacePreviousCachedToken(ArrayRef<Token> NewToks) {
  assert(CachedLexPos != 0 && "Expected to have some cached tokens");
  // Insert the new tokens before the old one, then erase the old token (which
  // now sits just past the inserted range).
  CachedTokens.insert(CachedTokens.begin() + CachedLexPos - 1, NewToks.begin(),
                      NewToks.end());
  CachedTokens.erase(CachedTokens.begin() + CachedLexPos - 1 + NewToks.size());
  CachedLexPos += NewToks.size() - 1;
}
// Out-of-line definition used to anchor PPChainedCallbacks' vtable to this
// translation unit (standard LLVM "anchor" idiom).
void PPChainedCallbacks::anchor() { }
PPConditionalDirectiveRecord::PPConditionalDirectiveRecord(SourceManager &SM)
    : SourceMgr(SM) {
  // The outermost "region" is the whole file, represented by an invalid
  // location at the bottom of the stack.
  CondDirectiveStack.push_back(SourceLocation());
}

/// Return true if \p Range crosses a conditional-directive boundary, i.e. its
/// endpoints fall in different conditional regions.
bool PPConditionalDirectiveRecord::rangeIntersectsConditionalDirective(
    SourceRange Range) const {
  if (Range.isInvalid())
    return false;

  // First directive at or after the start of the range.
  CondDirectiveLocsTy::const_iterator low = llvm::lower_bound(
      CondDirectiveLocs, Range.getBegin(), CondDirectiveLoc::Comp(SourceMgr));
  if (low == CondDirectiveLocs.end())
    return false;

  // No directive falls inside the range.
  if (SourceMgr.isBeforeInTranslationUnit(Range.getEnd(), low->getLoc()))
    return false;

  // First directive strictly after the end of the range.
  CondDirectiveLocsTy::const_iterator
    upp = std::upper_bound(low, CondDirectiveLocs.end(),
                           Range.getEnd(), CondDirectiveLoc::Comp(SourceMgr));
  SourceLocation uppRegion;
  if (upp != CondDirectiveLocs.end())
    uppRegion = upp->getRegionLoc();

  // Intersects iff the two endpoints live in different regions.
  return low->getRegionLoc() != uppRegion;
}

/// Return the location of the conditional region that contains \p Loc, or an
/// invalid location if it is in the top-level (unconditional) region.
SourceLocation PPConditionalDirectiveRecord::findConditionalDirectiveRegionLoc(
    SourceLocation Loc) const {
  if (Loc.isInvalid())
    return SourceLocation();
  if (CondDirectiveLocs.empty())
    return SourceLocation();

  // Past the last recorded directive: the current (possibly still open)
  // region on top of the stack applies.
  if (SourceMgr.isBeforeInTranslationUnit(CondDirectiveLocs.back().getLoc(),
                                          Loc))
    return CondDirectiveStack.back();

  CondDirectiveLocsTy::const_iterator low = llvm::lower_bound(
      CondDirectiveLocs, Loc, CondDirectiveLoc::Comp(SourceMgr));
  assert(low != CondDirectiveLocs.end());
  return low->getRegionLoc();
}

void PPConditionalDirectiveRecord::addCondDirectiveLoc(
    CondDirectiveLoc DirLoc) {
  // Ignore directives in system headers.
  if (SourceMgr.isInSystemHeader(DirLoc.getLoc()))
    return;

  // Directives must arrive in translation-unit order to keep the list sorted
  // for the binary searches above.
  assert(CondDirectiveLocs.empty() ||
         SourceMgr.isBeforeInTranslationUnit(CondDirectiveLocs.back().getLoc(),
                                             DirLoc.getLoc()));
  CondDirectiveLocs.push_back(DirLoc);
}

// #if opens a new region.
void PPConditionalDirectiveRecord::If(SourceLocation Loc,
                                      SourceRange ConditionRange,
                                      ConditionValueKind ConditionValue) {
  addCondDirectiveLoc(CondDirectiveLoc(Loc, CondDirectiveStack.back()));
  CondDirectiveStack.push_back(Loc);
}

// #ifdef opens a new region.
void PPConditionalDirectiveRecord::Ifdef(SourceLocation Loc,
                                         const Token &MacroNameTok,
                                         const MacroDefinition &MD) {
  addCondDirectiveLoc(CondDirectiveLoc(Loc, CondDirectiveStack.back()));
  CondDirectiveStack.push_back(Loc);
}

// #ifndef opens a new region.
void PPConditionalDirectiveRecord::Ifndef(SourceLocation Loc,
                                          const Token &MacroNameTok,
                                          const MacroDefinition &MD) {
  addCondDirectiveLoc(CondDirectiveLoc(Loc, CondDirectiveStack.back()));
  CondDirectiveStack.push_back(Loc);
}

// #elif replaces the current region rather than nesting one.
void PPConditionalDirectiveRecord::Elif(SourceLocation Loc,
                                        SourceRange ConditionRange,
                                        ConditionValueKind ConditionValue,
                                        SourceLocation IfLoc) {
  addCondDirectiveLoc(CondDirectiveLoc(Loc, CondDirectiveStack.back()));
  CondDirectiveStack.back() = Loc;
}

// #else replaces the current region rather than nesting one.
void PPConditionalDirectiveRecord::Else(SourceLocation Loc,
                                        SourceLocation IfLoc) {
  addCondDirectiveLoc(CondDirectiveLoc(Loc, CondDirectiveStack.back()));
  CondDirectiveStack.back() = Loc;
}

// #endif closes the current region.
void PPConditionalDirectiveRecord::Endif(SourceLocation Loc,
                                         SourceLocation IfLoc) {
  addCondDirectiveLoc(CondDirectiveLoc(Loc, CondDirectiveStack.back()));
  assert(!CondDirectiveStack.empty());
  CondDirectiveStack.pop_back();
}

size_t PPConditionalDirectiveRecord::getTotalMemory() const {
  return llvm::capacity_in_bytes(CondDirectiveLocs);
}
PPDirectives.cpp - Directive Handling for Preprocessor -----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// Implements # directive processing for the Preprocessor. +/// +//===----------------------------------------------------------------------===// + +#include "clang/Basic/CharInfo.h" +#include "clang/Basic/FileManager.h" +#include "clang/Basic/IdentifierTable.h" +#include "clang/Basic/LangOptions.h" +#include "clang/Basic/Module.h" +#include "clang/Basic/SourceLocation.h" +#include "clang/Basic/SourceManager.h" +#include "clang/Basic/TokenKinds.h" +#include "clang/Lex/CodeCompletionHandler.h" +#include "clang/Lex/HeaderSearch.h" +#include "clang/Lex/LexDiagnostic.h" +#include "clang/Lex/LiteralSupport.h" +#include "clang/Lex/MacroInfo.h" +#include "clang/Lex/ModuleLoader.h" +#include "clang/Lex/ModuleMap.h" +#include "clang/Lex/PPCallbacks.h" +#include "clang/Lex/Pragma.h" +#include "clang/Lex/Preprocessor.h" +#include "clang/Lex/PreprocessorOptions.h" +#include "clang/Lex/Token.h" +#include "clang/Lex/VariadicMacroSupport.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/ScopeExit.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/AlignOf.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/Path.h" +#include <algorithm> +#include <cassert> +#include <cstring> +#include <new> +#include <string> +#include <utility> + +using namespace clang; + +//===----------------------------------------------------------------------===// +// Utility Methods for Preprocessor Directive Handling. 
// Allocate a new MacroInfo from the preprocessor's bump allocator, linking it
// into the chain used to run destructors when the preprocessor is destroyed.
MacroInfo *Preprocessor::AllocateMacroInfo(SourceLocation L) {
  auto *MIChain = new (BP) MacroInfoChain{L, MIChainHead};
  MIChainHead = MIChain;
  return &MIChain->MI;
}

DefMacroDirective *Preprocessor::AllocateDefMacroDirective(MacroInfo *MI,
                                                           SourceLocation Loc) {
  return new (BP) DefMacroDirective(MI, Loc);
}

UndefMacroDirective *
Preprocessor::AllocateUndefMacroDirective(SourceLocation UndefLoc) {
  return new (BP) UndefMacroDirective(UndefLoc);
}

VisibilityMacroDirective *
Preprocessor::AllocateVisibilityMacroDirective(SourceLocation Loc,
                                               bool isPublic) {
  return new (BP) VisibilityMacroDirective(Loc, isPublic);
}

/// Read and discard all tokens remaining on the current line until
/// the tok::eod token is found.
///
/// \returns the source range from the first discarded token to the eod.
SourceRange Preprocessor::DiscardUntilEndOfDirective() {
  Token Tmp;
  SourceRange Res;

  LexUnexpandedToken(Tmp);
  Res.setBegin(Tmp.getLocation());
  while (Tmp.isNot(tok::eod)) {
    assert(Tmp.isNot(tok::eof) && "EOF seen while discarding directive tokens");
    LexUnexpandedToken(Tmp);
  }
  Res.setEnd(Tmp.getLocation());
  return Res;
}

/// Enumerates possible cases of #define/#undef a reserved identifier.
enum MacroDiag {
  MD_NoWarn,        //> Not a reserved identifier
  MD_KeywordDef,    //> Macro hides keyword, enabled by default
  MD_ReservedMacro  //> #define or #undef of a reserved id, disabled by default
};

/// Checks if the specified identifier is reserved in the specified
/// language.
/// This function does not check if the identifier is a keyword.
static bool isReservedId(StringRef Text, const LangOptions &Lang) {
  // C++ [macro.names], C11 7.1.3:
  //   All identifiers that begin with an underscore and either an uppercase
  //   letter or another underscore are always reserved for any use.
  if (Text.size() >= 2 && Text[0] == '_' &&
      (isUppercase(Text[1]) || Text[1] == '_'))
      return true;
  // C++ [global.names]
  //   Each name that contains a double underscore ... is reserved to the
  //   implementation for any use.
  if (Lang.CPlusPlus) {
    if (Text.find("__") != StringRef::npos)
      return true;
  }
  return false;
}
// The -fmodule-name option tells the compiler to textually include headers in
// the specified module, meaning clang won't build the specified module. This is
// useful in a number of situations, for instance, when building a library that
// vends a module map, one might want to avoid hitting intermediate build
// products containing the module map or avoid finding the system installed
// modulemap for that library.
static bool isForModuleBuilding(Module *M, StringRef CurrentModule,
                                StringRef ModuleName) {
  StringRef TopLevelName = M->getTopLevelModuleName();

  // When building framework Foo, we want to make sure that Foo *and*
  // Foo_Private are textually included and no modules are built for both.
  if (M->getTopLevelModule()->IsFramework && CurrentModule == ModuleName &&
      !CurrentModule.endswith("_Private") && TopLevelName.endswith("_Private"))
    TopLevelName = TopLevelName.drop_back(8);

  return TopLevelName == CurrentModule;
}

// Classify how to diagnose a #define of the given identifier: reserved ids
// get the (default-off) reserved-macro warning; keywords (including
// 'override'/'final' in C++11) get the keyword-hiding warning.
static MacroDiag shouldWarnOnMacroDef(Preprocessor &PP, IdentifierInfo *II) {
  const LangOptions &Lang = PP.getLangOpts();
  StringRef Text = II->getName();
  if (isReservedId(Text, Lang))
    return MD_ReservedMacro;
  if (II->isKeyword(Lang))
    return MD_KeywordDef;
  if (Lang.CPlusPlus11 && (Text.equals("override") || Text.equals("final")))
    return MD_KeywordDef;
  return MD_NoWarn;
}
// Classify how to diagnose a #undef of the given identifier.
static MacroDiag shouldWarnOnMacroUndef(Preprocessor &PP, IdentifierInfo *II) {
  const LangOptions &Lang = PP.getLangOpts();
  StringRef Text = II->getName();
  // Do not warn on keyword undef. It is generally harmless and widely used.
  if (isReservedId(Text, Lang))
    return MD_ReservedMacro;
  return MD_NoWarn;
}

// Return true if we want to issue a diagnostic by default if we
// encounter this name in a #include with the wrong case. For now,
// this includes the standard C and C++ headers, Posix headers, and
// Boost headers. Improper case for these #includes is a potential
// portability issue.
static bool warnByDefaultOnWrongCase(StringRef Include) {
  // If the first component of the path is "boost", treat this like a standard header
  // for the purposes of diagnostics.
  if (::llvm::sys::path::begin(Include)->equals_lower("boost"))
    return true;

  // "condition_variable" is the longest standard header name at 18 characters.
  // If the include file name is longer than that, it can't be a standard header.
  static const size_t MaxStdHeaderNameLen = 18u;
  if (Include.size() > MaxStdHeaderNameLen)
    return false;

  // Lowercase and normalize the search string.
  SmallString<32> LowerInclude{Include};
  for (char &Ch : LowerInclude) {
    // In the ASCII range?
    if (static_cast<unsigned char>(Ch) > 0x7f)
      return false; // Can't be a standard header
    // ASCII lowercase:
    if (Ch >= 'A' && Ch <= 'Z')
      Ch += 'a' - 'A';
    // Normalize path separators for comparison purposes.
    else if (::llvm::sys::path::is_separator(Ch))
      Ch = '/';
  }

  // The standard C/C++ and Posix headers
  return llvm::StringSwitch<bool>(LowerInclude)
    // C library headers
    .Cases("assert.h", "complex.h", "ctype.h", "errno.h", "fenv.h", true)
    .Cases("float.h", "inttypes.h", "iso646.h", "limits.h", "locale.h", true)
    .Cases("math.h", "setjmp.h", "signal.h", "stdalign.h", "stdarg.h", true)
    .Cases("stdatomic.h", "stdbool.h", "stddef.h", "stdint.h", "stdio.h", true)
    .Cases("stdlib.h", "stdnoreturn.h", "string.h", "tgmath.h", "threads.h", true)
    .Cases("time.h", "uchar.h", "wchar.h", "wctype.h", true)

    // C++ headers for C library facilities
    .Cases("cassert", "ccomplex", "cctype", "cerrno", "cfenv", true)
    .Cases("cfloat", "cinttypes", "ciso646", "climits", "clocale", true)
    .Cases("cmath", "csetjmp", "csignal", "cstdalign", "cstdarg", true)
    .Cases("cstdbool", "cstddef", "cstdint", "cstdio", "cstdlib", true)
    .Cases("cstring", "ctgmath", "ctime", "cuchar", "cwchar", true)
    .Case("cwctype", true)

    // C++ library headers
    .Cases("algorithm", "fstream", "list", "regex", "thread", true)
    .Cases("array", "functional", "locale", "scoped_allocator", "tuple", true)
    .Cases("atomic", "future", "map", "set", "type_traits", true)
    .Cases("bitset", "initializer_list", "memory", "shared_mutex", "typeindex", true)
    .Cases("chrono", "iomanip", "mutex", "sstream", "typeinfo", true)
    .Cases("codecvt", "ios", "new", "stack", "unordered_map", true)
    .Cases("complex", "iosfwd", "numeric", "stdexcept", "unordered_set", true)
    .Cases("condition_variable", "iostream", "ostream", "streambuf", "utility", true)
    .Cases("deque", "istream", "queue", "string", "valarray", true)
    .Cases("exception", "iterator", "random", "strstream", "vector", true)
    .Cases("forward_list", "limits", "ratio", "system_error", true)

    // POSIX headers (which aren't also C headers)
    .Cases("aio.h", "arpa/inet.h", "cpio.h", "dirent.h", "dlfcn.h", true)
    .Cases("fcntl.h", "fmtmsg.h", "fnmatch.h", "ftw.h", "glob.h", true)
    .Cases("grp.h", "iconv.h", "langinfo.h", "libgen.h", "monetary.h", true)
    .Cases("mqueue.h", "ndbm.h", "net/if.h", "netdb.h", "netinet/in.h", true)
    .Cases("netinet/tcp.h", "nl_types.h", "poll.h", "pthread.h", "pwd.h", true)
    .Cases("regex.h", "sched.h", "search.h", "semaphore.h", "spawn.h", true)
    .Cases("strings.h", "stropts.h", "sys/ipc.h", "sys/mman.h", "sys/msg.h", true)
    .Cases("sys/resource.h", "sys/select.h", "sys/sem.h", "sys/shm.h", "sys/socket.h", true)
    .Cases("sys/stat.h", "sys/statvfs.h", "sys/time.h", "sys/times.h", "sys/types.h", true)
    .Cases("sys/uio.h", "sys/un.h", "sys/utsname.h", "sys/wait.h", "syslog.h", true)
    .Cases("tar.h", "termios.h", "trace.h", "ulimit.h", true)
    .Cases("unistd.h", "utime.h", "utmpx.h", "wordexp.h", true)
    .Default(false);
}
/// Verify that the token is a valid macro name for a #define/#undef (or other
/// macro use, per \p isDefineUndef), emitting diagnostics as needed.
///
/// \param MacroNameTok The token to validate.
/// \param isDefineUndef Context in which the macro name appears.
/// \param ShadowFlag If non-null, set to true when the name shadows a keyword
///        and the caller should decide whether to warn.
///
/// \returns true (a nonzero diagnostic result) if the name is invalid,
/// false if it is acceptable.
bool Preprocessor::CheckMacroName(Token &MacroNameTok, MacroUse isDefineUndef,
                                  bool *ShadowFlag) {
  // Missing macro name?
  if (MacroNameTok.is(tok::eod))
    return Diag(MacroNameTok, diag::err_pp_missing_macro_name);

  IdentifierInfo *II = MacroNameTok.getIdentifierInfo();
  if (!II)
    return Diag(MacroNameTok, diag::err_pp_macro_not_identifier);

  if (II->isCPlusPlusOperatorKeyword()) {
    // C++ 2.5p2: An alternative token behaves the same as its primary token
    // except for its spelling.
    Diag(MacroNameTok, getLangOpts().MicrosoftExt
                           ? diag::ext_pp_operator_used_as_macro_name
                           : diag::err_pp_operator_used_as_macro_name)
        << II << MacroNameTok.getKind();
    // Allow #defining |and| and friends for Microsoft compatibility or
    // recovery when legacy C headers are included in C++.
  }

  if ((isDefineUndef != MU_Other) && II->getPPKeywordID() == tok::pp_defined) {
    // Error if defining "defined": C99 6.10.8/4, C++ [cpp.predefined]p4.
    return Diag(MacroNameTok, diag::err_defined_macro_name);
  }

  if (isDefineUndef == MU_Undef) {
    auto *MI = getMacroInfo(II);
    if (MI && MI->isBuiltinMacro()) {
      // Warn if undefining "__LINE__" and other builtins, per C99 6.10.8/4
      // and C++ [cpp.predefined]p4], but allow it as an extension.
      Diag(MacroNameTok, diag::ext_pp_undef_builtin_macro);
    }
  }

  // If defining/undefining reserved identifier or a keyword, we need to issue
  // a warning.
  SourceLocation MacroNameLoc = MacroNameTok.getLocation();
  if (ShadowFlag)
    *ShadowFlag = false;
  if (!SourceMgr.isInSystemHeader(MacroNameLoc) &&
      (SourceMgr.getBufferName(MacroNameLoc) != "<built-in>")) {
    MacroDiag D = MD_NoWarn;
    if (isDefineUndef == MU_Define) {
      D = shouldWarnOnMacroDef(*this, II);
    }
    else if (isDefineUndef == MU_Undef)
      D = shouldWarnOnMacroUndef(*this, II);
    if (D == MD_KeywordDef) {
      // We do not want to warn on some patterns widely used in configuration
      // scripts. This requires analyzing next tokens, so do not issue warnings
      // now, only inform caller.
      if (ShadowFlag)
        *ShadowFlag = true;
    }
    if (D == MD_ReservedMacro)
      Diag(MacroNameTok, diag::warn_pp_macro_is_reserved_id);
  }

  // Okay, we got a good identifier.
  return false;
}
  LexUnexpandedToken(MacroNameTok);

  if (MacroNameTok.is(tok::code_completion)) {
    if (CodeComplete)
      CodeComplete->CodeCompleteMacroName(isDefineUndef == MU_Define);
    setCodeCompletionReached();
    // Re-lex: the code-completion token itself is not the macro name.
    LexUnexpandedToken(MacroNameTok);
  }

  if (!CheckMacroName(MacroNameTok, isDefineUndef, ShadowFlag))
    return;

  // Invalid macro name, read and discard the rest of the line and set the
  // token kind to tok::eod if necessary.
  if (MacroNameTok.isNot(tok::eod)) {
    MacroNameTok.setKind(tok::eod);
    DiscardUntilEndOfDirective();
  }
}

/// Ensure that the next token is a tok::eod token.
///
/// If not, emit a diagnostic and consume up until the eod.  If EnableMacros is
/// true, then we consider macros that expand to zero tokens as being ok.
///
/// Returns the location of the end of the directive.
SourceLocation Preprocessor::CheckEndOfDirective(const char *DirType,
                                                 bool EnableMacros) {
  Token Tmp;
  // Lex unexpanded tokens for most directives: macros might expand to zero
  // tokens, causing us to miss diagnosing invalid lines.  Some directives (like
  // #line) allow empty macros.
  if (EnableMacros)
    Lex(Tmp);
  else
    LexUnexpandedToken(Tmp);

  // There should be no tokens after the directive, but we allow them as an
  // extension.
  while (Tmp.is(tok::comment))  // Skip comments in -C mode.
    LexUnexpandedToken(Tmp);

  if (Tmp.is(tok::eod))
    return Tmp.getLocation();

  // Add a fixit in GNU/C99/C++ mode.  Don't offer a fixit for strict-C89,
  // or if this is a macro-style preprocessing directive, because it is more
  // trouble than it is worth to insert /**/ and check that there is no /**/
  // in the range also.
  FixItHint Hint;
  if ((LangOpts.GNUMode || LangOpts.C99 || LangOpts.CPlusPlus) &&
      !CurTokenLexer)
    Hint = FixItHint::CreateInsertion(Tmp.getLocation(),"//");
  Diag(Tmp, diag::ext_pp_extra_tokens_at_eol) << DirType << Hint;
  return DiscardUntilEndOfDirective().getEnd();
}

/// If a precomputed skipped-range mapping covers the '#' of an excluded
/// conditional block at \p HashLoc, return the number of bytes the current
/// lexer may skip over; returns None when no mapping applies.  The result is
/// adjusted for the distance the lexer has already advanced past the '#'.
Optional<unsigned> Preprocessor::getSkippedRangeForExcludedConditionalBlock(
    SourceLocation HashLoc) {
  if (!ExcludedConditionalDirectiveSkipMappings)
    return None;
  // Only real file locations can be mapped; macro locations cannot.
  if (!HashLoc.isFileID())
    return None;

  std::pair<FileID, unsigned> HashFileOffset =
      SourceMgr.getDecomposedLoc(HashLoc);
  const llvm::MemoryBuffer *Buf = SourceMgr.getBuffer(HashFileOffset.first);
  auto It = ExcludedConditionalDirectiveSkipMappings->find(Buf);
  if (It == ExcludedConditionalDirectiveSkipMappings->end())
    return None;

  const PreprocessorSkippedRangeMapping &SkippedRanges = *It->getSecond();
  // Check if the offset of '#' is mapped in the skipped ranges.
  auto MappingIt = SkippedRanges.find(HashFileOffset.second);
  if (MappingIt == SkippedRanges.end())
    return None;

  unsigned BytesToSkip = MappingIt->getSecond();
  unsigned CurLexerBufferOffset = CurLexer->getCurrentBufferOffset();
  assert(CurLexerBufferOffset >= HashFileOffset.second &&
         "lexer is before the hash?");
  // Take into account the fact that the lexer has already advanced, so the
  // number of bytes to skip must be adjusted.
  unsigned LengthDiff = CurLexerBufferOffset - HashFileOffset.second;
  assert(BytesToSkip >= LengthDiff && "lexer is after the skipped range?");
  return BytesToSkip - LengthDiff;
}

/// SkipExcludedConditionalBlock - We just read a \#if or related directive and
/// decided that the subsequent tokens are in the \#if'd out portion of the
/// file.  Lex the rest of the file, until we see an \#endif.  If
/// FoundNonSkipPortion is true, then we have already emitted code for part of
/// this \#if directive, so \#else/\#elif blocks should never be entered.
/// If ElseOk is true, then \#else directives are ok, if not, then we have
/// already seen one so a \#else directive is a duplicate.  When this returns,
/// the caller can lex the first valid token.
void Preprocessor::SkipExcludedConditionalBlock(SourceLocation HashTokenLoc,
                                                SourceLocation IfTokenLoc,
                                                bool FoundNonSkipPortion,
                                                bool FoundElse,
                                                SourceLocation ElseLoc) {
  ++NumSkipped;
  assert(!CurTokenLexer && CurPPLexer && "Lexing a macro, not a file?");

  if (PreambleConditionalStack.reachedEOFWhileSkipping())
    PreambleConditionalStack.clearSkipInfo();
  else
    CurPPLexer->pushConditionalLevel(IfTokenLoc, /*isSkipping*/ false,
                                     FoundNonSkipPortion, FoundElse);

  // Enter raw mode to disable identifier lookup (and thus macro expansion),
  // disabling warnings, etc.
  CurPPLexer->LexingRawMode = true;
  Token Tok;
  if (auto SkipLength =
          getSkippedRangeForExcludedConditionalBlock(HashTokenLoc)) {
    // Skip to the next '#endif' / '#else' / '#elif'.
    CurLexer->skipOver(*SkipLength);
  }
  // Scan tokens in raw mode until the matching #else/#elif/#endif (or EOF)
  // terminates the skipped region.
  while (true) {
    CurLexer->Lex(Tok);

    if (Tok.is(tok::code_completion)) {
      if (CodeComplete)
        CodeComplete->CodeCompleteInConditionalExclusion();
      setCodeCompletionReached();
      continue;
    }

    // If this is the end of the buffer, we have an error.
    if (Tok.is(tok::eof)) {
      // We don't emit errors for unterminated conditionals here,
      // Lexer::LexEndOfFile can do that properly.
      // Just return and let the caller lex after this #include.
      if (PreambleConditionalStack.isRecording())
        PreambleConditionalStack.SkipInfo.emplace(
            HashTokenLoc, IfTokenLoc, FoundNonSkipPortion, FoundElse, ElseLoc);
      break;
    }

    // If this token is not a preprocessor directive, just skip it.
    if (Tok.isNot(tok::hash) || !Tok.isAtStartOfLine())
      continue;

    // We just parsed a # character at the start of a line, so we're in
    // directive mode.  Tell the lexer this so any newlines we see will be
    // converted into an EOD token (this terminates the macro).
    CurPPLexer->ParsingPreprocessorDirective = true;
    if (CurLexer) CurLexer->SetKeepWhitespaceMode(false);


    // Read the next token, the directive flavor.
    LexUnexpandedToken(Tok);

    // If this isn't an identifier directive (e.g. is "# 1\n" or "#\n", or
    // something bogus), skip it.
    if (Tok.isNot(tok::raw_identifier)) {
      CurPPLexer->ParsingPreprocessorDirective = false;
      // Restore comment saving mode.
      if (CurLexer) CurLexer->resetExtendedTokenMode();
      continue;
    }

    // If the first letter isn't i or e, it isn't interesting to us.  We know
    // that this is safe in the face of spelling differences, because there is
    // no way to spell an i/e in a strange way that is another letter.  Skipping
    // this allows us to avoid looking up the identifier info for #define/#undef
    // and other common directives.
    StringRef RI = Tok.getRawIdentifier();

    char FirstChar = RI[0];
    if (FirstChar >= 'a' && FirstChar <= 'z' &&
        FirstChar != 'i' && FirstChar != 'e') {
      CurPPLexer->ParsingPreprocessorDirective = false;
      // Restore comment saving mode.
      if (CurLexer) CurLexer->resetExtendedTokenMode();
      continue;
    }

    // Get the identifier name without trigraphs or embedded newlines.  Note
    // that we can't use Tok.getIdentifierInfo() because its lookup is disabled
    // when skipping.
    char DirectiveBuf[20];
    StringRef Directive;
    if (!Tok.needsCleaning() && RI.size() < 20) {
      Directive = RI;
    } else {
      std::string DirectiveStr = getSpelling(Tok);
      size_t IdLen = DirectiveStr.size();
      if (IdLen >= 20) {
        // Too long to be any directive we care about; skip it.
        CurPPLexer->ParsingPreprocessorDirective = false;
        // Restore comment saving mode.
        if (CurLexer) CurLexer->resetExtendedTokenMode();
        continue;
      }
      memcpy(DirectiveBuf, &DirectiveStr[0], IdLen);
      Directive = StringRef(DirectiveBuf, IdLen);
    }

    if (Directive.startswith("if")) {
      StringRef Sub = Directive.substr(2);
      if (Sub.empty() ||   // "if"
          Sub == "def" ||  // "ifdef"
          Sub == "ndef") { // "ifndef"
        // We know the entire #if/#ifdef/#ifndef block will be skipped, don't
        // bother parsing the condition.
        DiscardUntilEndOfDirective();
        CurPPLexer->pushConditionalLevel(Tok.getLocation(), /*wasskipping*/true,
                                         /*foundnonskip*/false,
                                         /*foundelse*/false);
      }
    } else if (Directive[0] == 'e') {
      StringRef Sub = Directive.substr(1);
      if (Sub == "ndif") { // "endif"
        PPConditionalInfo CondInfo;
        CondInfo.WasSkipping = true; // Silence bogus warning.
        bool InCond = CurPPLexer->popConditionalLevel(CondInfo);
        (void)InCond; // Silence warning in no-asserts mode.
        assert(!InCond && "Can't be skipping if not in a conditional!");

        // If we popped the outermost skipping block, we're done skipping!
        if (!CondInfo.WasSkipping) {
          // Restore the value of LexingRawMode so that trailing comments
          // are handled correctly, if we've reached the outermost block.
          CurPPLexer->LexingRawMode = false;
          CheckEndOfDirective("endif");
          CurPPLexer->LexingRawMode = true;
          if (Callbacks)
            Callbacks->Endif(Tok.getLocation(), CondInfo.IfLoc);
          break;
        } else {
          DiscardUntilEndOfDirective();
        }
      } else if (Sub == "lse") { // "else".
        // #else directive in a skipping conditional.  If not in some other
        // skipping conditional, and if #else hasn't already been seen, enter it
        // as a non-skipping conditional.
        PPConditionalInfo &CondInfo = CurPPLexer->peekConditionalLevel();

        // If this is a #else with a #else before it, report the error.
        if (CondInfo.FoundElse) Diag(Tok, diag::pp_err_else_after_else);

        // Note that we've seen a #else in this conditional.
        CondInfo.FoundElse = true;

        // If the conditional is at the top level, and the #if block wasn't
        // entered, enter the #else block now.
        if (!CondInfo.WasSkipping && !CondInfo.FoundNonSkip) {
          CondInfo.FoundNonSkip = true;
          // Restore the value of LexingRawMode so that trailing comments
          // are handled correctly.
          CurPPLexer->LexingRawMode = false;
          CheckEndOfDirective("else");
          CurPPLexer->LexingRawMode = true;
          if (Callbacks)
            Callbacks->Else(Tok.getLocation(), CondInfo.IfLoc);
          break;
        } else {
          DiscardUntilEndOfDirective();  // C99 6.10p4.
        }
      } else if (Sub == "lif") { // "elif".
        PPConditionalInfo &CondInfo = CurPPLexer->peekConditionalLevel();

        // If this is a #elif with a #else before it, report the error.
        if (CondInfo.FoundElse) Diag(Tok, diag::pp_err_elif_after_else);

        // If this is in a skipping block or if we've already handled this #if
        // block, don't bother parsing the condition.
        if (CondInfo.WasSkipping || CondInfo.FoundNonSkip) {
          DiscardUntilEndOfDirective();
        } else {
          // Restore the value of LexingRawMode so that identifiers are
          // looked up, etc, inside the #elif expression.
          assert(CurPPLexer->LexingRawMode && "We have to be skipping here!");
          CurPPLexer->LexingRawMode = false;
          IdentifierInfo *IfNDefMacro = nullptr;
          DirectiveEvalResult DER = EvaluateDirectiveExpression(IfNDefMacro);
          const bool CondValue = DER.Conditional;
          CurPPLexer->LexingRawMode = true;
          if (Callbacks) {
            Callbacks->Elif(
                Tok.getLocation(), DER.ExprRange,
                (CondValue ? PPCallbacks::CVK_True : PPCallbacks::CVK_False),
                CondInfo.IfLoc);
          }
          // If this condition is true, enter it!
          if (CondValue) {
            CondInfo.FoundNonSkip = true;
            break;
          }
        }
      }
    }

    CurPPLexer->ParsingPreprocessorDirective = false;
    // Restore comment saving mode.
    if (CurLexer) CurLexer->resetExtendedTokenMode();
  }

  // Finally, if we are out of the conditional (saw an #endif or ran off the end
  // of the file), just stop skipping and return to lexing whatever came after
  // the #if block.
  CurPPLexer->LexingRawMode = false;

  // The last skipped range isn't actually skipped yet if it's truncated
  // by the end of the preamble; we'll resume parsing after the preamble.
  if (Callbacks && (Tok.isNot(tok::eof) || !isRecordingPreamble()))
    Callbacks->SourceRangeSkipped(
        SourceRange(HashTokenLoc, CurPPLexer->getSourceLocation()),
        Tok.getLocation());
}

/// Determine which module (if any) provides the file containing \p Loc; for
/// the main file (or a location not in any file) this is the module currently
/// being built, if one is named in the language options.
Module *Preprocessor::getModuleForLocation(SourceLocation Loc) {
  if (!SourceMgr.isInMainFile(Loc)) {
    // Try to determine the module of the include directive.
    // FIXME: Look into directly passing the FileEntry from LookupFile instead.
    FileID IDOfIncl = SourceMgr.getFileID(SourceMgr.getExpansionLoc(Loc));
    if (const FileEntry *EntryOfIncl = SourceMgr.getFileEntryForID(IDOfIncl)) {
      // The include comes from an included file.
      return HeaderInfo.getModuleMap()
          .findModuleForHeader(EntryOfIncl)
          .getModule();
    }
  }

  // This is either in the main file or not in a file at all. It belongs
  // to the current module, if there is one.
  return getLangOpts().CurrentModule.empty()
             ? nullptr
             : HeaderInfo.lookupModule(getLangOpts().CurrentModule);
}

const FileEntry *
Preprocessor::getModuleHeaderToIncludeForDiagnostics(SourceLocation IncLoc,
                                                     Module *M,
                                                     SourceLocation Loc) {
  assert(M && "no module to include");

  // If the context is the global module fragment of some module, we never
  // want to return that file; instead, we want the innermost include-guarded
  // header that it included.
  bool InGlobalModuleFragment = M->Kind == Module::GlobalModuleFragment;

  // If we have a module import syntax, we shouldn't include a header to
  // make a particular module visible.
  if ((getLangOpts().ObjC || getLangOpts().CPlusPlusModules ||
       getLangOpts().ModulesTS) &&
      !InGlobalModuleFragment)
    return nullptr;

  Module *TopM = M->getTopLevelModule();
  Module *IncM = getModuleForLocation(IncLoc);

  // Walk up through the include stack, looking through textual headers of M
  // until we hit a non-textual header that we can #include. (We assume textual
  // headers of a module with non-textual headers aren't meant to be used to
  // import entities from the module.)
  auto &SM = getSourceManager();
  while (!Loc.isInvalid() && !SM.isInMainFile(Loc)) {
    auto ID = SM.getFileID(SM.getExpansionLoc(Loc));
    auto *FE = SM.getFileEntryForID(ID);
    if (!FE)
      break;

    if (InGlobalModuleFragment) {
      // In the global module fragment, any include-guarded header will do.
      if (getHeaderSearchInfo().isFileMultipleIncludeGuarded(FE))
        return FE;
      Loc = SM.getIncludeLoc(ID);
      continue;
    }

    bool InTextualHeader = false;
    for (auto Header : HeaderInfo.getModuleMap().findAllModulesForHeader(FE)) {
      if (!Header.getModule()->isSubModuleOf(TopM))
        continue;

      if (!(Header.getRole() & ModuleMap::TextualHeader)) {
        // If this is an accessible, non-textual header of M's top-level module
        // that transitively includes the given location and makes the
        // corresponding module visible, this is the thing to #include.
        if (Header.isAccessibleFrom(IncM))
          return FE;

        // It's in a private header; we can't #include it.
        // FIXME: If there's a public header in some module that re-exports it,
        // then we could suggest including that, but it's not clear that's the
        // expected way to make this entity visible.
        continue;
      }

      InTextualHeader = true;
    }

    if (!InTextualHeader)
      break;

    Loc = SM.getIncludeLoc(ID);
  }

  return nullptr;
}

/// Given a "foo" or <foo> reference, look up the indicated file.
///
/// First tries the normal header-search paths (honoring \p FromDir /
/// \p FromFile for #include_next-style lookups), then falls back to
/// subframework lookup relative to each header on the current #include stack.
/// The out-parameters (SearchPath, RelativePath, SuggestedModule, IsMapped,
/// IsFrameworkFound) are forwarded to the underlying HeaderSearch lookup.
/// Returns None if the file could not be found.
Optional<FileEntryRef> Preprocessor::LookupFile(
    SourceLocation FilenameLoc, StringRef Filename, bool isAngled,
    const DirectoryLookup *FromDir, const FileEntry *FromFile,
    const DirectoryLookup *&CurDir, SmallVectorImpl<char> *SearchPath,
    SmallVectorImpl<char> *RelativePath,
    ModuleMap::KnownHeader *SuggestedModule, bool *IsMapped,
    bool *IsFrameworkFound, bool SkipCache) {
  Module *RequestingModule = getModuleForLocation(FilenameLoc);
  bool RequestingModuleIsModuleInterface = !SourceMgr.isInMainFile(FilenameLoc);

  // If the header lookup mechanism may be relative to the current inclusion
  // stack, record the parent #includes.
  SmallVector<std::pair<const FileEntry *, const DirectoryEntry *>, 16>
      Includers;
  bool BuildSystemModule = false;
  if (!FromDir && !FromFile) {
    FileID FID = getCurrentFileLexer()->getFileID();
    const FileEntry *FileEnt = SourceMgr.getFileEntryForID(FID);

    // If there is no file entry associated with this file, it must be the
    // predefines buffer or the module includes buffer. Any other file is not
    // lexed with a normal lexer, so it won't be scanned for preprocessor
    // directives.
    //
    // If we have the predefines buffer, resolve #include references (which come
    // from the -include command line argument) from the current working
    // directory instead of relative to the main file.
    //
    // If we have the module includes buffer, resolve #include references (which
    // come from header declarations in the module map) relative to the module
    // map file.
    if (!FileEnt) {
      if (FID == SourceMgr.getMainFileID() && MainFileDir) {
        Includers.push_back(std::make_pair(nullptr, MainFileDir));
        BuildSystemModule = getCurrentModule()->IsSystem;
      } else if ((FileEnt =
                      SourceMgr.getFileEntryForID(SourceMgr.getMainFileID())))
        Includers.push_back(std::make_pair(FileEnt, *FileMgr.getDirectory(".")));
    } else {
      Includers.push_back(std::make_pair(FileEnt, FileEnt->getDir()));
    }

    // MSVC searches the current include stack from top to bottom for
    // headers included by quoted include directives.
    // See: http://msdn.microsoft.com/en-us/library/36k2cdd4.aspx
    if (LangOpts.MSVCCompat && !isAngled) {
      for (IncludeStackInfo &ISEntry : llvm::reverse(IncludeMacroStack)) {
        if (IsFileLexer(ISEntry))
          if ((FileEnt = ISEntry.ThePPLexer->getFileEntry()))
            Includers.push_back(std::make_pair(FileEnt, FileEnt->getDir()));
      }
    }
  }

  CurDir = CurDirLookup;

  if (FromFile) {
    // We're supposed to start looking from after a particular file. Search
    // the include path until we find that file or run out of files.
    const DirectoryLookup *TmpCurDir = CurDir;
    const DirectoryLookup *TmpFromDir = nullptr;
    while (Optional<FileEntryRef> FE = HeaderInfo.LookupFile(
               Filename, FilenameLoc, isAngled, TmpFromDir, TmpCurDir,
               Includers, SearchPath, RelativePath, RequestingModule,
               SuggestedModule, /*IsMapped=*/nullptr,
               /*IsFrameworkFound=*/nullptr, SkipCache)) {
      // Keep looking as if this file did a #include_next.
      TmpFromDir = TmpCurDir;
      ++TmpFromDir;
      if (&FE->getFileEntry() == FromFile) {
        // Found it.
        FromDir = TmpFromDir;
        CurDir = TmpCurDir;
        break;
      }
    }
  }

  // Do a standard file entry lookup.
  Optional<FileEntryRef> FE = HeaderInfo.LookupFile(
      Filename, FilenameLoc, isAngled, FromDir, CurDir, Includers, SearchPath,
      RelativePath, RequestingModule, SuggestedModule, IsMapped,
      IsFrameworkFound, SkipCache, BuildSystemModule);
  if (FE) {
    if (SuggestedModule && !LangOpts.AsmPreprocessor)
      HeaderInfo.getModuleMap().diagnoseHeaderInclusion(
          RequestingModule, RequestingModuleIsModuleInterface, FilenameLoc,
          Filename, &FE->getFileEntry());
    return FE;
  }

  const FileEntry *CurFileEnt;
  // Otherwise, see if this is a subframework header.  If so, this is relative
  // to one of the headers on the #include stack.  Walk the list of the current
  // headers on the #include stack and pass them to HeaderInfo.
  if (IsFileLexer()) {
    if ((CurFileEnt = CurPPLexer->getFileEntry())) {
      if (Optional<FileEntryRef> FE = HeaderInfo.LookupSubframeworkHeader(
              Filename, CurFileEnt, SearchPath, RelativePath, RequestingModule,
              SuggestedModule)) {
        if (SuggestedModule && !LangOpts.AsmPreprocessor)
          HeaderInfo.getModuleMap().diagnoseHeaderInclusion(
              RequestingModule, RequestingModuleIsModuleInterface, FilenameLoc,
              Filename, &FE->getFileEntry());
        return FE;
      }
    }
  }

  for (IncludeStackInfo &ISEntry : llvm::reverse(IncludeMacroStack)) {
    if (IsFileLexer(ISEntry)) {
      if ((CurFileEnt = ISEntry.ThePPLexer->getFileEntry())) {
        if (Optional<FileEntryRef> FE = HeaderInfo.LookupSubframeworkHeader(
                Filename, CurFileEnt, SearchPath, RelativePath,
                RequestingModule, SuggestedModule)) {
          if (SuggestedModule && !LangOpts.AsmPreprocessor)
            HeaderInfo.getModuleMap().diagnoseHeaderInclusion(
                RequestingModule, RequestingModuleIsModuleInterface,
                FilenameLoc, Filename, &FE->getFileEntry());
          return FE;
        }
      }
    }
  }

  // Otherwise, we really couldn't find the file.
  return None;
}

//===----------------------------------------------------------------------===//
// Preprocessor Directive Handling.
+//===----------------------------------------------------------------------===// + +class Preprocessor::ResetMacroExpansionHelper { +public: + ResetMacroExpansionHelper(Preprocessor *pp) + : PP(pp), save(pp->DisableMacroExpansion) { + if (pp->MacroExpansionInDirectivesOverride) + pp->DisableMacroExpansion = false; + } + + ~ResetMacroExpansionHelper() { + PP->DisableMacroExpansion = save; + } + +private: + Preprocessor *PP; + bool save; +}; + +/// Process a directive while looking for the through header or a #pragma +/// hdrstop. The following directives are handled: +/// #include (to check if it is the through header) +/// #define (to warn about macros that don't match the PCH) +/// #pragma (to check for pragma hdrstop). +/// All other directives are completely discarded. +void Preprocessor::HandleSkippedDirectiveWhileUsingPCH(Token &Result, + SourceLocation HashLoc) { + if (const IdentifierInfo *II = Result.getIdentifierInfo()) { + if (II->getPPKeywordID() == tok::pp_define) { + return HandleDefineDirective(Result, + /*ImmediatelyAfterHeaderGuard=*/false); + } + if (SkippingUntilPCHThroughHeader && + II->getPPKeywordID() == tok::pp_include) { + return HandleIncludeDirective(HashLoc, Result); + } + if (SkippingUntilPragmaHdrStop && II->getPPKeywordID() == tok::pp_pragma) { + Lex(Result); + auto *II = Result.getIdentifierInfo(); + if (II && II->getName() == "hdrstop") + return HandlePragmaHdrstop(Result); + } + } + DiscardUntilEndOfDirective(); +} + +/// HandleDirective - This callback is invoked when the lexer sees a # token +/// at the start of a line. This consumes the directive, modifies the +/// lexer/preprocessor state, and advances the lexer(s) so that the next token +/// read is the correct one. +void Preprocessor::HandleDirective(Token &Result) { + // FIXME: Traditional: # with whitespace before it not recognized by K&R? + + // We just parsed a # character at the start of a line, so we're in directive + // mode. 
Tell the lexer this so any newlines we see will be converted into an + // EOD token (which terminates the directive). + CurPPLexer->ParsingPreprocessorDirective = true; + if (CurLexer) CurLexer->SetKeepWhitespaceMode(false); + + bool ImmediatelyAfterTopLevelIfndef = + CurPPLexer->MIOpt.getImmediatelyAfterTopLevelIfndef(); + CurPPLexer->MIOpt.resetImmediatelyAfterTopLevelIfndef(); + + ++NumDirectives; + + // We are about to read a token. For the multiple-include optimization FA to + // work, we have to remember if we had read any tokens *before* this + // pp-directive. + bool ReadAnyTokensBeforeDirective =CurPPLexer->MIOpt.getHasReadAnyTokensVal(); + + // Save the '#' token in case we need to return it later. + Token SavedHash = Result; + + // Read the next token, the directive flavor. This isn't expanded due to + // C99 6.10.3p8. + LexUnexpandedToken(Result); + + // C99 6.10.3p11: Is this preprocessor directive in macro invocation? e.g.: + // #define A(x) #x + // A(abc + // #warning blah + // def) + // If so, the user is relying on undefined behavior, emit a diagnostic. Do + // not support this for #include-like directives, since that can result in + // terrible diagnostics, and does not work in GCC. + if (InMacroArgs) { + if (IdentifierInfo *II = Result.getIdentifierInfo()) { + switch (II->getPPKeywordID()) { + case tok::pp_include: + case tok::pp_import: + case tok::pp_include_next: + case tok::pp___include_macros: + case tok::pp_pragma: + Diag(Result, diag::err_embedded_directive) << II->getName(); + Diag(*ArgMacro, diag::note_macro_expansion_here) + << ArgMacro->getIdentifierInfo(); + DiscardUntilEndOfDirective(); + return; + default: + break; + } + } + Diag(Result, diag::ext_embedded_directive); + } + + // Temporarily enable macro expansion if set so + // and reset to previous state when returning from this function. 
+ ResetMacroExpansionHelper helper(this); + + if (SkippingUntilPCHThroughHeader || SkippingUntilPragmaHdrStop) + return HandleSkippedDirectiveWhileUsingPCH(Result, SavedHash.getLocation()); + + switch (Result.getKind()) { + case tok::eod: + return; // null directive. + case tok::code_completion: + if (CodeComplete) + CodeComplete->CodeCompleteDirective( + CurPPLexer->getConditionalStackDepth() > 0); + setCodeCompletionReached(); + return; + case tok::numeric_constant: // # 7 GNU line marker directive. + if (getLangOpts().AsmPreprocessor) + break; // # 4 is not a preprocessor directive in .S files. + return HandleDigitDirective(Result); + default: + IdentifierInfo *II = Result.getIdentifierInfo(); + if (!II) break; // Not an identifier. + + // Ask what the preprocessor keyword ID is. + switch (II->getPPKeywordID()) { + default: break; + // C99 6.10.1 - Conditional Inclusion. + case tok::pp_if: + return HandleIfDirective(Result, SavedHash, ReadAnyTokensBeforeDirective); + case tok::pp_ifdef: + return HandleIfdefDirective(Result, SavedHash, false, + true /*not valid for miopt*/); + case tok::pp_ifndef: + return HandleIfdefDirective(Result, SavedHash, true, + ReadAnyTokensBeforeDirective); + case tok::pp_elif: + return HandleElifDirective(Result, SavedHash); + case tok::pp_else: + return HandleElseDirective(Result, SavedHash); + case tok::pp_endif: + return HandleEndifDirective(Result); + + // C99 6.10.2 - Source File Inclusion. + case tok::pp_include: + // Handle #include. + return HandleIncludeDirective(SavedHash.getLocation(), Result); + case tok::pp___include_macros: + // Handle -imacros. + return HandleIncludeMacrosDirective(SavedHash.getLocation(), Result); + + // C99 6.10.3 - Macro Replacement. + case tok::pp_define: + return HandleDefineDirective(Result, ImmediatelyAfterTopLevelIfndef); + case tok::pp_undef: + return HandleUndefDirective(); + + // C99 6.10.4 - Line Control. 
+ case tok::pp_line: + return HandleLineDirective(); + + // C99 6.10.5 - Error Directive. + case tok::pp_error: + return HandleUserDiagnosticDirective(Result, false); + + // C99 6.10.6 - Pragma Directive. + case tok::pp_pragma: + return HandlePragmaDirective({PIK_HashPragma, SavedHash.getLocation()}); + + // GNU Extensions. + case tok::pp_import: + return HandleImportDirective(SavedHash.getLocation(), Result); + case tok::pp_include_next: + return HandleIncludeNextDirective(SavedHash.getLocation(), Result); + + case tok::pp_warning: + Diag(Result, diag::ext_pp_warning_directive); + return HandleUserDiagnosticDirective(Result, true); + case tok::pp_ident: + return HandleIdentSCCSDirective(Result); + case tok::pp_sccs: + return HandleIdentSCCSDirective(Result); + case tok::pp_assert: + //isExtension = true; // FIXME: implement #assert + break; + case tok::pp_unassert: + //isExtension = true; // FIXME: implement #unassert + break; + + case tok::pp___public_macro: + if (getLangOpts().Modules) + return HandleMacroPublicDirective(Result); + break; + + case tok::pp___private_macro: + if (getLangOpts().Modules) + return HandleMacroPrivateDirective(); + break; + } + break; + } + + // If this is a .S file, treat unknown # directives as non-preprocessor + // directives. This is important because # may be a comment or introduce + // various pseudo-ops. Just return the # token and push back the following + // token to be lexed next time. + if (getLangOpts().AsmPreprocessor) { + auto Toks = std::make_unique<Token[]>(2); + // Return the # and the token after it. + Toks[0] = SavedHash; + Toks[1] = Result; + + // If the second token is a hashhash token, then we need to translate it to + // unknown so the token lexer doesn't try to perform token pasting. + if (Result.is(tok::hashhash)) + Toks[1].setKind(tok::unknown); + + // Enter this token stream so that we re-lex the tokens. 
Make sure to + // enable macro expansion, in case the token after the # is an identifier + // that is expanded. + EnterTokenStream(std::move(Toks), 2, false, /*IsReinject*/false); + return; + } + + // If we reached here, the preprocessing token is not valid! + Diag(Result, diag::err_pp_invalid_directive); + + // Read the rest of the PP line. + DiscardUntilEndOfDirective(); + + // Okay, we're done parsing the directive. +} + +/// GetLineValue - Convert a numeric token into an unsigned value, emitting +/// Diagnostic DiagID if it is invalid, and returning the value in Val. +static bool GetLineValue(Token &DigitTok, unsigned &Val, + unsigned DiagID, Preprocessor &PP, + bool IsGNULineDirective=false) { + if (DigitTok.isNot(tok::numeric_constant)) { + PP.Diag(DigitTok, DiagID); + + if (DigitTok.isNot(tok::eod)) + PP.DiscardUntilEndOfDirective(); + return true; + } + + SmallString<64> IntegerBuffer; + IntegerBuffer.resize(DigitTok.getLength()); + const char *DigitTokBegin = &IntegerBuffer[0]; + bool Invalid = false; + unsigned ActualLength = PP.getSpelling(DigitTok, DigitTokBegin, &Invalid); + if (Invalid) + return true; + + // Verify that we have a simple digit-sequence, and compute the value. This + // is always a simple digit string computed in decimal, so we do this manually + // here. + Val = 0; + for (unsigned i = 0; i != ActualLength; ++i) { + // C++1y [lex.fcon]p1: + // Optional separating single quotes in a digit-sequence are ignored + if (DigitTokBegin[i] == '\'') + continue; + + if (!isDigit(DigitTokBegin[i])) { + PP.Diag(PP.AdvanceToTokenCharacter(DigitTok.getLocation(), i), + diag::err_pp_line_digit_sequence) << IsGNULineDirective; + PP.DiscardUntilEndOfDirective(); + return true; + } + + unsigned NextVal = Val*10+(DigitTokBegin[i]-'0'); + if (NextVal < Val) { // overflow. 
      PP.Diag(DigitTok, DiagID);
      PP.DiscardUntilEndOfDirective();
      return true;
    }
    Val = NextVal;
  }

  // A leading '0' would be an octal literal elsewhere; warn that the value is
  // still interpreted as decimal here.
  if (DigitTokBegin[0] == '0' && Val)
    PP.Diag(DigitTok.getLocation(), diag::warn_pp_line_decimal)
        << IsGNULineDirective;

  return false;
}

/// Handle a \#line directive: C99 6.10.4.
///
/// The two acceptable forms are:
/// \verbatim
///   # line digit-sequence
///   # line digit-sequence "s-char-sequence"
/// \endverbatim
void Preprocessor::HandleLineDirective() {
  // Read the line # and string argument.  Per C99 6.10.4p5, these tokens are
  // expanded.
  Token DigitTok;
  Lex(DigitTok);

  // Validate the number and convert it to an unsigned.
  unsigned LineNo;
  if (GetLineValue(DigitTok, LineNo, diag::err_pp_line_requires_integer,*this))
    return;

  if (LineNo == 0)
    Diag(DigitTok, diag::ext_pp_line_zero);

  // Enforce C99 6.10.4p3: "The digit sequence shall not specify ... a
  // number greater than 2147483647".  C90 requires that the line # be <= 32767.
  unsigned LineLimit = 32768U;
  if (LangOpts.C99 || LangOpts.CPlusPlus11)
    LineLimit = 2147483648U;
  if (LineNo >= LineLimit)
    Diag(DigitTok, diag::ext_pp_line_too_big) << LineLimit;
  else if (LangOpts.CPlusPlus11 && LineNo >= 32768U)
    Diag(DigitTok, diag::warn_cxx98_compat_pp_line_too_big);

  int FilenameID = -1;
  Token StrTok;
  Lex(StrTok);

  // If the StrTok is "eod", then it wasn't present.  Otherwise, it must be a
  // string followed by eod.
  if (StrTok.is(tok::eod))
    ; // ok
  else if (StrTok.isNot(tok::string_literal)) {
    Diag(StrTok, diag::err_pp_line_invalid_filename);
    DiscardUntilEndOfDirective();
    return;
  } else if (StrTok.hasUDSuffix()) {
    Diag(StrTok, diag::err_invalid_string_udl);
    DiscardUntilEndOfDirective();
    return;
  } else {
    // Parse and validate the string, converting it into a unique ID.
    StringLiteralParser Literal(StrTok, *this);
    assert(Literal.isAscii() && "Didn't allow wide strings in");
    if (Literal.hadError) {
      DiscardUntilEndOfDirective();
      return;
    }
    if (Literal.Pascal) {
      Diag(StrTok, diag::err_pp_linemarker_invalid_filename);
      DiscardUntilEndOfDirective();
      return;
    }
    FilenameID = SourceMgr.getLineTableFilenameID(Literal.GetString());

    // Verify that there is nothing after the string, other than EOD.  Because
    // of C99 6.10.4p5, macros that expand to empty tokens are ok.
    CheckEndOfDirective("line", true);
  }

  // Take the file kind of the file containing the #line directive. #line
  // directives are often used for generated sources from the same codebase, so
  // the new file should generally be classified the same way as the current
  // file. This is visible in GCC's pre-processed output, which rewrites #line
  // to GNU line markers.
  SrcMgr::CharacteristicKind FileKind =
      SourceMgr.getFileCharacteristic(DigitTok.getLocation());

  SourceMgr.AddLineNote(DigitTok.getLocation(), LineNo, FilenameID, false,
                        false, FileKind);

  if (Callbacks)
    Callbacks->FileChanged(CurPPLexer->getSourceLocation(),
                           PPCallbacks::RenameFile, FileKind);
}

/// ReadLineMarkerFlags - Parse and validate any flags at the end of a GNU line
/// marker directive.
+static bool ReadLineMarkerFlags(bool &IsFileEntry, bool &IsFileExit,
+                                SrcMgr::CharacteristicKind &FileKind,
+                                Preprocessor &PP) {
+  // Flag grammar enforced below: optional '1' (file entry) or '2' (file
+  // exit), then optional '3' (system header), then optional '4'
+  // (extern "C" system header); each later flag may only follow the
+  // earlier ones.  Returns true on error (after discarding the directive).
+  unsigned FlagVal;
+  Token FlagTok;
+  PP.Lex(FlagTok);
+  if (FlagTok.is(tok::eod)) return false;
+  if (GetLineValue(FlagTok, FlagVal, diag::err_pp_linemarker_invalid_flag, PP))
+    return true;
+
+  if (FlagVal == 1) {
+    IsFileEntry = true;
+
+    PP.Lex(FlagTok);
+    if (FlagTok.is(tok::eod)) return false;
+    if (GetLineValue(FlagTok, FlagVal, diag::err_pp_linemarker_invalid_flag,PP))
+      return true;
+  } else if (FlagVal == 2) {
+    IsFileExit = true;
+
+    SourceManager &SM = PP.getSourceManager();
+    // If we are leaving the current presumed file, check to make sure the
+    // presumed include stack isn't empty!
+    FileID CurFileID =
+      SM.getDecomposedExpansionLoc(FlagTok.getLocation()).first;
+    PresumedLoc PLoc = SM.getPresumedLoc(FlagTok.getLocation());
+    if (PLoc.isInvalid())
+      return true;
+
+    // If there is no include loc (main file) or if the include loc is in a
+    // different physical file, then we aren't in a "1" line marker flag region.
+    SourceLocation IncLoc = PLoc.getIncludeLoc();
+    if (IncLoc.isInvalid() ||
+        SM.getDecomposedExpansionLoc(IncLoc).first != CurFileID) {
+      PP.Diag(FlagTok, diag::err_pp_linemarker_invalid_pop);
+      PP.DiscardUntilEndOfDirective();
+      return true;
+    }
+
+    PP.Lex(FlagTok);
+    if (FlagTok.is(tok::eod)) return false;
+    if (GetLineValue(FlagTok, FlagVal, diag::err_pp_linemarker_invalid_flag,PP))
+      return true;
+  }
+
+  // We must have 3 if there are still flags.
+  if (FlagVal != 3) {
+    PP.Diag(FlagTok, diag::err_pp_linemarker_invalid_flag);
+    PP.DiscardUntilEndOfDirective();
+    return true;
+  }
+
+  FileKind = SrcMgr::C_System;
+
+  PP.Lex(FlagTok);
+  if (FlagTok.is(tok::eod)) return false;
+  if (GetLineValue(FlagTok, FlagVal, diag::err_pp_linemarker_invalid_flag, PP))
+    return true;
+
+  // We must have 4 if there is yet another flag.
+  if (FlagVal != 4) {
+    PP.Diag(FlagTok, diag::err_pp_linemarker_invalid_flag);
+    PP.DiscardUntilEndOfDirective();
+    return true;
+  }
+
+  FileKind = SrcMgr::C_ExternCSystem;
+
+  PP.Lex(FlagTok);
+  if (FlagTok.is(tok::eod)) return false;
+
+  // There are no more valid flags here.
+  PP.Diag(FlagTok, diag::err_pp_linemarker_invalid_flag);
+  PP.DiscardUntilEndOfDirective();
+  return true;
+}
+
+/// HandleDigitDirective - Handle a GNU line marker directive, whose syntax is
+/// one of the following forms:
+///
+///     # 42
+///     # 42 "file" ('1' | '2')?
+///     # 42 "file" ('1' | '2')? '3' '4'?
+///
+void Preprocessor::HandleDigitDirective(Token &DigitTok) {
+  // Validate the number and convert it to an unsigned.  GNU does not have a
+  // line # limit other than it fit in 32-bits.
+  unsigned LineNo;
+  if (GetLineValue(DigitTok, LineNo, diag::err_pp_linemarker_requires_integer,
+                   *this, true))
+    return;
+
+  Token StrTok;
+  Lex(StrTok);
+
+  bool IsFileEntry = false, IsFileExit = false;
+  int FilenameID = -1;
+  SrcMgr::CharacteristicKind FileKind = SrcMgr::C_User;
+
+  // If the StrTok is "eod", then it wasn't present.  Otherwise, it must be a
+  // string followed by eod.
+  if (StrTok.is(tok::eod)) {
+    // Treat this like "#line NN", which doesn't change file characteristics.
+    FileKind = SourceMgr.getFileCharacteristic(DigitTok.getLocation());
+  } else if (StrTok.isNot(tok::string_literal)) {
+    Diag(StrTok, diag::err_pp_linemarker_invalid_filename);
+    DiscardUntilEndOfDirective();
+    return;
+  } else if (StrTok.hasUDSuffix()) {
+    Diag(StrTok, diag::err_invalid_string_udl);
+    DiscardUntilEndOfDirective();
+    return;
+  } else {
+    // Parse and validate the string, converting it into a unique ID.
+    StringLiteralParser Literal(StrTok, *this);
+    assert(Literal.isAscii() && "Didn't allow wide strings in");
+    if (Literal.hadError) {
+      DiscardUntilEndOfDirective();
+      return;
+    }
+    if (Literal.Pascal) {
+      Diag(StrTok, diag::err_pp_linemarker_invalid_filename);
+      DiscardUntilEndOfDirective();
+      return;
+    }
+    FilenameID = SourceMgr.getLineTableFilenameID(Literal.GetString());
+
+    // If a filename was present, read any flags that are present.
+    if (ReadLineMarkerFlags(IsFileEntry, IsFileExit, FileKind, *this))
+      return;
+  }
+
+  // Create a line note with this information.
+  SourceMgr.AddLineNote(DigitTok.getLocation(), LineNo, FilenameID, IsFileEntry,
+                        IsFileExit, FileKind);
+
+  // If the preprocessor has callbacks installed, notify them of the #line
+  // change.  This is used so that the line marker comes out in -E mode for
+  // example.
+  if (Callbacks) {
+    PPCallbacks::FileChangeReason Reason = PPCallbacks::RenameFile;
+    if (IsFileEntry)
+      Reason = PPCallbacks::EnterFile;
+    else if (IsFileExit)
+      Reason = PPCallbacks::ExitFile;
+
+    Callbacks->FileChanged(CurPPLexer->getSourceLocation(), Reason, FileKind);
+  }
+}
+
+/// HandleUserDiagnosticDirective - Handle a #warning or #error directive.
+///
+void Preprocessor::HandleUserDiagnosticDirective(Token &Tok,
+                                                 bool isWarning) {
+  // Read the rest of the line raw.  We do this because we don't want macros
+  // to be expanded and we don't require that the tokens be valid preprocessing
+  // tokens.  For example, this is allowed: "#warning ` 'foo".  GCC does
+  // collapse multiple consecutive white space between tokens, but this isn't
+  // specified by the standard.
+  SmallString<128> Message;
+  CurLexer->ReadToEndOfLine(&Message);
+
+  // Find the first non-whitespace character, so that we can make the
+  // diagnostic more succinct.
+  StringRef Msg = StringRef(Message).ltrim(' ');
+
+  if (isWarning)
+    Diag(Tok, diag::pp_hash_warning) << Msg;
+  else
+    Diag(Tok, diag::err_pp_hash_error) << Msg;
+}
+
+/// HandleIdentSCCSDirective - Handle a #ident/#sccs directive.
+///
+void Preprocessor::HandleIdentSCCSDirective(Token &Tok) {
+  // Yes, this directive is an extension.
+  Diag(Tok, diag::ext_pp_ident_directive);
+
+  // Read the string argument.
+  Token StrTok;
+  Lex(StrTok);
+
+  // If the token kind isn't a string, it's a malformed directive.
+  if (StrTok.isNot(tok::string_literal) &&
+      StrTok.isNot(tok::wide_string_literal)) {
+    Diag(StrTok, diag::err_pp_malformed_ident);
+    if (StrTok.isNot(tok::eod))
+      DiscardUntilEndOfDirective();
+    return;
+  }
+
+  if (StrTok.hasUDSuffix()) {
+    Diag(StrTok, diag::err_invalid_string_udl);
+    DiscardUntilEndOfDirective();
+    return;
+  }
+
+  // Verify that there is nothing after the string, other than EOD.
+  CheckEndOfDirective("ident");
+
+  if (Callbacks) {
+    bool Invalid = false;
+    std::string Str = getSpelling(StrTok, &Invalid);
+    if (!Invalid)
+      Callbacks->Ident(Tok.getLocation(), Str);
+  }
+}
+
+/// Handle a #public directive.
+void Preprocessor::HandleMacroPublicDirective(Token &Tok) {
+  Token MacroNameTok;
+  ReadMacroName(MacroNameTok, MU_Undef);
+
+  // Error reading macro name?  If so, diagnostic already issued.
+  if (MacroNameTok.is(tok::eod))
+    return;
+
+  // Check to see if this is the last token on the #__public_macro line.
+  CheckEndOfDirective("__public_macro");
+
+  IdentifierInfo *II = MacroNameTok.getIdentifierInfo();
+  // Okay, we finally have a valid identifier to undef.
+  MacroDirective *MD = getLocalMacroDirective(II);
+
+  // If the macro is not defined, this is an error.
+  if (!MD) {
+    Diag(MacroNameTok, diag::err_pp_visibility_non_macro) << II;
+    return;
+  }
+
+  // Note that this macro has now been exported.
+  appendMacroDirective(II, AllocateVisibilityMacroDirective(
+                                MacroNameTok.getLocation(), /*isPublic=*/true));
+}
+
+/// Handle a #private directive.
+void Preprocessor::HandleMacroPrivateDirective() {
+  Token MacroNameTok;
+  ReadMacroName(MacroNameTok, MU_Undef);
+
+  // Error reading macro name?  If so, diagnostic already issued.
+  if (MacroNameTok.is(tok::eod))
+    return;
+
+  // Check to see if this is the last token on the #__private_macro line.
+  CheckEndOfDirective("__private_macro");
+
+  IdentifierInfo *II = MacroNameTok.getIdentifierInfo();
+  // Okay, we finally have a valid identifier to undef.
+  MacroDirective *MD = getLocalMacroDirective(II);
+
+  // If the macro is not defined, this is an error.
+  if (!MD) {
+    Diag(MacroNameTok, diag::err_pp_visibility_non_macro) << II;
+    return;
+  }
+
+  // Note that this macro has now been marked private.
+  appendMacroDirective(II, AllocateVisibilityMacroDirective(
+                               MacroNameTok.getLocation(), /*isPublic=*/false));
+}
+
+//===----------------------------------------------------------------------===//
+// Preprocessor Include Directive Handling.
+//===----------------------------------------------------------------------===//
+
+/// GetIncludeFilenameSpelling - Turn the specified lexer token into a fully
+/// checked and spelled filename, e.g. as an operand of \#include. This returns
+/// true if the input filename was in <>'s or false if it were in ""'s. The
+/// caller is expected to provide a buffer that is large enough to hold the
+/// spelling of the filename, but is also expected to handle the case when
+/// this method decides to use a different buffer.
+bool Preprocessor::GetIncludeFilenameSpelling(SourceLocation Loc,
+                                              StringRef &Buffer) {
+  // Get the text form of the filename.
+  assert(!Buffer.empty() && "Can't have tokens with empty spellings!");
+
+  // FIXME: Consider warning on some of the cases described in C11 6.4.7/3 and
+  // C++20 [lex.header]/2:
+  //
+  // If `"`, `'`, `\`, `/*`, or `//` appears in a header-name, then
+  //   in C: behavior is undefined
+  //   in C++: program is conditionally-supported with implementation-defined
+  //           semantics
+
+  // Make sure the filename is <x> or "x".
+  bool isAngled;
+  if (Buffer[0] == '<') {
+    if (Buffer.back() != '>') {
+      Diag(Loc, diag::err_pp_expects_filename);
+      Buffer = StringRef();
+      return true;
+    }
+    isAngled = true;
+  } else if (Buffer[0] == '"') {
+    if (Buffer.back() != '"') {
+      Diag(Loc, diag::err_pp_expects_filename);
+      Buffer = StringRef();
+      return true;
+    }
+    isAngled = false;
+  } else {
+    Diag(Loc, diag::err_pp_expects_filename);
+    Buffer = StringRef();
+    return true;
+  }
+
+  // Diagnose #include "" as invalid.
+  // (size <= 2 means nothing between the delimiters; errors above and here
+  // all signal failure to the caller by clearing Buffer.)
+  if (Buffer.size() <= 2) {
+    Diag(Loc, diag::err_pp_empty_filename);
+    Buffer = StringRef();
+    return true;
+  }
+
+  // Skip the brackets.
+  Buffer = Buffer.substr(1, Buffer.size()-2);
+  return isAngled;
+}
+
+/// Push a token onto the token stream containing an annotation.
+void Preprocessor::EnterAnnotationToken(SourceRange Range,
+                                        tok::TokenKind Kind,
+                                        void *AnnotationVal) {
+  // FIXME: Produce this as the current token directly, rather than
+  // allocating a new token for it.
+  auto Tok = std::make_unique<Token[]>(1);
+  Tok[0].startToken();
+  Tok[0].setKind(Kind);
+  Tok[0].setLocation(Range.getBegin());
+  Tok[0].setAnnotationEndLoc(Range.getEnd());
+  Tok[0].setAnnotationValue(AnnotationVal);
+  EnterTokenStream(std::move(Tok), 1, true, /*IsReinject*/ false);
+}
+
+/// Produce a diagnostic informing the user that a #include or similar
+/// was implicitly treated as a module import.
+static void diagnoseAutoModuleImport(
+    Preprocessor &PP, SourceLocation HashLoc, Token &IncludeTok,
+    ArrayRef<std::pair<IdentifierInfo *, SourceLocation>> Path,
+    SourceLocation PathEnd) {
+  StringRef ImportKeyword;
+  if (PP.getLangOpts().ObjC)
+    ImportKeyword = "@import";
+  else if (PP.getLangOpts().ModulesTS || PP.getLangOpts().CPlusPlusModules)
+    ImportKeyword = "import";
+  else
+    return; // no import syntax available
+
+  // Render the module access path as a dotted name, e.g. "std.vector".
+  SmallString<128> PathString;
+  for (size_t I = 0, N = Path.size(); I != N; ++I) {
+    if (I)
+      PathString += '.';
+    PathString += Path[I].first->getName();
+  }
+  int IncludeKind = 0;
+
+  switch (IncludeTok.getIdentifierInfo()->getPPKeywordID()) {
+  case tok::pp_include:
+    IncludeKind = 0;
+    break;
+
+  case tok::pp_import:
+    IncludeKind = 1;
+    break;
+
+  case tok::pp_include_next:
+    IncludeKind = 2;
+    break;
+
+  case tok::pp___include_macros:
+    IncludeKind = 3;
+    break;
+
+  default:
+    llvm_unreachable("unknown include directive kind");
+  }
+
+  CharSourceRange ReplaceRange(SourceRange(HashLoc, PathEnd),
+                               /*IsTokenRange=*/false);
+  PP.Diag(HashLoc, diag::warn_auto_module_import)
+      << IncludeKind << PathString
+      << FixItHint::CreateReplacement(
+             ReplaceRange, (ImportKeyword + " " + PathString + ";").str());
+}
+
+// Given a vector of path components and a string containing the real
+// path to the file, build a properly-cased replacement in the vector,
+// and return true if the replacement should be suggested.
+static bool trySimplifyPath(SmallVectorImpl<StringRef> &Components,
+                            StringRef RealPathName) {
+  auto RealPathComponentIter = llvm::sys::path::rbegin(RealPathName);
+  auto RealPathComponentEnd = llvm::sys::path::rend(RealPathName);
+  // Cnt counts ".." components seen while scanning right-to-left; each one
+  // cancels out the next ordinary component instead of consuming a real-path
+  // component.
+  int Cnt = 0;
+  bool SuggestReplacement = false;
+  // Below is a best-effort to handle ".." in paths. It is admittedly
+  // not 100% correct in the presence of symlinks.
+  for (auto &Component : llvm::reverse(Components)) {
+    if ("." == Component) {
+    } else if (".." == Component) {
+      ++Cnt;
+    } else if (Cnt) {
+      --Cnt;
+    } else if (RealPathComponentIter != RealPathComponentEnd) {
+      if (Component != *RealPathComponentIter) {
+        // If these path components differ by more than just case, then we
+        // may be looking at symlinked paths. Bail on this diagnostic to avoid
+        // noisy false positives.
+        SuggestReplacement = RealPathComponentIter->equals_lower(Component);
+        if (!SuggestReplacement)
+          break;
+        Component = *RealPathComponentIter;
+      }
+      ++RealPathComponentIter;
+    }
+  }
+  return SuggestReplacement;
+}
+
+bool Preprocessor::checkModuleIsAvailable(const LangOptions &LangOpts,
+                                          const TargetInfo &TargetInfo,
+                                          DiagnosticsEngine &Diags, Module *M) {
+  Module::Requirement Requirement;
+  Module::UnresolvedHeaderDirective MissingHeader;
+  Module *ShadowingModule = nullptr;
+  if (M->isAvailable(LangOpts, TargetInfo, Requirement, MissingHeader,
+                     ShadowingModule))
+    return false;
+
+  // Report the most specific unavailability reason we have: a missing
+  // header, a shadowing module, or an unsatisfied requirement.
+  if (MissingHeader.FileNameLoc.isValid()) {
+    Diags.Report(MissingHeader.FileNameLoc, diag::err_module_header_missing)
+        << MissingHeader.IsUmbrella << MissingHeader.FileName;
+  } else if (ShadowingModule) {
+    Diags.Report(M->DefinitionLoc, diag::err_module_shadowed) << M->Name;
+    Diags.Report(ShadowingModule->DefinitionLoc,
+                 diag::note_previous_definition);
+  } else {
+    // FIXME: Track the location at which the requirement was specified, and
+    // use it here.
+    Diags.Report(M->DefinitionLoc, diag::err_module_unavailable)
+        << M->getFullModuleName() << Requirement.second << Requirement.first;
+  }
+  return true;
+}
+
+/// HandleIncludeDirective - The "\#include" tokens have just been read, read
+/// the file to be included from the lexer, then include it!  This is a common
+/// routine with functionality shared between \#include, \#include_next and
+/// \#import.  LookupFrom is set when this is a \#include_next directive, it
+/// specifies the file to start searching from.
+void Preprocessor::HandleIncludeDirective(SourceLocation HashLoc,
+                                          Token &IncludeTok,
+                                          const DirectoryLookup *LookupFrom,
+                                          const FileEntry *LookupFromFile) {
+  Token FilenameTok;
+  if (LexHeaderName(FilenameTok))
+    return;
+
+  if (FilenameTok.isNot(tok::header_name)) {
+    Diag(FilenameTok.getLocation(), diag::err_pp_expects_filename);
+    if (FilenameTok.isNot(tok::eod))
+      DiscardUntilEndOfDirective();
+    return;
+  }
+
+  // Verify that there is nothing after the filename, other than EOD.  Note
+  // that we allow macros that expand to nothing after the filename, because
+  // this falls into the category of "#include pp-tokens new-line" specified
+  // in C99 6.10.2p4.
+  SourceLocation EndLoc =
+      CheckEndOfDirective(IncludeTok.getIdentifierInfo()->getNameStart(), true);
+
+  // Delegate the real work, then translate the resulting action into
+  // annotation tokens for the parser where needed.
+  auto Action = HandleHeaderIncludeOrImport(HashLoc, IncludeTok, FilenameTok,
+                                            EndLoc, LookupFrom, LookupFromFile);
+  switch (Action.Kind) {
+  case ImportAction::None:
+  case ImportAction::SkippedModuleImport:
+    break;
+  case ImportAction::ModuleBegin:
+    EnterAnnotationToken(SourceRange(HashLoc, EndLoc),
+                         tok::annot_module_begin, Action.ModuleForHeader);
+    break;
+  case ImportAction::ModuleImport:
+    EnterAnnotationToken(SourceRange(HashLoc, EndLoc),
+                         tok::annot_module_include, Action.ModuleForHeader);
+    break;
+  }
+}
+
+Optional<FileEntryRef> Preprocessor::LookupHeaderIncludeOrImport(
+    const DirectoryLookup *&CurDir, StringRef Filename,
+    SourceLocation FilenameLoc, CharSourceRange FilenameRange,
+    const Token &FilenameTok, bool &IsFrameworkFound, bool IsImportDecl,
+    bool &IsMapped, const DirectoryLookup *LookupFrom,
+    const FileEntry *LookupFromFile, StringRef LookupFilename,
+    SmallVectorImpl<char> &RelativePath, SmallVectorImpl<char> &SearchPath,
+    ModuleMap::KnownHeader &SuggestedModule, bool isAngled) {
+  Optional<FileEntryRef> File = LookupFile(
+      FilenameLoc, LookupFilename,
+      isAngled, LookupFrom, LookupFromFile, CurDir,
+      Callbacks ? &SearchPath : nullptr, Callbacks ? &RelativePath : nullptr,
+      &SuggestedModule, &IsMapped, &IsFrameworkFound);
+  if (File)
+    return File;
+
+  // Everything past this point is recovery for a failed lookup: callbacks,
+  // fixit-driven re-lookups, and typo correction; we return None unless one
+  // of the recovery lookups succeeds.
+  if (Callbacks) {
+    // Give the clients a chance to recover.
+    SmallString<128> RecoveryPath;
+    if (Callbacks->FileNotFound(Filename, RecoveryPath)) {
+      if (auto DE = FileMgr.getOptionalDirectoryRef(RecoveryPath)) {
+        // Add the recovery path to the list of search paths.
+        DirectoryLookup DL(*DE, SrcMgr::C_User, false);
+        HeaderInfo.AddSearchPath(DL, isAngled);
+
+        // Try the lookup again, skipping the cache.
+        Optional<FileEntryRef> File = LookupFile(
+            FilenameLoc,
+            LookupFilename, isAngled,
+            LookupFrom, LookupFromFile, CurDir, nullptr, nullptr,
+            &SuggestedModule, &IsMapped, /*IsFrameworkFound=*/nullptr,
+            /*SkipCache*/ true);
+        if (File)
+          return File;
+      }
+    }
+  }
+
+  if (SuppressIncludeNotFoundError)
+    return None;
+
+  // If the file could not be located and it was included via angle
+  // brackets, we can attempt a lookup as though it were a quoted path to
+  // provide the user with a possible fixit.
+  if (isAngled) {
+    Optional<FileEntryRef> File = LookupFile(
+        FilenameLoc, LookupFilename,
+        false, LookupFrom, LookupFromFile, CurDir,
+        Callbacks ? &SearchPath : nullptr, Callbacks ? &RelativePath : nullptr,
+        &SuggestedModule, &IsMapped,
+        /*IsFrameworkFound=*/nullptr);
+    if (File) {
+      Diag(FilenameTok, diag::err_pp_file_not_found_angled_include_not_fatal)
+          << Filename << IsImportDecl
+          << FixItHint::CreateReplacement(FilenameRange,
+                                          "\"" + Filename.str() + "\"");
+      return File;
+    }
+  }
+
+  // Check for likely typos due to leading or trailing non-isAlphanumeric
+  // characters
+  StringRef OriginalFilename = Filename;
+  if (LangOpts.SpellChecking) {
+    // A heuristic to correct a typo file name by removing leading and
+    // trailing non-isAlphanumeric characters.
+    auto CorrectTypoFilename = [](llvm::StringRef Filename) {
+      Filename = Filename.drop_until(isAlphanumeric);
+      while (!Filename.empty() && !isAlphanumeric(Filename.back())) {
+        Filename = Filename.drop_back();
+      }
+      return Filename;
+    };
+    StringRef TypoCorrectionName = CorrectTypoFilename(Filename);
+
+#ifndef _WIN32
+    // Normalize slashes when compiling with -fms-extensions on non-Windows.
+    // This is unnecessary on Windows since the filesystem there handles
+    // backslashes.
+    SmallString<128> NormalizedTypoCorrectionPath;
+    if (LangOpts.MicrosoftExt) {
+      NormalizedTypoCorrectionPath = TypoCorrectionName;
+      llvm::sys::path::native(NormalizedTypoCorrectionPath);
+      TypoCorrectionName = NormalizedTypoCorrectionPath;
+    }
+#endif
+
+    Optional<FileEntryRef> File = LookupFile(
+        FilenameLoc, TypoCorrectionName, isAngled, LookupFrom, LookupFromFile,
+        CurDir, Callbacks ? &SearchPath : nullptr,
+        Callbacks ? &RelativePath : nullptr, &SuggestedModule, &IsMapped,
+        /*IsFrameworkFound=*/nullptr);
+    if (File) {
+      auto Hint =
+          isAngled ? FixItHint::CreateReplacement(
+                         FilenameRange, "<" + TypoCorrectionName.str() + ">")
+                   : FixItHint::CreateReplacement(
+                         FilenameRange, "\"" + TypoCorrectionName.str() + "\"");
+      Diag(FilenameTok, diag::err_pp_file_not_found_typo_not_fatal)
+          << OriginalFilename << TypoCorrectionName << Hint;
+      // We found the file, so set the Filename to the name after typo
+      // correction.
+      Filename = TypoCorrectionName;
+      return File;
+    }
+  }
+
+  // If the file is still not found, just go with the vanilla diagnostic
+  assert(!File.hasValue() && "expected missing file");
+  Diag(FilenameTok, diag::err_pp_file_not_found)
+      << OriginalFilename << FilenameRange;
+  if (IsFrameworkFound) {
+    size_t SlashPos = OriginalFilename.find('/');
+    assert(SlashPos != StringRef::npos &&
+           "Include with framework name should have '/' in the filename");
+    StringRef FrameworkName = OriginalFilename.substr(0, SlashPos);
+    FrameworkCacheEntry &CacheEntry =
+        HeaderInfo.LookupFrameworkCache(FrameworkName);
+    assert(CacheEntry.Directory && "Found framework should be in cache");
+    Diag(FilenameTok, diag::note_pp_framework_without_header)
+        << OriginalFilename.substr(SlashPos + 1) << FrameworkName
+        << CacheEntry.Directory->getName();
+  }
+
+  return None;
+}
+
+/// Handle either a #include-like directive or an import declaration that names
+/// a header file.
+///
+/// \param HashLoc The location of the '#' token for an include, or
+///        SourceLocation() for an import declaration.
+/// \param IncludeTok The include / include_next / import token.
+/// \param FilenameTok The header-name token.
+/// \param EndLoc The location at which any imported macros become visible.
+/// \param LookupFrom For #include_next, the starting directory for the
+///        directory lookup.
+/// \param LookupFromFile For #include_next, the starting file for the
+///        directory lookup.
+Preprocessor::ImportAction Preprocessor::HandleHeaderIncludeOrImport(
+    SourceLocation HashLoc, Token &IncludeTok, Token &FilenameTok,
+    SourceLocation EndLoc, const DirectoryLookup *LookupFrom,
+    const FileEntry *LookupFromFile) {
+  SmallString<128> FilenameBuffer;
+  StringRef Filename = getSpelling(FilenameTok, FilenameBuffer);
+  SourceLocation CharEnd = FilenameTok.getEndLoc();
+
+  CharSourceRange FilenameRange
+    = CharSourceRange::getCharRange(FilenameTok.getLocation(), CharEnd);
+  StringRef OriginalFilename = Filename;
+  bool isAngled =
+      GetIncludeFilenameSpelling(FilenameTok.getLocation(), Filename);
+
+  // If GetIncludeFilenameSpelling set the start ptr to null, there was an
+  // error.
+  if (Filename.empty())
+    return {ImportAction::None};
+
+  // Import declarations pass an invalid HashLoc (there is no '#' token).
+  bool IsImportDecl = HashLoc.isInvalid();
+  SourceLocation StartLoc = IsImportDecl ? IncludeTok.getLocation() : HashLoc;
+
+  // Complain about attempts to #include files in an audit pragma.
+  if (PragmaARCCFCodeAuditedInfo.second.isValid()) {
+    Diag(StartLoc, diag::err_pp_include_in_arc_cf_code_audited) << IsImportDecl;
+    Diag(PragmaARCCFCodeAuditedInfo.second, diag::note_pragma_entered_here);
+
+    // Immediately leave the pragma.
+    PragmaARCCFCodeAuditedInfo = {nullptr, SourceLocation()};
+  }
+
+  // Complain about attempts to #include files in an assume-nonnull pragma.
+  if (PragmaAssumeNonNullLoc.isValid()) {
+    Diag(StartLoc, diag::err_pp_include_in_assume_nonnull) << IsImportDecl;
+    Diag(PragmaAssumeNonNullLoc, diag::note_pragma_entered_here);
+
+    // Immediately leave the pragma.
+    PragmaAssumeNonNullLoc = SourceLocation();
+  }
+
+  if (HeaderInfo.HasIncludeAliasMap()) {
+    // Map the filename with the brackets still attached.  If the name doesn't
+    // map to anything, fall back on the filename we've already gotten the
+    // spelling for.
+    StringRef NewName = HeaderInfo.MapHeaderToIncludeAlias(OriginalFilename);
+    if (!NewName.empty())
+      Filename = NewName;
+  }
+
+  // Search include directories.
+  bool IsMapped = false;
+  bool IsFrameworkFound = false;
+  const DirectoryLookup *CurDir;
+  SmallString<1024> SearchPath;
+  SmallString<1024> RelativePath;
+  // We get the raw path only if we have 'Callbacks' to which we later pass
+  // the path.
+  ModuleMap::KnownHeader SuggestedModule;
+  SourceLocation FilenameLoc = FilenameTok.getLocation();
+  StringRef LookupFilename = Filename;
+
+#ifndef _WIN32
+  // Normalize slashes when compiling with -fms-extensions on non-Windows. This
+  // is unnecessary on Windows since the filesystem there handles backslashes.
+  SmallString<128> NormalizedPath;
+  if (LangOpts.MicrosoftExt) {
+    NormalizedPath = Filename.str();
+    llvm::sys::path::native(NormalizedPath);
+    LookupFilename = NormalizedPath;
+  }
+#endif
+
+  Optional<FileEntryRef> File = LookupHeaderIncludeOrImport(
+      CurDir, Filename, FilenameLoc, FilenameRange, FilenameTok,
+      IsFrameworkFound, IsImportDecl, IsMapped, LookupFrom, LookupFromFile,
+      LookupFilename, RelativePath, SearchPath, SuggestedModule, isAngled);
+
+  if (usingPCHWithThroughHeader() && SkippingUntilPCHThroughHeader) {
+    if (File && isPCHThroughHeader(&File->getFileEntry()))
+      SkippingUntilPCHThroughHeader = false;
+    return {ImportAction::None};
+  }
+
+  // Check for circular inclusion of the main file.
+  // We can't generate a consistent preamble with regard to the conditional
+  // stack if the main file is included again as due to the preamble bounds
+  // some directives (e.g. #endif of a header guard) will never be seen.
+  // Since this will lead to confusing errors, avoid the inclusion.
+  if (File && PreambleConditionalStack.isRecording() &&
+      SourceMgr.translateFile(&File->getFileEntry()) ==
+          SourceMgr.getMainFileID()) {
+    Diag(FilenameTok.getLocation(),
+         diag::err_pp_including_mainfile_in_preamble);
+    return {ImportAction::None};
+  }
+
+  // Should we enter the source file? Set to Skip if either the source file is
+  // known to have no effect beyond its effect on module visibility -- that is,
+  // if it's got an include guard that is already defined, set to Import if it
+  // is a modular header we've already built and should import.
+  enum { Enter, Import, Skip, IncludeLimitReached } Action = Enter;
+
+  if (PPOpts->SingleFileParseMode)
+    Action = IncludeLimitReached;
+
+  // If we've reached the max allowed include depth, it is usually due to an
+  // include cycle. Don't enter already processed files again as it can lead to
+  // reaching the max allowed include depth again.
+  if (Action == Enter && HasReachedMaxIncludeDepth && File &&
+      HeaderInfo.getFileInfo(&File->getFileEntry()).NumIncludes)
+    Action = IncludeLimitReached;
+
+  // Determine whether we should try to import the module for this #include, if
+  // there is one. Don't do so if precompiled module support is disabled or we
+  // are processing this module textually (because we're building the module).
+  if (Action == Enter && File && SuggestedModule && getLangOpts().Modules &&
+      !isForModuleBuilding(SuggestedModule.getModule(),
+                           getLangOpts().CurrentModule,
+                           getLangOpts().ModuleName)) {
+    // If this include corresponds to a module but that module is
+    // unavailable, diagnose the situation and bail out.
+    // FIXME: Remove this; loadModule does the same check (but produces
+    // slightly worse diagnostics).
+    if (checkModuleIsAvailable(getLangOpts(), getTargetInfo(), getDiagnostics(),
+                               SuggestedModule.getModule())) {
+      Diag(FilenameTok.getLocation(),
+           diag::note_implicit_top_level_module_import_here)
+          << SuggestedModule.getModule()->getTopLevelModuleName();
+      return {ImportAction::None};
+    }
+
+    // Compute the module access path corresponding to this module.
+    // FIXME: Should we have a second loadModule() overload to avoid this
+    // extra lookup step?
+    // The path is built leaf-to-root by walking Parent links, then reversed.
+    SmallVector<std::pair<IdentifierInfo *, SourceLocation>, 2> Path;
+    for (Module *Mod = SuggestedModule.getModule(); Mod; Mod = Mod->Parent)
+      Path.push_back(std::make_pair(getIdentifierInfo(Mod->Name),
+                                    FilenameTok.getLocation()));
+    std::reverse(Path.begin(), Path.end());
+
+    // Warn that we're replacing the include/import with a module import.
+    if (!IsImportDecl)
+      diagnoseAutoModuleImport(*this, StartLoc, IncludeTok, Path, CharEnd);
+
+    // Load the module to import its macros. We'll make the declarations
+    // visible when the parser gets here.
+    // FIXME: Pass SuggestedModule in here rather than converting it to a path
+    // and making the module loader convert it back again.
+    ModuleLoadResult Imported = TheModuleLoader.loadModule(
+        IncludeTok.getLocation(), Path, Module::Hidden,
+        /*IsInclusionDirective=*/true);
+    assert((Imported == nullptr || Imported == SuggestedModule.getModule()) &&
+           "the imported module is different than the suggested one");
+
+    if (Imported) {
+      Action = Import;
+    } else if (Imported.isMissingExpected()) {
+      // We failed to find a submodule that we assumed would exist (because it
+      // was in the directory of an umbrella header, for instance), but no
+      // actual module containing it exists (because the umbrella header is
+      // incomplete).  Treat this as a textual inclusion.
+      SuggestedModule = ModuleMap::KnownHeader();
+    } else if (Imported.isConfigMismatch()) {
+      // On a configuration mismatch, enter the header textually. We still know
+      // that it's part of the corresponding module.
+    } else {
+      // We hit an error processing the import. Bail out.
+      if (hadModuleLoaderFatalFailure()) {
+        // With a fatal failure in the module loader, we abort parsing.
+        Token &Result = IncludeTok;
+        assert(CurLexer && "#include but no current lexer set!");
+        Result.startToken();
+        CurLexer->FormTokenWithChars(Result, CurLexer->BufferEnd, tok::eof);
+        CurLexer->cutOffLexing();
+      }
+      return {ImportAction::None};
+    }
+  }
+
+  // The #included file will be considered to be a system header if either it is
+  // in a system include directory, or if the #includer is a system include
+  // header.
+  SrcMgr::CharacteristicKind FileCharacter =
+      SourceMgr.getFileCharacteristic(FilenameTok.getLocation());
+  if (File)
+    FileCharacter = std::max(HeaderInfo.getFileDirFlavor(&File->getFileEntry()),
+                             FileCharacter);
+
+  // If this is a '#import' or an import-declaration, don't re-enter the file.
+  //
+  // FIXME: If we have a suggested module for a '#include', and we've already
+  // visited this file, don't bother entering it again. We know it has no
+  // further effect.
+  bool EnterOnce =
+      IsImportDecl ||
+      IncludeTok.getIdentifierInfo()->getPPKeywordID() == tok::pp_import;
+
+  // Ask HeaderInfo if we should enter this #include file.  If not, #including
+  // this file will have no effect.
+  if (Action == Enter && File &&
+      !HeaderInfo.ShouldEnterIncludeFile(*this, &File->getFileEntry(),
+                                         EnterOnce, getLangOpts().Modules,
+                                         SuggestedModule.getModule())) {
+    // Even if we've already preprocessed this header once and know that we
+    // don't need to see its contents again, we still need to import it if it's
+    // modular because we might not have imported it from this submodule before.
+    //
+    // FIXME: We don't do this when compiling a PCH because the AST
+    // serialization layer can't cope with it. This means we get local
+    // submodule visibility semantics wrong in that case.
+    Action = (SuggestedModule && !getLangOpts().CompilingPCH) ? Import : Skip;
+  }
+
+  if (Callbacks && !IsImportDecl) {
+    // Notify the callback object that we've seen an inclusion directive.
+    // FIXME: Use a different callback for a pp-import?
+    Callbacks->InclusionDirective(
+        HashLoc, IncludeTok, LookupFilename, isAngled, FilenameRange,
+        File ? &File->getFileEntry() : nullptr, SearchPath, RelativePath,
+        Action == Import ? SuggestedModule.getModule() : nullptr,
+        FileCharacter);
+    if (Action == Skip && File)
+      Callbacks->FileSkipped(*File, FilenameTok, FileCharacter);
+  }
+
+  if (!File)
+    return {ImportAction::None};
+
+  // If this is a C++20 pp-import declaration, diagnose if we didn't find any
+  // module corresponding to the named header.
+  if (IsImportDecl && !SuggestedModule) {
+    Diag(FilenameTok, diag::err_header_import_not_header_unit)
+        << OriginalFilename << File->getName();
+    return {ImportAction::None};
+  }
+
+  // Issue a diagnostic if the name of the file on disk has a different case
+  // than the one we're about to open.
+  const bool CheckIncludePathPortability =
+      !IsMapped && !File->getFileEntry().tryGetRealPathName().empty();
+
+  if (CheckIncludePathPortability) {
+    StringRef Name = LookupFilename;
+    StringRef RealPathName = File->getFileEntry().tryGetRealPathName();
+    SmallVector<StringRef, 16> Components(llvm::sys::path::begin(Name),
+                                          llvm::sys::path::end(Name));
+
+    if (trySimplifyPath(Components, RealPathName)) {
+      // Rebuild the spelled path with on-disk casing, preserving the user's
+      // original separators and delimiters.
+      SmallString<128> Path;
+      Path.reserve(Name.size()+2);
+      Path.push_back(isAngled ? '<' : '"');
+      bool isLeadingSeparator = llvm::sys::path::is_absolute(Name);
+      for (auto Component : Components) {
+        if (isLeadingSeparator)
+          isLeadingSeparator = false;
+        else
+          Path.append(Component);
+        // Append the separator the user used, or the close quote
+        Path.push_back(
+          Path.size() <= Filename.size() ? Filename[Path.size()-1] :
+            (isAngled ? '>' : '"'));
+      }
+      // For user files and known standard headers, by default we issue a diagnostic.
+      // For other system headers, we don't. They can be controlled separately.
+      auto DiagId = (FileCharacter == SrcMgr::C_User || warnByDefaultOnWrongCase(Name)) ?
+          diag::pp_nonportable_path : diag::pp_nonportable_system_path;
+      Diag(FilenameTok, DiagId) << Path <<
+        FixItHint::CreateReplacement(FilenameRange, Path);
+    }
+  }
+
+  switch (Action) {
+  case Skip:
+    // If we don't need to enter the file, stop now.
+    if (Module *M = SuggestedModule.getModule())
+      return {ImportAction::SkippedModuleImport, M};
+    return {ImportAction::None};
+
+  case IncludeLimitReached:
+    // If we reached our include limit and don't want to enter any more files,
+    // don't go any further.
+    return {ImportAction::None};
+
+  case Import: {
+    // If this is a module import, make it visible if needed.
+    Module *M = SuggestedModule.getModule();
+    assert(M && "no module to import");
+
+    makeModuleVisible(M, EndLoc);
+
+    if (IncludeTok.getIdentifierInfo()->getPPKeywordID() ==
+        tok::pp___include_macros)
+      return {ImportAction::None};
+
+    return {ImportAction::ModuleImport, M};
+  }
+
+  case Enter:
+    break;
+  }
+
+  // Check that we don't have infinite #include recursion.
+  if (IncludeMacroStack.size() == MaxAllowedIncludeStackDepth-1) {
+    Diag(FilenameTok, diag::err_pp_include_too_deep);
+    HasReachedMaxIncludeDepth = true;
+    return {ImportAction::None};
+  }
+
+  // Look up the file, create a File ID for it.
+  SourceLocation IncludePos = FilenameTok.getLocation();
+  // If the filename string was the result of macro expansions, set the include
+  // position on the file where it will be included and after the expansions.
+  if (IncludePos.isMacroID())
+    IncludePos = SourceMgr.getExpansionRange(IncludePos).getEnd();
+  FileID FID = SourceMgr.createFileID(*File, IncludePos, FileCharacter);
+  assert(FID.isValid() && "Expected valid file ID");
+
+  // If all is good, enter the new file!
+  if (EnterSourceFile(FID, CurDir, FilenameTok.getLocation()))
+    return {ImportAction::None};
+
+  // Determine if we're switching to building a new submodule, and which one.
+  if (auto *M = SuggestedModule.getModule()) {
+    if (M->getTopLevelModule()->ShadowingModule) {
+      // We are building a submodule that belongs to a shadowed module.  This
+      // means we find header files in the shadowed module.
+      Diag(M->DefinitionLoc, diag::err_module_build_shadowed_submodule)
+          << M->getFullModuleName();
+      Diag(M->getTopLevelModule()->ShadowingModule->DefinitionLoc,
+           diag::note_previous_definition);
+      return {ImportAction::None};
+    }
+    // When building a pch, -fmodule-name tells the compiler to textually
+    // include headers in the specified module. We are not building the
+    // specified module.
+    //
+    // FIXME: This is the wrong way to handle this. We should produce a PCH
+    // that behaves the same as the header would behave in a compilation using
+    // that PCH, which means we should enter the submodule. We need to teach
+    // the AST serialization layer to deal with the resulting AST.
+    if (getLangOpts().CompilingPCH &&
+        isForModuleBuilding(M, getLangOpts().CurrentModule,
+                            getLangOpts().ModuleName))
+      return {ImportAction::None};
+
+    assert(!CurLexerSubmodule && "should not have marked this as a module yet");
+    CurLexerSubmodule = M;
+
+    // Let the macro handling code know that any future macros are within
+    // the new submodule.
+    EnterSubmodule(M, EndLoc, /*ForPragma*/false);
+
+    // Let the parser know that any future declarations are within the new
+    // submodule.
+    // FIXME: There's no point doing this if we're handling a #__include_macros
+    // directive.
+    return {ImportAction::ModuleBegin, M};
+  }
+
+  assert(!IsImportDecl && "failed to diagnose missing module for import decl");
+  return {ImportAction::None};
+}
+
+/// HandleIncludeNextDirective - Implements \#include_next.
///
/// \#include_next behaves like \#include except that header search resumes
/// *after* the directory in which the current file was found.
void Preprocessor::HandleIncludeNextDirective(SourceLocation HashLoc,
                                              Token &IncludeNextTok) {
  Diag(IncludeNextTok, diag::ext_pp_include_next_directive);

  // #include_next is like #include, except that we start searching after
  // the current found directory.  If we can't do this, issue a
  // diagnostic.
  const DirectoryLookup *Lookup = CurDirLookup;
  const FileEntry *LookupFromFile = nullptr;
  if (isInPrimaryFile() && LangOpts.IsHeaderFile) {
    // If the main file is a header, then it's either for PCH/AST generation,
    // or libclang opened it. Either way, handle it as a normal include below
    // and do not complain about include_next.
  } else if (isInPrimaryFile()) {
    Lookup = nullptr;
    Diag(IncludeNextTok, diag::pp_include_next_in_primary);
  } else if (CurLexerSubmodule) {
    // Start looking up in the directory *after* the one in which the current
    // file would be found, if any.
    assert(CurPPLexer && "#include_next directive in macro?");
    LookupFromFile = CurPPLexer->getFileEntry();
    Lookup = nullptr;
  } else if (!Lookup) {
    // The current file was not found by walking the include path. Either it
    // is the primary file (handled above), or it was found by absolute path,
    // or it was found relative to such a file.
    // FIXME: Track enough information so we know which case we're in.
    Diag(IncludeNextTok, diag::pp_include_next_absolute_path);
  } else {
    // Start looking up in the next directory.
    ++Lookup;
  }

  // Delegate to the ordinary #include machinery with the adjusted search
  // starting point.
  return HandleIncludeDirective(HashLoc, IncludeNextTok, Lookup,
                                LookupFromFile);
}

/// HandleMicrosoftImportDirective - Implements \#import for Microsoft Mode
void Preprocessor::HandleMicrosoftImportDirective(Token &Tok) {
  // The Microsoft #import directive takes a type library and generates header
  // files from it, and includes those.  This is beyond the scope of what clang
  // does, so we ignore it and error out.  However, #import can optionally have
  // trailing attributes that span multiple lines.  We're going to eat those
  // so we can continue processing from there.
  Diag(Tok, diag::err_pp_import_directive_ms );

  // Read tokens until we get to the end of the directive.  Note that the
  // directive can be split over multiple lines using the backslash character.
  DiscardUntilEndOfDirective();
}

/// HandleImportDirective - Implements \#import.
///
/// In Objective-C, \#import is standard; in MSVC-compatibility mode the
/// Microsoft type-library form is diagnosed instead; otherwise it is accepted
/// as an extension and treated as a plain \#include.
void Preprocessor::HandleImportDirective(SourceLocation HashLoc,
                                         Token &ImportTok) {
  if (!LangOpts.ObjC) {  // #import is standard for ObjC.
    if (LangOpts.MSVCCompat)
      return HandleMicrosoftImportDirective(ImportTok);
    Diag(ImportTok, diag::ext_pp_import_directive);
  }
  return HandleIncludeDirective(HashLoc, ImportTok);
}

/// HandleIncludeMacrosDirective - The -imacros command line option turns into a
/// pseudo directive in the predefines buffer.  This handles it by sucking all
/// tokens through the preprocessor and discarding them (only keeping the side
/// effects on the preprocessor).
void Preprocessor::HandleIncludeMacrosDirective(SourceLocation HashLoc,
                                               Token &IncludeMacrosTok) {
  // This directive should only occur in the predefines buffer.  If not, emit an
  // error and reject it.
  SourceLocation Loc = IncludeMacrosTok.getLocation();
  if (SourceMgr.getBufferName(Loc) != "<built-in>") {
    Diag(IncludeMacrosTok.getLocation(),
         diag::pp_include_macros_out_of_predefines);
    DiscardUntilEndOfDirective();
    return;
  }

  // Treat this as a normal #include for checking purposes.  If this is
  // successful, it will push a new lexer onto the include stack.
  HandleIncludeDirective(HashLoc, IncludeMacrosTok);

  // Drain every token the just-pushed include produces so only its macro
  // side effects remain.
  // NOTE(review): the loop terminates on tok::hashhash, which presumably is a
  // sentinel the predefines-buffer builder places after the -imacros pseudo
  // directive — confirm against the predefines construction code.
  Token TmpTok;
  do {
    Lex(TmpTok);
    assert(TmpTok.isNot(tok::eof) && "Didn't find end of -imacros!");
  } while (TmpTok.isNot(tok::hashhash));
}

//===----------------------------------------------------------------------===//
// Preprocessor Macro Directive Handling.
//===----------------------------------------------------------------------===//

/// ReadMacroParameterList - The ( starting a parameter list of a macro
/// definition has just been read.  Lex the rest of the parameters and the
/// closing ), updating MI with what we learn.  Return true if an error occurs
/// parsing the param list.
bool Preprocessor::ReadMacroParameterList(MacroInfo *MI, Token &Tok) {
  SmallVector<IdentifierInfo*, 32> Parameters;

  // Token-at-a-time loop over the parameter list; each iteration consumes one
  // parameter name and its trailing separator ( ',', ')', or '...' ).
  while (true) {
    LexUnexpandedToken(Tok);
    switch (Tok.getKind()) {
    case tok::r_paren:
      // Found the end of the parameter list.
      if (Parameters.empty())  // #define FOO()
        return false;
      // Otherwise we have #define FOO(A,)
      Diag(Tok, diag::err_pp_expected_ident_in_arg_list);
      return true;
    case tok::ellipsis:  // #define X(... -> C99 varargs
      if (!LangOpts.C99)
        Diag(Tok, LangOpts.CPlusPlus11 ?
             diag::warn_cxx98_compat_variadic_macro :
             diag::ext_variadic_macro);

      // OpenCL v1.2 s6.9.e: variadic macros are not supported.
      if (LangOpts.OpenCL) {
        Diag(Tok, diag::ext_pp_opencl_variadic_macros);
      }

      // Lex the token after the identifier.
      LexUnexpandedToken(Tok);
      if (Tok.isNot(tok::r_paren)) {
        Diag(Tok, diag::err_pp_missing_rparen_in_macro_def);
        return true;
      }
      // Add the __VA_ARGS__ identifier as a parameter.
      Parameters.push_back(Ident__VA_ARGS__);
      MI->setIsC99Varargs();
      MI->setParameterList(Parameters, BP);
      return false;
    case tok::eod:  // #define X(
      Diag(Tok, diag::err_pp_missing_rparen_in_macro_def);
      return true;
    default:
      // Handle keywords and identifiers here to accept things like
      // #define Foo(for) for.
      IdentifierInfo *II = Tok.getIdentifierInfo();
      if (!II) {
        // #define X(1
        Diag(Tok, diag::err_pp_invalid_tok_in_arg_list);
        return true;
      }

      // If this is already used as a parameter, it is used multiple times
      // (e.g. #define X(A,A)).
      if (llvm::find(Parameters, II) != Parameters.end()) {  // C99 6.10.3p6
        Diag(Tok, diag::err_pp_duplicate_name_in_arg_list) << II;
        return true;
      }

      // Add the parameter to the macro info.
      Parameters.push_back(II);

      // Lex the token after the identifier.
      LexUnexpandedToken(Tok);

      switch (Tok.getKind()) {
      default:          // #define X(A B
        Diag(Tok, diag::err_pp_expected_comma_in_arg_list);
        return true;
      case tok::r_paren: // #define X(A)
        MI->setParameterList(Parameters, BP);
        return false;
      case tok::comma:  // #define X(A,
        break;
      case tok::ellipsis:  // #define X(A... -> GCC extension
        // Diagnose extension.
        Diag(Tok, diag::ext_named_variadic_macro);

        // Lex the token after the identifier.
        LexUnexpandedToken(Tok);
        if (Tok.isNot(tok::r_paren)) {
          Diag(Tok, diag::err_pp_missing_rparen_in_macro_def);
          return true;
        }

        MI->setIsGNUVarargs();
        MI->setParameterList(Parameters, BP);
        return false;
      }
    }
  }
}

/// Return true when a macro that shadows a keyword looks like a deliberate
/// "configuration" idiom rather than an accident.  The single-token cases are
/// handled here; the function continues below.
static bool isConfigurationPattern(Token &MacroName, MacroInfo *MI,
                                   const LangOptions &LOptions) {
  if (MI->getNumTokens() == 1) {
    const Token &Value = MI->getReplacementToken(0);

    // Macro that is identity, like '#define inline inline' is a valid pattern.
    if (MacroName.getKind() == Value.getKind())
      return true;

    // Macro that maps a keyword to the same keyword decorated with leading/
    // trailing underscores is a valid pattern:
    //    #define inline __inline
    //    #define inline __inline__
    //    #define inline _inline (in MS compatibility mode)
    StringRef MacroText = MacroName.getIdentifierInfo()->getName();
    if (IdentifierInfo *II = Value.getIdentifierInfo()) {
      if (!II->isKeyword(LOptions))
        return false;
      StringRef ValueText = II->getName();
      StringRef TrimmedValue = ValueText;
      if (!ValueText.startswith("__")) {
        if (ValueText.startswith("_"))
          TrimmedValue = TrimmedValue.drop_front(1);
        else
          return false;
      } else {
        TrimmedValue = TrimmedValue.drop_front(2);
        if (TrimmedValue.endswith("__"))
          TrimmedValue = TrimmedValue.drop_back(2);
      }
      // After stripping the underscore decoration, the replacement must spell
      // the same keyword that is being defined.
      return TrimmedValue.equals(MacroText);
    } else {
      return false;
    }
  }

  // #define inline
  return MacroName.isOneOf(tok::kw_extern, tok::kw_inline, tok::kw_static,
                           tok::kw_const) &&
         MI->getNumTokens() == 0;
}

// ReadOptionalMacroParameterListAndBody - This consumes all (i.e. the
// entire line) of the macro's tokens and adds them to MacroInfo, and while
// doing so performs certain validity checks including (but not limited to):
//   - # (stringization) is followed by a macro parameter
//
//  Returns a nullptr if an invalid sequence of tokens is encountered or returns
//  a pointer to a MacroInfo object.

MacroInfo *Preprocessor::ReadOptionalMacroParameterListAndBody(
    const Token &MacroNameTok, const bool ImmediatelyAfterHeaderGuard) {

  Token LastTok = MacroNameTok;
  // Create the new macro.
  MacroInfo *const MI = AllocateMacroInfo(MacroNameTok.getLocation());

  Token Tok;
  LexUnexpandedToken(Tok);

  // Ensure we consume the rest of the macro body if errors occur.
  // The scope_exit runs on every exit path, including the early 'return
  // nullptr' error returns below.
  auto _ = llvm::make_scope_exit([&]() {
    // The flag indicates if we are still waiting for 'eod'.
    if (CurLexer->ParsingPreprocessorDirective)
      DiscardUntilEndOfDirective();
  });

  // Used to un-poison and then re-poison identifiers of the __VA_ARGS__ ilk
  // within their appropriate context.
  VariadicMacroScopeGuard VariadicMacroScopeGuard(*this);

  // If this is a function-like macro definition, parse the argument list,
  // marking each of the identifiers as being used as macro arguments.  Also,
  // check other constraints on the first token of the macro body.
  if (Tok.is(tok::eod)) {
    if (ImmediatelyAfterHeaderGuard) {
      // Save this macro information since it may be part of a header guard.
      CurPPLexer->MIOpt.SetDefinedMacro(MacroNameTok.getIdentifierInfo(),
                                        MacroNameTok.getLocation());
    }
    // If there is no body to this macro, we have no special handling here.
  } else if (Tok.hasLeadingSpace()) {
    // This is a normal token with leading space.  Clear the leading space
    // marker on the first token to get proper expansion.
    Tok.clearFlag(Token::LeadingSpace);
  } else if (Tok.is(tok::l_paren)) {
    // This is a function-like macro definition.  Read the argument list.
    MI->setIsFunctionLike();
    if (ReadMacroParameterList(MI, LastTok))
      return nullptr;

    // If this is a definition of an ISO C/C++ variadic function-like macro (not
    // using the GNU named varargs extension) inform our variadic scope guard
    // which un-poisons and re-poisons certain identifiers (e.g. __VA_ARGS__)
    // allowed only within the definition of a variadic macro.

    if (MI->isC99Varargs()) {
      VariadicMacroScopeGuard.enterScope();
    }

    // Read the first token after the arg list for down below.
    LexUnexpandedToken(Tok);
  } else if (LangOpts.C99 || LangOpts.CPlusPlus11) {
    // C99 requires whitespace between the macro definition and the body.  Emit
    // a diagnostic for something like "#define X+".
    Diag(Tok, diag::ext_c99_whitespace_required_after_macro_name);
  } else {
    // C90 6.8 TC1 says: "In the definition of an object-like macro, if the
    // first character of a replacement list is not a character required by
    // subclause 5.2.1, then there shall be white-space separation between the
    // identifier and the replacement list.".  5.2.1 lists this set:
    //   "A-Za-z0-9!"#%&'()*+,_./:;<=>?[\]^_{|}~" as well as whitespace, which
    // is irrelevant here.
    bool isInvalid = false;
    if (Tok.is(tok::at)) // @ is not in the list above.
      isInvalid = true;
    else if (Tok.is(tok::unknown)) {
      // If we have an unknown token, it is something strange like "`".  Since
      // all of valid characters would have lexed into a single character
      // token of some sort, we know this is not a valid case.
      isInvalid = true;
    }
    if (isInvalid)
      Diag(Tok, diag::ext_missing_whitespace_after_macro_name);
    else
      Diag(Tok, diag::warn_missing_whitespace_after_macro_name);
  }

  if (!Tok.is(tok::eod))
    LastTok = Tok;

  // Read the rest of the macro body.
  if (MI->isObjectLike()) {
    // Object-like macros are very simple, just read their body.
    while (Tok.isNot(tok::eod)) {
      LastTok = Tok;
      MI->AddTokenToBody(Tok);
      // Get the next token of the macro.
      LexUnexpandedToken(Tok);
    }
  } else {
    // Otherwise, read the body of a function-like macro.  While we are at it,
    // check C99 6.10.3.2p1: ensure that # operators are followed by macro
    // parameters in function-like macro expansions.

    // Tracks the nesting of parentheses after a __VA_OPT__( so we can check
    // paste constraints at its closing ')'.
    VAOptDefinitionContext VAOCtx(*this);

    while (Tok.isNot(tok::eod)) {
      LastTok = Tok;

      if (!Tok.isOneOf(tok::hash, tok::hashat, tok::hashhash)) {
        MI->AddTokenToBody(Tok);

        if (VAOCtx.isVAOptToken(Tok)) {
          // If we're already within a VAOPT, emit an error.
          if (VAOCtx.isInVAOpt()) {
            Diag(Tok, diag::err_pp_vaopt_nested_use);
            return nullptr;
          }
          // Ensure VAOPT is followed by a '(' .
          LexUnexpandedToken(Tok);
          if (Tok.isNot(tok::l_paren)) {
            Diag(Tok, diag::err_pp_missing_lparen_in_vaopt_use);
            return nullptr;
          }
          MI->AddTokenToBody(Tok);
          VAOCtx.sawVAOptFollowedByOpeningParens(Tok.getLocation());
          LexUnexpandedToken(Tok);
          // '##' may not appear immediately after '__VA_OPT__('.
          if (Tok.is(tok::hashhash)) {
            Diag(Tok, diag::err_vaopt_paste_at_start);
            return nullptr;
          }
          continue;
        } else if (VAOCtx.isInVAOpt()) {
          if (Tok.is(tok::r_paren)) {
            if (VAOCtx.sawClosingParen()) {
              // This ')' closes the __VA_OPT__; reject a trailing '##' just
              // before it.
              const unsigned NumTokens = MI->getNumTokens();
              assert(NumTokens >= 3 && "Must have seen at least __VA_OPT__( "
                                       "and a subsequent tok::r_paren");
              if (MI->getReplacementToken(NumTokens - 2).is(tok::hashhash)) {
                Diag(Tok, diag::err_vaopt_paste_at_end);
                return nullptr;
              }
            }
          } else if (Tok.is(tok::l_paren)) {
            VAOCtx.sawOpeningParen(Tok.getLocation());
          }
        }
        // Get the next token of the macro.
        LexUnexpandedToken(Tok);
        continue;
      }

      // If we're in -traditional mode, then we should ignore stringification
      // and token pasting. Mark the tokens as unknown so as not to confuse
      // things.
      if (getLangOpts().TraditionalCPP) {
        Tok.setKind(tok::unknown);
        MI->AddTokenToBody(Tok);

        // Get the next token of the macro.
        LexUnexpandedToken(Tok);
        continue;
      }

      if (Tok.is(tok::hashhash)) {
        // If we see token pasting, check if it looks like the gcc comma
        // pasting extension.  We'll use this information to suppress
        // diagnostics later on.

        // Get the next token of the macro.
        LexUnexpandedToken(Tok);

        if (Tok.is(tok::eod)) {
          MI->AddTokenToBody(LastTok);
          break;
        }

        // ", ## __VA_ARGS__" is the GCC comma-pasting idiom.
        unsigned NumTokens = MI->getNumTokens();
        if (NumTokens && Tok.getIdentifierInfo() == Ident__VA_ARGS__ &&
            MI->getReplacementToken(NumTokens-1).is(tok::comma))
          MI->setHasCommaPasting();

        // Things look ok, add the '##' token to the macro.
        MI->AddTokenToBody(LastTok);
        continue;
      }

      // Our Token is a stringization operator.
      // Get the next token of the macro.
      LexUnexpandedToken(Tok);

      // Check for a valid macro arg identifier or __VA_OPT__.
      if (!VAOCtx.isVAOptToken(Tok) &&
          (Tok.getIdentifierInfo() == nullptr ||
           MI->getParameterNum(Tok.getIdentifierInfo()) == -1)) {

        // If this is assembler-with-cpp mode, we accept random gibberish after
        // the '#' because '#' is often a comment character.  However, change
        // the kind of the token to tok::unknown so that the preprocessor isn't
        // confused.
        if (getLangOpts().AsmPreprocessor && Tok.isNot(tok::eod)) {
          LastTok.setKind(tok::unknown);
          MI->AddTokenToBody(LastTok);
          continue;
        } else {
          Diag(Tok, diag::err_pp_stringize_not_parameter)
            << LastTok.is(tok::hashat);
          return nullptr;
        }
      }

      // Things look ok, add the '#' and param name tokens to the macro.
      MI->AddTokenToBody(LastTok);

      // If the token following '#' is VAOPT, let the next iteration handle it
      // and check it for correctness, otherwise add the token and prime the
      // loop with the next one.
      if (!VAOCtx.isVAOptToken(Tok)) {
        MI->AddTokenToBody(Tok);
        LastTok = Tok;

        // Get the next token of the macro.
        LexUnexpandedToken(Tok);
      }
    }
    // Reaching eod while still inside __VA_OPT__ means its ')' was never seen.
    if (VAOCtx.isInVAOpt()) {
      assert(Tok.is(tok::eod) && "Must be at End Of preprocessing Directive");
      Diag(Tok, diag::err_pp_expected_after)
          << LastTok.getKind() << tok::r_paren;
      Diag(VAOCtx.getUnmatchedOpeningParenLoc(), diag::note_matching) << tok::l_paren;
      return nullptr;
    }
  }
  MI->setDefinitionEndLoc(LastTok.getLocation());
  return MI;
}
/// HandleDefineDirective - Implements \#define.  This consumes the entire macro
/// line then lets the caller lex the next real token.
void Preprocessor::HandleDefineDirective(
    Token &DefineTok, const bool ImmediatelyAfterHeaderGuard) {
  ++NumDefined;

  Token MacroNameTok;
  bool MacroShadowsKeyword;
  ReadMacroName(MacroNameTok, MU_Define, &MacroShadowsKeyword);

  // Error reading macro name?  If so, diagnostic already issued.
  if (MacroNameTok.is(tok::eod))
    return;

  // If we are supposed to keep comments in #defines, reenable comment saving
  // mode.
  if (CurLexer) CurLexer->SetCommentRetentionState(KeepMacroComments);

  MacroInfo *const MI = ReadOptionalMacroParameterListAndBody(
      MacroNameTok, ImmediatelyAfterHeaderGuard);

  // A null MI means the body was malformed; diagnostics were already emitted.
  if (!MI) return;

  if (MacroShadowsKeyword &&
      !isConfigurationPattern(MacroNameTok, MI, getLangOpts())) {
    Diag(MacroNameTok, diag::warn_pp_macro_hides_keyword);
  }
  // Check that there is no paste (##) operator at the beginning or end of the
  // replacement list.
  unsigned NumTokens = MI->getNumTokens();
  if (NumTokens != 0) {
    if (MI->getReplacementToken(0).is(tok::hashhash)) {
      Diag(MI->getReplacementToken(0), diag::err_paste_at_start);
      return;
    }
    if (MI->getReplacementToken(NumTokens-1).is(tok::hashhash)) {
      Diag(MI->getReplacementToken(NumTokens-1), diag::err_paste_at_end);
      return;
    }
  }

  // When skipping just warn about macros that do not match.
  if (SkippingUntilPCHThroughHeader) {
    const MacroInfo *OtherMI = getMacroInfo(MacroNameTok.getIdentifierInfo());
    if (!OtherMI || !MI->isIdenticalTo(*OtherMI, *this,
                             /*Syntactic=*/LangOpts.MicrosoftExt))
      Diag(MI->getDefinitionLoc(), diag::warn_pp_macro_def_mismatch_with_pch)
          << MacroNameTok.getIdentifierInfo();
    return;
  }

  // Finally, if this identifier already had a macro defined for it, verify that
  // the macro bodies are identical, and issue diagnostics if they are not.
  if (const MacroInfo *OtherMI=getMacroInfo(MacroNameTok.getIdentifierInfo())) {
    // In Objective-C, ignore attempts to directly redefine the builtin
    // definitions of the ownership qualifiers.  It's still possible to
    // #undef them.
    auto isObjCProtectedMacro = [](const IdentifierInfo *II) -> bool {
      return II->isStr("__strong") ||
             II->isStr("__weak") ||
             II->isStr("__unsafe_unretained") ||
             II->isStr("__autoreleasing");
    };
   if (getLangOpts().ObjC &&
        SourceMgr.getFileID(OtherMI->getDefinitionLoc())
          == getPredefinesFileID() &&
        isObjCProtectedMacro(MacroNameTok.getIdentifierInfo())) {
      // Warn if it changes the tokens.
      if ((!getDiagnostics().getSuppressSystemWarnings() ||
           !SourceMgr.isInSystemHeader(DefineTok.getLocation())) &&
          !MI->isIdenticalTo(*OtherMI, *this,
                             /*Syntactic=*/LangOpts.MicrosoftExt)) {
        Diag(MI->getDefinitionLoc(), diag::warn_pp_objc_macro_redef_ignored);
      }
      assert(!OtherMI->isWarnIfUnused());
      return;
    }

    // It is very common for system headers to have tons of macro redefinitions
    // and for warnings to be disabled in system headers.  If this is the case,
    // then don't bother calling MacroInfo::isIdenticalTo.
    if (!getDiagnostics().getSuppressSystemWarnings() ||
        !SourceMgr.isInSystemHeader(DefineTok.getLocation())) {
      if (!OtherMI->isUsed() && OtherMI->isWarnIfUnused())
        Diag(OtherMI->getDefinitionLoc(), diag::pp_macro_not_used);

      // Warn if defining "__LINE__" and other builtins, per C99 6.10.8/4 and
      // C++ [cpp.predefined]p4, but allow it as an extension.
      if (OtherMI->isBuiltinMacro())
        Diag(MacroNameTok, diag::ext_pp_redef_builtin_macro);
      // Macros must be identical.  This means all tokens and whitespace
      // separation must be the same.  C99 6.10.3p2.
      else if (!OtherMI->isAllowRedefinitionsWithoutWarning() &&
               !MI->isIdenticalTo(*OtherMI, *this, /*Syntactic=*/LangOpts.MicrosoftExt)) {
        Diag(MI->getDefinitionLoc(), diag::ext_pp_macro_redef)
          << MacroNameTok.getIdentifierInfo();
        Diag(OtherMI->getDefinitionLoc(), diag::note_previous_definition);
      }
    }
    if (OtherMI->isWarnIfUnused())
      WarnUnusedMacroLocs.erase(OtherMI->getDefinitionLoc());
  }

  DefMacroDirective *MD =
      appendDefMacroDirective(MacroNameTok.getIdentifierInfo(), MI);

  assert(!MI->isUsed());
  // If we need warning for not using the macro, add its location in the
  // warn-because-unused-macro set. If it gets used it will be removed from set.
  if (getSourceManager().isInMainFile(MI->getDefinitionLoc()) &&
      !Diags->isIgnored(diag::pp_macro_not_used, MI->getDefinitionLoc()) &&
      !MacroExpansionInDirectivesOverride) {
    MI->setIsWarnIfUnused(true);
    WarnUnusedMacroLocs.insert(MI->getDefinitionLoc());
  }

  // If the callbacks want to know, tell them about the macro definition.
  if (Callbacks)
    Callbacks->MacroDefined(MacroNameTok, MD);
}

/// HandleUndefDirective - Implements \#undef.
///
void Preprocessor::HandleUndefDirective() {
  ++NumUndefined;

  Token MacroNameTok;
  ReadMacroName(MacroNameTok, MU_Undef);

  // Error reading macro name?  If so, diagnostic already issued.
  if (MacroNameTok.is(tok::eod))
    return;

  // Check to see if this is the last token on the #undef line.
  CheckEndOfDirective("undef");

  // Okay, we have a valid identifier to undef.
  auto *II = MacroNameTok.getIdentifierInfo();
  auto MD = getMacroDefinition(II);
  UndefMacroDirective *Undef = nullptr;

  // If the macro is not defined, this is a noop undef.
  if (const MacroInfo *MI = MD.getMacroInfo()) {
    // Emit the unused-macro warning now, before the definition disappears.
    if (!MI->isUsed() && MI->isWarnIfUnused())
      Diag(MI->getDefinitionLoc(), diag::pp_macro_not_used);

    if (MI->isWarnIfUnused())
      WarnUnusedMacroLocs.erase(MI->getDefinitionLoc());

    Undef = AllocateUndefMacroDirective(MacroNameTok.getLocation());
  }

  // If the callbacks want to know, tell them about the macro #undef.
  // Note: no matter if the macro was defined or not.
  if (Callbacks)
    Callbacks->MacroUndefined(MacroNameTok, MD, Undef);

  if (Undef)
    appendMacroDirective(II, Undef);
}

//===----------------------------------------------------------------------===//
// Preprocessor Conditional Directive Handling.
//===----------------------------------------------------------------------===//

/// HandleIfdefDirective - Implements the \#ifdef/\#ifndef directive.  isIfndef
/// is true when this is a \#ifndef directive.  ReadAnyTokensBeforeDirective is
/// true if any tokens have been returned or pp-directives activated before this
/// \#ifndef has been lexed.
///
void Preprocessor::HandleIfdefDirective(Token &Result,
                                        const Token &HashToken,
                                        bool isIfndef,
                                        bool ReadAnyTokensBeforeDirective) {
  ++NumIf;
  Token DirectiveTok = Result;

  Token MacroNameTok;
  ReadMacroName(MacroNameTok);

  // Error reading macro name?  If so, diagnostic already issued.
  if (MacroNameTok.is(tok::eod)) {
    // Skip code until we get to #endif.  This helps with recovery by not
    // emitting an error when the #endif is reached.
    SkipExcludedConditionalBlock(HashToken.getLocation(),
                                 DirectiveTok.getLocation(),
                                 /*Foundnonskip*/ false, /*FoundElse*/ false);
    return;
  }

  // Check to see if this is the last token on the #if[n]def line.
  CheckEndOfDirective(isIfndef ? "ifndef" : "ifdef");

  IdentifierInfo *MII = MacroNameTok.getIdentifierInfo();
  auto MD = getMacroDefinition(MII);
  MacroInfo *MI = MD.getMacroInfo();

  if (CurPPLexer->getConditionalStackDepth() == 0) {
    // If the start of a top-level #ifdef and if the macro is not defined,
    // inform MIOpt that this might be the start of a proper include guard.
    // Otherwise it is some other form of unknown conditional which we can't
    // handle.
    if (!ReadAnyTokensBeforeDirective && !MI) {
      assert(isIfndef && "#ifdef shouldn't reach here");
      CurPPLexer->MIOpt.EnterTopLevelIfndef(MII, MacroNameTok.getLocation());
    } else
      CurPPLexer->MIOpt.EnterTopLevelConditional();
  }

  // If there is a macro, process it.
  if (MI)  // Mark it used.
    markMacroAsUsed(MI);

  if (Callbacks) {
    if (isIfndef)
      Callbacks->Ifndef(DirectiveTok.getLocation(), MacroNameTok, MD);
    else
      Callbacks->Ifdef(DirectiveTok.getLocation(), MacroNameTok, MD);
  }

  bool RetainExcludedCB = PPOpts->RetainExcludedConditionalBlocks &&
    getSourceManager().isInMainFile(DirectiveTok.getLocation());

  // Should we include the stuff contained by this directive?
  if (PPOpts->SingleFileParseMode && !MI) {
    // In 'single-file-parse mode' undefined identifiers trigger parsing of all
    // the directive blocks.
    CurPPLexer->pushConditionalLevel(DirectiveTok.getLocation(),
                                     /*wasskip*/false, /*foundnonskip*/false,
                                     /*foundelse*/false);
  } else if (!MI == isIfndef || RetainExcludedCB) {
    // '!MI == isIfndef' is true exactly when the condition holds: the macro is
    // defined for #ifdef, or not defined for #ifndef.
    // Yes, remember that we are inside a conditional, then lex the next token.
    CurPPLexer->pushConditionalLevel(DirectiveTok.getLocation(),
                                     /*wasskip*/false, /*foundnonskip*/true,
                                     /*foundelse*/false);
  } else {
    // No, skip the contents of this block.
    SkipExcludedConditionalBlock(HashToken.getLocation(),
                                 DirectiveTok.getLocation(),
                                 /*Foundnonskip*/ false,
                                 /*FoundElse*/ false);
  }
}

/// HandleIfDirective - Implements the \#if directive.
///
void Preprocessor::HandleIfDirective(Token &IfToken,
                                     const Token &HashToken,
                                     bool ReadAnyTokensBeforeDirective) {
  ++NumIf;

  // Parse and evaluate the conditional expression.
  // IfNDefMacro is filled in when the expression is of the '!defined(X)' form.
  IdentifierInfo *IfNDefMacro = nullptr;
  const DirectiveEvalResult DER = EvaluateDirectiveExpression(IfNDefMacro);
  const bool ConditionalTrue = DER.Conditional;

  // If this condition is equivalent to #ifndef X, and if this is the first
  // directive seen, handle it for the multiple-include optimization.
  if (CurPPLexer->getConditionalStackDepth() == 0) {
    if (!ReadAnyTokensBeforeDirective && IfNDefMacro && ConditionalTrue)
      // FIXME: Pass in the location of the macro name, not the 'if' token.
      CurPPLexer->MIOpt.EnterTopLevelIfndef(IfNDefMacro, IfToken.getLocation());
    else
      CurPPLexer->MIOpt.EnterTopLevelConditional();
  }

  if (Callbacks)
    Callbacks->If(
        IfToken.getLocation(), DER.ExprRange,
        (ConditionalTrue ? PPCallbacks::CVK_True : PPCallbacks::CVK_False));

  // -fretain-excluded-conditional-blocks keeps even false branches of the
  // main file so tooling can see them.
  bool RetainExcludedCB = PPOpts->RetainExcludedConditionalBlocks &&
    getSourceManager().isInMainFile(IfToken.getLocation());

  // Should we include the stuff contained by this directive?
  if (PPOpts->SingleFileParseMode && DER.IncludedUndefinedIds) {
    // In 'single-file-parse mode' undefined identifiers trigger parsing of all
    // the directive blocks.
    CurPPLexer->pushConditionalLevel(IfToken.getLocation(), /*wasskip*/false,
                                     /*foundnonskip*/false, /*foundelse*/false);
  } else if (ConditionalTrue || RetainExcludedCB) {
    // Yes, remember that we are inside a conditional, then lex the next token.
    CurPPLexer->pushConditionalLevel(IfToken.getLocation(), /*wasskip*/false,
                                   /*foundnonskip*/true, /*foundelse*/false);
  } else {
    // No, skip the contents of this block.
    SkipExcludedConditionalBlock(HashToken.getLocation(), IfToken.getLocation(),
                                 /*Foundnonskip*/ false,
                                 /*FoundElse*/ false);
  }
}

/// HandleEndifDirective - Implements the \#endif directive.
///
void Preprocessor::HandleEndifDirective(Token &EndifToken) {
  ++NumEndif;

  // Check that this is the whole directive.
  CheckEndOfDirective("endif");

  PPConditionalInfo CondInfo;
  if (CurPPLexer->popConditionalLevel(CondInfo)) {
    // No conditionals on the stack: this is an #endif without an #if.
    Diag(EndifToken, diag::err_pp_endif_without_if);
    return;
  }

  // If this the end of a top-level #endif, inform MIOpt.
  if (CurPPLexer->getConditionalStackDepth() == 0)
    CurPPLexer->MIOpt.ExitTopLevelConditional();

  assert(!CondInfo.WasSkipping && !CurPPLexer->LexingRawMode &&
         "This code should only be reachable in the non-skipping case!");

  if (Callbacks)
    Callbacks->Endif(EndifToken.getLocation(), CondInfo.IfLoc);
}

/// HandleElseDirective - Implements the \#else directive.
///
void Preprocessor::HandleElseDirective(Token &Result, const Token &HashToken) {
  ++NumElse;

  // #else directive in a non-skipping conditional... start skipping.
  CheckEndOfDirective("else");

  PPConditionalInfo CI;
  if (CurPPLexer->popConditionalLevel(CI)) {
    Diag(Result, diag::pp_err_else_without_if);
    return;
  }

  // If this is a top-level #else, inform the MIOpt.
  if (CurPPLexer->getConditionalStackDepth() == 0)
    CurPPLexer->MIOpt.EnterTopLevelConditional();

  // If this is a #else with a #else before it, report the error.
  if (CI.FoundElse) Diag(Result, diag::pp_err_else_after_else);

  if (Callbacks)
    Callbacks->Else(Result.getLocation(), CI.IfLoc);

  bool RetainExcludedCB = PPOpts->RetainExcludedConditionalBlocks &&
    getSourceManager().isInMainFile(Result.getLocation());

  if ((PPOpts->SingleFileParseMode && !CI.FoundNonSkip) || RetainExcludedCB) {
    // In 'single-file-parse mode' undefined identifiers trigger parsing of all
    // the directive blocks.
    CurPPLexer->pushConditionalLevel(CI.IfLoc, /*wasskip*/false,
                                     /*foundnonskip*/false, /*foundelse*/true);
    return;
  }

  // Finally, skip the rest of the contents of this block.
  SkipExcludedConditionalBlock(HashToken.getLocation(), CI.IfLoc,
                               /*Foundnonskip*/ true,
                               /*FoundElse*/ true, Result.getLocation());
}

/// HandleElifDirective - Implements the \#elif directive.
///
void Preprocessor::HandleElifDirective(Token &ElifToken,
                                       const Token &HashToken) {
  // Note: #elif is deliberately counted in NumElse, not NumIf.
  ++NumElse;

  // #elif directive in a non-skipping conditional... start skipping.
  // We don't care what the condition is, because we will always skip it (since
  // the block immediately before it was included).
  SourceRange ConditionRange = DiscardUntilEndOfDirective();

  PPConditionalInfo CI;
  if (CurPPLexer->popConditionalLevel(CI)) {
    Diag(ElifToken, diag::pp_err_elif_without_if);
    return;
  }

  // If this is a top-level #elif, inform the MIOpt.
  if (CurPPLexer->getConditionalStackDepth() == 0)
    CurPPLexer->MIOpt.EnterTopLevelConditional();

  // If this is a #elif with a #else before it, report the error.
  if (CI.FoundElse) Diag(ElifToken, diag::pp_err_elif_after_else);

  if (Callbacks)
    Callbacks->Elif(ElifToken.getLocation(), ConditionRange,
                    PPCallbacks::CVK_NotEvaluated, CI.IfLoc);

  bool RetainExcludedCB = PPOpts->RetainExcludedConditionalBlocks &&
    getSourceManager().isInMainFile(ElifToken.getLocation());

  if ((PPOpts->SingleFileParseMode && !CI.FoundNonSkip) || RetainExcludedCB) {
    // In 'single-file-parse mode' undefined identifiers trigger parsing of all
    // the directive blocks.
    CurPPLexer->pushConditionalLevel(ElifToken.getLocation(), /*wasskip*/false,
                                     /*foundnonskip*/false, /*foundelse*/false);
    return;
  }

  // Finally, skip the rest of the contents of this block.
  SkipExcludedConditionalBlock(
      HashToken.getLocation(), CI.IfLoc, /*Foundnonskip*/ true,
      /*FoundElse*/ CI.FoundElse, ElifToken.getLocation());
}
diff --git a/clang/lib/Lex/PPExpressions.cpp b/clang/lib/Lex/PPExpressions.cpp
new file mode 100644
index 000000000000..e5ec2b99f507
--- /dev/null
+++ b/clang/lib/Lex/PPExpressions.cpp
@@ -0,0 +1,899 @@
//===--- PPExpressions.cpp - Preprocessor Expression Evaluation -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the Preprocessor::EvaluateDirectiveExpression method,
// which parses and evaluates integer constant expressions for #if directives.
//
//===----------------------------------------------------------------------===//
//
// FIXME: implement testing for #assert's.
//
//===----------------------------------------------------------------------===//

#include "clang/Lex/Preprocessor.h"
#include "clang/Basic/IdentifierTable.h"
#include "clang/Basic/SourceLocation.h"
#include "clang/Basic/SourceManager.h"
#include "clang/Basic/TargetInfo.h"
#include "clang/Basic/TokenKinds.h"
#include "clang/Lex/CodeCompletionHandler.h"
#include "clang/Lex/LexDiagnostic.h"
#include "clang/Lex/LiteralSupport.h"
#include "clang/Lex/MacroInfo.h"
#include "clang/Lex/PPCallbacks.h"
#include "clang/Lex/Token.h"
#include "llvm/ADT/APSInt.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/SaveAndRestore.h"
#include <cassert>

using namespace clang;

namespace {

/// PPValue - Represents the value of a subexpression of a preprocessor
/// conditional and the source range covered by it.
+class PPValue { + SourceRange Range; + IdentifierInfo *II; + +public: + llvm::APSInt Val; + + // Default ctor - Construct an 'invalid' PPValue. + PPValue(unsigned BitWidth) : Val(BitWidth) {} + + // If this value was produced by directly evaluating an identifier, produce + // that identifier. + IdentifierInfo *getIdentifier() const { return II; } + void setIdentifier(IdentifierInfo *II) { this->II = II; } + + unsigned getBitWidth() const { return Val.getBitWidth(); } + bool isUnsigned() const { return Val.isUnsigned(); } + + SourceRange getRange() const { return Range; } + + void setRange(SourceLocation L) { Range.setBegin(L); Range.setEnd(L); } + void setRange(SourceLocation B, SourceLocation E) { + Range.setBegin(B); Range.setEnd(E); + } + void setBegin(SourceLocation L) { Range.setBegin(L); } + void setEnd(SourceLocation L) { Range.setEnd(L); } +}; + +} // end anonymous namespace + +static bool EvaluateDirectiveSubExpr(PPValue &LHS, unsigned MinPrec, + Token &PeekTok, bool ValueLive, + bool &IncludedUndefinedIds, + Preprocessor &PP); + +/// DefinedTracker - This struct is used while parsing expressions to keep track +/// of whether !defined(X) has been seen. +/// +/// With this simple scheme, we handle the basic forms: +/// !defined(X) and !defined X +/// but we also trivially handle (silly) stuff like: +/// !!!defined(X) and +!defined(X) and !+!+!defined(X) and !(defined(X)). +struct DefinedTracker { + /// Each time a Value is evaluated, it returns information about whether the + /// parsed value is of the form defined(X), !defined(X) or is something else. + enum TrackerState { + DefinedMacro, // defined(X) + NotDefinedMacro, // !defined(X) + Unknown // Something else. + } State; + /// TheMacro - When the state is DefinedMacro or NotDefinedMacro, this + /// indicates the macro that was checked. + IdentifierInfo *TheMacro; + bool IncludedUndefinedIds = false; +}; + +/// EvaluateDefined - Process a 'defined(sym)' expression. 
static bool EvaluateDefined(PPValue &Result, Token &PeekTok, DefinedTracker &DT,
                            bool ValueLive, Preprocessor &PP) {
  SourceLocation beginLoc(PeekTok.getLocation());
  Result.setBegin(beginLoc);

  // Get the next token, don't expand it.
  // (The operand of 'defined' must not be macro-expanded, per [cpp.cond].)
  PP.LexUnexpandedNonComment(PeekTok);

  // Two options, it can either be a pp-identifier or a (.
  SourceLocation LParenLoc;
  if (PeekTok.is(tok::l_paren)) {
    // Found a paren, remember we saw it and skip it.
    LParenLoc = PeekTok.getLocation();
    PP.LexUnexpandedNonComment(PeekTok);
  }

  if (PeekTok.is(tok::code_completion)) {
    if (PP.getCodeCompletionHandler())
      PP.getCodeCompletionHandler()->CodeCompleteMacroName(false);
    PP.setCodeCompletionReached();
    PP.LexUnexpandedNonComment(PeekTok);
  }

  // If we don't have a pp-identifier now, this is an error.
  if (PP.CheckMacroName(PeekTok, MU_Other))
    return true;

  // Otherwise, we got an identifier, is it defined to something?
  IdentifierInfo *II = PeekTok.getIdentifierInfo();
  MacroDefinition Macro = PP.getMacroDefinition(II);
  Result.Val = !!Macro;          // 1 if defined, 0 otherwise.
  Result.Val.setIsUnsigned(false); // Result is signed intmax_t.
  DT.IncludedUndefinedIds = !Macro;

  // If there is a macro, mark it used.
  if (Result.Val != 0 && ValueLive)
    PP.markMacroAsUsed(Macro.getMacroInfo());

  // Save macro token for callback.
  Token macroToken(PeekTok);

  // If we are in parens, ensure we have a trailing ).
  if (LParenLoc.isValid()) {
    // Consume identifier.
    Result.setEnd(PeekTok.getLocation());
    PP.LexUnexpandedNonComment(PeekTok);

    if (PeekTok.isNot(tok::r_paren)) {
      PP.Diag(PeekTok.getLocation(), diag::err_pp_expected_after)
          << "'defined'" << tok::r_paren;
      PP.Diag(LParenLoc, diag::note_matching) << tok::l_paren;
      return true;
    }
    // Consume the ).
    PP.LexNonComment(PeekTok);
    Result.setEnd(PeekTok.getLocation());
  } else {
    // Consume identifier.
    Result.setEnd(PeekTok.getLocation());
    PP.LexNonComment(PeekTok);
  }

  // [cpp.cond]p4:
  //   Prior to evaluation, macro invocations in the list of preprocessing
  //   tokens that will become the controlling constant expression are replaced
  //   (except for those macro names modified by the 'defined' unary operator),
  //   just as in normal text. If the token 'defined' is generated as a result
  //   of this replacement process or use of the 'defined' unary operator does
  //   not match one of the two specified forms prior to macro replacement, the
  //   behavior is undefined.
  // This isn't an idle threat, consider this program:
  //   #define FOO
  //   #define BAR defined(FOO)
  //   #if BAR
  //   ...
  //   #else
  //   ...
  //   #endif
  // clang and gcc will pick the #if branch while Visual Studio will take the
  // #else branch.  Emit a warning about this undefined behavior.
  if (beginLoc.isMacroID()) {
    bool IsFunctionTypeMacro =
        PP.getSourceManager()
            .getSLocEntry(PP.getSourceManager().getFileID(beginLoc))
            .getExpansion()
            .isFunctionMacroExpansion();
    // For object-type macros, it's easy to replace
    //   #define FOO defined(BAR)
    // with
    //   #if defined(BAR)
    //   #define FOO 1
    //   #else
    //   #define FOO 0
    //   #endif
    // and doing so makes sense since compilers handle this differently in
    // practice (see example further up). But for function-type macros,
    // there is no good way to write
    //   # define FOO(x) (defined(M_ ## x) && M_ ## x)
    // in a different way, and compilers seem to agree on how to behave here.
    // So warn by default on object-type macros, but only warn in -pedantic
    // mode on function-type macros.
    if (IsFunctionTypeMacro)
      PP.Diag(beginLoc, diag::warn_defined_in_function_type_macro);
    else
      PP.Diag(beginLoc, diag::warn_defined_in_object_type_macro);
  }

  // Invoke the 'defined' callback.
  if (PPCallbacks *Callbacks = PP.getPPCallbacks()) {
    Callbacks->Defined(macroToken, Macro,
                       SourceRange(beginLoc, PeekTok.getLocation()));
  }

  // Success, remember that we saw defined(X).
  DT.State = DefinedTracker::DefinedMacro;
  DT.TheMacro = II;
  return false;
}

/// EvaluateValue - Evaluate the token PeekTok (and any others needed) and
/// return the computed value in Result.  Return true if there was an error
/// parsing.  This function also returns information about the form of the
/// expression in DT.  See above for information on what DT means.
///
/// If ValueLive is false, then this value is being evaluated in a context where
/// the result is not used.  As such, avoid diagnostics that relate to
/// evaluation.
static bool EvaluateValue(PPValue &Result, Token &PeekTok, DefinedTracker &DT,
                          bool ValueLive, Preprocessor &PP) {
  DT.State = DefinedTracker::Unknown;

  Result.setIdentifier(nullptr);

  if (PeekTok.is(tok::code_completion)) {
    if (PP.getCodeCompletionHandler())
      PP.getCodeCompletionHandler()->CodeCompletePreprocessorExpression();
    PP.setCodeCompletionReached();
    PP.LexNonComment(PeekTok);
  }

  switch (PeekTok.getKind()) {
  default:
    // If this token's spelling is a pp-identifier, check to see if it is
    // 'defined' or if it is a macro.  Note that we check here because many
    // keywords are pp-identifiers, so we can't check the kind.
    if (IdentifierInfo *II = PeekTok.getIdentifierInfo()) {
      // Handle "defined X" and "defined(X)".
      if (II->isStr("defined"))
        return EvaluateDefined(Result, PeekTok, DT, ValueLive, PP);

      if (!II->isCPlusPlusOperatorKeyword()) {
        // If this identifier isn't 'defined' or one of the special
        // preprocessor keywords and it wasn't macro expanded, it turns
        // into a simple 0
        if (ValueLive)
          PP.Diag(PeekTok, diag::warn_pp_undef_identifier) << II;
        Result.Val = 0;
        Result.Val.setIsUnsigned(false); // "0" is signed intmax_t 0.
        Result.setIdentifier(II);
        Result.setRange(PeekTok.getLocation());
        DT.IncludedUndefinedIds = true;
        PP.LexNonComment(PeekTok);
        return false;
      }
    }
    PP.Diag(PeekTok, diag::err_pp_expr_bad_token_start_expr);
    return true;
  case tok::eod:
  case tok::r_paren:
    // If there is no expression, report and exit.
    PP.Diag(PeekTok, diag::err_pp_expected_value_in_expr);
    return true;
  case tok::numeric_constant: {
    SmallString<64> IntegerBuffer;
    bool NumberInvalid = false;
    StringRef Spelling = PP.getSpelling(PeekTok, IntegerBuffer,
                                        &NumberInvalid);
    if (NumberInvalid)
      return true; // a diagnostic was already reported

    NumericLiteralParser Literal(Spelling, PeekTok.getLocation(), PP);
    if (Literal.hadError)
      return true; // a diagnostic was already reported.

    if (Literal.isFloatingLiteral() || Literal.isImaginary) {
      PP.Diag(PeekTok, diag::err_pp_illegal_floating_literal);
      return true;
    }
    assert(Literal.isIntegerLiteral() && "Unknown ppnumber");

    // Complain about, and drop, any ud-suffix.
    if (Literal.hasUDSuffix())
      PP.Diag(PeekTok, diag::err_pp_invalid_udl) << /*integer*/1;

    // 'long long' is a C99 or C++11 feature.
    if (!PP.getLangOpts().C99 && Literal.isLongLong) {
      if (PP.getLangOpts().CPlusPlus)
        PP.Diag(PeekTok,
             PP.getLangOpts().CPlusPlus11 ?
             diag::warn_cxx98_compat_longlong : diag::ext_cxx11_longlong);
      else
        PP.Diag(PeekTok, diag::ext_c99_longlong);
    }

    // Parse the integer literal into Result.
    if (Literal.GetIntegerValue(Result.Val)) {
      // Overflow parsing integer literal.
      if (ValueLive)
        PP.Diag(PeekTok, diag::err_integer_literal_too_large)
            << /* Unsigned */ 1;
      Result.Val.setIsUnsigned(true);
    } else {
      // Set the signedness of the result to match whether there was a U suffix
      // or not.
      Result.Val.setIsUnsigned(Literal.isUnsigned);

      // Detect overflow based on whether the value is signed.  If signed
      // and if the value is too large, emit a warning "integer constant is so
      // large that it is unsigned" e.g. on 12345678901234567890 where intmax_t
      // is 64-bits.
      if (!Literal.isUnsigned && Result.Val.isNegative()) {
        // Octal, hexadecimal, and binary literals are implicitly unsigned if
        // the value does not fit into a signed integer type.
        if (ValueLive && Literal.getRadix() == 10)
          PP.Diag(PeekTok, diag::ext_integer_literal_too_large_for_signed);
        Result.Val.setIsUnsigned(true);
      }
    }

    // Consume the token.
    Result.setRange(PeekTok.getLocation());
    PP.LexNonComment(PeekTok);
    return false;
  }
  case tok::char_constant:          // 'x'
  case tok::wide_char_constant:     // L'x'
  case tok::utf8_char_constant:     // u8'x'
  case tok::utf16_char_constant:    // u'x'
  case tok::utf32_char_constant: {  // U'x'
    // Complain about, and drop, any ud-suffix.
    if (PeekTok.hasUDSuffix())
      PP.Diag(PeekTok, diag::err_pp_invalid_udl) << /*character*/0;

    SmallString<32> CharBuffer;
    bool CharInvalid = false;
    StringRef ThisTok = PP.getSpelling(PeekTok, CharBuffer, &CharInvalid);
    if (CharInvalid)
      return true;

    CharLiteralParser Literal(ThisTok.begin(), ThisTok.end(),
                              PeekTok.getLocation(), PP, PeekTok.getKind());
    if (Literal.hadError())
      return true;  // A diagnostic was already emitted.

    // Character literals are always int or wchar_t, expand to intmax_t.
    const TargetInfo &TI = PP.getTargetInfo();
    unsigned NumBits;
    if (Literal.isMultiChar())
      NumBits = TI.getIntWidth();
    else if (Literal.isWide())
      NumBits = TI.getWCharWidth();
    else if (Literal.isUTF16())
      NumBits = TI.getChar16Width();
    else if (Literal.isUTF32())
      NumBits = TI.getChar32Width();
    else // char or char8_t
      NumBits = TI.getCharWidth();

    // Set the width.
    llvm::APSInt Val(NumBits);
    // Set the value.
    Val = Literal.getValue();
    // Set the signedness. UTF-16 and UTF-32 are always unsigned
    if (Literal.isWide())
      Val.setIsUnsigned(!TargetInfo::isTypeSigned(TI.getWCharType()));
    else if (!Literal.isUTF16() && !Literal.isUTF32())
      Val.setIsUnsigned(!PP.getLangOpts().CharIsSigned);

    if (Result.Val.getBitWidth() > Val.getBitWidth()) {
      Result.Val = Val.extend(Result.Val.getBitWidth());
    } else {
      assert(Result.Val.getBitWidth() == Val.getBitWidth() &&
             "intmax_t smaller than char/wchar_t?");
      Result.Val = Val;
    }

    // Consume the token.
    Result.setRange(PeekTok.getLocation());
    PP.LexNonComment(PeekTok);
    return false;
  }
  case tok::l_paren: {
    SourceLocation Start = PeekTok.getLocation();
    PP.LexNonComment(PeekTok);  // Eat the (.
    // Parse the value and if there are any binary operators involved, parse
    // them.
    if (EvaluateValue(Result, PeekTok, DT, ValueLive, PP)) return true;

    // If this is a silly value like (X), which doesn't need parens, check for
    // !(defined X).
    if (PeekTok.is(tok::r_paren)) {
      // Just use DT unmodified as our result.
    } else {
      // Otherwise, we have something like (x+y), and we consumed '(x'.
      if (EvaluateDirectiveSubExpr(Result, 1, PeekTok, ValueLive,
                                   DT.IncludedUndefinedIds, PP))
        return true;

      if (PeekTok.isNot(tok::r_paren)) {
        PP.Diag(PeekTok.getLocation(), diag::err_pp_expected_rparen)
            << Result.getRange();
        PP.Diag(Start, diag::note_matching) << tok::l_paren;
        return true;
      }
      DT.State = DefinedTracker::Unknown;
    }
    Result.setRange(Start, PeekTok.getLocation());
    Result.setIdentifier(nullptr);
    PP.LexNonComment(PeekTok);  // Eat the ).
    return false;
  }
  case tok::plus: {
    SourceLocation Start = PeekTok.getLocation();
    // Unary plus doesn't modify the value.
    PP.LexNonComment(PeekTok);
    if (EvaluateValue(Result, PeekTok, DT, ValueLive, PP)) return true;
    Result.setBegin(Start);
    Result.setIdentifier(nullptr);
    return false;
  }
  case tok::minus: {
    SourceLocation Loc = PeekTok.getLocation();
    PP.LexNonComment(PeekTok);
    if (EvaluateValue(Result, PeekTok, DT, ValueLive, PP)) return true;
    Result.setBegin(Loc);
    Result.setIdentifier(nullptr);

    // C99 6.5.3.3p3: The sign of the result matches the sign of the operand.
    Result.Val = -Result.Val;

    // -MININT is the only thing that overflows.  Unsigned never overflows.
    bool Overflow = !Result.isUnsigned() && Result.Val.isMinSignedValue();

    // If this operator is live and overflowed, report the issue.
    if (Overflow && ValueLive)
      PP.Diag(Loc, diag::warn_pp_expr_overflow) << Result.getRange();

    DT.State = DefinedTracker::Unknown;
    return false;
  }

  case tok::tilde: {
    SourceLocation Start = PeekTok.getLocation();
    PP.LexNonComment(PeekTok);
    if (EvaluateValue(Result, PeekTok, DT, ValueLive, PP)) return true;
    Result.setBegin(Start);
    Result.setIdentifier(nullptr);

    // C99 6.5.3.3p4: The sign of the result matches the sign of the operand.
    Result.Val = ~Result.Val;
    DT.State = DefinedTracker::Unknown;
    return false;
  }

  case tok::exclaim: {
    SourceLocation Start = PeekTok.getLocation();
    PP.LexNonComment(PeekTok);
    if (EvaluateValue(Result, PeekTok, DT, ValueLive, PP)) return true;
    Result.setBegin(Start);
    Result.Val = !Result.Val;
    // C99 6.5.3.3p5: The sign of the result is 'int', aka it is signed.
    Result.Val.setIsUnsigned(false);
    Result.setIdentifier(nullptr);

    // '!' flips the defined/not-defined tracking so !defined(X) is recognized.
    if (DT.State == DefinedTracker::DefinedMacro)
      DT.State = DefinedTracker::NotDefinedMacro;
    else if (DT.State == DefinedTracker::NotDefinedMacro)
      DT.State = DefinedTracker::DefinedMacro;
    return false;
  }
  case tok::kw_true:
  case tok::kw_false:
    Result.Val = PeekTok.getKind() == tok::kw_true;
    Result.Val.setIsUnsigned(false); // "0" is signed intmax_t 0.
    Result.setIdentifier(PeekTok.getIdentifierInfo());
    Result.setRange(PeekTok.getLocation());
    PP.LexNonComment(PeekTok);
    return false;

  // FIXME: Handle #assert
  }
}

/// getPrecedence - Return the precedence of the specified binary operator
/// token.  This returns:
///   ~0 - Invalid token.
///   14 ->  3 - various operators.
///    0 - 'eod' or ')'
static unsigned getPrecedence(tok::TokenKind Kind) {
  switch (Kind) {
  default: return ~0U;
  case tok::percent:
  case tok::slash:
  case tok::star:             return 14;
  case tok::plus:
  case tok::minus:            return 13;
  case tok::lessless:
  case tok::greatergreater:   return 12;
  case tok::lessequal:
  case tok::less:
  case tok::greaterequal:
  case tok::greater:          return 11;
  case tok::exclaimequal:
  case tok::equalequal:       return 10;
  case tok::amp:              return 9;
  case tok::caret:            return 8;
  case tok::pipe:             return 7;
  case tok::ampamp:           return 6;
  case tok::pipepipe:         return 5;
  case tok::question:         return 4;
  case tok::comma:            return 3;
  case tok::colon:            return 2;
  case tok::r_paren:          return 0;// Lowest priority, end of expr.
  case tok::eod:              return 0;// Lowest priority, end of directive.
  }
}

// Report the most helpful diagnostic for a token that cannot begin/continue a
// binary-operator position: an identifier followed by '(' looks like a call of
// an undefined function-like macro, everything else is just a bad binop token.
static void diagnoseUnexpectedOperator(Preprocessor &PP, PPValue &LHS,
                                       Token &Tok) {
  if (Tok.is(tok::l_paren) && LHS.getIdentifier())
    PP.Diag(LHS.getRange().getBegin(), diag::err_pp_expr_bad_token_lparen)
        << LHS.getIdentifier();
  else
    PP.Diag(Tok.getLocation(), diag::err_pp_expr_bad_token_binop)
        << LHS.getRange();
}

/// EvaluateDirectiveSubExpr - Evaluate the subexpression whose first token is
/// PeekTok, and whose precedence is PeekPrec.  This returns the result in LHS.
///
/// This is a classic precedence-climbing parser: it keeps consuming operators
/// at or above MinPrec, recursing for right-hand sides of higher precedence.
///
/// If ValueLive is false, then this value is being evaluated in a context where
/// the result is not used.  As such, avoid diagnostics that relate to
/// evaluation, such as division by zero warnings.
static bool EvaluateDirectiveSubExpr(PPValue &LHS, unsigned MinPrec,
                                     Token &PeekTok, bool ValueLive,
                                     bool &IncludedUndefinedIds,
                                     Preprocessor &PP) {
  unsigned PeekPrec = getPrecedence(PeekTok.getKind());
  // If this token isn't valid, report the error.
  if (PeekPrec == ~0U) {
    diagnoseUnexpectedOperator(PP, LHS, PeekTok);
    return true;
  }

  while (true) {
    // If this token has a lower precedence than we are allowed to parse, return
    // it so that higher levels of the recursion can parse it.
    if (PeekPrec < MinPrec)
      return false;

    tok::TokenKind Operator = PeekTok.getKind();

    // If this is a short-circuiting operator, see if the RHS of the operator is
    // dead.  Note that this cannot just clobber ValueLive.  Consider
    // "0 && 1 ? 4 : 1 / 0", which is parsed as "(0 && 1) ? 4 : (1 / 0)".  In
    // this example, the RHS of the && being dead does not make the rest of the
    // expr dead.
    bool RHSIsLive;
    if (Operator == tok::ampamp && LHS.Val == 0)
      RHSIsLive = false;   // RHS of "0 && x" is dead.
    else if (Operator == tok::pipepipe && LHS.Val != 0)
      RHSIsLive = false;   // RHS of "1 || x" is dead.
    else if (Operator == tok::question && LHS.Val == 0)
      RHSIsLive = false;   // RHS (x) of "0 ? x : y" is dead.
    else
      RHSIsLive = ValueLive;

    // Consume the operator, remembering the operator's location for reporting.
    SourceLocation OpLoc = PeekTok.getLocation();
    PP.LexNonComment(PeekTok);

    PPValue RHS(LHS.getBitWidth());
    // Parse the RHS of the operator.
    DefinedTracker DT;
    if (EvaluateValue(RHS, PeekTok, DT, RHSIsLive, PP)) return true;
    // Propagate undefined-identifier information from the RHS to the caller.
    IncludedUndefinedIds = DT.IncludedUndefinedIds;

    // Remember the precedence of this operator and get the precedence of the
    // operator immediately to the right of the RHS.
    unsigned ThisPrec = PeekPrec;
    PeekPrec = getPrecedence(PeekTok.getKind());

    // If this token isn't valid, report the error.
    if (PeekPrec == ~0U) {
      diagnoseUnexpectedOperator(PP, RHS, PeekTok);
      return true;
    }

    // Decide whether to include the next binop in this subexpression.  For
    // example, when parsing x+y*z and looking at '*', we want to recursively
    // handle y*z as a single subexpression.  We do this because the precedence
    // of * is higher than that of +.  The only strange case we have to handle
    // here is for the ?: operator, where the precedence is actually lower than
    // the LHS of the '?'.  The grammar rule is:
    //
    // conditional-expression ::=
    //    logical-OR-expression ? expression : conditional-expression
    // where 'expression' is actually comma-expression.
    unsigned RHSPrec;
    if (Operator == tok::question)
      // The RHS of "?" should be maximally consumed as an expression.
      RHSPrec = getPrecedence(tok::comma);
    else  // All others should munch while higher precedence.
      RHSPrec = ThisPrec+1;

    if (PeekPrec >= RHSPrec) {
      if (EvaluateDirectiveSubExpr(RHS, RHSPrec, PeekTok, RHSIsLive,
                                   IncludedUndefinedIds, PP))
        return true;
      PeekPrec = getPrecedence(PeekTok.getKind());
    }
    assert(PeekPrec <= ThisPrec && "Recursion didn't work!");

    // Usual arithmetic conversions (C99 6.3.1.8p1): result is unsigned if
    // either operand is unsigned.
    llvm::APSInt Res(LHS.getBitWidth());
    switch (Operator) {
    case tok::question:       // No UAC for x and y in "x ? y : z".
    case tok::lessless:       // Shift amount doesn't UAC with shift value.
    case tok::greatergreater: // Shift amount doesn't UAC with shift value.
    case tok::comma:          // Comma operands are not subject to UACs.
    case tok::pipepipe:       // Logical || does not do UACs.
    case tok::ampamp:         // Logical && does not do UACs.
      break;                  // No UAC
    default:
      Res.setIsUnsigned(LHS.isUnsigned()|RHS.isUnsigned());
      // If this just promoted something from signed to unsigned, and if the
      // value was negative, warn about it.
      if (ValueLive && Res.isUnsigned()) {
        if (!LHS.isUnsigned() && LHS.Val.isNegative())
          PP.Diag(OpLoc, diag::warn_pp_convert_to_positive) << 0
            << LHS.Val.toString(10, true) + " to " +
               LHS.Val.toString(10, false)
            << LHS.getRange() << RHS.getRange();
        if (!RHS.isUnsigned() && RHS.Val.isNegative())
          PP.Diag(OpLoc, diag::warn_pp_convert_to_positive) << 1
            << RHS.Val.toString(10, true) + " to " +
               RHS.Val.toString(10, false)
            << LHS.getRange() << RHS.getRange();
      }
      LHS.Val.setIsUnsigned(Res.isUnsigned());
      RHS.Val.setIsUnsigned(Res.isUnsigned());
    }

    bool Overflow = false;
    switch (Operator) {
    default: llvm_unreachable("Unknown operator token!");
    case tok::percent:
      if (RHS.Val != 0)
        Res = LHS.Val % RHS.Val;
      else if (ValueLive) {
        PP.Diag(OpLoc, diag::err_pp_remainder_by_zero)
          << LHS.getRange() << RHS.getRange();
        return true;
      }
      break;
    case tok::slash:
      if (RHS.Val != 0) {
        if (LHS.Val.isSigned())
          // Signed division uses the overflow-checked form (INT_MIN / -1).
          Res = llvm::APSInt(LHS.Val.sdiv_ov(RHS.Val, Overflow), false);
        else
          Res = LHS.Val / RHS.Val;
      } else if (ValueLive) {
        PP.Diag(OpLoc, diag::err_pp_division_by_zero)
          << LHS.getRange() << RHS.getRange();
        return true;
      }
      break;

    case tok::star:
      if (Res.isSigned())
        Res = llvm::APSInt(LHS.Val.smul_ov(RHS.Val, Overflow), false);
      else
        Res = LHS.Val * RHS.Val;
      break;
    case tok::lessless: {
      // Determine whether overflow is about to happen.
      if (LHS.isUnsigned())
        Res = LHS.Val.ushl_ov(RHS.Val, Overflow);
      else
        Res = llvm::APSInt(LHS.Val.sshl_ov(RHS.Val, Overflow), false);
      break;
    }
    case tok::greatergreater: {
      // Determine whether overflow is about to happen.
      unsigned ShAmt = static_cast<unsigned>(RHS.Val.getLimitedValue());
      if (ShAmt >= LHS.getBitWidth()) {
        Overflow = true;
        ShAmt = LHS.getBitWidth()-1;
      }
      Res = LHS.Val >> ShAmt;
      break;
    }
    case tok::plus:
      if (LHS.isUnsigned())
        Res = LHS.Val + RHS.Val;
      else
        Res = llvm::APSInt(LHS.Val.sadd_ov(RHS.Val, Overflow), false);
      break;
    case tok::minus:
      if (LHS.isUnsigned())
        Res = LHS.Val - RHS.Val;
      else
        Res = llvm::APSInt(LHS.Val.ssub_ov(RHS.Val, Overflow), false);
      break;
    case tok::lessequal:
      Res = LHS.Val <= RHS.Val;
      Res.setIsUnsigned(false);  // C99 6.5.8p6, result is always int (signed)
      break;
    case tok::less:
      Res = LHS.Val < RHS.Val;
      Res.setIsUnsigned(false);  // C99 6.5.8p6, result is always int (signed)
      break;
    case tok::greaterequal:
      Res = LHS.Val >= RHS.Val;
      Res.setIsUnsigned(false);  // C99 6.5.8p6, result is always int (signed)
      break;
    case tok::greater:
      Res = LHS.Val > RHS.Val;
      Res.setIsUnsigned(false);  // C99 6.5.8p6, result is always int (signed)
      break;
    case tok::exclaimequal:
      Res = LHS.Val != RHS.Val;
      Res.setIsUnsigned(false);  // C99 6.5.9p3, result is always int (signed)
      break;
    case tok::equalequal:
      Res = LHS.Val == RHS.Val;
      Res.setIsUnsigned(false);  // C99 6.5.9p3, result is always int (signed)
      break;
    case tok::amp:
      Res = LHS.Val & RHS.Val;
      break;
    case tok::caret:
      Res = LHS.Val ^ RHS.Val;
      break;
    case tok::pipe:
      Res = LHS.Val | RHS.Val;
      break;
    case tok::ampamp:
      Res = (LHS.Val != 0 && RHS.Val != 0);
      Res.setIsUnsigned(false);  // C99 6.5.13p3, result is always int (signed)
      break;
    case tok::pipepipe:
      Res = (LHS.Val != 0 || RHS.Val != 0);
      Res.setIsUnsigned(false);  // C99 6.5.14p3, result is always int (signed)
      break;
    case tok::comma:
      // Comma is invalid in pp expressions in c89/c++ mode, but is valid in C99
      // if not being evaluated.
      if (!PP.getLangOpts().C99 || ValueLive)
        PP.Diag(OpLoc, diag::ext_pp_comma_expr)
          << LHS.getRange() << RHS.getRange();
      Res = RHS.Val; // LHS = LHS,RHS -> RHS.
      break;
    case tok::question: {
      // Parse the : part of the expression.
      if (PeekTok.isNot(tok::colon)) {
        PP.Diag(PeekTok.getLocation(), diag::err_expected)
            << tok::colon << LHS.getRange() << RHS.getRange();
        PP.Diag(OpLoc, diag::note_matching) << tok::question;
        return true;
      }
      // Consume the :.
      PP.LexNonComment(PeekTok);

      // Evaluate the value after the :.
      bool AfterColonLive = ValueLive && LHS.Val == 0;
      PPValue AfterColonVal(LHS.getBitWidth());
      DefinedTracker DT;
      if (EvaluateValue(AfterColonVal, PeekTok, DT, AfterColonLive, PP))
        return true;

      // Parse anything after the : with the same precedence as ?.  We allow
      // things of equal precedence because ?: is right associative.
      if (EvaluateDirectiveSubExpr(AfterColonVal, ThisPrec,
                                   PeekTok, AfterColonLive,
                                   IncludedUndefinedIds, PP))
        return true;

      // Now that we have the condition, the LHS and the RHS of the :, evaluate.
      Res = LHS.Val != 0 ? RHS.Val : AfterColonVal.Val;
      RHS.setEnd(AfterColonVal.getRange().getEnd());

      // Usual arithmetic conversions (C99 6.3.1.8p1): result is unsigned if
      // either operand is unsigned.
      Res.setIsUnsigned(RHS.isUnsigned() | AfterColonVal.isUnsigned());

      // Figure out the precedence of the token after the : part.
      PeekPrec = getPrecedence(PeekTok.getKind());
      break;
    }
    case tok::colon:
      // Don't allow :'s to float around without being part of ?: exprs.
      PP.Diag(OpLoc, diag::err_pp_colon_without_question)
        << LHS.getRange() << RHS.getRange();
      return true;
    }

    // If this operator is live and overflowed, report the issue.
    if (Overflow && ValueLive)
      PP.Diag(OpLoc, diag::warn_pp_expr_overflow)
        << LHS.getRange() << RHS.getRange();

    // Put the result back into 'LHS' for our next iteration.
    LHS.Val = Res;
    LHS.setEnd(RHS.getRange().getEnd());
    RHS.setIdentifier(nullptr);
  }
}

/// EvaluateDirectiveExpression - Evaluate an integer constant expression that
/// may occur after a #if or #elif directive.  If the expression is equivalent
/// to "!defined(X)" return X in IfNDefMacro.
Preprocessor::DirectiveEvalResult
Preprocessor::EvaluateDirectiveExpression(IdentifierInfo *&IfNDefMacro) {
  SaveAndRestore<bool> PPDir(ParsingIfOrElifDirective, true);
  // Save the current state of 'DisableMacroExpansion' and reset it to false. If
  // 'DisableMacroExpansion' is true, then we must be in a macro argument list
  // in which case a directive is undefined behavior.  We want macros to be able
  // to recursively expand in order to get more gcc-list behavior, so we force
  // DisableMacroExpansion to false and restore it when we're done parsing the
  // expression.
  bool DisableMacroExpansionAtStartOfDirective = DisableMacroExpansion;
  DisableMacroExpansion = false;

  // Peek ahead one token.
  Token Tok;
  LexNonComment(Tok);

  // C99 6.10.1p3 - All expressions are evaluated as intmax_t or uintmax_t.
  unsigned BitWidth = getTargetInfo().getIntMaxTWidth();

  PPValue ResVal(BitWidth);
  DefinedTracker DT;
  SourceLocation ExprStartLoc = SourceMgr.getExpansionLoc(Tok.getLocation());
  if (EvaluateValue(ResVal, Tok, DT, true, *this)) {
    // Parse error, skip the rest of the macro line.
    SourceRange ConditionRange = ExprStartLoc;
    if (Tok.isNot(tok::eod))
      ConditionRange = DiscardUntilEndOfDirective();

    // Restore 'DisableMacroExpansion'.
    DisableMacroExpansion = DisableMacroExpansionAtStartOfDirective;

    // We cannot trust the source range from the value because there was a
    // parse error. Track the range manually -- the end of the directive is the
    // end of the condition range.
    return {false,
            DT.IncludedUndefinedIds,
            {ExprStartLoc, ConditionRange.getEnd()}};
  }

  // If we are at the end of the expression after just parsing a value, there
  // must be no (unparenthesized) binary operators involved, so we can exit
  // directly.
  if (Tok.is(tok::eod)) {
    // If the expression we parsed was of the form !defined(macro), return the
    // macro in IfNDefMacro.
    if (DT.State == DefinedTracker::NotDefinedMacro)
      IfNDefMacro = DT.TheMacro;

    // Restore 'DisableMacroExpansion'.
    DisableMacroExpansion = DisableMacroExpansionAtStartOfDirective;
    return {ResVal.Val != 0, DT.IncludedUndefinedIds, ResVal.getRange()};
  }

  // Otherwise, we must have a binary operator (e.g. "#if 1 < 2"), so parse the
  // operator and the stuff after it.
  if (EvaluateDirectiveSubExpr(ResVal, getPrecedence(tok::question),
                               Tok, true, DT.IncludedUndefinedIds, *this)) {
    // Parse error, skip the rest of the macro line.
    if (Tok.isNot(tok::eod))
      DiscardUntilEndOfDirective();

    // Restore 'DisableMacroExpansion'.
    DisableMacroExpansion = DisableMacroExpansionAtStartOfDirective;
    return {false, DT.IncludedUndefinedIds, ResVal.getRange()};
  }

  // If we aren't at the tok::eod token, something bad happened, like an extra
  // ')' token.
  if (Tok.isNot(tok::eod)) {
    Diag(Tok, diag::err_pp_expected_eol);
    DiscardUntilEndOfDirective();
  }

  // Restore 'DisableMacroExpansion'.
  DisableMacroExpansion = DisableMacroExpansionAtStartOfDirective;
  return {ResVal.Val != 0, DT.IncludedUndefinedIds, ResVal.getRange()};
}
diff --git a/clang/lib/Lex/PPLexerChange.cpp b/clang/lib/Lex/PPLexerChange.cpp
new file mode 100644
index 000000000000..802172693960
--- /dev/null
+++ b/clang/lib/Lex/PPLexerChange.cpp
@@ -0,0 +1,834 @@
//===--- PPLexerChange.cpp - Handle changing lexers in the preprocessor ---===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements pieces of the Preprocessor interface that manage the
// current lexer stack.
//
//===----------------------------------------------------------------------===//

#include "clang/Lex/Preprocessor.h"
#include "clang/Lex/PreprocessorOptions.h"
#include "clang/Basic/FileManager.h"
#include "clang/Basic/SourceManager.h"
#include "clang/Lex/HeaderSearch.h"
#include "clang/Lex/LexDiagnostic.h"
#include "clang/Lex/MacroInfo.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Path.h"
using namespace clang;

PPCallbacks::~PPCallbacks() {}

//===----------------------------------------------------------------------===//
// Miscellaneous Methods.
//===----------------------------------------------------------------------===//

/// isInPrimaryFile - Return true if we're in the top-level file, not in a
/// \#include.  This looks through macro expansions and active _Pragma lexers.
bool Preprocessor::isInPrimaryFile() const {
  if (IsFileLexer())
    return IncludeMacroStack.empty();

  // If there are any stacked lexers, we're in a #include.
+  assert(IsFileLexer(IncludeMacroStack[0]) &&
+         "Top level include stack isn't our primary lexer?");
+  // We are still in the primary file iff every entry stacked above the
+  // bottom-most lexer is a macro or _Pragma expansion, i.e. no other file
+  // lexer appears anywhere on the include stack.
+  return std::none_of(
+      IncludeMacroStack.begin() + 1, IncludeMacroStack.end(),
+      [&](const IncludeStackInfo &ISI) -> bool { return IsFileLexer(ISI); });
+}
+
+/// getCurrentFileLexer - Return the current file lexer being lexed from.  Note
+/// that this ignores any potentially active macro expansions and _Pragma
+/// expansions going on at the time.
+PreprocessorLexer *Preprocessor::getCurrentFileLexer() const {
+  if (IsFileLexer())
+    return CurPPLexer;
+
+  // Look for a stacked lexer.  Walk innermost-to-outermost so we return the
+  // file lexer closest to the current expansion.
+  for (const IncludeStackInfo &ISI : llvm::reverse(IncludeMacroStack)) {
+    if (IsFileLexer(ISI))
+      return ISI.ThePPLexer;
+  }
+  // No file lexer anywhere on the stack.
+  return nullptr;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Methods for Entering and Callbacks for leaving various contexts
+//===----------------------------------------------------------------------===//
+
+/// EnterSourceFile - Add a source file to the top of the include stack and
+/// start lexing tokens from it instead of the current buffer.
+///
+/// \returns true on failure, i.e. the buffer for \p FID could not be read.
+bool Preprocessor::EnterSourceFile(FileID FID, const DirectoryLookup *CurDir,
+                                   SourceLocation Loc) {
+  assert(!CurTokenLexer && "Cannot #include a file inside a macro!");
+  ++NumEnteredSourceFiles;
+
+  // Track the deepest include nesting seen, for statistics reporting.
+  if (MaxIncludeStackDepth < IncludeMacroStack.size())
+    MaxIncludeStackDepth = IncludeMacroStack.size();
+
+  // Get the MemoryBuffer for this FID, if it fails, we fail.
+ bool Invalid = false; + const llvm::MemoryBuffer *InputFile = + getSourceManager().getBuffer(FID, Loc, &Invalid); + if (Invalid) { + SourceLocation FileStart = SourceMgr.getLocForStartOfFile(FID); + Diag(Loc, diag::err_pp_error_opening_file) + << std::string(SourceMgr.getBufferName(FileStart)) << ""; + return true; + } + + if (isCodeCompletionEnabled() && + SourceMgr.getFileEntryForID(FID) == CodeCompletionFile) { + CodeCompletionFileLoc = SourceMgr.getLocForStartOfFile(FID); + CodeCompletionLoc = + CodeCompletionFileLoc.getLocWithOffset(CodeCompletionOffset); + } + + EnterSourceFileWithLexer(new Lexer(FID, InputFile, *this), CurDir); + return false; +} + +/// EnterSourceFileWithLexer - Add a source file to the top of the include stack +/// and start lexing tokens from it instead of the current buffer. +void Preprocessor::EnterSourceFileWithLexer(Lexer *TheLexer, + const DirectoryLookup *CurDir) { + + // Add the current lexer to the include stack. + if (CurPPLexer || CurTokenLexer) + PushIncludeMacroStack(); + + CurLexer.reset(TheLexer); + CurPPLexer = TheLexer; + CurDirLookup = CurDir; + CurLexerSubmodule = nullptr; + if (CurLexerKind != CLK_LexAfterModuleImport) + CurLexerKind = CLK_Lexer; + + // Notify the client, if desired, that we are in a new source file. + if (Callbacks && !CurLexer->Is_PragmaLexer) { + SrcMgr::CharacteristicKind FileType = + SourceMgr.getFileCharacteristic(CurLexer->getFileLoc()); + + Callbacks->FileChanged(CurLexer->getFileLoc(), + PPCallbacks::EnterFile, FileType); + } +} + +/// EnterMacro - Add a Macro to the top of the include stack and start lexing +/// tokens from it instead of the current buffer. 
+void Preprocessor::EnterMacro(Token &Tok, SourceLocation ILEnd,
+                              MacroInfo *Macro, MacroArgs *Args) {
+  // Reuse a cached TokenLexer if one is available; otherwise allocate a
+  // fresh one.  The cache avoids repeated heap traffic for nested expansions.
+  std::unique_ptr<TokenLexer> TokLexer;
+  if (NumCachedTokenLexers == 0) {
+    TokLexer = std::make_unique<TokenLexer>(Tok, ILEnd, Macro, Args, *this);
+  } else {
+    TokLexer = std::move(TokenLexerCache[--NumCachedTokenLexers]);
+    TokLexer->Init(Tok, ILEnd, Macro, Args);
+  }
+
+  // Save the current lexer state and make the macro expander current.
+  PushIncludeMacroStack();
+  CurDirLookup = nullptr;
+  CurTokenLexer = std::move(TokLexer);
+  if (CurLexerKind != CLK_LexAfterModuleImport)
+    CurLexerKind = CLK_TokenLexer;
+}
+
+/// EnterTokenStream - Add a "macro" context to the top of the include stack,
+/// which will cause the lexer to start returning the specified tokens.
+///
+/// If DisableMacroExpansion is true, tokens lexed from the token stream will
+/// not be subject to further macro expansion.  Otherwise, these tokens will
+/// be re-macro-expanded when/if expansion is enabled.
+///
+/// If OwnsTokens is false, this method assumes that the specified stream of
+/// tokens has a permanent owner somewhere, so they do not need to be copied.
+/// If it is true, it assumes the array of tokens is allocated with new[] and
+/// must be freed.
+///
+void Preprocessor::EnterTokenStream(const Token *Toks, unsigned NumToks,
+                                    bool DisableMacroExpansion, bool OwnsTokens,
+                                    bool IsReinject) {
+  if (CurLexerKind == CLK_CachingLexer) {
+    if (CachedLexPos < CachedTokens.size()) {
+      assert(IsReinject && "new tokens in the middle of cached stream");
+      // We're entering tokens into the middle of our cached token stream. We
+      // can't represent that, so just insert the tokens into the buffer.
+      CachedTokens.insert(CachedTokens.begin() + CachedLexPos,
+                          Toks, Toks + NumToks);
+      // The cache now owns copies of the tokens; free the caller's array if
+      // we were handed ownership.
+      if (OwnsTokens)
+        delete [] Toks;
+      return;
+    }
+
+    // New tokens are at the end of the cached token sequence; insert the
+    // token stream underneath the caching lexer.
+    ExitCachingLexMode();
+    EnterTokenStream(Toks, NumToks, DisableMacroExpansion, OwnsTokens,
+                     IsReinject);
+    EnterCachingLexMode();
+    return;
+  }
+
+  // Create a macro expander to expand from the specified token stream.
+  // As in EnterMacro, prefer a cached TokenLexer when one is available.
+  std::unique_ptr<TokenLexer> TokLexer;
+  if (NumCachedTokenLexers == 0) {
+    TokLexer = std::make_unique<TokenLexer>(
+        Toks, NumToks, DisableMacroExpansion, OwnsTokens, IsReinject, *this);
+  } else {
+    TokLexer = std::move(TokenLexerCache[--NumCachedTokenLexers]);
+    TokLexer->Init(Toks, NumToks, DisableMacroExpansion, OwnsTokens,
+                   IsReinject);
+  }
+
+  // Save our current state.
+  PushIncludeMacroStack();
+  CurDirLookup = nullptr;
+  CurTokenLexer = std::move(TokLexer);
+  if (CurLexerKind != CLK_LexAfterModuleImport)
+    CurLexerKind = CLK_TokenLexer;
+}
+
+/// Compute the relative path that names the given file relative to
+/// the given directory.
+///
+/// On success \p Result holds the suffix of \p File's directory path below
+/// \p Dir plus the filename; if \p Dir is not an ancestor, \p Result falls
+/// back to the file's full name.
+static void computeRelativePath(FileManager &FM, const DirectoryEntry *Dir,
+                                const FileEntry *File,
+                                SmallString<128> &Result) {
+  Result.clear();
+
+  // Walk up from the file's directory until we hit Dir (or run out of path).
+  StringRef FilePath = File->getDir()->getName();
+  StringRef Path = FilePath;
+  while (!Path.empty()) {
+    if (auto CurDir = FM.getDirectory(Path)) {
+      if (*CurDir == Dir) {
+        // Found the ancestor: the relative path is everything below it.
+        Result = FilePath.substr(Path.size());
+        llvm::sys::path::append(Result,
+                                llvm::sys::path::filename(File->getName()));
+        return;
+      }
+    }
+
+    Path = llvm::sys::path::parent_path(Path);
+  }
+
+  Result = File->getName();
+}
+
+/// Forward start-of-line / leading-whitespace flags from whichever lexer is
+/// current onto \p Result.
+void Preprocessor::PropagateLineStartLeadingSpaceInfo(Token &Result) {
+  if (CurTokenLexer) {
+    CurTokenLexer->PropagateLineStartLeadingSpaceInfo(Result);
+    return;
+  }
+  if (CurLexer) {
+    CurLexer->PropagateLineStartLeadingSpaceInfo(Result);
+    return;
+  }
+  // FIXME: Handle other kinds of lexers?  It generally shouldn't matter,
+  // but it might if they're empty?
+}
+
+/// Determine the location to use as the end of the buffer for a lexer.
+///
+/// If the file ends with a newline, form the EOF token on the newline itself,
+/// rather than "on the line following it", which doesn't exist.  This makes
+/// diagnostics relating to the end of file include the last file that the user
+/// actually typed, which is goodness.
+const char *Preprocessor::getCurLexerEndPos() {
+  const char *EndPos = CurLexer->BufferEnd;
+  // Back up over a single trailing newline, if any.
+  if (EndPos != CurLexer->BufferStart &&
+      (EndPos[-1] == '\n' || EndPos[-1] == '\r')) {
+    --EndPos;
+
+    // Handle \n\r and \r\n: a two-character line ending counts as one
+    // newline, so back up once more when the pair is mixed.
+    if (EndPos != CurLexer->BufferStart &&
+        (EndPos[-1] == '\n' || EndPos[-1] == '\r') &&
+        EndPos[-1] != EndPos[0])
+      --EndPos;
+  }
+
+  return EndPos;
+}
+
+/// Recursively collect \p Mod and every submodule of it that declares an
+/// umbrella header into \p SubMods.
+static void collectAllSubModulesWithUmbrellaHeader(
+    const Module &Mod, SmallVectorImpl<const Module *> &SubMods) {
+  if (Mod.getUmbrellaHeader())
+    SubMods.push_back(&Mod);
+  for (auto *M : Mod.submodules())
+    collectAllSubModulesWithUmbrellaHeader(*M, SubMods);
+}
+
+/// Emit -Wincomplete-umbrella warnings for headers that live under the
+/// module's umbrella directory but were never pulled in by the umbrella
+/// header during this compilation.
+void Preprocessor::diagnoseMissingHeaderInUmbrellaDir(const Module &Mod) {
+  assert(Mod.getUmbrellaHeader() && "Module must use umbrella header");
+  SourceLocation StartLoc =
+      SourceMgr.getLocForStartOfFile(SourceMgr.getMainFileID());
+  // Skip the (potentially expensive) directory walk when the warning is
+  // disabled anyway.
+  if (getDiagnostics().isIgnored(diag::warn_uncovered_module_header, StartLoc))
+    return;
+
+  ModuleMap &ModMap = getHeaderSearchInfo().getModuleMap();
+  const DirectoryEntry *Dir = Mod.getUmbrellaDir().Entry;
+  llvm::vfs::FileSystem &FS = FileMgr.getVirtualFileSystem();
+  std::error_code EC;
+  // Recursively scan everything under the umbrella directory.
+  for (llvm::vfs::recursive_directory_iterator Entry(FS, Dir->getName(), EC),
+                                               End;
+       Entry != End && !EC; Entry.increment(EC)) {
+    using llvm::StringSwitch;
+
+    // Check whether this entry has an extension typically associated with
+    // headers.
+ if (!StringSwitch<bool>(llvm::sys::path::extension(Entry->path())) + .Cases(".h", ".H", ".hh", ".hpp", true) + .Default(false)) + continue; + + if (auto Header = getFileManager().getFile(Entry->path())) + if (!getSourceManager().hasFileInfo(*Header)) { + if (!ModMap.isHeaderInUnavailableModule(*Header)) { + // Find the relative path that would access this header. + SmallString<128> RelativePath; + computeRelativePath(FileMgr, Dir, *Header, RelativePath); + Diag(StartLoc, diag::warn_uncovered_module_header) + << Mod.getFullModuleName() << RelativePath; + } + } + } +} + +/// HandleEndOfFile - This callback is invoked when the lexer hits the end of +/// the current file. This either returns the EOF token or pops a level off +/// the include stack and keeps going. +bool Preprocessor::HandleEndOfFile(Token &Result, bool isEndOfMacro) { + assert(!CurTokenLexer && + "Ending a file when currently in a macro!"); + + // If we have an unclosed module region from a pragma at the end of a + // module, complain and close it now. + const bool LeavingSubmodule = CurLexer && CurLexerSubmodule; + if ((LeavingSubmodule || IncludeMacroStack.empty()) && + !BuildingSubmoduleStack.empty() && + BuildingSubmoduleStack.back().IsPragma) { + Diag(BuildingSubmoduleStack.back().ImportLoc, + diag::err_pp_module_begin_without_module_end); + Module *M = LeaveSubmodule(/*ForPragma*/true); + + Result.startToken(); + const char *EndPos = getCurLexerEndPos(); + CurLexer->BufferPtr = EndPos; + CurLexer->FormTokenWithChars(Result, EndPos, tok::annot_module_end); + Result.setAnnotationEndLoc(Result.getLocation()); + Result.setAnnotationValue(M); + return true; + } + + // See if this file had a controlling macro. + if (CurPPLexer) { // Not ending a macro, ignore it. + if (const IdentifierInfo *ControllingMacro = + CurPPLexer->MIOpt.GetControllingMacroAtEndOfFile()) { + // Okay, this has a controlling macro, remember in HeaderFileInfo. 
+ if (const FileEntry *FE = CurPPLexer->getFileEntry()) { + HeaderInfo.SetFileControllingMacro(FE, ControllingMacro); + if (MacroInfo *MI = + getMacroInfo(const_cast<IdentifierInfo*>(ControllingMacro))) + MI->setUsedForHeaderGuard(true); + if (const IdentifierInfo *DefinedMacro = + CurPPLexer->MIOpt.GetDefinedMacro()) { + if (!isMacroDefined(ControllingMacro) && + DefinedMacro != ControllingMacro && + HeaderInfo.FirstTimeLexingFile(FE)) { + + // If the edit distance between the two macros is more than 50%, + // DefinedMacro may not be header guard, or can be header guard of + // another header file. Therefore, it maybe defining something + // completely different. This can be observed in the wild when + // handling feature macros or header guards in different files. + + const StringRef ControllingMacroName = ControllingMacro->getName(); + const StringRef DefinedMacroName = DefinedMacro->getName(); + const size_t MaxHalfLength = std::max(ControllingMacroName.size(), + DefinedMacroName.size()) / 2; + const unsigned ED = ControllingMacroName.edit_distance( + DefinedMacroName, true, MaxHalfLength); + if (ED <= MaxHalfLength) { + // Emit a warning for a bad header guard. + Diag(CurPPLexer->MIOpt.GetMacroLocation(), + diag::warn_header_guard) + << CurPPLexer->MIOpt.GetMacroLocation() << ControllingMacro; + Diag(CurPPLexer->MIOpt.GetDefinedLocation(), + diag::note_header_guard) + << CurPPLexer->MIOpt.GetDefinedLocation() << DefinedMacro + << ControllingMacro + << FixItHint::CreateReplacement( + CurPPLexer->MIOpt.GetDefinedLocation(), + ControllingMacro->getName()); + } + } + } + } + } + } + + // Complain about reaching a true EOF within arc_cf_code_audited. + // We don't want to complain about reaching the end of a macro + // instantiation or a _Pragma. 
+ if (PragmaARCCFCodeAuditedInfo.second.isValid() && !isEndOfMacro && + !(CurLexer && CurLexer->Is_PragmaLexer)) { + Diag(PragmaARCCFCodeAuditedInfo.second, + diag::err_pp_eof_in_arc_cf_code_audited); + + // Recover by leaving immediately. + PragmaARCCFCodeAuditedInfo = {nullptr, SourceLocation()}; + } + + // Complain about reaching a true EOF within assume_nonnull. + // We don't want to complain about reaching the end of a macro + // instantiation or a _Pragma. + if (PragmaAssumeNonNullLoc.isValid() && + !isEndOfMacro && !(CurLexer && CurLexer->Is_PragmaLexer)) { + Diag(PragmaAssumeNonNullLoc, diag::err_pp_eof_in_assume_nonnull); + + // Recover by leaving immediately. + PragmaAssumeNonNullLoc = SourceLocation(); + } + + bool LeavingPCHThroughHeader = false; + + // If this is a #include'd file, pop it off the include stack and continue + // lexing the #includer file. + if (!IncludeMacroStack.empty()) { + + // If we lexed the code-completion file, act as if we reached EOF. + if (isCodeCompletionEnabled() && CurPPLexer && + SourceMgr.getLocForStartOfFile(CurPPLexer->getFileID()) == + CodeCompletionFileLoc) { + assert(CurLexer && "Got EOF but no current lexer set!"); + Result.startToken(); + CurLexer->FormTokenWithChars(Result, CurLexer->BufferEnd, tok::eof); + CurLexer.reset(); + + CurPPLexer = nullptr; + recomputeCurLexerKind(); + return true; + } + + if (!isEndOfMacro && CurPPLexer && + SourceMgr.getIncludeLoc(CurPPLexer->getFileID()).isValid()) { + // Notify SourceManager to record the number of FileIDs that were created + // during lexing of the #include'd file. 
+ unsigned NumFIDs = + SourceMgr.local_sloc_entry_size() - + CurPPLexer->getInitialNumSLocEntries() + 1/*#include'd file*/; + SourceMgr.setNumCreatedFIDsForFileID(CurPPLexer->getFileID(), NumFIDs); + } + + bool ExitedFromPredefinesFile = false; + FileID ExitedFID; + if (!isEndOfMacro && CurPPLexer) { + ExitedFID = CurPPLexer->getFileID(); + + assert(PredefinesFileID.isValid() && + "HandleEndOfFile is called before PredefinesFileId is set"); + ExitedFromPredefinesFile = (PredefinesFileID == ExitedFID); + } + + if (LeavingSubmodule) { + // We're done with this submodule. + Module *M = LeaveSubmodule(/*ForPragma*/false); + + // Notify the parser that we've left the module. + const char *EndPos = getCurLexerEndPos(); + Result.startToken(); + CurLexer->BufferPtr = EndPos; + CurLexer->FormTokenWithChars(Result, EndPos, tok::annot_module_end); + Result.setAnnotationEndLoc(Result.getLocation()); + Result.setAnnotationValue(M); + } + + bool FoundPCHThroughHeader = false; + if (CurPPLexer && creatingPCHWithThroughHeader() && + isPCHThroughHeader( + SourceMgr.getFileEntryForID(CurPPLexer->getFileID()))) + FoundPCHThroughHeader = true; + + // We're done with the #included file. + RemoveTopOfLexerStack(); + + // Propagate info about start-of-line/leading white-space/etc. + PropagateLineStartLeadingSpaceInfo(Result); + + // Notify the client, if desired, that we are in a new source file. + if (Callbacks && !isEndOfMacro && CurPPLexer) { + SrcMgr::CharacteristicKind FileType = + SourceMgr.getFileCharacteristic(CurPPLexer->getSourceLocation()); + Callbacks->FileChanged(CurPPLexer->getSourceLocation(), + PPCallbacks::ExitFile, FileType, ExitedFID); + } + + // Restore conditional stack from the preamble right after exiting from the + // predefines file. 
+ if (ExitedFromPredefinesFile) + replayPreambleConditionalStack(); + + if (!isEndOfMacro && CurPPLexer && FoundPCHThroughHeader && + (isInPrimaryFile() || + CurPPLexer->getFileID() == getPredefinesFileID())) { + // Leaving the through header. Continue directly to end of main file + // processing. + LeavingPCHThroughHeader = true; + } else { + // Client should lex another token unless we generated an EOM. + return LeavingSubmodule; + } + } + + // If this is the end of the main file, form an EOF token. + assert(CurLexer && "Got EOF but no current lexer set!"); + const char *EndPos = getCurLexerEndPos(); + Result.startToken(); + CurLexer->BufferPtr = EndPos; + CurLexer->FormTokenWithChars(Result, EndPos, tok::eof); + + if (isCodeCompletionEnabled()) { + // Inserting the code-completion point increases the source buffer by 1, + // but the main FileID was created before inserting the point. + // Compensate by reducing the EOF location by 1, otherwise the location + // will point to the next FileID. + // FIXME: This is hacky, the code-completion point should probably be + // inserted before the main FileID is created. + if (CurLexer->getFileLoc() == CodeCompletionFileLoc) + Result.setLocation(Result.getLocation().getLocWithOffset(-1)); + } + + if (creatingPCHWithThroughHeader() && !LeavingPCHThroughHeader) { + // Reached the end of the compilation without finding the through header. + Diag(CurLexer->getFileLoc(), diag::err_pp_through_header_not_seen) + << PPOpts->PCHThroughHeader << 0; + } + + if (!isIncrementalProcessingEnabled()) + // We're done with lexing. + CurLexer.reset(); + + if (!isIncrementalProcessingEnabled()) + CurPPLexer = nullptr; + + if (TUKind == TU_Complete) { + // This is the end of the top-level file. 'WarnUnusedMacroLocs' has + // collected all macro locations that we need to warn because they are not + // used. 
+    for (WarnUnusedMacroLocsTy::iterator
+           I=WarnUnusedMacroLocs.begin(), E=WarnUnusedMacroLocs.end();
+         I!=E; ++I)
+      Diag(*I, diag::pp_macro_not_used);
+  }
+
+  // If we are building a module that has an umbrella header, make sure that
+  // each of the headers within the directory, including all submodules, is
+  // covered by the umbrella header was actually included by the umbrella
+  // header.
+  if (Module *Mod = getCurrentModule()) {
+    llvm::SmallVector<const Module *, 4> AllMods;
+    collectAllSubModulesWithUmbrellaHeader(*Mod, AllMods);
+    for (auto *M : AllMods)
+      diagnoseMissingHeaderInUmbrellaDir(*M);
+  }
+
+  return true;
+}
+
+/// HandleEndOfTokenLexer - This callback is invoked when the current TokenLexer
+/// hits the end of its token stream.
+///
+/// \returns the result of HandleEndOfFile: true if an EOF/annotation token was
+/// formed, false if the caller should lex another token.
+bool Preprocessor::HandleEndOfTokenLexer(Token &Result) {
+  assert(CurTokenLexer && !CurPPLexer &&
+         "Ending a macro when currently in a #include file!");
+
+  // Drop any macro-expansion caching bookkeeping tied to this lexer.
+  if (!MacroExpandingLexersStack.empty() &&
+      MacroExpandingLexersStack.back().first == CurTokenLexer.get())
+    removeCachedMacroExpandedTokensOfLastLexer();
+
+  // Delete or cache the now-dead macro expander.
+  if (NumCachedTokenLexers == TokenLexerCacheSize)
+    CurTokenLexer.reset();
+  else
+    TokenLexerCache[NumCachedTokenLexers++] = std::move(CurTokenLexer);
+
+  // Handle this like a #include file being popped off the stack.
+  return HandleEndOfFile(Result, true);
+}
+
+/// RemoveTopOfLexerStack - Pop the current lexer/macro exp off the top of the
+/// lexer stack.  This should only be used in situations where the current
+/// state of the top-of-stack lexer is unknown.
+void Preprocessor::RemoveTopOfLexerStack() {
+  assert(!IncludeMacroStack.empty() && "Ran out of stack entries to load");
+
+  if (CurTokenLexer) {
+    // Delete or cache the now-dead macro expander.
+ if (NumCachedTokenLexers == TokenLexerCacheSize) + CurTokenLexer.reset(); + else + TokenLexerCache[NumCachedTokenLexers++] = std::move(CurTokenLexer); + } + + PopIncludeMacroStack(); +} + +/// HandleMicrosoftCommentPaste - When the macro expander pastes together a +/// comment (/##/) in microsoft mode, this method handles updating the current +/// state, returning the token on the next source line. +void Preprocessor::HandleMicrosoftCommentPaste(Token &Tok) { + assert(CurTokenLexer && !CurPPLexer && + "Pasted comment can only be formed from macro"); + // We handle this by scanning for the closest real lexer, switching it to + // raw mode and preprocessor mode. This will cause it to return \n as an + // explicit EOD token. + PreprocessorLexer *FoundLexer = nullptr; + bool LexerWasInPPMode = false; + for (const IncludeStackInfo &ISI : llvm::reverse(IncludeMacroStack)) { + if (ISI.ThePPLexer == nullptr) continue; // Scan for a real lexer. + + // Once we find a real lexer, mark it as raw mode (disabling macro + // expansions) and preprocessor mode (return EOD). We know that the lexer + // was *not* in raw mode before, because the macro that the comment came + // from was expanded. However, it could have already been in preprocessor + // mode (#if COMMENT) in which case we have to return it to that mode and + // return EOD. + FoundLexer = ISI.ThePPLexer; + FoundLexer->LexingRawMode = true; + LexerWasInPPMode = FoundLexer->ParsingPreprocessorDirective; + FoundLexer->ParsingPreprocessorDirective = true; + break; + } + + // Okay, we either found and switched over the lexer, or we didn't find a + // lexer. In either case, finish off the macro the comment came from, getting + // the next token. + if (!HandleEndOfTokenLexer(Tok)) Lex(Tok); + + // Discarding comments as long as we don't have EOF or EOD. 
This 'comments + // out' the rest of the line, including any tokens that came from other macros + // that were active, as in: + // #define submacro a COMMENT b + // submacro c + // which should lex to 'a' only: 'b' and 'c' should be removed. + while (Tok.isNot(tok::eod) && Tok.isNot(tok::eof)) + Lex(Tok); + + // If we got an eod token, then we successfully found the end of the line. + if (Tok.is(tok::eod)) { + assert(FoundLexer && "Can't get end of line without an active lexer"); + // Restore the lexer back to normal mode instead of raw mode. + FoundLexer->LexingRawMode = false; + + // If the lexer was already in preprocessor mode, just return the EOD token + // to finish the preprocessor line. + if (LexerWasInPPMode) return; + + // Otherwise, switch out of PP mode and return the next lexed token. + FoundLexer->ParsingPreprocessorDirective = false; + return Lex(Tok); + } + + // If we got an EOF token, then we reached the end of the token stream but + // didn't find an explicit \n. This can only happen if there was no lexer + // active (an active lexer would return EOD at EOF if there was no \n in + // preprocessor directive mode), so just return EOF as our token. + assert(!FoundLexer && "Lexer should return EOD before EOF in PP mode"); +} + +void Preprocessor::EnterSubmodule(Module *M, SourceLocation ImportLoc, + bool ForPragma) { + if (!getLangOpts().ModulesLocalVisibility) { + // Just track that we entered this submodule. + BuildingSubmoduleStack.push_back( + BuildingSubmoduleInfo(M, ImportLoc, ForPragma, CurSubmoduleState, + PendingModuleMacroNames.size())); + if (Callbacks) + Callbacks->EnteredSubmodule(M, ImportLoc, ForPragma); + return; + } + + // Resolve as much of the module definition as we can now, before we enter + // one of its headers. + // FIXME: Can we enable Complain here? + // FIXME: Can we do this when local visibility is disabled? 
+ ModuleMap &ModMap = getHeaderSearchInfo().getModuleMap(); + ModMap.resolveExports(M, /*Complain=*/false); + ModMap.resolveUses(M, /*Complain=*/false); + ModMap.resolveConflicts(M, /*Complain=*/false); + + // If this is the first time we've entered this module, set up its state. + auto R = Submodules.insert(std::make_pair(M, SubmoduleState())); + auto &State = R.first->second; + bool FirstTime = R.second; + if (FirstTime) { + // Determine the set of starting macros for this submodule; take these + // from the "null" module (the predefines buffer). + // + // FIXME: If we have local visibility but not modules enabled, the + // NullSubmoduleState is polluted by #defines in the top-level source + // file. + auto &StartingMacros = NullSubmoduleState.Macros; + + // Restore to the starting state. + // FIXME: Do this lazily, when each macro name is first referenced. + for (auto &Macro : StartingMacros) { + // Skip uninteresting macros. + if (!Macro.second.getLatest() && + Macro.second.getOverriddenMacros().empty()) + continue; + + MacroState MS(Macro.second.getLatest()); + MS.setOverriddenMacros(*this, Macro.second.getOverriddenMacros()); + State.Macros.insert(std::make_pair(Macro.first, std::move(MS))); + } + } + + // Track that we entered this module. + BuildingSubmoduleStack.push_back( + BuildingSubmoduleInfo(M, ImportLoc, ForPragma, CurSubmoduleState, + PendingModuleMacroNames.size())); + + if (Callbacks) + Callbacks->EnteredSubmodule(M, ImportLoc, ForPragma); + + // Switch to this submodule as the current submodule. + CurSubmoduleState = &State; + + // This module is visible to itself. + if (FirstTime) + makeModuleVisible(M, ImportLoc); +} + +bool Preprocessor::needModuleMacros() const { + // If we're not within a submodule, we never need to create ModuleMacros. + if (BuildingSubmoduleStack.empty()) + return false; + // If we are tracking module macro visibility even for textually-included + // headers, we need ModuleMacros. 
+ if (getLangOpts().ModulesLocalVisibility) + return true; + // Otherwise, we only need module macros if we're actually compiling a module + // interface. + return getLangOpts().isCompilingModule(); +} + +Module *Preprocessor::LeaveSubmodule(bool ForPragma) { + if (BuildingSubmoduleStack.empty() || + BuildingSubmoduleStack.back().IsPragma != ForPragma) { + assert(ForPragma && "non-pragma module enter/leave mismatch"); + return nullptr; + } + + auto &Info = BuildingSubmoduleStack.back(); + + Module *LeavingMod = Info.M; + SourceLocation ImportLoc = Info.ImportLoc; + + if (!needModuleMacros() || + (!getLangOpts().ModulesLocalVisibility && + LeavingMod->getTopLevelModuleName() != getLangOpts().CurrentModule)) { + // If we don't need module macros, or this is not a module for which we + // are tracking macro visibility, don't build any, and preserve the list + // of pending names for the surrounding submodule. + BuildingSubmoduleStack.pop_back(); + + if (Callbacks) + Callbacks->LeftSubmodule(LeavingMod, ImportLoc, ForPragma); + + makeModuleVisible(LeavingMod, ImportLoc); + return LeavingMod; + } + + // Create ModuleMacros for any macros defined in this submodule. + llvm::SmallPtrSet<const IdentifierInfo*, 8> VisitedMacros; + for (unsigned I = Info.OuterPendingModuleMacroNames; + I != PendingModuleMacroNames.size(); ++I) { + auto *II = const_cast<IdentifierInfo*>(PendingModuleMacroNames[I]); + if (!VisitedMacros.insert(II).second) + continue; + + auto MacroIt = CurSubmoduleState->Macros.find(II); + if (MacroIt == CurSubmoduleState->Macros.end()) + continue; + auto &Macro = MacroIt->second; + + // Find the starting point for the MacroDirective chain in this submodule. 
+ MacroDirective *OldMD = nullptr; + auto *OldState = Info.OuterSubmoduleState; + if (getLangOpts().ModulesLocalVisibility) + OldState = &NullSubmoduleState; + if (OldState && OldState != CurSubmoduleState) { + // FIXME: It'd be better to start at the state from when we most recently + // entered this submodule, but it doesn't really matter. + auto &OldMacros = OldState->Macros; + auto OldMacroIt = OldMacros.find(II); + if (OldMacroIt == OldMacros.end()) + OldMD = nullptr; + else + OldMD = OldMacroIt->second.getLatest(); + } + + // This module may have exported a new macro. If so, create a ModuleMacro + // representing that fact. + bool ExplicitlyPublic = false; + for (auto *MD = Macro.getLatest(); MD != OldMD; MD = MD->getPrevious()) { + assert(MD && "broken macro directive chain"); + + if (auto *VisMD = dyn_cast<VisibilityMacroDirective>(MD)) { + // The latest visibility directive for a name in a submodule affects + // all the directives that come before it. + if (VisMD->isPublic()) + ExplicitlyPublic = true; + else if (!ExplicitlyPublic) + // Private with no following public directive: not exported. + break; + } else { + MacroInfo *Def = nullptr; + if (DefMacroDirective *DefMD = dyn_cast<DefMacroDirective>(MD)) + Def = DefMD->getInfo(); + + // FIXME: Issue a warning if multiple headers for the same submodule + // define a macro, rather than silently ignoring all but the first. + bool IsNew; + // Don't bother creating a module macro if it would represent a #undef + // that doesn't override anything. + if (Def || !Macro.getOverriddenMacros().empty()) + addModuleMacro(LeavingMod, II, Def, + Macro.getOverriddenMacros(), IsNew); + + if (!getLangOpts().ModulesLocalVisibility) { + // This macro is exposed to the rest of this compilation as a + // ModuleMacro; we don't need to track its MacroDirective any more. 
+ Macro.setLatest(nullptr); + Macro.setOverriddenMacros(*this, {}); + } + break; + } + } + } + PendingModuleMacroNames.resize(Info.OuterPendingModuleMacroNames); + + // FIXME: Before we leave this submodule, we should parse all the other + // headers within it. Otherwise, we're left with an inconsistent state + // where we've made the module visible but don't yet have its complete + // contents. + + // Put back the outer module's state, if we're tracking it. + if (getLangOpts().ModulesLocalVisibility) + CurSubmoduleState = Info.OuterSubmoduleState; + + BuildingSubmoduleStack.pop_back(); + + if (Callbacks) + Callbacks->LeftSubmodule(LeavingMod, ImportLoc, ForPragma); + + // A nested #include makes the included submodule visible. + makeModuleVisible(LeavingMod, ImportLoc); + return LeavingMod; +} diff --git a/clang/lib/Lex/PPMacroExpansion.cpp b/clang/lib/Lex/PPMacroExpansion.cpp new file mode 100644 index 000000000000..dfbcaedcacff --- /dev/null +++ b/clang/lib/Lex/PPMacroExpansion.cpp @@ -0,0 +1,1845 @@ +//===--- PPMacroExpansion.cpp - Top level Macro Expansion -----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the top level handling of macro expansion for the +// preprocessor. 
+// +//===----------------------------------------------------------------------===// + +#include "clang/Basic/Attributes.h" +#include "clang/Basic/FileManager.h" +#include "clang/Basic/IdentifierTable.h" +#include "clang/Basic/LLVM.h" +#include "clang/Basic/LangOptions.h" +#include "clang/Basic/ObjCRuntime.h" +#include "clang/Basic/SourceLocation.h" +#include "clang/Basic/TargetInfo.h" +#include "clang/Lex/CodeCompletionHandler.h" +#include "clang/Lex/DirectoryLookup.h" +#include "clang/Lex/ExternalPreprocessorSource.h" +#include "clang/Lex/HeaderSearch.h" +#include "clang/Lex/LexDiagnostic.h" +#include "clang/Lex/MacroArgs.h" +#include "clang/Lex/MacroInfo.h" +#include "clang/Lex/Preprocessor.h" +#include "clang/Lex/PreprocessorLexer.h" +#include "clang/Lex/Token.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/FoldingSet.h" +#include "llvm/ADT/None.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/Format.h" +#include "llvm/Support/Path.h" +#include "llvm/Support/raw_ostream.h" +#include <algorithm> +#include <cassert> +#include <cstddef> +#include <cstring> +#include <ctime> +#include <string> +#include <tuple> +#include <utility> + +using namespace clang; + +MacroDirective * +Preprocessor::getLocalMacroDirectiveHistory(const IdentifierInfo *II) const { + if (!II->hadMacroDefinition()) + return nullptr; + auto Pos = CurSubmoduleState->Macros.find(II); + return Pos == CurSubmoduleState->Macros.end() ? 
nullptr + : Pos->second.getLatest(); +} + +void Preprocessor::appendMacroDirective(IdentifierInfo *II, MacroDirective *MD){ + assert(MD && "MacroDirective should be non-zero!"); + assert(!MD->getPrevious() && "Already attached to a MacroDirective history."); + + MacroState &StoredMD = CurSubmoduleState->Macros[II]; + auto *OldMD = StoredMD.getLatest(); + MD->setPrevious(OldMD); + StoredMD.setLatest(MD); + StoredMD.overrideActiveModuleMacros(*this, II); + + if (needModuleMacros()) { + // Track that we created a new macro directive, so we know we should + // consider building a ModuleMacro for it when we get to the end of + // the module. + PendingModuleMacroNames.push_back(II); + } + + // Set up the identifier as having associated macro history. + II->setHasMacroDefinition(true); + if (!MD->isDefined() && LeafModuleMacros.find(II) == LeafModuleMacros.end()) + II->setHasMacroDefinition(false); + if (II->isFromAST()) + II->setChangedSinceDeserialization(); +} + +void Preprocessor::setLoadedMacroDirective(IdentifierInfo *II, + MacroDirective *ED, + MacroDirective *MD) { + // Normally, when a macro is defined, it goes through appendMacroDirective() + // above, which chains a macro to previous defines, undefs, etc. + // However, in a pch, the whole macro history up to the end of the pch is + // stored, so ASTReader goes through this function instead. + // However, built-in macros are already registered in the Preprocessor + // ctor, and ASTWriter stops writing the macro chain at built-in macros, + // so in that case the chain from the pch needs to be spliced to the existing + // built-in. + + assert(II && MD); + MacroState &StoredMD = CurSubmoduleState->Macros[II]; + + if (auto *OldMD = StoredMD.getLatest()) { + // shouldIgnoreMacro() in ASTWriter also stops at macros from the + // predefines buffer in module builds. 
However, in module builds, modules + // are loaded completely before predefines are processed, so StoredMD + // will be nullptr for them when they're loaded. StoredMD should only be + // non-nullptr for builtins read from a pch file. + assert(OldMD->getMacroInfo()->isBuiltinMacro() && + "only built-ins should have an entry here"); + assert(!OldMD->getPrevious() && "builtin should only have a single entry"); + ED->setPrevious(OldMD); + StoredMD.setLatest(MD); + } else { + StoredMD = MD; + } + + // Setup the identifier as having associated macro history. + II->setHasMacroDefinition(true); + if (!MD->isDefined() && LeafModuleMacros.find(II) == LeafModuleMacros.end()) + II->setHasMacroDefinition(false); +} + +ModuleMacro *Preprocessor::addModuleMacro(Module *Mod, IdentifierInfo *II, + MacroInfo *Macro, + ArrayRef<ModuleMacro *> Overrides, + bool &New) { + llvm::FoldingSetNodeID ID; + ModuleMacro::Profile(ID, Mod, II); + + void *InsertPos; + if (auto *MM = ModuleMacros.FindNodeOrInsertPos(ID, InsertPos)) { + New = false; + return MM; + } + + auto *MM = ModuleMacro::create(*this, Mod, II, Macro, Overrides); + ModuleMacros.InsertNode(MM, InsertPos); + + // Each overridden macro is now overridden by one more macro. + bool HidAny = false; + for (auto *O : Overrides) { + HidAny |= (O->NumOverriddenBy == 0); + ++O->NumOverriddenBy; + } + + // If we were the first overrider for any macro, it's no longer a leaf. + auto &LeafMacros = LeafModuleMacros[II]; + if (HidAny) { + LeafMacros.erase(std::remove_if(LeafMacros.begin(), LeafMacros.end(), + [](ModuleMacro *MM) { + return MM->NumOverriddenBy != 0; + }), + LeafMacros.end()); + } + + // The new macro is always a leaf macro. + LeafMacros.push_back(MM); + // The identifier now has defined macros (that may or may not be visible). 
+ II->setHasMacroDefinition(true); + + New = true; + return MM; +} + +ModuleMacro *Preprocessor::getModuleMacro(Module *Mod, IdentifierInfo *II) { + llvm::FoldingSetNodeID ID; + ModuleMacro::Profile(ID, Mod, II); + + void *InsertPos; + return ModuleMacros.FindNodeOrInsertPos(ID, InsertPos); +} + +void Preprocessor::updateModuleMacroInfo(const IdentifierInfo *II, + ModuleMacroInfo &Info) { + assert(Info.ActiveModuleMacrosGeneration != + CurSubmoduleState->VisibleModules.getGeneration() && + "don't need to update this macro name info"); + Info.ActiveModuleMacrosGeneration = + CurSubmoduleState->VisibleModules.getGeneration(); + + auto Leaf = LeafModuleMacros.find(II); + if (Leaf == LeafModuleMacros.end()) { + // No imported macros at all: nothing to do. + return; + } + + Info.ActiveModuleMacros.clear(); + + // Every macro that's locally overridden is overridden by a visible macro. + llvm::DenseMap<ModuleMacro *, int> NumHiddenOverrides; + for (auto *O : Info.OverriddenMacros) + NumHiddenOverrides[O] = -1; + + // Collect all macros that are not overridden by a visible macro. + llvm::SmallVector<ModuleMacro *, 16> Worklist; + for (auto *LeafMM : Leaf->second) { + assert(LeafMM->getNumOverridingMacros() == 0 && "leaf macro overridden"); + if (NumHiddenOverrides.lookup(LeafMM) == 0) + Worklist.push_back(LeafMM); + } + while (!Worklist.empty()) { + auto *MM = Worklist.pop_back_val(); + if (CurSubmoduleState->VisibleModules.isVisible(MM->getOwningModule())) { + // We only care about collecting definitions; undefinitions only act + // to override other definitions. + if (MM->getMacroInfo()) + Info.ActiveModuleMacros.push_back(MM); + } else { + for (auto *O : MM->overrides()) + if ((unsigned)++NumHiddenOverrides[O] == O->getNumOverridingMacros()) + Worklist.push_back(O); + } + } + // Our reverse postorder walk found the macros in reverse order. 
+ std::reverse(Info.ActiveModuleMacros.begin(), Info.ActiveModuleMacros.end()); + + // Determine whether the macro name is ambiguous. + MacroInfo *MI = nullptr; + bool IsSystemMacro = true; + bool IsAmbiguous = false; + if (auto *MD = Info.MD) { + while (MD && isa<VisibilityMacroDirective>(MD)) + MD = MD->getPrevious(); + if (auto *DMD = dyn_cast_or_null<DefMacroDirective>(MD)) { + MI = DMD->getInfo(); + IsSystemMacro &= SourceMgr.isInSystemHeader(DMD->getLocation()); + } + } + for (auto *Active : Info.ActiveModuleMacros) { + auto *NewMI = Active->getMacroInfo(); + + // Before marking the macro as ambiguous, check if this is a case where + // both macros are in system headers. If so, we trust that the system + // did not get it wrong. This also handles cases where Clang's own + // headers have a different spelling of certain system macros: + // #define LONG_MAX __LONG_MAX__ (clang's limits.h) + // #define LONG_MAX 0x7fffffffffffffffL (system's limits.h) + // + // FIXME: Remove the defined-in-system-headers check. clang's limits.h + // overrides the system limits.h's macros, so there's no conflict here. 
+ if (MI && NewMI != MI && + !MI->isIdenticalTo(*NewMI, *this, /*Syntactically=*/true)) + IsAmbiguous = true; + IsSystemMacro &= Active->getOwningModule()->IsSystem || + SourceMgr.isInSystemHeader(NewMI->getDefinitionLoc()); + MI = NewMI; + } + Info.IsAmbiguous = IsAmbiguous && !IsSystemMacro; +} + +void Preprocessor::dumpMacroInfo(const IdentifierInfo *II) { + ArrayRef<ModuleMacro*> Leaf; + auto LeafIt = LeafModuleMacros.find(II); + if (LeafIt != LeafModuleMacros.end()) + Leaf = LeafIt->second; + const MacroState *State = nullptr; + auto Pos = CurSubmoduleState->Macros.find(II); + if (Pos != CurSubmoduleState->Macros.end()) + State = &Pos->second; + + llvm::errs() << "MacroState " << State << " " << II->getNameStart(); + if (State && State->isAmbiguous(*this, II)) + llvm::errs() << " ambiguous"; + if (State && !State->getOverriddenMacros().empty()) { + llvm::errs() << " overrides"; + for (auto *O : State->getOverriddenMacros()) + llvm::errs() << " " << O->getOwningModule()->getFullModuleName(); + } + llvm::errs() << "\n"; + + // Dump local macro directives. + for (auto *MD = State ? State->getLatest() : nullptr; MD; + MD = MD->getPrevious()) { + llvm::errs() << " "; + MD->dump(); + } + + // Dump module macros. + llvm::DenseSet<ModuleMacro*> Active; + for (auto *MM : State ? 
State->getActiveModuleMacros(*this, II) : None) + Active.insert(MM); + llvm::DenseSet<ModuleMacro*> Visited; + llvm::SmallVector<ModuleMacro *, 16> Worklist(Leaf.begin(), Leaf.end()); + while (!Worklist.empty()) { + auto *MM = Worklist.pop_back_val(); + llvm::errs() << " ModuleMacro " << MM << " " + << MM->getOwningModule()->getFullModuleName(); + if (!MM->getMacroInfo()) + llvm::errs() << " undef"; + + if (Active.count(MM)) + llvm::errs() << " active"; + else if (!CurSubmoduleState->VisibleModules.isVisible( + MM->getOwningModule())) + llvm::errs() << " hidden"; + else if (MM->getMacroInfo()) + llvm::errs() << " overridden"; + + if (!MM->overrides().empty()) { + llvm::errs() << " overrides"; + for (auto *O : MM->overrides()) { + llvm::errs() << " " << O->getOwningModule()->getFullModuleName(); + if (Visited.insert(O).second) + Worklist.push_back(O); + } + } + llvm::errs() << "\n"; + if (auto *MI = MM->getMacroInfo()) { + llvm::errs() << " "; + MI->dump(); + llvm::errs() << "\n"; + } + } +} + +/// RegisterBuiltinMacro - Register the specified identifier in the identifier +/// table and mark it as a builtin macro to be expanded. +static IdentifierInfo *RegisterBuiltinMacro(Preprocessor &PP, const char *Name){ + // Get the identifier. + IdentifierInfo *Id = PP.getIdentifierInfo(Name); + + // Mark it as being a macro that is builtin. + MacroInfo *MI = PP.AllocateMacroInfo(SourceLocation()); + MI->setIsBuiltinMacro(); + PP.appendDefMacroDirective(Id, MI); + return Id; +} + +/// RegisterBuiltinMacros - Register builtin macros, such as __LINE__ with the +/// identifier table. 
+void Preprocessor::RegisterBuiltinMacros() { + Ident__LINE__ = RegisterBuiltinMacro(*this, "__LINE__"); + Ident__FILE__ = RegisterBuiltinMacro(*this, "__FILE__"); + Ident__DATE__ = RegisterBuiltinMacro(*this, "__DATE__"); + Ident__TIME__ = RegisterBuiltinMacro(*this, "__TIME__"); + Ident__COUNTER__ = RegisterBuiltinMacro(*this, "__COUNTER__"); + Ident_Pragma = RegisterBuiltinMacro(*this, "_Pragma"); + + // C++ Standing Document Extensions. + if (LangOpts.CPlusPlus) + Ident__has_cpp_attribute = + RegisterBuiltinMacro(*this, "__has_cpp_attribute"); + else + Ident__has_cpp_attribute = nullptr; + + // GCC Extensions. + Ident__BASE_FILE__ = RegisterBuiltinMacro(*this, "__BASE_FILE__"); + Ident__INCLUDE_LEVEL__ = RegisterBuiltinMacro(*this, "__INCLUDE_LEVEL__"); + Ident__TIMESTAMP__ = RegisterBuiltinMacro(*this, "__TIMESTAMP__"); + + // Microsoft Extensions. + if (LangOpts.MicrosoftExt) { + Ident__identifier = RegisterBuiltinMacro(*this, "__identifier"); + Ident__pragma = RegisterBuiltinMacro(*this, "__pragma"); + } else { + Ident__identifier = nullptr; + Ident__pragma = nullptr; + } + + // Clang Extensions. 
+ Ident__FILE_NAME__ = RegisterBuiltinMacro(*this, "__FILE_NAME__"); + Ident__has_feature = RegisterBuiltinMacro(*this, "__has_feature"); + Ident__has_extension = RegisterBuiltinMacro(*this, "__has_extension"); + Ident__has_builtin = RegisterBuiltinMacro(*this, "__has_builtin"); + Ident__has_attribute = RegisterBuiltinMacro(*this, "__has_attribute"); + Ident__has_c_attribute = RegisterBuiltinMacro(*this, "__has_c_attribute"); + Ident__has_declspec = RegisterBuiltinMacro(*this, "__has_declspec_attribute"); + Ident__has_include = RegisterBuiltinMacro(*this, "__has_include"); + Ident__has_include_next = RegisterBuiltinMacro(*this, "__has_include_next"); + Ident__has_warning = RegisterBuiltinMacro(*this, "__has_warning"); + Ident__is_identifier = RegisterBuiltinMacro(*this, "__is_identifier"); + Ident__is_target_arch = RegisterBuiltinMacro(*this, "__is_target_arch"); + Ident__is_target_vendor = RegisterBuiltinMacro(*this, "__is_target_vendor"); + Ident__is_target_os = RegisterBuiltinMacro(*this, "__is_target_os"); + Ident__is_target_environment = + RegisterBuiltinMacro(*this, "__is_target_environment"); + + // Modules. + Ident__building_module = RegisterBuiltinMacro(*this, "__building_module"); + if (!LangOpts.CurrentModule.empty()) + Ident__MODULE__ = RegisterBuiltinMacro(*this, "__MODULE__"); + else + Ident__MODULE__ = nullptr; +} + +/// isTrivialSingleTokenExpansion - Return true if MI, which has a single token +/// in its expansion, currently expands to that token literally. +static bool isTrivialSingleTokenExpansion(const MacroInfo *MI, + const IdentifierInfo *MacroIdent, + Preprocessor &PP) { + IdentifierInfo *II = MI->getReplacementToken(0).getIdentifierInfo(); + + // If the token isn't an identifier, it's always literally expanded. + if (!II) return true; + + // If the information about this identifier is out of date, update it from + // the external source. 
+ if (II->isOutOfDate()) + PP.getExternalSource()->updateOutOfDateIdentifier(*II); + + // If the identifier is a macro, and if that macro is enabled, it may be + // expanded so it's not a trivial expansion. + if (auto *ExpansionMI = PP.getMacroInfo(II)) + if (ExpansionMI->isEnabled() && + // Fast expanding "#define X X" is ok, because X would be disabled. + II != MacroIdent) + return false; + + // If this is an object-like macro invocation, it is safe to trivially expand + // it. + if (MI->isObjectLike()) return true; + + // If this is a function-like macro invocation, it's safe to trivially expand + // as long as the identifier is not a macro argument. + return std::find(MI->param_begin(), MI->param_end(), II) == MI->param_end(); +} + +/// isNextPPTokenLParen - Determine whether the next preprocessor token to be +/// lexed is a '('. If so, consume the token and return true, if not, this +/// method should have no observable side-effect on the lexed tokens. +bool Preprocessor::isNextPPTokenLParen() { + // Do some quick tests for rejection cases. + unsigned Val; + if (CurLexer) + Val = CurLexer->isNextPPTokenLParen(); + else + Val = CurTokenLexer->isNextTokenLParen(); + + if (Val == 2) { + // We have run off the end. If it's a source file we don't + // examine enclosing ones (C99 5.1.1.2p4). Otherwise walk up the + // macro stack. + if (CurPPLexer) + return false; + for (const IncludeStackInfo &Entry : llvm::reverse(IncludeMacroStack)) { + if (Entry.TheLexer) + Val = Entry.TheLexer->isNextPPTokenLParen(); + else + Val = Entry.TheTokenLexer->isNextTokenLParen(); + + if (Val != 2) + break; + + // Ran off the end of a source file? + if (Entry.ThePPLexer) + return false; + } + } + + // Okay, if we know that the token is a '(', lex it and return. Otherwise we + // have found something that isn't a '(' or we found the end of the + // translation unit. In either case, return false. 
+ return Val == 1; +} + +/// HandleMacroExpandedIdentifier - If an identifier token is read that is to be +/// expanded as a macro, handle it and return the next token as 'Identifier'. +bool Preprocessor::HandleMacroExpandedIdentifier(Token &Identifier, + const MacroDefinition &M) { + MacroInfo *MI = M.getMacroInfo(); + + // If this is a macro expansion in the "#if !defined(x)" line for the file, + // then the macro could expand to different things in other contexts, we need + // to disable the optimization in this case. + if (CurPPLexer) CurPPLexer->MIOpt.ExpandedMacro(); + + // If this is a builtin macro, like __LINE__ or _Pragma, handle it specially. + if (MI->isBuiltinMacro()) { + if (Callbacks) + Callbacks->MacroExpands(Identifier, M, Identifier.getLocation(), + /*Args=*/nullptr); + ExpandBuiltinMacro(Identifier); + return true; + } + + /// Args - If this is a function-like macro expansion, this contains, + /// for each macro argument, the list of tokens that were provided to the + /// invocation. + MacroArgs *Args = nullptr; + + // Remember where the end of the expansion occurred. For an object-like + // macro, this is the identifier. For a function-like macro, this is the ')'. + SourceLocation ExpansionEnd = Identifier.getLocation(); + + // If this is a function-like macro, read the arguments. + if (MI->isFunctionLike()) { + // Remember that we are now parsing the arguments to a macro invocation. + // Preprocessor directives used inside macro arguments are not portable, and + // this enables the warning. + InMacroArgs = true; + ArgMacro = &Identifier; + + Args = ReadMacroCallArgumentList(Identifier, MI, ExpansionEnd); + + // Finished parsing args. + InMacroArgs = false; + ArgMacro = nullptr; + + // If there was an error parsing the arguments, bail out. + if (!Args) return true; + + ++NumFnMacroExpanded; + } else { + ++NumMacroExpanded; + } + + // Notice that this macro has been used. + markMacroAsUsed(MI); + + // Remember where the token is expanded. 
+ SourceLocation ExpandLoc = Identifier.getLocation(); + SourceRange ExpansionRange(ExpandLoc, ExpansionEnd); + + if (Callbacks) { + if (InMacroArgs) { + // We can have macro expansion inside a conditional directive while + // reading the function macro arguments. To ensure, in that case, that + // MacroExpands callbacks still happen in source order, queue this + // callback to have it happen after the function macro callback. + DelayedMacroExpandsCallbacks.push_back( + MacroExpandsInfo(Identifier, M, ExpansionRange)); + } else { + Callbacks->MacroExpands(Identifier, M, ExpansionRange, Args); + if (!DelayedMacroExpandsCallbacks.empty()) { + for (const MacroExpandsInfo &Info : DelayedMacroExpandsCallbacks) { + // FIXME: We lose macro args info with delayed callback. + Callbacks->MacroExpands(Info.Tok, Info.MD, Info.Range, + /*Args=*/nullptr); + } + DelayedMacroExpandsCallbacks.clear(); + } + } + } + + // If the macro definition is ambiguous, complain. + if (M.isAmbiguous()) { + Diag(Identifier, diag::warn_pp_ambiguous_macro) + << Identifier.getIdentifierInfo(); + Diag(MI->getDefinitionLoc(), diag::note_pp_ambiguous_macro_chosen) + << Identifier.getIdentifierInfo(); + M.forAllDefinitions([&](const MacroInfo *OtherMI) { + if (OtherMI != MI) + Diag(OtherMI->getDefinitionLoc(), diag::note_pp_ambiguous_macro_other) + << Identifier.getIdentifierInfo(); + }); + } + + // If we started lexing a macro, enter the macro expansion body. + + // If this macro expands to no tokens, don't bother to push it onto the + // expansion stack, only to take it right back off. + if (MI->getNumTokens() == 0) { + // No need for arg info. + if (Args) Args->destroy(*this); + + // Propagate whitespace info as if we had pushed, then popped, + // a macro context. 
+ Identifier.setFlag(Token::LeadingEmptyMacro); + PropagateLineStartLeadingSpaceInfo(Identifier); + ++NumFastMacroExpanded; + return false; + } else if (MI->getNumTokens() == 1 && + isTrivialSingleTokenExpansion(MI, Identifier.getIdentifierInfo(), + *this)) { + // Otherwise, if this macro expands into a single trivially-expanded + // token: expand it now. This handles common cases like + // "#define VAL 42". + + // No need for arg info. + if (Args) Args->destroy(*this); + + // Propagate the isAtStartOfLine/hasLeadingSpace markers of the macro + // identifier to the expanded token. + bool isAtStartOfLine = Identifier.isAtStartOfLine(); + bool hasLeadingSpace = Identifier.hasLeadingSpace(); + + // Replace the result token. + Identifier = MI->getReplacementToken(0); + + // Restore the StartOfLine/LeadingSpace markers. + Identifier.setFlagValue(Token::StartOfLine , isAtStartOfLine); + Identifier.setFlagValue(Token::LeadingSpace, hasLeadingSpace); + + // Update the tokens location to include both its expansion and physical + // locations. + SourceLocation Loc = + SourceMgr.createExpansionLoc(Identifier.getLocation(), ExpandLoc, + ExpansionEnd,Identifier.getLength()); + Identifier.setLocation(Loc); + + // If this is a disabled macro or #define X X, we must mark the result as + // unexpandable. + if (IdentifierInfo *NewII = Identifier.getIdentifierInfo()) { + if (MacroInfo *NewMI = getMacroInfo(NewII)) + if (!NewMI->isEnabled() || NewMI == MI) { + Identifier.setFlag(Token::DisableExpand); + // Don't warn for "#define X X" like "#define bool bool" from + // stdbool.h. + if (NewMI != MI || MI->isFunctionLike()) + Diag(Identifier, diag::pp_disabled_macro_expansion); + } + } + + // Since this is not an identifier token, it can't be macro expanded, so + // we're done. + ++NumFastMacroExpanded; + return true; + } + + // Start expanding the macro. 
+ EnterMacro(Identifier, ExpansionEnd, MI, Args); + return false; +} + +enum Bracket { + Brace, + Paren +}; + +/// CheckMatchedBrackets - Returns true if the braces and parentheses in the +/// token vector are properly nested. +static bool CheckMatchedBrackets(const SmallVectorImpl<Token> &Tokens) { + SmallVector<Bracket, 8> Brackets; + for (SmallVectorImpl<Token>::const_iterator I = Tokens.begin(), + E = Tokens.end(); + I != E; ++I) { + if (I->is(tok::l_paren)) { + Brackets.push_back(Paren); + } else if (I->is(tok::r_paren)) { + if (Brackets.empty() || Brackets.back() == Brace) + return false; + Brackets.pop_back(); + } else if (I->is(tok::l_brace)) { + Brackets.push_back(Brace); + } else if (I->is(tok::r_brace)) { + if (Brackets.empty() || Brackets.back() == Paren) + return false; + Brackets.pop_back(); + } + } + return Brackets.empty(); +} + +/// GenerateNewArgTokens - Returns true if OldTokens can be converted to a new +/// vector of tokens in NewTokens. The new number of arguments will be placed +/// in NumArgs and the ranges which need to surrounded in parentheses will be +/// in ParenHints. +/// Returns false if the token stream cannot be changed. If this is because +/// of an initializer list starting a macro argument, the range of those +/// initializer lists will be place in InitLists. +static bool GenerateNewArgTokens(Preprocessor &PP, + SmallVectorImpl<Token> &OldTokens, + SmallVectorImpl<Token> &NewTokens, + unsigned &NumArgs, + SmallVectorImpl<SourceRange> &ParenHints, + SmallVectorImpl<SourceRange> &InitLists) { + if (!CheckMatchedBrackets(OldTokens)) + return false; + + // Once it is known that the brackets are matched, only a simple count of the + // braces is needed. + unsigned Braces = 0; + + // First token of a new macro argument. + SmallVectorImpl<Token>::iterator ArgStartIterator = OldTokens.begin(); + + // First closing brace in a new macro argument. Used to generate + // SourceRanges for InitLists. 
+ SmallVectorImpl<Token>::iterator ClosingBrace = OldTokens.end(); + NumArgs = 0; + Token TempToken; + // Set to true when a macro separator token is found inside a braced list. + // If true, the fixed argument spans multiple old arguments and ParenHints + // will be updated. + bool FoundSeparatorToken = false; + for (SmallVectorImpl<Token>::iterator I = OldTokens.begin(), + E = OldTokens.end(); + I != E; ++I) { + if (I->is(tok::l_brace)) { + ++Braces; + } else if (I->is(tok::r_brace)) { + --Braces; + if (Braces == 0 && ClosingBrace == E && FoundSeparatorToken) + ClosingBrace = I; + } else if (I->is(tok::eof)) { + // EOF token is used to separate macro arguments + if (Braces != 0) { + // Assume comma separator is actually braced list separator and change + // it back to a comma. + FoundSeparatorToken = true; + I->setKind(tok::comma); + I->setLength(1); + } else { // Braces == 0 + // Separator token still separates arguments. + ++NumArgs; + + // If the argument starts with a brace, it can't be fixed with + // parentheses. A different diagnostic will be given. 
+ if (FoundSeparatorToken && ArgStartIterator->is(tok::l_brace)) { + InitLists.push_back( + SourceRange(ArgStartIterator->getLocation(), + PP.getLocForEndOfToken(ClosingBrace->getLocation()))); + ClosingBrace = E; + } + + // Add left paren + if (FoundSeparatorToken) { + TempToken.startToken(); + TempToken.setKind(tok::l_paren); + TempToken.setLocation(ArgStartIterator->getLocation()); + TempToken.setLength(0); + NewTokens.push_back(TempToken); + } + + // Copy over argument tokens + NewTokens.insert(NewTokens.end(), ArgStartIterator, I); + + // Add right paren and store the paren locations in ParenHints + if (FoundSeparatorToken) { + SourceLocation Loc = PP.getLocForEndOfToken((I - 1)->getLocation()); + TempToken.startToken(); + TempToken.setKind(tok::r_paren); + TempToken.setLocation(Loc); + TempToken.setLength(0); + NewTokens.push_back(TempToken); + ParenHints.push_back(SourceRange(ArgStartIterator->getLocation(), + Loc)); + } + + // Copy separator token + NewTokens.push_back(*I); + + // Reset values + ArgStartIterator = I + 1; + FoundSeparatorToken = false; + } + } + } + + return !ParenHints.empty() && InitLists.empty(); +} + +/// ReadFunctionLikeMacroArgs - After reading "MACRO" and knowing that the next +/// token is the '(' of the macro, this method is invoked to read all of the +/// actual arguments specified for the macro invocation. This returns null on +/// error. +MacroArgs *Preprocessor::ReadMacroCallArgumentList(Token &MacroName, + MacroInfo *MI, + SourceLocation &MacroEnd) { + // The number of fixed arguments to parse. + unsigned NumFixedArgsLeft = MI->getNumParams(); + bool isVariadic = MI->isVariadic(); + + // Outer loop, while there are more arguments, keep reading them. + Token Tok; + + // Read arguments as unexpanded tokens. This avoids issues, e.g., where + // an argument value in a macro could expand to ',' or '(' or ')'. 
+ LexUnexpandedToken(Tok); + assert(Tok.is(tok::l_paren) && "Error computing l-paren-ness?"); + + // ArgTokens - Build up a list of tokens that make up each argument. Each + // argument is separated by an EOF token. Use a SmallVector so we can avoid + // heap allocations in the common case. + SmallVector<Token, 64> ArgTokens; + bool ContainsCodeCompletionTok = false; + bool FoundElidedComma = false; + + SourceLocation TooManyArgsLoc; + + unsigned NumActuals = 0; + while (Tok.isNot(tok::r_paren)) { + if (ContainsCodeCompletionTok && Tok.isOneOf(tok::eof, tok::eod)) + break; + + assert(Tok.isOneOf(tok::l_paren, tok::comma) && + "only expect argument separators here"); + + size_t ArgTokenStart = ArgTokens.size(); + SourceLocation ArgStartLoc = Tok.getLocation(); + + // C99 6.10.3p11: Keep track of the number of l_parens we have seen. Note + // that we already consumed the first one. + unsigned NumParens = 0; + + while (true) { + // Read arguments as unexpanded tokens. This avoids issues, e.g., where + // an argument value in a macro could expand to ',' or '(' or ')'. + LexUnexpandedToken(Tok); + + if (Tok.isOneOf(tok::eof, tok::eod)) { // "#if f(<eof>" & "#if f(\n" + if (!ContainsCodeCompletionTok) { + Diag(MacroName, diag::err_unterm_macro_invoc); + Diag(MI->getDefinitionLoc(), diag::note_macro_here) + << MacroName.getIdentifierInfo(); + // Do not lose the EOF/EOD. Return it to the client. + MacroName = Tok; + return nullptr; + } + // Do not lose the EOF/EOD. + auto Toks = std::make_unique<Token[]>(1); + Toks[0] = Tok; + EnterTokenStream(std::move(Toks), 1, true, /*IsReinject*/ false); + break; + } else if (Tok.is(tok::r_paren)) { + // If we found the ) token, the macro arg list is done. 
+ if (NumParens-- == 0) { + MacroEnd = Tok.getLocation(); + if (!ArgTokens.empty() && + ArgTokens.back().commaAfterElided()) { + FoundElidedComma = true; + } + break; + } + } else if (Tok.is(tok::l_paren)) { + ++NumParens; + } else if (Tok.is(tok::comma) && NumParens == 0 && + !(Tok.getFlags() & Token::IgnoredComma)) { + // In Microsoft-compatibility mode, single commas from nested macro + // expansions should not be considered as argument separators. We test + // for this with the IgnoredComma token flag above. + + // Comma ends this argument if there are more fixed arguments expected. + // However, if this is a variadic macro, and this is part of the + // variadic part, then the comma is just an argument token. + if (!isVariadic) break; + if (NumFixedArgsLeft > 1) + break; + } else if (Tok.is(tok::comment) && !KeepMacroComments) { + // If this is a comment token in the argument list and we're just in + // -C mode (not -CC mode), discard the comment. + continue; + } else if (!Tok.isAnnotation() && Tok.getIdentifierInfo() != nullptr) { + // Reading macro arguments can cause macros that we are currently + // expanding from to be popped off the expansion stack. Doing so causes + // them to be reenabled for expansion. Here we record whether any + // identifiers we lex as macro arguments correspond to disabled macros. + // If so, we mark the token as noexpand. This is a subtle aspect of + // C99 6.10.3.4p2. + if (MacroInfo *MI = getMacroInfo(Tok.getIdentifierInfo())) + if (!MI->isEnabled()) + Tok.setFlag(Token::DisableExpand); + } else if (Tok.is(tok::code_completion)) { + ContainsCodeCompletionTok = true; + if (CodeComplete) + CodeComplete->CodeCompleteMacroArgument(MacroName.getIdentifierInfo(), + MI, NumActuals); + // Don't mark that we reached the code-completion point because the + // parser is going to handle the token and there will be another + // code-completion callback. 
+ } + + ArgTokens.push_back(Tok); + } + + // If this was an empty argument list foo(), don't add this as an empty + // argument. + if (ArgTokens.empty() && Tok.getKind() == tok::r_paren) + break; + + // If this is not a variadic macro, and too many args were specified, emit + // an error. + if (!isVariadic && NumFixedArgsLeft == 0 && TooManyArgsLoc.isInvalid()) { + if (ArgTokens.size() != ArgTokenStart) + TooManyArgsLoc = ArgTokens[ArgTokenStart].getLocation(); + else + TooManyArgsLoc = ArgStartLoc; + } + + // Empty arguments are standard in C99 and C++0x, and are supported as an + // extension in other modes. + if (ArgTokens.size() == ArgTokenStart && !LangOpts.C99) + Diag(Tok, LangOpts.CPlusPlus11 ? + diag::warn_cxx98_compat_empty_fnmacro_arg : + diag::ext_empty_fnmacro_arg); + + // Add a marker EOF token to the end of the token list for this argument. + Token EOFTok; + EOFTok.startToken(); + EOFTok.setKind(tok::eof); + EOFTok.setLocation(Tok.getLocation()); + EOFTok.setLength(0); + ArgTokens.push_back(EOFTok); + ++NumActuals; + if (!ContainsCodeCompletionTok && NumFixedArgsLeft != 0) + --NumFixedArgsLeft; + } + + // Okay, we either found the r_paren. Check to see if we parsed too few + // arguments. + unsigned MinArgsExpected = MI->getNumParams(); + + // If this is not a variadic macro, and too many args were specified, emit + // an error. + if (!isVariadic && NumActuals > MinArgsExpected && + !ContainsCodeCompletionTok) { + // Emit the diagnostic at the macro name in case there is a missing ). + // Emitting it at the , could be far away from the macro name. + Diag(TooManyArgsLoc, diag::err_too_many_args_in_macro_invoc); + Diag(MI->getDefinitionLoc(), diag::note_macro_here) + << MacroName.getIdentifierInfo(); + + // Commas from braced initializer lists will be treated as argument + // separators inside macros. Attempt to correct for this with parentheses. + // TODO: See if this can be generalized to angle brackets for templates + // inside macro arguments. 
+ + SmallVector<Token, 4> FixedArgTokens; + unsigned FixedNumArgs = 0; + SmallVector<SourceRange, 4> ParenHints, InitLists; + if (!GenerateNewArgTokens(*this, ArgTokens, FixedArgTokens, FixedNumArgs, + ParenHints, InitLists)) { + if (!InitLists.empty()) { + DiagnosticBuilder DB = + Diag(MacroName, + diag::note_init_list_at_beginning_of_macro_argument); + for (SourceRange Range : InitLists) + DB << Range; + } + return nullptr; + } + if (FixedNumArgs != MinArgsExpected) + return nullptr; + + DiagnosticBuilder DB = Diag(MacroName, diag::note_suggest_parens_for_macro); + for (SourceRange ParenLocation : ParenHints) { + DB << FixItHint::CreateInsertion(ParenLocation.getBegin(), "("); + DB << FixItHint::CreateInsertion(ParenLocation.getEnd(), ")"); + } + ArgTokens.swap(FixedArgTokens); + NumActuals = FixedNumArgs; + } + + // See MacroArgs instance var for description of this. + bool isVarargsElided = false; + + if (ContainsCodeCompletionTok) { + // Recover from not-fully-formed macro invocation during code-completion. + Token EOFTok; + EOFTok.startToken(); + EOFTok.setKind(tok::eof); + EOFTok.setLocation(Tok.getLocation()); + EOFTok.setLength(0); + for (; NumActuals < MinArgsExpected; ++NumActuals) + ArgTokens.push_back(EOFTok); + } + + if (NumActuals < MinArgsExpected) { + // There are several cases where too few arguments is ok, handle them now. + if (NumActuals == 0 && MinArgsExpected == 1) { + // #define A(X) or #define A(...) ---> A() + + // If there is exactly one argument, and that argument is missing, + // then we have an empty "()" argument empty list. This is fine, even if + // the macro expects one argument (the argument is just empty). + isVarargsElided = MI->isVariadic(); + } else if ((FoundElidedComma || MI->isVariadic()) && + (NumActuals+1 == MinArgsExpected || // A(x, ...) -> A(X) + (NumActuals == 0 && MinArgsExpected == 2))) {// A(x,...) -> A() + // Varargs where the named vararg parameter is missing: OK as extension. + // #define A(x, ...) 
+ // A("blah") + // + // If the macro contains the comma pasting extension, the diagnostic + // is suppressed; we know we'll get another diagnostic later. + if (!MI->hasCommaPasting()) { + Diag(Tok, diag::ext_missing_varargs_arg); + Diag(MI->getDefinitionLoc(), diag::note_macro_here) + << MacroName.getIdentifierInfo(); + } + + // Remember this occurred, allowing us to elide the comma when used for + // cases like: + // #define A(x, foo...) blah(a, ## foo) + // #define B(x, ...) blah(a, ## __VA_ARGS__) + // #define C(...) blah(a, ## __VA_ARGS__) + // A(x) B(x) C() + isVarargsElided = true; + } else if (!ContainsCodeCompletionTok) { + // Otherwise, emit the error. + Diag(Tok, diag::err_too_few_args_in_macro_invoc); + Diag(MI->getDefinitionLoc(), diag::note_macro_here) + << MacroName.getIdentifierInfo(); + return nullptr; + } + + // Add a marker EOF token to the end of the token list for this argument. + SourceLocation EndLoc = Tok.getLocation(); + Tok.startToken(); + Tok.setKind(tok::eof); + Tok.setLocation(EndLoc); + Tok.setLength(0); + ArgTokens.push_back(Tok); + + // If we expect two arguments, add both as empty. + if (NumActuals == 0 && MinArgsExpected == 2) + ArgTokens.push_back(Tok); + + } else if (NumActuals > MinArgsExpected && !MI->isVariadic() && + !ContainsCodeCompletionTok) { + // Emit the diagnostic at the macro name in case there is a missing ). + // Emitting it at the , could be far away from the macro name. + Diag(MacroName, diag::err_too_many_args_in_macro_invoc); + Diag(MI->getDefinitionLoc(), diag::note_macro_here) + << MacroName.getIdentifierInfo(); + return nullptr; + } + + return MacroArgs::create(MI, ArgTokens, isVarargsElided, *this); +} + +/// Keeps macro expanded tokens for TokenLexers. +// +/// Works like a stack; a TokenLexer adds the macro expanded tokens that is +/// going to lex in the cache and when it finishes the tokens are removed +/// from the end of the cache. 
/// Append \p tokens to the macro-expansion token cache and record which
/// TokenLexer they belong to.
///
/// Returns a pointer into MacroExpandedTokens where the copied tokens begin,
/// or nullptr when \p tokens is empty. Because the backing SmallVector may
/// reallocate on append, any TokenLexer already pointing into the cache must
/// have its Tokens pointer re-based after a growth — that is what the fixup
/// loop below does.
Token *Preprocessor::cacheMacroExpandedTokens(TokenLexer *tokLexer,
                                              ArrayRef<Token> tokens) {
  assert(tokLexer);
  if (tokens.empty())
    return nullptr;

  size_t newIndex = MacroExpandedTokens.size();
  // Growth happens exactly when the incoming tokens exceed the spare
  // capacity; detect it *before* appending so we know to re-base pointers.
  bool cacheNeedsToGrow = tokens.size() >
                      MacroExpandedTokens.capacity()-MacroExpandedTokens.size();
  MacroExpandedTokens.append(tokens.begin(), tokens.end());

  if (cacheNeedsToGrow) {
    // Go through all the TokenLexers whose 'Tokens' pointer points in the
    // buffer and update the pointers to the (potential) new buffer array.
    for (const auto &Lexer : MacroExpandingLexersStack) {
      TokenLexer *prevLexer;
      size_t tokIndex;
      std::tie(prevLexer, tokIndex) = Lexer;
      prevLexer->Tokens = MacroExpandedTokens.data() + tokIndex;
    }
  }

  // Remember (lexer, start index) so the entry can be popped when the lexer
  // finishes (see removeCachedMacroExpandedTokensOfLastLexer).
  MacroExpandingLexersStack.push_back(std::make_pair(tokLexer, newIndex));
  return MacroExpandedTokens.data() + newIndex;
}

/// Pop the most recently cached run of macro-expanded tokens.
///
/// The cache behaves as a stack: shrinking MacroExpandedTokens back to the
/// recorded start index discards exactly the tokens the top-most TokenLexer
/// contributed in cacheMacroExpandedTokens.
void Preprocessor::removeCachedMacroExpandedTokensOfLastLexer() {
  assert(!MacroExpandingLexersStack.empty());
  size_t tokIndex = MacroExpandingLexersStack.back().second;
  assert(tokIndex < MacroExpandedTokens.size());
  // Pop the cached macro expanded tokens from the end.
  MacroExpandedTokens.resize(tokIndex);
  MacroExpandingLexersStack.pop_back();
}

/// ComputeDATE_TIME - Compute the current time, enter it into the specified
/// scratch buffer, then return DATELoc/TIMELoc locations with the position of
/// the identifier tokens inserted.
/// Build the "Mmm dd yyyy" / "hh:mm:ss" string literals for __DATE__ and
/// __TIME__ once, caching their locations in the out-parameters so repeated
/// expansions of either macro reuse the same scratch-buffer tokens.
/// NOTE(review): uses localtime(), which is not re-entrant — presumably fine
/// because the preprocessor runs single-threaded; confirm if that changes.
static void ComputeDATE_TIME(SourceLocation &DATELoc, SourceLocation &TIMELoc,
                             Preprocessor &PP) {
  time_t TT = time(nullptr);
  struct tm *TM = localtime(&TT);

  static const char * const Months[] = {
    "Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"
  };

  {
    SmallString<32> TmpBuffer;
    llvm::raw_svector_ostream TmpStream(TmpBuffer);
    // %2d / %4d match the C-standard "Mmm dd yyyy" layout of __DATE__.
    TmpStream << llvm::format("\"%s %2d %4d\"", Months[TM->tm_mon],
                              TM->tm_mday, TM->tm_year + 1900);
    Token TmpTok;
    TmpTok.startToken();
    PP.CreateString(TmpStream.str(), TmpTok);
    DATELoc = TmpTok.getLocation();
  }

  {
    SmallString<32> TmpBuffer;
    llvm::raw_svector_ostream TmpStream(TmpBuffer);
    TmpStream << llvm::format("\"%02d:%02d:%02d\"",
                              TM->tm_hour, TM->tm_min, TM->tm_sec);
    Token TmpTok;
    TmpTok.startToken();
    PP.CreateString(TmpStream.str(), TmpTok);
    TIMELoc = TmpTok.getLocation();
  }
}

/// HasFeature - Return true if we recognize and implement the feature
/// specified by the identifier as a standard language feature.
/// The feature table itself lives in clang/Basic/Features.def; this function
/// only normalizes the spelling and dispatches through a StringSwitch.
static bool HasFeature(const Preprocessor &PP, StringRef Feature) {
  // LangOpts is referenced by the Predicate expressions stamped out from
  // Features.def below.
  const LangOptions &LangOpts = PP.getLangOpts();

  // Normalize the feature name, __foo__ becomes foo.
  if (Feature.startswith("__") && Feature.endswith("__") && Feature.size() >= 4)
    Feature = Feature.substr(2, Feature.size() - 4);

#define FEATURE(Name, Predicate) .Case(#Name, Predicate)
  return llvm::StringSwitch<bool>(Feature)
#include "clang/Basic/Features.def"
      .Default(false);
#undef FEATURE
}

/// HasExtension - Return true if we recognize and implement the feature
/// specified by the identifier, either as an extension or a standard language
/// feature.
static bool HasExtension(const Preprocessor &PP, StringRef Extension) {
  // Every feature is also an extension, so delegate first.
  if (HasFeature(PP, Extension))
    return true;

  // If the use of an extension results in an error diagnostic, extensions are
  // effectively unavailable, so just return false here.
  if (PP.getDiagnostics().getExtensionHandlingBehavior() >=
      diag::Severity::Error)
    return false;

  const LangOptions &LangOpts = PP.getLangOpts();

  // Normalize the extension name, __foo__ becomes foo.
  if (Extension.startswith("__") && Extension.endswith("__") &&
      Extension.size() >= 4)
    Extension = Extension.substr(2, Extension.size() - 4);

  // Because we inherit the feature list from HasFeature, this string switch
  // must be less restrictive than HasFeature's.
#define EXTENSION(Name, Predicate) .Case(#Name, Predicate)
  return llvm::StringSwitch<bool>(Extension)
#include "clang/Basic/Features.def"
      .Default(false);
#undef EXTENSION
}

/// EvaluateHasIncludeCommon - Process a '__has_include("path")'
/// or '__has_include_next("path")' expression.
/// Returns true if successful.
///
/// On any parse error this emits a diagnostic and returns false (caller treats
/// the expression as 0); on success the return value is whether the named
/// header was found by LookupFile. \p LookupFrom / \p LookupFromFile narrow
/// the search for the __has_include_next variant.
static bool EvaluateHasIncludeCommon(Token &Tok,
                                     IdentifierInfo *II, Preprocessor &PP,
                                     const DirectoryLookup *LookupFrom,
                                     const FileEntry *LookupFromFile) {
  // Save the location of the current token. If a '(' is later found, use
  // that location. If not, use the end of this location instead.
  SourceLocation LParenLoc = Tok.getLocation();

  // These expressions are only allowed within a preprocessor directive.
  if (!PP.isParsingIfOrElifDirective()) {
    PP.Diag(LParenLoc, diag::err_pp_directive_required) << II;
    // Return a valid identifier token.
    assert(Tok.is(tok::identifier));
    Tok.setIdentifierInfo(II);
    return false;
  }

  // Get '('. If we don't have a '(', try to form a header-name token.
  do {
    if (PP.LexHeaderName(Tok))
      return false;
  } while (Tok.getKind() == tok::comment);

  // Ensure we have a '('.
  if (Tok.isNot(tok::l_paren)) {
    // No '(', use end of last token.
    LParenLoc = PP.getLocForEndOfToken(LParenLoc);
    PP.Diag(LParenLoc, diag::err_pp_expected_after) << II << tok::l_paren;
    // If the next token looks like a filename or the start of one,
    // assume it is and process it as such.
    if (Tok.isNot(tok::header_name))
      return false;
  } else {
    // Save '(' location for possible missing ')' message.
    LParenLoc = Tok.getLocation();
    if (PP.LexHeaderName(Tok))
      return false;
  }

  if (Tok.isNot(tok::header_name)) {
    PP.Diag(Tok.getLocation(), diag::err_pp_expects_filename);
    return false;
  }

  // Reserve a buffer to get the spelling.
  SmallString<128> FilenameBuffer;
  bool Invalid = false;
  StringRef Filename = PP.getSpelling(Tok, FilenameBuffer, &Invalid);
  if (Invalid)
    return false;

  SourceLocation FilenameLoc = Tok.getLocation();

  // Get ')'.
  PP.LexNonComment(Tok);

  // Ensure we have a trailing ).
  if (Tok.isNot(tok::r_paren)) {
    PP.Diag(PP.getLocForEndOfToken(FilenameLoc), diag::err_pp_expected_after)
        << II << tok::r_paren;
    PP.Diag(LParenLoc, diag::note_matching) << tok::l_paren;
    return false;
  }

  bool isAngled = PP.GetIncludeFilenameSpelling(Tok.getLocation(), Filename);
  // If GetIncludeFilenameSpelling set the start ptr to null, there was an
  // error.
  if (Filename.empty())
    return false;

  // Search include directories.
  const DirectoryLookup *CurDir;
  Optional<FileEntryRef> File =
      PP.LookupFile(FilenameLoc, Filename, isAngled, LookupFrom, LookupFromFile,
                    CurDir, nullptr, nullptr, nullptr, nullptr, nullptr);

  // Let observers (e.g. dependency scanners) see the probe even though no
  // file is actually entered.
  if (PPCallbacks *Callbacks = PP.getPPCallbacks()) {
    SrcMgr::CharacteristicKind FileType = SrcMgr::C_User;
    if (File)
      FileType =
          PP.getHeaderSearchInfo().getFileDirFlavor(&File->getFileEntry());
    Callbacks->HasInclude(FilenameLoc, Filename, isAngled, File, FileType);
  }

  // Get the result value. A result of true means the file exists.
  return File.hasValue();
}

/// EvaluateHasInclude - Process a '__has_include("path")' expression.
/// Returns true if successful.
/// Process a '__has_include("path")' expression: a plain search with no
/// starting-directory restriction (contrast EvaluateHasIncludeNext).
static bool EvaluateHasInclude(Token &Tok, IdentifierInfo *II,
                               Preprocessor &PP) {
  return EvaluateHasIncludeCommon(Tok, II, PP, nullptr, nullptr);
}

/// EvaluateHasIncludeNext - Process '__has_include_next("path")' expression.
/// Returns true if successful.
static bool EvaluateHasIncludeNext(Token &Tok,
                                   IdentifierInfo *II, Preprocessor &PP) {
  // __has_include_next is like __has_include, except that we start
  // searching after the current found directory. If we can't do this,
  // issue a diagnostic.
  // FIXME: Factor out duplication with
  // Preprocessor::HandleIncludeNextDirective.
  const DirectoryLookup *Lookup = PP.GetCurDirLookup();
  const FileEntry *LookupFromFile = nullptr;
  if (PP.isInPrimaryFile() && PP.getLangOpts().IsHeaderFile) {
    // If the main file is a header, then it's either for PCH/AST generation,
    // or libclang opened it. Either way, handle it as a normal include below
    // and do not complain about __has_include_next.
  } else if (PP.isInPrimaryFile()) {
    Lookup = nullptr;
    PP.Diag(Tok, diag::pp_include_next_in_primary);
  } else if (PP.getCurrentLexerSubmodule()) {
    // Start looking up in the directory *after* the one in which the current
    // file would be found, if any.
    assert(PP.getCurrentLexer() && "#include_next directive in macro?");
    LookupFromFile = PP.getCurrentLexer()->getFileEntry();
    Lookup = nullptr;
  } else if (!Lookup) {
    PP.Diag(Tok, diag::pp_include_next_absolute_path);
  } else {
    // Start looking up in the next directory.
    ++Lookup;
  }

  return EvaluateHasIncludeCommon(Tok, II, PP, Lookup, LookupFromFile);
}

/// Process single-argument builtin feature-like macros that return
/// integer values.
///
/// Parses '(' <argument> ')' around the caller-supplied \p Op, writing the
/// integer result (or a recovery value of 0) to \p OS and leaving Tok as a
/// numeric_constant. \p Op may lex ahead itself; it signals that by setting
/// HasLexedNextTok, which makes the loop re-dispatch on the already-lexed
/// token (the 'already_lexed' label) instead of lexing a fresh one.
static void EvaluateFeatureLikeBuiltinMacro(llvm::raw_svector_ostream& OS,
                                            Token &Tok, IdentifierInfo *II,
                                            Preprocessor &PP,
                                            llvm::function_ref<
                                              int(Token &Tok,
                                                  bool &HasLexedNextTok)> Op) {
  // Parse the initial '('.
  PP.LexUnexpandedToken(Tok);
  if (Tok.isNot(tok::l_paren)) {
    PP.Diag(Tok.getLocation(), diag::err_pp_expected_after) << II
                                                            << tok::l_paren;

    // Provide a dummy '0' value on output stream to elide further errors.
    if (!Tok.isOneOf(tok::eof, tok::eod)) {
      OS << 0;
      Tok.setKind(tok::numeric_constant);
    }
    return;
  }

  unsigned ParenDepth = 1;
  SourceLocation LParenLoc = Tok.getLocation();
  llvm::Optional<int> Result;

  Token ResultTok;
  // Once set, further errors for this invocation are swallowed so the user
  // sees only the first problem.
  bool SuppressDiagnostic = false;
  while (true) {
    // Parse next token.
    PP.LexUnexpandedToken(Tok);

already_lexed:
    switch (Tok.getKind()) {
    case tok::eof:
    case tok::eod:
      // Don't provide even a dummy value if the eod or eof marker is
      // reached. Simply provide a diagnostic.
      PP.Diag(Tok.getLocation(), diag::err_unterm_macro_invoc);
      return;

    case tok::comma:
      if (!SuppressDiagnostic) {
        PP.Diag(Tok.getLocation(), diag::err_too_many_args_in_macro_invoc);
        SuppressDiagnostic = true;
      }
      continue;

    case tok::l_paren:
      ++ParenDepth;
      if (Result.hasValue())
        break;
      if (!SuppressDiagnostic) {
        PP.Diag(Tok.getLocation(), diag::err_pp_nested_paren) << II;
        SuppressDiagnostic = true;
      }
      continue;

    case tok::r_paren:
      if (--ParenDepth > 0)
        continue;

      // The last ')' has been reached; return the value if one found or
      // a diagnostic and a dummy value.
      if (Result.hasValue()) {
        OS << Result.getValue();
        // For strict conformance to __has_cpp_attribute rules, use 'L'
        // suffix for dated literals.
        if (Result.getValue() > 1)
          OS << 'L';
      } else {
        OS << 0;
        if (!SuppressDiagnostic)
          PP.Diag(Tok.getLocation(), diag::err_too_few_args_in_macro_invoc);
      }
      Tok.setKind(tok::numeric_constant);
      return;

    default: {
      // Parse the macro argument, if one not found so far.
      if (Result.hasValue())
        break;

      bool HasLexedNextToken = false;
      Result = Op(Tok, HasLexedNextToken);
      ResultTok = Tok;
      if (HasLexedNextToken)
        goto already_lexed;
      continue;
    }
    }

    // Diagnose missing ')'.
    // Reached via the 'break' paths above: a token appeared after the
    // argument was already evaluated.
    if (!SuppressDiagnostic) {
      if (auto Diag = PP.Diag(Tok.getLocation(), diag::err_pp_expected_after)) {
        if (IdentifierInfo *LastII = ResultTok.getIdentifierInfo())
          Diag << LastII;
        else
          Diag << ResultTok.getKind();
        Diag << tok::r_paren << ResultTok.getLocation();
      }
      PP.Diag(LParenLoc, diag::note_matching) << tok::l_paren;
      SuppressDiagnostic = true;
    }
  }
}

/// Helper function to return the IdentifierInfo structure of a Token
/// or generate a diagnostic if none available.
static IdentifierInfo *ExpectFeatureIdentifierInfo(Token &Tok,
                                                   Preprocessor &PP,
                                                   signed DiagID) {
  IdentifierInfo *II;
  if (!Tok.isAnnotation() && (II = Tok.getIdentifierInfo()))
    return II;

  PP.Diag(Tok.getLocation(), DiagID);
  return nullptr;
}

/// Implements the __is_target_arch builtin macro.
static bool isTargetArch(const TargetInfo &TI, const IdentifierInfo *II) {
  // Parse the queried name as the arch component of a bare triple.
  std::string ArchName = II->getName().lower() + "--";
  llvm::Triple Arch(ArchName);
  const llvm::Triple &TT = TI.getTriple();
  if (TT.isThumb()) {
    // arm matches thumb or thumbv7. armv7 matches thumbv7.
    if ((Arch.getSubArch() == llvm::Triple::NoSubArch ||
         Arch.getSubArch() == TT.getSubArch()) &&
        ((TT.getArch() == llvm::Triple::thumb &&
          Arch.getArch() == llvm::Triple::arm) ||
         (TT.getArch() == llvm::Triple::thumbeb &&
          Arch.getArch() == llvm::Triple::armeb)))
      return true;
  }
  // Check the parsed arch when it has no sub arch to allow Clang to
  // match thumb to thumbv7 but to prohibit matching thumbv6 to thumbv7.
  return (Arch.getSubArch() == llvm::Triple::NoSubArch ||
          Arch.getSubArch() == TT.getSubArch()) &&
         Arch.getArch() == TT.getArch();
}

/// Implements the __is_target_vendor builtin macro.
/// Implements the __is_target_vendor builtin macro.
static bool isTargetVendor(const TargetInfo &TI, const IdentifierInfo *II) {
  // An unnamed vendor in the triple is reported as "unknown" so that
  // __is_target_vendor(unknown) matches it.
  StringRef VendorName = TI.getTriple().getVendorName();
  if (VendorName.empty())
    VendorName = "unknown";
  return VendorName.equals_lower(II->getName());
}

/// Implements the __is_target_os builtin macro.
static bool isTargetOS(const TargetInfo &TI, const IdentifierInfo *II) {
  // Parse the queried name as the OS component of a synthetic triple.
  std::string OSName =
      (llvm::Twine("unknown-unknown-") + II->getName().lower()).str();
  llvm::Triple OS(OSName);
  if (OS.getOS() == llvm::Triple::Darwin) {
    // Darwin matches macos, ios, etc.
    return TI.getTriple().isOSDarwin();
  }
  return TI.getTriple().getOS() == OS.getOS();
}

/// Implements the __is_target_environment builtin macro.
static bool isTargetEnvironment(const TargetInfo &TI,
                                const IdentifierInfo *II) {
  // Parse the queried name as the environment component of a triple.
  std::string EnvName = (llvm::Twine("---") + II->getName().lower()).str();
  llvm::Triple Env(EnvName);
  return TI.getTriple().getEnvironment() == Env.getEnvironment();
}

/// ExpandBuiltinMacro - If an identifier token is read that is to be expanded
/// as a builtin macro, handle it and return the next token as 'Tok'.
///
/// The expansion text is accumulated into OS and turned back into a single
/// token via CreateString at the bottom; branches that create their own
/// expansion location (__DATE__, __TIME__) return early instead.
void Preprocessor::ExpandBuiltinMacro(Token &Tok) {
  // Figure out which token this is.
  IdentifierInfo *II = Tok.getIdentifierInfo();
  assert(II && "Can't be a macro without id info!");

  // If this is an _Pragma or Microsoft __pragma directive, expand it,
  // invoke the pragma handler, then lex the token after it.
  if (II == Ident_Pragma)
    return Handle_Pragma(Tok);
  else if (II == Ident__pragma) // in non-MS mode this is null
    return HandleMicrosoft__pragma(Tok);

  ++NumBuiltinMacroExpanded;

  SmallString<128> TmpBuffer;
  llvm::raw_svector_ostream OS(TmpBuffer);

  // Set up the return result.
  Tok.setIdentifierInfo(nullptr);
  Tok.clearFlag(Token::NeedsCleaning);
  // Preserve these flags across the rewrite of Tok; they are re-applied at
  // the end of this function.
  bool IsAtStartOfLine = Tok.isAtStartOfLine();
  bool HasLeadingSpace = Tok.hasLeadingSpace();

  if (II == Ident__LINE__) {
    // C99 6.10.8: "__LINE__: The presumed line number (within the current
    // source file) of the current source line (an integer constant)". This can
    // be affected by #line.
    SourceLocation Loc = Tok.getLocation();

    // Advance to the location of the first _, this might not be the first byte
    // of the token if it starts with an escaped newline.
    Loc = AdvanceToTokenCharacter(Loc, 0);

    // One wrinkle here is that GCC expands __LINE__ to location of the *end* of
    // a macro expansion. This doesn't matter for object-like macros, but
    // can matter for a function-like macro that expands to contain __LINE__.
    // Skip down through expansion points until we find a file loc for the
    // end of the expansion history.
    Loc = SourceMgr.getExpansionRange(Loc).getEnd();
    PresumedLoc PLoc = SourceMgr.getPresumedLoc(Loc);

    // __LINE__ expands to a simple numeric value.
    OS << (PLoc.isValid()? PLoc.getLine() : 1);
    Tok.setKind(tok::numeric_constant);
  } else if (II == Ident__FILE__ || II == Ident__BASE_FILE__ ||
             II == Ident__FILE_NAME__) {
    // C99 6.10.8: "__FILE__: The presumed name of the current source file (a
    // character string literal)". This can be affected by #line.
    PresumedLoc PLoc = SourceMgr.getPresumedLoc(Tok.getLocation());

    // __BASE_FILE__ is a GNU extension that returns the top of the presumed
    // #include stack instead of the current file.
    if (II == Ident__BASE_FILE__ && PLoc.isValid()) {
      SourceLocation NextLoc = PLoc.getIncludeLoc();
      while (NextLoc.isValid()) {
        PLoc = SourceMgr.getPresumedLoc(NextLoc);
        if (PLoc.isInvalid())
          break;

        NextLoc = PLoc.getIncludeLoc();
      }
    }

    // Escape this filename. Turn '\' -> '\\' '"' -> '\"'
    SmallString<128> FN;
    if (PLoc.isValid()) {
      // __FILE_NAME__ is a Clang-specific extension that expands to the
      // the last part of __FILE__.
      if (II == Ident__FILE_NAME__) {
        // Try to get the last path component, failing that return the original
        // presumed location.
        StringRef PLFileName = llvm::sys::path::filename(PLoc.getFilename());
        if (PLFileName != "")
          FN += PLFileName;
        else
          FN += PLoc.getFilename();
      } else {
        FN += PLoc.getFilename();
      }
      Lexer::Stringify(FN);
      OS << '"' << FN << '"';
    }
    Tok.setKind(tok::string_literal);
  } else if (II == Ident__DATE__) {
    Diag(Tok.getLocation(), diag::warn_pp_date_time);
    if (!DATELoc.isValid())
      ComputeDATE_TIME(DATELoc, TIMELoc, *this);
    Tok.setKind(tok::string_literal);
    Tok.setLength(strlen("\"Mmm dd yyyy\""));
    // Point the token at the cached __DATE__ string; no OS text is produced,
    // so return without the CreateString call at the bottom.
    Tok.setLocation(SourceMgr.createExpansionLoc(DATELoc, Tok.getLocation(),
                                                 Tok.getLocation(),
                                                 Tok.getLength()));
    return;
  } else if (II == Ident__TIME__) {
    Diag(Tok.getLocation(), diag::warn_pp_date_time);
    if (!TIMELoc.isValid())
      ComputeDATE_TIME(DATELoc, TIMELoc, *this);
    Tok.setKind(tok::string_literal);
    Tok.setLength(strlen("\"hh:mm:ss\""));
    Tok.setLocation(SourceMgr.createExpansionLoc(TIMELoc, Tok.getLocation(),
                                                 Tok.getLocation(),
                                                 Tok.getLength()));
    return;
  } else if (II == Ident__INCLUDE_LEVEL__) {
    // Compute the presumed include depth of this token. This can be affected
    // by GNU line markers.
    unsigned Depth = 0;

    PresumedLoc PLoc = SourceMgr.getPresumedLoc(Tok.getLocation());
    if (PLoc.isValid()) {
      PLoc = SourceMgr.getPresumedLoc(PLoc.getIncludeLoc());
      for (; PLoc.isValid(); ++Depth)
        PLoc = SourceMgr.getPresumedLoc(PLoc.getIncludeLoc());
    }

    // __INCLUDE_LEVEL__ expands to a simple numeric value.
    OS << Depth;
    Tok.setKind(tok::numeric_constant);
  } else if (II == Ident__TIMESTAMP__) {
    Diag(Tok.getLocation(), diag::warn_pp_date_time);
    // MSVC, ICC, GCC, VisualAge C++ extension. The generated string should be
    // of the form "Ddd Mmm dd hh::mm::ss yyyy", which is returned by asctime.

    // Get the file that we are lexing out of. If we're currently lexing from
    // a macro, dig into the include stack.
    const FileEntry *CurFile = nullptr;
    PreprocessorLexer *TheLexer = getCurrentFileLexer();

    if (TheLexer)
      CurFile = SourceMgr.getFileEntryForID(TheLexer->getFileID());

    const char *Result;
    if (CurFile) {
      time_t TT = CurFile->getModificationTime();
      struct tm *TM = localtime(&TT);
      Result = asctime(TM);
    } else {
      Result = "??? ??? ?? ??:??:?? ????\n";
    }
    // Surround the string with " and strip the trailing newline.
    OS << '"' << StringRef(Result).drop_back() << '"';
    Tok.setKind(tok::string_literal);
  } else if (II == Ident__COUNTER__) {
    // __COUNTER__ expands to a simple numeric value.
    OS << CounterValue++;
    Tok.setKind(tok::numeric_constant);
  } else if (II == Ident__has_feature) {
    EvaluateFeatureLikeBuiltinMacro(OS, Tok, II, *this,
      [this](Token &Tok, bool &HasLexedNextToken) -> int {
        IdentifierInfo *II = ExpectFeatureIdentifierInfo(Tok, *this,
                                           diag::err_feature_check_malformed);
        return II && HasFeature(*this, II->getName());
      });
  } else if (II == Ident__has_extension) {
    EvaluateFeatureLikeBuiltinMacro(OS, Tok, II, *this,
      [this](Token &Tok, bool &HasLexedNextToken) -> int {
        IdentifierInfo *II = ExpectFeatureIdentifierInfo(Tok, *this,
                                           diag::err_feature_check_malformed);
        return II && HasExtension(*this, II->getName());
      });
  } else if (II == Ident__has_builtin) {
    EvaluateFeatureLikeBuiltinMacro(OS, Tok, II, *this,
      [this](Token &Tok, bool &HasLexedNextToken) -> int {
        IdentifierInfo *II = ExpectFeatureIdentifierInfo(Tok, *this,
                                           diag::err_feature_check_malformed);
        const LangOptions &LangOpts = getLangOpts();
        if (!II)
          return false;
        else if (II->getBuiltinID() != 0) {
          switch (II->getBuiltinID()) {
          case Builtin::BI__builtin_operator_new:
          case Builtin::BI__builtin_operator_delete:
            // denotes date of behavior change to support calling arbitrary
            // usual allocation and deallocation functions. Required by libc++
            return 201802;
          default:
            return true;
          }
          return true;
        } else if (II->getTokenID() != tok::identifier ||
                   II->hasRevertedTokenIDToIdentifier()) {
          // Treat all keywords that introduce a custom syntax of the form
          //
          //   '__some_keyword' '(' [...] ')'
          //
          // as being "builtin functions", even if the syntax isn't a valid
          // function call (for example, because the builtin takes a type
          // argument).
          if (II->getName().startswith("__builtin_") ||
              II->getName().startswith("__is_") ||
              II->getName().startswith("__has_"))
            return true;
          return llvm::StringSwitch<bool>(II->getName())
              .Case("__array_rank", true)
              .Case("__array_extent", true)
              .Case("__reference_binds_to_temporary", true)
              .Case("__underlying_type", true)
              .Default(false);
        } else {
          return llvm::StringSwitch<bool>(II->getName())
              // Report builtin templates as being builtins.
              .Case("__make_integer_seq", LangOpts.CPlusPlus)
              .Case("__type_pack_element", LangOpts.CPlusPlus)
              // Likewise for some builtin preprocessor macros.
              // FIXME: This is inconsistent; we usually suggest detecting
              // builtin macros via #ifdef. Don't add more cases here.
              .Case("__is_target_arch", true)
              .Case("__is_target_vendor", true)
              .Case("__is_target_os", true)
              .Case("__is_target_environment", true)
              .Default(false);
        }
      });
  } else if (II == Ident__is_identifier) {
    EvaluateFeatureLikeBuiltinMacro(OS, Tok, II, *this,
      [](Token &Tok, bool &HasLexedNextToken) -> int {
        return Tok.is(tok::identifier);
      });
  } else if (II == Ident__has_attribute) {
    EvaluateFeatureLikeBuiltinMacro(OS, Tok, II, *this,
      [this](Token &Tok, bool &HasLexedNextToken) -> int {
        IdentifierInfo *II = ExpectFeatureIdentifierInfo(Tok, *this,
                                           diag::err_feature_check_malformed);
        return II ? hasAttribute(AttrSyntax::GNU, nullptr, II,
                                 getTargetInfo(), getLangOpts()) : 0;
      });
  } else if (II == Ident__has_declspec) {
    EvaluateFeatureLikeBuiltinMacro(OS, Tok, II, *this,
      [this](Token &Tok, bool &HasLexedNextToken) -> int {
        IdentifierInfo *II = ExpectFeatureIdentifierInfo(Tok, *this,
                                           diag::err_feature_check_malformed);
        return II ? hasAttribute(AttrSyntax::Declspec, nullptr, II,
                                 getTargetInfo(), getLangOpts()) : 0;
      });
  } else if (II == Ident__has_cpp_attribute ||
             II == Ident__has_c_attribute) {
    bool IsCXX = II == Ident__has_cpp_attribute;
    EvaluateFeatureLikeBuiltinMacro(
        OS, Tok, II, *this, [&](Token &Tok, bool &HasLexedNextToken) -> int {
          IdentifierInfo *ScopeII = nullptr;
          IdentifierInfo *II = ExpectFeatureIdentifierInfo(
              Tok, *this, diag::err_feature_check_malformed);
          if (!II)
            return false;

          // It is possible to receive a scope token. Read the "::", if it is
          // available, and the subsequent identifier.
          LexUnexpandedToken(Tok);
          if (Tok.isNot(tok::coloncolon))
            HasLexedNextToken = true;
          else {
            ScopeII = II;
            LexUnexpandedToken(Tok);
            II = ExpectFeatureIdentifierInfo(Tok, *this,
                                             diag::err_feature_check_malformed);
          }

          AttrSyntax Syntax = IsCXX ? AttrSyntax::CXX : AttrSyntax::C;
          return II ? hasAttribute(Syntax, ScopeII, II, getTargetInfo(),
                                   getLangOpts())
                    : 0;
        });
  } else if (II == Ident__has_include ||
             II == Ident__has_include_next) {
    // The argument to these two builtins should be a parenthesized
    // file name string literal using angle brackets (<>) or
    // double-quotes ("").
    bool Value;
    if (II == Ident__has_include)
      Value = EvaluateHasInclude(Tok, II, *this);
    else
      Value = EvaluateHasIncludeNext(Tok, II, *this);

    // Only produce a value if parsing reached the closing ')'.
    if (Tok.isNot(tok::r_paren))
      return;
    OS << (int)Value;
    Tok.setKind(tok::numeric_constant);
  } else if (II == Ident__has_warning) {
    // The argument should be a parenthesized string literal.
    EvaluateFeatureLikeBuiltinMacro(OS, Tok, II, *this,
      [this](Token &Tok, bool &HasLexedNextToken) -> int {
        std::string WarningName;
        SourceLocation StrStartLoc = Tok.getLocation();

        HasLexedNextToken = Tok.is(tok::string_literal);
        if (!FinishLexStringLiteral(Tok, WarningName, "'__has_warning'",
                                    /*AllowMacroExpansion=*/false))
          return false;

        // FIXME: Should we accept "-R..." flags here, or should that be
        // handled by a separate __has_remark?
        if (WarningName.size() < 3 || WarningName[0] != '-' ||
            WarningName[1] != 'W') {
          Diag(StrStartLoc, diag::warn_has_warning_invalid_option);
          return false;
        }

        // Finally, check if the warning flags maps to a diagnostic group.
        // We construct a SmallVector here to talk to getDiagnosticIDs().
        // Although we don't use the result, this isn't a hot path, and not
        // worth special casing.
        SmallVector<diag::kind, 10> Diags;
        return !getDiagnostics().getDiagnosticIDs()->
                getDiagnosticsInGroup(diag::Flavor::WarningOrError,
                                      WarningName.substr(2), Diags);
      });
  } else if (II == Ident__building_module) {
    // The argument to this builtin should be an identifier. The
    // builtin evaluates to 1 when that identifier names the module we are
    // currently building.
    EvaluateFeatureLikeBuiltinMacro(OS, Tok, II, *this,
      [this](Token &Tok, bool &HasLexedNextToken) -> int {
        IdentifierInfo *II = ExpectFeatureIdentifierInfo(Tok, *this,
                                     diag::err_expected_id_building_module);
        return getLangOpts().isCompilingModule() && II &&
               (II->getName() == getLangOpts().CurrentModule);
      });
  } else if (II == Ident__MODULE__) {
    // The current module as an identifier.
    OS << getLangOpts().CurrentModule;
    IdentifierInfo *ModuleII = getIdentifierInfo(getLangOpts().CurrentModule);
    Tok.setIdentifierInfo(ModuleII);
    Tok.setKind(ModuleII->getTokenID());
  } else if (II == Ident__identifier) {
    SourceLocation Loc = Tok.getLocation();

    // We're expecting '__identifier' '(' identifier ')'. Try to recover
    // if the parens are missing.
    LexNonComment(Tok);
    if (Tok.isNot(tok::l_paren)) {
      // No '(', use end of last token.
      Diag(getLocForEndOfToken(Loc), diag::err_pp_expected_after)
        << II << tok::l_paren;
      // If the next token isn't valid as our argument, we can't recover.
      if (!Tok.isAnnotation() && Tok.getIdentifierInfo())
        Tok.setKind(tok::identifier);
      return;
    }

    SourceLocation LParenLoc = Tok.getLocation();
    LexNonComment(Tok);

    if (!Tok.isAnnotation() && Tok.getIdentifierInfo())
      Tok.setKind(tok::identifier);
    else {
      Diag(Tok.getLocation(), diag::err_pp_identifier_arg_not_identifier)
        << Tok.getKind();
      // Don't walk past anything that's not a real token.
      if (Tok.isOneOf(tok::eof, tok::eod) || Tok.isAnnotation())
        return;
    }

    // Discard the ')', preserving 'Tok' as our result.
    Token RParen;
    LexNonComment(RParen);
    if (RParen.isNot(tok::r_paren)) {
      Diag(getLocForEndOfToken(Tok.getLocation()), diag::err_pp_expected_after)
        << Tok.getKind() << tok::r_paren;
      Diag(LParenLoc, diag::note_matching) << tok::l_paren;
    }
    return;
  } else if (II == Ident__is_target_arch) {
    EvaluateFeatureLikeBuiltinMacro(
        OS, Tok, II, *this, [this](Token &Tok, bool &HasLexedNextToken) -> int {
          IdentifierInfo *II = ExpectFeatureIdentifierInfo(
              Tok, *this, diag::err_feature_check_malformed);
          return II && isTargetArch(getTargetInfo(), II);
        });
  } else if (II == Ident__is_target_vendor) {
    EvaluateFeatureLikeBuiltinMacro(
        OS, Tok, II, *this, [this](Token &Tok, bool &HasLexedNextToken) -> int {
          IdentifierInfo *II = ExpectFeatureIdentifierInfo(
              Tok, *this, diag::err_feature_check_malformed);
          return II && isTargetVendor(getTargetInfo(), II);
        });
  } else if (II == Ident__is_target_os) {
    EvaluateFeatureLikeBuiltinMacro(
        OS, Tok, II, *this, [this](Token &Tok, bool &HasLexedNextToken) -> int {
          IdentifierInfo *II = ExpectFeatureIdentifierInfo(
              Tok, *this, diag::err_feature_check_malformed);
          return II && isTargetOS(getTargetInfo(), II);
        });
  } else if (II == Ident__is_target_environment) {
    EvaluateFeatureLikeBuiltinMacro(
        OS, Tok, II, *this, [this](Token &Tok, bool &HasLexedNextToken) -> int {
          IdentifierInfo *II = ExpectFeatureIdentifierInfo(
              Tok, *this, diag::err_feature_check_malformed);
          return II && isTargetEnvironment(getTargetInfo(), II);
        });
  } else {
    llvm_unreachable("Unknown identifier!");
  }
  // Materialize the accumulated expansion text as the result token and
  // restore the flags captured at entry.
  CreateString(OS.str(), Tok, Tok.getLocation(), Tok.getLocation());
  Tok.setFlagValue(Token::StartOfLine, IsAtStartOfLine);
  Tok.setFlagValue(Token::LeadingSpace, HasLeadingSpace);
}

/// Mark \p MI used, retracting any pending -Wunused-macros report for it.
void Preprocessor::markMacroAsUsed(MacroInfo *MI) {
  // If the 'used' status changed, and the macro requires 'unused' warning,
  // remove its SourceLocation from the warn-for-unused-macro locations.
  if (MI->isWarnIfUnused() && !MI->isUsed())
    WarnUnusedMacroLocs.erase(MI->getDefinitionLoc());
  MI->setIsUsed(true);
}
diff --git a/clang/lib/Lex/Pragma.cpp b/clang/lib/Lex/Pragma.cpp
new file mode 100644
index 000000000000..79953804b5d3
--- /dev/null
+++ b/clang/lib/Lex/Pragma.cpp
@@ -0,0 +1,1914 @@
//===- Pragma.cpp - Pragma registration and handling ----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the PragmaHandler/PragmaTable interfaces and implements
// pragma related methods of the Preprocessor class.
//
//===----------------------------------------------------------------------===//

#include "clang/Lex/Pragma.h"
#include "clang/Basic/Diagnostic.h"
#include "clang/Basic/FileManager.h"
#include "clang/Basic/IdentifierTable.h"
#include "clang/Basic/LLVM.h"
#include "clang/Basic/LangOptions.h"
#include "clang/Basic/Module.h"
#include "clang/Basic/SourceLocation.h"
#include "clang/Basic/SourceManager.h"
#include "clang/Basic/TokenKinds.h"
#include "clang/Lex/HeaderSearch.h"
#include "clang/Lex/LexDiagnostic.h"
#include "clang/Lex/Lexer.h"
#include "clang/Lex/LiteralSupport.h"
#include "clang/Lex/MacroInfo.h"
#include "clang/Lex/ModuleLoader.h"
#include "clang/Lex/PPCallbacks.h"
#include "clang/Lex/Preprocessor.h"
#include "clang/Lex/PreprocessorLexer.h"
#include "clang/Lex/Token.h"
#include "clang/Lex/TokenLexer.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/CrashRecoveryContext.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <limits>
#include <string>
#include <utility>
#include <vector>

using namespace clang;

// Out-of-line destructor to provide a home for the class.
PragmaHandler::~PragmaHandler() = default;

//===----------------------------------------------------------------------===//
// EmptyPragmaHandler Implementation.
+//===----------------------------------------------------------------------===// + +EmptyPragmaHandler::EmptyPragmaHandler(StringRef Name) : PragmaHandler(Name) {} + +void EmptyPragmaHandler::HandlePragma(Preprocessor &PP, + PragmaIntroducer Introducer, + Token &FirstToken) {} + +//===----------------------------------------------------------------------===// +// PragmaNamespace Implementation. +//===----------------------------------------------------------------------===// + +PragmaNamespace::~PragmaNamespace() { + llvm::DeleteContainerSeconds(Handlers); +} + +/// FindHandler - Check to see if there is already a handler for the +/// specified name. If not, return the handler for the null identifier if it +/// exists, otherwise return null. If IgnoreNull is true (the default) then +/// the null handler isn't returned on failure to match. +PragmaHandler *PragmaNamespace::FindHandler(StringRef Name, + bool IgnoreNull) const { + if (PragmaHandler *Handler = Handlers.lookup(Name)) + return Handler; + return IgnoreNull ? nullptr : Handlers.lookup(StringRef()); +} + +void PragmaNamespace::AddPragma(PragmaHandler *Handler) { + assert(!Handlers.lookup(Handler->getName()) && + "A handler with this name is already registered in this namespace"); + Handlers[Handler->getName()] = Handler; +} + +void PragmaNamespace::RemovePragmaHandler(PragmaHandler *Handler) { + assert(Handlers.lookup(Handler->getName()) && + "Handler not registered in this namespace"); + Handlers.erase(Handler->getName()); +} + +void PragmaNamespace::HandlePragma(Preprocessor &PP, + PragmaIntroducer Introducer, Token &Tok) { + // Read the 'namespace' that the directive is in, e.g. STDC. Do not macro + // expand it, the user can have a STDC #define, that should not affect this. + PP.LexUnexpandedToken(Tok); + + // Get the handler for this token. If there is no handler, ignore the pragma. + PragmaHandler *Handler + = FindHandler(Tok.getIdentifierInfo() ? 
Tok.getIdentifierInfo()->getName() + : StringRef(), + /*IgnoreNull=*/false); + if (!Handler) { + PP.Diag(Tok, diag::warn_pragma_ignored); + return; + } + + // Otherwise, pass it down. + Handler->HandlePragma(PP, Introducer, Tok); +} + +//===----------------------------------------------------------------------===// +// Preprocessor Pragma Directive Handling. +//===----------------------------------------------------------------------===// + +namespace { +// TokenCollector provides the option to collect tokens that were "read" +// and return them to the stream to be read later. +// Currently used when reading _Pragma/__pragma directives. +struct TokenCollector { + Preprocessor &Self; + bool Collect; + SmallVector<Token, 3> Tokens; + Token &Tok; + + void lex() { + if (Collect) + Tokens.push_back(Tok); + Self.Lex(Tok); + } + + void revert() { + assert(Collect && "did not collect tokens"); + assert(!Tokens.empty() && "collected unexpected number of tokens"); + + // Push the ( "string" ) tokens into the token stream. + auto Toks = std::make_unique<Token[]>(Tokens.size()); + std::copy(Tokens.begin() + 1, Tokens.end(), Toks.get()); + Toks[Tokens.size() - 1] = Tok; + Self.EnterTokenStream(std::move(Toks), Tokens.size(), + /*DisableMacroExpansion*/ true, + /*IsReinject*/ true); + + // ... and return the pragma token unchanged. + Tok = *Tokens.begin(); + } +}; +} // namespace + +/// HandlePragmaDirective - The "\#pragma" directive has been parsed. Lex the +/// rest of the pragma, passing it to the registered pragma handlers. +void Preprocessor::HandlePragmaDirective(PragmaIntroducer Introducer) { + if (Callbacks) + Callbacks->PragmaDirective(Introducer.Loc, Introducer.Kind); + + if (!PragmasEnabled) + return; + + ++NumPragma; + + // Invoke the first level of pragma handlers which reads the namespace id. + Token Tok; + PragmaHandlers->HandlePragma(*this, Introducer, Tok); + + // If the pragma handler didn't read the rest of the line, consume it now. 
+ if ((CurTokenLexer && CurTokenLexer->isParsingPreprocessorDirective()) + || (CurPPLexer && CurPPLexer->ParsingPreprocessorDirective)) + DiscardUntilEndOfDirective(); +} + +/// Handle_Pragma - Read a _Pragma directive, slice it up, process it, then +/// return the first token after the directive. The _Pragma token has just +/// been read into 'Tok'. +void Preprocessor::Handle_Pragma(Token &Tok) { + // C11 6.10.3.4/3: + // all pragma unary operator expressions within [a completely + // macro-replaced preprocessing token sequence] are [...] processed [after + // rescanning is complete] + // + // This means that we execute _Pragma operators in two cases: + // + // 1) on token sequences that would otherwise be produced as the output of + // phase 4 of preprocessing, and + // 2) on token sequences formed as the macro-replaced token sequence of a + // macro argument + // + // Case #2 appears to be a wording bug: only _Pragmas that would survive to + // the end of phase 4 should actually be executed. Discussion on the WG14 + // mailing list suggests that a _Pragma operator is notionally checked early, + // but only pragmas that survive to the end of phase 4 should be executed. + // + // In Case #2, we check the syntax now, but then put the tokens back into the + // token stream for later consumption. + + TokenCollector Toks = {*this, InMacroArgPreExpansion, {}, Tok}; + + // Remember the pragma token location. + SourceLocation PragmaLoc = Tok.getLocation(); + + // Read the '('. + Toks.lex(); + if (Tok.isNot(tok::l_paren)) { + Diag(PragmaLoc, diag::err__Pragma_malformed); + return; + } + + // Read the '"..."'. + Toks.lex(); + if (!tok::isStringLiteral(Tok.getKind())) { + Diag(PragmaLoc, diag::err__Pragma_malformed); + // Skip bad tokens, and the ')', if present. 
+ if (Tok.isNot(tok::r_paren) && Tok.isNot(tok::eof)) + Lex(Tok); + while (Tok.isNot(tok::r_paren) && + !Tok.isAtStartOfLine() && + Tok.isNot(tok::eof)) + Lex(Tok); + if (Tok.is(tok::r_paren)) + Lex(Tok); + return; + } + + if (Tok.hasUDSuffix()) { + Diag(Tok, diag::err_invalid_string_udl); + // Skip this token, and the ')', if present. + Lex(Tok); + if (Tok.is(tok::r_paren)) + Lex(Tok); + return; + } + + // Remember the string. + Token StrTok = Tok; + + // Read the ')'. + Toks.lex(); + if (Tok.isNot(tok::r_paren)) { + Diag(PragmaLoc, diag::err__Pragma_malformed); + return; + } + + // If we're expanding a macro argument, put the tokens back. + if (InMacroArgPreExpansion) { + Toks.revert(); + return; + } + + SourceLocation RParenLoc = Tok.getLocation(); + std::string StrVal = getSpelling(StrTok); + + // The _Pragma is lexically sound. Destringize according to C11 6.10.9.1: + // "The string literal is destringized by deleting any encoding prefix, + // deleting the leading and trailing double-quotes, replacing each escape + // sequence \" by a double-quote, and replacing each escape sequence \\ by a + // single backslash." + if (StrVal[0] == 'L' || StrVal[0] == 'U' || + (StrVal[0] == 'u' && StrVal[1] != '8')) + StrVal.erase(StrVal.begin()); + else if (StrVal[0] == 'u') + StrVal.erase(StrVal.begin(), StrVal.begin() + 2); + + if (StrVal[0] == 'R') { + // FIXME: C++11 does not specify how to handle raw-string-literals here. + // We strip off the 'R', the quotes, the d-char-sequences, and the parens. + assert(StrVal[1] == '"' && StrVal[StrVal.size() - 1] == '"' && + "Invalid raw string token!"); + + // Measure the length of the d-char-sequence. + unsigned NumDChars = 0; + while (StrVal[2 + NumDChars] != '(') { + assert(NumDChars < (StrVal.size() - 5) / 2 && + "Invalid raw string token!"); + ++NumDChars; + } + assert(StrVal[StrVal.size() - 2 - NumDChars] == ')'); + + // Remove 'R " d-char-sequence' and 'd-char-sequence "'. We'll replace the + // parens below. 
+ StrVal.erase(0, 2 + NumDChars); + StrVal.erase(StrVal.size() - 1 - NumDChars); + } else { + assert(StrVal[0] == '"' && StrVal[StrVal.size()-1] == '"' && + "Invalid string token!"); + + // Remove escaped quotes and escapes. + unsigned ResultPos = 1; + for (size_t i = 1, e = StrVal.size() - 1; i != e; ++i) { + // Skip escapes. \\ -> '\' and \" -> '"'. + if (StrVal[i] == '\\' && i + 1 < e && + (StrVal[i + 1] == '\\' || StrVal[i + 1] == '"')) + ++i; + StrVal[ResultPos++] = StrVal[i]; + } + StrVal.erase(StrVal.begin() + ResultPos, StrVal.end() - 1); + } + + // Remove the front quote, replacing it with a space, so that the pragma + // contents appear to have a space before them. + StrVal[0] = ' '; + + // Replace the terminating quote with a \n. + StrVal[StrVal.size()-1] = '\n'; + + // Plop the string (including the newline and trailing null) into a buffer + // where we can lex it. + Token TmpTok; + TmpTok.startToken(); + CreateString(StrVal, TmpTok); + SourceLocation TokLoc = TmpTok.getLocation(); + + // Make and enter a lexer object so that we lex and expand the tokens just + // like any others. + Lexer *TL = Lexer::Create_PragmaLexer(TokLoc, PragmaLoc, RParenLoc, + StrVal.size(), *this); + + EnterSourceFileWithLexer(TL, nullptr); + + // With everything set up, lex this as a #pragma directive. + HandlePragmaDirective({PIK__Pragma, PragmaLoc}); + + // Finally, return whatever came after the pragma directive. + return Lex(Tok); +} + +/// HandleMicrosoft__pragma - Like Handle_Pragma except the pragma text +/// is not enclosed within a string literal. +void Preprocessor::HandleMicrosoft__pragma(Token &Tok) { + // During macro pre-expansion, check the syntax now but put the tokens back + // into the token stream for later consumption. Same as Handle_Pragma. + TokenCollector Toks = {*this, InMacroArgPreExpansion, {}, Tok}; + + // Remember the pragma token location. + SourceLocation PragmaLoc = Tok.getLocation(); + + // Read the '('. 
+ Toks.lex(); + if (Tok.isNot(tok::l_paren)) { + Diag(PragmaLoc, diag::err__Pragma_malformed); + return; + } + + // Get the tokens enclosed within the __pragma(), as well as the final ')'. + SmallVector<Token, 32> PragmaToks; + int NumParens = 0; + Toks.lex(); + while (Tok.isNot(tok::eof)) { + PragmaToks.push_back(Tok); + if (Tok.is(tok::l_paren)) + NumParens++; + else if (Tok.is(tok::r_paren) && NumParens-- == 0) + break; + Toks.lex(); + } + + if (Tok.is(tok::eof)) { + Diag(PragmaLoc, diag::err_unterminated___pragma); + return; + } + + // If we're expanding a macro argument, put the tokens back. + if (InMacroArgPreExpansion) { + Toks.revert(); + return; + } + + PragmaToks.front().setFlag(Token::LeadingSpace); + + // Replace the ')' with an EOD to mark the end of the pragma. + PragmaToks.back().setKind(tok::eod); + + Token *TokArray = new Token[PragmaToks.size()]; + std::copy(PragmaToks.begin(), PragmaToks.end(), TokArray); + + // Push the tokens onto the stack. + EnterTokenStream(TokArray, PragmaToks.size(), true, true, + /*IsReinject*/ false); + + // With everything set up, lex this as a #pragma directive. + HandlePragmaDirective({PIK___pragma, PragmaLoc}); + + // Finally, return whatever came after the pragma directive. + return Lex(Tok); +} + +/// HandlePragmaOnce - Handle \#pragma once. OnceTok is the 'once'. +void Preprocessor::HandlePragmaOnce(Token &OnceTok) { + // Don't honor the 'once' when handling the primary source file, unless + // this is a prefix to a TU, which indicates we're generating a PCH file, or + // when the main file is a header (e.g. when -xc-header is provided on the + // commandline). + if (isInPrimaryFile() && TUKind != TU_Prefix && !getLangOpts().IsHeaderFile) { + Diag(OnceTok, diag::pp_pragma_once_in_main_file); + return; + } + + // Get the current file lexer we're looking at. Ignore _Pragma 'files' etc. + // Mark the file as a once-only file now. 
+ HeaderInfo.MarkFileIncludeOnce(getCurrentFileLexer()->getFileEntry()); +} + +void Preprocessor::HandlePragmaMark() { + assert(CurPPLexer && "No current lexer?"); + CurLexer->ReadToEndOfLine(); +} + +/// HandlePragmaPoison - Handle \#pragma GCC poison. PoisonTok is the 'poison'. +void Preprocessor::HandlePragmaPoison() { + Token Tok; + + while (true) { + // Read the next token to poison. While doing this, pretend that we are + // skipping while reading the identifier to poison. + // This avoids errors on code like: + // #pragma GCC poison X + // #pragma GCC poison X + if (CurPPLexer) CurPPLexer->LexingRawMode = true; + LexUnexpandedToken(Tok); + if (CurPPLexer) CurPPLexer->LexingRawMode = false; + + // If we reached the end of line, we're done. + if (Tok.is(tok::eod)) return; + + // Can only poison identifiers. + if (Tok.isNot(tok::raw_identifier)) { + Diag(Tok, diag::err_pp_invalid_poison); + return; + } + + // Look up the identifier info for the token. We disabled identifier lookup + // by saying we're skipping contents, so we need to do this manually. + IdentifierInfo *II = LookUpIdentifierInfo(Tok); + + // Already poisoned. + if (II->isPoisoned()) continue; + + // If this is a macro identifier, emit a warning. + if (isMacroDefined(II)) + Diag(Tok, diag::pp_poisoning_existing_macro); + + // Finally, poison it! + II->setIsPoisoned(); + if (II->isFromAST()) + II->setChangedSinceDeserialization(); + } +} + +/// HandlePragmaSystemHeader - Implement \#pragma GCC system_header. We know +/// that the whole directive has been parsed. +void Preprocessor::HandlePragmaSystemHeader(Token &SysHeaderTok) { + if (isInPrimaryFile()) { + Diag(SysHeaderTok, diag::pp_pragma_sysheader_in_main_file); + return; + } + + // Get the current file lexer we're looking at. Ignore _Pragma 'files' etc. + PreprocessorLexer *TheLexer = getCurrentFileLexer(); + + // Mark the file as a system header. 
+ HeaderInfo.MarkFileSystemHeader(TheLexer->getFileEntry()); + + PresumedLoc PLoc = SourceMgr.getPresumedLoc(SysHeaderTok.getLocation()); + if (PLoc.isInvalid()) + return; + + unsigned FilenameID = SourceMgr.getLineTableFilenameID(PLoc.getFilename()); + + // Notify the client, if desired, that we are in a new source file. + if (Callbacks) + Callbacks->FileChanged(SysHeaderTok.getLocation(), + PPCallbacks::SystemHeaderPragma, SrcMgr::C_System); + + // Emit a line marker. This will change any source locations from this point + // forward to realize they are in a system header. + // Create a line note with this information. + SourceMgr.AddLineNote(SysHeaderTok.getLocation(), PLoc.getLine() + 1, + FilenameID, /*IsEntry=*/false, /*IsExit=*/false, + SrcMgr::C_System); +} + +/// HandlePragmaDependency - Handle \#pragma GCC dependency "foo" blah. +void Preprocessor::HandlePragmaDependency(Token &DependencyTok) { + Token FilenameTok; + if (LexHeaderName(FilenameTok, /*AllowConcatenation*/false)) + return; + + // If the next token wasn't a header-name, diagnose the error. + if (FilenameTok.isNot(tok::header_name)) { + Diag(FilenameTok.getLocation(), diag::err_pp_expects_filename); + return; + } + + // Reserve a buffer to get the spelling. + SmallString<128> FilenameBuffer; + bool Invalid = false; + StringRef Filename = getSpelling(FilenameTok, FilenameBuffer, &Invalid); + if (Invalid) + return; + + bool isAngled = + GetIncludeFilenameSpelling(FilenameTok.getLocation(), Filename); + // If GetIncludeFilenameSpelling set the start ptr to null, there was an + // error. + if (Filename.empty()) + return; + + // Search include directories for this file. 
+ const DirectoryLookup *CurDir; + Optional<FileEntryRef> File = + LookupFile(FilenameTok.getLocation(), Filename, isAngled, nullptr, + nullptr, CurDir, nullptr, nullptr, nullptr, nullptr, nullptr); + if (!File) { + if (!SuppressIncludeNotFoundError) + Diag(FilenameTok, diag::err_pp_file_not_found) << Filename; + return; + } + + const FileEntry *CurFile = getCurrentFileLexer()->getFileEntry(); + + // If this file is older than the file it depends on, emit a diagnostic. + if (CurFile && CurFile->getModificationTime() < File->getModificationTime()) { + // Lex tokens at the end of the message and include them in the message. + std::string Message; + Lex(DependencyTok); + while (DependencyTok.isNot(tok::eod)) { + Message += getSpelling(DependencyTok) + " "; + Lex(DependencyTok); + } + + // Remove the trailing ' ' if present. + if (!Message.empty()) + Message.erase(Message.end()-1); + Diag(FilenameTok, diag::pp_out_of_date_dependency) << Message; + } +} + +/// ParsePragmaPushOrPopMacro - Handle parsing of pragma push_macro/pop_macro. +/// Return the IdentifierInfo* associated with the macro to push or pop. +IdentifierInfo *Preprocessor::ParsePragmaPushOrPopMacro(Token &Tok) { + // Remember the pragma token location. + Token PragmaTok = Tok; + + // Read the '('. + Lex(Tok); + if (Tok.isNot(tok::l_paren)) { + Diag(PragmaTok.getLocation(), diag::err_pragma_push_pop_macro_malformed) + << getSpelling(PragmaTok); + return nullptr; + } + + // Read the macro name string. + Lex(Tok); + if (Tok.isNot(tok::string_literal)) { + Diag(PragmaTok.getLocation(), diag::err_pragma_push_pop_macro_malformed) + << getSpelling(PragmaTok); + return nullptr; + } + + if (Tok.hasUDSuffix()) { + Diag(Tok, diag::err_invalid_string_udl); + return nullptr; + } + + // Remember the macro string. + std::string StrVal = getSpelling(Tok); + + // Read the ')'. 
+ Lex(Tok); + if (Tok.isNot(tok::r_paren)) { + Diag(PragmaTok.getLocation(), diag::err_pragma_push_pop_macro_malformed) + << getSpelling(PragmaTok); + return nullptr; + } + + assert(StrVal[0] == '"' && StrVal[StrVal.size()-1] == '"' && + "Invalid string token!"); + + // Create a Token from the string. + Token MacroTok; + MacroTok.startToken(); + MacroTok.setKind(tok::raw_identifier); + CreateString(StringRef(&StrVal[1], StrVal.size() - 2), MacroTok); + + // Get the IdentifierInfo of MacroToPushTok. + return LookUpIdentifierInfo(MacroTok); +} + +/// Handle \#pragma push_macro. +/// +/// The syntax is: +/// \code +/// #pragma push_macro("macro") +/// \endcode +void Preprocessor::HandlePragmaPushMacro(Token &PushMacroTok) { + // Parse the pragma directive and get the macro IdentifierInfo*. + IdentifierInfo *IdentInfo = ParsePragmaPushOrPopMacro(PushMacroTok); + if (!IdentInfo) return; + + // Get the MacroInfo associated with IdentInfo. + MacroInfo *MI = getMacroInfo(IdentInfo); + + if (MI) { + // Allow the original MacroInfo to be redefined later. + MI->setIsAllowRedefinitionsWithoutWarning(true); + } + + // Push the cloned MacroInfo so we can retrieve it later. + PragmaPushMacroInfo[IdentInfo].push_back(MI); +} + +/// Handle \#pragma pop_macro. +/// +/// The syntax is: +/// \code +/// #pragma pop_macro("macro") +/// \endcode +void Preprocessor::HandlePragmaPopMacro(Token &PopMacroTok) { + SourceLocation MessageLoc = PopMacroTok.getLocation(); + + // Parse the pragma directive and get the macro IdentifierInfo*. + IdentifierInfo *IdentInfo = ParsePragmaPushOrPopMacro(PopMacroTok); + if (!IdentInfo) return; + + // Find the vector<MacroInfo*> associated with the macro. + llvm::DenseMap<IdentifierInfo *, std::vector<MacroInfo *>>::iterator iter = + PragmaPushMacroInfo.find(IdentInfo); + if (iter != PragmaPushMacroInfo.end()) { + // Forget the MacroInfo currently associated with IdentInfo. 
+ if (MacroInfo *MI = getMacroInfo(IdentInfo)) { + if (MI->isWarnIfUnused()) + WarnUnusedMacroLocs.erase(MI->getDefinitionLoc()); + appendMacroDirective(IdentInfo, AllocateUndefMacroDirective(MessageLoc)); + } + + // Get the MacroInfo we want to reinstall. + MacroInfo *MacroToReInstall = iter->second.back(); + + if (MacroToReInstall) + // Reinstall the previously pushed macro. + appendDefMacroDirective(IdentInfo, MacroToReInstall, MessageLoc); + + // Pop PragmaPushMacroInfo stack. + iter->second.pop_back(); + if (iter->second.empty()) + PragmaPushMacroInfo.erase(iter); + } else { + Diag(MessageLoc, diag::warn_pragma_pop_macro_no_push) + << IdentInfo->getName(); + } +} + +void Preprocessor::HandlePragmaIncludeAlias(Token &Tok) { + // We will either get a quoted filename or a bracketed filename, and we + // have to track which we got. The first filename is the source name, + // and the second name is the mapped filename. If the first is quoted, + // the second must be as well (cannot mix and match quotes and brackets). 
+ + // Get the open paren + Lex(Tok); + if (Tok.isNot(tok::l_paren)) { + Diag(Tok, diag::warn_pragma_include_alias_expected) << "("; + return; + } + + // We expect either a quoted string literal, or a bracketed name + Token SourceFilenameTok; + if (LexHeaderName(SourceFilenameTok)) + return; + + StringRef SourceFileName; + SmallString<128> FileNameBuffer; + if (SourceFilenameTok.is(tok::header_name)) { + SourceFileName = getSpelling(SourceFilenameTok, FileNameBuffer); + } else { + Diag(Tok, diag::warn_pragma_include_alias_expected_filename); + return; + } + FileNameBuffer.clear(); + + // Now we expect a comma, followed by another include name + Lex(Tok); + if (Tok.isNot(tok::comma)) { + Diag(Tok, diag::warn_pragma_include_alias_expected) << ","; + return; + } + + Token ReplaceFilenameTok; + if (LexHeaderName(ReplaceFilenameTok)) + return; + + StringRef ReplaceFileName; + if (ReplaceFilenameTok.is(tok::header_name)) { + ReplaceFileName = getSpelling(ReplaceFilenameTok, FileNameBuffer); + } else { + Diag(Tok, diag::warn_pragma_include_alias_expected_filename); + return; + } + + // Finally, we expect the closing paren + Lex(Tok); + if (Tok.isNot(tok::r_paren)) { + Diag(Tok, diag::warn_pragma_include_alias_expected) << ")"; + return; + } + + // Now that we have the source and target filenames, we need to make sure + // they're both of the same type (angled vs non-angled) + StringRef OriginalSource = SourceFileName; + + bool SourceIsAngled = + GetIncludeFilenameSpelling(SourceFilenameTok.getLocation(), + SourceFileName); + bool ReplaceIsAngled = + GetIncludeFilenameSpelling(ReplaceFilenameTok.getLocation(), + ReplaceFileName); + if (!SourceFileName.empty() && !ReplaceFileName.empty() && + (SourceIsAngled != ReplaceIsAngled)) { + unsigned int DiagID; + if (SourceIsAngled) + DiagID = diag::warn_pragma_include_alias_mismatch_angle; + else + DiagID = diag::warn_pragma_include_alias_mismatch_quote; + + Diag(SourceFilenameTok.getLocation(), DiagID) + << SourceFileName + << 
ReplaceFileName; + + return; + } + + // Now we can let the include handler know about this mapping + getHeaderSearchInfo().AddIncludeAlias(OriginalSource, ReplaceFileName); +} + +// Lex a component of a module name: either an identifier or a string literal; +// for components that can be expressed both ways, the two forms are equivalent. +static bool LexModuleNameComponent( + Preprocessor &PP, Token &Tok, + std::pair<IdentifierInfo *, SourceLocation> &ModuleNameComponent, + bool First) { + PP.LexUnexpandedToken(Tok); + if (Tok.is(tok::string_literal) && !Tok.hasUDSuffix()) { + StringLiteralParser Literal(Tok, PP); + if (Literal.hadError) + return true; + ModuleNameComponent = std::make_pair( + PP.getIdentifierInfo(Literal.GetString()), Tok.getLocation()); + } else if (!Tok.isAnnotation() && Tok.getIdentifierInfo()) { + ModuleNameComponent = + std::make_pair(Tok.getIdentifierInfo(), Tok.getLocation()); + } else { + PP.Diag(Tok.getLocation(), diag::err_pp_expected_module_name) << First; + return true; + } + return false; +} + +static bool LexModuleName( + Preprocessor &PP, Token &Tok, + llvm::SmallVectorImpl<std::pair<IdentifierInfo *, SourceLocation>> + &ModuleName) { + while (true) { + std::pair<IdentifierInfo*, SourceLocation> NameComponent; + if (LexModuleNameComponent(PP, Tok, NameComponent, ModuleName.empty())) + return true; + ModuleName.push_back(NameComponent); + + PP.LexUnexpandedToken(Tok); + if (Tok.isNot(tok::period)) + return false; + } +} + +void Preprocessor::HandlePragmaModuleBuild(Token &Tok) { + SourceLocation Loc = Tok.getLocation(); + + std::pair<IdentifierInfo *, SourceLocation> ModuleNameLoc; + if (LexModuleNameComponent(*this, Tok, ModuleNameLoc, true)) + return; + IdentifierInfo *ModuleName = ModuleNameLoc.first; + + LexUnexpandedToken(Tok); + if (Tok.isNot(tok::eod)) { + Diag(Tok, diag::ext_pp_extra_tokens_at_eol) << "pragma"; + DiscardUntilEndOfDirective(); + } + + CurLexer->LexingRawMode = true; + + auto TryConsumeIdentifier = 
[&](StringRef Ident) -> bool { + if (Tok.getKind() != tok::raw_identifier || + Tok.getRawIdentifier() != Ident) + return false; + CurLexer->Lex(Tok); + return true; + }; + + // Scan forward looking for the end of the module. + const char *Start = CurLexer->getBufferLocation(); + const char *End = nullptr; + unsigned NestingLevel = 1; + while (true) { + End = CurLexer->getBufferLocation(); + CurLexer->Lex(Tok); + + if (Tok.is(tok::eof)) { + Diag(Loc, diag::err_pp_module_build_missing_end); + break; + } + + if (Tok.isNot(tok::hash) || !Tok.isAtStartOfLine()) { + // Token was part of module; keep going. + continue; + } + + // We hit something directive-shaped; check to see if this is the end + // of the module build. + CurLexer->ParsingPreprocessorDirective = true; + CurLexer->Lex(Tok); + if (TryConsumeIdentifier("pragma") && TryConsumeIdentifier("clang") && + TryConsumeIdentifier("module")) { + if (TryConsumeIdentifier("build")) + // #pragma clang module build -> entering a nested module build. + ++NestingLevel; + else if (TryConsumeIdentifier("endbuild")) { + // #pragma clang module endbuild -> leaving a module build. + if (--NestingLevel == 0) + break; + } + // We should either be looking at the EOD or more of the current directive + // preceding the EOD. Either way we can ignore this token and keep going. + assert(Tok.getKind() != tok::eof && "missing EOD before EOF"); + } + } + + CurLexer->LexingRawMode = false; + + // Load the extracted text as a preprocessed module. 
+ assert(CurLexer->getBuffer().begin() <= Start && + Start <= CurLexer->getBuffer().end() && + CurLexer->getBuffer().begin() <= End && + End <= CurLexer->getBuffer().end() && + "module source range not contained within same file buffer"); + TheModuleLoader.loadModuleFromSource(Loc, ModuleName->getName(), + StringRef(Start, End - Start)); +} + +void Preprocessor::HandlePragmaHdrstop(Token &Tok) { + Lex(Tok); + if (Tok.is(tok::l_paren)) { + Diag(Tok.getLocation(), diag::warn_pp_hdrstop_filename_ignored); + + std::string FileName; + if (!LexStringLiteral(Tok, FileName, "pragma hdrstop", false)) + return; + + if (Tok.isNot(tok::r_paren)) { + Diag(Tok, diag::err_expected) << tok::r_paren; + return; + } + Lex(Tok); + } + if (Tok.isNot(tok::eod)) + Diag(Tok.getLocation(), diag::ext_pp_extra_tokens_at_eol) + << "pragma hdrstop"; + + if (creatingPCHWithPragmaHdrStop() && + SourceMgr.isInMainFile(Tok.getLocation())) { + assert(CurLexer && "no lexer for #pragma hdrstop processing"); + Token &Result = Tok; + Result.startToken(); + CurLexer->FormTokenWithChars(Result, CurLexer->BufferEnd, tok::eof); + CurLexer->cutOffLexing(); + } + if (usingPCHWithPragmaHdrStop()) + SkippingUntilPragmaHdrStop = false; +} + +/// AddPragmaHandler - Add the specified pragma handler to the preprocessor. +/// If 'Namespace' is non-null, then it is a token required to exist on the +/// pragma line before the pragma string starts, e.g. "STDC" or "GCC". +void Preprocessor::AddPragmaHandler(StringRef Namespace, + PragmaHandler *Handler) { + PragmaNamespace *InsertNS = PragmaHandlers.get(); + + // If this is specified to be in a namespace, step down into it. + if (!Namespace.empty()) { + // If there is already a pragma handler with the name of this namespace, + // we either have an error (directive with the same name as a namespace) or + // we already have the namespace to insert into. 
+ if (PragmaHandler *Existing = PragmaHandlers->FindHandler(Namespace)) { + InsertNS = Existing->getIfNamespace(); + assert(InsertNS != nullptr && "Cannot have a pragma namespace and pragma" + " handler with the same name!"); + } else { + // Otherwise, this namespace doesn't exist yet, create and insert the + // handler for it. + InsertNS = new PragmaNamespace(Namespace); + PragmaHandlers->AddPragma(InsertNS); + } + } + + // Check to make sure we don't already have a pragma for this identifier. + assert(!InsertNS->FindHandler(Handler->getName()) && + "Pragma handler already exists for this identifier!"); + InsertNS->AddPragma(Handler); +} + +/// RemovePragmaHandler - Remove the specific pragma handler from the +/// preprocessor. If \arg Namespace is non-null, then it should be the +/// namespace that \arg Handler was added to. It is an error to remove +/// a handler that has not been registered. +void Preprocessor::RemovePragmaHandler(StringRef Namespace, + PragmaHandler *Handler) { + PragmaNamespace *NS = PragmaHandlers.get(); + + // If this is specified to be in a namespace, step down into it. + if (!Namespace.empty()) { + PragmaHandler *Existing = PragmaHandlers->FindHandler(Namespace); + assert(Existing && "Namespace containing handler does not exist!"); + + NS = Existing->getIfNamespace(); + assert(NS && "Invalid namespace, registered as a regular pragma handler!"); + } + + NS->RemovePragmaHandler(Handler); + + // If this is a non-default namespace and it is now empty, remove it. 
+ if (NS != PragmaHandlers.get() && NS->IsEmpty()) { + PragmaHandlers->RemovePragmaHandler(NS); + delete NS; + } +} + +bool Preprocessor::LexOnOffSwitch(tok::OnOffSwitch &Result) { + Token Tok; + LexUnexpandedToken(Tok); + + if (Tok.isNot(tok::identifier)) { + Diag(Tok, diag::ext_on_off_switch_syntax); + return true; + } + IdentifierInfo *II = Tok.getIdentifierInfo(); + if (II->isStr("ON")) + Result = tok::OOS_ON; + else if (II->isStr("OFF")) + Result = tok::OOS_OFF; + else if (II->isStr("DEFAULT")) + Result = tok::OOS_DEFAULT; + else { + Diag(Tok, diag::ext_on_off_switch_syntax); + return true; + } + + // Verify that this is followed by EOD. + LexUnexpandedToken(Tok); + if (Tok.isNot(tok::eod)) + Diag(Tok, diag::ext_pragma_syntax_eod); + return false; +} + +namespace { + +/// PragmaOnceHandler - "\#pragma once" marks the file as atomically included. +struct PragmaOnceHandler : public PragmaHandler { + PragmaOnceHandler() : PragmaHandler("once") {} + + void HandlePragma(Preprocessor &PP, PragmaIntroducer Introducer, + Token &OnceTok) override { + PP.CheckEndOfDirective("pragma once"); + PP.HandlePragmaOnce(OnceTok); + } +}; + +/// PragmaMarkHandler - "\#pragma mark ..." is ignored by the compiler, and the +/// rest of the line is not lexed. +struct PragmaMarkHandler : public PragmaHandler { + PragmaMarkHandler() : PragmaHandler("mark") {} + + void HandlePragma(Preprocessor &PP, PragmaIntroducer Introducer, + Token &MarkTok) override { + PP.HandlePragmaMark(); + } +}; + +/// PragmaPoisonHandler - "\#pragma poison x" marks x as not usable. +struct PragmaPoisonHandler : public PragmaHandler { + PragmaPoisonHandler() : PragmaHandler("poison") {} + + void HandlePragma(Preprocessor &PP, PragmaIntroducer Introducer, + Token &PoisonTok) override { + PP.HandlePragmaPoison(); + } +}; + +/// PragmaSystemHeaderHandler - "\#pragma system_header" marks the current file +/// as a system header, which silences warnings in it. 
/// "#pragma system_header" — delegates to Preprocessor::HandlePragmaSystemHeader
/// and then requires that the directive end immediately.
struct PragmaSystemHeaderHandler : public PragmaHandler {
  PragmaSystemHeaderHandler() : PragmaHandler("system_header") {}

  void HandlePragma(Preprocessor &PP, PragmaIntroducer Introducer,
                    Token &SHToken) override {
    PP.HandlePragmaSystemHeader(SHToken);
    // Diagnose any trailing tokens on the directive line.
    PP.CheckEndOfDirective("pragma");
  }
};

/// "#pragma dependency" — handled entirely by the preprocessor.
struct PragmaDependencyHandler : public PragmaHandler {
  PragmaDependencyHandler() : PragmaHandler("dependency") {}

  void HandlePragma(Preprocessor &PP, PragmaIntroducer Introducer,
                    Token &DepToken) override {
    PP.HandlePragmaDependency(DepToken);
  }
};

/// "#pragma clang __debug <command>" — developer-only commands that
/// deliberately crash, trap, or dump internal compiler state (used to
/// exercise crash recovery and diagnostics machinery).
struct PragmaDebugHandler : public PragmaHandler {
  PragmaDebugHandler() : PragmaHandler("__debug") {}

  void HandlePragma(Preprocessor &PP, PragmaIntroducer Introducer,
                    Token &DebugToken) override {
    Token Tok;
    PP.LexUnexpandedToken(Tok);
    if (Tok.isNot(tok::identifier)) {
      PP.Diag(Tok, diag::warn_pragma_diagnostic_invalid);
      return;
    }
    IdentifierInfo *II = Tok.getIdentifierInfo();

    if (II->isStr("assert")) {
      llvm_unreachable("This is an assertion!");
    } else if (II->isStr("crash")) {
      LLVM_BUILTIN_TRAP;
    } else if (II->isStr("parser_crash")) {
      // Inject an annotation token that makes the parser crash when it is
      // consumed, so the crash happens at parse time rather than here.
      Token Crasher;
      Crasher.startToken();
      Crasher.setKind(tok::annot_pragma_parser_crash);
      Crasher.setAnnotationRange(SourceRange(Tok.getLocation()));
      PP.EnterToken(Crasher, /*IsReinject*/ false);
    } else if (II->isStr("dump")) {
      Token Identifier;
      PP.LexUnexpandedToken(Identifier);
      if (auto *DumpII = Identifier.getIdentifierInfo()) {
        Token DumpAnnot;
        DumpAnnot.startToken();
        DumpAnnot.setKind(tok::annot_pragma_dump);
        DumpAnnot.setAnnotationRange(
            SourceRange(Tok.getLocation(), Identifier.getLocation()));
        DumpAnnot.setAnnotationValue(DumpII);
        // Drop the rest of the directive before handing the annotation to
        // the parser.
        PP.DiscardUntilEndOfDirective();
        PP.EnterToken(DumpAnnot, /*IsReinject*/ false);
      } else {
        PP.Diag(Identifier, diag::warn_pragma_debug_missing_argument)
            << II->getName();
      }
    } else if (II->isStr("diag_mapping")) {
      Token DiagName;
      PP.LexUnexpandedToken(DiagName);
      if (DiagName.is(tok::eod))
        PP.getDiagnostics().dump();
      else if (DiagName.is(tok::string_literal) && !DiagName.hasUDSuffix()) {
        StringLiteralParser Literal(DiagName, PP);
        if (Literal.hadError)
          return;
        PP.getDiagnostics().dump(Literal.GetString());
      } else {
        PP.Diag(DiagName, diag::warn_pragma_debug_missing_argument)
            << II->getName();
      }
    } else if (II->isStr("llvm_fatal_error")) {
      llvm::report_fatal_error("#pragma clang __debug llvm_fatal_error");
    } else if (II->isStr("llvm_unreachable")) {
      llvm_unreachable("#pragma clang __debug llvm_unreachable");
    } else if (II->isStr("macro")) {
      Token MacroName;
      PP.LexUnexpandedToken(MacroName);
      auto *MacroII = MacroName.getIdentifierInfo();
      if (MacroII)
        PP.dumpMacroInfo(MacroII);
      else
        PP.Diag(MacroName, diag::warn_pragma_debug_missing_argument)
            << II->getName();
    } else if (II->isStr("module_map")) {
      // Dump the module named by a dotted path, resolving each component as
      // a (sub)module of the previous one.
      llvm::SmallVector<std::pair<IdentifierInfo *, SourceLocation>, 8>
          ModuleName;
      if (LexModuleName(PP, Tok, ModuleName))
        return;
      ModuleMap &MM = PP.getHeaderSearchInfo().getModuleMap();
      Module *M = nullptr;
      for (auto IIAndLoc : ModuleName) {
        M = MM.lookupModuleQualified(IIAndLoc.first->getName(), M);
        if (!M) {
          PP.Diag(IIAndLoc.second, diag::warn_pragma_debug_unknown_module)
              << IIAndLoc.first;
          return;
        }
      }
      M->dump();
    } else if (II->isStr("overflow_stack")) {
      DebugOverflowStack();
    } else if (II->isStr("handle_crash")) {
      llvm::CrashRecoveryContext *CRC = llvm::CrashRecoveryContext::GetCurrent();
      if (CRC)
        CRC->HandleCrash();
    } else if (II->isStr("captured")) {
      HandleCaptured(PP);
    } else {
      PP.Diag(Tok, diag::warn_pragma_debug_unexpected_command)
          << II->getName();
    }

    // Notify callbacks regardless of which command ran; Tok still holds the
    // command identifier at this point.
    PPCallbacks *Callbacks = PP.getPPCallbacks();
    if (Callbacks)
      Callbacks->PragmaDebug(Tok.getLocation(), II->getName());
  }

  /// "#pragma clang __debug captured" — enters a one-token stream containing
  /// an annot_pragma_captured annotation for the parser to pick up.
  void HandleCaptured(Preprocessor &PP) {
    Token Tok;
    PP.LexUnexpandedToken(Tok);

    if (Tok.isNot(tok::eod)) {
      PP.Diag(Tok, diag::ext_pp_extra_tokens_at_eol)
          << "pragma clang __debug captured";
      return;
    }

    SourceLocation NameLoc = Tok.getLocation();
    // The token is allocated from the preprocessor's allocator because the
    // entered token stream must outlive this handler invocation.
    MutableArrayRef<Token> Toks(
        PP.getPreprocessorAllocator().Allocate<Token>(1), 1);
    Toks[0].startToken();
    Toks[0].setKind(tok::annot_pragma_captured);
    Toks[0].setLocation(NameLoc);

    PP.EnterTokenStream(Toks, /*DisableMacroExpansion=*/true,
                        /*IsReinject=*/false);
  }

// Disable MSVC warning about runtime stack overflow.
#ifdef _MSC_VER
    #pragma warning(disable : 4717)
#endif
  /// Deliberately overflows the stack. The call through a volatile function
  /// pointer keeps the compiler from turning the infinite recursion into a
  /// loop or tail call, so the stack is genuinely exhausted.
  static void DebugOverflowStack(void (*P)() = nullptr) {
    void (*volatile Self)(void (*P)()) = DebugOverflowStack;
    Self(reinterpret_cast<void (*)()>(Self));
  }
#ifdef _MSC_VER
    #pragma warning(default : 4717)
#endif
};

/// PragmaDiagnosticHandler - e.g. '\#pragma GCC diagnostic ignored "-Wformat"'
struct PragmaDiagnosticHandler : public PragmaHandler {
private:
  // Pragma namespace ("GCC" or "clang") reported to PPCallbacks.
  const char *Namespace;

public:
  explicit PragmaDiagnosticHandler(const char *NS)
      : PragmaHandler("diagnostic"), Namespace(NS) {}

  void HandlePragma(Preprocessor &PP, PragmaIntroducer Introducer,
                    Token &DiagToken) override {
    SourceLocation DiagLoc = DiagToken.getLocation();
    Token Tok;
    PP.LexUnexpandedToken(Tok);
    if (Tok.isNot(tok::identifier)) {
      PP.Diag(Tok, diag::warn_pragma_diagnostic_invalid);
      return;
    }
    IdentifierInfo *II = Tok.getIdentifierInfo();
    PPCallbacks *Callbacks = PP.getPPCallbacks();

    // push/pop manipulate the diagnostic mapping stack and take no argument.
    if (II->isStr("pop")) {
      if (!PP.getDiagnostics().popMappings(DiagLoc))
        PP.Diag(Tok, diag::warn_pragma_diagnostic_cannot_pop);
      else if (Callbacks)
        Callbacks->PragmaDiagnosticPop(DiagLoc, Namespace);
      return;
    } else if (II->isStr("push")) {
      PP.getDiagnostics().pushMappings(DiagLoc);
      if (Callbacks)
        Callbacks->PragmaDiagnosticPush(DiagLoc, Namespace);
      return;
    }

    // Otherwise the identifier must name a severity.
    diag::Severity SV = llvm::StringSwitch<diag::Severity>(II->getName())
                            .Case("ignored", diag::Severity::Ignored)
                            .Case("warning", diag::Severity::Warning)
                            .Case("error", diag::Severity::Error)
                            .Case("fatal", diag::Severity::Fatal)
                            .Default(diag::Severity());

    if (SV == diag::Severity()) {
      PP.Diag(Tok, diag::warn_pragma_diagnostic_invalid);
      return;
    }

    PP.LexUnexpandedToken(Tok);
    SourceLocation StringLoc = Tok.getLocation();

    std::string WarningName;
    if (!PP.FinishLexStringLiteral(Tok, WarningName, "pragma diagnostic",
                                   /*AllowMacroExpansion=*/false))
      return;

    if (Tok.isNot(tok::eod)) {
      PP.Diag(Tok.getLocation(), diag::warn_pragma_diagnostic_invalid_token);
      return;
    }

    // The option must look like "-W<group>" or "-R<group>".
    if (WarningName.size() < 3 || WarningName[0] != '-' ||
        (WarningName[1] != 'W' && WarningName[1] != 'R')) {
      PP.Diag(StringLoc, diag::warn_pragma_diagnostic_invalid_option);
      return;
    }

    diag::Flavor Flavor = WarningName[1] == 'W' ? diag::Flavor::WarningOrError
                                                : diag::Flavor::Remark;
    StringRef Group = StringRef(WarningName).substr(2);
    bool unknownDiag = false;
    if (Group == "everything") {
      // Special handling for pragma clang diagnostic ... "-Weverything".
      // There is no formal group named "everything", so there has to be a
      // special case for it.
      PP.getDiagnostics().setSeverityForAll(Flavor, SV, DiagLoc);
    } else
      unknownDiag = PP.getDiagnostics().setSeverityForGroup(Flavor, Group, SV,
                                                            DiagLoc);
    if (unknownDiag)
      PP.Diag(StringLoc, diag::warn_pragma_diagnostic_unknown_warning)
          << WarningName;
    else if (Callbacks)
      Callbacks->PragmaDiagnostic(DiagLoc, Namespace, SV, WarningName);
  }
};

/// "\#pragma hdrstop [<header-name-string>]"
struct PragmaHdrstopHandler : public PragmaHandler {
  PragmaHdrstopHandler() : PragmaHandler("hdrstop") {}
  void HandlePragma(Preprocessor &PP, PragmaIntroducer Introducer,
                    Token &DepToken) override {
    PP.HandlePragmaHdrstop(DepToken);
  }
};

/// "\#pragma warning(...)". MSVC's diagnostics do not map cleanly to clang's
/// diagnostics, so we don't really implement this pragma. We parse it and
/// ignore it to avoid -Wunknown-pragma warnings.
struct PragmaWarningHandler : public PragmaHandler {
  PragmaWarningHandler() : PragmaHandler("warning") {}

  void HandlePragma(Preprocessor &PP, PragmaIntroducer Introducer,
                    Token &Tok) override {
    // Parse things like:
    // warning(push, 1)
    // warning(pop)
    // warning(disable : 1 2 3 ; error : 4 5 6 ; suppress : 7 8 9)
    SourceLocation DiagLoc = Tok.getLocation();
    PPCallbacks *Callbacks = PP.getPPCallbacks();

    PP.Lex(Tok);
    if (Tok.isNot(tok::l_paren)) {
      PP.Diag(Tok, diag::warn_pragma_warning_expected) << "(";
      return;
    }

    PP.Lex(Tok);
    IdentifierInfo *II = Tok.getIdentifierInfo();

    if (II && II->isStr("push")) {
      // #pragma warning( push[ ,n ] )
      // Level stays -1 when no level is given or the literal fails to parse;
      // the range check below then rejects it.
      int Level = -1;
      PP.Lex(Tok);
      if (Tok.is(tok::comma)) {
        PP.Lex(Tok);
        uint64_t Value;
        if (Tok.is(tok::numeric_constant) &&
            PP.parseSimpleIntegerLiteral(Tok, Value))
          Level = int(Value);
        if (Level < 0 || Level > 4) {
          PP.Diag(Tok, diag::warn_pragma_warning_push_level);
          return;
        }
      }
      if (Callbacks)
        Callbacks->PragmaWarningPush(DiagLoc, Level);
    } else if (II && II->isStr("pop")) {
      // #pragma warning( pop )
      PP.Lex(Tok);
      if (Callbacks)
        Callbacks->PragmaWarningPop(DiagLoc);
    } else {
      // #pragma warning( warning-specifier : warning-number-list
      //                  [; warning-specifier : warning-number-list...] )
      while (true) {
        II = Tok.getIdentifierInfo();
        if (!II && !Tok.is(tok::numeric_constant)) {
          PP.Diag(Tok, diag::warn_pragma_warning_spec_invalid);
          return;
        }

        // Figure out which warning specifier this is.
        bool SpecifierValid;
        StringRef Specifier;
        llvm::SmallString<1> SpecifierBuf;
        if (II) {
          Specifier = II->getName();
          SpecifierValid = llvm::StringSwitch<bool>(Specifier)
                               .Cases("default", "disable", "error", "once",
                                      "suppress", true)
                               .Default(false);
          // If we read a correct specifier, snatch next token (that should be
          // ":", checked later).
          if (SpecifierValid)
            PP.Lex(Tok);
        } else {
          // Token is a numeric constant. It should be either 1, 2, 3 or 4.
          uint64_t Value;
          Specifier = PP.getSpelling(Tok, SpecifierBuf);
          if (PP.parseSimpleIntegerLiteral(Tok, Value)) {
            SpecifierValid = (Value >= 1) && (Value <= 4);
          } else
            SpecifierValid = false;
          // Next token already snatched by parseSimpleIntegerLiteral.
        }

        if (!SpecifierValid) {
          PP.Diag(Tok, diag::warn_pragma_warning_spec_invalid);
          return;
        }
        if (Tok.isNot(tok::colon)) {
          PP.Diag(Tok, diag::warn_pragma_warning_expected) << ":";
          return;
        }

        // Collect the warning ids.
        SmallVector<int, 4> Ids;
        PP.Lex(Tok);
        while (Tok.is(tok::numeric_constant)) {
          uint64_t Value;
          // Ids must be positive and fit in an int; parseSimpleIntegerLiteral
          // advances Tok past the constant.
          if (!PP.parseSimpleIntegerLiteral(Tok, Value) || Value == 0 ||
              Value > std::numeric_limits<int>::max()) {
            PP.Diag(Tok, diag::warn_pragma_warning_expected_number);
            return;
          }
          Ids.push_back(int(Value));
        }
        if (Callbacks)
          Callbacks->PragmaWarning(DiagLoc, Specifier, Ids);

        // Parse the next specifier if there is a semicolon.
        if (Tok.isNot(tok::semi))
          break;
        PP.Lex(Tok);
      }
    }

    if (Tok.isNot(tok::r_paren)) {
      PP.Diag(Tok, diag::warn_pragma_warning_expected) << ")";
      return;
    }

    PP.Lex(Tok);
    if (Tok.isNot(tok::eod))
      PP.Diag(Tok, diag::ext_pp_extra_tokens_at_eol) << "pragma warning";
  }
};

/// "\#pragma execution_character_set(...)". MSVC supports this pragma only
/// for "UTF-8". We parse it and ignore it if UTF-8 is provided and warn
/// otherwise to avoid -Wunknown-pragma warnings.
+struct PragmaExecCharsetHandler : public PragmaHandler { + PragmaExecCharsetHandler() : PragmaHandler("execution_character_set") {} + + void HandlePragma(Preprocessor &PP, PragmaIntroducer Introducer, + Token &Tok) override { + // Parse things like: + // execution_character_set(push, "UTF-8") + // execution_character_set(pop) + SourceLocation DiagLoc = Tok.getLocation(); + PPCallbacks *Callbacks = PP.getPPCallbacks(); + + PP.Lex(Tok); + if (Tok.isNot(tok::l_paren)) { + PP.Diag(Tok, diag::warn_pragma_exec_charset_expected) << "("; + return; + } + + PP.Lex(Tok); + IdentifierInfo *II = Tok.getIdentifierInfo(); + + if (II && II->isStr("push")) { + // #pragma execution_character_set( push[ , string ] ) + PP.Lex(Tok); + if (Tok.is(tok::comma)) { + PP.Lex(Tok); + + std::string ExecCharset; + if (!PP.FinishLexStringLiteral(Tok, ExecCharset, + "pragma execution_character_set", + /*AllowMacroExpansion=*/false)) + return; + + // MSVC supports either of these, but nothing else. + if (ExecCharset != "UTF-8" && ExecCharset != "utf-8") { + PP.Diag(Tok, diag::warn_pragma_exec_charset_push_invalid) << ExecCharset; + return; + } + } + if (Callbacks) + Callbacks->PragmaExecCharsetPush(DiagLoc, "UTF-8"); + } else if (II && II->isStr("pop")) { + // #pragma execution_character_set( pop ) + PP.Lex(Tok); + if (Callbacks) + Callbacks->PragmaExecCharsetPop(DiagLoc); + } else { + PP.Diag(Tok, diag::warn_pragma_exec_charset_spec_invalid); + return; + } + + if (Tok.isNot(tok::r_paren)) { + PP.Diag(Tok, diag::warn_pragma_exec_charset_expected) << ")"; + return; + } + + PP.Lex(Tok); + if (Tok.isNot(tok::eod)) + PP.Diag(Tok, diag::ext_pp_extra_tokens_at_eol) << "pragma execution_character_set"; + } +}; + +/// PragmaIncludeAliasHandler - "\#pragma include_alias("...")". 
struct PragmaIncludeAliasHandler : public PragmaHandler {
  PragmaIncludeAliasHandler() : PragmaHandler("include_alias") {}

  void HandlePragma(Preprocessor &PP, PragmaIntroducer Introducer,
                    Token &IncludeAliasTok) override {
    PP.HandlePragmaIncludeAlias(IncludeAliasTok);
  }
};

/// PragmaMessageHandler - Handle the microsoft and gcc \#pragma message
/// extension. The syntax is:
/// \code
///   #pragma message(string)
/// \endcode
/// OR, in GCC mode:
/// \code
///   #pragma message string
/// \endcode
/// string is a string, which is fully macro expanded, and permits string
/// concatenation, embedded escape characters, etc... See MSDN for more details.
/// Also handles \#pragma GCC warning and \#pragma GCC error which take the same
/// form as \#pragma message.
struct PragmaMessageHandler : public PragmaHandler {
private:
  const PPCallbacks::PragmaMessageKind Kind;
  const StringRef Namespace;

  /// Map the message kind to the handler/pragma name used in diagnostics.
  static const char *PragmaKind(PPCallbacks::PragmaMessageKind Kind,
                                bool PragmaNameOnly = false) {
    switch (Kind) {
    case PPCallbacks::PMK_Message:
      return PragmaNameOnly ? "message" : "pragma message";
    case PPCallbacks::PMK_Warning:
      return PragmaNameOnly ? "warning" : "pragma warning";
    case PPCallbacks::PMK_Error:
      return PragmaNameOnly ? "error" : "pragma error";
    }
    llvm_unreachable("Unknown PragmaMessageKind!");
  }

public:
  PragmaMessageHandler(PPCallbacks::PragmaMessageKind Kind,
                       StringRef Namespace = StringRef())
      : PragmaHandler(PragmaKind(Kind, true)), Kind(Kind),
        Namespace(Namespace) {}

  void HandlePragma(Preprocessor &PP, PragmaIntroducer Introducer,
                    Token &Tok) override {
    SourceLocation MessageLoc = Tok.getLocation();
    PP.Lex(Tok);
    bool ExpectClosingParen = false;
    switch (Tok.getKind()) {
    case tok::l_paren:
      // We have a MSVC style pragma message.
      ExpectClosingParen = true;
      // Read the string.
      PP.Lex(Tok);
      break;
    case tok::string_literal:
      // We have a GCC style pragma message, and we just read the string.
      break;
    default:
      PP.Diag(MessageLoc, diag::err_pragma_message_malformed) << Kind;
      return;
    }

    std::string MessageString;
    if (!PP.FinishLexStringLiteral(Tok, MessageString, PragmaKind(Kind),
                                   /*AllowMacroExpansion=*/true))
      return;

    if (ExpectClosingParen) {
      if (Tok.isNot(tok::r_paren)) {
        PP.Diag(Tok.getLocation(), diag::err_pragma_message_malformed) << Kind;
        return;
      }
      PP.Lex(Tok); // eat the r_paren.
    }

    if (Tok.isNot(tok::eod)) {
      PP.Diag(Tok.getLocation(), diag::err_pragma_message_malformed) << Kind;
      return;
    }

    // Output the message.
    PP.Diag(MessageLoc, (Kind == PPCallbacks::PMK_Error)
                            ? diag::err_pragma_message
                            : diag::warn_pragma_message)
        << MessageString;

    // If the pragma is lexically sound, notify any interested PPCallbacks.
    if (PPCallbacks *Callbacks = PP.getPPCallbacks())
      Callbacks->PragmaMessage(MessageLoc, Namespace, Kind, MessageString);
  }
};

/// Handle the clang \#pragma module import extension. The syntax is:
/// \code
///   #pragma clang module import some.module.name
/// \endcode
struct PragmaModuleImportHandler : public PragmaHandler {
  PragmaModuleImportHandler() : PragmaHandler("import") {}

  void HandlePragma(Preprocessor &PP, PragmaIntroducer Introducer,
                    Token &Tok) override {
    SourceLocation ImportLoc = Tok.getLocation();

    // Read the module name.
    llvm::SmallVector<std::pair<IdentifierInfo *, SourceLocation>, 8>
        ModuleName;
    if (LexModuleName(PP, Tok, ModuleName))
      return;

    if (Tok.isNot(tok::eod))
      PP.Diag(Tok, diag::ext_pp_extra_tokens_at_eol) << "pragma";

    // If we have a non-empty module path, load the named module.
    Module *Imported =
        PP.getModuleLoader().loadModule(ImportLoc, ModuleName, Module::Hidden,
                                        /*IsInclusionDirective=*/false);
    if (!Imported)
      return;

    // Make the module visible and hand the parser an annotation token
    // covering the whole pragma.
    PP.makeModuleVisible(Imported, ImportLoc);
    PP.EnterAnnotationToken(SourceRange(ImportLoc, ModuleName.back().second),
                            tok::annot_module_include, Imported);
    if (auto *CB = PP.getPPCallbacks())
      CB->moduleImport(ImportLoc, ModuleName, Imported);
  }
};

/// Handle the clang \#pragma module begin extension. The syntax is:
/// \code
///   #pragma clang module begin some.module.name
///   ...
///   #pragma clang module end
/// \endcode
struct PragmaModuleBeginHandler : public PragmaHandler {
  PragmaModuleBeginHandler() : PragmaHandler("begin") {}

  void HandlePragma(Preprocessor &PP, PragmaIntroducer Introducer,
                    Token &Tok) override {
    SourceLocation BeginLoc = Tok.getLocation();

    // Read the module name.
    llvm::SmallVector<std::pair<IdentifierInfo *, SourceLocation>, 8>
        ModuleName;
    if (LexModuleName(PP, Tok, ModuleName))
      return;

    if (Tok.isNot(tok::eod))
      PP.Diag(Tok, diag::ext_pp_extra_tokens_at_eol) << "pragma";

    // We can only enter submodules of the current module.
    StringRef Current = PP.getLangOpts().CurrentModule;
    if (ModuleName.front().first->getName() != Current) {
      PP.Diag(ModuleName.front().second, diag::err_pp_module_begin_wrong_module)
          << ModuleName.front().first << (ModuleName.size() > 1)
          << Current.empty() << Current;
      return;
    }

    // Find the module we're entering. We require that a module map for it
    // be loaded or implicitly loadable.
    auto &HSI = PP.getHeaderSearchInfo();
    Module *M = HSI.lookupModule(Current);
    if (!M) {
      PP.Diag(ModuleName.front().second,
              diag::err_pp_module_begin_no_module_map)
          << Current;
      return;
    }
    // Walk the remaining dotted-path components down the submodule tree.
    for (unsigned I = 1; I != ModuleName.size(); ++I) {
      auto *NewM = M->findOrInferSubmodule(ModuleName[I].first->getName());
      if (!NewM) {
        PP.Diag(ModuleName[I].second, diag::err_pp_module_begin_no_submodule)
            << M->getFullModuleName() << ModuleName[I].first;
        return;
      }
      M = NewM;
    }

    // If the module isn't available, it doesn't make sense to enter it.
    if (Preprocessor::checkModuleIsAvailable(
            PP.getLangOpts(), PP.getTargetInfo(), PP.getDiagnostics(), M)) {
      PP.Diag(BeginLoc, diag::note_pp_module_begin_here)
          << M->getTopLevelModuleName();
      return;
    }

    // Enter the scope of the submodule.
    PP.EnterSubmodule(M, BeginLoc, /*ForPragma*/ true);
    PP.EnterAnnotationToken(SourceRange(BeginLoc, ModuleName.back().second),
                            tok::annot_module_begin, M);
  }
};

/// Handle the clang \#pragma module end extension.
struct PragmaModuleEndHandler : public PragmaHandler {
  PragmaModuleEndHandler() : PragmaHandler("end") {}

  void HandlePragma(Preprocessor &PP, PragmaIntroducer Introducer,
                    Token &Tok) override {
    SourceLocation Loc = Tok.getLocation();

    PP.LexUnexpandedToken(Tok);
    if (Tok.isNot(tok::eod))
      PP.Diag(Tok, diag::ext_pp_extra_tokens_at_eol) << "pragma";

    Module *M = PP.LeaveSubmodule(/*ForPragma*/ true);
    if (M)
      PP.EnterAnnotationToken(SourceRange(Loc), tok::annot_module_end, M);
    else
      PP.Diag(Loc, diag::err_pp_module_end_without_module_begin);
  }
};

/// Handle the clang \#pragma module build extension.
struct PragmaModuleBuildHandler : public PragmaHandler {
  PragmaModuleBuildHandler() : PragmaHandler("build") {}

  void HandlePragma(Preprocessor &PP, PragmaIntroducer Introducer,
                    Token &Tok) override {
    PP.HandlePragmaModuleBuild(Tok);
  }
};

/// Handle the clang \#pragma module load extension.
struct PragmaModuleLoadHandler : public PragmaHandler {
  PragmaModuleLoadHandler() : PragmaHandler("load") {}

  void HandlePragma(Preprocessor &PP, PragmaIntroducer Introducer,
                    Token &Tok) override {
    SourceLocation Loc = Tok.getLocation();

    // Read the module name.
    llvm::SmallVector<std::pair<IdentifierInfo *, SourceLocation>, 8>
        ModuleName;
    if (LexModuleName(PP, Tok, ModuleName))
      return;

    if (Tok.isNot(tok::eod))
      PP.Diag(Tok, diag::ext_pp_extra_tokens_at_eol) << "pragma";

    // Load the module, don't make it visible.
    PP.getModuleLoader().loadModule(Loc, ModuleName, Module::Hidden,
                                    /*IsInclusionDirective=*/false);
  }
};

/// PragmaPushMacroHandler - "\#pragma push_macro" saves the value of the
/// macro on the top of the stack.
struct PragmaPushMacroHandler : public PragmaHandler {
  PragmaPushMacroHandler() : PragmaHandler("push_macro") {}

  void HandlePragma(Preprocessor &PP, PragmaIntroducer Introducer,
                    Token &PushMacroTok) override {
    PP.HandlePragmaPushMacro(PushMacroTok);
  }
};

/// PragmaPopMacroHandler - "\#pragma pop_macro" sets the value of the
/// macro to the value on the top of the stack.
struct PragmaPopMacroHandler : public PragmaHandler {
  PragmaPopMacroHandler() : PragmaHandler("pop_macro") {}

  void HandlePragma(Preprocessor &PP, PragmaIntroducer Introducer,
                    Token &PopMacroTok) override {
    PP.HandlePragmaPopMacro(PopMacroTok);
  }
};

/// PragmaARCCFCodeAuditedHandler -
///   \#pragma clang arc_cf_code_audited begin/end
struct PragmaARCCFCodeAuditedHandler : public PragmaHandler {
  PragmaARCCFCodeAuditedHandler() : PragmaHandler("arc_cf_code_audited") {}

  void HandlePragma(Preprocessor &PP, PragmaIntroducer Introducer,
                    Token &NameTok) override {
    SourceLocation Loc = NameTok.getLocation();
    bool IsBegin;

    Token Tok;

    // Lex the 'begin' or 'end'.
    PP.LexUnexpandedToken(Tok);
    const IdentifierInfo *BeginEnd = Tok.getIdentifierInfo();
    if (BeginEnd && BeginEnd->isStr("begin")) {
      IsBegin = true;
    } else if (BeginEnd && BeginEnd->isStr("end")) {
      IsBegin = false;
    } else {
      PP.Diag(Tok.getLocation(), diag::err_pp_arc_cf_code_audited_syntax);
      return;
    }

    // Verify that this is followed by EOD.
    PP.LexUnexpandedToken(Tok);
    if (Tok.isNot(tok::eod))
      PP.Diag(Tok, diag::ext_pp_extra_tokens_at_eol) << "pragma";

    // The start location of the active audit.
    SourceLocation BeginLoc = PP.getPragmaARCCFCodeAuditedInfo().second;

    // The start location we want after processing this.
    SourceLocation NewLoc;

    if (IsBegin) {
      // Complain about attempts to re-enter an audit.
      if (BeginLoc.isValid()) {
        PP.Diag(Loc, diag::err_pp_double_begin_of_arc_cf_code_audited);
        PP.Diag(BeginLoc, diag::note_pragma_entered_here);
      }
      NewLoc = Loc;
    } else {
      // Complain about attempts to leave an audit that doesn't exist.
      if (!BeginLoc.isValid()) {
        PP.Diag(Loc, diag::err_pp_unmatched_end_of_arc_cf_code_audited);
        return;
      }
      // An invalid location marks the audit region as closed.
      NewLoc = SourceLocation();
    }

    PP.setPragmaARCCFCodeAuditedInfo(NameTok.getIdentifierInfo(), NewLoc);
  }
};

/// PragmaAssumeNonNullHandler -
///   \#pragma clang assume_nonnull begin/end
struct PragmaAssumeNonNullHandler : public PragmaHandler {
  PragmaAssumeNonNullHandler() : PragmaHandler("assume_nonnull") {}

  void HandlePragma(Preprocessor &PP, PragmaIntroducer Introducer,
                    Token &NameTok) override {
    SourceLocation Loc = NameTok.getLocation();
    bool IsBegin;

    Token Tok;

    // Lex the 'begin' or 'end'.
    PP.LexUnexpandedToken(Tok);
    const IdentifierInfo *BeginEnd = Tok.getIdentifierInfo();
    if (BeginEnd && BeginEnd->isStr("begin")) {
      IsBegin = true;
    } else if (BeginEnd && BeginEnd->isStr("end")) {
      IsBegin = false;
    } else {
      PP.Diag(Tok.getLocation(), diag::err_pp_assume_nonnull_syntax);
      return;
    }

    // Verify that this is followed by EOD.
    PP.LexUnexpandedToken(Tok);
    if (Tok.isNot(tok::eod))
      PP.Diag(Tok, diag::ext_pp_extra_tokens_at_eol) << "pragma";

    // The start location of the active audit.
    SourceLocation BeginLoc = PP.getPragmaAssumeNonNullLoc();

    // The start location we want after processing this.
    SourceLocation NewLoc;
    PPCallbacks *Callbacks = PP.getPPCallbacks();

    if (IsBegin) {
      // Complain about attempts to re-enter an audit.
      if (BeginLoc.isValid()) {
        PP.Diag(Loc, diag::err_pp_double_begin_of_assume_nonnull);
        PP.Diag(BeginLoc, diag::note_pragma_entered_here);
      }
      NewLoc = Loc;
      if (Callbacks)
        Callbacks->PragmaAssumeNonNullBegin(NewLoc);
    } else {
      // Complain about attempts to leave an audit that doesn't exist.
      if (!BeginLoc.isValid()) {
        PP.Diag(Loc, diag::err_pp_unmatched_end_of_assume_nonnull);
        return;
      }
      // An invalid location marks the region as closed.
      NewLoc = SourceLocation();
      if (Callbacks)
        Callbacks->PragmaAssumeNonNullEnd(NewLoc);
    }

    PP.setPragmaAssumeNonNullLoc(NewLoc);
  }
};

/// Handle "\#pragma region [...]"
///
/// The syntax is
/// \code
///   #pragma region [optional name]
///   #pragma endregion [optional comment]
/// \endcode
///
/// \note This is
/// <a href="http://msdn.microsoft.com/en-us/library/b6xkz944(v=vs.80).aspx">editor-only</a>
/// pragma, just skipped by compiler.
struct PragmaRegionHandler : public PragmaHandler {
  PragmaRegionHandler(const char *pragma) : PragmaHandler(pragma) {}

  void HandlePragma(Preprocessor &PP, PragmaIntroducer Introducer,
                    Token &NameTok) override {
    // #pragma region: endregion matches can be verified
    // __pragma(region): no sense, but ignored by msvc
    // _Pragma is not valid for MSVC, but there isn't any point
    // to handle a _Pragma differently.
  }
};

} // namespace

/// RegisterBuiltinPragmas - Install the standard preprocessor pragmas:
/// \#pragma GCC poison/system_header/dependency and \#pragma once.
void Preprocessor::RegisterBuiltinPragmas() {
  // Pragmas available in the default (empty) namespace.
  AddPragmaHandler(new PragmaOnceHandler());
  AddPragmaHandler(new PragmaMarkHandler());
  AddPragmaHandler(new PragmaPushMacroHandler());
  AddPragmaHandler(new PragmaPopMacroHandler());
  AddPragmaHandler(new PragmaMessageHandler(PPCallbacks::PMK_Message));

  // #pragma GCC ...
  AddPragmaHandler("GCC", new PragmaPoisonHandler());
  AddPragmaHandler("GCC", new PragmaSystemHeaderHandler());
  AddPragmaHandler("GCC", new PragmaDependencyHandler());
  AddPragmaHandler("GCC", new PragmaDiagnosticHandler("GCC"));
  AddPragmaHandler("GCC", new PragmaMessageHandler(PPCallbacks::PMK_Warning,
                                                   "GCC"));
  AddPragmaHandler("GCC", new PragmaMessageHandler(PPCallbacks::PMK_Error,
                                                   "GCC"));
  // #pragma clang ...
  AddPragmaHandler("clang", new PragmaPoisonHandler());
  AddPragmaHandler("clang", new PragmaSystemHeaderHandler());
  AddPragmaHandler("clang", new PragmaDebugHandler());
  AddPragmaHandler("clang", new PragmaDependencyHandler());
  AddPragmaHandler("clang", new PragmaDiagnosticHandler("clang"));
  AddPragmaHandler("clang", new PragmaARCCFCodeAuditedHandler());
  AddPragmaHandler("clang", new PragmaAssumeNonNullHandler());

  // #pragma clang module ... — a nested namespace holding the sub-handlers.
  auto *ModuleHandler = new PragmaNamespace("module");
  AddPragmaHandler("clang", ModuleHandler);
  ModuleHandler->AddPragma(new PragmaModuleImportHandler());
  ModuleHandler->AddPragma(new PragmaModuleBeginHandler());
  ModuleHandler->AddPragma(new PragmaModuleEndHandler());
  ModuleHandler->AddPragma(new PragmaModuleBuildHandler());
  ModuleHandler->AddPragma(new PragmaModuleLoadHandler());

  // Add region pragmas.
  AddPragmaHandler(new PragmaRegionHandler("region"));
  AddPragmaHandler(new PragmaRegionHandler("endregion"));

  // MS extensions.
  if (LangOpts.MicrosoftExt) {
    AddPragmaHandler(new PragmaWarningHandler());
    AddPragmaHandler(new PragmaExecCharsetHandler());
    AddPragmaHandler(new PragmaIncludeAliasHandler());
    AddPragmaHandler(new PragmaHdrstopHandler());
  }

  // Pragmas added by plugins
  for (PragmaHandlerRegistry::iterator it = PragmaHandlerRegistry::begin(),
                                       ie = PragmaHandlerRegistry::end();
       it != ie; ++it) {
    AddPragmaHandler(it->instantiate().release());
  }
}

/// Ignore all pragmas, useful for modes such as -Eonly which would otherwise
/// warn about those pragmas being unknown.
void Preprocessor::IgnorePragmas() {
  AddPragmaHandler(new EmptyPragmaHandler());
  // Also ignore all pragmas in all namespaces created
  // in Preprocessor::RegisterBuiltinPragmas().
  AddPragmaHandler("GCC", new EmptyPragmaHandler());
  AddPragmaHandler("clang", new EmptyPragmaHandler());
}
diff --git a/clang/lib/Lex/PreprocessingRecord.cpp b/clang/lib/Lex/PreprocessingRecord.cpp
new file mode 100644
index 000000000000..115256db4809
--- /dev/null
+++ b/clang/lib/Lex/PreprocessingRecord.cpp
@@ -0,0 +1,516 @@
//===- PreprocessingRecord.cpp - Record of Preprocessing ------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the PreprocessingRecord class, which maintains a record
// of what occurred during preprocessing, and its helpers.
//
//===----------------------------------------------------------------------===//

#include "clang/Lex/PreprocessingRecord.h"
#include "clang/Basic/IdentifierTable.h"
#include "clang/Basic/LLVM.h"
#include "clang/Basic/SourceLocation.h"
#include "clang/Basic/SourceManager.h"
#include "clang/Basic/TokenKinds.h"
#include "clang/Lex/MacroInfo.h"
#include "clang/Lex/Token.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Support/Capacity.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstring>
#include <iterator>
#include <utility>
#include <vector>

using namespace clang;

ExternalPreprocessingRecordSource::~ExternalPreprocessingRecordSource() =
    default;

InclusionDirective::InclusionDirective(PreprocessingRecord &PPRec,
                                       InclusionKind Kind, StringRef FileName,
                                       bool InQuotes, bool ImportedModule,
                                       const FileEntry *File, SourceRange Range)
    : PreprocessingDirective(InclusionDirectiveKind, Range), InQuotes(InQuotes),
      Kind(Kind), ImportedModule(ImportedModule), File(File) {
  // Copy the filename into the record's own allocator (NUL-terminated) so it
  // outlives the caller's buffer.
  char *Memory = (char *)PPRec.Allocate(FileName.size() + 1, alignof(char));
  memcpy(Memory, FileName.data(), FileName.size());
  Memory[FileName.size()] = 0;
  this->FileName = StringRef(Memory, FileName.size());
}

PreprocessingRecord::PreprocessingRecord(SourceManager &SM) : SourceMgr(SM) {}

/// Returns a pair of [Begin, End) iterators of preprocessed entities
/// that source range \p Range encompasses.
llvm::iterator_range<PreprocessingRecord::iterator>
PreprocessingRecord::getPreprocessedEntitiesInRange(SourceRange Range) {
  if (Range.isInvalid())
    return llvm::make_range(iterator(), iterator());

  // Serve repeated queries for the same range from the one-entry cache.
  if (CachedRangeQuery.Range == Range) {
    return llvm::make_range(iterator(this, CachedRangeQuery.Result.first),
                            iterator(this, CachedRangeQuery.Result.second));
  }

  std::pair<int, int> Res = getPreprocessedEntitiesInRangeSlow(Range);

  CachedRangeQuery.Range = Range;
  CachedRangeQuery.Result = Res;

  return llvm::make_range(iterator(this, Res.first),
                          iterator(this, Res.second));
}

/// Returns true if \p PPE is non-null and its begin location maps into \p FID.
static bool isPreprocessedEntityIfInFileID(PreprocessedEntity *PPE, FileID FID,
                                           SourceManager &SM) {
  assert(FID.isValid());
  if (!PPE)
    return false;

  SourceLocation Loc = PPE->getSourceRange().getBegin();
  if (Loc.isInvalid())
    return false;

  return SM.isInFileID(SM.getFileLoc(Loc), FID);
}

/// Returns true if the preprocessed entity that \arg PPEI iterator
/// points to is coming from the file \arg FID.
///
/// Can be used to avoid implicit deserializations of preallocated
/// preprocessed entities if we only care about entities of a specific file
/// and not from files \#included in the range given at
/// \see getPreprocessedEntitiesInRange.
bool PreprocessingRecord::isEntityInFileID(iterator PPEI, FileID FID) {
  if (FID.isInvalid())
    return false;

  // Negative positions index entities loaded from the external source;
  // non-negative positions index local entities.
  int Pos = std::distance(iterator(this, 0), PPEI);
  if (Pos < 0) {
    if (unsigned(-Pos - 1) >= LoadedPreprocessedEntities.size()) {
      assert(0 && "Out-of bounds loaded preprocessed entity");
      return false;
    }
    assert(ExternalSource && "No external source to load from");
    unsigned LoadedIndex = LoadedPreprocessedEntities.size() + Pos;
    // Use the already-deserialized entity if we have it.
    if (PreprocessedEntity *PPE = LoadedPreprocessedEntities[LoadedIndex])
      return isPreprocessedEntityIfInFileID(PPE, FID, SourceMgr);

    // See if the external source can see if the entity is in the file without
    // deserializing it.
    Optional<bool> IsInFile =
        ExternalSource->isPreprocessedEntityInFileID(LoadedIndex, FID);
    if (IsInFile.hasValue())
      return IsInFile.getValue();

    // The external source did not provide a definite answer, go and deserialize
    // the entity to check it.
    return isPreprocessedEntityIfInFileID(
        getLoadedPreprocessedEntity(LoadedIndex),
        FID, SourceMgr);
  }

  if (unsigned(Pos) >= PreprocessedEntities.size()) {
    assert(0 && "Out-of bounds local preprocessed entity");
    return false;
  }
  return isPreprocessedEntityIfInFileID(PreprocessedEntities[Pos],
                                        FID, SourceMgr);
}

/// Returns a pair of [Begin, End) iterators of preprocessed entities
/// that source range \arg R encompasses.
std::pair<int, int>
PreprocessingRecord::getPreprocessedEntitiesInRangeSlow(SourceRange Range) {
  assert(Range.isValid());
  assert(!SourceMgr.isBeforeInTranslationUnit(Range.getEnd(), Range.getBegin()));

  std::pair<unsigned, unsigned>
    Local = findLocalPreprocessedEntitiesInRange(Range);

  // Check if range spans local entities.
  if (!ExternalSource || SourceMgr.isLocalSourceLocation(Range.getBegin()))
    return std::make_pair(Local.first, Local.second);

  std::pair<unsigned, unsigned>
    Loaded = ExternalSource->findPreprocessedEntitiesInRange(Range);

  // The external source found nothing; only local entities are in range.
  if (Loaded.first == Loaded.second)
    return std::make_pair(Local.first, Local.second);

  unsigned TotalLoaded = LoadedPreprocessedEntities.size();

  // Check if range spans loaded entities.
  // Loaded-entity positions are encoded as negative indices (see
  // isEntityInFileID), hence the bias by -TotalLoaded.
  if (Local.first == Local.second)
    return std::make_pair(int(Loaded.first) - TotalLoaded,
                          int(Loaded.second) - TotalLoaded);

  // Range spans loaded and local entities.
  return std::make_pair(int(Loaded.first) - TotalLoaded, Local.second);
}

std::pair<unsigned, unsigned>
PreprocessingRecord::findLocalPreprocessedEntitiesInRange(
    SourceRange Range) const {
  if (Range.isInvalid())
    return std::make_pair(0, 0);
  assert(!SourceMgr.isBeforeInTranslationUnit(Range.getEnd(), Range.getBegin()));

  unsigned Begin = findBeginLocalPreprocessedEntity(Range.getBegin());
  unsigned End = findEndLocalPreprocessedEntity(Range.getEnd());
  return std::make_pair(Begin, End);
}

namespace {

/// Orders preprocessed entities (or a bare SourceLocation against one) by the
/// translation-unit position of the range endpoint selected by \p getRangeLoc.
template <SourceLocation (SourceRange::*getRangeLoc)() const>
struct PPEntityComp {
  const SourceManager &SM;

  explicit PPEntityComp(const SourceManager &SM) : SM(SM) {}

  bool operator()(PreprocessedEntity *L, PreprocessedEntity *R) const {
    SourceLocation LHS = getLoc(L);
    SourceLocation RHS = getLoc(R);
    return SM.isBeforeInTranslationUnit(LHS, RHS);
  }

  bool operator()(PreprocessedEntity *L, SourceLocation RHS) const {
    SourceLocation LHS = getLoc(L);
    return SM.isBeforeInTranslationUnit(LHS, RHS);
  }

  bool operator()(SourceLocation LHS, PreprocessedEntity *R) const {
    SourceLocation RHS = getLoc(R);
    return SM.isBeforeInTranslationUnit(LHS, RHS);
  }

  SourceLocation getLoc(PreprocessedEntity *PPE) const {
    SourceRange Range = PPE->getSourceRange();
    return (Range.*getRangeLoc)();
  }
};

} // namespace

unsigned PreprocessingRecord::findBeginLocalPreprocessedEntity(
    SourceLocation Loc) const {
  if (SourceMgr.isLoadedSourceLocation(Loc))
    return 0;

  size_t Count = PreprocessedEntities.size();
  size_t Half;
  std::vector<PreprocessedEntity *>::const_iterator
    First = PreprocessedEntities.begin();
  std::vector<PreprocessedEntity *>::const_iterator I;

  // Do a binary search manually instead of using std::lower_bound because
  // the end locations of entities may be unordered (when a macro expansion
  // is inside another macro argument), but for this case it is not important
  // whether we get the first macro expansion or its containing macro.
  while (Count > 0) {
    Half = Count / 2;
    I = First;
    std::advance(I, Half);
    if (SourceMgr.isBeforeInTranslationUnit((*I)->getSourceRange().getEnd(),
                                            Loc)) {
      First = I;
      ++First;
      Count = Count - Half - 1;
    } else
      Count = Half;
  }

  return First - PreprocessedEntities.begin();
}

unsigned
PreprocessingRecord::findEndLocalPreprocessedEntity(SourceLocation Loc) const {
  if (SourceMgr.isLoadedSourceLocation(Loc))
    return 0;

  // Begin locations are ordered, so a plain upper_bound suffices here.
  auto I = llvm::upper_bound(PreprocessedEntities, Loc,
                             PPEntityComp<&SourceRange::getBegin>(SourceMgr));
  return I - PreprocessedEntities.begin();
}

PreprocessingRecord::PPEntityID
PreprocessingRecord::addPreprocessedEntity(PreprocessedEntity *Entity) {
  assert(Entity);
  SourceLocation BeginLoc = Entity->getSourceRange().getBegin();

  if (isa<MacroDefinitionRecord>(Entity)) {
    assert((PreprocessedEntities.empty() ||
            !SourceMgr.isBeforeInTranslationUnit(
                BeginLoc,
                PreprocessedEntities.back()->getSourceRange().getBegin())) &&
           "a macro definition was encountered out-of-order");
    PreprocessedEntities.push_back(Entity);
    return getPPEntityID(PreprocessedEntities.size()-1, /*isLoaded=*/false);
  }

  // Check normal case, this entity begin location is after the previous one.
+ if (PreprocessedEntities.empty() || + !SourceMgr.isBeforeInTranslationUnit(BeginLoc, + PreprocessedEntities.back()->getSourceRange().getBegin())) { + PreprocessedEntities.push_back(Entity); + return getPPEntityID(PreprocessedEntities.size()-1, /*isLoaded=*/false); + } + + // The entity's location is not after the previous one; this can happen with + // include directives that form the filename using macros, e.g: + // "#include MACRO(STUFF)" + // or with macro expansions inside macro arguments where the arguments are + // not expanded in the same order as listed, e.g: + // \code + // #define M1 1 + // #define M2 2 + // #define FM(x,y) y x + // FM(M1, M2) + // \endcode + + using pp_iter = std::vector<PreprocessedEntity *>::iterator; + + // Usually there are few macro expansions when defining the filename, do a + // linear search for a few entities. + unsigned count = 0; + for (pp_iter RI = PreprocessedEntities.end(), + Begin = PreprocessedEntities.begin(); + RI != Begin && count < 4; --RI, ++count) { + pp_iter I = RI; + --I; + if (!SourceMgr.isBeforeInTranslationUnit(BeginLoc, + (*I)->getSourceRange().getBegin())) { + pp_iter insertI = PreprocessedEntities.insert(RI, Entity); + return getPPEntityID(insertI - PreprocessedEntities.begin(), + /*isLoaded=*/false); + } + } + + // Linear search unsuccessful. Do a binary search. 
+ pp_iter I = + llvm::upper_bound(PreprocessedEntities, BeginLoc, + PPEntityComp<&SourceRange::getBegin>(SourceMgr)); + pp_iter insertI = PreprocessedEntities.insert(I, Entity); + return getPPEntityID(insertI - PreprocessedEntities.begin(), + /*isLoaded=*/false); +} + +void PreprocessingRecord::SetExternalSource( + ExternalPreprocessingRecordSource &Source) { + assert(!ExternalSource && + "Preprocessing record already has an external source"); + ExternalSource = &Source; +} + +unsigned PreprocessingRecord::allocateLoadedEntities(unsigned NumEntities) { + unsigned Result = LoadedPreprocessedEntities.size(); + LoadedPreprocessedEntities.resize(LoadedPreprocessedEntities.size() + + NumEntities); + return Result; +} + +unsigned PreprocessingRecord::allocateSkippedRanges(unsigned NumRanges) { + unsigned Result = SkippedRanges.size(); + SkippedRanges.resize(SkippedRanges.size() + NumRanges); + SkippedRangesAllLoaded = false; + return Result; +} + +void PreprocessingRecord::ensureSkippedRangesLoaded() { + if (SkippedRangesAllLoaded || !ExternalSource) + return; + for (unsigned Index = 0; Index != SkippedRanges.size(); ++Index) { + if (SkippedRanges[Index].isInvalid()) + SkippedRanges[Index] = ExternalSource->ReadSkippedRange(Index); + } + SkippedRangesAllLoaded = true; +} + +void PreprocessingRecord::RegisterMacroDefinition(MacroInfo *Macro, + MacroDefinitionRecord *Def) { + MacroDefinitions[Macro] = Def; +} + +/// Retrieve the preprocessed entity at the given ID. 
+PreprocessedEntity *PreprocessingRecord::getPreprocessedEntity(PPEntityID PPID){ + if (PPID.ID < 0) { + unsigned Index = -PPID.ID - 1; + assert(Index < LoadedPreprocessedEntities.size() && + "Out-of bounds loaded preprocessed entity"); + return getLoadedPreprocessedEntity(Index); + } + + if (PPID.ID == 0) + return nullptr; + unsigned Index = PPID.ID - 1; + assert(Index < PreprocessedEntities.size() && + "Out-of bounds local preprocessed entity"); + return PreprocessedEntities[Index]; +} + +/// Retrieve the loaded preprocessed entity at the given index. +PreprocessedEntity * +PreprocessingRecord::getLoadedPreprocessedEntity(unsigned Index) { + assert(Index < LoadedPreprocessedEntities.size() && + "Out-of bounds loaded preprocessed entity"); + assert(ExternalSource && "No external source to load from"); + PreprocessedEntity *&Entity = LoadedPreprocessedEntities[Index]; + if (!Entity) { + Entity = ExternalSource->ReadPreprocessedEntity(Index); + if (!Entity) // Failed to load. + Entity = new (*this) + PreprocessedEntity(PreprocessedEntity::InvalidKind, SourceRange()); + } + return Entity; +} + +MacroDefinitionRecord * +PreprocessingRecord::findMacroDefinition(const MacroInfo *MI) { + llvm::DenseMap<const MacroInfo *, MacroDefinitionRecord *>::iterator Pos = + MacroDefinitions.find(MI); + if (Pos == MacroDefinitions.end()) + return nullptr; + + return Pos->second; +} + +void PreprocessingRecord::addMacroExpansion(const Token &Id, + const MacroInfo *MI, + SourceRange Range) { + // We don't record nested macro expansions. 
+ if (Id.getLocation().isMacroID()) + return; + + if (MI->isBuiltinMacro()) + addPreprocessedEntity(new (*this) + MacroExpansion(Id.getIdentifierInfo(), Range)); + else if (MacroDefinitionRecord *Def = findMacroDefinition(MI)) + addPreprocessedEntity(new (*this) MacroExpansion(Def, Range)); +} + +void PreprocessingRecord::Ifdef(SourceLocation Loc, const Token &MacroNameTok, + const MacroDefinition &MD) { + // This is not actually a macro expansion but record it as a macro reference. + if (MD) + addMacroExpansion(MacroNameTok, MD.getMacroInfo(), + MacroNameTok.getLocation()); +} + +void PreprocessingRecord::Ifndef(SourceLocation Loc, const Token &MacroNameTok, + const MacroDefinition &MD) { + // This is not actually a macro expansion but record it as a macro reference. + if (MD) + addMacroExpansion(MacroNameTok, MD.getMacroInfo(), + MacroNameTok.getLocation()); +} + +void PreprocessingRecord::Defined(const Token &MacroNameTok, + const MacroDefinition &MD, + SourceRange Range) { + // This is not actually a macro expansion but record it as a macro reference. 
+ if (MD) + addMacroExpansion(MacroNameTok, MD.getMacroInfo(), + MacroNameTok.getLocation()); +} + +void PreprocessingRecord::SourceRangeSkipped(SourceRange Range, + SourceLocation EndifLoc) { + assert(Range.isValid()); + SkippedRanges.emplace_back(Range.getBegin(), EndifLoc); +} + +void PreprocessingRecord::MacroExpands(const Token &Id, + const MacroDefinition &MD, + SourceRange Range, + const MacroArgs *Args) { + addMacroExpansion(Id, MD.getMacroInfo(), Range); +} + +void PreprocessingRecord::MacroDefined(const Token &Id, + const MacroDirective *MD) { + const MacroInfo *MI = MD->getMacroInfo(); + SourceRange R(MI->getDefinitionLoc(), MI->getDefinitionEndLoc()); + MacroDefinitionRecord *Def = + new (*this) MacroDefinitionRecord(Id.getIdentifierInfo(), R); + addPreprocessedEntity(Def); + MacroDefinitions[MI] = Def; +} + +void PreprocessingRecord::MacroUndefined(const Token &Id, + const MacroDefinition &MD, + const MacroDirective *Undef) { + MD.forAllDefinitions([&](MacroInfo *MI) { MacroDefinitions.erase(MI); }); +} + +void PreprocessingRecord::InclusionDirective( + SourceLocation HashLoc, + const Token &IncludeTok, + StringRef FileName, + bool IsAngled, + CharSourceRange FilenameRange, + const FileEntry *File, + StringRef SearchPath, + StringRef RelativePath, + const Module *Imported, + SrcMgr::CharacteristicKind FileType) { + InclusionDirective::InclusionKind Kind = InclusionDirective::Include; + + switch (IncludeTok.getIdentifierInfo()->getPPKeywordID()) { + case tok::pp_include: + Kind = InclusionDirective::Include; + break; + + case tok::pp_import: + Kind = InclusionDirective::Import; + break; + + case tok::pp_include_next: + Kind = InclusionDirective::IncludeNext; + break; + + case tok::pp___include_macros: + Kind = InclusionDirective::IncludeMacros; + break; + + default: + llvm_unreachable("Unknown include directive kind"); + } + + SourceLocation EndLoc; + if (!IsAngled) { + EndLoc = FilenameRange.getBegin(); + } else { + EndLoc = FilenameRange.getEnd(); + 
if (FilenameRange.isCharRange()) + EndLoc = EndLoc.getLocWithOffset(-1); // the InclusionDirective expects + // a token range. + } + clang::InclusionDirective *ID = + new (*this) clang::InclusionDirective(*this, Kind, FileName, !IsAngled, + (bool)Imported, File, + SourceRange(HashLoc, EndLoc)); + addPreprocessedEntity(ID); +} + +size_t PreprocessingRecord::getTotalMemory() const { + return BumpAlloc.getTotalMemory() + + llvm::capacity_in_bytes(MacroDefinitions) + + llvm::capacity_in_bytes(PreprocessedEntities) + + llvm::capacity_in_bytes(LoadedPreprocessedEntities) + + llvm::capacity_in_bytes(SkippedRanges); +} diff --git a/clang/lib/Lex/Preprocessor.cpp b/clang/lib/Lex/Preprocessor.cpp new file mode 100644 index 000000000000..82007732a9b1 --- /dev/null +++ b/clang/lib/Lex/Preprocessor.cpp @@ -0,0 +1,1401 @@ +//===- Preprocessor.cpp - C Language Family Preprocessor Implementation ---===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the Preprocessor interface. +// +//===----------------------------------------------------------------------===// +// +// Options to support: +// -H - Print the name of each header file used. +// -d[DNI] - Dump various things. +// -fworking-directory - #line's with preprocessor's working dir. 
+// -fpreprocessed +// -dependency-file,-M,-MM,-MF,-MG,-MP,-MT,-MQ,-MD,-MMD +// -W* +// -w +// +// Messages to emit: +// "Multiple include guards may be useful for:\n" +// +//===----------------------------------------------------------------------===// + +#include "clang/Lex/Preprocessor.h" +#include "clang/Basic/FileManager.h" +#include "clang/Basic/FileSystemStatCache.h" +#include "clang/Basic/IdentifierTable.h" +#include "clang/Basic/LLVM.h" +#include "clang/Basic/LangOptions.h" +#include "clang/Basic/Module.h" +#include "clang/Basic/SourceLocation.h" +#include "clang/Basic/SourceManager.h" +#include "clang/Basic/TargetInfo.h" +#include "clang/Lex/CodeCompletionHandler.h" +#include "clang/Lex/ExternalPreprocessorSource.h" +#include "clang/Lex/HeaderSearch.h" +#include "clang/Lex/LexDiagnostic.h" +#include "clang/Lex/Lexer.h" +#include "clang/Lex/LiteralSupport.h" +#include "clang/Lex/MacroArgs.h" +#include "clang/Lex/MacroInfo.h" +#include "clang/Lex/ModuleLoader.h" +#include "clang/Lex/Pragma.h" +#include "clang/Lex/PreprocessingRecord.h" +#include "clang/Lex/PreprocessorLexer.h" +#include "clang/Lex/PreprocessorOptions.h" +#include "clang/Lex/ScratchBuffer.h" +#include "clang/Lex/Token.h" +#include "clang/Lex/TokenLexer.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/Support/Capacity.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/raw_ostream.h" +#include <algorithm> +#include <cassert> +#include <memory> +#include <string> +#include <utility> +#include <vector> + +using namespace clang; + +LLVM_INSTANTIATE_REGISTRY(PragmaHandlerRegistry) + +ExternalPreprocessorSource::~ExternalPreprocessorSource() = default; + 
+Preprocessor::Preprocessor(std::shared_ptr<PreprocessorOptions> PPOpts, + DiagnosticsEngine &diags, LangOptions &opts, + SourceManager &SM, HeaderSearch &Headers, + ModuleLoader &TheModuleLoader, + IdentifierInfoLookup *IILookup, bool OwnsHeaders, + TranslationUnitKind TUKind) + : PPOpts(std::move(PPOpts)), Diags(&diags), LangOpts(opts), + FileMgr(Headers.getFileMgr()), SourceMgr(SM), + ScratchBuf(new ScratchBuffer(SourceMgr)), HeaderInfo(Headers), + TheModuleLoader(TheModuleLoader), ExternalSource(nullptr), + // As the language options may have not been loaded yet (when + // deserializing an ASTUnit), adding keywords to the identifier table is + // deferred to Preprocessor::Initialize(). + Identifiers(IILookup), PragmaHandlers(new PragmaNamespace(StringRef())), + TUKind(TUKind), SkipMainFilePreamble(0, true), + CurSubmoduleState(&NullSubmoduleState) { + OwnsHeaderSearch = OwnsHeaders; + + // Default to discarding comments. + KeepComments = false; + KeepMacroComments = false; + SuppressIncludeNotFoundError = false; + + // Macro expansion is enabled. + DisableMacroExpansion = false; + MacroExpansionInDirectivesOverride = false; + InMacroArgs = false; + ArgMacro = nullptr; + InMacroArgPreExpansion = false; + NumCachedTokenLexers = 0; + PragmasEnabled = true; + ParsingIfOrElifDirective = false; + PreprocessedOutput = false; + + // We haven't read anything from the external source. + ReadMacrosFromExternalSource = false; + + // "Poison" __VA_ARGS__, __VA_OPT__ which can only appear in the expansion of + // a macro. They get unpoisoned where it is allowed. + (Ident__VA_ARGS__ = getIdentifierInfo("__VA_ARGS__"))->setIsPoisoned(); + SetPoisonReason(Ident__VA_ARGS__,diag::ext_pp_bad_vaargs_use); + if (getLangOpts().CPlusPlus2a) { + (Ident__VA_OPT__ = getIdentifierInfo("__VA_OPT__"))->setIsPoisoned(); + SetPoisonReason(Ident__VA_OPT__,diag::ext_pp_bad_vaopt_use); + } else { + Ident__VA_OPT__ = nullptr; + } + + // Initialize the pragma handlers. 
+ RegisterBuiltinPragmas(); + + // Initialize builtin macros like __LINE__ and friends. + RegisterBuiltinMacros(); + + if(LangOpts.Borland) { + Ident__exception_info = getIdentifierInfo("_exception_info"); + Ident___exception_info = getIdentifierInfo("__exception_info"); + Ident_GetExceptionInfo = getIdentifierInfo("GetExceptionInformation"); + Ident__exception_code = getIdentifierInfo("_exception_code"); + Ident___exception_code = getIdentifierInfo("__exception_code"); + Ident_GetExceptionCode = getIdentifierInfo("GetExceptionCode"); + Ident__abnormal_termination = getIdentifierInfo("_abnormal_termination"); + Ident___abnormal_termination = getIdentifierInfo("__abnormal_termination"); + Ident_AbnormalTermination = getIdentifierInfo("AbnormalTermination"); + } else { + Ident__exception_info = Ident__exception_code = nullptr; + Ident__abnormal_termination = Ident___exception_info = nullptr; + Ident___exception_code = Ident___abnormal_termination = nullptr; + Ident_GetExceptionInfo = Ident_GetExceptionCode = nullptr; + Ident_AbnormalTermination = nullptr; + } + + // If using a PCH where a #pragma hdrstop is expected, start skipping tokens. + if (usingPCHWithPragmaHdrStop()) + SkippingUntilPragmaHdrStop = true; + + // If using a PCH with a through header, start skipping tokens. + if (!this->PPOpts->PCHThroughHeader.empty() && + !this->PPOpts->ImplicitPCHInclude.empty()) + SkippingUntilPCHThroughHeader = true; + + if (this->PPOpts->GeneratePreamble) + PreambleConditionalStack.startRecording(); + + ExcludedConditionalDirectiveSkipMappings = + this->PPOpts->ExcludedConditionalDirectiveSkipMappings; + if (ExcludedConditionalDirectiveSkipMappings) + ExcludedConditionalDirectiveSkipMappings->clear(); +} + +Preprocessor::~Preprocessor() { + assert(BacktrackPositions.empty() && "EnableBacktrack/Backtrack imbalance!"); + + IncludeMacroStack.clear(); + + // Destroy any macro definitions. 
+ while (MacroInfoChain *I = MIChainHead) { + MIChainHead = I->Next; + I->~MacroInfoChain(); + } + + // Free any cached macro expanders. + // This populates MacroArgCache, so all TokenLexers need to be destroyed + // before the code below that frees up the MacroArgCache list. + std::fill(TokenLexerCache, TokenLexerCache + NumCachedTokenLexers, nullptr); + CurTokenLexer.reset(); + + // Free any cached MacroArgs. + for (MacroArgs *ArgList = MacroArgCache; ArgList;) + ArgList = ArgList->deallocate(); + + // Delete the header search info, if we own it. + if (OwnsHeaderSearch) + delete &HeaderInfo; +} + +void Preprocessor::Initialize(const TargetInfo &Target, + const TargetInfo *AuxTarget) { + assert((!this->Target || this->Target == &Target) && + "Invalid override of target information"); + this->Target = &Target; + + assert((!this->AuxTarget || this->AuxTarget == AuxTarget) && + "Invalid override of aux target information."); + this->AuxTarget = AuxTarget; + + // Initialize information about built-ins. + BuiltinInfo.InitializeTarget(Target, AuxTarget); + HeaderInfo.setTarget(Target); + + // Populate the identifier table with info about keywords for the current language. 
+ Identifiers.AddKeywords(LangOpts); +} + +void Preprocessor::InitializeForModelFile() { + NumEnteredSourceFiles = 0; + + // Reset pragmas + PragmaHandlersBackup = std::move(PragmaHandlers); + PragmaHandlers = std::make_unique<PragmaNamespace>(StringRef()); + RegisterBuiltinPragmas(); + + // Reset PredefinesFileID + PredefinesFileID = FileID(); +} + +void Preprocessor::FinalizeForModelFile() { + NumEnteredSourceFiles = 1; + + PragmaHandlers = std::move(PragmaHandlersBackup); +} + +void Preprocessor::DumpToken(const Token &Tok, bool DumpFlags) const { + llvm::errs() << tok::getTokenName(Tok.getKind()) << " '" + << getSpelling(Tok) << "'"; + + if (!DumpFlags) return; + + llvm::errs() << "\t"; + if (Tok.isAtStartOfLine()) + llvm::errs() << " [StartOfLine]"; + if (Tok.hasLeadingSpace()) + llvm::errs() << " [LeadingSpace]"; + if (Tok.isExpandDisabled()) + llvm::errs() << " [ExpandDisabled]"; + if (Tok.needsCleaning()) { + const char *Start = SourceMgr.getCharacterData(Tok.getLocation()); + llvm::errs() << " [UnClean='" << StringRef(Start, Tok.getLength()) + << "']"; + } + + llvm::errs() << "\tLoc=<"; + DumpLocation(Tok.getLocation()); + llvm::errs() << ">"; +} + +void Preprocessor::DumpLocation(SourceLocation Loc) const { + Loc.print(llvm::errs(), SourceMgr); +} + +void Preprocessor::DumpMacro(const MacroInfo &MI) const { + llvm::errs() << "MACRO: "; + for (unsigned i = 0, e = MI.getNumTokens(); i != e; ++i) { + DumpToken(MI.getReplacementToken(i)); + llvm::errs() << " "; + } + llvm::errs() << "\n"; +} + +void Preprocessor::PrintStats() { + llvm::errs() << "\n*** Preprocessor Stats:\n"; + llvm::errs() << NumDirectives << " directives found:\n"; + llvm::errs() << " " << NumDefined << " #define.\n"; + llvm::errs() << " " << NumUndefined << " #undef.\n"; + llvm::errs() << " #include/#include_next/#import:\n"; + llvm::errs() << " " << NumEnteredSourceFiles << " source files entered.\n"; + llvm::errs() << " " << MaxIncludeStackDepth << " max include stack depth\n"; + 
llvm::errs() << " " << NumIf << " #if/#ifndef/#ifdef.\n"; + llvm::errs() << " " << NumElse << " #else/#elif.\n"; + llvm::errs() << " " << NumEndif << " #endif.\n"; + llvm::errs() << " " << NumPragma << " #pragma.\n"; + llvm::errs() << NumSkipped << " #if/#ifndef#ifdef regions skipped\n"; + + llvm::errs() << NumMacroExpanded << "/" << NumFnMacroExpanded << "/" + << NumBuiltinMacroExpanded << " obj/fn/builtin macros expanded, " + << NumFastMacroExpanded << " on the fast path.\n"; + llvm::errs() << (NumFastTokenPaste+NumTokenPaste) + << " token paste (##) operations performed, " + << NumFastTokenPaste << " on the fast path.\n"; + + llvm::errs() << "\nPreprocessor Memory: " << getTotalMemory() << "B total"; + + llvm::errs() << "\n BumpPtr: " << BP.getTotalMemory(); + llvm::errs() << "\n Macro Expanded Tokens: " + << llvm::capacity_in_bytes(MacroExpandedTokens); + llvm::errs() << "\n Predefines Buffer: " << Predefines.capacity(); + // FIXME: List information for all submodules. + llvm::errs() << "\n Macros: " + << llvm::capacity_in_bytes(CurSubmoduleState->Macros); + llvm::errs() << "\n #pragma push_macro Info: " + << llvm::capacity_in_bytes(PragmaPushMacroInfo); + llvm::errs() << "\n Poison Reasons: " + << llvm::capacity_in_bytes(PoisonReasons); + llvm::errs() << "\n Comment Handlers: " + << llvm::capacity_in_bytes(CommentHandlers) << "\n"; +} + +Preprocessor::macro_iterator +Preprocessor::macro_begin(bool IncludeExternalMacros) const { + if (IncludeExternalMacros && ExternalSource && + !ReadMacrosFromExternalSource) { + ReadMacrosFromExternalSource = true; + ExternalSource->ReadDefinedMacros(); + } + + // Make sure we cover all macros in visible modules. 
+ for (const ModuleMacro &Macro : ModuleMacros) + CurSubmoduleState->Macros.insert(std::make_pair(Macro.II, MacroState())); + + return CurSubmoduleState->Macros.begin(); +} + +size_t Preprocessor::getTotalMemory() const { + return BP.getTotalMemory() + + llvm::capacity_in_bytes(MacroExpandedTokens) + + Predefines.capacity() /* Predefines buffer. */ + // FIXME: Include sizes from all submodules, and include MacroInfo sizes, + // and ModuleMacros. + + llvm::capacity_in_bytes(CurSubmoduleState->Macros) + + llvm::capacity_in_bytes(PragmaPushMacroInfo) + + llvm::capacity_in_bytes(PoisonReasons) + + llvm::capacity_in_bytes(CommentHandlers); +} + +Preprocessor::macro_iterator +Preprocessor::macro_end(bool IncludeExternalMacros) const { + if (IncludeExternalMacros && ExternalSource && + !ReadMacrosFromExternalSource) { + ReadMacrosFromExternalSource = true; + ExternalSource->ReadDefinedMacros(); + } + + return CurSubmoduleState->Macros.end(); +} + +/// Compares macro tokens with a specified token value sequence. +static bool MacroDefinitionEquals(const MacroInfo *MI, + ArrayRef<TokenValue> Tokens) { + return Tokens.size() == MI->getNumTokens() && + std::equal(Tokens.begin(), Tokens.end(), MI->tokens_begin()); +} + +StringRef Preprocessor::getLastMacroWithSpelling( + SourceLocation Loc, + ArrayRef<TokenValue> Tokens) const { + SourceLocation BestLocation; + StringRef BestSpelling; + for (Preprocessor::macro_iterator I = macro_begin(), E = macro_end(); + I != E; ++I) { + const MacroDirective::DefInfo + Def = I->second.findDirectiveAtLoc(Loc, SourceMgr); + if (!Def || !Def.getMacroInfo()) + continue; + if (!Def.getMacroInfo()->isObjectLike()) + continue; + if (!MacroDefinitionEquals(Def.getMacroInfo(), Tokens)) + continue; + SourceLocation Location = Def.getLocation(); + // Choose the macro defined latest. 
+ if (BestLocation.isInvalid() || + (Location.isValid() && + SourceMgr.isBeforeInTranslationUnit(BestLocation, Location))) { + BestLocation = Location; + BestSpelling = I->first->getName(); + } + } + return BestSpelling; +} + +void Preprocessor::recomputeCurLexerKind() { + if (CurLexer) + CurLexerKind = CLK_Lexer; + else if (CurTokenLexer) + CurLexerKind = CLK_TokenLexer; + else + CurLexerKind = CLK_CachingLexer; +} + +bool Preprocessor::SetCodeCompletionPoint(const FileEntry *File, + unsigned CompleteLine, + unsigned CompleteColumn) { + assert(File); + assert(CompleteLine && CompleteColumn && "Starts from 1:1"); + assert(!CodeCompletionFile && "Already set"); + + using llvm::MemoryBuffer; + + // Load the actual file's contents. + bool Invalid = false; + const MemoryBuffer *Buffer = SourceMgr.getMemoryBufferForFile(File, &Invalid); + if (Invalid) + return true; + + // Find the byte position of the truncation point. + const char *Position = Buffer->getBufferStart(); + for (unsigned Line = 1; Line < CompleteLine; ++Line) { + for (; *Position; ++Position) { + if (*Position != '\r' && *Position != '\n') + continue; + + // Eat \r\n or \n\r as a single line. + if ((Position[1] == '\r' || Position[1] == '\n') && + Position[0] != Position[1]) + ++Position; + ++Position; + break; + } + } + + Position += CompleteColumn - 1; + + // If pointing inside the preamble, adjust the position at the beginning of + // the file after the preamble. 
+ if (SkipMainFilePreamble.first && + SourceMgr.getFileEntryForID(SourceMgr.getMainFileID()) == File) { + if (Position - Buffer->getBufferStart() < SkipMainFilePreamble.first) + Position = Buffer->getBufferStart() + SkipMainFilePreamble.first; + } + + if (Position > Buffer->getBufferEnd()) + Position = Buffer->getBufferEnd(); + + CodeCompletionFile = File; + CodeCompletionOffset = Position - Buffer->getBufferStart(); + + auto NewBuffer = llvm::WritableMemoryBuffer::getNewUninitMemBuffer( + Buffer->getBufferSize() + 1, Buffer->getBufferIdentifier()); + char *NewBuf = NewBuffer->getBufferStart(); + char *NewPos = std::copy(Buffer->getBufferStart(), Position, NewBuf); + *NewPos = '\0'; + std::copy(Position, Buffer->getBufferEnd(), NewPos+1); + SourceMgr.overrideFileContents(File, std::move(NewBuffer)); + + return false; +} + +void Preprocessor::CodeCompleteIncludedFile(llvm::StringRef Dir, + bool IsAngled) { + if (CodeComplete) + CodeComplete->CodeCompleteIncludedFile(Dir, IsAngled); + setCodeCompletionReached(); +} + +void Preprocessor::CodeCompleteNaturalLanguage() { + if (CodeComplete) + CodeComplete->CodeCompleteNaturalLanguage(); + setCodeCompletionReached(); +} + +/// getSpelling - This method is used to get the spelling of a token into a +/// SmallVector. Note that the returned StringRef may not point to the +/// supplied buffer if a copy can be avoided. +StringRef Preprocessor::getSpelling(const Token &Tok, + SmallVectorImpl<char> &Buffer, + bool *Invalid) const { + // NOTE: this has to be checked *before* testing for an IdentifierInfo. + if (Tok.isNot(tok::raw_identifier) && !Tok.hasUCN()) { + // Try the fast path. + if (const IdentifierInfo *II = Tok.getIdentifierInfo()) + return II->getName(); + } + + // Resize the buffer if we need to copy into it. 
+ if (Tok.needsCleaning()) + Buffer.resize(Tok.getLength()); + + const char *Ptr = Buffer.data(); + unsigned Len = getSpelling(Tok, Ptr, Invalid); + return StringRef(Ptr, Len); +} + +/// CreateString - Plop the specified string into a scratch buffer and return a +/// location for it. If specified, the source location provides a source +/// location for the token. +void Preprocessor::CreateString(StringRef Str, Token &Tok, + SourceLocation ExpansionLocStart, + SourceLocation ExpansionLocEnd) { + Tok.setLength(Str.size()); + + const char *DestPtr; + SourceLocation Loc = ScratchBuf->getToken(Str.data(), Str.size(), DestPtr); + + if (ExpansionLocStart.isValid()) + Loc = SourceMgr.createExpansionLoc(Loc, ExpansionLocStart, + ExpansionLocEnd, Str.size()); + Tok.setLocation(Loc); + + // If this is a raw identifier or a literal token, set the pointer data. + if (Tok.is(tok::raw_identifier)) + Tok.setRawIdentifierData(DestPtr); + else if (Tok.isLiteral()) + Tok.setLiteralData(DestPtr); +} + +SourceLocation Preprocessor::SplitToken(SourceLocation Loc, unsigned Length) { + auto &SM = getSourceManager(); + SourceLocation SpellingLoc = SM.getSpellingLoc(Loc); + std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(SpellingLoc); + bool Invalid = false; + StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid); + if (Invalid) + return SourceLocation(); + + // FIXME: We could consider re-using spelling for tokens we see repeatedly. 
+ const char *DestPtr; + SourceLocation Spelling = + ScratchBuf->getToken(Buffer.data() + LocInfo.second, Length, DestPtr); + return SM.createTokenSplitLoc(Spelling, Loc, Loc.getLocWithOffset(Length)); +} + +Module *Preprocessor::getCurrentModule() { + if (!getLangOpts().isCompilingModule()) + return nullptr; + + return getHeaderSearchInfo().lookupModule(getLangOpts().CurrentModule); +} + +//===----------------------------------------------------------------------===// +// Preprocessor Initialization Methods +//===----------------------------------------------------------------------===// + +/// EnterMainSourceFile - Enter the specified FileID as the main source file, +/// which implicitly adds the builtin defines etc. +void Preprocessor::EnterMainSourceFile() { + // We do not allow the preprocessor to reenter the main file. Doing so will + // cause FileID's to accumulate information from both runs (e.g. #line + // information) and predefined macros aren't guaranteed to be set properly. + assert(NumEnteredSourceFiles == 0 && "Cannot reenter the main file!"); + FileID MainFileID = SourceMgr.getMainFileID(); + + // If MainFileID is loaded it means we loaded an AST file, no need to enter + // a main file. + if (!SourceMgr.isLoadedFileID(MainFileID)) { + // Enter the main file source buffer. + EnterSourceFile(MainFileID, nullptr, SourceLocation()); + + // If we've been asked to skip bytes in the main file (e.g., as part of a + // precompiled preamble), do so now. + if (SkipMainFilePreamble.first > 0) + CurLexer->SetByteOffset(SkipMainFilePreamble.first, + SkipMainFilePreamble.second); + + // Tell the header info that the main file was entered. If the file is later + // #imported, it won't be re-entered. + if (const FileEntry *FE = SourceMgr.getFileEntryForID(MainFileID)) + HeaderInfo.IncrementIncludeCount(FE); + } + + // Preprocess Predefines to populate the initial preprocessor state. 
+ std::unique_ptr<llvm::MemoryBuffer> SB = + llvm::MemoryBuffer::getMemBufferCopy(Predefines, "<built-in>"); + assert(SB && "Cannot create predefined source buffer"); + FileID FID = SourceMgr.createFileID(std::move(SB)); + assert(FID.isValid() && "Could not create FileID for predefines?"); + setPredefinesFileID(FID); + + // Start parsing the predefines. + EnterSourceFile(FID, nullptr, SourceLocation()); + + if (!PPOpts->PCHThroughHeader.empty()) { + // Lookup and save the FileID for the through header. If it isn't found + // in the search path, it's a fatal error. + const DirectoryLookup *CurDir; + Optional<FileEntryRef> File = LookupFile( + SourceLocation(), PPOpts->PCHThroughHeader, + /*isAngled=*/false, /*FromDir=*/nullptr, /*FromFile=*/nullptr, CurDir, + /*SearchPath=*/nullptr, /*RelativePath=*/nullptr, + /*SuggestedModule=*/nullptr, /*IsMapped=*/nullptr, + /*IsFrameworkFound=*/nullptr); + if (!File) { + Diag(SourceLocation(), diag::err_pp_through_header_not_found) + << PPOpts->PCHThroughHeader; + return; + } + setPCHThroughHeaderFileID( + SourceMgr.createFileID(*File, SourceLocation(), SrcMgr::C_User)); + } + + // Skip tokens from the Predefines and if needed the main file. 
+ if ((usingPCHWithThroughHeader() && SkippingUntilPCHThroughHeader) || + (usingPCHWithPragmaHdrStop() && SkippingUntilPragmaHdrStop)) + SkipTokensWhileUsingPCH(); +} + +void Preprocessor::setPCHThroughHeaderFileID(FileID FID) { + assert(PCHThroughHeaderFileID.isInvalid() && + "PCHThroughHeaderFileID already set!"); + PCHThroughHeaderFileID = FID; +} + +bool Preprocessor::isPCHThroughHeader(const FileEntry *FE) { + assert(PCHThroughHeaderFileID.isValid() && + "Invalid PCH through header FileID"); + return FE == SourceMgr.getFileEntryForID(PCHThroughHeaderFileID); +} + +bool Preprocessor::creatingPCHWithThroughHeader() { + return TUKind == TU_Prefix && !PPOpts->PCHThroughHeader.empty() && + PCHThroughHeaderFileID.isValid(); +} + +bool Preprocessor::usingPCHWithThroughHeader() { + return TUKind != TU_Prefix && !PPOpts->PCHThroughHeader.empty() && + PCHThroughHeaderFileID.isValid(); +} + +bool Preprocessor::creatingPCHWithPragmaHdrStop() { + return TUKind == TU_Prefix && PPOpts->PCHWithHdrStop; +} + +bool Preprocessor::usingPCHWithPragmaHdrStop() { + return TUKind != TU_Prefix && PPOpts->PCHWithHdrStop; +} + +/// Skip tokens until after the #include of the through header or +/// until after a #pragma hdrstop is seen. Tokens in the predefines file +/// and the main file may be skipped. If the end of the predefines file +/// is reached, skipping continues into the main file. If the end of the +/// main file is reached, it's a fatal error. 
void Preprocessor::SkipTokensWhileUsingPCH() {
  // Set if we run off the end of the main file before the through header /
  // hdrstop is found; diagnosed below.
  bool ReachedMainFileEOF = false;
  // Snapshot which skipping mode(s) were active on entry; the directive
  // handlers clear the Skipping* members once the target is seen, and we
  // detect that transition below to stop skipping.
  bool UsingPCHThroughHeader = SkippingUntilPCHThroughHeader;
  bool UsingPragmaHdrStop = SkippingUntilPragmaHdrStop;
  Token Tok;
  while (true) {
    // An eof while still inside the predefines buffer just moves lexing on
    // into the main file, so only an eof outside the predefines ends the loop.
    bool InPredefines =
        (CurLexer && CurLexer->getFileID() == getPredefinesFileID());
    // Lex through whichever kind of lexer is current, discarding tokens.
    switch (CurLexerKind) {
    case CLK_Lexer:
      CurLexer->Lex(Tok);
      break;
    case CLK_TokenLexer:
      CurTokenLexer->Lex(Tok);
      break;
    case CLK_CachingLexer:
      CachingLex(Tok);
      break;
    case CLK_LexAfterModuleImport:
      LexAfterModuleImport(Tok);
      break;
    }
    if (Tok.is(tok::eof) && !InPredefines) {
      ReachedMainFileEOF = true;
      break;
    }
    // Stop once the corresponding Skipping* flag has been cleared by the
    // #include / #pragma hdrstop handler.
    if (UsingPCHThroughHeader && !SkippingUntilPCHThroughHeader)
      break;
    if (UsingPragmaHdrStop && !SkippingUntilPragmaHdrStop)
      break;
  }
  // Hitting main-file EOF without seeing the marker is an error, except when
  // creating (not using) a PCH with #pragma hdrstop.
  if (ReachedMainFileEOF) {
    if (UsingPCHThroughHeader)
      Diag(SourceLocation(), diag::err_pp_through_header_not_seen)
          << PPOpts->PCHThroughHeader << 1;
    else if (!PPOpts->PCHWithHdrStopCreate)
      Diag(SourceLocation(), diag::err_pp_pragma_hdrstop_not_seen);
  }
}

void Preprocessor::replayPreambleConditionalStack() {
  // Restore the conditional stack from the preamble, if there is one.
  if (PreambleConditionalStack.isReplaying()) {
    assert(CurPPLexer &&
           "CurPPLexer is null when calling replayPreambleConditionalStack.");
    CurPPLexer->setConditionalLevels(PreambleConditionalStack.getStack());
    PreambleConditionalStack.doneReplaying();
    // If the preamble ended while skipping an excluded #if region, resume
    // skipping from the recorded directive locations.
    if (PreambleConditionalStack.reachedEOFWhileSkipping())
      SkipExcludedConditionalBlock(
          PreambleConditionalStack.SkipInfo->HashTokenLoc,
          PreambleConditionalStack.SkipInfo->IfTokenLoc,
          PreambleConditionalStack.SkipInfo->FoundNonSkipPortion,
          PreambleConditionalStack.SkipInfo->FoundElse,
          PreambleConditionalStack.SkipInfo->ElseLoc);
  }
}

void Preprocessor::EndSourceFile() {
  // Notify the client that we reached the end of the source file.
+ if (Callbacks) + Callbacks->EndOfMainFile(); +} + +//===----------------------------------------------------------------------===// +// Lexer Event Handling. +//===----------------------------------------------------------------------===// + +/// LookUpIdentifierInfo - Given a tok::raw_identifier token, look up the +/// identifier information for the token and install it into the token, +/// updating the token kind accordingly. +IdentifierInfo *Preprocessor::LookUpIdentifierInfo(Token &Identifier) const { + assert(!Identifier.getRawIdentifier().empty() && "No raw identifier data!"); + + // Look up this token, see if it is a macro, or if it is a language keyword. + IdentifierInfo *II; + if (!Identifier.needsCleaning() && !Identifier.hasUCN()) { + // No cleaning needed, just use the characters from the lexed buffer. + II = getIdentifierInfo(Identifier.getRawIdentifier()); + } else { + // Cleaning needed, alloca a buffer, clean into it, then use the buffer. + SmallString<64> IdentifierBuffer; + StringRef CleanedStr = getSpelling(Identifier, IdentifierBuffer); + + if (Identifier.hasUCN()) { + SmallString<64> UCNIdentifierBuffer; + expandUCNs(UCNIdentifierBuffer, CleanedStr); + II = getIdentifierInfo(UCNIdentifierBuffer); + } else { + II = getIdentifierInfo(CleanedStr); + } + } + + // Update the token info (identifier info and appropriate token kind). 
+ Identifier.setIdentifierInfo(II); + if (getLangOpts().MSVCCompat && II->isCPlusPlusOperatorKeyword() && + getSourceManager().isInSystemHeader(Identifier.getLocation())) + Identifier.setKind(tok::identifier); + else + Identifier.setKind(II->getTokenID()); + + return II; +} + +void Preprocessor::SetPoisonReason(IdentifierInfo *II, unsigned DiagID) { + PoisonReasons[II] = DiagID; +} + +void Preprocessor::PoisonSEHIdentifiers(bool Poison) { + assert(Ident__exception_code && Ident__exception_info); + assert(Ident___exception_code && Ident___exception_info); + Ident__exception_code->setIsPoisoned(Poison); + Ident___exception_code->setIsPoisoned(Poison); + Ident_GetExceptionCode->setIsPoisoned(Poison); + Ident__exception_info->setIsPoisoned(Poison); + Ident___exception_info->setIsPoisoned(Poison); + Ident_GetExceptionInfo->setIsPoisoned(Poison); + Ident__abnormal_termination->setIsPoisoned(Poison); + Ident___abnormal_termination->setIsPoisoned(Poison); + Ident_AbnormalTermination->setIsPoisoned(Poison); +} + +void Preprocessor::HandlePoisonedIdentifier(Token & Identifier) { + assert(Identifier.getIdentifierInfo() && + "Can't handle identifiers without identifier info!"); + llvm::DenseMap<IdentifierInfo*,unsigned>::const_iterator it = + PoisonReasons.find(Identifier.getIdentifierInfo()); + if(it == PoisonReasons.end()) + Diag(Identifier, diag::err_pp_used_poisoned_id); + else + Diag(Identifier,it->second) << Identifier.getIdentifierInfo(); +} + +/// Returns a diagnostic message kind for reporting a future keyword as +/// appropriate for the identifier and specified language. 
+static diag::kind getFutureCompatDiagKind(const IdentifierInfo &II, + const LangOptions &LangOpts) { + assert(II.isFutureCompatKeyword() && "diagnostic should not be needed"); + + if (LangOpts.CPlusPlus) + return llvm::StringSwitch<diag::kind>(II.getName()) +#define CXX11_KEYWORD(NAME, FLAGS) \ + .Case(#NAME, diag::warn_cxx11_keyword) +#define CXX2A_KEYWORD(NAME, FLAGS) \ + .Case(#NAME, diag::warn_cxx2a_keyword) +#include "clang/Basic/TokenKinds.def" + ; + + llvm_unreachable( + "Keyword not known to come from a newer Standard or proposed Standard"); +} + +void Preprocessor::updateOutOfDateIdentifier(IdentifierInfo &II) const { + assert(II.isOutOfDate() && "not out of date"); + getExternalSource()->updateOutOfDateIdentifier(II); +} + +/// HandleIdentifier - This callback is invoked when the lexer reads an +/// identifier. This callback looks up the identifier in the map and/or +/// potentially macro expands it or turns it into a named token (like 'for'). +/// +/// Note that callers of this method are guarded by checking the +/// IdentifierInfo's 'isHandleIdentifierCase' bit. If this method changes, the +/// IdentifierInfo methods that compute these properties will need to change to +/// match. +bool Preprocessor::HandleIdentifier(Token &Identifier) { + assert(Identifier.getIdentifierInfo() && + "Can't handle identifiers without identifier info!"); + + IdentifierInfo &II = *Identifier.getIdentifierInfo(); + + // If the information about this identifier is out of date, update it from + // the external source. + // We have to treat __VA_ARGS__ in a special way, since it gets + // serialized with isPoisoned = true, but our preprocessor may have + // unpoisoned it if we're defining a C99 macro. 
+ if (II.isOutOfDate()) { + bool CurrentIsPoisoned = false; + const bool IsSpecialVariadicMacro = + &II == Ident__VA_ARGS__ || &II == Ident__VA_OPT__; + if (IsSpecialVariadicMacro) + CurrentIsPoisoned = II.isPoisoned(); + + updateOutOfDateIdentifier(II); + Identifier.setKind(II.getTokenID()); + + if (IsSpecialVariadicMacro) + II.setIsPoisoned(CurrentIsPoisoned); + } + + // If this identifier was poisoned, and if it was not produced from a macro + // expansion, emit an error. + if (II.isPoisoned() && CurPPLexer) { + HandlePoisonedIdentifier(Identifier); + } + + // If this is a macro to be expanded, do it. + if (MacroDefinition MD = getMacroDefinition(&II)) { + auto *MI = MD.getMacroInfo(); + assert(MI && "macro definition with no macro info?"); + if (!DisableMacroExpansion) { + if (!Identifier.isExpandDisabled() && MI->isEnabled()) { + // C99 6.10.3p10: If the preprocessing token immediately after the + // macro name isn't a '(', this macro should not be expanded. + if (!MI->isFunctionLike() || isNextPPTokenLParen()) + return HandleMacroExpandedIdentifier(Identifier, MD); + } else { + // C99 6.10.3.4p2 says that a disabled macro may never again be + // expanded, even if it's in a context where it could be expanded in the + // future. + Identifier.setFlag(Token::DisableExpand); + if (MI->isObjectLike() || isNextPPTokenLParen()) + Diag(Identifier, diag::pp_disabled_macro_expansion); + } + } + } + + // If this identifier is a keyword in a newer Standard or proposed Standard, + // produce a warning. Don't warn if we're not considering macro expansion, + // since this identifier might be the name of a macro. + // FIXME: This warning is disabled in cases where it shouldn't be, like + // "#define constexpr constexpr", "int constexpr;" + if (II.isFutureCompatKeyword() && !DisableMacroExpansion) { + Diag(Identifier, getFutureCompatDiagKind(II, getLangOpts())) + << II.getName(); + // Don't diagnose this keyword again in this translation unit. 
+ II.setIsFutureCompatKeyword(false); + } + + // If this is an extension token, diagnose its use. + // We avoid diagnosing tokens that originate from macro definitions. + // FIXME: This warning is disabled in cases where it shouldn't be, + // like "#define TY typeof", "TY(1) x". + if (II.isExtensionToken() && !DisableMacroExpansion) + Diag(Identifier, diag::ext_token_used); + + // If this is the 'import' contextual keyword following an '@', note + // that the next token indicates a module name. + // + // Note that we do not treat 'import' as a contextual + // keyword when we're in a caching lexer, because caching lexers only get + // used in contexts where import declarations are disallowed. + // + // Likewise if this is the C++ Modules TS import keyword. + if (((LastTokenWasAt && II.isModulesImport()) || + Identifier.is(tok::kw_import)) && + !InMacroArgs && !DisableMacroExpansion && + (getLangOpts().Modules || getLangOpts().DebuggerSupport) && + CurLexerKind != CLK_CachingLexer) { + ModuleImportLoc = Identifier.getLocation(); + ModuleImportPath.clear(); + ModuleImportExpectsIdentifier = true; + CurLexerKind = CLK_LexAfterModuleImport; + } + return true; +} + +void Preprocessor::Lex(Token &Result) { + ++LexLevel; + + // We loop here until a lex function returns a token; this avoids recursion. + bool ReturnedToken; + do { + switch (CurLexerKind) { + case CLK_Lexer: + ReturnedToken = CurLexer->Lex(Result); + break; + case CLK_TokenLexer: + ReturnedToken = CurTokenLexer->Lex(Result); + break; + case CLK_CachingLexer: + CachingLex(Result); + ReturnedToken = true; + break; + case CLK_LexAfterModuleImport: + ReturnedToken = LexAfterModuleImport(Result); + break; + } + } while (!ReturnedToken); + + if (Result.is(tok::code_completion) && Result.getIdentifierInfo()) { + // Remember the identifier before code completion token. 
    setCodeCompletionIdentifierInfo(Result.getIdentifierInfo());
    setCodeCompletionTokenRange(Result.getLocation(), Result.getEndLoc());
    // Set IdentifierInfo to null to avoid confusing code that handles both
    // identifiers and completion tokens.
    Result.setIdentifierInfo(nullptr);
  }

  // Update ImportSeqState to track our position within a C++20 import-seq
  // if this token is being produced as a result of phase 4 of translation.
  // Only top-level (LexLevel == 1), non-reinjected tokens participate.
  if (getLangOpts().CPlusPlusModules && LexLevel == 1 &&
      !Result.getFlag(Token::IsReinjected)) {
    switch (Result.getKind()) {
    case tok::l_paren: case tok::l_square: case tok::l_brace:
      ImportSeqState.handleOpenBracket();
      break;
    case tok::r_paren: case tok::r_square:
      ImportSeqState.handleCloseBracket();
      break;
    case tok::r_brace:
      ImportSeqState.handleCloseBrace();
      break;
    case tok::semi:
      ImportSeqState.handleSemi();
      break;
    case tok::header_name:
    case tok::annot_header_unit:
      ImportSeqState.handleHeaderName();
      break;
    case tok::kw_export:
      ImportSeqState.handleExport();
      break;
    case tok::identifier:
      if (Result.getIdentifierInfo()->isModulesImport()) {
        ImportSeqState.handleImport();
        // If this 'import' completes an import-seq, switch to the special
        // module-import lexing mode for the tokens that follow.
        if (ImportSeqState.afterImportSeq()) {
          ModuleImportLoc = Result.getLocation();
          ModuleImportPath.clear();
          ModuleImportExpectsIdentifier = true;
          CurLexerKind = CLK_LexAfterModuleImport;
        }
        break;
      }
      LLVM_FALLTHROUGH;
    default:
      ImportSeqState.handleMisc();
      break;
    }
  }

  LastTokenWasAt = Result.is(tok::at);
  --LexLevel;
  // Only report top-level, non-reinjected tokens to the external watcher.
  if (OnToken && LexLevel == 0 && !Result.getFlag(Token::IsReinjected))
    OnToken(Result);
}

/// Lex a header-name token (including one formed from header-name-tokens if
/// \p AllowMacroExpansion is \c true).
///
/// \param FilenameTok Filled in with the next token. On success, this will
/// be a header_name token. On failure, it will be whatever other
/// token was found instead.
+/// \param AllowMacroExpansion If \c true, allow the header name to be formed +/// by macro expansion (concatenating tokens as necessary if the first +/// token is a '<'). +/// \return \c true if we reached EOD or EOF while looking for a > token in +/// a concatenated header name and diagnosed it. \c false otherwise. +bool Preprocessor::LexHeaderName(Token &FilenameTok, bool AllowMacroExpansion) { + // Lex using header-name tokenization rules if tokens are being lexed from + // a file. Just grab a token normally if we're in a macro expansion. + if (CurPPLexer) + CurPPLexer->LexIncludeFilename(FilenameTok); + else + Lex(FilenameTok); + + // This could be a <foo/bar.h> file coming from a macro expansion. In this + // case, glue the tokens together into an angle_string_literal token. + SmallString<128> FilenameBuffer; + if (FilenameTok.is(tok::less) && AllowMacroExpansion) { + bool StartOfLine = FilenameTok.isAtStartOfLine(); + bool LeadingSpace = FilenameTok.hasLeadingSpace(); + bool LeadingEmptyMacro = FilenameTok.hasLeadingEmptyMacro(); + + SourceLocation Start = FilenameTok.getLocation(); + SourceLocation End; + FilenameBuffer.push_back('<'); + + // Consume tokens until we find a '>'. + // FIXME: A header-name could be formed starting or ending with an + // alternative token. It's not clear whether that's ill-formed in all + // cases. + while (FilenameTok.isNot(tok::greater)) { + Lex(FilenameTok); + if (FilenameTok.isOneOf(tok::eod, tok::eof)) { + Diag(FilenameTok.getLocation(), diag::err_expected) << tok::greater; + Diag(Start, diag::note_matching) << tok::less; + return true; + } + + End = FilenameTok.getLocation(); + + // FIXME: Provide code completion for #includes. + if (FilenameTok.is(tok::code_completion)) { + setCodeCompletionReached(); + Lex(FilenameTok); + continue; + } + + // Append the spelling of this token to the buffer. If there was a space + // before it, add it now. 
+ if (FilenameTok.hasLeadingSpace()) + FilenameBuffer.push_back(' '); + + // Get the spelling of the token, directly into FilenameBuffer if + // possible. + size_t PreAppendSize = FilenameBuffer.size(); + FilenameBuffer.resize(PreAppendSize + FilenameTok.getLength()); + + const char *BufPtr = &FilenameBuffer[PreAppendSize]; + unsigned ActualLen = getSpelling(FilenameTok, BufPtr); + + // If the token was spelled somewhere else, copy it into FilenameBuffer. + if (BufPtr != &FilenameBuffer[PreAppendSize]) + memcpy(&FilenameBuffer[PreAppendSize], BufPtr, ActualLen); + + // Resize FilenameBuffer to the correct size. + if (FilenameTok.getLength() != ActualLen) + FilenameBuffer.resize(PreAppendSize + ActualLen); + } + + FilenameTok.startToken(); + FilenameTok.setKind(tok::header_name); + FilenameTok.setFlagValue(Token::StartOfLine, StartOfLine); + FilenameTok.setFlagValue(Token::LeadingSpace, LeadingSpace); + FilenameTok.setFlagValue(Token::LeadingEmptyMacro, LeadingEmptyMacro); + CreateString(FilenameBuffer, FilenameTok, Start, End); + } else if (FilenameTok.is(tok::string_literal) && AllowMacroExpansion) { + // Convert a string-literal token of the form " h-char-sequence " + // (produced by macro expansion) into a header-name token. + // + // The rules for header-names don't quite match the rules for + // string-literals, but all the places where they differ result in + // undefined behavior, so we can and do treat them the same. + // + // A string-literal with a prefix or suffix is not translated into a + // header-name. This could theoretically be observable via the C++20 + // context-sensitive header-name formation rules. + StringRef Str = getSpelling(FilenameTok, FilenameBuffer); + if (Str.size() >= 2 && Str.front() == '"' && Str.back() == '"') + FilenameTok.setKind(tok::header_name); + } + + return false; +} + +/// Collect the tokens of a C++20 pp-import-suffix. 
void Preprocessor::CollectPpImportSuffix(SmallVectorImpl<Token> &Toks) {
  // FIXME: For error recovery, consider recognizing attribute syntax here
  // and terminating / diagnosing a missing semicolon if we find anything
  // else? (Can we leave that to the parser?)
  //
  // Accumulate tokens into Toks until we see a ';' (or unbalanced closing
  // bracket, or eof) at bracket depth zero. The terminating token is kept
  // in Toks so the caller can inspect it.
  unsigned BracketDepth = 0;
  while (true) {
    Toks.emplace_back();
    Lex(Toks.back());

    switch (Toks.back().getKind()) {
    case tok::l_paren: case tok::l_square: case tok::l_brace:
      ++BracketDepth;
      break;

    case tok::r_paren: case tok::r_square: case tok::r_brace:
      // An unmatched closer terminates the suffix (it cannot be part of a
      // balanced pp-import-suffix).
      if (BracketDepth == 0)
        return;
      --BracketDepth;
      break;

    case tok::semi:
      // A ';' only terminates the suffix when not nested inside brackets.
      if (BracketDepth == 0)
        return;
      break;

    case tok::eof:
      return;

    default:
      break;
    }
  }
}


/// Lex a token following the 'import' contextual keyword.
///
/// pp-import: [C++20]
/// import header-name pp-import-suffix[opt] ;
/// import header-name-tokens pp-import-suffix[opt] ;
/// [ObjC] @ import module-name ;
/// [Clang] import module-name ;
///
/// header-name-tokens:
/// string-literal
/// < [any sequence of preprocessing-tokens other than >] >
///
/// module-name:
/// module-name-qualifier[opt] identifier
///
/// module-name-qualifier
/// module-name-qualifier[opt] identifier .
///
/// We respond to a pp-import by importing macros from the named module.
bool Preprocessor::LexAfterModuleImport(Token &Result) {
  // Figure out what kind of lexer we actually have.
  recomputeCurLexerKind();

  // Lex the next token. The header-name lexing rules are used at the start of
  // a pp-import.
  //
  // For now, we only support header-name imports in C++20 mode.
  // FIXME: Should we allow this in all language modes that support an import
  // declaration as an extension?
  if (ModuleImportPath.empty() && getLangOpts().CPlusPlusModules) {
    if (LexHeaderName(Result))
      return true;
  } else {
    Lex(Result);
  }

  // Allocate a holding buffer for a sequence of tokens and introduce it into
  // the token stream.
+ auto EnterTokens = [this](ArrayRef<Token> Toks) { + auto ToksCopy = std::make_unique<Token[]>(Toks.size()); + std::copy(Toks.begin(), Toks.end(), ToksCopy.get()); + EnterTokenStream(std::move(ToksCopy), Toks.size(), + /*DisableMacroExpansion*/ true, /*IsReinject*/ false); + }; + + // Check for a header-name. + SmallVector<Token, 32> Suffix; + if (Result.is(tok::header_name)) { + // Enter the header-name token into the token stream; a Lex action cannot + // both return a token and cache tokens (doing so would corrupt the token + // cache if the call to Lex comes from CachingLex / PeekAhead). + Suffix.push_back(Result); + + // Consume the pp-import-suffix and expand any macros in it now. We'll add + // it back into the token stream later. + CollectPpImportSuffix(Suffix); + if (Suffix.back().isNot(tok::semi)) { + // This is not a pp-import after all. + EnterTokens(Suffix); + return false; + } + + // C++2a [cpp.module]p1: + // The ';' preprocessing-token terminating a pp-import shall not have + // been produced by macro replacement. + SourceLocation SemiLoc = Suffix.back().getLocation(); + if (SemiLoc.isMacroID()) + Diag(SemiLoc, diag::err_header_import_semi_in_macro); + + // Reconstitute the import token. + Token ImportTok; + ImportTok.startToken(); + ImportTok.setKind(tok::kw_import); + ImportTok.setLocation(ModuleImportLoc); + ImportTok.setIdentifierInfo(getIdentifierInfo("import")); + ImportTok.setLength(6); + + auto Action = HandleHeaderIncludeOrImport( + /*HashLoc*/ SourceLocation(), ImportTok, Suffix.front(), SemiLoc); + switch (Action.Kind) { + case ImportAction::None: + break; + + case ImportAction::ModuleBegin: + // Let the parser know we're textually entering the module. 
+ Suffix.emplace_back(); + Suffix.back().startToken(); + Suffix.back().setKind(tok::annot_module_begin); + Suffix.back().setLocation(SemiLoc); + Suffix.back().setAnnotationEndLoc(SemiLoc); + Suffix.back().setAnnotationValue(Action.ModuleForHeader); + LLVM_FALLTHROUGH; + + case ImportAction::ModuleImport: + case ImportAction::SkippedModuleImport: + // We chose to import (or textually enter) the file. Convert the + // header-name token into a header unit annotation token. + Suffix[0].setKind(tok::annot_header_unit); + Suffix[0].setAnnotationEndLoc(Suffix[0].getLocation()); + Suffix[0].setAnnotationValue(Action.ModuleForHeader); + // FIXME: Call the moduleImport callback? + break; + } + + EnterTokens(Suffix); + return false; + } + + // The token sequence + // + // import identifier (. identifier)* + // + // indicates a module import directive. We already saw the 'import' + // contextual keyword, so now we're looking for the identifiers. + if (ModuleImportExpectsIdentifier && Result.getKind() == tok::identifier) { + // We expected to see an identifier here, and we did; continue handling + // identifiers. + ModuleImportPath.push_back(std::make_pair(Result.getIdentifierInfo(), + Result.getLocation())); + ModuleImportExpectsIdentifier = false; + CurLexerKind = CLK_LexAfterModuleImport; + return true; + } + + // If we're expecting a '.' or a ';', and we got a '.', then wait until we + // see the next identifier. (We can also see a '[[' that begins an + // attribute-specifier-seq here under the C++ Modules TS.) + if (!ModuleImportExpectsIdentifier && Result.getKind() == tok::period) { + ModuleImportExpectsIdentifier = true; + CurLexerKind = CLK_LexAfterModuleImport; + return true; + } + + // If we didn't recognize a module name at all, this is not a (valid) import. + if (ModuleImportPath.empty() || Result.is(tok::eof)) + return true; + + // Consume the pp-import-suffix and expand any macros in it now, if we're not + // at the semicolon already. 
+ SourceLocation SemiLoc = Result.getLocation(); + if (Result.isNot(tok::semi)) { + Suffix.push_back(Result); + CollectPpImportSuffix(Suffix); + if (Suffix.back().isNot(tok::semi)) { + // This is not an import after all. + EnterTokens(Suffix); + return false; + } + SemiLoc = Suffix.back().getLocation(); + } + + // Under the Modules TS, the dot is just part of the module name, and not + // a real hierarchy separator. Flatten such module names now. + // + // FIXME: Is this the right level to be performing this transformation? + std::string FlatModuleName; + if (getLangOpts().ModulesTS || getLangOpts().CPlusPlusModules) { + for (auto &Piece : ModuleImportPath) { + if (!FlatModuleName.empty()) + FlatModuleName += "."; + FlatModuleName += Piece.first->getName(); + } + SourceLocation FirstPathLoc = ModuleImportPath[0].second; + ModuleImportPath.clear(); + ModuleImportPath.push_back( + std::make_pair(getIdentifierInfo(FlatModuleName), FirstPathLoc)); + } + + Module *Imported = nullptr; + if (getLangOpts().Modules) { + Imported = TheModuleLoader.loadModule(ModuleImportLoc, + ModuleImportPath, + Module::Hidden, + /*IsInclusionDirective=*/false); + if (Imported) + makeModuleVisible(Imported, SemiLoc); + } + if (Callbacks) + Callbacks->moduleImport(ModuleImportLoc, ModuleImportPath, Imported); + + if (!Suffix.empty()) { + EnterTokens(Suffix); + return false; + } + return true; +} + +void Preprocessor::makeModuleVisible(Module *M, SourceLocation Loc) { + CurSubmoduleState->VisibleModules.setVisible( + M, Loc, [](Module *) {}, + [&](ArrayRef<Module *> Path, Module *Conflict, StringRef Message) { + // FIXME: Include the path in the diagnostic. + // FIXME: Include the import location for the conflicting module. + Diag(ModuleImportLoc, diag::warn_module_conflict) + << Path[0]->getFullModuleName() + << Conflict->getFullModuleName() + << Message; + }); + + // Add this module to the imports list of the currently-built submodule. 
+ if (!BuildingSubmoduleStack.empty() && M != BuildingSubmoduleStack.back().M) + BuildingSubmoduleStack.back().M->Imports.insert(M); +} + +bool Preprocessor::FinishLexStringLiteral(Token &Result, std::string &String, + const char *DiagnosticTag, + bool AllowMacroExpansion) { + // We need at least one string literal. + if (Result.isNot(tok::string_literal)) { + Diag(Result, diag::err_expected_string_literal) + << /*Source='in...'*/0 << DiagnosticTag; + return false; + } + + // Lex string literal tokens, optionally with macro expansion. + SmallVector<Token, 4> StrToks; + do { + StrToks.push_back(Result); + + if (Result.hasUDSuffix()) + Diag(Result, diag::err_invalid_string_udl); + + if (AllowMacroExpansion) + Lex(Result); + else + LexUnexpandedToken(Result); + } while (Result.is(tok::string_literal)); + + // Concatenate and parse the strings. + StringLiteralParser Literal(StrToks, *this); + assert(Literal.isAscii() && "Didn't allow wide strings in"); + + if (Literal.hadError) + return false; + + if (Literal.Pascal) { + Diag(StrToks[0].getLocation(), diag::err_expected_string_literal) + << /*Source='in...'*/0 << DiagnosticTag; + return false; + } + + String = Literal.GetString(); + return true; +} + +bool Preprocessor::parseSimpleIntegerLiteral(Token &Tok, uint64_t &Value) { + assert(Tok.is(tok::numeric_constant)); + SmallString<8> IntegerBuffer; + bool NumberInvalid = false; + StringRef Spelling = getSpelling(Tok, IntegerBuffer, &NumberInvalid); + if (NumberInvalid) + return false; + NumericLiteralParser Literal(Spelling, Tok.getLocation(), *this); + if (Literal.hadError || !Literal.isIntegerLiteral() || Literal.hasUDSuffix()) + return false; + llvm::APInt APVal(64, 0); + if (Literal.GetIntegerValue(APVal)) + return false; + Lex(Tok); + Value = APVal.getLimitedValue(); + return true; +} + +void Preprocessor::addCommentHandler(CommentHandler *Handler) { + assert(Handler && "NULL comment handler"); + assert(llvm::find(CommentHandlers, Handler) == CommentHandlers.end() 
&& + "Comment handler already registered"); + CommentHandlers.push_back(Handler); +} + +void Preprocessor::removeCommentHandler(CommentHandler *Handler) { + std::vector<CommentHandler *>::iterator Pos = + llvm::find(CommentHandlers, Handler); + assert(Pos != CommentHandlers.end() && "Comment handler not registered"); + CommentHandlers.erase(Pos); +} + +bool Preprocessor::HandleComment(Token &result, SourceRange Comment) { + bool AnyPendingTokens = false; + for (std::vector<CommentHandler *>::iterator H = CommentHandlers.begin(), + HEnd = CommentHandlers.end(); + H != HEnd; ++H) { + if ((*H)->HandleComment(*this, Comment)) + AnyPendingTokens = true; + } + if (!AnyPendingTokens || getCommentRetentionState()) + return false; + Lex(result); + return true; +} + +ModuleLoader::~ModuleLoader() = default; + +CommentHandler::~CommentHandler() = default; + +CodeCompletionHandler::~CodeCompletionHandler() = default; + +void Preprocessor::createPreprocessingRecord() { + if (Record) + return; + + Record = new PreprocessingRecord(getSourceManager()); + addPPCallbacks(std::unique_ptr<PPCallbacks>(Record)); +} diff --git a/clang/lib/Lex/PreprocessorLexer.cpp b/clang/lib/Lex/PreprocessorLexer.cpp new file mode 100644 index 000000000000..5f6f4a13419b --- /dev/null +++ b/clang/lib/Lex/PreprocessorLexer.cpp @@ -0,0 +1,52 @@ +//===- PreprocessorLexer.cpp - C Language Family Lexer --------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the PreprocessorLexer and Token interfaces. 
//
//===----------------------------------------------------------------------===//

#include "clang/Lex/PreprocessorLexer.h"
#include "clang/Basic/SourceManager.h"
#include "clang/Lex/LexDiagnostic.h"
#include "clang/Lex/Preprocessor.h"
#include "clang/Lex/Token.h"
#include <cassert>

using namespace clang;

// Out-of-line anchor to pin PreprocessorLexer's vtable to this file.
void PreprocessorLexer::anchor() {}

PreprocessorLexer::PreprocessorLexer(Preprocessor *pp, FileID fid)
    : PP(pp), FID(fid) {
  // Record how many local SLocEntries existed when this lexer was created;
  // presumably used to detect entries added while this lexer is active —
  // TODO confirm against the header.
  if (pp)
    InitialNumSLocEntries = pp->getSourceManager().local_sloc_entry_size();
}

/// After the preprocessor has parsed a \#include, lex and
/// (potentially) macro expand the filename. Sets ParsingFilename for the
/// duration so the underlying lexer uses header-name tokenization rules.
void PreprocessorLexer::LexIncludeFilename(Token &FilenameTok) {
  assert(ParsingFilename == false && "reentered LexIncludeFilename");

  // We are now parsing a filename!
  ParsingFilename = true;

  // Lex the filename. In raw mode there is no preprocessor to delegate to,
  // so lex directly through the derived lexer.
  if (LexingRawMode)
    IndirectLex(FilenameTok);
  else
    PP->Lex(FilenameTok);

  // We should have obtained the filename now.
  ParsingFilename = false;
}

/// getFileEntry - Return the FileEntry corresponding to this FileID. Like
/// getFileID(), this only works for lexers with attached preprocessors.
const FileEntry *PreprocessorLexer::getFileEntry() const {
  return PP->getSourceManager().getFileEntryForID(getFileID());
}
diff --git a/clang/lib/Lex/ScratchBuffer.cpp b/clang/lib/Lex/ScratchBuffer.cpp
new file mode 100644
index 000000000000..19ab93ec54b4
--- /dev/null
+++ b/clang/lib/Lex/ScratchBuffer.cpp
@@ -0,0 +1,83 @@
//===--- ScratchBuffer.cpp - Scratch space for forming tokens -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the ScratchBuffer interface.
//
//===----------------------------------------------------------------------===//

#include "clang/Lex/ScratchBuffer.h"
#include "clang/Basic/SourceManager.h"
#include "llvm/Support/MemoryBuffer.h"
#include <cstring>
using namespace clang;

// ScratchBufSize - The size of each chunk of scratch memory. Slightly less
// than a page, almost certainly enough for anything. :)
static const unsigned ScratchBufSize = 4060;

ScratchBuffer::ScratchBuffer(SourceManager &SM)
    : SourceMgr(SM), CurBuffer(nullptr) {
  // Set BytesUsed so that the first call to getToken will require an alloc.
  BytesUsed = ScratchBufSize;
}

/// getToken - Splat the specified text into a temporary MemoryBuffer and
/// return a SourceLocation that refers to the token. On return, \p DestPtr
/// points at the token's stable copy inside the scratch buffer.
SourceLocation ScratchBuffer::getToken(const char *Buf, unsigned Len,
                                       const char *&DestPtr) {
  // Need Len bytes plus two: a leading '\n' and a trailing NUL (see below).
  if (BytesUsed+Len+2 > ScratchBufSize)
    AllocScratchBuffer(Len+2);
  else {
    // Clear out the source line cache if it's already been computed.
    // FIXME: Allow this to be incrementally extended.
    auto *ContentCache = const_cast<SrcMgr::ContentCache *>(
        SourceMgr.getSLocEntry(SourceMgr.getFileID(BufferStartLoc))
            .getFile().getContentCache());
    ContentCache->SourceLineCache = nullptr;
  }

  // Prefix the token with a \n, so that it looks like it is the first thing on
  // its own virtual line in caret diagnostics.
  CurBuffer[BytesUsed++] = '\n';

  // Return a pointer to the character data.
  DestPtr = CurBuffer+BytesUsed;

  // Copy the token data into the buffer.
  memcpy(CurBuffer+BytesUsed, Buf, Len);

  // Remember that we used these bytes.
  BytesUsed += Len+1;

  // Add a NUL terminator to the token. This keeps the tokens separated, in
  // case they get relexed, and puts them on their own virtual lines in case a
  // diagnostic points to one.
+ CurBuffer[BytesUsed-1] = '\0'; + + return BufferStartLoc.getLocWithOffset(BytesUsed-Len-1); +} + +void ScratchBuffer::AllocScratchBuffer(unsigned RequestLen) { + // Only pay attention to the requested length if it is larger than our default + // page size. If it is, we allocate an entire chunk for it. This is to + // support gigantic tokens, which almost certainly won't happen. :) + if (RequestLen < ScratchBufSize) + RequestLen = ScratchBufSize; + + // Get scratch buffer. Zero-initialize it so it can be dumped into a PCH file + // deterministically. + std::unique_ptr<llvm::WritableMemoryBuffer> OwnBuf = + llvm::WritableMemoryBuffer::getNewMemBuffer(RequestLen, + "<scratch space>"); + CurBuffer = OwnBuf->getBufferStart(); + FileID FID = SourceMgr.createFileID(std::move(OwnBuf)); + BufferStartLoc = SourceMgr.getLocForStartOfFile(FID); + BytesUsed = 0; +} diff --git a/clang/lib/Lex/TokenConcatenation.cpp b/clang/lib/Lex/TokenConcatenation.cpp new file mode 100644 index 000000000000..e626cfcc927f --- /dev/null +++ b/clang/lib/Lex/TokenConcatenation.cpp @@ -0,0 +1,297 @@ +//===--- TokenConcatenation.cpp - Token Concatenation Avoidance -----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the TokenConcatenation class. +// +//===----------------------------------------------------------------------===// + +#include "clang/Lex/TokenConcatenation.h" +#include "clang/Basic/CharInfo.h" +#include "clang/Lex/Preprocessor.h" +#include "llvm/Support/ErrorHandling.h" +using namespace clang; + + +/// IsStringPrefix - Return true if Str is a string prefix. +/// 'L', 'u', 'U', or 'u8'. Including raw versions. 
static bool IsStringPrefix(StringRef Str, bool CPlusPlus11) {

  if (Str[0] == 'L' ||
      (CPlusPlus11 && (Str[0] == 'u' || Str[0] == 'U' || Str[0] == 'R'))) {

    if (Str.size() == 1)
      return true; // "L", "u", "U", and "R"

    // Check for raw flavors. Need to make sure the first character wasn't
    // already R. Need CPlusPlus11 check for "LR".
    if (Str[1] == 'R' && Str[0] != 'R' && Str.size() == 2 && CPlusPlus11)
      return true; // "LR", "uR", "UR"

    // Check for "u8" and "u8R"
    if (Str[0] == 'u' && Str[1] == '8') {
      if (Str.size() == 2) return true; // "u8"
      if (Str.size() == 3 && Str[2] == 'R') return true; // "u8R"
    }
  }

  return false;
}

/// IsIdentifierStringPrefix - Return true if the spelling of the token
/// is literally 'L', 'u', 'U', or 'u8'. Including raw versions.
bool TokenConcatenation::IsIdentifierStringPrefix(const Token &Tok) const {
  const LangOptions &LangOpts = PP.getLangOpts();

  // Fast path: spelling is in the source buffer, read it directly.
  if (!Tok.needsCleaning()) {
    if (Tok.getLength() < 1 || Tok.getLength() > 3)
      return false;
    SourceManager &SM = PP.getSourceManager();
    const char *Ptr = SM.getCharacterData(SM.getSpellingLoc(Tok.getLocation()));
    return IsStringPrefix(StringRef(Ptr, Tok.getLength()),
                          LangOpts.CPlusPlus11);
  }

  // Token needs cleaning (e.g. escaped newlines); spell into a local buffer
  // when small enough to avoid a heap allocation.
  if (Tok.getLength() < 256) {
    char Buffer[256];
    const char *TokPtr = Buffer;
    unsigned length = PP.getSpelling(Tok, TokPtr);
    return IsStringPrefix(StringRef(TokPtr, length), LangOpts.CPlusPlus11);
  }

  return IsStringPrefix(StringRef(PP.getSpelling(Tok)), LangOpts.CPlusPlus11);
}

TokenConcatenation::TokenConcatenation(const Preprocessor &pp) : PP(pp) {
  memset(TokenInfo, 0, sizeof(TokenInfo));

  // These tokens have custom code in AvoidConcat.
  TokenInfo[tok::identifier      ] |= aci_custom;
  TokenInfo[tok::numeric_constant] |= aci_custom_firstchar;
  TokenInfo[tok::period          ] |= aci_custom_firstchar;
  TokenInfo[tok::amp             ] |= aci_custom_firstchar;
  TokenInfo[tok::plus            ] |= aci_custom_firstchar;
  TokenInfo[tok::minus           ] |= aci_custom_firstchar;
  TokenInfo[tok::slash           ] |= aci_custom_firstchar;
  TokenInfo[tok::less            ] |= aci_custom_firstchar;
  TokenInfo[tok::greater         ] |= aci_custom_firstchar;
  TokenInfo[tok::pipe            ] |= aci_custom_firstchar;
  TokenInfo[tok::percent         ] |= aci_custom_firstchar;
  TokenInfo[tok::colon           ] |= aci_custom_firstchar;
  TokenInfo[tok::hash            ] |= aci_custom_firstchar;
  TokenInfo[tok::arrow           ] |= aci_custom_firstchar;

  // These tokens have custom code in C++11 mode.
  if (PP.getLangOpts().CPlusPlus11) {
    TokenInfo[tok::string_literal      ] |= aci_custom;
    TokenInfo[tok::wide_string_literal ] |= aci_custom;
    TokenInfo[tok::utf8_string_literal ] |= aci_custom;
    TokenInfo[tok::utf16_string_literal] |= aci_custom;
    TokenInfo[tok::utf32_string_literal] |= aci_custom;
    TokenInfo[tok::char_constant       ] |= aci_custom;
    TokenInfo[tok::wide_char_constant  ] |= aci_custom;
    TokenInfo[tok::utf16_char_constant ] |= aci_custom;
    TokenInfo[tok::utf32_char_constant ] |= aci_custom;
  }

  // These tokens have custom code in C++17 mode.
  if (PP.getLangOpts().CPlusPlus17)
    TokenInfo[tok::utf8_char_constant] |= aci_custom;

  // These tokens have custom code in C++2a mode.
  if (PP.getLangOpts().CPlusPlus2a)
    TokenInfo[tok::lessequal ] |= aci_custom_firstchar;

  // These tokens change behavior if followed by an '='.
  TokenInfo[tok::amp           ] |= aci_avoid_equal; // &=
  TokenInfo[tok::plus          ] |= aci_avoid_equal; // +=
  TokenInfo[tok::minus         ] |= aci_avoid_equal; // -=
  TokenInfo[tok::slash         ] |= aci_avoid_equal; // /=
  TokenInfo[tok::less          ] |= aci_avoid_equal; // <=
  TokenInfo[tok::greater       ] |= aci_avoid_equal; // >=
  TokenInfo[tok::pipe          ] |= aci_avoid_equal; // |=
  TokenInfo[tok::percent       ] |= aci_avoid_equal; // %=
  TokenInfo[tok::star          ] |= aci_avoid_equal; // *=
  TokenInfo[tok::exclaim       ] |= aci_avoid_equal; // !=
  TokenInfo[tok::lessless      ] |= aci_avoid_equal; // <<=
  TokenInfo[tok::greatergreater] |= aci_avoid_equal; // >>=
  TokenInfo[tok::caret         ] |= aci_avoid_equal; // ^=
  TokenInfo[tok::equal         ] |= aci_avoid_equal; // ==
}

/// GetFirstChar - Get the first character of the token \arg Tok,
/// avoiding calls to getSpelling where possible.
static char GetFirstChar(const Preprocessor &PP, const Token &Tok) {
  if (IdentifierInfo *II = Tok.getIdentifierInfo()) {
    // Avoid spelling identifiers, the most common form of token.
    return II->getNameStart()[0];
  } else if (!Tok.needsCleaning()) {
    if (Tok.isLiteral() && Tok.getLiteralData()) {
      return *Tok.getLiteralData();
    } else {
      SourceManager &SM = PP.getSourceManager();
      return *SM.getCharacterData(SM.getSpellingLoc(Tok.getLocation()));
    }
  } else if (Tok.getLength() < 256) {
    char Buffer[256];
    const char *TokPtr = Buffer;
    PP.getSpelling(Tok, TokPtr);
    return TokPtr[0];
  } else {
    return PP.getSpelling(Tok)[0];
  }
}

/// AvoidConcat - If printing PrevTok immediately followed by Tok would cause
/// the two individual tokens to be lexed as a single token, return true
/// (which causes a space to be printed between them).  This allows the output
/// of -E mode to be lexed to the same token stream as lexing the input
/// directly would.
///
/// This code must conservatively return true if it doesn't want to be 100%
/// accurate.  This will cause the output to include extra space characters,
/// but the resulting output won't have incorrect concatenations going on.
/// Examples include "..", which we print with a space between, because we
/// don't want to track enough to tell "x.." from "...".
bool TokenConcatenation::AvoidConcat(const Token &PrevPrevTok,
                                     const Token &PrevTok,
                                     const Token &Tok) const {
  // Conservatively assume that every annotation token that has a printable
  // form requires whitespace.
  if (PrevTok.isAnnotation())
    return true;

  // First, check to see if the tokens were directly adjacent in the original
  // source.  If they were, it must be okay to stick them together: if there
  // were an issue, the tokens would have been lexed differently.
  SourceManager &SM = PP.getSourceManager();
  SourceLocation PrevSpellLoc = SM.getSpellingLoc(PrevTok.getLocation());
  SourceLocation SpellLoc = SM.getSpellingLoc(Tok.getLocation());
  if (PrevSpellLoc.getLocWithOffset(PrevTok.getLength()) == SpellLoc)
    return false;

  tok::TokenKind PrevKind = PrevTok.getKind();
  if (!PrevTok.isAnnotation() && PrevTok.getIdentifierInfo())
    PrevKind = tok::identifier; // Language keyword or named operator.

  // Look up information on when we should avoid concatenation with prevtok.
  unsigned ConcatInfo = TokenInfo[PrevKind];

  // If prevtok never causes a problem for anything after it, return quickly.
  if (ConcatInfo == 0) return false;

  if (ConcatInfo & aci_avoid_equal) {
    // If the next token is '=' or '==', avoid concatenation.
    if (Tok.isOneOf(tok::equal, tok::equalequal))
      return true;
    ConcatInfo &= ~aci_avoid_equal;
  }
  if (Tok.isAnnotation()) {
    // Modules annotation can show up when generated automatically for includes.
    assert(Tok.isOneOf(tok::annot_module_include, tok::annot_module_begin,
                       tok::annot_module_end) &&
           "unexpected annotation in AvoidConcat");
    ConcatInfo = 0;
  }

  if (ConcatInfo == 0)
    return false;

  // Basic algorithm: we look at the first character of the second token, and
  // determine whether it, if appended to the first token, would form (or
  // would contribute) to a larger token if concatenated.
  char FirstChar = 0;
  if (ConcatInfo & aci_custom) {
    // If the token does not need to know the first character, don't get it.
  } else {
    FirstChar = GetFirstChar(PP, Tok);
  }

  switch (PrevKind) {
  default:
    llvm_unreachable("InitAvoidConcatTokenInfo built wrong");

  case tok::raw_identifier:
    llvm_unreachable("tok::raw_identifier in non-raw lexing mode!");

  case tok::string_literal:
  case tok::wide_string_literal:
  case tok::utf8_string_literal:
  case tok::utf16_string_literal:
  case tok::utf32_string_literal:
  case tok::char_constant:
  case tok::wide_char_constant:
  case tok::utf8_char_constant:
  case tok::utf16_char_constant:
  case tok::utf32_char_constant:
    if (!PP.getLangOpts().CPlusPlus11)
      return false;

    // In C++11, a string or character literal followed by an identifier is a
    // single token.
    if (Tok.getIdentifierInfo())
      return true;

    // A ud-suffix is an identifier. If the previous token ends with one, treat
    // it as an identifier.
    if (!PrevTok.hasUDSuffix())
      return false;
    LLVM_FALLTHROUGH;
  case tok::identifier:   // id+id or id+number or id+L"foo".
    // id+'.'... will not append.
    if (Tok.is(tok::numeric_constant))
      return GetFirstChar(PP, Tok) != '.';

    if (Tok.getIdentifierInfo() ||
        Tok.isOneOf(tok::wide_string_literal, tok::utf8_string_literal,
                    tok::utf16_string_literal, tok::utf32_string_literal,
                    tok::wide_char_constant, tok::utf8_char_constant,
                    tok::utf16_char_constant, tok::utf32_char_constant))
      return true;

    // If this isn't identifier + string, we're done.
    if (Tok.isNot(tok::char_constant) && Tok.isNot(tok::string_literal))
      return false;

    // Otherwise, this is a narrow character or string.  If the *identifier*
    // is a literal 'L', 'u8', 'u' or 'U', avoid pasting L "foo" -> L"foo".
    return IsIdentifierStringPrefix(PrevTok);

  case tok::numeric_constant:
    return isPreprocessingNumberBody(FirstChar) ||
           FirstChar == '+' || FirstChar == '-';
  case tok::period:          // ..., .*, .1234
    return (FirstChar == '.' && PrevPrevTok.is(tok::period)) ||
           isDigit(FirstChar) ||
           (PP.getLangOpts().CPlusPlus && FirstChar == '*');
  case tok::amp:             // &&
    return FirstChar == '&';
  case tok::plus:            // ++
    return FirstChar == '+';
  case tok::minus:           // --, ->, ->*
    return FirstChar == '-' || FirstChar == '>';
  case tok::slash:           //, /*, //
    return FirstChar == '*' || FirstChar == '/';
  case tok::less:            // <<, <<=, <:, <%
    return FirstChar == '<' || FirstChar == ':' || FirstChar == '%';
  case tok::greater:         // >>, >>=
    return FirstChar == '>';
  case tok::pipe:            // ||
    return FirstChar == '|';
  case tok::percent:         // %>, %:
    return FirstChar == '>' || FirstChar == ':';
  case tok::colon:           // ::, :>
    return FirstChar == '>' ||
           (PP.getLangOpts().CPlusPlus && FirstChar == ':');
  case tok::hash:            // ##, #@, %:%:
    return FirstChar == '#' || FirstChar == '@' || FirstChar == '%';
  case tok::arrow:           // ->*
    return PP.getLangOpts().CPlusPlus && FirstChar == '*';
  case tok::lessequal:       // <=> (C++2a)
    return PP.getLangOpts().CPlusPlus2a && FirstChar == '>';
  }
}

//===- TokenLexer.cpp - Lex from a token stream ---------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the TokenLexer interface.
//
//===----------------------------------------------------------------------===//

#include "clang/Lex/TokenLexer.h"
#include "clang/Basic/Diagnostic.h"
#include "clang/Basic/IdentifierTable.h"
#include "clang/Basic/LangOptions.h"
#include "clang/Basic/SourceLocation.h"
#include "clang/Basic/SourceManager.h"
#include "clang/Basic/TokenKinds.h"
#include "clang/Lex/LexDiagnostic.h"
#include "clang/Lex/Lexer.h"
#include "clang/Lex/MacroArgs.h"
#include "clang/Lex/MacroInfo.h"
#include "clang/Lex/Preprocessor.h"
#include "clang/Lex/Token.h"
#include "clang/Lex/VariadicMacroSupport.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/iterator_range.h"
#include <cassert>
#include <cstring>

using namespace clang;

/// Create a TokenLexer for the specified macro with the specified actual
/// arguments.  Note that this ctor takes ownership of the ActualArgs pointer.
void TokenLexer::Init(Token &Tok, SourceLocation ELEnd, MacroInfo *MI,
                      MacroArgs *Actuals) {
  // If the client is reusing a TokenLexer, make sure to free any memory
  // associated with it.
  destroy();

  Macro = MI;
  ActualArgs = Actuals;
  CurTokenIdx = 0;

  // Carry over the invoking token's whitespace/start-of-line state so the
  // first expanded token is printed the same way the macro name was.
  ExpandLocStart = Tok.getLocation();
  ExpandLocEnd = ELEnd;
  AtStartOfLine = Tok.isAtStartOfLine();
  HasLeadingSpace = Tok.hasLeadingSpace();
  NextTokGetsSpace = false;
  Tokens = &*Macro->tokens_begin();
  OwnsTokens = false;
  DisableMacroExpansion = false;
  IsReinject = false;
  NumTokens = Macro->tokens_end()-Macro->tokens_begin();
  MacroExpansionStart = SourceLocation();

  SourceManager &SM = PP.getSourceManager();
  MacroStartSLocOffset = SM.getNextLocalOffset();

  if (NumTokens > 0) {
    assert(Tokens[0].getLocation().isValid());
    assert((Tokens[0].getLocation().isFileID() || Tokens[0].is(tok::comment)) &&
           "Macro defined in macro?");
    assert(ExpandLocStart.isValid());

    // Reserve a source location entry chunk for the length of the macro
    // definition. Tokens that get lexed directly from the definition will
    // have their locations pointing inside this chunk. This is to avoid
    // creating separate source location entries for each token.
    MacroDefStart = SM.getExpansionLoc(Tokens[0].getLocation());
    MacroDefLength = Macro->getDefinitionLength(SM);
    MacroExpansionStart = SM.createExpansionLoc(MacroDefStart,
                                                ExpandLocStart,
                                                ExpandLocEnd,
                                                MacroDefLength);
  }

  // If this is a function-like macro, expand the arguments and change
  // Tokens to point to the expanded tokens.
  if (Macro->isFunctionLike() && Macro->getNumParams())
    ExpandFunctionArguments();

  // Mark the macro as currently disabled, so that it is not recursively
  // expanded.  The macro must be disabled only after argument pre-expansion of
  // function-like macro arguments occurs.
  Macro->DisableMacro();
}

/// Create a TokenLexer for the specified token stream.  This does not
/// take ownership of the specified token vector.
+void TokenLexer::Init(const Token *TokArray, unsigned NumToks, + bool disableMacroExpansion, bool ownsTokens, + bool isReinject) { + assert(!isReinject || disableMacroExpansion); + // If the client is reusing a TokenLexer, make sure to free any memory + // associated with it. + destroy(); + + Macro = nullptr; + ActualArgs = nullptr; + Tokens = TokArray; + OwnsTokens = ownsTokens; + DisableMacroExpansion = disableMacroExpansion; + IsReinject = isReinject; + NumTokens = NumToks; + CurTokenIdx = 0; + ExpandLocStart = ExpandLocEnd = SourceLocation(); + AtStartOfLine = false; + HasLeadingSpace = false; + NextTokGetsSpace = false; + MacroExpansionStart = SourceLocation(); + + // Set HasLeadingSpace/AtStartOfLine so that the first token will be + // returned unmodified. + if (NumToks != 0) { + AtStartOfLine = TokArray[0].isAtStartOfLine(); + HasLeadingSpace = TokArray[0].hasLeadingSpace(); + } +} + +void TokenLexer::destroy() { + // If this was a function-like macro that actually uses its arguments, delete + // the expanded tokens. + if (OwnsTokens) { + delete [] Tokens; + Tokens = nullptr; + OwnsTokens = false; + } + + // TokenLexer owns its formal arguments. + if (ActualArgs) ActualArgs->destroy(PP); +} + +bool TokenLexer::MaybeRemoveCommaBeforeVaArgs( + SmallVectorImpl<Token> &ResultToks, bool HasPasteOperator, MacroInfo *Macro, + unsigned MacroArgNo, Preprocessor &PP) { + // Is the macro argument __VA_ARGS__? + if (!Macro->isVariadic() || MacroArgNo != Macro->getNumParams()-1) + return false; + + // In Microsoft-compatibility mode, a comma is removed in the expansion + // of " ... , __VA_ARGS__ " if __VA_ARGS__ is empty. This extension is + // not supported by gcc. + if (!HasPasteOperator && !PP.getLangOpts().MSVCCompat) + return false; + + // GCC removes the comma in the expansion of " ... , ## __VA_ARGS__ " if + // __VA_ARGS__ is empty, but not in strict C99 mode where there are no + // named arguments, where it remains. 
In all other modes, including C99 + // with GNU extensions, it is removed regardless of named arguments. + // Microsoft also appears to support this extension, unofficially. + if (PP.getLangOpts().C99 && !PP.getLangOpts().GNUMode + && Macro->getNumParams() < 2) + return false; + + // Is a comma available to be removed? + if (ResultToks.empty() || !ResultToks.back().is(tok::comma)) + return false; + + // Issue an extension diagnostic for the paste operator. + if (HasPasteOperator) + PP.Diag(ResultToks.back().getLocation(), diag::ext_paste_comma); + + // Remove the comma. + ResultToks.pop_back(); + + if (!ResultToks.empty()) { + // If the comma was right after another paste (e.g. "X##,##__VA_ARGS__"), + // then removal of the comma should produce a placemarker token (in C99 + // terms) which we model by popping off the previous ##, giving us a plain + // "X" when __VA_ARGS__ is empty. + if (ResultToks.back().is(tok::hashhash)) + ResultToks.pop_back(); + + // Remember that this comma was elided. + ResultToks.back().setFlag(Token::CommaAfterElided); + } + + // Never add a space, even if the comma, ##, or arg had a space. + NextTokGetsSpace = false; + return true; +} + +void TokenLexer::stringifyVAOPTContents( + SmallVectorImpl<Token> &ResultToks, const VAOptExpansionContext &VCtx, + const SourceLocation VAOPTClosingParenLoc) { + const int NumToksPriorToVAOpt = VCtx.getNumberOfTokensPriorToVAOpt(); + const unsigned int NumVAOptTokens = ResultToks.size() - NumToksPriorToVAOpt; + Token *const VAOPTTokens = + NumVAOptTokens ? &ResultToks[NumToksPriorToVAOpt] : nullptr; + + SmallVector<Token, 64> ConcatenatedVAOPTResultToks; + // FIXME: Should we keep track within VCtx that we did or didnot + // encounter pasting - and only then perform this loop. + + // Perform token pasting (concatenation) prior to stringization. 
+ for (unsigned int CurTokenIdx = 0; CurTokenIdx != NumVAOptTokens; + ++CurTokenIdx) { + if (VAOPTTokens[CurTokenIdx].is(tok::hashhash)) { + assert(CurTokenIdx != 0 && + "Can not have __VAOPT__ contents begin with a ##"); + Token &LHS = VAOPTTokens[CurTokenIdx - 1]; + pasteTokens(LHS, llvm::makeArrayRef(VAOPTTokens, NumVAOptTokens), + CurTokenIdx); + // Replace the token prior to the first ## in this iteration. + ConcatenatedVAOPTResultToks.back() = LHS; + if (CurTokenIdx == NumVAOptTokens) + break; + } + ConcatenatedVAOPTResultToks.push_back(VAOPTTokens[CurTokenIdx]); + } + + ConcatenatedVAOPTResultToks.push_back(VCtx.getEOFTok()); + // Get the SourceLocation that represents the start location within + // the macro definition that marks where this string is substituted + // into: i.e. the __VA_OPT__ and the ')' within the spelling of the + // macro definition, and use it to indicate that the stringified token + // was generated from that location. + const SourceLocation ExpansionLocStartWithinMacro = + getExpansionLocForMacroDefLoc(VCtx.getVAOptLoc()); + const SourceLocation ExpansionLocEndWithinMacro = + getExpansionLocForMacroDefLoc(VAOPTClosingParenLoc); + + Token StringifiedVAOPT = MacroArgs::StringifyArgument( + &ConcatenatedVAOPTResultToks[0], PP, VCtx.hasCharifyBefore() /*Charify*/, + ExpansionLocStartWithinMacro, ExpansionLocEndWithinMacro); + + if (VCtx.getLeadingSpaceForStringifiedToken()) + StringifiedVAOPT.setFlag(Token::LeadingSpace); + + StringifiedVAOPT.setFlag(Token::StringifiedInMacro); + // Resize (shrink) the token stream to just capture this stringified token. + ResultToks.resize(NumToksPriorToVAOpt + 1); + ResultToks.back() = StringifiedVAOPT; +} + +/// Expand the arguments of a function-like macro so that we can quickly +/// return preexpanded tokens from Tokens. +void TokenLexer::ExpandFunctionArguments() { + SmallVector<Token, 128> ResultToks; + + // Loop through 'Tokens', expanding them into ResultToks. 
Keep + // track of whether we change anything. If not, no need to keep them. If so, + // we install the newly expanded sequence as the new 'Tokens' list. + bool MadeChange = false; + + Optional<bool> CalledWithVariadicArguments; + + VAOptExpansionContext VCtx(PP); + + for (unsigned I = 0, E = NumTokens; I != E; ++I) { + const Token &CurTok = Tokens[I]; + // We don't want a space for the next token after a paste + // operator. In valid code, the token will get smooshed onto the + // preceding one anyway. In assembler-with-cpp mode, invalid + // pastes are allowed through: in this case, we do not want the + // extra whitespace to be added. For example, we want ". ## foo" + // -> ".foo" not ". foo". + if (I != 0 && !Tokens[I-1].is(tok::hashhash) && CurTok.hasLeadingSpace()) + NextTokGetsSpace = true; + + if (VCtx.isVAOptToken(CurTok)) { + MadeChange = true; + assert(Tokens[I + 1].is(tok::l_paren) && + "__VA_OPT__ must be followed by '('"); + + ++I; // Skip the l_paren + VCtx.sawVAOptFollowedByOpeningParens(CurTok.getLocation(), + ResultToks.size()); + + continue; + } + + // We have entered into the __VA_OPT__ context, so handle tokens + // appropriately. + if (VCtx.isInVAOpt()) { + // If we are about to process a token that is either an argument to + // __VA_OPT__ or its closing rparen, then: + // 1) If the token is the closing rparen that exits us out of __VA_OPT__, + // perform any necessary stringification or placemarker processing, + // and/or skip to the next token. + // 2) else if macro was invoked without variadic arguments skip this + // token. + // 3) else (macro was invoked with variadic arguments) process the token + // normally. + + if (Tokens[I].is(tok::l_paren)) + VCtx.sawOpeningParen(Tokens[I].getLocation()); + // Continue skipping tokens within __VA_OPT__ if the macro was not + // called with variadic arguments, else let the rest of the loop handle + // this token. 
Note sawClosingParen() returns true only if the r_paren matches + // the closing r_paren of the __VA_OPT__. + if (!Tokens[I].is(tok::r_paren) || !VCtx.sawClosingParen()) { + // Lazily expand __VA_ARGS__ when we see the first __VA_OPT__. + if (!CalledWithVariadicArguments.hasValue()) { + CalledWithVariadicArguments = + ActualArgs->invokedWithVariadicArgument(Macro, PP); + } + if (!*CalledWithVariadicArguments) { + // Skip this token. + continue; + } + // ... else the macro was called with variadic arguments, and we do not + // have a closing rparen - so process this token normally. + } else { + // Current token is the closing r_paren which marks the end of the + // __VA_OPT__ invocation, so handle any place-marker pasting (if + // empty) by removing hashhash either before (if exists) or after. And + // also stringify the entire contents if VAOPT was preceded by a hash, + // but do so only after any token concatenation that needs to occur + // within the contents of VAOPT. + + if (VCtx.hasStringifyOrCharifyBefore()) { + // Replace all the tokens just added from within VAOPT into a single + // stringified token. This requires token-pasting to eagerly occur + // within these tokens. If either the contents of VAOPT were empty + // or the macro wasn't called with any variadic arguments, the result + // is a token that represents an empty string. + stringifyVAOPTContents(ResultToks, VCtx, + /*ClosingParenLoc*/ Tokens[I].getLocation()); + + } else if (/*No tokens within VAOPT*/ + ResultToks.size() == VCtx.getNumberOfTokensPriorToVAOpt()) { + // Treat VAOPT as a placemarker token. Eat either the '##' before the + // RHS/VAOPT (if one exists, suggesting that the LHS (if any) to that + // hashhash was not a placemarker) or the '##' + // after VAOPT, but not both. + + if (ResultToks.size() && ResultToks.back().is(tok::hashhash)) { + ResultToks.pop_back(); + } else if ((I + 1 != E) && Tokens[I + 1].is(tok::hashhash)) { + ++I; // Skip the following hashhash. 
+ } + } else { + // If there's a ## before the __VA_OPT__, we might have discovered + // that the __VA_OPT__ begins with a placeholder. We delay action on + // that to now to avoid messing up our stashed count of tokens before + // __VA_OPT__. + if (VCtx.beginsWithPlaceholder()) { + assert(VCtx.getNumberOfTokensPriorToVAOpt() > 0 && + ResultToks.size() >= VCtx.getNumberOfTokensPriorToVAOpt() && + ResultToks[VCtx.getNumberOfTokensPriorToVAOpt() - 1].is( + tok::hashhash) && + "no token paste before __VA_OPT__"); + ResultToks.erase(ResultToks.begin() + + VCtx.getNumberOfTokensPriorToVAOpt() - 1); + } + // If the expansion of __VA_OPT__ ends with a placeholder, eat any + // following '##' token. + if (VCtx.endsWithPlaceholder() && I + 1 != E && + Tokens[I + 1].is(tok::hashhash)) { + ++I; + } + } + VCtx.reset(); + // We processed __VA_OPT__'s closing paren (and the exit out of + // __VA_OPT__), so skip to the next token. + continue; + } + } + + // If we found the stringify operator, get the argument stringified. The + // preprocessor already verified that the following token is a macro + // parameter or __VA_OPT__ when the #define was lexed. + + if (CurTok.isOneOf(tok::hash, tok::hashat)) { + int ArgNo = Macro->getParameterNum(Tokens[I+1].getIdentifierInfo()); + assert((ArgNo != -1 || VCtx.isVAOptToken(Tokens[I + 1])) && + "Token following # is not an argument or __VA_OPT__!"); + + if (ArgNo == -1) { + // Handle the __VA_OPT__ case. + VCtx.sawHashOrHashAtBefore(NextTokGetsSpace, + CurTok.is(tok::hashat)); + continue; + } + // Else handle the simple argument case. 
+ SourceLocation ExpansionLocStart = + getExpansionLocForMacroDefLoc(CurTok.getLocation()); + SourceLocation ExpansionLocEnd = + getExpansionLocForMacroDefLoc(Tokens[I+1].getLocation()); + + bool Charify = CurTok.is(tok::hashat); + const Token *UnexpArg = ActualArgs->getUnexpArgument(ArgNo); + Token Res = MacroArgs::StringifyArgument( + UnexpArg, PP, Charify, ExpansionLocStart, ExpansionLocEnd); + Res.setFlag(Token::StringifiedInMacro); + + // The stringified/charified string leading space flag gets set to match + // the #/#@ operator. + if (NextTokGetsSpace) + Res.setFlag(Token::LeadingSpace); + + ResultToks.push_back(Res); + MadeChange = true; + ++I; // Skip arg name. + NextTokGetsSpace = false; + continue; + } + + // Find out if there is a paste (##) operator before or after the token. + bool NonEmptyPasteBefore = + !ResultToks.empty() && ResultToks.back().is(tok::hashhash); + bool PasteBefore = I != 0 && Tokens[I-1].is(tok::hashhash); + bool PasteAfter = I+1 != E && Tokens[I+1].is(tok::hashhash); + bool RParenAfter = I+1 != E && Tokens[I+1].is(tok::r_paren); + + assert((!NonEmptyPasteBefore || PasteBefore || VCtx.isInVAOpt()) && + "unexpected ## in ResultToks"); + + // Otherwise, if this is not an argument token, just add the token to the + // output buffer. + IdentifierInfo *II = CurTok.getIdentifierInfo(); + int ArgNo = II ? Macro->getParameterNum(II) : -1; + if (ArgNo == -1) { + // This isn't an argument, just add it. + ResultToks.push_back(CurTok); + + if (NextTokGetsSpace) { + ResultToks.back().setFlag(Token::LeadingSpace); + NextTokGetsSpace = false; + } else if (PasteBefore && !NonEmptyPasteBefore) + ResultToks.back().clearFlag(Token::LeadingSpace); + + continue; + } + + // An argument is expanded somehow, the result is different than the + // input. + MadeChange = true; + + // Otherwise, this is a use of the argument. + + // In Microsoft mode, remove the comma before __VA_ARGS__ to ensure there + // are no trailing commas if __VA_ARGS__ is empty. 
+ if (!PasteBefore && ActualArgs->isVarargsElidedUse() && + MaybeRemoveCommaBeforeVaArgs(ResultToks, + /*HasPasteOperator=*/false, + Macro, ArgNo, PP)) + continue; + + // If it is not the LHS/RHS of a ## operator, we must pre-expand the + // argument and substitute the expanded tokens into the result. This is + // C99 6.10.3.1p1. + if (!PasteBefore && !PasteAfter) { + const Token *ResultArgToks; + + // Only preexpand the argument if it could possibly need it. This + // avoids some work in common cases. + const Token *ArgTok = ActualArgs->getUnexpArgument(ArgNo); + if (ActualArgs->ArgNeedsPreexpansion(ArgTok, PP)) + ResultArgToks = &ActualArgs->getPreExpArgument(ArgNo, PP)[0]; + else + ResultArgToks = ArgTok; // Use non-preexpanded tokens. + + // If the arg token expanded into anything, append it. + if (ResultArgToks->isNot(tok::eof)) { + size_t FirstResult = ResultToks.size(); + unsigned NumToks = MacroArgs::getArgLength(ResultArgToks); + ResultToks.append(ResultArgToks, ResultArgToks+NumToks); + + // In Microsoft-compatibility mode, we follow MSVC's preprocessing + // behavior by not considering single commas from nested macro + // expansions as argument separators. Set a flag on the token so we can + // test for this later when the macro expansion is processed. + if (PP.getLangOpts().MSVCCompat && NumToks == 1 && + ResultToks.back().is(tok::comma)) + ResultToks.back().setFlag(Token::IgnoredComma); + + // If the '##' came from expanding an argument, turn it into 'unknown' + // to avoid pasting. + for (Token &Tok : llvm::make_range(ResultToks.begin() + FirstResult, + ResultToks.end())) { + if (Tok.is(tok::hashhash)) + Tok.setKind(tok::unknown); + } + + if(ExpandLocStart.isValid()) { + updateLocForMacroArgTokens(CurTok.getLocation(), + ResultToks.begin()+FirstResult, + ResultToks.end()); + } + + // If any tokens were substituted from the argument, the whitespace + // before the first token should match the whitespace of the arg + // identifier. 
+ ResultToks[FirstResult].setFlagValue(Token::LeadingSpace, + NextTokGetsSpace); + ResultToks[FirstResult].setFlagValue(Token::StartOfLine, false); + NextTokGetsSpace = false; + } else { + // We're creating a placeholder token. Usually this doesn't matter, + // but it can affect paste behavior when at the start or end of a + // __VA_OPT__. + if (NonEmptyPasteBefore) { + // We're imagining a placeholder token is inserted here. If this is + // the first token in a __VA_OPT__ after a ##, delete the ##. + assert(VCtx.isInVAOpt() && "should only happen inside a __VA_OPT__"); + VCtx.hasPlaceholderAfterHashhashAtStart(); + } + if (RParenAfter) + VCtx.hasPlaceholderBeforeRParen(); + } + continue; + } + + // Okay, we have a token that is either the LHS or RHS of a paste (##) + // argument. It gets substituted as its non-pre-expanded tokens. + const Token *ArgToks = ActualArgs->getUnexpArgument(ArgNo); + unsigned NumToks = MacroArgs::getArgLength(ArgToks); + if (NumToks) { // Not an empty argument? + bool VaArgsPseudoPaste = false; + // If this is the GNU ", ## __VA_ARGS__" extension, and we just learned + // that __VA_ARGS__ expands to multiple tokens, avoid a pasting error when + // the expander tries to paste ',' with the first token of the __VA_ARGS__ + // expansion. + if (NonEmptyPasteBefore && ResultToks.size() >= 2 && + ResultToks[ResultToks.size()-2].is(tok::comma) && + (unsigned)ArgNo == Macro->getNumParams()-1 && + Macro->isVariadic()) { + VaArgsPseudoPaste = true; + // Remove the paste operator, report use of the extension. + PP.Diag(ResultToks.pop_back_val().getLocation(), diag::ext_paste_comma); + } + + ResultToks.append(ArgToks, ArgToks+NumToks); + + // If the '##' came from expanding an argument, turn it into 'unknown' + // to avoid pasting. 
+ for (Token &Tok : llvm::make_range(ResultToks.end() - NumToks, + ResultToks.end())) { + if (Tok.is(tok::hashhash)) + Tok.setKind(tok::unknown); + } + + if (ExpandLocStart.isValid()) { + updateLocForMacroArgTokens(CurTok.getLocation(), + ResultToks.end()-NumToks, ResultToks.end()); + } + + // Transfer the leading whitespace information from the token + // (the macro argument) onto the first token of the + // expansion. Note that we don't do this for the GNU + // pseudo-paste extension ", ## __VA_ARGS__". + if (!VaArgsPseudoPaste) { + ResultToks[ResultToks.size() - NumToks].setFlagValue(Token::StartOfLine, + false); + ResultToks[ResultToks.size() - NumToks].setFlagValue( + Token::LeadingSpace, NextTokGetsSpace); + } + + NextTokGetsSpace = false; + continue; + } + + // If an empty argument is on the LHS or RHS of a paste, the standard (C99 + // 6.10.3.3p2,3) calls for a bunch of placemarker stuff to occur. We + // implement this by eating ## operators when a LHS or RHS expands to + // empty. + if (PasteAfter) { + // Discard the argument token and skip (don't copy to the expansion + // buffer) the paste operator after it. + ++I; + continue; + } + + if (RParenAfter) + VCtx.hasPlaceholderBeforeRParen(); + + // If this is on the RHS of a paste operator, we've already copied the + // paste operator to the ResultToks list, unless the LHS was empty too. + // Remove it. + assert(PasteBefore); + if (NonEmptyPasteBefore) { + assert(ResultToks.back().is(tok::hashhash)); + // Do not remove the paste operator if it is the one before __VA_OPT__ + // (and we are still processing tokens within VA_OPT). We handle the case + // of removing the paste operator if __VA_OPT__ reduces to the notional + // placemarker above when we encounter the closing paren of VA_OPT. 
      // NOTE(review): this closes the paste-placeholder handling begun above
      // (tail of ExpandFunctionArguments); only the ## before __VA_OPT__ is
      // kept, since its removal is decided when the VA_OPT rparen is seen.
      if (!VCtx.isInVAOpt() ||
          ResultToks.size() > VCtx.getNumberOfTokensPriorToVAOpt())
        ResultToks.pop_back();
      else
        VCtx.hasPlaceholderAfterHashhashAtStart();
    }

    // If this is the __VA_ARGS__ token, and if the argument wasn't provided,
    // and if the macro had at least one real argument, and if the token before
    // the ## was a comma, remove the comma.  This is a GCC extension which is
    // disabled when using -std=c99.
    if (ActualArgs->isVarargsElidedUse())
      MaybeRemoveCommaBeforeVaArgs(ResultToks,
                                   /*HasPasteOperator=*/true,
                                   Macro, ArgNo, PP);
  }

  // If anything changed, install this as the new Tokens list.
  if (MadeChange) {
    assert(!OwnsTokens && "This would leak if we already own the token list");
    // This is deleted in the dtor.
    NumTokens = ResultToks.size();
    // The tokens will be added to Preprocessor's cache and will be removed
    // when this TokenLexer finishes lexing them.
    Tokens = PP.cacheMacroExpandedTokens(this, ResultToks);

    // The preprocessor cache of macro expanded tokens owns these tokens, not
    // us.
    OwnsTokens = false;
  }
}

/// Checks if two tokens form a wide string literal: an 'L' identifier
/// followed by a literal that was produced by stringification inside a macro.
static bool isWideStringLiteralFromMacro(const Token &FirstTok,
                                         const Token &SecondTok) {
  return FirstTok.is(tok::identifier) &&
         FirstTok.getIdentifierInfo()->isStr("L") && SecondTok.isLiteral() &&
         SecondTok.stringifiedInMacro();
}

/// Lex - Lex and return a token from this macro stream.
bool TokenLexer::Lex(Token &Tok) {
  // Lexing off the end of the macro, pop this macro off the expansion stack.
  if (isAtEnd()) {
    // If this is a macro (not a token stream), mark the macro enabled now
    // that it is no longer being expanded.
    if (Macro) Macro->EnableMacro();

    // Return a synthetic end-of-lexer token carrying the pending whitespace
    // and start-of-line state so the next real token inherits it.
    Tok.startToken();
    Tok.setFlagValue(Token::StartOfLine , AtStartOfLine);
    Tok.setFlagValue(Token::LeadingSpace, HasLeadingSpace || NextTokGetsSpace);
    if (CurTokenIdx == 0)
      Tok.setFlag(Token::LeadingEmptyMacro);
    return PP.HandleEndOfTokenLexer(Tok);
  }

  SourceManager &SM = PP.getSourceManager();

  // If this is the first token of the expanded result, we inherit spacing
  // properties later.
  bool isFirstToken = CurTokenIdx == 0;

  // Get the next token to return.
  Tok = Tokens[CurTokenIdx++];
  if (IsReinject)
    Tok.setFlag(Token::IsReinjected);

  bool TokenIsFromPaste = false;

  // If this token is followed by a token paste (##) operator, paste the tokens!
  // Note that ## is a normal token when not expanding a macro.
  if (!isAtEnd() && Macro &&
      (Tokens[CurTokenIdx].is(tok::hashhash) ||
       // Special processing of L#x macros in -fms-compatibility mode.
       // Microsoft compiler is able to form a wide string literal from
       // 'L#macro_arg' construct in a function-like macro.
       (PP.getLangOpts().MSVCCompat &&
        isWideStringLiteralFromMacro(Tok, Tokens[CurTokenIdx])))) {
    // When handling the microsoft /##/ extension, the final token is
    // returned by pasteTokens, not the pasted token.
    if (pasteTokens(Tok))
      return true;

    TokenIsFromPaste = true;
  }

  // The token's current location indicates where the token was lexed from.  We
  // need this information to compute the spelling of the token, but any
  // diagnostics for the expanded token should appear as if they came from
  // ExpansionLoc.  Pull this information together into a new SourceLocation
  // that captures all of this.
  if (ExpandLocStart.isValid() &&   // Don't do this for token streams.
      // Check that the token's location was not already set properly.
      SM.isBeforeInSLocAddrSpace(Tok.getLocation(), MacroStartSLocOffset)) {
    SourceLocation instLoc;
    if (Tok.is(tok::comment)) {
      instLoc = SM.createExpansionLoc(Tok.getLocation(),
                                      ExpandLocStart,
                                      ExpandLocEnd,
                                      Tok.getLength());
    } else {
      instLoc = getExpansionLocForMacroDefLoc(Tok.getLocation());
    }

    Tok.setLocation(instLoc);
  }

  // If this is the first token, set the lexical properties of the token to
  // match the lexical properties of the macro identifier.
  if (isFirstToken) {
    Tok.setFlagValue(Token::StartOfLine , AtStartOfLine);
    Tok.setFlagValue(Token::LeadingSpace, HasLeadingSpace);
  } else {
    // If this is not the first token, we may still need to pass through
    // leading whitespace if we've expanded a macro.
    if (AtStartOfLine) Tok.setFlag(Token::StartOfLine);
    if (HasLeadingSpace) Tok.setFlag(Token::LeadingSpace);
  }
  // The pending state has been consumed by this token.
  AtStartOfLine = false;
  HasLeadingSpace = false;

  // Handle recursive expansion!
  if (!Tok.isAnnotation() && Tok.getIdentifierInfo() != nullptr) {
    // Change the kind of this identifier to the appropriate token kind, e.g.
    // turning "for" into a keyword.
    IdentifierInfo *II = Tok.getIdentifierInfo();
    Tok.setKind(II->getTokenID());

    // If this identifier was poisoned and from a paste, emit an error.  This
    // won't be handled by Preprocessor::HandleIdentifier because this is coming
    // from a macro expansion.
    if (II->isPoisoned() && TokenIsFromPaste) {
      PP.HandlePoisonedIdentifier(Tok);
    }

    if (!DisableMacroExpansion && II->isHandleIdentifierCase())
      return PP.HandleIdentifier(Tok);
  }

  // Otherwise, return a normal token.
  return true;
}

// Convenience overload: paste using this lexer's own token stream and cursor.
bool TokenLexer::pasteTokens(Token &Tok) {
  return pasteTokens(Tok, llvm::makeArrayRef(Tokens, NumTokens), CurTokenIdx);
}

/// LHSTok is the LHS of a ## operator, and CurTokenIdx is the ##
/// operator. Read the ## and RHS, and paste the LHS/RHS together.  If there
/// are more ## after it, chomp them iteratively.
/// Return the result as LHSTok.
/// If this returns true, the caller should immediately return the token.
bool TokenLexer::pasteTokens(Token &LHSTok, ArrayRef<Token> TokenStream,
                             unsigned int &CurIdx) {
  assert(CurIdx > 0 && "## can not be the first token within tokens");
  assert((TokenStream[CurIdx].is(tok::hashhash) ||
          (PP.getLangOpts().MSVCCompat &&
           isWideStringLiteralFromMacro(LHSTok, TokenStream[CurIdx]))) &&
             "Token at this Index must be ## or part of the MSVC 'L "
             "#macro-arg' pasting pair");

  // MSVC: If previous token was pasted, this must be a recovery from an invalid
  // paste operation. Ignore spaces before this token to mimic MSVC output.
  // Required for generating valid UUID strings in some MS headers.
  if (PP.getLangOpts().MicrosoftExt && (CurIdx >= 2) &&
      TokenStream[CurIdx - 2].is(tok::hashhash))
    LHSTok.clearFlag(Token::LeadingSpace);

  SmallString<128> Buffer;
  const char *ResultTokStrPtr = nullptr;
  SourceLocation StartLoc = LHSTok.getLocation();
  SourceLocation PasteOpLoc;

  auto IsAtEnd = [&TokenStream, &CurIdx] {
    return TokenStream.size() == CurIdx;
  };

  // Each iteration pastes LHSTok with the next RHS; chained ## operators are
  // folded left-to-right by the loop condition below.
  do {
    // Consume the ## operator if any.
    PasteOpLoc = TokenStream[CurIdx].getLocation();
    if (TokenStream[CurIdx].is(tok::hashhash))
      ++CurIdx;
    assert(!IsAtEnd() && "No token on the RHS of a paste operator!");

    // Get the RHS token.
    const Token &RHS = TokenStream[CurIdx];

    // Allocate space for the result token.  This is guaranteed to be enough for
    // the two tokens.
    Buffer.resize(LHSTok.getLength() + RHS.getLength());

    // Get the spelling of the LHS token in Buffer.
    const char *BufPtr = &Buffer[0];
    bool Invalid = false;
    unsigned LHSLen = PP.getSpelling(LHSTok, BufPtr, &Invalid);
    if (BufPtr != &Buffer[0])   // Really, we want the chars in Buffer!
      memcpy(&Buffer[0], BufPtr, LHSLen);
    if (Invalid)
      return true;

    // Append the RHS spelling immediately after the LHS spelling.
    BufPtr = Buffer.data() + LHSLen;
    unsigned RHSLen = PP.getSpelling(RHS, BufPtr, &Invalid);
    if (Invalid)
      return true;
    if (RHSLen && BufPtr != &Buffer[LHSLen])
      // Really, we want the chars in Buffer!
      memcpy(&Buffer[LHSLen], BufPtr, RHSLen);

    // Trim excess space.
    Buffer.resize(LHSLen+RHSLen);

    // Plop the pasted result (including the trailing newline and null) into a
    // scratch buffer where we can lex it.
    Token ResultTokTmp;
    ResultTokTmp.startToken();

    // Claim that the tmp token is a string_literal so that we can get the
    // character pointer back from CreateString in getLiteralData().
    ResultTokTmp.setKind(tok::string_literal);
    PP.CreateString(Buffer, ResultTokTmp);
    SourceLocation ResultTokLoc = ResultTokTmp.getLocation();
    ResultTokStrPtr = ResultTokTmp.getLiteralData();

    // Lex the resultant pasted token into Result.
    Token Result;

    if (LHSTok.isAnyIdentifier() && RHS.isAnyIdentifier()) {
      // Common paste case: identifier+identifier = identifier.  Avoid creating
      // a lexer and other overhead.
      PP.IncrementPasteCounter(true);
      Result.startToken();
      Result.setKind(tok::raw_identifier);
      Result.setRawIdentifierData(ResultTokStrPtr);
      Result.setLocation(ResultTokLoc);
      Result.setLength(LHSLen+RHSLen);
    } else {
      PP.IncrementPasteCounter(false);

      assert(ResultTokLoc.isFileID() &&
             "Should be a raw location into scratch buffer");
      SourceManager &SourceMgr = PP.getSourceManager();
      FileID LocFileID = SourceMgr.getFileID(ResultTokLoc);

      bool Invalid = false;
      const char *ScratchBufStart
        = SourceMgr.getBufferData(LocFileID, &Invalid).data();
      if (Invalid)
        return false;

      // Make a lexer to lex this string from.  Lex just this one token.
      // Make a lexer object so that we lex and expand the paste result.
      Lexer TL(SourceMgr.getLocForStartOfFile(LocFileID),
               PP.getLangOpts(), ScratchBufStart,
               ResultTokStrPtr, ResultTokStrPtr+LHSLen+RHSLen);

      // Lex a token in raw mode.  This way it won't look up identifiers
      // automatically, lexing off the end will return an eof token, and
      // warnings are disabled.  This returns true if the result token is the
      // entire buffer.
      bool isInvalid = !TL.LexFromRawLexer(Result);

      // If we got an EOF token, we didn't form even ONE token.  For example, we
      // did "/ ## /" to get "//".
      isInvalid |= Result.is(tok::eof);

      // If pasting the two tokens didn't form a full new token, this is an
      // error.  This occurs with "x ## +" and other stuff.  Return with LHSTok
      // unmodified and with RHS as the next token to lex.
      if (isInvalid) {
        // Explicitly convert the token location to have proper expansion
        // information so that the user knows where it came from.
        SourceManager &SM = PP.getSourceManager();
        SourceLocation Loc =
          SM.createExpansionLoc(PasteOpLoc, ExpandLocStart, ExpandLocEnd, 2);

        // Test for the Microsoft extension of /##/ turning into // here on the
        // error path.
        if (PP.getLangOpts().MicrosoftExt && LHSTok.is(tok::slash) &&
            RHS.is(tok::slash)) {
          HandleMicrosoftCommentPaste(LHSTok, Loc);
          return true;
        }

        // Do not emit the error when preprocessing assembler code.
        if (!PP.getLangOpts().AsmPreprocessor) {
          // If we're in microsoft extensions mode, downgrade this from a hard
          // error to an extension that defaults to an error.  This allows
          // disabling it.
          PP.Diag(Loc, PP.getLangOpts().MicrosoftExt ? diag::ext_pp_bad_paste_ms
                                                     : diag::err_pp_bad_paste)
              << Buffer;
        }

        // An error has occurred so exit loop.
        break;
      }

      // Turn ## into 'unknown' to avoid # ## # from looking like a paste
      // operator.
      if (Result.is(tok::hashhash))
        Result.setKind(tok::unknown);
    }

    // Transfer properties of the LHS over the Result.
    Result.setFlagValue(Token::StartOfLine , LHSTok.isAtStartOfLine());
    Result.setFlagValue(Token::LeadingSpace, LHSTok.hasLeadingSpace());

    // Finally, replace LHS with the result, consume the RHS, and iterate.
    ++CurIdx;
    LHSTok = Result;
  } while (!IsAtEnd() && TokenStream[CurIdx].is(tok::hashhash));

  SourceLocation EndLoc = TokenStream[CurIdx - 1].getLocation();

  // The token's current location indicates where the token was lexed from.  We
  // need this information to compute the spelling of the token, but any
  // diagnostics for the expanded token should appear as if the token was
  // expanded from the full ## expression.  Pull this information together into
  // a new SourceLocation that captures all of this.
  SourceManager &SM = PP.getSourceManager();
  if (StartLoc.isFileID())
    StartLoc = getExpansionLocForMacroDefLoc(StartLoc);
  if (EndLoc.isFileID())
    EndLoc = getExpansionLocForMacroDefLoc(EndLoc);
  FileID MacroFID = SM.getFileID(MacroExpansionStart);
  // Walk both endpoints up the expansion chain until they live in the
  // macro-expansion FileID, so the range is expressed in one address space.
  while (SM.getFileID(StartLoc) != MacroFID)
    StartLoc = SM.getImmediateExpansionRange(StartLoc).getBegin();
  while (SM.getFileID(EndLoc) != MacroFID)
    EndLoc = SM.getImmediateExpansionRange(EndLoc).getEnd();

  LHSTok.setLocation(SM.createExpansionLoc(LHSTok.getLocation(), StartLoc, EndLoc,
                                           LHSTok.getLength()));

  // Now that we got the result token, it will be subject to expansion.  Since
  // token pasting re-lexes the result token in raw mode, identifier information
  // isn't looked up.  As such, if the result is an identifier, look up id info.
  if (LHSTok.is(tok::raw_identifier)) {
    // Look up the identifier info for the token.  We disabled identifier lookup
    // by saying we're skipping contents, so we need to do this manually.
    PP.LookUpIdentifierInfo(LHSTok);
  }
  return false;
}

/// isNextTokenLParen - If the next token lexed will pop this macro off the
/// expansion stack, return 2.  If the next unexpanded token is a '(', return
/// 1, otherwise return 0.
+unsigned TokenLexer::isNextTokenLParen() const { + // Out of tokens? + if (isAtEnd()) + return 2; + return Tokens[CurTokenIdx].is(tok::l_paren); +} + +/// isParsingPreprocessorDirective - Return true if we are in the middle of a +/// preprocessor directive. +bool TokenLexer::isParsingPreprocessorDirective() const { + return Tokens[NumTokens-1].is(tok::eod) && !isAtEnd(); +} + +/// HandleMicrosoftCommentPaste - In microsoft compatibility mode, /##/ pastes +/// together to form a comment that comments out everything in the current +/// macro, other active macros, and anything left on the current physical +/// source line of the expanded buffer. Handle this by returning the +/// first token on the next line. +void TokenLexer::HandleMicrosoftCommentPaste(Token &Tok, SourceLocation OpLoc) { + PP.Diag(OpLoc, diag::ext_comment_paste_microsoft); + + // We 'comment out' the rest of this macro by just ignoring the rest of the + // tokens that have not been lexed yet, if any. + + // Since this must be a macro, mark the macro enabled now that it is no longer + // being expanded. + assert(Macro && "Token streams can't paste comments"); + Macro->EnableMacro(); + + PP.HandleMicrosoftCommentPaste(Tok); +} + +/// If \arg loc is a file ID and points inside the current macro +/// definition, returns the appropriate source location pointing at the +/// macro expansion source location entry, otherwise it returns an invalid +/// SourceLocation. 
SourceLocation
TokenLexer::getExpansionLocForMacroDefLoc(SourceLocation loc) const {
  assert(ExpandLocStart.isValid() && MacroExpansionStart.isValid() &&
         "Not appropriate for token streams");
  assert(loc.isValid() && loc.isFileID());

  SourceManager &SM = PP.getSourceManager();
  assert(SM.isInSLocAddrSpace(loc, MacroDefStart, MacroDefLength) &&
         "Expected loc to come from the macro definition");

  // Translate the offset-within-the-definition into the same offset within
  // the expansion's SLocEntry.
  unsigned relativeOffset = 0;
  SM.isInSLocAddrSpace(loc, MacroDefStart, MacroDefLength, &relativeOffset);
  return MacroExpansionStart.getLocWithOffset(relativeOffset);
}

/// Finds the tokens that are consecutive (from the same FileID),
/// creates a single SLocEntry, and assigns SourceLocations to each token that
/// point to that SLocEntry. e.g. for
///   assert(foo == bar);
/// There will be a single SLocEntry for the "foo == bar" chunk and locations
/// for the 'foo', '==', 'bar' tokens will point inside that chunk.
///
/// \arg begin_tokens will be updated to a position past all the found
/// consecutive tokens.
static void updateConsecutiveMacroArgTokens(SourceManager &SM,
                                            SourceLocation InstLoc,
                                            Token *&begin_tokens,
                                            Token * end_tokens) {
  assert(begin_tokens < end_tokens);

  SourceLocation FirstLoc = begin_tokens->getLocation();
  SourceLocation CurLoc = FirstLoc;

  // Compare the source location offset of tokens and group together tokens
  // that are close, even if their locations point to different FileIDs. e.g.
  //
  //  |bar    |  foo | cake   |   (3 tokens from 3 consecutive FileIDs)
  //  ^                      ^
  //  |bar       foo   cake|      (one SLocEntry chunk for all tokens)
  //
  // we can perform this "merge" since the token's spelling location depends
  // on the relative offset.

  Token *NextTok = begin_tokens + 1;
  for (; NextTok < end_tokens; ++NextTok) {
    SourceLocation NextLoc = NextTok->getLocation();
    if (CurLoc.isFileID() != NextLoc.isFileID())
      break; // Token from different kind of FileID.

    int RelOffs;
    if (!SM.isInSameSLocAddrSpace(CurLoc, NextLoc, &RelOffs))
      break; // Token from different local/loaded location.
    // Check that token is not before the previous token or more than 50
    // "characters" away.
    if (RelOffs < 0 || RelOffs > 50)
      break;

    if (CurLoc.isMacroID() && !SM.isWrittenInSameFile(CurLoc, NextLoc))
      break; // Token from a different macro.

    CurLoc = NextLoc;
  }

  // For the consecutive tokens, find the length of the SLocEntry to contain
  // all of them.
  Token &LastConsecutiveTok = *(NextTok-1);
  int LastRelOffs = 0;
  SM.isInSameSLocAddrSpace(FirstLoc, LastConsecutiveTok.getLocation(),
                           &LastRelOffs);
  unsigned FullLength = LastRelOffs + LastConsecutiveTok.getLength();

  // Create a macro expansion SLocEntry that will "contain" all of the tokens.
  SourceLocation Expansion =
      SM.createMacroArgExpansionLoc(FirstLoc, InstLoc,FullLength);

  // Change the location of the tokens from the spelling location to the new
  // expanded location.
  for (; begin_tokens < NextTok; ++begin_tokens) {
    Token &Tok = *begin_tokens;
    int RelOffs = 0;
    SM.isInSameSLocAddrSpace(FirstLoc, Tok.getLocation(), &RelOffs);
    Tok.setLocation(Expansion.getLocWithOffset(RelOffs));
  }
}

/// Creates SLocEntries and updates the locations of macro argument
/// tokens to their new expanded locations.
///
/// \param ArgIdSpellLoc the location of the macro argument id inside the macro
/// definition.
void TokenLexer::updateLocForMacroArgTokens(SourceLocation ArgIdSpellLoc,
                                            Token *begin_tokens,
                                            Token *end_tokens) {
  SourceManager &SM = PP.getSourceManager();

  // Anchor all argument tokens at the spot where the parameter name appears
  // in the macro definition.
  SourceLocation InstLoc =
      getExpansionLocForMacroDefLoc(ArgIdSpellLoc);

  while (begin_tokens < end_tokens) {
    // If there's only one token just create a SLocEntry for it.
+ if (end_tokens - begin_tokens == 1) { + Token &Tok = *begin_tokens; + Tok.setLocation(SM.createMacroArgExpansionLoc(Tok.getLocation(), + InstLoc, + Tok.getLength())); + return; + } + + updateConsecutiveMacroArgTokens(SM, InstLoc, begin_tokens, end_tokens); + } +} + +void TokenLexer::PropagateLineStartLeadingSpaceInfo(Token &Result) { + AtStartOfLine = Result.isAtStartOfLine(); + HasLeadingSpace = Result.hasLeadingSpace(); +} diff --git a/clang/lib/Lex/UnicodeCharSets.h b/clang/lib/Lex/UnicodeCharSets.h new file mode 100644 index 000000000000..74dd57fdf118 --- /dev/null +++ b/clang/lib/Lex/UnicodeCharSets.h @@ -0,0 +1,407 @@ +//===--- UnicodeCharSets.h - Contains important sets of characters --------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#ifndef LLVM_CLANG_LIB_LEX_UNICODECHARSETS_H +#define LLVM_CLANG_LIB_LEX_UNICODECHARSETS_H + +#include "llvm/Support/UnicodeCharRanges.h" + +// C11 D.1, C++11 [charname.allowed] +static const llvm::sys::UnicodeCharRange C11AllowedIDCharRanges[] = { + // 1 + { 0x00A8, 0x00A8 }, { 0x00AA, 0x00AA }, { 0x00AD, 0x00AD }, + { 0x00AF, 0x00AF }, { 0x00B2, 0x00B5 }, { 0x00B7, 0x00BA }, + { 0x00BC, 0x00BE }, { 0x00C0, 0x00D6 }, { 0x00D8, 0x00F6 }, + { 0x00F8, 0x00FF }, + // 2 + { 0x0100, 0x167F }, { 0x1681, 0x180D }, { 0x180F, 0x1FFF }, + // 3 + { 0x200B, 0x200D }, { 0x202A, 0x202E }, { 0x203F, 0x2040 }, + { 0x2054, 0x2054 }, { 0x2060, 0x206F }, + // 4 + { 0x2070, 0x218F }, { 0x2460, 0x24FF }, { 0x2776, 0x2793 }, + { 0x2C00, 0x2DFF }, { 0x2E80, 0x2FFF }, + // 5 + { 0x3004, 0x3007 }, { 0x3021, 0x302F }, { 0x3031, 0x303F }, + // 6 + { 0x3040, 0xD7FF }, + // 7 + { 0xF900, 0xFD3D }, { 0xFD40, 0xFDCF }, { 0xFDF0, 0xFE44 }, + { 0xFE47, 0xFFFD }, + // 8 + { 0x10000, 0x1FFFD }, { 0x20000, 
0x2FFFD }, { 0x30000, 0x3FFFD }, + { 0x40000, 0x4FFFD }, { 0x50000, 0x5FFFD }, { 0x60000, 0x6FFFD }, + { 0x70000, 0x7FFFD }, { 0x80000, 0x8FFFD }, { 0x90000, 0x9FFFD }, + { 0xA0000, 0xAFFFD }, { 0xB0000, 0xBFFFD }, { 0xC0000, 0xCFFFD }, + { 0xD0000, 0xDFFFD }, { 0xE0000, 0xEFFFD } +}; + +// C++03 [extendid] +// Note that this is not the same as C++98, but we don't distinguish C++98 +// and C++03 in Clang. +static const llvm::sys::UnicodeCharRange CXX03AllowedIDCharRanges[] = { + // Latin + { 0x00C0, 0x00D6 }, { 0x00D8, 0x00F6 }, { 0x00F8, 0x01F5 }, + { 0x01FA, 0x0217 }, { 0x0250, 0x02A8 }, + + // Greek + { 0x0384, 0x0384 }, { 0x0388, 0x038A }, { 0x038C, 0x038C }, + { 0x038E, 0x03A1 }, { 0x03A3, 0x03CE }, { 0x03D0, 0x03D6 }, + { 0x03DA, 0x03DA }, { 0x03DC, 0x03DC }, { 0x03DE, 0x03DE }, + { 0x03E0, 0x03E0 }, { 0x03E2, 0x03F3 }, + + // Cyrillic + { 0x0401, 0x040D }, { 0x040F, 0x044F }, { 0x0451, 0x045C }, + { 0x045E, 0x0481 }, { 0x0490, 0x04C4 }, { 0x04C7, 0x04C8 }, + { 0x04CB, 0x04CC }, { 0x04D0, 0x04EB }, { 0x04EE, 0x04F5 }, + { 0x04F8, 0x04F9 }, + + // Armenian + { 0x0531, 0x0556 }, { 0x0561, 0x0587 }, + + // Hebrew + { 0x05D0, 0x05EA }, { 0x05F0, 0x05F4 }, + + // Arabic + { 0x0621, 0x063A }, { 0x0640, 0x0652 }, { 0x0670, 0x06B7 }, + { 0x06BA, 0x06BE }, { 0x06C0, 0x06CE }, { 0x06E5, 0x06E7 }, + + // Devanagari + { 0x0905, 0x0939 }, { 0x0958, 0x0962 }, + + // Bengali + { 0x0985, 0x098C }, { 0x098F, 0x0990 }, { 0x0993, 0x09A8 }, + { 0x09AA, 0x09B0 }, { 0x09B2, 0x09B2 }, { 0x09B6, 0x09B9 }, + { 0x09DC, 0x09DD }, { 0x09DF, 0x09E1 }, { 0x09F0, 0x09F1 }, + + // Gurmukhi + { 0x0A05, 0x0A0A }, { 0x0A0F, 0x0A10 }, { 0x0A13, 0x0A28 }, + { 0x0A2A, 0x0A30 }, { 0x0A32, 0x0A33 }, { 0x0A35, 0x0A36 }, + { 0x0A38, 0x0A39 }, { 0x0A59, 0x0A5C }, { 0x0A5E, 0x0A5E }, + + // Gujarti + { 0x0A85, 0x0A8B }, { 0x0A8D, 0x0A8D }, { 0x0A8F, 0x0A91 }, + { 0x0A93, 0x0AA8 }, { 0x0AAA, 0x0AB0 }, { 0x0AB2, 0x0AB3 }, + { 0x0AB5, 0x0AB9 }, { 0x0AE0, 0x0AE0 }, + + // Oriya + { 0x0B05, 0x0B0C }, { 
0x0B0F, 0x0B10 }, { 0x0B13, 0x0B28 }, + { 0x0B2A, 0x0B30 }, { 0x0B32, 0x0B33 }, { 0x0B36, 0x0B39 }, + { 0x0B5C, 0x0B5D }, { 0x0B5F, 0x0B61 }, + + // Tamil + { 0x0B85, 0x0B8A }, { 0x0B8E, 0x0B90 }, { 0x0B92, 0x0B95 }, + { 0x0B99, 0x0B9A }, { 0x0B9C, 0x0B9C }, { 0x0B9E, 0x0B9F }, + { 0x0BA3, 0x0BA4 }, { 0x0BA8, 0x0BAA }, { 0x0BAE, 0x0BB5 }, + { 0x0BB7, 0x0BB9 }, + + // Telugu + { 0x0C05, 0x0C0C }, { 0x0C0E, 0x0C10 }, { 0x0C12, 0x0C28 }, + { 0x0C2A, 0x0C33 }, { 0x0C35, 0x0C39 }, { 0x0C60, 0x0C61 }, + + // Kannada + { 0x0C85, 0x0C8C }, { 0x0C8E, 0x0C90 }, { 0x0C92, 0x0CA8 }, + { 0x0CAA, 0x0CB3 }, { 0x0CB5, 0x0CB9 }, { 0x0CE0, 0x0CE1 }, + + // Malayam + { 0x0D05, 0x0D0C }, { 0x0D0E, 0x0D10 }, { 0x0D12, 0x0D28 }, + { 0x0D2A, 0x0D39 }, { 0x0D60, 0x0D61 }, + + // Thai + { 0x0E01, 0x0E30 }, { 0x0E32, 0x0E33 }, { 0x0E40, 0x0E46 }, + { 0x0E4F, 0x0E5B }, + + // Lao + { 0x0E81, 0x0E82 }, { 0x0E84, 0x0E84 }, { 0x0E87, 0x0E87 }, + { 0x0E88, 0x0E88 }, { 0x0E8A, 0x0E8A }, { 0x0E8D, 0x0E8D }, + { 0x0E94, 0x0E97 }, { 0x0E99, 0x0E9F }, { 0x0EA1, 0x0EA3 }, + { 0x0EA5, 0x0EA5 }, { 0x0EA7, 0x0EA7 }, { 0x0EAA, 0x0EAA }, + { 0x0EAB, 0x0EAB }, { 0x0EAD, 0x0EB0 }, { 0x0EB2, 0x0EB2 }, + { 0x0EB3, 0x0EB3 }, { 0x0EBD, 0x0EBD }, { 0x0EC0, 0x0EC4 }, + { 0x0EC6, 0x0EC6 }, + + // Georgian + { 0x10A0, 0x10C5 }, { 0x10D0, 0x10F6 }, + + // Hangul + { 0x1100, 0x1159 }, { 0x1161, 0x11A2 }, { 0x11A8, 0x11F9 }, + + // Latin (2) + { 0x1E00, 0x1E9A }, { 0x1EA0, 0x1EF9 }, + + // Greek (2) + { 0x1F00, 0x1F15 }, { 0x1F18, 0x1F1D }, { 0x1F20, 0x1F45 }, + { 0x1F48, 0x1F4D }, { 0x1F50, 0x1F57 }, { 0x1F59, 0x1F59 }, + { 0x1F5B, 0x1F5B }, { 0x1F5D, 0x1F5D }, { 0x1F5F, 0x1F7D }, + { 0x1F80, 0x1FB4 }, { 0x1FB6, 0x1FBC }, { 0x1FC2, 0x1FC4 }, + { 0x1FC6, 0x1FCC }, { 0x1FD0, 0x1FD3 }, { 0x1FD6, 0x1FDB }, + { 0x1FE0, 0x1FEC }, { 0x1FF2, 0x1FF4 }, { 0x1FF6, 0x1FFC }, + + // Hiragana + { 0x3041, 0x3094 }, { 0x309B, 0x309E }, + + // Katakana + { 0x30A1, 0x30FE }, + + // Bopmofo [sic] + { 0x3105, 0x312C }, + + // CJK Unified 
Ideographs + { 0x4E00, 0x9FA5 }, { 0xF900, 0xFA2D }, { 0xFB1F, 0xFB36 }, + { 0xFB38, 0xFB3C }, { 0xFB3E, 0xFB3E }, { 0xFB40, 0xFB41 }, + { 0xFB42, 0xFB44 }, { 0xFB46, 0xFBB1 }, { 0xFBD3, 0xFD3F }, + { 0xFD50, 0xFD8F }, { 0xFD92, 0xFDC7 }, { 0xFDF0, 0xFDFB }, + { 0xFE70, 0xFE72 }, { 0xFE74, 0xFE74 }, { 0xFE76, 0xFEFC }, + { 0xFF21, 0xFF3A }, { 0xFF41, 0xFF5A }, { 0xFF66, 0xFFBE }, + { 0xFFC2, 0xFFC7 }, { 0xFFCA, 0xFFCF }, { 0xFFD2, 0xFFD7 }, + { 0xFFDA, 0xFFDC } +}; + +// C99 Annex D +static const llvm::sys::UnicodeCharRange C99AllowedIDCharRanges[] = { + // Latin (1) + { 0x00AA, 0x00AA }, + + // Special characters (1) + { 0x00B5, 0x00B5 }, { 0x00B7, 0x00B7 }, + + // Latin (2) + { 0x00BA, 0x00BA }, { 0x00C0, 0x00D6 }, { 0x00D8, 0x00F6 }, + { 0x00F8, 0x01F5 }, { 0x01FA, 0x0217 }, { 0x0250, 0x02A8 }, + + // Special characters (2) + { 0x02B0, 0x02B8 }, { 0x02BB, 0x02BB }, { 0x02BD, 0x02C1 }, + { 0x02D0, 0x02D1 }, { 0x02E0, 0x02E4 }, { 0x037A, 0x037A }, + + // Greek (1) + { 0x0386, 0x0386 }, { 0x0388, 0x038A }, { 0x038C, 0x038C }, + { 0x038E, 0x03A1 }, { 0x03A3, 0x03CE }, { 0x03D0, 0x03D6 }, + { 0x03DA, 0x03DA }, { 0x03DC, 0x03DC }, { 0x03DE, 0x03DE }, + { 0x03E0, 0x03E0 }, { 0x03E2, 0x03F3 }, + + // Cyrillic + { 0x0401, 0x040C }, { 0x040E, 0x044F }, { 0x0451, 0x045C }, + { 0x045E, 0x0481 }, { 0x0490, 0x04C4 }, { 0x04C7, 0x04C8 }, + { 0x04CB, 0x04CC }, { 0x04D0, 0x04EB }, { 0x04EE, 0x04F5 }, + { 0x04F8, 0x04F9 }, + + // Armenian (1) + { 0x0531, 0x0556 }, + + // Special characters (3) + { 0x0559, 0x0559 }, + + // Armenian (2) + { 0x0561, 0x0587 }, + + // Hebrew + { 0x05B0, 0x05B9 }, { 0x05BB, 0x05BD }, { 0x05BF, 0x05BF }, + { 0x05C1, 0x05C2 }, { 0x05D0, 0x05EA }, { 0x05F0, 0x05F2 }, + + // Arabic (1) + { 0x0621, 0x063A }, { 0x0640, 0x0652 }, + + // Digits (1) + { 0x0660, 0x0669 }, + + // Arabic (2) + { 0x0670, 0x06B7 }, { 0x06BA, 0x06BE }, { 0x06C0, 0x06CE }, + { 0x06D0, 0x06DC }, { 0x06E5, 0x06E8 }, { 0x06EA, 0x06ED }, + + // Digits (2) + { 0x06F0, 0x06F9 }, + + // 
Devanagari and Special character 0x093D. + { 0x0901, 0x0903 }, { 0x0905, 0x0939 }, { 0x093D, 0x094D }, + { 0x0950, 0x0952 }, { 0x0958, 0x0963 }, + + // Digits (3) + { 0x0966, 0x096F }, + + // Bengali (1) + { 0x0981, 0x0983 }, { 0x0985, 0x098C }, { 0x098F, 0x0990 }, + { 0x0993, 0x09A8 }, { 0x09AA, 0x09B0 }, { 0x09B2, 0x09B2 }, + { 0x09B6, 0x09B9 }, { 0x09BE, 0x09C4 }, { 0x09C7, 0x09C8 }, + { 0x09CB, 0x09CD }, { 0x09DC, 0x09DD }, { 0x09DF, 0x09E3 }, + + // Digits (4) + { 0x09E6, 0x09EF }, + + // Bengali (2) + { 0x09F0, 0x09F1 }, + + // Gurmukhi (1) + { 0x0A02, 0x0A02 }, { 0x0A05, 0x0A0A }, { 0x0A0F, 0x0A10 }, + { 0x0A13, 0x0A28 }, { 0x0A2A, 0x0A30 }, { 0x0A32, 0x0A33 }, + { 0x0A35, 0x0A36 }, { 0x0A38, 0x0A39 }, { 0x0A3E, 0x0A42 }, + { 0x0A47, 0x0A48 }, { 0x0A4B, 0x0A4D }, { 0x0A59, 0x0A5C }, + { 0x0A5E, 0x0A5E }, + + // Digits (5) + { 0x0A66, 0x0A6F }, + + // Gurmukhi (2) + { 0x0A74, 0x0A74 }, + + // Gujarti + { 0x0A81, 0x0A83 }, { 0x0A85, 0x0A8B }, { 0x0A8D, 0x0A8D }, + { 0x0A8F, 0x0A91 }, { 0x0A93, 0x0AA8 }, { 0x0AAA, 0x0AB0 }, + { 0x0AB2, 0x0AB3 }, { 0x0AB5, 0x0AB9 }, { 0x0ABD, 0x0AC5 }, + { 0x0AC7, 0x0AC9 }, { 0x0ACB, 0x0ACD }, { 0x0AD0, 0x0AD0 }, + { 0x0AE0, 0x0AE0 }, + + // Digits (6) + { 0x0AE6, 0x0AEF }, + + // Oriya and Special character 0x0B3D + { 0x0B01, 0x0B03 }, { 0x0B05, 0x0B0C }, { 0x0B0F, 0x0B10 }, + { 0x0B13, 0x0B28 }, { 0x0B2A, 0x0B30 }, { 0x0B32, 0x0B33 }, + { 0x0B36, 0x0B39 }, { 0x0B3D, 0x0B43 }, { 0x0B47, 0x0B48 }, + { 0x0B4B, 0x0B4D }, { 0x0B5C, 0x0B5D }, { 0x0B5F, 0x0B61 }, + + // Digits (7) + { 0x0B66, 0x0B6F }, + + // Tamil + { 0x0B82, 0x0B83 }, { 0x0B85, 0x0B8A }, { 0x0B8E, 0x0B90 }, + { 0x0B92, 0x0B95 }, { 0x0B99, 0x0B9A }, { 0x0B9C, 0x0B9C }, + { 0x0B9E, 0x0B9F }, { 0x0BA3, 0x0BA4 }, { 0x0BA8, 0x0BAA }, + { 0x0BAE, 0x0BB5 }, { 0x0BB7, 0x0BB9 }, { 0x0BBE, 0x0BC2 }, + { 0x0BC6, 0x0BC8 }, { 0x0BCA, 0x0BCD }, + + // Digits (8) + { 0x0BE7, 0x0BEF }, + + // Telugu + { 0x0C01, 0x0C03 }, { 0x0C05, 0x0C0C }, { 0x0C0E, 0x0C10 }, + { 0x0C12, 0x0C28 
}, { 0x0C2A, 0x0C33 }, { 0x0C35, 0x0C39 }, + { 0x0C3E, 0x0C44 }, { 0x0C46, 0x0C48 }, { 0x0C4A, 0x0C4D }, + { 0x0C60, 0x0C61 }, + + // Digits (9) + { 0x0C66, 0x0C6F }, + + // Kannada + { 0x0C82, 0x0C83 }, { 0x0C85, 0x0C8C }, { 0x0C8E, 0x0C90 }, + { 0x0C92, 0x0CA8 }, { 0x0CAA, 0x0CB3 }, { 0x0CB5, 0x0CB9 }, + { 0x0CBE, 0x0CC4 }, { 0x0CC6, 0x0CC8 }, { 0x0CCA, 0x0CCD }, + { 0x0CDE, 0x0CDE }, { 0x0CE0, 0x0CE1 }, + + // Digits (10) + { 0x0CE6, 0x0CEF }, + + // Malayam + { 0x0D02, 0x0D03 }, { 0x0D05, 0x0D0C }, { 0x0D0E, 0x0D10 }, + { 0x0D12, 0x0D28 }, { 0x0D2A, 0x0D39 }, { 0x0D3E, 0x0D43 }, + { 0x0D46, 0x0D48 }, { 0x0D4A, 0x0D4D }, { 0x0D60, 0x0D61 }, + + // Digits (11) + { 0x0D66, 0x0D6F }, + + // Thai...including Digits { 0x0E50, 0x0E59 } + { 0x0E01, 0x0E3A }, { 0x0E40, 0x0E5B }, + + // Lao (1) + { 0x0E81, 0x0E82 }, { 0x0E84, 0x0E84 }, { 0x0E87, 0x0E88 }, + { 0x0E8A, 0x0E8A }, { 0x0E8D, 0x0E8D }, { 0x0E94, 0x0E97 }, + { 0x0E99, 0x0E9F }, { 0x0EA1, 0x0EA3 }, { 0x0EA5, 0x0EA5 }, + { 0x0EA7, 0x0EA7 }, { 0x0EAA, 0x0EAB }, { 0x0EAD, 0x0EAE }, + { 0x0EB0, 0x0EB9 }, { 0x0EBB, 0x0EBD }, { 0x0EC0, 0x0EC4 }, + { 0x0EC6, 0x0EC6 }, { 0x0EC8, 0x0ECD }, + + // Digits (12) + { 0x0ED0, 0x0ED9 }, + + // Lao (2) + { 0x0EDC, 0x0EDD }, + + // Tibetan (1) + { 0x0F00, 0x0F00 }, { 0x0F18, 0x0F19 }, + + // Digits (13) + { 0x0F20, 0x0F33 }, + + // Tibetan (2) + { 0x0F35, 0x0F35 }, { 0x0F37, 0x0F37 }, { 0x0F39, 0x0F39 }, + { 0x0F3E, 0x0F47 }, { 0x0F49, 0x0F69 }, { 0x0F71, 0x0F84 }, + { 0x0F86, 0x0F8B }, { 0x0F90, 0x0F95 }, { 0x0F97, 0x0F97 }, + { 0x0F99, 0x0FAD }, { 0x0FB1, 0x0FB7 }, { 0x0FB9, 0x0FB9 }, + + // Georgian + { 0x10A0, 0x10C5 }, { 0x10D0, 0x10F6 }, + + // Latin (3) + { 0x1E00, 0x1E9B }, { 0x1EA0, 0x1EF9 }, + + // Greek (2) + { 0x1F00, 0x1F15 }, { 0x1F18, 0x1F1D }, { 0x1F20, 0x1F45 }, + { 0x1F48, 0x1F4D }, { 0x1F50, 0x1F57 }, { 0x1F59, 0x1F59 }, + { 0x1F5B, 0x1F5B }, { 0x1F5D, 0x1F5D }, { 0x1F5F, 0x1F7D }, + { 0x1F80, 0x1FB4 }, { 0x1FB6, 0x1FBC }, + + // Special characters (4) + { 
0x1FBE, 0x1FBE }, + + // Greek (3) + { 0x1FC2, 0x1FC4 }, { 0x1FC6, 0x1FCC }, { 0x1FD0, 0x1FD3 }, + { 0x1FD6, 0x1FDB }, { 0x1FE0, 0x1FEC }, { 0x1FF2, 0x1FF4 }, + { 0x1FF6, 0x1FFC }, + + // Special characters (5) + { 0x203F, 0x2040 }, + + // Latin (4) + { 0x207F, 0x207F }, + + // Special characters (6) + { 0x2102, 0x2102 }, { 0x2107, 0x2107 }, { 0x210A, 0x2113 }, + { 0x2115, 0x2115 }, { 0x2118, 0x211D }, { 0x2124, 0x2124 }, + { 0x2126, 0x2126 }, { 0x2128, 0x2128 }, { 0x212A, 0x2131 }, + { 0x2133, 0x2138 }, { 0x2160, 0x2182 }, { 0x3005, 0x3007 }, + { 0x3021, 0x3029 }, + + // Hiragana + { 0x3041, 0x3093 }, { 0x309B, 0x309C }, + + // Katakana + { 0x30A1, 0x30F6 }, { 0x30FB, 0x30FC }, + + // Bopmofo [sic] + { 0x3105, 0x312C }, + + // CJK Unified Ideographs + { 0x4E00, 0x9FA5 }, + + // Hangul, + { 0xAC00, 0xD7A3 } +}; + +// C11 D.2, C++11 [charname.disallowed] +static const llvm::sys::UnicodeCharRange C11DisallowedInitialIDCharRanges[] = { + { 0x0300, 0x036F }, { 0x1DC0, 0x1DFF }, { 0x20D0, 0x20FF }, + { 0xFE20, 0xFE2F } +}; + +// C99 6.4.2.1p3: The initial character [of an identifier] shall not be a +// universal character name designating a digit. +// C99 Annex D defines these characters as "Digits". +static const llvm::sys::UnicodeCharRange C99DisallowedInitialIDCharRanges[] = { + { 0x0660, 0x0669 }, { 0x06F0, 0x06F9 }, { 0x0966, 0x096F }, + { 0x09E6, 0x09EF }, { 0x0A66, 0x0A6F }, { 0x0AE6, 0x0AEF }, + { 0x0B66, 0x0B6F }, { 0x0BE7, 0x0BEF }, { 0x0C66, 0x0C6F }, + { 0x0CE6, 0x0CEF }, { 0x0D66, 0x0D6F }, { 0x0E50, 0x0E59 }, + { 0x0ED0, 0x0ED9 }, { 0x0F20, 0x0F33 } +}; + +// Unicode v6.2, chapter 6.2, table 6-2. +static const llvm::sys::UnicodeCharRange UnicodeWhitespaceCharRanges[] = { + { 0x0085, 0x0085 }, { 0x00A0, 0x00A0 }, { 0x1680, 0x1680 }, + { 0x180E, 0x180E }, { 0x2000, 0x200A }, { 0x2028, 0x2029 }, + { 0x202F, 0x202F }, { 0x205F, 0x205F }, { 0x3000, 0x3000 } +}; + +#endif |