aboutsummaryrefslogtreecommitdiff
path: root/lib/Lex/Lexer.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'lib/Lex/Lexer.cpp')
-rw-r--r--lib/Lex/Lexer.cpp650
1 files changed, 499 insertions, 151 deletions
diff --git a/lib/Lex/Lexer.cpp b/lib/Lex/Lexer.cpp
index a28b8f6e7b9f..a98d889dbc98 100644
--- a/lib/Lex/Lexer.cpp
+++ b/lib/Lex/Lexer.cpp
@@ -32,7 +32,7 @@
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/MemoryBuffer.h"
-#include <cctype>
+#include <cstring>
using namespace clang;
static void InitCharacterInfo();
@@ -76,7 +76,7 @@ void Lexer::InitLexer(const char *BufStart, const char *BufPtr,
// skip the UTF-8 BOM if it's present.
if (BufferStart == BufferPtr) {
// Determine the size of the BOM.
- llvm::StringRef Buf(BufferStart, BufferEnd - BufferStart);
+ StringRef Buf(BufferStart, BufferEnd - BufferStart);
size_t BOMLength = llvm::StringSwitch<size_t>(Buf)
.StartsWith("\xEF\xBB\xBF", 3) // UTF-8 BOM
.Default(0);
@@ -86,7 +86,7 @@ void Lexer::InitLexer(const char *BufStart, const char *BufPtr,
}
Is_PragmaLexer = false;
- IsInConflictMarker = false;
+ CurrentConflictMarkerState = CMK_None;
// Start of the file is a start of line.
IsAtStartOfLine = true;
@@ -187,9 +187,9 @@ Lexer *Lexer::Create_PragmaLexer(SourceLocation SpellingLoc,
// Set the SourceLocation with the remapping information. This ensures that
// GetMappedTokenLoc will remap the tokens as they are lexed.
- L->FileLoc = SM.createInstantiationLoc(SM.getLocForStartOfFile(SpellingFID),
- ExpansionLocStart,
- ExpansionLocEnd, TokLen);
+ L->FileLoc = SM.createExpansionLoc(SM.getLocForStartOfFile(SpellingFID),
+ ExpansionLocStart,
+ ExpansionLocEnd, TokLen);
// Ensure that the lexer thinks it is inside a directive, so that end \n will
// return an EOD token.
@@ -217,7 +217,7 @@ std::string Lexer::Stringify(const std::string &Str, bool Charify) {
/// Stringify - Convert the specified string into a C string by escaping '\'
/// and " characters. This does not add surrounding ""'s to the string.
-void Lexer::Stringify(llvm::SmallVectorImpl<char> &Str) {
+void Lexer::Stringify(SmallVectorImpl<char> &Str) {
for (unsigned i = 0, e = Str.size(); i != e; ++i) {
if (Str[i] == '\\' || Str[i] == '"') {
Str.insert(Str.begin()+i, '\\');
@@ -235,8 +235,8 @@ void Lexer::Stringify(llvm::SmallVectorImpl<char> &Str) {
/// after trigraph expansion and escaped-newline folding. In particular, this
/// wants to get the true, uncanonicalized, spelling of things like digraphs
/// UCNs, etc.
-llvm::StringRef Lexer::getSpelling(SourceLocation loc,
- llvm::SmallVectorImpl<char> &buffer,
+StringRef Lexer::getSpelling(SourceLocation loc,
+ SmallVectorImpl<char> &buffer,
const SourceManager &SM,
const LangOptions &options,
bool *invalid) {
@@ -245,10 +245,10 @@ llvm::StringRef Lexer::getSpelling(SourceLocation loc,
// Try to the load the file buffer.
bool invalidTemp = false;
- llvm::StringRef file = SM.getBufferData(locInfo.first, &invalidTemp);
+ StringRef file = SM.getBufferData(locInfo.first, &invalidTemp);
if (invalidTemp) {
if (invalid) *invalid = true;
- return llvm::StringRef();
+ return StringRef();
}
const char *tokenBegin = file.data() + locInfo.second;
@@ -263,7 +263,7 @@ llvm::StringRef Lexer::getSpelling(SourceLocation loc,
// Common case: no need for cleaning.
if (!token.needsCleaning())
- return llvm::StringRef(tokenBegin, length);
+ return StringRef(tokenBegin, length);
// Hard case, we need to relex the characters into the string.
buffer.clear();
@@ -275,7 +275,7 @@ llvm::StringRef Lexer::getSpelling(SourceLocation loc,
ti += charSize;
}
- return llvm::StringRef(buffer.data(), buffer.size());
+ return StringRef(buffer.data(), buffer.size());
}
/// getSpelling() - Return the 'spelling' of this token. The spelling of a
@@ -394,10 +394,10 @@ unsigned Lexer::MeasureTokenLength(SourceLocation Loc,
// If this comes from a macro expansion, we really do want the macro name, not
// the token this macro expanded to.
- Loc = SM.getInstantiationLoc(Loc);
+ Loc = SM.getExpansionLoc(Loc);
std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
bool Invalid = false;
- llvm::StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
+ StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
if (Invalid)
return 0;
@@ -415,15 +415,16 @@ unsigned Lexer::MeasureTokenLength(SourceLocation Loc,
return TheTok.getLength();
}
-SourceLocation Lexer::GetBeginningOfToken(SourceLocation Loc,
- const SourceManager &SM,
- const LangOptions &LangOpts) {
+static SourceLocation getBeginningOfFileToken(SourceLocation Loc,
+ const SourceManager &SM,
+ const LangOptions &LangOpts) {
+ assert(Loc.isFileID());
std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
if (LocInfo.first.isInvalid())
return Loc;
bool Invalid = false;
- llvm::StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
+ StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
if (Invalid)
return Loc;
@@ -448,7 +449,7 @@ SourceLocation Lexer::GetBeginningOfToken(SourceLocation Loc,
}
// Create a lexer starting at the beginning of this token.
- SourceLocation LexerStartLoc = Loc.getFileLocWithOffset(-LocInfo.second);
+ SourceLocation LexerStartLoc = Loc.getLocWithOffset(-LocInfo.second);
Lexer TheLexer(LexerStartLoc, LangOpts, BufStart, LexStart, Buffer.end());
TheLexer.SetCommentRetentionState(true);
@@ -474,6 +475,25 @@ SourceLocation Lexer::GetBeginningOfToken(SourceLocation Loc,
return Loc;
}
+SourceLocation Lexer::GetBeginningOfToken(SourceLocation Loc,
+ const SourceManager &SM,
+ const LangOptions &LangOpts) {
+ if (Loc.isFileID())
+ return getBeginningOfFileToken(Loc, SM, LangOpts);
+
+ if (!SM.isMacroArgExpansion(Loc))
+ return Loc;
+
+ SourceLocation FileLoc = SM.getSpellingLoc(Loc);
+ SourceLocation BeginFileLoc = getBeginningOfFileToken(FileLoc, SM, LangOpts);
+ std::pair<FileID, unsigned> FileLocInfo = SM.getDecomposedLoc(FileLoc);
+ std::pair<FileID, unsigned> BeginFileLocInfo= SM.getDecomposedLoc(BeginFileLoc);
+ assert(FileLocInfo.first == BeginFileLocInfo.first &&
+ FileLocInfo.second >= BeginFileLocInfo.second);
+ return Loc.getLocWithOffset(SM.getDecomposedLoc(BeginFileLoc).second -
+ SM.getDecomposedLoc(FileLoc).second);
+}
+
namespace {
enum PreambleDirectiveKind {
PDK_Skipped,
@@ -484,21 +504,36 @@ namespace {
}
std::pair<unsigned, bool>
-Lexer::ComputePreamble(const llvm::MemoryBuffer *Buffer, unsigned MaxLines) {
+Lexer::ComputePreamble(const llvm::MemoryBuffer *Buffer,
+ const LangOptions &Features, unsigned MaxLines) {
// Create a lexer starting at the beginning of the file. Note that we use a
// "fake" file source location at offset 1 so that the lexer will track our
// position within the file.
const unsigned StartOffset = 1;
SourceLocation StartLoc = SourceLocation::getFromRawEncoding(StartOffset);
- LangOptions LangOpts;
- Lexer TheLexer(StartLoc, LangOpts, Buffer->getBufferStart(),
+ Lexer TheLexer(StartLoc, Features, Buffer->getBufferStart(),
Buffer->getBufferStart(), Buffer->getBufferEnd());
bool InPreprocessorDirective = false;
Token TheTok;
Token IfStartTok;
unsigned IfCount = 0;
- unsigned Line = 0;
+
+ unsigned MaxLineOffset = 0;
+ if (MaxLines) {
+ const char *CurPtr = Buffer->getBufferStart();
+ unsigned CurLine = 0;
+ while (CurPtr != Buffer->getBufferEnd()) {
+ char ch = *CurPtr++;
+ if (ch == '\n') {
+ ++CurLine;
+ if (CurLine == MaxLines)
+ break;
+ }
+ }
+ if (CurPtr != Buffer->getBufferEnd())
+ MaxLineOffset = CurPtr - Buffer->getBufferStart();
+ }
do {
TheLexer.LexFromRawLexer(TheTok);
@@ -522,11 +557,11 @@ Lexer::ComputePreamble(const llvm::MemoryBuffer *Buffer, unsigned MaxLines) {
// Keep track of the # of lines in the preamble.
if (TheTok.isAtStartOfLine()) {
- ++Line;
+ unsigned TokOffset = TheTok.getLocation().getRawEncoding() - StartOffset;
// If we were asked to limit the number of lines in the preamble,
// and we're about to exceed that limit, we're done.
- if (MaxLines && Line >= MaxLines)
+ if (MaxLineOffset && TokOffset >= MaxLineOffset)
break;
}
@@ -539,12 +574,12 @@ Lexer::ComputePreamble(const llvm::MemoryBuffer *Buffer, unsigned MaxLines) {
Token HashTok = TheTok;
InPreprocessorDirective = true;
- // Figure out which direective this is. Since we're lexing raw tokens,
+ // Figure out which directive this is. Since we're lexing raw tokens,
// we don't have an identifier table available. Instead, just look at
// the raw identifier to recognize and categorize preprocessor directives.
TheLexer.LexFromRawLexer(TheTok);
if (TheTok.getKind() == tok::raw_identifier && !TheTok.needsCleaning()) {
- llvm::StringRef Keyword(TheTok.getRawIdentifierData(),
+ StringRef Keyword(TheTok.getRawIdentifierData(),
TheTok.getLength());
PreambleDirectiveKind PDK
= llvm::StringSwitch<PreambleDirectiveKind>(Keyword)
@@ -638,7 +673,7 @@ SourceLocation Lexer::AdvanceToTokenCharacter(SourceLocation TokStart,
// chars, this method is extremely fast.
while (Lexer::isObviouslySimpleCharacter(*TokPtr)) {
if (CharNo == 0)
- return TokStart.getFileLocWithOffset(PhysOffset);
+ return TokStart.getLocWithOffset(PhysOffset);
++TokPtr, --CharNo, ++PhysOffset;
}
@@ -658,7 +693,7 @@ SourceLocation Lexer::AdvanceToTokenCharacter(SourceLocation TokStart,
if (!Lexer::isObviouslySimpleCharacter(*TokPtr))
PhysOffset += Lexer::SkipEscapedNewLines(TokPtr)-TokPtr;
- return TokStart.getFileLocWithOffset(PhysOffset);
+ return TokStart.getLocWithOffset(PhysOffset);
}
/// \brief Computes the source location just past the end of the
@@ -687,7 +722,7 @@ SourceLocation Lexer::getLocForEndOfToken(SourceLocation Loc, unsigned Offset,
return SourceLocation(); // Points inside the macro expansion.
// Continue and find the location just after the macro expansion.
- Loc = SM.getInstantiationRange(Loc).second;
+ Loc = SM.getExpansionRange(Loc).second;
}
unsigned Len = Lexer::MeasureTokenLength(Loc, SM, Features);
@@ -696,14 +731,14 @@ SourceLocation Lexer::getLocForEndOfToken(SourceLocation Loc, unsigned Offset,
else
return Loc;
- return Loc.getFileLocWithOffset(Len);
+ return Loc.getLocWithOffset(Len);
}
/// \brief Returns true if the given MacroID location points at the first
/// token of the macro expansion.
bool Lexer::isAtStartOfMacroExpansion(SourceLocation loc,
- const SourceManager &SM,
- const LangOptions &LangOpts) {
+ const SourceManager &SM,
+ const LangOptions &LangOpts) {
assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc");
std::pair<FileID, unsigned> infoLoc = SM.getDecomposedLoc(loc);
@@ -713,8 +748,7 @@ bool Lexer::isAtStartOfMacroExpansion(SourceLocation loc,
return false; // Does not point at the start of token.
SourceLocation expansionLoc =
- SM.getSLocEntry(infoLoc.first)
- .getInstantiation().getInstantiationLocStart();
+ SM.getSLocEntry(infoLoc.first).getExpansion().getExpansionLocStart();
if (expansionLoc.isFileID())
return true; // No other macro expansions, this is the first.
@@ -734,17 +768,15 @@ bool Lexer::isAtEndOfMacroExpansion(SourceLocation loc,
return false;
FileID FID = SM.getFileID(loc);
- SourceLocation afterLoc = loc.getFileLocWithOffset(tokLen+1);
- if (!SM.isBeforeInSourceLocationOffset(afterLoc, SM.getNextOffset()))
- return true; // We got past the last FileID, this points to the last token.
+ SourceLocation afterLoc = loc.getLocWithOffset(tokLen+1);
+ if (SM.isInFileID(afterLoc, FID))
+ return false; // Still in the same FileID, does not point to the last token.
// FIXME: If the token comes from the macro token paste operator ('##')
// or the stringify operator ('#') this function will always return false;
- if (FID == SM.getFileID(afterLoc))
- return false; // Still in the same FileID, does not point to the last token.
-
+
SourceLocation expansionLoc =
- SM.getSLocEntry(FID).getInstantiation().getInstantiationLocEnd();
+ SM.getSLocEntry(FID).getExpansion().getExpansionLocEnd();
if (expansionLoc.isFileID())
return true; // No other macro expansions.
@@ -761,7 +793,8 @@ enum {
CHAR_LETTER = 0x04, // a-z,A-Z
CHAR_NUMBER = 0x08, // 0-9
CHAR_UNDER = 0x10, // _
- CHAR_PERIOD = 0x20 // .
+ CHAR_PERIOD = 0x20, // .
+ CHAR_RAWDEL = 0x40 // {}[]#<>%:;?*+-/^&|~!=,"'
};
// Statically initialize CharInfo table based on ASCII character set
@@ -786,20 +819,20 @@ static const unsigned char CharInfo[256] =
0 , 0 , 0 , 0 ,
//32 SP 33 ! 34 " 35 #
//36 $ 37 % 38 & 39 '
- CHAR_HORZ_WS, 0 , 0 , 0 ,
- 0 , 0 , 0 , 0 ,
+ CHAR_HORZ_WS, CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL ,
+ 0 , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL ,
//40 ( 41 ) 42 * 43 +
//44 , 45 - 46 . 47 /
- 0 , 0 , 0 , 0 ,
- 0 , 0 , CHAR_PERIOD , 0 ,
+ 0 , 0 , CHAR_RAWDEL , CHAR_RAWDEL ,
+ CHAR_RAWDEL , CHAR_RAWDEL , CHAR_PERIOD , CHAR_RAWDEL ,
//48 0 49 1 50 2 51 3
//52 4 53 5 54 6 55 7
CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER ,
CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER ,
//56 8 57 9 58 : 59 ;
//60 < 61 = 62 > 63 ?
- CHAR_NUMBER , CHAR_NUMBER , 0 , 0 ,
- 0 , 0 , 0 , 0 ,
+ CHAR_NUMBER , CHAR_NUMBER , CHAR_RAWDEL , CHAR_RAWDEL ,
+ CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL ,
//64 @ 65 A 66 B 67 C
//68 D 69 E 70 F 71 G
0 , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
@@ -814,8 +847,8 @@ static const unsigned char CharInfo[256] =
CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
//88 X 89 Y 90 Z 91 [
//92 \ 93 ] 94 ^ 95 _
- CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 0 ,
- 0 , 0 , 0 , CHAR_UNDER ,
+ CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_RAWDEL ,
+ 0 , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_UNDER ,
//96 ` 97 a 98 b 99 c
//100 d 101 e 102 f 103 g
0 , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
@@ -829,9 +862,9 @@ static const unsigned char CharInfo[256] =
CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
//120 x 121 y 122 z 123 {
-//124 | 125 } 126 ~ 127 DEL
- CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 0 ,
- 0 , 0 , 0 , 0
+//124 | 125 } 126 ~ 127 DEL
+ CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_RAWDEL ,
+ CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , 0
};
static void InitCharacterInfo() {
@@ -869,6 +902,12 @@ static inline bool isHorizontalWhitespace(unsigned char c) {
return (CharInfo[c] & CHAR_HORZ_WS) ? true : false;
}
+/// isVerticalWhitespace - Return true if this character is vertical
+/// whitespace: '\n', '\r'. Note that this returns false for '\0'.
+static inline bool isVerticalWhitespace(unsigned char c) {
+ return (CharInfo[c] & CHAR_VERT_WS) ? true : false;
+}
+
/// isWhitespace - Return true if this character is horizontal or vertical
/// whitespace: ' ', '\t', '\f', '\v', '\n', '\r'. Note that this returns false
/// for '\0'.
@@ -883,6 +922,14 @@ static inline bool isNumberBody(unsigned char c) {
true : false;
}
+/// isRawStringDelimBody - Return true if this is the body character of a
+/// raw string delimiter.
+static inline bool isRawStringDelimBody(unsigned char c) {
+ return (CharInfo[c] &
+ (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER|CHAR_PERIOD|CHAR_RAWDEL)) ?
+ true : false;
+}
+
//===----------------------------------------------------------------------===//
// Diagnostics forwarding code.
@@ -907,14 +954,14 @@ static SourceLocation GetMappedTokenLoc(Preprocessor &PP,
// Create a new SLoc which is expanded from Expansion(FileLoc) but whose
// characters come from spelling(FileLoc)+Offset.
SourceLocation SpellingLoc = SM.getSpellingLoc(FileLoc);
- SpellingLoc = SpellingLoc.getFileLocWithOffset(CharNo);
+ SpellingLoc = SpellingLoc.getLocWithOffset(CharNo);
// Figure out the expansion loc range, which is the range covered by the
// original _Pragma(...) sequence.
std::pair<SourceLocation,SourceLocation> II =
- SM.getImmediateInstantiationRange(FileLoc);
+ SM.getImmediateExpansionRange(FileLoc);
- return SM.createInstantiationLoc(SpellingLoc, II.first, II.second, TokLen);
+ return SM.createExpansionLoc(SpellingLoc, II.first, II.second, TokLen);
}
/// getSourceLocation - Return a source location identifier for the specified
@@ -928,7 +975,7 @@ SourceLocation Lexer::getSourceLocation(const char *Loc,
// the file id from FileLoc with the offset specified.
unsigned CharNo = Loc-BufferStart;
if (FileLoc.isFileID())
- return FileLoc.getFileLocWithOffset(CharNo);
+ return FileLoc.getLocWithOffset(CharNo);
// Otherwise, this is the _Pragma lexer case, which pretends that all of the
// tokens are lexed from where the _Pragma was defined.
@@ -978,7 +1025,7 @@ static char DecodeTrigraphChar(const char *CP, Lexer *L) {
}
if (!L->isLexingRawMode())
- L->Diag(CP-2, diag::trigraph_converted) << llvm::StringRef(&Res, 1);
+ L->Diag(CP-2, diag::trigraph_converted) << StringRef(&Res, 1);
return Res;
}
@@ -1028,6 +1075,59 @@ const char *Lexer::SkipEscapedNewLines(const char *P) {
}
}
+/// \brief Checks that the given token is the first token that occurs after the
+/// given location (this excludes comments and whitespace). Returns the location
+/// immediately after the specified token. If the token is not found or the
+/// location is inside a macro, the returned source location will be invalid.
+SourceLocation Lexer::findLocationAfterToken(SourceLocation Loc,
+ tok::TokenKind TKind,
+ const SourceManager &SM,
+ const LangOptions &LangOpts,
+ bool SkipTrailingWhitespaceAndNewLine) {
+ if (Loc.isMacroID()) {
+ if (!Lexer::isAtEndOfMacroExpansion(Loc, SM, LangOpts))
+ return SourceLocation();
+ Loc = SM.getExpansionRange(Loc).second;
+ }
+ Loc = Lexer::getLocForEndOfToken(Loc, 0, SM, LangOpts);
+
+ // Break down the source location.
+ std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
+
+ // Try to load the file buffer.
+ bool InvalidTemp = false;
+ llvm::StringRef File = SM.getBufferData(LocInfo.first, &InvalidTemp);
+ if (InvalidTemp)
+ return SourceLocation();
+
+ const char *TokenBegin = File.data() + LocInfo.second;
+
+ // Lex from the start of the given location.
+ Lexer lexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, File.begin(),
+ TokenBegin, File.end());
+ // Find the token.
+ Token Tok;
+ lexer.LexFromRawLexer(Tok);
+ if (Tok.isNot(TKind))
+ return SourceLocation();
+ SourceLocation TokenLoc = Tok.getLocation();
+
+ // Calculate how much whitespace needs to be skipped if any.
+ unsigned NumWhitespaceChars = 0;
+ if (SkipTrailingWhitespaceAndNewLine) {
+ const char *TokenEnd = SM.getCharacterData(TokenLoc) +
+ Tok.getLength();
+ unsigned char C = *TokenEnd;
+ while (isHorizontalWhitespace(C)) {
+ C = *(++TokenEnd);
+ NumWhitespaceChars++;
+ }
+ if (isVerticalWhitespace(C))
+ NumWhitespaceChars++;
+ }
+
+ return TokenLoc.getLocWithOffset(Tok.getLength() + NumWhitespaceChars);
+}
/// getCharAndSizeSlow - Peek a single 'character' from the specified buffer,
/// get its size, and return it. This is tricky in several cases:
@@ -1191,6 +1291,7 @@ FinishIdentifier:
// preprocessor, which may macro expand it or something.
if (II->isHandleIdentifierCase())
PP->HandleIdentifier(Result);
+
return;
}
@@ -1252,13 +1353,12 @@ void Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
if ((C == '-' || C == '+') && (PrevCh == 'E' || PrevCh == 'e')) {
// If we are in Microsoft mode, don't continue if the constant is hex.
// For example, MSVC will accept the following as 3 tokens: 0x1234567e+1
- if (!Features.Microsoft || !isHexaLiteral(BufferPtr, Features))
+ if (!Features.MicrosoftExt || !isHexaLiteral(BufferPtr, Features))
return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
}
// If we have a hex FP constant, continue.
- if ((C == '-' || C == '+') && (PrevCh == 'P' || PrevCh == 'p') &&
- !Features.CPlusPlus0x)
+ if ((C == '-' || C == '+') && (PrevCh == 'P' || PrevCh == 'p'))
return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
// Update the location of token as well as BufferPtr.
@@ -1268,10 +1368,17 @@ void Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
}
/// LexStringLiteral - Lex the remainder of a string literal, after having lexed
-/// either " or L".
-void Lexer::LexStringLiteral(Token &Result, const char *CurPtr, bool Wide) {
+/// either " or L" or u8" or u" or U".
+void Lexer::LexStringLiteral(Token &Result, const char *CurPtr,
+ tok::TokenKind Kind) {
const char *NulCharacter = 0; // Does this string contain the \0 character?
+ if (!isLexingRawMode() &&
+ (Kind == tok::utf8_string_literal ||
+ Kind == tok::utf16_string_literal ||
+ Kind == tok::utf32_string_literal))
+ Diag(BufferPtr, diag::warn_cxx98_compat_unicode_literal);
+
char C = getAndAdvanceChar(CurPtr, Result);
while (C != '"') {
// Skip escaped characters. Escaped newlines will already be processed by
@@ -1281,16 +1388,21 @@ void Lexer::LexStringLiteral(Token &Result, const char *CurPtr, bool Wide) {
if (C == '\n' || C == '\r' || // Newline.
(C == 0 && CurPtr-1 == BufferEnd)) { // End of file.
- if (C == 0 && PP && PP->isCodeCompletionFile(FileLoc))
- PP->CodeCompleteNaturalLanguage();
- else if (!isLexingRawMode() && !Features.AsmPreprocessor)
+ if (!isLexingRawMode() && !Features.AsmPreprocessor)
Diag(BufferPtr, diag::warn_unterminated_string);
FormTokenWithChars(Result, CurPtr-1, tok::unknown);
return;
}
- if (C == 0)
+ if (C == 0) {
+ if (isCodeCompletionPoint(CurPtr-1)) {
+ PP->CodeCompleteNaturalLanguage();
+ FormTokenWithChars(Result, CurPtr-1, tok::unknown);
+ return cutOffLexing();
+ }
+
NulCharacter = CurPtr-1;
+ }
C = getAndAdvanceChar(CurPtr, Result);
}
@@ -1300,8 +1412,82 @@ void Lexer::LexStringLiteral(Token &Result, const char *CurPtr, bool Wide) {
// Update the location of the token as well as the BufferPtr instance var.
const char *TokStart = BufferPtr;
- FormTokenWithChars(Result, CurPtr,
- Wide ? tok::wide_string_literal : tok::string_literal);
+ FormTokenWithChars(Result, CurPtr, Kind);
+ Result.setLiteralData(TokStart);
+}
+
+/// LexRawStringLiteral - Lex the remainder of a raw string literal, after
+/// having lexed R", LR", u8R", uR", or UR".
+void Lexer::LexRawStringLiteral(Token &Result, const char *CurPtr,
+ tok::TokenKind Kind) {
+ // This function doesn't use getAndAdvanceChar because C++0x [lex.pptoken]p3:
+ // Between the initial and final double quote characters of the raw string,
+ // any transformations performed in phases 1 and 2 (trigraphs,
+ // universal-character-names, and line splicing) are reverted.
+
+ if (!isLexingRawMode())
+ Diag(BufferPtr, diag::warn_cxx98_compat_raw_string_literal);
+
+ unsigned PrefixLen = 0;
+
+ while (PrefixLen != 16 && isRawStringDelimBody(CurPtr[PrefixLen]))
+ ++PrefixLen;
+
+ // If the last character was not a '(', then we didn't lex a valid delimiter.
+ if (CurPtr[PrefixLen] != '(') {
+ if (!isLexingRawMode()) {
+ const char *PrefixEnd = &CurPtr[PrefixLen];
+ if (PrefixLen == 16) {
+ Diag(PrefixEnd, diag::err_raw_delim_too_long);
+ } else {
+ Diag(PrefixEnd, diag::err_invalid_char_raw_delim)
+ << StringRef(PrefixEnd, 1);
+ }
+ }
+
+ // Search for the next '"' in hopes of salvaging the lexer. Unfortunately,
+ // it's possible the '"' was intended to be part of the raw string, but
+ // there's not much we can do about that.
+ while (1) {
+ char C = *CurPtr++;
+
+ if (C == '"')
+ break;
+ if (C == 0 && CurPtr-1 == BufferEnd) {
+ --CurPtr;
+ break;
+ }
+ }
+
+ FormTokenWithChars(Result, CurPtr, tok::unknown);
+ return;
+ }
+
+ // Save prefix and move CurPtr past it
+ const char *Prefix = CurPtr;
+ CurPtr += PrefixLen + 1; // skip over prefix and '('
+
+ while (1) {
+ char C = *CurPtr++;
+
+ if (C == ')') {
+ // Check for prefix match and closing quote.
+ if (strncmp(CurPtr, Prefix, PrefixLen) == 0 && CurPtr[PrefixLen] == '"') {
+ CurPtr += PrefixLen + 1; // skip over prefix and '"'
+ break;
+ }
+ } else if (C == 0 && CurPtr-1 == BufferEnd) { // End of file.
+ if (!isLexingRawMode())
+ Diag(BufferPtr, diag::err_unterminated_raw_string)
+ << StringRef(Prefix, PrefixLen);
+ FormTokenWithChars(Result, CurPtr-1, tok::unknown);
+ return;
+ }
+ }
+
+ // Update the location of token as well as BufferPtr.
+ const char *TokStart = BufferPtr;
+ FormTokenWithChars(Result, CurPtr, Kind);
Result.setLiteralData(TokStart);
}
@@ -1317,7 +1503,8 @@ void Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) {
// Skip the escaped character.
C = getAndAdvanceChar(CurPtr, Result);
} else if (C == '\n' || C == '\r' || // Newline.
- (C == 0 && CurPtr-1 == BufferEnd)) { // End of file.
+ (C == 0 && (CurPtr-1 == BufferEnd || // End of file.
+ isCodeCompletionPoint(CurPtr-1)))) {
// If the filename is unterminated, then it must just be a lone <
// character. Return this as such.
FormTokenWithChars(Result, AfterLessPos, tok::less);
@@ -1340,10 +1527,15 @@ void Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) {
/// LexCharConstant - Lex the remainder of a character constant, after having
-/// lexed either ' or L'.
-void Lexer::LexCharConstant(Token &Result, const char *CurPtr) {
+/// lexed either ' or L' or u' or U'.
+void Lexer::LexCharConstant(Token &Result, const char *CurPtr,
+ tok::TokenKind Kind) {
const char *NulCharacter = 0; // Does this character contain the \0 character?
+ if (!isLexingRawMode() &&
+ (Kind == tok::utf16_char_constant || Kind == tok::utf32_char_constant))
+ Diag(BufferPtr, diag::warn_cxx98_compat_unicode_literal);
+
char C = getAndAdvanceChar(CurPtr, Result);
if (C == '\'') {
if (!isLexingRawMode() && !Features.AsmPreprocessor)
@@ -1360,13 +1552,17 @@ void Lexer::LexCharConstant(Token &Result, const char *CurPtr) {
C = getAndAdvanceChar(CurPtr, Result);
} else if (C == '\n' || C == '\r' || // Newline.
(C == 0 && CurPtr-1 == BufferEnd)) { // End of file.
- if (C == 0 && PP && PP->isCodeCompletionFile(FileLoc))
- PP->CodeCompleteNaturalLanguage();
- else if (!isLexingRawMode() && !Features.AsmPreprocessor)
+ if (!isLexingRawMode() && !Features.AsmPreprocessor)
Diag(BufferPtr, diag::warn_unterminated_char);
FormTokenWithChars(Result, CurPtr-1, tok::unknown);
return;
} else if (C == 0) {
+ if (isCodeCompletionPoint(CurPtr-1)) {
+ PP->CodeCompleteNaturalLanguage();
+ FormTokenWithChars(Result, CurPtr-1, tok::unknown);
+ return cutOffLexing();
+ }
+
NulCharacter = CurPtr-1;
}
C = getAndAdvanceChar(CurPtr, Result);
@@ -1378,7 +1574,7 @@ void Lexer::LexCharConstant(Token &Result, const char *CurPtr) {
// Update the location of token as well as BufferPtr.
const char *TokStart = BufferPtr;
- FormTokenWithChars(Result, CurPtr, tok::char_constant);
+ FormTokenWithChars(Result, CurPtr, Kind);
Result.setLiteralData(TokStart);
}
@@ -1451,20 +1647,28 @@ bool Lexer::SkipBCPLComment(Token &Result, const char *CurPtr) {
char C;
do {
C = *CurPtr;
- // FIXME: Speedup BCPL comment lexing. Just scan for a \n or \r character.
- // If we find a \n character, scan backwards, checking to see if it's an
- // escaped newline, like we do for block comments.
-
// Skip over characters in the fast loop.
while (C != 0 && // Potentially EOF.
- C != '\\' && // Potentially escaped newline.
- C != '?' && // Potentially trigraph.
C != '\n' && C != '\r') // Newline or DOS-style newline.
C = *++CurPtr;
- // If this is a newline, we're done.
- if (C == '\n' || C == '\r')
- break; // Found the newline? Break out!
+ const char *NextLine = CurPtr;
+ if (C != 0) {
+ // We found a newline, see if it's escaped.
+ const char *EscapePtr = CurPtr-1;
+ while (isHorizontalWhitespace(*EscapePtr)) // Skip whitespace.
+ --EscapePtr;
+
+ if (*EscapePtr == '\\') // Escaped newline.
+ CurPtr = EscapePtr;
+ else if (EscapePtr[0] == '/' && EscapePtr[-1] == '?' &&
+ EscapePtr[-2] == '?') // Trigraph-escaped newline.
+ CurPtr = EscapePtr-2;
+ else
+ break; // This is a newline, we're done.
+
+ C = *CurPtr;
+ }
// Otherwise, this is a hard case. Fall back on getAndAdvanceChar to
// properly decode the character. Read it in raw mode to avoid emitting
@@ -1476,6 +1680,13 @@ bool Lexer::SkipBCPLComment(Token &Result, const char *CurPtr) {
C = getAndAdvanceChar(CurPtr, Result);
LexingRawMode = OldRawMode;
+ // If we only read only one character, then no special handling is needed.
+ // We're done and can skip forward to the newline.
+ if (C != 0 && CurPtr == OldPtr+1) {
+ CurPtr = NextLine;
+ break;
+ }
+
// If the char that we finally got was a \n, then we must have had something
// like \<newline><newline>. We don't want to have consumed the second
// newline, we want CurPtr, to end up pointing to it down below.
@@ -1492,9 +1703,9 @@ bool Lexer::SkipBCPLComment(Token &Result, const char *CurPtr) {
if (OldPtr[0] == '\n' || OldPtr[0] == '\r') {
// Okay, we found a // comment that ends in a newline, if the next
// line is also a // comment, but has spaces, don't emit a diagnostic.
- if (isspace(C)) {
+ if (isWhitespace(C)) {
const char *ForwardPtr = CurPtr;
- while (isspace(*ForwardPtr)) // Skip whitespace.
+ while (isWhitespace(*ForwardPtr)) // Skip whitespace.
++ForwardPtr;
if (ForwardPtr[0] == '/' && ForwardPtr[1] == '/')
break;
@@ -1507,12 +1718,16 @@ bool Lexer::SkipBCPLComment(Token &Result, const char *CurPtr) {
}
if (CurPtr == BufferEnd+1) {
- if (PP && PP->isCodeCompletionFile(FileLoc))
- PP->CodeCompleteNaturalLanguage();
-
--CurPtr;
break;
}
+
+ if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
+ PP->CodeCompleteNaturalLanguage();
+ cutOffLexing();
+ return false;
+ }
+
} while (C != '\n' && C != '\r');
// Found but did not consume the newline. Notify comment handlers about the
@@ -1573,7 +1788,7 @@ bool Lexer::SaveBCPLComment(Token &Result, const char *CurPtr) {
Result.setKind(tok::comment);
PP->CreateString(&Spelling[0], Spelling.size(), Result,
- Result.getLocation());
+ Result.getLocation(), Result.getLocation());
return true;
}
@@ -1667,8 +1882,7 @@ bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr) {
unsigned char C = getCharAndSize(CurPtr, CharSize);
CurPtr += CharSize;
if (C == 0 && CurPtr == BufferEnd+1) {
- if (!isLexingRawMode() &&
- !PP->isCodeCompletionFile(FileLoc))
+ if (!isLexingRawMode())
Diag(BufferPtr, diag::err_unterminated_block_comment);
--CurPtr;
@@ -1691,7 +1905,10 @@ bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr) {
while (1) {
// Skip over all non-interesting characters until we find end of buffer or a
// (probably ending) '/' character.
- if (CurPtr + 24 < BufferEnd) {
+ if (CurPtr + 24 < BufferEnd &&
+ // If there is a code-completion point avoid the fast scan because it
+ // doesn't check for '\0'.
+ !(PP && PP->getCodeCompletionFileLoc() == FileLoc)) {
// While not aligned to a 16-byte boundary.
while (C != '/' && ((intptr_t)CurPtr & 0x0F) != 0)
C = *CurPtr++;
@@ -1751,9 +1968,7 @@ bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr) {
Diag(CurPtr-1, diag::warn_nested_block_comment);
}
} else if (C == 0 && CurPtr == BufferEnd+1) {
- if (PP && PP->isCodeCompletionFile(FileLoc))
- PP->CodeCompleteNaturalLanguage();
- else if (!isLexingRawMode())
+ if (!isLexingRawMode())
Diag(BufferPtr, diag::err_unterminated_block_comment);
// Note: the user probably forgot a */. We could continue immediately
// after the /*, but this would involve lexing a lot of what really is the
@@ -1769,7 +1984,12 @@ bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr) {
BufferPtr = CurPtr;
return false;
+ } else if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
+ PP->CodeCompleteNaturalLanguage();
+ cutOffLexing();
+ return false;
}
+
C = *CurPtr++;
}
@@ -1826,6 +2046,12 @@ std::string Lexer::ReadToEndOfLine() {
case 0: // Null.
// Found end of file?
if (CurPtr-1 != BufferEnd) {
+ if (isCodeCompletionPoint(CurPtr-1)) {
+ PP->CodeCompleteNaturalLanguage();
+ cutOffLexing();
+ return Result;
+ }
+
// Nope, normal character, continue.
Result += Char;
break;
@@ -1840,8 +2066,8 @@ std::string Lexer::ReadToEndOfLine() {
// Next, lex the character, which should handle the EOD transition.
Lex(Tmp);
if (Tmp.is(tok::code_completion)) {
- if (PP && PP->getCodeCompletionHandler())
- PP->getCodeCompletionHandler()->CodeCompleteNaturalLanguage();
+ if (PP)
+ PP->CodeCompleteNaturalLanguage();
Lex(Tmp);
}
assert(Tmp.is(tok::eod) && "Unexpected token!");
@@ -1857,22 +2083,6 @@ std::string Lexer::ReadToEndOfLine() {
/// This returns true if Result contains a token, false if PP.Lex should be
/// called again.
bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) {
- // Check if we are performing code completion.
- if (PP && PP->isCodeCompletionFile(FileLoc)) {
- // We're at the end of the file, but we've been asked to consider the
- // end of the file to be a code-completion token. Return the
- // code-completion token.
- Result.startToken();
- FormTokenWithChars(Result, CurPtr, tok::code_completion);
-
- // Only do the eof -> code_completion translation once.
- PP->SetCodeCompletionPoint(0, 0, 0);
-
- // Silence any diagnostics that occur once we hit the code-completion point.
- PP->getDiagnostics().setSuppressAllDiagnostics(true);
- return true;
- }
-
// If we hit the end of the file while parsing a preprocessor directive,
// end the preprocessor directive first. The next token returned will
// then be the end of file.
@@ -1900,7 +2110,7 @@ bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) {
// If we are in a #if directive, emit an error.
while (!ConditionalStack.empty()) {
- if (!PP->isCodeCompletionFile(FileLoc))
+ if (PP->getCodeCompletionFileLoc() != FileLoc)
PP->Diag(ConditionalStack.back().IfLoc,
diag::err_pp_unterminated_conditional);
ConditionalStack.pop_back();
@@ -1951,15 +2161,18 @@ unsigned Lexer::isNextPPTokenLParen() {
}
/// FindConflictEnd - Find the end of a version control conflict marker.
-static const char *FindConflictEnd(const char *CurPtr, const char *BufferEnd) {
- llvm::StringRef RestOfBuffer(CurPtr+7, BufferEnd-CurPtr-7);
- size_t Pos = RestOfBuffer.find(">>>>>>>");
- while (Pos != llvm::StringRef::npos) {
+static const char *FindConflictEnd(const char *CurPtr, const char *BufferEnd,
+ ConflictMarkerKind CMK) {
+ const char *Terminator = CMK == CMK_Perforce ? "<<<<\n" : ">>>>>>>";
+ size_t TermLen = CMK == CMK_Perforce ? 5 : 7;
+ StringRef RestOfBuffer(CurPtr+TermLen, BufferEnd-CurPtr-TermLen);
+ size_t Pos = RestOfBuffer.find(Terminator);
+ while (Pos != StringRef::npos) {
// Must occur at start of line.
if (RestOfBuffer[Pos-1] != '\r' &&
RestOfBuffer[Pos-1] != '\n') {
- RestOfBuffer = RestOfBuffer.substr(Pos+7);
- Pos = RestOfBuffer.find(">>>>>>>");
+ RestOfBuffer = RestOfBuffer.substr(Pos+TermLen);
+ Pos = RestOfBuffer.find(Terminator);
continue;
}
return RestOfBuffer.data()+Pos;
@@ -1977,23 +2190,25 @@ bool Lexer::IsStartOfConflictMarker(const char *CurPtr) {
CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
return false;
- // Check to see if we have <<<<<<<.
- if (BufferEnd-CurPtr < 8 ||
- llvm::StringRef(CurPtr, 7) != "<<<<<<<")
+ // Check to see if we have <<<<<<< or >>>>.
+ if ((BufferEnd-CurPtr < 8 || StringRef(CurPtr, 7) != "<<<<<<<") &&
+ (BufferEnd-CurPtr < 6 || StringRef(CurPtr, 5) != ">>>> "))
return false;
// If we have a situation where we don't care about conflict markers, ignore
// it.
- if (IsInConflictMarker || isLexingRawMode())
+ if (CurrentConflictMarkerState || isLexingRawMode())
return false;
- // Check to see if there is a >>>>>>> somewhere in the buffer at the start of
- // a line to terminate this conflict marker.
- if (FindConflictEnd(CurPtr, BufferEnd)) {
+ ConflictMarkerKind Kind = *CurPtr == '<' ? CMK_Normal : CMK_Perforce;
+
+ // Check to see if there is an ending marker somewhere in the buffer at the
+ // start of a line to terminate this conflict marker.
+ if (FindConflictEnd(CurPtr, BufferEnd, Kind)) {
// We found a match. We are really in a conflict marker.
// Diagnose this, and ignore to the end of line.
Diag(CurPtr, diag::err_conflict_marker);
- IsInConflictMarker = true;
+ CurrentConflictMarkerState = Kind;
// Skip ahead to the end of line. We know this exists because the
// end-of-conflict marker starts with \r or \n.
@@ -2010,10 +2225,10 @@ bool Lexer::IsStartOfConflictMarker(const char *CurPtr) {
}
-/// HandleEndOfConflictMarker - If this is a '=======' or '|||||||' or '>>>>>>>'
-/// marker, then it is the end of a conflict marker. Handle it by ignoring up
-/// until the end of the line. This returns true if it is a conflict marker and
-/// false if not.
+/// HandleEndOfConflictMarker - If this is a '====' or '||||' or '>>>>', or if
+/// it is '<<<<' and the conflict marker started with a '>>>>' marker, then it
+/// is the end of a conflict marker. Handle it by ignoring up until the end of
+/// the line. This returns true if it is a conflict marker and false if not.
bool Lexer::HandleEndOfConflictMarker(const char *CurPtr) {
// Only a conflict marker if it starts at the beginning of a line.
if (CurPtr != BufferStart &&
@@ -2022,18 +2237,19 @@ bool Lexer::HandleEndOfConflictMarker(const char *CurPtr) {
// If we have a situation where we don't care about conflict markers, ignore
// it.
- if (!IsInConflictMarker || isLexingRawMode())
+ if (!CurrentConflictMarkerState || isLexingRawMode())
return false;
- // Check to see if we have the marker (7 characters in a row).
- for (unsigned i = 1; i != 7; ++i)
+ // Check to see if we have the marker (4 characters in a row).
+ for (unsigned i = 1; i != 4; ++i)
if (CurPtr[i] != CurPtr[0])
return false;
// If we do have it, search for the end of the conflict marker. This could
// fail if it got skipped with a '#if 0' or something. Note that CurPtr might
// be the end of conflict marker.
- if (const char *End = FindConflictEnd(CurPtr, BufferEnd)) {
+ if (const char *End = FindConflictEnd(CurPtr, BufferEnd,
+ CurrentConflictMarkerState)) {
CurPtr = End;
// Skip ahead to the end of line.
@@ -2043,13 +2259,22 @@ bool Lexer::HandleEndOfConflictMarker(const char *CurPtr) {
BufferPtr = CurPtr;
// No longer in the conflict marker.
- IsInConflictMarker = false;
+ CurrentConflictMarkerState = CMK_None;
return true;
}
return false;
}
+bool Lexer::isCodeCompletionPoint(const char *CurPtr) const {
+ if (PP && PP->isCodeCompletionEnabled()) {
+ SourceLocation Loc = FileLoc.getLocWithOffset(CurPtr-BufferStart);
+ return Loc == PP->getCodeCompletionLoc();
+ }
+
+ return false;
+}
+
/// LexTokenInternal - This implements a simple C family lexer. It is an
/// extremely performance critical piece of code. This assumes that the buffer
@@ -2102,6 +2327,14 @@ LexNextToken:
return PPCache->Lex(Result);
}
+ // Check if we are performing code completion.
+ if (isCodeCompletionPoint(CurPtr-1)) {
+ // Return the code-completion token.
+ Result.startToken();
+ FormTokenWithChars(Result, CurPtr, tok::code_completion);
+ return;
+ }
+
if (!isLexingRawMode())
Diag(CurPtr-1, diag::null_in_file);
Result.setFlag(Token::LeadingSpace);
@@ -2112,7 +2345,7 @@ LexNextToken:
case 26: // DOS & CP/M EOF: "^Z".
// If we're in Microsoft extensions mode, treat this as end of file.
- if (Features.Microsoft) {
+ if (Features.MicrosoftExt) {
// Read the PP instance variable into an automatic variable, because
// LexEndOfFile will often delete 'this'.
Preprocessor *PPCache = PP;
@@ -2186,6 +2419,102 @@ LexNextToken:
MIOpt.ReadToken();
return LexNumericConstant(Result, CurPtr);
+ case 'u': // Identifier (uber) or C++0x UTF-8 or UTF-16 string literal
+ // Notify MIOpt that we read a non-whitespace/non-comment token.
+ MIOpt.ReadToken();
+
+ if (Features.CPlusPlus0x) {
+ Char = getCharAndSize(CurPtr, SizeTmp);
+
+ // UTF-16 string literal
+ if (Char == '"')
+ return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
+ tok::utf16_string_literal);
+
+ // UTF-16 character constant
+ if (Char == '\'')
+ return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
+ tok::utf16_char_constant);
+
+ // UTF-16 raw string literal
+ if (Char == 'R' && getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
+ return LexRawStringLiteral(Result,
+ ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
+ SizeTmp2, Result),
+ tok::utf16_string_literal);
+
+ if (Char == '8') {
+ char Char2 = getCharAndSize(CurPtr + SizeTmp, SizeTmp2);
+
+ // UTF-8 string literal
+ if (Char2 == '"')
+ return LexStringLiteral(Result,
+ ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
+ SizeTmp2, Result),
+ tok::utf8_string_literal);
+
+ if (Char2 == 'R') {
+ unsigned SizeTmp3;
+ char Char3 = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
+ // UTF-8 raw string literal
+ if (Char3 == '"') {
+ return LexRawStringLiteral(Result,
+ ConsumeChar(ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
+ SizeTmp2, Result),
+ SizeTmp3, Result),
+ tok::utf8_string_literal);
+ }
+ }
+ }
+ }
+
+ // treat u like the start of an identifier.
+ return LexIdentifier(Result, CurPtr);
+
+ case 'U': // Identifier (Uber) or C++0x UTF-32 string literal
+ // Notify MIOpt that we read a non-whitespace/non-comment token.
+ MIOpt.ReadToken();
+
+ if (Features.CPlusPlus0x) {
+ Char = getCharAndSize(CurPtr, SizeTmp);
+
+ // UTF-32 string literal
+ if (Char == '"')
+ return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
+ tok::utf32_string_literal);
+
+ // UTF-32 character constant
+ if (Char == '\'')
+ return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
+ tok::utf32_char_constant);
+
+ // UTF-32 raw string literal
+ if (Char == 'R' && getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
+ return LexRawStringLiteral(Result,
+ ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
+ SizeTmp2, Result),
+ tok::utf32_string_literal);
+ }
+
+ // treat U like the start of an identifier.
+ return LexIdentifier(Result, CurPtr);
+
+ case 'R': // Identifier or C++0x raw string literal
+ // Notify MIOpt that we read a non-whitespace/non-comment token.
+ MIOpt.ReadToken();
+
+ if (Features.CPlusPlus0x) {
+ Char = getCharAndSize(CurPtr, SizeTmp);
+
+ if (Char == '"')
+ return LexRawStringLiteral(Result,
+ ConsumeChar(CurPtr, SizeTmp, Result),
+ tok::string_literal);
+ }
+
+ // treat R like the start of an identifier.
+ return LexIdentifier(Result, CurPtr);
+
case 'L': // Identifier (Loony) or wide literal (L'x' or L"xyz").
// Notify MIOpt that we read a non-whitespace/non-comment token.
MIOpt.ReadToken();
@@ -2194,21 +2523,30 @@ LexNextToken:
// Wide string literal.
if (Char == '"')
return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
- true);
+ tok::wide_string_literal);
+
+ // Wide raw string literal.
+ if (Features.CPlusPlus0x && Char == 'R' &&
+ getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
+ return LexRawStringLiteral(Result,
+ ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
+ SizeTmp2, Result),
+ tok::wide_string_literal);
// Wide character constant.
if (Char == '\'')
- return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result));
+ return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
+ tok::wide_char_constant);
// FALL THROUGH, treating L like the start of an identifier.
// C99 6.4.2: Identifiers.
case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
case 'H': case 'I': case 'J': case 'K': /*'L'*/case 'M': case 'N':
- case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
+ case 'O': case 'P': case 'Q': /*'R'*/case 'S': case 'T': /*'U'*/
case 'V': case 'W': case 'X': case 'Y': case 'Z':
case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
- case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
+ case 'o': case 'p': case 'q': case 'r': case 's': case 't': /*'u'*/
case 'v': case 'w': case 'x': case 'y': case 'z':
case '_':
// Notify MIOpt that we read a non-whitespace/non-comment token.
@@ -2231,13 +2569,13 @@ LexNextToken:
case '\'':
// Notify MIOpt that we read a non-whitespace/non-comment token.
MIOpt.ReadToken();
- return LexCharConstant(Result, CurPtr);
+ return LexCharConstant(Result, CurPtr, tok::char_constant);
// C99 6.4.5: String Literals.
case '"':
// Notify MIOpt that we read a non-whitespace/non-comment token.
MIOpt.ReadToken();
- return LexStringLiteral(Result, CurPtr, false);
+ return LexStringLiteral(Result, CurPtr, tok::string_literal);
// C99 6.4.6: Punctuators.
case '?':
@@ -2396,7 +2734,7 @@ LexNextToken:
Kind = tok::hashhash; // '%:%:' -> '##'
CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
SizeTmp2, Result);
- } else if (Char == '@' && Features.Microsoft) { // %:@ -> #@ -> Charize
+ } else if (Char == '@' && Features.MicrosoftExt) {// %:@ -> #@ -> Charize
CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
if (!isLexingRawMode())
Diag(BufferPtr, diag::charize_microsoft_ext);
@@ -2447,6 +2785,10 @@ LexNextToken:
// If this is actually a '<<<<<<<' version control conflict marker,
// recognize it as such and recover nicely.
goto LexNextToken;
+ } else if (After == '<' && HandleEndOfConflictMarker(CurPtr-1)) {
+ // If this is '<<<<' and we're in a Perforce-style conflict marker,
+ // ignore it.
+ goto LexNextToken;
} else if (Features.CUDA && After == '<') {
Kind = tok::lesslessless;
CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
@@ -2470,6 +2812,8 @@ LexNextToken:
char After = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
if (After != ':' && After != '>') {
Kind = tok::less;
+ if (!isLexingRawMode())
+ Diag(BufferPtr, diag::warn_cxx98_compat_less_colon_colon);
break;
}
}
@@ -2494,6 +2838,10 @@ LexNextToken:
CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
SizeTmp2, Result);
Kind = tok::greatergreaterequal;
+ } else if (After == '>' && IsStartOfConflictMarker(CurPtr-1)) {
+ // If this is actually a '>>>>' conflict marker, recognize it as such
+ // and recover nicely.
+ goto LexNextToken;
} else if (After == '>' && HandleEndOfConflictMarker(CurPtr-1)) {
// If this is '>>>>>>>' and we're in a conflict marker, ignore it.
goto LexNextToken;
@@ -2552,7 +2900,7 @@ LexNextToken:
case '=':
Char = getCharAndSize(CurPtr, SizeTmp);
if (Char == '=') {
- // If this is '=======' and we're in a conflict marker, ignore it.
+ // If this is '====' and we're in a conflict marker, ignore it.
if (CurPtr[1] == '=' && HandleEndOfConflictMarker(CurPtr-1))
goto LexNextToken;
@@ -2570,7 +2918,7 @@ LexNextToken:
if (Char == '#') {
Kind = tok::hashhash;
CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
- } else if (Char == '@' && Features.Microsoft) { // #@ -> Charize
+ } else if (Char == '@' && Features.MicrosoftExt) { // #@ -> Charize
Kind = tok::hashat;
if (!isLexingRawMode())
Diag(BufferPtr, diag::charize_microsoft_ext);