diff options
Diffstat (limited to 'contrib/llvm/lib/Support/YAMLParser.cpp')
| -rw-r--r-- | contrib/llvm/lib/Support/YAMLParser.cpp | 2442 |
1 files changed, 0 insertions, 2442 deletions
diff --git a/contrib/llvm/lib/Support/YAMLParser.cpp b/contrib/llvm/lib/Support/YAMLParser.cpp deleted file mode 100644 index 9b2fe9c4418a..000000000000 --- a/contrib/llvm/lib/Support/YAMLParser.cpp +++ /dev/null @@ -1,2442 +0,0 @@ -//===- YAMLParser.cpp - Simple YAML parser --------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file implements a YAML parser. -// -//===----------------------------------------------------------------------===// - -#include "llvm/Support/YAMLParser.h" -#include "llvm/ADT/AllocatorList.h" -#include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/None.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SmallString.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/StringExtras.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/ADT/Twine.h" -#include "llvm/Support/Compiler.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/MemoryBuffer.h" -#include "llvm/Support/SMLoc.h" -#include "llvm/Support/SourceMgr.h" -#include "llvm/Support/Unicode.h" -#include "llvm/Support/raw_ostream.h" -#include <algorithm> -#include <cassert> -#include <cstddef> -#include <cstdint> -#include <map> -#include <memory> -#include <string> -#include <system_error> -#include <utility> - -using namespace llvm; -using namespace yaml; - -enum UnicodeEncodingForm { - UEF_UTF32_LE, ///< UTF-32 Little Endian - UEF_UTF32_BE, ///< UTF-32 Big Endian - UEF_UTF16_LE, ///< UTF-16 Little Endian - UEF_UTF16_BE, ///< UTF-16 Big Endian - UEF_UTF8, ///< UTF-8 or ascii. - UEF_Unknown ///< Not a valid Unicode encoding. -}; - -/// EncodingInfo - Holds the encoding type and length of the byte order mark if -/// it exists. Length is in {0, 2, 3, 4}. -using EncodingInfo = std::pair<UnicodeEncodingForm, unsigned>; - -/// getUnicodeEncoding - Reads up to the first 4 bytes to determine the Unicode -/// encoding form of \a Input. -/// -/// @param Input A string of length 0 or more. -/// @returns An EncodingInfo indicating the Unicode encoding form of the input -/// and how long the byte order mark is if one exists. -static EncodingInfo getUnicodeEncoding(StringRef Input) { - if (Input.empty()) - return std::make_pair(UEF_Unknown, 0); - - switch (uint8_t(Input[0])) { - case 0x00: - if (Input.size() >= 4) { - if ( Input[1] == 0 - && uint8_t(Input[2]) == 0xFE - && uint8_t(Input[3]) == 0xFF) - return std::make_pair(UEF_UTF32_BE, 4); - if (Input[1] == 0 && Input[2] == 0 && Input[3] != 0) - return std::make_pair(UEF_UTF32_BE, 0); - } - - if (Input.size() >= 2 && Input[1] != 0) - return std::make_pair(UEF_UTF16_BE, 0); - return std::make_pair(UEF_Unknown, 0); - case 0xFF: - if ( Input.size() >= 4 - && uint8_t(Input[1]) == 0xFE - && Input[2] == 0 - && Input[3] == 0) - return std::make_pair(UEF_UTF32_LE, 4); - - if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFE) - return std::make_pair(UEF_UTF16_LE, 2); - return std::make_pair(UEF_Unknown, 0); - case 0xFE: - if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFF) - return std::make_pair(UEF_UTF16_BE, 2); - return std::make_pair(UEF_Unknown, 0); - case 0xEF: - if ( Input.size() >= 3 - && uint8_t(Input[1]) == 0xBB - && uint8_t(Input[2]) == 0xBF) - return std::make_pair(UEF_UTF8, 3); - return std::make_pair(UEF_Unknown, 0); - } - - // It could still be utf-32 or utf-16. - if (Input.size() >= 4 && Input[1] == 0 && Input[2] == 0 && Input[3] == 0) - return std::make_pair(UEF_UTF32_LE, 0); - - if (Input.size() >= 2 && Input[1] == 0) - return std::make_pair(UEF_UTF16_LE, 0); - - return std::make_pair(UEF_UTF8, 0); -} - -/// Pin the vtables to this file. -void Node::anchor() {} -void NullNode::anchor() {} -void ScalarNode::anchor() {} -void BlockScalarNode::anchor() {} -void KeyValueNode::anchor() {} -void MappingNode::anchor() {} -void SequenceNode::anchor() {} -void AliasNode::anchor() {} - -namespace llvm { -namespace yaml { - -/// Token - A single YAML token. -struct Token { - enum TokenKind { - TK_Error, // Uninitialized token. - TK_StreamStart, - TK_StreamEnd, - TK_VersionDirective, - TK_TagDirective, - TK_DocumentStart, - TK_DocumentEnd, - TK_BlockEntry, - TK_BlockEnd, - TK_BlockSequenceStart, - TK_BlockMappingStart, - TK_FlowEntry, - TK_FlowSequenceStart, - TK_FlowSequenceEnd, - TK_FlowMappingStart, - TK_FlowMappingEnd, - TK_Key, - TK_Value, - TK_Scalar, - TK_BlockScalar, - TK_Alias, - TK_Anchor, - TK_Tag - } Kind = TK_Error; - - /// A string of length 0 or more whose begin() points to the logical location - /// of the token in the input. - StringRef Range; - - /// The value of a block scalar node. - std::string Value; - - Token() = default; -}; - -} // end namespace yaml -} // end namespace llvm - -using TokenQueueT = BumpPtrList<Token>; - -namespace { - -/// This struct is used to track simple keys. -/// -/// Simple keys are handled by creating an entry in SimpleKeys for each Token -/// which could legally be the start of a simple key. When peekNext is called, -/// if the Token To be returned is referenced by a SimpleKey, we continue -/// tokenizing until that potential simple key has either been found to not be -/// a simple key (we moved on to the next line or went further than 1024 chars). -/// Or when we run into a Value, and then insert a Key token (and possibly -/// others) before the SimpleKey's Tok. -struct SimpleKey { - TokenQueueT::iterator Tok; - unsigned Column; - unsigned Line; - unsigned FlowLevel; - bool IsRequired; - - bool operator ==(const SimpleKey &Other) { - return Tok == Other.Tok; - } -}; - -} // end anonymous namespace - -/// The Unicode scalar value of a UTF-8 minimal well-formed code unit -/// subsequence and the subsequence's length in code units (uint8_t). -/// A length of 0 represents an error. -using UTF8Decoded = std::pair<uint32_t, unsigned>; - -static UTF8Decoded decodeUTF8(StringRef Range) { - StringRef::iterator Position= Range.begin(); - StringRef::iterator End = Range.end(); - // 1 byte: [0x00, 0x7f] - // Bit pattern: 0xxxxxxx - if ((*Position & 0x80) == 0) { - return std::make_pair(*Position, 1); - } - // 2 bytes: [0x80, 0x7ff] - // Bit pattern: 110xxxxx 10xxxxxx - if (Position + 1 != End && - ((*Position & 0xE0) == 0xC0) && - ((*(Position + 1) & 0xC0) == 0x80)) { - uint32_t codepoint = ((*Position & 0x1F) << 6) | - (*(Position + 1) & 0x3F); - if (codepoint >= 0x80) - return std::make_pair(codepoint, 2); - } - // 3 bytes: [0x8000, 0xffff] - // Bit pattern: 1110xxxx 10xxxxxx 10xxxxxx - if (Position + 2 != End && - ((*Position & 0xF0) == 0xE0) && - ((*(Position + 1) & 0xC0) == 0x80) && - ((*(Position + 2) & 0xC0) == 0x80)) { - uint32_t codepoint = ((*Position & 0x0F) << 12) | - ((*(Position + 1) & 0x3F) << 6) | - (*(Position + 2) & 0x3F); - // Codepoints between 0xD800 and 0xDFFF are invalid, as - // they are high / low surrogate halves used by UTF-16. - if (codepoint >= 0x800 && - (codepoint < 0xD800 || codepoint > 0xDFFF)) - return std::make_pair(codepoint, 3); - } - // 4 bytes: [0x10000, 0x10FFFF] - // Bit pattern: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx - if (Position + 3 != End && - ((*Position & 0xF8) == 0xF0) && - ((*(Position + 1) & 0xC0) == 0x80) && - ((*(Position + 2) & 0xC0) == 0x80) && - ((*(Position + 3) & 0xC0) == 0x80)) { - uint32_t codepoint = ((*Position & 0x07) << 18) | - ((*(Position + 1) & 0x3F) << 12) | - ((*(Position + 2) & 0x3F) << 6) | - (*(Position + 3) & 0x3F); - if (codepoint >= 0x10000 && codepoint <= 0x10FFFF) - return std::make_pair(codepoint, 4); - } - return std::make_pair(0, 0); -} - -namespace llvm { -namespace yaml { - -/// Scans YAML tokens from a MemoryBuffer. -class Scanner { -public: - Scanner(StringRef Input, SourceMgr &SM, bool ShowColors = true, - std::error_code *EC = nullptr); - Scanner(MemoryBufferRef Buffer, SourceMgr &SM_, bool ShowColors = true, - std::error_code *EC = nullptr); - - /// Parse the next token and return it without popping it. - Token &peekNext(); - - /// Parse the next token and pop it from the queue. - Token getNext(); - - void printError(SMLoc Loc, SourceMgr::DiagKind Kind, const Twine &Message, - ArrayRef<SMRange> Ranges = None) { - SM.PrintMessage(Loc, Kind, Message, Ranges, /* FixIts= */ None, ShowColors); - } - - void setError(const Twine &Message, StringRef::iterator Position) { - if (Current >= End) - Current = End - 1; - - // propagate the error if possible - if (EC) - *EC = make_error_code(std::errc::invalid_argument); - - // Don't print out more errors after the first one we encounter. The rest - // are just the result of the first, and have no meaning. - if (!Failed) - printError(SMLoc::getFromPointer(Current), SourceMgr::DK_Error, Message); - Failed = true; - } - - void setError(const Twine &Message) { - setError(Message, Current); - } - - /// Returns true if an error occurred while parsing. - bool failed() { - return Failed; - } - -private: - void init(MemoryBufferRef Buffer); - - StringRef currentInput() { - return StringRef(Current, End - Current); - } - - /// Decode a UTF-8 minimal well-formed code unit subsequence starting - /// at \a Position. - /// - /// If the UTF-8 code units starting at Position do not form a well-formed - /// code unit subsequence, then the Unicode scalar value is 0, and the length - /// is 0. - UTF8Decoded decodeUTF8(StringRef::iterator Position) { - return ::decodeUTF8(StringRef(Position, End - Position)); - } - - // The following functions are based on the gramar rules in the YAML spec. The - // style of the function names it meant to closely match how they are written - // in the spec. The number within the [] is the number of the grammar rule in - // the spec. - // - // See 4.2 [Production Naming Conventions] for the meaning of the prefixes. - // - // c- - // A production starting and ending with a special character. - // b- - // A production matching a single line break. - // nb- - // A production starting and ending with a non-break character. - // s- - // A production starting and ending with a white space character. - // ns- - // A production starting and ending with a non-space character. - // l- - // A production matching complete line(s). - - /// Skip a single nb-char[27] starting at Position. - /// - /// A nb-char is 0x9 | [0x20-0x7E] | 0x85 | [0xA0-0xD7FF] | [0xE000-0xFEFE] - /// | [0xFF00-0xFFFD] | [0x10000-0x10FFFF] - /// - /// @returns The code unit after the nb-char, or Position if it's not an - /// nb-char. - StringRef::iterator skip_nb_char(StringRef::iterator Position); - - /// Skip a single b-break[28] starting at Position. - /// - /// A b-break is 0xD 0xA | 0xD | 0xA - /// - /// @returns The code unit after the b-break, or Position if it's not a - /// b-break. - StringRef::iterator skip_b_break(StringRef::iterator Position); - - /// Skip a single s-space[31] starting at Position. - /// - /// An s-space is 0x20 - /// - /// @returns The code unit after the s-space, or Position if it's not a - /// s-space. - StringRef::iterator skip_s_space(StringRef::iterator Position); - - /// Skip a single s-white[33] starting at Position. - /// - /// A s-white is 0x20 | 0x9 - /// - /// @returns The code unit after the s-white, or Position if it's not a - /// s-white. - StringRef::iterator skip_s_white(StringRef::iterator Position); - - /// Skip a single ns-char[34] starting at Position. - /// - /// A ns-char is nb-char - s-white - /// - /// @returns The code unit after the ns-char, or Position if it's not a - /// ns-char. - StringRef::iterator skip_ns_char(StringRef::iterator Position); - - using SkipWhileFunc = StringRef::iterator (Scanner::*)(StringRef::iterator); - - /// Skip minimal well-formed code unit subsequences until Func - /// returns its input. - /// - /// @returns The code unit after the last minimal well-formed code unit - /// subsequence that Func accepted. - StringRef::iterator skip_while( SkipWhileFunc Func - , StringRef::iterator Position); - - /// Skip minimal well-formed code unit subsequences until Func returns its - /// input. - void advanceWhile(SkipWhileFunc Func); - - /// Scan ns-uri-char[39]s starting at Cur. - /// - /// This updates Cur and Column while scanning. - void scan_ns_uri_char(); - - /// Consume a minimal well-formed code unit subsequence starting at - /// \a Cur. Return false if it is not the same Unicode scalar value as - /// \a Expected. This updates \a Column. - bool consume(uint32_t Expected); - - /// Skip \a Distance UTF-8 code units. Updates \a Cur and \a Column. - void skip(uint32_t Distance); - - /// Return true if the minimal well-formed code unit subsequence at - /// Pos is whitespace or a new line - bool isBlankOrBreak(StringRef::iterator Position); - - /// Consume a single b-break[28] if it's present at the current position. - /// - /// Return false if the code unit at the current position isn't a line break. - bool consumeLineBreakIfPresent(); - - /// If IsSimpleKeyAllowed, create and push_back a new SimpleKey. - void saveSimpleKeyCandidate( TokenQueueT::iterator Tok - , unsigned AtColumn - , bool IsRequired); - - /// Remove simple keys that can no longer be valid simple keys. - /// - /// Invalid simple keys are not on the current line or are further than 1024 - /// columns back. - void removeStaleSimpleKeyCandidates(); - - /// Remove all simple keys on FlowLevel \a Level. - void removeSimpleKeyCandidatesOnFlowLevel(unsigned Level); - - /// Unroll indentation in \a Indents back to \a Col. Creates BlockEnd - /// tokens if needed. - bool unrollIndent(int ToColumn); - - /// Increase indent to \a Col. Creates \a Kind token at \a InsertPoint - /// if needed. - bool rollIndent( int ToColumn - , Token::TokenKind Kind - , TokenQueueT::iterator InsertPoint); - - /// Skip a single-line comment when the comment starts at the current - /// position of the scanner. - void skipComment(); - - /// Skip whitespace and comments until the start of the next token. - void scanToNextToken(); - - /// Must be the first token generated. - bool scanStreamStart(); - - /// Generate tokens needed to close out the stream. - bool scanStreamEnd(); - - /// Scan a %BLAH directive. - bool scanDirective(); - - /// Scan a ... or ---. - bool scanDocumentIndicator(bool IsStart); - - /// Scan a [ or { and generate the proper flow collection start token. - bool scanFlowCollectionStart(bool IsSequence); - - /// Scan a ] or } and generate the proper flow collection end token. - bool scanFlowCollectionEnd(bool IsSequence); - - /// Scan the , that separates entries in a flow collection. - bool scanFlowEntry(); - - /// Scan the - that starts block sequence entries. - bool scanBlockEntry(); - - /// Scan an explicit ? indicating a key. - bool scanKey(); - - /// Scan an explicit : indicating a value. - bool scanValue(); - - /// Scan a quoted scalar. - bool scanFlowScalar(bool IsDoubleQuoted); - - /// Scan an unquoted scalar. - bool scanPlainScalar(); - - /// Scan an Alias or Anchor starting with * or &. - bool scanAliasOrAnchor(bool IsAlias); - - /// Scan a block scalar starting with | or >. - bool scanBlockScalar(bool IsLiteral); - - /// Scan a chomping indicator in a block scalar header. - char scanBlockChompingIndicator(); - - /// Scan an indentation indicator in a block scalar header. - unsigned scanBlockIndentationIndicator(); - - /// Scan a block scalar header. - /// - /// Return false if an error occurred. - bool scanBlockScalarHeader(char &ChompingIndicator, unsigned &IndentIndicator, - bool &IsDone); - - /// Look for the indentation level of a block scalar. - /// - /// Return false if an error occurred. - bool findBlockScalarIndent(unsigned &BlockIndent, unsigned BlockExitIndent, - unsigned &LineBreaks, bool &IsDone); - - /// Scan the indentation of a text line in a block scalar. - /// - /// Return false if an error occurred. - bool scanBlockScalarIndent(unsigned BlockIndent, unsigned BlockExitIndent, - bool &IsDone); - - /// Scan a tag of the form !stuff. - bool scanTag(); - - /// Dispatch to the next scanning function based on \a *Cur. - bool fetchMoreTokens(); - - /// The SourceMgr used for diagnostics and buffer management. - SourceMgr &SM; - - /// The original input. - MemoryBufferRef InputBuffer; - - /// The current position of the scanner. - StringRef::iterator Current; - - /// The end of the input (one past the last character). - StringRef::iterator End; - - /// Current YAML indentation level in spaces. - int Indent; - - /// Current column number in Unicode code points. - unsigned Column; - - /// Current line number. - unsigned Line; - - /// How deep we are in flow style containers. 0 Means at block level. - unsigned FlowLevel; - - /// Are we at the start of the stream? - bool IsStartOfStream; - - /// Can the next token be the start of a simple key? - bool IsSimpleKeyAllowed; - - /// True if an error has occurred. - bool Failed; - - /// Should colors be used when printing out the diagnostic messages? - bool ShowColors; - - /// Queue of tokens. This is required to queue up tokens while looking - /// for the end of a simple key. And for cases where a single character - /// can produce multiple tokens (e.g. BlockEnd). - TokenQueueT TokenQueue; - - /// Indentation levels. - SmallVector<int, 4> Indents; - - /// Potential simple keys. - SmallVector<SimpleKey, 4> SimpleKeys; - - std::error_code *EC; -}; - -} // end namespace yaml -} // end namespace llvm - -/// encodeUTF8 - Encode \a UnicodeScalarValue in UTF-8 and append it to result. -static void encodeUTF8( uint32_t UnicodeScalarValue - , SmallVectorImpl<char> &Result) { - if (UnicodeScalarValue <= 0x7F) { - Result.push_back(UnicodeScalarValue & 0x7F); - } else if (UnicodeScalarValue <= 0x7FF) { - uint8_t FirstByte = 0xC0 | ((UnicodeScalarValue & 0x7C0) >> 6); - uint8_t SecondByte = 0x80 | (UnicodeScalarValue & 0x3F); - Result.push_back(FirstByte); - Result.push_back(SecondByte); - } else if (UnicodeScalarValue <= 0xFFFF) { - uint8_t FirstByte = 0xE0 | ((UnicodeScalarValue & 0xF000) >> 12); - uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6); - uint8_t ThirdByte = 0x80 | (UnicodeScalarValue & 0x3F); - Result.push_back(FirstByte); - Result.push_back(SecondByte); - Result.push_back(ThirdByte); - } else if (UnicodeScalarValue <= 0x10FFFF) { - uint8_t FirstByte = 0xF0 | ((UnicodeScalarValue & 0x1F0000) >> 18); - uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0x3F000) >> 12); - uint8_t ThirdByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6); - uint8_t FourthByte = 0x80 | (UnicodeScalarValue & 0x3F); - Result.push_back(FirstByte); - Result.push_back(SecondByte); - Result.push_back(ThirdByte); - Result.push_back(FourthByte); - } -} - -bool yaml::dumpTokens(StringRef Input, raw_ostream &OS) { - SourceMgr SM; - Scanner scanner(Input, SM); - while (true) { - Token T = scanner.getNext(); - switch (T.Kind) { - case Token::TK_StreamStart: - OS << "Stream-Start: "; - break; - case Token::TK_StreamEnd: - OS << "Stream-End: "; - break; - case Token::TK_VersionDirective: - OS << "Version-Directive: "; - break; - case Token::TK_TagDirective: - OS << "Tag-Directive: "; - break; - case Token::TK_DocumentStart: - OS << "Document-Start: "; - break; - case Token::TK_DocumentEnd: - OS << "Document-End: "; - break; - case Token::TK_BlockEntry: - OS << "Block-Entry: "; - break; - case Token::TK_BlockEnd: - OS << "Block-End: "; - break; - case Token::TK_BlockSequenceStart: - OS << "Block-Sequence-Start: "; - break; - case Token::TK_BlockMappingStart: - OS << "Block-Mapping-Start: "; - break; - case Token::TK_FlowEntry: - OS << "Flow-Entry: "; - break; - case Token::TK_FlowSequenceStart: - OS << "Flow-Sequence-Start: "; - break; - case Token::TK_FlowSequenceEnd: - OS << "Flow-Sequence-End: "; - break; - case Token::TK_FlowMappingStart: - OS << "Flow-Mapping-Start: "; - break; - case Token::TK_FlowMappingEnd: - OS << "Flow-Mapping-End: "; - break; - case Token::TK_Key: - OS << "Key: "; - break; - case Token::TK_Value: - OS << "Value: "; - break; - case Token::TK_Scalar: - OS << "Scalar: "; - break; - case Token::TK_BlockScalar: - OS << "Block Scalar: "; - break; - case Token::TK_Alias: - OS << "Alias: "; - break; - case Token::TK_Anchor: - OS << "Anchor: "; - break; - case Token::TK_Tag: - OS << "Tag: "; - break; - case Token::TK_Error: - break; - } - OS << T.Range << "\n"; - if (T.Kind == Token::TK_StreamEnd) - break; - else if (T.Kind == Token::TK_Error) - return false; - } - return true; -} - -bool yaml::scanTokens(StringRef Input) { - SourceMgr SM; - Scanner scanner(Input, SM); - while (true) { - Token T = scanner.getNext(); - if (T.Kind == Token::TK_StreamEnd) - break; - else if (T.Kind == Token::TK_Error) - return false; - } - return true; -} - -std::string yaml::escape(StringRef Input, bool EscapePrintable) { - std::string EscapedInput; - for (StringRef::iterator i = Input.begin(), e = Input.end(); i != e; ++i) { - if (*i == '\\') - EscapedInput += "\\\\"; - else if (*i == '"') - EscapedInput += "\\\""; - else if (*i == 0) - EscapedInput += "\\0"; - else if (*i == 0x07) - EscapedInput += "\\a"; - else if (*i == 0x08) - EscapedInput += "\\b"; - else if (*i == 0x09) - EscapedInput += "\\t"; - else if (*i == 0x0A) - EscapedInput += "\\n"; - else if (*i == 0x0B) - EscapedInput += "\\v"; - else if (*i == 0x0C) - EscapedInput += "\\f"; - else if (*i == 0x0D) - EscapedInput += "\\r"; - else if (*i == 0x1B) - EscapedInput += "\\e"; - else if ((unsigned char)*i < 0x20) { // Control characters not handled above. - std::string HexStr = utohexstr(*i); - EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr; - } else if (*i & 0x80) { // UTF-8 multiple code unit subsequence. - UTF8Decoded UnicodeScalarValue - = decodeUTF8(StringRef(i, Input.end() - i)); - if (UnicodeScalarValue.second == 0) { - // Found invalid char. - SmallString<4> Val; - encodeUTF8(0xFFFD, Val); - EscapedInput.insert(EscapedInput.end(), Val.begin(), Val.end()); - // FIXME: Error reporting. - return EscapedInput; - } - if (UnicodeScalarValue.first == 0x85) - EscapedInput += "\\N"; - else if (UnicodeScalarValue.first == 0xA0) - EscapedInput += "\\_"; - else if (UnicodeScalarValue.first == 0x2028) - EscapedInput += "\\L"; - else if (UnicodeScalarValue.first == 0x2029) - EscapedInput += "\\P"; - else if (!EscapePrintable && - sys::unicode::isPrintable(UnicodeScalarValue.first)) - EscapedInput += StringRef(i, UnicodeScalarValue.second); - else { - std::string HexStr = utohexstr(UnicodeScalarValue.first); - if (HexStr.size() <= 2) - EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr; - else if (HexStr.size() <= 4) - EscapedInput += "\\u" + std::string(4 - HexStr.size(), '0') + HexStr; - else if (HexStr.size() <= 8) - EscapedInput += "\\U" + std::string(8 - HexStr.size(), '0') + HexStr; - } - i += UnicodeScalarValue.second - 1; - } else - EscapedInput.push_back(*i); - } - return EscapedInput; -} - -Scanner::Scanner(StringRef Input, SourceMgr &sm, bool ShowColors, - std::error_code *EC) - : SM(sm), ShowColors(ShowColors), EC(EC) { - init(MemoryBufferRef(Input, "YAML")); -} - -Scanner::Scanner(MemoryBufferRef Buffer, SourceMgr &SM_, bool ShowColors, - std::error_code *EC) - : SM(SM_), ShowColors(ShowColors), EC(EC) { - init(Buffer); -} - -void Scanner::init(MemoryBufferRef Buffer) { - InputBuffer = Buffer; - Current = InputBuffer.getBufferStart(); - End = InputBuffer.getBufferEnd(); - Indent = -1; - Column = 0; - Line = 0; - FlowLevel = 0; - IsStartOfStream = true; - IsSimpleKeyAllowed = true; - Failed = false; - std::unique_ptr<MemoryBuffer> InputBufferOwner = - MemoryBuffer::getMemBuffer(Buffer); - SM.AddNewSourceBuffer(std::move(InputBufferOwner), SMLoc()); -} - -Token &Scanner::peekNext() { - // If the current token is a possible simple key, keep parsing until we - // can confirm. - bool NeedMore = false; - while (true) { - if (TokenQueue.empty() || NeedMore) { - if (!fetchMoreTokens()) { - TokenQueue.clear(); - TokenQueue.push_back(Token()); - return TokenQueue.front(); - } - } - assert(!TokenQueue.empty() && - "fetchMoreTokens lied about getting tokens!"); - - removeStaleSimpleKeyCandidates(); - SimpleKey SK; - SK.Tok = TokenQueue.begin(); - if (!is_contained(SimpleKeys, SK)) - break; - else - NeedMore = true; - } - return TokenQueue.front(); -} - -Token Scanner::getNext() { - Token Ret = peekNext(); - // TokenQueue can be empty if there was an error getting the next token. - if (!TokenQueue.empty()) - TokenQueue.pop_front(); - - // There cannot be any referenced Token's if the TokenQueue is empty. So do a - // quick deallocation of them all. - if (TokenQueue.empty()) - TokenQueue.resetAlloc(); - - return Ret; -} - -StringRef::iterator Scanner::skip_nb_char(StringRef::iterator Position) { - if (Position == End) - return Position; - // Check 7 bit c-printable - b-char. - if ( *Position == 0x09 - || (*Position >= 0x20 && *Position <= 0x7E)) - return Position + 1; - - // Check for valid UTF-8. - if (uint8_t(*Position) & 0x80) { - UTF8Decoded u8d = decodeUTF8(Position); - if ( u8d.second != 0 - && u8d.first != 0xFEFF - && ( u8d.first == 0x85 - || ( u8d.first >= 0xA0 - && u8d.first <= 0xD7FF) - || ( u8d.first >= 0xE000 - && u8d.first <= 0xFFFD) - || ( u8d.first >= 0x10000 - && u8d.first <= 0x10FFFF))) - return Position + u8d.second; - } - return Position; -} - -StringRef::iterator Scanner::skip_b_break(StringRef::iterator Position) { - if (Position == End) - return Position; - if (*Position == 0x0D) { - if (Position + 1 != End && *(Position + 1) == 0x0A) - return Position + 2; - return Position + 1; - } - - if (*Position == 0x0A) - return Position + 1; - return Position; -} - -StringRef::iterator Scanner::skip_s_space(StringRef::iterator Position) { - if (Position == End) - return Position; - if (*Position == ' ') - return Position + 1; - return Position; -} - -StringRef::iterator Scanner::skip_s_white(StringRef::iterator Position) { - if (Position == End) - return Position; - if (*Position == ' ' || *Position == '\t') - return Position + 1; - return Position; -} - -StringRef::iterator Scanner::skip_ns_char(StringRef::iterator Position) { - if (Position == End) - return Position; - if (*Position == ' ' || *Position == '\t') - return Position; - return skip_nb_char(Position); -} - -StringRef::iterator Scanner::skip_while( SkipWhileFunc Func - , StringRef::iterator Position) { - while (true) { - StringRef::iterator i = (this->*Func)(Position); - if (i == Position) - break; - Position = i; - } - return Position; -} - -void Scanner::advanceWhile(SkipWhileFunc Func) { - auto Final = skip_while(Func, Current); - Column += Final - Current; - Current = Final; -} - -static bool is_ns_hex_digit(const char C) { - return (C >= '0' && C <= '9') - || (C >= 'a' && C <= 'z') - || (C >= 'A' && C <= 'Z'); -} - -static bool is_ns_word_char(const char C) { - return C == '-' - || (C >= 'a' && C <= 'z') - || (C >= 'A' && C <= 'Z'); -} - -void Scanner::scan_ns_uri_char() { - while (true) { - if (Current == End) - break; - if (( *Current == '%' - && Current + 2 < End - && is_ns_hex_digit(*(Current + 1)) - && is_ns_hex_digit(*(Current + 2))) - || is_ns_word_char(*Current) - || StringRef(Current, 1).find_first_of("#;/?:@&=+$,_.!~*'()[]") - != StringRef::npos) { - ++Current; - ++Column; - } else - break; - } -} - -bool Scanner::consume(uint32_t Expected) { - if (Expected >= 0x80) - report_fatal_error("Not dealing with this yet"); - if (Current == End) - return false; - if (uint8_t(*Current) >= 0x80) - report_fatal_error("Not dealing with this yet"); - if (uint8_t(*Current) == Expected) { - ++Current; - ++Column; - return true; - } - return false; -} - -void Scanner::skip(uint32_t Distance) { - Current += Distance; - Column += Distance; - assert(Current <= End && "Skipped past the end"); -} - -bool Scanner::isBlankOrBreak(StringRef::iterator Position) { - if (Position == End) - return false; - return *Position == ' ' || *Position == '\t' || *Position == '\r' || - *Position == '\n'; -} - -bool Scanner::consumeLineBreakIfPresent() { - auto Next = skip_b_break(Current); - if (Next == Current) - return false; - Column = 0; - ++Line; - Current = Next; - return true; -} - -void Scanner::saveSimpleKeyCandidate( TokenQueueT::iterator Tok - , unsigned AtColumn - , bool IsRequired) { - if (IsSimpleKeyAllowed) { - SimpleKey SK; - SK.Tok = Tok; - SK.Line = Line; - SK.Column = AtColumn; - SK.IsRequired = IsRequired; - SK.FlowLevel = FlowLevel; - SimpleKeys.push_back(SK); - } -} - -void Scanner::removeStaleSimpleKeyCandidates() { - for (SmallVectorImpl<SimpleKey>::iterator i = SimpleKeys.begin(); - i != SimpleKeys.end();) { - if (i->Line != Line || i->Column + 1024 < Column) { - if (i->IsRequired) - setError( "Could not find expected : for simple key" - , i->Tok->Range.begin()); - i = SimpleKeys.erase(i); - } else - ++i; - } -} - -void Scanner::removeSimpleKeyCandidatesOnFlowLevel(unsigned Level) { - if (!SimpleKeys.empty() && (SimpleKeys.end() - 1)->FlowLevel == Level) - SimpleKeys.pop_back(); -} - -bool Scanner::unrollIndent(int ToColumn) { - Token T; - // Indentation is ignored in flow. - if (FlowLevel != 0) - return true; - - while (Indent > ToColumn) { - T.Kind = Token::TK_BlockEnd; - T.Range = StringRef(Current, 1); - TokenQueue.push_back(T); - Indent = Indents.pop_back_val(); - } - - return true; -} - -bool Scanner::rollIndent( int ToColumn - , Token::TokenKind Kind - , TokenQueueT::iterator InsertPoint) { - if (FlowLevel) - return true; - if (Indent < ToColumn) { - Indents.push_back(Indent); - Indent = ToColumn; - - Token T; - T.Kind = Kind; - T.Range = StringRef(Current, 0); - TokenQueue.insert(InsertPoint, T); - } - return true; -} - -void Scanner::skipComment() { - if (*Current != '#') - return; - while (true) { - // This may skip more than one byte, thus Column is only incremented - // for code points. - StringRef::iterator I = skip_nb_char(Current); - if (I == Current) - break; - Current = I; - ++Column; - } -} - -void Scanner::scanToNextToken() { - while (true) { - while (*Current == ' ' || *Current == '\t') { - skip(1); - } - - skipComment(); - - // Skip EOL. - StringRef::iterator i = skip_b_break(Current); - if (i == Current) - break; - Current = i; - ++Line; - Column = 0; - // New lines may start a simple key. - if (!FlowLevel) - IsSimpleKeyAllowed = true; - } -} - -bool Scanner::scanStreamStart() { - IsStartOfStream = false; - - EncodingInfo EI = getUnicodeEncoding(currentInput()); - - Token T; - T.Kind = Token::TK_StreamStart; - T.Range = StringRef(Current, EI.second); - TokenQueue.push_back(T); - Current += EI.second; - return true; -} - -bool Scanner::scanStreamEnd() { - // Force an ending new line if one isn't present. - if (Column != 0) { - Column = 0; - ++Line; - } - - unrollIndent(-1); - SimpleKeys.clear(); - IsSimpleKeyAllowed = false; - - Token T; - T.Kind = Token::TK_StreamEnd; - T.Range = StringRef(Current, 0); - TokenQueue.push_back(T); - return true; -} - -bool Scanner::scanDirective() { - // Reset the indentation level. - unrollIndent(-1); - SimpleKeys.clear(); - IsSimpleKeyAllowed = false; - - StringRef::iterator Start = Current; - consume('%'); - StringRef::iterator NameStart = Current; - Current = skip_while(&Scanner::skip_ns_char, Current); - StringRef Name(NameStart, Current - NameStart); - Current = skip_while(&Scanner::skip_s_white, Current); - - Token T; - if (Name == "YAML") { - Current = skip_while(&Scanner::skip_ns_char, Current); - T.Kind = Token::TK_VersionDirective; - T.Range = StringRef(Start, Current - Start); - TokenQueue.push_back(T); - return true; - } else if(Name == "TAG") { - Current = skip_while(&Scanner::skip_ns_char, Current); - Current = skip_while(&Scanner::skip_s_white, Current); - Current = skip_while(&Scanner::skip_ns_char, Current); - T.Kind = Token::TK_TagDirective; - T.Range = StringRef(Start, Current - Start); - TokenQueue.push_back(T); - return true; - } - return false; -} - -bool Scanner::scanDocumentIndicator(bool IsStart) { - unrollIndent(-1); - SimpleKeys.clear(); - IsSimpleKeyAllowed = false; - - Token T; - T.Kind = IsStart ? Token::TK_DocumentStart : Token::TK_DocumentEnd; - T.Range = StringRef(Current, 3); - skip(3); - TokenQueue.push_back(T); - return true; -} - -bool Scanner::scanFlowCollectionStart(bool IsSequence) { - Token T; - T.Kind = IsSequence ? Token::TK_FlowSequenceStart - : Token::TK_FlowMappingStart; - T.Range = StringRef(Current, 1); - skip(1); - TokenQueue.push_back(T); - - // [ and { may begin a simple key. - saveSimpleKeyCandidate(--TokenQueue.end(), Column - 1, false); - - // And may also be followed by a simple key. - IsSimpleKeyAllowed = true; - ++FlowLevel; - return true; -} - -bool Scanner::scanFlowCollectionEnd(bool IsSequence) { - removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); - IsSimpleKeyAllowed = false; - Token T; - T.Kind = IsSequence ? Token::TK_FlowSequenceEnd - : Token::TK_FlowMappingEnd; - T.Range = StringRef(Current, 1); - skip(1); - TokenQueue.push_back(T); - if (FlowLevel) - --FlowLevel; - return true; -} - -bool Scanner::scanFlowEntry() { - removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); - IsSimpleKeyAllowed = true; - Token T; - T.Kind = Token::TK_FlowEntry; - T.Range = StringRef(Current, 1); - skip(1); - TokenQueue.push_back(T); - return true; -} - -bool Scanner::scanBlockEntry() { - rollIndent(Column, Token::TK_BlockSequenceStart, TokenQueue.end()); - removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); - IsSimpleKeyAllowed = true; - Token T; - T.Kind = Token::TK_BlockEntry; - T.Range = StringRef(Current, 1); - skip(1); - TokenQueue.push_back(T); - return true; -} - -bool Scanner::scanKey() { - if (!FlowLevel) - rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end()); - - removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); - IsSimpleKeyAllowed = !FlowLevel; - - Token T; - T.Kind = Token::TK_Key; - T.Range = StringRef(Current, 1); - skip(1); - TokenQueue.push_back(T); - return true; -} - -bool Scanner::scanValue() { - // If the previous token could have been a simple key, insert the key token - // into the token queue. - if (!SimpleKeys.empty()) { - SimpleKey SK = SimpleKeys.pop_back_val(); - Token T; - T.Kind = Token::TK_Key; - T.Range = SK.Tok->Range; - TokenQueueT::iterator i, e; - for (i = TokenQueue.begin(), e = TokenQueue.end(); i != e; ++i) { - if (i == SK.Tok) - break; - } - assert(i != e && "SimpleKey not in token queue!"); - i = TokenQueue.insert(i, T); - - // We may also need to add a Block-Mapping-Start token. - rollIndent(SK.Column, Token::TK_BlockMappingStart, i); - - IsSimpleKeyAllowed = false; - } else { - if (!FlowLevel) - rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end()); - IsSimpleKeyAllowed = !FlowLevel; - } - - Token T; - T.Kind = Token::TK_Value; - T.Range = StringRef(Current, 1); - skip(1); - TokenQueue.push_back(T); - return true; -} - -// Forbidding inlining improves performance by roughly 20%. -// FIXME: Remove once llvm optimizes this to the faster version without hints. -LLVM_ATTRIBUTE_NOINLINE static bool -wasEscaped(StringRef::iterator First, StringRef::iterator Position); - -// Returns whether a character at 'Position' was escaped with a leading '\'. -// 'First' specifies the position of the first character in the string. -static bool wasEscaped(StringRef::iterator First, - StringRef::iterator Position) { - assert(Position - 1 >= First); - StringRef::iterator I = Position - 1; - // We calculate the number of consecutive '\'s before the current position - // by iterating backwards through our string. - while (I >= First && *I == '\\') --I; - // (Position - 1 - I) now contains the number of '\'s before the current - // position. If it is odd, the character at 'Position' was escaped. - return (Position - 1 - I) % 2 == 1; -} - -bool Scanner::scanFlowScalar(bool IsDoubleQuoted) { - StringRef::iterator Start = Current; - unsigned ColStart = Column; - if (IsDoubleQuoted) { - do { - ++Current; - while (Current != End && *Current != '"') - ++Current; - // Repeat until the previous character was not a '\' or was an escaped - // backslash. - } while ( Current != End - && *(Current - 1) == '\\' - && wasEscaped(Start + 1, Current)); - } else { - skip(1); - while (true) { - // Skip a ' followed by another '. - if (Current + 1 < End && *Current == '\'' && *(Current + 1) == '\'') { - skip(2); - continue; - } else if (*Current == '\'') - break; - StringRef::iterator i = skip_nb_char(Current); - if (i == Current) { - i = skip_b_break(Current); - if (i == Current) - break; - Current = i; - Column = 0; - ++Line; - } else { - if (i == End) - break; - Current = i; - ++Column; - } - } - } - - if (Current == End) { - setError("Expected quote at end of scalar", Current); - return false; - } - - skip(1); // Skip ending quote. - Token T; - T.Kind = Token::TK_Scalar; - T.Range = StringRef(Start, Current - Start); - TokenQueue.push_back(T); - - saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false); - - IsSimpleKeyAllowed = false; - - return true; -} - -bool Scanner::scanPlainScalar() { - StringRef::iterator Start = Current; - unsigned ColStart = Column; - unsigned LeadingBlanks = 0; - assert(Indent >= -1 && "Indent must be >= -1 !"); - unsigned indent = static_cast<unsigned>(Indent + 1); - while (true) { - if (*Current == '#') - break; - - while (!isBlankOrBreak(Current)) { - if ( FlowLevel && *Current == ':' - && !(isBlankOrBreak(Current + 1) || *(Current + 1) == ',')) { - setError("Found unexpected ':' while scanning a plain scalar", Current); - return false; - } - - // Check for the end of the plain scalar. - if ( (*Current == ':' && isBlankOrBreak(Current + 1)) - || ( FlowLevel - && (StringRef(Current, 1).find_first_of(",:?[]{}") - != StringRef::npos))) - break; - - StringRef::iterator i = skip_nb_char(Current); - if (i == Current) - break; - Current = i; - ++Column; - } - - // Are we at the end? - if (!isBlankOrBreak(Current)) - break; - - // Eat blanks. - StringRef::iterator Tmp = Current; - while (isBlankOrBreak(Tmp)) { - StringRef::iterator i = skip_s_white(Tmp); - if (i != Tmp) { - if (LeadingBlanks && (Column < indent) && *Tmp == '\t') { - setError("Found invalid tab character in indentation", Tmp); - return false; - } - Tmp = i; - ++Column; - } else { - i = skip_b_break(Tmp); - if (!LeadingBlanks) - LeadingBlanks = 1; - Tmp = i; - Column = 0; - ++Line; - } - } - - if (!FlowLevel && Column < indent) - break; - - Current = Tmp; - } - if (Start == Current) { - setError("Got empty plain scalar", Start); - return false; - } - Token T; - T.Kind = Token::TK_Scalar; - T.Range = StringRef(Start, Current - Start); - TokenQueue.push_back(T); - - // Plain scalars can be simple keys. - saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false); - - IsSimpleKeyAllowed = false; - - return true; -} - -bool Scanner::scanAliasOrAnchor(bool IsAlias) { - StringRef::iterator Start = Current; - unsigned ColStart = Column; - skip(1); - while(true) { - if ( *Current == '[' || *Current == ']' - || *Current == '{' || *Current == '}' - || *Current == ',' - || *Current == ':') - break; - StringRef::iterator i = skip_ns_char(Current); - if (i == Current) - break; - Current = i; - ++Column; - } - - if (Start == Current) { - setError("Got empty alias or anchor", Start); - return false; - } - - Token T; - T.Kind = IsAlias ? Token::TK_Alias : Token::TK_Anchor; - T.Range = StringRef(Start, Current - Start); - TokenQueue.push_back(T); - - // Alias and anchors can be simple keys. - saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false); - - IsSimpleKeyAllowed = false; - - return true; -} - -char Scanner::scanBlockChompingIndicator() { - char Indicator = ' '; - if (Current != End && (*Current == '+' || *Current == '-')) { - Indicator = *Current; - skip(1); - } - return Indicator; -} - -/// Get the number of line breaks after chomping. -/// -/// Return the number of trailing line breaks to emit, depending on -/// \p ChompingIndicator. -static unsigned getChompedLineBreaks(char ChompingIndicator, - unsigned LineBreaks, StringRef Str) { - if (ChompingIndicator == '-') // Strip all line breaks. - return 0; - if (ChompingIndicator == '+') // Keep all line breaks. - return LineBreaks; - // Clip trailing lines. - return Str.empty() ? 0 : 1; -} - -unsigned Scanner::scanBlockIndentationIndicator() { - unsigned Indent = 0; - if (Current != End && (*Current >= '1' && *Current <= '9')) { - Indent = unsigned(*Current - '0'); - skip(1); - } - return Indent; -} - -bool Scanner::scanBlockScalarHeader(char &ChompingIndicator, - unsigned &IndentIndicator, bool &IsDone) { - auto Start = Current; - - ChompingIndicator = scanBlockChompingIndicator(); - IndentIndicator = scanBlockIndentationIndicator(); - // Check for the chomping indicator once again. - if (ChompingIndicator == ' ') - ChompingIndicator = scanBlockChompingIndicator(); - Current = skip_while(&Scanner::skip_s_white, Current); - skipComment(); - - if (Current == End) { // EOF, we have an empty scalar. - Token T; - T.Kind = Token::TK_BlockScalar; - T.Range = StringRef(Start, Current - Start); - TokenQueue.push_back(T); - IsDone = true; - return true; - } - - if (!consumeLineBreakIfPresent()) { - setError("Expected a line break after block scalar header", Current); - return false; - } - return true; -} - -bool Scanner::findBlockScalarIndent(unsigned &BlockIndent, - unsigned BlockExitIndent, - unsigned &LineBreaks, bool &IsDone) { - unsigned MaxAllSpaceLineCharacters = 0; - StringRef::iterator LongestAllSpaceLine; - - while (true) { - advanceWhile(&Scanner::skip_s_space); - if (skip_nb_char(Current) != Current) { - // This line isn't empty, so try and find the indentation. - if (Column <= BlockExitIndent) { // End of the block literal. - IsDone = true; - return true; - } - // We found the block's indentation. - BlockIndent = Column; - if (MaxAllSpaceLineCharacters > BlockIndent) { - setError( - "Leading all-spaces line must be smaller than the block indent", - LongestAllSpaceLine); - return false; - } - return true; - } - if (skip_b_break(Current) != Current && - Column > MaxAllSpaceLineCharacters) { - // Record the longest all-space line in case it's longer than the - // discovered block indent. - MaxAllSpaceLineCharacters = Column; - LongestAllSpaceLine = Current; - } - - // Check for EOF. - if (Current == End) { - IsDone = true; - return true; - } - - if (!consumeLineBreakIfPresent()) { - IsDone = true; - return true; - } - ++LineBreaks; - } - return true; -} - -bool Scanner::scanBlockScalarIndent(unsigned BlockIndent, - unsigned BlockExitIndent, bool &IsDone) { - // Skip the indentation. - while (Column < BlockIndent) { - auto I = skip_s_space(Current); - if (I == Current) - break; - Current = I; - ++Column; - } - - if (skip_nb_char(Current) == Current) - return true; - - if (Column <= BlockExitIndent) { // End of the block literal. - IsDone = true; - return true; - } - - if (Column < BlockIndent) { - if (Current != End && *Current == '#') { // Trailing comment. - IsDone = true; - return true; - } - setError("A text line is less indented than the block scalar", Current); - return false; - } - return true; // A normal text line. -} - -bool Scanner::scanBlockScalar(bool IsLiteral) { - // Eat '|' or '>' - assert(*Current == '|' || *Current == '>'); - skip(1); - - char ChompingIndicator; - unsigned BlockIndent; - bool IsDone = false; - if (!scanBlockScalarHeader(ChompingIndicator, BlockIndent, IsDone)) - return false; - if (IsDone) - return true; - - auto Start = Current; - unsigned BlockExitIndent = Indent < 0 ? 0 : (unsigned)Indent; - unsigned LineBreaks = 0; - if (BlockIndent == 0) { - if (!findBlockScalarIndent(BlockIndent, BlockExitIndent, LineBreaks, - IsDone)) - return false; - } - - // Scan the block's scalars body. - SmallString<256> Str; - while (!IsDone) { - if (!scanBlockScalarIndent(BlockIndent, BlockExitIndent, IsDone)) - return false; - if (IsDone) - break; - - // Parse the current line. - auto LineStart = Current; - advanceWhile(&Scanner::skip_nb_char); - if (LineStart != Current) { - Str.append(LineBreaks, '\n'); - Str.append(StringRef(LineStart, Current - LineStart)); - LineBreaks = 0; - } - - // Check for EOF. - if (Current == End) - break; - - if (!consumeLineBreakIfPresent()) - break; - ++LineBreaks; - } - - if (Current == End && !LineBreaks) - // Ensure that there is at least one line break before the end of file. - LineBreaks = 1; - Str.append(getChompedLineBreaks(ChompingIndicator, LineBreaks, Str), '\n'); - - // New lines may start a simple key. - if (!FlowLevel) - IsSimpleKeyAllowed = true; - - Token T; - T.Kind = Token::TK_BlockScalar; - T.Range = StringRef(Start, Current - Start); - T.Value = Str.str().str(); - TokenQueue.push_back(T); - return true; -} - -bool Scanner::scanTag() { - StringRef::iterator Start = Current; - unsigned ColStart = Column; - skip(1); // Eat !. - if (Current == End || isBlankOrBreak(Current)); // An empty tag. - else if (*Current == '<') { - skip(1); - scan_ns_uri_char(); - if (!consume('>')) - return false; - } else { - // FIXME: Actually parse the c-ns-shorthand-tag rule. - Current = skip_while(&Scanner::skip_ns_char, Current); - } - - Token T; - T.Kind = Token::TK_Tag; - T.Range = StringRef(Start, Current - Start); - TokenQueue.push_back(T); - - // Tags can be simple keys. - saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false); - - IsSimpleKeyAllowed = false; - - return true; -} - -bool Scanner::fetchMoreTokens() { - if (IsStartOfStream) - return scanStreamStart(); - - scanToNextToken(); - - if (Current == End) - return scanStreamEnd(); - - removeStaleSimpleKeyCandidates(); - - unrollIndent(Column); - - if (Column == 0 && *Current == '%') - return scanDirective(); - - if (Column == 0 && Current + 4 <= End - && *Current == '-' - && *(Current + 1) == '-' - && *(Current + 2) == '-' - && (Current + 3 == End || isBlankOrBreak(Current + 3))) - return scanDocumentIndicator(true); - - if (Column == 0 && Current + 4 <= End - && *Current == '.' - && *(Current + 1) == '.' - && *(Current + 2) == '.' - && (Current + 3 == End || isBlankOrBreak(Current + 3))) - return scanDocumentIndicator(false); - - if (*Current == '[') - return scanFlowCollectionStart(true); - - if (*Current == '{') - return scanFlowCollectionStart(false); - - if (*Current == ']') - return scanFlowCollectionEnd(true); - - if (*Current == '}') - return scanFlowCollectionEnd(false); - - if (*Current == ',') - return scanFlowEntry(); - - if (*Current == '-' && isBlankOrBreak(Current + 1)) - return scanBlockEntry(); - - if (*Current == '?' && (FlowLevel || isBlankOrBreak(Current + 1))) - return scanKey(); - - if (*Current == ':' && (FlowLevel || isBlankOrBreak(Current + 1))) - return scanValue(); - - if (*Current == '*') - return scanAliasOrAnchor(true); - - if (*Current == '&') - return scanAliasOrAnchor(false); - - if (*Current == '!') - return scanTag(); - - if (*Current == '|' && !FlowLevel) - return scanBlockScalar(true); - - if (*Current == '>' && !FlowLevel) - return scanBlockScalar(false); - - if (*Current == '\'') - return scanFlowScalar(false); - - if (*Current == '"') - return scanFlowScalar(true); - - // Get a plain scalar. - StringRef FirstChar(Current, 1); - if (!(isBlankOrBreak(Current) - || FirstChar.find_first_of("-?:,[]{}#&*!|>'\"%@`") != StringRef::npos) - || (*Current == '-' && !isBlankOrBreak(Current + 1)) - || (!FlowLevel && (*Current == '?' || *Current == ':') - && isBlankOrBreak(Current + 1)) - || (!FlowLevel && *Current == ':' - && Current + 2 < End - && *(Current + 1) == ':' - && !isBlankOrBreak(Current + 2))) - return scanPlainScalar(); - - setError("Unrecognized character while tokenizing."); - return false; -} - -Stream::Stream(StringRef Input, SourceMgr &SM, bool ShowColors, - std::error_code *EC) - : scanner(new Scanner(Input, SM, ShowColors, EC)), CurrentDoc() {} - -Stream::Stream(MemoryBufferRef InputBuffer, SourceMgr &SM, bool ShowColors, - std::error_code *EC) - : scanner(new Scanner(InputBuffer, SM, ShowColors, EC)), CurrentDoc() {} - -Stream::~Stream() = default; - -bool Stream::failed() { return scanner->failed(); } - -void Stream::printError(Node *N, const Twine &Msg) { - scanner->printError( N->getSourceRange().Start - , SourceMgr::DK_Error - , Msg - , N->getSourceRange()); -} - -document_iterator Stream::begin() { - if (CurrentDoc) - report_fatal_error("Can only iterate over the stream once"); - - // Skip Stream-Start. - scanner->getNext(); - - CurrentDoc.reset(new Document(*this)); - return document_iterator(CurrentDoc); -} - -document_iterator Stream::end() { - return document_iterator(); -} - -void Stream::skip() { - for (document_iterator i = begin(), e = end(); i != e; ++i) - i->skip(); -} - -Node::Node(unsigned int Type, std::unique_ptr<Document> &D, StringRef A, - StringRef T) - : Doc(D), TypeID(Type), Anchor(A), Tag(T) { - SMLoc Start = SMLoc::getFromPointer(peekNext().Range.begin()); - SourceRange = SMRange(Start, Start); -} - -std::string Node::getVerbatimTag() const { - StringRef Raw = getRawTag(); - if (!Raw.empty() && Raw != "!") { - std::string Ret; - if (Raw.find_last_of('!') == 0) { - Ret = Doc->getTagMap().find("!")->second; - Ret += Raw.substr(1); - return Ret; - } else if (Raw.startswith("!!")) { - Ret = Doc->getTagMap().find("!!")->second; - Ret += Raw.substr(2); - return Ret; - } else { - StringRef TagHandle = Raw.substr(0, Raw.find_last_of('!') + 1); - std::map<StringRef, StringRef>::const_iterator It = - Doc->getTagMap().find(TagHandle); - if (It != Doc->getTagMap().end()) - Ret = It->second; - else { - Token T; - T.Kind = Token::TK_Tag; - T.Range = TagHandle; - setError(Twine("Unknown tag handle ") + TagHandle, T); - } - Ret += Raw.substr(Raw.find_last_of('!') + 1); - return Ret; - } - } - - switch (getType()) { - case NK_Null: - return "tag:yaml.org,2002:null"; - case NK_Scalar: - case NK_BlockScalar: - // TODO: Tag resolution. - return "tag:yaml.org,2002:str"; - case NK_Mapping: - return "tag:yaml.org,2002:map"; - case NK_Sequence: - return "tag:yaml.org,2002:seq"; - } - - return ""; -} - -Token &Node::peekNext() { - return Doc->peekNext(); -} - -Token Node::getNext() { - return Doc->getNext(); -} - -Node *Node::parseBlockNode() { - return Doc->parseBlockNode(); -} - -BumpPtrAllocator &Node::getAllocator() { - return Doc->NodeAllocator; -} - -void Node::setError(const Twine &Msg, Token &Tok) const { - Doc->setError(Msg, Tok); -} - -bool Node::failed() const { - return Doc->failed(); -} - -StringRef ScalarNode::getValue(SmallVectorImpl<char> &Storage) const { - // TODO: Handle newlines properly. We need to remove leading whitespace. - if (Value[0] == '"') { // Double quoted. - // Pull off the leading and trailing "s. - StringRef UnquotedValue = Value.substr(1, Value.size() - 2); - // Search for characters that would require unescaping the value. - StringRef::size_type i = UnquotedValue.find_first_of("\\\r\n"); - if (i != StringRef::npos) - return unescapeDoubleQuoted(UnquotedValue, i, Storage); - return UnquotedValue; - } else if (Value[0] == '\'') { // Single quoted. - // Pull off the leading and trailing 's. - StringRef UnquotedValue = Value.substr(1, Value.size() - 2); - StringRef::size_type i = UnquotedValue.find('\''); - if (i != StringRef::npos) { - // We're going to need Storage. - Storage.clear(); - Storage.reserve(UnquotedValue.size()); - for (; i != StringRef::npos; i = UnquotedValue.find('\'')) { - StringRef Valid(UnquotedValue.begin(), i); - Storage.insert(Storage.end(), Valid.begin(), Valid.end()); - Storage.push_back('\''); - UnquotedValue = UnquotedValue.substr(i + 2); - } - Storage.insert(Storage.end(), UnquotedValue.begin(), UnquotedValue.end()); - return StringRef(Storage.begin(), Storage.size()); - } - return UnquotedValue; - } - // Plain or block. - return Value.rtrim(' '); -} - -StringRef ScalarNode::unescapeDoubleQuoted( StringRef UnquotedValue - , StringRef::size_type i - , SmallVectorImpl<char> &Storage) - const { - // Use Storage to build proper value. - Storage.clear(); - Storage.reserve(UnquotedValue.size()); - for (; i != StringRef::npos; i = UnquotedValue.find_first_of("\\\r\n")) { - // Insert all previous chars into Storage. - StringRef Valid(UnquotedValue.begin(), i); - Storage.insert(Storage.end(), Valid.begin(), Valid.end()); - // Chop off inserted chars. - UnquotedValue = UnquotedValue.substr(i); - - assert(!UnquotedValue.empty() && "Can't be empty!"); - - // Parse escape or line break. - switch (UnquotedValue[0]) { - case '\r': - case '\n': - Storage.push_back('\n'); - if ( UnquotedValue.size() > 1 - && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n')) - UnquotedValue = UnquotedValue.substr(1); - UnquotedValue = UnquotedValue.substr(1); - break; - default: - if (UnquotedValue.size() == 1) - // TODO: Report error. - break; - UnquotedValue = UnquotedValue.substr(1); - switch (UnquotedValue[0]) { - default: { - Token T; - T.Range = StringRef(UnquotedValue.begin(), 1); - setError("Unrecognized escape code!", T); - return ""; - } - case '\r': - case '\n': - // Remove the new line. - if ( UnquotedValue.size() > 1 - && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n')) - UnquotedValue = UnquotedValue.substr(1); - // If this was just a single byte newline, it will get skipped - // below. - break; - case '0': - Storage.push_back(0x00); - break; - case 'a': - Storage.push_back(0x07); - break; - case 'b': - Storage.push_back(0x08); - break; - case 't': - case 0x09: - Storage.push_back(0x09); - break; - case 'n': - Storage.push_back(0x0A); - break; - case 'v': - Storage.push_back(0x0B); - break; - case 'f': - Storage.push_back(0x0C); - break; - case 'r': - Storage.push_back(0x0D); - break; - case 'e': - Storage.push_back(0x1B); - break; - case ' ': - Storage.push_back(0x20); - break; - case '"': - Storage.push_back(0x22); - break; - case '/': - Storage.push_back(0x2F); - break; - case '\\': - Storage.push_back(0x5C); - break; - case 'N': - encodeUTF8(0x85, Storage); - break; - case '_': - encodeUTF8(0xA0, Storage); - break; - case 'L': - encodeUTF8(0x2028, Storage); - break; - case 'P': - encodeUTF8(0x2029, Storage); - break; - case 'x': { - if (UnquotedValue.size() < 3) - // TODO: Report error. - break; - unsigned int UnicodeScalarValue; - if (UnquotedValue.substr(1, 2).getAsInteger(16, UnicodeScalarValue)) - // TODO: Report error. - UnicodeScalarValue = 0xFFFD; - encodeUTF8(UnicodeScalarValue, Storage); - UnquotedValue = UnquotedValue.substr(2); - break; - } - case 'u': { - if (UnquotedValue.size() < 5) - // TODO: Report error. - break; - unsigned int UnicodeScalarValue; - if (UnquotedValue.substr(1, 4).getAsInteger(16, UnicodeScalarValue)) - // TODO: Report error. - UnicodeScalarValue = 0xFFFD; - encodeUTF8(UnicodeScalarValue, Storage); - UnquotedValue = UnquotedValue.substr(4); - break; - } - case 'U': { - if (UnquotedValue.size() < 9) - // TODO: Report error. - break; - unsigned int UnicodeScalarValue; - if (UnquotedValue.substr(1, 8).getAsInteger(16, UnicodeScalarValue)) - // TODO: Report error. - UnicodeScalarValue = 0xFFFD; - encodeUTF8(UnicodeScalarValue, Storage); - UnquotedValue = UnquotedValue.substr(8); - break; - } - } - UnquotedValue = UnquotedValue.substr(1); - } - } - Storage.insert(Storage.end(), UnquotedValue.begin(), UnquotedValue.end()); - return StringRef(Storage.begin(), Storage.size()); -} - -Node *KeyValueNode::getKey() { - if (Key) - return Key; - // Handle implicit null keys. - { - Token &t = peekNext(); - if ( t.Kind == Token::TK_BlockEnd - || t.Kind == Token::TK_Value - || t.Kind == Token::TK_Error) { - return Key = new (getAllocator()) NullNode(Doc); - } - if (t.Kind == Token::TK_Key) - getNext(); // skip TK_Key. - } - - // Handle explicit null keys. - Token &t = peekNext(); - if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Value) { - return Key = new (getAllocator()) NullNode(Doc); - } - - // We've got a normal key. - return Key = parseBlockNode(); -} - -Node *KeyValueNode::getValue() { - if (Value) - return Value; - getKey()->skip(); - if (failed()) - return Value = new (getAllocator()) NullNode(Doc); - - // Handle implicit null values. - { - Token &t = peekNext(); - if ( t.Kind == Token::TK_BlockEnd - || t.Kind == Token::TK_FlowMappingEnd - || t.Kind == Token::TK_Key - || t.Kind == Token::TK_FlowEntry - || t.Kind == Token::TK_Error) { - return Value = new (getAllocator()) NullNode(Doc); - } - - if (t.Kind != Token::TK_Value) { - setError("Unexpected token in Key Value.", t); - return Value = new (getAllocator()) NullNode(Doc); - } - getNext(); // skip TK_Value. - } - - // Handle explicit null values. - Token &t = peekNext(); - if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Key) { - return Value = new (getAllocator()) NullNode(Doc); - } - - // We got a normal value. - return Value = parseBlockNode(); -} - -void MappingNode::increment() { - if (failed()) { - IsAtEnd = true; - CurrentEntry = nullptr; - return; - } - if (CurrentEntry) { - CurrentEntry->skip(); - if (Type == MT_Inline) { - IsAtEnd = true; - CurrentEntry = nullptr; - return; - } - } - Token T = peekNext(); - if (T.Kind == Token::TK_Key || T.Kind == Token::TK_Scalar) { - // KeyValueNode eats the TK_Key. That way it can detect null keys. - CurrentEntry = new (getAllocator()) KeyValueNode(Doc); - } else if (Type == MT_Block) { - switch (T.Kind) { - case Token::TK_BlockEnd: - getNext(); - IsAtEnd = true; - CurrentEntry = nullptr; - break; - default: - setError("Unexpected token. Expected Key or Block End", T); - LLVM_FALLTHROUGH; - case Token::TK_Error: - IsAtEnd = true; - CurrentEntry = nullptr; - } - } else { - switch (T.Kind) { - case Token::TK_FlowEntry: - // Eat the flow entry and recurse. - getNext(); - return increment(); - case Token::TK_FlowMappingEnd: - getNext(); - LLVM_FALLTHROUGH; - case Token::TK_Error: - // Set this to end iterator. - IsAtEnd = true; - CurrentEntry = nullptr; - break; - default: - setError( "Unexpected token. Expected Key, Flow Entry, or Flow " - "Mapping End." - , T); - IsAtEnd = true; - CurrentEntry = nullptr; - } - } -} - -void SequenceNode::increment() { - if (failed()) { - IsAtEnd = true; - CurrentEntry = nullptr; - return; - } - if (CurrentEntry) - CurrentEntry->skip(); - Token T = peekNext(); - if (SeqType == ST_Block) { - switch (T.Kind) { - case Token::TK_BlockEntry: - getNext(); - CurrentEntry = parseBlockNode(); - if (!CurrentEntry) { // An error occurred. - IsAtEnd = true; - CurrentEntry = nullptr; - } - break; - case Token::TK_BlockEnd: - getNext(); - IsAtEnd = true; - CurrentEntry = nullptr; - break; - default: - setError( "Unexpected token. Expected Block Entry or Block End." - , T); - LLVM_FALLTHROUGH; - case Token::TK_Error: - IsAtEnd = true; - CurrentEntry = nullptr; - } - } else if (SeqType == ST_Indentless) { - switch (T.Kind) { - case Token::TK_BlockEntry: - getNext(); - CurrentEntry = parseBlockNode(); - if (!CurrentEntry) { // An error occurred. - IsAtEnd = true; - CurrentEntry = nullptr; - } - break; - default: - case Token::TK_Error: - IsAtEnd = true; - CurrentEntry = nullptr; - } - } else if (SeqType == ST_Flow) { - switch (T.Kind) { - case Token::TK_FlowEntry: - // Eat the flow entry and recurse. - getNext(); - WasPreviousTokenFlowEntry = true; - return increment(); - case Token::TK_FlowSequenceEnd: - getNext(); - LLVM_FALLTHROUGH; - case Token::TK_Error: - // Set this to end iterator. - IsAtEnd = true; - CurrentEntry = nullptr; - break; - case Token::TK_StreamEnd: - case Token::TK_DocumentEnd: - case Token::TK_DocumentStart: - setError("Could not find closing ]!", T); - // Set this to end iterator. - IsAtEnd = true; - CurrentEntry = nullptr; - break; - default: - if (!WasPreviousTokenFlowEntry) { - setError("Expected , between entries!", T); - IsAtEnd = true; - CurrentEntry = nullptr; - break; - } - // Otherwise it must be a flow entry. - CurrentEntry = parseBlockNode(); - if (!CurrentEntry) { - IsAtEnd = true; - } - WasPreviousTokenFlowEntry = false; - break; - } - } -} - -Document::Document(Stream &S) : stream(S), Root(nullptr) { - // Tag maps starts with two default mappings. - TagMap["!"] = "!"; - TagMap["!!"] = "tag:yaml.org,2002:"; - - if (parseDirectives()) - expectToken(Token::TK_DocumentStart); - Token &T = peekNext(); - if (T.Kind == Token::TK_DocumentStart) - getNext(); -} - -bool Document::skip() { - if (stream.scanner->failed()) - return false; - if (!Root) - getRoot(); - Root->skip(); - Token &T = peekNext(); - if (T.Kind == Token::TK_StreamEnd) - return false; - if (T.Kind == Token::TK_DocumentEnd) { - getNext(); - return skip(); - } - return true; -} - -Token &Document::peekNext() { - return stream.scanner->peekNext(); -} - -Token Document::getNext() { - return stream.scanner->getNext(); -} - -void Document::setError(const Twine &Message, Token &Location) const { - stream.scanner->setError(Message, Location.Range.begin()); -} - -bool Document::failed() const { - return stream.scanner->failed(); -} - -Node *Document::parseBlockNode() { - Token T = peekNext(); - // Handle properties. - Token AnchorInfo; - Token TagInfo; -parse_property: - switch (T.Kind) { - case Token::TK_Alias: - getNext(); - return new (NodeAllocator) AliasNode(stream.CurrentDoc, T.Range.substr(1)); - case Token::TK_Anchor: - if (AnchorInfo.Kind == Token::TK_Anchor) { - setError("Already encountered an anchor for this node!", T); - return nullptr; - } - AnchorInfo = getNext(); // Consume TK_Anchor. - T = peekNext(); - goto parse_property; - case Token::TK_Tag: - if (TagInfo.Kind == Token::TK_Tag) { - setError("Already encountered a tag for this node!", T); - return nullptr; - } - TagInfo = getNext(); // Consume TK_Tag. - T = peekNext(); - goto parse_property; - default: - break; - } - - switch (T.Kind) { - case Token::TK_BlockEntry: - // We got an unindented BlockEntry sequence. This is not terminated with - // a BlockEnd. - // Don't eat the TK_BlockEntry, SequenceNode needs it. - return new (NodeAllocator) SequenceNode( stream.CurrentDoc - , AnchorInfo.Range.substr(1) - , TagInfo.Range - , SequenceNode::ST_Indentless); - case Token::TK_BlockSequenceStart: - getNext(); - return new (NodeAllocator) - SequenceNode( stream.CurrentDoc - , AnchorInfo.Range.substr(1) - , TagInfo.Range - , SequenceNode::ST_Block); - case Token::TK_BlockMappingStart: - getNext(); - return new (NodeAllocator) - MappingNode( stream.CurrentDoc - , AnchorInfo.Range.substr(1) - , TagInfo.Range - , MappingNode::MT_Block); - case Token::TK_FlowSequenceStart: - getNext(); - return new (NodeAllocator) - SequenceNode( stream.CurrentDoc - , AnchorInfo.Range.substr(1) - , TagInfo.Range - , SequenceNode::ST_Flow); - case Token::TK_FlowMappingStart: - getNext(); - return new (NodeAllocator) - MappingNode( stream.CurrentDoc - , AnchorInfo.Range.substr(1) - , TagInfo.Range - , MappingNode::MT_Flow); - case Token::TK_Scalar: - getNext(); - return new (NodeAllocator) - ScalarNode( stream.CurrentDoc - , AnchorInfo.Range.substr(1) - , TagInfo.Range - , T.Range); - case Token::TK_BlockScalar: { - getNext(); - StringRef NullTerminatedStr(T.Value.c_str(), T.Value.length() + 1); - StringRef StrCopy = NullTerminatedStr.copy(NodeAllocator).drop_back(); - return new (NodeAllocator) - BlockScalarNode(stream.CurrentDoc, AnchorInfo.Range.substr(1), - TagInfo.Range, StrCopy, T.Range); - } - case Token::TK_Key: - // Don't eat the TK_Key, KeyValueNode expects it. - return new (NodeAllocator) - MappingNode( stream.CurrentDoc - , AnchorInfo.Range.substr(1) - , TagInfo.Range - , MappingNode::MT_Inline); - case Token::TK_DocumentStart: - case Token::TK_DocumentEnd: - case Token::TK_StreamEnd: - default: - // TODO: Properly handle tags. "[!!str ]" should resolve to !!str "", not - // !!null null. - return new (NodeAllocator) NullNode(stream.CurrentDoc); - case Token::TK_Error: - return nullptr; - } - llvm_unreachable("Control flow shouldn't reach here."); - return nullptr; -} - -bool Document::parseDirectives() { - bool isDirective = false; - while (true) { - Token T = peekNext(); - if (T.Kind == Token::TK_TagDirective) { - parseTAGDirective(); - isDirective = true; - } else if (T.Kind == Token::TK_VersionDirective) { - parseYAMLDirective(); - isDirective = true; - } else - break; - } - return isDirective; -} - -void Document::parseYAMLDirective() { - getNext(); // Eat %YAML <version> -} - -void Document::parseTAGDirective() { - Token Tag = getNext(); // %TAG <handle> <prefix> - StringRef T = Tag.Range; - // Strip %TAG - T = T.substr(T.find_first_of(" \t")).ltrim(" \t"); - std::size_t HandleEnd = T.find_first_of(" \t"); - StringRef TagHandle = T.substr(0, HandleEnd); - StringRef TagPrefix = T.substr(HandleEnd).ltrim(" \t"); - TagMap[TagHandle] = TagPrefix; -} - -bool Document::expectToken(int TK) { - Token T = getNext(); - if (T.Kind != TK) { - setError("Unexpected token", T); - return false; - } - return true; -} |
