diff options
author | Dimitry Andric <dim@FreeBSD.org> | 2023-12-18 20:30:12 +0000 |
---|---|---|
committer | Dimitry Andric <dim@FreeBSD.org> | 2024-04-06 20:11:55 +0000 |
commit | 5f757f3ff9144b609b3c433dfd370cc6bdc191ad (patch) | |
tree | 1b4e980b866cd26a00af34c0a653eb640bd09caf /contrib/llvm-project/llvm/lib/Support/YAMLParser.cpp | |
parent | 3e1c8a35f741a5d114d0ba670b15191355711fe9 (diff) | |
parent | 312c0ed19cc5276a17bacf2120097bec4515b0f1 (diff) |
Diffstat (limited to 'contrib/llvm-project/llvm/lib/Support/YAMLParser.cpp')
-rw-r--r-- | contrib/llvm-project/llvm/lib/Support/YAMLParser.cpp | 447 |
1 files changed, 254 insertions, 193 deletions
diff --git a/contrib/llvm-project/llvm/lib/Support/YAMLParser.cpp b/contrib/llvm-project/llvm/lib/Support/YAMLParser.cpp index 6ac2c6aeeb46..fdd0ed6e682e 100644 --- a/contrib/llvm-project/llvm/lib/Support/YAMLParser.cpp +++ b/contrib/llvm-project/llvm/lib/Support/YAMLParser.cpp @@ -392,6 +392,10 @@ private: /// Pos is whitespace or a new line bool isBlankOrBreak(StringRef::iterator Position); + /// Return true if the minimal well-formed code unit subsequence at + /// Pos is considered a "safe" character for plain scalars. + bool isPlainSafeNonBlank(StringRef::iterator Position); + /// Return true if the line is a line break, false otherwise. bool isLineEmpty(StringRef Line); @@ -545,6 +549,10 @@ private: /// Can the next token be the start of a simple key? bool IsSimpleKeyAllowed; + /// Can the next token be a value indicator even if it does not have a + /// trailing space? + bool IsAdjacentValueAllowedInFlow; + /// True if an error has occurred. bool Failed; @@ -868,6 +876,7 @@ void Scanner::init(MemoryBufferRef Buffer) { FlowLevel = 0; IsStartOfStream = true; IsSimpleKeyAllowed = true; + IsAdjacentValueAllowedInFlow = false; Failed = false; std::unique_ptr<MemoryBuffer> InputBufferOwner = MemoryBuffer::getMemBuffer(Buffer, /*RequiresNullTerminator=*/false); @@ -1049,6 +1058,15 @@ bool Scanner::isBlankOrBreak(StringRef::iterator Position) { *Position == '\n'; } +bool Scanner::isPlainSafeNonBlank(StringRef::iterator Position) { + if (Position == End || isBlankOrBreak(Position)) + return false; + if (FlowLevel && + StringRef(Position, 1).find_first_of(",[]{}") != StringRef::npos) + return false; + return true; +} + bool Scanner::isLineEmpty(StringRef Line) { for (const auto *Position = Line.begin(); Position != Line.end(); ++Position) if (!isBlankOrBreak(Position)) @@ -1189,6 +1207,7 @@ bool Scanner::scanStreamEnd() { unrollIndent(-1); SimpleKeys.clear(); IsSimpleKeyAllowed = false; + IsAdjacentValueAllowedInFlow = false; Token T; T.Kind = Token::TK_StreamEnd; @@ -1202,6 +1221,7 @@ bool Scanner::scanDirective() { unrollIndent(-1); SimpleKeys.clear(); IsSimpleKeyAllowed = false; + IsAdjacentValueAllowedInFlow = false; StringRef::iterator Start = Current; consume('%'); @@ -1233,6 +1253,7 @@ bool Scanner::scanDocumentIndicator(bool IsStart) { unrollIndent(-1); SimpleKeys.clear(); IsSimpleKeyAllowed = false; + IsAdjacentValueAllowedInFlow = false; Token T; T.Kind = IsStart ? Token::TK_DocumentStart : Token::TK_DocumentEnd; @@ -1255,6 +1276,8 @@ bool Scanner::scanFlowCollectionStart(bool IsSequence) { // And may also be followed by a simple key. IsSimpleKeyAllowed = true; + // Adjacent values are allowed in flows only after JSON-style keys. + IsAdjacentValueAllowedInFlow = false; ++FlowLevel; return true; } @@ -1262,6 +1285,7 @@ bool Scanner::scanFlowCollectionStart(bool IsSequence) { bool Scanner::scanFlowCollectionEnd(bool IsSequence) { removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); IsSimpleKeyAllowed = false; + IsAdjacentValueAllowedInFlow = true; Token T; T.Kind = IsSequence ? Token::TK_FlowSequenceEnd : Token::TK_FlowMappingEnd; @@ -1276,6 +1300,7 @@ bool Scanner::scanFlowCollectionEnd(bool IsSequence) { bool Scanner::scanFlowEntry() { removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); IsSimpleKeyAllowed = true; + IsAdjacentValueAllowedInFlow = false; Token T; T.Kind = Token::TK_FlowEntry; T.Range = StringRef(Current, 1); @@ -1288,6 +1313,7 @@ bool Scanner::scanBlockEntry() { rollIndent(Column, Token::TK_BlockSequenceStart, TokenQueue.end()); removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); IsSimpleKeyAllowed = true; + IsAdjacentValueAllowedInFlow = false; Token T; T.Kind = Token::TK_BlockEntry; T.Range = StringRef(Current, 1); @@ -1302,6 +1328,7 @@ bool Scanner::scanKey() { removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); IsSimpleKeyAllowed = !FlowLevel; + IsAdjacentValueAllowedInFlow = false; Token T; T.Kind = Token::TK_Key; @@ -1339,6 +1366,7 @@ bool Scanner::scanValue() { rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end()); IsSimpleKeyAllowed = !FlowLevel; } + IsAdjacentValueAllowedInFlow = false; Token T; T.Kind = Token::TK_Value; @@ -1420,6 +1448,7 @@ bool Scanner::scanFlowScalar(bool IsDoubleQuoted) { saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false); IsSimpleKeyAllowed = false; + IsAdjacentValueAllowedInFlow = true; return true; } @@ -1434,21 +1463,9 @@ bool Scanner::scanPlainScalar() { if (*Current == '#') break; - while (Current != End && !isBlankOrBreak(Current)) { - if (FlowLevel && *Current == ':' && - (Current + 1 == End || - !(isBlankOrBreak(Current + 1) || *(Current + 1) == ','))) { - setError("Found unexpected ':' while scanning a plain scalar", Current); - return false; - } - - // Check for the end of the plain scalar. - if ( (*Current == ':' && isBlankOrBreak(Current + 1)) - || ( FlowLevel - && (StringRef(Current, 1).find_first_of(",:?[]{}") - != StringRef::npos))) - break; - + while (Current != End && + ((*Current != ':' && isPlainSafeNonBlank(Current)) || + (*Current == ':' && isPlainSafeNonBlank(Current + 1)))) { StringRef::iterator i = skip_nb_char(Current); if (i == Current) break; @@ -1499,6 +1516,7 @@ bool Scanner::scanPlainScalar() { saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false); IsSimpleKeyAllowed = false; + IsAdjacentValueAllowedInFlow = false; return true; } @@ -1534,6 +1552,7 @@ bool Scanner::scanAliasOrAnchor(bool IsAlias) { saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false); IsSimpleKeyAllowed = false; + IsAdjacentValueAllowedInFlow = false; return true; } @@ -1766,6 +1785,7 @@ bool Scanner::scanBlockScalar(bool IsLiteral) { // New lines may start a simple key. if (!FlowLevel) IsSimpleKeyAllowed = true; + IsAdjacentValueAllowedInFlow = false; Token T; T.Kind = Token::TK_BlockScalar; @@ -1799,6 +1819,7 @@ bool Scanner::scanTag() { saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false); IsSimpleKeyAllowed = false; + IsAdjacentValueAllowedInFlow = false; return true; } @@ -1848,13 +1869,14 @@ bool Scanner::fetchMoreTokens() { if (*Current == ',') return scanFlowEntry(); - if (*Current == '-' && isBlankOrBreak(Current + 1)) + if (*Current == '-' && (isBlankOrBreak(Current + 1) || Current + 1 == End)) return scanBlockEntry(); - if (*Current == '?' && (FlowLevel || isBlankOrBreak(Current + 1))) + if (*Current == '?' && (Current + 1 == End || isBlankOrBreak(Current + 1))) return scanKey(); - if (*Current == ':' && (FlowLevel || isBlankOrBreak(Current + 1))) + if (*Current == ':' && + (!isPlainSafeNonBlank(Current + 1) || IsAdjacentValueAllowedInFlow)) return scanValue(); if (*Current == '*') @@ -1880,15 +1902,10 @@ bool Scanner::fetchMoreTokens() { // Get a plain scalar. StringRef FirstChar(Current, 1); - if (!(isBlankOrBreak(Current) - || FirstChar.find_first_of("-?:,[]{}#&*!|>'\"%@`") != StringRef::npos) - || (*Current == '-' && !isBlankOrBreak(Current + 1)) - || (!FlowLevel && (*Current == '?' || *Current == ':') - && isBlankOrBreak(Current + 1)) - || (!FlowLevel && *Current == ':' - && Current + 2 < End - && *(Current + 1) == ':' - && !isBlankOrBreak(Current + 2))) + if ((!isBlankOrBreak(Current) && + FirstChar.find_first_of("-?:,[]{}#&*!|>'\"%@`") == StringRef::npos) || + (FirstChar.find_first_of("?:-") != StringRef::npos && + isPlainSafeNonBlank(Current + 1))) return scanPlainScalar(); setError("Unrecognized character while tokenizing.", Current); @@ -1951,7 +1968,7 @@ std::string Node::getVerbatimTag() const { Ret = std::string(Doc->getTagMap().find("!")->second); Ret += Raw.substr(1); return Ret; - } else if (Raw.startswith("!!")) { + } else if (Raw.starts_with("!!")) { Ret = std::string(Doc->getTagMap().find("!!")->second); Ret += Raw.substr(2); return Ret; @@ -2013,185 +2030,229 @@ bool Node::failed() const { } StringRef ScalarNode::getValue(SmallVectorImpl<char> &Storage) const { - // TODO: Handle newlines properly. We need to remove leading whitespace. - if (Value[0] == '"') { // Double quoted. - // Pull off the leading and trailing "s. - StringRef UnquotedValue = Value.substr(1, Value.size() - 2); - // Search for characters that would require unescaping the value. - StringRef::size_type i = UnquotedValue.find_first_of("\\\r\n"); - if (i != StringRef::npos) - return unescapeDoubleQuoted(UnquotedValue, i, Storage); + if (Value[0] == '"') + return getDoubleQuotedValue(Value, Storage); + if (Value[0] == '\'') + return getSingleQuotedValue(Value, Storage); + return getPlainValue(Value, Storage); +} + +/// parseScalarValue - A common parsing routine for all flow scalar styles. +/// It handles line break characters by itself, adds regular content characters +/// to the result, and forwards escaped sequences to the provided routine for +/// the style-specific processing. +/// +/// \param UnquotedValue - An input value without quotation marks. +/// \param Storage - A storage for the result if the input value is multiline or +/// contains escaped characters. +/// \param LookupChars - A set of special characters to search in the input +/// string. Should include line break characters and the escape character +/// specific for the processing scalar style, if any. +/// \param UnescapeCallback - This is called when the escape character is found +/// in the input. +/// \returns - The unfolded and unescaped value. +static StringRef +parseScalarValue(StringRef UnquotedValue, SmallVectorImpl<char> &Storage, + StringRef LookupChars, + std::function<StringRef(StringRef, SmallVectorImpl<char> &)> + UnescapeCallback) { + size_t I = UnquotedValue.find_first_of(LookupChars); + if (I == StringRef::npos) return UnquotedValue; - } else if (Value[0] == '\'') { // Single quoted. - // Pull off the leading and trailing 's. - StringRef UnquotedValue = Value.substr(1, Value.size() - 2); - StringRef::size_type i = UnquotedValue.find('\''); - if (i != StringRef::npos) { - // We're going to need Storage. - Storage.clear(); - Storage.reserve(UnquotedValue.size()); - for (; i != StringRef::npos; i = UnquotedValue.find('\'')) { - StringRef Valid(UnquotedValue.begin(), i); - llvm::append_range(Storage, Valid); - Storage.push_back('\''); - UnquotedValue = UnquotedValue.substr(i + 2); + + Storage.clear(); + Storage.reserve(UnquotedValue.size()); + char LastNewLineAddedAs = '\0'; + for (; I != StringRef::npos; I = UnquotedValue.find_first_of(LookupChars)) { + if (UnquotedValue[I] != '\r' && UnquotedValue[I] != '\n') { + llvm::append_range(Storage, UnquotedValue.take_front(I)); + UnquotedValue = UnescapeCallback(UnquotedValue.drop_front(I), Storage); + LastNewLineAddedAs = '\0'; + continue; + } + if (size_t LastNonSWhite = UnquotedValue.find_last_not_of(" \t", I); + LastNonSWhite != StringRef::npos) { + llvm::append_range(Storage, UnquotedValue.take_front(LastNonSWhite + 1)); + Storage.push_back(' '); + LastNewLineAddedAs = ' '; + } else { + // Note: we can't just check if the last character in Storage is ' ', + // '\n', or something else; that would give a wrong result for double + // quoted values containing an escaped space character before a new-line + // character. + switch (LastNewLineAddedAs) { + case ' ': + assert(!Storage.empty() && Storage.back() == ' '); + Storage.back() = '\n'; + LastNewLineAddedAs = '\n'; + break; + case '\n': + assert(!Storage.empty() && Storage.back() == '\n'); + Storage.push_back('\n'); + break; + default: + Storage.push_back(' '); + LastNewLineAddedAs = ' '; + break; } - llvm::append_range(Storage, UnquotedValue); - return StringRef(Storage.begin(), Storage.size()); } - return UnquotedValue; + // Handle Windows-style EOL + if (UnquotedValue.substr(I, 2) == "\r\n") + I++; + UnquotedValue = UnquotedValue.drop_front(I + 1).ltrim(" \t"); } - // Plain. - // Trim whitespace ('b-char' and 's-white'). - // NOTE: Alternatively we could change the scanner to not include whitespace - // here in the first place. - return Value.rtrim("\x0A\x0D\x20\x09"); + llvm::append_range(Storage, UnquotedValue); + return StringRef(Storage.begin(), Storage.size()); } -StringRef ScalarNode::unescapeDoubleQuoted( StringRef UnquotedValue - , StringRef::size_type i - , SmallVectorImpl<char> &Storage) - const { - // Use Storage to build proper value. - Storage.clear(); - Storage.reserve(UnquotedValue.size()); - for (; i != StringRef::npos; i = UnquotedValue.find_first_of("\\\r\n")) { - // Insert all previous chars into Storage. - StringRef Valid(UnquotedValue.begin(), i); - llvm::append_range(Storage, Valid); - // Chop off inserted chars. - UnquotedValue = UnquotedValue.substr(i); - - assert(!UnquotedValue.empty() && "Can't be empty!"); - - // Parse escape or line break. +StringRef +ScalarNode::getDoubleQuotedValue(StringRef RawValue, + SmallVectorImpl<char> &Storage) const { + assert(RawValue.size() >= 2 && RawValue.front() == '"' && + RawValue.back() == '"'); + StringRef UnquotedValue = RawValue.substr(1, RawValue.size() - 2); + + auto UnescapeFunc = [this](StringRef UnquotedValue, + SmallVectorImpl<char> &Storage) { + assert(UnquotedValue.take_front(1) == "\\"); + if (UnquotedValue.size() == 1) { + Token T; + T.Range = UnquotedValue; + setError("Unrecognized escape code", T); + Storage.clear(); + return StringRef(); + } + UnquotedValue = UnquotedValue.drop_front(1); switch (UnquotedValue[0]) { + default: { + Token T; + T.Range = UnquotedValue.take_front(1); + setError("Unrecognized escape code", T); + Storage.clear(); + return StringRef(); + } case '\r': + // Shrink the Windows-style EOL. + if (UnquotedValue.size() >= 2 && UnquotedValue[1] == '\n') + UnquotedValue = UnquotedValue.drop_front(1); + [[fallthrough]]; case '\n': - Storage.push_back('\n'); - if ( UnquotedValue.size() > 1 - && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n')) - UnquotedValue = UnquotedValue.substr(1); - UnquotedValue = UnquotedValue.substr(1); + return UnquotedValue.drop_front(1).ltrim(" \t"); + case '0': + Storage.push_back(0x00); break; - default: - if (UnquotedValue.size() == 1) { - Token T; - T.Range = StringRef(UnquotedValue.begin(), 1); - setError("Unrecognized escape code", T); - return ""; - } - UnquotedValue = UnquotedValue.substr(1); - switch (UnquotedValue[0]) { - default: { - Token T; - T.Range = StringRef(UnquotedValue.begin(), 1); - setError("Unrecognized escape code", T); - return ""; - } - case '\r': - case '\n': - // Remove the new line. - if ( UnquotedValue.size() > 1 - && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n')) - UnquotedValue = UnquotedValue.substr(1); - // If this was just a single byte newline, it will get skipped - // below. - break; - case '0': - Storage.push_back(0x00); - break; - case 'a': - Storage.push_back(0x07); - break; - case 'b': - Storage.push_back(0x08); - break; - case 't': - case 0x09: - Storage.push_back(0x09); - break; - case 'n': - Storage.push_back(0x0A); - break; - case 'v': - Storage.push_back(0x0B); - break; - case 'f': - Storage.push_back(0x0C); - break; - case 'r': - Storage.push_back(0x0D); - break; - case 'e': - Storage.push_back(0x1B); - break; - case ' ': - Storage.push_back(0x20); - break; - case '"': - Storage.push_back(0x22); - break; - case '/': - Storage.push_back(0x2F); - break; - case '\\': - Storage.push_back(0x5C); - break; - case 'N': - encodeUTF8(0x85, Storage); - break; - case '_': - encodeUTF8(0xA0, Storage); + case 'a': + Storage.push_back(0x07); + break; + case 'b': + Storage.push_back(0x08); + break; + case 't': + case 0x09: + Storage.push_back(0x09); + break; + case 'n': + Storage.push_back(0x0A); + break; + case 'v': + Storage.push_back(0x0B); + break; + case 'f': + Storage.push_back(0x0C); + break; + case 'r': + Storage.push_back(0x0D); + break; + case 'e': + Storage.push_back(0x1B); + break; + case ' ': + Storage.push_back(0x20); + break; + case '"': + Storage.push_back(0x22); + break; + case '/': + Storage.push_back(0x2F); + break; + case '\\': + Storage.push_back(0x5C); + break; + case 'N': + encodeUTF8(0x85, Storage); + break; + case '_': + encodeUTF8(0xA0, Storage); + break; + case 'L': + encodeUTF8(0x2028, Storage); + break; + case 'P': + encodeUTF8(0x2029, Storage); + break; + case 'x': { + if (UnquotedValue.size() < 3) + // TODO: Report error. break; - case 'L': - encodeUTF8(0x2028, Storage); + unsigned int UnicodeScalarValue; + if (UnquotedValue.substr(1, 2).getAsInteger(16, UnicodeScalarValue)) + // TODO: Report error. + UnicodeScalarValue = 0xFFFD; + encodeUTF8(UnicodeScalarValue, Storage); + return UnquotedValue.drop_front(3); + } + case 'u': { + if (UnquotedValue.size() < 5) + // TODO: Report error. break; - case 'P': - encodeUTF8(0x2029, Storage); + unsigned int UnicodeScalarValue; + if (UnquotedValue.substr(1, 4).getAsInteger(16, UnicodeScalarValue)) + // TODO: Report error. + UnicodeScalarValue = 0xFFFD; + encodeUTF8(UnicodeScalarValue, Storage); + return UnquotedValue.drop_front(5); + } + case 'U': { + if (UnquotedValue.size() < 9) + // TODO: Report error. break; - case 'x': { - if (UnquotedValue.size() < 3) - // TODO: Report error. - break; - unsigned int UnicodeScalarValue; - if (UnquotedValue.substr(1, 2).getAsInteger(16, UnicodeScalarValue)) - // TODO: Report error. - UnicodeScalarValue = 0xFFFD; - encodeUTF8(UnicodeScalarValue, Storage); - UnquotedValue = UnquotedValue.substr(2); - break; - } - case 'u': { - if (UnquotedValue.size() < 5) - // TODO: Report error. - break; - unsigned int UnicodeScalarValue; - if (UnquotedValue.substr(1, 4).getAsInteger(16, UnicodeScalarValue)) - // TODO: Report error. - UnicodeScalarValue = 0xFFFD; - encodeUTF8(UnicodeScalarValue, Storage); - UnquotedValue = UnquotedValue.substr(4); - break; - } - case 'U': { - if (UnquotedValue.size() < 9) - // TODO: Report error. - break; - unsigned int UnicodeScalarValue; - if (UnquotedValue.substr(1, 8).getAsInteger(16, UnicodeScalarValue)) - // TODO: Report error. - UnicodeScalarValue = 0xFFFD; - encodeUTF8(UnicodeScalarValue, Storage); - UnquotedValue = UnquotedValue.substr(8); - break; - } - } - UnquotedValue = UnquotedValue.substr(1); + unsigned int UnicodeScalarValue; + if (UnquotedValue.substr(1, 8).getAsInteger(16, UnicodeScalarValue)) + // TODO: Report error. + UnicodeScalarValue = 0xFFFD; + encodeUTF8(UnicodeScalarValue, Storage); + return UnquotedValue.drop_front(9); } - } - llvm::append_range(Storage, UnquotedValue); - return StringRef(Storage.begin(), Storage.size()); + } + return UnquotedValue.drop_front(1); + }; + + return parseScalarValue(UnquotedValue, Storage, "\\\r\n", UnescapeFunc); +} + +StringRef ScalarNode::getSingleQuotedValue(StringRef RawValue, + SmallVectorImpl<char> &Storage) { + assert(RawValue.size() >= 2 && RawValue.front() == '\'' && + RawValue.back() == '\''); + StringRef UnquotedValue = RawValue.substr(1, RawValue.size() - 2); + + auto UnescapeFunc = [](StringRef UnquotedValue, + SmallVectorImpl<char> &Storage) { + assert(UnquotedValue.take_front(2) == "''"); + Storage.push_back('\''); + return UnquotedValue.drop_front(2); + }; + + return parseScalarValue(UnquotedValue, Storage, "'\r\n", UnescapeFunc); +} + +StringRef ScalarNode::getPlainValue(StringRef RawValue, + SmallVectorImpl<char> &Storage) { + // Trim trailing whitespace ('b-char' and 's-white'). + // NOTE: Alternatively we could change the scanner to not include whitespace + // here in the first place. + RawValue = RawValue.rtrim("\r\n \t"); + return parseScalarValue(RawValue, Storage, "\r\n", nullptr); } Node *KeyValueNode::getKey() { |