diff options
author | Dimitry Andric <dim@FreeBSD.org> | 2023-12-18 20:30:12 +0000 |
---|---|---|
committer | Dimitry Andric <dim@FreeBSD.org> | 2024-04-06 20:11:55 +0000 |
commit | 5f757f3ff9144b609b3c433dfd370cc6bdc191ad (patch) | |
tree | 1b4e980b866cd26a00af34c0a653eb640bd09caf /contrib/llvm-project/llvm/lib/Support/GlobPattern.cpp | |
parent | 3e1c8a35f741a5d114d0ba670b15191355711fe9 (diff) | |
parent | 312c0ed19cc5276a17bacf2120097bec4515b0f1 (diff) |
Diffstat (limited to 'contrib/llvm-project/llvm/lib/Support/GlobPattern.cpp')
-rw-r--r-- | contrib/llvm-project/llvm/lib/Support/GlobPattern.cpp | 273 |
1 files changed, 173 insertions, 100 deletions
diff --git a/contrib/llvm-project/llvm/lib/Support/GlobPattern.cpp b/contrib/llvm-project/llvm/lib/Support/GlobPattern.cpp index b8c6ea80b44c..7004adf461a0 100644 --- a/contrib/llvm-project/llvm/lib/Support/GlobPattern.cpp +++ b/contrib/llvm-project/llvm/lib/Support/GlobPattern.cpp @@ -11,16 +11,11 @@ //===----------------------------------------------------------------------===// #include "llvm/Support/GlobPattern.h" -#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/Errc.h" using namespace llvm; -static bool hasWildcard(StringRef S) { - return S.find_first_of("?*[\\") != StringRef::npos; -} - // Expands character ranges and returns a bitmap. // For example, "a-cf-hz" is expanded to "abcfghz". static Expected<BitVector> expand(StringRef S, StringRef Original) { @@ -58,120 +53,198 @@ static Expected<BitVector> expand(StringRef S, StringRef Original) { return BV; } -// This is a scanner for the glob pattern. -// A glob pattern token is one of "*", "?", "\", "[<chars>]", "[^<chars>]" -// (which is a negative form of "[<chars>]"), "[!<chars>]" (which is -// equivalent to "[^<chars>]"), or a non-meta character. -// This function returns the first token in S. -static Expected<BitVector> scan(StringRef &S, StringRef Original) { - switch (S[0]) { - case '*': - S = S.substr(1); - // '*' is represented by an empty bitvector. - // All other bitvectors are 256-bit long. - return BitVector(); - case '?': - S = S.substr(1); - return BitVector(256, true); - case '[': { - // ']' is allowed as the first character of a character class. '[]' is - // invalid. So, just skip the first character. - size_t End = S.find(']', 2); - if (End == StringRef::npos) - return make_error<StringError>("invalid glob pattern: " + Original, - errc::invalid_argument); - - StringRef Chars = S.substr(1, End - 1); - S = S.substr(End + 1); - if (Chars.startswith("^") || Chars.startswith("!")) { - Expected<BitVector> BV = expand(Chars.substr(1), Original); - if (!BV) - return BV.takeError(); - return BV->flip(); +// Identify brace expansions in S and return the list of patterns they expand +// into. +static Expected<SmallVector<std::string, 1>> +parseBraceExpansions(StringRef S, std::optional<size_t> MaxSubPatterns) { + SmallVector<std::string> SubPatterns = {S.str()}; + if (!MaxSubPatterns || !S.contains('{')) + return std::move(SubPatterns); + + struct BraceExpansion { + size_t Start; + size_t Length; + SmallVector<StringRef, 2> Terms; + }; + SmallVector<BraceExpansion, 0> BraceExpansions; + + BraceExpansion *CurrentBE = nullptr; + size_t TermBegin; + for (size_t I = 0, E = S.size(); I != E; ++I) { + if (S[I] == '[') { + I = S.find(']', I + 2); + if (I == std::string::npos) + return make_error<StringError>("invalid glob pattern, unmatched '['", + errc::invalid_argument); + } else if (S[I] == '{') { + if (CurrentBE) + return make_error<StringError>( + "nested brace expansions are not supported", + errc::invalid_argument); + CurrentBE = &BraceExpansions.emplace_back(); + CurrentBE->Start = I; + TermBegin = I + 1; + } else if (S[I] == ',') { + if (!CurrentBE) + continue; + CurrentBE->Terms.push_back(S.substr(TermBegin, I - TermBegin)); + TermBegin = I + 1; + } else if (S[I] == '}') { + if (!CurrentBE) + continue; + if (CurrentBE->Terms.empty()) + return make_error<StringError>( + "empty or singleton brace expansions are not supported", + errc::invalid_argument); + CurrentBE->Terms.push_back(S.substr(TermBegin, I - TermBegin)); + CurrentBE->Length = I - CurrentBE->Start + 1; + CurrentBE = nullptr; + } else if (S[I] == '\\') { + if (++I == E) + return make_error<StringError>("invalid glob pattern, stray '\\'", + errc::invalid_argument); } - return expand(Chars, Original); } - case '\\': - // Eat this character and fall through below to treat it like a non-meta - // character. - S = S.substr(1); - [[fallthrough]]; - default: - BitVector BV(256, false); - BV[(uint8_t)S[0]] = true; - S = S.substr(1); - return BV; + if (CurrentBE) + return make_error<StringError>("incomplete brace expansion", + errc::invalid_argument); + + size_t NumSubPatterns = 1; + for (auto &BE : BraceExpansions) { + if (NumSubPatterns > std::numeric_limits<size_t>::max() / BE.Terms.size()) { + NumSubPatterns = std::numeric_limits<size_t>::max(); + break; + } + NumSubPatterns *= BE.Terms.size(); + } + if (NumSubPatterns > *MaxSubPatterns) + return make_error<StringError>("too many brace expansions", + errc::invalid_argument); + // Replace brace expansions in reverse order so that we don't invalidate + // earlier start indices + for (auto &BE : reverse(BraceExpansions)) { + SmallVector<std::string> OrigSubPatterns; + std::swap(SubPatterns, OrigSubPatterns); + for (StringRef Term : BE.Terms) + for (StringRef Orig : OrigSubPatterns) + SubPatterns.emplace_back(Orig).replace(BE.Start, BE.Length, Term); } + return std::move(SubPatterns); } -Expected<GlobPattern> GlobPattern::create(StringRef S) { +Expected<GlobPattern> +GlobPattern::create(StringRef S, std::optional<size_t> MaxSubPatterns) { GlobPattern Pat; - // S doesn't contain any metacharacter, - // so the regular string comparison should work. - if (!hasWildcard(S)) { - Pat.Exact = S; - return Pat; - } - - // S is something like "foo*", and the "* is not escaped. We can use - // startswith(). - if (S.endswith("*") && !S.endswith("\\*") && !hasWildcard(S.drop_back())) { - Pat.Prefix = S.drop_back(); + // Store the prefix that does not contain any metacharacter. + size_t PrefixSize = S.find_first_of("?*[{\\"); + Pat.Prefix = S.substr(0, PrefixSize); + if (PrefixSize == std::string::npos) return Pat; + S = S.substr(PrefixSize); + + SmallVector<std::string, 1> SubPats; + if (auto Err = parseBraceExpansions(S, MaxSubPatterns).moveInto(SubPats)) + return std::move(Err); + for (StringRef SubPat : SubPats) { + auto SubGlobOrErr = SubGlobPattern::create(SubPat); + if (!SubGlobOrErr) + return SubGlobOrErr.takeError(); + Pat.SubGlobs.push_back(*SubGlobOrErr); } - // S is something like "*foo". We can use endswith(). - if (S.startswith("*") && !hasWildcard(S.drop_front())) { - Pat.Suffix = S.drop_front(); - return Pat; - } + return Pat; +} - // Otherwise, we need to do real glob pattern matching. - // Parse the pattern now. - StringRef Original = S; - while (!S.empty()) { - Expected<BitVector> BV = scan(S, Original); - if (!BV) - return BV.takeError(); - Pat.Tokens.push_back(*BV); +Expected<GlobPattern::SubGlobPattern> +GlobPattern::SubGlobPattern::create(StringRef S) { + SubGlobPattern Pat; + + // Parse brackets. + Pat.Pat.assign(S.begin(), S.end()); + for (size_t I = 0, E = S.size(); I != E; ++I) { + if (S[I] == '[') { + // ']' is allowed as the first character of a character class. '[]' is + // invalid. So, just skip the first character. + ++I; + size_t J = S.find(']', I + 1); + if (J == StringRef::npos) + return make_error<StringError>("invalid glob pattern, unmatched '['", + errc::invalid_argument); + StringRef Chars = S.substr(I, J - I); + bool Invert = S[I] == '^' || S[I] == '!'; + Expected<BitVector> BV = + Invert ? expand(Chars.substr(1), S) : expand(Chars, S); + if (!BV) + return BV.takeError(); + if (Invert) + BV->flip(); + Pat.Brackets.push_back(Bracket{J + 1, std::move(*BV)}); + I = J; + } else if (S[I] == '\\') { + if (++I == E) + return make_error<StringError>("invalid glob pattern, stray '\\'", + errc::invalid_argument); + } } return Pat; } bool GlobPattern::match(StringRef S) const { - if (Exact) - return S == *Exact; - if (Prefix) - return S.startswith(*Prefix); - if (Suffix) - return S.endswith(*Suffix); - return matchOne(Tokens, S); + if (!S.consume_front(Prefix)) + return false; + if (SubGlobs.empty() && S.empty()) + return true; + for (auto &Glob : SubGlobs) + if (Glob.match(S)) + return true; + return false; } -// Runs glob pattern Pats against string S. -bool GlobPattern::matchOne(ArrayRef<BitVector> Pats, StringRef S) const { - for (;;) { - if (Pats.empty()) - return S.empty(); - - // If Pats[0] is '*', try to match Pats[1..] against all possible - // tail strings of S to see at least one pattern succeeds. - if (Pats[0].size() == 0) { - Pats = Pats.slice(1); - if (Pats.empty()) - // Fast path. If a pattern is '*', it matches anything. - return true; - for (size_t I = 0, E = S.size(); I < E; ++I) - if (matchOne(Pats, S.substr(I))) - return true; - return false; +// Factor the pattern into segments split by '*'. The segment is matched +// sequentianlly by finding the first occurrence past the end of the previous +// match. +bool GlobPattern::SubGlobPattern::match(StringRef Str) const { + const char *P = Pat.data(), *SegmentBegin = nullptr, *S = Str.data(), + *SavedS = S; + const char *const PEnd = P + Pat.size(), *const End = S + Str.size(); + size_t B = 0, SavedB = 0; + while (S != End) { + if (P == PEnd) + ; + else if (*P == '*') { + // The non-* substring on the left of '*' matches the tail of S. Save the + // positions to be used by backtracking if we see a mismatch later. + SegmentBegin = ++P; + SavedS = S; + SavedB = B; + continue; + } else if (*P == '[') { + if (Brackets[B].Bytes[uint8_t(*S)]) { + P = Pat.data() + Brackets[B++].NextOffset; + ++S; + continue; + } + } else if (*P == '\\') { + if (*++P == *S) { + ++P; + ++S; + continue; + } + } else if (*P == *S || *P == '?') { + ++P; + ++S; + continue; } - - // If Pats[0] is not '*', it must consume one character. - if (S.empty() || !Pats[0][(uint8_t)S[0]]) + if (!SegmentBegin) return false; - Pats = Pats.slice(1); - S = S.substr(1); + // We have seen a '*'. Backtrack to the saved positions. Shift the S + // position to probe the next starting position in the segment. + P = SegmentBegin; + S = ++SavedS; + B = SavedB; } + // All bytes in Str have been matched. Return true if the rest part of Pat is + // empty or contains only '*'. + return getPat().find_first_not_of('*', P - Pat.data()) == std::string::npos; } |