diff options
Diffstat (limited to 'utils/text/regex.cpp')
-rw-r--r-- | utils/text/regex.cpp | 302 |
1 files changed, 302 insertions, 0 deletions
diff --git a/utils/text/regex.cpp b/utils/text/regex.cpp new file mode 100644 index 000000000000..b078ba88f6b4 --- /dev/null +++ b/utils/text/regex.cpp @@ -0,0 +1,302 @@ +// Copyright 2014 The Kyua Authors. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of Google Inc. nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "utils/text/regex.hpp" + +extern "C" { +#include <sys/types.h> + +#include <regex.h> +} + +#include "utils/auto_array.ipp" +#include "utils/defs.hpp" +#include "utils/format/macros.hpp" +#include "utils/noncopyable.hpp" +#include "utils/sanity.hpp" +#include "utils/text/exceptions.hpp" + +namespace text = utils::text; + + +namespace { + + +static void throw_regex_error(const int, const ::regex_t*, const std::string&) + UTILS_NORETURN; + + +/// Constructs and raises a regex_error. +/// +/// \param error The error code returned by regcomp(3) or regexec(3). +/// \param preg The native regex object that caused this error. +/// \param prefix Error message prefix string. +/// +/// \throw regex_error The constructed exception. +static void +throw_regex_error(const int error, const ::regex_t* preg, + const std::string& prefix) +{ + char buffer[1024]; + + // TODO(jmmv): Would be nice to handle the case where the message does + // not fit in the temporary buffer. + (void)::regerror(error, preg, buffer, sizeof(buffer)); + + throw text::regex_error(F("%s: %s") % prefix % buffer); +} + + +} // anonymous namespace + + +/// Internal implementation for regex_matches. +struct utils::text::regex_matches::impl : utils::noncopyable { + /// String on which we are matching. + /// + /// In theory, we could take a reference here instead of a copy, and make + /// it a requirement for the caller to ensure that the lifecycle of the + /// input string outlasts the lifecycle of the regex_matches. However, that + /// contract is very easy to break with hardcoded strings (as we do in + /// tests). Just go for the safer case here. + const std::string _string; + + /// Maximum number of matching groups we expect, including the full match. + /// + /// In other words, this is the size of the _matches array. + const std::size_t _nmatches; + + /// Native regular expression match representation. + utils::auto_array< ::regmatch_t > _matches; + + /// Constructor. + /// + /// This executes the regex on the given string and sets up the internal + /// class state based on the results. + /// + /// \param preg The native regex object. + /// \param str The string on which to execute the regex. + /// \param ngroups Number of capture groups in the regex. This is an upper + /// bound and may be greater than the actual matches. + /// + /// \throw regex_error If the call to regexec(3) fails. + impl(const ::regex_t* preg, const std::string& str, + const std::size_t ngroups) : + _string(str), + _nmatches(ngroups + 1), + _matches(new ::regmatch_t[_nmatches]) + { + const int error = ::regexec(preg, _string.c_str(), _nmatches, + _matches.get(), 0); + if (error == REG_NOMATCH) { + _matches.reset(NULL); + } else if (error != 0) { + throw_regex_error(error, preg, + F("regexec on '%s' failed") % _string); + } + } + + /// Destructor. + ~impl(void) + { + } +}; + + +/// Constructor. +/// +/// \param pimpl Constructed implementation of the object. +text::regex_matches::regex_matches(std::shared_ptr< impl > pimpl) : + _pimpl(pimpl) +{ +} + + +/// Destructor. +text::regex_matches::~regex_matches(void) +{ +} + + +/// Returns the number of matches in this object. +/// +/// Note that this does not correspond to the number of groups provided at +/// construction time. The returned value here accounts for only the returned +/// valid matches. +/// +/// \return Number of matches, including the full match. +std::size_t +text::regex_matches::count(void) const +{ + std::size_t total = 0; + if (_pimpl->_matches.get() != NULL) { + for (std::size_t i = 0; i < _pimpl->_nmatches; ++i) { + if (_pimpl->_matches[i].rm_so != -1) + ++total; + } + INV(total <= _pimpl->_nmatches); + } + return total; +} + + +/// Gets a match. +/// +/// \param index Number of the match to get. Index 0 always contains the match +/// of the whole regex. +/// +/// \pre There regex must have matched the input string. +/// \pre index must be lower than count(). +/// +/// \return The textual match. +std::string +text::regex_matches::get(const std::size_t index) const +{ + PRE(*this); + PRE(index < count()); + + const ::regmatch_t* match = &_pimpl->_matches[index]; + + return std::string(_pimpl->_string.c_str() + match->rm_so, + match->rm_eo - match->rm_so); +} + + +/// Checks if there are any matches. +/// +/// \return True if the object contains one or more matches; false otherwise. +text::regex_matches::operator bool(void) const +{ + return _pimpl->_matches.get() != NULL; +} + + +/// Internal implementation for regex. +struct utils::text::regex::impl : utils::noncopyable { + /// Native regular expression representation. + ::regex_t _preg; + + /// Number of capture groups in the regular expression. This is an upper + /// bound and does NOT include the default full string match. + std::size_t _ngroups; + + /// Constructor. + /// + /// This compiles the given regular expression. + /// + /// \param regex_ The regular expression to compile. + /// \param ngroups Number of capture groups in the regular expression. This + /// is an upper bound and does NOT include the default full string + /// match. + /// \param ignore_case Whether to ignore case during matching. + /// + /// \throw regex_error If the call to regcomp(3) fails. + impl(const std::string& regex_, const std::size_t ngroups, + const bool ignore_case) : + _ngroups(ngroups) + { + const int flags = REG_EXTENDED | (ignore_case ? REG_ICASE : 0); + const int error = ::regcomp(&_preg, regex_.c_str(), flags); + if (error != 0) + throw_regex_error(error, &_preg, F("regcomp on '%s' failed") + % regex_); + } + + /// Destructor. + ~impl(void) + { + ::regfree(&_preg); + } +}; + + +/// Constructor. +/// +/// \param pimpl Constructed implementation of the object. +text::regex::regex(std::shared_ptr< impl > pimpl) : _pimpl(pimpl) +{ +} + + +/// Destructor. +text::regex::~regex(void) +{ +} + + +/// Compiles a new regular expression. +/// +/// \param regex_ The regular expression to compile. +/// \param ngroups Number of capture groups in the regular expression. This is +/// an upper bound and does NOT include the default full string match. +/// \param ignore_case Whether to ignore case during matching. +/// +/// \return A new regular expression, ready to match strings. +/// +/// \throw regex_error If the regular expression is invalid and cannot be +/// compiled. +text::regex +text::regex::compile(const std::string& regex_, const std::size_t ngroups, + const bool ignore_case) +{ + return regex(std::shared_ptr< impl >(new impl(regex_, ngroups, + ignore_case))); +} + + +/// Matches the regular expression against a string. +/// +/// \param str String to match the regular expression against. +/// +/// \return A new regex_matches object with the results of the match. +text::regex_matches +text::regex::match(const std::string& str) const +{ + std::shared_ptr< regex_matches::impl > pimpl(new regex_matches::impl( + &_pimpl->_preg, str, _pimpl->_ngroups)); + return regex_matches(pimpl); +} + + +/// Compiles and matches a regular expression once. +/// +/// This is syntactic sugar to simplify the instantiation of a new regex object +/// and its subsequent match on a string. +/// +/// \param regex_ The regular expression to compile and match. +/// \param str String to match the regular expression against. +/// \param ngroups Number of capture groups in the regular expression. +/// \param ignore_case Whether to ignore case during matching. +/// +/// \return A new regex_matches object with the results of the match. +text::regex_matches +text::match_regex(const std::string& regex_, const std::string& str, + const std::size_t ngroups, const bool ignore_case) +{ + return regex::compile(regex_, ngroups, ignore_case).match(str); +} |