lnav/src/pcrepp/pcrepp.hh

583 lines
16 KiB
C++
Raw Normal View History

2010-04-26 06:12:25 +02:00
/**
2013-06-12 06:28:03 +02:00
* Copyright (c) 2007-2013, Timothy Stack
2013-05-03 08:02:03 +02:00
*
* All rights reserved.
2013-05-28 06:35:00 +02:00
*
2013-05-03 08:02:03 +02:00
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
2013-05-28 06:35:00 +02:00
*
2013-05-03 08:02:03 +02:00
* * Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* * Neither the name of Timothy Stack nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
2013-05-28 06:35:00 +02:00
*
2013-05-03 08:02:03 +02:00
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ''AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
2022-03-16 23:38:08 +01:00
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
2013-05-03 08:02:03 +02:00
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
2010-04-26 06:12:25 +02:00
* @file pcrepp.hh
*
* A C++ adapter for the pcre library. The interface provided here has a
* different focus than the pcrecpp.h file included in the pcre distribution.
* The standard pcrecpp.h interface is more concerned with regular expressions
* that digest data to be used within the program itself, whereas this
* interface deals with regular expressions entered by the user and with
* processing a series of matches on text files.
*/
#ifndef pcrepp_hh
#define pcrepp_hh
2009-09-14 03:07:32 +02:00
2022-03-16 23:38:08 +01:00
#include "config.h"
2009-09-14 03:07:32 +02:00
#ifdef HAVE_PCRE_H
2022-03-16 23:38:08 +01:00
# include <pcre.h>
2009-09-14 03:07:32 +02:00
#elif HAVE_PCRE_PCRE_H
2022-03-16 23:38:08 +01:00
# include <pcre/pcre.h>
2009-09-14 03:07:32 +02:00
#else
2022-03-16 23:38:08 +01:00
# error "pcre.h not found?"
2009-09-14 03:07:32 +02:00
#endif
#include <cassert>
2022-03-16 23:38:08 +01:00
#include <exception>
2009-09-14 03:07:32 +02:00
#include <memory>
2022-03-16 23:38:08 +01:00
#include <string>
#include <utility>
#include <vector>
2022-03-16 23:38:08 +01:00
#include <stdio.h>
#include <string.h>
2009-09-14 03:07:32 +02:00
2022-04-13 01:07:13 +02:00
#include "base/auto_mem.hh"
#include "base/intern_string.hh"
#include "base/result.h"
2009-09-14 03:07:32 +02:00
class pcrepp;
2010-04-26 06:12:25 +02:00
/**
* Context that tracks captures found during a match operation. This class is a
* base that defines iterator methods and fields, but does not allocate space
* for the capture array.
*/
2009-09-14 03:07:32 +02:00
class pcre_context {
public:
struct capture_t {
capture_t()
{ /* We don't initialize anything since it's a perf hit. */
}
2022-03-16 23:38:08 +01:00
capture_t(int begin, int end) : c_begin(begin), c_end(end)
2013-05-28 06:35:00 +02:00
{
2021-05-14 04:50:04 +02:00
assert(begin <= end);
}
2013-05-28 06:35:00 +02:00
int c_begin;
int c_end;
void ltrim(const char* str);
2022-03-16 23:38:08 +01:00
bool contains(int pos) const
{
return this->c_begin <= pos && pos < this->c_end;
}
bool is_valid() const { return this->c_begin != -1; }
2013-06-26 05:43:27 +02:00
int length() const { return this->c_end - this->c_begin; }
bool empty() const { return this->c_begin == this->c_end; }
};
using iterator = capture_t*;
using const_iterator = const capture_t*;
2009-09-20 00:36:27 +02:00
2010-04-26 06:12:25 +02:00
/** @return The maximum number of strings this context can capture. */
int get_max_count() const { return this->pc_max_count; }
2009-09-14 03:07:32 +02:00
void set_count(int count) { this->pc_count = count; }
2009-09-14 03:07:32 +02:00
int get_count() const { return this->pc_count; }
void set_pcrepp(const pcrepp* src) { this->pc_pcre = src; }
2010-04-26 06:12:25 +02:00
/**
* @return a capture_t that covers all of the text that was matched.
*/
capture_t* all() const { return pc_captures; }
2009-09-14 03:07:32 +02:00
2010-04-26 06:12:25 +02:00
/** @return An iterator to the first capture. */
iterator begin() { return pc_captures + 1; }
2010-04-26 06:12:25 +02:00
/** @return An iterator that refers to the end of the capture array. */
iterator end() { return pc_captures + pc_count; };
2022-03-16 23:38:08 +01:00
capture_t* operator[](int offset) const
{
2014-03-15 19:37:03 +01:00
if (offset < 0) {
2020-05-07 16:08:59 +02:00
return nullptr;
2014-03-15 19:37:03 +01:00
}
return &this->pc_captures[offset + 1];
}
2022-03-16 23:38:08 +01:00
capture_t* operator[](const char* name) const;
2022-03-16 23:38:08 +01:00
capture_t* operator[](const std::string& name) const
{
return (*this)[name.c_str()];
}
capture_t* first_valid() const;
2009-09-14 03:07:32 +02:00
protected:
2022-03-16 23:38:08 +01:00
pcre_context(capture_t* captures, int max_count)
: pc_captures(captures), pc_max_count(max_count)
{
}
2022-03-16 23:38:08 +01:00
const pcrepp* pc_pcre{nullptr};
2022-03-16 23:38:08 +01:00
capture_t* pc_captures;
int pc_max_count;
int pc_count{0};
2009-09-14 03:07:32 +02:00
};
struct capture_if_not {
capture_if_not(int begin) : cin_begin(begin) {}
2022-03-16 23:38:08 +01:00
bool operator()(const pcre_context::capture_t& cap) const
2013-05-28 06:35:00 +02:00
{
return cap.c_begin != this->cin_begin;
}
2013-05-28 06:35:00 +02:00
int cin_begin;
};
2010-04-26 06:12:25 +02:00
/**
* A pcre_context that allocates storage for the capture array within the object
* itself.
*/
2009-09-14 03:07:32 +02:00
template<size_t MAX_COUNT>
class pcre_context_static : public pcre_context {
public:
pcre_context_static()
2022-03-16 23:38:08 +01:00
: pcre_context(this->pc_match_buffer, MAX_COUNT + 1){};
2009-09-14 03:07:32 +02:00
private:
2010-04-26 06:12:25 +02:00
capture_t pc_match_buffer[MAX_COUNT + 1];
2009-09-14 03:07:32 +02:00
};
2010-04-26 06:12:25 +02:00
/**
2013-05-28 06:35:00 +02:00
*
2010-04-26 06:12:25 +02:00
*/
2009-09-14 03:07:32 +02:00
class pcre_input {
public:
2022-03-16 23:38:08 +01:00
pcre_input(const char* str, size_t off = 0, size_t len = -1)
: pi_offset(off), pi_next_offset(off), pi_length(len), pi_string(str)
2013-05-28 06:35:00 +02:00
{
2022-03-16 23:38:08 +01:00
if (this->pi_length == (size_t) -1) {
2013-05-28 06:35:00 +02:00
this->pi_length = strlen(str);
}
}
2009-09-14 03:07:32 +02:00
2022-03-16 23:38:08 +01:00
pcre_input(const string_fragment& s)
: pi_offset(0), pi_next_offset(0), pi_length(s.length()),
pi_string(s.data()){};
2021-11-05 23:13:16 +01:00
pcre_input(const intern_string_t& s)
2022-03-16 23:38:08 +01:00
: pi_offset(0), pi_next_offset(0), pi_length(s.size()),
pi_string(s.get()){};
2021-11-05 23:13:16 +01:00
2022-03-16 23:38:08 +01:00
pcre_input(const string_fragment&&) = delete;
2022-03-16 23:38:08 +01:00
pcre_input(const std::string& str, size_t off = 0)
: pi_offset(off), pi_next_offset(off), pi_length(str.length()),
pi_string(str.c_str()){};
2009-09-14 03:07:32 +02:00
2022-03-16 23:38:08 +01:00
pcre_input(const std::string&&, size_t off = 0) = delete;
const char* get_string() const { return this->pi_string; }
2009-09-14 03:07:32 +02:00
2022-03-16 23:38:08 +01:00
const char* get_substr_start(pcre_context::const_iterator iter) const
2013-05-28 06:35:00 +02:00
{
return &this->pi_string[iter->c_begin];
}
2010-04-26 06:12:25 +02:00
size_t get_substr_len(pcre_context::const_iterator iter) const
{
return iter->length();
}
2013-05-28 06:35:00 +02:00
std::string get_substr(pcre_context::const_iterator iter) const
{
if (iter->c_begin == -1) {
return "";
}
2022-03-16 23:38:08 +01:00
return std::string(&this->pi_string[iter->c_begin], iter->length());
}
2009-09-14 03:07:32 +02:00
2022-03-16 23:38:08 +01:00
intern_string_t get_substr_i(pcre_context::const_iterator iter) const
{
return intern_string::lookup(&this->pi_string[iter->c_begin],
iter->length());
}
2022-03-16 23:38:08 +01:00
nonstd::optional<std::string> get_substr_opt(
pcre_context::const_iterator iter) const
{
if (iter->is_valid()) {
return std::string(&this->pi_string[iter->c_begin], iter->length());
}
return nonstd::nullopt;
}
2022-03-16 23:38:08 +01:00
void get_substr(pcre_context::const_iterator iter, char* dst) const
{
2014-03-10 15:32:22 +01:00
memcpy(dst, &this->pi_string[iter->c_begin], iter->length());
dst[iter->length()] = '\0';
}
2014-03-10 15:32:22 +01:00
void reset_next_offset() { this->pi_next_offset = this->pi_offset; }
2022-03-16 23:38:08 +01:00
void reset(const char* str, size_t off = 0, size_t len = -1)
2013-06-16 03:07:50 +02:00
{
2022-03-16 23:38:08 +01:00
this->pi_string = str;
this->pi_offset = off;
2013-06-08 16:57:40 +02:00
this->pi_next_offset = off;
2022-03-16 23:38:08 +01:00
if (this->pi_length == (size_t) -1) {
2013-06-08 16:57:40 +02:00
this->pi_length = strlen(str);
2022-03-16 23:38:08 +01:00
} else {
2013-06-08 16:57:40 +02:00
this->pi_length = len;
}
}
2022-03-16 23:38:08 +01:00
void reset(const std::string& str, size_t off = 0)
2013-06-16 03:07:50 +02:00
{
2013-06-08 16:57:40 +02:00
this->reset(str.c_str(), off, str.length());
}
2013-06-08 16:57:40 +02:00
2009-09-14 03:07:32 +02:00
size_t pi_offset;
size_t pi_next_offset;
size_t pi_length;
2022-03-16 23:38:08 +01:00
2009-09-14 03:07:32 +02:00
private:
2022-03-16 23:38:08 +01:00
const char* pi_string;
2009-09-14 03:07:32 +02:00
};
/**
 * Overlay for one entry of a table of named captures: two bytes holding the
 * capture number (most significant byte first) followed by the name.
 */
struct pcre_named_capture {
    /**
     * Forward iterator over a table of fixed-size entries.  Each increment
     * advances the underlying pointer by the entry size given at
     * construction.
     */
    class iterator {
    public:
        /**
         * @param pnc The entry to start at.
         * @param name_len The size, in bytes, of each table entry.
         */
        iterator(pcre_named_capture* pnc, size_t name_len)
            : i_named_capture(pnc), i_name_len(name_len)
        {
        }

        iterator() : i_named_capture(nullptr), i_name_len(0) {}

        const pcre_named_capture& operator*() const
        {
            return *this->i_named_capture;
        }

        const pcre_named_capture* operator->() const
        {
            return this->i_named_capture;
        }

        /** Iterators are equal when they refer to the same entry. */
        bool operator==(const iterator& rhs) const
        {
            return this->i_named_capture == rhs.i_named_capture;
        }

        bool operator!=(const iterator& rhs) const
        {
            return this->i_named_capture != rhs.i_named_capture;
        }

        /** Step to the next entry, i_name_len bytes further on. */
        iterator& operator++()
        {
            char* ptr = reinterpret_cast<char*>(this->i_named_capture);

            ptr += this->i_name_len;
            this->i_named_capture = reinterpret_cast<pcre_named_capture*>(ptr);
            return *this;
        }

    private:
        pcre_named_capture* i_named_capture;
        size_t i_name_len;
    };

    /**
     * @return The capture number stored in this entry, minus one.
     */
    int index() const
    {
        // Plain char may be signed, in which case a byte >= 0x80 would be
        // sign-extended and corrupt the result; go through unsigned char.
        const int msb = static_cast<unsigned char>(this->pnc_index_msb);
        const int lsb = static_cast<unsigned char>(this->pnc_index_lsb);

        return ((msb << 8) | lsb) - 1;
    }

    char pnc_index_msb;
    char pnc_index_lsb;
    char pnc_name[];
};
struct pcre_extractor {
2022-03-16 23:38:08 +01:00
const pcre_context& pe_context;
const pcre_input& pe_input;
template<typename T>
2022-03-16 23:38:08 +01:00
intern_string_t get_substr_i(T name) const
{
return this->pe_input.get_substr_i(this->pe_context[name]);
}
template<typename T>
2022-03-16 23:38:08 +01:00
std::string get_substr(T name) const
{
return this->pe_input.get_substr(this->pe_context[name]);
}
};
2009-09-14 03:07:32 +02:00
class pcrepp {
public:
class error : public std::exception {
2022-03-16 23:38:08 +01:00
public:
error(std::string msg, int offset = 0)
2022-03-16 23:38:08 +01:00
: e_msg(std::move(msg)), e_offset(offset){};
2013-05-28 06:35:00 +02:00
2022-03-16 23:38:08 +01:00
const char* what() const noexcept override
{
2013-05-28 06:35:00 +02:00
return this->e_msg.c_str();
};
const std::string e_msg;
int e_offset;
2009-09-14 03:07:32 +02:00
};
2022-03-16 23:38:08 +01:00
static std::string quote(const char* unquoted);
2022-03-16 23:38:08 +01:00
static std::string quote(const std::string& unquoted)
{
return quote(unquoted.c_str());
}
struct compile_error {
const char* ce_msg{nullptr};
int ce_offset{0};
};
2022-03-16 23:38:08 +01:00
static Result<pcrepp, compile_error> from_str(std::string pattern,
int options = 0);
2022-03-16 23:38:08 +01:00
pcrepp(pcre* code) : p_code(code), p_code_extra(pcre_free_study)
2013-05-28 06:35:00 +02:00
{
pcre_refcount(this->p_code, 1);
this->study();
}
2013-05-28 06:35:00 +02:00
2022-03-16 23:38:08 +01:00
pcrepp(std::string pattern, pcre* code)
: p_code(code), p_pattern(std::move(pattern)),
p_code_extra(pcre_free_study)
{
pcre_refcount(this->p_code, 1);
this->study();
this->find_captures(this->p_pattern.c_str());
}
2022-03-16 23:38:08 +01:00
explicit pcrepp(const char* pattern, int options = 0)
: p_pattern(pattern), p_code_extra(pcre_free_study)
2013-05-28 06:35:00 +02:00
{
2022-03-16 23:38:08 +01:00
const char* errptr;
int eoff;
2020-05-07 16:08:59 +02:00
2022-03-16 23:38:08 +01:00
if ((this->p_code
= pcre_compile(pattern, options, &errptr, &eoff, nullptr))
== nullptr)
{
2013-05-28 06:35:00 +02:00
throw error(errptr, eoff);
}
pcre_refcount(this->p_code, 1);
this->study();
this->find_captures(pattern);
}
2009-09-14 03:07:32 +02:00
2022-03-16 23:38:08 +01:00
explicit pcrepp(const std::string& pattern, int options = 0)
: p_pattern(pattern), p_code_extra(pcre_free_study)
{
2022-03-16 23:38:08 +01:00
const char* errptr;
int eoff;
2022-03-16 23:38:08 +01:00
if ((this->p_code = pcre_compile(
pattern.c_str(), options | PCRE_UTF8, &errptr, &eoff, nullptr))
== nullptr)
{
throw error(errptr, eoff);
}
pcre_refcount(this->p_code, 1);
this->study();
this->find_captures(pattern.c_str());
}
2022-03-16 23:38:08 +01:00
pcrepp() {}
2022-03-16 23:38:08 +01:00
pcrepp(const pcrepp& other)
: p_code(other.p_code), p_pattern(other.p_pattern),
p_code_extra(pcre_free_study), p_captures(other.p_captures)
2013-05-28 06:35:00 +02:00
{
pcre_refcount(this->p_code, 1);
this->study();
}
2011-06-13 16:46:03 +02:00
2022-03-16 23:38:08 +01:00
pcrepp(pcrepp&& other)
: p_code(other.p_code), p_pattern(std::move(other.p_pattern)),
p_code_extra(pcre_free_study), p_capture_count(other.p_capture_count),
p_named_count(other.p_named_count), p_name_len(other.p_name_len),
p_options(other.p_options), p_named_entries(other.p_named_entries),
p_captures(std::move(other.p_captures))
{
pcre_refcount(this->p_code, 1);
this->p_code_extra = std::move(other.p_code_extra);
}
virtual ~pcrepp() { this->clear(); }
2022-03-16 23:38:08 +01:00
pcrepp& operator=(pcrepp&& other) noexcept
{
if (this == &other) {
return *this;
}
this->p_code = other.p_code;
pcre_refcount(this->p_code, 1);
this->p_pattern = std::move(other.p_pattern);
this->p_code_extra = std::move(other.p_code_extra);
this->p_capture_count = other.p_capture_count;
this->p_named_count = other.p_named_count;
this->p_name_len = other.p_name_len;
this->p_options = other.p_options;
this->p_named_entries = other.p_named_entries;
this->p_captures = std::move(other.p_captures);
return *this;
}
2022-03-16 23:38:08 +01:00
const std::string& get_pattern() const
{
return this->p_pattern;
}
2022-03-16 23:38:08 +01:00
bool empty() const
{
return this->p_pattern.empty();
}
2022-03-16 23:38:08 +01:00
void clear()
{
if (this->p_code && pcre_refcount(this->p_code, -1) == 0) {
2013-05-28 06:35:00 +02:00
free(this->p_code);
this->p_code = nullptr;
2013-05-28 06:35:00 +02:00
}
this->p_pattern.clear();
this->p_code_extra.reset();
this->p_capture_count = 0;
this->p_named_count = 0;
this->p_name_len = 0;
this->p_options = 0;
this->p_named_entries = nullptr;
this->p_captures.clear();
}
2009-09-14 03:07:32 +02:00
2022-03-16 23:38:08 +01:00
pcre_named_capture::iterator named_begin() const
{
2018-10-17 16:03:33 +02:00
return {this->p_named_entries, static_cast<size_t>(this->p_name_len)};
}
2022-03-16 23:38:08 +01:00
pcre_named_capture::iterator named_end() const
{
char* ptr = (char*) this->p_named_entries;
ptr += this->p_named_count * this->p_name_len;
2022-03-16 23:38:08 +01:00
return {(pcre_named_capture*) ptr,
2018-10-17 16:03:33 +02:00
static_cast<size_t>(this->p_name_len)};
}
const std::vector<pcre_context::capture_t>& captures() const
2022-03-16 23:38:08 +01:00
{
return this->p_captures;
}
std::vector<pcre_context::capture_t>::const_iterator cap_begin() const
2022-03-16 23:38:08 +01:00
{
return this->p_captures.begin();
}
std::vector<pcre_context::capture_t>::const_iterator cap_end() const
2022-03-16 23:38:08 +01:00
{
return this->p_captures.end();
}
2022-03-16 23:38:08 +01:00
int name_index(const std::string& name) const
{
return this->name_index(name.c_str());
}
int name_index(const char* name) const;
const char* name_for_capture(int index) const;
2015-08-06 08:18:19 +02:00
int get_capture_count() const { return this->p_capture_count; }
2015-08-06 08:18:19 +02:00
2022-03-16 23:38:08 +01:00
bool match(pcre_context& pc, pcre_input& pi, int options = 0) const;
2013-05-28 06:35:00 +02:00
template<size_t MATCH_COUNT>
2022-03-16 23:38:08 +01:00
nonstd::optional<pcre_context_static<MATCH_COUNT>> match(pcre_input& pi,
int options
= 0) const
{
pcre_context_static<MATCH_COUNT> pc;
if (this->match(pc, pi, options)) {
return pc;
}
return nonstd::nullopt;
}
2022-03-16 23:38:08 +01:00
std::string replace(const char* str, const char* repl) const;
size_t match_partial(pcre_input& pi) const;
2018-10-17 16:03:33 +02:00
// #undef PCRE_STUDY_JIT_COMPILE
2013-06-12 06:28:03 +02:00
#ifdef PCRE_STUDY_JIT_COMPILE
2022-03-16 23:38:08 +01:00
static pcre_jit_stack* jit_stack();
2013-06-16 03:07:50 +02:00
2013-06-12 06:28:03 +02:00
#else
2022-03-16 23:38:08 +01:00
static void pcre_free_study(pcre_extra*);
2013-06-12 06:28:03 +02:00
#endif
2020-05-07 16:08:59 +02:00
void study();
2022-03-16 23:38:08 +01:00
void find_captures(const char* pattern);
2022-03-16 23:38:08 +01:00
pcre* p_code{nullptr};
std::string p_pattern;
2009-09-14 03:07:32 +02:00
auto_mem<pcre_extra> p_code_extra;
int p_capture_count{0};
int p_named_count{0};
int p_name_len{0};
unsigned long p_options{0};
2022-03-16 23:38:08 +01:00
pcre_named_capture* p_named_entries{nullptr};
std::vector<pcre_context::capture_t> p_captures;
};
/**
 * A pcrepp whose compile options are fixed by the template argument.  The
 * constructor arguments are handed to a pcrepp constructor with the options
 * value appended as the final argument.
 */
template<int options = 0>
class pcrepp_with_options : public pcrepp {
public:
    template<typename... CtorArgs>
    pcrepp_with_options(CtorArgs... ctor_args) : pcrepp(ctor_args..., options)
    {
    }
};
2009-09-14 03:07:32 +02:00
#endif