/** * Copyright (c) 2015, Timothy Stack * * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * * Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * Neither the name of Timothy Stack nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ''AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "config.h" #include #include #include #include "data_scanner.hh" bool data_scanner::tokenize2(pcre_context &pc, data_token_t &token_out) { # define YYCTYPE unsigned char # define CAPTURE(tok) { \ if (YYCURSOR.val == EMPTY) { \ pi.pi_next_offset = pi.pi_length; \ } else { \ pi.pi_next_offset = YYCURSOR.val - (const unsigned char *) pi.get_string(); \ } \ cap[0].c_end = pi.pi_next_offset; \ cap[1].c_end = pi.pi_next_offset; \ token_out = tok; \ } # define RET(tok) { \ CAPTURE(tok); \ return true; \ } static const unsigned char *EMPTY = (const unsigned char *) ""; pcre_input &pi = this->ds_pcre_input; struct _YYCURSOR { YYCTYPE operator*() const { if (this->val < this->lim) { return *val; } return '\0'; } operator const YYCTYPE *() const { if (this->val < this->lim) { return this->val; } return EMPTY; } const YYCTYPE *operator=(const YYCTYPE *rhs) { this->val = rhs; return rhs; } const YYCTYPE *operator+(int rhs) { return this->val + rhs; } const _YYCURSOR *operator-=(int rhs) { this->val -= rhs; return this; } _YYCURSOR& operator++() { this->val += 1; return *this; } const YYCTYPE *val{nullptr}; const YYCTYPE *lim{nullptr}; } YYCURSOR; YYCURSOR = (const unsigned char *) pi.get_string() + pi.pi_next_offset; _YYCURSOR yyt1; _YYCURSOR yyt2; _YYCURSOR yyt3; _YYCURSOR yyt4; const YYCTYPE *YYLIMIT = (const unsigned char *) pi.get_string() + pi.pi_length; const YYCTYPE *YYMARKER = YYCURSOR; pcre_context::capture_t *cap = pc.all(); YYCURSOR.lim = YYLIMIT; pc.set_count(2); cap[0].c_begin = pi.pi_next_offset; cap[0].c_end = pi.pi_next_offset; cap[1].c_begin = pi.pi_next_offset; cap[1].c_end = pi.pi_next_offset; /*!re2c re2c:yyfill:enable = 0; re2c:flags:tags = 1; SPACE = [ \t\r]; ALPHA = [a-zA-Z]; NUM = [0-9]; ALPHANUM = [a-zA-Z0-9_]; EOF = "\x00"; IPV4SEG = ("25"[0-5]|("2"[0-4]|"1"{0,1}[0-9]){0,1}[0-9]); IPV4ADDR = (IPV4SEG"."){3,3}IPV4SEG; IPV6SEG = [0-9a-fA-F]{1,4}; IPV6ADDR = ( (IPV6SEG":"){7,7}IPV6SEG| (IPV6SEG":"){1,7}":"| (IPV6SEG":"){1,6}":"IPV6SEG| (IPV6SEG":"){1,5}(":"IPV6SEG){1,2}| (IPV6SEG":"){1,4}(":"IPV6SEG){1,3}| (IPV6SEG":"){1,3}(":"IPV6SEG){1,4}| (IPV6SEG":"){1,2}(":"IPV6SEG){1,5}| IPV6SEG":"((":"IPV6SEG){1,6})| ":"((":"IPV6SEG){1,7}|":")| [a-fA-F0-9]{4}":"(":"IPV6SEG){0,4}"%"[0-9a-zA-Z]{1,}| "::"('ffff'(":0"{1,4}){0,1}":"){0,1}IPV4ADDR| (IPV6SEG":"){1,4}":"IPV4ADDR ); EOF { return false; } ("u"|"r")?'"'('\\'.|[^\x00\"\\]|'""')*'"' { CAPTURE(DT_QUOTED_STRING); switch (pi.get_string()[cap[1].c_begin]) { case 'u': case 'r': cap[1].c_begin += 1; break; } cap[1].c_begin += 1; cap[1].c_end -= 1; return true; } [a-qstv-zA-QSTV-Z]"'" { CAPTURE(DT_WORD); } ("u"|"r")?"'"('\\'.|"''"|[^\x00\'\\])*"'"/[^sS] { CAPTURE(DT_QUOTED_STRING); switch (pi.get_string()[cap[1].c_begin]) { case 'u': case 'r': cap[1].c_begin += 1; break; } cap[1].c_begin += 1; cap[1].c_end -= 1; return true; } [a-zA-Z0-9]+"://"[^\x00\r\n\t '"\[\](){}]+[/a-zA-Z0-9\-=&?%] { RET(DT_URL); } ("/"|"./"|"../")[a-zA-Z0-9_\.\-\~/!@#$%^&*()]* { RET(DT_PATH); } (SPACE|NUM)NUM":"NUM{2}/[^:] { RET(DT_TIME); } (SPACE|NUM)NUM?":"NUM{2}":"NUM{2}("."NUM{3,6})?/[^:] { RET(DT_TIME); } [0-9a-fA-F][0-9a-fA-F](":"[0-9a-fA-F][0-9a-fA-F])+ { if ((YYCURSOR - (const unsigned char *) pi.get_string()) == 17) { RET(DT_MAC_ADDRESS); } else { RET(DT_HEX_DUMP); } } (NUM{4}"/"NUM{1,2}"/"NUM{1,2}|NUM{4}"-"NUM{1,2}"-"NUM{1,2}|NUM{2}"/"ALPHA{3}"/"NUM{4})"T"? { RET(DT_DATE); } IPV6ADDR/[^:a-zA-Z0-9] { RET(DT_IPV6_ADDRESS); } "<""?"?[a-zA-Z0-9_:\-]+SPACE*([a-zA-Z0-9_:\-]+(SPACE*'='SPACE*('"'(('\\'.|[^\x00"\\])+)'"'|"'"(('\\'.|[^\x00'\\])+)"'"|[^\x00>]+)))*SPACE*("/"|"?")">" { RET(DT_XML_EMPTY_TAG); } "<"[a-zA-Z0-9_:\-]+SPACE*([a-zA-Z0-9_:\-]+(SPACE*"="SPACE*('"'(('\\'.|[^\x00"\\])+)'"'|"'"(('\\'.|[^\x00'\\])+)"'"|[^\x00>]+)))*SPACE*">" { RET(DT_XML_OPEN_TAG); } "" { RET(DT_XML_CLOSE_TAG); } ":" { RET(DT_COLON); } "=" { RET(DT_EQUALS); } "," { RET(DT_COMMA); } ";" { RET(DT_SEMI); } "()" | "{}" | "[]" { RET(DT_EMPTY_CONTAINER); } "{" { RET(DT_LCURLY); } "}" { RET(DT_RCURLY); } "[" { RET(DT_LSQUARE); } "]" { RET(DT_RSQUARE); } "(" { RET(DT_LPAREN); } ")" { RET(DT_RPAREN); } "<" { RET(DT_LANGLE); } ">" { RET(DT_RANGLE); } IPV4ADDR/[^0-9] { RET(DT_IPV4_ADDRESS); } [0-9a-fA-F]{8}("-"[0-9a-fA-F]{4}){3}"-"[0-9a-fA-F]{12} { RET(DT_UUID); } [0-9]"."[0-9]+'e'[\-\+][0-9]+ { RET(DT_NUMBER); } [0-9]+("."[0-9]+[a-zA-Z0-9_]*){2,}("-"[a-zA-Z0-9_]+)?|[0-9]+("."[0-9]+[a-zA-Z0-9_]*)+"-"[a-zA-Z0-9_]+ { RET(DT_VERSION_NUMBER); } "-"?"0"[0-7]+ { RET(DT_OCTAL_NUMBER); } "-"?[0-9]+("."[0-9]+)?[ ]*"%" { RET(DT_PERCENTAGE); } "-"?[0-9]+("."[0-9]+)?([eE][\-+][0-9]+)? { RET(DT_NUMBER); } "-"?("0x"|[0-9])[0-9a-fA-F]+ { RET(DT_HEX_NUMBER); } [a-zA-Z0-9\._%+-]+"@"[a-zA-Z0-9\.-]+"."[a-zA-Z]+ { RET(DT_EMAIL); } "true"|"True"|"TRUE"|"false"|"False"|"FALSE"|"None"|"null"|"NULL"/([\r\n\t \(\)!\*:;'\"\?,]|[\.\!,\?]SPACE|EOF) { RET(DT_CONSTANT); } ("re-")?[a-zA-Z][a-z']+/([\r\n\t \(\)!\*:;'\"\?,]|[\.\!,\?]SPACE|EOF) { RET(DT_WORD); } [^\x00"; \t\r\n:=,\(\)\{\}\[\]\+#!%\^&\*'\?<>\~`\|\\]+("::"[^\x00"; \r\n\t:=,\(\)\{\}\[\]\+#!%\^&\*'\?<>\~`\|\\]+)* { RET(DT_SYMBOL); } ("\r"?"\n"|"\\n") { RET(DT_LINE); } SPACE+ { RET(DT_WHITE); } "." { RET(DT_DOT); } . { RET(DT_GARBAGE); } */ }