package syntax import ( "fmt" "math" "os" "sort" "strconv" "unicode" ) type RegexOptions int32 const ( IgnoreCase RegexOptions = 0x0001 // "i" Multiline = 0x0002 // "m" ExplicitCapture = 0x0004 // "n" Compiled = 0x0008 // "c" Singleline = 0x0010 // "s" IgnorePatternWhitespace = 0x0020 // "x" RightToLeft = 0x0040 // "r" Debug = 0x0080 // "d" ECMAScript = 0x0100 // "e" ) func optionFromCode(ch rune) RegexOptions { // case-insensitive switch ch { case 'i', 'I': return IgnoreCase case 'r', 'R': return RightToLeft case 'm', 'M': return Multiline case 'n', 'N': return ExplicitCapture case 's', 'S': return Singleline case 'x', 'X': return IgnorePatternWhitespace case 'd', 'D': return Debug case 'e', 'E': return ECMAScript default: return 0 } } // An Error describes a failure to parse a regular expression // and gives the offending expression. type Error struct { Code ErrorCode Expr string Args []interface{} } func (e *Error) Error() string { if len(e.Args) == 0 { return "error parsing regexp: " + e.Code.String() + " in `" + e.Expr + "`" } return "error parsing regexp: " + fmt.Sprintf(e.Code.String(), e.Args...) + " in `" + e.Expr + "`" } // An ErrorCode describes a failure to parse a regular expression. 
type ErrorCode string const ( // internal issue ErrInternalError ErrorCode = "regexp/syntax: internal error" // Parser errors ErrUnterminatedComment = "unterminated comment" ErrInvalidCharRange = "invalid character class range" ErrInvalidRepeatSize = "invalid repeat count" ErrInvalidUTF8 = "invalid UTF-8" ErrCaptureGroupOutOfRange = "capture group number out of range" ErrUnexpectedParen = "unexpected )" ErrMissingParen = "missing closing )" ErrMissingBrace = "missing closing }" ErrInvalidRepeatOp = "invalid nested repetition operator" ErrMissingRepeatArgument = "missing argument to repetition operator" ErrConditionalExpression = "illegal conditional (?(...)) expression" ErrTooManyAlternates = "too many | in (?()|)" ErrUnrecognizedGrouping = "unrecognized grouping construct: (%v" ErrInvalidGroupName = "invalid group name: group names must begin with a word character and have a matching terminator" ErrCapNumNotZero = "capture number cannot be zero" ErrUndefinedBackRef = "reference to undefined group number %v" ErrUndefinedNameRef = "reference to undefined group name %v" ErrAlternationCantCapture = "alternation conditions do not capture and cannot be named" ErrAlternationCantHaveComment = "alternation conditions cannot be comments" ErrMalformedReference = "(?(%v) ) malformed" ErrUndefinedReference = "(?(%v) ) reference to undefined group" ErrIllegalEndEscape = "illegal \\ at end of pattern" ErrMalformedSlashP = "malformed \\p{X} character escape" ErrIncompleteSlashP = "incomplete \\p{X} character escape" ErrUnknownSlashP = "unknown unicode category, script, or property '%v'" ErrUnrecognizedEscape = "unrecognized escape sequence \\%v" ErrMissingControl = "missing control character" ErrUnrecognizedControl = "unrecognized control character" ErrTooFewHex = "insufficient hexadecimal digits" ErrInvalidHex = "hex values may not be larger than 0x10FFFF" ErrMalformedNameRef = "malformed \\k<...> named back reference" ErrBadClassInCharRange = "cannot include class \\%v in 
character range" ErrUnterminatedBracket = "unterminated [] set" ErrSubtractionMustBeLast = "a subtraction must be the last element in a character class" ErrReversedCharRange = "[x-y] range in reverse order" ) func (e ErrorCode) String() string { return string(e) } type parser struct { stack *regexNode group *regexNode alternation *regexNode concatenation *regexNode unit *regexNode patternRaw string pattern []rune currentPos int specialCase *unicode.SpecialCase autocap int capcount int captop int capsize int caps map[int]int capnames map[string]int capnumlist []int capnamelist []string options RegexOptions optionsStack []RegexOptions ignoreNextParen bool } const ( maxValueDiv10 int = math.MaxInt32 / 10 maxValueMod10 = math.MaxInt32 % 10 ) // Parse converts a regex string into a parse tree func Parse(re string, op RegexOptions) (*RegexTree, error) { p := parser{ options: op, caps: make(map[int]int), } p.setPattern(re) if err := p.countCaptures(); err != nil { return nil, err } p.reset(op) root, err := p.scanRegex() if err != nil { return nil, err } tree := &RegexTree{ root: root, caps: p.caps, capnumlist: p.capnumlist, captop: p.captop, Capnames: p.capnames, Caplist: p.capnamelist, options: op, } if tree.options&Debug > 0 { os.Stdout.WriteString(tree.Dump()) } return tree, nil } func (p *parser) setPattern(pattern string) { p.patternRaw = pattern p.pattern = make([]rune, 0, len(pattern)) //populate our rune array to handle utf8 encoding for _, r := range pattern { p.pattern = append(p.pattern, r) } } func (p *parser) getErr(code ErrorCode, args ...interface{}) error { return &Error{Code: code, Expr: p.patternRaw, Args: args} } func (p *parser) noteCaptureSlot(i, pos int) { if _, ok := p.caps[i]; !ok { // the rhs of the hashtable isn't used in the parser p.caps[i] = pos p.capcount++ if p.captop <= i { if i == math.MaxInt32 { p.captop = i } else { p.captop = i + 1 } } } } func (p *parser) noteCaptureName(name string, pos int) { if p.capnames == nil { p.capnames = 
make(map[string]int) } if _, ok := p.capnames[name]; !ok { p.capnames[name] = pos p.capnamelist = append(p.capnamelist, name) } } func (p *parser) assignNameSlots() { if p.capnames != nil { for _, name := range p.capnamelist { for p.isCaptureSlot(p.autocap) { p.autocap++ } pos := p.capnames[name] p.capnames[name] = p.autocap p.noteCaptureSlot(p.autocap, pos) p.autocap++ } } // if the caps array has at least one gap, construct the list of used slots if p.capcount < p.captop { p.capnumlist = make([]int, p.capcount) i := 0 for k := range p.caps { p.capnumlist[i] = k i++ } sort.Ints(p.capnumlist) } // merge capsnumlist into capnamelist if p.capnames != nil || p.capnumlist != nil { var oldcapnamelist []string var next int var k int if p.capnames == nil { oldcapnamelist = nil p.capnames = make(map[string]int) p.capnamelist = []string{} next = -1 } else { oldcapnamelist = p.capnamelist p.capnamelist = []string{} next = p.capnames[oldcapnamelist[0]] } for i := 0; i < p.capcount; i++ { j := i if p.capnumlist != nil { j = p.capnumlist[i] } if next == j { p.capnamelist = append(p.capnamelist, oldcapnamelist[k]) k++ if k == len(oldcapnamelist) { next = -1 } else { next = p.capnames[oldcapnamelist[k]] } } else { //feature: culture? str := strconv.Itoa(j) p.capnamelist = append(p.capnamelist, str) p.capnames[str] = j } } } } func (p *parser) consumeAutocap() int { r := p.autocap p.autocap++ return r } // CountCaptures is a prescanner for deducing the slots used for // captures by doing a partial tokenization of the pattern. func (p *parser) countCaptures() error { var ch rune p.noteCaptureSlot(0, 0) p.autocap = 1 for p.charsRight() > 0 { pos := p.textpos() ch = p.moveRightGetChar() switch ch { case '\\': if p.charsRight() > 0 { p.moveRight(1) } case '#': if p.useOptionX() { p.moveLeft() p.scanBlank() } case '[': p.scanCharSet(false, true) case ')': if !p.emptyOptionsStack() { p.popOptions() } case '(': if p.charsRight() >= 2 && p.rightChar(1) == '#' && p.rightChar(0) == '?' 
{ p.moveLeft() p.scanBlank() } else { p.pushOptions() if p.charsRight() > 0 && p.rightChar(0) == '?' { // we have (?... p.moveRight(1) if p.charsRight() > 1 && (p.rightChar(0) == '<' || p.rightChar(0) == '\'') { // named group: (?<... or (?'... p.moveRight(1) ch = p.rightChar(0) if ch != '0' && IsWordChar(ch) { if ch >= '1' && ch <= '9' { dec, err := p.scanDecimal() if err != nil { return err } p.noteCaptureSlot(dec, pos) } else { p.noteCaptureName(p.scanCapname(), pos) } } } else { // (?... // get the options if it's an option construct (?cimsx-cimsx...) p.scanOptions() if p.charsRight() > 0 { if p.rightChar(0) == ')' { // (?cimsx-cimsx) p.moveRight(1) p.popKeepOptions() } else if p.rightChar(0) == '(' { // alternation construct: (?(foo)yes|no) // ignore the next paren so we don't capture the condition p.ignoreNextParen = true // break from here so we don't reset ignoreNextParen continue } } } } else { if !p.useOptionN() && !p.ignoreNextParen { p.noteCaptureSlot(p.consumeAutocap(), pos) } } } p.ignoreNextParen = false } } p.assignNameSlots() return nil } func (p *parser) reset(topopts RegexOptions) { p.currentPos = 0 p.autocap = 1 p.ignoreNextParen = false if len(p.optionsStack) > 0 { p.optionsStack = p.optionsStack[:0] } p.options = topopts p.stack = nil } func (p *parser) scanRegex() (*regexNode, error) { ch := '@' // nonspecial ch, means at beginning isQuant := false p.startGroup(newRegexNodeMN(ntCapture, p.options, 0, -1)) for p.charsRight() > 0 { wasPrevQuantifier := isQuant isQuant = false if err := p.scanBlank(); err != nil { return nil, err } startpos := p.textpos() // move past all of the normal characters. We'll stop when we hit some kind of control character, // or if IgnorePatternWhiteSpace is on, we'll stop when we see some whitespace. 
if p.useOptionX() { for p.charsRight() > 0 { ch = p.rightChar(0) //UGLY: clean up, this is ugly if !(!isStopperX(ch) || (ch == '{' && !p.isTrueQuantifier())) { break } p.moveRight(1) } } else { for p.charsRight() > 0 { ch = p.rightChar(0) if !(!isSpecial(ch) || ch == '{' && !p.isTrueQuantifier()) { break } p.moveRight(1) } } endpos := p.textpos() p.scanBlank() if p.charsRight() == 0 { ch = '!' // nonspecial, means at end } else if ch = p.rightChar(0); isSpecial(ch) { isQuant = isQuantifier(ch) p.moveRight(1) } else { ch = ' ' // nonspecial, means at ordinary char } if startpos < endpos { cchUnquantified := endpos - startpos if isQuant { cchUnquantified-- } wasPrevQuantifier = false if cchUnquantified > 0 { p.addToConcatenate(startpos, cchUnquantified, false) } if isQuant { p.addUnitOne(p.charAt(endpos - 1)) } } switch ch { case '!': goto BreakOuterScan case ' ': goto ContinueOuterScan case '[': cc, err := p.scanCharSet(p.useOptionI(), false) if err != nil { return nil, err } p.addUnitSet(cc) case '(': p.pushOptions() if grouper, err := p.scanGroupOpen(); err != nil { return nil, err } else if grouper == nil { p.popKeepOptions() } else { p.pushGroup() p.startGroup(grouper) } continue case '|': p.addAlternate() goto ContinueOuterScan case ')': if p.emptyStack() { return nil, p.getErr(ErrUnexpectedParen) } if err := p.addGroup(); err != nil { return nil, err } if err := p.popGroup(); err != nil { return nil, err } p.popOptions() if p.unit == nil { goto ContinueOuterScan } case '\\': n, err := p.scanBackslash() if err != nil { return nil, err } p.addUnitNode(n) case '^': if p.useOptionM() { p.addUnitType(ntBol) } else { p.addUnitType(ntBeginning) } case '$': if p.useOptionM() { p.addUnitType(ntEol) } else { p.addUnitType(ntEndZ) } case '.': if p.useOptionE() { p.addUnitSet(ECMAAnyClass()) } else if p.useOptionS() { p.addUnitSet(AnyClass()) } else { p.addUnitNotone('\n') } case '{', '*', '+', '?': if p.unit == nil { if wasPrevQuantifier { return nil, 
p.getErr(ErrInvalidRepeatOp) } else { return nil, p.getErr(ErrMissingRepeatArgument) } } p.moveLeft() default: return nil, p.getErr(ErrInternalError) } if err := p.scanBlank(); err != nil { return nil, err } if p.charsRight() > 0 { isQuant = p.isTrueQuantifier() } if p.charsRight() == 0 || !isQuant { //maintain odd C# assignment order -- not sure if required, could clean up? p.addConcatenate() goto ContinueOuterScan } ch = p.moveRightGetChar() // Handle quantifiers for p.unit != nil { var min, max int var lazy bool switch ch { case '*': min = 0 max = math.MaxInt32 case '?': min = 0 max = 1 case '+': min = 1 max = math.MaxInt32 case '{': { var err error startpos = p.textpos() if min, err = p.scanDecimal(); err != nil { return nil, err } max = min if startpos < p.textpos() { if p.charsRight() > 0 && p.rightChar(0) == ',' { p.moveRight(1) if p.charsRight() == 0 || p.rightChar(0) == '}' { max = math.MaxInt32 } else { if max, err = p.scanDecimal(); err != nil { return nil, err } } } } if startpos == p.textpos() || p.charsRight() == 0 || p.moveRightGetChar() != '}' { p.addConcatenate() p.textto(startpos - 1) goto ContinueOuterScan } } default: return nil, p.getErr(ErrInternalError) } if err := p.scanBlank(); err != nil { return nil, err } if p.charsRight() == 0 || p.rightChar(0) != '?' 
{ lazy = false } else { p.moveRight(1) lazy = true } if min > max { return nil, p.getErr(ErrInvalidRepeatSize) } p.addConcatenate3(lazy, min, max) } ContinueOuterScan: } BreakOuterScan: ; if !p.emptyStack() { return nil, p.getErr(ErrMissingParen) } if err := p.addGroup(); err != nil { return nil, err } return p.unit, nil } /* * Simple parsing for replacement patterns */ func (p *parser) scanReplacement() (*regexNode, error) { var c, startpos int p.concatenation = newRegexNode(ntConcatenate, p.options) for { c = p.charsRight() if c == 0 { break } startpos = p.textpos() for c > 0 && p.rightChar(0) != '$' { p.moveRight(1) c-- } p.addToConcatenate(startpos, p.textpos()-startpos, true) if c > 0 { if p.moveRightGetChar() == '$' { n, err := p.scanDollar() if err != nil { return nil, err } p.addUnitNode(n) } p.addConcatenate() } } return p.concatenation, nil } /* * Scans $ patterns recognized within replacement patterns */ func (p *parser) scanDollar() (*regexNode, error) { if p.charsRight() == 0 { return newRegexNodeCh(ntOne, p.options, '$'), nil } ch := p.rightChar(0) angled := false backpos := p.textpos() lastEndPos := backpos // Note angle if ch == '{' && p.charsRight() > 1 { angled = true p.moveRight(1) ch = p.rightChar(0) } // Try to parse backreference: \1 or \{1} or \{cap} if ch >= '0' && ch <= '9' { if !angled && p.useOptionE() { capnum := -1 newcapnum := int(ch - '0') p.moveRight(1) if p.isCaptureSlot(newcapnum) { capnum = newcapnum lastEndPos = p.textpos() } for p.charsRight() > 0 { ch = p.rightChar(0) if ch < '0' || ch > '9' { break } digit := int(ch - '0') if newcapnum > maxValueDiv10 || (newcapnum == maxValueDiv10 && digit > maxValueMod10) { return nil, p.getErr(ErrCaptureGroupOutOfRange) } newcapnum = newcapnum*10 + digit p.moveRight(1) if p.isCaptureSlot(newcapnum) { capnum = newcapnum lastEndPos = p.textpos() } } p.textto(lastEndPos) if capnum >= 0 { return newRegexNodeM(ntRef, p.options, capnum), nil } } else { capnum, err := p.scanDecimal() if err != nil 
{ return nil, err } if !angled || p.charsRight() > 0 && p.moveRightGetChar() == '}' { if p.isCaptureSlot(capnum) { return newRegexNodeM(ntRef, p.options, capnum), nil } } } } else if angled && IsWordChar(ch) { capname := p.scanCapname() if p.charsRight() > 0 && p.moveRightGetChar() == '}' { if p.isCaptureName(capname) { return newRegexNodeM(ntRef, p.options, p.captureSlotFromName(capname)), nil } } } else if !angled { capnum := 1 switch ch { case '$': p.moveRight(1) return newRegexNodeCh(ntOne, p.options, '$'), nil case '&': capnum = 0 case '`': capnum = replaceLeftPortion case '\'': capnum = replaceRightPortion case '+': capnum = replaceLastGroup case '_': capnum = replaceWholeString } if capnum != 1 { p.moveRight(1) return newRegexNodeM(ntRef, p.options, capnum), nil } } // unrecognized $: literalize p.textto(backpos) return newRegexNodeCh(ntOne, p.options, '$'), nil } // scanGroupOpen scans chars following a '(' (not counting the '('), and returns // a RegexNode for the type of group scanned, or nil if the group // simply changed options (?cimsx-cimsx) or was a comment (#...). func (p *parser) scanGroupOpen() (*regexNode, error) { var ch rune var nt nodeType var err error close := '>' start := p.textpos() // just return a RegexNode if we have: // 1. "(" followed by nothing // 2. "(x" where x != ? // 3. "(?)" if p.charsRight() == 0 || p.rightChar(0) != '?' || (p.rightChar(0) == '?' 
&& (p.charsRight() > 1 && p.rightChar(1) == ')')) { if p.useOptionN() || p.ignoreNextParen { p.ignoreNextParen = false return newRegexNode(ntGroup, p.options), nil } return newRegexNodeMN(ntCapture, p.options, p.consumeAutocap(), -1), nil } p.moveRight(1) for { if p.charsRight() == 0 { break } switch ch = p.moveRightGetChar(); ch { case ':': nt = ntGroup case '=': p.options &= ^RightToLeft nt = ntRequire case '!': p.options &= ^RightToLeft nt = ntPrevent case '>': nt = ntGreedy case '\'': close = '\'' fallthrough case '<': if p.charsRight() == 0 { goto BreakRecognize } switch ch = p.moveRightGetChar(); ch { case '=': if close == '\'' { goto BreakRecognize } p.options |= RightToLeft nt = ntRequire case '!': if close == '\'' { goto BreakRecognize } p.options |= RightToLeft nt = ntPrevent default: p.moveLeft() capnum := -1 uncapnum := -1 proceed := false // grab part before - if ch >= '0' && ch <= '9' { if capnum, err = p.scanDecimal(); err != nil { return nil, err } if !p.isCaptureSlot(capnum) { capnum = -1 } // check if we have bogus characters after the number if p.charsRight() > 0 && !(p.rightChar(0) == close || p.rightChar(0) == '-') { return nil, p.getErr(ErrInvalidGroupName) } if capnum == 0 { return nil, p.getErr(ErrCapNumNotZero) } } else if IsWordChar(ch) { capname := p.scanCapname() if p.isCaptureName(capname) { capnum = p.captureSlotFromName(capname) } // check if we have bogus character after the name if p.charsRight() > 0 && !(p.rightChar(0) == close || p.rightChar(0) == '-') { return nil, p.getErr(ErrInvalidGroupName) } } else if ch == '-' { proceed = true } else { // bad group name - starts with something other than a word character and isn't a number return nil, p.getErr(ErrInvalidGroupName) } // grab part after - if any if (capnum != -1 || proceed == true) && p.charsRight() > 0 && p.rightChar(0) == '-' { p.moveRight(1) //no more chars left, no closing char, etc if p.charsRight() == 0 { return nil, p.getErr(ErrInvalidGroupName) } ch = p.rightChar(0) 
if ch >= '0' && ch <= '9' { if uncapnum, err = p.scanDecimal(); err != nil { return nil, err } if !p.isCaptureSlot(uncapnum) { return nil, p.getErr(ErrUndefinedBackRef, uncapnum) } // check if we have bogus characters after the number if p.charsRight() > 0 && p.rightChar(0) != close { return nil, p.getErr(ErrInvalidGroupName) } } else if IsWordChar(ch) { uncapname := p.scanCapname() if !p.isCaptureName(uncapname) { return nil, p.getErr(ErrUndefinedNameRef, uncapname) } uncapnum = p.captureSlotFromName(uncapname) // check if we have bogus character after the name if p.charsRight() > 0 && p.rightChar(0) != close { return nil, p.getErr(ErrInvalidGroupName) } } else { // bad group name - starts with something other than a word character and isn't a number return nil, p.getErr(ErrInvalidGroupName) } } // actually make the node if (capnum != -1 || uncapnum != -1) && p.charsRight() > 0 && p.moveRightGetChar() == close { return newRegexNodeMN(ntCapture, p.options, capnum, uncapnum), nil } goto BreakRecognize } case '(': // alternation construct (?(...) 
| ) parenPos := p.textpos() if p.charsRight() > 0 { ch = p.rightChar(0) // check if the alternation condition is a backref if ch >= '0' && ch <= '9' { var capnum int if capnum, err = p.scanDecimal(); err != nil { return nil, err } if p.charsRight() > 0 && p.moveRightGetChar() == ')' { if p.isCaptureSlot(capnum) { return newRegexNodeM(ntTestref, p.options, capnum), nil } return nil, p.getErr(ErrUndefinedReference, capnum) } return nil, p.getErr(ErrMalformedReference, capnum) } else if IsWordChar(ch) { capname := p.scanCapname() if p.isCaptureName(capname) && p.charsRight() > 0 && p.moveRightGetChar() == ')' { return newRegexNodeM(ntTestref, p.options, p.captureSlotFromName(capname)), nil } } } // not a backref nt = ntTestgroup p.textto(parenPos - 1) // jump to the start of the parentheses p.ignoreNextParen = true // but make sure we don't try to capture the insides charsRight := p.charsRight() if charsRight >= 3 && p.rightChar(1) == '?' { rightchar2 := p.rightChar(2) // disallow comments in the condition if rightchar2 == '#' { return nil, p.getErr(ErrAlternationCantHaveComment) } // disallow named capture group (?<..>..) in the condition if rightchar2 == '\'' { return nil, p.getErr(ErrAlternationCantCapture) } if charsRight >= 4 && (rightchar2 == '<' && p.rightChar(3) != '!' 
&& p.rightChar(3) != '=') { return nil, p.getErr(ErrAlternationCantCapture) } } default: p.moveLeft() nt = ntGroup // disallow options in the children of a testgroup node if p.group.t != ntTestgroup { p.scanOptions() } if p.charsRight() == 0 { goto BreakRecognize } if ch = p.moveRightGetChar(); ch == ')' { return nil, nil } if ch != ':' { goto BreakRecognize } } return newRegexNode(nt, p.options), nil } BreakRecognize: // break Recognize comes here return nil, p.getErr(ErrUnrecognizedGrouping, string(p.pattern[start:p.textpos()])) } // scans backslash specials and basics func (p *parser) scanBackslash() (*regexNode, error) { if p.charsRight() == 0 { return nil, p.getErr(ErrIllegalEndEscape) } switch ch := p.rightChar(0); ch { case 'b', 'B', 'A', 'G', 'Z', 'z': p.moveRight(1) return newRegexNode(p.typeFromCode(ch), p.options), nil case 'w': p.moveRight(1) if p.useOptionE() { return newRegexNodeSet(ntSet, p.options, ECMAWordClass()), nil } return newRegexNodeSet(ntSet, p.options, WordClass()), nil case 'W': p.moveRight(1) if p.useOptionE() { return newRegexNodeSet(ntSet, p.options, NotECMAWordClass()), nil } return newRegexNodeSet(ntSet, p.options, NotWordClass()), nil case 's': p.moveRight(1) if p.useOptionE() { return newRegexNodeSet(ntSet, p.options, ECMASpaceClass()), nil } return newRegexNodeSet(ntSet, p.options, SpaceClass()), nil case 'S': p.moveRight(1) if p.useOptionE() { return newRegexNodeSet(ntSet, p.options, NotECMASpaceClass()), nil } return newRegexNodeSet(ntSet, p.options, NotSpaceClass()), nil case 'd': p.moveRight(1) if p.useOptionE() { return newRegexNodeSet(ntSet, p.options, ECMADigitClass()), nil } return newRegexNodeSet(ntSet, p.options, DigitClass()), nil case 'D': p.moveRight(1) if p.useOptionE() { return newRegexNodeSet(ntSet, p.options, NotECMADigitClass()), nil } return newRegexNodeSet(ntSet, p.options, NotDigitClass()), nil case 'p', 'P': p.moveRight(1) prop, err := p.parseProperty() if err != nil { return nil, err } cc := &CharSet{} 
cc.addCategory(prop, (ch != 'p'), p.useOptionI(), p.patternRaw) if p.useOptionI() { cc.addLowercase() } return newRegexNodeSet(ntSet, p.options, cc), nil default: return p.scanBasicBackslash() } } // Scans \-style backreferences and character escapes func (p *parser) scanBasicBackslash() (*regexNode, error) { if p.charsRight() == 0 { return nil, p.getErr(ErrIllegalEndEscape) } angled := false close := '\x00' backpos := p.textpos() ch := p.rightChar(0) // allow \k instead of \, which is now deprecated if ch == 'k' { if p.charsRight() >= 2 { p.moveRight(1) ch = p.moveRightGetChar() if ch == '<' || ch == '\'' { angled = true if ch == '\'' { close = '\'' } else { close = '>' } } } if !angled || p.charsRight() <= 0 { return nil, p.getErr(ErrMalformedNameRef) } ch = p.rightChar(0) } else if (ch == '<' || ch == '\'') && p.charsRight() > 1 { // Note angle without \g angled = true if ch == '\'' { close = '\'' } else { close = '>' } p.moveRight(1) ch = p.rightChar(0) } // Try to parse backreference: \<1> or \ if angled && ch >= '0' && ch <= '9' { capnum, err := p.scanDecimal() if err != nil { return nil, err } if p.charsRight() > 0 && p.moveRightGetChar() == close { if p.isCaptureSlot(capnum) { return newRegexNodeM(ntRef, p.options, capnum), nil } else { return nil, p.getErr(ErrUndefinedBackRef, capnum) } } } else if !angled && ch >= '1' && ch <= '9' { // Try to parse backreference or octal: \1 capnum, err := p.scanDecimal() if err != nil { return nil, err } if p.useOptionE() || p.isCaptureSlot(capnum) { return newRegexNodeM(ntRef, p.options, capnum), nil } if capnum <= 9 { return nil, p.getErr(ErrUndefinedBackRef, capnum) } } else if angled && IsWordChar(ch) { capname := p.scanCapname() if p.charsRight() > 0 && p.moveRightGetChar() == close { if p.isCaptureName(capname) { return newRegexNodeM(ntRef, p.options, p.captureSlotFromName(capname)), nil } return nil, p.getErr(ErrUndefinedNameRef, capname) } } // Not backreference: must be char code p.textto(backpos) ch, err := 
p.scanCharEscape() if err != nil { return nil, err } if p.useOptionI() { ch = unicode.ToLower(ch) } return newRegexNodeCh(ntOne, p.options, ch), nil } // Scans X for \p{X} or \P{X} func (p *parser) parseProperty() (string, error) { if p.charsRight() < 3 { return "", p.getErr(ErrIncompleteSlashP) } ch := p.moveRightGetChar() if ch != '{' { return "", p.getErr(ErrMalformedSlashP) } startpos := p.textpos() for p.charsRight() > 0 { ch = p.moveRightGetChar() if !(IsWordChar(ch) || ch == '-') { p.moveLeft() break } } capname := string(p.pattern[startpos:p.textpos()]) if p.charsRight() == 0 || p.moveRightGetChar() != '}' { return "", p.getErr(ErrIncompleteSlashP) } if !isValidUnicodeCat(capname) { return "", p.getErr(ErrUnknownSlashP, capname) } return capname, nil } // Returns ReNode type for zero-length assertions with a \ code. func (p *parser) typeFromCode(ch rune) nodeType { switch ch { case 'b': if p.useOptionE() { return ntECMABoundary } return ntBoundary case 'B': if p.useOptionE() { return ntNonECMABoundary } return ntNonboundary case 'A': return ntBeginning case 'G': return ntStart case 'Z': return ntEndZ case 'z': return ntEnd default: return ntNothing } } // Scans whitespace or x-mode comments. func (p *parser) scanBlank() error { if p.useOptionX() { for { for p.charsRight() > 0 && isSpace(p.rightChar(0)) { p.moveRight(1) } if p.charsRight() == 0 { break } if p.rightChar(0) == '#' { for p.charsRight() > 0 && p.rightChar(0) != '\n' { p.moveRight(1) } } else if p.charsRight() >= 3 && p.rightChar(2) == '#' && p.rightChar(1) == '?' && p.rightChar(0) == '(' { for p.charsRight() > 0 && p.rightChar(0) != ')' { p.moveRight(1) } if p.charsRight() == 0 { return p.getErr(ErrUnterminatedComment) } p.moveRight(1) } else { break } } } else { for { if p.charsRight() < 3 || p.rightChar(2) != '#' || p.rightChar(1) != '?' 
|| p.rightChar(0) != '(' { return nil } for p.charsRight() > 0 && p.rightChar(0) != ')' { p.moveRight(1) } if p.charsRight() == 0 { return p.getErr(ErrUnterminatedComment) } p.moveRight(1) } } return nil } func (p *parser) scanCapname() string { startpos := p.textpos() for p.charsRight() > 0 { if !IsWordChar(p.moveRightGetChar()) { p.moveLeft() break } } return string(p.pattern[startpos:p.textpos()]) } //Scans contents of [] (not including []'s), and converts to a set. func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) { ch := '\x00' chPrev := '\x00' inRange := false firstChar := true closed := false var cc *CharSet if !scanOnly { cc = &CharSet{} } if p.charsRight() > 0 && p.rightChar(0) == '^' { p.moveRight(1) if !scanOnly { cc.negate = true } } for ; p.charsRight() > 0; firstChar = false { fTranslatedChar := false ch = p.moveRightGetChar() if ch == ']' { if !firstChar { closed = true break } else if p.useOptionE() { if !scanOnly { cc.addRanges(NoneClass().ranges) } closed = true break } } else if ch == '\\' && p.charsRight() > 0 { switch ch = p.moveRightGetChar(); ch { case 'D', 'd': if !scanOnly { if inRange { return nil, p.getErr(ErrBadClassInCharRange, ch) } cc.addDigit(p.useOptionE(), ch == 'D', p.patternRaw) } continue case 'S', 's': if !scanOnly { if inRange { return nil, p.getErr(ErrBadClassInCharRange, ch) } cc.addSpace(p.useOptionE(), ch == 'S') } continue case 'W', 'w': if !scanOnly { if inRange { return nil, p.getErr(ErrBadClassInCharRange, ch) } cc.addWord(p.useOptionE(), ch == 'W') } continue case 'p', 'P': if !scanOnly { if inRange { return nil, p.getErr(ErrBadClassInCharRange, ch) } prop, err := p.parseProperty() if err != nil { return nil, err } cc.addCategory(prop, (ch != 'p'), caseInsensitive, p.patternRaw) } else { p.parseProperty() } continue case '-': if !scanOnly { cc.addRange(ch, ch) } continue default: p.moveLeft() var err error ch, err = p.scanCharEscape() // non-literal character if err != nil { return nil, 
err } fTranslatedChar = true break // this break will only break out of the switch } } else if ch == '[' { // This is code for Posix style properties - [:Ll:] or [:IsTibetan:]. // It currently doesn't do anything other than skip the whole thing! if p.charsRight() > 0 && p.rightChar(0) == ':' && !inRange { savePos := p.textpos() p.moveRight(1) p.scanCapname() // throwaway the name if p.charsRight() < 2 || p.moveRightGetChar() != ':' || p.moveRightGetChar() != ']' { p.textto(savePos) } // else lookup name (nyi) } } if inRange { inRange = false if !scanOnly { if ch == '[' && !fTranslatedChar && !firstChar { // We thought we were in a range, but we're actually starting a subtraction. // In that case, we'll add chPrev to our char class, skip the opening [, and // scan the new character class recursively. cc.addChar(chPrev) sub, err := p.scanCharSet(caseInsensitive, false) if err != nil { return nil, err } cc.addSubtraction(sub) if p.charsRight() > 0 && p.rightChar(0) != ']' { return nil, p.getErr(ErrSubtractionMustBeLast) } } else { // a regular range, like a-z if chPrev > ch { return nil, p.getErr(ErrReversedCharRange) } cc.addRange(chPrev, ch) } } } else if p.charsRight() >= 2 && p.rightChar(0) == '-' && p.rightChar(1) != ']' { // this could be the start of a range chPrev = ch inRange = true p.moveRight(1) } else if p.charsRight() >= 1 && ch == '-' && !fTranslatedChar && p.rightChar(0) == '[' && !firstChar { // we aren't in a range, and now there is a subtraction. 
Usually this happens // only when a subtraction follows a range, like [a-z-[b]] if !scanOnly { p.moveRight(1) sub, err := p.scanCharSet(caseInsensitive, false) if err != nil { return nil, err } cc.addSubtraction(sub) if p.charsRight() > 0 && p.rightChar(0) != ']' { return nil, p.getErr(ErrSubtractionMustBeLast) } } else { p.moveRight(1) p.scanCharSet(caseInsensitive, true) } } else { if !scanOnly { cc.addRange(ch, ch) } } } if !closed { return nil, p.getErr(ErrUnterminatedBracket) } if !scanOnly && caseInsensitive { cc.addLowercase() } return cc, nil } // Scans any number of decimal digits (pegs value at 2^31-1 if too large) func (p *parser) scanDecimal() (int, error) { i := 0 var d int for p.charsRight() > 0 { d = int(p.rightChar(0) - '0') if d < 0 || d > 9 { break } p.moveRight(1) if i > maxValueDiv10 || (i == maxValueDiv10 && d > maxValueMod10) { return 0, p.getErr(ErrCaptureGroupOutOfRange) } i *= 10 i += d } return int(i), nil } // Returns true for options allowed only at the top level func isOnlyTopOption(option RegexOptions) bool { return option == RightToLeft || option == ECMAScript } // Scans cimsx-cimsx option string, stops at the first unrecognized char. func (p *parser) scanOptions() { for off := false; p.charsRight() > 0; p.moveRight(1) { ch := p.rightChar(0) if ch == '-' { off = true } else if ch == '+' { off = false } else { option := optionFromCode(ch) if option == 0 || isOnlyTopOption(option) { return } if off { p.options &= ^option } else { p.options |= option } } } } // Scans \ code for escape codes that map to single unicode chars. 
func (p *parser) scanCharEscape() (rune, error) { ch := p.moveRightGetChar() if ch >= '0' && ch <= '7' { p.moveLeft() return p.scanOctal(), nil } switch ch { case 'x': // support for \x{HEX} syntax from Perl and PCRE if p.charsRight() > 0 && p.rightChar(0) == '{' { p.moveRight(1) return p.scanHexUntilBrace() } return p.scanHex(2) case 'u': return p.scanHex(4) case 'a': return '\u0007', nil case 'b': return '\b', nil case 'e': return '\u001B', nil case 'f': return '\f', nil case 'n': return '\n', nil case 'r': return '\r', nil case 't': return '\t', nil case 'v': return '\u000B', nil case 'c': return p.scanControl() default: if !p.useOptionE() && IsWordChar(ch) { return 0, p.getErr(ErrUnrecognizedEscape, string(ch)) } return ch, nil } } // Grabs and converts an ascii control character func (p *parser) scanControl() (rune, error) { if p.charsRight() <= 0 { return 0, p.getErr(ErrMissingControl) } ch := p.moveRightGetChar() // \ca interpreted as \cA if ch >= 'a' && ch <= 'z' { ch = (ch - ('a' - 'A')) } ch = (ch - '@') if ch >= 0 && ch < ' ' { return ch, nil } return 0, p.getErr(ErrUnrecognizedControl) } // Scan hex digits until we hit a closing brace. 
// Non-hex digits, hex value too large for UTF-8, or running out of chars are errors func (p *parser) scanHexUntilBrace() (rune, error) { // PCRE spec reads like unlimited hex digits are allowed, but unicode has a limit // so we can enforce that i := 0 hasContent := false for p.charsRight() > 0 { ch := p.moveRightGetChar() if ch == '}' { // hit our close brace, we're done here // prevent \x{} if !hasContent { return 0, p.getErr(ErrTooFewHex) } return rune(i), nil } hasContent = true // no brace needs to be hex digit d := hexDigit(ch) if d < 0 { return 0, p.getErr(ErrMissingBrace) } i *= 0x10 i += d if i > unicode.MaxRune { return 0, p.getErr(ErrInvalidHex) } } // we only make it here if we run out of digits without finding the brace return 0, p.getErr(ErrMissingBrace) } // Scans exactly c hex digits (c=2 for \xFF, c=4 for \uFFFF) func (p *parser) scanHex(c int) (rune, error) { i := 0 if p.charsRight() >= c { for c > 0 { d := hexDigit(p.moveRightGetChar()) if d < 0 { break } i *= 0x10 i += d c-- } } if c > 0 { return 0, p.getErr(ErrTooFewHex) } return rune(i), nil } // Returns n <= 0xF for a hex digit. func hexDigit(ch rune) int { if d := uint(ch - '0'); d <= 9 { return int(d) } if d := uint(ch - 'a'); d <= 5 { return int(d + 0xa) } if d := uint(ch - 'A'); d <= 5 { return int(d + 0xa) } return -1 } // Scans up to three octal digits (stops before exceeding 0377). func (p *parser) scanOctal() rune { // Consume octal chars only up to 3 digits and value 0377 c := 3 if c > p.charsRight() { c = p.charsRight() } //we know the first char is good because the caller had to check i := 0 d := int(p.rightChar(0) - '0') for c > 0 && d <= 7 { i *= 8 i += d if p.useOptionE() && i >= 0x20 { break } c-- p.moveRight(1) if !p.rightMost() { d = int(p.rightChar(0) - '0') } } // Octal codes only go up to 255. Any larger and the behavior that Perl follows // is simply to truncate the high bits. i &= 0xFF return rune(i) } // Returns the current parsing position. 
func (p *parser) textpos() int {
	return p.currentPos
}

// Zaps to a specific parsing position.
func (p *parser) textto(pos int) {
	p.currentPos = pos
}

// Returns the char at the right of the current parsing position and advances to the right.
func (p *parser) moveRightGetChar() rune {
	next := p.rightChar(0)
	p.moveRight(1)
	return next
}

// Moves the current position i chars to the right.
func (p *parser) moveRight(i int) {
	// default would be 1
	p.currentPos = p.currentPos + i
}

// Moves the current parsing position one to the left.
func (p *parser) moveLeft() {
	p.currentPos -= 1
}

// Returns the char at absolute index i of the pattern (independent of the
// current parsing position).
func (p *parser) charAt(i int) rune {
	return p.pattern[i]
}

// Returns the char i chars right of the current parsing position.
func (p *parser) rightChar(i int) rune {
	// default would be 0
	idx := p.currentPos + i
	return p.pattern[idx]
}

// Number of characters to the right of the current parsing position.
func (p *parser) charsRight() int {
	total := len(p.pattern)
	return total - p.currentPos
}

// True if the current parsing position is at the end of the pattern.
func (p *parser) rightMost() bool {
	return len(p.pattern) == p.currentPos
}

// Looks up the slot number for a given name
// (yields the map's zero value when the name is unknown).
func (p *parser) captureSlotFromName(capname string) int {
	slot := p.capnames[capname]
	return slot
}

// True if the capture slot was noted: when the caps map exists it decides,
// otherwise any slot in [0, capsize) counts.
func (p *parser) isCaptureSlot(i int) bool {
	if p.caps == nil {
		return 0 <= i && i < p.capsize
	}

	_, noted := p.caps[i]
	return noted
}

// True if a capture group with the given name was noted.
func (p *parser) isCaptureName(capname string) bool {
	if p.capnames != nil {
		_, known := p.capnames[capname]
		return known
	}
	return false
}

// option shortcuts

// True if N option disabling '(' autocapture is on.
func (p *parser) useOptionN() bool {
	return p.options&ExplicitCapture != 0
}

// True if I option enabling case-insensitivity is on.
func (p *parser) useOptionI() bool {
	return p.options&IgnoreCase != 0
}

// True if M option altering meaning of $ and ^ is on.
func (p *parser) useOptionM() bool { return (p.options & Multiline) != 0 } // True if S option altering meaning of . is on. func (p *parser) useOptionS() bool { return (p.options & Singleline) != 0 } // True if X option enabling whitespace/comment mode is on. func (p *parser) useOptionX() bool { return (p.options & IgnorePatternWhitespace) != 0 } // True if E option enabling ECMAScript behavior on. func (p *parser) useOptionE() bool { return (p.options & ECMAScript) != 0 } // True if options stack is empty. func (p *parser) emptyOptionsStack() bool { return len(p.optionsStack) == 0 } // Finish the current quantifiable (when a quantifier is not found or is not possible) func (p *parser) addConcatenate() { // The first (| inside a Testgroup group goes directly to the group p.concatenation.addChild(p.unit) p.unit = nil } // Finish the current quantifiable (when a quantifier is found) func (p *parser) addConcatenate3(lazy bool, min, max int) { p.concatenation.addChild(p.unit.makeQuantifier(lazy, min, max)) p.unit = nil } // Sets the current unit to a single char node func (p *parser) addUnitOne(ch rune) { if p.useOptionI() { ch = unicode.ToLower(ch) } p.unit = newRegexNodeCh(ntOne, p.options, ch) } // Sets the current unit to a single inverse-char node func (p *parser) addUnitNotone(ch rune) { if p.useOptionI() { ch = unicode.ToLower(ch) } p.unit = newRegexNodeCh(ntNotone, p.options, ch) } // Sets the current unit to a single set node func (p *parser) addUnitSet(set *CharSet) { p.unit = newRegexNodeSet(ntSet, p.options, set) } // Sets the current unit to a subtree func (p *parser) addUnitNode(node *regexNode) { p.unit = node } // Sets the current unit to an assertion of the specified type func (p *parser) addUnitType(t nodeType) { p.unit = newRegexNode(t, p.options) } // Finish the current group (in response to a ')' or end) func (p *parser) addGroup() error { if p.group.t == ntTestgroup || p.group.t == ntTestref { p.group.addChild(p.concatenation.reverseLeft()) if 
(p.group.t == ntTestref && len(p.group.children) > 2) || len(p.group.children) > 3 { return p.getErr(ErrTooManyAlternates) } } else { p.alternation.addChild(p.concatenation.reverseLeft()) p.group.addChild(p.alternation) } p.unit = p.group return nil } // Pops the option stack, but keeps the current options unchanged. func (p *parser) popKeepOptions() { lastIdx := len(p.optionsStack) - 1 p.optionsStack = p.optionsStack[:lastIdx] } // Recalls options from the stack. func (p *parser) popOptions() { lastIdx := len(p.optionsStack) - 1 // get the last item on the stack and then remove it by reslicing p.options = p.optionsStack[lastIdx] p.optionsStack = p.optionsStack[:lastIdx] } // Saves options on a stack. func (p *parser) pushOptions() { p.optionsStack = append(p.optionsStack, p.options) } // Add a string to the last concatenate. func (p *parser) addToConcatenate(pos, cch int, isReplacement bool) { var node *regexNode if cch == 0 { return } if cch > 1 { str := p.pattern[pos : pos+cch] if p.useOptionI() && !isReplacement { // We do the ToLower character by character for consistency. With surrogate chars, doing // a ToLower on the entire string could actually change the surrogate pair. This is more correct // linguistically, but since Regex doesn't support surrogates, it's more important to be // consistent. 
for i := 0; i < len(str); i++ { str[i] = unicode.ToLower(str[i]) } } node = newRegexNodeStr(ntMulti, p.options, str) } else { ch := p.charAt(pos) if p.useOptionI() && !isReplacement { ch = unicode.ToLower(ch) } node = newRegexNodeCh(ntOne, p.options, ch) } p.concatenation.addChild(node) } // Push the parser state (in response to an open paren) func (p *parser) pushGroup() { p.group.next = p.stack p.alternation.next = p.group p.concatenation.next = p.alternation p.stack = p.concatenation } // Remember the pushed state (in response to a ')') func (p *parser) popGroup() error { p.concatenation = p.stack p.alternation = p.concatenation.next p.group = p.alternation.next p.stack = p.group.next // The first () inside a Testgroup group goes directly to the group if p.group.t == ntTestgroup && len(p.group.children) == 0 { if p.unit == nil { return p.getErr(ErrConditionalExpression) } p.group.addChild(p.unit) p.unit = nil } return nil } // True if the group stack is empty. func (p *parser) emptyStack() bool { return p.stack == nil } // Start a new round for the parser state (in response to an open paren or string start) func (p *parser) startGroup(openGroup *regexNode) { p.group = openGroup p.alternation = newRegexNode(ntAlternate, p.options) p.concatenation = newRegexNode(ntConcatenate, p.options) } // Finish the current concatenation (in response to a |) func (p *parser) addAlternate() { // The | parts inside a Testgroup group go directly to the group if p.group.t == ntTestgroup || p.group.t == ntTestref { p.group.addChild(p.concatenation.reverseLeft()) } else { p.alternation.addChild(p.concatenation.reverseLeft()) } p.concatenation = newRegexNode(ntConcatenate, p.options) } // For categorizing ascii characters. 
const ( Q byte = 5 // quantifier S = 4 // ordinary stopper Z = 3 // ScanBlank stopper X = 2 // whitespace E = 1 // should be escaped ) var _category = []byte{ //01 2 3 4 5 6 7 8 9 A B C D E F 0 1 2 3 4 5 6 7 8 9 A B C D E F 0, 0, 0, 0, 0, 0, 0, 0, 0, X, X, X, X, X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 : ; < = > ? X, 0, 0, Z, S, 0, 0, 0, S, S, Q, Q, 0, 0, S, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, Q, //@A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, S, S, 0, S, 0, //'a b c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, Q, S, 0, 0, 0, } func isSpace(ch rune) bool { return (ch <= ' ' && _category[ch] == X) } // Returns true for those characters that terminate a string of ordinary chars. func isSpecial(ch rune) bool { return (ch <= '|' && _category[ch] >= S) } // Returns true for those characters that terminate a string of ordinary chars. func isStopperX(ch rune) bool { return (ch <= '|' && _category[ch] >= X) } // Returns true for those characters that begin a quantifier. func isQuantifier(ch rune) bool { return (ch <= '{' && _category[ch] >= Q) } func (p *parser) isTrueQuantifier() bool { nChars := p.charsRight() if nChars == 0 { return false } startpos := p.textpos() ch := p.charAt(startpos) if ch != '{' { return ch <= '{' && _category[ch] >= Q } //UGLY: this is ugly -- the original code was ugly too pos := startpos for { nChars-- if nChars <= 0 { break } pos++ ch = p.charAt(pos) if ch < '0' || ch > '9' { break } } if nChars == 0 || pos-startpos == 1 { return false } if ch == '}' { return true } if ch != ',' { return false } for { nChars-- if nChars <= 0 { break } pos++ ch = p.charAt(pos) if ch < '0' || ch > '9' { break } } return nChars > 0 && ch == '}' }