package syntax

import (
	"bytes"
	"fmt"
	"strconv"
	"unicode"
	"unicode/utf8"
)

type Prefix struct {
	PrefixStr       []rune
	PrefixSet       CharSet
	CaseInsensitive bool
}

// getFirstCharsPrefix takes a RegexTree and computes the set of chars that can start it.
func getFirstCharsPrefix(tree *RegexTree) *Prefix {
	s := regexFcd{
		fcStack:  make([]regexFc, 32),
		intStack: make([]int, 32),
	}
	fc := s.regexFCFromRegexTree(tree)

	if fc == nil || fc.nullable || fc.cc.IsEmpty() {
		return nil
	}
	fcSet := fc.getFirstChars()
	return &Prefix{PrefixSet: fcSet, CaseInsensitive: fc.caseInsensitive}
}

type regexFcd struct {
	intStack        []int
	intDepth        int
	fcStack         []regexFc
	fcDepth         int
	skipAllChildren bool // don't process any more children at the current level
	skipchild       bool // don't process the current child.
	failed          bool
}

/*
 * The main FC computation. It does a short-circuited depth-first walk
 * through the tree and calls calculateFC before and after each child of
 * an interior node, and at each leaf.
 */
func (s *regexFcd) regexFCFromRegexTree(tree *RegexTree) *regexFc {
	curNode := tree.root
	curChild := 0

	for {
		if len(curNode.children) == 0 {
			// This is a leaf node
			s.calculateFC(curNode.t, curNode, 0)
		} else if curChild < len(curNode.children) && !s.skipAllChildren {
			// This is an interior node, and we have more children to analyze
			s.calculateFC(curNode.t|beforeChild, curNode, curChild)

			if !s.skipchild {
				curNode = curNode.children[curChild]
				// this stack is how we get a depth first walk of the tree.
				s.pushInt(curChild)
				curChild = 0
			} else {
				curChild++
				s.skipchild = false
			}
			continue
		}

		// This is an interior node where we've finished analyzing all the children, or
		// the end of a leaf node.
		s.skipAllChildren = false

		if s.intIsEmpty() {
			break
		}

		curChild = s.popInt()
		curNode = curNode.next

		s.calculateFC(curNode.t|afterChild, curNode, curChild)
		if s.failed {
			return nil
		}

		curChild++
	}

	if s.fcIsEmpty() {
		return nil
	}

	return s.popFC()
}

// To avoid recursion, we use a simple integer stack.
// This is the push.
func (s *regexFcd) pushInt(I int) {
	if s.intDepth >= len(s.intStack) {
		expanded := make([]int, s.intDepth*2)
		copy(expanded, s.intStack)
		s.intStack = expanded
	}

	s.intStack[s.intDepth] = I
	s.intDepth++
}

// True if the stack is empty.
func (s *regexFcd) intIsEmpty() bool {
	return s.intDepth == 0
}

// This is the pop.
func (s *regexFcd) popInt() int {
	s.intDepth--
	return s.intStack[s.intDepth]
}

// We also use a stack of regexFc objects.
// This is the push.
func (s *regexFcd) pushFC(fc regexFc) {
	if s.fcDepth >= len(s.fcStack) {
		expanded := make([]regexFc, s.fcDepth*2)
		copy(expanded, s.fcStack)
		s.fcStack = expanded
	}

	s.fcStack[s.fcDepth] = fc
	s.fcDepth++
}

// True if the stack is empty.
func (s *regexFcd) fcIsEmpty() bool {
	return s.fcDepth == 0
}

// This is the pop.
func (s *regexFcd) popFC() *regexFc {
	s.fcDepth--
	return &s.fcStack[s.fcDepth]
}

// This is the top.
func (s *regexFcd) topFC() *regexFc {
	return &s.fcStack[s.fcDepth-1]
}

// Called in beforeChild to prevent further processing of the current child
func (s *regexFcd) skipChild() {
	s.skipchild = true
}

// FC computation and shortcut cases for each node type
func (s *regexFcd) calculateFC(nt nodeType, node *regexNode, CurIndex int) {
	//fmt.Printf("NodeType: %v, CurIndex: %v, Desc: %v\n", nt, CurIndex, node.description())
	ci := false
	rtl := false

	if nt <= ntRef {
		if (node.options & IgnoreCase) != 0 {
			ci = true
		}
		if (node.options & RightToLeft) != 0 {
			rtl = true
		}
	}

	switch nt {
	case ntConcatenate | beforeChild, ntAlternate | beforeChild, ntTestref | beforeChild, ntLoop | beforeChild, ntLazyloop | beforeChild:
		break

	case ntTestgroup | beforeChild:
		if CurIndex == 0 {
			s.skipChild()
		}
		break

	case ntEmpty:
		s.pushFC(regexFc{nullable: true})
		break

	case ntConcatenate | afterChild:
		if CurIndex != 0 {
			child := s.popFC()
			cumul := s.topFC()

			s.failed = !cumul.addFC(*child, true)
		}

		fc := s.topFC()
		if !fc.nullable {
			s.skipAllChildren = true
		}
		break

	case ntTestgroup | afterChild:
		if CurIndex > 1 {
			child := s.popFC()
			cumul := s.topFC()

			s.failed = !cumul.addFC(*child, false)
		}
		break

	case ntAlternate | afterChild, ntTestref | afterChild:
		if CurIndex != 0 {
			child := s.popFC()
			cumul := s.topFC()

			s.failed = !cumul.addFC(*child, false)
		}
		break

	case ntLoop | afterChild, ntLazyloop | afterChild:
		if node.m == 0 {
			fc := s.topFC()
			fc.nullable = true
		}
		break

	case ntGroup | beforeChild, ntGroup | afterChild, ntCapture | beforeChild, ntCapture | afterChild, ntGreedy | beforeChild, ntGreedy | afterChild:
		break

	case ntRequire | beforeChild, ntPrevent | beforeChild:
		s.skipChild()
		s.pushFC(regexFc{nullable: true})
		break

	case ntRequire | afterChild, ntPrevent | afterChild:
		break

	case ntOne, ntNotone:
		s.pushFC(newRegexFc(node.ch, nt == ntNotone, false, ci))
		break

	case ntOneloop, ntOnelazy:
		s.pushFC(newRegexFc(node.ch, false, node.m == 0, ci))
		break

	case ntNotoneloop, ntNotonelazy:
		s.pushFC(newRegexFc(node.ch, true, node.m == 0, ci))
		break

	case ntMulti:
		if len(node.str) == 0 {
			s.pushFC(regexFc{nullable: true})
		} else if !rtl {
			s.pushFC(newRegexFc(node.str[0], false, false, ci))
		} else {
			s.pushFC(newRegexFc(node.str[len(node.str)-1], false, false, ci))
		}
		break

	case ntSet:
		s.pushFC(regexFc{cc: node.set.Copy(), nullable: false, caseInsensitive: ci})
		break

	case ntSetloop, ntSetlazy:
		s.pushFC(regexFc{cc: node.set.Copy(), nullable: node.m == 0, caseInsensitive: ci})
		break

	case ntRef:
		s.pushFC(regexFc{cc: *AnyClass(), nullable: true, caseInsensitive: false})
		break

	case ntNothing, ntBol, ntEol, ntBoundary, ntNonboundary, ntECMABoundary, ntNonECMABoundary, ntBeginning, ntStart, ntEndZ, ntEnd:
		s.pushFC(regexFc{nullable: true})
		break

	default:
		panic(fmt.Sprintf("unexpected op code: %v", nt))
	}
}

type regexFc struct {
	cc              CharSet
	nullable        bool
	caseInsensitive bool
}

func newRegexFc(ch rune, not, nullable, caseInsensitive bool) regexFc {
	r := regexFc{
		caseInsensitive: caseInsensitive,
		nullable:        nullable,
	}
	if not {
		if ch > 0 {
			r.cc.addRange('\x00', ch-1)
		}
		if ch < 0xFFFF {
			r.cc.addRange(ch+1, utf8.MaxRune)
		}
	} else {
		r.cc.addRange(ch, ch)
	}
	return r
}

func (r *regexFc) getFirstChars() CharSet {
	if r.caseInsensitive {
		r.cc.addLowercase()
	}

	return r.cc
}

func (r *regexFc) addFC(fc regexFc, concatenate bool) bool {
	if !r.cc.IsMergeable() || !fc.cc.IsMergeable() {
		return false
	}

	if concatenate {
		if !r.nullable {
			return true
		}

		if !fc.nullable {
			r.nullable = false
		}
	} else {
		if fc.nullable {
			r.nullable = true
		}
	}

	r.caseInsensitive = r.caseInsensitive || fc.caseInsensitive
	r.cc.addSet(fc.cc)

	return true
}

// getPrefix is a related computation: it takes a RegexTree and computes the
// leading substring if it sees one. It's quite trivial and gives up easily.
func getPrefix(tree *RegexTree) *Prefix {
	var concatNode *regexNode
	nextChild := 0

	curNode := tree.root

	for {
		switch curNode.t {
		case ntConcatenate:
			if len(curNode.children) > 0 {
				concatNode = curNode
				nextChild = 0
			}

		case ntGreedy, ntCapture:
			curNode = curNode.children[0]
			concatNode = nil
			continue

		case ntOneloop, ntOnelazy:
			if curNode.m > 0 {
				return &Prefix{
					PrefixStr:       repeat(curNode.ch, curNode.m),
					CaseInsensitive: (curNode.options & IgnoreCase) != 0,
				}
			}
			return nil

		case ntOne:
			return &Prefix{
				PrefixStr:       []rune{curNode.ch},
				CaseInsensitive: (curNode.options & IgnoreCase) != 0,
			}

		case ntMulti:
			return &Prefix{
				PrefixStr:       curNode.str,
				CaseInsensitive: (curNode.options & IgnoreCase) != 0,
			}

		case ntBol, ntEol, ntBoundary, ntECMABoundary, ntBeginning, ntStart,
			ntEndZ, ntEnd, ntEmpty, ntRequire, ntPrevent:

		default:
			return nil
		}

		if concatNode == nil || nextChild >= len(concatNode.children) {
			return nil
		}

		curNode = concatNode.children[nextChild]
		nextChild++
	}
}

// repeat the rune r, c times... up to a max of MaxPrefixSize
func repeat(r rune, c int) []rune {
	if c > MaxPrefixSize {
		c = MaxPrefixSize
	}

	ret := make([]rune, c)

	// binary growth using copy for speed
	ret[0] = r
	bp := 1
	for bp < len(ret) {
		copy(ret[bp:], ret[:bp])
		bp *= 2
	}
	return ret
}

// BmPrefix precomputes the Boyer-Moore tables for fast string scanning.
// These tables allow you to scan for the first occurrence of a string
// within a large body of text without examining every character. The
// performance of the heuristic depends on the actual string and the text
// being searched, but usually, the longer the string that is being
// searched for, the fewer characters need to be examined.
type BmPrefix struct {
	positive        []int
	negativeASCII   []int
	negativeUnicode [][]int
	pattern         []rune
	lowASCII        rune
	highASCII       rune
	rightToLeft     bool
	caseInsensitive bool
}

func newBmPrefix(pattern []rune, caseInsensitive, rightToLeft bool) *BmPrefix {
	b := &BmPrefix{
		rightToLeft:     rightToLeft,
		caseInsensitive: caseInsensitive,
		pattern:         pattern,
	}

	if caseInsensitive {
		for i := 0; i < len(b.pattern); i++ {
			// We do the ToLower character by character for consistency. With surrogate chars, doing
			// a ToLower on the entire string could actually change the surrogate pair. This is more correct
			// linguistically, but since Regex doesn't support surrogates, it's more important to be
			// consistent.
			b.pattern[i] = unicode.ToLower(b.pattern[i])
		}
	}

	var beforefirst, last, bump int
	var scan, match int

	if !rightToLeft {
		beforefirst = -1
		last = len(b.pattern) - 1
		bump = 1
	} else {
		beforefirst = len(b.pattern)
		last = 0
		bump = -1
	}

	// PART I - the good-suffix shift table
	//
	// compute the positive requirement:
	// if char "i" is the first one from the right that doesn't match,
	// then we know the matcher can advance by _positive[i].
	//
	// This algorithm is a simplified variant of the standard
	// Boyer-Moore good suffix calculation.
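	//
	// Illustrative example (not part of the original source): for the pattern
	// "abab" scanned left to right, the code below yields positive = [1 2 1 1].
	// A mismatch after the suffix "ab" has already matched (i.e. at pattern[1])
	// advances the search window by 2, re-aligning it with the earlier "ab";
	// every other mismatch advances by 1 and relies on the bad-character table
	// in PART II for any larger skip.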
	b.positive = make([]int, len(b.pattern))

	examine := last
	ch := b.pattern[examine]
	b.positive[examine] = bump
	examine -= bump

Outerloop:
	for {
		// find an internal char (examine) that matches the tail
		for {
			if examine == beforefirst {
				break Outerloop
			}
			if b.pattern[examine] == ch {
				break
			}
			examine -= bump
		}

		match = last
		scan = examine

		// find the length of the match
		for {
			if scan == beforefirst || b.pattern[match] != b.pattern[scan] {
				// at the end of the match, note the difference in _positive
				// this is not the length of the match, but the distance from the internal match
				// to the tail suffix.
				if b.positive[match] == 0 {
					b.positive[match] = match - scan
				}

				// System.Diagnostics.Debug.WriteLine("Set positive[" + match + "] to " + (match - scan));
				break
			}

			scan -= bump
			match -= bump
		}

		examine -= bump
	}

	match = last - bump

	// scan for the chars for which there are no shifts that yield a different candidate

	// The inside of the if statement used to say
	// "_positive[match] = last - beforefirst;"
	// This is slightly less aggressive in how much we skip, but at worst it
	// should mean a little more work rather than skipping a potential match.
	for match != beforefirst {
		if b.positive[match] == 0 {
			b.positive[match] = bump
		}

		match -= bump
	}

	// PART II - the bad-character shift table
	//
	// compute the negative requirement:
	// if char "ch" is the reject character when testing position "i",
	// we can slide up by _negative[ch];
	// (_negative[ch] = str.Length - 1 - str.LastIndexOf(ch))
	//
	// the lookup table is divided into ASCII and Unicode portions;
	// only those parts of the Unicode 16-bit code set that actually
	// appear in the string are in the table. (Maximum size with
	// Unicode is 65K; ASCII only case is 512 bytes.)

	b.negativeASCII = make([]int, 128)

	for i := 0; i < len(b.negativeASCII); i++ {
		b.negativeASCII[i] = last - beforefirst
	}

	b.lowASCII = 127
	b.highASCII = 0

	for examine = last; examine != beforefirst; examine -= bump {
		ch = b.pattern[examine]

		switch {
		case ch < 128:
			if b.lowASCII > ch {
				b.lowASCII = ch
			}

			if b.highASCII < ch {
				b.highASCII = ch
			}

			if b.negativeASCII[ch] == last-beforefirst {
				b.negativeASCII[ch] = last - examine
			}

		case ch <= 0xffff:
			i, j := ch>>8, ch&0xFF

			if b.negativeUnicode == nil {
				b.negativeUnicode = make([][]int, 256)
			}

			if b.negativeUnicode[i] == nil {
				newarray := make([]int, 256)
				for k := 0; k < len(newarray); k++ {
					newarray[k] = last - beforefirst
				}

				if i == 0 {
					copy(newarray, b.negativeASCII)
					//TODO: this line needed?
					b.negativeASCII = newarray
				}
				b.negativeUnicode[i] = newarray
			}

			if b.negativeUnicode[i][j] == last-beforefirst {
				b.negativeUnicode[i][j] = last - examine
			}

		default:
			// we can't do the filter because this algo doesn't support
			// unicode chars >0xffff
			return nil
		}
	}

	return b
}

func (b *BmPrefix) String() string {
	return string(b.pattern)
}

// Dump returns the contents of the filter as a human readable string
func (b *BmPrefix) Dump(indent string) string {
	buf := &bytes.Buffer{}

	fmt.Fprintf(buf, "%sBM Pattern: %s\n%sPositive: ", indent, string(b.pattern), indent)
	for i := 0; i < len(b.positive); i++ {
		buf.WriteString(strconv.Itoa(b.positive[i]))
		buf.WriteRune(' ')
	}
	buf.WriteRune('\n')

	if b.negativeASCII != nil {
		buf.WriteString(indent)
		buf.WriteString("Negative table\n")
		for i := 0; i < len(b.negativeASCII); i++ {
			if b.negativeASCII[i] != len(b.pattern) {
				fmt.Fprintf(buf, "%s %s %s\n", indent, Escape(string(rune(i))), strconv.Itoa(b.negativeASCII[i]))
			}
		}
	}

	return buf.String()
}

// Scan uses the Boyer-Moore algorithm to find the first occurrence
// of the specified string within text, beginning at index, and
// constrained within beglimit and endlimit.
//
// The direction and case-sensitivity of the match are determined
// by the arguments passed to newBmPrefix.
func (b *BmPrefix) Scan(text []rune, index, beglimit, endlimit int) int {
	var (
		defadv, test, test2         int
		match, startmatch, endmatch int
		bump, advance               int
		chTest                      rune
		unicodeLookup               []int
	)

	if !b.rightToLeft {
		defadv = len(b.pattern)
		startmatch = len(b.pattern) - 1
		endmatch = 0
		test = index + defadv - 1
		bump = 1
	} else {
		defadv = -len(b.pattern)
		startmatch = 0
		endmatch = -defadv - 1
		test = index + defadv
		bump = -1
	}

	chMatch := b.pattern[startmatch]

	for {
		if test >= endlimit || test < beglimit {
			return -1
		}

		chTest = text[test]

		if b.caseInsensitive {
			chTest = unicode.ToLower(chTest)
		}

		if chTest != chMatch {
			if chTest < 128 {
				advance = b.negativeASCII[chTest]
			} else if chTest < 0xffff && len(b.negativeUnicode) > 0 {
				unicodeLookup = b.negativeUnicode[chTest>>8]
				if len(unicodeLookup) > 0 {
					advance = unicodeLookup[chTest&0xFF]
				} else {
					advance = defadv
				}
			} else {
				advance = defadv
			}

			test += advance
		} else { // if (chTest == chMatch)
			test2 = test
			match = startmatch

			for {
				if match == endmatch {
					if b.rightToLeft {
						return test2 + 1
					} else {
						return test2
					}
				}

				match -= bump
				test2 -= bump

				chTest = text[test2]

				if b.caseInsensitive {
					chTest = unicode.ToLower(chTest)
				}

				if chTest != b.pattern[match] {
					advance = b.positive[match]
					if chTest < 128 {
						test2 = (match - startmatch) + b.negativeASCII[chTest]
					} else if chTest < 0xffff && len(b.negativeUnicode) > 0 {
						unicodeLookup = b.negativeUnicode[chTest>>8]
						if len(unicodeLookup) > 0 {
							test2 = (match - startmatch) + unicodeLookup[chTest&0xFF]
						} else {
							test += advance
							break
						}
					} else {
						test += advance
						break
					}

					if b.rightToLeft {
						if test2 < advance {
							advance = test2
						}
					} else if test2 > advance {
						advance = test2
					}

					test += advance
					break
				}
			}
		}
	}
}

// When a regex is anchored, we can do a quick IsMatch test instead of a Scan
func (b *BmPrefix) IsMatch(text []rune, index, beglimit, endlimit int) bool {
	if !b.rightToLeft {
		if index < beglimit || endlimit-index < len(b.pattern) {
			return false
		}

		return b.matchPattern(text, index)
	} else {
		if index > endlimit || index-beglimit < len(b.pattern) {
			return false
		}

		return b.matchPattern(text, index-len(b.pattern))
	}
}

func (b *BmPrefix) matchPattern(text []rune, index int) bool {
	if len(text)-index < len(b.pattern) {
		return false
	}

	if b.caseInsensitive {
		for i := 0; i < len(b.pattern); i++ {
			//Debug.Assert(textinfo.ToLower(_pattern[i]) == _pattern[i], "pattern should be converted to lower case in constructor!");
			if unicode.ToLower(text[index+i]) != b.pattern[i] {
				return false
			}
		}
		return true
	} else {
		for i := 0; i < len(b.pattern); i++ {
			if text[index+i] != b.pattern[i] {
				return false
			}
		}
		return true
	}
}

type AnchorLoc int16

// where the regex can be pegged
const (
	AnchorBeginning    AnchorLoc = 0x0001
	AnchorBol                    = 0x0002
	AnchorStart                  = 0x0004
	AnchorEol                    = 0x0008
	AnchorEndZ                   = 0x0010
	AnchorEnd                    = 0x0020
	AnchorBoundary               = 0x0040
	AnchorECMABoundary           = 0x0080
)

func getAnchors(tree *RegexTree) AnchorLoc {
	var concatNode *regexNode
	nextChild, result := 0, AnchorLoc(0)

	curNode := tree.root

	for {
		switch curNode.t {
		case ntConcatenate:
			if len(curNode.children) > 0 {
				concatNode = curNode
				nextChild = 0
			}

		case ntGreedy, ntCapture:
			curNode = curNode.children[0]
			concatNode = nil
			continue

		case ntBol, ntEol, ntBoundary, ntECMABoundary, ntBeginning,
			ntStart, ntEndZ, ntEnd:
			return result | anchorFromType(curNode.t)

		case ntEmpty, ntRequire, ntPrevent:

		default:
			return result
		}

		if concatNode == nil || nextChild >= len(concatNode.children) {
			return result
		}

		curNode = concatNode.children[nextChild]
		nextChild++
	}
}

func anchorFromType(t nodeType) AnchorLoc {
	switch t {
	case ntBol:
		return AnchorBol
	case ntEol:
		return AnchorEol
	case ntBoundary:
		return AnchorBoundary
	case ntECMABoundary:
		return AnchorECMABoundary
	case ntBeginning:
		return AnchorBeginning
	case ntStart:
		return AnchorStart
	case ntEndZ:
		return AnchorEndZ
	case ntEnd:
		return AnchorEnd
	default:
		return 0
	}
}

// String returns a human-readable description of the anchors
func (anchors AnchorLoc) String() string {
	buf := &bytes.Buffer{}

	if 0 != (anchors & AnchorBeginning) {
		buf.WriteString(", Beginning")
	}
	if 0 != (anchors & AnchorStart) {
		buf.WriteString(", Start")
	}
	if 0 != (anchors & AnchorBol) {
		buf.WriteString(", Bol")
	}
	if 0 != (anchors & AnchorBoundary) {
		buf.WriteString(", Boundary")
	}
	if 0 != (anchors & AnchorECMABoundary) {
		buf.WriteString(", ECMABoundary")
	}
	if 0 != (anchors & AnchorEol) {
		buf.WriteString(", Eol")
	}
	if 0 != (anchors & AnchorEnd) {
		buf.WriteString(", End")
	}
	if 0 != (anchors & AnchorEndZ) {
		buf.WriteString(", EndZ")
	}

	// trim off comma
	if buf.Len() >= 2 {
		return buf.String()[2:]
	}
	return "None"
}
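
// Usage sketch (illustrative only, not part of the original source), assuming the
// behavior of newBmPrefix, Scan and IsMatch as implemented above. For the pattern
// "abc" searched left to right through "xxabcxx":
//
//	bm := newBmPrefix([]rune("abc"), false, false)
//	pos := bm.Scan([]rune("xxabcxx"), 0, 0, 7)   // pos == 2, the start of "abc"
//	ok := bm.IsMatch([]rune("xxabcxx"), 2, 0, 7) // ok == true for the anchored check
//
// Here the bad-character table maps 'a'->2, 'b'->1 and 'c'->0 (the distance from the
// last occurrence of each rune to the end of the pattern), so when the 'a' at text
// index 2 is compared against the final pattern rune 'c' and rejected, Scan slides
// the window forward by two runes instead of one.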