// Copyright 2011 The Go Authors. All rights reserved.// Use of this source code is governed by a BSD-style// license that can be found in the LICENSE file.package syntaximport ()// An Error describes a failure to parse a regular expression// and gives the offending expression.typeErrorstruct {CodeErrorCodeExprstring}func ( *Error) () string {return"error parsing regexp: " + .Code.String() + ": `" + .Expr + "`"}// An ErrorCode describes a failure to parse a regular expression.typeErrorCodestringconst (// Unexpected errorErrInternalErrorErrorCode = "regexp/syntax: internal error"// Parse errorsErrInvalidCharClassErrorCode = "invalid character class"ErrInvalidCharRangeErrorCode = "invalid character class range"ErrInvalidEscapeErrorCode = "invalid escape sequence"ErrInvalidNamedCaptureErrorCode = "invalid named capture"ErrInvalidPerlOpErrorCode = "invalid or unsupported Perl syntax"ErrInvalidRepeatOpErrorCode = "invalid nested repetition operator"ErrInvalidRepeatSizeErrorCode = "invalid repeat count"ErrInvalidUTF8ErrorCode = "invalid UTF-8"ErrMissingBracketErrorCode = "missing closing ]"ErrMissingParenErrorCode = "missing closing )"ErrMissingRepeatArgumentErrorCode = "missing argument to repetition operator"ErrTrailingBackslashErrorCode = "trailing backslash at end of expression"ErrUnexpectedParenErrorCode = "unexpected )"ErrNestingDepthErrorCode = "expression nests too deeply"ErrLargeErrorCode = "expression too large")func ( ErrorCode) () string {returnstring()}// Flags control the behavior of the parser and record information about regexp context.typeFlagsuint16const (FoldCaseFlags = 1 << iota// case-insensitive matchLiteral// treat pattern as literal stringClassNL// allow character classes like [^a-z] and [[:space:]] to match newlineDotNL// allow . to match newlineOneLine// treat ^ and $ as only matching at beginning and end of textNonGreedy// make repetition operators default to non-greedyPerlX// allow Perl extensionsUnicodeGroups// allow \p{Han}, \P{Han} for Unicode group and negationWasDollar// regexp OpEndText was $, not \zSimple// regexp contains no counted repetitionMatchNL = ClassNL | DotNLPerl = ClassNL | OneLine | PerlX | UnicodeGroups// as close to Perl as possiblePOSIXFlags = 0// POSIX syntax)// Pseudo-ops for parsing stack.const (opLeftParen = opPseudo + iotaopVerticalBar)// maxHeight is the maximum height of a regexp parse tree.// It is somewhat arbitrarily chosen, but the idea is to be large enough// that no one will actually hit in real use but at the same time small enough// that recursion on the Regexp tree will not hit the 1GB Go stack limit.// The maximum amount of stack for a single recursive frame is probably// closer to 1kB, so this could potentially be raised, but it seems unlikely// that people have regexps nested even this deeply.// We ran a test on Google's C++ code base and turned up only// a single use case with depth > 100; it had depth 128.// Using depth 1000 should be plenty of margin.// As an optimization, we don't even bother calculating heights// until we've allocated at least maxHeight Regexp structures.constmaxHeight = 1000// maxSize is the maximum size of a compiled regexp in Insts.// It too is somewhat arbitrarily chosen, but the idea is to be large enough// to allow significant regexps while at the same time small enough that// the compiled form will not take up too much memory.// 128 MB is enough for a 3.3 million Inst structures, which roughly// corresponds to a 3.3 MB regexp.const (maxSize = 128 << 20 / instSizeinstSize = 5 * 8// byte, 2 uint32, slice is 5 64-bit words)// maxRunes is the maximum number of runes allowed in a regexp tree// counting the runes in all the nodes.// Ignoring character classes p.numRunes is always less than the length of the regexp.// Character classes can make it much larger: each \pL adds 1292 runes.// 128 MB is enough for 32M runes, which is over 26k \pL instances.// Note that repetitions do not make copies of the rune slices,// so \pL{1000} is only one rune slice, not 1000.// We could keep a cache of character classes we've seen,// so that all the \pL we see use the same rune list,// but that doesn't remove the problem entirely:// consider something like [\pL01234][\pL01235][\pL01236]...[\pL^&*()].// And because the Rune slice is exposed directly in the Regexp,// there is not an opportunity to change the representation to allow// partial sharing between different character classes.// So the limit is the best we can do.const (maxRunes = 128 << 20 / runeSizeruneSize = 4// rune is int32)typeparserstruct {flagsFlags// parse mode flagsstack []*Regexp// stack of parsed expressionsfree *RegexpnumCapint// number of capturing groups seenwholeRegexpstringtmpClass []rune// temporary char class work spacenumRegexpint// number of regexps allocatednumRunesint// number of runes in char classesrepeatsint64// product of all repetitions seenheightmap[*Regexp]int// regexp height, for height limit checksizemap[*Regexp]int64// regexp compiled size, for size limit check}func ( *parser) ( Op) *Regexp { := .freeif != nil { .free = .Sub0[0] * = Regexp{} } else { = new(Regexp) .numRegexp++ } .Op = return}func ( *parser) ( *Regexp) {if .height != nil {delete(.height, ) } .Sub0[0] = .free .free = }func ( *parser) ( *Regexp) {if .numRunes > maxRunes {panic(ErrLarge) } .checkSize() .checkHeight()}func ( *parser) ( *Regexp) {if .size == nil {// We haven't started tracking size yet. // Do a relatively cheap check to see if we need to start. // Maintain the product of all the repeats we've seen // and don't track if the total number of regexp nodes // we've seen times the repeat product is in budget.if .repeats == 0 { .repeats = 1 }if .Op == OpRepeat { := .Maxif == -1 { = .Min }if <= 0 { = 1 }ifint64() > maxSize/.repeats { .repeats = maxSize } else { .repeats *= int64() } }ifint64(.numRegexp) < maxSize/.repeats {return }// We need to start tracking size. // Make the map and belatedly populate it // with info about everything we've constructed so far. .size = make(map[*Regexp]int64)for , := range .stack { .() } }if .calcSize(, true) > maxSize {panic(ErrLarge) }}func ( *parser) ( *Regexp, bool) int64 {if ! {if , := .size[]; {return } }varint64switch .Op {caseOpLiteral: = int64(len(.Rune))caseOpCapture, OpStar:// star can be 1+ or 2+; assume 2 pessimistically = 2 + .(.Sub[0], false)caseOpPlus, OpQuest: = 1 + .(.Sub[0], false)caseOpConcat:for , := range .Sub { += .(, false) }caseOpAlternate:for , := range .Sub { += .(, false) }iflen(.Sub) > 1 { += int64(len(.Sub)) - 1 }caseOpRepeat: := .(.Sub[0], false)if .Max == -1 {if .Min == 0 { = 2 + // x* } else { = 1 + int64(.Min)* // xxx+ }break }// x{2,5} = xx(x(x(x)?)?)? = int64(.Max)* + int64(.Max-.Min) }if < 1 { = 1 } .size[] = return}func ( *parser) ( *Regexp) {if .numRegexp < maxHeight {return }if .height == nil { .height = make(map[*Regexp]int)for , := range .stack { .() } }if .calcHeight(, true) > maxHeight {panic(ErrNestingDepth) }}func ( *parser) ( *Regexp, bool) int {if ! {if , := .height[]; {return } } := 1for , := range .Sub { := .(, false)if < 1+ { = 1 + } } .height[] = return}// Parse stack manipulation.// push pushes the regexp re onto the parse stack and returns the regexp.func ( *parser) ( *Regexp) *Regexp { .numRunes += len(.Rune)if .Op == OpCharClass && len(.Rune) == 2 && .Rune[0] == .Rune[1] {// Single rune.if .maybeConcat(.Rune[0], .flags&^FoldCase) {returnnil } .Op = OpLiteral .Rune = .Rune[:1] .Flags = .flags &^ FoldCase } elseif .Op == OpCharClass && len(.Rune) == 4 && .Rune[0] == .Rune[1] && .Rune[2] == .Rune[3] &&unicode.SimpleFold(.Rune[0]) == .Rune[2] &&unicode.SimpleFold(.Rune[2]) == .Rune[0] || .Op == OpCharClass && len(.Rune) == 2 && .Rune[0]+1 == .Rune[1] &&unicode.SimpleFold(.Rune[0]) == .Rune[1] &&unicode.SimpleFold(.Rune[1]) == .Rune[0] {// Case-insensitive rune like [Aa] or [Δδ].if .maybeConcat(.Rune[0], .flags|FoldCase) {returnnil }// Rewrite as (case-insensitive) literal. .Op = OpLiteral .Rune = .Rune[:1] .Flags = .flags | FoldCase } else {// Incremental concatenation. .maybeConcat(-1, 0) } .stack = append(.stack, ) .checkLimits()return}// maybeConcat implements incremental concatenation// of literal runes into string nodes. The parser calls this// before each push, so only the top fragment of the stack// might need processing. Since this is called before a push,// the topmost literal is no longer subject to operators like *// (Otherwise ab* would turn into (ab)*.)// If r >= 0 and there's a node left over, maybeConcat uses it// to push r with the given flags.// maybeConcat reports whether r was pushed.func ( *parser) ( rune, Flags) bool { := len(.stack)if < 2 {returnfalse } := .stack[-1] := .stack[-2]if .Op != OpLiteral || .Op != OpLiteral || .Flags&FoldCase != .Flags&FoldCase {returnfalse }// Push re1 into re2. .Rune = append(.Rune, .Rune...)// Reuse re1 if possible.if >= 0 { .Rune = .Rune0[:1] .Rune[0] = .Flags = returntrue } .stack = .stack[:-1] .reuse()returnfalse// did not push r}// literal pushes a literal regexp for the rune r on the stack.func ( *parser) ( rune) { := .newRegexp(OpLiteral) .Flags = .flagsif .flags&FoldCase != 0 { = minFoldRune() } .Rune0[0] = .Rune = .Rune0[:1] .push()}// minFoldRune returns the minimum rune fold-equivalent to r.func ( rune) rune {if < minFold || > maxFold {return } := := for = unicode.SimpleFold(); != ; = unicode.SimpleFold() {if > { = } }return}// op pushes a regexp with the given op onto the stack// and returns that regexp.func ( *parser) ( Op) *Regexp { := .newRegexp() .Flags = .flagsreturn .push()}// repeat replaces the top stack element with itself repeated according to op, min, max.// before is the regexp suffix starting at the repetition operator.// after is the regexp suffix following after the repetition operator.// repeat returns an updated 'after' and an error, if any.func ( *parser) ( Op, , int, , , string) (string, error) { := .flagsif .flags&PerlX != 0 {iflen() > 0 && [0] == '?' { = [1:] ^= NonGreedy }if != "" {// In Perl it is not allowed to stack repetition operators: // a** is a syntax error, not a doubled star, and a++ means // something else entirely, which we don't support!return"", &Error{ErrInvalidRepeatOp, [:len()-len()]} } } := len(.stack)if == 0 {return"", &Error{ErrMissingRepeatArgument, [:len()-len()]} } := .stack[-1]if .Op >= opPseudo {return"", &Error{ErrMissingRepeatArgument, [:len()-len()]} } := .newRegexp() .Min = .Max = .Flags = .Sub = .Sub0[:1] .Sub[0] = .stack[-1] = .checkLimits()if == OpRepeat && ( >= 2 || >= 2) && !repeatIsValid(, 1000) {return"", &Error{ErrInvalidRepeatSize, [:len()-len()]} }return , nil}// repeatIsValid reports whether the repetition re is valid.// Valid means that the combination of the top-level repetition// and any inner repetitions does not exceed n copies of the// innermost thing.// This function rewalks the regexp tree and is called for every repetition,// so we have to worry about inducing quadratic behavior in the parser.// We avoid this by only calling repeatIsValid when min or max >= 2.// In that case the depth of any >= 2 nesting can only get to 9 without// triggering a parse error, so each subtree can only be rewalked 9 times.func ( *Regexp, int) bool {if .Op == OpRepeat { := .Maxif == 0 {returntrue }if < 0 { = .Min }if > {returnfalse }if > 0 { /= } }for , := range .Sub {if !(, ) {returnfalse } }returntrue}// concat replaces the top of the stack (above the topmost '|' or '(') with its concatenation.func ( *parser) () *Regexp { .maybeConcat(-1, 0)// Scan down to find pseudo-operator | or (. := len(.stack)for > 0 && .stack[-1].Op < opPseudo { -- } := .stack[:] .stack = .stack[:]// Empty concatenation is special case.iflen() == 0 {return .push(.newRegexp(OpEmptyMatch)) }return .push(.collapse(, OpConcat))}// alternate replaces the top of the stack (above the topmost '(') with its alternation.func ( *parser) () *Regexp {// Scan down to find pseudo-operator (. // There are no | above (. := len(.stack)for > 0 && .stack[-1].Op < opPseudo { -- } := .stack[:] .stack = .stack[:]// Make sure top class is clean. // All the others already are (see swapVerticalBar).iflen() > 0 {cleanAlt([len()-1]) }// Empty alternate is special case // (shouldn't happen but easy to handle).iflen() == 0 {return .push(.newRegexp(OpNoMatch)) }return .push(.collapse(, OpAlternate))}// cleanAlt cleans re for eventual inclusion in an alternation.func ( *Regexp) {switch .Op {caseOpCharClass: .Rune = cleanClass(&.Rune)iflen(.Rune) == 2 && .Rune[0] == 0 && .Rune[1] == unicode.MaxRune { .Rune = nil .Op = OpAnyCharreturn }iflen(.Rune) == 4 && .Rune[0] == 0 && .Rune[1] == '\n'-1 && .Rune[2] == '\n'+1 && .Rune[3] == unicode.MaxRune { .Rune = nil .Op = OpAnyCharNotNLreturn }ifcap(.Rune)-len(.Rune) > 100 {// re.Rune will not grow any more. // Make a copy or inline to reclaim storage. .Rune = append(.Rune0[:0], .Rune...) } }}// collapse returns the result of applying op to sub.// If sub contains op nodes, they all get hoisted up// so that there is never a concat of a concat or an// alternate of an alternate.func ( *parser) ( []*Regexp, Op) *Regexp {iflen() == 1 {return [0] } := .newRegexp() .Sub = .Sub0[:0]for , := range {if .Op == { .Sub = append(.Sub, .Sub...) .reuse() } else { .Sub = append(.Sub, ) } }if == OpAlternate { .Sub = .factor(.Sub)iflen(.Sub) == 1 { := = .Sub[0] .reuse() } }return}// factor factors common prefixes from the alternation list sub.// It returns a replacement list that reuses the same storage and// frees (passes to p.reuse) any removed *Regexps.//// For example,//// ABC|ABD|AEF|BCX|BCY//// simplifies by literal prefix extraction to//// A(B(C|D)|EF)|BC(X|Y)//// which simplifies by character class introduction to//// A(B[CD]|EF)|BC[XY]func ( *parser) ( []*Regexp) []*Regexp {iflen() < 2 {return }// Round 1: Factor out common literal prefixes.var []runevarFlags := 0 := [:0]for := 0; <= len(); ++ {// Invariant: the Regexps that were in sub[0:start] have been // used or marked for reuse, and the slice space has been reused // for out (len(out) <= start). // // Invariant: sub[start:i] consists of regexps that all begin // with str as modified by strflags.var []runevarFlagsif < len() { , = .leadingString([])if == { := 0for < len() && < len() && [] == [] { ++ }if > 0 {// Matches at least one rune in current range. // Keep going around. = [:]continue } } }// Found end of a run with common leading literal string: // sub[start:i] all begin with str[0:len(str)], but sub[i] // does not even begin with str[0]. // // Factor out common string and append factored expression to out.if == {// Nothing to do - run of length 0. } elseif == +1 {// Just one: don't bother factoring. = append(, []) } else {// Construct factored form: prefix(suffix1|suffix2|...) := .newRegexp(OpLiteral) .Flags = .Rune = append(.Rune[:0], ...)for := ; < ; ++ { [] = .removeLeadingString([], len()) .checkLimits([]) } := .collapse([:], OpAlternate) // recurse := .newRegexp(OpConcat) .Sub = append(.Sub[:0], , ) = append(, ) }// Prepare for next iteration. = = = } = // Round 2: Factor out common simple prefixes, // just the first piece of each concatenation. // This will be good enough a lot of the time. // // Complex subexpressions (e.g. involving quantifiers) // are not safe to factor because that collapses their // distinct paths through the automaton, which affects // correctness in some cases. = 0 = [:0]var *Regexpfor := 0; <= len(); ++ {// Invariant: the Regexps that were in sub[0:start] have been // used or marked for reuse, and the slice space has been reused // for out (len(out) <= start). // // Invariant: sub[start:i] consists of regexps that all begin with ifirst.var *Regexpif < len() { = .leadingRegexp([])if != nil && .Equal() &&// first must be a character class OR a fixed repeat of a character class. (isCharClass() || (.Op == OpRepeat && .Min == .Max && isCharClass(.Sub[0]))) {continue } }// Found end of a run with common leading regexp: // sub[start:i] all begin with first but sub[i] does not. // // Factor out common regexp and append factored expression to out.if == {// Nothing to do - run of length 0. } elseif == +1 {// Just one: don't bother factoring. = append(, []) } else {// Construct factored form: prefix(suffix1|suffix2|...) := for := ; < ; ++ { := != // prefix came from sub[start] [] = .removeLeadingRegexp([], ) .checkLimits([]) } := .collapse([:], OpAlternate) // recurse := .newRegexp(OpConcat) .Sub = append(.Sub[:0], , ) = append(, ) }// Prepare for next iteration. = = } = // Round 3: Collapse runs of single literals into character classes. = 0 = [:0]for := 0; <= len(); ++ {// Invariant: the Regexps that were in sub[0:start] have been // used or marked for reuse, and the slice space has been reused // for out (len(out) <= start). // // Invariant: sub[start:i] consists of regexps that are either // literal runes or character classes.if < len() && isCharClass([]) {continue }// sub[i] is not a char or char class; // emit char class for sub[start:i]...if == {// Nothing to do - run of length 0. } elseif == +1 { = append(, []) } else {// Make new char class. // Start with most complex regexp in sub[start]. := for := + 1; < ; ++ {if [].Op < [].Op || [].Op == [].Op && len([].Rune) < len([].Rune) { = } } [], [] = [], []for := + 1; < ; ++ {mergeCharClass([], []) .reuse([]) }cleanAlt([]) = append(, []) }// ... and then emit sub[i].if < len() { = append(, []) } = + 1 } = // Round 4: Collapse runs of empty matches into a single empty match. = 0 = [:0]for := range {if +1 < len() && [].Op == OpEmptyMatch && [+1].Op == OpEmptyMatch {continue } = append(, []) } = return}// leadingString returns the leading literal string that re begins with.// The string refers to storage in re or its children.func ( *parser) ( *Regexp) ([]rune, Flags) {if .Op == OpConcat && len(.Sub) > 0 { = .Sub[0] }if .Op != OpLiteral {returnnil, 0 }return .Rune, .Flags & FoldCase}// removeLeadingString removes the first n leading runes// from the beginning of re. It returns the replacement for re.func ( *parser) ( *Regexp, int) *Regexp {if .Op == OpConcat && len(.Sub) > 0 {// Removing a leading string in a concatenation // might simplify the concatenation. := .Sub[0] = .(, ) .Sub[0] = if .Op == OpEmptyMatch { .reuse()switchlen(.Sub) {case0, 1:// Impossible but handle. .Op = OpEmptyMatch .Sub = nilcase2: := = .Sub[1] .reuse()default:copy(.Sub, .Sub[1:]) .Sub = .Sub[:len(.Sub)-1] } }return }if .Op == OpLiteral { .Rune = .Rune[:copy(.Rune, .Rune[:])]iflen(.Rune) == 0 { .Op = OpEmptyMatch } }return}// leadingRegexp returns the leading regexp that re begins with.// The regexp refers to storage in re or its children.func ( *parser) ( *Regexp) *Regexp {if .Op == OpEmptyMatch {returnnil }if .Op == OpConcat && len(.Sub) > 0 { := .Sub[0]if .Op == OpEmptyMatch {returnnil }return }return}// removeLeadingRegexp removes the leading regexp in re.// It returns the replacement for re.// If reuse is true, it passes the removed regexp (if no longer needed) to p.reuse.func ( *parser) ( *Regexp, bool) *Regexp {if .Op == OpConcat && len(.Sub) > 0 {if { .reuse(.Sub[0]) } .Sub = .Sub[:copy(.Sub, .Sub[1:])]switchlen(.Sub) {case0: .Op = OpEmptyMatch .Sub = nilcase1: := = .Sub[0] .reuse() }return }if { .reuse() }return .newRegexp(OpEmptyMatch)}func ( string, Flags) *Regexp { := &Regexp{Op: OpLiteral} .Flags = .Rune = .Rune0[:0] // use local storage for small stringsfor , := range {iflen(.Rune) >= cap(.Rune) {// string is too long to fit in Rune0. let Go handle it .Rune = []rune()break } .Rune = append(.Rune, ) }return}// Parsing.// Parse parses a regular expression string s, controlled by the specified// Flags, and returns a regular expression parse tree. The syntax is// described in the top-level comment.func ( string, Flags) (*Regexp, error) {returnparse(, )}func ( string, Flags) ( *Regexp, error) {deferfunc() {switch := recover(); {default:panic()casenil:// okcaseErrLarge: // too big = &Error{Code: ErrLarge, Expr: }caseErrNestingDepth: = &Error{Code: ErrNestingDepth, Expr: } } }()if &Literal != 0 {// Trivial parser for literal string.if := checkUTF8(); != nil {returnnil, }returnliteralRegexp(, ), nil }// Otherwise, must do real work.var (parserruneOpstring ) .flags = .wholeRegexp = := for != "" { := "" :switch [0] {default:if , , = nextRune(); != nil {returnnil, } .literal()case'(':if .flags&PerlX != 0 && len() >= 2 && [1] == '?' {// Flag changes and non-capturing groups.if , = .parsePerlFlags(); != nil {returnnil, }break } .numCap++ .op(opLeftParen).Cap = .numCap = [1:]case'|':if = .parseVerticalBar(); != nil {returnnil, } = [1:]case')':if = .parseRightParen(); != nil {returnnil, } = [1:]case'^':if .flags&OneLine != 0 { .op(OpBeginText) } else { .op(OpBeginLine) } = [1:]case'$':if .flags&OneLine != 0 { .op(OpEndText).Flags |= WasDollar } else { .op(OpEndLine) } = [1:]case'.':if .flags&DotNL != 0 { .op(OpAnyChar) } else { .op(OpAnyCharNotNL) } = [1:]case'[':if , = .parseClass(); != nil {returnnil, }case'*', '+', '?': := switch [0] {case'*': = OpStarcase'+': = OpPluscase'?': = OpQuest } := [1:]if , = .repeat(, 0, 0, , , ); != nil {returnnil, } = = case'{': = OpRepeat := , , , := .parseRepeat()if ! {// If the repeat cannot be parsed, { is a literal. .literal('{') = [1:]break }if < 0 || > 1000 || > 1000 || >= 0 && > {// Numbers were too big, or max is present and min > max.returnnil, &Error{ErrInvalidRepeatSize, [:len()-len()]} }if , = .repeat(, , , , , ); != nil {returnnil, } = = case'\\':if .flags&PerlX != 0 && len() >= 2 {switch [1] {case'A': .op(OpBeginText) = [2:]breakcase'b': .op(OpWordBoundary) = [2:]breakcase'B': .op(OpNoWordBoundary) = [2:]breakcase'C':// any byte; not supportedreturnnil, &Error{ErrInvalidEscape, [:2]}case'Q':// \Q ... \E: the ... is always literalsvarstring , , _ = strings.Cut([2:], `\E`)for != "" { , , := nextRune()if != nil {returnnil, } .literal() = }breakcase'z': .op(OpEndText) = [2:]break } } := .newRegexp(OpCharClass) .Flags = .flags// Look for Unicode character group like \p{Han}iflen() >= 2 && ([1] == 'p' || [1] == 'P') { , , := .parseUnicodeClass(, .Rune0[:0])if != nil {returnnil, }if != nil { .Rune = = .push()break } }// Perl character class escape.if , := .parsePerlClassEscape(, .Rune0[:0]); != nil { .Rune = = .push()break } .reuse()// Ordinary single-character escape.if , , = .parseEscape(); != nil {returnnil, } .literal() } = } .concat()if .swapVerticalBar() {// pop vertical bar .stack = .stack[:len(.stack)-1] } .alternate() := len(.stack)if != 1 {returnnil, &Error{ErrMissingParen, } }return .stack[0], nil}// parseRepeat parses {min} (max=min) or {min,} (max=-1) or {min,max}.// If s is not of that form, it returns ok == false.// If s has the right form but the values are too big, it returns min == -1, ok == true.func ( *parser) ( string) (, int, string, bool) {if == "" || [0] != '{' {return } = [1:]varboolif , , = .parseInt(); ! {return }if == "" {return }if [0] != ',' { = } else { = [1:]if == "" {return }if [0] == '}' { = -1 } elseif , , = .parseInt(); ! {return } elseif < 0 {// parseInt found too big a number = -1 } }if == "" || [0] != '}' {return } = [1:] = truereturn}// parsePerlFlags parses a Perl flag setting or non-capturing group or both,// like (?i) or (?: or (?i:. It removes the prefix from s and updates the parse state.// The caller must have ensured that s begins with "(?".func ( *parser) ( string) ( string, error) { := // Check for named captures, first introduced in Python's regexp library. // As usual, there are three slightly different syntaxes: // // (?P<name>expr) the original, introduced by Python // (?<name>expr) the .NET alteration, adopted by Perl 5.10 // (?'name'expr) another .NET alteration, adopted by Perl 5.10 // // Perl 5.10 gave in and implemented the Python version too, // but they claim that the last two are the preferred forms. // PCRE and languages based on it (specifically, PHP and Ruby) // support all three as well. EcmaScript 4 uses only the Python form. // // In both the open source world (via Code Search) and the // Google source tree, (?P<expr>name) is the dominant form, // so that's the one we implement. One is enough.iflen() > 4 && [2] == 'P' && [3] == '<' {// Pull out name. := strings.IndexRune(, '>')if < 0 {if = checkUTF8(); != nil {return"", }return"", &Error{ErrInvalidNamedCapture, } } := [:+1] // "(?P<name>" := [4:] // "name"if = checkUTF8(); != nil {return"", }if !isValidCaptureName() {return"", &Error{ErrInvalidNamedCapture, } }// Like ordinary capture, but named. .numCap++ := .op(opLeftParen) .Cap = .numCap .Name = return [+1:], nil }// Non-capturing group. Might also twiddle Perl flags.varrune = [2:] // skip (? := .flags := +1 := false:for != "" {if , , = nextRune(); != nil {return"", }switch {default:break// Flags.case'i': |= FoldCase = truecase'm': &^= OneLine = truecase's': |= DotNL = truecase'U': |= NonGreedy = true// Switch to negation.case'-':if < 0 {break } = -1// Invert flags so that | above turn into &^ and vice versa. // We'll invert flags again before using it below. = ^ = false// End of flags, starting group or not.case':', ')':if < 0 {if ! {break } = ^ }if == ':' {// Open new group .op(opLeftParen) } .flags = return , nil } }return"", &Error{ErrInvalidPerlOp, [:len()-len()]}}// isValidCaptureName reports whether name// is a valid capture name: [A-Za-z0-9_]+.// PCRE limits names to 32 bytes.// Python rejects names starting with digits.// We don't enforce either of those.func ( string) bool {if == "" {returnfalse }for , := range {if != '_' && !isalnum() {returnfalse } }returntrue}// parseInt parses a decimal integer.func ( *parser) ( string) ( int, string, bool) {if == "" || [0] < '0' || '9' < [0] {return }// Disallow leading zeros.iflen() >= 2 && [0] == '0' && '0' <= [1] && [1] <= '9' {return } := for != "" && '0' <= [0] && [0] <= '9' { = [1:] } = = true// Have digits, compute value. = [:len()-len()]for := 0; < len(); ++ {// Avoid overflow.if >= 1e8 { = -1break } = *10 + int([]) - '0' }return}// can this be represented as a character class?// single-rune literal string, char class, ., and .|\n.func ( *Regexp) bool {return .Op == OpLiteral && len(.Rune) == 1 || .Op == OpCharClass || .Op == OpAnyCharNotNL || .Op == OpAnyChar}// does re match r?func ( *Regexp, rune) bool {switch .Op {caseOpLiteral:returnlen(.Rune) == 1 && .Rune[0] == caseOpCharClass:for := 0; < len(.Rune); += 2 {if .Rune[] <= && <= .Rune[+1] {returntrue } }returnfalsecaseOpAnyCharNotNL:return != '\n'caseOpAnyChar:returntrue }returnfalse}// parseVerticalBar handles a | in the input.func ( *parser) () error { .concat()// The concatenation we just parsed is on top of the stack. // If it sits above an opVerticalBar, swap it below // (things below an opVerticalBar become an alternation). // Otherwise, push a new vertical bar.if !.swapVerticalBar() { .op(opVerticalBar) }returnnil}// mergeCharClass makes dst = dst|src.// The caller must ensure that dst.Op >= src.Op,// to reduce the amount of copying.func (, *Regexp) {switch .Op {caseOpAnyChar:// src doesn't add anything.caseOpAnyCharNotNL:// src might add \nifmatchRune(, '\n') { .Op = OpAnyChar }caseOpCharClass:// src is simpler, so either literal or char classif .Op == OpLiteral { .Rune = appendLiteral(.Rune, .Rune[0], .Flags) } else { .Rune = appendClass(.Rune, .Rune) }caseOpLiteral:// both literalif .Rune[0] == .Rune[0] && .Flags == .Flags {break } .Op = OpCharClass .Rune = appendLiteral(.Rune[:0], .Rune[0], .Flags) .Rune = appendLiteral(.Rune, .Rune[0], .Flags) }}// If the top of the stack is an element followed by an opVerticalBar// swapVerticalBar swaps the two and returns true.// Otherwise it returns false.func ( *parser) () bool {// If above and below vertical bar are literal or char class, // can merge into a single char class. := len(.stack)if >= 3 && .stack[-2].Op == opVerticalBar && isCharClass(.stack[-1]) && isCharClass(.stack[-3]) { := .stack[-1] := .stack[-3]// Make re3 the more complex of the two.if .Op > .Op { , = , .stack[-3] = }mergeCharClass(, ) .reuse() .stack = .stack[:-1]returntrue }if >= 2 { := .stack[-1] := .stack[-2]if .Op == opVerticalBar {if >= 3 {// Now out of reach. // Clean opportunistically.cleanAlt(.stack[-3]) } .stack[-2] = .stack[-1] = returntrue } }returnfalse}// parseRightParen handles a ) in the input.func ( *parser) () error { .concat()if .swapVerticalBar() {// pop vertical bar .stack = .stack[:len(.stack)-1] } .alternate() := len(.stack)if < 2 {return &Error{ErrUnexpectedParen, .wholeRegexp} } := .stack[-1] := .stack[-2] .stack = .stack[:-2]if .Op != opLeftParen {return &Error{ErrUnexpectedParen, .wholeRegexp} }// Restore flags at time of paren. .flags = .Flagsif .Cap == 0 {// Just for grouping. .push() } else { .Op = OpCapture .Sub = .Sub0[:1] .Sub[0] = .push() }returnnil}// parseEscape parses an escape sequence at the beginning of s// and returns the rune.func ( *parser) ( string) ( rune, string, error) { := [1:]if == "" {return0, "", &Error{ErrTrailingBackslash, ""} } , , := nextRune()if != nil {return0, "", }:switch {default:if < utf8.RuneSelf && !isalnum() {// Escaped non-word characters are always themselves. // PCRE is not quite so rigorous: it accepts things like // \q, but we don't. We once rejected \_, but too many // programs and people insist on using it, so allow \_.return , , nil }// Octal escapes.case'1', '2', '3', '4', '5', '6', '7':// Single non-zero digit is a backreference; not supportedif == "" || [0] < '0' || [0] > '7' {break }fallthroughcase'0':// Consume up to three octal digits; already have one. = - '0'for := 1; < 3; ++ {if == "" || [0] < '0' || [0] > '7' {break } = *8 + rune([0]) - '0' = [1:] }return , , nil// Hexadecimal escapes.case'x':if == "" {break }if , , = nextRune(); != nil {return0, "", }if == '{' {// Any number of digits in braces. // Perl accepts any text at all; it ignores all text // after the first non-hex digit. We require only hex digits, // and at least one. := 0 = 0for {if == "" {break }if , , = nextRune(); != nil {return0, "", }if == '}' {break } := unhex()if < 0 {break } = *16 + if > unicode.MaxRune {break } ++ }if == 0 {break }return , , nil }// Easy case: two hex digits. := unhex()if , , = nextRune(); != nil {return0, "", } := unhex()if < 0 || < 0 {break }return *16 + , , nil// C escapes. There is no case 'b', to avoid misparsing // the Perl word-boundary \b as the C backspace \b // when in POSIX mode. In Perl, /\b/ means word-boundary // but /[\b]/ means backspace. We don't support that. // If you want a backspace, embed a literal backspace // character or use \x08.case'a':return'\a', , case'f':return'\f', , case'n':return'\n', , case'r':return'\r', , case't':return'\t', , case'v':return'\v', , }return0, "", &Error{ErrInvalidEscape, [:len()-len()]}}// parseClassChar parses a character class character at the beginning of s// and returns it.func ( *parser) (, string) ( rune, string, error) {if == "" {return0, "", &Error{Code: ErrMissingBracket, Expr: } }// Allow regular escape sequences even though // many need not be escaped in this context.if [0] == '\\' {return .parseEscape() }returnnextRune()}typecharGroupstruct {signintclass []rune}// parsePerlClassEscape parses a leading Perl character class escape like \d// from the beginning of s. If one is present, it appends the characters to r// and returns the new slice r and the remainder of the string.func ( *parser) ( string, []rune) ( []rune, string) {if .flags&PerlX == 0 || len() < 2 || [0] != '\\' {return } := perlGroup[[0:2]]if .sign == 0 {return }return .appendGroup(, ), [2:]}// parseNamedClass parses a leading POSIX named character class like [:alnum:]// from the beginning of s. If one is present, it appends the characters to r// and returns the new slice r and the remainder of the string.func ( *parser) ( string, []rune) ( []rune, string, error) {iflen() < 2 || [0] != '[' || [1] != ':' {return } := strings.Index([2:], ":]")if < 0 {return } += 2 , := [0:+2], [+2:] := posixGroup[]if .sign == 0 {returnnil, "", &Error{ErrInvalidCharRange, } }return .appendGroup(, ), , nil}func ( *parser) ( []rune, charGroup) []rune {if .flags&FoldCase == 0 {if .sign < 0 { = appendNegatedClass(, .class) } else { = appendClass(, .class) } } else { := .tmpClass[:0] = appendFoldedClass(, .class) .tmpClass = = cleanClass(&.tmpClass)if .sign < 0 { = appendNegatedClass(, ) } else { = appendClass(, ) } }return}varanyTable = &unicode.RangeTable{R16: []unicode.Range16{{Lo: 0, Hi: 1<<16 - 1, Stride: 1}},R32: []unicode.Range32{{Lo: 1 << 16, Hi: unicode.MaxRune, Stride: 1}},}// unicodeTable returns the unicode.RangeTable identified by name// and the table of additional fold-equivalent code points.func ( string) (*unicode.RangeTable, *unicode.RangeTable) {// Special case: "Any" means any.if == "Any" {returnanyTable, anyTable }if := unicode.Categories[]; != nil {return , unicode.FoldCategory[] }if := unicode.Scripts[]; != nil {return , unicode.FoldScript[] }returnnil, nil}// parseUnicodeClass parses a leading Unicode character class like \p{Han}// from the beginning of s. If one is present, it appends the characters to r// and returns the new slice r and the remainder of the string.func ( *parser) ( string, []rune) ( []rune, string, error) {if .flags&UnicodeGroups == 0 || len() < 2 || [0] != '\\' || [1] != 'p' && [1] != 'P' {return }// Committed to parse or return error. := +1if [1] == 'P' { = -1 } := [2:] , , := nextRune()if != nil {return }var , stringif != '{' {// Single-letter name. = [:len()-len()] = [2:] } else {// Name is in braces. := strings.IndexRune(, '}')if < 0 {if = checkUTF8(); != nil {return }returnnil, "", &Error{ErrInvalidCharRange, } } , = [:+1], [+1:] = [3:]if = checkUTF8(); != nil {return } }// Group can have leading negation too. \p{^Han} == \P{Han}, \P{^Han} == \p{Han}.if != "" && [0] == '^' { = - = [1:] } , := unicodeTable()if == nil {returnnil, "", &Error{ErrInvalidCharRange, } }if .flags&FoldCase == 0 || == nil {if > 0 { = appendTable(, ) } else { = appendNegatedTable(, ) } } else {// Merge and clean tab and fold in a temporary buffer. // This is necessary for the negative case and just tidy // for the positive case. := .tmpClass[:0] = appendTable(, ) = appendTable(, ) .tmpClass = = cleanClass(&.tmpClass)if > 0 { = appendClass(, ) } else { = appendNegatedClass(, ) } }return , , nil}// parseClass parses a character class at the beginning of s// and pushes it onto the parse stack.func ( *parser) ( string) ( string, error) { := [1:] // chop [ := .newRegexp(OpCharClass) .Flags = .flags .Rune = .Rune0[:0] := +1if != "" && [0] == '^' { = -1 = [1:]// If character class does not match \n, add it here, // so that negation later will do the right thing.if .flags&ClassNL == 0 { .Rune = append(.Rune, '\n', '\n') } } := .Rune := true// ] and - are okay as first char in classfor == "" || [0] != ']' || {// POSIX: - is only okay unescaped as first or last in class. // Perl: - is okay anywhere.if != "" && [0] == '-' && .flags&PerlX == 0 && ! && (len() == 1 || [1] != ']') { , := utf8.DecodeRuneInString([1:])return"", &Error{Code: ErrInvalidCharRange, Expr: [:1+]} } = false// Look for POSIX [:alnum:] etc.iflen() > 2 && [0] == '[' && [1] == ':' { , , := .parseNamedClass(, )if != nil {return"", }if != nil { , = , continue } }// Look for Unicode character group like \p{Han}. , , := .parseUnicodeClass(, )if != nil {return"", }if != nil { , = , continue }// Look for Perl character class symbols (extension).if , := .parsePerlClassEscape(, ); != nil { , = , continue }// Single character or simple range. := var , runeif , , = .parseClassChar(, ); != nil {return"", } = // [a-] means (a|-) so check for final ].iflen() >= 2 && [0] == '-' && [1] != ']' { = [1:]if , , = .parseClassChar(, ); != nil {return"", }if < { = [:len()-len()]return"", &Error{Code: ErrInvalidCharRange, Expr: } } }if .flags&FoldCase == 0 { = appendRange(, , ) } else { = appendFoldedRange(, , ) } } = [1:] // chop ]// Use &re.Rune instead of &class to avoid allocation. .Rune = = cleanClass(&.Rune)if < 0 { = negateClass() } .Rune = .push()return , nil}// cleanClass sorts the ranges (pairs of elements of r),// merges them, and eliminates duplicates.func ( *[]rune) []rune {// Sort by lo increasing, hi decreasing to break ties.sort.Sort(ranges{}) := *iflen() < 2 {return }// Merge abutting, overlapping. := 2// write indexfor := 2; < len(); += 2 { , := [], [+1]if <= [-1]+1 {// merge with previous rangeif > [-1] { [-1] = }continue }// new disjoint range [] = [+1] = += 2 }return [:]}// appendLiteral returns the result of appending the literal x to the class r.func ( []rune, rune, Flags) []rune {if &FoldCase != 0 {returnappendFoldedRange(, , ) }returnappendRange(, , )}// appendRange returns the result of appending the range lo-hi to the class r.func ( []rune, , rune) []rune {// Expand last range or next to last range if it overlaps or abuts. // Checking two ranges helps when appending case-folded // alphabets, so that one range can be expanding A-Z and the // other expanding a-z. := len()for := 2; <= 4; += 2 { // twice, using i=2, i=4if >= { , := [-], [-+1]if <= +1 && <= +1 {if < { [-] = }if > { [-+1] = }return } } }returnappend(, , )}const (// minimum and maximum runes involved in folding. // checked during test.minFold = 0x0041maxFold = 0x1e943)// appendFoldedRange returns the result of appending the range lo-hi// and its case folding-equivalent runes to the class r.func ( []rune, , rune) []rune {// Optimizations.if <= minFold && >= maxFold {// Range is full: folding can't add more.returnappendRange(, , ) }if < minFold || > maxFold {// Range is outside folding possibilities.returnappendRange(, , ) }if < minFold {// [lo, minFold-1] needs no folding. = appendRange(, , minFold-1) = minFold }if > maxFold {// [maxFold+1, hi] needs no folding. = appendRange(, maxFold+1, ) = maxFold }// Brute force. Depend on appendRange to coalesce ranges on the fly.for := ; <= ; ++ { = appendRange(, , ) := unicode.SimpleFold()for != { = appendRange(, , ) = unicode.SimpleFold() } }return}// appendClass returns the result of appending the class x to the class r.// It assume x is clean.func ( []rune, []rune) []rune {for := 0; < len(); += 2 { = appendRange(, [], [+1]) }return}// appendFoldedClass returns the result of appending the case folding of the class x to the class r.func ( []rune, []rune) []rune {for := 0; < len(); += 2 { = appendFoldedRange(, [], [+1]) }return}// appendNegatedClass returns the result of appending the negation of the class x to the class r.// It assumes x is clean.func ( []rune, []rune) []rune { := '\u0000'for := 0; < len(); += 2 { , := [], [+1]if <= -1 { = appendRange(, , -1) } = + 1 }if <= unicode.MaxRune { = appendRange(, , unicode.MaxRune) }return}// appendTable returns the result of appending x to the class r.func ( []rune, *unicode.RangeTable) []rune {for , := range .R16 { , , := rune(.Lo), rune(.Hi), rune(.Stride)if == 1 { = appendRange(, , )continue }for := ; <= ; += { = appendRange(, , ) } }for , := range .R32 { , , := rune(.Lo), rune(.Hi), rune(.Stride)if == 1 { = appendRange(, , )continue }for := ; <= ; += { = appendRange(, , ) } }return}// appendNegatedTable returns the result of appending the negation of x to the class r.func ( []rune, *unicode.RangeTable) []rune { := '\u0000'// lo end of next class to addfor , := range .R16 { , , := rune(.Lo), rune(.Hi), rune(.Stride)if == 1 {if <= -1 { = appendRange(, , -1) } = + 1continue }for := ; <= ; += {if <= -1 { = appendRange(, , -1) } = + 1 } }for , := range .R32 { , , := rune(.Lo), rune(.Hi), rune(.Stride)if == 1 {if <= -1 { = appendRange(, , -1) } = + 1continue }for := ; <= ; += {if <= -1 { = appendRange(, , -1) } = + 1 } }if <= unicode.MaxRune { = appendRange(, , unicode.MaxRune) }return}// negateClass overwrites r and returns r's negation.// It assumes the class r is already clean.func ( []rune) []rune { := '\u0000'// lo end of next class to add := 0// write indexfor := 0; < len(); += 2 { , := [], [+1]if <= -1 { [] = [+1] = - 1 += 2 } = + 1 } = [:]if <= unicode.MaxRune {// It's possible for the negation to have one more // range - this one - than the original class, so use append. = append(, , unicode.MaxRune) }return}// ranges implements sort.Interface on a []rune.// The choice of receiver type definition is strange// but avoids an allocation since we already have// a *[]rune.typerangesstruct {p *[]rune}func ( ranges) (, int) bool { := *.p *= 2 *= 2return [] < [] || [] == [] && [+1] > [+1]}func ( ranges) () int {returnlen(*.p) / 2}func ( ranges) (, int) { := *.p *= 2 *= 2 [], [+1], [], [+1] = [], [+1], [], [+1]}func ( string) error {for != "" { , := utf8.DecodeRuneInString()if == utf8.RuneError && == 1 {return &Error{Code: ErrInvalidUTF8, Expr: } } = [:] }returnnil}func ( string) ( rune, string, error) { , := utf8.DecodeRuneInString()if == utf8.RuneError && == 1 {return0, "", &Error{Code: ErrInvalidUTF8, Expr: } }return , [:], nil}func ( rune) bool {return'0' <= && <= '9' || 'A' <= && <= 'Z' || 'a' <= && <= 'z'}func ( rune) rune {if'0' <= && <= '9' {return - '0' }if'a' <= && <= 'f' {return - 'a' + 10 }if'A' <= && <= 'F' {return - 'A' + 10 }return -1}
The pages are generated with Goldsv0.6.7. (GOOS=linux GOARCH=amd64)
Golds is a Go 101 project developed by Tapir Liu.
PR and bug reports are welcome and can be submitted to the issue list.
Please follow @Go100and1 (reachable from the left QR code) to get the latest news of Golds.