diff options
Diffstat (limited to 'Godeps/_workspace/src/github.com/robertkrimen/otto/parser/regexp.go')
-rw-r--r-- | Godeps/_workspace/src/github.com/robertkrimen/otto/parser/regexp.go | 358 |
1 files changed, 358 insertions, 0 deletions
diff --git a/Godeps/_workspace/src/github.com/robertkrimen/otto/parser/regexp.go b/Godeps/_workspace/src/github.com/robertkrimen/otto/parser/regexp.go new file mode 100644 index 000000000..f614dae74 --- /dev/null +++ b/Godeps/_workspace/src/github.com/robertkrimen/otto/parser/regexp.go @@ -0,0 +1,358 @@ +package parser + +import ( + "bytes" + "fmt" + "strconv" +) + +type _RegExp_parser struct { + str string + length int + + chr rune // The current character + chrOffset int // The offset of current character + offset int // The offset after current character (may be greater than 1) + + errors []error + invalid bool // The input is an invalid JavaScript RegExp + + goRegexp *bytes.Buffer +} + +// TransformRegExp transforms a JavaScript pattern into a Go "regexp" pattern. +// +// re2 (Go) cannot do backtracking, so the presence of a lookahead (?=) (?!) or +// backreference (\1, \2, ...) will cause an error. +// +// re2 (Go) has a different definition for \s: [\t\n\f\r ]. +// The JavaScript definition, on the other hand, also includes \v, Unicode "Separator, Space", etc. +// +// If the pattern is invalid (not valid even in JavaScript), then this function +// returns the empty string and an error. +// +// If the pattern is valid, but incompatible (contains a lookahead or backreference), +// then this function returns the transformation (a non-empty string) AND an error. +func TransformRegExp(pattern string) (string, error) { + + if pattern == "" { + return "", nil + } + + // TODO If without \, if without (?=, (?!, then another shortcut + + parser := _RegExp_parser{ + str: pattern, + length: len(pattern), + goRegexp: bytes.NewBuffer(make([]byte, 0, 3*len(pattern)/2)), + } + parser.read() // Pull in the first character + parser.scan() + var err error + if len(parser.errors) > 0 { + err = parser.errors[0] + } + if parser.invalid { + return "", err + } + + // Might not be re2 compatible, but is still a valid JavaScript RegExp + return parser.goRegexp.String(), err +} + +func (self *_RegExp_parser) scan() { + for self.chr != -1 { + switch self.chr { + case '\\': + self.read() + self.scanEscape(false) + case '(': + self.pass() + self.scanGroup() + case '[': + self.pass() + self.scanBracket() + case ')': + self.error(-1, "Unmatched ')'") + self.invalid = true + self.pass() + default: + self.pass() + } + } +} + +// (...) +func (self *_RegExp_parser) scanGroup() { + str := self.str[self.chrOffset:] + if len(str) > 1 { // A possibility of (?= or (?! + if str[0] == '?' { + if str[1] == '=' || str[1] == '!' { + self.error(-1, "re2: Invalid (%s) <lookahead>", self.str[self.chrOffset:self.chrOffset+2]) + } + } + } + for self.chr != -1 && self.chr != ')' { + switch self.chr { + case '\\': + self.read() + self.scanEscape(false) + case '(': + self.pass() + self.scanGroup() + case '[': + self.pass() + self.scanBracket() + default: + self.pass() + continue + } + } + if self.chr != ')' { + self.error(-1, "Unterminated group") + self.invalid = true + return + } + self.pass() +} + +// [...] +func (self *_RegExp_parser) scanBracket() { + for self.chr != -1 { + if self.chr == ']' { + break + } else if self.chr == '\\' { + self.read() + self.scanEscape(true) + continue + } + self.pass() + } + if self.chr != ']' { + self.error(-1, "Unterminated character class") + self.invalid = true + return + } + self.pass() +} + +// \... +func (self *_RegExp_parser) scanEscape(inClass bool) { + offset := self.chrOffset + + var length, base uint32 + switch self.chr { + + case '0', '1', '2', '3', '4', '5', '6', '7': + var value int64 + size := 0 + for { + digit := int64(digitValue(self.chr)) + if digit >= 8 { + // Not a valid digit + break + } + value = value*8 + digit + self.read() + size += 1 + } + if size == 1 { // The number of characters read + _, err := self.goRegexp.Write([]byte{'\\', byte(value) + '0'}) + if err != nil { + self.errors = append(self.errors, err) + } + if value != 0 { + // An invalid backreference + self.error(-1, "re2: Invalid \\%d <backreference>", value) + } + return + } + tmp := []byte{'\\', 'x', '0', 0} + if value >= 16 { + tmp = tmp[0:2] + } else { + tmp = tmp[0:3] + } + tmp = strconv.AppendInt(tmp, value, 16) + _, err := self.goRegexp.Write(tmp) + if err != nil { + self.errors = append(self.errors, err) + } + return + + case '8', '9': + size := 0 + for { + digit := digitValue(self.chr) + if digit >= 10 { + // Not a valid digit + break + } + self.read() + size += 1 + } + err := self.goRegexp.WriteByte('\\') + if err != nil { + self.errors = append(self.errors, err) + } + _, err = self.goRegexp.WriteString(self.str[offset:self.chrOffset]) + if err != nil { + self.errors = append(self.errors, err) + } + self.error(-1, "re2: Invalid \\%s <backreference>", self.str[offset:self.chrOffset]) + return + + case 'x': + self.read() + length, base = 2, 16 + + case 'u': + self.read() + length, base = 4, 16 + + case 'b': + if inClass { + _, err := self.goRegexp.Write([]byte{'\\', 'x', '0', '8'}) + if err != nil { + self.errors = append(self.errors, err) + } + self.read() + return + } + fallthrough + + case 'B': + fallthrough + + case 'd', 'D', 's', 'S', 'w', 'W': + // This is slightly broken, because ECMAScript + // includes \v in \s, \S, while re2 does not + fallthrough + + case '\\': + fallthrough + + case 'f', 'n', 'r', 't', 'v': + err := self.goRegexp.WriteByte('\\') + if err != nil { + self.errors = append(self.errors, err) + } + self.pass() + return + + case 'c': + self.read() + var value int64 + if 'a' <= self.chr && self.chr <= 'z' { + value = int64(self.chr) - 'a' + 1 + } else if 'A' <= self.chr && self.chr <= 'Z' { + value = int64(self.chr) - 'A' + 1 + } else { + err := self.goRegexp.WriteByte('c') + if err != nil { + self.errors = append(self.errors, err) + } + return + } + tmp := []byte{'\\', 'x', '0', 0} + if value >= 16 { + tmp = tmp[0:2] + } else { + tmp = tmp[0:3] + } + tmp = strconv.AppendInt(tmp, value, 16) + _, err := self.goRegexp.Write(tmp) + if err != nil { + self.errors = append(self.errors, err) + } + self.read() + return + + default: + // $ is an identifier character, so we have to have + // a special case for it here + if self.chr == '$' || !isIdentifierPart(self.chr) { + // A non-identifier character needs escaping + err := self.goRegexp.WriteByte('\\') + if err != nil { + self.errors = append(self.errors, err) + } + } else { + // Unescape the character for re2 + } + self.pass() + return + } + + // Otherwise, we're a \u.... or \x... + valueOffset := self.chrOffset + + var value uint32 + { + length := length + for ; length > 0; length-- { + digit := uint32(digitValue(self.chr)) + if digit >= base { + // Not a valid digit + goto skip + } + value = value*base + digit + self.read() + } + } + + if length == 4 { + _, err := self.goRegexp.Write([]byte{ + '\\', + 'x', + '{', + self.str[valueOffset+0], + self.str[valueOffset+1], + self.str[valueOffset+2], + self.str[valueOffset+3], + '}', + }) + if err != nil { + self.errors = append(self.errors, err) + } + } else if length == 2 { + _, err := self.goRegexp.Write([]byte{ + '\\', + 'x', + self.str[valueOffset+0], + self.str[valueOffset+1], + }) + if err != nil { + self.errors = append(self.errors, err) + } + } else { + // Should never, ever get here... + self.error(-1, "re2: Illegal branch in scanEscape") + goto skip + } + + return + +skip: + _, err := self.goRegexp.WriteString(self.str[offset:self.chrOffset]) + if err != nil { + self.errors = append(self.errors, err) + } +} + +func (self *_RegExp_parser) pass() { + if self.chr != -1 { + _, err := self.goRegexp.WriteRune(self.chr) + if err != nil { + self.errors = append(self.errors, err) + } + } + self.read() +} + +// TODO Better error reporting, use the offset, etc. +func (self *_RegExp_parser) error(offset int, msg string, msgValues ...interface{}) error { + err := fmt.Errorf(msg, msgValues...) + self.errors = append(self.errors, err) + return err +} |