diff options
Diffstat (limited to 'vendor/github.com/robertkrimen/otto/parser/lexer.go')
-rw-r--r-- | vendor/github.com/robertkrimen/otto/parser/lexer.go | 866 |
1 files changed, 866 insertions, 0 deletions
diff --git a/vendor/github.com/robertkrimen/otto/parser/lexer.go b/vendor/github.com/robertkrimen/otto/parser/lexer.go new file mode 100644 index 000000000..d9d69e124 --- /dev/null +++ b/vendor/github.com/robertkrimen/otto/parser/lexer.go @@ -0,0 +1,866 @@ +package parser + +import ( + "bytes" + "errors" + "fmt" + "regexp" + "strconv" + "strings" + "unicode" + "unicode/utf8" + + "github.com/robertkrimen/otto/ast" + "github.com/robertkrimen/otto/file" + "github.com/robertkrimen/otto/token" +) + +type _chr struct { + value rune + width int +} + +var matchIdentifier = regexp.MustCompile(`^[$_\p{L}][$_\p{L}\d}]*$`) + +func isDecimalDigit(chr rune) bool { + return '0' <= chr && chr <= '9' +} + +func digitValue(chr rune) int { + switch { + case '0' <= chr && chr <= '9': + return int(chr - '0') + case 'a' <= chr && chr <= 'f': + return int(chr - 'a' + 10) + case 'A' <= chr && chr <= 'F': + return int(chr - 'A' + 10) + } + return 16 // Larger than any legal digit value +} + +func isDigit(chr rune, base int) bool { + return digitValue(chr) < base +} + +func isIdentifierStart(chr rune) bool { + return chr == '$' || chr == '_' || chr == '\\' || + 'a' <= chr && chr <= 'z' || 'A' <= chr && chr <= 'Z' || + chr >= utf8.RuneSelf && unicode.IsLetter(chr) +} + +func isIdentifierPart(chr rune) bool { + return chr == '$' || chr == '_' || chr == '\\' || + 'a' <= chr && chr <= 'z' || 'A' <= chr && chr <= 'Z' || + '0' <= chr && chr <= '9' || + chr >= utf8.RuneSelf && (unicode.IsLetter(chr) || unicode.IsDigit(chr)) +} + +func (self *_parser) scanIdentifier() (string, error) { + offset := self.chrOffset + parse := false + for isIdentifierPart(self.chr) { + if self.chr == '\\' { + distance := self.chrOffset - offset + self.read() + if self.chr != 'u' { + return "", fmt.Errorf("Invalid identifier escape character: %c (%s)", self.chr, string(self.chr)) + } + parse = true + var value rune + for j := 0; j < 4; j++ { + self.read() + decimal, ok := hex2decimal(byte(self.chr)) + if !ok { + return "", fmt.Errorf("Invalid identifier escape character: %c (%s)", self.chr, string(self.chr)) + } + value = value<<4 | decimal + } + if value == '\\' { + return "", fmt.Errorf("Invalid identifier escape value: %c (%s)", value, string(value)) + } else if distance == 0 { + if !isIdentifierStart(value) { + return "", fmt.Errorf("Invalid identifier escape value: %c (%s)", value, string(value)) + } + } else if distance > 0 { + if !isIdentifierPart(value) { + return "", fmt.Errorf("Invalid identifier escape value: %c (%s)", value, string(value)) + } + } + } + self.read() + } + literal := string(self.str[offset:self.chrOffset]) + if parse { + return parseStringLiteral(literal) + } + return literal, nil +} + +// 7.2 +func isLineWhiteSpace(chr rune) bool { + switch chr { + case '\u0009', '\u000b', '\u000c', '\u0020', '\u00a0', '\ufeff': + return true + case '\u000a', '\u000d', '\u2028', '\u2029': + return false + case '\u0085': + return false + } + return unicode.IsSpace(chr) +} + +// 7.3 +func isLineTerminator(chr rune) bool { + switch chr { + case '\u000a', '\u000d', '\u2028', '\u2029': + return true + } + return false +} + +func (self *_parser) scan() (tkn token.Token, literal string, idx file.Idx) { + + self.implicitSemicolon = false + + for { + self.skipWhiteSpace() + + idx = self.idxOf(self.chrOffset) + insertSemicolon := false + + switch chr := self.chr; { + case isIdentifierStart(chr): + var err error + literal, err = self.scanIdentifier() + if err != nil { + tkn = token.ILLEGAL + break + } + if len(literal) > 1 { + // Keywords are longer than 1 character, avoid lookup otherwise + var strict bool + tkn, strict = token.IsKeyword(literal) + + switch tkn { + + case 0: // Not a keyword + if literal == "true" || literal == "false" { + self.insertSemicolon = true + tkn = token.BOOLEAN + return + } else if literal == "null" { + self.insertSemicolon = true + tkn = token.NULL + return + } + + case token.KEYWORD: + tkn = token.KEYWORD + if strict { + // TODO If strict and in strict mode, then this is not a break + break + } + return + + case + token.THIS, + token.BREAK, + token.THROW, // A newline after a throw is not allowed, but we need to detect it + token.RETURN, + token.CONTINUE, + token.DEBUGGER: + self.insertSemicolon = true + return + + default: + return + + } + } + self.insertSemicolon = true + tkn = token.IDENTIFIER + return + case '0' <= chr && chr <= '9': + self.insertSemicolon = true + tkn, literal = self.scanNumericLiteral(false) + return + default: + self.read() + switch chr { + case -1: + if self.insertSemicolon { + self.insertSemicolon = false + self.implicitSemicolon = true + } + tkn = token.EOF + case '\r', '\n', '\u2028', '\u2029': + self.insertSemicolon = false + self.implicitSemicolon = true + self.comments.AtLineBreak() + continue + case ':': + tkn = token.COLON + case '.': + if digitValue(self.chr) < 10 { + insertSemicolon = true + tkn, literal = self.scanNumericLiteral(true) + } else { + tkn = token.PERIOD + } + case ',': + tkn = token.COMMA + case ';': + tkn = token.SEMICOLON + case '(': + tkn = token.LEFT_PARENTHESIS + case ')': + tkn = token.RIGHT_PARENTHESIS + insertSemicolon = true + case '[': + tkn = token.LEFT_BRACKET + case ']': + tkn = token.RIGHT_BRACKET + insertSemicolon = true + case '{': + tkn = token.LEFT_BRACE + case '}': + tkn = token.RIGHT_BRACE + insertSemicolon = true + case '+': + tkn = self.switch3(token.PLUS, token.ADD_ASSIGN, '+', token.INCREMENT) + if tkn == token.INCREMENT { + insertSemicolon = true + } + case '-': + tkn = self.switch3(token.MINUS, token.SUBTRACT_ASSIGN, '-', token.DECREMENT) + if tkn == token.DECREMENT { + insertSemicolon = true + } + case '*': + tkn = self.switch2(token.MULTIPLY, token.MULTIPLY_ASSIGN) + case '/': + if self.chr == '/' { + if self.mode&StoreComments != 0 { + literal := string(self.readSingleLineComment()) + self.comments.AddComment(ast.NewComment(literal, self.idx)) + continue + } + self.skipSingleLineComment() + continue + } else if self.chr == '*' { + if self.mode&StoreComments != 0 { + literal = string(self.readMultiLineComment()) + self.comments.AddComment(ast.NewComment(literal, self.idx)) + continue + } + self.skipMultiLineComment() + continue + } else { + // Could be division, could be RegExp literal + tkn = self.switch2(token.SLASH, token.QUOTIENT_ASSIGN) + insertSemicolon = true + } + case '%': + tkn = self.switch2(token.REMAINDER, token.REMAINDER_ASSIGN) + case '^': + tkn = self.switch2(token.EXCLUSIVE_OR, token.EXCLUSIVE_OR_ASSIGN) + case '<': + tkn = self.switch4(token.LESS, token.LESS_OR_EQUAL, '<', token.SHIFT_LEFT, token.SHIFT_LEFT_ASSIGN) + case '>': + tkn = self.switch6(token.GREATER, token.GREATER_OR_EQUAL, '>', token.SHIFT_RIGHT, token.SHIFT_RIGHT_ASSIGN, '>', token.UNSIGNED_SHIFT_RIGHT, token.UNSIGNED_SHIFT_RIGHT_ASSIGN) + case '=': + tkn = self.switch2(token.ASSIGN, token.EQUAL) + if tkn == token.EQUAL && self.chr == '=' { + self.read() + tkn = token.STRICT_EQUAL + } + case '!': + tkn = self.switch2(token.NOT, token.NOT_EQUAL) + if tkn == token.NOT_EQUAL && self.chr == '=' { + self.read() + tkn = token.STRICT_NOT_EQUAL + } + case '&': + if self.chr == '^' { + self.read() + tkn = self.switch2(token.AND_NOT, token.AND_NOT_ASSIGN) + } else { + tkn = self.switch3(token.AND, token.AND_ASSIGN, '&', token.LOGICAL_AND) + } + case '|': + tkn = self.switch3(token.OR, token.OR_ASSIGN, '|', token.LOGICAL_OR) + case '~': + tkn = token.BITWISE_NOT + case '?': + tkn = token.QUESTION_MARK + case '"', '\'': + insertSemicolon = true + tkn = token.STRING + var err error + literal, err = self.scanString(self.chrOffset - 1) + if err != nil { + tkn = token.ILLEGAL + } + default: + self.errorUnexpected(idx, chr) + tkn = token.ILLEGAL + } + } + self.insertSemicolon = insertSemicolon + return + } +} + +func (self *_parser) switch2(tkn0, tkn1 token.Token) token.Token { + if self.chr == '=' { + self.read() + return tkn1 + } + return tkn0 +} + +func (self *_parser) switch3(tkn0, tkn1 token.Token, chr2 rune, tkn2 token.Token) token.Token { + if self.chr == '=' { + self.read() + return tkn1 + } + if self.chr == chr2 { + self.read() + return tkn2 + } + return tkn0 +} + +func (self *_parser) switch4(tkn0, tkn1 token.Token, chr2 rune, tkn2, tkn3 token.Token) token.Token { + if self.chr == '=' { + self.read() + return tkn1 + } + if self.chr == chr2 { + self.read() + if self.chr == '=' { + self.read() + return tkn3 + } + return tkn2 + } + return tkn0 +} + +func (self *_parser) switch6(tkn0, tkn1 token.Token, chr2 rune, tkn2, tkn3 token.Token, chr3 rune, tkn4, tkn5 token.Token) token.Token { + if self.chr == '=' { + self.read() + return tkn1 + } + if self.chr == chr2 { + self.read() + if self.chr == '=' { + self.read() + return tkn3 + } + if self.chr == chr3 { + self.read() + if self.chr == '=' { + self.read() + return tkn5 + } + return tkn4 + } + return tkn2 + } + return tkn0 +} + +func (self *_parser) chrAt(index int) _chr { + value, width := utf8.DecodeRuneInString(self.str[index:]) + return _chr{ + value: value, + width: width, + } +} + +func (self *_parser) _peek() rune { + if self.offset+1 < self.length { + return rune(self.str[self.offset+1]) + } + return -1 +} + +func (self *_parser) read() { + if self.offset < self.length { + self.chrOffset = self.offset + chr, width := rune(self.str[self.offset]), 1 + if chr >= utf8.RuneSelf { // !ASCII + chr, width = utf8.DecodeRuneInString(self.str[self.offset:]) + if chr == utf8.RuneError && width == 1 { + self.error(self.chrOffset, "Invalid UTF-8 character") + } + } + self.offset += width + self.chr = chr + } else { + self.chrOffset = self.length + self.chr = -1 // EOF + } +} + +// This is here since the functions are so similar +func (self *_RegExp_parser) read() { + if self.offset < self.length { + self.chrOffset = self.offset + chr, width := rune(self.str[self.offset]), 1 + if chr >= utf8.RuneSelf { // !ASCII + chr, width = utf8.DecodeRuneInString(self.str[self.offset:]) + if chr == utf8.RuneError && width == 1 { + self.error(self.chrOffset, "Invalid UTF-8 character") + } + } + self.offset += width + self.chr = chr + } else { + self.chrOffset = self.length + self.chr = -1 // EOF + } +} + +func (self *_parser) readSingleLineComment() (result []rune) { + for self.chr != -1 { + self.read() + if isLineTerminator(self.chr) { + return + } + result = append(result, self.chr) + } + + // Get rid of the trailing -1 + result = result[:len(result)-1] + + return +} + +func (self *_parser) readMultiLineComment() (result []rune) { + self.read() + for self.chr >= 0 { + chr := self.chr + self.read() + if chr == '*' && self.chr == '/' { + self.read() + return + } + + result = append(result, chr) + } + + self.errorUnexpected(0, self.chr) + + return +} + +func (self *_parser) skipSingleLineComment() { + for self.chr != -1 { + self.read() + if isLineTerminator(self.chr) { + return + } + } +} + +func (self *_parser) skipMultiLineComment() { + self.read() + for self.chr >= 0 { + chr := self.chr + self.read() + if chr == '*' && self.chr == '/' { + self.read() + return + } + } + + self.errorUnexpected(0, self.chr) +} + +func (self *_parser) skipWhiteSpace() { + for { + switch self.chr { + case ' ', '\t', '\f', '\v', '\u00a0', '\ufeff': + self.read() + continue + case '\r': + if self._peek() == '\n' { + self.comments.AtLineBreak() + self.read() + } + fallthrough + case '\u2028', '\u2029', '\n': + if self.insertSemicolon { + return + } + self.comments.AtLineBreak() + self.read() + continue + } + if self.chr >= utf8.RuneSelf { + if unicode.IsSpace(self.chr) { + self.read() + continue + } + } + break + } +} + +func (self *_parser) skipLineWhiteSpace() { + for isLineWhiteSpace(self.chr) { + self.read() + } +} + +func (self *_parser) scanMantissa(base int) { + for digitValue(self.chr) < base { + self.read() + } +} + +func (self *_parser) scanEscape(quote rune) { + + var length, base uint32 + switch self.chr { + //case '0', '1', '2', '3', '4', '5', '6', '7': + // Octal: + // length, base, limit = 3, 8, 255 + case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', '"', '\'', '0': + self.read() + return + case '\r', '\n', '\u2028', '\u2029': + self.scanNewline() + return + case 'x': + self.read() + length, base = 2, 16 + case 'u': + self.read() + length, base = 4, 16 + default: + self.read() // Always make progress + return + } + + var value uint32 + for ; length > 0 && self.chr != quote && self.chr >= 0; length-- { + digit := uint32(digitValue(self.chr)) + if digit >= base { + break + } + value = value*base + digit + self.read() + } +} + +func (self *_parser) scanString(offset int) (string, error) { + // " ' / + quote := rune(self.str[offset]) + + for self.chr != quote { + chr := self.chr + if chr == '\n' || chr == '\r' || chr == '\u2028' || chr == '\u2029' || chr < 0 { + goto newline + } + self.read() + if chr == '\\' { + if quote == '/' { + if self.chr == '\n' || self.chr == '\r' || self.chr == '\u2028' || self.chr == '\u2029' || self.chr < 0 { + goto newline + } + self.read() + } else { + self.scanEscape(quote) + } + } else if chr == '[' && quote == '/' { + // Allow a slash (/) in a bracket character class ([...]) + // TODO Fix this, this is hacky... + quote = -1 + } else if chr == ']' && quote == -1 { + quote = '/' + } + } + + // " ' / + self.read() + + return string(self.str[offset:self.chrOffset]), nil + +newline: + self.scanNewline() + err := "String not terminated" + if quote == '/' { + err = "Invalid regular expression: missing /" + self.error(self.idxOf(offset), err) + } + return "", errors.New(err) +} + +func (self *_parser) scanNewline() { + if self.chr == '\r' { + self.read() + if self.chr != '\n' { + return + } + } + self.read() +} + +func hex2decimal(chr byte) (value rune, ok bool) { + { + chr := rune(chr) + switch { + case '0' <= chr && chr <= '9': + return chr - '0', true + case 'a' <= chr && chr <= 'f': + return chr - 'a' + 10, true + case 'A' <= chr && chr <= 'F': + return chr - 'A' + 10, true + } + return + } +} + +func parseNumberLiteral(literal string) (value interface{}, err error) { + // TODO Is Uint okay? What about -MAX_UINT + value, err = strconv.ParseInt(literal, 0, 64) + if err == nil { + return + } + + parseIntErr := err // Save this first error, just in case + + value, err = strconv.ParseFloat(literal, 64) + if err == nil { + return + } else if err.(*strconv.NumError).Err == strconv.ErrRange { + // Infinity, etc. + return value, nil + } + + err = parseIntErr + + if err.(*strconv.NumError).Err == strconv.ErrRange { + if len(literal) > 2 && literal[0] == '0' && (literal[1] == 'X' || literal[1] == 'x') { + // Could just be a very large number (e.g. 0x8000000000000000) + var value float64 + literal = literal[2:] + for _, chr := range literal { + digit := digitValue(chr) + if digit >= 16 { + goto error + } + value = value*16 + float64(digit) + } + return value, nil + } + } + +error: + return nil, errors.New("Illegal numeric literal") +} + +func parseStringLiteral(literal string) (string, error) { + // Best case scenario... + if literal == "" { + return "", nil + } + + // Slightly less-best case scenario... + if !strings.ContainsRune(literal, '\\') { + return literal, nil + } + + str := literal + buffer := bytes.NewBuffer(make([]byte, 0, 3*len(literal)/2)) + + for len(str) > 0 { + switch chr := str[0]; { + // We do not explicitly handle the case of the quote + // value, which can be: " ' / + // This assumes we're already passed a partially well-formed literal + case chr >= utf8.RuneSelf: + chr, size := utf8.DecodeRuneInString(str) + buffer.WriteRune(chr) + str = str[size:] + continue + case chr != '\\': + buffer.WriteByte(chr) + str = str[1:] + continue + } + + if len(str) <= 1 { + panic("len(str) <= 1") + } + chr := str[1] + var value rune + if chr >= utf8.RuneSelf { + str = str[1:] + var size int + value, size = utf8.DecodeRuneInString(str) + str = str[size:] // \ + <character> + } else { + str = str[2:] // \<character> + switch chr { + case 'b': + value = '\b' + case 'f': + value = '\f' + case 'n': + value = '\n' + case 'r': + value = '\r' + case 't': + value = '\t' + case 'v': + value = '\v' + case 'x', 'u': + size := 0 + switch chr { + case 'x': + size = 2 + case 'u': + size = 4 + } + if len(str) < size { + return "", fmt.Errorf("invalid escape: \\%s: len(%q) != %d", string(chr), str, size) + } + for j := 0; j < size; j++ { + decimal, ok := hex2decimal(str[j]) + if !ok { + return "", fmt.Errorf("invalid escape: \\%s: %q", string(chr), str[:size]) + } + value = value<<4 | decimal + } + str = str[size:] + if chr == 'x' { + break + } + if value > utf8.MaxRune { + panic("value > utf8.MaxRune") + } + case '0': + if len(str) == 0 || '0' > str[0] || str[0] > '7' { + value = 0 + break + } + fallthrough + case '1', '2', '3', '4', '5', '6', '7': + // TODO strict + value = rune(chr) - '0' + j := 0 + for ; j < 2; j++ { + if len(str) < j+1 { + break + } + chr := str[j] + if '0' > chr || chr > '7' { + break + } + decimal := rune(str[j]) - '0' + value = (value << 3) | decimal + } + str = str[j:] + case '\\': + value = '\\' + case '\'', '"': + value = rune(chr) + case '\r': + if len(str) > 0 { + if str[0] == '\n' { + str = str[1:] + } + } + fallthrough + case '\n': + continue + default: + value = rune(chr) + } + } + buffer.WriteRune(value) + } + + return buffer.String(), nil +} + +func (self *_parser) scanNumericLiteral(decimalPoint bool) (token.Token, string) { + + offset := self.chrOffset + tkn := token.NUMBER + + if decimalPoint { + offset-- + self.scanMantissa(10) + goto exponent + } + + if self.chr == '0' { + offset := self.chrOffset + self.read() + if self.chr == 'x' || self.chr == 'X' { + // Hexadecimal + self.read() + if isDigit(self.chr, 16) { + self.read() + } else { + return token.ILLEGAL, self.str[offset:self.chrOffset] + } + self.scanMantissa(16) + + if self.chrOffset-offset <= 2 { + // Only "0x" or "0X" + self.error(0, "Illegal hexadecimal number") + } + + goto hexadecimal + } else if self.chr == '.' { + // Float + goto float + } else { + // Octal, Float + if self.chr == 'e' || self.chr == 'E' { + goto exponent + } + self.scanMantissa(8) + if self.chr == '8' || self.chr == '9' { + return token.ILLEGAL, self.str[offset:self.chrOffset] + } + goto octal + } + } + + self.scanMantissa(10) + +float: + if self.chr == '.' { + self.read() + self.scanMantissa(10) + } + +exponent: + if self.chr == 'e' || self.chr == 'E' { + self.read() + if self.chr == '-' || self.chr == '+' { + self.read() + } + if isDecimalDigit(self.chr) { + self.read() + self.scanMantissa(10) + } else { + return token.ILLEGAL, self.str[offset:self.chrOffset] + } + } + +hexadecimal: +octal: + if isIdentifierStart(self.chr) || isDecimalDigit(self.chr) { + return token.ILLEGAL, self.str[offset:self.chrOffset] + } + + return tkn, self.str[offset:self.chrOffset] +} |