2014-10-18 21:09:06 +00:00
|
|
|
// Package lex implements a lexical reader which can take in any io.Reader.
//
// It does not care about the meaning or logical validity of the tokens it
// parses out; it simply does its job.
package lex
|
2014-10-18 16:04:48 +00:00
|
|
|
|
|
|
|
import (
|
|
|
|
"bufio"
|
|
|
|
"bytes"
|
|
|
|
"errors"
|
|
|
|
"fmt"
|
|
|
|
"io"
|
|
|
|
"unicode"
|
|
|
|
)
|
|
|
|
|
|
|
|
// TokenType identifies the lexical category of a Token.
type TokenType int
|
2014-10-21 01:58:09 +00:00
|
|
|
|
2014-10-18 16:04:48 +00:00
|
|
|
// The set of token types the Lexer emits.
const (
	// BareString is an unquoted run of characters, terminated by whitespace
	// or any rune in invalidBareStringRunes.
	BareString TokenType = iota

	// QuotedString is a double-quoted string; the quotes are included in the
	// token value.
	QuotedString

	// Open is an opening bracket: '(', '[' or '{'.
	Open

	// Close is a closing bracket: ')', ']' or '}'.
	Close

	// Err indicates a lexing error; the token value holds the error message.
	Err

	// eof marks the end of input internally; Next translates it to nil.
	eof
)
|
|
|
|
|
|
|
|
// invalidBareStringRunes holds the runes which may not appear inside a bare
// string; encountering one of these (or whitespace) terminates the token.
var invalidBareStringRunes = map[rune]bool{
	'"':  true,
	'\'': true,
	'(':  true,
	')':  true,
	'[':  true,
	']':  true,
	'{':  true,
	'}':  true,
}
|
|
|
|
|
2014-10-18 16:32:54 +00:00
|
|
|
// Token represents a single set of characters which *could* be a valid token of
// the given type
type Token struct {
	Type TokenType // lexical category of this token
	Val  string    // raw text of the token
}
|
|
|
|
|
2014-10-19 00:04:57 +00:00
|
|
|
// Returns the token's value as an error, or nil if the token is not of type
|
|
|
|
// Err. If the token is nil returns io.EOF, since that is the ostensible meaning
|
|
|
|
func (t *Token) AsError() error {
|
|
|
|
if t == nil {
|
|
|
|
return io.EOF
|
|
|
|
}
|
|
|
|
if t.Type != Err {
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
return errors.New(t.Val)
|
|
|
|
}
|
|
|
|
|
2014-10-18 16:04:48 +00:00
|
|
|
var (
	// errInvalidUTF8 is reported when the reader yields a byte sequence that
	// does not decode as valid utf8.
	errInvalidUTF8 = errors.New("invalid utf8 character")
)
|
|
|
|
|
|
|
|
// Lexer reads through an io.Reader and emits Tokens from it.
type Lexer struct {
	r      *bufio.Reader // buffered source being lexed
	outbuf *bytes.Buffer // accumulates the characters of the current token
	ch     chan *Token   // carries tokens from the lexing goroutine to Next
}
|
|
|
|
|
|
|
|
// NewLexer constructs a new Lexer struct and returns it. r is internally
|
2014-10-18 16:32:54 +00:00
|
|
|
// wrapped with a bufio.Reader, unless it already is one. This will spawn a
|
|
|
|
// go-routine which reads from r until it hits an error, at which point it will
|
|
|
|
// end execution.
|
2014-10-18 16:04:48 +00:00
|
|
|
func NewLexer(r io.Reader) *Lexer {
|
|
|
|
var br *bufio.Reader
|
|
|
|
var ok bool
|
|
|
|
if br, ok = r.(*bufio.Reader); !ok {
|
|
|
|
br = bufio.NewReader(r)
|
|
|
|
}
|
|
|
|
|
|
|
|
l := Lexer{
|
2014-10-21 01:58:09 +00:00
|
|
|
r: br,
|
|
|
|
ch: make(chan *Token),
|
2014-10-18 16:04:48 +00:00
|
|
|
outbuf: bytes.NewBuffer(make([]byte, 0, 1024)),
|
|
|
|
}
|
|
|
|
|
|
|
|
go l.spin()
|
|
|
|
|
|
|
|
return &l
|
|
|
|
}
|
|
|
|
|
|
|
|
func (l *Lexer) spin() {
|
|
|
|
f := lexWhitespace
|
|
|
|
for {
|
|
|
|
f = f(l)
|
|
|
|
if f == nil {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-10-18 16:32:54 +00:00
|
|
|
// Returns the next available token, or nil if EOF has been reached. If an error
|
|
|
|
// other than EOF has been reached it will be returned as the Err token type,
|
|
|
|
// and this method should not be called again after that.
|
2014-10-18 16:04:48 +00:00
|
|
|
func (l *Lexer) Next() *Token {
|
|
|
|
t := <-l.ch
|
2014-10-18 16:25:16 +00:00
|
|
|
if t.Type == eof {
|
2014-10-18 16:04:48 +00:00
|
|
|
return nil
|
|
|
|
}
|
|
|
|
return t
|
|
|
|
}
|
|
|
|
|
|
|
|
func (l *Lexer) emit(t TokenType) {
|
|
|
|
str := l.outbuf.String()
|
|
|
|
l.ch <- &Token{
|
|
|
|
Type: t,
|
2014-10-21 01:58:09 +00:00
|
|
|
Val: str,
|
2014-10-18 16:04:48 +00:00
|
|
|
}
|
|
|
|
l.outbuf.Reset()
|
|
|
|
}
|
|
|
|
|
|
|
|
func (l *Lexer) peek() (rune, error) {
|
|
|
|
r, err := l.readRune()
|
|
|
|
if err != nil {
|
|
|
|
return 0, err
|
|
|
|
}
|
|
|
|
if err = l.r.UnreadRune(); err != nil {
|
|
|
|
return 0, err
|
|
|
|
}
|
|
|
|
return r, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func (l *Lexer) readRune() (rune, error) {
|
|
|
|
r, i, err := l.r.ReadRune()
|
|
|
|
if err != nil {
|
|
|
|
return 0, err
|
|
|
|
} else if r == unicode.ReplacementChar && i == 1 {
|
|
|
|
return 0, errInvalidUTF8
|
|
|
|
}
|
|
|
|
return r, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func (l *Lexer) err(err error) lexerFunc {
|
|
|
|
if err == io.EOF {
|
2014-10-18 16:25:16 +00:00
|
|
|
l.ch <- &Token{eof, ""}
|
2014-10-18 16:04:48 +00:00
|
|
|
} else {
|
|
|
|
l.ch <- &Token{Err, err.Error()}
|
|
|
|
}
|
|
|
|
close(l.ch)
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func (l *Lexer) errf(format string, args ...interface{}) lexerFunc {
|
|
|
|
s := fmt.Sprintf(format, args...)
|
|
|
|
l.ch <- &Token{Err, s}
|
|
|
|
close(l.ch)
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// lexerFunc is one state of the lexer; it consumes input from l and returns
// the next state, or nil when lexing is done.
type lexerFunc func(*Lexer) lexerFunc
|
|
|
|
|
|
|
|
func lexWhitespace(l *Lexer) lexerFunc {
|
|
|
|
r, err := l.readRune()
|
|
|
|
if err != nil {
|
|
|
|
return l.err(err)
|
|
|
|
}
|
|
|
|
|
|
|
|
if unicode.IsSpace(r) {
|
|
|
|
return lexWhitespace
|
|
|
|
}
|
|
|
|
|
|
|
|
l.outbuf.WriteRune(r)
|
2014-10-21 01:58:09 +00:00
|
|
|
|
2014-10-18 16:04:48 +00:00
|
|
|
switch r {
|
|
|
|
case '"':
|
|
|
|
return lexQuotedString
|
|
|
|
case '(':
|
2014-10-18 21:24:42 +00:00
|
|
|
l.emit(Open)
|
2014-10-18 16:04:48 +00:00
|
|
|
case ')':
|
2014-10-18 21:24:42 +00:00
|
|
|
l.emit(Close)
|
|
|
|
case '[':
|
|
|
|
l.emit(Open)
|
|
|
|
case ']':
|
|
|
|
l.emit(Close)
|
|
|
|
case '{':
|
|
|
|
l.emit(Open)
|
|
|
|
case '}':
|
|
|
|
l.emit(Close)
|
2014-10-18 16:04:48 +00:00
|
|
|
default:
|
|
|
|
return lexBareString
|
|
|
|
}
|
|
|
|
|
|
|
|
return lexWhitespace
|
|
|
|
}
|
|
|
|
|
|
|
|
func lexQuotedString(l *Lexer) lexerFunc {
|
|
|
|
r, err := l.readRune()
|
|
|
|
if err != nil {
|
|
|
|
l.emit(QuotedString)
|
|
|
|
return l.err(err)
|
|
|
|
}
|
|
|
|
|
|
|
|
l.outbuf.WriteRune(r)
|
|
|
|
buf := l.outbuf.Bytes()
|
|
|
|
|
2014-10-21 01:58:09 +00:00
|
|
|
if r == '"' && buf[len(buf)-2] != '\\' {
|
2014-10-18 16:04:48 +00:00
|
|
|
l.emit(QuotedString)
|
|
|
|
return lexWhitespace
|
|
|
|
}
|
|
|
|
return lexQuotedString
|
|
|
|
}
|
|
|
|
|
|
|
|
func lexBareString(l *Lexer) lexerFunc {
|
|
|
|
r, err := l.peek()
|
|
|
|
if err != nil {
|
|
|
|
l.emit(BareString)
|
|
|
|
return l.err(err)
|
|
|
|
}
|
|
|
|
|
|
|
|
if _, ok := invalidBareStringRunes[r]; ok || unicode.IsSpace(r) {
|
|
|
|
l.emit(BareString)
|
|
|
|
return lexWhitespace
|
|
|
|
}
|
|
|
|
|
|
|
|
if _, err = l.readRune(); err != nil {
|
|
|
|
l.emit(BareString)
|
|
|
|
return l.err(err)
|
|
|
|
}
|
|
|
|
|
|
|
|
l.outbuf.WriteRune(r)
|
|
|
|
return lexBareString
|
|
|
|
}
|