// The lexer package implements a lexical reader which can take in any
// io.Reader. It does not care about the meaning or logical validity of the
// tokens it parses out, it simply does its job.
package lexer
|
|
|
|
|
|
|
|
import (
|
|
|
|
"bufio"
|
|
|
|
"bytes"
|
|
|
|
"errors"
|
|
|
|
"fmt"
|
|
|
|
"io"
|
|
|
|
"unicode"
|
|
|
|
)
|
|
|
|
|
|
|
|
// TokenType identifies the kind of a lexed Token.
type TokenType int
|
|
|
|
const (
	// BareString is an unquoted run of characters.
	BareString TokenType = iota
	// QuotedString is a double-quoted string; Val includes the quotes.
	QuotedString
	// OpenParen is a literal '('.
	OpenParen
	// CloseParen is a literal ')'.
	CloseParen
	// Err marks a read or lexing failure; Val carries the error message.
	Err
	// eof is internal only: it signals Next that the stream is exhausted.
	eof
)
|
|
|
|
|
|
|
|
// invalidBareStringRunes holds the runes which may not appear inside a bare
// (unquoted) string; lexBareString stops before consuming any of them.
var invalidBareStringRunes = map[rune]bool{
	'"':  true,
	'\'': true,
	'(':  true,
	')':  true,
	'[':  true,
	']':  true,
	'{':  true,
	'}':  true,
}
|
|
|
|
|
2014-10-18 16:32:54 +00:00
|
|
|
// Token represents a single set of characters which *could* be a valid token of
// the given type
type Token struct {
	Type TokenType // kind of token that was lexed
	Val  string    // raw text of the token
}
|
|
|
|
|
|
|
|
var (
	// errInvalidUTF8 is returned by readRune when the source contains a
	// byte sequence that is not valid UTF-8.
	errInvalidUTF8 = errors.New("invalid utf8 character")
)
|
|
|
|
|
|
|
|
// Lexer reads through an io.Reader and emits Tokens from it.
type Lexer struct {
	r      *bufio.Reader // buffered source being lexed
	outbuf *bytes.Buffer // accumulates the runes of the token in progress
	ch     chan *Token   // tokens are delivered to Next over this channel
}
|
|
|
|
|
|
|
|
// NewLexer constructs a new Lexer struct and returns it. r is internally
|
2014-10-18 16:32:54 +00:00
|
|
|
// wrapped with a bufio.Reader, unless it already is one. This will spawn a
|
|
|
|
// go-routine which reads from r until it hits an error, at which point it will
|
|
|
|
// end execution.
|
2014-10-18 16:04:48 +00:00
|
|
|
func NewLexer(r io.Reader) *Lexer {
|
|
|
|
var br *bufio.Reader
|
|
|
|
var ok bool
|
|
|
|
if br, ok = r.(*bufio.Reader); !ok {
|
|
|
|
br = bufio.NewReader(r)
|
|
|
|
}
|
|
|
|
|
|
|
|
l := Lexer{
|
|
|
|
r: br,
|
2014-10-18 16:32:54 +00:00
|
|
|
ch: make(chan *Token),
|
2014-10-18 16:04:48 +00:00
|
|
|
outbuf: bytes.NewBuffer(make([]byte, 0, 1024)),
|
|
|
|
}
|
|
|
|
|
|
|
|
go l.spin()
|
|
|
|
|
|
|
|
return &l
|
|
|
|
}
|
|
|
|
|
|
|
|
func (l *Lexer) spin() {
|
|
|
|
f := lexWhitespace
|
|
|
|
for {
|
|
|
|
f = f(l)
|
|
|
|
if f == nil {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-10-18 16:32:54 +00:00
|
|
|
// Returns the next available token, or nil if EOF has been reached. If an error
|
|
|
|
// other than EOF has been reached it will be returned as the Err token type,
|
|
|
|
// and this method should not be called again after that.
|
2014-10-18 16:04:48 +00:00
|
|
|
func (l *Lexer) Next() *Token {
|
|
|
|
t := <-l.ch
|
2014-10-18 16:25:16 +00:00
|
|
|
if t.Type == eof {
|
2014-10-18 16:04:48 +00:00
|
|
|
return nil
|
|
|
|
}
|
|
|
|
return t
|
|
|
|
}
|
|
|
|
|
|
|
|
func (l *Lexer) emit(t TokenType) {
|
|
|
|
str := l.outbuf.String()
|
|
|
|
fmt.Printf("emitting %q\n", str)
|
|
|
|
l.ch <- &Token{
|
|
|
|
Type: t,
|
|
|
|
Val: l.outbuf.String(),
|
|
|
|
}
|
|
|
|
l.outbuf.Reset()
|
|
|
|
}
|
|
|
|
|
|
|
|
func (l *Lexer) peek() (rune, error) {
|
|
|
|
r, err := l.readRune()
|
|
|
|
if err != nil {
|
|
|
|
return 0, err
|
|
|
|
}
|
|
|
|
if err = l.r.UnreadRune(); err != nil {
|
|
|
|
return 0, err
|
|
|
|
}
|
|
|
|
return r, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func (l *Lexer) readRune() (rune, error) {
|
|
|
|
r, i, err := l.r.ReadRune()
|
|
|
|
if err != nil {
|
|
|
|
return 0, err
|
|
|
|
} else if r == unicode.ReplacementChar && i == 1 {
|
|
|
|
return 0, errInvalidUTF8
|
|
|
|
}
|
|
|
|
return r, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func (l *Lexer) err(err error) lexerFunc {
|
|
|
|
if err == io.EOF {
|
2014-10-18 16:25:16 +00:00
|
|
|
l.ch <- &Token{eof, ""}
|
2014-10-18 16:04:48 +00:00
|
|
|
} else {
|
|
|
|
l.ch <- &Token{Err, err.Error()}
|
|
|
|
}
|
|
|
|
close(l.ch)
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func (l *Lexer) errf(format string, args ...interface{}) lexerFunc {
|
|
|
|
s := fmt.Sprintf(format, args...)
|
|
|
|
l.ch <- &Token{Err, s}
|
|
|
|
close(l.ch)
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// lexerFunc is one state of the lexer; it returns the next state to run, or
// nil when lexing is finished.
type lexerFunc func(*Lexer) lexerFunc
|
|
|
|
|
|
|
|
func lexWhitespace(l *Lexer) lexerFunc {
|
|
|
|
r, err := l.readRune()
|
|
|
|
if err != nil {
|
|
|
|
return l.err(err)
|
|
|
|
}
|
|
|
|
|
|
|
|
if unicode.IsSpace(r) {
|
|
|
|
fmt.Printf("skipping %q because it's a space\n", r)
|
|
|
|
return lexWhitespace
|
|
|
|
}
|
|
|
|
|
|
|
|
fmt.Printf("not skipping %q\n", r)
|
|
|
|
l.outbuf.WriteRune(r)
|
|
|
|
|
|
|
|
switch r {
|
|
|
|
case '"':
|
|
|
|
return lexQuotedString
|
|
|
|
case '(':
|
|
|
|
l.emit(OpenParen)
|
|
|
|
case ')':
|
|
|
|
l.emit(CloseParen)
|
|
|
|
default:
|
|
|
|
return lexBareString
|
|
|
|
}
|
|
|
|
|
|
|
|
return lexWhitespace
|
|
|
|
}
|
|
|
|
|
|
|
|
func lexQuotedString(l *Lexer) lexerFunc {
|
|
|
|
r, err := l.readRune()
|
|
|
|
if err != nil {
|
|
|
|
l.emit(QuotedString)
|
|
|
|
return l.err(err)
|
|
|
|
}
|
|
|
|
|
|
|
|
l.outbuf.WriteRune(r)
|
|
|
|
buf := l.outbuf.Bytes()
|
|
|
|
|
|
|
|
if r == '"' && buf[len(buf) - 2] != '\\' {
|
|
|
|
l.emit(QuotedString)
|
|
|
|
fmt.Println("emitting quoted string, parsing whitespace")
|
|
|
|
return lexWhitespace
|
|
|
|
}
|
|
|
|
return lexQuotedString
|
|
|
|
}
|
|
|
|
|
|
|
|
func lexBareString(l *Lexer) lexerFunc {
|
|
|
|
r, err := l.peek()
|
|
|
|
if err != nil {
|
|
|
|
fmt.Printf("got err %s in peek\n", err)
|
|
|
|
l.emit(BareString)
|
|
|
|
return l.err(err)
|
|
|
|
}
|
|
|
|
|
|
|
|
if _, ok := invalidBareStringRunes[r]; ok || unicode.IsSpace(r) {
|
|
|
|
l.emit(BareString)
|
|
|
|
return lexWhitespace
|
|
|
|
}
|
|
|
|
|
|
|
|
if _, err = l.readRune(); err != nil {
|
|
|
|
fmt.Printf("got err %s in read\n", err)
|
|
|
|
l.emit(BareString)
|
|
|
|
return l.err(err)
|
|
|
|
}
|
|
|
|
|
|
|
|
l.outbuf.WriteRune(r)
|
|
|
|
return lexBareString
|
|
|
|
}
|