Bail on the parse code; watched Rob Pike's talk and wrote a lexer based on that.

This commit is contained in:
Brian Picciano 2014-10-18 12:04:48 -04:00
parent c8fdb2e242
commit 6fa4b3b11d
4 changed files with 261 additions and 143 deletions

205
parse/lex/lex.go Normal file
View File

@ -0,0 +1,205 @@
package lexer
import (
"bufio"
"bytes"
"errors"
"fmt"
"io"
"unicode"
)
// TokenType identifies which kind of lexeme a Token holds.
type TokenType int

// The kinds of Token a Lexer can emit.
const (
	// BareString is an unquoted run of characters.
	BareString TokenType = iota
	// QuotedString is a double-quoted string; the Token's Val keeps the
	// surrounding quotes.
	QuotedString
	// OpenParen is a "(".
	OpenParen
	// CloseParen is a ")".
	CloseParen
	// Err reports a lexing failure; the Token's Val holds the error message.
	Err
	// EOF marks the end of the input.
	EOF
)
// invalidBareStringRunes is the set of runes that end a bare string:
// lexBareString stops (without consuming the rune) as soon as it peeks one of
// these or a whitespace rune.
var invalidBareStringRunes = map[rune]bool{
	'"':  true,
	'\'': true,
	'(':  true,
	')':  true,
	'[':  true,
	']':  true,
	'{':  true,
	'}':  true,
}
// Token represents a single set of characters that are a "thing" in the syntax
type Token struct {
	// Type says what kind of token this is.
	Type TokenType
	// Val is the text of the token. For Err tokens it is the error message,
	// and for EOF tokens it is empty.
	Val string
}
var (
	// errInvalidUTF8 is returned by readRune when the underlying reader
	// yields the replacement character with a size of one byte.
	errInvalidUTF8 = errors.New("invalid utf8 character")
)
// Lexer reads through an io.Reader and emits Tokens from it.
type Lexer struct {
	// r is the buffered input being lexed.
	r *bufio.Reader
	// outbuf accumulates the text of the token currently being built; emit
	// turns its contents into a Token and resets it.
	outbuf *bytes.Buffer
	// ch carries Tokens from the lexing goroutine to Next.
	ch chan *Token
}
// NewLexer builds a Lexer that reads from r and immediately starts lexing in
// a background goroutine; tokens are retrieved with Next. If r is already a
// *bufio.Reader it is used directly, otherwise it is wrapped in one.
func NewLexer(r io.Reader) *Lexer {
	buffered, isBuffered := r.(*bufio.Reader)
	if !isBuffered {
		buffered = bufio.NewReader(r)
	}

	lex := &Lexer{
		r:      buffered,
		outbuf: bytes.NewBuffer(make([]byte, 0, 1024)),
		ch:     make(chan *Token, 1),
	}
	go lex.spin()
	return lex
}
// spin drives the lexer's state machine: starting from lexWhitespace, it keeps
// invoking the current state function and switching to whatever state that
// function returns, stopping once a state returns nil.
func (l *Lexer) spin() {
	for state := lexerFunc(lexWhitespace); state != nil; {
		state = state(l)
	}
}
// Next blocks until the lexer has produced another Token and returns it. It
// returns nil once the end of the input has been reached.
//
// The lexing goroutine closes the token channel after sending its final EOF
// or Err token, so a receive can yield a nil *Token. Next reports that as nil
// too, making it safe to keep calling Next after the stream has ended.
func (l *Lexer) Next() *Token {
	t, ok := <-l.ch
	if !ok || t.Type == EOF {
		return nil
	}
	return t
}
// emit sends the text accumulated in outbuf as a Token of type t, then resets
// outbuf so the next token starts empty. The send blocks until the channel
// has room for the token.
func (l *Lexer) emit(t TokenType) {
	l.ch <- &Token{
		Type: t,
		Val:  l.outbuf.String(),
	}
	l.outbuf.Reset()
}
// peek returns the next rune of the input without consuming it: it reads one
// rune via readRune and then pushes it back onto the reader. Any error from
// either step is returned with a zero rune.
func (l *Lexer) peek() (rune, error) {
	next, err := l.readRune()
	if err == nil {
		err = l.r.UnreadRune()
	}
	if err != nil {
		return 0, err
	}
	return next, nil
}
// readRune consumes and returns the next rune of the input. Errors from the
// underlying reader are passed through; a one-byte replacement character is
// reported as errInvalidUTF8.
func (l *Lexer) readRune() (rune, error) {
	r, size, err := l.r.ReadRune()
	switch {
	case err != nil:
		return 0, err
	case size == 1 && r == unicode.ReplacementChar:
		return 0, errInvalidUTF8
	}
	return r, nil
}
// err ends the token stream because of err. io.EOF is reported as an EOF
// token; any other error becomes an Err token carrying the error's message.
// The channel is then closed, and the nil return value stops the state
// machine.
func (l *Lexer) err(err error) lexerFunc {
	final := &Token{Type: EOF}
	if err != io.EOF {
		final = &Token{Type: Err, Val: err.Error()}
	}
	l.ch <- final
	close(l.ch)
	return nil
}
// errf ends the token stream with an Err token whose value is the formatted
// message, closes the channel, and returns nil to stop the state machine.
func (l *Lexer) errf(format string, args ...interface{}) lexerFunc {
	l.ch <- &Token{Type: Err, Val: fmt.Sprintf(format, args...)}
	close(l.ch)
	return nil
}
// lexerFunc is one state of the lexer's state machine. It does some work on
// the Lexer and returns the state to run next, or nil to stop lexing.
type lexerFunc func(*Lexer) lexerFunc
// lexWhitespace is the lexer's initial, between-tokens state. It reads one
// rune: whitespace is discarded, anything else is written to outbuf as the
// first rune of a new token and decides the next state.
//
//   - '"' starts a quoted string (lexQuotedString),
//   - '(' and ')' are emitted immediately as OpenParen / CloseParen,
//   - any other rune starts a bare string (lexBareString).
//
// A read error ends the stream via l.err.
func lexWhitespace(l *Lexer) lexerFunc {
	r, err := l.readRune()
	if err != nil {
		return l.err(err)
	}

	if unicode.IsSpace(r) {
		return lexWhitespace
	}

	l.outbuf.WriteRune(r)

	switch r {
	case '"':
		return lexQuotedString
	case '(':
		l.emit(OpenParen)
		return lexWhitespace
	case ')':
		l.emit(CloseParen)
		return lexWhitespace
	default:
		return lexBareString
	}
}
// lexQuotedString is the state used inside a double-quoted string; the opening
// quote is already in outbuf. It consumes one rune per call and appends it to
// outbuf. When it reads an unescaped '"' it emits the QuotedString (quotes
// included) and returns to lexWhitespace.
//
// A quote is escaped only when preceded by an odd number of consecutive
// backslashes, so `"a\\"` ends at its final quote while `"a\""` does not.
//
// If reading fails (including EOF) before the closing quote, whatever has been
// collected so far is still emitted as a QuotedString, followed by the
// terminating token produced by l.err.
func lexQuotedString(l *Lexer) lexerFunc {
	r, err := l.readRune()
	if err != nil {
		l.emit(QuotedString)
		return l.err(err)
	}

	closing := false
	if r == '"' {
		// Count the run of backslashes directly before this quote; an even
		// count means they all pair up and the quote itself is unescaped.
		buf := l.outbuf.Bytes()
		backslashes := 0
		for i := len(buf) - 1; i >= 0 && buf[i] == '\\'; i-- {
			backslashes++
		}
		closing = backslashes%2 == 0
	}

	l.outbuf.WriteRune(r)
	if closing {
		l.emit(QuotedString)
		return lexWhitespace
	}
	return lexQuotedString
}
// lexBareString is the state used inside an unquoted string; its first rune is
// already in outbuf. Each call peeks at the next rune:
//
//   - if it is whitespace or one of invalidBareStringRunes, the bare string is
//     complete: it is emitted and the rune is left unread for lexWhitespace;
//   - otherwise the rune is consumed and appended to outbuf.
//
// If peeking or reading fails (including EOF), the text collected so far is
// emitted as a BareString and the stream is ended via l.err.
func lexBareString(l *Lexer) lexerFunc {
	r, err := l.peek()
	if err != nil {
		l.emit(BareString)
		return l.err(err)
	}

	if _, invalid := invalidBareStringRunes[r]; invalid || unicode.IsSpace(r) {
		l.emit(BareString)
		return lexWhitespace
	}

	// Consume the rune that was just peeked.
	if _, err = l.readRune(); err != nil {
		l.emit(BareString)
		return l.err(err)
	}

	l.outbuf.WriteRune(r)
	return lexBareString
}

56
parse/lex/lex_test.go Normal file
View File

@ -0,0 +1,56 @@
package lexer
import (
"bytes"
. "testing"
)
// TestLexer feeds a set of inputs through the Lexer and checks that the tokens
// coming out of Next match the expected sequence. Next reports the end of
// input as nil, so a nil token is accepted wherever an EOF token is expected.
func TestLexer(t *T) {
	cases := map[string][]Token{
		"":     {{EOF, ""}},
		" \t": {{EOF, ""}},
		"a b c": {
			{BareString, "a"},
			{BareString, "b"},
			{BareString, "c"},
			{EOF, ""},
		},
		"\"foo\" bar": {
			{QuotedString, "\"foo\""},
			{BareString, "bar"},
			{EOF, ""},
		},
		"\"foo\nbar\" baz": {
			{QuotedString, "\"foo\nbar\""},
			{BareString, "baz"},
			{EOF, ""},
		},
		"( foo bar ) baz": {
			{OpenParen, "("},
			{BareString, "foo"},
			{BareString, "bar"},
			{CloseParen, ")"},
			{BareString, "baz"},
			{EOF, ""},
		},
		"((foo-bar))": {
			{OpenParen, "("},
			{OpenParen, "("},
			{BareString, "foo-bar"},
			{CloseParen, ")"},
			{CloseParen, ")"},
			{EOF, ""},
		},
		"(\"foo\nbar\")": {
			{OpenParen, "("},
			{QuotedString, "\"foo\nbar\""},
			{CloseParen, ")"},
			{EOF, ""},
		},
	}

	for input, want := range cases {
		l := NewLexer(bytes.NewBufferString(input))
		for i, exp := range want {
			got := l.Next()
			switch {
			case got == nil && exp.Type == EOF:
				// End of input reported as nil: matches the expected EOF.
				continue
			case got == nil:
				t.Fatalf("input: %q (%d) %#v != %#v", input, i, exp, got)
			case *got != exp:
				t.Fatalf("input: %s (%d) %#v != %#v", input, i, exp, got)
			}
		}
	}
}

View File

@ -1,92 +1,4 @@
package parse
import (
"bufio"
"strconv"
"github.com/mediocregopher/ginger/types"
)
//func ReadElem(r io.Reader) (types.Elem, error) {
// buf := bufio.NewReader(r)
// var err error
// for {
// }
//}
// ReadString reads in a string from the given reader. It assumes the first
// double-quote has already been read off. Ginger strings are wrapped with " and
// are allowed to have newlines literal in them. In all other respects they are
// the same as go strings.
//
// Input is consumed up to and including the closing quote, which is the first
// '"' not preceded by an unconsumed backslash (so `\\"` ends the string while
// `\"` does not). The collected literal is then decoded with strconv.Unquote.
// An error is returned if the reader fails or runs out before the closing
// quote, or if the literal contains an invalid escape sequence.
func ReadString(r *bufio.Reader) (types.Str, error) {
	// Rebuild the full literal, opening quote included, for strconv.Unquote.
	lit := []byte{'"'}
	escaped := false // true when the previous byte was an unpaired backslash
	for {
		b, err := r.ReadByte()
		if err != nil {
			return "", err
		}

		// strconv.Unquote rejects raw newlines inside double quotes, so a
		// literal newline is handed to it as the equivalent escape sequence.
		if b == '\n' && !escaped {
			lit = append(lit, '\\', 'n')
			continue
		}

		lit = append(lit, b)
		if escaped {
			escaped = false
			continue
		}
		if b == '\\' {
			escaped = true
			continue
		}
		if b == '"' {
			break
		}
	}

	ret, err := strconv.Unquote(string(lit))
	if err != nil {
		return "", err
	}
	return types.Str(ret), nil
}
// Returns (isNumber, isFloat). Can never return (false, true)
//
// el counts as a number when it is an optional leading '-' followed by ASCII
// digits with at most one '.', and contains at least one digit. It is a float
// when that '.' is present. Anything else, including the empty string, "-",
// "." and "1.2.3", is not a number.
func whatNumber(el string) (bool, bool) {
	if len(el) > 0 && el[0] == '-' {
		el = el[1:]
	}

	var sawDigit, isFloat bool
	for i := 0; i < len(el); i++ {
		c := el[i]
		switch {
		case c >= '0' && c <= '9':
			sawDigit = true
		case c == '.' && !isFloat:
			isFloat = true
		default:
			// A second '.' or any non-numeric byte.
			return false, false
		}
	}

	if !sawDigit {
		return false, false
	}
	return true, isFloat
}
// ParseBareElement turns el, which must be non-empty and contain no spaces,
// into an element. If whatNumber classifies it as a number it is parsed as a
// types.Float (when it has a decimal point) or a base-10 64-bit types.Int, and
// any strconv parse error is returned. Otherwise it becomes a types.Str that
// always begins with ':' (one is prepended when el does not already start
// with it).
func ParseBareElement(el string) (types.Elem, error) {
	isNumber, isFloat := whatNumber(el)

	switch {
	case isNumber && isFloat:
		f, err := strconv.ParseFloat(el, 64)
		if err != nil {
			return nil, err
		}
		return types.Float(f), nil

	case isNumber:
		i, err := strconv.ParseInt(el, 10, 64)
		if err != nil {
			return nil, err
		}
		return types.Int(i), nil
	}

	if el[0] != ':' {
		el = ":" + el
	}
	return types.Str(el), nil
}

View File

@ -1,60 +1,5 @@
package parse
import (
"bytes"
"bufio"
. "testing"
"github.com/mediocregopher/ginger/types"
)
// TestReadString checks that ReadString decodes quoted literals, including
// escape sequences and non-ASCII text, into the expected string values.
func TestReadString(t *T) {
	cases := map[string]types.Str{
		`"hey there"`:         "hey there",
		`"hey\nthere"`:        "hey\nthere",
		`"hey there ⌘"`:       "hey there ⌘",
		`"hey\nthere \u2318"`: "hey\nthere ⌘",
	}

	for in, want := range cases {
		src := bytes.NewBufferString(in)
		// ReadString expects the opening quote to have been consumed already.
		src.ReadByte()

		got, err := ReadString(bufio.NewReader(src))
		if err != nil {
			t.Fatal(err)
		}
		if want != got {
			t.Fatalf("`%s` != `%s`", want, got)
		}
	}
}
// TestParseBareElement checks that bare elements are parsed into ints, floats,
// or ':'-prefixed strings as appropriate.
func TestParseBareElement(t *T) {
	cases := map[string]types.Elem{
		`1`:         types.Int(1),
		`12`:        types.Int(12),
		`-1`:        types.Int(-1),
		`-12`:       types.Int(-12),
		`1.0`:       types.Float(1.0),
		`12.5`:      types.Float(12.5),
		`-12.5`:     types.Float(-12.5),
		`-`:         types.Str(":-"),
		`bare`:      types.Str(":bare"),
		`:not-bare`: types.Str(":not-bare"),
	}

	for in, want := range cases {
		got, err := ParseBareElement(in)
		if err != nil {
			t.Fatal(err)
		}
		if want != got {
			t.Fatalf("`%s` != `%s`", want, got)
		}
	}
}