Got basic parsing of single expressions, parentheses, and tuples done

This commit is contained in:
Brian Picciano 2016-07-22 14:38:20 -06:00
parent a3602c52a9
commit 76f963694f
4 changed files with 506 additions and 92 deletions

View File

@ -3,6 +3,7 @@ package lexer
import ( import (
"bufio" "bufio"
"bytes" "bytes"
"errors"
"fmt" "fmt"
"io" "io"
"strings" "strings"
@ -16,6 +17,8 @@ const (
Identifier TokenType = "identifier" Identifier TokenType = "identifier"
Punctuation TokenType = "punctuation" Punctuation TokenType = "punctuation"
String TokenType = "string" String TokenType = "string"
Err TokenType = "err"
EOF TokenType = "eof"
) )
// Token is a single token which has been read in. All Tokens have a non-empty // Token is a single token which has been read in. All Tokens have a non-empty
@ -26,18 +29,45 @@ type Token struct {
Row, Col int Row, Col int
} }
type lexerFn func(*Lexer, rune, rune) lexerFn // Equal returns whether two tokens are of equal type and value
func (tok Token) Equal(tok2 Token) bool {
	// Only the token's type and text take part in the comparison; Row and Col
	// are not looked at.
	if tok.TokenType != tok2.TokenType {
		return false
	}
	return tok.Val == tok2.Val
}
// Err returns the error carried by the token. The result is non-nil only when
// the token's TokenType is Err or EOF, in which case the error's message is the
// token's Val.
func (tok Token) Err() error {
	switch tok.TokenType {
	case Err, EOF:
		return errors.New(tok.Val)
	default:
		return nil
	}
}
// String renders the token as a short type label followed by its quoted value
// in parentheses, e.g. ident("foo"). Err and EOF tokens share the label "err";
// a TokenType not listed here gets an empty label.
func (tok Token) String() string {
	render := func(label string) string {
		return fmt.Sprintf("%s(%q)", label, tok.Val)
	}

	switch tok.TokenType {
	case Identifier:
		return render("ident")
	case Punctuation:
		return render("punct")
	case String:
		return render("str")
	case Err, EOF:
		return render("err")
	default:
		return render("")
	}
}
// lexerFn is a single lexing state. It is called with the Lexer and returns
// the lexerFn to run next. HasNext keeps running the current lexerFn until a
// token is pending, and reports false once the current lexerFn is nil and no
// tokens are pending.
type lexerFn func(*Lexer) lexerFn
// Lexer is used to read in ginger tokens from a source. HasNext() must be // Lexer is used to read in ginger tokens from a source. HasNext() must be
// called before every call to Next(), and Err() must be called once HasNext() // called before every call to Next()
// returns false.
type Lexer struct { type Lexer struct {
in *bufio.Reader in *bufio.Reader
out *bytes.Buffer out *bytes.Buffer
cur lexerFn cur lexerFn
next []Token next []Token
err error
row, col int row, col int
absRow, absCol int absRow, absCol int
@ -62,36 +92,69 @@ func (l *Lexer) emit(t TokenType) {
} }
l.out.Reset() l.out.Reset()
l.next = append(l.next, Token{ l.emitTok(Token{
TokenType: t, TokenType: t,
Val: str, Val: str,
Row: l.row, Row: l.row,
Col: l.col, Col: l.col,
}) })
}
// emitErr queues a token describing err, positioned at the lexer's absolute
// row and column. io.EOF is reported as an EOF token and every other error as
// an Err token; in both cases the token's Val is err.Error().
func (l *Lexer) emitErr(err error) {
	kind := Err
	if err == io.EOF {
		kind = EOF
	}

	l.emitTok(Token{
		TokenType: kind,
		Val:       err.Error(),
		Row:       l.absRow,
		Col:       l.absCol,
	})
}
func (l *Lexer) emitTok(tok Token) {
l.next = append(l.next, tok)
l.row = -1 l.row = -1
l.col = -1 l.col = -1
} }
func (l *Lexer) readRune() (rune, bool) { func (l *Lexer) readRune() (rune, error) {
r, _, err := l.in.ReadRune() r, _, err := l.in.ReadRune()
if err != nil { if err != nil {
l.err = err return r, err
return r, false
}
return r, true
} }
func (l *Lexer) peekRune() (rune, bool) { if r == '\n' {
r, ok := l.readRune() l.absRow++
if !ok { l.absCol = 0
return r, ok } else {
l.absCol++
}
return r, err
}
func (l *Lexer) peekRune() (rune, error) {
r, _, err := l.in.ReadRune()
if err != nil {
return r, err
} }
if err := l.in.UnreadRune(); err != nil { if err := l.in.UnreadRune(); err != nil {
l.err = err return r, err
return r, false
} }
return r, true return r, nil
}
func (l *Lexer) readAndPeek() (rune, rune, error) {
r, err := l.readRune()
if err != nil {
return r, 0, err
}
n, err := l.peekRune()
return r, n, err
} }
func (l *Lexer) bufferRune(r rune) { func (l *Lexer) bufferRune(r rune) {
@ -105,33 +168,13 @@ func (l *Lexer) bufferRune(r rune) {
// called and Err should be called instead. When HasNext returns false the Lexer // called and Err should be called instead. When HasNext returns false the Lexer
// is considered to be done // is considered to be done
func (l *Lexer) HasNext() bool { func (l *Lexer) HasNext() bool {
if l.err != nil || l.cur == nil {
return false
}
for { for {
if len(l.next) > 0 { if len(l.next) > 0 {
return true return true
} } else if l.cur == nil {
var ok bool
var r, n rune
if r, ok = l.readRune(); !ok {
return false return false
} }
l.cur = l.cur(l)
if n, ok = l.peekRune(); !ok {
return false
}
if r == '\n' {
l.absRow++
l.absCol = 0
} else {
l.absCol++
}
l.cur = l.cur(l, r, n)
} }
} }
@ -146,54 +189,68 @@ func (l *Lexer) Next() Token {
return t return t
} }
// Err returns the error which caused HasNext to return false. Will return nil
// if the error was io.EOF
func (l *Lexer) Err() error {
if l.err != nil && l.err != io.EOF {
return l.err
} else if l.out.Len() > 0 {
return fmt.Errorf("incomplete token: %q", l.out.String())
}
return nil
}
var whitespaceSet = " \n\r\t\v\f" var whitespaceSet = " \n\r\t\v\f"
var punctuationSet = ",{}()<>|" var punctuationSet = ",{}()<>|"
var identifierSepSet = whitespaceSet + punctuationSet var identifierSepSet = whitespaceSet + punctuationSet
func lex(lexer *Lexer, r, n rune) lexerFn { func lex(l *Lexer) lexerFn {
r, err := l.readRune()
if err != nil {
l.emitErr(err)
return nil
}
// handle comments first, cause we have to peek for those. We ignore errors,
// and assume that any error that would happen here will happen again the
// next read
if n, _ := l.peekRune(); n == '/' {
return lexLineComment
}
return lexSingleRune(l, r)
}
func lexSingleRune(l *Lexer, r rune) lexerFn {
switch { switch {
case strings.ContainsRune(whitespaceSet, r): case strings.ContainsRune(whitespaceSet, r):
return lex return lex
case r == '/' && n == '/':
return lexLineComment
case strings.ContainsRune(punctuationSet, r): case strings.ContainsRune(punctuationSet, r):
return lexPunctuation(lexer, r, n) l.bufferRune(r)
l.emit(Punctuation)
return lex
case r == '"' || r == '\'' || r == '`': case r == '"' || r == '\'' || r == '`':
canEscape := r != '`' canEscape := r != '`'
return lexStrStart(lexer, r, makeLexStr(r, canEscape)) return lexStrStart(l, r, makeLexStr(r, canEscape))
default: default:
return lexIdentifier(lexer, r, n) l.bufferRune(r)
return lexIdentifier
} }
} }
func lexPunctuation(lexer *Lexer, r, n rune) lexerFn { func lexIdentifier(l *Lexer) lexerFn {
lexer.bufferRune(r) r, err := l.readRune()
lexer.emit(Punctuation) if err != nil {
return lex l.emit(Identifier)
l.emitErr(err)
return nil
} }
func lexIdentifier(lexer *Lexer, r, n rune) lexerFn {
if strings.ContainsRune(identifierSepSet, r) { if strings.ContainsRune(identifierSepSet, r) {
lexer.emit(Identifier) l.emit(Identifier)
return lex(lexer, r, n) return lexSingleRune(l, r)
} }
lexer.bufferRune(r) l.bufferRune(r)
return lexIdentifier return lexIdentifier
} }
func lexLineComment(lexer *Lexer, r, n rune) lexerFn { func lexLineComment(l *Lexer) lexerFn {
r, err := l.readRune()
if err != nil {
l.emitErr(err)
return nil
}
if r == '\n' { if r == '\n' {
return lex return lex
} }
@ -207,16 +264,30 @@ func lexStrStart(lexer *Lexer, r rune, then lexerFn) lexerFn {
func makeLexStr(quoteC rune, canEscape bool) lexerFn { func makeLexStr(quoteC rune, canEscape bool) lexerFn {
var fn lexerFn var fn lexerFn
fn = func(lexer *Lexer, r, n rune) lexerFn { fn = func(l *Lexer) lexerFn {
r, n, err := l.readAndPeek()
if err != nil {
if err == io.EOF {
if r == quoteC {
l.bufferRune(r)
l.emit(String)
l.emitErr(err)
return nil
}
l.emitErr(errors.New("expected end of string, got end of file"))
return nil
}
}
if canEscape && r == '\\' && n == quoteC { if canEscape && r == '\\' && n == quoteC {
lexer.bufferRune(r) l.bufferRune(r)
lexer.bufferRune(n) l.bufferRune(n)
return lexSkipThen(fn) return lexSkipThen(fn)
} }
lexer.bufferRune(r) l.bufferRune(r)
if r == quoteC { if r == quoteC {
lexer.emit(String) l.emit(String)
return lex return lex
} }
@ -226,7 +297,11 @@ func makeLexStr(quoteC rune, canEscape bool) lexerFn {
} }
func lexSkipThen(then lexerFn) lexerFn { func lexSkipThen(then lexerFn) lexerFn {
return func(lexer *Lexer, r, n rune) lexerFn { return func(l *Lexer) lexerFn {
if _, err := l.readRune(); err != nil {
l.emitErr(err)
return nil
}
return then return then
} }
} }

View File

@ -63,7 +63,7 @@ func TestLex(t *T) {
assertNext(String, `"\"foo"`, 14, 2) assertNext(String, `"\"foo"`, 14, 2)
assertNext(String, `"bar\"baz\""`, 15, 2) assertNext(String, `"bar\"baz\""`, 15, 2)
assertNext(String, `"buz\0"`, 16, 2) assertNext(String, `"buz\0"`, 16, 2)
assertNext(EOF, "EOF", 17, 0)
assert.False(t, l.HasNext()) assert.False(t, l.HasNext())
assert.Nil(t, l.Err())
} }

286
types.go
View File

@ -1,27 +1,279 @@
package ginger package ginger
type Expr struct { import (
// [0-9]+ "fmt"
Int int "io"
"strconv"
"strings"
// true | false "github.com/mediocregopher/ginger/lexer"
Bool bool )
// [Expr [, Expr]] // TODO error type which incorporates token
Tuple []Expr
// { [Statement (;\s)]* } type tok lexer.Token
Block []Expr
// [Expr | Expr] func (t tok) Token() lexer.Token {
Pipeline []Expr return lexer.Token(t)
}
// [a-z]+ type Expr interface {
Identifier string Token() lexer.Token
String() string
// Expr > Expr // Equal should return true if the type and value of the other expression
Statement *struct { // are equal. The tokens shouldn't be taken into account
Input Expr Equal(Expr) bool
Into Expr }
////////////////////////////////////////////////////////////////////////////////
// Bool is a boolean literal expression together with the token it came from.
type Bool struct {
	tok
	val bool
}

// String formats the boolean value with fmt's default formatting.
func (b Bool) String() string {
	return fmt.Sprint(b.val)
}

// Equal reports whether e is also a Bool holding the same value. The tokens
// are not compared.
func (b Bool) Equal(e Expr) bool {
	if other, isBool := e.(Bool); isBool {
		return b.val == other.val
	}
	return false
}
////////////////////////////////////////////////////////////////////////////////
// Int is a 64-bit integer literal expression together with the token it came
// from.
type Int struct {
	tok
	val int64
}

// String formats the integer value with fmt's default formatting.
func (i Int) String() string {
	return fmt.Sprint(i.val)
}

// Equal reports whether e is also an Int holding the same value. The tokens
// are not compared.
func (i Int) Equal(e Expr) bool {
	if other, isInt := e.(Int); isInt {
		return i.val == other.val
	}
	return false
}
////////////////////////////////////////////////////////////////////////////////
// String is a string literal expression together with the token it came from.
// str holds the string's contents.
type String struct {
	tok
	str string
}

// String returns the contents as a double-quoted, ASCII-only Go string
// literal.
func (s String) String() string {
	quoted := strconv.QuoteToASCII(s.str)
	return quoted
}

// Equal reports whether e is also a String with the same contents. The tokens
// are not compared.
func (s String) Equal(e Expr) bool {
	if other, isString := e.(String); isString {
		return s.str == other.str
	}
	return false
}
////////////////////////////////////////////////////////////////////////////////
// Identifier is a named identifier expression together with the token it came
// from.
type Identifier struct {
	tok
	ident string
}

// String returns the identifier's name as-is.
func (id Identifier) String() string {
	return id.ident
}

// Equal reports whether e is also an Identifier with the same name. The tokens
// are not compared.
func (id Identifier) Equal(e Expr) bool {
	if other, isIdent := e.(Identifier); isIdent {
		return id.ident == other.ident
	}
	return false
}
////////////////////////////////////////////////////////////////////////////////
type Tuple struct {
exprs []Expr
}
func (tup Tuple) Token() lexer.Token {
return tup.exprs[0].Token()
}
func (tup Tuple) String() string {
strs := make([]string, len(tup.exprs))
for i := range tup.exprs {
strs[i] = tup.exprs[i].String()
}
return "(" + strings.Join(strs, ", ") + ")"
}
func (tup Tuple) Equal(e Expr) bool {
tuptup, ok := e.(Tuple)
if !ok || len(tuptup.exprs) != len(tup.exprs) {
return false
}
for i := range tup.exprs {
if !tup.exprs[i].Equal(tuptup.exprs[i]) {
return false
} }
} }
return true
}
////////////////////////////////////////////////////////////////////////////////
// sliceEnclosedToks splits toks around the group opened by its first token.
// toks[0] must be start. It returns the tokens strictly between that start and
// its matching end (nested start/end pairs are kept intact), followed by the
// tokens remaining after the matching end.
//
// An error is returned if toks is empty, if an Err or EOF token is reached
// before the group is closed, or if toks runs out without a matching end.
func sliceEnclosedToks(toks []lexer.Token, start, end lexer.Token) ([]lexer.Token, []lexer.Token, error) {
	if len(toks) == 0 {
		// Slicing toks[1:] below would panic on an empty slice.
		return nil, nil, fmt.Errorf("missing opening %v", start)
	}

	// depth counts the currently open groups; the start at toks[0] is the
	// first one.
	depth := 1
	// Kept non-nil so that an empty group yields an empty slice, not nil.
	enclosed := []lexer.Token{}
	for i, tok := range toks[1:] {
		if err := tok.Err(); err != nil {
			return nil, nil, fmt.Errorf("missing closing %v, hit error: %s", end, err)
		}

		if tok.Equal(start) {
			depth++
		} else if tok.Equal(end) {
			depth--
		}

		if depth == 0 {
			// i indexes toks[1:], so the matching end is toks[1+i] and the
			// remainder begins right after it.
			return enclosed, toks[2+i:], nil
		}
		enclosed = append(enclosed, tok)
	}

	return nil, nil, fmt.Errorf("missing closing %v", end)
}
// readAllToks runs a lexer over r until HasNext reports false and returns
// every token it produced, in order.
func readAllToks(r io.Reader) []lexer.Token {
	var all []lexer.Token
	for lx := lexer.New(r); lx.HasNext(); {
		all = append(all, lx.Next())
	}
	return all
}
// For all parse methods it is assumed that toks is not empty
// Punctuation tokens the parser compares against using Token.Equal. Only
// TokenType and Val are set here; Row and Col are left at their zero values.
var (
openParen = lexer.Token{TokenType: lexer.Punctuation, Val: "("}
closeParen = lexer.Token{TokenType: lexer.Punctuation, Val: ")"}
comma = lexer.Token{TokenType: lexer.Punctuation, Val: ","}
)
// parse reads one expression from the front of toks. If the token following
// that expression is punctuation, parsing continues through
// parseConnectingPunct with the expression as its root. It returns the parsed
// expression and the tokens left unconsumed.
func parse(toks []lexer.Token) (Expr, []lexer.Token, error) {
	first, rest, err := parseSingle(toks)
	if err != nil {
		return nil, nil, err
	}

	if len(rest) == 0 || rest[0].TokenType != lexer.Punctuation {
		return first, rest, nil
	}
	return parseConnectingPunct(rest, first)
}
// parseSingle parses exactly one expression from the front of toks: either a
// single non-punctuation token, or a parenthesised group that must itself
// contain exactly one expression. It returns the expression and the tokens
// following it.
//
// An error is returned if toks is empty, if the first token is an Err or EOF
// token, if a parenthesised group is unterminated, empty, or holds more than
// one expression, or if the token cannot be parsed.
func parseSingle(toks []lexer.Token) (Expr, []lexer.Token, error) {
	if len(toks) == 0 {
		// Indexing toks[0] below would panic on an empty slice.
		return nil, nil, fmt.Errorf("expected an expression, got no tokens")
	}

	if err := toks[0].Err(); err != nil {
		return nil, nil, err
	}

	if !toks[0].Equal(openParen) {
		expr, err := parseNonPunct(toks[0])
		if err != nil {
			return nil, nil, err
		}
		return expr, toks[1:], nil
	}

	starter := toks[0]
	inner, rest, err := sliceEnclosedToks(toks, openParen, closeParen)
	if err != nil {
		return nil, nil, err
	}
	if len(inner) == 0 {
		// "()" encloses nothing; report it instead of parsing an empty slice.
		return nil, nil, fmt.Errorf("empty parenthesis; %v", starter)
	}

	expr, inner, err := parse(inner)
	if err != nil {
		return nil, nil, err
	}
	if len(inner) > 0 {
		return nil, nil, fmt.Errorf("multiple expressions inside parenthesis; %v", starter)
	}
	return expr, rest, nil
}
// parseNonPunct parses a single Identifier or String token into an expression.
// Any other token type results in an error.
func parseNonPunct(tok lexer.Token) (Expr, error) {
	switch tok.TokenType {
	case lexer.Identifier:
		return parseIdentifier(tok)
	case lexer.String:
		return parseString(tok)
	default:
		return nil, fmt.Errorf("unexpected non-punctuation token: %v", tok)
	}
}
// parseIdentifier turns an identifier token into an expression. A value
// starting with '-' or a decimal digit is parsed as a base-10 int64 Int,
// "true" and "false" become Bool, and anything else becomes an Identifier.
//
// On failure the returned Expr is nil. That happens when the token's Val is
// empty or when a value that looks like an integer does not parse as one.
func parseIdentifier(t lexer.Token) (Expr, error) {
	if t.Val == "" {
		// Indexing t.Val[0] below would panic on an empty value.
		return nil, fmt.Errorf("empty identifier token: %v", t)
	}

	if c := t.Val[0]; c == '-' || (c >= '0' && c <= '9') {
		n, err := strconv.ParseInt(t.Val, 10, 64)
		if err != nil {
			return nil, err
		}
		return Int{tok: tok(t), val: n}, nil
	}

	switch t.Val {
	case "true":
		return Bool{tok: tok(t), val: true}, nil
	case "false":
		return Bool{tok: tok(t), val: false}, nil
	}

	return Identifier{tok: tok(t), ident: t.Val}, nil
}
// parseString turns a string token into a String expression by unquoting the
// token's Val with strconv.Unquote. If the value cannot be unquoted the
// returned Expr is nil and the strconv error is returned.
func parseString(t lexer.Token) (Expr, error) {
	str, err := strconv.Unquote(t.Val)
	if err != nil {
		return nil, err
	}
	return String{tok: tok(t), str: str}, nil
}
// parseConnectingPunct continues parsing after root when toks begins with a
// punctuation token that joins expressions. A comma is the only such token
// handled here and is passed on to parseTuple; anything else is an error.
// toks must not be empty.
func parseConnectingPunct(toks []lexer.Token, root Expr) (Expr, []lexer.Token, error) {
	punct := toks[0]
	if !punct.Equal(comma) {
		return nil, nil, fmt.Errorf("invalid connecting punctuation: %v", punct)
	}
	return parseTuple(toks, root)
}
// parseTuple builds a Tuple whose first element is root and whose further
// elements are read from toks as a sequence of comma-prefixed single
// expressions. It returns the Tuple and the tokens left unconsumed.
//
// root always becomes one element, even when it is itself a Tuple, so a
// parenthesised tuple in first position stays nested: "(a, b), c" yields
// ((a, b), c) and is not flattened into (a, b, c).
//
// Parsing stops at the first token that is not a comma, and also when fewer
// than two tokens remain, so a trailing comma is left in the returned tokens.
func parseTuple(toks []lexer.Token, root Expr) (Expr, []lexer.Token, error) {
	tup := Tuple{exprs: []Expr{root}}

	for len(toks) >= 2 && toks[0].Equal(comma) {
		expr, rest, err := parseSingle(toks[1:])
		if err != nil {
			return nil, nil, err
		}
		tup.exprs = append(tup.exprs, expr)
		toks = rest
	}

	return tup, toks, nil
}

87
types_test.go Normal file
View File

@ -0,0 +1,87 @@
package ginger
import (
. "testing"
"github.com/mediocregopher/ginger/lexer"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// TestSliceEnclosedToks checks that sliceEnclosedToks returns the tokens inside
// the leading parenthesised group and the tokens after it, including for
// nested groups.
func TestSliceEnclosedToks(t *T) {
	foo := lexer.Token{TokenType: lexer.Identifier, Val: "foo"}
	bar := lexer.Token{TokenType: lexer.Identifier, Val: "bar"}

	cases := []struct {
		in, expOut, expRem []lexer.Token
	}{
		{
			in:     []lexer.Token{openParen, closeParen},
			expOut: []lexer.Token{},
			expRem: []lexer.Token{},
		},
		{
			in:     []lexer.Token{openParen, foo, closeParen, bar},
			expOut: []lexer.Token{foo},
			expRem: []lexer.Token{bar},
		},
		{
			in:     []lexer.Token{openParen, foo, foo, closeParen, bar, bar},
			expOut: []lexer.Token{foo, foo},
			expRem: []lexer.Token{bar, bar},
		},
		{
			in:     []lexer.Token{openParen, foo, openParen, bar, closeParen, closeParen},
			expOut: []lexer.Token{foo, openParen, bar, closeParen},
			expRem: []lexer.Token{},
		},
		{
			in:     []lexer.Token{openParen, foo, openParen, bar, closeParen, bar, closeParen, foo},
			expOut: []lexer.Token{foo, openParen, bar, closeParen, bar},
			expRem: []lexer.Token{foo},
		},
	}

	for _, c := range cases {
		out, rem, err := sliceEnclosedToks(c.in, openParen, closeParen)
		require.Nil(t, err)
		assert.Equal(t, c.expOut, out)
		assert.Equal(t, c.expRem, rem)
	}
}
// assertParse runs parse on in, requires that it succeeds, and asserts that
// the resulting expression is Equal to expExpr and that the unconsumed tokens
// match expOut.
func assertParse(t *T, in []lexer.Token, expExpr Expr, expOut []lexer.Token) {
	gotExpr, gotRest, err := parse(in)
	require.Nil(t, err)

	t.Logf("expr:%v out:%v", gotExpr, gotRest)
	assert.True(t, expExpr.Equal(gotExpr))
	assert.Equal(t, expOut, gotRest)
}
// TestParseSingle checks parsing of a lone identifier, with and without
// surrounding (possibly nested) parentheses and trailing tokens.
func TestParseSingle(t *T) {
	foo := lexer.Token{TokenType: lexer.Identifier, Val: "foo"}
	fooExpr := Identifier{tok: tok(foo), ident: "foo"}

	cases := []struct {
		in, expOut []lexer.Token
	}{
		{
			in:     []lexer.Token{foo},
			expOut: []lexer.Token{},
		},
		{
			in:     []lexer.Token{foo, foo},
			expOut: []lexer.Token{foo},
		},
		{
			in:     []lexer.Token{openParen, foo, closeParen, foo},
			expOut: []lexer.Token{foo},
		},
		{
			in:     []lexer.Token{openParen, openParen, foo, closeParen, closeParen, foo},
			expOut: []lexer.Token{foo},
		},
	}

	for _, c := range cases {
		assertParse(t, c.in, fooExpr, c.expOut)
	}
}
// TestParseTuple checks parsing of comma-separated expressions into Tuples,
// including a parenthesised tuple nested as a later element and trailing
// tokens that must be left unconsumed.
func TestParseTuple(t *T) {
	tup := func(ee ...Expr) Expr {
		return Tuple{exprs: ee}
	}

	foo := lexer.Token{TokenType: lexer.Identifier, Val: "foo"}
	fooExpr := Identifier{tok: tok(foo), ident: "foo"}

	cases := []struct {
		in      []lexer.Token
		expExpr Expr
		expOut  []lexer.Token
	}{
		{
			in:      []lexer.Token{foo, comma, foo},
			expExpr: tup(fooExpr, fooExpr),
			expOut:  []lexer.Token{},
		},
		{
			in:      []lexer.Token{foo, comma, foo, foo},
			expExpr: tup(fooExpr, fooExpr),
			expOut:  []lexer.Token{foo},
		},
		{
			in:      []lexer.Token{foo, comma, foo, comma, foo},
			expExpr: tup(fooExpr, fooExpr, fooExpr),
			expOut:  []lexer.Token{},
		},
		{
			in:      []lexer.Token{foo, comma, foo, comma, foo, comma, foo},
			expExpr: tup(fooExpr, fooExpr, fooExpr, fooExpr),
			expOut:  []lexer.Token{},
		},
		{
			in:      []lexer.Token{foo, comma, openParen, foo, comma, foo, closeParen, comma, foo},
			expExpr: tup(fooExpr, tup(fooExpr, fooExpr), fooExpr),
			expOut:  []lexer.Token{},
		},
		{
			in:      []lexer.Token{foo, comma, openParen, foo, comma, foo, closeParen, comma, foo, foo},
			expExpr: tup(fooExpr, tup(fooExpr, fooExpr), fooExpr),
			expOut:  []lexer.Token{foo},
		},
	}

	for _, c := range cases {
		assertParse(t, c.in, c.expExpr, c.expOut)
	}
}