From 76f963694fb5224d8b1a6fe047136d30a7cc6b1a Mon Sep 17 00:00:00 2001
From: Brian Picciano
Date: Fri, 22 Jul 2016 14:38:20 -0600
Subject: [PATCH] got basic parsing of single expressions, parentheses, and tuples done

---
 lexer/lexer.go      | 217 +++++++++++++++++++++-----------
 lexer/lexer_test.go |   2 +-
 types.go            | 292 +++++++++++++++++++++++++++++++++++++++++---
 types_test.go       |  87 +++++++++++++
 4 files changed, 506 insertions(+), 92 deletions(-)
 create mode 100644 types_test.go

diff --git a/lexer/lexer.go b/lexer/lexer.go
index 1716203..2fc3355 100644
--- a/lexer/lexer.go
+++ b/lexer/lexer.go
@@ -3,6 +3,7 @@ package lexer
 import (
 	"bufio"
 	"bytes"
+	"errors"
 	"fmt"
 	"io"
 	"strings"
@@ -16,6 +17,8 @@ const (
 	Identifier  TokenType = "identifier"
 	Punctuation TokenType = "punctuation"
 	String      TokenType = "string"
+	Err         TokenType = "err"
+	EOF         TokenType = "eof"
 )
 
 // Token is a single token which has been read in. All Tokens have a non-empty
@@ -26,18 +29,45 @@ type Token struct {
 	Row, Col int
 }
 
-type lexerFn func(*Lexer, rune, rune) lexerFn
+// Equal returns whether two tokens are of equal type and value
+func (tok Token) Equal(tok2 Token) bool {
+	return tok.TokenType == tok2.TokenType && tok.Val == tok2.Val
+}
+
+// Err returns the error contained by the token, if any. Only returns non-nil if
+// TokenType is Err or EOF
+func (tok Token) Err() error {
+	if tok.TokenType == Err || tok.TokenType == EOF {
+		return errors.New(tok.Val)
+	}
+	return nil
+}
+
+func (tok Token) String() string {
+	var typ string
+	switch tok.TokenType {
+	case Identifier:
+		typ = "ident"
+	case Punctuation:
+		typ = "punct"
+	case String:
+		typ = "str"
+	case Err, EOF:
+		typ = "err"
+	}
+	return fmt.Sprintf("%s(%q)", typ, tok.Val)
+}
+
+type lexerFn func(*Lexer) lexerFn
 
 // Lexer is used to read in ginger tokens from a source. HasNext() must be
-// called before every call to Next(), and Err() must be called once HasNext()
-// returns false.
+// called before every call to Next()
 type Lexer struct {
 	in   *bufio.Reader
 	out  *bytes.Buffer
 	cur  lexerFn
 	next []Token
-	err  error
 
 	row, col       int
 	absRow, absCol int
@@ -62,36 +92,69 @@ func (l *Lexer) emit(t TokenType) {
 	}
 	l.out.Reset()
 
-	l.next = append(l.next, Token{
+	l.emitTok(Token{
 		TokenType: t,
 		Val:       str,
 		Row:       l.row,
 		Col:       l.col,
 	})
+}
+
+func (l *Lexer) emitErr(err error) {
+	tok := Token{
+		TokenType: Err,
+		Val:       err.Error(),
+		Row:       l.absRow,
+		Col:       l.absCol,
+	}
+	if err == io.EOF {
+		tok.TokenType = EOF
+	}
+	l.emitTok(tok)
+}
+
+func (l *Lexer) emitTok(tok Token) {
+	l.next = append(l.next, tok)
 	l.row = -1
 	l.col = -1
 }
 
-func (l *Lexer) readRune() (rune, bool) {
+func (l *Lexer) readRune() (rune, error) {
 	r, _, err := l.in.ReadRune()
 	if err != nil {
-		l.err = err
-		return r, false
+		return r, err
 	}
-	return r, true
+
+	if r == '\n' {
+		l.absRow++
+		l.absCol = 0
+	} else {
+		l.absCol++
+	}
+
+	return r, err
 }
 
-func (l *Lexer) peekRune() (rune, bool) {
-	r, ok := l.readRune()
-	if !ok {
-		return r, ok
+func (l *Lexer) peekRune() (rune, error) {
+	r, _, err := l.in.ReadRune()
+	if err != nil {
+		return r, err
 	}
 	if err := l.in.UnreadRune(); err != nil {
-		l.err = err
-		return r, false
+		return r, err
 	}
-	return r, true
+	return r, nil
+}
+
+func (l *Lexer) readAndPeek() (rune, rune, error) {
+	r, err := l.readRune()
+	if err != nil {
+		return r, 0, err
+	}
+
+	n, err := l.peekRune()
+	return r, n, err
 }
 
 func (l *Lexer) bufferRune(r rune) {
@@ -105,33 +168,13 @@
 // called and Err should be called instead. When HasNext returns false the Lexer
 // is considered to be done
 func (l *Lexer) HasNext() bool {
-	if l.err != nil || l.cur == nil {
-		return false
-	}
-
 	for {
 		if len(l.next) > 0 {
 			return true
-		}
-
-		var ok bool
-		var r, n rune
-		if r, ok = l.readRune(); !ok {
+		} else if l.cur == nil {
 			return false
 		}
-
-		if n, ok = l.peekRune(); !ok {
-			return false
-		}
-
-		if r == '\n' {
-			l.absRow++
-			l.absCol = 0
-		} else {
-			l.absCol++
-		}
-
-		l.cur = l.cur(l, r, n)
+		l.cur = l.cur(l)
 	}
 }
 
@@ -146,54 +189,68 @@ func (l *Lexer) Next() Token {
 	return t
 }
 
-// Err returns the error which caused HasNext to return false. Will return nil
-// if the error was io.EOF
-func (l *Lexer) Err() error {
-	if l.err != nil && l.err != io.EOF {
-		return l.err
-	} else if l.out.Len() > 0 {
-		return fmt.Errorf("incomplete token: %q", l.out.String())
-	}
-	return nil
-}
-
 var whitespaceSet = " \n\r\t\v\f"
 var punctuationSet = ",{}()<>|"
 var identifierSepSet = whitespaceSet + punctuationSet
 
-func lex(lexer *Lexer, r, n rune) lexerFn {
+func lex(l *Lexer) lexerFn {
+	r, err := l.readRune()
+	if err != nil {
+		l.emitErr(err)
+		return nil
+	}
+
+	// handle comments first, because we have to peek for those. We ignore
+	// errors, and assume that any error that would happen here will happen
+	// again on the next read
+	if n, _ := l.peekRune(); n == '/' {
+		return lexLineComment
+	}
+
+	return lexSingleRune(l, r)
+}
+
+func lexSingleRune(l *Lexer, r rune) lexerFn {
 	switch {
 	case strings.ContainsRune(whitespaceSet, r):
 		return lex
-	case r == '/' && n == '/':
-		return lexLineComment
 	case strings.ContainsRune(punctuationSet, r):
-		return lexPunctuation(lexer, r, n)
+		l.bufferRune(r)
+		l.emit(Punctuation)
+		return lex
 	case r == '"' || r == '\'' || r == '`':
 		canEscape := r != '`'
-		return lexStrStart(lexer, r, makeLexStr(r, canEscape))
+		return lexStrStart(l, r, makeLexStr(r, canEscape))
 	default:
-		return lexIdentifier(lexer, r, n)
+		l.bufferRune(r)
+		return lexIdentifier
 	}
 }
 
-func lexPunctuation(lexer *Lexer, r, n rune) lexerFn {
-	lexer.bufferRune(r)
-	lexer.emit(Punctuation)
-	return lex
-}
+func lexIdentifier(l *Lexer) lexerFn {
+	r, err := l.readRune()
+	if err != nil {
+		l.emit(Identifier)
+		l.emitErr(err)
+		return nil
+	}
 
-func lexIdentifier(lexer *Lexer, r, n rune) lexerFn {
 	if strings.ContainsRune(identifierSepSet, r) {
-		lexer.emit(Identifier)
-		return lex(lexer, r, n)
+		l.emit(Identifier)
+		return lexSingleRune(l, r)
 	}
 
-	lexer.bufferRune(r)
+	l.bufferRune(r)
+	return lexIdentifier
 }
 
-func lexLineComment(lexer *Lexer, r, n rune) lexerFn {
+func lexLineComment(l *Lexer) lexerFn {
+	r, err := l.readRune()
+	if err != nil {
+		l.emitErr(err)
+		return nil
+	}
 	if r == '\n' {
 		return lex
 	}
@@ -207,16 +264,30 @@ func lexStrStart(lexer *Lexer, r rune, then lexerFn) lexerFn {
 
 func makeLexStr(quoteC rune, canEscape bool) lexerFn {
 	var fn lexerFn
-	fn = func(lexer *Lexer, r, n rune) lexerFn {
+	fn = func(l *Lexer) lexerFn {
+		r, n, err := l.readAndPeek()
+		if err != nil {
+			if err == io.EOF {
+				if r == quoteC {
+					l.bufferRune(r)
+					l.emit(String)
+					l.emitErr(err)
+					return nil
+				}
+				l.emitErr(errors.New("expected end of string, got end of file"))
+				return nil
+			}
+		}
+
 		if canEscape && r == '\\' && n == quoteC {
-			lexer.bufferRune(r)
-			lexer.bufferRune(n)
+			l.bufferRune(r)
+			l.bufferRune(n)
 			return lexSkipThen(fn)
 		}
 
-		lexer.bufferRune(r)
+		l.bufferRune(r)
 		if r == quoteC {
-			lexer.emit(String)
+			l.emit(String)
 			return lex
 		}
 
@@ -226,7 +297,11 @@
 }
 
 func lexSkipThen(then lexerFn) lexerFn {
-	return func(lexer *Lexer, r, n rune) lexerFn {
+	return func(l *Lexer) lexerFn {
+		if _, err := l.readRune(); err != nil {
+			l.emitErr(err)
+			return nil
+		}
 		return then
 	}
 }
diff --git a/lexer/lexer_test.go b/lexer/lexer_test.go
index 66ff831..b80d9aa 100644
--- a/lexer/lexer_test.go
+++ b/lexer/lexer_test.go
@@ -63,7 +63,7 @@ func TestLex(t *T) {
 	assertNext(String, `"\"foo"`, 14, 2)
 	assertNext(String, `"bar\"baz\""`, 15, 2)
 	assertNext(String, `"buz\0"`, 16, 2)
+	assertNext(EOF, "EOF", 17, 0)
 
 	assert.False(t, l.HasNext())
-	assert.Nil(t, l.Err())
 }
diff --git a/types.go b/types.go
index 8f42c88..341096e 100644
--- a/types.go
+++ b/types.go
@@ -1,27 +1,279 @@
 package ginger
 
-type Expr struct {
-	// [0-9]+
-	Int int
+import (
+	"fmt"
+	"io"
+	"strconv"
+	"strings"
 
-	// true | false
-	Bool bool
+	"github.com/mediocregopher/ginger/lexer"
+)
 
-	// [Expr [, Expr]]
-	Tuple []Expr
+// TODO error type which incorporates token
 
-	// { [Statement (;\s)]* }
-	Block []Expr
+type tok lexer.Token
 
-	// [Expr | Expr]
-	Pipeline []Expr
-
-	// [a-z]+
-	Identifier string
-
-	// Expr > Expr
-	Statement *struct {
-		Input Expr
-		Into  Expr
-	}
+func (t tok) Token() lexer.Token {
+	return lexer.Token(t)
+}
+
+type Expr interface {
+	Token() lexer.Token
+	String() string
+
+	// Equal should return true if the type and value of the other expression
+	// are equal. The tokens shouldn't be taken into account
+	Equal(Expr) bool
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+type Bool struct {
+	tok
+	val bool
+}
+
+func (b Bool) String() string {
+	return fmt.Sprint(b.val)
+}
+
+func (b Bool) Equal(e Expr) bool {
+	bb, ok := e.(Bool)
+	if !ok {
+		return false
+	}
+	return bb.val == b.val
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+type Int struct {
+	tok
+	val int64
+}
+
+func (i Int) String() string {
+	return fmt.Sprint(i.val)
+}
+
+func (i Int) Equal(e Expr) bool {
+	ii, ok := e.(Int)
+	if !ok {
+		return false
+	}
+	return ii.val == i.val
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+type String struct {
+	tok
+	str string
+}
+
+func (s String) String() string {
+	return strconv.QuoteToASCII(s.str)
+}
+
+func (s String) Equal(e Expr) bool {
+	ss, ok := e.(String)
+	if !ok {
+		return false
+	}
+	return ss.str == s.str
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+type Identifier struct {
+	tok
+	ident string
+}
+
+func (id Identifier) String() string {
+	return id.ident
+}
+
+func (id Identifier) Equal(e Expr) bool {
+	idid, ok := e.(Identifier)
+	if !ok {
+		return false
+	}
+	return idid.ident == id.ident
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+type Tuple struct {
+	exprs []Expr
+}
+
+func (tup Tuple) Token() lexer.Token {
+	return tup.exprs[0].Token()
+}
+
+func (tup Tuple) String() string {
+	strs := make([]string, len(tup.exprs))
+	for i := range tup.exprs {
+		strs[i] = tup.exprs[i].String()
+	}
+	return "(" + strings.Join(strs, ", ") + ")"
+}
+
+func (tup Tuple) Equal(e Expr) bool {
+	tuptup, ok := e.(Tuple)
+	if !ok || len(tuptup.exprs) != len(tup.exprs) {
+		return false
+	}
+	for i := range tup.exprs {
+		if !tup.exprs[i].Equal(tuptup.exprs[i]) {
+			return false
+		}
+	}
+	return true
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+// toks[0] must be start
+func sliceEnclosedToks(toks []lexer.Token, start, end lexer.Token) ([]lexer.Token, []lexer.Token, error) {
+	c := 1
+	ret := []lexer.Token{}
+	for i, tok := range toks[1:] {
+		if err := tok.Err(); err != nil {
+			return nil, nil, fmt.Errorf("missing closing %v, hit error: %s", end, err)
+		}
+
+		if tok.Equal(start) {
+			c++
+		} else if tok.Equal(end) {
+			c--
+		}
+		if c == 0 {
+			return ret, toks[2+i:], nil
+		}
+		ret = append(ret, tok)
+	}
+
+	return nil, nil, fmt.Errorf("missing closing %v", end)
+}
+
+func readAllToks(r io.Reader) []lexer.Token {
+	l := lexer.New(r)
+	var toks []lexer.Token
+	for l.HasNext() {
+		toks = append(toks, l.Next())
+	}
+	return toks
+}
+
+// For all parse methods it is assumed that toks is not empty
+
+var (
+	openParen  = lexer.Token{TokenType: lexer.Punctuation, Val: "("}
+	closeParen = lexer.Token{TokenType: lexer.Punctuation, Val: ")"}
+	comma      = lexer.Token{TokenType: lexer.Punctuation, Val: ","}
+)
+
+func parse(toks []lexer.Token) (Expr, []lexer.Token, error) {
+	expr, toks, err := parseSingle(toks)
+	if err != nil {
+		return nil, nil, err
+	}
+
+	if len(toks) > 0 && toks[0].TokenType == lexer.Punctuation {
+		return parseConnectingPunct(toks, expr)
+	}
+
+	return expr, toks, nil
+}
+
+func parseSingle(toks []lexer.Token) (Expr, []lexer.Token, error) {
+	var expr Expr
+	var err error
+
+	if err := toks[0].Err(); err != nil {
+		return nil, nil, err
+	}
+
+	if toks[0].Equal(openParen) {
+		starter := toks[0]
+		var ptoks []lexer.Token
+		ptoks, toks, err = sliceEnclosedToks(toks, openParen, closeParen)
+		if err != nil {
+			return nil, nil, err
+		}
+
+		if expr, ptoks, err = parse(ptoks); err != nil {
+			return nil, nil, err
+		} else if len(ptoks) > 0 {
+			return nil, nil, fmt.Errorf("multiple expressions inside parentheses; %v", starter)
+		}
+		return expr, toks, nil
+	}
+
+	if expr, err = parseNonPunct(toks[0]); err != nil {
+		return nil, nil, err
+	}
+	return expr, toks[1:], nil
+}
+
+func parseNonPunct(tok lexer.Token) (Expr, error) {
+	if tok.TokenType == lexer.Identifier {
+		return parseIdentifier(tok)
+	} else if tok.TokenType == lexer.String {
+		return parseString(tok)
+	}
+
+	return nil, fmt.Errorf("unexpected non-punctuation token: %v", tok)
+}
+
+func parseIdentifier(t lexer.Token) (Expr, error) {
+	if t.Val[0] == '-' || (t.Val[0] >= '0' && t.Val[0] <= '9') {
+		n, err := strconv.ParseInt(t.Val, 10, 64)
+		return Int{tok: tok(t), val: n}, err
+	}
+
+	if t.Val == "true" {
+		return Bool{tok: tok(t), val: true}, nil
+	} else if t.Val == "false" {
+		return Bool{tok: tok(t), val: false}, nil
+	}
+
+	return Identifier{tok: tok(t), ident: t.Val}, nil
+}
+
+func parseString(t lexer.Token) (Expr, error) {
+	str, err := strconv.Unquote(t.Val)
+	return String{tok: tok(t), str: str}, err
+}
+
+func parseConnectingPunct(toks []lexer.Token, root Expr) (Expr, []lexer.Token, error) {
+	if toks[0].Equal(comma) {
+		return parseTuple(toks, root)
+	}
+
+	return nil, nil, fmt.Errorf("invalid connecting punctuation: %v", toks[0])
+}
+
+func parseTuple(toks []lexer.Token, root Expr) (Expr, []lexer.Token, error) {
+	rootTup, ok := root.(Tuple)
+	if !ok {
+		rootTup = Tuple{exprs: []Expr{root}}
+	}
+
+	if len(toks) < 2 {
+		return rootTup, toks, nil
+	} else if !toks[0].Equal(comma) {
+		return rootTup, toks, nil
+	}
+
+	var expr Expr
+	var err error
+	if expr, toks, err = parseSingle(toks[1:]); err != nil {
+		return nil, nil, err
+	}
+
+	rootTup.exprs = append(rootTup.exprs, expr)
+	return parseTuple(toks, rootTup)
 }
diff --git a/types_test.go b/types_test.go
new file mode 100644
index 0000000..23340b8
--- /dev/null
+++ b/types_test.go
@@ -0,0 +1,87 @@
+package ginger
+
+import (
"testing" + + "github.com/mediocregopher/ginger/lexer" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestSliceEnclosedToks(t *T) { + doAssert := func(in, expOut, expRem []lexer.Token) { + out, rem, err := sliceEnclosedToks(in, openParen, closeParen) + require.Nil(t, err) + assert.Equal(t, expOut, out) + assert.Equal(t, expRem, rem) + } + foo := lexer.Token{TokenType: lexer.Identifier, Val: "foo"} + bar := lexer.Token{TokenType: lexer.Identifier, Val: "bar"} + + toks := []lexer.Token{openParen, closeParen} + doAssert(toks, []lexer.Token{}, []lexer.Token{}) + + toks = []lexer.Token{openParen, foo, closeParen, bar} + doAssert(toks, []lexer.Token{foo}, []lexer.Token{bar}) + + toks = []lexer.Token{openParen, foo, foo, closeParen, bar, bar} + doAssert(toks, []lexer.Token{foo, foo}, []lexer.Token{bar, bar}) + + toks = []lexer.Token{openParen, foo, openParen, bar, closeParen, closeParen} + doAssert(toks, []lexer.Token{foo, openParen, bar, closeParen}, []lexer.Token{}) + + toks = []lexer.Token{openParen, foo, openParen, bar, closeParen, bar, closeParen, foo} + doAssert(toks, []lexer.Token{foo, openParen, bar, closeParen, bar}, []lexer.Token{foo}) +} + +func assertParse(t *T, in []lexer.Token, expExpr Expr, expOut []lexer.Token) { + expr, out, err := parse(in) + require.Nil(t, err) + t.Logf("expr:%v out:%v", expr, out) + assert.True(t, expExpr.Equal(expr)) + assert.Equal(t, expOut, out) +} + +func TestParseSingle(t *T) { + foo := lexer.Token{TokenType: lexer.Identifier, Val: "foo"} + fooExpr := Identifier{tok: tok(foo), ident: "foo"} + + toks := []lexer.Token{foo} + assertParse(t, toks, fooExpr, []lexer.Token{}) + + toks = []lexer.Token{foo, foo} + assertParse(t, toks, fooExpr, []lexer.Token{foo}) + + toks = []lexer.Token{openParen, foo, closeParen, foo} + assertParse(t, toks, fooExpr, []lexer.Token{foo}) + + toks = []lexer.Token{openParen, openParen, foo, closeParen, closeParen, foo} + assertParse(t, toks, fooExpr, []lexer.Token{foo}) +} + +func TestParseTuple(t *T) { + tup := func(ee ...Expr) Expr { + return Tuple{exprs: ee} + } + + foo := lexer.Token{TokenType: lexer.Identifier, Val: "foo"} + fooExpr := Identifier{tok: tok(foo), ident: "foo"} + + toks := []lexer.Token{foo, comma, foo} + assertParse(t, toks, tup(fooExpr, fooExpr), []lexer.Token{}) + + toks = []lexer.Token{foo, comma, foo, foo} + assertParse(t, toks, tup(fooExpr, fooExpr), []lexer.Token{foo}) + + toks = []lexer.Token{foo, comma, foo, comma, foo} + assertParse(t, toks, tup(fooExpr, fooExpr, fooExpr), []lexer.Token{}) + + toks = []lexer.Token{foo, comma, foo, comma, foo, comma, foo} + assertParse(t, toks, tup(fooExpr, fooExpr, fooExpr, fooExpr), []lexer.Token{}) + + toks = []lexer.Token{foo, comma, openParen, foo, comma, foo, closeParen, comma, foo} + assertParse(t, toks, tup(fooExpr, tup(fooExpr, fooExpr), fooExpr), []lexer.Token{}) + + toks = []lexer.Token{foo, comma, openParen, foo, comma, foo, closeParen, comma, foo, foo} + assertParse(t, toks, tup(fooExpr, tup(fooExpr, fooExpr), fooExpr), []lexer.Token{foo}) +}