From a3602c52a94258ba4d001c0e7be936fa1113db60 Mon Sep 17 00:00:00 2001 From: Brian Picciano Date: Wed, 6 Jul 2016 20:46:49 -0600 Subject: [PATCH] moved lexer, made it a lot better --- lex.go | 136 -------------------------- lex_test.go | 71 -------------- lexer/lexer.go | 232 ++++++++++++++++++++++++++++++++++++++++++++ lexer/lexer_test.go | 69 +++++++++++++ 4 files changed, 301 insertions(+), 207 deletions(-) delete mode 100644 lex.go delete mode 100644 lex_test.go create mode 100644 lexer/lexer.go create mode 100644 lexer/lexer_test.go diff --git a/lex.go b/lex.go deleted file mode 100644 index 9d04007..0000000 --- a/lex.go +++ /dev/null @@ -1,136 +0,0 @@ -package ginger - -import ( - "io" - "strings" - - "github.com/mediocregopher/lexgo" -) - -const ( - number lexgo.TokenType = lexgo.UserDefined + iota - identifier - punctuation -) - -var numberSet = "0123456789" -var whitespaceSet = " \n\r\t\v\f" -var punctuationSet = ",{}()<>|" - -func newLexer(r io.Reader) *lexgo.Lexer { - return lexgo.NewLexer(r, lexWhitespace) -} - -func lexWhitespace(lexer *lexgo.Lexer) lexgo.LexerFunc { - r, err := lexer.ReadRune() - if err != nil { - return nil - } - - if strings.ContainsRune(whitespaceSet, r) { - return lexWhitespace - } - - if r == '/' { - n, err := lexer.PeekRune() - if err != nil { - return nil - } - - var lexComment func(*lexgo.Lexer) bool - if n == '/' { - lexComment = lexLineComment - } else if n == '*' { - lexComment = lexBlockComment - } - if lexComment != nil { - if !lexComment(lexer) { - return nil - } - return lexWhitespace - } - } - - lexer.BufferRune(r) - - switch { - case strings.ContainsRune(punctuationSet, r): - return lexPunctuation - case strings.ContainsRune(numberSet, r): - return lexNumber - default: - return lexIdentifier - } -} - -// assumes the punctuation has already been buffered -func lexPunctuation(lexer *lexgo.Lexer) lexgo.LexerFunc { - lexer.Emit(punctuation) - return lexWhitespace -} - -func lexGeneralExpr(lexer *lexgo.Lexer, typ lexgo.TokenType) lexgo.LexerFunc { - for { - r, err := lexer.ReadRune() - if err != nil { - return nil - } - - if strings.ContainsRune(whitespaceSet, r) { - lexer.Emit(typ) - return lexWhitespace - } - - if strings.ContainsRune(punctuationSet, r) { - lexer.Emit(typ) - lexer.BufferRune(r) - return lexPunctuation - } - - lexer.BufferRune(r) - } -} - -func lexNumber(lexer *lexgo.Lexer) lexgo.LexerFunc { - return lexGeneralExpr(lexer, number) -} - -func lexIdentifier(lexer *lexgo.Lexer) lexgo.LexerFunc { - return lexGeneralExpr(lexer, identifier) -} - -func lexLineComment(lexer *lexgo.Lexer) bool { - for { - r, err := lexer.ReadRune() - if err != nil { - return false - } else if r == '\n' { - return true - } - } -} - -func lexBlockComment(lexer *lexgo.Lexer) bool { - for { - r, err := lexer.ReadRune() - if err != nil { - return false - } - - if r == '*' || r == '/' { - n, err := lexer.PeekRune() - if err != nil { - return false - } - if r == '*' && n == '/' { - _, err = lexer.ReadRune() - return err == nil - } - if r == '/' && n == '*' { - if !lexBlockComment(lexer) { - return false - } - } - } - } -} diff --git a/lex_test.go b/lex_test.go deleted file mode 100644 index b6447a9..0000000 --- a/lex_test.go +++ /dev/null @@ -1,71 +0,0 @@ -package ginger - -import ( - "bytes" - "io" - . 
"testing" - - "github.com/mediocregopher/lexgo" - "github.com/stretchr/testify/assert" -) - -var lexTestSrc = ` - // this is a comment - // // this is also a comment - a - anIdentifier - 1 - 100 - 1.5 - 1.5e9 - - /* block comment */ - prefix /* - Another block comment - /* Embedded */ - /* - Super embedded - */ - */ suffix - - // this one is kind of fun, technically it's a comment - /*/ - - (punctuation,is{cool}<> ) - -tab -` - -func TestLex(t *T) { - l := newLexer(bytes.NewBufferString(lexTestSrc)) - - assertNext := func(typ lexgo.TokenType, val string) { - t.Logf("asserting %q", val) - tok := l.Next() - assert.Equal(t, typ, tok.TokenType) - assert.Equal(t, val, tok.Val) - } - - assertNext(identifier, "a") - assertNext(identifier, "anIdentifier") - assertNext(number, "1") - assertNext(number, "100") - assertNext(number, "1.5") - assertNext(number, "1.5e9") - assertNext(identifier, "prefix") - assertNext(identifier, "suffix") - assertNext(punctuation, "(") - assertNext(identifier, "punctuation") - assertNext(punctuation, ",") - assertNext(identifier, "is") - assertNext(punctuation, "{") - assertNext(identifier, "cool") - assertNext(punctuation, "}") - assertNext(punctuation, "<") - assertNext(punctuation, ">") - assertNext(punctuation, ")") - assertNext(identifier, "-tab") - - tok := l.Next() - assert.Equal(t, tok.TokenType, lexgo.Err) - assert.Equal(t, tok.Err, io.EOF) -} diff --git a/lexer/lexer.go b/lexer/lexer.go new file mode 100644 index 0000000..1716203 --- /dev/null +++ b/lexer/lexer.go @@ -0,0 +1,232 @@ +package lexer + +import ( + "bufio" + "bytes" + "fmt" + "io" + "strings" +) + +// TokenType indicates the type of a token +type TokenType string + +// Different token types +const ( + Identifier TokenType = "identifier" + Punctuation TokenType = "punctuation" + String TokenType = "string" +) + +// Token is a single token which has been read in. All Tokens have a non-empty +// Val +type Token struct { + TokenType + Val string + Row, Col int +} + +type lexerFn func(*Lexer, rune, rune) lexerFn + +// Lexer is used to read in ginger tokens from a source. HasNext() must be +// called before every call to Next(), and Err() must be called once HasNext() +// returns false. +type Lexer struct { + in *bufio.Reader + out *bytes.Buffer + cur lexerFn + + next []Token + err error + + row, col int + absRow, absCol int +} + +// New returns a Lexer which will read tokens from the given source. +func New(r io.Reader) *Lexer { + return &Lexer{ + in: bufio.NewReader(r), + out: new(bytes.Buffer), + cur: lex, + + row: -1, + col: -1, + } +} + +func (l *Lexer) emit(t TokenType) { + str := l.out.String() + if str == "" { + panic("cannot emit empty token") + } + l.out.Reset() + + l.next = append(l.next, Token{ + TokenType: t, + Val: str, + Row: l.row, + Col: l.col, + }) + l.row = -1 + l.col = -1 +} + +func (l *Lexer) readRune() (rune, bool) { + r, _, err := l.in.ReadRune() + if err != nil { + l.err = err + return r, false + } + return r, true +} + +func (l *Lexer) peekRune() (rune, bool) { + r, ok := l.readRune() + if !ok { + return r, ok + } + + if err := l.in.UnreadRune(); err != nil { + l.err = err + return r, false + } + return r, true +} + +func (l *Lexer) bufferRune(r rune) { + l.out.WriteRune(r) + if l.row < 0 && l.col < 0 { + l.row, l.col = l.absRow, l.absCol + } +} + +// HasNext returns true if Next should be called, and false if it should not be +// called and Err should be called instead. 
When HasNext returns false the Lexer +// is considered to be done +func (l *Lexer) HasNext() bool { + if l.err != nil || l.cur == nil { + return false + } + + for { + if len(l.next) > 0 { + return true + } + + var ok bool + var r, n rune + if r, ok = l.readRune(); !ok { + return false + } + + if n, ok = l.peekRune(); !ok { + return false + } + + if r == '\n' { + l.absRow++ + l.absCol = 0 + } else { + l.absCol++ + } + + l.cur = l.cur(l, r, n) + } +} + +// Next returns the next available token. HasNext must be called before every +// call to Next +func (l *Lexer) Next() Token { + t := l.next[0] + l.next = l.next[1:] + if len(l.next) == 0 { + l.next = nil + } + return t +} + +// Err returns the error which caused HasNext to return false. Will return nil +// if the error was io.EOF +func (l *Lexer) Err() error { + if l.err != nil && l.err != io.EOF { + return l.err + } else if l.out.Len() > 0 { + return fmt.Errorf("incomplete token: %q", l.out.String()) + } + return nil +} + +var whitespaceSet = " \n\r\t\v\f" +var punctuationSet = ",{}()<>|" +var identifierSepSet = whitespaceSet + punctuationSet + +func lex(lexer *Lexer, r, n rune) lexerFn { + switch { + case strings.ContainsRune(whitespaceSet, r): + return lex + case r == '/' && n == '/': + return lexLineComment + case strings.ContainsRune(punctuationSet, r): + return lexPunctuation(lexer, r, n) + case r == '"' || r == '\'' || r == '`': + canEscape := r != '`' + return lexStrStart(lexer, r, makeLexStr(r, canEscape)) + default: + return lexIdentifier(lexer, r, n) + } +} + +func lexPunctuation(lexer *Lexer, r, n rune) lexerFn { + lexer.bufferRune(r) + lexer.emit(Punctuation) + return lex +} + +func lexIdentifier(lexer *Lexer, r, n rune) lexerFn { + if strings.ContainsRune(identifierSepSet, r) { + lexer.emit(Identifier) + return lex(lexer, r, n) + } + + lexer.bufferRune(r) + return lexIdentifier +} + +func lexLineComment(lexer *Lexer, r, n rune) lexerFn { + if r == '\n' { + return lex + } + return lexLineComment +} + +func lexStrStart(lexer *Lexer, r rune, then lexerFn) lexerFn { + lexer.bufferRune(r) + return then +} + +func makeLexStr(quoteC rune, canEscape bool) lexerFn { + var fn lexerFn + fn = func(lexer *Lexer, r, n rune) lexerFn { + if canEscape && r == '\\' && n == quoteC { + lexer.bufferRune(r) + lexer.bufferRune(n) + return lexSkipThen(fn) + } + + lexer.bufferRune(r) + if r == quoteC { + lexer.emit(String) + return lex + } + + return fn + } + return fn +} + +func lexSkipThen(then lexerFn) lexerFn { + return func(lexer *Lexer, r, n rune) lexerFn { + return then + } +} diff --git a/lexer/lexer_test.go b/lexer/lexer_test.go new file mode 100644 index 0000000..66ff831 --- /dev/null +++ b/lexer/lexer_test.go @@ -0,0 +1,69 @@ +package lexer + +import ( + "bytes" + . 
"testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +var lexTestSrc = ` + // this is a comment + // // this is also a comment + a + anIdentifier + 1 + 100 + 1.5 + 1.5e9 + + (punctuation,is{cool}<> ) + -tab + + "this is a string", "and so is this one" + "\"foo" + "bar\"baz\"" + "buz\0" +` + +func TestLex(t *T) { + l := New(bytes.NewBufferString(lexTestSrc)) + + assertNext := func(typ TokenType, val string, row, col int) { + t.Logf("asserting %s %q [row:%d col:%d]", typ, val, row, col) + require.True(t, l.HasNext()) + tok := l.Next() + assert.Equal(t, typ, tok.TokenType) + assert.Equal(t, val, tok.Val) + assert.Equal(t, row, tok.Row) + assert.Equal(t, col, tok.Col) + } + + assertNext(Identifier, "a", 3, 2) + assertNext(Identifier, "anIdentifier", 4, 2) + assertNext(Identifier, "1", 5, 2) + assertNext(Identifier, "100", 6, 2) + assertNext(Identifier, "1.5", 7, 2) + assertNext(Identifier, "1.5e9", 8, 2) + assertNext(Punctuation, "(", 10, 2) + assertNext(Identifier, "punctuation", 10, 3) + assertNext(Punctuation, ",", 10, 14) + assertNext(Identifier, "is", 10, 15) + assertNext(Punctuation, "{", 10, 17) + assertNext(Identifier, "cool", 10, 18) + assertNext(Punctuation, "}", 10, 22) + assertNext(Punctuation, "<", 10, 23) + assertNext(Punctuation, ">", 10, 24) + assertNext(Punctuation, ")", 10, 26) + assertNext(Identifier, "-tab", 11, 2) + assertNext(String, `"this is a string"`, 13, 2) + assertNext(Punctuation, ",", 13, 20) + assertNext(String, `"and so is this one"`, 13, 22) + assertNext(String, `"\"foo"`, 14, 2) + assertNext(String, `"bar\"baz\""`, 15, 2) + assertNext(String, `"buz\0"`, 16, 2) + + assert.False(t, l.HasNext()) + assert.Nil(t, l.Err()) +}