moved lexer, made it a lot better

Brian Picciano 2016-07-06 20:46:49 -06:00
parent b8ef198384
commit a3602c52a9
4 changed files with 301 additions and 207 deletions

lex.go (136 lines deleted)

@@ -1,136 +0,0 @@
package ginger

import (
	"io"
	"strings"

	"github.com/mediocregopher/lexgo"
)

const (
	number lexgo.TokenType = lexgo.UserDefined + iota
	identifier
	punctuation
)

var numberSet = "0123456789"
var whitespaceSet = " \n\r\t\v\f"
var punctuationSet = ",{}()<>|"

func newLexer(r io.Reader) *lexgo.Lexer {
	return lexgo.NewLexer(r, lexWhitespace)
}

func lexWhitespace(lexer *lexgo.Lexer) lexgo.LexerFunc {
	r, err := lexer.ReadRune()
	if err != nil {
		return nil
	}

	if strings.ContainsRune(whitespaceSet, r) {
		return lexWhitespace
	}

	if r == '/' {
		n, err := lexer.PeekRune()
		if err != nil {
			return nil
		}

		var lexComment func(*lexgo.Lexer) bool
		if n == '/' {
			lexComment = lexLineComment
		} else if n == '*' {
			lexComment = lexBlockComment
		}
		if lexComment != nil {
			if !lexComment(lexer) {
				return nil
			}
			return lexWhitespace
		}
	}

	lexer.BufferRune(r)

	switch {
	case strings.ContainsRune(punctuationSet, r):
		return lexPunctuation
	case strings.ContainsRune(numberSet, r):
		return lexNumber
	default:
		return lexIdentifier
	}
}

// assumes the punctuation has already been buffered
func lexPunctuation(lexer *lexgo.Lexer) lexgo.LexerFunc {
	lexer.Emit(punctuation)
	return lexWhitespace
}

func lexGeneralExpr(lexer *lexgo.Lexer, typ lexgo.TokenType) lexgo.LexerFunc {
	for {
		r, err := lexer.ReadRune()
		if err != nil {
			return nil
		}

		if strings.ContainsRune(whitespaceSet, r) {
			lexer.Emit(typ)
			return lexWhitespace
		}

		if strings.ContainsRune(punctuationSet, r) {
			lexer.Emit(typ)
			lexer.BufferRune(r)
			return lexPunctuation
		}

		lexer.BufferRune(r)
	}
}

func lexNumber(lexer *lexgo.Lexer) lexgo.LexerFunc {
	return lexGeneralExpr(lexer, number)
}

func lexIdentifier(lexer *lexgo.Lexer) lexgo.LexerFunc {
	return lexGeneralExpr(lexer, identifier)
}

func lexLineComment(lexer *lexgo.Lexer) bool {
	for {
		r, err := lexer.ReadRune()
		if err != nil {
			return false
		} else if r == '\n' {
			return true
		}
	}
}

func lexBlockComment(lexer *lexgo.Lexer) bool {
	for {
		r, err := lexer.ReadRune()
		if err != nil {
			return false
		}

		if r == '*' || r == '/' {
			n, err := lexer.PeekRune()
			if err != nil {
				return false
			}
			if r == '*' && n == '/' {
				_, err = lexer.ReadRune()
				return err == nil
			}
			if r == '/' && n == '*' {
				if !lexBlockComment(lexer) {
					return false
				}
			}
		}
	}
}
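
Editor's note: for context, the lexgo-based lexer above was driven by calling Next until an error token appeared. A minimal driver sketch, assuming only the lexgo API already used in this file and in the old test below (not part of the commit):

	l := newLexer(strings.NewReader("foo,bar"))
	for {
		tok := l.Next()
		if tok.TokenType == lexgo.Err {
			// tok.Err carries the underlying error; io.EOF on a clean end
			break
		}
		fmt.Printf("%v %q\n", tok.TokenType, tok.Val)
	}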

lex_test.go (71 lines deleted)

@@ -1,71 +0,0 @@
package ginger

import (
	"bytes"
	"io"
	. "testing"

	"github.com/mediocregopher/lexgo"
	"github.com/stretchr/testify/assert"
)

var lexTestSrc = `
	// this is a comment
	// // this is also a comment
	a
	anIdentifier
	1
	100
	1.5
	1.5e9

	/* block comment */
	prefix /*
		Another block comment
		/* Embedded */
		/*
			Super embedded
		*/
	*/ suffix

	// this one is kind of fun, technically it's a comment
	/*/

	(punctuation,is{cool}<> )
	-tab
`

func TestLex(t *T) {
	l := newLexer(bytes.NewBufferString(lexTestSrc))

	assertNext := func(typ lexgo.TokenType, val string) {
		t.Logf("asserting %q", val)
		tok := l.Next()
		assert.Equal(t, typ, tok.TokenType)
		assert.Equal(t, val, tok.Val)
	}

	assertNext(identifier, "a")
	assertNext(identifier, "anIdentifier")
	assertNext(number, "1")
	assertNext(number, "100")
	assertNext(number, "1.5")
	assertNext(number, "1.5e9")
	assertNext(identifier, "prefix")
	assertNext(identifier, "suffix")
	assertNext(punctuation, "(")
	assertNext(identifier, "punctuation")
	assertNext(punctuation, ",")
	assertNext(identifier, "is")
	assertNext(punctuation, "{")
	assertNext(identifier, "cool")
	assertNext(punctuation, "}")
	assertNext(punctuation, "<")
	assertNext(punctuation, ">")
	assertNext(punctuation, ")")
	assertNext(identifier, "-tab")

	tok := l.Next()
	assert.Equal(t, tok.TokenType, lexgo.Err)
	assert.Equal(t, tok.Err, io.EOF)
}

lexer/lexer.go (new file, 232 lines)

@@ -0,0 +1,232 @@
package lexer

import (
	"bufio"
	"bytes"
	"fmt"
	"io"
	"strings"
)

// TokenType indicates the type of a token
type TokenType string

// Different token types
const (
	Identifier  TokenType = "identifier"
	Punctuation TokenType = "punctuation"
	String      TokenType = "string"
)

// Token is a single token which has been read in. All Tokens have a non-empty
// Val
type Token struct {
	TokenType
	Val      string
	Row, Col int
}

// lexerFn is a lexer state: it consumes the current rune r (with one rune n of
// lookahead) and returns the next state
type lexerFn func(*Lexer, rune, rune) lexerFn

// Lexer is used to read in ginger tokens from a source. HasNext() must be
// called before every call to Next(), and Err() must be called once HasNext()
// returns false.
type Lexer struct {
	in  *bufio.Reader
	out *bytes.Buffer

	cur  lexerFn
	next []Token
	err  error

	row, col       int
	absRow, absCol int
}

// New returns a Lexer which will read tokens from the given source.
func New(r io.Reader) *Lexer {
	return &Lexer{
		in:  bufio.NewReader(r),
		out: new(bytes.Buffer),
		cur: lex,
		row: -1,
		col: -1,
	}
}

// emit flushes the buffered runes as a single token of the given type,
// stamping it with the position where buffering began
func (l *Lexer) emit(t TokenType) {
	str := l.out.String()
	if str == "" {
		panic("cannot emit empty token")
	}
	l.out.Reset()

	l.next = append(l.next, Token{
		TokenType: t,
		Val:       str,
		Row:       l.row,
		Col:       l.col,
	})
	l.row = -1
	l.col = -1
}

// readRune reads the next rune from the source, storing any error encountered
func (l *Lexer) readRune() (rune, bool) {
	r, _, err := l.in.ReadRune()
	if err != nil {
		l.err = err
		return r, false
	}
	return r, true
}

// peekRune returns the next rune without consuming it
func (l *Lexer) peekRune() (rune, bool) {
	r, ok := l.readRune()
	if !ok {
		return r, ok
	}
	if err := l.in.UnreadRune(); err != nil {
		l.err = err
		return r, false
	}
	return r, true
}

// bufferRune appends r to the pending token, recording the token's position on
// the first rune buffered
func (l *Lexer) bufferRune(r rune) {
	l.out.WriteRune(r)
	if l.row < 0 && l.col < 0 {
		l.row, l.col = l.absRow, l.absCol
	}
}

// HasNext returns true if Next should be called, and false if it should not be
// called and Err should be called instead. When HasNext returns false the Lexer
// is considered to be done
func (l *Lexer) HasNext() bool {
	if l.err != nil || l.cur == nil {
		return false
	}

	for {
		if len(l.next) > 0 {
			return true
		}

		var ok bool
		var r, n rune
		if r, ok = l.readRune(); !ok {
			return false
		}
		if n, ok = l.peekRune(); !ok {
			return false
		}

		if r == '\n' {
			l.absRow++
			l.absCol = 0
		} else {
			l.absCol++
		}

		l.cur = l.cur(l, r, n)
	}
}

// Next returns the next available token. HasNext must be called before every
// call to Next
func (l *Lexer) Next() Token {
	t := l.next[0]
	l.next = l.next[1:]
	if len(l.next) == 0 {
		l.next = nil
	}
	return t
}

// Err returns the error which caused HasNext to return false. Will return nil
// if the error was io.EOF
func (l *Lexer) Err() error {
	if l.err != nil && l.err != io.EOF {
		return l.err
	} else if l.out.Len() > 0 {
		return fmt.Errorf("incomplete token: %q", l.out.String())
	}
	return nil
}

var whitespaceSet = " \n\r\t\v\f"
var punctuationSet = ",{}()<>|"
var identifierSepSet = whitespaceSet + punctuationSet

// lex is the entry state: it skips whitespace and dispatches to comment,
// punctuation, string, or identifier states
func lex(lexer *Lexer, r, n rune) lexerFn {
	switch {
	case strings.ContainsRune(whitespaceSet, r):
		return lex
	case r == '/' && n == '/':
		return lexLineComment
	case strings.ContainsRune(punctuationSet, r):
		return lexPunctuation(lexer, r, n)
	case r == '"' || r == '\'' || r == '`':
		canEscape := r != '`'
		return lexStrStart(lexer, r, makeLexStr(r, canEscape))
	default:
		return lexIdentifier(lexer, r, n)
	}
}

func lexPunctuation(lexer *Lexer, r, n rune) lexerFn {
	lexer.bufferRune(r)
	lexer.emit(Punctuation)
	return lex
}

func lexIdentifier(lexer *Lexer, r, n rune) lexerFn {
	if strings.ContainsRune(identifierSepSet, r) {
		lexer.emit(Identifier)
		return lex(lexer, r, n)
	}
	lexer.bufferRune(r)
	return lexIdentifier
}

func lexLineComment(lexer *Lexer, r, n rune) lexerFn {
	if r == '\n' {
		return lex
	}
	return lexLineComment
}

func lexStrStart(lexer *Lexer, r rune, then lexerFn) lexerFn {
	lexer.bufferRune(r)
	return then
}

// makeLexStr returns a state which consumes runes until the unescaped closing
// quote quoteC is seen
func makeLexStr(quoteC rune, canEscape bool) lexerFn {
	var fn lexerFn
	fn = func(lexer *Lexer, r, n rune) lexerFn {
		if canEscape && r == '\\' && n == quoteC {
			lexer.bufferRune(r)
			lexer.bufferRune(n)
			return lexSkipThen(fn)
		}
		lexer.bufferRune(r)
		if r == quoteC {
			lexer.emit(String)
			return lex
		}
		return fn
	}
	return fn
}

// lexSkipThen discards one rune and then transitions to then
func lexSkipThen(then lexerFn) lexerFn {
	return func(lexer *Lexer, r, n rune) lexerFn {
		return then
	}
}
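
Editor's note: the doc comments on Lexer, HasNext, Next, and Err define the calling protocol. A minimal consumer sketch (not part of this commit; the import path is assumed from the repo layout):

	package main

	import (
		"fmt"
		"strings"

		"github.com/mediocregopher/ginger/lexer"
	)

	func main() {
		l := lexer.New(strings.NewReader("foo(bar, \"baz\")\n"))
		// HasNext must be checked before every call to Next
		for l.HasNext() {
			tok := l.Next()
			fmt.Printf("%s %q at row %d, col %d\n", tok.TokenType, tok.Val, tok.Row, tok.Col)
		}
		// Err is nil when the source ended cleanly with io.EOF
		if err := l.Err(); err != nil {
			fmt.Println("lex error:", err)
		}
	}

Note how makeLexStr handles escapes: for input like `"bar\"baz\""`, seeing `\` with the closing quote next buffers both runes and routes through lexSkipThen, so an escaped quote never triggers the end-of-string check.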

lexer/lexer_test.go (new file, 69 lines)

@@ -0,0 +1,69 @@
package lexer

import (
	"bytes"
	. "testing"

	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)

var lexTestSrc = `
	// this is a comment
	// // this is also a comment
	a
	anIdentifier
	1
	100
	1.5
	1.5e9

	(punctuation,is{cool}<> )
	-tab

	"this is a string", "and so is this one"
	"\"foo"
	"bar\"baz\""
	"buz\0"
`

func TestLex(t *T) {
	l := New(bytes.NewBufferString(lexTestSrc))

	assertNext := func(typ TokenType, val string, row, col int) {
		t.Logf("asserting %s %q [row:%d col:%d]", typ, val, row, col)
		require.True(t, l.HasNext())
		tok := l.Next()
		assert.Equal(t, typ, tok.TokenType)
		assert.Equal(t, val, tok.Val)
		assert.Equal(t, row, tok.Row)
		assert.Equal(t, col, tok.Col)
	}

	assertNext(Identifier, "a", 3, 2)
	assertNext(Identifier, "anIdentifier", 4, 2)
	assertNext(Identifier, "1", 5, 2)
	assertNext(Identifier, "100", 6, 2)
	assertNext(Identifier, "1.5", 7, 2)
	assertNext(Identifier, "1.5e9", 8, 2)
	assertNext(Punctuation, "(", 10, 2)
	assertNext(Identifier, "punctuation", 10, 3)
	assertNext(Punctuation, ",", 10, 14)
	assertNext(Identifier, "is", 10, 15)
	assertNext(Punctuation, "{", 10, 17)
	assertNext(Identifier, "cool", 10, 18)
	assertNext(Punctuation, "}", 10, 22)
	assertNext(Punctuation, "<", 10, 23)
	assertNext(Punctuation, ">", 10, 24)
	assertNext(Punctuation, ")", 10, 26)
	assertNext(Identifier, "-tab", 11, 2)
	assertNext(String, `"this is a string"`, 13, 2)
	assertNext(Punctuation, ",", 13, 20)
	assertNext(String, `"and so is this one"`, 13, 22)
	assertNext(String, `"\"foo"`, 14, 2)
	assertNext(String, `"bar\"baz\""`, 15, 2)
	assertNext(String, `"buz\0"`, 16, 2)

	assert.False(t, l.HasNext())
	assert.Nil(t, l.Err())
}
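
Editor's note: one behavior worth calling out is how Err reports a trailing, partially-lexed token. A sketch of the truncated-input case (assuming the same package as above):

	l := lexer.New(strings.NewReader(`"unterminated`))
	for l.HasNext() {
		l.Next()
	}
	// prints something like: incomplete token: "\"unterminate"
	// (the final rune is dropped because peekRune hits io.EOF before it is buffered)
	fmt.Println(l.Err())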