350 lines
6.3 KiB
Go
350 lines
6.3 KiB
Go
package lexer
|
|
|
|
import (
|
|
"bufio"
|
|
"bytes"
|
|
"errors"
|
|
"fmt"
|
|
"io"
|
|
"strings"
|
|
)
|
|
|
|
// TokenType indicates the type of a token
type TokenType string

// Different token types
const (
	// Identifier is any run of runes not in the separator sets
	Identifier TokenType = "identifier"

	// Punctuation are tokens which connect two other tokens
	Punctuation TokenType = "punctuation"

	// Wrapper wraps one or more tokens
	Wrapper TokenType = "wrapper"

	// String is a quoted string literal, quotes included
	String TokenType = "string"

	// Err indicates an error was hit; the token's Val holds the error message
	Err TokenType = "err"

	// EOF indicates the end of the input was reached; Val holds the message
	EOF TokenType = "eof"
)

// Token is a single token which has been read in. All Tokens have a non-empty
// Val
type Token struct {
	TokenType

	// Val is the raw text of the token (or the error message for Err/EOF)
	Val string

	// Row and Col are the position in the source where the token began
	Row, Col int
}
|
|
|
|
// Equal returns whether two tokens are of equal type and value
|
|
func (tok Token) Equal(tok2 Token) bool {
|
|
return tok.TokenType == tok2.TokenType && tok.Val == tok2.Val
|
|
}
|
|
|
|
// Err returns the error contained by the token, if any. Only returns non-nil if
|
|
// TokenType is Err or EOF
|
|
func (tok Token) Err() error {
|
|
if tok.TokenType == Err || tok.TokenType == EOF {
|
|
return fmt.Errorf("[line:%d col:%d] %s", tok.Row, tok.Col, tok.Val)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func (tok Token) String() string {
|
|
var typ string
|
|
switch tok.TokenType {
|
|
case Identifier:
|
|
typ = "ident"
|
|
case Punctuation:
|
|
typ = "punct"
|
|
case String:
|
|
typ = "str"
|
|
case Err, EOF:
|
|
typ = "err"
|
|
}
|
|
return fmt.Sprintf("%s(%q)", typ, tok.Val)
|
|
}
|
|
|
|
// lexerFn is a single state of the lexer's state machine. It consumes input
// from the given Lexer and returns the next state, or nil when lexing is done.
type lexerFn func(*Lexer) lexerFn

// Lexer is used to read in ginger tokens from a source. HasNext() must be
// called before every call to Next()
type Lexer struct {
	in  *bufio.Reader
	out *bytes.Buffer

	// cur is the current state function; nil once the lexer is done
	cur lexerFn

	// next is a queue of tokens ready to be returned by Next
	next []Token

	// row/col are the start position of the token currently being buffered in
	// out, or -1/-1 when no token is in progress
	row, col int

	// absRow/absCol track the absolute position of the last rune read from in
	absRow, absCol int
}
|
|
|
|
// New returns a Lexer which will read tokens from the given source.
|
|
func New(r io.Reader) *Lexer {
|
|
return &Lexer{
|
|
in: bufio.NewReader(r),
|
|
out: new(bytes.Buffer),
|
|
cur: lex,
|
|
|
|
row: -1,
|
|
col: -1,
|
|
}
|
|
}
|
|
|
|
func (l *Lexer) emit(t TokenType) {
|
|
str := l.out.String()
|
|
if str == "" {
|
|
panic("cannot emit empty token")
|
|
}
|
|
l.out.Reset()
|
|
|
|
l.emitTok(Token{
|
|
TokenType: t,
|
|
Val: str,
|
|
Row: l.row,
|
|
Col: l.col,
|
|
})
|
|
}
|
|
|
|
func (l *Lexer) emitErr(err error) {
|
|
tok := Token{
|
|
TokenType: Err,
|
|
Val: err.Error(),
|
|
Row: l.absRow,
|
|
Col: l.absCol,
|
|
}
|
|
if err == io.EOF {
|
|
tok.TokenType = EOF
|
|
}
|
|
l.emitTok(tok)
|
|
}
|
|
|
|
func (l *Lexer) emitTok(tok Token) {
|
|
l.next = append(l.next, tok)
|
|
l.row = -1
|
|
l.col = -1
|
|
}
|
|
|
|
func (l *Lexer) readRune() (rune, error) {
|
|
r, _, err := l.in.ReadRune()
|
|
if err != nil {
|
|
return r, err
|
|
}
|
|
|
|
if r == '\n' {
|
|
l.absRow++
|
|
l.absCol = 0
|
|
} else {
|
|
l.absCol++
|
|
}
|
|
|
|
return r, err
|
|
}
|
|
|
|
func (l *Lexer) peekRune() (rune, error) {
|
|
r, _, err := l.in.ReadRune()
|
|
if err != nil {
|
|
return r, err
|
|
}
|
|
|
|
if err := l.in.UnreadRune(); err != nil {
|
|
return r, err
|
|
}
|
|
return r, nil
|
|
}
|
|
|
|
func (l *Lexer) readAndPeek() (rune, rune, error) {
|
|
r, err := l.readRune()
|
|
if err != nil {
|
|
return r, 0, err
|
|
}
|
|
|
|
n, err := l.peekRune()
|
|
return r, n, err
|
|
}
|
|
|
|
func (l *Lexer) bufferRune(r rune) {
|
|
l.out.WriteRune(r)
|
|
if l.row < 0 && l.col < 0 {
|
|
l.row, l.col = l.absRow, l.absCol
|
|
}
|
|
}
|
|
|
|
// HasNext returns true if Next should be called, and false if it should not be
|
|
// called and Err should be called instead. When HasNext returns false the Lexer
|
|
// is considered to be done
|
|
func (l *Lexer) HasNext() bool {
|
|
for {
|
|
if len(l.next) > 0 {
|
|
return true
|
|
} else if l.cur == nil {
|
|
return false
|
|
}
|
|
l.cur = l.cur(l)
|
|
}
|
|
}
|
|
|
|
// Next returns the next available token. HasNext must be called before every
|
|
// call to Next
|
|
func (l *Lexer) Next() Token {
|
|
t := l.next[0]
|
|
l.next = l.next[1:]
|
|
if len(l.next) == 0 {
|
|
l.next = nil
|
|
}
|
|
return t
|
|
}
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
// the actual fsm

// whitespaceSet contains runes which separate tokens but are otherwise ignored
var whitespaceSet = " \n\r\t\v\f"

// punctuationSet contains runes emitted as Punctuation tokens
var punctuationSet = ",>"

// wrapperSet contains runes emitted as Wrapper tokens
var wrapperSet = "{}()"

// identifierSepSet contains every rune which terminates an identifier
var identifierSepSet = whitespaceSet + punctuationSet + wrapperSet
|
|
|
|
func lex(l *Lexer) lexerFn {
|
|
r, err := l.readRune()
|
|
if err != nil {
|
|
l.emitErr(err)
|
|
return nil
|
|
}
|
|
|
|
// handle comments first, cause we have to peek for those. We ignore errors,
|
|
// and assume that any error that would happen here will happen again the
|
|
// next read
|
|
if n, _ := l.peekRune(); r == '/' && n == '/' {
|
|
return lexLineComment
|
|
} else if r == '/' && n == '*' {
|
|
return lexBlockComment
|
|
}
|
|
|
|
return lexSingleRune(l, r)
|
|
}
|
|
|
|
func lexSingleRune(l *Lexer, r rune) lexerFn {
|
|
switch {
|
|
case strings.ContainsRune(whitespaceSet, r):
|
|
return lex
|
|
case strings.ContainsRune(punctuationSet, r):
|
|
l.bufferRune(r)
|
|
l.emit(Punctuation)
|
|
return lex
|
|
case strings.ContainsRune(wrapperSet, r):
|
|
l.bufferRune(r)
|
|
l.emit(Wrapper)
|
|
return lex
|
|
case r == '"' || r == '\'' || r == '`':
|
|
canEscape := r != '`'
|
|
return lexStrStart(l, r, makeLexStr(r, canEscape))
|
|
default:
|
|
l.bufferRune(r)
|
|
return lexIdentifier
|
|
}
|
|
}
|
|
|
|
func lexIdentifier(l *Lexer) lexerFn {
|
|
r, err := l.readRune()
|
|
if err != nil {
|
|
l.emit(Identifier)
|
|
l.emitErr(err)
|
|
return nil
|
|
}
|
|
|
|
if strings.ContainsRune(identifierSepSet, r) {
|
|
l.emit(Identifier)
|
|
return lexSingleRune(l, r)
|
|
}
|
|
|
|
l.bufferRune(r)
|
|
|
|
return lexIdentifier
|
|
}
|
|
|
|
func lexLineComment(l *Lexer) lexerFn {
|
|
r, err := l.readRune()
|
|
if err != nil {
|
|
l.emitErr(err)
|
|
return nil
|
|
}
|
|
if r == '\n' {
|
|
return lex
|
|
}
|
|
return lexLineComment
|
|
}
|
|
|
|
// assumes the starting / has been read already
func lexBlockComment(l *Lexer) lexerFn {
	// depth tracks comment nesting; the comment which got us here is the first
	// level
	depth := 1

	// recurse is handed back to the fsm repeatedly, carrying depth in its
	// closure until the comment is fully consumed
	var recurse lexerFn
	recurse = func(l *Lexer) lexerFn {
		r, err := l.readRune()
		if err != nil {
			l.emitErr(err)
			return nil
		}
		// peek errors are ignored here; any real error will recur on the next
		// readRune
		n, _ := l.peekRune()

		if r == '/' && n == '*' {
			depth++
		} else if r == '*' && n == '/' {
			depth--
		}

		if depth == 0 {
			// consume the peeked '/' which closed the comment, then resume
			// normal lexing
			return lexSkipThen(lex)
		}
		return recurse
	}
	return recurse
}
|
|
|
|
func lexStrStart(lexer *Lexer, r rune, then lexerFn) lexerFn {
|
|
lexer.bufferRune(r)
|
|
return then
|
|
}
|
|
|
|
func makeLexStr(quoteC rune, canEscape bool) lexerFn {
|
|
var fn lexerFn
|
|
fn = func(l *Lexer) lexerFn {
|
|
r, n, err := l.readAndPeek()
|
|
if err != nil {
|
|
if err == io.EOF {
|
|
if r == quoteC {
|
|
l.bufferRune(r)
|
|
l.emit(String)
|
|
l.emitErr(err)
|
|
return nil
|
|
}
|
|
l.emitErr(errors.New("expected end of string, got end of file"))
|
|
return nil
|
|
}
|
|
}
|
|
|
|
if canEscape && r == '\\' && n == quoteC {
|
|
l.bufferRune(r)
|
|
l.bufferRune(n)
|
|
return lexSkipThen(fn)
|
|
}
|
|
|
|
l.bufferRune(r)
|
|
if r == quoteC {
|
|
l.emit(String)
|
|
return lex
|
|
}
|
|
|
|
return fn
|
|
}
|
|
return fn
|
|
}
|
|
|
|
func lexSkipThen(then lexerFn) lexerFn {
|
|
return func(l *Lexer) lexerFn {
|
|
if _, err := l.readRune(); err != nil {
|
|
l.emitErr(err)
|
|
return nil
|
|
}
|
|
return then
|
|
}
|
|
}
|