ginger/gg/lexer.go
Brian Picciano 33e59a3836 Implement Decoder
The decoder basically works, though there are some quirks in the design
I'll need to marinate on. For example, you can't have a tuple as an
edge value. This is probably fine?

Stringification of Graphs was added to aid in debugging the decoder, the
format it outputs is not the final one. Most likely the (future) encoder
will be used for that purpose.

The decoder is not implemented in the nicest way; it fully reads in the
LexerTokens first, and then processes. This made trying to wrap my head
around the problem a lot easier because it left fewer failure cases, but
it's not the most efficient thing to do.

Now that v0 is done it's pretty plain to see that the decoder could work
by only reading in the next N tokens that it needs at a time. But that
will be left for a future version.
2021-12-27 15:45:18 -07:00

272 lines
5.3 KiB
Go

package gg
import (
"bufio"
"fmt"
"io"
"strings"
"unicode"
)
// LexerError is the error type produced by a Lexer. It carries the underlying
// error along with the row and column it is being reported at.
type LexerError struct {
	// Err is the underlying error which was encountered.
	Err error

	// Row and Col locate the error within the stream being lexed.
	Row int
	Col int
}

// Error implements the error interface. The result has the form
// "<row>:<col>: <message of Err>".
func (e *LexerError) Error() string {
	msg := e.Err.Error()
	return fmt.Sprintf("%d:%d: %s", e.Row, e.Col, msg)
}

// Unwrap exposes the underlying error, so that the errors package can see
// through a LexerError to what it wraps.
func (e *LexerError) Unwrap() error {
	inner := e.Err
	return inner
}
// LexerTokenKind identifies which category a LexerToken falls into.
type LexerTokenKind string

// The set of LexerTokenKinds which a LexerToken can have.
const (
	LexerTokenKindName        = LexerTokenKind("name")
	LexerTokenKindNumber      = LexerTokenKind("number")
	LexerTokenKindPunctuation = LexerTokenKind("punctuation")
)
// LexerToken is a single lexical token, as produced by a Lexer, which is used
// when deserializing Graphs.
type LexerToken struct {
	// Kind is the category of the token.
	Kind LexerTokenKind

	// Value is the text of the token. It is never an empty string.
	Value string

	// Row and Col give the position of the token within the stream.
	Row int
	Col int
}
// Lexer is used to parse a string stream into a sequence of tokens which can
// then be parsed by a Parser.
type Lexer interface {
// Next returns the next LexerToken in the stream, or an error.
//
// When the stream being read from is finished the error returned is io.EOF
// wrapped in a LexerError; since LexerError implements Unwrap this can be
// detected using errors.Is(err, io.EOF).
Next() (LexerToken, error)
}
// lexer is the Lexer implementation which is returned from NewLexer.
type lexer struct {
	// r is the buffered source which runes are read from.
	r *bufio.Reader

	// stringBuilder is scratch space used by readWhile to accumulate the Value
	// of the token being read.
	stringBuilder *strings.Builder

	// err is set by Next once an error has been encountered, after which it is
	// returned from every further call to Next.
	err *LexerError

	// The remaining fields track the "cursor" position while reading: the row
	// and column of the most recently discarded rune, and that rune itself.
	lastRow  int
	lastCol  int
	prevRune rune
}
// NewLexer returns a Lexer which reads the given io.Reader as a sequence of
// utf-8 characters and parses those into a sequence of LexerTokens.
func NewLexer(r io.Reader) Lexer {
	l := new(lexer)
	l.r = bufio.NewReader(r)
	l.stringBuilder = new(strings.Builder)

	// No rune has been read yet, so the cursor starts one column before the
	// beginning of the first row; nextRowCol then reports 0,0 for the first
	// rune.
	l.lastRow, l.lastCol = 0, -1

	return l
}
// nextRowCol computes the row and column at which the next rune in the stream
// will be found: the start of the following row if the previously discarded
// rune was a newline, otherwise the next column of the current row.
func (l *lexer) nextRowCol() (int, int) {
	row, col := l.lastRow, l.lastCol+1
	if l.prevRune == '\n' {
		row, col = l.lastRow+1, 0
	}
	return row, col
}
// fmtErr wraps the given error in a LexerError which is positioned at the
// next rune to be read from the stream.
func (l *lexer) fmtErr(err error) *LexerError {
	lexErr := new(LexerError)
	lexErr.Err = err
	lexErr.Row, lexErr.Col = l.nextRowCol()
	return lexErr
}
// fmtErrf is like fmtErr, but the error being wrapped is constructed from the
// given format string and arguments using fmt.Errorf.
func (l *lexer) fmtErrf(str string, args ...interface{}) *LexerError {
	formatted := fmt.Errorf(str, args...)
	return l.fmtErr(formatted)
}
// discardRune consumes the next rune from the stream and advances the cursor
// past it.
//
// It must _only_ ever be called following a successful peekRune; a failure to
// read the rune here is treated as a broken invariant and results in a panic.
func (l *lexer) discardRune() {
	consumed, _, err := l.r.ReadRune()
	if err != nil {
		panic(err)
	}

	// The new position has to be computed before prevRune is overwritten,
	// since nextRowCol depends on the rune discarded prior to this one.
	row, col := l.nextRowCol()
	l.lastRow = row
	l.lastCol = col
	l.prevRune = consumed
}
// peekRune returns the next rune in the stream without consuming it; a
// following call to discardRune will consume that same rune.
//
// An error is returned if the underlying reader fails (including io.EOF at the
// end of the stream), or if the next bytes in the stream are not valid utf-8.
// The returned rune is always zero when an error is returned.
func (l *lexer) peekRune() (rune, error) {
	r, size, err := l.r.ReadRune()
	if err != nil {
		return 0, err
	}

	if err := l.r.UnreadRune(); err != nil {
		// since the most recent operation on the bufio.Reader was a ReadRune,
		// UnreadRune should never return an error
		panic(err)
	}

	// bufio.Reader reports an invalid encoding by returning the replacement
	// character with a size of 1, rather than by returning an error. A
	// replacement character which was legitimately present in the input is
	// encoded using 3 bytes, so the size tells the two cases apart. Without
	// this check malformed input would be silently lexed as punctuation.
	if r == unicode.ReplacementChar && size == 1 {
		return 0, fmt.Errorf("invalid utf-8 encoding")
	}

	return r, nil
}
// readWhile consumes runes from the stream for as long as the given predicate
// returns true for them, and returns a LexerToken of the given kind whose
// Value is made up of the runes which were consumed. The first rune for which
// the predicate returns false is left in the stream.
//
// The token's position is that of the first rune considered.
//
// If an error is encountered then both the token (or what's been read of it so
// far) and the error are returned.
func (l *lexer) readWhile(
	kind LexerTokenKind, pred func(rune) bool,
) (
	LexerToken, *LexerError,
) {
	tok := LexerToken{Kind: kind}
	tok.Row, tok.Col = l.nextRowCol()

	l.stringBuilder.Reset()

	var lexErr *LexerError
	for lexErr == nil {
		c, err := l.peekRune()

		switch {
		case err != nil:
			// ends the loop, the partial token is returned with the error.
			lexErr = l.fmtErrf("peeking next character: %w", err)

		case pred(c):
			l.stringBuilder.WriteRune(c)
			l.discardRune()

		default:
			tok.Value = l.stringBuilder.String()
			return tok, nil
		}
	}

	tok.Value = l.stringBuilder.String()
	return tok, lexErr
}
// isNumber returns true if the rune may be part of a number token. Only
// base-10 integers are supported at the moment, so that means the ASCII digits
// and the minus sign. Note that the minus sign is accepted regardless of where
// in the number it appears.
func isNumber(r rune) bool {
	if r == '-' {
		return true
	}
	return r >= '0' && r <= '9'
}
// next reads the next token from the stream, skipping over any whitespace and
// comments which precede it. It can return a token, an error, or both. If an
// error is returned then no further calls to next should occur.
//
// The order of the checks below is significant, as the character classes
// overlap: '*', '"', '`' and '-' would all otherwise be picked up as
// punctuation.
func (l *lexer) next() (LexerToken, *LexerError) {
	// a comment is everything up until a newline.
	inComment := func(c rune) bool { return c != '\n' }

	// following its first letter a name may contain letters, numbers, marks
	// and dashes.
	inName := func(c rune) bool {
		return c == '-' || unicode.In(c, unicode.Letter, unicode.Number, unicode.Mark)
	}

	for {
		head, peekErr := l.peekRune()
		if peekErr != nil {
			return LexerToken{}, l.fmtErrf("peeking next character: %w", peekErr)
		}

		// '*' begins a comment. The newline which terminates the comment is
		// left in the stream, and gets discarded as whitespace on the next
		// iteration.
		if head == '*' {
			if _, lexErr := l.readWhile("", inComment); lexErr != nil {
				return LexerToken{}, lexErr
			}
			continue
		}

		// double-quote and backtick are reserved for string parsing.
		if head == '"' || head == '`' {
			l.discardRune()
			return LexerToken{}, l.fmtErrf("string parsing not yet implemented")
		}

		// a letter denotes the start of a name.
		if unicode.IsLetter(head) {
			return l.readWhile(LexerTokenKindName, inName)
		}

		if isNumber(head) {
			return l.readWhile(LexerTokenKindNumber, isNumber)
		}

		// symbols are also considered to be punctuation. Each punctuation
		// token is a single rune; having just been discarded, the cursor is
		// sitting on that rune.
		if unicode.IsPunct(head) || unicode.IsSymbol(head) {
			l.discardRune()
			tok := LexerToken{
				Kind:  LexerTokenKindPunctuation,
				Value: string(head),
				Row:   l.lastRow,
				Col:   l.lastCol,
			}
			return tok, nil
		}

		// whitespace separates tokens but is otherwise skipped.
		if unicode.IsSpace(head) {
			l.discardRune()
			continue
		}

		return LexerToken{}, l.fmtErrf("unexpected character %q", head)
	}
}
// Next implements the method for the Lexer interface.
//
// Once an error has been encountered it is remembered, and is returned from
// every call from then on. If a token and an error are produced together then
// the token is returned by this call, with the error being held back until
// the following one.
func (l *lexer) Next() (LexerToken, error) {
	if l.err == nil {
		tok, lexErr := l.next()
		if lexErr == nil {
			return tok, nil
		}

		l.err = lexErr

		// a Kind having been set indicates that a token was read prior to
		// the error occurring.
		if tok.Kind != "" {
			return tok, nil
		}
	}

	return LexerToken{}, l.err
}