ginger/gg/lexer.go

package gg

import (
	"bufio"
	"fmt"
	"io"
	"strings"
	"unicode"
)

// LexerError is returned by Lexer when an unexpected error occurs parsing a
// stream of LexerTokens. It pairs the underlying error with the row and column
// at which that error was encountered.
type LexerError struct {
	Err      error
	Row, Col int
}

// Error implements the error interface, rendering the error as
// "<row>:<col>: <message>".
func (e *LexerError) Error() string {
	// %v calls Err.Error() for a non-nil error, and prints "<nil>" rather than
	// panicking on a nil pointer dereference if Err was never populated.
	return fmt.Sprintf("%d:%d: %v", e.Row, e.Col, e.Err)
}

// Unwrap returns the underlying error, which allows errors.Is and errors.As to
// see through the LexerError.
func (e *LexerError) Unwrap() error {
	return e.Err
}

// LexerTokenKind identifies which category a LexerToken belongs to.
type LexerTokenKind string

// The set of LexerTokenKinds which a Lexer can produce.
const (
	LexerTokenKindName        = LexerTokenKind("name")
	LexerTokenKindNumber      = LexerTokenKind("number")
	LexerTokenKindPunctuation = LexerTokenKind("punctuation")
)

// LexerToken is a single lexical token produced by a Lexer. Tokens are the
// intermediate form used when deserializing Graphs.
type LexerToken struct {
	// Kind is the category of the token.
	Kind LexerTokenKind

	// Value is the text of the token; it is never the empty string.
	Value string

	// Row and Col give the position of the token's first rune.
	Row int
	Col int
}

// Lexer turns a stream of characters into a sequence of LexerTokens, yielding
// them one at a time for later stages to consume.
type Lexer interface {
	// Next yields the next LexerToken in the stream, or a LexerError if one
	// could not be produced. Once the underlying stream is exhausted the
	// returned LexerError wraps io.EOF.
	Next() (LexerToken, error)
}

// lexer is the Lexer implementation returned by NewLexer.
type lexer struct {
	// r is the buffered source of runes.
	r *bufio.Reader

	// stringBuilder is scratch space, reused to accumulate each token's Value.
	stringBuilder *strings.Builder

	// err holds the error which ended lexing, once one has been hit.
	err *LexerError

	// The remaining fields track the "cursor": the position of the most
	// recently consumed rune, and the rune itself.
	lastRow  int
	lastCol  int
	prevRune rune
}

// NewLexer returns a Lexer which reads the given io.Reader as a sequence of
// utf-8 characters and breaks it up into LexerTokens.
func NewLexer(r io.Reader) Lexer {
	l := new(lexer)

	l.r = bufio.NewReader(r)
	l.stringBuilder = new(strings.Builder)

	// No rune has been consumed yet, so the cursor sits just before the first
	// column of the first row.
	l.lastRow, l.lastCol = 0, -1

	return l
}

// nextRowCol computes the position (row, then column) which the next rune read
// from the stream will occupy.
func (l *lexer) nextRowCol() (int, int) {
	row, col := l.lastRow, l.lastCol+1

	// a newline having just been consumed means the next rune starts a new row.
	if l.prevRune == '\n' {
		row, col = l.lastRow+1, 0
	}

	return row, col
}

// fmtErr wraps the given error in a LexerError whose position is that of the
// next rune to be read from the stream.
func (l *lexer) fmtErr(err error) *LexerError {
	lexErr := new(LexerError)

	lexErr.Err = err
	lexErr.Row, lexErr.Col = l.nextRowCol()

	return lexErr
}

// fmtErrf is like fmtErr, except that the wrapped error is built from a format
// string and arguments in the manner of fmt.Errorf.
func (l *lexer) fmtErrf(str string, args ...interface{}) *LexerError {
	formatted := fmt.Errorf(str, args...)
	return l.fmtErr(formatted)
}

// discardRune consumes the next rune from the stream and advances the cursor
// past it. It must _only_ be called directly after a successful peekRune; a
// failure to read the rune results in a panic.
func (l *lexer) discardRune() {
	consumed, _, err := l.r.ReadRune()
	if err != nil {
		panic(err)
	}

	// the new position must be computed while prevRune still holds the rune
	// which came before the one just consumed.
	row, col := l.nextRowCol()

	l.lastRow = row
	l.lastCol = col
	l.prevRune = consumed
}

// peekRune returns the next rune in the stream without consuming it; the
// cursor position is left untouched.
//
// If the rune cannot be read then the zero rune is returned alongside the
// error from the underlying reader.
func (l *lexer) peekRune() (rune, error) {

	r, _, err := l.r.ReadRune()

	if err != nil {
		// return the zero rune, not the digit character '0', so that the
		// result can never be mistaken for real input.
		return 0, err
	}

	if err := l.r.UnreadRune(); err != nil {

		// since the most recent operation on the bufio.Reader was a ReadRune,
		// UnreadRune should never return an error
		panic(err)
	}

	return r, nil
}

// readWhile consumes runes for as long as the given predicate accepts them.
// The accepted runes, in order, make up the Value of the returned LexerToken,
// which is of the given kind and positioned at the first rune inspected.
//
// Should the stream fail part way through then the error is returned together
// with the token as it stands at that point.
func (l *lexer) readWhile(
	kind LexerTokenKind, pred func(rune) bool,
) (
	LexerToken, *LexerError,
) {

	tok := LexerToken{Kind: kind}
	tok.Row, tok.Col = l.nextRowCol()

	l.stringBuilder.Reset()

	var lexErr *LexerError

scan:
	for {

		r, err := l.peekRune()

		switch {

		case err != nil:
			lexErr = l.fmtErrf("peeking next character: %w", err)
			break scan

		case !pred(r):
			// the rejected rune is left in the stream for the caller.
			break scan
		}

		l.stringBuilder.WriteRune(r)
		l.discardRune()
	}

	tok.Value = l.stringBuilder.String()

	return tok, lexErr
}

// isNumber reports whether the rune may appear in a number token: an ASCII
// digit or a minus sign. Only base-10 integers are supported at the moment.
func isNumber(r rune) bool {
	switch {
	case r == '-':
		return true
	case r < '0', r > '9':
		return false
	default:
		return true
	}
}

// next scans forward through the stream, skipping over whitespace and
// comments, until a token can be produced or an error is hit.
//
// next can return a token, an error, or both. If an error is returned then no
// further calls to next should occur.
func (l *lexer) next() (LexerToken, *LexerError) {

	for {

		r, err := l.peekRune()

		if err != nil {
			return LexerToken{}, l.fmtErrf("peeking next character: %w", err)
		}

		switch {

		case r == '*': // comment

			// comments are everything up until a newline. The token built by
			// readWhile is thrown away, only its error is of interest.
			_, err := l.readWhile("", func(r rune) bool {
				return r != '\n'
			})

			if err != nil {
				return LexerToken{}, err
			}

			// terminating newline will be discarded on next loop

		case r == '"' || r == '`':

			// reserve double-quote and backtick for string parsing.
			//
			// The error is built before the quote is discarded so that its
			// position is that of the quote itself, in the same way that the
			// "unexpected character" error below points at the offending
			// character.
			lexErr := l.fmtErrf("string parsing not yet implemented")
			l.discardRune()

			return LexerToken{}, lexErr

		case unicode.IsLetter(r):
			// letters denote the start of a name. After the first letter a
			// name may also contain numbers, marks, and dashes.

			return l.readWhile(LexerTokenKindName, func(r rune) bool {

				if unicode.In(r, unicode.Letter, unicode.Number, unicode.Mark) {
					return true
				}

				if r == '-' {
					return true
				}

				return false
			})

		case isNumber(r):
			// this case must come before the punctuation case, as '-' would
			// otherwise be lexed as punctuation.
			return l.readWhile(LexerTokenKindNumber, isNumber)

		case unicode.IsPunct(r) || unicode.IsSymbol(r):
			// symbols are also considered punctuation. Each punctuation token
			// is exactly one rune long.

			l.discardRune()

			// having just been discarded, the rune's position is now the
			// cursor's position.
			return LexerToken{
				Kind:  LexerTokenKindPunctuation,
				Value: string(r),
				Row:   l.lastRow,
				Col:   l.lastCol,
			}, nil

		case unicode.IsSpace(r):
			// whitespace only separates tokens, skip over it.
			l.discardRune()

		default:
			return LexerToken{}, l.fmtErrf("unexpected character %q", r)
		}

	}
}

// Next implements the method for the Lexer interface.
//
// Once an error has been hit it is remembered, and returned by every call
// thereafter. If that error arrived together with a token then the token is
// returned first, and the error is held back until the following call.
func (l *lexer) Next() (LexerToken, error) {

	if l.err != nil {
		return LexerToken{}, l.err
	}

	tok, lexErr := l.next()

	if lexErr == nil {
		return tok, nil
	}

	l.err = lexErr

	// a token with a Kind was successfully read prior to the error, so it
	// gets handed to the caller before the error does.
	if tok.Kind != "" {
		return tok, nil
	}

	return LexerToken{}, lexErr
}
Implement lexer for new syntax 2021-12-26 21:49:43 +00:00			`package gg`

			`import (`
			`"bufio"`
			`"fmt"`
			`"io"`
			`"strings"`
			`"unicode"`
			`)`

			`// LexerError is returned by Lexer when an unexpected error occurs parsing a`
			`// stream of LexerTokens.`
			`type LexerError struct {`
			`Err error`
			`Row, Col int`
			`}`

			`func (e *LexerError) Error() string {`
Implement Decoder The decoder basically works, though there are some quirks in the design I'll need to marinate on. For example, you can't have a tuple as an edge value. This is probably fine? Stringification of Graphs was added to aid in debugging the decoder; the format it outputs is not the final one. Most likely the (future) encoder will be used for that purpose. The decoder is not implemented in the nicest way; it fully reads in the LexerTokens first, and then processes them. This made trying to wrap my head around the problem a lot easier because it left fewer failure cases, but it's not the most efficient thing to do. Now that v0 is done it's pretty plain to see that the decoder could work by only reading in the next N tokens that it needs at a time. But that will be left for a future version. 2021-12-26 23:23:41 +00:00			`return fmt.Sprintf("%d:%d: %s", e.Row, e.Col, e.Err.Error())`
Implement lexer for new syntax 2021-12-26 21:49:43 +00:00			`}`

			`func (e *LexerError) Unwrap() error {`
			`return e.Err`
			`}`

			`// LexerTokenKind enumerates the different kinds of LexerToken there can be.`
			`type LexerTokenKind string`

			`// Enumeration of LexerTokenKinds.`
			`const (`
			`LexerTokenKindName LexerTokenKind = "name"`
			`LexerTokenKindNumber LexerTokenKind = "number"`
			`LexerTokenKindPunctuation LexerTokenKind = "punctuation"`
			`)`

			`// LexerToken describes a lexigraphical token which is used when deserializing`
			`// Graphs.`
			`type LexerToken struct {`
			`Kind LexerTokenKind`
			`Value string // never empty string`

			`Row, Col int`
			`}`

			`// Lexer is used to parse a string stream into a sequence of tokens which can`
			`// then be parsed by a Parser.`
			`type Lexer interface {`

			`// Next will return a LexerToken or a LexerError. io.EOF (wrapped in a`
			`// LexerError) is returned if the stream being read from is finished.`
			`Next() (LexerToken, error)`
			`}`

			`type lexer struct {`
			`r *bufio.Reader`
			`stringBuilder *strings.Builder`
			`err *LexerError`

			`// these fields are only needed to keep track of the current "cursor"`
			`// position when reading.`
			`lastRow, lastCol int`
			`prevRune rune`
			`}`

			`// NewLexer wraps the io.Reader in a Lexer, which will read the io.Reader as a`
			`// sequence of utf-8 characters and parse it into a sequence of LexerTokens.`
			`func NewLexer(r io.Reader) Lexer {`
			`return &lexer{`
			`r: bufio.NewReader(r),`
			`lastRow: 0,`
			`lastCol: -1,`
			`stringBuilder: new(strings.Builder),`
			`}`
			`}`

			`// nextRowCol returns the row and column number which the next rune in the`
			`// stream would be at.`
			`func (l *lexer) nextRowCol() (int, int) {`

			`if l.prevRune == '\n' {`
			`return l.lastRow + 1, 0`
			`}`

			`return l.lastRow, l.lastCol + 1`
			`}`

			`func (l lexer) fmtErr(err error) LexerError {`

			`row, col := l.nextRowCol()`

			`return &LexerError{`
			`Err: err,`
			`Row: row,`
			`Col: col,`
			`}`
			`}`

			`func (l lexer) fmtErrf(str string, args ...interface{}) LexerError {`
			`return l.fmtErr(fmt.Errorf(str, args...))`
			`}`

			`// discardRune must _always_ be called only after peekRune.`
			`func (l *lexer) discardRune() {`

			`r, _, err := l.r.ReadRune()`

			`if err != nil {`
			`panic(err)`
			`}`

			`l.lastRow, l.lastCol = l.nextRowCol()`
			`l.prevRune = r`
			`}`

			`func (l *lexer) peekRune() (rune, error) {`

			`r, _, err := l.r.ReadRune()`

			`if err != nil {`
			`return '0', err`

			`} else if err := l.r.UnreadRune(); err != nil {`

			`// since the most recent operation on the bufio.Reader was a ReadRune,`
			`// UnreadRune should never return an error`
			`panic(err)`
			`}`

			`return r, nil`
			`}`

			`// readWhile reads runes until the given predicate returns false, and returns a`
			`// LexerToken of the given kind whose Value is comprised of all runes which`
			`// returned true.`
			`//`
			`// If an error is encountered then both the token (or what's been parsed of it`
			`// so far) and the error are returned.`
			`func (l *lexer) readWhile(`
			`kind LexerTokenKind, pred func(rune) bool,`
			`) (`
			`LexerToken, *LexerError,`
			`) {`

			`row, col := l.nextRowCol()`

			`l.stringBuilder.Reset()`

			`var lexErr *LexerError`

			`for {`

			`r, err := l.peekRune()`

			`if err != nil {`
			`lexErr = l.fmtErrf("peeking next character: %w", err)`
			`break`

			`} else if !pred(r) {`
			`break`
			`}`

			`l.stringBuilder.WriteRune(r)`

			`l.discardRune()`
			`}`

			`return LexerToken{`
			`Kind: kind,`
			`Value: l.stringBuilder.String(),`
			`Row: row, Col: col,`
			`}, lexErr`
			`}`

			`// we only support base-10 integers at the moment.`
			`func isNumber(r rune) bool {`
			`return r == '-' \|\| ('0' <= r && r <= '9')`
			`}`

			`// next can return a token, an error, or both. If an error is returned then no`
			`// further calls to next should occur.`
			`func (l lexer) next() (LexerToken, LexerError) {`

			`for {`

			`r, err := l.peekRune()`

			`if err != nil {`
			`return LexerToken{}, l.fmtErrf("peeking next character: %w", err)`
			`}`

			`switch {`

			`case r == '*': // comment`

			`// comments are everything up until a newline`
			`_, err := l.readWhile("", func(r rune) bool {`
			`return r != '\n'`
			`})`

			`if err != nil {`
			`return LexerToken{}, err`
			`}`

Don't treat newlines specially For MVP newlines aren't going to be used as a syntax terminator, they're just going to be whitespace. Otherwise the decoding logic gets way more complicated. 2021-12-27 21:19:56 +00:00			`// terminating newline will be discarded on next loop`
Implement lexer for new syntax 2021-12-26 21:49:43 +00:00
			case r == '"' \|\| r == '`':

			`// reserve double-quote and backtick for string parsing.`
			`l.discardRune()`
			`return LexerToken{}, l.fmtErrf("string parsing not yet implemented")`

			`case unicode.IsLetter(r):`
			`// letters denote the start of a name`

			`return l.readWhile(LexerTokenKindName, func(r rune) bool {`

			`if unicode.In(r, unicode.Letter, unicode.Number, unicode.Mark) {`
			`return true`
			`}`

			`if r == '-' {`
			`return true`
			`}`

			`return false`
			`})`

			`case isNumber(r):`
			`return l.readWhile(LexerTokenKindNumber, isNumber)`

			`case unicode.IsPunct(r) \|\| unicode.IsSymbol(r):`
			`// symbols are also considered punctuation`

			`l.discardRune()`

			`return LexerToken{`
			`Kind: LexerTokenKindPunctuation,`
			`Value: string(r),`
			`Row: l.lastRow,`
			`Col: l.lastCol,`
			`}, nil`

			`case unicode.IsSpace(r):`
			`l.discardRune()`

			`default:`
			`return LexerToken{}, l.fmtErrf("unexpected character %q", r)`
			`}`

			`}`
			`}`

			`func (l *lexer) Next() (LexerToken, error) {`

			`if l.err != nil {`
			`return LexerToken{}, l.err`
			`}`

			`tok, err := l.next()`

			`if err != nil {`

			`l.err = err`

			`if tok.Kind == "" {`
			`return LexerToken{}, l.err`
			`}`
			`}`

			`return tok, nil`
			`}`