ginger/gg/lexer.go

package gg

import (
	"bufio"
	"fmt"
	"io"
	"strings"
	"unicode"
)

// LexerLocation identifies the position in the input at which a particular
// token was parsed from, expressed as a row and a column.
type LexerLocation struct {
	Row int
	Col int
}

// String renders the location as "<row>:<col>".
func (l LexerLocation) String() string {
	row, col := l.Row, l.Col
	return fmt.Sprintf("%d:%d", row, col)
}

// LexerError is the error type returned by a Lexer when something goes wrong
// while parsing a stream of LexerTokens. It pairs the underlying error with the
// location in the stream at which it occurred.
type LexerError struct {
	Err error

	Location LexerLocation
}

// Error implements the error interface. The message is the location, then
// ": ", then the underlying error's message.
func (e *LexerError) Error() string {
	where, cause := e.Location.String(), e.Err.Error()
	return fmt.Sprintf("%s: %s", where, cause)
}

// Unwrap exposes the underlying error so that errors.Is and errors.As can see
// through the LexerError.
func (e *LexerError) Unwrap() error {
	return e.Err
}

// LexerTokenKind enumerates the different kinds of LexerToken there can be.
// It is a string type, so each kind's value doubles as a readable label.
type LexerTokenKind string

// Enumeration of LexerTokenKinds, as produced by the lexer in this file:
//
//   - LexerTokenKindName: a letter followed by any run of letters, numbers,
//     marks, or '-'.
//   - LexerTokenKindNumber: a run of ASCII digits and '-' characters.
//   - LexerTokenKindPunctuation: a single punctuation or symbol rune.
const (
	LexerTokenKindName        LexerTokenKind = "name"
	LexerTokenKindNumber      LexerTokenKind = "number"
	LexerTokenKindPunctuation LexerTokenKind = "punctuation"
)

// LexerToken is a single lexical token, the unit which is consumed when
// deserializing Graphs.
type LexerToken struct {
	Kind  LexerTokenKind
	Value string // never empty string

	// Location is the position of the token's first rune.
	Location LexerLocation
}

// errPrefix returns a prefix for error messages which relate to this token,
// made up of the token's location and its quoted value.
func (t LexerToken) errPrefix() string {
	where := t.Location.String()
	return fmt.Sprintf("%s: at %q", where, t.Value)
}

// Lexer is used to parse a string stream into a sequence of tokens which can
// then be parsed by a Parser.
type Lexer interface {

	// Next will return a LexerToken or a LexerError. io.EOF (wrapped in a
	// LexerError) is returned if the stream being read from is finished.
	//
	// Because LexerError implements Unwrap, callers can detect the end of the
	// stream with errors.Is(err, io.EOF). The implementation returned by
	// NewLexer remembers the first error it encounters and returns it from
	// every subsequent call.
	Next() (LexerToken, error)
}

// lexer is the Lexer implementation returned by NewLexer.
//
// r is the buffered source of runes. stringBuilder is scratch space which is
// reset and reused for each token to accumulate its Value. err holds the first
// error encountered, so that Next can keep returning it on later calls.
type lexer struct {
	r             *bufio.Reader
	stringBuilder *strings.Builder
	err           *LexerError

	// these fields are only needed to keep track of the current "cursor"
	// position when reading.
	//
	// lastRow and lastCol are the position of the most recently consumed rune,
	// and prevRune is that rune; nextRowCol uses prevRune to detect when the
	// cursor has moved past a newline.
	lastRow, lastCol int
	prevRune         rune
}

// NewLexer returns a Lexer which reads the given io.Reader as a sequence of
// utf-8 characters and parses it into a sequence of LexerTokens. The reader is
// wrapped in a bufio.Reader internally.
func NewLexer(r io.Reader) Lexer {
	lex := new(lexer)
	lex.r = bufio.NewReader(r)
	lex.stringBuilder = new(strings.Builder)

	// no rune has been consumed yet, so the "last" position sits one column
	// before the start of the first row; the first rune then lands on 0:0.
	lex.lastRow, lex.lastCol = 0, -1

	return lex
}

// nextRowCol reports the row and column at which the next rune in the stream
// would be located. It does not modify the lexer.
func (l *lexer) nextRowCol() (int, int) {
	// by default the next rune sits directly after the last one consumed.
	row, col := l.lastRow, l.lastCol+1

	// if the last rune consumed was a newline then the next rune starts a new
	// row instead.
	if l.prevRune == '\n' {
		row, col = l.lastRow+1, 0
	}

	return row, col
}

// fmtErr wraps the given error in a LexerError whose Location is the position
// of the next rune which would be read from the stream.
func (l *lexer) fmtErr(err error) *LexerError {
	var loc LexerLocation
	loc.Row, loc.Col = l.nextRowCol()

	return &LexerError{Err: err, Location: loc}
}

// fmtErrf is like fmtErr, but builds the wrapped error from a format string
// and arguments using fmt.Errorf (so %w may be used to wrap another error).
func (l *lexer) fmtErrf(str string, args ...interface{}) *LexerError {
	formatted := fmt.Errorf(str, args...)
	return l.fmtErr(formatted)
}

// discardRune consumes the next rune from the stream and advances the cursor
// position past it.
//
// discardRune must _always_ be called only after peekRune; it panics if the
// rune cannot be read.
func (l *lexer) discardRune() {
	consumed, _, err := l.r.ReadRune()
	if err != nil {
		panic(err)
	}

	// the new position has to be computed before prevRune is overwritten,
	// since nextRowCol consults prevRune to detect line breaks.
	row, col := l.nextRowCol()

	l.prevRune = consumed
	l.lastRow, l.lastCol = row, col
}

// peekRune returns the next rune in the stream without consuming it and
// without moving the cursor position.
//
// If the rune cannot be read then the zero rune and the read error (for
// example io.EOF) are returned.
func (l *lexer) peekRune() (rune, error) {
	r, _, err := l.r.ReadRune()
	if err != nil {
		// return the zero rune rather than a real character, so that a caller
		// which mistakenly ignores the error can't confuse it with input.
		return 0, err
	}

	// since the most recent operation on the bufio.Reader was a ReadRune,
	// UnreadRune should never return an error
	if err := l.r.UnreadRune(); err != nil {
		panic(err)
	}

	return r, nil
}

// readWhile consumes runes for as long as the given predicate returns true for
// them, and returns a LexerToken of the given kind whose Value is made up of
// all the consumed runes. The rune for which the predicate returns false is
// left unread. The token's Location is the position of its first rune.
//
// If an error is encountered then both the token (or what's been parsed of it
// so far) and the error are returned.
func (l *lexer) readWhile(
	kind LexerTokenKind, pred func(rune) bool,
) (
	LexerToken, *LexerError,
) {
	// the token starts wherever the next rune is, so capture that position
	// before anything gets consumed.
	tok := LexerToken{Kind: kind}
	tok.Location.Row, tok.Location.Col = l.nextRowCol()

	l.stringBuilder.Reset()

	for {
		next, peekErr := l.peekRune()

		if peekErr != nil {
			tok.Value = l.stringBuilder.String()
			return tok, l.fmtErrf("peeking next character: %w", peekErr)
		}

		if !pred(next) {
			tok.Value = l.stringBuilder.String()
			return tok, nil
		}

		l.stringBuilder.WriteRune(next)
		l.discardRune()
	}
}

// isNumber returns true if the rune may be part of a number token, i.e. it is
// a minus sign or an ASCII digit. Only base-10 integers are supported at the
// moment.
func isNumber(r rune) bool {
	if r == '-' {
		return true
	}

	return r >= '0' && r <= '9'
}

// next scans forward to the next token in the stream, skipping over any
// whitespace and comments which come before it.
//
// The kind of token is decided by its first rune. The order of the cases is
// significant, since '*', '"', '`' and '-' would all otherwise be matched by
// the punctuation case:
//
//   - '*' begins a comment, which runs up until the next newline and is
//     dropped.
//   - '"' and '`' are reserved for strings, and currently result in an error.
//   - A letter begins a name, made up of letters, numbers, marks, and '-'.
//   - An ASCII digit or '-' begins a number.
//   - Any other punctuation or symbol rune is a punctuation token of its own.
//   - Whitespace is skipped.
//   - Anything else results in an error.
//
// next can return a token, an error, or both (when the stream ends or fails
// part-way through a name or number). If an error is returned then no further
// calls to next should occur.
func (l *lexer) next() (LexerToken, *LexerError) {

	for {

		r, err := l.peekRune()

		if err != nil {
			return LexerToken{}, l.fmtErrf("peeking next character: %w", err)
		}

		switch {

		case r == '*': // comment

			// comments are everything up until a newline
			_, err := l.readWhile("", func(r rune) bool {
				return r != '\n'
			})

			if err != nil {
				return LexerToken{}, err
			}

			// terminating newline will be discarded on next loop

		case r == '"' || r == '`':

			// reserve double-quote and backtick for string parsing.
			//
			// The error is built _before_ the quote is discarded, so that its
			// location is that of the quote itself rather than of whatever
			// follows it. This matches the "unexpected character" case below.
			lexErr := l.fmtErrf("string parsing not yet implemented")

			l.discardRune()

			return LexerToken{}, lexErr

		case unicode.IsLetter(r):
			// letters denote the start of a name

			return l.readWhile(LexerTokenKindName, func(r rune) bool {
				return r == '-' ||
					unicode.In(r, unicode.Letter, unicode.Number, unicode.Mark)
			})

		case isNumber(r):
			return l.readWhile(LexerTokenKindNumber, isNumber)

		case unicode.IsPunct(r) || unicode.IsSymbol(r):
			// symbols are also considered punctuation

			l.discardRune()

			// the rune has just been consumed, so the "last" position is the
			// position of the token.
			return LexerToken{
				Kind:  LexerTokenKindPunctuation,
				Value: string(r),
				Location: LexerLocation{
					Row: l.lastRow,
					Col: l.lastCol,
				},
			}, nil

		case unicode.IsSpace(r):
			l.discardRune()

		default:
			return LexerToken{}, l.fmtErrf("unexpected character %q", r)
		}

	}
}

// Next implements the Lexer interface. It returns the next token in the
// stream, or an error if there isn't one.
//
// The first error encountered is remembered and returned by every later call.
// If a token was parsed successfully before the error occurred then that token
// is returned first, and the error is held back until the following call.
func (l *lexer) Next() (LexerToken, error) {

	if l.err == nil {

		tok, err := l.next()

		if err == nil {
			return tok, nil
		}

		l.err = err

		// a non-empty Kind means a complete token was read prior to the error,
		// so hand the token over now and report the error next time.
		if tok.Kind != "" {
			return tok, nil
		}
	}

	return LexerToken{}, l.err
}
Implement lexer for new syntax 2021-12-26 21:49:43 +00:00			`package gg`

			`import (`
			`"bufio"`
			`"fmt"`
			`"io"`
			`"strings"`
			`"unicode"`
			`)`

Improve semantics of tokens and values obtained from them. Now gg.Values can carry the token used to parse them, which will be useful later when generating errors. 2021-12-28 16:49:02 +00:00			`// LexerLocation describes the location in a file where a particular token was`
			`// parsed from.`
			`type LexerLocation struct {`
			`Row, Col int`
			`}`

			`func (l LexerLocation) String() string {`
			`return fmt.Sprintf("%d:%d", l.Row, l.Col)`
			`}`

Implement lexer for new syntax 2021-12-26 21:49:43 +00:00			`// LexerError is returned by Lexer when an unexpected error occurs parsing a`
			`// stream of LexerTokens.`
			`type LexerError struct {`
Improve semantics of tokens and values obtained from them. Now gg.Values can carry the token used to parse them, which will be useful later when generating errors. 2021-12-28 16:49:02 +00:00			`Err error`

			`Location LexerLocation`
Implement lexer for new syntax 2021-12-26 21:49:43 +00:00			`}`

			`func (e *LexerError) Error() string {`
Improve semantics of tokens and values obtained from them. Now gg.Values can carry the token used to parse them, which will be useful later when generating errors. 2021-12-28 16:49:02 +00:00			`return fmt.Sprintf("%s: %s", e.Location.String(), e.Err.Error())`
Implement lexer for new syntax 2021-12-26 21:49:43 +00:00			`}`

			`func (e *LexerError) Unwrap() error {`
			`return e.Err`
			`}`

			`// LexerTokenKind enumerates the different kinds of LexerToken there can be.`
			`type LexerTokenKind string`

			`// Enumeration of LexerTokenKinds.`
			`const (`
			`LexerTokenKindName LexerTokenKind = "name"`
			`LexerTokenKindNumber LexerTokenKind = "number"`
			`LexerTokenKindPunctuation LexerTokenKind = "punctuation"`
			`)`

			`// LexerToken describes a lexigraphical token which is used when deserializing`
			`// Graphs.`
			`type LexerToken struct {`
			`Kind LexerTokenKind`
			`Value string // never empty string`

Improve semantics of tokens and values obtained from them. Now gg.Values can carry the token used to parse them, which will be useful later when generating errors. 2021-12-28 16:49:02 +00:00			`Location LexerLocation`
			`}`

			`func (t LexerToken) errPrefix() string {`
			`return fmt.Sprintf("%s: at %q", t.Location.String(), t.Value)`
Implement lexer for new syntax 2021-12-26 21:49:43 +00:00			`}`

			`// Lexer is used to parse a string stream into a sequence of tokens which can`
			`// then be parsed by a Parser.`
			`type Lexer interface {`

			`// Next will return a LexerToken or a LexerError. io.EOF (wrapped in a`
			`// LexerError) is returned if the stream being read from is finished.`
			`Next() (LexerToken, error)`
			`}`

			`type lexer struct {`
			`r *bufio.Reader`
			`stringBuilder *strings.Builder`
			`err *LexerError`

			`// these fields are only needed to keep track of the current "cursor"`
			`// position when reading.`
			`lastRow, lastCol int`
			`prevRune rune`
			`}`

			`// NewLexer wraps the io.Reader in a Lexer, which will read the io.Reader as a`
			`// sequence of utf-8 characters and parse it into a sequence of LexerTokens.`
			`func NewLexer(r io.Reader) Lexer {`
			`return &lexer{`
			`r: bufio.NewReader(r),`
			`lastRow: 0,`
			`lastCol: -1,`
			`stringBuilder: new(strings.Builder),`
			`}`
			`}`

			`// nextRowCol returns the row and column number which the next rune in the`
			`// stream would be at.`
			`func (l *lexer) nextRowCol() (int, int) {`

			`if l.prevRune == '\n' {`
			`return l.lastRow + 1, 0`
			`}`

			`return l.lastRow, l.lastCol + 1`
			`}`

			`func (l lexer) fmtErr(err error) LexerError {`

			`row, col := l.nextRowCol()`

			`return &LexerError{`
			`Err: err,`
Improve semantics of tokens and values obtained from them. Now gg.Values can carry the token used to parse them, which will be useful later when generating errors. 2021-12-28 16:49:02 +00:00			`Location: LexerLocation{`
			`Row: row,`
			`Col: col,`
			`},`
Implement lexer for new syntax 2021-12-26 21:49:43 +00:00			`}`
			`}`

			`func (l lexer) fmtErrf(str string, args ...interface{}) LexerError {`
			`return l.fmtErr(fmt.Errorf(str, args...))`
			`}`

			`// discardRune must _always_ be called only after peekRune.`
			`func (l *lexer) discardRune() {`

			`r, _, err := l.r.ReadRune()`

			`if err != nil {`
			`panic(err)`
			`}`

			`l.lastRow, l.lastCol = l.nextRowCol()`
			`l.prevRune = r`
			`}`

			`func (l *lexer) peekRune() (rune, error) {`

			`r, _, err := l.r.ReadRune()`

			`if err != nil {`
			`return '0', err`

			`} else if err := l.r.UnreadRune(); err != nil {`

			`// since the most recent operation on the bufio.Reader was a ReadRune,`
			`// UnreadRune should never return an error`
			`panic(err)`
			`}`

			`return r, nil`
			`}`

			`// readWhile reads runes until the given predicate returns false, and returns a`
			`// LexerToken of the given kind whose Value is comprised of all runes which`
			`// returned true.`
			`//`
			`// If an error is encountered then both the token (or what's been parsed of it`
			`// so far) and the error are returned.`
			`func (l *lexer) readWhile(`
			`kind LexerTokenKind, pred func(rune) bool,`
			`) (`
			`LexerToken, *LexerError,`
			`) {`

			`row, col := l.nextRowCol()`

			`l.stringBuilder.Reset()`

			`var lexErr *LexerError`

			`for {`

			`r, err := l.peekRune()`

			`if err != nil {`
			`lexErr = l.fmtErrf("peeking next character: %w", err)`
			`break`

			`} else if !pred(r) {`
			`break`
			`}`

			`l.stringBuilder.WriteRune(r)`

			`l.discardRune()`
			`}`

			`return LexerToken{`
			`Kind: kind,`
			`Value: l.stringBuilder.String(),`
Improve semantics of tokens and values obtained from them. Now gg.Values can carry the token used to parse them, which will be useful later when generating errors. 2021-12-28 16:49:02 +00:00			`Location: LexerLocation{`
			`Row: row, Col: col,`
			`},`
Implement lexer for new syntax 2021-12-26 21:49:43 +00:00			`}, lexErr`
			`}`

			`// we only support base-10 integers at the moment.`
			`func isNumber(r rune) bool {`
			`return r == '-' \|\| ('0' <= r && r <= '9')`
			`}`

			`// next can return a token, an error, or both. If an error is returned then no`
			`// further calls to next should occur.`
			`func (l lexer) next() (LexerToken, LexerError) {`

			`for {`

			`r, err := l.peekRune()`

			`if err != nil {`
			`return LexerToken{}, l.fmtErrf("peeking next character: %w", err)`
			`}`

			`switch {`

			`case r == '*': // comment`

			`// comments are everything up until a newline`
			`_, err := l.readWhile("", func(r rune) bool {`
			`return r != '\n'`
			`})`

			`if err != nil {`
			`return LexerToken{}, err`
			`}`

Don't treat newlines specially For MVP newlines aren't going to be used as a syntax terminator, they're just going to be whitespace. Otherwise the decoding logic gets way more complicated. 2021-12-27 21:19:56 +00:00			`// terminating newline will be discarded on next loop`
Implement lexer for new syntax 2021-12-26 21:49:43 +00:00
			case r == '"' \|\| r == '`':

			`// reserve double-quote and backtick for string parsing.`
			`l.discardRune()`
			`return LexerToken{}, l.fmtErrf("string parsing not yet implemented")`

			`case unicode.IsLetter(r):`
			`// letters denote the start of a name`

			`return l.readWhile(LexerTokenKindName, func(r rune) bool {`

			`if unicode.In(r, unicode.Letter, unicode.Number, unicode.Mark) {`
			`return true`
			`}`

			`if r == '-' {`
			`return true`
			`}`

			`return false`
			`})`

			`case isNumber(r):`
			`return l.readWhile(LexerTokenKindNumber, isNumber)`

			`case unicode.IsPunct(r) \|\| unicode.IsSymbol(r):`
			`// symbols are also considered punctuation`

			`l.discardRune()`

			`return LexerToken{`
			`Kind: LexerTokenKindPunctuation,`
			`Value: string(r),`
Improve semantics of tokens and values obtained from them. Now gg.Values can carry the token used to parse them, which will be useful later when generating errors. 2021-12-28 16:49:02 +00:00			`Location: LexerLocation{`
			`Row: l.lastRow,`
			`Col: l.lastCol,`
			`},`
Implement lexer for new syntax 2021-12-26 21:49:43 +00:00			`}, nil`

			`case unicode.IsSpace(r):`
			`l.discardRune()`

			`default:`
			`return LexerToken{}, l.fmtErrf("unexpected character %q", r)`
			`}`

			`}`
			`}`

			`func (l *lexer) Next() (LexerToken, error) {`

			`if l.err != nil {`
			`return LexerToken{}, l.err`
			`}`

			`tok, err := l.next()`

			`if err != nil {`

			`l.err = err`

			`if tok.Kind == "" {`
			`return LexerToken{}, l.err`
			`}`
			`}`

			`return tok, nil`
			`}`