diff --git a/gg/lexer.go b/gg/lexer.go
new file mode 100644
index 0000000..dea3f8e
--- /dev/null
+++ b/gg/lexer.go
@@ -0,0 +1,284 @@
+package gg
+
+import (
+	"bufio"
+	"fmt"
+	"io"
+	"strings"
+	"unicode"
+)
+
+// LexerError is returned by Lexer when an unexpected error occurs parsing a
+// stream of LexerTokens.
+type LexerError struct {
+	Err error
+	Row, Col int
+}
+
+func (e *LexerError) Error() string {
+	return fmt.Sprintf("%d: %d: %s", e.Row, e.Col, e.Err.Error())
+}
+
+func (e *LexerError) Unwrap() error {
+	return e.Err
+}
+
+// LexerTokenKind enumerates the different kinds of LexerToken there can be.
+type LexerTokenKind string
+
+// Enumeration of LexerTokenKinds.
+const (
+	LexerTokenKindName        LexerTokenKind = "name"
+	LexerTokenKindNumber      LexerTokenKind = "number"
+	LexerTokenKindPunctuation LexerTokenKind = "punctuation"
+)
+
+// LexerToken describes a lexical token which is used when deserializing
+// Graphs.
+type LexerToken struct {
+	Kind  LexerTokenKind
+	Value string // never empty string
+
+	Row, Col int
+}
+
+// Lexer is used to parse a string stream into a sequence of tokens which can
+// then be parsed by a Parser.
+type Lexer interface {
+
+	// Next will return a LexerToken or a LexerError. io.EOF (wrapped in a
+	// LexerError) is returned if the stream being read from is finished.
+	Next() (LexerToken, error)
+}
+
+type lexer struct {
+	r             *bufio.Reader
+	stringBuilder *strings.Builder
+	err           *LexerError
+
+	// these fields are only needed to keep track of the current "cursor"
+	// position when reading.
+	lastRow, lastCol int
+	prevRune         rune
+}
+
+// NewLexer wraps the io.Reader in a Lexer, which will read the io.Reader as a
+// sequence of UTF-8 characters and parse it into a sequence of LexerTokens.
+func NewLexer(r io.Reader) Lexer {
+	return &lexer{
+		r:             bufio.NewReader(r),
+		lastRow:       0,
+		lastCol:       -1,
+		stringBuilder: new(strings.Builder),
+	}
+}
+
+// nextRowCol returns the row and column number which the next rune in the
+// stream would be at.
+func (l *lexer) nextRowCol() (int, int) {
+
+	if l.prevRune == '\n' {
+		return l.lastRow + 1, 0
+	}
+
+	return l.lastRow, l.lastCol + 1
+}
+
+func (l *lexer) fmtErr(err error) *LexerError {
+
+	row, col := l.nextRowCol()
+
+	return &LexerError{
+		Err: err,
+		Row: row,
+		Col: col,
+	}
+}
+
+func (l *lexer) fmtErrf(str string, args ...interface{}) *LexerError {
+	return l.fmtErr(fmt.Errorf(str, args...))
+}
+
+// discardRune must _always_ be called only after a successful peekRune.
+func (l *lexer) discardRune() {
+
+	r, _, err := l.r.ReadRune()
+
+	if err != nil {
+		panic(err)
+	}
+
+	l.lastRow, l.lastCol = l.nextRowCol()
+	l.prevRune = r
+}
+
+func (l *lexer) peekRune() (rune, error) {
+
+	r, _, err := l.r.ReadRune()
+
+	if err != nil {
+		return 0, err
+
+	} else if err := l.r.UnreadRune(); err != nil {
+
+		// since the most recent operation on the bufio.Reader was a ReadRune,
+		// UnreadRune should never return an error
+		panic(err)
+	}
+
+	return r, nil
+}
+
+// readWhile reads runes until the given predicate returns false, and returns a
+// LexerToken of the given kind whose Value is comprised of all runes which
+// returned true.
+//
+// If an error is encountered then both the token (or what's been parsed of it
+// so far) and the error are returned.
+func (l *lexer) readWhile(
+	kind LexerTokenKind, pred func(rune) bool,
+) (
+	LexerToken, *LexerError,
+) {
+
+	row, col := l.nextRowCol()
+
+	l.stringBuilder.Reset()
+
+	var lexErr *LexerError
+
+	for {
+
+		r, err := l.peekRune()
+
+		if err != nil {
+			lexErr = l.fmtErrf("peeking next character: %w", err)
+			break
+
+		} else if !pred(r) {
+			break
+		}
+
+		l.stringBuilder.WriteRune(r)
+
+		l.discardRune()
+	}
+
+	return LexerToken{
+		Kind:  kind,
+		Value: l.stringBuilder.String(),
+		Row:   row, Col: col,
+	}, lexErr
+}
+
+// we only support base-10 integers at the moment.
+func isNumber(r rune) bool {
+	return r == '-' || ('0' <= r && r <= '9')
+}
+
+// next can return a token, an error, or both. If an error is returned then no
+// further calls to next should occur.
+func (l *lexer) next() (LexerToken, *LexerError) {
+
+	for {
+
+		r, err := l.peekRune()
+
+		if err != nil {
+			return LexerToken{}, l.fmtErrf("peeking next character: %w", err)
+		}
+
+		switch {
+
+		case r == '*': // comment
+
+			// comments are everything up until a newline
+			_, err := l.readWhile("", func(r rune) bool {
+				return r != '\n'
+			})
+
+			if err != nil {
+				return LexerToken{}, err
+			}
+
+			// terminating newline is deliberately not discarded. Loop and find
+			// the next token (which will be that newline).
+
+		case r == '\n':
+			// newlines are considered punctuation, not whitespace
+
+			l.discardRune()
+
+			return LexerToken{
+				Kind:  LexerTokenKindPunctuation,
+				Value: string(r),
+				Row:   l.lastRow,
+				Col:   l.lastCol,
+			}, nil
+
+		case r == '"' || r == '`':
+
+			// reserve double-quote and backtick for string parsing.
+			l.discardRune()
+			return LexerToken{}, l.fmtErrf("string parsing not yet implemented")
+
+		case unicode.IsLetter(r):
+			// letters denote the start of a name
+
+			return l.readWhile(LexerTokenKindName, func(r rune) bool {
+
+				if unicode.In(r, unicode.Letter, unicode.Number, unicode.Mark) {
+					return true
+				}
+
+				if r == '-' {
+					return true
+				}
+
+				return false
+			})
+
+		case isNumber(r):
+			return l.readWhile(LexerTokenKindNumber, isNumber)
+
+		case unicode.IsPunct(r) || unicode.IsSymbol(r):
+			// symbols are also considered punctuation
+
+			l.discardRune()
+
+			return LexerToken{
+				Kind:  LexerTokenKindPunctuation,
+				Value: string(r),
+				Row:   l.lastRow,
+				Col:   l.lastCol,
+			}, nil
+
+		case unicode.IsSpace(r):
+			l.discardRune()
+
+		default:
+			return LexerToken{}, l.fmtErrf("unexpected character %q", r)
+		}
+
+	}
+}
+
+func (l *lexer) Next() (LexerToken, error) {
+
+	if l.err != nil {
+		return LexerToken{}, l.err
+	}
+
+	tok, err := l.next()
+
+	if err != nil {
+
+		l.err = err
+
+		if tok.Kind == "" {
+			return LexerToken{}, l.err
+		}
+	}
+
+	return tok, nil
+}
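
The calling convention for the new Lexer is worth spelling out: Next is called in a loop, and a LexerError wrapping io.EOF is the normal terminator rather than an exceptional case. Below is a minimal sketch of the intended consumption loop, assuming it sits in package gg; the readAllTokens helper is hypothetical and not part of this diff.

package gg

import (
	"errors"
	"io"
)

// readAllTokens is a hypothetical helper (not in this diff) showing the
// intended calling convention for Lexer: keep calling Next until the
// returned error wraps io.EOF; any other error is a real failure.
func readAllTokens(l Lexer) ([]LexerToken, error) {

	var toks []LexerToken

	for {
		tok, err := l.Next()

		if errors.Is(err, io.EOF) {
			// the lexer reached the end of its input cleanly.
			return toks, nil

		} else if err != nil {
			return nil, err
		}

		toks = append(toks, tok)
	}
}

Note that errors.Is sees through LexerError because of the Unwrap method above, so callers never need to inspect Row/Col just to detect end-of-stream.
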
diff --git a/gg/lexer_test.go b/gg/lexer_test.go
new file mode 100644
index 0000000..19731f4
--- /dev/null
+++ b/gg/lexer_test.go
@@ -0,0 +1,207 @@
+package gg
+
+import (
+	"errors"
+	"strconv"
+	"strings"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+)
+
+type mockReader struct {
+	body []byte
+	err  error
+}
+
+func (r *mockReader) Read(b []byte) (int, error) {
+
+	n := copy(b, r.body)
+	r.body = r.body[n:]
+
+	if len(r.body) == 0 {
+		return n, r.err
+	}
+
+	return n, nil
+}
+
+func TestLexer(t *testing.T) {
+
+	expErr := errors.New("eof")
+
+	tests := []struct {
+		in  string
+		exp []LexerToken
+	}{
+		{in: "", exp: []LexerToken{}},
+		{in: "* fooo", exp: []LexerToken{}},
+		{
+			in: "*\n",
+			exp: []LexerToken{
+				{
+					Kind:  LexerTokenKindPunctuation,
+					Value: "\n",
+					Row:   0, Col: 1,
+				},
+			},
+		},
+		{
+			in: "foo\nbar\n\n",
+			exp: []LexerToken{
+				{
+					Kind:  LexerTokenKindName,
+					Value: "foo",
+					Row:   0, Col: 0,
+				},
+				{
+					Kind:  LexerTokenKindPunctuation,
+					Value: "\n",
+					Row:   0, Col: 3,
+				},
+				{
+					Kind:  LexerTokenKindName,
+					Value: "bar",
+					Row:   1, Col: 0,
+				},
+				{
+					Kind:  LexerTokenKindPunctuation,
+					Value: "\n",
+					Row:   1, Col: 3,
+				},
+				{
+					Kind:  LexerTokenKindPunctuation,
+					Value: "\n",
+					Row:   2, Col: 0,
+				},
+			},
+		},
+		{
+			in: "foo",
+			exp: []LexerToken{
+				{
+					Kind:  LexerTokenKindName,
+					Value: "foo",
+					Row:   0, Col: 0,
+				},
+			},
+		},
+		{
+			in: "foo bar f-o f0O Foo",
+			exp: []LexerToken{
+				{
+					Kind:  LexerTokenKindName,
+					Value: "foo",
+					Row:   0, Col: 0,
+				},
+				{
+					Kind:  LexerTokenKindName,
+					Value: "bar",
+					Row:   0, Col: 4,
+				},
+				{
+					Kind:  LexerTokenKindName,
+					Value: "f-o",
+					Row:   0, Col: 8,
+				},
+				{
+					Kind:  LexerTokenKindName,
+					Value: "f0O",
+					Row:   0, Col: 12,
+				},
+				{
+					Kind:  LexerTokenKindName,
+					Value: "Foo",
+					Row:   0, Col: 16,
+				},
+			},
+		},
+		{
+			in: "1 100 -100",
+			exp: []LexerToken{
+				{
+					Kind:  LexerTokenKindNumber,
+					Value: "1",
+					Row:   0, Col: 0,
+				},
+				{
+					Kind:  LexerTokenKindNumber,
+					Value: "100",
+					Row:   0, Col: 2,
+				},
+				{
+					Kind:  LexerTokenKindNumber,
+					Value: "-100",
+					Row:   0, Col: 6,
+				},
+			},
+		},
+		{
+			in: "1<2!-3 ()",
+			exp: []LexerToken{
+				{
+					Kind:  LexerTokenKindNumber,
+					Value: "1",
+					Row:   0, Col: 0,
+				},
+				{
+					Kind:  LexerTokenKindPunctuation,
+					Value: "<",
+					Row:   0, Col: 1,
+				},
+				{
+					Kind:  LexerTokenKindNumber,
+					Value: "2",
+					Row:   0, Col: 2,
+				},
+				{
+					Kind:  LexerTokenKindPunctuation,
+					Value: "!",
+					Row:   0, Col: 3,
+				},
+				{
+					Kind:  LexerTokenKindNumber,
+					Value: "-3",
+					Row:   0, Col: 4,
+				},
+				{
+					Kind:  LexerTokenKindPunctuation,
+					Value: "(",
+					Row:   0, Col: 7,
+				},
+				{
+					Kind:  LexerTokenKindPunctuation,
+					Value: ")",
+					Row:   0, Col: 8,
+				},
+			},
+		},
+	}
+
+	for i, test := range tests {
+		t.Run(strconv.Itoa(i), func(t *testing.T) {
+
+			lexer := NewLexer(&mockReader{body: []byte(test.in), err: expErr})
+
+			for i := range test.exp {
+				tok, err := lexer.Next()
+				assert.NoError(t, err)
+				assert.Equal(t, test.exp[i], tok, "test.exp[%d]", i)
+			}
+
+			tok, err := lexer.Next()
+			assert.ErrorIs(t, err, expErr)
+			assert.Equal(t, LexerToken{}, tok)
+
+			lexErr := new(LexerError)
+			assert.True(t, errors.As(err, &lexErr))
+
+			inParts := strings.Split(test.in, "\n")
+
+			assert.ErrorIs(t, lexErr, expErr)
+			assert.Equal(t, len(inParts)-1, lexErr.Row)
+			assert.Equal(t, len(inParts[len(inParts)-1]), lexErr.Col)
+		})
+	}
+
+}
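
One subtlety the table above only exercises indirectly: a comment's body is skipped entirely, but its terminating newline is still emitted as a punctuation token positioned just after the comment text, which is why the "*\n" case expects the newline at row 0, col 1. The standalone test below is a sketch pinning that down; the test name is invented and it is not part of this diff.

package gg

import (
	"strings"
	"testing"

	"github.com/stretchr/testify/assert"
)

// TestLexerCommentNewline is a hypothetical extra test (not in this diff).
// The comment "* a comment" occupies cols 0 through 10 of row 0, so its
// terminating newline lands at col 11, and the following name starts row 1.
func TestLexerCommentNewline(t *testing.T) {

	lexer := NewLexer(strings.NewReader("* a comment\nfoo\n"))

	tok, err := lexer.Next()
	assert.NoError(t, err)
	assert.Equal(t, LexerToken{
		Kind:  LexerTokenKindPunctuation,
		Value: "\n",
		Row:   0, Col: 11,
	}, tok)

	tok, err = lexer.Next()
	assert.NoError(t, err)
	assert.Equal(t, LexerToken{
		Kind:  LexerTokenKindName,
		Value: "foo",
		Row:   1, Col: 0,
	}, tok)
}
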
diff --git a/lexer/lexer.go b/lexer/lexer.go
deleted file mode 100644
index ffc10ae..0000000
--- a/lexer/lexer.go
+++ /dev/null
@@ -1,349 +0,0 @@
-package lexer
-
-import (
-	"bufio"
-	"bytes"
-	"errors"
-	"fmt"
-	"io"
-	"strings"
-)
-
-// TokenType indicates the type of a token
-type TokenType string
-
-// Different token types
-const (
-	Identifier TokenType = "identifier"
-
-	// Punctuation are tokens which connect two other tokens
-	Punctuation TokenType = "punctuation"
-
-	// Wrapper wraps one or more tokens
-	Wrapper TokenType = "wrapper"
-	String  TokenType = "string"
-	Err     TokenType = "err"
-	EOF     TokenType = "eof"
-)
-
-// Token is a single token which has been read in. All Tokens have a non-empty
-// Val
-type Token struct {
-	TokenType
-	Val      string
-	Row, Col int
-}
-
-// Equal returns whether two tokens are of equal type and value
-func (tok Token) Equal(tok2 Token) bool {
-	return tok.TokenType == tok2.TokenType && tok.Val == tok2.Val
-}
-
-// Err returns the error contained by the token, if any. Only returns non-nil if
-// TokenType is Err or EOF
-func (tok Token) Err() error {
-	if tok.TokenType == Err || tok.TokenType == EOF {
-		return fmt.Errorf("[line:%d col:%d] %s", tok.Row, tok.Col, tok.Val)
-	}
-	return nil
-}
-
-func (tok Token) String() string {
-	var typ string
-	switch tok.TokenType {
-	case Identifier:
-		typ = "ident"
-	case Punctuation:
-		typ = "punct"
-	case String:
-		typ = "str"
-	case Err, EOF:
-		typ = "err"
-	}
-	return fmt.Sprintf("%s(%q)", typ, tok.Val)
-}
-
-type lexerFn func(*Lexer) lexerFn
-
-// Lexer is used to read in ginger tokens from a source. HasNext() must be
-// called before every call to Next()
-type Lexer struct {
-	in  *bufio.Reader
-	out *bytes.Buffer
-	cur lexerFn
-
-	next []Token
-
-	row, col       int
-	absRow, absCol int
-}
-
-// New returns a Lexer which will read tokens from the given source.
-func New(r io.Reader) *Lexer {
-	return &Lexer{
-		in:  bufio.NewReader(r),
-		out: new(bytes.Buffer),
-		cur: lex,
-
-		row: -1,
-		col: -1,
-	}
-}
-
-func (l *Lexer) emit(t TokenType) {
-	str := l.out.String()
-	if str == "" {
-		panic("cannot emit empty token")
-	}
-	l.out.Reset()
-
-	l.emitTok(Token{
-		TokenType: t,
-		Val:       str,
-		Row:       l.row,
-		Col:       l.col,
-	})
-}
-
-func (l *Lexer) emitErr(err error) {
-	tok := Token{
-		TokenType: Err,
-		Val:       err.Error(),
-		Row:       l.absRow,
-		Col:       l.absCol,
-	}
-	if errors.Is(err, io.EOF) {
-		tok.TokenType = EOF
-	}
-	l.emitTok(tok)
-}
-
-func (l *Lexer) emitTok(tok Token) {
-	l.next = append(l.next, tok)
-	l.row = -1
-	l.col = -1
-}
-
-func (l *Lexer) readRune() (rune, error) {
-	r, _, err := l.in.ReadRune()
-	if err != nil {
-		return r, err
-	}
-
-	if r == '\n' {
-		l.absRow++
-		l.absCol = 0
-	} else {
-		l.absCol++
-	}
-
-	return r, err
-}
-
-func (l *Lexer) peekRune() (rune, error) {
-	r, _, err := l.in.ReadRune()
-	if err != nil {
-		return r, err
-	}
-
-	if err := l.in.UnreadRune(); err != nil {
-		return r, err
-	}
-	return r, nil
-}
-
-func (l *Lexer) readAndPeek() (rune, rune, error) {
-	r, err := l.readRune()
-	if err != nil {
-		return r, 0, err
-	}
-
-	n, err := l.peekRune()
-	return r, n, err
-}
-
-func (l *Lexer) bufferRune(r rune) {
-	l.out.WriteRune(r)
-	if l.row < 0 && l.col < 0 {
-		l.row, l.col = l.absRow, l.absCol
-	}
-}
-
-// HasNext returns true if Next should be called, and false if it should not be
-// called and Err should be called instead. When HasNext returns false the Lexer
-// is considered to be done
-func (l *Lexer) HasNext() bool {
-	for {
-		if len(l.next) > 0 {
-			return true
-		} else if l.cur == nil {
-			return false
-		}
-		l.cur = l.cur(l)
-	}
-}
-
-// Next returns the next available token. HasNext must be called before every
-// call to Next
-func (l *Lexer) Next() Token {
-	t := l.next[0]
-	l.next = l.next[1:]
-	if len(l.next) == 0 {
-		l.next = nil
-	}
-	return t
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// the actual fsm
-
-var whitespaceSet = " \n\r\t\v\f"
-var punctuationSet = ",>"
-var wrapperSet = "{}()"
-var identifierSepSet = whitespaceSet + punctuationSet + wrapperSet
-
-func lex(l *Lexer) lexerFn {
-	r, err := l.readRune()
-	if err != nil {
-		l.emitErr(err)
-		return nil
-	}
-
-	// handle comments first, cause we have to peek for those. We ignore errors,
-	// and assume that any error that would happen here will happen again the
-	// next read
-	if n, _ := l.peekRune(); r == '/' && n == '/' {
-		return lexLineComment
-	} else if r == '/' && n == '*' {
-		return lexBlockComment
-	}
-
-	return lexSingleRune(l, r)
-}
-
-func lexSingleRune(l *Lexer, r rune) lexerFn {
-	switch {
-	case strings.ContainsRune(whitespaceSet, r):
-		return lex
-	case strings.ContainsRune(punctuationSet, r):
-		l.bufferRune(r)
-		l.emit(Punctuation)
-		return lex
-	case strings.ContainsRune(wrapperSet, r):
-		l.bufferRune(r)
-		l.emit(Wrapper)
-		return lex
-	case r == '"' || r == '\'' || r == '`':
-		canEscape := r != '`'
-		return lexStrStart(l, r, makeLexStr(r, canEscape))
-	default:
-		l.bufferRune(r)
-		return lexIdentifier
-	}
-}
-
-func lexIdentifier(l *Lexer) lexerFn {
-	r, err := l.readRune()
-	if err != nil {
-		l.emit(Identifier)
-		l.emitErr(err)
-		return nil
-	}
-
-	if strings.ContainsRune(identifierSepSet, r) {
-		l.emit(Identifier)
-		return lexSingleRune(l, r)
-	}
-
-	l.bufferRune(r)
-
-	return lexIdentifier
-}
-
-func lexLineComment(l *Lexer) lexerFn {
-	r, err := l.readRune()
-	if err != nil {
-		l.emitErr(err)
-		return nil
-	}
-	if r == '\n' {
-		return lex
-	}
-	return lexLineComment
-}
-
-// assumes the starting / has been read already
-func lexBlockComment(l *Lexer) lexerFn {
-	depth := 1
-
-	var recurse lexerFn
-	recurse = func(l *Lexer) lexerFn {
-		r, err := l.readRune()
-		if err != nil {
-			l.emitErr(err)
-			return nil
-		}
-		n, _ := l.peekRune()
-
-		if r == '/' && n == '*' {
-			depth++
-		} else if r == '*' && n == '/' {
-			depth--
-		}
-
-		if depth == 0 {
-			return lexSkipThen(lex)
-		}
-		return recurse
-	}
-	return recurse
-}
-
-func lexStrStart(lexer *Lexer, r rune, then lexerFn) lexerFn {
-	lexer.bufferRune(r)
-	return then
-}
-
-func makeLexStr(quoteC rune, canEscape bool) lexerFn {
-	var fn lexerFn
-	fn = func(l *Lexer) lexerFn {
-		r, n, err := l.readAndPeek()
-		if err != nil {
-			if err == io.EOF {
-				if r == quoteC {
-					l.bufferRune(r)
-					l.emit(String)
-					l.emitErr(err)
-					return nil
-				}
-				l.emitErr(errors.New("expected end of string, got end of file"))
-				return nil
-			}
-		}
-
-		if canEscape && r == '\\' && n == quoteC {
-			l.bufferRune(r)
-			l.bufferRune(n)
-			return lexSkipThen(fn)
-		}
-
-		l.bufferRune(r)
-		if r == quoteC {
-			l.emit(String)
-			return lex
-		}
-
-		return fn
-	}
-	return fn
-}
-
-func lexSkipThen(then lexerFn) lexerFn {
-	return func(l *Lexer) lexerFn {
-		if _, err := l.readRune(); err != nil {
-			l.emitErr(err)
-			return nil
-		}
-		return then
-	}
-}
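
For context on what is being deleted: the old lexer is built on the state-function pattern, in which each lexerFn consumes some input and returns the next lexerFn, with nil meaning the machine is done. This is the style popularized by Go's own text/template lexer. The toy reduction below illustrates just that pattern; every name in it is invented for illustration and none of it comes from either file.

package main

import "fmt"

// stateFn mirrors the deleted lexerFn type: each state consumes input and
// returns the next state, with nil meaning "done".
type stateFn func(*machine) stateFn

type machine struct {
	input string
	pos   int
	out   []string // emitted tokens
}

// lexWord buffers characters until it hits a space, then emits them.
func lexWord(m *machine) stateFn {
	start := m.pos
	for m.pos < len(m.input) && m.input[m.pos] != ' ' {
		m.pos++
	}
	m.out = append(m.out, m.input[start:m.pos])
	return lexSpace
}

// lexSpace skips spaces; at end of input it terminates the machine.
func lexSpace(m *machine) stateFn {
	for m.pos < len(m.input) && m.input[m.pos] == ' ' {
		m.pos++
	}
	if m.pos >= len(m.input) {
		return nil
	}
	return lexWord
}

func main() {
	m := &machine{input: "foo bar baz"}

	// the driving loop has the same shape as the deleted Lexer.HasNext:
	// keep calling the current state until it returns nil.
	for state := stateFn(lexWord); state != nil; {
		state = state(m)
	}

	fmt.Println(m.out) // [foo bar baz]
}
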
diff --git a/lexer/lexer_test.go b/lexer/lexer_test.go
deleted file mode 100644
index 376e133..0000000
--- a/lexer/lexer_test.go
+++ /dev/null
@@ -1,82 +0,0 @@
-package lexer
-
-import (
-	"bytes"
-	. "testing"
-
-	"github.com/stretchr/testify/assert"
-	"github.com/stretchr/testify/require"
-)
-
-var lexTestSrc = `
-	// this is a comment
-	// // this is also a comment
-	a
-	anIdentifier
-	1
-	100
-	1.5
-	1.5e9
-
-	/*
-		some stuff
-	*/
-
-	/* this should actually work */
-	/*/
-
-	/*
-		nested!
-		/*
-			wtf this is crazy
-		*/
-	*/
-
-	(punctuation,is{cool}> )
-	-tab
-
-	"this is a string", "and so is this one"
-	"\"foo"
-	"bar\"baz\""
-	"buz\0"
-`
-
-func TestLex(t *T) {
-	l := New(bytes.NewBufferString(lexTestSrc))
-
-	assertNext := func(typ TokenType, val string, row, col int) {
-		t.Logf("asserting %s %q [row:%d col:%d]", typ, val, row, col)
-		require.True(t, l.HasNext())
-		tok := l.Next()
-		assert.Equal(t, typ, tok.TokenType)
-		assert.Equal(t, val, tok.Val)
-		assert.Equal(t, row, tok.Row)
-		assert.Equal(t, col, tok.Col)
-	}
-
-	assertNext(Identifier, "a", 3, 2)
-	assertNext(Identifier, "anIdentifier", 4, 2)
-	assertNext(Identifier, "1", 5, 2)
-	assertNext(Identifier, "100", 6, 2)
-	assertNext(Identifier, "1.5", 7, 2)
-	assertNext(Identifier, "1.5e9", 8, 2)
-	assertNext(Wrapper, "(", 24, 2)
-	assertNext(Identifier, "punctuation", 24, 3)
-	assertNext(Punctuation, ",", 24, 14)
-	assertNext(Identifier, "is", 24, 15)
-	assertNext(Wrapper, "{", 24, 17)
-	assertNext(Identifier, "cool", 24, 18)
-	assertNext(Wrapper, "}", 24, 22)
-	assertNext(Punctuation, ">", 24, 23)
-	assertNext(Wrapper, ")", 24, 25)
-	assertNext(Identifier, "-tab", 25, 2)
-	assertNext(String, `"this is a string"`, 27, 2)
-	assertNext(Punctuation, ",", 27, 20)
-	assertNext(String, `"and so is this one"`, 27, 22)
-	assertNext(String, `"\"foo"`, 28, 2)
-	assertNext(String, `"bar\"baz\""`, 29, 2)
-	assertNext(String, `"buz\0"`, 30, 2)
-	assertNext(EOF, "EOF", 31, 0)
-
-	assert.False(t, l.HasNext())
-}
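
The net effect on callers: the old package delivered errors inside tokens (Token.Err, with HasNext gating every read), while the new package returns them out-of-band from Next, with a wrapped io.EOF as the normal terminator. The sketch below contrasts the two loop shapes; consumeExample is an invented name, and the old-style loop appears only in comments since the two packages cannot coexist in one snippet.

package gg

import (
	"errors"
	"fmt"
	"io"
	"strings"
)

// consumeExample contrasts the two calling conventions; the function itself
// is illustrative and not part of either file in the diff.
func consumeExample() error {

	// Old lexer package (deleted above): errors travel inside tokens, so
	// the loop looked like this.
	//
	//	l := lexer.New(src)
	//	for l.HasNext() {
	//		tok := l.Next()
	//		if err := tok.Err(); err != nil {
	//			return err
	//		}
	//		// ... use tok ...
	//	}
	//
	// New gg package: errors travel out-of-band, and a wrapped io.EOF is
	// the normal terminator.
	l := NewLexer(strings.NewReader("a > b\n"))

	for {
		tok, err := l.Next()

		if errors.Is(err, io.EOF) {
			return nil

		} else if err != nil {
			return fmt.Errorf("lexing: %w", err)
		}

		fmt.Printf("%s %q\n", tok.Kind, tok.Value)
	}
}
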