package gg import ( "bufio" "fmt" "io" "strings" "unicode" ) // LexerLocation describes the location in a file where a particular token was // parsed from. type LexerLocation struct { Row, Col int } func (l LexerLocation) String() string { return fmt.Sprintf("%d:%d", l.Row, l.Col) } // LexerError is returned by Lexer when an unexpected error occurs parsing a // stream of LexerTokens. type LexerError struct { Err error Location LexerLocation } func (e *LexerError) Error() string { return fmt.Sprintf("%s: %s", e.Location.String(), e.Err.Error()) } func (e *LexerError) Unwrap() error { return e.Err } // LexerTokenKind enumerates the different kinds of LexerToken there can be. type LexerTokenKind string // Enumeration of LexerTokenKinds. const ( LexerTokenKindName LexerTokenKind = "name" LexerTokenKindNumber LexerTokenKind = "number" LexerTokenKindPunctuation LexerTokenKind = "punctuation" ) // LexerToken describes a lexigraphical token which is used when deserializing // Graphs. type LexerToken struct { Kind LexerTokenKind Value string // never empty string Location LexerLocation } func (t LexerToken) errPrefix() string { return fmt.Sprintf("%s: at %q", t.Location.String(), t.Value) } // Lexer is used to parse a string stream into a sequence of tokens which can // then be parsed by a Parser. type Lexer interface { // Next will return a LexerToken or a LexerError. io.EOF (wrapped in a // LexerError) is returned if the stream being read from is finished. Next() (LexerToken, error) } type lexer struct { r *bufio.Reader stringBuilder *strings.Builder err *LexerError // these fields are only needed to keep track of the current "cursor" // position when reading. lastRow, lastCol int prevRune rune } // NewLexer wraps the io.Reader in a Lexer, which will read the io.Reader as a // sequence of utf-8 characters and parse it into a sequence of LexerTokens. func NewLexer(r io.Reader) Lexer { return &lexer{ r: bufio.NewReader(r), lastRow: 0, lastCol: -1, stringBuilder: new(strings.Builder), } } // nextRowCol returns the row and column number which the next rune in the // stream would be at. func (l *lexer) nextRowCol() (int, int) { if l.prevRune == '\n' { return l.lastRow + 1, 0 } return l.lastRow, l.lastCol + 1 } func (l *lexer) fmtErr(err error) *LexerError { row, col := l.nextRowCol() return &LexerError{ Err: err, Location: LexerLocation{ Row: row, Col: col, }, } } func (l *lexer) fmtErrf(str string, args ...interface{}) *LexerError { return l.fmtErr(fmt.Errorf(str, args...)) } // discardRune must _always_ be called only after peekRune. func (l *lexer) discardRune() { r, _, err := l.r.ReadRune() if err != nil { panic(err) } l.lastRow, l.lastCol = l.nextRowCol() l.prevRune = r } func (l *lexer) peekRune() (rune, error) { r, _, err := l.r.ReadRune() if err != nil { return '0', err } else if err := l.r.UnreadRune(); err != nil { // since the most recent operation on the bufio.Reader was a ReadRune, // UnreadRune should never return an error panic(err) } return r, nil } // readWhile reads runes until the given predicate returns false, and returns a // LexerToken of the given kind whose Value is comprised of all runes which // returned true. // // If an error is encountered then both the token (or what's been parsed of it // so far) and the error are returned. func (l *lexer) readWhile( kind LexerTokenKind, pred func(rune) bool, ) ( LexerToken, *LexerError, ) { row, col := l.nextRowCol() l.stringBuilder.Reset() var lexErr *LexerError for { r, err := l.peekRune() if err != nil { lexErr = l.fmtErrf("peeking next character: %w", err) break } else if !pred(r) { break } l.stringBuilder.WriteRune(r) l.discardRune() } return LexerToken{ Kind: kind, Value: l.stringBuilder.String(), Location: LexerLocation{ Row: row, Col: col, }, }, lexErr } // we only support base-10 integers at the moment. func isNumber(r rune) bool { return r == '-' || ('0' <= r && r <= '9') } // next can return a token, an error, or both. If an error is returned then no // further calls to next should occur. func (l *lexer) next() (LexerToken, *LexerError) { for { r, err := l.peekRune() if err != nil { return LexerToken{}, l.fmtErrf("peeking next character: %w", err) } switch { case r == '*': // comment // comments are everything up until a newline _, err := l.readWhile("", func(r rune) bool { return r != '\n' }) if err != nil { return LexerToken{}, err } // terminating newline will be discarded on next loop case r == '"' || r == '`': // reserve double-quote and backtick for string parsing. l.discardRune() return LexerToken{}, l.fmtErrf("string parsing not yet implemented") case unicode.IsLetter(r): // letters denote the start of a name return l.readWhile(LexerTokenKindName, func(r rune) bool { if unicode.In(r, unicode.Letter, unicode.Number, unicode.Mark) { return true } if r == '-' { return true } return false }) case isNumber(r): return l.readWhile(LexerTokenKindNumber, isNumber) case unicode.IsPunct(r) || unicode.IsSymbol(r): // symbols are also considered punctuation l.discardRune() return LexerToken{ Kind: LexerTokenKindPunctuation, Value: string(r), Location: LexerLocation{ Row: l.lastRow, Col: l.lastCol, }, }, nil case unicode.IsSpace(r): l.discardRune() default: return LexerToken{}, l.fmtErrf("unexpected character %q", r) } } } func (l *lexer) Next() (LexerToken, error) { if l.err != nil { return LexerToken{}, l.err } tok, err := l.next() if err != nil { l.err = err if tok.Kind == "" { return LexerToken{}, l.err } } return tok, nil }