From 6fa4b3b11d4e397d553b4c4364394d7a2678907b Mon Sep 17 00:00:00 2001 From: Brian Picciano Date: Sat, 18 Oct 2014 12:04:48 -0400 Subject: [PATCH] bail on the parse code, watched rob pike's talk and wrote a lexer based on that --- parse/lex/lex.go | 205 ++++++++++++++++++++++++++++++++++++++++++ parse/lex/lex_test.go | 56 ++++++++++++ parse/parse.go | 88 ------------------ parse/parse_test.go | 55 ------------ 4 files changed, 261 insertions(+), 143 deletions(-) create mode 100644 parse/lex/lex.go create mode 100644 parse/lex/lex_test.go diff --git a/parse/lex/lex.go b/parse/lex/lex.go new file mode 100644 index 0000000..ce98af7 --- /dev/null +++ b/parse/lex/lex.go @@ -0,0 +1,205 @@ +package lexer + +import ( + "bufio" + "bytes" + "errors" + "fmt" + "io" + "unicode" +) + +type TokenType int +const ( + BareString TokenType = iota + QuotedString + OpenParen + CloseParen + Err + EOF +) + +var invalidBareStringRunes = map[rune]bool{ + '"': true, + '\'': true, + '(': true, + ')': true, + '[': true, + ']': true, + '{': true, + '}': true, +} + +// Token represents a single set of characters that are a "thing" in the syntax +type Token struct { + Type TokenType + Val string +} + +var ( + errInvalidUTF8 = errors.New("invalid utf8 character") +) + +// Lexer reads through an io.Reader and emits Tokens from it. +type Lexer struct { + r *bufio.Reader + outbuf *bytes.Buffer + ch chan *Token +} + +// NewLexer constructs a new Lexer struct and returns it. r is internally +// wrapped with a bufio.Reader, unless it already is one. +func NewLexer(r io.Reader) *Lexer { + var br *bufio.Reader + var ok bool + if br, ok = r.(*bufio.Reader); !ok { + br = bufio.NewReader(r) + } + + l := Lexer{ + r: br, + ch: make(chan *Token, 1), + outbuf: bytes.NewBuffer(make([]byte, 0, 1024)), + } + + go l.spin() + + return &l +} + +func (l *Lexer) spin() { + f := lexWhitespace + for { + f = f(l) + if f == nil { + return + } + } +} + +func (l *Lexer) Next() *Token { + t := <-l.ch + if t.Type == EOF { + return nil + } + return t +} + +func (l *Lexer) emit(t TokenType) { + str := l.outbuf.String() + fmt.Printf("emitting %q\n", str) + l.ch <- &Token{ + Type: t, + Val: l.outbuf.String(), + } + l.outbuf.Reset() +} + +func (l *Lexer) peek() (rune, error) { + r, err := l.readRune() + if err != nil { + return 0, err + } + if err = l.r.UnreadRune(); err != nil { + return 0, err + } + return r, nil +} + +func (l *Lexer) readRune() (rune, error) { + r, i, err := l.r.ReadRune() + if err != nil { + return 0, err + } else if r == unicode.ReplacementChar && i == 1 { + return 0, errInvalidUTF8 + } + return r, nil +} + +func (l *Lexer) err(err error) lexerFunc { + if err == io.EOF { + l.ch <- &Token{EOF, ""} + } else { + l.ch <- &Token{Err, err.Error()} + } + close(l.ch) + return nil +} + +func (l *Lexer) errf(format string, args ...interface{}) lexerFunc { + s := fmt.Sprintf(format, args...) + l.ch <- &Token{Err, s} + close(l.ch) + return nil +} + +type lexerFunc func(*Lexer) lexerFunc + +func lexWhitespace(l *Lexer) lexerFunc { + r, err := l.readRune() + if err != nil { + return l.err(err) + } + + if unicode.IsSpace(r) { + fmt.Printf("skipping %q because it's a space\n", r) + return lexWhitespace + } + + fmt.Printf("not skipping %q\n", r) + l.outbuf.WriteRune(r) + + switch r { + case '"': + return lexQuotedString + case '(': + l.emit(OpenParen) + case ')': + l.emit(CloseParen) + default: + return lexBareString + } + + return lexWhitespace +} + +func lexQuotedString(l *Lexer) lexerFunc { + r, err := l.readRune() + if err != nil { + l.emit(QuotedString) + return l.err(err) + } + + l.outbuf.WriteRune(r) + buf := l.outbuf.Bytes() + + if r == '"' && buf[len(buf) - 2] != '\\' { + l.emit(QuotedString) + fmt.Println("emitting quoted string, parsing whitespace") + return lexWhitespace + } + return lexQuotedString +} + +func lexBareString(l *Lexer) lexerFunc { + r, err := l.peek() + if err != nil { + fmt.Printf("got err %s in peek\n", err) + l.emit(BareString) + return l.err(err) + } + + if _, ok := invalidBareStringRunes[r]; ok || unicode.IsSpace(r) { + l.emit(BareString) + return lexWhitespace + } + + if _, err = l.readRune(); err != nil { + fmt.Printf("got err %s in read\n", err) + l.emit(BareString) + return l.err(err) + } + + l.outbuf.WriteRune(r) + return lexBareString +} diff --git a/parse/lex/lex_test.go b/parse/lex/lex_test.go new file mode 100644 index 0000000..a06c553 --- /dev/null +++ b/parse/lex/lex_test.go @@ -0,0 +1,56 @@ +package lexer + +import ( + "bytes" + . "testing" +) + +func TestLexer(t *T) { + m := map[string][]Token{ + "": {{EOF, ""}}, + " \t": {{EOF, ""}}, + "a b c": {{BareString, "a"}, + {BareString, "b"}, + {BareString, "c"}, + {EOF, ""}}, + "\"foo\" bar": {{QuotedString, "\"foo\""}, + {BareString, "bar"}, + {EOF, ""}}, + "\"foo\nbar\" baz": {{QuotedString, "\"foo\nbar\""}, + {BareString, "baz"}, + {EOF, ""}}, + "( foo bar ) baz": {{OpenParen, "("}, + {BareString, "foo"}, + {BareString, "bar"}, + {CloseParen, ")"}, + {BareString, "baz"}, + {EOF, ""}}, + "((foo-bar))": {{OpenParen, "("}, + {OpenParen, "("}, + {BareString, "foo-bar"}, + {CloseParen, ")"}, + {CloseParen, ")"}, + {EOF, ""}}, + "(\"foo\nbar\")": {{OpenParen, "("}, + {QuotedString, "\"foo\nbar\""}, + {CloseParen, ")"}, + {EOF, ""}}, + } + + for input, output := range m { + buf := bytes.NewBufferString(input) + l := NewLexer(buf) + for i := range output { + tok := l.Next() + if tok == nil { + if output[i].Type == EOF { + continue + } + t.Fatalf("input: %q (%d) %#v != %#v", input, i, output[i], tok) + } + if *tok != output[i] { + t.Fatalf("input: %s (%d) %#v != %#v", input, i, output[i], tok) + } + } + } +} diff --git a/parse/parse.go b/parse/parse.go index 95e6710..5956e69 100644 --- a/parse/parse.go +++ b/parse/parse.go @@ -1,92 +1,4 @@ package parse import ( - "bufio" - "strconv" - - "github.com/mediocregopher/ginger/types" ) - -//func ReadElem(r io.Reader) (types.Elem, error) { -// buf := bufio.NewReader(r) -// var err error -// for { -// } -//} - -// ReadString reads in a string from the given reader. It assumes the first -// double-quote has already been read off. Ginger strings are wrapped with " and -// are allowed to have newlines literal in them. In all other respects they are -// the same as go strings. -func ReadString(r *bufio.Reader) (types.Str, error) { - str := types.Str("\"") - for { - piece, err := r.ReadBytes('"') - if err != nil { - return "", err - } - str += types.Str(piece) - if piece[len(piece)-2] != '\\' { - break - } - } - - ret, err := strconv.Unquote(string(str)) - if err != nil { - return "", err - } - return types.Str(ret), nil -} - - -// Returns (isNumber, isFloat). Can never return (false, true) -func whatNumber(el string) (bool, bool) { - var isFloat bool - first := el[0] - - var start int - if first == '-' { - if len(el) == 1 { - return false, false - } - start = 1 - } - - el = el[start:] - for i := range el { - if el[i] == '.' { - isFloat = true - } else if el[i] < '0' || el[i] > '9' { - return false, false - } - } - - return true, isFloat -} - -// Given a string with no spaces and with a length >= 1, parses it into either a -// number or string. -func ParseBareElement(el string) (types.Elem, error) { - isNumber, isFloat := whatNumber(el) - if isNumber { - if isFloat { - f, err := strconv.ParseFloat(el, 64) - if err != nil { - return nil, err - } - return types.Float(f), nil - } else { - i, err := strconv.ParseInt(el, 10, 64) - if err != nil { - return nil, err - } - return types.Int(i), nil - } - } - - if el[0] == ':' { - return types.Str(el), nil - } - - return types.Str(":"+el), nil -} diff --git a/parse/parse_test.go b/parse/parse_test.go index 4a0efe6..e3f6767 100644 --- a/parse/parse_test.go +++ b/parse/parse_test.go @@ -1,60 +1,5 @@ package parse import ( - "bytes" - "bufio" . "testing" - - "github.com/mediocregopher/ginger/types" ) - -func TestReadString(t *T) { - m := map[string]types.Str{ - `"hey there"`: "hey there", - `"hey\nthere"`: "hey\nthere", - `"hey there ⌘"`: "hey there ⌘", - `"hey\nthere \u2318"`: "hey\nthere ⌘", - } - - for input, output := range m { - buf := bytes.NewBufferString(input) - buf.ReadByte() - buf2 := bufio.NewReader(buf) - - parseOut, err := ReadString(buf2) - if err != nil { - t.Fatal(err) - } - if output != parseOut { - t.Fatalf("`%s` != `%s`", output, parseOut) - } - } -} - -func TestParseBareElement(t *T) { - m := map[string]types.Elem{ - `1`: types.Int(1), - `12`: types.Int(12), - `-1`: types.Int(-1), - `-12`: types.Int(-12), - - `1.0`: types.Float(1.0), - `12.5`: types.Float(12.5), - `-12.5`: types.Float(-12.5), - - `-`: types.Str(":-"), - - `bare`: types.Str(":bare"), - `:not-bare`: types.Str(":not-bare"), - } - - for input, output := range m { - el, err := ParseBareElement(input) - if err != nil { - t.Fatal(err) - } - if output != el { - t.Fatalf("`%s` != `%s`", output, el) - } - } -}