From 20e050438cc93c4bb2584ea191e53a7ceb287a6a Mon Sep 17 00:00:00 2001 From: Brian Picciano Date: Mon, 26 Dec 2022 17:54:08 +0100 Subject: [PATCH] Finish up lexer, add tests for it --- rust/src/gg/lexer.rs | 169 +++++++++++++++++++++++++++++++++++-------- 1 file changed, 138 insertions(+), 31 deletions(-) diff --git a/rust/src/gg/lexer.rs b/rust/src/gg/lexer.rs index 881968e..d6f7134 100644 --- a/rust/src/gg/lexer.rs +++ b/rust/src/gg/lexer.rs @@ -4,6 +4,7 @@ use unicode_categories::UnicodeCategories; use char_reader::CharReader; +#[derive(Copy, Clone, Debug, PartialEq)] pub struct Location { pub row: i64, pub col: i64, @@ -15,6 +16,7 @@ impl fmt::Display for Location { } } +#[derive(Debug)] pub enum Error { Tokenizing(&'static str, Location), IO(io::Error), @@ -26,50 +28,57 @@ impl From for Error{ } } +#[derive(Debug, PartialEq)] pub enum TokenKind { Name, Number, Punctuation, } +#[derive(Debug, PartialEq)] pub struct Token { pub kind: TokenKind, pub value: String, - pub location: Location, } pub struct Lexer { r: CharReader>, buf: String, - prev_char: char, - prev_loc: Location, + next_loc: Location, } impl Lexer{ - fn next_loc(&self) -> Location { - - if self.prev_char == '\n' { - return Location{ - row: self.prev_loc.row + 1, - col: 0 - }; - } - - return Location{ - row: self.prev_loc.row, - col: self.prev_loc.col + 1, + pub fn new(r: R) -> Lexer { + Lexer{ + r: CharReader::new(BufReader::new(r)), + buf: String::new(), + next_loc: Location{ + row: 0, + col: 0, + }, } } fn discard(&mut self) { - self.prev_char = self.r.next_char(). + let c = self.r.next_char(). expect("discard should only get called after peek"). expect("discard should only get called after peek"); - self.prev_loc = self.next_loc(); + if c == '\n' { + self.next_loc = Location{ + row: self.next_loc.row + 1, + col: 0 + }; + return; + } + + self.next_loc = Location{ + row: self.next_loc.row, + col: self.next_loc.col + 1, + }; } fn peek_a_bool(&mut self) -> Result<(char, bool), Error> { @@ -96,9 +105,9 @@ impl Lexer{ &mut self, kind: TokenKind, pred: impl Fn(char) -> bool, - ) -> Result, Error> { + ) -> Result, Error> { - let loc = self.next_loc(); + let loc = self.next_loc; self.buf.truncate(0); loop { @@ -106,11 +115,9 @@ impl Lexer{ let (c, ok) = self.peek_a_bool()?; if !ok || !pred(c) { - return Ok(Some(Token{ - kind: kind, - value: self.buf.clone(), - location: loc, - })) + return Ok(Some( + (Token{kind: kind, value: self.buf.clone()}, loc) + )) } self.buf.push(c); @@ -122,7 +129,7 @@ impl Lexer{ c == '-' || ('0' <= c && c <= '9') } - pub fn next(&mut self) -> Result, Error> { + pub fn next(&mut self) -> Result, Error> { loop { @@ -143,18 +150,118 @@ impl Lexer{ } else if Self::is_number(c) { return self.collect_token(TokenKind::Number, Self::is_number); - } else if c.is_punctuation() { - return self.collect_token( - TokenKind::Punctuation, - |c| c.is_punctuation() || c.is_symbol(), - ); + } else if c.is_punctuation() || c.is_symbol() { + + let loc = self.next_loc; + self.discard(); + + return Ok(Some( + (Token{kind: TokenKind::Punctuation, value: c.to_string()}, loc) + )) } else if c.is_ascii_whitespace() { self.discard_while(|c| c.is_ascii_whitespace())?; } else { - return Err(Error::Tokenizing("unexpected character", self.next_loc())); + return Err(Error::Tokenizing("unexpected character", self.next_loc)); } } } } + +#[cfg(test)] +mod tests { + + use super::*; + + #[test] + fn lexer() { + + struct Test { + input: &'static str, + exp: Vec<(Token, Location)>, + } + + fn tok(kind: TokenKind, val: &'static str, loc_row: i64, loc_col: i64) -> (Token, Location) { + ( + Token{kind: kind, value: val.to_string()}, + Location{row: loc_row, col: loc_col}, + ) + } + + let tests = vec![ + Test{ + input: "", + exp: vec![], + }, + Test{ + input: "* foo", + exp: vec![], + }, + Test{ + input: "* foo\n", + exp: vec![], + }, + Test{ + input: "* foo\nbar", + exp: vec![ + tok(TokenKind::Name, "bar", 1, 0), + ], + }, + Test{ + input: "* foo\nbar ", + exp: vec![ + tok(TokenKind::Name, "bar", 1, 0), + ], + }, + Test{ + input: "foo bar\nf-o f0O Foo", + exp: vec![ + tok(TokenKind::Name, "foo", 0, 0), + tok(TokenKind::Name, "bar", 0, 4), + tok(TokenKind::Name, "f-o", 1, 0), + tok(TokenKind::Name, "f0O", 1, 4), + tok(TokenKind::Name, "Foo", 1, 8), + ], + }, + Test{ + input: "1 100 -100", + exp: vec![ + tok(TokenKind::Number, "1", 0, 0), + tok(TokenKind::Number, "100", 0, 2), + tok(TokenKind::Number, "-100", 0, 6), + ], + }, + Test{ + input: "1<2!-3 ()", + exp: vec![ + tok(TokenKind::Number, "1", 0, 0), + tok(TokenKind::Punctuation, "<", 0, 1), + tok(TokenKind::Number, "2", 0, 2), + tok(TokenKind::Punctuation, "!", 0, 3), + tok(TokenKind::Number, "-3", 0, 4), + tok(TokenKind::Punctuation, "(", 0, 7), + tok(TokenKind::Punctuation, ")", 0, 8), + ], + }, + ]; + + for test in tests { + println!("INPUT: {:#?}", test.input); + + let mut l = Lexer::new(test.input.as_bytes()); + let mut res = Vec::new(); + + loop { + if let Some(token) = l.next().expect("no errors expected") { + res.push(token); + } else { + break; + } + } + + assert_eq!(*res.as_slice(), *test.exp) + } + } +} +