Finish up lexer, add tests for it

This commit is contained in:
Brian Picciano 2022-12-26 17:54:08 +01:00
parent 2919bcaa77
commit 20e050438c

View File

@ -4,6 +4,7 @@ use unicode_categories::UnicodeCategories;
use char_reader::CharReader; use char_reader::CharReader;
#[derive(Copy, Clone, Debug, PartialEq)]
pub struct Location { pub struct Location {
pub row: i64, pub row: i64,
pub col: i64, pub col: i64,
@ -15,6 +16,7 @@ impl fmt::Display for Location {
} }
} }
#[derive(Debug)]
pub enum Error { pub enum Error {
Tokenizing(&'static str, Location), Tokenizing(&'static str, Location),
IO(io::Error), IO(io::Error),
@ -26,50 +28,57 @@ impl From<io::Error> for Error{
} }
} }
#[derive(Debug, PartialEq)]
pub enum TokenKind { pub enum TokenKind {
Name, Name,
Number, Number,
Punctuation, Punctuation,
} }
#[derive(Debug, PartialEq)]
pub struct Token { pub struct Token {
pub kind: TokenKind, pub kind: TokenKind,
pub value: String, pub value: String,
pub location: Location,
} }
pub struct Lexer<R: Read> { pub struct Lexer<R: Read> {
r: CharReader<BufReader<R>>, r: CharReader<BufReader<R>>,
buf: String, buf: String,
prev_char: char, next_loc: Location,
prev_loc: Location,
} }
impl<R: Read> Lexer<R>{ impl<R: Read> Lexer<R>{
fn next_loc(&self) -> Location { pub fn new(r: R) -> Lexer<R> {
Lexer{
if self.prev_char == '\n' { r: CharReader::new(BufReader::new(r)),
return Location{ buf: String::new(),
row: self.prev_loc.row + 1, next_loc: Location{
col: 0 row: 0,
}; col: 0,
} },
return Location{
row: self.prev_loc.row,
col: self.prev_loc.col + 1,
} }
} }
fn discard(&mut self) { fn discard(&mut self) {
self.prev_char = self.r.next_char(). let c = self.r.next_char().
expect("discard should only get called after peek"). expect("discard should only get called after peek").
expect("discard should only get called after peek"); expect("discard should only get called after peek");
self.prev_loc = self.next_loc(); if c == '\n' {
self.next_loc = Location{
row: self.next_loc.row + 1,
col: 0
};
return;
}
self.next_loc = Location{
row: self.next_loc.row,
col: self.next_loc.col + 1,
};
} }
fn peek_a_bool(&mut self) -> Result<(char, bool), Error> { fn peek_a_bool(&mut self) -> Result<(char, bool), Error> {
@ -96,9 +105,9 @@ impl<R: Read> Lexer<R>{
&mut self, &mut self,
kind: TokenKind, kind: TokenKind,
pred: impl Fn(char) -> bool, pred: impl Fn(char) -> bool,
) -> Result<Option<Token>, Error> { ) -> Result<Option<(Token, Location)>, Error> {
let loc = self.next_loc(); let loc = self.next_loc;
self.buf.truncate(0); self.buf.truncate(0);
loop { loop {
@ -106,11 +115,9 @@ impl<R: Read> Lexer<R>{
let (c, ok) = self.peek_a_bool()?; let (c, ok) = self.peek_a_bool()?;
if !ok || !pred(c) { if !ok || !pred(c) {
return Ok(Some(Token{ return Ok(Some(
kind: kind, (Token{kind: kind, value: self.buf.clone()}, loc)
value: self.buf.clone(), ))
location: loc,
}))
} }
self.buf.push(c); self.buf.push(c);
@ -122,7 +129,7 @@ impl<R: Read> Lexer<R>{
c == '-' || ('0' <= c && c <= '9') c == '-' || ('0' <= c && c <= '9')
} }
pub fn next(&mut self) -> Result<Option<Token>, Error> { pub fn next(&mut self) -> Result<Option<(Token, Location)>, Error> {
loop { loop {
@ -143,18 +150,118 @@ impl<R: Read> Lexer<R>{
} else if Self::is_number(c) { } else if Self::is_number(c) {
return self.collect_token(TokenKind::Number, Self::is_number); return self.collect_token(TokenKind::Number, Self::is_number);
} else if c.is_punctuation() { } else if c.is_punctuation() || c.is_symbol() {
return self.collect_token(
TokenKind::Punctuation, let loc = self.next_loc;
|c| c.is_punctuation() || c.is_symbol(), self.discard();
);
return Ok(Some(
(Token{kind: TokenKind::Punctuation, value: c.to_string()}, loc)
))
} else if c.is_ascii_whitespace() { } else if c.is_ascii_whitespace() {
self.discard_while(|c| c.is_ascii_whitespace())?; self.discard_while(|c| c.is_ascii_whitespace())?;
} else { } else {
return Err(Error::Tokenizing("unexpected character", self.next_loc())); return Err(Error::Tokenizing("unexpected character", self.next_loc));
} }
} }
} }
} }
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn lexer() {
struct Test {
input: &'static str,
exp: Vec<(Token, Location)>,
}
fn tok(kind: TokenKind, val: &'static str, loc_row: i64, loc_col: i64) -> (Token, Location) {
(
Token{kind: kind, value: val.to_string()},
Location{row: loc_row, col: loc_col},
)
}
let tests = vec![
Test{
input: "",
exp: vec![],
},
Test{
input: "* foo",
exp: vec![],
},
Test{
input: "* foo\n",
exp: vec![],
},
Test{
input: "* foo\nbar",
exp: vec![
tok(TokenKind::Name, "bar", 1, 0),
],
},
Test{
input: "* foo\nbar ",
exp: vec![
tok(TokenKind::Name, "bar", 1, 0),
],
},
Test{
input: "foo bar\nf-o f0O Foo",
exp: vec![
tok(TokenKind::Name, "foo", 0, 0),
tok(TokenKind::Name, "bar", 0, 4),
tok(TokenKind::Name, "f-o", 1, 0),
tok(TokenKind::Name, "f0O", 1, 4),
tok(TokenKind::Name, "Foo", 1, 8),
],
},
Test{
input: "1 100 -100",
exp: vec![
tok(TokenKind::Number, "1", 0, 0),
tok(TokenKind::Number, "100", 0, 2),
tok(TokenKind::Number, "-100", 0, 6),
],
},
Test{
input: "1<2!-3 ()",
exp: vec![
tok(TokenKind::Number, "1", 0, 0),
tok(TokenKind::Punctuation, "<", 0, 1),
tok(TokenKind::Number, "2", 0, 2),
tok(TokenKind::Punctuation, "!", 0, 3),
tok(TokenKind::Number, "-3", 0, 4),
tok(TokenKind::Punctuation, "(", 0, 7),
tok(TokenKind::Punctuation, ")", 0, 8),
],
},
];
for test in tests {
println!("INPUT: {:#?}", test.input);
let mut l = Lexer::new(test.input.as_bytes());
let mut res = Vec::new();
loop {
if let Some(token) = l.next().expect("no errors expected") {
res.push(token);
} else {
break;
}
}
assert_eq!(*res.as_slice(), *test.exp)
}
}
}