Finish up lexer, add tests for it

Author: Brian Picciano
Date:   2022-12-26 17:54:08 +01:00
parent 2919bcaa77
commit 20e050438c


@@ -4,6 +4,7 @@ use unicode_categories::UnicodeCategories;
 use char_reader::CharReader;
+#[derive(Copy, Clone, Debug, PartialEq)]
 pub struct Location {
     pub row: i64,
     pub col: i64,
@@ -15,6 +16,7 @@ impl fmt::Display for Location {
     }
 }
+#[derive(Debug)]
 pub enum Error {
     Tokenizing(&'static str, Location),
     IO(io::Error),
@@ -26,50 +28,57 @@ impl From<io::Error> for Error{
     }
 }
+#[derive(Debug, PartialEq)]
 pub enum TokenKind {
     Name,
     Number,
     Punctuation,
 }
+#[derive(Debug, PartialEq)]
 pub struct Token {
     pub kind: TokenKind,
     pub value: String,
-    pub location: Location,
 }
 pub struct Lexer<R: Read> {
     r: CharReader<BufReader<R>>,
     buf: String,
-    prev_char: char,
-    prev_loc: Location,
+    next_loc: Location,
 }
 impl<R: Read> Lexer<R>{
-    fn next_loc(&self) -> Location {
-        if self.prev_char == '\n' {
-            return Location{
-                row: self.prev_loc.row + 1,
-                col: 0
-            };
-        }
-        return Location{
-            row: self.prev_loc.row,
-            col: self.prev_loc.col + 1,
+    pub fn new(r: R) -> Lexer<R> {
+        Lexer{
+            r: CharReader::new(BufReader::new(r)),
+            buf: String::new(),
+            next_loc: Location{
+                row: 0,
+                col: 0,
+            },
         }
     }
     fn discard(&mut self) {
-        self.prev_char = self.r.next_char().
+        let c = self.r.next_char().
            expect("discard should only get called after peek").
            expect("discard should only get called after peek");
-        self.prev_loc = self.next_loc();
+        if c == '\n' {
+            self.next_loc = Location{
+                row: self.next_loc.row + 1,
+                col: 0
+            };
+            return;
+        }
+        self.next_loc = Location{
+            row: self.next_loc.row,
+            col: self.next_loc.col + 1,
+        };
     }
     fn peek_a_bool(&mut self) -> Result<(char, bool), Error> {
@@ -96,9 +105,9 @@ impl<R: Read> Lexer<R>{
         &mut self,
         kind: TokenKind,
         pred: impl Fn(char) -> bool,
-    ) -> Result<Option<Token>, Error> {
+    ) -> Result<Option<(Token, Location)>, Error> {
-        let loc = self.next_loc();
+        let loc = self.next_loc;
         self.buf.truncate(0);
         loop {
@@ -106,11 +115,9 @@ impl<R: Read> Lexer<R>{
             let (c, ok) = self.peek_a_bool()?;
             if !ok || !pred(c) {
-                return Ok(Some(Token{
-                    kind: kind,
-                    value: self.buf.clone(),
-                    location: loc,
-                }))
+                return Ok(Some(
+                    (Token{kind: kind, value: self.buf.clone()}, loc)
+                ))
             }
             self.buf.push(c);
@@ -122,7 +129,7 @@ impl<R: Read> Lexer<R>{
         c == '-' || ('0' <= c && c <= '9')
     }
-    pub fn next(&mut self) -> Result<Option<Token>, Error> {
+    pub fn next(&mut self) -> Result<Option<(Token, Location)>, Error> {
         loop {
@@ -143,18 +150,118 @@ impl<R: Read> Lexer<R>{
             } else if Self::is_number(c) {
                 return self.collect_token(TokenKind::Number, Self::is_number);
-            } else if c.is_punctuation() {
-                return self.collect_token(
-                    TokenKind::Punctuation,
-                    |c| c.is_punctuation() || c.is_symbol(),
-                );
+            } else if c.is_punctuation() || c.is_symbol() {
+                let loc = self.next_loc;
+                self.discard();
+                return Ok(Some(
+                    (Token{kind: TokenKind::Punctuation, value: c.to_string()}, loc)
+                ))
             } else if c.is_ascii_whitespace() {
                 self.discard_while(|c| c.is_ascii_whitespace())?;
             } else {
-                return Err(Error::Tokenizing("unexpected character", self.next_loc()));
+                return Err(Error::Tokenizing("unexpected character", self.next_loc));
             }
         }
     }
 }
+#[cfg(test)]
+mod tests {
+    use super::*;
+    #[test]
+    fn lexer() {
+        struct Test {
+            input: &'static str,
+            exp: Vec<(Token, Location)>,
+        }
+        fn tok(kind: TokenKind, val: &'static str, loc_row: i64, loc_col: i64) -> (Token, Location) {
+            (
+                Token{kind: kind, value: val.to_string()},
+                Location{row: loc_row, col: loc_col},
+            )
+        }
+        let tests = vec![
+            Test{
+                input: "",
+                exp: vec![],
+            },
+            Test{
+                input: "* foo",
+                exp: vec![],
+            },
+            Test{
+                input: "* foo\n",
+                exp: vec![],
+            },
+            Test{
+                input: "* foo\nbar",
+                exp: vec![
+                    tok(TokenKind::Name, "bar", 1, 0),
+                ],
+            },
+            Test{
+                input: "* foo\nbar ",
+                exp: vec![
+                    tok(TokenKind::Name, "bar", 1, 0),
+                ],
+            },
+            Test{
+                input: "foo bar\nf-o f0O Foo",
+                exp: vec![
+                    tok(TokenKind::Name, "foo", 0, 0),
+                    tok(TokenKind::Name, "bar", 0, 4),
+                    tok(TokenKind::Name, "f-o", 1, 0),
+                    tok(TokenKind::Name, "f0O", 1, 4),
+                    tok(TokenKind::Name, "Foo", 1, 8),
+                ],
+            },
+            Test{
+                input: "1 100 -100",
+                exp: vec![
+                    tok(TokenKind::Number, "1", 0, 0),
+                    tok(TokenKind::Number, "100", 0, 2),
+                    tok(TokenKind::Number, "-100", 0, 6),
+                ],
+            },
+            Test{
+                input: "1<2!-3 ()",
+                exp: vec![
+                    tok(TokenKind::Number, "1", 0, 0),
+                    tok(TokenKind::Punctuation, "<", 0, 1),
+                    tok(TokenKind::Number, "2", 0, 2),
+                    tok(TokenKind::Punctuation, "!", 0, 3),
+                    tok(TokenKind::Number, "-3", 0, 4),
+                    tok(TokenKind::Punctuation, "(", 0, 7),
+                    tok(TokenKind::Punctuation, ")", 0, 8),
+                ],
+            },
+        ];
+        for test in tests {
+            println!("INPUT: {:#?}", test.input);
+            let mut l = Lexer::new(test.input.as_bytes());
+            let mut res = Vec::new();
+            loop {
+                if let Some(token) = l.next().expect("no errors expected") {
+                    res.push(token);
+                } else {
+                    break;
+                }
+            }
+            assert_eq!(*res.as_slice(), *test.exp)
+        }
+    }
+}
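
Usage note: after this commit, next() yields the token and its source location as a tuple, which is exactly what the test loop above drives. Below is a minimal caller sketch under the assumption that the code in this diff lives in a module named `lexer` (the file name is not visible in this commit view); everything else it touches is shown in the diff: Lexer::new over any std::io::Read, Token's public kind/value fields, Location's Display impl, and the Debug derives added here.

// Hypothetical driver; the `lexer` module path is an assumption.
use lexer::Lexer;

fn main() {
    // The tests feed &[u8] via as_bytes(); any std::io::Read source works.
    let mut l = Lexer::new("foo 1<2".as_bytes());

    // next() returns Ok(Some((token, location))) per token, Ok(None) once the
    // input is exhausted, and Err(Error::Tokenizing(..)) or Err(Error::IO(..)) on failure.
    while let Some((tok, loc)) = l.next().expect("lexing failed") {
        println!("{:?} {:?} at {}", tok.kind, tok.value, loc);
    }
}

The while-let form is equivalent to the explicit loop/break used in the test; it just reads more directly when only the Some case matters.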