From 2919bcaa777759ac6cfcfd846ed7729b40c68e7b Mon Sep 17 00:00:00 2001
From: Brian Picciano
Date: Sat, 24 Dec 2022 23:33:32 +0100
Subject: [PATCH] Implemented lexer

---
Review notes (revised version of this patch):
- lexer.rs: Lexer::discard now computes the location before overwriting
  prev_char. Previously a consumed newline was placed on the following row
  and a token starting directly after it was reported one row further still.
- lexer.rs, gg.rs: added doc comments and derives, small idiom cleanups.
- The blob ids on the "index" lines of gg.rs and lexer.rs are carried over
  from the original patch and do not match the revised contents.

 rust/Cargo.lock      |  14 +++
 rust/Cargo.toml      |   2 +
 rust/src/gg.rs       |  12 +++
 rust/src/gg/lexer.rs | 207 +++++++++++++++++++++++++++++++++++++++++++
 rust/src/lib.rs      |   5 +-
 rust/src/main.rs     |   2 +-
 6 files changed, 238 insertions(+), 4 deletions(-)
 create mode 100644 rust/src/gg.rs
 create mode 100644 rust/src/gg/lexer.rs

diff --git a/rust/Cargo.lock b/rust/Cargo.lock
index c4da8e7..8a0561a 100644
--- a/rust/Cargo.lock
+++ b/rust/Cargo.lock
@@ -11,11 +11,19 @@ dependencies = [
  "typenum",
 ]
 
+[[package]]
+name = "char_reader"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "37a59b22dec21ca7d6c173bd543eeab4cd2f36cf21f039a4134905034c87ed3a"
+
 [[package]]
 name = "ginger"
 version = "0.1.0"
 dependencies = [
+ "char_reader",
  "im-rc",
+ "unicode_categories",
 ]
 
 [[package]]
@@ -63,6 +71,12 @@ version = "1.16.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "497961ef93d974e23eb6f433eb5fe1b7930b659f06d12dec6fc44a8f554c0bba"
 
+[[package]]
+name = "unicode_categories"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "39ec24b3121d976906ece63c9daad25b85969647682eee313cb5779fdd69e14e"
+
 [[package]]
 name = "version_check"
 version = "0.9.4"
diff --git a/rust/Cargo.toml b/rust/Cargo.toml
index ded0d37..6c6f68f 100644
--- a/rust/Cargo.toml
+++ b/rust/Cargo.toml
@@ -7,3 +7,5 @@ edition = "2021"
 
 [dependencies]
 im-rc = "15.1.0"
+char_reader = "0.1.1"
+unicode_categories = "0.1.1"
diff --git a/rust/src/gg.rs b/rust/src/gg.rs
new file mode 100644
index 0000000..2b39b1d
--- /dev/null
+++ b/rust/src/gg.rs
@@ -0,0 +1,12 @@
+pub mod lexer;
+
+use super::graph::Graph;
+
+/// A value: a borrowed name, a 64-bit integer, or a borrowed [`Graph`]
+/// whose two type parameters are both `Value`.
+#[derive(Clone, Eq, Hash, PartialEq, Debug)]
+pub enum Value<'a>{
+    Name(&'a str),
+    Number(i64),
+    Graph(&'a Graph<Value<'a>, Value<'a>>),
+}
diff --git a/rust/src/gg/lexer.rs b/rust/src/gg/lexer.rs
new file mode 100644
index 0000000..881968e
--- /dev/null
+++ b/rust/src/gg/lexer.rs
@@ -0,0 +1,207 @@
+use std::fmt;
+use std::io::{self, Read, BufReader};
+use unicode_categories::UnicodeCategories;
+
+use char_reader::CharReader;
+
+/// Position of a character in the input, as tracked by [`Lexer`].
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct Location {
+    pub row: i64,
+    pub col: i64,
+}
+
+impl fmt::Display for Location {
+    /// Formats the location as `row:col`.
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(f, "{}:{}", self.row, self.col)
+    }
+}
+
+/// Errors returned by [`Lexer::next`].
+#[derive(Debug)]
+pub enum Error {
+    /// A character could not be tokenized; carries a message and the
+    /// location of that character.
+    Tokenizing(&'static str, Location),
+    /// An I/O error, converted from `io::Error`.
+    IO(io::Error),
+}
+
+impl From<io::Error> for Error{
+    fn from(e: io::Error) -> Self {
+        Error::IO(e)
+    }
+}
+
+/// The category of a [`Token`].
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub enum TokenKind {
+    Name,
+    Number,
+    Punctuation,
+}
+
+/// A single token: its kind, its text, and the location of its first
+/// character.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct Token {
+    pub kind: TokenKind,
+    pub value: String,
+    pub location: Location,
+}
+
+/// Splits a character stream into [`Token`]s.
+pub struct Lexer<R: Read> {
+    r: CharReader<BufReader<R>>,
+    // Scratch buffer reused across tokens to accumulate token text.
+    buf: String,
+
+    // The most recently consumed character and its location; together they
+    // determine the location of the next character (see `next_loc`).
+    prev_char: char,
+    prev_loc: Location,
+}
+
+impl<R: Read> Lexer<R>{
+
+    /// Returns the location of the next, not yet consumed, character.
+    fn next_loc(&self) -> Location {
+        if self.prev_char == '\n' {
+            // The previous character ended its line, so the next one
+            // starts a new row.
+            Location{
+                row: self.prev_loc.row + 1,
+                col: 0,
+            }
+        } else {
+            Location{
+                row: self.prev_loc.row,
+                col: self.prev_loc.col + 1,
+            }
+        }
+    }
+
+    /// Consumes the character most recently returned by `peek_a_bool`.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `next_char` fails or yields no character, which is not
+    /// expected directly after a successful peek.
+    fn discard(&mut self) {
+        // The location must be computed before `prev_char` is overwritten:
+        // `next_loc` inspects the *previous* character to decide whether a
+        // new row begins. Computing it afterwards would advance the row on
+        // the newline itself and again on the character following it.
+        let loc = self.next_loc();
+
+        self.prev_char = self.r.next_char().
+            expect("discard should only get called after peek").
+            expect("discard should only get called after peek");
+
+        self.prev_loc = loc;
+    }
+
+    /// Peeks at the next character without consuming it.
+    ///
+    /// Returns `(c, true)` when a character is available and `('0', false)`
+    /// at end of input; the character is meaningless when the flag is false.
+    fn peek_a_bool(&mut self) -> Result<(char, bool), Error> {
+        if let Some(c) = self.r.peek_char()? {
+            Ok((c, true))
+        } else {
+            Ok(('0', false))
+        }
+    }
+
+    /// Consumes characters for as long as `pred` accepts them, stopping at
+    /// the first rejected character or at end of input.
+    fn discard_while(&mut self, pred: impl Fn(char) -> bool) -> Result<(), Error> {
+        loop {
+            let (c, ok) = self.peek_a_bool()?;
+            if !ok || !pred(c) {
+                return Ok(());
+            }
+
+            self.discard();
+        }
+    }
+
+    /// Collects the run of characters accepted by `pred` into a token of
+    /// the given kind, located at the first character of the run.
+    ///
+    /// Never returns `Ok(None)`; the `Option` matches the return type of
+    /// `next`, which returns this function's result directly.
+    fn collect_token(
+        &mut self,
+        kind: TokenKind,
+        pred: impl Fn(char) -> bool,
+    ) -> Result<Option<Token>, Error> {
+        let loc = self.next_loc();
+        self.buf.clear();
+
+        loop {
+            let (c, ok) = self.peek_a_bool()?;
+
+            if !ok || !pred(c) {
+                return Ok(Some(Token{
+                    kind,
+                    value: self.buf.clone(),
+                    location: loc,
+                }))
+            }
+
+            self.buf.push(c);
+            self.discard();
+        }
+    }
+
+    /// Reports whether `c` may appear in a number token: `-` or an ASCII
+    /// digit.
+    fn is_number(c: char) -> bool {
+        c == '-' || c.is_ascii_digit()
+    }
+
+    /// Returns the next token, or `Ok(None)` at end of input.
+    ///
+    /// ASCII whitespace is skipped, and `*` starts a comment that runs to
+    /// the end of the line.
+    ///
+    /// # Errors
+    ///
+    /// Returns [`Error::IO`] if peeking at the input fails, and
+    /// [`Error::Tokenizing`] if the next character cannot start a token.
+    pub fn next(&mut self) -> Result<Option<Token>, Error> {
+        loop {
+            let (c, ok) = self.peek_a_bool()?;
+            if !ok {
+                return Ok(None);
+
+            } else if c == '*' {
+                self.discard_while(|c| c != '\n')?;
+                // the terminating newline will be dealt with in the next loop
+
+            } else if c.is_letter() {
+                return self.collect_token(
+                    TokenKind::Name,
+                    |c| c.is_letter() || c.is_number() || c.is_mark() || c == '-',
+                );
+
+            } else if Self::is_number(c) {
+                return self.collect_token(TokenKind::Number, Self::is_number);
+
+            } else if c.is_punctuation() {
+                return self.collect_token(
+                    TokenKind::Punctuation,
+                    |c| c.is_punctuation() || c.is_symbol(),
+                );
+
+            } else if c.is_ascii_whitespace() {
+                self.discard_while(|c| c.is_ascii_whitespace())?;
+
+            } else {
+                return Err(Error::Tokenizing("unexpected character", self.next_loc()));
+            }
+        }
+    }
+}
diff --git a/rust/src/lib.rs b/rust/src/lib.rs
index 39a113d..975fba2 100644
--- a/rust/src/lib.rs
+++ b/rust/src/lib.rs
@@ -1,3 +1,2 @@
-mod graph;
-
-pub use graph::Graph;
+pub mod graph;
+pub mod gg;
diff --git a/rust/src/main.rs b/rust/src/main.rs
index bfb62af..b048f2a 100644
--- a/rust/src/main.rs
+++ b/rust/src/main.rs
@@ -1,4 +1,4 @@
-use ginger::Graph;
+use ginger::graph::Graph;
 
 fn main() {