commit afcf9577d235774b4da9a110383cb5fc5c76050c Author: Michael Zhang Date: Thu Sep 24 03:54:08 2020 -0500 initial diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ea8c4bf --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +/target diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..5f32525 --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,126 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +[[package]] +name = "aho-corasick" +version = "0.7.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "043164d8ba5c4c3035fec9bbee8647c0261d788f3474306f93bb65901cae0e86" +dependencies = [ + "memchr", +] + +[[package]] +name = "anyhow" +version = "1.0.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b602bfe940d21c130f3895acd65221e8a61270debe89d628b9cb4e3ccb8569b" + +[[package]] +name = "capgen" +version = "0.1.0" +dependencies = [ + "anyhow", + "maplit", + "regex", + "thiserror", +] + +[[package]] +name = "lazy_static" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" + +[[package]] +name = "maplit" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3e2e65a1a2e43cfcb47a895c4c8b10d1f4a61097f9f254f183aee60cad9c651d" + +[[package]] +name = "memchr" +version = "2.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3728d817d99e5ac407411fa471ff9800a778d88a24685968b36824eaf4bee400" + +[[package]] +name = "proc-macro2" +version = "1.0.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "36e28516df94f3dd551a587da5357459d9b36d945a7c37c3557928c1c2ff2a2c" +dependencies = [ + "unicode-xid", +] + +[[package]] +name = "quote" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"aa563d17ecb180e500da1cfd2b028310ac758de548efdd203e18f283af693f37" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "regex" +version = "1.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c3780fcf44b193bc4d09f36d2a3c87b251da4a046c87795a0d35f4f927ad8e6" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", + "thread_local", +] + +[[package]] +name = "regex-syntax" +version = "0.6.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26412eb97c6b088a6997e05f69403a802a92d520de2f8e63c2b65f9e0f47c4e8" + +[[package]] +name = "syn" +version = "1.0.41" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6690e3e9f692504b941dc6c3b188fd28df054f7fb8469ab40680df52fdcc842b" +dependencies = [ + "proc-macro2", + "quote", + "unicode-xid", +] + +[[package]] +name = "thiserror" +version = "1.0.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7dfdd070ccd8ccb78f4ad66bf1982dc37f620ef696c6b5028fe2ed83dd3d0d08" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bd80fc12f73063ac132ac92aceea36734f04a1d93c1240c6944e23a3b8841793" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "thread_local" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d40c6d1b69745a6ec6fb1ca717914848da4b44ae29d9b3080cbee91d72a69b14" +dependencies = [ + "lazy_static", +] + +[[package]] +name = "unicode-xid" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f7fe0bb3479651439c9112f72b6c505038574c9fbb575ed1bf3b797fa39dd564" diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..717b073 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,13 @@ +[package] +name = "capgen" +version = "0.1.0" +authors = ["Michael Zhang 
"] +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +anyhow = "1.0.32" +regex = "1.3.9" +thiserror = "1.0.20" +maplit = "1.0.2" diff --git a/rustfmt.toml b/rustfmt.toml new file mode 100644 index 0000000..f718775 --- /dev/null +++ b/rustfmt.toml @@ -0,0 +1,2 @@ +max_width = 75 +wrap_comments = true diff --git a/src/canonical_collection.rs b/src/canonical_collection.rs new file mode 100644 index 0000000..570b169 --- /dev/null +++ b/src/canonical_collection.rs @@ -0,0 +1,28 @@ +use std::collections::HashSet; + +use crate::grammar::Grammar; +use crate::lr0item::Lr0Item; + +pub struct CanonicalCollection<'g> { + grammar: &'g Grammar, + sets: HashSet>, +} + +impl<'g> CanonicalCollection<'g> { + pub fn new(grammar: &'g Grammar) -> Self { + CanonicalCollection { + grammar, + sets: HashSet::default(), + } + } + + pub fn clear(&mut self) { + self.sets.clear(); + } + + pub fn build(&mut self) { + self.clear(); + + // add the start items + } +} diff --git a/src/grammar/context_sets.rs b/src/grammar/context_sets.rs new file mode 100644 index 0000000..07713b0 --- /dev/null +++ b/src/grammar/context_sets.rs @@ -0,0 +1,181 @@ +use std::collections::{HashMap, HashSet}; + +use crate::grammar::Grammar; +use crate::utils::MapOfSet; + +#[derive(Debug, Default)] +pub struct ContextSets {} + +#[derive(Debug, Error)] +pub enum Error {} + +impl Grammar { + pub fn build_context_sets(&mut self) { + // first set construction: dragon book page 189 + + let mut first_sets = MapOfSet::::default(); + let grammar_symbols = self.symbol_names(); + + loop { + let mut to_add = HashSet::<(String, String)>::new(); + + for name in grammar_symbols.iter() { + let name = *name; + if self.is_terminal(name) { + if !first_sets.key_contains(name, name) { + to_add.insert((name.to_owned(), name.to_owned())); + } + } else if self.is_nonterminal(name) { + let x = name; + let productions = + self.productions.get(name).unwrap(); + 
for production in productions.iter() { + // has ε shown up in every FIRST set so far? + let mut is_epsilon_in_all_sets = false; + let mut first = true; + + // if X -> ε is a production + if production.len() == 1 && production[0] == "ε" { + if !first_sets.key_contains(x, "ε") { + to_add.insert(( + x.to_owned(), + "ε".to_owned(), + )); + } + } + + if first && !is_epsilon_in_all_sets { + first = false; + is_epsilon_in_all_sets = true; + } + + // if X is a terminal and X -> Y1 Y2 Y3 .. Yk + for (_i, y_i) in production.iter().enumerate() { + if let Some(first_y) = first_sets.get(y_i) { + for a in first_y { + if is_epsilon_in_all_sets + && !first_sets.key_contains(x, a) + { + to_add.insert(( + x.to_owned(), + a.to_owned(), + )); + } + } + } + + if let Some(first_y) = first_sets.get(y_i) { + if !first_y.contains("ε") { + is_epsilon_in_all_sets = false; + } + } else { + is_epsilon_in_all_sets = false; + } + } + + if is_epsilon_in_all_sets { + // ε is in all first sets for X's production + if !first_sets.key_contains(x, "ε") { + to_add.insert(( + x.to_owned(), + "ε".to_owned(), + )); + } + } + } + } + } + + if to_add.is_empty() { + break; + } else { + for (symbol_name, first_item) in to_add.drain() { + first_sets.insert_item(symbol_name, first_item); + } + } + } + + self.first_sets = first_sets; + } +} + +#[cfg(test)] +mod tests { + use anyhow::Result; + + use super::ContextSets; + + macro_rules! assert_first_sets { + ( + $set:expr, + $( + $lhs:ident + -> + $($arg:ident)* + ),* $(,)? + ) => { + $( + let first_set = $set.get(stringify!($lhs)).expect("should exist"); + $( + assert!(first_set.contains(stringify!($arg))); + )* + )* + } + } + + #[test] + fn example1() -> Result<()> { + let mut grammar = make_grammar! { + E -> T E_, + E_ -> P T E_ | ε, + T -> F T_, + T_ -> A F T_ | ε, + F -> LP E RP | num + % + P -> r"\+", + A -> r"\*", + LP -> r"\(", + RP -> r"\)", + num -> r"[0-9]+", + }; + grammar.build_context_sets(); + + assert_first_sets! 
{ + grammar.first_sets, + E -> LP num, + T -> LP num, + F -> LP num, + E_ -> P ε, + T_ -> A ε, + }; + + Ok(()) + } + + #[test] + fn example2() -> Result<()> { + let mut grammar = make_grammar! { + S -> A C B | C b b | B a, + A -> d a | B C, + B -> g | ε, + C -> h | ε, + % + a -> "a", + b -> "b", + d -> "d", + g -> "g", + h -> "h", + }; + grammar.build_context_sets(); + + assert_first_sets! { + grammar.first_sets, + S -> a b d g h ε, + A -> d g h ε, + B -> g ε, + C -> h ε, + }; + + Ok(()) + } +} diff --git a/src/grammar/mod.rs b/src/grammar/mod.rs new file mode 100644 index 0000000..e509498 --- /dev/null +++ b/src/grammar/mod.rs @@ -0,0 +1,138 @@ +mod context_sets; + +use std::collections::{HashMap, HashSet}; + +use regex::Regex; + +use crate::lr0item::Lr0Item; +use crate::utils::MapOfSet; + +#[derive(Debug)] +pub struct Grammar { + /// The start symbols + pub start: HashSet, + + /// Map of terminals from their names to the regex used to identify + /// them + pub terminals: HashMap, + + pub productions: HashMap>>, + + pub first_sets: MapOfSet, +} + +impl Grammar { + pub fn symbol_names(&self) -> Vec<&str> { + self.terminals + .keys() + .map(|s| s.as_str()) + .chain(self.productions.keys().map(|s| s.as_str())) + .collect() + } + + pub fn is_terminal(&self, name: impl AsRef) -> bool { + self.terminals.contains_key(name.as_ref()) + } + + pub fn is_nonterminal(&self, name: impl AsRef) -> bool { + self.productions.contains_key(name.as_ref()) + } + + pub fn build(&mut self) { + self.build_context_sets(); + } + + pub fn closure(&self, I: HashSet) -> HashSet { + // closure described in dragon book page 223 + let mut J = I.clone(); + let mut to_add = HashSet::new(); + loop { + for item in J.iter() { + let A = &item.lhs; + let B = item.next_symbol(); + let productions = match self.productions.get(B) { + Some(v) => v, + None => continue, + }; + for production in productions { + let item = Lr0Item { + lhs: B.to_owned(), + rhs: production.clone(), + dot: 0, + }; + + if 
!J.contains(&item) { + to_add.insert(item); + } + } + } + + if to_add.is_empty() { + break; + } else { + J.extend(to_add.drain()); + } + } + J + } + + pub fn goto(&self, I: HashSet, X: &str) -> HashSet { + let mut result = HashSet::new(); + let mut to_add = HashSet::new(); + loop { + for item in I.iter() { + let A = &item.lhs; + let B = item.next_symbol(); + let productions = match self.productions.get(B) { + Some(v) => v, + None => continue, + }; + for production in productions {} + } + + if to_add.is_empty() { + break; + } else { + result.extend(to_add.drain()); + } + } + return self.closure(result); + } +} + +#[cfg(test)] +mod tests { + use crate::lr0item::Lr0Item; + + #[test] + fn closure1() { + let grammar = make_grammar! { + E_ -> E, + E -> E p T | T, + T -> T a F | F, + F -> lp E rp | num, + % + p -> r"\+", + a -> r"\*", + lp -> r"\(", + rp -> r"\)", + num -> r"[0-9]+", + }; + + let item = Lr0Item { + lhs: string!(E_), + rhs: vec![string!(E)], + dot: 0, + }; + + let j = grammar.closure(hashset! { item }); + // TODO: verify this later + // E_ -> . E + // E -> . E p T + // E -> . T + // T -> . T a F + // T -> . F + // F -> . lp E rp + // F -> . 
/// An LR(0) item: a production `lhs -> rhs` with a dot marking a position
/// in the right-hand side (dragon book page 221).
#[derive(Debug, Clone, Hash, Eq, PartialEq)]
pub struct Lr0Item {
    /// Symbol name on the left-hand side of the production.
    pub lhs: String,
    /// Symbol names on the right-hand side, in order.
    pub rhs: Vec<String>,
    /// Index into `rhs` of the symbol immediately after the dot;
    /// `rhs.len()` means the dot is at the very end.
    pub dot: usize,
}

impl Lr0Item {
    /// Returns the symbol immediately after the dot, or `None` when the
    /// dot is at the end of the right-hand side.
    pub fn try_next_symbol(&self) -> Option<&str> {
        self.rhs.get(self.dot).map(String::as_str)
    }

    /// Returns the symbol immediately after the dot.
    ///
    /// # Panics
    ///
    /// Panics if the dot is at the end of the right-hand side, because
    /// there is no next symbol. Use [`Lr0Item::try_next_symbol`] when the
    /// item may be complete.
    pub fn next_symbol(&self) -> &str {
        match self.try_next_symbol() {
            Some(symbol) => symbol,
            None => panic!("no symbol after the dot in item `{}`", self),
        }
    }
}

impl Display for Lr0Item {
    /// Formats the item as `lhs -> a . b c`, with ` .` marking the dot.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "{} ->", self.lhs)?;
        for (i, sym) in self.rhs.iter().enumerate() {
            if i == self.dot {
                write!(f, " .")?;
            }
            write!(f, " {}", sym)?;
        }
        // The loop only prints a dot that precedes some symbol; a dot at
        // the end of the right-hand side has to be written separately.
        if self.dot >= self.rhs.len() {
            write!(f, " .")?;
        }
        Ok(())
    }
}
/// A map from keys to sets of values: each key owns a `HashSet` holding
/// the values that have been inserted under it.
pub struct MapOfSet<K, V>(HashMap<K, HashSet<V>>);

impl<K: Debug, V: Debug> Debug for MapOfSet<K, V> {
    /// Formats exactly like the wrapped `HashMap`.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        Debug::fmt(&self.0, f)
    }
}

impl<K, V> Default for MapOfSet<K, V> {
    /// Creates an empty map.
    fn default() -> Self {
        Self(HashMap::new())
    }
}

impl<K: Hash + Eq, V: Hash + Eq> MapOfSet<K, V> {
    /// Adds `value` to the set stored under `key`, creating the set if
    /// the key is new. Inserting a value that is already present is a
    /// no-op.
    pub fn insert_item(&mut self, key: K, value: V) {
        // The entry API finds or creates the set with a single lookup.
        self.0.entry(key).or_default().insert(value);
    }

    /// Returns the set stored under `key`, or `None` if nothing has been
    /// inserted for that key.
    pub fn get<Q>(&self, key: &Q) -> Option<&HashSet<V>>
    where
        K: Borrow<Q>,
        Q: Hash + Eq + ?Sized,
    {
        self.0.get(key)
    }

    /// Returns `true` if `key` exists and its set contains `value`.
    pub fn key_contains<QK, QV>(&self, key: &QK, value: &QV) -> bool
    where
        K: Borrow<QK>,
        QK: Hash + Eq + ?Sized,
        V: Borrow<QV>,
        QV: Hash + Eq + ?Sized,
    {
        self.0
            .get(key)
            .map_or(false, |container| container.contains(value))
    }

    /// Iterates over every key together with its set, in the wrapped
    /// map's iteration order.
    pub fn iter(&self) -> impl Iterator<Item = (&K, &HashSet<V>)> {
        self.0.iter()
    }
}