commit 18b4ac0d4b79377428a0a32c16712057cc0a9a61
Author: Greg Shuflin
Date: Thu Mar 9 17:30:07 2023 -0800

    Squashed 'subtrees/parser-combinator/' content from commit 5526ce7

    git-subtree-dir: subtrees/parser-combinator
    git-subtree-split: 5526ce7bd17beda52047fbc3442e23e0174b79a7

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..4fffb2f
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+/target
+/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 0000000..b3061b2
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,13 @@
+[package]
+name = "parser-combinator"
+version = "0.1.0"
+edition = "2021"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+arbitrary = "1.2.0"
+proptest = "1.0.0"
+
+[dev-dependencies]
+rstest = "0.16.0"
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..e78ada3
--- /dev/null
+++ b/README.md
@@ -0,0 +1,10 @@
+# Rust Parser Combinator
+
+This is a super-basic Rust parser combinator library I wrote mostly
+as an exercise for myself. It is inspired by [nom](https://github.com/rust-bakery/nom)
+and [chumsky](https://github.com/zesterer/chumsky).
+
+## Ideas for future work
+
+* See if some of the ideas in [Efficient Parsing with Parser Combinators](https://research.rug.nl/en/publications/efficient-parsing-with-parser-combinators)
+can be incorporated here.
diff --git a/src/choice/mod.rs b/src/choice/mod.rs
new file mode 100644
index 0000000..46063dd
--- /dev/null
+++ b/src/choice/mod.rs
@@ -0,0 +1,198 @@
+use crate::parser::{ParseResult, Parser, ParserInput, Representation};
+
+// NOTE(review): the generic parameter lists in this file were lost when the patch
+// was extracted and are reconstructed here from how each item is used; confirm
+// them against the upstream commit.
+
+/// Convenience wrapper around [`choice`] for exactly two alternatives.
+pub fn choice2<P1, P2, I, O, E>(parser1: P1, parser2: P2) -> impl Parser<I, O, E>
+where
+    P1: Parser<I, O, E>,
+    P2: Parser<I, O, E>,
+    I: ParserInput + Clone,
+{
+    choice((parser1, parser2))
+}
+
+/// Builds a parser from a tuple of alternatives (any type implementing [`Choice`]).
+///
+/// The representation is computed once, up front, and returned alongside the
+/// parsing closure.
+pub fn choice<C, I, O, E>(choices: C) -> impl Parser<I, O, E>
+where
+    C: Choice<I, O, E>,
+    I: ParserInput + Clone,
+{
+    let rep = choices.representation();
+    (move |input| choices.parse(input), rep)
+}
+
+/// A fixed-size group of alternative parsers sharing input, output and error types.
+pub trait Choice<I, O, E> {
+    /// Runs the alternatives against `input`.
+    fn parse(&self, input: I) -> ParseResult<I, O, E>;
+    /// Combined representation of all alternatives.
+    fn representation(&self) -> Representation;
+}
+
+impl<P1, P2, I, O, E> Choice<I, O, E> for (P1, P2)
+where
+    P1: Parser<I, O, E>,
+    P2: Parser<I, O, E>,
+    I: ParserInput + Clone,
+{
+    fn parse(&self, input: I) -> ParseResult<I, O, E> {
+        let parsers = vec![&self.0 as &dyn Parser<I, O, E>, &self.1];
+        choice_loop(input, parsers)
+    }
+
+    fn representation(&self) -> Representation {
+        let parsers = vec![&self.0 as &dyn Parser<I, O, E>, &self.1];
+        repr_loop(parsers)
+    }
+}
+
+impl<P1, P2, P3, I, O, E> Choice<I, O, E> for (P1, P2, P3)
+where
+    P1: Parser<I, O, E>,
+    P2: Parser<I, O, E>,
+    P3: Parser<I, O, E>,
+    I: ParserInput + Clone,
+{
+    fn parse(&self, input: I) -> ParseResult<I, O, E> {
+        let parsers = vec![&self.0 as &dyn Parser<I, O, E>, &self.1, &self.2];
+        choice_loop(input, parsers)
+    }
+
+    fn representation(&self) -> Representation {
+        let parsers = vec![&self.0 as &dyn Parser<I, O, E>, &self.1, &self.2];
+        repr_loop(parsers)
+    }
+}
+
+impl<P1, P2, P3, P4, I, O, E> Choice<I, O, E> for (P1, P2, P3, P4)
+where
+    P1: Parser<I, O, E>,
+    P2: Parser<I, O, E>,
+    P3: Parser<I, O, E>,
+    P4: Parser<I, O, E>,
+    I: ParserInput + Clone,
+{
+    fn parse(&self, input: I) -> ParseResult<I, O, E> {
+        let parsers = vec![&self.0 as &dyn Parser<I, O, E>, &self.1, &self.2, &self.3];
+        choice_loop(input, parsers)
+    }
+
+    fn representation(&self) -> Representation {
+        let parsers = vec![&self.0 as &dyn Parser<I, O, E>, &self.1, &self.2, &self.3];
+        repr_loop(parsers)
+    }
+}
+
+impl<P1, P2, P3, P4, P5, I, O, E> Choice<I, O, E> for (P1, P2, P3, P4, P5)
+where
+    P1: Parser<I, O, E>,
+    P2: Parser<I, O, E>,
+    P3: Parser<I, O, E>,
+    P4: Parser<I, O, E>,
+    P5: Parser<I, O, E>,
+    I: ParserInput + Clone,
+{
+
fn parse(&self, input: I) -> ParseResult<I, O, E> {
+        let parsers = vec![
+            &self.0 as &dyn Parser<I, O, E>,
+            &self.1,
+            &self.2,
+            &self.3,
+            &self.4,
+        ];
+        choice_loop(input, parsers)
+    }
+
+    fn representation(&self) -> Representation {
+        let parsers = vec![
+            &self.0 as &dyn Parser<I, O, E>,
+            &self.1,
+            &self.2,
+            &self.3,
+            &self.4,
+        ];
+        repr_loop(parsers)
+    }
+}
+
+// NOTE(review): generic parameter lists below were lost in extraction and are
+// reconstructed from usage; confirm against the upstream commit.
+impl<P1, P2, P3, P4, P5, P6, I, O, E> Choice<I, O, E> for (P1, P2, P3, P4, P5, P6)
+where
+    P1: Parser<I, O, E>,
+    P2: Parser<I, O, E>,
+    P3: Parser<I, O, E>,
+    P4: Parser<I, O, E>,
+    P5: Parser<I, O, E>,
+    P6: Parser<I, O, E>,
+    I: ParserInput + Clone,
+{
+    fn parse(&self, input: I) -> ParseResult<I, O, E> {
+        let parsers = vec![
+            &self.0 as &dyn Parser<I, O, E>,
+            &self.1,
+            &self.2,
+            &self.3,
+            &self.4,
+            &self.5,
+        ];
+        choice_loop(input, parsers)
+    }
+    fn representation(&self) -> Representation {
+        let parsers = vec![
+            &self.0 as &dyn Parser<I, O, E>,
+            &self.1,
+            &self.2,
+            &self.3,
+            &self.4,
+            &self.5,
+        ];
+        repr_loop(parsers)
+    }
+}
+
+/// Tries each parser in order on a fresh clone of `input`.
+///
+/// Returns the first success. If every parser fails, the error of the *last*
+/// parser tried is returned.
+///
+/// # Panics
+///
+/// Panics if `parsers` is empty; every caller in this module passes at least two.
+fn choice_loop<I, O, E>(input: I, parsers: Vec<&dyn Parser<I, O, E>>) -> ParseResult<I, O, E>
+where
+    I: ParserInput + Clone,
+{
+    //TODO need a more principled way to return an error when no choices work
+    let mut last_err = None;
+
+    for parser in parsers {
+        match parser.parse(input.clone()) {
+            Ok(result) => return Ok(result),
+            Err(e) => last_err = Some(e),
+        }
+    }
+    Err(last_err.expect("choice_loop requires at least one parser"))
+}
+
+/// Collects the representation of every alternative and combines them with
+/// `Representation::from_choice`.
+fn repr_loop<I, O, E>(parsers: Vec<&dyn Parser<I, O, E>>) -> Representation
+where
+    I: ParserInput + Clone,
+{
+    let mut iter = parsers.iter().map(|p| p.representation());
+    Representation::from_choice(&mut iter)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::combinators::repeated;
+    use crate::primitives::literal;
+
+    #[test]
+    fn test_choice() {
+        let p = choice2(
+            literal("gnostika").to(1),
+            repeated(literal(" ")).at_least(1).to(2),
+        );
+        assert_eq!(p.parse("gnostika twentynine"), Ok((1, " twentynine")));
+    }
+
+    #[test]
+    fn test_several_choices() {
+        let p = choice((
+            literal("a").to(1),
+            literal("q").to(10),
+            repeated(literal("chutney")).to(200),
+            literal("banana").to(10000),
+        ));
+
+        assert_eq!(p.parse("q
drugs").unwrap(), (10, " drugs"));
+    }
+}
diff --git a/src/combinators/map.rs b/src/combinators/map.rs
new file mode 100644
index 0000000..e43d6be
--- /dev/null
+++ b/src/combinators/map.rs
@@ -0,0 +1,16 @@
+use crate::parser::{Parser, ParserInput};
+
+// NOTE(review): the generic parameter list was lost in extraction and is
+// reconstructed from usage; confirm against the upstream commit.
+/// Wraps `parser` so that a successful output is passed through `map_fn`.
+///
+/// The remaining input and any error are forwarded unchanged, and the wrapped
+/// parser keeps the representation of `parser`.
+pub fn map<P, F, I, O1, O2, E>(parser: P, map_fn: F) -> impl Parser<I, O2, E>
+where
+    I: ParserInput,
+    P: Parser<I, O1, E>,
+    F: Fn(O1) -> O2,
+{
+    let rep = parser.representation();
+    let p = move |input| {
+        parser
+            .parse(input)
+            .map(|(result, rest)| (map_fn(result), rest))
+    };
+    (p, rep)
+}
diff --git a/src/combinators/mod.rs b/src/combinators/mod.rs
new file mode 100644
index 0000000..364df44
--- /dev/null
+++ b/src/combinators/mod.rs
@@ -0,0 +1,66 @@
+mod map;
+mod optional;
+mod repeated;
+mod separated_by;
+
+pub use map::map;
+pub use optional::optional;
+pub use repeated::repeated;
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::parser::Parser;
+    use crate::primitives::literal;
+
+    #[test]
+    fn test_map() {
+        let lit_a = literal("a");
+        let output = lit_a.map(|s| s.to_uppercase()).parse("a yolo");
+        assert_eq!(output.unwrap(), ("A".to_string(), " yolo"));
+    }
+
+    #[test]
+    fn test_one_or_more() {
+        let p = repeated(literal("bongo ")).at_least(1);
+        let input = "bongo bongo bongo bongo bongo ";
+
+        let (output, rest) = p.parse(input).unwrap();
+        assert_eq!(rest, "");
+        assert_eq!(output.len(), 5);
+
+        let (output, rest) = p.parse("bongo ecks").unwrap();
+        assert_eq!(output.len(), 1);
+        assert_eq!(rest, "ecks");
+    }
+
+    #[test]
+    fn test_separated_by() {
+        // Without `allow_trailing`, a dangling delimiter is an error.
+        let p = repeated(literal("garb").to(20))
+            .separated_by(repeated(literal(" ")).at_least(1), false);
+
+        assert_eq!(
+            p.parse("garb garb garb garb").unwrap(),
+            (vec![20, 20, 20, 20], "")
+        );
+
+        assert!(p.parse("garb garb garb garb ").is_err());
+
+        // With `allow_trailing`, the dangling delimiter is consumed and ignored.
+        let p =
+            repeated(literal("garb").to(20)).separated_by(repeated(literal(" ")).at_least(1), true);
+
+        assert_eq!(
+            p.parse("garb garb garb garb").unwrap(),
+            (vec![20, 20, 20, 20], "")
+        );
+
+        assert_eq!(
+            p.parse("garb garb
garb garb ").unwrap(),
+            (vec![20, 20, 20, 20], "")
+        );
+        assert_eq!(
+            p.parse("garb garb garb garb q").unwrap(),
+            (vec![20, 20, 20, 20], "q")
+        );
+    }
+}
diff --git a/src/combinators/optional.rs b/src/combinators/optional.rs
new file mode 100644
index 0000000..cd6e636
--- /dev/null
+++ b/src/combinators/optional.rs
@@ -0,0 +1,17 @@
+use crate::parser::{Parser, ParserInput, Representation};
+
+// NOTE(review): the generic parameter list was lost in extraction and is
+// reconstructed from usage; confirm against the upstream commit.
+/// Makes `parser` optional: a failure becomes `Ok((None, input))` with the
+/// original input untouched, a success is wrapped in `Some`.
+///
+/// The representation is the inner parser's representation or `ε`.
+pub fn optional<P, I, O, E>(parser: P) -> impl Parser<I, Option<O>, E>
+where
+    P: Parser<I, O, E>,
+    I: ParserInput + Clone,
+{
+    let rep = Representation::from_choice(
+        &mut [parser.representation(), Representation::new("ε")].into_iter(),
+    );
+    // The input is cloned so the original can be handed back on failure.
+    let p = move |input: I| match parser.parse(input.clone()) {
+        Ok((output, rest)) => Ok((Some(output), rest)),
+        Err(_e) => Ok((None, input)),
+    };
+
+    (p, rep)
+}
diff --git a/src/combinators/repeated.rs b/src/combinators/repeated.rs
new file mode 100644
index 0000000..c25f3c3
--- /dev/null
+++ b/src/combinators/repeated.rs
@@ -0,0 +1,94 @@
+use crate::combinators::separated_by::SeparatedBy;
+use crate::parser::{BoxedParser, ParseResult, Parser, ParserInput, Representation};
+
+// NOTE(review): the stripped generic arguments in this file (`Parser<I, O, I>`,
+// `Option<u16>`, `separated_by<D, O2>`) are reconstructed from usage; confirm
+// against the upstream commit.
+/// Starts a repetition of `parser` with no lower or upper bound set.
+pub fn repeated<'a, P, I, O>(parser: P) -> Repeated<'a, I, O>
+where
+    P: Parser<I, O, I> + 'a,
+    I: ParserInput + Clone + 'a,
+{
+    Repeated {
+        inner_parser: BoxedParser::new(parser),
+        at_least: None,
+        at_most: None,
+    }
+}
+
+/// Repetition builder: a boxed inner parser plus optional count bounds.
+pub struct Repeated<'a, I, O>
+where
+    I: ParserInput + Clone,
+{
+    pub(super) inner_parser: BoxedParser<'a, I, O, I>,
+    pub(super) at_least: Option<u16>,
+    pub(super) at_most: Option<u16>,
+}
+
+impl<'a, I, O> Repeated<'a, I, O>
+where
+    I: ParserInput + Clone,
+{
+    /// Sets the minimum number of repetitions.
+    pub fn at_least(self, n: u16) -> Self {
+        Self {
+            at_least: Some(n),
+            ..self
+        }
+    }
+    /// Sets the maximum number of repetitions.
+    pub fn at_most(self, n: u16) -> Self {
+        Self {
+            at_most: Some(n),
+            ..self
+        }
+    }
+
+    /// Turns the repetition into a delimiter-separated list.
+    ///
+    /// The delimiter's output is discarded (it is mapped to `()`).
+    pub fn separated_by<D, O2>(self, delimiter: D, allow_trailing: bool) -> SeparatedBy<'a, I, O>
+    where
+        D: Parser<I, O2, I> + 'a,
+        O2: 'a,
+        I: 'a,
+    {
+        SeparatedBy {
+            inner_repeated: self,
+            delimiter: BoxedParser::new(delimiter.to(())),
+            allow_trailing,
+
}
+    }
+}
+
+// NOTE(review): the `Parser<I, Vec<O>, I>` / `ParseResult<I, Vec<O>, I>` arguments
+// were lost in extraction and are reconstructed from usage; confirm upstream.
+impl<'a, I, O> Parser<I, Vec<O>, I> for Repeated<'a, I, O>
+where
+    I: ParserInput + Clone + 'a,
+{
+    /// Applies the inner parser until it fails or `at_most` items were collected.
+    ///
+    /// Fails with the original `input` when fewer than `at_least` items matched.
+    /// Unset bounds default to `0` and `u16::MAX`.
+    fn parse(&self, input: I) -> ParseResult<I, Vec<O>, I> {
+        let at_least = self.at_least.unwrap_or(0);
+        let at_most = self.at_most.unwrap_or(u16::MAX);
+
+        if at_most == 0 {
+            return Ok((vec![], input));
+        }
+
+        let mut results = Vec::new();
+        let mut count: u16 = 0;
+        let mut further_input = input.clone();
+
+        while let Ok((item, rest)) = self.inner_parser.parse(further_input.clone()) {
+            results.push(item);
+            further_input = rest;
+            count += 1;
+            // `count` never exceeds `at_most` (<= u16::MAX), so it cannot overflow.
+            if count >= at_most {
+                break;
+            }
+        }
+        if count < at_least {
+            return Err(input);
+        }
+
+        Ok((results, further_input))
+    }
+
+    fn representation(&self) -> Representation {
+        Representation::repeated(
+            self.inner_parser.representation(),
+            self.at_least.unwrap_or(0),
+            self.at_most.unwrap_or(u16::MAX),
+        )
+    }
+}
diff --git a/src/combinators/separated_by.rs b/src/combinators/separated_by.rs
new file mode 100644
index 0000000..d60cd23
--- /dev/null
+++ b/src/combinators/separated_by.rs
@@ -0,0 +1,84 @@
+use crate::combinators::repeated::Repeated;
+use crate::parser::{BoxedParser, ParseResult, Parser, ParserInput, Representation};
+
+/// A [`Repeated`] parser whose items are separated by a delimiter parser.
+pub struct SeparatedBy<'a, I, O>
+where
+    I: ParserInput + Clone,
+{
+    pub(super) inner_repeated: Repeated<'a, I, O>,
+    pub(super) delimiter: BoxedParser<'a, I, (), I>,
+    pub(super) allow_trailing: bool,
+}
+
+impl<'a, I, O> Parser<I, Vec<O>, I> for SeparatedBy<'a, I, O>
+where
+    I: ParserInput + Clone + 'a,
+{
+    fn representation(&self) -> Representation {
+        Representation::new("sepby")
+    }
+
+    fn parse(&self, input: I) -> ParseResult<I, Vec<O>, I> {
+        let at_least = self.inner_repeated.at_least.unwrap_or(0);
+        let at_most = self.inner_repeated.at_most.unwrap_or(u16::MAX);
+        let parser = &self.inner_repeated.inner_parser;
+        let delimiter = &self.delimiter;
+
+        if at_most == 0 {
+            return Ok((vec![], input));
+        }
+
+        let mut results = Vec::new();
+        let mut count: u16 = 0;
+        let mut further_input;
+
+        match
parser.parse(input.clone()) {
+            Ok((item, rest)) => {
+                results.push(item);
+                further_input = rest;
+                // The first element counts toward `at_least` / `at_most` as well.
+                count += 1;
+            }
+            Err(_e) => {
+                // No first element: only acceptable when nothing is required.
+                if at_least > 0 {
+                    return Err(input);
+                } else {
+                    return Ok((vec![], input));
+                }
+            }
+        }
+
+        // Consume `delimiter item` pairs until the upper bound is reached or the
+        // delimiter no longer matches.
+        while count < at_most {
+            match delimiter.parse(further_input.clone()) {
+                Ok(((), rest)) => {
+                    further_input = rest;
+                }
+                Err(_e) => {
+                    break;
+                }
+            }
+
+            match parser.parse(further_input.clone()) {
+                Ok((item, rest)) => {
+                    results.push(item);
+                    further_input = rest;
+                    count += 1;
+                }
+                // A delimiter with no item after it: tolerated (and consumed)
+                // only when trailing delimiters are allowed.
+                Err(_e) if self.allow_trailing => {
+                    break;
+                }
+                Err(e) => {
+                    return Err(e);
+                }
+            }
+        }
+
+        if count < at_least {
+            return Err(input);
+        }
+
+        Ok((results, further_input))
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 0000000..acb73d8
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,7 @@
+pub mod choice;
+pub mod combinators;
+mod parser;
+pub mod primitives;
+pub mod sequence;
+
+pub use parser::{ParseResult, Parser, ParserInput, Representation};
diff --git a/src/parser/boxed_parser.rs b/src/parser/boxed_parser.rs
new file mode 100644
index 0000000..53509c1
--- /dev/null
+++ b/src/parser/boxed_parser.rs
@@ -0,0 +1,38 @@
+use crate::parser::{ParseResult, Parser, ParserInput, Representation};
+
+// NOTE(review): generic argument lists in this file and in `parser/mod.rs` were
+// lost in extraction and are reconstructed from usage; confirm upstream.
+/// Type-erased parser: owns any `Parser<I, O, E>` behind a `Box<dyn …>`.
+pub struct BoxedParser<'a, I, O, E>
+where
+    I: ParserInput,
+{
+    inner: Box<dyn Parser<I, O, E> + 'a>,
+}
+
+impl<'a, I, O, E> BoxedParser<'a, I, O, E>
+where
+    I: ParserInput,
+{
+    pub(crate) fn new<P>(inner: P) -> Self
+    where
+        P: Parser<I, O, E> + 'a,
+    {
+        BoxedParser {
+            inner: Box::new(inner),
+        }
+    }
+}
+
+impl<'a, I: ParserInput, O, E> Parser<I, O, E> for BoxedParser<'a, I, O, E> {
+    fn representation(&self) -> Representation {
+        self.inner.representation()
+    }
+    fn parse(&self, input: I) -> ParseResult<I, O, E> {
+        self.inner.parse(input)
+    }
+
+    // Already boxed: return `self` instead of wrapping a second time.
+    fn boxed<'b>(self) -> BoxedParser<'b, I, O, E>
+    where
+        Self: Sized + 'b,
+    {
+        self
+    }
+}
diff --git a/src/parser/mod.rs b/src/parser/mod.rs
new file mode 100644
index 0000000..cb540e9
--- /dev/null
+++ b/src/parser/mod.rs
@@ -0,0 +1,179 @@
+mod boxed_parser;
+mod named_parser;
+mod parser_input;
+mod representation;
+
+use std::rc::Rc;
+
+pub use boxed_parser::BoxedParser;
+pub use named_parser::NamedParser;
+pub use parser_input::ParserInput;
+pub use representation::Representation;
+
+/// Result of running a parser: `(output, remaining input)` on success, `E` on failure.
+pub type ParseResult<I, O, E> = Result<(O, I), E>;
+
+/// A parser over input `I` producing `O` or failing with `E`.
+pub trait Parser<I, O, E>
+where
+    I: ParserInput,
+{
+    fn parse(&self, input: I) -> ParseResult<I, O, E>;
+
+    fn representation(&self) -> Representation;
+
+    /// Erases the concrete parser type.
+    fn boxed<'a>(self) -> BoxedParser<'a, I, O, E>
+    where
+        Self: Sized + 'a,
+    {
+        BoxedParser::new(self)
+    }
+
+    /// Transforms a successful output with `map_fn`.
+    fn map<'a, F, O2>(self, map_fn: F) -> BoxedParser<'a, I, O2, E>
+    where
+        Self: Sized + 'a,
+        I: 'a,
+        E: 'a,
+        O: 'a,
+        O2: 'a,
+        F: Fn(O) -> O2 + 'a,
+    {
+        crate::combinators::map(self, map_fn).boxed()
+    }
+
+    /// Replaces a successful output with a clone of `item`.
+    fn to<'a, O2>(self, item: O2) -> BoxedParser<'a, I, O2, E>
+    where
+        Self: Sized + 'a,
+        I: 'a,
+        O: 'a,
+        O2: Clone + 'a,
+        E: 'a,
+    {
+        self.map(move |_| item.clone())
+    }
+
+    /// Runs `self`, then `next_parser` on the rest; yields both outputs.
+    fn then<'a, P, O2>(self, next_parser: P) -> BoxedParser<'a, I, (O, O2), E>
+    where
+        Self: Sized + 'a,
+        I: 'a,
+        O: 'a,
+        O2: 'a,
+        E: 'a,
+        P: Parser<I, O2, E> + 'a,
+    {
+        crate::sequence::tuple2(self, next_parser).boxed()
+    }
+
+    /// Like [`Parser::then`], but keeps only the second output.
+    fn ignore_then<'a, P, O2>(self, next_parser: P) -> BoxedParser<'a, I, O2, E>
+    where
+        Self: Sized + 'a,
+        I: 'a,
+        O: 'a,
+        O2: 'a,
+        E: 'a,
+        P: Parser<I, O2, E> + 'a,
+    {
+        crate::sequence::tuple2(self, next_parser).map(|(_, next_output)| next_output)
+    }
+
+    fn
then_ignore<'a, P, O2>(self, next_parser: P) -> BoxedParser<'a, I, O, E>
+    where
+        Self: Sized + 'a,
+        I: 'a,
+        O: 'a,
+        O2: 'a,
+        E: 'a,
+        P: Parser<I, O2, E> + 'a,
+    {
+        crate::sequence::tuple2(self, next_parser).map(|(this_output, _)| this_output)
+    }
+
+    // NOTE(review): the `Parser<…>` / `ParseResult<…>` / `Option<…>` / `Rc<…>`
+    // arguments from here on were lost in extraction and are reconstructed from
+    // usage; confirm against the upstream commit.
+    /// Parses `left`, `self`, `right` in sequence and keeps only `self`'s output.
+    fn delimited<'a, P1, O1, P2, O2>(self, left: P1, right: P2) -> BoxedParser<'a, I, O, E>
+    where
+        Self: Sized + 'a,
+        I: 'a,
+        O1: 'a,
+        O2: 'a,
+        O: 'a,
+        E: 'a,
+        P1: Parser<I, O1, E> + 'a,
+        P2: Parser<I, O2, E> + 'a,
+    {
+        crate::sequence::seq((left, self, right)).map(|(_, output, _)| output)
+    }
+
+    /// Parses `surrounding`, `self`, `surrounding` and keeps only `self`'s output.
+    fn surrounded_by<'a, P, O1>(self, surrounding: P) -> BoxedParser<'a, I, O, E>
+    where
+        Self: Sized + 'a,
+        I: 'a,
+        O1: 'a,
+        O: 'a,
+        E: 'a,
+        P: Parser<I, O1, E> + 'a,
+    {
+        BoxedParser::new(move |input| {
+            // `surrounding` is needed twice, so it is borrowed through closures
+            // rather than moved into the sequence.
+            let p1 = |i| surrounding.parse(i);
+            let p2 = |i| surrounding.parse(i);
+            let main = |i| self.parse(i);
+            crate::sequence::seq((p1, main, p2))
+                .map(|(_, output, _)| output)
+                .parse(input)
+        })
+    }
+
+    /// Makes this parser optional; see `crate::combinators::optional`.
+    fn optional<'a>(self) -> BoxedParser<'a, I, Option<O>, E>
+    where
+        I: Clone + 'a,
+        O: 'a,
+        E: 'a,
+        Self: Sized + 'a,
+    {
+        crate::combinators::optional(self).boxed()
+    }
+
+    /// Attaches a name to this parser.
+    fn named<'a>(self, parser_name: &str) -> NamedParser<'a, I, O, E>
+    where
+        Self: Sized + 'a,
+        I: 'a,
+    {
+        NamedParser::new(self.boxed(), parser_name.to_string())
+    }
+}
+
+/// Any `Fn(I) -> ParseResult` is a parser; it has only a placeholder representation.
+impl<I: ParserInput, O, E, F> Parser<I, O, E> for F
+where
+    F: Fn(I) -> ParseResult<I, O, E>,
+{
+    fn parse(&self, input: I) -> ParseResult<I, O, E> {
+        self(input)
+    }
+
+    fn representation(&self) -> Representation {
+        Representation::new("NOT IMPL'D")
+    }
+}
+
+/// A parsing function paired with an explicit representation.
+impl<I: ParserInput, O, E, F> Parser<I, O, E> for (F, Representation)
+where
+    F: Fn(I) -> ParseResult<I, O, E>,
+{
+    fn parse(&self, input: I) -> ParseResult<I, O, E> {
+        self.0(input)
+    }
+
+    fn representation(&self) -> Representation {
+        self.1.clone()
+    }
+}
+
+impl<I, O, E, T> Parser<I, O, E> for Rc<T>
+where
+    I: ParserInput,
+    T: Parser<I, O, E>,
+{
+    fn parse(&self, input: I) -> ParseResult<I, O, E> {
+        self.as_ref().parse(input)
+    }
+
+    fn representation(&self) -> Representation {
+        self.as_ref().representation()
+    }
+}
diff --git a/src/parser/named_parser.rs
b/src/parser/named_parser.rs
new file mode 100644
index 0000000..7c5f9c5
--- /dev/null
+++ b/src/parser/named_parser.rs
@@ -0,0 +1,36 @@
+use super::boxed_parser::BoxedParser;
+use crate::parser::{ParseResult, Parser, ParserInput, Representation};
+
+// NOTE(review): the stripped `Parser<I, O, E>` / `ParseResult<I, O, E>` /
+// `Iterator<Item = Representation>` arguments are reconstructed from usage;
+// confirm against the upstream commit.
+/// A boxed parser with a name attached; parsing and representation are
+/// delegated to the inner parser.
+pub struct NamedParser<'a, I, O, E>
+where
+    I: ParserInput,
+{
+    inner_parser: BoxedParser<'a, I, O, E>,
+    name: String,
+}
+
+impl<'a, I, O, E> NamedParser<'a, I, O, E>
+where
+    I: ParserInput,
+{
+    pub(super) fn new(inner_parser: BoxedParser<'a, I, O, E>, name: String) -> Self
+    where
+        I: 'a,
+    {
+        NamedParser { inner_parser, name }
+    }
+
+    /// Returns the name given to this parser.
+    // The borrow is tied to `&self` only, not to the parser lifetime `'a`, so
+    // callers are not forced to keep the parser borrowed for all of `'a`.
+    pub fn get_name(&self) -> &str {
+        &self.name
+    }
+}
+
+impl<'a, I: ParserInput, O, E> Parser<I, O, E> for NamedParser<'a, I, O, E> {
+    fn representation(&self) -> Representation {
+        self.inner_parser.representation()
+    }
+
+    fn parse(&self, input: I) -> ParseResult<I, O, E> {
+        self.inner_parser.parse(input)
+    }
+}
diff --git a/src/parser/parser_input.rs b/src/parser/parser_input.rs
new file mode 100644
index 0000000..0784227
--- /dev/null
+++ b/src/parser/parser_input.rs
@@ -0,0 +1,11 @@
+/// Marker trait for types a parser can consume.
+// NOTE(review): `next_token` takes no receiver and the only impl returns `()`;
+// it looks like a placeholder — confirm intended design.
+pub trait ParserInput: std::fmt::Debug {
+    type Output;
+    fn next_token() -> Self::Output;
+}
+
+impl ParserInput for &str {
+    type Output = ();
+    fn next_token() -> Self::Output {
+        ()
+    }
+}
diff --git a/src/parser/representation.rs b/src/parser/representation.rs
new file mode 100644
index 0000000..bc5ecc2
--- /dev/null
+++ b/src/parser/representation.rs
@@ -0,0 +1,66 @@
+/// Textual, grammar-like description of what a parser accepts.
+#[derive(Debug, Clone, PartialEq)]
+pub struct Representation {
+    val: String,
+}
+
+impl Representation {
+    pub fn new(from: &str) -> Self {
+        Self {
+            val: from.to_string(),
+        }
+    }
+
+    /// Joins the alternatives' representations with `" | "`.
+    pub(crate) fn from_choice(
+        choice_parser_reps: &mut impl Iterator<Item = Representation>,
+    ) -> Self {
+        let mut buf = String::new();
+        let mut iter = choice_parser_reps.peekable();
+        loop {
+            let rep = match iter.next() {
+                Some(r) => r,
+                None => break,
+            };
+            buf.push_str(&rep.val);
+            // Only emit the separator when another alternative follows.
+            match iter.peek() {
+                Some(_) => {
+                    buf.push_str(" | ");
+                }
+                None => {
+
break;
+                }
+            }
+        }
+
+        Representation::new(&buf)
+    }
+
+    // NOTE(review): the stripped `Iterator<Item = Representation>` and
+    // `ParseResult<&str, char, &str>` arguments below are reconstructed from
+    // usage; confirm against the upstream commit.
+    /// Joins the elements' representations with a single space.
+    pub(crate) fn from_sequence(
+        sequence_representations: &mut impl Iterator<Item = Representation>,
+    ) -> Self {
+        let parts: Vec<String> = sequence_representations.map(|rep| rep.val).collect();
+        Representation::new(&parts.join(" "))
+    }
+
+    // TODO use at_least, at_most
+    /// Renders `(underlying)*` when `at_least` is 0 and `(underlying)+` otherwise.
+    pub(crate) fn repeated(underlying: Representation, at_least: u16, _at_most: u16) -> Self {
+        let sigil = if at_least == 0 { "*" } else { "+" };
+        Representation::new(&format!("({}){}", underlying.val, sigil))
+    }
+}
diff --git a/src/primitives/mod.rs b/src/primitives/mod.rs
new file mode 100644
index 0000000..b005118
--- /dev/null
+++ b/src/primitives/mod.rs
@@ -0,0 +1,108 @@
+use crate::parser::{ParseResult, Parser, ParserInput, Representation};
+
+/// Matches exactly the character `expected` at the start of the input.
+pub fn literal_char(expected: char) -> impl Fn(&str) -> ParseResult<&str, char, &str> {
+    move |input| match input.chars().next() {
+        Some(ch) if ch == expected => Ok((expected, &input[ch.len_utf8()..])),
+        _ => Err(input),
+    }
+}
+
+/// Matches the string `expected` at the start of the input.
+///
+/// On success returns `expected` and the remaining input; otherwise fails with
+/// the untouched input. Constructing the parser has no side effects.
+pub fn literal<'a>(expected: &'static str) -> impl Parser<&'a str, &'a str, &'a str> {
+    let rep = Representation::new(expected);
+    // `get` returns `None` (instead of panicking) when the input is too short or
+    // the range would split a multi-byte character.
+    let p = move |input: &'a str| match input.get(0..expected.len()) {
+        Some(next) if next == expected => Ok((expected, &input[expected.len()..])),
+        _ => Err(input),
+    };
+    (p, rep)
+}
+
+/// Consumes one character of any kind; fails only on empty input.
+pub fn any_char(input: &str) -> ParseResult<&str, char, &str> {
+    match input.chars().next() {
+        Some(ch) => Ok((ch, &input[ch.len_utf8()..])),
+        None => Err(input),
+    }
+}
+
+/// Matches a single character that occurs anywhere in `items`.
+pub fn one_of<'a>(items: &'static str) -> impl Parser<&'a str, &'a str, &'a str> {
+    let p = move |input: &'a str| {
+        if let Some(ch) = input.chars().next() {
+            if items.contains(ch) {
+                // Split after the whole character: a fixed offset of 1 would
+                // panic on a multi-byte match.
+                let (first, rest) = input.split_at(ch.len_utf8());
+                return Ok((first, rest));
+            }
+        }
+        Err(input)
+    };
+
+    // Each character is followed by " | ", including the last one; the
+    // representation test in tests/json_parser.rs expects this exact text.
+    let mut s = String::new();
+    for ch in items.chars() {
+        s.push(ch);
+        s.push_str(" | ");
+    }
+    let rep = Representation::new(&s);
+    (p, rep)
+}
+
+// NOTE(review): generic parameter lists in the rest of this patch section were
+// lost in extraction and are reconstructed from usage; confirm upstream.
+/// Runs `parser` and accepts its output only if `pred_fn` returns `true`.
+///
+/// NOTE(review): when the predicate rejects, the error is the input *remaining
+/// after* `parser` ran, not the original input — confirm this is intended.
+pub fn pred<P, F, I, O>(parser: P, pred_fn: F) -> impl Parser<I, O, I>
+where
+    I: ParserInput,
+    P: Parser<I, O, I>,
+    F: Fn(&O) -> bool,
+{
+    let orig_rep = parser.representation();
+    (
+        move |input| {
+            parser.parse(input).and_then(|(result, rest)| {
+                if pred_fn(&result) {
+                    Ok((result, rest))
+                } else {
+                    Err(rest)
+                }
+            })
+        },
+        Representation::new(&format!("{:?} if ", orig_rep)),
+    )
+}
+
+/// Parses a standard identifier in a programming language: one alphabetic
+/// character followed by any number of alphanumeric characters.
+pub fn identifier(input: &str) -> ParseResult<&str, String, &str> {
+    let mut chars = input.chars();
+    let mut buf = String::new();
+
+    match chars.next() {
+        Some(ch) if ch.is_alphabetic() => buf.push(ch),
+        _ => return Err(input),
+    }
+
+    for next in chars {
+        if next.is_alphanumeric() {
+            buf.push(next);
+        } else {
+            break;
+        }
+    }
+
+    // `buf` holds exactly the consumed prefix, so its byte length is the offset
+    // of the remaining input.
+    let next_index = buf.len();
+    Ok((buf, &input[next_index..]))
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_identifier() {
+        assert_eq!(
+            identifier("bongo1beans").unwrap(),
+            (("bongo1beans".to_string(), ""))
+        );
+        assert_eq!(identifier("2bongo1beans"), Err("2bongo1beans"));
+    }
+
+    #[test]
+    fn test_pred() {
+        let p = pred(any_char, |c| *c == 'f');
+        assert_eq!(p.parse("frog"), Ok(('f', "rog")));
+    }
+}
diff --git a/src/sequence/mod.rs b/src/sequence/mod.rs
new file mode 100644
index 0000000..cab1ad9
--- /dev/null
+++ b/src/sequence/mod.rs
@@ -0,0 +1,195 @@
+use crate::parser::{ParseResult, Parser, ParserInput, Representation};
+
+/// Convenience wrapper around [`seq`] for exactly two parsers.
+pub fn tuple2<P1, P2, I, O1, O2, E>(parser1: P1, parser2: P2) -> impl Parser<I, (O1, O2), E>
+where
+    I: ParserInput,
+    P1: Parser<I, O1, E>,
+    P2: Parser<I, O2, E>,
+{
+    seq((parser1, parser2))
+}
+
+/// Builds a parser that runs every element of `sequence` in order.
+pub fn seq<T, I, O, E>(sequence: T) -> impl Parser<I, O, E>
+where
+    I: ParserInput,
+    T: Sequence<I, O, E>,
+{
+    let rep = sequence.representation();
+    let p = move |input| sequence.parse(input);
+    (p, rep)
+}
+
+/* TODO - eventually rewrite this parser combinator in Schala.
Seeing what this
+ * code that makes heavy use of type variables and abstraction over types looks like
+ * in Schala's type system should be educational
+ */
+
+// NOTE(review): generic parameter lists below were lost in extraction and are
+// reconstructed from usage; confirm against the upstream commit.
+/// A tuple of parsers run one after another; the output is the tuple of outputs.
+pub trait Sequence<I, O, E> {
+    /// Runs each parser on the input left over by the previous one, stopping at
+    /// the first error.
+    fn parse(&self, input: I) -> ParseResult<I, O, E>;
+    /// Combined representation of all elements.
+    fn representation(&self) -> Representation;
+}
+
+impl<I, O1, O2, E, P1, P2> Sequence<I, (O1, O2), E> for (P1, P2)
+where
+    I: ParserInput,
+    P1: Parser<I, O1, E>,
+    P2: Parser<I, O2, E>,
+{
+    fn parse(&self, input: I) -> ParseResult<I, (O1, O2), E> {
+        let parser1 = &self.0;
+        let parser2 = &self.1;
+
+        let (result1, rest1) = parser1.parse(input)?;
+        let (result2, rest2) = parser2.parse(rest1)?;
+
+        Ok(((result1, result2), rest2))
+    }
+
+    fn representation(&self) -> Representation {
+        let mut iter = [self.0.representation(), self.1.representation()].into_iter();
+        Representation::from_sequence(&mut iter)
+    }
+}
+
+impl<I, O1, O2, O3, E, P1, P2, P3> Sequence<I, (O1, O2, O3), E> for (P1, P2, P3)
+where
+    I: ParserInput,
+    P1: Parser<I, O1, E>,
+    P2: Parser<I, O2, E>,
+    P3: Parser<I, O3, E>,
+{
+    fn parse(&self, input: I) -> ParseResult<I, (O1, O2, O3), E> {
+        let parser1 = &self.0;
+        let parser2 = &self.1;
+        let parser3 = &self.2;
+
+        let (result1, rest1) = parser1.parse(input)?;
+        let (result2, rest2) = parser2.parse(rest1)?;
+        let (result3, rest3) = parser3.parse(rest2)?;
+
+        Ok(((result1, result2, result3), rest3))
+    }
+
+    fn representation(&self) -> Representation {
+        let mut iter = [
+            self.0.representation(),
+            self.1.representation(),
+            self.2.representation(),
+        ]
+        .into_iter();
+        Representation::from_sequence(&mut iter)
+    }
+}
+
+impl<I, O1, O2, O3, O4, E, P1, P2, P3, P4> Sequence<I, (O1, O2, O3, O4), E> for (P1, P2, P3, P4)
+where
+    I: ParserInput,
+    P1: Parser<I, O1, E>,
+    P2: Parser<I, O2, E>,
+    P3: Parser<I, O3, E>,
+    P4: Parser<I, O4, E>,
+{
+    fn parse(&self, input: I) -> ParseResult<I, (O1, O2, O3, O4), E> {
+        let parser1 = &self.0;
+        let parser2 = &self.1;
+        let parser3 = &self.2;
+        let parser4 = &self.3;
+
+        let (result1, rest1) = parser1.parse(input)?;
+        let (result2, rest2) = parser2.parse(rest1)?;
+        let (result3, rest3) = parser3.parse(rest2)?;
+        let (result4, rest4) = parser4.parse(rest3)?;
+
+        Ok(((result1, result2, result3, result4), rest4))
+    }
+
+    fn representation(&self) -> Representation {
+        let mut iter
= [
+            self.0.representation(),
+            self.1.representation(),
+            self.2.representation(),
+            self.3.representation(),
+        ]
+        .into_iter();
+        Representation::from_sequence(&mut iter)
+    }
+}
+
+// NOTE(review): the generic parameter list below was lost in extraction and is
+// reconstructed from usage; confirm against the upstream commit.
+impl<I, O1, O2, O3, O4, O5, E, P1, P2, P3, P4, P5> Sequence<I, (O1, O2, O3, O4, O5), E>
+    for (P1, P2, P3, P4, P5)
+where
+    I: ParserInput,
+    P1: Parser<I, O1, E>,
+    P2: Parser<I, O2, E>,
+    P3: Parser<I, O3, E>,
+    P4: Parser<I, O4, E>,
+    P5: Parser<I, O5, E>,
+{
+    // Each parser runs on the input left over by the previous one; the first
+    // error aborts the whole sequence.
+    fn parse(&self, input: I) -> ParseResult<I, (O1, O2, O3, O4, O5), E> {
+        let parser1 = &self.0;
+        let parser2 = &self.1;
+        let parser3 = &self.2;
+        let parser4 = &self.3;
+        let parser5 = &self.4;
+
+        let (result1, rest1) = parser1.parse(input)?;
+        let (result2, rest2) = parser2.parse(rest1)?;
+        let (result3, rest3) = parser3.parse(rest2)?;
+        let (result4, rest4) = parser4.parse(rest3)?;
+        let (result5, rest5) = parser5.parse(rest4)?;
+
+        Ok(((result1, result2, result3, result4, result5), rest5))
+    }
+
+    fn representation(&self) -> Representation {
+        let mut iter = [
+            self.0.representation(),
+            self.1.representation(),
+            self.2.representation(),
+            self.3.representation(),
+            self.4.representation(),
+        ]
+        .into_iter();
+        Representation::from_sequence(&mut iter)
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+    use crate::combinators::repeated;
+    use crate::primitives::{identifier, literal};
+
+    #[test]
+    fn test_tuple2() {
+        let p = tuple2(identifier, tuple2(literal(" "), literal("ruts")));
+        let (output, _rest) = p.parse("fort1 ruts").unwrap();
+        assert_eq!(output, ("fort1".into(), (" ", "ruts")));
+
+        // The method form nests to the left instead of to the right.
+        let p = identifier.then(literal(" ")).then(literal("ruts"));
+        let (output, _rest) = p.parse("fort1 ruts").unwrap();
+        assert_eq!(output, (("fort1".into(), " "), "ruts"));
+    }
+
+    #[test]
+    fn test_seq() {
+        let p = seq((
+            literal("bong").to(10),
+            repeated(literal(" ")).to(()),
+            literal("hits").to(20),
+        ));
+        assert_eq!(p.parse("bong hits").unwrap(), ((10, (), 20), ""));
+
+        let p = seq((
+            literal("alpha").to(10),
+            repeated(literal(" ")).to(()),
+            repeated(literal("-")).to(()),
+            repeated(literal(" ")),
+            literal("beta"),
+        ));
+        assert_eq!(
+            p.parse("alpha ------
beta gamma").unwrap(),
+            ((10, (), (), vec![" ", " ", " "], "beta"), " gamma")
+        );
+    }
+}
diff --git a/tests/joplin-cfg.json b/tests/joplin-cfg.json
new file mode 100644
index 0000000..1834f85
--- /dev/null
+++ b/tests/joplin-cfg.json
@@ -0,0 +1,49 @@
+{
+  "$schema": "https://joplinapp.org/schema/settings.json",
+  "locale": "en_GB",
+  "sync.target": 6,
+  "markdown.plugin.softbreaks": false,
+  "markdown.plugin.typographer": false,
+  "spellChecker.language": "en-US",
+  "ui.layout": {
+    "key": "root",
+    "children": [
+      {
+        "key": "sideBar",
+        "width": 250,
+        "visible": true
+      },
+      {
+        "key": "noteList",
+        "width": 250,
+        "visible": true
+      },
+      {
+        "key": "editor",
+        "visible": true,
+        "width": 1493
+      },
+      {
+        "key": "plugin-view-joplin.plugin.note.tabs-note.tabs.panel",
+        "context": {
+          "pluginId": "joplin.plugin.note.tabs"
+        },
+        "visible": true
+      }
+    ],
+    "visible": true
+  },
+  "noteVisiblePanes": [
+    "editor",
+    "viewer"
+  ],
+  "theme": 4,
+  "sync.6.username": "webdav",
+  "net.ignoreTlsErrors": true,
+  "style.editor.contentMaxWidth": 600,
+  "editor.codeView": true,
+  "markdown.plugin.sub": true,
+  "markdown.plugin.sup": true,
+  "markdown.plugin.multitable": true
+}
+
diff --git a/tests/json_parser.rs b/tests/json_parser.rs
new file mode 100644
index 0000000..2b515d5
--- /dev/null
+++ b/tests/json_parser.rs
@@ -0,0 +1,248 @@
+// Integration tests: a small JSON parser assembled from this crate's
+// combinators, exercised with proptest properties and rstest case tables.
+use parser_combinator::choice::choice;
+use parser_combinator::combinators::repeated;
+use parser_combinator::primitives::{any_char, literal, literal_char, one_of, pred};
+use parser_combinator::sequence::seq;
+use parser_combinator::Parser;
+use parser_combinator::Representation;
+
+use proptest::prelude::*;
+
+use rstest::*;
+
+proptest!
{
+    // The parser must never panic, whatever printable input it is given.
+    #[test]
+    fn doesnt_crash(s in "\\PC*") {
+        let _output = json_object().parse(&s);
+    }
+
+    // Any non-empty string without a double quote must round-trip through
+    // `json_string` once wrapped in quotes.
+    #[test]
+    fn parse_string(s in r#"[^"]+"#) {
+        let input = format!("\"{}\"", s);
+        let output = json_string().parse(&input).unwrap();
+        match output {
+            (JsonValue::Str(output_s), "") if output_s == s => (),
+            _ => panic!(),
+        }
+    }
+}
+
+#[test]
+fn test_parsing() {
+    let output = literal("a").parse("a yolo");
+    assert_eq!(output.unwrap(), ("a", " yolo"));
+}
+
+// NOTE(review): the non-terminal names in the grammar below (and the generic
+// arguments `Vec<JsonValue>`, `collect::<String>()`, `parse::<f64>()` further
+// down) were lost in extraction and are reconstructed; confirm upstream.
+/*
+ * JSON BNF
+ * <JSON> ::= <value>
+   <value> ::= <object> | <array> | <boolean> | <string> | <number> | <null>
+   <array> ::= "[" [<value>] {"," <value>}* "]"
+   <object> ::= "{" [<property>] {"," <property>}* "}"
+   <property> ::= <string> ":" <value>
+*/
+#[derive(Debug, Clone, PartialEq)]
+enum JsonValue {
+    Null,
+    Bool(bool),
+    Str(String),
+    Num(f64),
+    Array(Vec<JsonValue>),
+    Object(Vec<(String, JsonValue)>),
+}
+
+// Shorthand for "parser from `&str` to `T` whose error is the `&str` input".
+trait JsonParser<'a, T>: Parser<&'a str, T, &'a str> {}
+impl<'a, T, P> JsonParser<'a, T> for P where P: Parser<&'a str, T, &'a str> {}
+
+fn json_null<'a>() -> impl JsonParser<'a, JsonValue> {
+    literal("null").to(JsonValue::Null)
+}
+
+fn json_bool<'a>() -> impl JsonParser<'a, JsonValue> {
+    choice((
+        literal("true").to(JsonValue::Bool(true)),
+        literal("false").to(JsonValue::Bool(false)),
+    ))
+}
+
+// Accepts `digits`, `digits.digits` or `.digits`, each with an optional
+// leading `-`. The matched text is reassembled and parsed as `f64`.
+fn json_number<'a>() -> impl JsonParser<'a, JsonValue> {
+    fn digit<'a>() -> impl JsonParser<'a, &'a str> {
+        one_of("1234567890")
+    }
+
+    fn digits<'a>() -> impl JsonParser<'a, Vec<&'a str>> {
+        repeated(digit()).at_least(1)
+    }
+
+    let json_number_inner = choice((
+        seq((digits(), literal(".").ignore_then(digits()).optional())).map(
+            |(mut digits, maybe_decimal)| {
+                if let Some(decimal_digits) = maybe_decimal {
+                    digits.push(".");
+                    digits.extend(decimal_digits.into_iter());
+                }
+                digits.into_iter().collect::<String>()
+            },
+        ),
+        literal(".").ignore_then(digits()).map(|decimal_digits| {
+            let mut d = vec!["."];
+            d.extend(decimal_digits.into_iter());
+            d.into_iter().collect::<String>()
+        }),
+    ))
+    .map(|digits| digits.parse::<f64>().unwrap());
+
+    literal("-")
+        .optional()
+        .then(json_number_inner)
+        .map(|(maybe_sign, mut val)| {
+            if maybe_sign.is_some() {
+                val *=
-1.0;
+            }
+            JsonValue::Num(val)
+        })
+}
+
+// NOTE(review): the `collect::<String>()` type argument was lost in extraction
+// and is reconstructed from the declared return type; confirm upstream.
+// A double quote, any run of non-quote characters, and a closing double quote.
+// Escape sequences are not handled.
+fn json_string_raw<'a>() -> impl JsonParser<'a, String> {
+    seq((
+        literal_char('"'),
+        repeated(pred(any_char, |ch| *ch != '"')),
+        literal_char('"'),
+    ))
+    .map(|(_, s, _)| s.iter().cloned().collect::<String>())
+}
+
+fn json_string<'a>() -> impl JsonParser<'a, JsonValue> {
+    json_string_raw().map(JsonValue::Str)
+}
+
+// Zero or more tabs, newlines or spaces; the matched text is discarded.
+fn whitespace<'a>() -> impl JsonParser<'a, ()> {
+    repeated(choice((
+        literal_char('\t'),
+        literal_char('\n'),
+        literal_char(' '),
+    )))
+    .to(())
+}
+
+// Built inside a closure so that the parser is constructed only when it is
+// run; `json_value` refers back to `json_array`, so eager construction would
+// recurse without end.
+fn json_array<'a>() -> impl JsonParser<'a, JsonValue> {
+    move |input| {
+        let val = json_value().surrounded_by(whitespace());
+
+        repeated(val)
+            .separated_by(literal(","), false)
+            .delimited(literal_char('['), literal_char(']'))
+            .map(JsonValue::Array)
+            .parse(input)
+    }
+}
+
+// Same lazy-construction pattern as `json_array`.
+fn json_object<'a>() -> impl JsonParser<'a, JsonValue> {
+    move |input| {
+        let kv = json_string_raw()
+            .surrounded_by(whitespace())
+            .then_ignore(literal_char(':'))
+            .then(json_value().surrounded_by(whitespace()));
+
+        repeated(kv)
+            .separated_by(literal_char(','), false)
+            .delimited(literal_char('{'), literal_char('}'))
+            .map(JsonValue::Object)
+            .parse(input)
+    }
+}
+
+fn json_value<'a>() -> impl JsonParser<'a, JsonValue> {
+    choice((
+        json_null(),
+        json_bool(),
+        json_number(),
+        json_string(),
+        json_array(),
+        json_object(),
+    ))
+}
+
+#[test]
+fn parse_json_primitives() {
+    assert_eq!(
+        json_string().parse(r#""yolo swagg""#).unwrap(),
+        (JsonValue::Str("yolo swagg".into()), "")
+    );
+
+    assert_eq!(
+        json_number().parse("-383").unwrap().0,
+        JsonValue::Num(-383f64)
+    );
+    assert_eq!(
+        json_number().parse("-.383").unwrap().0,
+        JsonValue::Num(-0.383)
+    );
+    assert_eq!(
+        json_number().parse(".383").unwrap().0,
+        JsonValue::Num(0.383)
+    );
+    assert_eq!(
+        json_number().parse("-1.383").unwrap().0,
+        JsonValue::Num(-1.383)
+    );
+}
+
+// A trailing comma must be rejected: the array separator is built with
+// `allow_trailing = false`.
+#[rstest]
+#[case(r#"[ 4, 9, "ara",]"#)]
+fn parse_json_array_err(#[case] input: &str) {
+    assert!(json_array().parse(input).is_err());
+}
+
+// Each case pairs an input with the expected value and the expected unparsed
+// remainder (the last case leaves ",{}" unconsumed).
+#[rstest]
+#[case("[[],[]]", (JsonValue::Array(vec![JsonValue::Array(vec![]), JsonValue::Array(vec![])]), ""))]
+#[case(r#"[ 4, 9, "foo" ]"#, (
+    JsonValue::Array(vec![
+        JsonValue::Num(4.),
+        JsonValue::Num(9.0),
+        JsonValue::Str("foo".to_string())
+    ]),
+    ""
+    ))]
+#[case(r#"[8,null,[],5],{}"#,
+    (
+    JsonValue::Array(vec![
+        JsonValue::Num(8.),
+        JsonValue::Null,
+        JsonValue::Array(vec![]),
+        JsonValue::Num(5.),
+    ]),
+    ",{}"
+    ))]
+fn parse_json_array(#[case] input: &str, #[case] expected: (JsonValue, &str)) {
+    assert_eq!(json_array().parse(input).unwrap(), expected);
+}
+
+#[test]
+fn parse_json_object() {
+    assert_eq!(
+        json_object().parse(r#"{ "a": 23}"#).unwrap().0,
+        JsonValue::Object(vec![("a".into(), JsonValue::Num(23.))])
+    );
+    assert_eq!(
+        json_object().parse(r#"{}"#).unwrap().0,
+        JsonValue::Object(vec![])
+    );
+}
+
+// Smoke test: a real-world settings file must parse as an object.
+#[test]
+fn parse_json_document() {
+    let test_json = include_str!("joplin-cfg.json");
+    let parsed_json = json_object().parse(test_json);
+    assert!(parsed_json.is_ok());
+}
+
+// Pins the exact representation strings. The dangling " | " after the last
+// digit comes from `one_of`, which appends the separator after every
+// character of its item list.
+#[rstest]
+#[case(json_null().representation(), Representation::new("null"))]
+#[case(json_bool().representation(), Representation::new("true | false"))]
+#[case(json_number().representation(), Representation::new("- | ε (1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 0 | )+ . (1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 0 | )+ | ε | . (1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 0 | )+"))]
+fn representations_test(
+    #[case] parser_representation: Representation,
+    #[case] expected: Representation,
+) {
+    assert_eq!(parser_representation, expected);
+}