From c3fb83f7aa0f2bc5699d3a050a40af64081f78c7 Mon Sep 17 00:00:00 2001 From: Titus Wormer Date: Fri, 1 Jul 2022 11:00:39 +0200 Subject: Refactor to clean and document `space_or_tab` --- readme.md | 6 +- src/construct/definition.rs | 6 +- src/construct/label_end.rs | 8 +- src/construct/partial_label.rs | 6 +- src/construct/partial_space_or_tab.rs | 218 ++++++++++++++++++++-------------- src/construct/partial_title.rs | 6 +- src/construct/partial_whitespace.rs | 6 + 7 files changed, 146 insertions(+), 110 deletions(-) diff --git a/readme.md b/readme.md index 4144440..765c40a 100644 --- a/readme.md +++ b/readme.md @@ -122,7 +122,6 @@ cargo doc --document-private-items #### Docs -- [ ] (1) `space_or_tab_one_line_ending` - [ ] (1) `edit_map` - [ ] (1) Go through all bnf - [ ] (1) Go through all docs @@ -131,9 +130,7 @@ cargo doc --document-private-items #### Refactor - [ ] (1) Clean shifting, assertions in `edit_map` -- [ ] (1) Clean `space_or_tab_one_line_ending` -- [ ] (1) Use `link_to` (and `space_or_tab_one_line_ending`) in more places? - It’s probably better +- [ ] (1) Use `link_to` in more places? It’s probably better - [ ] (1) Use `edit_map` in `subtokenize` #### Parse @@ -278,3 +275,4 @@ important. - [x] (1) Add docs on resolver, clean feed - [x] (3) Clean compiler - [x] (1) Parse initial and final space_or_tab of paragraphs (in string, text) +- [x] (1) Refactor to clean and document `space_or_tab` diff --git a/src/construct/definition.rs b/src/construct/definition.rs index af94d12..2b3e4b3 100644 --- a/src/construct/definition.rs +++ b/src/construct/definition.rs @@ -96,7 +96,7 @@ use crate::construct::{ partial_destination::{start as destination, Options as DestinationOptions}, partial_label::{start as label, Options as LabelOptions}, - partial_space_or_tab::{space_or_tab, space_or_tab_one_line_ending}, + partial_space_or_tab::{space_or_tab, space_or_tab_eol}, partial_title::{start as title, Options as TitleOptions}, }; use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; @@ -149,7 +149,7 @@ fn label_after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { tokenizer.exit(TokenType::DefinitionMarker); ( State::Fn(Box::new( - tokenizer.attempt_opt(space_or_tab_one_line_ending(), destination_before), + tokenizer.attempt_opt(space_or_tab_eol(), destination_before), )), None, ) @@ -233,7 +233,7 @@ fn after_whitespace(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { /// "c" /// ``` fn title_before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { - tokenizer.go(space_or_tab_one_line_ending(), title_before_marker)(tokenizer, code) + tokenizer.go(space_or_tab_eol(), title_before_marker)(tokenizer, code) } /// Before a title, after a line ending. 
diff --git a/src/construct/label_end.rs b/src/construct/label_end.rs index 0da12b8..6901cb3 100644 --- a/src/construct/label_end.rs +++ b/src/construct/label_end.rs @@ -150,7 +150,7 @@ use crate::constant::RESOURCE_DESTINATION_BALANCE_MAX; use crate::construct::{ partial_destination::{start as destination, Options as DestinationOptions}, partial_label::{start as label, Options as LabelOptions}, - partial_space_or_tab::space_or_tab_one_line_ending, + partial_space_or_tab::space_or_tab_eol, partial_title::{start as title, Options as TitleOptions}, }; use crate::tokenizer::{ @@ -561,7 +561,7 @@ fn resource(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { /// [a](|b) c /// ``` fn resource_start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { - tokenizer.attempt_opt(space_or_tab_one_line_ending(), resource_open)(tokenizer, code) + tokenizer.attempt_opt(space_or_tab_eol(), resource_open)(tokenizer, code) } /// At the start of a resource, after optional whitespace. @@ -599,7 +599,7 @@ fn resource_open(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { /// [a](b| "c") d /// ``` fn destination_after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { - tokenizer.attempt(space_or_tab_one_line_ending(), |ok| { + tokenizer.attempt(space_or_tab_eol(), |ok| { Box::new(if ok { resource_between } else { resource_end }) })(tokenizer, code) } @@ -636,7 +636,7 @@ fn resource_between(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { /// [a](b "c"|) d /// ``` fn title_after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { - tokenizer.attempt_opt(space_or_tab_one_line_ending(), resource_end)(tokenizer, code) + tokenizer.attempt_opt(space_or_tab_eol(), resource_end)(tokenizer, code) } /// In a resource, at the `)`. diff --git a/src/construct/partial_label.rs b/src/construct/partial_label.rs index 1e4d7f2..e505997 100644 --- a/src/construct/partial_label.rs +++ b/src/construct/partial_label.rs @@ -59,9 +59,7 @@ //! //! -use super::partial_space_or_tab::{ - space_or_tab_one_line_ending_with_options, OneLineEndingOptions, -}; +use super::partial_space_or_tab::{space_or_tab_eol_with_options, EolOptions}; use crate::constant::LINK_REFERENCE_SIZE_MAX; use crate::subtokenize::link; use crate::tokenizer::{Code, ContentType, State, StateFnResult, TokenType, Tokenizer}; @@ -137,7 +135,7 @@ fn at_break(tokenizer: &mut Tokenizer, code: Code, mut info: Info) -> StateFnRes (State::Ok, None) } Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => tokenizer.go( - space_or_tab_one_line_ending_with_options(OneLineEndingOptions { + space_or_tab_eol_with_options(EolOptions { content_type: Some(ContentType::String), connect: info.connect, }), diff --git a/src/construct/partial_space_or_tab.rs b/src/construct/partial_space_or_tab.rs index 8df7601..d2934b3 100644 --- a/src/construct/partial_space_or_tab.rs +++ b/src/construct/partial_space_or_tab.rs @@ -1,4 +1,4 @@ -//! Several helpers to parse whitespace (`space_or_tab`). +//! Several helpers to parse whitespace (`space_or_tab`, `space_or_tab_eol`). //! //! ## References //! @@ -7,7 +7,7 @@ use crate::subtokenize::link; use crate::tokenizer::{Code, ContentType, State, StateFn, StateFnResult, TokenType, Tokenizer}; -/// Options to parse whitespace. +/// Options to parse `space_or_tab`. #[derive(Debug)] pub struct Options { /// Minimum allowed characters (inclusive). @@ -16,28 +16,22 @@ pub struct Options { pub max: usize, /// Token type to use for whitespace events. pub kind: TokenType, - /// To do. 
- pub content_type: Option, + /// Connect this whitespace to the previous. pub connect: bool, -} - -#[derive(Debug)] -pub struct OneLineEndingOptions { - /// To do. + /// Embedded content type to use. pub content_type: Option, - pub connect: bool, } -/// Options to parse whitespace. +/// Options to parse `space_or_tab` and one optional eol, but no blank line. #[derive(Debug)] -struct OneLineInfo { - /// Whether something was seen. - connect: bool, - /// Configuration. - options: OneLineEndingOptions, +pub struct EolOptions { + /// Connect this whitespace to the previous. + pub connect: bool, + /// Embedded content type to use. + pub content_type: Option, } -/// Options to parse whitespace. +/// State needed to parse `space_or_tab`. #[derive(Debug)] struct Info { /// Current size. @@ -46,6 +40,17 @@ struct Info { options: Options, } +/// State needed to parse `space_or_tab_eol`. +#[derive(Debug)] +struct EolInfo { + /// Whether to connect the next whitespace to the event before. + connect: bool, + /// Whether there was initial whitespace. + ok: bool, + /// Configuration. + options: EolOptions, +} + /// One or more `space_or_tab`. /// /// ```bnf @@ -55,7 +60,7 @@ pub fn space_or_tab() -> Box { space_or_tab_min_max(1, usize::MAX) } -/// Between `x` and `y` `space_or_tab` +/// Between `x` and `y` `space_or_tab`. /// /// ```bnf /// space_or_tab_min_max ::= x*y( ' ' '\t' ) @@ -70,16 +75,57 @@ pub fn space_or_tab_min_max(min: usize, max: usize) -> Box { }) } -/// Between `x` and `y` `space_or_tab`, with the given token type. +/// `space_or_tab`, with the given options. +pub fn space_or_tab_with_options(options: Options) -> Box { + Box::new(|t, c| start(t, c, Info { size: 0, options })) +} + +/// `space_or_tab`, or optionally `space_or_tab`, one `eol`, and +/// optionally `space_or_tab`. /// /// ```bnf -/// space_or_tab ::= x*y( ' ' '\t' ) +/// space_or_tab_eol ::= 1*( ' ' '\t' ) | 0*( ' ' '\t' ) eol 0*( ' ' '\t' ) /// ``` -pub fn space_or_tab_with_options(options: Options) -> Box { - Box::new(|t, c| start(t, c, Info { size: 0, options })) +pub fn space_or_tab_eol() -> Box { + space_or_tab_eol_with_options(EolOptions { + content_type: None, + connect: false, + }) } -/// Before whitespace. +/// `space_or_tab_eol`, with the given options. +pub fn space_or_tab_eol_with_options(options: EolOptions) -> Box { + Box::new(move |tokenizer, code| { + let mut info = EolInfo { + connect: false, + ok: false, + options, + }; + + tokenizer.attempt( + space_or_tab_with_options(Options { + kind: TokenType::SpaceOrTab, + min: 1, + max: usize::MAX, + content_type: info.options.content_type, + connect: info.options.connect, + }), + move |ok| { + if ok { + info.ok = ok; + + if info.options.content_type.is_some() { + info.connect = true; + } + } + + Box::new(|t, c| after_space_or_tab(t, c, info)) + }, + )(tokenizer, code) + }) +} + +/// Before `space_or_tab`. /// /// ```markdown /// alpha| bravo @@ -109,7 +155,7 @@ fn start(tokenizer: &mut Tokenizer, code: Code, mut info: Info) -> StateFnResult } } -/// In whitespace. +/// In `space_or_tab`. 
/// /// ```markdown /// alpha |bravo @@ -136,85 +182,75 @@ fn inside(tokenizer: &mut Tokenizer, code: Code, mut info: Info) -> StateFnResul } } -pub fn space_or_tab_one_line_ending() -> Box { - space_or_tab_one_line_ending_with_options(OneLineEndingOptions { - content_type: None, - connect: false, - }) -} - -pub fn space_or_tab_one_line_ending_with_options(options: OneLineEndingOptions) -> Box { - Box::new(move |tokenizer, code| { - let mut info = OneLineInfo { - connect: false, - options, - }; - - tokenizer.attempt( - space_or_tab_with_options(Options { - kind: TokenType::SpaceOrTab, - min: 1, - max: usize::MAX, - content_type: info.options.content_type, - connect: info.options.connect, - }), - move |ok| { - if ok && info.options.content_type.is_some() { - info.connect = true; - } - - Box::new(move |tokenizer, code| match code { - Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => { - at_eol(tokenizer, code, info) - } - _ => { - if ok { - (State::Ok, Some(vec![code])) - } else { - (State::Nok, None) - } - } - }) - }, - )(tokenizer, code) - }) -} - -fn at_eol(tokenizer: &mut Tokenizer, code: Code, mut info: OneLineInfo) -> StateFnResult { +/// `space_or_tab_eol`: after optionally first `space_or_tab`. +/// +/// ```markdown +/// alpha | +/// bravo +/// ``` +/// +/// ```markdown +/// alpha| +/// bravo +/// ``` +fn after_space_or_tab(tokenizer: &mut Tokenizer, code: Code, mut info: EolInfo) -> StateFnResult { match code { Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => { tokenizer.enter_with_content(TokenType::LineEnding, info.options.content_type); - if info.options.content_type.is_some() { - if info.connect { - let index = tokenizer.events.len() - 1; - link(&mut tokenizer.events, index); - } else { - info.connect = true; - } + if info.connect { + let index = tokenizer.events.len() - 1; + link(&mut tokenizer.events, index); + } else if info.options.content_type.is_some() { + info.connect = true; } tokenizer.consume(code); tokenizer.exit(TokenType::LineEnding); - ( - State::Fn(Box::new(tokenizer.attempt_opt( - space_or_tab_with_options(Options { - kind: TokenType::SpaceOrTab, - min: 1, - max: usize::MAX, - content_type: info.options.content_type, - connect: info.connect, - }), - after_eol, - ))), - None, - ) + (State::Fn(Box::new(|t, c| after_eol(t, c, info))), None) } - _ => unreachable!("expected eol"), + _ if info.ok => (State::Ok, Some(vec![code])), + _ => (State::Nok, None), } } -fn after_eol(_tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { +/// `space_or_tab_eol`: after eol. +/// +/// ```markdown +/// alpha +/// |bravo +/// ``` +/// +/// ```markdown +/// alpha +/// |bravo +/// ``` +#[allow(clippy::needless_pass_by_value)] +fn after_eol(tokenizer: &mut Tokenizer, code: Code, info: EolInfo) -> StateFnResult { + tokenizer.attempt_opt( + space_or_tab_with_options(Options { + kind: TokenType::SpaceOrTab, + min: 1, + max: usize::MAX, + content_type: info.options.content_type, + connect: info.connect, + }), + after_more_space_or_tab, + )(tokenizer, code) +} + +/// `space_or_tab_eol`: after more (optional) `space_or_tab`. +/// +/// ```markdown +/// alpha +/// |bravo +/// ``` +/// +/// ```markdown +/// alpha +/// |bravo +/// ``` +fn after_more_space_or_tab(_tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { // Blank line not allowed. if matches!( code, diff --git a/src/construct/partial_title.rs b/src/construct/partial_title.rs index 044a8db..3d0bfb6 100644 --- a/src/construct/partial_title.rs +++ b/src/construct/partial_title.rs @@ -30,9 +30,7 @@ //! 
[character_reference]: crate::construct::character_reference //! [label_end]: crate::construct::label_end -use super::partial_space_or_tab::{ - space_or_tab_one_line_ending_with_options, OneLineEndingOptions, -}; +use super::partial_space_or_tab::{space_or_tab_eol_with_options, EolOptions}; use crate::subtokenize::link; use crate::tokenizer::{Code, ContentType, State, StateFnResult, TokenType, Tokenizer}; @@ -183,7 +181,7 @@ fn at_break(tokenizer: &mut Tokenizer, code: Code, mut info: Info) -> StateFnRes } Code::None => (State::Nok, None), Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => tokenizer.go( - space_or_tab_one_line_ending_with_options(OneLineEndingOptions { + space_or_tab_eol_with_options(EolOptions { content_type: Some(ContentType::String), connect: info.connect, }), diff --git a/src/construct/partial_whitespace.rs b/src/construct/partial_whitespace.rs index 9a7a54d..62b1205 100644 --- a/src/construct/partial_whitespace.rs +++ b/src/construct/partial_whitespace.rs @@ -10,12 +10,18 @@ //! whitespace ::= 0.*space_or_tab eol 0.*space_or_tab //! ``` //! +//! This is similar to [`space_or_tab_eol`][space_or_tab_eol], with the main +//! difference that that *does not* require a line ending and parses any +//! `space_or_tab` with one line ending. +//! This instead *requires* the line ending (or eol). +//! //! ## References //! //! * [`initialize/text.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark/dev/lib/initialize/text.js) //! //! [string]: crate::content::string //! [text]: crate::content::text +//! [space_or_tab_eol]: crate::construct::partial_space_or_tab::space_or_tab_eol use super::partial_space_or_tab::space_or_tab; use crate::tokenizer::{Code, State, StateFnResult, Tokenizer}; -- cgit
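
The refactored `space_or_tab_eol` accepts either a plain run of spaces and tabs, or optional spaces and tabs, at most one line ending, and optional spaces and tabs after it, but never a blank line. As a rough, standalone illustration of that grammar (`space_or_tab_eol ::= 1*( ' ' '\t' ) | 0*( ' ' '\t' ) eol 0*( ' ' '\t' )`), here is a minimal Rust sketch that does not touch the crate's `Tokenizer`/`StateFn` machinery; `matches_space_or_tab_eol` is a hypothetical helper written only for this illustration, and treating end of input like a blank line is an assumption based on the "Blank line not allowed" check in `after_more_space_or_tab`.

```rust
/// Standalone sketch of the `space_or_tab_eol` grammar; returns how many
/// bytes at the start of `input` the construct would consume, or `None`
/// when it does not match.
fn matches_space_or_tab_eol(input: &str) -> Option<usize> {
    let bytes = input.as_bytes();
    let mut index = 0;

    // Optional leading `space_or_tab`.
    while index < bytes.len() && (bytes[index] == b' ' || bytes[index] == b'\t') {
        index += 1;
    }

    // At most one optional eol: `\r\n`, `\r`, or `\n`.
    let seen_eol = match bytes.get(index).copied() {
        Some(b'\r') => {
            index += if bytes.get(index + 1).copied() == Some(b'\n') { 2 } else { 1 };
            true
        }
        Some(b'\n') => {
            index += 1;
            true
        }
        _ => false,
    };

    if seen_eol {
        // Optional `space_or_tab` after the eol.
        while index < bytes.len() && (bytes[index] == b' ' || bytes[index] == b'\t') {
            index += 1;
        }
        // Assumption: a blank line (another eol, or end of input, right after
        // the consumed whitespace) is rejected, per `after_more_space_or_tab`.
        if index >= bytes.len() || bytes[index] == b'\r' || bytes[index] == b'\n' {
            return None;
        }
        Some(index)
    } else if index > 0 {
        // No eol: a plain run needs at least one space or tab (`min: 1`).
        Some(index)
    } else {
        None
    }
}

fn main() {
    assert_eq!(matches_space_or_tab_eol("  b"), Some(2)); // spaces only
    assert_eq!(matches_space_or_tab_eol(" \n  b"), Some(4)); // spaces, eol, spaces
    assert_eq!(matches_space_or_tab_eol("\nb"), Some(1)); // bare eol
    assert_eq!(matches_space_or_tab_eol("\n\n"), None); // blank line: not allowed
    assert_eq!(matches_space_or_tab_eol("b"), None); // nothing to consume
}
```

This also mirrors the contrast documented in `partial_whitespace.rs`: `whitespace` there *requires* a line ending, while `space_or_tab_eol` also matches plain `space_or_tab` without one.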