From 41afec1ed898159e1df3bc1157768f2066dd85e5 Mon Sep 17 00:00:00 2001 From: Titus Wormer Date: Fri, 1 Jul 2022 15:36:38 +0200 Subject: Make paragraphs really fast The approach that `micromark-js` takes is as follows: to parse a paragraph, check whether each line starts with something else. If it does, exit, otherwise continue. That is slow, because our actual flow parser does similar things: the work was being done twice. To fix this, this commit introduces parsing each line of a paragraph separately and, finally, when done with flow, combining adjacent paragraphs. This same mechanism is reused for setext headings. Additionally, this commit adds support for interrupting things (or not). E.g., HTML (flow, complete) cannot interrupt paragraphs. Definitions cannot interrupt paragraphs, and cannot be interrupted either, but they can follow each other. --- readme.md | 9 +- src/construct/code_fenced.rs | 5 +- src/construct/code_indented.rs | 11 +- src/construct/definition.rs | 17 ++- src/construct/heading_atx.rs | 9 +- src/construct/heading_setext.rs | 233 ++++++++++++---------------------------- src/construct/html_flow.rs | 19 +++- src/construct/paragraph.rs | 150 ++++++++++---------------- src/construct/thematic_break.rs | 9 +- src/content/flow.rs | 46 ++++---- src/tokenizer.rs | 3 + tests/autolink.rs | 2 +- tests/character_escape.rs | 2 +- tests/character_reference.rs | 2 +- tests/code_fenced.rs | 2 +- tests/code_indented.rs | 2 +- tests/definition.rs | 13 ++- tests/hard_break_escape.rs | 2 +- tests/hard_break_trailing.rs | 2 +- tests/heading_atx.rs | 2 +- tests/html_flow.rs | 13 ++- tests/html_text.rs | 2 +- tests/thematic_break.rs | 2 +- 23 files changed, 230 insertions(+), 327 deletions(-) diff --git a/readme.md b/readme.md index f7847dc..103b201 100644 --- a/readme.md +++ b/readme.md @@ -46,11 +46,6 @@ cargo doc --document-private-items ### Some major obstacles -- [ ] (8) Can paragraphs operate more performantly than checking whether other - flow constructs 
start a line, before exiting and actually attempting flow - constructs? -- [ ] (3) Interrupting: sometimes flow can or cannot start depending on the - previous construct (paragraph, definition) - [ ] (5) Containers: this will be rather messy, and depends a lot on how subtokenization is solved - [ ] (3) Concrete constructs: HTML or code (fenced) cannot be “pierced” into by @@ -132,7 +127,6 @@ cargo doc --document-private-items #### Parse -- [ ] (3) Interrupting (html flow complete, definition + code_indented) - [ ] (5) attention\ test (`character_reference`, `hard_break_escape`, `hard_break_trailing`, `heading_atx`, `heading_setext`, `html_flow`, `thematic_break`)\ @@ -274,3 +268,6 @@ important. - [x] (1) Parse initial and final space_or_tab of paragraphs (in string, text) - [x] (1) Refactor to clean and document `space_or_tab` - [x] (1) Refactor to clean and document `edit_map` +- [x] (8) Make paragraphs fast by merging them at the end, not checking whether + things interrupt them each line +- [x] (3) Add support for interrupting (or not) diff --git a/src/construct/code_fenced.rs b/src/construct/code_fenced.rs index d19cad0..f2d243a 100644 --- a/src/construct/code_fenced.rs +++ b/src/construct/code_fenced.rs @@ -179,7 +179,8 @@ struct Info { pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { tokenizer.enter(TokenType::CodeFenced); tokenizer.enter(TokenType::CodeFencedFence); - tokenizer.attempt_opt(space_or_tab(), before_sequence_open)(tokenizer, code) + // To do: allow arbitrary when code (indented) is turned off. + tokenizer.go(space_or_tab_min_max(0, TAB_SIZE - 1), before_sequence_open)(tokenizer, code) } /// Inside the opening fence, after an optional prefix, before a sequence. @@ -550,5 +551,7 @@ fn content_continue(tokenizer: &mut Tokenizer, code: Code, info: Info) -> StateF /// ``` fn after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { tokenizer.exit(TokenType::CodeFenced); + // Feel free to interrupt. 
+ tokenizer.interrupt = false; (State::Ok, Some(vec![code])) } diff --git a/src/construct/code_indented.rs b/src/construct/code_indented.rs index 99445b9..9bdfd71 100644 --- a/src/construct/code_indented.rs +++ b/src/construct/code_indented.rs @@ -59,8 +59,13 @@ use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; /// > filled line (that it has a non-whitespace character), because blank lines /// > are parsed already, so we never run into that. pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { - tokenizer.enter(TokenType::CodeIndented); - tokenizer.go(space_or_tab_min_max(TAB_SIZE, TAB_SIZE), at_break)(tokenizer, code) + // Do not interrupt paragraphs. + if tokenizer.interrupt { + (State::Nok, None) + } else { + tokenizer.enter(TokenType::CodeIndented); + tokenizer.go(space_or_tab_min_max(TAB_SIZE, TAB_SIZE), at_break)(tokenizer, code) + } } /// At a break. @@ -110,6 +115,8 @@ fn content(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { /// ``` fn after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { tokenizer.exit(TokenType::CodeIndented); + // Feel free to interrupt. + tokenizer.interrupt = false; (State::Ok, Some(vec![code])) } diff --git a/src/construct/definition.rs b/src/construct/definition.rs index f05064a..e1afd03 100644 --- a/src/construct/definition.rs +++ b/src/construct/definition.rs @@ -107,8 +107,19 @@ use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; /// |[a]: b "c" /// ``` pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { - tokenizer.enter(TokenType::Definition); - tokenizer.attempt_opt(space_or_tab(), before)(tokenizer, code) + let index = tokenizer.events.len(); + let definition_before = index > 3 + && tokenizer.events[index - 1].token_type == TokenType::LineEnding + && tokenizer.events[index - 3].token_type == TokenType::Definition; + + // Do not interrupt paragraphs (but do follow definitions). 
+ if tokenizer.interrupt && !definition_before { + (State::Nok, None) + } else { + tokenizer.enter(TokenType::Definition); + // Note: arbitrary whitespace allowed even if code (indented) is on. + tokenizer.attempt_opt(space_or_tab(), before)(tokenizer, code) + } } /// At the start of a definition, after whitespace. @@ -218,6 +229,8 @@ fn after_whitespace(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { match code { Code::None | Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => { tokenizer.exit(TokenType::Definition); + // You’d be interrupting. + tokenizer.interrupt = true; (State::Ok, Some(vec![code])) } _ => (State::Nok, None), diff --git a/src/construct/heading_atx.rs b/src/construct/heading_atx.rs index 2811894..3ce7052 100644 --- a/src/construct/heading_atx.rs +++ b/src/construct/heading_atx.rs @@ -54,8 +54,8 @@ //! [wiki-setext]: https://en.wikipedia.org/wiki/Setext //! [atx]: http://www.aaronsw.com/2002/atx/ -use super::partial_space_or_tab::space_or_tab; -use crate::constant::HEADING_ATX_OPENING_FENCE_SIZE_MAX; +use super::partial_space_or_tab::{space_or_tab, space_or_tab_min_max}; +use crate::constant::{HEADING_ATX_OPENING_FENCE_SIZE_MAX, TAB_SIZE}; use crate::tokenizer::{ Code, ContentType, Event, EventType, State, StateFnResult, TokenType, Tokenizer, }; @@ -68,7 +68,8 @@ use crate::util::edit_map::EditMap; /// ``` pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { tokenizer.enter(TokenType::HeadingAtx); - tokenizer.attempt_opt(space_or_tab(), before)(tokenizer, code) + // To do: allow arbitrary when code (indented) is turned off. + tokenizer.go(space_or_tab_min_max(0, TAB_SIZE - 1), before)(tokenizer, code) } /// Start of a heading (atx), after whitespace. 
@@ -127,6 +128,8 @@ fn at_break(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { tokenizer.exit(TokenType::HeadingAtx); tokenizer.register_resolver("heading_atx".to_string(), Box::new(resolve)); + // Feel free to interrupt. + tokenizer.interrupt = false; (State::Ok, Some(vec![code])) } Code::VirtualSpace | Code::Char('\t' | ' ') => { diff --git a/src/construct/heading_setext.rs b/src/construct/heading_setext.rs index 03a2e55..df20aa7 100644 --- a/src/construct/heading_setext.rs +++ b/src/construct/heading_setext.rs @@ -58,10 +58,9 @@ //! [atx]: http://www.aaronsw.com/2002/atx/ use crate::constant::TAB_SIZE; -use crate::construct::partial_space_or_tab::{space_or_tab, space_or_tab_with_options, Options}; -use crate::subtokenize::link; -use crate::tokenizer::{Code, ContentType, State, StateFnResult, TokenType, Tokenizer}; -use crate::util::span::from_exit_event; +use crate::construct::partial_space_or_tab::{space_or_tab, space_or_tab_min_max}; +use crate::tokenizer::{Code, Event, EventType, State, StateFnResult, TokenType, Tokenizer}; +use crate::util::edit_map::EditMap; /// Kind of underline. #[derive(Debug, Clone, PartialEq)] @@ -109,150 +108,23 @@ impl Kind { } } -/// Start of a heading (setext). -/// -/// ```markdown -/// |alpha -/// == -/// ``` -pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { - tokenizer.enter(TokenType::HeadingSetext); - tokenizer.attempt_opt(space_or_tab(), before)(tokenizer, code) -} - -/// Start of a heading (setext), after whitespace. 
-/// -/// ```markdown -/// |alpha -/// == -/// ``` -fn before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { - match code { - Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { - unreachable!("expected non-eol/eof"); - } - _ => { - tokenizer.enter(TokenType::HeadingSetextText); - tokenizer.enter_with_content(TokenType::Data, Some(ContentType::Text)); - text_inside(tokenizer, code) - } - } -} - -/// Inside text. -/// -/// ```markdown -/// al|pha -/// bra|vo -/// == -/// ``` -fn text_inside(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { - match code { - Code::None => (State::Nok, None), - Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { - tokenizer.exit(TokenType::Data); - tokenizer.exit(TokenType::HeadingSetextText); - tokenizer.attempt(underline_before, |ok| { - Box::new(if ok { after } else { text_continue }) - })(tokenizer, code) - } - _ => { - tokenizer.consume(code); - (State::Fn(Box::new(text_inside)), None) - } - } -} - -/// At a line ending, not at an underline. -/// -/// ```markdown -/// alpha -/// |bravo -/// == -/// ``` -fn text_continue(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { - // Needed to connect the text. - tokenizer.enter(TokenType::HeadingSetextText); - tokenizer.events.pop(); - tokenizer.events.pop(); - - match code { - Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { - tokenizer.enter_with_content(TokenType::LineEnding, Some(ContentType::Text)); - let index = tokenizer.events.len() - 1; - link(&mut tokenizer.events, index); - tokenizer.consume(code); - tokenizer.exit(TokenType::LineEnding); - - ( - State::Fn(Box::new(tokenizer.attempt_opt( - space_or_tab_with_options(Options { - kind: TokenType::SpaceOrTab, - min: 1, - max: usize::MAX, - content_type: Some(ContentType::Text), - connect: true, - }), - text_line_start, - ))), - None, - ) - } - _ => unreachable!("expected eol"), - } -} - -/// At a line ending after whitespace, not at an underline. 
-/// -/// ```markdown -/// alpha -/// |bravo -/// == -/// ``` -fn text_line_start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { - match code { - // Blank lines not allowed. - Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => (State::Nok, None), - _ => { - tokenizer.enter_with_content(TokenType::Data, Some(ContentType::Text)); - let index = tokenizer.events.len() - 1; - link(&mut tokenizer.events, index); - text_inside(tokenizer, code) - } - } -} - -/// After a heading (setext). -/// -/// ```markdown -/// alpha -/// ==| -/// ``` -fn after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { - tokenizer.exit(TokenType::HeadingSetext); - (State::Ok, Some(vec![code])) -} - /// At a line ending, presumably an underline. /// /// ```markdown /// alpha| /// == /// ``` -fn underline_before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { - match code { - Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { - tokenizer.enter(TokenType::LineEnding); - tokenizer.consume(code); - tokenizer.exit(TokenType::LineEnding); - ( - State::Fn(Box::new( - tokenizer.attempt_opt(space_or_tab(), underline_sequence_start), - )), - None, - ) - } - _ => unreachable!("expected eol"), +pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + let index = tokenizer.events.len(); + let paragraph_before = index > 3 + && tokenizer.events[index - 1].token_type == TokenType::LineEnding + && tokenizer.events[index - 3].token_type == TokenType::Paragraph; + + if paragraph_before { + // To do: allow arbitrary when code (indented) is turned off. 
+ tokenizer.go(space_or_tab_min_max(0, TAB_SIZE - 1), before)(tokenizer, code) + } else { + (State::Nok, None) } } @@ -262,26 +134,11 @@ fn underline_before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { /// alpha /// |== /// ``` -fn underline_sequence_start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { - let tail = tokenizer.events.last(); - let mut prefix = 0; - - if let Some(event) = tail { - if event.token_type == TokenType::SpaceOrTab { - let span = from_exit_event(&tokenizer.events, tokenizer.events.len() - 1); - prefix = span.end_index - span.start_index; - } - } - - // To do: 4+ should be okay if code (indented) is turned off! - if prefix >= TAB_SIZE { - return (State::Nok, None); - } - +fn before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { match code { Code::Char(char) if char == '-' || char == '=' => { tokenizer.enter(TokenType::HeadingSetextUnderline); - underline_sequence_inside(tokenizer, code, Kind::from_char(char)) + inside(tokenizer, code, Kind::from_char(char)) } _ => (State::Nok, None), } @@ -293,16 +150,13 @@ fn underline_sequence_start(tokenizer: &mut Tokenizer, code: Code) -> StateFnRes /// alpha /// =|= /// ``` -fn underline_sequence_inside(tokenizer: &mut Tokenizer, code: Code, kind: Kind) -> StateFnResult { +fn inside(tokenizer: &mut Tokenizer, code: Code, kind: Kind) -> StateFnResult { match code { Code::Char(char) if char == kind.as_char() => { tokenizer.consume(code); - ( - State::Fn(Box::new(move |t, c| underline_sequence_inside(t, c, kind))), - None, - ) + (State::Fn(Box::new(move |t, c| inside(t, c, kind))), None) } - _ => tokenizer.attempt_opt(space_or_tab(), underline_after)(tokenizer, code), + _ => tokenizer.attempt_opt(space_or_tab(), after)(tokenizer, code), } } @@ -312,12 +166,59 @@ fn underline_sequence_inside(tokenizer: &mut Tokenizer, code: Code, kind: Kind) /// alpha /// ==| /// ``` -fn underline_after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { +fn after(tokenizer: &mut 
Tokenizer, code: Code) -> StateFnResult { match code { Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { tokenizer.exit(TokenType::HeadingSetextUnderline); + // Feel free to interrupt. + tokenizer.interrupt = false; + tokenizer.register_resolver("heading_setext".to_string(), Box::new(resolve)); (State::Ok, Some(vec![code])) } _ => (State::Nok, None), } } + +/// To do. +pub fn resolve(tokenizer: &mut Tokenizer) -> Vec { + let mut edit_map = EditMap::new(); + let mut index = 0; + let mut paragraph_enter: Option = None; + let mut paragraph_exit: Option = None; + + while index < tokenizer.events.len() { + let event = &tokenizer.events[index]; + + // Find paragraphs. + if event.event_type == EventType::Enter { + if event.token_type == TokenType::Paragraph { + paragraph_enter = Some(index); + } + } else if event.token_type == TokenType::Paragraph { + paragraph_exit = Some(index); + } + // We know this is preceded by a paragraph. + // Otherwise we don’t parse. + else if event.token_type == TokenType::HeadingSetextUnderline { + let enter = paragraph_enter.take().unwrap(); + let exit = paragraph_exit.take().unwrap(); + + // Change types of Enter:Paragraph, Exit:Paragraph. + tokenizer.events[enter].token_type = TokenType::HeadingSetextText; + tokenizer.events[exit].token_type = TokenType::HeadingSetextText; + + // Add of Enter:HeadingSetext, Exit:HeadingSetext. + let mut heading_enter = tokenizer.events[enter].clone(); + heading_enter.token_type = TokenType::HeadingSetext; + let mut heading_exit = tokenizer.events[index].clone(); + heading_exit.token_type = TokenType::HeadingSetext; + + edit_map.add(enter, 0, vec![heading_enter]); + edit_map.add(index + 1, 0, vec![heading_exit]); + } + + index += 1; + } + + edit_map.consume(&mut tokenizer.events) +} diff --git a/src/construct/html_flow.rs b/src/construct/html_flow.rs index d0e0558..a1bddad 100644 --- a/src/construct/html_flow.rs +++ b/src/construct/html_flow.rs @@ -98,8 +98,10 @@ //! 
[html_block_names]: crate::constant::HTML_BLOCK_NAMES //! [html-parsing]: https://html.spec.whatwg.org/multipage/parsing.html#parsing -use crate::constant::{HTML_BLOCK_NAMES, HTML_RAW_NAMES, HTML_RAW_SIZE_MAX}; -use crate::construct::{blank_line::start as blank_line, partial_space_or_tab::space_or_tab}; +use crate::constant::{HTML_BLOCK_NAMES, HTML_RAW_NAMES, HTML_RAW_SIZE_MAX, TAB_SIZE}; +use crate::construct::{ + blank_line::start as blank_line, partial_space_or_tab::space_or_tab_min_max, +}; use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; /// Kind of HTML (flow). @@ -191,7 +193,8 @@ struct Info { pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { tokenizer.enter(TokenType::HtmlFlow); tokenizer.enter(TokenType::HtmlFlowData); - tokenizer.attempt_opt(space_or_tab(), before)(tokenizer, code) + // To do: allow arbitrary when code (indented) is turned off. + tokenizer.go(space_or_tab_min_max(0, TAB_SIZE - 1), before)(tokenizer, code) } /// After optional whitespace, before `<`. @@ -400,8 +403,10 @@ fn tag_name(tokenizer: &mut Tokenizer, code: Code, mut info: Info) -> StateFnRes } else { info.kind = Kind::Complete; - // To do: do not support complete HTML when interrupting. - if info.start_tag { + // Do not support complete HTML when interrupting. + if tokenizer.interrupt { + (State::Nok, None) + } else if info.start_tag { complete_attribute_name_before(tokenizer, code, info) } else { complete_closing_tag_after(tokenizer, code, info) @@ -784,6 +789,8 @@ fn html_continue_start(tokenizer: &mut Tokenizer, code: Code, info: Info) -> Sta match code { Code::None => { tokenizer.exit(TokenType::HtmlFlow); + // Feel free to interrupt. + tokenizer.interrupt = false; (State::Ok, Some(vec![code])) } // To do: do not allow lazy lines. 
@@ -949,6 +956,8 @@ fn continuation_close(tokenizer: &mut Tokenizer, code: Code, info: Info) -> Stat Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { tokenizer.exit(TokenType::HtmlFlowData); tokenizer.exit(TokenType::HtmlFlow); + // Feel free to interrupt. + tokenizer.interrupt = false; (State::Ok, Some(vec![code])) } _ => { diff --git a/src/construct/paragraph.rs b/src/construct/paragraph.rs index fea7052..ae2f4de 100644 --- a/src/construct/paragraph.rs +++ b/src/construct/paragraph.rs @@ -32,14 +32,10 @@ //! [code_text]: crate::construct::code_text //! [html]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-p-element -use crate::constant::TAB_SIZE; -use crate::construct::{ - blank_line::start as blank_line, code_fenced::start as code_fenced, - heading_atx::start as heading_atx, html_flow::start as html_flow, - partial_space_or_tab::space_or_tab_min_max, thematic_break::start as thematic_break, +use crate::tokenizer::{ + Code, ContentType, Event, EventType, State, StateFnResult, TokenType, Tokenizer, }; -use crate::subtokenize::link; -use crate::tokenizer::{Code, ContentType, State, StateFnResult, TokenType, Tokenizer}; +use crate::util::edit_map::EditMap; /// Before a paragraph. /// @@ -66,11 +62,14 @@ pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { /// ``` fn inside(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { match code { - Code::None => end(tokenizer, code), - Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => tokenizer - .check(interrupt, |ok| { - Box::new(if ok { at_line_ending } else { end }) - })(tokenizer, code), + Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + tokenizer.exit(TokenType::Data); + tokenizer.exit(TokenType::Paragraph); + tokenizer.register_resolver_before("paragraph".to_string(), Box::new(resolve)); + // You’d be interrupting. 
+ tokenizer.interrupt = true; + (State::Ok, Some(vec![code])) + } _ => { tokenizer.consume(code); (State::Fn(Box::new(inside)), None) @@ -78,90 +77,55 @@ fn inside(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { } } -/// At a line ending, not interrupting. -/// -/// ```markdown -/// alpha| -/// bravo. -/// ``` -fn at_line_ending(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { - tokenizer.consume(code); - tokenizer.exit(TokenType::Data); - tokenizer.enter_with_content(TokenType::Data, Some(ContentType::Text)); - let index = tokenizer.events.len() - 1; - link(&mut tokenizer.events, index); - (State::Fn(Box::new(inside)), None) -} +/// Merge “`Paragraph`”s, which currently span a single line, into actual +/// `Paragraph`s that span multiple lines. +pub fn resolve(tokenizer: &mut Tokenizer) -> Vec { + let mut edit_map = EditMap::new(); + let len = tokenizer.events.len(); + let mut index = 0; -/// At a line ending, done. -/// -/// ```markdown -/// alpha| -/// *** -/// ``` -fn end(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { - tokenizer.exit(TokenType::Data); - tokenizer.exit(TokenType::Paragraph); - (State::Ok, Some(vec![code])) -} + while index < len { + let event = &tokenizer.events[index]; -/// Before a potential interruption. -/// -/// ```markdown -/// alpha| -/// *** -/// ``` -fn interrupt(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { - match code { - Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { - tokenizer.enter(TokenType::LineEnding); - tokenizer.consume(code); - tokenizer.exit(TokenType::LineEnding); - (State::Fn(Box::new(interrupt_start)), None) - } - _ => unreachable!("expected eol"), - } -} + if event.event_type == EventType::Enter && event.token_type == TokenType::Paragraph { + // Exit:Paragraph + let mut exit_index = index + 3; + // Enter:Paragraph + let mut enter_next_index = exit_index + 3; -/// After a line ending. 
-/// -/// ```markdown -/// alpha -/// |~~~js -/// ~~~ -/// ``` -fn interrupt_start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { - // To do: If code is disabled, indented lines are allowed to interrupt. - tokenizer.attempt(space_or_tab_min_max(TAB_SIZE, TAB_SIZE), |ok| { - Box::new(if ok { interrupt_indent } else { interrupt_cont }) - })(tokenizer, code) -} + // To do: assert that `LineEnding` between? + while enter_next_index < len + && tokenizer.events[enter_next_index].token_type == TokenType::Paragraph + { + // Remove Exit:Paragraph, Enter:LineEnding, Exit:LineEnding, Enter:Paragraph. + edit_map.add(exit_index, 4, vec![]); + println!("rm {:?} {:?}", exit_index, exit_index + 4); -/// At an indent. -/// -/// ```markdown -/// alpha -/// | -/// ``` -fn interrupt_indent(_tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { - (State::Ok, Some(vec![code])) -} + // Add Exit:LineEnding position info to Exit:Data. + let line_ending_exit = &tokenizer.events[enter_next_index - 1]; + let line_ending_point = line_ending_exit.point.clone(); + let line_ending_index = line_ending_exit.index; + let data_exit = &mut tokenizer.events[exit_index - 1]; + data_exit.point = line_ending_point; + data_exit.index = line_ending_index; -/// Not at an indented line. -/// -/// ```markdown -/// alpha -/// |
-/// ``` -fn interrupt_cont(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { - tokenizer.attempt_n( - vec![ - Box::new(blank_line), - Box::new(code_fenced), - Box::new(html_flow), - Box::new(heading_atx), - Box::new(thematic_break), - ], - |ok| Box::new(move |_t, code| (if ok { State::Nok } else { State::Ok }, Some(vec![code]))), - )(tokenizer, code) + // Link Enter:Data on the previous line to Enter:Data on this line. + let data_enter_prev = &mut tokenizer.events[exit_index - 2]; + data_enter_prev.next = Some(enter_next_index + 1); + let data_enter_next = &mut tokenizer.events[enter_next_index + 1]; + data_enter_next.previous = Some(exit_index - 2); + + // Potential next start. + exit_index = enter_next_index + 3; + enter_next_index = exit_index + 3; + } + + // Move to `Exit:Paragraph`. + index = exit_index; + } + + index += 1; + } + + edit_map.consume(&mut tokenizer.events) } diff --git a/src/construct/thematic_break.rs b/src/construct/thematic_break.rs index 9978ee0..8d29157 100644 --- a/src/construct/thematic_break.rs +++ b/src/construct/thematic_break.rs @@ -49,8 +49,8 @@ //! //! -use super::partial_space_or_tab::space_or_tab; -use crate::constant::THEMATIC_BREAK_MARKER_COUNT_MIN; +use super::partial_space_or_tab::{space_or_tab, space_or_tab_min_max}; +use crate::constant::{TAB_SIZE, THEMATIC_BREAK_MARKER_COUNT_MIN}; use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; /// Type of thematic break. @@ -122,7 +122,8 @@ struct Info { /// ``` pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { tokenizer.enter(TokenType::ThematicBreak); - tokenizer.attempt_opt(space_or_tab(), before)(tokenizer, code) + // To do: allow arbitrary when code (indented) is turned off. + tokenizer.go(space_or_tab_min_max(0, TAB_SIZE - 1), before)(tokenizer, code) } /// Start of a thematic break, after whitespace. 
@@ -157,6 +158,8 @@ fn at_break(tokenizer: &mut Tokenizer, code: Code, info: Info) -> StateFnResult if info.size >= THEMATIC_BREAK_MARKER_COUNT_MIN => { tokenizer.exit(TokenType::ThematicBreak); + // Feel free to interrupt. + tokenizer.interrupt = false; (State::Ok, Some(vec![code])) } Code::Char(char) if char == info.kind.as_char() => { diff --git a/src/content/flow.rs b/src/content/flow.rs index 0d3ede0..3ff948d 100644 --- a/src/content/flow.rs +++ b/src/content/flow.rs @@ -92,26 +92,6 @@ fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { } } -/// After a blank line. -/// -/// Move to `start` afterwards. -/// -/// ```markdown -/// ␠␠| -/// ``` -fn blank_line_after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { - match code { - Code::None => (State::Ok, None), - Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { - tokenizer.enter(TokenType::BlankLineEnding); - tokenizer.consume(code); - tokenizer.exit(TokenType::BlankLineEnding); - (State::Fn(Box::new(start)), None) - } - _ => unreachable!("expected eol/eof after blank line `{:?}`", code), - } -} - /// Before flow (initial). /// /// “Initial” flow means unprefixed flow, so right at the start of a line. @@ -133,16 +113,38 @@ fn initial_before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { Box::new(code_fenced), Box::new(html_flow), Box::new(heading_atx), + Box::new(heading_setext), Box::new(thematic_break), Box::new(definition), - Box::new(heading_setext), ], |ok| Box::new(if ok { after } else { before_paragraph }), )(tokenizer, code), } } -/// After a flow construct. +/// After a blank line. +/// +/// Move to `start` afterwards. 
+/// +/// ```markdown +/// ␠␠| +/// ``` +fn blank_line_after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::None => (State::Ok, None), + Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + tokenizer.enter(TokenType::BlankLineEnding); + tokenizer.consume(code); + tokenizer.exit(TokenType::BlankLineEnding); + // Feel free to interrupt. + tokenizer.interrupt = false; + (State::Fn(Box::new(start)), None) + } + _ => unreachable!("expected eol/eof after blank line `{:?}`", code), + } +} + +/// After something. /// /// ```markdown /// ## alpha| diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 817c1de..b70e706 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -1760,6 +1760,8 @@ pub struct Tokenizer<'a> { /// To do. pub label_start_list_loose: Vec, /// To do. + pub interrupt: bool, + /// To do. pub media_list: Vec, /// To do. resolvers: Vec>, @@ -1783,6 +1785,7 @@ impl<'a> Tokenizer<'a> { label_start_stack: vec![], label_start_list_loose: vec![], media_list: vec![], + interrupt: false, resolvers: vec![], resolver_ids: vec![], } diff --git a/tests/autolink.rs b/tests/autolink.rs index f0486ef..9c28834 100644 --- a/tests/autolink.rs +++ b/tests/autolink.rs @@ -252,7 +252,7 @@ fn autolink() { "should not support a dash before a dot in email autolinks" ); - // To do: extensions. + // To do: turning things off. // assert_eq!( // micromark("", {extensions: [{disable: {null: ["autolink"]}}]}), // "

<a@b.co>

", diff --git a/tests/character_escape.rs b/tests/character_escape.rs index 26e9336..6200014 100644 --- a/tests/character_escape.rs +++ b/tests/character_escape.rs @@ -79,7 +79,7 @@ fn character_escape() { "should escape in fenced code info" ); - // // To do: extensions + // // To do: turning things off // assert_eq!( // micromark("\\> a", {extensions: [{disable: {null: ["characterEscape"]}}]}), // "

\\> a

", diff --git a/tests/character_reference.rs b/tests/character_reference.rs index 3951e00..c87657e 100644 --- a/tests/character_reference.rs +++ b/tests/character_reference.rs @@ -190,7 +190,7 @@ fn character_reference() { "should not support the other characters inside a hexademical" ); - // To do: extensions. + // To do: turning things off. // assert_eq!( // micromark("&", { // extensions: [{disable: {null: ["characterReferences"]}}] diff --git a/tests/code_fenced.rs b/tests/code_fenced.rs index 0e19637..b7d8307 100644 --- a/tests/code_fenced.rs +++ b/tests/code_fenced.rs @@ -252,7 +252,7 @@ fn code_fenced() { // "should not support lazyness (3)" // ); - // To do: extensions. + // To do: turning things off. // assert_eq!( // micromark("```", {extensions: [{disable: {null: ["codeFenced"]}}]}), // "

```

", diff --git a/tests/code_indented.rs b/tests/code_indented.rs index 0190497..773e3d4 100644 --- a/tests/code_indented.rs +++ b/tests/code_indented.rs @@ -119,7 +119,7 @@ fn code_indented() { // "should not support lazyness (7)" // ); - // To do: extensions. + // To do: turning things off. // assert_eq!( // micromark(" a", {extensions: [{disable: {null: ["codeIndented"]}}]}), // "

a

", diff --git a/tests/definition.rs b/tests/definition.rs index ba4e384..df99f74 100644 --- a/tests/definition.rs +++ b/tests/definition.rs @@ -375,12 +375,11 @@ fn definition() { "should not support a final (unbalanced) right paren in a raw destination “before” a title" ); - // To do: do not let code (indented) interrupt definitions. - // assert_eq!( - // micromark(" [a]: b \"c\"\n [d]: e\n [f]: g \"h\"\n [i]: j\n\t[k]: l (m)\n\t n [k] o"), - // "

n k o

", - // "should support subsequent indented definitions" - // ); + assert_eq!( + micromark(" [a]: b \"c\"\n [d]: e\n [f]: g \"h\"\n [i]: j\n\t[k]: l (m)\n\t n [k] o"), + "

n k o

", + "should support subsequent indented definitions" + ); assert_eq!( micromark("[a\n b]: c\n\n[a\n b]"), @@ -406,7 +405,7 @@ fn definition() { "should not support definitions w/ text + a closing paren as a raw destination" ); - // To do: support turning off things. + // To do: turning things off. // assert_eq!( // micromark("[foo]: /url \"title\"", { // extensions: [{disable: {null: ["definition"]}}] diff --git a/tests/hard_break_escape.rs b/tests/hard_break_escape.rs index c4f6f1d..740e706 100644 --- a/tests/hard_break_escape.rs +++ b/tests/hard_break_escape.rs @@ -40,7 +40,7 @@ fn hard_break_escape() { "should not support escape hard breaks at the end of a heading" ); - // // To do: turning off things. + // // To do: turning things off. // assert_eq!( // micromark("a\\\nb", {extensions: [{disable: {null: ["hardBreakEscape"]}}]}), // "

a\\\nb

", diff --git a/tests/hard_break_trailing.rs b/tests/hard_break_trailing.rs index 0dbbbdb..2a4b534 100644 --- a/tests/hard_break_trailing.rs +++ b/tests/hard_break_trailing.rs @@ -118,7 +118,7 @@ fn hard_break_trailing() { // "should support a mixed line suffix after a span (3)" // ); - // // To do: turning off things. + // // To do: turning things off. // assert_eq!( // micromark("a \nb", {extensions: [{disable: {null: ["hardBreakTrailing"]}}]}), // "

a\nb

", diff --git a/tests/heading_atx.rs b/tests/heading_atx.rs index 2548056..ef5846a 100644 --- a/tests/heading_atx.rs +++ b/tests/heading_atx.rs @@ -196,7 +196,7 @@ fn heading_atx() { // "should not support lazyness (2)" // ); - // Extensions: + // To do: turning things off: // assert_eq!( // micromark("# a", {extensions: [{disable: {null: ["headingAtx"]}}]}), // "

# a

", diff --git a/tests/html_flow.rs b/tests/html_flow.rs index 455c5b8..3b69671 100644 --- a/tests/html_flow.rs +++ b/tests/html_flow.rs @@ -21,7 +21,7 @@ fn html_flow() { "should support a heading w/ rank 1" ); - // To do: extensions. + // To do: turning things off. // assert_eq!( // micromark_with_options("", {extensions: [{disable: {null: ["htmlFlow"]}}]}), // "

<x>

", @@ -789,12 +789,11 @@ fn html_flow_7_complete() { "should support interleaving w/ whitespace-only blank lines" ); - // To do: disallow html (complete) from interrupting. - // assert_eq!( - // micromark_with_options("Foo\n\nbaz", DANGER), - // "

Foo\n\nbaz

", - // "should not support interrupting paragraphs w/ complete tags" - // ); + assert_eq!( + micromark_with_options("Foo\n\nbaz", DANGER), + "

Foo\n\nbaz

", + "should not support interrupting paragraphs w/ complete tags" + ); assert_eq!( micromark_with_options(" a", DANGER), "should support an EOL in an instruction" ); - // To do: extensions. + // To do: turning things off. // assert_eq!( // micromark_with_options("a ", {extensions: [{disable: {null: ["htmlText"]}}]}), // "

a <x>

", diff --git a/tests/thematic_break.rs b/tests/thematic_break.rs index e71ae22..06b1193 100644 --- a/tests/thematic_break.rs +++ b/tests/thematic_break.rs @@ -169,7 +169,7 @@ fn thematic_break() { // "should not support lazyness (2)" // ); - // To do: extensions. + // To do: turning things off. // assert_eq!( // micromark("***", {extensions: [{disable: {null: ["thematicBreak"]}}]}), // "

***

", -- cgit