diff options
-rw-r--r-- | readme.md | 9 | ||||
-rw-r--r-- | src/construct/code_fenced.rs | 5 | ||||
-rw-r--r-- | src/construct/code_indented.rs | 11 | ||||
-rw-r--r-- | src/construct/definition.rs | 17 | ||||
-rw-r--r-- | src/construct/heading_atx.rs | 9 | ||||
-rw-r--r-- | src/construct/heading_setext.rs | 233 | ||||
-rw-r--r-- | src/construct/html_flow.rs | 19 | ||||
-rw-r--r-- | src/construct/paragraph.rs | 150 | ||||
-rw-r--r-- | src/construct/thematic_break.rs | 9 | ||||
-rw-r--r-- | src/content/flow.rs | 46 | ||||
-rw-r--r-- | src/tokenizer.rs | 3 | ||||
-rw-r--r-- | tests/autolink.rs | 2 | ||||
-rw-r--r-- | tests/character_escape.rs | 2 | ||||
-rw-r--r-- | tests/character_reference.rs | 2 | ||||
-rw-r--r-- | tests/code_fenced.rs | 2 | ||||
-rw-r--r-- | tests/code_indented.rs | 2 | ||||
-rw-r--r-- | tests/definition.rs | 13 | ||||
-rw-r--r-- | tests/hard_break_escape.rs | 2 | ||||
-rw-r--r-- | tests/hard_break_trailing.rs | 2 | ||||
-rw-r--r-- | tests/heading_atx.rs | 2 | ||||
-rw-r--r-- | tests/html_flow.rs | 13 | ||||
-rw-r--r-- | tests/html_text.rs | 2 | ||||
-rw-r--r-- | tests/thematic_break.rs | 2 |
23 files changed, 230 insertions, 327 deletions
@@ -46,11 +46,6 @@ cargo doc --document-private-items ### Some major obstacles -- [ ] (8) Can paragraphs operate more performantly than checking whether other - flow constructs start a line, before exiting and actually attempting flow - constructs? -- [ ] (3) Interrupting: sometimes flow can or cannot start depending on the - previous construct (paragraph, definition) - [ ] (5) Containers: this will be rather messy, and depends a lot on how subtokenization is solved - [ ] (3) Concrete constructs: HTML or code (fenced) cannot be “pierced” into by @@ -132,7 +127,6 @@ cargo doc --document-private-items #### Parse -- [ ] (3) Interrupting (html flow complete, definition + code_indented) - [ ] (5) attention\ test (`character_reference`, `hard_break_escape`, `hard_break_trailing`, `heading_atx`, `heading_setext`, `html_flow`, `thematic_break`)\ @@ -274,3 +268,6 @@ important. - [x] (1) Parse initial and final space_or_tab of paragraphs (in string, text) - [x] (1) Refactor to clean and document `space_or_tab` - [x] (1) Refactor to clean and document `edit_map` +- [x] (8) Make paragraphs fast by merging them at the end, not checking whether + things interrupt them each line +- [x] (3) Add support for interrupting (or not) diff --git a/src/construct/code_fenced.rs b/src/construct/code_fenced.rs index d19cad0..f2d243a 100644 --- a/src/construct/code_fenced.rs +++ b/src/construct/code_fenced.rs @@ -179,7 +179,8 @@ struct Info { pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { tokenizer.enter(TokenType::CodeFenced); tokenizer.enter(TokenType::CodeFencedFence); - tokenizer.attempt_opt(space_or_tab(), before_sequence_open)(tokenizer, code) + // To do: allow arbitrary when code (indented) is turned off. + tokenizer.go(space_or_tab_min_max(0, TAB_SIZE - 1), before_sequence_open)(tokenizer, code) } /// Inside the opening fence, after an optional prefix, before a sequence. @@ -550,5 +551,7 @@ fn content_continue(tokenizer: &mut Tokenizer, code: Code, info: Info) -> StateF /// ``` fn after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { tokenizer.exit(TokenType::CodeFenced); + // Feel free to interrupt. + tokenizer.interrupt = false; (State::Ok, Some(vec![code])) } diff --git a/src/construct/code_indented.rs b/src/construct/code_indented.rs index 99445b9..9bdfd71 100644 --- a/src/construct/code_indented.rs +++ b/src/construct/code_indented.rs @@ -59,8 +59,13 @@ use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; /// > filled line (that it has a non-whitespace character), because blank lines /// > are parsed already, so we never run into that. pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { - tokenizer.enter(TokenType::CodeIndented); - tokenizer.go(space_or_tab_min_max(TAB_SIZE, TAB_SIZE), at_break)(tokenizer, code) + // Do not interrupt paragraphs. + if tokenizer.interrupt { + (State::Nok, None) + } else { + tokenizer.enter(TokenType::CodeIndented); + tokenizer.go(space_or_tab_min_max(TAB_SIZE, TAB_SIZE), at_break)(tokenizer, code) + } } /// At a break. @@ -110,6 +115,8 @@ fn content(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { /// ``` fn after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { tokenizer.exit(TokenType::CodeIndented); + // Feel free to interrupt. + tokenizer.interrupt = false; (State::Ok, Some(vec![code])) } diff --git a/src/construct/definition.rs b/src/construct/definition.rs index f05064a..e1afd03 100644 --- a/src/construct/definition.rs +++ b/src/construct/definition.rs @@ -107,8 +107,19 @@ use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; /// |[a]: b "c" /// ``` pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { - tokenizer.enter(TokenType::Definition); - tokenizer.attempt_opt(space_or_tab(), before)(tokenizer, code) + let index = tokenizer.events.len(); + let definition_before = index > 3 + && tokenizer.events[index - 1].token_type == TokenType::LineEnding + && tokenizer.events[index - 3].token_type == TokenType::Definition; + + // Do not interrupt paragraphs (but do follow definitions). + if tokenizer.interrupt && !definition_before { + (State::Nok, None) + } else { + tokenizer.enter(TokenType::Definition); + // Note: arbitrary whitespace allowed even if code (indented) is on. + tokenizer.attempt_opt(space_or_tab(), before)(tokenizer, code) + } } /// At the start of a definition, after whitespace. @@ -218,6 +229,8 @@ fn after_whitespace(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { match code { Code::None | Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => { tokenizer.exit(TokenType::Definition); + // You’d be interrupting. + tokenizer.interrupt = true; (State::Ok, Some(vec![code])) } _ => (State::Nok, None), diff --git a/src/construct/heading_atx.rs b/src/construct/heading_atx.rs index 2811894..3ce7052 100644 --- a/src/construct/heading_atx.rs +++ b/src/construct/heading_atx.rs @@ -54,8 +54,8 @@ //! [wiki-setext]: https://en.wikipedia.org/wiki/Setext //! [atx]: http://www.aaronsw.com/2002/atx/ -use super::partial_space_or_tab::space_or_tab; -use crate::constant::HEADING_ATX_OPENING_FENCE_SIZE_MAX; +use super::partial_space_or_tab::{space_or_tab, space_or_tab_min_max}; +use crate::constant::{HEADING_ATX_OPENING_FENCE_SIZE_MAX, TAB_SIZE}; use crate::tokenizer::{ Code, ContentType, Event, EventType, State, StateFnResult, TokenType, Tokenizer, }; @@ -68,7 +68,8 @@ use crate::util::edit_map::EditMap; /// ``` pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { tokenizer.enter(TokenType::HeadingAtx); - tokenizer.attempt_opt(space_or_tab(), before)(tokenizer, code) + // To do: allow arbitrary when code (indented) is turned off. + tokenizer.go(space_or_tab_min_max(0, TAB_SIZE - 1), before)(tokenizer, code) } /// Start of a heading (atx), after whitespace. @@ -127,6 +128,8 @@ fn at_break(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { tokenizer.exit(TokenType::HeadingAtx); tokenizer.register_resolver("heading_atx".to_string(), Box::new(resolve)); + // Feel free to interrupt. + tokenizer.interrupt = false; (State::Ok, Some(vec![code])) } Code::VirtualSpace | Code::Char('\t' | ' ') => { diff --git a/src/construct/heading_setext.rs b/src/construct/heading_setext.rs index 03a2e55..df20aa7 100644 --- a/src/construct/heading_setext.rs +++ b/src/construct/heading_setext.rs @@ -58,10 +58,9 @@ //! [atx]: http://www.aaronsw.com/2002/atx/ use crate::constant::TAB_SIZE; -use crate::construct::partial_space_or_tab::{space_or_tab, space_or_tab_with_options, Options}; -use crate::subtokenize::link; -use crate::tokenizer::{Code, ContentType, State, StateFnResult, TokenType, Tokenizer}; -use crate::util::span::from_exit_event; +use crate::construct::partial_space_or_tab::{space_or_tab, space_or_tab_min_max}; +use crate::tokenizer::{Code, Event, EventType, State, StateFnResult, TokenType, Tokenizer}; +use crate::util::edit_map::EditMap; /// Kind of underline. #[derive(Debug, Clone, PartialEq)] @@ -109,150 +108,23 @@ impl Kind { } } -/// Start of a heading (setext). -/// -/// ```markdown -/// |alpha -/// == -/// ``` -pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { - tokenizer.enter(TokenType::HeadingSetext); - tokenizer.attempt_opt(space_or_tab(), before)(tokenizer, code) -} - -/// Start of a heading (setext), after whitespace. -/// -/// ```markdown -/// |alpha -/// == -/// ``` -fn before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { - match code { - Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { - unreachable!("expected non-eol/eof"); - } - _ => { - tokenizer.enter(TokenType::HeadingSetextText); - tokenizer.enter_with_content(TokenType::Data, Some(ContentType::Text)); - text_inside(tokenizer, code) - } - } -} - -/// Inside text. -/// -/// ```markdown -/// al|pha -/// bra|vo -/// == -/// ``` -fn text_inside(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { - match code { - Code::None => (State::Nok, None), - Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { - tokenizer.exit(TokenType::Data); - tokenizer.exit(TokenType::HeadingSetextText); - tokenizer.attempt(underline_before, |ok| { - Box::new(if ok { after } else { text_continue }) - })(tokenizer, code) - } - _ => { - tokenizer.consume(code); - (State::Fn(Box::new(text_inside)), None) - } - } -} - -/// At a line ending, not at an underline. -/// -/// ```markdown -/// alpha -/// |bravo -/// == -/// ``` -fn text_continue(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { - // Needed to connect the text. - tokenizer.enter(TokenType::HeadingSetextText); - tokenizer.events.pop(); - tokenizer.events.pop(); - - match code { - Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { - tokenizer.enter_with_content(TokenType::LineEnding, Some(ContentType::Text)); - let index = tokenizer.events.len() - 1; - link(&mut tokenizer.events, index); - tokenizer.consume(code); - tokenizer.exit(TokenType::LineEnding); - - ( - State::Fn(Box::new(tokenizer.attempt_opt( - space_or_tab_with_options(Options { - kind: TokenType::SpaceOrTab, - min: 1, - max: usize::MAX, - content_type: Some(ContentType::Text), - connect: true, - }), - text_line_start, - ))), - None, - ) - } - _ => unreachable!("expected eol"), - } -} - -/// At a line ending after whitespace, not at an underline. -/// -/// ```markdown -/// alpha -/// |bravo -/// == -/// ``` -fn text_line_start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { - match code { - // Blank lines not allowed. - Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => (State::Nok, None), - _ => { - tokenizer.enter_with_content(TokenType::Data, Some(ContentType::Text)); - let index = tokenizer.events.len() - 1; - link(&mut tokenizer.events, index); - text_inside(tokenizer, code) - } - } -} - -/// After a heading (setext). -/// -/// ```markdown -/// alpha -/// ==| -/// ``` -fn after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { - tokenizer.exit(TokenType::HeadingSetext); - (State::Ok, Some(vec![code])) -} - /// At a line ending, presumably an underline. /// /// ```markdown /// alpha| /// == /// ``` -fn underline_before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { - match code { - Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { - tokenizer.enter(TokenType::LineEnding); - tokenizer.consume(code); - tokenizer.exit(TokenType::LineEnding); - ( - State::Fn(Box::new( - tokenizer.attempt_opt(space_or_tab(), underline_sequence_start), - )), - None, - ) - } - _ => unreachable!("expected eol"), +pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + let index = tokenizer.events.len(); + let paragraph_before = index > 3 + && tokenizer.events[index - 1].token_type == TokenType::LineEnding + && tokenizer.events[index - 3].token_type == TokenType::Paragraph; + + if paragraph_before { + // To do: allow arbitrary when code (indented) is turned off. + tokenizer.go(space_or_tab_min_max(0, TAB_SIZE - 1), before)(tokenizer, code) + } else { + (State::Nok, None) } } @@ -262,26 +134,11 @@ fn underline_before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { /// alpha /// |== /// ``` -fn underline_sequence_start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { - let tail = tokenizer.events.last(); - let mut prefix = 0; - - if let Some(event) = tail { - if event.token_type == TokenType::SpaceOrTab { - let span = from_exit_event(&tokenizer.events, tokenizer.events.len() - 1); - prefix = span.end_index - span.start_index; - } - } - - // To do: 4+ should be okay if code (indented) is turned off! - if prefix >= TAB_SIZE { - return (State::Nok, None); - } - +fn before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { match code { Code::Char(char) if char == '-' || char == '=' => { tokenizer.enter(TokenType::HeadingSetextUnderline); - underline_sequence_inside(tokenizer, code, Kind::from_char(char)) + inside(tokenizer, code, Kind::from_char(char)) } _ => (State::Nok, None), } @@ -293,16 +150,13 @@ fn underline_sequence_start(tokenizer: &mut Tokenizer, code: Code) -> StateFnRes /// alpha /// =|= /// ``` -fn underline_sequence_inside(tokenizer: &mut Tokenizer, code: Code, kind: Kind) -> StateFnResult { +fn inside(tokenizer: &mut Tokenizer, code: Code, kind: Kind) -> StateFnResult { match code { Code::Char(char) if char == kind.as_char() => { tokenizer.consume(code); - ( - State::Fn(Box::new(move |t, c| underline_sequence_inside(t, c, kind))), - None, - ) + (State::Fn(Box::new(move |t, c| inside(t, c, kind))), None) } - _ => tokenizer.attempt_opt(space_or_tab(), underline_after)(tokenizer, code), + _ => tokenizer.attempt_opt(space_or_tab(), after)(tokenizer, code), } } @@ -312,12 +166,59 @@ fn underline_sequence_inside(tokenizer: &mut Tokenizer, code: Code, kind: Kind) /// alpha /// ==| /// ``` -fn underline_after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { +fn after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { match code { Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { tokenizer.exit(TokenType::HeadingSetextUnderline); + // Feel free to interrupt. + tokenizer.interrupt = false; + tokenizer.register_resolver("heading_setext".to_string(), Box::new(resolve)); (State::Ok, Some(vec![code])) } _ => (State::Nok, None), } } + +/// To do. +pub fn resolve(tokenizer: &mut Tokenizer) -> Vec<Event> { + let mut edit_map = EditMap::new(); + let mut index = 0; + let mut paragraph_enter: Option<usize> = None; + let mut paragraph_exit: Option<usize> = None; + + while index < tokenizer.events.len() { + let event = &tokenizer.events[index]; + + // Find paragraphs. + if event.event_type == EventType::Enter { + if event.token_type == TokenType::Paragraph { + paragraph_enter = Some(index); + } + } else if event.token_type == TokenType::Paragraph { + paragraph_exit = Some(index); + } + // We know this is preceded by a paragraph. + // Otherwise we don’t parse. + else if event.token_type == TokenType::HeadingSetextUnderline { + let enter = paragraph_enter.take().unwrap(); + let exit = paragraph_exit.take().unwrap(); + + // Change types of Enter:Paragraph, Exit:Paragraph. + tokenizer.events[enter].token_type = TokenType::HeadingSetextText; + tokenizer.events[exit].token_type = TokenType::HeadingSetextText; + + // Add of Enter:HeadingSetext, Exit:HeadingSetext. + let mut heading_enter = tokenizer.events[enter].clone(); + heading_enter.token_type = TokenType::HeadingSetext; + let mut heading_exit = tokenizer.events[index].clone(); + heading_exit.token_type = TokenType::HeadingSetext; + + edit_map.add(enter, 0, vec![heading_enter]); + edit_map.add(index + 1, 0, vec![heading_exit]); + } + + index += 1; + } + + edit_map.consume(&mut tokenizer.events) +} diff --git a/src/construct/html_flow.rs b/src/construct/html_flow.rs index d0e0558..a1bddad 100644 --- a/src/construct/html_flow.rs +++ b/src/construct/html_flow.rs @@ -98,8 +98,10 @@ //! [html_block_names]: crate::constant::HTML_BLOCK_NAMES //! [html-parsing]: https://html.spec.whatwg.org/multipage/parsing.html#parsing -use crate::constant::{HTML_BLOCK_NAMES, HTML_RAW_NAMES, HTML_RAW_SIZE_MAX}; -use crate::construct::{blank_line::start as blank_line, partial_space_or_tab::space_or_tab}; +use crate::constant::{HTML_BLOCK_NAMES, HTML_RAW_NAMES, HTML_RAW_SIZE_MAX, TAB_SIZE}; +use crate::construct::{ + blank_line::start as blank_line, partial_space_or_tab::space_or_tab_min_max, +}; use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; /// Kind of HTML (flow). @@ -191,7 +193,8 @@ struct Info { pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { tokenizer.enter(TokenType::HtmlFlow); tokenizer.enter(TokenType::HtmlFlowData); - tokenizer.attempt_opt(space_or_tab(), before)(tokenizer, code) + // To do: allow arbitrary when code (indented) is turned off. + tokenizer.go(space_or_tab_min_max(0, TAB_SIZE - 1), before)(tokenizer, code) } /// After optional whitespace, before `<`. @@ -400,8 +403,10 @@ fn tag_name(tokenizer: &mut Tokenizer, code: Code, mut info: Info) -> StateFnRes } else { info.kind = Kind::Complete; - // To do: do not support complete HTML when interrupting. - if info.start_tag { + // Do not support complete HTML when interrupting. + if tokenizer.interrupt { + (State::Nok, None) + } else if info.start_tag { complete_attribute_name_before(tokenizer, code, info) } else { complete_closing_tag_after(tokenizer, code, info) @@ -784,6 +789,8 @@ fn html_continue_start(tokenizer: &mut Tokenizer, code: Code, info: Info) -> Sta match code { Code::None => { tokenizer.exit(TokenType::HtmlFlow); + // Feel free to interrupt. + tokenizer.interrupt = false; (State::Ok, Some(vec![code])) } // To do: do not allow lazy lines. @@ -949,6 +956,8 @@ fn continuation_close(tokenizer: &mut Tokenizer, code: Code, info: Info) -> Stat Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { tokenizer.exit(TokenType::HtmlFlowData); tokenizer.exit(TokenType::HtmlFlow); + // Feel free to interrupt. + tokenizer.interrupt = false; (State::Ok, Some(vec![code])) } _ => { diff --git a/src/construct/paragraph.rs b/src/construct/paragraph.rs index fea7052..ae2f4de 100644 --- a/src/construct/paragraph.rs +++ b/src/construct/paragraph.rs @@ -32,14 +32,10 @@ //! [code_text]: crate::construct::code_text //! [html]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-p-element -use crate::constant::TAB_SIZE; -use crate::construct::{ - blank_line::start as blank_line, code_fenced::start as code_fenced, - heading_atx::start as heading_atx, html_flow::start as html_flow, - partial_space_or_tab::space_or_tab_min_max, thematic_break::start as thematic_break, +use crate::tokenizer::{ + Code, ContentType, Event, EventType, State, StateFnResult, TokenType, Tokenizer, }; -use crate::subtokenize::link; -use crate::tokenizer::{Code, ContentType, State, StateFnResult, TokenType, Tokenizer}; +use crate::util::edit_map::EditMap; /// Before a paragraph. /// @@ -66,11 +62,14 @@ pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { /// ``` fn inside(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { match code { - Code::None => end(tokenizer, code), - Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => tokenizer - .check(interrupt, |ok| { - Box::new(if ok { at_line_ending } else { end }) - })(tokenizer, code), + Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + tokenizer.exit(TokenType::Data); + tokenizer.exit(TokenType::Paragraph); + tokenizer.register_resolver_before("paragraph".to_string(), Box::new(resolve)); + // You’d be interrupting. + tokenizer.interrupt = true; + (State::Ok, Some(vec![code])) + } _ => { tokenizer.consume(code); (State::Fn(Box::new(inside)), None) @@ -78,90 +77,55 @@ fn inside(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { } } -/// At a line ending, not interrupting. -/// -/// ```markdown -/// alpha| -/// bravo. -/// ``` -fn at_line_ending(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { - tokenizer.consume(code); - tokenizer.exit(TokenType::Data); - tokenizer.enter_with_content(TokenType::Data, Some(ContentType::Text)); - let index = tokenizer.events.len() - 1; - link(&mut tokenizer.events, index); - (State::Fn(Box::new(inside)), None) -} +/// Merge “`Paragraph`”s, which currently span a single line, into actual +/// `Paragraph`s that span multiple lines. +pub fn resolve(tokenizer: &mut Tokenizer) -> Vec<Event> { + let mut edit_map = EditMap::new(); + let len = tokenizer.events.len(); + let mut index = 0; -/// At a line ending, done. -/// -/// ```markdown -/// alpha| -/// *** -/// ``` -fn end(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { - tokenizer.exit(TokenType::Data); - tokenizer.exit(TokenType::Paragraph); - (State::Ok, Some(vec![code])) -} + while index < len { + let event = &tokenizer.events[index]; -/// Before a potential interruption. -/// -/// ```markdown -/// alpha| -/// *** -/// ``` -fn interrupt(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { - match code { - Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { - tokenizer.enter(TokenType::LineEnding); - tokenizer.consume(code); - tokenizer.exit(TokenType::LineEnding); - (State::Fn(Box::new(interrupt_start)), None) - } - _ => unreachable!("expected eol"), - } -} + if event.event_type == EventType::Enter && event.token_type == TokenType::Paragraph { + // Exit:Paragraph + let mut exit_index = index + 3; + // Enter:Paragraph + let mut enter_next_index = exit_index + 3; -/// After a line ending. -/// -/// ```markdown -/// alpha -/// |~~~js -/// ~~~ -/// ``` -fn interrupt_start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { - // To do: If code is disabled, indented lines are allowed to interrupt. - tokenizer.attempt(space_or_tab_min_max(TAB_SIZE, TAB_SIZE), |ok| { - Box::new(if ok { interrupt_indent } else { interrupt_cont }) - })(tokenizer, code) -} + // To do: assert that `LineEnding` between? + while enter_next_index < len + && tokenizer.events[enter_next_index].token_type == TokenType::Paragraph + { + // Remove Exit:Paragraph, Enter:LineEnding, Exit:LineEnding, Enter:Paragraph. + edit_map.add(exit_index, 4, vec![]); + println!("rm {:?} {:?}", exit_index, exit_index + 4); -/// At an indent. -/// -/// ```markdown -/// alpha -/// | -/// ``` -fn interrupt_indent(_tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { - (State::Ok, Some(vec![code])) -} + // Add Exit:LineEnding position info to Exit:Data. + let line_ending_exit = &tokenizer.events[enter_next_index - 1]; + let line_ending_point = line_ending_exit.point.clone(); + let line_ending_index = line_ending_exit.index; + let data_exit = &mut tokenizer.events[exit_index - 1]; + data_exit.point = line_ending_point; + data_exit.index = line_ending_index; -/// Not at an indented line. -/// -/// ```markdown -/// alpha -/// |<div> -/// ``` -fn interrupt_cont(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { - tokenizer.attempt_n( - vec![ - Box::new(blank_line), - Box::new(code_fenced), - Box::new(html_flow), - Box::new(heading_atx), - Box::new(thematic_break), - ], - |ok| Box::new(move |_t, code| (if ok { State::Nok } else { State::Ok }, Some(vec![code]))), - )(tokenizer, code) + // Link Enter:Data on the previous line to Enter:Data on this line. + let data_enter_prev = &mut tokenizer.events[exit_index - 2]; + data_enter_prev.next = Some(enter_next_index + 1); + let data_enter_next = &mut tokenizer.events[enter_next_index + 1]; + data_enter_next.previous = Some(exit_index - 2); + + // Potential next start. + exit_index = enter_next_index + 3; + enter_next_index = exit_index + 3; + } + + // Move to `Exit:Paragraph`. + index = exit_index; + } + + index += 1; + } + + edit_map.consume(&mut tokenizer.events) } diff --git a/src/construct/thematic_break.rs b/src/construct/thematic_break.rs index 9978ee0..8d29157 100644 --- a/src/construct/thematic_break.rs +++ b/src/construct/thematic_break.rs @@ -49,8 +49,8 @@ //! //! <!-- To do: link `lists` --> -use super::partial_space_or_tab::space_or_tab; -use crate::constant::THEMATIC_BREAK_MARKER_COUNT_MIN; +use super::partial_space_or_tab::{space_or_tab, space_or_tab_min_max}; +use crate::constant::{TAB_SIZE, THEMATIC_BREAK_MARKER_COUNT_MIN}; use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; /// Type of thematic break. @@ -122,7 +122,8 @@ struct Info { /// ``` pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { tokenizer.enter(TokenType::ThematicBreak); - tokenizer.attempt_opt(space_or_tab(), before)(tokenizer, code) + // To do: allow arbitrary when code (indented) is turned off. + tokenizer.go(space_or_tab_min_max(0, TAB_SIZE - 1), before)(tokenizer, code) } /// Start of a thematic break, after whitespace. @@ -157,6 +158,8 @@ fn at_break(tokenizer: &mut Tokenizer, code: Code, info: Info) -> StateFnResult if info.size >= THEMATIC_BREAK_MARKER_COUNT_MIN => { tokenizer.exit(TokenType::ThematicBreak); + // Feel free to interrupt. + tokenizer.interrupt = false; (State::Ok, Some(vec![code])) } Code::Char(char) if char == info.kind.as_char() => { diff --git a/src/content/flow.rs b/src/content/flow.rs index 0d3ede0..3ff948d 100644 --- a/src/content/flow.rs +++ b/src/content/flow.rs @@ -92,26 +92,6 @@ fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { } } -/// After a blank line. -/// -/// Move to `start` afterwards. -/// -/// ```markdown -/// ␠␠| -/// ``` -fn blank_line_after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { - match code { - Code::None => (State::Ok, None), - Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { - tokenizer.enter(TokenType::BlankLineEnding); - tokenizer.consume(code); - tokenizer.exit(TokenType::BlankLineEnding); - (State::Fn(Box::new(start)), None) - } - _ => unreachable!("expected eol/eof after blank line `{:?}`", code), - } -} - /// Before flow (initial). /// /// “Initial” flow means unprefixed flow, so right at the start of a line. @@ -133,16 +113,38 @@ fn initial_before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { Box::new(code_fenced), Box::new(html_flow), Box::new(heading_atx), + Box::new(heading_setext), Box::new(thematic_break), Box::new(definition), - Box::new(heading_setext), ], |ok| Box::new(if ok { after } else { before_paragraph }), )(tokenizer, code), } } -/// After a flow construct. +/// After a blank line. +/// +/// Move to `start` afterwards. +/// +/// ```markdown +/// ␠␠| +/// ``` +fn blank_line_after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::None => (State::Ok, None), + Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + tokenizer.enter(TokenType::BlankLineEnding); + tokenizer.consume(code); + tokenizer.exit(TokenType::BlankLineEnding); + // Feel free to interrupt. + tokenizer.interrupt = false; + (State::Fn(Box::new(start)), None) + } + _ => unreachable!("expected eol/eof after blank line `{:?}`", code), + } +} + +/// After something. /// /// ```markdown /// ## alpha| diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 817c1de..b70e706 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -1760,6 +1760,8 @@ pub struct Tokenizer<'a> { /// To do. pub label_start_list_loose: Vec<LabelStart>, /// To do. + pub interrupt: bool, + /// To do. pub media_list: Vec<Media>, /// To do. resolvers: Vec<Box<Resolver>>, @@ -1783,6 +1785,7 @@ impl<'a> Tokenizer<'a> { label_start_stack: vec![], label_start_list_loose: vec![], media_list: vec![], + interrupt: false, resolvers: vec![], resolver_ids: vec![], } diff --git a/tests/autolink.rs b/tests/autolink.rs index f0486ef..9c28834 100644 --- a/tests/autolink.rs +++ b/tests/autolink.rs @@ -252,7 +252,7 @@ fn autolink() { "should not support a dash before a dot in email autolinks" ); - // To do: extensions. + // To do: turning things off. // assert_eq!( // micromark("<a@b.co>", {extensions: [{disable: {null: ["autolink"]}}]}), // "<p><a@b.co></p>", diff --git a/tests/character_escape.rs b/tests/character_escape.rs index 26e9336..6200014 100644 --- a/tests/character_escape.rs +++ b/tests/character_escape.rs @@ -79,7 +79,7 @@ fn character_escape() { "should escape in fenced code info" ); - // // To do: extensions + // // To do: turning things off // assert_eq!( // micromark("\\> a", {extensions: [{disable: {null: ["characterEscape"]}}]}), // "<p>\\> a</p>", diff --git a/tests/character_reference.rs b/tests/character_reference.rs index 3951e00..c87657e 100644 --- a/tests/character_reference.rs +++ b/tests/character_reference.rs @@ -190,7 +190,7 @@ fn character_reference() { "should not support the other characters inside a hexademical" ); - // To do: extensions. + // To do: turning things off. // assert_eq!( // micromark("&", { // extensions: [{disable: {null: ["characterReferences"]}}] diff --git a/tests/code_fenced.rs b/tests/code_fenced.rs index 0e19637..b7d8307 100644 --- a/tests/code_fenced.rs +++ b/tests/code_fenced.rs @@ -252,7 +252,7 @@ fn code_fenced() { // "should not support lazyness (3)" // ); - // To do: extensions. + // To do: turning things off. // assert_eq!( // micromark("```", {extensions: [{disable: {null: ["codeFenced"]}}]}), // "<p>```</p>", diff --git a/tests/code_indented.rs b/tests/code_indented.rs index 0190497..773e3d4 100644 --- a/tests/code_indented.rs +++ b/tests/code_indented.rs @@ -119,7 +119,7 @@ fn code_indented() { // "should not support lazyness (7)" // ); - // To do: extensions. + // To do: turning things off. // assert_eq!( // micromark(" a", {extensions: [{disable: {null: ["codeIndented"]}}]}), // "<p>a</p>", diff --git a/tests/definition.rs b/tests/definition.rs index ba4e384..df99f74 100644 --- a/tests/definition.rs +++ b/tests/definition.rs @@ -375,12 +375,11 @@ fn definition() { "should not support a final (unbalanced) right paren in a raw destination “before” a title" ); - // To do: do not let code (indented) interrupt definitions. - // assert_eq!( - // micromark(" [a]: b \"c\"\n [d]: e\n [f]: g \"h\"\n [i]: j\n\t[k]: l (m)\n\t n [k] o"), - // "<p>n <a href=\"l\" title=\"m\">k</a> o</p>", - // "should support subsequent indented definitions" - // ); + assert_eq!( + micromark(" [a]: b \"c\"\n [d]: e\n [f]: g \"h\"\n [i]: j\n\t[k]: l (m)\n\t n [k] o"), + "<p>n <a href=\"l\" title=\"m\">k</a> o</p>", + "should support subsequent indented definitions" + ); assert_eq!( micromark("[a\n b]: c\n\n[a\n b]"), @@ -406,7 +405,7 @@ fn definition() { "should not support definitions w/ text + a closing paren as a raw destination" ); - // To do: support turning off things. + // To do: turning things off. // assert_eq!( // micromark("[foo]: /url \"title\"", { // extensions: [{disable: {null: ["definition"]}}] diff --git a/tests/hard_break_escape.rs b/tests/hard_break_escape.rs index c4f6f1d..740e706 100644 --- a/tests/hard_break_escape.rs +++ b/tests/hard_break_escape.rs @@ -40,7 +40,7 @@ fn hard_break_escape() { "should not support escape hard breaks at the end of a heading" ); - // // To do: turning off things. + // // To do: turning things off. // assert_eq!( // micromark("a\\\nb", {extensions: [{disable: {null: ["hardBreakEscape"]}}]}), // "<p>a\\\nb</p>", diff --git a/tests/hard_break_trailing.rs b/tests/hard_break_trailing.rs index 0dbbbdb..2a4b534 100644 --- a/tests/hard_break_trailing.rs +++ b/tests/hard_break_trailing.rs @@ -118,7 +118,7 @@ fn hard_break_trailing() { // "should support a mixed line suffix after a span (3)" // ); - // // To do: turning off things. + // // To do: turning things off. // assert_eq!( // micromark("a \nb", {extensions: [{disable: {null: ["hardBreakTrailing"]}}]}), // "<p>a\nb</p>", diff --git a/tests/heading_atx.rs b/tests/heading_atx.rs index 2548056..ef5846a 100644 --- a/tests/heading_atx.rs +++ b/tests/heading_atx.rs @@ -196,7 +196,7 @@ fn heading_atx() { // "should not support lazyness (2)" // ); - // Extensions: + // To do: turning things off: // assert_eq!( // micromark("# a", {extensions: [{disable: {null: ["headingAtx"]}}]}), // "<p># a</p>", diff --git a/tests/html_flow.rs b/tests/html_flow.rs index 455c5b8..3b69671 100644 --- a/tests/html_flow.rs +++ b/tests/html_flow.rs @@ -21,7 +21,7 @@ fn html_flow() { "should support a heading w/ rank 1" ); - // To do: extensions. + // To do: turning things off. // assert_eq!( // micromark_with_options("<x>", {extensions: [{disable: {null: ["htmlFlow"]}}]}), // "<p><x></p>", @@ -789,12 +789,11 @@ fn html_flow_7_complete() { "should support interleaving w/ whitespace-only blank lines" ); - // To do: disallow html (complete) from interrupting. - // assert_eq!( - // micromark_with_options("Foo\n<a href=\"bar\">\nbaz", DANGER), - // "<p>Foo\n<a href=\"bar\">\nbaz</p>", - // "should not support interrupting paragraphs w/ complete tags" - // ); + assert_eq!( + micromark_with_options("Foo\n<a href=\"bar\">\nbaz", DANGER), + "<p>Foo\n<a href=\"bar\">\nbaz</p>", + "should not support interrupting paragraphs w/ complete tags" + ); assert_eq!( micromark_with_options("<x", DANGER), diff --git a/tests/html_text.rs b/tests/html_text.rs index e70a4da..0288af7 100644 --- a/tests/html_text.rs +++ b/tests/html_text.rs @@ -418,7 +418,7 @@ micromark_with_options("<x> a", DANGER), "should support an EOL in an instruction" ); - // To do: extensions. + // To do: turning things off. // assert_eq!( // micromark_with_options("a <x>", {extensions: [{disable: {null: ["htmlText"]}}]}), // "<p>a <x></p>", diff --git a/tests/thematic_break.rs b/tests/thematic_break.rs index e71ae22..06b1193 100644 --- a/tests/thematic_break.rs +++ b/tests/thematic_break.rs @@ -169,7 +169,7 @@ fn thematic_break() { // "should not support lazyness (2)" // ); - // To do: extensions. + // To do: turning things off. // assert_eq!( // micromark("***", {extensions: [{disable: {null: ["thematicBreak"]}}]}), // "<p>***</p>", |