about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- readme.md 9
-rw-r--r-- src/construct/code_fenced.rs 5
-rw-r--r-- src/construct/code_indented.rs 11
-rw-r--r-- src/construct/definition.rs 17
-rw-r--r-- src/construct/heading_atx.rs 9
-rw-r--r-- src/construct/heading_setext.rs 233
-rw-r--r-- src/construct/html_flow.rs 19
-rw-r--r-- src/construct/paragraph.rs 150
-rw-r--r-- src/construct/thematic_break.rs 9
-rw-r--r-- src/content/flow.rs 46
-rw-r--r-- src/tokenizer.rs 3
-rw-r--r-- tests/autolink.rs 2
-rw-r--r-- tests/character_escape.rs 2
-rw-r--r-- tests/character_reference.rs 2
-rw-r--r-- tests/code_fenced.rs 2
-rw-r--r-- tests/code_indented.rs 2
-rw-r--r-- tests/definition.rs 13
-rw-r--r-- tests/hard_break_escape.rs 2
-rw-r--r-- tests/hard_break_trailing.rs 2
-rw-r--r-- tests/heading_atx.rs 2
-rw-r--r-- tests/html_flow.rs 13
-rw-r--r-- tests/html_text.rs 2
-rw-r--r-- tests/thematic_break.rs 2
23 files changed, 230 insertions, 327 deletions
diff --git a/readme.md b/readme.md
index f7847dc..103b201 100644
--- a/readme.md
+++ b/readme.md
@@ -46,11 +46,6 @@ cargo doc --document-private-items
### Some major obstacles
-- [ ] (8) Can paragraphs operate more performantly than checking whether other
- flow constructs start a line, before exiting and actually attempting flow
- constructs?
-- [ ] (3) Interrupting: sometimes flow can or cannot start depending on the
- previous construct (paragraph, definition)
- [ ] (5) Containers: this will be rather messy, and depends a lot on how
subtokenization is solved
- [ ] (3) Concrete constructs: HTML or code (fenced) cannot be “pierced” into by
@@ -132,7 +127,6 @@ cargo doc --document-private-items
#### Parse
-- [ ] (3) Interrupting (html flow complete, definition + code_indented)
- [ ] (5) attention\
test (`character_reference`, `hard_break_escape`, `hard_break_trailing`,
`heading_atx`, `heading_setext`, `html_flow`, `thematic_break`)\
@@ -274,3 +268,6 @@ important.
- [x] (1) Parse initial and final space_or_tab of paragraphs (in string, text)
- [x] (1) Refactor to clean and document `space_or_tab`
- [x] (1) Refactor to clean and document `edit_map`
+- [x] (8) Make paragraphs fast by merging them at the end, not checking whether
+ things interrupt them each line
+- [x] (3) Add support for interrupting (or not)
diff --git a/src/construct/code_fenced.rs b/src/construct/code_fenced.rs
index d19cad0..f2d243a 100644
--- a/src/construct/code_fenced.rs
+++ b/src/construct/code_fenced.rs
@@ -179,7 +179,8 @@ struct Info {
pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
tokenizer.enter(TokenType::CodeFenced);
tokenizer.enter(TokenType::CodeFencedFence);
- tokenizer.attempt_opt(space_or_tab(), before_sequence_open)(tokenizer, code)
+ // To do: allow arbitrary when code (indented) is turned off.
+ tokenizer.go(space_or_tab_min_max(0, TAB_SIZE - 1), before_sequence_open)(tokenizer, code)
}
/// Inside the opening fence, after an optional prefix, before a sequence.
@@ -550,5 +551,7 @@ fn content_continue(tokenizer: &mut Tokenizer, code: Code, info: Info) -> StateF
/// ```
fn after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
tokenizer.exit(TokenType::CodeFenced);
+ // Feel free to interrupt.
+ tokenizer.interrupt = false;
(State::Ok, Some(vec![code]))
}
diff --git a/src/construct/code_indented.rs b/src/construct/code_indented.rs
index 99445b9..9bdfd71 100644
--- a/src/construct/code_indented.rs
+++ b/src/construct/code_indented.rs
@@ -59,8 +59,13 @@ use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
/// > filled line (that it has a non-whitespace character), because blank lines
/// > are parsed already, so we never run into that.
pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
- tokenizer.enter(TokenType::CodeIndented);
- tokenizer.go(space_or_tab_min_max(TAB_SIZE, TAB_SIZE), at_break)(tokenizer, code)
+ // Do not interrupt paragraphs.
+ if tokenizer.interrupt {
+ (State::Nok, None)
+ } else {
+ tokenizer.enter(TokenType::CodeIndented);
+ tokenizer.go(space_or_tab_min_max(TAB_SIZE, TAB_SIZE), at_break)(tokenizer, code)
+ }
}
/// At a break.
@@ -110,6 +115,8 @@ fn content(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
/// ```
fn after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
tokenizer.exit(TokenType::CodeIndented);
+ // Feel free to interrupt.
+ tokenizer.interrupt = false;
(State::Ok, Some(vec![code]))
}
diff --git a/src/construct/definition.rs b/src/construct/definition.rs
index f05064a..e1afd03 100644
--- a/src/construct/definition.rs
+++ b/src/construct/definition.rs
@@ -107,8 +107,19 @@ use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
/// |[a]: b "c"
/// ```
pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
- tokenizer.enter(TokenType::Definition);
- tokenizer.attempt_opt(space_or_tab(), before)(tokenizer, code)
+ let index = tokenizer.events.len();
+ let definition_before = index > 3
+ && tokenizer.events[index - 1].token_type == TokenType::LineEnding
+ && tokenizer.events[index - 3].token_type == TokenType::Definition;
+
+ // Do not interrupt paragraphs (but do follow definitions).
+ if tokenizer.interrupt && !definition_before {
+ (State::Nok, None)
+ } else {
+ tokenizer.enter(TokenType::Definition);
+ // Note: arbitrary whitespace allowed even if code (indented) is on.
+ tokenizer.attempt_opt(space_or_tab(), before)(tokenizer, code)
+ }
}
/// At the start of a definition, after whitespace.
@@ -218,6 +229,8 @@ fn after_whitespace(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
match code {
Code::None | Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => {
tokenizer.exit(TokenType::Definition);
+ // You’d be interrupting.
+ tokenizer.interrupt = true;
(State::Ok, Some(vec![code]))
}
_ => (State::Nok, None),
diff --git a/src/construct/heading_atx.rs b/src/construct/heading_atx.rs
index 2811894..3ce7052 100644
--- a/src/construct/heading_atx.rs
+++ b/src/construct/heading_atx.rs
@@ -54,8 +54,8 @@
//! [wiki-setext]: https://en.wikipedia.org/wiki/Setext
//! [atx]: http://www.aaronsw.com/2002/atx/
-use super::partial_space_or_tab::space_or_tab;
-use crate::constant::HEADING_ATX_OPENING_FENCE_SIZE_MAX;
+use super::partial_space_or_tab::{space_or_tab, space_or_tab_min_max};
+use crate::constant::{HEADING_ATX_OPENING_FENCE_SIZE_MAX, TAB_SIZE};
use crate::tokenizer::{
Code, ContentType, Event, EventType, State, StateFnResult, TokenType, Tokenizer,
};
@@ -68,7 +68,8 @@ use crate::util::edit_map::EditMap;
/// ```
pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
tokenizer.enter(TokenType::HeadingAtx);
- tokenizer.attempt_opt(space_or_tab(), before)(tokenizer, code)
+ // To do: allow arbitrary when code (indented) is turned off.
+ tokenizer.go(space_or_tab_min_max(0, TAB_SIZE - 1), before)(tokenizer, code)
}
/// Start of a heading (atx), after whitespace.
@@ -127,6 +128,8 @@ fn at_break(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
tokenizer.exit(TokenType::HeadingAtx);
tokenizer.register_resolver("heading_atx".to_string(), Box::new(resolve));
+ // Feel free to interrupt.
+ tokenizer.interrupt = false;
(State::Ok, Some(vec![code]))
}
Code::VirtualSpace | Code::Char('\t' | ' ') => {
diff --git a/src/construct/heading_setext.rs b/src/construct/heading_setext.rs
index 03a2e55..df20aa7 100644
--- a/src/construct/heading_setext.rs
+++ b/src/construct/heading_setext.rs
@@ -58,10 +58,9 @@
//! [atx]: http://www.aaronsw.com/2002/atx/
use crate::constant::TAB_SIZE;
-use crate::construct::partial_space_or_tab::{space_or_tab, space_or_tab_with_options, Options};
-use crate::subtokenize::link;
-use crate::tokenizer::{Code, ContentType, State, StateFnResult, TokenType, Tokenizer};
-use crate::util::span::from_exit_event;
+use crate::construct::partial_space_or_tab::{space_or_tab, space_or_tab_min_max};
+use crate::tokenizer::{Code, Event, EventType, State, StateFnResult, TokenType, Tokenizer};
+use crate::util::edit_map::EditMap;
/// Kind of underline.
#[derive(Debug, Clone, PartialEq)]
@@ -109,150 +108,23 @@ impl Kind {
}
}
-/// Start of a heading (setext).
-///
-/// ```markdown
-/// |alpha
-/// ==
-/// ```
-pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
- tokenizer.enter(TokenType::HeadingSetext);
- tokenizer.attempt_opt(space_or_tab(), before)(tokenizer, code)
-}
-
-/// Start of a heading (setext), after whitespace.
-///
-/// ```markdown
-/// |alpha
-/// ==
-/// ```
-fn before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
- match code {
- Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
- unreachable!("expected non-eol/eof");
- }
- _ => {
- tokenizer.enter(TokenType::HeadingSetextText);
- tokenizer.enter_with_content(TokenType::Data, Some(ContentType::Text));
- text_inside(tokenizer, code)
- }
- }
-}
-
-/// Inside text.
-///
-/// ```markdown
-/// al|pha
-/// bra|vo
-/// ==
-/// ```
-fn text_inside(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
- match code {
- Code::None => (State::Nok, None),
- Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
- tokenizer.exit(TokenType::Data);
- tokenizer.exit(TokenType::HeadingSetextText);
- tokenizer.attempt(underline_before, |ok| {
- Box::new(if ok { after } else { text_continue })
- })(tokenizer, code)
- }
- _ => {
- tokenizer.consume(code);
- (State::Fn(Box::new(text_inside)), None)
- }
- }
-}
-
-/// At a line ending, not at an underline.
-///
-/// ```markdown
-/// alpha
-/// |bravo
-/// ==
-/// ```
-fn text_continue(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
- // Needed to connect the text.
- tokenizer.enter(TokenType::HeadingSetextText);
- tokenizer.events.pop();
- tokenizer.events.pop();
-
- match code {
- Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
- tokenizer.enter_with_content(TokenType::LineEnding, Some(ContentType::Text));
- let index = tokenizer.events.len() - 1;
- link(&mut tokenizer.events, index);
- tokenizer.consume(code);
- tokenizer.exit(TokenType::LineEnding);
-
- (
- State::Fn(Box::new(tokenizer.attempt_opt(
- space_or_tab_with_options(Options {
- kind: TokenType::SpaceOrTab,
- min: 1,
- max: usize::MAX,
- content_type: Some(ContentType::Text),
- connect: true,
- }),
- text_line_start,
- ))),
- None,
- )
- }
- _ => unreachable!("expected eol"),
- }
-}
-
-/// At a line ending after whitespace, not at an underline.
-///
-/// ```markdown
-/// alpha
-/// |bravo
-/// ==
-/// ```
-fn text_line_start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
- match code {
- // Blank lines not allowed.
- Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => (State::Nok, None),
- _ => {
- tokenizer.enter_with_content(TokenType::Data, Some(ContentType::Text));
- let index = tokenizer.events.len() - 1;
- link(&mut tokenizer.events, index);
- text_inside(tokenizer, code)
- }
- }
-}
-
-/// After a heading (setext).
-///
-/// ```markdown
-/// alpha
-/// ==|
-/// ```
-fn after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
- tokenizer.exit(TokenType::HeadingSetext);
- (State::Ok, Some(vec![code]))
-}
-
/// At a line ending, presumably an underline.
///
/// ```markdown
/// alpha|
/// ==
/// ```
-fn underline_before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
- match code {
- Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
- tokenizer.enter(TokenType::LineEnding);
- tokenizer.consume(code);
- tokenizer.exit(TokenType::LineEnding);
- (
- State::Fn(Box::new(
- tokenizer.attempt_opt(space_or_tab(), underline_sequence_start),
- )),
- None,
- )
- }
- _ => unreachable!("expected eol"),
+pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ let index = tokenizer.events.len();
+ let paragraph_before = index > 3
+ && tokenizer.events[index - 1].token_type == TokenType::LineEnding
+ && tokenizer.events[index - 3].token_type == TokenType::Paragraph;
+
+ if paragraph_before {
+ // To do: allow arbitrary when code (indented) is turned off.
+ tokenizer.go(space_or_tab_min_max(0, TAB_SIZE - 1), before)(tokenizer, code)
+ } else {
+ (State::Nok, None)
}
}
@@ -262,26 +134,11 @@ fn underline_before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
/// alpha
/// |==
/// ```
-fn underline_sequence_start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
- let tail = tokenizer.events.last();
- let mut prefix = 0;
-
- if let Some(event) = tail {
- if event.token_type == TokenType::SpaceOrTab {
- let span = from_exit_event(&tokenizer.events, tokenizer.events.len() - 1);
- prefix = span.end_index - span.start_index;
- }
- }
-
- // To do: 4+ should be okay if code (indented) is turned off!
- if prefix >= TAB_SIZE {
- return (State::Nok, None);
- }
-
+fn before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
match code {
Code::Char(char) if char == '-' || char == '=' => {
tokenizer.enter(TokenType::HeadingSetextUnderline);
- underline_sequence_inside(tokenizer, code, Kind::from_char(char))
+ inside(tokenizer, code, Kind::from_char(char))
}
_ => (State::Nok, None),
}
@@ -293,16 +150,13 @@ fn underline_sequence_start(tokenizer: &mut Tokenizer, code: Code) -> StateFnRes
/// alpha
/// =|=
/// ```
-fn underline_sequence_inside(tokenizer: &mut Tokenizer, code: Code, kind: Kind) -> StateFnResult {
+fn inside(tokenizer: &mut Tokenizer, code: Code, kind: Kind) -> StateFnResult {
match code {
Code::Char(char) if char == kind.as_char() => {
tokenizer.consume(code);
- (
- State::Fn(Box::new(move |t, c| underline_sequence_inside(t, c, kind))),
- None,
- )
+ (State::Fn(Box::new(move |t, c| inside(t, c, kind))), None)
}
- _ => tokenizer.attempt_opt(space_or_tab(), underline_after)(tokenizer, code),
+ _ => tokenizer.attempt_opt(space_or_tab(), after)(tokenizer, code),
}
}
@@ -312,12 +166,59 @@ fn underline_sequence_inside(tokenizer: &mut Tokenizer, code: Code, kind: Kind)
/// alpha
/// ==|
/// ```
-fn underline_after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+fn after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
match code {
Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
tokenizer.exit(TokenType::HeadingSetextUnderline);
+ // Feel free to interrupt.
+ tokenizer.interrupt = false;
+ tokenizer.register_resolver("heading_setext".to_string(), Box::new(resolve));
(State::Ok, Some(vec![code]))
}
_ => (State::Nok, None),
}
}
+
+/// Turn each paragraph that is followed by a setext underline into a setext heading.
+pub fn resolve(tokenizer: &mut Tokenizer) -> Vec<Event> {
+ let mut edit_map = EditMap::new();
+ let mut index = 0;
+ let mut paragraph_enter: Option<usize> = None;
+ let mut paragraph_exit: Option<usize> = None;
+
+ while index < tokenizer.events.len() {
+ let event = &tokenizer.events[index];
+
+ // Find paragraphs.
+ if event.event_type == EventType::Enter {
+ if event.token_type == TokenType::Paragraph {
+ paragraph_enter = Some(index);
+ }
+ } else if event.token_type == TokenType::Paragraph {
+ paragraph_exit = Some(index);
+ }
+ // We know this is preceded by a paragraph.
+ // Otherwise we don’t parse.
+ else if event.token_type == TokenType::HeadingSetextUnderline {
+ let enter = paragraph_enter.take().unwrap();
+ let exit = paragraph_exit.take().unwrap();
+
+ // Change types of Enter:Paragraph, Exit:Paragraph.
+ tokenizer.events[enter].token_type = TokenType::HeadingSetextText;
+ tokenizer.events[exit].token_type = TokenType::HeadingSetextText;
+
+ // Add Enter:HeadingSetext, Exit:HeadingSetext.
+ let mut heading_enter = tokenizer.events[enter].clone();
+ heading_enter.token_type = TokenType::HeadingSetext;
+ let mut heading_exit = tokenizer.events[index].clone();
+ heading_exit.token_type = TokenType::HeadingSetext;
+
+ edit_map.add(enter, 0, vec![heading_enter]);
+ edit_map.add(index + 1, 0, vec![heading_exit]);
+ }
+
+ index += 1;
+ }
+
+ edit_map.consume(&mut tokenizer.events)
+}
diff --git a/src/construct/html_flow.rs b/src/construct/html_flow.rs
index d0e0558..a1bddad 100644
--- a/src/construct/html_flow.rs
+++ b/src/construct/html_flow.rs
@@ -98,8 +98,10 @@
//! [html_block_names]: crate::constant::HTML_BLOCK_NAMES
//! [html-parsing]: https://html.spec.whatwg.org/multipage/parsing.html#parsing
-use crate::constant::{HTML_BLOCK_NAMES, HTML_RAW_NAMES, HTML_RAW_SIZE_MAX};
-use crate::construct::{blank_line::start as blank_line, partial_space_or_tab::space_or_tab};
+use crate::constant::{HTML_BLOCK_NAMES, HTML_RAW_NAMES, HTML_RAW_SIZE_MAX, TAB_SIZE};
+use crate::construct::{
+ blank_line::start as blank_line, partial_space_or_tab::space_or_tab_min_max,
+};
use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
/// Kind of HTML (flow).
@@ -191,7 +193,8 @@ struct Info {
pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
tokenizer.enter(TokenType::HtmlFlow);
tokenizer.enter(TokenType::HtmlFlowData);
- tokenizer.attempt_opt(space_or_tab(), before)(tokenizer, code)
+ // To do: allow arbitrary when code (indented) is turned off.
+ tokenizer.go(space_or_tab_min_max(0, TAB_SIZE - 1), before)(tokenizer, code)
}
/// After optional whitespace, before `<`.
@@ -400,8 +403,10 @@ fn tag_name(tokenizer: &mut Tokenizer, code: Code, mut info: Info) -> StateFnRes
} else {
info.kind = Kind::Complete;
- // To do: do not support complete HTML when interrupting.
- if info.start_tag {
+ // Do not support complete HTML when interrupting.
+ if tokenizer.interrupt {
+ (State::Nok, None)
+ } else if info.start_tag {
complete_attribute_name_before(tokenizer, code, info)
} else {
complete_closing_tag_after(tokenizer, code, info)
@@ -784,6 +789,8 @@ fn html_continue_start(tokenizer: &mut Tokenizer, code: Code, info: Info) -> Sta
match code {
Code::None => {
tokenizer.exit(TokenType::HtmlFlow);
+ // Feel free to interrupt.
+ tokenizer.interrupt = false;
(State::Ok, Some(vec![code]))
}
// To do: do not allow lazy lines.
@@ -949,6 +956,8 @@ fn continuation_close(tokenizer: &mut Tokenizer, code: Code, info: Info) -> Stat
Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
tokenizer.exit(TokenType::HtmlFlowData);
tokenizer.exit(TokenType::HtmlFlow);
+ // Feel free to interrupt.
+ tokenizer.interrupt = false;
(State::Ok, Some(vec![code]))
}
_ => {
diff --git a/src/construct/paragraph.rs b/src/construct/paragraph.rs
index fea7052..ae2f4de 100644
--- a/src/construct/paragraph.rs
+++ b/src/construct/paragraph.rs
@@ -32,14 +32,10 @@
//! [code_text]: crate::construct::code_text
//! [html]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-p-element
-use crate::constant::TAB_SIZE;
-use crate::construct::{
- blank_line::start as blank_line, code_fenced::start as code_fenced,
- heading_atx::start as heading_atx, html_flow::start as html_flow,
- partial_space_or_tab::space_or_tab_min_max, thematic_break::start as thematic_break,
+use crate::tokenizer::{
+ Code, ContentType, Event, EventType, State, StateFnResult, TokenType, Tokenizer,
};
-use crate::subtokenize::link;
-use crate::tokenizer::{Code, ContentType, State, StateFnResult, TokenType, Tokenizer};
+use crate::util::edit_map::EditMap;
/// Before a paragraph.
///
@@ -66,11 +62,14 @@ pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
/// ```
fn inside(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
match code {
- Code::None => end(tokenizer, code),
- Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => tokenizer
- .check(interrupt, |ok| {
- Box::new(if ok { at_line_ending } else { end })
- })(tokenizer, code),
+ Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+ tokenizer.exit(TokenType::Data);
+ tokenizer.exit(TokenType::Paragraph);
+ tokenizer.register_resolver_before("paragraph".to_string(), Box::new(resolve));
+ // You’d be interrupting.
+ tokenizer.interrupt = true;
+ (State::Ok, Some(vec![code]))
+ }
_ => {
tokenizer.consume(code);
(State::Fn(Box::new(inside)), None)
@@ -78,90 +77,55 @@ fn inside(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
}
}
-/// At a line ending, not interrupting.
-///
-/// ```markdown
-/// alpha|
-/// bravo.
-/// ```
-fn at_line_ending(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
- tokenizer.consume(code);
- tokenizer.exit(TokenType::Data);
- tokenizer.enter_with_content(TokenType::Data, Some(ContentType::Text));
- let index = tokenizer.events.len() - 1;
- link(&mut tokenizer.events, index);
- (State::Fn(Box::new(inside)), None)
-}
+/// Merge “`Paragraph`”s, which currently span a single line, into actual
+/// `Paragraph`s that span multiple lines.
+pub fn resolve(tokenizer: &mut Tokenizer) -> Vec<Event> {
+ let mut edit_map = EditMap::new();
+ let len = tokenizer.events.len();
+ let mut index = 0;
-/// At a line ending, done.
-///
-/// ```markdown
-/// alpha|
-/// ***
-/// ```
-fn end(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
- tokenizer.exit(TokenType::Data);
- tokenizer.exit(TokenType::Paragraph);
- (State::Ok, Some(vec![code]))
-}
+ while index < len {
+ let event = &tokenizer.events[index];
-/// Before a potential interruption.
-///
-/// ```markdown
-/// alpha|
-/// ***
-/// ```
-fn interrupt(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
- match code {
- Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
- tokenizer.enter(TokenType::LineEnding);
- tokenizer.consume(code);
- tokenizer.exit(TokenType::LineEnding);
- (State::Fn(Box::new(interrupt_start)), None)
- }
- _ => unreachable!("expected eol"),
- }
-}
+ if event.event_type == EventType::Enter && event.token_type == TokenType::Paragraph {
+ // Exit:Paragraph
+ let mut exit_index = index + 3;
+ // Enter:Paragraph
+ let mut enter_next_index = exit_index + 3;
-/// After a line ending.
-///
-/// ```markdown
-/// alpha
-/// |~~~js
-/// ~~~
-/// ```
-fn interrupt_start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
- // To do: If code is disabled, indented lines are allowed to interrupt.
- tokenizer.attempt(space_or_tab_min_max(TAB_SIZE, TAB_SIZE), |ok| {
- Box::new(if ok { interrupt_indent } else { interrupt_cont })
- })(tokenizer, code)
-}
+ // To do: assert that there is a `LineEnding` between them?
+ while enter_next_index < len
+ && tokenizer.events[enter_next_index].token_type == TokenType::Paragraph
+ {
+ // Remove Exit:Paragraph, Enter:LineEnding, Exit:LineEnding, Enter:Paragraph.
+ edit_map.add(exit_index, 4, vec![]);
+ println!("rm {:?} {:?}", exit_index, exit_index + 4);
-/// At an indent.
-///
-/// ```markdown
-/// alpha
-/// |
-/// ```
-fn interrupt_indent(_tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
- (State::Ok, Some(vec![code]))
-}
+ // Add Exit:LineEnding position info to Exit:Data.
+ let line_ending_exit = &tokenizer.events[enter_next_index - 1];
+ let line_ending_point = line_ending_exit.point.clone();
+ let line_ending_index = line_ending_exit.index;
+ let data_exit = &mut tokenizer.events[exit_index - 1];
+ data_exit.point = line_ending_point;
+ data_exit.index = line_ending_index;
-/// Not at an indented line.
-///
-/// ```markdown
-/// alpha
-/// |<div>
-/// ```
-fn interrupt_cont(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
- tokenizer.attempt_n(
- vec![
- Box::new(blank_line),
- Box::new(code_fenced),
- Box::new(html_flow),
- Box::new(heading_atx),
- Box::new(thematic_break),
- ],
- |ok| Box::new(move |_t, code| (if ok { State::Nok } else { State::Ok }, Some(vec![code]))),
- )(tokenizer, code)
+ // Link Enter:Data on the previous line to Enter:Data on this line.
+ let data_enter_prev = &mut tokenizer.events[exit_index - 2];
+ data_enter_prev.next = Some(enter_next_index + 1);
+ let data_enter_next = &mut tokenizer.events[enter_next_index + 1];
+ data_enter_next.previous = Some(exit_index - 2);
+
+ // Potential next start.
+ exit_index = enter_next_index + 3;
+ enter_next_index = exit_index + 3;
+ }
+
+ // Move to `Exit:Paragraph`.
+ index = exit_index;
+ }
+
+ index += 1;
+ }
+
+ edit_map.consume(&mut tokenizer.events)
}
diff --git a/src/construct/thematic_break.rs b/src/construct/thematic_break.rs
index 9978ee0..8d29157 100644
--- a/src/construct/thematic_break.rs
+++ b/src/construct/thematic_break.rs
@@ -49,8 +49,8 @@
//!
//! <!-- To do: link `lists` -->
-use super::partial_space_or_tab::space_or_tab;
-use crate::constant::THEMATIC_BREAK_MARKER_COUNT_MIN;
+use super::partial_space_or_tab::{space_or_tab, space_or_tab_min_max};
+use crate::constant::{TAB_SIZE, THEMATIC_BREAK_MARKER_COUNT_MIN};
use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
/// Type of thematic break.
@@ -122,7 +122,8 @@ struct Info {
/// ```
pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
tokenizer.enter(TokenType::ThematicBreak);
- tokenizer.attempt_opt(space_or_tab(), before)(tokenizer, code)
+ // To do: allow arbitrary when code (indented) is turned off.
+ tokenizer.go(space_or_tab_min_max(0, TAB_SIZE - 1), before)(tokenizer, code)
}
/// Start of a thematic break, after whitespace.
@@ -157,6 +158,8 @@ fn at_break(tokenizer: &mut Tokenizer, code: Code, info: Info) -> StateFnResult
if info.size >= THEMATIC_BREAK_MARKER_COUNT_MIN =>
{
tokenizer.exit(TokenType::ThematicBreak);
+ // Feel free to interrupt.
+ tokenizer.interrupt = false;
(State::Ok, Some(vec![code]))
}
Code::Char(char) if char == info.kind.as_char() => {
diff --git a/src/content/flow.rs b/src/content/flow.rs
index 0d3ede0..3ff948d 100644
--- a/src/content/flow.rs
+++ b/src/content/flow.rs
@@ -92,26 +92,6 @@ fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
}
}
-/// After a blank line.
-///
-/// Move to `start` afterwards.
-///
-/// ```markdown
-/// ␠␠|
-/// ```
-fn blank_line_after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
- match code {
- Code::None => (State::Ok, None),
- Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
- tokenizer.enter(TokenType::BlankLineEnding);
- tokenizer.consume(code);
- tokenizer.exit(TokenType::BlankLineEnding);
- (State::Fn(Box::new(start)), None)
- }
- _ => unreachable!("expected eol/eof after blank line `{:?}`", code),
- }
-}
-
/// Before flow (initial).
///
/// “Initial” flow means unprefixed flow, so right at the start of a line.
@@ -133,16 +113,38 @@ fn initial_before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
Box::new(code_fenced),
Box::new(html_flow),
Box::new(heading_atx),
+ Box::new(heading_setext),
Box::new(thematic_break),
Box::new(definition),
- Box::new(heading_setext),
],
|ok| Box::new(if ok { after } else { before_paragraph }),
)(tokenizer, code),
}
}
-/// After a flow construct.
+/// After a blank line.
+///
+/// Move to `start` afterwards.
+///
+/// ```markdown
+/// ␠␠|
+/// ```
+fn blank_line_after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ Code::None => (State::Ok, None),
+ Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+ tokenizer.enter(TokenType::BlankLineEnding);
+ tokenizer.consume(code);
+ tokenizer.exit(TokenType::BlankLineEnding);
+ // Feel free to interrupt.
+ tokenizer.interrupt = false;
+ (State::Fn(Box::new(start)), None)
+ }
+ _ => unreachable!("expected eol/eof after blank line `{:?}`", code),
+ }
+}
+
+/// After something.
///
/// ```markdown
/// ## alpha|
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 817c1de..b70e706 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -1760,6 +1760,8 @@ pub struct Tokenizer<'a> {
/// To do.
pub label_start_list_loose: Vec<LabelStart>,
/// To do.
+ pub interrupt: bool,
+ /// To do.
pub media_list: Vec<Media>,
/// To do.
resolvers: Vec<Box<Resolver>>,
@@ -1783,6 +1785,7 @@ impl<'a> Tokenizer<'a> {
label_start_stack: vec![],
label_start_list_loose: vec![],
media_list: vec![],
+ interrupt: false,
resolvers: vec![],
resolver_ids: vec![],
}
diff --git a/tests/autolink.rs b/tests/autolink.rs
index f0486ef..9c28834 100644
--- a/tests/autolink.rs
+++ b/tests/autolink.rs
@@ -252,7 +252,7 @@ fn autolink() {
"should not support a dash before a dot in email autolinks"
);
- // To do: extensions.
+ // To do: turning things off.
// assert_eq!(
// micromark("<a@b.co>", {extensions: [{disable: {null: ["autolink"]}}]}),
// "<p>&lt;a@b.co&gt;</p>",
diff --git a/tests/character_escape.rs b/tests/character_escape.rs
index 26e9336..6200014 100644
--- a/tests/character_escape.rs
+++ b/tests/character_escape.rs
@@ -79,7 +79,7 @@ fn character_escape() {
"should escape in fenced code info"
);
- // // To do: extensions
+ // // To do: turning things off.
// assert_eq!(
// micromark("\\> a", {extensions: [{disable: {null: ["characterEscape"]}}]}),
// "<p>\\&gt; a</p>",
diff --git a/tests/character_reference.rs b/tests/character_reference.rs
index 3951e00..c87657e 100644
--- a/tests/character_reference.rs
+++ b/tests/character_reference.rs
@@ -190,7 +190,7 @@ fn character_reference() {
"should not support the other characters inside a hexademical"
);
- // To do: extensions.
+ // To do: turning things off.
// assert_eq!(
// micromark("&amp;", {
// extensions: [{disable: {null: ["characterReferences"]}}]
diff --git a/tests/code_fenced.rs b/tests/code_fenced.rs
index 0e19637..b7d8307 100644
--- a/tests/code_fenced.rs
+++ b/tests/code_fenced.rs
@@ -252,7 +252,7 @@ fn code_fenced() {
// "should not support lazyness (3)"
// );
- // To do: extensions.
+ // To do: turning things off.
// assert_eq!(
// micromark("```", {extensions: [{disable: {null: ["codeFenced"]}}]}),
// "<p>```</p>",
diff --git a/tests/code_indented.rs b/tests/code_indented.rs
index 0190497..773e3d4 100644
--- a/tests/code_indented.rs
+++ b/tests/code_indented.rs
@@ -119,7 +119,7 @@ fn code_indented() {
// "should not support lazyness (7)"
// );
- // To do: extensions.
+ // To do: turning things off.
// assert_eq!(
// micromark(" a", {extensions: [{disable: {null: ["codeIndented"]}}]}),
// "<p>a</p>",
diff --git a/tests/definition.rs b/tests/definition.rs
index ba4e384..df99f74 100644
--- a/tests/definition.rs
+++ b/tests/definition.rs
@@ -375,12 +375,11 @@ fn definition() {
"should not support a final (unbalanced) right paren in a raw destination “before” a title"
);
- // To do: do not let code (indented) interrupt definitions.
- // assert_eq!(
- // micromark(" [a]: b \"c\"\n [d]: e\n [f]: g \"h\"\n [i]: j\n\t[k]: l (m)\n\t n [k] o"),
- // "<p>n <a href=\"l\" title=\"m\">k</a> o</p>",
- // "should support subsequent indented definitions"
- // );
+ assert_eq!(
+ micromark(" [a]: b \"c\"\n [d]: e\n [f]: g \"h\"\n [i]: j\n\t[k]: l (m)\n\t n [k] o"),
+ "<p>n <a href=\"l\" title=\"m\">k</a> o</p>",
+ "should support subsequent indented definitions"
+ );
assert_eq!(
micromark("[a\n b]: c\n\n[a\n b]"),
@@ -406,7 +405,7 @@ fn definition() {
"should not support definitions w/ text + a closing paren as a raw destination"
);
- // To do: support turning off things.
+ // To do: turning things off.
// assert_eq!(
// micromark("[foo]: /url \"title\"", {
// extensions: [{disable: {null: ["definition"]}}]
diff --git a/tests/hard_break_escape.rs b/tests/hard_break_escape.rs
index c4f6f1d..740e706 100644
--- a/tests/hard_break_escape.rs
+++ b/tests/hard_break_escape.rs
@@ -40,7 +40,7 @@ fn hard_break_escape() {
"should not support escape hard breaks at the end of a heading"
);
- // // To do: turning off things.
+ // // To do: turning things off.
// assert_eq!(
// micromark("a\\\nb", {extensions: [{disable: {null: ["hardBreakEscape"]}}]}),
// "<p>a\\\nb</p>",
diff --git a/tests/hard_break_trailing.rs b/tests/hard_break_trailing.rs
index 0dbbbdb..2a4b534 100644
--- a/tests/hard_break_trailing.rs
+++ b/tests/hard_break_trailing.rs
@@ -118,7 +118,7 @@ fn hard_break_trailing() {
// "should support a mixed line suffix after a span (3)"
// );
- // // To do: turning off things.
+ // // To do: turning things off.
// assert_eq!(
// micromark("a \nb", {extensions: [{disable: {null: ["hardBreakTrailing"]}}]}),
// "<p>a\nb</p>",
diff --git a/tests/heading_atx.rs b/tests/heading_atx.rs
index 2548056..ef5846a 100644
--- a/tests/heading_atx.rs
+++ b/tests/heading_atx.rs
@@ -196,7 +196,7 @@ fn heading_atx() {
// "should not support lazyness (2)"
// );
- // Extensions:
+ // To do: turning things off.
// assert_eq!(
// micromark("# a", {extensions: [{disable: {null: ["headingAtx"]}}]}),
// "<p># a</p>",
diff --git a/tests/html_flow.rs b/tests/html_flow.rs
index 455c5b8..3b69671 100644
--- a/tests/html_flow.rs
+++ b/tests/html_flow.rs
@@ -21,7 +21,7 @@ fn html_flow() {
"should support a heading w/ rank 1"
);
- // To do: extensions.
+ // To do: turning things off.
// assert_eq!(
// micromark_with_options("<x>", {extensions: [{disable: {null: ["htmlFlow"]}}]}),
// "<p>&lt;x&gt;</p>",
@@ -789,12 +789,11 @@ fn html_flow_7_complete() {
"should support interleaving w/ whitespace-only blank lines"
);
- // To do: disallow html (complete) from interrupting.
- // assert_eq!(
- // micromark_with_options("Foo\n<a href=\"bar\">\nbaz", DANGER),
- // "<p>Foo\n<a href=\"bar\">\nbaz</p>",
- // "should not support interrupting paragraphs w/ complete tags"
- // );
+ assert_eq!(
+ micromark_with_options("Foo\n<a href=\"bar\">\nbaz", DANGER),
+ "<p>Foo\n<a href=\"bar\">\nbaz</p>",
+ "should not support interrupting paragraphs w/ complete tags"
+ );
assert_eq!(
micromark_with_options("<x", DANGER),
diff --git a/tests/html_text.rs b/tests/html_text.rs
index e70a4da..0288af7 100644
--- a/tests/html_text.rs
+++ b/tests/html_text.rs
@@ -418,7 +418,7 @@ micromark_with_options("<x> a", DANGER),
"should support an EOL in an instruction"
);
- // To do: extensions.
+ // To do: turning things off.
// assert_eq!(
// micromark_with_options("a <x>", {extensions: [{disable: {null: ["htmlText"]}}]}),
// "<p>a &lt;x&gt;</p>",
diff --git a/tests/thematic_break.rs b/tests/thematic_break.rs
index e71ae22..06b1193 100644
--- a/tests/thematic_break.rs
+++ b/tests/thematic_break.rs
@@ -169,7 +169,7 @@ fn thematic_break() {
// "should not support lazyness (2)"
// );
- // To do: extensions.
+ // To do: turning things off.
// assert_eq!(
// micromark("***", {extensions: [{disable: {null: ["thematicBreak"]}}]}),
// "<p>***</p>",