diff options
author | Titus Wormer <tituswormer@gmail.com> | 2022-09-14 16:21:42 +0200 |
---|---|---|
committer | Titus Wormer <tituswormer@gmail.com> | 2022-09-14 16:26:24 +0200 |
commit | 74d2688aa329f0a41c2a92034c3454ed9299e71a (patch) | |
tree | 9ec8fdc6e40ff7cd40a14408afcc47716990134e /src/construct | |
parent | 65d4b46c2a3bdecb0493e484473d2de3d124f839 (diff) | |
download | markdown-rs-74d2688aa329f0a41c2a92034c3454ed9299e71a.tar.gz markdown-rs-74d2688aa329f0a41c2a92034c3454ed9299e71a.tar.bz2 markdown-rs-74d2688aa329f0a41c2a92034c3454ed9299e71a.zip |
Fix to prefer flow over definitions, setext headings
An undocumented part of CommonMark is how to deal with things in definition
labels or definition titles (which both can span multiple lines).
Can flow (or containers?) interrupt them?
They can, according to the `cmark` reference parser, so that behavior is implemented here.
This adds a new `Content` content type, which houses zero or more definitions,
followed by zero or one paragraph.
Content can be followed by a setext heading underline. That underline turns
into a setext heading when the content ends in a paragraph; otherwise it turns
into the start of the following paragraph when it is followed by content that
starts with a paragraph, or else into a stray paragraph.
Diffstat (limited to '')
-rw-r--r-- | src/construct/attention.rs | 6 | ||||
-rw-r--r-- | src/construct/content.rs | 188 | ||||
-rw-r--r-- | src/construct/definition.rs | 26 | ||||
-rw-r--r-- | src/construct/document.rs | 5 | ||||
-rw-r--r-- | src/construct/flow.rs | 33 | ||||
-rw-r--r-- | src/construct/gfm_table.rs | 61 | ||||
-rw-r--r-- | src/construct/heading_atx.rs | 7 | ||||
-rw-r--r-- | src/construct/heading_setext.rs | 137 | ||||
-rw-r--r-- | src/construct/label_end.rs | 5 | ||||
-rw-r--r-- | src/construct/list_item.rs | 7 | ||||
-rw-r--r-- | src/construct/mod.rs | 4 | ||||
-rw-r--r-- | src/construct/paragraph.rs | 149 | ||||
-rw-r--r-- | src/construct/partial_data.rs | 7 | ||||
-rw-r--r-- | src/construct/string.rs | 6 | ||||
-rw-r--r-- | src/construct/text.rs | 6 |
15 files changed, 436 insertions, 211 deletions
diff --git a/src/construct/attention.rs b/src/construct/attention.rs index 4a208df..4d58610 100644 --- a/src/construct/attention.rs +++ b/src/construct/attention.rs @@ -79,6 +79,7 @@ use crate::event::{Event, Kind, Name, Point}; use crate::resolve::Name as ResolveName; use crate::state::{Name as StateName, State}; +use crate::subtokenize::Subresult; use crate::tokenizer::Tokenizer; use crate::util::{ char::{ @@ -87,6 +88,7 @@ use crate::util::{ }, slice::Slice, }; +use alloc::string::String; use alloc::{vec, vec::Vec}; /// Attentention sequence that we can take markers from. @@ -150,7 +152,7 @@ pub fn inside(tokenizer: &mut Tokenizer) -> State { } /// Resolve sequences. -pub fn resolve(tokenizer: &mut Tokenizer) { +pub fn resolve(tokenizer: &mut Tokenizer) -> Result<Option<Subresult>, String> { // Find all sequences, gather info about them. let mut sequences = get_sequences(tokenizer); @@ -221,6 +223,8 @@ pub fn resolve(tokenizer: &mut Tokenizer) { } tokenizer.map.consume(&mut tokenizer.events); + + Ok(None) } /// Get sequences. diff --git a/src/construct/content.rs b/src/construct/content.rs new file mode 100644 index 0000000..6c10cea --- /dev/null +++ b/src/construct/content.rs @@ -0,0 +1,188 @@ +//! Content occurs in the [flow][] content type. +//! +//! Content contains zero or more [definition][definition]s, followed by zero +//! or one [paragraph][]. +//! +//! The constructs found in flow are: +//! +//! * [Definition][crate::construct::definition] +//! * [Paragraph][crate::construct::paragraph] +//! +//! ## Tokens +//! +//! * [`Content`][Name::Content] +//! +//! > 👉 **Note**: while parsing, [`Content`][Name::Content] +//! > is used, which is later compiled away. +//! +//! ## References +//! +//! * [`content.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/content.js) +//! +//! [flow]: crate::construct::flow +//! [definition]: crate::construct::definition +//! 
[paragraph]: crate::construct::paragraph + +use crate::event::{Content, Kind, Link, Name}; +use crate::resolve::Name as ResolveName; +use crate::state::{Name as StateName, State}; +use crate::subtokenize::{subtokenize, Subresult}; +use crate::tokenizer::Tokenizer; +use alloc::{string::String, vec}; + +/// Before a content content. +/// +/// ```markdown +/// > | abc +/// ^ +/// ``` +pub fn chunk_start(tokenizer: &mut Tokenizer) -> State { + match tokenizer.current { + None | Some(b'\n') => unreachable!("unexpected eol/eof"), + _ => { + tokenizer.enter_link( + Name::Content, + Link { + previous: None, + next: None, + content: Content::Content, + }, + ); + State::Retry(StateName::ContentChunkInside) + } + } +} + +/// In a content chunk. +/// +/// ```markdown +/// > | abc +/// ^^^ +/// ``` +pub fn chunk_inside(tokenizer: &mut Tokenizer) -> State { + match tokenizer.current { + None | Some(b'\n') => { + tokenizer.exit(Name::Content); + tokenizer.register_resolver_before(ResolveName::Content); + // You’d be interrupting. + tokenizer.interrupt = true; + State::Ok + } + _ => { + tokenizer.consume(); + State::Next(StateName::ContentChunkInside) + } + } +} + +/// Before a definition. +/// +/// ```markdown +/// > | [a]: b +/// ^ +/// ``` +pub fn definition_before(tokenizer: &mut Tokenizer) -> State { + tokenizer.attempt( + State::Next(StateName::ContentDefinitionAfter), + State::Next(StateName::ParagraphStart), + ); + State::Retry(StateName::DefinitionStart) +} + +/// After a definition. 
+/// +/// ```markdown +/// > | [a]: b +/// ^ +/// | c +/// ``` +pub fn definition_after(tokenizer: &mut Tokenizer) -> State { + debug_assert!(matches!(tokenizer.current, None | Some(b'\n'))); + if tokenizer.current.is_none() { + State::Ok + } else { + tokenizer.enter(Name::LineEnding); + tokenizer.consume(); + tokenizer.exit(Name::LineEnding); + State::Next(StateName::ContentDefinitionBefore) + } +} + +/// Merge `Content` chunks, which currently span a single line, into actual +/// `Content`s that span multiple lines. +pub fn resolve(tokenizer: &mut Tokenizer) -> Result<Option<Subresult>, String> { + let mut index = 0; + + while index < tokenizer.events.len() { + let event = &tokenizer.events[index]; + + if event.kind == Kind::Enter && event.name == Name::Content { + // Exit:Content + let mut exit_index = index + 1; + + loop { + let mut enter_index = exit_index + 1; + + if enter_index == tokenizer.events.len() + || tokenizer.events[enter_index].name != Name::LineEnding + { + break; + } + + // Skip past line ending. + enter_index += 2; + + // Skip past prefix. + while enter_index < tokenizer.events.len() { + let event = &tokenizer.events[enter_index]; + + if event.name != Name::SpaceOrTab + && event.name != Name::BlockQuotePrefix + && event.name != Name::BlockQuoteMarker + { + break; + } + + enter_index += 1; + } + + if enter_index == tokenizer.events.len() + || tokenizer.events[enter_index].name != Name::Content + { + break; + } + + // Set Exit:Content point to Exit:LineEnding. + tokenizer.events[exit_index].point = tokenizer.events[exit_index + 2].point.clone(); + // Remove Enter:LineEnding, Exit:LineEnding. + tokenizer.map.add(exit_index + 1, 2, vec![]); + + // Link Enter:Content to Enter:Content on this line and vice versa. + tokenizer.events[exit_index - 1].link.as_mut().unwrap().next = Some(enter_index); + tokenizer.events[enter_index] + .link + .as_mut() + .unwrap() + .previous = Some(exit_index - 1); + + // Potential next start. 
+ exit_index = enter_index + 1; + } + + // Move to `Exit:Content`. + index = exit_index; + } + + index += 1; + } + + tokenizer.map.consume(&mut tokenizer.events); + + let result = subtokenize( + &mut tokenizer.events, + tokenizer.parse_state, + &Some(Content::Content), + )?; + + Ok(Some(result)) +} diff --git a/src/construct/definition.rs b/src/construct/definition.rs index 1071489..8ccfb90 100644 --- a/src/construct/definition.rs +++ b/src/construct/definition.rs @@ -1,4 +1,4 @@ -//! Definition occurs in the [flow] content type. +//! Definition occurs in the [content] content type. //! //! ## Grammar //! @@ -12,8 +12,8 @@ //! ; those parts. //! ``` //! -//! As this construct occurs in flow, like all flow constructs, it must be -//! followed by an eol (line ending) or eof (end of file). +//! This construct must be followed by an eol (line ending) or eof (end of +//! file), like flow constructs. //! //! See [`destination`][destination], [`label`][label], and [`title`][title] //! for grammar, notes, and recommendations on each part. @@ -88,7 +88,7 @@ //! * [`definition.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/definition.js) //! * [*§ 4.7 Link reference definitions* in `CommonMark`](https://spec.commonmark.org/0.30/#link-reference-definitions) //! -//! [flow]: crate::construct::flow +//! [content]: crate::construct::content //! [string]: crate::construct::string //! [character_escape]: crate::construct::character_escape //! 
[character_reference]: crate::construct::character_reference @@ -157,7 +157,10 @@ pub fn before(tokenizer: &mut Tokenizer) -> State { tokenizer.tokenize_state.token_1 = Name::DefinitionLabel; tokenizer.tokenize_state.token_2 = Name::DefinitionLabelMarker; tokenizer.tokenize_state.token_3 = Name::DefinitionLabelString; - tokenizer.attempt(State::Next(StateName::DefinitionLabelAfter), State::Nok); + tokenizer.attempt( + State::Next(StateName::DefinitionLabelAfter), + State::Next(StateName::DefinitionLabelNok), + ); State::Retry(StateName::LabelStart) } _ => State::Nok, @@ -192,6 +195,19 @@ pub fn label_after(tokenizer: &mut Tokenizer) -> State { } } +/// At a non-label +/// +/// ```markdown +/// > | [] +/// ^ +/// ``` +pub fn label_nok(tokenizer: &mut Tokenizer) -> State { + tokenizer.tokenize_state.token_1 = Name::Data; + tokenizer.tokenize_state.token_2 = Name::Data; + tokenizer.tokenize_state.token_3 = Name::Data; + State::Nok +} + /// After marker. /// /// ```markdown diff --git a/src/construct/document.rs b/src/construct/document.rs index 45a961d..82f2ebd 100644 --- a/src/construct/document.rs +++ b/src/construct/document.rs @@ -413,7 +413,7 @@ pub fn flow_end(tokenizer: &mut Tokenizer) -> State { while !document_lazy_continuation_current && stack_index > 0 { stack_index -= 1; let name = &child.stack[stack_index]; - if name == &Name::Paragraph || name == &Name::Definition || name == &Name::GfmTableHead { + if name == &Name::Content || name == &Name::GfmTableHead { document_lazy_continuation_current = true; } } @@ -423,7 +423,7 @@ pub fn flow_end(tokenizer: &mut Tokenizer) -> State { if !document_lazy_continuation_current && !child.events.is_empty() { let before = skip::opt_back(&child.events, child.events.len() - 1, &[Name::LineEnding]); let name = &child.events[before].name; - if name == &Name::Paragraph { + if name == &Name::Content { document_lazy_continuation_current = true; } } @@ -582,6 +582,7 @@ fn resolve(tokenizer: &mut Tokenizer) { &tokenizer.events, 
flow_index, &mut child.events, + (0, 0), ); // Replace the flow data with actual events. diff --git a/src/construct/flow.rs b/src/construct/flow.rs index e97ee63..08e0466 100644 --- a/src/construct/flow.rs +++ b/src/construct/flow.rs @@ -12,7 +12,6 @@ //! //! * [Blank line][crate::construct::blank_line] //! * [Code (indented)][crate::construct::code_indented] -//! * [Definition][crate::construct::definition] //! * [Heading (atx)][crate::construct::heading_atx] //! * [Heading (setext)][crate::construct::heading_setext] //! * [HTML (flow)][crate::construct::html_flow] @@ -40,14 +39,14 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { Some(b'#') => { tokenizer.attempt( State::Next(StateName::FlowAfter), - State::Next(StateName::FlowBeforeParagraph), + State::Next(StateName::FlowBeforeContent), ); State::Retry(StateName::HeadingAtxStart) } Some(b'$' | b'`' | b'~') => { tokenizer.attempt( State::Next(StateName::FlowAfter), - State::Next(StateName::FlowBeforeParagraph), + State::Next(StateName::FlowBeforeContent), ); State::Retry(StateName::RawFlowStart) } @@ -56,7 +55,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { Some(b'*' | b'_') => { tokenizer.attempt( State::Next(StateName::FlowAfter), - State::Next(StateName::FlowBeforeParagraph), + State::Next(StateName::FlowBeforeContent), ); State::Retry(StateName::ThematicBreakStart) } @@ -70,12 +69,12 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { Some(b'{') => { tokenizer.attempt( State::Next(StateName::FlowAfter), - State::Next(StateName::FlowBeforeParagraph), + State::Next(StateName::FlowBeforeContent), ); State::Retry(StateName::MdxExpressionFlowStart) } // Actual parsing: blank line? Indented code? Indented anything? - // Tables, setext heading underlines, definitions, and paragraphs are + // Tables, setext heading underlines, definitions, and Contents are // particularly weird. 
_ => State::Retry(StateName::FlowBlankLineBefore), } @@ -217,34 +216,20 @@ pub fn before_mdx_expression(tokenizer: &mut Tokenizer) -> State { pub fn before_gfm_table(tokenizer: &mut Tokenizer) -> State { tokenizer.attempt( State::Next(StateName::FlowAfter), - State::Next(StateName::FlowBeforeDefinition), + State::Next(StateName::FlowBeforeContent), ); State::Retry(StateName::GfmTableStart) } -/// At definition. -/// -/// ```markdown -/// > | [a]: b -/// ^ -/// ``` -pub fn before_definition(tokenizer: &mut Tokenizer) -> State { - tokenizer.attempt( - State::Next(StateName::FlowAfter), - State::Next(StateName::FlowBeforeParagraph), - ); - State::Retry(StateName::DefinitionStart) -} - -/// At paragraph. +/// At content. /// /// ```markdown /// > | a /// ^ /// ``` -pub fn before_paragraph(tokenizer: &mut Tokenizer) -> State { +pub fn before_content(tokenizer: &mut Tokenizer) -> State { tokenizer.attempt(State::Next(StateName::FlowAfter), State::Nok); - State::Retry(StateName::ParagraphStart) + State::Retry(StateName::ContentChunkStart) } /// After blank line. diff --git a/src/construct/gfm_table.rs b/src/construct/gfm_table.rs index 27fbadf..63772c4 100644 --- a/src/construct/gfm_table.rs +++ b/src/construct/gfm_table.rs @@ -229,9 +229,10 @@ use crate::construct::partial_space_or_tab::{space_or_tab, space_or_tab_min_max} use crate::event::{Content, Event, Kind, Link, Name}; use crate::resolve::Name as ResolveName; use crate::state::{Name as StateName, State}; +use crate::subtokenize::Subresult; use crate::tokenizer::Tokenizer; use crate::util::{constant::TAB_SIZE, skip::opt_back as skip_opt_back}; -use alloc::vec; +use alloc::{string::String, vec}; /// Start of a GFM table. /// @@ -771,15 +772,13 @@ pub fn body_row_escape(tokenizer: &mut Tokenizer) -> State { } /// Resolve GFM table. 
-pub fn resolve(tokenizer: &mut Tokenizer) { +pub fn resolve(tokenizer: &mut Tokenizer) -> Result<Option<Subresult>, String> { let mut index = 0; - // let mut tables = vec![]; let mut in_first_cell_awaiting_pipe = true; let mut in_row = false; let mut in_delimiter_row = false; let mut last_cell = (0, 0, 0, 0); let mut cell = (0, 0, 0, 0); - let mut after_head_awaiting_first_body_row = false; let mut last_table_end = 0; let mut last_table_has_body = false; @@ -800,17 +799,14 @@ pub fn resolve(tokenizer: &mut Tokenizer) { } // Inject table start. - tokenizer.map.add( - index, - 0, - vec![Event { - kind: Kind::Enter, - name: Name::GfmTable, - point: tokenizer.events[index].point.clone(), - link: None, - }], - ); - } else if event.name == Name::GfmTableRow || event.name == Name::GfmTableDelimiterRow { + let enter = Event { + kind: Kind::Enter, + name: Name::GfmTable, + point: tokenizer.events[index].point.clone(), + link: None, + }; + tokenizer.map.add(index, 0, vec![enter]); + } else if matches!(event.name, Name::GfmTableRow | Name::GfmTableDelimiterRow) { in_delimiter_row = event.name == Name::GfmTableDelimiterRow; in_row = true; in_first_cell_awaiting_pipe = true; @@ -821,23 +817,21 @@ pub fn resolve(tokenizer: &mut Tokenizer) { if after_head_awaiting_first_body_row { after_head_awaiting_first_body_row = false; last_table_has_body = true; - tokenizer.map.add( - index, - 0, - vec![Event { - kind: Kind::Enter, - name: Name::GfmTableBody, - point: tokenizer.events[index].point.clone(), - link: None, - }], - ); + let enter = Event { + kind: Kind::Enter, + name: Name::GfmTableBody, + point: tokenizer.events[index].point.clone(), + link: None, + }; + tokenizer.map.add(index, 0, vec![enter]); } } // Cell data. 
else if in_row - && (event.name == Name::Data - || event.name == Name::GfmTableDelimiterMarker - || event.name == Name::GfmTableDelimiterFiller) + && matches!( + event.name, + Name::Data | Name::GfmTableDelimiterMarker | Name::GfmTableDelimiterFiller + ) { in_first_cell_awaiting_pipe = false; @@ -868,7 +862,7 @@ pub fn resolve(tokenizer: &mut Tokenizer) { } else if event.name == Name::GfmTableHead { after_head_awaiting_first_body_row = true; last_table_end = index; - } else if event.name == Name::GfmTableRow || event.name == Name::GfmTableDelimiterRow { + } else if matches!(event.name, Name::GfmTableRow | Name::GfmTableDelimiterRow) { in_row = false; last_table_end = index; if last_cell.1 != 0 { @@ -878,9 +872,10 @@ pub fn resolve(tokenizer: &mut Tokenizer) { flush_cell(tokenizer, cell, in_delimiter_row, Some(index)); } } else if in_row - && (event.name == Name::Data - || event.name == Name::GfmTableDelimiterMarker - || event.name == Name::GfmTableDelimiterFiller) + && (matches!( + event.name, + Name::Data | Name::GfmTableDelimiterMarker | Name::GfmTableDelimiterFiller + )) { cell.3 = index; } @@ -891,6 +886,8 @@ pub fn resolve(tokenizer: &mut Tokenizer) { if last_table_end != 0 { flush_table_end(tokenizer, last_table_end, last_table_has_body); } + + Ok(None) } /// Generate a cell. diff --git a/src/construct/heading_atx.rs b/src/construct/heading_atx.rs index c1090c4..b76e455 100644 --- a/src/construct/heading_atx.rs +++ b/src/construct/heading_atx.rs @@ -66,9 +66,10 @@ use crate::construct::partial_space_or_tab::{space_or_tab, space_or_tab_min_max} use crate::event::{Content, Event, Kind, Link, Name}; use crate::resolve::Name as ResolveName; use crate::state::{Name as StateName, State}; +use crate::subtokenize::Subresult; use crate::tokenizer::Tokenizer; use crate::util::constant::{HEADING_ATX_OPENING_FENCE_SIZE_MAX, TAB_SIZE}; -use alloc::vec; +use alloc::{string::String, vec}; /// Start of a heading (atx). 
/// @@ -222,7 +223,7 @@ pub fn data(tokenizer: &mut Tokenizer) -> State { } /// Resolve heading (atx). -pub fn resolve(tokenizer: &mut Tokenizer) { +pub fn resolve(tokenizer: &mut Tokenizer) -> Result<Option<Subresult>, String> { let mut index = 0; let mut heading_inside = false; let mut data_start: Option<usize> = None; @@ -281,4 +282,6 @@ pub fn resolve(tokenizer: &mut Tokenizer) { index += 1; } + + Ok(None) } diff --git a/src/construct/heading_setext.rs b/src/construct/heading_setext.rs index e9cc759..3a484e1 100644 --- a/src/construct/heading_setext.rs +++ b/src/construct/heading_setext.rs @@ -54,6 +54,7 @@ //! * [`HeadingSetext`][Name::HeadingSetext] //! * [`HeadingSetextText`][Name::HeadingSetextText] //! * [`HeadingSetextUnderline`][Name::HeadingSetextUnderline] +//! * [`HeadingSetextUnderlineSequence`][Name::HeadingSetextUnderlineSequence] //! //! ## References //! @@ -70,12 +71,13 @@ //! [atx]: http://www.aaronsw.com/2002/atx/ use crate::construct::partial_space_or_tab::{space_or_tab, space_or_tab_min_max}; -use crate::event::{Kind, Name}; +use crate::event::{Content, Event, Kind, Link, Name}; use crate::resolve::Name as ResolveName; use crate::state::{Name as StateName, State}; +use crate::subtokenize::Subresult; use crate::tokenizer::Tokenizer; -use crate::util::{constant::TAB_SIZE, skip::opt_back as skip_opt_back}; -use alloc::vec; +use crate::util::{constant::TAB_SIZE, skip}; +use alloc::{string::String, vec}; /// At start of heading (setext) underline. /// @@ -90,14 +92,16 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { && !tokenizer.pierce // Require a paragraph before. 
&& (!tokenizer.events.is_empty() - && tokenizer.events[skip_opt_back( + && tokenizer.events[skip::opt_back( &tokenizer.events, tokenizer.events.len() - 1, &[Name::LineEnding, Name::SpaceOrTab], )] .name - == Name::Paragraph) + == Name::Content) { + tokenizer.enter(Name::HeadingSetextUnderline); + if matches!(tokenizer.current, Some(b'\t' | b' ')) { tokenizer.attempt(State::Next(StateName::HeadingSetextBefore), State::Nok); State::Retry(space_or_tab_min_max( @@ -128,7 +132,7 @@ pub fn before(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { Some(b'-' | b'=') => { tokenizer.tokenize_state.marker = tokenizer.current.unwrap(); - tokenizer.enter(Name::HeadingSetextUnderline); + tokenizer.enter(Name::HeadingSetextUnderlineSequence); State::Retry(StateName::HeadingSetextInside) } _ => State::Nok, @@ -148,7 +152,7 @@ pub fn inside(tokenizer: &mut Tokenizer) -> State { State::Next(StateName::HeadingSetextInside) } else { tokenizer.tokenize_state.marker = 0; - tokenizer.exit(Name::HeadingSetextUnderline); + tokenizer.exit(Name::HeadingSetextUnderlineSequence); if matches!(tokenizer.current, Some(b'\t' | b' ')) { tokenizer.attempt(State::Next(StateName::HeadingSetextAfter), State::Nok); @@ -172,6 +176,7 @@ pub fn after(tokenizer: &mut Tokenizer) -> State { // Feel free to interrupt. tokenizer.interrupt = false; tokenizer.register_resolver(ResolveName::HeadingSetext); + tokenizer.exit(Name::HeadingSetextUnderline); State::Ok } _ => State::Nok, @@ -179,42 +184,102 @@ pub fn after(tokenizer: &mut Tokenizer) -> State { } /// Resolve heading (setext). -pub fn resolve(tokenizer: &mut Tokenizer) { - let mut index = 0; - let mut paragraph_enter = None; - let mut paragraph_exit = None; - - while index < tokenizer.events.len() { - let event = &tokenizer.events[index]; - - // Find paragraphs. 
- if event.kind == Kind::Enter { - if event.name == Name::Paragraph { - paragraph_enter = Some(index); - } - } else if event.name == Name::Paragraph { - paragraph_exit = Some(index); - } - // We know this is preceded by a paragraph. - // Otherwise we don’t parse. - else if event.name == Name::HeadingSetextUnderline { - let enter = paragraph_enter.take().unwrap(); - let exit = paragraph_exit.take().unwrap(); +pub fn resolve(tokenizer: &mut Tokenizer) -> Result<Option<Subresult>, String> { + tokenizer.map.consume(&mut tokenizer.events); + + let mut enter = skip::to(&tokenizer.events, 0, &[Name::HeadingSetextUnderline]); + + while enter < tokenizer.events.len() { + let exit = skip::to( + &tokenizer.events, + enter + 1, + &[Name::HeadingSetextUnderline], + ); + + // Find paragraph before + let paragraph_exit_before = skip::opt_back( + &tokenizer.events, + enter - 1, + &[Name::SpaceOrTab, Name::LineEnding, Name::BlockQuotePrefix], + ); + + // There’s a paragraph before: this is a setext heading. + if tokenizer.events[paragraph_exit_before].name == Name::Paragraph { + let paragraph_enter = skip::to_back( + &tokenizer.events, + paragraph_exit_before - 1, + &[Name::Paragraph], + ); // Change types of Enter:Paragraph, Exit:Paragraph. - tokenizer.events[enter].name = Name::HeadingSetextText; - tokenizer.events[exit].name = Name::HeadingSetextText; + tokenizer.events[paragraph_enter].name = Name::HeadingSetextText; + tokenizer.events[paragraph_exit_before].name = Name::HeadingSetextText; // Add Enter:HeadingSetext, Exit:HeadingSetext. 
- let mut heading_enter = tokenizer.events[enter].clone(); + let mut heading_enter = tokenizer.events[paragraph_enter].clone(); heading_enter.name = Name::HeadingSetext; - let mut heading_exit = tokenizer.events[index].clone(); + tokenizer.map.add(paragraph_enter, 0, vec![heading_enter]); + let mut heading_exit = tokenizer.events[exit].clone(); heading_exit.name = Name::HeadingSetext; - - tokenizer.map.add(enter, 0, vec![heading_enter]); - tokenizer.map.add(index + 1, 0, vec![heading_exit]); + tokenizer.map.add(exit + 1, 0, vec![heading_exit]); + } else { + // There’s a following paragraph, move this underline inside it. + if exit + 3 < tokenizer.events.len() + && tokenizer.events[exit + 1].name == Name::LineEnding + && tokenizer.events[exit + 3].name == Name::Paragraph + { + // Swap type, HeadingSetextUnderline:Enter -> Paragraph:Enter. + tokenizer.events[enter].name = Name::Paragraph; + // Swap type, LineEnding -> Data. + tokenizer.events[exit + 1].name = Name::Data; + tokenizer.events[exit + 2].name = Name::Data; + // Move new data (was line ending) back to include whole line, + // and link data together. + tokenizer.events[exit + 1].point = tokenizer.events[enter].point.clone(); + tokenizer.events[exit + 1].link = Some(Link { + previous: None, + next: Some(exit + 4), + content: Content::Text, + }); + tokenizer.events[exit + 4].link.as_mut().unwrap().previous = Some(exit + 1); + // Remove *including* HeadingSetextUnderline:Exit, until the line ending. + tokenizer.map.add(enter + 1, exit - enter, vec![]); + // Remove old Paragraph:Enter. + tokenizer.map.add(exit + 3, 1, vec![]); + } else { + // Swap type. + tokenizer.events[enter].name = Name::Paragraph; + tokenizer.events[exit].name = Name::Paragraph; + // Replace what’s inside the underline (whitespace, sequence). 
+ tokenizer.map.add( + enter + 1, + exit - enter - 1, + vec![ + Event { + name: Name::Data, + kind: Kind::Enter, + point: tokenizer.events[enter].point.clone(), + link: Some(Link { + previous: None, + next: None, + content: Content::Text, + }), + }, + Event { + name: Name::Data, + kind: Kind::Exit, + point: tokenizer.events[exit].point.clone(), + link: None, + }, + ], + ); + } } - index += 1; + enter = skip::to(&tokenizer.events, exit + 1, &[Name::HeadingSetextUnderline]); } + + tokenizer.map.consume(&mut tokenizer.events); + + Ok(None) } diff --git a/src/construct/label_end.rs b/src/construct/label_end.rs index ce1c295..95b9a27 100644 --- a/src/construct/label_end.rs +++ b/src/construct/label_end.rs @@ -183,6 +183,7 @@ use crate::construct::partial_space_or_tab_eol::space_or_tab_eol; use crate::event::{Event, Kind, Name}; use crate::resolve::Name as ResolveName; use crate::state::{Name as StateName, State}; +use crate::subtokenize::Subresult; use crate::tokenizer::{Label, LabelKind, LabelStart, Tokenizer}; use crate::util::{ constant::RESOURCE_DESTINATION_BALANCE_MAX, @@ -660,7 +661,7 @@ pub fn reference_collapsed_open(tokenizer: &mut Tokenizer) -> State { /// /// This turns matching label starts and label ends into links, images, and /// footnotes, and turns unmatched label starts back into data. -pub fn resolve(tokenizer: &mut Tokenizer) { +pub fn resolve(tokenizer: &mut Tokenizer) -> Result<Option<Subresult>, String> { // Inject labels. let labels = tokenizer.tokenize_state.labels.split_off(0); inject_labels(tokenizer, &labels); @@ -671,6 +672,8 @@ pub fn resolve(tokenizer: &mut Tokenizer) { mark_as_data(tokenizer, &starts); tokenizer.map.consume(&mut tokenizer.events); + + Ok(None) } /// Inject links/images/footnotes. 
diff --git a/src/construct/list_item.rs b/src/construct/list_item.rs index 658c2c7..13b740b 100644 --- a/src/construct/list_item.rs +++ b/src/construct/list_item.rs @@ -62,13 +62,14 @@ use crate::construct::partial_space_or_tab::space_or_tab_min_max; use crate::event::{Kind, Name}; use crate::resolve::Name as ResolveName; use crate::state::{Name as StateName, State}; +use crate::subtokenize::Subresult; use crate::tokenizer::Tokenizer; use crate::util::{ constant::{LIST_ITEM_VALUE_SIZE_MAX, TAB_SIZE}, skip, slice::{Position, Slice}, }; -use alloc::{vec, vec::Vec}; +use alloc::{string::String, vec, vec::Vec}; /// Start of list item. /// @@ -370,7 +371,7 @@ pub fn cont_filled(tokenizer: &mut Tokenizer) -> State { } /// Find adjacent list items with the same marker. -pub fn resolve(tokenizer: &mut Tokenizer) { +pub fn resolve(tokenizer: &mut Tokenizer) -> Result<Option<Subresult>, String> { let mut lists_wip: Vec<(u8, usize, usize, usize)> = vec![]; let mut lists: Vec<(u8, usize, usize, usize)> = vec![]; let mut index = 0; @@ -472,4 +473,6 @@ pub fn resolve(tokenizer: &mut Tokenizer) { index += 1; } + + Ok(None) } diff --git a/src/construct/mod.rs b/src/construct/mod.rs index 1afa105..ae6facf 100644 --- a/src/construct/mod.rs +++ b/src/construct/mod.rs @@ -16,7 +16,7 @@ //! Content types also have a *rest* thing: after all things are parsed, //! there’s something left. //! In document, that is [flow][]. -//! In flow, that is a [paragraph][]. +//! In flow, that is [content][]. //! In string and text, that is [data][partial_data]. //! //! ## Construct @@ -37,6 +37,7 @@ //! * [character escape][character_escape] //! * [character reference][character_reference] //! * [code (indented)][code_indented] +//! * [content][] //! * [definition][] //! * [hard break (escape)][hard_break_escape] //! 
* [heading (atx)][heading_atx] @@ -149,6 +150,7 @@ pub mod block_quote; pub mod character_escape; pub mod character_reference; pub mod code_indented; +pub mod content; pub mod definition; pub mod document; pub mod flow; diff --git a/src/construct/paragraph.rs b/src/construct/paragraph.rs index c1e7311..78fbacb 100644 --- a/src/construct/paragraph.rs +++ b/src/construct/paragraph.rs @@ -1,4 +1,4 @@ -//! Paragraph occurs in the [flow][] content type. +//! Paragraph occurs in the [content][] content type. //! //! ## Grammar //! @@ -11,14 +11,15 @@ //! paragraph ::= 1*line *(eol 1*line) //! ``` //! -//! As this construct occurs in flow, like all flow constructs, it must be -//! followed by an eol (line ending) or eof (end of file). +//! This construct must be followed by an eol (line ending) or eof (end of +//! file), like flow constructs. //! //! Paragraphs can contain line endings and whitespace, but they are not //! allowed to contain blank lines, or to be blank themselves. //! //! The paragraph is interpreted as the [text][] content type. -//! That means that [autolinks][autolink], [code (text)][raw_text], etc are allowed. +//! That means that [autolinks][autolink], [code (text)][raw_text], etc are +//! allowed. //! //! ## HTML //! @@ -34,40 +35,57 @@ //! * [`content.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/content.js) //! * [*§ 4.8 Paragraphs* in `CommonMark`](https://spec.commonmark.org/0.30/#paragraphs) //! -//! [flow]: crate::construct::flow +//! [content]: crate::construct::content //! [text]: crate::construct::text //! [autolink]: crate::construct::autolink //! [raw_text]: crate::construct::raw_text //! 
[html]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-p-element -use crate::event::{Content, Kind, Link, Name}; -use crate::resolve::Name as ResolveName; +use crate::event::{Content, Link, Name}; use crate::state::{Name as StateName, State}; +use crate::subtokenize::link; use crate::tokenizer::Tokenizer; -use alloc::vec; -/// Before paragraph. +/// Paragraph start. /// /// ```markdown /// > | abc /// ^ +/// | def /// ``` pub fn start(tokenizer: &mut Tokenizer) -> State { - match tokenizer.current { - None | Some(b'\n') => unreachable!("unexpected eol/eof"), - _ => { - tokenizer.enter(Name::Paragraph); - tokenizer.enter_link( - Name::Data, - Link { - previous: None, - next: None, - content: Content::Text, - }, - ); - State::Retry(StateName::ParagraphInside) - } + debug_assert!(tokenizer.current.is_some()); + tokenizer.enter(Name::Paragraph); + State::Retry(StateName::ParagraphLineStart) +} + +/// Start of a line in a paragraph. +/// +/// ```markdown +/// > | abc +/// ^ +/// > | def +/// ^ +/// ``` +pub fn line_start(tokenizer: &mut Tokenizer) -> State { + debug_assert!(tokenizer.current.is_some()); + tokenizer.enter_link( + Name::Data, + Link { + previous: None, + next: None, + content: Content::Text, + }, + ); + + if tokenizer.tokenize_state.connect { + let index = tokenizer.events.len() - 1; + link(&mut tokenizer.events, index); + } else { + tokenizer.tokenize_state.connect = true; } + + State::Retry(StateName::ParagraphInside) } /// In paragraph. @@ -78,91 +96,20 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { /// ``` pub fn inside(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - None | Some(b'\n') => { + None => { + tokenizer.tokenize_state.connect = false; tokenizer.exit(Name::Data); tokenizer.exit(Name::Paragraph); - tokenizer.register_resolver_before(ResolveName::Paragraph); - // You’d be interrupting. 
- tokenizer.interrupt = true; State::Ok } + Some(b'\n') => { + tokenizer.consume(); + tokenizer.exit(Name::Data); + State::Next(StateName::ParagraphLineStart) + } _ => { tokenizer.consume(); State::Next(StateName::ParagraphInside) } } } - -/// Merge “`Paragraph`”s, which currently span a single line, into actual -/// `Paragraph`s that span multiple lines. -pub fn resolve(tokenizer: &mut Tokenizer) { - let mut index = 0; - - while index < tokenizer.events.len() { - let event = &tokenizer.events[index]; - - if event.kind == Kind::Enter && event.name == Name::Paragraph { - // Exit:Paragraph - let mut exit_index = index + 3; - - loop { - let mut enter_index = exit_index + 1; - - if enter_index == tokenizer.events.len() - || tokenizer.events[enter_index].name != Name::LineEnding - { - break; - } - - enter_index += 2; - - while enter_index < tokenizer.events.len() { - let event = &tokenizer.events[enter_index]; - - if event.name != Name::SpaceOrTab - && event.name != Name::BlockQuotePrefix - && event.name != Name::BlockQuoteMarker - { - break; - } - - enter_index += 1; - } - - if enter_index == tokenizer.events.len() - || tokenizer.events[enter_index].name != Name::Paragraph - { - break; - } - - // Remove Exit:Paragraph, Enter:LineEnding, Exit:LineEnding. - tokenizer.map.add(exit_index, 3, vec![]); - - // Remove Enter:Paragraph. - tokenizer.map.add(enter_index, 1, vec![]); - - // Add Exit:LineEnding position info to Exit:Data. - tokenizer.events[exit_index - 1].point = - tokenizer.events[exit_index + 2].point.clone(); - - // Link Enter:Data on the previous line to Enter:Data on this line. - if let Some(link) = &mut tokenizer.events[exit_index - 2].link { - link.next = Some(enter_index + 1); - } - if let Some(link) = &mut tokenizer.events[enter_index + 1].link { - link.previous = Some(exit_index - 2); - } - - // Potential next start. - exit_index = enter_index + 3; - } - - // Move to `Exit:Paragraph`. 
- index = exit_index; - } - - index += 1; - } - - tokenizer.map.consume(&mut tokenizer.events); -} diff --git a/src/construct/partial_data.rs b/src/construct/partial_data.rs index b6f1f47..b36d9f0 100644 --- a/src/construct/partial_data.rs +++ b/src/construct/partial_data.rs @@ -8,8 +8,9 @@ use crate::event::{Kind, Name}; use crate::state::{Name as StateName, State}; +use crate::subtokenize::Subresult; use crate::tokenizer::Tokenizer; -use alloc::vec; +use alloc::{string::String, vec}; /// At beginning of data. /// @@ -72,7 +73,7 @@ pub fn inside(tokenizer: &mut Tokenizer) -> State { } /// Merge adjacent data events. -pub fn resolve(tokenizer: &mut Tokenizer) { +pub fn resolve(tokenizer: &mut Tokenizer) -> Result<Option<Subresult>, String> { let mut index = 0; // Loop through events and merge adjacent data events. @@ -103,4 +104,6 @@ pub fn resolve(tokenizer: &mut Tokenizer) { index += 1; } + + Ok(None) } diff --git a/src/construct/string.rs b/src/construct/string.rs index dba1ac1..cf2f222 100644 --- a/src/construct/string.rs +++ b/src/construct/string.rs @@ -15,7 +15,9 @@ use crate::construct::partial_whitespace::resolve_whitespace; use crate::resolve::Name as ResolveName; use crate::state::{Name as StateName, State}; +use crate::subtokenize::Subresult; use crate::tokenizer::Tokenizer; +use alloc::string::String; /// Characters that can start something in string. const MARKERS: [u8; 2] = [b'&', b'\\']; @@ -74,6 +76,8 @@ pub fn before_data(tokenizer: &mut Tokenizer) -> State { } /// Resolve whitespace in string. 
-pub fn resolve(tokenizer: &mut Tokenizer) { +pub fn resolve(tokenizer: &mut Tokenizer) -> Result<Option<Subresult>, String> { resolve_whitespace(tokenizer, false, false); + + Ok(None) } diff --git a/src/construct/text.rs b/src/construct/text.rs index 34ea071..2648531 100644 --- a/src/construct/text.rs +++ b/src/construct/text.rs @@ -28,7 +28,9 @@ use crate::construct::gfm_autolink_literal::resolve as resolve_gfm_autolink_lite use crate::construct::partial_whitespace::resolve_whitespace; use crate::resolve::Name as ResolveName; use crate::state::{Name as StateName, State}; +use crate::subtokenize::Subresult; use crate::tokenizer::Tokenizer; +use alloc::string::String; /// Characters that can start something in text. const MARKERS: [u8; 16] = [ @@ -242,7 +244,7 @@ pub fn before_data(tokenizer: &mut Tokenizer) -> State { } /// Resolve whitespace. -pub fn resolve(tokenizer: &mut Tokenizer) { +pub fn resolve(tokenizer: &mut Tokenizer) -> Result<Option<Subresult>, String> { resolve_whitespace( tokenizer, tokenizer.parse_state.options.constructs.hard_break_trailing, @@ -257,4 +259,6 @@ pub fn resolve(tokenizer: &mut Tokenizer) { { resolve_gfm_autolink_literal(tokenizer); } + + Ok(None) } |