diff options
Diffstat (limited to '')
-rw-r--r-- | src/construct/attention.rs | 6 | ||||
-rw-r--r-- | src/construct/content.rs | 188 | ||||
-rw-r--r-- | src/construct/definition.rs | 26 | ||||
-rw-r--r-- | src/construct/document.rs | 5 | ||||
-rw-r--r-- | src/construct/flow.rs | 33 | ||||
-rw-r--r-- | src/construct/gfm_table.rs | 61 | ||||
-rw-r--r-- | src/construct/heading_atx.rs | 7 | ||||
-rw-r--r-- | src/construct/heading_setext.rs | 137 | ||||
-rw-r--r-- | src/construct/label_end.rs | 5 | ||||
-rw-r--r-- | src/construct/list_item.rs | 7 | ||||
-rw-r--r-- | src/construct/mod.rs | 4 | ||||
-rw-r--r-- | src/construct/paragraph.rs | 149 | ||||
-rw-r--r-- | src/construct/partial_data.rs | 7 | ||||
-rw-r--r-- | src/construct/string.rs | 6 | ||||
-rw-r--r-- | src/construct/text.rs | 6 |
15 files changed, 436 insertions, 211 deletions
diff --git a/src/construct/attention.rs b/src/construct/attention.rs index 4a208df..4d58610 100644 --- a/src/construct/attention.rs +++ b/src/construct/attention.rs @@ -79,6 +79,7 @@ use crate::event::{Event, Kind, Name, Point}; use crate::resolve::Name as ResolveName; use crate::state::{Name as StateName, State}; +use crate::subtokenize::Subresult; use crate::tokenizer::Tokenizer; use crate::util::{ char::{ @@ -87,6 +88,7 @@ use crate::util::{ }, slice::Slice, }; +use alloc::string::String; use alloc::{vec, vec::Vec}; /// Attentention sequence that we can take markers from. @@ -150,7 +152,7 @@ pub fn inside(tokenizer: &mut Tokenizer) -> State { } /// Resolve sequences. -pub fn resolve(tokenizer: &mut Tokenizer) { +pub fn resolve(tokenizer: &mut Tokenizer) -> Result<Option<Subresult>, String> { // Find all sequences, gather info about them. let mut sequences = get_sequences(tokenizer); @@ -221,6 +223,8 @@ pub fn resolve(tokenizer: &mut Tokenizer) { } tokenizer.map.consume(&mut tokenizer.events); + + Ok(None) } /// Get sequences. diff --git a/src/construct/content.rs b/src/construct/content.rs new file mode 100644 index 0000000..6c10cea --- /dev/null +++ b/src/construct/content.rs @@ -0,0 +1,188 @@ +//! Content occurs in the [flow][] content type. +//! +//! Content contains zero or more [definition][definition]s, followed by zero +//! or one [paragraph][]. +//! +//! The constructs found in flow are: +//! +//! * [Definition][crate::construct::definition] +//! * [Paragraph][crate::construct::paragraph] +//! +//! ## Tokens +//! +//! * [`Content`][Name::Content] +//! +//! > 👉 **Note**: while parsing, [`Content`][Name::Content] +//! > is used, which is later compiled away. +//! +//! ## References +//! +//! * [`content.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/content.js) +//! +//! [flow]: crate::construct::flow +//! [definition]: crate::construct::definition +//! [paragraph]: crate::construct::paragraph + +use crate::event::{Content, Kind, Link, Name}; +use crate::resolve::Name as ResolveName; +use crate::state::{Name as StateName, State}; +use crate::subtokenize::{subtokenize, Subresult}; +use crate::tokenizer::Tokenizer; +use alloc::{string::String, vec}; + +/// Before a content content. +/// +/// ```markdown +/// > | abc +/// ^ +/// ``` +pub fn chunk_start(tokenizer: &mut Tokenizer) -> State { + match tokenizer.current { + None | Some(b'\n') => unreachable!("unexpected eol/eof"), + _ => { + tokenizer.enter_link( + Name::Content, + Link { + previous: None, + next: None, + content: Content::Content, + }, + ); + State::Retry(StateName::ContentChunkInside) + } + } +} + +/// In a content chunk. +/// +/// ```markdown +/// > | abc +/// ^^^ +/// ``` +pub fn chunk_inside(tokenizer: &mut Tokenizer) -> State { + match tokenizer.current { + None | Some(b'\n') => { + tokenizer.exit(Name::Content); + tokenizer.register_resolver_before(ResolveName::Content); + // You’d be interrupting. + tokenizer.interrupt = true; + State::Ok + } + _ => { + tokenizer.consume(); + State::Next(StateName::ContentChunkInside) + } + } +} + +/// Before a definition. +/// +/// ```markdown +/// > | [a]: b +/// ^ +/// ``` +pub fn definition_before(tokenizer: &mut Tokenizer) -> State { + tokenizer.attempt( + State::Next(StateName::ContentDefinitionAfter), + State::Next(StateName::ParagraphStart), + ); + State::Retry(StateName::DefinitionStart) +} + +/// After a definition. +/// +/// ```markdown +/// > | [a]: b +/// ^ +/// | c +/// ``` +pub fn definition_after(tokenizer: &mut Tokenizer) -> State { + debug_assert!(matches!(tokenizer.current, None | Some(b'\n'))); + if tokenizer.current.is_none() { + State::Ok + } else { + tokenizer.enter(Name::LineEnding); + tokenizer.consume(); + tokenizer.exit(Name::LineEnding); + State::Next(StateName::ContentDefinitionBefore) + } +} + +/// Merge `Content` chunks, which currently span a single line, into actual +/// `Content`s that span multiple lines. +pub fn resolve(tokenizer: &mut Tokenizer) -> Result<Option<Subresult>, String> { + let mut index = 0; + + while index < tokenizer.events.len() { + let event = &tokenizer.events[index]; + + if event.kind == Kind::Enter && event.name == Name::Content { + // Exit:Content + let mut exit_index = index + 1; + + loop { + let mut enter_index = exit_index + 1; + + if enter_index == tokenizer.events.len() + || tokenizer.events[enter_index].name != Name::LineEnding + { + break; + } + + // Skip past line ending. + enter_index += 2; + + // Skip past prefix. + while enter_index < tokenizer.events.len() { + let event = &tokenizer.events[enter_index]; + + if event.name != Name::SpaceOrTab + && event.name != Name::BlockQuotePrefix + && event.name != Name::BlockQuoteMarker + { + break; + } + + enter_index += 1; + } + + if enter_index == tokenizer.events.len() + || tokenizer.events[enter_index].name != Name::Content + { + break; + } + + // Set Exit:Content point to Exit:LineEnding. + tokenizer.events[exit_index].point = tokenizer.events[exit_index + 2].point.clone(); + // Remove Enter:LineEnding, Exit:LineEnding. + tokenizer.map.add(exit_index + 1, 2, vec![]); + + // Link Enter:Content to Enter:Content on this line and vice versa. + tokenizer.events[exit_index - 1].link.as_mut().unwrap().next = Some(enter_index); + tokenizer.events[enter_index] + .link + .as_mut() + .unwrap() + .previous = Some(exit_index - 1); + + // Potential next start. + exit_index = enter_index + 1; + } + + // Move to `Exit:Content`. + index = exit_index; + } + + index += 1; + } + + tokenizer.map.consume(&mut tokenizer.events); + + let result = subtokenize( + &mut tokenizer.events, + tokenizer.parse_state, + &Some(Content::Content), + )?; + + Ok(Some(result)) +} diff --git a/src/construct/definition.rs b/src/construct/definition.rs index 1071489..8ccfb90 100644 --- a/src/construct/definition.rs +++ b/src/construct/definition.rs @@ -1,4 +1,4 @@ -//! Definition occurs in the [flow] content type. +//! Definition occurs in the [content] content type. //! //! ## Grammar //! @@ -12,8 +12,8 @@ //! ; those parts. //! ``` //! -//! As this construct occurs in flow, like all flow constructs, it must be -//! followed by an eol (line ending) or eof (end of file). +//! This construct must be followed by an eol (line ending) or eof (end of +//! file), like flow constructs. //! //! See [`destination`][destination], [`label`][label], and [`title`][title] //! for grammar, notes, and recommendations on each part. @@ -88,7 +88,7 @@ //! * [`definition.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/definition.js) //! * [*§ 4.7 Link reference definitions* in `CommonMark`](https://spec.commonmark.org/0.30/#link-reference-definitions) //! -//! [flow]: crate::construct::flow +//! [content]: crate::construct::content //! [string]: crate::construct::string //! [character_escape]: crate::construct::character_escape //! [character_reference]: crate::construct::character_reference @@ -157,7 +157,10 @@ pub fn before(tokenizer: &mut Tokenizer) -> State { tokenizer.tokenize_state.token_1 = Name::DefinitionLabel; tokenizer.tokenize_state.token_2 = Name::DefinitionLabelMarker; tokenizer.tokenize_state.token_3 = Name::DefinitionLabelString; - tokenizer.attempt(State::Next(StateName::DefinitionLabelAfter), State::Nok); + tokenizer.attempt( + State::Next(StateName::DefinitionLabelAfter), + State::Next(StateName::DefinitionLabelNok), + ); State::Retry(StateName::LabelStart) } _ => State::Nok, @@ -192,6 +195,19 @@ pub fn label_after(tokenizer: &mut Tokenizer) -> State { } } +/// At a non-label +/// +/// ```markdown +/// > | [] +/// ^ +/// ``` +pub fn label_nok(tokenizer: &mut Tokenizer) -> State { + tokenizer.tokenize_state.token_1 = Name::Data; + tokenizer.tokenize_state.token_2 = Name::Data; + tokenizer.tokenize_state.token_3 = Name::Data; + State::Nok +} + /// After marker. /// /// ```markdown diff --git a/src/construct/document.rs b/src/construct/document.rs index 45a961d..82f2ebd 100644 --- a/src/construct/document.rs +++ b/src/construct/document.rs @@ -413,7 +413,7 @@ pub fn flow_end(tokenizer: &mut Tokenizer) -> State { while !document_lazy_continuation_current && stack_index > 0 { stack_index -= 1; let name = &child.stack[stack_index]; - if name == &Name::Paragraph || name == &Name::Definition || name == &Name::GfmTableHead { + if name == &Name::Content || name == &Name::GfmTableHead { document_lazy_continuation_current = true; } } @@ -423,7 +423,7 @@ pub fn flow_end(tokenizer: &mut Tokenizer) -> State { if !document_lazy_continuation_current && !child.events.is_empty() { let before = skip::opt_back(&child.events, child.events.len() - 1, &[Name::LineEnding]); let name = &child.events[before].name; - if name == &Name::Paragraph { + if name == &Name::Content { document_lazy_continuation_current = true; } } @@ -582,6 +582,7 @@ fn resolve(tokenizer: &mut Tokenizer) { &tokenizer.events, flow_index, &mut child.events, + (0, 0), ); // Replace the flow data with actual events. diff --git a/src/construct/flow.rs b/src/construct/flow.rs index e97ee63..08e0466 100644 --- a/src/construct/flow.rs +++ b/src/construct/flow.rs @@ -12,7 +12,6 @@ //! //! * [Blank line][crate::construct::blank_line] //! * [Code (indented)][crate::construct::code_indented] -//! * [Definition][crate::construct::definition] //! * [Heading (atx)][crate::construct::heading_atx] //! * [Heading (setext)][crate::construct::heading_setext] //! * [HTML (flow)][crate::construct::html_flow] @@ -40,14 +39,14 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { Some(b'#') => { tokenizer.attempt( State::Next(StateName::FlowAfter), - State::Next(StateName::FlowBeforeParagraph), + State::Next(StateName::FlowBeforeContent), ); State::Retry(StateName::HeadingAtxStart) } Some(b'$' | b'`' | b'~') => { tokenizer.attempt( State::Next(StateName::FlowAfter), - State::Next(StateName::FlowBeforeParagraph), + State::Next(StateName::FlowBeforeContent), ); State::Retry(StateName::RawFlowStart) } @@ -56,7 +55,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { Some(b'*' | b'_') => { tokenizer.attempt( State::Next(StateName::FlowAfter), - State::Next(StateName::FlowBeforeParagraph), + State::Next(StateName::FlowBeforeContent), ); State::Retry(StateName::ThematicBreakStart) } @@ -70,12 +69,12 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { Some(b'{') => { tokenizer.attempt( State::Next(StateName::FlowAfter), - State::Next(StateName::FlowBeforeParagraph), + State::Next(StateName::FlowBeforeContent), ); State::Retry(StateName::MdxExpressionFlowStart) } // Actual parsing: blank line? Indented code? Indented anything? - // Tables, setext heading underlines, definitions, and paragraphs are + // Tables, setext heading underlines, definitions, and Contents are // particularly weird. _ => State::Retry(StateName::FlowBlankLineBefore), } @@ -217,34 +216,20 @@ pub fn before_mdx_expression(tokenizer: &mut Tokenizer) -> State { pub fn before_gfm_table(tokenizer: &mut Tokenizer) -> State { tokenizer.attempt( State::Next(StateName::FlowAfter), - State::Next(StateName::FlowBeforeDefinition), + State::Next(StateName::FlowBeforeContent), ); State::Retry(StateName::GfmTableStart) } -/// At definition. -/// -/// ```markdown -/// > | [a]: b -/// ^ -/// ``` -pub fn before_definition(tokenizer: &mut Tokenizer) -> State { - tokenizer.attempt( - State::Next(StateName::FlowAfter), - State::Next(StateName::FlowBeforeParagraph), - ); - State::Retry(StateName::DefinitionStart) -} - -/// At paragraph. +/// At content. /// /// ```markdown /// > | a /// ^ /// ``` -pub fn before_paragraph(tokenizer: &mut Tokenizer) -> State { +pub fn before_content(tokenizer: &mut Tokenizer) -> State { tokenizer.attempt(State::Next(StateName::FlowAfter), State::Nok); - State::Retry(StateName::ParagraphStart) + State::Retry(StateName::ContentChunkStart) } /// After blank line. diff --git a/src/construct/gfm_table.rs b/src/construct/gfm_table.rs index 27fbadf..63772c4 100644 --- a/src/construct/gfm_table.rs +++ b/src/construct/gfm_table.rs @@ -229,9 +229,10 @@ use crate::construct::partial_space_or_tab::{space_or_tab, space_or_tab_min_max} use crate::event::{Content, Event, Kind, Link, Name}; use crate::resolve::Name as ResolveName; use crate::state::{Name as StateName, State}; +use crate::subtokenize::Subresult; use crate::tokenizer::Tokenizer; use crate::util::{constant::TAB_SIZE, skip::opt_back as skip_opt_back}; -use alloc::vec; +use alloc::{string::String, vec}; /// Start of a GFM table. /// @@ -771,15 +772,13 @@ pub fn body_row_escape(tokenizer: &mut Tokenizer) -> State { } /// Resolve GFM table. -pub fn resolve(tokenizer: &mut Tokenizer) { +pub fn resolve(tokenizer: &mut Tokenizer) -> Result<Option<Subresult>, String> { let mut index = 0; - // let mut tables = vec![]; let mut in_first_cell_awaiting_pipe = true; let mut in_row = false; let mut in_delimiter_row = false; let mut last_cell = (0, 0, 0, 0); let mut cell = (0, 0, 0, 0); - let mut after_head_awaiting_first_body_row = false; let mut last_table_end = 0; let mut last_table_has_body = false; @@ -800,17 +799,14 @@ pub fn resolve(tokenizer: &mut Tokenizer) { } // Inject table start. - tokenizer.map.add( - index, - 0, - vec![Event { - kind: Kind::Enter, - name: Name::GfmTable, - point: tokenizer.events[index].point.clone(), - link: None, - }], - ); - } else if event.name == Name::GfmTableRow || event.name == Name::GfmTableDelimiterRow { + let enter = Event { + kind: Kind::Enter, + name: Name::GfmTable, + point: tokenizer.events[index].point.clone(), + link: None, + }; + tokenizer.map.add(index, 0, vec![enter]); + } else if matches!(event.name, Name::GfmTableRow | Name::GfmTableDelimiterRow) { in_delimiter_row = event.name == Name::GfmTableDelimiterRow; in_row = true; in_first_cell_awaiting_pipe = true; @@ -821,23 +817,21 @@ pub fn resolve(tokenizer: &mut Tokenizer) { if after_head_awaiting_first_body_row { after_head_awaiting_first_body_row = false; last_table_has_body = true; - tokenizer.map.add( - index, - 0, - vec![Event { - kind: Kind::Enter, - name: Name::GfmTableBody, - point: tokenizer.events[index].point.clone(), - link: None, - }], - ); + let enter = Event { + kind: Kind::Enter, + name: Name::GfmTableBody, + point: tokenizer.events[index].point.clone(), + link: None, + }; + tokenizer.map.add(index, 0, vec![enter]); } } // Cell data. else if in_row - && (event.name == Name::Data - || event.name == Name::GfmTableDelimiterMarker - || event.name == Name::GfmTableDelimiterFiller) + && matches!( + event.name, + Name::Data | Name::GfmTableDelimiterMarker | Name::GfmTableDelimiterFiller + ) { in_first_cell_awaiting_pipe = false; @@ -868,7 +862,7 @@ pub fn resolve(tokenizer: &mut Tokenizer) { } else if event.name == Name::GfmTableHead { after_head_awaiting_first_body_row = true; last_table_end = index; - } else if event.name == Name::GfmTableRow || event.name == Name::GfmTableDelimiterRow { + } else if matches!(event.name, Name::GfmTableRow | Name::GfmTableDelimiterRow) { in_row = false; last_table_end = index; if last_cell.1 != 0 { @@ -878,9 +872,10 @@ pub fn resolve(tokenizer: &mut Tokenizer) { flush_cell(tokenizer, cell, in_delimiter_row, Some(index)); } } else if in_row - && (event.name == Name::Data - || event.name == Name::GfmTableDelimiterMarker - || event.name == Name::GfmTableDelimiterFiller) + && (matches!( + event.name, + Name::Data | Name::GfmTableDelimiterMarker | Name::GfmTableDelimiterFiller + )) { cell.3 = index; } @@ -891,6 +886,8 @@ pub fn resolve(tokenizer: &mut Tokenizer) { if last_table_end != 0 { flush_table_end(tokenizer, last_table_end, last_table_has_body); } + + Ok(None) } /// Generate a cell. diff --git a/src/construct/heading_atx.rs b/src/construct/heading_atx.rs index c1090c4..b76e455 100644 --- a/src/construct/heading_atx.rs +++ b/src/construct/heading_atx.rs @@ -66,9 +66,10 @@ use crate::construct::partial_space_or_tab::{space_or_tab, space_or_tab_min_max} use crate::event::{Content, Event, Kind, Link, Name}; use crate::resolve::Name as ResolveName; use crate::state::{Name as StateName, State}; +use crate::subtokenize::Subresult; use crate::tokenizer::Tokenizer; use crate::util::constant::{HEADING_ATX_OPENING_FENCE_SIZE_MAX, TAB_SIZE}; -use alloc::vec; +use alloc::{string::String, vec}; /// Start of a heading (atx). /// @@ -222,7 +223,7 @@ pub fn data(tokenizer: &mut Tokenizer) -> State { } /// Resolve heading (atx). -pub fn resolve(tokenizer: &mut Tokenizer) { +pub fn resolve(tokenizer: &mut Tokenizer) -> Result<Option<Subresult>, String> { let mut index = 0; let mut heading_inside = false; let mut data_start: Option<usize> = None; @@ -281,4 +282,6 @@ pub fn resolve(tokenizer: &mut Tokenizer) { index += 1; } + + Ok(None) } diff --git a/src/construct/heading_setext.rs b/src/construct/heading_setext.rs index e9cc759..3a484e1 100644 --- a/src/construct/heading_setext.rs +++ b/src/construct/heading_setext.rs @@ -54,6 +54,7 @@ //! * [`HeadingSetext`][Name::HeadingSetext] //! * [`HeadingSetextText`][Name::HeadingSetextText] //! * [`HeadingSetextUnderline`][Name::HeadingSetextUnderline] +//! * [`HeadingSetextUnderlineSequence`][Name::HeadingSetextUnderlineSequence] //! //! ## References //! @@ -70,12 +71,13 @@ //! [atx]: http://www.aaronsw.com/2002/atx/ use crate::construct::partial_space_or_tab::{space_or_tab, space_or_tab_min_max}; -use crate::event::{Kind, Name}; +use crate::event::{Content, Event, Kind, Link, Name}; use crate::resolve::Name as ResolveName; use crate::state::{Name as StateName, State}; +use crate::subtokenize::Subresult; use crate::tokenizer::Tokenizer; -use crate::util::{constant::TAB_SIZE, skip::opt_back as skip_opt_back}; -use alloc::vec; +use crate::util::{constant::TAB_SIZE, skip}; +use alloc::{string::String, vec}; /// At start of heading (setext) underline. /// @@ -90,14 +92,16 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { && !tokenizer.pierce // Require a paragraph before. && (!tokenizer.events.is_empty() - && tokenizer.events[skip_opt_back( + && tokenizer.events[skip::opt_back( &tokenizer.events, tokenizer.events.len() - 1, &[Name::LineEnding, Name::SpaceOrTab], )] .name - == Name::Paragraph) + == Name::Content) { + tokenizer.enter(Name::HeadingSetextUnderline); + if matches!(tokenizer.current, Some(b'\t' | b' ')) { tokenizer.attempt(State::Next(StateName::HeadingSetextBefore), State::Nok); State::Retry(space_or_tab_min_max( @@ -128,7 +132,7 @@ pub fn before(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { Some(b'-' | b'=') => { tokenizer.tokenize_state.marker = tokenizer.current.unwrap(); - tokenizer.enter(Name::HeadingSetextUnderline); + tokenizer.enter(Name::HeadingSetextUnderlineSequence); State::Retry(StateName::HeadingSetextInside) } _ => State::Nok, @@ -148,7 +152,7 @@ pub fn inside(tokenizer: &mut Tokenizer) -> State { State::Next(StateName::HeadingSetextInside) } else { tokenizer.tokenize_state.marker = 0; - tokenizer.exit(Name::HeadingSetextUnderline); + tokenizer.exit(Name::HeadingSetextUnderlineSequence); if matches!(tokenizer.current, Some(b'\t' | b' ')) { tokenizer.attempt(State::Next(StateName::HeadingSetextAfter), State::Nok); @@ -172,6 +176,7 @@ pub fn after(tokenizer: &mut Tokenizer) -> State { // Feel free to interrupt. tokenizer.interrupt = false; tokenizer.register_resolver(ResolveName::HeadingSetext); + tokenizer.exit(Name::HeadingSetextUnderline); State::Ok } _ => State::Nok, @@ -179,42 +184,102 @@ pub fn after(tokenizer: &mut Tokenizer) -> State { } /// Resolve heading (setext). -pub fn resolve(tokenizer: &mut Tokenizer) { - let mut index = 0; - let mut paragraph_enter = None; - let mut paragraph_exit = None; - - while index < tokenizer.events.len() { - let event = &tokenizer.events[index]; - - // Find paragraphs. - if event.kind == Kind::Enter { - if event.name == Name::Paragraph { - paragraph_enter = Some(index); - } - } else if event.name == Name::Paragraph { - paragraph_exit = Some(index); - } - // We know this is preceded by a paragraph. - // Otherwise we don’t parse. - else if event.name == Name::HeadingSetextUnderline { - let enter = paragraph_enter.take().unwrap(); - let exit = paragraph_exit.take().unwrap(); +pub fn resolve(tokenizer: &mut Tokenizer) -> Result<Option<Subresult>, String> { + tokenizer.map.consume(&mut tokenizer.events); + + let mut enter = skip::to(&tokenizer.events, 0, &[Name::HeadingSetextUnderline]); + + while enter < tokenizer.events.len() { + let exit = skip::to( + &tokenizer.events, + enter + 1, + &[Name::HeadingSetextUnderline], + ); + + // Find paragraph before + let paragraph_exit_before = skip::opt_back( + &tokenizer.events, + enter - 1, + &[Name::SpaceOrTab, Name::LineEnding, Name::BlockQuotePrefix], + ); + + // There’s a paragraph before: this is a setext heading. + if tokenizer.events[paragraph_exit_before].name == Name::Paragraph { + let paragraph_enter = skip::to_back( + &tokenizer.events, + paragraph_exit_before - 1, + &[Name::Paragraph], + ); // Change types of Enter:Paragraph, Exit:Paragraph. - tokenizer.events[enter].name = Name::HeadingSetextText; - tokenizer.events[exit].name = Name::HeadingSetextText; + tokenizer.events[paragraph_enter].name = Name::HeadingSetextText; + tokenizer.events[paragraph_exit_before].name = Name::HeadingSetextText; // Add Enter:HeadingSetext, Exit:HeadingSetext. - let mut heading_enter = tokenizer.events[enter].clone(); + let mut heading_enter = tokenizer.events[paragraph_enter].clone(); heading_enter.name = Name::HeadingSetext; - let mut heading_exit = tokenizer.events[index].clone(); + tokenizer.map.add(paragraph_enter, 0, vec![heading_enter]); + let mut heading_exit = tokenizer.events[exit].clone(); heading_exit.name = Name::HeadingSetext; - - tokenizer.map.add(enter, 0, vec![heading_enter]); - tokenizer.map.add(index + 1, 0, vec![heading_exit]); + tokenizer.map.add(exit + 1, 0, vec![heading_exit]); + } else { + // There’s a following paragraph, move this underline inside it. + if exit + 3 < tokenizer.events.len() + && tokenizer.events[exit + 1].name == Name::LineEnding + && tokenizer.events[exit + 3].name == Name::Paragraph + { + // Swap type, HeadingSetextUnderline:Enter -> Paragraph:Enter. + tokenizer.events[enter].name = Name::Paragraph; + // Swap type, LineEnding -> Data. + tokenizer.events[exit + 1].name = Name::Data; + tokenizer.events[exit + 2].name = Name::Data; + // Move new data (was line ending) back to include whole line, + // and link data together. + tokenizer.events[exit + 1].point = tokenizer.events[enter].point.clone(); + tokenizer.events[exit + 1].link = Some(Link { + previous: None, + next: Some(exit + 4), + content: Content::Text, + }); + tokenizer.events[exit + 4].link.as_mut().unwrap().previous = Some(exit + 1); + // Remove *including* HeadingSetextUnderline:Exit, until the line ending. + tokenizer.map.add(enter + 1, exit - enter, vec![]); + // Remove old Paragraph:Enter. + tokenizer.map.add(exit + 3, 1, vec![]); + } else { + // Swap type. + tokenizer.events[enter].name = Name::Paragraph; + tokenizer.events[exit].name = Name::Paragraph; + // Replace what’s inside the underline (whitespace, sequence). + tokenizer.map.add( + enter + 1, + exit - enter - 1, + vec![ + Event { + name: Name::Data, + kind: Kind::Enter, + point: tokenizer.events[enter].point.clone(), + link: Some(Link { + previous: None, + next: None, + content: Content::Text, + }), + }, + Event { + name: Name::Data, + kind: Kind::Exit, + point: tokenizer.events[exit].point.clone(), + link: None, + }, + ], + ); + } } - index += 1; + enter = skip::to(&tokenizer.events, exit + 1, &[Name::HeadingSetextUnderline]); } + + tokenizer.map.consume(&mut tokenizer.events); + + Ok(None) } diff --git a/src/construct/label_end.rs b/src/construct/label_end.rs index ce1c295..95b9a27 100644 --- a/src/construct/label_end.rs +++ b/src/construct/label_end.rs @@ -183,6 +183,7 @@ use crate::construct::partial_space_or_tab_eol::space_or_tab_eol; use crate::event::{Event, Kind, Name}; use crate::resolve::Name as ResolveName; use crate::state::{Name as StateName, State}; +use crate::subtokenize::Subresult; use crate::tokenizer::{Label, LabelKind, LabelStart, Tokenizer}; use crate::util::{ constant::RESOURCE_DESTINATION_BALANCE_MAX, @@ -660,7 +661,7 @@ pub fn reference_collapsed_open(tokenizer: &mut Tokenizer) -> State { /// /// This turns matching label starts and label ends into links, images, and /// footnotes, and turns unmatched label starts back into data. -pub fn resolve(tokenizer: &mut Tokenizer) { +pub fn resolve(tokenizer: &mut Tokenizer) -> Result<Option<Subresult>, String> { // Inject labels. let labels = tokenizer.tokenize_state.labels.split_off(0); inject_labels(tokenizer, &labels); @@ -671,6 +672,8 @@ pub fn resolve(tokenizer: &mut Tokenizer) { mark_as_data(tokenizer, &starts); tokenizer.map.consume(&mut tokenizer.events); + + Ok(None) } /// Inject links/images/footnotes. diff --git a/src/construct/list_item.rs b/src/construct/list_item.rs index 658c2c7..13b740b 100644 --- a/src/construct/list_item.rs +++ b/src/construct/list_item.rs @@ -62,13 +62,14 @@ use crate::construct::partial_space_or_tab::space_or_tab_min_max; use crate::event::{Kind, Name}; use crate::resolve::Name as ResolveName; use crate::state::{Name as StateName, State}; +use crate::subtokenize::Subresult; use crate::tokenizer::Tokenizer; use crate::util::{ constant::{LIST_ITEM_VALUE_SIZE_MAX, TAB_SIZE}, skip, slice::{Position, Slice}, }; -use alloc::{vec, vec::Vec}; +use alloc::{string::String, vec, vec::Vec}; /// Start of list item. /// @@ -370,7 +371,7 @@ pub fn cont_filled(tokenizer: &mut Tokenizer) -> State { } /// Find adjacent list items with the same marker. -pub fn resolve(tokenizer: &mut Tokenizer) { +pub fn resolve(tokenizer: &mut Tokenizer) -> Result<Option<Subresult>, String> { let mut lists_wip: Vec<(u8, usize, usize, usize)> = vec![]; let mut lists: Vec<(u8, usize, usize, usize)> = vec![]; let mut index = 0; @@ -472,4 +473,6 @@ pub fn resolve(tokenizer: &mut Tokenizer) { index += 1; } + + Ok(None) } diff --git a/src/construct/mod.rs b/src/construct/mod.rs index 1afa105..ae6facf 100644 --- a/src/construct/mod.rs +++ b/src/construct/mod.rs @@ -16,7 +16,7 @@ //! Content types also have a *rest* thing: after all things are parsed, //! there’s something left. //! In document, that is [flow][]. -//! In flow, that is a [paragraph][]. +//! In flow, that is [content][]. //! In string and text, that is [data][partial_data]. //! //! ## Construct @@ -37,6 +37,7 @@ //! * [character escape][character_escape] //! * [character reference][character_reference] //! * [code (indented)][code_indented] +//! * [content][] //! * [definition][] //! * [hard break (escape)][hard_break_escape] //! * [heading (atx)][heading_atx] @@ -149,6 +150,7 @@ pub mod block_quote; pub mod character_escape; pub mod character_reference; pub mod code_indented; +pub mod content; pub mod definition; pub mod document; pub mod flow; diff --git a/src/construct/paragraph.rs b/src/construct/paragraph.rs index c1e7311..78fbacb 100644 --- a/src/construct/paragraph.rs +++ b/src/construct/paragraph.rs @@ -1,4 +1,4 @@ -//! Paragraph occurs in the [flow][] content type. +//! Paragraph occurs in the [content][] content type. //! //! ## Grammar //! @@ -11,14 +11,15 @@ //! paragraph ::= 1*line *(eol 1*line) //! ``` //! -//! As this construct occurs in flow, like all flow constructs, it must be -//! followed by an eol (line ending) or eof (end of file). +//! This construct must be followed by an eol (line ending) or eof (end of +//! file), like flow constructs. //! //! Paragraphs can contain line endings and whitespace, but they are not //! allowed to contain blank lines, or to be blank themselves. //! //! The paragraph is interpreted as the [text][] content type. -//! That means that [autolinks][autolink], [code (text)][raw_text], etc are allowed. +//! That means that [autolinks][autolink], [code (text)][raw_text], etc are +//! allowed. //! //! ## HTML //! @@ -34,40 +35,57 @@ //! * [`content.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/content.js) //! * [*§ 4.8 Paragraphs* in `CommonMark`](https://spec.commonmark.org/0.30/#paragraphs) //! -//! [flow]: crate::construct::flow +//! [content]: crate::construct::content //! [text]: crate::construct::text //! [autolink]: crate::construct::autolink //! [raw_text]: crate::construct::raw_text //! [html]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-p-element -use crate::event::{Content, Kind, Link, Name}; -use crate::resolve::Name as ResolveName; +use crate::event::{Content, Link, Name}; use crate::state::{Name as StateName, State}; +use crate::subtokenize::link; use crate::tokenizer::Tokenizer; -use alloc::vec; -/// Before paragraph. +/// Paragraph start. /// /// ```markdown /// > | abc /// ^ +/// | def /// ``` pub fn start(tokenizer: &mut Tokenizer) -> State { - match tokenizer.current { - None | Some(b'\n') => unreachable!("unexpected eol/eof"), - _ => { - tokenizer.enter(Name::Paragraph); - tokenizer.enter_link( - Name::Data, - Link { - previous: None, - next: None, - content: Content::Text, - }, - ); - State::Retry(StateName::ParagraphInside) - } + debug_assert!(tokenizer.current.is_some()); + tokenizer.enter(Name::Paragraph); + State::Retry(StateName::ParagraphLineStart) +} + +/// Start of a line in a paragraph. +/// +/// ```markdown +/// > | abc +/// ^ +/// > | def +/// ^ +/// ``` +pub fn line_start(tokenizer: &mut Tokenizer) -> State { + debug_assert!(tokenizer.current.is_some()); + tokenizer.enter_link( + Name::Data, + Link { + previous: None, + next: None, + content: Content::Text, + }, + ); + + if tokenizer.tokenize_state.connect { + let index = tokenizer.events.len() - 1; + link(&mut tokenizer.events, index); + } else { + tokenizer.tokenize_state.connect = true; } + + State::Retry(StateName::ParagraphInside) } /// In paragraph. @@ -78,91 +96,20 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { /// ``` pub fn inside(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - None | Some(b'\n') => { + None => { + tokenizer.tokenize_state.connect = false; tokenizer.exit(Name::Data); tokenizer.exit(Name::Paragraph); - tokenizer.register_resolver_before(ResolveName::Paragraph); - // You’d be interrupting. - tokenizer.interrupt = true; State::Ok } + Some(b'\n') => { + tokenizer.consume(); + tokenizer.exit(Name::Data); + State::Next(StateName::ParagraphLineStart) + } _ => { tokenizer.consume(); State::Next(StateName::ParagraphInside) } } } - -/// Merge “`Paragraph`”s, which currently span a single line, into actual -/// `Paragraph`s that span multiple lines. -pub fn resolve(tokenizer: &mut Tokenizer) { - let mut index = 0; - - while index < tokenizer.events.len() { - let event = &tokenizer.events[index]; - - if event.kind == Kind::Enter && event.name == Name::Paragraph { - // Exit:Paragraph - let mut exit_index = index + 3; - - loop { - let mut enter_index = exit_index + 1; - - if enter_index == tokenizer.events.len() - || tokenizer.events[enter_index].name != Name::LineEnding - { - break; - } - - enter_index += 2; - - while enter_index < tokenizer.events.len() { - let event = &tokenizer.events[enter_index]; - - if event.name != Name::SpaceOrTab - && event.name != Name::BlockQuotePrefix - && event.name != Name::BlockQuoteMarker - { - break; - } - - enter_index += 1; - } - - if enter_index == tokenizer.events.len() - || tokenizer.events[enter_index].name != Name::Paragraph - { - break; - } - - // Remove Exit:Paragraph, Enter:LineEnding, Exit:LineEnding. - tokenizer.map.add(exit_index, 3, vec![]); - - // Remove Enter:Paragraph. - tokenizer.map.add(enter_index, 1, vec![]); - - // Add Exit:LineEnding position info to Exit:Data. - tokenizer.events[exit_index - 1].point = - tokenizer.events[exit_index + 2].point.clone(); - - // Link Enter:Data on the previous line to Enter:Data on this line. - if let Some(link) = &mut tokenizer.events[exit_index - 2].link { - link.next = Some(enter_index + 1); - } - if let Some(link) = &mut tokenizer.events[enter_index + 1].link { - link.previous = Some(exit_index - 2); - } - - // Potential next start. - exit_index = enter_index + 3; - } - - // Move to `Exit:Paragraph`. - index = exit_index; - } - - index += 1; - } - - tokenizer.map.consume(&mut tokenizer.events); -} diff --git a/src/construct/partial_data.rs b/src/construct/partial_data.rs index b6f1f47..b36d9f0 100644 --- a/src/construct/partial_data.rs +++ b/src/construct/partial_data.rs @@ -8,8 +8,9 @@ use crate::event::{Kind, Name}; use crate::state::{Name as StateName, State}; +use crate::subtokenize::Subresult; use crate::tokenizer::Tokenizer; -use alloc::vec; +use alloc::{string::String, vec}; /// At beginning of data. /// @@ -72,7 +73,7 @@ pub fn inside(tokenizer: &mut Tokenizer) -> State { } /// Merge adjacent data events. -pub fn resolve(tokenizer: &mut Tokenizer) { +pub fn resolve(tokenizer: &mut Tokenizer) -> Result<Option<Subresult>, String> { let mut index = 0; // Loop through events and merge adjacent data events. @@ -103,4 +104,6 @@ pub fn resolve(tokenizer: &mut Tokenizer) { index += 1; } + + Ok(None) } diff --git a/src/construct/string.rs b/src/construct/string.rs index dba1ac1..cf2f222 100644 --- a/src/construct/string.rs +++ b/src/construct/string.rs @@ -15,7 +15,9 @@ use crate::construct::partial_whitespace::resolve_whitespace; use crate::resolve::Name as ResolveName; use crate::state::{Name as StateName, State}; +use crate::subtokenize::Subresult; use crate::tokenizer::Tokenizer; +use alloc::string::String; /// Characters that can start something in string. const MARKERS: [u8; 2] = [b'&', b'\\']; @@ -74,6 +76,8 @@ pub fn before_data(tokenizer: &mut Tokenizer) -> State { } /// Resolve whitespace in string. -pub fn resolve(tokenizer: &mut Tokenizer) { +pub fn resolve(tokenizer: &mut Tokenizer) -> Result<Option<Subresult>, String> { resolve_whitespace(tokenizer, false, false); + + Ok(None) } diff --git a/src/construct/text.rs b/src/construct/text.rs index 34ea071..2648531 100644 --- a/src/construct/text.rs +++ b/src/construct/text.rs @@ -28,7 +28,9 @@ use crate::construct::gfm_autolink_literal::resolve as resolve_gfm_autolink_lite use crate::construct::partial_whitespace::resolve_whitespace; use crate::resolve::Name as ResolveName; use crate::state::{Name as StateName, State}; +use crate::subtokenize::Subresult; use crate::tokenizer::Tokenizer; +use alloc::string::String; /// Characters that can start something in text. const MARKERS: [u8; 16] = [ @@ -242,7 +244,7 @@ pub fn before_data(tokenizer: &mut Tokenizer) -> State { } /// Resolve whitespace. -pub fn resolve(tokenizer: &mut Tokenizer) { +pub fn resolve(tokenizer: &mut Tokenizer) -> Result<Option<Subresult>, String> { resolve_whitespace( tokenizer, tokenizer.parse_state.options.constructs.hard_break_trailing, @@ -257,4 +259,6 @@ pub fn resolve(tokenizer: &mut Tokenizer) { { resolve_gfm_autolink_literal(tokenizer); } + + Ok(None) } |