diff options
Diffstat (limited to '')
-rw-r--r-- | src/compiler.rs | 6 | ||||
-rw-r--r-- | src/construct/attention.rs | 6 | ||||
-rw-r--r-- | src/construct/content.rs | 188 | ||||
-rw-r--r-- | src/construct/definition.rs | 26 | ||||
-rw-r--r-- | src/construct/document.rs | 5 | ||||
-rw-r--r-- | src/construct/flow.rs | 33 | ||||
-rw-r--r-- | src/construct/gfm_table.rs | 61 | ||||
-rw-r--r-- | src/construct/heading_atx.rs | 7 | ||||
-rw-r--r-- | src/construct/heading_setext.rs | 137 | ||||
-rw-r--r-- | src/construct/label_end.rs | 5 | ||||
-rw-r--r-- | src/construct/list_item.rs | 7 | ||||
-rw-r--r-- | src/construct/mod.rs | 4 | ||||
-rw-r--r-- | src/construct/paragraph.rs | 149 | ||||
-rw-r--r-- | src/construct/partial_data.rs | 7 | ||||
-rw-r--r-- | src/construct/string.rs | 6 | ||||
-rw-r--r-- | src/construct/text.rs | 6 | ||||
-rw-r--r-- | src/event.rs | 51 | ||||
-rw-r--r-- | src/parser.rs | 23 | ||||
-rw-r--r-- | src/resolve.rs | 20 | ||||
-rw-r--r-- | src/state.rs | 202 | ||||
-rw-r--r-- | src/subtokenize.rs | 61 | ||||
-rw-r--r-- | src/tokenizer.rs | 23 |
22 files changed, 680 insertions, 353 deletions
diff --git a/src/compiler.rs b/src/compiler.rs index 397e96f..d1ac774 100644 --- a/src/compiler.rs +++ b/src/compiler.rs @@ -463,7 +463,7 @@ fn exit(context: &mut CompileContext) { Name::HeadingAtxSequence => on_exit_heading_atx_sequence(context), Name::HeadingAtxText => on_exit_heading_atx_text(context), Name::HeadingSetextText => on_exit_heading_setext_text(context), - Name::HeadingSetextUnderline => on_exit_heading_setext_underline(context), + Name::HeadingSetextUnderlineSequence => on_exit_heading_setext_underline_sequence(context), Name::HtmlFlow | Name::HtmlText => on_exit_html(context), Name::HtmlFlowData | Name::HtmlTextData => on_exit_html_data(context), Name::Image | Name::Link => on_exit_media(context), @@ -1440,8 +1440,8 @@ fn on_exit_heading_setext_text(context: &mut CompileContext) { context.slurp_one_line_ending = true; } -/// Handle [`Exit`][Kind::Exit]:[`HeadingSetextUnderline`][Name::HeadingSetextUnderline]. -fn on_exit_heading_setext_underline(context: &mut CompileContext) { +/// Handle [`Exit`][Kind::Exit]:[`HeadingSetextUnderlineSequence`][Name::HeadingSetextUnderlineSequence]. +fn on_exit_heading_setext_underline_sequence(context: &mut CompileContext) { let text = context .heading_setext_buffer .take() diff --git a/src/construct/attention.rs b/src/construct/attention.rs index 4a208df..4d58610 100644 --- a/src/construct/attention.rs +++ b/src/construct/attention.rs @@ -79,6 +79,7 @@ use crate::event::{Event, Kind, Name, Point}; use crate::resolve::Name as ResolveName; use crate::state::{Name as StateName, State}; +use crate::subtokenize::Subresult; use crate::tokenizer::Tokenizer; use crate::util::{ char::{ @@ -87,6 +88,7 @@ use crate::util::{ }, slice::Slice, }; +use alloc::string::String; use alloc::{vec, vec::Vec}; /// Attentention sequence that we can take markers from. @@ -150,7 +152,7 @@ pub fn inside(tokenizer: &mut Tokenizer) -> State { } /// Resolve sequences. -pub fn resolve(tokenizer: &mut Tokenizer) { +pub fn resolve(tokenizer: &mut Tokenizer) -> Result<Option<Subresult>, String> { // Find all sequences, gather info about them. let mut sequences = get_sequences(tokenizer); @@ -221,6 +223,8 @@ pub fn resolve(tokenizer: &mut Tokenizer) { } tokenizer.map.consume(&mut tokenizer.events); + + Ok(None) } /// Get sequences. diff --git a/src/construct/content.rs b/src/construct/content.rs new file mode 100644 index 0000000..6c10cea --- /dev/null +++ b/src/construct/content.rs @@ -0,0 +1,188 @@ +//! Content occurs in the [flow][] content type. +//! +//! Content contains zero or more [definition][definition]s, followed by zero +//! or one [paragraph][]. +//! +//! The constructs found in flow are: +//! +//! * [Definition][crate::construct::definition] +//! * [Paragraph][crate::construct::paragraph] +//! +//! ## Tokens +//! +//! * [`Content`][Name::Content] +//! +//! > 👉 **Note**: while parsing, [`Content`][Name::Content] +//! > is used, which is later compiled away. +//! +//! ## References +//! +//! * [`content.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/content.js) +//! +//! [flow]: crate::construct::flow +//! [definition]: crate::construct::definition +//! [paragraph]: crate::construct::paragraph + +use crate::event::{Content, Kind, Link, Name}; +use crate::resolve::Name as ResolveName; +use crate::state::{Name as StateName, State}; +use crate::subtokenize::{subtokenize, Subresult}; +use crate::tokenizer::Tokenizer; +use alloc::{string::String, vec}; + +/// Before a content content. +/// +/// ```markdown +/// > | abc +/// ^ +/// ``` +pub fn chunk_start(tokenizer: &mut Tokenizer) -> State { + match tokenizer.current { + None | Some(b'\n') => unreachable!("unexpected eol/eof"), + _ => { + tokenizer.enter_link( + Name::Content, + Link { + previous: None, + next: None, + content: Content::Content, + }, + ); + State::Retry(StateName::ContentChunkInside) + } + } +} + +/// In a content chunk. +/// +/// ```markdown +/// > | abc +/// ^^^ +/// ``` +pub fn chunk_inside(tokenizer: &mut Tokenizer) -> State { + match tokenizer.current { + None | Some(b'\n') => { + tokenizer.exit(Name::Content); + tokenizer.register_resolver_before(ResolveName::Content); + // You’d be interrupting. + tokenizer.interrupt = true; + State::Ok + } + _ => { + tokenizer.consume(); + State::Next(StateName::ContentChunkInside) + } + } +} + +/// Before a definition. +/// +/// ```markdown +/// > | [a]: b +/// ^ +/// ``` +pub fn definition_before(tokenizer: &mut Tokenizer) -> State { + tokenizer.attempt( + State::Next(StateName::ContentDefinitionAfter), + State::Next(StateName::ParagraphStart), + ); + State::Retry(StateName::DefinitionStart) +} + +/// After a definition. +/// +/// ```markdown +/// > | [a]: b +/// ^ +/// | c +/// ``` +pub fn definition_after(tokenizer: &mut Tokenizer) -> State { + debug_assert!(matches!(tokenizer.current, None | Some(b'\n'))); + if tokenizer.current.is_none() { + State::Ok + } else { + tokenizer.enter(Name::LineEnding); + tokenizer.consume(); + tokenizer.exit(Name::LineEnding); + State::Next(StateName::ContentDefinitionBefore) + } +} + +/// Merge `Content` chunks, which currently span a single line, into actual +/// `Content`s that span multiple lines. +pub fn resolve(tokenizer: &mut Tokenizer) -> Result<Option<Subresult>, String> { + let mut index = 0; + + while index < tokenizer.events.len() { + let event = &tokenizer.events[index]; + + if event.kind == Kind::Enter && event.name == Name::Content { + // Exit:Content + let mut exit_index = index + 1; + + loop { + let mut enter_index = exit_index + 1; + + if enter_index == tokenizer.events.len() + || tokenizer.events[enter_index].name != Name::LineEnding + { + break; + } + + // Skip past line ending. + enter_index += 2; + + // Skip past prefix. + while enter_index < tokenizer.events.len() { + let event = &tokenizer.events[enter_index]; + + if event.name != Name::SpaceOrTab + && event.name != Name::BlockQuotePrefix + && event.name != Name::BlockQuoteMarker + { + break; + } + + enter_index += 1; + } + + if enter_index == tokenizer.events.len() + || tokenizer.events[enter_index].name != Name::Content + { + break; + } + + // Set Exit:Content point to Exit:LineEnding. + tokenizer.events[exit_index].point = tokenizer.events[exit_index + 2].point.clone(); + // Remove Enter:LineEnding, Exit:LineEnding. + tokenizer.map.add(exit_index + 1, 2, vec![]); + + // Link Enter:Content to Enter:Content on this line and vice versa. + tokenizer.events[exit_index - 1].link.as_mut().unwrap().next = Some(enter_index); + tokenizer.events[enter_index] + .link + .as_mut() + .unwrap() + .previous = Some(exit_index - 1); + + // Potential next start. + exit_index = enter_index + 1; + } + + // Move to `Exit:Content`. + index = exit_index; + } + + index += 1; + } + + tokenizer.map.consume(&mut tokenizer.events); + + let result = subtokenize( + &mut tokenizer.events, + tokenizer.parse_state, + &Some(Content::Content), + )?; + + Ok(Some(result)) +} diff --git a/src/construct/definition.rs b/src/construct/definition.rs index 1071489..8ccfb90 100644 --- a/src/construct/definition.rs +++ b/src/construct/definition.rs @@ -1,4 +1,4 @@ -//! Definition occurs in the [flow] content type. +//! Definition occurs in the [content] content type. //! //! ## Grammar //! @@ -12,8 +12,8 @@ //! ; those parts. //! ``` //! -//! As this construct occurs in flow, like all flow constructs, it must be -//! followed by an eol (line ending) or eof (end of file). +//! This construct must be followed by an eol (line ending) or eof (end of +//! file), like flow constructs. //! //! See [`destination`][destination], [`label`][label], and [`title`][title] //! for grammar, notes, and recommendations on each part. @@ -88,7 +88,7 @@ //! * [`definition.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/definition.js) //! * [*§ 4.7 Link reference definitions* in `CommonMark`](https://spec.commonmark.org/0.30/#link-reference-definitions) //! -//! [flow]: crate::construct::flow +//! [content]: crate::construct::content //! [string]: crate::construct::string //! [character_escape]: crate::construct::character_escape //! [character_reference]: crate::construct::character_reference @@ -157,7 +157,10 @@ pub fn before(tokenizer: &mut Tokenizer) -> State { tokenizer.tokenize_state.token_1 = Name::DefinitionLabel; tokenizer.tokenize_state.token_2 = Name::DefinitionLabelMarker; tokenizer.tokenize_state.token_3 = Name::DefinitionLabelString; - tokenizer.attempt(State::Next(StateName::DefinitionLabelAfter), State::Nok); + tokenizer.attempt( + State::Next(StateName::DefinitionLabelAfter), + State::Next(StateName::DefinitionLabelNok), + ); State::Retry(StateName::LabelStart) } _ => State::Nok, @@ -192,6 +195,19 @@ pub fn label_after(tokenizer: &mut Tokenizer) -> State { } } +/// At a non-label +/// +/// ```markdown +/// > | [] +/// ^ +/// ``` +pub fn label_nok(tokenizer: &mut Tokenizer) -> State { + tokenizer.tokenize_state.token_1 = Name::Data; + tokenizer.tokenize_state.token_2 = Name::Data; + tokenizer.tokenize_state.token_3 = Name::Data; + State::Nok +} + /// After marker. /// /// ```markdown diff --git a/src/construct/document.rs b/src/construct/document.rs index 45a961d..82f2ebd 100644 --- a/src/construct/document.rs +++ b/src/construct/document.rs @@ -413,7 +413,7 @@ pub fn flow_end(tokenizer: &mut Tokenizer) -> State { while !document_lazy_continuation_current && stack_index > 0 { stack_index -= 1; let name = &child.stack[stack_index]; - if name == &Name::Paragraph || name == &Name::Definition || name == &Name::GfmTableHead { + if name == &Name::Content || name == &Name::GfmTableHead { document_lazy_continuation_current = true; } } @@ -423,7 +423,7 @@ pub fn flow_end(tokenizer: &mut Tokenizer) -> State { if !document_lazy_continuation_current && !child.events.is_empty() { let before = skip::opt_back(&child.events, child.events.len() - 1, &[Name::LineEnding]); let name = &child.events[before].name; - if name == &Name::Paragraph { + if name == &Name::Content { document_lazy_continuation_current = true; } } @@ -582,6 +582,7 @@ fn resolve(tokenizer: &mut Tokenizer) { &tokenizer.events, flow_index, &mut child.events, + (0, 0), ); // Replace the flow data with actual events. diff --git a/src/construct/flow.rs b/src/construct/flow.rs index e97ee63..08e0466 100644 --- a/src/construct/flow.rs +++ b/src/construct/flow.rs @@ -12,7 +12,6 @@ //! //! * [Blank line][crate::construct::blank_line] //! * [Code (indented)][crate::construct::code_indented] -//! * [Definition][crate::construct::definition] //! * [Heading (atx)][crate::construct::heading_atx] //! * [Heading (setext)][crate::construct::heading_setext] //! * [HTML (flow)][crate::construct::html_flow] @@ -40,14 +39,14 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { Some(b'#') => { tokenizer.attempt( State::Next(StateName::FlowAfter), - State::Next(StateName::FlowBeforeParagraph), + State::Next(StateName::FlowBeforeContent), ); State::Retry(StateName::HeadingAtxStart) } Some(b'$' | b'`' | b'~') => { tokenizer.attempt( State::Next(StateName::FlowAfter), - State::Next(StateName::FlowBeforeParagraph), + State::Next(StateName::FlowBeforeContent), ); State::Retry(StateName::RawFlowStart) } @@ -56,7 +55,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { Some(b'*' | b'_') => { tokenizer.attempt( State::Next(StateName::FlowAfter), - State::Next(StateName::FlowBeforeParagraph), + State::Next(StateName::FlowBeforeContent), ); State::Retry(StateName::ThematicBreakStart) } @@ -70,12 +69,12 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { Some(b'{') => { tokenizer.attempt( State::Next(StateName::FlowAfter), - State::Next(StateName::FlowBeforeParagraph), + State::Next(StateName::FlowBeforeContent), ); State::Retry(StateName::MdxExpressionFlowStart) } // Actual parsing: blank line? Indented code? Indented anything? - // Tables, setext heading underlines, definitions, and paragraphs are + // Tables, setext heading underlines, definitions, and Contents are // particularly weird. _ => State::Retry(StateName::FlowBlankLineBefore), } @@ -217,34 +216,20 @@ pub fn before_mdx_expression(tokenizer: &mut Tokenizer) -> State { pub fn before_gfm_table(tokenizer: &mut Tokenizer) -> State { tokenizer.attempt( State::Next(StateName::FlowAfter), - State::Next(StateName::FlowBeforeDefinition), + State::Next(StateName::FlowBeforeContent), ); State::Retry(StateName::GfmTableStart) } -/// At definition. -/// -/// ```markdown -/// > | [a]: b -/// ^ -/// ``` -pub fn before_definition(tokenizer: &mut Tokenizer) -> State { - tokenizer.attempt( - State::Next(StateName::FlowAfter), - State::Next(StateName::FlowBeforeParagraph), - ); - State::Retry(StateName::DefinitionStart) -} - -/// At paragraph. +/// At content. /// /// ```markdown /// > | a /// ^ /// ``` -pub fn before_paragraph(tokenizer: &mut Tokenizer) -> State { +pub fn before_content(tokenizer: &mut Tokenizer) -> State { tokenizer.attempt(State::Next(StateName::FlowAfter), State::Nok); - State::Retry(StateName::ParagraphStart) + State::Retry(StateName::ContentChunkStart) } /// After blank line. diff --git a/src/construct/gfm_table.rs b/src/construct/gfm_table.rs index 27fbadf..63772c4 100644 --- a/src/construct/gfm_table.rs +++ b/src/construct/gfm_table.rs @@ -229,9 +229,10 @@ use crate::construct::partial_space_or_tab::{space_or_tab, space_or_tab_min_max} use crate::event::{Content, Event, Kind, Link, Name}; use crate::resolve::Name as ResolveName; use crate::state::{Name as StateName, State}; +use crate::subtokenize::Subresult; use crate::tokenizer::Tokenizer; use crate::util::{constant::TAB_SIZE, skip::opt_back as skip_opt_back}; -use alloc::vec; +use alloc::{string::String, vec}; /// Start of a GFM table. /// @@ -771,15 +772,13 @@ pub fn body_row_escape(tokenizer: &mut Tokenizer) -> State { } /// Resolve GFM table. -pub fn resolve(tokenizer: &mut Tokenizer) { +pub fn resolve(tokenizer: &mut Tokenizer) -> Result<Option<Subresult>, String> { let mut index = 0; - // let mut tables = vec![]; let mut in_first_cell_awaiting_pipe = true; let mut in_row = false; let mut in_delimiter_row = false; let mut last_cell = (0, 0, 0, 0); let mut cell = (0, 0, 0, 0); - let mut after_head_awaiting_first_body_row = false; let mut last_table_end = 0; let mut last_table_has_body = false; @@ -800,17 +799,14 @@ pub fn resolve(tokenizer: &mut Tokenizer) { } // Inject table start. - tokenizer.map.add( - index, - 0, - vec![Event { - kind: Kind::Enter, - name: Name::GfmTable, - point: tokenizer.events[index].point.clone(), - link: None, - }], - ); - } else if event.name == Name::GfmTableRow || event.name == Name::GfmTableDelimiterRow { + let enter = Event { + kind: Kind::Enter, + name: Name::GfmTable, + point: tokenizer.events[index].point.clone(), + link: None, + }; + tokenizer.map.add(index, 0, vec![enter]); + } else if matches!(event.name, Name::GfmTableRow | Name::GfmTableDelimiterRow) { in_delimiter_row = event.name == Name::GfmTableDelimiterRow; in_row = true; in_first_cell_awaiting_pipe = true; @@ -821,23 +817,21 @@ pub fn resolve(tokenizer: &mut Tokenizer) { if after_head_awaiting_first_body_row { after_head_awaiting_first_body_row = false; last_table_has_body = true; - tokenizer.map.add( - index, - 0, - vec![Event { - kind: Kind::Enter, - name: Name::GfmTableBody, - point: tokenizer.events[index].point.clone(), - link: None, - }], - ); + let enter = Event { + kind: Kind::Enter, + name: Name::GfmTableBody, + point: tokenizer.events[index].point.clone(), + link: None, + }; + tokenizer.map.add(index, 0, vec![enter]); } } // Cell data. else if in_row - && (event.name == Name::Data - || event.name == Name::GfmTableDelimiterMarker - || event.name == Name::GfmTableDelimiterFiller) + && matches!( + event.name, + Name::Data | Name::GfmTableDelimiterMarker | Name::GfmTableDelimiterFiller + ) { in_first_cell_awaiting_pipe = false; @@ -868,7 +862,7 @@ pub fn resolve(tokenizer: &mut Tokenizer) { } else if event.name == Name::GfmTableHead { after_head_awaiting_first_body_row = true; last_table_end = index; - } else if event.name == Name::GfmTableRow || event.name == Name::GfmTableDelimiterRow { + } else if matches!(event.name, Name::GfmTableRow | Name::GfmTableDelimiterRow) { in_row = false; last_table_end = index; if last_cell.1 != 0 { @@ -878,9 +872,10 @@ pub fn resolve(tokenizer: &mut Tokenizer) { flush_cell(tokenizer, cell, in_delimiter_row, Some(index)); } } else if in_row - && (event.name == Name::Data - || event.name == Name::GfmTableDelimiterMarker - || event.name == Name::GfmTableDelimiterFiller) + && (matches!( + event.name, + Name::Data | Name::GfmTableDelimiterMarker | Name::GfmTableDelimiterFiller + )) { cell.3 = index; } @@ -891,6 +886,8 @@ pub fn resolve(tokenizer: &mut Tokenizer) { if last_table_end != 0 { flush_table_end(tokenizer, last_table_end, last_table_has_body); } + + Ok(None) } /// Generate a cell. diff --git a/src/construct/heading_atx.rs b/src/construct/heading_atx.rs index c1090c4..b76e455 100644 --- a/src/construct/heading_atx.rs +++ b/src/construct/heading_atx.rs @@ -66,9 +66,10 @@ use crate::construct::partial_space_or_tab::{space_or_tab, space_or_tab_min_max} use crate::event::{Content, Event, Kind, Link, Name}; use crate::resolve::Name as ResolveName; use crate::state::{Name as StateName, State}; +use crate::subtokenize::Subresult; use crate::tokenizer::Tokenizer; use crate::util::constant::{HEADING_ATX_OPENING_FENCE_SIZE_MAX, TAB_SIZE}; -use alloc::vec; +use alloc::{string::String, vec}; /// Start of a heading (atx). /// @@ -222,7 +223,7 @@ pub fn data(tokenizer: &mut Tokenizer) -> State { } /// Resolve heading (atx). -pub fn resolve(tokenizer: &mut Tokenizer) { +pub fn resolve(tokenizer: &mut Tokenizer) -> Result<Option<Subresult>, String> { let mut index = 0; let mut heading_inside = false; let mut data_start: Option<usize> = None; @@ -281,4 +282,6 @@ pub fn resolve(tokenizer: &mut Tokenizer) { index += 1; } + + Ok(None) } diff --git a/src/construct/heading_setext.rs b/src/construct/heading_setext.rs index e9cc759..3a484e1 100644 --- a/src/construct/heading_setext.rs +++ b/src/construct/heading_setext.rs @@ -54,6 +54,7 @@ //! * [`HeadingSetext`][Name::HeadingSetext] //! * [`HeadingSetextText`][Name::HeadingSetextText] //! * [`HeadingSetextUnderline`][Name::HeadingSetextUnderline] +//! * [`HeadingSetextUnderlineSequence`][Name::HeadingSetextUnderlineSequence] //! //! ## References //! @@ -70,12 +71,13 @@ //! [atx]: http://www.aaronsw.com/2002/atx/ use crate::construct::partial_space_or_tab::{space_or_tab, space_or_tab_min_max}; -use crate::event::{Kind, Name}; +use crate::event::{Content, Event, Kind, Link, Name}; use crate::resolve::Name as ResolveName; use crate::state::{Name as StateName, State}; +use crate::subtokenize::Subresult; use crate::tokenizer::Tokenizer; -use crate::util::{constant::TAB_SIZE, skip::opt_back as skip_opt_back}; -use alloc::vec; +use crate::util::{constant::TAB_SIZE, skip}; +use alloc::{string::String, vec}; /// At start of heading (setext) underline. /// @@ -90,14 +92,16 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { && !tokenizer.pierce // Require a paragraph before. && (!tokenizer.events.is_empty() - && tokenizer.events[skip_opt_back( + && tokenizer.events[skip::opt_back( &tokenizer.events, tokenizer.events.len() - 1, &[Name::LineEnding, Name::SpaceOrTab], )] .name - == Name::Paragraph) + == Name::Content) { + tokenizer.enter(Name::HeadingSetextUnderline); + if matches!(tokenizer.current, Some(b'\t' | b' ')) { tokenizer.attempt(State::Next(StateName::HeadingSetextBefore), State::Nok); State::Retry(space_or_tab_min_max( @@ -128,7 +132,7 @@ pub fn before(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { Some(b'-' | b'=') => { tokenizer.tokenize_state.marker = tokenizer.current.unwrap(); - tokenizer.enter(Name::HeadingSetextUnderline); + tokenizer.enter(Name::HeadingSetextUnderlineSequence); State::Retry(StateName::HeadingSetextInside) } _ => State::Nok, @@ -148,7 +152,7 @@ pub fn inside(tokenizer: &mut Tokenizer) -> State { State::Next(StateName::HeadingSetextInside) } else { tokenizer.tokenize_state.marker = 0; - tokenizer.exit(Name::HeadingSetextUnderline); + tokenizer.exit(Name::HeadingSetextUnderlineSequence); if matches!(tokenizer.current, Some(b'\t' | b' ')) { tokenizer.attempt(State::Next(StateName::HeadingSetextAfter), State::Nok); @@ -172,6 +176,7 @@ pub fn after(tokenizer: &mut Tokenizer) -> State { // Feel free to interrupt. tokenizer.interrupt = false; tokenizer.register_resolver(ResolveName::HeadingSetext); + tokenizer.exit(Name::HeadingSetextUnderline); State::Ok } _ => State::Nok, @@ -179,42 +184,102 @@ pub fn after(tokenizer: &mut Tokenizer) -> State { } /// Resolve heading (setext). -pub fn resolve(tokenizer: &mut Tokenizer) { - let mut index = 0; - let mut paragraph_enter = None; - let mut paragraph_exit = None; - - while index < tokenizer.events.len() { - let event = &tokenizer.events[index]; - - // Find paragraphs. - if event.kind == Kind::Enter { - if event.name == Name::Paragraph { - paragraph_enter = Some(index); - } - } else if event.name == Name::Paragraph { - paragraph_exit = Some(index); - } - // We know this is preceded by a paragraph. - // Otherwise we don’t parse. - else if event.name == Name::HeadingSetextUnderline { - let enter = paragraph_enter.take().unwrap(); - let exit = paragraph_exit.take().unwrap(); +pub fn resolve(tokenizer: &mut Tokenizer) -> Result<Option<Subresult>, String> { + tokenizer.map.consume(&mut tokenizer.events); + + let mut enter = skip::to(&tokenizer.events, 0, &[Name::HeadingSetextUnderline]); + + while enter < tokenizer.events.len() { + let exit = skip::to( + &tokenizer.events, + enter + 1, + &[Name::HeadingSetextUnderline], + ); + + // Find paragraph before + let paragraph_exit_before = skip::opt_back( + &tokenizer.events, + enter - 1, + &[Name::SpaceOrTab, Name::LineEnding, Name::BlockQuotePrefix], + ); + + // There’s a paragraph before: this is a setext heading. + if tokenizer.events[paragraph_exit_before].name == Name::Paragraph { + let paragraph_enter = skip::to_back( + &tokenizer.events, + paragraph_exit_before - 1, + &[Name::Paragraph], + ); // Change types of Enter:Paragraph, Exit:Paragraph. - tokenizer.events[enter].name = Name::HeadingSetextText; - tokenizer.events[exit].name = Name::HeadingSetextText; + tokenizer.events[paragraph_enter].name = Name::HeadingSetextText; + tokenizer.events[paragraph_exit_before].name = Name::HeadingSetextText; // Add Enter:HeadingSetext, Exit:HeadingSetext. - let mut heading_enter = tokenizer.events[enter].clone(); + let mut heading_enter = tokenizer.events[paragraph_enter].clone(); heading_enter.name = Name::HeadingSetext; - let mut heading_exit = tokenizer.events[index].clone(); + tokenizer.map.add(paragraph_enter, 0, vec![heading_enter]); + let mut heading_exit = tokenizer.events[exit].clone(); heading_exit.name = Name::HeadingSetext; - - tokenizer.map.add(enter, 0, vec![heading_enter]); - tokenizer.map.add(index + 1, 0, vec![heading_exit]); + tokenizer.map.add(exit + 1, 0, vec![heading_exit]); + } else { + // There’s a following paragraph, move this underline inside it. + if exit + 3 < tokenizer.events.len() + && tokenizer.events[exit + 1].name == Name::LineEnding + && tokenizer.events[exit + 3].name == Name::Paragraph + { + // Swap type, HeadingSetextUnderline:Enter -> Paragraph:Enter. + tokenizer.events[enter].name = Name::Paragraph; + // Swap type, LineEnding -> Data. + tokenizer.events[exit + 1].name = Name::Data; + tokenizer.events[exit + 2].name = Name::Data; + // Move new data (was line ending) back to include whole line, + // and link data together. + tokenizer.events[exit + 1].point = tokenizer.events[enter].point.clone(); + tokenizer.events[exit + 1].link = Some(Link { + previous: None, + next: Some(exit + 4), + content: Content::Text, + }); + tokenizer.events[exit + 4].link.as_mut().unwrap().previous = Some(exit + 1); + // Remove *including* HeadingSetextUnderline:Exit, until the line ending. + tokenizer.map.add(enter + 1, exit - enter, vec![]); + // Remove old Paragraph:Enter. + tokenizer.map.add(exit + 3, 1, vec![]); + } else { + // Swap type. + tokenizer.events[enter].name = Name::Paragraph; + tokenizer.events[exit].name = Name::Paragraph; + // Replace what’s inside the underline (whitespace, sequence). + tokenizer.map.add( + enter + 1, + exit - enter - 1, + vec![ + Event { + name: Name::Data, + kind: Kind::Enter, + point: tokenizer.events[enter].point.clone(), + link: Some(Link { + previous: None, + next: None, + content: Content::Text, + }), + }, + Event { + name: Name::Data, + kind: Kind::Exit, + point: tokenizer.events[exit].point.clone(), + link: None, + }, + ], + ); + } } - index += 1; + enter = skip::to(&tokenizer.events, exit + 1, &[Name::HeadingSetextUnderline]); } + + tokenizer.map.consume(&mut tokenizer.events); + + Ok(None) } diff --git a/src/construct/label_end.rs b/src/construct/label_end.rs index ce1c295..95b9a27 100644 --- a/src/construct/label_end.rs +++ b/src/construct/label_end.rs @@ -183,6 +183,7 @@ use crate::construct::partial_space_or_tab_eol::space_or_tab_eol; use crate::event::{Event, Kind, Name}; use crate::resolve::Name as ResolveName; use crate::state::{Name as StateName, State}; +use crate::subtokenize::Subresult; use crate::tokenizer::{Label, LabelKind, LabelStart, Tokenizer}; use crate::util::{ constant::RESOURCE_DESTINATION_BALANCE_MAX, @@ -660,7 +661,7 @@ pub fn reference_collapsed_open(tokenizer: &mut Tokenizer) -> State { /// /// This turns matching label starts and label ends into links, images, and /// footnotes, and turns unmatched label starts back into data. -pub fn resolve(tokenizer: &mut Tokenizer) { +pub fn resolve(tokenizer: &mut Tokenizer) -> Result<Option<Subresult>, String> { // Inject labels. let labels = tokenizer.tokenize_state.labels.split_off(0); inject_labels(tokenizer, &labels); @@ -671,6 +672,8 @@ pub fn resolve(tokenizer: &mut Tokenizer) { mark_as_data(tokenizer, &starts); tokenizer.map.consume(&mut tokenizer.events); + + Ok(None) } /// Inject links/images/footnotes. diff --git a/src/construct/list_item.rs b/src/construct/list_item.rs index 658c2c7..13b740b 100644 --- a/src/construct/list_item.rs +++ b/src/construct/list_item.rs @@ -62,13 +62,14 @@ use crate::construct::partial_space_or_tab::space_or_tab_min_max; use crate::event::{Kind, Name}; use crate::resolve::Name as ResolveName; use crate::state::{Name as StateName, State}; +use crate::subtokenize::Subresult; use crate::tokenizer::Tokenizer; use crate::util::{ constant::{LIST_ITEM_VALUE_SIZE_MAX, TAB_SIZE}, skip, slice::{Position, Slice}, }; -use alloc::{vec, vec::Vec}; +use alloc::{string::String, vec, vec::Vec}; /// Start of list item. /// @@ -370,7 +371,7 @@ pub fn cont_filled(tokenizer: &mut Tokenizer) -> State { } /// Find adjacent list items with the same marker. -pub fn resolve(tokenizer: &mut Tokenizer) { +pub fn resolve(tokenizer: &mut Tokenizer) -> Result<Option<Subresult>, String> { let mut lists_wip: Vec<(u8, usize, usize, usize)> = vec![]; let mut lists: Vec<(u8, usize, usize, usize)> = vec![]; let mut index = 0; @@ -472,4 +473,6 @@ pub fn resolve(tokenizer: &mut Tokenizer) { index += 1; } + + Ok(None) } diff --git a/src/construct/mod.rs b/src/construct/mod.rs index 1afa105..ae6facf 100644 --- a/src/construct/mod.rs +++ b/src/construct/mod.rs @@ -16,7 +16,7 @@ //! Content types also have a *rest* thing: after all things are parsed, //! there’s something left. //! In document, that is [flow][]. -//! In flow, that is a [paragraph][]. +//! In flow, that is [content][]. //! In string and text, that is [data][partial_data]. //! //! ## Construct @@ -37,6 +37,7 @@ //! * [character escape][character_escape] //! * [character reference][character_reference] //! * [code (indented)][code_indented] +//! * [content][] //! * [definition][] //! * [hard break (escape)][hard_break_escape] //! * [heading (atx)][heading_atx] @@ -149,6 +150,7 @@ pub mod block_quote; pub mod character_escape; pub mod character_reference; pub mod code_indented; +pub mod content; pub mod definition; pub mod document; pub mod flow; diff --git a/src/construct/paragraph.rs b/src/construct/paragraph.rs index c1e7311..78fbacb 100644 --- a/src/construct/paragraph.rs +++ b/src/construct/paragraph.rs @@ -1,4 +1,4 @@ -//! Paragraph occurs in the [flow][] content type. +//! Paragraph occurs in the [content][] content type. //! //! ## Grammar //! @@ -11,14 +11,15 @@ //! paragraph ::= 1*line *(eol 1*line) //! ``` //! -//! As this construct occurs in flow, like all flow constructs, it must be -//! followed by an eol (line ending) or eof (end of file). +//! This construct must be followed by an eol (line ending) or eof (end of +//! file), like flow constructs. //! //! Paragraphs can contain line endings and whitespace, but they are not //! allowed to contain blank lines, or to be blank themselves. //! //! The paragraph is interpreted as the [text][] content type. -//! That means that [autolinks][autolink], [code (text)][raw_text], etc are allowed. +//! That means that [autolinks][autolink], [code (text)][raw_text], etc are +//! allowed. //! //! ## HTML //! @@ -34,40 +35,57 @@ //! * [`content.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/content.js) //! * [*§ 4.8 Paragraphs* in `CommonMark`](https://spec.commonmark.org/0.30/#paragraphs) //! -//! [flow]: crate::construct::flow +//! [content]: crate::construct::content //! [text]: crate::construct::text //! [autolink]: crate::construct::autolink //! [raw_text]: crate::construct::raw_text //! [html]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-p-element -use crate::event::{Content, Kind, Link, Name}; -use crate::resolve::Name as ResolveName; +use crate::event::{Content, Link, Name}; use crate::state::{Name as StateName, State}; +use crate::subtokenize::link; use crate::tokenizer::Tokenizer; -use alloc::vec; -/// Before paragraph. +/// Paragraph start. /// /// ```markdown /// > | abc /// ^ +/// | def /// ``` pub fn start(tokenizer: &mut Tokenizer) -> State { - match tokenizer.current { - None | Some(b'\n') => unreachable!("unexpected eol/eof"), - _ => { - tokenizer.enter(Name::Paragraph); - tokenizer.enter_link( - Name::Data, - Link { - previous: None, - next: None, - content: Content::Text, - }, - ); - State::Retry(StateName::ParagraphInside) - } + debug_assert!(tokenizer.current.is_some()); + tokenizer.enter(Name::Paragraph); + State::Retry(StateName::ParagraphLineStart) +} + +/// Start of a line in a paragraph. +/// +/// ```markdown +/// > | abc +/// ^ +/// > | def +/// ^ +/// ``` +pub fn line_start(tokenizer: &mut Tokenizer) -> State { + debug_assert!(tokenizer.current.is_some()); + tokenizer.enter_link( + Name::Data, + Link { + previous: None, + next: None, + content: Content::Text, + }, + ); + + if tokenizer.tokenize_state.connect { + let index = tokenizer.events.len() - 1; + link(&mut tokenizer.events, index); + } else { + tokenizer.tokenize_state.connect = true; } + + State::Retry(StateName::ParagraphInside) } /// In paragraph. @@ -78,91 +96,20 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { /// ``` pub fn inside(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - None | Some(b'\n') => { + None => { + tokenizer.tokenize_state.connect = false; tokenizer.exit(Name::Data); tokenizer.exit(Name::Paragraph); - tokenizer.register_resolver_before(ResolveName::Paragraph); - // You’d be interrupting. - tokenizer.interrupt = true; State::Ok } + Some(b'\n') => { + tokenizer.consume(); + tokenizer.exit(Name::Data); + State::Next(StateName::ParagraphLineStart) + } _ => { tokenizer.consume(); State::Next(StateName::ParagraphInside) } } } - -/// Merge “`Paragraph`”s, which currently span a single line, into actual -/// `Paragraph`s that span multiple lines. -pub fn resolve(tokenizer: &mut Tokenizer) { - let mut index = 0; - - while index < tokenizer.events.len() { - let event = &tokenizer.events[index]; - - if event.kind == Kind::Enter && event.name == Name::Paragraph { - // Exit:Paragraph - let mut exit_index = index + 3; - - loop { - let mut enter_index = exit_index + 1; - - if enter_index == tokenizer.events.len() - || tokenizer.events[enter_index].name != Name::LineEnding - { - break; - } - - enter_index += 2; - - while enter_index < tokenizer.events.len() { - let event = &tokenizer.events[enter_index]; - - if event.name != Name::SpaceOrTab - && event.name != Name::BlockQuotePrefix - && event.name != Name::BlockQuoteMarker - { - break; - } - - enter_index += 1; - } - - if enter_index == tokenizer.events.len() - || tokenizer.events[enter_index].name != Name::Paragraph - { - break; - } - - // Remove Exit:Paragraph, Enter:LineEnding, Exit:LineEnding. - tokenizer.map.add(exit_index, 3, vec![]); - - // Remove Enter:Paragraph. - tokenizer.map.add(enter_index, 1, vec![]); - - // Add Exit:LineEnding position info to Exit:Data. - tokenizer.events[exit_index - 1].point = - tokenizer.events[exit_index + 2].point.clone(); - - // Link Enter:Data on the previous line to Enter:Data on this line. - if let Some(link) = &mut tokenizer.events[exit_index - 2].link { - link.next = Some(enter_index + 1); - } - if let Some(link) = &mut tokenizer.events[enter_index + 1].link { - link.previous = Some(exit_index - 2); - } - - // Potential next start. - exit_index = enter_index + 3; - } - - // Move to `Exit:Paragraph`. - index = exit_index; - } - - index += 1; - } - - tokenizer.map.consume(&mut tokenizer.events); -} diff --git a/src/construct/partial_data.rs b/src/construct/partial_data.rs index b6f1f47..b36d9f0 100644 --- a/src/construct/partial_data.rs +++ b/src/construct/partial_data.rs @@ -8,8 +8,9 @@ use crate::event::{Kind, Name}; use crate::state::{Name as StateName, State}; +use crate::subtokenize::Subresult; use crate::tokenizer::Tokenizer; -use alloc::vec; +use alloc::{string::String, vec}; /// At beginning of data. /// @@ -72,7 +73,7 @@ pub fn inside(tokenizer: &mut Tokenizer) -> State { } /// Merge adjacent data events. -pub fn resolve(tokenizer: &mut Tokenizer) { +pub fn resolve(tokenizer: &mut Tokenizer) -> Result<Option<Subresult>, String> { let mut index = 0; // Loop through events and merge adjacent data events. @@ -103,4 +104,6 @@ pub fn resolve(tokenizer: &mut Tokenizer) { index += 1; } + + Ok(None) } diff --git a/src/construct/string.rs b/src/construct/string.rs index dba1ac1..cf2f222 100644 --- a/src/construct/string.rs +++ b/src/construct/string.rs @@ -15,7 +15,9 @@ use crate::construct::partial_whitespace::resolve_whitespace; use crate::resolve::Name as ResolveName; use crate::state::{Name as StateName, State}; +use crate::subtokenize::Subresult; use crate::tokenizer::Tokenizer; +use alloc::string::String; /// Characters that can start something in string. const MARKERS: [u8; 2] = [b'&', b'\\']; @@ -74,6 +76,8 @@ pub fn before_data(tokenizer: &mut Tokenizer) -> State { } /// Resolve whitespace in string. -pub fn resolve(tokenizer: &mut Tokenizer) { +pub fn resolve(tokenizer: &mut Tokenizer) -> Result<Option<Subresult>, String> { resolve_whitespace(tokenizer, false, false); + + Ok(None) } diff --git a/src/construct/text.rs b/src/construct/text.rs index 34ea071..2648531 100644 --- a/src/construct/text.rs +++ b/src/construct/text.rs @@ -28,7 +28,9 @@ use crate::construct::gfm_autolink_literal::resolve as resolve_gfm_autolink_lite use crate::construct::partial_whitespace::resolve_whitespace; use crate::resolve::Name as ResolveName; use crate::state::{Name as StateName, State}; +use crate::subtokenize::Subresult; use crate::tokenizer::Tokenizer; +use alloc::string::String; /// Characters that can start something in text. const MARKERS: [u8; 16] = [ @@ -242,7 +244,7 @@ pub fn before_data(tokenizer: &mut Tokenizer) -> State { } /// Resolve whitespace. -pub fn resolve(tokenizer: &mut Tokenizer) { +pub fn resolve(tokenizer: &mut Tokenizer) -> Result<Option<Subresult>, String> { resolve_whitespace( tokenizer, tokenizer.parse_state.options.constructs.hard_break_trailing, @@ -257,4 +259,6 @@ pub fn resolve(tokenizer: &mut Tokenizer) { { resolve_gfm_autolink_literal(tokenizer); } + + Ok(None) } diff --git a/src/event.rs b/src/event.rs index de3f95f..a2626ee 100644 --- a/src/event.rs +++ b/src/event.rs @@ -554,6 +554,26 @@ pub enum Name { /// ^ ^ /// ``` CodeTextSequence, + /// Content. + /// + /// ## Info + /// + /// * **Context**: + /// [flow content][crate::construct::flow] + /// * **Content model**: + /// [content][crate::construct::content] + /// * **Construct**: + /// [`content`][crate::construct::content] + /// + /// ## Example + /// + /// ```markdown + /// > | [a]: b + /// ^^^^^^ + /// > | c. + /// ^^ + /// ``` + Content, /// Data. /// /// ## Info @@ -1754,7 +1774,8 @@ pub enum Name { /// * **Context**: /// [`HeadingSetext`][Name::HeadingSetext] /// * **Content model**: - /// void + /// [`HeadingSetextUnderlineSequence`][Name::HeadingSetextUnderlineSequence], + /// [`SpaceOrTab`][Name::SpaceOrTab] /// * **Construct**: /// [`heading_setext`][crate::construct::heading_setext] /// @@ -1766,6 +1787,25 @@ pub enum Name { /// ^^^^^ /// ``` HeadingSetextUnderline, + /// Heading (setext) underline sequence. + /// + /// ## Info + /// + /// * **Context**: + /// [`HeadingSetext`][Name::HeadingSetext] + /// * **Content model**: + /// void + /// * **Construct**: + /// [`heading_setext`][crate::construct::heading_setext] + /// + /// ## Example + /// + /// ```markdown + /// | alpha + /// > | ===== + /// ^^^^^ + /// ``` + HeadingSetextUnderlineSequence, /// Whole html (flow). /// /// ## Info @@ -2914,13 +2954,12 @@ pub enum Name { /// ^ /// ``` MdxJsxTagSelfClosingMarker, - - /// Whole paragraph. + /// Paragraph. /// /// ## Info /// /// * **Context**: - /// [flow content][crate::construct::flow] + /// [content][crate::construct::content] /// * **Content model**: /// [text content][crate::construct::text] /// * **Construct**: @@ -3340,7 +3379,7 @@ pub const VOID_EVENTS: [Name; 75] = [ Name::HardBreakEscape, Name::HardBreakTrailing, Name::HeadingAtxSequence, - Name::HeadingSetextUnderline, + Name::HeadingSetextUnderlineSequence, Name::HtmlFlowData, Name::HtmlTextData, Name::LabelImageMarker, @@ -3380,6 +3419,8 @@ pub const VOID_EVENTS: [Name; 75] = [ pub enum Content { /// Represents [flow content][crate::construct::flow]. Flow, + /// Represents [content][crate::construct::content]. + Content, /// Represents [string content][crate::construct::string]. String, /// Represents [text content][crate::construct::text]. diff --git a/src/parser.rs b/src/parser.rs index 3a7713a..c69eb38 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -49,16 +49,25 @@ pub fn parse<'a>(value: &'a str, options: &'a Options) -> Result<(Vec<Event>, &' (parse_state.bytes.len(), 0), State::Next(StateName::DocumentStart), ); - tokenizer.flush(state, true)?; - + let mut result = tokenizer.flush(state, true)?; let mut events = tokenizer.events; - let footnote = tokenizer.tokenize_state.gfm_footnote_definitions; - let normal = tokenizer.tokenize_state.definitions; - parse_state.gfm_footnote_definitions = footnote; - parse_state.definitions = normal; + parse_state + .gfm_footnote_definitions + .append(&mut result.gfm_footnote_definitions); + parse_state.definitions.append(&mut result.definitions); + + loop { + let mut result = subtokenize(&mut events, &parse_state, &None)?; + parse_state + .gfm_footnote_definitions + .append(&mut result.gfm_footnote_definitions); + parse_state.definitions.append(&mut result.definitions); - while !(subtokenize(&mut events, &parse_state)?) {} + if result.done { + break; + } + } Ok((events, parse_state.bytes)) } diff --git a/src/resolve.rs b/src/resolve.rs index d015213..2586676 100644 --- a/src/resolve.rs +++ b/src/resolve.rs @@ -1,7 +1,9 @@ //! Resolve events. use crate::construct; +use crate::subtokenize::Subresult; use crate::tokenizer::Tokenizer; +use alloc::string::String; /// Names of resolvers. #[derive(Clone, Copy, Debug, Eq, PartialEq)] @@ -32,8 +34,8 @@ pub enum Name { HeadingAtx, /// Resolve heading (setext). /// - /// Heading (setext) is parsed as an underline that is preceded by a - /// paragraph, both will form the whole construct. + /// Heading (setext) is parsed as an underline that is preceded by content, + /// both will form the whole construct. HeadingSetext, /// Resolve list item. /// @@ -41,12 +43,12 @@ pub enum Name { /// They are wrapped into ordered or unordered lists based on whether items /// with the same marker occur next to each other. ListItem, - /// Resolve paragraphs. + /// Resolve content. /// - /// Paragraphs are parsed as single line paragraphs, as what remains if - /// other flow constructs don’t match. + /// Content is parsed as single lines, as what remains if other flow + /// constructs don’t match. /// But, when they occur next to each other, they need to be merged. - Paragraph, + Content, /// Resolve data. /// /// Data is parsed as many small bits, due to many punctuation characters @@ -61,7 +63,7 @@ pub enum Name { } /// Call the corresponding resolver. -pub fn call(tokenizer: &mut Tokenizer, name: Name) { +pub fn call(tokenizer: &mut Tokenizer, name: Name) -> Result<Option<Subresult>, String> { let func = match name { Name::Label => construct::label_end::resolve, Name::Attention => construct::attention::resolve, @@ -69,11 +71,11 @@ pub fn call(tokenizer: &mut Tokenizer, name: Name) { Name::HeadingAtx => construct::heading_atx::resolve, Name::HeadingSetext => construct::heading_setext::resolve, Name::ListItem => construct::list_item::resolve, - Name::Paragraph => construct::paragraph::resolve, + Name::Content => construct::content::resolve, Name::Data => construct::partial_data::resolve, Name::String => construct::string::resolve, Name::Text => construct::text::resolve, }; - func(tokenizer); + func(tokenizer) } diff --git a/src/state.rs b/src/state.rs index 1d15239..896761e 100644 --- a/src/state.rs +++ b/src/state.rs @@ -75,24 +75,6 @@ pub enum Name { CharacterReferenceNumeric, CharacterReferenceValue, - RawFlowStart, - RawFlowBeforeSequenceOpen, - RawFlowSequenceOpen, - RawFlowInfoBefore, - RawFlowInfo, - RawFlowMetaBefore, - RawFlowMeta, - RawFlowAtNonLazyBreak, - RawFlowCloseStart, - RawFlowBeforeSequenceClose, - RawFlowSequenceClose, - RawFlowAfterSequenceClose, - RawFlowContentBefore, - RawFlowContentStart, - RawFlowBeforeContentChunk, - RawFlowContentChunk, - RawFlowAfter, - CodeIndentedStart, CodeIndentedAtBreak, CodeIndentedAfter, @@ -101,11 +83,10 @@ pub enum Name { CodeIndentedFurtherBegin, CodeIndentedFurtherAfter, - RawTextStart, - RawTextSequenceOpen, - RawTextBetween, - RawTextData, - RawTextSequenceClose, + ContentChunkStart, + ContentChunkInside, + ContentDefinitionBefore, + ContentDefinitionAfter, DataStart, DataInside, @@ -114,6 +95,7 @@ pub enum Name { DefinitionStart, DefinitionBefore, DefinitionLabelAfter, + DefinitionLabelNok, DefinitionMarkerAfter, DefinitionDestinationBefore, DefinitionDestinationAfter, @@ -155,11 +137,10 @@ pub enum Name { FlowBeforeHeadingAtx, FlowBeforeHeadingSetext, FlowBeforeThematicBreak, - FlowBeforeDefinition, FlowAfter, FlowBlankLineBefore, FlowBlankLineAfter, - FlowBeforeParagraph, + FlowBeforeContent, FrontmatterStart, FrontmatterOpenSequence, @@ -363,6 +344,21 @@ pub enum Name { ListItemContBlank, ListItemContFilled, + MdxExpressionTextStart, + MdxExpressionTextAfter, + + MdxExpressionFlowStart, + MdxExpressionFlowBefore, + MdxExpressionFlowAfter, + MdxExpressionFlowEnd, + + MdxExpressionStart, + MdxExpressionBefore, + MdxExpressionInside, + MdxExpressionEolAfter, + MdxJsxAttributeValueExpressionAfter, + MdxJsxAttributeExpressionAfter, + MdxJsxFlowStart, MdxJsxFlowBefore, MdxJsxFlowAfter, @@ -402,8 +398,33 @@ pub enum Name { NonLazyContinuationAfter, ParagraphStart, + ParagraphLineStart, ParagraphInside, + RawFlowStart, + RawFlowBeforeSequenceOpen, + RawFlowSequenceOpen, + RawFlowInfoBefore, + RawFlowInfo, + RawFlowMetaBefore, + RawFlowMeta, + RawFlowAtNonLazyBreak, + RawFlowCloseStart, + RawFlowBeforeSequenceClose, + RawFlowSequenceClose, + RawFlowAfterSequenceClose, + RawFlowContentBefore, + RawFlowContentStart, + RawFlowBeforeContentChunk, + RawFlowContentChunk, + RawFlowAfter, + + RawTextStart, + RawTextSequenceOpen, + RawTextBetween, + RawTextData, + RawTextSequenceClose, + SpaceOrTabStart, SpaceOrTabInside, SpaceOrTabAfter, @@ -438,47 +459,12 @@ pub enum Name { TitleAtBlankLine, TitleEscape, TitleInside, - - MdxExpressionTextStart, - MdxExpressionTextAfter, - - MdxExpressionFlowStart, - MdxExpressionFlowBefore, - MdxExpressionFlowAfter, - MdxExpressionFlowEnd, - - MdxExpressionStart, - MdxExpressionBefore, - MdxExpressionInside, - MdxExpressionEolAfter, - MdxJsxAttributeValueExpressionAfter, - MdxJsxAttributeExpressionAfter, } #[allow(clippy::too_many_lines)] /// Call the corresponding state for a state name. pub fn call(tokenizer: &mut Tokenizer, name: Name) -> State { let func = match name { - Name::MdxExpressionTextStart => construct::mdx_expression_text::start, - Name::MdxExpressionTextAfter => construct::mdx_expression_text::after, - - Name::MdxExpressionFlowStart => construct::mdx_expression_flow::start, - Name::MdxExpressionFlowBefore => construct::mdx_expression_flow::before, - Name::MdxExpressionFlowAfter => construct::mdx_expression_flow::after, - Name::MdxExpressionFlowEnd => construct::mdx_expression_flow::end, - - Name::MdxExpressionStart => construct::partial_mdx_expression::start, - Name::MdxExpressionBefore => construct::partial_mdx_expression::before, - Name::MdxExpressionInside => construct::partial_mdx_expression::inside, - Name::MdxExpressionEolAfter => construct::partial_mdx_expression::eol_after, - - Name::MdxJsxAttributeValueExpressionAfter => { - construct::partial_mdx_jsx::attribute_value_expression_after - } - Name::MdxJsxAttributeExpressionAfter => { - construct::partial_mdx_jsx::attribute_expression_after - } - Name::AttentionStart => construct::attention::start, Name::AttentionInside => construct::attention::inside, @@ -511,24 +497,6 @@ pub fn call(tokenizer: &mut Tokenizer, name: Name) -> State { Name::CharacterReferenceNumeric => construct::character_reference::numeric, Name::CharacterReferenceValue => construct::character_reference::value, - Name::RawFlowStart => construct::raw_flow::start, - Name::RawFlowBeforeSequenceOpen => construct::raw_flow::before_sequence_open, - Name::RawFlowSequenceOpen => construct::raw_flow::sequence_open, - Name::RawFlowInfoBefore => construct::raw_flow::info_before, - Name::RawFlowInfo => construct::raw_flow::info, - Name::RawFlowMetaBefore => construct::raw_flow::meta_before, - Name::RawFlowMeta => construct::raw_flow::meta, - Name::RawFlowAtNonLazyBreak => construct::raw_flow::at_non_lazy_break, - Name::RawFlowCloseStart => construct::raw_flow::close_start, - Name::RawFlowBeforeSequenceClose => construct::raw_flow::before_sequence_close, - Name::RawFlowSequenceClose => construct::raw_flow::sequence_close, - Name::RawFlowAfterSequenceClose => construct::raw_flow::sequence_close_after, - Name::RawFlowContentBefore => construct::raw_flow::content_before, - Name::RawFlowContentStart => construct::raw_flow::content_start, - Name::RawFlowBeforeContentChunk => construct::raw_flow::before_content_chunk, - Name::RawFlowContentChunk => construct::raw_flow::content_chunk, - Name::RawFlowAfter => construct::raw_flow::after, - Name::CodeIndentedStart => construct::code_indented::start, Name::CodeIndentedAtBreak => construct::code_indented::at_break, Name::CodeIndentedAfter => construct::code_indented::after, @@ -537,11 +505,10 @@ pub fn call(tokenizer: &mut Tokenizer, name: Name) -> State { Name::CodeIndentedFurtherBegin => construct::code_indented::further_begin, Name::CodeIndentedFurtherAfter => construct::code_indented::further_after, - Name::RawTextStart => construct::raw_text::start, - Name::RawTextSequenceOpen => construct::raw_text::sequence_open, - Name::RawTextBetween => construct::raw_text::between, - Name::RawTextData => construct::raw_text::data, - Name::RawTextSequenceClose => construct::raw_text::sequence_close, + Name::ContentChunkStart => construct::content::chunk_start, + Name::ContentChunkInside => construct::content::chunk_inside, + Name::ContentDefinitionBefore => construct::content::definition_before, + Name::ContentDefinitionAfter => construct::content::definition_after, Name::DataStart => construct::partial_data::start, Name::DataInside => construct::partial_data::inside, @@ -550,6 +517,7 @@ pub fn call(tokenizer: &mut Tokenizer, name: Name) -> State { Name::DefinitionStart => construct::definition::start, Name::DefinitionBefore => construct::definition::before, Name::DefinitionLabelAfter => construct::definition::label_after, + Name::DefinitionLabelNok => construct::definition::label_nok, Name::DefinitionMarkerAfter => construct::definition::marker_after, Name::DefinitionDestinationBefore => construct::definition::destination_before, Name::DefinitionDestinationAfter => construct::definition::destination_after, @@ -599,11 +567,10 @@ pub fn call(tokenizer: &mut Tokenizer, name: Name) -> State { Name::FlowBeforeHeadingAtx => construct::flow::before_heading_atx, Name::FlowBeforeHeadingSetext => construct::flow::before_heading_setext, Name::FlowBeforeThematicBreak => construct::flow::before_thematic_break, - Name::FlowBeforeDefinition => construct::flow::before_definition, Name::FlowAfter => construct::flow::after, Name::FlowBlankLineBefore => construct::flow::blank_line_before, Name::FlowBlankLineAfter => construct::flow::blank_line_after, - Name::FlowBeforeParagraph => construct::flow::before_paragraph, + Name::FlowBeforeContent => construct::flow::before_content, Name::FrontmatterStart => construct::frontmatter::start, Name::FrontmatterOpenSequence => construct::frontmatter::open_sequence, @@ -624,7 +591,6 @@ pub fn call(tokenizer: &mut Tokenizer, name: Name) -> State { Name::GfmAutolinkLiteralProtocolSlashesInside => { construct::gfm_autolink_literal::protocol_slashes_inside } - Name::GfmAutolinkLiteralWwwAfter => construct::gfm_autolink_literal::www_after, Name::GfmAutolinkLiteralWwwStart => construct::gfm_autolink_literal::www_start, Name::GfmAutolinkLiteralWwwPrefixInside => { @@ -636,7 +602,6 @@ pub fn call(tokenizer: &mut Tokenizer, name: Name) -> State { construct::gfm_autolink_literal::domain_at_punctuation } Name::GfmAutolinkLiteralDomainAfter => construct::gfm_autolink_literal::domain_after, - Name::GfmAutolinkLiteralPathInside => construct::gfm_autolink_literal::path_inside, Name::GfmAutolinkLiteralPathAtPunctuation => { construct::gfm_autolink_literal::path_at_punctuation @@ -671,21 +636,12 @@ pub fn call(tokenizer: &mut Tokenizer, name: Name) -> State { Name::GfmLabelStartFootnoteStart => construct::gfm_label_start_footnote::start, Name::GfmLabelStartFootnoteOpen => construct::gfm_label_start_footnote::open, - Name::GfmTaskListItemCheckStart => construct::gfm_task_list_item_check::start, - Name::GfmTaskListItemCheckInside => construct::gfm_task_list_item_check::inside, - Name::GfmTaskListItemCheckClose => construct::gfm_task_list_item_check::close, - Name::GfmTaskListItemCheckAfter => construct::gfm_task_list_item_check::after, - Name::GfmTaskListItemCheckAfterSpaceOrTab => { - construct::gfm_task_list_item_check::after_space_or_tab - } - Name::GfmTableStart => construct::gfm_table::start, Name::GfmTableHeadRowBefore => construct::gfm_table::head_row_before, Name::GfmTableHeadRowStart => construct::gfm_table::head_row_start, Name::GfmTableHeadRowBreak => construct::gfm_table::head_row_break, Name::GfmTableHeadRowData => construct::gfm_table::head_row_data, Name::GfmTableHeadRowEscape => construct::gfm_table::head_row_escape, - Name::GfmTableHeadDelimiterStart => construct::gfm_table::head_delimiter_start, Name::GfmTableHeadDelimiterBefore => construct::gfm_table::head_delimiter_before, Name::GfmTableHeadDelimiterCellBefore => construct::gfm_table::head_delimiter_cell_before, @@ -699,13 +655,20 @@ pub fn call(tokenizer: &mut Tokenizer, name: Name) -> State { } Name::GfmTableHeadDelimiterCellAfter => construct::gfm_table::head_delimiter_cell_after, Name::GfmTableHeadDelimiterNok => construct::gfm_table::head_delimiter_nok, - Name::GfmTableBodyRowBefore => construct::gfm_table::body_row_before, Name::GfmTableBodyRowStart => construct::gfm_table::body_row_start, Name::GfmTableBodyRowBreak => construct::gfm_table::body_row_break, Name::GfmTableBodyRowData => construct::gfm_table::body_row_data, Name::GfmTableBodyRowEscape => construct::gfm_table::body_row_escape, + Name::GfmTaskListItemCheckStart => construct::gfm_task_list_item_check::start, + Name::GfmTaskListItemCheckInside => construct::gfm_task_list_item_check::inside, + Name::GfmTaskListItemCheckClose => construct::gfm_task_list_item_check::close, + Name::GfmTaskListItemCheckAfter => construct::gfm_task_list_item_check::after, + Name::GfmTaskListItemCheckAfterSpaceOrTab => { + construct::gfm_task_list_item_check::after_space_or_tab + } + Name::HardBreakEscapeStart => construct::hard_break_escape::start, Name::HardBreakEscapeAfter => construct::hard_break_escape::after, @@ -859,11 +822,25 @@ pub fn call(tokenizer: &mut Tokenizer, name: Name) -> State { Name::ListItemContBlank => construct::list_item::cont_blank, Name::ListItemContFilled => construct::list_item::cont_filled, + Name::MdxExpressionStart => construct::partial_mdx_expression::start, + Name::MdxExpressionBefore => construct::partial_mdx_expression::before, + Name::MdxExpressionInside => construct::partial_mdx_expression::inside, + Name::MdxExpressionEolAfter => construct::partial_mdx_expression::eol_after, + + Name::MdxExpressionFlowStart => construct::mdx_expression_flow::start, + Name::MdxExpressionFlowBefore => construct::mdx_expression_flow::before, + Name::MdxExpressionFlowAfter => construct::mdx_expression_flow::after, + Name::MdxExpressionFlowEnd => construct::mdx_expression_flow::end, + + Name::MdxExpressionTextStart => construct::mdx_expression_text::start, + Name::MdxExpressionTextAfter => construct::mdx_expression_text::after, + Name::MdxJsxFlowStart => construct::mdx_jsx_flow::start, Name::MdxJsxFlowBefore => construct::mdx_jsx_flow::before, Name::MdxJsxFlowAfter => construct::mdx_jsx_flow::after, Name::MdxJsxFlowEnd => construct::mdx_jsx_flow::end, Name::MdxJsxFlowNok => construct::mdx_jsx_flow::nok, + Name::MdxJsxTextStart => construct::mdx_jsx_text::start, Name::MdxJsxTextAfter => construct::mdx_jsx_text::after, Name::MdxJsxTextNok => construct::mdx_jsx_text::nok, @@ -883,6 +860,9 @@ pub fn call(tokenizer: &mut Tokenizer, name: Name) -> State { Name::MdxJsxLocalNameAfter => construct::partial_mdx_jsx::local_name_after, Name::MdxJsxAttributeBefore => construct::partial_mdx_jsx::attribute_before, Name::MdxJsxSelfClosing => construct::partial_mdx_jsx::self_closing, + Name::MdxJsxAttributeExpressionAfter => { + construct::partial_mdx_jsx::attribute_expression_after + } Name::MdxJsxAttributePrimaryName => construct::partial_mdx_jsx::attribute_primary_name, Name::MdxJsxAttributePrimaryNameAfter => { construct::partial_mdx_jsx::attribute_primary_name_after @@ -899,6 +879,9 @@ pub fn call(tokenizer: &mut Tokenizer, name: Name) -> State { construct::partial_mdx_jsx::attribute_value_quoted_start } Name::MdxJsxAttributeValueQuoted => construct::partial_mdx_jsx::attribute_value_quoted, + Name::MdxJsxAttributeValueExpressionAfter => { + construct::partial_mdx_jsx::attribute_value_expression_after + } Name::MdxJsxEsWhitespaceStart => construct::partial_mdx_jsx::es_whitespace_start, Name::MdxJsxEsWhitespaceInside => construct::partial_mdx_jsx::es_whitespace_inside, Name::MdxJsxEsWhitespaceEolAfter => construct::partial_mdx_jsx::es_whitespace_eol_after, @@ -907,8 +890,33 @@ pub fn call(tokenizer: &mut Tokenizer, name: Name) -> State { Name::NonLazyContinuationAfter => construct::partial_non_lazy_continuation::after, Name::ParagraphStart => construct::paragraph::start, + Name::ParagraphLineStart => construct::paragraph::line_start, Name::ParagraphInside => construct::paragraph::inside, + Name::RawFlowStart => construct::raw_flow::start, + Name::RawFlowBeforeSequenceOpen => construct::raw_flow::before_sequence_open, + Name::RawFlowSequenceOpen => construct::raw_flow::sequence_open, + Name::RawFlowInfoBefore => construct::raw_flow::info_before, + Name::RawFlowInfo => construct::raw_flow::info, + Name::RawFlowMetaBefore => construct::raw_flow::meta_before, + Name::RawFlowMeta => construct::raw_flow::meta, + Name::RawFlowAtNonLazyBreak => construct::raw_flow::at_non_lazy_break, + Name::RawFlowCloseStart => construct::raw_flow::close_start, + Name::RawFlowBeforeSequenceClose => construct::raw_flow::before_sequence_close, + Name::RawFlowSequenceClose => construct::raw_flow::sequence_close, + Name::RawFlowAfterSequenceClose => construct::raw_flow::sequence_close_after, + Name::RawFlowContentBefore => construct::raw_flow::content_before, + Name::RawFlowContentStart => construct::raw_flow::content_start, + Name::RawFlowBeforeContentChunk => construct::raw_flow::before_content_chunk, + Name::RawFlowContentChunk => construct::raw_flow::content_chunk, + Name::RawFlowAfter => construct::raw_flow::after, + + Name::RawTextStart => construct::raw_text::start, + Name::RawTextSequenceOpen => construct::raw_text::sequence_open, + Name::RawTextBetween => construct::raw_text::between, + Name::RawTextData => construct::raw_text::data, + Name::RawTextSequenceClose => construct::raw_text::sequence_close, + Name::SpaceOrTabStart => construct::partial_space_or_tab::start, Name::SpaceOrTabInside => construct::partial_space_or_tab::inside, Name::SpaceOrTabAfter => construct::partial_space_or_tab::after, diff --git a/src/subtokenize.rs b/src/subtokenize.rs index 12f91cf..5bb7e98 100644 --- a/src/subtokenize.rs +++ b/src/subtokenize.rs @@ -24,6 +24,13 @@ use crate::tokenizer::Tokenizer; use crate::util::{edit_map::EditMap, skip}; use alloc::{string::String, vec, vec::Vec}; +#[derive(Debug)] +pub struct Subresult { + pub done: bool, + pub gfm_footnote_definitions: Vec<String>, + pub definitions: Vec<String>, +} + /// Link two [`Event`][]s. /// /// Arbitrary (void) events can be linked together. @@ -69,10 +76,19 @@ pub fn link_to(events: &mut [Event], previous: usize, next: usize) { /// Parse linked events. /// /// Supposed to be called repeatedly, returns `true` when done. -pub fn subtokenize(events: &mut Vec<Event>, parse_state: &ParseState) -> Result<bool, String> { +pub fn subtokenize( + events: &mut Vec<Event>, + parse_state: &ParseState, + filter: &Option<Content>, +) -> Result<Subresult, String> { let mut map = EditMap::new(); - let mut done = true; let mut index = 0; + let mut value = Subresult { + done: true, + gfm_footnote_definitions: vec![], + definitions: vec![], + }; + let mut acc = (0, 0); while index < events.len() { let event = &events[index]; @@ -82,16 +98,19 @@ pub fn subtokenize(events: &mut Vec<Event>, parse_state: &ParseState) -> Result< debug_assert_eq!(event.kind, Kind::Enter); // No need to enter linked events again. - if link.previous == None { + if link.previous == None + && (filter.is_none() || &link.content == filter.as_ref().unwrap()) + { // Index into `events` pointing to a chunk. let mut link_index = Some(index); // Subtokenizer. let mut tokenizer = Tokenizer::new(event.point.clone(), parse_state); // Substate. - let mut state = State::Next(if link.content == Content::String { - StateName::StringStart - } else { - StateName::TextStart + let mut state = State::Next(match link.content { + Content::Flow => unreachable!("flow subcontent not implemented yet"), + Content::Content => StateName::ContentDefinitionBefore, + Content::String => StateName::StringStart, + Content::Text => StateName::TextStart, }); // Check if this is the first paragraph, after zero or more @@ -143,11 +162,14 @@ pub fn subtokenize(events: &mut Vec<Event>, parse_state: &ParseState) -> Result< link_index = link_curr.next; } - tokenizer.flush(state, true)?; + let mut result = tokenizer.flush(state, true)?; + value + .gfm_footnote_definitions + .append(&mut result.gfm_footnote_definitions); + value.definitions.append(&mut result.definitions); + value.done = false; - divide_events(&mut map, events, index, &mut tokenizer.events); - - done = false; + acc = divide_events(&mut map, events, index, &mut tokenizer.events, acc); } } @@ -156,7 +178,7 @@ pub fn subtokenize(events: &mut Vec<Event>, parse_state: &ParseState) -> Result< map.consume(events); - Ok(done) + Ok(value) } /// Divide `child_events` over links in `events`, the first of which is at @@ -166,15 +188,17 @@ pub fn divide_events( events: &[Event], mut link_index: usize, child_events: &mut Vec<Event>, -) { + acc_before: (usize, usize), +) -> (usize, usize) { // Loop through `child_events` to figure out which parts belong where and // fix deep links. let mut child_index = 0; let mut slices = vec![]; let mut slice_start = 0; let mut old_prev: Option<usize> = None; + let len = child_events.len(); - while child_index < child_events.len() { + while child_index < len { let current = &child_events[child_index].point; let end = &events[link_index + 1].point; @@ -200,7 +224,8 @@ pub fn divide_events( } else { old_prev + link_index - (slices.len() - 1) * 2 }; - prev_event.link.as_mut().unwrap().next = Some(new_link); + prev_event.link.as_mut().unwrap().next = + Some(new_link + acc_before.1 - acc_before.0); } } @@ -219,7 +244,9 @@ pub fn divide_events( // The `index` in `events` where the current link is, // minus 2 events (the enter and exit) for each removed // link. - .map(|previous| previous + link_index - (slices.len() * 2)); + .map(|previous| { + previous + link_index - (slices.len() * 2) + acc_before.1 - acc_before.0 + }); } } @@ -245,4 +272,6 @@ pub fn divide_events( child_events.split_off(slices[index].1), ); } + + (acc_before.0 + (slices.len() * 2), acc_before.1 + len) } diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 5095abb..8441f7e 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -12,6 +12,7 @@ use crate::event::{Content, Event, Kind, Link, Name, Point, VOID_EVENTS}; use crate::parser::ParseState; use crate::resolve::{call as call_resolve, Name as ResolveName}; use crate::state::{call, State}; +use crate::subtokenize::Subresult; use crate::util::{char::format_byte_opt, constant::TAB_SIZE, edit_map::EditMap}; use alloc::{boxed::Box, string::String, vec, vec::Vec}; @@ -609,23 +610,35 @@ impl<'a> Tokenizer<'a> { } /// Flush. - pub fn flush(&mut self, state: State, resolve: bool) -> Result<(), String> { + pub fn flush(&mut self, state: State, resolve: bool) -> Result<Subresult, String> { let to = (self.point.index, self.point.vs); let state = push_impl(self, to, to, state, true); - let result = state.to_result(); - if resolve && result.is_ok() { + state.to_result()?; + + let mut value = Subresult { + done: false, + gfm_footnote_definitions: self.tokenize_state.gfm_footnote_definitions.split_off(0), + definitions: self.tokenize_state.definitions.split_off(0), + }; + + if resolve { let resolvers = self.resolvers.split_off(0); let mut index = 0; while index < resolvers.len() { - call_resolve(self, resolvers[index]); + if let Some(mut result) = call_resolve(self, resolvers[index])? { + value + .gfm_footnote_definitions + .append(&mut result.gfm_footnote_definitions); + value.definitions.append(&mut result.definitions); + } index += 1; } self.map.consume(&mut self.events); } - result + Ok(value) } } |