From ee967aa634b5f8e9d30329d587538f1371a5da95 Mon Sep 17 00:00:00 2001 From: Titus Wormer Date: Mon, 15 Aug 2022 11:40:40 +0200 Subject: Refactor to move `content` to `construct` --- readme.md | 1 - src/construct/attention.rs | 2 +- src/construct/autolink.rs | 2 +- src/construct/blank_line.rs | 2 +- src/construct/block_quote.rs | 2 +- src/construct/character_escape.rs | 4 +- src/construct/character_reference.rs | 4 +- src/construct/code_fenced.rs | 6 +- src/construct/code_indented.rs | 4 +- src/construct/code_text.rs | 4 +- src/construct/definition.rs | 4 +- src/construct/document.rs | 492 +++++++++++++++++++++++++ src/construct/flow.rs | 254 +++++++++++++ src/construct/hard_break_escape.rs | 2 +- src/construct/heading_atx.rs | 2 +- src/construct/heading_setext.rs | 2 +- src/construct/html_flow.rs | 2 +- src/construct/html_text.rs | 2 +- src/construct/label_end.rs | 4 +- src/construct/label_start_image.rs | 2 +- src/construct/label_start_link.rs | 2 +- src/construct/list_item.rs | 2 +- src/construct/mod.rs | 4 + src/construct/paragraph.rs | 4 +- src/construct/partial_data.rs | 4 +- src/construct/partial_destination.rs | 2 +- src/construct/partial_label.rs | 2 +- src/construct/partial_non_lazy_continuation.rs | 2 +- src/construct/partial_title.rs | 2 +- src/construct/partial_whitespace.rs | 4 +- src/construct/string.rs | 76 ++++ src/construct/text.rs | 173 +++++++++ src/construct/thematic_break.rs | 2 +- src/content/document.rs | 492 ------------------------- src/content/flow.rs | 254 ------------- src/content/mod.rs | 11 - src/content/string.rs | 76 ---- src/content/text.rs | 173 --------- src/event.rs | 94 ++--- src/lib.rs | 1 - src/resolve.rs | 5 +- src/state.rs | 65 ++-- src/tokenizer.rs | 10 +- 43 files changed, 1124 insertions(+), 1133 deletions(-) create mode 100644 src/construct/document.rs create mode 100644 src/construct/flow.rs create mode 100644 src/construct/string.rs create mode 100644 src/construct/text.rs delete mode 100644 src/content/document.rs 
delete mode 100644 src/content/flow.rs delete mode 100644 src/content/mod.rs delete mode 100644 src/content/string.rs delete mode 100644 src/content/text.rs diff --git a/readme.md b/readme.md index 7360e3c..5379089 100644 --- a/readme.md +++ b/readme.md @@ -14,7 +14,6 @@ Crate docs are currently at ### Refactor -- [ ] (1) Move `content` to `construct` - [ ] (1) Improve `interrupt`, `concrete`, `lazy` fields somehow? - [ ] (?) Remove last box: the one around the child tokenizer? - [ ] (1) Add helper to get byte at, get char before/after, etc. diff --git a/src/construct/attention.rs b/src/construct/attention.rs index ae8da81..1dc8868 100644 --- a/src/construct/attention.rs +++ b/src/construct/attention.rs @@ -47,7 +47,7 @@ //! * [`attention.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/attention.js) //! * [*§ 6.2 Emphasis and strong emphasis* in `CommonMark`](https://spec.commonmark.org/0.30/#emphasis-and-strong-emphasis) //! -//! [text]: crate::content::text +//! [text]: crate::construct::text //! [html-em]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-em-element //! [html-strong]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-strong-element diff --git a/src/construct/autolink.rs b/src/construct/autolink.rs index 5c826a3..37e21d9 100644 --- a/src/construct/autolink.rs +++ b/src/construct/autolink.rs @@ -94,7 +94,7 @@ //! * [`autolink.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/autolink.js) //! * [*§ 6.4 Autolinks* in `CommonMark`](https://spec.commonmark.org/0.30/#autolinks) //! -//! [text]: crate::content::text +//! [text]: crate::construct::text //! [label_end]: crate::construct::label_end //! [autolink_scheme_size_max]: crate::constant::AUTOLINK_SCHEME_SIZE_MAX //! 
[autolink_domain_size_max]: crate::constant::AUTOLINK_DOMAIN_SIZE_MAX diff --git a/src/construct/blank_line.rs b/src/construct/blank_line.rs index 87d257d..928b8cc 100644 --- a/src/construct/blank_line.rs +++ b/src/construct/blank_line.rs @@ -30,7 +30,7 @@ //! [heading-atx]: crate::construct::heading_atx //! [list-item]: crate::construct::list_item //! [paragraph]: crate::construct::paragraph -//! [flow]: crate::content::flow +//! [flow]: crate::construct::flow use crate::construct::partial_space_or_tab::space_or_tab; use crate::state::{Name as StateName, State}; diff --git a/src/construct/block_quote.rs b/src/construct/block_quote.rs index 4f0870f..37726c5 100644 --- a/src/construct/block_quote.rs +++ b/src/construct/block_quote.rs @@ -29,7 +29,7 @@ //! * [`block-quote.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/block-quote.js) //! * [*§ 5.1 Block quotes* in `CommonMark`](https://spec.commonmark.org/0.30/#block-quotes) //! -//! [document]: crate::content::document +//! [document]: crate::construct::document //! [html-blockquote]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-blockquote-element //! [commonmark-block]: https://spec.commonmark.org/0.30/#phase-1-block-structure diff --git a/src/construct/character_escape.rs b/src/construct/character_escape.rs index ac91c29..6dac458 100644 --- a/src/construct/character_escape.rs +++ b/src/construct/character_escape.rs @@ -28,8 +28,8 @@ //! * [`character-escape.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/character-escape.js) //! * [*§ 2.4 Backslash escapes* in `CommonMark`](https://spec.commonmark.org/0.30/#backslash-escapes) //! -//! [string]: crate::content::string -//! [text]: crate::content::text +//! [string]: crate::construct::string +//! [text]: crate::construct::text //! [character_reference]: crate::construct::character_reference //! 
[hard_break_escape]: crate::construct::hard_break_escape diff --git a/src/construct/character_reference.rs b/src/construct/character_reference.rs index 7d7b6f9..7935109 100644 --- a/src/construct/character_reference.rs +++ b/src/construct/character_reference.rs @@ -54,8 +54,8 @@ //! * [`character-reference.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/character-reference.js) //! * [*§ 2.5 Entity and numeric character references* in `CommonMark`](https://spec.commonmark.org/0.30/#entity-and-numeric-character-references) //! -//! [string]: crate::content::string -//! [text]: crate::content::text +//! [string]: crate::construct::string +//! [text]: crate::construct::text //! [character_escape]: crate::construct::character_reference //! [decode_numeric]: crate::util::decode_character_reference::decode_numeric //! [character_references]: crate::constant::CHARACTER_REFERENCES diff --git a/src/construct/code_fenced.rs b/src/construct/code_fenced.rs index 74d6fe1..3812d44 100644 --- a/src/construct/code_fenced.rs +++ b/src/construct/code_fenced.rs @@ -91,9 +91,9 @@ //! * [`code-fenced.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/code-fenced.js) //! * [*§ 4.5 Fenced code blocks* in `CommonMark`](https://spec.commonmark.org/0.30/#fenced-code-blocks) //! -//! [flow]: crate::content::flow -//! [string]: crate::content::string -//! [text]: crate::content::text +//! [flow]: crate::construct::flow +//! [string]: crate::construct::string +//! [text]: crate::construct::text //! [code_indented]: crate::construct::code_indented //! [code_text]: crate::construct::code_text //! [character_escape]: crate::construct::character_escape diff --git a/src/construct/code_indented.rs b/src/construct/code_indented.rs index cf111f4..e3a5333 100644 --- a/src/construct/code_indented.rs +++ b/src/construct/code_indented.rs @@ -38,8 +38,8 @@ //! 
* [`code-indented.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/code-indented.js) //! * [*§ 4.4 Indented code blocks* in `CommonMark`](https://spec.commonmark.org/0.30/#indented-code-blocks) //! -//! [flow]: crate::content::flow -//! [text]: crate::content::text +//! [flow]: crate::construct::flow +//! [text]: crate::construct::text //! [code_text]: crate::construct::code_text //! [code_fenced]: crate::construct::code_fenced //! [html-pre]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-pre-element diff --git a/src/construct/code_text.rs b/src/construct/code_text.rs index d601583..7ebee96 100644 --- a/src/construct/code_text.rs +++ b/src/construct/code_text.rs @@ -77,8 +77,8 @@ //! * [`code-text.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/code-text.js) //! * [*§ 6.1 Code spans* in `CommonMark`](https://spec.commonmark.org/0.30/#code-spans) //! -//! [flow]: crate::content::flow -//! [text]: crate::content::text +//! [flow]: crate::construct::flow +//! [text]: crate::construct::text //! [code_indented]: crate::construct::code_indented //! [code_fenced]: crate::construct::code_fenced //! [html-code]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-code-element diff --git a/src/construct/definition.rs b/src/construct/definition.rs index e242e23..8f274ee 100644 --- a/src/construct/definition.rs +++ b/src/construct/definition.rs @@ -80,8 +80,8 @@ //! * [`definition.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/definition.js) //! * [*§ 4.7 Link reference definitions* in `CommonMark`](https://spec.commonmark.org/0.30/#link-reference-definitions) //! -//! [flow]: crate::content::flow -//! [string]: crate::content::string +//! [flow]: crate::construct::flow +//! [string]: crate::construct::string //! 
[character_escape]: crate::construct::character_escape //! [character_reference]: crate::construct::character_reference //! [label_end]: crate::construct::label_end diff --git a/src/construct/document.rs b/src/construct/document.rs new file mode 100644 index 0000000..9def6c5 --- /dev/null +++ b/src/construct/document.rs @@ -0,0 +1,492 @@ +//! The document content type. +//! +//! **Document** represents the containers, such as block quotes and lists, +//! which structure the document and contain other sections. +//! +//! The constructs found in document are: +//! +//! * [Block quote][crate::construct::block_quote] +//! * [List][crate::construct::list_item] + +use crate::event::{Content, Event, Kind, Link, Name}; +use crate::state::{Name as StateName, State}; +use crate::subtokenize::divide_events; +use crate::tokenizer::{Container, ContainerState, Tokenizer}; +use crate::util::skip; + +/// Phases where we can exit containers. +#[derive(Debug, PartialEq)] +enum Phase { + /// After parsing a line of lazy flow which resulted in something that + /// exits containers before the line. + /// + /// ```markdown + /// | * a + /// > | ```js + /// ^ + /// | b + /// | ``` + /// ``` + After, + /// When a new container replaces an existing container. + /// + /// ```markdown + /// | * a + /// > | > b + /// ^ + /// ``` + Prefix, + /// After everything. + /// + /// ```markdown + /// > | * a + /// ^ + /// ``` + Eof, +} + +/// Start of document, at an optional BOM. +/// +/// ```markdown +/// > | a +/// ^ +/// ``` +pub fn start(tokenizer: &mut Tokenizer) -> State { + tokenizer.tokenize_state.document_child = Some(Box::new(Tokenizer::new( + tokenizer.point.clone(), + tokenizer.parse_state, + ))); + + tokenizer.attempt( + State::Next(StateName::DocumentContainerExistingBefore), + State::Next(StateName::DocumentContainerExistingBefore), + ); + + State::Retry(StateName::BomStart) +} + +/// At optional existing containers. 
+// +/// ```markdown +/// | * a +/// > | > b +/// ^ +/// ``` +pub fn container_existing_before(tokenizer: &mut Tokenizer) -> State { + // If there are more existing containers, check whether the next one continues. + if tokenizer.tokenize_state.document_continued + < tokenizer.tokenize_state.document_container_stack.len() + { + let container = &tokenizer.tokenize_state.document_container_stack + [tokenizer.tokenize_state.document_continued]; + + let name = match container.kind { + Container::BlockQuote => StateName::BlockQuoteContStart, + Container::ListItem => StateName::ListItemContStart, + }; + + tokenizer.attempt( + State::Next(StateName::DocumentContainerExistingAfter), + State::Next(StateName::DocumentContainerNewBefore), + ); + + State::Retry(name) + } + // Otherwise, check new containers. + else { + State::Retry(StateName::DocumentContainerNewBefore) + } +} + +/// After continued existing container. +// +/// ```markdown +/// | * a +/// > | b +/// ^ +/// ``` +pub fn container_existing_after(tokenizer: &mut Tokenizer) -> State { + tokenizer.tokenize_state.document_continued += 1; + State::Retry(StateName::DocumentContainerExistingBefore) +} + +/// At new containers. +// +/// ```markdown +/// > | * a +/// ^ +/// > | > b +/// ^ +/// ``` +pub fn container_new_before(tokenizer: &mut Tokenizer) -> State { + // If we have completely continued, restore the flow’s past `interrupt` + // status. + if tokenizer.tokenize_state.document_continued + == tokenizer.tokenize_state.document_container_stack.len() + { + let child = tokenizer.tokenize_state.document_child.as_ref().unwrap(); + + tokenizer.interrupt = child.interrupt; + + // …and if we’re in a concrete construct, new containers can’t “pierce” + // into them. + if child.concrete { + return State::Retry(StateName::DocumentContainersAfter); + } + } + + // Check for a new container. + // Block quote? + // Add a new container at the end of the stack. 
+ let tail = tokenizer.tokenize_state.document_container_stack.len(); + tokenizer + .tokenize_state + .document_container_stack + .push(ContainerState { + kind: Container::BlockQuote, + blank_initial: false, + size: 0, + }); + // Swap the existing container with the new one. + tokenizer + .tokenize_state + .document_container_stack + .swap(tokenizer.tokenize_state.document_continued, tail); + + tokenizer.attempt( + State::Next(StateName::DocumentContainerNewAfter), + State::Next(StateName::DocumentContainerNewBeforeNotBlockQuote), + ); + State::Retry(StateName::BlockQuoteStart) +} + +/// At new container, but not a block quote. +// +/// ```markdown +/// > | * a +/// ^ +/// ``` +pub fn container_new_before_not_block_quote(tokenizer: &mut Tokenizer) -> State { + // List item? + // We replace the empty block quote container for this new list one. + tokenizer.tokenize_state.document_container_stack + [tokenizer.tokenize_state.document_continued] = ContainerState { + kind: Container::ListItem, + blank_initial: false, + size: 0, + }; + + tokenizer.attempt( + State::Next(StateName::DocumentContainerNewAfter), + State::Next(StateName::DocumentContainerNewBeforeNotList), + ); + State::Retry(StateName::ListItemStart) +} + +/// At new container, but not a list (or block quote). +// +/// ```markdown +/// > | a +/// ^ +/// ``` +pub fn container_new_before_not_list(tokenizer: &mut Tokenizer) -> State { + // It wasn’t a new block quote or a list. + // Swap the new container (in the middle) with the existing one (at the end). + // Drop what was in the middle. + tokenizer + .tokenize_state + .document_container_stack + .swap_remove(tokenizer.tokenize_state.document_continued); + + State::Retry(StateName::DocumentContainersAfter) +} + +/// After new container. +/// +/// ```markdown +/// > | * a +/// ^ +/// > | > b +/// ^ +/// ``` +pub fn container_new_after(tokenizer: &mut Tokenizer) -> State { + // It was a new block quote or a list. 
+ // Swap the new container (in the middle) with the existing one (at the end). + // Take the new container. + let container = tokenizer + .tokenize_state + .document_container_stack + .swap_remove(tokenizer.tokenize_state.document_continued); + + // If we did not continue all existing containers, and there is a new one, + // close the flow and those containers. + if tokenizer.tokenize_state.document_continued + != tokenizer.tokenize_state.document_container_stack.len() + { + exit_containers(tokenizer, &Phase::Prefix); + } + + tokenizer + .tokenize_state + .document_container_stack + .push(container); + tokenizer.tokenize_state.document_continued += 1; + tokenizer.interrupt = false; + State::Retry(StateName::DocumentContainerNewBefore) +} + +/// After containers, at flow. +// +/// ```markdown +/// > | * a +/// ^ +/// > | > b +/// ^ +/// ``` +pub fn containers_after(tokenizer: &mut Tokenizer) -> State { + let child = tokenizer.tokenize_state.document_child.as_mut().unwrap(); + + child.lazy = tokenizer.tokenize_state.document_continued + != tokenizer.tokenize_state.document_container_stack.len(); + child.define_skip(tokenizer.point.clone()); + + match tokenizer.current { + // Note: EOL is part of data. + None => State::Retry(StateName::DocumentFlowEnd), + Some(_) => { + let current = tokenizer.events.len(); + let previous = tokenizer.tokenize_state.document_data_index; + if let Some(previous) = previous { + tokenizer.events[previous].link.as_mut().unwrap().next = Some(current); + } + tokenizer.tokenize_state.document_data_index = Some(current); + tokenizer.enter_link( + Name::Data, + Link { + previous, + next: None, + content: Content::Flow, + }, + ); + State::Retry(StateName::DocumentFlowInside) + } + } +} + +/// In flow. 
+// +/// ```markdown +/// > | * ab +/// ^ +/// ``` +pub fn flow_inside(tokenizer: &mut Tokenizer) -> State { + match tokenizer.current { + None => { + tokenizer.exit(Name::Data); + State::Retry(StateName::DocumentFlowEnd) + } + // Note: EOL is part of data. + Some(b'\n') => { + tokenizer.consume(); + tokenizer.exit(Name::Data); + State::Next(StateName::DocumentFlowEnd) + } + Some(_) => { + tokenizer.consume(); + State::Next(StateName::DocumentFlowInside) + } + } +} + +/// After flow (after eol or at eof). +// +/// ```markdown +/// | * a +/// > | > b +/// ^ ^ +/// ``` +pub fn flow_end(tokenizer: &mut Tokenizer) -> State { + let child = tokenizer.tokenize_state.document_child.as_mut().unwrap(); + let state = tokenizer + .tokenize_state + .document_child_state + .unwrap_or(State::Next(StateName::FlowStart)); + + tokenizer.tokenize_state.document_exits.push(None); + + let state = child.push( + (child.point.index, child.point.vs), + (tokenizer.point.index, tokenizer.point.vs), + state, + ); + + let paragraph = matches!(state, State::Next(StateName::ParagraphInside)) + || (!child.events.is_empty() + && child.events + [skip::opt_back(&child.events, child.events.len() - 1, &[Name::LineEnding])] + .name + == Name::Paragraph); + + tokenizer.tokenize_state.document_child_state = Some(state); + + if child.lazy && paragraph && tokenizer.tokenize_state.document_paragraph_before { + tokenizer.tokenize_state.document_continued = + tokenizer.tokenize_state.document_container_stack.len(); + } + + if tokenizer.tokenize_state.document_continued + != tokenizer.tokenize_state.document_container_stack.len() + { + exit_containers(tokenizer, &Phase::After); + } + + match tokenizer.current { + None => { + tokenizer.tokenize_state.document_continued = 0; + exit_containers(tokenizer, &Phase::Eof); + resolve(tokenizer); + State::Ok + } + Some(_) => { + tokenizer.tokenize_state.document_continued = 0; + tokenizer.tokenize_state.document_paragraph_before = paragraph; + // Containers would only 
be interrupting if we’ve continued. + tokenizer.interrupt = false; + State::Retry(StateName::DocumentContainerExistingBefore) + } + } +} + +/// Close containers (and flow if needed). +fn exit_containers(tokenizer: &mut Tokenizer, phase: &Phase) { + let mut stack_close = tokenizer + .tokenize_state + .document_container_stack + .split_off(tokenizer.tokenize_state.document_continued); + + let child = tokenizer.tokenize_state.document_child.as_mut().unwrap(); + + // Flush if needed. + if *phase != Phase::After { + let state = tokenizer + .tokenize_state + .document_child_state + .take() + .unwrap_or(State::Next(StateName::FlowStart)); + + child.flush(state, false); + } + + if !stack_close.is_empty() { + let index = tokenizer.tokenize_state.document_exits.len() + - (if *phase == Phase::After { 2 } else { 1 }); + let mut exits = Vec::with_capacity(stack_close.len()); + + while !stack_close.is_empty() { + let container = stack_close.pop().unwrap(); + let name = match container.kind { + Container::BlockQuote => Name::BlockQuote, + Container::ListItem => Name::ListItem, + }; + + exits.push(Event { + kind: Kind::Exit, + name: name.clone(), + point: tokenizer.point.clone(), + link: None, + }); + + let mut stack_index = tokenizer.stack.len(); + let mut found = false; + + while stack_index > 0 { + stack_index -= 1; + + if tokenizer.stack[stack_index] == name { + tokenizer.stack.remove(stack_index); + found = true; + break; + } + } + + debug_assert!(found, "expected to find container token to exit"); + } + + if let Some(ref mut list) = tokenizer.tokenize_state.document_exits[index] { + list.append(&mut exits); + } else { + tokenizer.tokenize_state.document_exits[index] = Some(exits); + } + } + + child.interrupt = false; +} + +// Inject everything together. +fn resolve(tokenizer: &mut Tokenizer) { + let child = tokenizer.tokenize_state.document_child.as_mut().unwrap(); + + // First, add the container exits into `child`. 
+ let mut child_index = 0; + let mut line = 0; + + while child_index < child.events.len() { + let event = &child.events[child_index]; + + if event.kind == Kind::Enter + && (event.name == Name::LineEnding || event.name == Name::BlankLineEnding) + { + if let Some(mut exits) = tokenizer.tokenize_state.document_exits[line].take() { + let mut exit_index = 0; + while exit_index < exits.len() { + exits[exit_index].point = event.point.clone(); + exit_index += 1; + } + + child.map.add(child_index, 0, exits); + } + + line += 1; + } + + child_index += 1; + } + + child.map.consume(&mut child.events); + + // Now, add all child events into our parent document tokenizer. + divide_events( + &mut tokenizer.map, + &tokenizer.events, + skip::to(&tokenizer.events, 0, &[Name::Data]), + &mut child.events, + ); + + // Replace the flow data with actual events. + tokenizer.map.consume(&mut tokenizer.events); + + // Now, add some final container exits due to the EOF. + // We can’t inject them into the child earlier, as they are “outside” its + // linked data. + if line < tokenizer.tokenize_state.document_exits.len() { + if let Some(mut exits) = tokenizer.tokenize_state.document_exits[line].take() { + let mut exit_index = 0; + while exit_index < exits.len() { + exits[exit_index].point = tokenizer.point.clone(); + exit_index += 1; + } + + tokenizer.events.append(&mut exits); + } + } + + // Add the resolvers from child. + tokenizer + .resolvers + .append(&mut child.resolvers.split_off(0)); + + tokenizer + .tokenize_state + .definitions + .append(&mut child.tokenize_state.definitions.split_off(0)); +} diff --git a/src/construct/flow.rs b/src/construct/flow.rs new file mode 100644 index 0000000..08c7891 --- /dev/null +++ b/src/construct/flow.rs @@ -0,0 +1,254 @@ +//! The flow content type. +//! +//! **Flow** represents the sections, such as headings and code, which are +//! parsed per line. +//! An example is HTML, which has a certain starting condition (such as +//! `