Fix to prefer flow over definitions, setext headings

An undocumented part of CommonMark is how to deal with things in definition labels or definition titles (which both can span multiple lines). Can flow (or containers?) interrupt them? They can according to the `cmark` reference parser, so this was implemented here. This adds a new `Content` content type, which houses zero or more definitions, and then zero-or-one paragraphs. Content can be followed by a setext heading underline, which either turns into a setext heading when the content ends in a paragraph, or turns into the start of the following paragraph when it is followed by content that starts with a paragraph, or turns into a stray paragraph.
author: Titus Wormer <tituswormer@gmail.com> 2022-09-14 16:21:42 +0200
committer: Titus Wormer <tituswormer@gmail.com> 2022-09-14 16:26:24 +0200
commit: 74d2688aa329f0a41c2a92034c3454ed9299e71a (patch)
tree: 9ec8fdc6e40ff7cd40a14408afcc47716990134e /src/construct/content.rs
parent: 65d4b46c2a3bdecb0493e484473d2de3d124f839 (diff)
download: markdown-rs-74d2688aa329f0a41c2a92034c3454ed9299e71a.tar.gz
markdown-rs-74d2688aa329f0a41c2a92034c3454ed9299e71a.tar.bz2
markdown-rs-74d2688aa329f0a41c2a92034c3454ed9299e71a.zip
1 files changed, 188 insertions, 0 deletions
diff --git a/src/construct/content.rs b/src/construct/content.rs
new file mode 100644
index 0000000..6c10cea
--- /dev/null
+++ b/src/construct/content.rs
@@ -0,0 +1,188 @@
+//! Content occurs in the [flow][] content type.
+//!
+//! Content contains zero or more [definition][]s, followed by zero
+//! or one [paragraph][].
+//!
+//! The constructs found in content are:
+//!
+//! *   [Definition][crate::construct::definition]
+//! *   [Paragraph][crate::construct::paragraph]
+//!
+//! ## Tokens
+//!
+//! *   [`Content`][Name::Content]
+//!
+//! > 👉 **Note**: while parsing, [`Content`][Name::Content]
+//! > is used, which is later compiled away.
+//!
+//! ## References
+//!
+//! *   [`content.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/content.js)
+//!
+//! [flow]: crate::construct::flow
+//! [definition]: crate::construct::definition
+//! [paragraph]: crate::construct::paragraph
+
+use crate::event::{Content, Kind, Link, Name};
+use crate::resolve::Name as ResolveName;
+use crate::state::{Name as StateName, State};
+use crate::subtokenize::{subtokenize, Subresult};
+use crate::tokenizer::Tokenizer;
+use alloc::{string::String, vec};
+
+/// Before a content chunk (must not be at an eol or eof).
+///
+/// ```markdown
+/// > | abc
+///     ^
+/// ```
+pub fn chunk_start(tokenizer: &mut Tokenizer) -> State {
+    match tokenizer.current {
+        None | Some(b'\n') => unreachable!("unexpected eol/eof"),
+        _ => {
+            tokenizer.enter_link(
+                Name::Content,
+                Link {
+                    previous: None,
+                    next: None,
+                    content: Content::Content,
+                },
+            );
+            State::Retry(StateName::ContentChunkInside)
+        }
+    }
+}
+
+/// In a content chunk, which runs up to (not including) the next eol or eof.
+///
+/// ```markdown
+/// > | abc
+///     ^^^
+/// ```
+pub fn chunk_inside(tokenizer: &mut Tokenizer) -> State {
+    match tokenizer.current {
+        None | Some(b'\n') => {
+            tokenizer.exit(Name::Content);
+            tokenizer.register_resolver_before(ResolveName::Content);
+            // Flag that whatever comes next would be interrupting this content.
+            tokenizer.interrupt = true;
+            State::Ok
+        }
+        _ => {
+            tokenizer.consume();
+            State::Next(StateName::ContentChunkInside)
+        }
+    }
+}
+
+/// Before a potential definition (otherwise, this is where the paragraph starts).
+///
+/// ```markdown
+/// > | [a]: b
+///     ^
+/// ```
+pub fn definition_before(tokenizer: &mut Tokenizer) -> State {
+    tokenizer.attempt(
+        State::Next(StateName::ContentDefinitionAfter),
+        State::Next(StateName::ParagraphStart),
+    );
+    State::Retry(StateName::DefinitionStart)
+}
+
+/// After a definition, at an eol (another definition may follow) or eof (done).
+///
+/// ```markdown
+/// > | [a]: b
+///           ^
+///   | c
+/// ```
+pub fn definition_after(tokenizer: &mut Tokenizer) -> State {
+    debug_assert!(matches!(tokenizer.current, None | Some(b'\n')));
+    if tokenizer.current.is_none() {
+        State::Ok
+    } else {
+        tokenizer.enter(Name::LineEnding);
+        tokenizer.consume();
+        tokenizer.exit(Name::LineEnding);
+        State::Next(StateName::ContentDefinitionBefore)
+    }
+}
+
+/// Merge `Content` chunks, which currently span a single line, into actual
+/// `Content`s that span multiple lines, and then subtokenize those.
+pub fn resolve(tokenizer: &mut Tokenizer) -> Result<Option<Subresult>, String> {
+    let mut index = 0;
+
+    while index < tokenizer.events.len() {
+        let event = &tokenizer.events[index];
+
+        if event.kind == Kind::Enter && event.name == Name::Content {
+            // Index of `Exit:Content`, assumed to directly follow its enter.
+            let mut exit_index = index + 1;
+
+            loop {
+                let mut enter_index = exit_index + 1;
+
+                if enter_index == tokenizer.events.len()
+                    || tokenizer.events[enter_index].name != Name::LineEnding
+                {
+                    break;
+                }
+
+                // Skip past `Enter:LineEnding` and `Exit:LineEnding`.
+                enter_index += 2;
+
+                // Skip past prefix events (whitespace, block quote prefix and marker).
+                while enter_index < tokenizer.events.len() {
+                    let event = &tokenizer.events[enter_index];
+
+                    if event.name != Name::SpaceOrTab
+                        && event.name != Name::BlockQuotePrefix
+                        && event.name != Name::BlockQuoteMarker
+                    {
+                        break;
+                    }
+
+                    enter_index += 1;
+                }
+
+                if enter_index == tokenizer.events.len()
+                    || tokenizer.events[enter_index].name != Name::Content
+                {
+                    break;
+                }
+
+                // Move the point of `Exit:Content` to that of `Exit:LineEnding`.
+                tokenizer.events[exit_index].point = tokenizer.events[exit_index + 2].point.clone();
+                // Queue removal of `Enter:LineEnding`, `Exit:LineEnding` (applied by `map.consume` below).
+                tokenizer.map.add(exit_index + 1, 2, vec![]);
+
+                // Link the previous `Enter:Content` to the `Enter:Content` on this line, and vice versa.
+                tokenizer.events[exit_index - 1].link.as_mut().unwrap().next = Some(enter_index);
+                tokenizer.events[enter_index]
+                    .link
+                    .as_mut()
+                    .unwrap()
+                    .previous = Some(exit_index - 1);
+
+                // The exit of this chunk is where the next round starts looking.
+                exit_index = enter_index + 1;
+            }
+
+            // Move to the last `Exit:Content` that was handled.
+            index = exit_index;
+        }
+
+        index += 1;
+    }
+
+    tokenizer.map.consume(&mut tokenizer.events);
+
+    let result = subtokenize(
+        &mut tokenizer.events,
+        tokenizer.parse_state,
+        &Some(Content::Content),
+    )?;
+
+    Ok(Some(result))
+}
author	Titus Wormer <tituswormer@gmail.com>	2022-09-14 16:21:42 +0200
committer	Titus Wormer <tituswormer@gmail.com>	2022-09-14 16:26:24 +0200
commit	74d2688aa329f0a41c2a92034c3454ed9299e71a (patch)
tree	9ec8fdc6e40ff7cd40a14408afcc47716990134e /src/construct/content.rs
parent	65d4b46c2a3bdecb0493e484473d2de3d124f839 (diff)
download	markdown-rs-74d2688aa329f0a41c2a92034c3454ed9299e71a.tar.gz markdown-rs-74d2688aa329f0a41c2a92034c3454ed9299e71a.tar.bz2 markdown-rs-74d2688aa329f0a41c2a92034c3454ed9299e71a.zip