Fix to prefer flow over definitions, setext headings

CommonMark does not document how to deal with things that occur in definition labels or definition titles (both of which can span multiple lines): can flow (or containers?) interrupt them? According to the `cmark` reference parser they can, so that behavior is implemented here. This adds a new `Content` content type, which houses zero or more definitions followed by zero or one paragraph. Content can be followed by a setext heading underline, which turns into a setext heading when the content ends in a paragraph, into the start of the following paragraph when it is followed by content that starts with a paragraph, or otherwise into a stray paragraph.
author: Titus Wormer <tituswormer@gmail.com> 2022-09-14 16:21:42 +0200
committer: Titus Wormer <tituswormer@gmail.com> 2022-09-14 16:26:24 +0200
commit: 74d2688aa329f0a41c2a92034c3454ed9299e71a (patch)
tree: 9ec8fdc6e40ff7cd40a14408afcc47716990134e /src/construct/paragraph.rs
parent: 65d4b46c2a3bdecb0493e484473d2de3d124f839 (diff)
download: markdown-rs-74d2688aa329f0a41c2a92034c3454ed9299e71a.tar.gz
markdown-rs-74d2688aa329f0a41c2a92034c3454ed9299e71a.tar.bz2
markdown-rs-74d2688aa329f0a41c2a92034c3454ed9299e71a.zip
1 files changed, 48 insertions, 101 deletions
diff --git a/src/construct/paragraph.rs b/src/construct/paragraph.rs
index c1e7311..78fbacb 100644
--- a/src/construct/paragraph.rs
+++ b/src/construct/paragraph.rs
@@ -1,4 +1,4 @@
-//! Paragraph occurs in the [flow][] content type.
+//! Paragraph occurs in the [content][] content type.
 //!
 //! ## Grammar
 //!
@@ -11,14 +11,15 @@
 //! paragraph ::= 1*line *(eol 1*line)
 //! ```
 //!
-//! As this construct occurs in flow, like all flow constructs, it must be
-//! followed by an eol (line ending) or eof (end of file).
+//! This construct must be followed by an eol (line ending) or eof (end of
+//! file), like flow constructs.
 //!
 //! Paragraphs can contain line endings and whitespace, but they are not
 //! allowed to contain blank lines, or to be blank themselves.
 //!
 //! The paragraph is interpreted as the [text][] content type.
-//! That means that [autolinks][autolink], [code (text)][raw_text], etc are allowed.
+//! That means that [autolinks][autolink], [code (text)][raw_text], etc are
+//! allowed.
 //!
 //! ## HTML
 //!
@@ -34,40 +35,57 @@
 //! *   [`content.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/content.js)
 //! *   [*§ 4.8 Paragraphs* in `CommonMark`](https://spec.commonmark.org/0.30/#paragraphs)
 //!
-//! [flow]: crate::construct::flow
+//! [content]: crate::construct::content
 //! [text]: crate::construct::text
 //! [autolink]: crate::construct::autolink
 //! [raw_text]: crate::construct::raw_text
 //! [html]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-p-element
 
-use crate::event::{Content, Kind, Link, Name};
-use crate::resolve::Name as ResolveName;
+use crate::event::{Content, Link, Name};
 use crate::state::{Name as StateName, State};
+use crate::subtokenize::link;
 use crate::tokenizer::Tokenizer;
-use alloc::vec;
 
-/// Before paragraph.
+/// Paragraph start.
 ///
 /// ```markdown
 /// > | abc
 ///     ^
+///   | def
 /// ```
 pub fn start(tokenizer: &mut Tokenizer) -> State {
-    match tokenizer.current {
-        None | Some(b'\n') => unreachable!("unexpected eol/eof"),
-        _ => {
-            tokenizer.enter(Name::Paragraph);
-            tokenizer.enter_link(
-                Name::Data,
-                Link {
-                    previous: None,
-                    next: None,
-                    content: Content::Text,
-                },
-            );
-            State::Retry(StateName::ParagraphInside)
-        }
+    debug_assert!(tokenizer.current.is_some());
+    tokenizer.enter(Name::Paragraph);
+    State::Retry(StateName::ParagraphLineStart)
+}
+
+/// Start of a line in a paragraph.
+///
+/// ```markdown
+/// > | abc
+///     ^
+/// > | def
+///     ^
+/// ```
+pub fn line_start(tokenizer: &mut Tokenizer) -> State {
+    debug_assert!(tokenizer.current.is_some());
+    tokenizer.enter_link(
+        Name::Data,
+        Link {
+            previous: None,
+            next: None,
+            content: Content::Text,
+        },
+    );
+
+    if tokenizer.tokenize_state.connect {
+        let index = tokenizer.events.len() - 1;
+        link(&mut tokenizer.events, index);
+    } else {
+        tokenizer.tokenize_state.connect = true;
     }
+
+    State::Retry(StateName::ParagraphInside)
 }
 
 /// In paragraph.
@@ -78,91 +96,20 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
 /// ```
 pub fn inside(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
-        None | Some(b'\n') => {
+        None => {
+            tokenizer.tokenize_state.connect = false;
             tokenizer.exit(Name::Data);
             tokenizer.exit(Name::Paragraph);
-            tokenizer.register_resolver_before(ResolveName::Paragraph);
-            // You’d be interrupting.
-            tokenizer.interrupt = true;
             State::Ok
         }
+        Some(b'\n') => {
+            tokenizer.consume();
+            tokenizer.exit(Name::Data);
+            State::Next(StateName::ParagraphLineStart)
+        }
         _ => {
             tokenizer.consume();
             State::Next(StateName::ParagraphInside)
         }
     }
 }
-
-/// Merge “`Paragraph`”s, which currently span a single line, into actual
-/// `Paragraph`s that span multiple lines.
-pub fn resolve(tokenizer: &mut Tokenizer) {
-    let mut index = 0;
-
-    while index < tokenizer.events.len() {
-        let event = &tokenizer.events[index];
-
-        if event.kind == Kind::Enter && event.name == Name::Paragraph {
-            // Exit:Paragraph
-            let mut exit_index = index + 3;
-
-            loop {
-                let mut enter_index = exit_index + 1;
-
-                if enter_index == tokenizer.events.len()
-                    || tokenizer.events[enter_index].name != Name::LineEnding
-                {
-                    break;
-                }
-
-                enter_index += 2;
-
-                while enter_index < tokenizer.events.len() {
-                    let event = &tokenizer.events[enter_index];
-
-                    if event.name != Name::SpaceOrTab
-                        && event.name != Name::BlockQuotePrefix
-                        && event.name != Name::BlockQuoteMarker
-                    {
-                        break;
-                    }
-
-                    enter_index += 1;
-                }
-
-                if enter_index == tokenizer.events.len()
-                    || tokenizer.events[enter_index].name != Name::Paragraph
-                {
-                    break;
-                }
-
-                // Remove Exit:Paragraph, Enter:LineEnding, Exit:LineEnding.
-                tokenizer.map.add(exit_index, 3, vec![]);
-
-                // Remove Enter:Paragraph.
-                tokenizer.map.add(enter_index, 1, vec![]);
-
-                // Add Exit:LineEnding position info to Exit:Data.
-                tokenizer.events[exit_index - 1].point =
-                    tokenizer.events[exit_index + 2].point.clone();
-
-                // Link Enter:Data on the previous line to Enter:Data on this line.
-                if let Some(link) = &mut tokenizer.events[exit_index - 2].link {
-                    link.next = Some(enter_index + 1);
-                }
-                if let Some(link) = &mut tokenizer.events[enter_index + 1].link {
-                    link.previous = Some(exit_index - 2);
-                }
-
-                // Potential next start.
-                exit_index = enter_index + 3;
-            }
-
-            // Move to `Exit:Paragraph`.
-            index = exit_index;
-        }
-
-        index += 1;
-    }
-
-    tokenizer.map.consume(&mut tokenizer.events);
-}
author	Titus Wormer <tituswormer@gmail.com>	2022-09-14 16:21:42 +0200
committer	Titus Wormer <tituswormer@gmail.com>	2022-09-14 16:26:24 +0200
commit	74d2688aa329f0a41c2a92034c3454ed9299e71a (patch)
tree	9ec8fdc6e40ff7cd40a14408afcc47716990134e /src/construct/paragraph.rs
parent	65d4b46c2a3bdecb0493e484473d2de3d124f839 (diff)
download	markdown-rs-74d2688aa329f0a41c2a92034c3454ed9299e71a.tar.gz markdown-rs-74d2688aa329f0a41c2a92034c3454ed9299e71a.tar.bz2 markdown-rs-74d2688aa329f0a41c2a92034c3454ed9299e71a.zip