Add paragraph

author: Titus Wormer <tituswormer@gmail.com> 2022-06-20 13:40:23 +0200
committer: Titus Wormer <tituswormer@gmail.com> 2022-06-20 13:40:23 +0200
commit: 61271d73128f8553f8c4c17927828cde52a25eba (patch)
tree: 5b812e04f9f9311ae22209843db257f34fc90d8d /src
parent: 262aec96cece3e9dd55828397b8ec859e7cff606 (diff)
download: markdown-rs-61271d73128f8553f8c4c17927828cde52a25eba.tar.gz
markdown-rs-61271d73128f8553f8c4c17927828cde52a25eba.tar.bz2
markdown-rs-61271d73128f8553f8c4c17927828cde52a25eba.zip
11 files changed, 215 insertions, 141 deletions
diff --git a/src/construct/blank_line.rs b/src/construct/blank_line.rs
index 7f794b9..fdb1ee0 100644
--- a/src/construct/blank_line.rs
+++ b/src/construct/blank_line.rs
@@ -6,7 +6,7 @@
 //! blank_line ::= *(' ' '\t')
 //! ```
 //!
-//! Blank lines are sometimes needed, such as to differentiate a paragraph
+//! Blank lines are sometimes needed, such as to differentiate a [paragraph][]
 //! from another paragraph.
 //! In several cases, blank lines are not needed between flow constructs,
 //! such as between two [heading (atx)][heading-atx]s.
@@ -24,9 +24,10 @@
 //! *   [*§ 4.9 Blank lines* in `CommonMark`](https://spec.commonmark.org/0.30/#blank-lines)
 //!
 //! [flow]: crate::content::flow
+//! [paragraph]: crate::construct::paragraph
 //! [heading-atx]: crate::construct::heading_atx
 //!
-//! <!-- To do: link `list`, `paragraph` -->
+//! <!-- To do: link `list` -->
 
 use crate::construct::partial_whitespace::start as whitespace;
 use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
diff --git a/src/construct/code_fenced.rs b/src/construct/code_fenced.rs
index 28ac20b..ba76aa8 100644
--- a/src/construct/code_fenced.rs
+++ b/src/construct/code_fenced.rs
@@ -64,7 +64,8 @@
 //! ```
 //!
 //! The `info` and `meta` parts are interpreted as the [string][] content type.
-//! That means that character escapes and character reference are allowed.
+//! That means that [character escapes][character_escape] and
+//! [character references][character_reference] are allowed.
 //!
 //! In markdown, it is also possible to use [code (text)][code_text] in the
 //! [text][] content type.
@@ -84,6 +85,8 @@
 //! [text]: crate::content::text
 //! [code_indented]: crate::construct::code_indented
 //! [code_text]: crate::construct::code_text
+//! [character_escape]: crate::construct::character_escape
+//! [character_reference]: crate::construct::character_reference
 //! [html-pre]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-pre-element
 //! [html-code]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-code-element
 
diff --git a/src/construct/definition.rs b/src/construct/definition.rs
index 65c0991..f7f8acd 100644
--- a/src/construct/definition.rs
+++ b/src/construct/definition.rs
@@ -38,7 +38,8 @@
 //!
 //! The `label`, `destination`, and `title` parts are interpreted as the
 //! [string][] content type.
-//! That means that character escapes and character reference are allowed.
+//! That means that [character escapes][character_escape] and
+//! [character references][character_reference] are allowed.
 //!
 //! ## References
 //!
@@ -47,6 +48,8 @@
 //!
 //! [flow]: crate::content::flow
 //! [string]: crate::content::string
+//! [character_escape]: crate::construct::character_escape
+//! [character_reference]: crate::construct::character_reference
 //! [html]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-a-element
 //!
 //! <!-- To do: link link (reference) -->
diff --git a/src/construct/heading_atx.rs b/src/construct/heading_atx.rs
index 3ff6fea..ab8b6a5 100644
--- a/src/construct/heading_atx.rs
+++ b/src/construct/heading_atx.rs
@@ -3,9 +3,9 @@
 //! They’re formed with the following BNF:
 //!
 //! ```bnf
-//! heading_atx ::= 1*6'#' [ 1*space_or_tab code [ 1*space_or_tab 1*'#' ] ] *space_or_tab
+//! heading_atx ::= 1*6'#' [ 1*space_or_tab text [ 1*space_or_tab 1*'#' ] ] *space_or_tab
 //!
-//! code ::= . ; any unicode code point (other than line endings).
+//! text ::= code - eol
 //! space_or_tab ::= ' ' | '\t'
 //! ```
 //!
diff --git a/src/construct/html_flow.rs b/src/construct/html_flow.rs
index da4517d..5adac7d 100644
--- a/src/construct/html_flow.rs
+++ b/src/construct/html_flow.rs
@@ -63,7 +63,7 @@
 //!
 //! The **complete** production of HTML (flow) is not allowed to interrupt
 //! content.
-//! That means that a blank line is needed between a paragraph and it.
+//! That means that a blank line is needed between a [paragraph][] and it.
 //! However, [HTML (text)][html_text] has a similar production, which will
 //! typically kick-in instead.
 //!
@@ -87,6 +87,7 @@
 //!
 //! [flow]: crate::content::flow
 //! [html_text]: crate::construct::html_text
+//! [paragraph]: crate::construct::paragraph
 //! [html_raw_names]: crate::constant::HTML_RAW_NAMES
 //! [html_block_names]: crate::constant::HTML_BLOCK_NAMES
 //! [html-parsing]: https://html.spec.whatwg.org/multipage/parsing.html#parsing
diff --git a/src/construct/mod.rs b/src/construct/mod.rs
index 3195205..1debb74 100644
--- a/src/construct/mod.rs
+++ b/src/construct/mod.rs
@@ -35,7 +35,7 @@
 //! *   label start (image)
 //! *   label start (link)
 //! *   list
-//! *   paragraph
+//! *   [paragraph][]
 //! *   [thematic break][thematic_break]
 //!
 //! Each construct maintained here is explained with a BNF diagram.
@@ -67,6 +67,7 @@ pub mod heading_atx;
 pub mod heading_setext;
 pub mod html_flow;
 pub mod html_text;
+pub mod paragraph;
 pub mod partial_destination;
 pub mod partial_label;
 pub mod partial_title;
diff --git a/src/construct/paragraph.rs b/src/construct/paragraph.rs
new file mode 100644
index 0000000..50ef627
--- /dev/null
+++ b/src/construct/paragraph.rs
@@ -0,0 +1,177 @@
+//! Paragraph is a construct that occurs in the [flow] content type.
+//!
+//! They’re formed with the following BNF:
+//!
+//! ```bnf
+//! ; Restriction: lines cannot start other flow constructs.
+//! ; Restriction: lines cannot be blank.
+//! paragraph ::= 1*line *( eol 1*line )
+//! ```
+//!
+//! Paragraphs in markdown relate to the `<p>` element in HTML.
+//! See [*§ 4.4.1 The `p` element* in the HTML spec][html] for more info.
+//!
+//! Paragraphs can contain line endings and whitespace, but they are not
+//! allowed to contain blank lines, or to be blank themselves.
+//!
+//! The paragraph is interpreted as the [text][] content type.
+//! That means that [autolinks][autolink], [code (text)][code_text], etc. are allowed.
+//!
+//! ## References
+//!
+//! *   [`content.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/content.js)
+//! *   [*§ 4.8 Paragraphs* in `CommonMark`](https://spec.commonmark.org/0.30/#paragraphs)
+//!
+//! [flow]: crate::content::flow
+//! [text]: crate::content::text
+//! [autolink]: crate::construct::autolink
+//! [code_text]: crate::construct::code_text
+//! [html]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-p-element
+
+use crate::constant::TAB_SIZE;
+use crate::construct::{
+    code_fenced::start as code_fenced, heading_atx::start as heading_atx,
+    html_flow::start as html_flow, partial_whitespace::start as whitespace,
+    thematic_break::start as thematic_break,
+};
+use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
+use crate::util::span::from_exit_event;
+
+/// Before a paragraph.
+///
+/// ```markdown
+/// |qwe
+/// ```
+pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+            unreachable!("unexpected eol/eof at start of paragraph")
+        }
+        _ => {
+            tokenizer.enter(TokenType::Paragraph);
+            tokenizer.enter(TokenType::ChunkText);
+            inside(tokenizer, code)
+        }
+    }
+}
+
+/// In a paragraph.
+///
+/// ```markdown
+/// al|pha
+/// ```
+fn inside(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::None => end(tokenizer, code),
+        Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => tokenizer
+            .check(interrupt, |ok| {
+                Box::new(if ok { at_line_ending } else { end })
+            })(tokenizer, code),
+        _ => {
+            tokenizer.consume(code);
+            (State::Fn(Box::new(inside)), None)
+        }
+    }
+}
+
+/// At a line ending, not interrupting.
+///
+/// ```markdown
+/// alpha|
+/// bravo.
+/// ```
+fn at_line_ending(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    tokenizer.consume(code);
+    tokenizer.exit(TokenType::ChunkText);
+    tokenizer.enter(TokenType::ChunkText);
+    let next_index = tokenizer.events.len() - 1;
+    tokenizer.events[next_index - 2].next = Some(next_index);
+    tokenizer.events[next_index].previous = Some(next_index - 2);
+    (State::Fn(Box::new(inside)), None)
+}
+
+/// At a line ending that closes the paragraph, or at the end of the file: done.
+///
+/// ```markdown
+/// alpha|
+/// ***
+/// ```
+fn end(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    tokenizer.exit(TokenType::ChunkText);
+    tokenizer.exit(TokenType::Paragraph);
+    (State::Ok, Some(vec![code]))
+}
+
+/// Before a potential interruption.
+///
+/// ```markdown
+/// alpha|
+/// ***
+/// ```
+fn interrupt(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+            tokenizer.enter(TokenType::LineEnding);
+            tokenizer.consume(code);
+            tokenizer.exit(TokenType::LineEnding);
+            (State::Fn(Box::new(interrupt_initial)), None)
+        }
+        _ => unreachable!("expected eol"),
+    }
+}
+
+/// After a line ending.
+///
+/// ```markdown
+/// alpha|
+/// ~~~js
+/// ~~~
+/// ```
+fn interrupt_initial(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    tokenizer.attempt_2(code_fenced, html_flow, |ok| {
+        if ok {
+            Box::new(|_tokenizer, _code| (State::Nok, None))
+        } else {
+            Box::new(|tokenizer, code| {
+                tokenizer.attempt(
+                    |tokenizer, code| whitespace(tokenizer, code, TokenType::Whitespace),
+                    |_ok| Box::new(interrupt_start),
+                )(tokenizer, code)
+            })
+        }
+    })(tokenizer, code)
+}
+
+/// After a line ending, after optional whitespace.
+///
+/// ```markdown
+/// alpha|
+/// # bravo
+/// ```
+fn interrupt_start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    let tail = tokenizer.events.last();
+    let mut prefix = 0;
+
+    if let Some(event) = tail {
+        if event.token_type == TokenType::Whitespace {
+            let span = from_exit_event(&tokenizer.events, tokenizer.events.len() - 1);
+            prefix = span.end_index - span.start_index;
+        }
+    }
+
+    match code {
+        // Blank lines are not allowed in paragraph.
+        Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => (State::Nok, None),
+        // To do: If code is disabled, indented lines are allowed.
+        _ if prefix >= TAB_SIZE => (State::Ok, None),
+        // To do: definitions, setext headings, etc?
+        _ => tokenizer.attempt_2(heading_atx, thematic_break, |ok| {
+            let result = if ok {
+                (State::Nok, None)
+            } else {
+                (State::Ok, None)
+            };
+            Box::new(|_t, _c| result)
+        })(tokenizer, code),
+    }
+}
diff --git a/src/construct/partial_destination.rs b/src/construct/partial_destination.rs
index a2f638b..58d07c1 100644
--- a/src/construct/partial_destination.rs
+++ b/src/construct/partial_destination.rs
@@ -26,7 +26,8 @@
 //! URLs.
 //!
 //! The destination is interpreted as the [string][] content type.
-//! That means that character escapes and character reference are allowed.
+//! That means that [character escapes][character_escape] and
+//! [character references][character_reference] are allowed.
 //!
 //! ## References
 //!
@@ -34,6 +35,8 @@
 //!
 //! [definition]: crate::construct::definition
 //! [string]: crate::content::string
+//! [character_escape]: crate::construct::character_escape
+//! [character_reference]: crate::construct::character_reference
 //!
 //! <!-- To do: link label end. -->
 
diff --git a/src/construct/partial_label.rs b/src/construct/partial_label.rs
index f7ce8d7..4997390 100644
--- a/src/construct/partial_label.rs
+++ b/src/construct/partial_label.rs
@@ -19,7 +19,8 @@
 //! contain blank lines, and they must not be blank themselves.
 //!
 //! The label is interpreted as the [string][] content type.
-//! That means that character escapes and character reference are allowed.
+//! That means that [character escapes][character_escape] and
+//! [character references][character_reference] are allowed.
 //!
 //! > 👉 **Note**: this label relates to, but is not, the initial “label” of
 //! > what is know as a reference in markdown:
@@ -46,6 +47,8 @@
 //!
 //! [definition]: crate::construct::definition
 //! [string]: crate::content::string
+//! [character_escape]: crate::construct::character_escape
+//! [character_reference]: crate::construct::character_reference
 //! [link_reference_size_max]: crate::constant::LINK_REFERENCE_SIZE_MAX
 //!
 //! <!-- To do: link label end, label starts. -->
diff --git a/src/construct/partial_title.rs b/src/construct/partial_title.rs
index 7b5fa64..19ba8d4 100644
--- a/src/construct/partial_title.rs
+++ b/src/construct/partial_title.rs
@@ -17,7 +17,8 @@
 //! They are allowed to be blank themselves.
 //!
 //! The title is interpreted as the [string][] content type.
-//! That means that character escapes and character reference are allowed.
+//! That means that [character escapes][character_escape] and
+//! [character references][character_reference] are allowed.
 //!
 //! ## References
 //!
@@ -25,6 +26,8 @@
 //!
 //! [definition]: crate::construct::definition
 //! [string]: crate::content::string
+//! [character_escape]: crate::construct::character_escape
+//! [character_reference]: crate::construct::character_reference
 //!
 //! <!-- To do: link label end. -->
 
diff --git a/src/content/flow.rs b/src/content/flow.rs
index 58be61d..22aa77f 100644
--- a/src/content/flow.rs
+++ b/src/content/flow.rs
@@ -19,17 +19,15 @@
 //! *   [HTML (flow)][crate::construct::html_flow]
 //! *   [Thematic break][crate::construct::thematic_break]
 
-use crate::constant::TAB_SIZE;
 use crate::construct::{
     blank_line::start as blank_line, code_fenced::start as code_fenced,
     code_indented::start as code_indented, definition::start as definition,
     heading_atx::start as heading_atx, heading_setext::start as heading_setext,
-    html_flow::start as html_flow, partial_whitespace::start as whitespace,
-    thematic_break::start as thematic_break,
+    html_flow::start as html_flow, paragraph::start as paragraph,
+    partial_whitespace::start as whitespace, thematic_break::start as thematic_break,
 };
 use crate::subtokenize::subtokenize;
 use crate::tokenizer::{Code, Event, Point, State, StateFnResult, TokenType, Tokenizer};
-use crate::util::span::from_exit_event;
 
 /// Turn `codes` as the flow content type into events.
 pub fn flow(codes: &[Code], point: Point, index: usize) -> Vec<Event> {
@@ -52,7 +50,7 @@ pub fn flow(codes: &[Code], point: Point, index: usize) -> Vec<Event> {
 /// |    bravo
 /// |***
 /// ```
-pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
     match code {
         Code::None => (State::Ok, None),
         _ => tokenizer.attempt(blank_line, |ok| {
@@ -132,7 +130,7 @@ fn after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
 /// ```markdown
 /// |qwe
 /// ```
-pub fn before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+fn before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
     tokenizer.attempt(
         |tokenizer, code| whitespace(tokenizer, code, TokenType::Whitespace),
         |_ok| Box::new(before_after_prefix),
@@ -145,140 +143,21 @@ pub fn before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
 /// |# asd
 /// |***
 /// ```
-pub fn before_after_prefix(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+fn before_after_prefix(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
     tokenizer.attempt_4(
         heading_atx,
         thematic_break,
         definition,
         heading_setext,
-        |ok| Box::new(if ok { after } else { paragraph_before }),
+        |ok| Box::new(if ok { after } else { before_paragraph }),
     )(tokenizer, code)
 }
 
 /// Before a paragraph.
 ///
 /// ```markdown
-/// |qwe
+/// |asd
 /// ```
-fn paragraph_before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
-    match code {
-        Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
-            after(tokenizer, code)
-        }
-        _ => {
-            tokenizer.enter(TokenType::Paragraph);
-            tokenizer.enter(TokenType::ChunkText);
-            paragraph_inside(tokenizer, code, tokenizer.events.len() - 1)
-        }
-    }
-}
-
-/// In a paragraph.
-///
-/// ```markdown
-/// al|pha
-/// ```
-fn paragraph_inside(tokenizer: &mut Tokenizer, code: Code, previous: usize) -> StateFnResult {
-    match code {
-        Code::None => paragraph_end(tokenizer, code),
-        Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
-            tokenizer.check(continuation_construct, move |ok| {
-                Box::new(move |t, c| {
-                    if ok {
-                        paragraph_continue(t, c, previous)
-                    } else {
-                        paragraph_end(t, c)
-                    }
-                })
-            })(tokenizer, code)
-        }
-        _ => {
-            tokenizer.consume(code);
-            (
-                State::Fn(Box::new(move |t, c| paragraph_inside(t, c, previous))),
-                None,
-            )
-        }
-    }
-}
-
-fn continuation_construct(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
-    match code {
-        Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
-            tokenizer.enter(TokenType::LineEnding);
-            tokenizer.consume(code);
-            tokenizer.exit(TokenType::LineEnding);
-            (
-                State::Fn(Box::new(continuation_construct_initial_before)),
-                None,
-            )
-        }
-        _ => unreachable!("expected eol"),
-    }
-}
-
-fn continuation_construct_initial_before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
-    tokenizer.attempt_2(code_fenced, html_flow, |ok| {
-        if ok {
-            Box::new(|_tokenizer, _code| (State::Nok, None))
-        } else {
-            Box::new(|tokenizer, code| {
-                tokenizer.attempt(
-                    |tokenizer, code| whitespace(tokenizer, code, TokenType::Whitespace),
-                    |_ok| Box::new(continuation_construct_after_prefix),
-                )(tokenizer, code)
-            })
-        }
-    })(tokenizer, code)
-}
-
-fn continuation_construct_after_prefix(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
-    let tail = tokenizer.events.last();
-    let mut prefix = 0;
-
-    if let Some(event) = tail {
-        if event.token_type == TokenType::Whitespace {
-            let span = from_exit_event(&tokenizer.events, tokenizer.events.len() - 1);
-            prefix = span.end_index - span.start_index;
-        }
-    }
-
-    match code {
-        // Blank lines are not allowed in paragraph.
-        Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => (State::Nok, None),
-        // To do: If code is disabled, indented lines are part of the paragraph.
-        _ if prefix >= TAB_SIZE => (State::Ok, None),
-        // To do: definitions, setext headings, etc?
-        _ => tokenizer.attempt_2(heading_atx, thematic_break, |ok| {
-            let result = if ok {
-                (State::Nok, None)
-            } else {
-                (State::Ok, None)
-            };
-            Box::new(|_t, _c| result)
-        })(tokenizer, code),
-    }
-}
-
-fn paragraph_continue(
-    tokenizer: &mut Tokenizer,
-    code: Code,
-    previous_index: usize,
-) -> StateFnResult {
-    tokenizer.consume(code);
-    tokenizer.exit(TokenType::ChunkText);
-    tokenizer.enter(TokenType::ChunkText);
-    let next_index = tokenizer.events.len() - 1;
-    tokenizer.events[previous_index].next = Some(next_index);
-    tokenizer.events[next_index].previous = Some(previous_index);
-    (
-        State::Fn(Box::new(move |t, c| paragraph_inside(t, c, next_index))),
-        None,
-    )
-}
-
-fn paragraph_end(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
-    tokenizer.exit(TokenType::ChunkText);
-    tokenizer.exit(TokenType::Paragraph);
-    after(tokenizer, code)
+fn before_paragraph(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    tokenizer.go(paragraph, after)(tokenizer, code)
 }
author	Titus Wormer <tituswormer@gmail.com>	2022-06-20 13:40:23 +0200
committer	Titus Wormer <tituswormer@gmail.com>	2022-06-20 13:40:23 +0200
commit	61271d73128f8553f8c4c17927828cde52a25eba (patch)
tree	5b812e04f9f9311ae22209843db257f34fc90d8d /src
parent	262aec96cece3e9dd55828397b8ec859e7cff606 (diff)
download	markdown-rs-61271d73128f8553f8c4c17927828cde52a25eba.tar.gz markdown-rs-61271d73128f8553f8c4c17927828cde52a25eba.tar.bz2 markdown-rs-61271d73128f8553f8c4c17927828cde52a25eba.zip