From 58ba69452a25c3d4b2059c01cc6cd837159d2f90 Mon Sep 17 00:00:00 2001
From: Titus Wormer <tituswormer@gmail.com>
Date: Thu, 16 Jun 2022 11:34:35 +0200
Subject: Add support for hard break escape

---
 readme.md                          |   6 +-
 src/compiler.rs                    |   6 ++
 src/construct/character_escape.rs  |  11 ++-
 src/construct/hard_break_escape.rs |  61 ++++++++++++++
 src/construct/mod.rs               |   3 +-
 src/content/text.rs                |   7 +-
 src/tokenizer.rs                   |  14 +++-
 tests/character_escape.rs          |  11 ++-
 tests/hard_break_escape.rs         | 167 +++++++++++++++++++++++++++++++++++++
 9 files changed, 265 insertions(+), 21 deletions(-)
 create mode 100644 src/construct/hard_break_escape.rs
 create mode 100644 tests/hard_break_escape.rs
diff --git a/readme.md b/readme.md
index a17e603..0e58750 100644
--- a/readme.md
+++ b/readme.md
@@ -111,7 +111,7 @@ cargo doc --document-private-items
 - [x] (1) code (text)
 - [ ] (3) content
 - [ ] (3) definition
-- [ ] (1) hard break escape
+- [x] (1) hard break escape
 - [x] heading (atx)
 - [ ] (1) heading (setext)
 - [x] html (flow)
@@ -122,6 +122,7 @@ cargo doc --document-private-items
 - [ ] (8) list
 - [ ] (1) paragraph
 - [x] thematic break
+- [ ] (1) trailing break escape
 
 ### Content types
 
@@ -146,11 +147,12 @@ cargo doc --document-private-items
   - [x] character escape
   - [x] character reference
   - [x] code (text)
-  - [ ] hard break escape
+  - [x] hard break escape
   - [x] html (text)
   - [ ] label end
   - [ ] label start (image)
   - [ ] label start (link)
+  - [ ] trailing break escape
 - [x] string
   - [x] character escape
   - [x] character reference
diff --git a/src/compiler.rs b/src/compiler.rs
index 6127231..3aacca0 100644
--- a/src/compiler.rs
+++ b/src/compiler.rs
@@ -150,6 +150,8 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St
                 | TokenType::BlankLineEnding
                 | TokenType::BlankLineWhitespace
                 | TokenType::Whitespace
+                | TokenType::HardBreakEscape
+                | TokenType::HardBreakEscapeMarker
                 | TokenType::HtmlFlowData
                 | TokenType::HtmlTextData
                 | TokenType::CodeFencedFence
@@ -192,6 +194,7 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St
                 | TokenType::CharacterEscapeMarker
                 | TokenType::CharacterReference
                 | TokenType::CharacterReferenceMarkerSemi
+                | TokenType::HardBreakEscapeMarker
                 | TokenType::Autolink
                 | TokenType::AutolinkMarker => {}
                 TokenType::HtmlFlow | TokenType::HtmlText => {
@@ -208,6 +211,9 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St
                 TokenType::Paragraph => {
                     buf_tail_mut(buffers).push("</p>".to_string());
                 }
+                TokenType::HardBreakEscape => {
+                    buf_tail_mut(buffers).push("<br />".to_string());
+                }
                 TokenType::CodeIndented | TokenType::CodeFenced => {
                     let seen_data =
                         code_flow_seen_data.expect("`code_flow_seen_data` must be defined");
diff --git a/src/construct/character_escape.rs b/src/construct/character_escape.rs
index 7bab42d..baedd4b 100644
--- a/src/construct/character_escape.rs
+++ b/src/construct/character_escape.rs
@@ -11,11 +11,11 @@
 //! slash, or a slash followed by anything other than an ASCII punctuation
 //! character, is exactly that: just a slash.
 //! To escape (most) arbitrary characters, use a
-//! [character reference][] instead
+//! [character reference][character_reference] instead
 //! (as in, `&amp;`, `&#123;`, or say `&#x9;`).
 //! It is also possible to escape a line ending in text with a similar
-//! construct: a backslash followed by a line ending (that is part of the
-//! construct instead of ending it).
+//! construct: a [hard break escape][hard_break_escape] is a backslash followed
+//! by a line ending (that is part of the construct instead of ending it).
 //!
 //! ## References
 //!
@@ -24,9 +24,8 @@
 //!
 //! [string]: crate::content::string
 //! [text]: crate::content::text
-//! [character reference]: crate::construct::character_reference
-//!
-//! <!-- To do: link `hard_break_escape` -->
+//! [character_reference]: crate::construct::character_reference
+//! [hard_break_escape]: crate::construct::hard_break_escape
 
 use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
 
diff --git a/src/construct/hard_break_escape.rs b/src/construct/hard_break_escape.rs
new file mode 100644
index 0000000..a7712d6
--- /dev/null
+++ b/src/construct/hard_break_escape.rs
@@ -0,0 +1,61 @@
+//! Hard break escapes are a construct that occurs in the [text][] content
+//! type.
+//!
+//! They’re formed with the following BNF:
+//!
+//! ```bnf
+//! ; Restriction: followed by a line ending (that is part of the construct
+//! ; instead of ending it).
+//! hard_break_escape ::= '\\'
+//! ```
+//! It is also possible to escape punctuation characters with a similar
+//! construct: a [character escape][character_escape] is a backslash followed
+//! by an ASCII punctuation character.
+//! Arbitrary characters can be escaped with
+//! [character reference][character_reference]s.
+//!
+//! ## References
+//!
+//! *   [`hard-break-escape.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/hard-break-escape.js)
+//! *   [*§ 6.7 Hard line breaks* in `CommonMark`](https://spec.commonmark.org/0.30/#hard-line-breaks)
+//!
+//! [text]: crate::content::text
+//! [character_escape]: crate::construct::character_escape
+//! [character_reference]: crate::construct::character_reference
+//!
+//! <!-- To do: link the trailing break escape construct once it exists -->
+
+use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
+
+/// Start of a hard break escape: a `\` is consumed as the marker; anything else is `Nok`.
+///
+/// ```markdown
+/// a|\
+/// ```
+pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::Char('\\') => {
+            tokenizer.enter(TokenType::HardBreakEscape);
+            tokenizer.enter(TokenType::HardBreakEscapeMarker);
+            tokenizer.consume(code);
+            tokenizer.exit(TokenType::HardBreakEscapeMarker);
+            (State::Fn(Box::new(inside)), None)
+        }
+        _ => (State::Nok, None),
+    }
+}
+
+/// After `\`: only a line ending, which is returned unconsumed, makes this a hard break escape.
+///
+/// ```markdown
+/// a\|
+/// ```
+fn inside(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => {
+            tokenizer.exit(TokenType::HardBreakEscape);
+            (State::Ok, Some(vec![code]))
+        }
+        _ => (State::Nok, None),
+    }
+}
diff --git a/src/construct/mod.rs b/src/construct/mod.rs
index 1fa57d5..27f4308 100644
--- a/src/construct/mod.rs
+++ b/src/construct/mod.rs
@@ -26,7 +26,7 @@
 //! *   [code (text)][code_text]
 //! *   content
 //! *   definition
-//! *   hard break escape
+//! *   [hard break escape][hard_break_escape]
 //! *   [heading (atx)][heading_atx]
 //! *   heading (setext)
 //! *   [html (flow)][html_flow]
@@ -60,6 +60,7 @@ pub mod character_reference;
 pub mod code_fenced;
 pub mod code_indented;
 pub mod code_text;
+pub mod hard_break_escape;
 pub mod heading_atx;
 pub mod html_flow;
 pub mod html_text;
diff --git a/src/content/text.rs b/src/content/text.rs
index 9d510cb..d4d5493 100644
--- a/src/content/text.rs
+++ b/src/content/text.rs
@@ -8,7 +8,7 @@
 //! *   [Autolink][crate::construct::autolink]
 //! *   Attention
 //! *   [HTML (text)][crate::construct::html_text]
-//! *   Hard break escape
+//! *   [Hard break escape][crate::construct::hard_break_escape]
 //! *   [Code (text)][crate::construct::code_text]
 //! *   Line ending
 //! *   Label start (image)
@@ -19,7 +19,7 @@
 use crate::construct::{
     autolink::start as autolink, character_escape::start as character_escape,
     character_reference::start as character_reference, code_text::start as code_text,
-    html_text::start as html_text,
+    hard_break_escape::start as hard_break_escape, html_text::start as html_text,
 };
 use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
 
@@ -35,9 +35,10 @@ use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
 pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
     match code {
         Code::None => (State::Ok, None),
-        _ => tokenizer.attempt_5(
+        _ => tokenizer.attempt_6(
             character_reference,
             character_escape,
+            hard_break_escape,
             autolink,
             html_text,
             code_text,
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index c5df42b..a63d209 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -60,6 +60,9 @@ pub enum TokenType {
 
     Data,
 
+    HardBreakEscape,
+    HardBreakEscapeMarker,
+
     HtmlFlow,
     HtmlFlowData,
 
@@ -441,6 +444,7 @@ impl Tokenizer {
             None,
             None,
             None,
+            None,
             done,
         )
     }
@@ -459,18 +463,20 @@ impl Tokenizer {
             Some(Box::new(c)),
             None,
             None,
+            None,
             done,
         )
     }
 
-    #[allow(clippy::many_single_char_names)]
-    pub fn attempt_5(
+    #[allow(clippy::too_many_arguments, clippy::many_single_char_names)]
+    pub fn attempt_6(
         &mut self,
         a: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static,
         b: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static,
         c: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static,
         d: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static,
         e: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static,
+        f: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static,
         done: impl FnOnce(bool) -> Box<StateFn> + 'static,
     ) -> Box<StateFn> {
         self.call_multiple(
@@ -480,6 +486,7 @@ impl Tokenizer {
             Some(Box::new(c)),
             Some(Box::new(d)),
             Some(Box::new(e)),
+            Some(Box::new(f)),
             done,
         )
     }
@@ -493,6 +500,7 @@ impl Tokenizer {
         c: Option<Box<StateFn>>,
         d: Option<Box<StateFn>>,
         e: Option<Box<StateFn>>,
+        f: Option<Box<StateFn>>,
         done: impl FnOnce(bool) -> Box<StateFn> + 'static,
     ) -> Box<StateFn> {
         if let Some(head) = a {
@@ -501,7 +509,7 @@ impl Tokenizer {
                     done(ok)
                 } else {
                     Box::new(move |tokenizer: &mut Tokenizer, code| {
-                        tokenizer.call_multiple(check, b, c, d, e, None, done)(tokenizer, code)
+                        tokenizer.call_multiple(check, b, c, d, e, f, None, done)(tokenizer, code)
                     })
                 }
             };
diff --git a/tests/character_escape.rs b/tests/character_escape.rs
index aae0b58..c81760d 100644
--- a/tests/character_escape.rs
+++ b/tests/character_escape.rs
@@ -30,12 +30,11 @@ fn character_escape() {
         "should escape other constructs"
     );
 
-    // To do: hard break.
-    // assert_eq!(
-    //     micromark("foo\\\nbar"),
-    //     "<p>foo<br />\nbar</p>",
-    //     "should escape a line break"
-    // );
+    assert_eq!(
+        micromark("foo\\\nbar"),
+        "<p>foo<br />\nbar</p>",
+        "should escape a line break"
+    );
 
     assert_eq!(
         micromark("`` \\[\\` ``"),
diff --git a/tests/hard_break_escape.rs b/tests/hard_break_escape.rs
new file mode 100644
index 0000000..fe4c82b
--- /dev/null
+++ b/tests/hard_break_escape.rs
@@ -0,0 +1,167 @@
+extern crate micromark;
+use micromark::{micromark};
+
+#[test]
+fn hard_break_escape() {
+    // To do: trailing.
+    // assert_eq!(
+    //     micromark("foo  \nbaz"),
+    //     "<p>foo<br />\nbaz</p>",
+    //     "should support two trailing spaces to form a hard break"
+    // );
+
+    assert_eq!(
+        micromark("foo\\\nbaz"),
+        "<p>foo<br />\nbaz</p>",
+        "should support a backslash to form a hard break"
+    );
+
+    // To do: trailing.
+    // assert_eq!(
+    //     micromark("foo       \nbaz"),
+    //     "<p>foo<br />\nbaz</p>",
+    //     "should support multiple trailing spaces"
+    // );
+
+    // To do: trailing.
+    // assert_eq!(
+    //     micromark("foo  \n     bar"),
+    //     "<p>foo<br />\nbar</p>",
+    //     "should support leading spaces after a trailing hard break"
+    // );
+
+    // To do: trim paragraph whitespace.
+    // assert_eq!(
+    //     micromark("foo\\\n     bar"),
+    //     "<p>foo<br />\nbar</p>",
+    //     "should support leading spaces after an escape hard break"
+    // );
+
+    // To do: trailing, attention.
+    // assert_eq!(
+    //     micromark("*foo  \nbar*"),
+    //     "<p><em>foo<br />\nbar</em></p>",
+    //     "should support trailing hard breaks in emphasis"
+    // );
+
+    // To do: attention.
+    // assert_eq!(
+    //     micromark("*foo\\\nbar*"),
+    //     "<p><em>foo<br />\nbar</em></p>",
+    //     "should support escape hard breaks in emphasis"
+    // );
+
+    assert_eq!(
+        micromark("`code  \ntext`"),
+        "<p><code>code   text</code></p>",
+        "should not support trailing hard breaks in code"
+    );
+
+    assert_eq!(
+        micromark("``code\\\ntext``"),
+        "<p><code>code\\ text</code></p>",
+        "should not support escape hard breaks in code"
+    );
+
+    // To do: paragraph trimming.
+    // assert_eq!(
+    //     micromark("foo  "),
+    //     "<p>foo</p>",
+    //     "should not support trailing hard breaks at the end of a paragraph"
+    // );
+
+    assert_eq!(
+        micromark("foo\\"),
+        "<p>foo\\</p>",
+        "should not support escape hard breaks at the end of a paragraph"
+    );
+
+    assert_eq!(
+        micromark("### foo\\"),
+        "<h3>foo\\</h3>",
+        "should not support escape hard breaks at the end of a heading"
+    );
+
+    assert_eq!(
+        micromark("### foo  "),
+        "<h3>foo</h3>",
+        "should not support trailing hard breaks at the end of a heading"
+    );
+
+    // To do: paragraph trimming.
+    // assert_eq!(
+    //     micromark("aaa  \t\nbb"),
+    //     "<p>aaa\nbb</p>",
+    //     "should support a mixed line suffix (1)"
+    // );
+
+    // To do: paragraph trimming.
+    // assert_eq!(
+    //     micromark("aaa\t  \nbb"),
+    //     "<p>aaa\nbb</p>",
+    //     "should support a mixed line suffix (2)"
+    // );
+
+    // To do: paragraph trimming.
+    // assert_eq!(
+    //     micromark("aaa  \t  \nbb"),
+    //     "<p>aaa\nbb</p>",
+    //     "should support a mixed line suffix (3)"
+    // );
+
+    // To do: trailing.
+    // assert_eq!(
+    //     micromark("aaa\0  \nbb"),
+    //     "<p>aaa�<br />\nbb</p>",
+    //     "should support a hard break after a replacement character"
+    // );
+
+    // To do: trailing.
+    // assert_eq!(
+    //     micromark("aaa\0\t\nbb"),
+    //     "<p>aaa�\nbb</p>",
+    //     "should support a line suffix after a replacement character"
+    // );
+
+    // To do: attention, trailing.
+    // assert_eq!(
+    //     micromark("*a*  \nbb"),
+    //     "<p><em>a</em><br />\nbb</p>",
+    //     "should support a hard break after a span"
+    // );
+
+    // To do: attention, trailing.
+    // assert_eq!(
+    //     micromark("*a*\t\nbb"),
+    //     "<p><em>a</em>\nbb</p>",
+    //     "should support a line suffix after a span"
+    // );
+
+    // To do: attention, trailing.
+    // assert_eq!(
+    //     micromark("*a*  \t\nbb"),
+    //     "<p><em>a</em>\nbb</p>",
+    //     "should support a mixed line suffix after a span (1)"
+    // );
+
+    // To do: attention, trailing.
+    // assert_eq!(
+    //     micromark("*a*\t  \nbb"),
+    //     "<p><em>a</em>\nbb</p>",
+    //     "should support a mixed line suffix after a span (2)"
+    // );
+
+    // To do: attention, trailing.
+    // assert_eq!(
+    //     micromark("*a*  \t  \nbb"),
+    //     "<p><em>a</em>\nbb</p>",
+    //     "should support a mixed line suffix after a span (3)"
+    // );
+
+    // To do: turning off things.
+    // assert_eq!(
+    //   micromark("a\\\nb", {extensions: [{disable: {null: ["hardBreakEscape"]}}]}),
+    //   "<p>a\\\nb</p>",
+    //   "should support turning off hard break (escape)"
+    // );
+}
-- 
cgit