From 58ba69452a25c3d4b2059c01cc6cd837159d2f90 Mon Sep 17 00:00:00 2001
From: Titus Wormer
Date: Thu, 16 Jun 2022 11:34:35 +0200
Subject: Add support for hard break escape
---
readme.md | 6 +-
src/compiler.rs | 6 ++
src/construct/character_escape.rs | 11 ++-
src/construct/hard_break_escape.rs | 61 ++++++++++++++
src/construct/mod.rs | 3 +-
src/content/text.rs | 7 +-
src/tokenizer.rs | 14 +++-
tests/character_escape.rs | 11 ++-
tests/hard_break_escape.rs | 167 +++++++++++++++++++++++++++++++++++++
9 files changed, 265 insertions(+), 21 deletions(-)
create mode 100644 src/construct/hard_break_escape.rs
create mode 100644 tests/hard_break_escape.rs
diff --git a/readme.md b/readme.md
index a17e603..0e58750 100644
--- a/readme.md
+++ b/readme.md
@@ -111,7 +111,7 @@ cargo doc --document-private-items
- [x] (1) code (text)
- [ ] (3) content
- [ ] (3) definition
-- [ ] (1) hard break escape
+- [x] (1) hard break escape
- [x] heading (atx)
- [ ] (1) heading (setext)
- [x] html (flow)
@@ -122,6 +122,7 @@ cargo doc --document-private-items
- [ ] (8) list
- [ ] (1) paragraph
- [x] thematic break
+- [ ] (1) trailing break escape
### Content types
@@ -146,11 +147,12 @@ cargo doc --document-private-items
- [x] character escape
- [x] character reference
- [x] code (text)
- - [ ] hard break escape
+ - [x] hard break escape
- [x] html (text)
- [ ] label end
- [ ] label start (image)
- [ ] label start (link)
+ - [ ] trailing break escape
- [x] string
- [x] character escape
- [x] character reference
diff --git a/src/compiler.rs b/src/compiler.rs
index 6127231..3aacca0 100644
--- a/src/compiler.rs
+++ b/src/compiler.rs
@@ -150,6 +150,8 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St
| TokenType::BlankLineEnding
| TokenType::BlankLineWhitespace
| TokenType::Whitespace
+ | TokenType::HardBreakEscape
+ | TokenType::HardBreakEscapeMarker
| TokenType::HtmlFlowData
| TokenType::HtmlTextData
| TokenType::CodeFencedFence
@@ -192,6 +194,7 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St
| TokenType::CharacterEscapeMarker
| TokenType::CharacterReference
| TokenType::CharacterReferenceMarkerSemi
+ | TokenType::HardBreakEscapeMarker
| TokenType::Autolink
| TokenType::AutolinkMarker => {}
TokenType::HtmlFlow | TokenType::HtmlText => {
@@ -208,6 +211,9 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St
TokenType::Paragraph => {
buf_tail_mut(buffers).push("
".to_string());
}
+ TokenType::HardBreakEscape => {
+ buf_tail_mut(buffers).push("
".to_string());
+ }
TokenType::CodeIndented | TokenType::CodeFenced => {
let seen_data =
code_flow_seen_data.expect("`code_flow_seen_data` must be defined");
diff --git a/src/construct/character_escape.rs b/src/construct/character_escape.rs
index 7bab42d..baedd4b 100644
--- a/src/construct/character_escape.rs
+++ b/src/construct/character_escape.rs
@@ -11,11 +11,11 @@
//! slash, or a slash followed by anything other than an ASCII punctuation
//! character, is exactly that: just a slash.
//! To escape (most) arbitrary characters, use a
-//! [character reference][] instead
+//! [character reference][character_reference] instead
//! (as in, `&amp;`, `&#123;`, or say `&#x9;`).
//! It is also possible to escape a line ending in text with a similar
-//! construct: a backslash followed by a line ending (that is part of the
-//! construct instead of ending it).
+//! construct: a [hard break escape][hard_break_escape] is a backslash followed
+//! by a line ending (that is part of the construct instead of ending it).
//!
//! ## References
//!
@@ -24,9 +24,8 @@
//!
//! [string]: crate::content::string
//! [text]: crate::content::text
-//! [character reference]: crate::construct::character_reference
-//!
-//!
+//! [character_reference]: crate::construct::character_reference
+//! [hard_break_escape]: crate::construct::hard_break_escape
use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
diff --git a/src/construct/hard_break_escape.rs b/src/construct/hard_break_escape.rs
new file mode 100644
index 0000000..a7712d6
--- /dev/null
+++ b/src/construct/hard_break_escape.rs
@@ -0,0 +1,61 @@
+//! Hard break escapes are a construct that occurs in the [text][] content
+//! type.
+//!
+//! They’re formed with the following BNF:
+//!
+//! ```bnf
+//! ; Restriction: followed by a line ending (that is part of the construct
+//! ; instead of ending it).
+//! hard_break_escape ::= '\\'
+//! ```
+//! It is also possible to escape punctuation characters with a similar
+//! construct: a [character escape][character_escape] is a backslash followed
+//! by an ASCII punctuation character.
+//! Arbitrary characters can be escaped with
+//! [character reference][character_reference]s.
+//!
+//! ## References
+//!
+//! * [`hard-break-escape.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/hard-break-escape.js)
+//! * [*§ 6.7 Hard line breaks* in `CommonMark`](https://spec.commonmark.org/0.30/#hard-line-breaks)
+//!
+//! [text]: crate::content::text
+//! [character_escape]: crate::construct::character_escape
+//! [character_reference]: crate::construct::character_reference
+//!
+//!
+
+use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
+
+/// Start of a hard break escape: only a backslash is accepted as the marker.
+///
+/// ```markdown
+/// a|\
+/// ```
+pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ Code::Char('\\') => {
+ tokenizer.enter(TokenType::HardBreakEscape);
+ tokenizer.enter(TokenType::HardBreakEscapeMarker);
+ tokenizer.consume(code);
+ tokenizer.exit(TokenType::HardBreakEscapeMarker); // outer token is left open for `inside`
+ (State::Fn(Box::new(inside)), None) // the next code decides the outcome
+ }
+ _ => (State::Nok, None), // anything other than `\` is not this construct
+ }
+}
+
+/// After `\`: yields `Ok` only when the next code is a line ending.
+///
+/// ```markdown
+/// a\|
+/// ```
+fn inside(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => {
+ tokenizer.exit(TokenType::HardBreakEscape);
+ (State::Ok, Some(vec![code])) // line ending is not consumed; it is handed back in the result
+ }
+ _ => (State::Nok, None), // `\` not followed by a line ending: no hard break
+ }
+}
diff --git a/src/construct/mod.rs b/src/construct/mod.rs
index 1fa57d5..27f4308 100644
--- a/src/construct/mod.rs
+++ b/src/construct/mod.rs
@@ -26,7 +26,7 @@
//! * [code (text)][code_text]
//! * content
//! * definition
-//! * hard break escape
+//! * [hard break escape][hard_break_escape]
//! * [heading (atx)][heading_atx]
//! * heading (setext)
//! * [html (flow)][html_flow]
@@ -60,6 +60,7 @@ pub mod character_reference;
pub mod code_fenced;
pub mod code_indented;
pub mod code_text;
+pub mod hard_break_escape;
pub mod heading_atx;
pub mod html_flow;
pub mod html_text;
diff --git a/src/content/text.rs b/src/content/text.rs
index 9d510cb..d4d5493 100644
--- a/src/content/text.rs
+++ b/src/content/text.rs
@@ -8,7 +8,7 @@
//! * [Autolink][crate::construct::autolink]
//! * Attention
//! * [HTML (text)][crate::construct::html_text]
-//! * Hard break escape
+//! * [Hard break escape][crate::construct::hard_break_escape]
//! * [Code (text)][crate::construct::code_text]
//! * Line ending
//! * Label start (image)
@@ -19,7 +19,7 @@
use crate::construct::{
autolink::start as autolink, character_escape::start as character_escape,
character_reference::start as character_reference, code_text::start as code_text,
- html_text::start as html_text,
+ hard_break_escape::start as hard_break_escape, html_text::start as html_text,
};
use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
@@ -35,9 +35,10 @@ use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
match code {
Code::None => (State::Ok, None),
- _ => tokenizer.attempt_5(
+ _ => tokenizer.attempt_6(
character_reference,
character_escape,
+ hard_break_escape,
autolink,
html_text,
code_text,
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index c5df42b..a63d209 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -60,6 +60,9 @@ pub enum TokenType {
Data,
+ HardBreakEscape,
+ HardBreakEscapeMarker,
+
HtmlFlow,
HtmlFlowData,
@@ -441,6 +444,7 @@ impl Tokenizer {
None,
None,
None,
+ None,
done,
)
}
@@ -459,18 +463,20 @@ impl Tokenizer {
Some(Box::new(c)),
None,
None,
+ None,
done,
)
}
- #[allow(clippy::many_single_char_names)]
- pub fn attempt_5(
+ #[allow(clippy::too_many_arguments, clippy::many_single_char_names)]
+ pub fn attempt_6(
&mut self,
a: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static,
b: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static,
c: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static,
d: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static,
e: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static,
+ f: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static,
done: impl FnOnce(bool) -> Box + 'static,
) -> Box {
self.call_multiple(
@@ -480,6 +486,7 @@ impl Tokenizer {
Some(Box::new(c)),
Some(Box::new(d)),
Some(Box::new(e)),
+ Some(Box::new(f)),
done,
)
}
@@ -493,6 +500,7 @@ impl Tokenizer {
c: Option>,
d: Option>,
e: Option>,
+ f: Option>,
done: impl FnOnce(bool) -> Box + 'static,
) -> Box {
if let Some(head) = a {
@@ -501,7 +509,7 @@ impl Tokenizer {
done(ok)
} else {
Box::new(move |tokenizer: &mut Tokenizer, code| {
- tokenizer.call_multiple(check, b, c, d, e, None, done)(tokenizer, code)
+ tokenizer.call_multiple(check, b, c, d, e, f, None, done)(tokenizer, code)
})
}
};
diff --git a/tests/character_escape.rs b/tests/character_escape.rs
index aae0b58..c81760d 100644
--- a/tests/character_escape.rs
+++ b/tests/character_escape.rs
@@ -30,12 +30,11 @@ fn character_escape() {
"should escape other constructs"
);
- // To do: hard break.
- // assert_eq!(
- // micromark("foo\\\nbar"),
- // "foo
\nbar
",
- // "should escape a line break"
- // );
+ assert_eq!(
+ micromark("foo\\\nbar"),
+ "foo
\nbar
",
+ "should escape a line break"
+ );
assert_eq!(
micromark("`` \\[\\` ``"),
diff --git a/tests/hard_break_escape.rs b/tests/hard_break_escape.rs
new file mode 100644
index 0000000..fe4c82b
--- /dev/null
+++ b/tests/hard_break_escape.rs
@@ -0,0 +1,167 @@
+extern crate micromark;
+use micromark::{micromark};
+
+#[test]
+fn hard_break_escape() {
+ // To do: trailing.
+ // assert_eq!(
+ // micromark("foo \nbaz"),
+ // "foo
\nbaz
",
+ // "should support two trailing spaces to form a hard break"
+ // );
+
+ assert_eq!(
+ micromark("foo\\\nbaz"),
+ "foo
\nbaz
",
+ "should support a backslash to form a hard break"
+ );
+
+ // To do: trailing.
+ // assert_eq!(
+ // micromark("foo \nbaz"),
+ // "foo
\nbaz
",
+ // "should support multiple trailing spaces"
+ // );
+
+ // To do: trailing.
+ // assert_eq!(
+ // micromark("foo \n bar"),
+ // "foo
\nbar
",
+ // "should support leading spaces after a trailing hard break"
+ // );
+
+ // To do: trim paragraph whitespace.
+ // assert_eq!(
+ // micromark("foo\\\n bar"),
+ // "foo
\nbar
",
+ // "should support leading spaces after an escape hard break"
+ // );
+
+ // To do: trailing, attention.
+ // assert_eq!(
+ // micromark("*foo \nbar*"),
+ // "foo
\nbar
",
+ // "should support trailing hard breaks in emphasis"
+ // );
+
+ // To do: attention.
+ // assert_eq!(
+ // micromark("*foo\\\nbar*"),
+ // "foo
\nbar
",
+ // "should support escape hard breaks in emphasis"
+ // );
+
+ assert_eq!(
+ micromark("`code \ntext`"),
+ "code text
",
+ "should not support trailing hard breaks in code"
+ );
+
+ assert_eq!(
+ micromark("``code\\\ntext``"),
+ "code\\ text
",
+ "should not support escape hard breaks in code"
+ );
+
+ // To do: paragraph trimming.
+ // assert_eq!(
+ // micromark("foo "),
+ // "foo
",
+ // "should not support trailing hard breaks at the end of a paragraph"
+ // );
+
+ assert_eq!(
+ micromark("foo\\"),
+ "foo\\
",
+ "should not support escape hard breaks at the end of a paragraph"
+ );
+
+ assert_eq!(
+ micromark("### foo\\"),
+ "foo\\
",
+ "should not support escape hard breaks at the end of a heading"
+ );
+
+ assert_eq!(
+ micromark("### foo "),
+ "foo
",
+ "should not support trailing hard breaks at the end of a heading"
+ );
+
+ // To do: paragraph trimming.
+ // assert_eq!(
+ // micromark("aaa \t\nbb"),
+ // "aaa\nbb
",
+ // "should support a mixed line suffix (1)"
+ // );
+
+ // To do: paragraph trimming.
+ // assert_eq!(
+ // micromark("aaa\t \nbb"),
+ // "aaa\nbb
",
+ // "should support a mixed line suffix (2)"
+ // );
+
+ // To do: paragraph trimming.
+ // assert_eq!(
+ // micromark("aaa \t \nbb"),
+ // "aaa\nbb
",
+ // "should support a mixed line suffix (3)"
+ // );
+
+ // To do: trailing.
+ // assert_eq!(
+ // micromark("aaa\0 \nbb"),
+ // "aaa�
\nbb
",
+ // "should support a hard break after a replacement character"
+ // );
+
+ // To do: trailing.
+ // assert_eq!(
+ // micromark("aaa\0\t\nbb"),
+ // "aaa�\nbb
",
+ // "should support a line suffix after a replacement character"
+ // );
+
+ // To do: attention, trailing.
+ // assert_eq!(
+ // micromark("*a* \nbb"),
+ // "a
\nbb
",
+ // "should support a hard break after a span"
+ // );
+
+ // To do: attention, trailing.
+ // assert_eq!(
+ // micromark("*a*\t\nbb"),
+ // "a\nbb
",
+ // "should support a line suffix after a span"
+ // );
+
+ // To do: attention, trailing.
+ // assert_eq!(
+ // micromark("*a* \t\nbb"),
+ // "a\nbb
",
+ // "should support a mixed line suffix after a span (1)"
+ // );
+
+ // To do: attention, trailing.
+ // assert_eq!(
+ // micromark("*a*\t \nbb"),
+ // "a\nbb
",
+ // "should support a mixed line suffix after a span (2)"
+ // );
+
+ // To do: attention, trailing.
+ // assert_eq!(
+ // micromark("*a* \t \nbb"),
+ // "a\nbb
",
+ // "should support a mixed line suffix after a span (3)"
+ // );
+
+ // // To do: turning off things.
+ // assert_eq!(
+ // micromark("a\\\nb", {extensions: [{disable: {null: ["hardBreakEscape"]}}]}),
+ // "a\\\nb
",
+ // "should support turning off hard break (escape)"
+ // );
+}
--
cgit