From 7350acc692a79d9d4cf56afbc53ac3c9f2a6237c Mon Sep 17 00:00:00 2001
From: Titus Wormer
Date: Thu, 16 Jun 2022 12:55:50 +0200
Subject: Add support for hard break (trailing)
---
src/compiler.rs | 5 ++-
src/constant.rs | 6 +++
src/construct/character_escape.rs | 2 +-
src/construct/hard_break_escape.rs | 19 ++++++---
src/construct/hard_break_trailing.rs | 83 ++++++++++++++++++++++++++++++++++++
src/construct/mod.rs | 4 +-
src/content/text.rs | 15 ++++---
src/tokenizer.rs | 13 +++++-
src/util/span.rs | 2 +-
9 files changed, 132 insertions(+), 17 deletions(-)
create mode 100644 src/construct/hard_break_trailing.rs
(limited to 'src')
diff --git a/src/compiler.rs b/src/compiler.rs
index 3aacca0..9f84a38 100644
--- a/src/compiler.rs
+++ b/src/compiler.rs
@@ -152,6 +152,8 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St
| TokenType::Whitespace
| TokenType::HardBreakEscape
| TokenType::HardBreakEscapeMarker
+ | TokenType::HardBreakTrailing
+ | TokenType::HardBreakTrailingSpace
| TokenType::HtmlFlowData
| TokenType::HtmlTextData
| TokenType::CodeFencedFence
@@ -195,6 +197,7 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St
| TokenType::CharacterReference
| TokenType::CharacterReferenceMarkerSemi
| TokenType::HardBreakEscapeMarker
+ | TokenType::HardBreakTrailingSpace
| TokenType::Autolink
| TokenType::AutolinkMarker => {}
TokenType::HtmlFlow | TokenType::HtmlText => {
@@ -211,7 +214,7 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St
TokenType::Paragraph => {
buf_tail_mut(buffers).push("</p>".to_string());
}
- TokenType::HardBreakEscape => {
+ TokenType::HardBreakEscape | TokenType::HardBreakTrailing => {
buf_tail_mut(buffers).push("<br />".to_string());
}
TokenType::CodeIndented | TokenType::CodeFenced => {
diff --git a/src/constant.rs b/src/constant.rs
index d2fb238..ff9e62e 100644
--- a/src/constant.rs
+++ b/src/constant.rs
@@ -44,6 +44,12 @@ pub const AUTOLINK_SCHEME_SIZE_MAX: usize = 32;
/// [autolink]: crate::construct::autolink
pub const AUTOLINK_DOMAIN_SIZE_MAX: usize = 63;
+/// The number of spaces needed, before a line ending, for a [hard break
+/// (trailing)][hard_break_trailing] to form.
+///
+/// [hard_break_trailing]: crate::construct::hard_break_trailing
+pub const HARD_BREAK_PREFIX_SIZE_MIN: usize = 2;
+
/// The number of markers needed for a [thematic break][thematic_break] to form.
///
/// Like many things in markdown, the number is `3`.
diff --git a/src/construct/character_escape.rs b/src/construct/character_escape.rs
index baedd4b..743cbf8 100644
--- a/src/construct/character_escape.rs
+++ b/src/construct/character_escape.rs
@@ -14,7 +14,7 @@
//! [character reference][character_reference] instead
//! (as in, `&amp;`, `&#123;`, or say `&#x9;`).
//! It is also possible to escape a line ending in text with a similar
-//! construct: a [hard break escape][hard_break_escape] is a backslash followed
+//! construct: a [hard break (escape)][hard_break_escape] is a backslash followed
//! by a line ending (that is part of the construct instead of ending it).
//!
//! ## References
diff --git a/src/construct/hard_break_escape.rs b/src/construct/hard_break_escape.rs
index a7712d6..51da953 100644
--- a/src/construct/hard_break_escape.rs
+++ b/src/construct/hard_break_escape.rs
@@ -1,4 +1,4 @@
-//! Hard break escapes are a construct that occurs in the [text][] content
+//! Hard break (escape) is a construct that occurs in the [text][] content
//! type.
//!
//! They’re formed with the following BNF:
@@ -8,6 +8,15 @@
//! ; instead of ending it).
//! hard_break_escape ::= '\\'
//! ```
+//!
+//! Hard breaks in markdown relate to the HTML element `<br>`.
+//! See [*§ 4.5.27 The `br` element* in the HTML spec][html] for more info.
+//!
+//! It is also possible to create a hard break with a
+//! [hard break (trailing)][hard_break_trailing].
+//! That construct is not recommended because trailing spaces are typically
+//! invisible in editors, or even automatically removed, making them hard to use.
+//!
//! It is also possible to escape punctuation characters with a similar
//! construct: a [character escape][character_escape] is a backslash followed
//! by an ASCII punctuation character.
@@ -22,12 +31,12 @@
//! [text]: crate::content::text
//! [character_escape]: crate::construct::character_escape
//! [character_reference]: crate::construct::character_reference
-//!
-//!
+//! [hard_break_trailing]: crate::construct::hard_break_trailing
+//! [html]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-br-element
use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
-/// Start of a hard break escape.
+/// Start of a hard break (escape).
///
/// ```markdown
/// a|\
@@ -45,7 +54,7 @@ pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
}
}
-/// At the end of a hard break escape, after `\`.
+/// At the end of a hard break (escape), after `\`.
///
/// ```markdown
/// a\|
diff --git a/src/construct/hard_break_trailing.rs b/src/construct/hard_break_trailing.rs
new file mode 100644
index 0000000..46337c5
--- /dev/null
+++ b/src/construct/hard_break_trailing.rs
@@ -0,0 +1,83 @@
+//! Hard break (trailing) is a construct that occurs in the [text][] content
+//! type.
+//!
+//! They’re formed with the following BNF:
+//!
+//! ```bnf
+//! ; Restriction: followed by a line ending (that is part of the construct
+//! ; instead of ending it).
+//! hard_break_trailing ::= 2*' '
+//! ```
+//!
+//! The minimum number of spaces is defined in
+//! [`HARD_BREAK_PREFIX_SIZE_MIN`][hard_break_prefix_size_min].
+//!
+//! Hard breaks in markdown relate to the HTML element `<br>`.
+//! See [*§ 4.5.27 The `br` element* in the HTML spec][html] for more info.
+//!
+//! It is also possible to create a hard break with a similar construct: a
+//! [hard break (escape)][hard_break_escape] is a backslash followed
+//! by a line ending.
+//! That construct is recommended because it is similar to a
+//! [character escape][character_escape] and similar to how line endings can be
+//! “escaped” in other languages.
+//! Trailing spaces are typically invisible in editors, or even automatically
+//! removed, making hard break (trailing) hard to use.
+//!
+//! ## References
+//!
+//! * [`lib/initialize/text.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark/dev/lib/initialize/text.js)
+//! * [*§ 6.7 Hard line breaks* in `CommonMark`](https://spec.commonmark.org/0.30/#hard-line-breaks)
+//!
+//! [text]: crate::content::text
+//! [hard_break_escape]: crate::construct::hard_break_escape
+//! [character_escape]: crate::construct::character_escape
+//! [hard_break_prefix_size_min]: crate::constant::HARD_BREAK_PREFIX_SIZE_MIN
+//! [html]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-br-element
+
+use crate::constant::HARD_BREAK_PREFIX_SIZE_MIN;
+use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
+
+/// Start of a hard break (trailing).
+///
+/// ```markdown
+/// a| ␊
+/// b
+/// ```
+pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ Code::Char(' ') => {
+ tokenizer.enter(TokenType::HardBreakTrailing);
+ tokenizer.enter(TokenType::HardBreakTrailingSpace);
+ tokenizer.consume(code);
+ (State::Fn(Box::new(|t, c| inside(t, c, 1))), None)
+ }
+ _ => (State::Nok, None),
+ }
+}
+
+/// Inside the hard break (trailing).
+///
+/// ```markdown
+/// a |␊
+/// b
+/// ```
+fn inside(tokenizer: &mut Tokenizer, code: Code, size: usize) -> StateFnResult {
+ match code {
+ Code::Char(' ') => {
+ tokenizer.consume(code);
+ (
+ State::Fn(Box::new(move |t, c| inside(t, c, size + 1))),
+ None,
+ )
+ }
+ Code::CarriageReturnLineFeed | Code::Char('\r' | '\n')
+ if size >= HARD_BREAK_PREFIX_SIZE_MIN =>
+ {
+ tokenizer.exit(TokenType::HardBreakTrailingSpace);
+ tokenizer.exit(TokenType::HardBreakTrailing);
+ (State::Ok, Some(vec![code]))
+ }
+ _ => (State::Nok, None),
+ }
+}
diff --git a/src/construct/mod.rs b/src/construct/mod.rs
index 27f4308..880d055 100644
--- a/src/construct/mod.rs
+++ b/src/construct/mod.rs
@@ -26,7 +26,8 @@
//! * [code (text)][code_text]
//! * content
//! * definition
-//! * [hard break escape][hard_break_escape]
+//! * [hard break (escape)][hard_break_escape]
+//! * [hard break (trailing)][hard_break_trailing]
//! * [heading (atx)][heading_atx]
//! * heading (setext)
//! * [html (flow)][html_flow]
@@ -61,6 +62,7 @@ pub mod code_fenced;
pub mod code_indented;
pub mod code_text;
pub mod hard_break_escape;
+pub mod hard_break_trailing;
pub mod heading_atx;
pub mod html_flow;
pub mod html_text;
diff --git a/src/content/text.rs b/src/content/text.rs
index d4d5493..f61b390 100644
--- a/src/content/text.rs
+++ b/src/content/text.rs
@@ -8,7 +8,8 @@
//! * [Autolink][crate::construct::autolink]
//! * Attention
//! * [HTML (text)][crate::construct::html_text]
-//! * [Hard break escape][crate::construct::hard_break_escape]
+//! * [Hard break (escape)][crate::construct::hard_break_escape]
+//! * [Hard break (trailing)][crate::construct::hard_break_trailing]
//! * [Code (text)][crate::construct::code_text]
//! * Line ending
//! * Label start (image)
@@ -19,7 +20,8 @@
use crate::construct::{
autolink::start as autolink, character_escape::start as character_escape,
character_reference::start as character_reference, code_text::start as code_text,
- hard_break_escape::start as hard_break_escape, html_text::start as html_text,
+ hard_break_escape::start as hard_break_escape,
+ hard_break_trailing::start as hard_break_trailing, html_text::start as html_text,
};
use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
@@ -35,10 +37,11 @@ use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
match code {
Code::None => (State::Ok, None),
- _ => tokenizer.attempt_6(
+ _ => tokenizer.attempt_7(
character_reference,
character_escape,
hard_break_escape,
+ hard_break_trailing,
autolink,
html_text,
code_text,
@@ -78,12 +81,12 @@ fn before_data(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
/// ```
fn in_data(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
match code {
- Code::None => {
+ Code::None | Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => {
tokenizer.exit(TokenType::Data);
- (State::Ok, None)
+ before_data(tokenizer, code)
}
// To do: somehow get these markers from constructs.
- Code::CarriageReturnLineFeed | Code::Char('\r' | '\n' | '&' | '<' | '\\' | '`') => {
+ Code::Char(' ' | '&' | '<' | '\\' | '`') => {
tokenizer.exit(TokenType::Data);
start(tokenizer, code)
}
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index a63d209..da45ee5 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -62,6 +62,8 @@ pub enum TokenType {
HardBreakEscape,
HardBreakEscapeMarker,
+ HardBreakTrailing,
+ HardBreakTrailingSpace,
HtmlFlow,
HtmlFlowData,
@@ -445,6 +447,7 @@ impl Tokenizer {
None,
None,
None,
+ None,
done,
)
}
@@ -464,12 +467,13 @@ impl Tokenizer {
None,
None,
None,
+ None,
done,
)
}
#[allow(clippy::too_many_arguments, clippy::many_single_char_names)]
- pub fn attempt_6(
+ pub fn attempt_7(
&mut self,
a: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static,
b: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static,
@@ -477,6 +481,7 @@ impl Tokenizer {
d: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static,
e: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static,
f: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static,
+ g: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static,
done: impl FnOnce(bool) -> Box<StateFn> + 'static,
) -> Box<StateFn> {
self.call_multiple(
@@ -487,6 +492,7 @@ impl Tokenizer {
Some(Box::new(d)),
Some(Box::new(e)),
Some(Box::new(f)),
+ Some(Box::new(g)),
done,
)
}
@@ -501,6 +507,7 @@ impl Tokenizer {
d: Option<Box<StateFn>>,
e: Option<Box<StateFn>>,
f: Option<Box<StateFn>>,
+ g: Option<Box<StateFn>>,
done: impl FnOnce(bool) -> Box<StateFn> + 'static,
) -> Box<StateFn> {
if let Some(head) = a {
@@ -509,7 +516,9 @@ impl Tokenizer {
done(ok)
} else {
Box::new(move |tokenizer: &mut Tokenizer, code| {
- tokenizer.call_multiple(check, b, c, d, e, f, None, done)(tokenizer, code)
+ tokenizer.call_multiple(check, b, c, d, e, f, g, None, done)(
+ tokenizer, code,
+ )
})
}
};
diff --git a/src/util/span.rs b/src/util/span.rs
index c48549b..02811cc 100644
--- a/src/util/span.rs
+++ b/src/util/span.rs
@@ -36,7 +36,7 @@ pub fn from_exit_event(events: &[Event], index: usize) -> Span {
assert_eq!(
exit.event_type,
EventType::Exit,
- "expected `get_span` to be called on `exit` event"
+ "expected `from_exit_event` to be called on `exit` event"
);
let mut enter_index = index - 1;
--
cgit