From 7350acc692a79d9d4cf56afbc53ac3c9f2a6237c Mon Sep 17 00:00:00 2001 From: Titus Wormer Date: Thu, 16 Jun 2022 12:55:50 +0200 Subject: Add support for hard break (trailing) --- readme.md | 8 +-- src/compiler.rs | 5 +- src/constant.rs | 6 ++ src/construct/character_escape.rs | 2 +- src/construct/hard_break_escape.rs | 19 +++-- src/construct/hard_break_trailing.rs | 83 ++++++++++++++++++++++ src/construct/mod.rs | 4 +- src/content/text.rs | 15 ++-- src/tokenizer.rs | 13 +++- src/util/span.rs | 2 +- tests/hard_break_escape.rs | 121 +------------------------------ tests/hard_break_trailing.rs | 133 +++++++++++++++++++++++++++++++++++ 12 files changed, 271 insertions(+), 140 deletions(-) create mode 100644 src/construct/hard_break_trailing.rs create mode 100644 tests/hard_break_trailing.rs diff --git a/readme.md b/readme.md index 0e58750..0cbff1e 100644 --- a/readme.md +++ b/readme.md @@ -111,7 +111,8 @@ cargo doc --document-private-items - [x] (1) code (text) - [ ] (3) content - [ ] (3) definition -- [x] (1) hard break escape +- [x] hard break (escape) +- [x] hard break (trailing) - [x] heading (atx) - [ ] (1) heading (setext) - [x] html (flow) @@ -122,7 +123,6 @@ cargo doc --document-private-items - [ ] (8) list - [ ] (1) paragraph - [x] thematic break -- [ ] (1) trailing break escape ### Content types @@ -147,12 +147,12 @@ cargo doc --document-private-items - [x] character escape - [x] character reference - [x] code (text) - - [x] hard break escape + - [x] hard break (escape) + - [x] hard break (trailing) - [x] html (text) - [ ] label end - [ ] label start (image) - [ ] label start (link) - - [ ] trailing break escape - [x] string - [x] character escape - [x] character reference diff --git a/src/compiler.rs b/src/compiler.rs index 3aacca0..9f84a38 100644 --- a/src/compiler.rs +++ b/src/compiler.rs @@ -152,6 +152,8 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St | TokenType::Whitespace | TokenType::HardBreakEscape | 
TokenType::HardBreakEscapeMarker + | TokenType::HardBreakTrailing + | TokenType::HardBreakTrailingSpace | TokenType::HtmlFlowData | TokenType::HtmlTextData | TokenType::CodeFencedFence @@ -195,6 +197,7 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St | TokenType::CharacterReference | TokenType::CharacterReferenceMarkerSemi | TokenType::HardBreakEscapeMarker + | TokenType::HardBreakTrailingSpace | TokenType::Autolink | TokenType::AutolinkMarker => {} TokenType::HtmlFlow | TokenType::HtmlText => { @@ -211,7 +214,7 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St TokenType::Paragraph => { buf_tail_mut(buffers).push("

".to_string()); } - TokenType::HardBreakEscape => { + TokenType::HardBreakEscape | TokenType::HardBreakTrailing => { buf_tail_mut(buffers).push("
".to_string()); } TokenType::CodeIndented | TokenType::CodeFenced => { diff --git a/src/constant.rs b/src/constant.rs index d2fb238..ff9e62e 100644 --- a/src/constant.rs +++ b/src/constant.rs @@ -44,6 +44,12 @@ pub const AUTOLINK_SCHEME_SIZE_MAX: usize = 32; /// [autolink]: crate::construct::autolink pub const AUTOLINK_DOMAIN_SIZE_MAX: usize = 63; +/// The number of spaces needed, before a line ending, for a [hard break +/// (trailing)][hard_break_trailing] to form. +/// +/// [hard_break_trailing]: crate::construct::hard_break_trailing +pub const HARD_BREAK_PREFIX_SIZE_MIN: usize = 2; + /// The number of markers needed for a [thematic break][thematic_break] to form. /// /// Like many things in markdown, the number is `3`. diff --git a/src/construct/character_escape.rs b/src/construct/character_escape.rs index baedd4b..743cbf8 100644 --- a/src/construct/character_escape.rs +++ b/src/construct/character_escape.rs @@ -14,7 +14,7 @@ //! [character reference][character_reference] instead //! (as in, `&`, `{`, or say ` `). //! It is also possible to escape a line ending in text with a similar -//! construct: a [hard break escape][hard_break_escape] is a backslash followed +//! construct: a [hard break (escape)][hard_break_escape] is a backslash followed //! by a line ending (that is part of the construct instead of ending it). //! //! ## References diff --git a/src/construct/hard_break_escape.rs b/src/construct/hard_break_escape.rs index a7712d6..51da953 100644 --- a/src/construct/hard_break_escape.rs +++ b/src/construct/hard_break_escape.rs @@ -1,4 +1,4 @@ -//! Hard break escapes are a construct that occurs in the [text][] content +//! Hard break (escape) is a construct that occurs in the [text][] content //! type. //! //! They’re formed with the following BNF: @@ -8,6 +8,15 @@ //! ; instead of ending it). //! hard_break_escape ::= '\\' //! ``` +//! +//! Hard breaks in markdown relate to the HTML element `
`. +//! See [*§ 4.5.27 The `br` element* in the HTML spec][html] for more info. +//! +//! It is also possible to create a hard break with a +//! [hard break (trailing)][hard_break_trailing]. +//! That construct is not recommended because trailing spaces are typically +//! invisible in editors, or even automatically removed, making them hard to use. +//! //! It is also possible to escape punctuation characters with a similar //! construct: a [character escape][character_escape] is a backslash followed //! by an ASCII punctuation character. @@ -22,12 +31,12 @@ //! [text]: crate::content::text //! [character_escape]: crate::construct::character_escape //! [character_reference]: crate::construct::character_reference -//! -//! +//! [hard_break_trailing]: crate::construct::hard_break_trailing +//! [html]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-br-element use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; -/// Start of a hard break escape. +/// Start of a hard break (escape). /// /// ```markdown /// a|\ @@ -45,7 +54,7 @@ pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { } } -/// At the end of a hard break escape, after `\`. +/// At the end of a hard break (escape), after `\`. /// /// ```markdown /// a\| diff --git a/src/construct/hard_break_trailing.rs b/src/construct/hard_break_trailing.rs new file mode 100644 index 0000000..46337c5 --- /dev/null +++ b/src/construct/hard_break_trailing.rs @@ -0,0 +1,83 @@ +//! Hard break (trailing) is a construct that occurs in the [text][] content +//! type. +//! +//! They’re formed with the following BNF: +//! +//! ```bnf +//! ; Restriction: followed by a line ending (that is part of the construct +//! ; instead of ending it). +//! hard_break_trailing ::= 2*' ' +//! ``` +//! +//! The minimum number of the spaces is defined in +//! [`HARD_BREAK_PREFIX_SIZE_MIN`][hard_break_prefix_size_min]. +//! +//! Hard breaks in markdown relate to the HTML element `
`. +//! See [*§ 4.5.27 The `br` element* in the HTML spec][html] for more info. +//! +//! It is also possible to create a hard break with a similar construct: a +//! [hard break (escape)][hard_break_escape] is a backslash followed +//! by a line ending. +//! That construct is recommended because it is similar to a +//! [character escape][character_escape] and similar to how line endings can be +//! “escaped” in other languages. +//! Trailing spaces are typically invisible in editors, or even automatically +//! removed, making hard break (trailing) hard to use. +//! +//! ## References +//! +//! * [`lib/initialize/text.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark/dev/lib/initialize/text.js) +//! * [*§ 6.7 Hard line breaks* in `CommonMark`](https://spec.commonmark.org/0.30/#hard-line-breaks) +//! +//! [text]: crate::content::text +//! [hard_break_escape]: crate::construct::hard_break_escape +//! [character_escape]: crate::construct::character_escape +//! [hard_break_prefix_size_min]: crate::constant::HARD_BREAK_PREFIX_SIZE_MIN +//! [html]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-br-element + +use crate::constant::HARD_BREAK_PREFIX_SIZE_MIN; +use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; + +/// Start of a hard break (trailing). +/// +/// ```markdown +/// a| ␊ +/// b +/// ``` +pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::Char(' ') => { + tokenizer.enter(TokenType::HardBreakTrailing); + tokenizer.enter(TokenType::HardBreakTrailingSpace); + tokenizer.consume(code); + (State::Fn(Box::new(|t, c| inside(t, c, 1))), None) + } + _ => (State::Nok, None), + } +} + +/// Inside the hard break (trailing). 
+/// +/// ```markdown +/// a |␊ +/// b +/// ``` +fn inside(tokenizer: &mut Tokenizer, code: Code, size: usize) -> StateFnResult { + match code { + Code::Char(' ') => { + tokenizer.consume(code); + ( + State::Fn(Box::new(move |t, c| inside(t, c, size + 1))), + None, + ) + } + Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') + if size >= HARD_BREAK_PREFIX_SIZE_MIN => + { + tokenizer.exit(TokenType::HardBreakTrailingSpace); + tokenizer.exit(TokenType::HardBreakTrailing); + (State::Ok, Some(vec![code])) + } + _ => (State::Nok, None), + } +} diff --git a/src/construct/mod.rs b/src/construct/mod.rs index 27f4308..880d055 100644 --- a/src/construct/mod.rs +++ b/src/construct/mod.rs @@ -26,7 +26,8 @@ //! * [code (text)][code_text] //! * content //! * definition -//! * [hard break escape][hard_break_escape] +//! * [hard break (escape)][hard_break_escape] +//! * [hard break (trailing)][hard_break_trailing] //! * [heading (atx)][heading_atx] //! * heading (setext) //! * [html (flow)][html_flow] @@ -61,6 +62,7 @@ pub mod code_fenced; pub mod code_indented; pub mod code_text; pub mod hard_break_escape; +pub mod hard_break_trailing; pub mod heading_atx; pub mod html_flow; pub mod html_text; diff --git a/src/content/text.rs b/src/content/text.rs index d4d5493..f61b390 100644 --- a/src/content/text.rs +++ b/src/content/text.rs @@ -8,7 +8,8 @@ //! * [Autolink][crate::construct::autolink] //! * Attention //! * [HTML (text)][crate::construct::html_text] -//! * [Hard break escape][crate::construct::hard_break_escape] +//! * [Hard break (escape)][crate::construct::hard_break_escape] +//! * [Hard break (trailing)][crate::construct::hard_break_trailing] //! * [Code (text)][crate::construct::code_text] //! * Line ending //! 
* Label start (image) @@ -19,7 +20,8 @@ use crate::construct::{ autolink::start as autolink, character_escape::start as character_escape, character_reference::start as character_reference, code_text::start as code_text, - hard_break_escape::start as hard_break_escape, html_text::start as html_text, + hard_break_escape::start as hard_break_escape, + hard_break_trailing::start as hard_break_trailing, html_text::start as html_text, }; use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; @@ -35,10 +37,11 @@ use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { match code { Code::None => (State::Ok, None), - _ => tokenizer.attempt_6( + _ => tokenizer.attempt_7( character_reference, character_escape, hard_break_escape, + hard_break_trailing, autolink, html_text, code_text, @@ -78,12 +81,12 @@ fn before_data(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { /// ``` fn in_data(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { match code { - Code::None => { + Code::None | Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => { tokenizer.exit(TokenType::Data); - (State::Ok, None) + before_data(tokenizer, code) } // To do: somehow get these markers from constructs. 
- Code::CarriageReturnLineFeed | Code::Char('\r' | '\n' | '&' | '<' | '\\' | '`') => { + Code::Char(' ' | '&' | '<' | '\\' | '`') => { tokenizer.exit(TokenType::Data); start(tokenizer, code) } diff --git a/src/tokenizer.rs b/src/tokenizer.rs index a63d209..da45ee5 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -62,6 +62,8 @@ pub enum TokenType { HardBreakEscape, HardBreakEscapeMarker, + HardBreakTrailing, + HardBreakTrailingSpace, HtmlFlow, HtmlFlowData, @@ -445,6 +447,7 @@ impl Tokenizer { None, None, None, + None, done, ) } @@ -464,12 +467,13 @@ impl Tokenizer { None, None, None, + None, done, ) } #[allow(clippy::too_many_arguments, clippy::many_single_char_names)] - pub fn attempt_6( + pub fn attempt_7( &mut self, a: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static, b: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static, @@ -477,6 +481,7 @@ impl Tokenizer { d: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static, e: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static, f: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static, + g: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static, done: impl FnOnce(bool) -> Box + 'static, ) -> Box { self.call_multiple( @@ -487,6 +492,7 @@ impl Tokenizer { Some(Box::new(d)), Some(Box::new(e)), Some(Box::new(f)), + Some(Box::new(g)), done, ) } @@ -501,6 +507,7 @@ impl Tokenizer { d: Option>, e: Option>, f: Option>, + g: Option>, done: impl FnOnce(bool) -> Box + 'static, ) -> Box { if let Some(head) = a { @@ -509,7 +516,9 @@ impl Tokenizer { done(ok) } else { Box::new(move |tokenizer: &mut Tokenizer, code| { - tokenizer.call_multiple(check, b, c, d, e, f, None, done)(tokenizer, code) + tokenizer.call_multiple(check, b, c, d, e, f, g, None, done)( + tokenizer, code, + ) }) } }; diff --git a/src/util/span.rs b/src/util/span.rs index c48549b..02811cc 100644 --- a/src/util/span.rs +++ b/src/util/span.rs @@ -36,7 +36,7 @@ pub fn from_exit_event(events: &[Event], index: 
usize) -> Span { assert_eq!( exit.event_type, EventType::Exit, - "expected `get_span` to be called on `exit` event" + "expected `from_exit_event` to be called on `exit` event" ); let mut enter_index = index - 1; diff --git a/tests/hard_break_escape.rs b/tests/hard_break_escape.rs index fe4c82b..2e3a3ba 100644 --- a/tests/hard_break_escape.rs +++ b/tests/hard_break_escape.rs @@ -1,49 +1,21 @@ extern crate micromark; -use micromark::{micromark}; +use micromark::micromark; #[test] fn hard_break_escape() { - // To do: trailing. - // assert_eq!( - // micromark("foo \nbaz"), - // "

foo
\nbaz

", - // "should support two trailing spaces to form a hard break" - // ); - assert_eq!( micromark("foo\\\nbaz"), "

foo
\nbaz

", "should support a backslash to form a hard break" ); - // To do: trailing. - // assert_eq!( - // micromark("foo \nbaz"), - // "

foo
\nbaz

", - // "should support multiple trailing spaces" - // ); - - // To do: trailing. - // assert_eq!( - // micromark("foo \n bar"), - // "

foo
\nbar

", - // "should support leading spaces after a trailing hard break" - // ); - - // To do: trim paragraph whitespace. + // To do: trimming whitespace in paragraphs. // assert_eq!( // micromark("foo\\\n bar"), // "

foo
\nbar

", // "should support leading spaces after an escape hard break" // ); - // To do: trailing, attention. - // assert_eq!( - // micromark("*foo \nbar*"), - // "

foo
\nbar

", - // "should support trailing hard breaks in emphasis" - // ); - // To do: attention. // assert_eq!( // micromark("*foo\\\nbar*"), @@ -51,25 +23,12 @@ fn hard_break_escape() { // "should support escape hard breaks in emphasis" // ); - assert_eq!( - micromark("`code \ntext`"), - "

code text

", - "should not support trailing hard breaks in code" - ); - assert_eq!( micromark("``code\\\ntext``"), "

code\\ text

", "should not support escape hard breaks in code" ); - // To do: paragraph trimming. - // assert_eq!( - // micromark("foo "), - // "

foo

", - // "should not support trailing hard breaks at the end of a paragraph" - // ); - assert_eq!( micromark("foo\\"), "

foo\\

", @@ -82,82 +41,6 @@ fn hard_break_escape() { "should not support escape hard breaks at the end of a heading" ); - assert_eq!( - micromark("### foo "), - "

foo

", - "should not support trailing hard breaks at the end of a heading" - ); - - // To do: paragraph trimming. - // assert_eq!( - // micromark("aaa \t\nbb"), - // "

aaa\nbb

", - // "should support a mixed line suffix (1)" - // ); - - // To do: paragraph trimming. - // assert_eq!( - // micromark("aaa\t \nbb"), - // "

aaa\nbb

", - // "should support a mixed line suffix (2)" - // ); - - // To do: paragraph trimming. - // assert_eq!( - // micromark("aaa \t \nbb"), - // "

aaa\nbb

", - // "should support a mixed line suffix (3)" - // ); - - // To do: trailing. - // assert_eq!( - // micromark("aaa\0 \nbb"), - // "

aaa�
\nbb

", - // "should support a hard break after a replacement character" - // ); - - // To do: trailing. - // assert_eq!( - // micromark("aaa\0\t\nbb"), - // "

aaa�\nbb

", - // "should support a line suffix after a replacement character" - // ); - - // To do: attention, trailing. - // assert_eq!( - // micromark("*a* \nbb"), - // "

a
\nbb

", - // "should support a hard break after a span" - // ); - - // To do: attention, trailing. - // assert_eq!( - // micromark("*a*\t\nbb"), - // "

a\nbb

", - // "should support a line suffix after a span" - // ); - - // To do: attention, trailing. - // assert_eq!( - // micromark("*a* \t\nbb"), - // "

a\nbb

", - // "should support a mixed line suffix after a span (1)" - // ); - - // To do: attention, trailing. - // assert_eq!( - // micromark("*a*\t \nbb"), - // "

a\nbb

", - // "should support a mixed line suffix after a span (2)" - // ); - - // To do: attention, trailing. - // assert_eq!( - // micromark("*a* \t \nbb"), - // "

a\nbb

", - // "should support a mixed line suffix after a span (3)" - // ); - // // To do: turning off things. // assert_eq!( // micromark("a\\\nb", {extensions: [{disable: {null: ["hardBreakEscape"]}}]}), diff --git a/tests/hard_break_trailing.rs b/tests/hard_break_trailing.rs new file mode 100644 index 0000000..6c29020 --- /dev/null +++ b/tests/hard_break_trailing.rs @@ -0,0 +1,133 @@ +extern crate micromark; +use micromark::micromark; + +#[test] +fn hard_break_trailing() { + assert_eq!( + micromark("foo \nbaz"), + "

foo
\nbaz

", + "should support two trailing spaces to form a hard break" + ); + + assert_eq!( + micromark("foo \nbaz"), + "

foo
\nbaz

", + "should support multiple trailing spaces" + ); + + // To do: trimming whitespace in paragraphs. + // assert_eq!( + // micromark("foo \n bar"), + // "

foo
\nbar

", + // "should support leading spaces after a trailing hard break" + // ); + + // To do: attention. + // assert_eq!( + // micromark("*foo \nbar*"), + // "

foo
\nbar

", + // "should support trailing hard breaks in emphasis" + // ); + + // To do: attention. + // assert_eq!( + // micromark("*foo\\\nbar*"), + // "

foo
\nbar

", + // "should support escape hard breaks in emphasis" + // ); + + assert_eq!( + micromark("`code \ntext`"), + "

code text

", + "should not support trailing hard breaks in code" + ); + + // To do: trimming whitespace in paragraphs. + // assert_eq!( + // micromark("foo "), + // "

foo

", + // "should not support trailing hard breaks at the end of a paragraph" + // ); + + assert_eq!( + micromark("### foo "), + "

foo

", + "should not support trailing hard breaks at the end of a heading" + ); + + // To do: trimming whitespace in paragraphs. + // assert_eq!( + // micromark("aaa \t\nbb"), + // "

aaa\nbb

", + // "should support a mixed line suffix (1)" + // ); + + // To do: trimming whitespace in paragraphs. + // assert_eq!( + // micromark("aaa\t \nbb"), + // "

aaa\nbb

", + // "should support a mixed line suffix (2)" + // ); + + // To do: trimming whitespace in paragraphs. + // assert_eq!( + // micromark("aaa \t \nbb"), + // "

aaa\nbb

", + // "should support a mixed line suffix (3)" + // ); + + assert_eq!( + micromark("aaa\0 \nbb"), + "

aaa�
\nbb

", + "should support a hard break after a replacement character" + ); + + // To do: trimming whitespace in paragraphs. + // assert_eq!( + // micromark("aaa\0\t\nbb"), + // "

aaa�\nbb

", + // "should support a line suffix after a replacement character" + // ); + + // To do: attention. + // assert_eq!( + // micromark("*a* \nbb"), + // "

a
\nbb

", + // "should support a hard break after a span" + // ); + + // To do: attention, trimming whitespace in paragraphs. + // assert_eq!( + // micromark("*a*\t\nbb"), + // "

a\nbb

", + // "should support a line suffix after a span" + // ); + + // To do: attention, trimming whitespace in paragraphs. + // assert_eq!( + // micromark("*a* \t\nbb"), + // "

a\nbb

", + // "should support a mixed line suffix after a span (1)" + // ); + + // To do: attention, trimming whitespace in paragraphs. + // assert_eq!( + // micromark("*a*\t \nbb"), + // "

a\nbb

", + // "should support a mixed line suffix after a span (2)" + // ); + + // To do: attention, trimming whitespace in paragraphs. + // assert_eq!( + // micromark("*a* \t \nbb"), + // "

a\nbb

", + // "should support a mixed line suffix after a span (3)" + // ); + + // // To do: turning off things. + // assert_eq!( + // micromark("a \nb", {extensions: [{disable: {null: ["hardBreakTrailing"]}}]}), + // "

a\nb

", + // "should support turning off hard break (trailing)" + // ); +} -- cgit