diff options
Diffstat (limited to 'src/construct')
-rw-r--r-- | src/construct/heading_atx.rs | 32 | ||||
-rw-r--r-- | src/construct/heading_setext.rs | 301 | ||||
-rw-r--r-- | src/construct/mod.rs | 3 | ||||
-rw-r--r-- | src/construct/thematic_break.rs | 5 |
4 files changed, 323 insertions, 18 deletions
diff --git a/src/construct/heading_atx.rs b/src/construct/heading_atx.rs index 1a9ed03..3ff6fea 100644 --- a/src/construct/heading_atx.rs +++ b/src/construct/heading_atx.rs @@ -18,9 +18,11 @@ //! In older markdown versions, this was not required, and headings would form //! without it. //! -//! In markdown, it is also possible to create headings with the setext heading -//! construct. -//! The benefit of setext headings is that their text can include line endings. +//! In markdown, it is also possible to create headings with a +//! [heading (setext)][heading_setext] construct. +//! The benefit of setext headings is that their text can include line endings, +//! and by extensions also hard breaks (e.g., with +//! [hard break (escape)][hard_break_escape]). //! However, their limit is that they cannot form `<h3>` through `<h6>` //! headings. //! Due to this limitation, it is recommended to use atx headings. @@ -39,11 +41,11 @@ //! * [*§ 4.2 ATX headings* in `CommonMark`](https://spec.commonmark.org/0.30/#atx-headings) //! //! [flow]: crate::content::flow +//! [heading_setext]: crate::construct::heading_setext +//! [hard_break_escape]: crate::construct::hard_break_escape //! [html]: https://html.spec.whatwg.org/multipage/sections.html#the-h1,-h2,-h3,-h4,-h5,-and-h6-elements //! [wiki-setext]: https://en.wikipedia.org/wiki/Setext //! [atx]: http://www.aaronsw.com/2002/atx/ -//! -//! <!-- To do: link `setext` --> use crate::constant::HEADING_ATX_OPENING_FENCE_SIZE_MAX; use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; @@ -55,8 +57,8 @@ use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; /// ``` pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { if Code::Char('#') == code { - tokenizer.enter(TokenType::AtxHeading); - tokenizer.enter(TokenType::AtxHeadingSequence); + tokenizer.enter(TokenType::HeadingAtx); + tokenizer.enter(TokenType::HeadingAtxSequence); sequence_open(tokenizer, code, 0) } else { (State::Nok, None) @@ -76,7 +78,7 @@ fn sequence_open(tokenizer: &mut Tokenizer, code: Code, rank: usize) -> StateFnR | Code::Char('\t' | '\n' | '\r' | ' ') if rank > 0 => { - tokenizer.exit(TokenType::AtxHeadingSequence); + tokenizer.exit(TokenType::HeadingAtxSequence); at_break(tokenizer, code) } Code::Char('#') if rank < HEADING_ATX_OPENING_FENCE_SIZE_MAX => { @@ -104,19 +106,19 @@ fn sequence_open(tokenizer: &mut Tokenizer, code: Code, rank: usize) -> StateFnR fn at_break(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { match code { Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { - tokenizer.exit(TokenType::AtxHeading); + tokenizer.exit(TokenType::HeadingAtx); (State::Ok, Some(vec![code])) } Code::VirtualSpace | Code::Char('\t' | ' ') => { - tokenizer.enter(TokenType::AtxHeadingWhitespace); + tokenizer.enter(TokenType::HeadingAtxWhitespace); whitespace(tokenizer, code) } Code::Char('#') => { - tokenizer.enter(TokenType::AtxHeadingSequence); + tokenizer.enter(TokenType::HeadingAtxSequence); further_sequence(tokenizer, code) } Code::Char(_) => { - tokenizer.enter(TokenType::AtxHeadingText); + tokenizer.enter(TokenType::HeadingAtxText); tokenizer.enter(TokenType::ChunkText); data(tokenizer, code) } @@ -134,7 +136,7 @@ fn further_sequence(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { tokenizer.consume(code); (State::Fn(Box::new(further_sequence)), None) } else { - tokenizer.exit(TokenType::AtxHeadingSequence); + tokenizer.exit(TokenType::HeadingAtxSequence); at_break(tokenizer, code) } } @@ -151,7 +153,7 @@ fn whitespace(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { (State::Fn(Box::new(whitespace)), None) } _ => { - tokenizer.exit(TokenType::AtxHeadingWhitespace); + tokenizer.exit(TokenType::HeadingAtxWhitespace); at_break(tokenizer, code) } } @@ -167,7 +169,7 @@ fn data(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { // Note: `#` for closing sequence must be preceded by whitespace, otherwise it’s just text. Code::None | Code::CarriageReturnLineFeed | Code::Char('\t' | '\n' | '\r' | ' ') => { tokenizer.exit(TokenType::ChunkText); - tokenizer.exit(TokenType::AtxHeadingText); + tokenizer.exit(TokenType::HeadingAtxText); at_break(tokenizer, code) } _ => { diff --git a/src/construct/heading_setext.rs b/src/construct/heading_setext.rs new file mode 100644 index 0000000..8cc4f6d --- /dev/null +++ b/src/construct/heading_setext.rs @@ -0,0 +1,301 @@ +//! Heading (setext) is a construct that occurs in the [flow] content type. +//! +//! They’re formed with the following BNF: +//! +//! ```bnf +//! heading_setext ::= line *(eol line) eol whitespace_optional (1*'-' | 1*'=') whitespace_optional +//! +//! whitespace ::= 1*space_or_tab +//! whitespace_optional ::= [ whitespace ] +//! line ::= code - eol +//! eol ::= '\r' | '\r\n' | '\n' +//! ``` +//! +//! Heading (setext) in markdown relates to the `<h1>` and `<h2>` elements in +//! HTML. +//! See [*§ 4.3.6 The `h1`, `h2`, `h3`, `h4`, `h5`, and `h6` elements* in the +//! HTML spec][html] for more info. +//! +//! In markdown, it is also possible to create headings with a +//! [heading (atx)][heading_atx] construct. +//! The benefit of setext headings is that their text can include line endings, +//! and by extensions also hard breaks (e.g., with +//! [hard break (escape)][hard_break_escape]). +//! However, their limit is that they cannot form `<h3>` through `<h6>` +//! headings. +//! Due to this limitation, it is recommended to use atx headings. +//! +//! [Thematic breaks][thematic_break] formed with dashes (without whitespace) +//! can also form heading (setext). +//! +//! > 🏛 **Background**: the word *setext* originates from a small markup +//! > language by Ian Feldman from 1991. +//! > See [*§ Setext* on Wikipedia][wiki-setext] for more info. +//! > The word *atx* originates from a tiny markup language by Aaron Swartz +//! > from 2002. +//! > See [*§ atx, the true structured text format* on `aaronsw.com`][atx] for +//! > more info. +//! +//! ## References +//! +//! * [`setext-underline.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/setext-underline.js) +//! * [*§ 4.3 Setext headings* in `CommonMark`](https://spec.commonmark.org/0.30/#setext-headings) +//! +//! [flow]: crate::content::flow +//! [heading_atx]: crate::construct::heading_atx +//! [thematic_break]: crate::construct::thematic_break +//! [hard_break_escape]: crate::construct::hard_break_escape +//! [html]: https://html.spec.whatwg.org/multipage/sections.html#the-h1,-h2,-h3,-h4,-h5,-and-h6-elements +//! [wiki-setext]: https://en.wikipedia.org/wiki/Setext +//! [atx]: http://www.aaronsw.com/2002/atx/ + +use crate::constant::TAB_SIZE; +use crate::construct::partial_whitespace::start as whitespace; +use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; +use crate::util::span::from_exit_event; + +/// Kind of underline. +#[derive(Debug, Clone, PartialEq)] +pub enum Kind { + /// Grave accent (tick) code. + Dash, + /// Tilde code. + EqualsTo, +} + +/// Start of a heading (setext). +/// +/// ```markdown +/// |alpha +/// == +/// ``` +pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + unreachable!("expected non-eol/eof"); + } + _ => { + tokenizer.enter(TokenType::HeadingSetext); + tokenizer.enter(TokenType::HeadingSetextText); + tokenizer.enter(TokenType::ChunkText); + text_inside(tokenizer, code) + } + } +} + +/// Inside text. +/// +/// ```markdown +/// al|pha +/// bra|vo +/// == +/// ``` +pub fn text_inside(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::None => (State::Nok, None), + Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + tokenizer.exit(TokenType::ChunkText); + tokenizer.exit(TokenType::HeadingSetextText); + tokenizer.attempt(underline_before, |ok| { + Box::new(if ok { after } else { text_continue }) + })(tokenizer, code) + } + _ => { + tokenizer.consume(code); + (State::Fn(Box::new(text_inside)), None) + } + } +} + +/// At a line ending, not at an underline. +/// +/// ```markdown +/// alpha +/// |bravo +/// == +/// ``` +fn text_continue(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + // Needed to connect the text. + // To do: does it work? + tokenizer.enter(TokenType::HeadingSetextText); + tokenizer.events.pop(); + tokenizer.events.pop(); + + match code { + Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + let next = tokenizer.events.len(); + let previous = next - 2; + + tokenizer.enter(TokenType::LineEnding); + tokenizer.consume(code); + tokenizer.exit(TokenType::LineEnding); + + tokenizer.events[previous].next = Some(next); + tokenizer.events[next].previous = Some(previous); + + ( + State::Fn(Box::new(tokenizer.attempt( + |tokenizer, code| whitespace(tokenizer, code, TokenType::Whitespace), + |_ok| Box::new(text_line_start), + ))), + None, + ) + } + _ => unreachable!("expected eol"), + } +} + +/// At a line ending after whitespace, not at an underline. +/// +/// ```markdown +/// alpha +/// |bravo +/// == +/// ``` +fn text_line_start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + let next = tokenizer.events.len() - 2; + let previous = next - 2; + + // Link the whitespace, if it exists. + if tokenizer.events[next].token_type == TokenType::Whitespace { + tokenizer.events[previous].next = Some(next); + tokenizer.events[next].previous = Some(previous); + } + + match code { + // Blank lines not allowed. + Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => (State::Nok, None), + _ => { + let next = tokenizer.events.len(); + let previous = next - 2; + + tokenizer.enter(TokenType::ChunkText); + + tokenizer.events[previous].next = Some(next); + tokenizer.events[next].previous = Some(previous); + + text_inside(tokenizer, code) + } + } +} + +/// After a heading (setext). +/// +/// ```markdown +/// alpha +/// ==| +/// ``` +fn after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + tokenizer.exit(TokenType::HeadingSetext); + (State::Ok, Some(vec![code])) +} + +/// At a line ending, presumably an underline. +/// +/// ```markdown +/// alpha| +/// == +/// ``` +fn underline_before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + tokenizer.enter(TokenType::LineEnding); + tokenizer.consume(code); + tokenizer.exit(TokenType::LineEnding); + (State::Fn(Box::new(underline_start)), None) + } + _ => unreachable!("expected eol"), + } +} + +/// After a line ending, presumably an underline. +/// +/// ```markdown +/// alpha +/// |== +/// ``` +fn underline_start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + tokenizer.attempt( + |tokenizer, code| whitespace(tokenizer, code, TokenType::Whitespace), + |_ok| Box::new(underline_sequence_start), + )(tokenizer, code) +} + +/// After optional whitespace, presumably an underline. +/// +/// ```markdown +/// alpha +/// |== +/// ``` +fn underline_sequence_start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + let tail = tokenizer.events.last(); + let mut prefix = 0; + + if let Some(event) = tail { + if event.token_type == TokenType::Whitespace { + let span = from_exit_event(&tokenizer.events, tokenizer.events.len() - 1); + prefix = span.end_index - span.start_index; + } + } + + // To do: 4+ should be okay if code (indented) is turned off! + if prefix >= TAB_SIZE { + return (State::Nok, None); + } + + match code { + Code::Char(char) if char == '-' || char == '=' => { + let marker = if char == '-' { + Kind::Dash + } else { + Kind::EqualsTo + }; + tokenizer.enter(TokenType::HeadingSetextUnderline); + underline_sequence_inside(tokenizer, code, marker) + } + _ => (State::Nok, None), + } +} + +/// In an underline sequence. +/// +/// ```markdown +/// alpha +/// =|= +/// ``` +fn underline_sequence_inside(tokenizer: &mut Tokenizer, code: Code, kind: Kind) -> StateFnResult { + let marker = if kind == Kind::Dash { '-' } else { '=' }; + + match code { + Code::Char(char) if char == marker => { + tokenizer.consume(code); + ( + State::Fn(Box::new(move |tokenizer, code| { + underline_sequence_inside(tokenizer, code, kind) + })), + None, + ) + } + Code::VirtualSpace | Code::Char('\t' | ' ') => tokenizer.attempt( + |tokenizer, code| whitespace(tokenizer, code, TokenType::Whitespace), + |_ok| Box::new(underline_after), + )(tokenizer, code), + _ => underline_after(tokenizer, code), + } +} + +/// After an underline sequence, after optional whitespace. +/// +/// ```markdown +/// alpha +/// ==| +/// ``` +fn underline_after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + tokenizer.exit(TokenType::HeadingSetextUnderline); + (State::Ok, Some(vec![code])) + } + _ => (State::Nok, None), + } +} diff --git a/src/construct/mod.rs b/src/construct/mod.rs index 880d055..ca1149f 100644 --- a/src/construct/mod.rs +++ b/src/construct/mod.rs @@ -29,7 +29,7 @@ //! * [hard break (escape)][hard_break_escape] //! * [hard break (trailing)][hard_break_trailing] //! * [heading (atx)][heading_atx] -//! * heading (setext) +//! * [heading (setext)][heading_setext] //! * [html (flow)][html_flow] //! * [html (text)][html_text] //! * label end @@ -64,6 +64,7 @@ pub mod code_text; pub mod hard_break_escape; pub mod hard_break_trailing; pub mod heading_atx; +pub mod heading_setext; pub mod html_flow; pub mod html_text; pub mod partial_whitespace; diff --git a/src/construct/thematic_break.rs b/src/construct/thematic_break.rs index 7a4f71a..bc41991 100644 --- a/src/construct/thematic_break.rs +++ b/src/construct/thematic_break.rs @@ -24,7 +24,7 @@ //! For these reasons, it is recommend to not use spaces or tabs between the //! markers. //! Thematic breaks formed with dashes (without whitespace) can also form -//! setext headings. +//! [heading (setext)][heading_setext]. //! As dashes and underscores frequently occur in natural language and URLs, it //! is recommended to use asterisks for thematic breaks to distinguish from //! such use. @@ -39,9 +39,10 @@ //! * [*§ 4.1 Thematic breaks* in `CommonMark`](https://spec.commonmark.org/0.30/#thematic-breaks) //! //! [flow]: crate::content::flow +//! [heading_setext]: crate::construct::heading_setext //! [html]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-hr-element //! -//! <!-- To do: link `lists`, `setext heading` --> +//! <!-- To do: link `lists` --> use crate::constant::THEMATIC_BREAK_MARKER_COUNT_MIN; use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; |