diff options
author | Titus Wormer <tituswormer@gmail.com> | 2022-06-08 15:52:16 +0200 |
---|---|---|
committer | Titus Wormer <tituswormer@gmail.com> | 2022-06-08 15:52:16 +0200 |
commit | 4c06c8554c35887f8f5147783953b2b7e7c2327f (patch) | |
tree | 1b2463848a3ae4c645f7f1a325877ee829ab65c5 /src/construct/heading_atx.rs | |
download | markdown-rs-4c06c8554c35887f8f5147783953b2b7e7c2327f.tar.gz markdown-rs-4c06c8554c35887f8f5147783953b2b7e7c2327f.tar.bz2 markdown-rs-4c06c8554c35887f8f5147783953b2b7e7c2327f.zip |
.
Diffstat (limited to 'src/construct/heading_atx.rs')
-rw-r--r-- | src/construct/heading_atx.rs | 175 |
1 files changed, 175 insertions, 0 deletions
diff --git a/src/construct/heading_atx.rs b/src/construct/heading_atx.rs new file mode 100644 index 0000000..b3aef1b --- /dev/null +++ b/src/construct/heading_atx.rs @@ -0,0 +1,175 @@ +//! Heading (atx) is a construct that occurs in the flow content type. +//! +//! They’re formed with the following BNF: +//! +//! ```bnf +//! heading_atx ::= 1*6'#' [ 1*space_or_tab code [ 1*space_or_tab 1*'#' ] ] *space_or_tab +//! +//! code ::= . ; any unicode code point (other than line endings). +//! space_or_tab ::= ' ' | '\t' +//! ``` +//! +//! Headings in markdown relate to the `<h1>` through `<h6>` elements in HTML. +//! See [*§ 4.3.6 The `h1`, `h2`, `h3`, `h4`, `h5`, and `h6` elements* in the +//! HTML spec][html] for more info. +//! +//! `CommonMark` introduced the requirement on whitespace existing after the +//! opening sequence and before text. +//! In older markdown versions, this was not required, and headings would form +//! without it. +//! +//! In markdown, it is also possible to create headings with the setext heading +//! construct. +//! The benefit of setext headings is that their text can include line endings. +//! However, their limit is that they cannot form `<h3>` through `<h6>` +//! headings. +//! Due to this limitation, it is recommended to use atx headings. +//! +//! > 🏛 **Background**: the word *setext* originates from a small markup +//! > language by Ian Feldman from 1991. +//! > See [*§ Setext* on Wikipedia][wiki-setext] for more info. +//! > The word *atx* originates from a tiny markup language by Aaron Swartz +//! > from 2002. +//! > See [*§ atx, the true structured text format* on `aaronsw.com`][atx] for +//! > more info. +//! +//! ## References +//! +//! * [`heading-atx.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/heading-atx.js) +//! * [*§ 4.2 ATX headings* in `CommonMark`](https://spec.commonmark.org/0.30/#atx-headings) +//! +//! [html]: https://html.spec.whatwg.org/multipage/sections.html#the-h1,-h2,-h3,-h4,-h5,-and-h6-elements +//! [wiki-setext]: https://en.wikipedia.org/wiki/Setext +//! [atx]: http://www.aaronsw.com/2002/atx/ +//! +//! <!-- To do: link `flow`, `setext` --> + +use crate::constant::HEADING_ATX_OPENING_FENCE_SIZE_MAX; +use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; + +/// Start of a heading (atx). +/// +/// ```markdown +/// |## alpha +/// ``` +pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + if Code::Char('#') == code { + tokenizer.enter(TokenType::AtxHeading); + tokenizer.enter(TokenType::AtxHeadingSequence); + sequence_open(tokenizer, code, 0) + } else { + (State::Nok, None) + } +} + +/// In the opening sequence. +/// +/// ```markdown +/// #|# alpha +/// ``` +fn sequence_open(tokenizer: &mut Tokenizer, code: Code, rank: usize) -> StateFnResult { + match code { + Code::None + | Code::CarriageReturnLineFeed + | Code::VirtualSpace + | Code::Char('\t' | '\n' | '\r' | ' ') + if rank > 0 => + { + tokenizer.exit(TokenType::AtxHeadingSequence); + at_break(tokenizer, code) + } + Code::Char('#') if rank < HEADING_ATX_OPENING_FENCE_SIZE_MAX => { + tokenizer.consume(code); + ( + State::Fn(Box::new(move |tokenizer, code| { + sequence_open(tokenizer, code, rank + 1) + })), + None, + ) + } + _ => (State::Nok, None), + } +} + +/// After something but before something else. +/// +/// ```markdown +/// ## |alpha +/// ## alpha| bravo +/// ## alpha |bravo +/// ## alpha bravo|## +/// ## alpha bravo ##| +/// ``` +fn at_break(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + tokenizer.exit(TokenType::AtxHeading); + (State::Ok, Some(vec![code])) + } + Code::VirtualSpace | Code::Char('\t' | ' ') => { + tokenizer.enter(TokenType::AtxHeadingWhitespace); + whitespace(tokenizer, code) + } + Code::Char('#') => { + tokenizer.enter(TokenType::AtxHeadingSequence); + further_sequence(tokenizer, code) + } + Code::Char(_) => { + tokenizer.enter(TokenType::AtxHeadingText); + data(tokenizer, code) + } + } +} + +/// In a further sequence (after whitespace). +/// Could be normal “visible” hashes in the heading or a final sequence. +/// +/// ```markdown +/// ## alpha #|# +/// ``` +fn further_sequence(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + if let Code::Char('#') = code { + tokenizer.consume(code); + (State::Fn(Box::new(further_sequence)), None) + } else { + tokenizer.exit(TokenType::AtxHeadingSequence); + at_break(tokenizer, code) + } +} + +/// In whitespace. +/// +/// ```markdown +/// ## alpha | bravo +/// ``` +fn whitespace(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::VirtualSpace | Code::Char('\t' | ' ') => { + tokenizer.consume(code); + (State::Fn(Box::new(whitespace)), None) + } + _ => { + tokenizer.exit(TokenType::AtxHeadingWhitespace); + at_break(tokenizer, code) + } + } +} + +/// In text. +/// +/// ```markdown +/// ## al|pha +/// ``` +fn data(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + // Note: `#` for closing sequence must be preceded by whitespace, otherwise it’s just text. + Code::None | Code::CarriageReturnLineFeed | Code::Char('\t' | '\n' | '\r' | ' ') => { + tokenizer.exit(TokenType::AtxHeadingText); + at_break(tokenizer, code) + } + _ => { + tokenizer.consume(code); + (State::Fn(Box::new(data)), None) + } + } +} |