diff options
-rw-r--r-- | readme.md | 8 | ||||
-rw-r--r-- | src/compiler.rs | 126 | ||||
-rw-r--r-- | src/construct/heading_atx.rs | 32 | ||||
-rw-r--r-- | src/construct/heading_setext.rs | 301 | ||||
-rw-r--r-- | src/construct/mod.rs | 3 | ||||
-rw-r--r-- | src/construct/thematic_break.rs | 5 | ||||
-rw-r--r-- | src/content/content.rs | 22 | ||||
-rw-r--r-- | src/content/flow.rs | 19 | ||||
-rw-r--r-- | src/tokenizer.rs | 11 | ||||
-rw-r--r-- | tests/character_escape.rs | 2 | ||||
-rw-r--r-- | tests/code_fenced.rs | 11 | ||||
-rw-r--r-- | tests/code_indented.rs | 11 | ||||
-rw-r--r-- | tests/heading_setext.rs | 279 | ||||
-rw-r--r-- | tests/thematic_break.rs | 11 |
14 files changed, 727 insertions, 114 deletions
@@ -46,8 +46,6 @@ cargo doc --document-private-items ### Some major obstacles -- [ ] (1) Setext headings: can they be solved in content, or do they have to be - solved in flow somehow - [ ] (8) Can content (and to a lesser extent string and text) operate more performantly than checking whether other flow constructs start a line, before exiting and actually attempting flow constructs? @@ -114,7 +112,7 @@ cargo doc --document-private-items - [x] hard break (escape) - [x] hard break (trailing) - [x] heading (atx) -- [ ] (1) heading (setext) +- [x] heading (setext) - [x] html (flow) - [x] html (text) - [ ] (3) label end @@ -135,11 +133,11 @@ cargo doc --document-private-items - [x] code (indented) - [x] content - [x] heading (atx) + - [x] heading (setext) - [x] html (flow) - [x] thematic break - [ ] (3) content - [ ] definition - - [ ] heading (setext) - [x] paragraph - [ ] (5) text - [ ] attention (strong, emphasis) (text) @@ -169,6 +167,8 @@ cargo doc --document-private-items - [x] (1) Add examples to `CompileOptions` docs - [x] (3) Fix deep subtokenization - [x] (1) text in heading +- [x] (1) Setext headings: can they be solved in content, or do they have to be + solved in flow somehow ### Extensions diff --git a/src/compiler.rs b/src/compiler.rs index 50c06e1..9941fa5 100644 --- a/src/compiler.rs +++ b/src/compiler.rs @@ -5,7 +5,7 @@ use crate::util::{ decode_character_reference::{decode_named, decode_numeric}, encode::encode, sanitize_uri::sanitize_uri, - span::{from_exit_event, serialize}, + span::{codes as codes_from_span, from_exit_event, serialize}, }; /// Configuration (optional). 
@@ -78,6 +78,7 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St let buffers: &mut Vec<Vec<String>> = &mut vec![vec![]]; let mut atx_opening_sequence_size: Option<usize> = None; let mut atx_heading_buffer: Option<String> = None; + let mut heading_setext_buffer: Option<String> = None; let mut code_flow_seen_data: Option<bool> = None; let mut code_fenced_fences_count: Option<usize> = None; let mut slurp_one_line_ending = false; @@ -102,10 +103,7 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St match event.event_type { EventType::Enter => match token_type { - TokenType::AtxHeading - | TokenType::AtxHeadingSequence - | TokenType::AtxHeadingWhitespace - | TokenType::Autolink + TokenType::Autolink | TokenType::AutolinkEmail | TokenType::AutolinkMarker | TokenType::AutolinkProtocol @@ -134,6 +132,11 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St | TokenType::HardBreakEscapeMarker | TokenType::HardBreakTrailing | TokenType::HardBreakTrailingSpace + | TokenType::HeadingAtx + | TokenType::HeadingAtxSequence + | TokenType::HeadingAtxWhitespace + | TokenType::HeadingSetext + | TokenType::HeadingSetextUnderline | TokenType::HtmlFlowData | TokenType::HtmlTextData | TokenType::LineEnding @@ -143,9 +146,10 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St | TokenType::Whitespace => { // Ignore. 
} - TokenType::AtxHeadingText - | TokenType::CodeFencedFenceInfo - | TokenType::CodeFencedFenceMeta => { + TokenType::CodeFencedFenceInfo + | TokenType::CodeFencedFenceMeta + | TokenType::HeadingAtxText + | TokenType::HeadingSetextText => { buffer(buffers); } TokenType::CodeIndented => { @@ -199,6 +203,7 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St | TokenType::Content | TokenType::HardBreakEscapeMarker | TokenType::HardBreakTrailingSpace + | TokenType::HeadingSetext | TokenType::ThematicBreakSequence | TokenType::ThematicBreakWhitespace | TokenType::Whitespace => { @@ -213,52 +218,6 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St false, ))); } - TokenType::AtxHeading => { - let rank = atx_opening_sequence_size - .expect("`atx_opening_sequence_size` must be set in headings"); - buf_tail_mut(buffers).push(format!("</h{}>", rank)); - atx_opening_sequence_size = None; - atx_heading_buffer = None; - } - // `AtxHeadingWhitespace` is ignored after the opening sequence, - // before the closing sequence, and after the closing sequence. - // But it is used around intermediate sequences. - // `atx_heading_buffer` is set to `Some` by the first `AtxHeadingText`. - // `AtxHeadingSequence` is ignored as the opening and closing sequence, - // but not when intermediate. - TokenType::AtxHeadingSequence | TokenType::AtxHeadingWhitespace => { - if let Some(buf) = atx_heading_buffer { - atx_heading_buffer = Some( - buf.to_string() - + &encode(&serialize( - codes, - &from_exit_event(events, index), - false, - )), - ); - } - - // First fence we see. 
- if None == atx_opening_sequence_size { - let rank = serialize(codes, &from_exit_event(events, index), false).len(); - atx_opening_sequence_size = Some(rank); - buf_tail_mut(buffers).push(format!("<h{}>", rank)); - } - } - TokenType::AtxHeadingText => { - let result = resume(buffers); - - if let Some(ref buf) = atx_heading_buffer { - if !buf.is_empty() { - buf_tail_mut(buffers).push(encode(buf)); - atx_heading_buffer = Some("".to_string()); - } - } else { - atx_heading_buffer = Some("".to_string()); - } - - buf_tail_mut(buffers).push(encode(&result)); - } TokenType::AutolinkEmail => { let slice = serialize(codes, &from_exit_event(events, index), false); let buf = buf_tail_mut(buffers); @@ -394,11 +353,68 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St TokenType::CodeTextLineEnding => { buf_tail_mut(buffers).push(" ".to_string()); } - TokenType::HardBreakEscape | TokenType::HardBreakTrailing => { buf_tail_mut(buffers).push("<br />".to_string()); } + TokenType::HeadingAtx => { + let rank = atx_opening_sequence_size + .expect("`atx_opening_sequence_size` must be set in headings"); + buf_tail_mut(buffers).push(format!("</h{}>", rank)); + atx_opening_sequence_size = None; + atx_heading_buffer = None; + } + // `HeadingAtxWhitespace` is ignored after the opening sequence, + // before the closing sequence, and after the closing sequence. + // But it is used around intermediate sequences. + // `atx_heading_buffer` is set to `Some` by the first `HeadingAtxText`. + // `HeadingAtxSequence` is ignored as the opening and closing sequence, + // but not when intermediate. + TokenType::HeadingAtxSequence | TokenType::HeadingAtxWhitespace => { + if let Some(buf) = atx_heading_buffer { + atx_heading_buffer = Some( + buf.to_string() + + &encode(&serialize( + codes, + &from_exit_event(events, index), + false, + )), + ); + } + + // First fence we see. 
+ if None == atx_opening_sequence_size { + let rank = serialize(codes, &from_exit_event(events, index), false).len(); + atx_opening_sequence_size = Some(rank); + buf_tail_mut(buffers).push(format!("<h{}>", rank)); + } + } + TokenType::HeadingAtxText => { + let result = resume(buffers); + if let Some(ref buf) = atx_heading_buffer { + if !buf.is_empty() { + buf_tail_mut(buffers).push(encode(buf)); + atx_heading_buffer = Some("".to_string()); + } + } else { + atx_heading_buffer = Some("".to_string()); + } + + buf_tail_mut(buffers).push(encode(&result)); + } + TokenType::HeadingSetextText => { + heading_setext_buffer = Some(resume(buffers)); + slurp_one_line_ending = true; + } + TokenType::HeadingSetextUnderline => { + let text = heading_setext_buffer + .expect("`atx_opening_sequence_size` must be set in headings"); + let head = codes_from_span(codes, &from_exit_event(events, index))[0]; + let level: usize = if head == Code::Char('-') { 2 } else { 1 }; + + heading_setext_buffer = None; + buf_tail_mut(buffers).push(format!("<h{}>{}</h{}>", level, text, level)); + } TokenType::HtmlFlow | TokenType::HtmlText => { ignore_encode = false; } diff --git a/src/construct/heading_atx.rs b/src/construct/heading_atx.rs index 1a9ed03..3ff6fea 100644 --- a/src/construct/heading_atx.rs +++ b/src/construct/heading_atx.rs @@ -18,9 +18,11 @@ //! In older markdown versions, this was not required, and headings would form //! without it. //! -//! In markdown, it is also possible to create headings with the setext heading -//! construct. -//! The benefit of setext headings is that their text can include line endings. +//! In markdown, it is also possible to create headings with a +//! [heading (setext)][heading_setext] construct. +//! The benefit of setext headings is that their text can include line endings, +//! and by extensions also hard breaks (e.g., with +//! [hard break (escape)][hard_break_escape]). //! However, their limit is that they cannot form `<h3>` through `<h6>` //! headings. 
//! Due to this limitation, it is recommended to use atx headings. @@ -39,11 +41,11 @@ //! * [*§ 4.2 ATX headings* in `CommonMark`](https://spec.commonmark.org/0.30/#atx-headings) //! //! [flow]: crate::content::flow +//! [heading_setext]: crate::construct::heading_setext +//! [hard_break_escape]: crate::construct::hard_break_escape //! [html]: https://html.spec.whatwg.org/multipage/sections.html#the-h1,-h2,-h3,-h4,-h5,-and-h6-elements //! [wiki-setext]: https://en.wikipedia.org/wiki/Setext //! [atx]: http://www.aaronsw.com/2002/atx/ -//! -//! <!-- To do: link `setext` --> use crate::constant::HEADING_ATX_OPENING_FENCE_SIZE_MAX; use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; @@ -55,8 +57,8 @@ use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; /// ``` pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { if Code::Char('#') == code { - tokenizer.enter(TokenType::AtxHeading); - tokenizer.enter(TokenType::AtxHeadingSequence); + tokenizer.enter(TokenType::HeadingAtx); + tokenizer.enter(TokenType::HeadingAtxSequence); sequence_open(tokenizer, code, 0) } else { (State::Nok, None) @@ -76,7 +78,7 @@ fn sequence_open(tokenizer: &mut Tokenizer, code: Code, rank: usize) -> StateFnR | Code::Char('\t' | '\n' | '\r' | ' ') if rank > 0 => { - tokenizer.exit(TokenType::AtxHeadingSequence); + tokenizer.exit(TokenType::HeadingAtxSequence); at_break(tokenizer, code) } Code::Char('#') if rank < HEADING_ATX_OPENING_FENCE_SIZE_MAX => { @@ -104,19 +106,19 @@ fn sequence_open(tokenizer: &mut Tokenizer, code: Code, rank: usize) -> StateFnR fn at_break(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { match code { Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { - tokenizer.exit(TokenType::AtxHeading); + tokenizer.exit(TokenType::HeadingAtx); (State::Ok, Some(vec![code])) } Code::VirtualSpace | Code::Char('\t' | ' ') => { - tokenizer.enter(TokenType::AtxHeadingWhitespace); + 
tokenizer.enter(TokenType::HeadingAtxWhitespace); whitespace(tokenizer, code) } Code::Char('#') => { - tokenizer.enter(TokenType::AtxHeadingSequence); + tokenizer.enter(TokenType::HeadingAtxSequence); further_sequence(tokenizer, code) } Code::Char(_) => { - tokenizer.enter(TokenType::AtxHeadingText); + tokenizer.enter(TokenType::HeadingAtxText); tokenizer.enter(TokenType::ChunkText); data(tokenizer, code) } @@ -134,7 +136,7 @@ fn further_sequence(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { tokenizer.consume(code); (State::Fn(Box::new(further_sequence)), None) } else { - tokenizer.exit(TokenType::AtxHeadingSequence); + tokenizer.exit(TokenType::HeadingAtxSequence); at_break(tokenizer, code) } } @@ -151,7 +153,7 @@ fn whitespace(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { (State::Fn(Box::new(whitespace)), None) } _ => { - tokenizer.exit(TokenType::AtxHeadingWhitespace); + tokenizer.exit(TokenType::HeadingAtxWhitespace); at_break(tokenizer, code) } } @@ -167,7 +169,7 @@ fn data(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { // Note: `#` for closing sequence must be preceded by whitespace, otherwise it’s just text. Code::None | Code::CarriageReturnLineFeed | Code::Char('\t' | '\n' | '\r' | ' ') => { tokenizer.exit(TokenType::ChunkText); - tokenizer.exit(TokenType::AtxHeadingText); + tokenizer.exit(TokenType::HeadingAtxText); at_break(tokenizer, code) } _ => { diff --git a/src/construct/heading_setext.rs b/src/construct/heading_setext.rs new file mode 100644 index 0000000..8cc4f6d --- /dev/null +++ b/src/construct/heading_setext.rs @@ -0,0 +1,301 @@ +//! Heading (setext) is a construct that occurs in the [flow] content type. +//! +//! They’re formed with the following BNF: +//! +//! ```bnf +//! heading_setext ::= line *(eol line) eol whitespace_optional (1*'-' | 1*'=') whitespace_optional +//! +//! whitespace ::= 1*space_or_tab +//! whitespace_optional ::= [ whitespace ] +//! line ::= code - eol +//! 
eol ::= '\r' | '\r\n' | '\n' +//! ``` +//! +//! Heading (setext) in markdown relates to the `<h1>` and `<h2>` elements in +//! HTML. +//! See [*§ 4.3.6 The `h1`, `h2`, `h3`, `h4`, `h5`, and `h6` elements* in the +//! HTML spec][html] for more info. +//! +//! In markdown, it is also possible to create headings with a +//! [heading (atx)][heading_atx] construct. +//! The benefit of setext headings is that their text can include line endings, +//! and by extension also hard breaks (e.g., with +//! [hard break (escape)][hard_break_escape]). +//! However, their limit is that they cannot form `<h3>` through `<h6>` +//! headings. +//! Due to this limitation, it is recommended to use atx headings. +//! +//! [Thematic breaks][thematic_break] formed with dashes (without whitespace) +//! can also form heading (setext). +//! +//! > 🏛 **Background**: the word *setext* originates from a small markup +//! > language by Ian Feldman from 1991. +//! > See [*§ Setext* on Wikipedia][wiki-setext] for more info. +//! > The word *atx* originates from a tiny markup language by Aaron Swartz +//! > from 2002. +//! > See [*§ atx, the true structured text format* on `aaronsw.com`][atx] for +//! > more info. +//! +//! ## References +//! +//! * [`setext-underline.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/setext-underline.js) +//! * [*§ 4.3 Setext headings* in `CommonMark`](https://spec.commonmark.org/0.30/#setext-headings) +//! +//! [flow]: crate::content::flow +//! [heading_atx]: crate::construct::heading_atx +//! [thematic_break]: crate::construct::thematic_break +//! [hard_break_escape]: crate::construct::hard_break_escape +//! [html]: https://html.spec.whatwg.org/multipage/sections.html#the-h1,-h2,-h3,-h4,-h5,-and-h6-elements +//! [wiki-setext]: https://en.wikipedia.org/wiki/Setext +//! 
[atx]: http://www.aaronsw.com/2002/atx/ + +use crate::constant::TAB_SIZE; +use crate::construct::partial_whitespace::start as whitespace; +use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; +use crate::util::span::from_exit_event; + +/// Kind of underline. +#[derive(Debug, Clone, PartialEq)] +pub enum Kind { + /// Dash (`-`). + Dash, + /// Equals to (`=`). + EqualsTo, +} + +/// Start of a heading (setext). +/// +/// ```markdown +/// |alpha +/// == +/// ``` +pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + unreachable!("expected non-eol/eof"); + } + _ => { + tokenizer.enter(TokenType::HeadingSetext); + tokenizer.enter(TokenType::HeadingSetextText); + tokenizer.enter(TokenType::ChunkText); + text_inside(tokenizer, code) + } + } +} + +/// Inside text. +/// +/// ```markdown +/// al|pha +/// bra|vo +/// == +/// ``` +pub fn text_inside(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::None => (State::Nok, None), + Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + tokenizer.exit(TokenType::ChunkText); + tokenizer.exit(TokenType::HeadingSetextText); + tokenizer.attempt(underline_before, |ok| { + Box::new(if ok { after } else { text_continue }) + })(tokenizer, code) + } + _ => { + tokenizer.consume(code); + (State::Fn(Box::new(text_inside)), None) + } + } +} + +/// At a line ending, not at an underline. +/// +/// ```markdown +/// alpha +/// |bravo +/// == +/// ``` +fn text_continue(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + // Needed to connect the text. + // To do: does it work? 
+ tokenizer.enter(TokenType::HeadingSetextText); + tokenizer.events.pop(); + tokenizer.events.pop(); + + match code { + Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + let next = tokenizer.events.len(); + let previous = next - 2; + + tokenizer.enter(TokenType::LineEnding); + tokenizer.consume(code); + tokenizer.exit(TokenType::LineEnding); + + tokenizer.events[previous].next = Some(next); + tokenizer.events[next].previous = Some(previous); + + ( + State::Fn(Box::new(tokenizer.attempt( + |tokenizer, code| whitespace(tokenizer, code, TokenType::Whitespace), + |_ok| Box::new(text_line_start), + ))), + None, + ) + } + _ => unreachable!("expected eol"), + } +} + +/// At a line ending after whitespace, not at an underline. +/// +/// ```markdown +/// alpha +/// |bravo +/// == +/// ``` +fn text_line_start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + let next = tokenizer.events.len() - 2; + let previous = next - 2; + + // Link the whitespace, if it exists. + if tokenizer.events[next].token_type == TokenType::Whitespace { + tokenizer.events[previous].next = Some(next); + tokenizer.events[next].previous = Some(previous); + } + + match code { + // Blank lines not allowed. + Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => (State::Nok, None), + _ => { + let next = tokenizer.events.len(); + let previous = next - 2; + + tokenizer.enter(TokenType::ChunkText); + + tokenizer.events[previous].next = Some(next); + tokenizer.events[next].previous = Some(previous); + + text_inside(tokenizer, code) + } + } +} + +/// After a heading (setext). +/// +/// ```markdown +/// alpha +/// ==| +/// ``` +fn after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + tokenizer.exit(TokenType::HeadingSetext); + (State::Ok, Some(vec![code])) +} + +/// At a line ending, presumably an underline. 
+/// +/// ```markdown +/// alpha| +/// == +/// ``` +fn underline_before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + tokenizer.enter(TokenType::LineEnding); + tokenizer.consume(code); + tokenizer.exit(TokenType::LineEnding); + (State::Fn(Box::new(underline_start)), None) + } + _ => unreachable!("expected eol"), + } +} + +/// After a line ending, presumably an underline. +/// +/// ```markdown +/// alpha +/// |== +/// ``` +fn underline_start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + tokenizer.attempt( + |tokenizer, code| whitespace(tokenizer, code, TokenType::Whitespace), + |_ok| Box::new(underline_sequence_start), + )(tokenizer, code) +} + +/// After optional whitespace, presumably an underline. +/// +/// ```markdown +/// alpha +/// |== +/// ``` +fn underline_sequence_start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + let tail = tokenizer.events.last(); + let mut prefix = 0; + + if let Some(event) = tail { + if event.token_type == TokenType::Whitespace { + let span = from_exit_event(&tokenizer.events, tokenizer.events.len() - 1); + prefix = span.end_index - span.start_index; + } + } + + // To do: 4+ should be okay if code (indented) is turned off! + if prefix >= TAB_SIZE { + return (State::Nok, None); + } + + match code { + Code::Char(char) if char == '-' || char == '=' => { + let marker = if char == '-' { + Kind::Dash + } else { + Kind::EqualsTo + }; + tokenizer.enter(TokenType::HeadingSetextUnderline); + underline_sequence_inside(tokenizer, code, marker) + } + _ => (State::Nok, None), + } +} + +/// In an underline sequence. 
+/// +/// ```markdown +/// alpha +/// =|= +/// ``` +fn underline_sequence_inside(tokenizer: &mut Tokenizer, code: Code, kind: Kind) -> StateFnResult { + let marker = if kind == Kind::Dash { '-' } else { '=' }; + + match code { + Code::Char(char) if char == marker => { + tokenizer.consume(code); + ( + State::Fn(Box::new(move |tokenizer, code| { + underline_sequence_inside(tokenizer, code, kind) + })), + None, + ) + } + Code::VirtualSpace | Code::Char('\t' | ' ') => tokenizer.attempt( + |tokenizer, code| whitespace(tokenizer, code, TokenType::Whitespace), + |_ok| Box::new(underline_after), + )(tokenizer, code), + _ => underline_after(tokenizer, code), + } +} + +/// After an underline sequence, after optional whitespace. +/// +/// ```markdown +/// alpha +/// ==| +/// ``` +fn underline_after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + tokenizer.exit(TokenType::HeadingSetextUnderline); + (State::Ok, Some(vec![code])) + } + _ => (State::Nok, None), + } +} diff --git a/src/construct/mod.rs b/src/construct/mod.rs index 880d055..ca1149f 100644 --- a/src/construct/mod.rs +++ b/src/construct/mod.rs @@ -29,7 +29,7 @@ //! * [hard break (escape)][hard_break_escape] //! * [hard break (trailing)][hard_break_trailing] //! * [heading (atx)][heading_atx] -//! * heading (setext) +//! * [heading (setext)][heading_setext] //! * [html (flow)][html_flow] //! * [html (text)][html_text] //! * label end @@ -64,6 +64,7 @@ pub mod code_text; pub mod hard_break_escape; pub mod hard_break_trailing; pub mod heading_atx; +pub mod heading_setext; pub mod html_flow; pub mod html_text; pub mod partial_whitespace; diff --git a/src/construct/thematic_break.rs b/src/construct/thematic_break.rs index 7a4f71a..bc41991 100644 --- a/src/construct/thematic_break.rs +++ b/src/construct/thematic_break.rs @@ -24,7 +24,7 @@ //! For these reasons, it is recommend to not use spaces or tabs between the //! 
markers. //! Thematic breaks formed with dashes (without whitespace) can also form -//! setext headings. +//! [heading (setext)][heading_setext]. //! As dashes and underscores frequently occur in natural language and URLs, it //! is recommended to use asterisks for thematic breaks to distinguish from //! such use. @@ -39,9 +39,10 @@ //! * [*§ 4.1 Thematic breaks* in `CommonMark`](https://spec.commonmark.org/0.30/#thematic-breaks) //! //! [flow]: crate::content::flow +//! [heading_setext]: crate::construct::heading_setext //! [html]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-hr-element //! -//! <!-- To do: link `lists`, `setext heading` --> +//! <!-- To do: link `lists` --> use crate::constant::THEMATIC_BREAK_MARKER_COUNT_MIN; use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; diff --git a/src/content/content.rs b/src/content/content.rs index 4660fbe..4ca69ee 100644 --- a/src/content/content.rs +++ b/src/content/content.rs @@ -27,7 +27,7 @@ pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { unreachable!("expected non-eol/eof"); } - _ => paragraph_initial(tokenizer, code) + _ => after_definitions(tokenizer, code) // To do: definition. // _ => tokenizer.attempt(definition, |ok| { // Box::new(if ok { @@ -44,10 +44,26 @@ pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { /// ```markdown /// |asd /// ``` +fn after_definitions(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::None => (State::Ok, None), + Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + unreachable!("to do: handle eol after definition"); + } + _ => paragraph_initial(tokenizer, code), + } +} + +/// Before a paragraph. 
+/// +/// ```markdown +/// |asd +/// ``` fn paragraph_initial(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { match code { - Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { - unreachable!("expected non-eol/eof"); + Code::None => (State::Ok, None), + Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + unreachable!("to do: handle eol after definition"); } _ => { tokenizer.enter(TokenType::Paragraph); diff --git a/src/content/flow.rs b/src/content/flow.rs index 4d2ece1..d7509d7 100644 --- a/src/content/flow.rs +++ b/src/content/flow.rs @@ -14,17 +14,18 @@ //! * [Code (fenced)][crate::construct::code_fenced] //! * [Code (indented)][crate::construct::code_indented] //! * [Heading (atx)][crate::construct::heading_atx] +//! * [Heading (setext)][crate::construct::heading_setext] //! * [HTML (flow)][crate::construct::html_flow] //! * [Thematic break][crate::construct::thematic_break] //! -//! <!-- To do: `setext` in content? Link to content. --> +//! <!-- To do: Link to content. 
--> use crate::constant::TAB_SIZE; use crate::construct::{ blank_line::start as blank_line, code_fenced::start as code_fenced, code_indented::start as code_indented, heading_atx::start as heading_atx, - html_flow::start as html_flow, partial_whitespace::start as whitespace, - thematic_break::start as thematic_break, + heading_setext::start as heading_setext, html_flow::start as html_flow, + partial_whitespace::start as whitespace, thematic_break::start as thematic_break, }; use crate::subtokenize::subtokenize; use crate::tokenizer::{Code, Event, Point, State, StateFnResult, TokenType, Tokenizer}; @@ -144,24 +145,20 @@ pub fn before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { /// |*** /// ``` pub fn before_after_prefix(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { - tokenizer.attempt_2(heading_atx, thematic_break, |ok| { + tokenizer.attempt_3(heading_atx, thematic_break, heading_setext, |ok| { Box::new(if ok { after } else { content_before }) })(tokenizer, code) } -/// Before flow, but not before a heading (atx) or thematic break. -/// -/// At this point, we’re at content (zero or more definitions and zero or one -/// paragraph/setext heading). +/// Before content. /// /// ```markdown /// |qwe /// ``` -// To do: currently only parses a single line. +/// // To do: // - Multiline // - One or more definitions. -// - Setext heading. fn content_before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { match code { Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { @@ -174,12 +171,12 @@ fn content_before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { } } } + /// In content. /// /// ```markdown /// al|pha /// ``` -// To do: lift limitations as documented above. 
fn content(tokenizer: &mut Tokenizer, code: Code, previous: usize) -> StateFnResult { match code { Code::None => content_end(tokenizer, code), diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 0aae480..fc9e177 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -24,10 +24,6 @@ pub enum TokenType { AutolinkMarker, AutolinkProtocol, AutolinkEmail, - AtxHeading, - AtxHeadingSequence, - AtxHeadingWhitespace, - AtxHeadingText, BlankLineEnding, BlankLineWhitespace, CharacterEscape, @@ -58,6 +54,13 @@ pub enum TokenType { HardBreakEscapeMarker, HardBreakTrailing, HardBreakTrailingSpace, + HeadingAtx, + HeadingAtxSequence, + HeadingAtxWhitespace, + HeadingAtxText, + HeadingSetext, + HeadingSetextText, + HeadingSetextUnderline, HtmlFlow, HtmlFlowData, HtmlText, diff --git a/tests/character_escape.rs b/tests/character_escape.rs index c81760d..9e2a5c8 100644 --- a/tests/character_escape.rs +++ b/tests/character_escape.rs @@ -24,7 +24,7 @@ fn character_escape() { assert_eq!( micromark( - "\\*not emphasized*\n\\<br/> not a tag\n\\[not a link](/foo)\n\\`not code`\n1\\. not a list\n\\* not a list\n\\# not a heading\n\\[foo]: /url \"not a reference\"\n\\&ouml; not a character entity" + "\\*not emphasized*\n\\<br/> not a tag\n\\[not a link](/foo)\n\\`not code`\n1\\. not a list\n\\* not a list\n\\# not a heading\n\\[foo]: /url \"not a reference\"\n\\&ouml; not a character entity" ), "<p>*not emphasized*\n&lt;br/&gt; not a tag\n[not a link](/foo)\n`not code`\n1. not a list\n* not a list\n# not a heading\n[foo]: /url &quot;not a reference&quot;\n&amp;ouml; not a character entity</p>", "should escape other constructs" diff --git a/tests/code_fenced.rs b/tests/code_fenced.rs index 82ac088..0e19637 100644 --- a/tests/code_fenced.rs +++ b/tests/code_fenced.rs @@ -136,12 +136,11 @@ fn code_fenced() { "should support interrupting paragraphs" ); - // To do: setext. 
- // assert_eq!( - // micromark("foo\n---\n~~~\nbar\n~~~\n# baz"), - // "<h2>foo</h2>\n<pre><code>bar\n</code></pre>\n<h1>baz</h1>", - // "should support interrupting other content" - // ); + assert_eq!( + micromark("foo\n---\n~~~\nbar\n~~~\n# baz"), + "<h2>foo</h2>\n<pre><code>bar\n</code></pre>\n<h1>baz</h1>", + "should support interrupting other content" + ); assert_eq!( micromark("```ruby\ndef foo(x)\n return 3\nend\n```"), diff --git a/tests/code_indented.rs b/tests/code_indented.rs index f21d761..a7afb21 100644 --- a/tests/code_indented.rs +++ b/tests/code_indented.rs @@ -53,12 +53,11 @@ fn code_indented() { "should support paragraphs directly after indented code" ); - // To do: setext. - // assert_eq!( - // micromark("# Heading\n foo\nHeading\n------\n foo\n----"), - // "<h1>Heading</h1>\n<pre><code>foo\n</code></pre>\n<h2>Heading</h2>\n<pre><code>foo\n</code></pre>\n<hr />", - // "should mix w/ other content" - // ); + assert_eq!( + micromark("# Heading\n foo\nHeading\n------\n foo\n----"), + "<h1>Heading</h1>\n<pre><code>foo\n</code></pre>\n<h2>Heading</h2>\n<pre><code>foo\n</code></pre>\n<hr />", + "should mix w/ other content" + ); assert_eq!( micromark(" foo\n bar"), diff --git a/tests/heading_setext.rs b/tests/heading_setext.rs new file mode 100644 index 0000000..92a5b43 --- /dev/null +++ b/tests/heading_setext.rs @@ -0,0 +1,279 @@ +extern crate micromark; +use micromark::micromark; + +#[test] +fn heading_setext() { + // To do: emphasis. + // assert_eq!( + // micromark("Foo *bar*\n========="), + // "<h1>Foo <em>bar</em></h1>", + // "should support a heading w/ an equals to (rank of 1)" + // ); + + // To do: emphasis. + // assert_eq!( + // micromark("Foo *bar*\n---------"), + // "<h2>Foo <em>bar</em></h2>", + // "should support a heading w/ a dash (rank of 2)" + // ); + + // To do: emphasis. 
+ // assert_eq!( + // micromark("Foo *bar\nbaz*\n===="), + // "<h1>Foo <em>bar\nbaz</em></h1>", + // "should support line endings in setext headings" + // ); + + // To do: emphasis, trim. + // assert_eq!( + // micromark(" Foo *bar\nbaz*\t\n===="), + // "<h1>Foo <em>bar\nbaz</em></h1>", + // "should not include initial and final whitespace around content" + // ); + + assert_eq!( + micromark("Foo\n-------------------------"), + "<h2>Foo</h2>", + "should support long underlines" + ); + + assert_eq!( + micromark("Foo\n="), + "<h1>Foo</h1>", + "should support short underlines" + ); + + assert_eq!( + micromark(" Foo\n ==="), + "<h1>Foo</h1>", + "should support indented content w/ 1 space" + ); + + assert_eq!( + micromark("  Foo\n---"), + "<h2>Foo</h2>", + "should support indented content w/ 2 spaces" + ); + + assert_eq!( + micromark("   Foo\n---"), + "<h2>Foo</h2>", + "should support indented content w/ 3 spaces" + ); + + assert_eq!( + micromark("    Foo\n    ---"), + "<pre><code>Foo\n---\n</code></pre>", + "should not support too much indented content (1)" + ); + + assert_eq!( + micromark("    Foo\n---"), + "<pre><code>Foo\n</code></pre>\n<hr />", + "should not support too much indented content (2)" + ); + + assert_eq!( + micromark("Foo\n ---- "), + "<h2>Foo</h2>", + "should support initial and final whitespace around the underline" + ); + + assert_eq!( + micromark("Foo\n ="), + "<h1>Foo</h1>", + "should support whitespace before underline" + ); + + // To do: trim paragraphs. + // assert_eq!( + // micromark("Foo\n    ="), + // "<p>Foo\n=</p>", + // "should not support too much whitespace before underline (1)" + // ); + + // To do: trim paragraphs. 
+ // assert_eq!( + // micromark("Foo\n\t="), + // "<p>Foo\n=</p>", + // "should not support too much whitespace before underline (2)" + // ); + + assert_eq!( + micromark("Foo\n= ="), + "<p>Foo\n= =</p>", + "should not support whitespace in the underline (1)" + ); + + assert_eq!( + micromark("Foo\n--- -"), + "<p>Foo</p>\n<hr />", + "should not support whitespace in the underline (2)" + ); + + // To do: trim setext. + // assert_eq!( + // micromark("Foo  \n-----"), + // "<h2>Foo</h2>", + // "should not support a hard break w/ spaces at the end" + // ); + + assert_eq!( + micromark("Foo\\\n-----"), + "<h2>Foo\\</h2>", + "should not support a hard break w/ backslash at the end" + ); + + assert_eq!( + micromark("`Foo\n----\n`"), + "<h2>`Foo</h2>\n<p>`</p>", + "should precede over inline constructs (1)" + ); + + assert_eq!( + micromark("<a title=\"a lot\n---\nof dashes\"/>"), + "<h2>&lt;a title=&quot;a lot</h2>\n<p>of dashes&quot;/&gt;</p>", + "should precede over inline constructs (2)" + ); + + // To do: block quote. + // assert_eq!( + // micromark("> Foo\n---"), + // "<blockquote>\n<p>Foo</p>\n</blockquote>\n<hr />", + // "should not allow underline to be lazy (1)" + // ); + + // To do: block quote. + // assert_eq!( + // micromark("> foo\nbar\n==="), + // "<blockquote>\n<p>foo\nbar\n===</p>\n</blockquote>", + // "should not allow underline to be lazy (2)" + // ); + + // To do: list. 
+ // assert_eq!( + // micromark("- Foo\n---"), + // "<ul>\n<li>Foo</li>\n</ul>\n<hr />", + // "should not allow underline to be lazy (3)" + // ); + + assert_eq!( + micromark("Foo\nBar\n---"), + "<h2>Foo\nBar</h2>", + "should support line endings in setext headings" + ); + + assert_eq!( + micromark("---\nFoo\n---\nBar\n---\nBaz"), + "<hr />\n<h2>Foo</h2>\n<h2>Bar</h2>\n<p>Baz</p>", + "should support adjacent setext headings" + ); + + assert_eq!( + micromark("\n===="), + "<p>====</p>", + "should not support empty setext headings" + ); + + assert_eq!( + micromark("---\n---"), + "<hr />\n<hr />", + "should prefer other constructs over setext headings (1)" + ); + + // To do: list. + // assert_eq!( + // micromark("- foo\n-----"), + // "<ul>\n<li>foo</li>\n</ul>\n<hr />", + // "should prefer other constructs over setext headings (2)" + // ); + + assert_eq!( + micromark("    foo\n---"), + "<pre><code>foo\n</code></pre>\n<hr />", + "should prefer other constructs over setext headings (3)" + ); + + // To do: block quote. 
+ // assert_eq!( + // micromark("> foo\n-----"), + // "<blockquote>\n<p>foo</p>\n</blockquote>\n<hr />", + // "should prefer other constructs over setext headings (4)" + // ); + + assert_eq!( + micromark("\\> foo\n------"), + "<h2>&gt; foo</h2>", + "should support starting w/ character escapes" + ); + + assert_eq!( + micromark("Foo\nbar\n---\nbaz"), + "<h2>Foo\nbar</h2>\n<p>baz</p>", + "paragraph and heading interplay (1)" + ); + + assert_eq!( + micromark("Foo\n\nbar\n---\nbaz"), + "<p>Foo</p>\n<h2>bar</h2>\n<p>baz</p>", + "paragraph and heading interplay (2)" + ); + + assert_eq!( + micromark("Foo\nbar\n\n---\n\nbaz"), + "<p>Foo\nbar</p>\n<hr />\n<p>baz</p>", + "paragraph and heading interplay (3)" + ); + + assert_eq!( + micromark("Foo\nbar\n* * *\nbaz"), + "<p>Foo\nbar</p>\n<hr />\n<p>baz</p>", + "paragraph and heading interplay (4)" + ); + + assert_eq!( + micromark("Foo\nbar\n\\---\nbaz"), + "<p>Foo\nbar\n---\nbaz</p>", + "paragraph and heading interplay (5)" + ); + + // Extra: + assert_eq!( + micromark("Foo  \nbar\n-----"), + "<h2>Foo<br />\nbar</h2>", + "should support a hard break w/ spaces in between" + ); + + assert_eq!( + micromark("Foo\\\nbar\n-----"), + "<h2>Foo<br />\nbar</h2>", + "should support a hard break w/ backslash in between" + ); + + assert_eq!( + micromark("a\n-\nb"), + "<h2>a</h2>\n<p>b</p>", + "should prefer a setext heading over an interrupting list" + ); + + // To do: block quote. + // assert_eq!( + // micromark("> ===\na"), + // "<blockquote>\n<p>===\na</p>\n</blockquote>", + // "should not support lazyness (1)" + // ); + + // To do: block quote. + // assert_eq!( + // micromark("> a\n==="), + // "<blockquote>\n<p>a\n===</p>\n</blockquote>", + // "should not support lazyness (2)" + // ); + + // To do: turning things off. 
+ // assert_eq!( + // micromark("a\n-", {extensions: [{disable: {null: ["setextUnderline"]}}]}), + // "<p>a\n-</p>", + // "should support turning off setext underlines" + // ); +} diff --git a/tests/thematic_break.rs b/tests/thematic_break.rs index 3dc7b5d..cbc84e0 100644 --- a/tests/thematic_break.rs +++ b/tests/thematic_break.rs @@ -144,12 +144,11 @@ fn thematic_break() { "should support thematic breaks interrupting paragraphs" ); - // To do: setext. - // assert_eq!( - // micromark("Foo\n---\nbar"), - // "<h2>Foo</h2>\n<p>bar</p>", - // "should not support thematic breaks w/ dashes interrupting paragraphs (setext heading)" - // ); + assert_eq!( + micromark("Foo\n---\nbar"), + "<h2>Foo</h2>\n<p>bar</p>", + "should not support thematic breaks w/ dashes interrupting paragraphs (setext heading)" + ); // To do: list. // assert_eq!( |