diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/compiler.rs | 126 | ||||
-rw-r--r-- | src/construct/heading_atx.rs | 32 | ||||
-rw-r--r-- | src/construct/heading_setext.rs | 301 | ||||
-rw-r--r-- | src/construct/mod.rs | 3 | ||||
-rw-r--r-- | src/construct/thematic_break.rs | 5 | ||||
-rw-r--r-- | src/content/content.rs | 22 | ||||
-rw-r--r-- | src/content/flow.rs | 19 | ||||
-rw-r--r-- | src/tokenizer.rs | 11 |
8 files changed, 428 insertions, 91 deletions
diff --git a/src/compiler.rs b/src/compiler.rs index 50c06e1..9941fa5 100644 --- a/src/compiler.rs +++ b/src/compiler.rs @@ -5,7 +5,7 @@ use crate::util::{ decode_character_reference::{decode_named, decode_numeric}, encode::encode, sanitize_uri::sanitize_uri, - span::{from_exit_event, serialize}, + span::{codes as codes_from_span, from_exit_event, serialize}, }; /// Configuration (optional). @@ -78,6 +78,7 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St let buffers: &mut Vec<Vec<String>> = &mut vec![vec![]]; let mut atx_opening_sequence_size: Option<usize> = None; let mut atx_heading_buffer: Option<String> = None; + let mut heading_setext_buffer: Option<String> = None; let mut code_flow_seen_data: Option<bool> = None; let mut code_fenced_fences_count: Option<usize> = None; let mut slurp_one_line_ending = false; @@ -102,10 +103,7 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St match event.event_type { EventType::Enter => match token_type { - TokenType::AtxHeading - | TokenType::AtxHeadingSequence - | TokenType::AtxHeadingWhitespace - | TokenType::Autolink + TokenType::Autolink | TokenType::AutolinkEmail | TokenType::AutolinkMarker | TokenType::AutolinkProtocol @@ -134,6 +132,11 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St | TokenType::HardBreakEscapeMarker | TokenType::HardBreakTrailing | TokenType::HardBreakTrailingSpace + | TokenType::HeadingAtx + | TokenType::HeadingAtxSequence + | TokenType::HeadingAtxWhitespace + | TokenType::HeadingSetext + | TokenType::HeadingSetextUnderline | TokenType::HtmlFlowData | TokenType::HtmlTextData | TokenType::LineEnding @@ -143,9 +146,10 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St | TokenType::Whitespace => { // Ignore. } - TokenType::AtxHeadingText - | TokenType::CodeFencedFenceInfo - | TokenType::CodeFencedFenceMeta => { + TokenType::CodeFencedFenceInfo + | TokenType::CodeFencedFenceMeta + | TokenType::HeadingAtxText + | TokenType::HeadingSetextText => { buffer(buffers); } TokenType::CodeIndented => { @@ -199,6 +203,7 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St | TokenType::Content | TokenType::HardBreakEscapeMarker | TokenType::HardBreakTrailingSpace + | TokenType::HeadingSetext | TokenType::ThematicBreakSequence | TokenType::ThematicBreakWhitespace | TokenType::Whitespace => { @@ -213,52 +218,6 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St false, ))); } - TokenType::AtxHeading => { - let rank = atx_opening_sequence_size - .expect("`atx_opening_sequence_size` must be set in headings"); - buf_tail_mut(buffers).push(format!("</h{}>", rank)); - atx_opening_sequence_size = None; - atx_heading_buffer = None; - } - // `AtxHeadingWhitespace` is ignored after the opening sequence, - // before the closing sequence, and after the closing sequence. - // But it is used around intermediate sequences. - // `atx_heading_buffer` is set to `Some` by the first `AtxHeadingText`. - // `AtxHeadingSequence` is ignored as the opening and closing sequence, - // but not when intermediate. - TokenType::AtxHeadingSequence | TokenType::AtxHeadingWhitespace => { - if let Some(buf) = atx_heading_buffer { - atx_heading_buffer = Some( - buf.to_string() - + &encode(&serialize( - codes, - &from_exit_event(events, index), - false, - )), - ); - } - - // First fence we see. - if None == atx_opening_sequence_size { - let rank = serialize(codes, &from_exit_event(events, index), false).len(); - atx_opening_sequence_size = Some(rank); - buf_tail_mut(buffers).push(format!("<h{}>", rank)); - } - } - TokenType::AtxHeadingText => { - let result = resume(buffers); - - if let Some(ref buf) = atx_heading_buffer { - if !buf.is_empty() { - buf_tail_mut(buffers).push(encode(buf)); - atx_heading_buffer = Some("".to_string()); - } - } else { - atx_heading_buffer = Some("".to_string()); - } - - buf_tail_mut(buffers).push(encode(&result)); - } TokenType::AutolinkEmail => { let slice = serialize(codes, &from_exit_event(events, index), false); let buf = buf_tail_mut(buffers); @@ -394,11 +353,68 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St TokenType::CodeTextLineEnding => { buf_tail_mut(buffers).push(" ".to_string()); } - TokenType::HardBreakEscape | TokenType::HardBreakTrailing => { buf_tail_mut(buffers).push("<br />".to_string()); } + TokenType::HeadingAtx => { + let rank = atx_opening_sequence_size + .expect("`atx_opening_sequence_size` must be set in headings"); + buf_tail_mut(buffers).push(format!("</h{}>", rank)); + atx_opening_sequence_size = None; + atx_heading_buffer = None; + } + // `HeadingAtxWhitespace` is ignored after the opening sequence, + // before the closing sequence, and after the closing sequence. + // But it is used around intermediate sequences. + // `atx_heading_buffer` is set to `Some` by the first `HeadingAtxText`. + // `HeadingAtxSequence` is ignored as the opening and closing sequence, + // but not when intermediate. + TokenType::HeadingAtxSequence | TokenType::HeadingAtxWhitespace => { + if let Some(buf) = atx_heading_buffer { + atx_heading_buffer = Some( + buf.to_string() + + &encode(&serialize( + codes, + &from_exit_event(events, index), + false, + )), + ); + } + + // First fence we see. + if None == atx_opening_sequence_size { + let rank = serialize(codes, &from_exit_event(events, index), false).len(); + atx_opening_sequence_size = Some(rank); + buf_tail_mut(buffers).push(format!("<h{}>", rank)); + } + } + TokenType::HeadingAtxText => { + let result = resume(buffers); + if let Some(ref buf) = atx_heading_buffer { + if !buf.is_empty() { + buf_tail_mut(buffers).push(encode(buf)); + atx_heading_buffer = Some("".to_string()); + } + } else { + atx_heading_buffer = Some("".to_string()); + } + + buf_tail_mut(buffers).push(encode(&result)); + } + TokenType::HeadingSetextText => { + heading_setext_buffer = Some(resume(buffers)); + slurp_one_line_ending = true; + } + TokenType::HeadingSetextUnderline => { + let text = heading_setext_buffer + .expect("`atx_opening_sequence_size` must be set in headings"); + let head = codes_from_span(codes, &from_exit_event(events, index))[0]; + let level: usize = if head == Code::Char('-') { 2 } else { 1 }; + + heading_setext_buffer = None; + buf_tail_mut(buffers).push(format!("<h{}>{}</h{}>", level, text, level)); + } TokenType::HtmlFlow | TokenType::HtmlText => { ignore_encode = false; } diff --git a/src/construct/heading_atx.rs b/src/construct/heading_atx.rs index 1a9ed03..3ff6fea 100644 --- a/src/construct/heading_atx.rs +++ b/src/construct/heading_atx.rs @@ -18,9 +18,11 @@ //! In older markdown versions, this was not required, and headings would form //! without it. //! -//! In markdown, it is also possible to create headings with the setext heading -//! construct. -//! The benefit of setext headings is that their text can include line endings. +//! In markdown, it is also possible to create headings with a +//! [heading (setext)][heading_setext] construct. +//! The benefit of setext headings is that their text can include line endings, +//! and by extensions also hard breaks (e.g., with +//! [hard break (escape)][hard_break_escape]). //! However, their limit is that they cannot form `<h3>` through `<h6>` //! headings. //! Due to this limitation, it is recommended to use atx headings. @@ -39,11 +41,11 @@ //! * [*§ 4.2 ATX headings* in `CommonMark`](https://spec.commonmark.org/0.30/#atx-headings) //! //! [flow]: crate::content::flow +//! [heading_setext]: crate::construct::heading_setext +//! [hard_break_escape]: crate::construct::hard_break_escape //! [html]: https://html.spec.whatwg.org/multipage/sections.html#the-h1,-h2,-h3,-h4,-h5,-and-h6-elements //! [wiki-setext]: https://en.wikipedia.org/wiki/Setext //! [atx]: http://www.aaronsw.com/2002/atx/ -//! -//! <!-- To do: link `setext` --> use crate::constant::HEADING_ATX_OPENING_FENCE_SIZE_MAX; use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; @@ -55,8 +57,8 @@ use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; /// ``` pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { if Code::Char('#') == code { - tokenizer.enter(TokenType::AtxHeading); - tokenizer.enter(TokenType::AtxHeadingSequence); + tokenizer.enter(TokenType::HeadingAtx); + tokenizer.enter(TokenType::HeadingAtxSequence); sequence_open(tokenizer, code, 0) } else { (State::Nok, None) @@ -76,7 +78,7 @@ fn sequence_open(tokenizer: &mut Tokenizer, code: Code, rank: usize) -> StateFnR | Code::Char('\t' | '\n' | '\r' | ' ') if rank > 0 => { - tokenizer.exit(TokenType::AtxHeadingSequence); + tokenizer.exit(TokenType::HeadingAtxSequence); at_break(tokenizer, code) } Code::Char('#') if rank < HEADING_ATX_OPENING_FENCE_SIZE_MAX => { @@ -104,19 +106,19 @@ fn sequence_open(tokenizer: &mut Tokenizer, code: Code, rank: usize) -> StateFnR fn at_break(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { match code { Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { - tokenizer.exit(TokenType::AtxHeading); + tokenizer.exit(TokenType::HeadingAtx); (State::Ok, Some(vec![code])) } Code::VirtualSpace | Code::Char('\t' | ' ') => { - tokenizer.enter(TokenType::AtxHeadingWhitespace); + tokenizer.enter(TokenType::HeadingAtxWhitespace); whitespace(tokenizer, code) } Code::Char('#') => { - tokenizer.enter(TokenType::AtxHeadingSequence); + tokenizer.enter(TokenType::HeadingAtxSequence); further_sequence(tokenizer, code) } Code::Char(_) => { - tokenizer.enter(TokenType::AtxHeadingText); + tokenizer.enter(TokenType::HeadingAtxText); tokenizer.enter(TokenType::ChunkText); data(tokenizer, code) } @@ -134,7 +136,7 @@ fn further_sequence(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { tokenizer.consume(code); (State::Fn(Box::new(further_sequence)), None) } else { - tokenizer.exit(TokenType::AtxHeadingSequence); + tokenizer.exit(TokenType::HeadingAtxSequence); at_break(tokenizer, code) } } @@ -151,7 +153,7 @@ fn whitespace(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { (State::Fn(Box::new(whitespace)), None) } _ => { - tokenizer.exit(TokenType::AtxHeadingWhitespace); + tokenizer.exit(TokenType::HeadingAtxWhitespace); at_break(tokenizer, code) } } @@ -167,7 +169,7 @@ fn data(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { // Note: `#` for closing sequence must be preceded by whitespace, otherwise it’s just text. Code::None | Code::CarriageReturnLineFeed | Code::Char('\t' | '\n' | '\r' | ' ') => { tokenizer.exit(TokenType::ChunkText); - tokenizer.exit(TokenType::AtxHeadingText); + tokenizer.exit(TokenType::HeadingAtxText); at_break(tokenizer, code) } _ => { diff --git a/src/construct/heading_setext.rs b/src/construct/heading_setext.rs new file mode 100644 index 0000000..8cc4f6d --- /dev/null +++ b/src/construct/heading_setext.rs @@ -0,0 +1,301 @@ +//! Heading (setext) is a construct that occurs in the [flow] content type. +//! +//! They’re formed with the following BNF: +//! +//! ```bnf +//! heading_setext ::= line *(eol line) eol whitespace_optional (1*'-' | 1*'=') whitespace_optional +//! +//! whitespace ::= 1*space_or_tab +//! whitespace_optional ::= [ whitespace ] +//! line ::= code - eol +//! eol ::= '\r' | '\r\n' | '\n' +//! ``` +//! +//! Heading (setext) in markdown relates to the `<h1>` and `<h2>` elements in +//! HTML. +//! See [*§ 4.3.6 The `h1`, `h2`, `h3`, `h4`, `h5`, and `h6` elements* in the +//! HTML spec][html] for more info. +//! +//! In markdown, it is also possible to create headings with a +//! [heading (atx)][heading_atx] construct. +//! The benefit of setext headings is that their text can include line endings, +//! and by extensions also hard breaks (e.g., with +//! [hard break (escape)][hard_break_escape]). +//! However, their limit is that they cannot form `<h3>` through `<h6>` +//! headings. +//! Due to this limitation, it is recommended to use atx headings. +//! +//! [Thematic breaks][thematic_break] formed with dashes (without whitespace) +//! can also form heading (setext). +//! +//! > 🏛 **Background**: the word *setext* originates from a small markup +//! > language by Ian Feldman from 1991. +//! > See [*§ Setext* on Wikipedia][wiki-setext] for more info. +//! > The word *atx* originates from a tiny markup language by Aaron Swartz +//! > from 2002. +//! > See [*§ atx, the true structured text format* on `aaronsw.com`][atx] for +//! > more info. +//! +//! ## References +//! +//! * [`setext-underline.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/setext-underline.js) +//! * [*§ 4.3 Setext headings* in `CommonMark`](https://spec.commonmark.org/0.30/#setext-headings) +//! +//! [flow]: crate::content::flow +//! [heading_atx]: crate::construct::heading_atx +//! [thematic_break]: crate::construct::thematic_break +//! [hard_break_escape]: crate::construct::hard_break_escape +//! [html]: https://html.spec.whatwg.org/multipage/sections.html#the-h1,-h2,-h3,-h4,-h5,-and-h6-elements +//! [wiki-setext]: https://en.wikipedia.org/wiki/Setext +//! [atx]: http://www.aaronsw.com/2002/atx/ + +use crate::constant::TAB_SIZE; +use crate::construct::partial_whitespace::start as whitespace; +use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; +use crate::util::span::from_exit_event; + +/// Kind of underline. +#[derive(Debug, Clone, PartialEq)] +pub enum Kind { + /// Grave accent (tick) code. + Dash, + /// Tilde code. + EqualsTo, +} + +/// Start of a heading (setext). +/// +/// ```markdown +/// |alpha +/// == +/// ``` +pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + unreachable!("expected non-eol/eof"); + } + _ => { + tokenizer.enter(TokenType::HeadingSetext); + tokenizer.enter(TokenType::HeadingSetextText); + tokenizer.enter(TokenType::ChunkText); + text_inside(tokenizer, code) + } + } +} + +/// Inside text. +/// +/// ```markdown +/// al|pha +/// bra|vo +/// == +/// ``` +pub fn text_inside(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::None => (State::Nok, None), + Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + tokenizer.exit(TokenType::ChunkText); + tokenizer.exit(TokenType::HeadingSetextText); + tokenizer.attempt(underline_before, |ok| { + Box::new(if ok { after } else { text_continue }) + })(tokenizer, code) + } + _ => { + tokenizer.consume(code); + (State::Fn(Box::new(text_inside)), None) + } + } +} + +/// At a line ending, not at an underline. +/// +/// ```markdown +/// alpha +/// |bravo +/// == +/// ``` +fn text_continue(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + // Needed to connect the text. + // To do: does it work? + tokenizer.enter(TokenType::HeadingSetextText); + tokenizer.events.pop(); + tokenizer.events.pop(); + + match code { + Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + let next = tokenizer.events.len(); + let previous = next - 2; + + tokenizer.enter(TokenType::LineEnding); + tokenizer.consume(code); + tokenizer.exit(TokenType::LineEnding); + + tokenizer.events[previous].next = Some(next); + tokenizer.events[next].previous = Some(previous); + + ( + State::Fn(Box::new(tokenizer.attempt( + |tokenizer, code| whitespace(tokenizer, code, TokenType::Whitespace), + |_ok| Box::new(text_line_start), + ))), + None, + ) + } + _ => unreachable!("expected eol"), + } +} + +/// At a line ending after whitespace, not at an underline. +/// +/// ```markdown +/// alpha +/// |bravo +/// == +/// ``` +fn text_line_start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + let next = tokenizer.events.len() - 2; + let previous = next - 2; + + // Link the whitespace, if it exists. + if tokenizer.events[next].token_type == TokenType::Whitespace { + tokenizer.events[previous].next = Some(next); + tokenizer.events[next].previous = Some(previous); + } + + match code { + // Blank lines not allowed. + Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => (State::Nok, None), + _ => { + let next = tokenizer.events.len(); + let previous = next - 2; + + tokenizer.enter(TokenType::ChunkText); + + tokenizer.events[previous].next = Some(next); + tokenizer.events[next].previous = Some(previous); + + text_inside(tokenizer, code) + } + } +} + +/// After a heading (setext). +/// +/// ```markdown +/// alpha +/// ==| +/// ``` +fn after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + tokenizer.exit(TokenType::HeadingSetext); + (State::Ok, Some(vec![code])) +} + +/// At a line ending, presumably an underline. +/// +/// ```markdown +/// alpha| +/// == +/// ``` +fn underline_before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + tokenizer.enter(TokenType::LineEnding); + tokenizer.consume(code); + tokenizer.exit(TokenType::LineEnding); + (State::Fn(Box::new(underline_start)), None) + } + _ => unreachable!("expected eol"), + } +} + +/// After a line ending, presumably an underline. +/// +/// ```markdown +/// alpha +/// |== +/// ``` +fn underline_start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + tokenizer.attempt( + |tokenizer, code| whitespace(tokenizer, code, TokenType::Whitespace), + |_ok| Box::new(underline_sequence_start), + )(tokenizer, code) +} + +/// After optional whitespace, presumably an underline. +/// +/// ```markdown +/// alpha +/// |== +/// ``` +fn underline_sequence_start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + let tail = tokenizer.events.last(); + let mut prefix = 0; + + if let Some(event) = tail { + if event.token_type == TokenType::Whitespace { + let span = from_exit_event(&tokenizer.events, tokenizer.events.len() - 1); + prefix = span.end_index - span.start_index; + } + } + + // To do: 4+ should be okay if code (indented) is turned off! + if prefix >= TAB_SIZE { + return (State::Nok, None); + } + + match code { + Code::Char(char) if char == '-' || char == '=' => { + let marker = if char == '-' { + Kind::Dash + } else { + Kind::EqualsTo + }; + tokenizer.enter(TokenType::HeadingSetextUnderline); + underline_sequence_inside(tokenizer, code, marker) + } + _ => (State::Nok, None), + } +} + +/// In an underline sequence. +/// +/// ```markdown +/// alpha +/// =|= +/// ``` +fn underline_sequence_inside(tokenizer: &mut Tokenizer, code: Code, kind: Kind) -> StateFnResult { + let marker = if kind == Kind::Dash { '-' } else { '=' }; + + match code { + Code::Char(char) if char == marker => { + tokenizer.consume(code); + ( + State::Fn(Box::new(move |tokenizer, code| { + underline_sequence_inside(tokenizer, code, kind) + })), + None, + ) + } + Code::VirtualSpace | Code::Char('\t' | ' ') => tokenizer.attempt( + |tokenizer, code| whitespace(tokenizer, code, TokenType::Whitespace), + |_ok| Box::new(underline_after), + )(tokenizer, code), + _ => underline_after(tokenizer, code), + } +} + +/// After an underline sequence, after optional whitespace. +/// +/// ```markdown +/// alpha +/// ==| +/// ``` +fn underline_after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + tokenizer.exit(TokenType::HeadingSetextUnderline); + (State::Ok, Some(vec![code])) + } + _ => (State::Nok, None), + } +} diff --git a/src/construct/mod.rs b/src/construct/mod.rs index 880d055..ca1149f 100644 --- a/src/construct/mod.rs +++ b/src/construct/mod.rs @@ -29,7 +29,7 @@ //! * [hard break (escape)][hard_break_escape] //! * [hard break (trailing)][hard_break_trailing] //! * [heading (atx)][heading_atx] -//! * heading (setext) +//! * [heading (setext)][heading_setext] //! * [html (flow)][html_flow] //! * [html (text)][html_text] //! * label end @@ -64,6 +64,7 @@ pub mod code_text; pub mod hard_break_escape; pub mod hard_break_trailing; pub mod heading_atx; +pub mod heading_setext; pub mod html_flow; pub mod html_text; pub mod partial_whitespace; diff --git a/src/construct/thematic_break.rs b/src/construct/thematic_break.rs index 7a4f71a..bc41991 100644 --- a/src/construct/thematic_break.rs +++ b/src/construct/thematic_break.rs @@ -24,7 +24,7 @@ //! For these reasons, it is recommend to not use spaces or tabs between the //! markers. //! Thematic breaks formed with dashes (without whitespace) can also form -//! setext headings. +//! [heading (setext)][heading_setext]. //! As dashes and underscores frequently occur in natural language and URLs, it //! is recommended to use asterisks for thematic breaks to distinguish from //! such use. @@ -39,9 +39,10 @@ //! * [*§ 4.1 Thematic breaks* in `CommonMark`](https://spec.commonmark.org/0.30/#thematic-breaks) //! //! [flow]: crate::content::flow +//! [heading_setext]: crate::construct::heading_setext //! [html]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-hr-element //! -//! <!-- To do: link `lists`, `setext heading` --> +//! <!-- To do: link `lists` --> use crate::constant::THEMATIC_BREAK_MARKER_COUNT_MIN; use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; diff --git a/src/content/content.rs b/src/content/content.rs index 4660fbe..4ca69ee 100644 --- a/src/content/content.rs +++ b/src/content/content.rs @@ -27,7 +27,7 @@ pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { unreachable!("expected non-eol/eof"); } - _ => paragraph_initial(tokenizer, code) + _ => after_definitions(tokenizer, code) // To do: definition. // _ => tokenizer.attempt(definition, |ok| { // Box::new(if ok { @@ -44,10 +44,26 @@ pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { /// ```markdown /// |asd /// ``` +fn after_definitions(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::None => (State::Ok, None), + Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + unreachable!("to do: handle eol after definition"); + } + _ => paragraph_initial(tokenizer, code), + } +} + +/// Before a paragraph. +/// +/// ```markdown +/// |asd +/// ``` fn paragraph_initial(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { match code { - Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { - unreachable!("expected non-eol/eof"); + Code::None => (State::Ok, None), + Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + unreachable!("to do: handle eol after definition"); } _ => { tokenizer.enter(TokenType::Paragraph); diff --git a/src/content/flow.rs b/src/content/flow.rs index 4d2ece1..d7509d7 100644 --- a/src/content/flow.rs +++ b/src/content/flow.rs @@ -14,17 +14,18 @@ //! * [Code (fenced)][crate::construct::code_fenced] //! * [Code (indented)][crate::construct::code_indented] //! * [Heading (atx)][crate::construct::heading_atx] +//! * [Heading (setext)][crate::construct::heading_setext] //! * [HTML (flow)][crate::construct::html_flow] //! * [Thematic break][crate::construct::thematic_break] //! -//! <!-- To do: `setext` in content? Link to content. --> +//! <!-- To do: Link to content. --> use crate::constant::TAB_SIZE; use crate::construct::{ blank_line::start as blank_line, code_fenced::start as code_fenced, code_indented::start as code_indented, heading_atx::start as heading_atx, - html_flow::start as html_flow, partial_whitespace::start as whitespace, - thematic_break::start as thematic_break, + heading_setext::start as heading_setext, html_flow::start as html_flow, + partial_whitespace::start as whitespace, thematic_break::start as thematic_break, }; use crate::subtokenize::subtokenize; use crate::tokenizer::{Code, Event, Point, State, StateFnResult, TokenType, Tokenizer}; @@ -144,24 +145,20 @@ pub fn before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { /// |*** /// ``` pub fn before_after_prefix(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { - tokenizer.attempt_2(heading_atx, thematic_break, |ok| { + tokenizer.attempt_3(heading_atx, thematic_break, heading_setext, |ok| { Box::new(if ok { after } else { content_before }) })(tokenizer, code) } -/// Before flow, but not before a heading (atx) or thematic break. -/// -/// At this point, we’re at content (zero or more definitions and zero or one -/// paragraph/setext heading). +/// Before content. /// /// ```markdown /// |qwe /// ``` -// To do: currently only parses a single line. +/// // To do: // - Multiline // - One or more definitions. -// - Setext heading. fn content_before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { match code { Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { @@ -174,12 +171,12 @@ fn content_before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { } } } + /// In content. /// /// ```markdown /// al|pha /// ``` -// To do: lift limitations as documented above. fn content(tokenizer: &mut Tokenizer, code: Code, previous: usize) -> StateFnResult { match code { Code::None => content_end(tokenizer, code), diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 0aae480..fc9e177 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -24,10 +24,6 @@ pub enum TokenType { AutolinkMarker, AutolinkProtocol, AutolinkEmail, - AtxHeading, - AtxHeadingSequence, - AtxHeadingWhitespace, - AtxHeadingText, BlankLineEnding, BlankLineWhitespace, CharacterEscape, @@ -58,6 +54,13 @@ pub enum TokenType { HardBreakEscapeMarker, HardBreakTrailing, HardBreakTrailingSpace, + HeadingAtx, + HeadingAtxSequence, + HeadingAtxWhitespace, + HeadingAtxText, + HeadingSetext, + HeadingSetextText, + HeadingSetextUnderline, HtmlFlow, HtmlFlowData, HtmlText, |