Diffstat
-rw-r--r-- | src/compiler.rs                        |  10
-rw-r--r-- | src/construct/blank_line.rs            |  13
-rw-r--r-- | src/construct/code_fenced.rs           | 169
-rw-r--r-- | src/construct/code_indented.rs         |  96
-rw-r--r-- | src/construct/definition.rs            |  96
-rw-r--r-- | src/construct/heading_atx.rs           |  52
-rw-r--r-- | src/construct/heading_setext.rs        |  45
-rw-r--r-- | src/construct/html_flow.rs             |   7
-rw-r--r-- | src/construct/html_text.rs             |   9
-rw-r--r-- | src/construct/mod.rs                   |   2
-rw-r--r-- | src/construct/paragraph.rs             |  68
-rw-r--r-- | src/construct/partial_destination.rs   |   2
-rw-r--r-- | src/construct/partial_space_or_tab.rs  |  98
-rw-r--r-- | src/construct/partial_title.rs         |   7
-rw-r--r-- | src/construct/partial_whitespace.rs    |  64
-rw-r--r-- | src/construct/thematic_break.rs        |  50
-rw-r--r-- | src/content/flow.rs                    |  45
-rw-r--r-- | src/tokenizer.rs                       |  32
18 files changed, 324 insertions, 541 deletions
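The patch below replaces the ad-hoc `partial_whitespace` helper (a bare state function taking a `TokenType`, driven through `tokenizer.attempt`) with a `partial_space_or_tab` module whose constructors return boxed state functions, so each construct hands its optional or bounded indent to `tokenizer.go`. A minimal sketch of that calling convention, using only signatures that appear in this diff; `next` is a hypothetical stand-in for whichever state a construct continues in and is not part of the patch:

```rust
use crate::constant::TAB_SIZE;
use crate::construct::partial_space_or_tab::{
    space_or_tab, space_or_tab_min_max, space_or_tab_opt,
};
use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};

/// Stand-in for the state a construct continues in after the indent.
fn next(_tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
    (State::Ok, Some(vec![code]))
}

/// Zero or more spaces or tabs, then continue in `next`
/// (the common `space_or_tab_opt()` case).
fn optional_prefix(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
    tokenizer.go(space_or_tab_opt(), next)(tokenizer, code)
}

/// At most `TAB_SIZE - 1` spaces or tabs, as used before a closing code fence.
fn bounded_prefix(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
    tokenizer.go(space_or_tab_min_max(0, TAB_SIZE - 1), next)(tokenizer, code)
}

/// One or more spaces or tabs, emitted with a specific token type, as used
/// between an ATX heading sequence and its text.
fn typed_prefix(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
    tokenizer.go(
        space_or_tab(TokenType::HeadingAtxWhitespace, 1, usize::MAX),
        next,
    )(tokenizer, code)
}
```

With the shared helper in place, the per-construct whitespace token types (`BlankLineWhitespace`, `CodeFencedFenceWhitespace`, `CodeIndentedPrefixWhitespace`, `ThematicBreakWhitespace`) become redundant and are dropped from the compiler and tokenizer.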
diff --git a/src/compiler.rs b/src/compiler.rs index 59fcd22..366dcd9 100644 --- a/src/compiler.rs +++ b/src/compiler.rs @@ -108,7 +108,6 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St | TokenType::AutolinkMarker | TokenType::AutolinkProtocol | TokenType::BlankLineEnding - | TokenType::BlankLineWhitespace | TokenType::CharacterEscape | TokenType::CharacterEscapeMarker | TokenType::CharacterEscapeValue @@ -118,10 +117,8 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St | TokenType::CharacterReferenceMarkerNumeric | TokenType::CharacterReferenceMarkerSemi | TokenType::CharacterReferenceValue - | TokenType::CodeIndentedPrefixWhitespace | TokenType::CodeFencedFence | TokenType::CodeFencedFenceSequence - | TokenType::CodeFencedFenceWhitespace | TokenType::CodeFlowChunk | TokenType::CodeTextData | TokenType::CodeTextLineEnding @@ -153,7 +150,6 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St | TokenType::LineEnding | TokenType::ThematicBreak | TokenType::ThematicBreakSequence - | TokenType::ThematicBreakWhitespace | TokenType::Whitespace => { // Ignore. } @@ -172,7 +168,7 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St TokenType::CodeFenced => { code_flow_seen_data = Some(false); line_ending_if_needed(buffers); - // Note: no `>`, which is added later. + // Note that no `>` is used, which is added later. buf_tail_mut(buffers).push("<pre><code".to_string()); code_fenced_fences_count = Some(0); } @@ -203,14 +199,11 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St TokenType::Autolink | TokenType::AutolinkMarker | TokenType::BlankLineEnding - | TokenType::BlankLineWhitespace | TokenType::CharacterEscape | TokenType::CharacterEscapeMarker | TokenType::CharacterReference | TokenType::CharacterReferenceMarkerSemi | TokenType::CodeFencedFenceSequence - | TokenType::CodeFencedFenceWhitespace - | TokenType::CodeIndentedPrefixWhitespace | TokenType::CodeTextSequence | TokenType::DefinitionLabel | TokenType::DefinitionLabelMarker @@ -228,7 +221,6 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St | TokenType::HardBreakTrailingSpace | TokenType::HeadingSetext | TokenType::ThematicBreakSequence - | TokenType::ThematicBreakWhitespace | TokenType::Whitespace => { // Ignore. } diff --git a/src/construct/blank_line.rs b/src/construct/blank_line.rs index fdb1ee0..86091d9 100644 --- a/src/construct/blank_line.rs +++ b/src/construct/blank_line.rs @@ -29,27 +29,24 @@ //! //! <!-- To do: link `list` --> -use crate::construct::partial_whitespace::start as whitespace; -use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; +use crate::construct::partial_space_or_tab::space_or_tab_opt; +use crate::tokenizer::{Code, State, StateFnResult, Tokenizer}; /// Start of a blank line. /// -/// Note: `β ` represents a space character. +/// > π **Note**: `β ` represents a space character. /// /// ```markdown /// |β β /// | /// ``` pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { - tokenizer.attempt( - |tokenizer, code| whitespace(tokenizer, code, TokenType::BlankLineWhitespace), - |_ok| Box::new(after), - )(tokenizer, code) + tokenizer.go(space_or_tab_opt(), after)(tokenizer, code) } /// After zero or more spaces or tabs, before a line ending or EOF. /// -/// Note: `β ` represents a space character. +/// > π **Note**: `β ` represents a space character. 
/// /// ```markdown /// |β β diff --git a/src/construct/code_fenced.rs b/src/construct/code_fenced.rs index ba76aa8..30ec911 100644 --- a/src/construct/code_fenced.rs +++ b/src/construct/code_fenced.rs @@ -91,7 +91,7 @@ //! [html-code]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-code-element use crate::constant::{CODE_FENCED_SEQUENCE_SIZE_MIN, TAB_SIZE}; -use crate::construct::partial_whitespace::start as whitespace; +use crate::construct::partial_space_or_tab::{space_or_tab_min_max, space_or_tab_opt}; use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; use crate::util::span::from_exit_event; @@ -130,10 +130,7 @@ struct Info { pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { tokenizer.enter(TokenType::CodeFenced); tokenizer.enter(TokenType::CodeFencedFence); - tokenizer.attempt( - |tokenizer, code| whitespace(tokenizer, code, TokenType::Whitespace), - |_ok| Box::new(before_sequence_open), - )(tokenizer, code) + tokenizer.go(space_or_tab_opt(), before_sequence_open)(tokenizer, code) } /// Inside the opening fence, after an optional prefix, before a sequence. @@ -159,6 +156,7 @@ fn before_sequence_open(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult tokenizer.enter(TokenType::CodeFencedFenceSequence); sequence_open( tokenizer, + code, Info { prefix, size: 0, @@ -168,7 +166,6 @@ fn before_sequence_open(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult Kind::Tilde }, }, - code, ) } _ => (State::Nok, None), @@ -182,7 +179,7 @@ fn before_sequence_open(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult /// console.log(1); /// ~~~ /// ``` -fn sequence_open(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { +fn sequence_open(tokenizer: &mut Tokenizer, code: Code, info: Info) -> StateFnResult { let marker = if info.kind == Kind::GraveAccent { '`' } else { @@ -193,26 +190,18 @@ fn sequence_open(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnRe Code::Char(char) if char == marker => { tokenizer.consume(code); ( - State::Fn(Box::new(|tokenizer, code| { + State::Fn(Box::new(|t, c| { let mut info = info; info.size += 1; - sequence_open(tokenizer, info, code) + sequence_open(t, c, info) })), None, ) } + _ if info.size < CODE_FENCED_SEQUENCE_SIZE_MIN => (State::Nok, None), _ => { - if info.size < CODE_FENCED_SEQUENCE_SIZE_MIN { - (State::Nok, None) - } else { - tokenizer.exit(TokenType::CodeFencedFenceSequence); - tokenizer.attempt( - |tokenizer, code| { - whitespace(tokenizer, code, TokenType::CodeFencedFenceWhitespace) - }, - |_ok| Box::new(|tokenizer, code| info_before(tokenizer, info, code)), - )(tokenizer, code) - } + tokenizer.exit(TokenType::CodeFencedFenceSequence); + tokenizer.go(space_or_tab_opt(), |t, c| info_before(t, c, info))(tokenizer, code) } } } @@ -224,16 +213,16 @@ fn sequence_open(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnRe /// console.log(1); /// ~~~ /// ``` -fn info_before(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { +fn info_before(tokenizer: &mut Tokenizer, code: Code, info: Info) -> StateFnResult { match code { Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { tokenizer.exit(TokenType::CodeFencedFence); - at_break(tokenizer, info, code) + at_break(tokenizer, code, info) } _ => { tokenizer.enter(TokenType::CodeFencedFenceInfo); tokenizer.enter(TokenType::ChunkString); - info_inside(tokenizer, info, code, vec![]) + info_inside(tokenizer, code, info, vec![]) } } } @@ -247,8 +236,8 @@ fn 
info_before(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResu /// ``` fn info_inside( tokenizer: &mut Tokenizer, - info: Info, code: Code, + info: Info, codes: Vec<Code>, ) -> StateFnResult { match code { @@ -256,15 +245,12 @@ fn info_inside( tokenizer.exit(TokenType::ChunkString); tokenizer.exit(TokenType::CodeFencedFenceInfo); tokenizer.exit(TokenType::CodeFencedFence); - at_break(tokenizer, info, code) + at_break(tokenizer, code, info) } Code::VirtualSpace | Code::Char('\t' | ' ') => { tokenizer.exit(TokenType::ChunkString); tokenizer.exit(TokenType::CodeFencedFenceInfo); - tokenizer.attempt( - |tokenizer, code| whitespace(tokenizer, code, TokenType::CodeFencedFenceWhitespace), - |_ok| Box::new(|tokenizer, code| meta_before(tokenizer, info, code)), - )(tokenizer, code) + tokenizer.go(space_or_tab_opt(), |t, c| meta_before(t, c, info))(tokenizer, code) } Code::Char(char) if char == '`' && info.kind == Kind::GraveAccent => (State::Nok, None), Code::Char(_) => { @@ -272,9 +258,7 @@ fn info_inside( codes.push(code); tokenizer.consume(code); ( - State::Fn(Box::new(|tokenizer, code| { - info_inside(tokenizer, info, code, codes) - })), + State::Fn(Box::new(|t, c| info_inside(t, c, info, codes))), None, ) } @@ -288,16 +272,16 @@ fn info_inside( /// console.log(1); /// ~~~ /// ``` -fn meta_before(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { +fn meta_before(tokenizer: &mut Tokenizer, code: Code, info: Info) -> StateFnResult { match code { Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { tokenizer.exit(TokenType::CodeFencedFence); - at_break(tokenizer, info, code) + at_break(tokenizer, code, info) } _ => { tokenizer.enter(TokenType::CodeFencedFenceMeta); tokenizer.enter(TokenType::ChunkString); - meta(tokenizer, info, code) + meta(tokenizer, code, info) } } } @@ -309,21 +293,18 @@ fn meta_before(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResu /// console.log(1); /// ~~~ /// ``` -fn meta(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { +fn meta(tokenizer: &mut Tokenizer, code: Code, info: Info) -> StateFnResult { match code { Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { tokenizer.exit(TokenType::ChunkString); tokenizer.exit(TokenType::CodeFencedFenceMeta); tokenizer.exit(TokenType::CodeFencedFence); - at_break(tokenizer, info, code) + at_break(tokenizer, code, info) } Code::Char(char) if char == '`' && info.kind == Kind::GraveAccent => (State::Nok, None), _ => { tokenizer.consume(code); - ( - State::Fn(Box::new(|tokenizer, code| meta(tokenizer, info, code))), - None, - ) + (State::Fn(Box::new(|t, c| meta(t, c, info))), None) } } } @@ -335,7 +316,7 @@ fn meta(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { /// aa| /// ~~~ /// ``` -fn at_break(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { +fn at_break(tokenizer: &mut Tokenizer, code: Code, info: Info) -> StateFnResult { let clone = info.clone(); match code { @@ -345,12 +326,7 @@ fn at_break(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult tokenizer.enter(TokenType::LineEnding); tokenizer.consume(code); tokenizer.exit(TokenType::LineEnding); - ( - State::Fn(Box::new(|tokenizer, code| { - close_before(tokenizer, info, code) - })), - None, - ) + (State::Fn(Box::new(|t, c| close_start(t, c, info))), None) }, |ok| { if ok { @@ -360,12 +336,7 @@ fn at_break(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult tokenizer.enter(TokenType::LineEnding); 
tokenizer.consume(code); tokenizer.exit(TokenType::LineEnding); - ( - State::Fn(Box::new(|tokenizer, code| { - content_start(tokenizer, clone, code) - })), - None, - ) + (State::Fn(Box::new(|t, c| content_start(t, c, clone))), None) }) } }, @@ -385,12 +356,11 @@ fn at_break(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult /// console.log('1') /// | ~~~ /// ``` -fn close_before(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { +fn close_start(tokenizer: &mut Tokenizer, code: Code, info: Info) -> StateFnResult { tokenizer.enter(TokenType::CodeFencedFence); - tokenizer.attempt( - |tokenizer, code| whitespace(tokenizer, code, TokenType::Whitespace), - |_ok| Box::new(|tokenizer, code| close_sequence_before(tokenizer, info, code)), - )(tokenizer, code) + tokenizer.go(space_or_tab_min_max(0, TAB_SIZE - 1), |t, c| { + close_before(t, c, info) + })(tokenizer, code) } /// In a closing fence, after optional whitespace, before sequence. @@ -404,31 +374,17 @@ fn close_before(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnRes /// console.log('1') /// |~~~ /// ``` -fn close_sequence_before(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { - let tail = tokenizer.events.last(); - let mut prefix = 0; +fn close_before(tokenizer: &mut Tokenizer, code: Code, info: Info) -> StateFnResult { let marker = if info.kind == Kind::GraveAccent { '`' } else { '~' }; - if let Some(event) = tail { - if event.token_type == TokenType::Whitespace { - let span = from_exit_event(&tokenizer.events, tokenizer.events.len() - 1); - prefix = span.end_index - span.start_index; - } - } - - // To do: 4+ should be okay if code (indented) is turned off! - if prefix >= TAB_SIZE { - return (State::Nok, None); - } - match code { Code::Char(char) if char == marker => { tokenizer.enter(TokenType::CodeFencedFenceSequence); - close_sequence(tokenizer, info, code, 0) + close_sequence(tokenizer, code, info, 0) } _ => (State::Nok, None), } @@ -441,7 +397,7 @@ fn close_sequence_before(tokenizer: &mut Tokenizer, info: Info, code: Code) -> S /// console.log('1') /// ~|~~ /// ``` -fn close_sequence(tokenizer: &mut Tokenizer, info: Info, code: Code, size: usize) -> StateFnResult { +fn close_sequence(tokenizer: &mut Tokenizer, code: Code, info: Info, size: usize) -> StateFnResult { let marker = if info.kind == Kind::GraveAccent { '`' } else { @@ -452,18 +408,13 @@ fn close_sequence(tokenizer: &mut Tokenizer, info: Info, code: Code, size: usize Code::Char(char) if char == marker => { tokenizer.consume(code); ( - State::Fn(Box::new(move |tokenizer, code| { - close_sequence(tokenizer, info, code, size + 1) - })), + State::Fn(Box::new(move |t, c| close_sequence(t, c, info, size + 1))), None, ) } _ if size >= CODE_FENCED_SEQUENCE_SIZE_MIN && size >= info.size => { tokenizer.exit(TokenType::CodeFencedFenceSequence); - tokenizer.attempt( - |tokenizer, code| whitespace(tokenizer, code, TokenType::CodeFencedFenceWhitespace), - |_ok| Box::new(close_whitespace_after), - )(tokenizer, code) + tokenizer.go(space_or_tab_opt(), close_sequence_after)(tokenizer, code) } _ => (State::Nok, None), } @@ -476,7 +427,7 @@ fn close_sequence(tokenizer: &mut Tokenizer, info: Info, code: Code, size: usize /// console.log('1') /// ~~~ | /// ``` -fn close_whitespace_after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { +fn close_sequence_after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { match code { Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { 
tokenizer.exit(TokenType::CodeFencedFence); @@ -493,53 +444,27 @@ fn close_whitespace_after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResul /// |aa /// ~~~ /// ``` -fn content_start(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { - match code { - Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { - at_break(tokenizer, info, code) - } - Code::VirtualSpace | Code::Char('\t' | ' ') if info.prefix > 0 => { - tokenizer.enter(TokenType::Whitespace); - content_prefix(tokenizer, info, 0, code) - } - _ => { - tokenizer.enter(TokenType::CodeFlowChunk); - content_continue(tokenizer, info, code) - } - } +fn content_start(tokenizer: &mut Tokenizer, code: Code, info: Info) -> StateFnResult { + tokenizer.go(space_or_tab_min_max(0, info.prefix), |t, c| { + content_begin(t, c, info) + })(tokenizer, code) } -/// Before code content, in a prefix. +/// Before code content, after a prefix. /// /// ```markdown /// ~~~js /// | aa /// ~~~ /// ``` -fn content_prefix( - tokenizer: &mut Tokenizer, - info: Info, - prefix: usize, - code: Code, -) -> StateFnResult { +fn content_begin(tokenizer: &mut Tokenizer, code: Code, info: Info) -> StateFnResult { match code { - Code::VirtualSpace | Code::Char('\t' | ' ') if info.prefix > prefix => { - tokenizer.consume(code); - ( - State::Fn(Box::new(move |tokenizer, code| { - content_prefix(tokenizer, info, prefix + 1, code) - })), - None, - ) - } Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { - tokenizer.exit(TokenType::Whitespace); - at_break(tokenizer, info, code) + at_break(tokenizer, code, info) } _ => { - tokenizer.exit(TokenType::Whitespace); tokenizer.enter(TokenType::CodeFlowChunk); - content_continue(tokenizer, info, code) + content_continue(tokenizer, code, info) } } } @@ -553,18 +478,16 @@ fn content_prefix( /// ab| /// ~~~ /// ``` -fn content_continue(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { +fn content_continue(tokenizer: &mut Tokenizer, code: Code, info: Info) -> StateFnResult { match code { Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { tokenizer.exit(TokenType::CodeFlowChunk); - at_break(tokenizer, info, code) + at_break(tokenizer, code, info) } _ => { tokenizer.consume(code); ( - State::Fn(Box::new(|tokenizer, code| { - content_continue(tokenizer, info, code) - })), + State::Fn(Box::new(|t, c| content_continue(t, c, info))), None, ) } diff --git a/src/construct/code_indented.rs b/src/construct/code_indented.rs index 55b8901..64956be 100644 --- a/src/construct/code_indented.rs +++ b/src/construct/code_indented.rs @@ -38,6 +38,7 @@ //! [html-pre]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-pre-element //! [html-code]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-code-element +use super::partial_space_or_tab::{space_or_tab_min_max, space_or_tab_opt}; use crate::constant::TAB_SIZE; use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; @@ -46,46 +47,13 @@ use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; /// ```markdown /// | asd /// ``` -pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { - match code { - Code::VirtualSpace | Code::Char(' ' | '\t') => { - tokenizer.enter(TokenType::CodeIndented); - tokenizer.enter(TokenType::CodeIndentedPrefixWhitespace); - indent(tokenizer, code, 0) - } - _ => (State::Nok, None), - } -} - -/// Inside the initial whitespace. 
-/// -/// ```markdown -/// | asd -/// | asd -/// | asd -/// |asd -/// ``` /// /// > **Parsing note**: it is not needed to check if this first line is a /// > filled line (that it has a non-whitespace character), because blank lines /// > are parsed already, so we never run into that. -fn indent(tokenizer: &mut Tokenizer, code: Code, size: usize) -> StateFnResult { - match code { - _ if size == TAB_SIZE => { - tokenizer.exit(TokenType::CodeIndentedPrefixWhitespace); - at_break(tokenizer, code) - } - Code::VirtualSpace | Code::Char(' ' | '\t') => { - tokenizer.consume(code); - ( - State::Fn(Box::new(move |tokenizer, code| { - indent(tokenizer, code, size + 1) - })), - None, - ) - } - _ => (State::Nok, None), - } +pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + tokenizer.enter(TokenType::CodeIndented); + tokenizer.go(space_or_tab_min_max(TAB_SIZE, TAB_SIZE), at_break)(tokenizer, code) } /// At a break. @@ -153,39 +121,45 @@ fn further_start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { tokenizer.exit(TokenType::LineEnding); (State::Fn(Box::new(further_start)), None) } - Code::VirtualSpace | Code::Char(' ' | '\t') => { - tokenizer.enter(TokenType::CodeIndentedPrefixWhitespace); - further_indent(tokenizer, code, 0) - } - _ => (State::Nok, None), + _ => tokenizer.attempt(space_or_tab_min_max(TAB_SIZE, TAB_SIZE), |ok| { + Box::new(if ok { further_end } else { further_begin }) + })(tokenizer, code), } } -/// Inside further whitespace. +/// After a proper indent. /// /// ```markdown /// asd -/// | asd +/// |asd /// ``` -fn further_indent(tokenizer: &mut Tokenizer, code: Code, size: usize) -> StateFnResult { +fn further_end(_tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + (State::Ok, Some(vec![code])) +} + +/// At the beginning of a line that is not indented enough. +/// +/// > π **Note**: `β ` represents a space character. +/// +/// ```markdown +/// asd +/// |β β +/// asd +/// ``` +fn further_begin(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + tokenizer.go(space_or_tab_opt(), further_after)(tokenizer, code) +} + +/// After whitespace. 
+/// +/// ```markdown +/// asd +/// β β | +/// asd +/// ``` +fn further_after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { match code { - _ if size == TAB_SIZE => { - tokenizer.exit(TokenType::CodeIndentedPrefixWhitespace); - (State::Ok, Some(vec![code])) - } - Code::VirtualSpace | Code::Char(' ' | '\t') => { - tokenizer.consume(code); - ( - State::Fn(Box::new(move |tokenizer, code| { - further_indent(tokenizer, code, size + 1) - })), - None, - ) - } - Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { - tokenizer.exit(TokenType::CodeIndentedPrefixWhitespace); - further_start(tokenizer, code) - } + Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => further_start(tokenizer, code), _ => (State::Nok, None), } } diff --git a/src/construct/definition.rs b/src/construct/definition.rs index f7f8acd..03baee6 100644 --- a/src/construct/definition.rs +++ b/src/construct/definition.rs @@ -58,7 +58,7 @@ use crate::construct::{ partial_destination::start as destination, partial_label::start as label, - partial_title::start as title, partial_whitespace::start as whitespace, + partial_space_or_tab::space_or_tab_opt, partial_title::start as title, }; use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; @@ -68,11 +68,18 @@ use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; /// |[a]: b "c" /// ``` pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + tokenizer.enter(TokenType::Definition); + tokenizer.go(space_or_tab_opt(), before)(tokenizer, code) +} + +/// At the start of a definition, after whitespace. +/// +/// ```markdown +/// |[a]: b "c" +/// ``` +pub fn before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { match code { - Code::Char('[') => { - tokenizer.enter(TokenType::Definition); - tokenizer.go(label, label_after)(tokenizer, code) - } + Code::Char('[') => tokenizer.go(label, label_after)(tokenizer, code), _ => (State::Nok, None), } } @@ -93,27 +100,15 @@ fn label_after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { tokenizer.enter(TokenType::DefinitionMarker); tokenizer.consume(code); tokenizer.exit(TokenType::DefinitionMarker); - (State::Fn(Box::new(marker_after)), None) + ( + State::Fn(Box::new(tokenizer.go(space_or_tab_opt(), marker_after))), + None, + ) } _ => (State::Nok, None), } } -/// After the marker of a definition. -/// -/// ```markdown -/// [a]:| b "c" -/// -/// [a]:| β -/// b "c" -/// ``` -fn marker_after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { - tokenizer.attempt( - |t, c| whitespace(t, c, TokenType::Whitespace), - |_ok| Box::new(marker_after_optional_whitespace), - )(tokenizer, code) -} - /// After the marker, after whitespace. /// /// ```markdown @@ -122,31 +117,23 @@ fn marker_after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { /// [a]: |β /// b "c" /// ``` -fn marker_after_optional_whitespace(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { +fn marker_after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { match code { Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => { tokenizer.enter(TokenType::LineEnding); tokenizer.consume(code); tokenizer.exit(TokenType::LineEnding); - (State::Fn(Box::new(marker_after_optional_line_ending)), None) + ( + State::Fn(Box::new( + tokenizer.go(space_or_tab_opt(), destination_before), + )), + None, + ) } _ => destination_before(tokenizer, code), } } -/// After the marker, after a line ending. 
-/// -/// ```markdown -/// [a]: -/// | b "c" -/// ``` -fn marker_after_optional_line_ending(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { - tokenizer.attempt( - |t, c| whitespace(t, c, TokenType::Whitespace), - |_ok| Box::new(destination_before), - )(tokenizer, code) -} - /// Before a destination. /// /// ```markdown @@ -163,8 +150,9 @@ fn destination_before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { Code::None | Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') ); - if !char_nok - && (event.token_type == TokenType::LineEnding || event.token_type == TokenType::Whitespace) + // Whitespace. + if (event.token_type == TokenType::LineEnding || event.token_type == TokenType::Whitespace) + && !char_nok { tokenizer.go(destination, destination_after)(tokenizer, code) } else { @@ -191,10 +179,7 @@ fn destination_after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { /// [a]: b "c"| /// ``` fn after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { - tokenizer.attempt( - |t, c| whitespace(t, c, TokenType::Whitespace), - |_ok| Box::new(after_whitespace), - )(tokenizer, code) + tokenizer.go(space_or_tab_opt(), after_whitespace)(tokenizer, code) } /// After a definition, after optional whitespace. @@ -222,10 +207,7 @@ fn after_whitespace(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { /// "c" /// ``` fn title_before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { - tokenizer.attempt( - |t, c| whitespace(t, c, TokenType::Whitespace), - |_ok| Box::new(title_before_after_optional_whitespace), - )(tokenizer, code) + tokenizer.go(space_or_tab_opt(), title_before_after_optional_whitespace)(tokenizer, code) } /// Before a title, after optional whitespace. @@ -243,7 +225,9 @@ fn title_before_after_optional_whitespace(tokenizer: &mut Tokenizer, code: Code) tokenizer.consume(code); tokenizer.exit(TokenType::LineEnding); ( - State::Fn(Box::new(title_before_after_optional_line_ending)), + State::Fn(Box::new( + tokenizer.go(space_or_tab_opt(), title_before_marker), + )), None, ) } @@ -257,19 +241,6 @@ fn title_before_after_optional_whitespace(tokenizer: &mut Tokenizer, code: Code) /// [a]: bβ /// | "c" /// ``` -fn title_before_after_optional_line_ending(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { - tokenizer.attempt( - |t, c| whitespace(t, c, TokenType::Whitespace), - |_ok| Box::new(title_before_marker), - )(tokenizer, code) -} - -/// Before a title, after a line ending. -/// -/// ```markdown -/// [a]: bβ -/// | "c" -/// ``` fn title_before_marker(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { let event = tokenizer.events.last().unwrap(); @@ -289,10 +260,7 @@ fn title_before_marker(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { /// "c"| /// ``` fn title_after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { - tokenizer.attempt( - |t, c| whitespace(t, c, TokenType::Whitespace), - |_ok| Box::new(title_after_after_optional_whitespace), - )(tokenizer, code) + tokenizer.go(space_or_tab_opt(), title_after_after_optional_whitespace)(tokenizer, code) } /// After a title, after optional whitespace. diff --git a/src/construct/heading_atx.rs b/src/construct/heading_atx.rs index ab8b6a5..12d4193 100644 --- a/src/construct/heading_atx.rs +++ b/src/construct/heading_atx.rs @@ -47,6 +47,7 @@ //! [wiki-setext]: https://en.wikipedia.org/wiki/Setext //! 
[atx]: http://www.aaronsw.com/2002/atx/ +use super::partial_space_or_tab::{space_or_tab, space_or_tab_opt}; use crate::constant::HEADING_ATX_OPENING_FENCE_SIZE_MAX; use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; @@ -56,8 +57,17 @@ use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; /// |## alpha /// ``` pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + tokenizer.enter(TokenType::HeadingAtx); + tokenizer.go(space_or_tab_opt(), before)(tokenizer, code) +} + +/// Start of a heading (atx), after whitespace. +/// +/// ```markdown +/// |## alpha +/// ``` +pub fn before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { if Code::Char('#') == code { - tokenizer.enter(TokenType::HeadingAtx); tokenizer.enter(TokenType::HeadingAtxSequence); sequence_open(tokenizer, code, 0) } else { @@ -72,12 +82,7 @@ pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { /// ``` fn sequence_open(tokenizer: &mut Tokenizer, code: Code, rank: usize) -> StateFnResult { match code { - Code::None - | Code::CarriageReturnLineFeed - | Code::VirtualSpace - | Code::Char('\t' | '\n' | '\r' | ' ') - if rank > 0 => - { + Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') if rank > 0 => { tokenizer.exit(TokenType::HeadingAtxSequence); at_break(tokenizer, code) } @@ -90,6 +95,13 @@ fn sequence_open(tokenizer: &mut Tokenizer, code: Code, rank: usize) -> StateFnR None, ) } + _ if rank > 0 => { + tokenizer.exit(TokenType::HeadingAtxSequence); + tokenizer.go( + space_or_tab(TokenType::HeadingAtxWhitespace, 1, usize::MAX), + at_break, + )(tokenizer, code) + } _ => (State::Nok, None), } } @@ -109,10 +121,10 @@ fn at_break(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { tokenizer.exit(TokenType::HeadingAtx); (State::Ok, Some(vec![code])) } - Code::VirtualSpace | Code::Char('\t' | ' ') => { - tokenizer.enter(TokenType::HeadingAtxWhitespace); - whitespace(tokenizer, code) - } + Code::VirtualSpace | Code::Char('\t' | ' ') => tokenizer.go( + space_or_tab(TokenType::HeadingAtxWhitespace, 1, usize::MAX), + at_break, + )(tokenizer, code), Code::Char('#') => { tokenizer.enter(TokenType::HeadingAtxSequence); further_sequence(tokenizer, code) @@ -141,24 +153,6 @@ fn further_sequence(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { } } -/// In whitespace. -/// -/// ```markdown -/// ## alpha | bravo -/// ``` -fn whitespace(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { - match code { - Code::VirtualSpace | Code::Char('\t' | ' ') => { - tokenizer.consume(code); - (State::Fn(Box::new(whitespace)), None) - } - _ => { - tokenizer.exit(TokenType::HeadingAtxWhitespace); - at_break(tokenizer, code) - } - } -} - /// In text. /// /// ```markdown diff --git a/src/construct/heading_setext.rs b/src/construct/heading_setext.rs index f4c6001..64647cb 100644 --- a/src/construct/heading_setext.rs +++ b/src/construct/heading_setext.rs @@ -50,7 +50,7 @@ //! 
[atx]: http://www.aaronsw.com/2002/atx/ use crate::constant::TAB_SIZE; -use crate::construct::partial_whitespace::start as whitespace; +use crate::construct::partial_space_or_tab::space_or_tab_opt; use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; use crate::util::span::from_exit_event; @@ -70,12 +70,22 @@ pub enum Kind { /// == /// ``` pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + tokenizer.enter(TokenType::HeadingSetext); + tokenizer.go(space_or_tab_opt(), before)(tokenizer, code) +} + +/// Start of a heading (setext), after whitespace. +/// +/// ```markdown +/// |alpha +/// == +/// ``` +pub fn before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { match code { Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { unreachable!("expected non-eol/eof"); } _ => { - tokenizer.enter(TokenType::HeadingSetext); tokenizer.enter(TokenType::HeadingSetextText); tokenizer.enter(TokenType::ChunkText); text_inside(tokenizer, code) @@ -134,10 +144,7 @@ fn text_continue(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { tokenizer.events[next].previous = Some(previous); ( - State::Fn(Box::new(tokenizer.attempt( - |tokenizer, code| whitespace(tokenizer, code, TokenType::Whitespace), - |_ok| Box::new(text_line_start), - ))), + State::Fn(Box::new(tokenizer.go(space_or_tab_opt(), text_line_start))), None, ) } @@ -202,25 +209,17 @@ fn underline_before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { tokenizer.enter(TokenType::LineEnding); tokenizer.consume(code); tokenizer.exit(TokenType::LineEnding); - (State::Fn(Box::new(underline_start)), None) + ( + State::Fn(Box::new( + tokenizer.go(space_or_tab_opt(), underline_sequence_start), + )), + None, + ) } _ => unreachable!("expected eol"), } } -/// After a line ending, presumably an underline. -/// -/// ```markdown -/// alpha -/// |== -/// ``` -fn underline_start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { - tokenizer.attempt( - |tokenizer, code| whitespace(tokenizer, code, TokenType::Whitespace), - |_ok| Box::new(underline_sequence_start), - )(tokenizer, code) -} - /// After optional whitespace, presumably an underline. /// /// ```markdown @@ -276,11 +275,7 @@ fn underline_sequence_inside(tokenizer: &mut Tokenizer, code: Code, kind: Kind) None, ) } - Code::VirtualSpace | Code::Char('\t' | ' ') => tokenizer.attempt( - |tokenizer, code| whitespace(tokenizer, code, TokenType::Whitespace), - |_ok| Box::new(underline_after), - )(tokenizer, code), - _ => underline_after(tokenizer, code), + _ => tokenizer.go(space_or_tab_opt(), underline_after)(tokenizer, code), } } diff --git a/src/construct/html_flow.rs b/src/construct/html_flow.rs index 5adac7d..4819e63 100644 --- a/src/construct/html_flow.rs +++ b/src/construct/html_flow.rs @@ -93,7 +93,7 @@ //! [html-parsing]: https://html.spec.whatwg.org/multipage/parsing.html#parsing use crate::constant::{HTML_BLOCK_NAMES, HTML_RAW_NAMES, HTML_RAW_SIZE_MAX}; -use crate::construct::{blank_line::start as blank_line, partial_whitespace::start as whitespace}; +use crate::construct::{blank_line::start as blank_line, partial_space_or_tab::space_or_tab_opt}; use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; /// Kind of HTML (flow). 
@@ -155,10 +155,7 @@ struct Info { pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { tokenizer.enter(TokenType::HtmlFlow); tokenizer.enter(TokenType::HtmlFlowData); - tokenizer.attempt( - |tokenizer, code| whitespace(tokenizer, code, TokenType::Whitespace), - |_ok| Box::new(before), - )(tokenizer, code) + tokenizer.go(space_or_tab_opt(), before)(tokenizer, code) } /// After optional whitespace, before `<`. diff --git a/src/construct/html_text.rs b/src/construct/html_text.rs index 93b4b62..a91113f 100644 --- a/src/construct/html_text.rs +++ b/src/construct/html_text.rs @@ -49,7 +49,7 @@ //! [html_flow]: crate::construct::html_flow //! [html-parsing]: https://html.spec.whatwg.org/multipage/parsing.html#parsing -use crate::construct::partial_whitespace::start as whitespace; +use crate::construct::partial_space_or_tab::space_or_tab_opt; use crate::tokenizer::{Code, State, StateFn, StateFnResult, TokenType, Tokenizer}; /// Start of HTML (text) @@ -673,10 +673,9 @@ fn after_line_ending( code: Code, return_state: Box<StateFn>, ) -> StateFnResult { - tokenizer.attempt( - |tokenizer, code| whitespace(tokenizer, code, TokenType::Whitespace), - |_ok| Box::new(|t, c| after_line_ending_prefix(t, c, return_state)), - )(tokenizer, code) + tokenizer.go(space_or_tab_opt(), |t, c| { + after_line_ending_prefix(t, c, return_state) + })(tokenizer, code) } /// After a line ending, after indent. diff --git a/src/construct/mod.rs b/src/construct/mod.rs index 1debb74..407dc6b 100644 --- a/src/construct/mod.rs +++ b/src/construct/mod.rs @@ -70,6 +70,6 @@ pub mod html_text; pub mod paragraph; pub mod partial_destination; pub mod partial_label; +pub mod partial_space_or_tab; pub mod partial_title; -pub mod partial_whitespace; pub mod thematic_break; diff --git a/src/construct/paragraph.rs b/src/construct/paragraph.rs index 50ef627..fa18f28 100644 --- a/src/construct/paragraph.rs +++ b/src/construct/paragraph.rs @@ -30,12 +30,11 @@ use crate::constant::TAB_SIZE; use crate::construct::{ - code_fenced::start as code_fenced, heading_atx::start as heading_atx, - html_flow::start as html_flow, partial_whitespace::start as whitespace, - thematic_break::start as thematic_break, + blank_line::start as blank_line, code_fenced::start as code_fenced, + heading_atx::start as heading_atx, html_flow::start as html_flow, + partial_space_or_tab::space_or_tab_min_max, thematic_break::start as thematic_break, }; use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; -use crate::util::span::from_exit_event; /// Before a paragraph. /// @@ -114,7 +113,7 @@ fn interrupt(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { tokenizer.enter(TokenType::LineEnding); tokenizer.consume(code); tokenizer.exit(TokenType::LineEnding); - (State::Fn(Box::new(interrupt_initial)), None) + (State::Fn(Box::new(interrupt_start)), None) } _ => unreachable!("expected eol"), } @@ -123,55 +122,30 @@ fn interrupt(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { /// After a line ending. /// /// ```markdown -/// alpha| -/// ~~~js +/// alpha +/// |~~~js /// ~~~ /// ``` -fn interrupt_initial(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { - tokenizer.attempt_2(code_fenced, html_flow, |ok| { +fn interrupt_start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + // To do: If code is disabled, indented lines are allowed to interrupt. 
+ tokenizer.attempt(space_or_tab_min_max(TAB_SIZE, TAB_SIZE), |ok| { if ok { - Box::new(|_tokenizer, _code| (State::Nok, None)) + Box::new(|_t, code| (State::Ok, Some(vec![code]))) } else { Box::new(|tokenizer, code| { - tokenizer.attempt( - |tokenizer, code| whitespace(tokenizer, code, TokenType::Whitespace), - |_ok| Box::new(interrupt_start), + tokenizer.attempt_5( + blank_line, + code_fenced, + html_flow, + heading_atx, + thematic_break, + |ok| { + Box::new(move |_t, code| { + (if ok { State::Nok } else { State::Ok }, Some(vec![code])) + }) + }, )(tokenizer, code) }) } })(tokenizer, code) } - -/// After a line ending, after optional whitespace. -/// -/// ```markdown -/// alpha| -/// # bravo -/// ``` -fn interrupt_start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { - let tail = tokenizer.events.last(); - let mut prefix = 0; - - if let Some(event) = tail { - if event.token_type == TokenType::Whitespace { - let span = from_exit_event(&tokenizer.events, tokenizer.events.len() - 1); - prefix = span.end_index - span.start_index; - } - } - - match code { - // Blank lines are not allowed in paragraph. - Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => (State::Nok, None), - // To do: If code is disabled, indented lines are allowed. - _ if prefix >= TAB_SIZE => (State::Ok, None), - // To do: definitions, setext headings, etc? - _ => tokenizer.attempt_2(heading_atx, thematic_break, |ok| { - let result = if ok { - (State::Nok, None) - } else { - (State::Ok, None) - }; - Box::new(|_t, _c| result) - })(tokenizer, code), - } -} diff --git a/src/construct/partial_destination.rs b/src/construct/partial_destination.rs index 58d07c1..bc95055 100644 --- a/src/construct/partial_destination.rs +++ b/src/construct/partial_destination.rs @@ -60,7 +60,7 @@ pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { tokenizer.exit(TokenType::DefinitionDestinationLiteralMarker); (State::Fn(Box::new(enclosed_before)), None) } - Code::None | Code::CarriageReturnLineFeed | Code::VirtualSpace | Code::Char(')') => { + Code::None | Code::CarriageReturnLineFeed | Code::VirtualSpace | Code::Char(' ' | ')') => { (State::Nok, None) } Code::Char(char) if char.is_ascii_control() => (State::Nok, None), diff --git a/src/construct/partial_space_or_tab.rs b/src/construct/partial_space_or_tab.rs new file mode 100644 index 0000000..40ece49 --- /dev/null +++ b/src/construct/partial_space_or_tab.rs @@ -0,0 +1,98 @@ +//! Several helpers to parse whitespace (`space_or_tab`). +//! +//! ## References +//! +//! * [`micromark-factory-space/index.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-factory-space/dev/index.js) + +use crate::tokenizer::{Code, State, StateFn, StateFnResult, TokenType, Tokenizer}; + +/// Optional `space_or_tab` +/// +/// ```bnf +/// space_or_tab_opt ::= *( ' ' '\t' ) +/// ``` +pub fn space_or_tab_opt() -> Box<StateFn> { + space_or_tab_min_max(0, usize::MAX) +} + +/// Between `x` and `y` `space_or_tab` +/// +/// ```bnf +/// space_or_tab_min_max ::= x*y( ' ' '\t' ) +/// ``` +pub fn space_or_tab_min_max(min: usize, max: usize) -> Box<StateFn> { + space_or_tab(TokenType::Whitespace, min, max) +} + +/// Between `x` and `y` `space_or_tab`, with the given token type. +/// +/// ```bnf +/// space_or_tab ::= x*y( ' ' '\t' ) +/// ``` +pub fn space_or_tab(kind: TokenType, min: usize, max: usize) -> Box<StateFn> { + Box::new(move |t, c| start(t, c, kind, min, max)) +} + +/// Before whitespace. 
+/// +/// ```markdown +/// alpha| bravo +/// ``` +fn start( + tokenizer: &mut Tokenizer, + code: Code, + kind: TokenType, + min: usize, + max: usize, +) -> StateFnResult { + match code { + Code::VirtualSpace | Code::Char('\t' | ' ') if max > 0 => { + tokenizer.enter(kind.clone()); + tokenizer.consume(code); + ( + State::Fn(Box::new(move |tokenizer, code| { + inside(tokenizer, code, kind, min, max, 1) + })), + None, + ) + } + _ => ( + if min == 0 { State::Ok } else { State::Nok }, + Some(vec![code]), + ), + } +} + +/// In whitespace. +/// +/// ```markdown +/// alpha |bravo +/// alpha | bravo +/// ``` +fn inside( + tokenizer: &mut Tokenizer, + code: Code, + kind: TokenType, + min: usize, + max: usize, + size: usize, +) -> StateFnResult { + match code { + Code::VirtualSpace | Code::Char('\t' | ' ') if size < max => { + tokenizer.consume(code); + ( + State::Fn(Box::new(move |tokenizer, code| { + inside(tokenizer, code, kind, min, max, size + 1) + })), + None, + ) + } + _ => { + tokenizer.exit(kind); + ( + if size >= min { State::Ok } else { State::Nok }, + Some(vec![code]), + ) + } + } +} diff --git a/src/construct/partial_title.rs b/src/construct/partial_title.rs index 19ba8d4..0669c8e 100644 --- a/src/construct/partial_title.rs +++ b/src/construct/partial_title.rs @@ -33,7 +33,7 @@ // To do: pass token types in. -use crate::construct::partial_whitespace::start as whitespace; +use crate::construct::partial_space_or_tab::space_or_tab_opt; use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; /// Type of title. @@ -143,10 +143,7 @@ fn at_break(tokenizer: &mut Tokenizer, code: Code, kind: Kind) -> StateFnResult /// |b" /// ``` fn line_start(tokenizer: &mut Tokenizer, code: Code, kind: Kind) -> StateFnResult { - tokenizer.attempt( - |t, c| whitespace(t, c, TokenType::Whitespace), - |_ok| Box::new(|t, c| line_begin(t, c, kind)), - )(tokenizer, code) + tokenizer.go(space_or_tab_opt(), |t, c| line_begin(t, c, kind))(tokenizer, code) } /// After a line ending, after optional whitespace. diff --git a/src/construct/partial_whitespace.rs b/src/construct/partial_whitespace.rs deleted file mode 100644 index b8cf9a7..0000000 --- a/src/construct/partial_whitespace.rs +++ /dev/null @@ -1,64 +0,0 @@ -//! A little helper to parse `space_or_tab` -//! -//! Theyβre formed with the following BNF: -//! -//! ```bnf -//! space_or_tab ::= 1*(' ' '\t') -//! ``` -//! -//! Depending on where whitespace can occur, it can be optional (or not), -//! and present in the rendered result (or not). -//! -//! ## References -//! -//! * [`micromark-factory-space/index.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-factory-space/dev/index.js) - -use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; - -// To do: should `token_type` be a `Some`, with `None` defaulting to something? -// To do: should `max: Some(usize)` be added? - -/// Before whitespace. -/// -/// ```markdown -/// alpha| bravo -/// ``` -pub fn start(tokenizer: &mut Tokenizer, code: Code, token_type: TokenType) -> StateFnResult { - match code { - Code::VirtualSpace | Code::Char('\t' | ' ') => { - // To do: lifetimes. - let clone = token_type.clone(); - tokenizer.enter(token_type); - tokenizer.consume(code); - ( - State::Fn(Box::new(|tokenizer, code| inside(tokenizer, code, clone))), - None, - ) - } - _ => (State::Nok, None), - } -} - -/// In whitespace. 
-/// -/// ```markdown -/// alpha |bravo -/// alpha | bravo -/// ``` -fn inside(tokenizer: &mut Tokenizer, code: Code, token_type: TokenType) -> StateFnResult { - match code { - Code::VirtualSpace | Code::Char('\t' | ' ') => { - tokenizer.consume(code); - ( - State::Fn(Box::new(|tokenizer, code| { - inside(tokenizer, code, token_type) - })), - None, - ) - } - _ => { - tokenizer.exit(token_type); - (State::Ok, Some(vec![code])) - } - } -} diff --git a/src/construct/thematic_break.rs b/src/construct/thematic_break.rs index bc41991..abf733d 100644 --- a/src/construct/thematic_break.rs +++ b/src/construct/thematic_break.rs @@ -44,6 +44,7 @@ //! //! <!-- To do: link `lists` --> +use super::partial_space_or_tab::space_or_tab_opt; use crate::constant::THEMATIC_BREAK_MARKER_COUNT_MIN; use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; @@ -53,9 +54,18 @@ use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; /// |*** /// ``` pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + tokenizer.enter(TokenType::ThematicBreak); + tokenizer.go(space_or_tab_opt(), before)(tokenizer, code) +} + +/// Start of a thematic break, after whitespace. +/// +/// ```markdown +/// |*** +/// ``` +pub fn before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { match code { Code::Char(char) if char == '*' || char == '-' || char == '_' => { - tokenizer.enter(TokenType::ThematicBreak); at_break(tokenizer, code, char, 0) } _ => (State::Nok, None), @@ -71,20 +81,16 @@ pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { /// ``` fn at_break(tokenizer: &mut Tokenizer, code: Code, marker: char, size: usize) -> StateFnResult { match code { - Code::Char(char) if char == marker => { - tokenizer.enter(TokenType::ThematicBreakSequence); - sequence(tokenizer, code, marker, size) - } - Code::VirtualSpace | Code::Char('\t' | ' ') => { - tokenizer.enter(TokenType::ThematicBreakWhitespace); - whitespace(tokenizer, code, marker, size) - } Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') if size >= THEMATIC_BREAK_MARKER_COUNT_MIN => { tokenizer.exit(TokenType::ThematicBreak); (State::Ok, Some(vec![code])) } + Code::Char(char) if char == marker => { + tokenizer.enter(TokenType::ThematicBreakSequence); + sequence(tokenizer, code, marker, size) + } _ => (State::Nok, None), } } @@ -109,31 +115,9 @@ fn sequence(tokenizer: &mut Tokenizer, code: Code, marker: char, size: usize) -> } _ => { tokenizer.exit(TokenType::ThematicBreakSequence); - at_break(tokenizer, code, marker, size) - } - } -} - -/// In whitespace. 
-/// -/// ```markdown -/// * |* * -/// * | * * -/// ``` -fn whitespace(tokenizer: &mut Tokenizer, code: Code, marker: char, size: usize) -> StateFnResult { - match code { - Code::VirtualSpace | Code::Char('\t' | ' ') => { - tokenizer.consume(code); - ( - State::Fn(Box::new(move |tokenizer, code| { - whitespace(tokenizer, code, marker, size) - })), - None, + tokenizer.go(space_or_tab_opt(), move |t, c| at_break(t, c, marker, size))( + tokenizer, code, ) } - _ => { - tokenizer.exit(TokenType::ThematicBreakWhitespace); - at_break(tokenizer, code, marker, size) - } } } diff --git a/src/content/flow.rs b/src/content/flow.rs index 22aa77f..f4af4ea 100644 --- a/src/content/flow.rs +++ b/src/content/flow.rs @@ -24,7 +24,7 @@ use crate::construct::{ code_indented::start as code_indented, definition::start as definition, heading_atx::start as heading_atx, heading_setext::start as heading_setext, html_flow::start as html_flow, paragraph::start as paragraph, - partial_whitespace::start as whitespace, thematic_break::start as thematic_break, + thematic_break::start as thematic_break, }; use crate::subtokenize::subtokenize; use crate::tokenizer::{Code, Event, Point, State, StateFnResult, TokenType, Tokenizer}; @@ -95,9 +95,16 @@ fn initial_before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { match code { Code::None => (State::Ok, None), // To do: should all flow just start before the prefix? - _ => tokenizer.attempt_3(code_indented, code_fenced, html_flow, |ok| { - Box::new(if ok { after } else { before }) - })(tokenizer, code), + _ => tokenizer.attempt_7( + code_indented, + code_fenced, + html_flow, + heading_atx, + thematic_break, + definition, + heading_setext, + |ok| Box::new(if ok { after } else { before_paragraph }), + )(tokenizer, code), } } @@ -123,36 +130,6 @@ fn after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { } } -/// Before flow, but not at code (indented) or code (fenced). -/// -/// Compared to flow (initial), normal flow can be arbitrarily prefixed. -/// -/// ```markdown -/// |qwe -/// ``` -fn before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { - tokenizer.attempt( - |tokenizer, code| whitespace(tokenizer, code, TokenType::Whitespace), - |_ok| Box::new(before_after_prefix), - )(tokenizer, code) -} - -/// Before flow, after potential whitespace. -/// -/// ```markdown -/// |# asd -/// |*** -/// ``` -fn before_after_prefix(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { - tokenizer.attempt_4( - heading_atx, - thematic_break, - definition, - heading_setext, - |ok| Box::new(if ok { after } else { before_paragraph }), - )(tokenizer, code) -} - /// Before a paragraph. /// /// ```markdown diff --git a/src/tokenizer.rs b/src/tokenizer.rs index c1bb61b..de27d12 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -25,7 +25,6 @@ pub enum TokenType { AutolinkProtocol, AutolinkEmail, BlankLineEnding, - BlankLineWhitespace, CharacterEscape, CharacterEscapeMarker, CharacterEscapeValue, @@ -38,12 +37,10 @@ pub enum TokenType { CodeFenced, CodeFencedFence, CodeFencedFenceSequence, - CodeFencedFenceWhitespace, CodeFencedFenceInfo, CodeFencedFenceMeta, CodeFlowChunk, CodeIndented, - CodeIndentedPrefixWhitespace, CodeText, CodeTextSequence, CodeTextLineEnding, @@ -81,7 +78,6 @@ pub enum TokenType { Paragraph, ThematicBreak, ThematicBreakSequence, - ThematicBreakWhitespace, Whitespace, // Chunks are tokenizer, but unraveled by `subtokenize`. @@ -114,7 +110,7 @@ pub struct Point { /// 1-indexed line number. pub line: usize, /// 1-indexed column number. 
- /// Note that this is increases up to a tab stop for tabs. + /// This is increases up to a tab stop for tabs. /// Some editors count tabs as 1 character, so this position is not always /// the same as editors. pub column: usize, @@ -485,32 +481,14 @@ impl Tokenizer { ) } - pub fn attempt_3( - &mut self, - a: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static, - b: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static, - c: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static, - done: impl FnOnce(bool) -> Box<StateFn> + 'static, - ) -> Box<StateFn> { - self.call_multiple( - false, - Some(Box::new(a)), - Some(Box::new(b)), - Some(Box::new(c)), - None, - None, - None, - None, - done, - ) - } - - pub fn attempt_4( + #[allow(clippy::too_many_arguments, clippy::many_single_char_names)] + pub fn attempt_5( &mut self, a: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static, b: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static, c: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static, d: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static, + e: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static, done: impl FnOnce(bool) -> Box<StateFn> + 'static, ) -> Box<StateFn> { self.call_multiple( @@ -519,7 +497,7 @@ impl Tokenizer { Some(Box::new(b)), Some(Box::new(c)), Some(Box::new(d)), - None, + Some(Box::new(e)), None, None, done, |