author    Titus Wormer <tituswormer@gmail.com>    2022-08-09 10:45:15 +0200
committer Titus Wormer <tituswormer@gmail.com>    2022-08-09 10:45:15 +0200
commit    4ce1ac9e41cafa9051377470e8a246063f7d9b1a (patch)
tree      d678d9583764b2706fe7ea4876e91e40609f15b0
parent    8ffed1822bcbc1b6ce6647b840fb03996b0635ea (diff)
Rewrite algorithm to not pass around boxed functions
* Pass state names from an enum around instead of boxed functions
* Refactor to simplify attempts a lot
* Use a subtokenizer for the `document` content type
34 files changed, 2065 insertions, 1194 deletions
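
The core of the change is mechanical: each state function used to return the next state as a boxed closure (`State::Fn(Box::new(next))`), and now returns a `StateName` enum value that a central dispatcher resolves back to a plain `fn`. A minimal, self-contained sketch of that pattern follows (hypothetical, heavily simplified types; the crate's real `Tokenizer`, `State`, and `StateName` carry far more machinery):

```rust
// Sketch only: two states, an enum naming them, and one dispatcher.
#[derive(Clone, Copy, Debug)]
enum StateName {
    Start,
    Inside,
}

#[derive(Clone, Copy)]
enum State {
    Fn(StateName), // next state, by name instead of `Box<dyn Fn(..)>`
    Ok,
    Nok,
}

struct Tokenizer {
    current: Option<u8>,
}

/// Resolve a state name to the function implementing it.
fn call(tokenizer: &mut Tokenizer, name: StateName) -> State {
    match name {
        StateName::Start => start(tokenizer),
        StateName::Inside => inside(tokenizer),
    }
}

fn start(tokenizer: &mut Tokenizer) -> State {
    match tokenizer.current {
        Some(b'*') => State::Fn(StateName::Inside),
        _ => State::Nok,
    }
}

fn inside(_tokenizer: &mut Tokenizer) -> State {
    State::Ok
}

fn main() {
    let mut tokenizer = Tokenizer { current: Some(b'*') };
    let mut state = start(&mut tokenizer);

    // Drive the machine: follow `State::Fn` names until `Ok`/`Nok`.
    while let State::Fn(name) = state {
        state = call(&mut tokenizer, name);
    }

    match state {
        State::Ok => println!("matched"),
        _ => println!("no match"),
    }
}
```

Compared with returning `State::Fn(Box::new(next))`, the next state is now a plain `Copy` value: there is no heap allocation per step, and state names can be stored, compared, and handed to another tokenizer, which is presumably what makes the subtokenizer for the `document` content type straightforward.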
diff --git a/src/compiler.rs b/src/compiler.rs index b86fd82..57ab40a 100644 --- a/src/compiler.rs +++ b/src/compiler.rs @@ -482,28 +482,45 @@ fn on_enter_list(context: &mut CompileContext) { // Blank line directly in list or directly in list item, // but not a blank line after an empty list item. if balance < 3 && event.token_type == Token::BlankLineEnding { - let at_marker = balance == 2 - && events[skip::opt_back( - events, - index - 2, - &[Token::BlankLineEnding, Token::SpaceOrTab], - )] - .token_type - == Token::ListItemPrefix; - let at_list_item = balance == 1 && events[index - 2].token_type == Token::ListItem; - let at_empty_list_item = if at_list_item { - let before_item = skip::opt_back(events, index - 2, &[Token::ListItem]); - let before_prefix = skip::opt_back( - events, - index - 3, - &[Token::ListItemPrefix, Token::SpaceOrTab], - ); - before_item + 1 == before_prefix - } else { - false - }; + let mut at_marker = false; + + if balance == 2 { + let mut before = index - 2; + + if events[before].token_type == Token::SpaceOrTab { + before -= 2; + } + + if events[before].token_type == Token::ListItemPrefix { + at_marker = true; + } + } + + let mut at_empty_list_item = false; + let mut at_empty_block_quote = false; + + if balance == 1 { + let mut before = index - 2; + + if events[before].token_type == Token::SpaceOrTab { + before -= 2; + } + + if events[before].token_type == Token::ListItem + && events[before - 1].token_type == Token::ListItemPrefix + { + at_empty_list_item = true; + } + + if events[before].token_type == Token::ListItem + && events[before - 1].token_type == Token::BlockQuote + && events[before - 2].token_type == Token::BlockQuotePrefix + { + at_empty_block_quote = true; + } + } - if !at_marker && !at_list_item && !at_empty_list_item { + if !at_marker && !at_empty_list_item && !at_empty_block_quote { loose = true; break; } diff --git a/src/construct/attention.rs b/src/construct/attention.rs index fc2acfb..5a98a89 100644 --- a/src/construct/attention.rs +++ b/src/construct/attention.rs @@ -52,7 +52,7 @@ //! [html-strong]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-strong-element use crate::token::Token; -use crate::tokenizer::{Event, EventType, Point, State, Tokenizer}; +use crate::tokenizer::{Event, EventType, Point, State, StateName, Tokenizer}; use crate::unicode::PUNCTUATION; use crate::util::slice::Slice; @@ -132,11 +132,11 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { /// > | ** /// ^^ /// ``` -fn inside(tokenizer: &mut Tokenizer) -> State { +pub fn inside(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { Some(b'*' | b'_') if tokenizer.current.unwrap() == tokenizer.tokenize_state.marker => { tokenizer.consume(); - State::Fn(Box::new(inside)) + State::Fn(StateName::AttentionInside) } _ => { tokenizer.exit(Token::AttentionSequence); diff --git a/src/construct/autolink.rs b/src/construct/autolink.rs index 1444c61..15bfac1 100644 --- a/src/construct/autolink.rs +++ b/src/construct/autolink.rs @@ -103,7 +103,7 @@ use crate::constant::{AUTOLINK_DOMAIN_SIZE_MAX, AUTOLINK_SCHEME_SIZE_MAX}; use crate::token::Token; -use crate::tokenizer::{State, Tokenizer}; +use crate::tokenizer::{State, StateName, Tokenizer}; /// Start of an autolink. 
/// @@ -121,7 +121,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { tokenizer.consume(); tokenizer.exit(Token::AutolinkMarker); tokenizer.enter(Token::AutolinkProtocol); - State::Fn(Box::new(open)) + State::Fn(StateName::AutolinkOpen) } _ => State::Nok, } @@ -135,12 +135,12 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { /// > | a<user@example.com>b /// ^ /// ``` -fn open(tokenizer: &mut Tokenizer) -> State { +pub fn open(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { // ASCII alphabetic. Some(b'A'..=b'Z' | b'a'..=b'z') => { tokenizer.consume(); - State::Fn(Box::new(scheme_or_email_atext)) + State::Fn(StateName::AutolinkSchemeOrEmailAtext) } _ => email_atext(tokenizer), } @@ -154,7 +154,7 @@ fn open(tokenizer: &mut Tokenizer) -> State { /// > | a<user@example.com>b /// ^ /// ``` -fn scheme_or_email_atext(tokenizer: &mut Tokenizer) -> State { +pub fn scheme_or_email_atext(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { // ASCII alphanumeric and `+`, `-`, and `.`. Some(b'+' | b'-' | b'.' | b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z') => { @@ -174,12 +174,12 @@ fn scheme_or_email_atext(tokenizer: &mut Tokenizer) -> State { /// > | a<user@example.com>b /// ^ /// ``` -fn scheme_inside_or_email_atext(tokenizer: &mut Tokenizer) -> State { +pub fn scheme_inside_or_email_atext(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { Some(b':') => { tokenizer.consume(); tokenizer.tokenize_state.size = 0; - State::Fn(Box::new(url_inside)) + State::Fn(StateName::AutolinkUrlInside) } // ASCII alphanumeric and `+`, `-`, and `.`. Some(b'+' | b'-' | b'.' | b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z') @@ -187,7 +187,7 @@ fn scheme_inside_or_email_atext(tokenizer: &mut Tokenizer) -> State { { tokenizer.tokenize_state.size += 1; tokenizer.consume(); - State::Fn(Box::new(scheme_inside_or_email_atext)) + State::Fn(StateName::AutolinkSchemeInsideOrEmailAtext) } _ => { tokenizer.tokenize_state.size = 0; @@ -202,7 +202,7 @@ fn scheme_inside_or_email_atext(tokenizer: &mut Tokenizer) -> State { /// > | a<https://example.com>b /// ^ /// ``` -fn url_inside(tokenizer: &mut Tokenizer) -> State { +pub fn url_inside(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { Some(b'>') => { tokenizer.exit(Token::AutolinkProtocol); @@ -212,7 +212,7 @@ fn url_inside(tokenizer: &mut Tokenizer) -> State { None | Some(b'\0'..=0x1F | b' ' | b'<' | 0x7F) => State::Nok, Some(_) => { tokenizer.consume(); - State::Fn(Box::new(url_inside)) + State::Fn(StateName::AutolinkUrlInside) } } } @@ -223,11 +223,11 @@ fn url_inside(tokenizer: &mut Tokenizer) -> State { /// > | a<user.name@example.com>b /// ^ /// ``` -fn email_atext(tokenizer: &mut Tokenizer) -> State { +pub fn email_atext(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { Some(b'@') => { tokenizer.consume(); - State::Fn(Box::new(email_at_sign_or_dot)) + State::Fn(StateName::AutolinkEmailAtSignOrDot) } // ASCII atext. // @@ -250,7 +250,7 @@ fn email_atext(tokenizer: &mut Tokenizer) -> State { b'#'..=b'\'' | b'*' | b'+' | b'-'..=b'9' | b'=' | b'?' | b'A'..=b'Z' | b'^'..=b'~', ) => { tokenizer.consume(); - State::Fn(Box::new(email_atext)) + State::Fn(StateName::AutolinkEmailAtext) } _ => State::Nok, } @@ -262,7 +262,7 @@ fn email_atext(tokenizer: &mut Tokenizer) -> State { /// > | a<user.name@example.com>b /// ^ ^ /// ``` -fn email_at_sign_or_dot(tokenizer: &mut Tokenizer) -> State { +pub fn email_at_sign_or_dot(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { // ASCII alphanumeric. 
Some(b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z') => email_value(tokenizer), @@ -276,12 +276,12 @@ fn email_at_sign_or_dot(tokenizer: &mut Tokenizer) -> State { /// > | a<user.name@example.com>b /// ^ /// ``` -fn email_label(tokenizer: &mut Tokenizer) -> State { +pub fn email_label(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { Some(b'.') => { tokenizer.tokenize_state.size = 0; tokenizer.consume(); - State::Fn(Box::new(email_at_sign_or_dot)) + State::Fn(StateName::AutolinkEmailAtSignOrDot) } Some(b'>') => { tokenizer.tokenize_state.size = 0; @@ -304,20 +304,20 @@ fn email_label(tokenizer: &mut Tokenizer) -> State { /// > | a<user.name@ex-ample.com>b /// ^ /// ``` -fn email_value(tokenizer: &mut Tokenizer) -> State { +pub fn email_value(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { // ASCII alphanumeric or `-`. Some(b'-' | b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z') if tokenizer.tokenize_state.size < AUTOLINK_DOMAIN_SIZE_MAX => { - let func = if matches!(tokenizer.current, Some(b'-')) { - email_value + let state_name = if matches!(tokenizer.current, Some(b'-')) { + StateName::AutolinkEmailValue } else { - email_label + StateName::AutolinkEmailLabel }; tokenizer.tokenize_state.size += 1; tokenizer.consume(); - State::Fn(Box::new(func)) + State::Fn(state_name) } _ => { tokenizer.tokenize_state.size = 0; @@ -334,7 +334,7 @@ fn email_value(tokenizer: &mut Tokenizer) -> State { /// > | a<user@example.com>b /// ^ /// ``` -fn end(tokenizer: &mut Tokenizer) -> State { +pub fn end(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { Some(b'>') => { tokenizer.enter(Token::AutolinkMarker); diff --git a/src/construct/blank_line.rs b/src/construct/blank_line.rs index c4eacf5..b12c2c4 100644 --- a/src/construct/blank_line.rs +++ b/src/construct/blank_line.rs @@ -33,7 +33,7 @@ //! [flow]: crate::content::flow use crate::construct::partial_space_or_tab::space_or_tab; -use crate::tokenizer::{State, Tokenizer}; +use crate::tokenizer::{State, StateName, Tokenizer}; /// Start of a blank line. /// @@ -46,7 +46,8 @@ use crate::tokenizer::{State, Tokenizer}; /// ^ /// ``` pub fn start(tokenizer: &mut Tokenizer) -> State { - tokenizer.attempt_opt(space_or_tab(), after)(tokenizer) + let state_name = space_or_tab(tokenizer); + tokenizer.attempt_opt(state_name, StateName::BlankLineAfter) } /// After zero or more spaces or tabs, before a line ending or EOF. @@ -57,7 +58,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { /// > | ␊ /// ^ /// ``` -fn after(tokenizer: &mut Tokenizer) -> State { +pub fn after(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { None | Some(b'\n') => State::Ok, _ => State::Nok, diff --git a/src/construct/block_quote.rs b/src/construct/block_quote.rs index 7e4753d..df58d62 100644 --- a/src/construct/block_quote.rs +++ b/src/construct/block_quote.rs @@ -36,7 +36,7 @@ use crate::constant::TAB_SIZE; use crate::construct::partial_space_or_tab::space_or_tab_min_max; use crate::token::Token; -use crate::tokenizer::{State, Tokenizer}; +use crate::tokenizer::{State, StateName, Tokenizer}; /// Start of block quote. 
/// @@ -45,13 +45,17 @@ use crate::tokenizer::{State, Tokenizer}; /// ^ /// ``` pub fn start(tokenizer: &mut Tokenizer) -> State { - let max = if tokenizer.parse_state.constructs.code_indented { - TAB_SIZE - 1 - } else { - usize::MAX - }; if tokenizer.parse_state.constructs.block_quote { - tokenizer.go(space_or_tab_min_max(0, max), before)(tokenizer) + let state_name = space_or_tab_min_max( + tokenizer, + 0, + if tokenizer.parse_state.constructs.code_indented { + TAB_SIZE - 1 + } else { + usize::MAX + }, + ); + tokenizer.go(state_name, StateName::BlockQuoteBefore) } else { State::Nok } @@ -63,7 +67,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { /// > | > a /// ^ /// ``` -fn before(tokenizer: &mut Tokenizer) -> State { +pub fn before(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { Some(b'>') => { tokenizer.enter(Token::BlockQuote); @@ -80,13 +84,17 @@ fn before(tokenizer: &mut Tokenizer) -> State { /// > | > b /// ^ /// ``` -pub fn cont(tokenizer: &mut Tokenizer) -> State { - let max = if tokenizer.parse_state.constructs.code_indented { - TAB_SIZE - 1 - } else { - usize::MAX - }; - tokenizer.go(space_or_tab_min_max(0, max), cont_before)(tokenizer) +pub fn cont_start(tokenizer: &mut Tokenizer) -> State { + let state_name = space_or_tab_min_max( + tokenizer, + 0, + if tokenizer.parse_state.constructs.code_indented { + TAB_SIZE - 1 + } else { + usize::MAX + }, + ); + tokenizer.go(state_name, StateName::BlockQuoteContBefore) } /// After whitespace, before `>`. @@ -96,14 +104,14 @@ pub fn cont(tokenizer: &mut Tokenizer) -> State { /// > | > b /// ^ /// ``` -fn cont_before(tokenizer: &mut Tokenizer) -> State { +pub fn cont_before(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { Some(b'>') => { tokenizer.enter(Token::BlockQuotePrefix); tokenizer.enter(Token::BlockQuoteMarker); tokenizer.consume(); tokenizer.exit(Token::BlockQuoteMarker); - State::Fn(Box::new(cont_after)) + State::Fn(StateName::BlockQuoteContAfter) } _ => State::Nok, } @@ -117,15 +125,13 @@ fn cont_before(tokenizer: &mut Tokenizer) -> State { /// > | >b /// ^ /// ``` -fn cont_after(tokenizer: &mut Tokenizer) -> State { +pub fn cont_after(tokenizer: &mut Tokenizer) -> State { if let Some(b'\t' | b' ') = tokenizer.current { tokenizer.enter(Token::SpaceOrTab); tokenizer.consume(); tokenizer.exit(Token::SpaceOrTab); - tokenizer.exit(Token::BlockQuotePrefix); - State::Ok - } else { - tokenizer.exit(Token::BlockQuotePrefix); - State::Ok } + + tokenizer.exit(Token::BlockQuotePrefix); + State::Ok } diff --git a/src/construct/character_escape.rs b/src/construct/character_escape.rs index 4419d7a..de09f17 100644 --- a/src/construct/character_escape.rs +++ b/src/construct/character_escape.rs @@ -34,7 +34,7 @@ //! [hard_break_escape]: crate::construct::hard_break_escape use crate::token::Token; -use crate::tokenizer::{State, Tokenizer}; +use crate::tokenizer::{State, StateName, Tokenizer}; /// Start of a character escape. /// @@ -49,7 +49,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { tokenizer.enter(Token::CharacterEscapeMarker); tokenizer.consume(); tokenizer.exit(Token::CharacterEscapeMarker); - State::Fn(Box::new(inside)) + State::Fn(StateName::CharacterEscapeInside) } _ => State::Nok, } @@ -61,7 +61,8 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { /// > | a\*b /// ^ /// ``` -fn inside(tokenizer: &mut Tokenizer) -> State { +// StateName::CharacterEscapeInside +pub fn inside(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { // ASCII punctuation. 
Some(b'!'..=b'/' | b':'..=b'@' | b'['..=b'`' | b'{'..=b'~') => { diff --git a/src/construct/character_reference.rs b/src/construct/character_reference.rs index 7cc74ba..ba05fab 100644 --- a/src/construct/character_reference.rs +++ b/src/construct/character_reference.rs @@ -66,7 +66,7 @@ use crate::constant::{ CHARACTER_REFERENCE_HEXADECIMAL_SIZE_MAX, CHARACTER_REFERENCE_NAMED_SIZE_MAX, }; use crate::token::Token; -use crate::tokenizer::{State, Tokenizer}; +use crate::tokenizer::{State, StateName, Tokenizer}; use crate::util::slice::Slice; /// Start of a character reference. @@ -86,7 +86,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { tokenizer.enter(Token::CharacterReferenceMarker); tokenizer.consume(); tokenizer.exit(Token::CharacterReferenceMarker); - State::Fn(Box::new(open)) + State::Fn(StateName::CharacterReferenceOpen) } _ => State::Nok, } @@ -103,12 +103,13 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { /// > | a	b /// ^ /// ``` -fn open(tokenizer: &mut Tokenizer) -> State { +// StateName::CharacterReferenceOpen +pub fn open(tokenizer: &mut Tokenizer) -> State { if let Some(b'#') = tokenizer.current { tokenizer.enter(Token::CharacterReferenceMarkerNumeric); tokenizer.consume(); tokenizer.exit(Token::CharacterReferenceMarkerNumeric); - State::Fn(Box::new(numeric)) + State::Fn(StateName::CharacterReferenceNumeric) } else { tokenizer.tokenize_state.marker = b'&'; tokenizer.enter(Token::CharacterReferenceValue); @@ -125,14 +126,15 @@ fn open(tokenizer: &mut Tokenizer) -> State { /// > | a	b /// ^ /// ``` -fn numeric(tokenizer: &mut Tokenizer) -> State { +// StateName::CharacterReferenceNumeric +pub fn numeric(tokenizer: &mut Tokenizer) -> State { if let Some(b'x' | b'X') = tokenizer.current { tokenizer.enter(Token::CharacterReferenceMarkerHexadecimal); tokenizer.consume(); tokenizer.exit(Token::CharacterReferenceMarkerHexadecimal); tokenizer.enter(Token::CharacterReferenceValue); tokenizer.tokenize_state.marker = b'x'; - State::Fn(Box::new(value)) + State::Fn(StateName::CharacterReferenceValue) } else { tokenizer.enter(Token::CharacterReferenceValue); tokenizer.tokenize_state.marker = b'#'; @@ -154,7 +156,7 @@ fn numeric(tokenizer: &mut Tokenizer) -> State { /// > | a	b /// ^ /// ``` -fn value(tokenizer: &mut Tokenizer) -> State { +pub fn value(tokenizer: &mut Tokenizer) -> State { if matches!(tokenizer.current, Some(b';')) && tokenizer.tokenize_state.size > 0 { // Named. if tokenizer.tokenize_state.marker == b'&' { @@ -200,7 +202,7 @@ fn value(tokenizer: &mut Tokenizer) -> State { if tokenizer.tokenize_state.size < max && test(&byte) { tokenizer.tokenize_state.size += 1; tokenizer.consume(); - return State::Fn(Box::new(value)); + return State::Fn(StateName::CharacterReferenceValue); } } diff --git a/src/construct/code_fenced.rs b/src/construct/code_fenced.rs index a22a0f9..46c5f9f 100644 --- a/src/construct/code_fenced.rs +++ b/src/construct/code_fenced.rs @@ -102,12 +102,9 @@ //! 
[html-code]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-code-element use crate::constant::{CODE_FENCED_SEQUENCE_SIZE_MIN, TAB_SIZE}; -use crate::construct::{ - partial_non_lazy_continuation::start as partial_non_lazy_continuation, - partial_space_or_tab::{space_or_tab, space_or_tab_min_max}, -}; +use crate::construct::partial_space_or_tab::{space_or_tab, space_or_tab_min_max}; use crate::token::Token; -use crate::tokenizer::{ContentType, State, Tokenizer}; +use crate::tokenizer::{ContentType, State, StateName, Tokenizer}; use crate::util::slice::{Position, Slice}; /// Start of fenced code. @@ -122,17 +119,16 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { if tokenizer.parse_state.constructs.code_fenced { tokenizer.enter(Token::CodeFenced); tokenizer.enter(Token::CodeFencedFence); - tokenizer.go( - space_or_tab_min_max( - 0, - if tokenizer.parse_state.constructs.code_indented { - TAB_SIZE - 1 - } else { - usize::MAX - }, - ), - before_sequence_open, - )(tokenizer) + let state_name = space_or_tab_min_max( + tokenizer, + 0, + if tokenizer.parse_state.constructs.code_indented { + TAB_SIZE - 1 + } else { + usize::MAX + }, + ); + tokenizer.go(state_name, StateName::CodeFencedBeforeSequenceOpen) } else { State::Nok } @@ -146,7 +142,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { /// | console.log(1) /// | ~~~ /// ``` -fn before_sequence_open(tokenizer: &mut Tokenizer) -> State { +pub fn before_sequence_open(tokenizer: &mut Tokenizer) -> State { let tail = tokenizer.events.last(); let mut prefix = 0; @@ -178,16 +174,17 @@ fn before_sequence_open(tokenizer: &mut Tokenizer) -> State { /// | console.log(1) /// | ~~~ /// ``` -fn sequence_open(tokenizer: &mut Tokenizer) -> State { +pub fn sequence_open(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { Some(b'`' | b'~') if tokenizer.current.unwrap() == tokenizer.tokenize_state.marker => { tokenizer.tokenize_state.size += 1; tokenizer.consume(); - State::Fn(Box::new(sequence_open)) + State::Fn(StateName::CodeFencedSequenceOpen) } _ if tokenizer.tokenize_state.size >= CODE_FENCED_SEQUENCE_SIZE_MIN => { tokenizer.exit(Token::CodeFencedFenceSequence); - tokenizer.attempt_opt(space_or_tab(), info_before)(tokenizer) + let state_name = space_or_tab(tokenizer); + tokenizer.attempt_opt(state_name, StateName::CodeFencedInfoBefore) } _ => { tokenizer.tokenize_state.marker = 0; @@ -206,7 +203,7 @@ fn sequence_open(tokenizer: &mut Tokenizer) -> State { /// | console.log(1) /// | ~~~ /// ``` -fn info_before(tokenizer: &mut Tokenizer) -> State { +pub fn info_before(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { None | Some(b'\n') => { tokenizer.exit(Token::CodeFencedFence); @@ -217,7 +214,7 @@ fn info_before(tokenizer: &mut Tokenizer) -> State { _ => { tokenizer.enter(Token::CodeFencedFenceInfo); tokenizer.enter_with_content(Token::Data, Some(ContentType::String)); - info_inside(tokenizer) + info(tokenizer) } } } @@ -230,7 +227,7 @@ fn info_before(tokenizer: &mut Tokenizer) -> State { /// | console.log(1) /// | ~~~ /// ``` -fn info_inside(tokenizer: &mut Tokenizer) -> State { +pub fn info(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { None | Some(b'\n') => { tokenizer.exit(Token::Data); @@ -243,7 +240,8 @@ fn info_inside(tokenizer: &mut Tokenizer) -> State { Some(b'\t' | b' ') => { tokenizer.exit(Token::Data); tokenizer.exit(Token::CodeFencedFenceInfo); - tokenizer.attempt_opt(space_or_tab(), meta_before)(tokenizer) + let state_name = space_or_tab(tokenizer); + 
tokenizer.attempt_opt(state_name, StateName::CodeFencedMetaBefore) } Some(b'`') if tokenizer.tokenize_state.marker == b'`' => { tokenizer.concrete = false; @@ -254,7 +252,7 @@ fn info_inside(tokenizer: &mut Tokenizer) -> State { } Some(_) => { tokenizer.consume(); - State::Fn(Box::new(info_inside)) + State::Fn(StateName::CodeFencedInfo) } } } @@ -267,7 +265,7 @@ fn info_inside(tokenizer: &mut Tokenizer) -> State { /// | console.log(1) /// | ~~~ /// ``` -fn meta_before(tokenizer: &mut Tokenizer) -> State { +pub fn meta_before(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { None | Some(b'\n') => { tokenizer.exit(Token::CodeFencedFence); @@ -291,7 +289,7 @@ fn meta_before(tokenizer: &mut Tokenizer) -> State { /// | console.log(1) /// | ~~~ /// ``` -fn meta(tokenizer: &mut Tokenizer) -> State { +pub fn meta(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { None | Some(b'\n') => { tokenizer.exit(Token::Data); @@ -310,7 +308,7 @@ fn meta(tokenizer: &mut Tokenizer) -> State { } _ => { tokenizer.consume(); - State::Fn(Box::new(meta)) + State::Fn(StateName::CodeFencedMeta) } } } @@ -324,10 +322,14 @@ fn meta(tokenizer: &mut Tokenizer) -> State { /// ^ /// | ~~~ /// ``` -fn at_break(tokenizer: &mut Tokenizer) -> State { - tokenizer.check(partial_non_lazy_continuation, |ok| { - Box::new(if ok { at_non_lazy_break } else { after }) - })(tokenizer) +pub fn at_break(tokenizer: &mut Tokenizer) -> State { + tokenizer.check(StateName::NonLazyContinuationStart, |ok| { + State::Fn(if ok { + StateName::CodeFencedAtNonLazyBreak + } else { + StateName::CodeFencedAfter + }) + }) } /// At an eol/eof in code, before a non-lazy closing fence or content. @@ -339,10 +341,14 @@ fn at_break(tokenizer: &mut Tokenizer) -> State { /// ^ /// | ~~~ /// ``` -fn at_non_lazy_break(tokenizer: &mut Tokenizer) -> State { - tokenizer.attempt(close_begin, |ok| { - Box::new(if ok { after } else { content_before }) - })(tokenizer) +pub fn at_non_lazy_break(tokenizer: &mut Tokenizer) -> State { + tokenizer.attempt(StateName::CodeFencedCloseBefore, |ok| { + State::Fn(if ok { + StateName::CodeFencedAfter + } else { + StateName::CodeFencedContentBefore + }) + }) } /// Before a closing fence, at the line ending. @@ -353,13 +359,13 @@ fn at_non_lazy_break(tokenizer: &mut Tokenizer) -> State { /// ^ /// | ~~~ /// ``` -fn close_begin(tokenizer: &mut Tokenizer) -> State { +pub fn close_before(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { Some(b'\n') => { tokenizer.enter(Token::LineEnding); tokenizer.consume(); tokenizer.exit(Token::LineEnding); - State::Fn(Box::new(close_start)) + State::Fn(StateName::CodeFencedCloseStart) } _ => unreachable!("expected eol"), } @@ -373,19 +379,18 @@ fn close_begin(tokenizer: &mut Tokenizer) -> State { /// > | ~~~ /// ^ /// ``` -fn close_start(tokenizer: &mut Tokenizer) -> State { +pub fn close_start(tokenizer: &mut Tokenizer) -> State { tokenizer.enter(Token::CodeFencedFence); - tokenizer.go( - space_or_tab_min_max( - 0, - if tokenizer.parse_state.constructs.code_indented { - TAB_SIZE - 1 - } else { - usize::MAX - }, - ), - close_before, - )(tokenizer) + let state_name = space_or_tab_min_max( + tokenizer, + 0, + if tokenizer.parse_state.constructs.code_indented { + TAB_SIZE - 1 + } else { + usize::MAX + }, + ); + tokenizer.go(state_name, StateName::CodeFencedBeforeSequenceClose) } /// In a closing fence, after optional whitespace, before sequence. 
@@ -396,11 +401,11 @@ fn close_start(tokenizer: &mut Tokenizer) -> State { /// > | ~~~ /// ^ /// ``` -fn close_before(tokenizer: &mut Tokenizer) -> State { +pub fn before_sequence_close(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { Some(b'`' | b'~') if tokenizer.current.unwrap() == tokenizer.tokenize_state.marker => { tokenizer.enter(Token::CodeFencedFenceSequence); - close_sequence(tokenizer) + sequence_close(tokenizer) } _ => State::Nok, } @@ -414,19 +419,20 @@ fn close_before(tokenizer: &mut Tokenizer) -> State { /// > | ~~~ /// ^ /// ``` -fn close_sequence(tokenizer: &mut Tokenizer) -> State { +pub fn sequence_close(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { Some(b'`' | b'~') if tokenizer.current.unwrap() == tokenizer.tokenize_state.marker => { tokenizer.tokenize_state.size_other += 1; tokenizer.consume(); - State::Fn(Box::new(close_sequence)) + State::Fn(StateName::CodeFencedSequenceClose) } _ if tokenizer.tokenize_state.size_other >= CODE_FENCED_SEQUENCE_SIZE_MIN && tokenizer.tokenize_state.size_other >= tokenizer.tokenize_state.size => { tokenizer.tokenize_state.size_other = 0; tokenizer.exit(Token::CodeFencedFenceSequence); - tokenizer.attempt_opt(space_or_tab(), close_sequence_after)(tokenizer) + let state_name = space_or_tab(tokenizer); + tokenizer.attempt_opt(state_name, StateName::CodeFencedAfterSequenceClose) } _ => { tokenizer.tokenize_state.size_other = 0; @@ -443,7 +449,7 @@ fn close_sequence(tokenizer: &mut Tokenizer) -> State { /// > | ~~~ /// ^ /// ``` -fn close_sequence_after(tokenizer: &mut Tokenizer) -> State { +pub fn sequence_close_after(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { None | Some(b'\n') => { tokenizer.exit(Token::CodeFencedFence); @@ -461,11 +467,11 @@ fn close_sequence_after(tokenizer: &mut Tokenizer) -> State { /// ^ /// | ~~~ /// ``` -fn content_before(tokenizer: &mut Tokenizer) -> State { +pub fn content_before(tokenizer: &mut Tokenizer) -> State { tokenizer.enter(Token::LineEnding); tokenizer.consume(); tokenizer.exit(Token::LineEnding); - State::Fn(Box::new(content_start)) + State::Fn(StateName::CodeFencedContentStart) } /// Before code content, definitely not before a closing fence. /// @@ -475,11 +481,9 @@ fn content_before(tokenizer: &mut Tokenizer) -> State { /// ^ /// | ~~~ /// ``` -fn content_start(tokenizer: &mut Tokenizer) -> State { - tokenizer.go( - space_or_tab_min_max(0, tokenizer.tokenize_state.prefix), - content_begin, - )(tokenizer) +pub fn content_start(tokenizer: &mut Tokenizer) -> State { + let state_name = space_or_tab_min_max(tokenizer, 0, tokenizer.tokenize_state.prefix); + tokenizer.go(state_name, StateName::CodeFencedBeforeContentChunk) } /// Before code content, after a prefix. 
@@ -490,12 +494,12 @@ fn content_start(tokenizer: &mut Tokenizer) -> State { /// ^ /// | ~~~ /// ``` -fn content_begin(tokenizer: &mut Tokenizer) -> State { +pub fn before_content_chunk(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { None | Some(b'\n') => at_break(tokenizer), _ => { tokenizer.enter(Token::CodeFlowChunk); - content_continue(tokenizer) + content_chunk(tokenizer) } } } @@ -508,7 +512,7 @@ fn content_begin(tokenizer: &mut Tokenizer) -> State { /// ^^^^^^^^^^^^^^ /// | ~~~ /// ``` -fn content_continue(tokenizer: &mut Tokenizer) -> State { +pub fn content_chunk(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { None | Some(b'\n') => { tokenizer.exit(Token::CodeFlowChunk); @@ -516,7 +520,7 @@ fn content_continue(tokenizer: &mut Tokenizer) -> State { } _ => { tokenizer.consume(); - State::Fn(Box::new(content_continue)) + State::Fn(StateName::CodeFencedContentChunk) } } } @@ -529,7 +533,7 @@ fn content_continue(tokenizer: &mut Tokenizer) -> State { /// > | ~~~ /// ^ /// ``` -fn after(tokenizer: &mut Tokenizer) -> State { +pub fn after(tokenizer: &mut Tokenizer) -> State { tokenizer.exit(Token::CodeFenced); tokenizer.tokenize_state.marker = 0; tokenizer.tokenize_state.prefix = 0; diff --git a/src/construct/code_indented.rs b/src/construct/code_indented.rs index 81a3080..516b493 100644 --- a/src/construct/code_indented.rs +++ b/src/construct/code_indented.rs @@ -48,7 +48,7 @@ use super::partial_space_or_tab::{space_or_tab, space_or_tab_min_max}; use crate::constant::TAB_SIZE; use crate::token::Token; -use crate::tokenizer::{State, Tokenizer}; +use crate::tokenizer::{State, StateName, Tokenizer}; /// Start of code (indented). /// @@ -64,7 +64,8 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { // Do not interrupt paragraphs. if !tokenizer.interrupt && tokenizer.parse_state.constructs.code_indented { tokenizer.enter(Token::CodeIndented); - tokenizer.go(space_or_tab_min_max(TAB_SIZE, TAB_SIZE), at_break)(tokenizer) + let state_name = space_or_tab_min_max(tokenizer, TAB_SIZE, TAB_SIZE); + tokenizer.go(state_name, StateName::CodeIndentedAtBreak) } else { State::Nok } @@ -76,15 +77,19 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { /// > | aaa /// ^ ^ /// ``` -fn at_break(tokenizer: &mut Tokenizer) -> State { +pub fn at_break(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { None => after(tokenizer), - Some(b'\n') => tokenizer.attempt(further_start, |ok| { - Box::new(if ok { at_break } else { after }) - })(tokenizer), + Some(b'\n') => tokenizer.attempt(StateName::CodeIndentedFurtherStart, |ok| { + State::Fn(if ok { + StateName::CodeIndentedAtBreak + } else { + StateName::CodeIndentedAfter + }) + }), _ => { tokenizer.enter(Token::CodeFlowChunk); - content(tokenizer) + inside(tokenizer) } } } @@ -95,7 +100,7 @@ fn at_break(tokenizer: &mut Tokenizer) -> State { /// > | aaa /// ^^^^ /// ``` -fn content(tokenizer: &mut Tokenizer) -> State { +pub fn inside(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { None | Some(b'\n') => { tokenizer.exit(Token::CodeFlowChunk); @@ -103,7 +108,7 @@ fn content(tokenizer: &mut Tokenizer) -> State { } _ => { tokenizer.consume(); - State::Fn(Box::new(content)) + State::Fn(StateName::CodeIndentedInside) } } } @@ -114,7 +119,7 @@ fn content(tokenizer: &mut Tokenizer) -> State { /// > | aaa /// ^ /// ``` -fn after(tokenizer: &mut Tokenizer) -> State { +pub fn after(tokenizer: &mut Tokenizer) -> State { tokenizer.exit(Token::CodeIndented); // Feel free to interrupt. 
tokenizer.interrupt = false; @@ -128,17 +133,24 @@ fn after(tokenizer: &mut Tokenizer) -> State { /// ^ /// | bbb /// ``` -fn further_start(tokenizer: &mut Tokenizer) -> State { +pub fn further_start(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { Some(b'\n') if !tokenizer.lazy => { tokenizer.enter(Token::LineEnding); tokenizer.consume(); tokenizer.exit(Token::LineEnding); - State::Fn(Box::new(further_start)) + State::Fn(StateName::CodeIndentedFurtherStart) + } + _ if !tokenizer.lazy => { + let state_name = space_or_tab_min_max(tokenizer, TAB_SIZE, TAB_SIZE); + tokenizer.attempt(state_name, |ok| { + State::Fn(if ok { + StateName::CodeIndentedFurtherEnd + } else { + StateName::CodeIndentedFurtherBegin + }) + }) } - _ if !tokenizer.lazy => tokenizer.attempt(space_or_tab_min_max(TAB_SIZE, TAB_SIZE), |ok| { - Box::new(if ok { further_end } else { further_begin }) - })(tokenizer), _ => State::Nok, } } @@ -150,7 +162,7 @@ fn further_start(tokenizer: &mut Tokenizer) -> State { /// ^ /// | bbb /// ``` -fn further_end(_tokenizer: &mut Tokenizer) -> State { +pub fn further_end(_tokenizer: &mut Tokenizer) -> State { State::Ok } @@ -161,8 +173,9 @@ fn further_end(_tokenizer: &mut Tokenizer) -> State { /// > | bbb /// ^ /// ``` -fn further_begin(tokenizer: &mut Tokenizer) -> State { - tokenizer.attempt_opt(space_or_tab(), further_after)(tokenizer) +pub fn further_begin(tokenizer: &mut Tokenizer) -> State { + let state_name = space_or_tab(tokenizer); + tokenizer.attempt_opt(state_name, StateName::CodeIndentedFurtherAfter) } /// After whitespace, not indented enough. @@ -172,7 +185,7 @@ fn further_begin(tokenizer: &mut Tokenizer) -> State { /// > | bbb /// ^ /// ``` -fn further_after(tokenizer: &mut Tokenizer) -> State { +pub fn further_after(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { Some(b'\n') => further_start(tokenizer), _ => State::Nok, diff --git a/src/construct/code_text.rs b/src/construct/code_text.rs index 31777f4..5bdefbb 100644 --- a/src/construct/code_text.rs +++ b/src/construct/code_text.rs @@ -84,7 +84,7 @@ //! [html-code]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-code-element use crate::token::Token; -use crate::tokenizer::{State, Tokenizer}; +use crate::tokenizer::{State, StateName, Tokenizer}; /// Start of code (text). 
/// @@ -117,11 +117,11 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { /// > | `a` /// ^ /// ``` -fn sequence_open(tokenizer: &mut Tokenizer) -> State { +pub fn sequence_open(tokenizer: &mut Tokenizer) -> State { if let Some(b'`') = tokenizer.current { tokenizer.tokenize_state.size += 1; tokenizer.consume(); - State::Fn(Box::new(sequence_open)) + State::Fn(StateName::CodeTextSequenceOpen) } else { tokenizer.exit(Token::CodeTextSequence); between(tokenizer) @@ -134,7 +134,7 @@ fn sequence_open(tokenizer: &mut Tokenizer) -> State { /// > | `a` /// ^^ /// ``` -fn between(tokenizer: &mut Tokenizer) -> State { +pub fn between(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { None => { tokenizer.tokenize_state.size = 0; @@ -144,7 +144,7 @@ fn between(tokenizer: &mut Tokenizer) -> State { tokenizer.enter(Token::LineEnding); tokenizer.consume(); tokenizer.exit(Token::LineEnding); - State::Fn(Box::new(between)) + State::Fn(StateName::CodeTextBetween) } Some(b'`') => { tokenizer.enter(Token::CodeTextSequence); @@ -163,7 +163,7 @@ fn between(tokenizer: &mut Tokenizer) -> State { /// > | `a` /// ^ /// ``` -fn data(tokenizer: &mut Tokenizer) -> State { +pub fn data(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { None | Some(b'\n' | b'`') => { tokenizer.exit(Token::CodeTextData); @@ -171,7 +171,7 @@ fn data(tokenizer: &mut Tokenizer) -> State { } _ => { tokenizer.consume(); - State::Fn(Box::new(data)) + State::Fn(StateName::CodeTextData) } } } @@ -182,12 +182,12 @@ fn data(tokenizer: &mut Tokenizer) -> State { /// > | `a` /// ^ /// ``` -fn sequence_close(tokenizer: &mut Tokenizer) -> State { +pub fn sequence_close(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { Some(b'`') => { tokenizer.tokenize_state.size_other += 1; tokenizer.consume(); - State::Fn(Box::new(sequence_close)) + State::Fn(StateName::CodeTextSequenceClose) } _ => { if tokenizer.tokenize_state.size == tokenizer.tokenize_state.size_other { diff --git a/src/construct/definition.rs b/src/construct/definition.rs index a56dab4..fbad99d 100644 --- a/src/construct/definition.rs +++ b/src/construct/definition.rs @@ -93,14 +93,9 @@ //! [html-a]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-a-element //! [html-img]: https://html.spec.whatwg.org/multipage/embedded-content.html#the-img-element -use crate::construct::{ - partial_destination::start as destination, - partial_label::start as label, - partial_space_or_tab::{space_or_tab, space_or_tab_eol}, - partial_title::start as title, -}; +use crate::construct::partial_space_or_tab::{space_or_tab, space_or_tab_eol}; use crate::token::Token; -use crate::tokenizer::{State, Tokenizer}; +use crate::tokenizer::{State, StateName, Tokenizer}; use crate::util::skip::opt_back as skip_opt_back; /// At the start of a definition. @@ -124,7 +119,8 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { if possible && tokenizer.parse_state.constructs.definition { tokenizer.enter(Token::Definition); // Note: arbitrary whitespace allowed even if code (indented) is on. 
- tokenizer.attempt_opt(space_or_tab(), before)(tokenizer) + let state_name = space_or_tab(tokenizer); + tokenizer.attempt_opt(state_name, StateName::DefinitionBefore) } else { State::Nok } @@ -136,13 +132,13 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { /// > | [a]: b "c" /// ^ /// ``` -fn before(tokenizer: &mut Tokenizer) -> State { +pub fn before(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { Some(b'[') => { tokenizer.tokenize_state.token_1 = Token::DefinitionLabel; tokenizer.tokenize_state.token_2 = Token::DefinitionLabelMarker; tokenizer.tokenize_state.token_3 = Token::DefinitionLabelString; - tokenizer.go(label, label_after)(tokenizer) + tokenizer.go(StateName::LabelStart, StateName::DefinitionLabelAfter) } _ => State::Nok, } @@ -154,7 +150,7 @@ fn before(tokenizer: &mut Tokenizer) -> State { /// > | [a]: b "c" /// ^ /// ``` -fn label_after(tokenizer: &mut Tokenizer) -> State { +pub fn label_after(tokenizer: &mut Tokenizer) -> State { tokenizer.tokenize_state.token_1 = Token::Data; tokenizer.tokenize_state.token_2 = Token::Data; tokenizer.tokenize_state.token_3 = Token::Data; @@ -164,34 +160,38 @@ fn label_after(tokenizer: &mut Tokenizer) -> State { tokenizer.enter(Token::DefinitionMarker); tokenizer.consume(); tokenizer.exit(Token::DefinitionMarker); - State::Fn(Box::new( - tokenizer.attempt_opt(space_or_tab_eol(), destination_before), - )) + State::Fn(StateName::DefinitionMarkerAfter) } _ => State::Nok, } } +/// To do. +pub fn marker_after(tokenizer: &mut Tokenizer) -> State { + let state_name = space_or_tab_eol(tokenizer); + tokenizer.attempt_opt(state_name, StateName::DefinitionDestinationBefore) +} + /// Before a destination. /// /// ```markdown /// > | [a]: b "c" /// ^ /// ``` -fn destination_before(tokenizer: &mut Tokenizer) -> State { +pub fn destination_before(tokenizer: &mut Tokenizer) -> State { tokenizer.tokenize_state.token_1 = Token::DefinitionDestination; tokenizer.tokenize_state.token_2 = Token::DefinitionDestinationLiteral; tokenizer.tokenize_state.token_3 = Token::DefinitionDestinationLiteralMarker; tokenizer.tokenize_state.token_4 = Token::DefinitionDestinationRaw; tokenizer.tokenize_state.token_5 = Token::DefinitionDestinationString; tokenizer.tokenize_state.size_other = usize::MAX; - tokenizer.attempt(destination, |ok| { - Box::new(if ok { - destination_after + tokenizer.attempt(StateName::DestinationStart, |ok| { + State::Fn(if ok { + StateName::DefinitionDestinationAfter } else { - destination_missing + StateName::DefinitionDestinationMissing }) - })(tokenizer) + }) } /// After a destination. @@ -200,18 +200,18 @@ fn destination_before(tokenizer: &mut Tokenizer) -> State { /// > | [a]: b "c" /// ^ /// ``` -fn destination_after(tokenizer: &mut Tokenizer) -> State { +pub fn destination_after(tokenizer: &mut Tokenizer) -> State { tokenizer.tokenize_state.token_1 = Token::Data; tokenizer.tokenize_state.token_2 = Token::Data; tokenizer.tokenize_state.token_3 = Token::Data; tokenizer.tokenize_state.token_4 = Token::Data; tokenizer.tokenize_state.token_5 = Token::Data; tokenizer.tokenize_state.size_other = 0; - tokenizer.attempt_opt(title_before, after)(tokenizer) + tokenizer.attempt_opt(StateName::DefinitionTitleBefore, StateName::DefinitionAfter) } /// Without destination. 
-fn destination_missing(tokenizer: &mut Tokenizer) -> State { +pub fn destination_missing(tokenizer: &mut Tokenizer) -> State { tokenizer.tokenize_state.token_1 = Token::Data; tokenizer.tokenize_state.token_2 = Token::Data; tokenizer.tokenize_state.token_3 = Token::Data; @@ -229,8 +229,9 @@ fn destination_missing(tokenizer: &mut Tokenizer) -> State { /// > | [a]: b "c" /// ^ /// ``` -fn after(tokenizer: &mut Tokenizer) -> State { - tokenizer.attempt_opt(space_or_tab(), after_whitespace)(tokenizer) +pub fn after(tokenizer: &mut Tokenizer) -> State { + let state_name = space_or_tab(tokenizer); + tokenizer.attempt_opt(state_name, StateName::DefinitionAfterWhitespace) } /// After a definition, after optional whitespace. @@ -241,7 +242,7 @@ fn after(tokenizer: &mut Tokenizer) -> State { /// > | [a]: b "c" /// ^ /// ``` -fn after_whitespace(tokenizer: &mut Tokenizer) -> State { +pub fn after_whitespace(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { None | Some(b'\n') => { tokenizer.exit(Token::Definition); @@ -261,8 +262,9 @@ fn after_whitespace(tokenizer: &mut Tokenizer) -> State { /// > | [a]: b "c" /// ^ /// ``` -fn title_before(tokenizer: &mut Tokenizer) -> State { - tokenizer.go(space_or_tab_eol(), title_before_marker)(tokenizer) +pub fn title_before(tokenizer: &mut Tokenizer) -> State { + let state_name = space_or_tab_eol(tokenizer); + tokenizer.go(state_name, StateName::DefinitionTitleBeforeMarker) } /// Before a title, after a line ending. @@ -272,11 +274,11 @@ fn title_before(tokenizer: &mut Tokenizer) -> State { /// > | "c" /// ^ /// ``` -fn title_before_marker(tokenizer: &mut Tokenizer) -> State { +pub fn title_before_marker(tokenizer: &mut Tokenizer) -> State { tokenizer.tokenize_state.token_1 = Token::DefinitionTitle; tokenizer.tokenize_state.token_2 = Token::DefinitionTitleMarker; tokenizer.tokenize_state.token_3 = Token::DefinitionTitleString; - tokenizer.go(title, title_after)(tokenizer) + tokenizer.go(StateName::TitleStart, StateName::DefinitionTitleAfter) } /// After a title. @@ -285,11 +287,15 @@ fn title_before_marker(tokenizer: &mut Tokenizer) -> State { /// > | [a]: b "c" /// ^ /// ``` -fn title_after(tokenizer: &mut Tokenizer) -> State { +pub fn title_after(tokenizer: &mut Tokenizer) -> State { tokenizer.tokenize_state.token_1 = Token::Data; tokenizer.tokenize_state.token_2 = Token::Data; tokenizer.tokenize_state.token_3 = Token::Data; - tokenizer.attempt_opt(space_or_tab(), title_after_after_optional_whitespace)(tokenizer) + let state_name = space_or_tab(tokenizer); + tokenizer.attempt_opt( + state_name, + StateName::DefinitionTitleAfterOptionalWhitespace, + ) } /// After a title, after optional whitespace. @@ -298,7 +304,7 @@ fn title_after(tokenizer: &mut Tokenizer) -> State { /// > | [a]: b "c" /// ^ /// ``` -fn title_after_after_optional_whitespace(tokenizer: &mut Tokenizer) -> State { +pub fn title_after_optional_whitespace(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { None | Some(b'\n') => State::Ok, _ => State::Nok, diff --git a/src/construct/hard_break_escape.rs b/src/construct/hard_break_escape.rs index d09bf54..47b7e94 100644 --- a/src/construct/hard_break_escape.rs +++ b/src/construct/hard_break_escape.rs @@ -40,7 +40,7 @@ //! [html]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-br-element use crate::token::Token; -use crate::tokenizer::{State, Tokenizer}; +use crate::tokenizer::{State, StateName, Tokenizer}; /// Start of a hard break (escape). 
/// @@ -54,7 +54,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { Some(b'\\') if tokenizer.parse_state.constructs.hard_break_escape => { tokenizer.enter(Token::HardBreakEscape); tokenizer.consume(); - State::Fn(Box::new(after)) + State::Fn(StateName::HardBreakEscapeAfter) } _ => State::Nok, } @@ -67,7 +67,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { /// ^ /// | b /// ``` -fn after(tokenizer: &mut Tokenizer) -> State { +pub fn after(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { Some(b'\n') => { tokenizer.exit(Token::HardBreakEscape); diff --git a/src/construct/heading_atx.rs b/src/construct/heading_atx.rs index 6751567..45c4758 100644 --- a/src/construct/heading_atx.rs +++ b/src/construct/heading_atx.rs @@ -57,7 +57,7 @@ use super::partial_space_or_tab::{space_or_tab, space_or_tab_min_max}; use crate::constant::{HEADING_ATX_OPENING_FENCE_SIZE_MAX, TAB_SIZE}; use crate::token::Token; -use crate::tokenizer::{ContentType, Event, EventType, State, Tokenizer}; +use crate::tokenizer::{ContentType, Event, EventType, State, StateName, Tokenizer}; /// Start of a heading (atx). /// @@ -68,17 +68,16 @@ use crate::tokenizer::{ContentType, Event, EventType, State, Tokenizer}; pub fn start(tokenizer: &mut Tokenizer) -> State { if tokenizer.parse_state.constructs.heading_atx { tokenizer.enter(Token::HeadingAtx); - tokenizer.go( - space_or_tab_min_max( - 0, - if tokenizer.parse_state.constructs.code_indented { - TAB_SIZE - 1 - } else { - usize::MAX - }, - ), - before, - )(tokenizer) + let state_name = space_or_tab_min_max( + tokenizer, + 0, + if tokenizer.parse_state.constructs.code_indented { + TAB_SIZE - 1 + } else { + usize::MAX + }, + ); + tokenizer.go(state_name, StateName::HeadingAtxBefore) } else { State::Nok } @@ -90,7 +89,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { /// > | ## aa /// ^ /// ``` -fn before(tokenizer: &mut Tokenizer) -> State { +pub fn before(tokenizer: &mut Tokenizer) -> State { if Some(b'#') == tokenizer.current { tokenizer.enter(Token::HeadingAtxSequence); sequence_open(tokenizer) @@ -105,7 +104,7 @@ fn before(tokenizer: &mut Tokenizer) -> State { /// > | ## aa /// ^ /// ``` -fn sequence_open(tokenizer: &mut Tokenizer) -> State { +pub fn sequence_open(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { None | Some(b'\n') if tokenizer.tokenize_state.size > 0 => { tokenizer.tokenize_state.size = 0; @@ -115,12 +114,13 @@ fn sequence_open(tokenizer: &mut Tokenizer) -> State { Some(b'#') if tokenizer.tokenize_state.size < HEADING_ATX_OPENING_FENCE_SIZE_MAX => { tokenizer.tokenize_state.size += 1; tokenizer.consume(); - State::Fn(Box::new(sequence_open)) + State::Fn(StateName::HeadingAtxSequenceOpen) } _ if tokenizer.tokenize_state.size > 0 => { tokenizer.tokenize_state.size = 0; tokenizer.exit(Token::HeadingAtxSequence); - tokenizer.go(space_or_tab(), at_break)(tokenizer) + let state_name = space_or_tab(tokenizer); + tokenizer.go(state_name, StateName::HeadingAtxAtBreak) } _ => { tokenizer.tokenize_state.size = 0; @@ -135,7 +135,7 @@ fn sequence_open(tokenizer: &mut Tokenizer) -> State { /// > | ## aa /// ^ /// ``` -fn at_break(tokenizer: &mut Tokenizer) -> State { +pub fn at_break(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { None | Some(b'\n') => { tokenizer.exit(Token::HeadingAtx); @@ -144,10 +144,13 @@ fn at_break(tokenizer: &mut Tokenizer) -> State { tokenizer.interrupt = false; State::Ok } - Some(b'\t' | b' ') => tokenizer.go(space_or_tab(), at_break)(tokenizer), + Some(b'\t' | b' ') => { + let 
state_name = space_or_tab(tokenizer); + tokenizer.go(state_name, StateName::HeadingAtxAtBreak) + } Some(b'#') => { tokenizer.enter(Token::HeadingAtxSequence); - further_sequence(tokenizer) + sequence_further(tokenizer) } Some(_) => { tokenizer.enter_with_content(Token::Data, Some(ContentType::Text)); @@ -164,10 +167,10 @@ fn at_break(tokenizer: &mut Tokenizer) -> State { /// > | ## aa ## /// ^ /// ``` -fn further_sequence(tokenizer: &mut Tokenizer) -> State { +pub fn sequence_further(tokenizer: &mut Tokenizer) -> State { if let Some(b'#') = tokenizer.current { tokenizer.consume(); - State::Fn(Box::new(further_sequence)) + State::Fn(StateName::HeadingAtxSequenceFurther) } else { tokenizer.exit(Token::HeadingAtxSequence); at_break(tokenizer) @@ -180,7 +183,7 @@ fn further_sequence(tokenizer: &mut Tokenizer) -> State { /// > | ## aa /// ^ /// ``` -fn data(tokenizer: &mut Tokenizer) -> State { +pub fn data(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { // Note: `#` for closing sequence must be preceded by whitespace, otherwise it’s just text. None | Some(b'\t' | b'\n' | b' ') => { @@ -189,7 +192,7 @@ fn data(tokenizer: &mut Tokenizer) -> State { } _ => { tokenizer.consume(); - State::Fn(Box::new(data)) + State::Fn(StateName::HeadingAtxData) } } } diff --git a/src/construct/heading_setext.rs b/src/construct/heading_setext.rs index 675b2ac..50feba4 100644 --- a/src/construct/heading_setext.rs +++ b/src/construct/heading_setext.rs @@ -60,7 +60,7 @@ use crate::constant::TAB_SIZE; use crate::construct::partial_space_or_tab::{space_or_tab, space_or_tab_min_max}; use crate::token::Token; -use crate::tokenizer::{EventType, State, Tokenizer}; +use crate::tokenizer::{EventType, State, StateName, Tokenizer}; use crate::util::skip::opt_back as skip_opt_back; /// At a line ending, presumably an underline. 
@@ -83,17 +83,17 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { .token_type == Token::Paragraph) { - tokenizer.go( - space_or_tab_min_max( - 0, - if tokenizer.parse_state.constructs.code_indented { - TAB_SIZE - 1 - } else { - usize::MAX - }, - ), - before, - )(tokenizer) + let state_name = space_or_tab_min_max( + tokenizer, + 0, + if tokenizer.parse_state.constructs.code_indented { + TAB_SIZE - 1 + } else { + usize::MAX + }, + ); + + tokenizer.go(state_name, StateName::HeadingSetextBefore) } else { State::Nok } @@ -106,7 +106,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { /// > | == /// ^ /// ``` -fn before(tokenizer: &mut Tokenizer) -> State { +pub fn before(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { Some(b'-' | b'=') => { tokenizer.tokenize_state.marker = tokenizer.current.unwrap(); @@ -124,16 +124,17 @@ fn before(tokenizer: &mut Tokenizer) -> State { /// > | == /// ^ /// ``` -fn inside(tokenizer: &mut Tokenizer) -> State { +pub fn inside(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { Some(b'-' | b'=') if tokenizer.current.unwrap() == tokenizer.tokenize_state.marker => { tokenizer.consume(); - State::Fn(Box::new(inside)) + State::Fn(StateName::HeadingSetextInside) } _ => { tokenizer.tokenize_state.marker = 0; tokenizer.exit(Token::HeadingSetextUnderline); - tokenizer.attempt_opt(space_or_tab(), after)(tokenizer) + let state_name = space_or_tab(tokenizer); + tokenizer.attempt_opt(state_name, StateName::HeadingSetextAfter) } } } @@ -145,7 +146,7 @@ fn inside(tokenizer: &mut Tokenizer) -> State { /// > | == /// ^ /// ``` -fn after(tokenizer: &mut Tokenizer) -> State { +pub fn after(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { None | Some(b'\n') => { // Feel free to interrupt. diff --git a/src/construct/html_flow.rs b/src/construct/html_flow.rs index aaa803d..779146c 100644 --- a/src/construct/html_flow.rs +++ b/src/construct/html_flow.rs @@ -101,13 +101,11 @@ use crate::constant::{ HTML_BLOCK_NAMES, HTML_CDATA_PREFIX, HTML_RAW_NAMES, HTML_RAW_SIZE_MAX, TAB_SIZE, }; -use crate::construct::{ - blank_line::start as blank_line, - partial_non_lazy_continuation::start as partial_non_lazy_continuation, - partial_space_or_tab::{space_or_tab_with_options, Options as SpaceOrTabOptions}, +use crate::construct::partial_space_or_tab::{ + space_or_tab_with_options, Options as SpaceOrTabOptions, }; use crate::token::Token; -use crate::tokenizer::{State, Tokenizer}; +use crate::tokenizer::{State, StateName, Tokenizer}; use crate::util::slice::Slice; /// Symbol for `<script>` (condition 1). 
@@ -134,8 +132,9 @@ const COMPLETE: u8 = 7; pub fn start(tokenizer: &mut Tokenizer) -> State { if tokenizer.parse_state.constructs.html_flow { tokenizer.enter(Token::HtmlFlow); - tokenizer.go( - space_or_tab_with_options(SpaceOrTabOptions { + let state_name = space_or_tab_with_options( + tokenizer, + SpaceOrTabOptions { kind: Token::HtmlFlowData, min: 0, max: if tokenizer.parse_state.constructs.code_indented { @@ -145,9 +144,10 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { }, connect: false, content_type: None, - }), - before, - )(tokenizer) + }, + ); + + tokenizer.go(state_name, StateName::HtmlFlowBefore) } else { State::Nok } @@ -159,11 +159,11 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { /// > | <x /> /// ^ /// ``` -fn before(tokenizer: &mut Tokenizer) -> State { +pub fn before(tokenizer: &mut Tokenizer) -> State { if Some(b'<') == tokenizer.current { tokenizer.enter(Token::HtmlFlowData); tokenizer.consume(); - State::Fn(Box::new(open)) + State::Fn(StateName::HtmlFlowOpen) } else { State::Nok } @@ -179,17 +179,17 @@ fn before(tokenizer: &mut Tokenizer) -> State { /// > | <!--xxx--> /// ^ /// ``` -fn open(tokenizer: &mut Tokenizer) -> State { +pub fn open(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { Some(b'!') => { tokenizer.consume(); - State::Fn(Box::new(declaration_open)) + State::Fn(StateName::HtmlFlowDeclarationOpen) } Some(b'/') => { tokenizer.consume(); tokenizer.tokenize_state.seen = true; tokenizer.tokenize_state.start = tokenizer.point.index; - State::Fn(Box::new(tag_close_start)) + State::Fn(StateName::HtmlFlowTagCloseStart) } Some(b'?') => { tokenizer.tokenize_state.marker = INSTRUCTION; @@ -198,7 +198,7 @@ fn open(tokenizer: &mut Tokenizer) -> State { tokenizer.concrete = true; // While we’re in an instruction instead of a declaration, we’re on a `?` // right now, so we do need to search for `>`, similar to declarations. - State::Fn(Box::new(continuation_declaration_inside)) + State::Fn(StateName::HtmlFlowContinuationDeclarationInside) } // ASCII alphabetical. Some(b'A'..=b'Z' | b'a'..=b'z') => { @@ -219,24 +219,24 @@ fn open(tokenizer: &mut Tokenizer) -> State { /// > | <![CDATA[>&<]]> /// ^ /// ``` -fn declaration_open(tokenizer: &mut Tokenizer) -> State { +pub fn declaration_open(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { Some(b'-') => { tokenizer.consume(); tokenizer.tokenize_state.marker = COMMENT; - State::Fn(Box::new(comment_open_inside)) + State::Fn(StateName::HtmlFlowCommentOpenInside) } Some(b'A'..=b'Z' | b'a'..=b'z') => { tokenizer.consume(); tokenizer.tokenize_state.marker = DECLARATION; // Do not form containers. tokenizer.concrete = true; - State::Fn(Box::new(continuation_declaration_inside)) + State::Fn(StateName::HtmlFlowContinuationDeclarationInside) } Some(b'[') => { tokenizer.consume(); tokenizer.tokenize_state.marker = CDATA; - State::Fn(Box::new(cdata_open_inside)) + State::Fn(StateName::HtmlFlowCdataOpenInside) } _ => State::Nok, } @@ -248,12 +248,12 @@ fn declaration_open(tokenizer: &mut Tokenizer) -> State { /// > | <!--xxx--> /// ^ /// ``` -fn comment_open_inside(tokenizer: &mut Tokenizer) -> State { +pub fn comment_open_inside(tokenizer: &mut Tokenizer) -> State { if let Some(b'-') = tokenizer.current { tokenizer.consume(); // Do not form containers. 
tokenizer.concrete = true; - State::Fn(Box::new(continuation_declaration_inside)) + State::Fn(StateName::HtmlFlowContinuationDeclarationInside) } else { tokenizer.tokenize_state.marker = 0; State::Nok @@ -266,7 +266,7 @@ fn comment_open_inside(tokenizer: &mut Tokenizer) -> State { /// > | <![CDATA[>&<]]> /// ^^^^^^ /// ``` -fn cdata_open_inside(tokenizer: &mut Tokenizer) -> State { +pub fn cdata_open_inside(tokenizer: &mut Tokenizer) -> State { if tokenizer.current == Some(HTML_CDATA_PREFIX[tokenizer.tokenize_state.size]) { tokenizer.tokenize_state.size += 1; tokenizer.consume(); @@ -275,9 +275,9 @@ fn cdata_open_inside(tokenizer: &mut Tokenizer) -> State { tokenizer.tokenize_state.size = 0; // Do not form containers. tokenizer.concrete = true; - State::Fn(Box::new(continuation)) + State::Fn(StateName::HtmlFlowContinuation) } else { - State::Fn(Box::new(cdata_open_inside)) + State::Fn(StateName::HtmlFlowCdataOpenInside) } } else { tokenizer.tokenize_state.marker = 0; @@ -292,10 +292,10 @@ fn cdata_open_inside(tokenizer: &mut Tokenizer) -> State { /// > | </x> /// ^ /// ``` -fn tag_close_start(tokenizer: &mut Tokenizer) -> State { +pub fn tag_close_start(tokenizer: &mut Tokenizer) -> State { if let Some(b'A'..=b'Z' | b'a'..=b'z') = tokenizer.current { tokenizer.consume(); - State::Fn(Box::new(tag_name)) + State::Fn(StateName::HtmlFlowTagName) } else { tokenizer.tokenize_state.seen = false; tokenizer.tokenize_state.start = 0; @@ -311,7 +311,7 @@ fn tag_close_start(tokenizer: &mut Tokenizer) -> State { /// > | </ab> /// ^^ /// ``` -fn tag_name(tokenizer: &mut Tokenizer) -> State { +pub fn tag_name(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { None | Some(b'\t' | b'\n' | b' ' | b'/' | b'>') => { let closing_tag = tokenizer.tokenize_state.seen; @@ -340,7 +340,7 @@ fn tag_name(tokenizer: &mut Tokenizer) -> State { if slash { tokenizer.consume(); - State::Fn(Box::new(basic_self_closing)) + State::Fn(StateName::HtmlFlowBasicSelfClosing) } else { // Do not form containers. tokenizer.concrete = true; @@ -363,7 +363,7 @@ fn tag_name(tokenizer: &mut Tokenizer) -> State { // ASCII alphanumerical and `-`. Some(b'-' | b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z') => { tokenizer.consume(); - State::Fn(Box::new(tag_name)) + State::Fn(StateName::HtmlFlowTagName) } Some(_) => { tokenizer.tokenize_state.seen = false; @@ -378,12 +378,12 @@ fn tag_name(tokenizer: &mut Tokenizer) -> State { /// > | <div/> /// ^ /// ``` -fn basic_self_closing(tokenizer: &mut Tokenizer) -> State { +pub fn basic_self_closing(tokenizer: &mut Tokenizer) -> State { if let Some(b'>') = tokenizer.current { tokenizer.consume(); // Do not form containers. 
         tokenizer.concrete = true;
-        State::Fn(Box::new(continuation))
+        State::Fn(StateName::HtmlFlowContinuation)
     } else {
         tokenizer.tokenize_state.marker = 0;
         State::Nok
@@ -396,11 +396,11 @@ fn basic_self_closing(tokenizer: &mut Tokenizer) -> State {
 /// > | <x/>
 ///         ^
 /// ```
-fn complete_closing_tag_after(tokenizer: &mut Tokenizer) -> State {
+pub fn complete_closing_tag_after(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
         Some(b'\t' | b' ') => {
             tokenizer.consume();
-            State::Fn(Box::new(complete_closing_tag_after))
+            State::Fn(StateName::HtmlFlowCompleteClosingTagAfter)
         }
         _ => complete_end(tokenizer),
     }
@@ -425,20 +425,20 @@ fn complete_closing_tag_after(tokenizer: &mut Tokenizer) -> State {
 /// > | <a >
 ///        ^
 /// ```
-fn complete_attribute_name_before(tokenizer: &mut Tokenizer) -> State {
+pub fn complete_attribute_name_before(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
         Some(b'\t' | b' ') => {
             tokenizer.consume();
-            State::Fn(Box::new(complete_attribute_name_before))
+            State::Fn(StateName::HtmlFlowCompleteAttributeNameBefore)
         }
         Some(b'/') => {
             tokenizer.consume();
-            State::Fn(Box::new(complete_end))
+            State::Fn(StateName::HtmlFlowCompleteEnd)
         }
         // ASCII alphanumerical and `:` and `_`.
         Some(b'0'..=b'9' | b':' | b'A'..=b'Z' | b'_' | b'a'..=b'z') => {
             tokenizer.consume();
-            State::Fn(Box::new(complete_attribute_name))
+            State::Fn(StateName::HtmlFlowCompleteAttributeName)
         }
         _ => complete_end(tokenizer),
     }
@@ -454,12 +454,12 @@ fn complete_attribute_name_before(tokenizer: &mut Tokenizer) -> State {
 /// > | <a b>
 ///        ^
 /// ```
-fn complete_attribute_name(tokenizer: &mut Tokenizer) -> State {
+pub fn complete_attribute_name(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
         // ASCII alphanumerical and `-`, `.`, `:`, and `_`.
         Some(b'-' | b'.' | b'0'..=b'9' | b':' | b'A'..=b'Z' | b'_' | b'a'..=b'z') => {
             tokenizer.consume();
-            State::Fn(Box::new(complete_attribute_name))
+            State::Fn(StateName::HtmlFlowCompleteAttributeName)
         }
         _ => complete_attribute_name_after(tokenizer),
     }
@@ -474,15 +474,15 @@ fn complete_attribute_name(tokenizer: &mut Tokenizer) -> State {
 /// > | <a b=c>
 ///         ^
 /// ```
-fn complete_attribute_name_after(tokenizer: &mut Tokenizer) -> State {
+pub fn complete_attribute_name_after(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
         Some(b'\t' | b' ') => {
             tokenizer.consume();
-            State::Fn(Box::new(complete_attribute_name_after))
+            State::Fn(StateName::HtmlFlowCompleteAttributeNameAfter)
         }
         Some(b'=') => {
             tokenizer.consume();
-            State::Fn(Box::new(complete_attribute_value_before))
+            State::Fn(StateName::HtmlFlowCompleteAttributeValueBefore)
         }
         _ => complete_attribute_name_before(tokenizer),
     }
@@ -497,7 +497,7 @@ fn complete_attribute_name_after(tokenizer: &mut Tokenizer) -> State {
 /// > | <a b="c">
 ///          ^
 /// ```
-fn complete_attribute_value_before(tokenizer: &mut Tokenizer) -> State {
+pub fn complete_attribute_value_before(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
         None | Some(b'<' | b'=' | b'>' | b'`') => {
             tokenizer.tokenize_state.marker = 0;
@@ -505,12 +505,12 @@ fn complete_attribute_value_before(tokenizer: &mut Tokenizer) -> State {
         Some(b'\t' | b' ') => {
             tokenizer.consume();
-            State::Fn(Box::new(complete_attribute_value_before))
+            State::Fn(StateName::HtmlFlowCompleteAttributeValueBefore)
         }
         Some(b'"' | b'\'') => {
             tokenizer.tokenize_state.marker_other = tokenizer.current.unwrap();
             tokenizer.consume();
-            State::Fn(Box::new(complete_attribute_value_quoted))
+            State::Fn(StateName::HtmlFlowCompleteAttributeValueQuoted)
         }
         _ => complete_attribute_value_unquoted(tokenizer),
     }
@@ -524,7 +524,7 @@ fn complete_attribute_value_before(tokenizer: &mut Tokenizer) -> State {
 /// > | <a b='c'>
 ///           ^
 /// ```
-fn complete_attribute_value_quoted(tokenizer: &mut Tokenizer) -> State {
+pub fn complete_attribute_value_quoted(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
         None | Some(b'\n') => {
             tokenizer.tokenize_state.marker = 0;
@@ -536,11 +536,11 @@ fn complete_attribute_value_quoted(tokenizer: &mut Tokenizer) -> State {
         {
             tokenizer.tokenize_state.marker_other = 0;
             tokenizer.consume();
-            State::Fn(Box::new(complete_attribute_value_quoted_after))
+            State::Fn(StateName::HtmlFlowCompleteAttributeValueQuotedAfter)
         }
         _ => {
             tokenizer.consume();
-            State::Fn(Box::new(complete_attribute_value_quoted))
+            State::Fn(StateName::HtmlFlowCompleteAttributeValueQuoted)
         }
     }
 }
@@ -551,14 +551,14 @@ fn complete_attribute_value_quoted(tokenizer: &mut Tokenizer) -> State {
 /// > | <a b=c>
 ///          ^
 /// ```
-fn complete_attribute_value_unquoted(tokenizer: &mut Tokenizer) -> State {
+pub fn complete_attribute_value_unquoted(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
         None | Some(b'\t' | b'\n' | b' ' | b'"' | b'\'' | b'/' | b'<' | b'=' | b'>' | b'`') => {
             complete_attribute_name_after(tokenizer)
         }
         Some(_) => {
             tokenizer.consume();
-            State::Fn(Box::new(complete_attribute_value_unquoted))
+            State::Fn(StateName::HtmlFlowCompleteAttributeValueUnquoted)
         }
     }
 }
@@ -570,7 +570,7 @@ fn complete_attribute_value_unquoted(tokenizer: &mut Tokenizer) -> State {
 /// > | <a b="c">
 ///             ^
 /// ```
-fn complete_attribute_value_quoted_after(tokenizer: &mut Tokenizer) -> State {
+pub fn complete_attribute_value_quoted_after(tokenizer: &mut Tokenizer) -> State {
     if let Some(b'\t' | b' ' | b'/' | b'>') = tokenizer.current {
         complete_attribute_name_before(tokenizer)
     } else {
@@ -585,10 +585,10 @@ fn complete_attribute_value_quoted_after(tokenizer: &mut Tokenizer) -> State {
 /// > | <a b="c">
 ///             ^
 /// ```
-fn complete_end(tokenizer: &mut Tokenizer) -> State {
+pub fn complete_end(tokenizer: &mut Tokenizer) -> State {
     if let Some(b'>') = tokenizer.current {
         tokenizer.consume();
-        State::Fn(Box::new(complete_after))
+        State::Fn(StateName::HtmlFlowCompleteAfter)
     } else {
         tokenizer.tokenize_state.marker = 0;
         State::Nok
@@ -601,7 +601,7 @@ fn complete_end(tokenizer: &mut Tokenizer) -> State {
 /// > | <x>
 ///        ^
 /// ```
-fn complete_after(tokenizer: &mut Tokenizer) -> State {
+pub fn complete_after(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
         None | Some(b'\n') => {
             // Do not form containers.
@@ -610,7 +610,7 @@ fn complete_after(tokenizer: &mut Tokenizer) -> State {
         }
         Some(b'\t' | b' ') => {
             tokenizer.consume();
-            State::Fn(Box::new(complete_after))
+            State::Fn(StateName::HtmlFlowCompleteAfter)
         }
         Some(_) => {
             tokenizer.tokenize_state.marker = 0;
@@ -625,20 +625,20 @@ fn complete_after(tokenizer: &mut Tokenizer) -> State {
 /// > | <!--xxx-->
 ///     ^
 /// ```
-fn continuation(tokenizer: &mut Tokenizer) -> State {
+pub fn continuation(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
         Some(b'\n')
             if tokenizer.tokenize_state.marker == BASIC
                 || tokenizer.tokenize_state.marker == COMPLETE =>
         {
             tokenizer.exit(Token::HtmlFlowData);
-            tokenizer.check(blank_line_before, |ok| {
-                Box::new(if ok {
-                    continuation_after
+            tokenizer.check(StateName::HtmlFlowBlankLineBefore, |ok| {
+                State::Fn(if ok {
+                    StateName::HtmlFlowContinuationAfter
                 } else {
-                    continuation_start
+                    StateName::HtmlFlowContinuationStart
                 })
-            })(tokenizer)
+            })
         }
         // Note: important that this is after the basic/complete case.
         None | Some(b'\n') => {
@@ -647,27 +647,27 @@ fn continuation(tokenizer: &mut Tokenizer) -> State {
         }
         Some(b'-') if tokenizer.tokenize_state.marker == COMMENT => {
             tokenizer.consume();
-            State::Fn(Box::new(continuation_comment_inside))
+            State::Fn(StateName::HtmlFlowContinuationCommentInside)
         }
         Some(b'<') if tokenizer.tokenize_state.marker == RAW => {
             tokenizer.consume();
-            State::Fn(Box::new(continuation_raw_tag_open))
+            State::Fn(StateName::HtmlFlowContinuationRawTagOpen)
         }
         Some(b'>') if tokenizer.tokenize_state.marker == DECLARATION => {
             tokenizer.consume();
-            State::Fn(Box::new(continuation_close))
+            State::Fn(StateName::HtmlFlowContinuationClose)
         }
         Some(b'?') if tokenizer.tokenize_state.marker == INSTRUCTION => {
             tokenizer.consume();
-            State::Fn(Box::new(continuation_declaration_inside))
+            State::Fn(StateName::HtmlFlowContinuationDeclarationInside)
         }
         Some(b']') if tokenizer.tokenize_state.marker == CDATA => {
             tokenizer.consume();
-            State::Fn(Box::new(continuation_character_data_inside))
+            State::Fn(StateName::HtmlFlowContinuationCdataInside)
         }
         _ => {
             tokenizer.consume();
-            State::Fn(Box::new(continuation))
+            State::Fn(StateName::HtmlFlowContinuation)
         }
     }
 }
@@ -679,14 +679,14 @@ fn continuation(tokenizer: &mut Tokenizer) -> State {
 ///              ^
 ///   | asd
 /// ```
-fn continuation_start(tokenizer: &mut Tokenizer) -> State {
-    tokenizer.check(partial_non_lazy_continuation, |ok| {
-        Box::new(if ok {
-            continuation_start_non_lazy
+pub fn continuation_start(tokenizer: &mut Tokenizer) -> State {
+    tokenizer.check(StateName::NonLazyContinuationStart, |ok| {
+        State::Fn(if ok {
+            StateName::HtmlFlowContinuationStartNonLazy
         } else {
-            continuation_after
+            StateName::HtmlFlowContinuationAfter
         })
-    })(tokenizer)
+    })
 }

 /// In continuation, at an eol, before non-lazy content.
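The hunk above is the core of this commit: `check` (and `attempt`) now take a `StateName` plus a `done` closure that maps the boolean result onto the *name* of the next state, and they return a `State` directly instead of a boxed state function the caller had to invoke. A minimal sketch of that shape, with deliberately simplified signatures; the real tokenizer drives the check incrementally rather than through the synchronous `run_check` assumed here:

```rust
/// Sketch only: `run_check` is a stand-in, not the crate's API.
#[derive(Clone, Copy)]
pub enum StateName {
    HtmlFlowContinuationStartNonLazy,
    HtmlFlowContinuationAfter,
    NonLazyContinuationStart,
}

pub enum State {
    Ok,
    Nok,
    Fn(StateName),
}

pub struct Tokenizer;

impl Tokenizer {
    /// Run the named sub-state machine as a check, then let `done` pick
    /// the next state; no `Box<dyn Fn>` is allocated anywhere.
    pub fn check(&mut self, name: StateName, done: impl FnOnce(bool) -> State) -> State {
        let ok = self.run_check(name); // assumed synchronous driver
        done(ok)
    }

    fn run_check(&mut self, _name: StateName) -> bool {
        true // placeholder result
    }
}

pub fn continuation_start(tokenizer: &mut Tokenizer) -> State {
    tokenizer.check(StateName::NonLazyContinuationStart, |ok| {
        State::Fn(if ok {
            StateName::HtmlFlowContinuationStartNonLazy
        } else {
            StateName::HtmlFlowContinuationAfter
        })
    })
}
```

Because `StateName` is a small `Copy` enum, storing or returning a state is now a plain value move, where the old code allocated a `Box<StateFn>` on every transition.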
@@ -696,13 +696,13 @@ fn continuation_start(tokenizer: &mut Tokenizer) -> State {
 ///              ^
 ///   | asd
 /// ```
-fn continuation_start_non_lazy(tokenizer: &mut Tokenizer) -> State {
+pub fn continuation_start_non_lazy(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
         Some(b'\n') => {
             tokenizer.enter(Token::LineEnding);
             tokenizer.consume();
             tokenizer.exit(Token::LineEnding);
-            State::Fn(Box::new(continuation_before))
+            State::Fn(StateName::HtmlFlowContinuationBefore)
         }
         _ => unreachable!("expected eol"),
     }
@@ -715,7 +715,7 @@ fn continuation_start_non_lazy(tokenizer: &mut Tokenizer) -> State {
 /// > | asd
 ///     ^
 /// ```
-fn continuation_before(tokenizer: &mut Tokenizer) -> State {
+pub fn continuation_before(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
         None | Some(b'\n') => continuation_start(tokenizer),
         _ => {
@@ -731,11 +731,11 @@ fn continuation_before(tokenizer: &mut Tokenizer) -> State {
 /// > | <!--xxx-->
 ///         ^
 /// ```
-fn continuation_comment_inside(tokenizer: &mut Tokenizer) -> State {
+pub fn continuation_comment_inside(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
         Some(b'-') => {
             tokenizer.consume();
-            State::Fn(Box::new(continuation_declaration_inside))
+            State::Fn(StateName::HtmlFlowContinuationDeclarationInside)
         }
         _ => continuation(tokenizer),
     }
@@ -747,12 +747,12 @@ fn continuation_comment_inside(tokenizer: &mut Tokenizer) -> State {
 /// > | <script>console.log(1)</script>
 ///                          ^
 /// ```
-fn continuation_raw_tag_open(tokenizer: &mut Tokenizer) -> State {
+pub fn continuation_raw_tag_open(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
         Some(b'/') => {
             tokenizer.consume();
             tokenizer.tokenize_state.start = tokenizer.point.index;
-            State::Fn(Box::new(continuation_raw_end_tag))
+            State::Fn(StateName::HtmlFlowContinuationRawEndTag)
         }
         _ => continuation(tokenizer),
     }
@@ -764,7 +764,7 @@ fn continuation_raw_tag_open(tokenizer: &mut Tokenizer) -> State {
 /// > | <script>console.log(1)</script>
 ///                           ^^^^^^
 /// ```
-fn continuation_raw_end_tag(tokenizer: &mut Tokenizer) -> State {
+pub fn continuation_raw_end_tag(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
         Some(b'>') => {
             // Guaranteed to be valid ASCII bytes.
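One detail in `continuation_raw_tag_open` above: the candidate end-tag name is never buffered. The tokenizer records the byte index where the name starts (`tokenize_state.start = point.index`), and the next hunk slices `start..index` back out of the input to test it against the raw-tag list. A standalone sketch of that windowing trick; the constants restate the crate's `HTML_RAW_NAMES`/`HTML_RAW_SIZE_MAX`, and the helper itself is hypothetical:

```rust
// Raw elements per CommonMark; `textarea` (8 bytes) bounds the window.
const HTML_RAW_NAMES: [&str; 4] = ["pre", "script", "style", "textarea"];
const HTML_RAW_SIZE_MAX: usize = 8;

/// Hypothetical helper: is `bytes[start..index]` a known raw tag name?
fn is_raw_end_tag(bytes: &[u8], start: usize, index: usize) -> bool {
    index - start <= HTML_RAW_SIZE_MAX
        && std::str::from_utf8(&bytes[start..index])
            .map(|name| HTML_RAW_NAMES.contains(&name.to_ascii_lowercase().as_str()))
            .unwrap_or(false)
}

fn main() {
    let input = b"console.log(1)</script>";
    // After `</` is consumed, `start` points at `s`; `index` stops at `>`.
    assert!(is_raw_end_tag(input, 16, 22));
}
```

The size guard is also why the scan below can give up early: once the window exceeds the longest known name, no suffix can match.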
@@ -779,7 +779,7 @@ fn continuation_raw_end_tag(tokenizer: &mut Tokenizer) -> State {
             if HTML_RAW_NAMES.contains(&name.as_str()) {
                 tokenizer.consume();
-                State::Fn(Box::new(continuation_close))
+                State::Fn(StateName::HtmlFlowContinuationClose)
             } else {
                 continuation(tokenizer)
             }
@@ -788,7 +788,7 @@ fn continuation_raw_end_tag(tokenizer: &mut Tokenizer) -> State {
             if tokenizer.point.index - tokenizer.tokenize_state.start < HTML_RAW_SIZE_MAX =>
         {
             tokenizer.consume();
-            State::Fn(Box::new(continuation_raw_end_tag))
+            State::Fn(StateName::HtmlFlowContinuationRawEndTag)
         }
         _ => {
             tokenizer.tokenize_state.start = 0;
@@ -803,11 +803,11 @@ fn continuation_raw_end_tag(tokenizer: &mut Tokenizer) -> State {
 /// > | <![CDATA[>&<]]>
 ///              ^
 /// ```
-fn continuation_character_data_inside(tokenizer: &mut Tokenizer) -> State {
+pub fn continuation_cdata_inside(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
         Some(b']') => {
             tokenizer.consume();
-            State::Fn(Box::new(continuation_declaration_inside))
+            State::Fn(StateName::HtmlFlowContinuationDeclarationInside)
         }
         _ => continuation(tokenizer),
     }
@@ -827,15 +827,15 @@ fn continuation_character_data_inside(tokenizer: &mut Tokenizer) -> State {
 /// > | <![CDATA[>&<]]>
 ///                  ^
 /// ```
-fn continuation_declaration_inside(tokenizer: &mut Tokenizer) -> State {
+pub fn continuation_declaration_inside(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
         Some(b'>') => {
             tokenizer.consume();
-            State::Fn(Box::new(continuation_close))
+            State::Fn(StateName::HtmlFlowContinuationClose)
         }
         Some(b'-') if tokenizer.tokenize_state.marker == COMMENT => {
             tokenizer.consume();
-            State::Fn(Box::new(continuation_declaration_inside))
+            State::Fn(StateName::HtmlFlowContinuationDeclarationInside)
         }
         _ => continuation(tokenizer),
     }
@@ -847,7 +847,7 @@ fn continuation_declaration_inside(tokenizer: &mut Tokenizer) -> State {
 /// > | <!doctype>
 ///              ^
 /// ```
-fn continuation_close(tokenizer: &mut Tokenizer) -> State {
+pub fn continuation_close(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
         None | Some(b'\n') => {
             tokenizer.exit(Token::HtmlFlowData);
@@ -855,7 +855,7 @@ fn continuation_close(tokenizer: &mut Tokenizer) -> State {
         }
         _ => {
             tokenizer.consume();
-            State::Fn(Box::new(continuation_close))
+            State::Fn(StateName::HtmlFlowContinuationClose)
         }
     }
 }
@@ -866,7 +866,7 @@ fn continuation_close(tokenizer: &mut Tokenizer) -> State {
 /// > | <!doctype>
 ///               ^
 /// ```
-fn continuation_after(tokenizer: &mut Tokenizer) -> State {
+pub fn continuation_after(tokenizer: &mut Tokenizer) -> State {
     tokenizer.exit(Token::HtmlFlow);
     tokenizer.tokenize_state.marker = 0;
     // Feel free to interrupt.
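Also visible across these hunks: every exit path, `State::Ok` and `State::Nok` alike, zeroes the `tokenize_state` fields it used (`marker`, `start`, `size`). That is because `tokenize_state` is one scratch space shared by every construct, so stale values would leak into whichever construct runs next. A reduced sketch of that contract; the real struct has many more fields, and the `reset` helper is only for illustration (the diff resets fields inline):

```rust
/// Scratch space shared by constructs, cut down to the fields the
/// `html_flow` hunks above touch.
#[derive(Default)]
pub struct TokenizeState {
    /// Which HTML-flow kind is active (basic, complete, comment, raw, …).
    pub marker: u8,
    /// Byte index where a potential raw end-tag name starts.
    pub start: usize,
    /// Progress through multi-byte prefixes such as `<![CDATA[`.
    pub size: usize,
}

impl TokenizeState {
    /// What each exit path does by hand, field by field.
    pub fn reset(&mut self) {
        self.marker = 0;
        self.start = 0;
        self.size = 0;
    }
}
```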
@@ -883,9 +883,9 @@ fn continuation_after(tokenizer: &mut Tokenizer) -> State {
 ///        ^
 ///   |
 /// ```
-fn blank_line_before(tokenizer: &mut Tokenizer) -> State {
+pub fn blank_line_before(tokenizer: &mut Tokenizer) -> State {
     tokenizer.enter(Token::LineEnding);
     tokenizer.consume();
     tokenizer.exit(Token::LineEnding);
-    State::Fn(Box::new(blank_line))
+    State::Fn(StateName::BlankLineStart)
 }
diff --git a/src/construct/html_text.rs b/src/construct/html_text.rs
index a4c0349..1c1f9e6 100644
--- a/src/construct/html_text.rs
+++ b/src/construct/html_text.rs
@@ -57,7 +57,7 @@
 use crate::constant::HTML_CDATA_PREFIX;
 use crate::construct::partial_space_or_tab::space_or_tab;
 use crate::token::Token;
-use crate::tokenizer::{State, Tokenizer};
+use crate::tokenizer::{State, StateName, Tokenizer};

 /// Start of HTML (text)
 ///
@@ -70,7 +70,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
         tokenizer.enter(Token::HtmlText);
         tokenizer.enter(Token::HtmlTextData);
         tokenizer.consume();
-        State::Fn(Box::new(open))
+        State::Fn(StateName::HtmlTextOpen)
     } else {
         State::Nok
     }
@@ -86,24 +86,24 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
 /// > | a <!--b--> c
 ///        ^
 /// ```
-fn open(tokenizer: &mut Tokenizer) -> State {
+pub fn open(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
         Some(b'!') => {
             tokenizer.consume();
-            State::Fn(Box::new(declaration_open))
+            State::Fn(StateName::HtmlTextDeclarationOpen)
         }
         Some(b'/') => {
             tokenizer.consume();
-            State::Fn(Box::new(tag_close_start))
+            State::Fn(StateName::HtmlTextTagCloseStart)
         }
         Some(b'?') => {
             tokenizer.consume();
-            State::Fn(Box::new(instruction))
+            State::Fn(StateName::HtmlTextInstruction)
         }
         // ASCII alphabetical.
         Some(b'A'..=b'Z' | b'a'..=b'z') => {
             tokenizer.consume();
-            State::Fn(Box::new(tag_open))
+            State::Fn(StateName::HtmlTextTagOpen)
         }
         _ => State::Nok,
     }
@@ -119,20 +119,20 @@ fn open(tokenizer: &mut Tokenizer) -> State {
 /// > | a <![CDATA[>&<]]> c
 ///        ^
 /// ```
-fn declaration_open(tokenizer: &mut Tokenizer) -> State {
+pub fn declaration_open(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
         Some(b'-') => {
             tokenizer.consume();
-            State::Fn(Box::new(comment_open_inside))
+            State::Fn(StateName::HtmlTextCommentOpenInside)
         }
         // ASCII alphabetical.
         Some(b'A'..=b'Z' | b'a'..=b'z') => {
             tokenizer.consume();
-            State::Fn(Box::new(declaration))
+            State::Fn(StateName::HtmlTextDeclaration)
         }
         Some(b'[') => {
             tokenizer.consume();
-            State::Fn(Box::new(cdata_open_inside))
+            State::Fn(StateName::HtmlTextCdataOpenInside)
         }
         _ => State::Nok,
     }
@@ -144,11 +144,11 @@ fn declaration_open(tokenizer: &mut Tokenizer) -> State {
 /// > | a <!--b--> c
 ///          ^
 /// ```
-fn comment_open_inside(tokenizer: &mut Tokenizer) -> State {
+pub fn comment_open_inside(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
         Some(b'-') => {
             tokenizer.consume();
-            State::Fn(Box::new(comment_start))
+            State::Fn(StateName::HtmlTextCommentStart)
         }
         _ => State::Nok,
     }
@@ -167,12 +167,12 @@ fn comment_open_inside(tokenizer: &mut Tokenizer) -> State {
 /// ```
 ///
 /// [html_flow]: crate::construct::html_flow
-fn comment_start(tokenizer: &mut Tokenizer) -> State {
+pub fn comment_start(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
         Some(b'>') => State::Nok,
         Some(b'-') => {
             tokenizer.consume();
-            State::Fn(Box::new(comment_start_dash))
+            State::Fn(StateName::HtmlTextCommentStartDash)
         }
         _ => comment(tokenizer),
     }
@@ -191,7 +191,7 @@ fn comment_start(tokenizer: &mut Tokenizer) -> State {
 /// ```
 ///
 /// [html_flow]: crate::construct::html_flow
-fn comment_start_dash(tokenizer: &mut Tokenizer) -> State {
+pub fn comment_start_dash(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
         Some(b'>') => State::Nok,
         _ => comment(tokenizer),
@@ -204,20 +204,20 @@ fn comment_start_dash(tokenizer: &mut Tokenizer) -> State {
 /// > | a <!--b--> c
 ///           ^
 /// ```
-fn comment(tokenizer: &mut Tokenizer) -> State {
+pub fn comment(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
         None => State::Nok,
         Some(b'\n') => {
-            tokenizer.tokenize_state.return_state = Some(Box::new(comment));
-            at_line_ending(tokenizer)
+            tokenizer.tokenize_state.return_state = Some(StateName::HtmlTextComment);
+            line_ending_before(tokenizer)
         }
         Some(b'-') => {
             tokenizer.consume();
-            State::Fn(Box::new(comment_close))
+            State::Fn(StateName::HtmlTextCommentClose)
         }
         _ => {
             tokenizer.consume();
-            State::Fn(Box::new(comment))
+            State::Fn(StateName::HtmlTextComment)
         }
     }
 }
@@ -228,11 +228,11 @@ fn comment(tokenizer: &mut Tokenizer) -> State {
 /// > | a <!--b--> c
 ///             ^
 /// ```
-fn comment_close(tokenizer: &mut Tokenizer) -> State {
+pub fn comment_close(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
         Some(b'-') => {
             tokenizer.consume();
-            State::Fn(Box::new(end))
+            State::Fn(StateName::HtmlTextEnd)
         }
         _ => comment(tokenizer),
     }
@@ -244,16 +244,16 @@ fn comment_close(tokenizer: &mut Tokenizer) -> State {
 /// > | a <![CDATA[>&<]]> b
 ///        ^^^^^^
 /// ```
-fn cdata_open_inside(tokenizer: &mut Tokenizer) -> State {
+pub fn cdata_open_inside(tokenizer: &mut Tokenizer) -> State {
     if tokenizer.current == Some(HTML_CDATA_PREFIX[tokenizer.tokenize_state.size]) {
         tokenizer.tokenize_state.size += 1;
         tokenizer.consume();

         if tokenizer.tokenize_state.size == HTML_CDATA_PREFIX.len() {
             tokenizer.tokenize_state.size = 0;
-            State::Fn(Box::new(cdata))
+            State::Fn(StateName::HtmlTextCdata)
         } else {
-            State::Fn(Box::new(cdata_open_inside))
+            State::Fn(StateName::HtmlTextCdataOpenInside)
         }
     } else {
         State::Nok
@@ -266,20 +266,20 @@ fn cdata_open_inside(tokenizer: &mut Tokenizer) -> State {
 /// > | a <![CDATA[>&<]]> b
 ///                ^^^
 /// ```
-fn cdata(tokenizer: &mut Tokenizer) -> State {
+pub fn cdata(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
         None => State::Nok,
         Some(b'\n') => {
-            tokenizer.tokenize_state.return_state = Some(Box::new(cdata));
-            at_line_ending(tokenizer)
+            tokenizer.tokenize_state.return_state = Some(StateName::HtmlTextCdata);
+            line_ending_before(tokenizer)
         }
         Some(b']') => {
             tokenizer.consume();
-            State::Fn(Box::new(cdata_close))
+            State::Fn(StateName::HtmlTextCdataClose)
         }
         _ => {
             tokenizer.consume();
-            State::Fn(Box::new(cdata))
+            State::Fn(StateName::HtmlTextCdata)
         }
     }
 }
@@ -290,11 +290,11 @@ fn cdata(tokenizer: &mut Tokenizer) -> State {
 /// > | a <![CDATA[>&<]]> b
 ///                   ^
 /// ```
-fn cdata_close(tokenizer: &mut Tokenizer) -> State {
+pub fn cdata_close(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
         Some(b']') => {
             tokenizer.consume();
-            State::Fn(Box::new(cdata_end))
+            State::Fn(StateName::HtmlTextCdataEnd)
         }
         _ => cdata(tokenizer),
     }
@@ -306,7 +306,7 @@ fn cdata_close(tokenizer: &mut Tokenizer) -> State {
 /// > | a <![CDATA[>&<]]> b
 ///                    ^
 /// ```
-fn cdata_end(tokenizer: &mut Tokenizer) -> State {
+pub fn cdata_end(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
         Some(b'>') => end(tokenizer),
         Some(b']') => cdata_close(tokenizer),
@@ -320,16 +320,16 @@ fn cdata_end(tokenizer: &mut Tokenizer) -> State {
 /// > | a <!b> c
 ///          ^
 /// ```
-fn declaration(tokenizer: &mut Tokenizer) -> State {
+pub fn declaration(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
         None | Some(b'>') => end(tokenizer),
         Some(b'\n') => {
-            tokenizer.tokenize_state.return_state = Some(Box::new(declaration));
-            at_line_ending(tokenizer)
+            tokenizer.tokenize_state.return_state = Some(StateName::HtmlTextDeclaration);
+            line_ending_before(tokenizer)
         }
         _ => {
             tokenizer.consume();
-            State::Fn(Box::new(declaration))
+            State::Fn(StateName::HtmlTextDeclaration)
         }
     }
 }
@@ -340,20 +340,20 @@ fn declaration(tokenizer: &mut Tokenizer) -> State {
 /// > | a <?b?> c
 ///         ^
 /// ```
-fn instruction(tokenizer: &mut Tokenizer) -> State {
+pub fn instruction(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
         None => State::Nok,
         Some(b'\n') => {
-            tokenizer.tokenize_state.return_state = Some(Box::new(instruction));
-            at_line_ending(tokenizer)
+            tokenizer.tokenize_state.return_state = Some(StateName::HtmlTextInstruction);
+            line_ending_before(tokenizer)
         }
         Some(b'?') => {
             tokenizer.consume();
-            State::Fn(Box::new(instruction_close))
+            State::Fn(StateName::HtmlTextInstructionClose)
         }
         _ => {
             tokenizer.consume();
-            State::Fn(Box::new(instruction))
+            State::Fn(StateName::HtmlTextInstruction)
         }
     }
 }
@@ -364,7 +364,7 @@ fn instruction(tokenizer: &mut Tokenizer) -> State {
 /// > | a <?b?> c
 ///           ^
 /// ```
-fn instruction_close(tokenizer: &mut Tokenizer) -> State {
+pub fn instruction_close(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
         Some(b'>') => end(tokenizer),
         _ => instruction(tokenizer),
@@ -377,12 +377,12 @@ fn instruction_close(tokenizer: &mut Tokenizer) -> State {
 /// > | a </b> c
 ///         ^
 /// ```
-fn tag_close_start(tokenizer: &mut Tokenizer) -> State {
+pub fn tag_close_start(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
         // ASCII alphabetical.
         Some(b'A'..=b'Z' | b'a'..=b'z') => {
             tokenizer.consume();
-            State::Fn(Box::new(tag_close))
+            State::Fn(StateName::HtmlTextTagClose)
         }
         _ => State::Nok,
     }
@@ -394,12 +394,12 @@ fn tag_close_start(tokenizer: &mut Tokenizer) -> State {
 /// > | a </b> c
 ///          ^
 /// ```
-fn tag_close(tokenizer: &mut Tokenizer) -> State {
+pub fn tag_close(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
         // ASCII alphanumerical and `-`.
         Some(b'-' | b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z') => {
             tokenizer.consume();
-            State::Fn(Box::new(tag_close))
+            State::Fn(StateName::HtmlTextTagClose)
         }
         _ => tag_close_between(tokenizer),
     }
@@ -411,15 +411,15 @@ fn tag_close(tokenizer: &mut Tokenizer) -> State {
 /// > | a </b> c
 ///           ^
 /// ```
-fn tag_close_between(tokenizer: &mut Tokenizer) -> State {
+pub fn tag_close_between(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
         Some(b'\n') => {
-            tokenizer.tokenize_state.return_state = Some(Box::new(tag_close_between));
-            at_line_ending(tokenizer)
+            tokenizer.tokenize_state.return_state = Some(StateName::HtmlTextTagCloseBetween);
+            line_ending_before(tokenizer)
         }
         Some(b'\t' | b' ') => {
             tokenizer.consume();
-            State::Fn(Box::new(tag_close_between))
+            State::Fn(StateName::HtmlTextTagCloseBetween)
         }
         _ => end(tokenizer),
     }
@@ -431,12 +431,12 @@ fn tag_close_between(tokenizer: &mut Tokenizer) -> State {
 /// > | a <b> c
 ///        ^
 /// ```
-fn tag_open(tokenizer: &mut Tokenizer) -> State {
+pub fn tag_open(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
         // ASCII alphanumerical and `-`.
         Some(b'-' | b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z') => {
             tokenizer.consume();
-            State::Fn(Box::new(tag_open))
+            State::Fn(StateName::HtmlTextTagOpen)
         }
         Some(b'\t' | b'\n' | b' ' | b'/' | b'>') => tag_open_between(tokenizer),
         _ => State::Nok,
@@ -449,24 +449,24 @@ fn tag_open(tokenizer: &mut Tokenizer) -> State {
 /// > | a <b> c
 ///         ^
 /// ```
-fn tag_open_between(tokenizer: &mut Tokenizer) -> State {
+pub fn tag_open_between(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
         Some(b'\n') => {
-            tokenizer.tokenize_state.return_state = Some(Box::new(tag_open_between));
-            at_line_ending(tokenizer)
+            tokenizer.tokenize_state.return_state = Some(StateName::HtmlTextTagOpenBetween);
+            line_ending_before(tokenizer)
         }
         Some(b'\t' | b' ') => {
             tokenizer.consume();
-            State::Fn(Box::new(tag_open_between))
+            State::Fn(StateName::HtmlTextTagOpenBetween)
         }
         Some(b'/') => {
             tokenizer.consume();
-            State::Fn(Box::new(end))
+            State::Fn(StateName::HtmlTextEnd)
         }
         // ASCII alphabetical and `:` and `_`.
         Some(b':' | b'A'..=b'Z' | b'_' | b'a'..=b'z') => {
             tokenizer.consume();
-            State::Fn(Box::new(tag_open_attribute_name))
+            State::Fn(StateName::HtmlTextTagOpenAttributeName)
         }
         _ => end(tokenizer),
     }
@@ -478,12 +478,12 @@ fn tag_open_between(tokenizer: &mut Tokenizer) -> State {
 /// > | a <b c> d
 ///          ^
 /// ```
-fn tag_open_attribute_name(tokenizer: &mut Tokenizer) -> State {
+pub fn tag_open_attribute_name(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
         // ASCII alphabetical and `-`, `.`, `:`, and `_`.
         Some(b'-' | b'.' | b'0'..=b'9' | b':' | b'A'..=b'Z' | b'_' | b'a'..=b'z') => {
             tokenizer.consume();
-            State::Fn(Box::new(tag_open_attribute_name))
+            State::Fn(StateName::HtmlTextTagOpenAttributeName)
         }
         _ => tag_open_attribute_name_after(tokenizer),
     }
@@ -496,19 +496,20 @@ fn tag_open_attribute_name(tokenizer: &mut Tokenizer) -> State {
 /// > | a <b c> d
 ///           ^
 /// ```
-fn tag_open_attribute_name_after(tokenizer: &mut Tokenizer) -> State {
+pub fn tag_open_attribute_name_after(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
         Some(b'\n') => {
-            tokenizer.tokenize_state.return_state = Some(Box::new(tag_open_attribute_name_after));
-            at_line_ending(tokenizer)
+            tokenizer.tokenize_state.return_state =
+                Some(StateName::HtmlTextTagOpenAttributeNameAfter);
+            line_ending_before(tokenizer)
         }
         Some(b'\t' | b' ') => {
             tokenizer.consume();
-            State::Fn(Box::new(tag_open_attribute_name_after))
+            State::Fn(StateName::HtmlTextTagOpenAttributeNameAfter)
         }
         Some(b'=') => {
             tokenizer.consume();
-            State::Fn(Box::new(tag_open_attribute_value_before))
+            State::Fn(StateName::HtmlTextTagOpenAttributeValueBefore)
         }
         _ => tag_open_between(tokenizer),
     }
@@ -521,25 +522,26 @@ fn tag_open_attribute_name_after(tokenizer: &mut Tokenizer) -> State {
 /// > | a <b c=d> e
 ///            ^
 /// ```
-fn tag_open_attribute_value_before(tokenizer: &mut Tokenizer) -> State {
+pub fn tag_open_attribute_value_before(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
         None | Some(b'<' | b'=' | b'>' | b'`') => State::Nok,
         Some(b'\n') => {
-            tokenizer.tokenize_state.return_state = Some(Box::new(tag_open_attribute_value_before));
-            at_line_ending(tokenizer)
+            tokenizer.tokenize_state.return_state =
+                Some(StateName::HtmlTextTagOpenAttributeValueBefore);
+            line_ending_before(tokenizer)
         }
         Some(b'\t' | b' ') => {
             tokenizer.consume();
-            State::Fn(Box::new(tag_open_attribute_value_before))
+            State::Fn(StateName::HtmlTextTagOpenAttributeValueBefore)
         }
         Some(b'"' | b'\'') => {
             tokenizer.tokenize_state.marker = tokenizer.current.unwrap();
             tokenizer.consume();
-            State::Fn(Box::new(tag_open_attribute_value_quoted))
+            State::Fn(StateName::HtmlTextTagOpenAttributeValueQuoted)
         }
         Some(_) => {
             tokenizer.consume();
-            State::Fn(Box::new(tag_open_attribute_value_unquoted))
+            State::Fn(StateName::HtmlTextTagOpenAttributeValueUnquoted)
         }
     }
 }
@@ -550,24 +552,25 @@ fn tag_open_attribute_value_before(tokenizer: &mut Tokenizer) -> State {
 /// > | a <b c="d"> e
 ///             ^
 /// ```
-fn tag_open_attribute_value_quoted(tokenizer: &mut Tokenizer) -> State {
+pub fn tag_open_attribute_value_quoted(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
         None => {
             tokenizer.tokenize_state.marker = 0;
             State::Nok
         }
         Some(b'\n') => {
-            tokenizer.tokenize_state.return_state = Some(Box::new(tag_open_attribute_value_quoted));
-            at_line_ending(tokenizer)
+            tokenizer.tokenize_state.return_state =
+                Some(StateName::HtmlTextTagOpenAttributeValueQuoted);
+            line_ending_before(tokenizer)
         }
         Some(b'"' | b'\'') if tokenizer.current.unwrap() == tokenizer.tokenize_state.marker => {
             tokenizer.tokenize_state.marker = 0;
             tokenizer.consume();
-            State::Fn(Box::new(tag_open_attribute_value_quoted_after))
+            State::Fn(StateName::HtmlTextTagOpenAttributeValueQuotedAfter)
         }
         _ => {
             tokenizer.consume();
-            State::Fn(Box::new(tag_open_attribute_value_quoted))
+            State::Fn(StateName::HtmlTextTagOpenAttributeValueQuoted)
         }
     }
 }
@@ -578,13 +581,13 @@ fn tag_open_attribute_value_quoted(tokenizer: &mut Tokenizer) -> State {
 /// > | a <b c=d> e
 ///            ^
 /// ```
-fn tag_open_attribute_value_unquoted(tokenizer: &mut Tokenizer) -> State {
+pub fn tag_open_attribute_value_unquoted(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
         None | Some(b'"' | b'\'' | b'<' | b'=' | b'`') => State::Nok,
         Some(b'\t' | b'\n' | b' ' | b'/' | b'>') => tag_open_between(tokenizer),
         Some(_) => {
             tokenizer.consume();
-            State::Fn(Box::new(tag_open_attribute_value_unquoted))
+            State::Fn(StateName::HtmlTextTagOpenAttributeValueUnquoted)
         }
     }
 }
@@ -596,7 +599,7 @@ fn tag_open_attribute_value_unquoted(tokenizer: &mut Tokenizer) -> State {
 /// > | a <b c="d"> e
 ///               ^
 /// ```
-fn tag_open_attribute_value_quoted_after(tokenizer: &mut Tokenizer) -> State {
+pub fn tag_open_attribute_value_quoted_after(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
         Some(b'\t' | b'\n' | b' ' | b'>' | b'/') => tag_open_between(tokenizer),
         _ => State::Nok,
@@ -609,7 +612,7 @@ fn tag_open_attribute_value_quoted_after(tokenizer: &mut Tokenizer) -> State {
 /// > | a <b c="d"> e
 ///               ^
 /// ```
-fn end(tokenizer: &mut Tokenizer) -> State {
+pub fn end(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
         Some(b'>') => {
             tokenizer.consume();
@@ -631,14 +634,14 @@ fn end(tokenizer: &mut Tokenizer) -> State {
 ///          ^
 ///   | b-->
 /// ```
-fn at_line_ending(tokenizer: &mut Tokenizer) -> State {
+pub fn line_ending_before(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
         Some(b'\n') => {
             tokenizer.exit(Token::HtmlTextData);
             tokenizer.enter(Token::LineEnding);
             tokenizer.consume();
             tokenizer.exit(Token::LineEnding);
-            State::Fn(Box::new(after_line_ending))
+            State::Fn(StateName::HtmlTextLineEndingAfter)
         }
         _ => unreachable!("expected eol"),
     }
@@ -654,8 +657,9 @@ fn at_line_ending(tokenizer: &mut Tokenizer) -> State {
 /// > | b-->
 ///     ^
 /// ```
-fn after_line_ending(tokenizer: &mut Tokenizer) -> State {
-    tokenizer.attempt_opt(space_or_tab(), after_line_ending_prefix)(tokenizer)
+pub fn line_ending_after(tokenizer: &mut Tokenizer) -> State {
+    let state_name = space_or_tab(tokenizer);
+    tokenizer.attempt_opt(state_name, StateName::HtmlTextLineEndingAfterPrefix)
 }

 /// After a line ending, after indent.
@@ -668,8 +672,9 @@ fn after_line_ending(tokenizer: &mut Tokenizer) -> State {
 /// > | b-->
 ///     ^
 /// ```
-fn after_line_ending_prefix(tokenizer: &mut Tokenizer) -> State {
-    let return_state = tokenizer.tokenize_state.return_state.take().unwrap();
+pub fn line_ending_after_prefix(tokenizer: &mut Tokenizer) -> State {
+    let state_name = tokenizer.tokenize_state.return_state.take().unwrap();
+    let func = state_name.to_func();
     tokenizer.enter(Token::HtmlTextData);
-    return_state(tokenizer)
+    func(tokenizer)
 }
diff --git a/src/construct/label_end.rs b/src/construct/label_end.rs
index b38e15a..ae9fe77 100644
--- a/src/construct/label_end.rs
+++ b/src/construct/label_end.rs
@@ -147,12 +147,9 @@
 //! [html-img]: https://html.spec.whatwg.org/multipage/embedded-content.html#the-img-element

 use crate::constant::RESOURCE_DESTINATION_BALANCE_MAX;
-use crate::construct::{
-    partial_destination::start as destination, partial_label::start as label,
-    partial_space_or_tab::space_or_tab_eol, partial_title::start as title,
-};
+use crate::construct::partial_space_or_tab::space_or_tab_eol;
 use crate::token::Token;
-use crate::tokenizer::{Event, EventType, Media, State, Tokenizer};
+use crate::tokenizer::{Event, EventType, Media, State, StateName, Tokenizer};
 use crate::util::{
     normalize_identifier::normalize_identifier,
     skip,
@@ -204,7 +201,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
             tokenizer.consume();
             tokenizer.exit(Token::LabelMarker);
             tokenizer.exit(Token::LabelEnd);
-            return State::Fn(Box::new(after));
+            return State::Fn(StateName::LabelEndAfter);
         }
     }

@@ -223,7 +220,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
 /// > | [a] b
 ///       ^
 /// ```
-fn after(tokenizer: &mut Tokenizer) -> State {
+pub fn after(tokenizer: &mut Tokenizer) -> State {
     let start = &tokenizer.label_start_stack[tokenizer.tokenize_state.start];
     let defined = tokenizer
         .parse_state
@@ -240,19 +237,23 @@ fn after(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
         // Resource (`[asd](fgh)`)?
-        Some(b'(') => tokenizer.attempt(resource, move |is_ok| {
-            Box::new(if is_ok || defined { ok } else { nok })
-        })(tokenizer),
+        Some(b'(') => tokenizer.attempt(StateName::LabelEndResourceStart, move |is_ok| {
+            State::Fn(if is_ok || defined {
+                StateName::LabelEndOk
+            } else {
+                StateName::LabelEndNok
+            })
+        }),
         // Full (`[asd][fgh]`) or collapsed (`[asd][]`) reference?
-        Some(b'[') => tokenizer.attempt(full_reference, move |is_ok| {
-            Box::new(if is_ok {
-                ok
+        Some(b'[') => tokenizer.attempt(StateName::LabelEndReferenceFull, move |is_ok| {
+            State::Fn(if is_ok {
+                StateName::LabelEndOk
             } else if defined {
-                reference_not_full
+                StateName::LabelEndReferenceNotFull
             } else {
-                nok
+                StateName::LabelEndNok
             })
-        })(tokenizer),
+        }),
         // Shortcut (`[asd]`) reference?
         _ => {
             let func = if defined { ok } else { nok };
@@ -271,10 +272,14 @@ fn after(tokenizer: &mut Tokenizer) -> State {
 /// > | [a] b
 ///        ^
 /// ```
-fn reference_not_full(tokenizer: &mut Tokenizer) -> State {
-    tokenizer.attempt(collapsed_reference, |is_ok| {
-        Box::new(if is_ok { ok } else { nok })
-    })(tokenizer)
+pub fn reference_not_full(tokenizer: &mut Tokenizer) -> State {
+    tokenizer.attempt(StateName::LabelEndReferenceCollapsed, |is_ok| {
+        State::Fn(if is_ok {
+            StateName::LabelEndOk
+        } else {
+            StateName::LabelEndNok
+        })
+    })
 }

 /// Done, we found something.
@@ -289,7 +294,7 @@ fn reference_not_full(tokenizer: &mut Tokenizer) -> State {
 /// > | [a] b
 ///       ^
 /// ```
-fn ok(tokenizer: &mut Tokenizer) -> State {
+pub fn ok(tokenizer: &mut Tokenizer) -> State {
     let label_start_index = tokenizer.tokenize_state.start;
     // Remove this one and everything after it.
     let mut left = tokenizer.label_start_stack.split_off(label_start_index);
@@ -332,7 +337,7 @@ fn ok(tokenizer: &mut Tokenizer) -> State {
 /// > | [a] b
 ///     ^
 /// ```
-fn nok(tokenizer: &mut Tokenizer) -> State {
+pub fn nok(tokenizer: &mut Tokenizer) -> State {
     tokenizer
         .label_start_stack
         .get_mut(tokenizer.tokenize_state.start)
@@ -349,14 +354,14 @@ fn nok(tokenizer: &mut Tokenizer) -> State {
 /// > | [a](b) c
 ///        ^
 /// ```
-fn resource(tokenizer: &mut Tokenizer) -> State {
+pub fn resource_start(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
         Some(b'(') => {
             tokenizer.enter(Token::Resource);
             tokenizer.enter(Token::ResourceMarker);
             tokenizer.consume();
             tokenizer.exit(Token::ResourceMarker);
-            State::Fn(Box::new(resource_start))
+            State::Fn(StateName::LabelEndResourceBefore)
         }
         _ => unreachable!("expected `(`"),
     }
@@ -368,8 +373,9 @@ fn resource(tokenizer: &mut Tokenizer) -> State {
 /// > | [a](b) c
 ///         ^
 /// ```
-fn resource_start(tokenizer: &mut Tokenizer) -> State {
-    tokenizer.attempt_opt(space_or_tab_eol(), resource_open)(tokenizer)
+pub fn resource_before(tokenizer: &mut Tokenizer) -> State {
+    let state_name = space_or_tab_eol(tokenizer);
+    tokenizer.attempt_opt(state_name, StateName::LabelEndResourceOpen)
 }

 /// At the start of a resource, after optional whitespace.
@@ -378,7 +384,7 @@ fn resource_start(tokenizer: &mut Tokenizer) -> State {
 /// > | [a](b) c
 ///         ^
 /// ```
-fn resource_open(tokenizer: &mut Tokenizer) -> State {
+pub fn resource_open(tokenizer: &mut Tokenizer) -> State {
     if let Some(b')') = tokenizer.current {
         resource_end(tokenizer)
     } else {
@@ -389,13 +395,13 @@ fn resource_open(tokenizer: &mut Tokenizer) -> State {
         tokenizer.tokenize_state.token_5 = Token::ResourceDestinationString;
         tokenizer.tokenize_state.size_other = RESOURCE_DESTINATION_BALANCE_MAX;

-        tokenizer.attempt(destination, |ok| {
-            Box::new(if ok {
-                destination_after
+        tokenizer.attempt(StateName::DestinationStart, |ok| {
+            State::Fn(if ok {
+                StateName::LabelEndResourceDestinationAfter
             } else {
-                destination_missing
+                StateName::LabelEndResourceDestinationMissing
             })
-        })(tokenizer)
+        })
     }
 }

@@ -405,21 +411,26 @@ fn resource_open(tokenizer: &mut Tokenizer) -> State {
 /// > | [a](b) c
 ///          ^
 /// ```
-fn destination_after(tokenizer: &mut Tokenizer) -> State {
+pub fn resource_destination_after(tokenizer: &mut Tokenizer) -> State {
     tokenizer.tokenize_state.token_1 = Token::Data;
     tokenizer.tokenize_state.token_2 = Token::Data;
     tokenizer.tokenize_state.token_3 = Token::Data;
     tokenizer.tokenize_state.token_4 = Token::Data;
     tokenizer.tokenize_state.token_5 = Token::Data;
     tokenizer.tokenize_state.size_other = 0;
-
-    tokenizer.attempt(space_or_tab_eol(), |ok| {
-        Box::new(if ok { resource_between } else { resource_end })
-    })(tokenizer)
+    let state_name = space_or_tab_eol(tokenizer);
+
+    tokenizer.attempt(state_name, |ok| {
+        State::Fn(if ok {
+            StateName::LabelEndResourceBetween
+        } else {
+            StateName::LabelEndResourceEnd
+        })
+    })
 }

 /// Without destination.
-fn destination_missing(tokenizer: &mut Tokenizer) -> State {
+pub fn resource_destination_missing(tokenizer: &mut Tokenizer) -> State {
     tokenizer.tokenize_state.token_1 = Token::Data;
     tokenizer.tokenize_state.token_2 = Token::Data;
     tokenizer.tokenize_state.token_3 = Token::Data;
@@ -435,13 +446,13 @@ fn destination_missing(tokenizer: &mut Tokenizer) -> State {
 /// > | [a](b ) c
 ///           ^
 /// ```
-fn resource_between(tokenizer: &mut Tokenizer) -> State {
+pub fn resource_between(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
         Some(b'"' | b'\'' | b'(') => {
             tokenizer.tokenize_state.token_1 = Token::ResourceTitle;
             tokenizer.tokenize_state.token_2 = Token::ResourceTitleMarker;
             tokenizer.tokenize_state.token_3 = Token::ResourceTitleString;
-            tokenizer.go(title, title_after)(tokenizer)
+            tokenizer.go(StateName::TitleStart, StateName::LabelEndResourceTitleAfter)
         }
         _ => resource_end(tokenizer),
     }
@@ -453,11 +464,12 @@ fn resource_between(tokenizer: &mut Tokenizer) -> State {
 /// > | [a](b "c") d
 ///              ^
 /// ```
-fn title_after(tokenizer: &mut Tokenizer) -> State {
+pub fn resource_title_after(tokenizer: &mut Tokenizer) -> State {
     tokenizer.tokenize_state.token_1 = Token::Data;
     tokenizer.tokenize_state.token_2 = Token::Data;
     tokenizer.tokenize_state.token_3 = Token::Data;
-    tokenizer.attempt_opt(space_or_tab_eol(), resource_end)(tokenizer)
+    let state_name = space_or_tab_eol(tokenizer);
+    tokenizer.attempt_opt(state_name, StateName::LabelEndResourceEnd)
 }

 /// In a resource, at the `)`.
@@ -466,7 +478,7 @@ fn title_after(tokenizer: &mut Tokenizer) -> State {
 /// > | [a](b) d
 ///          ^
 /// ```
-fn resource_end(tokenizer: &mut Tokenizer) -> State {
+pub fn resource_end(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
         Some(b')') => {
             tokenizer.enter(Token::ResourceMarker);
@@ -485,13 +497,13 @@ fn resource_end(tokenizer: &mut Tokenizer) -> State {
 /// > | [a][b] d
 ///        ^
 /// ```
-fn full_reference(tokenizer: &mut Tokenizer) -> State {
+pub fn reference_full(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
         Some(b'[') => {
             tokenizer.tokenize_state.token_1 = Token::Reference;
             tokenizer.tokenize_state.token_2 = Token::ReferenceMarker;
             tokenizer.tokenize_state.token_3 = Token::ReferenceString;
-            tokenizer.go(label, full_reference_after)(tokenizer)
+            tokenizer.go(StateName::LabelStart, StateName::LabelEndReferenceFullAfter)
         }
         _ => unreachable!("expected `[`"),
     }
@@ -503,7 +515,7 @@ fn full_reference(tokenizer: &mut Tokenizer) -> State {
 /// > | [a][b] d
 ///          ^
 /// ```
-fn full_reference_after(tokenizer: &mut Tokenizer) -> State {
+pub fn reference_full_after(tokenizer: &mut Tokenizer) -> State {
     tokenizer.tokenize_state.token_1 = Token::Data;
     tokenizer.tokenize_state.token_2 = Token::Data;
     tokenizer.tokenize_state.token_3 = Token::Data;
@@ -541,14 +553,14 @@ fn full_reference_after(tokenizer: &mut Tokenizer) -> State {
 /// > | [a][] d
 ///        ^
 /// ```
-fn collapsed_reference(tokenizer: &mut Tokenizer) -> State {
+pub fn reference_collapsed(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
         Some(b'[') => {
             tokenizer.enter(Token::Reference);
             tokenizer.enter(Token::ReferenceMarker);
             tokenizer.consume();
             tokenizer.exit(Token::ReferenceMarker);
-            State::Fn(Box::new(collapsed_reference_open))
+            State::Fn(StateName::LabelEndReferenceCollapsedOpen)
         }
         _ => State::Nok,
     }
@@ -562,7 +574,7 @@ fn collapsed_reference(tokenizer: &mut Tokenizer) -> State {
 /// > | [a][] d
 ///         ^
 /// ```
-fn collapsed_reference_open(tokenizer: &mut Tokenizer) -> State {
+pub fn reference_collapsed_open(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
         Some(b']') => {
             tokenizer.enter(Token::ReferenceMarker);
diff --git a/src/construct/label_start_image.rs b/src/construct/label_start_image.rs
index 4a3508e..4fcf8c2 100644
--- a/src/construct/label_start_image.rs
+++ b/src/construct/label_start_image.rs
@@ -30,7 +30,7 @@
 use super::label_end::resolve_media;
 use crate::token::Token;
-use crate::tokenizer::{LabelStart, State, Tokenizer};
+use crate::tokenizer::{LabelStart, State, StateName, Tokenizer};

 /// Start of label (image) start.
 ///
@@ -45,7 +45,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
             tokenizer.enter(Token::LabelImageMarker);
             tokenizer.consume();
             tokenizer.exit(Token::LabelImageMarker);
-            State::Fn(Box::new(open))
+            State::Fn(StateName::LabelStartImageOpen)
         }
         _ => State::Nok,
     }
diff --git a/src/construct/list.rs b/src/construct/list.rs
index 0e12b7c..6ecfb04 100644
--- a/src/construct/list.rs
+++ b/src/construct/list.rs
@@ -45,12 +45,9 @@
 //! [commonmark-block]: https://spec.commonmark.org/0.30/#phase-1-block-structure

 use crate::constant::{LIST_ITEM_VALUE_SIZE_MAX, TAB_SIZE};
-use crate::construct::{
-    blank_line::start as blank_line, partial_space_or_tab::space_or_tab_min_max,
-    thematic_break::start as thematic_break,
-};
+use crate::construct::partial_space_or_tab::space_or_tab_min_max;
 use crate::token::Token;
-use crate::tokenizer::{EventType, State, Tokenizer};
+use crate::tokenizer::{EventType, State, StateName, Tokenizer};
 use crate::util::{
     skip,
     slice::{Position, Slice},
@@ -65,17 +62,16 @@ use crate::util::{
 pub fn start(tokenizer: &mut Tokenizer) -> State {
     if tokenizer.parse_state.constructs.list {
         tokenizer.enter(Token::ListItem);
-        tokenizer.go(
-            space_or_tab_min_max(
-                0,
-                if tokenizer.parse_state.constructs.code_indented {
-                    TAB_SIZE - 1
-                } else {
-                    usize::MAX
-                },
-            ),
-            before,
-        )(tokenizer)
+        let state_name = space_or_tab_min_max(
+            tokenizer,
+            0,
+            if tokenizer.parse_state.constructs.code_indented {
+                TAB_SIZE - 1
+            } else {
+                usize::MAX
+            },
+        );
+        tokenizer.go(state_name, StateName::ListBefore)
     } else {
         State::Nok
     }
@@ -87,12 +83,16 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
 /// > | * a
 ///     ^
 /// ```
-fn before(tokenizer: &mut Tokenizer) -> State {
+pub fn before(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
         // Unordered.
-        Some(b'*' | b'-') => tokenizer.check(thematic_break, |ok| {
-            Box::new(if ok { nok } else { before_unordered })
-        })(tokenizer),
+        Some(b'*' | b'-') => tokenizer.check(StateName::ThematicBreakStart, |ok| {
+            State::Fn(if ok {
+                StateName::ListNok
+            } else {
+                StateName::ListBeforeUnordered
+            })
+        }),
         Some(b'+') => before_unordered(tokenizer),
         // Ordered.
         Some(b'0'..=b'9') if !tokenizer.interrupt => before_ordered(tokenizer),
@@ -109,7 +109,7 @@ fn before(tokenizer: &mut Tokenizer) -> State {
 /// > | * a
 ///     ^
 /// ```
-fn before_unordered(tokenizer: &mut Tokenizer) -> State {
+pub fn before_unordered(tokenizer: &mut Tokenizer) -> State {
     tokenizer.enter(Token::ListItemPrefix);
     marker(tokenizer)
 }
@@ -120,10 +120,10 @@ fn before_unordered(tokenizer: &mut Tokenizer) -> State {
 /// > | * a
 ///     ^
 /// ```
-fn before_ordered(tokenizer: &mut Tokenizer) -> State {
+pub fn before_ordered(tokenizer: &mut Tokenizer) -> State {
     tokenizer.enter(Token::ListItemPrefix);
     tokenizer.enter(Token::ListItemValue);
-    inside(tokenizer)
+    value(tokenizer)
 }

 /// In an ordered list item value.
 ///
 /// ```markdown
 /// > | 1. a
 ///     ^
 /// ```
-fn inside(tokenizer: &mut Tokenizer) -> State {
+pub fn value(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
         Some(b'.' | b')') if !tokenizer.interrupt || tokenizer.tokenize_state.size < 2 => {
             tokenizer.exit(Token::ListItemValue);
@@ -141,7 +141,7 @@ fn inside(tokenizer: &mut Tokenizer) -> State {
         Some(b'0'..=b'9') if tokenizer.tokenize_state.size + 1 < LIST_ITEM_VALUE_SIZE_MAX => {
             tokenizer.tokenize_state.size += 1;
             tokenizer.consume();
-            State::Fn(Box::new(inside))
+            State::Fn(StateName::ListValue)
         }
         _ => {
             tokenizer.tokenize_state.size = 0;
@@ -158,11 +158,11 @@ fn inside(tokenizer: &mut Tokenizer) -> State {
 /// > | 1. b
 ///      ^
 /// ```
-fn marker(tokenizer: &mut Tokenizer) -> State {
+pub fn marker(tokenizer: &mut Tokenizer) -> State {
     tokenizer.enter(Token::ListItemMarker);
     tokenizer.consume();
     tokenizer.exit(Token::ListItemMarker);
-    State::Fn(Box::new(marker_after))
+    State::Fn(StateName::ListMarkerAfter)
 }

 /// After a list item marker.
@@ -173,11 +173,15 @@ fn marker(tokenizer: &mut Tokenizer) -> State {
 /// > | 1. b
 ///       ^
 /// ```
-fn marker_after(tokenizer: &mut Tokenizer) -> State {
+pub fn marker_after(tokenizer: &mut Tokenizer) -> State {
     tokenizer.tokenize_state.size = 1;
-    tokenizer.check(blank_line, |ok| {
-        Box::new(if ok { after } else { marker_after_not_blank })
-    })(tokenizer)
+    tokenizer.check(StateName::BlankLineStart, |ok| {
+        State::Fn(if ok {
+            StateName::ListAfter
+        } else {
+            StateName::ListMarkerAfterFilled
+        })
+    })
 }

 /// After a list item marker, not followed by a blank line.
@@ -186,13 +190,17 @@ fn marker_after(tokenizer: &mut Tokenizer) -> State {
 /// > | * a
 ///       ^
 /// ```
-fn marker_after_not_blank(tokenizer: &mut Tokenizer) -> State {
+pub fn marker_after_filled(tokenizer: &mut Tokenizer) -> State {
     tokenizer.tokenize_state.size = 0;

     // Attempt to parse up to the largest allowed indent, `nok` if there is more whitespace.
-    tokenizer.attempt(whitespace, |ok| {
-        Box::new(if ok { after } else { prefix_other })
-    })(tokenizer)
+    tokenizer.attempt(StateName::ListWhitespace, |ok| {
+        State::Fn(if ok {
+            StateName::ListAfter
+        } else {
+            StateName::ListPrefixOther
+        })
+    })
 }

 /// In whitespace after a marker.
@@ -201,8 +209,9 @@ fn marker_after_not_blank(tokenizer: &mut Tokenizer) -> State {
 /// > | * a
 ///      ^
 /// ```
-fn whitespace(tokenizer: &mut Tokenizer) -> State {
-    tokenizer.go(space_or_tab_min_max(1, TAB_SIZE), whitespace_after)(tokenizer)
+pub fn whitespace(tokenizer: &mut Tokenizer) -> State {
+    let state_name = space_or_tab_min_max(tokenizer, 1, TAB_SIZE);
+    tokenizer.go(state_name, StateName::ListWhitespaceAfter)
 }

 /// After acceptable whitespace.
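The `list.rs` hunks show the second half of the calling convention: helpers such as `space_or_tab_min_max` used to return a closure capturing `min`/`max`; since a `StateName` cannot capture anything, they now take the tokenizer, stash the bounds on it, and return the entry state's name for `go` to jump to. A sketch under assumed field names (the crate keeps such options on `tokenize_state`):

```rust
#[derive(Clone, Copy)]
pub enum StateName {
    SpaceOrTabStart,
    ListWhitespaceAfter,
}

pub struct Tokenizer {
    // Assumed fields standing in for the crate's `tokenize_state` options.
    pub space_or_tab_min: usize,
    pub space_or_tab_max: usize,
}

/// Configure the reusable `space_or_tab` machine, then hand back its
/// entry state: parameters travel through tokenizer state, not a closure.
pub fn space_or_tab_min_max(tokenizer: &mut Tokenizer, min: usize, max: usize) -> StateName {
    tokenizer.space_or_tab_min = min;
    tokenizer.space_or_tab_max = max;
    StateName::SpaceOrTabStart
}
```

Call sites then read exactly as in the hunks above: `let state_name = space_or_tab_min_max(tokenizer, 1, TAB_SIZE); tokenizer.go(state_name, StateName::ListWhitespaceAfter)`.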
@@ -211,7 +220,7 @@ fn whitespace(tokenizer: &mut Tokenizer) -> State {
 /// > | * a
 ///       ^
 /// ```
-fn whitespace_after(tokenizer: &mut Tokenizer) -> State {
+pub fn whitespace_after(tokenizer: &mut Tokenizer) -> State {
     if let Some(b'\t' | b' ') = tokenizer.current {
         State::Nok
     } else {
@@ -225,13 +234,13 @@ fn whitespace_after(tokenizer: &mut Tokenizer) -> State {
 /// > | * a
 ///       ^
 /// ```
-fn prefix_other(tokenizer: &mut Tokenizer) -> State {
+pub fn prefix_other(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
         Some(b'\t' | b' ') => {
             tokenizer.enter(Token::SpaceOrTab);
             tokenizer.consume();
             tokenizer.exit(Token::SpaceOrTab);
-            State::Fn(Box::new(after))
+            State::Fn(StateName::ListAfter)
         }
         _ => State::Nok,
     }
@@ -243,7 +252,7 @@ fn prefix_other(tokenizer: &mut Tokenizer) -> State {
 /// > | * a
 ///       ^
 /// ```
-fn after(tokenizer: &mut Tokenizer) -> State {
+pub fn after(tokenizer: &mut Tokenizer) -> State {
     let blank = tokenizer.tokenize_state.size == 1;
     tokenizer.tokenize_state.size = 0;

@@ -285,10 +294,14 @@ fn after(tokenizer: &mut Tokenizer) -> State {
 /// > | b
 ///     ^
 /// ```
-pub fn cont(tokenizer: &mut Tokenizer) -> State {
-    tokenizer.check(blank_line, |ok| {
-        Box::new(if ok { blank_cont } else { not_blank_cont })
-    })(tokenizer)
+pub fn cont_start(tokenizer: &mut Tokenizer) -> State {
+    tokenizer.check(StateName::BlankLineStart, |ok| {
+        State::Fn(if ok {
+            StateName::ListContBlank
+        } else {
+            StateName::ListContFilled
+        })
+    })
 }

 /// Start of blank list item continuation.
@@ -299,15 +312,16 @@ pub fn cont(tokenizer: &mut Tokenizer) -> State {
 ///     ^
 ///   | b
 /// ```
-pub fn blank_cont(tokenizer: &mut Tokenizer) -> State {
+pub fn cont_blank(tokenizer: &mut Tokenizer) -> State {
     let container = tokenizer.container.as_ref().unwrap();
     let size = container.size;

     if container.blank_initial {
         State::Nok
     } else {
+        let state_name = space_or_tab_min_max(tokenizer, 0, size);
         // Consume, optionally, at most `size`.
-        tokenizer.go(space_or_tab_min_max(0, size), ok)(tokenizer)
+        tokenizer.go(state_name, StateName::ListOk)
     }
 }

@@ -318,14 +332,15 @@ pub fn blank_cont(tokenizer: &mut Tokenizer) -> State {
 /// > | b
 ///     ^
 /// ```
-pub fn not_blank_cont(tokenizer: &mut Tokenizer) -> State {
+pub fn cont_filled(tokenizer: &mut Tokenizer) -> State {
     let container = tokenizer.container.as_mut().unwrap();
     let size = container.size;

     container.blank_initial = false;

     // Consume exactly `size`.
-    tokenizer.go(space_or_tab_min_max(size, size), ok)(tokenizer)
+    let state_name = space_or_tab_min_max(tokenizer, size, size);
+    tokenizer.go(state_name, StateName::ListOk)
 }

 /// A state fn to yield [`State::Ok`].
@@ -334,16 +349,16 @@ pub fn ok(_tokenizer: &mut Tokenizer) -> State {
 }

 /// A state fn to yield [`State::Nok`].
-fn nok(_tokenizer: &mut Tokenizer) -> State {
+pub fn nok(_tokenizer: &mut Tokenizer) -> State {
     State::Nok
 }

 /// Find adjacent list items with the same marker.
 pub fn resolve_list_item(tokenizer: &mut Tokenizer) {
-    let mut index = 0;
-    let mut balance = 0;
     let mut lists_wip: Vec<(u8, usize, usize, usize)> = vec![];
     let mut lists: Vec<(u8, usize, usize, usize)> = vec![];
+    let mut index = 0;
+    let mut balance = 0;

     // Merge list items.
     while index < tokenizer.events.len() {
diff --git a/src/construct/paragraph.rs b/src/construct/paragraph.rs
index 7fdaa66..de750f4 100644
--- a/src/construct/paragraph.rs
+++ b/src/construct/paragraph.rs
@@ -33,7 +33,7 @@
 //! [html]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-p-element

 use crate::token::Token;
-use crate::tokenizer::{ContentType, EventType, State, Tokenizer};
+use crate::tokenizer::{ContentType, EventType, State, StateName, Tokenizer};
 use crate::util::skip::opt as skip_opt;

 /// Before a paragraph.
@@ -59,7 +59,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
 /// > | abc
 ///     ^^^
 /// ```
-fn inside(tokenizer: &mut Tokenizer) -> State {
+pub fn inside(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
         None | Some(b'\n') => {
             tokenizer.exit(Token::Data);
@@ -71,7 +71,7 @@ fn inside(tokenizer: &mut Tokenizer) -> State {
         }
         _ => {
             tokenizer.consume();
-            State::Fn(Box::new(inside))
+            State::Fn(StateName::ParagraphInside)
         }
     }
 }
diff --git a/src/construct/partial_bom.rs b/src/construct/partial_bom.rs
index 2257bfd..b32b7f9 100644
--- a/src/construct/partial_bom.rs
+++ b/src/construct/partial_bom.rs
@@ -11,7 +11,7 @@
 //! * [`micromark/lib/preprocess.js` in `micromark`](https://github.com/micromark/micromark/blob/ed23453/packages/micromark/dev/lib/preprocess.js#L54-L60)

 use crate::token::Token;
-use crate::tokenizer::{State, Tokenizer};
+use crate::tokenizer::{State, StateName, Tokenizer};

 const BOM: [u8; 3] = [0xEF, 0xBB, 0xBF];

@@ -36,7 +36,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
 /// > | 0xEF 0xBB 0xBF
 ///     ^^^^ ^^^^ ^^^^
 /// ```
-fn inside(tokenizer: &mut Tokenizer) -> State {
+pub fn inside(tokenizer: &mut Tokenizer) -> State {
     if tokenizer.current == Some(BOM[tokenizer.tokenize_state.size]) {
         tokenizer.tokenize_state.size += 1;
         tokenizer.consume();
@@ -45,7 +45,7 @@ fn inside(tokenizer: &mut Tokenizer) -> State {
             tokenizer.tokenize_state.size = 0;
             State::Ok
         } else {
-            State::Fn(Box::new(inside))
+            State::Fn(StateName::BomInside)
         }
     } else {
         tokenizer.tokenize_state.size = 0;
diff --git a/src/construct/partial_data.rs b/src/construct/partial_data.rs
index 0365489..1cb5e61 100644
--- a/src/construct/partial_data.rs
+++ b/src/construct/partial_data.rs
@@ -7,7 +7,7 @@
 //! [text]: crate::content::text

 use crate::token::Token;
-use crate::tokenizer::{EventType, State, Tokenizer};
+use crate::tokenizer::{EventType, State, StateName, Tokenizer};

 /// At the beginning of data.
 ///
@@ -17,10 +17,11 @@ use crate::tokenizer::{EventType, State, Tokenizer};
 /// ```
 pub fn start(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
+        // Make sure to eat the first `stop`.
         Some(byte) if tokenizer.tokenize_state.stop.contains(&byte) => {
             tokenizer.enter(Token::Data);
             tokenizer.consume();
-            State::Fn(Box::new(data))
+            State::Fn(StateName::DataInside)
         }
         _ => at_break(tokenizer),
     }
@@ -32,14 +33,14 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
 /// > | abc
 ///     ^
 /// ```
-fn at_break(tokenizer: &mut Tokenizer) -> State {
+pub fn at_break(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
         None => State::Ok,
         Some(b'\n') => {
             tokenizer.enter(Token::LineEnding);
             tokenizer.consume();
             tokenizer.exit(Token::LineEnding);
-            State::Fn(Box::new(at_break))
+            State::Fn(StateName::DataAtBreak)
         }
         Some(byte) if tokenizer.tokenize_state.stop.contains(&byte) => {
             tokenizer.register_resolver_before("data".to_string(), Box::new(resolve_data));
@@ -47,7 +48,7 @@ fn at_break(tokenizer: &mut Tokenizer) -> State {
         }
         _ => {
             tokenizer.enter(Token::Data);
-            data(tokenizer)
+            inside(tokenizer)
         }
     }
 }
@@ -58,7 +59,7 @@ fn at_break(tokenizer: &mut Tokenizer) -> State {
 /// > | abc
 ///     ^^^
 /// ```
-fn data(tokenizer: &mut Tokenizer) -> State {
+pub fn inside(tokenizer: &mut Tokenizer) -> State {
     let done = match tokenizer.current {
         None | Some(b'\n') => true,
         Some(byte) if tokenizer.tokenize_state.stop.contains(&byte) => true,
@@ -70,7 +71,7 @@ fn data(tokenizer: &mut Tokenizer) -> State {
         at_break(tokenizer)
     } else {
         tokenizer.consume();
-        State::Fn(Box::new(data))
+        State::Fn(StateName::DataInside)
     }
 }
diff --git a/src/construct/partial_destination.rs b/src/construct/partial_destination.rs
index f1cfc7d..e8818a0 100644
--- a/src/construct/partial_destination.rs
+++ b/src/construct/partial_destination.rs
@@ -72,7 +72,7 @@
 //! [sanitize_uri]: crate::util::sanitize_uri

 use crate::token::Token;
-use crate::tokenizer::{ContentType, State, Tokenizer};
+use crate::tokenizer::{ContentType, State, StateName, Tokenizer};

 /// Before a destination.
 ///
@@ -90,7 +90,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
             tokenizer.enter(tokenizer.tokenize_state.token_3.clone());
             tokenizer.consume();
             tokenizer.exit(tokenizer.tokenize_state.token_3.clone());
-            State::Fn(Box::new(enclosed_before))
+            State::Fn(StateName::DestinationEnclosedBefore)
         }
         // ASCII control, space, closing paren, but *not* `\0`.
         None | Some(0x01..=0x1F | b' ' | b')' | 0x7F) => State::Nok,
@@ -110,7 +110,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
 /// > | <aa>
 ///     ^
 /// ```
-fn enclosed_before(tokenizer: &mut Tokenizer) -> State {
+pub fn enclosed_before(tokenizer: &mut Tokenizer) -> State {
     if let Some(b'>') = tokenizer.current {
         tokenizer.enter(tokenizer.tokenize_state.token_3.clone());
         tokenizer.consume();
@@ -131,7 +131,7 @@ fn enclosed_before(tokenizer: &mut Tokenizer) -> State {
 /// > | <aa>
 ///      ^
 /// ```
-fn enclosed(tokenizer: &mut Tokenizer) -> State {
+pub fn enclosed(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
         None | Some(b'\n' | b'<') => State::Nok,
         Some(b'>') => {
@@ -141,11 +141,11 @@ fn enclosed(tokenizer: &mut Tokenizer) -> State {
         }
         Some(b'\\') => {
             tokenizer.consume();
-            State::Fn(Box::new(enclosed_escape))
+            State::Fn(StateName::DestinationEnclosedEscape)
         }
         _ => {
             tokenizer.consume();
-            State::Fn(Box::new(enclosed))
+            State::Fn(StateName::DestinationEnclosed)
         }
     }
 }
@@ -156,11 +156,11 @@ fn enclosed(tokenizer: &mut Tokenizer) -> State {
 /// > | <a\*a>
 ///        ^
 /// ```
-fn enclosed_escape(tokenizer: &mut Tokenizer) -> State {
+pub fn enclosed_escape(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
         Some(b'<' | b'>' | b'\\') => {
             tokenizer.consume();
-            State::Fn(Box::new(enclosed))
+            State::Fn(StateName::DestinationEnclosed)
         }
         _ => enclosed(tokenizer),
     }
@@ -172,7 +172,7 @@ fn enclosed_escape(tokenizer: &mut Tokenizer) -> State {
 /// > | aa
 ///     ^
 /// ```
-fn raw(tokenizer: &mut Tokenizer) -> State {
+pub fn raw(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
         None | Some(b'\t' | b'\n' | b' ' | b')') if tokenizer.tokenize_state.size == 0 => {
             tokenizer.exit(Token::Data);
@@ -185,7 +185,7 @@ fn raw(tokenizer: &mut Tokenizer) -> State {
         Some(b'(') if tokenizer.tokenize_state.size < tokenizer.tokenize_state.size_other => {
             tokenizer.consume();
             tokenizer.tokenize_state.size += 1;
-            State::Fn(Box::new(raw))
+            State::Fn(StateName::DestinationRaw)
         }
         // ASCII control (but *not* `\0`) and space and `(`.
         None | Some(0x01..=0x1F | b' ' | b'(' | 0x7F) => {
@@ -195,15 +195,15 @@ fn raw(tokenizer: &mut Tokenizer) -> State {
         Some(b')') => {
             tokenizer.consume();
             tokenizer.tokenize_state.size -= 1;
-            State::Fn(Box::new(raw))
+            State::Fn(StateName::DestinationRaw)
         }
         Some(b'\\') => {
             tokenizer.consume();
-            State::Fn(Box::new(raw_escape))
+            State::Fn(StateName::DestinationRawEscape)
         }
         Some(_) => {
             tokenizer.consume();
-            State::Fn(Box::new(raw))
+            State::Fn(StateName::DestinationRaw)
         }
     }
 }
@@ -214,11 +214,11 @@ fn raw(tokenizer: &mut Tokenizer) -> State {
 /// > | a\*a
 ///       ^
 /// ```
-fn raw_escape(tokenizer: &mut Tokenizer) -> State {
+pub fn raw_escape(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
         Some(b'(' | b')' | b'\\') => {
             tokenizer.consume();
-            State::Fn(Box::new(raw))
+            State::Fn(StateName::DestinationRaw)
         }
         _ => raw(tokenizer),
     }
 }
diff --git a/src/construct/partial_label.rs b/src/construct/partial_label.rs
index 0e1c2ec..0c8366e 100644
--- a/src/construct/partial_label.rs
+++ b/src/construct/partial_label.rs
@@ -62,7 +62,7 @@
 use super::partial_space_or_tab::{space_or_tab_eol_with_options, EolOptions};
 use crate::constant::LINK_REFERENCE_SIZE_MAX;
 use crate::subtokenize::link;
 use crate::token::Token;
-use crate::tokenizer::{ContentType, State, Tokenizer};
+use crate::tokenizer::{ContentType, State, StateName, Tokenizer};

 /// Before a label.
/// @@ -78,7 +78,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { tokenizer.consume(); tokenizer.exit(tokenizer.tokenize_state.token_2.clone()); tokenizer.enter(tokenizer.tokenize_state.token_3.clone()); - State::Fn(Box::new(at_break)) + State::Fn(StateName::LabelAtBreak) } _ => State::Nok, } @@ -90,7 +90,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { /// > | [a] /// ^ /// ``` -fn at_break(tokenizer: &mut Tokenizer) -> State { +pub fn at_break(tokenizer: &mut Tokenizer) -> State { if tokenizer.tokenize_state.size > LINK_REFERENCE_SIZE_MAX || matches!(tokenizer.current, None | Some(b'[')) || (matches!(tokenizer.current, Some(b']')) && !tokenizer.tokenize_state.seen) @@ -101,13 +101,22 @@ fn at_break(tokenizer: &mut Tokenizer) -> State { State::Nok } else { match tokenizer.current { - Some(b'\n') => tokenizer.attempt( - space_or_tab_eol_with_options(EolOptions { - content_type: Some(ContentType::String), - connect: tokenizer.tokenize_state.connect, - }), - |ok| Box::new(if ok { after_eol } else { at_blank_line }), - )(tokenizer), + Some(b'\n') => { + let state_name = space_or_tab_eol_with_options( + tokenizer, + EolOptions { + content_type: Some(ContentType::String), + connect: tokenizer.tokenize_state.connect, + }, + ); + tokenizer.attempt(state_name, |ok| { + State::Fn(if ok { + StateName::LabelEolAfter + } else { + StateName::LabelAtBlankLine + }) + }) + } Some(b']') => { tokenizer.exit(tokenizer.tokenize_state.token_3.clone()); tokenizer.enter(tokenizer.tokenize_state.token_2.clone()); @@ -129,20 +138,20 @@ fn at_break(tokenizer: &mut Tokenizer) -> State { tokenizer.tokenize_state.connect = true; } - label(tokenizer) + inside(tokenizer) } } } } /// To do. -fn after_eol(tokenizer: &mut Tokenizer) -> State { +pub fn eol_after(tokenizer: &mut Tokenizer) -> State { tokenizer.tokenize_state.connect = true; at_break(tokenizer) } /// To do. 
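In `partial_label`, the `attempt` call sites change shape as well: `space_or_tab_eol_with_options` now runs eagerly for its side effects and returns only a `StateName`, and the `done` closure maps `ok` to the name of the next state instead of allocating another boxed function. A toy model of that contract; `attempt` is stubbed here, whereas the real tokenizer first runs the attempted state machine to produce `ok`:

```rust
// Toy model of the new `attempt` contract: the `done` callback yields the
// *name* of the next state rather than a boxed state function.

#[derive(Clone, Copy, Debug, PartialEq)]
enum StateName {
    LabelEolAfter,
    LabelAtBlankLine,
}

#[derive(Debug, PartialEq)]
enum State {
    Fn(StateName),
}

fn attempt(ok: bool, done: impl FnOnce(bool) -> State) -> State {
    done(ok)
}

fn main() {
    let next = attempt(true, |ok| {
        State::Fn(if ok {
            StateName::LabelEolAfter
        } else {
            StateName::LabelAtBlankLine
        })
    });
    assert_eq!(next, State::Fn(StateName::LabelEolAfter));
}
```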
-fn at_blank_line(tokenizer: &mut Tokenizer) -> State { +pub fn at_blank_line(tokenizer: &mut Tokenizer) -> State { tokenizer.tokenize_state.marker = 0; tokenizer.tokenize_state.connect = false; State::Nok @@ -154,7 +163,7 @@ fn at_blank_line(tokenizer: &mut Tokenizer) -> State { /// > | [a] /// ^ /// ``` -fn label(tokenizer: &mut Tokenizer) -> State { +pub fn inside(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { None | Some(b'\n' | b'[' | b']') => { tokenizer.exit(Token::Data); @@ -165,13 +174,16 @@ fn label(tokenizer: &mut Tokenizer) -> State { tokenizer.exit(Token::Data); at_break(tokenizer) } else { - let func = if matches!(byte, b'\\') { escape } else { label }; tokenizer.consume(); tokenizer.tokenize_state.size += 1; if !tokenizer.tokenize_state.seen && !matches!(byte, b'\t' | b' ') { tokenizer.tokenize_state.seen = true; } - State::Fn(Box::new(func)) + State::Fn(if matches!(byte, b'\\') { + StateName::LabelEscape + } else { + StateName::LabelInside + }) } } } @@ -183,13 +195,13 @@ fn label(tokenizer: &mut Tokenizer) -> State { /// > | [a\*a] /// ^ /// ``` -fn escape(tokenizer: &mut Tokenizer) -> State { +pub fn escape(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { Some(b'[' | b'\\' | b']') => { tokenizer.consume(); tokenizer.tokenize_state.size += 1; - State::Fn(Box::new(label)) + State::Fn(StateName::LabelInside) } - _ => label(tokenizer), + _ => inside(tokenizer), } } diff --git a/src/construct/partial_non_lazy_continuation.rs b/src/construct/partial_non_lazy_continuation.rs index 6005a6c..6d5cd7a 100644 --- a/src/construct/partial_non_lazy_continuation.rs +++ b/src/construct/partial_non_lazy_continuation.rs @@ -11,7 +11,7 @@ //! [html_flow]: crate::construct::html_flow use crate::token::Token; -use crate::tokenizer::{State, Tokenizer}; +use crate::tokenizer::{State, StateName, Tokenizer}; /// Start of continuation. /// @@ -26,7 +26,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { tokenizer.enter(Token::LineEnding); tokenizer.consume(); tokenizer.exit(Token::LineEnding); - State::Fn(Box::new(after)) + State::Fn(StateName::NonLazyContinuationAfter) } _ => State::Nok, } @@ -39,7 +39,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { /// > | b /// ^ /// ``` -fn after(tokenizer: &mut Tokenizer) -> State { +pub fn after(tokenizer: &mut Tokenizer) -> State { if tokenizer.lazy { State::Nok } else { diff --git a/src/construct/partial_space_or_tab.rs b/src/construct/partial_space_or_tab.rs index e3eac45..b0b35a6 100644 --- a/src/construct/partial_space_or_tab.rs +++ b/src/construct/partial_space_or_tab.rs @@ -6,7 +6,7 @@ use crate::subtokenize::link; use crate::token::Token; -use crate::tokenizer::{ContentType, State, StateFn, Tokenizer}; +use crate::tokenizer::{ContentType, State, StateName, Tokenizer}; /// Options to parse `space_or_tab`. #[derive(Debug)] @@ -37,8 +37,8 @@ pub struct EolOptions { /// ```bnf /// space_or_tab ::= 1*( ' ' '\t' ) /// ``` -pub fn space_or_tab() -> Box<StateFn> { - space_or_tab_min_max(1, usize::MAX) +pub fn space_or_tab(tokenizer: &mut Tokenizer) -> StateName { + space_or_tab_min_max(tokenizer, 1, usize::MAX) } /// Between `x` and `y` `space_or_tab`. 
@@ -46,26 +46,27 @@ pub fn space_or_tab() -> Box<StateFn> { /// ```bnf /// space_or_tab_min_max ::= x*y( ' ' '\t' ) /// ``` -pub fn space_or_tab_min_max(min: usize, max: usize) -> Box<StateFn> { - space_or_tab_with_options(Options { - kind: Token::SpaceOrTab, - min, - max, - content_type: None, - connect: false, - }) +pub fn space_or_tab_min_max(tokenizer: &mut Tokenizer, min: usize, max: usize) -> StateName { + space_or_tab_with_options( + tokenizer, + Options { + kind: Token::SpaceOrTab, + min, + max, + content_type: None, + connect: false, + }, + ) } /// `space_or_tab`, with the given options. -pub fn space_or_tab_with_options(options: Options) -> Box<StateFn> { - Box::new(|tokenizer| { - tokenizer.tokenize_state.space_or_tab_connect = options.connect; - tokenizer.tokenize_state.space_or_tab_content_type = options.content_type; - tokenizer.tokenize_state.space_or_tab_min = options.min; - tokenizer.tokenize_state.space_or_tab_max = options.max; - tokenizer.tokenize_state.space_or_tab_token = options.kind; - start(tokenizer) - }) +pub fn space_or_tab_with_options(tokenizer: &mut Tokenizer, options: Options) -> StateName { + tokenizer.tokenize_state.space_or_tab_connect = options.connect; + tokenizer.tokenize_state.space_or_tab_content_type = options.content_type; + tokenizer.tokenize_state.space_or_tab_min = options.min; + tokenizer.tokenize_state.space_or_tab_max = options.max; + tokenizer.tokenize_state.space_or_tab_token = options.kind; + StateName::SpaceOrTabStart } /// `space_or_tab`, or optionally `space_or_tab`, one `eol`, and @@ -74,41 +75,21 @@ pub fn space_or_tab_with_options(options: Options) -> Box<StateFn> { /// ```bnf /// space_or_tab_eol ::= 1*( ' ' '\t' ) | 0*( ' ' '\t' ) eol 0*( ' ' '\t' ) /// ``` -pub fn space_or_tab_eol() -> Box<StateFn> { - space_or_tab_eol_with_options(EolOptions { - content_type: None, - connect: false, - }) +pub fn space_or_tab_eol(tokenizer: &mut Tokenizer) -> StateName { + space_or_tab_eol_with_options( + tokenizer, + EolOptions { + content_type: None, + connect: false, + }, + ) } /// `space_or_tab_eol`, with the given options. -pub fn space_or_tab_eol_with_options(options: EolOptions) -> Box<StateFn> { - Box::new(move |tokenizer| { - tokenizer.tokenize_state.space_or_tab_eol_content_type = options.content_type; - tokenizer.tokenize_state.space_or_tab_eol_connect = options.connect; - - tokenizer.attempt( - space_or_tab_with_options(Options { - kind: Token::SpaceOrTab, - min: 1, - max: usize::MAX, - content_type: tokenizer - .tokenize_state - .space_or_tab_eol_content_type - .clone(), - connect: tokenizer.tokenize_state.space_or_tab_eol_connect, - }), - move |ok| { - Box::new(move |tokenizer| { - if ok { - tokenizer.tokenize_state.space_or_tab_eol_ok = ok; - } - - after_space_or_tab(tokenizer) - }) - }, - )(tokenizer) - }) +pub fn space_or_tab_eol_with_options(tokenizer: &mut Tokenizer, options: EolOptions) -> StateName { + tokenizer.tokenize_state.space_or_tab_eol_content_type = options.content_type; + tokenizer.tokenize_state.space_or_tab_eol_connect = options.connect; + StateName::SpaceOrTabEolStart } /// Before `space_or_tab`. 
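`space_or_tab_with_options` no longer returns a closure that captures its `Options`; it writes them into `tokenize_state` fields up front and hands back `StateName::SpaceOrTabStart`. A minimal sketch of that hand-off, with `TokenizeState` reduced to the two fields used here:

```rust
// Sketch of the configuration hand-off: options move from closure captures
// to shared tokenizer state, so only a bare state name needs returning.

#[derive(Default)]
struct TokenizeState {
    space_or_tab_min: usize,
    space_or_tab_max: usize,
}

#[derive(Clone, Copy, Debug, PartialEq)]
enum StateName {
    SpaceOrTabStart,
}

fn space_or_tab_min_max(ts: &mut TokenizeState, min: usize, max: usize) -> StateName {
    // Previously a closure captured these; now they live on the tokenizer.
    ts.space_or_tab_min = min;
    ts.space_or_tab_max = max;
    StateName::SpaceOrTabStart
}

fn main() {
    let mut ts = TokenizeState::default();
    let name = space_or_tab_min_max(&mut ts, 1, usize::MAX);
    assert_eq!(name, StateName::SpaceOrTabStart);
    assert_eq!(ts.space_or_tab_min, 1);
    assert_eq!(ts.space_or_tab_max, usize::MAX);
}
```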
@@ -117,7 +98,7 @@ pub fn space_or_tab_eol_with_options(options: EolOptions) -> Box<StateFn> { /// > | a␠␠b /// ^ /// ``` -fn start(tokenizer: &mut Tokenizer) -> State { +pub fn start(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { Some(b'\t' | b' ') if tokenizer.tokenize_state.space_or_tab_max > 0 => { tokenizer.enter_with_content( @@ -144,7 +125,7 @@ fn start(tokenizer: &mut Tokenizer) -> State { /// > | a␠␠b /// ^ /// ``` -fn inside(tokenizer: &mut Tokenizer) -> State { +pub fn inside(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { Some(b'\t' | b' ') if tokenizer.tokenize_state.space_or_tab_size @@ -152,7 +133,7 @@ fn inside(tokenizer: &mut Tokenizer) -> State { { tokenizer.consume(); tokenizer.tokenize_state.space_or_tab_size += 1; - State::Fn(Box::new(inside)) + State::Fn(StateName::SpaceOrTabInside) } _ => { tokenizer.exit(tokenizer.tokenize_state.space_or_tab_token.clone()); @@ -167,7 +148,7 @@ fn inside(tokenizer: &mut Tokenizer) -> State { /// > | a␠␠b /// ^ /// ``` -fn after(tokenizer: &mut Tokenizer) -> State { +pub fn after(tokenizer: &mut Tokenizer) -> State { let state = if tokenizer.tokenize_state.space_or_tab_size >= tokenizer.tokenize_state.space_or_tab_min { @@ -184,6 +165,44 @@ fn after(tokenizer: &mut Tokenizer) -> State { state } +pub fn eol_start(tokenizer: &mut Tokenizer) -> State { + let state_name = space_or_tab_with_options( + tokenizer, + Options { + kind: Token::SpaceOrTab, + min: 1, + max: usize::MAX, + content_type: tokenizer + .tokenize_state + .space_or_tab_eol_content_type + .clone(), + connect: tokenizer.tokenize_state.space_or_tab_eol_connect, + }, + ); + + tokenizer.attempt(state_name, move |ok| { + State::Fn(if ok { + StateName::SpaceOrTabEolAfterFirst + } else { + StateName::SpaceOrTabEolAtEol + }) + }) +} + +pub fn eol_after_first(tokenizer: &mut Tokenizer) -> State { + tokenizer.tokenize_state.space_or_tab_eol_ok = true; + + if tokenizer + .tokenize_state + .space_or_tab_eol_content_type + .is_some() + { + tokenizer.tokenize_state.space_or_tab_eol_connect = true; + } + + eol_at_eol(tokenizer) +} + /// `space_or_tab_eol`: after optionally first `space_or_tab`. 
/// /// ```markdown @@ -191,16 +210,7 @@ fn after(tokenizer: &mut Tokenizer) -> State { /// ^ /// | b /// ``` -fn after_space_or_tab(tokenizer: &mut Tokenizer) -> State { - if tokenizer.tokenize_state.space_or_tab_eol_ok - && tokenizer - .tokenize_state - .space_or_tab_eol_content_type - .is_some() - { - tokenizer.tokenize_state.space_or_tab_eol_connect = true; - } - +pub fn eol_at_eol(tokenizer: &mut Tokenizer) -> State { if let Some(b'\n') = tokenizer.current { tokenizer.enter_with_content( Token::LineEnding, @@ -223,17 +233,17 @@ fn after_space_or_tab(tokenizer: &mut Tokenizer) -> State { tokenizer.consume(); tokenizer.exit(Token::LineEnding); - State::Fn(Box::new(after_eol)) + State::Fn(StateName::SpaceOrTabEolAfterEol) } else { - let state = if tokenizer.tokenize_state.space_or_tab_eol_ok { - State::Ok - } else { - State::Nok - }; + let ok = tokenizer.tokenize_state.space_or_tab_eol_ok; tokenizer.tokenize_state.space_or_tab_eol_content_type = None; tokenizer.tokenize_state.space_or_tab_eol_connect = false; tokenizer.tokenize_state.space_or_tab_eol_ok = false; - state + if ok { + State::Ok + } else { + State::Nok + } } } @@ -245,9 +255,10 @@ fn after_space_or_tab(tokenizer: &mut Tokenizer) -> State { /// ^ /// ``` #[allow(clippy::needless_pass_by_value)] -fn after_eol(tokenizer: &mut Tokenizer) -> State { - tokenizer.attempt_opt( - space_or_tab_with_options(Options { +pub fn eol_after_eol(tokenizer: &mut Tokenizer) -> State { + let state_name = space_or_tab_with_options( + tokenizer, + Options { kind: Token::SpaceOrTab, min: 1, max: usize::MAX, @@ -256,9 +267,9 @@ fn after_eol(tokenizer: &mut Tokenizer) -> State { .space_or_tab_eol_content_type .clone(), connect: tokenizer.tokenize_state.space_or_tab_eol_connect, - }), - after_more_space_or_tab, - )(tokenizer) + }, + ); + tokenizer.attempt_opt(state_name, StateName::SpaceOrTabEolAfterMore) } /// `space_or_tab_eol`: after more (optional) `space_or_tab`. @@ -268,7 +279,7 @@ fn after_eol(tokenizer: &mut Tokenizer) -> State { /// > | b /// ^ /// ``` -fn after_more_space_or_tab(tokenizer: &mut Tokenizer) -> State { +pub fn eol_after_more(tokenizer: &mut Tokenizer) -> State { tokenizer.tokenize_state.space_or_tab_eol_content_type = None; tokenizer.tokenize_state.space_or_tab_eol_connect = false; tokenizer.tokenize_state.space_or_tab_eol_ok = false; diff --git a/src/construct/partial_title.rs b/src/construct/partial_title.rs index 6bf9099..8b72608 100644 --- a/src/construct/partial_title.rs +++ b/src/construct/partial_title.rs @@ -30,10 +30,10 @@ //! [character_reference]: crate::construct::character_reference //! [label_end]: crate::construct::label_end -use super::partial_space_or_tab::{space_or_tab_eol_with_options, EolOptions}; +use crate::construct::partial_space_or_tab::{space_or_tab_eol_with_options, EolOptions}; use crate::subtokenize::link; use crate::token::Token; -use crate::tokenizer::{ContentType, State, Tokenizer}; +use crate::tokenizer::{ContentType, State, StateName, Tokenizer}; /// Before a title. 
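Because the split-out `eol_*` states communicate through the shared `space_or_tab_eol_*` fields rather than captured variables, every exit path has to clear those fields, as `eol_at_eol` and `eol_after_more` do above. A tiny model of that reset-before-return idiom, with the types reduced:

```rust
// Tiny model of the reset-before-return idiom (types reduced; the real
// fields live on the tokenizer's `tokenize_state`).

#[derive(Default)]
struct TokenizeState {
    space_or_tab_eol_ok: bool,
    space_or_tab_eol_connect: bool,
}

#[derive(Debug, PartialEq)]
enum State {
    Ok,
    Nok,
}

fn eol_done(tokenize_state: &mut TokenizeState) -> State {
    let ok = tokenize_state.space_or_tab_eol_ok;
    // Clear the shared fields so the next `space_or_tab_eol` starts fresh.
    tokenize_state.space_or_tab_eol_ok = false;
    tokenize_state.space_or_tab_eol_connect = false;
    if ok {
        State::Ok
    } else {
        State::Nok
    }
}

fn main() {
    let mut tokenize_state = TokenizeState {
        space_or_tab_eol_ok: true,
        ..TokenizeState::default()
    };
    assert_eq!(eol_done(&mut tokenize_state), State::Ok);
    // A second run sees the cleared state.
    assert_eq!(eol_done(&mut tokenize_state), State::Nok);
}
```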
/// @@ -50,7 +50,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { tokenizer.enter(tokenizer.tokenize_state.token_2.clone()); tokenizer.consume(); tokenizer.exit(tokenizer.tokenize_state.token_2.clone()); - State::Fn(Box::new(begin)) + State::Fn(StateName::TitleBegin) } _ => State::Nok, } @@ -64,7 +64,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { /// > | "a" /// ^ /// ``` -fn begin(tokenizer: &mut Tokenizer) -> State { +pub fn begin(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { Some(b'"' | b'\'' | b')') if tokenizer.current.unwrap() == tokenizer.tokenize_state.marker => @@ -90,20 +90,30 @@ fn begin(tokenizer: &mut Tokenizer) -> State { /// > | "a" /// ^ /// ``` -fn at_break(tokenizer: &mut Tokenizer) -> State { +pub fn at_break(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { None => { tokenizer.tokenize_state.marker = 0; tokenizer.tokenize_state.connect = false; State::Nok } - Some(b'\n') => tokenizer.attempt( - space_or_tab_eol_with_options(EolOptions { - content_type: Some(ContentType::String), - connect: tokenizer.tokenize_state.connect, - }), - |ok| Box::new(if ok { after_eol } else { at_blank_line }), - )(tokenizer), + Some(b'\n') => { + let state_name = space_or_tab_eol_with_options( + tokenizer, + EolOptions { + content_type: Some(ContentType::String), + connect: tokenizer.tokenize_state.connect, + }, + ); + + tokenizer.attempt(state_name, |ok| { + State::Fn(if ok { + StateName::TitleAfterEol + } else { + StateName::TitleAtBlankLine + }) + }) + } Some(b'"' | b'\'' | b')') if tokenizer.current.unwrap() == tokenizer.tokenize_state.marker => { @@ -120,19 +130,19 @@ fn at_break(tokenizer: &mut Tokenizer) -> State { tokenizer.tokenize_state.connect = true; } - title(tokenizer) + inside(tokenizer) } } } /// To do. -fn after_eol(tokenizer: &mut Tokenizer) -> State { +pub fn after_eol(tokenizer: &mut Tokenizer) -> State { tokenizer.tokenize_state.connect = true; at_break(tokenizer) } /// To do. 
-fn at_blank_line(tokenizer: &mut Tokenizer) -> State { +pub fn at_blank_line(tokenizer: &mut Tokenizer) -> State { tokenizer.tokenize_state.marker = 0; tokenizer.tokenize_state.connect = false; State::Nok @@ -144,7 +154,7 @@ fn at_blank_line(tokenizer: &mut Tokenizer) -> State { /// > | "a" /// ^ /// ``` -fn title(tokenizer: &mut Tokenizer) -> State { +pub fn inside(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { None | Some(b'\n') => { tokenizer.exit(Token::Data); @@ -157,9 +167,12 @@ fn title(tokenizer: &mut Tokenizer) -> State { at_break(tokenizer) } Some(byte) => { - let func = if matches!(byte, b'\\') { escape } else { title }; tokenizer.consume(); - State::Fn(Box::new(func)) + State::Fn(if matches!(byte, b'\\') { + StateName::TitleEscape + } else { + StateName::TitleInside + }) } } } @@ -170,12 +183,12 @@ fn title(tokenizer: &mut Tokenizer) -> State { /// > | "a\*b" /// ^ /// ``` -fn escape(tokenizer: &mut Tokenizer) -> State { +pub fn escape(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { Some(b'"' | b'\'' | b')') => { tokenizer.consume(); - State::Fn(Box::new(title)) + State::Fn(StateName::TitleInside) } - _ => title(tokenizer), + _ => inside(tokenizer), } } diff --git a/src/construct/thematic_break.rs b/src/construct/thematic_break.rs index 2ed2046..4ed25b6 100644 --- a/src/construct/thematic_break.rs +++ b/src/construct/thematic_break.rs @@ -51,7 +51,7 @@ use super::partial_space_or_tab::{space_or_tab, space_or_tab_min_max}; use crate::constant::{TAB_SIZE, THEMATIC_BREAK_MARKER_COUNT_MIN}; use crate::token::Token; -use crate::tokenizer::{State, Tokenizer}; +use crate::tokenizer::{State, StateName, Tokenizer}; /// Start of a thematic break. /// @@ -62,17 +62,17 @@ use crate::tokenizer::{State, Tokenizer}; pub fn start(tokenizer: &mut Tokenizer) -> State { if tokenizer.parse_state.constructs.thematic_break { tokenizer.enter(Token::ThematicBreak); - tokenizer.go( - space_or_tab_min_max( - 0, - if tokenizer.parse_state.constructs.code_indented { - TAB_SIZE - 1 - } else { - usize::MAX - }, - ), - before, - )(tokenizer) + let state_name = space_or_tab_min_max( + tokenizer, + 0, + if tokenizer.parse_state.constructs.code_indented { + TAB_SIZE - 1 + } else { + usize::MAX + }, + ); + + tokenizer.go(state_name, StateName::ThematicBreakBefore) } else { State::Nok } @@ -84,7 +84,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { /// > | *** /// ^ /// ``` -fn before(tokenizer: &mut Tokenizer) -> State { +pub fn before(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { Some(b'*' | b'-' | b'_') => { tokenizer.tokenize_state.marker = tokenizer.current.unwrap(); @@ -100,7 +100,7 @@ fn before(tokenizer: &mut Tokenizer) -> State { /// > | *** /// ^ /// ``` -fn at_break(tokenizer: &mut Tokenizer) -> State { +pub fn at_break(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { None | Some(b'\n') if tokenizer.tokenize_state.size >= THEMATIC_BREAK_MARKER_COUNT_MIN => { tokenizer.tokenize_state.marker = 0; @@ -130,18 +130,19 @@ fn at_break(tokenizer: &mut Tokenizer) -> State { /// > | *** /// ^ /// ``` -fn sequence(tokenizer: &mut Tokenizer) -> State { +pub fn sequence(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { Some(b'*' | b'-' | b'_') if tokenizer.current.unwrap() == tokenizer.tokenize_state.marker => { tokenizer.consume(); tokenizer.tokenize_state.size += 1; - State::Fn(Box::new(sequence)) + State::Fn(StateName::ThematicBreakSequence) } _ => { tokenizer.exit(Token::ThematicBreakSequence); - 
tokenizer.attempt_opt(space_or_tab(), at_break)(tokenizer) + let state_name = space_or_tab(tokenizer); + tokenizer.attempt_opt(state_name, StateName::ThematicBreakAtBreak) } } } diff --git a/src/content/document.rs b/src/content/document.rs index 33c8ff9..7a43d48 100644 --- a/src/content/document.rs +++ b/src/content/document.rs @@ -8,16 +8,13 @@ //! * [Block quote][crate::construct::block_quote] //! * [List][crate::construct::list] -use crate::construct::{ - block_quote::{cont as block_quote_cont, start as block_quote}, - list::{cont as list_item_const, start as list_item}, - partial_bom::start as bom, -}; -use crate::content::flow::start as flow; use crate::parser::ParseState; use crate::subtokenize::subtokenize; use crate::token::Token; -use crate::tokenizer::{Container, ContainerState, Event, EventType, Point, State, Tokenizer}; +use crate::tokenizer::{ + Container, ContainerState, ContentType, Event, EventType, Link, Point, State, StateName, + Tokenizer, +}; use crate::util::{ normalize_identifier::normalize_identifier, skip, @@ -59,7 +56,7 @@ enum Phase { pub fn document(parse_state: &mut ParseState, point: Point) -> Vec<Event> { let mut tokenizer = Tokenizer::new(point, parse_state); - let state = tokenizer.push(0, parse_state.bytes.len(), Box::new(start)); + let state = tokenizer.push(0, parse_state.bytes.len(), StateName::DocumentStart); tokenizer.flush(state, true); let mut index = 0; @@ -103,8 +100,13 @@ pub fn document(parse_state: &mut ParseState, point: Point) -> Vec<Event> { /// > | a /// ^ /// ``` -fn start(tokenizer: &mut Tokenizer) -> State { - tokenizer.attempt_opt(bom, line_start)(tokenizer) +pub fn start(tokenizer: &mut Tokenizer) -> State { + tokenizer.tokenize_state.child_tokenizer = Some(Box::new(Tokenizer::new( + tokenizer.point.clone(), + tokenizer.parse_state, + ))); + tokenizer.tokenize_state.document_child_state = Some(State::Fn(StateName::FlowStart)); + tokenizer.attempt_opt(StateName::BomStart, StateName::DocumentLineStart) } /// Start of a line. @@ -115,13 +117,8 @@ fn start(tokenizer: &mut Tokenizer) -> State { /// > | > b /// ^ /// ``` -fn line_start(tokenizer: &mut Tokenizer) -> State { +pub fn line_start(tokenizer: &mut Tokenizer) -> State { tokenizer.tokenize_state.document_continued = 0; - tokenizer.tokenize_state.document_index = tokenizer.events.len(); - tokenizer - .tokenize_state - .document_inject - .push((vec![], vec![])); // Containers would only be interrupting if we’ve continued. tokenizer.interrupt = false; container_existing_before(tokenizer) @@ -134,7 +131,7 @@ fn line_start(tokenizer: &mut Tokenizer) -> State { /// > | > b /// ^ /// ``` -fn container_existing_before(tokenizer: &mut Tokenizer) -> State { +pub fn container_existing_before(tokenizer: &mut Tokenizer) -> State { // If there are more existing containers, check whether the next one continues. 
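In `document.rs`, `start` now creates a child `Tokenizer` and parks it in `tokenize_state.child_tokenizer`; flow content is later fed to that child as byte ranges rather than tokenized inline between container states. A toy reduction of that ownership shape, with a hypothetical `Child` standing in for the real tokenizer:

```rust
// Toy reduction (hypothetical `Child`; the real child is a full
// `Tokenizer`): the document side keeps a boxed child in shared state and
// feeds it byte ranges of flow content.

#[derive(Default)]
struct Child {
    pushed: Vec<(usize, usize)>,
}

impl Child {
    fn push(&mut self, start: usize, end: usize) {
        self.pushed.push((start, end));
    }
}

#[derive(Default)]
struct TokenizeState {
    child_tokenizer: Option<Box<Child>>,
}

fn main() {
    let mut tokenize_state = TokenizeState::default();
    tokenize_state.child_tokenizer = Some(Box::new(Child::default()));

    // Hand one line of flow content to the child, identified by offsets.
    if let Some(child) = tokenize_state.child_tokenizer.as_mut() {
        child.push(2, 7);
    }

    assert_eq!(tokenize_state.child_tokenizer.unwrap().pushed, vec![(2, 7)]);
}
```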
if tokenizer.tokenize_state.document_continued < tokenizer.tokenize_state.document_container_stack.len() @@ -143,19 +140,19 @@ fn container_existing_before(tokenizer: &mut Tokenizer) -> State { .tokenize_state .document_container_stack .remove(tokenizer.tokenize_state.document_continued); - let cont = match container.kind { - Container::BlockQuote => block_quote_cont, - Container::ListItem => list_item_const, + let state_name = match container.kind { + Container::BlockQuote => StateName::BlockQuoteContStart, + Container::ListItem => StateName::ListContStart, }; tokenizer.container = Some(container); - tokenizer.attempt(cont, |ok| { - Box::new(if ok { - container_existing_after + tokenizer.attempt(state_name, |ok| { + State::Fn(if ok { + StateName::DocumentContainerExistingAfter } else { - container_existing_missing + StateName::DocumentContainerExistingMissing }) - })(tokenizer) + }) } // Otherwise, check new containers. else { @@ -170,7 +167,7 @@ fn container_existing_before(tokenizer: &mut Tokenizer) -> State { /// > | > b /// ^ /// ``` -fn container_existing_missing(tokenizer: &mut Tokenizer) -> State { +pub fn container_existing_missing(tokenizer: &mut Tokenizer) -> State { let container = tokenizer.container.take().unwrap(); tokenizer .tokenize_state @@ -186,7 +183,7 @@ fn container_existing_missing(tokenizer: &mut Tokenizer) -> State { /// > | b /// ^ /// ``` -fn container_existing_after(tokenizer: &mut Tokenizer) -> State { +pub fn container_existing_after(tokenizer: &mut Tokenizer) -> State { let container = tokenizer.container.take().unwrap(); tokenizer .tokenize_state @@ -204,17 +201,28 @@ fn container_existing_after(tokenizer: &mut Tokenizer) -> State { /// > | > b /// ^ /// ``` -fn container_new_before(tokenizer: &mut Tokenizer) -> State { +pub fn container_new_before(tokenizer: &mut Tokenizer) -> State { // If we have completely continued, restore the flow’s past `interrupt` // status. if tokenizer.tokenize_state.document_continued == tokenizer.tokenize_state.document_container_stack.len() { - tokenizer.interrupt = tokenizer.tokenize_state.document_interrupt_before; + tokenizer.interrupt = tokenizer + .tokenize_state + .child_tokenizer + .as_ref() + .unwrap() + .interrupt; // …and if we’re in a concrete construct, new containers can’t “pierce” // into them. - if tokenizer.concrete { + if tokenizer + .tokenize_state + .child_tokenizer + .as_ref() + .unwrap() + .concrete + { return containers_after(tokenizer); } } @@ -227,17 +235,17 @@ fn container_new_before(tokenizer: &mut Tokenizer) -> State { size: 0, }); - tokenizer.attempt(block_quote, |ok| { - Box::new(if ok { - container_new_after + tokenizer.attempt(StateName::BlockQuoteStart, |ok| { + State::Fn(if ok { + StateName::DocumentContainerNewAfter } else { - container_new_before_not_blockquote + StateName::DocumentContainerNewBeforeNotBlockQuote }) - })(tokenizer) + }) } /// To do. -fn container_new_before_not_blockquote(tokenizer: &mut Tokenizer) -> State { +pub fn container_new_before_not_block_quote(tokenizer: &mut Tokenizer) -> State { // List item? 
tokenizer.container = Some(ContainerState { kind: Container::ListItem, @@ -245,13 +253,13 @@ fn container_new_before_not_blockquote(tokenizer: &mut Tokenizer) -> State { size: 0, }); - tokenizer.attempt(list_item, |ok| { - Box::new(if ok { - container_new_after + tokenizer.attempt(StateName::ListStart, |ok| { + State::Fn(if ok { + StateName::DocumentContainerNewAfter } else { - containers_after + StateName::DocumentContainersAfter }) - })(tokenizer) + }) } /// After a new container. @@ -262,31 +270,9 @@ fn container_new_before_not_blockquote(tokenizer: &mut Tokenizer) -> State { /// > | > b /// ^ /// ``` -fn container_new_after(tokenizer: &mut Tokenizer) -> State { +pub fn container_new_after(tokenizer: &mut Tokenizer) -> State { let container = tokenizer.container.take().unwrap(); - // Remove from the event stack. - // We’ll properly add exits at different points manually. - let token_type = match container.kind { - Container::BlockQuote => Token::BlockQuote, - Container::ListItem => Token::ListItem, - }; - - let mut stack_index = tokenizer.stack.len(); - let mut found = false; - - while stack_index > 0 { - stack_index -= 1; - - if tokenizer.stack[stack_index] == token_type { - tokenizer.stack.remove(stack_index); - found = true; - break; - } - } - - debug_assert!(found, "expected to find container token to exit"); - // If we did not continue all existing containers, and there is a new one, // close the flow and those containers. if tokenizer.tokenize_state.document_continued @@ -314,37 +300,55 @@ fn container_new_after(tokenizer: &mut Tokenizer) -> State { /// > | > b /// ^ /// ``` -fn containers_after(tokenizer: &mut Tokenizer) -> State { - // Store the container events we parsed. - tokenizer - .tokenize_state - .document_inject - .last_mut() - .unwrap() - .0 - .append( - &mut tokenizer - .events - .split_off(tokenizer.tokenize_state.document_index), - ); +pub fn containers_after(tokenizer: &mut Tokenizer) -> State { + if let Some(ref mut child) = tokenizer.tokenize_state.child_tokenizer { + child.lazy = tokenizer.tokenize_state.document_continued + != tokenizer.tokenize_state.document_container_stack.len(); + child.interrupt = tokenizer.tokenize_state.document_interrupt_before; + child.define_skip(tokenizer.point.clone()); + } - tokenizer.lazy = tokenizer.tokenize_state.document_continued - != tokenizer.tokenize_state.document_container_stack.len(); - tokenizer.interrupt = tokenizer.tokenize_state.document_interrupt_before; - tokenizer.define_skip_current(); + match tokenizer.current { + // Note: EOL is part of data. + None => flow_end(tokenizer), + Some(_) => { + let current = tokenizer.events.len(); + let previous = tokenizer.tokenize_state.document_data_index.take(); + if let Some(previous) = previous { + tokenizer.events[previous].link.as_mut().unwrap().next = Some(current); + } + tokenizer.tokenize_state.document_data_index = Some(current); + tokenizer.enter_with_link( + Token::Data, + Some(Link { + previous, + next: None, + content_type: ContentType::Flow, + }), + ); + flow_inside(tokenizer) + } + } +} - let state = tokenizer - .tokenize_state - .document_next - .take() - .unwrap_or_else(|| Box::new(flow)); - - // Parse flow, pausing after eols. - tokenizer.go_until( - state, - |code| matches!(code, Some(b'\n')), - |state| Box::new(|t| flow_end(t, state)), - )(tokenizer) +/// To do. +pub fn flow_inside(tokenizer: &mut Tokenizer) -> State { + match tokenizer.current { + None => { + tokenizer.exit(Token::Data); + flow_end(tokenizer) + } + // Note: EOL is part of data. 
+ Some(b'\n') => { + tokenizer.consume(); + tokenizer.exit(Token::Data); + State::Fn(StateName::DocumentFlowEnd) + } + Some(_) => { + tokenizer.consume(); + State::Fn(StateName::DocumentFlowInside) + } + } } /// After flow (after eol or at eof). @@ -354,42 +358,70 @@ fn containers_after(tokenizer: &mut Tokenizer) -> State { /// > | > b /// ^ ^ /// ``` -fn flow_end(tokenizer: &mut Tokenizer, result: State) -> State { - let paragraph = !tokenizer.events.is_empty() - && tokenizer.events[skip::opt_back( - &tokenizer.events, - tokenizer.events.len() - 1, - &[Token::LineEnding], - )] - .token_type - == Token::Paragraph; - - if tokenizer.lazy && paragraph && tokenizer.tokenize_state.document_paragraph_before { - tokenizer.tokenize_state.document_continued = - tokenizer.tokenize_state.document_container_stack.len(); - } - - if tokenizer.tokenize_state.document_continued - != tokenizer.tokenize_state.document_container_stack.len() +pub fn flow_end(tokenizer: &mut Tokenizer) -> State { + let mut paragraph = false; + let mut interrupt = false; + + // We have new data. + // Note that everything except for a `null` is data. + if tokenizer.events.len() > 1 + && tokenizer.events[tokenizer.events.len() - 1].token_type == Token::Data { - exit_containers(tokenizer, &Phase::After); - } + let position = Position::from_exit_event(&tokenizer.events, tokenizer.events.len() - 1); + + let state = tokenizer + .tokenize_state + .document_child_state + .take() + .unwrap_or(State::Fn(StateName::FlowStart)); + + let state_name = match state { + State::Fn(state_name) => state_name, + _ => unreachable!("expected state name"), + }; + + if let Some(ref mut child) = tokenizer.tokenize_state.child_tokenizer { + // To do: handle VS? + // if position.start.vs > 0 { + // } + let state = child.push(position.start.index, position.end.index, state_name); + + interrupt = child.interrupt; + paragraph = matches!(state, State::Fn(StateName::ParagraphInside)) + || (!child.events.is_empty() + && child.events[skip::opt_back( + &child.events, + child.events.len() - 1, + &[Token::LineEnding], + )] + .token_type + == Token::Paragraph); + + tokenizer.tokenize_state.document_child_state = Some(state); + + if child.lazy && paragraph && tokenizer.tokenize_state.document_paragraph_before { + tokenizer.tokenize_state.document_continued = + tokenizer.tokenize_state.document_container_stack.len(); + } - match result { - State::Ok => { - if !tokenizer.tokenize_state.document_container_stack.is_empty() { - tokenizer.tokenize_state.document_continued = 0; - exit_containers(tokenizer, &Phase::Eof); + if tokenizer.tokenize_state.document_continued + != tokenizer.tokenize_state.document_container_stack.len() + { + exit_containers(tokenizer, &Phase::After); } + } + } + match tokenizer.current { + None => { + tokenizer.tokenize_state.document_continued = 0; + exit_containers(tokenizer, &Phase::Eof); resolve(tokenizer); State::Ok } - State::Nok => unreachable!("unexpected `nok` from flow"), - State::Fn(func) => { + Some(_) => { tokenizer.tokenize_state.document_paragraph_before = paragraph; - tokenizer.tokenize_state.document_interrupt_before = tokenizer.interrupt; - tokenizer.tokenize_state.document_next = Some(func); + tokenizer.tokenize_state.document_interrupt_before = interrupt; line_start(tokenizer) } } @@ -403,98 +435,248 @@ fn exit_containers(tokenizer: &mut Tokenizer, phase: &Phase) { .split_off(tokenizer.tokenize_state.document_continued); // So, we’re at the end of a line, but we need to close the *previous* line. 
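`flow_end` above decides laziness by asking where the child paused: stopping in `StateName::ParagraphInside`, or ending on a `Paragraph` exit, marks the line as paragraph content, the only thing a lazy line may continue. A hedged reduction of that check:

```rust
// Hypothetical reduction of the paragraph check in `flow_end`: a lazy line
// may only continue the previous block when the child tokenizer paused
// inside a paragraph.

enum StateName {
    ParagraphInside,
    FlowStart,
}

enum State {
    Ok,
    Fn(StateName),
}

fn continues_paragraph(child_state: &State, lazy: bool) -> bool {
    lazy && matches!(child_state, State::Fn(StateName::ParagraphInside))
}

fn main() {
    assert!(continues_paragraph(&State::Fn(StateName::ParagraphInside), true));
    assert!(!continues_paragraph(&State::Fn(StateName::FlowStart), true));
    assert!(!continues_paragraph(&State::Ok, false));
}
```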
- if *phase != Phase::Eof { - tokenizer.define_skip_current(); - let mut current_events = tokenizer - .events - .split_off(tokenizer.tokenize_state.document_index); - let state = tokenizer - .tokenize_state - .document_next - .take() - .unwrap_or_else(|| Box::new(flow)); - tokenizer.flush(State::Fn(state), false); - - if *phase == Phase::Prefix { - tokenizer.tokenize_state.document_index = tokenizer.events.len(); + if let Some(ref mut child) = tokenizer.tokenize_state.child_tokenizer { + if *phase != Phase::After { + let state = tokenizer + .tokenize_state + .document_child_state + .take() + .unwrap_or(State::Fn(StateName::FlowStart)); + + child.flush(state, false); } - tokenizer.events.append(&mut current_events); - } + if !stack_close.is_empty() { + let mut inject_index = tokenizer.events.len(); - let mut exits = Vec::with_capacity(stack_close.len()); + // Move past the current data to find the last container start if we’re + // closing due to a potential lazy flow that was not lazy. + if *phase == Phase::After { + inject_index -= 2; + } - while !stack_close.is_empty() { - let container = stack_close.pop().unwrap(); - let token_type = match container.kind { - Container::BlockQuote => Token::BlockQuote, - Container::ListItem => Token::ListItem, - }; + // Move past the container starts to find the last data if we’re + // closing due to a different container or lazy flow like above. + if *phase == Phase::After || *phase == Phase::Prefix { + while inject_index > 0 { + let event = &tokenizer.events[inject_index - 1]; + + if event.token_type == Token::Data { + break; + } + + inject_index -= 1; + } + } + + // Move past data starts that are just whitespace only without + // container starts. + while inject_index > 0 { + let event = &tokenizer.events[inject_index - 1]; + + if event.token_type == Token::Data { + if event.event_type == EventType::Exit { + let slice = Slice::from_position( + tokenizer.parse_state.bytes, + &Position::from_exit_event(&tokenizer.events, inject_index - 1), + ); + let bytes = slice.bytes; + let mut whitespace = true; + let mut index = 0; + while index < bytes.len() { + match bytes[index] { + b'\t' | b'\n' | b'\r' | b' ' => index += 1, + _ => { + whitespace = false; + break; + } + } + } + + if !whitespace { + break; + } + } + } else { + break; + } + + inject_index -= 1; + } + + let ref_point = if inject_index == tokenizer.events.len() { + tokenizer.point.clone() + } else { + tokenizer.events[inject_index].point.clone() + }; + + let mut exits = Vec::with_capacity(stack_close.len()); + + while !stack_close.is_empty() { + let container = stack_close.pop().unwrap(); + let token_type = match container.kind { + Container::BlockQuote => Token::BlockQuote, + Container::ListItem => Token::ListItem, + }; + + exits.push(Event { + event_type: EventType::Exit, + token_type: token_type.clone(), + point: ref_point.clone(), + link: None, + }); + + let mut stack_index = tokenizer.stack.len(); + let mut found = false; + + while stack_index > 0 { + stack_index -= 1; + + if tokenizer.stack[stack_index] == token_type { + tokenizer.stack.remove(stack_index); + found = true; + break; + } + } + + debug_assert!(found, "expected to find container token to exit"); + } - exits.push(Event { - event_type: EventType::Exit, - token_type: token_type.clone(), - // Note: positions are fixed later. 
- point: tokenizer.point.clone(), - link: None, - }); + tokenizer.map.add(inject_index, 0, exits); + } } - let index = - tokenizer.tokenize_state.document_inject.len() - (if *phase == Phase::Eof { 1 } else { 2 }); - tokenizer.tokenize_state.document_inject[index] - .1 - .append(&mut exits); tokenizer.tokenize_state.document_interrupt_before = false; } // Inject the container events. fn resolve(tokenizer: &mut Tokenizer) { - let mut index = 0; - let mut inject = tokenizer.tokenize_state.document_inject.split_off(0); - inject.reverse(); - let mut first_line_ending_in_run = None; - - while let Some((before, mut after)) = inject.pop() { - if !before.is_empty() { - first_line_ending_in_run = None; - tokenizer.map.add(index, 0, before); - } + let mut child = tokenizer.tokenize_state.child_tokenizer.take().unwrap(); + child.map.consume(&mut child.events); + // To do: see if we can do this less. + tokenizer.map.consume(&mut tokenizer.events); - while index < tokenizer.events.len() { - let event = &tokenizer.events[index]; + let mut link_index = skip::to(&tokenizer.events, 0, &[Token::Data]); + // To do: share this code with `subtokenize`. + // Now, loop through all subevents to figure out which parts + // belong where and fix deep links. + let mut subindex = 0; + let mut slices = vec![]; + let mut slice_start = 0; + let mut old_prev: Option<usize> = None; + + while subindex < child.events.len() { + // Find the first event that starts after the end we’re looking + // for. + if child.events[subindex].event_type == EventType::Enter + && child.events[subindex].point.index >= tokenizer.events[link_index + 1].point.index + { + slices.push((link_index, slice_start)); + slice_start = subindex; + link_index = tokenizer.events[link_index] + .link + .as_ref() + .unwrap() + .next + .unwrap(); + } - if event.token_type == Token::LineEnding || event.token_type == Token::BlankLineEnding { - if event.event_type == EventType::Enter { - first_line_ending_in_run = first_line_ending_in_run.or(Some(index)); + // Fix sublinks. + if let Some(sublink_curr) = &child.events[subindex].link { + if sublink_curr.previous.is_some() { + let old_prev = old_prev.unwrap(); + let prev_event = &mut child.events[old_prev]; + // The `index` in `events` where the current link is, + // minus one to get the previous link, + // minus 2 events (the enter and exit) for each removed + // link. + let new_link = if slices.is_empty() { + old_prev + link_index + 2 } else { - index += 1; - break; - } - } else if event.token_type == Token::SpaceOrTab { - // Empty to allow whitespace in blank lines. - } else if first_line_ending_in_run.is_some() { - first_line_ending_in_run = None; + old_prev + link_index - (slices.len() - 1) * 2 + }; + prev_event.link.as_mut().unwrap().next = Some(new_link); } + } - index += 1; + // If there is a `next` link in the subevents, we have to change + // its `previous` index to account for the shifted events. + // If it points to a next event, we also change the next event’s + // reference back to *this* event. + if let Some(sublink_curr) = &child.events[subindex].link { + if let Some(next) = sublink_curr.next { + let sublink_next = child.events[next].link.as_mut().unwrap(); + + old_prev = sublink_next.previous; + + sublink_next.previous = sublink_next + .previous + // The `index` in `events` where the current link is, + // minus 2 events (the enter and exit) for each removed + // link. 
+ .map(|previous| previous + link_index - (slices.len() * 2)); + } } - let point_rel = if let Some(index) = first_line_ending_in_run { - &tokenizer.events[index].point - } else { - &tokenizer.point - }; + subindex += 1; + } - let close_index = first_line_ending_in_run.unwrap_or(index); + if !child.events.is_empty() { + slices.push((link_index, slice_start)); + } + + // Finally, inject the subevents. + let mut index = slices.len(); + + while index > 0 { + index -= 1; + let start = slices[index].0; + tokenizer.map.add( + start, + if start == tokenizer.events.len() { + 0 + } else { + 2 + }, + child.events.split_off(slices[index].1), + ); + } + // To do: share the above code with `subtokenize`. - let mut subevent_index = 0; - while subevent_index < after.len() { - after[subevent_index].point = point_rel.clone(); - subevent_index += 1; + let mut resolvers = child.resolvers.split_off(0); + let mut resolver_ids = child.resolver_ids.split_off(0); + tokenizer.resolvers.append(&mut resolvers); + tokenizer.resolver_ids.append(&mut resolver_ids); + + // To do: see if we can do this less. + tokenizer.map.consume(&mut tokenizer.events); + + let mut index = 0; + let mut last_eol_enter: Option<usize> = None; + while index < tokenizer.events.len() { + let event = &tokenizer.events[index]; + + if event.event_type == EventType::Exit { + if event.token_type == Token::BlockQuote || event.token_type == Token::ListItem { + if let Some(inject) = last_eol_enter { + let point = tokenizer.events[inject].point.clone(); + let mut clone = event.clone(); + clone.point = point; + // Inject a fixed exit. + tokenizer.map.add(inject, 0, vec![clone]); + // Remove this exit. + tokenizer.map.add(index, 1, vec![]); + } + } else if event.token_type == Token::LineEnding + || event.token_type == Token::BlankLineEnding + { + last_eol_enter = Some(index - 1); + } else { + last_eol_enter = None; + } } - tokenizer.map.add(close_index, 0, after); + index += 1; } tokenizer.map.consume(&mut tokenizer.events); diff --git a/src/content/flow.rs b/src/content/flow.rs index bf4104c..6f62901 100644 --- a/src/content/flow.rs +++ b/src/content/flow.rs @@ -19,15 +19,8 @@ //! * [HTML (flow)][crate::construct::html_flow] //! * [Thematic break][crate::construct::thematic_break] -use crate::construct::{ - blank_line::start as blank_line, code_fenced::start as code_fenced, - code_indented::start as code_indented, definition::start as definition, - heading_atx::start as heading_atx, heading_setext::start as heading_setext, - html_flow::start as html_flow, paragraph::start as paragraph, - thematic_break::start as thematic_break, -}; use crate::token::Token; -use crate::tokenizer::{State, Tokenizer}; +use crate::tokenizer::{State, StateName, Tokenizer}; /// Before flow. 
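`resolve` splices the child's events into the parent through the map: `map.add(index, remove, events)` records an edit and `map.consume` applies them in one pass. A compressed model of that deferred-edit idea; applying from the end keeps earlier indices valid, and the real `EditMap` additionally rewires links:

```rust
// Compressed model of the edit-map idea: record (index, remove, insert)
// edits, then apply them from the highest index down so earlier indices
// stay valid while the vector is being rewritten.

fn apply(events: &mut Vec<u32>, mut edits: Vec<(usize, usize, Vec<u32>)>) {
    edits.sort_by_key(|edit| edit.0);
    for (index, remove, insert) in edits.into_iter().rev() {
        // Drop the returned iterator to commit the splice.
        drop(events.splice(index..index + remove, insert));
    }
}

fn main() {
    let mut events = vec![1, 2, 5];
    // One recorded edit: at index 2, remove nothing, insert `3, 4`.
    apply(&mut events, vec![(2, 0, vec![3, 4])]);
    assert_eq!(events, vec![1, 2, 3, 4, 5]);
}
```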
/// @@ -42,9 +35,13 @@ use crate::tokenizer::{State, Tokenizer}; pub fn start(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { None => State::Ok, - _ => tokenizer.attempt(blank_line, |ok| { - Box::new(if ok { blank_line_after } else { initial_before }) - })(tokenizer), + _ => tokenizer.attempt(StateName::BlankLineStart, |ok| { + State::Fn(if ok { + StateName::FlowBlankLineAfter + } else { + StateName::FlowBefore + }) + }), } } @@ -60,21 +57,27 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { /// |~~~js /// |<div> /// ``` -fn initial_before(tokenizer: &mut Tokenizer) -> State { +pub fn before(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { None => State::Ok, _ => tokenizer.attempt_n( vec![ - Box::new(code_indented), - Box::new(code_fenced), - Box::new(html_flow), - Box::new(heading_atx), - Box::new(heading_setext), - Box::new(thematic_break), - Box::new(definition), + StateName::CodeIndentedStart, + StateName::CodeFencedStart, + StateName::HtmlFlowStart, + StateName::HeadingAtxStart, + StateName::HeadingSetextStart, + StateName::ThematicBreakStart, + StateName::DefinitionStart, ], - |ok| Box::new(if ok { after } else { before_paragraph }), - )(tokenizer), + |ok| { + State::Fn(if ok { + StateName::FlowAfter + } else { + StateName::FlowBeforeParagraph + }) + }, + ), } } @@ -85,7 +88,7 @@ fn initial_before(tokenizer: &mut Tokenizer) -> State { /// ```markdown /// ␠␠| /// ``` -fn blank_line_after(tokenizer: &mut Tokenizer) -> State { +pub fn blank_line_after(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { None => State::Ok, Some(b'\n') => { @@ -94,7 +97,7 @@ fn blank_line_after(tokenizer: &mut Tokenizer) -> State { tokenizer.exit(Token::BlankLineEnding); // Feel free to interrupt. tokenizer.interrupt = false; - State::Fn(Box::new(start)) + State::Fn(StateName::FlowStart) } _ => unreachable!("expected eol/eof"), } @@ -109,14 +112,14 @@ fn blank_line_after(tokenizer: &mut Tokenizer) -> State { /// asd /// ~~~| /// ``` -fn after(tokenizer: &mut Tokenizer) -> State { +pub fn after(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { None => State::Ok, Some(b'\n') => { tokenizer.enter(Token::LineEnding); tokenizer.consume(); tokenizer.exit(Token::LineEnding); - State::Fn(Box::new(start)) + State::Fn(StateName::FlowStart) } _ => unreachable!("expected eol/eof"), } @@ -127,6 +130,6 @@ fn after(tokenizer: &mut Tokenizer) -> State { /// ```markdown /// |asd /// ``` -fn before_paragraph(tokenizer: &mut Tokenizer) -> State { - tokenizer.go(paragraph, after)(tokenizer) +pub fn before_paragraph(tokenizer: &mut Tokenizer) -> State { + tokenizer.go(StateName::ParagraphStart, StateName::FlowAfter) } diff --git a/src/content/string.rs b/src/content/string.rs index 2e738fb..697ec2c 100644 --- a/src/content/string.rs +++ b/src/content/string.rs @@ -12,11 +12,8 @@ //! //! [text]: crate::content::text -use crate::construct::{ - character_escape::start as character_escape, character_reference::start as character_reference, - partial_data::start as data, partial_whitespace::resolve_whitespace, -}; -use crate::tokenizer::{State, Tokenizer}; +use crate::construct::partial_whitespace::resolve_whitespace; +use crate::tokenizer::{State, StateName, Tokenizer}; const MARKERS: [u8; 2] = [b'&', b'\\']; @@ -28,19 +25,28 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { } /// Before string. 
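`flow::before` keeps the try-constructs-in-order strategy, but `attempt_n` now receives a `Vec` of `StateName`s rather than boxed functions. A toy version of the ordering contract, with constructs reduced to simple predicates:

```rust
// Toy `attempt_n`: try a list of constructs in order; the first that
// succeeds wins, otherwise fall back (here: to "paragraph").

fn attempt_n(tries: &[fn(&str) -> bool], input: &str) -> &'static str {
    for try_fn in tries {
        if try_fn(input) {
            return "construct";
        }
    }
    "paragraph"
}

fn main() {
    // Stand-ins for e.g. heading (atx) and block quote starts.
    let tries: &[fn(&str) -> bool] = &[|s| s.starts_with('#'), |s| s.starts_with('>')];
    assert_eq!(attempt_n(tries, "# heading"), "construct");
    assert_eq!(attempt_n(tries, "plain"), "paragraph");
}
```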
-fn before(tokenizer: &mut Tokenizer) -> State { +pub fn before(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { None => State::Ok, _ => tokenizer.attempt_n( - vec![Box::new(character_reference), Box::new(character_escape)], - |ok| Box::new(if ok { before } else { before_data }), - )(tokenizer), + vec![ + StateName::CharacterReferenceStart, + StateName::CharacterEscapeStart, + ], + |ok| { + State::Fn(if ok { + StateName::StringBefore + } else { + StateName::StringBeforeData + }) + }, + ), } } /// At data. -fn before_data(tokenizer: &mut Tokenizer) -> State { - tokenizer.go(data, before)(tokenizer) +pub fn before_data(tokenizer: &mut Tokenizer) -> State { + tokenizer.go(StateName::DataStart, StateName::StringBefore) } /// Resolve whitespace. diff --git a/src/content/text.rs b/src/content/text.rs index f4666d1..d8a2726 100644 --- a/src/content/text.rs +++ b/src/content/text.rs @@ -20,15 +20,8 @@ //! > 👉 **Note**: for performance reasons, hard break (trailing) is formed by //! > [whitespace][crate::construct::partial_whitespace]. -use crate::construct::{ - attention::start as attention, autolink::start as autolink, - character_escape::start as character_escape, character_reference::start as character_reference, - code_text::start as code_text, hard_break_escape::start as hard_break_escape, - html_text::start as html_text, label_end::start as label_end, - label_start_image::start as label_start_image, label_start_link::start as label_start_link, - partial_data::start as data, partial_whitespace::resolve_whitespace, -}; -use crate::tokenizer::{State, Tokenizer}; +use crate::construct::partial_whitespace::resolve_whitespace; +use crate::tokenizer::{State, StateName, Tokenizer}; const MARKERS: [u8; 9] = [ b'!', // `label_start_image` @@ -55,19 +48,25 @@ pub fn before(tokenizer: &mut Tokenizer) -> State { None => State::Ok, _ => tokenizer.attempt_n( vec![ - Box::new(attention), - Box::new(autolink), - Box::new(character_escape), - Box::new(character_reference), - Box::new(code_text), - Box::new(hard_break_escape), - Box::new(html_text), - Box::new(label_end), - Box::new(label_start_image), - Box::new(label_start_link), + StateName::AttentionStart, + StateName::AutolinkStart, + StateName::CharacterEscapeStart, + StateName::CharacterReferenceStart, + StateName::CodeTextStart, + StateName::HardBreakEscapeStart, + StateName::HtmlTextStart, + StateName::LabelEndStart, + StateName::LabelStartImageStart, + StateName::LabelStartLinkStart, ], - |ok| Box::new(if ok { before } else { before_data }), - )(tokenizer), + |ok| { + State::Fn(if ok { + StateName::TextBefore + } else { + StateName::TextBeforeData + }) + }, + ), } } @@ -76,8 +75,8 @@ pub fn before(tokenizer: &mut Tokenizer) -> State { /// ```markdown /// |qwe /// ``` -fn before_data(tokenizer: &mut Tokenizer) -> State { - tokenizer.go(data, before)(tokenizer) +pub fn before_data(tokenizer: &mut Tokenizer) -> State { + tokenizer.go(StateName::DataStart, StateName::TextBefore) } /// Resolve whitespace. diff --git a/src/subtokenize.rs b/src/subtokenize.rs index c641419..b080b46 100644 --- a/src/subtokenize.rs +++ b/src/subtokenize.rs @@ -21,9 +21,8 @@ //! thus the whole document needs to be parsed up to the level of definitions, //! before any level that can include references can be parsed. 
-use crate::content::{string::start as string, text::start as text}; use crate::parser::ParseState; -use crate::tokenizer::{ContentType, Event, EventType, State, Tokenizer}; +use crate::tokenizer::{ContentType, Event, EventType, State, StateName, Tokenizer}; use crate::util::edit_map::EditMap; /// Create a link between two [`Event`][]s. @@ -79,11 +78,11 @@ pub fn subtokenize(events: &mut Vec<Event>, parse_state: &ParseState) -> bool { // Subtokenizer. let mut tokenizer = Tokenizer::new(event.point.clone(), parse_state); // Substate. - let mut state = State::Fn(Box::new(if link.content_type == ContentType::String { - string + let mut state = State::Fn(if link.content_type == ContentType::String { + StateName::StringStart } else { - text - })); + StateName::TextStart + }); // Loop through links to pass them in order to the subtokenizer. while let Some(index) = link_index { @@ -92,7 +91,7 @@ pub fn subtokenize(events: &mut Vec<Event>, parse_state: &ParseState) -> bool { debug_assert_eq!(enter.event_type, EventType::Enter); if link_curr.previous != None { - tokenizer.define_skip(&enter.point); + tokenizer.define_skip(enter.point.clone()); } state = tokenizer.push( diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 3068ddf..7d28b77 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -12,6 +12,8 @@ //! [`check`]: Tokenizer::check use crate::constant::TAB_SIZE; +use crate::construct; +use crate::content; use crate::parser::ParseState; use crate::token::{Token, VOID_TOKENS}; use crate::util::edit_map::EditMap; @@ -19,10 +21,12 @@ use crate::util::edit_map::EditMap; /// Embedded content type. #[derive(Debug, Clone, PartialEq)] pub enum ContentType { - /// Represents [text content][crate::content::text]. - Text, + /// Represents [flow content][crate::content::flow]. + Flow, /// Represents [string content][crate::content::string]. String, + /// Represents [text content][crate::content::text]. + Text, } #[derive(Debug, PartialEq)] @@ -79,10 +83,9 @@ pub struct Event { pub link: Option<Link>, } -/// The essence of the state machine are functions: `StateFn`. -/// It’s responsible for dealing with the current byte. -/// It yields a [`State`][]. -pub type StateFn = dyn FnOnce(&mut Tokenizer) -> State; +pub struct Attempt { + done: Box<dyn FnOnce(&mut Tokenizer, State) -> State + 'static>, +} /// Callback that can be registered and is called when the tokenizer is done. /// @@ -91,10 +94,619 @@ pub type StateFn = dyn FnOnce(&mut Tokenizer) -> State; /// the compiler and other users. 
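With state functions reduced to names, the one place boxing remains is the `done` callback stored per attempt, the new `Attempt` struct above, so the allocation is paid once per attempt rather than once per consumed byte. A reduced model of that one-shot callback; the real `done` also receives the tokenizer and a `State`:

```rust
// Reduced model of the `Attempt` idea: one boxed one-shot callback per
// attempt, taken out of an `Option` when the attempt resolves.

struct Attempt {
    done: Option<Box<dyn FnOnce(bool) -> &'static str>>,
}

fn main() {
    let mut attempt = Attempt {
        done: Some(Box::new(|ok| if ok { "after" } else { "fallback" })),
    };

    // Resolve the attempt exactly once.
    let done = attempt.done.take().unwrap();
    assert_eq!(done(true), "after");
}
```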
pub type Resolver = dyn FnOnce(&mut Tokenizer); +#[derive(Debug, Clone, Copy, PartialEq)] +pub enum StateName { + AttentionStart, + AttentionInside, + + AutolinkStart, + AutolinkOpen, + AutolinkSchemeOrEmailAtext, + AutolinkSchemeInsideOrEmailAtext, + AutolinkUrlInside, + AutolinkEmailAtSignOrDot, + AutolinkEmailAtext, + AutolinkEmailValue, + AutolinkEmailLabel, + + BlankLineStart, + BlankLineAfter, + + BlockQuoteStart, + BlockQuoteBefore, + BlockQuoteContStart, + BlockQuoteContBefore, + BlockQuoteContAfter, + + BomStart, + BomInside, + + CharacterEscapeStart, + CharacterEscapeInside, + + CharacterReferenceStart, + CharacterReferenceOpen, + CharacterReferenceNumeric, + CharacterReferenceValue, + + CodeFencedStart, + CodeFencedBeforeSequenceOpen, + CodeFencedSequenceOpen, + CodeFencedInfoBefore, + CodeFencedInfo, + CodeFencedMetaBefore, + CodeFencedMeta, + CodeFencedAtNonLazyBreak, + CodeFencedCloseBefore, + CodeFencedCloseStart, + CodeFencedBeforeSequenceClose, + CodeFencedSequenceClose, + CodeFencedAfterSequenceClose, + CodeFencedContentBefore, + CodeFencedContentStart, + CodeFencedBeforeContentChunk, + CodeFencedContentChunk, + CodeFencedAfter, + + CodeIndentedStart, + CodeIndentedAtBreak, + CodeIndentedAfter, + CodeIndentedFurtherStart, + CodeIndentedInside, + CodeIndentedFurtherEnd, + CodeIndentedFurtherBegin, + CodeIndentedFurtherAfter, + + CodeTextStart, + CodeTextSequenceOpen, + CodeTextBetween, + CodeTextData, + CodeTextSequenceClose, + + DataStart, + DataInside, + DataAtBreak, + + DefinitionStart, + DefinitionBefore, + DefinitionLabelAfter, + DefinitionMarkerAfter, + DefinitionDestinationBefore, + DefinitionDestinationAfter, + DefinitionDestinationMissing, + DefinitionTitleBefore, + DefinitionAfter, + DefinitionAfterWhitespace, + DefinitionTitleBeforeMarker, + DefinitionTitleAfter, + DefinitionTitleAfterOptionalWhitespace, + + DestinationStart, + DestinationEnclosedBefore, + DestinationEnclosed, + DestinationEnclosedEscape, + DestinationRaw, + DestinationRawEscape, + + DocumentStart, + DocumentLineStart, + // DocumentContainerExistingBefore, + DocumentContainerExistingAfter, + DocumentContainerExistingMissing, + // DocumentContainerNewBefore, + DocumentContainerNewBeforeNotBlockQuote, + DocumentContainerNewAfter, + DocumentContainersAfter, + DocumentFlowInside, + DocumentFlowEnd, + + FlowStart, + FlowBefore, + FlowAfter, + FlowBlankLineAfter, + FlowBeforeParagraph, + + HardBreakEscapeStart, + HardBreakEscapeAfter, + + HeadingAtxStart, + HeadingAtxBefore, + HeadingAtxSequenceOpen, + HeadingAtxAtBreak, + HeadingAtxSequenceFurther, + HeadingAtxData, + + HeadingSetextStart, + HeadingSetextBefore, + HeadingSetextInside, + HeadingSetextAfter, + + HtmlFlowStart, + HtmlFlowBefore, + HtmlFlowOpen, + HtmlFlowDeclarationOpen, + HtmlFlowCommentOpenInside, + HtmlFlowCdataOpenInside, + HtmlFlowTagCloseStart, + HtmlFlowTagName, + HtmlFlowBasicSelfClosing, + HtmlFlowCompleteClosingTagAfter, + HtmlFlowCompleteEnd, + HtmlFlowCompleteAttributeNameBefore, + HtmlFlowCompleteAttributeName, + HtmlFlowCompleteAttributeNameAfter, + HtmlFlowCompleteAttributeValueBefore, + HtmlFlowCompleteAttributeValueQuoted, + HtmlFlowCompleteAttributeValueQuotedAfter, + HtmlFlowCompleteAttributeValueUnquoted, + HtmlFlowCompleteAfter, + HtmlFlowBlankLineBefore, + HtmlFlowContinuation, + HtmlFlowContinuationDeclarationInside, + HtmlFlowContinuationAfter, + HtmlFlowContinuationStart, + HtmlFlowContinuationBefore, + HtmlFlowContinuationCommentInside, + HtmlFlowContinuationRawTagOpen, + HtmlFlowContinuationRawEndTag, + 
HtmlFlowContinuationClose, + HtmlFlowContinuationCdataInside, + HtmlFlowContinuationStartNonLazy, + + HtmlTextStart, + HtmlTextOpen, + HtmlTextDeclarationOpen, + HtmlTextTagCloseStart, + HtmlTextTagClose, + HtmlTextTagCloseBetween, + HtmlTextTagOpen, + HtmlTextTagOpenBetween, + HtmlTextTagOpenAttributeName, + HtmlTextTagOpenAttributeNameAfter, + HtmlTextTagOpenAttributeValueBefore, + HtmlTextTagOpenAttributeValueQuoted, + HtmlTextTagOpenAttributeValueQuotedAfter, + HtmlTextTagOpenAttributeValueUnquoted, + HtmlTextCdata, + HtmlTextCdataOpenInside, + HtmlTextCdataClose, + HtmlTextCdataEnd, + HtmlTextCommentOpenInside, + HtmlTextCommentStart, + HtmlTextCommentStartDash, + HtmlTextComment, + HtmlTextCommentClose, + HtmlTextDeclaration, + HtmlTextEnd, + HtmlTextInstruction, + HtmlTextInstructionClose, + HtmlTextLineEndingAfter, + HtmlTextLineEndingAfterPrefix, + + LabelStart, + LabelAtBreak, + LabelEolAfter, + LabelAtBlankLine, + LabelEscape, + LabelInside, + + LabelEndStart, + LabelEndAfter, + LabelEndResourceStart, + LabelEndResourceBefore, + LabelEndResourceOpen, + LabelEndResourceDestinationAfter, + LabelEndResourceDestinationMissing, + LabelEndResourceBetween, + LabelEndResourceTitleAfter, + LabelEndResourceEnd, + LabelEndOk, + LabelEndNok, + LabelEndReferenceFull, + LabelEndReferenceFullAfter, + LabelEndReferenceNotFull, + LabelEndReferenceCollapsed, + LabelEndReferenceCollapsedOpen, + + LabelStartImageStart, + LabelStartImageOpen, + + LabelStartLinkStart, + + ListStart, + ListBefore, + ListNok, + ListBeforeUnordered, + ListValue, + ListMarkerAfter, + ListAfter, + ListMarkerAfterFilled, + ListWhitespace, + ListPrefixOther, + ListWhitespaceAfter, + ListContStart, + ListContBlank, + ListContFilled, + ListOk, + + NonLazyContinuationStart, + NonLazyContinuationAfter, + + ParagraphStart, + ParagraphInside, + + SpaceOrTabStart, + SpaceOrTabInside, + + SpaceOrTabEolStart, + SpaceOrTabEolAfterFirst, + SpaceOrTabEolAfterEol, + SpaceOrTabEolAtEol, + SpaceOrTabEolAfterMore, + + StringStart, + StringBefore, + StringBeforeData, + + TextStart, + TextBefore, + TextBeforeData, + + ThematicBreakStart, + ThematicBreakBefore, + ThematicBreakSequence, + ThematicBreakAtBreak, + + TitleStart, + TitleBegin, + TitleAfterEol, + TitleAtBlankLine, + TitleEscape, + TitleInside, +} + +impl StateName { + /// Map the state name to its state function.
+    #[allow(clippy::too_many_lines)]
+    pub fn to_func(self) -> Box<dyn FnOnce(&mut Tokenizer) -> State + 'static> {
+        let func = match self {
+            StateName::AttentionStart => construct::attention::start,
+            StateName::AttentionInside => construct::attention::inside,
+
+            StateName::AutolinkStart => construct::autolink::start,
+            StateName::AutolinkOpen => construct::autolink::open,
+            StateName::AutolinkSchemeOrEmailAtext => construct::autolink::scheme_or_email_atext,
+            StateName::AutolinkSchemeInsideOrEmailAtext => {
+                construct::autolink::scheme_inside_or_email_atext
+            }
+            StateName::AutolinkUrlInside => construct::autolink::url_inside,
+            StateName::AutolinkEmailAtSignOrDot => construct::autolink::email_at_sign_or_dot,
+            StateName::AutolinkEmailAtext => construct::autolink::email_atext,
+            StateName::AutolinkEmailValue => construct::autolink::email_value,
+            StateName::AutolinkEmailLabel => construct::autolink::email_label,
+
+            StateName::BlankLineStart => construct::blank_line::start,
+            StateName::BlankLineAfter => construct::blank_line::after,
+
+            StateName::BlockQuoteStart => construct::block_quote::start,
+            StateName::BlockQuoteBefore => construct::block_quote::before,
+            StateName::BlockQuoteContStart => construct::block_quote::cont_start,
+            StateName::BlockQuoteContBefore => construct::block_quote::cont_before,
+            StateName::BlockQuoteContAfter => construct::block_quote::cont_after,
+
+            StateName::BomStart => construct::partial_bom::start,
+            StateName::BomInside => construct::partial_bom::inside,
+
+            StateName::CharacterEscapeStart => construct::character_escape::start,
+            StateName::CharacterEscapeInside => construct::character_escape::inside,
+
+            StateName::CharacterReferenceStart => construct::character_reference::start,
+            StateName::CharacterReferenceOpen => construct::character_reference::open,
+            StateName::CharacterReferenceNumeric => construct::character_reference::numeric,
+            StateName::CharacterReferenceValue => construct::character_reference::value,
+
+            StateName::CodeFencedStart => construct::code_fenced::start,
+            StateName::CodeFencedBeforeSequenceOpen => construct::code_fenced::before_sequence_open,
+            StateName::CodeFencedSequenceOpen => construct::code_fenced::sequence_open,
+            StateName::CodeFencedInfoBefore => construct::code_fenced::info_before,
+            StateName::CodeFencedInfo => construct::code_fenced::info,
+            StateName::CodeFencedMetaBefore => construct::code_fenced::meta_before,
+            StateName::CodeFencedMeta => construct::code_fenced::meta,
+            StateName::CodeFencedAtNonLazyBreak => construct::code_fenced::at_non_lazy_break,
+            StateName::CodeFencedCloseBefore => construct::code_fenced::close_before,
+            StateName::CodeFencedCloseStart => construct::code_fenced::close_start,
+            StateName::CodeFencedBeforeSequenceClose => {
+                construct::code_fenced::before_sequence_close
+            }
+            StateName::CodeFencedSequenceClose => construct::code_fenced::sequence_close,
+            StateName::CodeFencedAfterSequenceClose => construct::code_fenced::sequence_close_after,
+            StateName::CodeFencedContentBefore => construct::code_fenced::content_before,
+            StateName::CodeFencedContentStart => construct::code_fenced::content_start,
+            StateName::CodeFencedBeforeContentChunk => construct::code_fenced::before_content_chunk,
+            StateName::CodeFencedContentChunk => construct::code_fenced::content_chunk,
+            StateName::CodeFencedAfter => construct::code_fenced::after,
+
+            StateName::CodeIndentedStart => construct::code_indented::start,
+            StateName::CodeIndentedAtBreak => construct::code_indented::at_break,
+            StateName::CodeIndentedAfter => construct::code_indented::after,
+            StateName::CodeIndentedFurtherStart => construct::code_indented::further_start,
+            StateName::CodeIndentedInside => construct::code_indented::inside,
+            StateName::CodeIndentedFurtherEnd => construct::code_indented::further_end,
+            StateName::CodeIndentedFurtherBegin => construct::code_indented::further_begin,
+            StateName::CodeIndentedFurtherAfter => construct::code_indented::further_after,
+
+            StateName::CodeTextStart => construct::code_text::start,
+            StateName::CodeTextSequenceOpen => construct::code_text::sequence_open,
+            StateName::CodeTextBetween => construct::code_text::between,
+            StateName::CodeTextData => construct::code_text::data,
+            StateName::CodeTextSequenceClose => construct::code_text::sequence_close,
+
+            StateName::DataStart => construct::partial_data::start,
+            StateName::DataInside => construct::partial_data::inside,
+            StateName::DataAtBreak => construct::partial_data::at_break,
+
+            StateName::DefinitionStart => construct::definition::start,
+            StateName::DefinitionBefore => construct::definition::before,
+            StateName::DefinitionLabelAfter => construct::definition::label_after,
+            StateName::DefinitionMarkerAfter => construct::definition::marker_after,
+            StateName::DefinitionDestinationBefore => construct::definition::destination_before,
+            StateName::DefinitionDestinationAfter => construct::definition::destination_after,
+            StateName::DefinitionDestinationMissing => construct::definition::destination_missing,
+            StateName::DefinitionTitleBefore => construct::definition::title_before,
+            StateName::DefinitionAfter => construct::definition::after,
+            StateName::DefinitionAfterWhitespace => construct::definition::after_whitespace,
+            StateName::DefinitionTitleBeforeMarker => construct::definition::title_before_marker,
+            StateName::DefinitionTitleAfter => construct::definition::title_after,
+            StateName::DefinitionTitleAfterOptionalWhitespace => {
+                construct::definition::title_after_optional_whitespace
+            }
+
+            StateName::DestinationStart => construct::partial_destination::start,
+            StateName::DestinationEnclosedBefore => construct::partial_destination::enclosed_before,
+            StateName::DestinationEnclosed => construct::partial_destination::enclosed,
+            StateName::DestinationEnclosedEscape => construct::partial_destination::enclosed_escape,
+            StateName::DestinationRaw => construct::partial_destination::raw,
+            StateName::DestinationRawEscape => construct::partial_destination::raw_escape,
+
+            StateName::DocumentStart => content::document::start,
+            StateName::DocumentLineStart => content::document::line_start,
+            // StateName::DocumentContainerExistingBefore => content::document::container_existing_before,
+            StateName::DocumentContainerExistingAfter => {
+                content::document::container_existing_after
+            }
+            StateName::DocumentContainerExistingMissing => {
+                content::document::container_existing_missing
+            }
+            // StateName::DocumentContainerNewBefore => content::document::container_new_before,
+            StateName::DocumentContainerNewBeforeNotBlockQuote => {
+                content::document::container_new_before_not_block_quote
+            }
+            StateName::DocumentContainerNewAfter => content::document::container_new_after,
+            StateName::DocumentContainersAfter => content::document::containers_after,
+            StateName::DocumentFlowEnd => content::document::flow_end,
+            StateName::DocumentFlowInside => content::document::flow_inside,
+
+            StateName::FlowStart => content::flow::start,
+            StateName::FlowBefore => content::flow::before,
+            StateName::FlowAfter => content::flow::after,
+            StateName::FlowBlankLineAfter => content::flow::blank_line_after,
+            StateName::FlowBeforeParagraph => content::flow::before_paragraph,
+
+            StateName::HardBreakEscapeStart => construct::hard_break_escape::start,
+            StateName::HardBreakEscapeAfter => construct::hard_break_escape::after,
+
+            StateName::HeadingAtxStart => construct::heading_atx::start,
+            StateName::HeadingAtxBefore => construct::heading_atx::before,
+            StateName::HeadingAtxSequenceOpen => construct::heading_atx::sequence_open,
+            StateName::HeadingAtxAtBreak => construct::heading_atx::at_break,
+            StateName::HeadingAtxSequenceFurther => construct::heading_atx::sequence_further,
+            StateName::HeadingAtxData => construct::heading_atx::data,
+
+            StateName::HeadingSetextStart => construct::heading_setext::start,
+            StateName::HeadingSetextBefore => construct::heading_setext::before,
+            StateName::HeadingSetextInside => construct::heading_setext::inside,
+            StateName::HeadingSetextAfter => construct::heading_setext::after,
+
+            StateName::HtmlFlowStart => construct::html_flow::start,
+            StateName::HtmlFlowBefore => construct::html_flow::before,
+            StateName::HtmlFlowOpen => construct::html_flow::open,
+            StateName::HtmlFlowDeclarationOpen => construct::html_flow::declaration_open,
+            StateName::HtmlFlowCommentOpenInside => construct::html_flow::comment_open_inside,
+            StateName::HtmlFlowCdataOpenInside => construct::html_flow::cdata_open_inside,
+            StateName::HtmlFlowTagCloseStart => construct::html_flow::tag_close_start,
+            StateName::HtmlFlowTagName => construct::html_flow::tag_name,
+            StateName::HtmlFlowBasicSelfClosing => construct::html_flow::basic_self_closing,
+            StateName::HtmlFlowCompleteClosingTagAfter => {
+                construct::html_flow::complete_closing_tag_after
+            }
+            StateName::HtmlFlowCompleteEnd => construct::html_flow::complete_end,
+            StateName::HtmlFlowCompleteAttributeNameBefore => {
+                construct::html_flow::complete_attribute_name_before
+            }
+            StateName::HtmlFlowCompleteAttributeName => {
+                construct::html_flow::complete_attribute_name
+            }
+            StateName::HtmlFlowCompleteAttributeNameAfter => {
+                construct::html_flow::complete_attribute_name_after
+            }
+            StateName::HtmlFlowCompleteAttributeValueBefore => {
+                construct::html_flow::complete_attribute_value_before
+            }
+            StateName::HtmlFlowCompleteAttributeValueQuoted => {
+                construct::html_flow::complete_attribute_value_quoted
+            }
+            StateName::HtmlFlowCompleteAttributeValueQuotedAfter => {
+                construct::html_flow::complete_attribute_value_quoted_after
+            }
+            StateName::HtmlFlowCompleteAttributeValueUnquoted => {
+                construct::html_flow::complete_attribute_value_unquoted
+            }
+            StateName::HtmlFlowCompleteAfter => construct::html_flow::complete_after,
+            StateName::HtmlFlowBlankLineBefore => construct::html_flow::blank_line_before,
+            StateName::HtmlFlowContinuation => construct::html_flow::continuation,
+            StateName::HtmlFlowContinuationDeclarationInside => {
+                construct::html_flow::continuation_declaration_inside
+            }
+            StateName::HtmlFlowContinuationAfter => construct::html_flow::continuation_after,
+            StateName::HtmlFlowContinuationStart => construct::html_flow::continuation_start,
+            StateName::HtmlFlowContinuationBefore => construct::html_flow::continuation_before,
+            StateName::HtmlFlowContinuationCommentInside => {
+                construct::html_flow::continuation_comment_inside
+            }
+            StateName::HtmlFlowContinuationRawTagOpen => {
+                construct::html_flow::continuation_raw_tag_open
+            }
+            StateName::HtmlFlowContinuationRawEndTag => {
+                construct::html_flow::continuation_raw_end_tag
+            }
+            StateName::HtmlFlowContinuationClose => construct::html_flow::continuation_close,
+            StateName::HtmlFlowContinuationCdataInside => {
+                construct::html_flow::continuation_cdata_inside
+            }
+            StateName::HtmlFlowContinuationStartNonLazy => {
+                construct::html_flow::continuation_start_non_lazy
+            }
+
+            StateName::HtmlTextStart => construct::html_text::start,
+            StateName::HtmlTextOpen => construct::html_text::open,
+            StateName::HtmlTextDeclarationOpen => construct::html_text::declaration_open,
+            StateName::HtmlTextTagCloseStart => construct::html_text::tag_close_start,
+            StateName::HtmlTextTagClose => construct::html_text::tag_close,
+            StateName::HtmlTextTagCloseBetween => construct::html_text::tag_close_between,
+            StateName::HtmlTextTagOpen => construct::html_text::tag_open,
+            StateName::HtmlTextTagOpenBetween => construct::html_text::tag_open_between,
+            StateName::HtmlTextTagOpenAttributeName => {
+                construct::html_text::tag_open_attribute_name
+            }
+            StateName::HtmlTextTagOpenAttributeNameAfter => {
+                construct::html_text::tag_open_attribute_name_after
+            }
+            StateName::HtmlTextTagOpenAttributeValueBefore => {
+                construct::html_text::tag_open_attribute_value_before
+            }
+            StateName::HtmlTextTagOpenAttributeValueQuoted => {
+                construct::html_text::tag_open_attribute_value_quoted
+            }
+            StateName::HtmlTextTagOpenAttributeValueQuotedAfter => {
+                construct::html_text::tag_open_attribute_value_quoted_after
+            }
+            StateName::HtmlTextTagOpenAttributeValueUnquoted => {
+                construct::html_text::tag_open_attribute_value_unquoted
+            }
+            StateName::HtmlTextCdata => construct::html_text::cdata,
+            StateName::HtmlTextCdataOpenInside => construct::html_text::cdata_open_inside,
+            StateName::HtmlTextCdataClose => construct::html_text::cdata_close,
+            StateName::HtmlTextCdataEnd => construct::html_text::cdata_end,
+            StateName::HtmlTextCommentOpenInside => construct::html_text::comment_open_inside,
+            StateName::HtmlTextCommentStart => construct::html_text::comment_start,
+            StateName::HtmlTextCommentStartDash => construct::html_text::comment_start_dash,
+            StateName::HtmlTextComment => construct::html_text::comment,
+            StateName::HtmlTextCommentClose => construct::html_text::comment_close,
+            StateName::HtmlTextDeclaration => construct::html_text::declaration,
+            StateName::HtmlTextEnd => construct::html_text::end,
+            StateName::HtmlTextInstruction => construct::html_text::instruction,
+            StateName::HtmlTextInstructionClose => construct::html_text::instruction_close,
+            StateName::HtmlTextLineEndingAfter => construct::html_text::line_ending_after,
+            StateName::HtmlTextLineEndingAfterPrefix => {
+                construct::html_text::line_ending_after_prefix
+            }
+
+            StateName::LabelStart => construct::partial_label::start,
+            StateName::LabelAtBreak => construct::partial_label::at_break,
+            StateName::LabelEolAfter => construct::partial_label::eol_after,
+            StateName::LabelAtBlankLine => construct::partial_label::at_blank_line,
+            StateName::LabelEscape => construct::partial_label::escape,
+            StateName::LabelInside => construct::partial_label::inside,
+
+            StateName::LabelEndStart => construct::label_end::start,
+            StateName::LabelEndAfter => construct::label_end::after,
+            StateName::LabelEndResourceStart => construct::label_end::resource_start,
+            StateName::LabelEndResourceBefore => construct::label_end::resource_before,
+            StateName::LabelEndResourceOpen => construct::label_end::resource_open,
+            StateName::LabelEndResourceDestinationAfter => {
+                construct::label_end::resource_destination_after
+            }
+            StateName::LabelEndResourceDestinationMissing => {
+                construct::label_end::resource_destination_missing
+            }
+            StateName::LabelEndResourceBetween => construct::label_end::resource_between,
+            StateName::LabelEndResourceTitleAfter => construct::label_end::resource_title_after,
+            StateName::LabelEndResourceEnd => construct::label_end::resource_end,
+            StateName::LabelEndOk => construct::label_end::ok,
+            StateName::LabelEndNok => construct::label_end::nok,
+            StateName::LabelEndReferenceFull => construct::label_end::reference_full,
+            StateName::LabelEndReferenceFullAfter => construct::label_end::reference_full_after,
+            StateName::LabelEndReferenceNotFull => construct::label_end::reference_not_full,
+            StateName::LabelEndReferenceCollapsed => construct::label_end::reference_collapsed,
+            StateName::LabelEndReferenceCollapsedOpen => {
+                construct::label_end::reference_collapsed_open
+            }
+
+            StateName::LabelStartImageStart => construct::label_start_image::start,
+            StateName::LabelStartImageOpen => construct::label_start_image::open,
+            StateName::LabelStartLinkStart => construct::label_start_link::start,
+
+            StateName::ListStart => construct::list::start,
+            StateName::ListBefore => construct::list::before,
+            StateName::ListNok => construct::list::nok,
+            StateName::ListBeforeUnordered => construct::list::before_unordered,
+            StateName::ListValue => construct::list::value,
+            StateName::ListMarkerAfter => construct::list::marker_after,
+            StateName::ListAfter => construct::list::after,
+            StateName::ListMarkerAfterFilled => construct::list::marker_after_filled,
+            StateName::ListWhitespace => construct::list::whitespace,
+            StateName::ListWhitespaceAfter => construct::list::whitespace_after,
+            StateName::ListPrefixOther => construct::list::prefix_other,
+            StateName::ListContStart => construct::list::cont_start,
+            StateName::ListContBlank => construct::list::cont_blank,
+            StateName::ListContFilled => construct::list::cont_filled,
+            StateName::ListOk => construct::list::ok,
+
+            StateName::NonLazyContinuationStart => construct::partial_non_lazy_continuation::start,
+            StateName::NonLazyContinuationAfter => construct::partial_non_lazy_continuation::after,
+
+            StateName::ParagraphStart => construct::paragraph::start,
+            StateName::ParagraphInside => construct::paragraph::inside,
+
+            StateName::SpaceOrTabStart => construct::partial_space_or_tab::start,
+            StateName::SpaceOrTabInside => construct::partial_space_or_tab::inside,
+
+            StateName::SpaceOrTabEolStart => construct::partial_space_or_tab::eol_start,
+            StateName::SpaceOrTabEolAfterFirst => construct::partial_space_or_tab::eol_after_first,
+            StateName::SpaceOrTabEolAfterEol => construct::partial_space_or_tab::eol_after_eol,
+            StateName::SpaceOrTabEolAtEol => construct::partial_space_or_tab::eol_at_eol,
+            StateName::SpaceOrTabEolAfterMore => construct::partial_space_or_tab::eol_after_more,
+
+            StateName::StringStart => content::string::start,
+            StateName::StringBefore => content::string::before,
+            StateName::StringBeforeData => content::string::before_data,
+
+            StateName::TextStart => content::text::start,
+            StateName::TextBefore => content::text::before,
+            StateName::TextBeforeData => content::text::before_data,
+
+            StateName::ThematicBreakStart => construct::thematic_break::start,
+            StateName::ThematicBreakBefore => construct::thematic_break::before,
+            StateName::ThematicBreakSequence => construct::thematic_break::sequence,
+            StateName::ThematicBreakAtBreak => construct::thematic_break::at_break,
+
+            StateName::TitleStart => construct::partial_title::start,
+            StateName::TitleBegin => construct::partial_title::begin,
+            StateName::TitleAfterEol => construct::partial_title::after_eol,
+            StateName::TitleAtBlankLine => construct::partial_title::at_blank_line,
+            StateName::TitleEscape => construct::partial_title::escape,
+            StateName::TitleInside => construct::partial_title::inside,
+        };
+
+        Box::new(func)
+    }
+}
+
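The core of the rewrite is easiest to see in isolation: state functions are no longer chained through `Box<dyn FnOnce>`; they are named by a flat enum and resolved through a single `match`, as in `StateName::to_func` above. A minimal sketch of that pattern follows; `Name`, `Step`, and `Machine` are hypothetical, illustrative names, not part of markdown-rs:

```rust
// States are identified by a small `Copy` enum and mapped to plain `fn`s
// through one dispatch `match`, instead of passing boxed closures around.

#[derive(Clone, Copy)]
enum Name {
    Start,
    Inside,
}

enum Step {
    Fn(Name), // a next state, by name: cheap to copy, easy to log
    Done,     // the machine is finished
}

struct Machine {
    input: Vec<u8>,
    index: usize,
}

fn start(machine: &mut Machine) -> Step {
    if machine.index < machine.input.len() {
        Step::Fn(Name::Inside)
    } else {
        Step::Done
    }
}

fn inside(machine: &mut Machine) -> Step {
    machine.index += 1; // consume one byte
    Step::Fn(Name::Start)
}

// The single dispatch point, analogous to `StateName::to_func`.
fn call(machine: &mut Machine, name: Name) -> Step {
    match name {
        Name::Start => start(machine),
        Name::Inside => inside(machine),
    }
}

fn main() {
    let mut machine = Machine { input: b"ab".to_vec(), index: 0 };
    let mut step = Step::Fn(Name::Start);
    while let Step::Fn(name) = step {
        step = call(&mut machine, name);
    }
    assert_eq!(machine.index, 2);
}
```

Because the next state is now plain data rather than a closure, `State` can derive `Debug` and `PartialEq` (as the next hunk does), and naming a state costs an enum copy instead of a heap allocation.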
 /// The result of a state.
+#[derive(Debug, PartialEq)]
 pub enum State {
-    /// There is a future state: a boxed [`StateFn`][] to pass the next code to.
-    Fn(Box<StateFn>),
+    /// There is a future state: a [`StateName`][] to pass the next code to.
+    Fn(StateName),
     /// The state is successful.
     Ok,
     /// The state is not successful.
@@ -163,7 +775,7 @@ struct InternalState {
 /// To do
 #[allow(clippy::struct_excessive_bools)]
-pub struct TokenizeState {
+pub struct TokenizeState<'a> {
     /// To do.
     pub connect: bool,
     /// To do.
@@ -171,15 +783,15 @@ pub struct TokenizeState {
     /// To do.
     pub document_continued: usize,
     /// To do.
-    pub document_index: usize,
-    /// To do.
-    pub document_inject: Vec<(Vec<Event>, Vec<Event>)>,
-    /// To do.
     pub document_interrupt_before: bool,
     /// To do.
     pub document_paragraph_before: bool,
     /// To do.
-    pub document_next: Option<Box<StateFn>>,
+    pub document_data_index: Option<usize>,
+    /// To do.
+    pub document_child_state: Option<State>,
+    /// To do.
+    pub child_tokenizer: Option<Box<Tokenizer<'a>>>,
     /// To do.
     pub marker: u8,
     /// To do.
@@ -187,7 +799,7 @@ pub struct TokenizeState {
     /// To do.
     pub prefix: usize,
     /// To do.
-    pub return_state: Option<Box<StateFn>>,
+    pub return_state: Option<StateName>,
     /// To do.
     pub seen: bool,
     /// To do.
@@ -234,7 +846,7 @@ pub struct Tokenizer<'a> {
     /// Track whether this tokenizer is done.
     resolved: bool,
     /// To do.
-    attempt_balance: usize,
+    attempts: Vec<Attempt>,
     /// Current byte.
     pub current: Option<u8>,
     /// Previous byte.
@@ -251,13 +863,13 @@ pub struct Tokenizer<'a> {
     pub map: EditMap,
     /// List of attached resolvers, which will be called when done feeding,
     /// to clean events.
-    resolvers: Vec<Box<Resolver>>,
+    pub resolvers: Vec<Box<Resolver>>,
     /// List of names associated with attached resolvers.
-    resolver_ids: Vec<String>,
+    pub resolver_ids: Vec<String>,
     /// Shared parsing state across tokenizers.
     pub parse_state: &'a ParseState<'a>,
     /// To do.
-    pub tokenize_state: TokenizeState,
+    pub tokenize_state: TokenizeState<'a>,
     /// Stack of label (start) that could form images and links.
     ///
     /// Used when tokenizing [text content][crate::content::text].
@@ -299,7 +911,7 @@ impl<'a> Tokenizer<'a> {
             line_start: point.clone(),
             consumed: true,
             resolved: false,
-            attempt_balance: 0,
+            attempts: vec![],
             point,
             stack: vec![],
            events: vec![],
@@ -308,11 +920,11 @@
                 connect: false,
                 document_container_stack: vec![],
                 document_continued: 0,
-                document_index: 0,
-                document_inject: vec![],
                 document_interrupt_before: false,
                 document_paragraph_before: false,
-                document_next: None,
+                document_data_index: None,
+                document_child_state: None,
+                child_tokenizer: None,
                 marker: 0,
                 marker_other: 0,
                 prefix: 0,
@@ -369,13 +981,22 @@ impl<'a> Tokenizer<'a> {
     }
 
     /// Define a jump between two places.
-    pub fn define_skip(&mut self, point: &Point) {
-        define_skip_impl(self, point.line, (point.index, point.vs));
-    }
+    ///
+    /// This defines to which future index we move after a line ending.
+    pub fn define_skip(&mut self, mut point: Point) {
+        move_point_back(self, &mut point);
+
+        let info = (point.index, point.vs);
+        log::debug!("position: define skip: {:?} -> ({:?})", point.line, info);
+        let at = point.line - self.first_line;
+
+        if at >= self.column_start.len() {
+            self.column_start.push(info);
+        } else {
+            self.column_start[at] = info;
+        }
 
-    /// Define the current place as a jump between two places.
-    pub fn define_skip_current(&mut self) {
-        define_skip_impl(self, self.point.line, (self.point.index, self.point.vs));
+        self.account_for_potential_skip();
     }
 
     /// Increment the current positional info if we’re right after a line
@@ -396,8 +1017,8 @@ impl<'a> Tokenizer<'a> {
     }
 
     /// Consume the current byte.
-    /// Each [`StateFn`][] is expected to call this to signal that this code is
-    /// used, or call a next `StateFn`.
+    /// Each state function is expected to call this to signal that this code is
+    /// used, or call a next function.
     pub fn consume(&mut self) {
         log::debug!("consume: `{:?}` ({:?})", self.current, self.point);
         debug_assert!(!self.consumed, "expected code to not have been consumed: this might be because `x(code)` instead of `x` was returned");
@@ -473,16 +1094,7 @@ impl<'a> Tokenizer<'a> {
     pub fn enter_with_link(&mut self, token_type: Token, link: Option<Link>) {
         let mut point = self.point.clone();
-
-        // Move back past ignored bytes.
-        while point.index > 0 {
-            point.index -= 1;
-            let action = byte_action(self.parse_state.bytes, &point);
-            if !matches!(action, ByteAction::Ignore) {
-                point.index += 1;
-                break;
-            }
-        }
+        move_point_back(self, &mut point);
 
         log::debug!("enter: `{:?}` ({:?})", token_type, point);
         self.events.push(Event {
@@ -527,15 +1139,7 @@ impl<'a> Tokenizer<'a> {
         if matches!(self.previous, Some(b'\n')) {
             point = self.line_start.clone();
         } else {
-            // Move back past ignored bytes.
-            while point.index > 0 {
-                point.index -= 1;
-                let action = byte_action(self.parse_state.bytes, &point);
-                if !matches!(action, ByteAction::Ignore) {
-                    point.index += 1;
-                    break;
-                }
-            }
+            move_point_back(self, &mut point);
         }
 
         log::debug!("exit: `{:?}` ({:?})", token_type, point);
@@ -575,29 +1179,20 @@ impl<'a> Tokenizer<'a> {
         self.stack.truncate(previous.stack_len);
     }
 
-    /// Parse with `state_fn` and its future states, switching to `ok` when
+    /// Parse with `state_name` and its future states, switching to `ok` when
     /// successful, and passing [`State::Nok`][] back up if it occurs.
     ///
     /// This function does not capture the current state, in case of
     /// `State::Nok`, as it is assumed that this `go` is itself wrapped in
     /// another `attempt`.
     #[allow(clippy::unused_self)]
-    pub fn go(
-        &mut self,
-        state_fn: impl FnOnce(&mut Tokenizer) -> State + 'static,
-        after: impl FnOnce(&mut Tokenizer) -> State + 'static,
-    ) -> Box<StateFn> {
-        self.attempt_balance += 1;
+    pub fn go(&mut self, state_name: StateName, after: StateName) -> State {
         attempt_impl(
-            state_fn,
-            None,
-            self.point.index,
-            |tokenizer: &mut Tokenizer, state| {
-                tokenizer.attempt_balance -= 1;
-
+            self,
+            state_name,
+            Box::new(move |_tokenizer: &mut Tokenizer, state| {
                 if matches!(state, State::Ok) {
-                    tokenizer.consumed = true;
-                    State::Fn(Box::new(after))
+                    State::Fn(after)
                 } else {
                     // Must be `Nok`.
                     // We don’t capture/free state because it is assumed that
@@ -605,132 +1200,122 @@ impl<'a> Tokenizer<'a> {
                     // if it can occur.
                     state
                 }
-            },
-        )
-    }
-
-    /// Like `go`, but this lets you *hijack* back to some other state after a
-    /// certain code.
-    #[allow(clippy::unused_self)]
-    pub fn go_until(
-        &mut self,
-        state_fn: impl FnOnce(&mut Tokenizer) -> State + 'static,
-        until: impl Fn(Option<u8>) -> bool + 'static,
-        done: impl FnOnce(State) -> Box<StateFn> + 'static,
-    ) -> Box<StateFn> {
-        self.attempt_balance += 1;
-        attempt_impl(
-            state_fn,
-            Some(Box::new(until)),
-            self.point.index,
-            |tokenizer: &mut Tokenizer, state| {
-                tokenizer.attempt_balance -= 1;
-                tokenizer.consumed = true;
-                // We don’t capture/free state because it is assumed that
-                // `go_until` itself is wrapped in another attempt that does
-                // that if it can occur.
-                State::Fn(done(state))
-            },
+            }),
         )
     }
 
-    /// Parse with `state_fn` and its future states, to check if it result in
+    /// Parse with `state_name` and its future states, to check if it results in
     /// [`State::Ok`][] or [`State::Nok`][], revert on both cases, and then
     /// call `done` with whether it was successful or not.
     ///
     /// This captures the current state of the tokenizer, returns a wrapped
-    /// state that captures all codes and feeds them to `state_fn` and its
+    /// state that captures all codes and feeds them to `state_name` and its
     /// future states until it yields `State::Ok` or `State::Nok`.
     /// It then applies the captured state, calls `done`, and feeds all
     /// captured codes to its future states.
     pub fn check(
         &mut self,
-        state_fn: impl FnOnce(&mut Tokenizer) -> State + 'static,
-        done: impl FnOnce(bool) -> Box<StateFn> + 'static,
-    ) -> Box<StateFn> {
-        self.attempt_balance += 1;
+        state_name: StateName,
+        done: impl FnOnce(bool) -> State + 'static,
+    ) -> State {
         let previous = self.capture();
         attempt_impl(
-            state_fn,
-            None,
-            self.point.index,
-            |tokenizer: &mut Tokenizer, state| {
-                tokenizer.attempt_balance -= 1;
+            self,
+            state_name,
+            Box::new(|tokenizer: &mut Tokenizer, state| {
                 tokenizer.free(previous);
                 tokenizer.consumed = true;
-                State::Fn(done(matches!(state, State::Ok)))
-            },
+                done(matches!(state, State::Ok))
+            }),
        )
    }
 
-    /// Parse with `state_fn` and its future states, to check if it results in
+    /// Parse with `state_name` and its future states, to check if it results in
     /// [`State::Ok`][] or [`State::Nok`][], revert on the case of
     /// `State::Nok`, and then call `done` with whether it was successful or
     /// not.
     ///
     /// This captures the current state of the tokenizer, returns a wrapped
-    /// state that captures all codes and feeds them to `state_fn` and its
+    /// state that captures all codes and feeds them to `state_name` and its
     /// future states until it yields `State::Ok`, at which point it calls
     /// `done` and yields its result.
     /// If instead `State::Nok` was yielded, the captured state is applied,
     /// `done` is called, and all captured codes are fed to its future states.
     pub fn attempt(
         &mut self,
-        state_fn: impl FnOnce(&mut Tokenizer) -> State + 'static,
-        done: impl FnOnce(bool) -> Box<StateFn> + 'static,
-    ) -> Box<StateFn> {
-        self.attempt_balance += 1;
+        state_name: StateName,
+        done: impl FnOnce(bool) -> State + 'static,
+    ) -> State {
         let previous = self.capture();
+        log::debug!("attempting: {:?}", state_name);
+        // self.consumed = false;
         attempt_impl(
-            state_fn,
-            None,
-            self.point.index,
-            |tokenizer: &mut Tokenizer, state| {
-                tokenizer.attempt_balance -= 1;
+            self,
+            state_name,
+            Box::new(move |tokenizer: &mut Tokenizer, state| {
                 let ok = matches!(state, State::Ok);
 
                 if !ok {
                     tokenizer.free(previous);
+                    tokenizer.consumed = true;
                 }
 
-                log::debug!("attempt: {:?}, at {:?}", ok, tokenizer.point);
+                log::debug!(
+                    "attempted {:?}: {:?}, at {:?}",
+                    state_name,
+                    ok,
+                    tokenizer.point
+                );
 
-                tokenizer.consumed = true;
-                State::Fn(done(ok))
-            },
+                done(ok)
+            }),
         )
     }
 
     /// Just like [`attempt`][Tokenizer::attempt], but many.
     pub fn attempt_n(
         &mut self,
-        mut state_fns: Vec<Box<StateFn>>,
-        done: impl FnOnce(bool) -> Box<StateFn> + 'static,
-    ) -> Box<StateFn> {
-        if state_fns.is_empty() {
+        mut state_names: Vec<StateName>,
+        done: impl FnOnce(bool) -> State + 'static,
+    ) -> State {
+        if state_names.is_empty() {
             done(false)
         } else {
-            let state_fn = state_fns.remove(0);
-            self.attempt(state_fn, move |ok| {
-                if ok {
-                    done(ok)
-                } else {
-                    Box::new(|t| t.attempt_n(state_fns, done)(t))
-                }
-            })
+            let previous = self.capture();
+            let state_name = state_names.remove(0);
+            self.consumed = false;
+            log::debug!("attempting (n): {:?}", state_name);
+            attempt_impl(
+                self,
+                state_name,
+                Box::new(move |tokenizer: &mut Tokenizer, state| {
+                    let ok = matches!(state, State::Ok);
+
+                    log::debug!(
+                        "attempted (n) {:?}: {:?}, at {:?}",
+                        state_name,
+                        ok,
+                        tokenizer.point
+                    );
+
+                    if ok {
+                        done(true)
+                    } else {
+                        tokenizer.free(previous);
+                        tokenizer.consumed = true;
+                        tokenizer.attempt_n(state_names, done)
+                    }
+                }),
+            )
         }
     }
 
     /// Just like [`attempt`][Tokenizer::attempt], but for when you don’t care
     /// about `ok`.
-    pub fn attempt_opt(
-        &mut self,
-        state_fn: impl FnOnce(&mut Tokenizer) -> State + 'static,
-        after: impl FnOnce(&mut Tokenizer) -> State + 'static,
-    ) -> Box<StateFn> {
-        self.attempt(state_fn, |_ok| Box::new(after))
+    pub fn attempt_opt(&mut self, state_name: StateName, after: StateName) -> State {
+        self.attempt(state_name, move |_ok| State::Fn(after))
     }
 
     /// Feed a list of `codes` into `start`.
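The whole `go`/`check`/`attempt` family shares one idea: snapshot the tokenizer, run a candidate state, and revert the snapshot when the candidate fails. A heavily simplified sketch of that capture/revert loop, using a hypothetical `Parser` in place of the real `Tokenizer`:

```rust
// Snapshot, try, and revert on failure, in the spirit of `attempt_n`.

struct Snapshot {
    index: usize,
}

struct Parser {
    bytes: Vec<u8>,
    index: usize,
}

impl Parser {
    fn capture(&self) -> Snapshot {
        Snapshot { index: self.index }
    }

    fn free(&mut self, snapshot: Snapshot) {
        // Revert, like `tokenizer.free(previous)`.
        self.index = snapshot.index;
    }

    // Try `candidates` in order; keep the first one that succeeds.
    fn attempt_n(&mut self, candidates: &[fn(&mut Parser) -> bool]) -> bool {
        for candidate in candidates {
            let previous = self.capture();
            if candidate(self) {
                return true;
            }
            self.free(previous);
        }
        false
    }
}

fn digit(parser: &mut Parser) -> bool {
    match parser.bytes.get(parser.index) {
        Some(byte) if byte.is_ascii_digit() => {
            parser.index += 1;
            true
        }
        _ => false,
    }
}

fn letter(parser: &mut Parser) -> bool {
    match parser.bytes.get(parser.index) {
        Some(byte) if byte.is_ascii_alphabetic() => {
            parser.index += 1;
            true
        }
        _ => false,
    }
}

fn main() {
    let mut parser = Parser { bytes: b"a1".to_vec(), index: 0 };
    // `digit` fails and is reverted; `letter` then matches `a`.
    assert!(parser.attempt_n(&[digit, letter]));
    assert_eq!(parser.index, 1);
}
```

The real `capture`/`free` pair also restores events and the token stack, but the control flow is the same: failure never leaves a partial parse behind.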
@@ -738,30 +1323,40 @@ impl<'a> Tokenizer<'a> {
     /// This is set up to support repeatedly calling `feed`, and thus streaming
     /// markdown into the state machine, and normally pauses after feeding.
     // Note: if needed: accept `vs`?
-    pub fn push(
-        &mut self,
-        min: usize,
-        max: usize,
-        start: impl FnOnce(&mut Tokenizer) -> State + 'static,
-    ) -> State {
+    pub fn push(&mut self, min: usize, max: usize, state_name: StateName) -> State {
         debug_assert!(!self.resolved, "cannot feed after drain");
-        debug_assert!(min >= self.point.index, "cannot move backwards");
-        self.move_to((min, 0));
+        // debug_assert!(min >= self.point.index, "cannot move backwards");
+        if min > self.point.index {
+            self.move_to((min, 0));
+        }
 
-        let mut state = State::Fn(Box::new(start));
+        let mut state = State::Fn(state_name);
 
         while self.point.index < max {
             match state {
-                State::Ok | State::Nok => break,
-                State::Fn(func) => match byte_action(self.parse_state.bytes, &self.point) {
+                State::Ok | State::Nok => {
+                    if let Some(attempt) = self.attempts.pop() {
+                        let done = attempt.done;
+                        self.consumed = true;
+                        state = done(self, state);
+                    } else {
+                        break;
+                    }
+                }
+                State::Fn(state_name) => match byte_action(self.parse_state.bytes, &self.point) {
                     ByteAction::Ignore => {
-                        state = State::Fn(Box::new(func));
+                        state = State::Fn(state_name);
                         self.move_one();
                     }
                     ByteAction::Insert(byte) | ByteAction::Normal(byte) => {
-                        log::debug!("main: passing: `{:?}` ({:?})", byte, self.point);
+                        log::debug!(
+                            "main: passing: `{:?}` ({:?}) to {:?}",
+                            byte,
+                            self.point,
+                            state_name
+                        );
                         self.expect(Some(byte));
-                        state = func(self);
+                        state = call_impl(self, state_name);
                     }
                 },
             }
@@ -778,8 +1373,16 @@ impl<'a> Tokenizer<'a> {
 
         loop {
             match state {
-                State::Ok | State::Nok => break,
-                State::Fn(func) => {
+                State::Ok | State::Nok => {
+                    if let Some(attempt) = self.attempts.pop() {
+                        let done = attempt.done;
+                        self.consumed = true;
+                        state = done(self, state);
+                    } else {
+                        break;
+                    }
+                }
+                State::Fn(state_name) => {
                     // We sometimes move back when flushing, so then we use those codes.
                     let action = if self.point.index == max {
                         None
@@ -788,7 +1391,7 @@ impl<'a> Tokenizer<'a> {
                     };
 
                     if let Some(ByteAction::Ignore) = action {
-                        state = State::Fn(Box::new(func));
+                        state = State::Fn(state_name);
                         self.move_one();
                     } else {
                         let byte =
@@ -800,14 +1403,20 @@ impl<'a> Tokenizer<'a> {
                             None
                         };
 
-                        log::debug!("main: flushing: `{:?}` ({:?})", byte, self.point);
+                        log::debug!(
+                            "main: flushing: `{:?}` ({:?}) to {:?}",
+                            byte,
+                            self.point,
+                            state_name
+                        );
                         self.expect(byte);
-                        state = func(self);
+                        state = call_impl(self, state_name);
                     }
                 }
             }
         }
 
+        self.consumed = true;
         debug_assert!(matches!(state, State::Ok), "must be ok");
 
         if resolve {
@@ -869,80 +1478,29 @@ fn byte_action(bytes: &[u8], point: &Point) -> ByteAction {
 /// Recurses into itself.
 /// Used in [`Tokenizer::attempt`][Tokenizer::attempt] and [`Tokenizer::check`][Tokenizer::check].
 fn attempt_impl(
-    state: impl FnOnce(&mut Tokenizer) -> State + 'static,
-    pause: Option<Box<dyn Fn(Option<u8>) -> bool + 'static>>,
-    start: usize,
-    done: impl FnOnce(&mut Tokenizer, State) -> State + 'static,
-) -> Box<StateFn> {
-    Box::new(move |tokenizer| {
-        if let Some(ref func) = pause {
-            if tokenizer.point.index > start && func(tokenizer.previous) {
-                return done(tokenizer, State::Fn(Box::new(state)));
-            }
-        }
+    tokenizer: &mut Tokenizer,
+    state_name: StateName,
+    done: Box<impl FnOnce(&mut Tokenizer, State) -> State + 'static>,
+) -> State {
+    tokenizer.attempts.push(Attempt { done });
+    call_impl(tokenizer, state_name)
+}
 
-        let state = state(tokenizer);
-
-        match state {
-            State::Ok | State::Nok => {
-                if tokenizer.attempt_balance == 0 {
-                    debug_assert!(!tokenizer.tokenize_state.connect);
-                    debug_assert_eq!(tokenizer.tokenize_state.document_continued, 0);
-                    debug_assert_eq!(tokenizer.tokenize_state.document_index, 0);
-                    debug_assert!(!tokenizer.tokenize_state.document_interrupt_before);
-                    debug_assert!(!tokenizer.tokenize_state.document_paragraph_before);
-                    debug_assert_eq!(tokenizer.tokenize_state.marker, 0);
-                    debug_assert_eq!(tokenizer.tokenize_state.marker_other, 0);
-                    debug_assert_eq!(tokenizer.tokenize_state.prefix, 0);
-                    debug_assert!(!tokenizer.tokenize_state.seen);
-                    debug_assert_eq!(tokenizer.tokenize_state.size, 0);
-                    debug_assert_eq!(tokenizer.tokenize_state.size_other, 0);
-                    debug_assert_eq!(tokenizer.tokenize_state.stop.len(), 0);
-                    debug_assert_eq!(tokenizer.tokenize_state.start, 0);
-                    debug_assert_eq!(tokenizer.tokenize_state.end, 0);
-                    debug_assert!(tokenizer.tokenize_state.return_state.is_none());
-                    debug_assert!(!tokenizer.tokenize_state.space_or_tab_eol_connect);
-                    debug_assert!(!tokenizer.tokenize_state.space_or_tab_eol_ok);
-                    debug_assert!(tokenizer
-                        .tokenize_state
-                        .space_or_tab_eol_content_type
-                        .is_none());
-                    debug_assert!(!tokenizer.tokenize_state.space_or_tab_connect);
-                    debug_assert!(tokenizer.tokenize_state.space_or_tab_content_type.is_none());
-                    debug_assert_eq!(tokenizer.tokenize_state.space_or_tab_min, 0);
-                    debug_assert_eq!(tokenizer.tokenize_state.space_or_tab_max, 0);
-                    debug_assert_eq!(tokenizer.tokenize_state.space_or_tab_size, 0);
-                    debug_assert_eq!(
-                        tokenizer.tokenize_state.space_or_tab_token,
-                        Token::SpaceOrTab
-                    );
-                    debug_assert_eq!(tokenizer.tokenize_state.token_1, Token::Data);
-                    debug_assert_eq!(tokenizer.tokenize_state.token_2, Token::Data);
-                    debug_assert_eq!(tokenizer.tokenize_state.token_3, Token::Data);
-                    debug_assert_eq!(tokenizer.tokenize_state.token_4, Token::Data);
-                    debug_assert_eq!(tokenizer.tokenize_state.token_5, Token::Data);
-                }
+#[allow(clippy::too_many_lines)]
+fn call_impl(tokenizer: &mut Tokenizer, state_name: StateName) -> State {
+    let func = state_name.to_func();
 
-                done(tokenizer, state)
-            }
-            State::Fn(func) => State::Fn(attempt_impl(func, pause, start, done)),
-        }
-    })
+    func(tokenizer)
 }
 
-/// Flush `start`: pass `eof`s to it until done.
-/// Define a jump between two places.
-///
-/// This defines to which future index we move after a line ending.
-fn define_skip_impl(tokenizer: &mut Tokenizer, line: usize, info: (usize, usize)) {
-    log::debug!("position: define skip: {:?} -> ({:?})", line, info);
-    let at = line - tokenizer.first_line;
-
-    if at >= tokenizer.column_start.len() {
-        tokenizer.column_start.push(info);
-    } else {
-        tokenizer.column_start[at] = info;
+fn move_point_back(tokenizer: &mut Tokenizer, point: &mut Point) {
+    // Move back past ignored bytes.
+    while point.index > 0 {
+        point.index -= 1;
+        let action = byte_action(tokenizer.parse_state.bytes, point);
+        if !matches!(action, ByteAction::Ignore) {
+            point.index += 1;
+            break;
+        }
     }
-
-    tokenizer.account_for_potential_skip();
 }
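Taken together, `attempt_impl`, `call_impl`, and the feed loop in `push` replace closure nesting with an explicit stack: `attempt` pushes a `done` callback, and whenever a state resolves to `Ok` or `Nok`, the loop pops the most recent callback and lets it choose the next state. A minimal model of that flow (hypothetical and simplified, not the real API):

```rust
// `attempt_impl` pushes a callback instead of wrapping state functions,
// and the feed loop pops one callback per surfaced final state.

#[derive(Clone, Copy, Debug, PartialEq)]
enum State {
    Fn(u8), // stand-in for a `StateName`
    Ok,
    Nok,
}

struct Attempt {
    done: Box<dyn FnOnce(State) -> State>,
}

fn main() {
    let mut attempts: Vec<Attempt> = Vec::new();

    // An `attempt`: on success, continue with state `1`; otherwise fail.
    attempts.push(Attempt {
        done: Box::new(|state| {
            if state == State::Ok {
                State::Fn(1)
            } else {
                State::Nok
            }
        }),
    });

    // Feed-loop fragment: final states are handed to the latest `done`,
    // which may turn them back into a next state to keep running.
    let mut state = State::Ok;
    while matches!(state, State::Ok | State::Nok) {
        match attempts.pop() {
            Some(attempt) => state = (attempt.done)(state),
            None => break,
        }
    }

    assert_eq!(state, State::Fn(1));
}
```

Keeping pending attempts in a `Vec<Attempt>` on the tokenizer is also what removes the old `attempt_balance` bookkeeping: the stack depth *is* the balance, and recursion depth no longer grows with the number of nested attempts.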