25 files changed, 1285 insertions, 1148 deletions
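The change is one mechanical refactor applied across all 25 files: the per-construct `Info` structs and extra `size`/`marker` parameters, which forced every state function to be boxed as an allocating `move` closure, move into fields on a shared `tokenizer.tokenize_state`, so states become plain `fn(&mut Tokenizer) -> State` items and each construct resets its fields on every `State::Ok`/`State::Nok` exit. Below is a minimal sketch of the before/after shape; the trimmed-down `Tokenizer`, `TokenizeState`, and `State` types are stand-ins for illustration, not the crate's real definitions.

```rust
// A sketch only (not code from this commit): trimmed-down stand-ins for
// the crate's `Tokenizer`, `State`, and `TokenizeState` types.

struct TokenizeState {
    /// Which byte the current sequence is built from (`0` = unset).
    marker: u8,
}

struct Tokenizer {
    current: Option<u8>,
    tokenize_state: TokenizeState,
}

enum State {
    Ok,
    Fn(Box<dyn FnOnce(&mut Tokenizer) -> State>),
}

// Before: local state forces an allocating, capturing `move` closure
// on every step.
fn inside_old(tokenizer: &mut Tokenizer, marker: u8) -> State {
    match tokenizer.current {
        Some(byte) if byte == marker => State::Fn(Box::new(move |t| inside_old(t, marker))),
        _ => State::Ok,
    }
}

// After: the state lives on the tokenizer, so the plain `fn` item can be
// boxed directly, and the construct resets its field when it is done.
fn inside_new(tokenizer: &mut Tokenizer) -> State {
    match tokenizer.current {
        Some(byte) if byte == tokenizer.tokenize_state.marker => State::Fn(Box::new(inside_new)),
        _ => {
            tokenizer.tokenize_state.marker = 0;
            State::Ok
        }
    }
}

fn main() {
    let mut t = Tokenizer {
        current: Some(b'='),
        tokenize_state: TokenizeState { marker: b'=' },
    };
    // Drive the new shape one step, then exhaust the input.
    let mut state = inside_new(&mut t);
    if let State::Fn(next) = state {
        t.current = None;
        state = next(&mut t);
    }
    assert!(matches!(state, State::Ok));
    // The old shape still works, it just allocates a closure per step.
    assert!(matches!(inside_old(&mut t, b'-'), State::Ok));
}
```

Because `tokenize_state` is shared by whichever construct is currently running, the explicit resets (`marker = 0`, `size = 0`, and so on) that the diff adds on every exit path are what keep one construct's leftovers from leaking into the next.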
diff --git a/src/construct/attention.rs b/src/construct/attention.rs index 583fde2..fc2acfb 100644 --- a/src/construct/attention.rs +++ b/src/construct/attention.rs @@ -118,8 +118,9 @@ struct Sequence { pub fn start(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { Some(b'*' | b'_') if tokenizer.parse_state.constructs.attention => { + tokenizer.tokenize_state.marker = tokenizer.current.unwrap(); tokenizer.enter(Token::AttentionSequence); - inside(tokenizer, tokenizer.current.unwrap()) + inside(tokenizer) } _ => State::Nok, } @@ -131,15 +132,16 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { /// > | ** /// ^^ /// ``` -fn inside(tokenizer: &mut Tokenizer, marker: u8) -> State { +fn inside(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Some(b'*' | b'_') if tokenizer.current.unwrap() == marker => { + Some(b'*' | b'_') if tokenizer.current.unwrap() == tokenizer.tokenize_state.marker => { tokenizer.consume(); - State::Fn(Box::new(move |t| inside(t, marker))) + State::Fn(Box::new(inside)) } _ => { tokenizer.exit(Token::AttentionSequence); tokenizer.register_resolver("attention".to_string(), Box::new(resolve_attention)); + tokenizer.tokenize_state.marker = b'\0'; State::Ok } } diff --git a/src/construct/autolink.rs b/src/construct/autolink.rs index bac291e..1444c61 100644 --- a/src/construct/autolink.rs +++ b/src/construct/autolink.rs @@ -158,7 +158,9 @@ fn scheme_or_email_atext(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { // ASCII alphanumeric and `+`, `-`, and `.`. Some(b'+' | b'-' | b'.' | b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z') => { - scheme_inside_or_email_atext(tokenizer, 1) + // Count the previous alphabetical from `open` too. + tokenizer.tokenize_state.size = 1; + scheme_inside_or_email_atext(tokenizer) } _ => email_atext(tokenizer), } @@ -172,20 +174,25 @@ fn scheme_or_email_atext(tokenizer: &mut Tokenizer) -> State { /// > | a<user@example.com>b /// ^ /// ``` -fn scheme_inside_or_email_atext(tokenizer: &mut Tokenizer, size: usize) -> State { +fn scheme_inside_or_email_atext(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { Some(b':') => { tokenizer.consume(); + tokenizer.tokenize_state.size = 0; State::Fn(Box::new(url_inside)) } // ASCII alphanumeric and `+`, `-`, and `.`. Some(b'+' | b'-' | b'.' | b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z') - if size < AUTOLINK_SCHEME_SIZE_MAX => + if tokenizer.tokenize_state.size < AUTOLINK_SCHEME_SIZE_MAX => { + tokenizer.tokenize_state.size += 1; tokenizer.consume(); - State::Fn(Box::new(move |t| scheme_inside_or_email_atext(t, size + 1))) + State::Fn(Box::new(scheme_inside_or_email_atext)) + } + _ => { + tokenizer.tokenize_state.size = 0; + email_atext(tokenizer) } - _ => email_atext(tokenizer), } } @@ -220,7 +227,7 @@ fn email_atext(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { Some(b'@') => { tokenizer.consume(); - State::Fn(Box::new(|t| email_at_sign_or_dot(t, 0))) + State::Fn(Box::new(email_at_sign_or_dot)) } // ASCII atext. // @@ -255,10 +262,10 @@ fn email_atext(tokenizer: &mut Tokenizer) -> State { /// > | a<user.name@example.com>b /// ^ ^ /// ``` -fn email_at_sign_or_dot(tokenizer: &mut Tokenizer, size: usize) -> State { +fn email_at_sign_or_dot(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { // ASCII alphanumeric. 
- Some(b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z') => email_value(tokenizer, size), + Some(b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z') => email_value(tokenizer), _ => State::Nok, } } @@ -269,13 +276,15 @@ fn email_at_sign_or_dot(tokenizer: &mut Tokenizer, size: usize) -> State { /// > | a<user.name@example.com>b /// ^ /// ``` -fn email_label(tokenizer: &mut Tokenizer, size: usize) -> State { +fn email_label(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { Some(b'.') => { + tokenizer.tokenize_state.size = 0; tokenizer.consume(); - State::Fn(Box::new(|t| email_at_sign_or_dot(t, 0))) + State::Fn(Box::new(email_at_sign_or_dot)) } Some(b'>') => { + tokenizer.tokenize_state.size = 0; let index = tokenizer.events.len(); tokenizer.exit(Token::AutolinkProtocol); // Change the token type. @@ -283,7 +292,7 @@ fn email_label(tokenizer: &mut Tokenizer, size: usize) -> State { tokenizer.events[index].token_type = Token::AutolinkEmail; end(tokenizer) } - _ => email_value(tokenizer, size), + _ => email_value(tokenizer), } } @@ -295,19 +304,25 @@ fn email_label(tokenizer: &mut Tokenizer, size: usize) -> State { /// > | a<user.name@ex-ample.com>b /// ^ /// ``` -fn email_value(tokenizer: &mut Tokenizer, size: usize) -> State { +fn email_value(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { // ASCII alphanumeric or `-`. - Some(b'-' | b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z') if size < AUTOLINK_DOMAIN_SIZE_MAX => { + Some(b'-' | b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z') + if tokenizer.tokenize_state.size < AUTOLINK_DOMAIN_SIZE_MAX => + { let func = if matches!(tokenizer.current, Some(b'-')) { email_value } else { email_label }; + tokenizer.tokenize_state.size += 1; tokenizer.consume(); - State::Fn(Box::new(move |t| func(t, size + 1))) + State::Fn(Box::new(func)) + } + _ => { + tokenizer.tokenize_state.size = 0; + State::Nok } - _ => State::Nok, } } diff --git a/src/construct/character_reference.rs b/src/construct/character_reference.rs index 9393691..7cc74ba 100644 --- a/src/construct/character_reference.rs +++ b/src/construct/character_reference.rs @@ -69,17 +69,6 @@ use crate::token::Token; use crate::tokenizer::{State, Tokenizer}; use crate::util::slice::Slice; -/// State needed to parse character references. -#[derive(Debug, Clone)] -struct Info { - /// Index of where value starts. - start: usize, - /// Marker of character reference. - marker: u8, - /// Maximum number of characters in the value for this kind. - max: usize, -} - /// Start of a character reference. 
/// /// ```markdown @@ -121,15 +110,9 @@ fn open(tokenizer: &mut Tokenizer) -> State { tokenizer.exit(Token::CharacterReferenceMarkerNumeric); State::Fn(Box::new(numeric)) } else { + tokenizer.tokenize_state.marker = b'&'; tokenizer.enter(Token::CharacterReferenceValue); - value( - tokenizer, - Info { - start: tokenizer.point.index, - marker: b'&', - max: CHARACTER_REFERENCE_NAMED_SIZE_MAX, - }, - ) + value(tokenizer) } } @@ -148,20 +131,12 @@ fn numeric(tokenizer: &mut Tokenizer) -> State { tokenizer.consume(); tokenizer.exit(Token::CharacterReferenceMarkerHexadecimal); tokenizer.enter(Token::CharacterReferenceValue); - let info = Info { - start: tokenizer.point.index, - marker: b'x', - max: CHARACTER_REFERENCE_HEXADECIMAL_SIZE_MAX, - }; - State::Fn(Box::new(|t| value(t, info))) + tokenizer.tokenize_state.marker = b'x'; + State::Fn(Box::new(value)) } else { tokenizer.enter(Token::CharacterReferenceValue); - let info = Info { - start: tokenizer.point.index, - marker: b'#', - max: CHARACTER_REFERENCE_DECIMAL_SIZE_MAX, - }; - value(tokenizer, info) + tokenizer.tokenize_state.marker = b'#'; + value(tokenizer) } } @@ -179,50 +154,57 @@ fn numeric(tokenizer: &mut Tokenizer) -> State { /// > | a	b /// ^ /// ``` -fn value(tokenizer: &mut Tokenizer, info: Info) -> State { - let size = tokenizer.point.index - info.start; +fn value(tokenizer: &mut Tokenizer) -> State { + if matches!(tokenizer.current, Some(b';')) && tokenizer.tokenize_state.size > 0 { + // Named. + if tokenizer.tokenize_state.marker == b'&' { + // Guaranteed to be valid ASCII bytes. + let slice = Slice::from_indices( + tokenizer.parse_state.bytes, + tokenizer.point.index - tokenizer.tokenize_state.size, + tokenizer.point.index, + ); + let name = slice.as_str(); - match tokenizer.current { - Some(b';') if size > 0 => { - // Named. - if info.marker == b'&' { - // Guaranteed to be valid ASCII bytes. - let slice = Slice::from_indices( - tokenizer.parse_state.bytes, - info.start, - tokenizer.point.index, - ); - let name = slice.as_str(); - - if !CHARACTER_REFERENCES.iter().any(|d| d.0 == name) { - return State::Nok; - } + if !CHARACTER_REFERENCES.iter().any(|d| d.0 == name) { + tokenizer.tokenize_state.marker = 0; + tokenizer.tokenize_state.size = 0; + return State::Nok; } - - tokenizer.exit(Token::CharacterReferenceValue); - tokenizer.enter(Token::CharacterReferenceMarkerSemi); - tokenizer.consume(); - tokenizer.exit(Token::CharacterReferenceMarkerSemi); - tokenizer.exit(Token::CharacterReference); - State::Ok - } - // ASCII digit, for named, decimal, and hexadecimal references. - Some(b'0'..=b'9') if size < info.max => { - tokenizer.consume(); - State::Fn(Box::new(|t| value(t, info))) } - // ASCII hex letters, for named and hexadecimal references. - Some(b'A'..=b'F' | b'a'..=b'f') - if matches!(info.marker, b'&' | b'x') && size < info.max => - { - tokenizer.consume(); - State::Fn(Box::new(|t| value(t, info))) - } - // Non-hex ASCII alphabeticals, for named references. 
- Some(b'G'..=b'Z' | b'g'..=b'z') if info.marker == b'&' && size < info.max => { + + tokenizer.exit(Token::CharacterReferenceValue); + tokenizer.enter(Token::CharacterReferenceMarkerSemi); + tokenizer.consume(); + tokenizer.exit(Token::CharacterReferenceMarkerSemi); + tokenizer.exit(Token::CharacterReference); + tokenizer.tokenize_state.marker = 0; + tokenizer.tokenize_state.size = 0; + return State::Ok; + } + + let max = match tokenizer.tokenize_state.marker { + b'&' => CHARACTER_REFERENCE_NAMED_SIZE_MAX, + b'x' => CHARACTER_REFERENCE_HEXADECIMAL_SIZE_MAX, + b'#' => CHARACTER_REFERENCE_DECIMAL_SIZE_MAX, + _ => unreachable!("Unexpected marker `{}`", tokenizer.tokenize_state.marker), + }; + let test = match tokenizer.tokenize_state.marker { + b'&' => u8::is_ascii_alphanumeric, + b'x' => u8::is_ascii_hexdigit, + b'#' => u8::is_ascii_digit, + _ => unreachable!("Unexpected marker `{}`", tokenizer.tokenize_state.marker), + }; + + if let Some(byte) = tokenizer.current { + if tokenizer.tokenize_state.size < max && test(&byte) { + tokenizer.tokenize_state.size += 1; tokenizer.consume(); - State::Fn(Box::new(|t| value(t, info))) + return State::Fn(Box::new(value)); } - _ => State::Nok, } + + tokenizer.tokenize_state.marker = 0; + tokenizer.tokenize_state.size = 0; + State::Nok } diff --git a/src/construct/code_fenced.rs b/src/construct/code_fenced.rs index c4c3e86..a22a0f9 100644 --- a/src/construct/code_fenced.rs +++ b/src/construct/code_fenced.rs @@ -110,18 +110,6 @@ use crate::token::Token; use crate::tokenizer::{ContentType, State, Tokenizer}; use crate::util::slice::{Position, Slice}; -/// State needed to parse code (fenced). -#[derive(Debug, Clone)] -struct Info { - /// Number of markers on the opening fence sequence. - size: usize, - /// Number of tabs or spaces of indentation before the opening fence - /// sequence. - prefix: usize, - /// Marker of fences (`u8`). - marker: u8, -} - /// Start of fenced code. 
/// /// ```markdown @@ -173,15 +161,10 @@ fn before_sequence_open(tokenizer: &mut Tokenizer) -> State { } if let Some(b'`' | b'~') = tokenizer.current { + tokenizer.tokenize_state.marker = tokenizer.current.unwrap(); + tokenizer.tokenize_state.prefix = prefix; tokenizer.enter(Token::CodeFencedFenceSequence); - sequence_open( - tokenizer, - Info { - prefix, - size: 0, - marker: tokenizer.current.unwrap(), - }, - ) + sequence_open(tokenizer) } else { State::Nok } @@ -195,20 +178,23 @@ fn before_sequence_open(tokenizer: &mut Tokenizer) -> State { /// | console.log(1) /// | ~~~ /// ``` -fn sequence_open(tokenizer: &mut Tokenizer, mut info: Info) -> State { +fn sequence_open(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Some(b'`' | b'~') if tokenizer.current.unwrap() == info.marker => { + Some(b'`' | b'~') if tokenizer.current.unwrap() == tokenizer.tokenize_state.marker => { + tokenizer.tokenize_state.size += 1; tokenizer.consume(); - State::Fn(Box::new(|t| { - info.size += 1; - sequence_open(t, info) - })) + State::Fn(Box::new(sequence_open)) } - _ if info.size >= CODE_FENCED_SEQUENCE_SIZE_MIN => { + _ if tokenizer.tokenize_state.size >= CODE_FENCED_SEQUENCE_SIZE_MIN => { tokenizer.exit(Token::CodeFencedFenceSequence); - tokenizer.attempt_opt(space_or_tab(), |t| info_before(t, info))(tokenizer) + tokenizer.attempt_opt(space_or_tab(), info_before)(tokenizer) + } + _ => { + tokenizer.tokenize_state.marker = 0; + tokenizer.tokenize_state.prefix = 0; + tokenizer.tokenize_state.size = 0; + State::Nok } - _ => State::Nok, } } @@ -220,18 +206,18 @@ fn sequence_open(tokenizer: &mut Tokenizer, mut info: Info) -> State { /// | console.log(1) /// | ~~~ /// ``` -fn info_before(tokenizer: &mut Tokenizer, info: Info) -> State { +fn info_before(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { None | Some(b'\n') => { tokenizer.exit(Token::CodeFencedFence); // Do not form containers. tokenizer.concrete = true; - at_break(tokenizer, info) + at_break(tokenizer) } _ => { tokenizer.enter(Token::CodeFencedFenceInfo); tokenizer.enter_with_content(Token::Data, Some(ContentType::String)); - info_inside(tokenizer, info) + info_inside(tokenizer) } } } @@ -244,7 +230,7 @@ fn info_before(tokenizer: &mut Tokenizer, info: Info) -> State { /// | console.log(1) /// | ~~~ /// ``` -fn info_inside(tokenizer: &mut Tokenizer, info: Info) -> State { +fn info_inside(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { None | Some(b'\n') => { tokenizer.exit(Token::Data); @@ -252,17 +238,23 @@ fn info_inside(tokenizer: &mut Tokenizer, info: Info) -> State { tokenizer.exit(Token::CodeFencedFence); // Do not form containers. 
tokenizer.concrete = true; - at_break(tokenizer, info) + at_break(tokenizer) } Some(b'\t' | b' ') => { tokenizer.exit(Token::Data); tokenizer.exit(Token::CodeFencedFenceInfo); - tokenizer.attempt_opt(space_or_tab(), |t| meta_before(t, info))(tokenizer) + tokenizer.attempt_opt(space_or_tab(), meta_before)(tokenizer) + } + Some(b'`') if tokenizer.tokenize_state.marker == b'`' => { + tokenizer.concrete = false; + tokenizer.tokenize_state.marker = 0; + tokenizer.tokenize_state.prefix = 0; + tokenizer.tokenize_state.size = 0; + State::Nok } - Some(b'`') if info.marker == b'`' => State::Nok, Some(_) => { tokenizer.consume(); - State::Fn(Box::new(|t| info_inside(t, info))) + State::Fn(Box::new(info_inside)) } } } @@ -275,18 +267,18 @@ fn info_inside(tokenizer: &mut Tokenizer, info: Info) -> State { /// | console.log(1) /// | ~~~ /// ``` -fn meta_before(tokenizer: &mut Tokenizer, info: Info) -> State { +fn meta_before(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { None | Some(b'\n') => { tokenizer.exit(Token::CodeFencedFence); // Do not form containers. tokenizer.concrete = true; - at_break(tokenizer, info) + at_break(tokenizer) } _ => { tokenizer.enter(Token::CodeFencedFenceMeta); tokenizer.enter_with_content(Token::Data, Some(ContentType::String)); - meta(tokenizer, info) + meta(tokenizer) } } } @@ -299,7 +291,7 @@ fn meta_before(tokenizer: &mut Tokenizer, info: Info) -> State { /// | console.log(1) /// | ~~~ /// ``` -fn meta(tokenizer: &mut Tokenizer, info: Info) -> State { +fn meta(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { None | Some(b'\n') => { tokenizer.exit(Token::Data); @@ -307,12 +299,18 @@ fn meta(tokenizer: &mut Tokenizer, info: Info) -> State { tokenizer.exit(Token::CodeFencedFence); // Do not form containers. tokenizer.concrete = true; - at_break(tokenizer, info) + at_break(tokenizer) + } + Some(b'`') if tokenizer.tokenize_state.marker == b'`' => { + tokenizer.concrete = false; + tokenizer.tokenize_state.marker = 0; + tokenizer.tokenize_state.prefix = 0; + tokenizer.tokenize_state.size = 0; + State::Nok } - Some(b'`') if info.marker == b'`' => State::Nok, _ => { tokenizer.consume(); - State::Fn(Box::new(|t| meta(t, info))) + State::Fn(Box::new(meta)) } } } @@ -326,13 +324,9 @@ fn meta(tokenizer: &mut Tokenizer, info: Info) -> State { /// ^ /// | ~~~ /// ``` -fn at_break(tokenizer: &mut Tokenizer, info: Info) -> State { +fn at_break(tokenizer: &mut Tokenizer) -> State { tokenizer.check(partial_non_lazy_continuation, |ok| { - if ok { - Box::new(move |t| at_non_lazy_break(t, info)) - } else { - Box::new(after) - } + Box::new(if ok { at_non_lazy_break } else { after }) })(tokenizer) } @@ -345,19 +339,10 @@ fn at_break(tokenizer: &mut Tokenizer, info: Info) -> State { /// ^ /// | ~~~ /// ``` -fn at_non_lazy_break(tokenizer: &mut Tokenizer, info: Info) -> State { - let clone = info.clone(); - - tokenizer.attempt( - |t| close_begin(t, info), - |ok| { - if ok { - Box::new(after) - } else { - Box::new(|t| content_before(t, clone)) - } - }, - )(tokenizer) +fn at_non_lazy_break(tokenizer: &mut Tokenizer) -> State { + tokenizer.attempt(close_begin, |ok| { + Box::new(if ok { after } else { content_before }) + })(tokenizer) } /// Before a closing fence, at the line ending. 
@@ -368,13 +353,13 @@ fn at_non_lazy_break(tokenizer: &mut Tokenizer, info: Info) -> State { /// ^ /// | ~~~ /// ``` -fn close_begin(tokenizer: &mut Tokenizer, info: Info) -> State { +fn close_begin(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { Some(b'\n') => { tokenizer.enter(Token::LineEnding); tokenizer.consume(); tokenizer.exit(Token::LineEnding); - State::Fn(Box::new(|t| close_start(t, info))) + State::Fn(Box::new(close_start)) } _ => unreachable!("expected eol"), } @@ -388,7 +373,7 @@ fn close_begin(tokenizer: &mut Tokenizer, info: Info) -> State { /// > | ~~~ /// ^ /// ``` -fn close_start(tokenizer: &mut Tokenizer, info: Info) -> State { +fn close_start(tokenizer: &mut Tokenizer) -> State { tokenizer.enter(Token::CodeFencedFence); tokenizer.go( space_or_tab_min_max( @@ -399,7 +384,7 @@ fn close_start(tokenizer: &mut Tokenizer, info: Info) -> State { usize::MAX }, ), - |t| close_before(t, info), + close_before, )(tokenizer) } @@ -411,11 +396,11 @@ fn close_start(tokenizer: &mut Tokenizer, info: Info) -> State { /// > | ~~~ /// ^ /// ``` -fn close_before(tokenizer: &mut Tokenizer, info: Info) -> State { +fn close_before(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Some(b'`' | b'~') if tokenizer.current.unwrap() == info.marker => { + Some(b'`' | b'~') if tokenizer.current.unwrap() == tokenizer.tokenize_state.marker => { tokenizer.enter(Token::CodeFencedFenceSequence); - close_sequence(tokenizer, info, 0) + close_sequence(tokenizer) } _ => State::Nok, } @@ -429,17 +414,24 @@ fn close_before(tokenizer: &mut Tokenizer, info: Info) -> State { /// > | ~~~ /// ^ /// ``` -fn close_sequence(tokenizer: &mut Tokenizer, info: Info, size: usize) -> State { +fn close_sequence(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Some(b'`' | b'~') if tokenizer.current.unwrap() == info.marker => { + Some(b'`' | b'~') if tokenizer.current.unwrap() == tokenizer.tokenize_state.marker => { + tokenizer.tokenize_state.size_other += 1; tokenizer.consume(); - State::Fn(Box::new(move |t| close_sequence(t, info, size + 1))) + State::Fn(Box::new(close_sequence)) } - _ if size >= CODE_FENCED_SEQUENCE_SIZE_MIN && size >= info.size => { + _ if tokenizer.tokenize_state.size_other >= CODE_FENCED_SEQUENCE_SIZE_MIN + && tokenizer.tokenize_state.size_other >= tokenizer.tokenize_state.size => + { + tokenizer.tokenize_state.size_other = 0; tokenizer.exit(Token::CodeFencedFenceSequence); tokenizer.attempt_opt(space_or_tab(), close_sequence_after)(tokenizer) } - _ => State::Nok, + _ => { + tokenizer.tokenize_state.size_other = 0; + State::Nok + } } } @@ -469,11 +461,11 @@ fn close_sequence_after(tokenizer: &mut Tokenizer) -> State { /// ^ /// | ~~~ /// ``` -fn content_before(tokenizer: &mut Tokenizer, info: Info) -> State { +fn content_before(tokenizer: &mut Tokenizer) -> State { tokenizer.enter(Token::LineEnding); tokenizer.consume(); tokenizer.exit(Token::LineEnding); - State::Fn(Box::new(|t| content_start(t, info))) + State::Fn(Box::new(content_start)) } /// Before code content, definitely not before a closing fence. 
/// @@ -483,10 +475,11 @@ fn content_before(tokenizer: &mut Tokenizer, info: Info) -> State { /// ^ /// | ~~~ /// ``` -fn content_start(tokenizer: &mut Tokenizer, info: Info) -> State { - tokenizer.go(space_or_tab_min_max(0, info.prefix), |t| { - content_begin(t, info) - })(tokenizer) +fn content_start(tokenizer: &mut Tokenizer) -> State { + tokenizer.go( + space_or_tab_min_max(0, tokenizer.tokenize_state.prefix), + content_begin, + )(tokenizer) } /// Before code content, after a prefix. @@ -497,12 +490,12 @@ fn content_start(tokenizer: &mut Tokenizer, info: Info) -> State { /// ^ /// | ~~~ /// ``` -fn content_begin(tokenizer: &mut Tokenizer, info: Info) -> State { +fn content_begin(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - None | Some(b'\n') => at_break(tokenizer, info), + None | Some(b'\n') => at_break(tokenizer), _ => { tokenizer.enter(Token::CodeFlowChunk); - content_continue(tokenizer, info) + content_continue(tokenizer) } } } @@ -515,15 +508,15 @@ fn content_begin(tokenizer: &mut Tokenizer, info: Info) -> State { /// ^^^^^^^^^^^^^^ /// | ~~~ /// ``` -fn content_continue(tokenizer: &mut Tokenizer, info: Info) -> State { +fn content_continue(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { None | Some(b'\n') => { tokenizer.exit(Token::CodeFlowChunk); - at_break(tokenizer, info) + at_break(tokenizer) } _ => { tokenizer.consume(); - State::Fn(Box::new(|t| content_continue(t, info))) + State::Fn(Box::new(content_continue)) } } } @@ -538,6 +531,9 @@ fn content_continue(tokenizer: &mut Tokenizer, info: Info) -> State { /// ``` fn after(tokenizer: &mut Tokenizer) -> State { tokenizer.exit(Token::CodeFenced); + tokenizer.tokenize_state.marker = 0; + tokenizer.tokenize_state.prefix = 0; + tokenizer.tokenize_state.size = 0; // Feel free to interrupt. tokenizer.interrupt = false; // No longer concrete. 
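In the `code_fenced` hunks above, the opening sequence length now lives in `tokenize_state.size` and the candidate closing sequence is counted in `tokenize_state.size_other`; the guard `size_other >= CODE_FENCED_SEQUENCE_SIZE_MIN && size_other >= size` encodes the CommonMark rule that a closing fence needs at least three markers and no fewer than the opening fence. A standalone sketch of just that rule follows; `closes_fence` is a made-up helper name, and the constant's value of 3 matches CommonMark's minimum.

```rust
/// CommonMark's minimum fence length (the value behind the crate's
/// `CODE_FENCED_SEQUENCE_SIZE_MIN`).
const CODE_FENCED_SEQUENCE_SIZE_MIN: usize = 3;

/// Hypothetical helper: does a run of `close_len` markers close a fence
/// opened with `open_len` markers of the same byte?
fn closes_fence(open_len: usize, close_len: usize) -> bool {
    close_len >= CODE_FENCED_SEQUENCE_SIZE_MIN && close_len >= open_len
}

fn main() {
    assert!(closes_fence(3, 3)); // ``` closed by ```
    assert!(closes_fence(3, 5)); // longer closing fences are fine
    assert!(!closes_fence(4, 3)); // ```` is not closed by ```
    assert!(!closes_fence(3, 2)); // below the minimum never closes
}
```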
diff --git a/src/construct/code_text.rs b/src/construct/code_text.rs index 3f9e5e5..31777f4 100644 --- a/src/construct/code_text.rs +++ b/src/construct/code_text.rs @@ -105,7 +105,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { { tokenizer.enter(Token::CodeText); tokenizer.enter(Token::CodeTextSequence); - sequence_open(tokenizer, 0) + sequence_open(tokenizer) } _ => State::Nok, } @@ -117,13 +117,14 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { /// > | `a` /// ^ /// ``` -fn sequence_open(tokenizer: &mut Tokenizer, size: usize) -> State { +fn sequence_open(tokenizer: &mut Tokenizer) -> State { if let Some(b'`') = tokenizer.current { + tokenizer.tokenize_state.size += 1; tokenizer.consume(); - State::Fn(Box::new(move |t| sequence_open(t, size + 1))) + State::Fn(Box::new(sequence_open)) } else { tokenizer.exit(Token::CodeTextSequence); - between(tokenizer, size) + between(tokenizer) } } @@ -133,22 +134,25 @@ fn sequence_open(tokenizer: &mut Tokenizer, size: usize) -> State { /// > | `a` /// ^^ /// ``` -fn between(tokenizer: &mut Tokenizer, size_open: usize) -> State { +fn between(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - None => State::Nok, + None => { + tokenizer.tokenize_state.size = 0; + State::Nok + } Some(b'\n') => { tokenizer.enter(Token::LineEnding); tokenizer.consume(); tokenizer.exit(Token::LineEnding); - State::Fn(Box::new(move |t| between(t, size_open))) + State::Fn(Box::new(between)) } Some(b'`') => { tokenizer.enter(Token::CodeTextSequence); - sequence_close(tokenizer, size_open, 0) + sequence_close(tokenizer) } _ => { tokenizer.enter(Token::CodeTextData); - data(tokenizer, size_open) + data(tokenizer) } } } @@ -159,15 +163,15 @@ fn between(tokenizer: &mut Tokenizer, size_open: usize) -> State { /// > | `a` /// ^ /// ``` -fn data(tokenizer: &mut Tokenizer, size_open: usize) -> State { +fn data(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { None | Some(b'\n' | b'`') => { tokenizer.exit(Token::CodeTextData); - between(tokenizer, size_open) + between(tokenizer) } _ => { tokenizer.consume(); - State::Fn(Box::new(move |t| data(t, size_open))) + State::Fn(Box::new(data)) } } } @@ -178,16 +182,19 @@ fn data(tokenizer: &mut Tokenizer, size_open: usize) -> State { /// > | `a` /// ^ /// ``` -fn sequence_close(tokenizer: &mut Tokenizer, size_open: usize, size: usize) -> State { +fn sequence_close(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { Some(b'`') => { + tokenizer.tokenize_state.size_other += 1; tokenizer.consume(); - State::Fn(Box::new(move |t| sequence_close(t, size_open, size + 1))) + State::Fn(Box::new(sequence_close)) } _ => { - if size_open == size { + if tokenizer.tokenize_state.size == tokenizer.tokenize_state.size_other { tokenizer.exit(Token::CodeTextSequence); tokenizer.exit(Token::CodeText); + tokenizer.tokenize_state.size = 0; + tokenizer.tokenize_state.size_other = 0; State::Ok } else { let index = tokenizer.events.len(); @@ -195,7 +202,8 @@ fn sequence_close(tokenizer: &mut Tokenizer, size_open: usize, size: usize) -> S // More or less accents: mark as data. tokenizer.events[index - 1].token_type = Token::CodeTextData; tokenizer.events[index].token_type = Token::CodeTextData; - between(tokenizer, size_open) + tokenizer.tokenize_state.size_other = 0; + between(tokenizer) } } } diff --git a/src/construct/definition.rs b/src/construct/definition.rs index bd7df82..a56dab4 100644 --- a/src/construct/definition.rs +++ b/src/construct/definition.rs @@ -94,10 +94,10 @@ //! 
[html-img]: https://html.spec.whatwg.org/multipage/embedded-content.html#the-img-element use crate::construct::{ - partial_destination::{start as destination, Options as DestinationOptions}, - partial_label::{start as label, Options as LabelOptions}, + partial_destination::start as destination, + partial_label::start as label, partial_space_or_tab::{space_or_tab, space_or_tab_eol}, - partial_title::{start as title, Options as TitleOptions}, + partial_title::start as title, }; use crate::token::Token; use crate::tokenizer::{State, Tokenizer}; @@ -138,19 +138,12 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { /// ``` fn before(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Some(b'[') => tokenizer.go( - |t| { - label( - t, - LabelOptions { - label: Token::DefinitionLabel, - marker: Token::DefinitionLabelMarker, - string: Token::DefinitionLabelString, - }, - ) - }, - label_after, - )(tokenizer), + Some(b'[') => { + tokenizer.tokenize_state.token_1 = Token::DefinitionLabel; + tokenizer.tokenize_state.token_2 = Token::DefinitionLabelMarker; + tokenizer.tokenize_state.token_3 = Token::DefinitionLabelString; + tokenizer.go(label, label_after)(tokenizer) + } _ => State::Nok, } } @@ -162,6 +155,10 @@ fn before(tokenizer: &mut Tokenizer) -> State { /// ^ /// ``` fn label_after(tokenizer: &mut Tokenizer) -> State { + tokenizer.tokenize_state.token_1 = Token::Data; + tokenizer.tokenize_state.token_2 = Token::Data; + tokenizer.tokenize_state.token_3 = Token::Data; + match tokenizer.current { Some(b':') => { tokenizer.enter(Token::DefinitionMarker); @@ -182,22 +179,19 @@ fn label_after(tokenizer: &mut Tokenizer) -> State { /// ^ /// ``` fn destination_before(tokenizer: &mut Tokenizer) -> State { - tokenizer.go( - |t| { - destination( - t, - DestinationOptions { - limit: usize::MAX, - destination: Token::DefinitionDestination, - literal: Token::DefinitionDestinationLiteral, - marker: Token::DefinitionDestinationLiteralMarker, - raw: Token::DefinitionDestinationRaw, - string: Token::DefinitionDestinationString, - }, - ) - }, - destination_after, - )(tokenizer) + tokenizer.tokenize_state.token_1 = Token::DefinitionDestination; + tokenizer.tokenize_state.token_2 = Token::DefinitionDestinationLiteral; + tokenizer.tokenize_state.token_3 = Token::DefinitionDestinationLiteralMarker; + tokenizer.tokenize_state.token_4 = Token::DefinitionDestinationRaw; + tokenizer.tokenize_state.token_5 = Token::DefinitionDestinationString; + tokenizer.tokenize_state.size_other = usize::MAX; + tokenizer.attempt(destination, |ok| { + Box::new(if ok { + destination_after + } else { + destination_missing + }) + })(tokenizer) } /// After a destination. @@ -207,9 +201,26 @@ fn destination_before(tokenizer: &mut Tokenizer) -> State { /// ^ /// ``` fn destination_after(tokenizer: &mut Tokenizer) -> State { + tokenizer.tokenize_state.token_1 = Token::Data; + tokenizer.tokenize_state.token_2 = Token::Data; + tokenizer.tokenize_state.token_3 = Token::Data; + tokenizer.tokenize_state.token_4 = Token::Data; + tokenizer.tokenize_state.token_5 = Token::Data; + tokenizer.tokenize_state.size_other = 0; tokenizer.attempt_opt(title_before, after)(tokenizer) } +/// Without destination. 
+fn destination_missing(tokenizer: &mut Tokenizer) -> State { + tokenizer.tokenize_state.token_1 = Token::Data; + tokenizer.tokenize_state.token_2 = Token::Data; + tokenizer.tokenize_state.token_3 = Token::Data; + tokenizer.tokenize_state.token_4 = Token::Data; + tokenizer.tokenize_state.token_5 = Token::Data; + tokenizer.tokenize_state.size_other = 0; + State::Nok +} + /// After a definition. /// /// ```markdown @@ -262,19 +273,10 @@ fn title_before(tokenizer: &mut Tokenizer) -> State { /// ^ /// ``` fn title_before_marker(tokenizer: &mut Tokenizer) -> State { - tokenizer.go( - |t| { - title( - t, - TitleOptions { - title: Token::DefinitionTitle, - marker: Token::DefinitionTitleMarker, - string: Token::DefinitionTitleString, - }, - ) - }, - title_after, - )(tokenizer) + tokenizer.tokenize_state.token_1 = Token::DefinitionTitle; + tokenizer.tokenize_state.token_2 = Token::DefinitionTitleMarker; + tokenizer.tokenize_state.token_3 = Token::DefinitionTitleString; + tokenizer.go(title, title_after)(tokenizer) } /// After a title. @@ -284,6 +286,9 @@ fn title_before_marker(tokenizer: &mut Tokenizer) -> State { /// ^ /// ``` fn title_after(tokenizer: &mut Tokenizer) -> State { + tokenizer.tokenize_state.token_1 = Token::Data; + tokenizer.tokenize_state.token_2 = Token::Data; + tokenizer.tokenize_state.token_3 = Token::Data; tokenizer.attempt_opt(space_or_tab(), title_after_after_optional_whitespace)(tokenizer) } diff --git a/src/construct/heading_atx.rs b/src/construct/heading_atx.rs index d432b6c..6751567 100644 --- a/src/construct/heading_atx.rs +++ b/src/construct/heading_atx.rs @@ -93,7 +93,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { fn before(tokenizer: &mut Tokenizer) -> State { if Some(b'#') == tokenizer.current { tokenizer.enter(Token::HeadingAtxSequence); - sequence_open(tokenizer, 0) + sequence_open(tokenizer) } else { State::Nok } @@ -105,23 +105,27 @@ fn before(tokenizer: &mut Tokenizer) -> State { /// > | ## aa /// ^ /// ``` -fn sequence_open(tokenizer: &mut Tokenizer, size: usize) -> State { +fn sequence_open(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - None | Some(b'\n') if size > 0 => { + None | Some(b'\n') if tokenizer.tokenize_state.size > 0 => { + tokenizer.tokenize_state.size = 0; tokenizer.exit(Token::HeadingAtxSequence); at_break(tokenizer) } - Some(b'#') if size < HEADING_ATX_OPENING_FENCE_SIZE_MAX => { + Some(b'#') if tokenizer.tokenize_state.size < HEADING_ATX_OPENING_FENCE_SIZE_MAX => { + tokenizer.tokenize_state.size += 1; tokenizer.consume(); - State::Fn(Box::new(move |tokenizer| { - sequence_open(tokenizer, size + 1) - })) + State::Fn(Box::new(sequence_open)) } - _ if size > 0 => { + _ if tokenizer.tokenize_state.size > 0 => { + tokenizer.tokenize_state.size = 0; tokenizer.exit(Token::HeadingAtxSequence); tokenizer.go(space_or_tab(), at_break)(tokenizer) } - _ => State::Nok, + _ => { + tokenizer.tokenize_state.size = 0; + State::Nok + } } } diff --git a/src/construct/heading_setext.rs b/src/construct/heading_setext.rs index 98d7843..675b2ac 100644 --- a/src/construct/heading_setext.rs +++ b/src/construct/heading_setext.rs @@ -109,8 +109,9 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { fn before(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { Some(b'-' | b'=') => { + tokenizer.tokenize_state.marker = tokenizer.current.unwrap(); tokenizer.enter(Token::HeadingSetextUnderline); - inside(tokenizer, tokenizer.current.unwrap()) + inside(tokenizer) } _ => State::Nok, } @@ -123,13 +124,14 @@ fn before(tokenizer: &mut 
Tokenizer) -> State { /// > | == /// ^ /// ``` -fn inside(tokenizer: &mut Tokenizer, marker: u8) -> State { +fn inside(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Some(b'-' | b'=') if tokenizer.current.unwrap() == marker => { + Some(b'-' | b'=') if tokenizer.current.unwrap() == tokenizer.tokenize_state.marker => { tokenizer.consume(); - State::Fn(Box::new(move |t| inside(t, marker))) + State::Fn(Box::new(inside)) } _ => { + tokenizer.tokenize_state.marker = 0; tokenizer.exit(Token::HeadingSetextUnderline); tokenizer.attempt_opt(space_or_tab(), after)(tokenizer) } diff --git a/src/construct/html_flow.rs b/src/construct/html_flow.rs index 064da35..aaa803d 100644 --- a/src/construct/html_flow.rs +++ b/src/construct/html_flow.rs @@ -110,37 +110,20 @@ use crate::token::Token; use crate::tokenizer::{State, Tokenizer}; use crate::util::slice::Slice; -/// Kind of HTML (flow). -#[derive(Debug, PartialEq)] -enum Kind { - /// Symbol for `<script>` (condition 1). - Raw, - /// Symbol for `<!---->` (condition 2). - Comment, - /// Symbol for `<?php?>` (condition 3). - Instruction, - /// Symbol for `<!doctype>` (condition 4). - Declaration, - /// Symbol for `<![CDATA[]]>` (condition 5). - Cdata, - /// Symbol for `<div` (condition 6). - Basic, - /// Symbol for `<x>` (condition 7). - Complete, -} - -/// State needed to parse HTML (flow). -#[derive(Debug)] -struct Info { - /// Kind of HTML (flow). - kind: Kind, - /// Whether this is a start tag (`<` not followed by `/`). - start_tag: bool, - /// Start index of a tag name or cdata prefix. - start: usize, - /// Current quote, when in a double or single quoted attribute value. - quote: u8, -} +/// Symbol for `<script>` (condition 1). +const RAW: u8 = 1; +/// Symbol for `<!---->` (condition 2). +const COMMENT: u8 = 2; +/// Symbol for `<?php?>` (condition 3). +const INSTRUCTION: u8 = 3; +/// Symbol for `<!doctype>` (condition 4). +const DECLARATION: u8 = 4; +/// Symbol for `<![CDATA[]]>` (condition 5). +const CDATA: u8 = 5; +/// Symbol for `<div` (condition 6). +const BASIC: u8 = 6; +/// Symbol for `<x>` (condition 7). +const COMPLETE: u8 = 7; /// Start of HTML (flow), before optional whitespace. /// @@ -197,39 +180,30 @@ fn before(tokenizer: &mut Tokenizer) -> State { /// ^ /// ``` fn open(tokenizer: &mut Tokenizer) -> State { - let mut info = Info { - // Assume basic. - kind: Kind::Basic, - // Assume closing tag (or no tag). - start_tag: false, - start: 0, - quote: 0, - }; - match tokenizer.current { Some(b'!') => { tokenizer.consume(); - State::Fn(Box::new(|t| declaration_open(t, info))) + State::Fn(Box::new(declaration_open)) } Some(b'/') => { tokenizer.consume(); - info.start = tokenizer.point.index; - State::Fn(Box::new(|t| tag_close_start(t, info))) + tokenizer.tokenize_state.seen = true; + tokenizer.tokenize_state.start = tokenizer.point.index; + State::Fn(Box::new(tag_close_start)) } Some(b'?') => { - info.kind = Kind::Instruction; + tokenizer.tokenize_state.marker = INSTRUCTION; tokenizer.consume(); // Do not form containers. tokenizer.concrete = true; // While we’re in an instruction instead of a declaration, we’re on a `?` // right now, so we do need to search for `>`, similar to declarations.
- State::Fn(Box::new(|t| continuation_declaration_inside(t, info))) + State::Fn(Box::new(continuation_declaration_inside)) } // ASCII alphabetical. Some(b'A'..=b'Z' | b'a'..=b'z') => { - info.start_tag = true; - info.start = tokenizer.point.index; - tag_name(tokenizer, info) + tokenizer.tokenize_state.start = tokenizer.point.index; + tag_name(tokenizer) } _ => State::Nok, } @@ -245,25 +219,24 @@ fn open(tokenizer: &mut Tokenizer) -> State { /// > | <![CDATA[>&<]]> /// ^ /// ``` -fn declaration_open(tokenizer: &mut Tokenizer, mut info: Info) -> State { +fn declaration_open(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { Some(b'-') => { tokenizer.consume(); - info.kind = Kind::Comment; - State::Fn(Box::new(|t| comment_open_inside(t, info))) + tokenizer.tokenize_state.marker = COMMENT; + State::Fn(Box::new(comment_open_inside)) } Some(b'A'..=b'Z' | b'a'..=b'z') => { tokenizer.consume(); - info.kind = Kind::Declaration; + tokenizer.tokenize_state.marker = DECLARATION; // Do not form containers. tokenizer.concrete = true; - State::Fn(Box::new(|t| continuation_declaration_inside(t, info))) + State::Fn(Box::new(continuation_declaration_inside)) } Some(b'[') => { tokenizer.consume(); - info.kind = Kind::Cdata; - info.start = tokenizer.point.index; - State::Fn(Box::new(|t| cdata_open_inside(t, info))) + tokenizer.tokenize_state.marker = CDATA; + State::Fn(Box::new(cdata_open_inside)) } _ => State::Nok, } @@ -275,15 +248,15 @@ fn declaration_open(tokenizer: &mut Tokenizer, mut info: Info) -> State { /// > | <!--xxx--> /// ^ /// ``` -fn comment_open_inside(tokenizer: &mut Tokenizer, info: Info) -> State { - match tokenizer.current { - Some(b'-') => { - tokenizer.consume(); - // Do not form containers. - tokenizer.concrete = true; - State::Fn(Box::new(|t| continuation_declaration_inside(t, info))) - } - _ => State::Nok, +fn comment_open_inside(tokenizer: &mut Tokenizer) -> State { + if let Some(b'-') = tokenizer.current { + tokenizer.consume(); + // Do not form containers. + tokenizer.concrete = true; + State::Fn(Box::new(continuation_declaration_inside)) + } else { + tokenizer.tokenize_state.marker = 0; + State::Nok } } @@ -293,21 +266,23 @@ fn comment_open_inside(tokenizer: &mut Tokenizer, info: Info) -> State { /// > | <![CDATA[>&<]]> /// ^^^^^^ /// ``` -fn cdata_open_inside(tokenizer: &mut Tokenizer, mut info: Info) -> State { - match tokenizer.current { - Some(byte) if byte == HTML_CDATA_PREFIX[tokenizer.point.index - info.start] => { - tokenizer.consume(); +fn cdata_open_inside(tokenizer: &mut Tokenizer) -> State { + if tokenizer.current == Some(HTML_CDATA_PREFIX[tokenizer.tokenize_state.size]) { + tokenizer.tokenize_state.size += 1; + tokenizer.consume(); - if tokenizer.point.index - info.start == HTML_CDATA_PREFIX.len() { - info.start = 0; - // Do not form containers. - tokenizer.concrete = true; - State::Fn(Box::new(|t| continuation(t, info))) - } else { - State::Fn(Box::new(|t| cdata_open_inside(t, info))) - } + if tokenizer.tokenize_state.size == HTML_CDATA_PREFIX.len() { + tokenizer.tokenize_state.size = 0; + // Do not form containers. 
+ tokenizer.concrete = true; + State::Fn(Box::new(continuation)) + } else { + State::Fn(Box::new(cdata_open_inside)) } - _ => State::Nok, + } else { + tokenizer.tokenize_state.marker = 0; + tokenizer.tokenize_state.size = 0; + State::Nok } } @@ -317,14 +292,14 @@ fn cdata_open_inside(tokenizer: &mut Tokenizer, mut info: Info) -> State { /// > | </x> /// ^ /// ``` -fn tag_close_start(tokenizer: &mut Tokenizer, info: Info) -> State { - match tokenizer.current { - // ASCII alphabetical. - Some(b'A'..=b'Z' | b'a'..=b'z') => { - tokenizer.consume(); - State::Fn(Box::new(|t| tag_name(t, info))) - } - _ => State::Nok, +fn tag_close_start(tokenizer: &mut Tokenizer) -> State { + if let Some(b'A'..=b'Z' | b'a'..=b'z') = tokenizer.current { + tokenizer.consume(); + State::Fn(Box::new(tag_name)) + } else { + tokenizer.tokenize_state.seen = false; + tokenizer.tokenize_state.start = 0; + State::Nok } } @@ -336,14 +311,15 @@ fn tag_close_start(tokenizer: &mut Tokenizer, info: Info) -> State { /// > | </ab> /// ^^ /// ``` -fn tag_name(tokenizer: &mut Tokenizer, mut info: Info) -> State { +fn tag_name(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { None | Some(b'\t' | b'\n' | b' ' | b'/' | b'>') => { + let closing_tag = tokenizer.tokenize_state.seen; let slash = matches!(tokenizer.current, Some(b'/')); // Guaranteed to be valid ASCII bytes. let slice = Slice::from_indices( tokenizer.parse_state.bytes, - info.start, + tokenizer.tokenize_state.start, tokenizer.point.index, ); let name = slice @@ -351,42 +327,48 @@ fn tag_name(tokenizer: &mut Tokenizer, mut info: Info) -> State { // The line ending case might result in a `\r` that is already accounted for. .trim() .to_ascii_lowercase(); - info.start = 0; + tokenizer.tokenize_state.seen = false; + tokenizer.tokenize_state.start = 0; - if !slash && info.start_tag && HTML_RAW_NAMES.contains(&name.as_str()) { - info.kind = Kind::Raw; + if !slash && !closing_tag && HTML_RAW_NAMES.contains(&name.as_str()) { + tokenizer.tokenize_state.marker = RAW; // Do not form containers. tokenizer.concrete = true; - continuation(tokenizer, info) + continuation(tokenizer) } else if HTML_BLOCK_NAMES.contains(&name.as_str()) { - // Basic is assumed, no need to set `kind`. + tokenizer.tokenize_state.marker = BASIC; + if slash { tokenizer.consume(); - State::Fn(Box::new(|t| basic_self_closing(t, info))) + State::Fn(Box::new(basic_self_closing)) } else { // Do not form containers. tokenizer.concrete = true; - continuation(tokenizer, info) + continuation(tokenizer) } } else { - info.kind = Kind::Complete; + tokenizer.tokenize_state.marker = COMPLETE; // Do not support complete HTML when interrupting. if tokenizer.interrupt && !tokenizer.lazy { + tokenizer.tokenize_state.marker = 0; State::Nok - } else if info.start_tag { - complete_attribute_name_before(tokenizer, info) + } else if closing_tag { + complete_closing_tag_after(tokenizer) } else { - complete_closing_tag_after(tokenizer, info) + complete_attribute_name_before(tokenizer) } } } // ASCII alphanumerical and `-`. 
Some(b'-' | b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z') => { tokenizer.consume(); - State::Fn(Box::new(|t| tag_name(t, info))) + State::Fn(Box::new(tag_name)) + } + Some(_) => { + tokenizer.tokenize_state.seen = false; + State::Nok } - Some(_) => State::Nok, } } @@ -396,15 +378,15 @@ fn tag_name(tokenizer: &mut Tokenizer, mut info: Info) -> State { /// > | <div/> /// ^ /// ``` -fn basic_self_closing(tokenizer: &mut Tokenizer, info: Info) -> State { - match tokenizer.current { - Some(b'>') => { - tokenizer.consume(); - // Do not form containers. - tokenizer.concrete = true; - State::Fn(Box::new(|t| continuation(t, info))) - } - _ => State::Nok, +fn basic_self_closing(tokenizer: &mut Tokenizer) -> State { + if let Some(b'>') = tokenizer.current { + tokenizer.consume(); + // Do not form containers. + tokenizer.concrete = true; + State::Fn(Box::new(continuation)) + } else { + tokenizer.tokenize_state.marker = 0; + State::Nok } } @@ -414,13 +396,13 @@ fn basic_self_closing(tokenizer: &mut Tokenizer, info: Info) -> State { /// > | <x/> /// ^ /// ``` -fn complete_closing_tag_after(tokenizer: &mut Tokenizer, info: Info) -> State { +fn complete_closing_tag_after(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { Some(b'\t' | b' ') => { tokenizer.consume(); - State::Fn(Box::new(|t| complete_closing_tag_after(t, info))) + State::Fn(Box::new(complete_closing_tag_after)) } - _ => complete_end(tokenizer, info), + _ => complete_end(tokenizer), } } @@ -443,22 +425,22 @@ fn complete_closing_tag_after(tokenizer: &mut Tokenizer, info: Info) -> State { /// > | <a > /// ^ /// ``` -fn complete_attribute_name_before(tokenizer: &mut Tokenizer, info: Info) -> State { +fn complete_attribute_name_before(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { Some(b'\t' | b' ') => { tokenizer.consume(); - State::Fn(Box::new(|t| complete_attribute_name_before(t, info))) + State::Fn(Box::new(complete_attribute_name_before)) } Some(b'/') => { tokenizer.consume(); - State::Fn(Box::new(|t| complete_end(t, info))) + State::Fn(Box::new(complete_end)) } // ASCII alphanumerical and `:` and `_`. Some(b'0'..=b'9' | b':' | b'A'..=b'Z' | b'_' | b'a'..=b'z') => { tokenizer.consume(); - State::Fn(Box::new(|t| complete_attribute_name(t, info))) + State::Fn(Box::new(complete_attribute_name)) } - _ => complete_end(tokenizer, info), + _ => complete_end(tokenizer), } } @@ -472,14 +454,14 @@ fn complete_attribute_name_before(tokenizer: &mut Tokenizer, info: Info) -> Stat /// > | <a b> /// ^ /// ``` -fn complete_attribute_name(tokenizer: &mut Tokenizer, info: Info) -> State { +fn complete_attribute_name(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { // ASCII alphanumerical and `-`, `.`, `:`, and `_`. Some(b'-' | b'.' 
| b'0'..=b'9' | b':' | b'A'..=b'Z' | b'_' | b'a'..=b'z') => { tokenizer.consume(); - State::Fn(Box::new(|t| complete_attribute_name(t, info))) + State::Fn(Box::new(complete_attribute_name)) } - _ => complete_attribute_name_after(tokenizer, info), + _ => complete_attribute_name_after(tokenizer), } } @@ -492,17 +474,17 @@ fn complete_attribute_name(tokenizer: &mut Tokenizer, info: Info) -> State { /// > | <a b=c> /// ^ /// ``` -fn complete_attribute_name_after(tokenizer: &mut Tokenizer, info: Info) -> State { +fn complete_attribute_name_after(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { Some(b'\t' | b' ') => { tokenizer.consume(); - State::Fn(Box::new(|t| complete_attribute_name_after(t, info))) + State::Fn(Box::new(complete_attribute_name_after)) } Some(b'=') => { tokenizer.consume(); - State::Fn(Box::new(|t| complete_attribute_value_before(t, info))) + State::Fn(Box::new(complete_attribute_value_before)) } - _ => complete_attribute_name_before(tokenizer, info), + _ => complete_attribute_name_before(tokenizer), } } @@ -515,19 +497,22 @@ fn complete_attribute_name_after(tokenizer: &mut Tokenizer, info: Info) -> State /// > | <a b="c"> /// ^ /// ``` -fn complete_attribute_value_before(tokenizer: &mut Tokenizer, mut info: Info) -> State { +fn complete_attribute_value_before(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - None | Some(b'<' | b'=' | b'>' | b'`') => State::Nok, + None | Some(b'<' | b'=' | b'>' | b'`') => { + tokenizer.tokenize_state.marker = 0; + State::Nok + } Some(b'\t' | b' ') => { tokenizer.consume(); - State::Fn(Box::new(|t| complete_attribute_value_before(t, info))) + State::Fn(Box::new(complete_attribute_value_before)) } Some(b'"' | b'\'') => { - info.quote = tokenizer.current.unwrap(); + tokenizer.tokenize_state.marker_other = tokenizer.current.unwrap(); tokenizer.consume(); - State::Fn(Box::new(|t| complete_attribute_value_quoted(t, info))) + State::Fn(Box::new(complete_attribute_value_quoted)) } - _ => complete_attribute_value_unquoted(tokenizer, info), + _ => complete_attribute_value_unquoted(tokenizer), } } @@ -539,16 +524,23 @@ fn complete_attribute_value_before(tokenizer: &mut Tokenizer, mut info: Info) -> /// > | <a b='c'> /// ^ /// ``` -fn complete_attribute_value_quoted(tokenizer: &mut Tokenizer, info: Info) -> State { +fn complete_attribute_value_quoted(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - None | Some(b'\n') => State::Nok, - Some(b'"' | b'\'') if tokenizer.current.unwrap() == info.quote => { + None | Some(b'\n') => { + tokenizer.tokenize_state.marker = 0; + tokenizer.tokenize_state.marker_other = 0; + State::Nok + } + Some(b'"' | b'\'') + if tokenizer.current.unwrap() == tokenizer.tokenize_state.marker_other => + { + tokenizer.tokenize_state.marker_other = 0; tokenizer.consume(); - State::Fn(Box::new(|t| complete_attribute_value_quoted_after(t, info))) + State::Fn(Box::new(complete_attribute_value_quoted_after)) } _ => { tokenizer.consume(); - State::Fn(Box::new(|t| complete_attribute_value_quoted(t, info))) + State::Fn(Box::new(complete_attribute_value_quoted)) } } } @@ -559,14 +551,14 @@ fn complete_attribute_value_quoted(tokenizer: &mut Tokenizer, info: Info) -> Sta /// > | <a b=c> /// ^ /// ``` -fn complete_attribute_value_unquoted(tokenizer: &mut Tokenizer, info: Info) -> State { +fn complete_attribute_value_unquoted(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { None | Some(b'\t' | b'\n' | b' ' | b'"' | b'\'' | b'/' | b'<' | b'=' | b'>' | b'`') => { - 
complete_attribute_name_after(tokenizer, info) + complete_attribute_name_after(tokenizer) } Some(_) => { tokenizer.consume(); - State::Fn(Box::new(|t| complete_attribute_value_unquoted(t, info))) + State::Fn(Box::new(complete_attribute_value_unquoted)) } } } @@ -578,10 +570,12 @@ fn complete_attribute_value_unquoted(tokenizer: &mut Tokenizer, info: Info) -> S /// > | <a b="c"> /// ^ /// ``` -fn complete_attribute_value_quoted_after(tokenizer: &mut Tokenizer, info: Info) -> State { - match tokenizer.current { - Some(b'\t' | b' ' | b'/' | b'>') => complete_attribute_name_before(tokenizer, info), - _ => State::Nok, +fn complete_attribute_value_quoted_after(tokenizer: &mut Tokenizer) -> State { + if let Some(b'\t' | b' ' | b'/' | b'>') = tokenizer.current { + complete_attribute_name_before(tokenizer) + } else { + tokenizer.tokenize_state.marker = 0; + State::Nok } } @@ -591,13 +585,13 @@ fn complete_attribute_value_quoted_after(tokenizer: &mut Tokenizer, info: Info) /// > | <a b="c"> /// ^ /// ``` -fn complete_end(tokenizer: &mut Tokenizer, info: Info) -> State { - match tokenizer.current { - Some(b'>') => { - tokenizer.consume(); - State::Fn(Box::new(|t| complete_after(t, info))) - } - _ => State::Nok, +fn complete_end(tokenizer: &mut Tokenizer) -> State { + if let Some(b'>') = tokenizer.current { + tokenizer.consume(); + State::Fn(Box::new(complete_after)) + } else { + tokenizer.tokenize_state.marker = 0; + State::Nok } } @@ -607,18 +601,21 @@ fn complete_end(tokenizer: &mut Tokenizer, info: Info) -> State { /// > | <x> /// ^ /// ``` -fn complete_after(tokenizer: &mut Tokenizer, info: Info) -> State { +fn complete_after(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { None | Some(b'\n') => { // Do not form containers. tokenizer.concrete = true; - continuation(tokenizer, info) + continuation(tokenizer) } Some(b'\t' | b' ') => { tokenizer.consume(); - State::Fn(Box::new(|t| complete_after(t, info))) + State::Fn(Box::new(complete_after)) + } + Some(_) => { + tokenizer.tokenize_state.marker = 0; + State::Nok } - Some(_) => State::Nok, } } @@ -628,46 +625,49 @@ fn complete_after(tokenizer: &mut Tokenizer, info: Info) -> State { /// > | <!--xxx--> /// ^ /// ``` -fn continuation(tokenizer: &mut Tokenizer, info: Info) -> State { +fn continuation(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Some(b'\n') if info.kind == Kind::Basic || info.kind == Kind::Complete => { + Some(b'\n') + if tokenizer.tokenize_state.marker == BASIC + || tokenizer.tokenize_state.marker == COMPLETE => + { tokenizer.exit(Token::HtmlFlowData); tokenizer.check(blank_line_before, |ok| { - if ok { - Box::new(continuation_after) + Box::new(if ok { + continuation_after } else { - Box::new(move |t| continuation_start(t, info)) - } + continuation_start + }) })(tokenizer) } // Note: important that this is after the basic/complete case. 
None | Some(b'\n') => { tokenizer.exit(Token::HtmlFlowData); - continuation_start(tokenizer, info) + continuation_start(tokenizer) } - Some(b'-') if info.kind == Kind::Comment => { + Some(b'-') if tokenizer.tokenize_state.marker == COMMENT => { tokenizer.consume(); - State::Fn(Box::new(|t| continuation_comment_inside(t, info))) + State::Fn(Box::new(continuation_comment_inside)) } - Some(b'<') if info.kind == Kind::Raw => { + Some(b'<') if tokenizer.tokenize_state.marker == RAW => { tokenizer.consume(); - State::Fn(Box::new(|t| continuation_raw_tag_open(t, info))) + State::Fn(Box::new(continuation_raw_tag_open)) } - Some(b'>') if info.kind == Kind::Declaration => { + Some(b'>') if tokenizer.tokenize_state.marker == DECLARATION => { tokenizer.consume(); - State::Fn(Box::new(|t| continuation_close(t, info))) + State::Fn(Box::new(continuation_close)) } - Some(b'?') if info.kind == Kind::Instruction => { + Some(b'?') if tokenizer.tokenize_state.marker == INSTRUCTION => { tokenizer.consume(); - State::Fn(Box::new(|t| continuation_declaration_inside(t, info))) + State::Fn(Box::new(continuation_declaration_inside)) } - Some(b']') if info.kind == Kind::Cdata => { + Some(b']') if tokenizer.tokenize_state.marker == CDATA => { tokenizer.consume(); - State::Fn(Box::new(|t| continuation_character_data_inside(t, info))) + State::Fn(Box::new(continuation_character_data_inside)) } _ => { tokenizer.consume(); - State::Fn(Box::new(|t| continuation(t, info))) + State::Fn(Box::new(continuation)) } } } @@ -679,13 +679,13 @@ fn continuation(tokenizer: &mut Tokenizer, info: Info) -> State { /// ^ /// | asd /// ``` -fn continuation_start(tokenizer: &mut Tokenizer, info: Info) -> State { +fn continuation_start(tokenizer: &mut Tokenizer) -> State { tokenizer.check(partial_non_lazy_continuation, |ok| { - if ok { - Box::new(move |t| continuation_start_non_lazy(t, info)) + Box::new(if ok { + continuation_start_non_lazy } else { - Box::new(continuation_after) - } + continuation_after + }) })(tokenizer) } @@ -696,13 +696,13 @@ fn continuation_start(tokenizer: &mut Tokenizer, info: Info) -> State { /// ^ /// | asd /// ``` -fn continuation_start_non_lazy(tokenizer: &mut Tokenizer, info: Info) -> State { +fn continuation_start_non_lazy(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { Some(b'\n') => { tokenizer.enter(Token::LineEnding); tokenizer.consume(); tokenizer.exit(Token::LineEnding); - State::Fn(Box::new(|t| continuation_before(t, info))) + State::Fn(Box::new(continuation_before)) } _ => unreachable!("expected eol"), } @@ -715,12 +715,12 @@ fn continuation_start_non_lazy(tokenizer: &mut Tokenizer, info: Info) -> State { /// > | asd /// ^ /// ``` -fn continuation_before(tokenizer: &mut Tokenizer, info: Info) -> State { +fn continuation_before(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - None | Some(b'\n') => continuation_start(tokenizer, info), + None | Some(b'\n') => continuation_start(tokenizer), _ => { tokenizer.enter(Token::HtmlFlowData); - continuation(tokenizer, info) + continuation(tokenizer) } } } @@ -731,13 +731,13 @@ fn continuation_before(tokenizer: &mut Tokenizer, info: Info) -> State { /// > | <!--xxx--> /// ^ /// ``` -fn continuation_comment_inside(tokenizer: &mut Tokenizer, info: Info) -> State { +fn continuation_comment_inside(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { Some(b'-') => { tokenizer.consume(); - State::Fn(Box::new(|t| continuation_declaration_inside(t, info))) + State::Fn(Box::new(continuation_declaration_inside)) } - _ => 
continuation(tokenizer, info), + _ => continuation(tokenizer), } } @@ -747,14 +747,14 @@ fn continuation_comment_inside(tokenizer: &mut Tokenizer, info: Info) -> State { /// > | <script>console.log(1)</script> /// ^ /// ``` -fn continuation_raw_tag_open(tokenizer: &mut Tokenizer, mut info: Info) -> State { +fn continuation_raw_tag_open(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { Some(b'/') => { tokenizer.consume(); - info.start = tokenizer.point.index; - State::Fn(Box::new(|t| continuation_raw_end_tag(t, info))) + tokenizer.tokenize_state.start = tokenizer.point.index; + State::Fn(Box::new(continuation_raw_end_tag)) } - _ => continuation(tokenizer, info), + _ => continuation(tokenizer), } } @@ -764,35 +764,35 @@ fn continuation_raw_tag_open(tokenizer: &mut Tokenizer, mut info: Info) -> State /// > | <script>console.log(1)</script> /// ^^^^^^ /// ``` -fn continuation_raw_end_tag(tokenizer: &mut Tokenizer, mut info: Info) -> State { +fn continuation_raw_end_tag(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { Some(b'>') => { // Guaranteed to be valid ASCII bytes. let slice = Slice::from_indices( tokenizer.parse_state.bytes, - info.start, + tokenizer.tokenize_state.start, tokenizer.point.index, ); let name = slice.as_str().to_ascii_lowercase(); - info.start = 0; + tokenizer.tokenize_state.start = 0; if HTML_RAW_NAMES.contains(&name.as_str()) { tokenizer.consume(); - State::Fn(Box::new(|t| continuation_close(t, info))) + State::Fn(Box::new(continuation_close)) } else { - continuation(tokenizer, info) + continuation(tokenizer) } } Some(b'A'..=b'Z' | b'a'..=b'z') - if tokenizer.point.index - info.start < HTML_RAW_SIZE_MAX => + if tokenizer.point.index - tokenizer.tokenize_state.start < HTML_RAW_SIZE_MAX => { tokenizer.consume(); - State::Fn(Box::new(|t| continuation_raw_end_tag(t, info))) + State::Fn(Box::new(continuation_raw_end_tag)) } _ => { - info.start = 0; - continuation(tokenizer, info) + tokenizer.tokenize_state.start = 0; + continuation(tokenizer) } } } @@ -803,13 +803,13 @@ fn continuation_raw_end_tag(tokenizer: &mut Tokenizer, mut info: Info) -> State /// > | <![CDATA[>&<]]> /// ^ /// ``` -fn continuation_character_data_inside(tokenizer: &mut Tokenizer, info: Info) -> State { +fn continuation_character_data_inside(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { Some(b']') => { tokenizer.consume(); - State::Fn(Box::new(|t| continuation_declaration_inside(t, info))) + State::Fn(Box::new(continuation_declaration_inside)) } - _ => continuation(tokenizer, info), + _ => continuation(tokenizer), } } @@ -827,17 +827,17 @@ fn continuation_character_data_inside(tokenizer: &mut Tokenizer, info: Info) -> /// > | <![CDATA[>&<]]> /// ^ /// ``` -fn continuation_declaration_inside(tokenizer: &mut Tokenizer, info: Info) -> State { +fn continuation_declaration_inside(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { Some(b'>') => { tokenizer.consume(); - State::Fn(Box::new(|t| continuation_close(t, info))) + State::Fn(Box::new(continuation_close)) } - Some(b'-') if info.kind == Kind::Comment => { + Some(b'-') if tokenizer.tokenize_state.marker == COMMENT => { tokenizer.consume(); - State::Fn(Box::new(|t| continuation_declaration_inside(t, info))) + State::Fn(Box::new(continuation_declaration_inside)) } - _ => continuation(tokenizer, info), + _ => continuation(tokenizer), } } @@ -847,7 +847,7 @@ fn continuation_declaration_inside(tokenizer: &mut Tokenizer, info: Info) -> Sta /// > | <!doctype> /// ^ /// ``` -fn 
continuation_close(tokenizer: &mut Tokenizer, info: Info) -> State { +fn continuation_close(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { None | Some(b'\n') => { tokenizer.exit(Token::HtmlFlowData); @@ -855,7 +855,7 @@ fn continuation_close(tokenizer: &mut Tokenizer, info: Info) -> State { } _ => { tokenizer.consume(); - State::Fn(Box::new(|t| continuation_close(t, info))) + State::Fn(Box::new(continuation_close)) } } } @@ -868,6 +868,7 @@ fn continuation_close(tokenizer: &mut Tokenizer, info: Info) -> State { /// ``` fn continuation_after(tokenizer: &mut Tokenizer) -> State { tokenizer.exit(Token::HtmlFlow); + tokenizer.tokenize_state.marker = 0; // Feel free to interrupt. tokenizer.interrupt = false; // No longer concrete. diff --git a/src/construct/html_text.rs b/src/construct/html_text.rs index 8a44c29..a4c0349 100644 --- a/src/construct/html_text.rs +++ b/src/construct/html_text.rs @@ -57,7 +57,7 @@ use crate::constant::HTML_CDATA_PREFIX; use crate::construct::partial_space_or_tab::space_or_tab; use crate::token::Token; -use crate::tokenizer::{State, StateFn, Tokenizer}; +use crate::tokenizer::{State, Tokenizer}; /// Start of HTML (text) /// @@ -132,7 +132,7 @@ fn declaration_open(tokenizer: &mut Tokenizer) -> State { } Some(b'[') => { tokenizer.consume(); - State::Fn(Box::new(|t| cdata_open_inside(t, 0))) + State::Fn(Box::new(cdata_open_inside)) } _ => State::Nok, } @@ -207,7 +207,10 @@ fn comment_start_dash(tokenizer: &mut Tokenizer) -> State { fn comment(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { None => State::Nok, - Some(b'\n') => at_line_ending(tokenizer, Box::new(comment)), + Some(b'\n') => { + tokenizer.tokenize_state.return_state = Some(Box::new(comment)); + at_line_ending(tokenizer) + } Some(b'-') => { tokenizer.consume(); State::Fn(Box::new(comment_close)) @@ -241,14 +244,16 @@ fn comment_close(tokenizer: &mut Tokenizer) -> State { /// > | a <![CDATA[>&<]]> b /// ^^^^^^ /// ``` -fn cdata_open_inside(tokenizer: &mut Tokenizer, size: usize) -> State { - if tokenizer.current == Some(HTML_CDATA_PREFIX[size]) { +fn cdata_open_inside(tokenizer: &mut Tokenizer) -> State { + if tokenizer.current == Some(HTML_CDATA_PREFIX[tokenizer.tokenize_state.size]) { + tokenizer.tokenize_state.size += 1; tokenizer.consume(); - if size + 1 == HTML_CDATA_PREFIX.len() { + if tokenizer.tokenize_state.size == HTML_CDATA_PREFIX.len() { + tokenizer.tokenize_state.size = 0; State::Fn(Box::new(cdata)) } else { - State::Fn(Box::new(move |t| cdata_open_inside(t, size + 1))) + State::Fn(Box::new(cdata_open_inside)) } } else { State::Nok @@ -264,7 +269,10 @@ fn cdata_open_inside(tokenizer: &mut Tokenizer, size: usize) -> State { fn cdata(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { None => State::Nok, - Some(b'\n') => at_line_ending(tokenizer, Box::new(cdata)), + Some(b'\n') => { + tokenizer.tokenize_state.return_state = Some(Box::new(cdata)); + at_line_ending(tokenizer) + } Some(b']') => { tokenizer.consume(); State::Fn(Box::new(cdata_close)) @@ -315,7 +323,10 @@ fn cdata_end(tokenizer: &mut Tokenizer) -> State { fn declaration(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { None | Some(b'>') => end(tokenizer), - Some(b'\n') => at_line_ending(tokenizer, Box::new(declaration)), + Some(b'\n') => { + tokenizer.tokenize_state.return_state = Some(Box::new(declaration)); + at_line_ending(tokenizer) + } _ => { tokenizer.consume(); State::Fn(Box::new(declaration)) @@ -332,7 +343,10 @@ fn declaration(tokenizer: &mut Tokenizer) -> State { fn 
instruction(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { None => State::Nok, - Some(b'\n') => at_line_ending(tokenizer, Box::new(instruction)), + Some(b'\n') => { + tokenizer.tokenize_state.return_state = Some(Box::new(instruction)); + at_line_ending(tokenizer) + } Some(b'?') => { tokenizer.consume(); State::Fn(Box::new(instruction_close)) @@ -399,7 +413,10 @@ fn tag_close(tokenizer: &mut Tokenizer) -> State { /// ``` fn tag_close_between(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Some(b'\n') => at_line_ending(tokenizer, Box::new(tag_close_between)), + Some(b'\n') => { + tokenizer.tokenize_state.return_state = Some(Box::new(tag_close_between)); + at_line_ending(tokenizer) + } Some(b'\t' | b' ') => { tokenizer.consume(); State::Fn(Box::new(tag_close_between)) @@ -434,7 +451,10 @@ fn tag_open(tokenizer: &mut Tokenizer) -> State { /// ``` fn tag_open_between(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Some(b'\n') => at_line_ending(tokenizer, Box::new(tag_open_between)), + Some(b'\n') => { + tokenizer.tokenize_state.return_state = Some(Box::new(tag_open_between)); + at_line_ending(tokenizer) + } Some(b'\t' | b' ') => { tokenizer.consume(); State::Fn(Box::new(tag_open_between)) @@ -478,7 +498,10 @@ fn tag_open_attribute_name(tokenizer: &mut Tokenizer) -> State { /// ``` fn tag_open_attribute_name_after(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Some(b'\n') => at_line_ending(tokenizer, Box::new(tag_open_attribute_name_after)), + Some(b'\n') => { + tokenizer.tokenize_state.return_state = Some(Box::new(tag_open_attribute_name_after)); + at_line_ending(tokenizer) + } Some(b'\t' | b' ') => { tokenizer.consume(); State::Fn(Box::new(tag_open_attribute_name_after)) @@ -501,17 +524,18 @@ fn tag_open_attribute_name_after(tokenizer: &mut Tokenizer) -> State { fn tag_open_attribute_value_before(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { None | Some(b'<' | b'=' | b'>' | b'`') => State::Nok, - Some(b'\n') => at_line_ending(tokenizer, Box::new(tag_open_attribute_value_before)), + Some(b'\n') => { + tokenizer.tokenize_state.return_state = Some(Box::new(tag_open_attribute_value_before)); + at_line_ending(tokenizer) + } Some(b'\t' | b' ') => { tokenizer.consume(); State::Fn(Box::new(tag_open_attribute_value_before)) } Some(b'"' | b'\'') => { - let marker = tokenizer.current.unwrap(); + tokenizer.tokenize_state.marker = tokenizer.current.unwrap(); tokenizer.consume(); - State::Fn(Box::new(move |t| { - tag_open_attribute_value_quoted(t, marker) - })) + State::Fn(Box::new(tag_open_attribute_value_quoted)) } Some(_) => { tokenizer.consume(); @@ -526,22 +550,24 @@ fn tag_open_attribute_value_before(tokenizer: &mut Tokenizer) -> State { /// > | a <b c="d"> e /// ^ /// ``` -fn tag_open_attribute_value_quoted(tokenizer: &mut Tokenizer, marker: u8) -> State { +fn tag_open_attribute_value_quoted(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - None => State::Nok, - Some(b'\n') => at_line_ending( - tokenizer, - Box::new(move |t| tag_open_attribute_value_quoted(t, marker)), - ), - Some(b'"' | b'\'') if tokenizer.current.unwrap() == marker => { + None => { + tokenizer.tokenize_state.marker = 0; + State::Nok + } + Some(b'\n') => { + tokenizer.tokenize_state.return_state = Some(Box::new(tag_open_attribute_value_quoted)); + at_line_ending(tokenizer) + } + Some(b'"' | b'\'') if tokenizer.current.unwrap() == tokenizer.tokenize_state.marker => { + tokenizer.tokenize_state.marker = 0; tokenizer.consume(); 
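The `return_state` slot used throughout these `html_text` states is what lets `comment`, `cdata`, `declaration`, and `instruction` survive a line ending without capturing themselves in a closure: each parks its own function in `tokenize_state.return_state` before deferring to `at_line_ending`, and the continuation is taken back out after the line ending. A minimal, self-contained sketch of that hand-off, with stand-in types rather than the crate's real `Tokenizer`:

```rust
enum State {
    Ok,
    Fn(Box<StateFn>),
}

type StateFn = dyn FnOnce(&mut Tokenizer) -> State;

#[derive(Default)]
struct TokenizeState {
    return_state: Option<Box<StateFn>>,
}

#[derive(Default)]
struct Tokenizer {
    tokenize_state: TokenizeState,
}

/// At an eol: the caller has already stored its continuation.
fn at_line_ending(_tokenizer: &mut Tokenizer) -> State {
    // (consume the `\n` here)
    State::Fn(Box::new(after_line_ending))
}

/// After the eol: take the parked continuation back out and resume it.
fn after_line_ending(tokenizer: &mut Tokenizer) -> State {
    let return_state = tokenizer.tokenize_state.return_state.take().unwrap();
    return_state(tokenizer)
}

/// A state that wants to survive a line ending, like `comment` above.
fn comment(_tokenizer: &mut Tokenizer) -> State {
    State::Ok
}

fn main() {
    let mut tokenizer = Tokenizer::default();
    // What `comment` does on `\n`: store itself, then defer.
    tokenizer.tokenize_state.return_state = Some(Box::new(comment));
    if let State::Fn(next) = at_line_ending(&mut tokenizer) {
        assert!(matches!(next(&mut tokenizer), State::Ok));
    }
}
```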
State::Fn(Box::new(tag_open_attribute_value_quoted_after)) } _ => { tokenizer.consume(); - State::Fn(Box::new(move |t| { - tag_open_attribute_value_quoted(t, marker) - })) + State::Fn(Box::new(tag_open_attribute_value_quoted)) } } } @@ -605,14 +631,14 @@ fn end(tokenizer: &mut Tokenizer) -> State { /// ^ /// | b--> /// ``` -fn at_line_ending(tokenizer: &mut Tokenizer, return_state: Box<StateFn>) -> State { +fn at_line_ending(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { Some(b'\n') => { tokenizer.exit(Token::HtmlTextData); tokenizer.enter(Token::LineEnding); tokenizer.consume(); tokenizer.exit(Token::LineEnding); - State::Fn(Box::new(|t| after_line_ending(t, return_state))) + State::Fn(Box::new(after_line_ending)) } _ => unreachable!("expected eol"), } @@ -628,10 +654,8 @@ fn at_line_ending(tokenizer: &mut Tokenizer, return_state: Box<StateFn>) -> Stat /// > | b--> /// ^ /// ``` -fn after_line_ending(tokenizer: &mut Tokenizer, return_state: Box<StateFn>) -> State { - tokenizer.attempt_opt(space_or_tab(), |t| { - after_line_ending_prefix(t, return_state) - })(tokenizer) +fn after_line_ending(tokenizer: &mut Tokenizer) -> State { + tokenizer.attempt_opt(space_or_tab(), after_line_ending_prefix)(tokenizer) } /// After a line ending, after indent. @@ -644,7 +668,8 @@ fn after_line_ending(tokenizer: &mut Tokenizer, return_state: Box<StateFn>) -> S /// > | b--> /// ^ /// ``` -fn after_line_ending_prefix(tokenizer: &mut Tokenizer, return_state: Box<StateFn>) -> State { +fn after_line_ending_prefix(tokenizer: &mut Tokenizer) -> State { + let return_state = tokenizer.tokenize_state.return_state.take().unwrap(); tokenizer.enter(Token::HtmlTextData); return_state(tokenizer) } diff --git a/src/construct/label_end.rs b/src/construct/label_end.rs index d3191a8..b38e15a 100644 --- a/src/construct/label_end.rs +++ b/src/construct/label_end.rs @@ -148,10 +148,8 @@ use crate::constant::RESOURCE_DESTINATION_BALANCE_MAX; use crate::construct::{ - partial_destination::{start as destination, Options as DestinationOptions}, - partial_label::{start as label, Options as LabelOptions}, - partial_space_or_tab::space_or_tab_eol, - partial_title::{start as title, Options as TitleOptions}, + partial_destination::start as destination, partial_label::start as label, + partial_space_or_tab::space_or_tab_eol, partial_title::start as title, }; use crate::token::Token; use crate::tokenizer::{Event, EventType, Media, State, Tokenizer}; @@ -161,15 +159,6 @@ use crate::util::{ slice::{Position, Slice}, }; -/// State needed to parse label end. -#[derive(Debug)] -struct Info { - /// Index into `label_start_stack` of the corresponding opening. - label_start_index: usize, - /// The proposed `Media` that this seems to represent. - media: Media, -} - /// Start of label end. /// /// ```markdown @@ -202,36 +191,20 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { .get_mut(label_start_index) .unwrap(); + tokenizer.tokenize_state.start = label_start_index; + tokenizer.tokenize_state.end = tokenizer.events.len(); + // Mark as balanced if the info is inactive. if label_start.inactive { - return nok(tokenizer, label_start_index); + return nok(tokenizer); } - let label_end_start = tokenizer.events.len(); - - let info = Info { - label_start_index, - media: Media { - start: label_start.start, - end: (label_end_start, label_end_start + 3), - id: normalize_identifier( - // We don’t care about virtual spaces, so `indices` and `as_str` are fine. 
- Slice::from_indices( - tokenizer.parse_state.bytes, - tokenizer.events[label_start.start.1].point.index, - tokenizer.events[label_end_start - 1].point.index, - ) - .as_str(), - ), - }, - }; - tokenizer.enter(Token::LabelEnd); tokenizer.enter(Token::LabelMarker); tokenizer.consume(); tokenizer.exit(Token::LabelMarker); tokenizer.exit(Token::LabelEnd); - return State::Fn(Box::new(move |t| after(t, info))); + return State::Fn(Box::new(after)); } } @@ -250,40 +223,40 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { /// > | [a] b /// ^ /// ``` -fn after(tokenizer: &mut Tokenizer, info: Info) -> State { - let defined = tokenizer.parse_state.definitions.contains(&info.media.id); +fn after(tokenizer: &mut Tokenizer) -> State { + let start = &tokenizer.label_start_stack[tokenizer.tokenize_state.start]; + let defined = tokenizer + .parse_state + .definitions + .contains(&normalize_identifier( + // We don’t care about virtual spaces, so `indices` and `as_str` are fine. + Slice::from_indices( + tokenizer.parse_state.bytes, + tokenizer.events[start.start.1].point.index, + tokenizer.events[tokenizer.tokenize_state.end].point.index, + ) + .as_str(), + )); match tokenizer.current { // Resource (`[asd](fgh)`)? Some(b'(') => tokenizer.attempt(resource, move |is_ok| { - Box::new(move |t| { - // Also fine if `defined`, as then it’s a valid shortcut. - if is_ok || defined { - ok(t, info) - } else { - nok(t, info.label_start_index) - } - }) + Box::new(if is_ok || defined { ok } else { nok }) })(tokenizer), // Full (`[asd][fgh]`) or collapsed (`[asd][]`) reference? Some(b'[') => tokenizer.attempt(full_reference, move |is_ok| { - Box::new(move |t| { - if is_ok { - ok(t, info) - } else if defined { - reference_not_full(t, info) - } else { - nok(t, info.label_start_index) - } + Box::new(if is_ok { + ok + } else if defined { + reference_not_full + } else { + nok }) })(tokenizer), // Shortcut (`[asd]`) reference? _ => { - if defined { - ok(tokenizer, info) - } else { - nok(tokenizer, info.label_start_index) - } + let func = if defined { ok } else { nok }; + func(tokenizer) } } } @@ -298,15 +271,9 @@ fn after(tokenizer: &mut Tokenizer, info: Info) -> State { /// > | [a] b /// ^ /// ``` -fn reference_not_full(tokenizer: &mut Tokenizer, info: Info) -> State { - tokenizer.attempt(collapsed_reference, move |is_ok| { - Box::new(move |t| { - if is_ok { - ok(t, info) - } else { - nok(t, info.label_start_index) - } - }) +fn reference_not_full(tokenizer: &mut Tokenizer) -> State { + tokenizer.attempt(collapsed_reference, |is_ok| { + Box::new(if is_ok { ok } else { nok }) })(tokenizer) } @@ -322,16 +289,15 @@ fn reference_not_full(tokenizer: &mut Tokenizer, info: Info) -> State { /// > | [a] b /// ^ /// ``` -fn ok(tokenizer: &mut Tokenizer, mut info: Info) -> State { +fn ok(tokenizer: &mut Tokenizer) -> State { + let label_start_index = tokenizer.tokenize_state.start; // Remove this one and everything after it. - let mut left = tokenizer - .label_start_stack - .split_off(info.label_start_index); + let mut left = tokenizer.label_start_stack.split_off(label_start_index); // Remove this one from `left`, as we’ll move it to `media_list`. 
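With `Media.id` gone from the old `Info`, `after` keeps only two event indices in `tokenize_state.start`/`.end` and re-derives the identifier on demand when it checks `definitions`. A rough illustration of that normalization step, using a simplified stand-in for `util::normalize_identifier` (the real helper also case-folds Unicode):

```rust
/// Simplified stand-in for `util::normalize_identifier`: collapse internal
/// whitespace and lowercase, as CommonMark label matching requires.
fn normalize_identifier(value: &str) -> String {
    value
        .split_whitespace()
        .collect::<Vec<_>>()
        .join(" ")
        .to_lowercase()
}

fn main() {
    let bytes = b"[A  b](x)";
    // Pretend these indices were recorded by the label start and label end
    // events; the id is only computed when `after` actually needs it.
    let (start, end) = (1, 5);
    let id = normalize_identifier(std::str::from_utf8(&bytes[start..end]).unwrap());
    assert_eq!(id, "a b");
}
```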
- left.remove(0); + let label_start = left.remove(0); tokenizer.label_start_list_loose.append(&mut left); - let is_link = tokenizer.events[info.media.start.0].token_type == Token::LabelLink; + let is_link = tokenizer.events[label_start.start.0].token_type == Token::LabelLink; if is_link { let mut index = 0; @@ -344,8 +310,12 @@ fn ok(tokenizer: &mut Tokenizer, mut info: Info) -> State { } } - info.media.end.1 = tokenizer.events.len() - 1; - tokenizer.media_list.push(info.media); + tokenizer.media_list.push(Media { + start: label_start.start, + end: (tokenizer.tokenize_state.end, tokenizer.events.len() - 1), + }); + tokenizer.tokenize_state.start = 0; + tokenizer.tokenize_state.end = 0; tokenizer.register_resolver_before("media".to_string(), Box::new(resolve_media)); State::Ok } @@ -362,12 +332,14 @@ fn ok(tokenizer: &mut Tokenizer, mut info: Info) -> State { /// > | [a] b /// ^ /// ``` -fn nok(tokenizer: &mut Tokenizer, label_start_index: usize) -> State { +fn nok(tokenizer: &mut Tokenizer) -> State { tokenizer .label_start_stack - .get_mut(label_start_index) + .get_mut(tokenizer.tokenize_state.start) .unwrap() .balanced = true; + tokenizer.tokenize_state.start = 0; + tokenizer.tokenize_state.end = 0; State::Nok } @@ -407,24 +379,23 @@ fn resource_start(tokenizer: &mut Tokenizer) -> State { /// ^ /// ``` fn resource_open(tokenizer: &mut Tokenizer) -> State { - match tokenizer.current { - Some(b')') => resource_end(tokenizer), - _ => tokenizer.go( - |t| { - destination( - t, - DestinationOptions { - limit: RESOURCE_DESTINATION_BALANCE_MAX, - destination: Token::ResourceDestination, - literal: Token::ResourceDestinationLiteral, - marker: Token::ResourceDestinationLiteralMarker, - raw: Token::ResourceDestinationRaw, - string: Token::ResourceDestinationString, - }, - ) - }, - destination_after, - )(tokenizer), + if let Some(b')') = tokenizer.current { + resource_end(tokenizer) + } else { + tokenizer.tokenize_state.token_1 = Token::ResourceDestination; + tokenizer.tokenize_state.token_2 = Token::ResourceDestinationLiteral; + tokenizer.tokenize_state.token_3 = Token::ResourceDestinationLiteralMarker; + tokenizer.tokenize_state.token_4 = Token::ResourceDestinationRaw; + tokenizer.tokenize_state.token_5 = Token::ResourceDestinationString; + tokenizer.tokenize_state.size_other = RESOURCE_DESTINATION_BALANCE_MAX; + + tokenizer.attempt(destination, |ok| { + Box::new(if ok { + destination_after + } else { + destination_missing + }) + })(tokenizer) } } @@ -435,11 +406,29 @@ fn resource_open(tokenizer: &mut Tokenizer) -> State { /// ^ /// ``` fn destination_after(tokenizer: &mut Tokenizer) -> State { + tokenizer.tokenize_state.token_1 = Token::Data; + tokenizer.tokenize_state.token_2 = Token::Data; + tokenizer.tokenize_state.token_3 = Token::Data; + tokenizer.tokenize_state.token_4 = Token::Data; + tokenizer.tokenize_state.token_5 = Token::Data; + tokenizer.tokenize_state.size_other = 0; + tokenizer.attempt(space_or_tab_eol(), |ok| { Box::new(if ok { resource_between } else { resource_end }) })(tokenizer) } +/// Without destination. +fn destination_missing(tokenizer: &mut Tokenizer) -> State { + tokenizer.tokenize_state.token_1 = Token::Data; + tokenizer.tokenize_state.token_2 = Token::Data; + tokenizer.tokenize_state.token_3 = Token::Data; + tokenizer.tokenize_state.token_4 = Token::Data; + tokenizer.tokenize_state.token_5 = Token::Data; + tokenizer.tokenize_state.size_other = 0; + State::Nok +} + /// In a resource, after a destination, after whitespace. 
/// /// ```markdown @@ -448,19 +437,12 @@ fn destination_after(tokenizer: &mut Tokenizer) -> State { /// ``` fn resource_between(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Some(b'"' | b'\'' | b'(') => tokenizer.go( - |t| { - title( - t, - TitleOptions { - title: Token::ResourceTitle, - marker: Token::ResourceTitleMarker, - string: Token::ResourceTitleString, - }, - ) - }, - title_after, - )(tokenizer), + Some(b'"' | b'\'' | b'(') => { + tokenizer.tokenize_state.token_1 = Token::ResourceTitle; + tokenizer.tokenize_state.token_2 = Token::ResourceTitleMarker; + tokenizer.tokenize_state.token_3 = Token::ResourceTitleString; + tokenizer.go(title, title_after)(tokenizer) + } _ => resource_end(tokenizer), } } @@ -472,6 +454,9 @@ fn resource_between(tokenizer: &mut Tokenizer) -> State { /// ^ /// ``` fn title_after(tokenizer: &mut Tokenizer) -> State { + tokenizer.tokenize_state.token_1 = Token::Data; + tokenizer.tokenize_state.token_2 = Token::Data; + tokenizer.tokenize_state.token_3 = Token::Data; tokenizer.attempt_opt(space_or_tab_eol(), resource_end)(tokenizer) } @@ -502,19 +487,12 @@ fn resource_end(tokenizer: &mut Tokenizer) -> State { /// ``` fn full_reference(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Some(b'[') => tokenizer.go( - |t| { - label( - t, - LabelOptions { - label: Token::Reference, - marker: Token::ReferenceMarker, - string: Token::ReferenceString, - }, - ) - }, - full_reference_after, - )(tokenizer), + Some(b'[') => { + tokenizer.tokenize_state.token_1 = Token::Reference; + tokenizer.tokenize_state.token_2 = Token::ReferenceMarker; + tokenizer.tokenize_state.token_3 = Token::ReferenceString; + tokenizer.go(label, full_reference_after)(tokenizer) + } _ => unreachable!("expected `[`"), } } @@ -526,6 +504,10 @@ fn full_reference(tokenizer: &mut Tokenizer) -> State { /// ^ /// ``` fn full_reference_after(tokenizer: &mut Tokenizer) -> State { + tokenizer.tokenize_state.token_1 = Token::Data; + tokenizer.tokenize_state.token_2 = Token::Data; + tokenizer.tokenize_state.token_3 = Token::Data; + if tokenizer .parse_state .definitions diff --git a/src/construct/list.rs b/src/construct/list.rs index d5a9899..0e12b7c 100644 --- a/src/construct/list.rs +++ b/src/construct/list.rs @@ -123,7 +123,7 @@ fn before_unordered(tokenizer: &mut Tokenizer) -> State { fn before_ordered(tokenizer: &mut Tokenizer) -> State { tokenizer.enter(Token::ListItemPrefix); tokenizer.enter(Token::ListItemValue); - inside(tokenizer, 0) + inside(tokenizer) } /// In an ordered list item value. @@ -132,17 +132,21 @@ fn before_ordered(tokenizer: &mut Tokenizer) -> State { /// > | 1. a /// ^ /// ``` -fn inside(tokenizer: &mut Tokenizer, size: usize) -> State { +fn inside(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Some(b'.' | b')') if !tokenizer.interrupt || size < 2 => { + Some(b'.' 
| b')') if !tokenizer.interrupt || tokenizer.tokenize_state.size < 2 => { tokenizer.exit(Token::ListItemValue); marker(tokenizer) } - Some(b'0'..=b'9') if size + 1 < LIST_ITEM_VALUE_SIZE_MAX => { + Some(b'0'..=b'9') if tokenizer.tokenize_state.size + 1 < LIST_ITEM_VALUE_SIZE_MAX => { + tokenizer.tokenize_state.size += 1; tokenizer.consume(); - State::Fn(Box::new(move |t| inside(t, size + 1))) + State::Fn(Box::new(inside)) + } + _ => { + tokenizer.tokenize_state.size = 0; + State::Nok } - _ => State::Nok, } } @@ -170,12 +174,9 @@ fn marker(tokenizer: &mut Tokenizer) -> State { /// ^ /// ``` fn marker_after(tokenizer: &mut Tokenizer) -> State { - tokenizer.check(blank_line, move |ok| { - if ok { - Box::new(|t| after(t, true)) - } else { - Box::new(marker_after_not_blank) - } + tokenizer.tokenize_state.size = 1; + tokenizer.check(blank_line, |ok| { + Box::new(if ok { after } else { marker_after_not_blank }) })(tokenizer) } @@ -186,13 +187,11 @@ fn marker_after(tokenizer: &mut Tokenizer) -> State { /// ^ /// ``` fn marker_after_not_blank(tokenizer: &mut Tokenizer) -> State { + tokenizer.tokenize_state.size = 0; + // Attempt to parse up to the largest allowed indent, `nok` if there is more whitespace. - tokenizer.attempt(whitespace, move |ok| { - if ok { - Box::new(|t| after(t, false)) - } else { - Box::new(prefix_other) - } + tokenizer.attempt(whitespace, |ok| { + Box::new(if ok { after } else { prefix_other }) })(tokenizer) } @@ -232,7 +231,7 @@ fn prefix_other(tokenizer: &mut Tokenizer) -> State { tokenizer.enter(Token::SpaceOrTab); tokenizer.consume(); tokenizer.exit(Token::SpaceOrTab); - State::Fn(Box::new(|t| after(t, false))) + State::Fn(Box::new(after)) } _ => State::Nok, } @@ -244,7 +243,10 @@ fn prefix_other(tokenizer: &mut Tokenizer) -> State { /// > | * a /// ^ /// ``` -fn after(tokenizer: &mut Tokenizer, blank: bool) -> State { +fn after(tokenizer: &mut Tokenizer) -> State { + let blank = tokenizer.tokenize_state.size == 1; + tokenizer.tokenize_state.size = 0; + if blank && tokenizer.interrupt { State::Nok } else { diff --git a/src/construct/paragraph.rs b/src/construct/paragraph.rs index ec5669c..7fdaa66 100644 --- a/src/construct/paragraph.rs +++ b/src/construct/paragraph.rs @@ -44,9 +44,7 @@ use crate::util::skip::opt as skip_opt; /// ``` pub fn start(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - None | Some(b'\n') => { - unreachable!("unexpected eol/eof") - } + None | Some(b'\n') => unreachable!("unexpected eol/eof"), _ => { tokenizer.enter(Token::Paragraph); tokenizer.enter_with_content(Token::Data, Some(ContentType::Text)); diff --git a/src/construct/partial_bom.rs b/src/construct/partial_bom.rs index d92c9c1..2257bfd 100644 --- a/src/construct/partial_bom.rs +++ b/src/construct/partial_bom.rs @@ -13,6 +13,8 @@ use crate::token::Token; use crate::tokenizer::{State, Tokenizer}; +const BOM: [u8; 3] = [0xEF, 0xBB, 0xBF]; + /// Before a BOM. /// /// ```text @@ -20,42 +22,33 @@ use crate::tokenizer::{State, Tokenizer}; /// ^^^^ /// ``` pub fn start(tokenizer: &mut Tokenizer) -> State { - if tokenizer.current == Some(0xEF) { + if tokenizer.current == Some(BOM[0]) { tokenizer.enter(Token::ByteOrderMark); - tokenizer.consume(); - State::Fn(Box::new(cont)) - } else { - State::Nok - } -} - -/// Second byte in BOM. 
-/// -/// ```text -/// > | 0xEF 0xBB 0xBF -/// ^^^^ -/// ``` -fn cont(tokenizer: &mut Tokenizer) -> State { - if tokenizer.current == Some(0xBB) { - tokenizer.consume(); - State::Fn(Box::new(end)) + inside(tokenizer) } else { State::Nok } } -/// Last byte in BOM. +/// Inside the BOM. /// /// ```text /// > | 0xEF 0xBB 0xBF -/// ^^^^ +/// ^^^^ ^^^^ ^^^^ /// ``` -fn end(tokenizer: &mut Tokenizer) -> State { - if tokenizer.current == Some(0xBF) { +fn inside(tokenizer: &mut Tokenizer) -> State { + if tokenizer.current == Some(BOM[tokenizer.tokenize_state.size]) { + tokenizer.tokenize_state.size += 1; tokenizer.consume(); - tokenizer.exit(Token::ByteOrderMark); - State::Ok + if tokenizer.tokenize_state.size == BOM.len() { + tokenizer.exit(Token::ByteOrderMark); + tokenizer.tokenize_state.size = 0; + State::Ok + } else { + State::Fn(Box::new(inside)) + } } else { + tokenizer.tokenize_state.size = 0; State::Nok } } diff --git a/src/construct/partial_data.rs b/src/construct/partial_data.rs index 335d7ab..0365489 100644 --- a/src/construct/partial_data.rs +++ b/src/construct/partial_data.rs @@ -15,14 +15,14 @@ use crate::tokenizer::{EventType, State, Tokenizer}; /// > | abc /// ^ /// ``` -pub fn start(tokenizer: &mut Tokenizer, stop: &'static [u8]) -> State { +pub fn start(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Some(byte) if stop.contains(&byte) => { + Some(byte) if tokenizer.tokenize_state.stop.contains(&byte) => { tokenizer.enter(Token::Data); tokenizer.consume(); - State::Fn(Box::new(move |t| data(t, stop))) + State::Fn(Box::new(data)) } - _ => at_break(tokenizer, stop), + _ => at_break(tokenizer), } } @@ -32,22 +32,22 @@ pub fn start(tokenizer: &mut Tokenizer, stop: &'static [u8]) -> State { /// > | abc /// ^ /// ``` -fn at_break(tokenizer: &mut Tokenizer, stop: &'static [u8]) -> State { +fn at_break(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { None => State::Ok, Some(b'\n') => { tokenizer.enter(Token::LineEnding); tokenizer.consume(); tokenizer.exit(Token::LineEnding); - State::Fn(Box::new(move |t| at_break(t, stop))) + State::Fn(Box::new(at_break)) } - Some(byte) if stop.contains(&byte) => { + Some(byte) if tokenizer.tokenize_state.stop.contains(&byte) => { tokenizer.register_resolver_before("data".to_string(), Box::new(resolve_data)); State::Ok } _ => { tokenizer.enter(Token::Data); - data(tokenizer, stop) + data(tokenizer) } } } @@ -58,19 +58,19 @@ fn at_break(tokenizer: &mut Tokenizer, stop: &'static [u8]) -> State { /// > | abc /// ^^^ /// ``` -fn data(tokenizer: &mut Tokenizer, stop: &'static [u8]) -> State { +fn data(tokenizer: &mut Tokenizer) -> State { let done = match tokenizer.current { None | Some(b'\n') => true, - Some(byte) if stop.contains(&byte) => true, + Some(byte) if tokenizer.tokenize_state.stop.contains(&byte) => true, _ => false, }; if done { tokenizer.exit(Token::Data); - at_break(tokenizer, stop) + at_break(tokenizer) } else { tokenizer.consume(); - State::Fn(Box::new(move |t| data(t, stop))) + State::Fn(Box::new(data)) } } diff --git a/src/construct/partial_destination.rs b/src/construct/partial_destination.rs index 809aa27..f1cfc7d 100644 --- a/src/construct/partial_destination.rs +++ b/src/construct/partial_destination.rs @@ -74,34 +74,6 @@ use crate::token::Token; use crate::tokenizer::{ContentType, State, Tokenizer}; -/// Configuration. -/// -/// You must pass the token types in that are used. -#[derive(Debug)] -pub struct Options { - /// Token for the whole destination. 
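The three hand-written BOM states (`start`, `cont`, `end`) collapse into one `inside` state that walks the new `BOM` constant with `tokenize_state.size`. The same loop as a freestanding sketch, ignoring the crate's streaming token machinery:

```rust
const BOM: [u8; 3] = [0xEF, 0xBB, 0xBF];

/// Check whether `bytes` open with a UTF-8 byte order mark, walking the
/// constant with one index the way the new `inside` state walks
/// `tokenize_state.size`.
fn starts_with_bom(bytes: &[u8]) -> bool {
    let mut size = 0;
    while size < BOM.len() {
        if bytes.get(size) != Some(&BOM[size]) {
            return false;
        }
        size += 1;
    }
    true
}

fn main() {
    assert!(starts_with_bom(&[0xEF, 0xBB, 0xBF, b'a']));
    assert!(!starts_with_bom(b"abc"));
}
```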
- pub destination: Token, - /// Token for a literal (enclosed) destination. - pub literal: Token, - /// Token for a literal marker. - pub marker: Token, - /// Token for a raw destination. - pub raw: Token, - /// Token for a the string. - pub string: Token, - /// Maximum unbalanced parens. - pub limit: usize, -} - -/// State needed to parse destination. -#[derive(Debug)] -struct Info { - /// Paren balance (used in raw). - balance: usize, - /// Configuration. - options: Options, -} - /// Before a destination. /// /// ```markdown @@ -110,29 +82,24 @@ struct Info { /// > | aa /// ^ /// ``` -pub fn start(tokenizer: &mut Tokenizer, options: Options) -> State { - let info = Info { - balance: 0, - options, - }; - +pub fn start(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { Some(b'<') => { - tokenizer.enter(info.options.destination.clone()); - tokenizer.enter(info.options.literal.clone()); - tokenizer.enter(info.options.marker.clone()); + tokenizer.enter(tokenizer.tokenize_state.token_1.clone()); + tokenizer.enter(tokenizer.tokenize_state.token_2.clone()); + tokenizer.enter(tokenizer.tokenize_state.token_3.clone()); tokenizer.consume(); - tokenizer.exit(info.options.marker.clone()); - State::Fn(Box::new(|t| enclosed_before(t, info))) + tokenizer.exit(tokenizer.tokenize_state.token_3.clone()); + State::Fn(Box::new(enclosed_before)) } // ASCII control, space, closing paren, but *not* `\0`. None | Some(0x01..=0x1F | b' ' | b')' | 0x7F) => State::Nok, Some(_) => { - tokenizer.enter(info.options.destination.clone()); - tokenizer.enter(info.options.raw.clone()); - tokenizer.enter(info.options.string.clone()); + tokenizer.enter(tokenizer.tokenize_state.token_1.clone()); + tokenizer.enter(tokenizer.tokenize_state.token_4.clone()); + tokenizer.enter(tokenizer.tokenize_state.token_5.clone()); tokenizer.enter_with_content(Token::Data, Some(ContentType::String)); - raw(tokenizer, info) + raw(tokenizer) } } } @@ -143,18 +110,18 @@ pub fn start(tokenizer: &mut Tokenizer, options: Options) -> State { /// > | <aa> /// ^ /// ``` -fn enclosed_before(tokenizer: &mut Tokenizer, info: Info) -> State { +fn enclosed_before(tokenizer: &mut Tokenizer) -> State { if let Some(b'>') = tokenizer.current { - tokenizer.enter(info.options.marker.clone()); + tokenizer.enter(tokenizer.tokenize_state.token_3.clone()); tokenizer.consume(); - tokenizer.exit(info.options.marker.clone()); - tokenizer.exit(info.options.literal.clone()); - tokenizer.exit(info.options.destination); + tokenizer.exit(tokenizer.tokenize_state.token_3.clone()); + tokenizer.exit(tokenizer.tokenize_state.token_2.clone()); + tokenizer.exit(tokenizer.tokenize_state.token_1.clone()); State::Ok } else { - tokenizer.enter(info.options.string.clone()); + tokenizer.enter(tokenizer.tokenize_state.token_5.clone()); tokenizer.enter_with_content(Token::Data, Some(ContentType::String)); - enclosed(tokenizer, info) + enclosed(tokenizer) } } @@ -164,21 +131,21 @@ fn enclosed_before(tokenizer: &mut Tokenizer, info: Info) -> State { /// > | <aa> /// ^ /// ``` -fn enclosed(tokenizer: &mut Tokenizer, info: Info) -> State { +fn enclosed(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { None | Some(b'\n' | b'<') => State::Nok, Some(b'>') => { tokenizer.exit(Token::Data); - tokenizer.exit(info.options.string.clone()); - enclosed_before(tokenizer, info) + tokenizer.exit(tokenizer.tokenize_state.token_5.clone()); + enclosed_before(tokenizer) } Some(b'\\') => { tokenizer.consume(); - State::Fn(Box::new(|t| enclosed_escape(t, info))) + 
State::Fn(Box::new(enclosed_escape)) } _ => { tokenizer.consume(); - State::Fn(Box::new(|t| enclosed(t, info))) + State::Fn(Box::new(enclosed)) } } } @@ -189,13 +156,13 @@ fn enclosed(tokenizer: &mut Tokenizer, info: Info) -> State { /// > | <a\*a> /// ^ /// ``` -fn enclosed_escape(tokenizer: &mut Tokenizer, info: Info) -> State { +fn enclosed_escape(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { Some(b'<' | b'>' | b'\\') => { tokenizer.consume(); - State::Fn(Box::new(|t| enclosed(t, info))) + State::Fn(Box::new(enclosed)) } - _ => enclosed(tokenizer, info), + _ => enclosed(tokenizer), } } @@ -205,34 +172,38 @@ fn enclosed_escape(tokenizer: &mut Tokenizer, info: Info) -> State { /// > | aa /// ^ /// ``` -fn raw(tokenizer: &mut Tokenizer, mut info: Info) -> State { +fn raw(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - None | Some(b'\t' | b'\n' | b' ' | b')') if info.balance == 0 => { + None | Some(b'\t' | b'\n' | b' ' | b')') if tokenizer.tokenize_state.size == 0 => { tokenizer.exit(Token::Data); - tokenizer.exit(info.options.string.clone()); - tokenizer.exit(info.options.raw.clone()); - tokenizer.exit(info.options.destination); + tokenizer.exit(tokenizer.tokenize_state.token_5.clone()); + tokenizer.exit(tokenizer.tokenize_state.token_4.clone()); + tokenizer.exit(tokenizer.tokenize_state.token_1.clone()); + tokenizer.tokenize_state.size = 0; State::Ok } - Some(b'(') if info.balance < info.options.limit => { + Some(b'(') if tokenizer.tokenize_state.size < tokenizer.tokenize_state.size_other => { tokenizer.consume(); - info.balance += 1; - State::Fn(Box::new(move |t| raw(t, info))) + tokenizer.tokenize_state.size += 1; + State::Fn(Box::new(raw)) } // ASCII control (but *not* `\0`) and space and `(`. - None | Some(0x01..=0x1F | b' ' | b'(' | 0x7F) => State::Nok, + None | Some(0x01..=0x1F | b' ' | b'(' | 0x7F) => { + tokenizer.tokenize_state.size = 0; + State::Nok + } Some(b')') => { tokenizer.consume(); - info.balance -= 1; - State::Fn(Box::new(move |t| raw(t, info))) + tokenizer.tokenize_state.size -= 1; + State::Fn(Box::new(raw)) } Some(b'\\') => { tokenizer.consume(); - State::Fn(Box::new(move |t| raw_escape(t, info))) + State::Fn(Box::new(raw_escape)) } Some(_) => { tokenizer.consume(); - State::Fn(Box::new(move |t| raw(t, info))) + State::Fn(Box::new(raw)) } } } @@ -243,12 +214,12 @@ fn raw(tokenizer: &mut Tokenizer, mut info: Info) -> State { /// > | a\*a /// ^ /// ``` -fn raw_escape(tokenizer: &mut Tokenizer, info: Info) -> State { +fn raw_escape(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { Some(b'(' | b')' | b'\\') => { tokenizer.consume(); - State::Fn(Box::new(move |t| raw(t, info))) + State::Fn(Box::new(raw)) } - _ => raw(tokenizer, info), + _ => raw(tokenizer), } } diff --git a/src/construct/partial_label.rs b/src/construct/partial_label.rs index 6fdb70d..0e1c2ec 100644 --- a/src/construct/partial_label.rs +++ b/src/construct/partial_label.rs @@ -64,53 +64,21 @@ use crate::subtokenize::link; use crate::token::Token; use crate::tokenizer::{ContentType, State, Tokenizer}; -/// Configuration. -/// -/// You must pass the token types in that are used. -#[derive(Debug)] -pub struct Options { - /// Token for the whole label. - pub label: Token, - /// Token for the markers. - pub marker: Token, - /// Token for the string (inside the markers). - pub string: Token, -} - -/// State needed to parse labels. -#[derive(Debug)] -struct Info { - /// Whether we’ve seen our first `ChunkString`. 
-    connect: bool,
-    /// Whether there are non-blank bytes in the label.
-    data: bool,
-    /// Number of bytes in the label.
-    size: usize,
-    /// Configuration.
-    options: Options,
-}
-
 /// Before a label.
 ///
 /// ```markdown
 /// > | [a]
 ///     ^
 /// ```
-pub fn start(tokenizer: &mut Tokenizer, options: Options) -> State {
+pub fn start(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
         Some(b'[') => {
-            let info = Info {
-                connect: false,
-                data: false,
-                size: 0,
-                options,
-            };
-            tokenizer.enter(info.options.label.clone());
-            tokenizer.enter(info.options.marker.clone());
+            tokenizer.enter(tokenizer.tokenize_state.token_1.clone());
+            tokenizer.enter(tokenizer.tokenize_state.token_2.clone());
             tokenizer.consume();
-            tokenizer.exit(info.options.marker.clone());
-            tokenizer.enter(info.options.string.clone());
-            State::Fn(Box::new(|t| at_break(t, info)))
+            tokenizer.exit(tokenizer.tokenize_state.token_2.clone());
+            tokenizer.enter(tokenizer.tokenize_state.token_3.clone());
+            State::Fn(Box::new(at_break))
         }
         _ => State::Nok,
     }
 }
@@ -122,72 +90,88 @@ pub fn start(tokenizer: &mut Tokenizer, options: Options) -> State {
 /// > | [a]
 ///      ^
 /// ```
-fn at_break(tokenizer: &mut Tokenizer, mut info: Info) -> State {
-    if info.size > LINK_REFERENCE_SIZE_MAX
+fn at_break(tokenizer: &mut Tokenizer) -> State {
+    if tokenizer.tokenize_state.size > LINK_REFERENCE_SIZE_MAX
         || matches!(tokenizer.current, None | Some(b'['))
-        || (matches!(tokenizer.current, Some(b']')) && !info.data)
+        || (matches!(tokenizer.current, Some(b']')) && !tokenizer.tokenize_state.seen)
     {
+        tokenizer.tokenize_state.connect = false;
+        tokenizer.tokenize_state.seen = false;
+        tokenizer.tokenize_state.size = 0;
         State::Nok
     } else {
         match tokenizer.current {
-            Some(b'\n') => tokenizer.go(
+            Some(b'\n') => tokenizer.attempt(
                 space_or_tab_eol_with_options(EolOptions {
                     content_type: Some(ContentType::String),
-                    connect: info.connect,
+                    connect: tokenizer.tokenize_state.connect,
                 }),
-                |t| {
-                    info.connect = true;
-                    at_break(t, info)
-                },
+                |ok| Box::new(if ok { after_eol } else { at_blank_line }),
             )(tokenizer),
             Some(b']') => {
-                tokenizer.exit(info.options.string.clone());
-                tokenizer.enter(info.options.marker.clone());
+                tokenizer.exit(tokenizer.tokenize_state.token_3.clone());
+                tokenizer.enter(tokenizer.tokenize_state.token_2.clone());
                 tokenizer.consume();
-                tokenizer.exit(info.options.marker.clone());
-                tokenizer.exit(info.options.label);
+                tokenizer.exit(tokenizer.tokenize_state.token_2.clone());
+                tokenizer.exit(tokenizer.tokenize_state.token_1.clone());
+                tokenizer.tokenize_state.connect = false;
+                tokenizer.tokenize_state.seen = false;
+                tokenizer.tokenize_state.size = 0;
                 State::Ok
             }
             _ => {
                 tokenizer.enter_with_content(Token::Data, Some(ContentType::String));

-                if info.connect {
+                if tokenizer.tokenize_state.connect {
                     let index = tokenizer.events.len() - 1;
                     link(&mut tokenizer.events, index);
                 } else {
-                    info.connect = true;
+                    tokenizer.tokenize_state.connect = true;
                 }

-                label(tokenizer, info)
+                label(tokenizer)
             }
         }
     }
 }

+/// In a label, after a line ending the label can span.
+fn after_eol(tokenizer: &mut Tokenizer) -> State {
+    tokenizer.tokenize_state.connect = true;
+    at_break(tokenizer)
+}
+
+/// In a label, at a blank line, which is not allowed.
+fn at_blank_line(tokenizer: &mut Tokenizer) -> State {
+    tokenizer.tokenize_state.marker = 0;
+    tokenizer.tokenize_state.connect = false;
+    State::Nok
+}
+
 /// In a label, in text.
/// /// ```markdown /// > | [a] /// ^ /// ``` -fn label(tokenizer: &mut Tokenizer, mut info: Info) -> State { +fn label(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { None | Some(b'\n' | b'[' | b']') => { tokenizer.exit(Token::Data); - at_break(tokenizer, info) + at_break(tokenizer) } Some(byte) => { - if info.size > LINK_REFERENCE_SIZE_MAX { + if tokenizer.tokenize_state.size > LINK_REFERENCE_SIZE_MAX { tokenizer.exit(Token::Data); - at_break(tokenizer, info) + at_break(tokenizer) } else { let func = if matches!(byte, b'\\') { escape } else { label }; tokenizer.consume(); - info.size += 1; - if !info.data && !matches!(byte, b'\t' | b' ') { - info.data = true; + tokenizer.tokenize_state.size += 1; + if !tokenizer.tokenize_state.seen && !matches!(byte, b'\t' | b' ') { + tokenizer.tokenize_state.seen = true; } - State::Fn(Box::new(move |t| func(t, info))) + State::Fn(Box::new(func)) } } } @@ -199,13 +183,13 @@ fn label(tokenizer: &mut Tokenizer, mut info: Info) -> State { /// > | [a\*a] /// ^ /// ``` -fn escape(tokenizer: &mut Tokenizer, mut info: Info) -> State { +fn escape(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { Some(b'[' | b'\\' | b']') => { tokenizer.consume(); - info.size += 1; - State::Fn(Box::new(|t| label(t, info))) + tokenizer.tokenize_state.size += 1; + State::Fn(Box::new(label)) } - _ => label(tokenizer, info), + _ => label(tokenizer), } } diff --git a/src/construct/partial_space_or_tab.rs b/src/construct/partial_space_or_tab.rs index f31cbc6..e3eac45 100644 --- a/src/construct/partial_space_or_tab.rs +++ b/src/construct/partial_space_or_tab.rs @@ -32,26 +32,6 @@ pub struct EolOptions { pub content_type: Option<ContentType>, } -/// State needed to parse `space_or_tab`. -#[derive(Debug)] -struct Info { - /// Current size. - size: usize, - /// Configuration. - options: Options, -} - -/// State needed to parse `space_or_tab_eol`. -#[derive(Debug)] -struct EolInfo { - /// Whether to connect the next whitespace to the event before. - connect: bool, - /// Whether there was initial whitespace. - ok: bool, - /// Configuration. - options: EolOptions, -} - /// One or more `space_or_tab`. /// /// ```bnf @@ -78,7 +58,14 @@ pub fn space_or_tab_min_max(min: usize, max: usize) -> Box<StateFn> { /// `space_or_tab`, with the given options. pub fn space_or_tab_with_options(options: Options) -> Box<StateFn> { - Box::new(|t| start(t, Info { size: 0, options })) + Box::new(|tokenizer| { + tokenizer.tokenize_state.space_or_tab_connect = options.connect; + tokenizer.tokenize_state.space_or_tab_content_type = options.content_type; + tokenizer.tokenize_state.space_or_tab_min = options.min; + tokenizer.tokenize_state.space_or_tab_max = options.max; + tokenizer.tokenize_state.space_or_tab_token = options.kind; + start(tokenizer) + }) } /// `space_or_tab`, or optionally `space_or_tab`, one `eol`, and @@ -97,30 +84,28 @@ pub fn space_or_tab_eol() -> Box<StateFn> { /// `space_or_tab_eol`, with the given options. 
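Here `space_or_tab_with_options` stops boxing an `Info { size, options }` per call: the returned closure only seeds a block of dedicated `space_or_tab_*` fields, and every later state is a plain `fn` reading them. A rough, self-contained sketch of that seed-then-scan shape (field and function names are illustrative, not the crate's full set):

```rust
/// Scratch fields standing in for the tokenizer's new `space_or_tab_*` slots.
#[derive(Default)]
struct TokenizeState {
    space_or_tab_min: usize,
    space_or_tab_max: usize,
    space_or_tab_size: usize,
}

/// What `space_or_tab_with_options` now does up front: seed the fields.
fn seed(state: &mut TokenizeState, min: usize, max: usize) {
    state.space_or_tab_min = min;
    state.space_or_tab_max = max;
    state.space_or_tab_size = 0;
}

/// One `inside` step: count the byte if it is a tab or space under `max`.
fn step(state: &mut TokenizeState, byte: u8) -> bool {
    if matches!(byte, b'\t' | b' ') && state.space_or_tab_size < state.space_or_tab_max {
        state.space_or_tab_size += 1;
        true
    } else {
        false
    }
}

fn main() {
    let mut state = TokenizeState::default();
    seed(&mut state, 1, usize::MAX);
    let consumed = b"  a".iter().take_while(|&&byte| step(&mut state, byte)).count();
    assert_eq!(consumed, 2);
    // `after` would now compare `size` against `min` and reset both.
    assert!(state.space_or_tab_size >= state.space_or_tab_min);
}
```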
pub fn space_or_tab_eol_with_options(options: EolOptions) -> Box<StateFn> { Box::new(move |tokenizer| { - let mut info = EolInfo { - connect: options.connect, - ok: false, - options, - }; + tokenizer.tokenize_state.space_or_tab_eol_content_type = options.content_type; + tokenizer.tokenize_state.space_or_tab_eol_connect = options.connect; tokenizer.attempt( space_or_tab_with_options(Options { kind: Token::SpaceOrTab, min: 1, max: usize::MAX, - content_type: info.options.content_type.clone(), - connect: info.options.connect, + content_type: tokenizer + .tokenize_state + .space_or_tab_eol_content_type + .clone(), + connect: tokenizer.tokenize_state.space_or_tab_eol_connect, }), move |ok| { - if ok { - info.ok = ok; - - if info.options.content_type.is_some() { - info.connect = true; + Box::new(move |tokenizer| { + if ok { + tokenizer.tokenize_state.space_or_tab_eol_ok = ok; } - } - Box::new(|t| after_space_or_tab(t, info)) + after_space_or_tab(tokenizer) + }) }, )(tokenizer) }) @@ -132,28 +117,24 @@ pub fn space_or_tab_eol_with_options(options: EolOptions) -> Box<StateFn> { /// > | a␠␠b /// ^ /// ``` -fn start(tokenizer: &mut Tokenizer, mut info: Info) -> State { +fn start(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Some(b'\t' | b' ') if info.options.max > 0 => { - tokenizer - .enter_with_content(info.options.kind.clone(), info.options.content_type.clone()); + Some(b'\t' | b' ') if tokenizer.tokenize_state.space_or_tab_max > 0 => { + tokenizer.enter_with_content( + tokenizer.tokenize_state.space_or_tab_token.clone(), + tokenizer.tokenize_state.space_or_tab_content_type.clone(), + ); - if info.options.content_type.is_some() { + if tokenizer.tokenize_state.space_or_tab_connect { let index = tokenizer.events.len() - 1; link(&mut tokenizer.events, index); + } else if tokenizer.tokenize_state.space_or_tab_content_type.is_some() { + tokenizer.tokenize_state.space_or_tab_connect = true; } - tokenizer.consume(); - info.size += 1; - State::Fn(Box::new(|t| inside(t, info))) - } - _ => { - if info.options.min == 0 { - State::Ok - } else { - State::Nok - } + inside(tokenizer) } + _ => after(tokenizer), } } @@ -163,24 +144,46 @@ fn start(tokenizer: &mut Tokenizer, mut info: Info) -> State { /// > | a␠␠b /// ^ /// ``` -fn inside(tokenizer: &mut Tokenizer, mut info: Info) -> State { +fn inside(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Some(b'\t' | b' ') if info.size < info.options.max => { + Some(b'\t' | b' ') + if tokenizer.tokenize_state.space_or_tab_size + < tokenizer.tokenize_state.space_or_tab_max => + { tokenizer.consume(); - info.size += 1; - State::Fn(Box::new(|t| inside(t, info))) + tokenizer.tokenize_state.space_or_tab_size += 1; + State::Fn(Box::new(inside)) } _ => { - tokenizer.exit(info.options.kind.clone()); - if info.size >= info.options.min { - State::Ok - } else { - State::Nok - } + tokenizer.exit(tokenizer.tokenize_state.space_or_tab_token.clone()); + after(tokenizer) } } } +/// After `space_or_tab`. 
+/// +/// ```markdown +/// > | a␠␠b +/// ^ +/// ``` +fn after(tokenizer: &mut Tokenizer) -> State { + let state = if tokenizer.tokenize_state.space_or_tab_size + >= tokenizer.tokenize_state.space_or_tab_min + { + State::Ok + } else { + State::Nok + }; + tokenizer.tokenize_state.space_or_tab_connect = false; + tokenizer.tokenize_state.space_or_tab_content_type = None; + tokenizer.tokenize_state.space_or_tab_size = 0; + tokenizer.tokenize_state.space_or_tab_max = 0; + tokenizer.tokenize_state.space_or_tab_min = 0; + tokenizer.tokenize_state.space_or_tab_token = Token::SpaceOrTab; + state +} + /// `space_or_tab_eol`: after optionally first `space_or_tab`. /// /// ```markdown @@ -188,24 +191,49 @@ fn inside(tokenizer: &mut Tokenizer, mut info: Info) -> State { /// ^ /// | b /// ``` -fn after_space_or_tab(tokenizer: &mut Tokenizer, mut info: EolInfo) -> State { - match tokenizer.current { - Some(b'\n') => { - tokenizer.enter_with_content(Token::LineEnding, info.options.content_type.clone()); +fn after_space_or_tab(tokenizer: &mut Tokenizer) -> State { + if tokenizer.tokenize_state.space_or_tab_eol_ok + && tokenizer + .tokenize_state + .space_or_tab_eol_content_type + .is_some() + { + tokenizer.tokenize_state.space_or_tab_eol_connect = true; + } - if info.connect { - let index = tokenizer.events.len() - 1; - link(&mut tokenizer.events, index); - } else if info.options.content_type.is_some() { - info.connect = true; - } + if let Some(b'\n') = tokenizer.current { + tokenizer.enter_with_content( + Token::LineEnding, + tokenizer + .tokenize_state + .space_or_tab_eol_content_type + .clone(), + ); - tokenizer.consume(); - tokenizer.exit(Token::LineEnding); - State::Fn(Box::new(|t| after_eol(t, info))) + if tokenizer.tokenize_state.space_or_tab_eol_connect { + let index = tokenizer.events.len() - 1; + link(&mut tokenizer.events, index); + } else if tokenizer + .tokenize_state + .space_or_tab_eol_content_type + .is_some() + { + tokenizer.tokenize_state.space_or_tab_eol_connect = true; } - _ if info.ok => State::Ok, - _ => State::Nok, + + tokenizer.consume(); + tokenizer.exit(Token::LineEnding); + State::Fn(Box::new(after_eol)) + } else { + let state = if tokenizer.tokenize_state.space_or_tab_eol_ok { + State::Ok + } else { + State::Nok + }; + tokenizer.tokenize_state.space_or_tab_eol_content_type = None; + tokenizer.tokenize_state.space_or_tab_eol_connect = false; + tokenizer.tokenize_state.space_or_tab_eol_ok = false; + state } } @@ -217,14 +245,17 @@ fn after_space_or_tab(tokenizer: &mut Tokenizer, mut info: EolInfo) -> State { /// ^ /// ``` #[allow(clippy::needless_pass_by_value)] -fn after_eol(tokenizer: &mut Tokenizer, info: EolInfo) -> State { +fn after_eol(tokenizer: &mut Tokenizer) -> State { tokenizer.attempt_opt( space_or_tab_with_options(Options { kind: Token::SpaceOrTab, min: 1, max: usize::MAX, - content_type: info.options.content_type, - connect: info.connect, + content_type: tokenizer + .tokenize_state + .space_or_tab_eol_content_type + .clone(), + connect: tokenizer.tokenize_state.space_or_tab_eol_connect, }), after_more_space_or_tab, )(tokenizer) @@ -238,6 +269,10 @@ fn after_eol(tokenizer: &mut Tokenizer, info: EolInfo) -> State { /// ^ /// ``` fn after_more_space_or_tab(tokenizer: &mut Tokenizer) -> State { + tokenizer.tokenize_state.space_or_tab_eol_content_type = None; + tokenizer.tokenize_state.space_or_tab_eol_connect = false; + tokenizer.tokenize_state.space_or_tab_eol_ok = false; + // Blank line not allowed. 
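Because those `space_or_tab_*` fields now outlive a single helper call, every exit path computes its result before clearing them, as `after` and `after_space_or_tab` do above. The ordering constraint in miniature (illustrative types only):

```rust
struct Scratch {
    size: usize,
    min: usize,
}

enum State {
    Ok,
    Nok,
}

/// Mirror of the `after` shape above: decide, then wipe, then return.
fn after(scratch: &mut Scratch) -> State {
    let state = if scratch.size >= scratch.min {
        State::Ok
    } else {
        State::Nok
    };
    // Resetting before the comparison would erase the values it needs.
    scratch.size = 0;
    scratch.min = 0;
    state
}

fn main() {
    let mut scratch = Scratch { size: 2, min: 1 };
    assert!(matches!(after(&mut scratch), State::Ok));
    // The scratch fields are clean for whichever construct runs next.
    assert_eq!(scratch.size, 0);
}
```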
if matches!(tokenizer.current, None | Some(b'\n')) { State::Nok diff --git a/src/construct/partial_title.rs b/src/construct/partial_title.rs index 9cf2f14..6bf9099 100644 --- a/src/construct/partial_title.rs +++ b/src/construct/partial_title.rs @@ -35,50 +35,22 @@ use crate::subtokenize::link; use crate::token::Token; use crate::tokenizer::{ContentType, State, Tokenizer}; -/// Configuration. -/// -/// You must pass the token types in that are used. -#[derive(Debug)] -pub struct Options { - /// Token for the whole title. - pub title: Token, - /// Token for the marker. - pub marker: Token, - /// Token for the string inside the quotes. - pub string: Token, -} - -/// State needed to parse titles. -#[derive(Debug)] -struct Info { - /// Whether we’ve seen data. - connect: bool, - /// Closing marker. - marker: u8, - /// Configuration. - options: Options, -} - /// Before a title. /// /// ```markdown /// > | "a" /// ^ /// ``` -pub fn start(tokenizer: &mut Tokenizer, options: Options) -> State { +pub fn start(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { Some(b'"' | b'\'' | b'(') => { let marker = tokenizer.current.unwrap(); - let info = Info { - connect: false, - marker: if marker == b'(' { b')' } else { marker }, - options, - }; - tokenizer.enter(info.options.title.clone()); - tokenizer.enter(info.options.marker.clone()); + tokenizer.tokenize_state.marker = if marker == b'(' { b')' } else { marker }; + tokenizer.enter(tokenizer.tokenize_state.token_1.clone()); + tokenizer.enter(tokenizer.tokenize_state.token_2.clone()); tokenizer.consume(); - tokenizer.exit(info.options.marker.clone()); - State::Fn(Box::new(|t| begin(t, info))) + tokenizer.exit(tokenizer.tokenize_state.token_2.clone()); + State::Fn(Box::new(begin)) } _ => State::Nok, } @@ -92,18 +64,22 @@ pub fn start(tokenizer: &mut Tokenizer, options: Options) -> State { /// > | "a" /// ^ /// ``` -fn begin(tokenizer: &mut Tokenizer, info: Info) -> State { +fn begin(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Some(b'"' | b'\'' | b')') if tokenizer.current.unwrap() == info.marker => { - tokenizer.enter(info.options.marker.clone()); + Some(b'"' | b'\'' | b')') + if tokenizer.current.unwrap() == tokenizer.tokenize_state.marker => + { + tokenizer.enter(tokenizer.tokenize_state.token_2.clone()); tokenizer.consume(); - tokenizer.exit(info.options.marker.clone()); - tokenizer.exit(info.options.title); + tokenizer.exit(tokenizer.tokenize_state.token_2.clone()); + tokenizer.exit(tokenizer.tokenize_state.token_1.clone()); + tokenizer.tokenize_state.marker = 0; + tokenizer.tokenize_state.connect = false; State::Ok } _ => { - tokenizer.enter(info.options.string.clone()); - at_break(tokenizer, info) + tokenizer.enter(tokenizer.tokenize_state.token_3.clone()); + at_break(tokenizer) } } } @@ -114,58 +90,76 @@ fn begin(tokenizer: &mut Tokenizer, info: Info) -> State { /// > | "a" /// ^ /// ``` -fn at_break(tokenizer: &mut Tokenizer, mut info: Info) -> State { +fn at_break(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - None => State::Nok, - Some(b'\n') => tokenizer.go( + None => { + tokenizer.tokenize_state.marker = 0; + tokenizer.tokenize_state.connect = false; + State::Nok + } + Some(b'\n') => tokenizer.attempt( space_or_tab_eol_with_options(EolOptions { content_type: Some(ContentType::String), - connect: info.connect, + connect: tokenizer.tokenize_state.connect, }), - |t| { - info.connect = true; - at_break(t, info) - }, + |ok| Box::new(if ok { after_eol } else { at_blank_line }), )(tokenizer), - 
Some(b'"' | b'\'' | b')') if tokenizer.current.unwrap() == info.marker => { - tokenizer.exit(info.options.string.clone()); - begin(tokenizer, info) + Some(b'"' | b'\'' | b')') + if tokenizer.current.unwrap() == tokenizer.tokenize_state.marker => + { + tokenizer.exit(tokenizer.tokenize_state.token_3.clone()); + begin(tokenizer) } Some(_) => { tokenizer.enter_with_content(Token::Data, Some(ContentType::String)); - if info.connect { + if tokenizer.tokenize_state.connect { let index = tokenizer.events.len() - 1; link(&mut tokenizer.events, index); } else { - info.connect = true; + tokenizer.tokenize_state.connect = true; } - title(tokenizer, info) + title(tokenizer) } } } +/// To do. +fn after_eol(tokenizer: &mut Tokenizer) -> State { + tokenizer.tokenize_state.connect = true; + at_break(tokenizer) +} + +/// To do. +fn at_blank_line(tokenizer: &mut Tokenizer) -> State { + tokenizer.tokenize_state.marker = 0; + tokenizer.tokenize_state.connect = false; + State::Nok +} + /// In title text. /// /// ```markdown /// > | "a" /// ^ /// ``` -fn title(tokenizer: &mut Tokenizer, info: Info) -> State { +fn title(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { None | Some(b'\n') => { tokenizer.exit(Token::Data); - at_break(tokenizer, info) + at_break(tokenizer) } - Some(b'"' | b'\'' | b')') if tokenizer.current.unwrap() == info.marker => { + Some(b'"' | b'\'' | b')') + if tokenizer.current.unwrap() == tokenizer.tokenize_state.marker => + { tokenizer.exit(Token::Data); - at_break(tokenizer, info) + at_break(tokenizer) } Some(byte) => { let func = if matches!(byte, b'\\') { escape } else { title }; tokenizer.consume(); - State::Fn(Box::new(move |t| func(t, info))) + State::Fn(Box::new(func)) } } } @@ -176,12 +170,12 @@ fn title(tokenizer: &mut Tokenizer, info: Info) -> State { /// > | "a\*b" /// ^ /// ``` -fn escape(tokenizer: &mut Tokenizer, info: Info) -> State { +fn escape(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { Some(b'"' | b'\'' | b')') => { tokenizer.consume(); - State::Fn(Box::new(|t| title(t, info))) + State::Fn(Box::new(title)) } - _ => title(tokenizer, info), + _ => title(tokenizer), } } diff --git a/src/construct/partial_whitespace.rs b/src/construct/partial_whitespace.rs index bf3bd4d..0905e10 100644 --- a/src/construct/partial_whitespace.rs +++ b/src/construct/partial_whitespace.rs @@ -47,17 +47,9 @@ use crate::constant::HARD_BREAK_PREFIX_SIZE_MIN; use crate::token::Token; -use crate::tokenizer::{Event, EventType, Resolver, Tokenizer}; +use crate::tokenizer::{Event, EventType, Tokenizer}; use crate::util::slice::{Position, Slice}; -/// Create a resolver to handle trailing whitespace in events. -/// -/// Performing this as a resolver instead of a tokenizer improves performance -/// *a lot*. -pub fn create_resolve_whitespace(hard_break: bool, trim_whole: bool) -> Box<Resolver> { - Box::new(move |t| resolve_whitespace(t, hard_break, trim_whole)) -} - /// Resolve whitespace. pub fn resolve_whitespace(tokenizer: &mut Tokenizer, hard_break: bool, trim_whole: bool) { let mut index = 0; diff --git a/src/construct/thematic_break.rs b/src/construct/thematic_break.rs index 785d132..2ed2046 100644 --- a/src/construct/thematic_break.rs +++ b/src/construct/thematic_break.rs @@ -53,15 +53,6 @@ use crate::constant::{TAB_SIZE, THEMATIC_BREAK_MARKER_COUNT_MIN}; use crate::token::Token; use crate::tokenizer::{State, Tokenizer}; -/// State needed to parse thematic breaks. -#[derive(Debug)] -struct Info { - /// Marker. - marker: u8, - /// Number of markers. 
- size: usize, -} - /// Start of a thematic break. /// /// ```markdown @@ -95,13 +86,10 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { /// ``` fn before(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Some(b'*' | b'-' | b'_') => at_break( - tokenizer, - Info { - marker: tokenizer.current.unwrap(), - size: 0, - }, - ), + Some(b'*' | b'-' | b'_') => { + tokenizer.tokenize_state.marker = tokenizer.current.unwrap(); + at_break(tokenizer) + } _ => State::Nok, } } @@ -112,19 +100,27 @@ fn before(tokenizer: &mut Tokenizer) -> State { /// > | *** /// ^ /// ``` -fn at_break(tokenizer: &mut Tokenizer, info: Info) -> State { +fn at_break(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - None | Some(b'\n') if info.size >= THEMATIC_BREAK_MARKER_COUNT_MIN => { + None | Some(b'\n') if tokenizer.tokenize_state.size >= THEMATIC_BREAK_MARKER_COUNT_MIN => { + tokenizer.tokenize_state.marker = 0; + tokenizer.tokenize_state.size = 0; tokenizer.exit(Token::ThematicBreak); // Feel free to interrupt. tokenizer.interrupt = false; State::Ok } - Some(b'*' | b'-' | b'_') if tokenizer.current.unwrap() == info.marker => { + Some(b'*' | b'-' | b'_') + if tokenizer.current.unwrap() == tokenizer.tokenize_state.marker => + { tokenizer.enter(Token::ThematicBreakSequence); - sequence(tokenizer, info) + sequence(tokenizer) + } + _ => { + tokenizer.tokenize_state.marker = 0; + tokenizer.tokenize_state.size = 0; + State::Nok } - _ => State::Nok, } } @@ -134,16 +130,18 @@ fn at_break(tokenizer: &mut Tokenizer, info: Info) -> State { /// > | *** /// ^ /// ``` -fn sequence(tokenizer: &mut Tokenizer, mut info: Info) -> State { +fn sequence(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Some(b'*' | b'-' | b'_') if tokenizer.current.unwrap() == info.marker => { + Some(b'*' | b'-' | b'_') + if tokenizer.current.unwrap() == tokenizer.tokenize_state.marker => + { tokenizer.consume(); - info.size += 1; - State::Fn(Box::new(|t| sequence(t, info))) + tokenizer.tokenize_state.size += 1; + State::Fn(Box::new(sequence)) } _ => { tokenizer.exit(Token::ThematicBreakSequence); - tokenizer.attempt_opt(space_or_tab(), |t| at_break(t, info))(tokenizer) + tokenizer.attempt_opt(space_or_tab(), at_break)(tokenizer) } } } diff --git a/src/content/document.rs b/src/content/document.rs index 76d510a..33c8ff9 100644 --- a/src/content/document.rs +++ b/src/content/document.rs @@ -17,9 +17,7 @@ use crate::content::flow::start as flow; use crate::parser::ParseState; use crate::subtokenize::subtokenize; use crate::token::Token; -use crate::tokenizer::{ - Container, ContainerState, Event, EventType, Point, State, StateFn, Tokenizer, -}; +use crate::tokenizer::{Container, ContainerState, Event, EventType, Point, State, Tokenizer}; use crate::util::{ normalize_identifier::normalize_identifier, skip, @@ -57,29 +55,11 @@ enum Phase { Eof, } -/// State needed to parse document. -struct DocumentInfo { - /// Number of containers that have continued. - continued: usize, - /// Index into `tokenizer.events` we need to track. - index: usize, - /// Events of containers added back later. - inject: Vec<(Vec<Event>, Vec<Event>)>, - /// The value of the previous line of flow’s `interrupt`. - interrupt_before: bool, - /// Whether the previous line of flow was a paragraph. - paragraph_before: bool, - /// Current containers. - stack: Vec<ContainerState>, - /// Current flow state function. - next: Box<StateFn>, -} - /// Turn `codes` as the document content type into events. 
pub fn document(parse_state: &mut ParseState, point: Point) -> Vec<Event> { let mut tokenizer = Tokenizer::new(point, parse_state); - let state = tokenizer.push(0, parse_state.bytes.len(), Box::new(before)); + let state = tokenizer.push(0, parse_state.bytes.len(), Box::new(start)); tokenizer.flush(state, true); let mut index = 0; @@ -123,28 +103,8 @@ pub fn document(parse_state: &mut ParseState, point: Point) -> Vec<Event> { /// > | a /// ^ /// ``` -fn before(tokenizer: &mut Tokenizer) -> State { - tokenizer.attempt_opt(bom, start)(tokenizer) -} - -/// Before document. -// -/// ```markdown -/// > | * a -/// ^ -/// | > b -/// ``` fn start(tokenizer: &mut Tokenizer) -> State { - let info = DocumentInfo { - index: 0, - continued: 0, - inject: vec![], - next: Box::new(flow), - paragraph_before: false, - interrupt_before: false, - stack: vec![], - }; - line_start(tokenizer, info) + tokenizer.attempt_opt(bom, line_start)(tokenizer) } /// Start of a line. @@ -155,13 +115,16 @@ fn start(tokenizer: &mut Tokenizer) -> State { /// > | > b /// ^ /// ``` -fn line_start(tokenizer: &mut Tokenizer, mut info: DocumentInfo) -> State { - info.index = tokenizer.events.len(); - info.inject.push((vec![], vec![])); - info.continued = 0; +fn line_start(tokenizer: &mut Tokenizer) -> State { + tokenizer.tokenize_state.document_continued = 0; + tokenizer.tokenize_state.document_index = tokenizer.events.len(); + tokenizer + .tokenize_state + .document_inject + .push((vec![], vec![])); // Containers would only be interrupting if we’ve continued. tokenizer.interrupt = false; - container_existing_before(tokenizer, info) + container_existing_before(tokenizer) } /// Before existing containers. @@ -171,27 +134,32 @@ fn line_start(tokenizer: &mut Tokenizer, mut info: DocumentInfo) -> State { /// > | > b /// ^ /// ``` -fn container_existing_before(tokenizer: &mut Tokenizer, mut info: DocumentInfo) -> State { +fn container_existing_before(tokenizer: &mut Tokenizer) -> State { // If there are more existing containers, check whether the next one continues. - if info.continued < info.stack.len() { - let container = info.stack.remove(info.continued); + if tokenizer.tokenize_state.document_continued + < tokenizer.tokenize_state.document_container_stack.len() + { + let container = tokenizer + .tokenize_state + .document_container_stack + .remove(tokenizer.tokenize_state.document_continued); let cont = match container.kind { Container::BlockQuote => block_quote_cont, Container::ListItem => list_item_const, }; tokenizer.container = Some(container); - tokenizer.attempt(cont, move |ok| { - if ok { - Box::new(|t| container_existing_after(t, info)) + tokenizer.attempt(cont, |ok| { + Box::new(if ok { + container_existing_after } else { - Box::new(|t| container_existing_missing(t, info)) - } + container_existing_missing + }) })(tokenizer) } // Otherwise, check new containers. 
else {
-        container_new_before(tokenizer, info)
+        container_new_before(tokenizer)
     }
 }
 
@@ -202,10 +170,13 @@ fn container_existing_before(tokenizer: &mut Tokenizer, mut info: DocumentInfo)
 /// > | > b
 ///     ^
 /// ```
-fn container_existing_missing(tokenizer: &mut Tokenizer, mut info: DocumentInfo) -> State {
+fn container_existing_missing(tokenizer: &mut Tokenizer) -> State {
     let container = tokenizer.container.take().unwrap();
-    info.stack.insert(info.continued, container);
-    container_new_before(tokenizer, info)
+    tokenizer
+        .tokenize_state
+        .document_container_stack
+        .insert(tokenizer.tokenize_state.document_continued, container);
+    container_new_before(tokenizer)
 }
 
 /// After an existing container.
@@ -215,11 +186,14 @@ fn container_existing_missing(tokenizer: &mut Tokenizer, mut info: DocumentInfo)
 /// > | b
 ///     ^
 /// ```
-fn container_existing_after(tokenizer: &mut Tokenizer, mut info: DocumentInfo) -> State {
+fn container_existing_after(tokenizer: &mut Tokenizer) -> State {
     let container = tokenizer.container.take().unwrap();
-    info.stack.insert(info.continued, container);
-    info.continued += 1;
-    container_existing_before(tokenizer, info)
+    tokenizer
+        .tokenize_state
+        .document_container_stack
+        .insert(tokenizer.tokenize_state.document_continued, container);
+    tokenizer.tokenize_state.document_continued += 1;
+    container_existing_before(tokenizer)
 }
 
 /// Before a new container.
@@ -230,16 +204,18 @@
 /// > | * a
 ///     ^
 /// > | > b
 ///     ^
 /// ```
-fn container_new_before(tokenizer: &mut Tokenizer, info: DocumentInfo) -> State {
+fn container_new_before(tokenizer: &mut Tokenizer) -> State {
     // If we have completely continued, restore the flow’s past `interrupt`
     // status.
-    if info.continued == info.stack.len() {
-        tokenizer.interrupt = info.interrupt_before;
+    if tokenizer.tokenize_state.document_continued
+        == tokenizer.tokenize_state.document_container_stack.len()
+    {
+        tokenizer.interrupt = tokenizer.tokenize_state.document_interrupt_before;
 
         // …and if we’re in a concrete construct, new containers can’t “pierce”
         // into them.
         if tokenizer.concrete {
-            return containers_after(tokenizer, info);
+            return containers_after(tokenizer);
         }
     }
 
@@ -251,41 +227,42 @@
         size: 0,
     });
 
-    tokenizer.attempt(block_quote, move |ok| {
-        if ok {
-            Box::new(|t| container_new_after(t, info))
+    tokenizer.attempt(block_quote, |ok| {
+        Box::new(if ok {
+            container_new_after
         } else {
-            Box::new(|tokenizer| {
-                // List item?
-                tokenizer.container = Some(ContainerState {
-                    kind: Container::ListItem,
-                    blank_initial: false,
-                    size: 0,
-                });
-
-                tokenizer.attempt(list_item, |ok| {
-                    Box::new(move |t| {
-                        if ok {
-                            container_new_after(t, info)
-                        } else {
-                            containers_after(t, info)
-                        }
-                    })
-                })(tokenizer)
-            })
-        }
+            container_new_before_not_blockquote
+        })
+    })(tokenizer)
+}
+
+/// At a new container, where a block quote did not start: try a list item.
+fn container_new_before_not_blockquote(tokenizer: &mut Tokenizer) -> State {
+    // List item?
+    tokenizer.container = Some(ContainerState {
+        kind: Container::ListItem,
+        blank_initial: false,
+        size: 0,
+    });
+
+    tokenizer.attempt(list_item, |ok| {
+        Box::new(if ok {
+            container_new_after
+        } else {
+            containers_after
+        })
+    })(tokenizer)
 }
 
 /// After a new container.
-// +/// /// ```markdown /// > | * a /// ^ /// > | > b /// ^ /// ``` -fn container_new_after(tokenizer: &mut Tokenizer, mut info: DocumentInfo) -> State { +fn container_new_after(tokenizer: &mut Tokenizer) -> State { let container = tokenizer.container.take().unwrap(); // Remove from the event stack. @@ -312,16 +289,21 @@ fn container_new_after(tokenizer: &mut Tokenizer, mut info: DocumentInfo) -> Sta // If we did not continue all existing containers, and there is a new one, // close the flow and those containers. - if info.continued != info.stack.len() { - info = exit_containers(tokenizer, info, &Phase::Prefix); + if tokenizer.tokenize_state.document_continued + != tokenizer.tokenize_state.document_container_stack.len() + { + exit_containers(tokenizer, &Phase::Prefix); } // Try another new container. - info.stack.push(container); - info.continued += 1; - info.interrupt_before = false; + tokenizer + .tokenize_state + .document_container_stack + .push(container); + tokenizer.tokenize_state.document_continued += 1; + tokenizer.tokenize_state.document_interrupt_before = false; tokenizer.interrupt = false; - container_new_before(tokenizer, info) + container_new_before(tokenizer) } /// After containers, before flow. @@ -332,26 +314,36 @@ fn container_new_after(tokenizer: &mut Tokenizer, mut info: DocumentInfo) -> Sta /// > | > b /// ^ /// ``` -fn containers_after(tokenizer: &mut Tokenizer, mut info: DocumentInfo) -> State { +fn containers_after(tokenizer: &mut Tokenizer) -> State { // Store the container events we parsed. - info.inject + tokenizer + .tokenize_state + .document_inject .last_mut() .unwrap() .0 - .append(&mut tokenizer.events.split_off(info.index)); - - tokenizer.lazy = info.continued != info.stack.len(); - tokenizer.interrupt = info.interrupt_before; + .append( + &mut tokenizer + .events + .split_off(tokenizer.tokenize_state.document_index), + ); + + tokenizer.lazy = tokenizer.tokenize_state.document_continued + != tokenizer.tokenize_state.document_container_stack.len(); + tokenizer.interrupt = tokenizer.tokenize_state.document_interrupt_before; tokenizer.define_skip_current(); - let state = info.next; - info.next = Box::new(flow); + let state = tokenizer + .tokenize_state + .document_next + .take() + .unwrap_or_else(|| Box::new(flow)); // Parse flow, pausing after eols. 
tokenizer.go_until( state, |code| matches!(code, Some(b'\n')), - move |state| Box::new(move |t| flow_end(t, info, state)), + |state| Box::new(|t| flow_end(t, state)), )(tokenizer) } @@ -362,7 +354,7 @@ fn containers_after(tokenizer: &mut Tokenizer, mut info: DocumentInfo) -> State /// > | > b /// ^ ^ /// ``` -fn flow_end(tokenizer: &mut Tokenizer, mut info: DocumentInfo, result: State) -> State { +fn flow_end(tokenizer: &mut Tokenizer, result: State) -> State { let paragraph = !tokenizer.events.is_empty() && tokenizer.events[skip::opt_back( &tokenizer.events, @@ -372,53 +364,59 @@ fn flow_end(tokenizer: &mut Tokenizer, mut info: DocumentInfo, result: State) -> .token_type == Token::Paragraph; - if tokenizer.lazy && info.paragraph_before && paragraph { - info.continued = info.stack.len(); + if tokenizer.lazy && paragraph && tokenizer.tokenize_state.document_paragraph_before { + tokenizer.tokenize_state.document_continued = + tokenizer.tokenize_state.document_container_stack.len(); } - if info.continued != info.stack.len() { - info = exit_containers(tokenizer, info, &Phase::After); + if tokenizer.tokenize_state.document_continued + != tokenizer.tokenize_state.document_container_stack.len() + { + exit_containers(tokenizer, &Phase::After); } - info.paragraph_before = paragraph; - info.interrupt_before = tokenizer.interrupt; - match result { State::Ok => { - if !info.stack.is_empty() { - info.continued = 0; - info = exit_containers(tokenizer, info, &Phase::Eof); + if !tokenizer.tokenize_state.document_container_stack.is_empty() { + tokenizer.tokenize_state.document_continued = 0; + exit_containers(tokenizer, &Phase::Eof); } - resolve(tokenizer, &mut info); - result + resolve(tokenizer); + State::Ok } State::Nok => unreachable!("unexpected `nok` from flow"), State::Fn(func) => { - info.next = func; - line_start(tokenizer, info) + tokenizer.tokenize_state.document_paragraph_before = paragraph; + tokenizer.tokenize_state.document_interrupt_before = tokenizer.interrupt; + tokenizer.tokenize_state.document_next = Some(func); + line_start(tokenizer) } } } /// Close containers (and flow if needed). -fn exit_containers( - tokenizer: &mut Tokenizer, - mut info: DocumentInfo, - phase: &Phase, -) -> DocumentInfo { - let mut stack_close = info.stack.split_off(info.continued); +fn exit_containers(tokenizer: &mut Tokenizer, phase: &Phase) { + let mut stack_close = tokenizer + .tokenize_state + .document_container_stack + .split_off(tokenizer.tokenize_state.document_continued); // So, we’re at the end of a line, but we need to close the *previous* line. if *phase != Phase::Eof { tokenizer.define_skip_current(); - let mut current_events = tokenizer.events.split_off(info.index); - let next = info.next; - info.next = Box::new(flow); // This is weird but Rust needs a function there. 
- tokenizer.flush(State::Fn(next), false); + let mut current_events = tokenizer + .events + .split_off(tokenizer.tokenize_state.document_index); + let state = tokenizer + .tokenize_state + .document_next + .take() + .unwrap_or_else(|| Box::new(flow)); + tokenizer.flush(State::Fn(state), false); if *phase == Phase::Prefix { - info.index = tokenizer.events.len(); + tokenizer.tokenize_state.document_index = tokenizer.events.len(); } tokenizer.events.append(&mut current_events); @@ -442,17 +440,18 @@ fn exit_containers( }); } - let index = info.inject.len() - (if *phase == Phase::Eof { 1 } else { 2 }); - info.inject[index].1.append(&mut exits); - info.interrupt_before = false; - - info + let index = + tokenizer.tokenize_state.document_inject.len() - (if *phase == Phase::Eof { 1 } else { 2 }); + tokenizer.tokenize_state.document_inject[index] + .1 + .append(&mut exits); + tokenizer.tokenize_state.document_interrupt_before = false; } // Inject the container events. -fn resolve(tokenizer: &mut Tokenizer, info: &mut DocumentInfo) { +fn resolve(tokenizer: &mut Tokenizer) { let mut index = 0; - let mut inject = info.inject.split_off(0); + let mut inject = tokenizer.tokenize_state.document_inject.split_off(0); inject.reverse(); let mut first_line_ending_in_run = None; diff --git a/src/content/string.rs b/src/content/string.rs index d2aec3f..2e738fb 100644 --- a/src/content/string.rs +++ b/src/content/string.rs @@ -14,7 +14,7 @@ use crate::construct::{ character_escape::start as character_escape, character_reference::start as character_reference, - partial_data::start as data, partial_whitespace::create_resolve_whitespace, + partial_data::start as data, partial_whitespace::resolve_whitespace, }; use crate::tokenizer::{State, Tokenizer}; @@ -22,10 +22,8 @@ const MARKERS: [u8; 2] = [b'&', b'\\']; /// Start of string. pub fn start(tokenizer: &mut Tokenizer) -> State { - tokenizer.register_resolver( - "whitespace".to_string(), - Box::new(create_resolve_whitespace(false, false)), - ); + tokenizer.register_resolver("whitespace".to_string(), Box::new(resolve)); + tokenizer.tokenize_state.stop = &MARKERS; before(tokenizer) } @@ -42,5 +40,10 @@ fn before(tokenizer: &mut Tokenizer) -> State { /// At data. fn before_data(tokenizer: &mut Tokenizer) -> State { - tokenizer.go(|t| data(t, &MARKERS), before)(tokenizer) + tokenizer.go(data, before)(tokenizer) +} + +/// Resolve whitespace. +pub fn resolve(tokenizer: &mut Tokenizer) { + resolve_whitespace(tokenizer, false, false); } diff --git a/src/content/text.rs b/src/content/text.rs index 30c98a3..f4666d1 100644 --- a/src/content/text.rs +++ b/src/content/text.rs @@ -26,7 +26,7 @@ use crate::construct::{ code_text::start as code_text, hard_break_escape::start as hard_break_escape, html_text::start as html_text, label_end::start as label_end, label_start_image::start as label_start_image, label_start_link::start as label_start_link, - partial_data::start as data, partial_whitespace::create_resolve_whitespace, + partial_data::start as data, partial_whitespace::resolve_whitespace, }; use crate::tokenizer::{State, Tokenizer}; @@ -44,13 +44,8 @@ const MARKERS: [u8; 9] = [ /// Start of text. 
pub fn start(tokenizer: &mut Tokenizer) -> State {
-    tokenizer.register_resolver(
-        "whitespace".to_string(),
-        Box::new(create_resolve_whitespace(
-            tokenizer.parse_state.constructs.hard_break_trailing,
-            true,
-        )),
-    );
+    tokenizer.register_resolver("whitespace".to_string(), Box::new(resolve));
+    tokenizer.tokenize_state.stop = &MARKERS;
     before(tokenizer)
 }
 
@@ -82,5 +77,14 @@ pub fn before(tokenizer: &mut Tokenizer) -> State {
 /// |qwe
 /// ```
 fn before_data(tokenizer: &mut Tokenizer) -> State {
-    tokenizer.go(|t| data(t, &MARKERS), before)(tokenizer)
+    tokenizer.go(data, before)(tokenizer)
+}
+
+/// Resolve whitespace.
+pub fn resolve(tokenizer: &mut Tokenizer) {
+    resolve_whitespace(
+        tokenizer,
+        tokenizer.parse_state.constructs.hard_break_trailing,
+        true,
+    );
 }
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 9ab4309..3068ddf 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -121,8 +121,6 @@ pub struct Media {
     pub start: (usize, usize),
     /// Indices of where the media’s label end starts and ends in `events`.
     pub end: (usize, usize),
-    /// Identifier
-    pub id: String,
 }
 
 /// Supported containers.
@@ -163,6 +161,62 @@ struct InternalState {
     point: Point,
 }
 
+/// State shared between constructs, replacing per-construct `Info` structs.
+#[allow(clippy::struct_excessive_bools)]
+pub struct TokenizeState {
+    /// Whether to connect (link) the next event to the previous one.
+    pub connect: bool,
+    /// Stack of open containers (document content).
+    pub document_container_stack: Vec<ContainerState>,
+    /// Number of containers that have continued (document content).
+    pub document_continued: usize,
+    /// Index into `tokenizer.events` we need to track (document content).
+    pub document_index: usize,
+    /// Events of containers added back later (document content).
+    pub document_inject: Vec<(Vec<Event>, Vec<Event>)>,
+    /// The value of the previous line of flow’s `interrupt` (document content).
+    pub document_interrupt_before: bool,
+    /// Whether the previous line of flow was a paragraph (document content).
+    pub document_paragraph_before: bool,
+    /// Current flow state function (document content).
+    pub document_next: Option<Box<StateFn>>,
+    /// Marker of the current construct (such as `*` or `"`).
+    pub marker: u8,
+    /// Secondary marker, for constructs that track two.
+    pub marker_other: u8,
+    /// Size of a prefix (such as indentation).
+    pub prefix: usize,
+    /// State function to return to later.
+    pub return_state: Option<Box<StateFn>>,
+    /// Whether something was already seen.
+    pub seen: bool,
+    /// General size counter (such as the number of markers).
+    pub size: usize,
+    /// Secondary size counter.
+    pub size_other: usize,
+    /// Index of where something starts.
+    pub start: usize,
+    /// Index of where something ends.
+    pub end: usize,
+    /// Bytes at which plain data stops.
+    pub stop: &'static [u8],
+    pub space_or_tab_eol_content_type: Option<ContentType>,
+    pub space_or_tab_eol_connect: bool,
+    pub space_or_tab_eol_ok: bool,
+    pub space_or_tab_connect: bool,
+    pub space_or_tab_content_type: Option<ContentType>,
+    pub space_or_tab_min: usize,
+    pub space_or_tab_max: usize,
+    pub space_or_tab_size: usize,
+    pub space_or_tab_token: Token,
+    /// Tokens, configured by constructs that reuse shared partials.
+    pub token_1: Token,
+    pub token_2: Token,
+    pub token_3: Token,
+    pub token_4: Token,
+    pub token_5: Token,
+}
+
 /// A tokenizer itself.
 #[allow(clippy::struct_excessive_bools)]
 pub struct Tokenizer<'a> {
@@ -179,6 +233,8 @@
     consumed: bool,
     /// Track whether this tokenizer is done.
     resolved: bool,
+    /// Number of nested attempts, used to check that shared state is reset.
+    attempt_balance: usize,
     /// Current byte.
     pub current: Option<u8>,
     /// Previous byte.
@@ -200,6 +256,8 @@
     resolver_ids: Vec<String>,
     /// Shared parsing state across tokenizers.
     pub parse_state: &'a ParseState<'a>,
+    /// State shared between constructs.
+    pub tokenize_state: TokenizeState,
     /// Stack of label (start) that could form images and links.
     ///
     /// Used when tokenizing [text content][crate::content::text].
@@ -241,10 +299,45 @@ impl<'a> Tokenizer<'a> { line_start: point.clone(), consumed: true, resolved: false, + attempt_balance: 0, point, stack: vec![], events: vec![], parse_state, + tokenize_state: TokenizeState { + connect: false, + document_container_stack: vec![], + document_continued: 0, + document_index: 0, + document_inject: vec![], + document_interrupt_before: false, + document_paragraph_before: false, + document_next: None, + marker: 0, + marker_other: 0, + prefix: 0, + seen: false, + size: 0, + size_other: 0, + start: 0, + end: 0, + stop: &[], + return_state: None, + space_or_tab_eol_content_type: None, + space_or_tab_eol_connect: false, + space_or_tab_eol_ok: false, + space_or_tab_connect: false, + space_or_tab_content_type: None, + space_or_tab_min: 0, + space_or_tab_max: 0, + space_or_tab_size: 0, + space_or_tab_token: Token::SpaceOrTab, + token_1: Token::Data, + token_2: Token::Data, + token_3: Token::Data, + token_4: Token::Data, + token_5: Token::Data, + }, map: EditMap::new(), label_start_stack: vec![], label_start_list_loose: vec![], @@ -494,11 +587,14 @@ impl<'a> Tokenizer<'a> { state_fn: impl FnOnce(&mut Tokenizer) -> State + 'static, after: impl FnOnce(&mut Tokenizer) -> State + 'static, ) -> Box<StateFn> { + self.attempt_balance += 1; attempt_impl( state_fn, None, self.point.index, |tokenizer: &mut Tokenizer, state| { + tokenizer.attempt_balance -= 1; + if matches!(state, State::Ok) { tokenizer.consumed = true; State::Fn(Box::new(after)) @@ -522,11 +618,13 @@ impl<'a> Tokenizer<'a> { until: impl Fn(Option<u8>) -> bool + 'static, done: impl FnOnce(State) -> Box<StateFn> + 'static, ) -> Box<StateFn> { + self.attempt_balance += 1; attempt_impl( state_fn, Some(Box::new(until)), self.point.index, |tokenizer: &mut Tokenizer, state| { + tokenizer.attempt_balance -= 1; tokenizer.consumed = true; // We don’t capture/free state because it is assumed that // `go_until` itself is wrapped in another attempt that does @@ -550,6 +648,7 @@ impl<'a> Tokenizer<'a> { state_fn: impl FnOnce(&mut Tokenizer) -> State + 'static, done: impl FnOnce(bool) -> Box<StateFn> + 'static, ) -> Box<StateFn> { + self.attempt_balance += 1; let previous = self.capture(); attempt_impl( @@ -557,6 +656,7 @@ impl<'a> Tokenizer<'a> { None, self.point.index, |tokenizer: &mut Tokenizer, state| { + tokenizer.attempt_balance -= 1; tokenizer.free(previous); tokenizer.consumed = true; State::Fn(done(matches!(state, State::Ok))) @@ -580,6 +680,7 @@ impl<'a> Tokenizer<'a> { state_fn: impl FnOnce(&mut Tokenizer) -> State + 'static, done: impl FnOnce(bool) -> Box<StateFn> + 'static, ) -> Box<StateFn> { + self.attempt_balance += 1; let previous = self.capture(); attempt_impl( @@ -587,6 +688,7 @@ impl<'a> Tokenizer<'a> { None, self.point.index, |tokenizer: &mut Tokenizer, state| { + tokenizer.attempt_balance -= 1; let ok = matches!(state, State::Ok); if !ok { @@ -782,7 +884,47 @@ fn attempt_impl( let state = state(tokenizer); match state { - State::Ok | State::Nok => done(tokenizer, state), + State::Ok | State::Nok => { + if tokenizer.attempt_balance == 0 { + debug_assert!(!tokenizer.tokenize_state.connect); + debug_assert_eq!(tokenizer.tokenize_state.document_continued, 0); + debug_assert_eq!(tokenizer.tokenize_state.document_index, 0); + debug_assert!(!tokenizer.tokenize_state.document_interrupt_before); + debug_assert!(!tokenizer.tokenize_state.document_paragraph_before); + debug_assert_eq!(tokenizer.tokenize_state.marker, 0); + debug_assert_eq!(tokenizer.tokenize_state.marker_other, 0); + 
debug_assert_eq!(tokenizer.tokenize_state.prefix, 0);
+                    debug_assert!(!tokenizer.tokenize_state.seen);
+                    debug_assert_eq!(tokenizer.tokenize_state.size, 0);
+                    debug_assert_eq!(tokenizer.tokenize_state.size_other, 0);
+                    debug_assert_eq!(tokenizer.tokenize_state.stop.len(), 0);
+                    debug_assert_eq!(tokenizer.tokenize_state.start, 0);
+                    debug_assert_eq!(tokenizer.tokenize_state.end, 0);
+                    debug_assert!(tokenizer.tokenize_state.return_state.is_none());
+                    debug_assert!(!tokenizer.tokenize_state.space_or_tab_eol_connect);
+                    debug_assert!(!tokenizer.tokenize_state.space_or_tab_eol_ok);
+                    debug_assert!(tokenizer
+                        .tokenize_state
+                        .space_or_tab_eol_content_type
+                        .is_none());
+                    debug_assert!(!tokenizer.tokenize_state.space_or_tab_connect);
+                    debug_assert!(tokenizer.tokenize_state.space_or_tab_content_type.is_none());
+                    debug_assert_eq!(tokenizer.tokenize_state.space_or_tab_min, 0);
+                    debug_assert_eq!(tokenizer.tokenize_state.space_or_tab_max, 0);
+                    debug_assert_eq!(tokenizer.tokenize_state.space_or_tab_size, 0);
+                    debug_assert_eq!(
+                        tokenizer.tokenize_state.space_or_tab_token,
+                        Token::SpaceOrTab
+                    );
+                    debug_assert_eq!(tokenizer.tokenize_state.token_1, Token::Data);
+                    debug_assert_eq!(tokenizer.tokenize_state.token_2, Token::Data);
+                    debug_assert_eq!(tokenizer.tokenize_state.token_3, Token::Data);
+                    debug_assert_eq!(tokenizer.tokenize_state.token_4, Token::Data);
+                    debug_assert_eq!(tokenizer.tokenize_state.token_5, Token::Data);
+                }
+
+                done(tokenizer, state)
+            }
             State::Fn(func) => State::Fn(attempt_impl(func, pause, start, done)),
         }
     })
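The shape repeated across the hunks above: per-construct `Info` structs captured in `move` closures become fields on the shared `tokenize_state`, so each continuation can be a plain `fn` pointer (`State::Fn(Box::new(sequence))`) instead of a fresh closure allocation per consumed byte, at the cost of having to reset those fields on every exit path. A minimal sketch of that trade, modeled on the thematic-break hunk, using hypothetical `MiniTokenizer`/`MiniState` types rather than the crate's real `Tokenizer` and `State`:

// Sketch only: shared scratch fields instead of closure captures.
struct MiniTokenizer {
    bytes: Vec<u8>,
    index: usize,
    current: Option<u8>,
    // Scratch fields; by convention reset to `0` whenever a construct settles.
    marker: u8,
    size: usize,
}

#[derive(Clone, Copy)]
enum MiniState {
    Ok,
    Nok,
    // A plain `fn` pointer: no allocation, nothing captured.
    Fn(fn(&mut MiniTokenizer) -> MiniState),
}

const MARKER_COUNT_MIN: usize = 3;

fn before(t: &mut MiniTokenizer) -> MiniState {
    match t.current {
        Some(b'*' | b'-' | b'_') => {
            // Stash the marker on the tokenizer instead of in a closure.
            t.marker = t.current.unwrap();
            at_break(t)
        }
        _ => MiniState::Nok,
    }
}

fn at_break(t: &mut MiniTokenizer) -> MiniState {
    match t.current {
        None | Some(b'\n') if t.size >= MARKER_COUNT_MIN => {
            // Reset shared state so the next construct starts clean.
            t.marker = 0;
            t.size = 0;
            MiniState::Ok
        }
        Some(byte) if byte == t.marker => {
            t.size += 1;
            t.index += 1;
            t.current = t.bytes.get(t.index).copied();
            MiniState::Fn(at_break)
        }
        _ => {
            // The `Nok` path must reset too, or state leaks.
            t.marker = 0;
            t.size = 0;
            MiniState::Nok
        }
    }
}

fn main() {
    let bytes = b"***".to_vec();
    let mut t = MiniTokenizer { current: bytes.first().copied(), bytes, index: 0, marker: 0, size: 0 };
    let mut state = before(&mut t);
    while let MiniState::Fn(func) = state {
        state = func(&mut t);
    }
    assert!(matches!(state, MiniState::Ok));
}

The resets in both the `Ok` and `Nok` arms are the price of sharing: forget one and the next construct inherits a stale `marker`.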
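The same trade applies to the data partial: `string` and `text` used to smuggle their marker lists into `data` through a closure (`tokenizer.go(|t| data(t, &MARKERS), before)`); storing the list in `tokenize_state.stop` lets them hand `data` to `go` directly. A sketch of that hand-off, with a hypothetical `Scanner` standing in for the tokenizer:

// Sketch only: the stop list lives on the scanner, not in a closure.
const MARKERS: &[u8] = &[b'&', b'\\'];

struct Scanner<'a> {
    bytes: &'a [u8],
    index: usize,
    /// Bytes at which plain data must stop; set once by the content type.
    stop: &'static [u8],
}

impl<'a> Scanner<'a> {
    /// Consume plain data until a stop byte; return how much was consumed.
    fn data(&mut self) -> usize {
        let start = self.index;
        while let Some(&byte) = self.bytes.get(self.index) {
            if self.stop.contains(&byte) {
                break;
            }
            self.index += 1;
        }
        self.index - start
    }
}

fn main() {
    let mut scanner = Scanner { bytes: b"a&b", index: 0, stop: MARKERS };
    // Stops before the `&`, which another construct will handle.
    assert_eq!(scanner.data(), 1);
    assert_eq!(scanner.index, 1);
}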
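The `attempt_balance` counter and the `debug_assert`s above are what keep the shared fields honest: every nested attempt increments the balance on entry and decrements it in its done callback, and only once the outermost attempt settles at balance zero must every scratch field be back at its default. A hedged sketch of that invariant, with hypothetical `Checker`/`Scratch` types:

// Sketch only: verify shared scratch state is reset when nesting unwinds.
#[derive(Default)]
struct Scratch {
    marker: u8,
    size: usize,
    connect: bool,
}

struct Checker {
    attempt_balance: usize,
    scratch: Scratch,
}

impl Checker {
    fn enter_attempt(&mut self) {
        self.attempt_balance += 1;
    }

    fn exit_attempt(&mut self, settled: bool) {
        self.attempt_balance -= 1;
        // Only when *all* nested attempts have unwound must the scratch
        // space be back at its defaults.
        if settled && self.attempt_balance == 0 {
            debug_assert_eq!(self.scratch.marker, 0);
            debug_assert_eq!(self.scratch.size, 0);
            debug_assert!(!self.scratch.connect);
        }
    }
}

fn main() {
    let mut checker = Checker { attempt_balance: 0, scratch: Scratch::default() };
    checker.enter_attempt();
    checker.scratch.marker = b'*'; // a construct stashes its marker…
    assert_eq!(checker.scratch.marker, b'*');
    checker.scratch.marker = 0; // …and must reset it before settling.
    checker.exit_attempt(true);
}

In debug builds, a construct that forgets to zero its `marker` before returning trips the assertion at the next settle; release builds pay nothing.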