From 2d35cbfceace81a217cd0fbdae7a8777c7a6465e Mon Sep 17 00:00:00 2001 From: Titus Wormer Date: Thu, 11 Aug 2022 13:31:20 +0200 Subject: Refactor internal docs, code style of tokenizer --- src/construct/code_fenced.rs | 22 +- src/construct/code_text.rs | 8 +- src/construct/definition.rs | 13 +- src/construct/html_flow.rs | 10 +- src/construct/html_text.rs | 86 +++--- src/construct/label_end.rs | 36 ++- src/construct/label_start_image.rs | 2 +- src/construct/label_start_link.rs | 2 +- src/construct/partial_data.rs | 8 +- src/construct/partial_destination.rs | 2 +- src/construct/partial_label.rs | 17 +- src/construct/partial_title.rs | 17 +- src/content/document.rs | 42 +-- src/content/string.rs | 2 +- src/content/text.rs | 10 +- src/subtokenize.rs | 6 +- src/tokenizer.rs | 518 +++++++++++++++++------------------ 17 files changed, 420 insertions(+), 381 deletions(-) diff --git a/src/construct/code_fenced.rs b/src/construct/code_fenced.rs index 0d4345a..26e1148 100644 --- a/src/construct/code_fenced.rs +++ b/src/construct/code_fenced.rs @@ -162,7 +162,7 @@ pub fn before_sequence_open(tokenizer: &mut Tokenizer) -> State { if let Some(b'`' | b'~') = tokenizer.current { tokenizer.tokenize_state.marker = tokenizer.current.unwrap(); - tokenizer.tokenize_state.prefix = prefix; + tokenizer.tokenize_state.size_c = prefix; tokenizer.enter(Token::CodeFencedFenceSequence); State::Retry(StateName::CodeFencedSequenceOpen) } else { @@ -196,7 +196,7 @@ pub fn sequence_open(tokenizer: &mut Tokenizer) -> State { } _ => { tokenizer.tokenize_state.marker = 0; - tokenizer.tokenize_state.prefix = 0; + tokenizer.tokenize_state.size_c = 0; tokenizer.tokenize_state.size = 0; State::Nok } @@ -259,7 +259,7 @@ pub fn info(tokenizer: &mut Tokenizer) -> State { Some(b'`') if tokenizer.tokenize_state.marker == b'`' => { tokenizer.concrete = false; tokenizer.tokenize_state.marker = 0; - tokenizer.tokenize_state.prefix = 0; + tokenizer.tokenize_state.size_c = 0; tokenizer.tokenize_state.size = 0; State::Nok } @@ -307,7 +307,7 @@ pub fn meta(tokenizer: &mut Tokenizer) -> State { Some(b'`') if tokenizer.tokenize_state.marker == b'`' => { tokenizer.concrete = false; tokenizer.tokenize_state.marker = 0; - tokenizer.tokenize_state.prefix = 0; + tokenizer.tokenize_state.size_c = 0; tokenizer.tokenize_state.size = 0; State::Nok } @@ -410,14 +410,14 @@ pub fn before_sequence_close(tokenizer: &mut Tokenizer) -> State { pub fn sequence_close(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { Some(b'`' | b'~') if tokenizer.current.unwrap() == tokenizer.tokenize_state.marker => { - tokenizer.tokenize_state.size_other += 1; + tokenizer.tokenize_state.size_b += 1; tokenizer.consume(); State::Next(StateName::CodeFencedSequenceClose) } - _ if tokenizer.tokenize_state.size_other >= CODE_FENCED_SEQUENCE_SIZE_MIN - && tokenizer.tokenize_state.size_other >= tokenizer.tokenize_state.size => + _ if tokenizer.tokenize_state.size_b >= CODE_FENCED_SEQUENCE_SIZE_MIN + && tokenizer.tokenize_state.size_b >= tokenizer.tokenize_state.size => { - tokenizer.tokenize_state.size_other = 0; + tokenizer.tokenize_state.size_b = 0; tokenizer.exit(Token::CodeFencedFenceSequence); let name = space_or_tab(tokenizer); tokenizer.attempt( @@ -427,7 +427,7 @@ pub fn sequence_close(tokenizer: &mut Tokenizer) -> State { ) } _ => { - tokenizer.tokenize_state.size_other = 0; + tokenizer.tokenize_state.size_b = 0; State::Nok } } @@ -474,7 +474,7 @@ pub fn content_before(tokenizer: &mut Tokenizer) -> State { /// | ~~~ /// ``` pub fn content_start(tokenizer: 
&mut Tokenizer) -> State { - let name = space_or_tab_min_max(tokenizer, 0, tokenizer.tokenize_state.prefix); + let name = space_or_tab_min_max(tokenizer, 0, tokenizer.tokenize_state.size_c); tokenizer.attempt( name, State::Next(StateName::CodeFencedBeforeContentChunk), @@ -536,7 +536,7 @@ pub fn content_chunk(tokenizer: &mut Tokenizer) -> State { pub fn after(tokenizer: &mut Tokenizer) -> State { tokenizer.exit(Token::CodeFenced); tokenizer.tokenize_state.marker = 0; - tokenizer.tokenize_state.prefix = 0; + tokenizer.tokenize_state.size_c = 0; tokenizer.tokenize_state.size = 0; // Feel free to interrupt. tokenizer.interrupt = false; diff --git a/src/construct/code_text.rs b/src/construct/code_text.rs index 2c8faf3..d7ada3d 100644 --- a/src/construct/code_text.rs +++ b/src/construct/code_text.rs @@ -185,16 +185,16 @@ pub fn data(tokenizer: &mut Tokenizer) -> State { pub fn sequence_close(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { Some(b'`') => { - tokenizer.tokenize_state.size_other += 1; + tokenizer.tokenize_state.size_b += 1; tokenizer.consume(); State::Next(StateName::CodeTextSequenceClose) } _ => { - if tokenizer.tokenize_state.size == tokenizer.tokenize_state.size_other { + if tokenizer.tokenize_state.size == tokenizer.tokenize_state.size_b { tokenizer.exit(Token::CodeTextSequence); tokenizer.exit(Token::CodeText); tokenizer.tokenize_state.size = 0; - tokenizer.tokenize_state.size_other = 0; + tokenizer.tokenize_state.size_b = 0; State::Ok } else { let index = tokenizer.events.len(); @@ -202,7 +202,7 @@ pub fn sequence_close(tokenizer: &mut Tokenizer) -> State { // More or less accents: mark as data. tokenizer.events[index - 1].token_type = Token::CodeTextData; tokenizer.events[index].token_type = Token::CodeTextData; - tokenizer.tokenize_state.size_other = 0; + tokenizer.tokenize_state.size_b = 0; State::Retry(StateName::CodeTextBetween) } } diff --git a/src/construct/definition.rs b/src/construct/definition.rs index 62d0f3b..5db611b 100644 --- a/src/construct/definition.rs +++ b/src/construct/definition.rs @@ -174,7 +174,12 @@ pub fn label_after(tokenizer: &mut Tokenizer) -> State { } } -/// To do. +/// After the marker. 
+/// +/// ```markdown +/// > | [a]: b "c" +/// ^ +/// ``` pub fn marker_after(tokenizer: &mut Tokenizer) -> State { let name = space_or_tab_eol(tokenizer); tokenizer.attempt( @@ -196,7 +201,7 @@ pub fn destination_before(tokenizer: &mut Tokenizer) -> State { tokenizer.tokenize_state.token_3 = Token::DefinitionDestinationLiteralMarker; tokenizer.tokenize_state.token_4 = Token::DefinitionDestinationRaw; tokenizer.tokenize_state.token_5 = Token::DefinitionDestinationString; - tokenizer.tokenize_state.size_other = usize::MAX; + tokenizer.tokenize_state.size_b = usize::MAX; tokenizer.attempt( StateName::DestinationStart, State::Next(StateName::DefinitionDestinationAfter), @@ -216,7 +221,7 @@ pub fn destination_after(tokenizer: &mut Tokenizer) -> State { tokenizer.tokenize_state.token_3 = Token::Data; tokenizer.tokenize_state.token_4 = Token::Data; tokenizer.tokenize_state.token_5 = Token::Data; - tokenizer.tokenize_state.size_other = 0; + tokenizer.tokenize_state.size_b = 0; tokenizer.attempt( StateName::DefinitionTitleBefore, State::Next(StateName::DefinitionAfter), @@ -231,7 +236,7 @@ pub fn destination_missing(tokenizer: &mut Tokenizer) -> State { tokenizer.tokenize_state.token_3 = Token::Data; tokenizer.tokenize_state.token_4 = Token::Data; tokenizer.tokenize_state.token_5 = Token::Data; - tokenizer.tokenize_state.size_other = 0; + tokenizer.tokenize_state.size_b = 0; State::Nok } diff --git a/src/construct/html_flow.rs b/src/construct/html_flow.rs index b49b231..7a346e9 100644 --- a/src/construct/html_flow.rs +++ b/src/construct/html_flow.rs @@ -508,7 +508,7 @@ pub fn complete_attribute_value_before(tokenizer: &mut Tokenizer) -> State { State::Next(StateName::HtmlFlowCompleteAttributeValueBefore) } Some(b'"' | b'\'') => { - tokenizer.tokenize_state.marker_other = tokenizer.current.unwrap(); + tokenizer.tokenize_state.marker_b = tokenizer.current.unwrap(); tokenizer.consume(); State::Next(StateName::HtmlFlowCompleteAttributeValueQuoted) } @@ -528,13 +528,11 @@ pub fn complete_attribute_value_quoted(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { None | Some(b'\n') => { tokenizer.tokenize_state.marker = 0; - tokenizer.tokenize_state.marker_other = 0; + tokenizer.tokenize_state.marker_b = 0; State::Nok } - Some(b'"' | b'\'') - if tokenizer.current.unwrap() == tokenizer.tokenize_state.marker_other => - { - tokenizer.tokenize_state.marker_other = 0; + Some(b'"' | b'\'') if tokenizer.current.unwrap() == tokenizer.tokenize_state.marker_b => { + tokenizer.tokenize_state.marker_b = 0; tokenizer.consume(); State::Next(StateName::HtmlFlowCompleteAttributeValueQuotedAfter) } diff --git a/src/construct/html_text.rs b/src/construct/html_text.rs index df6bd99..7474dbf 100644 --- a/src/construct/html_text.rs +++ b/src/construct/html_text.rs @@ -207,10 +207,11 @@ pub fn comment_start_dash(tokenizer: &mut Tokenizer) -> State { pub fn comment(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { None => State::Nok, - Some(b'\n') => { - tokenizer.tokenize_state.return_state = Some(StateName::HtmlTextComment); - State::Retry(StateName::HtmlTextLineEndingBefore) - } + Some(b'\n') => tokenizer.attempt( + StateName::HtmlTextLineEndingBefore, + State::Next(StateName::HtmlTextComment), + State::Nok, + ), Some(b'-') => { tokenizer.consume(); State::Next(StateName::HtmlTextCommentClose) @@ -269,10 +270,11 @@ pub fn cdata_open_inside(tokenizer: &mut Tokenizer) -> State { pub fn cdata(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { None => State::Nok, - Some(b'\n') => { - 
tokenizer.tokenize_state.return_state = Some(StateName::HtmlTextCdata); - State::Retry(StateName::HtmlTextLineEndingBefore) - } + Some(b'\n') => tokenizer.attempt( + StateName::HtmlTextLineEndingBefore, + State::Next(StateName::HtmlTextCdata), + State::Nok, + ), Some(b']') => { tokenizer.consume(); State::Next(StateName::HtmlTextCdataClose) @@ -323,10 +325,11 @@ pub fn cdata_end(tokenizer: &mut Tokenizer) -> State { pub fn declaration(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { None | Some(b'>') => State::Retry(StateName::HtmlTextEnd), - Some(b'\n') => { - tokenizer.tokenize_state.return_state = Some(StateName::HtmlTextDeclaration); - State::Retry(StateName::HtmlTextLineEndingBefore) - } + Some(b'\n') => tokenizer.attempt( + StateName::HtmlTextLineEndingBefore, + State::Next(StateName::HtmlTextDeclaration), + State::Nok, + ), _ => { tokenizer.consume(); State::Next(StateName::HtmlTextDeclaration) @@ -343,10 +346,11 @@ pub fn declaration(tokenizer: &mut Tokenizer) -> State { pub fn instruction(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { None => State::Nok, - Some(b'\n') => { - tokenizer.tokenize_state.return_state = Some(StateName::HtmlTextInstruction); - State::Retry(StateName::HtmlTextLineEndingBefore) - } + Some(b'\n') => tokenizer.attempt( + StateName::HtmlTextLineEndingBefore, + State::Next(StateName::HtmlTextInstruction), + State::Nok, + ), Some(b'?') => { tokenizer.consume(); State::Next(StateName::HtmlTextInstructionClose) @@ -413,10 +417,11 @@ pub fn tag_close(tokenizer: &mut Tokenizer) -> State { /// ``` pub fn tag_close_between(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Some(b'\n') => { - tokenizer.tokenize_state.return_state = Some(StateName::HtmlTextTagCloseBetween); - State::Retry(StateName::HtmlTextLineEndingBefore) - } + Some(b'\n') => tokenizer.attempt( + StateName::HtmlTextLineEndingBefore, + State::Next(StateName::HtmlTextTagCloseBetween), + State::Nok, + ), Some(b'\t' | b' ') => { tokenizer.consume(); State::Next(StateName::HtmlTextTagCloseBetween) @@ -451,10 +456,11 @@ pub fn tag_open(tokenizer: &mut Tokenizer) -> State { /// ``` pub fn tag_open_between(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Some(b'\n') => { - tokenizer.tokenize_state.return_state = Some(StateName::HtmlTextTagOpenBetween); - State::Retry(StateName::HtmlTextLineEndingBefore) - } + Some(b'\n') => tokenizer.attempt( + StateName::HtmlTextLineEndingBefore, + State::Next(StateName::HtmlTextTagOpenBetween), + State::Nok, + ), Some(b'\t' | b' ') => { tokenizer.consume(); State::Next(StateName::HtmlTextTagOpenBetween) @@ -498,11 +504,11 @@ pub fn tag_open_attribute_name(tokenizer: &mut Tokenizer) -> State { /// ``` pub fn tag_open_attribute_name_after(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Some(b'\n') => { - tokenizer.tokenize_state.return_state = - Some(StateName::HtmlTextTagOpenAttributeNameAfter); - State::Retry(StateName::HtmlTextLineEndingBefore) - } + Some(b'\n') => tokenizer.attempt( + StateName::HtmlTextLineEndingBefore, + State::Next(StateName::HtmlTextTagOpenAttributeNameAfter), + State::Nok, + ), Some(b'\t' | b' ') => { tokenizer.consume(); State::Next(StateName::HtmlTextTagOpenAttributeNameAfter) @@ -525,11 +531,11 @@ pub fn tag_open_attribute_name_after(tokenizer: &mut Tokenizer) -> State { pub fn tag_open_attribute_value_before(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { None | Some(b'<' | b'=' | b'>' | b'`') => State::Nok, - Some(b'\n') => { - 
tokenizer.tokenize_state.return_state = - Some(StateName::HtmlTextTagOpenAttributeValueBefore); - State::Retry(StateName::HtmlTextLineEndingBefore) - } + Some(b'\n') => tokenizer.attempt( + StateName::HtmlTextLineEndingBefore, + State::Next(StateName::HtmlTextTagOpenAttributeValueBefore), + State::Nok, + ), Some(b'\t' | b' ') => { tokenizer.consume(); State::Next(StateName::HtmlTextTagOpenAttributeValueBefore) @@ -558,11 +564,11 @@ pub fn tag_open_attribute_value_quoted(tokenizer: &mut Tokenizer) -> State { tokenizer.tokenize_state.marker = 0; State::Nok } - Some(b'\n') => { - tokenizer.tokenize_state.return_state = - Some(StateName::HtmlTextTagOpenAttributeValueQuoted); - State::Retry(StateName::HtmlTextLineEndingBefore) - } + Some(b'\n') => tokenizer.attempt( + StateName::HtmlTextLineEndingBefore, + State::Next(StateName::HtmlTextTagOpenAttributeValueQuoted), + State::Nok, + ), Some(b'"' | b'\'') if tokenizer.current.unwrap() == tokenizer.tokenize_state.marker => { tokenizer.tokenize_state.marker = 0; tokenizer.consume(); @@ -678,5 +684,5 @@ pub fn line_ending_after(tokenizer: &mut Tokenizer) -> State { /// ``` pub fn line_ending_after_prefix(tokenizer: &mut Tokenizer) -> State { tokenizer.enter(Token::HtmlTextData); - State::Retry(tokenizer.tokenize_state.return_state.take().unwrap()) + State::Ok } diff --git a/src/construct/label_end.rs b/src/construct/label_end.rs index 3337cec..a25f917 100644 --- a/src/construct/label_end.rs +++ b/src/construct/label_end.rs @@ -170,12 +170,12 @@ use crate::util::{ pub fn start(tokenizer: &mut Tokenizer) -> State { if Some(b']') == tokenizer.current && tokenizer.parse_state.constructs.label_end { let mut label_start_index = None; - let mut index = tokenizer.label_start_stack.len(); + let mut index = tokenizer.tokenize_state.label_start_stack.len(); while index > 0 { index -= 1; - if !tokenizer.label_start_stack[index].balanced { + if !tokenizer.tokenize_state.label_start_stack[index].balanced { label_start_index = Some(index); break; } @@ -184,6 +184,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { // If there is an okay opening: if let Some(label_start_index) = label_start_index { let label_start = tokenizer + .tokenize_state .label_start_stack .get_mut(label_start_index) .unwrap(); @@ -221,7 +222,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { /// ^ /// ``` pub fn after(tokenizer: &mut Tokenizer) -> State { - let start = &tokenizer.label_start_stack[tokenizer.tokenize_state.start]; + let start = &tokenizer.tokenize_state.label_start_stack[tokenizer.tokenize_state.start]; let defined = tokenizer .parse_state .definitions @@ -298,17 +299,23 @@ pub fn reference_not_full(tokenizer: &mut Tokenizer) -> State { pub fn ok(tokenizer: &mut Tokenizer) -> State { let label_start_index = tokenizer.tokenize_state.start; // Remove this one and everything after it. - let mut left = tokenizer.label_start_stack.split_off(label_start_index); + let mut left = tokenizer + .tokenize_state + .label_start_stack + .split_off(label_start_index); // Remove this one from `left`, as we’ll move it to `media_list`. 
let label_start = left.remove(0); - tokenizer.label_start_list_loose.append(&mut left); + tokenizer + .tokenize_state + .label_start_list_loose + .append(&mut left); let is_link = tokenizer.events[label_start.start.0].token_type == Token::LabelLink; if is_link { let mut index = 0; - while index < tokenizer.label_start_stack.len() { - let label_start = &mut tokenizer.label_start_stack[index]; + while index < tokenizer.tokenize_state.label_start_stack.len() { + let label_start = &mut tokenizer.tokenize_state.label_start_stack[index]; if tokenizer.events[label_start.start.0].token_type == Token::LabelLink { label_start.inactive = true; } @@ -316,7 +323,7 @@ pub fn ok(tokenizer: &mut Tokenizer) -> State { } } - tokenizer.media_list.push(Media { + tokenizer.tokenize_state.media_list.push(Media { start: label_start.start, end: (tokenizer.tokenize_state.end, tokenizer.events.len() - 1), }); @@ -340,6 +347,7 @@ pub fn ok(tokenizer: &mut Tokenizer) -> State { /// ``` pub fn nok(tokenizer: &mut Tokenizer) -> State { tokenizer + .tokenize_state .label_start_stack .get_mut(tokenizer.tokenize_state.start) .unwrap() @@ -398,7 +406,7 @@ pub fn resource_open(tokenizer: &mut Tokenizer) -> State { tokenizer.tokenize_state.token_3 = Token::ResourceDestinationLiteralMarker; tokenizer.tokenize_state.token_4 = Token::ResourceDestinationRaw; tokenizer.tokenize_state.token_5 = Token::ResourceDestinationString; - tokenizer.tokenize_state.size_other = RESOURCE_DESTINATION_BALANCE_MAX; + tokenizer.tokenize_state.size_b = RESOURCE_DESTINATION_BALANCE_MAX; tokenizer.attempt( StateName::DestinationStart, @@ -420,7 +428,7 @@ pub fn resource_destination_after(tokenizer: &mut Tokenizer) -> State { tokenizer.tokenize_state.token_3 = Token::Data; tokenizer.tokenize_state.token_4 = Token::Data; tokenizer.tokenize_state.token_5 = Token::Data; - tokenizer.tokenize_state.size_other = 0; + tokenizer.tokenize_state.size_b = 0; let name = space_or_tab_eol(tokenizer); tokenizer.attempt( name, @@ -436,7 +444,7 @@ pub fn resource_destination_missing(tokenizer: &mut Tokenizer) -> State { tokenizer.tokenize_state.token_3 = Token::Data; tokenizer.tokenize_state.token_4 = Token::Data; tokenizer.tokenize_state.token_5 = Token::Data; - tokenizer.tokenize_state.size_other = 0; + tokenizer.tokenize_state.size_b = 0; State::Nok } @@ -605,9 +613,9 @@ pub fn reference_collapsed_open(tokenizer: &mut Tokenizer) -> State { /// images, or turns them back into data. 
#[allow(clippy::too_many_lines)] pub fn resolve_media(tokenizer: &mut Tokenizer) { - let mut left = tokenizer.label_start_list_loose.split_off(0); - let mut left_2 = tokenizer.label_start_stack.split_off(0); - let media = tokenizer.media_list.split_off(0); + let mut left = tokenizer.tokenize_state.label_start_list_loose.split_off(0); + let mut left_2 = tokenizer.tokenize_state.label_start_stack.split_off(0); + let media = tokenizer.tokenize_state.media_list.split_off(0); left.append(&mut left_2); let events = &tokenizer.events; diff --git a/src/construct/label_start_image.rs b/src/construct/label_start_image.rs index 1730fc3..629e836 100644 --- a/src/construct/label_start_image.rs +++ b/src/construct/label_start_image.rs @@ -64,7 +64,7 @@ pub fn open(tokenizer: &mut Tokenizer) -> State { tokenizer.consume(); tokenizer.exit(Token::LabelMarker); tokenizer.exit(Token::LabelImage); - tokenizer.label_start_stack.push(LabelStart { + tokenizer.tokenize_state.label_start_stack.push(LabelStart { start: (tokenizer.events.len() - 6, tokenizer.events.len() - 1), balanced: false, inactive: false, diff --git a/src/construct/label_start_link.rs b/src/construct/label_start_link.rs index c47941c..6eb7b40 100644 --- a/src/construct/label_start_link.rs +++ b/src/construct/label_start_link.rs @@ -46,7 +46,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { tokenizer.consume(); tokenizer.exit(Token::LabelMarker); tokenizer.exit(Token::LabelLink); - tokenizer.label_start_stack.push(LabelStart { + tokenizer.tokenize_state.label_start_stack.push(LabelStart { start: (start, tokenizer.events.len() - 1), balanced: false, inactive: false, diff --git a/src/construct/partial_data.rs b/src/construct/partial_data.rs index a68f359..0ad67c5 100644 --- a/src/construct/partial_data.rs +++ b/src/construct/partial_data.rs @@ -17,8 +17,8 @@ use crate::tokenizer::{EventType, State, StateName, Tokenizer}; /// ``` pub fn start(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - // Make sure to eat the first `stop`. - Some(byte) if tokenizer.tokenize_state.stop.contains(&byte) => { + // Make sure to eat the first `markers`. 
+ Some(byte) if tokenizer.tokenize_state.markers.contains(&byte) => { tokenizer.enter(Token::Data); tokenizer.consume(); State::Next(StateName::DataInside) @@ -42,7 +42,7 @@ pub fn at_break(tokenizer: &mut Tokenizer) -> State { tokenizer.exit(Token::LineEnding); State::Next(StateName::DataAtBreak) } - Some(byte) if tokenizer.tokenize_state.stop.contains(&byte) => { + Some(byte) if tokenizer.tokenize_state.markers.contains(&byte) => { tokenizer.register_resolver_before("data".to_string(), Box::new(resolve_data)); State::Ok } @@ -62,7 +62,7 @@ pub fn at_break(tokenizer: &mut Tokenizer) -> State { pub fn inside(tokenizer: &mut Tokenizer) -> State { let done = match tokenizer.current { None | Some(b'\n') => true, - Some(byte) if tokenizer.tokenize_state.stop.contains(&byte) => true, + Some(byte) if tokenizer.tokenize_state.markers.contains(&byte) => true, _ => false, }; diff --git a/src/construct/partial_destination.rs b/src/construct/partial_destination.rs index 26fadc4..735fb38 100644 --- a/src/construct/partial_destination.rs +++ b/src/construct/partial_destination.rs @@ -182,7 +182,7 @@ pub fn raw(tokenizer: &mut Tokenizer) -> State { tokenizer.tokenize_state.size = 0; State::Ok } - Some(b'(') if tokenizer.tokenize_state.size < tokenizer.tokenize_state.size_other => { + Some(b'(') if tokenizer.tokenize_state.size < tokenizer.tokenize_state.size_b => { tokenizer.consume(); tokenizer.tokenize_state.size += 1; State::Next(StateName::DestinationRaw) diff --git a/src/construct/partial_label.rs b/src/construct/partial_label.rs index a151841..6447961 100644 --- a/src/construct/partial_label.rs +++ b/src/construct/partial_label.rs @@ -142,13 +142,26 @@ pub fn at_break(tokenizer: &mut Tokenizer) -> State { } } -/// To do. +/// In a label, after whitespace. +/// +/// ```markdown +/// | [a␊ +/// > | b] +/// ^ +/// ``` pub fn eol_after(tokenizer: &mut Tokenizer) -> State { tokenizer.tokenize_state.connect = true; State::Retry(StateName::LabelAtBreak) } -/// To do. +/// In a label, at a blank line. +/// +/// ```markdown +/// | [a␊ +/// > | ␊ +/// ^ +/// | b] +/// ``` pub fn at_blank_line(tokenizer: &mut Tokenizer) -> State { tokenizer.tokenize_state.marker = 0; tokenizer.tokenize_state.connect = false; diff --git a/src/construct/partial_title.rs b/src/construct/partial_title.rs index 0b81418..209240e 100644 --- a/src/construct/partial_title.rs +++ b/src/construct/partial_title.rs @@ -133,13 +133,26 @@ pub fn at_break(tokenizer: &mut Tokenizer) -> State { } } -/// To do. +/// In a title, after whitespace. +/// +/// ```markdown +/// | "a␊ +/// > | b" +/// ^ +/// ``` pub fn after_eol(tokenizer: &mut Tokenizer) -> State { tokenizer.tokenize_state.connect = true; State::Retry(StateName::TitleAtBreak) } -/// To do. +/// In a title, at a blank line. 
+/// +/// ```markdown +/// | "a␊ +/// > | ␊ +/// ^ +/// | b" +/// ``` pub fn at_blank_line(tokenizer: &mut Tokenizer) -> State { tokenizer.tokenize_state.marker = 0; tokenizer.tokenize_state.connect = false; diff --git a/src/content/document.rs b/src/content/document.rs index 98f8a7d..49ca919 100644 --- a/src/content/document.rs +++ b/src/content/document.rs @@ -59,7 +59,7 @@ pub fn document(parse_state: &mut ParseState, point: Point) -> Vec { let state = tokenizer.push( (0, 0), (parse_state.bytes.len(), 0), - StateName::DocumentStart, + State::Next(StateName::DocumentStart), ); tokenizer.flush(state, true); @@ -105,7 +105,7 @@ pub fn document(parse_state: &mut ParseState, point: Point) -> Vec { /// ^ /// ``` pub fn start(tokenizer: &mut Tokenizer) -> State { - tokenizer.tokenize_state.child_tokenizer = Some(Box::new(Tokenizer::new( + tokenizer.tokenize_state.document_child = Some(Box::new(Tokenizer::new( tokenizer.point.clone(), tokenizer.parse_state, ))); @@ -173,7 +173,7 @@ pub fn container_new_before(tokenizer: &mut Tokenizer) -> State { if tokenizer.tokenize_state.document_continued == tokenizer.tokenize_state.document_container_stack.len() { - let child = tokenizer.tokenize_state.child_tokenizer.as_ref().unwrap(); + let child = tokenizer.tokenize_state.document_child.as_ref().unwrap(); tokenizer.interrupt = child.interrupt; @@ -209,7 +209,12 @@ pub fn container_new_before(tokenizer: &mut Tokenizer) -> State { ) } -/// To do. +/// Maybe before a new container, but not a block quote. +// +/// ```markdown +/// > | * a +/// ^ +/// ``` pub fn container_new_before_not_block_quote(tokenizer: &mut Tokenizer) -> State { // List item? // We replace the empty block quote container for this new list one. @@ -227,7 +232,12 @@ pub fn container_new_before_not_block_quote(tokenizer: &mut Tokenizer) -> State ) } -/// To do. +/// Maybe before a new container, but not a list. +// +/// ```markdown +/// > | a +/// ^ +/// ``` pub fn container_new_before_not_list(tokenizer: &mut Tokenizer) -> State { // It wasn’t a new block quote or a list. // Swap the new container (in the middle) with the existing one (at the end). @@ -283,7 +293,7 @@ pub fn container_new_after(tokenizer: &mut Tokenizer) -> State { /// ^ /// ``` pub fn containers_after(tokenizer: &mut Tokenizer) -> State { - let child = tokenizer.tokenize_state.child_tokenizer.as_mut().unwrap(); + let child = tokenizer.tokenize_state.document_child.as_mut().unwrap(); child.lazy = tokenizer.tokenize_state.document_continued != tokenizer.tokenize_state.document_container_stack.len(); @@ -312,7 +322,12 @@ pub fn containers_after(tokenizer: &mut Tokenizer) -> State { } } -/// To do. +/// In flow. 
+// +/// ```markdown +/// > | * ab +/// ^ +/// ``` pub fn flow_inside(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { None => { @@ -340,23 +355,18 @@ pub fn flow_inside(tokenizer: &mut Tokenizer) -> State { /// ^ ^ /// ``` pub fn flow_end(tokenizer: &mut Tokenizer) -> State { - let child = tokenizer.tokenize_state.child_tokenizer.as_mut().unwrap(); + let child = tokenizer.tokenize_state.document_child.as_mut().unwrap(); let state = tokenizer .tokenize_state .document_child_state .unwrap_or(State::Next(StateName::FlowStart)); - let name = match state { - State::Next(name) => name, - _ => unreachable!("expected state name"), - }; - tokenizer.tokenize_state.document_exits.push(None); let state = child.push( (child.point.index, child.point.vs), (tokenizer.point.index, tokenizer.point.vs), - name, + state, ); let paragraph = matches!(state, State::Next(StateName::ParagraphInside)) @@ -403,7 +413,7 @@ fn exit_containers(tokenizer: &mut Tokenizer, phase: &Phase) { .document_container_stack .split_off(tokenizer.tokenize_state.document_continued); - let child = tokenizer.tokenize_state.child_tokenizer.as_mut().unwrap(); + let child = tokenizer.tokenize_state.document_child.as_mut().unwrap(); // Flush if needed. if *phase != Phase::After { @@ -463,7 +473,7 @@ fn exit_containers(tokenizer: &mut Tokenizer, phase: &Phase) { // Inject everything together. fn resolve(tokenizer: &mut Tokenizer) { - let child = tokenizer.tokenize_state.child_tokenizer.as_mut().unwrap(); + let child = tokenizer.tokenize_state.document_child.as_mut().unwrap(); // First, add the container exits into `child`. let mut child_index = 0; diff --git a/src/content/string.rs b/src/content/string.rs index 75cd56a..5dfceb0 100644 --- a/src/content/string.rs +++ b/src/content/string.rs @@ -20,7 +20,7 @@ const MARKERS: [u8; 2] = [b'&', b'\\']; /// Start of string. pub fn start(tokenizer: &mut Tokenizer) -> State { tokenizer.register_resolver("whitespace".to_string(), Box::new(resolve)); - tokenizer.tokenize_state.stop = &MARKERS; + tokenizer.tokenize_state.markers = &MARKERS; State::Retry(StateName::StringBefore) } diff --git a/src/content/text.rs b/src/content/text.rs index ee70f33..4e93779 100644 --- a/src/content/text.rs +++ b/src/content/text.rs @@ -38,7 +38,7 @@ const MARKERS: [u8; 9] = [ /// Start of text. pub fn start(tokenizer: &mut Tokenizer) -> State { tokenizer.register_resolver("whitespace".to_string(), Box::new(resolve)); - tokenizer.tokenize_state.stop = &MARKERS; + tokenizer.tokenize_state.markers = &MARKERS; State::Retry(StateName::TextBefore) } @@ -91,7 +91,7 @@ pub fn before(tokenizer: &mut Tokenizer) -> State { } } -/// To do. +/// At `<`, which wasn’t an autolink: before HTML? pub fn before_html(tokenizer: &mut Tokenizer) -> State { tokenizer.attempt( StateName::HtmlTextStart, @@ -100,7 +100,7 @@ pub fn before_html(tokenizer: &mut Tokenizer) -> State { ) } -/// To do. +/// At `\`, which wasn’t a character escape: before a hard break? pub fn before_hard_break_escape(tokenizer: &mut Tokenizer) -> State { tokenizer.attempt( StateName::HardBreakEscapeStart, @@ -110,10 +110,6 @@ pub fn before_hard_break_escape(tokenizer: &mut Tokenizer) -> State { } /// At data. 
-/// -/// ```markdown -/// |qwe -/// ``` pub fn before_data(tokenizer: &mut Tokenizer) -> State { tokenizer.attempt( StateName::DataStart, diff --git a/src/subtokenize.rs b/src/subtokenize.rs index 3d923d3..bf6a106 100644 --- a/src/subtokenize.rs +++ b/src/subtokenize.rs @@ -99,10 +99,7 @@ pub fn subtokenize(events: &mut Vec, parse_state: &ParseState) -> bool { state = tokenizer.push( (enter.point.index, enter.point.vs), (end.index, end.vs), - match state { - State::Next(func) => func, - _ => unreachable!("cannot be ok/nok"), - }, + state, ); link_index = link_curr.next; @@ -112,7 +109,6 @@ pub fn subtokenize(events: &mut Vec, parse_state: &ParseState) -> bool { divide_events(&mut map, events, index, &mut tokenizer.events); - // To do: check `tokenizer.events` if there is a deep content type? done = false; } } diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 3cdd2d3..04a8cc3 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -29,11 +29,16 @@ pub enum ContentType { Text, } -/// To do. +/// How to handle a byte. #[derive(Debug, PartialEq)] pub enum ByteAction { + /// This is a normal byte. + /// + /// Includes replaced bytes. Normal(u8), + /// This is a new byte. Insert(u8), + /// This byte must be ignored. Ignore, } @@ -84,22 +89,6 @@ pub struct Event { pub link: Option, } -#[derive(Debug, PartialEq)] -enum AttemptKind { - Attempt, - Check, -} - -/// To do. -#[derive(Debug)] -struct Attempt { - /// To do. - ok: State, - nok: State, - kind: AttemptKind, - state: Option, -} - /// Callback that can be registered and is called when the tokenizer is done. /// /// Resolvers are supposed to change the list of events, because parsing is @@ -107,6 +96,7 @@ struct Attempt { /// the compiler and other users. pub type Resolver = dyn FnOnce(&mut Tokenizer); +/// Names of functions to move to. #[derive(Debug, Clone, Copy, PartialEq)] pub enum StateName { AttentionStart, @@ -447,62 +437,73 @@ pub struct ContainerState { pub size: usize, } +/// Different kinds of attempts. +#[derive(Debug, PartialEq)] +enum AttemptKind { + /// Discard what was tokenizer when unsuccessful. + Attempt, + /// Discard always. + Check, +} + +/// How to handle [`State::Ok`][] or [`State::Nok`][]. +#[derive(Debug)] +struct Attempt { + /// Where to go to when successful. + ok: State, + /// Where to go to when unsuccessful. + nok: State, + /// Kind of attempt. + kind: AttemptKind, + /// If needed, the progress to revert to. + /// + /// It is not needed to discard an [`AttemptKind::Attempt`] that has a + /// `nok` of [`State::Nok`][], because that means it is used in *another* + /// attempt, which will receive that `Nok`, and has to handle it. + progress: Option, +} + /// The internal state of a tokenizer, not to be confused with states from the /// state machine, this instead is all the information about where we currently /// are and what’s going on. #[derive(Debug, Clone)] -struct InternalState { - /// Length of `events`. We only add to events, so reverting will just pop stuff off. +struct Progress { + /// Length of `events`. + /// + /// It’s not allowed to remove events, so reverting will just pop stuff off. events_len: usize, - /// Length of the stack. It’s not allowed to decrease the stack in a check or an attempt. + /// Length of the stack. + /// + /// It’s not allowed to decrease the stack in an attempt. stack_len: usize, /// Previous code. previous: Option, /// Current code. current: Option, - /// Current relative and absolute position in the file. + /// Current place in the file. 
point: Point, } -/// To do +/// A lot of shared fields used to tokenize things. #[allow(clippy::struct_excessive_bools)] pub struct TokenizeState<'a> { - /// To do. - pub connect: bool, - /// To do. + // Couple complex fields used to tokenize the document. + /// Tokenizer, used to tokenize flow in document. + pub document_child: Option>>, + /// State, used to tokenize containers. + pub document_child_state: Option, + /// Stack of currently active containers. pub document_container_stack: Vec, - /// To do. - pub document_exits: Vec>>, - /// To do. + /// How many active containers continued. pub document_continued: usize, - /// To do. - pub document_paragraph_before: bool, - /// To do. + /// Index of last `data`. pub document_data_index: Option, - /// To do. - pub document_child_state: Option, - /// To do. - pub child_tokenizer: Option>>, - /// To do. - pub marker: u8, - /// To do. - pub marker_other: u8, - /// To do. - pub prefix: usize, - /// To do. - pub return_state: Option, - /// To do. - pub seen: bool, - /// To do. - pub size: usize, - /// To do. - pub size_other: usize, - /// To do. - pub start: usize, - /// To do. - pub end: usize, - /// To do. - pub stop: &'static [u8], + /// Container exits by line number. + pub document_exits: Vec>>, + /// Whether the previous flow was a paragraph. + pub document_paragraph_before: bool, + + // Couple of very frequent settings for parsing whitespace. pub space_or_tab_eol_content_type: Option, pub space_or_tab_eol_connect: bool, pub space_or_tab_eol_ok: bool, @@ -512,11 +513,50 @@ pub struct TokenizeState<'a> { pub space_or_tab_max: usize, pub space_or_tab_size: usize, pub space_or_tab_token: Token, - /// To do. + + // Couple of media related fields. + /// Stack of label (start) that could form images and links. + /// + /// Used when tokenizing [text content][crate::content::text]. + pub label_start_stack: Vec, + /// Stack of label (start) that cannot form images and links. + /// + /// Used when tokenizing [text content][crate::content::text]. + pub label_start_list_loose: Vec, + /// Stack of images and links. + /// + /// Used when tokenizing [text content][crate::content::text]. + pub media_list: Vec, + + /// Whether to connect tokens. + pub connect: bool, + /// Marker. + pub marker: u8, + /// Secondary marker. + pub marker_b: u8, + /// Several markers. + pub markers: &'static [u8], + /// Whether something was seen. + pub seen: bool, + /// Size. + pub size: usize, + /// Secondary size. + pub size_b: usize, + /// Tertiary size. + pub size_c: usize, + /// Index. + pub start: usize, + /// Index. + pub end: usize, + /// Slot for a token type. pub token_1: Token, + /// Slot for a token type. pub token_2: Token, + /// Slot for a token type. pub token_3: Token, + /// Slot for a token type. pub token_4: Token, + /// Slot for a token type. pub token_5: Token, } @@ -525,9 +565,9 @@ pub struct TokenizeState<'a> { pub struct Tokenizer<'a> { /// Jump between line endings. column_start: Vec<(usize, usize)>, - // First line. + // First line where this tokenizer starts. first_line: usize, - /// First point after the last line ending. + /// Current point after the last line ending (excluding jump). line_start: Point, /// Track whether the current byte is already consumed (`true`) or expected /// to be consumed (`false`). @@ -536,7 +576,7 @@ pub struct Tokenizer<'a> { consumed: bool, /// Track whether this tokenizer is done. resolved: bool, - /// To do. + /// Stack of how to handle attempts. attempts: Vec, /// Current byte. 
pub current: Option, @@ -544,7 +584,7 @@ pub struct Tokenizer<'a> { pub previous: Option, /// Current relative and absolute place in the file. pub point: Point, - /// Semantic labels of one or more codes in `codes`. + /// Semantic labels. pub events: Vec, /// Hierarchy of semantic labels. /// @@ -559,20 +599,8 @@ pub struct Tokenizer<'a> { pub resolver_ids: Vec, /// Shared parsing state across tokenizers. pub parse_state: &'a ParseState<'a>, - /// To do. + /// A lot of shared fields used to tokenize things. pub tokenize_state: TokenizeState<'a>, - /// Stack of label (start) that could form images and links. - /// - /// Used when tokenizing [text content][crate::content::text]. - pub label_start_stack: Vec, - /// Stack of label (start) that cannot form images and links. - /// - /// Used when tokenizing [text content][crate::content::text]. - pub label_start_list_loose: Vec, - /// Stack of images and links. - /// - /// Used when tokenizing [text content][crate::content::text]. - pub media_list: Vec, /// Whether we would be interrupting something. /// /// Used when tokenizing [flow content][crate::content::flow]. @@ -613,17 +641,19 @@ impl<'a> Tokenizer<'a> { document_paragraph_before: false, document_data_index: None, document_child_state: None, - child_tokenizer: None, + document_child: None, marker: 0, - marker_other: 0, - prefix: 0, + marker_b: 0, + markers: &[], seen: false, size: 0, - size_other: 0, + size_b: 0, + size_c: 0, start: 0, end: 0, - stop: &[], - return_state: None, + label_start_stack: vec![], + label_start_list_loose: vec![], + media_list: vec![], space_or_tab_eol_content_type: None, space_or_tab_eol_connect: false, space_or_tab_eol_ok: false, @@ -640,15 +670,11 @@ impl<'a> Tokenizer<'a> { token_5: Token::Data, }, map: EditMap::new(), - label_start_stack: vec![], - label_start_list_loose: vec![], - media_list: vec![], interrupt: false, concrete: false, lazy: false, - // Assume about 10 resolvers. - resolvers: Vec::with_capacity(10), - resolver_ids: Vec::with_capacity(10), + resolvers: vec![], + resolver_ids: vec![], } } @@ -698,7 +724,7 @@ impl<'a> Tokenizer<'a> { } /// Prepare for a next code to get consumed. - pub fn expect(&mut self, byte: Option) { + fn expect(&mut self, byte: Option) { debug_assert!(self.consumed, "expected previous byte to be consumed"); self.consumed = false; self.current = byte; @@ -721,7 +747,7 @@ impl<'a> Tokenizer<'a> { } /// Move to the next (virtual) byte. - pub fn move_one(&mut self) { + fn move_one(&mut self) { match byte_action(self.parse_state.bytes, &self.point) { ByteAction::Ignore => { self.point.index += 1; @@ -756,7 +782,7 @@ impl<'a> Tokenizer<'a> { } /// Move (virtual) bytes. - pub fn move_to(&mut self, to: (usize, usize)) { + fn move_to(&mut self, to: (usize, usize)) { let (to_index, to_vs) = to; while self.point.index < to_index || self.point.index == to_index && self.point.vs < to_vs { self.move_one(); @@ -838,9 +864,9 @@ impl<'a> Tokenizer<'a> { }); } - /// Capture the internal state. - fn capture(&mut self) -> InternalState { - InternalState { + /// Capture the tokenizer progress. + fn capture(&mut self) -> Progress { + Progress { previous: self.previous, current: self.current, point: self.point.clone(), @@ -849,8 +875,8 @@ impl<'a> Tokenizer<'a> { } } - /// Apply the internal state. - fn free(&mut self, previous: InternalState) { + /// Apply tokenizer progress. 
+ fn free(&mut self, previous: Progress) { self.previous = previous.previous; self.current = previous.current; self.point = previous.point; @@ -866,123 +892,168 @@ impl<'a> Tokenizer<'a> { self.stack.truncate(previous.stack_len); } - /// Parse with `name` and its future states, to check if it result in - /// [`State::Ok`][] or [`State::Nok`][], revert on both cases, and then - /// call `done` with whether it was successful or not. - /// - /// This captures the current state of the tokenizer, returns a wrapped - /// state that captures all codes and feeds them to `name` and its - /// future states until it yields `State::Ok` or `State::Nok`. - /// It then applies the captured state, calls `done`, and feeds all - /// captured codes to its future states. + /// Parse with `name` and its future states, to see if that results in + /// [`State::Ok`][] or [`State::Nok`][], then revert in both cases. pub fn check(&mut self, name: StateName, ok: State, nok: State) -> State { - attempt_impl(self, name, ok, nok, AttemptKind::Check) + // Always capture (and restore) when checking. + // No need to capture (and restore) when `nok` is `State::Nok`, because the + // parent attempt will do it. + let progress = Some(self.capture()); + + self.attempts.push(Attempt { + kind: AttemptKind::Check, + progress, + ok, + nok, + }); + + call_impl(self, name) } - /// Parse with `name` and its future states, to check if it results in - /// [`State::Ok`][] or [`State::Nok`][], revert on the case of - /// `State::Nok`, and then call `done` with whether it was successful or - /// not. - /// - /// This captures the current state of the tokenizer, returns a wrapped - /// state that captures all codes and feeds them to `name` and its - /// future states until it yields `State::Ok`, at which point it calls - /// `done` and yields its result. - /// If instead `State::Nok` was yielded, the captured state is applied, - /// `done` is called, and all captured codes are fed to its future states. + /// Parse with `name` and its future states, to see if that results in + /// [`State::Ok`][] or [`State::Nok`][], revert in the case of + /// `State::Nok`. pub fn attempt(&mut self, name: StateName, ok: State, nok: State) -> State { - attempt_impl(self, name, ok, nok, AttemptKind::Attempt) - } + // Always capture (and restore) when checking. + // No need to capture (and restore) when `nok` is `State::Nok`, because the + // parent attempt will do it. + let progress = if nok == State::Nok { + None + } else { + Some(self.capture()) + }; - /// Feed a list of `codes` into `start`. - /// - /// This is set up to support repeatedly calling `feed`, and thus streaming - /// markdown into the state machine, and normally pauses after feeding. - // Note: if needed: accept `vs`? - pub fn push(&mut self, min: (usize, usize), max: (usize, usize), name: StateName) -> State { - debug_assert!(!self.resolved, "cannot feed after drain"); + self.attempts.push(Attempt { + kind: AttemptKind::Attempt, + progress, + ok, + nok, + }); - // debug_assert!(min >= self.point.index, "cannot move backwards"); + call_impl(self, name) + } - if min.0 > self.point.index || (min.0 == self.point.index && min.1 > self.point.vs) { - self.move_to(min); - } + /// Tokenize. + pub fn push(&mut self, from: (usize, usize), to: (usize, usize), state: State) -> State { + push_impl(self, from, to, state, false) + } - let mut state = State::Next(name); + /// Flush. 
+ pub fn flush(&mut self, state: State, resolve: bool) { + let to = (self.point.index, self.point.vs); + push_impl(self, to, to, state, true); - while self.point.index < max.0 || (self.point.index == max.0 && self.point.vs < max.1) { - match state { - State::Ok | State::Nok => { - if let Some(attempt) = self.attempts.pop() { - state = attempt_done_impl(self, attempt, state); - } else { - break; - } - } - State::Next(name) => { - let action = byte_action(self.parse_state.bytes, &self.point); - state = feed_action_impl(self, &Some(action), name); - } - State::Retry(name) => { - log::debug!(" retry {:?}", name); - state = call_impl(self, name); - } + if resolve { + self.resolved = true; + + while !self.resolvers.is_empty() { + let resolver = self.resolvers.remove(0); + resolver(self); } + + self.map.consume(&mut self.events); } + } +} - state +/// Move back past ignored bytes. +fn move_point_back(tokenizer: &mut Tokenizer, point: &mut Point) { + while point.index > 0 { + point.index -= 1; + let action = byte_action(tokenizer.parse_state.bytes, point); + if !matches!(action, ByteAction::Ignore) { + point.index += 1; + break; + } } +} - /// Flush the tokenizer. - pub fn flush(&mut self, mut state: State, resolve: bool) { - let max = self.point.index; +/// Run the tokenizer. +fn push_impl( + tokenizer: &mut Tokenizer, + from: (usize, usize), + to: (usize, usize), + mut state: State, + flush: bool, +) -> State { + debug_assert!(!tokenizer.resolved, "cannot feed after drain"); + debug_assert!( + from.0 > tokenizer.point.index + || (from.0 == tokenizer.point.index && from.1 >= tokenizer.point.vs), + "cannot move backwards" + ); + + tokenizer.move_to(from); + + loop { + match state { + State::Ok | State::Nok => { + if let Some(attempt) = tokenizer.attempts.pop() { + if attempt.kind == AttemptKind::Check || state == State::Nok { + if let Some(progress) = attempt.progress { + tokenizer.free(progress); + } + } - self.consumed = true; + tokenizer.consumed = true; - loop { - match state { - State::Ok | State::Nok => { - if let Some(attempt) = self.attempts.pop() { - state = attempt_done_impl(self, attempt, state); + let next = if state == State::Ok { + attempt.ok } else { - break; - } - } - State::Next(name) => { - // We sometimes move back when flushing, so then we use those codes. 
- state = feed_action_impl( - self, - &if self.point.index == max { - None - } else { - Some(byte_action(self.parse_state.bytes, &self.point)) - }, - name, - ); - } - State::Retry(name) => { - log::debug!(" retry {:?}", name); - state = call_impl(self, name); + attempt.nok + }; + + log::debug!("attempt: `{:?}` -> `{:?}`", state, next); + state = next; + } else { + break; } } - } - - self.consumed = true; - debug_assert!(matches!(state, State::Ok), "must be ok"); + State::Next(name) => { + let action = if tokenizer.point.index < to.0 + || (tokenizer.point.index == to.0 && tokenizer.point.vs < to.1) + { + Some(byte_action(tokenizer.parse_state.bytes, &tokenizer.point)) + } else if flush { + None + } else { + break; + }; - if resolve { - self.resolved = true; + if let Some(ByteAction::Ignore) = action { + tokenizer.move_one(); + } else { + let byte = + if let Some(ByteAction::Insert(byte) | ByteAction::Normal(byte)) = action { + Some(byte) + } else { + None + }; - while !self.resolvers.is_empty() { - let resolver = self.resolvers.remove(0); - resolver(self); + log::debug!("feed: `{:?}` to {:?}", byte, name); + tokenizer.expect(byte); + state = call_impl(tokenizer, name); + }; + } + State::Retry(name) => { + log::debug!("retry: {:?}", name); + state = call_impl(tokenizer, name); } - - self.map.consume(&mut self.events); } } + + tokenizer.consumed = true; + + if flush { + debug_assert!(matches!(state, State::Ok), "must be ok"); + } else { + debug_assert!(matches!(state, State::Next(_)), "must have a next state"); + } + + state } +/// Figure out how to handle a byte. fn byte_action(bytes: &[u8], point: &Point) -> ByteAction { if point.index < bytes.len() { let byte = bytes[point.index]; @@ -1024,73 +1095,8 @@ fn byte_action(bytes: &[u8], point: &Point) -> ByteAction { } } -/// Internal utility to wrap states to also capture codes. -/// -/// Recurses into itself. -/// Used in [`Tokenizer::attempt`][Tokenizer::attempt] and [`Tokenizer::check`][Tokenizer::check]. -fn attempt_impl( - tokenizer: &mut Tokenizer, - name: StateName, - ok: State, - nok: State, - kind: AttemptKind, -) -> State { - // Always capture (and restore) when checking. - // No need to capture (and restore) when `nok` is `State::Nok`, because the - // parent attempt will do it. - let state = if kind == AttemptKind::Check || nok != State::Nok { - Some(tokenizer.capture()) - } else { - None - }; - - tokenizer.attempts.push(Attempt { - ok, - nok, - kind, - state, - }); - - call_impl(tokenizer, name) -} - -fn attempt_done_impl(tokenizer: &mut Tokenizer, attempt: Attempt, state: State) -> State { - if attempt.kind == AttemptKind::Check || state == State::Nok { - if let Some(state) = attempt.state { - tokenizer.free(state); - } - } - - tokenizer.consumed = true; - if state == State::Ok { - attempt.ok - } else { - attempt.nok - } -} - -fn feed_action_impl( - tokenizer: &mut Tokenizer, - action: &Option, - name: StateName, -) -> State { - if let Some(ByteAction::Ignore) = action { - tokenizer.move_one(); - State::Next(name) - } else { - let byte = if let Some(ByteAction::Insert(byte) | ByteAction::Normal(byte)) = action { - Some(*byte) - } else { - None - }; - - log::debug!("feed: `{:?}` to {:?}", byte, name); - tokenizer.expect(byte); - call_impl(tokenizer, name) - } -} - #[allow(clippy::too_many_lines)] +/// Call the corresponding function for a state name. 
fn call_impl(tokenizer: &mut Tokenizer, name: StateName) -> State { let func = match name { StateName::AttentionStart => construct::attention::start, @@ -1422,15 +1428,3 @@ fn call_impl(tokenizer: &mut Tokenizer, name: StateName) -> State { func(tokenizer) } - -fn move_point_back(tokenizer: &mut Tokenizer, point: &mut Point) { - // Move back past ignored bytes. - while point.index > 0 { - point.index -= 1; - let action = byte_action(tokenizer.parse_state.bytes, point); - if !matches!(action, ByteAction::Ignore) { - point.index += 1; - break; - } - } -} -- cgit