From 11304728b6607bc2a8d41a640308f3379a25b933 Mon Sep 17 00:00:00 2001 From: Titus Wormer Date: Mon, 25 Jul 2022 15:29:11 +0200 Subject: Improve performance w/ a single feed loop --- src/construct/attention.rs | 2 +- src/construct/blank_line.rs | 4 +- src/construct/block_quote.rs | 2 +- src/construct/code_fenced.rs | 6 +- src/construct/code_indented.rs | 8 +- src/construct/code_text.rs | 2 +- src/construct/definition.rs | 6 +- src/construct/hard_break_escape.rs | 2 +- src/construct/hard_break_trailing.rs | 2 +- src/construct/heading_atx.rs | 2 +- src/construct/heading_setext.rs | 2 +- src/construct/html_flow.rs | 4 +- src/construct/label_end.rs | 8 +- src/construct/list.rs | 10 +- src/construct/paragraph.rs | 2 +- src/construct/partial_data.rs | 2 +- src/construct/partial_destination.rs | 4 +- src/construct/partial_non_lazy_continuation.rs | 4 +- src/construct/partial_space_or_tab.rs | 8 +- src/construct/partial_whitespace.rs | 4 +- src/construct/thematic_break.rs | 2 +- src/content/document.rs | 2 +- src/subtokenize.rs | 8 +- src/tokenizer.rs | 187 ++++++++++++------------- 24 files changed, 138 insertions(+), 145 deletions(-) (limited to 'src') diff --git a/src/construct/attention.rs b/src/construct/attention.rs index 1aa25c0..eb93810 100644 --- a/src/construct/attention.rs +++ b/src/construct/attention.rs @@ -193,7 +193,7 @@ fn inside(tokenizer: &mut Tokenizer, code: Code, marker: MarkerKind) -> State { _ => { tokenizer.exit(Token::AttentionSequence); tokenizer.register_resolver("attention".to_string(), Box::new(resolve_attention)); - State::Ok(if matches!(code, Code::None) { 0 } else { 1 }) + State::Ok(0) } } } diff --git a/src/construct/blank_line.rs b/src/construct/blank_line.rs index 1121b81..dc36784 100644 --- a/src/construct/blank_line.rs +++ b/src/construct/blank_line.rs @@ -59,9 +59,7 @@ pub fn start(tokenizer: &mut Tokenizer, code: Code) -> State { /// ``` fn after(_tokenizer: &mut Tokenizer, code: Code) -> State { match code { - Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { - State::Ok(if matches!(code, Code::None) { 0 } else { 1 }) - } + Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => State::Ok(0), _ => State::Nok, } } diff --git a/src/construct/block_quote.rs b/src/construct/block_quote.rs index da21add..9925a5a 100644 --- a/src/construct/block_quote.rs +++ b/src/construct/block_quote.rs @@ -128,7 +128,7 @@ fn cont_after(tokenizer: &mut Tokenizer, code: Code) -> State { } _ => { tokenizer.exit(Token::BlockQuotePrefix); - State::Ok(if matches!(code, Code::None) { 0 } else { 1 }) + State::Ok(0) } } } diff --git a/src/construct/code_fenced.rs b/src/construct/code_fenced.rs index 3923ba0..a814142 100644 --- a/src/construct/code_fenced.rs +++ b/src/construct/code_fenced.rs @@ -506,7 +506,7 @@ fn close_sequence_after(tokenizer: &mut Tokenizer, code: Code) -> State { match code { Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { tokenizer.exit(Token::CodeFencedFence); - State::Ok(if matches!(code, Code::None) { 0 } else { 1 }) + State::Ok(0) } _ => State::Nok, } @@ -589,11 +589,11 @@ fn content_continue(tokenizer: &mut Tokenizer, code: Code, info: Info) -> State /// > | ~~~ /// ^ /// ``` -fn after(tokenizer: &mut Tokenizer, code: Code) -> State { +fn after(tokenizer: &mut Tokenizer, _code: Code) -> State { tokenizer.exit(Token::CodeFenced); // Feel free to interrupt. tokenizer.interrupt = false; // No longer concrete. tokenizer.concrete = false; - State::Ok(if matches!(code, Code::None) { 0 } else { 1 }) + State::Ok(0) } diff --git a/src/construct/code_indented.rs b/src/construct/code_indented.rs index 512a816..6c528ff 100644 --- a/src/construct/code_indented.rs +++ b/src/construct/code_indented.rs @@ -115,11 +115,11 @@ fn content(tokenizer: &mut Tokenizer, code: Code) -> State { /// > | aaa /// ^ /// ``` -fn after(tokenizer: &mut Tokenizer, code: Code) -> State { +fn after(tokenizer: &mut Tokenizer, _code: Code) -> State { tokenizer.exit(Token::CodeIndented); // Feel free to interrupt. tokenizer.interrupt = false; - State::Ok(if matches!(code, Code::None) { 0 } else { 1 }) + State::Ok(0) } /// Right at a line ending, trying to parse another indent. @@ -154,8 +154,8 @@ fn further_start(tokenizer: &mut Tokenizer, code: Code) -> State { /// > | bbb /// ^ /// ``` -fn further_end(_tokenizer: &mut Tokenizer, code: Code) -> State { - State::Ok(if matches!(code, Code::None) { 0 } else { 1 }) +fn further_end(_tokenizer: &mut Tokenizer, _code: Code) -> State { + State::Ok(0) } /// At the beginning of a line that is not indented enough. diff --git a/src/construct/code_text.rs b/src/construct/code_text.rs index e68d489..451ef45 100644 --- a/src/construct/code_text.rs +++ b/src/construct/code_text.rs @@ -190,7 +190,7 @@ fn sequence_close(tokenizer: &mut Tokenizer, code: Code, size_open: usize, size: _ if size_open == size => { tokenizer.exit(Token::CodeTextSequence); tokenizer.exit(Token::CodeText); - State::Ok(if matches!(code, Code::None) { 0 } else { 1 }) + State::Ok(0) } _ => { let index = tokenizer.events.len(); diff --git a/src/construct/definition.rs b/src/construct/definition.rs index 9e43d18..766bd8a 100644 --- a/src/construct/definition.rs +++ b/src/construct/definition.rs @@ -237,7 +237,7 @@ fn after_whitespace(tokenizer: &mut Tokenizer, code: Code) -> State { tokenizer.exit(Token::Definition); // You’d be interrupting. tokenizer.interrupt = true; - State::Ok(if matches!(code, Code::None) { 0 } else { 1 }) + State::Ok(0) } _ => State::Nok, } @@ -297,9 +297,7 @@ fn title_after(tokenizer: &mut Tokenizer, code: Code) -> State { /// ``` fn title_after_after_optional_whitespace(_tokenizer: &mut Tokenizer, code: Code) -> State { match code { - Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { - State::Ok(if matches!(code, Code::None) { 0 } else { 1 }) - } + Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => State::Ok(0), _ => State::Nok, } } diff --git a/src/construct/hard_break_escape.rs b/src/construct/hard_break_escape.rs index 617b0ce..2ac693e 100644 --- a/src/construct/hard_break_escape.rs +++ b/src/construct/hard_break_escape.rs @@ -74,7 +74,7 @@ fn inside(tokenizer: &mut Tokenizer, code: Code) -> State { match code { Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { tokenizer.exit(Token::HardBreakEscape); - State::Ok(if matches!(code, Code::None) { 0 } else { 1 }) + State::Ok(0) } _ => State::Nok, } diff --git a/src/construct/hard_break_trailing.rs b/src/construct/hard_break_trailing.rs index 8ce4201..35097ec 100644 --- a/src/construct/hard_break_trailing.rs +++ b/src/construct/hard_break_trailing.rs @@ -81,7 +81,7 @@ fn inside(tokenizer: &mut Tokenizer, code: Code, size: usize) -> State { { tokenizer.exit(Token::HardBreakTrailingSpace); tokenizer.exit(Token::HardBreakTrailing); - State::Ok(if matches!(code, Code::None) { 0 } else { 1 }) + State::Ok(0) } _ => State::Nok, } diff --git a/src/construct/heading_atx.rs b/src/construct/heading_atx.rs index 1eabb56..4ef1192 100644 --- a/src/construct/heading_atx.rs +++ b/src/construct/heading_atx.rs @@ -134,7 +134,7 @@ fn at_break(tokenizer: &mut Tokenizer, code: Code) -> State { tokenizer.register_resolver("heading_atx".to_string(), Box::new(resolve)); // Feel free to interrupt. tokenizer.interrupt = false; - State::Ok(if matches!(code, Code::None) { 0 } else { 1 }) + State::Ok(0) } Code::VirtualSpace | Code::Char('\t' | ' ') => { tokenizer.go(space_or_tab(), at_break)(tokenizer, code) diff --git a/src/construct/heading_setext.rs b/src/construct/heading_setext.rs index 7aa0054..83c41e2 100644 --- a/src/construct/heading_setext.rs +++ b/src/construct/heading_setext.rs @@ -189,7 +189,7 @@ fn after(tokenizer: &mut Tokenizer, code: Code) -> State { // Feel free to interrupt. tokenizer.interrupt = false; tokenizer.register_resolver("heading_setext".to_string(), Box::new(resolve)); - State::Ok(if matches!(code, Code::None) { 0 } else { 1 }) + State::Ok(0) } _ => State::Nok, } diff --git a/src/construct/html_flow.rs b/src/construct/html_flow.rs index 7a7c25f..add2308 100644 --- a/src/construct/html_flow.rs +++ b/src/construct/html_flow.rs @@ -924,13 +924,13 @@ fn continuation_close(tokenizer: &mut Tokenizer, code: Code, info: Info) -> Stat /// > | /// ^ /// ``` -fn continuation_after(tokenizer: &mut Tokenizer, code: Code) -> State { +fn continuation_after(tokenizer: &mut Tokenizer, _code: Code) -> State { tokenizer.exit(Token::HtmlFlow); // Feel free to interrupt. tokenizer.interrupt = false; // No longer concrete. tokenizer.concrete = false; - State::Ok(if matches!(code, Code::None) { 0 } else { 1 }) + State::Ok(0) } /// Before a line ending, expecting a blank line. diff --git a/src/construct/label_end.rs b/src/construct/label_end.rs index 35dfcdf..13af833 100644 --- a/src/construct/label_end.rs +++ b/src/construct/label_end.rs @@ -320,7 +320,7 @@ fn reference_not_full(tokenizer: &mut Tokenizer, code: Code, info: Info) -> Stat /// > | [a] b /// ^ /// ``` -fn ok(tokenizer: &mut Tokenizer, code: Code, mut info: Info) -> State { +fn ok(tokenizer: &mut Tokenizer, _code: Code, mut info: Info) -> State { // Remove this one and everything after it. let mut left = tokenizer .label_start_stack @@ -345,7 +345,7 @@ fn ok(tokenizer: &mut Tokenizer, code: Code, mut info: Info) -> State { info.media.end.1 = tokenizer.events.len() - 1; tokenizer.media_list.push(info.media); tokenizer.register_resolver_before("media".to_string(), Box::new(resolve_media)); - State::Ok(if matches!(code, Code::None) { 0 } else { 1 }) + State::Ok(0) } /// Done, it’s nothing. @@ -526,7 +526,7 @@ fn full_reference(tokenizer: &mut Tokenizer, code: Code) -> State { /// > | [a][b] d /// ^ /// ``` -fn full_reference_after(tokenizer: &mut Tokenizer, code: Code) -> State { +fn full_reference_after(tokenizer: &mut Tokenizer, _code: Code) -> State { let events = &tokenizer.events; let mut index = events.len() - 1; let mut start: Option = None; @@ -558,7 +558,7 @@ fn full_reference_after(tokenizer: &mut Tokenizer, code: Code) -> State { false, ))) { - State::Ok(if matches!(code, Code::None) { 0 } else { 1 }) + State::Ok(0) } else { State::Nok } diff --git a/src/construct/list.rs b/src/construct/list.rs index 7437d4a..ae3fc34 100644 --- a/src/construct/list.rs +++ b/src/construct/list.rs @@ -276,7 +276,7 @@ fn whitespace_after(_tokenizer: &mut Tokenizer, code: Code) -> State { if matches!(code, Code::VirtualSpace | Code::Char('\t' | ' ')) { State::Nok } else { - State::Ok(if matches!(code, Code::None) { 0 } else { 1 }) + State::Ok(0) } } @@ -304,7 +304,7 @@ fn prefix_other(tokenizer: &mut Tokenizer, code: Code) -> State { /// > | * a /// ^ /// ``` -fn after(tokenizer: &mut Tokenizer, code: Code, blank: bool) -> State { +fn after(tokenizer: &mut Tokenizer, _code: Code, blank: bool) -> State { if blank && tokenizer.interrupt { State::Nok } else { @@ -322,7 +322,7 @@ fn after(tokenizer: &mut Tokenizer, code: Code, blank: bool) -> State { tokenizer.exit(Token::ListItemPrefix); tokenizer.register_resolver_before("list_item".to_string(), Box::new(resolve_list_item)); - State::Ok(if matches!(code, Code::None) { 0 } else { 1 }) + State::Ok(0) } } @@ -377,8 +377,8 @@ pub fn not_blank_cont(tokenizer: &mut Tokenizer, code: Code) -> State { } /// A state fn to yield [`State::Ok`]. -pub fn ok(_tokenizer: &mut Tokenizer, code: Code) -> State { - State::Ok(if matches!(code, Code::None) { 0 } else { 1 }) +pub fn ok(_tokenizer: &mut Tokenizer, _code: Code) -> State { + State::Ok(0) } /// A state fn to yield [`State::Nok`]. diff --git a/src/construct/paragraph.rs b/src/construct/paragraph.rs index 5409532..bc980b2 100644 --- a/src/construct/paragraph.rs +++ b/src/construct/paragraph.rs @@ -69,7 +69,7 @@ fn inside(tokenizer: &mut Tokenizer, code: Code) -> State { tokenizer.register_resolver_before("paragraph".to_string(), Box::new(resolve)); // You’d be interrupting. tokenizer.interrupt = true; - State::Ok(if matches!(code, Code::None) { 0 } else { 1 }) + State::Ok(0) } _ => { tokenizer.consume(code); diff --git a/src/construct/partial_data.rs b/src/construct/partial_data.rs index d60ef36..ce10763 100644 --- a/src/construct/partial_data.rs +++ b/src/construct/partial_data.rs @@ -42,7 +42,7 @@ fn at_break(tokenizer: &mut Tokenizer, code: Code, stop: &'static [Code]) -> Sta } _ if stop.contains(&code) => { tokenizer.register_resolver("data".to_string(), Box::new(resolve_data)); - State::Ok(if matches!(code, Code::None) { 0 } else { 1 }) + State::Ok(0) } _ => { tokenizer.enter(Token::Data); diff --git a/src/construct/partial_destination.rs b/src/construct/partial_destination.rs index f898eb5..4a43ec2 100644 --- a/src/construct/partial_destination.rs +++ b/src/construct/partial_destination.rs @@ -224,7 +224,7 @@ fn raw(tokenizer: &mut Tokenizer, code: Code, mut info: Info) -> State { tokenizer.exit(info.options.string.clone()); tokenizer.exit(info.options.raw.clone()); tokenizer.exit(info.options.destination); - State::Ok(if matches!(code, Code::None) { 0 } else { 1 }) + State::Ok(0) } else { tokenizer.consume(code); info.balance -= 1; @@ -242,7 +242,7 @@ fn raw(tokenizer: &mut Tokenizer, code: Code, mut info: Info) -> State { tokenizer.exit(info.options.string.clone()); tokenizer.exit(info.options.raw.clone()); tokenizer.exit(info.options.destination); - State::Ok(if matches!(code, Code::None) { 0 } else { 1 }) + State::Ok(0) } } Code::Char(char) if char.is_ascii_control() => State::Nok, diff --git a/src/construct/partial_non_lazy_continuation.rs b/src/construct/partial_non_lazy_continuation.rs index c6ac493..62e8989 100644 --- a/src/construct/partial_non_lazy_continuation.rs +++ b/src/construct/partial_non_lazy_continuation.rs @@ -39,10 +39,10 @@ pub fn start(tokenizer: &mut Tokenizer, code: Code) -> State { /// > | b /// ^ /// ``` -fn after(tokenizer: &mut Tokenizer, code: Code) -> State { +fn after(tokenizer: &mut Tokenizer, _code: Code) -> State { if tokenizer.lazy { State::Nok } else { - State::Ok(if matches!(code, Code::None) { 0 } else { 1 }) + State::Ok(0) } } diff --git a/src/construct/partial_space_or_tab.rs b/src/construct/partial_space_or_tab.rs index 6eb3f1d..f13414a 100644 --- a/src/construct/partial_space_or_tab.rs +++ b/src/construct/partial_space_or_tab.rs @@ -149,7 +149,7 @@ fn start(tokenizer: &mut Tokenizer, code: Code, mut info: Info) -> State { } _ => { if info.options.min == 0 { - State::Ok(if matches!(code, Code::None) { 0 } else { 1 }) + State::Ok(0) } else { State::Nok } @@ -173,7 +173,7 @@ fn inside(tokenizer: &mut Tokenizer, code: Code, mut info: Info) -> State { _ => { tokenizer.exit(info.options.kind.clone()); if info.size >= info.options.min { - State::Ok(if matches!(code, Code::None) { 0 } else { 1 }) + State::Ok(0) } else { State::Nok } @@ -204,7 +204,7 @@ fn after_space_or_tab(tokenizer: &mut Tokenizer, code: Code, mut info: EolInfo) tokenizer.exit(Token::LineEnding); State::Fn(Box::new(|t, c| after_eol(t, c, info))) } - _ if info.ok => State::Ok(if matches!(code, Code::None) { 0 } else { 1 }), + _ if info.ok => State::Ok(0), _ => State::Nok, } } @@ -245,6 +245,6 @@ fn after_more_space_or_tab(_tokenizer: &mut Tokenizer, code: Code) -> State { ) { State::Nok } else { - State::Ok(if matches!(code, Code::None) { 0 } else { 1 }) + State::Ok(0) } } diff --git a/src/construct/partial_whitespace.rs b/src/construct/partial_whitespace.rs index 4fc013e..acdd4d1 100644 --- a/src/construct/partial_whitespace.rs +++ b/src/construct/partial_whitespace.rs @@ -57,6 +57,6 @@ fn at_eol(tokenizer: &mut Tokenizer, code: Code) -> State { } /// Fine. -fn ok(_tokenizer: &mut Tokenizer, code: Code) -> State { - State::Ok(if matches!(code, Code::None) { 0 } else { 1 }) +fn ok(_tokenizer: &mut Tokenizer, _code: Code) -> State { + State::Ok(0) } diff --git a/src/construct/thematic_break.rs b/src/construct/thematic_break.rs index 4159146..66edaf8 100644 --- a/src/construct/thematic_break.rs +++ b/src/construct/thematic_break.rs @@ -183,7 +183,7 @@ fn at_break(tokenizer: &mut Tokenizer, code: Code, info: Info) -> State { tokenizer.exit(Token::ThematicBreak); // Feel free to interrupt. tokenizer.interrupt = false; - State::Ok(if matches!(code, Code::None) { 0 } else { 1 }) + State::Ok(0) } Code::Char(char) if char == info.kind.as_char() => { tokenizer.enter(Token::ThematicBreakSequence); diff --git a/src/content/document.rs b/src/content/document.rs index c1017a7..f8d7b55 100644 --- a/src/content/document.rs +++ b/src/content/document.rs @@ -78,7 +78,7 @@ struct DocumentInfo { pub fn document(parse_state: &mut ParseState, point: Point) -> Vec { let mut tokenizer = Tokenizer::new(point, parse_state); - tokenizer.push(&parse_state.codes, Box::new(start), true); + tokenizer.push(parse_state.codes.clone(), Box::new(start), true); let mut index = 0; let mut definitions = vec![]; diff --git a/src/subtokenize.rs b/src/subtokenize.rs index 2b5d775..0c9df34 100644 --- a/src/subtokenize.rs +++ b/src/subtokenize.rs @@ -84,6 +84,7 @@ pub fn subtokenize(events: &mut Vec, parse_state: &ParseState) -> bool { } else { text })); + let mut size = 0; // Loop through links to pass them in order to the subtokenizer. while let Some(index) = link_index { @@ -96,7 +97,7 @@ pub fn subtokenize(events: &mut Vec, parse_state: &ParseState) -> bool { }; if link_curr.previous != None { - tokenizer.define_skip(&enter.point); + tokenizer.define_skip(&enter.point, size); } let func = match state { @@ -105,10 +106,13 @@ pub fn subtokenize(events: &mut Vec, parse_state: &ParseState) -> bool { }; state = tokenizer.push( - span::codes(&parse_state.codes, &span), + span::codes(&parse_state.codes, &span).to_vec(), func, link_curr.next == None, ); + + size += span.end_index - span.start_index; + link_index = link_curr.next; } diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 931ffae..7ec0d91 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -173,7 +173,7 @@ struct InternalState { #[allow(clippy::struct_excessive_bools)] pub struct Tokenizer<'a> { /// Jump between line endings. - column_start: Vec<(usize, usize, usize)>, + column_start: Vec<(usize, usize, usize, usize)>, // First line. line_start: usize, /// Track whether a character is expected to be consumed, and whether it’s @@ -183,20 +183,20 @@ pub struct Tokenizer<'a> { consumed: bool, /// Track whether this tokenizer is done. drained: bool, + /// Current character code. + current: Code, + /// Previous character code. + pub previous: Code, + /// Current relative and absolute place in the file. + pub point: Point, /// Semantic labels of one or more codes in `codes`. pub events: Vec, /// Hierarchy of semantic labels. /// /// Tracked to make sure everything’s valid. pub stack: Vec, - /// Previous character code. - pub previous: Code, /// To do. pub map: EditMap, - /// Current character code. - current: Code, - /// Current relative and absolute place in the file. - pub point: Point, /// List of attached resolvers, which will be called when done feeding, /// to clean events. resolvers: Vec>, @@ -204,6 +204,8 @@ pub struct Tokenizer<'a> { resolver_ids: Vec, /// Shared parsing state across tokenizers. pub parse_state: &'a ParseState<'a>, + codes: Vec, + pub index: usize, /// Stack of label (start) that could form images and links. /// /// Used when tokenizing [text content][crate::content::text]. @@ -216,6 +218,8 @@ pub struct Tokenizer<'a> { /// /// Used when tokenizing [text content][crate::content::text]. pub media_list: Vec, + /// Current container state. + pub container: Option, /// Whether we would be interrupting something. /// /// Used when tokenizing [flow content][crate::content::flow]. @@ -229,8 +233,6 @@ pub struct Tokenizer<'a> { /// The previous line was a paragraph, and this line’s containers did not /// match. pub lazy: bool, - /// Current container state. - pub container: Option, } impl<'a> Tokenizer<'a> { @@ -248,14 +250,16 @@ impl<'a> Tokenizer<'a> { stack: vec![], events: vec![], parse_state, + codes: vec![], + index: 0, map: EditMap::new(), label_start_stack: vec![], label_start_list_loose: vec![], media_list: vec![], + container: None, interrupt: false, concrete: false, lazy: false, - container: None, // Assume about 10 resolvers. resolvers: Vec::with_capacity(10), resolver_ids: Vec::with_capacity(10), @@ -288,8 +292,12 @@ impl<'a> Tokenizer<'a> { } /// Define a jump between two places. - pub fn define_skip(&mut self, point: &Point) { - define_skip_impl(self, point.line, (point.column, point.offset, point.index)); + pub fn define_skip(&mut self, point: &Point, index: usize) { + define_skip_impl( + self, + point.line, + (point.column, point.offset, point.index, index), + ); } /// Define the current place as a jump between two places. @@ -297,7 +305,12 @@ impl<'a> Tokenizer<'a> { define_skip_impl( self, self.point.line, - (self.point.column, self.point.offset, self.point.index), + ( + self.point.column, + self.point.offset, + self.point.index, + self.index, + ), ); } @@ -307,10 +320,11 @@ impl<'a> Tokenizer<'a> { let at = self.point.line - self.line_start; if self.point.column == 1 && at != self.column_start.len() { - let (column, offset, index) = &self.column_start[at]; + let (column, offset, index_abs, index_rel) = &self.column_start[at]; self.point.column = *column; self.point.offset = *offset; - self.point.index = *index; + self.point.index = *index_abs; + self.index = *index_rel; } } @@ -326,6 +340,7 @@ impl<'a> Tokenizer<'a> { assert!(!self.consumed, "expected code to not have been consumed: this might be because `x(code)` instead of `x` was returned"); self.point.index += 1; + self.index += 1; match code { Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { @@ -342,6 +357,7 @@ impl<'a> Tokenizer<'a> { self.point.column, self.point.offset, self.point.index, + self.index, )); } @@ -482,11 +498,13 @@ impl<'a> Tokenizer<'a> { ) -> Box { attempt_impl( state_fn, - |_code| false, - vec![], - |result: (&[Code], &[Code]), tokenizer: &mut Tokenizer, state| { + None, + self.index, + |result: (usize, usize), tokenizer: &mut Tokenizer, state| { if matches!(state, State::Ok(_)) { - feed_impl(tokenizer, result.1, after) + tokenizer.index = result.1; + tokenizer.consumed = true; + State::Fn(Box::new(after)) } else { state } @@ -505,11 +523,12 @@ impl<'a> Tokenizer<'a> { ) -> Box { attempt_impl( state_fn, - until, - vec![], - |result: (&[Code], &[Code]), tokenizer: &mut Tokenizer, state| { + Some(Box::new(until)), + self.index, + |result: (usize, usize), tokenizer: &mut Tokenizer, state| { + tokenizer.index = result.1; tokenizer.consumed = true; - feed_impl(tokenizer, result.1, done(state)) + State::Fn(done(state)) }, ) } @@ -532,16 +551,13 @@ impl<'a> Tokenizer<'a> { attempt_impl( state_fn, - |_code| false, - vec![], - |result: (&[Code], &[Code]), tokenizer: &mut Tokenizer, state| { + None, + self.index, + |result: (usize, usize), tokenizer: &mut Tokenizer, state| { tokenizer.free(previous); - feed_twice_impl( - tokenizer, - result.0, - result.1, - done(matches!(state, State::Ok(_))), - ) + tokenizer.index = result.0; + tokenizer.consumed = true; + State::Fn(done(matches!(state, State::Ok(_)))) }, ) } @@ -566,9 +582,9 @@ impl<'a> Tokenizer<'a> { attempt_impl( state_fn, - |_code| false, - vec![], - |result: (&[Code], &[Code]), tokenizer: &mut Tokenizer, state| { + None, + self.index, + |result: (usize, usize), tokenizer: &mut Tokenizer, state| { let ok = matches!(state, State::Ok(_)); if !ok { @@ -577,12 +593,9 @@ impl<'a> Tokenizer<'a> { log::debug!("attempt: {:?}, at {:?}", ok, tokenizer.point); - feed_twice_impl( - tokenizer, - if ok { &[] } else { result.0 }, - result.1, - done(ok), - ) + tokenizer.index = result.1; + tokenizer.consumed = true; + State::Fn(done(ok)) }, ) } @@ -623,7 +636,7 @@ impl<'a> Tokenizer<'a> { /// markdown into the state machine, and normally pauses after feeding. pub fn push( &mut self, - codes: &[Code], + mut codes: Vec, start: impl FnOnce(&mut Tokenizer, Code) -> State + 'static, drain: bool, ) -> State { @@ -632,7 +645,9 @@ impl<'a> Tokenizer<'a> { // Let’s assume an event per character. self.events.reserve(codes.len()); - let mut result = feed_impl(self, codes, start); + self.codes.append(&mut codes); + + let mut result = feed_impl(self, start); if drain { let func = match result { @@ -667,41 +682,34 @@ impl<'a> Tokenizer<'a> { /// Used in [`Tokenizer::attempt`][Tokenizer::attempt] and [`Tokenizer::check`][Tokenizer::check]. fn attempt_impl( state: impl FnOnce(&mut Tokenizer, Code) -> State + 'static, - pause: impl Fn(Code) -> bool + 'static, - mut codes: Vec, - done: impl FnOnce((&[Code], &[Code]), &mut Tokenizer, State) -> State + 'static, + pause: Option bool + 'static>>, + start: usize, + done: impl FnOnce((usize, usize), &mut Tokenizer, State) -> State + 'static, ) -> Box { - Box::new(|tokenizer, code| { - if !codes.is_empty() && pause(tokenizer.previous) { - let after = if matches!(code, Code::None) { - vec![] - } else { - vec![code] - }; - - return done((&codes, &after), tokenizer, State::Fn(Box::new(state))); + Box::new(move |tokenizer, code| { + if let Some(ref func) = pause { + if tokenizer.index > start && func(tokenizer.previous) { + return done( + (start, tokenizer.index), + tokenizer, + State::Fn(Box::new(state)), + ); + } } let state = state(tokenizer, code); - match code { - Code::None => {} - _ => { - codes.push(code); - } - } - match state { State::Ok(back) => { + let stop = tokenizer.index - back; assert!( - back <= codes.len(), - "`back` must be smaller than or equal to `codes.len()`" + stop >= start, + "`back` must not result in an index smaller than `start`" ); - let remaining = codes.split_off(codes.len() - back); - done((&codes, &remaining), tokenizer, state) + done((start, stop), tokenizer, state) } - State::Nok => done((&[], &codes), tokenizer, state), - State::Fn(func) => State::Fn(attempt_impl(func, pause, codes, done)), + State::Nok => done((start, start), tokenizer, state), + State::Fn(func) => State::Fn(attempt_impl(func, pause, start, done)), } }) } @@ -709,28 +717,23 @@ fn attempt_impl( /// Feed a list of `codes` into `start`. fn feed_impl( tokenizer: &mut Tokenizer, - codes: &[Code], start: impl FnOnce(&mut Tokenizer, Code) -> State + 'static, ) -> State { let mut state = State::Fn(Box::new(start)); - let mut index = 0; tokenizer.consumed = true; - while index < codes.len() { - let code = codes[index]; + while tokenizer.index < tokenizer.codes.len() { + let code = tokenizer.codes[tokenizer.index]; match state { - State::Ok(back) => { - state = State::Ok((codes.len() - index) + back); + State::Ok(_) | State::Nok => { break; } - State::Nok => break, State::Fn(func) => { - log::debug!("main: passing: `{:?}` ({:?})", code, index); + log::debug!("main: passing: `{:?}` ({:?})", code, tokenizer.index); tokenizer.expect(code, false); state = func(tokenizer, code); - index += 1; } } } @@ -738,37 +741,27 @@ fn feed_impl( state } -/// Feed a list of `codes` into `start`. -fn feed_twice_impl( - tokenizer: &mut Tokenizer, - left: &[Code], - right: &[Code], - start: impl FnOnce(&mut Tokenizer, Code) -> State + 'static, -) -> State { - let res = feed_impl(tokenizer, left, start); - - match res { - State::Fn(func) => feed_impl(tokenizer, right, func), - State::Ok(back) => State::Ok(back + right.len()), - State::Nok => res, - } -} - /// Flush `start`: pass `eof`s to it until done. fn flush_impl( tokenizer: &mut Tokenizer, start: impl FnOnce(&mut Tokenizer, Code) -> State + 'static, ) -> State { let mut state = State::Fn(Box::new(start)); + let max = tokenizer.index; tokenizer.consumed = true; loop { match state { State::Ok(_) | State::Nok => break, State::Fn(func) => { - log::debug!("main: passing eof"); - tokenizer.expect(Code::None, false); - state = func(tokenizer, Code::None); + let code = if tokenizer.index < max { + tokenizer.codes[tokenizer.index] + } else { + Code::None + }; + log::debug!("main: flushing {:?}", code); + tokenizer.expect(code, false); + state = func(tokenizer, code); } } } @@ -785,7 +778,7 @@ fn flush_impl( /// /// This defines how much columns, offsets, and the `index` are increased when /// consuming a line ending. -fn define_skip_impl(tokenizer: &mut Tokenizer, line: usize, info: (usize, usize, usize)) { +fn define_skip_impl(tokenizer: &mut Tokenizer, line: usize, info: (usize, usize, usize, usize)) { log::debug!("position: define skip: {:?} -> ({:?})", line, info); let at = line - tokenizer.line_start; -- cgit